Extracting embeddings¶
This notebook is a short guide to extracting the learned feature embeddings from a trained model using `Tab2Vec`.
In [1]:
Copied!
import numpy as np
import pandas as pd
import torch
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import FTTransformer, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.datasets import load_adult
import numpy as np
import pandas as pd
import torch
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import FTTransformer, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.datasets import load_adult
/Users/javierrodriguezzaurin/.pyenv/versions/3.10.15/envs/widedeep310/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
In [2]:
Copied!
# Load the adult census dataset as a DataFrame and preview the first rows
df = load_adult(as_frame=True)
df.head(5)
Out[2]:
age | workclass | fnlwgt | education | educational-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Private | 226802 | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | <=50K |
1 | 38 | Private | 89814 | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
2 | 28 | Local-gov | 336951 | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
3 | 44 | Private | 160323 | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | >50K |
4 | 18 | ? | 103497 | Some-college | 10 | Never-married | ? | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
In [3]:
Copied!
# Column names contain '-', swap it for '_' so they are easier to work with
df.columns = [name.replace("-", "_") for name in df.columns]

# Binary target: 1 when the income label contains ">50K", 0 otherwise
is_high_income = df["income"].apply(lambda label: ">50K" in label)
df["target"] = is_high_income.astype(int)

# The raw income label is now encoded in `target`; `educational_num` is
# dropped alongside it.
df.drop(columns=["income", "educational_num"], inplace=True)
df.head()
Out[3]:
age | workclass | fnlwgt | education | marital_status | occupation | relationship | race | gender | capital_gain | capital_loss | hours_per_week | native_country | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Private | 226802 | 11th | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | 0 |
1 | 38 | Private | 89814 | HS-grad | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | 0 |
2 | 28 | Local-gov | 336951 | Assoc-acdm | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | 1 |
3 | 44 | Private | 160323 | Some-college | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | 1 |
4 | 18 | ? | 103497 | Some-college | Never-married | ? | Own-child | White | Female | 0 | 0 | 30 | United-States | 0 |
In [4]:
Copied!
# Split the feature columns into categorical and continuous groups.
# A column is treated as categorical when it has object dtype or fewer than
# `max_cat_cardinality` distinct values; every other feature is continuous.
target_col = "target"
max_cat_cardinality = 50  # 50 is just an arbitrary threshold for this example

cat_cols, cont_cols = [], []
for col in df.columns:
    # Exclude the target before testing anything else. The previous condition
    # `dtype == "O" or nunique() < 50 and col != "target"` parsed as
    # `dtype == "O" or (nunique() < 50 and col != "target")`, so the target
    # guard did not apply to object-dtype columns.
    if col == target_col:
        continue
    if df[col].dtype == "O" or df[col].nunique() < max_cat_cardinality:
        cat_cols.append(col)
    else:
        cont_cols.append(col)
In [5]:
Copied!
# Labels as a numpy array
target = df[target_col].values

# Fit the tabular preprocessor on the categorical / continuous column split
# and encode the whole DataFrame in one step.
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_cols,
    continuous_cols=cont_cols,
    for_transformer=True,
)
X_tab = tab_preprocessor.fit_transform(df)
/Users/javierrodriguezzaurin/Projects/pytorch-widedeep/pytorch_widedeep/preprocessing/tab_preprocessor.py:364: UserWarning: Continuous columns will not be normalised warnings.warn("Continuous columns will not be normalised")
In [6]:
Copied!
# Build the FT-Transformer from the metadata collected by the fitted preprocessor
ft_transformer = FTTransformer(
    # column layout / embedding setup taken from the preprocessor
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=tab_preprocessor.continuous_cols,
    embed_continuous_method="standard",
    # a deliberately small architecture for this example
    n_blocks=2,
    n_heads=4,
    input_dim=16,
)
In [7]:
Copied!
# Wrap the transformer as the deeptabular component and train for one epoch,
# holding out 20% of the rows for validation.
model = WideDeep(deeptabular=ft_transformer)
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=1,
    batch_size=256,
    val_split=0.2,
)
epoch 1: 100%|██████████| 153/153 [00:02<00:00, 56.56it/s, loss=48.7, metrics={'acc': 0.7371}] valid: 100%|██████████| 39/39 [00:00<00:00, 129.23it/s, loss=7.44, metrics={'acc': 0.6928}]
In [8]:
Copied!
# Choose the device at runtime rather than hard-coding "mps", which is only
# available on Apple-silicon builds of PyTorch: prefer mps, then cuda, then cpu.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, device=device)
In [9]:
Copied!
# Assume this is a test set that still contains the target column.
# `random_state` fixes the sample so the cell is reproducible on re-run.
X_vec, y = t2v.transform(df.sample(100, random_state=42), target_col="target")
In [10]:
Copied!
# X_vec holds the sampled rows of the DataFrame turned into embeddings
X_vec.shape
Out[10]:
(100, 208)
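Each of the 100 sampled rows becomes a vector of length 208 = `input_dim` (16) × `n_cols` (13), i.e. one 16-dimensional embedding per feature column.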
In [11]:
Copied!
# ...or, if we don't need the target back, omit `target_col` and only the
# embeddings are returned. `random_state` keeps the sample reproducible.
X_vec = t2v.transform(df.sample(100, random_state=42))