Extracting embeddings¶

This notebook is a simple guide to extracting learned feature embeddings from a trained model using `Tab2Vec`.

In [1]:

Copied!





import numpy as np
import pandas as pd
import torch

from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import FTTransformer, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.datasets import load_adult
import numpy as np
import pandas as pd
import torch

from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import FTTransformer, WideDeep
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.datasets import load_adult

In [2]:

Copied!

# Load the Adult census dataset as a pandas DataFrame and preview the first rows
df = load_adult(as_frame=True)
df.head()

Out[2]:

	age	workclass	fnlwgt	education	educational-num	marital-status	occupation	relationship	race	gender	capital-gain	hours-per-week	native-country	income
0	25	Private	226802	11th	7	Never-married	Machine-op-inspct	Own-child	Black	Male	0	40	United-States	<=50K
1	38	Private	89814	HS-grad	9	Married-civ-spouse	Farming-fishing	Husband	White	Male	0	50	United-States	<=50K
2	28	Local-gov	336951	Assoc-acdm	12	Married-civ-spouse	Protective-serv	Husband	White	Male	0	40	United-States	>50K
3	44	Private	160323	Some-college	10	Married-civ-spouse	Machine-op-inspct	Husband	Black	Male	7688	40	United-States	>50K
4	18	?	103497	Some-college	10	Never-married	?	Own-child	White	Female	0	30	United-States	<=50K

In [3]:

Copied!





# For convenience, we'll replace '-' with '_' in the column names
df.columns = [c.replace("-", "_") for c in df.columns]
# Binary target: 1 when the income label contains ">50K", 0 otherwise
df["target"] = df["income"].str.contains(">50K", regex=False).astype(int)
# 'income' is now encoded in 'target'; 'educational_num' is not used in this example.
# Re-assign instead of using inplace=True so the statement reads as a plain transformation.
df = df.drop(columns=["income", "educational_num"])

df.head()

Out[3]:

	age	workclass	fnlwgt	education	marital_status	occupation	relationship	race	gender	capital_gain	hours_per_week	native_country	target
0	25	Private	226802	11th	Never-married	Machine-op-inspct	Own-child	Black	Male	0	40	United-States	0
1	38	Private	89814	HS-grad	Married-civ-spouse	Farming-fishing	Husband	White	Male	0	50	United-States	0
2	28	Local-gov	336951	Assoc-acdm	Married-civ-spouse	Protective-serv	Husband	White	Male	0	40	United-States	1
3	44	Private	160323	Some-college	Married-civ-spouse	Machine-op-inspct	Husband	Black	Male	7688	40	United-States	1
4	18	?	103497	Some-college	Never-married	?	Own-child	White	Female	0	30	United-States	0

In [4]:

Copied!





target_col = "target"
# Columns with fewer distinct values than this are treated as categorical.
# 50 is just an arbitrary threshold chosen for this example.
MAX_CAT_CARDINALITY = 50

cat_cols, cont_cols = [], []
for col in df.columns:
    if col == target_col:
        # The target is never a feature, whatever its dtype. Skipping it up front
        # avoids relying on `and` binding tighter than `or` in a combined condition.
        continue
    if df[col].dtype == "O" or df[col].nunique() < MAX_CAT_CARDINALITY:
        cat_cols.append(col)
    else:
        cont_cols.append(col)

In [5]:

Copied!





# Target as a plain numpy array, as passed to the trainer later on
target = df[target_col].values

# Fit the preprocessor once on the categorical / continuous column split and
# transform the dataframe into the array fed to the tabular model.
# for_transformer=True because the model used below is a transformer (FTTransformer).
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_cols, continuous_cols=cont_cols, for_transformer=True
)
X_tab = tab_preprocessor.fit_transform(df)

/Users/javierrodriguezzaurin/Projects/pytorch-widedeep/pytorch_widedeep/preprocessing/tab_preprocessor.py:358: UserWarning: Continuous columns will not be normalised
  warnings.warn("Continuous columns will not be normalised")

In [6]:

Copied!





# Build the FT-Transformer from the metadata collected by the fitted preprocessor.
# input_dim is the embedding size used for every column (16 here).
ft_transformer = FTTransformer(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=tab_preprocessor.continuous_cols,
    embed_continuous_method="standard",
    n_blocks=2,
    n_heads=4,
    input_dim=16,
)

/Users/javierrodriguezzaurin/Projects/pytorch-widedeep/pytorch_widedeep/utils/general_utils.py:12: DeprecationWarning: The 'embed_continuous' parameter is deprecated and will be removed in the next release. Please use 'embed_continuous_method' instead See the documentation for more details.
  return func(*args, **kwargs)

In [7]:

Copied!

# Wrap the transformer as the tabular component of a WideDeep model and train it
# for a single epoch on the binary target, with val_split=0.2 for validation.
model = WideDeep(deeptabular=ft_transformer)
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(X_tab=X_tab, target=target, n_epochs=1, batch_size=256, val_split=0.2)

epoch 1: 100%|█████████████████████████████████████████████████████████████| 153/153 [00:03<00:00, 41.47it/s, loss=221, metrics={'acc': 0.686}]
valid: 100%|█████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 86.24it/s, loss=9.28, metrics={'acc': 0.76}]

In [8]:

Copied!

# Tab2Vec needs the trained model and the fitted preprocessor used to train it
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor)

In [9]:

Copied!

# Assuming this is a test set that includes the target column:
# passing target_col makes transform return the target alongside the embeddings
X_vec, y = t2v.transform(df.sample(100), target_col="target")

In [10]:

Copied!

# X_vec is the sampled dataframe turned into embeddings (one row per sample)
X_vec.shape

Out[10]:

(100, 208)

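The second dimension is 208 = `input_dim` (16) * `n_cols` (13): every column is represented by a 16-dimensional embedding, and the embeddings of a row are concatenated into a single vector.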

In [11]:

Copied!

# ...or, if we don't have a target column, transform returns only the embeddings
X_vec = t2v.transform(df.sample(100))