Simple Binary Classification with defaults¶
In this notebook we will train a Wide and Deep model, as well as just a "Deep" model, using the well-known adult dataset
import numpy as np
import pandas as pd
import torch
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, Precision
from pytorch_widedeep.datasets import load_adult
df = load_adult(as_frame=True)
df.head()
|  | age | workclass | fnlwgt | education | educational-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Private | 226802 | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | <=50K |
1 | 38 | Private | 89814 | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
2 | 28 | Local-gov | 336951 | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
3 | 44 | Private | 160323 | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | >50K |
4 | 18 | ? | 103497 | Some-college | 10 | Never-married | ? | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
# For convenience, we'll replace '-' with '_'
df.columns = [c.replace("-", "_") for c in df.columns]
# binary target
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
df.drop("income", axis=1, inplace=True)
df.head()
|  | age | workclass | fnlwgt | education | educational_num | marital_status | occupation | relationship | race | gender | capital_gain | capital_loss | hours_per_week | native_country | income_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Private | 226802 | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | 0 |
1 | 38 | Private | 89814 | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | 0 |
2 | 28 | Local-gov | 336951 | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | 1 |
3 | 44 | Private | 160323 | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | 1 |
4 | 18 | ? | 103497 | Some-college | 10 | Never-married | ? | Own-child | White | Female | 0 | 0 | 30 | United-States | 0 |
df.drop(["fnlwgt", "educational_num"], axis=1, inplace=True)
Preparing the data¶
# Define wide, crossed and deep tabular columns
wide_cols = [
"workclass",
"education",
"marital_status",
"occupation",
"relationship",
"race",
"gender",
"native_country",
]
crossed_cols = [("education", "occupation"), ("native_country", "occupation")]
cat_embed_cols = [
"workclass",
"education",
"marital_status",
"occupation",
"relationship",
"race",
"gender",
"capital_gain",
"capital_loss",
"native_country",
]
continuous_cols = ["age", "hours_per_week"]
# TARGET
target_col = "income_label"
target = df[target_col].values
Let's see what the preprocessors do
# wide
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = wide_preprocessor.fit_transform(df)
# # wide_preprocessor has an attribute called encoding_dict with the encoding dictionary
# wide_preprocessor.encoding_dict
# deeptabular
tab_preprocessor = TabPreprocessor(
embed_cols=cat_embed_cols,
continuous_cols=continuous_cols,
cols_to_scale=continuous_cols,
)
X_tab = tab_preprocessor.fit_transform(df)
# check the docs to understand the useful attributes that the tab_preprocessor has. For example,
# as well as an encoding dictionary, tab_preprocessor has an attribute called cat_embed_input
# that specifies the categorical columns that will be represented as embeddings, the number
# of different categories per feature, and the dimension of the embeddings as defined by some
# of the internal rules of thumb that the preprocessor has (have a look at the docs)
tab_preprocessor.cat_embed_input
[('workclass', 9, 5), ('education', 16, 8), ('marital_status', 7, 5), ('occupation', 15, 7), ('relationship', 6, 4), ('race', 5, 4), ('gender', 2, 2), ('capital_gain', 123, 24), ('capital_loss', 99, 21), ('native_country', 42, 13)]
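As a quick illustration of those encodings, both preprocessors also offer an inverse_transform method that maps the encoded arrays back to the original categories. A minimal sketch (assuming inverse_transform as described in the pytorch_widedeep docs for your installed version):
# decode the first few encoded rows back into the original categories
decoded_wide = wide_preprocessor.inverse_transform(X_wide[:5])
decoded_tab = tab_preprocessor.inverse_transform(X_tab[:5])
print(decoded_wide)
print(decoded_tab)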
print(X_wide)
print(X_wide.shape)
[[ 1 10 26 ... 61 103 328]
 [ 1 11 27 ... 61 104 329]
 [ 2 12 27 ... 61 105 330]
 ...
 [ 1 11 28 ... 61 115 335]
 [ 1 11 26 ... 61 115 335]
 [ 7 11 27 ... 61 127 336]]
(48842, 10)
print(X_tab)
print(X_tab.shape)
[[ 1. 1. 1. ... 1. -0.99512893 -0.03408696]
 [ 1. 2. 2. ... 1. -0.04694151 0.77292975]
 [ 2. 3. 2. ... 1. -0.77631645 -0.03408696]
 ...
 [ 1. 2. 3. ... 1. 1.41180837 -0.03408696]
 [ 1. 2. 1. ... 1. -1.21394141 -1.64812038]
 [ 7. 2. 2. ... 1. 0.97418341 -0.03408696]]
(48842, 12)
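Note that X_tab has one (label-encoded) column per categorical feature plus one (scaled) column per continuous feature, with positions given by tab_preprocessor.column_idx. A quick sanity check:
# 10 categorical + 2 continuous columns -> 12 columns in X_tab
assert X_tab.shape[1] == len(cat_embed_cols) + len(continuous_cols)
print(tab_preprocessor.column_idx)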
Defining the model¶
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
column_idx=tab_preprocessor.column_idx,
cat_embed_input=tab_preprocessor.cat_embed_input,
cat_embed_dropout=0.1,
continuous_cols=continuous_cols,
mlp_hidden_dims=[400, 200],
mlp_dropout=0.5,
mlp_activation="leaky_relu",
)
Let's first find out how a linear model performs
wide
Wide( (wide_linear): Embedding(809, 1, padding_idx=0) )
Before being passed to the Trainer, the models need to be "constructed" with the WideDeep constructor class. For the particular case of the wide/linear model, not much really happens.
lin_model = WideDeep(wide=wide)
lin_model
WideDeep( (wide): Wide( (wide_linear): Embedding(809, 1, padding_idx=0) ) )
lin_trainer = Trainer(
model=lin_model,
objective="binary",
optimizers=torch.optim.AdamW(lin_model.parameters(), lr=0.01),
metrics=[Accuracy, Precision],
)
lin_trainer.fit(X_wide=X_wide, target=target, n_epochs=4, batch_size=128, val_split=0.2)
epoch 1: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 109.04it/s, loss=0.426, metrics={'acc': 0.7983, 'prec': 0.6152}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 102.46it/s, loss=0.366, metrics={'acc': 0.832, 'prec': 0.6916}]
epoch 2: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 130.27it/s, loss=0.364, metrics={'acc': 0.8305, 'prec': 0.6933}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 150.46it/s, loss=0.361, metrics={'acc': 0.8357, 'prec': 0.6982}]
epoch 3: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 133.19it/s, loss=0.359, metrics={'acc': 0.8329, 'prec': 0.6994}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 145.75it/s, loss=0.361, metrics={'acc': 0.836, 'prec': 0.7009}]
epoch 4: 100%|█████████████████████████████████████████| 306/306 [00:02<00:00, 130.91it/s, loss=0.358, metrics={'acc': 0.8333, 'prec': 0.7005}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 155.08it/s, loss=0.361, metrics={'acc': 0.8364, 'prec': 0.702}]
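Once fitted, the Trainer can also produce predictions via predict and predict_proba. A minimal sketch (scoring the training data here purely for illustration, not as a proper evaluation):
# class predictions and probabilities from the fitted linear model
preds = lin_trainer.predict(X_wide=X_wide)
probs = lin_trainer.predict_proba(X_wide=X_wide)
print(preds[:5])
print(probs[:5])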
Bear in mind that wide is a linear model where the non-linearities are captured via the crossed columns. For the crossed columns to be effective one needs proper domain knowledge; there is no magic formula to produce them.
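Conceptually, a crossed column is just a new categorical feature formed by the co-occurrence of two (or more) existing ones. The WidePreprocessor builds and encodes them for you from crossed_cols, but, roughly speaking, each cross amounts to something like the following illustrative snippet:
# rough, illustrative equivalent of the ("education", "occupation") cross
# (the WidePreprocessor does this internally and then label-encodes the result)
education_occupation = df["education"] + "-" + df["occupation"]
print(education_occupation.head())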
Let's have a look at the tabular model by itself
tab_model = WideDeep(deeptabular=tab_mlp)
tab_model
WideDeep( (deeptabular): Sequential( (0): TabMlp( (cat_embed): DiffSizeCatEmbeddings( (embed_layers): ModuleDict( (emb_layer_workclass): Embedding(10, 5, padding_idx=0) (emb_layer_education): Embedding(17, 8, padding_idx=0) (emb_layer_marital_status): Embedding(8, 5, padding_idx=0) (emb_layer_occupation): Embedding(16, 7, padding_idx=0) (emb_layer_relationship): Embedding(7, 4, padding_idx=0) (emb_layer_race): Embedding(6, 4, padding_idx=0) (emb_layer_gender): Embedding(3, 2, padding_idx=0) (emb_layer_capital_gain): Embedding(124, 24, padding_idx=0) (emb_layer_capital_loss): Embedding(100, 21, padding_idx=0) (emb_layer_native_country): Embedding(43, 13, padding_idx=0) ) (embedding_dropout): Dropout(p=0.1, inplace=False) ) (cont_norm): Identity() (encoder): MLP( (mlp): Sequential( (dense_layer_0): Sequential( (0): Linear(in_features=95, out_features=400, bias=True) (1): LeakyReLU(negative_slope=0.01, inplace=True) (2): Dropout(p=0.5, inplace=False) ) (dense_layer_1): Sequential( (0): Linear(in_features=400, out_features=200, bias=True) (1): LeakyReLU(negative_slope=0.01, inplace=True) (2): Dropout(p=0.5, inplace=False) ) ) ) ) (1): Linear(in_features=200, out_features=1, bias=True) ) )
You can see how the WideDeep class has added a final prediction layer that collects the activations from the last layer of the model and plugs them into the output neuron. If this were a multiclass classification problem, the prediction dimension (i.e. the size of that final layer) would need to be specified via the pred_dim argument when instantiating the WideDeep class, as we will see later.
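For a hypothetical 3-class problem the construction would look roughly like this (a sketch only; the adult target is binary, so this model is built but not trained here):
# hypothetical multiclass setup: pred_dim must match the number of classes
# and the objective changes accordingly
tab_mlp_mc = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=continuous_cols,
)
multiclass_model = WideDeep(deeptabular=tab_mlp_mc, pred_dim=3)
multiclass_trainer = Trainer(model=multiclass_model, objective="multiclass", metrics=[Accuracy])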
tab_trainer = Trainer(
model=tab_model,
objective="binary",
optimizers=torch.optim.AdamW(tab_model.parameters(), lr=0.001),
metrics=[Accuracy, Precision],
)
tab_trainer.fit(X_tab=X_tab, target=target, n_epochs=4, batch_size=128, val_split=0.2)
epoch 1: 100%|███████████████████████████████████████████| 306/306 [00:03<00:00, 97.00it/s, loss=0.37, metrics={'acc': 0.8267, 'prec': 0.7037}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 134.91it/s, loss=0.313, metrics={'acc': 0.8588, 'prec': 0.7577}]
epoch 2: 100%|███████████████████████████████████████████| 306/306 [00:03<00:00, 86.86it/s, loss=0.319, metrics={'acc': 0.8514, 'prec': 0.761}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:01<00:00, 73.13it/s, loss=0.296, metrics={'acc': 0.8675, 'prec': 0.7685}]
epoch 3: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 79.07it/s, loss=0.305, metrics={'acc': 0.8574, 'prec': 0.7646}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 130.11it/s, loss=0.289, metrics={'acc': 0.8696, 'prec': 0.7765}]
epoch 4: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 87.39it/s, loss=0.296, metrics={'acc': 0.8622, 'prec': 0.7769}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 90.63it/s, loss=0.285, metrics={'acc': 0.8697, 'prec': 0.7741}]
The best result I ever obtained with LightGBM on this dataset is 0.8782... so we are pretty close.
Let's combine the wide and tab_mlp components and see if it helps
wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
column_idx=tab_preprocessor.column_idx,
cat_embed_input=tab_preprocessor.cat_embed_input,
cat_embed_dropout=0.1,
continuous_cols=continuous_cols,
mlp_hidden_dims=[400, 200],
mlp_dropout=0.5,
mlp_activation="leaky_relu",
)
wd_model = WideDeep(wide=wide, deeptabular=tab_mlp)
wd_trainer = Trainer(
model=wd_model,
objective="binary",
optimizers=torch.optim.AdamW(wd_model.parameters(), lr=0.001),
metrics=[Accuracy, Precision],
)
wd_trainer.fit(
X_wide=X_wide, X_tab=X_tab, target=target, n_epochs=4, batch_size=128, val_split=0.2
)
epoch 1: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 77.48it/s, loss=0.418, metrics={'acc': 0.8047, 'prec': 0.6154}]
valid: 100%|█████████████████████████████████████████████| 77/77 [00:00<00:00, 110.51it/s, loss=0.321, metrics={'acc': 0.8521, 'prec': 0.7059}]
epoch 2: 100%|██████████████████████████████████████████| 306/306 [00:03<00:00, 82.70it/s, loss=0.333, metrics={'acc': 0.8428, 'prec': 0.7141}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 112.52it/s, loss=0.299, metrics={'acc': 0.866, 'prec': 0.7447}]
epoch 3: 100%|██████████████████████████████████████████| 306/306 [00:04<00:00, 74.34it/s, loss=0.312, metrics={'acc': 0.8533, 'prec': 0.7404}]
valid: 100%|███████████████████████████████████████████████| 77/77 [00:00<00:00, 89.86it/s, loss=0.29, metrics={'acc': 0.8683, 'prec': 0.7496}]
epoch 4: 100%|██████████████████████████████████████████| 306/306 [00:04<00:00, 65.32it/s, loss=0.301, metrics={'acc': 0.8591, 'prec': 0.7542}]
valid: 100%|██████████████████████████████████████████████| 77/77 [00:00<00:00, 86.81it/s, loss=0.286, metrics={'acc': 0.8712, 'prec': 0.7552}]
For this particular case, the combination of both components did not lead to better results than using just the tab_mlp model, at least when training for only 4 epochs.
Note that we have used a TabMlp model, but we could use any other model in the library with the same syntax
from pytorch_widedeep.models import TabTransformer
The parameters for the TabTransformer are these:
column_idx: Dict[str, int],
cat_embed_input: Optional[List[Tuple[str, int]]] = None,
cat_embed_dropout: Optional[float] = None,
use_cat_bias: Optional[bool] = None,
cat_embed_activation: Optional[str] = None,
shared_embed: Optional[bool] = None,
add_shared_embed: Optional[bool] = None,
frac_shared_embed: Optional[float] = None,
continuous_cols: Optional[List[str]] = None,
cont_norm_layer: Optional[Literal["batchnorm", "layernorm"]] = None,
embed_continuous: Optional[bool] = None,
embed_continuous_method: Optional[Literal["standard", "piecewise", "periodic"]] = None,
cont_embed_dropout: Optional[float] = None,
cont_embed_activation: Optional[str] = None,
quantization_setup: Optional[Dict[str, List[float]]] = None,
n_frequencies: Optional[int] = None,
sigma: Optional[float] = None,
share_last_layer: Optional[bool] = None,
full_embed_dropout: Optional[bool] = None,
input_dim: int = 32,
n_heads: int = 8,
use_qkv_bias: bool = False,
n_blocks: int = 4,
attn_dropout: float = 0.2,
ff_dropout: float = 0.1,
ff_factor: int = 4,
transformer_activation: str = "gelu",
use_linear_attention: bool = False,
use_flash_attention: bool = False,
mlp_hidden_dims: Optional[List[int]] = None,
mlp_activation: str = "relu",
mlp_dropout: float = 0.1,
mlp_batchnorm: bool = False,
mlp_batchnorm_last: bool = False,
mlp_linear_first: bool = True,
Please see the documentation for details on each one of them. For now, let's see how one could use a TabTransformer model in a few lines of code
tab_transformer = TabTransformer(
column_idx=tab_preprocessor.column_idx,
cat_embed_input=tab_preprocessor.cat_embed_input,
cat_embed_dropout=0.1,
continuous_cols=continuous_cols,
embed_continuous_method="standard",
cont_norm_layer="layernorm",
cont_embed_dropout=0.2,
cont_embed_activation="leaky_relu",
n_heads=4,
ff_dropout=0.2,
mlp_dropout=0.5,
mlp_activation="leaky_relu",
mlp_linear_first=True,
)
tab_model = WideDeep(deeptabular=tab_transformer)
tab_model
WideDeep( (deeptabular): Sequential( (0): TabTransformer( (cat_embed): SameSizeCatEmbeddings( (embed): Embedding(325, 32, padding_idx=0) (dropout): Dropout(p=0.1, inplace=False) ) (cont_norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True) (cont_embed): ContEmbeddings( INFO: [ContLinear = weight(n_cont_cols, embed_dim) + bias(n_cont_cols, embed_dim)] (linear): ContLinear(n_cont_cols=2, embed_dim=32, embed_dropout=0.2) (activation_fn): LeakyReLU(negative_slope=0.01, inplace=True) (dropout): Dropout(p=0.2, inplace=False) ) (encoder): Sequential( (transformer_block0): TransformerEncoder( (attn): MultiHeadedAttention( (dropout): Dropout(p=0.2, inplace=False) (q_proj): Linear(in_features=32, out_features=32, bias=False) (kv_proj): Linear(in_features=32, out_features=64, bias=False) (out_proj): Linear(in_features=32, out_features=32, bias=False) ) (ff): FeedForward( (w_1): Linear(in_features=32, out_features=128, bias=True) (w_2): Linear(in_features=128, out_features=32, bias=True) (dropout): Dropout(p=0.2, inplace=False) (activation): GELU(approximate='none') ) (attn_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) (ff_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) ) (transformer_block1): TransformerEncoder( (attn): MultiHeadedAttention( (dropout): Dropout(p=0.2, inplace=False) (q_proj): Linear(in_features=32, out_features=32, bias=False) (kv_proj): Linear(in_features=32, out_features=64, bias=False) (out_proj): Linear(in_features=32, out_features=32, bias=False) ) (ff): FeedForward( (w_1): Linear(in_features=32, out_features=128, bias=True) (w_2): Linear(in_features=128, out_features=32, bias=True) (dropout): Dropout(p=0.2, inplace=False) (activation): GELU(approximate='none') ) (attn_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) (ff_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) ) (transformer_block2): TransformerEncoder( (attn): MultiHeadedAttention( (dropout): Dropout(p=0.2, inplace=False) (q_proj): Linear(in_features=32, out_features=32, bias=False) (kv_proj): Linear(in_features=32, out_features=64, bias=False) (out_proj): Linear(in_features=32, out_features=32, bias=False) ) (ff): FeedForward( (w_1): Linear(in_features=32, out_features=128, bias=True) (w_2): Linear(in_features=128, out_features=32, bias=True) (dropout): Dropout(p=0.2, inplace=False) (activation): GELU(approximate='none') ) (attn_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) (ff_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) ) (transformer_block3): TransformerEncoder( (attn): MultiHeadedAttention( (dropout): Dropout(p=0.2, inplace=False) (q_proj): Linear(in_features=32, out_features=32, bias=False) (kv_proj): Linear(in_features=32, out_features=64, bias=False) (out_proj): Linear(in_features=32, out_features=32, bias=False) ) (ff): FeedForward( (w_1): Linear(in_features=32, out_features=128, bias=True) (w_2): Linear(in_features=128, out_features=32, bias=True) (dropout): Dropout(p=0.2, inplace=False) (activation): GELU(approximate='none') ) (attn_addnorm): AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) (ff_addnorm): 
AddNorm( (dropout): Dropout(p=0.2, inplace=False) (ln): LayerNorm((32,), eps=1e-05, elementwise_affine=True) ) ) ) ) (1): Linear(in_features=384, out_features=1, bias=True) ) )
tab_trainer = Trainer(
model=tab_model,
objective="binary",
optimizers=torch.optim.AdamW(tab_model.parameters(), lr=0.001),
metrics=[Accuracy, Precision],
)
tab_trainer.fit(X_tab=X_tab, target=target, n_epochs=1, batch_size=128, val_split=0.2)
epoch 1: 100%|██████████████████████████████████████████| 306/306 [00:11<00:00, 27.57it/s, loss=0.359, metrics={'acc': 0.8334, 'prec': 0.7082}]
valid: 100%|███████████████████████████████████████████████| 77/77 [00:01<00:00, 57.89it/s, loss=0.33, metrics={'acc': 0.8536, 'prec': 0.7152}]
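Finally, a minimal sketch of how one might persist the trained model and the fitted preprocessor for later inference, using plain torch.save and pickle (the Trainer also offers a save method; see the docs):
import pickle

# save the model weights and the fitted preprocessor so new data can be
# transformed and scored later with exactly the same encodings
torch.save(tab_model.state_dict(), "tab_transformer_weights.pt")
with open("tab_preprocessor.pkl", "wb") as f:
    pickle.dump(tab_preprocessor, f)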