AUC multiclass computation¶
Initial imports¶
In [1]:
import numpy as np
import pandas as pd
from torch.optim import SGD, lr_scheduler
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.models import TabMlp, WideDeep
from torchmetrics import AUROC
from pytorch_widedeep.initializers import XavierNormal
from pytorch_widedeep.datasets import load_ecoli
from pytorch_widedeep.utils import LabelEncoder
from sklearn.model_selection import train_test_split
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
In [2]:
df = load_ecoli(as_frame=True)
df.head()
Out[2]:
|   | SequenceName | mcg | gvh | lip | chg | aac | alm1 | alm2 | class |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AAT_ECOLI | 0.49 | 0.29 | 0.48 | 0.5 | 0.56 | 0.24 | 0.35 | cp |
| 1 | ACEA_ECOLI | 0.07 | 0.40 | 0.48 | 0.5 | 0.54 | 0.35 | 0.44 | cp |
| 2 | ACEK_ECOLI | 0.56 | 0.40 | 0.48 | 0.5 | 0.49 | 0.37 | 0.46 | cp |
| 3 | ACKA_ECOLI | 0.59 | 0.49 | 0.48 | 0.5 | 0.52 | 0.45 | 0.36 | cp |
| 4 | ADI_ECOLI | 0.23 | 0.32 | 0.48 | 0.5 | 0.55 | 0.25 | 0.35 | cp |
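Each row describes an E. coli protein by seven sequence-derived features (mcg, gvh, lip, chg, aac, alm1, alm2); the target class is the protein's cellular localization site (e.g. cp = cytoplasm, om = outer membrane).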
In [3]:
# imbalance of the classes
df["class"].value_counts()
Out[3]:
class
cp     143
im      77
pp      52
imU     35
om      20
omL      5
imS      2
imL      2
Name: count, dtype: int64
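The three rarest classes (omL, imS, imL) have five or fewer observations each, too few to survive the stratified train/valid/test split below, so the next cell drops them. A quick way to list them (the 10-sample threshold is an ad-hoc choice for this sketch):

counts = df["class"].value_counts()
counts[counts < 10].index.tolist()  # ['omL', 'imS', 'imL']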
In [4]:
df = df.loc[~df["class"].isin(["omL", "imS", "imL"])]
df.reset_index(inplace=True, drop=True)
In [5]:
encoder = LabelEncoder(["class"])
df_enc = encoder.fit_transform(df)
df_enc["class"] = df_enc["class"] - 1
In [6]:
# drop columns we won't need in this example
df_enc = df_enc.drop(columns=["SequenceName"])
In [7]:
df_train, df_valid = train_test_split(
df_enc, test_size=0.2, stratify=df_enc["class"], random_state=1
)
df_valid, df_test = train_test_split(
df_valid, test_size=0.5, stratify=df_valid["class"], random_state=1
)
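The two splits above produce an 80/10/10 train/valid/test partition; with stratify the class proportions should be roughly preserved in each subset. A quick check, purely illustrative:

for name, frame in [("train", df_train), ("valid", df_valid), ("test", df_test)]:
    print(name, frame["class"].value_counts(normalize=True).round(2).to_dict())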
Preparing the data¶
In [8]:
continuous_cols = df_enc.drop(columns=["class"]).columns.values.tolist()
In [9]:
# deeptabular
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, scale=True)
X_tab_train = tab_preprocessor.fit_transform(df_train)
X_tab_valid = tab_preprocessor.transform(df_valid)
X_tab_test = tab_preprocessor.transform(df_test)
# target
y_train = df_train["class"].values
y_valid = df_valid["class"].values
y_test = df_test["class"].values
X_train = {"X_tab": X_tab_train, "target": y_train}
X_val = {"X_tab": X_tab_valid, "target": y_valid}
/Users/javierrodriguezzaurin/Projects/pytorch-widedeep/pytorch_widedeep/preprocessing/tab_preprocessor.py:295: DeprecationWarning: 'scale' and 'already_standard' will be deprecated in the next release. Please use 'cols_to_scale' instead
  self._check_inputs(cat_embed_cols)
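The warning is harmless here, but per its suggestion the forward-compatible call would presumably look like this (run in place of the first line of the cell above):

# 'cols_to_scale' replaces the deprecated 'scale' argument
tab_preprocessor = TabPreprocessor(
    continuous_cols=continuous_cols, cols_to_scale=continuous_cols
)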
Define the model¶
In [10]:
deeptabular = TabMlp(
column_idx=tab_preprocessor.column_idx,
continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular, pred_dim=df_enc["class"].nunique())
model
Out[10]:
WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cont_norm): Identity()
      (encoder): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Linear(in_features=7, out_features=200, bias=True)
            (1): ReLU(inplace=True)
            (2): Dropout(p=0.1, inplace=False)
          )
          (dense_layer_1): Sequential(
            (0): Linear(in_features=200, out_features=100, bias=True)
            (1): ReLU(inplace=True)
            (2): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (1): Linear(in_features=100, out_features=5, bias=True)
  )
)
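Note the final Linear(in_features=100, out_features=5, bias=True): WideDeep appends this head so that the output dimension matches pred_dim, i.e. one logit per remaining class.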
In [11]:
auroc = AUROC(num_classes=df_enc["class"].nunique(), task="multiclass")
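With task="multiclass", torchmetrics computes a one-vs-rest AUC per class and macro-averages them by default. A toy call, only to illustrate the expected shapes (reset afterwards so no state leaks into training):

import torch

toy_probs = torch.softmax(torch.randn(8, 5), dim=1)  # (n_samples, n_classes)
toy_target = torch.tensor([0, 1, 2, 3, 4, 0, 1, 2])  # integer labels in [0, 5)
print(auroc(toy_probs, toy_target))
auroc.reset()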
In [12]:
# Optimizers
deep_opt = SGD(model.deeptabular.parameters(), lr=0.1)
# LR Scheduler
deep_sch = lr_scheduler.StepLR(deep_opt, step_size=3)
# Trainer
trainer = Trainer(
model,
objective="multiclass_focal_loss",
lr_schedulers={"deeptabular": deep_sch},
initializers={"deeptabular": XavierNormal},
optimizers={"deeptabular": deep_opt},
metrics=[auroc],
)
trainer.fit(X_train=X_train, X_val=X_val, n_epochs=5, batch_size=50)
epoch 1: 100%|███████████████████████████████████████████████████| 6/6 [00:00<00:00, 54.59it/s, loss=0.109, metrics={'MulticlassAUROC': 0.314}]
valid: 100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 98.35it/s, loss=0.105, metrics={'MulticlassAUROC': 0.2558}]
epoch 2: 100%|██████████████████████████████████████████████████| 6/6 [00:00<00:00, 91.55it/s, loss=0.105, metrics={'MulticlassAUROC': 0.3546}]
valid: 100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00, 111.68it/s, loss=0.101, metrics={'MulticlassAUROC': 0.2737}]
epoch 3: 100%|████████████████████████████████████████████████████| 6/6 [00:00<00:00, 62.55it/s, loss=0.1, metrics={'MulticlassAUROC': 0.3795}]
valid: 100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 108.51it/s, loss=0.0966, metrics={'MulticlassAUROC': 0.3053}]
epoch 4: 100%|█████████████████████████████████████████████████| 6/6 [00:00<00:00, 99.35it/s, loss=0.0965, metrics={'MulticlassAUROC': 0.3809}]
valid: 100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 117.73it/s, loss=0.0962, metrics={'MulticlassAUROC': 0.3089}]
epoch 5: 100%|████████████████████████████████████████████████| 6/6 [00:00<00:00, 110.56it/s, loss=0.0967, metrics={'MulticlassAUROC': 0.3509}]
valid: 100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 127.35it/s, loss=0.0958, metrics={'MulticlassAUROC': 0.3089}]
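The test split has not been touched yet. A minimal sketch of scoring it with the same metric (Trainer.predict_proba returns a numpy array of shape (n_samples, n_classes)):

import torch

probs_test = trainer.predict_proba(X_tab=X_tab_test)
auroc.reset()  # clear any state accumulated during training
print(auroc(torch.from_numpy(probs_test), torch.from_numpy(y_test)))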