
deeptabular utils

LabelEncoder

LabelEncoder(columns_to_encode=None, with_attention=False, shared_embed=False)

Label-encode categorical values for multiple columns at once.

ℹ️ NOTE: LabelEncoder reserves 0 for unseen categories. This is convenient when defining the embedding layers, since we can simply set padding_idx to 0.
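
For instance, a downstream embedding layer can reserve index 0 for these unseen categories. A minimal sketch of the idea (hypothetical sizes, using a plain PyTorch embedding rather than any specific pytorch_widedeep layer):

>>> import torch
>>> import torch.nn as nn
>>> # hypothetical column with 10 seen categories -> indices 1..10, index 0 reserved for unseen
>>> emb = nn.Embedding(num_embeddings=11, embedding_dim=8, padding_idx=0)
>>> codes = torch.tensor([0, 3, 7])  # 0 encodes an unseen category
>>> emb(codes).shape  # row 0 maps to the (all-zero) padding vector
torch.Size([3, 8])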

Parameters:

  • columns_to_encode (Optional[List[str]], default: None ) –

    List of strings containing the names of the columns to encode. If None all columns of type object in the dataframe will be label encoded.

  • with_attention (bool, default: False ) –

    Boolean indicating whether the preprocessed data will be passed to an attention-based model. Aliased as for_transformer.

  • shared_embed (bool, default: False ) –

    Boolean indicating if the embeddings will be "shared" when using attention-based models. The idea behind shared_embed is described in Appendix A of the TabTransformer paper: 'The goal of having column embedding is to enable the model to distinguish the classes in one column from those in the other columns'. In other words, the idea is to let the model learn which column is being embedded at any given time. See: pytorch_widedeep.models.transformers._layers.SharedEmbeddings.

Attributes:

  • encoding_dict (Dict) –

    Dictionary containing the encoding mappings, e.g.:
    {'colname1': {'cat1': 1, 'cat2': 2, ...}, 'colname2': {'cat1': 1, 'cat2': 2, ...}, ...}

  • inverse_encoding_dict (Dict) –

    Dictionary containing the inverse encoding mappings, e.g.:
    {'colname1': {1: 'cat1', 2: 'cat2', ...}, 'colname2': {1: 'cat1', 2: 'cat2', ...}, ...}

Source code in pytorch_widedeep/utils/deeptabular_utils.py
@alias("with_attention", ["for_transformer"])
def __init__(
    self,
    columns_to_encode: Optional[List[str]] = None,
    with_attention: bool = False,
    shared_embed: bool = False,
):
    self.columns_to_encode = columns_to_encode

    self.shared_embed = shared_embed
    self.with_attention = with_attention

    self.reset_embed_idx = not self.with_attention or self.shared_embed
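
To see the effect of this index logic (a toy illustration derived from the code above, not part of the library's own docs): with with_attention=True and shared_embed=False the counter is not reset per column, so categories across columns receive consecutive indices.

>>> import pandas as pd
>>> from pytorch_widedeep.utils import LabelEncoder
>>> toy_df = pd.DataFrame({'col1': ['a', 'b'], 'col2': ['x', 'y']})
>>> attn_encoder = LabelEncoder(['col1', 'col2'], with_attention=True)
>>> attn_encoder.fit(toy_df).encoding_dict
{'col1': {'a': 1, 'b': 2}, 'col2': {'x': 3, 'y': 4}}

With with_attention=False (the default), each column would instead be encoded starting from 1.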

partial_fit

partial_fit(df)

Main method. Creates encoding attributes.

Returns:

  • LabelEncoder

    LabelEncoder fitted object

Source code in pytorch_widedeep/utils/deeptabular_utils.py
def partial_fit(self, df: pd.DataFrame) -> "LabelEncoder":  # noqa: C901
    """Main method. Creates encoding attributes.

    Returns
    -------
    LabelEncoder
        `LabelEncoder` fitted object
    """
    # here df is a chunk of the data. this is meant to be run when the
    # data is large and we pass a chunk at a time. Therefore, we do not
    # copy the input chunk as mutating a chunk is ok
    if self.columns_to_encode is None:
        self.columns_to_encode = list(df.select_dtypes(include=["object"]).columns)
    else:
        # sanity check to make sure all categorical columns are in an adequate
        # format
        for col in self.columns_to_encode:
            df[col] = df[col].astype("O")

    unique_column_vals: Dict[str, List[str]] = {}
    for c in self.columns_to_encode:
        unique_column_vals[c] = df[c].unique().tolist()

    if not hasattr(self, "encoding_dict"):
        # we run the method 'partial_fit' for the 1st time
        self.encoding_dict: Dict[str, Dict[str, int]] = {}
        if "cls_token" in unique_column_vals and self.shared_embed:
            self.encoding_dict["cls_token"] = {"[CLS]": 0}
            del unique_column_vals["cls_token"]

        # leave 0 for padding/"unseen" categories. Also we need an
        # attribute to keep track of the encoding in case we use
        # attention and we do not re-start the index/counter
        self.cum_idx: int = 1
        for k, v in unique_column_vals.items():
            self.encoding_dict[k] = {o: i + self.cum_idx for i, o in enumerate(v)}
            self.cum_idx = 1 if self.reset_embed_idx else self.cum_idx + len(v)
    else:
        # the 'partial_fit' method has already run.
        # "cls_token" will have been added already
        if "cls_token" in unique_column_vals and self.shared_embed:
            del unique_column_vals["cls_token"]

        # Classes in the new df/chunk of the dataset that have not been seen
        # before
        unseen_classes: Dict[str, List[str]] = {}
        for c in self.columns_to_encode:
            unseen_classes[c] = list(
                np.setdiff1d(
                    unique_column_vals[c], list(self.encoding_dict[c].keys())
                )
            )

        # leave 0 for padding/"unseen" categories
        for k, v in unique_column_vals.items():
            # if we use attention we need to start encoding from the
            # last 'overall' encoding index. Otherwise, we use the max
            # encoding index per categorical col
            _idx = (
                max(self.encoding_dict[k].values()) + 1
                if self.reset_embed_idx
                else self.cum_idx
            )
            if len(unseen_classes[k]) != 0:
                for i, o in enumerate(unseen_classes[k]):
                    if o not in self.encoding_dict[k]:
                        self.encoding_dict[k][o] = i + _idx
                # if self.reset_embed_idx is True it will be 1 anyway
                self.cum_idx = (
                    1
                    if self.reset_embed_idx
                    else self.cum_idx + len(unseen_classes[k])
                )

    return self
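
A sketch of how partial_fit accumulates the encodings over chunks (toy chunks, illustrative only):

>>> import pandas as pd
>>> from pytorch_widedeep.utils import LabelEncoder
>>> chunk1 = pd.DataFrame({'col': ['a', 'b']})
>>> chunk2 = pd.DataFrame({'col': ['b', 'c']})
>>> chunk_encoder = LabelEncoder(['col'])
>>> chunk_encoder.partial_fit(chunk1).partial_fit(chunk2).encoding_dict
{'col': {'a': 1, 'b': 2, 'c': 3}}

Note that, unlike fit, partial_fit does not copy the incoming chunk, so the chunk's column dtypes may be mutated in place.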

fit

fit(df)

Simply runs the partial_fit method when the data fits in memory

Returns:

  • LabelEncoder

    LabelEncoder fitted object

Source code in pytorch_widedeep/utils/deeptabular_utils.py
def fit(self, df: pd.DataFrame) -> "LabelEncoder":
    """Simply runs the `partial_fit` method when the data fits in memory

    Returns
    -------
    LabelEncoder
        `LabelEncoder` fitted object
    """
    # this is meant to be run when the data fits in memory and therefore,
    # we do not want to mutate the original df, so we copy it
    self.partial_fit(df.copy())

    self.inverse_encoding_dict = self.create_inverse_encoding_dict()

    return self

transform

transform(df)

Label-encode the categories in columns_to_encode

Returns:

  • DataFrame

    label-encoded dataframe

Source code in pytorch_widedeep/utils/deeptabular_utils.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Label Encoded the categories in `columns_to_encode`

    Returns
    -------
    pd.DataFrame
        label-encoded dataframe
    """
    try:
        self.encoding_dict
    except AttributeError:
        raise NotFittedError(
            "This LabelEncoder instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before using this LabelEncoder."
        )

    df_inp = df.copy()
    # sanity check to make sure all categorical columns are in an adequate
    # format
    for col in self.columns_to_encode:  # type: ignore
        df_inp[col] = df_inp[col].astype("O")

    for k, v in self.encoding_dict.items():
        df_inp[k] = df_inp[k].apply(lambda x: v[x] if x in v.keys() else 0)

    return df_inp
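
As the code above shows, categories not seen during fit are encoded as 0. A small illustration (toy data):

>>> import pandas as pd
>>> from pytorch_widedeep.utils import LabelEncoder
>>> train_df = pd.DataFrame({'col': ['me', 'you']})
>>> encoder = LabelEncoder(['col'])
>>> _ = encoder.fit(train_df)
>>> encoder.transform(pd.DataFrame({'col': ['you', 'them']}))  # 'them' was never seen
   col
0    2
1    0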

fit_transform

fit_transform(df)

Combines fit and transform

Examples:

>>> import pandas as pd
>>> from pytorch_widedeep.utils import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> encoder.fit_transform(df)
   col1  col2
0     1     1
1     2     2
2     3     3
>>> encoder.encoding_dict
{'col2': {'me': 1, 'you': 2, 'him': 3}}

Returns:

  • DataFrame

    label-encoded dataframe

Source code in pytorch_widedeep/utils/deeptabular_utils.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Combines `fit` and `transform`

    Examples
    --------

    >>> import pandas as pd
    >>> from pytorch_widedeep.utils import LabelEncoder
    >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
    >>> columns_to_encode = ['col2']
    >>> encoder = LabelEncoder(columns_to_encode)
    >>> encoder.fit_transform(df)
       col1  col2
    0     1     1
    1     2     2
    2     3     3
    >>> encoder.encoding_dict
    {'col2': {'me': 1, 'you': 2, 'him': 3}}

    Returns
    -------
    pd.DataFrame
        label-encoded dataframe
    """
    return self.fit(df).transform(df)

inverse_transform

inverse_transform(df)

Returns the original categories

Examples:

>>> import pandas as pd
>>> from pytorch_widedeep.utils import LabelEncoder
>>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
>>> columns_to_encode = ['col2']
>>> encoder = LabelEncoder(columns_to_encode)
>>> df_enc = encoder.fit_transform(df)
>>> encoder.inverse_transform(df_enc)
   col1 col2
0     1   me
1     2  you
2     3  him

Returns:

  • DataFrame

    DataFrame with original categories

Source code in pytorch_widedeep/utils/deeptabular_utils.py
def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Returns the original categories

    Examples
    --------

    >>> import pandas as pd
    >>> from pytorch_widedeep.utils import LabelEncoder
    >>> df = pd.DataFrame({'col1': [1,2,3], 'col2': ['me', 'you', 'him']})
    >>> columns_to_encode = ['col2']
    >>> encoder = LabelEncoder(columns_to_encode)
    >>> df_enc = encoder.fit_transform(df)
    >>> encoder.inverse_transform(df_enc)
       col1 col2
    0     1   me
    1     2  you
    2     3  him

    Returns
    -------
    pd.DataFrame
        DataFrame with original categories
    """

    if not hasattr(self, "inverse_encoding_dict"):
        self.inverse_encoding_dict = self.create_inverse_encoding_dict()

    for k, v in self.inverse_encoding_dict.items():
        df[k] = df[k].apply(lambda x: v[x])

    return df