ZILNLoss¶
[DISCLAIMER]
The purpose of this notebook is to check whether the ZILN loss, as originally implemented in Keras, gives the same results as the pytorch-widedeep implementation.
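For context: the zero-inflated lognormal (ZILN) loss comes from Wang et al. (2019), "A Deep Probabilistic Model for Customer Lifetime Value Prediction", the paper behind the lifetime_value repo used below. It models LTV with a point mass at zero and a lognormal distribution over positive values, so the network emits three logits per example: a return-probability logit plus the lognormal location and (pre-softplus) scale. Writing p for the predicted probability that y > 0, the per-example loss is

$$\mathcal{L}_{\mathrm{ZILN}}(y) = -\mathbb{1}\{y = 0\}\log(1 - p) - \mathbb{1}\{y > 0\}\big(\log p + \log f_{\mathrm{LN}}(y;\,\mu,\sigma)\big),$$

i.e. a binary cross-entropy on "did the donor give at all" plus a lognormal negative log-likelihood on the positive amounts.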
In [1]:
# @title Copyright 2019 The Lifetime Value Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
KDD Cup 98 LTV Prediction¶
In [3]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
from typing import Sequence
# install and import ltv
!pip install -q git+https://github.com/google/lifetime_value
import lifetime_value as ltv
In [ ]:
tfd = tfp.distributions
%config InlineBackend.figure_format='retina'
sns.set_style("whitegrid")
Configs¶
In [ ]:
MODEL = "dnn"
LOSS = "ziln" # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']
LEARNING_RATE = 0.001 # @param { isTemplate: true}
VERSION = 0 # @param { isTemplate: true, type: 'integer'}
OUTPUT_CSV_FOLDER = "/tmp/lifetime-value/kdd_cup_98/result" # @param { isTemplate: true, type: 'string'}
MODEL = "dnn"
LOSS = "ziln" # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']
LEARNING_RATE = 0.001 # @param { isTemplate: true}
VERSION = 0 # @param { isTemplate: true, type: 'integer'}
OUTPUT_CSV_FOLDER = "/tmp/lifetime-value/kdd_cup_98/result" # @param { isTemplate: true, type: 'string'}
Load data¶
Download the KDD Cup 98 data to /tmp/lifetime-value/kdd_cup_98
In [ ]:
%%bash
mkdir -p /tmp/lifetime-value/kdd_cup_98
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P /tmp/lifetime-value/kdd_cup_98/
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P /tmp/lifetime-value/kdd_cup_98/
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P /tmp/lifetime-value/kdd_cup_98/
cd /tmp/lifetime-value/kdd_cup_98/
unzip cup98lrn.zip
unzip cup98val.zip
In [ ]:
df_train = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/cup98LRN.txt")
num_train = df_train.shape[0]
df_eval = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/cup98VAL.txt")
df_eval_target = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/valtargt.txt")
df_eval = df_eval.merge(df_eval_target, on="CONTROLN")
In [ ]:
df = pd.concat([df_train, df_eval], axis=0, sort=True)
Label distribution¶
In [ ]:
y = df["TARGET_D"][:num_train]
y = df["TARGET_D"][:num_train]
In [ ]:
def plot_hist_log_scale(y):
max_val = y.max() + 1.0
ax = pd.Series(y).hist(
figsize=(8, 5), bins=10 ** np.linspace(0.0, np.log10(max_val), 20)
)
plt.xlabel("Donation ($)")
plt.ylabel("Count")
# plt.title('Histogram of LTV')
plt.xticks(rotation="horizontal")
plt.legend(loc="upper left")
ax.set_xscale("log")
ax.grid(False)
# Hide the right and top spines
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position("left")
ax.xaxis.set_ticks_position("bottom")
plt.show()
fig = ax.get_figure()
output_file = tf.io.gfile.GFile(
"/tmp/lifetime-value/kdd_cup_98/histogram_kdd98_log_scale.pdf", "wb"
)
fig.savefig(output_file, bbox_inches="tight", format="pdf")
In [ ]:
plot_hist_log_scale(y[y > 0])
Preprocess features¶
Vocab¶
In [ ]:
VOCAB_FEATURES = [
"ODATEDW", # date of donor's first gift (YYMM)
"OSOURCE", # donor acquisition mailing list
"TCODE", # donor title code
"STATE",
"ZIP",
"DOMAIN", # urbanicity level and socio-economic status of the neighborhood
"CLUSTER", # socio-economic status
"GENDER",
"MAXADATE", # date of the most recent promotion received
"MINRDATE",
"LASTDATE",
"FISTDATE",
"RFA_2A",
]
In [ ]:
df["ODATEDW"] = df["ODATEDW"].astype("str")
df["TCODE"] = df["TCODE"].apply(lambda x: "{:03d}".format(x // 1000 if x > 1000 else x))
df["ZIP"] = df["ZIP"].str.slice(0, 5)
df["MAXADATE"] = df["MAXADATE"].astype("str")
df["MINRDATE"] = df["MINRDATE"].astype("str")
df["LASTDATE"] = df["LASTDATE"].astype("str")
df["FISTDATE"] = df["FISTDATE"].astype("str")
df["ODATEDW"] = df["ODATEDW"].astype("str")
df["TCODE"] = df["TCODE"].apply(lambda x: "{:03d}".format(x // 1000 if x > 1000 else x))
df["ZIP"] = df["ZIP"].str.slice(0, 5)
df["MAXADATE"] = df["MAXADATE"].astype("str")
df["MINRDATE"] = df["MINRDATE"].astype("str")
df["LASTDATE"] = df["LASTDATE"].astype("str")
df["FISTDATE"] = df["FISTDATE"].astype("str")
In [ ]:
def label_encoding(y, frequency_threshold=100):
value_counts = pd.value_counts(y)
categories = value_counts[value_counts >= frequency_threshold].index.to_numpy()
# 0 indicates the unknown category.
return pd.Categorical(y, categories=categories).codes + 1
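A toy illustration (ours, not from the original notebook) of what label_encoding does: frequent categories get codes 1, 2, ..., and anything below the frequency threshold collapses to the unknown code 0.

s = pd.Series(["a"] * 150 + ["b"] * 120 + ["c"] * 5)
label_encoding(s, frequency_threshold=100)  # "a" -> 1, "b" -> 2, rare "c" -> 0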
In [ ]:
for key in VOCAB_FEATURES:
df[key] = label_encoding(df[key])
Indicator¶
In [ ]:
MAIL_ORDER_RESPONSES = [
"MBCRAFT",
"MBGARDEN",
"MBBOOKS",
"MBCOLECT",
"MAGFAML",
"MAGFEM",
"MAGMALE",
"PUBGARDN",
"PUBCULIN",
"PUBHLTH",
"PUBDOITY",
"PUBNEWFN",
"PUBPHOTO",
"PUBOPP",
"RFA_2F",
]
In [ ]:
INDICATOR_FEATURES = [
"AGE", # age decile, 0 indicates unknown
"NUMCHLD",
"INCOME",
"WEALTH1",
"HIT",
] + MAIL_ORDER_RESPONSES
In [ ]:
df["AGE"] = pd.qcut(df["AGE"].values, 10).codes + 1
df["NUMCHLD"] = df["NUMCHLD"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["INCOME"] = df["INCOME"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["WEALTH1"] = df["WEALTH1"].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)
df["HIT"] = pd.qcut(df["HIT"].values, q=50, duplicates="drop").codes
for col in MAIL_ORDER_RESPONSES:
df[col] = pd.qcut(df[col].values, q=20, duplicates="drop").codes + 1
df["AGE"] = pd.qcut(df["AGE"].values, 10).codes + 1
df["NUMCHLD"] = df["NUMCHLD"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["INCOME"] = df["INCOME"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["WEALTH1"] = df["WEALTH1"].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)
df["HIT"] = pd.qcut(df["HIT"].values, q=50, duplicates="drop").codes
for col in MAIL_ORDER_RESPONSES:
df[col] = pd.qcut(df[col].values, q=20, duplicates="drop").codes + 1
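For intuition, a toy example (again ours) of the quantile bucketing above: pd.qcut assigns quantile codes starting at 0, and the + 1 shift reserves 0 for unknown or missing values.

pd.qcut(np.arange(100), q=10, duplicates="drop").codes + 1  # ten equal buckets, codes 1..10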
Numeric¶
In [ ]:
NUMERIC_FEATURES = [
# binary
"MAILCODE", # bad address
"NOEXCH", # do not exchange
"RECINHSE", # donor has given to PVA's in house program
"RECP3", # donor has given to PVA's P3 program
"RECPGVG", # planned giving record
"RECSWEEP", # sweepstakes record
"HOMEOWNR", # home owner
"CHILD03",
"CHILD07",
"CHILD12",
"CHILD18",
# continuous
"CARDPROM",
"NUMPROM",
"CARDPM12",
"NUMPRM12",
"RAMNTALL",
"NGIFTALL",
"MINRAMNT",
"MAXRAMNT",
"LASTGIFT",
"AVGGIFT",
]
In [ ]:
df["MAILCODE"] = (df["MAILCODE"] == "B").astype("float32")
df["PVASTATE"] = df["PVASTATE"].isin(["P", "E"]).astype("float32")
df["NOEXCH"] = df["NOEXCH"].isin(["X", "1"]).astype("float32")
df["RECINHSE"] = (df["RECINHSE"] == "X").astype("float32")
df["RECP3"] = (df["RECP3"] == "X").astype("float32")
df["RECPGVG"] = (df["RECPGVG"] == "X").astype("float32")
df["RECSWEEP"] = (df["RECSWEEP"] == "X").astype("float32")
df["HOMEOWNR"] = (df["HOMEOWNR"] == "H").astype("float32")
df["CHILD03"] = df["CHILD03"].isin(["M", "F", "B"]).astype("float32")
df["CHILD07"] = df["CHILD07"].isin(["M", "F", "B"]).astype("float32")
df["CHILD12"] = df["CHILD12"].isin(["M", "F", "B"]).astype("float32")
df["CHILD18"] = df["CHILD18"].isin(["M", "F", "B"]).astype("float32")
df["CARDPROM"] = df["CARDPROM"] / 100
df["NUMPROM"] = df["NUMPROM"] / 100
df["CARDPM12"] = df["CARDPM12"] / 100
df["NUMPRM12"] = df["NUMPRM12"] / 100
df["RAMNTALL"] = np.log1p(df["RAMNTALL"])
df["NGIFTALL"] = np.log1p(df["NGIFTALL"])
df["MINRAMNT"] = np.log1p(df["MINRAMNT"])
df["MAXRAMNT"] = np.log1p(df["MAXRAMNT"])
df["LASTGIFT"] = np.log1p(df["LASTGIFT"])
df["AVGGIFT"] = np.log1p(df["AVGGIFT"])
df["MAILCODE"] = (df["MAILCODE"] == "B").astype("float32")
df["PVASTATE"] = df["PVASTATE"].isin(["P", "E"]).astype("float32")
df["NOEXCH"] = df["NOEXCH"].isin(["X", "1"]).astype("float32")
df["RECINHSE"] = (df["RECINHSE"] == "X").astype("float32")
df["RECP3"] = (df["RECP3"] == "X").astype("float32")
df["RECPGVG"] = (df["RECPGVG"] == "X").astype("float32")
df["RECSWEEP"] = (df["RECSWEEP"] == "X").astype("float32")
df["HOMEOWNR"] = (df["HOMEOWNR"] == "H").astype("float32")
df["CHILD03"] = df["CHILD03"].isin(["M", "F", "B"]).astype("float32")
df["CHILD07"] = df["CHILD07"].isin(["M", "F", "B"]).astype("float32")
df["CHILD12"] = df["CHILD12"].isin(["M", "F", "B"]).astype("float32")
df["CHILD18"] = df["CHILD18"].isin(["M", "F", "B"]).astype("float32")
df["CARDPROM"] = df["CARDPROM"] / 100
df["NUMPROM"] = df["NUMPROM"] / 100
df["CARDPM12"] = df["CARDPM12"] / 100
df["NUMPRM12"] = df["NUMPRM12"] / 100
df["RAMNTALL"] = np.log1p(df["RAMNTALL"])
df["NGIFTALL"] = np.log1p(df["NGIFTALL"])
df["MINRAMNT"] = np.log1p(df["MINRAMNT"])
df["MAXRAMNT"] = np.log1p(df["MAXRAMNT"])
df["LASTGIFT"] = np.log1p(df["LASTGIFT"])
df["AVGGIFT"] = np.log1p(df["AVGGIFT"])
All¶
In [ ]:
CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES
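A quick illustrative sanity check of the feature inventory the model will consume: 13 vocab plus 20 indicator columns go through embeddings, and 21 columns are fed as raw numerics.

len(CATEGORICAL_FEATURES), len(NUMERIC_FEATURES)  # (33, 21)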
Train/eval split¶
In [ ]:
def dnn_split(df):
df_train = df.iloc[:num_train]
df_eval = df.iloc[num_train:]
def feature_dict(df):
features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}
features["numeric"] = df[NUMERIC_FEATURES].astype("float32").values
return features
x_train, y_train = (
feature_dict(df_train),
df_train["TARGET_D"].astype("float32").values,
)
x_eval, y_eval = feature_dict(df_eval), df_eval["TARGET_D"].astype("float32").values
return x_train, x_eval, y_train, y_eval
Model¶
In [ ]:
def embedding_dim(x):
return int(x**0.25) + 1
def embedding_layer(vocab_size):
return tf.keras.Sequential(
[
tf.keras.layers.Embedding(
input_dim=vocab_size,
output_dim=embedding_dim(vocab_size),
input_length=1,
),
tf.keras.layers.Flatten(),
]
)
def dnn_model(output_units):
numeric_input = tf.keras.layers.Input(
shape=(len(NUMERIC_FEATURES),), name="numeric"
)
embedding_inputs = [
tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)
for key in CATEGORICAL_FEATURES
]
embedding_outputs = [
embedding_layer(vocab_size=df[key].max() + 1)(input)
for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)
]
deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)
deep_model = tf.keras.Sequential(
[
tf.keras.layers.Dense(128, activation="relu"),
tf.keras.layers.Dense(128, activation="relu"),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(units=output_units),
]
)
return tf.keras.Model(
inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input)
)
Loss¶
In [ ]:
if LOSS == "mse":
loss = tf.keras.losses.MeanSquaredError()
output_units = 1
if LOSS == "ziln":
loss = ltv.zero_inflated_lognormal_loss
output_units = 3
if LOSS == "mse":
loss = tf.keras.losses.MeanSquaredError()
output_units = 1
if LOSS == "ziln":
loss = ltv.zero_inflated_lognormal_loss
output_units = 3
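To make the comparison concrete, here is a minimal PyTorch sketch of the same loss math (illustrative only; the pytorch-widedeep run below uses the library's built-in ziln objective, not this function). It also shows why output_units is 3 for ziln: one churn logit plus the two lognormal parameters.

import torch
import torch.nn.functional as F

def ziln_loss_sketch(logits: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # logits: (batch, 3) = [return-probability logit, lognormal loc, raw scale]
    positive = (y > 0).float()
    p_logit, loc = logits[:, 0], logits[:, 1]
    scale = F.softplus(logits[:, 2]).clamp(min=1e-7)  # keep scale strictly positive
    # classification term: did the donor give at all?
    class_loss = F.binary_cross_entropy_with_logits(p_logit, positive, reduction="none")
    # regression term: lognormal NLL, counted only for positive labels
    safe_y = y.clamp(min=1e-7)  # log_prob is undefined at 0
    reg_loss = -positive * torch.distributions.LogNormal(loc, scale).log_prob(safe_y)
    return (class_loss + reg_loss).mean()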
In [ ]:
x_train, x_eval, y_train, y_eval = dnn_split(df)
model = dnn_model(output_units)
In [ ]:
model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=LEARNING_RATE), loss=loss)
Train¶
In [ ]:
callbacks = [
tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", min_lr=1e-6),
tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10),
]
In [ ]:
history = model.fit(
x=x_train,
y=y_train,
batch_size=2048,
epochs=200,
verbose=2,
callbacks=callbacks,
validation_data=(x_eval, y_eval),
).history
In [ ]:
pd.DataFrame(history)[["loss", "val_loss"]].plot();
Eval¶
In [ ]:
if LOSS == "mse":
y_pred = model.predict(x=x_eval, batch_size=1024).flatten()
if LOSS == "ziln":
logits = model.predict(x=x_eval, batch_size=1024)
y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
if LOSS == "mse":
y_pred = model.predict(x=x_eval, batch_size=1024).flatten()
if LOSS == "ziln":
logits = model.predict(x=x_eval, batch_size=1024)
y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
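For intuition: up to implementation details we have not re-verified, ltv.zero_inflated_lognormal_pred returns the mean of the ZILN distribution, E[Y] = p * exp(mu + sigma^2 / 2). A numpy sketch:

from scipy.special import expit  # sigmoid

def ziln_pred_sketch(logits):
    p = expit(logits[:, 0])                  # P(y > 0)
    loc = logits[:, 1]                       # lognormal location
    scale = np.logaddexp(0.0, logits[:, 2])  # softplus, matching the loss
    return p * np.exp(loc + 0.5 * scale**2)  # lognormal mean, gated by p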
Pytorch-widedeep approach¶
In [ ]:
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.callbacks import EarlyStopping
from torch.optim import NAdam
In [ ]:
# name the 21 stacked numeric columns so they can live in a DataFrame
NUMERICAL_FEATURES = ["num" + str(i) for i in range(len(NUMERIC_FEATURES))]
x_train_pyt_num = pd.DataFrame(columns=NUMERICAL_FEATURES, data=x_train["numeric"])
x_train_pyt_cat = pd.DataFrame(
{key: value for key, value in x_train.items() if key not in ["numeric"]}
)
x_eval_pyt_num = pd.DataFrame(columns=NUMERICAL_FEATURES, data=x_eval["numeric"])
x_eval_pyt_cat = pd.DataFrame(
{key: value for key, value in x_eval.items() if key not in ["numeric"]}
)
In [ ]:
x_train_pyt = pd.concat([x_train_pyt_num, x_train_pyt_cat], axis=1)
x_eval_pyt = pd.concat([x_eval_pyt_num, x_eval_pyt_cat], axis=1)
In [ ]:
embed_input = [
(u, int(x_train_pyt[u].nunique() ** 0.25) + 1) for u in CATEGORICAL_FEATURES
]
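This mirrors the embedding_dim heuristic of the Keras model above (fourth root of the cardinality, plus one). One caveat worth noting: the Keras model sized each vocabulary as df[key].max() + 1 whereas nunique() is used here, so the two agree only when every code actually occurs. A quick illustrative check:

assert int(50 ** 0.25) + 1 == embedding_dim(50) == 3  # a 50-category column gets a 3-dim embedding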
In [ ]:
# deeptabular
tab_preprocessor = TabPreprocessor(
embed_cols=embed_input,
continuous_cols=NUMERICAL_FEATURES,
shared_embed=False,
scale=False,
)
X_tab_train = tab_preprocessor.fit_transform(x_train_pyt)
X_tab_valid = tab_preprocessor.transform(x_eval_pyt)
X_tab_test = tab_preprocessor.transform(x_eval_pyt)
# target (the test split here is the same eval split transformed above)
y_valid = y_eval
y_test = y_eval
X_train = {"X_tab": X_tab_train, "target": y_train}
X_val = {"X_tab": X_tab_valid, "target": y_valid}
X_test = {"X_tab": X_tab_test}
deeptabular = TabMlp(
mlp_hidden_dims=[128, 128, 64, 64],
column_idx=tab_preprocessor.column_idx,
embed_input=tab_preprocessor.cat_embed_input,
continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular, pred_dim=3)
deep_opt = NAdam(model.deeptabular.parameters(), lr=LEARNING_RATE)
callbacks = [EarlyStopping()]
deep_sch = ReduceLROnPlateau(deep_opt, min_lr=1e-6)
objective = "ziln"
trainer = Trainer(
model,
callbacks=callbacks,
lr_schedulers={"deeptabular": deep_sch},
objective=objective,
optimizers={"deeptabular": deep_opt},
)
trainer.fit(
X_train=X_train,
X_val=X_val,
n_epochs=200,
batch_size=2048,
)
y_pred_pytorch = trainer.predict(X_test=X_test)
In [ ]:
pd.DataFrame(trainer.history)[["train_loss", "val_loss"]].plot();
In [ ]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred, y_pred_pytorch)
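Beyond raw MSE between the point predictions, a quick supplementary check (not in the original notebook) is whether the two implementations rank donors the same way:

# rank agreement between the Keras and pytorch-widedeep predictions
stats.spearmanr(y_pred, np.asarray(y_pred_pytorch).flatten())[0]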
Appendix¶
Total Profit¶
In [ ]:
unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]
In [ ]:
num_mailed = [np.sum(y_pred > v) for v in unit_costs]
num_mailed
In [ ]:
baseline_total_profit = np.sum(y_eval - 0.68)
baseline_total_profit
In [ ]:
total_profits = [np.sum(y_eval[y_pred > v] - v) for v in unit_costs]
total_profits
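In symbols: mailing every donor whose predicted value exceeds the unit cost c yields

$$\mathrm{profit}(c) = \sum_{i:\,\hat{y}_i > c} (y_i - c),$$

while the baseline mails everyone at the competition's unit cost of $0.68.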
Gini Coefficient¶
In [ ]:
gain = pd.DataFrame(
{
"lorenz": ltv.cumulative_true(y_eval, y_eval),
"baseline": ltv.cumulative_true(y_eval, x_eval["numeric"][:, 19]),
"model": ltv.cumulative_true(y_eval, y_pred),
}
)
In [ ]:
num_customers = np.float32(gain.shape[0])
gain["cumulative_customer"] = (np.arange(num_customers) + 1.0) / num_customers
In [ ]:
ax = gain[
[
"cumulative_customer",
"lorenz",
"baseline",
"model",
]
].plot(x="cumulative_customer", figsize=(8, 5), legend=True)
ax.legend(["Groundtruth", "Baseline", "Model"], loc="lower right")
ax.set_xlabel("Cumulative Fraction of Customers")
ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.set_xlim((0, 1.0))
ax.set_ylabel("Cumulative Fraction of Total Lifetime Value")
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_ylim((0, 1.05))
ax.set_title("Gain Chart");
In [ ]:
gini = ltv.gini_from_gain(gain[["lorenz", "baseline", "model"]])
gini
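As a reading aid (our gloss on ltv.gini_from_gain, not code from the repo): with G(x) the gain curve plotted above, the raw Gini is twice the area between the curve and the diagonal, and the normalized value divides each curve's Gini by that of the ground-truth lorenz curve:

$$\mathrm{Gini} = 2\int_0^1 \big(G(x) - x\big)\,dx, \qquad \mathrm{Gini}_{\mathrm{norm}} = \frac{\mathrm{Gini}_{\mathrm{model}}}{\mathrm{Gini}_{\mathrm{lorenz}}}$$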
Calibration¶
In [ ]:
df_decile = ltv.decile_stats(y_eval, y_pred)
df_decile
In [ ]:
ax = df_decile[["label_mean", "pred_mean"]].plot.bar(rot=0)
ax.set_title("Decile Chart")
ax.set_xlabel("Prediction bucket")
ax.set_ylabel("Average bucket value")
ax.legend(["Label", "Prediction"], loc="upper left");
Rank Correlation¶
In [ ]:
def spearmanr(x1: Sequence[float], x2: Sequence[float]) -> float:
"""Calculates spearmanr rank correlation coefficient.
See https://docs.scipy.org/doc/scipy/reference/stats.html.
Args:
x1: 1D array_like.
x2: 1D array_like.
Returns:
correlation: float.
"""
return stats.spearmanr(x1, x2, nan_policy="raise")[0]
spearman_corr = spearmanr(y_eval, y_pred)
spearman_corr
All metrics together¶
In [ ]:
df_metrics = pd.DataFrame(
{
"model": MODEL,
"loss_function": LOSS,
"train_loss": history["loss"][-1],
"eval_loss": history["val_loss"][-1],
"label_positive": np.mean(y_eval > 0),
"label_mean": y_eval.mean(),
"pred_mean": y_pred.mean(),
"decile_mape": df_decile["decile_mape"].mean(),
"baseline_gini": gini["normalized"][1],
"gini": gini["normalized"][2],
"spearman_corr": spearman_corr,
},
index=[VERSION],
)
In [ ]:
for unit_cost, total_profit in zip(unit_costs, total_profits):
df_metrics["total_profit_{:02d}".format(int(unit_cost * 100))] = total_profit
In [ ]:
df_metrics.T
Save¶
In [ ]:
output_path = OUTPUT_CSV_FOLDER
In [ ]:
if not os.path.isdir(output_path):
os.makedirs(output_path)
In [ ]:
output_file = os.path.join(
output_path, "{}_regression_{}_{}.csv".format(MODEL, LOSS, VERSION)
)
In [ ]:
df_metrics.to_csv(output_file, index=False)