ZILNLoss¶
[DISCLAIMER]
The purpose of this notebook is to check whether the ZILN loss, as originally implemented in Keras, gives the same results as the pytorch-widedeep implementation.
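For context: the zero-inflated lognormal (ZILN) loss comes from Wang et al. (2019), "A Deep Probabilistic Model for Customer Lifetime Value Prediction", the paper behind the lifetime_value repo used below. It models LTV with a point mass at zero and a lognormal distribution over positive values, so the network emits three logits per example: a return-probability logit plus the lognormal location and (pre-softplus) scale. Writing p for the predicted probability that y > 0, the per-example loss is

$$\mathcal{L}_{\mathrm{ZILN}}(y) = -\mathbb{1}\{y = 0\}\log(1 - p) - \mathbb{1}\{y > 0\}\big(\log p + \log f_{\mathrm{LN}}(y;\,\mu,\sigma)\big),$$

i.e. a binary cross-entropy on "did the donor give at all" plus a lognormal negative log-likelihood on the positive amounts.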
In [1]:
# @title Copyright 2019 The Lifetime Value Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
KDD Cup 98 LTV Prediction¶
In [3]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
from typing import Sequence
# install and import ltv
!pip install -q git+https://github.com/google/lifetime_value
import lifetime_value as ltv
In [ ]:
tfd = tfp.distributions
%config InlineBackend.figure_format='retina'
sns.set_style("whitegrid")
Configs¶
In [ ]:
MODEL = "dnn"
LOSS = "ziln" # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']
LEARNING_RATE = 0.001 # @param { isTemplate: true}
VERSION = 0 # @param { isTemplate: true, type: 'integer'}
OUTPUT_CSV_FOLDER = "/tmp/lifetime-value/kdd_cup_98/result" # @param { isTemplate: true, type: 'string'}
MODEL = "dnn"
LOSS = "ziln" # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']
LEARNING_RATE = 0.001 # @param { isTemplate: true}
VERSION = 0 # @param { isTemplate: true, type: 'integer'}
OUTPUT_CSV_FOLDER = "/tmp/lifetime-value/kdd_cup_98/result" # @param { isTemplate: true, type: 'string'}
Load data¶
Download the KDD Cup 98 data to /tmp/lifetime-value/kdd_cup_98
In [ ]:
%%bash
mkdir -p /tmp/lifetime-value/kdd_cup_98
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P /tmp/lifetime-value/kdd_cup_98/
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P /tmp/lifetime-value/kdd_cup_98/
wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P /tmp/lifetime-value/kdd_cup_98/
cd /tmp/lifetime-value/kdd_cup_98/
unzip cup98lrn.zip
unzip cup98val.zip
In [ ]:
df_train = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/cup98LRN.txt")
num_train = df_train.shape[0]
df_eval = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/cup98VAL.txt")
df_eval_target = pd.read_csv("/tmp/lifetime-value/kdd_cup_98/valtargt.txt")
df_eval = df_eval.merge(df_eval_target, on="CONTROLN")
In [ ]:
df = pd.concat([df_train, df_eval], axis=0, sort=True)
Label distribution¶
In [ ]:
y = df["TARGET_D"][:num_train]
y = df["TARGET_D"][:num_train]
In [ ]:
def plot_hist_log_scale(y):
max_val = y.max() + 1.0
ax = pd.Series(y).hist(
figsize=(8, 5), bins=10 ** np.linspace(0.0, np.log10(max_val), 20)
)
plt.xlabel("Donation ($)")
plt.ylabel("Count")
# plt.title('Histogram of LTV')
plt.xticks(rotation="horizontal")
plt.legend(loc="upper left")
ax.set_xscale("log")
ax.grid(False)
# Hide the right and top spines
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position("left")
ax.xaxis.set_ticks_position("bottom")
plt.show()
fig = ax.get_figure()
output_file = tf.io.gfile.GFile(
"/tmp/lifetime-value/kdd_cup_98/histogram_kdd98_log_scale.pdf", "wb"
)
fig.savefig(output_file, bbox_inches="tight", format="pdf")
In [ ]:
plot_hist_log_scale(y[y > 0])
Preprocess features¶
Vocab¶
In [ ]:
VOCAB_FEATURES = [
"ODATEDW", # date of donor's first gift (YYMM)
"OSOURCE", # donor acquisition mailing list
"TCODE", # donor title code
"STATE",
"ZIP",
"DOMAIN", # urbanicity level and socio-economic status of the neighborhood
"CLUSTER", # socio-economic status
"GENDER",
"MAXADATE", # date of the most recent promotion received
"MINRDATE",
"LASTDATE",
"FISTDATE",
"RFA_2A",
]
In [ ]:
df["ODATEDW"] = df["ODATEDW"].astype("str")
df["TCODE"] = df["TCODE"].apply(lambda x: "{:03d}".format(x // 1000 if x > 1000 else x))
df["ZIP"] = df["ZIP"].str.slice(0, 5)
df["MAXADATE"] = df["MAXADATE"].astype("str")
df["MINRDATE"] = df["MINRDATE"].astype("str")
df["LASTDATE"] = df["LASTDATE"].astype("str")
df["FISTDATE"] = df["FISTDATE"].astype("str")
df["ODATEDW"] = df["ODATEDW"].astype("str")
df["TCODE"] = df["TCODE"].apply(lambda x: "{:03d}".format(x // 1000 if x > 1000 else x))
df["ZIP"] = df["ZIP"].str.slice(0, 5)
df["MAXADATE"] = df["MAXADATE"].astype("str")
df["MINRDATE"] = df["MINRDATE"].astype("str")
df["LASTDATE"] = df["LASTDATE"].astype("str")
df["FISTDATE"] = df["FISTDATE"].astype("str")
In [ ]:
def label_encoding(y, frequency_threshold=100):
value_counts = pd.value_counts(y)
categories = value_counts[value_counts >= frequency_threshold].index.to_numpy()
# 0 indicates the unknown category.
return pd.Categorical(y, categories=categories).codes + 1
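A toy illustration (ours, not from the original notebook) of what label_encoding does: frequent categories get codes 1, 2, ..., and anything below the frequency threshold collapses to the unknown code 0.

s = pd.Series(["a"] * 150 + ["b"] * 120 + ["c"] * 5)
label_encoding(s, frequency_threshold=100)  # "a" -> 1, "b" -> 2, rare "c" -> 0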
In [ ]:
for key in VOCAB_FEATURES:
df[key] = label_encoding(df[key])
Indicator¶
In [ ]:
MAIL_ORDER_RESPONSES = [
"MBCRAFT",
"MBGARDEN",
"MBBOOKS",
"MBCOLECT",
"MAGFAML",
"MAGFEM",
"MAGMALE",
"PUBGARDN",
"PUBCULIN",
"PUBHLTH",
"PUBDOITY",
"PUBNEWFN",
"PUBPHOTO",
"PUBOPP",
"RFA_2F",
]
In [ ]:
INDICATOR_FEATURES = [
"AGE", # age decile, 0 indicates unknown
"NUMCHLD",
"INCOME",
"WEALTH1",
"HIT",
] + MAIL_ORDER_RESPONSES
In [ ]:
df["AGE"] = pd.qcut(df["AGE"].values, 10).codes + 1
df["NUMCHLD"] = df["NUMCHLD"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["INCOME"] = df["INCOME"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["WEALTH1"] = df["WEALTH1"].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)
df["HIT"] = pd.qcut(df["HIT"].values, q=50, duplicates="drop").codes
for col in MAIL_ORDER_RESPONSES:
df[col] = pd.qcut(df[col].values, q=20, duplicates="drop").codes + 1
df["AGE"] = pd.qcut(df["AGE"].values, 10).codes + 1
df["NUMCHLD"] = df["NUMCHLD"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["INCOME"] = df["INCOME"].apply(lambda x: 0 if np.isnan(x) else int(x))
df["WEALTH1"] = df["WEALTH1"].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)
df["HIT"] = pd.qcut(df["HIT"].values, q=50, duplicates="drop").codes
for col in MAIL_ORDER_RESPONSES:
df[col] = pd.qcut(df[col].values, q=20, duplicates="drop").codes + 1
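For intuition, a toy example (again ours) of the quantile bucketing above: pd.qcut assigns quantile codes starting at 0, and the + 1 shift reserves 0 for unknown or missing values.

pd.qcut(np.arange(100), q=10, duplicates="drop").codes + 1  # ten equal buckets, codes 1..10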
Numeric¶
In [ ]:
NUMERIC_FEATURES = [
# binary
"MAILCODE", # bad address
"NOEXCH", # do not exchange
"RECINHSE", # donor has given to PVA's in house program
"RECP3", # donor has given to PVA's P3 program
"RECPGVG", # planned giving record
"RECSWEEP", # sweepstakes record
"HOMEOWNR", # home owner
"CHILD03",
"CHILD07",
"CHILD12",
"CHILD18",
# continuous
"CARDPROM",
"NUMPROM",
"CARDPM12",
"NUMPRM12",
"RAMNTALL",
"NGIFTALL",
"MINRAMNT",
"MAXRAMNT",
"LASTGIFT",
"AVGGIFT",
]
In [ ]:
df["MAILCODE"] = (df["MAILCODE"] == "B").astype("float32")
df["PVASTATE"] = df["PVASTATE"].isin(["P", "E"]).astype("float32")
df["NOEXCH"] = df["NOEXCH"].isin(["X", "1"]).astype("float32")
df["RECINHSE"] = (df["RECINHSE"] == "X").astype("float32")
df["RECP3"] = (df["RECP3"] == "X").astype("float32")
df["RECPGVG"] = (df["RECPGVG"] == "X").astype("float32")
df["RECSWEEP"] = (df["RECSWEEP"] == "X").astype("float32")
df["HOMEOWNR"] = (df["HOMEOWNR"] == "H").astype("float32")
df["CHILD03"] = df["CHILD03"].isin(["M", "F", "B"]).astype("float32")
df["CHILD07"] = df["CHILD07"].isin(["M", "F", "B"]).astype("float32")
df["CHILD12"] = df["CHILD12"].isin(["M", "F", "B"]).astype("float32")
df["CHILD18"] = df["CHILD18"].isin(["M", "F", "B"]).astype("float32")
df["CARDPROM"] = df["CARDPROM"] / 100
df["NUMPROM"] = df["NUMPROM"] / 100
df["CARDPM12"] = df["CARDPM12"] / 100
df["NUMPRM12"] = df["NUMPRM12"] / 100
df["RAMNTALL"] = np.log1p(df["RAMNTALL"])
df["NGIFTALL"] = np.log1p(df["NGIFTALL"])
df["MINRAMNT"] = np.log1p(df["MINRAMNT"])
df["MAXRAMNT"] = np.log1p(df["MAXRAMNT"])
df["LASTGIFT"] = np.log1p(df["LASTGIFT"])
df["AVGGIFT"] = np.log1p(df["AVGGIFT"])
df["MAILCODE"] = (df["MAILCODE"] == "B").astype("float32")
df["PVASTATE"] = df["PVASTATE"].isin(["P", "E"]).astype("float32")
df["NOEXCH"] = df["NOEXCH"].isin(["X", "1"]).astype("float32")
df["RECINHSE"] = (df["RECINHSE"] == "X").astype("float32")
df["RECP3"] = (df["RECP3"] == "X").astype("float32")
df["RECPGVG"] = (df["RECPGVG"] == "X").astype("float32")
df["RECSWEEP"] = (df["RECSWEEP"] == "X").astype("float32")
df["HOMEOWNR"] = (df["HOMEOWNR"] == "H").astype("float32")
df["CHILD03"] = df["CHILD03"].isin(["M", "F", "B"]).astype("float32")
df["CHILD07"] = df["CHILD07"].isin(["M", "F", "B"]).astype("float32")
df["CHILD12"] = df["CHILD12"].isin(["M", "F", "B"]).astype("float32")
df["CHILD18"] = df["CHILD18"].isin(["M", "F", "B"]).astype("float32")
df["CARDPROM"] = df["CARDPROM"] / 100
df["NUMPROM"] = df["NUMPROM"] / 100
df["CARDPM12"] = df["CARDPM12"] / 100
df["NUMPRM12"] = df["NUMPRM12"] / 100
df["RAMNTALL"] = np.log1p(df["RAMNTALL"])
df["NGIFTALL"] = np.log1p(df["NGIFTALL"])
df["MINRAMNT"] = np.log1p(df["MINRAMNT"])
df["MAXRAMNT"] = np.log1p(df["MAXRAMNT"])
df["LASTGIFT"] = np.log1p(df["LASTGIFT"])
df["AVGGIFT"] = np.log1p(df["AVGGIFT"])
All¶
In [ ]:
CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES
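A quick illustrative sanity check of the feature inventory the model will consume: 13 vocab plus 20 indicator columns go through embeddings, and 21 columns are fed as raw numerics.

len(CATEGORICAL_FEATURES), len(NUMERIC_FEATURES)  # (33, 21)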
Train/eval split¶
In [ ]:
def dnn_split(df):
df_train = df.iloc[:num_train]
df_eval = df.iloc[num_train:]
def feature_dict(df):
features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}
features["numeric"] = df[NUMERIC_FEATURES].astype("float32").values
return features
x_train, y_train = (
feature_dict(df_train),
df_train["TARGET_D"].astype("float32").values,
)
x_eval, y_eval = feature_dict(df_eval), df_eval["TARGET_D"].astype("float32").values
return x_train, x_eval, y_train, y_eval
Model¶
In [ ]:
def embedding_dim(x):
return int(x**0.25) + 1
def embedding_layer(vocab_size):
return tf.keras.Sequential(
[
tf.keras.layers.Embedding(
input_dim=vocab_size,
output_dim=embedding_dim(vocab_size),
input_length=1,
),
tf.keras.layers.Flatten(),
]
)
def dnn_model(output_units):
numeric_input = tf.keras.layers.Input(
shape=(len(NUMERIC_FEATURES),), name="numeric"
)
embedding_inputs = [
tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)
for key in CATEGORICAL_FEATURES
]
embedding_outputs = [
embedding_layer(vocab_size=df[key].max() + 1)(input)
for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)
]
deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)
deep_model = tf.keras.Sequential(
[
tf.keras.layers.Dense(128, activation="relu"),
tf.keras.layers.Dense(128, activation="relu"),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(units=output_units),
]
)
return tf.keras.Model(
inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input)
)
Loss¶
In [ ]:
if LOSS == "mse":
loss = tf.keras.losses.MeanSquaredError()
output_units = 1
if LOSS == "ziln":
loss = ltv.zero_inflated_lognormal_loss
output_units = 3
if LOSS == "mse":
loss = tf.keras.losses.MeanSquaredError()
output_units = 1
if LOSS == "ziln":
loss = ltv.zero_inflated_lognormal_loss
output_units = 3
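To make the comparison concrete, here is a minimal PyTorch sketch of the same loss math (illustrative only; the pytorch-widedeep run below uses the library's built-in ziln objective, not this function). It also shows why output_units is 3 for ziln: one churn logit plus the two lognormal parameters.

import torch
import torch.nn.functional as F

def ziln_loss_sketch(logits: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # logits: (batch, 3) = [return-probability logit, lognormal loc, raw scale]
    positive = (y > 0).float()
    p_logit, loc = logits[:, 0], logits[:, 1]
    scale = F.softplus(logits[:, 2]).clamp(min=1e-7)  # keep scale strictly positive
    # classification term: did the donor give at all?
    class_loss = F.binary_cross_entropy_with_logits(p_logit, positive, reduction="none")
    # regression term: lognormal NLL, counted only for positive labels
    safe_y = y.clamp(min=1e-7)  # log_prob is undefined at 0
    reg_loss = -positive * torch.distributions.LogNormal(loc, scale).log_prob(safe_y)
    return (class_loss + reg_loss).mean()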
In [ ]:
x_train, x_eval, y_train, y_eval = dnn_split(df)
model = dnn_model(output_units)
In [ ]:
model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=LEARNING_RATE), loss=loss)
Train¶
In [ ]:
callbacks = [
tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", min_lr=1e-6),
tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10),
]
In [ ]:
history = model.fit(
x=x_train,
y=y_train,
batch_size=2048,
epochs=200,
verbose=2,
callbacks=callbacks,
validation_data=(x_eval, y_eval),
).history
In [ ]:
pd.DataFrame(history)[["loss", "val_loss"]].plot();
Eval¶
In [ ]:
if LOSS == "mse":
y_pred = model.predict(x=x_eval, batch_size=1024).flatten()
if LOSS == "ziln":
logits = model.predict(x=x_eval, batch_size=1024)
y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
if LOSS == "mse":
y_pred = model.predict(x=x_eval, batch_size=1024).flatten()
if LOSS == "ziln":
logits = model.predict(x=x_eval, batch_size=1024)
y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
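For intuition: up to implementation details we have not re-verified, ltv.zero_inflated_lognormal_pred returns the mean of the ZILN distribution, E[Y] = p * exp(mu + sigma^2 / 2). A numpy sketch:

from scipy.special import expit  # sigmoid

def ziln_pred_sketch(logits):
    p = expit(logits[:, 0])                  # P(y > 0)
    loc = logits[:, 1]                       # lognormal location
    scale = np.logaddexp(0.0, logits[:, 2])  # softplus, matching the loss
    return p * np.exp(loc + 0.5 * scale**2)  # lognormal mean, gated by p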
Pytorch-widedeep approach¶
In [ ]:
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.callbacks import EarlyStopping
from torch.optim import NAdam
In [ ]:
# name the 21 stacked numeric columns so they can live in a DataFrame
NUMERICAL_FEATURES = ["num" + str(i) for i in range(len(NUMERIC_FEATURES))]
x_train_pyt_num = pd.DataFrame(columns=NUMERICAL_FEATURES, data=x_train["numeric"])
x_train_pyt_cat = pd.DataFrame(
{key: value for key, value in x_train.items() if key not in ["numeric"]}
)
x_eval_pyt_num = pd.DataFrame(columns=NUMERICAL_FEATURES, data=x_eval["numeric"])
x_eval_pyt_cat = pd.DataFrame(
{key: value for key, value in x_eval.items() if key not in ["numeric"]}
)
In [ ]:
x_train_pyt = pd.concat([x_train_pyt_num, x_train_pyt_cat], axis=1)
x_eval_pyt = pd.concat([x_eval_pyt_num, x_eval_pyt_cat], axis=1)
In [ ]:
embed_input = [
(u, int(x_train_pyt[u].nunique() ** 0.25) + 1) for u in CATEGORICAL_FEATURES
]
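This mirrors the embedding_dim heuristic of the Keras model above (fourth root of the cardinality, plus one). One caveat worth noting: the Keras model sized each vocabulary as df[key].max() + 1 whereas nunique() is used here, so the two agree only when every code actually occurs. A quick illustrative check:

assert int(50 ** 0.25) + 1 == embedding_dim(50) == 3  # a 50-category column gets a 3-dim embedding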
In [ ]:
# deeptabular
tab_preprocessor = TabPreprocessor(
embed_cols=embed_input,
continuous_cols=NUMERICAL_FEATURES,
shared_embed=False,
scale=False,
)
X_tab_train = tab_preprocessor.fit_transform(x_train_pyt)
X_tab_valid = tab_preprocessor.transform(x_eval_pyt)
X_tab_test = tab_preprocessor.transform(x_eval_pyt)
# target (the test split here is the same eval split transformed above)
y_valid = y_eval
y_test = y_eval
X_train = {"X_tab": X_tab_train, "target": y_train}
X_val = {"X_tab": X_tab_valid, "target": y_valid}
X_test = {"X_tab": X_tab_test}
deeptabular = TabMlp(
mlp_hidden_dims=[128, 128, 64, 64],
column_idx=tab_preprocessor.column_idx,
embed_input=tab_preprocessor.cat_embed_input,
continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular, pred_dim=3)
deep_opt = NAdam(model.deeptabular.parameters(), lr=LEARNING_RATE)
callbacks = [EarlyStopping()]
deep_sch = ReduceLROnPlateau(deep_opt, min_lr=1e-6)
objective = "ziln"
trainer = Trainer(
model,
callbacks=callbacks,
lr_schedulers={"deeptabular": deep_sch},
objective=objective,
optimizers={"deeptabular": deep_opt},
)
trainer.fit(
X_train=X_train,
X_val=X_val,
n_epochs=200,
batch_size=2048,
)
y_pred_pytorch = trainer.predict(X_test=X_test)
In [ ]:
pd.DataFrame(trainer.history)[["train_loss", "val_loss"]].plot();
In [ ]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred, y_pred_pytorch)
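Beyond raw MSE between the point predictions, a quick supplementary check (not in the original notebook) is whether the two implementations rank donors the same way:

# rank agreement between the Keras and pytorch-widedeep predictions
stats.spearmanr(y_pred, np.asarray(y_pred_pytorch).flatten())[0]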
Appendix¶
Total Profit¶
In [ ]:
unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]
In [ ]:
num_mailed = [np.sum(y_pred > v) for v in unit_costs]
num_mailed
In [ ]:
baseline_total_profit = np.sum(y_eval - 0.68)
baseline_total_profit
In [ ]:
total_profits = [np.sum(y_eval[y_pred > v] - v) for v in unit_costs]
total_profits
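In symbols: mailing every donor whose predicted value exceeds the unit cost c yields

$$\mathrm{profit}(c) = \sum_{i:\,\hat{y}_i > c} (y_i - c),$$

while the baseline mails everyone at the competition's unit cost of $0.68.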
Gini Coefficient¶
In [ ]:
gain = pd.DataFrame(
{
"lorenz": ltv.cumulative_true(y_eval, y_eval),
"baseline": ltv.cumulative_true(y_eval, x_eval["numeric"][:, 19]),
"model": ltv.cumulative_true(y_eval, y_pred),
}
)
In [ ]:
num_customers = np.float32(gain.shape[0])
gain["cumulative_customer"] = (np.arange(num_customers) + 1.0) / num_customers
In [ ]:
ax = gain[
[
"cumulative_customer",
"lorenz",
"baseline",
"model",
]
].plot(x="cumulative_customer", figsize=(8, 5), legend=True)
ax.legend(["Groundtruth", "Baseline", "Model"], loc="lower right")
ax.set_xlabel("Cumulative Fraction of Customers")
ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.set_xlim((0, 1.0))
ax.set_ylabel("Cumulative Fraction of Total Lifetime Value")
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_ylim((0, 1.05))
ax.set_title("Gain Chart");
In [ ]:
gini = ltv.gini_from_gain(gain[["lorenz", "baseline", "model"]])
gini
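As a reading aid (our gloss on ltv.gini_from_gain, not code from the repo): with G(x) the gain curve plotted above, the raw Gini is twice the area between the curve and the diagonal, and the normalized value divides each curve's Gini by that of the ground-truth lorenz curve:

$$\mathrm{Gini} = 2\int_0^1 \big(G(x) - x\big)\,dx, \qquad \mathrm{Gini}_{\mathrm{norm}} = \frac{\mathrm{Gini}_{\mathrm{model}}}{\mathrm{Gini}_{\mathrm{lorenz}}}$$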
Calibration¶
In [ ]:
df_decile = ltv.decile_stats(y_eval, y_pred)
df_decile
In [ ]:
ax = df_decile[["label_mean", "pred_mean"]].plot.bar(rot=0)
ax.set_title("Decile Chart")
ax.set_xlabel("Prediction bucket")
ax.set_ylabel("Average bucket value")
ax.legend(["Label", "Prediction"], loc="upper left");
Rank Correlation¶
In [ ]:
def spearmanr(x1: Sequence[float], x2: Sequence[float]) -> float:
"""Calculates spearmanr rank correlation coefficient.
See https://docs.scipy.org/doc/scipy/reference/stats.html.
Args:
x1: 1D array_like.
x2: 1D array_like.
Returns:
correlation: float.
"""
return stats.spearmanr(x1, x2, nan_policy="raise")[0]
spearman_corr = spearmanr(y_eval, y_pred)
spearman_corr
All metrics together¶
In [ ]:
df_metrics = pd.DataFrame(
{
"model": MODEL,
"loss_function": LOSS,
"train_loss": history["loss"][-1],
"eval_loss": history["val_loss"][-1],
"label_positive": np.mean(y_eval > 0),
"label_mean": y_eval.mean(),
"pred_mean": y_pred.mean(),
"decile_mape": df_decile["decile_mape"].mean(),
"baseline_gini": gini["normalized"][1],
"gini": gini["normalized"][2],
"spearman_corr": spearman_corr,
},
index=[VERSION],
)
In [ ]:
for unit_cost, total_profit in zip(unit_costs, total_profits):
df_metrics["total_profit_{:02d}".format(int(unit_cost * 100))] = total_profit
In [ ]:
df_metrics.T
Save¶
In [ ]:
output_path = OUTPUT_CSV_FOLDER
In [ ]:
if not os.path.isdir(output_path):
os.makedirs(output_path)
In [ ]:
output_file = os.path.join(
output_path, "{}_regression_{}_{}.csv".format(MODEL, LOSS, VERSION)
)
In [ ]:
df_metrics.to_csv(output_file, index=False)