Appendix - Predicting Churn

Prepare

Data Dictionary

Jupyter Notebook

# Importing packages for data manipulation

import pandas as pd

import numpy as np


# Loading dataset into dataframe

df = pd.read_csv('waze_dataset.csv')


# Viewing the data

df.head(10)


# Checking how many rows and columns there are, whether there are missing values, and what datatypes are used in the dataset

df.info()


# Isolating rows with null values

df_null = df[df['label'].isnull()]

# Displaying summary stats of rows with null values

df_null.describe()


# Isolating rows without null values

df_not_null = df[~df['label'].isnull()]

# Displaying summary stats of rows without null values

df_not_null.describe()


# Counting rows with null labels by device

df_null.value_counts('device')


# Calculating % of iPhone nulls and Android nulls

df_null.value_counts('device', normalize=True)


# Calculating % of iPhone users and Android users in full dataset

df.value_counts('device', normalize=True)

Process

Jupyter Notebook

# Calculating counts of churned vs. retained

print(df['label'].value_counts())


# Calculating percentages of churned vs. retained

print(df['label'].value_counts(normalize=True))


# Calculating median values of all columns for churned and retained users

df.groupby('label').median(numeric_only=True)


# Grouping data by `label` and calculating medians

medians_by_label = df.groupby('label').median(numeric_only=True)

# Calculating median kilometers per drive

medians_by_label['driven_km_drives'] / medians_by_label['drives']

# Calculating median kilometers per driving day

medians_by_label['driven_km_drives'] / medians_by_label['driving_days']

# Calculating median drives per driving day

medians_by_label['drives'] / medians_by_label['driving_days']


# For each label, calculating the number of Android users and iPhone users

df.groupby(['label', 'device']).size()


# For each label, calculating the percentage of Android users and iPhone users

df.groupby('label')['device'].value_counts(normalize=True)

Analyze

EDA - Exploratory Data Analysis

Jupyter Notebook

# Importing packages for data visualization

import matplotlib.pyplot as plt

import seaborn as sns


# Box plot 'sessions'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['sessions'], fliersize=2)

plt.title('sessions box plot');


# Histogram 'sessions'

sns.histplot(x=df['sessions'])

median = df['sessions'].median()

plt.axvline(median, color='red', linestyle='--')

plt.text(75,1200, 'median=56.0', color='red')

plt.title('sessions histogram');


# Box plot 'drives'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['drives'], fliersize=2)

plt.title('drives box plot');


# Histogram 'drives'

sns.histplot(x=df['drives'])

median = df['drives'].median()

plt.axvline(median, color='red', linestyle='--')

plt.text(75,1000, 'median=48.0', color='red')

plt.title('drives histogram');


# Box plot total_sessions

plt.figure(figsize=(5,2))

sns.boxplot(x=df['total_sessions'], fliersize=2)

plt.title('total_sessions box plot');


# Histogram 'total_sessions'

sns.histplot(x=df['total_sessions'])

median = df['total_sessions'].median()

plt.axvline(median, color='red', linestyle='--')

plt.text(170,700, 'median=159.6', color='red')

plt.title('total_sessions histogram');


# Box plot 'n_days_after_onboarding'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['n_days_after_onboarding'], fliersize=2)

plt.title('n_days_after_onboarding box plot');


# Histogram 'n_days_after_onboarding'

sns.histplot(x=df['n_days_after_onboarding'])

median = df['n_days_after_onboarding'].median()

plt.axvline(median, color='red', linestyle='--')

plt.title('n_days_after_onboarding histogram');


# Box plot 'driven_km_drives'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['driven_km_drives'], fliersize=2)

plt.title('driven_km_drives box plot');


# Histogram 'driven_km_drives'

sns.histplot(x=df['driven_km_drives'])

median = df['driven_km_drives'].median()

plt.axvline(median, color='red', linestyle='--')

plt.text(4000,700, 'median=3493.9', color='red')

plt.title('driven_km_drives histogram');


# Box plot 'duration_minutes_drives'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['duration_minutes_drives'], fliersize=2)

plt.title('duration_minutes_drives box plot');


# Histogram 'duration_minutes_drives'

sns.histplot(x=df['duration_minutes_drives'])

median = df['duration_minutes_drives'].median()

plt.axvline(median, color='red', linestyle='--')

plt.text(2000,700, 'median=1478.2', color='red')

plt.title('duration_minutes_drives histogram');


# Box plot 'activity_days'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['activity_days'], fliersize=2)

plt.title('activity_days box plot');


# Histogram 'activity_days'

sns.histplot(x=df['activity_days'], discrete=True)

median = df['activity_days'].median()

plt.axvline(median, color='red', linestyle='--')

plt.title('activity_days histogram');


# Box plot 'driving_days'

plt.figure(figsize=(5,2))

sns.boxplot(x=df['driving_days'], fliersize=2)

plt.title('driving_days box plot');


# Histogram 'driving_days'

sns.histplot(x=df['driving_days'], discrete=True)

median = df['driving_days'].median()

plt.axvline(median, color='red', linestyle='--')

plt.title('driving_days histogram');


# Pie chart 'device'

fig = plt.figure(figsize=(3,3))

data=df['device'].value_counts()

plt.pie(data,

        labels=[f'{data.index[0]}: {data.values[0]}',

                f'{data.index[1]}: {data.values[1]}'],

        autopct='%1.1f%%'

        )

plt.title('Users by device');


# Pie chart 'label'

fig = plt.figure(figsize=(3,3))

data=df['label'].value_counts()

plt.pie(data,

        labels=[f'{data.index[0]}: {data.values[0]}',

                f'{data.index[1]}: {data.values[1]}'],

        autopct='%1.1f%%'

        )

plt.title('Count of retained vs. churned');


# Histogram 'driving_days' vs 'activity_days'

plt.figure(figsize=(12,4))

label=['driving days', 'activity days']

plt.hist([df['driving_days'], df['activity_days']],

         bins=range(0,33),

         label=label)

plt.xlabel('days')

plt.ylabel('count')

plt.legend()

plt.title('driving_days vs. activity_days');


# Calculating the maximum values of 'driving_days' and 'activity_days'

print(df['driving_days'].max())

print(df['activity_days'].max())


# Scatter plot 'driving_days' vs 'activity_days'

sns.scatterplot(data=df, x='driving_days', y='activity_days')

plt.title('driving_days vs. activity_days')

plt.plot([0,31], [0,31], color='red', linestyle='--');


# Histogram retention by 'device'

plt.figure(figsize=(5,4))

sns.histplot(data=df,

             x='device',

             hue='label',

             multiple='dodge',

             shrink=0.9

             )

plt.title('Retention by device histogram');


# Retention by kilometers driven per driving day

# 1. Create `km_per_driving_day` column

df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']

# 2. Call `describe()` on the new column

df['km_per_driving_day'].describe()

# 1. Convert infinite values to zero

df.loc[df['km_per_driving_day']==np.inf, 'km_per_driving_day'] = 0

# 2. Confirm that it worked

df['km_per_driving_day'].describe()


# Histogram: retention by km_per_driving_day

plt.figure(figsize=(12,5))

sns.histplot(data=df,

             x='km_per_driving_day',

             bins=range(0,1201,20),

             hue='label',

             multiple='fill')

plt.ylabel('%', rotation=0)

plt.title('Churn rate by mean km per driving day');


# Histogram churn rate per number of driving_days

plt.figure(figsize=(12,5))

sns.histplot(data=df,

             x='driving_days',

             bins=range(1,32),

             hue='label',

             multiple='fill',

             discrete=True)

plt.ylabel('%', rotation=0)

plt.title('Churn rate per driving day');


# Proportion of sessions that occurred in the last month

df['percent_sessions_in_last_month'] = df['sessions'] / df['total_sessions']

# Median of 'percent_sessions_in_last_month'

df['percent_sessions_in_last_month'].median()


# Histogram 'percent_sessions_in_last_month'

sns.histplot(x=df['percent_sessions_in_last_month'], hue=df['label'],)

median = df['percent_sessions_in_last_month'].median()

plt.axvline(median, color='red', linestyle='--')

plt.title('percent_sessions_in_last_month');


# Median 'n_days_after_onboarding'

df['n_days_after_onboarding'].median()


# Histogram 'n_days_after_onboarding'

data = df.loc[df['percent_sessions_in_last_month']>=0.4]

plt.figure(figsize=(5,3))

sns.histplot(x=data['n_days_after_onboarding'])

plt.title('Num. days after onboarding for users with >=40% sessions in last month');


# Handling outliers

# Calculate the 95th percentile of a given column, then replace values greater than that percentile with the value at the 95th percentile

def outlier_imputer(column_name, percentile):

    # Calculate threshold

    threshold = df[column_name].quantile(percentile)

    # Impute threshold for values > than threshold

    df.loc[df[column_name] > threshold, column_name] = threshold


    print('{:>25} | percentile: {} | threshold: {}'.format(column_name, percentile, threshold))


# Calculate 95th percentile for the following columns

for column in ['sessions', 'drives', 'total_sessions',

               'driven_km_drives', 'duration_minutes_drives']:

    outlier_imputer(column, 0.95)
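

A quick follow-up check (added here, not in the original notebook; it only reuses the dataframe and column list above): after imputation, the maximum of each capped column should now equal its 95th-percentile threshold.

# Optional sanity check of the capped columns

df[['sessions', 'drives', 'total_sessions', 'driven_km_drives', 'duration_minutes_drives']].describe()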

Hypothesis Test

Jupyter Notebook

# Import relevant packages or libraries

import pandas as pd

import numpy as np

from scipy import stats


# Load dataset into dataframe

df = pd.read_csv('waze_dataset.csv')


# Create new dictionary `map_dictionary`

map_dictionary = {'Android':2,'iPhone':1}

# Create new `device_type` column

df['device_type'] = df['device']

# Map the new column to the dictionary

df['device_type'] = df['device_type'].map(map_dictionary)


# Calculate mean for device types

df.groupby('device_type')['drives'].mean()


# Isolate the `drives` column for iPhone users.

drives_iphone = df[df['device_type'] == 1]['drives']

# Isolate the `drives` column for Android users.

drives_android = df[df['device_type'] == 2]['drives']

# Perform the t-test

stats.ttest_ind(a=drives_iphone, b=drives_android, equal_var=False)
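

As a hedged illustration of how to read the result above (the `alpha` variable and the print statement are added for clarity, not part of the original notebook), the returned p-value can be compared against a 5% significance level:

# Illustrative sketch: interpret the t-test at a 5% significance level

alpha = 0.05

result = stats.ttest_ind(a=drives_iphone, b=drives_android, equal_var=False)

# If the p-value is below alpha, reject the null hypothesis of equal mean drives; otherwise fail to reject it

print('Reject H0' if result.pvalue < alpha else 'Fail to reject H0')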

Regression Model 

Jupyter Notebook

# Import relevant packages or libraries

# Packages for numerics + dataframes

import numpy as np

import pandas as pd

# Packages for visualization

import matplotlib.pyplot as plt

import seaborn as sns

# Packages for Logistic Regression & Confusion Matrix

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, precision_score, \

recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression


# Load the dataset by running this cell

df = pd.read_csv('waze_dataset.csv')


# Create `km_per_driving_day` column

df['km_per_driving_day'] = df['driven_km_drives']/df['driving_days']


# Convert infinite values to zero

df.loc[df['km_per_driving_day']==np.inf, 'km_per_driving_day'] = 0


# Create `professional_driver` column

df['professional_driver'] = np.where((df['drives'] >= 60) & (df['driving_days'] >= 15), 1, 0)


# Check count of professionals and non-professionals

df['professional_driver'].value_counts(normalize=True)


# Check in-class churn rate

df.groupby(['professional_driver'])['label'].value_counts(normalize=True)


# Drop rows with missing data in `label` column

df = df.dropna(subset=['label']) 


# Calculate the 95th percentile of the specified columns, and cap outliers in those columns at this value

for column in ['sessions', 'drives', 'total_sessions', 'total_navigations_fav1',

               'total_navigations_fav2', 'driven_km_drives', 'duration_minutes_drives']:

    threshold = df[column].quantile(0.95)

    df.loc[df[column] > threshold, column] = threshold


# Encode categorical variable 'label'. Create binary `label2` column

df['label2'] = np.where((df['label'] == 'churned'), 1, 0)

df[['label', 'label2']].tail()


# Generate a correlation matrix

df.corr(method='pearson', numeric_only=True)


# Plot correlation heatmap

plt.figure(figsize=(15,10))

sns.heatmap(df.corr(method='pearson', numeric_only=True), vmin=-1, vmax=1, annot=True, cmap='coolwarm')

plt.title('Correlation heatmap indicates many weakly correlated variables',

          fontsize=18)

plt.show();


# Encode categorical variable 'device'. Create new `device2` variable

df['device2'] = np.where((df['device'] == 'iPhone'), 1, 0)

df[['device', 'device2']].tail()


# Isolate predictor variables

X = df.drop(columns = ['ID', 'label', 'label2', 'device', 'sessions', 'driving_days'])


# Isolate target variable

y = df['label2']


# Perform the train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)


# Use .head()

X_train.head()


# Instantiate a logistic regression model. Add the argument `penalty = None`, since the predictors are unscaled.

model = LogisticRegression(penalty=None, max_iter=400)

# Fit the model on `X_train` and `y_train`.

model.fit(X_train, y_train)


# Create a series whose index is the column names and whose values are the coefficients in model.coef_

pd.Series(model.coef_[0], index=X.columns)


# Get the model intercept

model.intercept_
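

One optional way to interpret the coefficients above (an added sketch, assuming the fitted `model` and feature matrix `X` from the preceding cells) is to exponentiate the log-odds coefficients into odds ratios:

# Illustrative sketch: convert log-odds coefficients to odds ratios

odds_ratios = np.exp(pd.Series(model.coef_[0], index=X.columns))

odds_ratios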


# Get the predicted probabilities of the training data

training_probabilities = model.predict_proba(X_train)

training_probabilities


# Copy the `X_train` dataframe and assign to `logit_data`

logit_data = X_train.copy()

# Create a new `logit` column in the `logit_data` df

logit_data['logit'] = [np.log(prob[1] / prob[0]) for prob in training_probabilities]


# Plot regplot of `activity_days` log-odds

sns.regplot(x='activity_days', y='logit', data=logit_data, scatter_kws={'s': 2, 'alpha': 0.5})

plt.title('Log-odds: activity_days');


# Generate predictions on X_test

y_preds = model.predict(X_test)


# Score the model (accuracy) on the test data

model.score(X_test, y_test)


# Display a confusion matrix

cm = confusion_matrix(y_test, y_preds)

disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=['retained', 'churned'],)

disp.plot();


# Calculate precision manually

precision = cm[1,1] / (cm[0, 1] + cm[1, 1])


# Calculate recall manually

recall = cm[1,1] / (cm[1, 0] + cm[1, 1])
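

To compare these manual values against the classification report below, they can simply be printed (an added convenience step reusing the `precision` and `recall` variables defined above):

# Print the manually calculated precision and recall for comparison with the classification report

print('precision:', precision)

print('recall:', recall)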


# Create a classification report

target_labels = ['retained', 'churned']

print(classification_report(y_test, y_preds, target_names=target_labels))


# Generate a bar graph of the model's coefficients for a visual representation of the importance of the model's features.

# Create a list of (column_name, coefficient) tuples

feature_importance = list(zip(X_train.columns, model.coef_[0]))

# Sort the list by coefficient value

feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)

# Plot the feature importances

sns.barplot(x=[x[1] for x in feature_importance], y=[x[0] for x in feature_importance], orient='h')

plt.title('Feature importance');

Building and Testing Random Forest and XGBoost Models

Jupyter Notebook

# Import packages for data manipulation

import numpy as np

import pandas as pd


# Import packages for data visualization

import matplotlib.pyplot as plt


# This lets us see all of the columns, preventing Jupyter from truncating them.

pd.set_option('display.max_columns', None)


# Import packages for data modeling

from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import accuracy_score, precision_score, recall_score,\

f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


# This is the function that helps plot feature importance

from xgboost import plot_importance


# This module lets us save our models once we fit them.

import pickle


# Import dataset

df = pd.read_csv('waze_dataset.csv')


# Create `km_per_driving_day` feature

df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']

# Convert infinite values to zero

df.loc[df['km_per_driving_day']==np.inf, 'km_per_driving_day'] = 0


# Create `percent_sessions_in_last_month` feature

df['percent_sessions_in_last_month'] = df['sessions']/df['total_sessions']


# Create `professional_driver` feature

df['professional_driver'] = np.where((df['drives'] >= 60) & (df['driving_days'] >= 15), 1, 0)


# Create `total_sessions_per_day` feature

df['total_sessions_per_day'] = df['total_sessions'] / df['n_days_after_onboarding']


# Create `km_per_hour` feature

df['km_per_hour'] = df['driven_km_drives'] / (df['duration_minutes_drives'] / 60)


# Create `km_per_drive` feature

df['km_per_drive'] = df['driven_km_drives'] / df['drives']

# Convert infinite values to zero

df.loc[df['km_per_drive']==np.inf, 'km_per_drive'] = 0


# Create `percent_of_drives_to_favorite` feature

df['percent_of_drives_to_favorite'] = (df['total_navigations_fav1'] + df['total_navigations_fav2']) / df['total_sessions']


# Drop rows with missing values

df = df.dropna(subset=['label'])


# Encode 'device' column. Create new `device2` variable

df['device2'] = np.where(df['device']=='Android', 0, 1)

# Encode 'label' column. Create binary `label2` column

df['label2'] = np.where(df['label']=='churned', 1, 0)


# Drop `ID` column

df = df.drop(['ID'], axis=1)


# Get class balance of 'label' col

df['label'].value_counts(normalize=True)


# Isolate X variables

X = df.drop(columns=['label', 'label2', 'device'])

# Isolate y variable

y = df['label2']

# Split into train and test sets

X_tr, X_test, y_tr, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Split into train and validate sets

X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, stratify=y_tr, test_size=0.25, random_state=42)
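

A quick verification we added (not part of the original notebook): with `test_size=0.2` followed by `test_size=0.25` on the remainder, the data should be split roughly 60/20/20, and stratification should preserve the churn proportion in each subset.

# Added check: sizes and churn proportion of each split

for name, subset in [('train', y_train), ('validate', y_val), ('test', y_test)]:

    print(name, len(subset), round(subset.mean(), 3))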


# Instantiate the random forest classifier

rf = RandomForestClassifier(random_state=42)

# Create a dictionary of hyperparameters to tune

cv_params = {'max_depth': [None],

             'max_features': [1.0],

             'max_samples': [1.0],

             'min_samples_leaf': [2],

             'min_samples_split': [2],

             'n_estimators': [300],

             }

# Define a dictionary of scoring metrics to capture

scoring = {'accuracy', 'precision', 'recall', 'f1'}

# Instantiate the GridSearchCV object

rf_cv = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='recall')

%%time

# Fit the model to the training data

rf_cv.fit(X_train, y_train)

# Examine best score

rf_cv.best_score_

# Examine best hyperparameter combo (only one value per hyperparameter was searched, so this is informational)

rf_cv.best_params_


# Instantiate the XGBoost classifier

xgb = XGBClassifier(objective='binary:logistic', random_state=42)

# Create a dictionary of hyperparameters to tune

cv_params = {'max_depth': [6, 12],

             'min_child_weight': [3, 5],

             'learning_rate': [0.01, 0.1],

             'n_estimators': [300]

             }

# Define a dictionary of scoring metrics to capture

scoring = {'accuracy', 'precision', 'recall', 'f1'}

# Instantiate the GridSearchCV object

xgb_cv = GridSearchCV(xgb, cv_params, scoring=scoring, cv=4, refit='recall')

%%time

# Fit the model to the training data

xgb_cv.fit(X_train, y_train)

# Examine best score

xgb_cv.best_score_

# Examine best parameters

xgb_cv.best_params_


# Use random forest model to predict on validation data

rf_val_preds = rf_cv.best_estimator_.predict(X_val)


# Use XGBoost model to predict on validation data

xgb_val_preds = xgb_cv.best_estimator_.predict(X_val)


# Use XGBoost model to predict on test data

xgb_test_preds = xgb_cv.best_estimator_.predict(X_test)


# Generate array of values for confusion matrix

cm = confusion_matrix(y_test, xgb_test_preds, labels=xgb_cv.classes_)

# Plot confusion matrix

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['retained', 'churned'])

disp.plot();
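

As an added summary (using only metric functions already imported at the top of this notebook), the champion model's test-set predictions can be scored directly:

# Added summary of test-set performance for the XGBoost model

print('accuracy :', accuracy_score(y_test, xgb_test_preds))

print('precision:', precision_score(y_test, xgb_test_preds))

print('recall   :', recall_score(y_test, xgb_test_preds))

print('F1       :', f1_score(y_test, xgb_test_preds))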


# Plot the most important features of the final model

plot_importance(xgb_cv.best_estimator_);