PyMC3 aims for intuitive and readable, yet powerful, syntax that reflects how statisticians describe models. The modeling process generally moves from model specification, through inference, to model checking and prediction.
The resulting model can be used for inference to gain detailed insights into parameter values as well as to predict outcomes for new data points.
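A minimal sketch of this workflow on synthetic data (the distributions, priors, and names below are illustrative assumptions, not the model built in this section):
import numpy as np
import pymc3 as pm
import arviz as az

# Synthetic observations for illustration only
y_demo = np.random.RandomState(42).normal(loc=1.0, scale=2.0, size=100)

with pm.Model() as demo_model:
    # Specify priors for the unknown parameters
    mu = pm.Normal('mu', mu=0, sd=10)
    sigma = pm.HalfNormal('sigma', sd=10)
    # Specify the likelihood that links the parameters to the observed data
    pm.Normal('y_obs', mu=mu, sd=sigma, observed=y_demo)
    # Run inference to approximate the posterior
    demo_trace = pm.sample(1000, tune=1000, random_seed=42)
    # Check the result: posterior summaries and credible intervals
    print(az.summary(demo_trace))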
Logistic regression models a linear relationship between a set of features and a binary outcome, passed through a sigmoid function so that the model produces probabilities. The frequentist approach yields point estimates for the parameters that measure the influence of each feature on the probability that a data point belongs to the positive class, with confidence intervals that rest on assumptions about the sampling distribution of those estimates.
In contrast, Bayesian logistic regression estimates the full posterior distribution over the parameters. The posterior yields a Bayesian credible interval for each parameter, with the benefit of greater transparency about the model's uncertainty.
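Written out, with features $x_i$, coefficients $\beta$, and intercept $\beta_0$, the model and the posterior that Bayesian inference targets are:
$$
p(y_i = 1 \mid x_i, \beta) = \sigma\left(\beta_0 + x_i^\top \beta\right), \qquad
\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad
p(\beta \mid \mathbf{y}, X) \propto p(\mathbf{y} \mid X, \beta)\, p(\beta).
$$
The frequentist fit maximizes the likelihood $p(\mathbf{y} \mid X, \beta)$ alone, while the Bayesian treatment combines it with the prior $p(\beta)$ and summarizes the resulting posterior, for example through credible intervals.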
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score,
                             precision_recall_curve)
import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf
import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import seaborn as sns
from IPython.display import HTML
sns.set(style="white", color_codes=True)
pd.set_option('display.max_columns', None)
data_path = Path('data')
fig_path = Path('figures')
model_path = Path('models')
for p in [data_path, fig_path, model_path]:
    if not p.exists():
        p.mkdir()
The goal is to build a binary classification model that predicts whether a customer will subscribe to a term deposit after a marketing campaign run by the bank, based on $17$ predictor variables for just over $11,000$ customers. The target variable is given as deposit and takes on a value of $1$ if the customer has subscribed and $0$ otherwise.
The classes are slightly imbalanced because more customers declined the term deposit than subscribed to it.
data = pd.read_csv('bank.csv')
data.info()
data.head()
unique = data.nunique()
unique
mask = data.nunique().values < 31
cat_features = list(unique[mask].index)
cat_features
total = len(data)
plt.figure(figsize=(7,5))
g = sns.countplot(x='deposit', data=data)
g.set_ylabel('Count', fontsize=14)
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2.,
           height + 1.5,
           '{:1.2f}%'.format(height / total * 100),
           ha="center", fontsize=14, fontweight='bold')
plt.margins(y=0.1)
plt.show()
plt.figure(figsize=(13, 45))
for i, feature in enumerate(cat_features, 1):
    if feature == 'deposit':
        break
    plt.subplot(len(cat_features), 3, i)
    sns.countplot(y=feature, hue='deposit', data=data)
    plt.title(feature, fontsize=12)
    plt.legend(loc='best', prop={'size': 8})
plt.tight_layout()
plt.show()
plt.figure(figsize=(8,4))
sns.countplot(x='day', hue='deposit', data=data);
num_features = [f for f in data.columns if f not in cat_features]
num_features.remove('day')
num_features
basetable0 = pd.DataFrame(index=data.index)
basetable0['deposit'] = data['deposit']
deposit_dict = {'no': 0, 'yes': 1}
basetable0 = basetable0.replace({'deposit': deposit_dict})
for variable in num_features:
    if len(data.groupby(variable)) > 31:
        new_variable = "disc_" + variable
        basetable0[new_variable] = pd.qcut(data[variable], 5, duplicates='drop')
basetable0.head()
def create_pig_table(basetable, target, variable):
    # Predictor insight graph (PIG) table: target incidence and group size per level of the variable
    groups = basetable[[target, variable]].groupby(variable)
    pig_table = groups[target].agg(Incidence=np.mean, Size=np.size).reset_index()
    return pig_table
def plot_pig(pig_table, variable):
    plt.figure(figsize=(10, 4))
    plt.ylabel("Size", rotation=0, rotation_mode="anchor", ha="right")
    pig_table["Size"].plot(kind="bar", width=0.5, color="lightgray", edgecolor="none")
    pig_table["Incidence"].plot(secondary_y=True)
    plt.xticks(np.arange(len(pig_table)), pig_table[variable])
    plt.xlim([-0.5, len(pig_table) - 0.5])
    plt.ylabel("Incidence", rotation=0, rotation_mode="anchor", ha="left")
    plt.title(f'{variable}')
    plt.show()
variables = list(basetable0.drop('deposit', axis=1).columns)
pig_tables = {}
for variable in variables:
    pig_table = create_pig_table(basetable0, 'deposit', variable)
    pig_tables[variable] = pig_table
for variable in variables:
    pig_table = create_pig_table(basetable0, 'deposit', variable)
    plot_pig(pig_table, variable)
job_dict = {'management': 12, 'admin.': 11, 'entrepreneur': 10, 'technician': 9, 'services': 8, 'self-employed': 7,
'blue-collar': 6, 'retired': 5, 'housemaid': 4, 'unemployed': 3, 'unknown': 2, 'student': 1}
data = data.replace({'job': job_dict})
marital_dict = {'married': 4, 'single': 3, 'divorced': 2, 'unknown': 1}
data = data.replace({'marital': marital_dict})
data['marital'] = data['marital'].astype(int)
edu_dict = {'tertiary': 4, 'secondary': 3, 'primary': 2, 'unknown': 1}
data = data.replace({'education': edu_dict})
default_dict = {'no': 1, 'yes': 0}
data = data.replace({'default': default_dict})
housing_dict = {'no': 1, 'yes': 0}
data = data.replace({'housing': housing_dict})
loan_dict = {'no': 1, 'yes': 0}
data = data.replace({'loan': loan_dict})
contact_dict = {'cellular': 3, 'telephone': 2, 'unknown': 1}
data = data.replace({'contact': contact_dict})
poutcome_dict = {'success': 4, 'other': 3, 'unknown': 2, 'failure': 1}
data = data.replace({'poutcome': poutcome_dict})
month_dict = {'jan': 1, 'feb': 2, 'aug': 8, 'nov': 11, 'jun': 6, 'apr': 4, 'jul': 7, 'may': 5, 'oct': 10,
'mar': 3, 'sep': 9, 'dec': 12}
data = data.replace({'month': month_dict})
pdays_dict = {-1: 0}
data = data.replace({'pdays': pdays_dict})
data.head()
data.dtypes
mask = data.nunique().values > 31
num_features = list(unique[mask].index)
num_features
data[num_features].hist(bins=30, figsize=(8,7))
plt.tight_layout();
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as stats
colors = cm.rainbow(np.linspace(0, 1, len(num_features)))
for col, c in zip(num_features, colors):
    # Box-Cox requires strictly positive input, so rescale features with non-positive values first
    minmax = MinMaxScaler(feature_range=(0.0001, max(data[col].values)))
    if data[col].describe().iloc[3] <= 0.0:  # iloc[3] is the column minimum
        z = minmax.fit_transform(data[col].values.reshape(-1, 1))
        z = np.squeeze(z)
        print('BC MinMax Scaled')
    else:
        z = data[col]
    # Compare the optimal Box-Cox fit with fixed-lambda power transforms and the cube root
    z0, _ = stats.boxcox(z, lmbda=None)
    z1 = stats.boxcox(z, lmbda=-1)
    z2 = stats.boxcox(z, lmbda=-0.5)
    z3 = stats.boxcox(z, lmbda=0.0)
    z4 = stats.boxcox(z, lmbda=0.5)
    z5 = stats.boxcox(z, lmbda=2)
    z6 = stats.boxcox(z, lmbda=-2)
    z7 = np.cbrt(data[col])
    trans = [data[col], z0, z1, z2, z3, z4, z5, z6, z7]
    names = ['Original', 'Boxcox', 'Recip', 'Recip_Sqrt', 'Log', 'Sqrt', 'Square', 'Recip_Square', 'Cube_rt']
    fig = plt.figure(figsize=(12, 6))
    for i, t, n in zip(range(1, 10), trans, names):
        ax = fig.add_subplot(3, 3, i)
        plt.hist(t, bins=30, color=c, alpha=0.6)
        plt.title(col + '_' + n)
    plt.tight_layout()
    plt.show()
data.balance = np.cbrt(data.balance)
age, _ = stats.boxcox(data.age, lmbda=None)
data.age = age
duration, _ = stats.boxcox(data.duration, lmbda=None)
data.duration = duration
from sklearn.preprocessing import scale
cols = ['duration', 'balance', 'age']
data.loc[:, cols] = scale(data.loc[:, cols])
nums = data[num_features].copy()  # copy to avoid modifying a view of data
nums['deposit'] = data.deposit
sns.pairplot(nums, hue='deposit', diag_kind='hist');