In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import scipy as scp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors.kde import KernelDensity
import itertools
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
In [2]:
# Load the credit-card default spreadsheet; header=1 takes the column names from the second row of the sheet.
# NOTE(review): absolute, machine-specific path — make it configurable (e.g. a DATA_DIR constant) so the notebook runs elsewhere.
df = pd.read_excel("/home/gogol/mypy/default of credit card clients.xls", header = 1)
In [3]:
# Number of rows and columns in the loaded frame.
df.shape
Out[3]:
In [4]:
# Preview the first rows of the raw data.
df.head()
Out[4]:
In [5]:
# List the column labels; later cells slice by these labels.
df.columns
Out[5]:
In [6]:
# Repayment-status columns, relabelled so the six months are numbered 1..6
# (the first column is called PAY_0 in the source frame).
pay_status_labels = ['PAY_{}'.format(month) for month in range(1, 7)]
df_pay_status = df.loc[:, 'PAY_0':'PAY_6']
df_pay_status.columns = pay_status_labels
df_pay_status.head(10)
Out[6]:
In [7]:
# Bar charts of how often each repayment-status code occurs, one panel per month (2 x 3 grid).
# The figure is created first: calling plt.tight_layout() before any figure exists
# only spawns an extra empty figure, and the spacing is set explicitly below anyway.
fig, axis = plt.subplots(2, 3)
fig.set_size_inches(17, 7)
ttl = fig.suptitle('Distribution of delays in the past 6 months')
ttl.set_position([.5, 1.05])  # lift the title above the top row of panels
fig.subplots_adjust(
    left=0.125,   # left edge of the subplot area
    bottom=0.1,   # bottom edge of the subplot area
    right=0.9,    # right edge of the subplot area
    top=0.9,      # top edge of the subplot area
    wspace=.5,    # horizontal gap between panels
    hspace=1.1,   # vertical gap between panels
)
columns = df_pay_status.columns
for i, column in enumerate(columns):
    row, col = divmod(i, 3)  # fill the grid row by row
    d = df_pay_status[column].value_counts()
    e = d.index
    g = sns.barplot(x=e, y=d, ax=axis[row, col], palette='Blues_d')
In [8]:
# Label-based slice of the monthly bill-amount columns (BILL_AMT1 through BILL_AMT6, inclusive).
df_bill_amt = df.loc[:, 'BILL_AMT1':'BILL_AMT6']
In [9]:
# Label-based slice of the monthly payment-amount columns (PAY_AMT1 through PAY_AMT6, inclusive).
df_pay_amt = df.loc[:, 'PAY_AMT1':'PAY_AMT6']
In [10]:
# Display the frame without the label column.
# NOTE(review): the result is not assigned, so df itself is unchanged by this cell.
df.drop(['default payment next month'], axis =1)
Out[10]:
In [11]:
# Summary statistics of the bill-amount columns; plotted in the next cell.
bill_description = df_bill_amt.describe()
In [12]:
# Bar charts of three summary statistics of the monthly bill amounts, one panel each.
# Fixes relative to the earlier version of this cell:
#   * the panel count no longer comes from len(columns)/2, which is a float on
#     Python 3 and makes range() raise TypeError;
#   * plt.tight_layout() is no longer called before the figure exists;
#   * the loop no longer overwrites the notebook-level names X and Y.
fig, axis = plt.subplots(1, 3)
fig.set_size_inches(18, 5)
ttl = fig.suptitle('Distribution of Mean, STD & Min of Bill Amounts in the past 6 months')
ttl.set_position([.5, 1.05])  # lift the title above the panels
fig.subplots_adjust(
    left=0.125,   # left edge of the subplot area
    bottom=0.1,   # bottom edge of the subplot area
    right=0.9,    # right edge of the subplot area
    top=0.9,      # top edge of the subplot area
    wspace=.5,    # horizontal gap between panels
    hspace=2,     # vertical gap between panels
)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
bill_columns = bill_description.columns[0:6]
# Rows 1..3 of the describe() table (row 0 is skipped); the figure title names them mean, std and min.
for panel, c in zip(axis, bill_description.index[1:4]):
    stat_values = bill_description.loc[c]
    g = sns.barplot(x = bill_columns, y = stat_values, ax = panel, label='small', palette = 'Blues_d')
    g.set_xticklabels(bill_columns, rotation=30)
    g.set_title(c)
In [13]:
# Summary statistics of the payment-amount columns.
pay_description = df_pay_amt.describe()
In [14]:
# Show the summary table itself.
pay_description
Out[14]:
In [15]:
# Bar charts of two summary statistics of the monthly payment amounts, one panel each.
# The figure is created first (plt.tight_layout() before any figure exists only
# produces an extra empty figure), and the loop uses local names so it does not
# overwrite the notebook-level X and Y.
fig, axis = plt.subplots(1, 2)
fig.set_size_inches(18, 5)
ttl = fig.suptitle('Distribution of Mean & STD of Payment Amounts in the past 6 months')
ttl.set_position([.5, 1.05])  # lift the title above the panels
fig.subplots_adjust(
    left=0.125,   # left edge of the subplot area
    bottom=0.1,   # bottom edge of the subplot area
    right=0.9,    # right edge of the subplot area
    top=0.9,      # top edge of the subplot area
    wspace=.5,    # horizontal gap between panels
    hspace=2,     # vertical gap between panels
)
sns.set(font_scale = 1.1)
pay_columns = pay_description.columns[0:6]
# Rows 1..2 of the describe() table (row 0 is skipped); the figure title names them mean and std.
for panel, c in zip(axis, pay_description.index[1:3]):
    stat_values = pay_description.loc[c]
    g = sns.barplot(x = pay_columns, y = stat_values, ax = panel, label='small', palette = 'Blues_d')
    g.set_xticklabels(pay_columns, rotation=30)
    g.set_title(c)
In [16]:
# Smallest payment recorded in each month's column.
df_pay_amt.min()
Out[16]:
In [17]:
# Count how many rows share each LIMIT_BAL value (value_counts orders by count, not by limit).
d = df['LIMIT_BAL'].value_counts()
In [18]:
# Bar chart of how many customers hold each credit limit.
fig = plt.figure()
fig.set_size_inches(30, 5)
sns.set_style("whitegrid")
ttl = fig.suptitle('Distribution of Limit Balance')
ttl.set_position([.5, 1.05])  # lift the title above the axes
# seaborn draws numeric categories in ascending order, whereas value_counts()
# returns them ordered by frequency. Sorting the counts by limit keeps the tick
# labels set below in step with the bars (labelling with the unsorted index
# attached the wrong limit to most bars; the earlier np.sort(dd) call discarded
# its result and had no effect). `d` itself is left untouched for later cells.
limit_counts = d.sort_index()
dd = limit_counts.index
g = sns.barplot(x = dd, y = limit_counts, label='small', palette = 'Blues_d')
g.set_xticklabels(dd, rotation=90)
Out[18]:
In [19]:
# First entries of the LIMIT_BAL counts.
d.head()
Out[19]:
In [20]:
# Label column: the 'default payment next month' flag.
Y = df['default payment next month']
# Wrapped in a DataFrame only for display; Y itself stays a Series.
pd.DataFrame(Y)
Out[20]:
In [21]:
# Class balance of the label; note this rebinds `d`, which previously held the LIMIT_BAL counts.
d = Y.value_counts()
In [22]:
# Bar chart comparing the number of defaulters with non-defaulters.
fig1 = plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
ttl = fig1.suptitle('Defaulters out of 30,000 sample size (Defaulters = 1)')
ttl.set_position([.5, 1.05])  # lift the title above the axes

# x positions are the label values, bar heights are the row counts per label
Number_of_Customers = Y.value_counts()
Defaulter_vs_NonDefaulters = d.index
barplot_options = dict(saturation=1, palette='Blues_d')
g1 = sns.barplot(x=Defaulter_vs_NonDefaulters, y=Number_of_Customers, **barplot_options)
In [23]:
# Separate the label column (Y) from the remaining columns (X).
target_column = 'default payment next month'
Y = df[target_column]
X = df.drop([target_column], axis=1)
In [24]:
# Turn the coded SEX and MARRIAGE columns into labelled categoricals and bucket AGE into 10-year bins.
# rename_categories assigns the new labels positionally to the existing categories.
sex_as_category = df['SEX'].astype('category')
df['SEX'] = sex_as_category.cat.rename_categories(['M', 'F'])

marriage_as_category = df['MARRIAGE'].astype('category')
df['MARRIAGE'] = marriage_as_category.cat.rename_categories(['na', 'married', 'single', 'other'])

age_bin_edges = range(0, 100, 10)
df['age_cat'] = pd.cut(df['AGE'], age_bin_edges, right=False)  # left-closed bins: [0, 10), [10, 20), ...
In [25]:
# Row counts per (label, demographic value), drawn as grouped bars: one panel per demographic.
demographic_columns = ['SEX', 'MARRIAGE', 'age_cat']
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(20, 5)
fig.suptitle('Defaulting by absolute numbers, for various demographics')
for panel, demographic in zip(ax, demographic_columns):
    grouped_sizes = df.groupby(['default payment next month', demographic]).size()
    df_demo_1 = grouped_sizes.unstack(level = 1)  # demographic values become columns
    last_axes = df_demo_1.plot(kind = 'bar', ax = panel)
# the cell's displayed value is the Axes returned by the final plot call
last_axes
Out[25]:
In [26]:
# Same grouping as the absolute-count figure, but each demographic value's counts
# are divided by that value's total, so the bars of one colour sum to 1.
demographic_columns = ['SEX', 'MARRIAGE', 'age_cat']
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(20, 5)
fig.suptitle('Defaulting by relative numbers given each class, for various demographics')
for panel, demographic in zip(ax, demographic_columns):
    counts = df.groupby(['default payment next month', demographic]).size().unstack(level=1)
    d = counts / counts.sum()  # column-wise normalisation
    p = d.plot(kind='bar', ax=panel)
In [27]:
# Preview the frame after the categorical recoding above.
df.head()
Out[27]:
In [28]:
# Engineered payment features: average, log-average, spread, and each month's
# payment relative to the customer's average payment.
from math import log  # `log` is also used by the feature cells that follow

pay_amt_avg = df_pay_amt.mean(axis = 1)
df['pay_amt_avg_log'] = pay_amt_avg.apply(lambda x : log(x+1))
df['pay_amt_avg'] = pay_amt_avg
df['pay_std'] = df_pay_amt.std(axis=1)

# A zero average makes the ratio 0/0 = NaN. Left in place, those NaNs propagate
# into modelling (patsy's default NA handling drops such rows when the design
# matrix is built — NOTE(review): confirm against the patsy version in use).
# Define the relative amount as 0 in that case instead.
has_payments = pay_amt_avg != 0
for month in range(1, 7):
    ratio = df_pay_amt['PAY_AMT{}'.format(month)] / pay_amt_avg
    df['pay_rel_amt_{}'.format(month)] = ratio.where(has_payments, 0.0)
In [29]:
# Engineered bill features: average bill, its log (0 for non-positive averages),
# and each month's bill as a fraction of the credit limit.
bill_amt_avg = df_bill_amt.mean(axis = 1)
df['bill_amt_avg'] = bill_amt_avg
df['bill_amt_avg_log'] = bill_amt_avg.apply(lambda x : log(x+1) if x>0 else 0)
for month in range(1, 7):
    monthly_bill = df_bill_amt['BILL_AMT{}'.format(month)]
    df['billamt_rel_{}'.format(month)] = monthly_bill / df['LIMIT_BAL']
In [30]:
# Log-transformed credit limit; `log` is math.log, imported in an earlier feature cell.
df['LIMIT_BAL_LOG'] = df['LIMIT_BAL'].apply(lambda x: log(x+1))
# Credit limit bucketed into left-closed bins of width 10000.
# NOTE(review): range(0, int(1e6), 10000) stops at 990000, so any LIMIT_BAL at or above 990000 falls outside the bins and becomes NaN — confirm the data's maximum.
df['LIMIT_BAL_CAT'] = pd.cut(df['LIMIT_BAL'], range(0, int(1e6), 10000), right=False)
In [31]:
# Column labels after feature engineering.
df.columns
Out[31]:
In [32]:
# NOTE(review): this rebinds df_bill_amt — it previously held the raw BILL_AMT columns and now holds the bill/limit ratios, so earlier cells that use it cannot be re-run after this one.
df_bill_amt = df.loc[:, 'billamt_rel_1': 'billamt_rel_6']
In [33]:
# Preview the relative bill amounts.
df_bill_amt.head()
Out[33]:
In [34]:
# Histogram plus Gaussian kernel density estimate for each relative bill amount
# (bill amount / credit limit), one panel per month in a 2 x 3 grid.
fig, ax = plt.subplots(2, 3)
fig.set_size_inches(17, 7)  # the earlier (15, 5) call was immediately overridden and has been removed
ttl = fig.suptitle('Distribution of bill relative to credit in the past 6 months')
ttl.set_position([.5, 1.05])  # lift the title above the top row of panels
fig.subplots_adjust(
    left=0.125,   # left edge of the subplot area
    bottom=0.1,   # bottom edge of the subplot area
    right=0.9,    # right edge of the subplot area
    top=0.9,      # top edge of the subplot area
    wspace=.5,    # horizontal gap between panels
    hspace=1.1,   # vertical gap between panels
)
columns = df_bill_amt.columns
for i, column in enumerate(columns):
    row, col = divmod(i, 3)  # fill the grid row by row
    values = df_bill_amt[column]
    # Normalised histogram. `density=True` replaces `normed=1`, which current
    # matplotlib releases no longer accept.
    n, bins, patches = ax[row, col].hist(values, 50, density=True, facecolor='green', alpha=0.75)
    # Fit the kernel density estimate on the column (sklearn expects a 2-D array).
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(values.values.reshape(-1, 1))
    x_grid = np.linspace(values.min(), values.max(), 1000)
    log_pdf = kde.score_samples(x_grid.reshape(-1, 1))  # log-density on the grid
    # Overlay the density curve on the histogram.
    ax[row, col].plot(x_grid, np.exp(log_pdf), color='blue', alpha=0.5, lw=3)
    ax[row, col].set_title(column)
In [35]:
# Feature frame: everything except the row ID and the raw credit limit.
# NOTE(review): the label column 'default payment next month' is not dropped here, so it is still part of X.
X = df.drop(['ID', 'LIMIT_BAL'], axis = 1)
In [36]:
# Preview the feature frame.
X.head()
Out[36]:
In [37]:
# One-hot encode SEX and MARRIAGE (dropping the first level of each) and append
# both sets of dummy columns to X in a single concat. Building X1 in two separate
# assignments from X discarded the SEX dummies, because the second assignment
# started again from X rather than from the first result.
X1 = pd.concat(
    [
        X,
        pd.get_dummies(df['SEX'], drop_first = True),
        pd.get_dummies(df['MARRIAGE'], drop_first = True),
    ],
    axis = 1,
)
In [38]:
# Remove the original categorical columns now that their dummy encodings are present.
X1 = X1.drop(['SEX', 'MARRIAGE'], axis = 1)
In [39]:
# Preview the encoded feature frame.
X1.head()
Out[39]:
In [40]:
# Modelling frame: encoded features with the label appended as the last column.
# NOTE(review): X1 derives from X, which still contains 'default payment next month', so D appears to hold that column twice — confirm; later cells look like they rely on it.
D = pd.concat([X1, Y], axis = 1)
In [41]:
# Rename the label column to "target" so it can be used in the formula; index=str also converts every index label to a string.
D = D.rename(index=str, columns={"default payment next month": "target"})
In [42]:
# Column labels of the modelling frame.
D.columns
Out[42]:
In [43]:
# Patsy formula: the label on the left, original and engineered predictors on the right.
# Changes from the concatenated-string version:
#   * LIMIT_BAL_LOG enters as a numeric term. Wrapping a continuous log value in
#     C() produced one dummy column per distinct credit limit, duplicating the
#     information already carried by C(LIMIT_BAL_CAT);
#   * the repeated C(married) term and the dangling '+' right after '~' are gone.
formula_terms = [
    # marital-status dummies, education and age
    'C(married)', 'C(single)', 'C(other)', 'C(EDUCATION)', 'AGE',
    # monthly repayment status
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    # binned age / credit limit and log credit limit
    'C(age_cat)', 'C(LIMIT_BAL_CAT)', 'LIMIT_BAL_LOG',
    # payment-amount features
    'pay_amt_avg', 'pay_std', 'pay_amt_avg_log',
    'pay_rel_amt_1', 'pay_rel_amt_2', 'pay_rel_amt_3',
    'pay_rel_amt_4', 'pay_rel_amt_5', 'pay_rel_amt_6',
    # bill-amount features
    'bill_amt_avg', 'bill_amt_avg_log',
    'billamt_rel_1', 'billamt_rel_2', 'billamt_rel_3',
    'billamt_rel_4', 'billamt_rel_5', 'billamt_rel_6',
]
formula = 'target ~ ' + ' + '.join(formula_terms)
In [44]:
# Show the assembled formula string.
formula
Out[44]:
In [45]:
from patsy import dmatrices
# Build the response (Y) and design (X) matrices from the formula; this rebinds both X and Y.
Y, X = dmatrices(formula, data= D, return_type='dataframe')
# Keep only the second column of the response matrix as the label.
# NOTE(review): the response presumably has two columns because D holds two columns named "target"; a later plot refers to this column as 'target[1]' — confirm.
Y = Y.iloc[:, 1]
In [46]:
import warnings
from sklearn.feature_selection import SelectKBest, f_classif
# Silence UserWarning and RuntimeWarning for the rest of the session.
# simplefilter is documented to take a single Warning subclass, so register
# one filter per category instead of passing a tuple.
for ignored_category in (UserWarning, RuntimeWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)
# Univariate ANOVA F-test feature selection keeping the 25 best-scoring columns.
# `k` is passed by keyword: current scikit-learn releases reject it positionally.
selector = SelectKBest(f_classif, k=25)
selector.fit(X, Y)
Out[46]:
In [47]:
# Positions of the 25 highest F-scores, best first; NaN scores are treated as 0 for ranking.
top_indices = np.nan_to_num(selector.scores_).argsort()[-25:][::-1]
# Show each selected column together with its score. Previously the scores were
# evaluated on a line of their own mid-cell, so they were never displayed.
pd.Series(selector.scores_[top_indices], index=X.columns[top_indices], name='f_score')
Out[47]:
In [48]:
from sklearn import preprocessing
# Min-max scaler, used as the second step of the preprocessing pipeline built in the next cell.
scaler = preprocessing.MinMaxScaler()
# NOTE(review): this fit on the full design matrix looks redundant — fitting the pipeline should refit the scaler on the selected columns only; confirm and remove.
scaler.fit(X)
Out[48]:
In [49]:
from sklearn.pipeline import Pipeline
# Chain feature selection and scaling, fit on the design matrix and label, then transform the same matrix.
preprocess = Pipeline(steps=[('anova', selector), ('scale', scaler)])
X_prep = preprocess.fit(X, Y).transform(X)
In [52]:
# Wrap the transformed features in a DataFrame (default integer index and column labels).
X_prep = pd.DataFrame(X_prep)
In [82]:
# Convert the label to a one-column DataFrame; it keeps the label's existing index.
Y = pd.DataFrame(Y)
In [91]:
from sklearn.decomposition import PCA
# Project the preprocessed features onto their first two principal components for plotting.
component_labels = ['principal component 1', 'principal component 2']
pca = PCA(n_components=len(component_labels))
principalComponents = pca.fit_transform(X_prep)
Df = pd.DataFrame(data=principalComponents, columns=component_labels)
In [101]:
# Put the two principal components and the label side by side.
# pd.concat(axis=1) aligns rows by index label. Df carries a fresh 0..n-1 index,
# while Y still carries the index of the modelling frame, so concatenating them
# directly pairs the wrong rows (or none at all) and fills the gaps with NaN.
# Both are in the same row order, so align them by position instead.
Df_p = pd.concat([Df.reset_index(drop=True), Y.reset_index(drop=True)], axis = 1)
In [125]:
# Column labels of the plotting frame; the next cell refers to the label column by name.
Df_p.columns
Out[125]:
In [124]:
import seaborn as sns
# Scatter of the two principal components with a fitted regression line per label value.
# NOTE(review): the hue column name 'target[1]' is presumably the name patsy gave the selected response column — confirm against Df_p.columns.
g = sns.lmplot(x = 'principal component 1',
y = 'principal component 2',
data = Df_p,
hue = 'target[1]'
)
In [50]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the selector and scaler were fitted on all rows before this split, so the test rows influenced the preprocessing — consider fitting the pipeline on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X_prep, Y, test_size=0.2, random_state=42)
In [ ]:
import itertools
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix
In [63]:
# Confusion-matrix heat map
def plot_cm(ax, y_true, y_pred, classes, title, th=0.5, cmap=plt.cm.Blues):
    """Draw a confusion matrix on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    y_true : true labels.
    y_pred : predicted scores; values strictly above ``th`` count as class 1.
    classes : tick labels for both axes.
    title : title of the panel.
    th : decision threshold applied to ``y_pred``.
    cmap : colour map used for the heat map.
    """
    predicted_labels = (y_pred > th).astype(int)
    matrix = confusion_matrix(y_true, predicted_labels)
    ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    positions = np.arange(len(classes))
    ax.set_xticks(positions)
    ax.set_xticklabels(classes)
    ax.set_yticks(positions)
    ax.set_yticklabels(classes)

    # Write each count into its cell; switch to white text on the darker cells.
    half_max = matrix.max() / 2.
    n_rows, n_cols = matrix.shape
    for r in range(n_rows):
        for c in range(n_cols):
            count = matrix[r, c]
            if count > half_max:
                text_colour = "white"
            else:
                text_colour = "black"
            ax.text(c, r, count, horizontalalignment="center", color=text_colour)

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
In [64]:
# ROC curves with AUC and accuracy for the train and test splits
def plot_auc(ax, y_train, y_train_pred, y_test, y_test_pred, th=0.5):
    """Plot train and test ROC curves on ``ax`` with accuracy/AUC in the legend.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    y_train, y_test : true labels of each split.
    y_train_pred, y_test_pred : predicted scores of each split.
    th : threshold applied to the scores when computing accuracy
        (scores strictly above it count as class 1).
    """
    def _curve_stats(truth, scores):
        # ROC points, area under the curve, and accuracy at the threshold for one split
        fpr, tpr, _ = roc_curve(truth, scores)
        hard_labels = (scores > th).astype(int)
        return fpr, tpr, auc(fpr, tpr), accuracy_score(truth, hard_labels)

    fpr_train, tpr_train, roc_auc_train, acc_train = _curve_stats(y_train, y_train_pred)
    fpr_test, tpr_test, roc_auc_test, acc_test = _curve_stats(y_test, y_test_pred)

    # Curves first (train, then test) so the two legend entries below attach to them;
    # the dashed diagonal is the chance line.
    ax.plot(fpr_train, tpr_train)
    ax.plot(fpr_test, tpr_test)
    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')

    legend_entries = [
        'train acc = {:.3f}, auc = {:.2f}'.format(acc_train, roc_auc_train),
        'test acc = {:.3f}, auc = {:.2f}'.format(acc_test, roc_auc_test),
    ]
    ax.legend(legend_entries)
In [51]:
from sklearn import linear_model
# Logistic regression with default settings.
regr = linear_model.LogisticRegression()
# Train on the training split. y_train is a one-column DataFrame at this point;
# scikit-learn classifiers expect a 1-D label array, so flatten it explicitly
# rather than relying on an implicit conversion (and its warning).
regr.fit(X_train, np.ravel(y_train))
Out[51]:
In [52]:
# Hard class predictions of the logistic model on the test split.
predictions = regr.predict(X_test)
In [65]:
# Evaluate the logistic model: confusion matrices for both splits plus ROC curves.
threshold = 0.5
# probability of the second class (column 1 of predict_proba)
y_train_pred = regr.predict_proba(X_train)[:, 1]
y_test_pred = regr.predict_proba(X_test)[:, 1]

fig, ax = plt.subplots(1, 3)
fig.set_size_inches(15, 5)
confusion_panels = [
    (ax[0], y_train, y_train_pred, 'Confusion matrix (TRAIN)'),
    (ax[1], y_test, y_test_pred, 'Confusion matrix (TEST)'),
]
for panel, truth, scores, panel_title in confusion_panels:
    plot_cm(panel, truth, scores, [0, 1], panel_title, threshold)
plot_auc(ax[2], y_train, y_train_pred, y_test, y_test_pred, threshold)
plt.tight_layout()
plt.show()
In [66]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# random_state pins the bootstrap and feature sampling so a re-run reproduces the
# same forest (42 matches the seed used for the train/test split).
rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5, random_state=42)
# y_train is a one-column DataFrame here; pass a flat 1-D label array as scikit-learn expects.
rf.fit(X_train, np.ravel(y_train))
Out[66]:
In [67]:
# Hard class predictions of the random forest on the test split (rebinds `predictions`).
predictions = rf.predict(X_test)
In [68]:
# Evaluate the random forest the same way: confusion matrices for both splits plus ROC curves.
threshold = 0.5
train_proba = rf.predict_proba(X_train)
test_proba = rf.predict_proba(X_test)
# keep the probability of the second class (column 1)
y_train_pred = train_proba[:, 1]
y_test_pred = test_proba[:, 1]

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
class_labels = [0, 1]
plot_cm(ax[0], y_train, y_train_pred, class_labels, 'Confusion matrix (TRAIN)', threshold)
plot_cm(ax[1], y_test, y_test_pred, class_labels, 'Confusion matrix (TEST)', threshold)
plot_auc(ax[2], y_train, y_train_pred, y_test, y_test_pred, threshold)
plt.tight_layout()
plt.show()
In [79]:
# Importance assigned by the forest to each of the preprocessed feature columns.
rf.feature_importances_
Out[79]:
In [85]:
# Display the training features as a DataFrame.
pd.DataFrame(X_train)
Out[85]:
In [86]:
# Rows and columns in the training split.
X_train.shape
Out[86]:
No comments:
Post a Comment