Saturday, February 10, 2018

Credit Card Defaulter Prediction

Credit Card Defaulter Prediction
In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import scipy as scp

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.neighbors.kde import KernelDensity

import itertools
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix

from  sklearn.model_selection import train_test_split
from  sklearn import preprocessing
from  sklearn.ensemble import RandomForestRegressor
from  sklearn.pipeline import make_pipeline
from  sklearn.model_selection import GridSearchCV
from  sklearn.metrics import mean_squared_error, r2_score
In [2]:
df = pd.read_excel("/home/gogol/mypy/default of credit card clients.xls", header = 1)
In [3]:
df.shape
Out[3]:
(30000, 25)
In [4]:
df.head()
Out[4]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
0 1 20000 2 2 1 24 2 2 -1 -1 ... 0 0 0 0 689 0 0 0 0 1
1 2 120000 2 2 2 26 -1 2 0 0 ... 3272 3455 3261 0 1000 1000 1000 0 2000 1
2 3 90000 2 2 2 34 0 0 0 0 ... 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
3 4 50000 2 2 1 37 0 0 0 0 ... 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
4 5 50000 1 2 1 57 -1 0 -1 0 ... 20940 19146 19131 2000 36681 10000 9000 689 679 0
5 rows × 25 columns
In [5]:
df.columns
Out[5]:
Index([u'ID', u'LIMIT_BAL', u'SEX', u'EDUCATION', u'MARRIAGE', u'AGE',
       u'PAY_0', u'PAY_2', u'PAY_3', u'PAY_4', u'PAY_5', u'PAY_6',
       u'BILL_AMT1', u'BILL_AMT2', u'BILL_AMT3', u'BILL_AMT4', u'BILL_AMT5',
       u'BILL_AMT6', u'PAY_AMT1', u'PAY_AMT2', u'PAY_AMT3', u'PAY_AMT4',
       u'PAY_AMT5', u'PAY_AMT6', u'default payment next month'],
      dtype='object')
In [6]:
df_pay_status = df.loc[:, 'PAY_0':'PAY_6']
df_pay_status.columns = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
df_pay_status.head(10)
Out[6]:
PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
0 2 2 -1 -1 -2 -2
1 -1 2 0 0 0 2
2 0 0 0 0 0 0
3 0 0 0 0 0 0
4 -1 0 -1 0 0 0
5 0 0 0 0 0 0
6 0 0 0 0 0 0
7 0 -1 -1 0 0 -1
8 0 0 2 0 0 0
9 -2 -2 -2 -2 -1 -1
In [7]:
plt.tight_layout()
fig, axis = plt.subplots(2,3)
fig.set_size_inches(17,7)
ttl = fig.suptitle('Distribution of dalays in the past 6 months')
ttl.set_position([.5, 1.05])


left   =  0.125  # the left side of the subplots of the figure
right  =  0.9    # the right side of the subplots of the figure
bottom =  0.1    # the bottom of the subplots of the figure
top    =  0.9    # the top of the subplots of the figure
wspace =  .5     # the amount of width reserved for blank space between subplots
hspace = 1.1 # the amount of height reserved for white space between subplots

plt.subplots_adjust(
    left    =  left, 
    bottom  =  bottom, 
    right   =  right, 
    top     =  top, 
    wspace  =  wspace, 
    hspace  =  hspace
)

columns = df_pay_status.columns

for i in range(len(columns)):
    row, col = int(i/3), i%3
    d = df_pay_status[columns[i]].value_counts()
    e = d.index
    g = sns.barplot(x = e, y = d, ax = axis[row, col], palette = 'Blues_d' )
<matplotlib.figure.Figure at 0x7f2ccb9daa90>
In [8]:
df_bill_amt = df.loc[:, 'BILL_AMT1':'BILL_AMT6']
In [9]:
df_pay_amt = df.loc[:, 'PAY_AMT1':'PAY_AMT6']
In [10]:
df.drop(['default payment next month'], axis =1)
Out[10]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
0 1 20000 2 2 1 24 2 2 -1 -1 ... 689 0 0 0 0 689 0 0 0 0
1 2 120000 2 2 2 26 -1 2 0 0 ... 2682 3272 3455 3261 0 1000 1000 1000 0 2000
2 3 90000 2 2 2 34 0 0 0 0 ... 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000
3 4 50000 2 2 1 37 0 0 0 0 ... 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000
4 5 50000 1 2 1 57 -1 0 -1 0 ... 35835 20940 19146 19131 2000 36681 10000 9000 689 679
5 6 50000 1 1 2 37 0 0 0 0 ... 57608 19394 19619 20024 2500 1815 657 1000 1000 800
6 7 500000 1 1 2 29 0 0 0 0 ... 445007 542653 483003 473944 55000 40000 38000 20239 13750 13770
7 8 100000 2 2 2 23 0 -1 -1 0 ... 601 221 -159 567 380 601 0 581 1687 1542
8 9 140000 2 3 1 28 0 0 2 0 ... 12108 12211 11793 3719 3329 0 432 1000 1000 1000
9 10 20000 1 3 2 35 -2 -2 -2 -2 ... 0 0 13007 13912 0 0 0 13007 1122 0
10 11 200000 2 3 2 34 0 0 2 0 ... 5535 2513 1828 3731 2306 12 50 300 3738 66
11 12 260000 2 1 2 51 -1 -1 -1 -1 ... 9966 8517 22287 13668 21818 9966 8583 22301 0 3640
12 13 630000 2 2 2 41 -1 0 -1 -1 ... 6500 6500 6500 2870 1000 6500 6500 6500 2870 0
13 14 70000 1 2 2 30 1 2 2 0 ... 65701 66782 36137 36894 3200 0 3000 3000 1500 0
14 15 250000 1 1 2 29 0 0 0 0 ... 63561 59696 56875 55512 3000 3000 3000 3000 3000 3000
15 16 50000 2 3 3 23 1 2 0 0 ... 28116 28771 29531 30211 0 1500 1100 1200 1300 1100
16 17 20000 1 1 2 24 0 0 2 2 ... 17428 18338 17905 19104 3200 0 1500 0 1650 0
17 18 320000 1 1 1 49 0 0 0 -1 ... 194663 70074 5856 195599 10358 10000 75940 20000 195599 50000
18 19 360000 2 1 1 49 1 -2 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
19 20 180000 2 1 2 29 1 -2 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
20 21 130000 2 3 2 39 0 0 0 0 ... 24489 20616 11802 930 3000 1537 1000 2000 930 33764
21 22 120000 2 2 1 39 -1 -1 -1 -1 ... 316 0 632 316 316 316 0 632 316 0
22 23 70000 2 2 2 26 2 0 0 2 ... 45020 44006 46905 46012 2007 3582 0 3601 0 1820
23 24 450000 2 1 1 40 -2 -2 -2 -2 ... 1473 560 0 0 19428 1473 560 0 0 1128
24 25 90000 1 1 2 23 0 0 0 -1 ... 0 5398 6360 8292 5757 0 5398 1200 2045 2000
25 26 50000 1 3 2 23 0 0 0 0 ... 36023 28967 29829 30046 1973 1426 1001 1432 1062 997
26 27 60000 1 1 2 27 1 -2 -1 -1 ... 259 -57 127 -189 0 1000 0 500 0 1000
27 28 50000 2 3 2 30 0 0 0 0 ... 17163 17878 18931 19617 1300 1300 1000 1500 1000 1012
28 29 50000 2 3 1 47 -1 -1 -1 -1 ... 3416 2040 30430 257 3415 3421 2044 30430 257 0
29 30 50000 1 1 2 26 0 0 0 0 ... 17496 17907 18375 11400 1500 1500 1000 1000 1600 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29970 29971 360000 1 1 1 34 -1 -1 -1 0 ... 64069 49005 8676 19487 52951 64535 8907 53 19584 16080
29971 29972 80000 1 3 1 36 0 0 0 0 ... 68279 69674 71070 73612 2395 2500 2530 2556 3700 3000
29972 29973 190000 1 1 1 37 0 0 0 0 ... 5869 29223 19616 148482 2000 3869 25128 10115 148482 4800
29973 29974 230000 1 2 1 35 1 -2 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
29974 29975 50000 1 2 1 37 1 2 2 2 ... 4328 2846 1585 1324 0 3000 0 0 1000 1000
29975 29976 220000 1 2 1 41 0 0 -1 -1 ... 1369 5924 1759 1824 8840 6643 5924 1759 1824 7022
29976 29977 40000 1 2 2 47 2 2 3 2 ... 53415 51259 47151 46934 4000 0 2000 0 3520 0
29977 29978 420000 1 1 2 34 0 0 0 0 ... 140011 141695 144839 147954 7000 7000 5500 5500 5600 5000
29978 29979 310000 1 2 1 39 0 0 0 0 ... 233854 219409 216540 210675 10029 9218 10029 8049 8040 10059
29979 29980 180000 1 1 1 32 -2 -2 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
29980 29981 50000 1 3 2 42 0 0 0 0 ... 49397 50360 19971 19694 10000 4000 5000 3000 4500 2000
29981 29982 50000 1 2 1 44 1 2 2 2 ... 33101 28192 22676 14647 2300 1700 0 517 503 585
29982 29983 90000 1 2 1 36 0 0 0 0 ... 10306 11328 12036 14329 1500 1500 1500 1200 2500 0
29983 29984 20000 1 2 1 44 -2 -2 -2 -2 ... 2712 2882 9235 1719 2890 2720 2890 9263 1824 1701
29984 29985 30000 1 2 2 38 -1 -1 -2 -1 ... 2939 1993 1907 3319 923 2977 1999 3057 3319 1000
29985 29986 240000 1 1 2 30 -2 -2 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
29986 29987 360000 1 1 2 35 -1 -1 -2 -2 ... 0 0 0 0 0 0 0 0 0 0
29987 29988 130000 1 1 2 34 0 0 0 0 ... 15546 108047 93708 97353 3000 2000 93000 4000 5027 4005
29988 29989 250000 1 1 1 34 0 0 0 0 ... 243075 245750 175005 179687 65000 8800 9011 6000 7000 6009
29989 29990 150000 1 1 2 35 -1 -1 -1 -1 ... -3 780 0 0 9054 0 783 0 0 0
29990 29991 140000 1 2 1 41 0 0 0 0 ... 139110 138262 49675 46121 6000 7000 4228 1505 2000 2000
29991 29992 210000 1 2 1 34 3 2 2 2 ... 2500 2500 2500 2500 0 0 0 0 0 0
29992 29993 10000 1 3 1 43 0 0 0 -2 ... 0 0 0 0 2000 0 0 0 0 0
29993 29994 100000 1 1 2 38 0 -1 -1 0 ... 102996 70626 69473 55004 2000 111784 4000 3000 2000 2000
29994 29995 80000 1 2 2 34 2 2 2 2 ... 79384 77519 82607 81158 7000 3500 0 7000 0 4000
29995 29996 220000 1 3 1 39 0 0 0 0 ... 208365 88004 31237 15980 8500 20000 5003 3047 5000 1000
29996 29997 150000 1 3 2 43 -1 -1 -1 -1 ... 3502 8979 5190 0 1837 3526 8998 129 0 0
29997 29998 30000 1 2 2 37 4 3 2 -1 ... 2758 20878 20582 19357 0 0 22000 4200 2000 3100
29998 29999 80000 1 3 1 41 1 -1 0 0 ... 76304 52774 11855 48944 85900 3409 1178 1926 52964 1804
29999 30000 50000 1 2 1 46 0 0 0 0 ... 49764 36535 32428 15313 2078 1800 1430 1000 1000 1000
30000 rows × 24 columns
In [11]:
bill_description = df_bill_amt.describe()
In [12]:
plt.tight_layout()
fig, axis = plt.subplots(1,3)
fig.set_size_inches(18,5)
ttl = fig.suptitle('Distribution of Mean, STD & Min of Bill Amounts in the past 6 months')
ttl.set_position([.5, 1.05])

left   =  0.125  # the left side of the subplots of the figure
right  =  0.9    # the right side of the subplots of the figure
bottom =  0.1    # the bottom of the subplots of the figure
top    =  0.9    # the top of the subplots of the figure
wspace =  .5     # the amount of width reserved for blank space between subplots
hspace =   2 # the amount of height reserved for white space between subplots

plt.subplots_adjust(
    left    =  left, 
    bottom  =  bottom, 
    right   =  right, 
    top     =  top, 
    wspace  =  wspace, 
    hspace  =  hspace
)

columns = bill_description.columns
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")

for i in range(len(columns)/2):
    col = i%3
    j = i+1
    X = bill_description.columns[0:6]
    c = bill_description.index[j]
    Y = bill_description.loc[c]
    
    g = sns.barplot(x = X, y = Y, ax = axis[col], label='small', palette = 'Blues_d')
    g.set_xticklabels(bill_description.columns[0:6], rotation=30)

    g.set_title(c)

    
<matplotlib.figure.Figure at 0x7f2cc3523cd0>
In [13]:
pay_description = df_pay_amt.describe()
In [14]:
pay_description
Out[14]:
PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
count 30000.000000 3.000000e+04 30000.00000 30000.000000 30000.000000 30000.000000
mean 5663.580500 5.921163e+03 5225.68150 4826.076867 4799.387633 5215.502567
std 16563.280354 2.304087e+04 17606.96147 15666.159744 15278.305679 17777.465775
min 0.000000 0.000000e+00 0.00000 0.000000 0.000000 0.000000
25% 1000.000000 8.330000e+02 390.00000 296.000000 252.500000 117.750000
50% 2100.000000 2.009000e+03 1800.00000 1500.000000 1500.000000 1500.000000
75% 5006.000000 5.000000e+03 4505.00000 4013.250000 4031.500000 4000.000000
max 873552.000000 1.684259e+06 896040.00000 621000.000000 426529.000000 528666.000000
In [15]:
plt.tight_layout()
fig, axis = plt.subplots(1,2)
fig.set_size_inches(18,5)
ttl = fig.suptitle('Distribution of Mean & STD of Payment Amounts in the past 6 months')
ttl.set_position([.5, 1.05])

left   =  0.125  # the left side of the subplots of the figure
right  =  0.9    # the right side of the subplots of the figure
bottom =  0.1    # the bottom of the subplots of the figure
top    =  0.9    # the top of the subplots of the figure
wspace =  .5     # the amount of width reserved for blank space between subplots
hspace = 2 # the amount of height reserved for white space between subplots

plt.subplots_adjust(
    left    =  left, 
    bottom  =  bottom, 
    right   =  right, 
    top     =  top, 
    wspace  =  wspace, 
    hspace  =  hspace
)

columns = pay_description.columns
sns.set(font_scale = 1.1)


for i in range(2):
    col = i%3
    j = i+1
    
    X = pay_description.columns[0:6]
    c = pay_description.index[j]
    Y = pay_description.loc[c]
    
    g = sns.barplot(x = X, y = Y, ax = axis[col], label='small', palette = 'Blues_d')
    g.set_xticklabels(pay_description.columns[0:6], rotation=30)

    g.set_title(c)
<matplotlib.figure.Figure at 0x7f2cc362d450>
In [16]:
df_pay_amt.min()
Out[16]:
PAY_AMT1    0
PAY_AMT2    0
PAY_AMT3    0
PAY_AMT4    0
PAY_AMT5    0
PAY_AMT6    0
dtype: int64
In [17]:
d = df['LIMIT_BAL'].value_counts()
In [18]:
fig = plt.figure()
fig.set_size_inches(30,5)
sns.set_style("whitegrid")

ttl = fig.suptitle('Distribution of Limit Balance')
ttl.set_position([.5, 1.05])

dd = d.index
np.sort(dd)
g = sns.barplot(x = dd, y = d, label='small', palette = 'Blues_d')
g.set_xticklabels(d.index, rotation=90)
Out[18]:
[Text(0,0,u'50000'),
 Text(0,0,u'20000'),
 Text(0,0,u'30000'),
 Text(0,0,u'80000'),
 Text(0,0,u'200000'),
 Text(0,0,u'150000'),
 Text(0,0,u'100000'),
 Text(0,0,u'180000'),
 Text(0,0,u'360000'),
 Text(0,0,u'60000'),
 Text(0,0,u'140000'),
 Text(0,0,u'230000'),
 Text(0,0,u'70000'),
 Text(0,0,u'210000'),
 Text(0,0,u'130000'),
 Text(0,0,u'120000'),
 Text(0,0,u'500000'),
 Text(0,0,u'160000'),
 Text(0,0,u'90000'),
 Text(0,0,u'240000'),
 Text(0,0,u'110000'),
 Text(0,0,u'300000'),
 Text(0,0,u'170000'),
 Text(0,0,u'260000'),
 Text(0,0,u'280000'),
 Text(0,0,u'10000'),
 Text(0,0,u'220000'),
 Text(0,0,u'250000'),
 Text(0,0,u'290000'),
 Text(0,0,u'320000'),
 Text(0,0,u'310000'),
 Text(0,0,u'400000'),
 Text(0,0,u'270000'),
 Text(0,0,u'350000'),
 Text(0,0,u'40000'),
 Text(0,0,u'190000'),
 Text(0,0,u'340000'),
 Text(0,0,u'390000'),
 Text(0,0,u'330000'),
 Text(0,0,u'420000'),
 Text(0,0,u'450000'),
 Text(0,0,u'380000'),
 Text(0,0,u'430000'),
 Text(0,0,u'440000'),
 Text(0,0,u'470000'),
 Text(0,0,u'460000'),
 Text(0,0,u'480000'),
 Text(0,0,u'410000'),
 Text(0,0,u'370000'),
 Text(0,0,u'490000'),
 Text(0,0,u'550000'),
 Text(0,0,u'520000'),
 Text(0,0,u'510000'),
 Text(0,0,u'600000'),
 Text(0,0,u'580000'),
 Text(0,0,u'610000'),
 Text(0,0,u'530000'),
 Text(0,0,u'560000'),
 Text(0,0,u'620000'),
 Text(0,0,u'700000'),
 Text(0,0,u'570000'),
 Text(0,0,u'630000'),
 Text(0,0,u'640000'),
 Text(0,0,u'540000'),
 Text(0,0,u'590000'),
 Text(0,0,u'710000'),
 Text(0,0,u'680000'),
 Text(0,0,u'750000'),
 Text(0,0,u'650000'),
 Text(0,0,u'670000'),
 Text(0,0,u'720000'),
 Text(0,0,u'660000'),
 Text(0,0,u'16000'),
 Text(0,0,u'780000'),
 Text(0,0,u'740000'),
 Text(0,0,u'730000'),
 Text(0,0,u'800000'),
 Text(0,0,u'760000'),
 Text(0,0,u'690000'),
 Text(0,0,u'1000000'),
 Text(0,0,u'327680')]
In [19]:
d.head()
Out[19]:
50000     3365
20000     1976
30000     1610
80000     1567
200000    1528
Name: LIMIT_BAL, dtype: int64
In [20]:
Y = df['default payment next month']
pd.DataFrame(Y)
Out[20]:
default payment next month
0 1
1 1
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 1
14 0
15 0
16 1
17 0
18 0
19 0
20 0
21 1
22 1
23 1
24 0
25 0
26 1
27 0
28 0
29 0
... ...
29970 0
29971 0
29972 0
29973 1
29974 1
29975 0
29976 1
29977 0
29978 0
29979 0
29980 0
29981 0
29982 1
29983 0
29984 0
29985 0
29986 0
29987 0
29988 0
29989 0
29990 0
29991 1
29992 0
29993 0
29994 1
29995 0
29996 0
29997 1
29998 1
29999 1
30000 rows × 1 columns
In [21]:
d = Y.value_counts()
In [22]:
fig1 = plt.figure()
fig1.set_size_inches(10,5)
sns.set_style("whitegrid")

ttl = fig1.suptitle('Defaulters out of 30,000 sample size (Defaulters = 1)')
ttl.set_position([.5, 1.05])

Defaulter_vs_NonDefaulters = d. index
Number_of_Customers = Y.value_counts()

g1 = sns.barplot(x = Defaulter_vs_NonDefaulters, 
                y = Number_of_Customers, 
                saturation = 1,
                palette = 'Blues_d'
                )
In [23]:
X = df.drop(['default payment next month'], axis = 1)

Y = df['default payment next month']
In [24]:
df['SEX'] = df['SEX'].astype('category').cat.rename_categories(['M', 'F'])
df['MARRIAGE'] = df['MARRIAGE'].astype('category').cat.rename_categories(['na', 'married', 'single', 'other'])
df['age_cat'] = pd.cut(df['AGE'], range(0, 100, 10), right=False)
In [25]:
fig, ax = plt.subplots(1,3)
fig.set_size_inches(20,5)
fig.suptitle('Defaulting by absolute numbers, for various demographics')

df_demo_1 = df.groupby(['default payment next month', 'SEX']).size()
df_demo_1 = df_demo_1.unstack(level = 1)
df_demo_1.plot(kind = 'bar', ax = ax[0])

df_demo_1 = df.groupby(['default payment next month', 'MARRIAGE']).size()
df_demo_1 = df_demo_1.unstack(level = 1)
df_demo_1.plot(kind = 'bar', ax = ax[1])

df_demo_1 = df.groupby(['default payment next month', 'age_cat']).size()
df_demo_1 = df_demo_1.unstack(level = 1)
df_demo_1.plot(kind = 'bar', ax = ax[2])
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2ccb5aff10>
In [26]:
fig, ax = plt.subplots(1,3)
fig.set_size_inches(20,5)
fig.suptitle('Defaulting by relative numbers given each class, for various demographics')

d = df.groupby(['default payment next month', 'SEX']).size().unstack(level=1)
d = d / d.sum()
p = d.plot(kind='bar', ax=ax[0])

d = df.groupby(['default payment next month', 'MARRIAGE']).size().unstack(level=1)
d = d / d.sum()
p = d.plot(kind='bar', ax=ax[1])

d = df.groupby(['default payment next month', 'age_cat']).size().unstack(level=1)
d = d / d.sum()
p = d.plot(kind='bar', ax=ax[2])
In [27]:
df.head()
Out[27]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month age_cat
0 1 20000 F 2 married 24 2 2 -1 -1 ... 0 0 0 689 0 0 0 0 1 [20, 30)
1 2 120000 F 2 single 26 -1 2 0 0 ... 3455 3261 0 1000 1000 1000 0 2000 1 [20, 30)
2 3 90000 F 2 single 34 0 0 0 0 ... 14948 15549 1518 1500 1000 1000 1000 5000 0 [30, 40)
3 4 50000 F 2 married 37 0 0 0 0 ... 28959 29547 2000 2019 1200 1100 1069 1000 0 [30, 40)
4 5 50000 M 2 married 57 -1 0 -1 0 ... 19146 19131 2000 36681 10000 9000 689 679 0 [50, 60)
5 rows × 26 columns
In [28]:
from math import log

df['pay_amt_avg_log'] = df_pay_amt.mean(axis = 1).apply(lambda x : log(x+1))

df['pay_amt_avg'] = df_pay_amt.mean(axis = 1)
df['pay_std'] = df_pay_amt.std(axis=1)

df['pay_rel_amt_1'] = df_pay_amt['PAY_AMT1']/df['pay_amt_avg']
df['pay_rel_amt_2'] = df_pay_amt['PAY_AMT2']/df['pay_amt_avg']
df['pay_rel_amt_3'] = df_pay_amt['PAY_AMT3']/df['pay_amt_avg']
df['pay_rel_amt_4'] = df_pay_amt['PAY_AMT4']/df['pay_amt_avg']
df['pay_rel_amt_5'] = df_pay_amt['PAY_AMT5']/df['pay_amt_avg']
df['pay_rel_amt_6'] = df_pay_amt['PAY_AMT6']/df['pay_amt_avg']
In [29]:
df['bill_amt_avg'] = df_bill_amt.mean(axis = 1)

df['bill_amt_avg_log'] = df_bill_amt.mean(axis = 1).apply(lambda x : log(x+1) if x>0 else 0)

df['billamt_rel_1'] = df_bill_amt['BILL_AMT1']/df['LIMIT_BAL']
df['billamt_rel_2'] = df_bill_amt['BILL_AMT2']/df['LIMIT_BAL']
df['billamt_rel_3'] = df_bill_amt['BILL_AMT3']/df['LIMIT_BAL']
df['billamt_rel_4'] = df_bill_amt['BILL_AMT4']/df['LIMIT_BAL']
df['billamt_rel_5'] = df_bill_amt['BILL_AMT5']/df['LIMIT_BAL']
df['billamt_rel_6'] = df_bill_amt['BILL_AMT6']/df['LIMIT_BAL']
In [30]:
df['LIMIT_BAL_LOG'] = df['LIMIT_BAL'].apply(lambda x: log(x+1))
df['LIMIT_BAL_CAT'] = pd.cut(df['LIMIT_BAL'], range(0, int(1e6), 10000), right=False)
In [31]:
df.columns
Out[31]:
Index([                        u'ID',                  u'LIMIT_BAL',
                              u'SEX',                  u'EDUCATION',
                         u'MARRIAGE',                        u'AGE',
                            u'PAY_0',                      u'PAY_2',
                            u'PAY_3',                      u'PAY_4',
                            u'PAY_5',                      u'PAY_6',
                        u'BILL_AMT1',                  u'BILL_AMT2',
                        u'BILL_AMT3',                  u'BILL_AMT4',
                        u'BILL_AMT5',                  u'BILL_AMT6',
                         u'PAY_AMT1',                   u'PAY_AMT2',
                         u'PAY_AMT3',                   u'PAY_AMT4',
                         u'PAY_AMT5',                   u'PAY_AMT6',
       u'default payment next month',                    u'age_cat',
                  u'pay_amt_avg_log',                u'pay_amt_avg',
                          u'pay_std',              u'pay_rel_amt_1',
                    u'pay_rel_amt_2',              u'pay_rel_amt_3',
                    u'pay_rel_amt_4',              u'pay_rel_amt_5',
                    u'pay_rel_amt_6',               u'bill_amt_avg',
                 u'bill_amt_avg_log',              u'billamt_rel_1',
                    u'billamt_rel_2',              u'billamt_rel_3',
                    u'billamt_rel_4',              u'billamt_rel_5',
                    u'billamt_rel_6',              u'LIMIT_BAL_LOG',
                    u'LIMIT_BAL_CAT'],
      dtype='object')
In [32]:
df_bill_amt = df.loc[:, 'billamt_rel_1': 'billamt_rel_6']
In [33]:
df_bill_amt.head()
Out[33]:
billamt_rel_1 billamt_rel_2 billamt_rel_3 billamt_rel_4 billamt_rel_5 billamt_rel_6
0 0.195650 0.155100 0.034450 0.000000 0.000000 0.000000
1 0.022350 0.014375 0.022350 0.027267 0.028792 0.027175
2 0.324878 0.155856 0.150656 0.159233 0.166089 0.172767
3 0.939800 0.964660 0.985820 0.566280 0.579180 0.590940
4 0.172340 0.113400 0.716700 0.418800 0.382920 0.382620
In [34]:
#Plotting Kernal Density Estimation on Relative Bill Amount (bill_amount/credit limit) Columns:

fig, ax = plt.subplots(2,3)
fig.set_size_inches(15,5)
fig.set_size_inches(17,7)
ttl = fig.suptitle('Distribution of bill relative to credit in the path 6 months')
ttl.set_position([.5, 1.05])

left   =  0.125  # the left side of the subplots of the figure
right  =  0.9    # the right side of the subplots of the figure
bottom =  0.1    # the bottom of the subplots of the figure
top    =  0.9    # the top of the subplots of the figure
wspace =  .5     # the amount of width reserved for blank space between subplots
hspace = 1.1 # the amount of height reserved for white space between subplots

plt.subplots_adjust(
    left    =  left, 
    bottom  =  bottom, 
    right   =  right, 
    top     =  top, 
    wspace  =  wspace, 
    hspace  =  hspace
)

columns = df_bill_amt.columns

for i in range(len(columns)):
    row, col = int(i/3), i%3
    #Plot the Histogram#
    n, bins, patches = ax[row,col].hist(df_bill_amt[columns[i]], 50, normed=1, facecolor='green', alpha=0.75)
    
    #Estimate Kernal Density#
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(df_bill_amt[columns[i]].values.reshape(-1, 1))
    x_grid = np.linspace(df_bill_amt[columns[i]].min(), df_bill_amt[columns[i]].max(), 1000)
    log_pdf = kde.score_samples(x_grid.reshape(-1, 1))
    
    # add the density line
    ax[row,col].plot(x_grid, np.exp(log_pdf), color='blue', alpha=0.5, lw=3)
    ax[row,col].set_title(columns[i])
    
In [35]:
X = df.drop(['ID', 'LIMIT_BAL'], axis = 1)
In [36]:
X.head()
Out[36]:
SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 ... bill_amt_avg bill_amt_avg_log billamt_rel_1 billamt_rel_2 billamt_rel_3 billamt_rel_4 billamt_rel_5 billamt_rel_6 LIMIT_BAL_LOG LIMIT_BAL_CAT
0 F 2 married 24 2 2 -1 -1 -2 -2 ... 1284.000000 7.158514 0.195650 0.155100 0.034450 0.000000 0.000000 0.000000 9.903538 [20000, 30000)
1 F 2 single 26 -1 2 0 0 0 2 ... 2846.166667 7.954080 0.022350 0.014375 0.022350 0.027267 0.028792 0.027175 11.695255 [120000, 130000)
2 F 2 single 34 0 0 0 0 0 0 ... 16942.166667 9.737620 0.324878 0.155856 0.150656 0.159233 0.166089 0.172767 11.407576 [90000, 100000)
3 F 2 married 37 0 0 0 0 0 0 ... 38555.666667 10.559884 0.939800 0.964660 0.985820 0.566280 0.579180 0.590940 10.819798 [50000, 60000)
4 M 2 married 57 -1 0 -1 0 0 0 ... 18223.166667 9.810504 0.172340 0.113400 0.716700 0.418800 0.382920 0.382620 10.819798 [50000, 60000)
5 rows × 43 columns
In [37]:
X1 = pd.concat([ X, pd.get_dummies(df['SEX'], drop_first = True) ], axis = 1)
X1 = pd.concat([ X, pd.get_dummies(df['MARRIAGE'], drop_first = True) ], axis = 1)
In [38]:
X1 = X1.drop(['SEX', 'MARRIAGE'], axis = 1)
In [39]:
X1.head()
Out[39]:
EDUCATION AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 ... billamt_rel_2 billamt_rel_3 billamt_rel_4 billamt_rel_5 billamt_rel_6 LIMIT_BAL_LOG LIMIT_BAL_CAT married single other
0 2 24 2 2 -1 -1 -2 -2 3913 3102 ... 0.155100 0.034450 0.000000 0.000000 0.000000 9.903538 [20000, 30000) 1 0 0
1 2 26 -1 2 0 0 0 2 2682 1725 ... 0.014375 0.022350 0.027267 0.028792 0.027175 11.695255 [120000, 130000) 0 1 0
2 2 34 0 0 0 0 0 0 29239 14027 ... 0.155856 0.150656 0.159233 0.166089 0.172767 11.407576 [90000, 100000) 0 1 0
3 2 37 0 0 0 0 0 0 46990 48233 ... 0.964660 0.985820 0.566280 0.579180 0.590940 10.819798 [50000, 60000) 1 0 0
4 2 57 -1 0 -1 0 0 0 8617 5670 ... 0.113400 0.716700 0.418800 0.382920 0.382620 10.819798 [50000, 60000) 1 0 0
5 rows × 44 columns
In [40]:
D = pd.concat([X1, Y], axis = 1)
In [41]:
D = D.rename(index=str, columns={"default payment next month": "target"})
In [42]:
D.columns
Out[42]:
Index([       u'EDUCATION',              u'AGE',            u'PAY_0',
                  u'PAY_2',            u'PAY_3',            u'PAY_4',
                  u'PAY_5',            u'PAY_6',        u'BILL_AMT1',
              u'BILL_AMT2',        u'BILL_AMT3',        u'BILL_AMT4',
              u'BILL_AMT5',        u'BILL_AMT6',         u'PAY_AMT1',
               u'PAY_AMT2',         u'PAY_AMT3',         u'PAY_AMT4',
               u'PAY_AMT5',         u'PAY_AMT6',           u'target',
                u'age_cat',  u'pay_amt_avg_log',      u'pay_amt_avg',
                u'pay_std',    u'pay_rel_amt_1',    u'pay_rel_amt_2',
          u'pay_rel_amt_3',    u'pay_rel_amt_4',    u'pay_rel_amt_5',
          u'pay_rel_amt_6',     u'bill_amt_avg', u'bill_amt_avg_log',
          u'billamt_rel_1',    u'billamt_rel_2',    u'billamt_rel_3',
          u'billamt_rel_4',    u'billamt_rel_5',    u'billamt_rel_6',
          u'LIMIT_BAL_LOG',    u'LIMIT_BAL_CAT',          u'married',
                 u'single',            u'other',           u'target'],
      dtype='object')
In [43]:
formula = 'target ~ '

# original features & engineered features
formula += '+ C(married) + C(single) + C(other) + C(married) +  C(EDUCATION) + AGE '
formula += '+ PAY_0 + PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6 '
formula += '+ C(age_cat) + C(LIMIT_BAL_CAT) + C(LIMIT_BAL_LOG) + pay_amt_avg + pay_std '
formula += '+ pay_amt_avg_log + pay_rel_amt_1 + pay_rel_amt_2 + pay_rel_amt_3 + pay_rel_amt_4 + pay_rel_amt_5 + pay_rel_amt_6 '
formula += '+ bill_amt_avg + bill_amt_avg_log + billamt_rel_1 + billamt_rel_2 + billamt_rel_3 + billamt_rel_4 + billamt_rel_5 + billamt_rel_6'
In [44]:
formula
Out[44]:
'target ~ + C(married) + C(single) + C(other) + C(married) +  C(EDUCATION) + AGE + PAY_0 + PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6 + C(age_cat) + C(LIMIT_BAL_CAT) + C(LIMIT_BAL_LOG) + pay_amt_avg + pay_std + pay_amt_avg_log + pay_rel_amt_1 + pay_rel_amt_2 + pay_rel_amt_3 + pay_rel_amt_4 + pay_rel_amt_5 + pay_rel_amt_6 + bill_amt_avg + bill_amt_avg_log + billamt_rel_1 + billamt_rel_2 + billamt_rel_3 + billamt_rel_4 + billamt_rel_5 + billamt_rel_6'
In [45]:
from patsy import dmatrices

Y, X = dmatrices(formula, data= D, return_type='dataframe')
Y = Y.iloc[:, 1]
In [46]:
import warnings
from sklearn.feature_selection import SelectKBest, f_classif

warnings.simplefilter(action='ignore', category=(UserWarning,RuntimeWarning))

selector = SelectKBest(f_classif, 25)
selector.fit(X, Y)
Out[46]:
SelectKBest(k=25, score_func=<function f_classif at 0x7f2cc2c119b0>)
In [47]:
top_indices = np.nan_to_num(selector.scores_).argsort()[-25:][::-1]
selector.scores_[top_indices]
X.columns[top_indices]
Out[47]:
Index([u'PAY_0', u'PAY_2', u'PAY_3', u'PAY_4', u'PAY_5', u'PAY_6',
       u'pay_amt_avg_log', u'billamt_rel_6', u'billamt_rel_5',
       u'billamt_rel_4', u'billamt_rel_3', u'billamt_rel_2', u'billamt_rel_1',
       u'pay_amt_avg',
       u'C(LIMIT_BAL_CAT)[T.Interval(20000, 30000, closed='left')]',
       u'C(LIMIT_BAL_LOG)[T.9.9035375512861705]',
       u'C(LIMIT_BAL_LOG)[T.10.308985993422082]',
       u'C(LIMIT_BAL_CAT)[T.Interval(30000, 40000, closed='left')]',
       u'pay_std', u'C(EDUCATION)[T.1]',
       u'C(LIMIT_BAL_CAT)[T.Interval(10000, 20000, closed='left')]',
       u'C(LIMIT_BAL_LOG)[T.13.122365377402328]',
       u'C(LIMIT_BAL_CAT)[T.Interval(500000, 510000, closed='left')]',
       u'C(EDUCATION)[T.2]', u'C(LIMIT_BAL_LOG)[T.10.819798284210286]'],
      dtype='object')
In [48]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
Out[48]:
MinMaxScaler(copy=True, feature_range=(0, 1))
In [49]:
from sklearn.pipeline import Pipeline

preprocess = Pipeline([('anova', selector), ('scale', scaler)])
preprocess.fit(X,Y)

X_prep = preprocess.transform(X)
In [52]:
X_prep = pd.DataFrame(X_prep)
In [82]:
Y = pd.DataFrame(Y)
In [91]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

principalComponents = pca.fit_transform(X_prep)

Df = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
In [101]:
Df_p = pd.concat([Df, Y], axis = 1)
In [125]:
Df_p.columns
Out[125]:
Index([u'principal component 1', u'principal component 2', u'target[1]'], dtype='object')
In [124]:
import seaborn as sns

g = sns.lmplot(x = 'principal component 1', 
               y = 'principal component 2', 
               data = Df_p,
               hue = 'target[1]'
              )
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-124-9fb311d1a933> in <module>()
      6                y = 'principal component 2',
      7                data = Df_p,
----> 8                hue = 'target[1]'
      9               )

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in lmplot(x, y, data, hue, col, row, palette, col_wrap, size, aspect, markers, sharex, sharey, hue_order, col_order, row_order, legend, legend_out, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, x_jitter, y_jitter, scatter_kws, line_kws)
    588         scatter_kws=scatter_kws, line_kws=line_kws,
    589         )
--> 590     facets.map_dataframe(regplot, x, y, **regplot_kws)
    591 
    592     # Add a legend

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/axisgrid.pyc in map_dataframe(self, func, *args, **kwargs)
    795 
    796             # Draw the plot
--> 797             self._facet_plot(func, ax, args, kwargs)
    798 
    799         # Finalize the annotations and layout

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/axisgrid.pyc in _facet_plot(self, func, ax, plot_args, plot_kwargs)
    813 
    814         # Draw the plot
--> 815         func(*plot_args, **plot_kwargs)
    816 
    817         # Sort out the supporting information

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    788     scatter_kws["marker"] = marker
    789     line_kws = {} if line_kws is None else copy.copy(line_kws)
--> 790     plotter.plot(ax, scatter_kws, line_kws)
    791     return ax
    792 

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in plot(self, ax, scatter_kws, line_kws)
    340             self.scatterplot(ax, scatter_kws)
    341         if self.fit_reg:
--> 342             self.lineplot(ax, line_kws)
    343 
    344         # Label the axes

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in lineplot(self, ax, kws)
    385 
    386         # Fit the regression model
--> 387         grid, yhat, err_bands = self.fit_regression(ax)
    388 
    389         # Get set default aesthetics

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in fit_regression(self, ax, x_range, grid)
    208             yhat, yhat_boots = self.fit_logx(grid)
    209         else:
--> 210             yhat, yhat_boots = self.fit_fast(grid)
    211 
    212         # Compute the confidence interval at each grid point

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/regression.pyc in fit_fast(self, grid)
    228 
    229         beta_boots = algo.bootstrap(X, y, func=reg_func,
--> 230                                     n_boot=self.n_boot, units=self.units).T
    231         yhat_boots = grid.dot(beta_boots).T
    232         return yhat, yhat_boots

/home/gogol/anaconda2/lib/python2.7/site-packages/seaborn/algorithms.pyc in bootstrap(*args, **kwargs)
     72     boot_dist = []
     73     for i in range(int(n_boot)):
---> 74         resampler = rs.randint(0, n, n)
     75         sample = [a.take(resampler, axis=0) for a in args]
     76         boot_dist.append(func(*sample, **func_kwargs))

mtrand.pyx in mtrand.RandomState.randint()

ValueError: low >= high
In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_prep, Y, test_size=0.2, random_state=42)
In [ ]:
import itertools
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix
In [63]:
#Plotting Confusion Matrix

def plot_cm(ax, y_true, y_pred, classes, title, th=0.5, cmap=plt.cm.Blues):
    y_pred_labels = (y_pred>th).astype(int)
    
    cm = confusion_matrix(y_true, y_pred_labels)
    
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
In [64]:
#Plotting ROC Curve and AUC

def plot_auc(ax, y_train, y_train_pred, y_test, y_test_pred, th=0.5):

    y_train_pred_labels = (y_train_pred>th).astype(int)
    y_test_pred_labels  = (y_test_pred>th).astype(int)

    fpr_train, tpr_train, _ = roc_curve(y_train,y_train_pred)
    roc_auc_train = auc(fpr_train, tpr_train)
    acc_train = accuracy_score(y_train, y_train_pred_labels)

    fpr_test, tpr_test, _ = roc_curve(y_test,y_test_pred)
    roc_auc_test = auc(fpr_test, tpr_test)
    acc_test = accuracy_score(y_test, y_test_pred_labels)

    ax.plot(fpr_train, tpr_train)
    ax.plot(fpr_test, tpr_test)

    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')
    
    train_text = 'train acc = {:.3f}, auc = {:.2f}'.format(acc_train, roc_auc_train)
    test_text = 'test acc = {:.3f}, auc = {:.2f}'.format(acc_test, roc_auc_test)
    ax.legend([train_text, test_text])
In [51]:
from sklearn import linear_model

# Create logistic regression object
regr = linear_model.LogisticRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)
Out[51]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [52]:
predictions = regr.predict(X_test)
In [65]:
y_train_pred = regr.predict_proba(X_train)[:,1]
y_test_pred = regr.predict_proba(X_test)[:,1]

threshold = 0.5

fig,ax = plt.subplots(1,3)
fig.set_size_inches(15,5)

plot_cm(ax[0],  y_train, y_train_pred, [0,1], 'Confusion matrix (TRAIN)', threshold)
plot_cm(ax[1],  y_test, y_test_pred,   [0,1], 'Confusion matrix (TEST)', threshold)
plot_auc(ax[2], y_train, y_train_pred, y_test, y_test_pred, threshold)
    
plt.tight_layout()
plt.show()
In [66]:
#Using RandomForest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5)
rf.fit(X_train,y_train)
Out[66]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [67]:
predictions = rf.predict(X_test)
In [68]:
y_train_pred = rf.predict_proba(X_train)[:,1]
y_test_pred = rf.predict_proba(X_test)[:,1]

threshold = 0.5

fig,ax = plt.subplots(1,3)
fig.set_size_inches(15,5)

plot_cm(ax[0],  y_train, y_train_pred, [0,1], 'Confusion matrix (TRAIN)', threshold)
plot_cm(ax[1],  y_test, y_test_pred,   [0,1], 'Confusion matrix (TEST)', threshold)
plot_auc(ax[2], y_train, y_train_pred, y_test, y_test_pred, threshold)
    
plt.tight_layout()
plt.show()
In [79]:
rf.feature_importances_
Out[79]:
array([ 0.00731615,  0.00974254,  0.00122019,  0.00240541,  0.00207649,
        0.00027795,  0.00246733,  0.00219625,  0.00399343,  0.00025585,
        0.17306891,  0.08011276,  0.04581053,  0.03485668,  0.02850615,
        0.02540373,  0.06749281,  0.06594812,  0.06762942,  0.06831926,
        0.06589945,  0.0622378 ,  0.06101222,  0.05998936,  0.06176122])
In [85]:
pd.DataFrame(X_train)
Out[85]:
0 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.004409 0.000628 0.580996 0.284853 0.270284 0.147272 0.318566 0.270724 0.408937
1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.006641 0.000409 0.612622 0.232766 0.308661 0.174488 0.365644 0.324939 0.463906
2 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.010118 0.018488 0.645138 0.216661 0.277087 0.142489 0.225716 0.222095 0.356743
3 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.009563 0.002383 0.640787 0.220395 0.302259 0.171015 0.307590 0.261560 0.401205
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.009724 0.000376 0.642075 0.135019 0.223633 0.117356 0.265898 0.214340 0.349995
5 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.091626 0.203063 0.815354 0.114522 0.199267 0.099591 0.218084 0.156844 0.368135
6 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.024567 0.053166 0.713664 0.090341 0.181196 0.088969 0.210769 0.150844 0.355932
7 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.017581 0.001611 0.687820 0.225891 0.307666 0.171510 0.357408 0.315907 0.461681
8 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.018059 0.004777 0.689893 0.091829 0.184899 0.091070 0.214920 0.156156 0.283980
9 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.013655 0.015064 0.668296 0.087488 0.179791 0.087594 0.215962 0.170810 0.295453
10 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.000627 0.000202 0.430521 0.088021 0.179836 0.087758 0.211210 0.151573 0.280330
11 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.003041 0.000649 0.552308 0.224770 0.263916 0.143897 0.313170 0.229700 0.365744
12 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.008737 0.004013 0.633808 0.160226 0.243201 0.130788 0.271243 0.177673 0.309571
13 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.010472 0.004780 0.647798 0.163552 0.250678 0.135392 0.299900 0.252639 0.398869
14 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.004471 0.000343 0.582068 0.131388 0.220178 0.115130 0.261394 0.208340 0.343179
15 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.005020 0.000977 0.591018 0.187289 0.261216 0.138368 0.278529 0.221589 0.332519
16 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.003203 0.006071 0.556316 0.204664 0.271849 0.094084 0.222572 0.152146 0.279798
17 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.4 0.013303 0.016184 0.666280 0.219056 0.301897 0.175881 0.366252 0.325762 0.433422
18 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.006908 0.009418 0.615666 0.091148 0.182614 0.087468 0.216058 0.150844 0.279798
19 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 ... 0.4 0.001992 0.001252 0.519650 0.207084 0.292836 0.164193 0.358781 0.319500 0.464163
20 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.2 0.002789 0.001354 0.545627 0.216765 0.300665 0.170985 0.325052 0.211014 0.347815
21 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.4 0.003639 0.002157 0.566168 0.197155 0.281248 0.153279 0.335226 0.293531 0.436676
22 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.020778 0.019782 0.700726 0.087763 0.181281 0.094911 0.214451 0.153187 0.284684
23 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.018021 0.010638 0.689728 0.091140 0.187608 0.090588 0.211063 0.156970 0.283974
24 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.2 0.001285 0.000768 0.485814 0.135069 0.229209 0.124841 0.282794 0.229136 0.365820
25 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.009832 0.006837 0.642927 0.089870 0.184756 0.088115 0.216809 0.159607 0.291115
26 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.005747 0.009460 0.601456 0.129880 0.216677 0.113239 0.262159 0.206723 0.388415
27 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.4 0.006588 0.000947 0.612001 0.196121 0.280767 0.155696 0.333607 0.291818 0.435365
28 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.035108 0.059510 0.741245 0.090333 0.184840 0.093627 0.271777 0.210153 0.306517
29 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000525 0.001243 0.416930 0.087708 0.179552 0.087513 0.212793 0.150844 0.279798
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
22823 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.002820 0.002894 0.546475 0.116121 0.199301 0.096321 0.227034 0.167515 0.288839
22824 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 ... 0.2 0.040565 0.067882 0.752407 0.117194 0.206261 0.099589 0.224703 0.167846 0.299583
22825 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.007300 0.001121 0.619927 0.159873 0.246316 0.133097 0.294277 0.245094 0.383556
22826 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.004914 0.000756 0.589366 0.174640 0.254742 0.136754 0.301930 0.255341 0.395613
22827 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.017121 0.012615 0.685772 0.088086 0.176323 0.089884 0.214177 0.155623 0.284946
22828 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.047166 0.094511 0.764053 0.099132 0.307042 0.170647 0.304845 0.257362 0.396784
22829 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 ... 0.4 0.001762 0.002202 0.510213 0.205025 0.291885 0.160084 0.356894 0.309222 0.452195
22830 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.004449 0.004080 0.581691 0.135951 0.226768 0.118907 0.244684 0.187490 0.322607
22831 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.7 0.003984 0.000000 0.573169 0.142666 0.243670 0.138740 0.317940 0.287900 0.442650
22832 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.012815 0.013227 0.663393 0.090560 0.182976 0.088787 0.218402 0.152664 0.299003
22833 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.025725 0.035682 0.717224 0.107657 0.198269 0.100884 0.210841 0.151340 0.296137
22834 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.4 0.000929 0.000839 0.460839 0.107911 0.197270 0.100915 0.235359 0.181414 0.311652
22835 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.002313 0.001218 0.531196 0.137724 0.229849 0.122156 0.274325 0.223658 0.360305
22836 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 ... 0.2 0.001170 0.001385 0.478632 0.235573 0.310587 0.163948 0.318952 0.272891 0.421519
22837 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.009457 0.005203 0.639924 0.222044 0.300630 0.169557 0.358533 0.315199 0.457801
22838 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.2 0.001693 0.000296 0.507129 0.204749 0.292821 0.166703 0.330085 0.291134 0.437664
22839 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.2 0.001911 0.000652 0.516455 0.133966 0.224163 0.117432 0.267405 0.212318 0.347392
22840 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000909 0.000960 0.459146 0.087987 0.179805 0.087738 0.211173 0.151297 0.280286
22841 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.008242 0.005834 0.629299 0.140382 0.226408 0.116646 0.259274 0.199185 0.300092
22842 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.003385 0.001608 0.560589 0.220399 0.249662 0.121508 0.273068 0.216199 0.356701
22843 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.4 0.003110 0.003092 0.554049 0.213870 0.302995 0.167736 0.365596 0.324285 0.466045
22844 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.2 0.011447 0.006903 0.654676 0.112580 0.200369 0.102177 0.235554 0.179832 0.316075
22845 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.008705 0.005515 0.633528 0.087772 0.182195 0.090225 0.214190 0.153477 0.282843
22846 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.005529 0.003208 0.598476 0.088847 0.182171 0.089687 0.211305 0.155220 0.281936
22847 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.4 0.003007 0.002670 0.551447 0.206529 0.295533 0.162820 0.348284 0.314753 0.452985
22848 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.005104 0.000906 0.592299 0.093751 0.184818 0.090649 0.217721 0.159163 0.284879
22849 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.1 0.002390 0.002120 0.533725 0.087553 0.181612 0.086422 0.215550 0.150139 0.282070
22850 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.068365 0.014719 0.792730 0.118960 0.189275 0.103666 0.227426 0.176870 0.303560
22851 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 ... 0.4 0.001158 0.001014 0.477787 0.137187 0.222968 0.111949 0.253531 0.199752 0.323232
22852 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.1 0.003474 0.003407 0.562592 0.089014 0.180625 0.089924 0.213532 0.152394 0.281468
22853 rows × 25 columns
In [86]:
X_train.shape
Out[86]:
(22853, 25)

No comments:

Post a Comment