In [1]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20
# (see the DeprecationWarning this cell used to emit); model_selection
# exposes the same train_test_split function.
from sklearn.model_selection import train_test_split
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [3]:
# Load the pre-processed loan data (output of an earlier cleaning step);
# low_memory=False reads the whole file at once so mixed-type columns
# get a single inferred dtype.
# NOTE(review): hardcoded home-relative path — consider a DATA_DIR constant.
df=pd.read_csv('~/dm/pbl1/loan_predictions_output.csv',low_memory=False)
In [4]:
# NOTE(review): X and y are both redefined in cell In[8] below, so this
# cell is redundant and can be deleted.
X=df.copy()
y=df['Loan Status']
In [5]:
# --- Dead exploratory scratch work from the data-cleaning phase ---
# NOTE(review): entirely commented-out; it references a `df2` that does
# not exist in this notebook. Keep for provenance or (better) delete /
# move to the cleaning notebook that produced loan_predictions_output.csv.
#h=df2[ (df2['Current Loan Amount'] < 9999999 )  & (df2['Credit Score']>700 )& (df2['Credit Score'] <800)]
##plt.hist(h['Current Loan Amount'])
#print h['Current Loan Amount'].mean()
#print h['Current Loan Amount'].median() ############# This one
#print h['Current Loan Amount'].mode()
#df2['Current Loan Amount'].replace(99999999,h['Current Loan Amount'].median(),inplace=True)
#df2['Credit Score']=df2['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
#df3=df2[df2['Credit Score'] < 800]
#high_score=df3[df3['Credit Score']>700]
#low_score=df3[df3['Credit Score']<600]
##df3['Maximum Open Credit']=df3['Maximum Open Credit'].astype(float)
##plt.scatter( df3['Maximum Open Credit'],df3['Credit Score'])
In [6]:
# Drop the stray CSV index column that read_csv picked up; in-place drop
# mutates the same DataFrame object exactly like `del df[...]` did.
df.drop('Unnamed: 0', axis=1, inplace=True)

## Dummy-encode the categorical features


In [7]:
# Category frequencies for the Purpose column.
# NOTE(review): the output shows both 'other' and 'Other' — inconsistent
# casing will produce two separate dummy columns for the same category
# downstream; normalize casing before get_dummies.
df['Purpose'].value_counts()
Out[7]:
Debt Consolidation      170830
Home Improvements        12806
other                    11664
Other                     8251
Business Loan             3585
Buy a Car                 2913
Medical Bills             2369
Take a Trip               1307
Buy House                 1305
Educational Expenses       216
Name: Purpose, dtype: int64
In [8]:
# Keep a full copy for reference, then separate features from the target.
# drop() returns a fresh frame, matching the original copy-then-delete.
Xy = df.copy()
X = df.drop(['Loan Status', 'Loan ID'], axis=1)
y = df['Loan Status']
In [9]:
# One-hot encode all categorical columns; drop_first avoids the dummy
# trap (perfect collinearity), sparse=True keeps memory down for the
# resulting wide matrix.
X=pd.get_dummies(X,drop_first=True,sparse=True)
In [10]:
dd={'Fully Paid': 1, 'Charged Off': 0}
In [11]:
# Encode the target labels via the mapping above; any label not in the
# mapping would become NaN, so this relies on the column containing only
# 'Fully Paid' / 'Charged Off'.
y=y.map(dd)
In [12]:
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): the target looks imbalanced (precision 0.83 vs accuracy
# 0.82 below) — consider stratify=y to keep class ratios equal in both
# splits; confirm class balance first.
X_train,X_test,y_train,y_test = train_test_split(X,y, train_size = 0.8, random_state = 10)
In [13]:
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import mean_squared_error,accuracy_score
from xgboost import XGBClassifier

# Fix the seed on every stochastic estimator so the reported scores are
# reproducible across kernel restarts (the split above is already seeded
# with random_state=10; without this, tree/forest/boosting scores drift
# between runs).
#log = LogisticRegression()
tree = DecisionTreeClassifier(random_state=10)
forest = RandomForestClassifier(random_state=10)
gradient = GradientBoostingClassifier(random_state=10)
ada = AdaBoostClassifier(random_state=10)
# NOTE(review): older xgboost sklearn wrappers use `seed`; newer ones use
# `random_state` — adjust to whichever this environment's version accepts.
xgb=XGBClassifier(seed=10)
In [14]:
# Fit each classifier on the training split (logistic regression is
# commented out along with its constructor above).
# NOTE(review): single-letter names t/f/g/a are easy to clobber; consider
# just reusing tree/forest/gradient/ada — .fit returns self anyway.
#l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
f = forest.fit(X_train, y_train)
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)
In [15]:
# Fit XGBoost in its own cell.
# NOTE(review): `x` is a risky name — it shadows nothing here, but is
# easily overwritten by later scratch code; `xgb` itself would do since
# .fit returns the estimator.
x=xgb.fit(X_train,y_train)
In [16]:
# Report hold-out accuracy for every fitted model. The five copy-pasted
# print lines are collapsed into one loop over (label, model) pairs; this
# also fixes the stray trailing space the Gradient Boosting line had.
#print("The score for Logistic Regression is, ", l.score(X_test, y_test))
for label, model in [("Decision Trees", t), ("Random Forest", f),
                     ("Gradient Boosting", g), ("AdaBoost", a), ("XGB", x)]:
    print("The score for %s is  %.2f%%" % (label, model.score(X_test, y_test) * 100.0))
The score for Decision Trees is  75.00%
The score for Random Forest is  80.31%
The score for Gradient Boosting is  82.10% 
The score for AdaBoost is  81.93%
The score for XGB is  82.02%
In [17]:
# NOTE(review): scattered import — this belongs in the top imports cell
# so a fresh-kernel reader sees all dependencies up front.
from sklearn.metrics import recall_score,precision_score,f1_score
In [18]:
# Predict on the hold-out set with Gradient Boosting — the best scorer
# above (82.10%).
pred=g.predict(X_test)
In [20]:
# Evaluate the chosen model on the hold-out set. Parenthesized print is
# valid under both Python 2 and 3 and matches the style of the score cell
# above (the original used py2-only print statements).
# NOTE(review): recall_score is imported in In[17] but never reported —
# either print it here or drop it from the import.
print(accuracy_score(y_test,pred))
print(precision_score(y_test,pred))
print(f1_score(y_test,pred))
0.820975609756
0.834861710475
0.898331244641