from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
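# Load the loan dataset; low_memory=False reads the file in one pass so pandas
# infers consistent dtypes for columns with mixed values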
df=pd.read_csv('~/dm/pbl1/loan_predictions_output.csv',low_memory=False)
# Earlier exploratory cleanup of the loan-amount sentinel and mis-scaled credit
# scores (done on df2/df3 in a previous pass), kept here for reference:
# h = df2[(df2['Current Loan Amount'] < 9999999) & (df2['Credit Score'] > 700) & (df2['Credit Score'] < 800)]
# plt.hist(h['Current Loan Amount'])
# print(h['Current Loan Amount'].mean())
# print(h['Current Loan Amount'].median())  # the statistic used for the imputation below
# print(h['Current Loan Amount'].mode())
# df2['Current Loan Amount'].replace(99999999, h['Current Loan Amount'].median(), inplace=True)
# df2['Credit Score'] = df2['Credit Score'].apply(lambda x: x / 10 if x > 800 else x)
# df3 = df2[df2['Credit Score'] < 800]
# high_score = df3[df3['Credit Score'] > 700]
# low_score = df3[df3['Credit Score'] < 600]
# df3['Maximum Open Credit'] = df3['Maximum Open Credit'].astype(float)
# plt.scatter(df3['Maximum Open Credit'], df3['Credit Score'])
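# Drop the stray index column left over from an earlier to_csv export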
del df['Unnamed: 0']
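# Quick look at the distribution of loan purposes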
df['Purpose'].value_counts()
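# Build the feature matrix: drop the target and the non-predictive Loan ID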
X=df.copy()
del X['Loan Status']
del X['Loan ID']
y=df['Loan Status']
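# One-hot encode the categorical features; drop_first avoids redundant dummy columns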
X=pd.get_dummies(X,drop_first=True,sparse=True)
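# Map the target to a binary label (1 = Fully Paid, 0 = Charged Off)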
dd={'Fully Paid': 1, 'Charged Off': 0}
y=y.map(dd)
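# 80/20 train/test split; passing stratify=y would also preserve the class ratio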
X_train,X_test,y_train,y_test = train_test_split(X,y, train_size = 0.8, random_state = 10)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
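# Instantiate the candidate classifiers with default hyperparameters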
#log = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
gradient = GradientBoostingClassifier()
ada = AdaBoostClassifier()
xgb = XGBClassifier()
#l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
f = forest.fit(X_train, y_train)
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)
x = xgb.fit(X_train, y_train)
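# Optional sketch: the fitted tree ensembles expose feature_importances_, so the
# strongest predictors (over the dummy-encoded columns) can be inspected directly.
top_features = pd.Series(f.feature_importances_, index=X.columns).nlargest(10)
print(top_features)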
#print("The score for Logistic Regression is, ", l.score(X_test, y_test))
print("The score for Decision Trees is %.2f%%" % (t.score(X_test,y_test) * 100.0))
print("The score for Random Forest is %.2f%%" % (f.score(X_test,y_test) * 100.0))
print("The score for Gradient Boosting is %.2f%% " % (g.score(X_test,y_test) * 100.0))
print("The score for AdaBoost is %.2f%%" % (a.score(X_test,y_test) * 100.0))
print("The score for XGB is %.2f%%" % (x.score(X_test,y_test) * 100.0))
from sklearn.metrics import recall_score,precision_score,f1_score
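# Detailed metrics for the gradient boosting model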
pred = g.predict(X_test)
print("Accuracy:  %.4f" % accuracy_score(y_test, pred))
print("Precision: %.4f" % precision_score(y_test, pred))
print("Recall:    %.4f" % recall_score(y_test, pred))
print("F1 score:  %.4f" % f1_score(y_test, pred))
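# Accuracy can flatter a model when one class dominates (loan data is typically
# mostly Fully Paid), so as a final sketch the confusion matrix shows how many
# Charged Off loans are actually caught.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))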