import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
df=pd.read_csv('~/dm/pbl1/LoansTrainingSet.csv',low_memory=False)
df.info()
df[df['Credit Score'].isnull()]['Loan ID'].value_counts()
df[df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
df['Monthly Debt']=df['Monthly Debt'].str.replace('$','')
df['Monthly Debt']=df['Monthly Debt'].replace('[ ]','',regex=True)
df['Monthly Debt']=df['Monthly Debt'].replace(',','',regex=True)
df['Monthly Debt']=df['Monthly Debt'].astype(float)
col=df.columns
print col
for i in col:
print i,' ',df[i].isnull().sum()
# print i,' ', df[i].nunique()
cat_df=df.select_dtypes(include=['object'])
df['Maximum Open Credit'].replace('#VALUE!',np.nan,regex=True,inplace=True)
df['Maximum Open Credit'].isnull().sum()
df['Maximum Open Credit'].iloc[187512]
#length=len(df['Maximum Open Credit'])
'''
for i in range(0,length):
#v=df['Maximum Open Credit'].iloc[i]
try:
v=float(df['Maximum Open Credit'].iloc[i])
except ValueError,e:
print 'line i is bad ',i
'''
df['Maximum Open Credit']=df['Maximum Open Credit'].astype(float)
df.dropna(subset=['Maximum Open Credit'],inplace=True)
#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)
#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)
#df['Maximum Open Credit']=df['Maximum Open Credit'].replace('\D','')
df['Years in current job'].value_counts()
11476.0/250000 * 100.0
tmp2_df=df[df['Years in current job']=='n/a']
j=pd.DataFrame()
df['Years in current job'].replace('< 1 year','1',inplace=True)
df['Years in current job'].replace(' 1 year','1',inplace=True)
df['Years in current job'].replace('1 year','1',inplace=True)
df['Years in current job'].replace('2 years','1',inplace=True)
df['Years in current job'].replace('3 years','1',inplace=True)
df['Years in current job'].replace('4 years','1',inplace=True)
df['Years in current job'].replace('5 years','1',inplace=True)
df['Years in current job'].replace('6 years','2',inplace=True)
df['Years in current job'].replace('7 years','2',inplace=True)
df['Years in current job'].replace('8 years','2',inplace=True)
df['Years in current job'].replace('9 years','2',inplace=True)
df['Years in current job'].replace('10+ years','2',inplace=True)
df['Years in current job'].replace('n/a','1',inplace=True)
tmp3_df=df[df['Years in current job']!='n/a']
df['Years in current job'].value_counts()
#df['Years in current job']=df['Years in current job'].astype('category')
df.head(2)
df['Purpose'].value_counts()
df['Years in current job']=df['Years in current job'].astype(int)
#df['Years in current job'].plot(kind='barh')
df['Years in current job']=df['Years in current job'].astype('category')
df['Years in current job'].value_counts()
#dont del df['Years in current job']
num_df=df.select_dtypes(include=['float64','int64'])
num_df.info()
num_df['Loan ID']=df['Loan ID']
num_df.isnull().sum()
num_df=num_df.groupby(by='Loan ID',as_index=False).max()
num_df2=num_df.groupby(by='Loan ID',as_index=False).min()
num_df.info()
num_df[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
num_df2[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
cat_df=df.select_dtypes(include=['object'])
del cat_df['Customer ID']
cat_df.info()
cat_df.drop_duplicates(['Loan ID'],inplace=True)
cat_df.info()
cat_df.shape,num_df.shape
df=pd.merge(cat_df,num_df,on='Loan ID')
df.shape
df.nunique()
df['Loan Status'].isnull().sum()
df['Loan Status'].value_counts()
df['Current Loan Amount'].isnull().sum()
df['Current Loan Amount'].value_counts().sum()
df['Current Loan Amount'].hist(bins=50)
tmp_df=df[df['Current Loan Amount']<99999]
tmp_df['Current Loan Amount'].hist()
print tmp_df.shape,df.shape, df.shape[0]/tmp_df.shape[0]
tmp_df['Current Loan Amount'].mean(),tmp_df['Current Loan Amount'].median(),tmp_df['Current Loan Amount'].mode()
df['Term'].isnull().sum(), df['Term'].nunique(),df['Term'].value_counts()
df['Credit Score'].isnull().sum(),df['Credit Score'].nunique(),df['Credit Score'].value_counts().sum()
df2=df[df['Credit Score']<800]
df2['Credit Score'].hist()
df['Credit Score']=df['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
df['Credit Score'].hist()
#def replaceCreditScore(score):
# if score > 800:
# a=score/10
# return a
# else:
# return score
df['Home Ownership'].value_counts()
del df['Home Ownership']
test_df=df[df['Annual Income']!= np.nan]
h=test_df[test_df['Annual Income']<0.4e7]
h['Annual Income'].hist(bins=50)
print h['Annual Income'].mean(),h['Annual Income'].median()
#del df['Purpose']
df['Purpose'].value_counts()
#del df['Purpose']
#Dont Delete
#df['Months since last delinquent'].fillna('3',inplace=True)
df['Months since last delinquent'].median()
df['Months since last delinquent'].hist()
df['Months since last delinquent'].fillna(df['Months since last delinquent'].median(),inplace=True)
df['Months since last delinquent']
df['Number of Open Accounts'].value_counts()
df['Number of Credit Problems'].hist(log=True)
df[df['Tax Liens']==0]['Tax Liens'].unique()
df['Current Credit Balance']
df.Bankruptcies.value_counts(dropna=False)
bank_df=df[df.Bankruptcies.isnull()]
k=bank_df[bank_df['Credit Score']<800]
k['Credit Score'].hist(bins=50)
df.dropna(subset=['Bankruptcies'],inplace=True)
bank_df['Months since last delinquent']
df['Tax Liens'].shape
df.isnull().sum()
df.dropna(axis=0,subset=['Tax Liens'], inplace=True)
df['Tax Liens'].shape
#for index,row in df.iterrows():
#if row['Maximum Open Credit']==9999:
# row['Maximum Open Credit'].astype(float)
#df['Maximum Open Credit'].astype(float,inplace=True)
#df['Maximum Open Credit'].hist()
#tmp3_df['Years in current job'].astype(int)
#tmp3_df['Years in current job'].hist()
df.isnull().sum()
train_data=df[df['Credit Score'].notnull()]
test_data=df[df['Credit Score'].isnull()]
to_predict1=df['Annual Income']
to_predict2=df['Credit Score']
train_data.shape,test_data.shape,df.shape
#def multiple_regression_model(train,test,input_features,to_predict):
# regr = LinearRegression()
# regr.fit(train.as_matrix(columns = input_features), train.as_matrix(columns =to_predict))
# return regr
input_features=df[['Years of Credit History','Monthly Debt']]
to_predict1=df[['Annual Income']]
#rg=multiple_regression_model(train_data,test_data,input_features,['Annual Income'])
regr1 = LinearRegression(fit_intercept=False)
regr2 = LinearRegression()
#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))
#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))
regr1.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job']],train_data['Annual Income'])
filled_in_annual_income=regr1.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job']])
from sklearn.metrics import mean_squared_error
mean_squared_error(train_data['Annual Income'],regr1.predict(train_data[['Years of Credit History','Monthly Debt','Years in current job']]))**0.5
test_data['Annual Income']=filled_in_annual_income
test_data['Annual Income'].isnull().sum()
train_data['Annual Income'].hist(log=True,bins=50)
df=train_data.append(test_data)
df['Annual Income'].hist(bins=50,log=True)
regr2.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']],train_data['Credit Score'])
filled_in_credit_score=regr2.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']])
test_data['Credit Score']=filled_in_credit_score
test_data['Credit Score'].isnull().sum()
result=pd.concat([train_data,test_data])
result.sort_index(ascending=True,inplace=True)
result.to_csv('loan_predictions_output.csv')
result.head(3)
#new_df=df.duplicated(subset='Loan ID')
'''newdf=groupby(df['Loan ID']).max()
#agg({'Loan Status':'max', 'height':sum, 'weight':sum})[['age','height','weight']]
'''
#newdf=df.groupby(df['Loan ID'])[''].max()
#newdf.shape
#newdf[newdf['Loan ID']=='b624940f-da82-410b-b885-77d92c2e4f0c']
#df2['Credit Score'].dropna(inplace=True)
#df2['Annual Income'].dropna(inplace=True)
#df2['Annual Income'].isnull().sum()
df2.dropna(inplace=True)
h=df2[ (df2['Current Loan Amount'] < 9999999 ) & (df2['Credit Score']>700 )& (df2['Credit Score'] <800)]
#plt.hist(h['Current Loan Amount'])
print h['Current Loan Amount'].mean()
print h['Current Loan Amount'].median() ############# This one
print h['Current Loan Amount'].mode()
df2['Current Loan Amount'].replace(99999999,h['Current Loan Amount'].median(),inplace=True)
df2['Credit Score']=df2['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
df3=df2[df2['Credit Score'] < 800]
high_score=df3[df3['Credit Score']>700]
low_score=df3[df3['Credit Score']<600]
#df3['Maximum Open Credit']=df3['Maximum Open Credit'].astype(float)
#plt.scatter( df3['Maximum Open Credit'],df3['Credit Score'])