In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [2]:
df=pd.read_csv('~/dm/pbl1/LoansTrainingSet.csv',low_memory=False)
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256984 entries, 0 to 256983
Data columns (total 19 columns):
Loan ID                         256984 non-null object
Customer ID                     256984 non-null object
Loan Status                     256984 non-null object
Current Loan Amount             256984 non-null int64
Term                            256984 non-null object
Credit Score                    195308 non-null float64
Years in current job            256984 non-null object
Home Ownership                  256984 non-null object
Annual Income                   195308 non-null float64
Purpose                         256984 non-null object
Monthly Debt                    256984 non-null object
Years of Credit History         256984 non-null float64
Months since last delinquent    116601 non-null float64
Number of Open Accounts         256984 non-null int64
Number of Credit Problems       256984 non-null int64
Current Credit Balance          256984 non-null int64
Maximum Open Credit             256984 non-null object
Bankruptcies                    256455 non-null float64
Tax Liens                       256961 non-null float64
dtypes: float64(6), int64(4), object(9)
memory usage: 37.3+ MB
In [4]:
df[df['Credit Score'].isnull()]['Loan ID'].value_counts()
Out[4]:
63164ab4-08af-4b58-85c0-90751bc49f88    4
1724fdd3-4b1a-49a4-8840-0f8d666ba992    4
b4d07db5-bc2f-45ef-af19-e73b3e52e706    3
bda9d10c-1306-47ea-9438-bf593d7a895f    3
65602505-7fdb-45dc-af2f-4820718025cf    3
2bd9fa11-32fc-438a-9a76-d275e1f23a7e    3
067da0c9-c3fe-4dfe-99e5-5e988b2f4ddb    3
f33b0e23-d566-4b8f-9526-3a3d5da846af    3
26950f0b-bb37-4e17-a86f-52c6d01db2aa    3
3cba834f-6f5f-4fdf-b351-fa7ca76d4454    3
fbc90756-e1b8-4203-93ab-b3edb428f0e6    3
e3e25be2-ab13-49b2-89c8-9a1be2c78361    3
d0e451d9-d1bf-4d86-8bda-bde83a16d964    3
12f7ace5-8b92-464b-b875-bb9ac285c186    3
977a4f3a-4c42-40c9-abfc-01bf620321b3    3
f28d1d2f-d5fc-425b-833c-d1ca2d428788    3
e49915d9-eff7-4fbe-b7f9-c4e2bba0e34b    3
419fb4f2-5a21-487a-9b8d-7de6c6b97470    3
4b550239-305e-453f-9ac7-ff9246afa276    3
9cd8febc-48ff-4f54-8475-e57a4e5b1c9b    3
6bbec77f-2d74-4314-a2bb-f93a42e8b628    3
deab27c3-ebd0-45f9-be33-15d2b5ab05ca    3
03c9cefe-3893-4241-82c1-a003752fa8cf    3
f7088804-c8da-4883-ba06-78bd13ee5ced    3
0799c191-2514-4651-b542-76f0b3123582    3
427d5b9a-abf0-48b2-9453-ab6cf8105556    3
115478a8-37f7-48a3-85a4-636b99157d84    3
19cd8bc8-d953-494a-9fa8-da847e4ff46a    3
49ec5aac-f6ac-435e-a7e7-79c01e2e8689    3
0a575c08-b977-4be8-9a77-92ff9a97f771    3
                                       ..
db76728f-0282-4799-9381-009a6758a925    1
3fe298a0-b5bb-47b6-b1dc-9872a317fa54    1
da281e5f-179c-4ee1-9531-8ae8a1ddfee5    1
3d5b48a9-3f37-4306-b850-21ac409966f7    1
a6819245-f0ad-4d84-869c-a9ce4ebdfdb6    1
86bd5a3e-9f86-40da-a8b1-125f580269eb    1
22e61937-6443-407a-afca-dcb331beab56    1
af142acf-558a-40d5-a9f8-cc22f452f4d0    1
c26092af-8207-447d-afd9-9e6ae6fa7e14    1
87396623-9cd2-4cb2-9f4a-066d54ff98c7    1
3056f6f7-0299-40bd-9eee-57c1295467af    1
5e0b07d4-0e88-4a6f-941e-f816bd040c19    1
66a9a8f2-17bc-4aa8-96ba-f6461befd471    1
d28f9534-55a0-4c94-980d-4c3a16d1bc19    1
e1f76d2a-953a-4af6-a920-ebe8676ffc4f    1
1299f184-7afe-4afa-8bf5-60604e70ec2c    1
502493d3-3c9e-4f77-a5ef-7d49a9e581e7    1
411cfb9e-c5d3-4e6a-8e0c-50ba1601b2ec    1
bc8369f0-7357-41f0-8479-816348668e06    1
28d9cb33-5120-4e08-b3da-cb3f9f4d4114    1
a26bd1d0-37ac-4cb1-a820-043d725fd675    1
759d7bf2-121e-4d20-aac0-f41e04a5b9cb    1
9cf8b557-115d-47ae-b1f0-fc8ae84c3b8c    1
19351776-b036-4525-8212-d978814a6b76    1
6d3ab177-4595-4ad9-b3d8-e6f6eab4c8e0    1
1edf241d-95d3-41ad-971c-9eab957c618c    1
a680199d-1e4f-4f55-bd8d-b3d2566ef2c0    1
245fc1f1-0850-46f1-ae4a-b63937fda335    1
aa0cc942-3cf5-4d81-ad36-aa179fee2010    1
29396142-2f09-4ec8-b270-ca269b602a89    1
Name: Loan ID, Length: 59156, dtype: int64
In [5]:
df[df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
Out[5]:
Loan ID Customer ID Loan Status Current Loan Amount Term Credit Score Years in current job Home Ownership Annual Income Purpose Monthly Debt Years of Credit History Months since last delinquent Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies Tax Liens
157676 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6 Charged Off 7808 Short Term NaN n/a Rent NaN other $747.95 41.2 28.0 10 0 3565 23299 0.0 0.0
157677 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6 Charged Off 7900 Short Term NaN n/a Rent NaN other $756.80 41.2 28.0 10 0 3607 23575 0.0 0.0
157678 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6 Charged Off 7900 Short Term NaN n/a Rent NaN other $756.80 41.2 28.0 10 0 3607 23575 0.0 0.0
157679 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6 Charged Off 7808 Short Term 620.0 n/a Rent 26091.0 other $747.95 41.2 28.0 10 0 3565 23299 0.0 0.0

Monthly Debt

In [6]:
df['Monthly Debt']=df['Monthly Debt'].str.replace('$','')
In [7]:
df['Monthly Debt']=df['Monthly Debt'].replace('[ ]','',regex=True)
In [8]:
df['Monthly Debt']=df['Monthly Debt'].replace(',','',regex=True)
In [9]:
df['Monthly Debt']=df['Monthly Debt'].astype(float)
In [10]:
col=df.columns
print col
for i in col:
    
    print i,'  ',df[i].isnull().sum()
  #  print i,'  ', df[i].nunique()

   
Index([u'Loan ID', u'Customer ID', u'Loan Status', u'Current Loan Amount',
       u'Term', u'Credit Score', u'Years in current job', u'Home Ownership',
       u'Annual Income', u'Purpose', u'Monthly Debt',
       u'Years of Credit History', u'Months since last delinquent',
       u'Number of Open Accounts', u'Number of Credit Problems',
       u'Current Credit Balance', u'Maximum Open Credit', u'Bankruptcies',
       u'Tax Liens'],
      dtype='object')
Loan ID    0
Customer ID    0
Loan Status    0
Current Loan Amount    0
Term    0
Credit Score    61676
Years in current job    0
Home Ownership    0
Annual Income    61676
Purpose    0
Monthly Debt    0
Years of Credit History    0
Months since last delinquent    140383
Number of Open Accounts    0
Number of Credit Problems    0
Current Credit Balance    0
Maximum Open Credit    0
Bankruptcies    529
Tax Liens    23
In [11]:
cat_df=df.select_dtypes(include=['object'])

Maximum open credit

In [12]:
df['Maximum Open Credit'].replace('#VALUE!',np.nan,regex=True,inplace=True)
In [13]:
df['Maximum Open Credit'].isnull().sum()
Out[13]:
4
In [14]:
df['Maximum Open Credit'].iloc[187512]
Out[14]:
nan
In [15]:
#length=len(df['Maximum Open Credit'])
In [16]:
'''
for i in range(0,length):
    
    #v=df['Maximum Open Credit'].iloc[i]
    
    try:
        
        v=float(df['Maximum Open Credit'].iloc[i])
    except ValueError,e:
        print 'line i is bad ',i
            
'''
Out[16]:
"\nfor i in range(0,length):\n    \n    #v=df['Maximum Open Credit'].iloc[i]\n    \n    try:\n        \n        v=float(df['Maximum Open Credit'].iloc[i])\n    except ValueError,e:\n        print 'line i is bad ',i\n            \n"
In [17]:
df['Maximum Open Credit']=df['Maximum Open Credit'].astype(float) 
In [18]:
df.dropna(subset=['Maximum Open Credit'],inplace=True)
In [19]:
#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)
In [20]:
#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)
In [21]:
#df['Maximum Open Credit']=df['Maximum Open Credit'].replace('\D','')

Years in Current Job

In [22]:
df['Years in current job'].value_counts()
Out[22]:
10+ years    78896
2 years      23460
< 1 year     21012
3 years      20659
5 years      17864
1 year       16746
4 years      16166
6 years      14597
7 years      13966
8 years      12206
n/a          11476
9 years       9932
Name: Years in current job, dtype: int64
In [23]:
11476.0/250000 * 100.0
Out[23]:
4.5904
In [24]:
tmp2_df=df[df['Years in current job']=='n/a']

j=pd.DataFrame()
df['Years in current job'].replace('< 1 year','1',inplace=True)
df['Years in current job'].replace(' 1 year','1',inplace=True)

df['Years in current job'].replace('1 year','1',inplace=True)
df['Years in current job'].replace('2 years','1',inplace=True)
df['Years in current job'].replace('3 years','1',inplace=True)
df['Years in current job'].replace('4 years','1',inplace=True)
df['Years in current job'].replace('5 years','1',inplace=True)
df['Years in current job'].replace('6 years','2',inplace=True)
df['Years in current job'].replace('7 years','2',inplace=True)

df['Years in current job'].replace('8 years','2',inplace=True)
df['Years in current job'].replace('9 years','2',inplace=True)
df['Years in current job'].replace('10+ years','2',inplace=True)
df['Years in current job'].replace('n/a','1',inplace=True)
tmp3_df=df[df['Years in current job']!='n/a']

df['Years in current job'].value_counts()
#df['Years in current job']=df['Years in current job'].astype('category')
Out[24]:
2    129597
1    127383
Name: Years in current job, dtype: int64
In [25]:
df.head(2)
Out[25]:
Loan ID Customer ID Loan Status Current Loan Amount Term Credit Score Years in current job Home Ownership Annual Income Purpose Monthly Debt Years of Credit History Months since last delinquent Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies Tax Liens
0 000025bb-5694-4cff-b17d-192b1a98ba44 5ebc8bb1-5eb9-4404-b11b-a6eebc401a19 Fully Paid 11520 Short Term 741.0 2 Home Mortgage 33694.0 Debt Consolidation 584.03 12.3 41.0 10 0 6760 16056.0 0.0 0.0
1 00002c49-3a29-4bd4-8f67-c8f8fbc1048c 927b388d-2e01-423f-a8dc-f7e42d668f46 Fully Paid 3441 Short Term 734.0 1 Home Mortgage 42269.0 other 1106.04 26.3 NaN 17 0 6262 19149.0 0.0 0.0
In [127]:
df['Purpose'].value_counts()
Out[127]:
Debt Consolidation      170830
Home Improvements        12806
other                    11664
Other                     8251
Business Loan             3585
Buy a Car                 2913
Medical Bills             2369
Take a Trip               1307
Buy House                 1305
Educational Expenses       216
Name: Purpose, dtype: int64
In [26]:
df['Years in current job']=df['Years in current job'].astype(int)
In [27]:
#df['Years in current job'].plot(kind='barh')
In [130]:
df['Years in current job']=df['Years in current job'].astype('category')
df['Years in current job'].value_counts()
Out[130]:
2    109022
1    106224
Name: Years in current job, dtype: int64
In [28]:
#dont del df['Years in current job']

++++++++++++++++++++++++++++++++++++++++++++++++++++=

In [29]:
num_df=df.select_dtypes(include=['float64','int64'])
In [30]:
num_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 256980 entries, 0 to 256983
Data columns (total 13 columns):
Current Loan Amount             256980 non-null int64
Credit Score                    195305 non-null float64
Years in current job            256980 non-null int64
Annual Income                   195305 non-null float64
Monthly Debt                    256980 non-null float64
Years of Credit History         256980 non-null float64
Months since last delinquent    116599 non-null float64
Number of Open Accounts         256980 non-null int64
Number of Credit Problems       256980 non-null int64
Current Credit Balance          256980 non-null int64
Maximum Open Credit             256980 non-null float64
Bankruptcies                    256451 non-null float64
Tax Liens                       256957 non-null float64
dtypes: float64(8), int64(5)
memory usage: 27.4 MB
In [31]:
num_df['Loan ID']=df['Loan ID']
In [32]:
num_df.isnull().sum()
Out[32]:
Current Loan Amount                  0
Credit Score                     61675
Years in current job                 0
Annual Income                    61675
Monthly Debt                         0
Years of Credit History              0
Months since last delinquent    140381
Number of Open Accounts              0
Number of Credit Problems            0
Current Credit Balance               0
Maximum Open Credit                  0
Bankruptcies                       529
Tax Liens                           23
Loan ID                              0
dtype: int64
In [33]:
num_df=num_df.groupby(by='Loan ID',as_index=False).max()
num_df2=num_df.groupby(by='Loan ID',as_index=False).min()
In [131]:
num_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 215698 entries, 0 to 215697
Data columns (total 14 columns):
Loan ID                         215698 non-null object
Current Loan Amount             215698 non-null int64
Credit Score                    171200 non-null float64
Years in current job            215698 non-null int64
Annual Income                   171200 non-null float64
Monthly Debt                    215698 non-null float64
Years of Credit History         215698 non-null float64
Months since last delinquent    97437 non-null float64
Number of Open Accounts         215698 non-null int64
Number of Credit Problems       215698 non-null int64
Current Credit Balance          215698 non-null int64
Maximum Open Credit             215698 non-null float64
Bankruptcies                    215246 non-null float64
Tax Liens                       215676 non-null float64
dtypes: float64(8), int64(5), object(1)
memory usage: 24.7+ MB
In [34]:
num_df[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
num_df2[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
Out[34]:
Loan ID Current Loan Amount Credit Score Years in current job Annual Income Monthly Debt Years of Credit History Months since last delinquent Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies Tax Liens
132166 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 7900 620.0 1 26091.0 756.8 41.2 28.0 10 0 3607 23575.0 0.0 0.0
Out[34]:
Loan ID Current Loan Amount Credit Score Years in current job Annual Income Monthly Debt Years of Credit History Months since last delinquent Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies Tax Liens
132166 9cd8febc-48ff-4f54-8475-e57a4e5b1c9b 7900 620.0 1 26091.0 756.8 41.2 28.0 10 0 3607 23575.0 0.0 0.0
In [35]:
cat_df=df.select_dtypes(include=['object'])
In [36]:
del cat_df['Customer ID']
In [37]:
cat_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 256980 entries, 0 to 256983
Data columns (total 5 columns):
Loan ID           256980 non-null object
Loan Status       256980 non-null object
Term              256980 non-null object
Home Ownership    256980 non-null object
Purpose           256980 non-null object
dtypes: object(5)
memory usage: 11.8+ MB
In [38]:
cat_df.drop_duplicates(['Loan ID'],inplace=True)
In [39]:
cat_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 215698 entries, 0 to 256983
Data columns (total 5 columns):
Loan ID           215698 non-null object
Loan Status       215698 non-null object
Term              215698 non-null object
Home Ownership    215698 non-null object
Purpose           215698 non-null object
dtypes: object(5)
memory usage: 9.9+ MB
In [40]:
cat_df.shape,num_df.shape
Out[40]:
((215698, 5), (215698, 14))

++++++++++++++++++++++++++++++++++++++++++++++++++++=

In [41]:
df=pd.merge(cat_df,num_df,on='Loan ID')
In [42]:
df.shape
Out[42]:
(215698, 18)
In [43]:
df.nunique()
Out[43]:
Loan ID                         215698
Loan Status                          2
Term                                 2
Home Ownership                       4
Purpose                             10
Current Loan Amount              27308
Credit Score                       334
Years in current job                 2
Annual Income                    60412
Monthly Debt                    128759
Years of Credit History            541
Months since last delinquent       131
Number of Open Accounts             59
Number of Credit Problems           12
Current Credit Balance           45630
Maximum Open Credit              68884
Bankruptcies                         8
Tax Liens                           12
dtype: int64

Loan Status:

In [44]:
df['Loan Status'].isnull().sum()
Out[44]:
0
In [45]:
df['Loan Status'].value_counts()
Out[45]:
Fully Paid     176191
Charged Off     39507
Name: Loan Status, dtype: int64

Current Loan Amount

In [46]:
df['Current Loan Amount'].isnull().sum()
Out[46]:
0
In [47]:
df['Current Loan Amount'].value_counts().sum()
Out[47]:
215698
In [48]:
df['Current Loan Amount'].hist(bins=50)
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f196890acd0>
In [49]:
tmp_df=df[df['Current Loan Amount']<99999]
In [50]:
tmp_df['Current Loan Amount'].hist()
print tmp_df.shape,df.shape, df.shape[0]/tmp_df.shape[0]
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f19737c2c10>
(180488, 18) (215698, 18) 1
In [51]:
tmp_df['Current Loan Amount'].mean(),tmp_df['Current Loan Amount'].median(),tmp_df['Current Loan Amount'].mode()
Out[51]:
(13807.304496697841, 11968.0, 0    9793
 dtype: int64)

Term

In [52]:
df['Term'].isnull().sum(), df['Term'].nunique(),df['Term'].value_counts()
Out[52]:
(0, 2, Short Term    166522
 Long Term      49176
 Name: Term, dtype: int64)

Credit Score

In [53]:
df['Credit Score'].isnull().sum(),df['Credit Score'].nunique(),df['Credit Score'].value_counts().sum()
Out[53]:
(44498, 334, 171200)
In [54]:
df2=df[df['Credit Score']<800]
In [133]:
df2['Credit Score'].hist()
Out[133]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1970ac2b10>
In [56]:
df['Credit Score']=df['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
In [132]:
df['Credit Score'].hist()
Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1970befd90>
In [58]:
#def replaceCreditScore(score):

#    if score > 800:
#        a=score/10
#        return a
#    else:
#        return score

Home Ownership

In [59]:
df['Home Ownership'].value_counts()
Out[59]:
Home Mortgage    106491
Rent              89618
Own Home          19094
HaveMortgage        495
Name: Home Ownership, dtype: int64
In [60]:
del df['Home Ownership']

Annual Income

In [61]:
test_df=df[df['Annual Income']!= np.nan]
In [62]:
h=test_df[test_df['Annual Income']<0.4e7]
h['Annual Income'].hist(bins=50)
print h['Annual Income'].mean(),h['Annual Income'].median()        
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1971abe690>
72679.9039469 62112.0

Purpose

In [63]:
#del df['Purpose']
df['Purpose'].value_counts()
Out[63]:
Debt Consolidation      171095
Home Improvements        12839
other                    11726
Other                     8281
Business Loan             3611
Buy a Car                 2926
Medical Bills             2375
Take a Trip               1312
Buy House                 1308
Educational Expenses       225
Name: Purpose, dtype: int64
In [64]:
#del df['Purpose']
#Dont Delete

Months since last

In [65]:
#df['Months since last delinquent'].fillna('3',inplace=True)
In [66]:
df['Months since last delinquent'].median()
Out[66]:
32.0
In [67]:
df['Months since last delinquent'].hist()
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1971e6ded0>
In [68]:
df['Months since last delinquent'].fillna(df['Months since last delinquent'].median(),inplace=True)
In [69]:
df['Months since last delinquent']
Out[69]:
0         41.0
1         32.0
2         32.0
3         32.0
4         32.0
5         32.0
6         73.0
7         32.0
8         29.0
9         32.0
10        32.0
11        43.0
12        32.0
13        32.0
14        32.0
15        32.0
16        32.0
17        79.0
18        32.0
19        32.0
20        32.0
21        32.0
22         2.0
23        32.0
24        15.0
25        27.0
26         5.0
27        32.0
28        32.0
29        55.0
          ... 
215668    32.0
215669    32.0
215670    32.0
215671    11.0
215672    72.0
215673    32.0
215674    32.0
215675    32.0
215676    32.0
215677    46.0
215678    56.0
215679    22.0
215680    80.0
215681    47.0
215682    32.0
215683    62.0
215684    25.0
215685    32.0
215686    32.0
215687    18.0
215688    32.0
215689    32.0
215690    63.0
215691    32.0
215692     1.0
215693    32.0
215694    32.0
215695    47.0
215696    82.0
215697    11.0
Name: Months since last delinquent, Length: 215698, dtype: float64

Number of open accounts

In [70]:
df['Number of Open Accounts'].value_counts()
Out[70]:
9     20533
10    19526
8     19488
11    17985
7     17626
12    15862
6     14797
13    13447
14    11380
5     10406
15     9070
16     7522
4      6158
17     6061
18     4857
19     3926
20     3030
3      2835
21     2340
22     1817
23     1429
24     1131
2       914
25      841
26      608
27      473
28      336
29      243
30      198
31      176
32      149
33       97
34       86
35       65
37       46
36       46
38       31
1        31
39       25
41       21
40       21
42       13
45        8
43        7
47        6
44        5
0         3
46        3
48        3
53        3
58        2
50        2
52        2
55        2
76        2
49        1
51        1
54        1
56        1
Name: Number of Open Accounts, dtype: int64

number of credit problems

In [71]:
df['Number of Credit Problems'].hist(log=True)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f197183c390>
In [148]:
df[df['Tax Liens']==0]['Tax Liens'].unique()
Out[148]:
array([ 0.])

Current Credit Balance

In [72]:
df['Current Credit Balance']
Out[72]:
0          6760
1          6262
2         20967
3         22529
4         17391
5          2289
6         14123
7         11970
8         10926
9          8880
10        32630
11         6817
12        11514
13         8519
14        17092
15        28513
16         8488
17         4636
18         9959
19         2184
20         8919
21        47201
22         6720
23         9126
24        24461
25        43375
26        27706
27        41603
28        26722
29         9031
          ...  
215668     3501
215669    13192
215670    18732
215671    13960
215672     6453
215673    13892
215674    19104
215675    35089
215676    10116
215677     3686
215678    22110
215679     3274
215680     5022
215681     7297
215682     3860
215683     7114
215684     3146
215685     9284
215686    12818
215687    10629
215688    13415
215689    10013
215690     4237
215691    20183
215692     2590
215693     4176
215694    43992
215695     1717
215696     3315
215697    28317
Name: Current Credit Balance, Length: 215698, dtype: int64

Bankruptcies

In [73]:
df.Bankruptcies.value_counts(dropna=False)
Out[73]:
 0.0    192725
 1.0     21506
 2.0       818
NaN        452
 3.0       155
 4.0        25
 5.0        13
 6.0         3
 7.0         1
Name: Bankruptcies, dtype: int64
In [74]:
bank_df=df[df.Bankruptcies.isnull()]
In [75]:
k=bank_df[bank_df['Credit Score']<800]
In [76]:
k['Credit Score'].hist(bins=50)
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f19715f6510>
In [77]:
df.dropna(subset=['Bankruptcies'],inplace=True)
In [78]:
bank_df['Months since last delinquent']
Out[78]:
98         0.0
150        0.0
165       15.0
181       31.0
188        0.0
320        0.0
2128       0.0
2495       0.0
3711       0.0
3963       0.0
4559       4.0
4588       0.0
4590       0.0
5181      16.0
5784      25.0
6088       0.0
6228       0.0
6925       0.0
7225      47.0
7548       5.0
7612       0.0
8628       0.0
9770       0.0
9807      19.0
9976       0.0
10534      0.0
11216      0.0
11252      0.0
12118      0.0
13325      0.0
          ... 
197795    21.0
199008     0.0
199094     0.0
199337    41.0
200174     0.0
200700    24.0
200811     0.0
200857     0.0
201024    61.0
201200     0.0
201327    54.0
203022    41.0
203279    59.0
203360    34.0
203547     0.0
203758    39.0
204370     0.0
204463    63.0
205771    56.0
206224     0.0
208313    46.0
209326     0.0
209378     0.0
209617     0.0
210327     0.0
211915     0.0
211954     0.0
214383    49.0
214752     6.0
214783     0.0
Name: Months since last delinquent, Length: 452, dtype: float64

Tax Liens

In [79]:
df['Tax Liens'].shape
Out[79]:
(215246,)
In [80]:
df.isnull().sum()
Out[80]:
Loan ID                             0
Loan Status                         0
Term                                0
Purpose                             0
Current Loan Amount                 0
Credit Score                    44411
Years in current job                0
Annual Income                   44411
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                        0
Tax Liens                           0
dtype: int64
In [81]:
df.dropna(axis=0,subset=['Tax Liens'], inplace=True)
In [82]:
df['Tax Liens'].shape
Out[82]:
(215246,)
In [83]:
#for index,row in df.iterrows():
    #if row['Maximum Open Credit']==9999:
#    row['Maximum Open Credit'].astype(float)
    
In [84]:
#df['Maximum Open Credit'].astype(float,inplace=True)
#df['Maximum Open Credit'].hist()
In [85]:
#tmp3_df['Years in current job'].astype(int)
#tmp3_df['Years in current job'].hist()

Linear Regression



In [86]:
df.isnull().sum()
Out[86]:
Loan ID                             0
Loan Status                         0
Term                                0
Purpose                             0
Current Loan Amount                 0
Credit Score                    44411
Years in current job                0
Annual Income                   44411
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                        0
Tax Liens                           0
dtype: int64
In [87]:
train_data=df[df['Credit Score'].notnull()]
test_data=df[df['Credit Score'].isnull()]
to_predict1=df['Annual Income']
to_predict2=df['Credit Score']
In [88]:
train_data.shape,test_data.shape,df.shape
Out[88]:
((170835, 17), (44411, 17), (215246, 17))
In [89]:
#def multiple_regression_model(train,test,input_features,to_predict):
#    regr = LinearRegression()
#    regr.fit(train.as_matrix(columns = input_features), train.as_matrix(columns =to_predict))
#    return regr
In [90]:
input_features=df[['Years of Credit History','Monthly Debt']]
to_predict1=df[['Annual Income']]
In [149]:
#rg=multiple_regression_model(train_data,test_data,input_features,['Annual Income'])
regr1 = LinearRegression(fit_intercept=False)
regr2 = LinearRegression()

#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))
#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))
In [92]:
regr1.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job']],train_data['Annual Income'])
Out[92]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [93]:
filled_in_annual_income=regr1.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job']])
In [94]:
from sklearn.metrics import mean_squared_error
In [95]:
mean_squared_error(train_data['Annual Income'],regr1.predict(train_data[['Years of Credit History','Monthly Debt','Years in current job']]))**0.5
Out[95]:
50389.770380903654
In [96]:
test_data['Annual Income']=filled_in_annual_income
In [97]:
test_data['Annual Income'].isnull().sum()
Out[97]:
0
In [98]:
train_data['Annual Income'].hist(log=True,bins=50)
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f19716814d0>
In [99]:
df=train_data.append(test_data)
In [100]:
df['Annual Income'].hist(bins=50,log=True)
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1971f81690>

Filling Credit Score

In [101]:
regr2.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']],train_data['Credit Score'])
Out[101]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [102]:
filled_in_credit_score=regr2.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']])
In [103]:
test_data['Credit Score']=filled_in_credit_score
In [104]:
test_data['Credit Score'].isnull().sum()
Out[104]:
0
In [105]:
result=pd.concat([train_data,test_data])
In [106]:
result.sort_index(ascending=True,inplace=True)
result.to_csv('loan_predictions_output.csv')
In [107]:
result.head(3)
Out[107]:
Loan ID Loan Status Term Purpose Current Loan Amount Credit Score Years in current job Annual Income Monthly Debt Years of Credit History Months since last delinquent Number of Open Accounts Number of Credit Problems Current Credit Balance Maximum Open Credit Bankruptcies Tax Liens
0 000025bb-5694-4cff-b17d-192b1a98ba44 Fully Paid Short Term Debt Consolidation 11520 741.0 2 33694.0 584.03 12.3 41.0 10 0 6760 16056.0 0.0 0.0
1 00002c49-3a29-4bd4-8f67-c8f8fbc1048c Fully Paid Short Term other 3441 734.0 1 42269.0 1106.04 26.3 32.0 17 0 6262 19149.0 0.0 0.0
2 00002d89-27f3-409b-aa76-90834f359a65 Fully Paid Short Term Debt Consolidation 21029 747.0 2 90126.0 1321.85 28.8 32.0 5 0 20967 28335.0 0.0 0.0

OLD:

In [113]:
#new_df=df.duplicated(subset='Loan ID')
In [114]:
'''newdf=groupby(df['Loan ID']).max()
#agg({'Loan Status':'max', 'height':sum, 'weight':sum})[['age','height','weight']]
'''
Out[114]:
"newdf=groupby(df['Loan ID']).max()\n#agg({'Loan Status':'max', 'height':sum, 'weight':sum})[['age','height','weight']]\n"
In [115]:
#newdf=df.groupby(df['Loan ID'])[''].max()
In [116]:
#newdf.shape
In [117]:
#newdf[newdf['Loan ID']=='b624940f-da82-410b-b885-77d92c2e4f0c']
In [118]:
#df2['Credit Score'].dropna(inplace=True)
In [119]:
#df2['Annual Income'].dropna(inplace=True)
In [120]:
#df2['Annual Income'].isnull().sum()
In [121]:
df2.dropna(inplace=True)
In [122]:
h=df2[ (df2['Current Loan Amount'] < 9999999 )  & (df2['Credit Score']>700 )& (df2['Credit Score'] <800)]
#plt.hist(h['Current Loan Amount'])
print h['Current Loan Amount'].mean()
print h['Current Loan Amount'].median() ############# This one
print h['Current Loan Amount'].mode()
df2['Current Loan Amount'].replace(99999999,h['Current Loan Amount'].median(),inplace=True)
df2['Credit Score']=df2['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
df3=df2[df2['Credit Score'] < 800]
high_score=df3[df3['Credit Score']>700]
low_score=df3[df3['Credit Score']<600]
#df3['Maximum Open Credit']=df3['Maximum Open Credit'].astype(float)
#plt.scatter( df3['Maximum Open Credit'],df3['Credit Score'])
12800.6233092
11029.5
0    9820
dtype: int64