import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

df=pd.read_csv('~/dm/pbl1/LoansTrainingSet.csv',low_memory=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256984 entries, 0 to 256983
Data columns (total 19 columns):
Loan ID                         256984 non-null object
Customer ID                     256984 non-null object
Loan Status                     256984 non-null object
Current Loan Amount             256984 non-null int64
Term                            256984 non-null object
Credit Score                    195308 non-null float64
Years in current job            256984 non-null object
Home Ownership                  256984 non-null object
Annual Income                   195308 non-null float64
Purpose                         256984 non-null object
Monthly Debt                    256984 non-null object
Years of Credit History         256984 non-null float64
Months since last delinquent    116601 non-null float64
Number of Open Accounts         256984 non-null int64
Number of Credit Problems       256984 non-null int64
Current Credit Balance          256984 non-null int64
Maximum Open Credit             256984 non-null object
Bankruptcies                    256455 non-null float64
Tax Liens                       256961 non-null float64
dtypes: float64(6), int64(4), object(9)
memory usage: 37.3+ MB

df[df['Credit Score'].isnull()]['Loan ID'].value_counts()

63164ab4-08af-4b58-85c0-90751bc49f88    4
1724fdd3-4b1a-49a4-8840-0f8d666ba992    4
b4d07db5-bc2f-45ef-af19-e73b3e52e706    3
bda9d10c-1306-47ea-9438-bf593d7a895f    3
65602505-7fdb-45dc-af2f-4820718025cf    3
2bd9fa11-32fc-438a-9a76-d275e1f23a7e    3
067da0c9-c3fe-4dfe-99e5-5e988b2f4ddb    3
f33b0e23-d566-4b8f-9526-3a3d5da846af    3
26950f0b-bb37-4e17-a86f-52c6d01db2aa    3
3cba834f-6f5f-4fdf-b351-fa7ca76d4454    3
fbc90756-e1b8-4203-93ab-b3edb428f0e6    3
e3e25be2-ab13-49b2-89c8-9a1be2c78361    3
d0e451d9-d1bf-4d86-8bda-bde83a16d964    3
12f7ace5-8b92-464b-b875-bb9ac285c186    3
977a4f3a-4c42-40c9-abfc-01bf620321b3    3
f28d1d2f-d5fc-425b-833c-d1ca2d428788    3
e49915d9-eff7-4fbe-b7f9-c4e2bba0e34b    3
419fb4f2-5a21-487a-9b8d-7de6c6b97470    3
4b550239-305e-453f-9ac7-ff9246afa276    3
9cd8febc-48ff-4f54-8475-e57a4e5b1c9b    3
6bbec77f-2d74-4314-a2bb-f93a42e8b628    3
deab27c3-ebd0-45f9-be33-15d2b5ab05ca    3
03c9cefe-3893-4241-82c1-a003752fa8cf    3
f7088804-c8da-4883-ba06-78bd13ee5ced    3
0799c191-2514-4651-b542-76f0b3123582    3
427d5b9a-abf0-48b2-9453-ab6cf8105556    3
115478a8-37f7-48a3-85a4-636b99157d84    3
19cd8bc8-d953-494a-9fa8-da847e4ff46a    3
49ec5aac-f6ac-435e-a7e7-79c01e2e8689    3
0a575c08-b977-4be8-9a77-92ff9a97f771    3
                                       ..
db76728f-0282-4799-9381-009a6758a925    1
3fe298a0-b5bb-47b6-b1dc-9872a317fa54    1
da281e5f-179c-4ee1-9531-8ae8a1ddfee5    1
3d5b48a9-3f37-4306-b850-21ac409966f7    1
a6819245-f0ad-4d84-869c-a9ce4ebdfdb6    1
86bd5a3e-9f86-40da-a8b1-125f580269eb    1
22e61937-6443-407a-afca-dcb331beab56    1
af142acf-558a-40d5-a9f8-cc22f452f4d0    1
c26092af-8207-447d-afd9-9e6ae6fa7e14    1
87396623-9cd2-4cb2-9f4a-066d54ff98c7    1
3056f6f7-0299-40bd-9eee-57c1295467af    1
5e0b07d4-0e88-4a6f-941e-f816bd040c19    1
66a9a8f2-17bc-4aa8-96ba-f6461befd471    1
d28f9534-55a0-4c94-980d-4c3a16d1bc19    1
e1f76d2a-953a-4af6-a920-ebe8676ffc4f    1
1299f184-7afe-4afa-8bf5-60604e70ec2c    1
502493d3-3c9e-4f77-a5ef-7d49a9e581e7    1
411cfb9e-c5d3-4e6a-8e0c-50ba1601b2ec    1
bc8369f0-7357-41f0-8479-816348668e06    1
28d9cb33-5120-4e08-b3da-cb3f9f4d4114    1
a26bd1d0-37ac-4cb1-a820-043d725fd675    1
759d7bf2-121e-4d20-aac0-f41e04a5b9cb    1
9cf8b557-115d-47ae-b1f0-fc8ae84c3b8c    1
19351776-b036-4525-8212-d978814a6b76    1
6d3ab177-4595-4ad9-b3d8-e6f6eab4c8e0    1
1edf241d-95d3-41ad-971c-9eab957c618c    1
a680199d-1e4f-4f55-bd8d-b3d2566ef2c0    1
245fc1f1-0850-46f1-ae4a-b63937fda335    1
aa0cc942-3cf5-4d81-ad36-aa179fee2010    1
29396142-2f09-4ec8-b270-ca269b602a89    1
Name: Loan ID, Length: 59156, dtype: int64

df[df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']

Monthly Debt¶

df['Monthly Debt']=df['Monthly Debt'].str.replace('$','')

df['Monthly Debt']=df['Monthly Debt'].replace('[ ]','',regex=True)

df['Monthly Debt']=df['Monthly Debt'].replace(',','',regex=True)

df['Monthly Debt']=df['Monthly Debt'].astype(float)

col=df.columns
print col
for i in col:
    
    print i,'  ',df[i].isnull().sum()
  #  print i,'  ', df[i].nunique()

Index([u'Loan ID', u'Customer ID', u'Loan Status', u'Current Loan Amount',
       u'Term', u'Credit Score', u'Years in current job', u'Home Ownership',
       u'Annual Income', u'Purpose', u'Monthly Debt',
       u'Years of Credit History', u'Months since last delinquent',
       u'Number of Open Accounts', u'Number of Credit Problems',
       u'Current Credit Balance', u'Maximum Open Credit', u'Bankruptcies',
       u'Tax Liens'],
      dtype='object')
Loan ID    0
Customer ID    0
Loan Status    0
Current Loan Amount    0
Term    0
Credit Score    61676
Years in current job    0
Home Ownership    0
Annual Income    61676
Purpose    0
Monthly Debt    0
Years of Credit History    0
Months since last delinquent    140383
Number of Open Accounts    0
Number of Credit Problems    0
Current Credit Balance    0
Maximum Open Credit    0
Bankruptcies    529
Tax Liens    23

cat_df=df.select_dtypes(include=['object'])

Maximum open credit¶

df['Maximum Open Credit'].replace('#VALUE!',np.nan,regex=True,inplace=True)

df['Maximum Open Credit'].isnull().sum()

4

df['Maximum Open Credit'].iloc[187512]

nan

#length=len(df['Maximum Open Credit'])

'''
for i in range(0,length):
    
    #v=df['Maximum Open Credit'].iloc[i]
    
    try:
        
        v=float(df['Maximum Open Credit'].iloc[i])
    except ValueError,e:
        print 'line i is bad ',i
            
'''

"\nfor i in range(0,length):\n    \n    #v=df['Maximum Open Credit'].iloc[i]\n    \n    try:\n        \n        v=float(df['Maximum Open Credit'].iloc[i])\n    except ValueError,e:\n        print 'line i is bad ',i\n            \n"

df['Maximum Open Credit']=df['Maximum Open Credit'].astype(float)

df.dropna(subset=['Maximum Open Credit'],inplace=True)

#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)

#df['Maximum Open Credit']=df['Maximum Open Credit'].str.replace(r"[a-zA-Z]",'')#extract('(\d+)', expand=False)

#df['Maximum Open Credit']=df['Maximum Open Credit'].replace('\D','')

Years in Current Job¶

df['Years in current job'].value_counts()

10+ years    78896
2 years      23460
< 1 year     21012
3 years      20659
5 years      17864
1 year       16746
4 years      16166
6 years      14597
7 years      13966
8 years      12206
n/a          11476
9 years       9932
Name: Years in current job, dtype: int64

11476.0/250000 * 100.0

4.5904

tmp2_df=df[df['Years in current job']=='n/a']

j=pd.DataFrame()
df['Years in current job'].replace('< 1 year','1',inplace=True)
df['Years in current job'].replace(' 1 year','1',inplace=True)

df['Years in current job'].replace('1 year','1',inplace=True)
df['Years in current job'].replace('2 years','1',inplace=True)
df['Years in current job'].replace('3 years','1',inplace=True)
df['Years in current job'].replace('4 years','1',inplace=True)
df['Years in current job'].replace('5 years','1',inplace=True)
df['Years in current job'].replace('6 years','2',inplace=True)
df['Years in current job'].replace('7 years','2',inplace=True)

df['Years in current job'].replace('8 years','2',inplace=True)
df['Years in current job'].replace('9 years','2',inplace=True)
df['Years in current job'].replace('10+ years','2',inplace=True)
df['Years in current job'].replace('n/a','1',inplace=True)
tmp3_df=df[df['Years in current job']!='n/a']

df['Years in current job'].value_counts()
#df['Years in current job']=df['Years in current job'].astype('category')

2    129597
1    127383
Name: Years in current job, dtype: int64

df.head(2)

df['Purpose'].value_counts()

Debt Consolidation      170830
Home Improvements        12806
other                    11664
Other                     8251
Business Loan             3585
Buy a Car                 2913
Medical Bills             2369
Take a Trip               1307
Buy House                 1305
Educational Expenses       216
Name: Purpose, dtype: int64

df['Years in current job']=df['Years in current job'].astype(int)

#df['Years in current job'].plot(kind='barh')

df['Years in current job']=df['Years in current job'].astype('category')
df['Years in current job'].value_counts()

2    109022
1    106224
Name: Years in current job, dtype: int64

#dont del df['Years in current job']

++++++++++++++++++++++++++++++++++++++++++++++++++++=¶

num_df=df.select_dtypes(include=['float64','int64'])

num_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256980 entries, 0 to 256983
Data columns (total 13 columns):
Current Loan Amount             256980 non-null int64
Credit Score                    195305 non-null float64
Years in current job            256980 non-null int64
Annual Income                   195305 non-null float64
Monthly Debt                    256980 non-null float64
Years of Credit History         256980 non-null float64
Months since last delinquent    116599 non-null float64
Number of Open Accounts         256980 non-null int64
Number of Credit Problems       256980 non-null int64
Current Credit Balance          256980 non-null int64
Maximum Open Credit             256980 non-null float64
Bankruptcies                    256451 non-null float64
Tax Liens                       256957 non-null float64
dtypes: float64(8), int64(5)
memory usage: 27.4 MB

num_df['Loan ID']=df['Loan ID']

num_df.isnull().sum()

Current Loan Amount                  0
Credit Score                     61675
Years in current job                 0
Annual Income                    61675
Monthly Debt                         0
Years of Credit History              0
Months since last delinquent    140381
Number of Open Accounts              0
Number of Credit Problems            0
Current Credit Balance               0
Maximum Open Credit                  0
Bankruptcies                       529
Tax Liens                           23
Loan ID                              0
dtype: int64

num_df=num_df.groupby(by='Loan ID',as_index=False).max()
num_df2=num_df.groupby(by='Loan ID',as_index=False).min()

num_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215698 entries, 0 to 215697
Data columns (total 14 columns):
Loan ID                         215698 non-null object
Current Loan Amount             215698 non-null int64
Credit Score                    171200 non-null float64
Years in current job            215698 non-null int64
Annual Income                   171200 non-null float64
Monthly Debt                    215698 non-null float64
Years of Credit History         215698 non-null float64
Months since last delinquent    97437 non-null float64
Number of Open Accounts         215698 non-null int64
Number of Credit Problems       215698 non-null int64
Current Credit Balance          215698 non-null int64
Maximum Open Credit             215698 non-null float64
Bankruptcies                    215246 non-null float64
Tax Liens                       215676 non-null float64
dtypes: float64(8), int64(5), object(1)
memory usage: 24.7+ MB

num_df[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']
num_df2[num_df['Loan ID']=='9cd8febc-48ff-4f54-8475-e57a4e5b1c9b']

cat_df=df.select_dtypes(include=['object'])

del cat_df['Customer ID']

cat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256980 entries, 0 to 256983
Data columns (total 5 columns):
Loan ID           256980 non-null object
Loan Status       256980 non-null object
Term              256980 non-null object
Home Ownership    256980 non-null object
Purpose           256980 non-null object
dtypes: object(5)
memory usage: 11.8+ MB

cat_df.drop_duplicates(['Loan ID'],inplace=True)

cat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215698 entries, 0 to 256983
Data columns (total 5 columns):
Loan ID           215698 non-null object
Loan Status       215698 non-null object
Term              215698 non-null object
Home Ownership    215698 non-null object
Purpose           215698 non-null object
dtypes: object(5)
memory usage: 9.9+ MB

cat_df.shape,num_df.shape

((215698, 5), (215698, 14))

++++++++++++++++++++++++++++++++++++++++++++++++++++=¶

df=pd.merge(cat_df,num_df,on='Loan ID')

df.shape

(215698, 18)

df.nunique()

Loan ID                         215698
Loan Status                          2
Term                                 2
Home Ownership                       4
Purpose                             10
Current Loan Amount              27308
Credit Score                       334
Years in current job                 2
Annual Income                    60412
Monthly Debt                    128759
Years of Credit History            541
Months since last delinquent       131
Number of Open Accounts             59
Number of Credit Problems           12
Current Credit Balance           45630
Maximum Open Credit              68884
Bankruptcies                         8
Tax Liens                           12
dtype: int64

Loan Status:¶

df['Loan Status'].isnull().sum()

0

df['Loan Status'].value_counts()

Fully Paid     176191
Charged Off     39507
Name: Loan Status, dtype: int64

Current Loan Amount¶

df['Current Loan Amount'].isnull().sum()

0

df['Current Loan Amount'].value_counts().sum()

215698

df['Current Loan Amount'].hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f196890acd0>

tmp_df=df[df['Current Loan Amount']<99999]

tmp_df['Current Loan Amount'].hist()
print tmp_df.shape,df.shape, df.shape[0]/tmp_df.shape[0]

<matplotlib.axes._subplots.AxesSubplot at 0x7f19737c2c10>

(180488, 18) (215698, 18) 1

tmp_df['Current Loan Amount'].mean(),tmp_df['Current Loan Amount'].median(),tmp_df['Current Loan Amount'].mode()

(13807.304496697841, 11968.0, 0    9793
 dtype: int64)

Term¶

df['Term'].isnull().sum(), df['Term'].nunique(),df['Term'].value_counts()

(0, 2, Short Term    166522
 Long Term      49176
 Name: Term, dtype: int64)

Credit Score¶

df['Credit Score'].isnull().sum(),df['Credit Score'].nunique(),df['Credit Score'].value_counts().sum()

(44498, 334, 171200)

df2=df[df['Credit Score']<800]

df2['Credit Score'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1970ac2b10>

df['Credit Score']=df['Credit Score'].apply(lambda x: x/10 if x > 800 else x)

df['Credit Score'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1970befd90>

#def replaceCreditScore(score):

#    if score > 800:
#        a=score/10
#        return a
#    else:
#        return score

Home Ownership¶

df['Home Ownership'].value_counts()

Home Mortgage    106491
Rent              89618
Own Home          19094
HaveMortgage        495
Name: Home Ownership, dtype: int64

del df['Home Ownership']

Annual Income¶

test_df=df[df['Annual Income']!= np.nan]

h=test_df[test_df['Annual Income']<0.4e7]
h['Annual Income'].hist(bins=50)
print h['Annual Income'].mean(),h['Annual Income'].median()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1971abe690>

72679.9039469 62112.0

Purpose¶

#del df['Purpose']
df['Purpose'].value_counts()

Debt Consolidation      171095
Home Improvements        12839
other                    11726
Other                     8281
Business Loan             3611
Buy a Car                 2926
Medical Bills             2375
Take a Trip               1312
Buy House                 1308
Educational Expenses       225
Name: Purpose, dtype: int64

#del df['Purpose']
#Dont Delete

Months since last¶

#df['Months since last delinquent'].fillna('3',inplace=True)

df['Months since last delinquent'].median()

32.0

df['Months since last delinquent'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1971e6ded0>

df['Months since last delinquent'].fillna(df['Months since last delinquent'].median(),inplace=True)

df['Months since last delinquent']

0         41.0
1         32.0
2         32.0
3         32.0
4         32.0
5         32.0
6         73.0
7         32.0
8         29.0
9         32.0
10        32.0
11        43.0
12        32.0
13        32.0
14        32.0
15        32.0
16        32.0
17        79.0
18        32.0
19        32.0
20        32.0
21        32.0
22         2.0
23        32.0
24        15.0
25        27.0
26         5.0
27        32.0
28        32.0
29        55.0
          ... 
215668    32.0
215669    32.0
215670    32.0
215671    11.0
215672    72.0
215673    32.0
215674    32.0
215675    32.0
215676    32.0
215677    46.0
215678    56.0
215679    22.0
215680    80.0
215681    47.0
215682    32.0
215683    62.0
215684    25.0
215685    32.0
215686    32.0
215687    18.0
215688    32.0
215689    32.0
215690    63.0
215691    32.0
215692     1.0
215693    32.0
215694    32.0
215695    47.0
215696    82.0
215697    11.0
Name: Months since last delinquent, Length: 215698, dtype: float64

Number of open accounts¶

df['Number of Open Accounts'].value_counts()

9     20533
10    19526
8     19488
11    17985
7     17626
12    15862
6     14797
13    13447
14    11380
5     10406
15     9070
16     7522
4      6158
17     6061
18     4857
19     3926
20     3030
3      2835
21     2340
22     1817
23     1429
24     1131
2       914
25      841
26      608
27      473
28      336
29      243
30      198
31      176
32      149
33       97
34       86
35       65
37       46
36       46
38       31
1        31
39       25
41       21
40       21
42       13
45        8
43        7
47        6
44        5
0         3
46        3
48        3
53        3
58        2
50        2
52        2
55        2
76        2
49        1
51        1
54        1
56        1
Name: Number of Open Accounts, dtype: int64

number of credit problems¶

df['Number of Credit Problems'].hist(log=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f197183c390>

df[df['Tax Liens']==0]['Tax Liens'].unique()

array([ 0.])

Current Credit Balance¶

df['Current Credit Balance']

0          6760
1          6262
2         20967
3         22529
4         17391
5          2289
6         14123
7         11970
8         10926
9          8880
10        32630
11         6817
12        11514
13         8519
14        17092
15        28513
16         8488
17         4636
18         9959
19         2184
20         8919
21        47201
22         6720
23         9126
24        24461
25        43375
26        27706
27        41603
28        26722
29         9031
          ...  
215668     3501
215669    13192
215670    18732
215671    13960
215672     6453
215673    13892
215674    19104
215675    35089
215676    10116
215677     3686
215678    22110
215679     3274
215680     5022
215681     7297
215682     3860
215683     7114
215684     3146
215685     9284
215686    12818
215687    10629
215688    13415
215689    10013
215690     4237
215691    20183
215692     2590
215693     4176
215694    43992
215695     1717
215696     3315
215697    28317
Name: Current Credit Balance, Length: 215698, dtype: int64

Bankruptcies¶

df.Bankruptcies.value_counts(dropna=False)

 0.0    192725
 1.0     21506
 2.0       818
NaN        452
 3.0       155
 4.0        25
 5.0        13
 6.0         3
 7.0         1
Name: Bankruptcies, dtype: int64

bank_df=df[df.Bankruptcies.isnull()]

k=bank_df[bank_df['Credit Score']<800]

k['Credit Score'].hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f19715f6510>

df.dropna(subset=['Bankruptcies'],inplace=True)

bank_df['Months since last delinquent']

98         0.0
150        0.0
165       15.0
181       31.0
188        0.0
320        0.0
2128       0.0
2495       0.0
3711       0.0
3963       0.0
4559       4.0
4588       0.0
4590       0.0
5181      16.0
5784      25.0
6088       0.0
6228       0.0
6925       0.0
7225      47.0
7548       5.0
7612       0.0
8628       0.0
9770       0.0
9807      19.0
9976       0.0
10534      0.0
11216      0.0
11252      0.0
12118      0.0
13325      0.0
          ... 
197795    21.0
199008     0.0
199094     0.0
199337    41.0
200174     0.0
200700    24.0
200811     0.0
200857     0.0
201024    61.0
201200     0.0
201327    54.0
203022    41.0
203279    59.0
203360    34.0
203547     0.0
203758    39.0
204370     0.0
204463    63.0
205771    56.0
206224     0.0
208313    46.0
209326     0.0
209378     0.0
209617     0.0
210327     0.0
211915     0.0
211954     0.0
214383    49.0
214752     6.0
214783     0.0
Name: Months since last delinquent, Length: 452, dtype: float64

Tax Liens¶

df['Tax Liens'].shape

(215246,)

df.isnull().sum()

Loan ID                             0
Loan Status                         0
Term                                0
Purpose                             0
Current Loan Amount                 0
Credit Score                    44411
Years in current job                0
Annual Income                   44411
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                        0
Tax Liens                           0
dtype: int64

df.dropna(axis=0,subset=['Tax Liens'], inplace=True)

df['Tax Liens'].shape

(215246,)

#for index,row in df.iterrows():
    #if row['Maximum Open Credit']==9999:
#    row['Maximum Open Credit'].astype(float)

#df['Maximum Open Credit'].astype(float,inplace=True)
#df['Maximum Open Credit'].hist()

#tmp3_df['Years in current job'].astype(int)
#tmp3_df['Years in current job'].hist()

Linear Regression¶

df.isnull().sum()

Loan ID                             0
Loan Status                         0
Term                                0
Purpose                             0
Current Loan Amount                 0
Credit Score                    44411
Years in current job                0
Annual Income                   44411
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                        0
Tax Liens                           0
dtype: int64

train_data=df[df['Credit Score'].notnull()]
test_data=df[df['Credit Score'].isnull()]
to_predict1=df['Annual Income']
to_predict2=df['Credit Score']

train_data.shape,test_data.shape,df.shape

((170835, 17), (44411, 17), (215246, 17))

#def multiple_regression_model(train,test,input_features,to_predict):
#    regr = LinearRegression()
#    regr.fit(train.as_matrix(columns = input_features), train.as_matrix(columns =to_predict))
#    return regr

input_features=df[['Years of Credit History','Monthly Debt']]
to_predict1=df[['Annual Income']]

#rg=multiple_regression_model(train_data,test_data,input_features,['Annual Income'])
regr1 = LinearRegression(fit_intercept=False)
regr2 = LinearRegression()

#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))
#regr.fit(train_data.as_matrix(columns = ['Years of Credit History','Current Loan Amount','Monthly Debt']), train_data.as_matrix(columns =to_predict1))

regr1.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job']],train_data['Annual Income'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

filled_in_annual_income=regr1.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job']])

from sklearn.metrics import mean_squared_error

mean_squared_error(train_data['Annual Income'],regr1.predict(train_data[['Years of Credit History','Monthly Debt','Years in current job']]))**0.5

50389.770380903654

test_data['Annual Income']=filled_in_annual_income

test_data['Annual Income'].isnull().sum()

0

train_data['Annual Income'].hist(log=True,bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f19716814d0>

df=train_data.append(test_data)

df['Annual Income'].hist(bins=50,log=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f1971f81690>

Filling Credit Score¶

regr2.fit(train_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']],train_data['Credit Score'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

filled_in_credit_score=regr2.predict(test_data[['Years of Credit History','Monthly Debt','Years in current job','Annual Income']])

test_data['Credit Score']=filled_in_credit_score

test_data['Credit Score'].isnull().sum()

0

result=pd.concat([train_data,test_data])

result.sort_index(ascending=True,inplace=True)
result.to_csv('loan_predictions_output.csv')

result.head(3)

OLD:¶

#new_df=df.duplicated(subset='Loan ID')

'''newdf=groupby(df['Loan ID']).max()
#agg({'Loan Status':'max', 'height':sum, 'weight':sum})[['age','height','weight']]
'''

"newdf=groupby(df['Loan ID']).max()\n#agg({'Loan Status':'max', 'height':sum, 'weight':sum})[['age','height','weight']]\n"

#newdf=df.groupby(df['Loan ID'])[''].max()

#newdf.shape

#newdf[newdf['Loan ID']=='b624940f-da82-410b-b885-77d92c2e4f0c']

#df2['Credit Score'].dropna(inplace=True)

#df2['Annual Income'].dropna(inplace=True)

#df2['Annual Income'].isnull().sum()

df2.dropna(inplace=True)

h=df2[ (df2['Current Loan Amount'] < 9999999 )  & (df2['Credit Score']>700 )& (df2['Credit Score'] <800)]
#plt.hist(h['Current Loan Amount'])
print h['Current Loan Amount'].mean()
print h['Current Loan Amount'].median() ############# This one
print h['Current Loan Amount'].mode()
df2['Current Loan Amount'].replace(99999999,h['Current Loan Amount'].median(),inplace=True)
df2['Credit Score']=df2['Credit Score'].apply(lambda x: x/10 if x > 800 else x)
df3=df2[df2['Credit Score'] < 800]
high_score=df3[df3['Credit Score']>700]
low_score=df3[df3['Credit Score']<600]
#df3['Maximum Open Credit']=df3['Maximum Open Credit'].astype(float)
#plt.scatter( df3['Maximum Open Credit'],df3['Credit Score'])

12800.6233092
11029.5
0    9820
dtype: int64

	Loan ID	Customer ID	Loan Status	Current Loan Amount	Term	Credit Score	Years in current job	Home Ownership	Annual Income	Purpose	Monthly Debt	Years of Credit History	Months since last delinquent	Number of Open Accounts	Current Credit Balance	Maximum Open Credit
157676	9cd8febc-48ff-4f54-8475-e57a4e5b1c9b	2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6	Charged Off	7808	Short Term	NaN	n/a	Rent	NaN	other	$747.95	41.2	28.0	10	3565	23299
157677	9cd8febc-48ff-4f54-8475-e57a4e5b1c9b	2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6	Charged Off	7900	Short Term	NaN	n/a	Rent	NaN	other	$756.80	41.2	28.0	10	3607	23575
157678	9cd8febc-48ff-4f54-8475-e57a4e5b1c9b	2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6	Charged Off	7900	Short Term	NaN	n/a	Rent	NaN	other	$756.80	41.2	28.0	10	3607	23575
157679	9cd8febc-48ff-4f54-8475-e57a4e5b1c9b	2d945c7c-6ad2-4e3c-ba83-d72e8a3db8b6	Charged Off	7808	Short Term	620.0	n/a	Rent	26091.0	other	$747.95	41.2	28.0	10	3565	23299

	Loan ID	Customer ID	Loan Status	Current Loan Amount	Term	Credit Score	Years in current job	Home Ownership	Annual Income	Purpose	Monthly Debt	Years of Credit History	Months since last delinquent	Number of Open Accounts	Number of Credit Problems	Current Credit Balance	Maximum Open Credit	Bankruptcies	Tax Liens
0	000025bb-5694-4cff-b17d-192b1a98ba44	5ebc8bb1-5eb9-4404-b11b-a6eebc401a19	Fully Paid	11520	Short Term	741.0	2	Home Mortgage	33694.0	Debt Consolidation	584.03	12.3	41.0	10	0	6760	16056.0	0.0	0.0
1	00002c49-3a29-4bd4-8f67-c8f8fbc1048c	927b388d-2e01-423f-a8dc-f7e42d668f46	Fully Paid	3441	Short Term	734.0	1	Home Mortgage	42269.0	other	1106.04	26.3	NaN	17	0	6262	19149.0	0.0	0.0