In [1]:
import pandas as pd
train = pd.read_csv('~/Desktop/DevMaster/Hack_day/train.csv', parse_dates=True)
test = pd.read_csv('~/Desktop/DevMaster/Hack_day/test.csv', parse_dates=True)

# it is recommended that you do this as to let the date column appropriate itself nicer
#train.info()
#train.head()


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')


from scipy import stats
from sklearn import linear_model
from sklearn import neighbors
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn import preprocessing

from math import log
In [2]:
train.head(2)
Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
In [3]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
In [4]:
train.notnull().sum()
Out[4]:
PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64
In [5]:
test.notnull().sum()
Out[5]:
PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64
In [6]:
# Set an empty list so we can determine which variables are numerical & categorical.
numericals = []
categoricals = []
# Set an empty list so we can determine which columns to delete.
deleteColumns = []
deleteRows = []
# Set an empty list so we can jot down our notes later for the other data preparation section.
reassess = []
transforms = []
mined = []
engineered = []
In [7]:
# Here is a list of the columns.
columns = train.columns
columns
Out[7]:
Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')
In [8]:
############################
############################
############################
In [9]:
### APPEND test to train
train.tail(1)
#train.nunique()
Out[9]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
In [10]:
test.tail(1)
Out[10]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C
In [11]:
full_train=train.append(test,ignore_index=True)
In [12]:
############################
############################
############################
############################
In [13]:
def age(a):
    
    if a <= 10.0:
        return 'A'
    elif a <= 20.0: ## should be 16
        return 'B'
    elif a <= 40.0:
        return 'C'
    elif a <= 55.0:
        return 'D'
    elif a <= 100:
        return 'E'
    
In [14]:
def fare(a):
    
    if a <= 200.0:
        return 'A'
    elif a <= 200.0:
        return 'B'
    elif a <= 300.0:
        return 'C'
    elif a <= 600.0:
        return 'D'
  
In [15]:
#full_train.Age.value_counts()
In [16]:
#train[train['Age'].isnull()]
#train[train['Embarked'].isnull()]
#train[train['Cabin'].isnull()]
#import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [17]:
#g=sns.pairplot(train)
#d={'NaN':0}

fare_median = full_train.groupby(['Sex', 'Pclass']).Fare.median()
fare_median.name = 'FareMedian'

age_mean = full_train.groupby(['Sex', 'Pclass']).Age.mean()
age_mean.name = 'AgeMean'

def join(df, stat):
    return pd.merge(df, stat.to_frame(), left_on=['Sex', 'Pclass'], right_index=True, how='left')

full_train['Fare'] = full_train.Fare.fillna(join(full_train, fare_median).FareMedian)
full_train['Age'] = full_train.Age.fillna(join(full_train, age_mean).AgeMean)

#a=full_train['Age'].mean()
#b=train['Cabin'].mean()
#full_train.Age.fillna(a,inplace=True)
#train['Embarked'].value_counts()
full_train.Embarked.fillna('S',inplace=True)

#train['Embarked'].mean()
#train['Cabin'].mean()
#train['Age'].fillna(4)
In [18]:
full_train.Age=full_train.Age.apply(lambda x: age(x))
In [19]:
full_train.Fare=full_train.Fare.apply(lambda x: fare(x))
In [20]:
full_train.groupby(['Sex']).mean()
Out[20]:
Parch PassengerId Pclass SibSp Survived
Sex
female 0.633047 648.186695 2.154506 0.652361 0.742038
male 0.247924 658.766311 2.372479 0.413998 0.188908
In [21]:
#sns.pairplot(train) 
#,x_vars=['Age','Embarked'],y_vars=['Age','Embarked']
In [22]:
#full_train['Fare'].fillna(train['Fare'].median(), inplace = True)

FEATURE ENGINEERING

In [23]:
######################################


import string

def subst_strings(single,whole_list):
    
    for single_str in whole_list:
          if string.find(single,single_str)  !=-1:
            return single_str
    return np.nan
    
    
    
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

full_train['Title']=full_train['Name'].map(lambda x: subst_strings(x,title_list))


def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

    
   
full_train['Age*Class']=full_train['Age']*full_train['Pclass']
full_train['Title']=full_train.apply(replace_titles,axis=1)

###########################################



full_train['FamilySize']=full_train['SibSp']+full_train['Parch']
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'NaN']
full_train['Deck']=full_train['Cabin'].map(lambda x: subst_strings(str(x), cabin_list))
In [24]:
full_train.Deck.fillna('C',inplace=True)
In [25]:
#full_train.Deck.value_counts()
In [26]:
full_train['Deck']
Out[26]:
0       C
1       C
2       C
3       C
4       C
5       C
6       E
7       C
8       C
9       C
10      G
11      C
12      C
13      C
14      C
15      C
16      C
17      C
18      C
19      C
20      C
21      D
22      C
23      A
24      C
25      C
26      C
27      C
28      C
29      C
       ..
1279    C
1280    C
1281    B
1282    D
1283    C
1284    C
1285    C
1286    C
1287    C
1288    B
1289    C
1290    C
1291    C
1292    C
1293    C
1294    C
1295    D
1296    D
1297    C
1298    C
1299    C
1300    C
1301    C
1302    C
1303    C
1304    C
1305    C
1306    C
1307    C
1308    C
Name: Deck, Length: 1309, dtype: object
In [27]:
full_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
Age            1309 non-null object
Cabin          295 non-null object
Embarked       1309 non-null object
Fare           1309 non-null object
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Title          1309 non-null object
Age*Class      1309 non-null object
FamilySize     1309 non-null int64
Deck           1309 non-null object
dtypes: float64(1), int64(5), object(10)
memory usage: 163.7+ KB
In [28]:
deleteColumns.append('PassengerId')
deleteColumns.append('Cabin')
#deleteColumns.append('Age')
afterDelete=full_train.copy()

for columns in range(len(deleteColumns)):
    del afterDelete[deleteColumns[columns]]



#del Xy['PassengerId']
del afterDelete['Ticket']
del afterDelete['Name']
In [29]:
afterDelete['Pclass']=afterDelete['Pclass'].astype('str')
In [30]:
afterDelete.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age           1309 non-null object
Embarked      1309 non-null object
Fare          1309 non-null object
Parch         1309 non-null int64
Pclass        1309 non-null object
Sex           1309 non-null object
SibSp         1309 non-null int64
Survived      891 non-null float64
Title         1309 non-null object
Age*Class     1309 non-null object
FamilySize    1309 non-null int64
Deck          1309 non-null object
dtypes: float64(1), int64(3), object(8)
memory usage: 122.8+ KB
In [31]:
#afterDelete['Deck']
In [32]:
afterDelete.head(2)
afterDelete.nunique()
afterDelete.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age           1309 non-null object
Embarked      1309 non-null object
Fare          1309 non-null object
Parch         1309 non-null int64
Pclass        1309 non-null object
Sex           1309 non-null object
SibSp         1309 non-null int64
Survived      891 non-null float64
Title         1309 non-null object
Age*Class     1309 non-null object
FamilySize    1309 non-null int64
Deck          1309 non-null object
dtypes: float64(1), int64(3), object(8)
memory usage: 122.8+ KB
In [66]:
Xy.columns
Out[66]:
Index([u'Parch', u'SibSp', u'Survived', u'FamilySize', u'Age_B', u'Age_C',
       u'Age_D', u'Age_E', u'Embarked_Q', u'Embarked_S', u'Fare_C', u'Fare_D',
       u'Pclass_2', u'Pclass_3', u'Sex_male', u'Title_Miss', u'Title_Mr',
       u'Title_Mrs', u'Age*Class_AA', u'Age*Class_AAA', u'Age*Class_B',
       u'Age*Class_BB', u'Age*Class_BBB', u'Age*Class_C', u'Age*Class_CC',
       u'Age*Class_CCC', u'Age*Class_D', u'Age*Class_DD', u'Age*Class_DDD',
       u'Age*Class_E', u'Age*Class_EE', u'Age*Class_EEE', u'Deck_B', u'Deck_C',
       u'Deck_D', u'Deck_E', u'Deck_F', u'Deck_G', u'Deck_T'],
      dtype='object')

dummies

Sex PClass Ticket

In [33]:
Xy = pd.get_dummies(afterDelete, drop_first = True)
#Xy.head(10)
Xy.columns
Xy.head(2)
#Xy.info()
Out[33]:
Parch SibSp Survived FamilySize Age_B Age_C Age_D Age_E Embarked_Q Embarked_S ... Age*Class_E Age*Class_EE Age*Class_EEE Deck_B Deck_C Deck_D Deck_E Deck_F Deck_G Deck_T
0 0 1 0.0 1 0 1 0 0 0 1 ... 0 0 0 0 1 0 0 0 0 0
1 0 1 1.0 1 0 1 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0

2 rows × 39 columns

In [34]:
Xy.head(2)
Out[34]:
Parch SibSp Survived FamilySize Age_B Age_C Age_D Age_E Embarked_Q Embarked_S ... Age*Class_E Age*Class_EE Age*Class_EEE Deck_B Deck_C Deck_D Deck_E Deck_F Deck_G Deck_T
0 0 1 0.0 1 0 1 0 0 0 1 ... 0 0 0 0 1 0 0 0 0 0
1 0 1 1.0 1 0 1 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0

2 rows × 39 columns

Final Step.............

In [35]:
len(Xy),len(train),len(test)
Out[35]:
(1309, 891, 418)
In [36]:
print('split at',len(train))
('split at', 891)
In [37]:
editedTrain=Xy[:891]
editedTest=Xy[891:]
In [38]:
len(editedTrain), len(editedTest)
Out[38]:
(891, 418)
In [39]:
y=editedTrain['Survived']
X=editedTrain.copy()
del X['Survived']
In [40]:
from sklearn.cross_validation import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size = .2, random_state = 44)
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [41]:
################################
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier


log = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
gradient = GradientBoostingClassifier()
ada = AdaBoostClassifier()




#################################
In [42]:
#models = []
#models.append(('l', LogisticRegression()))
#models.append(('t', DecisionTreeClassifier()))
#models.append(('f', RandomForestClassifier()))
#models.append(('g', GradientBoostingClassifier()))
#models.append(('a', AdaBoostClassifier()))




l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
ff = forest.fit(X_train, y_train)
importances=forest.feature_importances_
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)

NEW NEW NEW NEW


In [43]:
X_train.head(3)
Out[43]:
Parch SibSp FamilySize Age_B Age_C Age_D Age_E Embarked_Q Embarked_S Fare_C ... Age*Class_E Age*Class_EE Age*Class_EEE Deck_B Deck_C Deck_D Deck_E Deck_F Deck_G Deck_T
62 0 1 1 0 0 1 0 0 1 0 ... 0 0 0 0 1 0 0 0 0 0
847 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
511 0 0 0 0 1 0 0 0 1 0 ... 0 0 0 0 1 0 0 0 0 0

3 rows × 38 columns

In [44]:
type(importances)
Out[44]:
numpy.ndarray
In [45]:
len(forest.feature_importances_)
Out[45]:
38
In [46]:
len(X_train.columns)
Out[46]:
38
In [65]:
afterDelete.columns
Out[65]:
Index([u'Age', u'Embarked', u'Fare', u'Parch', u'Pclass', u'Sex', u'SibSp',
       u'Survived', u'Title', u'Age*Class', u'FamilySize', u'Deck'],
      dtype='object')
In [47]:
feat = pd.DataFrame()

#plt.figure(figsize=(10,20))
#plt.title('Feature Importance')

indices=np.argsort(importances)[::-1]
feat['x']=forest.feature_importances_
feat['y']=X_train.columns
feat.sort_values(by='x',inplace=True)
feat.set_index(feat['y'],inplace=True)

feat.plot(kind='barh',figsize=(10,10))
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa015e45890>
In [48]:
'''
plt.bar(range(X_train.shape[1]),importances[indices],align="center")#, color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]),indices)
plt.xlim([-1,X_train.shape[1]])
plt.show()
'''
Out[48]:
'\nplt.bar(range(X_train.shape[1]),importances[indices],align="center")#, color="r", yerr=std[indices], align="center")\nplt.xticks(range(X_train.shape[1]),indices)\nplt.xlim([-1,X_train.shape[1]])\nplt.show()\n'
In [49]:
#range(X_train.shape[1])

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
1. feature 13 (0.204818)
2. feature 15 (0.130891)
3. feature 14 (0.067256)
4. feature 1 (0.064559)
5. feature 2 (0.057337)
6. feature 12 (0.055584)
7. feature 22 (0.051394)
8. feature 16 (0.050875)
9. feature 8 (0.048219)
10. feature 32 (0.035121)
11. feature 0 (0.034542)
12. feature 34 (0.021379)
13. feature 24 (0.019956)
14. feature 27 (0.016990)
15. feature 7 (0.016420)
16. feature 11 (0.014977)
17. feature 4 (0.014727)
18. feature 17 (0.014192)
19. feature 5 (0.011808)
20. feature 3 (0.010619)
21. feature 33 (0.010006)
22. feature 31 (0.007160)
23. feature 25 (0.006673)
24. feature 23 (0.005480)
25. feature 18 (0.005155)
26. feature 21 (0.003917)
27. feature 6 (0.003500)
28. feature 9 (0.003188)
29. feature 35 (0.002527)
30. feature 10 (0.002362)
31. feature 19 (0.001857)
32. feature 36 (0.001725)
33. feature 29 (0.001715)
34. feature 28 (0.001188)
35. feature 20 (0.000872)
36. feature 37 (0.000775)
37. feature 30 (0.000121)
38. feature 26 (0.000117)
In [50]:
X_train[X_train.columns[10:16]]
Out[50]:
Fare_D Pclass_2 Pclass_3 Sex_male Title_Miss Title_Mr
62 0 0 0 1 0 1
847 0 0 1 1 0 1
511 0 0 1 1 0 1
187 0 0 0 1 0 1
833 0 0 1 1 0 1
83 0 0 0 1 0 1
417 0 1 0 0 1 0
206 0 0 1 1 0 1
5 0 0 1 1 0 1
667 0 0 1 1 0 1
114 0 0 1 0 1 0
153 0 0 1 1 0 1
400 0 0 1 1 0 1
281 0 0 1 1 0 1
509 0 0 1 1 0 1
765 0 0 0 0 0 0
812 0 1 0 1 0 1
723 0 1 0 1 0 1
180 0 0 1 0 1 0
231 0 0 1 1 0 1
438 0 0 0 1 0 1
297 0 0 0 0 1 0
469 0 0 1 0 1 0
479 0 0 1 0 1 0
201 0 0 1 1 0 1
418 0 1 0 1 0 1
634 0 0 1 0 1 0
100 0 0 1 0 1 0
122 0 1 0 1 0 1
125 0 0 1 1 0 0
... ... ... ... ... ... ...
330 0 0 1 0 1 0
885 0 0 1 0 0 0
339 0 0 0 1 0 1
725 0 0 1 1 0 1
349 0 0 1 1 0 1
359 0 0 1 0 1 0
882 0 0 1 0 1 0
14 0 0 1 0 1 0
257 0 0 0 0 1 0
57 0 0 1 1 0 1
501 0 0 1 0 1 0
612 0 0 1 0 1 0
757 0 1 0 1 0 1
109 0 0 1 0 1 0
227 0 0 1 1 0 1
579 0 0 1 1 0 1
711 0 0 0 1 0 1
707 0 0 0 1 0 1
473 0 1 0 0 0 0
311 0 0 0 0 1 0
151 0 0 0 0 0 0
120 0 1 0 1 0 1
751 0 0 1 1 0 0
84 0 1 0 0 1 0
96 0 0 0 1 0 1
571 0 0 0 0 0 0
173 0 0 1 1 0 1
753 0 0 1 1 0 1
419 0 0 1 0 1 0
788 0 0 1 1 0 0

712 rows × 6 columns

In [51]:
###y_test

print("The score for Logistic Regression is, ", l.score(X_test, y_test))
print("The score for Decision Trees is ", t.score(X_test,y_test))
print("The score for Random Forest is ", ff.score(X_test,y_test))
print("The score for Gradient Descent is ", g.score(X_test, y_test))
print("The score for AdaBoost is ", a.score(X_test, y_test))
('The score for Logistic Regression is, ', 0.77653631284916202)
('The score for Decision Trees is ', 0.75418994413407825)
('The score for Random Forest is ', 0.75977653631284914)
('The score for Gradient Descent is ', 0.77094972067039103)
('The score for AdaBoost is ', 0.75977653631284914)
In [52]:
X_train['FamilySize'].value_counts()
Out[52]:
0     437
1     128
2      81
3      20
5      18
4      13
6       7
10      6
7       2
Name: FamilySize, dtype: int64

Submission

In [53]:
editedTest['Fare_C'].value_counts()
Out[53]:
0    401
1     17
Name: Fare_C, dtype: int64
In [54]:
del editedTest['Survived']
editedTest.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 891 to 1308
Data columns (total 38 columns):
Parch            418 non-null int64
SibSp            418 non-null int64
FamilySize       418 non-null int64
Age_B            418 non-null uint8
Age_C            418 non-null uint8
Age_D            418 non-null uint8
Age_E            418 non-null uint8
Embarked_Q       418 non-null uint8
Embarked_S       418 non-null uint8
Fare_C           418 non-null uint8
Fare_D           418 non-null uint8
Pclass_2         418 non-null uint8
Pclass_3         418 non-null uint8
Sex_male         418 non-null uint8
Title_Miss       418 non-null uint8
Title_Mr         418 non-null uint8
Title_Mrs        418 non-null uint8
Age*Class_AA     418 non-null uint8
Age*Class_AAA    418 non-null uint8
Age*Class_B      418 non-null uint8
Age*Class_BB     418 non-null uint8
Age*Class_BBB    418 non-null uint8
Age*Class_C      418 non-null uint8
Age*Class_CC     418 non-null uint8
Age*Class_CCC    418 non-null uint8
Age*Class_D      418 non-null uint8
Age*Class_DD     418 non-null uint8
Age*Class_DDD    418 non-null uint8
Age*Class_E      418 non-null uint8
Age*Class_EE     418 non-null uint8
Age*Class_EEE    418 non-null uint8
Deck_B           418 non-null uint8
Deck_C           418 non-null uint8
Deck_D           418 non-null uint8
Deck_E           418 non-null uint8
Deck_F           418 non-null uint8
Deck_G           418 non-null uint8
Deck_T           418 non-null uint8
dtypes: int64(3), uint8(35)
memory usage: 24.2 KB
In [55]:
def run_this_model(mod,test):
    ret=mod.predict(test)
    return ret
In [56]:
res=pd.DataFrame(columns=['PassengerId','Survived'])
res.head(2)
res['PassengerId']=test['PassengerId']

Learning curves (adapted from kaggle tutorial,not needed at the moment):

In [57]:
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve

num_folds = 7
def plot_curve(ticks, train_scores, test_scores):
    
    import matplotlib.pyplot as plt
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.fill_between(ticks, 
                     
                    train_scores_mean - train_scores_std, 
                    train_scores_mean + train_scores_std, alpha=0.1, color="b")
    plt.fill_between(ticks, 
                     test_scores_mean - test_scores_std, 
                     test_scores_mean + test_scores_std, alpha=0.1, color="r")
    plt.plot(ticks, train_scores_mean, 'b-', label='Training score')
    plt.plot(ticks, test_scores_mean, 'r-', label='CV score')
    plt.legend()
    return plt.gca()


def plot_learning_curve(clf, X, y, scoring='accuracy'):
    
    ax = plot_curve(*learning_curve(clf, X, y, cv=num_folds, scoring=scoring, 
                                    train_sizes=np.linspace(0.1,1,10), n_jobs=-1))
    ax.set_title('Learning curve: {}'.format(clf.__class__.__name__))
    ax.set_xlabel('Training size')
    ax.set_ylabel(scoring)
    
    
#names=['l','g']

#for i in range(len(names)):
    
#    plot_learning_curve(i,X_train,y_train)
In [58]:
import numpy as np
In [59]:
#res['Survived']=np.random.randint(2, size=418)
res['Survived']=np.random.uniform(0,1)
#res['Survived']=np.zeros(418)
#res['Survived']=res.Survived.astype(int)
In [60]:
res['Survived']
Out[60]:
0      0.383936
1      0.383936
2      0.383936
3      0.383936
4      0.383936
5      0.383936
6      0.383936
7      0.383936
8      0.383936
9      0.383936
10     0.383936
11     0.383936
12     0.383936
13     0.383936
14     0.383936
15     0.383936
16     0.383936
17     0.383936
18     0.383936
19     0.383936
20     0.383936
21     0.383936
22     0.383936
23     0.383936
24     0.383936
25     0.383936
26     0.383936
27     0.383936
28     0.383936
29     0.383936
         ...   
388    0.383936
389    0.383936
390    0.383936
391    0.383936
392    0.383936
393    0.383936
394    0.383936
395    0.383936
396    0.383936
397    0.383936
398    0.383936
399    0.383936
400    0.383936
401    0.383936
402    0.383936
403    0.383936
404    0.383936
405    0.383936
406    0.383936
407    0.383936
408    0.383936
409    0.383936
410    0.383936
411    0.383936
412    0.383936
413    0.383936
414    0.383936
415    0.383936
416    0.383936
417    0.383936
Name: Survived, Length: 418, dtype: float64
In [61]:
res
Out[61]:
PassengerId Survived
0 892 0.383936
1 893 0.383936
2 894 0.383936
3 895 0.383936
4 896 0.383936
5 897 0.383936
6 898 0.383936
7 899 0.383936
8 900 0.383936
9 901 0.383936
10 902 0.383936
11 903 0.383936
12 904 0.383936
13 905 0.383936
14 906 0.383936
15 907 0.383936
16 908 0.383936
17 909 0.383936
18 910 0.383936
19 911 0.383936
20 912 0.383936
21 913 0.383936
22 914 0.383936
23 915 0.383936
24 916 0.383936
25 917 0.383936
26 918 0.383936
27 919 0.383936
28 920 0.383936
29 921 0.383936
... ... ...
388 1280 0.383936
389 1281 0.383936
390 1282 0.383936
391 1283 0.383936
392 1284 0.383936
393 1285 0.383936
394 1286 0.383936
395 1287 0.383936
396 1288 0.383936
397 1289 0.383936
398 1290 0.383936
399 1291 0.383936
400 1292 0.383936
401 1293 0.383936
402 1294 0.383936
403 1295 0.383936
404 1296 0.383936
405 1297 0.383936
406 1298 0.383936
407 1299 0.383936
408 1300 0.383936
409 1301 0.383936
410 1302 0.383936
411 1303 0.383936
412 1304 0.383936
413 1305 0.383936
414 1306 0.383936
415 1307 0.383936
416 1308 0.383936
417 1309 0.383936

418 rows × 2 columns

In [62]:
res.to_csv('Random.csv', index=False)
In [63]:
res
Out[63]:
PassengerId Survived
0 892 0.383936
1 893 0.383936
2 894 0.383936
3 895 0.383936
4 896 0.383936
5 897 0.383936
6 898 0.383936
7 899 0.383936
8 900 0.383936
9 901 0.383936
10 902 0.383936
11 903 0.383936
12 904 0.383936
13 905 0.383936
14 906 0.383936
15 907 0.383936
16 908 0.383936
17 909 0.383936
18 910 0.383936
19 911 0.383936
20 912 0.383936
21 913 0.383936
22 914 0.383936
23 915 0.383936
24 916 0.383936
25 917 0.383936
26 918 0.383936
27 919 0.383936
28 920 0.383936
29 921 0.383936
... ... ...
388 1280 0.383936
389 1281 0.383936
390 1282 0.383936
391 1283 0.383936
392 1284 0.383936
393 1285 0.383936
394 1286 0.383936
395 1287 0.383936
396 1288 0.383936
397 1289 0.383936
398 1290 0.383936
399 1291 0.383936
400 1292 0.383936
401 1293 0.383936
402 1294 0.383936
403 1295 0.383936
404 1296 0.383936
405 1297 0.383936
406 1298 0.383936
407 1299 0.383936
408 1300 0.383936
409 1301 0.383936
410 1302 0.383936
411 1303 0.383936
412 1304 0.383936
413 1305 0.383936
414 1306 0.383936
415 1307 0.383936
416 1308 0.383936
417 1309 0.383936

418 rows × 2 columns

This is repeat of the prediction above (ala .score()):

In [64]:
#for name,model in models:
#    model.predict(editedTest)

mod=[l,t,ff,g,a]
#survived=[]

for i in mod:
    survived=run_this_model(i,X_test) #X_test
    type(survived)
  #  print i.__class__.__name__
  #  print  len(y_test)
    sc=accuracy_score(survived,y_test) 
    survived=run_this_model(i,editedTest)
    res['Survived']=survived.astype(int)
    file_name='results_'+i.__class__.__name__+'_acc'+sc.astype(str)+'.csv'
    res.to_csv(file_name, index=False)
    plot_learning_curve(i,X_train,y_train)
    
#survived=l.predict(editedTest)
#survived=t.predict(editedTest)
#survived=f.predict(editedTest)
#survived=g.predict(editedTest)
#survived=a.predict(editedTest)