import pandas as pd
# Load the training and test CSVs with identical read options.
train, test = (
    pd.read_csv(csv_path, parse_dates=True)
    for csv_path in ('~/Desktop/DevMaster/Hack_day/train.csv',
                     '~/Desktop/DevMaster/Hack_day/test.csv')
)

# parse_dates=True is recommended so that any date column is parsed properly on load.
#train.info()
#train.head()


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')


from scipy import stats
from sklearn import linear_model
from sklearn import neighbors
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn import preprocessing

from math import log

train.head(2)

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

train.notnull().sum()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

test.notnull().sum()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

# Buckets for sorting the variables into numerical and categorical ones.
numericals, categoricals = [], []
# Columns and rows that are scheduled for removal.
deleteColumns, deleteRows = [], []
# Scratch lists for notes used in the later data-preparation steps.
reassess, transforms, mined, engineered = [], [], [], []

# Here is a list of the columns.
columns = train.columns
columns

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

############################
############################
############################

### Stack test underneath train so that imputation and feature engineering
### run once over all passengers.
train.tail(1)

test.tail(1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. sort=True orders the columns alphabetically, which is the
# layout the legacy append call produced. Rows coming from test have no
# 'Survived' value, so that column is NaN for them.
full_train = pd.concat([train, test], ignore_index=True, sort=True)

############################
############################
############################
############################

def age(a):
    """Bucket a numeric age into a letter band.

    The bands use inclusive upper bounds: 'A' up to 10, 'B' up to 20,
    'C' up to 40, 'D' up to 55 and 'E' up to 100. A value that fits no
    band (above 100, or NaN, which fails every comparison) yields None.
    """
    # NOTE: the original author remarked that the 'B' cut-off "should be 16".
    age_bands = ((10.0, 'A'), (20.0, 'B'), (40.0, 'C'), (55.0, 'D'), (100, 'E'))
    for upper_bound, label in age_bands:
        if a <= upper_bound:
            return label
    return None

def fare(a, bins=((200.0, 'A'), (300.0, 'C'), (600.0, 'D'))):
    """Bucket a numeric fare into a letter band.

    Parameters
    ----------
    a : number
        The fare to classify.
    bins : sequence of (upper_bound, label) pairs, optional
        Bands checked in order; the first one whose inclusive upper bound
        is >= ``a`` wins. The default reproduces the labels this function
        has always returned.

    Returns
    -------
    The matching label, or None when ``a`` fits no band (above the last
    bound, or NaN, which fails every comparison).

    NOTE(review): the previous if/elif chain tested ``a <= 200.0`` twice,
    so its 'B' branch could never run. The dead branch is dropped here
    without changing any result; if a separate low-fare band was intended,
    pass ``bins`` with the desired threshold.
    """
    for upper_bound, label in bins:
        if a <= upper_bound:
            return label
    return None

#full_train.Age.value_counts()

#train[train['Age'].isnull()]
#train[train['Embarked'].isnull()]
#train[train['Cabin'].isnull()]
#import seaborn as sns; sns.set(style="ticks", color_codes=True)

#g=sns.pairplot(train)
#d={'NaN':0}

# Per-(Sex, Pclass) statistics, named so they can be merged back as columns.
fare_median = full_train.groupby(['Sex', 'Pclass'])['Fare'].median().rename('FareMedian')

age_mean = full_train.groupby(['Sex', 'Pclass'])['Age'].mean().rename('AgeMean')

def join(df, stat):
    """Left-merge a (Sex, Pclass)-indexed statistic onto the rows of ``df``.

    ``stat`` is a named Series indexed by Sex and Pclass; the result is
    ``df`` with one extra column carrying the Series' name.
    """
    lookup = stat.to_frame()
    return df.merge(lookup, how='left', left_on=['Sex', 'Pclass'], right_index=True)

# Fill gaps in Fare and Age with the statistic of the passenger's (Sex, Pclass) group.
full_train['Fare'] = full_train['Fare'].fillna(join(full_train, fare_median)['FareMedian'])
full_train['Age'] = full_train['Age'].fillna(join(full_train, age_mean)['AgeMean'])

#a=full_train['Age'].mean()
#b=train['Cabin'].mean()
#full_train.Age.fillna(a,inplace=True)
#train['Embarked'].value_counts()
# Missing embarkation ports default to 'S'. The result is assigned back to
# the column: an inplace fillna on an attribute-accessed column operates on
# an intermediate object and is not guaranteed to reach the frame under
# pandas' copy-on-write behaviour.
full_train['Embarked'] = full_train['Embarked'].fillna('S')

#train['Embarked'].mean()
#train['Cabin'].mean()
#train['Age'].fillna(4)

# Replace the numeric Age and Fare values by their letter bands.
full_train['Age'] = full_train['Age'].apply(age)

full_train['Fare'] = full_train['Fare'].apply(fare)

# Age and Fare now hold strings, so restrict the mean to numeric columns;
# pandas 2.x raises on object columns instead of silently skipping them.
full_train.groupby(['Sex']).mean(numeric_only=True)

#sns.pairplot(train) 
#,x_vars=['Age','Embarked'],y_vars=['Age','Embarked']

#full_train['Fare'].fillna(train['Fare'].median(), inplace = True)

FEATURE ENGINEERING

######################################


import string

def subst_strings(single, whole_list):
    """Return the first entry of ``whole_list`` that occurs inside ``single``.

    Parameters
    ----------
    single : str
        The text to search in.
    whole_list : iterable of str
        Candidate substrings, tried in order; the first hit wins, so more
        specific candidates must come before candidates they contain.

    Returns
    -------
    The matching candidate, or ``np.nan`` when none of them occurs.
    """
    for single_str in whole_list:
        # The `in` operator replaces string.find(), which exists only in
        # the Python 2 `string` module.
        if single_str in single:
            return single_str
    return np.nan
    
    
    
# Titles to look for in the passenger name. Matching is by substring and the
# first hit wins, so 'Mrs' has to be listed before 'Mr'.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
    'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer',
]

full_train['Title'] = full_train['Name'].apply(subst_strings, args=(title_list,))


def replace_titles(x):
    """Collapse a passenger's raw title into 'Mr', 'Mrs', 'Miss' or itself.

    Parameters
    ----------
    x : mapping with 'Title' and 'Sex' entries (e.g. a DataFrame row).

    Returns
    -------
    'Mr' for Don/Major/Capt/Jonkheer/Rev/Col, 'Mrs' for Countess/Mme,
    'Miss' for Mlle/Ms, 'Mr' or 'Mrs' for Dr depending on sex, and the
    title unchanged in every other case.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    if title in ['Countess', 'Mme']:
        return 'Mrs'
    if title in ['Mlle', 'Ms']:
        return 'Miss'
    if title == 'Dr':
        # The Sex column stores lowercase 'male'/'female'; the former
        # comparison against 'Male' never matched, so every doctor was
        # mapped to 'Mrs'. Compare case-insensitively.
        if str(x['Sex']).lower() == 'male':
            return 'Mr'
        return 'Mrs'
    return title

    
   
# Normalise the rare titles row by row.
full_train['Title'] = full_train.apply(replace_titles, axis=1)
# Age already holds its letter band here, so multiplying by the integer
# class repeats the letter (e.g. 'A' * 3 -> 'AAA').
full_train['Age*Class'] = full_train['Age'] * full_train['Pclass']

###########################################



# Relatives travelling with the passenger (siblings/spouses plus parents/children).
full_train['FamilySize'] = full_train['SibSp'] + full_train['Parch']

# Deck letters searched for in the cabin string; the first match wins.
# NOTE(review): a missing cabin is stringified to 'nan', which the 'NaN'
# entry never matches, so those rows come back as NaN and are filled below.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'NaN']

# Unknown decks default to 'C'. The fill is part of the assigned expression
# rather than an inplace call on an attribute-accessed column, which is not
# guaranteed to reach the frame under pandas' copy-on-write behaviour.
full_train['Deck'] = (
    full_train['Cabin']
    .map(lambda x: subst_strings(str(x), cabin_list))
    .fillna('C')
)

#full_train.Deck.value_counts()

full_train['Deck']

0       C
1       C
2       C
3       C
4       C
5       C
6       E
7       C
8       C
9       C
10      G
11      C
12      C
13      C
14      C
15      C
16      C
17      C
18      C
19      C
20      C
21      D
22      C
23      A
24      C
25      C
26      C
27      C
28      C
29      C
       ..
1279    C
1280    C
1281    B
1282    D
1283    C
1284    C
1285    C
1286    C
1287    C
1288    B
1289    C
1290    C
1291    C
1292    C
1293    C
1294    C
1295    D
1296    D
1297    C
1298    C
1299    C
1300    C
1301    C
1302    C
1303    C
1304    C
1305    C
1306    C
1307    C
1308    C
Name: Deck, Length: 1309, dtype: object

full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
Age            1309 non-null object
Cabin          295 non-null object
Embarked       1309 non-null object
Fare           1309 non-null object
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Title          1309 non-null object
Age*Class      1309 non-null object
FamilySize     1309 non-null int64
Deck           1309 non-null object
dtypes: float64(1), int64(5), object(10)
memory usage: 163.7+ KB

# Register the identifier-like columns for removal. The membership check
# keeps the list free of duplicates when this cell is executed again.
deleteColumns.extend(
    name for name in ('PassengerId', 'Cabin') if name not in deleteColumns
)

# Drop the registered columns together with the free-text Ticket and Name
# columns in one call. drop() returns a new frame, so full_train stays
# intact, and no loop variable overwrites the module-level `columns`.
afterDelete = full_train.drop(deleteColumns + ['Ticket', 'Name'], axis=1)

# Treat the passenger class as a categorical label rather than a number.
afterDelete['Pclass'] = afterDelete['Pclass'].astype('str')

afterDelete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age           1309 non-null object
Embarked      1309 non-null object
Fare          1309 non-null object
Parch         1309 non-null int64
Pclass        1309 non-null object
Sex           1309 non-null object
SibSp         1309 non-null int64
Survived      891 non-null float64
Title         1309 non-null object
Age*Class     1309 non-null object
FamilySize    1309 non-null int64
Deck          1309 non-null object
dtypes: float64(1), int64(3), object(8)
memory usage: 122.8+ KB

#afterDelete['Deck']

afterDelete.head(2)
afterDelete.nunique()
afterDelete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age           1309 non-null object
Embarked      1309 non-null object
Fare          1309 non-null object
Parch         1309 non-null int64
Pclass        1309 non-null object
Sex           1309 non-null object
SibSp         1309 non-null int64
Survived      891 non-null float64
Title         1309 non-null object
Age*Class     1309 non-null object
FamilySize    1309 non-null int64
Deck          1309 non-null object
dtypes: float64(1), int64(3), object(8)
memory usage: 122.8+ KB

Xy.columns

Index([u'Parch', u'SibSp', u'Survived', u'FamilySize', u'Age_B', u'Age_C',
       u'Age_D', u'Age_E', u'Embarked_Q', u'Embarked_S', u'Fare_C', u'Fare_D',
       u'Pclass_2', u'Pclass_3', u'Sex_male', u'Title_Miss', u'Title_Mr',
       u'Title_Mrs', u'Age*Class_AA', u'Age*Class_AAA', u'Age*Class_B',
       u'Age*Class_BB', u'Age*Class_BBB', u'Age*Class_C', u'Age*Class_CC',
       u'Age*Class_CCC', u'Age*Class_D', u'Age*Class_DD', u'Age*Class_DDD',
       u'Age*Class_E', u'Age*Class_EE', u'Age*Class_EEE', u'Deck_B', u'Deck_C',
       u'Deck_D', u'Deck_E', u'Deck_F', u'Deck_G', u'Deck_T'],
      dtype='object')

Dummies

Sex PClass Ticket

# Turn the categorical columns into indicator columns; drop_first leaves out
# the first level of each feature.
Xy = pd.get_dummies(data=afterDelete, drop_first=True)
Xy.head(2)

Xy.head(2)

Final Step

len(Xy),len(train),len(test)

(1309, 891, 418)

print('split at',len(train))

('split at', 891)

# Split the combined frame back at the train/test boundary. The boundary is
# taken from len(train) instead of a hard-coded 891, and each part is copied
# so that later column deletions do not act on a slice of Xy.
editedTrain = Xy.iloc[:len(train)].copy()
editedTest = Xy.iloc[len(train):].copy()

len(editedTrain), len(editedTest)

(891, 418)

# Target vector and feature matrix for the labelled rows.
y = editedTrain['Survived']
X = editedTrain.drop('Survived', axis=1)

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split as tts
# Hold out 20% of the labelled rows for scoring; the seed fixes the split.
X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=44)

/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

################################
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier


# One estimator per model family, all with default hyper-parameters.
# NOTE(review): the name `log` rebinds the `log` imported from math near the
# top of the file.
log = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
gradient = GradientBoostingClassifier()
ada = AdaBoostClassifier()




#################################

# Earlier list-based variant, kept for reference:
#models = []
#models.append(('l', LogisticRegression()))
#models.append(('t', DecisionTreeClassifier()))
#models.append(('f', RandomForestClassifier()))
#models.append(('g', GradientBoostingClassifier()))
#models.append(('a', AdaBoostClassifier()))




# Fit every model on the training split. scikit-learn's fit() returns the
# fitted estimator, so l, t, ff, g and a refer to the objects created above.
l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
ff = forest.fit(X_train, y_train)
# Per-feature importances of the fitted forest, in X_train column order.
importances=forest.feature_importances_
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)

NEW NEW NEW NEW¶

X_train.head(3)

type(importances)

numpy.ndarray

len(forest.feature_importances_)

38

len(X_train.columns)

38

afterDelete.columns

Index([u'Age', u'Embarked', u'Fare', u'Parch', u'Pclass', u'Sex', u'SibSp',
       u'Survived', u'Title', u'Age*Class', u'FamilySize', u'Deck'],
      dtype='object')

# Positions of the features ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Importance ('x') next to the feature name ('y'), sorted ascending and
# indexed by the feature name, then drawn as a horizontal bar chart.
feat = pd.DataFrame({'x': forest.feature_importances_, 'y': X_train.columns})
feat = feat.sort_values(by='x')
feat = feat.set_index(feat['y'])

feat.plot(kind='barh', figsize=(10, 10))

<matplotlib.axes._subplots.AxesSubplot at 0x7fa015e45890>

'''
plt.bar(range(X_train.shape[1]),importances[indices],align="center")#, color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]),indices)
plt.xlim([-1,X_train.shape[1]])
plt.show()
'''

'\nplt.bar(range(X_train.shape[1]),importances[indices],align="center")#, color="r", yerr=std[indices], align="center")\nplt.xticks(range(X_train.shape[1]),indices)\nplt.xlim([-1,X_train.shape[1]])\nplt.show()\n'

#range(X_train.shape[1])

# Ranked listing: position, column index and importance, most important first.
for f in range(X.shape[1]):
    feature_idx = indices[f]
    weight = importances[feature_idx]
    print("%d. feature %d (%f)" % (f + 1, feature_idx, weight))

1. feature 13 (0.204818)
2. feature 15 (0.130891)
3. feature 14 (0.067256)
4. feature 1 (0.064559)
5. feature 2 (0.057337)
6. feature 12 (0.055584)
7. feature 22 (0.051394)
8. feature 16 (0.050875)
9. feature 8 (0.048219)
10. feature 32 (0.035121)
11. feature 0 (0.034542)
12. feature 34 (0.021379)
13. feature 24 (0.019956)
14. feature 27 (0.016990)
15. feature 7 (0.016420)
16. feature 11 (0.014977)
17. feature 4 (0.014727)
18. feature 17 (0.014192)
19. feature 5 (0.011808)
20. feature 3 (0.010619)
21. feature 33 (0.010006)
22. feature 31 (0.007160)
23. feature 25 (0.006673)
24. feature 23 (0.005480)
25. feature 18 (0.005155)
26. feature 21 (0.003917)
27. feature 6 (0.003500)
28. feature 9 (0.003188)
29. feature 35 (0.002527)
30. feature 10 (0.002362)
31. feature 19 (0.001857)
32. feature 36 (0.001725)
33. feature 29 (0.001715)
34. feature 28 (0.001188)
35. feature 20 (0.000872)
36. feature 37 (0.000775)
37. feature 30 (0.000121)
38. feature 26 (0.000117)

X_train[X_train.columns[10:16]]

###y_test

# Hold-out accuracy of every fitted model.
print("The score for Logistic Regression is, ", l.score(X_test, y_test))
print("The score for Decision Trees is ", t.score(X_test,y_test))
print("The score for Random Forest is ", ff.score(X_test,y_test))
# `g` is the GradientBoostingClassifier, so the label says "Boosting"
# (it used to read "Gradient Descent").
print("The score for Gradient Boosting is ", g.score(X_test, y_test))
print("The score for AdaBoost is ", a.score(X_test, y_test))

('The score for Logistic Regression is, ', 0.77653631284916202)
('The score for Decision Trees is ', 0.75418994413407825)
('The score for Random Forest is ', 0.75977653631284914)
('The score for Gradient Descent is ', 0.77094972067039103)
('The score for AdaBoost is ', 0.75977653631284914)

X_train['FamilySize'].value_counts()

0     437
1     128
2      81
3      20
5      18
4      13
6       7
10      6
7       2
Name: FamilySize, dtype: int64

Submission

editedTest['Fare_C'].value_counts()

0    401
1     17
Name: Fare_C, dtype: int64

# The test rows carry no label, so remove the (all-NaN) 'Survived' column.
# drop() builds a new frame instead of deleting from a slice of Xy, and
# errors='ignore' lets this cell be executed more than once.
editedTest = editedTest.drop('Survived', axis=1, errors='ignore')
editedTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 891 to 1308
Data columns (total 38 columns):
Parch            418 non-null int64
SibSp            418 non-null int64
FamilySize       418 non-null int64
Age_B            418 non-null uint8
Age_C            418 non-null uint8
Age_D            418 non-null uint8
Age_E            418 non-null uint8
Embarked_Q       418 non-null uint8
Embarked_S       418 non-null uint8
Fare_C           418 non-null uint8
Fare_D           418 non-null uint8
Pclass_2         418 non-null uint8
Pclass_3         418 non-null uint8
Sex_male         418 non-null uint8
Title_Miss       418 non-null uint8
Title_Mr         418 non-null uint8
Title_Mrs        418 non-null uint8
Age*Class_AA     418 non-null uint8
Age*Class_AAA    418 non-null uint8
Age*Class_B      418 non-null uint8
Age*Class_BB     418 non-null uint8
Age*Class_BBB    418 non-null uint8
Age*Class_C      418 non-null uint8
Age*Class_CC     418 non-null uint8
Age*Class_CCC    418 non-null uint8
Age*Class_D      418 non-null uint8
Age*Class_DD     418 non-null uint8
Age*Class_DDD    418 non-null uint8
Age*Class_E      418 non-null uint8
Age*Class_EE     418 non-null uint8
Age*Class_EEE    418 non-null uint8
Deck_B           418 non-null uint8
Deck_C           418 non-null uint8
Deck_D           418 non-null uint8
Deck_E           418 non-null uint8
Deck_F           418 non-null uint8
Deck_G           418 non-null uint8
Deck_T           418 non-null uint8
dtypes: int64(3), uint8(35)
memory usage: 24.2 KB

def run_this_model(mod, test):
    """Return the predictions of the fitted model ``mod`` for the rows in ``test``."""
    return mod.predict(test)

# Submission frame with the two columns that are written out by the to_csv
# calls later in the file.
res=pd.DataFrame(columns=['PassengerId','Survived'])
res.head(2)
# Assigning the Series gives res one row per test passenger; 'Survived' is
# filled in further down.
res['PassengerId']=test['PassengerId']

Learning curves (adapted from a Kaggle tutorial; not needed at the moment):

from sklearn.model_selection import cross_val_score, learning_curve, validation_curve

num_folds = 7
def plot_curve(ticks, train_scores, test_scores):
    """Plot mean training and CV scores with a one-standard-deviation band.

    ``train_scores`` and ``test_scores`` are 2-D score arrays that are
    averaged along axis 1, giving one point per entry of ``ticks``.
    Returns the current matplotlib axes.
    """
    import matplotlib.pyplot as plt

    # (mean, std) along axis 1 for the training scores, then the CV scores.
    summaries = []
    for scores in (train_scores, test_scores):
        summaries.append((np.mean(scores, axis=1), np.std(scores, axis=1)))
    (train_mean, train_std), (cv_mean, cv_std) = summaries

    plt.figure()
    # Shaded bands first, then the mean lines on top of them.
    plt.fill_between(ticks, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color="b")
    plt.fill_between(ticks, cv_mean - cv_std, cv_mean + cv_std,
                     alpha=0.1, color="r")
    plt.plot(ticks, train_mean, 'b-', label='Training score')
    plt.plot(ticks, cv_mean, 'r-', label='CV score')
    plt.legend()
    return plt.gca()


def plot_learning_curve(clf, X, y, scoring='accuracy'):
    """Draw the learning curve of ``clf`` on ``X``/``y`` for the given metric.

    Scores are computed with ``num_folds``-fold cross-validation at ten
    training-set fractions from 0.1 to 1, and the resulting axes are
    titled with the estimator's class name.
    """
    sizes, train_scores, cv_scores = learning_curve(
        clf, X, y, cv=num_folds, scoring=scoring,
        train_sizes=np.linspace(0.1, 1, 10), n_jobs=-1)
    ax = plot_curve(sizes, train_scores, cv_scores)
    ax.set_title('Learning curve: {}'.format(clf.__class__.__name__))
    ax.set_xlabel('Training size')
    ax.set_ylabel(scoring)
    
    
#names=['l','g']

#for i in range(len(names)):
    
#    plot_learning_curve(i,X_train,y_train)

import numpy as np

# Random baseline: draw one uniform value per row. Without `size`,
# uniform() returns a single scalar that pandas broadcasts, which gives
# every passenger the same value.
# Alternatives tried earlier: np.random.randint(2, size=...) for 0/1 labels,
# or np.zeros(...) for an all-zero baseline.
res['Survived'] = np.random.uniform(0, 1, size=len(res))

res['Survived']

0      0.383936
1      0.383936
2      0.383936
3      0.383936
4      0.383936
5      0.383936
6      0.383936
7      0.383936
8      0.383936
9      0.383936
10     0.383936
11     0.383936
12     0.383936
13     0.383936
14     0.383936
15     0.383936
16     0.383936
17     0.383936
18     0.383936
19     0.383936
20     0.383936
21     0.383936
22     0.383936
23     0.383936
24     0.383936
25     0.383936
26     0.383936
27     0.383936
28     0.383936
29     0.383936
         ...   
388    0.383936
389    0.383936
390    0.383936
391    0.383936
392    0.383936
393    0.383936
394    0.383936
395    0.383936
396    0.383936
397    0.383936
398    0.383936
399    0.383936
400    0.383936
401    0.383936
402    0.383936
403    0.383936
404    0.383936
405    0.383936
406    0.383936
407    0.383936
408    0.383936
409    0.383936
410    0.383936
411    0.383936
412    0.383936
413    0.383936
414    0.383936
415    0.383936
416    0.383936
417    0.383936
Name: Survived, Length: 418, dtype: float64

res

res.to_csv('Random.csv', index=False)

res

This is a repeat of the prediction above (à la .score()):

#for name,model in models:
#    model.predict(editedTest)

# Fitted models, in the order they were trained.
mod = [l, t, ff, g, a]

for i in mod:
    # Hold-out accuracy, used to tag the output file name.
    # accuracy_score expects (y_true, y_pred).
    sc = accuracy_score(y_test, run_this_model(i, X_test))
    # Predictions for the unlabelled test passengers, as integer 0/1 labels.
    survived = run_this_model(i, editedTest)
    res['Survived'] = survived.astype(int)
    # str() handles a plain Python float as well as a NumPy scalar, whereas
    # .astype exists only on NumPy values.
    file_name = 'results_' + i.__class__.__name__ + '_acc' + str(sc) + '.csv'
    res.to_csv(file_name, index=False)
    plot_learning_curve(i, X_train, y_train)
    
#survived=l.predict(editedTest)
#survived=t.predict(editedTest)
#survived=f.predict(editedTest)
#survived=g.predict(editedTest)
#survived=a.predict(editedTest)

	Parch	PassengerId	Pclass	SibSp	Survived
Sex
female	0.633047	648.186695	2.154506	0.652361	0.742038
male	0.247924	658.766311	2.372479	0.413998	0.188908

	Fare_D	Pclass_2	Pclass_3	Sex_male	Title_Miss	Title_Mr
62	0	0	0	1	0	1
847	0	0	1	1	0	1
511	0	0	1	1	0	1
187	0	0	0	1	0	1
833	0	0	1	1	0	1
83	0	0	0	1	0	1
417	0	1	0	0	1	0
206	0	0	1	1	0	1
5	0	0	1	1	0	1
667	0	0	1	1	0	1
114	0	0	1	0	1	0
153	0	0	1	1	0	1
400	0	0	1	1	0	1
281	0	0	1	1	0	1
509	0	0	1	1	0	1
765	0	0	0	0	0	0
812	0	1	0	1	0	1
723	0	1	0	1	0	1
180	0	0	1	0	1	0
231	0	0	1	1	0	1
438	0	0	0	1	0	1
297	0	0	0	0	1	0
469	0	0	1	0	1	0
479	0	0	1	0	1	0
201	0	0	1	1	0	1
418	0	1	0	1	0	1
634	0	0	1	0	1	0
100	0	0	1	0	1	0
122	0	1	0	1	0	1
125	0	0	1	1	0	0
...	...	...	...	...	...	...
330	0	0	1	0	1	0
885	0	0	1	0	0	0
339	0	0	0	1	0	1
725	0	0	1	1	0	1
349	0	0	1	1	0	1
359	0	0	1	0	1	0
882	0	0	1	0	1	0
14	0	0	1	0	1	0
257	0	0	0	0	1	0
57	0	0	1	1	0	1
501	0	0	1	0	1	0
612	0	0	1	0	1	0
757	0	1	0	1	0	1
109	0	0	1	0	1	0
227	0	0	1	1	0	1
579	0	0	1	1	0	1
711	0	0	0	1	0	1
707	0	0	0	1	0	1
473	0	1	0	0	0	0
311	0	0	0	0	1	0
151	0	0	0	0	0	0
120	0	1	0	1	0	1
751	0	0	1	1	0	0
84	0	1	0	0	1	0
96	0	0	0	1	0	1
571	0	0	0	0	0	0
173	0	0	1	1	0	1
753	0	0	1	1	0	1
419	0	0	1	0	1	0
788	0	0	1	1	0	0

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C

	SibSp	FamilySize	Age_C	Age_D	Embarked_S	...	Deck_C
62	1	1	0	1	1	...	1
847	0	0	1	0	0	...	1
511	0	0	1	0	1	...	1

	Fare_D	Pclass_2	Pclass_3	Sex_male	Title_Miss	Title_Mr
62	0	0	0	1	0	1
847	0	0	1	1	0	1
511	0	0	1	1	0	1
187	0	0	0	1	0	1
833	0	0	1	1	0	1
83	0	0	0	1	0	1
417	0	1	0	0	1	0
206	0	0	1	1	0	1
5	0	0	1	1	0	1
667	0	0	1	1	0	1
114	0	0	1	0	1	0
153	0	0	1	1	0	1
400	0	0	1	1	0	1
281	0	0	1	1	0	1
509	0	0	1	1	0	1
765	0	0	0	0	0	0
812	0	1	0	1	0	1
723	0	1	0	1	0	1
180	0	0	1	0	1	0
231	0	0	1	1	0	1
438	0	0	0	1	0	1
297	0	0	0	0	1	0
469	0	0	1	0	1	0
479	0	0	1	0	1	0
201	0	0	1	1	0	1
418	0	1	0	1	0	1
634	0	0	1	0	1	0
100	0	0	1	0	1	0
122	0	1	0	1	0	1
125	0	0	1	1	0	0
...	...	...	...	...	...	...
330	0	0	1	0	1	0
885	0	0	1	0	0	0
339	0	0	0	1	0	1
725	0	0	1	1	0	1
349	0	0	1	1	0	1
359	0	0	1	0	1	0
882	0	0	1	0	1	0
14	0	0	1	0	1	0
257	0	0	0	0	1	0
57	0	0	1	1	0	1
501	0	0	1	0	1	0
612	0	0	1	0	1	0
757	0	1	0	1	0	1
109	0	0	1	0	1	0
227	0	0	1	1	0	1
579	0	0	1	1	0	1
711	0	0	0	1	0	1
707	0	0	0	1	0	1
473	0	1	0	0	0	0
311	0	0	0	0	1	0
151	0	0	0	0	0	0
120	0	1	0	1	0	1
751	0	0	1	1	0	0
84	0	1	0	0	1	0
96	0	0	0	1	0	1
571	0	0	0	0	0	0
173	0	0	1	1	0	1
753	0	0	1	1	0	1
419	0	0	1	0	1	0
788	0	0	1	1	0	0

	PassengerId	Survived
0	892	0.383936
1	893	0.383936
2	894	0.383936
3	895	0.383936
4	896	0.383936
5	897	0.383936
6	898	0.383936
7	899	0.383936
8	900	0.383936
9	901	0.383936
10	902	0.383936
11	903	0.383936
12	904	0.383936
13	905	0.383936
14	906	0.383936
15	907	0.383936
16	908	0.383936
17	909	0.383936
18	910	0.383936
19	911	0.383936
20	912	0.383936
21	913	0.383936
22	914	0.383936
23	915	0.383936
24	916	0.383936
25	917	0.383936
26	918	0.383936
27	919	0.383936
28	920	0.383936
29	921	0.383936
...	...	...
388	1280	0.383936
389	1281	0.383936
390	1282	0.383936
391	1283	0.383936
392	1284	0.383936
393	1285	0.383936
394	1286	0.383936
395	1287	0.383936
396	1288	0.383936
397	1289	0.383936
398	1290	0.383936
399	1291	0.383936
400	1292	0.383936
401	1293	0.383936
402	1294	0.383936
403	1295	0.383936
404	1296	0.383936
405	1297	0.383936
406	1298	0.383936
407	1299	0.383936
408	1300	0.383936
409	1301	0.383936
410	1302	0.383936
411	1303	0.383936
412	1304	0.383936
413	1305	0.383936
414	1306	0.383936
415	1307	0.383936
416	1308	0.383936
417	1309	0.383936

	SibSp	FamilySize	Age_C	Age_D	Embarked_S	...	Deck_C
62	1	1	0	1	1	...	1
847	0	0	1	0	0	...	1
511	0	0	1	0	1	...	1

	Fare_D	Pclass_2	Pclass_3	Sex_male	Title_Miss	Title_Mr
62	0	0	0	1	0	1
847	0	0	1	1	0	1
511	0	0	1	1	0	1
187	0	0	0	1	0	1
833	0	0	1	1	0	1
83	0	0	0	1	0	1
417	0	1	0	0	1	0
206	0	0	1	1	0	1
5	0	0	1	1	0	1
667	0	0	1	1	0	1
114	0	0	1	0	1	0
153	0	0	1	1	0	1
400	0	0	1	1	0	1
281	0	0	1	1	0	1
509	0	0	1	1	0	1
765	0	0	0	0	0	0
812	0	1	0	1	0	1
723	0	1	0	1	0	1
180	0	0	1	0	1	0
231	0	0	1	1	0	1
438	0	0	0	1	0	1
297	0	0	0	0	1	0
469	0	0	1	0	1	0
479	0	0	1	0	1	0
201	0	0	1	1	0	1
418	0	1	0	1	0	1
634	0	0	1	0	1	0
100	0	0	1	0	1	0
122	0	1	0	1	0	1
125	0	0	1	1	0	0
...	...	...	...	...	...	...
330	0	0	1	0	1	0
885	0	0	1	0	0	0
339	0	0	0	1	0	1
725	0	0	1	1	0	1
349	0	0	1	1	0	1
359	0	0	1	0	1	0
882	0	0	1	0	1	0
14	0	0	1	0	1	0
257	0	0	0	0	1	0
57	0	0	1	1	0	1
501	0	0	1	0	1	0
612	0	0	1	0	1	0
757	0	1	0	1	0	1
109	0	0	1	0	1	0
227	0	0	1	1	0	1
579	0	0	1	1	0	1
711	0	0	0	1	0	1
707	0	0	0	1	0	1
473	0	1	0	0	0	0
311	0	0	0	0	1	0
151	0	0	0	0	0	0
120	0	1	0	1	0	1
751	0	0	1	1	0	0
84	0	1	0	0	1	0
96	0	0	0	1	0	1
571	0	0	0	0	0	0
173	0	0	1	1	0	1
753	0	0	1	1	0	1
419	0	0	1	0	1	0
788	0	0	1	1	0	0

	SibSp	FamilySize	Age_C	Age_D	Embarked_S	...	Deck_C
62	1	1	0	1	1	...	1
847	0	0	1	0	0	...	1
511	0	0	1	0	1	...	1

	Fare_D	Pclass_2	Pclass_3	Sex_male	Title_Miss	Title_Mr
62	0	0	0	1	0	1
847	0	0	1	1	0	1
511	0	0	1	1	0	1
187	0	0	0	1	0	1
833	0	0	1	1	0	1
83	0	0	0	1	0	1
417	0	1	0	0	1	0
206	0	0	1	1	0	1
5	0	0	1	1	0	1
667	0	0	1	1	0	1
114	0	0	1	0	1	0
153	0	0	1	1	0	1
400	0	0	1	1	0	1
281	0	0	1	1	0	1
509	0	0	1	1	0	1
765	0	0	0	0	0	0
812	0	1	0	1	0	1
723	0	1	0	1	0	1
180	0	0	1	0	1	0
231	0	0	1	1	0	1
438	0	0	0	1	0	1
297	0	0	0	0	1	0
469	0	0	1	0	1	0
479	0	0	1	0	1	0
201	0	0	1	1	0	1
418	0	1	0	1	0	1
634	0	0	1	0	1	0
100	0	0	1	0	1	0
122	0	1	0	1	0	1
125	0	0	1	1	0	0
...	...	...	...	...	...	...
330	0	0	1	0	1	0
885	0	0	1	0	0	0
339	0	0	0	1	0	1
725	0	0	1	1	0	1
349	0	0	1	1	0	1
359	0	0	1	0	1	0
882	0	0	1	0	1	0
14	0	0	1	0	1	0
257	0	0	0	0	1	0
57	0	0	1	1	0	1
501	0	0	1	0	1	0
612	0	0	1	0	1	0
757	0	1	0	1	0	1
109	0	0	1	0	1	0
227	0	0	1	1	0	1
579	0	0	1	1	0	1
711	0	0	0	1	0	1
707	0	0	0	1	0	1
473	0	1	0	0	0	0
311	0	0	0	0	1	0
151	0	0	0	0	0	0
120	0	1	0	1	0	1
751	0	0	1	1	0	0
84	0	1	0	0	1	0
96	0	0	0	1	0	1
571	0	0	0	0	0	0
173	0	0	1	1	0	1
753	0	0	1	1	0	1
419	0	0	1	0	1	0
788	0	0	1	1	0	0