import pandas as pd
train = pd.read_csv('~/Desktop/DevMaster/Hack_day/train.csv')
test = pd.read_csv('~/Desktop/DevMaster/Hack_day/test.csv')
# (parse_dates is unnecessary here: the Titanic CSVs contain no date columns)
#train.info()
#train.head()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn import linear_model
from sklearn import neighbors
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn import preprocessing
train.head(2)
train.info()
train.notnull().sum()
test.notnull().sum()
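# A quick hedged sketch: the fraction of missing values per column, which
# is what motivates the imputation steps further down.
train.isnull().mean().sort_values(ascending=False)
test.isnull().mean().sort_values(ascending=False)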
# Empty lists to track which variables are numerical vs. categorical.
numericals = []
categoricals = []
# Set an empty list so we can determine which columns to delete.
deleteColumns = []
deleteRows = []
# Empty lists for notes to revisit in the later data-preparation steps.
reassess = []
transforms = []
mined = []
engineered = []
# Here is a list of the columns.
columns = train.columns
columns
############################
############################
############################
### APPEND test to train
train.tail(1)
#train.nunique()
test.tail(1)
full_train = pd.concat([train, test], ignore_index=True)  # DataFrame.append was deprecated and removed; concat is the supported path
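# Sanity check (a quick sketch): the combined frame should have
# len(train) + len(test) rows.
print(len(full_train), len(train) + len(test))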
############################
############################
############################
############################
def age(a):
    if a <= 10.0:
        return 'A'
    elif a <= 20.0:  # arguably 16 would be a better cut-off
        return 'B'
    elif a <= 40.0:
        return 'C'
    elif a <= 55.0:
        return 'D'
    elif a <= 100:
        return 'E'
def fare(a):
    if a <= 100.0:  # the original threshold here was 200.0, which made the 'B' branch unreachable; 100 is an assumed fix
        return 'A'
    elif a <= 200.0:
        return 'B'
    elif a <= 300.0:
        return 'C'
    elif a <= 600.0:
        return 'D'
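# Illustrative sanity check on the binning helpers (input values are made up):
print(age(8), age(30), fare(50.0), fare(250.0))  # expect: A C A C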
#full_train.Age.value_counts()
#train[train['Age'].isnull()]
#train[train['Embarked'].isnull()]
#train[train['Cabin'].isnull()]
#import seaborn as sns; sns.set(style="ticks", color_codes=True)
#g=sns.pairplot(train)
#d={'NaN':0}
fare_median = full_train.groupby(['Sex', 'Pclass']).Fare.median()
fare_median.name = 'FareMedian'
age_mean = full_train.groupby(['Sex', 'Pclass']).Age.mean()
age_mean.name = 'AgeMean'
def join(df, stat):
    return pd.merge(df, stat.to_frame(), left_on=['Sex', 'Pclass'], right_index=True, how='left')
full_train['Fare'] = full_train.Fare.fillna(join(full_train, fare_median).FareMedian)
full_train['Age'] = full_train.Age.fillna(join(full_train, age_mean).AgeMean)
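# Equivalent, arguably more idiomatic one-liners (a sketch, not run here):
# groupby().transform() aligns each group's statistic back onto the original
# index, so fillna needs no merge.
#full_train['Fare'] = full_train['Fare'].fillna(full_train.groupby(['Sex', 'Pclass'])['Fare'].transform('median'))
#full_train['Age'] = full_train['Age'].fillna(full_train.groupby(['Sex', 'Pclass'])['Age'].transform('mean'))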
#a=full_train['Age'].mean()
#b=train['Cabin'].mean()
#full_train.Age.fillna(a,inplace=True)
#train['Embarked'].value_counts()
full_train.Embarked.fillna('S',inplace=True)
#train['Embarked'].mean()
#train['Cabin'].mean()
#train['Age'].fillna(4)
full_train.Age = full_train.Age.apply(age)
full_train.Fare = full_train.Fare.apply(fare)
full_train.groupby(['Sex']).mean(numeric_only=True)  # numeric_only needed now that Age/Fare are letter buckets
#sns.pairplot(train)
#,x_vars=['Age','Embarked'],y_vars=['Age','Embarked']
#full_train['Fare'].fillna(train['Fare'].median(), inplace = True)
######################################
def subst_strings(single, whole_list):
    # return the first candidate substring found in `single`, else NaN
    # (string.find() was Python 2; str.find() is the Python 3 equivalent)
    for single_str in whole_list:
        if single.find(single_str) != -1:
            return single_str
    return np.nan
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
'Don', 'Jonkheer']
full_train['Title']=full_train['Name'].map(lambda x: subst_strings(x,title_list))
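# A regex alternative (sketch): Titanic names follow "Last, Title. First",
# so the title can be pulled directly with str.extract.
#full_train['Title'] = full_train['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)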
def replace_titles(x):
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if x['Sex'] == 'male':  # Sex is lowercase in the data; 'Male' never matched
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
# NOTE: Age was already binned to letters above, so this repeats the letter
# Pclass times (e.g. 'C' * 3 -> 'CCC'); get_dummies later one-hot encodes it.
full_train['Age*Class'] = full_train['Age'] * full_train['Pclass']
full_train['Title']=full_train.apply(replace_titles,axis=1)
###########################################
full_train['FamilySize']=full_train['SibSp']+full_train['Parch']
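# A common variant (an assumption, not used here) counts the passenger too:
#full_train['FamilySize'] = full_train['SibSp'] + full_train['Parch'] + 1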
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'NaN']
full_train['Deck']=full_train['Cabin'].map(lambda x: subst_strings(str(x), cabin_list))
full_train.Deck.fillna('C',inplace=True)
#full_train.Deck.value_counts()
full_train['Deck']
full_train.info()
deleteColumns.append('PassengerId')
deleteColumns.append('Cabin')
#deleteColumns.append('Age')
afterDelete=full_train.copy()
for col in deleteColumns:  # avoid shadowing the `columns` variable from above
    del afterDelete[col]
#del Xy['PassengerId']
del afterDelete['Ticket']
del afterDelete['Name']
afterDelete['Pclass']=afterDelete['Pclass'].astype('str')
afterDelete.info()
#afterDelete['Deck']
afterDelete.head(2)
afterDelete.nunique()
afterDelete.info()
Xy = pd.get_dummies(afterDelete, drop_first = True)
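# What drop_first does, on a toy frame (illustrative): the first level of
# each categorical becomes the implicit baseline, avoiding collinearity.
#pd.get_dummies(pd.DataFrame({'Sex': ['male', 'female']}), drop_first=True)
# -> a single Sex_male column holding 1, 0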
#Xy.head(10)
Xy.columns
Xy.head(2)
#Xy.info()
len(Xy),len(train),len(test)
print('split at',len(train))
editedTrain = Xy[:len(train)]
editedTest = Xy[len(train):]
len(editedTrain), len(editedTest)
y=editedTrain['Survived']
X=editedTrain.copy()
del X['Survived']
from sklearn.model_selection import train_test_split as tts  # sklearn.cross_validation was removed in 0.20
X_train, X_test, y_train, y_test = tts(X, y, test_size = .2, random_state = 44)
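# With an imbalanced target, stratifying keeps the class ratio equal in both
# splits (same call, sketched; not run here):
#X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=44, stratify=y)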
################################
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
log = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
gradient = GradientBoostingClassifier()
ada = AdaBoostClassifier()
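# A hedged alternative to a single holdout: mean 5-fold CV accuracy per
# model, using only the estimators defined above.
from sklearn.model_selection import cross_val_score
for clf in [log, tree, forest, gradient, ada]:
    print(clf.__class__.__name__, cross_val_score(clf, X, y, cv=5).mean())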
#################################
#models = []
#models.append(('l', LogisticRegression()))
#models.append(('t', DecisionTreeClassifier()))
#models.append(('f', RandomForestClassifier()))
#models.append(('g', GradientBoostingClassifier()))
#models.append(('a', AdaBoostClassifier()))
l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
ff = forest.fit(X_train, y_train)
importances=forest.feature_importances_
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)
X_train.head(3)
type(importances)
len(forest.feature_importances_)
len(X_train.columns)
afterDelete.columns
feat = pd.DataFrame()
#plt.figure(figsize=(10,20))
#plt.title('Feature Importance')
indices=np.argsort(importances)[::-1]
feat['x']=forest.feature_importances_
feat['y']=X_train.columns
feat.sort_values(by='x',inplace=True)
feat.set_index(feat['y'],inplace=True)
feat.plot(kind='barh',figsize=(10,10))
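# The same chart as a one-liner (sketch): a Series indexed by column name
# sorts and plots directly.
#pd.Series(importances, index=X_train.columns).sort_values().plot(kind='barh', figsize=(10, 10))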
'''
plt.bar(range(X_train.shape[1]),importances[indices],align="center")#, color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]),indices)
plt.xlim([-1,X_train.shape[1]])
plt.show()
'''
#range(X_train.shape[1])
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
X_train[X_train.columns[10:16]]
###y_test
print("The score for Logistic Regression is, ", l.score(X_test, y_test))
print("The score for Decision Trees is ", t.score(X_test,y_test))
print("The score for Random Forest is ", ff.score(X_test,y_test))
print("The score for Gradient Descent is ", g.score(X_test, y_test))
print("The score for AdaBoost is ", a.score(X_test, y_test))
X_train['FamilySize'].value_counts()
editedTest['Fare_C'].value_counts()
editedTest = editedTest.drop(columns=['Survived'])  # drop on a copy to avoid SettingWithCopyWarning on the slice
editedTest.info()
def run_this_model(mod, test):
    ret = mod.predict(test)
    return ret
res=pd.DataFrame(columns=['PassengerId','Survived'])
res.head(2)
res['PassengerId']=test['PassengerId']
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve
num_folds = 7
def plot_curve(ticks, train_scores, test_scores):
    import matplotlib.pyplot as plt
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.fill_between(ticks,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="b")
    plt.fill_between(ticks,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="r")
    plt.plot(ticks, train_scores_mean, 'b-', label='Training score')
    plt.plot(ticks, test_scores_mean, 'r-', label='CV score')
    plt.legend()
    return plt.gca()
def plot_learning_curve(clf, X, y, scoring='accuracy'):
    ax = plot_curve(*learning_curve(clf, X, y, cv=num_folds, scoring=scoring,
                                    train_sizes=np.linspace(0.1, 1, 10), n_jobs=-1))
    ax.set_title('Learning curve: {}'.format(clf.__class__.__name__))
    ax.set_xlabel('Training size')
    ax.set_ylabel(scoring)
#for clf in [log, gradient]:
#    plot_learning_curve(clf, X_train, y_train)
# Random 0/1 baseline for a sanity-check submission. The original active line
# used np.random.uniform(0, 1), which broadcasts one float to every row and
# is not a valid 0/1 submission.
res['Survived'] = np.random.randint(2, size=len(res))
res['Survived']
res
res.to_csv('Random.csv', index=False)
res
#for name,model in models:
# model.predict(editedTest)
mod = [l, t, ff, g, a]
for i in mod:
    # score on the holdout, then predict on the real test set
    survived = run_this_model(i, X_test)
    sc = accuracy_score(y_test, survived)
    survived = run_this_model(i, editedTest)
    res['Survived'] = survived.astype(int)
    # accuracy_score returns a plain float, so .astype(str) would fail; format it instead
    file_name = 'results_{}_acc{:.4f}.csv'.format(i.__class__.__name__, sc)
    res.to_csv(file_name, index=False)
    plot_learning_curve(i, X_train, y_train)
#survived=l.predict(editedTest)
#survived=t.predict(editedTest)
#survived=f.predict(editedTest)
#survived=g.predict(editedTest)
#survived=a.predict(editedTest)