Passengers on the Titanic: predict who will survive

Step 1: Import libraries, define testing function, and read in data

#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
#define classification algorithms
gbc = GradientBoostingClassifier()
r = RandomForestClassifier()
d = DecisionTreeClassifier()
l = LogisticRegression()
k = KNeighborsClassifier()
g = GaussianNB()
b = BernoulliNB()

#list of algorithms and names for our function later on
algorithms = [gbc, r, d, l, k, g, b]
names = ['Gradient Boosting', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'K Nearest', 'GaussianNB', 'BernoulliNB']
#define the function that we will use to compare classification algorithms
def tDMassess(X, y, algorithms= algorithms, names = names):
    #flatten a column-vector y up front to avoid sklearn's DataConversionWarning
    y = np.ravel(y)
    #fit every algorithm on the same data
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    #score each fitted model; predictions are computed once per model and reused for all four metrics
    accuracy, precision, recall, f1 = [], [], [], []
    for model in algorithms:
        predictions = model.predict(X)
        accuracy.append(accuracy_score(y, predictions))
        precision.append(precision_score(y, predictions))
        recall.append(recall_score(y, predictions))
        f1.append(f1_score(y, predictions))
    #assemble one row of metrics per algorithm
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index= names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics
training = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\train.csv')
test = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\test.csv')

training.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Step 2: Do some preliminary EDA to see how the data will need to be cleaned and augmented

test.isnull().sum()
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
#We will handle the rest of the nulls later, but this Fare one we can take care of easily now.
test[test.Fare.isnull()]
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 NaN NaN S
test[test['Pclass'] == 3].mean()
PassengerId    1094.178899
Pclass            3.000000
Age              24.027945
SibSp             0.463303
Parch             0.417431
Fare             12.459678
dtype: float64
test.loc[152, 'Fare'] = 12.459678
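As a side note, the fill value above is the third-class mean fare copied by hand from the previous cell. A sketch of doing the same fill programmatically (using the median, which is a bit more robust for a fare distribution this skewed) might look like:

#compute the fill value from third-class fares instead of hardcoding it
fare_fill = test.loc[test['Pclass'] == 3, 'Fare'].median()
test['Fare'] = test['Fare'].fillna(fare_fill)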
#combine the data for feature engineering (sort=True keeps the alphabetical column order seen in the outputs below and silences pandas' FutureWarning)
data = pd.concat([training, test], sort=True)
data.shape
(1309, 12)
data.describe()
Age Fare Parch PassengerId Pclass SibSp Survived
count 1046.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 891.000000
mean 29.881138 33.279562 0.385027 655.000000 2.294882 0.498854 0.383838
std 14.413493 51.742084 0.865560 378.020061 0.837836 1.041658 0.486592
min 0.170000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000
25% 21.000000 7.895800 0.000000 328.000000 2.000000 0.000000 0.000000
50% 28.000000 14.454200 0.000000 655.000000 3.000000 0.000000 0.000000
75% 39.000000 31.275000 0.000000 982.000000 3.000000 1.000000 1.000000
max 80.000000 512.329200 9.000000 1309.000000 3.000000 8.000000 1.000000
data.isnull().sum()
Age             263
Cabin          1014
Embarked          2
Fare              0
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64
data.index
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            408, 409, 410, 411, 412, 413, 414, 415, 416, 417],
           dtype='int64', length=1309)
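Note the duplicate labels: concat stacked the two frames without renumbering, so the indices 0-417 appear twice. Label-based operations (.loc, column assignment) can silently misalign on an index like this. If you want to avoid the pitfall entirely, one line does it (not part of the original run; the positional slicing used later is unaffected either way):

#renumber so every row has a unique label
data = data.reset_index(drop=True)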

Visual EDA

Step 2 (continued): Personally, I'm a visual learner, so I like to look at graphs of possible key features to get a sense of what affects passenger survival. Below I have graphed features that I suspected might play a major role in our target variable (Pclass, Age), along with features I could not get a good read on from the info/describe output (SibSp, Fare).

#I used swarm plots because they're good at showing data density and spread when the dataset isn't especially large

sns.set(style="darkgrid")
_ = plt.figure(figsize= (13, 9))
_ = plt.subplot(2,3,1)
_ = sns.swarmplot(x="Pclass", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,2)
_ = sns.swarmplot(x="Pclass", y="Parch", hue="Survived", data= data)

_ = plt.subplot(2,3,3)
_ = sns.swarmplot(x="Pclass", y="SibSp", hue="Survived", data= data)

_ = plt.subplot(2,3,4)
_ = sns.swarmplot(x="Sex", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,5)
_ = sns.swarmplot(x="Embarked", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,6)
_ = sns.swarmplot(x="Sex", y="Fare", hue="Survived", data= data)

plt.tight_layout()

[Figure: 2x3 grid of swarm plots, colored by Survived: Age/Parch/SibSp vs. Pclass, Age vs. Sex, Age vs. Embarked, and Fare vs. Sex]

Based on these graphs, only a few features seem to have an effect on survivability: Age, Pclass, Sex, and possibly Parch (the number of parents/children aboard may matter).

The features we looked at that have very little or no effect on survivability: Fare (Pclass is the better indicator) and SibSp. Ticket, Cabin number, and port of embarkation don't seem to matter at all.

Data Cleaning

Step 3: Time to do everyone's favorite task, data cleaning! There are a lot of null values in the Age column, one of the key features we identified, so we have to fill them somehow.

#Take a look at the null Age values just to see what we're working with
data[data['Age'].isnull()]
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket
5 NaN NaN Q 8.4583 Moran, Mr. James 0 6 3 male 0 0.0 330877
17 NaN NaN S 13.0000 Williams, Mr. Charles Eugene 0 18 2 male 0 1.0 244373
19 NaN NaN C 7.2250 Masselmani, Mrs. Fatima 0 20 3 female 0 1.0 2649
26 NaN NaN C 7.2250 Emir, Mr. Farred Chehab 0 27 3 male 0 0.0 2631
28 NaN NaN Q 7.8792 O'Dwyer, Miss. Ellen "Nellie" 0 29 3 female 0 1.0 330959
29 NaN NaN S 7.8958 Todoroff, Mr. Lalio 0 30 3 male 0 0.0 349216
31 NaN B78 C 146.5208 Spencer, Mrs. William Augustus (Marie Eugenie) 0 32 1 female 1 1.0 PC 17569
32 NaN NaN Q 7.7500 Glynn, Miss. Mary Agatha 0 33 3 female 0 1.0 335677
36 NaN NaN C 7.2292 Mamee, Mr. Hanna 0 37 3 male 0 1.0 2677
42 NaN NaN C 7.8958 Kraeff, Mr. Theodor 0 43 3 male 0 0.0 349253
45 NaN NaN S 8.0500 Rogers, Mr. William John 0 46 3 male 0 0.0 S.C./A.4. 23567
46 NaN NaN Q 15.5000 Lennon, Mr. Denis 0 47 3 male 1 0.0 370371
47 NaN NaN Q 7.7500 O'Driscoll, Miss. Bridget 0 48 3 female 0 1.0 14311
48 NaN NaN C 21.6792 Samaan, Mr. Youssef 0 49 3 male 2 0.0 2662
55 NaN C52 S 35.5000 Woolner, Mr. Hugh 0 56 1 male 0 1.0 19947
64 NaN NaN C 27.7208 Stewart, Mr. Albert A 0 65 1 male 0 0.0 PC 17605
65 NaN NaN C 15.2458 Moubarek, Master. Gerios 1 66 3 male 1 1.0 2661
76 NaN NaN S 7.8958 Staneff, Mr. Ivan 0 77 3 male 0 0.0 349208
77 NaN NaN S 8.0500 Moutal, Mr. Rahamin Haim 0 78 3 male 0 0.0 374746
82 NaN NaN Q 7.7875 McDermott, Miss. Brigdet Delia 0 83 3 female 0 1.0 330932
87 NaN NaN S 8.0500 Slocovski, Mr. Selman Francis 0 88 3 male 0 0.0 SOTON/OQ 392086
95 NaN NaN S 8.0500 Shorney, Mr. Charles Joseph 0 96 3 male 0 0.0 374910
101 NaN NaN S 7.8958 Petroff, Mr. Pastcho ("Pentcho") 0 102 3 male 0 0.0 349215
107 NaN NaN S 7.7750 Moss, Mr. Albert Johan 0 108 3 male 0 1.0 312991
109 NaN NaN Q 24.1500 Moran, Miss. Bertha 0 110 3 female 1 1.0 371110
121 NaN NaN S 8.0500 Moore, Mr. Leonard Charles 0 122 3 male 0 0.0 A4. 54510
126 NaN NaN Q 7.7500 McMahon, Mr. Martin 0 127 3 male 0 0.0 370372
128 NaN F E69 C 22.3583 Peter, Miss. Anna 1 129 3 female 1 1.0 2668
140 NaN NaN C 15.2458 Boulos, Mrs. Joseph (Sultana) 2 141 3 female 0 0.0 2678
154 NaN NaN S 7.3125 Olsen, Mr. Ole Martin 0 155 3 male 0 0.0 Fa 265302
... ... ... ... ... ... ... ... ... ... ... ... ...
268 NaN NaN S 8.0500 Howard, Miss. May Elizabeth 0 1160 3 female 0 NaN A. 2. 39186
271 NaN NaN Q 7.7500 Fox, Mr. Patrick 0 1163 3 male 0 NaN 368573
273 NaN NaN Q 15.5000 Lennon, Miss. Mary 0 1165 3 female 1 NaN 370371
274 NaN NaN C 7.2250 Saade, Mr. Jean Nassr 0 1166 3 male 0 NaN 2676
282 NaN NaN Q 7.7500 Fleming, Miss. Honora 0 1174 3 female 0 NaN 364859
286 NaN NaN S 7.2500 Franklin, Mr. Charles (Charles Fardon) 0 1178 3 male 0 NaN SOTON/O.Q. 3101314
288 NaN F E46 C 7.2292 Mardirosian, Mr. Sarkis 0 1180 3 male 0 NaN 2655
289 NaN NaN S 8.0500 Ford, Mr. Arthur 0 1181 3 male 0 NaN A/5 1478
290 NaN NaN S 39.6000 Rheims, Mr. George Alexander Lucien 0 1182 1 male 0 NaN PC 17607
292 NaN NaN C 7.2292 Nasr, Mr. Mustafa 0 1184 3 male 0 NaN 2652
297 NaN NaN C 21.6792 Samaan, Mr. Hanna 0 1189 3 male 2 NaN 2662
301 NaN D C 15.0458 Malachard, Mr. Noel 0 1193 2 male 0 NaN 237735
304 NaN NaN Q 7.7500 McCarthy, Miss. Catherine Katie"" 0 1196 3 female 0 NaN 383123
312 NaN NaN S 7.5750 Sadowitz, Mr. Harry 0 1204 3 male 0 NaN LP 1588
332 NaN NaN C 7.2250 Thomas, Mr. Tannous 0 1224 3 male 0 NaN 2684
339 NaN NaN C 7.2292 Betros, Master. Seman 0 1231 3 male 0 NaN 2622
342 NaN NaN S 69.5500 Sage, Mr. John George 9 1234 3 male 1 NaN CA. 2343
344 NaN NaN S 14.5000 van Billiard, Master. James William 1 1236 3 male 1 NaN A/5. 851
357 NaN NaN S 7.8792 Lockyer, Mr. Edward 0 1249 3 male 0 NaN 1222
358 NaN NaN Q 7.7500 O'Keefe, Mr. Patrick 0 1250 3 male 0 NaN 368402
365 NaN NaN S 69.5500 Sage, Mrs. John (Annie Bullen) 9 1257 3 female 1 NaN CA. 2343
366 NaN NaN C 14.4583 Caram, Mr. Joseph 0 1258 3 male 1 NaN 2689
380 NaN NaN Q 7.7500 O'Connor, Mr. Patrick 0 1272 3 male 0 NaN 366713
382 NaN NaN S 14.5000 Risien, Mrs. Samuel (Emma) 0 1274 3 female 0 NaN 364498
384 NaN NaN S 12.8750 Wheeler, Mr. Edwin Frederick"" 0 1276 2 male 0 NaN SC/PARIS 2159
408 NaN NaN Q 7.7208 Riordan, Miss. Johanna Hannah"" 0 1300 3 female 0 NaN 334915
410 NaN NaN Q 7.7500 Naughton, Miss. Hannah 0 1302 3 female 0 NaN 365237
413 NaN NaN S 8.0500 Spector, Mr. Woolf 0 1305 3 male 0 NaN A.5. 3236
416 NaN NaN S 8.0500 Ware, Mr. Frederick 0 1308 3 male 0 NaN 359309
417 NaN NaN C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668

263 rows × 12 columns

# to fill age more accurately, split each name on the "," (surname | rest), then on the "." (title | given names)
# this distinguishes 'Master' from 'Mr' and 'Miss' from 'Mrs'
names1 = pd.DataFrame(data['Name'].str.split(',', n=1).tolist(), columns = ['surname', 'given'])
names2 = pd.DataFrame(names1['given'].str.split('.', n=1).tolist(), columns = ['prefix', 'given'])
#assign by position (.values): after the concat, data's index holds duplicate labels (0-417 appear twice),
#so a label-aligned assignment would silently pair test rows with training-row titles
#(which is why some titles look wrong for the test rows in the original output below)
data['title'] = names2['prefix'].str.strip().values
data.head()
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket title
0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171 Mr
1 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599 Mrs
2 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282 Miss
3 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803 Mrs
4 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450 Mr
#fill the null values in the 'Age' column with the mean age for each title
data['Age'] = data.groupby('title')['Age'].transform(lambda x: x.fillna(x.mean()))
data.Age.isnull().sum()
0
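It's worth eyeballing the per-title means that were just used as fill values; a quick check along these lines:

#the mean age per title, i.e. the values that filled the gaps above
data.groupby('title')['Age'].mean().sort_values()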
#peek at the combined frame (reset_index here is for display only; it does not modify data)
data.reset_index()
index Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket title
0 0 22.000000 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171 Mr
1 1 38.000000 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599 Mrs
2 2 26.000000 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282 Miss
3 3 35.000000 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803 Mrs
4 4 35.000000 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450 Mr
5 5 31.563774 NaN Q 8.4583 Moran, Mr. James 0 6 3 male 0 0.0 330877 Mr
6 6 54.000000 E46 S 51.8625 McCarthy, Mr. Timothy J 0 7 1 male 0 0.0 17463 Mr
7 7 2.000000 NaN S 21.0750 Palsson, Master. Gosta Leonard 1 8 3 male 3 0.0 349909 Master
8 8 27.000000 NaN S 11.1333 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2 9 3 female 0 1.0 347742 Mrs
9 9 14.000000 NaN C 30.0708 Nasser, Mrs. Nicholas (Adele Achem) 0 10 2 female 1 1.0 237736 Mrs
10 10 4.000000 G6 S 16.7000 Sandstrom, Miss. Marguerite Rut 1 11 3 female 1 1.0 PP 9549 Miss
11 11 58.000000 C103 S 26.5500 Bonnell, Miss. Elizabeth 0 12 1 female 0 1.0 113783 Miss
12 12 20.000000 NaN S 8.0500 Saundercock, Mr. William Henry 0 13 3 male 0 0.0 A/5. 2151 Mr
13 13 39.000000 NaN S 31.2750 Andersson, Mr. Anders Johan 5 14 3 male 1 0.0 347082 Mr
14 14 14.000000 NaN S 7.8542 Vestrom, Miss. Hulda Amanda Adolfina 0 15 3 female 0 0.0 350406 Miss
15 15 55.000000 NaN S 16.0000 Hewlett, Mrs. (Mary D Kingcome) 0 16 2 female 0 1.0 248706 Mrs
16 16 2.000000 NaN Q 29.1250 Rice, Master. Eugene 1 17 3 male 4 0.0 382652 Master
17 17 31.563774 NaN S 13.0000 Williams, Mr. Charles Eugene 0 18 2 male 0 1.0 244373 Mr
18 18 31.000000 NaN S 18.0000 Vander Planke, Mrs. Julius (Emelia Maria Vande... 0 19 3 female 1 0.0 345763 Mrs
19 19 33.760194 NaN C 7.2250 Masselmani, Mrs. Fatima 0 20 3 female 0 1.0 2649 Mrs
20 20 35.000000 NaN S 26.0000 Fynney, Mr. Joseph J 0 21 2 male 0 0.0 239865 Mr
21 21 34.000000 D56 S 13.0000 Beesley, Mr. Lawrence 0 22 2 male 0 1.0 248698 Mr
22 22 15.000000 NaN Q 8.0292 McGowan, Miss. Anna "Annie" 0 23 3 female 0 1.0 330923 Miss
23 23 28.000000 A6 S 35.5000 Sloper, Mr. William Thompson 0 24 1 male 0 1.0 113788 Mr
24 24 8.000000 NaN S 21.0750 Palsson, Miss. Torborg Danira 1 25 3 female 3 0.0 349909 Miss
25 25 38.000000 NaN S 31.3875 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... 5 26 3 female 1 1.0 347077 Mrs
26 26 31.563774 NaN C 7.2250 Emir, Mr. Farred Chehab 0 27 3 male 0 0.0 2631 Mr
27 27 19.000000 C23 C25 C27 S 263.0000 Fortune, Mr. Charles Alexander 2 28 1 male 3 0.0 19950 Mr
28 28 25.455752 NaN Q 7.8792 O'Dwyer, Miss. Ellen "Nellie" 0 29 3 female 0 1.0 330959 Miss
29 29 31.563774 NaN S 7.8958 Todoroff, Mr. Lalio 0 30 3 male 0 0.0 349216 Mr
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1279 388 21.000000 NaN Q 7.7500 Canavan, Mr. Patrick 0 1280 3 male 0 NaN 364858 Mr
1280 389 6.000000 NaN S 21.0750 Palsson, Master. Paul Folke 1 1281 3 male 3 NaN 349909 Miss
1281 390 23.000000 B24 S 93.5000 Payne, Mr. Vivian Ponsonby 0 1282 1 male 0 NaN 12749 Mr
1282 391 51.000000 D28 S 39.4000 Lines, Mrs. Ernest H (Elizabeth Lindsey James) 1 1283 1 female 0 NaN PC 17592 Mr
1283 392 13.000000 NaN S 20.2500 Abbott, Master. Eugene Joseph 2 1284 3 male 0 NaN C.A. 2673 Mr
1284 393 47.000000 NaN S 10.5000 Gilbert, Mr. William 0 1285 2 male 0 NaN C.A. 30769 Miss
1285 394 29.000000 NaN S 22.0250 Kink-Heilmann, Mr. Anton 1 1286 3 male 3 NaN 315153 Mrs
1286 395 18.000000 C31 S 60.0000 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 0 1287 1 female 1 NaN 13695 Mr
1287 396 24.000000 NaN Q 7.2500 Colbert, Mr. Patrick 0 1288 3 male 0 NaN 371109 Miss
1288 397 48.000000 B41 C 79.2000 Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... 1 1289 1 female 1 NaN 13567 Mr
1289 398 22.000000 NaN S 7.7750 Larsson-Rondberg, Mr. Edvard A 0 1290 3 male 0 NaN 347065 Dr
1290 399 31.000000 NaN Q 7.7333 Conlon, Mr. Thomas Henry 0 1291 3 male 0 NaN 21332 Mrs
1291 400 30.000000 C7 S 164.8667 Bonnell, Miss. Caroline 0 1292 1 female 0 NaN 36928 Mr
1292 401 38.000000 NaN S 21.0000 Gale, Mr. Harry 0 1293 2 male 1 NaN 28664 Mr
1293 402 22.000000 NaN C 59.4000 Gibson, Miss. Dorothy Winifred 1 1294 1 female 0 NaN 112378 Miss
1294 403 17.000000 NaN S 47.1000 Carrau, Mr. Jose Pedro 0 1295 1 male 0 NaN 113059 Mr
1295 404 43.000000 D40 C 27.7208 Frauenthal, Mr. Isaac Gerald 0 1296 1 male 1 NaN 17765 Miss
1296 405 20.000000 D38 C 13.8625 Nourney, Mr. Alfred (Baron von Drachstedt")" 0 1297 2 male 0 NaN SC/PARIS 2166 Mr
1297 406 23.000000 NaN S 10.5000 Ware, Mr. William Jeffery 0 1298 2 male 1 NaN 28666 Mr
1298 407 50.000000 C80 C 211.5000 Widener, Mr. George Dunton 1 1299 1 male 1 NaN 113503 Master
1299 408 31.563774 NaN Q 7.7208 Riordan, Miss. Johanna Hannah"" 0 1300 3 female 0 NaN 334915 Mr
1300 409 3.000000 NaN S 13.7750 Peacock, Miss. Treasteall 1 1301 3 female 1 NaN SOTON/O.Q. 3101315 Miss
1301 410 31.563774 NaN Q 7.7500 Naughton, Miss. Hannah 0 1302 3 female 0 NaN 365237 Mr
1302 411 37.000000 C78 Q 90.0000 Minahan, Mrs. William Edward (Lillian E Thorpe) 0 1303 1 female 1 NaN 19928 Mr
1303 412 28.000000 NaN S 7.7750 Henriksson, Miss. Jenny Lovisa 0 1304 3 female 0 NaN 347086 Miss
1304 413 31.563774 NaN S 8.0500 Spector, Mr. Woolf 0 1305 3 male 0 NaN A.5. 3236 Mr
1305 414 39.000000 C105 C 108.9000 Oliva y Ocana, Dona. Fermina 0 1306 1 female 0 NaN PC 17758 Mr
1306 415 38.500000 NaN S 7.2500 Saether, Mr. Simon Sivertsen 0 1307 3 male 0 NaN SOTON/O.Q. 3101262 Mrs
1307 416 33.760194 NaN S 8.0500 Ware, Mr. Frederick 0 1308 3 male 0 NaN 359309 Mrs
1308 417 25.455752 NaN C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668 Miss

1309 rows × 14 columns

#the same title extraction in one line (map returns a Series indexed like data, so the assignment aligns safely):
#data['title'] = data.Name.map(lambda x: x.split(',')[1].split('.')[0].strip())
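Either way, a frequency check on the extracted titles is a cheap sanity test; rare titles (Dr, Rev, and so on) get their age means estimated from only a handful of rows:

#how many passengers carry each title
data['title'].value_counts()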

Feature Engineering

Step 3.5: Add or remove features based on whether they are likely to matter in our model.

data.Parch.describe()
#I don't know what Parch adds, and the std is higher than the mean, which makes me think it's a bad predictor, so I'll drop it below
count    1309.000000
mean        0.385027
std         0.865560
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         9.000000
Name: Parch, dtype: float64
#Filling the 2 Embarked null values with the most common value (Do I even need this column?)
#data.Embarked.fillna('S', inplace = True)
#Create dummy variables for the categoricals in the dataset
Pclassdumm = pd.get_dummies(data['Pclass'], drop_first = True)
data = pd.concat([data, Pclassdumm], axis= 1)
data.drop('Pclass', inplace = True, axis= 1)
sexdumm = pd.get_dummies(data['Sex'], drop_first = True)
data = pd.concat([data, sexdumm], axis= 1)
data.drop('Sex', inplace = True, axis=1)
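As an aside, get_dummies can encode several columns in one call. A commented-out sketch equivalent to the two blocks above (note the columns would come out named Pclass_2/Pclass_3/Sex_male rather than the bare 2/3/male used below):

#encode both categoricals at once:
#data = pd.get_dummies(data, columns=['Pclass', 'Sex'], drop_first=True)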
#Do I even need this?
#Embardumm = pd.get_dummies(data['Embarked'], drop_first = True)
#data = pd.concat([data, Embardumm], axis= 1)
#data.drop('Embarked', inplace = True, axis=1)
#Drop all the columns we don't need (including Parch, per the note above)
data.drop(['Name', 'PassengerId', 'title', 'Cabin', 'Ticket', 'Embarked', 'SibSp', 'Fare', 'Parch'], inplace = True, axis = 1)
data['Children'] = data['Age'] < 16
data['Children'].head()
0    False
1    False
2    False
3    False
4    False
Name: Children, dtype: bool
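A quick check I'd add to see whether the new flag carries any signal (Survived is NaN for the test rows, and a groupby mean simply skips them):

#survival rate for children vs. everyone else
data.groupby('Children')['Survived'].mean()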
data.shape
(1309, 6)
data.describe()
Age Survived 2 3 male
count 1309.000000 891.000000 1309.000000 1309.000000 1309.000000
mean 29.930167 0.383838 0.211612 0.541635 0.644003
std 12.990814 0.486592 0.408607 0.498454 0.478997
min 0.170000 0.000000 0.000000 0.000000 0.000000
25% 22.000000 0.000000 0.000000 0.000000 0.000000
50% 30.000000 0.000000 0.000000 1.000000 1.000000
75% 35.000000 1.000000 0.000000 1.000000 1.000000
max 80.000000 1.000000 1.000000 1.000000 1.000000

Training the Model

Step 4: Since we’re done changing around our features and cleaning our data, we can now re-split the data into training and test sets, run our assessment function to determine which would be the best predictor, and then predict on the test data.

#re-split data into our TRAINING SET so that we can use it to assess
train = data[data['Survived'].notnull()]
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age         891 non-null float64
Survived    891 non-null float64
2           891 non-null uint8
3           891 non-null uint8
male        891 non-null uint8
Children    891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
#split the 'Survived' column off of the TRAINING set so that we can assess our predictions
target = train[['Survived']]
features = train.drop('Survived', axis= 1)
features.shape, target.shape
((891, 5), (891, 1))
features.isnull().sum()
Age         0
2           0
3           0
male        0
Children    0
dtype: int64

On previous iterations, my model was performing at about 77.5% accuracy (with a best of 79%). It has come to my attention that this may be due to overfitting, since the models were fit and assessed on the same rows of a smallish dataset. I will use train_test_split to hold out some data and try to reduce this error.

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = .2, random_state = 42)
#adding the train_test_split decreased my score to 75%
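A single random split can also be noisy on 891 rows. As an alternative sketch (not part of the original run), k-fold cross-validation averages over several splits and gives a steadier estimate:

from sklearn.model_selection import cross_val_score
#5-fold CV accuracy for the gradient booster on the full training set
print(cross_val_score(gbc, features, np.ravel(target), cv=5).mean())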
tDMassess(x_train, y_train)
Accuracy Precision Recall F1
Gradient Boosting 0.852528 0.872146 0.712687 0.784394
Random Forest 0.875000 0.874477 0.779851 0.824458
Decision Tree 0.877809 0.895197 0.764925 0.824950
Logistic Regression 0.789326 0.739837 0.679104 0.708171
K Nearest 0.844101 0.825726 0.742537 0.781925
GaussianNB 0.785112 0.703180 0.742537 0.722323
BernoulliNB 0.799157 0.747036 0.705224 0.725528

Seems like Gradient Boosting or Random Forest would be the best option to predict here (the Decision Tree's slight edge is likely training-set overfitting).
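Keep in mind these metrics come from predicting on the same rows the models were fit on, which flatters flexible models like decision trees. Since tDMassess leaves the fitted estimators in the algorithms list, a fairer comparison on the held-out 20% is a short loop (a sketch, not in the original run):

#score each fitted model on the held-out split instead of the training rows
for model, name in zip(algorithms, names):
    print(name, accuracy_score(np.ravel(y_test), model.predict(x_test)))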

#Since we found that GB would be the best model, we can re-split our ORIGINAL, FULL dataset into the training and testing sets
training = data[:891]
testing = data[891:].copy()  #.copy() so the drop below doesn't raise a SettingWithCopyWarning
testing.describe()
Age Survived 2 3 male
count 418.000000 0.0 418.000000 418.000000 418.000000
mean 30.223491 NaN 0.222488 0.521531 0.636364
std 12.757613 NaN 0.416416 0.500135 0.481622
min 0.170000 NaN 0.000000 0.000000 0.000000
25% 22.625000 NaN 0.000000 0.000000 0.000000
50% 30.000000 NaN 0.000000 1.000000 1.000000
75% 36.000000 NaN 0.000000 1.000000 1.000000
max 76.000000 NaN 1.000000 1.000000 1.000000
#drop this column since it's all nulls; we will replace it with our predictions soon
testing.drop('Survived', inplace = True, axis= 1)
training.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age         891 non-null float64
Survived    891 non-null float64
2           891 non-null uint8
3           891 non-null uint8
male        891 non-null uint8
Children    891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
testing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 5 columns):
Age         418 non-null float64
2           418 non-null uint8
3           418 non-null uint8
male        418 non-null uint8
Children    418 non-null bool
dtypes: bool(1), float64(1), uint8(3)
memory usage: 8.2 KB
#Use the Gradient Boosting model (fitted on x_train inside tDMassess) to predict on the TESTING set and save to a variable, predict
predict = gbc.predict(testing)
#predict = r.predict(testing)
#Using Random Forest got me a 67% accuracy rating on Kaggle... 12 points lower than my best with GB
#add the predictions back onto the original test DataFrame as its Survived column
test["Survived"] = predict.astype('int')
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Survived       418 non-null int32
dtypes: float64(2), int32(1), int64(4), object(5)
memory usage: 37.6+ KB
#Kaggle only needs a DataFrame containing the PassengerId and our predicted values
test = test[['PassengerId', 'Survived']]
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int32
dtypes: int32(1), int64(1)
memory usage: 5.0 KB
#write to a csv file so we can upload to Kaggle
test.to_csv('titanic_predict.csv', index = False)
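Reading the file straight back is a cheap way to confirm the submission format before uploading:

#quick check of the file we just wrote
pd.read_csv('titanic_predict.csv').head()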
test.describe()
PassengerId Survived
count 418.000000 418.000000
mean 1100.500000 0.325359
std 120.810458 0.469070
min 892.000000 0.000000
25% 996.250000 0.000000
50% 1100.500000 0.000000
75% 1204.750000 1.000000
max 1309.000000 1.000000