Predicting Survival on the Titanic
Passengers on the Titanic: predict who will survive
Step 1: Import libraries, define testing function, and read in data
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
#define classification algorithms
gbc = GradientBoostingClassifier()
r = RandomForestClassifier()
d = DecisionTreeClassifier()
l = LogisticRegression()
k = KNeighborsClassifier()
g = GaussianNB()
b = BernoulliNB()
#list of algorithms and names for our function later on
algorithms = [gbc, r, d, l, k, g, b]
names = ['Gradient Boosting', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'K Nearest', 'GaussianNB', 'BernoulliNB']
#define the function that we will use to determine the best classification algorithm
def tDMassess(X, y, algorithms=algorithms, names=names):
    #fit each algorithm on the training data
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    #score each fitted model on the same data
    accuracy, precision, recall, f1 = [], [], [], []
    for model in algorithms:
        pred = model.predict(X)
        accuracy.append(accuracy_score(y, pred))
        precision.append(precision_score(y, pred))
        recall.append(recall_score(y, pred))
        f1.append(f1_score(y, pred))
    #assemble the results into a DataFrame indexed by model name
    metrics = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1'], index=names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics
training = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\train.csv')
test = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\test.csv')
training.head()
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Step 2: Do some preliminary EDA to see how the data will need to be cleaned/augmented
test.isnull().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
#We will handle the rest of the nulls later, but this Fare one we can take care of easily now.
test[test.Fare.isnull()]
 | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---
152 | 1044 | 3 | Storey, Mr. Thomas | male | 60.5 | 0 | 0 | 3701 | NaN | NaN | S |
test[test['Pclass'] == 3].mean()
PassengerId 1094.178899
Pclass 3.000000
Age 24.027945
SibSp 0.463303
Parch 0.417431
Fare 12.459678
dtype: float64
test.loc[152, 'Fare'] = 12.459678
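Rather than hardcoding the number, a minimal sketch of computing the fill value directly (same result, but it stays correct if the data changes):
#sketch: derive the fill value from the class-3 fares instead of hardcoding it
fare_fill = test.loc[test['Pclass'] == 3, 'Fare'].mean()
test['Fare'] = test['Fare'].fillna(fare_fill)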
#combine the data for feature engineering
data = pd.concat([training, test])
(pandas emits a FutureWarning here because the columns of the two frames aren't aligned and will be sorted; passing sort=True or sort=False explicitly silences it.)
data.shape
(1309, 12)
data.describe()
 | Age | Fare | Parch | PassengerId | Pclass | SibSp | Survived
---|---|---|---|---|---|---|---
count | 1046.000000 | 1309.000000 | 1309.000000 | 1309.000000 | 1309.000000 | 1309.000000 | 891.000000 |
mean | 29.881138 | 33.279562 | 0.385027 | 655.000000 | 2.294882 | 0.498854 | 0.383838 |
std | 14.413493 | 51.742084 | 0.865560 | 378.020061 | 0.837836 | 1.041658 | 0.486592 |
min | 0.170000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
25% | 21.000000 | 7.895800 | 0.000000 | 328.000000 | 2.000000 | 0.000000 | 0.000000 |
50% | 28.000000 | 14.454200 | 0.000000 | 655.000000 | 3.000000 | 0.000000 | 0.000000 |
75% | 39.000000 | 31.275000 | 0.000000 | 982.000000 | 3.000000 | 1.000000 | 1.000000 |
max | 80.000000 | 512.329200 | 9.000000 | 1309.000000 | 3.000000 | 8.000000 | 1.000000 |
data.isnull().sum()
Age 263
Cabin 1014
Embarked 2
Fare 0
Name 0
Parch 0
PassengerId 0
Pclass 0
Sex 0
SibSp 0
Survived 418
Ticket 0
dtype: int64
data.index
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
...
408, 409, 410, 411, 412, 413, 414, 415, 416, 417],
dtype='int64', length=1309)
Visual EDA
Step 2 (continued): Personally, I'm a visual learner, so I like to look at graphs of some possible key features to get a sense of what impacts passenger survival. Below, I have graphed features that I suspected may play major roles in our target variable (Pclass/Age), as well as features I could not get a good sense of from our info/describe statements (SibSp/Fare).
#I used swarmplots because they're a good way to show data density and spread when you don't have an especially large dataset
sns.set(style="darkgrid")
_ = plt.figure(figsize= (13, 9))
_ = plt.subplot(2,3,1)
_ = sns.swarmplot(x="Pclass", y="Age", hue="Survived", data= data)
_ = plt.subplot(2,3,2)
_ = sns.swarmplot(x="Pclass", y="Parch", hue="Survived", data= data)
_ = plt.subplot(2,3,3)
_ = sns.swarmplot(x="Pclass", y="SibSp", hue="Survived", data= data)
_ = plt.subplot(2,3,4)
_ = sns.swarmplot(x="Sex", y="Age", hue="Survived", data= data)
_ = plt.subplot(2,3,5)
_ = sns.swarmplot(x="Embarked", y="Age", hue="Survived", data= data)
_ = plt.subplot(2,3,6)
_ = sns.swarmplot(x="Sex", y="Fare", hue="Survived", data= data)
plt.tight_layout()
Based on these graphs, only a few features seem to have an effect on survivability: Age, Pclass, Sex, and possibly Parch (the number of parents/children aboard may matter).
The ones we looked at that have a very small or no effect on survivability: Fare (Pclass is the better indicator) and SibSp. Ticket, cabin number, and port of embarkation don't seem to matter at all. A quick numeric check follows below.
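As a sanity check on the plots (a minimal sketch using the raw training frame loaded above):
#sketch: survival rate by sex and passenger class backs up what the swarm plots suggest
print(training.groupby(['Sex', 'Pclass'])['Survived'].mean().round(2))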
Data Cleaning
Step 3: Time to do everyone's favorite task, data cleaning! I noticed that there were a lot of null values in the Age column, one of the major features we identified, which means we have to fill it somehow.
#Take a look at the null Age values just to see what we're working with
data[data['Age'].isnull()]
 | Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket
---|---|---|---|---|---|---|---|---|---|---|---|---
5 | NaN | NaN | Q | 8.4583 | Moran, Mr. James | 0 | 6 | 3 | male | 0 | 0.0 | 330877 |
17 | NaN | NaN | S | 13.0000 | Williams, Mr. Charles Eugene | 0 | 18 | 2 | male | 0 | 1.0 | 244373 |
19 | NaN | NaN | C | 7.2250 | Masselmani, Mrs. Fatima | 0 | 20 | 3 | female | 0 | 1.0 | 2649 |
26 | NaN | NaN | C | 7.2250 | Emir, Mr. Farred Chehab | 0 | 27 | 3 | male | 0 | 0.0 | 2631 |
28 | NaN | NaN | Q | 7.8792 | O'Dwyer, Miss. Ellen "Nellie" | 0 | 29 | 3 | female | 0 | 1.0 | 330959 |
29 | NaN | NaN | S | 7.8958 | Todoroff, Mr. Lalio | 0 | 30 | 3 | male | 0 | 0.0 | 349216 |
31 | NaN | B78 | C | 146.5208 | Spencer, Mrs. William Augustus (Marie Eugenie) | 0 | 32 | 1 | female | 1 | 1.0 | PC 17569 |
32 | NaN | NaN | Q | 7.7500 | Glynn, Miss. Mary Agatha | 0 | 33 | 3 | female | 0 | 1.0 | 335677 |
36 | NaN | NaN | C | 7.2292 | Mamee, Mr. Hanna | 0 | 37 | 3 | male | 0 | 1.0 | 2677 |
42 | NaN | NaN | C | 7.8958 | Kraeff, Mr. Theodor | 0 | 43 | 3 | male | 0 | 0.0 | 349253 |
45 | NaN | NaN | S | 8.0500 | Rogers, Mr. William John | 0 | 46 | 3 | male | 0 | 0.0 | S.C./A.4. 23567 |
46 | NaN | NaN | Q | 15.5000 | Lennon, Mr. Denis | 0 | 47 | 3 | male | 1 | 0.0 | 370371 |
47 | NaN | NaN | Q | 7.7500 | O'Driscoll, Miss. Bridget | 0 | 48 | 3 | female | 0 | 1.0 | 14311 |
48 | NaN | NaN | C | 21.6792 | Samaan, Mr. Youssef | 0 | 49 | 3 | male | 2 | 0.0 | 2662 |
55 | NaN | C52 | S | 35.5000 | Woolner, Mr. Hugh | 0 | 56 | 1 | male | 0 | 1.0 | 19947 |
64 | NaN | NaN | C | 27.7208 | Stewart, Mr. Albert A | 0 | 65 | 1 | male | 0 | 0.0 | PC 17605 |
65 | NaN | NaN | C | 15.2458 | Moubarek, Master. Gerios | 1 | 66 | 3 | male | 1 | 1.0 | 2661 |
76 | NaN | NaN | S | 7.8958 | Staneff, Mr. Ivan | 0 | 77 | 3 | male | 0 | 0.0 | 349208 |
77 | NaN | NaN | S | 8.0500 | Moutal, Mr. Rahamin Haim | 0 | 78 | 3 | male | 0 | 0.0 | 374746 |
82 | NaN | NaN | Q | 7.7875 | McDermott, Miss. Brigdet Delia | 0 | 83 | 3 | female | 0 | 1.0 | 330932 |
87 | NaN | NaN | S | 8.0500 | Slocovski, Mr. Selman Francis | 0 | 88 | 3 | male | 0 | 0.0 | SOTON/OQ 392086 |
95 | NaN | NaN | S | 8.0500 | Shorney, Mr. Charles Joseph | 0 | 96 | 3 | male | 0 | 0.0 | 374910 |
101 | NaN | NaN | S | 7.8958 | Petroff, Mr. Pastcho ("Pentcho") | 0 | 102 | 3 | male | 0 | 0.0 | 349215 |
107 | NaN | NaN | S | 7.7750 | Moss, Mr. Albert Johan | 0 | 108 | 3 | male | 0 | 1.0 | 312991 |
109 | NaN | NaN | Q | 24.1500 | Moran, Miss. Bertha | 0 | 110 | 3 | female | 1 | 1.0 | 371110 |
121 | NaN | NaN | S | 8.0500 | Moore, Mr. Leonard Charles | 0 | 122 | 3 | male | 0 | 0.0 | A4. 54510 |
126 | NaN | NaN | Q | 7.7500 | McMahon, Mr. Martin | 0 | 127 | 3 | male | 0 | 0.0 | 370372 |
128 | NaN | F E69 | C | 22.3583 | Peter, Miss. Anna | 1 | 129 | 3 | female | 1 | 1.0 | 2668 |
140 | NaN | NaN | C | 15.2458 | Boulos, Mrs. Joseph (Sultana) | 2 | 141 | 3 | female | 0 | 0.0 | 2678 |
154 | NaN | NaN | S | 7.3125 | Olsen, Mr. Ole Martin | 0 | 155 | 3 | male | 0 | 0.0 | Fa 265302 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
268 | NaN | NaN | S | 8.0500 | Howard, Miss. May Elizabeth | 0 | 1160 | 3 | female | 0 | NaN | A. 2. 39186 |
271 | NaN | NaN | Q | 7.7500 | Fox, Mr. Patrick | 0 | 1163 | 3 | male | 0 | NaN | 368573 |
273 | NaN | NaN | Q | 15.5000 | Lennon, Miss. Mary | 0 | 1165 | 3 | female | 1 | NaN | 370371 |
274 | NaN | NaN | C | 7.2250 | Saade, Mr. Jean Nassr | 0 | 1166 | 3 | male | 0 | NaN | 2676 |
282 | NaN | NaN | Q | 7.7500 | Fleming, Miss. Honora | 0 | 1174 | 3 | female | 0 | NaN | 364859 |
286 | NaN | NaN | S | 7.2500 | Franklin, Mr. Charles (Charles Fardon) | 0 | 1178 | 3 | male | 0 | NaN | SOTON/O.Q. 3101314 |
288 | NaN | F E46 | C | 7.2292 | Mardirosian, Mr. Sarkis | 0 | 1180 | 3 | male | 0 | NaN | 2655 |
289 | NaN | NaN | S | 8.0500 | Ford, Mr. Arthur | 0 | 1181 | 3 | male | 0 | NaN | A/5 1478 |
290 | NaN | NaN | S | 39.6000 | Rheims, Mr. George Alexander Lucien | 0 | 1182 | 1 | male | 0 | NaN | PC 17607 |
292 | NaN | NaN | C | 7.2292 | Nasr, Mr. Mustafa | 0 | 1184 | 3 | male | 0 | NaN | 2652 |
297 | NaN | NaN | C | 21.6792 | Samaan, Mr. Hanna | 0 | 1189 | 3 | male | 2 | NaN | 2662 |
301 | NaN | D | C | 15.0458 | Malachard, Mr. Noel | 0 | 1193 | 2 | male | 0 | NaN | 237735 |
304 | NaN | NaN | Q | 7.7500 | McCarthy, Miss. Catherine Katie"" | 0 | 1196 | 3 | female | 0 | NaN | 383123 |
312 | NaN | NaN | S | 7.5750 | Sadowitz, Mr. Harry | 0 | 1204 | 3 | male | 0 | NaN | LP 1588 |
332 | NaN | NaN | C | 7.2250 | Thomas, Mr. Tannous | 0 | 1224 | 3 | male | 0 | NaN | 2684 |
339 | NaN | NaN | C | 7.2292 | Betros, Master. Seman | 0 | 1231 | 3 | male | 0 | NaN | 2622 |
342 | NaN | NaN | S | 69.5500 | Sage, Mr. John George | 9 | 1234 | 3 | male | 1 | NaN | CA. 2343 |
344 | NaN | NaN | S | 14.5000 | van Billiard, Master. James William | 1 | 1236 | 3 | male | 1 | NaN | A/5. 851 |
357 | NaN | NaN | S | 7.8792 | Lockyer, Mr. Edward | 0 | 1249 | 3 | male | 0 | NaN | 1222 |
358 | NaN | NaN | Q | 7.7500 | O'Keefe, Mr. Patrick | 0 | 1250 | 3 | male | 0 | NaN | 368402 |
365 | NaN | NaN | S | 69.5500 | Sage, Mrs. John (Annie Bullen) | 9 | 1257 | 3 | female | 1 | NaN | CA. 2343 |
366 | NaN | NaN | C | 14.4583 | Caram, Mr. Joseph | 0 | 1258 | 3 | male | 1 | NaN | 2689 |
380 | NaN | NaN | Q | 7.7500 | O'Connor, Mr. Patrick | 0 | 1272 | 3 | male | 0 | NaN | 366713 |
382 | NaN | NaN | S | 14.5000 | Risien, Mrs. Samuel (Emma) | 0 | 1274 | 3 | female | 0 | NaN | 364498 |
384 | NaN | NaN | S | 12.8750 | Wheeler, Mr. Edwin Frederick"" | 0 | 1276 | 2 | male | 0 | NaN | SC/PARIS 2159 |
408 | NaN | NaN | Q | 7.7208 | Riordan, Miss. Johanna Hannah"" | 0 | 1300 | 3 | female | 0 | NaN | 334915 |
410 | NaN | NaN | Q | 7.7500 | Naughton, Miss. Hannah | 0 | 1302 | 3 | female | 0 | NaN | 365237 |
413 | NaN | NaN | S | 8.0500 | Spector, Mr. Woolf | 0 | 1305 | 3 | male | 0 | NaN | A.5. 3236 |
416 | NaN | NaN | S | 8.0500 | Ware, Mr. Frederick | 0 | 1308 | 3 | male | 0 | NaN | 359309 |
417 | NaN | NaN | C | 22.3583 | Peter, Master. Michael J | 1 | 1309 | 3 | male | 1 | NaN | 2668 |
263 rows × 12 columns
#to fill Age more accurately, split each name on the ',' (keeping the right side), then on the '.' (keeping the left side)
#this isolates the title, distinguishing 'Master' from 'Mr' and 'Miss' from 'Mrs'
names1 = pd.DataFrame(data['Name'].str.split(',', n=1).tolist(), columns = ['surname', 'given'])
names2 = pd.DataFrame(names1['given'].str.split('.', n=1).tolist(), columns = ['prefix', 'given'])
prefix = names2['prefix']
prefix = pd.DataFrame(prefix)
prefix.columns = ['title']
data['title'] = prefix
data.head()
 | Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket | title
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 22.0 | NaN | S | 7.2500 | Braund, Mr. Owen Harris | 0 | 1 | 3 | male | 1 | 0.0 | A/5 21171 | Mr |
1 | 38.0 | C85 | C | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 2 | 1 | female | 1 | 1.0 | PC 17599 | Mrs |
2 | 26.0 | NaN | S | 7.9250 | Heikkinen, Miss. Laina | 0 | 3 | 3 | female | 0 | 1.0 | STON/O2. 3101282 | Miss |
3 | 35.0 | C123 | S | 53.1000 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 4 | 1 | female | 1 | 1.0 | 113803 | Mrs |
4 | 35.0 | NaN | S | 8.0500 | Allen, Mr. William Henry | 0 | 5 | 3 | male | 0 | 0.0 | 373450 | Mr |
#fill the null values in the 'Age' column with the mean age for each title
data['Age'] = data.groupby('title')['Age'].transform(lambda x: x.fillna(x.mean()))
data.Age.isnull().sum()
0
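As a quick sanity check (a minimal sketch), you can inspect the per-title mean ages that drive the fill:
#sketch: the mean age per title used to fill the nulls
print(data.groupby('title')['Age'].mean().round(1))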
#note: reset_index() returns a new DataFrame; data itself keeps the duplicated 0-890/0-417 index unless you assign the result back
data.reset_index()
 | index | Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | Survived | Ticket | title
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 22.000000 | NaN | S | 7.2500 | Braund, Mr. Owen Harris | 0 | 1 | 3 | male | 1 | 0.0 | A/5 21171 | Mr |
1 | 1 | 38.000000 | C85 | C | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 2 | 1 | female | 1 | 1.0 | PC 17599 | Mrs |
2 | 2 | 26.000000 | NaN | S | 7.9250 | Heikkinen, Miss. Laina | 0 | 3 | 3 | female | 0 | 1.0 | STON/O2. 3101282 | Miss |
3 | 3 | 35.000000 | C123 | S | 53.1000 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 4 | 1 | female | 1 | 1.0 | 113803 | Mrs |
4 | 4 | 35.000000 | NaN | S | 8.0500 | Allen, Mr. William Henry | 0 | 5 | 3 | male | 0 | 0.0 | 373450 | Mr |
5 | 5 | 31.563774 | NaN | Q | 8.4583 | Moran, Mr. James | 0 | 6 | 3 | male | 0 | 0.0 | 330877 | Mr |
6 | 6 | 54.000000 | E46 | S | 51.8625 | McCarthy, Mr. Timothy J | 0 | 7 | 1 | male | 0 | 0.0 | 17463 | Mr |
7 | 7 | 2.000000 | NaN | S | 21.0750 | Palsson, Master. Gosta Leonard | 1 | 8 | 3 | male | 3 | 0.0 | 349909 | Master |
8 | 8 | 27.000000 | NaN | S | 11.1333 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 2 | 9 | 3 | female | 0 | 1.0 | 347742 | Mrs |
9 | 9 | 14.000000 | NaN | C | 30.0708 | Nasser, Mrs. Nicholas (Adele Achem) | 0 | 10 | 2 | female | 1 | 1.0 | 237736 | Mrs |
10 | 10 | 4.000000 | G6 | S | 16.7000 | Sandstrom, Miss. Marguerite Rut | 1 | 11 | 3 | female | 1 | 1.0 | PP 9549 | Miss |
11 | 11 | 58.000000 | C103 | S | 26.5500 | Bonnell, Miss. Elizabeth | 0 | 12 | 1 | female | 0 | 1.0 | 113783 | Miss |
12 | 12 | 20.000000 | NaN | S | 8.0500 | Saundercock, Mr. William Henry | 0 | 13 | 3 | male | 0 | 0.0 | A/5. 2151 | Mr |
13 | 13 | 39.000000 | NaN | S | 31.2750 | Andersson, Mr. Anders Johan | 5 | 14 | 3 | male | 1 | 0.0 | 347082 | Mr |
14 | 14 | 14.000000 | NaN | S | 7.8542 | Vestrom, Miss. Hulda Amanda Adolfina | 0 | 15 | 3 | female | 0 | 0.0 | 350406 | Miss |
15 | 15 | 55.000000 | NaN | S | 16.0000 | Hewlett, Mrs. (Mary D Kingcome) | 0 | 16 | 2 | female | 0 | 1.0 | 248706 | Mrs |
16 | 16 | 2.000000 | NaN | Q | 29.1250 | Rice, Master. Eugene | 1 | 17 | 3 | male | 4 | 0.0 | 382652 | Master |
17 | 17 | 31.563774 | NaN | S | 13.0000 | Williams, Mr. Charles Eugene | 0 | 18 | 2 | male | 0 | 1.0 | 244373 | Mr |
18 | 18 | 31.000000 | NaN | S | 18.0000 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | 0 | 19 | 3 | female | 1 | 0.0 | 345763 | Mrs |
19 | 19 | 33.760194 | NaN | C | 7.2250 | Masselmani, Mrs. Fatima | 0 | 20 | 3 | female | 0 | 1.0 | 2649 | Mrs |
20 | 20 | 35.000000 | NaN | S | 26.0000 | Fynney, Mr. Joseph J | 0 | 21 | 2 | male | 0 | 0.0 | 239865 | Mr |
21 | 21 | 34.000000 | D56 | S | 13.0000 | Beesley, Mr. Lawrence | 0 | 22 | 2 | male | 0 | 1.0 | 248698 | Mr |
22 | 22 | 15.000000 | NaN | Q | 8.0292 | McGowan, Miss. Anna "Annie" | 0 | 23 | 3 | female | 0 | 1.0 | 330923 | Miss |
23 | 23 | 28.000000 | A6 | S | 35.5000 | Sloper, Mr. William Thompson | 0 | 24 | 1 | male | 0 | 1.0 | 113788 | Mr |
24 | 24 | 8.000000 | NaN | S | 21.0750 | Palsson, Miss. Torborg Danira | 1 | 25 | 3 | female | 3 | 0.0 | 349909 | Miss |
25 | 25 | 38.000000 | NaN | S | 31.3875 | Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... | 5 | 26 | 3 | female | 1 | 1.0 | 347077 | Mrs |
26 | 26 | 31.563774 | NaN | C | 7.2250 | Emir, Mr. Farred Chehab | 0 | 27 | 3 | male | 0 | 0.0 | 2631 | Mr |
27 | 27 | 19.000000 | C23 C25 C27 | S | 263.0000 | Fortune, Mr. Charles Alexander | 2 | 28 | 1 | male | 3 | 0.0 | 19950 | Mr |
28 | 28 | 25.455752 | NaN | Q | 7.8792 | O'Dwyer, Miss. Ellen "Nellie" | 0 | 29 | 3 | female | 0 | 1.0 | 330959 | Miss |
29 | 29 | 31.563774 | NaN | S | 7.8958 | Todoroff, Mr. Lalio | 0 | 30 | 3 | male | 0 | 0.0 | 349216 | Mr |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1279 | 388 | 21.000000 | NaN | Q | 7.7500 | Canavan, Mr. Patrick | 0 | 1280 | 3 | male | 0 | NaN | 364858 | Mr |
1280 | 389 | 6.000000 | NaN | S | 21.0750 | Palsson, Master. Paul Folke | 1 | 1281 | 3 | male | 3 | NaN | 349909 | Miss |
1281 | 390 | 23.000000 | B24 | S | 93.5000 | Payne, Mr. Vivian Ponsonby | 0 | 1282 | 1 | male | 0 | NaN | 12749 | Mr |
1282 | 391 | 51.000000 | D28 | S | 39.4000 | Lines, Mrs. Ernest H (Elizabeth Lindsey James) | 1 | 1283 | 1 | female | 0 | NaN | PC 17592 | Mr |
1283 | 392 | 13.000000 | NaN | S | 20.2500 | Abbott, Master. Eugene Joseph | 2 | 1284 | 3 | male | 0 | NaN | C.A. 2673 | Mr |
1284 | 393 | 47.000000 | NaN | S | 10.5000 | Gilbert, Mr. William | 0 | 1285 | 2 | male | 0 | NaN | C.A. 30769 | Miss |
1285 | 394 | 29.000000 | NaN | S | 22.0250 | Kink-Heilmann, Mr. Anton | 1 | 1286 | 3 | male | 3 | NaN | 315153 | Mrs |
1286 | 395 | 18.000000 | C31 | S | 60.0000 | Smith, Mrs. Lucien Philip (Mary Eloise Hughes) | 0 | 1287 | 1 | female | 1 | NaN | 13695 | Mr |
1287 | 396 | 24.000000 | NaN | Q | 7.2500 | Colbert, Mr. Patrick | 0 | 1288 | 3 | male | 0 | NaN | 371109 | Miss |
1288 | 397 | 48.000000 | B41 | C | 79.2000 | Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... | 1 | 1289 | 1 | female | 1 | NaN | 13567 | Mr |
1289 | 398 | 22.000000 | NaN | S | 7.7750 | Larsson-Rondberg, Mr. Edvard A | 0 | 1290 | 3 | male | 0 | NaN | 347065 | Dr |
1290 | 399 | 31.000000 | NaN | Q | 7.7333 | Conlon, Mr. Thomas Henry | 0 | 1291 | 3 | male | 0 | NaN | 21332 | Mrs |
1291 | 400 | 30.000000 | C7 | S | 164.8667 | Bonnell, Miss. Caroline | 0 | 1292 | 1 | female | 0 | NaN | 36928 | Mr |
1292 | 401 | 38.000000 | NaN | S | 21.0000 | Gale, Mr. Harry | 0 | 1293 | 2 | male | 1 | NaN | 28664 | Mr |
1293 | 402 | 22.000000 | NaN | C | 59.4000 | Gibson, Miss. Dorothy Winifred | 1 | 1294 | 1 | female | 0 | NaN | 112378 | Miss |
1294 | 403 | 17.000000 | NaN | S | 47.1000 | Carrau, Mr. Jose Pedro | 0 | 1295 | 1 | male | 0 | NaN | 113059 | Mr |
1295 | 404 | 43.000000 | D40 | C | 27.7208 | Frauenthal, Mr. Isaac Gerald | 0 | 1296 | 1 | male | 1 | NaN | 17765 | Miss |
1296 | 405 | 20.000000 | D38 | C | 13.8625 | Nourney, Mr. Alfred (Baron von Drachstedt")" | 0 | 1297 | 2 | male | 0 | NaN | SC/PARIS 2166 | Mr |
1297 | 406 | 23.000000 | NaN | S | 10.5000 | Ware, Mr. William Jeffery | 0 | 1298 | 2 | male | 1 | NaN | 28666 | Mr |
1298 | 407 | 50.000000 | C80 | C | 211.5000 | Widener, Mr. George Dunton | 1 | 1299 | 1 | male | 1 | NaN | 113503 | Master |
1299 | 408 | 31.563774 | NaN | Q | 7.7208 | Riordan, Miss. Johanna Hannah"" | 0 | 1300 | 3 | female | 0 | NaN | 334915 | Mr |
1300 | 409 | 3.000000 | NaN | S | 13.7750 | Peacock, Miss. Treasteall | 1 | 1301 | 3 | female | 1 | NaN | SOTON/O.Q. 3101315 | Miss |
1301 | 410 | 31.563774 | NaN | Q | 7.7500 | Naughton, Miss. Hannah | 0 | 1302 | 3 | female | 0 | NaN | 365237 | Mr |
1302 | 411 | 37.000000 | C78 | Q | 90.0000 | Minahan, Mrs. William Edward (Lillian E Thorpe) | 0 | 1303 | 1 | female | 1 | NaN | 19928 | Mr |
1303 | 412 | 28.000000 | NaN | S | 7.7750 | Henriksson, Miss. Jenny Lovisa | 0 | 1304 | 3 | female | 0 | NaN | 347086 | Miss |
1304 | 413 | 31.563774 | NaN | S | 8.0500 | Spector, Mr. Woolf | 0 | 1305 | 3 | male | 0 | NaN | A.5. 3236 | Mr |
1305 | 414 | 39.000000 | C105 | C | 108.9000 | Oliva y Ocana, Dona. Fermina | 0 | 1306 | 1 | female | 0 | NaN | PC 17758 | Mr |
1306 | 415 | 38.500000 | NaN | S | 7.2500 | Saether, Mr. Simon Sivertsen | 0 | 1307 | 3 | male | 0 | NaN | SOTON/O.Q. 3101262 | Mrs |
1307 | 416 | 33.760194 | NaN | S | 8.0500 | Ware, Mr. Frederick | 0 | 1308 | 3 | male | 0 | NaN | 359309 | Mrs |
1308 | 417 | 25.455752 | NaN | C | 22.3583 | Peter, Master. Michael J | 1 | 1309 | 3 | male | 1 | NaN | 2668 | Miss |
1309 rows × 14 columns
#the name strip above can be done in one line:
#data['title'] = data.Name.map(lambda x: x.split(',')[1].split('.')[0].strip())
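The one-liner is not just shorter, it is also safer here. After the concat, data has a non-unique index (0-890 followed by 0-417), so building the titles in a separate, freshly-indexed DataFrame and assigning it back can misalign rows; a few of the titles in the test half of the table above look suspect for exactly this reason. Series.map preserves data's own index, so a sketch of the safer version is:
#sketch: Series.map keeps data's original (non-unique) index, so each title
#stays attached to the row its name came from
data['title'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())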
Feature Engineering
Step 3.5: Add or remove some features based on whether they are going to matter in our model.
data.Parch.describe()
#I don't know what this feature adds, and the std is higher than the mean, which makes me think it's a bad predictor of anything
count 1309.000000
mean 0.385027
std 0.865560
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 9.000000
Name: Parch, dtype: float64
#Filling the 2 Embarked null values with the most common value (Do I even need this column?)
#data.Embarked.fillna('S', inplace = True)
#Create dummy variables for the categoricals in the dataset
Pclassdumm = pd.get_dummies(data['Pclass'], drop_first = True)
data = pd.concat([data, Pclassdumm], axis= 1)
data.drop('Pclass', inplace = True, axis= 1)
sexdumm = pd.get_dummies(data['Sex'], drop_first = True)
data = pd.concat([data, sexdumm], axis= 1)
data.drop('Sex', inplace = True, axis=1)
#Do I even need this?
#Embardumm = pd.get_dummies(data['Embarked'], drop_first = True)
#data = pd.concat([data, Embardumm], axis= 1)
#data.drop('Embarked', inplace = True, axis=1)
#Drop all the columns we don't need (including Parch, which the describe above suggested was a weak predictor)
data.drop(['Name', 'PassengerId', 'title', 'Cabin', 'Ticket', 'Embarked', 'SibSp', 'Fare', 'Parch'], inplace = True, axis = 1)
data['Children'] = data['Age'] < 16
data['Children'].head()
0 False
1 False
2 False
3 False
4 False
Name: Children, dtype: bool
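If you prefer an explicit 0/1 column over a boolean (a matter of taste; scikit-learn treats booleans as numeric anyway), a minimal sketch:
#sketch: cast the boolean child flag to an explicit 0/1 integer column
data['Children'] = (data['Age'] < 16).astype(int)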
data.shape
(1309, 6)
data.describe()
 | Age | Survived | 2 | 3 | male
---|---|---|---|---|---
count | 1309.000000 | 891.000000 | 1309.000000 | 1309.000000 | 1309.000000 |
mean | 29.930167 | 0.383838 | 0.211612 | 0.541635 | 0.644003 |
std | 12.990814 | 0.486592 | 0.408607 | 0.498454 | 0.478997 |
min | 0.170000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 22.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
75% | 35.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 |
max | 80.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Training the Model
Step 4: Since we’re done changing around our features and cleaning our data, we can now re-split the data into training and test sets, run our assessment function to determine which would be the best predictor, and then predict on the test data.
#re-split data into our TRAINING SET so that we can use it to assess
train = data[data['Survived'].notnull()]
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age 891 non-null float64
Survived 891 non-null float64
2 891 non-null uint8
3 891 non-null uint8
male 891 non-null uint8
Children 891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
#split the 'Survived' column off of the TRAINING set so that we can assess our predictions
target = train[['Survived']]
features = train.drop('Survived', axis= 1)
features.shape, target.shape
((891, 5), (891, 1))
features.isnull().sum()
Age 0
2 0
3 0
male 0
Children 0
dtype: int64
On previous iterations, my model was performing at about 77.5% accuracy (with the best performance at 79%). It has come to my attention that this may be due to overfitting, since this is a smaller dataset. I will use train_test_split to try to reduce this error.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = .2, random_state = 42)
#adding the train_test_split decreased my score to 75%
tDMassess(x_train, y_train)
(Each fit emits a DataConversionWarning because y_train is a single-column DataFrame rather than the 1-D array scikit-learn expects; ravelling the labels, as sketched below, silences it.)
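A minimal sketch of the ravelled call (same function and data; only the shape of y changes):
#sketch: pass the labels as a flat 1-D array to avoid the DataConversionWarning
tDMassess(x_train, y_train.values.ravel())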
 | Accuracy | Precision | Recall | F1
---|---|---|---|---
Gradient Boosting | 0.852528 | 0.872146 | 0.712687 | 0.784394 |
Random Forest | 0.875000 | 0.874477 | 0.779851 | 0.824458 |
Decision Tree | 0.877809 | 0.895197 | 0.764925 | 0.824950 |
Logistic Regression | 0.789326 | 0.739837 | 0.679104 | 0.708171 |
K Nearest | 0.844101 | 0.825726 | 0.742537 | 0.781925 |
GaussianNB | 0.785112 | 0.703180 | 0.742537 | 0.722323 |
BernoulliNB | 0.799157 | 0.747036 | 0.705224 | 0.725528 |
Based on these metrics, it seems like Gradient Boosting or Random Forest would be the best option to predict with here.
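One caveat: tDMassess scores each model on the same rows it was fit on, so the table above reflects training-set performance. A minimal sketch (assuming the fitted algorithms list and the held-out split from above) of checking generalization:
#sketch: score the already-fitted models on the held-out 20% split
for model, name in zip(algorithms, names):
    score = accuracy_score(y_test.values.ravel(), model.predict(x_test))
    print(name, round(score, 3))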
#Since we found that GB would be the best model, we can re-split our ORIGINAL, FULL dataset into the training and testing sets
training = data[:891]
testing = data[891:]
testing.describe()
 | Age | Survived | 2 | 3 | male
---|---|---|---|---|---
count | 418.000000 | 0.0 | 418.000000 | 418.000000 | 418.000000 |
mean | 30.223491 | NaN | 0.222488 | 0.521531 | 0.636364 |
std | 12.757613 | NaN | 0.416416 | 0.500135 | 0.481622 |
min | 0.170000 | NaN | 0.000000 | 0.000000 | 0.000000 |
25% | 22.625000 | NaN | 0.000000 | 0.000000 | 0.000000 |
50% | 30.000000 | NaN | 0.000000 | 1.000000 | 1.000000 |
75% | 36.000000 | NaN | 0.000000 | 1.000000 | 1.000000 |
max | 76.000000 | NaN | 1.000000 | 1.000000 | 1.000000 |
#remove this column since it's all nulls; we will be replacing it with our predictions soon
testing.drop('Survived', inplace = True, axis= 1)
C:\Users\ckoni\Anaconda3\lib\site-packages\pandas\core\frame.py:3694: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
errors=errors)
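The SettingWithCopyWarning appears because testing is a slice of data. A minimal sketch of one way to avoid it, by taking an explicit copy before dropping:
#sketch: slice with an explicit copy so the drop acts on an independent frame
testing = data[891:].copy()
testing = testing.drop('Survived', axis=1)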
training.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age 891 non-null float64
Survived 891 non-null float64
2 891 non-null uint8
3 891 non-null uint8
male 891 non-null uint8
Children 891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
testing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 5 columns):
Age 418 non-null float64
2 418 non-null uint8
3 418 non-null uint8
male 418 non-null uint8
Children 418 non-null bool
dtypes: bool(1), float64(1), uint8(3)
memory usage: 8.2 KB
#Use Gradient Boosting to predict on the TESTING set and save the result as 'predict'
predict = gbc.predict(testing)
#predict = r.predict(testing)
#Using Random Forest got me a 67% accuracy rating... 12% lower than my best with GB
#add the predictions back as the Survived values in the original test DataFrame (the column we dropped from 'testing' above)
test["Survived"] = predict.astype('int')
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 418 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
Survived 418 non-null int32
dtypes: float64(2), int32(1), int64(4), object(5)
memory usage: 37.6+ KB
#Kaggle only needs a DataFrame containing the PassengerId and our predicted values
test = test[['PassengerId', 'Survived']]
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId 418 non-null int64
Survived 418 non-null int32
dtypes: int32(1), int64(1)
memory usage: 5.0 KB
#write to a csv file so we can upload to Kaggle
test.to_csv('titanic_predict.csv', index = False)
test.describe()
 | PassengerId | Survived
---|---|---
count | 418.000000 | 418.000000 |
mean | 1100.500000 | 0.325359 |
std | 120.810458 | 0.469070 |
min | 892.000000 | 0.000000 |
25% | 996.250000 | 0.000000 |
50% | 1100.500000 | 0.000000 |
75% | 1204.750000 | 1.000000 |
max | 1309.000000 | 1.000000 |