Passengers on the Titanic: predict who will survive

Step 1: Import libraries, define testing function, and read in data

#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
#define classification algorithms
gbc = GradientBoostingClassifier()
r = RandomForestClassifier()
d = DecisionTreeClassifier()
l = LogisticRegression()
k = KNeighborsClassifier()
g = GaussianNB()
b = BernoulliNB()

#list of algorithms and names for our function later on
algorithms = [gbc, r, d, l, k, g, b]
names = ['Gradient Boosting', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'K Nearest', 'GaussianNB', 'BernoulliNB']
#define the function that we will use to compare classification algorithms
def tDMassess(X, y, algorithms= algorithms, names = names):
    #flatten a column-vector y up front to avoid sklearn's DataConversionWarning
    y = np.ravel(y)
    #fit every algorithm on the same data
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(X, y)
    #score each fitted model; predictions are computed once per model and reused for all four metrics
    accuracy, precision, recall, f1 = [], [], [], []
    for model in algorithms:
        predictions = model.predict(X)
        accuracy.append(accuracy_score(y, predictions))
        precision.append(precision_score(y, predictions))
        recall.append(recall_score(y, predictions))
        f1.append(f1_score(y, predictions))
    #assemble one row of metrics per algorithm
    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index= names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1
    return metrics
training = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\train.csv')
test = pd.read_csv('C:\\Users\\ckoni\\Desktop\\DevMastersWork\\Day 6 Files\\all\\test.csv')

training.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Step 2: Do some preliminary EDA to see how the data will need to be cleaned and augmented

test.isnull().sum()
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
#We will handle the rest of the nulls later, but this Fare one we can take care of easily now.
test[test.Fare.isnull()]
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 NaN NaN S
test[test['Pclass'] == 3].mean()
PassengerId    1094.178899
Pclass            3.000000
Age              24.027945
SibSp             0.463303
Parch             0.417431
Fare             12.459678
dtype: float64
test.loc[152, 'Fare'] = 12.459678
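As a side note, the fill value above is the third-class mean fare copied by hand from the previous cell. A sketch of doing the same fill programmatically (using the median, which is a bit more robust for a fare distribution this skewed) might look like:

#compute the fill value from third-class fares instead of hardcoding it
fare_fill = test.loc[test['Pclass'] == 3, 'Fare'].median()
test['Fare'] = test['Fare'].fillna(fare_fill)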
#combine the data for feature engineering (sort=True keeps the alphabetical column order seen in the outputs below and silences pandas' FutureWarning)
data = pd.concat([training, test], sort=True)
data.shape
(1309, 12)
data.describe()
Age Fare Parch PassengerId Pclass SibSp Survived
count 1046.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 891.000000
mean 29.881138 33.279562 0.385027 655.000000 2.294882 0.498854 0.383838
std 14.413493 51.742084 0.865560 378.020061 0.837836 1.041658 0.486592
min 0.170000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000
25% 21.000000 7.895800 0.000000 328.000000 2.000000 0.000000 0.000000
50% 28.000000 14.454200 0.000000 655.000000 3.000000 0.000000 0.000000
75% 39.000000 31.275000 0.000000 982.000000 3.000000 1.000000 1.000000
max 80.000000 512.329200 9.000000 1309.000000 3.000000 8.000000 1.000000
data.isnull().sum()
Age             263
Cabin          1014
Embarked          2
Fare              0
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64
data.index
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            408, 409, 410, 411, 412, 413, 414, 415, 416, 417],
           dtype='int64', length=1309)
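Note the duplicate labels: concat stacked the two frames without renumbering, so the indices 0-417 appear twice. Label-based operations (.loc, column assignment) can silently misalign on an index like this. If you want to avoid the pitfall entirely, one line does it (not part of the original run; the positional slicing used later is unaffected either way):

#renumber so every row has a unique label
data = data.reset_index(drop=True)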

Visual EDA

Step 2 (continued): Personally, I'm a visual learner, so I like to look at graphs of possible key features to get a sense of what affects passenger survival. Below I have graphed features that I suspected might play a major role in our target variable (Pclass, Age), along with features I could not get a good read on from the info/describe output (SibSp, Fare).

#I used swarm plots because they're good at showing data density and spread when the dataset isn't especially large

sns.set(style="darkgrid")
_ = plt.figure(figsize= (13, 9))
_ = plt.subplot(2,3,1)
_ = sns.swarmplot(x="Pclass", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,2)
_ = sns.swarmplot(x="Pclass", y="Parch", hue="Survived", data= data)

_ = plt.subplot(2,3,3)
_ = sns.swarmplot(x="Pclass", y="SibSp", hue="Survived", data= data)

_ = plt.subplot(2,3,4)
_ = sns.swarmplot(x="Sex", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,5)
_ = sns.swarmplot(x="Embarked", y="Age", hue="Survived", data= data)

_ = plt.subplot(2,3,6)
_ = sns.swarmplot(x="Sex", y="Fare", hue="Survived", data= data)

plt.tight_layout()

[Figure: 2x3 grid of swarm plots, colored by Survived: Age/Parch/SibSp vs. Pclass, Age vs. Sex, Age vs. Embarked, and Fare vs. Sex]

Based on these graphs, only a few features seem to have an effect on survivability: Age, Pclass, Sex, and possibly Parch (the number of parents/children aboard may matter).

The features we looked at that have very little or no effect on survivability: Fare (Pclass is the better indicator) and SibSp. Ticket, Cabin number, and port of embarkation don't seem to matter at all.

Data Cleaning

Step 3: Time to do everyone's favorite task, data cleaning! There are a lot of null values in the Age column, one of the key features we identified, so we have to fill them somehow.

#Take a look at the null Age values just to see what we're working with
data[data['Age'].isnull()]
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket
5 NaN NaN Q 8.4583 Moran, Mr. James 0 6 3 male 0 0.0 330877
17 NaN NaN S 13.0000 Williams, Mr. Charles Eugene 0 18 2 male 0 1.0 244373
19 NaN NaN C 7.2250 Masselmani, Mrs. Fatima 0 20 3 female 0 1.0 2649
26 NaN NaN C 7.2250 Emir, Mr. Farred Chehab 0 27 3 male 0 0.0 2631
28 NaN NaN Q 7.8792 O'Dwyer, Miss. Ellen "Nellie" 0 29 3 female 0 1.0 330959
29 NaN NaN S 7.8958 Todoroff, Mr. Lalio 0 30 3 male 0 0.0 349216
31 NaN B78 C 146.5208 Spencer, Mrs. William Augustus (Marie Eugenie) 0 32 1 female 1 1.0 PC 17569
32 NaN NaN Q 7.7500 Glynn, Miss. Mary Agatha 0 33 3 female 0 1.0 335677
36 NaN NaN C 7.2292 Mamee, Mr. Hanna 0 37 3 male 0 1.0 2677
42 NaN NaN C 7.8958 Kraeff, Mr. Theodor 0 43 3 male 0 0.0 349253
45 NaN NaN S 8.0500 Rogers, Mr. William John 0 46 3 male 0 0.0 S.C./A.4. 23567
46 NaN NaN Q 15.5000 Lennon, Mr. Denis 0 47 3 male 1 0.0 370371
47 NaN NaN Q 7.7500 O'Driscoll, Miss. Bridget 0 48 3 female 0 1.0 14311
48 NaN NaN C 21.6792 Samaan, Mr. Youssef 0 49 3 male 2 0.0 2662
55 NaN C52 S 35.5000 Woolner, Mr. Hugh 0 56 1 male 0 1.0 19947
64 NaN NaN C 27.7208 Stewart, Mr. Albert A 0 65 1 male 0 0.0 PC 17605
65 NaN NaN C 15.2458 Moubarek, Master. Gerios 1 66 3 male 1 1.0 2661
76 NaN NaN S 7.8958 Staneff, Mr. Ivan 0 77 3 male 0 0.0 349208
77 NaN NaN S 8.0500 Moutal, Mr. Rahamin Haim 0 78 3 male 0 0.0 374746
82 NaN NaN Q 7.7875 McDermott, Miss. Brigdet Delia 0 83 3 female 0 1.0 330932
87 NaN NaN S 8.0500 Slocovski, Mr. Selman Francis 0 88 3 male 0 0.0 SOTON/OQ 392086
95 NaN NaN S 8.0500 Shorney, Mr. Charles Joseph 0 96 3 male 0 0.0 374910
101 NaN NaN S 7.8958 Petroff, Mr. Pastcho ("Pentcho") 0 102 3 male 0 0.0 349215
107 NaN NaN S 7.7750 Moss, Mr. Albert Johan 0 108 3 male 0 1.0 312991
109 NaN NaN Q 24.1500 Moran, Miss. Bertha 0 110 3 female 1 1.0 371110
121 NaN NaN S 8.0500 Moore, Mr. Leonard Charles 0 122 3 male 0 0.0 A4. 54510
126 NaN NaN Q 7.7500 McMahon, Mr. Martin 0 127 3 male 0 0.0 370372
128 NaN F E69 C 22.3583 Peter, Miss. Anna 1 129 3 female 1 1.0 2668
140 NaN NaN C 15.2458 Boulos, Mrs. Joseph (Sultana) 2 141 3 female 0 0.0 2678
154 NaN NaN S 7.3125 Olsen, Mr. Ole Martin 0 155 3 male 0 0.0 Fa 265302
... ... ... ... ... ... ... ... ... ... ... ... ...
268 NaN NaN S 8.0500 Howard, Miss. May Elizabeth 0 1160 3 female 0 NaN A. 2. 39186
271 NaN NaN Q 7.7500 Fox, Mr. Patrick 0 1163 3 male 0 NaN 368573
273 NaN NaN Q 15.5000 Lennon, Miss. Mary 0 1165 3 female 1 NaN 370371
274 NaN NaN C 7.2250 Saade, Mr. Jean Nassr 0 1166 3 male 0 NaN 2676
282 NaN NaN Q 7.7500 Fleming, Miss. Honora 0 1174 3 female 0 NaN 364859
286 NaN NaN S 7.2500 Franklin, Mr. Charles (Charles Fardon) 0 1178 3 male 0 NaN SOTON/O.Q. 3101314
288 NaN F E46 C 7.2292 Mardirosian, Mr. Sarkis 0 1180 3 male 0 NaN 2655
289 NaN NaN S 8.0500 Ford, Mr. Arthur 0 1181 3 male 0 NaN A/5 1478
290 NaN NaN S 39.6000 Rheims, Mr. George Alexander Lucien 0 1182 1 male 0 NaN PC 17607
292 NaN NaN C 7.2292 Nasr, Mr. Mustafa 0 1184 3 male 0 NaN 2652
297 NaN NaN C 21.6792 Samaan, Mr. Hanna 0 1189 3 male 2 NaN 2662
301 NaN D C 15.0458 Malachard, Mr. Noel 0 1193 2 male 0 NaN 237735
304 NaN NaN Q 7.7500 McCarthy, Miss. Catherine Katie"" 0 1196 3 female 0 NaN 383123
312 NaN NaN S 7.5750 Sadowitz, Mr. Harry 0 1204 3 male 0 NaN LP 1588
332 NaN NaN C 7.2250 Thomas, Mr. Tannous 0 1224 3 male 0 NaN 2684
339 NaN NaN C 7.2292 Betros, Master. Seman 0 1231 3 male 0 NaN 2622
342 NaN NaN S 69.5500 Sage, Mr. John George 9 1234 3 male 1 NaN CA. 2343
344 NaN NaN S 14.5000 van Billiard, Master. James William 1 1236 3 male 1 NaN A/5. 851
357 NaN NaN S 7.8792 Lockyer, Mr. Edward 0 1249 3 male 0 NaN 1222
358 NaN NaN Q 7.7500 O'Keefe, Mr. Patrick 0 1250 3 male 0 NaN 368402
365 NaN NaN S 69.5500 Sage, Mrs. John (Annie Bullen) 9 1257 3 female 1 NaN CA. 2343
366 NaN NaN C 14.4583 Caram, Mr. Joseph 0 1258 3 male 1 NaN 2689
380 NaN NaN Q 7.7500 O'Connor, Mr. Patrick 0 1272 3 male 0 NaN 366713
382 NaN NaN S 14.5000 Risien, Mrs. Samuel (Emma) 0 1274 3 female 0 NaN 364498
384 NaN NaN S 12.8750 Wheeler, Mr. Edwin Frederick"" 0 1276 2 male 0 NaN SC/PARIS 2159
408 NaN NaN Q 7.7208 Riordan, Miss. Johanna Hannah"" 0 1300 3 female 0 NaN 334915
410 NaN NaN Q 7.7500 Naughton, Miss. Hannah 0 1302 3 female 0 NaN 365237
413 NaN NaN S 8.0500 Spector, Mr. Woolf 0 1305 3 male 0 NaN A.5. 3236
416 NaN NaN S 8.0500 Ware, Mr. Frederick 0 1308 3 male 0 NaN 359309
417 NaN NaN C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668

263 rows × 12 columns

# to fill age more accurately, split each name on the "," (surname | rest), then on the "." (title | given names)
# this distinguishes 'Master' from 'Mr' and 'Miss' from 'Mrs'
names1 = pd.DataFrame(data['Name'].str.split(',', n=1).tolist(), columns = ['surname', 'given'])
names2 = pd.DataFrame(names1['given'].str.split('.', n=1).tolist(), columns = ['prefix', 'given'])
#assign by position (.values): after the concat, data's index holds duplicate labels (0-417 appear twice),
#so a label-aligned assignment would silently pair test rows with training-row titles
#(which is why some titles look wrong for the test rows in the original output below)
data['title'] = names2['prefix'].str.strip().values
data.head()
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket title
0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171 Mr
1 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599 Mrs
2 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282 Miss
3 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803 Mrs
4 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450 Mr
#fill the null values in the 'Age' column with the mean age for each title
data['Age'] = data.groupby('title')['Age'].transform(lambda x: x.fillna(x.mean()))
data.Age.isnull().sum()
0
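It's worth eyeballing the per-title means that were just used as fill values; a quick check along these lines:

#the mean age per title, i.e. the values that filled the gaps above
data.groupby('title')['Age'].mean().sort_values()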
#peek at the combined frame (reset_index here is for display only; it does not modify data)
data.reset_index()
index Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket title
0 0 22.000000 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171 Mr
1 1 38.000000 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599 Mrs
2 2 26.000000 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282 Miss
3 3 35.000000 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803 Mrs
4 4 35.000000 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450 Mr
5 5 31.563774 NaN Q 8.4583 Moran, Mr. James 0 6 3 male 0 0.0 330877 Mr
6 6 54.000000 E46 S 51.8625 McCarthy, Mr. Timothy J 0 7 1 male 0 0.0 17463 Mr
7 7 2.000000 NaN S 21.0750 Palsson, Master. Gosta Leonard 1 8 3 male 3 0.0 349909 Master
8 8 27.000000 NaN S 11.1333 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2 9 3 female 0 1.0 347742 Mrs
9 9 14.000000 NaN C 30.0708 Nasser, Mrs. Nicholas (Adele Achem) 0 10 2 female 1 1.0 237736 Mrs
10 10 4.000000 G6 S 16.7000 Sandstrom, Miss. Marguerite Rut 1 11 3 female 1 1.0 PP 9549 Miss
11 11 58.000000 C103 S 26.5500 Bonnell, Miss. Elizabeth 0 12 1 female 0 1.0 113783 Miss
12 12 20.000000 NaN S 8.0500 Saundercock, Mr. William Henry 0 13 3 male 0 0.0 A/5. 2151 Mr
13 13 39.000000 NaN S 31.2750 Andersson, Mr. Anders Johan 5 14 3 male 1 0.0 347082 Mr
14 14 14.000000 NaN S 7.8542 Vestrom, Miss. Hulda Amanda Adolfina 0 15 3 female 0 0.0 350406 Miss
15 15 55.000000 NaN S 16.0000 Hewlett, Mrs. (Mary D Kingcome) 0 16 2 female 0 1.0 248706 Mrs
16 16 2.000000 NaN Q 29.1250 Rice, Master. Eugene 1 17 3 male 4 0.0 382652 Master
17 17 31.563774 NaN S 13.0000 Williams, Mr. Charles Eugene 0 18 2 male 0 1.0 244373 Mr
18 18 31.000000 NaN S 18.0000 Vander Planke, Mrs. Julius (Emelia Maria Vande... 0 19 3 female 1 0.0 345763 Mrs
19 19 33.760194 NaN C 7.2250 Masselmani, Mrs. Fatima 0 20 3 female 0 1.0 2649 Mrs
20 20 35.000000 NaN S 26.0000 Fynney, Mr. Joseph J 0 21 2 male 0 0.0 239865 Mr
21 21 34.000000 D56 S 13.0000 Beesley, Mr. Lawrence 0 22 2 male 0 1.0 248698 Mr
22 22 15.000000 NaN Q 8.0292 McGowan, Miss. Anna "Annie" 0 23 3 female 0 1.0 330923 Miss
23 23 28.000000 A6 S 35.5000 Sloper, Mr. William Thompson 0 24 1 male 0 1.0 113788 Mr
24 24 8.000000 NaN S 21.0750 Palsson, Miss. Torborg Danira 1 25 3 female 3 0.0 349909 Miss
25 25 38.000000 NaN S 31.3875 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... 5 26 3 female 1 1.0 347077 Mrs
26 26 31.563774 NaN C 7.2250 Emir, Mr. Farred Chehab 0 27 3 male 0 0.0 2631 Mr
27 27 19.000000 C23 C25 C27 S 263.0000 Fortune, Mr. Charles Alexander 2 28 1 male 3 0.0 19950 Mr
28 28 25.455752 NaN Q 7.8792 O'Dwyer, Miss. Ellen "Nellie" 0 29 3 female 0 1.0 330959 Miss
29 29 31.563774 NaN S 7.8958 Todoroff, Mr. Lalio 0 30 3 male 0 0.0 349216 Mr
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1279 388 21.000000 NaN Q 7.7500 Canavan, Mr. Patrick 0 1280 3 male 0 NaN 364858 Mr
1280 389 6.000000 NaN S 21.0750 Palsson, Master. Paul Folke 1 1281 3 male 3 NaN 349909 Miss
1281 390 23.000000 B24 S 93.5000 Payne, Mr. Vivian Ponsonby 0 1282 1 male 0 NaN 12749 Mr
1282 391 51.000000 D28 S 39.4000 Lines, Mrs. Ernest H (Elizabeth Lindsey James) 1 1283 1 female 0 NaN PC 17592 Mr
1283 392 13.000000 NaN S 20.2500 Abbott, Master. Eugene Joseph 2 1284 3 male 0 NaN C.A. 2673 Mr
1284 393 47.000000 NaN S 10.5000 Gilbert, Mr. William 0 1285 2 male 0 NaN C.A. 30769 Miss
1285 394 29.000000 NaN S 22.0250 Kink-Heilmann, Mr. Anton 1 1286 3 male 3 NaN 315153 Mrs
1286 395 18.000000 C31 S 60.0000 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 0 1287 1 female 1 NaN 13695 Mr
1287 396 24.000000 NaN Q 7.2500 Colbert, Mr. Patrick 0 1288 3 male 0 NaN 371109 Miss
1288 397 48.000000 B41 C 79.2000 Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... 1 1289 1 female 1 NaN 13567 Mr
1289 398 22.000000 NaN S 7.7750 Larsson-Rondberg, Mr. Edvard A 0 1290 3 male 0 NaN 347065 Dr
1290 399 31.000000 NaN Q 7.7333 Conlon, Mr. Thomas Henry 0 1291 3 male 0 NaN 21332 Mrs
1291 400 30.000000 C7 S 164.8667 Bonnell, Miss. Caroline 0 1292 1 female 0 NaN 36928 Mr
1292 401 38.000000 NaN S 21.0000 Gale, Mr. Harry 0 1293 2 male 1 NaN 28664 Mr
1293 402 22.000000 NaN C 59.4000 Gibson, Miss. Dorothy Winifred 1 1294 1 female 0 NaN 112378 Miss
1294 403 17.000000 NaN S 47.1000 Carrau, Mr. Jose Pedro 0 1295 1 male 0 NaN 113059 Mr
1295 404 43.000000 D40 C 27.7208 Frauenthal, Mr. Isaac Gerald 0 1296 1 male 1 NaN 17765 Miss
1296 405 20.000000 D38 C 13.8625 Nourney, Mr. Alfred (Baron von Drachstedt")" 0 1297 2 male 0 NaN SC/PARIS 2166 Mr
1297 406 23.000000 NaN S 10.5000 Ware, Mr. William Jeffery 0 1298 2 male 1 NaN 28666 Mr
1298 407 50.000000 C80 C 211.5000 Widener, Mr. George Dunton 1 1299 1 male 1 NaN 113503 Master
1299 408 31.563774 NaN Q 7.7208 Riordan, Miss. Johanna Hannah"" 0 1300 3 female 0 NaN 334915 Mr
1300 409 3.000000 NaN S 13.7750 Peacock, Miss. Treasteall 1 1301 3 female 1 NaN SOTON/O.Q. 3101315 Miss
1301 410 31.563774 NaN Q 7.7500 Naughton, Miss. Hannah 0 1302 3 female 0 NaN 365237 Mr
1302 411 37.000000 C78 Q 90.0000 Minahan, Mrs. William Edward (Lillian E Thorpe) 0 1303 1 female 1 NaN 19928 Mr
1303 412 28.000000 NaN S 7.7750 Henriksson, Miss. Jenny Lovisa 0 1304 3 female 0 NaN 347086 Miss
1304 413 31.563774 NaN S 8.0500 Spector, Mr. Woolf 0 1305 3 male 0 NaN A.5. 3236 Mr
1305 414 39.000000 C105 C 108.9000 Oliva y Ocana, Dona. Fermina 0 1306 1 female 0 NaN PC 17758 Mr
1306 415 38.500000 NaN S 7.2500 Saether, Mr. Simon Sivertsen 0 1307 3 male 0 NaN SOTON/O.Q. 3101262 Mrs
1307 416 33.760194 NaN S 8.0500 Ware, Mr. Frederick 0 1308 3 male 0 NaN 359309 Mrs
1308 417 25.455752 NaN C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668 Miss

1309 rows × 14 columns

#the same title extraction in one line (map returns a Series indexed like data, so the assignment aligns safely):
#data['title'] = data.Name.map(lambda x: x.split(',')[1].split('.')[0].strip())
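Either way, a frequency check on the extracted titles is a cheap sanity test; rare titles (Dr, Rev, and so on) get their age means estimated from only a handful of rows:

#how many passengers carry each title
data['title'].value_counts()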

Feature Engineering

Step 3.5: Add or remove features based on whether they are likely to matter in our model.

data.Parch.describe()
#I don't know what Parch adds, and the std is higher than the mean, which makes me think it's a bad predictor, so I'll drop it below
count    1309.000000
mean        0.385027
std         0.865560
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         9.000000
Name: Parch, dtype: float64
#Filling the 2 Embarked null values with the most common value (Do I even need this column?)
#data.Embarked.fillna('S', inplace = True)
#Create dummy variables for the categoricals in the dataset
Pclassdumm = pd.get_dummies(data['Pclass'], drop_first = True)
data = pd.concat([data, Pclassdumm], axis= 1)
data.drop('Pclass', inplace = True, axis= 1)
sexdumm = pd.get_dummies(data['Sex'], drop_first = True)
data = pd.concat([data, sexdumm], axis= 1)
data.drop('Sex', inplace = True, axis=1)
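As an aside, get_dummies can encode several columns in one call. A commented-out sketch equivalent to the two blocks above (note the columns would come out named Pclass_2/Pclass_3/Sex_male rather than the bare 2/3/male used below):

#encode both categoricals at once:
#data = pd.get_dummies(data, columns=['Pclass', 'Sex'], drop_first=True)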
#Do I even need this?
#Embardumm = pd.get_dummies(data['Embarked'], drop_first = True)
#data = pd.concat([data, Embardumm], axis= 1)
#data.drop('Embarked', inplace = True, axis=1)
#Drop all the columns we don't need (including Parch, per the note above)
data.drop(['Name', 'PassengerId', 'title', 'Cabin', 'Ticket', 'Embarked', 'SibSp', 'Fare', 'Parch'], inplace = True, axis = 1)
data['Children'] = data['Age'] < 16
data['Children'].head()
0    False
1    False
2    False
3    False
4    False
Name: Children, dtype: bool
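A quick check I'd add to see whether the new flag carries any signal (Survived is NaN for the test rows, and a groupby mean simply skips them):

#survival rate for children vs. everyone else
data.groupby('Children')['Survived'].mean()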
data.shape
(1309, 6)
data.describe()
Age Survived 2 3 male
count 1309.000000 891.000000 1309.000000 1309.000000 1309.000000
mean 29.930167 0.383838 0.211612 0.541635 0.644003
std 12.990814 0.486592 0.408607 0.498454 0.478997
min 0.170000 0.000000 0.000000 0.000000 0.000000
25% 22.000000 0.000000 0.000000 0.000000 0.000000
50% 30.000000 0.000000 0.000000 1.000000 1.000000
75% 35.000000 1.000000 0.000000 1.000000 1.000000
max 80.000000 1.000000 1.000000 1.000000 1.000000

Training the Model

Step 4: Since we’re done changing around our features and cleaning our data, we can now re-split the data into training and test sets, run our assessment function to determine which would be the best predictor, and then predict on the test data.

#re-split data into our TRAINING SET so that we can use it to assess
train = data[data['Survived'].notnull()]
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age         891 non-null float64
Survived    891 non-null float64
2           891 non-null uint8
3           891 non-null uint8
male        891 non-null uint8
Children    891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
#split the 'Survived' column off of the TRAINING set so that we can assess our predictions
target = train[['Survived']]
features = train.drop('Survived', axis= 1)
features.shape, target.shape
((891, 5), (891, 1))
features.isnull().sum()
Age         0
2           0
3           0
male        0
Children    0
dtype: int64

On previous iterations, my model was performing at about 77.5% accuracy (with a best of 79%). It has come to my attention that this may be due to overfitting, since the models were fit and assessed on the same rows of a smallish dataset. I will use train_test_split to hold out some data and try to reduce this error.

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = .2, random_state = 42)
#adding the train_test_split decreased my score to 75%
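A single random split can also be noisy on 891 rows. As an alternative sketch (not part of the original run), k-fold cross-validation averages over several splits and gives a steadier estimate:

from sklearn.model_selection import cross_val_score
#5-fold CV accuracy for the gradient booster on the full training set
print(cross_val_score(gbc, features, np.ravel(target), cv=5).mean())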
tDMassess(x_train, y_train)
Accuracy Precision Recall F1
Gradient Boosting 0.852528 0.872146 0.712687 0.784394
Random Forest 0.875000 0.874477 0.779851 0.824458
Decision Tree 0.877809 0.895197 0.764925 0.824950
Logistic Regression 0.789326 0.739837 0.679104 0.708171
K Nearest 0.844101 0.825726 0.742537 0.781925
GaussianNB 0.785112 0.703180 0.742537 0.722323
BernoulliNB 0.799157 0.747036 0.705224 0.725528

Seems like Gradient Boosting or Random Forest would be the best option to predict here (the Decision Tree's slight edge is likely training-set overfitting).
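Keep in mind these metrics come from predicting on the same rows the models were fit on, which flatters flexible models like decision trees. Since tDMassess leaves the fitted estimators in the algorithms list, a fairer comparison on the held-out 20% is a short loop (a sketch, not in the original run):

#score each fitted model on the held-out split instead of the training rows
for model, name in zip(algorithms, names):
    print(name, accuracy_score(np.ravel(y_test), model.predict(x_test)))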

#Since we found that GB would be the best model, we can re-split our ORIGINAL, FULL dataset into the training and testing sets
training = data[:891]
testing = data[891:].copy()  #.copy() so the drop below doesn't raise a SettingWithCopyWarning
testing.describe()
Age Survived 2 3 male
count 418.000000 0.0 418.000000 418.000000 418.000000
mean 30.223491 NaN 0.222488 0.521531 0.636364
std 12.757613 NaN 0.416416 0.500135 0.481622
min 0.170000 NaN 0.000000 0.000000 0.000000
25% 22.625000 NaN 0.000000 0.000000 0.000000
50% 30.000000 NaN 0.000000 1.000000 1.000000
75% 36.000000 NaN 0.000000 1.000000 1.000000
max 76.000000 NaN 1.000000 1.000000 1.000000
#drop this column since it's all nulls; we will replace it with our predictions soon
testing.drop('Survived', inplace = True, axis= 1)
training.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
Age         891 non-null float64
Survived    891 non-null float64
2           891 non-null uint8
3           891 non-null uint8
male        891 non-null uint8
Children    891 non-null bool
dtypes: bool(1), float64(2), uint8(3)
memory usage: 24.4 KB
testing.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 5 columns):
Age         418 non-null float64
2           418 non-null uint8
3           418 non-null uint8
male        418 non-null uint8
Children    418 non-null bool
dtypes: bool(1), float64(1), uint8(3)
memory usage: 8.2 KB
#Use the Gradient Boosting model (fitted on x_train inside tDMassess) to predict on the TESTING set and save to a variable, predict
predict = gbc.predict(testing)
#predict = r.predict(testing)
#Using Random Forest got me a 67% accuracy rating on Kaggle... 12 points lower than my best with GB
#add the predictions back onto the original test DataFrame as its Survived column
test["Survived"] = predict.astype('int')
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Survived       418 non-null int32
dtypes: float64(2), int32(1), int64(4), object(5)
memory usage: 37.6+ KB
#Kaggle only needs a DataFrame containing the PassengerId and our predicted values
test = test[['PassengerId', 'Survived']]
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int32
dtypes: int32(1), int64(1)
memory usage: 5.0 KB
#write to a csv file so we can upload to Kaggle
test.to_csv('titanic_predict.csv', index = False)
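Reading the file straight back is a cheap way to confirm the submission format before uploading:

#quick check of the file we just wrote
pd.read_csv('titanic_predict.csv').head()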
test.describe()
PassengerId Survived
count 418.000000 418.000000
mean 1100.500000 0.325359
std 120.810458 0.469070
min 892.000000 0.000000
25% 996.250000 0.000000
50% 1100.500000 0.000000
75% 1204.750000 1.000000
max 1309.000000 1.000000