We first load the official Kaggle CSV files into pandas DataFrames.
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
import numpy as np
import pandas as pd
import os

train_data, test_data = None, None
# search common locations for the Kaggle data
paths = ['.', '/kaggle/input', 'data']
for path in paths:
    if os.path.exists(os.path.join(path, "titanic", "train.csv")) and os.path.exists(os.path.join(path, "titanic", "test.csv")):
        train_data = pd.read_csv(os.path.join(path, "titanic", "train.csv"))
        test_data = pd.read_csv(os.path.join(path, "titanic", "test.csv"))
        break
if not isinstance(train_data, pd.DataFrame) or not isinstance(test_data, pd.DataFrame):
    raise FileNotFoundError("Couldn't find csv files in %s" % paths)
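As a quick sanity check (a supplementary step, not part of the original flow), the shapes should match the official Kaggle split of 891 labelled training passengers and 418 unlabelled test passengers:

# expected output: (891, 12) (418, 11)
print(train_data.shape, test_data.shape)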
For a better assessment of our results, we additionally load the fully labelled data set and extract the ground-truth survival labels for the test passengers.
import re

# 'path' still holds the directory found above
if os.path.exists(os.path.join(path, "titanic", "test_labelled.csv")):
    test_labelled = pd.read_csv(os.path.join(path, "titanic", "test_labelled.csv"))
elif os.path.exists(os.path.join(path, "labelled-test-data", "test_labelled.csv")):
    test_labelled = pd.read_csv(os.path.join(path, "labelled-test-data", "test_labelled.csv"))
else:
    raise FileNotFoundError("Couldn't find test_labelled.csv in %s" % path)

# strip quotation marks so passenger names match between both files
test_labelled['name'] = test_labelled['name'].str.replace('"', '', regex=False)
test_data['Name'] = test_data['Name'].str.replace('"', '', regex=False)

# align ground-truth labels with the order of the Kaggle test set via name lookup
test_gt = []
for name in test_data['Name']:
    test_gt.append(int(test_labelled.loc[test_labelled['name'] == name, 'survived'].values[-1]))
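Since the lookup silently takes the last match for duplicate names, a short sanity check is worthwhile (a supplementary sketch, not part of the original flow):

# every Kaggle test passenger should have received exactly one label
assert len(test_gt) == len(test_data)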
By default, Kaggle presents a neat and short solution using a RandomForestClassifier based on only four features, reaching an accuracy of 77.51%, which we aim to improve.
!{sys.executable} -m pip install scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Kaggle's baseline: a random forest on four raw features
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# plot confusion matrix
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
conf_mat = confusion_matrix(y_true=test_gt, y_pred=model.predict(X_test))
ax.matshow(conf_mat, cmap=plt.cm.Blues, alpha=0.3)
# annotate each cell with its count
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax.text(x=j, y=i, s=conf_mat[i, j], va='center', ha='center', size='xx-large')
plt.title('Confusion Matrix', fontsize=18)
plt.xlabel(r'Predictions $\mathbf{\hat{y}}$', fontsize=18)
plt.ylabel(r'Actuals $\mathbf{y}$', fontsize=18)
class_labels = ['Dead', 'Survived']
plt.xticks([0, 1], class_labels)
plt.yticks([0, 1], class_labels)
plt.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
plt.show()
# (TP+TN)/ALL
accuracy = lambda conf_mat: (conf_mat[0, 0]+conf_mat[1, 1])/np.sum(conf_mat) * 100
print('%.2f %% Accuracy' % accuracy(conf_mat))
77.51 % Accuracy
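Beyond accuracy, the same confusion matrix yields per-class metrics. As a supplementary sketch (not part of the original baseline), precision and recall for the Survived class follow the same pattern as the accuracy lambda above:

# precision = TP/(TP+FP), recall = TP/(TP+FN), with 'Survived' as the positive class
precision = lambda cm: cm[1, 1] / (cm[1, 1] + cm[0, 1]) * 100
recall = lambda cm: cm[1, 1] / (cm[1, 1] + cm[1, 0]) * 100
print('%.2f %% Precision, %.2f %% Recall' % (precision(conf_mat), recall(conf_mat)))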
At the beginning, it is important to familiarize oneself with the data. Let's specifically have a look at the columns, which will act as features.
train_data.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
By reviewing the first entries as an excerpt, we observe the following plain features for each passenger: PassengerId (running index), Survived (0 = dead, 1 = survived; the prediction target), Pclass (ticket class 1-3, a proxy for socio-economic status), Name, Sex, Age, SibSp (number of siblings/spouses aboard), Parch (number of parents/children aboard), Ticket (ticket number), Fare, Cabin (cabin number) and Embarked (port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton).
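To complement the preview above, pandas offers a compact overview of data types and non-null counts (a quick supplementary check):

# dtype and non-null count per column
train_data.info()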
Let's get a first impression of data completeness. Counting missing entries reveals sparse columns (e.g. Age and, above all, Cabin).
full_df = pd.concat([train_data, test_data]).drop(['Survived'], axis=1)
pd.concat([
    full_df.isnull().sum().rename('Missing #'),
    (full_df.isnull().sum() * 100 / full_df.shape[0]).rename('Percent %')
], axis=1)
|   | Missing # | Percent % |
|---|---|---|
| PassengerId | 0 | 0.000000 |
| Pclass | 0 | 0.000000 |
| Name | 0 | 0.000000 |
| Sex | 0 | 0.000000 |
| Age | 263 | 20.091673 |
| SibSp | 0 | 0.000000 |
| Parch | 0 | 0.000000 |
| Ticket | 0 | 0.000000 |
| Fare | 1 | 0.076394 |
| Cabin | 1014 | 77.463713 |
| Embarked | 2 | 0.152788 |
Next, we visualize the existing feature data to analyze relations and to form hypotheses about potential underlying concepts.
def bar_chart(feature, df):
    # stacked bar chart of survivors vs. casualties per feature value
    survived = df[df['Survived']==1][feature].value_counts()
    dead = df[df['Survived']==0][feature].value_counts()
    feat_df = pd.DataFrame([survived, dead])
    feat_df.index = ['Survived', 'Dead']
    feat_df.plot(kind='bar', stacked=True, figsize=(10, 5), title=feature)
bar_chart('Pclass', train_data)
bar_chart('Sex', train_data)
bar_chart('SibSp', train_data)
bar_chart('Parch', train_data)
bar_chart('Embarked', train_data)
From the plots above, we get an idea of how much impact class, gender and the number of family members on board (siblings/spouses, parents/children) had on the survival rate. To see how features depend on each other, it is worthwhile to glance at the correlation matrix.
import seaborn as sns

# correlation heatmap of numeric features (numeric_only=True required for pandas >= 2.0)
plt.figure(figsize=(20, 12))
sns.heatmap(train_data.corr(numeric_only=True), annot=True, cmap='coolwarm', linewidths=0.2)
plt.show()
From the correlation matrix, we observe strong correlations between Pclass and Fare as well as between SibSp and Parch, which appear reasonable. These links will be useful for data imputation, as demonstrated in the following feature engineering stage.
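To put numbers on these two links, one can query the pairwise coefficients directly (a supplementary check):

# Pearson correlation of the two strongest feature pairs
print(train_data['Pclass'].corr(train_data['Fare']))   # negative: higher class number, lower fare
print(train_data['SibSp'].corr(train_data['Parch']))   # positive: family features move together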
The major task for a potential improvement of the survival prediction is to extract, transform, reduce or simplify the given passenger data. For that purpose, we make copies of the original data frames, which are subject to change.
# copy data for adjustments, keeping the originals untouched
train_df, test_df = train_data.copy(), test_data.copy()
First, an obvious step is to take the given gender information and convert the original string type to binary values.
# binary mapping
sex_mapping = {"male": 0, "female": 1}
for df in [train_df, test_df]:
    df['Sex'] = df['Sex'].map(sex_mapping)
Passenger names contain rich information encoded as strings. One of the most common approaches is to extract the salutation and re-organize the titles into groups.
for df in [train_df, test_df]:
    # extract the salutation between the comma and the trailing period
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.concat([train_df, test_df])['Title'].value_counts()
Mr          745
Miss        283
Mrs         183
Master       63
Dr           10
Rev           9
Mme           2
Mlle          2
Col           2
Don           2
Major         2
Jonkheer      1
Capt          1
Ms            1
Countess      1
Sir           1
Lady          1
Name: Title, dtype: int64
if False:
    # extract title-survival chance
    title_mapping_probs = {}
    title_keys = pd.concat([train_df, test_df])['Title'].value_counts().index.to_list()
    for title in title_keys:
        title_mapping_probs[title] = train_df[train_df['Title']==title].Survived.mean()
    # account for title "Dona" which occurs in the test data only
    title_mapping_probs['Dona'] = train_df[train_df['Title']=='Don'].Survived.mean()
    title_mapping_orig = {
        "Mr": 0, "Miss": 1, "Mrs": 2,
        "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3,
        "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3, "Capt": 3, "Sir": 3
    }
    title_map_alt = {
        # likely to leave last
        "Mr": 1,
        "Rev": 2,
        # likely to leave first
        "Miss": 3, "Ms": 3, "Mrs": 3, "Mlle": 3,
        "Master": 4, "Dr": 4, "Col": 4, "Major": 4,
        # noble titles
        "Jonkheer": 5, "Countess": 5, "Lady": 5, "Don": 5, "Dona": 5, "Mme": 5, "Sir": 5, "Capt": 5,
    }
    # map title categories to numerical values
    for df in [train_df, test_df]:
        df['TitleGroup'] = df['Title'].map(title_map_alt)
if True:
    # group titles into 5 categories
    for df in [train_df, test_df]:
        red_title_keys = ['Mr', 'Miss', 'Mrs', 'Master']
        # combine similar titles into groups of common titles
        df['Title'].replace(['Ms', 'Mlle', 'Mme'], 'Miss', inplace=True)
        df['Title'].replace(['Lady'], 'Mrs', inplace=True)
        df['Title'].replace(['Sir', 'Rev'], 'Mr', inplace=True)
        # form a group that consists of rare titles
        df.loc[~df['Title'].isin(red_title_keys), 'Title'] = 'Others'
    # convert categories to numerical values
    for df in [train_df, test_df]:
        df['TitleGroup'] = df['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Others': 4}).astype('int')
plt.figure(figsize=(20, 5))
sns.boxplot(x='Title', y='Age', data=train_df)
plt.show()
A variety of nationalities were on board the Titanic. Given that the ship was British-flagged, speaking English might have been a great advantage when seats in the lifeboats were allocated. Below is an attempt to derive nationality information from the passenger names.
# language feature generation
!{sys.executable} -m pip install name2nat
try:
    from name2nat import Name2nat
    found = True
except ImportError:
    found = False

if found:
    eng_lang_keys = ['English', 'American', 'British', 'Irish', 'Australian', 'Canadian', 'German', 'French', 'Austrian', 'Dutch']
    # load the nationality model once for both data frames
    my_nanat = Name2nat()
    for df in [test_df, train_df]:
        # split fore- and surname
        names = df['Name'].str.split(',', expand=True)
        # strip the title from the forename part
        names[1] = names[1].str.extract('. ([A-Za-z]+)', expand=False)
        # concatenate fore- and surname
        names = names[1] + ' ' + names[0]
        # treat missing names
        names.fillna('Unknown', inplace=True)
        # convert to list
        name_list = names.tolist()
        # estimate nationality
        nation_df = pd.DataFrame(my_nanat(name_list, top_n=3))
        # English language mapping
        eng_col = []
        for row in nation_df[1]:
            eng_col.append(1 if row[0][0] in eng_lang_keys and row[0][1] >= .1 else 0)
        # append result to current data frame
        df['English'] = eng_col
        # comparison frame only makes sense where labels exist (train data)
        if 'Survived' in df.columns:
            comp_df = pd.concat([df['Name'], names, df['Survived'], df['English'], nation_df[1]], axis=1)
            #comp_df.head(40)
    sns.barplot(x="English", y="Survived", data=train_df)
    plt.show()
As there seems to be a link between language and survival, we may also regard passengers' ethnicity as an additional measure.
# ethnicity feature generation
!{sys.executable} -m pip install ethnicolr
try:
    from ethnicolr import census_ln
    found = True
except ImportError:
    found = False

if found:
    for df in [train_df, test_df]:
        # look up US census statistics by surname
        names = df['Name'].str.split(',', expand=True)
        estimates = census_ln(names, namecol=0)
        df['Ethnicity'] = estimates['pctwhite']
        df['Ethnicity'].fillna(value=df['Ethnicity'].median(), inplace=True)
        df['Ethnicity'] = df.Ethnicity.astype(float)
        # min-max normalization to [0, 1]
        ethn_arr = df.Ethnicity.to_numpy()
        df['Ethnicity'] = (ethn_arr - ethn_arr.min()) / (ethn_arr.max() - ethn_arr.min())
train_df.head()
From the presence of NaN values, we observe that our data is partially incomplete. Instead of removing incomplete columns along with their useful data, we may employ data imputation techniques and fill in missing information with reasonable guesses.
With regard to the ticket fare, we fill the single missing entry with the mean fare of the corresponding Pclass/Title group, since the fare strongly correlates with passenger class.
# fare imputation - group-based mean
for df in [train_df, test_df]:
    # transform('mean') keeps the original index, so fillna aligns row by row
    group_means = df.groupby(['Pclass', 'Title'], sort=False)['Fare'].transform('mean')
    df['Fare'] = df['Fare'].fillna(group_means)
Missing age information is imputed similarly, except that the groups are built from more features, from which we infer the median value.
# age imputation - conditional group-based median
# print some feature information
train_age_nan_s = round(train_df[train_df.Age.isnull()]['Survived'].sum()/train_df.Age.isnull().sum(), 3)
print('%s training passengers lack age information (observed survival rate %s); %s test passengers lack age information (survival unknown).' % (train_df.Age.isnull().sum(), train_age_nan_s, test_df.Age.isnull().sum()))

# prepare grouped medians dataframe
group_df = pd.concat([train_df, test_df]).groupby(['Sex', 'Title', 'Pclass'])['Age'].median().reset_index()
group_df.head()

def fill_age(row):
    # look up the median age of the matching Sex/Title/Pclass group
    condition = (
        (group_df['Sex'] == row['Sex']) &
        (group_df['Title'] == row['Title']) &
        (group_df['Pclass'] == row['Pclass'])
    )
    if np.isnan(group_df[condition]['Age'].values[0]):
        # fall back to the coarser Sex/Pclass group
        condition = (
            (group_df['Sex'] == row['Sex']) &
            (group_df['Pclass'] == row['Pclass'])
        )
    return group_df[condition]['Age'].values[0]

if True:
    for df in [train_df, test_df]:
        # fill the missing values of the Age variable based on grouped medians
        df['Age'] = df.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
elif False:
    # fill missing values with a dedicated value representing the group of missing data
    for df in [train_df, test_df]:
        df.loc[df.Age.isnull(), 'Age'] = -1
177 training passengers lack age information (observed survival rate 0.294); 86 test passengers lack age information (survival unknown).
As an alternative to this, we may assign a dedicated class value to groups of passengers that lack certain feature information.
# cabin imputation - assign 'U' for unknown cabin information
for df in [train_df, test_df]:
    df['Cabin'].fillna('U', inplace=True)
# embarked imputation
for df in [train_df, test_df]:
    df['Embarked'].fillna('U', inplace=True)
# embarkment mapping (0 reserved for unknown)
embarked_mapping = {"U": 0, "S": 1, "C": 2, "Q": 3}
for df in [train_df, test_df]:
    df['Embarked'] = df['Embarked'].map(embarked_mapping)
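A quick check (supplementary to the original flow) confirms that the imputed columns no longer contain missing values:

# all imputed columns should now be free of NaNs
for df in [train_df, test_df]:
    assert df[['Fare', 'Age', 'Cabin', 'Embarked']].isnull().sum().sum() == 0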
To reduce feature complexity, numerical values may be grouped into bins, which act as feature categories. Let's have a look at the distribution of the Fare feature and make a reasonable choice of boundaries.
fig, ax = plt.subplots(figsize=(20, 8))
train_df[train_df['Survived']==0]['Fare'].plot.hist(ax=ax, alpha=0.5, label='Died')
train_df[train_df['Survived']==1]['Fare'].plot.hist(ax=ax, alpha=0.5, label='Survived')
ax.set_xlabel('Fare')
ax.legend()
# fare binning
if True:
    # manual fare binning
    for df in [train_df, test_df]:
        df.loc[ df['Fare'] <= 17, 'Fare'] = 0
        df.loc[(df['Fare'] > 17) & (df['Fare'] <= 30), 'Fare'] = 1
        df.loc[(df['Fare'] > 30) & (df['Fare'] <= 100), 'Fare'] = 2
        df.loc[ df['Fare'] > 100, 'Fare'] = 3
        df['Fare'] = df['Fare'].astype('int')
elif False:
    # automatic fare binning using sklearn
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    for df in [train_df, test_df]:
        df['Fare'] = pd.qcut(df['Fare'], 5)
        df['Fare'] = label.fit_transform(df['Fare'])
train_df.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title | TitleGroup | English | Ethnicity |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 0 | 22.0 | 1 | 0 | A/5 21171 | 0 | U | 1 | Mr | 0 | 1 | 0.979182 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 1 | 38.0 | 1 | 0 | PC 17599 | 2 | C85 | 2 | Mrs | 2 | 1 | 0.917592 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 1 | 26.0 | 0 | 0 | STON/O2. 3101282 | 0 | U | 1 | Miss | 1 | 1 | 0.986301 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 1 | 35.0 | 1 | 0 | 113803 | 2 | C123 | 1 | Mrs | 2 | 1 | 0.937655 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | 0 | 35.0 | 0 | 0 | 373450 | 0 | U | 1 | Mr | 0 | 1 | 0.686442 |
fig, ax = plt.subplots(figsize=(20, 8))
train_df[train_df['Survived']==0]['Age'].plot.hist(ax=ax, bins=50, alpha=0.5, label='Died')
train_df[train_df['Survived']==1]['Age'].plot.hist(ax=ax, bins=50, alpha=0.5, label='Survived')
ax.set_xlabel('Age')
ax.legend()
# age binning
if False:
    # manual age group numeric labels
    for df in [train_df, test_df]:
        df.loc[ df['Age'] <= 5, 'Age'] = 0
        df.loc[(df['Age'] > 5) & (df['Age'] <= 15), 'Age'] = 1
        df.loc[(df['Age'] > 15) & (df['Age'] <= 25), 'Age'] = 2
        df.loc[(df['Age'] > 25) & (df['Age'] <= 35), 'Age'] = 3
        df.loc[(df['Age'] > 35) & (df['Age'] <= 50), 'Age'] = 4
        df.loc[(df['Age'] > 50) & (df['Age'] <= 60), 'Age'] = 5
        df.loc[ df['Age'] > 60, 'Age'] = 6
        df['Age'] = df['Age'].astype('int')
elif False:
    # manual age group string labels
    for df in [train_df, test_df]:
        bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
        labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
        df['AgeGroup'] = pd.cut(df["Age"], bins, labels=labels)
        age_mapping = {'Baby': 6, 'Child': 2, 'Teenager': 5, 'Student': 1, 'Young Adult': 4, 'Adult': 3, 'Senior': 0, 'Unknown': 1}
        df['AgeGroup'] = df['AgeGroup'].map(age_mapping)
        df["AgeGroup"].fillna(1, inplace=True)
elif True:
    # automatic binning using sklearn
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    for df in [train_df, test_df]:
        df['AgeGroup'] = pd.qcut(df['Age'], 5)
        df['AgeGroup'] = label.fit_transform(df['AgeGroup'])
# bar plot of age vs. survival
sns.barplot(x="AgeGroup", y="Survived", data=train_df)
plt.show()