The Titanic dataset from Kaggle is more than just numbers; it's a snapshot of history, rich with stories waiting to be uncovered through data.
In this project, I embarked on an exploratory data analysis of the iconic Titanic dataset. Using Python libraries like Pandas (pd), Matplotlib (plt) and Seaborn (sns), I explored data cleaning techniques, addressed missing values and visualized key patterns and distributions.
The findings offer a fascinating glimpse into the correlations and demographics that played a role in the survival outcomes of those on board. This article delves into the process and the insights gained along the way, revealing what the data tells us about that fateful voyage.
Importing and reading the data
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Use the ggplot look for every figure produced below.
plt.style.use('ggplot')
# Optional: show every column when printing wide DataFrames.
# pd.set_option('display.max_columns', 200)

# Defining the path to the Titanic folder
path = "/kaggle/input/titanic/"

# Loading the CSV files into DataFrames.
# The file names must be string literals: written without quotes they are
# parsed as attribute lookups on undefined names and raise NameError.
gender_submission_df = pd.read_csv(path + "gender_submission.csv")
train_df = pd.read_csv(path + "train.csv")
test_df = pd.read_csv(path + "test.csv")
Step 1: Understanding the data
# Report the (rows, columns) shape of each DataFrame.
for label, frame in (
    ("Gender Submission", gender_submission_df),
    ("Train", train_df),
    ("Test", test_df),
):
    print(f"{label} DataFrame shape:", frame.shape)
Gender Submission DataFrame shape: (418, 2)
Train DataFrame shape: (891, 12)
Test DataFrame shape: (418, 11)
# Preview the first five rows of every DataFrame, each followed by a rule.
separator = "-" * 70
for label, frame in (
    ("Gender Submission", gender_submission_df),
    ("Train", train_df),
    ("Test", test_df),
):
    print(f"First 5 rows of {label} DataFrame:")
    print(frame.head())
    print(separator)
First 5 rows of Gender Submission DataFrame:
First 5 rows of Train DataFrame:
First 5 rows of Test DataFrame:
# List the column names of each DataFrame, with a blank-looking line after each.
for label, frame in (
    ("gender submission", gender_submission_df),
    ("train", train_df),
    ("test", test_df),
):
    print(f"Column for {label} data_frame: ")
    print(frame.columns)
    print(" ")
Column for gender submission data_frame:
Index(['PassengerId', 'Survived'], dtype='object')
Column for train data_frame:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
Column for test data_frame:
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
Finding out which dtype pandas has inferred for each column
# Show the dtype pandas inferred for every column of each DataFrame.
for label, frame in (
    ("gender submission", gender_submission_df),
    ("train", train_df),
    ("test", test_df),
):
    print(f"d_type for {label} data_frame: ")
    print(frame.dtypes)
    print(" ")
d_type for gender submission data_frame:
PassengerId int64
Survived int64
dtype: object
d_type for train data_frame:
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
d_type for test data_frame:
PassengerId int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
Step 2: Data Preparation
# Dropping irrelevant columns and rows (cleaning the data)
# Start by looking at the first few rows of each DataFrame once more.
rule = "-" * 90
for label, frame in (
    ("Gender Submission", gender_submission_df),
    ("Train", train_df),
    ("Test", test_df),
):
    print(f"First 5 rows of {label} DataFrame:")
    print(frame.head())
    print(rule)
# Columns kept from each source DataFrame.
gender_submission_columns = ['PassengerId', 'Survived']
# PassengerId, Name, Ticket and Cabin are left out of the training data.
train_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Name and Ticket are left out of the test data.
test_columns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# Build the cleaned DataFrames; .copy() detaches each selection from its source.
print("Column for gender submission data_frame: ")
gender_submission_cleaned_df = gender_submission_df[gender_submission_columns].copy()
print(gender_submission_cleaned_df)
print("-" * 75)

print("Column for train data_frame: ")
train_cleaned_df = train_df[train_columns].copy()
print(train_cleaned_df)
print("-" * 75)

print("Column for test data_frame: ")
test_cleaned_df = test_df[test_columns].copy()
print(test_cleaned_df)
print("-" * 75)
# Show the dtypes of the columns that remain in each cleaned DataFrame.
for label, frame in (
    ("gender submission", gender_submission_cleaned_df),
    ("train", train_cleaned_df),
    ("test", test_cleaned_df),
):
    print(f"d_type for {label} data_frame: ")
    print(frame.dtypes)
    print(" ")
d_type for gender submission data_frame:
PassengerId int64
Survived int64
dtype: object
d_type for train data_frame:
Survived int64
Pclass int64
Sex object
Age float64
SibSp int64
Parch int64
Fare float64
Embarked object
dtype: object
d_type for test data_frame:
PassengerId int64
Pclass int64
Sex object
Age float64
SibSp int64
Parch int64
Fare float64
Cabin object
Embarked object
dtype: object
# Renaming the columns to more descriptive names.
# rename() returns a new DataFrame, so each result is assigned back.
print("Column for gender submission data_frame: ")
gender_submission_cleaned_df = gender_submission_cleaned_df.rename(
    columns={
        'PassengerId': 'Passenger_Id',
        'Survived': 'Survival_Status',
    }
)
print(gender_submission_cleaned_df)
print("-" * 75)

print("Column for train data_frame: ")
train_cleaned_df = train_cleaned_df.rename(
    columns={
        'Survived': 'Survival_Status',
        'Pclass': 'Passenger_Class',
        'Sex': 'Gender',
        'Age': 'Passenger_Age',
        'SibSp': 'Siblings_Spouses',
        'Parch': 'Parents_Children',
        'Fare': 'Ticket_Fare',
        'Embarked': 'Port_of_Embarkation',
    }
)
print(train_cleaned_df)
print("-" * 75)

print("Column for test data_frame: ")
test_cleaned_df = test_cleaned_df.rename(
    columns={
        'PassengerId': 'Passenger_Id',
        'Pclass': 'Passenger_Class',
        'Sex': 'Gender',
        'Age': 'Passenger_Age',
        'SibSp': 'Siblings_Spouses',
        'Parch': 'Parents_Children',
        'Fare': 'Ticket_Fare',
        'Cabin': 'Cabin_Number',
        'Embarked': 'Port_of_Embarkation',
    }
)
print(test_cleaned_df)
print("-" * 75)
# Counting the missing values in every column of each cleaned DataFrame.
# Each label names the DataFrame whose counts are printed right after it.
print("for gender submission data_frame: ")
print(gender_submission_cleaned_df.isna().sum())
print("-" * 90)
print("for train data_frame: ")
print(train_cleaned_df.isna().sum())
print("-" * 90)
print("for test data_frame: ")
print(test_cleaned_df.isna().sum())
print("-" * 90)
# Show the rows that repeat an earlier row in every column.
for label, frame in (
    ("Gender submission", gender_submission_cleaned_df),
    ("Test", test_cleaned_df),
    ("Train", train_cleaned_df),
):
    print(f"{label} data_frame: ")
    print(frame[frame.duplicated()])
    print("-" * 90)

# Shapes of the original (uncleaned) DataFrames, for reference.
print("Gender submission data_frame shape: ", gender_submission_df.shape)
print("train DataFrame shape:", train_df.shape)
print("test DataFrame shape:", test_df.shape)
Gender submission data_frame shape: (418, 2)
train DataFrame shape: (891, 12)
test DataFrame shape: (418, 11)
# Look for rows that share their values in a chosen subset of columns.
# keep=False flags every member of such a group; only the first 5 are shown.
print("Checking duplicates for gender submission data_frame shape: ")
same_status = gender_submission_cleaned_df.duplicated(subset=['Survival_Status'], keep=False)
duplicates = gender_submission_cleaned_df[same_status].head(5)
print(duplicates)
print("-" * 90)

print("Checking duplicates for train data_frame shape: ")
same_family_fare_gender = train_cleaned_df.duplicated(
    subset=['Parents_Children', 'Ticket_Fare', 'Gender'],
    keep=False,
)
duplicates = train_cleaned_df[same_family_fare_gender].head(5)
print(duplicates)
print("-" * 90)

print("Checking duplicates for test data_frame shape: ")
same_siblings_age_port = test_cleaned_df.duplicated(
    subset=['Siblings_Spouses', 'Passenger_Age', 'Port_of_Embarkation'],
    keep=False,
)
duplicates = test_cleaned_df[same_siblings_age_port].head(5)
print(duplicates)
print("-" * 90)
# Filtering and querying the data above
# Each query() call keeps only the rows matching a boolean expression.
print('gender submission data_frame: ')
Filter = gender_submission_cleaned_df.query('Survival_Status == 0')
print(Filter)
print("-" * 90)

print('train data_frame: ')
Filter = train_cleaned_df.query('Passenger_Age == 35.0')
print(Filter)
print("-" * 90)

print('test data_frame: ')
# query() needs a boolean condition; a bare column name makes pandas use the
# column's integer values as row labels instead of filtering.
# NOTE(review): "> 0" (at least one sibling/spouse aboard) is the assumed
# intent -- adjust the condition if a different filter was meant.
Filter = test_cleaned_df.query('Siblings_Spouses > 0')
print(Filter)
print("-" * 90)

# Column names of each cleaned DataFrame after renaming.
print('Column for gender submission data_frame: ')
print(gender_submission_cleaned_df.columns)
print("-" * 90)
print('Column for train data_frame: ')
print(train_cleaned_df.columns)
print("-" * 90)
print('Column for test data_frame: ')
print(test_cleaned_df.columns)
print("-" * 90)
Checking duplicated rows for gender submission data_frame:
Number of duplicates: 416
Checking duplicated rows for train data_frame:
Number of duplicates: 486
Checking duplicated rows for test data_frame:
Number of duplicates: 231
# Cleaning up duplicates in each DataFrame: keep the first row of every group
# that shares the listed columns, then renumber the index from 0.
# NOTE(review): these subsets are not unique keys, so many distinct passengers
# are removed (the gender-submission frame keeps one row per Survival_Status
# value) -- confirm this is intended.
print("Gender submission data_frame: ")
gender_submission_cleaned_df = (
    gender_submission_cleaned_df
    .drop_duplicates(subset=['Survival_Status'])
    .reset_index(drop=True)
)
print(gender_submission_cleaned_df)
print('-' * 90)

print("Train data_frame: ")
train_cleaned_df = (
    train_cleaned_df
    .drop_duplicates(subset=['Passenger_Class', 'Passenger_Age', 'Ticket_Fare'])
    .reset_index(drop=True)
)
print(train_cleaned_df)
print('-' * 90)

print("Test data_frame: ")
test_cleaned_df = (
    test_cleaned_df
    .drop_duplicates(subset=['Ticket_Fare', 'Cabin_Number', 'Port_of_Embarkation'])
    .reset_index(drop=True)
)
print(test_cleaned_df)
print('-' * 90)

# Shapes after de-duplication.
print("Gender submission data_frame shape: ", gender_submission_cleaned_df.shape)
print("train DataFrame shape:", train_cleaned_df.shape)
print("test DataFrame shape:", test_cleaned_df.shape)
Gender submission data_frame shape: (2, 2)
train DataFrame shape: (712, 8)
test DataFrame shape: (208, 9)
Step 3: Feature Understanding
- Plotting Feature Distributions
- Histogram
- KDE
- Boxplot
# Print each cleaned DataFrame in full (the labels describe the data, not a shape).
print("Gender submission data_frame: ", gender_submission_cleaned_df)
print('-' * 90)
print("train DataFrame:", train_cleaned_df)
print('-' * 90)
print("test DataFrame:", test_cleaned_df)
print('-' * 90)

# value_counts: how often each distinct value occurs in one column per DataFrame
print(gender_submission_cleaned_df['Passenger_Id'].value_counts())
print('-' * 90)
print(train_cleaned_df['Passenger_Age'].value_counts())
print('-' * 90)
print(test_cleaned_df['Cabin_Number'].value_counts())
print('-' * 90)
Passenger_Id
892 1
893 1
Name: count, dtype: int64
Passenger_Age
24.00 27
18.00 23
22.00 21
21.00 21
30.00 21
..
36.50 1
0.92 1
23.50 1
55.50 1
74.00 1
Name: count, Length: 88, dtype: int64
Cabin_Number
C101 2
C78 2
B45 1
C89 1
B69 1
..
C6 1
G6 1
A29 1
F 1
C105 1
Name: count, Length: 76, dtype: int64
# Bar charts of the (up to) ten most frequent values of one column per DataFrame.
# The x-axis label of each chart names the column that was counted.
ax = (
    gender_submission_cleaned_df['Survival_Status']
    .value_counts()
    .head(10)
    .plot(kind='bar', title='Top 10 Gender submission Survival_Status Counts')
)
ax.set_xlabel('Survival_Status')
ax.set_ylabel('Count')
plt.show()

ax = (
    train_cleaned_df['Passenger_Age']
    .value_counts()
    .head(10)
    .plot(kind='bar', title=' Top 10 counts for Passenger_Age from train dataset')
)
ax.set_xlabel('Passenger_Age')
ax.set_ylabel('Count')
plt.show()

ax = (
    test_cleaned_df['Cabin_Number']
    .value_counts()
    .head(10)
    .plot(kind='bar', title='Top 10 counts for Cabin_Number from test dataset')
)
ax.set_xlabel('Cabin_Number')
ax.set_ylabel('Count')
plt.show()
# Print the three cleaned DataFrames, each followed by a rule.
for frame in (gender_submission_cleaned_df, train_cleaned_df, test_cleaned_df):
    print(frame)
    print('-' * 90)
# Histograms (20 bins) of one numeric column per DataFrame.
# set_xlabel() labels the x-axis; set_label() would only set the Axes' own
# legend label and leave the axis unlabelled.
ax = gender_submission_cleaned_df['Passenger_Id'].plot(
    kind='hist',
    bins=20,
    title='Gender_Submission Dataset',
)
ax.set_xlabel('Passenger_Id')
plt.show()

ax = train_cleaned_df['Passenger_Class'].plot(
    kind='hist',
    bins=20,
    title='Train Dataset',
)
ax.set_xlabel('Passenger_Class')
plt.show()

# This histogram is drawn from the test data, so it is titled accordingly.
ax = test_cleaned_df['Ticket_Fare'].plot(
    kind='hist',
    bins=20,
    title='Test Dataset',
)
ax.set_xlabel('Ticket_Fare')
plt.show()
# Kernel density estimates of the same columns plotted as histograms above.
# set_xlabel() labels the x-axis; set_label() would only set the Axes' own
# legend label and leave the axis unlabelled.
ax = gender_submission_cleaned_df['Passenger_Id'].plot(
    kind='kde',
    title='Gender_Submission Dataset',
)
ax.set_xlabel('Passenger_Id')
plt.show()

ax = train_cleaned_df['Passenger_Class'].plot(
    kind='kde',
    title='Train Dataset',
)
ax.set_xlabel('Passenger_Class')
plt.show()

ax = test_cleaned_df['Ticket_Fare'].plot(
    kind='kde',
    title='Test Dataset',
)
ax.set_xlabel('Ticket_Fare')
plt.show()
Step 4: Feature Relationships
- Heatmap Correlation
- Scatterplot
- Pairplot
- Groupby Comparison
# Print the three cleaned DataFrames again before plotting relationships.
for frame in (gender_submission_cleaned_df, train_cleaned_df, test_cleaned_df):
    print(frame)
    print('-' * 90)
# Scatterplot analysis: one pandas scatter plot per DataFrame.
gender_submission_cleaned_df.plot.scatter(
    x='Passenger_Id',
    y='Survival_Status',
    title='Relationship Analysis: Passenger_Id vs Survival_Status(Gender Submission Data)',
)
plt.show()

# Scatterplot for the train data
train_cleaned_df.plot.scatter(
    x='Siblings_Spouses',
    y='Parents_Children',
    title='Relationship Analysis: Siblings_Spouses vs Parents_Children(Train Data)',
)
plt.show()

# Scatterplot for the test data
test_cleaned_df.plot.scatter(
    x='Parents_Children',
    y='Ticket_Fare',
    title='Relationship Analysis: Ticket_Fare vs Family Size(Parents/Children) in Test Dataset',
)
plt.show()
# Seaborn scatter plots with a hue: points are coloured by a third column.
sns.scatterplot(
    x='Passenger_Id',
    y='Survival_Status',
    hue='Passenger_Id',
    data=gender_submission_cleaned_df,
)
plt.show()

sns.scatterplot(
    y='Siblings_Spouses',
    x='Parents_Children',
    hue='Passenger_Age',
    data=train_cleaned_df,
)
plt.show()

sns.scatterplot(
    x='Parents_Children',
    y='Ticket_Fare',
    hue='Passenger_Age',
    data=test_cleaned_df,
)
plt.show()
# Pairplot: a grid of pairwise plots over the listed columns, coloured by hue.
submission_vars = ['Passenger_Id', 'Survival_Status']
sns.pairplot(data=gender_submission_cleaned_df, vars=submission_vars, hue='Survival_Status')
plt.show()

train_vars = ['Siblings_Spouses', 'Parents_Children']
sns.pairplot(data=train_cleaned_df, vars=train_vars, hue='Passenger_Age')
plt.show()

test_vars = ['Passenger_Class', 'Passenger_Age', 'Parents_Children', 'Ticket_Fare']
sns.pairplot(data=test_cleaned_df, vars=test_vars, hue='Passenger_Age')
plt.show()
# Heatmap Correlation
# Each correlation matrix gets its own name so the cleaned DataFrames are not
# overwritten and stay available (with all their columns) for later steps.
# plt.show() after each heatmap keeps the three figures from being drawn on
# top of one another.
gender_submission_corr = (
    gender_submission_cleaned_df[['Passenger_Id', 'Survival_Status']]
    .dropna()
    .corr()
)
sns.heatmap(gender_submission_corr, annot=True)
plt.show()

train_corr = (
    train_cleaned_df[['Passenger_Class',
                      'Passenger_Age',
                      'Siblings_Spouses',
                      'Parents_Children',
                      'Ticket_Fare']]
    .dropna()
    .corr()
)
sns.heatmap(train_corr, annot=True)
plt.show()

test_corr = (
    test_cleaned_df[['Passenger_Id',
                     'Passenger_Age',
                     'Parents_Children',
                     'Ticket_Fare']]
    .dropna()
    .corr()
)
sns.heatmap(test_corr, annot=True)
plt.show()
Questions And Visualisation
# What is the relationship between Passenger_Age and Survival_Status in the training data?
# Start by listing the columns currently held in train_cleaned_df.
print(train_cleaned_df.columns)
Index(['Passenger_Class', 'Passenger_Age', 'Siblings_Spouses',
'Parents_Children', 'Ticket_Fare'],
dtype='object')
# Number of missing Passenger_Age values.
print(train_cleaned_df['Passenger_Age'].isna().sum())
0
# Handling missing ages. The two strategies are alternatives -- apply only one:
#   Option 1 (used here): drop the rows whose Passenger_Age is missing.
#   Option 2: fill the missing ages with a specific value such as the mean or
#             median, e.g.
#             train_cleaned_df['Passenger_Age'].fillna(train_cleaned_df['Passenger_Age'].mean())
# Applying option 2 after option 1 would be a no-op, because dropna has
# already removed every row with a missing age.
train_cleaned_df = train_cleaned_df.dropna(subset=['Passenger_Age'])

# Verifying the data types
print(train_cleaned_df.dtypes)
Passenger_Class float64
Passenger_Age float64
Siblings_Spouses float64
Parents_Children float64
Ticket_Fare float64
dtype: object
# Plot again: Passenger_Age on the x-axis against Passenger_Class on the y-axis.
age_class_ax = sns.scatterplot(
    data=train_cleaned_df,
    x='Passenger_Age',
    y='Passenger_Class',
)
age_class_ax.set_title('Passenger_Class vs. Passenger_Age')
plt.show()
If you have read this far, please like and comment below; your input matters a lot. Thank you!