# Building a Titanic EDA app
# To upload files, please first save the app.
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import numpy as np
# Page-level settings: browser tab title and a full-width layout.
st.set_page_config(
    page_title="Titanic EDA",
    layout="wide",
)
# Load the Titanic dataset
# NOTE: The default URL goes through corsproxy.io because we're in a WASM
# environment. If running locally, pass the raw GitHub URL instead (i.e. drop
# the "https://corsproxy.io/?" prefix).
TITANIC_CSV_URL = (
    "https://corsproxy.io/?"
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)


@st.cache_data
def load_data(url: str = TITANIC_CSV_URL) -> pd.DataFrame:
    """Read the Titanic passenger CSV and return it as a DataFrame.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    result for the same ``url`` instead of downloading the file again.

    Args:
        url: Location of the CSV file. Defaults to the proxied copy of the
            datasciencedojo Titanic dataset.

    Returns:
        The parsed CSV contents.
    """
    return pd.read_csv(url)
# Load data
df = load_data()

# Title and dataset info
APP_TITLE = "Titanic Dataset Explorer"
APP_TAGLINE = "Explore the famous Titanic dataset with interactive visualizations"
st.title(APP_TITLE)
st.write(APP_TAGLINE)
# Display basic dataset information
st.header("Dataset Overview")

# Per-column count of missing cells and the share of all rows they represent.
missing_counts = df.isnull().sum()
missing_summary = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing Values': missing_counts.values,
    'Percentage': (missing_counts.values / len(df) * 100).round(2),
})

overview_left, overview_right = st.columns(2)

# Left column: overall shape plus the first rows of the data.
with overview_left:
    st.write("Dataset Shape:", df.shape)
    st.write("### Data Sample")
    st.dataframe(df.head())

# Right column: the missing-value summary table.
with overview_right:
    st.write("### Missing Values")
    st.dataframe(missing_summary)
# Survival Distribution
st.header("Survival Analysis")
col3, col4 = st.columns(2)

with col3:
    # value_counts() orders its result by frequency, not by value. The slice
    # names are therefore derived from the result's index rather than listed
    # in a fixed order; a fixed list would mislabel the slices whenever
    # survivors outnumber non-survivors, and would not match the values in
    # length if only one outcome were present.
    survival_labels = {0: 'Did not Survive', 1: 'Survived'}
    survival_count = df['Survived'].value_counts()
    fig_survival = px.pie(
        values=survival_count.values,
        names=[survival_labels.get(code, str(code)) for code in survival_count.index],
        title='Survival Distribution',
    )
    st.plotly_chart(fig_survival)

with col4:
    # Grouped bars: passenger count per class, split by survival outcome.
    fig_survival_class = px.histogram(
        df,
        x='Pclass',
        color='Survived',
        barmode='group',
        title='Survival by Passenger Class',
        labels={'Pclass': 'Passenger Class', 'count': 'Count'},
    )
    st.plotly_chart(fig_survival_class)
# Age Distribution
st.header("Age Distribution")
age_hist_col, age_box_col = st.columns(2)

# Left: histogram of passenger ages in 30 bins.
with age_hist_col:
    age_hist_options = dict(
        x='Age',
        nbins=30,
        title='Age Distribution',
        labels={'Age': 'Age', 'count': 'Count'},
    )
    fig_age = px.histogram(df, **age_hist_options)
    st.plotly_chart(fig_age)

# Right: one age box plot per survival outcome.
with age_box_col:
    age_box_options = dict(
        x='Survived',
        y='Age',
        title='Age Distribution by Survival',
        labels={'Survived': 'Survived', 'Age': 'Age'},
    )
    fig_age_survival = px.box(df, **age_box_options)
    st.plotly_chart(fig_age_survival)
# Gender Analysis
st.header("Gender Analysis")
gender_survival_col, gender_class_col = st.columns(2)

with gender_survival_col:
    # Cross-tabulation: rows are sexes, columns are survival outcomes,
    # cells are passenger counts.
    gender_survival = pd.crosstab(index=df['Sex'], columns=df['Survived'])
    fig_gender = px.bar(
        gender_survival,
        title='Survival by Gender',
        labels={'Sex': 'Gender', 'value': 'Count'},
        barmode='group',
    )
    st.plotly_chart(fig_gender)

with gender_class_col:
    # Grouped bars: passenger count per sex, split by passenger class.
    fig_gender_class = px.histogram(
        df,
        x='Sex',
        color='Pclass',
        title='Gender Distribution by Class',
        barmode='group',
        labels={'Sex': 'Gender', 'count': 'Count'},
    )
    st.plotly_chart(fig_gender_class)
# Fare Analysis
st.header("Fare Analysis")
col9, col10 = st.columns(2)

with col9:
    # One fare box plot per passenger class.
    fig_fare = px.box(
        df,
        x='Pclass',
        y='Fare',
        title='Fare Distribution by Passenger Class',
        labels={'Pclass': 'Passenger Class', 'Fare': 'Fare'},
    )
    st.plotly_chart(fig_fare)

with col10:
    # Survived is stored as a 0/1 integer code. Plotly Express gives numeric
    # colour columns a continuous colour bar, so the scatter is drawn from a
    # copy whose Survived column holds strings, which yields one discrete
    # legend entry per outcome. The shared df itself is left unchanged.
    survival_as_category = df.assign(Survived=df['Survived'].astype(str))
    fig_fare_survival = px.scatter(
        survival_as_category,
        x='Fare',
        y='Age',
        color='Survived',
        title='Fare vs Age by Survival',
        labels={'Fare': 'Fare', 'Age': 'Age'},
    )
    st.plotly_chart(fig_fare_survival)
# Interactive Feature Analysis
st.header("Interactive Feature Analysis")

# Only numeric columns are offered as axes; compute the list once and share
# it between both selectors.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns

feature_x = st.selectbox('Select X-axis feature:', numeric_features)
# Preselect the second numeric column for Y so the initial chart does not
# plot a column against itself (both selectors would otherwise start on the
# first option).
default_y_index = 1 if len(numeric_features) > 1 else 0
feature_y = st.selectbox(
    'Select Y-axis feature:',
    numeric_features,
    index=default_y_index,
)
color_by = st.selectbox('Color by:', ['Survived', 'Pclass', 'Sex'])

# Survived and Pclass are integer codes, which Plotly Express would colour on
# a continuous scale. A string copy of the chosen column is added under a
# separate name so the legend is discrete, while the original column stays
# numeric in case it is also selected as an axis. The labels mapping keeps
# the displayed legend title equal to the chosen column name.
color_column = f'{color_by} (category)'
interactive_df = df.assign(**{color_column: df[color_by].astype(str)})

fig_interactive = px.scatter(
    interactive_df,
    x=feature_x,
    y=feature_y,
    color=color_column,
    labels={color_column: color_by},
    title=f'{feature_x} vs {feature_y} by {color_by}',
)
st.plotly_chart(fig_interactive)
# Correlation Heatmap
st.header("Correlation Analysis")

# Pairwise correlation between the int64/float64 columns of the dataset.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
correlation = numeric_frame.corr()

heatmap_options = {
    'labels': dict(color="Correlation"),
    'title': "Correlation Heatmap",
}
fig_corr = px.imshow(correlation, **heatmap_options)
st.plotly_chart(fig_corr)
# Data filtering
st.header("Data Filter")
st.write("Filter the dataset based on your criteria:")

# One multiselect per filterable column, laid out left to right. Each entry
# pairs the dataframe column with the label shown above its widget.
filter_widgets = (
    ('Pclass', 'Passenger Class:'),
    ('Sex', 'Gender:'),
    ('Survived', 'Survival:'),
)
selections = {}
for container, (column, label) in zip(st.columns(3), filter_widgets):
    with container:
        selections[column] = st.multiselect(label, sorted(df[column].unique()))

# Apply filters
# An empty selection leaves that column unrestricted; otherwise only rows
# whose value is among the chosen options are kept.
filtered_df = df.copy()
for column, chosen in selections.items():
    if chosen:
        filtered_df = filtered_df[filtered_df[column].isin(chosen)]

st.write("Filtered Dataset:")
st.dataframe(filtered_df)
# Hi! I can help you with any questions about Streamlit and Python. What would you like to know?