Create a Streamlit application to predict the survival rate of a person on the Titanic. Use a simple logistic regression to do the prediction. The data is located here: https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv and the columns are: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked. Be aware that there might be some missing data in the dataset.
Drop files here
or click to upload
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
# Page heading rendered at the top of the app.
APP_TITLE = "Titanic Survival Predictor"
st.title(APP_TITLE)
# Load and preprocess data
@st.cache_data
def load_data() -> pd.DataFrame:
    """Fetch the Titanic passenger CSV and return it as a DataFrame.

    The function is decorated with ``st.cache_data`` so the result can be
    reused instead of being downloaded again on every call.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    return pd.read_csv(
        "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    )
def preprocess_data(df: pd.DataFrame):
    """Build the numeric feature matrix and the target vector for training.

    Args:
        df: Passenger table. It must contain every column named in
            ``features`` below, plus the ``Survived`` target column.

    Returns:
        A ``(X, y)`` tuple. ``X`` is a DataFrame holding the selected
        features, with the categorical columns integer-encoded and missing
        values replaced by the column median; it keeps the row index of
        ``df``. ``y`` is the ``Survived`` column of ``df``.

    Raises:
        ValueError: If one or more feature columns are absent from ``df``.
    """
    # Select features we'll use for prediction
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

    # Make sure all required columns exist. The list of missing columns is
    # computed once and doubles as the check itself.
    missing_cols = [col for col in features if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in dataset: {missing_cols}")

    X = df[features].copy()
    y = df['Survived']

    # Handle categorical variables. Each column gets its own encoder so the
    # two fits are visibly independent. LabelEncoder numbers labels in sorted
    # order (e.g. female=0, male=1), which any code encoding new inputs for
    # the model has to reproduce.
    X['Sex'] = LabelEncoder().fit_transform(X['Sex'])
    # Missing ports are filled with 'S' before encoding.
    X['Embarked'] = LabelEncoder().fit_transform(X['Embarked'].fillna('S'))

    # Handle missing values. The imputer returns a bare array, so the column
    # names and the original row index are re-attached explicitly; otherwise
    # X would get a fresh 0..n-1 index that may not line up with y's labels.
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    return X, y
# Load data
data = load_data()
# Debug aid: show which columns were actually loaded.
st.write("Available columns:", data.columns)
X, y = preprocess_data(data)
# Train model
# The features are used unscaled (e.g. Fare spans a much wider range than
# Pclass), so the solver may need more than the default 100 iterations to
# converge; give it a larger budget.
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
# Create input form
# Every widget below is placed in the sidebar; the values they return are
# read later when the prediction is made.
with st.sidebar:
    st.header("Enter Passenger Information")
    pclass = st.selectbox(
        "Passenger Class",
        options=[1, 2, 3],
        help="1 = 1st class, 2 = 2nd class, 3 = 3rd class",
    )
    sex = st.selectbox("Sex", options=["male", "female"])
    age = st.slider("Age", min_value=0, max_value=100, value=30)
    sibsp = st.number_input(
        "Number of Siblings/Spouses Aboard",
        min_value=0,
        max_value=10,
        value=0,
    )
    parch = st.number_input(
        "Number of Parents/Children Aboard",
        min_value=0,
        max_value=10,
        value=0,
    )
    fare = st.number_input("Fare", min_value=0.0, max_value=600.0, value=32.2)
    embarked = st.selectbox(
        "Port of Embarkation",
        options=['S', 'C', 'Q'],
        help="S = Southampton, C = Cherbourg, Q = Queenstown",
    )
# Integer codes for the categorical inputs. They must match the codes the
# model was trained on: the training step uses LabelEncoder, which numbers
# labels in sorted order, giving the tables below (assuming the training data
# contains exactly these labels).
SEX_CODES = {"female": 0, "male": 1}
EMBARKED_CODES = {"C": 0, "Q": 1, "S": 2}

# Make prediction when user clicks button
if st.sidebar.button("Predict Survival"):
    # Prepare input data
    # The codes are looked up in fixed tables. Fitting a fresh LabelEncoder on
    # the single submitted value would always return 0, i.e. every passenger
    # would be scored as "female" embarking at "C" whatever the user selected.
    input_data = pd.DataFrame({
        'Pclass': [pclass],
        'Sex': [SEX_CODES[sex]],
        'Age': [age],
        'SibSp': [sibsp],
        'Parch': [parch],
        'Fare': [fare],
        'Embarked': [EMBARKED_CODES[embarked]],
    })

    # Make prediction
    # predict_proba returns one row per sample; index 0 is the probability of
    # class 0 (did not survive) and index 1 of class 1 (survived).
    prediction = model.predict_proba(input_data)[0]

    # Display results
    st.header("Survival Prediction")
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Survival Probability", f"{prediction[1]:.1%}")
    with col2:
        st.metric("Death Probability", f"{prediction[0]:.1%}")

    # Display prediction message
    if prediction[1] > 0.5:
        st.success("This passenger would likely have survived!")
    else:
        st.error("This passenger would likely not have survived.")

    # Show feature importance
    # NOTE: this is the absolute value of each model coefficient. The inputs
    # are not standardised, so treat the ranking as a rough indication only.
    st.header("Feature Importance")
    feature_importance = pd.DataFrame({
        'Feature': ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
        'Importance': abs(model.coef_[0]),
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    st.bar_chart(data=feature_importance, x='Feature', y='Importance')
# Display some statistics about the dataset
# Two side-by-side charts: mean of 'Survived' grouped by class and by sex.
st.header("Dataset Statistics")
class_panel, sex_panel = st.columns(2)

class_panel.subheader("Survival Rate by Class")
class_panel.bar_chart(data.groupby('Pclass')['Survived'].mean())

sex_panel.subheader("Survival Rate by Sex")
sex_panel.bar_chart(data.groupby('Sex')['Survived'].mean())
# Static "about" section. The blank lines around the bullet list matter:
# without one after the last bullet, Markdown treats the closing paragraph as
# a continuation of that list item.
st.markdown("""
### About this Predictor

This application uses a Logistic Regression model to predict the probability of survival on the Titanic.
The model is trained on historical data from the Titanic disaster and takes into account several factors:

- Passenger Class (1st, 2nd, or 3rd class)
- Sex
- Age
- Number of Siblings/Spouses Aboard
- Number of Parents/Children Aboard
- Fare
- Port of Embarkation

The model has been trained on the actual Titanic dataset, but please note that this is a simplified model
and predictions should be taken as approximate probabilities rather than definitive answers.
""")
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?