Ploomber AI Editor

Drop files here
or click to upload
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Page title displayed by the Streamlit app.
st.title("Titanic Survival Predictor")

# Load and preprocess data
@st.cache_data
def load_data() -> pd.DataFrame:
    """Fetch the Titanic passenger dataset as a DataFrame.

    The CSV is read directly from the datasciencedojo datasets repository
    on GitHub. The ``st.cache_data`` decorator lets Streamlit reuse the
    returned frame instead of downloading it again on every script rerun.

    Returns:
        The parsed CSV contents.
    """
    return pd.read_csv(
        "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    )

def preprocess_data(df: pd.DataFrame):
    """Build the feature matrix and target vector used to train the model.

    Args:
        df: Raw passenger data. Must contain the feature columns ``Pclass``,
            ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``
            plus the target column ``Survived``.

    Returns:
        A ``(X, y)`` tuple. ``X`` is a DataFrame with the seven feature
        columns, label-encoded and median-imputed, carrying the same index
        as ``df``. ``y`` is ``df['Survived']``.

    Raises:
        ValueError: If any of the seven feature columns is missing.
        KeyError: If the ``Survived`` column is missing.
    """
    # Select features we'll use for prediction
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

    # Make sure all required columns exist, naming every absent one
    missing_cols = [col for col in features if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in dataset: {missing_cols}")

    X = df[features].copy()
    y = df['Survived']

    # Handle categorical variables. LabelEncoder assigns integer codes to the
    # classes in sorted order (e.g. 'female' -> 0, 'male' -> 1).
    X['Sex'] = LabelEncoder().fit_transform(X['Sex'])
    # Missing ports are filled with 'S' before encoding.
    X['Embarked'] = LabelEncoder().fit_transform(X['Embarked'].fillna('S'))

    # Handle missing values. fit_transform returns a bare array, so the column
    # labels AND the original index are restored explicitly; without the index
    # X would be renumbered 0..n-1 and no longer line up with y when df has a
    # non-default index.
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    return X, y


# Load data
data = load_data()

# Debugging aid: show which columns the loaded dataset provides
st.write("Available columns:", data.columns)

X, y = preprocess_data(data)

# Train model
# The features are not scaled, so the default lbfgs solver can run into its
# default limit of 100 iterations before converging; allow more iterations so
# the fitted coefficients are not cut short.
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# Create input form
# Every passenger attribute is collected through a widget in the sidebar.
with st.sidebar:
    st.header("Enter Passenger Information")

    pclass = st.selectbox(
        "Passenger Class",
        options=[1, 2, 3],
        help="1 = 1st class, 2 = 2nd class, 3 = 3rd class",
    )
    sex = st.selectbox("Sex", options=["male", "female"])
    age = st.slider("Age", min_value=0, max_value=100, value=30)
    sibsp = st.number_input(
        "Number of Siblings/Spouses Aboard",
        min_value=0,
        max_value=10,
        value=0,
    )
    parch = st.number_input(
        "Number of Parents/Children Aboard",
        min_value=0,
        max_value=10,
        value=0,
    )
    fare = st.number_input("Fare", min_value=0.0, max_value=600.0, value=32.2)
    embarked = st.selectbox(
        "Port of Embarkation",
        options=['S', 'C', 'Q'],
        help="S = Southampton, C = Cherbourg, Q = Queenstown",
    )

# Make prediction when user clicks button
if st.sidebar.button("Predict Survival"):
    # Encode the categorical inputs with the same integer codes the model was
    # trained on. preprocess_data() uses LabelEncoder, which numbers classes
    # in sorted order. A LabelEncoder fitted on only the single selected value
    # would encode every choice as 0, making the prediction ignore sex and
    # port of embarkation.
    # NOTE: assumes the training data contains exactly these categories
    # ('female'/'male' and 'C'/'Q'/'S') - verify if the dataset changes.
    sex_codes = {'female': 0, 'male': 1}
    embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

    # Single-row frame with the same columns, in the same order, as the
    # training features
    input_data = pd.DataFrame({
        'Pclass': [pclass],
        'Sex': [sex_codes[sex]],
        'Age': [age],
        'SibSp': [sibsp],
        'Parch': [parch],
        'Fare': [fare],
        'Embarked': [embarked_codes[embarked]]
    })

    # Make prediction: class probabilities for the single input row
    # (index 0 = did not survive, index 1 = survived)
    prediction = model.predict_proba(input_data)[0]

    # Display results
    st.header("Survival Prediction")
    col1, col2 = st.columns(2)

    with col1:
        st.metric("Survival Probability", f"{prediction[1]:.1%}")
    with col2:
        st.metric("Death Probability", f"{prediction[0]:.1%}")

    # Display prediction message
    if prediction[1] > 0.5:
        st.success("This passenger would likely have survived!")
    else:
        st.error("This passenger would likely not have survived.")

    # Show feature importance, measured here as the absolute value of each
    # logistic-regression coefficient
    st.header("Feature Importance")
    feature_importance = pd.DataFrame({
        'Feature': list(input_data.columns),
        'Importance': abs(model.coef_[0])
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    st.bar_chart(data=feature_importance, x='Feature', y='Importance')

# Display some statistics about the dataset
st.header("Dataset Statistics")

# One chart per layout column: (column to group by, chart heading)
charts = [
    ('Pclass', "Survival Rate by Class"),
    ('Sex', "Survival Rate by Sex"),
]

for column, (group_col, heading) in zip(st.columns(2), charts):
    with column:
        st.subheader(heading)
        # Mean of the Survived column within each group
        st.bar_chart(data.groupby(group_col)['Survived'].mean())

# Static "about" text, rendered as Markdown after the statistics charts,
# describing the model and the input features it uses.
st.markdown("""
### About this Predictor
This application uses a Logistic Regression model to predict the probability of survival on the Titanic.
The model is trained on historical data from the Titanic disaster and takes into account several factors:

- Passenger Class (1st, 2nd, or 3rd class)
- Sex
- Age
- Number of Siblings/Spouses Aboard
- Number of Parents/Children Aboard
- Fare
- Port of Embarkation

The model has been trained on the actual Titanic dataset, but please note that this is a simplified model
and predictions should be taken as approximate probabilities rather than definitive answers.
""")
Loading code editor...
Click Run to preview your app
Terminal