"""Data Science Explorer — a simple data science project app built with Streamlit.

Note: to upload files, save this script first and launch it with `streamlit run`.
"""
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
st.set_page_config(page_title="Data Science Explorer", layout="wide")

st.title("Data Science Explorer")
st.write("Upload your dataset and explore it with basic data science techniques.")

# File uploader
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

# Resolve the working dataframe: an uploaded CSV takes priority; otherwise fall
# back to a sample dataset kept in session state. st.rerun() restarts the whole
# script, so a plain local variable would NOT survive — session_state is the
# only way the "Use Sample Dataset" button below can hand data to this point.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
elif "sample_df" in st.session_state:
    # Copy so the in-place edits below never mutate the cached sample.
    df = st.session_state["sample_df"].copy()
else:
    df = None

if df is not None:
    # ---- Dataset overview -------------------------------------------------
    st.header("Dataset Overview")
    col1, col2 = st.columns(2)
    with col1:
        st.write("Data Shape:", df.shape)
        st.write("Data Types:")
        st.write(df.dtypes)
    with col2:
        st.write("Basic Statistics:")
        st.write(df.describe())

    st.subheader("Data Preview")
    st.dataframe(df.head(10))

    # ---- Missing values ---------------------------------------------------
    st.header("Missing Values")
    missing_values = df.isnull().sum()
    st.write(missing_values[missing_values > 0])

    if missing_values.sum() > 0:
        st.subheader("Handle Missing Values")
        missing_method = st.selectbox(
            "Choose a method to handle missing values",
            ["Drop rows with missing values", "Fill numeric with mean",
             "Fill categorical with mode", "Do nothing"]
        )
        if missing_method == "Drop rows with missing values":
            df = df.dropna()
            st.write("Rows with missing values dropped. New shape:", df.shape)
        elif missing_method == "Fill numeric with mean":
            # Assign back instead of `inplace=True` on a column selection:
            # chained-assignment inplace fills are deprecated under pandas
            # copy-on-write and silently stop working in pandas 3.x.
            for col in df.select_dtypes(include=np.number).columns:
                df[col] = df[col].fillna(df[col].mean())
            st.write("Numeric missing values filled with mean.")
        elif missing_method == "Fill categorical with mode":
            for col in df.select_dtypes(exclude=np.number).columns:
                mode = df[col].mode()
                # mode() is empty for an all-NaN column; guard the [0] access.
                if not mode.empty:
                    df[col] = df[col].fillna(mode[0])
            st.write("Categorical missing values filled with mode.")

    # ---- Data visualization -----------------------------------------------
    st.header("Data Visualization")
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()

    if numeric_cols:
        st.subheader("Numeric Data Visualization")
        viz_type = st.selectbox(
            "Choose visualization type for numeric data",
            ["Histogram", "Box Plot", "Correlation Heatmap", "Scatter Plot"]
        )
        if viz_type == "Histogram":
            selected_col = st.selectbox("Select column for histogram", numeric_cols)
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.histplot(df[selected_col], kde=True, ax=ax)
            st.pyplot(fig)
        elif viz_type == "Box Plot":
            selected_col = st.selectbox("Select column for box plot", numeric_cols)
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(y=df[selected_col], ax=ax)
            st.pyplot(fig)
        elif viz_type == "Correlation Heatmap":
            fig, ax = plt.subplots(figsize=(12, 8))
            correlation = df[numeric_cols].corr()
            sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
            st.pyplot(fig)
        elif viz_type == "Scatter Plot":
            # Named x_col/y_col: col1/col2 above are the st.columns handles
            # and must not be shadowed.
            x_col = st.selectbox("Select X-axis column", numeric_cols)
            y_col = st.selectbox("Select Y-axis column", numeric_cols,
                                 index=min(1, len(numeric_cols) - 1))
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.scatterplot(x=df[x_col], y=df[y_col], ax=ax)
            st.pyplot(fig)

    if categorical_cols:
        st.subheader("Categorical Data Visualization")
        selected_cat_col = st.selectbox("Select categorical column", categorical_cols)
        fig, ax = plt.subplots(figsize=(10, 6))
        df[selected_cat_col].value_counts().plot(kind='bar', ax=ax)
        plt.xticks(rotation=45)
        st.pyplot(fig)

    # ---- Simple machine learning ------------------------------------------
    # Requires enough rows for a train/test split AND at least one categorical
    # column to serve as the classification target.
    if len(df) > 10 and categorical_cols:
        st.header("Simple Machine Learning")
        st.write("""
        Let's build a simple classification model. Select a target column (what you want to predict)
        and features (columns used for prediction).
        """)

        target_col = st.selectbox("Select target column (categorical)", categorical_cols)
        available_features = [col for col in df.columns if col != target_col]
        selected_features = st.multiselect("Select features for prediction", available_features)

        if selected_features and target_col:
            st.subheader("Model Training")
            # Copy the feature frame so label-encoding below does not mutate
            # df (writing into a slice raises SettingWithCopyWarning).
            X = df[selected_features].copy()

            categorical_features = X.select_dtypes(exclude=np.number).columns
            if not categorical_features.empty:
                st.write("Encoding categorical features...")
                for feat in categorical_features:
                    le = LabelEncoder()
                    X[feat] = le.fit_transform(X[feat].astype(str))

            # Encode the target labels to integers; classes_ keeps the names.
            le_target = LabelEncoder()
            y = le_target.fit_transform(df[target_col].astype(str))

            test_size = st.slider("Test set size", 0.1, 0.5, 0.2)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42
            )
            st.write(f"Training set size: {X_train.shape[0]}")
            st.write(f"Test set size: {X_test.shape[0]}")

            # Model hyperparameters
            n_estimators = st.slider("Number of trees", 10, 200, 100)
            max_depth = st.slider("Maximum depth of trees", 1, 20, 10)

            if st.button("Train Random Forest Model"):
                with st.spinner("Training model..."):
                    model = RandomForestClassifier(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        random_state=42
                    )
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    accuracy = accuracy_score(y_test, y_pred)
                    st.subheader(f"Model Accuracy: {accuracy:.4f}")

                    # Feature importance
                    feature_imp = pd.DataFrame({
                        'Feature': X.columns,
                        'Importance': model.feature_importances_
                    }).sort_values('Importance', ascending=False)
                    st.subheader("Feature Importance")
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.barplot(x='Importance', y='Feature', data=feature_imp, ax=ax)
                    st.pyplot(fig)

                    # Confusion matrix
                    st.subheader("Confusion Matrix")
                    cm = confusion_matrix(y_test, y_pred)
                    fig, ax = plt.subplots(figsize=(10, 8))
                    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
                    plt.ylabel('Actual')
                    plt.xlabel('Predicted')
                    st.pyplot(fig)

                    # Classification report. Pass explicit labels= so the
                    # report cannot crash when some class is absent from the
                    # test split (target_names length must match labels).
                    st.subheader("Classification Report")
                    report = classification_report(
                        y_test, y_pred,
                        labels=list(range(len(le_target.classes_))),
                        target_names=le_target.classes_,
                        output_dict=True,
                        zero_division=0,
                    )
                    report_df = pd.DataFrame(report).transpose()
                    st.dataframe(report_df)
else:
    st.info("Please upload a CSV file to get started.")
    # Sample data option
    if st.button("Use Sample Dataset (Iris)"):
        from sklearn.datasets import load_iris
        iris = load_iris()
        iris_df = pd.DataFrame(
            data=np.c_[iris['data'], iris['target']],
            columns=iris['feature_names'] + ['target']
        )
        # Convert the numeric target codes to species names.
        iris_df['target'] = iris_df['target'].map(
            {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
        )
        # Persist through the rerun via session_state; the original assigned a
        # local df and called the deprecated st.experimental_rerun(), which
        # discarded the dataframe (and wrote a CSV that was never read back).
        st.session_state["sample_df"] = iris_df
        st.success("Iris dataset loaded!")
        st.rerun()
# (Trailing chat-transcript artifact removed — it was not part of the program.)