Create an application that has a text field to enter a URL (default value: https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv). The app should read the CSV file from the URL (and cache it) and display it as a DataFrame. Add controls to do the following:
- A control for how many rows are displayed
- A checkbox that determines whether to randomly shuffle the DataFrame
Users select one or two variables, and you must determine the best way to visualize them.
To upload files, please first save the app
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Page heading rendered at the top of the app.
st.title("CSV Data Visualizer")

# The CSV source is user-editable; the field is pre-filled with the seaborn
# "penguins" sample dataset so the app shows something on first load.
DEFAULT_CSV_URL = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
url = st.text_input("Enter CSV URL", value=DEFAULT_CSV_URL)
# Cache data loading to improve performance. Only the raw read is cached: it
# raises on failure, so a failed download is retried on the next rerun instead
# of a cached ``None`` being served for that URL from then on.
@st.cache_data
def _read_csv_cached(request_url):
    """Read the CSV at ``request_url`` into a DataFrame (cached per URL)."""
    return pd.read_csv(request_url)


def load_data(url):
    """Load a CSV file from ``url`` and return it as a DataFrame.

    Args:
        url: Address of the CSV file. Must start with ``http://`` or
            ``https://``; surrounding whitespace is ignored.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the URL is rejected
        or the download/parse fails. In both failure cases an error message is
        shown in the app via ``st.error``.
    """
    # Pasted URLs often carry stray leading/trailing whitespace.
    url = url.strip()

    if not url.startswith(("http://", "https://")):
        st.error("Please enter a valid URL starting with http:// or https://")
        return None

    # NOTE: Using corsproxy.io because we're in a WASM environment. If running locally,
    # you can remove the corsproxy.io prefix. Some websites don't work with the proxy,
    # in those cases try removing the proxy prefix.
    proxy_url = f"https://corsproxy.io/?{url}" if "corsproxy.io" not in url else url

    try:
        return _read_csv_cached(proxy_url)
    except Exception as e:
        # Top-level UI boundary: report any network or parse failure to the
        # user rather than crashing the script run.
        st.error(f"Error loading data: {e}")
        return None
# Load data. load_data returns None when the URL is rejected or the read fails.
data = load_data(url)

if data is not None:
    # Display dataset info
    total_rows, total_cols = data.shape
    st.write(f"Dataset shape: {total_rows} rows, {total_cols} columns")

    # Control options
    col1, col2 = st.columns(2)
    with col1:
        # Number of rows to display. The slider is only offered when the
        # dataset has more rows than the slider's lower bound; otherwise the
        # lower bound (5) and default (10) would not fit inside the
        # [5, row count] range, so all rows are shown instead.
        min_rows = 5
        if total_rows > min_rows:
            num_rows = st.slider(
                "Number of rows to display",
                min_rows,
                min(100, total_rows),
                min(10, total_rows),
            )
        else:
            num_rows = total_rows
            st.write(f"Showing all {total_rows} rows")
    with col2:
        # Option to shuffle data
        shuffle = st.checkbox("Randomly shuffle data")

    # Process data based on user options. The fixed seed keeps the shuffled
    # order stable across reruns triggered by the other widgets.
    if shuffle:
        display_data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    else:
        # Nothing below mutates the frame, so no defensive copy is needed.
        display_data = data

    # Display the dataframe
    st.subheader("Data Preview")
    st.dataframe(display_data.head(num_rows))

    # Column selection for visualization
    st.subheader("Data Visualization")

    # Every numeric dtype counts as numerical; every other column (strings,
    # categories, booleans, ...) is treated as categorical so that no column
    # is silently dropped from the selection lists.
    numerical_cols = data.select_dtypes(include="number").columns.tolist()
    numerical_set = set(numerical_cols)
    categorical_cols = [c for c in data.columns if c not in numerical_set]
    all_cols = numerical_cols + categorical_cols

    if data.empty or not all_cols:
        st.warning("The dataset has no rows or columns to visualize.")
    else:
        # Let user select columns for visualization
        st.write("Select variables to visualize:")
        col1, col2 = st.columns(2)
        with col1:
            x_var = st.selectbox("X-axis variable", all_cols, index=0)
        with col2:
            y_var = st.selectbox(
                "Y-axis variable (optional)", ["None"] + all_cols, index=0
            )

        x_is_numeric = x_var in numerical_set

        # Determine the best visualization based on selected variables.
        # Picking the same column twice is handled as a single variable.
        if y_var == "None" or y_var == x_var:
            # Single variable visualization
            st.subheader(f"Visualization of {x_var}")
            fig, ax = plt.subplots(figsize=(10, 6))
            if x_is_numeric:
                # For numerical variables: histogram and KDE
                sns.histplot(data=data, x=x_var, kde=True, ax=ax)
                ax.set_title(f"Distribution of {x_var}")
            else:
                # For categorical variables: count plot
                sns.countplot(data=data, x=x_var, ax=ax)
                ax.set_title(f"Count of {x_var}")
                plt.xticks(rotation=45)
            st.pyplot(fig)
            # pyplot keeps every figure registered until it is closed; close
            # it so figures do not pile up across script reruns.
            plt.close(fig)

            # Show summary statistics
            if x_is_numeric:
                st.write("Summary statistics:")
                st.write(data[x_var].describe())
        else:
            # Two variables visualization
            st.subheader(f"Visualization of {x_var} vs {y_var}")
            fig, ax = plt.subplots(figsize=(10, 6))
            y_is_numeric = y_var in numerical_set

            if x_is_numeric and y_is_numeric:
                # Both numerical: scatter plot
                sns.scatterplot(data=data, x=x_var, y=y_var, ax=ax)
                ax.set_title(f"Scatter plot of {x_var} vs {y_var}")
                # Add correlation info
                corr = data[x_var].corr(data[y_var])
                st.write(f"Correlation between {x_var} and {y_var}: {corr:.2f}")
            elif y_is_numeric:
                # Categorical x, Numerical y: box plot
                sns.boxplot(data=data, x=x_var, y=y_var, ax=ax)
                ax.set_title(f"Box plot of {y_var} by {x_var}")
                plt.xticks(rotation=45)
            elif x_is_numeric:
                # Numerical x, Categorical y: box plot (flipped)
                sns.boxplot(data=data, x=y_var, y=x_var, ax=ax)
                ax.set_title(f"Box plot of {x_var} by {y_var}")
                plt.xticks(rotation=45)
            else:
                # Both categorical: stacked bars. The crosstab index becomes
                # the x-axis, so the X selection goes first and the bars are
                # stacked by the Y selection.
                crosstab = pd.crosstab(data[x_var], data[y_var])
                crosstab.plot(kind='bar', stacked=True, ax=ax)
                ax.set_title(f"Stacked bar plot of {x_var} and {y_var}")
                plt.xticks(rotation=45)
            st.pyplot(fig)
            plt.close(fig)
else:
    st.warning("Please enter a valid URL to a CSV file.")
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?