Create an application that has a text field to enter a URL (default value: https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv). The app should read the CSV file from the URL (and cache it) and display a DataFrame. Add controls to do the following: (1) control how many rows are displayed; (2) a checkbox to determine whether to randomly shuffle the DataFrame. Users select one or two variables, and you must determine the best way to visualize them.
Drop files here
or click to upload
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Page heading shown at the top of the app
st.title("Penguin Data Explorer")

# URL entry box, pre-filled with the seaborn penguins dataset
DEFAULT_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
url = st.text_input(label="Enter CSV URL", value=DEFAULT_CSV_URL)
# Cached download: only successful reads are stored in Streamlit's data cache
@st.cache_data
def _fetch_csv(url):
    """Download and parse the CSV at *url*.

    Any exception from ``pd.read_csv`` propagates to the caller, so a failed
    attempt never produces a cached return value and the next call retries.
    """
    return pd.read_csv(url)


def load_data(url):
    """Return the CSV at *url* as a DataFrame, or ``None`` if loading fails.

    The error handling lives outside the cached helper on purpose: when the
    ``try/except`` sat inside the cached function, a failure returned ``None``
    normally and that ``None`` was cached for the URL, so a transient network
    error kept being served until the cache was cleared.

    Args:
        url: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The parsed DataFrame, or ``None`` after showing the error in the app.
    """
    try:
        return _fetch_csv(url)
    except Exception as e:  # UI boundary: report every load/parse failure
        st.error(f"Error loading data: {e}")
        return None
# Load data and render the page
def _render_figure(fig):
    """Draw *fig* in the app, then close it.

    Figures created with ``plt.subplots`` stay registered with pyplot until
    they are closed; closing after rendering keeps them from accumulating
    each time the script runs.
    """
    st.pyplot(fig)
    plt.close(fig)


def _render_preview(data):
    """Render the shuffle and row-count controls plus the table preview.

    Returns the DataFrame the rest of the page should use: a shuffled copy
    when the checkbox is ticked, otherwise *data* unchanged.
    """
    st.sidebar.header("Data Controls")

    shuffle = st.sidebar.checkbox("Randomly shuffle data", value=False)
    if shuffle:
        # Fixed seed: the shuffled order is the same on every run.
        data = data.sample(frac=1, random_state=42).reset_index(drop=True)

    total_rows = len(data)
    if total_rows > 1:
        num_rows = st.sidebar.slider(
            "Number of rows to display",
            min_value=1,
            max_value=total_rows,
            value=min(10, total_rows),
        )
    else:
        # Nothing to choose with a single row; skip the widget rather than
        # render a slider whose min and max coincide.
        num_rows = total_rows

    st.subheader("Data Preview")
    st.dataframe(data.head(num_rows))
    return data


def _render_single(data, variable, is_numeric):
    """Plot one column: histogram + boxplot if numeric, count plot otherwise."""
    st.write(f"Visualization for: {variable}")

    values = data[variable].dropna()
    if values.empty:
        st.warning(f"Column '{variable}' has no non-missing values to plot.")
        return

    if is_numeric:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        sns.histplot(values, kde=True, ax=axes[0])
        axes[0].set_title(f"Distribution of {variable}")
        sns.boxplot(y=values, ax=axes[1])
        axes[1].set_title(f"Boxplot of {variable}")
        _render_figure(fig)

        st.write("Summary statistics:")
        st.write(data[variable].describe())
    else:
        value_counts = values.value_counts()
        fig, ax = plt.subplots(figsize=(10, 6))
        # Most frequent category first.
        sns.countplot(y=values, order=value_counts.index, ax=ax)
        ax.set_title(f"Count of {variable}")
        _render_figure(fig)

        st.write("Value counts:")
        st.write(value_counts)


def _render_pair(data, var1, var2, numeric_cols):
    """Plot two columns, choosing the chart from their numeric/non-numeric mix.

    numeric + numeric -> scatter plot with regression line and correlation;
    one of each       -> box plot of the numeric column per category;
    neither numeric   -> heatmap of the cross-tabulated counts.
    """
    st.write(f"Visualization for: {var1} vs {var2}")

    first_numeric = var1 in numeric_cols
    second_numeric = var2 in numeric_cols

    if first_numeric and second_numeric:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(data=data, x=var1, y=var2, ax=ax)
        sns.regplot(data=data, x=var1, y=var2, scatter=False, ax=ax, color="red")
        ax.set_title(f"{var1} vs {var2}")

        correlation = data[var1].corr(data[var2])
        st.write(f"Correlation coefficient: {correlation:.4f}")
    elif first_numeric or second_numeric:
        # Exactly one numeric column: it goes on the y-axis, whichever
        # selector it was picked in.
        category, measure = (var2, var1) if first_numeric else (var1, var2)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=data, x=category, y=measure, ax=ax)
        ax.set_title(f"{measure} by {category}")
    else:
        crosstab = pd.crosstab(data[var1], data[var2], normalize=False)
        if crosstab.empty:
            st.warning(f"No rows have values for both '{var1}' and '{var2}'.")
            return
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(crosstab, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f"Counts of {var1} vs {var2}")

    _render_figure(fig)


def _render_visualization(data):
    """Render the variable selectors and the chart for the chosen column(s)."""
    st.subheader("Data Visualization")

    # Every numeric dtype counts as numeric; all other columns are treated
    # as categorical, so no column falls outside both groups.
    numeric_cols = set(data.select_dtypes(include="number").columns)
    all_cols = data.columns.tolist()

    st.sidebar.header("Visualization Controls")
    var_count = st.sidebar.radio("Select number of variables to visualize", [1, 2])

    if var_count == 1:
        variable = st.sidebar.selectbox("Select a variable", all_cols)
        _render_single(data, variable, variable in numeric_cols)
    else:
        var1 = st.sidebar.selectbox("Select first variable", all_cols, key="var1")
        # Start the second selector on a different column when one exists, so
        # the default view is not a column plotted against itself.
        var2 = st.sidebar.selectbox(
            "Select second variable",
            all_cols,
            index=min(1, len(all_cols) - 1),
            key="var2",
        )
        _render_pair(data, var1, var2, numeric_cols)


csv_url = url.strip()
if csv_url:
    data = load_data(csv_url)
    if data is not None:
        st.write(f"Dataset shape: {data.shape}")
        if data.empty:
            # No rows or no columns: the controls below would have nothing
            # to operate on.
            st.warning("The dataset is empty; there is nothing to display.")
        else:
            data = _render_preview(data)
            _render_visualization(data)
else:
    st.warning("Please enter a valid CSV URL")
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?