Create an application that has a text field to enter a URL (default value: https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv). The app should read the CSV file from the URL (and cache it) and display a DataFrame. Add controls to do the following: - Control how many rows are displayed - A checkbox to determine whether to randomly shuffle the DataFrame. Users select one or two variables, and you must determine the best way to visualize them.
To upload files, please first save the app
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
# Page setup: browser-tab title and icon, plus the full-width ("wide") layout.
st.set_page_config(
    layout="wide",
    page_icon="📊",
    page_title="CSV Data Visualizer",
)
@st.cache_data
def _read_csv_cached(url):
    """Download and parse the CSV at *url*, cached by Streamlit per URL.

    Exceptions are deliberately allowed to propagate: only successful return
    values are stored in the cache, so a failed download is retried on the
    next call instead of being remembered.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, caching successful downloads.

    Args:
        url: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A ``(df, error)`` pair: ``(DataFrame, None)`` on success, or
        ``(None, message)`` when the file could not be read. ``message`` is
        never empty, so callers can test it for truthiness.
    """
    try:
        return _read_csv_cached(url), None
    except Exception as e:
        # Broad on purpose: this is the UI boundary, and any failure (network,
        # parsing, bad URL) should become a message rather than a traceback.
        # Some exceptions carry no text; fall back to the class name.
        return None, str(e) or type(e).__name__
def get_column_type(series):
    """Classify a pandas Series as ``"numeric"`` or ``"categorical"``.

    Boolean columns are reported as categorical. pandas counts ``bool`` as a
    numeric dtype, but a True/False column is better shown as category counts,
    and ``Series.describe()`` on booleans has no mean/std/quantile entries for
    the numeric summaries to read.

    Args:
        series: The column to classify.

    Returns:
        ``"numeric"`` for non-boolean numeric dtypes, otherwise
        ``"categorical"``.
    """
    # Must be checked first: is_numeric_dtype() is also True for booleans.
    if pd.api.types.is_bool_dtype(series):
        return "categorical"
    if pd.api.types.is_numeric_dtype(series):
        return "numeric"
    return "categorical"
def create_single_variable_plot(df, column):
    """Build the chart for one selected column.

    Numeric columns get a histogram of their values; every other column gets
    a bar chart of how often each distinct value occurs.

    Args:
        df: DataFrame holding the rows to plot.
        column: Name of the column to visualize.

    Returns:
        The Plotly figure.
    """
    values = df[column]

    if get_column_type(values) != "numeric":
        # Categorical: one bar per distinct value, height = occurrences.
        counts = values.value_counts()
        bar_chart = px.bar(
            x=counts.index,
            y=counts.values,
            title=f"Count of {column}",
        )
        bar_chart.update_xaxes(title=column)
        bar_chart.update_yaxes(title="Count")
        return bar_chart

    # Numeric: distribution of the raw values.
    histogram = px.histogram(df, x=column, title=f"Distribution of {column}")
    histogram.update_layout(showlegend=False)
    return histogram
def create_two_variable_plot(df, col1, col2):
    """Build the chart for a pair of selected columns.

    The chart type follows the kinds of the two columns:
    numeric/numeric -> scatter plot; one categorical and one numeric -> box
    plot with the categorical column on the x axis; anything else -> bar
    chart of the cross-tabulated counts.

    Args:
        df: DataFrame holding the rows to plot.
        col1: Name of the first selected column.
        col2: Name of the second selected column.

    Returns:
        The Plotly figure.
    """
    kinds = (get_column_type(df[col1]), get_column_type(df[col2]))

    if kinds == ("numeric", "numeric"):
        return px.scatter(df, x=col1, y=col2, title=f"{col1} vs {col2}")

    if kinds == ("categorical", "numeric"):
        return px.box(df, x=col1, y=col2, title=f"{col2} by {col1}")

    if kinds == ("numeric", "categorical"):
        # Same box plot with the roles swapped so categories stay on x.
        return px.box(df, x=col2, y=col1, title=f"{col1} by {col2}")

    # Remaining case: count every (col1, col2) combination and stack the bars.
    counts = pd.crosstab(df[col1], df[col2])
    stacked_bars = px.bar(counts, title=f"{col1} vs {col2}")
    stacked_bars.update_xaxes(title=col1)
    stacked_bars.update_yaxes(title="Count")
    return stacked_bars
# Main app
#
# Streamlit re-runs this script from top to bottom on every widget
# interaction, so the call order below is also the on-page layout order.


def _select_display_rows(df):
    """Render the row-count and shuffle controls; return the rows to display."""
    total_rows = len(df)
    rows_col, shuffle_col = st.columns(2)
    with rows_col:
        if total_rows > 1:
            num_rows = st.slider(
                "Number of rows to display:",
                min_value=1,
                max_value=total_rows,
                value=min(100, total_rows),
                help=f"Total rows available: {total_rows}",
            )
        else:
            # With a single row there is no range for a slider to span.
            num_rows = total_rows
            st.caption(f"Total rows available: {total_rows}")
    with shuffle_col:
        shuffle_data = st.checkbox(
            "Randomly shuffle data",
            value=False,
            help="Shuffle the DataFrame before displaying",
        )

    if not shuffle_data:
        # Forget the old seed so re-ticking the box produces a new order.
        st.session_state.pop("shuffle_seed", None)
        return df.head(num_rows)

    # Keep one seed while the box stays ticked. Without it, every rerun
    # (e.g. picking another variable) would reshuffle the table and the chart.
    if "shuffle_seed" not in st.session_state:
        st.session_state["shuffle_seed"] = int(np.random.randint(0, 2**31 - 1))
    shuffled = df.sample(frac=1, random_state=st.session_state["shuffle_seed"])
    return shuffled.reset_index(drop=True).head(num_rows)


def _show_dataset_information(df):
    """Render the collapsible per-column summary and ``describe()`` table."""
    with st.expander("Dataset Information"):
        columns_col, stats_col = st.columns(2)
        with columns_col:
            st.write("**Column Information:**")
            info_df = pd.DataFrame({
                'Column': df.columns,
                'Type': [get_column_type(df[col]) for col in df.columns],
                # Plain lists, so the values are placed by position rather
                # than aligned on the column-name index.
                'Non-null Count': df.count().tolist(),
                'Null Count': df.isnull().sum().tolist(),
            })
            st.dataframe(info_df, use_container_width=True)
        with stats_col:
            st.write("**Basic Statistics:**")
            st.dataframe(df.describe(), use_container_width=True)


def _show_single_variable_stats(df, column):
    """Render summary metrics (numeric) or top categories for one column."""
    st.write(f"**Statistics for {column}:**")
    if get_column_type(df[column]) == "numeric":
        stats = df[column].describe()
        metrics = (
            ("Mean", stats['mean']),
            ("Median", stats['50%']),
            ("Std Dev", stats['std']),
            ("Range", stats['max'] - stats['min']),
        )
        for slot, (label, value) in zip(st.columns(4), metrics):
            with slot:
                st.metric(label, f"{value:.2f}")
    else:
        value_counts = df[column].value_counts()
        st.write(f"Unique values: {len(value_counts)}")
        st.write("Top categories:")
        st.dataframe(value_counts.head(), use_container_width=True)


def _show_correlation(df, var1, var2):
    """Render the correlation of two columns when both are numeric."""
    both_numeric = (
        get_column_type(df[var1]) == "numeric"
        and get_column_type(df[var2]) == "numeric"
    )
    if not both_numeric:
        return

    correlation = df[var1].corr(df[var2])
    if pd.isna(correlation):
        # e.g. a constant column or too few paired values; "nan" followed by
        # "Weak correlation." would be misleading.
        st.info("Correlation is undefined for these variables.")
        return

    st.metric("Correlation", f"{correlation:.3f}")
    strength = abs(correlation)
    if strength > 0.7:
        st.info("Strong correlation detected!")
    elif strength > 0.3:
        st.info("Moderate correlation detected.")
    else:
        st.info("Weak correlation.")


def _show_visualization(df, display_df):
    """Render the variable pickers, the matching chart and its statistics."""
    st.write("Select variables to visualize:")
    available_columns = list(df.columns)
    first_col, second_col = st.columns(2)
    with first_col:
        var1 = st.selectbox("First variable:", available_columns, index=0)
    with second_col:
        var2 = st.selectbox(
            "Second variable (optional):", [None] + available_columns, index=0
        )

    if var1 is None:
        return

    # NOTE(review): charts use the displayed rows (display_df) while the
    # statistics use the full dataset (df), as in the original script.
    try:
        # Picking the same column twice is treated as picking it once.
        if var2 is None or var2 == var1:
            fig = create_single_variable_plot(display_df, var1)
            st.plotly_chart(fig, use_container_width=True)
            _show_single_variable_stats(df, var1)
        else:
            fig = create_two_variable_plot(display_df, var1, var2)
            st.plotly_chart(fig, use_container_width=True)
            _show_correlation(df, var1, var2)
    except Exception as e:
        # UI boundary: show the problem on the page instead of a traceback.
        st.error(f"Error creating visualization: {str(e)}")


def _show_quality_insights(df):
    """Render the collapsible missing-value and duplicate-row report."""
    with st.expander("Data Quality Insights"):
        st.write("**Missing Values:**")
        missing_data = df.isnull().sum()
        missing_data = missing_data[missing_data > 0]
        if missing_data.empty:
            st.success("No missing values found!")
        else:
            st.dataframe(
                missing_data.to_frame("Missing Count"), use_container_width=True
            )

        st.write("**Duplicate Rows:**")
        duplicate_count = df.duplicated().sum()
        st.write(f"Number of duplicate rows: {duplicate_count}")
        if duplicate_count > 0:
            st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
        else:
            st.success("No duplicate rows found!")


st.title("📊 CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input
st.subheader("Data Source")
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to analyze",
).strip()  # pasted URLs often carry stray whitespace

if not url:
    st.info("Please enter a CSV URL to get started.")
else:
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if error or df is None:
        st.error(f"Error loading data: {error}")
    elif df.empty:
        # A header-only CSV would otherwise give the row slider an invalid
        # range (min 1, max 0).
        st.warning("The CSV file was loaded but contains no data rows.")
    else:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        st.subheader("Data Controls")
        display_df = _select_display_rows(df)

        st.subheader("Data Preview")
        st.dataframe(display_df, use_container_width=True)

        _show_dataset_information(df)

        st.subheader("Data Visualization")
        _show_visualization(df, display_df)

        # Additional insights
        _show_quality_insights(df)
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?