Ploomber AI Editor | url-data-visualizer-f717

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Heading rendered at the top of the page
st.title("CSV Data Visualizer")

# Address pre-filled in the input box (the seaborn "penguins" sample CSV)
DEFAULT_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"

# Free-text box where the user supplies the address of the CSV to load
url = st.text_input(label="Enter CSV URL", value=DEFAULT_CSV_URL)

# Cache data loading to improve performance. Only this inner read is cached:
# when it raises, nothing is stored, so a failed download is retried on the
# next rerun instead of being remembered as a permanent "no data" result.
@st.cache_data
def _read_csv_cached(source_url):
    """Download and parse the CSV at *source_url*; any failure propagates to the caller."""
    return pd.read_csv(source_url)


def load_data(url):
    """Load the CSV at *url* into a DataFrame, routing the request through corsproxy.io.

    Args:
        url: Address typed by the user. Surrounding whitespace is ignored and
            the address must start with ``http://`` or ``https://``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the URL is rejected
        or the download/parse fails. In both failure cases an ``st.error``
        message is shown to the user.
    """
    from urllib.parse import urlsplit

    # Pasted URLs frequently carry stray spaces or a trailing newline
    url = url.strip()

    if not url.startswith(("http://", "https://")):
        st.error("Please enter a valid URL starting with http:// or https://")
        return None

    try:
        # NOTE: Using corsproxy.io because we're in a WASM environment. If running locally,
        # you can remove the corsproxy.io prefix. Some websites don't work with the proxy,
        # in those cases try removing the proxy prefix.
        # Compare the actual host rather than searching the whole string, so a
        # URL that merely mentions "corsproxy.io" in its path or query is still proxied.
        host = urlsplit(url).hostname or ""
        already_proxied = host == "corsproxy.io" or host.endswith(".corsproxy.io")
        proxy_url = url if already_proxied else f"https://corsproxy.io/?{url}"
        return _read_csv_cached(proxy_url)
    except Exception as e:
        # Top-level boundary for the UI: report any network/parse problem instead of crashing
        st.error(f"Error loading data: {e}")
        return None

def _rotate_x_labels(ax, degrees=45):
    """Rotate the x tick labels of *ax* directly, without relying on pyplot's current axes."""
    ax.tick_params(axis="x", labelrotation=degrees)


def _show_figure(fig):
    """Render *fig* in the app and then close it.

    Figures created with ``plt.subplots`` stay registered in pyplot until they
    are closed, and this script creates a new one on every rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_single(data, x_var, is_numeric):
    """Plot one column: histogram with KDE when numeric, count plot otherwise."""
    st.subheader(f"Visualization of {x_var}")

    fig, ax = plt.subplots(figsize=(10, 6))

    if is_numeric:
        # For numerical variables: histogram and KDE
        sns.histplot(data=data, x=x_var, kde=True, ax=ax)
        ax.set_title(f"Distribution of {x_var}")
    else:
        # For categorical variables: count plot
        sns.countplot(data=data, x=x_var, ax=ax)
        ax.set_title(f"Count of {x_var}")
        _rotate_x_labels(ax)

    _show_figure(fig)

    # Show summary statistics
    if is_numeric:
        st.write("Summary statistics:")
        st.write(data[x_var].describe())


def _plot_pair(data, x_var, y_var, numerical_cols):
    """Plot two columns, choosing the chart type from which of them are numeric."""
    st.subheader(f"Visualization of {x_var} vs {y_var}")

    fig, ax = plt.subplots(figsize=(10, 6))

    x_is_numeric = x_var in numerical_cols
    y_is_numeric = y_var in numerical_cols

    if x_is_numeric and y_is_numeric:
        # Both numerical: scatter plot
        sns.scatterplot(data=data, x=x_var, y=y_var, ax=ax)
        ax.set_title(f"Scatter plot of {x_var} vs {y_var}")

        # Add correlation info (written before the figure is rendered)
        corr = data[[x_var, y_var]].corr().iloc[0, 1]
        st.write(f"Correlation between {x_var} and {y_var}: {corr:.2f}")

    elif y_is_numeric:
        # Categorical x, Numerical y: box plot
        sns.boxplot(data=data, x=x_var, y=y_var, ax=ax)
        ax.set_title(f"Box plot of {y_var} by {x_var}")
        _rotate_x_labels(ax)

    elif x_is_numeric:
        # Numerical x, Categorical y: box plot (flipped)
        sns.boxplot(data=data, x=y_var, y=x_var, ax=ax)
        ax.set_title(f"Box plot of {x_var} by {y_var}")
        _rotate_x_labels(ax)

    else:
        # Both categorical: stacked bars. The crosstab index becomes the bar
        # axis, so the X selection must be the index to land on the x-axis.
        crosstab = pd.crosstab(data[x_var], data[y_var])
        crosstab.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(f"Stacked bar plot of {x_var} and {y_var}")
        _rotate_x_labels(ax)

    _show_figure(fig)


def _render_explorer(data):
    """Render the shape summary, the preview controls/table and the chart for *data*."""
    total_rows, total_cols = data.shape

    # Display dataset info
    st.write(f"Dataset shape: {total_rows} rows, {total_cols} columns")

    if data.empty:
        st.info("The dataset is empty, so there is nothing to preview or plot.")
        return

    # Control options
    left_col, right_col = st.columns(2)

    with left_col:
        # Number of rows to display. The slider starts at 5 with a default of
        # 10, which is only a valid range/default when enough rows exist, so
        # clamp the default and skip the slider entirely for tiny datasets.
        max_rows = min(100, total_rows)
        if max_rows > 5:
            num_rows = st.slider("Number of rows to display", 5, max_rows, min(10, max_rows))
        else:
            num_rows = total_rows
            st.write(f"Showing all {total_rows} rows")

    with right_col:
        # Option to shuffle data
        shuffle = st.checkbox("Randomly shuffle data")

    # Process data based on user options; the fixed seed keeps the shuffled
    # order identical across reruns
    if shuffle:
        display_data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    else:
        display_data = data

    # Display the dataframe
    st.subheader("Data Preview")
    st.dataframe(display_data.head(num_rows))

    # Column selection for visualization
    st.subheader("Data Visualization")

    # Every column is offered: numeric dtypes first, and everything else
    # (strings, categories, booleans, ...) treated as categorical, so no
    # column silently disappears from the selectors.
    numerical_cols = data.select_dtypes(include="number").columns.tolist()
    categorical_cols = [col for col in data.columns if col not in numerical_cols]
    all_cols = numerical_cols + categorical_cols

    # Let user select columns for visualization
    st.write("Select variables to visualize:")
    x_slot, y_slot = st.columns(2)

    with x_slot:
        x_var = st.selectbox("X-axis variable", all_cols, index=0)

    with y_slot:
        y_var = st.selectbox("Y-axis variable (optional)", ["None"] + all_cols, index=0)

    # Determine the best visualization based on selected variables
    if y_var == "None":
        _plot_single(data, x_var, x_var in numerical_cols)
    else:
        _plot_pair(data, x_var, y_var, numerical_cols)


# Load data
data = load_data(url)

if data is not None:
    _render_explorer(data)
else:
    st.warning("Please enter a valid URL to a CSV file.")
Loading code editor...
Click Save & Run to preview your app
Terminal