Ploomber AI Editor | data-visualizer-3897

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Set page config
st.set_page_config(page_title="Data Visualizer", layout="wide")

@st.cache_data
def _fetch_csv(url):
    """Read the CSV at ``url`` into a DataFrame; only successful reads are cached.

    Exceptions from ``pd.read_csv`` propagate to the caller, so a failed
    read produces no return value for the cache to store.
    """
    return pd.read_csv(url)

def load_data(url):
    """Load CSV data from a URL, reusing the cached copy when available.

    Args:
        url: Location handed straight to ``pd.read_csv``.

    Returns:
        A ``(df, error)`` tuple: ``(DataFrame, None)`` on success, or
        ``(None, message)`` where ``message`` is the stringified exception.
    """
    try:
        return _fetch_csv(url), None
    except Exception as e:
        # Failures are reported as text so the UI can display them. The
        # try/except deliberately lives outside the cached helper: catching
        # inside it would turn the error into a cached return value and a
        # transient network failure would be replayed for the same URL.
        return None, str(e)

def shuffle_dataframe(df, shuffle=False):
    """Return ``df`` with its rows in random order when ``shuffle`` is truthy.

    With ``shuffle`` falsy the input object is returned as-is. Otherwise all
    rows are sampled without replacement and the index is renumbered from 0.
    """
    if not shuffle:
        return df

    permuted = df.sample(frac=1)
    return permuted.reset_index(drop=True)

def determine_column_type(column):
    """Classify a column as ``"numeric"`` or ``"categorical"``.

    A column is "numeric" when pandas reports a numeric dtype for it;
    every other dtype is labelled "categorical".
    """
    is_numeric = pd.api.types.is_numeric_dtype(column)
    return "numeric" if is_numeric else "categorical"

def create_visualization(df, var1, var2=None):
    """Build a Plotly figure suited to the types of the selected column(s).

    Chart selection:
        * one numeric column          -> histogram
        * one categorical column      -> bar chart of value counts
        * numeric vs numeric          -> scatter plot
        * categorical vs categorical  -> bar chart of the cross-tabulation
        * numeric vs categorical      -> box plot with the categorical
          column on the x axis

    Args:
        df: DataFrame holding the data to plot.
        var1: Name of the first column.
        var2: Name of an optional second column; ``None`` selects the
            single-variable charts.

    Returns:
        The Plotly Express figure.
    """
    var1_type = determine_column_type(df[var1])

    if var2 is None:
        # Single variable visualization
        if var1_type == "numeric":
            return px.histogram(df, x=var1, title=f"Distribution of {var1}")

        value_counts = df[var1].value_counts()
        fig = px.bar(x=value_counts.index, y=value_counts.values,
                     title=f"Count of {var1}")
        # The Figure methods are the plural update_xaxes/update_yaxes; the
        # singular spellings do not exist and raise AttributeError.
        fig.update_xaxes(title_text=var1)
        fig.update_yaxes(title_text="Count")
        return fig

    # Two variable visualization
    var2_type = determine_column_type(df[var2])

    if var1_type == "numeric" and var2_type == "numeric":
        return px.scatter(df, x=var1, y=var2, title=f"{var1} vs {var2}")

    if var1_type == "categorical" and var2_type == "categorical":
        contingency = pd.crosstab(df[var1], df[var2])
        return px.bar(contingency, title=f"{var1} by {var2}")

    # Mixed types: keep the categorical column on the x axis.
    if var1_type == "categorical":
        return px.box(df, x=var1, y=var2, title=f"{var2} by {var1}")
    return px.box(df, x=var2, y=var1, title=f"{var1} by {var2}")

# Main app: page header and URL input, followed by the data preview,
# visualization and quick-insight sections once a dataset has loaded.
st.title("📊 Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input
default_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
url = st.text_input("Enter CSV URL:", value=default_url)

if url:
    # Load data
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if error:
        st.error(f"Error loading data: {error}")
    elif df is not None and df.empty:
        # A CSV with no rows (or no columns) parses without error but leaves
        # nothing to preview or plot, and would hand the row slider below
        # min_value=1 with max_value=0. Report it instead of continuing.
        st.warning("The CSV loaded successfully but contains no data to display.")
    elif df is not None:
        st.success(f"Successfully loaded {len(df)} rows and {len(df.columns)} columns")

        # Controls
        col1, col2 = st.columns(2)

        with col1:
            if len(df) > 1:
                max_rows = st.slider("Number of rows to display:",
                                     min_value=1,
                                     max_value=len(df),
                                     value=min(100, len(df)))
            else:
                # NOTE(review): with a single row the slider's bounds would
                # be identical, so the widget is skipped and the one row is
                # always shown — confirm this is the desired UI.
                max_rows = len(df)

        with col2:
            shuffle = st.checkbox("Randomly shuffle DataFrame")

        # Apply controls: shuffle first so the preview is a random subset.
        display_df = shuffle_dataframe(df, shuffle)
        display_df = display_df.head(max_rows)

        # Display DataFrame
        st.subheader("Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info (computed on the full dataset, not the preview)
        with st.expander("Data Information"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("**Column Types:**")
                for col in df.columns:
                    col_type = determine_column_type(df[col])
                    st.write(f"- {col}: {col_type}")

            with col2:
                st.write("**Summary Statistics:**")
                st.dataframe(df.describe())

        # Visualization section
        st.subheader("🎨 Data Visualization")

        # Variable selection
        columns = df.columns.tolist()

        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("Select first variable:", columns)

        with col2:
            var2 = st.selectbox("Select second variable (optional):",
                                ["None"] + columns,
                                index=0)

        # The "None" entry is a UI placeholder meaning "no second variable".
        if var2 == "None":
            var2 = None

        # Create and display visualization
        if var1:
            try:
                fig = create_visualization(df, var1, var2)
                st.plotly_chart(fig, use_container_width=True)

                # Show visualization info
                if var2 is None:
                    st.info(f"📈 Showing distribution of **{var1}** ({determine_column_type(df[var1])} variable)")
                else:
                    var1_type = determine_column_type(df[var1])
                    var2_type = determine_column_type(df[var2])
                    st.info(f"📈 Showing relationship between **{var1}** ({var1_type}) and **{var2}** ({var2_type})")

            except Exception as e:
                # Surface charting failures in the page rather than aborting
                # the rest of the script.
                st.error(f"Error creating visualization: {str(e)}")

        # Additional insights
        if len(df.columns) > 0:
            st.subheader("📋 Quick Insights")

            # Missing values
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                st.warning("⚠️ Missing values detected:")
                for col, missing in missing_data[missing_data > 0].items():
                    st.write(f"- {col}: {missing} missing values ({missing/len(df)*100:.1f}%)")
            else:
                st.success("✅ No missing values detected")

            # Unique values for categorical columns
            categorical_cols = [col for col in df.columns if determine_column_type(df[col]) == "categorical"]
            if categorical_cols:
                st.write("**Unique values in categorical columns:**")
                for col in categorical_cols[:5]:  # Show first 5 categorical columns
                    unique_count = df[col].nunique()
                    st.write(f"- {col}: {unique_count} unique values")
                    # Only list the values themselves when there are few of them.
                    if unique_count <= 10:
                        st.write(f"  Values: {', '.join(map(str, df[col].unique()))}")
Loading code editor...
Click Save & Run to preview your app
Terminal