Ploomber AI Editor | csv-visualizer-3e1c

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Page setup: browser-tab title and icon, laid out across the full page width
st.set_page_config(layout="wide", page_icon="📊", page_title="CSV Data Visualizer")

@st.cache_data
def _read_csv_cached(url):
    """Read the CSV at *url* into a DataFrame.

    Only successful reads end up in the Streamlit cache: if ``pd.read_csv``
    raises, the exception propagates and nothing is stored for this URL.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, caching successful reads.

    Args:
        url: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A ``(df, error)`` tuple: ``(DataFrame, None)`` on success, or
        ``(None, message)`` when the read fails.
    """
    # The try/except lives outside the cached helper on purpose. Caching the
    # whole function would also cache the (None, error) result, so a transient
    # network failure would keep being replayed for that URL.
    try:
        return _read_csv_cached(url), None
    except Exception as e:  # UI boundary: any failure is reported to the user
        return None, str(e)

def shuffle_dataframe(df):
    """Return the rows of *df* in random order with a fresh 0..n-1 index."""
    # Sampling 100% of the rows without replacement yields a permutation.
    permuted = df.sample(frac=1)
    # Drop the old row labels so the result is numbered consecutively again.
    return permuted.reset_index(drop=True)

def determine_column_type(series):
    """Classify a column as ``'numeric'`` or ``'categorical'``.

    Args:
        series: The pandas Series (DataFrame column) to classify.

    Returns:
        ``'numeric'`` for integer/float-like dtypes, ``'categorical'`` for
        everything else, including boolean columns.
    """
    dtype_checks = pd.api.types
    # pandas counts bool as a numeric dtype, but a True/False column is a
    # two-level category: a histogram or scatter axis of booleans is not
    # meaningful, so classify it as categorical.
    if dtype_checks.is_bool_dtype(series):
        return 'categorical'
    return 'numeric' if dtype_checks.is_numeric_dtype(series) else 'categorical'

def create_visualization(df, selected_vars):
    """Build a Plotly figure suited to the selected column(s).

    One column: histogram (numeric) or bar chart of value counts
    (categorical). Two columns: scatter plot (both numeric), box plot (one
    numeric, one categorical) or crosstab heatmap (both categorical).

    Args:
        df: DataFrame holding the data.
        selected_vars: Sequence of column names; one or two are supported.

    Returns:
        The Plotly figure, or ``None`` when the number of selected
        variables is neither one nor two.
    """
    n_selected = len(selected_vars)

    if n_selected == 1:
        column = selected_vars[0]

        if determine_column_type(df[column]) == 'numeric':
            # Single numeric column -> histogram of its distribution
            figure = px.histogram(df, x=column, title=f'Distribution of {column}')
            figure.update_layout(showlegend=False)
            return figure

        # Single categorical column -> bar chart of category frequencies
        counts = df[column].value_counts()
        figure = px.bar(
            x=counts.index,
            y=counts.values,
            title=f'Count of {column}',
        )
        figure.update_xaxes(title=column)
        figure.update_yaxes(title='Count')
        return figure

    if n_selected != 2:
        return None

    first, second = selected_vars
    kinds = (
        determine_column_type(df[first]),
        determine_column_type(df[second]),
    )

    if kinds == ('numeric', 'numeric'):
        # Numeric vs numeric -> scatter plot
        return px.scatter(df, x=first, y=second, title=f'{first} vs {second}')

    if kinds == ('categorical', 'numeric'):
        # Categories on the x axis, numeric distribution on the y axis
        return px.box(df, x=first, y=second, title=f'{second} by {first}')

    if kinds == ('numeric', 'categorical'):
        # Same box plot with the roles swapped so categories stay on x
        return px.box(df, x=second, y=first, title=f'{first} by {second}')

    # Categorical vs categorical -> heatmap of the co-occurrence counts
    contingency = pd.crosstab(df[first], df[second])
    figure = px.imshow(
        contingency,
        text_auto=True,
        aspect="auto",
        title=f'{first} vs {second} (Heatmap)',
    )
    figure.update_xaxes(title=second)
    figure.update_yaxes(title=first)
    return figure

# Main app: page header, CSV URL input, then preview / summary / chart
# sections once a DataFrame has been loaded.
st.title("📊 CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input
st.subheader("📥 Data Source")
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to analyze"
)

if url:
    # Load data
    df, error = load_data(url)

    # Test the frame rather than the truthiness of the message: an exception
    # whose text is empty would otherwise fall through to the success branch
    # with df set to None.
    if df is None:
        st.error(f"Error loading data: {error}")
    elif df.empty:
        # E.g. a header-only CSV: the row slider below needs at least one row.
        st.warning("The CSV was loaded but contains no rows to display.")
    else:
        st.success(f"✅ Data loaded successfully! Shape: {df.shape}")

        total_rows = len(df)
        # Classify every column once and reuse the result in the selector
        # labels, the chart explanation and the column table.
        column_types = {col: determine_column_type(df[col]) for col in df.columns}

        # Controls
        st.subheader("⚙️ Controls")
        col1, col2 = st.columns(2)

        with col1:
            if total_rows > 1:
                max_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=total_rows,
                    value=min(100, total_rows),
                    help="Select how many rows to show in the table"
                )
            else:
                # With a single row there is nothing to choose, so skip the
                # slider rather than render a degenerate 1..1 range.
                max_rows = total_rows

        with col2:
            shuffle = st.checkbox(
                "Randomly shuffle DataFrame",
                value=False,
                help="Shuffle the order of rows in the DataFrame"
            )

        # Process data based on controls. Both shuffle_dataframe() and head()
        # return new frames, so df itself is never modified.
        display_df = shuffle_dataframe(df) if shuffle else df
        display_df = display_df.head(max_rows)

        # Display DataFrame
        st.subheader("📋 Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data summary
        st.subheader("📈 Data Summary")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", total_rows)
        with col2:
            st.metric("Total Columns", len(df.columns))
        with col3:
            # The double sum yields a NumPy integer; hand st.metric a plain int.
            st.metric("Missing Values", int(df.isnull().sum().sum()))

        # Variable selection for visualization
        st.subheader("🎨 Visualization")
        st.markdown("Select one or two variables to visualize:")

        selected_vars = st.multiselect(
            "Choose variables:",
            options=df.columns.tolist(),
            format_func=lambda x: f"{x} ({column_types[x]})",
            max_selections=2,
            help="Select 1-2 variables to create an appropriate visualization"
        )

        if selected_vars:
            # Create and display visualization
            fig = create_visualization(df, selected_vars)
            if fig is not None:
                st.plotly_chart(fig, use_container_width=True)

                # Explain which chart type was chosen for this selection
                with st.expander("ℹ️ Visualization Info"):
                    selected_types = [column_types[var] for var in selected_vars]
                    if len(selected_vars) == 1:
                        if selected_types[0] == 'numeric':
                            st.write("**Histogram**: Shows the distribution of a numeric variable.")
                        else:
                            st.write("**Bar Chart**: Shows the count of each category in a categorical variable.")
                    elif selected_types == ['numeric', 'numeric']:
                        st.write("**Scatter Plot**: Shows the relationship between two numeric variables.")
                    elif 'numeric' in selected_types:
                        # Exactly one of the two columns is numeric
                        st.write("**Box Plot**: Shows the distribution of a numeric variable across categories.")
                    else:
                        st.write("**Heatmap**: Shows the relationship between two categorical variables.")

        # Column information: type plus null / non-null counts per column
        with st.expander("📊 Column Information"):
            col_info_df = pd.DataFrame({
                'Column': df.columns,
                'Type': [column_types[col] for col in df.columns],
                'Non-Null Count': df.count().to_numpy(),
                'Null Count': df.isnull().sum().to_numpy()
            })
            st.dataframe(col_info_df, use_container_width=True)
Loading code editor...
Click Save & Run to preview your app
Terminal