Ploomber AI Editor | csv-visualizer-4f4c

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

st.set_page_config(page_title="CSV Data Visualizer", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Read the CSV at ``url`` into a DataFrame, memoised by Streamlit.

    Any exception from ``pd.read_csv`` propagates to the caller, so only
    successful reads produce a return value for the cache to store.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, caching successful reads only.

    Args:
        url: Location handed straight to ``pd.read_csv``.

    Returns:
        ``(df, None)`` when the read succeeds, otherwise ``(None, message)``
        where ``message`` is the stringified exception.
    """
    # The try/except lives outside the cached helper on purpose: returning
    # the (None, error) tuple from inside the cached function would store a
    # transient failure (e.g. a network hiccup) as the result for that URL.
    try:
        return _read_csv_cached(url), None
    except Exception as e:  # UI boundary: every failure becomes a message
        return None, str(e)

def get_column_type(series):
    """Classify a column as ``'numeric'`` or ``'categorical'``.

    Args:
        series: The pandas Series (column) to classify.

    Returns:
        ``'numeric'`` for non-boolean numeric dtypes, ``'categorical'`` for
        everything else (strings, booleans, datetimes, ...).
    """
    # pandas reports bool dtypes as numeric, but a True/False column is a
    # two-level category; ``select_dtypes(include=['number'])`` likewise
    # leaves booleans out, so check for bool first.
    if pd.api.types.is_bool_dtype(series):
        return 'categorical'
    if pd.api.types.is_numeric_dtype(series):
        return 'numeric'
    return 'categorical'

def create_visualization(df, selected_vars):
    """Create an appropriate Plotly figure for one or two selected columns.

    Chart selection (column kinds come from ``get_column_type``):
      * one numeric column        -> histogram
      * one categorical column    -> bar chart of value counts
      * two numeric columns       -> scatter plot
      * one numeric + one categorical -> box plot, categories on the x axis
      * two categorical columns   -> grouped bar chart of their cross-tab

    Args:
        df: DataFrame containing the columns named in ``selected_vars``.
        selected_vars: Sequence holding one or two column names.

    Returns:
        The Plotly Express figure.

    Raises:
        ValueError: If ``selected_vars`` does not contain exactly one or two
            names (previously this surfaced as an UnboundLocalError).
    """
    if len(selected_vars) not in (1, 2):
        raise ValueError(
            f"Expected 1 or 2 variables, got {len(selected_vars)}"
        )

    if len(selected_vars) == 1:
        var = selected_vars[0]
        if get_column_type(df[var]) == 'numeric':
            # Histogram for single numeric variable
            return px.histogram(df, x=var, title=f'Distribution of {var}')

        # Bar chart for single categorical variable
        value_counts = df[var].value_counts()
        return px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f'Count of {var}',
            labels={'x': var, 'y': 'Count'},
        )

    var1, var2 = selected_vars
    var1_type = get_column_type(df[var1])
    var2_type = get_column_type(df[var2])

    if var1_type == 'numeric' and var2_type == 'numeric':
        # Scatter plot for two numeric variables
        return px.scatter(df, x=var1, y=var2, title=f'{var1} vs {var2}')

    if var1_type == 'categorical' and var2_type == 'categorical':
        # Grouped bar chart for two categorical variables. px.bar stacks
        # bars by default, so the grouping has to be requested explicitly.
        crosstab = pd.crosstab(df[var1], df[var2])
        return px.bar(crosstab, title=f'{var1} vs {var2}', barmode='group')

    # Mixed case: box plot with the categorical variable on the x axis,
    # whichever order the two were selected in.
    if var1_type == 'categorical':
        cat_var, num_var = var1, var2
    else:
        cat_var, num_var = var2, var1
    return px.box(df, x=cat_var, y=num_var, title=f'{num_var} by {cat_var}')

# Main app: page header, URL input, then preview / info / chart / summary
# sections for the loaded DataFrame.
st.title("📊 CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input. Surrounding whitespace (common when pasting) is stripped so it
# neither reaches the loader nor makes a blank entry look like a URL.
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to load and visualize"
).strip()

if url:
    # Load data
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if error:
        st.error(f"Error loading data: {error}")
    elif df is not None and df.empty:
        # A header-only CSV yields zero rows; the row slider below would then
        # be asked for min_value=1 with max_value=0 and raise.
        st.warning(f"The CSV was loaded but contains no rows. Shape: {df.shape}")
    elif df is not None:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        # Controls
        col1, col2 = st.columns(2)

        with col1:
            max_rows = len(df)
            if max_rows > 1:
                num_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=max_rows,
                    value=min(100, max_rows),
                    help="Control how many rows are shown in the table"
                )
            else:
                # With a single row there is nothing to choose, so skip the
                # widget instead of rendering a degenerate 1..1 range.
                num_rows = max_rows

        with col2:
            shuffle = st.checkbox(
                "Randomly shuffle data",
                help="Randomly reorder the DataFrame rows"
            )

        # Process data based on controls. The fixed seed keeps the shuffled
        # order identical from one call to the next. ``sample`` and ``head``
        # both return new objects, so ``df`` itself is never modified.
        if shuffle:
            display_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        else:
            display_df = df
        display_df = display_df.head(num_rows)

        # Display DataFrame
        st.subheader("📋 Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info
        with st.expander("📊 Data Information"):
            info_col, types_col = st.columns(2)
            with info_col:
                st.write("**Dataset Info:**")
                st.write(f"- Total rows: {len(df)}")
                st.write(f"- Total columns: {len(df.columns)}")
                st.write(f"- Memory usage: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

            with types_col:
                st.write("**Column Types:**")
                for col in df.columns:
                    col_type = get_column_type(df[col])
                    st.write(f"- {col}: {col_type}")

        # Variable selection for visualization
        st.subheader("📈 Create Visualization")
        st.markdown("Select one or two variables to create an appropriate visualization:")

        selected_vars = st.multiselect(
            "Select variables:",
            options=df.columns.tolist(),
            max_selections=2,
            help="Choose 1 variable for distribution plots or 2 variables for relationship plots"
        )

        if selected_vars:
            try:
                fig = create_visualization(df, selected_vars)
                st.plotly_chart(fig, use_container_width=True)

                # Show visualization explanation; the branches mirror the
                # chart selection made in create_visualization.
                with st.expander("🔍 Visualization Details"):
                    if len(selected_vars) == 1:
                        var = selected_vars[0]
                        var_type = get_column_type(df[var])
                        if var_type == 'numeric':
                            st.write(f"**Histogram** - Shows the distribution of the numeric variable '{var}'")
                        else:
                            st.write(f"**Bar Chart** - Shows the count of each category in '{var}'")
                    else:
                        var1, var2 = selected_vars
                        var_types = [get_column_type(df[var1]), get_column_type(df[var2])]

                        if var_types == ['numeric', 'numeric']:
                            st.write(f"**Scatter Plot** - Shows the relationship between two numeric variables '{var1}' and '{var2}'")
                        elif 'categorical' in var_types and 'numeric' in var_types:
                            st.write("**Box Plot** - Shows the distribution of the numeric variable across categories")
                        else:
                            st.write("**Grouped Bar Chart** - Shows the relationship between two categorical variables")

            except Exception as e:
                # UI boundary: show the failure instead of crashing the page.
                st.error(f"Error creating visualization: {e}")
        else:
            st.info("👆 Select one or two variables above to create a visualization")

        # Summary statistics
        with st.expander("📊 Summary Statistics"):
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                st.write("**Numeric Variables:**")
                st.dataframe(df[numeric_cols].describe())

            categorical_cols = df.select_dtypes(exclude=['number']).columns
            if len(categorical_cols) > 0:
                st.write("**Categorical Variables:**")
                for col in categorical_cols:
                    unique_count = df[col].nunique()
                    st.write(f"**{col}** - Unique values: {unique_count}")
                    if unique_count <= 10:
                        # Show every category: the guard already limits this
                        # to at most 10 rows, whereas .head() cut it to 5.
                        st.write(df[col].value_counts())
                    st.write("---")

else:
    st.info("👆 Enter a CSV URL above to get started!")
Loading code editor...
Click Save & Run to preview your app
Terminal