Ploomber AI Editor | csv-data-visualizer-284b

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Configure the page: set its title and use the wide layout.
st.set_page_config(page_title="CSV Data Visualizer", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Read the CSV at ``url`` into a DataFrame, letting any exception propagate.

    Keeping the error handling outside this cached function means only
    successful reads are stored; a failed read raises instead of returning a
    value, so it is retried on the next call.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, caching successful reads.

    Args:
        url: Location handed directly to ``pandas.read_csv``.

    Returns:
        A ``(df, error)`` tuple: ``(DataFrame, None)`` on success, or
        ``(None, message)`` when reading fails. ``message`` is never empty.
    """
    try:
        return _read_csv_cached(url), None
    except Exception as e:
        # Some exceptions stringify to ""; fall back to the class name so the
        # caller's truthiness check on the error still fires.
        return None, str(e) or type(e).__name__


# Preserve the cache-clearing hook the decorated function used to expose.
load_data.clear = _read_csv_cached.clear

def get_column_type(series):
    """Classify a column as ``'numeric'`` or ``'categorical'``.

    Boolean columns are reported as categorical: pandas treats ``bool`` as a
    numeric dtype, but a True/False column is a two-level category for
    plotting and summary purposes.

    Args:
        series: The pandas Series (or dtype-bearing object) to classify.

    Returns:
        ``'numeric'`` for non-boolean numeric dtypes, otherwise
        ``'categorical'``.
    """
    dtype_checks = pd.api.types
    is_number = (
        dtype_checks.is_numeric_dtype(series)
        and not dtype_checks.is_bool_dtype(series)
    )
    return 'numeric' if is_number else 'categorical'

def create_visualization(df, selected_vars):
    """Create a Plotly figure appropriate for the selected column(s).

    One column:
        numeric -> histogram; categorical -> bar chart of value counts.
    Two columns:
        both numeric -> scatter plot; one numeric and one categorical -> box
        plot with the categorical column on the x-axis; both categorical ->
        stacked bar chart built from a cross-tabulation.

    Args:
        df: DataFrame that contains the selected columns.
        selected_vars: Sequence holding one or two column names.

    Returns:
        The Plotly Express figure.

    Raises:
        ValueError: If ``selected_vars`` does not hold exactly one or two
            names. Without this check the function would reach ``return fig``
            with ``fig`` never assigned.
    """
    if len(selected_vars) not in (1, 2):
        raise ValueError(
            f"Select 1 or 2 variables to visualize (got {len(selected_vars)})"
        )

    if len(selected_vars) == 1:
        col = selected_vars[0]

        if get_column_type(df[col]) == 'numeric':
            # Histogram for single numeric variable
            return px.histogram(df, x=col, title=f'Distribution of {col}')

        # Bar chart for single categorical variable
        value_counts = df[col].value_counts()
        return px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f'Count of {col}',
            labels={'x': col, 'y': 'Count'},
        )

    col1, col2 = selected_vars
    col1_type = get_column_type(df[col1])
    col2_type = get_column_type(df[col2])

    if col1_type == 'numeric' and col2_type == 'numeric':
        # Scatter plot for two numeric variables
        return px.scatter(df, x=col1, y=col2, title=f'{col1} vs {col2}')

    if col1_type == 'categorical' and col2_type == 'numeric':
        # Box plot for categorical vs numeric
        return px.box(df, x=col1, y=col2, title=f'{col2} by {col1}')

    if col1_type == 'numeric' and col2_type == 'categorical':
        # Box plot for numeric vs categorical (swap axes so the category is on x)
        return px.box(df, x=col2, y=col1, title=f'{col1} by {col2}')

    # Stacked bar chart for two categorical variables
    crosstab = pd.crosstab(df[col1], df[col2])
    return px.bar(
        crosstab,
        title=f'{col1} vs {col2}',
        labels={'value': 'Count', 'index': col1},
    )

# Main app: page header, URL input, then (once data loads) a data preview with
# dataset metrics on the left and a chart or column summary on the right.
st.title("🔍 CSV Data Visualizer")
st.markdown("Upload a CSV from a URL and explore your data with interactive visualizations!")

# URL input, pre-filled with a sample dataset
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter a direct URL to a CSV file",
)

if url:
    # Load data
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if error:
        st.error(f"Error loading data: {error}")
    elif df is not None and df.empty:
        # With no data rows the row slider below would get max_value=0 and
        # value=0, both under its min_value of 1, so report and stop here.
        st.info(f"The CSV loaded but contains no data rows. Shape: {df.shape}")
    elif df is not None:
        st.success(f"✅ Data loaded successfully! Shape: {df.shape}")

        # Data controls
        st.sidebar.header("Data Controls")

        # Number of rows to display
        max_rows = len(df)
        if max_rows > 1:
            num_rows = st.sidebar.slider(
                "Number of rows to display",
                min_value=1,
                max_value=max_rows,
                value=min(100, max_rows),
            )
        else:
            # A single-row dataset leaves nothing to choose; skip the slider
            # rather than give it an empty range (min_value == max_value).
            num_rows = max_rows

        # Shuffle checkbox
        shuffle_data = st.sidebar.checkbox("Randomly shuffle data", value=False)

        # Apply controls. `sample` and `head` return new objects, so `df`
        # itself is never modified and no defensive copy is needed.
        display_df = df
        if shuffle_data:
            display_df = display_df.sample(frac=1).reset_index(drop=True)
        display_df = display_df.head(num_rows)

        # Variable selection for visualization
        st.sidebar.header("Visualization Controls")
        st.sidebar.markdown("Select 1 or 2 variables to visualize:")

        selected_vars = st.sidebar.multiselect(
            "Select variables:",
            options=df.columns.tolist(),
            max_selections=2,
            help="Choose 1 variable for distribution plots or 2 variables for relationship plots",
        )

        # Create two equal-width columns for layout
        preview_pane, chart_pane = st.columns([1, 1])

        with preview_pane:
            st.subheader("📊 Data Preview")
            st.dataframe(display_df, use_container_width=True)

            # Data info
            st.subheader("📈 Dataset Info")
            info_col1, info_col2 = st.columns(2)
            with info_col1:
                st.metric("Total Rows", f"{len(df):,}")
                st.metric("Columns", len(df.columns))
            with info_col2:
                st.metric("Displayed Rows", f"{len(display_df):,}")
                st.metric("Missing Values", df.isnull().sum().sum())

        with chart_pane:
            if selected_vars:
                st.subheader("📊 Visualization")
                try:
                    # Charts and statistics use the full dataset, not just
                    # the rows shown in the preview.
                    fig = create_visualization(df, selected_vars)
                    st.plotly_chart(fig, use_container_width=True)

                    # Show basic statistics for selected variables
                    st.subheader("📋 Statistics")
                    for var in selected_vars:
                        if get_column_type(df[var]) == 'numeric':
                            st.write(f"**{var}** (Numeric)")
                            st.write(df[var].describe())
                        else:
                            st.write(f"**{var}** (Categorical)")
                            # Only the ten most frequent values are listed.
                            st.write(df[var].value_counts().head(10))
                        st.write("---")

                except Exception as e:
                    # Surface any plotting/statistics failure in the UI
                    # instead of crashing the whole page.
                    st.error(f"Error creating visualization: {str(e)}")
            else:
                st.info("👆 Select 1 or 2 variables from the sidebar to create a visualization")

                # Show column information when no variables selected
                st.subheader("📋 Column Information")
                col_info = [
                    {
                        'Column': name,
                        'Type': get_column_type(df[name]),
                        'Missing': df[name].isnull().sum(),
                        'Unique Values': df[name].nunique(),
                    }
                    for name in df.columns
                ]

                col_info_df = pd.DataFrame(col_info)
                st.dataframe(col_info_df, use_container_width=True)
Loading code editor...
Click Save & Run to preview your app
Terminal