Ploomber AI Editor | csv-data-visualizer-50e7

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set the page title and switch the app to the wide layout.
st.set_page_config(page_title="CSV Data Visualizer", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Read the CSV at *url* into a DataFrame.

    Kept separate from ``load_data`` so that only successful reads are
    stored by ``st.cache_data``: when ``pd.read_csv`` raises, nothing is
    returned and therefore nothing is cached.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load a CSV from a URL, caching successful reads.

    Args:
        url: Location handed unchanged to ``pandas.read_csv``.

    Returns:
        A ``(df, error)`` tuple: ``(DataFrame, None)`` on success, or
        ``(None, message)`` where *message* is ``str()`` of the exception
        raised while reading.
    """
    try:
        return _read_csv_cached(url), None
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing.
        # The error tuple is built outside the cached function, so a
        # transient failure (e.g. a network hiccup) is retried on the next
        # rerun rather than being replayed from the cache.
        return None, str(e)

def get_column_type(series):
    """Classify a column as 'numeric' or 'categorical' for chart selection.

    Args:
        series: The pandas Series (or dtype-bearing object) to inspect.

    Returns:
        ``'numeric'`` for numeric dtypes other than boolean, otherwise
        ``'categorical'``.
    """
    # pandas reports bool as a numeric dtype, but a True/False flag is a
    # two-level category: counts and grouped plots describe it better than
    # a histogram, scatter axis, or correlation coefficient.
    if pd.api.types.is_bool_dtype(series):
        return 'categorical'
    if pd.api.types.is_numeric_dtype(series):
        return 'numeric'
    return 'categorical'

def create_visualization(df, var1, var2=None):
    """Build a Plotly Express figure suited to the selected columns' types.

    Chart choice (types come from ``get_column_type``):
        * one numeric column          -> histogram
        * one categorical column      -> bar chart of value counts
        * numeric vs numeric          -> scatter plot
        * categorical vs numeric      -> box plot, category on the x axis
        * categorical vs categorical  -> grouped bar chart of the crosstab

    Args:
        df: DataFrame holding the columns to plot.
        var1: Name of the first column.
        var2: Optional name of a second column; ``None`` requests a
            single-variable chart.

    Returns:
        The figure object produced by the matching ``plotly.express`` call.
    """
    var1_type = get_column_type(df[var1])

    if var2 is None:
        if var1_type == 'numeric':
            return px.histogram(df, x=var1, title=f'Distribution of {var1}')
        value_counts = df[var1].value_counts()
        return px.bar(x=value_counts.index, y=value_counts.values,
                      title=f'Count of {var1}',
                      labels={'x': var1, 'y': 'Count'})

    var2_type = get_column_type(df[var2])

    if var1_type == 'numeric' and var2_type == 'numeric':
        return px.scatter(df, x=var1, y=var2, title=f'{var1} vs {var2}')
    if var1_type == 'categorical' and var2_type == 'numeric':
        return px.box(df, x=var1, y=var2, title=f'{var2} by {var1}')
    if var1_type == 'numeric' and var2_type == 'categorical':
        # Swap the axes so the categorical column is always on x.
        return px.box(df, x=var2, y=var1, title=f'{var1} by {var2}')

    # Both categorical. px.bar stacks series unless told otherwise, so the
    # grouped layout has to be requested explicitly.
    crosstab = pd.crosstab(df[var1], df[var2])
    return px.bar(crosstab, barmode='group', title=f'{var1} vs {var2}')

# Main app
def _column_summary(frame, column):
    """Return describe() for a numeric column, value_counts() otherwise."""
    series = frame[column]
    if get_column_type(series) == 'numeric':
        return series.describe()
    return series.value_counts()


st.title("CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create visualizations")

# URL input
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to load and visualize"
)

if not url:
    st.info("Please enter a CSV URL to get started.")
else:
    # Load data
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if df is None:
        # Key off the missing frame rather than the message text, so a
        # failure whose exception has an empty message is still reported.
        st.error(f"Error loading data: {error}")
    elif df.empty:
        # A header-only CSV has no rows; the row slider below would get an
        # invalid range (min 1 > max 0) and there is nothing to plot.
        st.warning("The CSV was loaded but contains no data to display.")
    else:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        # Create two columns for controls
        col1, col2 = st.columns(2)

        with col1:
            # Row count control
            max_rows = len(df)
            if max_rows > 1:
                num_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=max_rows,
                    value=min(100, max_rows),
                    help="Control how many rows are shown in the DataFrame"
                )
            else:
                # One row leaves nothing to choose; skip the slider instead
                # of giving it a degenerate 1..1 range.
                num_rows = max_rows

        with col2:
            # Shuffle control
            shuffle_data = st.checkbox(
                "Randomly shuffle DataFrame",
                help="Randomly shuffle the order of rows in the DataFrame"
            )

        # Shuffle once and share the result between the table and the
        # chart; sample() returns a new frame, so no defensive copy is
        # needed. The fixed seed keeps the order stable across reruns.
        if shuffle_data:
            working_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        else:
            working_df = df

        # Display DataFrame
        st.subheader("DataFrame")
        st.dataframe(working_df.head(num_rows), use_container_width=True)

        # Variable selection for visualization
        st.subheader("Data Visualization")

        # Get column names
        columns = df.columns.tolist()

        col1, col2 = st.columns(2)

        with col1:
            var1 = st.selectbox(
                "Select first variable:",
                options=columns,
                help="Choose the first variable for visualization"
            )

        with col2:
            var2 = st.selectbox(
                "Select second variable (optional):",
                options=[None] + columns,
                help="Optionally choose a second variable for bivariate visualization"
            )

        # Create and display visualization
        if var1 is not None:
            try:
                # Use the full dataset for visualization, not just the
                # displayed rows.
                fig = create_visualization(working_df, var1, var2)
                st.plotly_chart(fig, use_container_width=True)

                # Show some basic statistics
                st.subheader("Variable Statistics")

                var1_is_numeric = get_column_type(df[var1]) == 'numeric'

                if var2 is None:
                    # Single variable stats
                    heading = "Statistics" if var1_is_numeric else "Value Counts"
                    st.write(f"**{var1} {heading}:**")
                    st.write(_column_summary(df, var1))
                else:
                    # Two variable stats, side by side
                    col1, col2 = st.columns(2)

                    with col1:
                        st.write(f"**{var1} Statistics:**")
                        st.write(_column_summary(df, var1))

                    with col2:
                        st.write(f"**{var2} Statistics:**")
                        st.write(_column_summary(df, var2))

                    # Correlation if both are numeric
                    if var1_is_numeric and get_column_type(df[var2]) == 'numeric':
                        correlation = df[var1].corr(df[var2])
                        st.metric("Correlation", f"{correlation:.3f}")

            except Exception as e:
                # UI boundary: show the problem in the page instead of
                # letting the script run abort.
                st.error(f"Error creating visualization: {str(e)}")

        # Data info
        with st.expander("Dataset Information"):
            st.write("**Dataset Shape:**", df.shape)
            st.write("**Column Information:**")

            info_df = pd.DataFrame({
                'Column': df.columns,
                'Data Type': [str(dtype) for dtype in df.dtypes],
                'Non-Null Count': df.count().tolist(),
                'Unique Values': df.nunique().tolist(),
            })
            st.dataframe(info_df, use_container_width=True)

            # Missing values
            missing_data = df.isnull().sum()
            if missing_data.sum() > 0:
                st.write("**Missing Values:**")
                st.write(missing_data[missing_data > 0])
            else:
                st.write("**No missing values found**")
Loading code editor...
Click Save & Run to preview your app
Terminal