Ploomber AI Editor | data-visualizer-1cc1

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configure the page: title "Data Visualizer" and the "wide" layout.
st.set_page_config(page_title="Data Visualizer", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Read the CSV at ``url`` into a DataFrame.

    Only successful reads are cached: this helper lets exceptions
    propagate, and a call that raises stores nothing in the cache.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load a CSV from ``url``, caching successful loads.

    Args:
        url: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        A ``(df, error)`` tuple. On success ``df`` is the loaded DataFrame
        and ``error`` is ``None``. On failure ``df`` is ``None`` and
        ``error`` is a non-empty message describing the problem.
    """
    try:
        # The try/except lives outside the cached helper on purpose: if the
        # error tuple were produced inside the cached function, a transient
        # failure would be cached and replayed for the same URL.
        return _read_csv_cached(url), None
    except Exception as e:
        # Some exceptions have an empty message; fall back to the type name
        # so callers that test the truthiness of `error` still see a failure.
        return None, str(e) or type(e).__name__

def shuffle_dataframe(df):
    """Return a copy of ``df`` with its rows in random order.

    The result gets a fresh 0..n-1 index; the input frame is left untouched.
    """
    # Sampling the full fraction without replacement is a row permutation.
    permuted = df.sample(frac=1)
    # Drop the old index labels so they do not survive as a column.
    return permuted.reset_index(drop=True)

def create_visualization(df, var1, var2=None):
    """Build a Plotly figure suited to the dtypes of the chosen columns.

    Args:
        df: DataFrame holding the data.
        var1: Name of the first column.
        var2: Optional name of a second column; ``None`` requests a
            single-variable chart.

    Returns:
        A Plotly figure:
          * one numeric column      -> histogram
          * one non-numeric column  -> bar chart of value counts
          * numeric + numeric       -> scatter plot
          * numeric + non-numeric   -> box plot (category on the x axis)
          * both non-numeric        -> bar chart of the cross-tabulation
    """
    var1_numeric = pd.api.types.is_numeric_dtype(df[var1])

    if var2 is None:
        # Single variable visualization
        if var1_numeric:
            return px.histogram(df, x=var1, title=f"Distribution of {var1}")

        value_counts = df[var1].value_counts()
        fig = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f"Count of {var1}",
        )
        # The figure methods are the plural `update_xaxes` / `update_yaxes`;
        # the singular spellings do not exist and raise AttributeError.
        fig.update_xaxes(title_text=var1)
        fig.update_yaxes(title_text="Count")
        return fig

    # Two variable visualization
    var2_numeric = pd.api.types.is_numeric_dtype(df[var2])

    if var1_numeric and var2_numeric:
        return px.scatter(df, x=var1, y=var2, title=f"{var1} vs {var2}")
    if var1_numeric:
        # Numeric vs categorical: the categorical column goes on the x axis.
        return px.box(df, x=var2, y=var1, title=f"{var1} by {var2}")
    if var2_numeric:
        # Categorical vs numeric
        return px.box(df, x=var1, y=var2, title=f"{var2} by {var1}")

    # Both categorical: one bar segment per (var1, var2) combination.
    crosstab = pd.crosstab(df[var1], df[var2])
    return px.bar(crosstab, title=f"{var1} vs {var2}")

# Main app: page heading followed by the CSV URL field
st.title("📊 Data Visualizer")
st.write("Load CSV data from a URL and create visualizations")

# The field starts out pre-filled with this URL.
default_csv_url = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
url = st.text_input(
    label="Enter CSV URL:",
    value=default_csv_url,
    help="Enter the URL of a CSV file to load and analyze",
)

def _summarize_column(series):
    """Return ``describe()`` for a numeric series, ``value_counts()`` otherwise."""
    if pd.api.types.is_numeric_dtype(series):
        return series.describe()
    return series.value_counts()


def _viz_type_label(first_is_numeric, second_is_numeric=None):
    """Return the chart-type label shown to the user.

    ``second_is_numeric`` is ``None`` when no second variable is selected.
    """
    if second_is_numeric is None:
        if first_is_numeric:
            return "Histogram (single numeric variable)"
        return "Bar chart (single categorical variable)"
    if first_is_numeric and second_is_numeric:
        return "Scatter plot (numeric vs numeric)"
    if first_is_numeric or second_is_numeric:
        return "Box plot (numeric vs categorical)"
    return "Stacked bar chart (categorical vs categorical)"


# Treat whitespace-only input like an empty field, and do not pass stray
# surrounding spaces on to the CSV reader.
url = url.strip()

if url:
    # Load data
    df, error = load_data(url)

    # Branch on the frame rather than on the truthiness of `error`: an
    # exception with an empty message would otherwise be taken for success
    # and the code below would run with df=None.
    if df is None:
        st.error(f"Error loading data: {error}")
    else:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        total_rows = len(df)

        # Controls
        rows_col, shuffle_col = st.columns(2)

        with rows_col:
            if total_rows > 1:
                max_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=total_rows,
                    value=min(100, total_rows),
                    help="Select how many rows to display in the dataframe"
                )
            else:
                # With 0 rows min_value (1) would exceed max_value (0), and
                # with 1 row the range is degenerate, so there is nothing to
                # choose: skip the slider and show whatever rows exist.
                max_rows = total_rows

        with shuffle_col:
            shuffle = st.checkbox(
                "Randomly shuffle data",
                help="Shuffle the dataframe rows randomly"
            )

        # Process data based on controls. Neither the shuffle helper nor
        # head() modifies `df`, so no defensive copy is needed.
        display_df = shuffle_dataframe(df) if shuffle else df
        display_df = display_df.head(max_rows)

        # Display dataframe
        st.subheader("📋 Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info
        st.subheader("ℹ️ Data Information")
        rows_metric, cols_metric, missing_metric = st.columns(3)
        with rows_metric:
            st.metric("Total Rows", total_rows)
        with cols_metric:
            st.metric("Total Columns", len(df.columns))
        with missing_metric:
            # Convert the NumPy integer to a plain Python int for display.
            st.metric("Missing Values", int(df.isnull().sum().sum()))

        # Variable selection for visualization
        st.subheader("📈 Create Visualization")

        all_cols = df.columns.tolist()

        first_col, second_col = st.columns(2)

        with first_col:
            var1 = st.selectbox(
                "Select first variable:",
                options=all_cols,
                help="Choose the first variable for visualization"
            )

        with second_col:
            var2 = st.selectbox(
                "Select second variable (optional):",
                options=["None"] + all_cols,
                help="Choose a second variable for bivariate visualization"
            )

        # Create visualization (var1 is None when there are no columns)
        if var1 is not None:
            selected_var2 = None if var2 == "None" else var2

            # Show visualization type
            var1_numeric = pd.api.types.is_numeric_dtype(df[var1])
            var2_numeric = (
                None
                if selected_var2 is None
                else pd.api.types.is_numeric_dtype(df[selected_var2])
            )
            viz_type = _viz_type_label(var1_numeric, var2_numeric)
            st.info(f"Visualization type: {viz_type}")

            # Create and display the plot. The broad except is deliberate:
            # this is the UI boundary, and a plotting failure should be
            # shown as an error message instead of crashing the page.
            try:
                fig = create_visualization(df, var1, selected_var2)
                st.plotly_chart(fig, use_container_width=True)

                # Show basic statistics
                st.subheader("📊 Variable Statistics")
                if selected_var2 is None:
                    if var1_numeric:
                        st.write(f"**{var1} Statistics:**")
                    else:
                        st.write(f"**{var1} Value Counts:**")
                    st.write(_summarize_column(df[var1]))
                else:
                    stats_left, stats_right = st.columns(2)
                    with stats_left:
                        st.write(f"**{var1} Statistics:**")
                        st.write(_summarize_column(df[var1]))

                    with stats_right:
                        st.write(f"**{selected_var2} Statistics:**")
                        st.write(_summarize_column(df[selected_var2]))

            except Exception as e:
                st.error(f"Error creating visualization: {str(e)}")

        # Data types information
        with st.expander("🔍 Column Information"):
            col_info = pd.DataFrame({
                'Column': df.columns,
                # Stringify the dtypes so the table holds plain text rather
                # than dtype objects, which need not be serializable for
                # display.
                'Data Type': df.dtypes.astype(str),
                'Non-Null Count': df.count(),
                'Null Count': df.isnull().sum()
            })
            st.dataframe(col_info, use_container_width=True)

else:
    st.info("Please enter a CSV URL to get started!")
Loading code editor...
Click Save & Run to preview your app
Terminal