Ploomber AI Editor | data-visualization-app-11a5

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Page-level settings: browser tab title and a wide (full-width) layout.
st.set_page_config(page_title="Data Visualization App", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Download and parse the CSV at ``url``, caching only successful reads.

    Exceptions are deliberately allowed to propagate: a call that raises
    stores nothing in the cache, so a transient failure is retried on the
    next run instead of being replayed from the cache.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, reporting failures as a message.

    Args:
        url: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A ``(df, error)`` tuple. On success ``df`` is the loaded DataFrame
        and ``error`` is ``None``. On failure ``df`` is ``None`` and
        ``error`` is a non-empty description of what went wrong.
    """
    try:
        return _read_csv_cached(url), None
    except Exception as e:
        # UI boundary: any download/parse problem is shown to the user rather
        # than crashing the app. Fall back to the exception type name so the
        # message is never an empty (falsy) string.
        return None, str(e) or type(e).__name__

def determine_column_types(df, columns):
    """Classify each requested column as 'numeric' or 'categorical'.

    Args:
        df: DataFrame containing the columns.
        columns: Iterable of column names to classify.

    Returns:
        Dict mapping each column name to ``'numeric'`` when pandas reports a
        numeric dtype for it, otherwise ``'categorical'``.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    return {
        name: 'numeric' if is_numeric(df[name]) else 'categorical'
        for name in columns
    }

def create_visualization(df, selected_columns):
    """Build a Plotly figure suited to the selected columns and their types.

    Chart selection:
        * one numeric column -> histogram
        * one categorical column -> bar chart of value counts
        * two numeric columns -> scatter plot
        * one categorical + one numeric column -> box plot, with the
          categorical column on the x-axis regardless of selection order
        * two categorical columns -> grouped bar chart of cross-tab counts

    Args:
        df: DataFrame holding the data to plot.
        selected_columns: Sequence of column names present in ``df``.

    Returns:
        A Plotly figure, or ``None`` when zero or more than two columns are
        selected.
    """
    if len(selected_columns) == 0:
        return None

    column_types = determine_column_types(df, selected_columns)

    if len(selected_columns) == 1:
        col = selected_columns[0]

        if column_types[col] == 'numeric':
            # Histogram for single numeric variable
            fig = px.histogram(df, x=col, title=f'Distribution of {col}')
            fig.update_layout(showlegend=False)
            return fig

        # Bar chart for single categorical variable
        value_counts = df[col].value_counts()
        fig = px.bar(x=value_counts.index, y=value_counts.values,
                     title=f'Count of {col}')
        fig.update_xaxes(title=col)
        fig.update_yaxes(title='Count')
        return fig

    if len(selected_columns) == 2:
        col1, col2 = selected_columns
        type1, type2 = column_types[col1], column_types[col2]

        if type1 == 'numeric' and type2 == 'numeric':
            # Scatter plot for two numeric variables
            return px.scatter(df, x=col1, y=col2, title=f'{col1} vs {col2}')

        if type1 == 'categorical' and type2 == 'categorical':
            # Grouped bar chart for two categorical variables. px.bar stacks
            # bars by default, so side-by-side bars must be requested.
            crosstab = pd.crosstab(df[col1], df[col2])
            fig = px.bar(crosstab, barmode='group', title=f'{col1} vs {col2}')
            fig.update_xaxes(title=col1)
            fig.update_yaxes(title='Count')
            return fig

        # Mixed types: box plot with the categorical column on the x-axis,
        # whichever order the two columns were selected in.
        if type1 == 'categorical':
            category, measure = col1, col2
        else:
            category, measure = col2, col1
        return px.box(df, x=category, y=measure,
                      title=f'{measure} by {category}')

    return None

# Main app: load a CSV from a URL, preview it, and plot one or two columns.
st.title("📊 Data Visualization App")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input (pre-filled with a sample dataset)
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to load and visualize"
)

if url:
    # Load data
    df, error = load_data(url)

    if error:
        st.error(f"Error loading data: {error}")
    elif df is not None:
        st.success(f"✅ Data loaded successfully! Shape: {df.shape}")

        # Controls
        rows_container, shuffle_container = st.columns(2)

        with rows_container:
            # Number of rows to display
            max_rows = len(df)
            if max_rows > 1:
                num_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=max_rows,
                    value=min(100, max_rows),
                    help="Select how many rows to display in the table"
                )
            else:
                # With fewer than two rows a 1..max_rows range is empty or
                # degenerate, so skip the slider and show whatever exists.
                num_rows = max_rows

        with shuffle_container:
            # Shuffle checkbox
            shuffle_data = st.checkbox(
                "Randomly shuffle data",
                help="Check to randomly shuffle the DataFrame"
            )

        # Process data based on controls. sample() and head() both return new
        # objects, so the loaded DataFrame is never modified here.
        display_df = df
        if shuffle_data:
            display_df = display_df.sample(frac=1).reset_index(drop=True)
        display_df = display_df.head(num_rows)

        # Display DataFrame
        st.subheader("📋 Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info (computed on the full dataset, not just the preview)
        with st.expander("ℹ️ Data Information"):
            dtypes_container, missing_container = st.columns(2)
            with dtypes_container:
                st.write("**Data Types:**")
                st.write(df.dtypes)
            with missing_container:
                st.write("**Missing Values:**")
                st.write(df.isnull().sum())

        # Variable selection for visualization
        st.subheader("📈 Create Visualization")

        selected_columns = st.multiselect(
            "Select 1 or 2 variables to visualize:",
            options=df.columns.tolist(),
            max_selections=2,
            help="Choose one variable for distribution or two variables for relationship analysis"
        )

        if selected_columns:
            # Classify once and reuse below, so the summary and explanation
            # use the same classifier that create_visualization calls.
            column_types = determine_column_types(df, selected_columns)

            # Show column information
            st.write("**Selected Variables:**")
            for name in selected_columns:
                type_label = column_types[name].capitalize()
                unique_vals = df[name].nunique()
                st.write(f"- **{name}**: {type_label} ({unique_vals} unique values)")

            # Create and display visualization
            fig = create_visualization(df, selected_columns)

            # Explicit None check: do not rely on the figure's truthiness.
            if fig is not None:
                st.plotly_chart(fig, use_container_width=True)

                # Show visualization explanation
                with st.expander("📊 Visualization Explanation"):
                    if len(selected_columns) == 1:
                        only = selected_columns[0]
                        if column_types[only] == 'numeric':
                            st.write(f"**Histogram**: Shows the distribution of values for the numeric variable '{only}'. The x-axis represents the values, and the y-axis shows the frequency/count.")
                        else:
                            st.write(f"**Bar Chart**: Shows the count of each category in the categorical variable '{only}'. Each bar represents a different category.")
                    else:
                        first, second = selected_columns
                        first_type = column_types[first]
                        second_type = column_types[second]

                        if first_type == 'numeric' and second_type == 'numeric':
                            st.write(f"**Scatter Plot**: Shows the relationship between two numeric variables '{first}' and '{second}'. Each point represents one observation.")
                        elif first_type != second_type:
                            # Exactly one of the two is categorical
                            st.write("**Box Plot**: Shows the distribution of the numeric variable across different categories. The box shows the quartiles, and whiskers show the range.")
                        else:
                            st.write(f"**Grouped Bar Chart**: Shows the relationship between two categorical variables '{first}' and '{second}' by displaying counts for each combination.")
            else:
                st.warning("Could not create visualization for the selected variables.")
        else:
            st.info("👆 Select one or two variables above to create a visualization")

    else:
        st.warning("No data loaded. Please check your URL.")
else:
    st.info("👆 Enter a CSV URL above to get started")
Loading code editor...
Click Save & Run to preview your app
Terminal