Ploomber AI Editor | csv-visualizer-cdbd

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

st.set_page_config(page_title="CSV Data Visualizer", layout="wide")

@st.cache_data
def _read_csv_cached(url):
    """Download and parse the CSV at ``url``.

    Wrapped in ``st.cache_data`` so repeated Streamlit reruns with the same
    URL reuse the parsed frame. Exceptions are deliberately NOT handled here:
    they propagate to the caller, so a failed download is never stored as a
    cached result.
    """
    return pd.read_csv(url)


def load_data(url):
    """Load CSV data from a URL, caching successful loads only.

    Args:
        url: Location handed to ``pandas.read_csv``.

    Returns:
        ``(df, None)`` when the CSV was read, otherwise ``(None, message)``
        where ``message`` is ``str()`` of the exception that was raised.
    """
    try:
        return _read_csv_cached(url), None
    except Exception as e:
        # UI boundary: any failure (network, parsing, bad URL) is reported to
        # the caller as text. Because this handler sits outside the cached
        # function, retrying the same URL performs a fresh read.
        return None, str(e)


# Keep the cache-clearing hook reachable under the public name.
load_data.clear = _read_csv_cached.clear

def get_column_type(series):
    """Classify a column as ``'numeric'`` or ``'categorical'``.

    The decision is delegated to ``pandas.api.types.is_numeric_dtype``;
    every column that check rejects is labelled ``'categorical'``.
    """
    is_numeric = pd.api.types.is_numeric_dtype(series)
    return 'numeric' if is_numeric else 'categorical'

def create_visualization(df, var1, var2=None):
    """Create a Plotly figure suited to the types of the selected columns.

    Chart selection (column types come from ``get_column_type``):
      * one numeric column          -> histogram
      * one categorical column      -> bar chart of value counts
      * numeric vs numeric          -> scatter plot
      * categorical vs categorical  -> heatmap of the cross-tabulation
      * numeric vs categorical      -> box plot, numeric column on the y axis

    Args:
        df: DataFrame holding the data.
        var1: Name of the first column.
        var2: Optional name of a second column; ``None`` requests a
            single-variable chart.

    Returns:
        The Plotly figure produced by the matching ``plotly.express`` call.
    """
    if var2 is None:
        # Single variable visualization
        if get_column_type(df[var1]) == 'numeric':
            return px.histogram(df, x=var1, title=f'Distribution of {var1}')

        value_counts = df[var1].value_counts()
        fig = px.bar(x=value_counts.index, y=value_counts.values,
                     title=f'Count of {var1}')
        # The Figure methods are update_xaxes / update_yaxes (plural). The
        # singular spellings used previously are not Figure attributes, so
        # this branch raised AttributeError for every categorical column.
        fig.update_xaxes(title_text=str(var1))
        fig.update_yaxes(title_text='Count')
        return fig

    # Two variable visualization
    var1_type = get_column_type(df[var1])
    var2_type = get_column_type(df[var2])

    if var1_type == 'numeric' and var2_type == 'numeric':
        return px.scatter(df, x=var1, y=var2, title=f'{var1} vs {var2}')

    if var1_type == 'categorical' and var2_type == 'categorical':
        # Rows of the crosstab are var1 categories, columns are var2 categories.
        crosstab = pd.crosstab(df[var1], df[var2])
        return px.imshow(crosstab, title=f'{var1} vs {var2}',
                         labels=dict(x=var2, y=var1, color="Count"))

    # Mixed types: the categorical column groups the boxes along x.
    if var1_type == 'numeric':
        return px.box(df, x=var2, y=var1, title=f'{var1} by {var2}')
    return px.box(df, x=var1, y=var2, title=f'{var2} by {var1}')

# Main app: page header, URL input, then (once data is loaded) a preview
# table, dataset metrics, a chart of one or two chosen columns, summary
# statistics for those columns, and a per-column dtype/null overview.
st.title("CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# URL input, pre-filled with a sample dataset URL
url = st.text_input(
    "Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)

if url:
    # Load data; load_data returns (df, None) on success or (None, message)
    df, error = load_data(url)

    # Compare against None rather than truthiness: an exception whose message
    # is the empty string must still be reported instead of leaving the page
    # blank.
    if error is not None:
        st.error(f"Error loading data: {error}")
    elif df is not None:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        # Controls
        col1, col2 = st.columns(2)

        with col1:
            # Number of rows control
            max_rows = len(df)
            if max_rows > 1:
                num_rows = st.slider("Number of rows to display:",
                                     min_value=1, max_value=max_rows,
                                     value=min(100, max_rows))
            else:
                # With zero or one row there is nothing to choose. For an
                # empty frame the slider default min(100, 0) == 0 would also
                # fall below min_value=1, so the widget is skipped entirely.
                num_rows = max_rows

        with col2:
            # Shuffle checkbox
            shuffle = st.checkbox("Randomly shuffle DataFrame")

        # Apply controls on a copy so the loaded frame itself is not modified
        display_df = df.copy()
        if shuffle:
            display_df = display_df.sample(frac=1).reset_index(drop=True)
        display_df = display_df.head(num_rows)

        # Display DataFrame
        st.subheader("Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info (computed on the full frame, not the preview)
        st.subheader("Dataset Information")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(df))
        with col2:
            st.metric("Total Columns", len(df.columns))
        with col3:
            # Cast the numpy integer to a plain int before handing it over
            st.metric("Missing Values", int(df.isnull().sum().sum()))

        # Variable selection for visualization
        st.subheader("Data Visualization")

        # Get column names
        columns = df.columns.tolist()

        col1, col2 = st.columns(2)

        with col1:
            var1 = st.selectbox("Select first variable:", options=columns)

        with col2:
            var2 = st.selectbox("Select second variable (optional):",
                                options=["None"] + columns, index=0)

        # The literal "None" entry means "no second variable"
        if var2 == "None":
            var2 = None

        # Create and display visualization. Test identity with None so that a
        # falsy column label (e.g. 0 or "") is still plotted.
        if var1 is not None:
            try:
                fig = create_visualization(df, var1, var2)
                st.plotly_chart(fig, use_container_width=True)

                # Display summary statistics
                st.subheader("Summary Statistics")
                selected_cols = [var1] if var2 is None else [var1, var2]

                # Show statistics for selected variables
                for col in selected_cols:
                    if get_column_type(df[col]) == 'numeric':
                        st.write(f"**{col} (Numeric)**")
                        st.write(df[col].describe())
                    else:
                        # Only the ten most frequent values are listed
                        st.write(f"**{col} (Categorical)**")
                        st.write(df[col].value_counts().head(10))
                    st.write("---")

            except Exception as e:
                # UI boundary: show the failure in the page
                st.error(f"Error creating visualization: {str(e)}")

        # Data types info
        with st.expander("Column Data Types"):
            col_info = pd.DataFrame({
                'Column': df.columns,
                'Data Type': df.dtypes.astype(str),
                'Non-Null Count': df.count(),
                'Null Count': df.isnull().sum()
            })
            st.dataframe(col_info, use_container_width=True)
Loading code editor...
Click Save & Run to preview your app
Terminal