Ploomber AI Editor | csv-data-visualizer-e82b

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt

# Page-level settings: title, icon, and the wide layout
st.set_page_config(page_title="CSV Data Visualizer", page_icon="📊", layout="wide")

@st.cache_data
def load_data(url):
    """Read the CSV at ``url`` and return a ``(dataframe, error)`` pair.

    On success the pair is ``(DataFrame, None)``. If reading raises any
    exception, the pair is ``(None, message)`` where ``message`` is the
    exception converted to a string.
    """
    try:
        frame = pd.read_csv(url)
    except Exception as exc:
        # Report the failure as a value so the caller can display it.
        return None, str(exc)
    else:
        return frame, None

def get_column_type(series):
    """Classify a column as ``"numeric"`` or ``"categorical"``.

    Boolean columns are classified as categorical. pandas treats ``bool`` as
    a numeric dtype, but ``Series.describe()`` on a boolean column reports
    count/unique/top/freq instead of mean/std, so code that expects numeric
    summary statistics would fail on it.

    Args:
        series: The pandas Series (column) to classify.

    Returns:
        ``"numeric"`` for non-boolean numeric dtypes, ``"categorical"`` for
        everything else (strings, objects, booleans, datetimes, ...).
    """
    is_numeric = pd.api.types.is_numeric_dtype(series)
    is_boolean = pd.api.types.is_bool_dtype(series)
    if is_numeric and not is_boolean:
        return "numeric"
    return "categorical"

def create_single_variable_plot(df, column):
    """Build a one-variable Plotly figure for ``column`` of ``df``.

    A column classified as numeric gets a histogram; any other column gets
    a bar chart of its value counts.
    """
    if get_column_type(df[column]) != "numeric":
        # Categorical: tally the values and draw one bar per value
        counts = df[column].value_counts()
        bar_fig = px.bar(
            x=counts.index,
            y=counts.values,
            title=f"Count of {column}",
        )
        bar_fig.update_xaxes(title=column)
        bar_fig.update_yaxes(title="Count")
        return bar_fig

    # Numeric: distribution histogram without a legend
    hist_fig = px.histogram(df, x=column, title=f"Distribution of {column}")
    hist_fig.update_layout(showlegend=False)
    return hist_fig

def create_two_variable_plot(df, col1, col2):
    """Build a two-variable Plotly figure chosen by the column types.

    - numeric + numeric: scatter plot (``col1`` on x, ``col2`` on y)
    - one categorical + one numeric: box plot with the categorical column
      on the x axis
    - categorical + categorical: bar chart of the cross-tabulated counts
    """
    kinds = (get_column_type(df[col1]), get_column_type(df[col2]))

    if kinds == ("numeric", "numeric"):
        return px.scatter(df, x=col1, y=col2, title=f"{col1} vs {col2}")

    if kinds == ("categorical", "numeric"):
        return px.box(df, x=col1, y=col2, title=f"{col2} by {col1}")

    if kinds == ("numeric", "categorical"):
        # Swap the axes so the categorical column is still on x
        return px.box(df, x=col2, y=col1, title=f"{col1} by {col2}")

    # Both categorical: count co-occurrences and plot them as bars
    counts = pd.crosstab(df[col1], df[col2])
    bar_fig = px.bar(counts, title=f"{col1} vs {col2}")
    bar_fig.update_xaxes(title=col1)
    bar_fig.update_yaxes(title="Count")
    return bar_fig

# Main app: heading and short description
st.title("📊 CSV Data Visualizer")
st.markdown("Load CSV data from a URL and create interactive visualizations")

# Data source: URL text box, pre-filled with a sample dataset
st.subheader("Data Source")
url = st.text_input(
    label="Enter CSV URL:",
    value="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
    help="Enter the URL of a CSV file to analyze",
)

# Drop stray whitespace (e.g. from copy/paste) so a blank-looking box counts
# as "no URL" and a padded URL is not passed to the loader as-is.
url = url.strip()

if url:
    # Load data
    with st.spinner("Loading data..."):
        df, error = load_data(url)

    if df is None:
        # Key on the missing DataFrame rather than the truthiness of the
        # message, so a failure with an empty message is still reported.
        st.error(f"Error loading data: {error}")
    else:
        st.success(f"Data loaded successfully! Shape: {df.shape}")

        # Data controls
        st.subheader("Data Controls")
        col1, col2 = st.columns(2)

        with col1:
            max_rows = len(df)
            if max_rows > 1:
                num_rows = st.slider(
                    "Number of rows to display:",
                    min_value=1,
                    max_value=max_rows,
                    value=min(100, max_rows),
                    help=f"Total rows available: {max_rows}"
                )
            else:
                # With 0 or 1 rows the range 1..max_rows is empty or a single
                # point, so there is nothing to choose; skip the slider.
                num_rows = max_rows
                st.write(f"Number of rows to display: {num_rows}")

        with col2:
            shuffle_data = st.checkbox(
                "Randomly shuffle data",
                value=False,
                help="Shuffle the DataFrame before displaying"
            )

        # Process data based on controls. No defensive copy is needed:
        # sample() and head() return new objects and df is never mutated.
        display_df = df
        if shuffle_data:
            display_df = display_df.sample(frac=1).reset_index(drop=True)
        display_df = display_df.head(num_rows)

        # Display DataFrame
        st.subheader("Data Preview")
        st.dataframe(display_df, use_container_width=True)

        # Data info (computed on the full dataset, not just the preview)
        with st.expander("Dataset Information"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("**Column Information:**")
                info_df = pd.DataFrame({
                    'Column': df.columns,
                    'Type': [get_column_type(df[col]) for col in df.columns],
                    'Non-null Count': [df[col].count() for col in df.columns],
                    'Null Count': [df[col].isnull().sum() for col in df.columns]
                })
                st.dataframe(info_df, use_container_width=True)

            with col2:
                st.write("**Basic Statistics:**")
                st.dataframe(df.describe(), use_container_width=True)

        # Visualization section
        st.subheader("Data Visualization")

        # Variable selection
        st.write("Select variables to visualize:")
        available_columns = list(df.columns)

        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("First variable:", available_columns, index=0)
        with col2:
            var2 = st.selectbox("Second variable (optional):", [None] + available_columns, index=0)

        # Create and display plot. The plot uses the rows shown in the
        # preview (display_df); the statistics below it use the full df.
        if var1 is not None:
            try:
                if var2 is None:
                    # Single variable plot
                    fig = create_single_variable_plot(display_df, var1)
                    st.plotly_chart(fig, use_container_width=True)

                    # Show some statistics
                    st.write(f"**Statistics for {var1}:**")
                    if get_column_type(df[var1]) == "numeric":
                        stats = df[var1].describe()
                        col1, col2, col3, col4 = st.columns(4)
                        with col1:
                            st.metric("Mean", f"{stats['mean']:.2f}")
                        with col2:
                            st.metric("Median", f"{stats['50%']:.2f}")
                        with col3:
                            st.metric("Std Dev", f"{stats['std']:.2f}")
                        with col4:
                            st.metric("Range", f"{stats['max'] - stats['min']:.2f}")
                    else:
                        value_counts = df[var1].value_counts()
                        st.write(f"Unique values: {len(value_counts)}")
                        st.write("Top categories:")
                        st.dataframe(value_counts.head(), use_container_width=True)

                else:
                    # Two variable plot
                    fig = create_two_variable_plot(display_df, var1, var2)
                    st.plotly_chart(fig, use_container_width=True)

                    # Show correlation if both are numeric
                    if get_column_type(df[var1]) == "numeric" and get_column_type(df[var2]) == "numeric":
                        correlation = df[var1].corr(df[var2])

                        if pd.isna(correlation):
                            # NaN would otherwise fail both threshold checks
                            # and be mislabelled as a weak correlation.
                            st.metric("Correlation", "N/A")
                            st.info("Correlation is undefined (a column is constant or has too few values).")
                        else:
                            st.metric("Correlation", f"{correlation:.3f}")

                            if abs(correlation) > 0.7:
                                st.info("Strong correlation detected!")
                            elif abs(correlation) > 0.3:
                                st.info("Moderate correlation detected.")
                            else:
                                st.info("Weak correlation.")

            except Exception as e:
                # UI boundary: show the failure instead of crashing the app
                st.error(f"Error creating visualization: {str(e)}")

        # Additional insights
        with st.expander("Data Quality Insights"):
            st.write("**Missing Values:**")
            missing_data = df.isnull().sum()
            missing_data = missing_data[missing_data > 0]
            if len(missing_data) > 0:
                st.dataframe(missing_data.to_frame("Missing Count"), use_container_width=True)
            else:
                st.success("No missing values found!")

            st.write("**Duplicate Rows:**")
            duplicate_count = df.duplicated().sum()
            st.write(f"Number of duplicate rows: {duplicate_count}")

            if duplicate_count > 0:
                st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
            else:
                st.success("No duplicate rows found!")

else:
    st.info("Please enter a CSV URL to get started.")
Loading code editor...
Click Save & Run to preview your app
Terminal