Ploomber AI Editor | penguin-data-visualizer-baa1

Drop files here
or click to upload
Code Editor for app.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Page heading
st.title("CSV Data Visualizer")

# Pre-filled value for the URL box (the seaborn penguins sample dataset)
DEFAULT_CSV_URL = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)

# Ask the user which CSV to load
url = st.text_input("Enter CSV URL:", value=DEFAULT_CSV_URL)

# Cached fetch. Only successful reads are memoized: when the wrapped function
# raises, st.cache_data stores nothing, so a failed URL is retried on the
# next rerun instead of being stuck on a cached failure.
@st.cache_data
def _read_csv_cached(url):
    """Download and parse the CSV at ``url`` into a DataFrame."""
    return pd.read_csv(url)


def load_data(url):
    """Load the CSV at ``url``, reporting any failure in the UI.

    Args:
        url: Location passed straight to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` if reading failed (in which case
        an error message has been shown with ``st.error``).
    """
    try:
        return _read_csv_cached(url)
    except Exception as e:
        # UI boundary: any read/parse problem is shown to the user rather
        # than crashing the app. Handled outside the cached function so the
        # failure itself is never cached.
        st.error(f"Error loading data: {e}")
        return None


# Keep the cache-clearing hook that a directly decorated function exposes.
load_data.clear = _read_csv_cached.clear

def _render_figure(fig):
    """Show ``fig`` in the app, then close it.

    Closing matters because the script reruns on every widget interaction;
    figures created with ``plt.subplots`` stay registered with pyplot until
    they are explicitly closed.
    """
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)


def _plot_single(data, var, numeric_cols):
    """Plot one column: histogram + KDE if numeric, otherwise a count plot.

    Also writes summary statistics (numeric) or value counts (categorical)
    below the figure.
    """
    st.subheader(f"Visualization of {var}")
    fig, ax = plt.subplots(figsize=(10, 6))

    if var in numeric_cols:
        sns.histplot(data=data, x=var, kde=True, ax=ax)
        _render_figure(fig)

        st.write(f"**Statistics for {var}:**")
        st.write(data[var].describe())
    else:
        sns.countplot(data=data, y=var, ax=ax)
        _render_figure(fig)

        st.write(f"**Value counts for {var}:**")
        st.write(data[var].value_counts())


def _plot_pair(data, var1, var2, numeric_cols):
    """Plot two columns, choosing the chart from their kinds.

    numeric/numeric -> scatter with regression line (and the correlation),
    numeric/categorical -> box plot, categorical/categorical -> count heatmap.
    """
    st.subheader(f"Visualization of {var1} vs {var2}")
    fig, ax = plt.subplots(figsize=(10, 6))

    first_numeric = var1 in numeric_cols
    second_numeric = var2 in numeric_cols

    if first_numeric and second_numeric:
        sns.scatterplot(data=data, x=var1, y=var2, ax=ax)
        # scatter=False: draw only the fitted line over the existing points
        sns.regplot(data=data, x=var1, y=var2, scatter=False, ax=ax)

        correlation = data[[var1, var2]].corr().iloc[0, 1]
        st.write(f"Correlation between {var1} and {var2}: {correlation:.4f}")
    elif first_numeric != second_numeric:
        # Exactly one of the two is numeric: put it on the y axis
        num_var, cat_var = (var1, var2) if first_numeric else (var2, var1)
        sns.boxplot(data=data, x=cat_var, y=num_var, ax=ax)
        # Rotate via the axes object rather than pyplot's global state
        ax.tick_params(axis="x", rotation=45)
    else:
        cross_tab = pd.crosstab(data[var1], data[var2])
        sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu", ax=ax)

    _render_figure(fig)


# Load data
if url:
    data = load_data(url)

    if data is None:
        st.warning("Please enter a valid CSV URL")
    elif data.empty:
        # Zero rows or zero columns: there is nothing to preview or plot, and
        # the row slider / column selectors below would have no valid options.
        st.warning("The CSV was loaded but contains no data to display")
    else:
        total_rows, total_cols = data.shape
        st.write(f"Dataset has {total_rows} rows and {total_cols} columns")

        # Controls for data display
        col1, col2 = st.columns(2)

        with col1:
            preview_cap = min(100, total_rows)
            if preview_cap > 1:
                num_rows = st.slider(
                    "Number of rows to display",
                    min_value=1,
                    max_value=preview_cap,
                    value=min(10, total_rows),
                )
            else:
                # A single-row dataset leaves nothing to choose, and a slider
                # whose minimum equals its maximum is degenerate.
                num_rows = 1

        with col2:
            shuffle = st.checkbox("Randomly shuffle data")

        # sample() already returns a new frame, so no defensive copy is needed
        display_data = data
        if shuffle:
            # Fixed seed keeps the shuffled order stable across reruns
            display_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

        st.subheader("Data Preview")
        st.dataframe(display_data.head(num_rows))

        st.subheader("Data Visualization")

        # Split columns into numeric and everything else. "number" covers all
        # numeric widths (not only int64/float64); every remaining column
        # (strings, categories, booleans, ...) is treated as categorical so
        # no column is silently left out of the selectors.
        numeric_cols = data.select_dtypes(include="number").columns.tolist()
        categorical_cols = data.select_dtypes(exclude="number").columns.tolist()
        all_cols = numeric_cols + categorical_cols
        numeric_set = set(numeric_cols)

        # Select variables for visualization
        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("Select first variable", all_cols)
        with col2:
            remaining_cols = [col for col in all_cols if col != var1]
            var2 = st.selectbox(
                "Select second variable (optional)",
                options=["None"] + remaining_cols,
                index=0,
            )

        if var2 == "None":
            _plot_single(data, var1, numeric_set)
        else:
            _plot_pair(data, var1, var2, numeric_set)
else:
    st.info("Enter a CSV URL to get started")
Loading code editor...
Terminal