Ploomber AI Editor | s3-data-viz-6c9c

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import boto3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import altair as alt
import numpy as np
from io import StringIO, BytesIO
import json

# Page-level settings: browser-tab title and a wide layout.
# This is the first Streamlit call made by the script.
st.set_page_config(
    layout="wide",
    page_title="S3 Data Visualization Tool",
)

def create_s3_client(access_key, secret_key, region):
    """Build a boto3 S3 client from explicitly supplied credentials.

    Args:
        access_key: AWS access key ID.
        secret_key: AWS secret access key.
        region: AWS region name passed to boto3 as ``region_name``.

    Returns:
        The boto3 S3 client, or ``None`` if constructing it raised. In that
        case the error is shown in the Streamlit UI via ``st.error``.
    """
    # NOTE(review): building the client presumably does not contact AWS, so
    # bad credentials may only surface on the first real request -- confirm.
    credentials = {
        "aws_access_key_id": access_key,
        "aws_secret_access_key": secret_key,
        "region_name": region,
    }
    try:
        return boto3.client('s3', **credentials)
    except Exception as e:
        st.error(f"Error creating S3 client: {str(e)}")
    return None

def list_s3_buckets(s3_client):
    """Return the names of the buckets reported by ``s3_client.list_buckets()``.

    Args:
        s3_client: Object exposing a boto3-style ``list_buckets()`` method.

    Returns:
        A list of bucket names, in the order the response lists them. On any
        exception the error is shown via ``st.error`` and ``[]`` is returned.
    """
    try:
        bucket_names = []
        # Kept inside the ``try`` so a malformed response is reported, not raised.
        for entry in s3_client.list_buckets()['Buckets']:
            bucket_names.append(entry['Name'])
        return bucket_names
    except Exception as e:
        st.error(f"Error listing buckets: {str(e)}")
        return []

def list_s3_objects(s3_client, bucket_name, prefix=""):
    """List the keys of supported data files in an S3 bucket.

    A single ``list_objects_v2`` call returns only one page of results, so
    the listing is driven through the client's paginator. This keeps buckets
    with many objects from being silently truncated.

    Args:
        s3_client: boto3 S3 client (must provide ``get_paginator``).
        bucket_name: Name of the bucket to list.
        prefix: Optional key prefix to restrict the listing to.

    Returns:
        A list of object keys ending in ``.csv``, ``.json``, ``.xlsx``,
        ``.xls`` or ``.parquet``, in listing order. On any exception the
        error is shown via ``st.error`` and ``[]`` is returned.
    """
    supported_suffixes = ('.csv', '.json', '.xlsx', '.xls', '.parquet')
    try:
        paginator = s3_client.get_paginator('list_objects_v2')
        keys = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # Pages for empty buckets/prefixes carry no 'Contents' entry.
            for obj in page.get('Contents', []):
                if obj['Key'].endswith(supported_suffixes):
                    keys.append(obj['Key'])
        return keys
    except Exception as e:
        st.error(f"Error listing objects: {str(e)}")
        return []

def read_s3_file(s3_client, bucket_name, object_key):
    """Download an S3 object and parse it into a pandas DataFrame.

    The parser is chosen from the key's file extension. The extension is
    matched case-insensitively, so ``REPORT.CSV`` is handled the same way
    as ``report.csv``.

    Args:
        s3_client: boto3 S3 client (must provide ``get_object``).
        bucket_name: Name of the bucket holding the object.
        object_key: Key of the object to read.

    Returns:
        The parsed DataFrame, or ``None`` when the extension is not one of
        ``.csv``, ``.json``, ``.xlsx``, ``.xls``, ``.parquet`` or when
        downloading/parsing raised. Failures are shown via ``st.error``.
    """
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
        content = response['Body'].read()

        # Dispatch on the lower-cased key so upper/mixed-case extensions
        # (e.g. ".CSV", ".Xlsx") are recognised too.
        key = object_key.lower()
        if key.endswith('.csv'):
            df = pd.read_csv(BytesIO(content))
        elif key.endswith('.json'):
            df = pd.read_json(BytesIO(content))
        elif key.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(BytesIO(content))
        elif key.endswith('.parquet'):
            df = pd.read_parquet(BytesIO(content))
        else:
            st.error("Unsupported file format")
            return None

        return df
    except Exception as e:
        st.error(f"Error reading file: {str(e)}")
        return None

def create_visualization(df, chart_type, x_col, y_col, color_col=None):
    """Build the Plotly Express figure for the requested chart type.

    Args:
        df: Data passed straight through to Plotly Express.
        chart_type: One of "Bar Chart", "Line Chart", "Scatter Plot",
            "Histogram", "Box Plot" or "Pie Chart".
        x_col: Column for the x-axis (slice names for a pie chart).
        y_col: Column for the y-axis (slice values for a pie chart);
            not used for histograms.
        color_col: Optional column used to colour the marks. ``None``, an
            empty value or the literal string "None" means no colouring.
            Ignored for pie charts.

    Returns:
        The figure, or ``None`` when ``chart_type`` is not a supported name.
    """
    # Forward ``color`` only when a real column was picked; "None" is the
    # placeholder entry offered by the colour selectbox.
    color_kwargs = {"color": color_col} if color_col and color_col != "None" else {}

    if chart_type == "Bar Chart":
        return px.bar(df, x=x_col, y=y_col, **color_kwargs)
    if chart_type == "Line Chart":
        return px.line(df, x=x_col, y=y_col, **color_kwargs)
    if chart_type == "Scatter Plot":
        return px.scatter(df, x=x_col, y=y_col, **color_kwargs)
    if chart_type == "Histogram":
        # Histograms bin a single column, so no y value is passed.
        return px.histogram(df, x=x_col, **color_kwargs)
    if chart_type == "Box Plot":
        return px.box(df, x=x_col, y=y_col, **color_kwargs)
    if chart_type == "Pie Chart":
        return px.pie(df, names=x_col, values=y_col)
    return None

# Main app: page heading and introduction
st.title("🗂️ S3 Data Visualization Tool")
st.markdown("Connect to your S3 bucket and visualize your data with interactive charts")

# Sidebar: collect AWS credentials and create the S3 client on request
with st.sidebar:
    st.header("🔑 AWS S3 Configuration")

    access_key = st.text_input("AWS Access Key ID", type="password")
    secret_key = st.text_input("AWS Secret Access Key", type="password")
    region = st.text_input("AWS Region", value="us-east-1")

    connect_clicked = st.button("Connect to S3")
    if connect_clicked:
        if not (access_key and secret_key and region):
            st.error("Please provide all required credentials")
        else:
            # Store the client (or None on failure) in session state so the
            # main content area can pick it up.
            client = create_s3_client(access_key, secret_key, region)
            st.session_state.s3_client = client
            if client:
                st.success("✅ Connected to S3!")

# Main content area: rendered only when a truthy client is in session state
if 's3_client' in st.session_state and st.session_state.s3_client:
    active_client = st.session_state.s3_client
    selection_col, preview_col = st.columns([1, 2])

    # Left column: pick a bucket and file, then load it into session state
    with selection_col:
        st.subheader("📁 Data Selection")

        buckets = list_s3_buckets(active_client)
        if not buckets:
            st.warning("No buckets found or insufficient permissions")
        else:
            selected_bucket = st.selectbox("Select S3 Bucket:", buckets)

            objects = list_s3_objects(active_client, selected_bucket)
            if not objects:
                st.info("No supported files found in this bucket. Supported formats: CSV, JSON, Excel, Parquet")
            else:
                selected_object = st.selectbox("Select File:", objects)

                load_clicked = st.button("Load Data")
                if load_clicked:
                    with st.spinner("Loading data from S3..."):
                        df = read_s3_file(active_client, selected_bucket, selected_object)
                        # Only overwrite a previously loaded frame on success.
                        if df is not None:
                            st.session_state.df = df
                            st.success(f"✅ Loaded {len(df)} rows and {len(df.columns)} columns")

    # Right column: preview and headline numbers for the loaded frame
    with preview_col:
        if 'df' in st.session_state:
            df = st.session_state.df

            st.subheader("📊 Data Preview")
            st.dataframe(df.head(), use_container_width=True)

            # One metric per column, left to right.
            metrics = (
                ("Rows", len(df)),
                ("Columns", len(df.columns)),
                ("Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB"),
            )
            for metric_col, (label, value) in zip(st.columns(3), metrics):
                with metric_col:
                    st.metric(label, value)

# Visualization section: needs a DataFrame in session state
if 'df' not in st.session_state:
    st.info("👈 Please configure your S3 connection in the sidebar and load data to start visualizing!")
else:
    df = st.session_state.df
    all_columns = df.columns.tolist()

    st.header("📈 Data Visualization")

    # Chart configuration: four selectors side by side
    type_slot, x_slot, y_slot, color_slot = st.columns(4)

    with type_slot:
        chart_type = st.selectbox(
            "Chart Type:",
            ["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot", "Pie Chart"]
        )

    with x_slot:
        x_col = st.selectbox("X-axis:", all_columns)

    with y_slot:
        if chart_type == "Histogram":
            # Histograms take no y column, so no selector is shown.
            y_col = None
        else:
            # Prefer numeric columns; fall back to every column if none exist.
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            y_col = st.selectbox("Y-axis:", numeric_cols or df.columns.tolist())

    with color_slot:
        color_col = st.selectbox("Color by:", ["None"] + df.columns.tolist())

    # Generate visualization
    if st.button("Generate Chart", type="primary"):
        try:
            fig = None
            if chart_type == "Histogram":
                fig = create_visualization(df, chart_type, x_col, None, color_col)
            elif not y_col:
                if chart_type == "Pie Chart":
                    st.error("Please select a Y-axis column for Pie Chart")
                else:
                    st.error("Please select a Y-axis column")
            elif chart_type == "Pie Chart":
                # Pie charts are built without a colour column.
                fig = create_visualization(df, chart_type, x_col, y_col)
            else:
                fig = create_visualization(df, chart_type, x_col, y_col, color_col)

            if fig:
                st.plotly_chart(fig, use_container_width=True)
        except Exception as e:
            st.error(f"Error creating visualization: {str(e)}")

    # Data analysis section
    st.header("🔍 Data Analysis")

    summary_tab, dtypes_tab, missing_tab = st.tabs(["Summary Statistics", "Data Types", "Missing Values"])

    with summary_tab:
        st.subheader("Summary Statistics")
        st.dataframe(df.describe(), use_container_width=True)

    with dtypes_tab:
        st.subheader("Data Types")
        dtype_df = pd.DataFrame({
            'Column': df.columns,
            'Data Type': df.dtypes.astype(str),
            'Non-Null Count': df.count()
        })
        st.dataframe(dtype_df, use_container_width=True)

    with missing_tab:
        st.subheader("Missing Values")
        null_counts = df.isnull().sum()
        missing_df = pd.DataFrame({
            'Column': df.columns,
            'Missing Count': null_counts,
            'Missing Percentage': (null_counts / len(df) * 100).round(2)
        })
        # Keep only the columns that have at least one missing value.
        missing_df = missing_df[missing_df['Missing Count'] > 0]
        if missing_df.empty:
            st.success("✅ No missing values found in the dataset!")
        else:
            st.dataframe(missing_df, use_container_width=True)

            # Missing values visualization
            fig_missing = px.bar(
                missing_df,
                x='Column',
                y='Missing Percentage',
                title="Missing Values by Column"
            )
            st.plotly_chart(fig_missing, use_container_width=True)

# Footer: a horizontal rule followed by the credit line
for footer_line in (
    "---",
    "Built with ❤️ using Streamlit | Upload your data to S3 and visualize it instantly!",
):
    st.markdown(footer_line)
Loading code editor...
Click Save & Run to preview your app
Terminal