Create a Streamlit app that takes a dataset and shows a visual of the number of missing observations per column. For each variable, the app should also show the 5 other columns with the highest missingness correlation to that variable.
To upload files, please first save the app
import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
# Missing Data Explorer page.
#
# Flow: upload a CSV -> bar chart of the percentage of missing values per
# column -> for every column that has at least one missing value, a table of
# the (up to) five other columns whose missingness indicator correlates most
# strongly (highest signed Pearson correlation) with it.


def _read_uploaded_csv(uploaded):
    """Parse the uploaded file as CSV.

    Returns the resulting DataFrame, or None when the file cannot be parsed;
    in that case the reason has already been shown on the page.
    """
    try:
        return pd.read_csv(uploaded)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return None


def _top_missingness_correlations(missing_corr, col, n=5):
    """Return up to *n* other columns ranked by missingness correlation with *col*.

    Correlations are undefined (NaN) against any column whose missingness is
    constant (never missing or always missing); those entries are dropped so
    they cannot appear in the ranking.
    """
    others = missing_corr[col].drop(labels=col).dropna()
    return others.sort_values(ascending=False).head(n)


def _render_missing_overview(df, missing_counts):
    """Draw the horizontal bar chart of missing percentage per column."""
    missing_percentages = (missing_counts / len(df) * 100).round(2)
    missing_data = pd.DataFrame({
        'Column': missing_counts.index,
        'Missing Count': missing_counts.values,
        'Missing Percentage': missing_percentages.values,
    })

    st.subheader("Missing Values Overview")
    chart = alt.Chart(missing_data).mark_bar().encode(
        x=alt.X('Missing Percentage:Q', title='Missing Percentage (%)'),
        y=alt.Y('Column:N', sort='-x', title='Column Name'),
        tooltip=['Column', 'Missing Count', 'Missing Percentage'],
    ).properties(
        # Grow with the number of columns so the bars stay readable.
        height=max(len(df.columns) * 25, 300)
    )
    st.altair_chart(chart, use_container_width=True)


def _render_missingness_correlations(df, missing_counts):
    """Show, per column with missing values, its top correlated columns."""
    st.subheader("Missingness Correlation Analysis")

    columns_with_missing = [col for col in df.columns if missing_counts[col] > 0]
    if not columns_with_missing:
        st.info("No columns contain missing values.")
        return

    # 1 where a value is missing, 0 otherwise; correlating these indicator
    # columns measures how often two columns are missing together.
    missing_binary = df.isnull().astype(int)
    missing_corr = missing_binary.corr()

    for col in columns_with_missing:
        st.write(f"\n**Top 5 missingness correlations for {col}:**")
        top_5_corr = _top_missingness_correlations(missing_corr, col)
        if top_5_corr.empty:
            # Happens when the column is entirely missing, or when no other
            # column has a varying missingness pattern to compare against.
            st.info("No missingness correlations can be computed for this column.")
            continue
        corr_data = pd.DataFrame({
            'Related Column': top_5_corr.index,
            'Correlation': top_5_corr.values.round(3),
        })
        st.table(corr_data)


st.title("Missing Data Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

if uploaded_file is not None:
    df = _read_uploaded_csv(uploaded_file)
    if df is None:
        pass  # The parse error has already been reported on the page.
    elif df.empty:
        # A header-only file has zero rows; percentages would be 0/0.
        st.warning("The uploaded file contains no data rows, so there is nothing to analyse.")
    else:
        missing_counts = df.isnull().sum()
        _render_missing_overview(df, missing_counts)
        _render_missingness_correlations(df, missing_counts)
else:
    st.write("Please upload a CSV file to begin the analysis.")
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?