# An app showcasing k-means clustering.
# To upload files, please first save the app.
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
# Heading rendered at the top of the main page area.
st.title("K-Means Clustering Demo")

# Everything created inside this context is placed in the sidebar.
with st.sidebar:
    st.header("Parameters")
    n_clusters = st.slider(
        "Number of Clusters (K)",
        min_value=2,
        max_value=10,
        value=3,
    )
    n_points = st.slider(
        "Number of Points",
        min_value=50,
        max_value=1000,
        value=300,
    )
    random_state = st.slider(
        "Random Seed",
        min_value=0,
        max_value=100,
        value=42,
    )
# Generate random data
def generate_data(n_points, random_state):
    """Generate a shuffled 2-D sample drawn from three Gaussian blobs.

    The blobs are centred at (2, 2), (-2, -2) and (2, -2), each with a
    standard deviation of 0.5 on both axes.

    Args:
        n_points: Total number of points to generate (non-negative int).
        random_state: Seed passed to ``np.random.seed``; the same seed and
            ``n_points`` always produce the same array.

    Returns:
        A float array of shape ``(n_points, 2)`` with rows in shuffled order.

    Raises:
        ValueError: If ``n_points`` is negative.
    """
    if n_points < 0:
        raise ValueError(f"n_points must be non-negative, got {n_points}")

    # Seeds NumPy's global generator so results are reproducible per seed.
    np.random.seed(random_state)

    blob_centers = ([2, 2], [-2, -2], [2, -2])
    # Split n_points as evenly as possible: the first `remainder` blobs get
    # one extra point, so the total is exactly n_points instead of
    # 3 * (n_points // 3).
    base, remainder = divmod(n_points, len(blob_centers))
    blobs = [
        np.random.normal(
            loc=center,
            scale=0.5,
            size=(base + int(i < remainder), 2),
        )
        for i, center in enumerate(blob_centers)
    ]

    data = np.vstack(blobs)
    # With fewer than two rows there is nothing to reorder.
    if len(data) > 1:
        np.random.shuffle(data)
    return data
# Build the synthetic point cloud and expose its two columns as X and Y.
data = generate_data(n_points, random_state)
df = pd.DataFrame({'X': data[:, 0], 'Y': data[:, 1]})

# Fit K-means on the raw array, then store each point's assigned cluster id.
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data)
df['Cluster'] = kmeans.labels_
# Create scatter plot.
# Cluster ids are categorical labels, so they are cast to strings for the
# plot: Plotly Express then gives each cluster its own discrete colour and
# legend entry, instead of a continuous colour bar that implies an ordering
# (and fractional values) between ids.
fig = px.scatter(
    df.assign(Cluster=df['Cluster'].astype(str)),
    x='X',
    y='Y',
    color='Cluster',
    title='K-Means Clustering Visualization',
    # List legend entries in numeric order rather than order of appearance.
    category_orders={'Cluster': [str(i) for i in range(n_clusters)]},
    # Evenly spaced samples from the viridis scale, one per cluster.
    color_discrete_sequence=px.colors.sample_colorscale('viridis', n_clusters),
)
# Update layout
fig.update_layout(
    plot_bgcolor='white',
    showlegend=True,
    width=800,
    height=600,
)
# Display plot
st.plotly_chart(fig)
# Table of the fitted centroids, indexed by cluster id.
centers = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=['X', 'Y'],
).rename_axis('Cluster')
st.subheader("Cluster Centers")
st.dataframe(centers)

# Inertia is the within-cluster sum of squares reported by the fitted model.
st.subheader("Clustering Metrics")
st.write(f"Inertia (Within-cluster sum of squares): {kmeans.inertia_:.2f}")
# Add explanation.
# Blank lines separate the headings, the lists and the closing paragraph so
# Markdown does not fold the final sentence into the last bullet.
st.markdown("""
### How it works

1. K-means clustering partitions n observations into k clusters.
2. Each observation belongs to the cluster with the nearest mean (cluster center).
3. The algorithm aims to minimize the within-cluster sum of squares (inertia).

### Parameters

- **Number of Clusters (K)**: The number of clusters to form
- **Number of Points**: Total number of data points to generate
- **Random Seed**: Seeds both the random generation of data points and the K-means initialization

Try adjusting the parameters in the sidebar to see how they affect the clustering!
""")
# Hi! I can help you with any questions about Streamlit and Python. What would you like to know?