# Streamlit app: HDBSCAN clustering interface whose parameters are read
# from the URL query string (see the usage instructions at the bottom).
import time

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.preprocessing import StandardScaler
st.title("HDBSCAN Clustering Interface")

# Get query parameters from the URL, e.g.
# ?min_cluster_size=10&min_samples=5&epsilon=0.5&n_samples=2000
query_params = st.experimental_get_query_params()

# Default values used when a parameter is absent or malformed.
default_min_cluster_size = 5
default_min_samples = 5
default_cluster_selection_epsilon = 0.0
default_n_samples = 1000


def _read_param(name, default, cast):
    """Return query parameter `name` converted with `cast`.

    Falls back to `default` when the parameter is missing or cannot be
    converted (e.g. ?n_samples=abc), instead of crashing the app with an
    uncaught ValueError.
    """
    raw = query_params.get(name, [default])[0]
    try:
        return cast(raw)
    except (TypeError, ValueError):
        return cast(default)


# Read parameters from the query string or use defaults.
min_cluster_size = _read_param('min_cluster_size', default_min_cluster_size, int)
min_samples = _read_param('min_samples', default_min_samples, int)
cluster_selection_epsilon = _read_param('epsilon', default_cluster_selection_epsilon, float)
n_samples = _read_param('n_samples', default_n_samples, int)

# Display current parameters
st.write("Current Parameters:")
st.write({
    'min_cluster_size': min_cluster_size,
    'min_samples': min_samples,
    'cluster_selection_epsilon': cluster_selection_epsilon,
    'n_samples': n_samples
})
# Generate sample data
@st.cache_data
def generate_sample_data(n_samples):
    """Generate exactly `n_samples` 2-D points from three Gaussian blobs.

    Uses a fixed seed so the data is deterministic across reruns (which
    also keeps the st.cache_data entry stable). The original version
    returned only 3 * (n_samples // 3) rows, silently dropping up to two
    points when n_samples was not divisible by 3; the remainder is now
    assigned to the first cluster so the requested count is honored.
    """
    np.random.seed(42)
    third, remainder = divmod(n_samples, 3)
    # Generate three clusters; the first absorbs the division remainder.
    cluster1 = np.random.normal(0, 1, (third + remainder, 2))
    cluster2 = np.random.normal(4, 1.5, (third, 2))
    cluster3 = np.random.normal(-4, 1, (third, 2))
    X = np.vstack([cluster1, cluster2, cluster3])
    return X
# Build the dataset and standardize both features so they contribute
# equally to HDBSCAN's distance computations.
X = generate_sample_data(n_samples)
X_scaled = StandardScaler().fit_transform(X)

# Run HDBSCAN with the URL-supplied parameters, timing the fit.
t0 = time.time()
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    cluster_selection_epsilon=cluster_selection_epsilon,
)
cluster_labels = clusterer.fit_predict(X_scaled)
elapsed = time.time() - t0

# Table of points with their assigned cluster, used by the plot below.
df = pd.DataFrame(X, columns=['x', 'y'])
df['cluster'] = cluster_labels

# Summary statistics. HDBSCAN marks noise points with the label -1.
noise_count = np.sum(cluster_labels == -1)
found = len(np.unique(cluster_labels[cluster_labels != -1]))
st.write(f"Time taken: {elapsed:.2f} seconds")
st.write(f"Number of clusters found: {found}")
st.write(f"Number of noise points: {noise_count}")
# Plot the results (requires the matplotlib.pyplot import at the top of
# the file — the original code referenced `plt` without importing it).
st.write("Clustering Results:")
fig = plt.figure(figsize=(10, 6))
# Draw noise points (label -1) first in gray so real clusters overlay them.
plt.scatter(df[df['cluster'] == -1]['x'],
            df[df['cluster'] == -1]['y'],
            c='gray', label='Noise', alpha=0.5)
# One scatter call per cluster so each gets its own color and legend entry.
for label in np.unique(cluster_labels[cluster_labels != -1]):
    mask = df['cluster'] == label
    plt.scatter(df[mask]['x'],
                df[mask]['y'],
                label=f'Cluster {label}',
                alpha=0.7)
plt.legend()
plt.title('HDBSCAN Clustering Results')
st.pyplot(fig)
# Close the figure so repeated Streamlit reruns do not accumulate
# open matplotlib figures in memory.
plt.close(fig)
# Usage help rendered at the bottom of the page.
_USAGE_INSTRUCTIONS = """
### How to use URL parameters
You can modify the clustering by adding the following parameters to the URL:
- `min_cluster_size`: Minimum size of clusters (default: 5)
- `min_samples`: Number of samples in a neighborhood (default: 5)
- `epsilon`: Cluster selection epsilon (default: 0.0)
- `n_samples`: Number of samples to generate (default: 1000)
Example URL:
```
?min_cluster_size=10&min_samples=5&epsilon=0.5&n_samples=2000
```
"""
st.markdown(_USAGE_INSTRUCTIONS)