Find a primary topic and all semantically related subtopics, ranked by cosine similarity scores computed over Wikipedia article text.
To upload files, please first save the app
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
st.title('Topic Similarity Analysis')

# Built-in corpus mapping a topic title to a one-sentence summary.
# These are placeholder texts; substitute fuller article bodies if you have them.
articles = {
    'Python (programming language)': 'Python is an interpreted, high-level, general-purpose programming language.',
    'Java (programming language)': 'Java is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible.',
    'JavaScript': 'JavaScript is a programming language that conforms to the ECMAScript specification.',
    'Data Science': 'Data science is a multi-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.',
    'Machine Learning': 'Machine learning is the study of computer algorithms that improve automatically through experience and by the use of data.',
}

# Two-column frame (Topic, Content) with the default 0..n-1 integer index.
article_df = pd.DataFrame({
    'Topic': list(articles),
    'Content': list(articles.values()),
})

# Let the user pick which topic every other topic is compared against.
primary_topic = st.selectbox('Select a primary topic:', article_df['Topic'])

# Vectorize each article's text with TF-IDF (default vectorizer settings),
# then compute the full topic-by-topic cosine similarity matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(article_df['Content'])
pairwise_similarity = cosine_similarity(tfidf_matrix)

# Because the frame uses the default integer index, the index label of the
# selected topic is also its row position in the similarity matrix.
selected_row = article_df.index[article_df['Topic'] == primary_topic][0]
scores_for_selected = pairwise_similarity[selected_row]

# Pair every topic (the selected one included) with its score and rank
# from most to least similar.
similarity_df = article_df[['Topic']].copy()
similarity_df['Similarity Score'] = scores_for_selected
similarity_df = similarity_df.sort_values(by='Similarity Score', ascending=False)

# Render the ranked table under a caption naming the selected topic.
st.write(f'Similarity Scores for topics related to **{primary_topic}**')
st.table(similarity_df)
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?