Document analysis app with dynamic knowledge graph generation based on document content
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import nltk
from collections import Counter, defaultdict
import re
import json
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
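# Note: newer NLTK releases (3.9+) ship the tokenizer and tagger data under new names;
# if a LookupError still occurs, 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# may also need to be downloaded.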
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
st.set_page_config(
    page_title="Document Knowledge Graph Generator",
    page_icon="📊",
    layout="wide"
)
st.title("📊 Document Analysis & Knowledge Graph Generator")
st.markdown("Upload documents to generate dynamic knowledge graphs and perform comprehensive text analysis.")
# Initialize session state
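# st.session_state persists these values across Streamlit reruns, so uploaded
# documents and the generated graph survive widget interactions.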
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
# Sidebar for document upload and settings
with st.sidebar:
    st.header("📂 Document Upload")
    uploaded_files = st.file_uploader(
        "Choose text files",
        type=['txt', 'md'],
        accept_multiple_files=True
    )
    if uploaded_files:
        for file in uploaded_files:
            # getvalue() returns the full file contents on every rerun,
            # unlike read(), which can return empty bytes once the buffer is exhausted
            content = file.getvalue().decode("utf-8")
            doc_info = {
                'name': file.name,
                'content': content,
                'word_count': len(content.split()),
                'char_count': len(content)
            }
            if doc_info not in st.session_state.documents:
                st.session_state.documents.append(doc_info)

    st.header("⚙️ Analysis Settings")
    min_word_freq = st.slider("Minimum word frequency", 1, 10, 2)
    max_nodes = st.slider("Maximum nodes in graph", 10, 100, 50)
    # Penn Treebank tags: NN* = nouns, VB* = verbs, JJ* = adjectives
    include_pos = st.multiselect(
        "Include POS tags",
        ["NN", "NNS", "NNP", "NNPS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS"],
        default=["NN", "NNS", "NNP", "NNPS", "JJ"]
    )
# Main content area
if st.session_state.documents:
    tab1, tab2, tab3, tab4 = st.tabs(["📄 Documents", "📊 Analysis", "🕸️ Knowledge Graph", "💾 Export"])

    with tab1:
        st.header("Uploaded Documents")
        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['name']} ({doc['word_count']} words)"):
                col1, col2 = st.columns([3, 1])
                with col1:
                    st.text_area(
                        "Content",
                        doc['content'][:1000] + "..." if len(doc['content']) > 1000 else doc['content'],
                        height=200,
                        key=f"content_{i}"
                    )
                with col2:
                    st.metric("Words", doc['word_count'])
                    st.metric("Characters", doc['char_count'])
                    if st.button("Remove", key=f"remove_{i}"):
                        st.session_state.documents.pop(i)
                        st.rerun()
    with tab2:
        st.header("Document Analysis")
        if st.button("📊 Analyze Documents"):
            # Combine all documents
            combined_text = " ".join([doc['content'] for doc in st.session_state.documents])

            # Basic statistics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Total Documents", len(st.session_state.documents))
            with col2:
                st.metric("Total Words", len(combined_text.split()))
            with col3:
                st.metric("Total Characters", len(combined_text))
            with col4:
                sentences = sent_tokenize(combined_text)
                st.metric("Total Sentences", len(sentences))

            # Sentiment analysis
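            # TextBlob polarity ranges from -1.0 (most negative) to +1.0 (most positive);
            # subjectivity ranges from 0.0 (objective) to 1.0 (subjective).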
            st.subheader("📈 Sentiment Analysis")
            blob = TextBlob(combined_text)
            sentiment_score = blob.sentiment.polarity
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Sentiment Score", f"{sentiment_score:.3f}")
                if sentiment_score > 0.1:
                    st.success("Positive sentiment")
                elif sentiment_score < -0.1:
                    st.error("Negative sentiment")
                else:
                    st.info("Neutral sentiment")
            with col2:
                subjectivity = blob.sentiment.subjectivity
                st.metric("Subjectivity", f"{subjectivity:.3f}")
                if subjectivity > 0.5:
                    st.info("Subjective content")
                else:
                    st.info("Objective content")
            # Word frequency analysis
            st.subheader("📊 Word Frequency Analysis")
            # Tokenize and clean text
            stop_words = set(stopwords.words('english'))
            words = word_tokenize(combined_text.lower())
            words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 2]
            # POS tagging
            tagged_words = pos_tag(words)
            filtered_words = [word for word, pos in tagged_words if pos in include_pos]
            word_freq = Counter(filtered_words)
            top_words = word_freq.most_common(20)
            # Create frequency chart
            if top_words:
                df_freq = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
                fig = px.bar(df_freq, x='Word', y='Frequency', title="Top 20 Most Frequent Words")
                st.plotly_chart(fig, use_container_width=True)
                # Word cloud style visualization
                fig_scatter = px.scatter(
                    df_freq,
                    x='Word',
                    y='Frequency',
                    size='Frequency',
                    title="Word Frequency Bubble Chart"
                )
                st.plotly_chart(fig_scatter, use_container_width=True)
    with tab3:
        st.header("Knowledge Graph Generation")
        if st.button("🕸️ Generate Knowledge Graph"):
            # Combine all documents
            combined_text = " ".join([doc['content'] for doc in st.session_state.documents])

            # Extract entities and relationships
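            # Approach: treat frequent content words (filtered by the selected POS tags)
            # as entities, and connect two entities whenever they co-occur in the same
            # sentence; the co-occurrence count becomes the edge weight.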
            stop_words = set(stopwords.words('english'))
            sentences = sent_tokenize(combined_text)

            # Create graph
            G = nx.Graph()
            entity_freq = defaultdict(int)
            relationships = defaultdict(int)
            for sentence in sentences:
                words = word_tokenize(sentence.lower())
                words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 2]
                # POS tagging to identify entities
                tagged_words = pos_tag(words)
                entities = [word for word, pos in tagged_words if pos in include_pos]
                # Count entity frequency
                for entity in entities:
                    entity_freq[entity] += 1
                # Create relationships between entities in the same sentence
                for i in range(len(entities)):
                    for j in range(i + 1, len(entities)):
                        if entities[i] != entities[j]:
                            relationship = tuple(sorted([entities[i], entities[j]]))
                            relationships[relationship] += 1

            # Filter entities by frequency
            filtered_entities = {k: v for k, v in entity_freq.items() if v >= min_word_freq}
            # Add nodes to graph, keeping the most frequent entities up to max_nodes
            top_entities = sorted(filtered_entities.items(), key=lambda x: x[1], reverse=True)[:max_nodes]
            for entity, freq in top_entities:
                G.add_node(entity, frequency=freq, size=freq*10)
            # Add edges to graph
            for (entity1, entity2), weight in relationships.items():
                if entity1 in G.nodes() and entity2 in G.nodes() and weight >= min_word_freq:
                    G.add_edge(entity1, entity2, weight=weight)
            st.session_state.knowledge_graph = G
            # Display graph statistics
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Nodes", G.number_of_nodes())
            with col2:
                st.metric("Edges", G.number_of_edges())
            with col3:
                st.metric("Density", f"{nx.density(G):.3f}")

            # Visualize graph with Plotly
            if G.number_of_nodes() > 0:
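                # spring_layout computes a force-directed (Fruchterman-Reingold) layout;
                # k sets the preferred distance between nodes and iterations bounds the solver.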
                pos = nx.spring_layout(G, k=1, iterations=50)

                # Create edge traces
                edge_x = []
                edge_y = []
                for edge in G.edges():
                    x0, y0 = pos[edge[0]]
                    x1, y1 = pos[edge[1]]
                    # None entries create gaps so each edge is drawn as a separate segment
                    edge_x.extend([x0, x1, None])
                    edge_y.extend([y0, y1, None])
                edge_trace = go.Scatter(
                    x=edge_x, y=edge_y,
                    line=dict(width=0.5, color='#888'),
                    hoverinfo='none',
                    mode='lines'
                )

                # Create node traces
                node_x = []
                node_y = []
                node_text = []
                node_size = []
                for node in G.nodes():
                    x, y = pos[node]
                    node_x.append(x)
                    node_y.append(y)
                    node_text.append(f"{node}<br>Freq: {G.nodes[node]['frequency']}")
                    node_size.append(G.nodes[node]['frequency'] * 5)
                node_trace = go.Scatter(
                    x=node_x, y=node_y,
                    mode='markers+text',
                    hoverinfo='text',
                    text=[node for node in G.nodes()],
                    textposition="middle center",
                    hovertext=node_text,
                    marker=dict(
                        size=node_size,
                        color='lightblue',
                        line=dict(width=2, color='darkblue')
                    )
                )

                fig = go.Figure(
                    data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated titlefont_size attribute
                        title=dict(text='Knowledge Graph', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Hover over nodes for details",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002
                        )],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                    )
                )
                st.plotly_chart(fig, use_container_width=True)
            # Graph analysis
            st.subheader("Graph Analysis")
            if G.number_of_nodes() > 0:
                # Centrality measures
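                # Degree centrality: fraction of other nodes a node is directly connected to.
                # Betweenness centrality: how often a node lies on shortest paths between others.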
                degree_centrality = nx.degree_centrality(G)
                betweenness_centrality = nx.betweenness_centrality(G)
                # Top central nodes
                col1, col2 = st.columns(2)
                with col1:
                    st.write("**Most Connected Nodes (Degree Centrality)**")
                    top_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                    for node, centrality in top_degree:
                        st.write(f"• {node}: {centrality:.3f}")
                with col2:
                    st.write("**Most Important Nodes (Betweenness Centrality)**")
                    top_betweenness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                    for node, centrality in top_betweenness:
                        st.write(f"• {node}: {centrality:.3f}")
            else:
                st.warning("No nodes found with the current settings. Try lowering the minimum word frequency.")
    with tab4:
        st.header("Export Data")
        if st.session_state.knowledge_graph:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Export Graph Data")
                # Export nodes
                nodes_data = []
                for node in st.session_state.knowledge_graph.nodes(data=True):
                    nodes_data.append({
                        'node': node[0],
                        'frequency': node[1]['frequency']
                    })
                nodes_df = pd.DataFrame(nodes_data)
                st.download_button(
                    "Download Nodes CSV",
                    nodes_df.to_csv(index=False),
                    "knowledge_graph_nodes.csv",
                    "text/csv"
                )
                # Export edges
                edges_data = []
                for edge in st.session_state.knowledge_graph.edges(data=True):
                    edges_data.append({
                        'source': edge[0],
                        'target': edge[1],
                        'weight': edge[2].get('weight', 1)
                    })
                if edges_data:
                    edges_df = pd.DataFrame(edges_data)
                    st.download_button(
                        "Download Edges CSV",
                        edges_df.to_csv(index=False),
                        "knowledge_graph_edges.csv",
                        "text/csv"
                    )
            with col2:
                st.subheader("Export Analysis Report")
                # Generate report
                combined_text = " ".join([doc['content'] for doc in st.session_state.documents])
                blob = TextBlob(combined_text)
                report = {
                    "document_count": len(st.session_state.documents),
                    "total_words": len(combined_text.split()),
                    "total_characters": len(combined_text),
                    "total_sentences": len(sent_tokenize(combined_text)),
                    "sentiment_score": blob.sentiment.polarity,
                    "subjectivity_score": blob.sentiment.subjectivity,
                    "graph_nodes": st.session_state.knowledge_graph.number_of_nodes(),
                    "graph_edges": st.session_state.knowledge_graph.number_of_edges(),
                    "graph_density": nx.density(st.session_state.knowledge_graph)
                }
                st.download_button(
                    "Download Analysis Report",
                    json.dumps(report, indent=2),
                    "analysis_report.json",
                    "application/json"
                )
        else:
            st.info("Generate a knowledge graph first to export data.")
else:
    st.info("👆 Upload documents using the sidebar to get started!")

    # Sample data section
    st.header("📄 Sample Documents")
    st.markdown("You can use these sample texts to test the application:")
    sample_texts = {
        "Technology Article": """
Artificial intelligence is transforming modern technology. Machine learning algorithms are becoming more sophisticated every day. Neural networks and deep learning models are revolutionizing data analysis. Companies are investing heavily in AI research and development. The future of technology depends on artificial intelligence innovations.
""",
        "Business Report": """
The quarterly earnings report shows significant growth in revenue. Market analysis indicates strong performance across all sectors. Customer satisfaction has improved due to better service quality. The company's strategic initiatives are delivering positive results. Stakeholder confidence remains high despite market volatility.
""",
        "Scientific Abstract": """
This research study examines the effects of climate change on marine ecosystems. Ocean temperature increases are affecting biodiversity patterns. Coral reef degradation is accelerating due to acidification. Marine species are adapting to changing environmental conditions. Conservation efforts are crucial for ecosystem preservation.
"""
    }
    for title, content in sample_texts.items():
        with st.expander(f"📄 {title}"):
            st.text_area("Content", content, height=100, key=f"sample_{title}")
            if st.button(f"Use {title}", key=f"use_{title}"):
                doc_info = {
                    'name': f"{title}.txt",
                    'content': content.strip(),
                    'word_count': len(content.strip().split()),
                    'char_count': len(content.strip())
                }
                st.session_state.documents.append(doc_info)
                st.rerun()
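
The Export tab writes knowledge_graph_nodes.csv and knowledge_graph_edges.csv. As a rough, optional sketch (not part of the app above), the exported files can be reloaded into a NetworkX graph for offline analysis; the column names below match the app's export code:

# Rebuild the exported knowledge graph from the two CSVs produced by the Export tab.
import pandas as pd
import networkx as nx

nodes_df = pd.read_csv("knowledge_graph_nodes.csv")   # columns: node, frequency
edges_df = pd.read_csv("knowledge_graph_edges.csv")   # columns: source, target, weight

G = nx.Graph()
for _, row in nodes_df.iterrows():
    G.add_node(row["node"], frequency=row["frequency"])
for _, row in edges_df.iterrows():
    G.add_edge(row["source"], row["target"], weight=row["weight"])

print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges, density", nx.density(G))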