Document analysis app with dynamic knowledge-graph generation from document content
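Setup and run (an assumed local workflow, not part of the original app: the script is saved as app.py, and the packages named below are the standard PyPI distributions for the imports that follow):

pip install streamlit pandas networkx plotly pyvis scikit-learn
streamlit run app.py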
import streamlit as st
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
import re
from pyvis.network import Network
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit.components.v1 as components
# Page configuration must be the first Streamlit call in the script
st.set_page_config(page_title="Document Analysis & Knowledge Graph", layout="wide")

# Initialize session state
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
def extract_entities_simple(text):
    """Simple entity extraction using pattern matching"""
    entities = []
    # Capitalized words and multi-word sequences (potential proper nouns)
    proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    entities.extend(proper_nouns)
    # Acronyms (two or more consecutive capital letters)
    acronyms = re.findall(r'\b[A-Z]{2,}\b', text)
    entities.extend(acronyms)
    # Numbers followed by a unit (e.g. "2 trillion USD", "5 percent")
    numbers = re.findall(r'\b\d+(?:\.\d+)?(?:\s*(?:million|billion|trillion|thousand|percent|%|USD|EUR|km|miles))\b', text, re.IGNORECASE)
    entities.extend(numbers)
    return list(set(entities))
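# Illustrative example (added; not from the original source): on the sentence
# "Apple was founded by Steve Jobs in Cupertino", extract_entities_simple
# returns ['Apple', 'Steve Jobs', 'Cupertino'] in arbitrary order, since the
# result passes through an unordered set().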
def extract_relationships(text, entities):
    """Extract relationships between entities"""
    relationships = []
    # Common relationship patterns. Note: each (\w+) group captures a single
    # word, so multi-word entities such as "Steve Jobs" will not match here.
    patterns = [
        r'(\w+)\s+(?:is|was|are|were)\s+(?:a|an|the)?\s*(\w+)',
        r'(\w+)\s+(?:owns|has|contains|includes)\s+(\w+)',
        r'(\w+)\s+(?:works for|employed by|part of)\s+(\w+)',
        r'(\w+)\s+(?:located in|based in|from)\s+(\w+)',
        r'(\w+)\s+(?:and|with|alongside)\s+(\w+)',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for entity1, entity2 in matches:
            if entity1 in entities and entity2 in entities:
                relationships.append((entity1, entity2, 'related'))
    return relationships
def analyze_document(text):
    """Analyze document and extract key information"""
    analysis = {}
    # Basic statistics
    analysis['word_count'] = len(text.split())
    analysis['char_count'] = len(text)
    analysis['sentence_count'] = len(re.findall(r'[.!?]+', text))
    analysis['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])
    # Extract entities
    entities = extract_entities_simple(text)
    analysis['entities'] = entities[:20]  # Keep at most 20 entities
    # Extract key phrases using TF-IDF over sentences
    sentences = re.split(r'[.!?]+', text)
    if len(sentences) > 1:
        vectorizer = TfidfVectorizer(max_features=10, stop_words='english', ngram_range=(1, 2))
        try:
            vectorizer.fit_transform(sentences)
            analysis['key_phrases'] = list(vectorizer.get_feature_names_out())
        except ValueError:
            # Raised when the vocabulary is empty (e.g. only stop words)
            analysis['key_phrases'] = []
    else:
        analysis['key_phrases'] = []
    # Word frequency, excluding a small built-in stop-word list
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    stop_words = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'use', 'man', 'new',
        'now', 'way', 'may', 'say', 'each', 'which', 'their', 'time', 'will',
        'about', 'would', 'there', 'could', 'other', 'after', 'first', 'well',
        'water', 'been', 'call', 'who', 'oil', 'its', 'did', 'down', 'only',
        'into', 'over', 'think', 'also', 'your', 'work', 'life', 'still',
        'should', 'being', 'made', 'before', 'here', 'through', 'when',
        'where', 'how', 'what', 'does', 'then', 'them', 'these', 'they',
        'this', 'that', 'with', 'have', 'from', 'know', 'want', 'good',
        'much', 'some', 'very', 'come', 'just', 'like', 'long', 'make',
        'many', 'such', 'take', 'than', 'were'
    }
    filtered_words = [word for word in words if word not in stop_words and len(word) > 3]
    analysis['word_frequency'] = dict(Counter(filtered_words).most_common(10))
    return analysis
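# Illustrative example (added; not from the original source):
# analyze_document("Apple Inc. is based in Cupertino. Tim Cook leads Apple.")
# returns a dict with word_count, char_count, sentence_count, entities
# (here including 'Apple Inc', 'Cupertino', 'Tim Cook'), key_phrases, and
# word_frequency. Note the naive [.!?]+ counter treats the period in "Inc."
# as a sentence boundary, so sentence_count is 3 rather than 2.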
def create_knowledge_graph(documents_data):
    """Create a knowledge graph from analyzed documents"""
    G = nx.Graph()
    all_entities = set()
    all_relationships = []
    for doc_data in documents_data:
        doc_entities = doc_data['analysis']['entities']
        all_entities.update(doc_entities)
        # Connect every pair of entities that co-occur in the same document
        for i, entity1 in enumerate(doc_entities):
            for entity2 in doc_entities[i + 1:]:
                if entity1 != entity2:
                    all_relationships.append((entity1, entity2, 'co-occurs'))
    # Add nodes
    for entity in all_entities:
        G.add_node(entity)
    # Add edges, accumulating weight for repeated co-occurrences
    for entity1, entity2, relation in all_relationships:
        if G.has_edge(entity1, entity2):
            G[entity1][entity2]['weight'] += 1
        else:
            G.add_edge(entity1, entity2, weight=1, relation=relation)
    return G
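# Illustrative example (added; not from the original source): if two uploaded
# documents both list "Apple" and "Cupertino" among their entities, the
# ("Apple", "Cupertino") edge ends up with weight 2, one increment per
# document in which the pair co-occurs.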
def visualize_knowledge_graph(G, layout='spring'):
    """Visualize knowledge graph using Plotly"""
    if len(G.nodes()) == 0:
        return None
    # Calculate layout
    if layout == 'spring':
        pos = nx.spring_layout(G, k=3, iterations=50)
    elif layout == 'circular':
        pos = nx.circular_layout(G)
    elif layout == 'kamada_kawai':
        pos = nx.kamada_kawai_layout(G)
    else:
        pos = nx.spring_layout(G)
    # Extract node and edge information
    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]
    node_text = list(G.nodes())
    node_size = [G.degree(node) * 10 + 10 for node in G.nodes()]
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # None entries separate segments so Plotly draws disconnected edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    # Create edge trace
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    # Create node trace
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        textposition="middle center",
        hovertext=[f"{node}<br>Connections: {G.degree(node)}" for node in G.nodes()],
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_size,
            size=node_size,
            colorbar=dict(
                thickness=15,
                len=0.5,
                x=1.1,
                title="Node Connections"
            ),
            line=dict(width=2)
        )
    )
    # Create figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title=dict(text='Knowledge Graph', font=dict(size=16)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            annotations=[dict(
                text="Hover over nodes to see connections",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.005, y=-0.002,
                xanchor="left", yanchor="bottom",
                font=dict(color="#888", size=12)
            )],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            height=600
        )
    )
    return fig
def create_pyvis_graph(G):
    """Create an interactive graph using PyVis; returns the path to an HTML file"""
    net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white")
    # Add nodes
    for node in G.nodes():
        net.add_node(node, label=node, title=f"Connections: {G.degree(node)}")
    # Add edges
    for edge in G.edges():
        weight = G[edge[0]][edge[1]].get('weight', 1)
        net.add_edge(edge[0], edge[1], width=weight, title=f"Weight: {weight}")
    # Configure physics
    net.set_options("""
    var options = {
        "physics": {
            "enabled": true,
            "stabilization": {"iterations": 100}
        }
    }
    """)
    # Save to a temporary file; close the handle first so the write succeeds
    # on platforms that lock open files (e.g. Windows)
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
    tmp_file.close()
    net.save_graph(tmp_file.name)
    return tmp_file.name
# Main app
st.title("π Document Analysis & Knowledge Graph Generator")
# Sidebar
with st.sidebar:
    st.header("Upload Documents")
    uploaded_files = st.file_uploader(
        "Choose text files",
        type=['txt'],
        accept_multiple_files=True
    )
    if uploaded_files:
        for uploaded_file in uploaded_files:
            # Compare by file name: UploadedFile objects are re-created on
            # each rerun, so identity comparison would re-add the same document
            if uploaded_file.name not in [doc['name'] for doc in st.session_state.documents]:
                content = uploaded_file.read().decode('utf-8')
                analysis = analyze_document(content)
                st.session_state.documents.append({
                    'file': uploaded_file,
                    'name': uploaded_file.name,
                    'content': content,
                    'analysis': analysis
                })
st.header("Sample Documents")
if st.button("Add Sample Document 1"):
sample_doc1 = """
Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
Apple designs, develops, and sells consumer electronics, computer software, and online services.
The company was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.
Apple is known for its iPhone, iPad, Mac computers, and Apple Watch products.
Tim Cook is the current CEO of Apple, succeeding Steve Jobs in 2011.
The company has a market capitalization of over 2 trillion USD.
"""
analysis = analyze_document(sample_doc1)
st.session_state.documents.append({
'file': None,
'name': 'Sample: Apple Inc.',
'content': sample_doc1,
'analysis': analysis
})
if st.button("Add Sample Document 2"):
sample_doc2 = """
Microsoft Corporation is an American multinational technology corporation with headquarters in Redmond, Washington.
Microsoft develops, manufactures, licenses, supports, and sells computer software, consumer electronics, personal computers, and related services.
The company was founded by Bill Gates and Paul Allen on April 4, 1975.
Microsoft is best known for its Windows operating systems, Microsoft Office suite, and Xbox gaming console.
Satya Nadella is the current CEO of Microsoft. The company has a strong presence in cloud computing with Azure.
"""
analysis = analyze_document(sample_doc2)
st.session_state.documents.append({
'file': None,
'name': 'Sample: Microsoft Corp.',
'content': sample_doc2,
'analysis': analysis
})
if st.button("Clear All Documents"):
st.session_state.documents = []
st.session_state.knowledge_graph = None
# Main content
if not st.session_state.documents:
    st.info("📂 Please upload documents or add sample documents from the sidebar to begin analysis.")
else:
    # Generate knowledge graph
    if st.session_state.knowledge_graph is None or st.button("🔄 Regenerate Knowledge Graph"):
        st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)
    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📄 Document Analysis", "🕸️ Knowledge Graph", "📊 Statistics", "🔍 Document Explorer"])
    with tab1:
        st.header("Document Analysis Overview")
        # Summary statistics
        col1, col2, col3, col4 = st.columns(4)
        total_words = sum(doc['analysis']['word_count'] for doc in st.session_state.documents)
        total_entities = len({entity for doc in st.session_state.documents for entity in doc['analysis']['entities']})
        col1.metric("Total Documents", len(st.session_state.documents))
        col2.metric("Total Words", total_words)
        col3.metric("Total Entities", total_entities)
        col4.metric("Graph Nodes", len(st.session_state.knowledge_graph.nodes()) if st.session_state.knowledge_graph else 0)
        # Document details
        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['name']}"):
                col1, col2 = st.columns(2)
                with col1:
                    st.subheader("Statistics")
                    st.write(f"**Words:** {doc['analysis']['word_count']}")
                    st.write(f"**Characters:** {doc['analysis']['char_count']}")
                    st.write(f"**Sentences:** {doc['analysis']['sentence_count']}")
                    st.write(f"**Paragraphs:** {doc['analysis']['paragraph_count']}")
                with col2:
                    st.subheader("Entities")
                    st.write(doc['analysis']['entities'])
                    st.subheader("Key Phrases")
                    st.write(doc['analysis']['key_phrases'])
                st.subheader("Word Frequency")
                if doc['analysis']['word_frequency']:
                    freq_df = pd.DataFrame(
                        list(doc['analysis']['word_frequency'].items()),
                        columns=['Word', 'Frequency']
                    )
                    st.bar_chart(freq_df.set_index('Word'))
    with tab2:
        st.header("Knowledge Graph Visualization")
        if st.session_state.knowledge_graph and len(st.session_state.knowledge_graph.nodes()) > 0:
            # Layout selection
            layout = st.selectbox("Select Layout", ["spring", "circular", "kamada_kawai"])
            # Plotly visualization
            fig = visualize_knowledge_graph(st.session_state.knowledge_graph, layout)
            if fig:
                st.plotly_chart(fig, use_container_width=True)
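            # create_pyvis_graph (defined above) is never called elsewhere in
            # the original app; the following optional hook is an added sketch
            # showing one way to embed its HTML output with components.html
            if st.checkbox("Show physics-based PyVis view"):
                html_path = create_pyvis_graph(st.session_state.knowledge_graph)
                with open(html_path, 'r', encoding='utf-8') as f:
                    components.html(f.read(), height=620)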
            # Graph statistics
            st.subheader("Graph Statistics")
            col1, col2, col3 = st.columns(3)
            col1.metric("Nodes", len(st.session_state.knowledge_graph.nodes()))
            col2.metric("Edges", len(st.session_state.knowledge_graph.edges()))
            avg_degree = sum(dict(st.session_state.knowledge_graph.degree()).values()) / len(st.session_state.knowledge_graph.nodes())
            col3.metric("Avg Connections", f"{avg_degree:.2f}")
            # Top connected entities
            st.subheader("Most Connected Entities")
            degree_centrality = nx.degree_centrality(st.session_state.knowledge_graph)
            top_entities = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
            entities_df = pd.DataFrame(top_entities, columns=['Entity', 'Centrality'])
            st.table(entities_df)
        else:
            st.warning("No entities found to create knowledge graph. Try uploading more detailed documents.")
    with tab3:
        st.header("Statistical Analysis")
        # Document comparison
        if len(st.session_state.documents) > 1:
            st.subheader("Document Comparison")
            comparison_data = []
            for doc in st.session_state.documents:
                comparison_data.append({
                    'Document': doc['name'],
                    'Words': doc['analysis']['word_count'],
                    'Entities': len(doc['analysis']['entities']),
                    'Key Phrases': len(doc['analysis']['key_phrases']),
                    'Sentences': doc['analysis']['sentence_count']
                })
            comparison_df = pd.DataFrame(comparison_data)
            st.table(comparison_df)
            # Visualization
            fig = px.bar(comparison_df, x='Document', y='Words', title='Word Count by Document')
            st.plotly_chart(fig, use_container_width=True)
        # Entity frequency across documents
        st.subheader("Entity Frequency Across Documents")
        all_entities = []
        for doc in st.session_state.documents:
            all_entities.extend(doc['analysis']['entities'])
        if all_entities:
            entity_freq = Counter(all_entities)
            top_entities = dict(entity_freq.most_common(15))
            entities_df = pd.DataFrame(list(top_entities.items()), columns=['Entity', 'Frequency'])
            fig = px.bar(entities_df, x='Entity', y='Frequency', title='Most Frequent Entities')
            fig.update_xaxes(tickangle=45)
            st.plotly_chart(fig, use_container_width=True)
    with tab4:
        st.header("Document Explorer")
        selected_doc = st.selectbox(
            "Select a document to explore",
            options=range(len(st.session_state.documents)),
            format_func=lambda x: st.session_state.documents[x]['name']
        )
        doc = st.session_state.documents[selected_doc]
        st.subheader(f"Content: {doc['name']}")
        st.text_area("Document Content", doc['content'], height=300)
        # Search in document
        search_term = st.text_input("Search in document")
        if search_term:
            if search_term.lower() in doc['content'].lower():
                # Bold every match; re.sub with IGNORECASE keeps the
                # highlighting consistent with the case-insensitive check above
                highlighted_content = re.sub(
                    re.escape(search_term),
                    lambda m: f"**{m.group(0)}**",
                    doc['content'],
                    flags=re.IGNORECASE
                )
                st.markdown("**Search Results:**")
                st.markdown(highlighted_content)
            else:
                st.warning("Search term not found in document.")
# Footer
st.markdown("---")
st.markdown("Built with Streamlit • Document Analysis & Knowledge Graph Generator")