"""Document analysis app: extracts entities from uploaded documents and generates a dynamic knowledge graph."""
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import re
from collections import Counter, defaultdict
import json
from sqlalchemy import create_engine, Column, Integer, String, Text, Float, DateTime
from sqlalchemy.orm import DeclarativeBase, Session
from datetime import datetime
import io
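# Note: PyPDF2 and python-docx are imported lazily inside extract_text_from_file,
# so they are only needed when PDF/DOCX files are actually uploaded. TextBlob's
# noun_phrases property depends on the NLTK corpora; if entity extraction fails,
# run `python -m textblob.download_corpora` once in your environment.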
# Database setup
class Base(DeclarativeBase):
pass
class Document(Base):
__tablename__ = 'documents'
id = Column(Integer, primary_key=True)
name = Column(String(255))
content = Column(Text)
upload_time = Column(DateTime, default=datetime.utcnow)
doc_type = Column(String(50))
word_count = Column(Integer)
class Entity(Base):
__tablename__ = 'entities'
id = Column(Integer, primary_key=True)
document_id = Column(Integer)
entity_name = Column(String(255))
entity_type = Column(String(100))
frequency = Column(Integer)
importance_score = Column(Float)
class Relationship(Base):
__tablename__ = 'relationships'
id = Column(Integer, primary_key=True)
document_id = Column(Integer)
source_entity = Column(String(255))
target_entity = Column(String(255))
relationship_type = Column(String(100))
strength = Column(Float)
# Initialize database
engine = create_engine("sqlite:///knowledge_graph.db")
Base.metadata.create_all(bind=engine)
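# create_all only creates tables that do not already exist, so rerunning the app
# against the same knowledge_graph.db file is safe.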
class DocumentAnalyzer:
def __init__(self):
self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
def extract_text_from_file(self, file):
"""Extract text from uploaded file"""
try:
if file.type == "application/pdf":
import PyPDF2
pdf_reader = PyPDF2.PdfReader(file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text() or ""  # guard against pages with no extractable text
return text
elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
from docx import Document as DocxDocument
doc = DocxDocument(file)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text
elif file.type == "text/plain":
return str(file.read(), "utf-8")
else:
return str(file.read(), "utf-8")
except Exception as e:
st.error(f"Error reading file: {str(e)}")
return ""
def extract_entities(self, text):
"""Extract entities using TextBlob and custom patterns"""
blob = TextBlob(text)
# Extract noun phrases as potential entities
noun_phrases = [phrase.lower().strip() for phrase in blob.noun_phrases if len(phrase) > 2]
# Extract named entities using simple patterns
# People names (naive: any run of capitalized words, so sentence-initial words also match)
people_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
people = re.findall(people_pattern, text)
# Organizations (words with Corp, Inc, Ltd, etc.)
org_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:Corp|Inc|Ltd|LLC|Company|Organization|Institute|University|College))\b'
organizations = re.findall(org_pattern, text)
# Locations (capitalized phrases that might be places)
location_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:City|State|Country|County|Province|Region|Street|Avenue|Road))\b'
locations = re.findall(location_pattern, text)
# Dates
date_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
dates = re.findall(date_pattern, text)
entities = {
'PERSON': list(set(people)),
'ORGANIZATION': list(set(organizations)),
'LOCATION': list(set(locations)),
'DATE': list(set(dates)),
'CONCEPT': list(set([phrase for phrase in noun_phrases if len(phrase.split()) <= 3]))
}
return entities
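# Example: for "Tim Cook serves as the CEO of Apple Inc.", extract_entities would
# return something like {'PERSON': ['Tim Cook', ...], 'ORGANIZATION': ['Apple Inc'], ...}
# (exact results depend on TextBlob's noun-phrase chunking).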
def calculate_entity_importance(self, entities, text):
"""Calculate importance scores for entities"""
text_lower = text.lower()
word_count = max(len(text.split()), 1)  # avoid division by zero for empty text
entity_scores = {}
for entity_type, entity_list in entities.items():
for entity in entity_list:
frequency = text_lower.count(entity.lower())
# Simple importance score based on frequency and length
importance = (frequency / word_count) * len(entity.split()) * 100
entity_scores[entity] = {
'type': entity_type,
'frequency': frequency,
'importance': importance
}
return entity_scores
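# Worked example of the score above: an entity that appears 5 times in a
# 1,000-word document and has a 2-word name scores (5 / 1000) * 2 * 100 = 1.0.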
def extract_relationships(self, entities, text):
"""Extract relationships between entities"""
relationships = []
text_sentences = [sent.strip() for sent in text.split('.') if sent.strip()]
all_entities = []
for entity_type, entity_list in entities.items():
all_entities.extend([(entity, entity_type) for entity in entity_list])
# Find co-occurrence relationships
for sentence in text_sentences:
sentence_lower = sentence.lower()
sentence_entities = []
for entity, entity_type in all_entities:
if entity.lower() in sentence_lower:
sentence_entities.append((entity, entity_type))
# Create relationships between entities in the same sentence
for i, (entity1, type1) in enumerate(sentence_entities):
for entity2, type2 in sentence_entities[i+1:]:
if entity1 != entity2:
# Determine relationship type based on context
rel_type = self.determine_relationship_type(entity1, entity2, type1, type2, sentence)
strength = max(0.1, 1.0 - len(sentence.split()) / 50.0)  # shorter sentences = stronger relationships
relationships.append({
'source': entity1,
'target': entity2,
'type': rel_type,
'strength': strength,
'context': sentence[:100] + '...' if len(sentence) > 100 else sentence
})
return relationships
def determine_relationship_type(self, entity1, entity2, type1, type2, context):
"""Determine relationship type based on entity types and context"""
context_lower = context.lower()
# Define relationship patterns
if 'work' in context_lower or 'employ' in context_lower:
return 'WORKS_FOR'
elif 'own' in context_lower or 'belong' in context_lower:
return 'OWNS'
elif 'located' in context_lower or 'based' in context_lower:
return 'LOCATED_IN'
elif 'partner' in context_lower or 'collaborate' in context_lower:
return 'PARTNERS_WITH'
elif 'manage' in context_lower or 'lead' in context_lower:
return 'MANAGES'
elif type1 == 'PERSON' and type2 == 'ORGANIZATION':
return 'AFFILIATED_WITH'
elif type1 == 'ORGANIZATION' and type2 == 'LOCATION':
return 'OPERATES_IN'
else:
return 'RELATED_TO'
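# Example: in "Tim Cook serves as the CEO of Apple Inc", none of the keywords above
# match, so the PERSON/ORGANIZATION pair ('Tim Cook', 'Apple Inc') falls through
# to AFFILIATED_WITH.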
def create_knowledge_graph(entities, relationships):
"""Create a NetworkX graph from entities and relationships"""
G = nx.Graph()
# Add nodes (entities)
for entity_type, entity_list in entities.items():
for entity in entity_list:
G.add_node(entity, type=entity_type)
# Add edges (relationships)
for rel in relationships:
if rel['source'] in G.nodes and rel['target'] in G.nodes:
G.add_edge(rel['source'], rel['target'],
type=rel['type'],
strength=rel['strength'],
context=rel.get('context', ''))
return G
def visualize_knowledge_graph(G, entity_scores):
"""Create an interactive visualization of the knowledge graph using Plotly"""
if len(G.nodes) == 0:
return None
# Calculate layout
pos = nx.spring_layout(G, k=3, iterations=50)
# Create edge traces
edge_x = []
edge_y = []
edge_info = []
for edge in G.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
edge_data = G.edges[edge]
edge_info.append(f"{edge[0]} → {edge[1]}<br>Type: {edge_data.get('type', 'RELATED_TO')}<br>Strength: {edge_data.get('strength', 0):.2f}")
edge_trace = go.Scatter(x=edge_x, y=edge_y,
line=dict(width=1, color='#888'),
hoverinfo='none',
mode='lines')
# Create node traces
node_x = []
node_y = []
node_text = []
node_info = []
node_color = []
node_size = []
# Color map for different entity types
color_map = {
'PERSON': '#FF6B6B',
'ORGANIZATION': '#4ECDC4',
'LOCATION': '#45B7D1',
'DATE': '#96CEB4',
'CONCEPT': '#FFEAA7'
}
for node in G.nodes():
x, y = pos[node]
node_x.append(x)
node_y.append(y)
node_data = G.nodes[node]
entity_type = node_data.get('type', 'CONCEPT')
# Get entity score information
score_info = entity_scores.get(node, {'frequency': 0, 'importance': 0})
node_text.append(node)
node_info.append(f"Entity: {node}<br>Type: {entity_type}<br>Frequency: {score_info['frequency']}<br>Importance: {score_info['importance']:.2f}")
node_color.append(color_map.get(entity_type, '#FFEAA7'))
# Size based on importance (with minimum size)
size = max(10, min(50, score_info['importance'] * 5))
node_size.append(size)
node_trace = go.Scatter(x=node_x, y=node_y,
mode='markers+text',
hoverinfo='text',
text=node_text,
textposition="middle center",
hovertext=node_info,
marker=dict(size=node_size,
color=node_color,
line=dict(width=2, color='#000')))
# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title=dict(text='Knowledge Graph', font=dict(size=16)),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
height=600))
return fig
# Streamlit App
st.set_page_config(page_title="Document Knowledge Graph Generator", layout="wide")
st.title("π Document Analysis & Knowledge Graph Generator")
st.markdown("Upload documents to automatically extract entities and generate dynamic knowledge graphs")
# Initialize analyzer
analyzer = DocumentAnalyzer()
# Sidebar for document upload and management
st.sidebar.header("📁 Document Management")
uploaded_files = st.sidebar.file_uploader(
"Upload Documents",
accept_multiple_files=True,
type=['txt', 'pdf', 'docx'],
help="Supported formats: TXT, PDF, DOCX"
)
# Main content area
if uploaded_files:
# Process uploaded files
with st.spinner("Processing documents..."):
all_documents = []
for file in uploaded_files:
text_content = analyzer.extract_text_from_file(file)
if text_content:
all_documents.append({
'name': file.name,
'content': text_content,
'type': file.type,
'size': len(text_content)
})
# Save to database
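# (Streamlit reruns this script on every interaction, so the same upload can be
# inserted more than once; deduplicating by file name would be an easy guard.)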
with Session(engine) as session:
doc = Document(
name=file.name,
content=text_content,
doc_type=file.type,
word_count=len(text_content.split())
)
session.add(doc)
session.commit()
if all_documents:
# Create tabs for different views
tab1, tab2, tab3, tab4 = st.tabs(["📊 Analysis Overview", "🕸️ Knowledge Graph", "🔍 Entity Details", "📈 Statistics"])
with tab1:
st.header("Document Analysis Overview")
# Document summary
col1, col2, col3, col4 = st.columns(4)
total_docs = len(all_documents)
total_words = sum(len(doc['content'].split()) for doc in all_documents)
avg_words = total_words // total_docs if total_docs > 0 else 0
col1.metric("Total Documents", total_docs)
col2.metric("Total Words", f"{total_words:,}")
col3.metric("Average Words/Doc", f"{avg_words:,}")
col4.metric("Total Characters", f"{sum(len(doc['content']) for doc in all_documents):,}")
# Document list
st.subheader("Uploaded Documents")
doc_data = []
for doc in all_documents:
doc_data.append({
'Name': doc['name'],
'Type': doc['type'],
'Word Count': len(doc['content'].split()),
'Character Count': len(doc['content']),
'Preview': doc['content'][:100] + '...' if len(doc['content']) > 100 else doc['content']
})
st.dataframe(pd.DataFrame(doc_data), use_container_width=True)
with tab2:
st.header("Dynamic Knowledge Graph")
# Combine all document content for analysis
combined_text = " ".join([doc['content'] for doc in all_documents])
# Extract entities and relationships
with st.spinner("Extracting entities and relationships..."):
entities = analyzer.extract_entities(combined_text)
entity_scores = analyzer.calculate_entity_importance(entities, combined_text)
relationships = analyzer.extract_relationships(entities, combined_text)
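# The Entity and Relationship tables defined above are never populated in the
# original flow, so the "Database Summary" metrics in the Statistics tab would
# always read zero. A minimal persistence sketch (assumption: the combined-text
# analysis has no single source document, so document_id is left at 0):
with Session(engine) as session:
    for name, info in entity_scores.items():
        session.add(Entity(document_id=0, entity_name=name,
                           entity_type=info['type'], frequency=info['frequency'],
                           importance_score=info['importance']))
    for rel in relationships:
        session.add(Relationship(document_id=0, source_entity=rel['source'],
                                 target_entity=rel['target'],
                                 relationship_type=rel['type'], strength=rel['strength']))
    session.commit()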
# Create and visualize knowledge graph
G = create_knowledge_graph(entities, relationships)
if len(G.nodes) > 0:
# Graph controls
col1, col2 = st.columns([3, 1])
with col2:
st.subheader("Graph Controls")
# Filter by entity type
available_types = set()
for entity_list in entities.values():
if entity_list:
available_types.update([entity_scores.get(entity, {}).get('type', 'CONCEPT') for entity in entity_list])
selected_types = st.multiselect(
"Filter by Entity Type",
list(available_types),
default=list(available_types)
)
# Minimum importance threshold
min_importance = st.slider(
"Minimum Importance",
min_value=0.0,
max_value=max([score['importance'] for score in entity_scores.values()] + [1.0]),
value=0.0,
step=0.1
)
# Filter graph based on controls
filtered_entities = {}
for entity_type, entity_list in entities.items():
filtered_list = []
for entity in entity_list:
entity_info = entity_scores.get(entity, {})
if (entity_info.get('type', 'CONCEPT') in selected_types and
entity_info.get('importance', 0) >= min_importance):
filtered_list.append(entity)
if filtered_list:
filtered_entities[entity_type] = filtered_list
# Update graph
G_filtered = create_knowledge_graph(filtered_entities, relationships)
with col1:
fig = visualize_knowledge_graph(G_filtered, entity_scores)
if fig:
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No entities found matching the current filters.")
# Graph statistics
st.subheader("Graph Statistics")
stat_col1, stat_col2, stat_col3, stat_col4 = st.columns(4)
stat_col1.metric("Total Nodes", len(G.nodes))
stat_col2.metric("Total Edges", len(G.edges))
stat_col3.metric("Filtered Nodes", len(G_filtered.nodes))
stat_col4.metric("Filtered Edges", len(G_filtered.edges))
else:
st.info("No entities found in the uploaded documents. Try uploading documents with more structured content.")
with tab3:
st.header("Entity Details")
if entity_scores:
# Entity summary
entity_df = []
for entity, info in entity_scores.items():
entity_df.append({
'Entity': entity,
'Type': info['type'],
'Frequency': info['frequency'],
'Importance Score': round(info['importance'], 2)
})
entity_df = pd.DataFrame(entity_df).sort_values('Importance Score', ascending=False)
# Display top entities
st.subheader("Top Entities by Importance")
st.dataframe(entity_df.head(20), use_container_width=True)
# Entity type distribution
st.subheader("Entity Type Distribution")
type_counts = entity_df['Type'].value_counts()
fig_pie = px.pie(values=type_counts.values, names=type_counts.index, title="Distribution of Entity Types")
st.plotly_chart(fig_pie, use_container_width=True)
# Relationships table
if relationships:
st.subheader("Extracted Relationships")
rel_df = pd.DataFrame(relationships)
rel_df = rel_df.sort_values('strength', ascending=False)
st.dataframe(rel_df.head(20), use_container_width=True)
else:
st.info("No entities extracted. Please upload documents with more content.")
with tab4:
st.header("Document Statistics & Analytics")
# Word frequency analysis
all_words = []
for doc in all_documents:
cleaned = [w.lower().strip('.,!?";') for w in doc['content'].split()]
words = [w for w in cleaned if w not in analyzer.stop_words and len(w) > 2]
all_words.extend(words)
if all_words:
word_freq = Counter(all_words)
top_words = word_freq.most_common(20)
# Word frequency chart
st.subheader("Top 20 Most Frequent Words")
words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
fig_bar = px.bar(words_df, x='Word', y='Frequency', title="Word Frequency Distribution")
fig_bar.update_xaxes(tickangle=45)
st.plotly_chart(fig_bar, use_container_width=True)
# Document comparison
if len(all_documents) > 1:
st.subheader("Document Comparison")
doc_stats = []
for doc in all_documents:
words = doc['content'].split()
sentences = len([s for s in doc['content'].split('.') if s.strip()])
doc_stats.append({
'Document': doc['name'],
'Words': len(words),
'Characters': len(doc['content']),
'Sentences': sentences,
'Avg Words/Sentence': len(words) / max(sentences, 1)
})
stats_df = pd.DataFrame(doc_stats)
st.dataframe(stats_df, use_container_width=True)
# Database summary
st.subheader("Database Summary")
with Session(engine) as session:
total_docs_db = session.query(Document).count()
total_entities_db = session.query(Entity).count()
total_relationships_db = session.query(Relationship).count()
db_col1, db_col2, db_col3 = st.columns(3)
db_col1.metric("Documents in DB", total_docs_db)
db_col2.metric("Entities in DB", total_entities_db)
db_col3.metric("Relationships in DB", total_relationships_db)
else:
# Welcome screen
st.markdown("""
## Welcome to the Document Knowledge Graph Generator! 🎉
This app analyzes your documents and automatically generates knowledge graphs.
### 🚀 **Features:**
- **Multi-format Support**: Upload TXT, PDF, and DOCX files
- **Entity Extraction**: Automatically identify people, organizations, locations, dates, and concepts
- **Relationship Discovery**: Find connections between entities
- **Interactive Visualization**: Explore your knowledge graph with interactive controls
- **Importance Scoring**: Entities are sized and ranked by importance
- **Database Storage**: All analysis is saved for future reference
### 🎯 **How to Use:**
1. Upload one or more documents using the sidebar
2. View the analysis overview and statistics
3. Explore the interactive knowledge graph
4. Examine detailed entity and relationship information
### 💡 **Tips:**
- Upload multiple related documents for richer knowledge graphs
- Use the filter controls to focus on specific entity types
- Adjust the importance threshold to reduce clutter
- Hover over nodes and edges for detailed information
**Get started by uploading your first document!** 🚀
""")
# Sample data demonstration
st.subheader("Sample Analysis Preview")
sample_text = """
Apple Inc. is a technology company based in Cupertino, California.
Tim Cook has served as the CEO of Apple since 2011. The company was founded by Steve Jobs,
Steve Wozniak, and Ronald Wayne in April 1976. Apple develops consumer electronics,
computer software, and online services. The company's headquarters are located in
Apple Park, which opened in April 2017.
"""
# Quick demo analysis
sample_entities = analyzer.extract_entities(sample_text)
sample_scores = analyzer.calculate_entity_importance(sample_entities, sample_text)
sample_relationships = analyzer.extract_relationships(sample_entities, sample_text)
if sample_entities:
sample_G = create_knowledge_graph(sample_entities, sample_relationships)
sample_fig = visualize_knowledge_graph(sample_G, sample_scores)
if sample_fig:
st.plotly_chart(sample_fig, use_container_width=True)
st.caption("Sample knowledge graph generated from a short text about Apple Inc.")