"""Document analysis app: dynamic knowledge graph generation based on document information."""
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import re
from collections import Counter, defaultdict
import json
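# Note: TextBlob's POS tagging and sentence splitting depend on NLTK corpora.
# If they are not installed yet, they can be downloaded once with:
#   python -m textblob.download_corpora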
st.set_page_config(page_title="Document Knowledge Graph", layout="wide")
# Initialize session state
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
def extract_entities_from_text(text):
    """Extract entities from text using simple NLP techniques"""
    blob = TextBlob(text)

    # Extract nouns as potential entities
    entities = []
    for word, tag in blob.tags:
        if tag in ['NN', 'NNP', 'NNS', 'NNPS'] and len(word) > 2:
            entities.append(word.lower())

    # Extract named entities (basic approach)
    capitalized_words = []
    for sentence in blob.sentences:
        for word in sentence.words:
            if word[0].isupper() and len(word) > 2:
                capitalized_words.append(word.lower())

    # Combine and filter entities
    all_entities = entities + capitalized_words
    entity_counts = Counter(all_entities)

    # Return entities that appear more than once or are capitalized
    significant_entities = [entity for entity, count in entity_counts.items()
                            if count > 1 or entity in capitalized_words]
    return list(set(significant_entities))
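# Note: with the heuristic above, a noun survives the filter only if it occurs more
# than once in the text or appears capitalized somewhere (e.g. "Python"), so very
# short inputs may yield few entities.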
def extract_relationships(text, entities):
    """Extract relationships between entities based on co-occurrence"""
    relationships = []
    sentences = TextBlob(text).sentences

    for sentence in sentences:
        sentence_text = str(sentence).lower()
        entities_in_sentence = [entity for entity in entities if entity in sentence_text]

        # Create relationships between entities that appear in the same sentence
        for i in range(len(entities_in_sentence)):
            for j in range(i + 1, len(entities_in_sentence)):
                relationships.append({
                    'source': entities_in_sentence[i],
                    'target': entities_in_sentence[j],
                    'relationship': 'co_occurs_with',
                    'sentence': str(sentence)
                })
    return relationships
def create_knowledge_graph(documents):
    """Create a knowledge graph from documents"""
    G = nx.Graph()
    all_entities = set()
    all_relationships = []

    for doc in documents:
        # Extract entities
        entities = extract_entities_from_text(doc['content'])
        all_entities.update(entities)

        # Extract relationships
        relationships = extract_relationships(doc['content'], entities)
        all_relationships.extend(relationships)

        # Add document metadata
        for entity in entities:
            if G.has_node(entity):
                G.nodes[entity]['documents'].append(doc['title'])
                G.nodes[entity]['frequency'] += doc['content'].lower().count(entity)
            else:
                G.add_node(entity,
                           documents=[doc['title']],
                           frequency=doc['content'].lower().count(entity),
                           type='entity')

    # Add relationships as edges
    relationship_counts = defaultdict(int)
    for rel in all_relationships:
        # Normalize the pair so (a, b) and (b, a) count as the same undirected edge
        key = tuple(sorted((rel['source'], rel['target'])))
        relationship_counts[key] += 1

    for (source, target), weight in relationship_counts.items():
        if source in G.nodes and target in G.nodes:
            G.add_edge(source, target, weight=weight, relationship='co_occurs_with')
    return G
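# The resulting graph is undirected; each edge's 'weight' records how many sentences
# (across all documents) mention both endpoint entities together.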
def visualize_knowledge_graph(G):
    """Create an interactive visualization of the knowledge graph"""
    if len(G.nodes()) == 0:
        return None

    # Get node positions using spring layout
    pos = nx.spring_layout(G, k=3, iterations=50)

    # Extract node information
    node_x = []
    node_y = []
    node_text = []
    node_size = []
    node_color = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)

        # Node info
        frequency = G.nodes[node].get('frequency', 1)
        documents = G.nodes[node].get('documents', [])
        node_text.append(f"{node}<br>Frequency: {frequency}<br>Documents: {', '.join(documents[:3])}")
        node_size.append(max(10, frequency * 2))
        node_color.append(frequency)
    # Extract edge information
    edge_x = []
    edge_y = []
    edge_info = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # None entries break the line trace between consecutive edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        weight = G.edges[edge].get('weight', 1)
        edge_info.append(f"{edge[0]} - {edge[1]}: {weight} co-occurrences")

    # Create edge trace
    edge_trace = go.Scatter(x=edge_x, y=edge_y,
                            line=dict(width=0.5, color='#888'),
                            hoverinfo='none',
                            mode='lines')
    # Create node trace
    node_trace = go.Scatter(x=node_x, y=node_y,
                            mode='markers+text',
                            hoverinfo='text',
                            text=[node for node in G.nodes()],
                            textposition="middle center",
                            hovertext=node_text,
                            marker=dict(showscale=True,
                                        colorscale='YlOrRd',
                                        reversescale=True,
                                        color=node_color,
                                        size=node_size,
                                        colorbar=dict(thickness=15,
                                                      xanchor="left",
                                                      title=dict(text="Entity Frequency",
                                                                 side="right")),
                                        line=dict(width=2)))
    # Create figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=dict(text='Document Knowledge Graph', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Knowledge graph showing entity relationships from documents",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        height=600))
    return fig
def analyze_document_stats(documents):
    """Analyze document statistics"""
    if not documents:
        return {}

    stats = {
        'total_documents': len(documents),
        'total_words': sum(len(doc['content'].split()) for doc in documents),
        'avg_words_per_doc': sum(len(doc['content'].split()) for doc in documents) / len(documents),
        'total_entities': 0,
        'unique_entities': set(),
        'most_common_entities': []
    }

    all_entities = []
    for doc in documents:
        entities = extract_entities_from_text(doc['content'])
        all_entities.extend(entities)
        stats['unique_entities'].update(entities)

    stats['total_entities'] = len(all_entities)
    stats['unique_entities'] = len(stats['unique_entities'])
    stats['most_common_entities'] = Counter(all_entities).most_common(10)
    return stats
# Main app interface
st.title("📄 Document Analysis & Knowledge Graph Generator")
st.markdown("Upload and analyze documents to automatically generate dynamic knowledge graphs showing entity relationships.")
# Sidebar for document management
with st.sidebar:
    st.header("Document Management")

    # Add document section
    st.subheader("Add New Document")
    doc_title = st.text_input("Document Title")
    doc_content = st.text_area("Document Content", height=200)

    if st.button("Add Document", type="primary"):
        if doc_title and doc_content:
            new_doc = {
                'title': doc_title,
                'content': doc_content,
                'word_count': len(doc_content.split())
            }
            st.session_state.documents.append(new_doc)
            st.success(f"Added document: {doc_title}")
            st.rerun()
        else:
            st.error("Please provide both title and content")
    # Sample documents
    st.subheader("Load Sample Documents")
    if st.button("Load AI & Technology Sample"):
        sample_docs = [
            {
                'title': 'Artificial Intelligence Overview',
                'content': '''Artificial intelligence represents a revolutionary technology that enables machines to simulate human intelligence. Machine learning algorithms process vast amounts of data to identify patterns and make predictions. Deep learning networks use neural networks with multiple layers to analyze complex data structures. Natural language processing allows computers to understand and generate human language. Computer vision enables machines to interpret visual information from images and videos. These technologies are transforming industries including healthcare, finance, automotive, and retail.'''
            },
            {
                'title': 'Machine Learning Applications',
                'content': '''Machine learning applications span numerous domains in modern technology. Healthcare systems use predictive algorithms to diagnose diseases and recommend treatments. Financial institutions employ fraud detection systems that analyze transaction patterns. Autonomous vehicles rely on computer vision and sensor fusion for navigation. Recommendation systems in e-commerce platforms suggest products based on user behavior. Natural language processing powers chatbots and virtual assistants. Data scientists develop models using Python, TensorFlow, and scikit-learn frameworks.'''
            },
            {
                'title': 'Future of Technology',
                'content': '''The future of technology promises unprecedented innovation across multiple sectors. Quantum computing will revolutionize data processing capabilities and cryptography. Biotechnology advances will enable personalized medicine and gene therapy. Renewable energy technologies including solar panels and wind turbines will transform power generation. Internet of Things devices will create smart cities with interconnected infrastructure. Blockchain technology will secure digital transactions and enable decentralized systems. Robotics and automation will reshape manufacturing and service industries.'''
            }
        ]
        # Compute word counts from the content instead of hardcoding them
        for doc in sample_docs:
            doc['word_count'] = len(doc['content'].split())
        st.session_state.documents.extend(sample_docs)
        st.success("Loaded sample documents!")
        st.rerun()
    # Clear documents
    if st.button("Clear All Documents", type="secondary"):
        st.session_state.documents = []
        st.session_state.knowledge_graph = None
        st.success("Cleared all documents!")
        st.rerun()
# Main content area
if st.session_state.documents:
    # Generate knowledge graph
    if st.button("🔄 Regenerate Knowledge Graph", type="primary"):
        with st.spinner("Generating knowledge graph..."):
            st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)
        st.success("Knowledge graph generated!")

    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📊 Knowledge Graph", "📈 Document Statistics", "📋 Document List", "🔍 Entity Analysis"])
    with tab1:
        st.header("Interactive Knowledge Graph")

        if st.session_state.knowledge_graph is None:
            st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)

        G = st.session_state.knowledge_graph

        if G and len(G.nodes()) > 0:
            # Graph metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Entities", len(G.nodes()))
            with col2:
                st.metric("Relationships", len(G.edges()))
            with col3:
                density = nx.density(G)
                st.metric("Graph Density", f"{density:.3f}")
            with col4:
                if len(G.nodes()) > 0:
                    avg_degree = sum(dict(G.degree()).values()) / len(G.nodes())
                    st.metric("Avg Connections", f"{avg_degree:.1f}")

            # Visualize graph
            fig = visualize_knowledge_graph(G)
            if fig:
                st.plotly_chart(fig, use_container_width=True)

            # Graph analysis
            st.subheader("Graph Analysis")
            if len(G.nodes()) > 0:
                # Most connected entities
                degree_centrality = nx.degree_centrality(G)
                most_connected = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                df_connected = pd.DataFrame(most_connected, columns=['Entity', 'Centrality Score'])
                st.write("**Most Connected Entities:**")
                st.dataframe(df_connected, use_container_width=True)
        else:
            st.info("No entities found in documents. Try adding more detailed content.")
    with tab2:
        st.header("Document Statistics")

        stats = analyze_document_stats(st.session_state.documents)

        # Overview metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Documents", stats['total_documents'])
        with col2:
            st.metric("Total Words", stats['total_words'])
        with col3:
            st.metric("Avg Words/Doc", f"{stats['avg_words_per_doc']:.0f}")
        with col4:
            st.metric("Unique Entities", stats['unique_entities'])

        # Most common entities chart
        if stats['most_common_entities']:
            st.subheader("Most Common Entities")
            entities_df = pd.DataFrame(stats['most_common_entities'], columns=['Entity', 'Frequency'])
            fig_bar = px.bar(entities_df, x='Entity', y='Frequency',
                             title="Entity Frequency Distribution")
            st.plotly_chart(fig_bar, use_container_width=True)

        # Document word count distribution
        word_counts = [doc['word_count'] for doc in st.session_state.documents]
        fig_hist = px.histogram(x=word_counts, title="Document Length Distribution",
                                labels={'x': 'Word Count', 'y': 'Number of Documents'})
        st.plotly_chart(fig_hist, use_container_width=True)
    with tab3:
        st.header("Document Collection")

        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['title']} ({doc['word_count']} words)"):
                st.write(doc['content'])
                if st.button("Remove Document", key=f"remove_{i}"):
                    st.session_state.documents.pop(i)
                    st.session_state.knowledge_graph = None
                    st.rerun()
    with tab4:
        st.header("Entity Analysis")

        if st.session_state.knowledge_graph:
            G = st.session_state.knowledge_graph

            # Entity details table
            entity_data = []
            for node in G.nodes():
                node_data = G.nodes[node]
                entity_data.append({
                    'Entity': node,
                    'Frequency': node_data.get('frequency', 0),
                    'Documents': ', '.join(node_data.get('documents', [])),
                    'Connections': G.degree(node)
                })

            if entity_data:
                df_entities = pd.DataFrame(entity_data)
                st.dataframe(df_entities, use_container_width=True)

                # Entity network details
                st.subheader("Entity Relationships")
                selected_entity = st.selectbox("Select an entity to view its connections:",
                                               options=list(G.nodes()))
                if selected_entity:
                    neighbors = list(G.neighbors(selected_entity))
                    if neighbors:
                        st.write(f"**{selected_entity}** is connected to:")
                        for neighbor in neighbors:
                            edge_data = G.edges[selected_entity, neighbor]
                            weight = edge_data.get('weight', 1)
                            st.write(f"- {neighbor} (co-occurs {weight} times)")
                    else:
                        st.write(f"**{selected_entity}** has no connections.")
        else:
            st.info("Generate a knowledge graph first to see entity analysis.")
else:
    st.info("👈 Add documents using the sidebar to get started!")

    st.markdown("""
    ### How to use this app:
    1. **Add documents** using the sidebar, either manually or by loading the samples
    2. **Generate the knowledge graph** to visualize entity relationships
    3. **Explore the different tabs** to analyze your documents
    4. **Interactive visualization** shows how entities are connected

    ### Features:
    - **Entity extraction** from document text
    - **Relationship mapping** based on co-occurrence
    - **Interactive graph visualization** with Plotly
    - **Document statistics** and analysis
    - **Entity frequency** and connection analysis
    """)