Document analysis app with dynamic knowledge-graph generation from document content
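Setup and run (an assumed local workflow, not part of the original app: the script is saved as app.py, and the packages named below are the standard PyPI distributions for the imports that follow):

pip install streamlit pandas networkx plotly pyvis scikit-learn
streamlit run app.py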
import streamlit as st
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
import re
from pyvis.network import Network
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit.components.v1 as components
# Page configuration must be the first Streamlit call in the script
st.set_page_config(page_title="Document Analysis & Knowledge Graph", layout="wide")

# Initialize session state
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
def extract_entities_simple(text):
    """Simple entity extraction using pattern matching"""
    entities = []
    # Capitalized words and multi-word sequences (potential proper nouns)
    proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    entities.extend(proper_nouns)
    # Acronyms (two or more consecutive capital letters)
    acronyms = re.findall(r'\b[A-Z]{2,}\b', text)
    entities.extend(acronyms)
    # Numbers followed by a unit (e.g. "2 trillion USD", "5 percent")
    numbers = re.findall(r'\b\d+(?:\.\d+)?(?:\s*(?:million|billion|trillion|thousand|percent|%|USD|EUR|km|miles))\b', text, re.IGNORECASE)
    entities.extend(numbers)
    return list(set(entities))
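# Illustrative example (added; not from the original source): on the sentence
# "Apple was founded by Steve Jobs in Cupertino", extract_entities_simple
# returns ['Apple', 'Steve Jobs', 'Cupertino'] in arbitrary order, since the
# result passes through an unordered set().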
def extract_relationships(text, entities):
    """Extract relationships between entities"""
    relationships = []
    # Common relationship patterns. Note: each (\w+) group captures a single
    # word, so multi-word entities such as "Steve Jobs" will not match here.
    patterns = [
        r'(\w+)\s+(?:is|was|are|were)\s+(?:a|an|the)?\s*(\w+)',
        r'(\w+)\s+(?:owns|has|contains|includes)\s+(\w+)',
        r'(\w+)\s+(?:works for|employed by|part of)\s+(\w+)',
        r'(\w+)\s+(?:located in|based in|from)\s+(\w+)',
        r'(\w+)\s+(?:and|with|alongside)\s+(\w+)',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for entity1, entity2 in matches:
            if entity1 in entities and entity2 in entities:
                relationships.append((entity1, entity2, 'related'))
    return relationships
def analyze_document(text):
    """Analyze document and extract key information"""
    analysis = {}
    # Basic statistics
    analysis['word_count'] = len(text.split())
    analysis['char_count'] = len(text)
    analysis['sentence_count'] = len(re.findall(r'[.!?]+', text))
    analysis['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])
    # Extract entities
    entities = extract_entities_simple(text)
    analysis['entities'] = entities[:20]  # Keep at most 20 entities
    # Extract key phrases using TF-IDF over sentences
    sentences = re.split(r'[.!?]+', text)
    if len(sentences) > 1:
        vectorizer = TfidfVectorizer(max_features=10, stop_words='english', ngram_range=(1, 2))
        try:
            vectorizer.fit_transform(sentences)
            analysis['key_phrases'] = list(vectorizer.get_feature_names_out())
        except ValueError:
            # Raised when the vocabulary is empty (e.g. only stop words)
            analysis['key_phrases'] = []
    else:
        analysis['key_phrases'] = []
    # Word frequency, excluding a small built-in stop-word list
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    stop_words = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'use', 'man', 'new',
        'now', 'way', 'may', 'say', 'each', 'which', 'their', 'time', 'will',
        'about', 'would', 'there', 'could', 'other', 'after', 'first', 'well',
        'water', 'been', 'call', 'who', 'oil', 'its', 'did', 'down', 'only',
        'into', 'over', 'think', 'also', 'your', 'work', 'life', 'still',
        'should', 'being', 'made', 'before', 'here', 'through', 'when',
        'where', 'how', 'what', 'does', 'then', 'them', 'these', 'they',
        'this', 'that', 'with', 'have', 'from', 'know', 'want', 'good',
        'much', 'some', 'very', 'come', 'just', 'like', 'long', 'make',
        'many', 'such', 'take', 'than', 'were'
    }
    filtered_words = [word for word in words if word not in stop_words and len(word) > 3]
    analysis['word_frequency'] = dict(Counter(filtered_words).most_common(10))
    return analysis
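# Illustrative example (added; not from the original source):
# analyze_document("Apple Inc. is based in Cupertino. Tim Cook leads Apple.")
# returns a dict with word_count, char_count, sentence_count, entities
# (here including 'Apple Inc', 'Cupertino', 'Tim Cook'), key_phrases, and
# word_frequency. Note the naive [.!?]+ counter treats the period in "Inc."
# as a sentence boundary, so sentence_count is 3 rather than 2.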
def create_knowledge_graph(documents_data):
    """Create a knowledge graph from analyzed documents"""
    G = nx.Graph()
    all_entities = set()
    all_relationships = []
    for doc_data in documents_data:
        doc_entities = doc_data['analysis']['entities']
        all_entities.update(doc_entities)
        # Connect every pair of entities that co-occur in the same document
        for i, entity1 in enumerate(doc_entities):
            for entity2 in doc_entities[i + 1:]:
                if entity1 != entity2:
                    all_relationships.append((entity1, entity2, 'co-occurs'))
    # Add nodes
    for entity in all_entities:
        G.add_node(entity)
    # Add edges, accumulating weight for repeated co-occurrences
    for entity1, entity2, relation in all_relationships:
        if G.has_edge(entity1, entity2):
            G[entity1][entity2]['weight'] += 1
        else:
            G.add_edge(entity1, entity2, weight=1, relation=relation)
    return G
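# Illustrative example (added; not from the original source): if two uploaded
# documents both list "Apple" and "Cupertino" among their entities, the
# ("Apple", "Cupertino") edge ends up with weight 2, one increment per
# document in which the pair co-occurs.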
def visualize_knowledge_graph(G, layout='spring'):
    """Visualize knowledge graph using Plotly"""
    if len(G.nodes()) == 0:
        return None
    # Calculate layout
    if layout == 'spring':
        pos = nx.spring_layout(G, k=3, iterations=50)
    elif layout == 'circular':
        pos = nx.circular_layout(G)
    elif layout == 'kamada_kawai':
        pos = nx.kamada_kawai_layout(G)
    else:
        pos = nx.spring_layout(G)
    # Extract node and edge information
    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]
    node_text = list(G.nodes())
    node_size = [G.degree(node) * 10 + 10 for node in G.nodes()]
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # None entries separate segments so Plotly draws disconnected edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    # Create edge trace
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    # Create node trace
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        textposition="middle center",
        hovertext=[f"{node}<br>Connections: {G.degree(node)}" for node in G.nodes()],
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_size,
            size=node_size,
            colorbar=dict(
                thickness=15,
                len=0.5,
                x=1.1,
                title="Node Connections"
            ),
            line=dict(width=2)
        )
    )
    # Create figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title=dict(text='Knowledge Graph', font=dict(size=16)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            annotations=[dict(
                text="Hover over nodes to see connections",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.005, y=-0.002,
                xanchor="left", yanchor="bottom",
                font=dict(color="#888", size=12)
            )],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            height=600
        )
    )
    return fig
def create_pyvis_graph(G):
    """Create an interactive graph using PyVis; returns the path to an HTML file"""
    net = Network(height="600px", width="100%", bgcolor="#222222", font_color="white")
    # Add nodes
    for node in G.nodes():
        net.add_node(node, label=node, title=f"Connections: {G.degree(node)}")
    # Add edges
    for edge in G.edges():
        weight = G[edge[0]][edge[1]].get('weight', 1)
        net.add_edge(edge[0], edge[1], width=weight, title=f"Weight: {weight}")
    # Configure physics
    net.set_options("""
    var options = {
        "physics": {
            "enabled": true,
            "stabilization": {"iterations": 100}
        }
    }
    """)
    # Save to a temporary file; close the handle first so the write succeeds
    # on platforms that lock open files (e.g. Windows)
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
    tmp_file.close()
    net.save_graph(tmp_file.name)
    return tmp_file.name
# Main app
st.title("π Document Analysis & Knowledge Graph Generator")
# Sidebar
with st.sidebar:
    st.header("Upload Documents")
    uploaded_files = st.file_uploader(
        "Choose text files",
        type=['txt'],
        accept_multiple_files=True
    )
    if uploaded_files:
        for uploaded_file in uploaded_files:
            # Compare by file name: UploadedFile objects are re-created on
            # each rerun, so identity comparison would re-add the same document
            if uploaded_file.name not in [doc['name'] for doc in st.session_state.documents]:
                content = uploaded_file.read().decode('utf-8')
                analysis = analyze_document(content)
                st.session_state.documents.append({
                    'file': uploaded_file,
                    'name': uploaded_file.name,
                    'content': content,
                    'analysis': analysis
                })
st.header("Sample Documents")
if st.button("Add Sample Document 1"):
sample_doc1 = """
Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
Apple designs, develops, and sells consumer electronics, computer software, and online services.
The company was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976.
Apple is known for its iPhone, iPad, Mac computers, and Apple Watch products.
Tim Cook is the current CEO of Apple, succeeding Steve Jobs in 2011.
The company has a market capitalization of over 2 trillion USD.
"""
analysis = analyze_document(sample_doc1)
st.session_state.documents.append({
'file': None,
'name': 'Sample: Apple Inc.',
'content': sample_doc1,
'analysis': analysis
})
if st.button("Add Sample Document 2"):
sample_doc2 = """
Microsoft Corporation is an American multinational technology corporation with headquarters in Redmond, Washington.
Microsoft develops, manufactures, licenses, supports, and sells computer software, consumer electronics, personal computers, and related services.
The company was founded by Bill Gates and Paul Allen on April 4, 1975.
Microsoft is best known for its Windows operating systems, Microsoft Office suite, and Xbox gaming console.
Satya Nadella is the current CEO of Microsoft. The company has a strong presence in cloud computing with Azure.
"""
analysis = analyze_document(sample_doc2)
st.session_state.documents.append({
'file': None,
'name': 'Sample: Microsoft Corp.',
'content': sample_doc2,
'analysis': analysis
})
if st.button("Clear All Documents"):
st.session_state.documents = []
st.session_state.knowledge_graph = None
# Main content
if not st.session_state.documents:
    st.info("📂 Please upload documents or add sample documents from the sidebar to begin analysis.")
else:
    # Generate knowledge graph
    if st.session_state.knowledge_graph is None or st.button("🔄 Regenerate Knowledge Graph"):
        st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)
    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📄 Document Analysis", "🕸️ Knowledge Graph", "📊 Statistics", "🔍 Document Explorer"])
    with tab1:
        st.header("Document Analysis Overview")
        # Summary statistics
        col1, col2, col3, col4 = st.columns(4)
        total_words = sum(doc['analysis']['word_count'] for doc in st.session_state.documents)
        total_entities = len({entity for doc in st.session_state.documents for entity in doc['analysis']['entities']})
        col1.metric("Total Documents", len(st.session_state.documents))
        col2.metric("Total Words", total_words)
        col3.metric("Total Entities", total_entities)
        col4.metric("Graph Nodes", len(st.session_state.knowledge_graph.nodes()) if st.session_state.knowledge_graph else 0)
        # Document details
        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['name']}"):
                col1, col2 = st.columns(2)
                with col1:
                    st.subheader("Statistics")
                    st.write(f"**Words:** {doc['analysis']['word_count']}")
                    st.write(f"**Characters:** {doc['analysis']['char_count']}")
                    st.write(f"**Sentences:** {doc['analysis']['sentence_count']}")
                    st.write(f"**Paragraphs:** {doc['analysis']['paragraph_count']}")
                with col2:
                    st.subheader("Entities")
                    st.write(doc['analysis']['entities'])
                    st.subheader("Key Phrases")
                    st.write(doc['analysis']['key_phrases'])
                st.subheader("Word Frequency")
                if doc['analysis']['word_frequency']:
                    freq_df = pd.DataFrame(
                        list(doc['analysis']['word_frequency'].items()),
                        columns=['Word', 'Frequency']
                    )
                    st.bar_chart(freq_df.set_index('Word'))
    with tab2:
        st.header("Knowledge Graph Visualization")
        if st.session_state.knowledge_graph and len(st.session_state.knowledge_graph.nodes()) > 0:
            # Layout selection
            layout = st.selectbox("Select Layout", ["spring", "circular", "kamada_kawai"])
            # Plotly visualization
            fig = visualize_knowledge_graph(st.session_state.knowledge_graph, layout)
            if fig:
                st.plotly_chart(fig, use_container_width=True)
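            # create_pyvis_graph (defined above) is never called elsewhere in
            # the original app; the following optional hook is an added sketch
            # showing one way to embed its HTML output with components.html
            if st.checkbox("Show physics-based PyVis view"):
                html_path = create_pyvis_graph(st.session_state.knowledge_graph)
                with open(html_path, 'r', encoding='utf-8') as f:
                    components.html(f.read(), height=620)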
            # Graph statistics
            st.subheader("Graph Statistics")
            col1, col2, col3 = st.columns(3)
            col1.metric("Nodes", len(st.session_state.knowledge_graph.nodes()))
            col2.metric("Edges", len(st.session_state.knowledge_graph.edges()))
            avg_degree = sum(dict(st.session_state.knowledge_graph.degree()).values()) / len(st.session_state.knowledge_graph.nodes())
            col3.metric("Avg Connections", f"{avg_degree:.2f}")
            # Top connected entities
            st.subheader("Most Connected Entities")
            degree_centrality = nx.degree_centrality(st.session_state.knowledge_graph)
            top_entities = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
            entities_df = pd.DataFrame(top_entities, columns=['Entity', 'Centrality'])
            st.table(entities_df)
        else:
            st.warning("No entities found to create knowledge graph. Try uploading more detailed documents.")
    with tab3:
        st.header("Statistical Analysis")
        # Document comparison
        if len(st.session_state.documents) > 1:
            st.subheader("Document Comparison")
            comparison_data = []
            for doc in st.session_state.documents:
                comparison_data.append({
                    'Document': doc['name'],
                    'Words': doc['analysis']['word_count'],
                    'Entities': len(doc['analysis']['entities']),
                    'Key Phrases': len(doc['analysis']['key_phrases']),
                    'Sentences': doc['analysis']['sentence_count']
                })
            comparison_df = pd.DataFrame(comparison_data)
            st.table(comparison_df)
            # Visualization
            fig = px.bar(comparison_df, x='Document', y='Words', title='Word Count by Document')
            st.plotly_chart(fig, use_container_width=True)
        # Entity frequency across documents
        st.subheader("Entity Frequency Across Documents")
        all_entities = []
        for doc in st.session_state.documents:
            all_entities.extend(doc['analysis']['entities'])
        if all_entities:
            entity_freq = Counter(all_entities)
            top_entities = dict(entity_freq.most_common(15))
            entities_df = pd.DataFrame(list(top_entities.items()), columns=['Entity', 'Frequency'])
            fig = px.bar(entities_df, x='Entity', y='Frequency', title='Most Frequent Entities')
            fig.update_xaxes(tickangle=45)
            st.plotly_chart(fig, use_container_width=True)
    with tab4:
        st.header("Document Explorer")
        selected_doc = st.selectbox(
            "Select a document to explore",
            options=range(len(st.session_state.documents)),
            format_func=lambda x: st.session_state.documents[x]['name']
        )
        doc = st.session_state.documents[selected_doc]
        st.subheader(f"Content: {doc['name']}")
        st.text_area("Document Content", doc['content'], height=300)
        # Search in document
        search_term = st.text_input("Search in document")
        if search_term:
            if search_term.lower() in doc['content'].lower():
                # Bold every match; re.sub with IGNORECASE keeps the
                # highlighting consistent with the case-insensitive check above
                highlighted_content = re.sub(
                    re.escape(search_term),
                    lambda m: f"**{m.group(0)}**",
                    doc['content'],
                    flags=re.IGNORECASE
                )
                st.markdown("**Search Results:**")
                st.markdown(highlighted_content)
            else:
                st.warning("Search term not found in document.")
# Footer
st.markdown("---")
st.markdown("Built with Streamlit • Document Analysis & Knowledge Graph Generator")