"""Document analysis app: dynamic knowledge graph generation based on document information."""
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import re
from collections import Counter, defaultdict
import json
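# Note: TextBlob's POS tagging and sentence splitting depend on NLTK corpora.
# If they are not installed yet, they can be downloaded once with:
#   python -m textblob.download_corpora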
st.set_page_config(page_title="Document Knowledge Graph", layout="wide")
# Initialize session state
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
def extract_entities_from_text(text):
    """Extract entities from text using simple NLP techniques"""
    blob = TextBlob(text)

    # Extract nouns as potential entities
    entities = []
    for word, tag in blob.tags:
        if tag in ['NN', 'NNP', 'NNS', 'NNPS'] and len(word) > 2:
            entities.append(word.lower())

    # Extract named entities (basic approach)
    capitalized_words = []
    for sentence in blob.sentences:
        for word in sentence.words:
            if word[0].isupper() and len(word) > 2:
                capitalized_words.append(word.lower())

    # Combine and filter entities
    all_entities = entities + capitalized_words
    entity_counts = Counter(all_entities)

    # Return entities that appear more than once or are capitalized
    significant_entities = [entity for entity, count in entity_counts.items()
                            if count > 1 or entity in capitalized_words]
    return list(set(significant_entities))
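# Note: with the heuristic above, a noun survives the filter only if it occurs more
# than once in the text or appears capitalized somewhere (e.g. "Python"), so very
# short inputs may yield few entities.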
def extract_relationships(text, entities):
    """Extract relationships between entities based on co-occurrence"""
    relationships = []
    sentences = TextBlob(text).sentences

    for sentence in sentences:
        sentence_text = str(sentence).lower()
        entities_in_sentence = [entity for entity in entities if entity in sentence_text]

        # Create relationships between entities that appear in the same sentence
        for i in range(len(entities_in_sentence)):
            for j in range(i + 1, len(entities_in_sentence)):
                relationships.append({
                    'source': entities_in_sentence[i],
                    'target': entities_in_sentence[j],
                    'relationship': 'co_occurs_with',
                    'sentence': str(sentence)
                })
    return relationships
def create_knowledge_graph(documents):
    """Create a knowledge graph from documents"""
    G = nx.Graph()
    all_entities = set()
    all_relationships = []

    for doc in documents:
        # Extract entities
        entities = extract_entities_from_text(doc['content'])
        all_entities.update(entities)

        # Extract relationships
        relationships = extract_relationships(doc['content'], entities)
        all_relationships.extend(relationships)

        # Add document metadata
        for entity in entities:
            if G.has_node(entity):
                G.nodes[entity]['documents'].append(doc['title'])
                G.nodes[entity]['frequency'] += doc['content'].lower().count(entity)
            else:
                G.add_node(entity,
                           documents=[doc['title']],
                           frequency=doc['content'].lower().count(entity),
                           type='entity')

    # Add relationships as edges
    relationship_counts = defaultdict(int)
    for rel in all_relationships:
        # Normalize the pair so (a, b) and (b, a) count as the same undirected edge
        key = tuple(sorted((rel['source'], rel['target'])))
        relationship_counts[key] += 1

    for (source, target), weight in relationship_counts.items():
        if source in G.nodes and target in G.nodes:
            G.add_edge(source, target, weight=weight, relationship='co_occurs_with')
    return G
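# The resulting graph is undirected; each edge's 'weight' records how many sentences
# (across all documents) mention both endpoint entities together.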
def visualize_knowledge_graph(G):
    """Create an interactive visualization of the knowledge graph"""
    if len(G.nodes()) == 0:
        return None

    # Get node positions using spring layout
    pos = nx.spring_layout(G, k=3, iterations=50)

    # Extract node information
    node_x = []
    node_y = []
    node_text = []
    node_size = []
    node_color = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)

        # Node info
        frequency = G.nodes[node].get('frequency', 1)
        documents = G.nodes[node].get('documents', [])
        node_text.append(f"{node}<br>Frequency: {frequency}<br>Documents: {', '.join(documents[:3])}")
        node_size.append(max(10, frequency * 2))
        node_color.append(frequency)
    # Extract edge information
    edge_x = []
    edge_y = []
    edge_info = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # None entries break the line trace between consecutive edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        weight = G.edges[edge].get('weight', 1)
        edge_info.append(f"{edge[0]} - {edge[1]}: {weight} co-occurrences")

    # Create edge trace
    edge_trace = go.Scatter(x=edge_x, y=edge_y,
                            line=dict(width=0.5, color='#888'),
                            hoverinfo='none',
                            mode='lines')
    # Create node trace
    node_trace = go.Scatter(x=node_x, y=node_y,
                            mode='markers+text',
                            hoverinfo='text',
                            text=[node for node in G.nodes()],
                            textposition="middle center",
                            hovertext=node_text,
                            marker=dict(showscale=True,
                                        colorscale='YlOrRd',
                                        reversescale=True,
                                        color=node_color,
                                        size=node_size,
                                        colorbar=dict(thickness=15,
                                                      xanchor="left",
                                                      title=dict(text="Entity Frequency",
                                                                 side="right")),
                                        line=dict(width=2)))
    # Create figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=dict(text='Document Knowledge Graph', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Knowledge graph showing entity relationships from documents",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        height=600))
    return fig
def analyze_document_stats(documents):
    """Analyze document statistics"""
    if not documents:
        return {}

    stats = {
        'total_documents': len(documents),
        'total_words': sum(len(doc['content'].split()) for doc in documents),
        'avg_words_per_doc': sum(len(doc['content'].split()) for doc in documents) / len(documents),
        'total_entities': 0,
        'unique_entities': set(),
        'most_common_entities': []
    }

    all_entities = []
    for doc in documents:
        entities = extract_entities_from_text(doc['content'])
        all_entities.extend(entities)
        stats['unique_entities'].update(entities)

    stats['total_entities'] = len(all_entities)
    stats['unique_entities'] = len(stats['unique_entities'])
    stats['most_common_entities'] = Counter(all_entities).most_common(10)
    return stats
# Main app interface
st.title("📄 Document Analysis & Knowledge Graph Generator")
st.markdown("Upload and analyze documents to automatically generate dynamic knowledge graphs showing entity relationships.")
# Sidebar for document management
with st.sidebar:
    st.header("Document Management")

    # Add document section
    st.subheader("Add New Document")
    doc_title = st.text_input("Document Title")
    doc_content = st.text_area("Document Content", height=200)

    if st.button("Add Document", type="primary"):
        if doc_title and doc_content:
            new_doc = {
                'title': doc_title,
                'content': doc_content,
                'word_count': len(doc_content.split())
            }
            st.session_state.documents.append(new_doc)
            st.success(f"Added document: {doc_title}")
            st.rerun()
        else:
            st.error("Please provide both title and content")
    # Sample documents
    st.subheader("Load Sample Documents")
    if st.button("Load AI & Technology Sample"):
        sample_docs = [
            {
                'title': 'Artificial Intelligence Overview',
                'content': '''Artificial intelligence represents a revolutionary technology that enables machines to simulate human intelligence. Machine learning algorithms process vast amounts of data to identify patterns and make predictions. Deep learning networks use neural networks with multiple layers to analyze complex data structures. Natural language processing allows computers to understand and generate human language. Computer vision enables machines to interpret visual information from images and videos. These technologies are transforming industries including healthcare, finance, automotive, and retail.'''
            },
            {
                'title': 'Machine Learning Applications',
                'content': '''Machine learning applications span numerous domains in modern technology. Healthcare systems use predictive algorithms to diagnose diseases and recommend treatments. Financial institutions employ fraud detection systems that analyze transaction patterns. Autonomous vehicles rely on computer vision and sensor fusion for navigation. Recommendation systems in e-commerce platforms suggest products based on user behavior. Natural language processing powers chatbots and virtual assistants. Data scientists develop models using Python, TensorFlow, and scikit-learn frameworks.'''
            },
            {
                'title': 'Future of Technology',
                'content': '''The future of technology promises unprecedented innovation across multiple sectors. Quantum computing will revolutionize data processing capabilities and cryptography. Biotechnology advances will enable personalized medicine and gene therapy. Renewable energy technologies including solar panels and wind turbines will transform power generation. Internet of Things devices will create smart cities with interconnected infrastructure. Blockchain technology will secure digital transactions and enable decentralized systems. Robotics and automation will reshape manufacturing and service industries.'''
            }
        ]
        # Compute word counts from the content instead of hardcoding them
        for doc in sample_docs:
            doc['word_count'] = len(doc['content'].split())
        st.session_state.documents.extend(sample_docs)
        st.success("Loaded sample documents!")
        st.rerun()
    # Clear documents
    if st.button("Clear All Documents", type="secondary"):
        st.session_state.documents = []
        st.session_state.knowledge_graph = None
        st.success("Cleared all documents!")
        st.rerun()
# Main content area
if st.session_state.documents:
    # Generate knowledge graph
    if st.button("🔄 Regenerate Knowledge Graph", type="primary"):
        with st.spinner("Generating knowledge graph..."):
            st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)
        st.success("Knowledge graph generated!")

    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📊 Knowledge Graph", "📈 Document Statistics", "📋 Document List", "🔍 Entity Analysis"])
    with tab1:
        st.header("Interactive Knowledge Graph")

        if st.session_state.knowledge_graph is None:
            st.session_state.knowledge_graph = create_knowledge_graph(st.session_state.documents)

        G = st.session_state.knowledge_graph

        if G and len(G.nodes()) > 0:
            # Graph metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Entities", len(G.nodes()))
            with col2:
                st.metric("Relationships", len(G.edges()))
            with col3:
                density = nx.density(G)
                st.metric("Graph Density", f"{density:.3f}")
            with col4:
                if len(G.nodes()) > 0:
                    avg_degree = sum(dict(G.degree()).values()) / len(G.nodes())
                    st.metric("Avg Connections", f"{avg_degree:.1f}")

            # Visualize graph
            fig = visualize_knowledge_graph(G)
            if fig:
                st.plotly_chart(fig, use_container_width=True)

            # Graph analysis
            st.subheader("Graph Analysis")
            if len(G.nodes()) > 0:
                # Most connected entities
                degree_centrality = nx.degree_centrality(G)
                most_connected = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                df_connected = pd.DataFrame(most_connected, columns=['Entity', 'Centrality Score'])
                st.write("**Most Connected Entities:**")
                st.dataframe(df_connected, use_container_width=True)
        else:
            st.info("No entities found in documents. Try adding more detailed content.")
    with tab2:
        st.header("Document Statistics")

        stats = analyze_document_stats(st.session_state.documents)

        # Overview metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Documents", stats['total_documents'])
        with col2:
            st.metric("Total Words", stats['total_words'])
        with col3:
            st.metric("Avg Words/Doc", f"{stats['avg_words_per_doc']:.0f}")
        with col4:
            st.metric("Unique Entities", stats['unique_entities'])

        # Most common entities chart
        if stats['most_common_entities']:
            st.subheader("Most Common Entities")
            entities_df = pd.DataFrame(stats['most_common_entities'], columns=['Entity', 'Frequency'])
            fig_bar = px.bar(entities_df, x='Entity', y='Frequency',
                             title="Entity Frequency Distribution")
            st.plotly_chart(fig_bar, use_container_width=True)

        # Document word count distribution
        word_counts = [doc['word_count'] for doc in st.session_state.documents]
        fig_hist = px.histogram(x=word_counts, title="Document Length Distribution",
                                labels={'x': 'Word Count', 'y': 'Number of Documents'})
        st.plotly_chart(fig_hist, use_container_width=True)
    with tab3:
        st.header("Document Collection")

        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['title']} ({doc['word_count']} words)"):
                st.write(doc['content'])
                if st.button("Remove Document", key=f"remove_{i}"):
                    st.session_state.documents.pop(i)
                    st.session_state.knowledge_graph = None
                    st.rerun()
    with tab4:
        st.header("Entity Analysis")

        if st.session_state.knowledge_graph:
            G = st.session_state.knowledge_graph

            # Entity details table
            entity_data = []
            for node in G.nodes():
                node_data = G.nodes[node]
                entity_data.append({
                    'Entity': node,
                    'Frequency': node_data.get('frequency', 0),
                    'Documents': ', '.join(node_data.get('documents', [])),
                    'Connections': G.degree(node)
                })

            if entity_data:
                df_entities = pd.DataFrame(entity_data)
                st.dataframe(df_entities, use_container_width=True)

                # Entity network details
                st.subheader("Entity Relationships")
                selected_entity = st.selectbox("Select an entity to view its connections:",
                                               options=list(G.nodes()))
                if selected_entity:
                    neighbors = list(G.neighbors(selected_entity))
                    if neighbors:
                        st.write(f"**{selected_entity}** is connected to:")
                        for neighbor in neighbors:
                            edge_data = G.edges[selected_entity, neighbor]
                            weight = edge_data.get('weight', 1)
                            st.write(f"- {neighbor} (co-occurs {weight} times)")
                    else:
                        st.write(f"**{selected_entity}** has no connections.")
        else:
            st.info("Generate a knowledge graph first to see entity analysis.")
else:
    st.info("👈 Add documents using the sidebar to get started!")

    st.markdown("""
    ### How to use this app:
    1. **Add documents** using the sidebar, either manually or by loading the samples
    2. **Generate the knowledge graph** to visualize entity relationships
    3. **Explore the different tabs** to analyze your documents
    4. **Interactive visualization** shows how entities are connected

    ### Features:
    - **Entity extraction** from document text
    - **Relationship mapping** based on co-occurrence
    - **Interactive graph visualization** with Plotly
    - **Document statistics** and analysis
    - **Entity frequency** and connection analysis
    """)