"""Document analysis app: extracts entities from uploaded documents and generates a dynamic knowledge graph."""
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import re
from collections import Counter, defaultdict
import json
from sqlalchemy import create_engine, Column, Integer, String, Text, Float, DateTime
from sqlalchemy.orm import DeclarativeBase, Session
from datetime import datetime
import io
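# Note: PyPDF2 and python-docx are imported lazily inside extract_text_from_file,
# so they are only needed when PDF/DOCX files are actually uploaded. TextBlob's
# noun_phrases property depends on the NLTK corpora; if entity extraction fails,
# run `python -m textblob.download_corpora` once in your environment.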
# Database setup
class Base(DeclarativeBase):
pass
class Document(Base):
__tablename__ = 'documents'
id = Column(Integer, primary_key=True)
name = Column(String(255))
content = Column(Text)
upload_time = Column(DateTime, default=datetime.utcnow)
doc_type = Column(String(50))
word_count = Column(Integer)
class Entity(Base):
__tablename__ = 'entities'
id = Column(Integer, primary_key=True)
document_id = Column(Integer)
entity_name = Column(String(255))
entity_type = Column(String(100))
frequency = Column(Integer)
importance_score = Column(Float)
class Relationship(Base):
__tablename__ = 'relationships'
id = Column(Integer, primary_key=True)
document_id = Column(Integer)
source_entity = Column(String(255))
target_entity = Column(String(255))
relationship_type = Column(String(100))
strength = Column(Float)
# Initialize database
engine = create_engine("sqlite:///knowledge_graph.db")
Base.metadata.create_all(bind=engine)
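# create_all only creates tables that do not already exist, so rerunning the app
# against the same knowledge_graph.db file is safe.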
class DocumentAnalyzer:
def __init__(self):
self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
def extract_text_from_file(self, file):
"""Extract text from uploaded file"""
try:
if file.type == "application/pdf":
import PyPDF2
pdf_reader = PyPDF2.PdfReader(file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text() or ""  # guard against pages with no extractable text
return text
elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
from docx import Document as DocxDocument
doc = DocxDocument(file)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text
elif file.type == "text/plain":
return str(file.read(), "utf-8")
else:
return str(file.read(), "utf-8")
except Exception as e:
st.error(f"Error reading file: {str(e)}")
return ""
def extract_entities(self, text):
"""Extract entities using TextBlob and custom patterns"""
blob = TextBlob(text)
# Extract noun phrases as potential entities
noun_phrases = [phrase.lower().strip() for phrase in blob.noun_phrases if len(phrase) > 2]
# Extract named entities using simple patterns
# People names (naive: any run of capitalized words, so sentence-initial words also match)
people_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
people = re.findall(people_pattern, text)
# Organizations (words with Corp, Inc, Ltd, etc.)
org_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:Corp|Inc|Ltd|LLC|Company|Organization|Institute|University|College))\b'
organizations = re.findall(org_pattern, text)
# Locations (capitalized phrases that might be places)
location_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:City|State|Country|County|Province|Region|Street|Avenue|Road))\b'
locations = re.findall(location_pattern, text)
# Dates
date_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
dates = re.findall(date_pattern, text)
entities = {
'PERSON': list(set(people)),
'ORGANIZATION': list(set(organizations)),
'LOCATION': list(set(locations)),
'DATE': list(set(dates)),
'CONCEPT': list(set([phrase for phrase in noun_phrases if len(phrase.split()) <= 3]))
}
return entities
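# Example: for "Tim Cook serves as the CEO of Apple Inc.", extract_entities would
# return something like {'PERSON': ['Tim Cook', ...], 'ORGANIZATION': ['Apple Inc'], ...}
# (exact results depend on TextBlob's noun-phrase chunking).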
def calculate_entity_importance(self, entities, text):
"""Calculate importance scores for entities"""
text_lower = text.lower()
word_count = max(len(text.split()), 1)  # avoid division by zero for empty text
entity_scores = {}
for entity_type, entity_list in entities.items():
for entity in entity_list:
frequency = text_lower.count(entity.lower())
# Simple importance score based on frequency and length
importance = (frequency / word_count) * len(entity.split()) * 100
entity_scores[entity] = {
'type': entity_type,
'frequency': frequency,
'importance': importance
}
return entity_scores
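# Worked example of the score above: an entity that appears 5 times in a
# 1,000-word document and has a 2-word name scores (5 / 1000) * 2 * 100 = 1.0.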
def extract_relationships(self, entities, text):
"""Extract relationships between entities"""
relationships = []
text_sentences = [sent.strip() for sent in text.split('.') if sent.strip()]
all_entities = []
for entity_type, entity_list in entities.items():
all_entities.extend([(entity, entity_type) for entity in entity_list])
# Find co-occurrence relationships
for sentence in text_sentences:
sentence_lower = sentence.lower()
sentence_entities = []
for entity, entity_type in all_entities:
if entity.lower() in sentence_lower:
sentence_entities.append((entity, entity_type))
# Create relationships between entities in the same sentence
for i, (entity1, type1) in enumerate(sentence_entities):
for entity2, type2 in sentence_entities[i+1:]:
if entity1 != entity2:
# Determine relationship type based on context
rel_type = self.determine_relationship_type(entity1, entity2, type1, type2, sentence)
strength = max(0.1, 1.0 - len(sentence.split()) / 50.0)  # shorter sentences = stronger relationships
relationships.append({
'source': entity1,
'target': entity2,
'type': rel_type,
'strength': strength,
'context': sentence[:100] + '...' if len(sentence) > 100 else sentence
})
return relationships
def determine_relationship_type(self, entity1, entity2, type1, type2, context):
"""Determine relationship type based on entity types and context"""
context_lower = context.lower()
# Define relationship patterns
if 'work' in context_lower or 'employ' in context_lower:
return 'WORKS_FOR'
elif 'own' in context_lower or 'belong' in context_lower:
return 'OWNS'
elif 'located' in context_lower or 'based' in context_lower:
return 'LOCATED_IN'
elif 'partner' in context_lower or 'collaborate' in context_lower:
return 'PARTNERS_WITH'
elif 'manage' in context_lower or 'lead' in context_lower:
return 'MANAGES'
elif type1 == 'PERSON' and type2 == 'ORGANIZATION':
return 'AFFILIATED_WITH'
elif type1 == 'ORGANIZATION' and type2 == 'LOCATION':
return 'OPERATES_IN'
else:
return 'RELATED_TO'
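# Example: in "Tim Cook serves as the CEO of Apple Inc", none of the keywords above
# match, so the PERSON/ORGANIZATION pair ('Tim Cook', 'Apple Inc') falls through
# to AFFILIATED_WITH.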
def create_knowledge_graph(entities, relationships):
"""Create a NetworkX graph from entities and relationships"""
G = nx.Graph()
# Add nodes (entities)
for entity_type, entity_list in entities.items():
for entity in entity_list:
G.add_node(entity, type=entity_type)
# Add edges (relationships)
for rel in relationships:
if rel['source'] in G.nodes and rel['target'] in G.nodes:
G.add_edge(rel['source'], rel['target'],
type=rel['type'],
strength=rel['strength'],
context=rel.get('context', ''))
return G
def visualize_knowledge_graph(G, entity_scores):
"""Create an interactive visualization of the knowledge graph using Plotly"""
if len(G.nodes) == 0:
return None
# Calculate layout
pos = nx.spring_layout(G, k=3, iterations=50)
# Create edge traces
edge_x = []
edge_y = []
edge_info = []
for edge in G.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
edge_data = G.edges[edge]
edge_info.append(f"{edge[0]} → {edge[1]}<br>Type: {edge_data.get('type', 'RELATED_TO')}<br>Strength: {edge_data.get('strength', 0):.2f}")
edge_trace = go.Scatter(x=edge_x, y=edge_y,
line=dict(width=1, color='#888'),
hoverinfo='none',
mode='lines')
# Create node traces
node_x = []
node_y = []
node_text = []
node_info = []
node_color = []
node_size = []
# Color map for different entity types
color_map = {
'PERSON': '#FF6B6B',
'ORGANIZATION': '#4ECDC4',
'LOCATION': '#45B7D1',
'DATE': '#96CEB4',
'CONCEPT': '#FFEAA7'
}
for node in G.nodes():
x, y = pos[node]
node_x.append(x)
node_y.append(y)
node_data = G.nodes[node]
entity_type = node_data.get('type', 'CONCEPT')
# Get entity score information
score_info = entity_scores.get(node, {'frequency': 0, 'importance': 0})
node_text.append(node)
node_info.append(f"Entity: {node}<br>Type: {entity_type}<br>Frequency: {score_info['frequency']}<br>Importance: {score_info['importance']:.2f}")
node_color.append(color_map.get(entity_type, '#FFEAA7'))
# Size based on importance (with minimum size)
size = max(10, min(50, score_info['importance'] * 5))
node_size.append(size)
node_trace = go.Scatter(x=node_x, y=node_y,
mode='markers+text',
hoverinfo='text',
text=node_text,
textposition="middle center",
hovertext=node_info,
marker=dict(size=node_size,
color=node_color,
line=dict(width=2, color='#000')))
# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title=dict(text='Knowledge Graph', font=dict(size=16)),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
height=600))
return fig
# Streamlit App
st.set_page_config(page_title="Document Knowledge Graph Generator", layout="wide")
st.title("π Document Analysis & Knowledge Graph Generator")
st.markdown("Upload documents to automatically extract entities and generate dynamic knowledge graphs")
# Initialize analyzer
analyzer = DocumentAnalyzer()
# Sidebar for document upload and management
st.sidebar.header("📁 Document Management")
uploaded_files = st.sidebar.file_uploader(
"Upload Documents",
accept_multiple_files=True,
type=['txt', 'pdf', 'docx'],
help="Supported formats: TXT, PDF, DOCX"
)
# Main content area
if uploaded_files:
# Process uploaded files
with st.spinner("Processing documents..."):
all_documents = []
for file in uploaded_files:
text_content = analyzer.extract_text_from_file(file)
if text_content:
all_documents.append({
'name': file.name,
'content': text_content,
'type': file.type,
'size': len(text_content)
})
# Save to database
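# (Streamlit reruns this script on every interaction, so the same upload can be
# inserted more than once; deduplicating by file name would be an easy guard.)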
with Session(engine) as session:
doc = Document(
name=file.name,
content=text_content,
doc_type=file.type,
word_count=len(text_content.split())
)
session.add(doc)
session.commit()
if all_documents:
# Create tabs for different views
tab1, tab2, tab3, tab4 = st.tabs(["📊 Analysis Overview", "🕸️ Knowledge Graph", "🔍 Entity Details", "📈 Statistics"])
with tab1:
st.header("Document Analysis Overview")
# Document summary
col1, col2, col3, col4 = st.columns(4)
total_docs = len(all_documents)
total_words = sum(len(doc['content'].split()) for doc in all_documents)
avg_words = total_words // total_docs if total_docs > 0 else 0
col1.metric("Total Documents", total_docs)
col2.metric("Total Words", f"{total_words:,}")
col3.metric("Average Words/Doc", f"{avg_words:,}")
col4.metric("Total Characters", f"{sum(len(doc['content']) for doc in all_documents):,}")
# Document list
st.subheader("Uploaded Documents")
doc_data = []
for doc in all_documents:
doc_data.append({
'Name': doc['name'],
'Type': doc['type'],
'Word Count': len(doc['content'].split()),
'Character Count': len(doc['content']),
'Preview': doc['content'][:100] + '...' if len(doc['content']) > 100 else doc['content']
})
st.dataframe(pd.DataFrame(doc_data), use_container_width=True)
with tab2:
st.header("Dynamic Knowledge Graph")
# Combine all document content for analysis
combined_text = " ".join([doc['content'] for doc in all_documents])
# Extract entities and relationships
with st.spinner("Extracting entities and relationships..."):
entities = analyzer.extract_entities(combined_text)
entity_scores = analyzer.calculate_entity_importance(entities, combined_text)
relationships = analyzer.extract_relationships(entities, combined_text)
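# The Entity and Relationship tables defined above are never populated in the
# original flow, so the "Database Summary" metrics in the Statistics tab would
# always read zero. A minimal persistence sketch (assumption: the combined-text
# analysis has no single source document, so document_id is left at 0):
with Session(engine) as session:
    for name, info in entity_scores.items():
        session.add(Entity(document_id=0, entity_name=name,
                           entity_type=info['type'], frequency=info['frequency'],
                           importance_score=info['importance']))
    for rel in relationships:
        session.add(Relationship(document_id=0, source_entity=rel['source'],
                                 target_entity=rel['target'],
                                 relationship_type=rel['type'], strength=rel['strength']))
    session.commit()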
# Create and visualize knowledge graph
G = create_knowledge_graph(entities, relationships)
if len(G.nodes) > 0:
# Graph controls
col1, col2 = st.columns([3, 1])
with col2:
st.subheader("Graph Controls")
# Filter by entity type
available_types = set()
for entity_list in entities.values():
if entity_list:
available_types.update([entity_scores.get(entity, {}).get('type', 'CONCEPT') for entity in entity_list])
selected_types = st.multiselect(
"Filter by Entity Type",
list(available_types),
default=list(available_types)
)
# Minimum importance threshold
min_importance = st.slider(
"Minimum Importance",
min_value=0.0,
max_value=max([score['importance'] for score in entity_scores.values()] + [1.0]),
value=0.0,
step=0.1
)
# Filter graph based on controls
filtered_entities = {}
for entity_type, entity_list in entities.items():
filtered_list = []
for entity in entity_list:
entity_info = entity_scores.get(entity, {})
if (entity_info.get('type', 'CONCEPT') in selected_types and
entity_info.get('importance', 0) >= min_importance):
filtered_list.append(entity)
if filtered_list:
filtered_entities[entity_type] = filtered_list
# Update graph
G_filtered = create_knowledge_graph(filtered_entities, relationships)
with col1:
fig = visualize_knowledge_graph(G_filtered, entity_scores)
if fig:
st.plotly_chart(fig, use_container_width=True)
else:
st.info("No entities found matching the current filters.")
# Graph statistics
st.subheader("Graph Statistics")
stat_col1, stat_col2, stat_col3, stat_col4 = st.columns(4)
stat_col1.metric("Total Nodes", len(G.nodes))
stat_col2.metric("Total Edges", len(G.edges))
stat_col3.metric("Filtered Nodes", len(G_filtered.nodes))
stat_col4.metric("Filtered Edges", len(G_filtered.edges))
else:
st.info("No entities found in the uploaded documents. Try uploading documents with more structured content.")
with tab3:
st.header("Entity Details")
if entity_scores:
# Entity summary
entity_df = []
for entity, info in entity_scores.items():
entity_df.append({
'Entity': entity,
'Type': info['type'],
'Frequency': info['frequency'],
'Importance Score': round(info['importance'], 2)
})
entity_df = pd.DataFrame(entity_df).sort_values('Importance Score', ascending=False)
# Display top entities
st.subheader("Top Entities by Importance")
st.dataframe(entity_df.head(20), use_container_width=True)
# Entity type distribution
st.subheader("Entity Type Distribution")
type_counts = entity_df['Type'].value_counts()
fig_pie = px.pie(values=type_counts.values, names=type_counts.index, title="Distribution of Entity Types")
st.plotly_chart(fig_pie, use_container_width=True)
# Relationships table
if relationships:
st.subheader("Extracted Relationships")
rel_df = pd.DataFrame(relationships)
rel_df = rel_df.sort_values('strength', ascending=False)
st.dataframe(rel_df.head(20), use_container_width=True)
else:
st.info("No entities extracted. Please upload documents with more content.")
with tab4:
st.header("Document Statistics & Analytics")
# Word frequency analysis
all_words = []
for doc in all_documents:
cleaned = [w.lower().strip('.,!?";') for w in doc['content'].split()]
words = [w for w in cleaned if w not in analyzer.stop_words and len(w) > 2]
all_words.extend(words)
if all_words:
word_freq = Counter(all_words)
top_words = word_freq.most_common(20)
# Word frequency chart
st.subheader("Top 20 Most Frequent Words")
words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
fig_bar = px.bar(words_df, x='Word', y='Frequency', title="Word Frequency Distribution")
fig_bar.update_xaxes(tickangle=45)
st.plotly_chart(fig_bar, use_container_width=True)
# Document comparison
if len(all_documents) > 1:
st.subheader("Document Comparison")
doc_stats = []
for doc in all_documents:
words = doc['content'].split()
sentences = len([s for s in doc['content'].split('.') if s.strip()])
doc_stats.append({
'Document': doc['name'],
'Words': len(words),
'Characters': len(doc['content']),
'Sentences': sentences,
'Avg Words/Sentence': len(words) / max(sentences, 1)
})
stats_df = pd.DataFrame(doc_stats)
st.dataframe(stats_df, use_container_width=True)
# Database summary
st.subheader("Database Summary")
with Session(engine) as session:
total_docs_db = session.query(Document).count()
total_entities_db = session.query(Entity).count()
total_relationships_db = session.query(Relationship).count()
db_col1, db_col2, db_col3 = st.columns(3)
db_col1.metric("Documents in DB", total_docs_db)
db_col2.metric("Entities in DB", total_entities_db)
db_col3.metric("Relationships in DB", total_relationships_db)
else:
# Welcome screen
st.markdown("""
## Welcome to the Document Knowledge Graph Generator! 🎉
This app analyzes your documents and automatically generates knowledge graphs.
### 🚀 **Features:**
- **Multi-format Support**: Upload TXT, PDF, and DOCX files
- **Entity Extraction**: Automatically identify people, organizations, locations, dates, and concepts
- **Relationship Discovery**: Find connections between entities
- **Interactive Visualization**: Explore your knowledge graph with interactive controls
- **Importance Scoring**: Entities are sized and ranked by importance
- **Database Storage**: All analysis is saved for future reference
### 🎯 **How to Use:**
1. Upload one or more documents using the sidebar
2. View the analysis overview and statistics
3. Explore the interactive knowledge graph
4. Examine detailed entity and relationship information
### 💡 **Tips:**
- Upload multiple related documents for richer knowledge graphs
- Use the filter controls to focus on specific entity types
- Adjust the importance threshold to reduce clutter
- Hover over nodes and edges for detailed information
**Get started by uploading your first document!** 🚀
""")
# Sample data demonstration
st.subheader("Sample Analysis Preview")
sample_text = """
Apple Inc. is a technology company based in Cupertino, California.
Tim Cook has served as the CEO of Apple since 2011. The company was founded by Steve Jobs,
Steve Wozniak, and Ronald Wayne in April 1976. Apple develops consumer electronics,
computer software, and online services. The company's headquarters are located in
Apple Park, which opened in April 2017.
"""
# Quick demo analysis
sample_entities = analyzer.extract_entities(sample_text)
sample_scores = analyzer.calculate_entity_importance(sample_entities, sample_text)
sample_relationships = analyzer.extract_relationships(sample_entities, sample_text)
if sample_entities:
sample_G = create_knowledge_graph(sample_entities, sample_relationships)
sample_fig = visualize_knowledge_graph(sample_G, sample_scores)
if sample_fig:
st.plotly_chart(sample_fig, use_container_width=True)
st.caption("Sample knowledge graph generated from a short text about Apple Inc.")