Document analysis app with dynamic knowledge graph generation based on document content
import streamlit as st
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from textblob import TextBlob
import nltk
from collections import Counter, defaultdict
import re
import json
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
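# Note: newer NLTK releases (3.9+) ship the tokenizer and tagger data under new names;
# if a LookupError still occurs, 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# may also need to be downloaded.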
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
st.set_page_config(
    page_title="Document Knowledge Graph Generator",
    page_icon="📊",
    layout="wide"
)
st.title("📊 Document Analysis & Knowledge Graph Generator")
st.markdown("Upload documents to generate dynamic knowledge graphs and perform comprehensive text analysis.")
# Initialize session state
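# st.session_state persists these values across Streamlit reruns, so uploaded
# documents and the generated graph survive widget interactions.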
if 'documents' not in st.session_state:
    st.session_state.documents = []
if 'knowledge_graph' not in st.session_state:
    st.session_state.knowledge_graph = None
# Sidebar for document upload and settings
with st.sidebar:
    st.header("📂 Document Upload")
    uploaded_files = st.file_uploader(
        "Choose text files",
        type=['txt', 'md'],
        accept_multiple_files=True
    )
    if uploaded_files:
        for file in uploaded_files:
            # getvalue() returns the full file contents on every rerun,
            # unlike read(), which can return empty bytes once the buffer is exhausted
            content = file.getvalue().decode("utf-8")
            doc_info = {
                'name': file.name,
                'content': content,
                'word_count': len(content.split()),
                'char_count': len(content)
            }
            if doc_info not in st.session_state.documents:
                st.session_state.documents.append(doc_info)

    st.header("⚙️ Analysis Settings")
    min_word_freq = st.slider("Minimum word frequency", 1, 10, 2)
    max_nodes = st.slider("Maximum nodes in graph", 10, 100, 50)
    # Penn Treebank tags: NN* = nouns, VB* = verbs, JJ* = adjectives
    include_pos = st.multiselect(
        "Include POS tags",
        ["NN", "NNS", "NNP", "NNPS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS"],
        default=["NN", "NNS", "NNP", "NNPS", "JJ"]
    )
# Main content area
if st.session_state.documents:
    tab1, tab2, tab3, tab4 = st.tabs(["📄 Documents", "📊 Analysis", "🕸️ Knowledge Graph", "💾 Export"])

    with tab1:
        st.header("Uploaded Documents")
        for i, doc in enumerate(st.session_state.documents):
            with st.expander(f"📄 {doc['name']} ({doc['word_count']} words)"):
                col1, col2 = st.columns([3, 1])
                with col1:
                    st.text_area(
                        "Content",
                        doc['content'][:1000] + "..." if len(doc['content']) > 1000 else doc['content'],
                        height=200,
                        key=f"content_{i}"
                    )
                with col2:
                    st.metric("Words", doc['word_count'])
                    st.metric("Characters", doc['char_count'])
                    if st.button("Remove", key=f"remove_{i}"):
                        st.session_state.documents.pop(i)
                        st.rerun()
    with tab2:
        st.header("Document Analysis")
        if st.button("📊 Analyze Documents"):
            # Combine all documents
            combined_text = " ".join([doc['content'] for doc in st.session_state.documents])

            # Basic statistics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Total Documents", len(st.session_state.documents))
            with col2:
                st.metric("Total Words", len(combined_text.split()))
            with col3:
                st.metric("Total Characters", len(combined_text))
            with col4:
                sentences = sent_tokenize(combined_text)
                st.metric("Total Sentences", len(sentences))

            # Sentiment analysis
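            # TextBlob polarity ranges from -1.0 (most negative) to +1.0 (most positive);
            # subjectivity ranges from 0.0 (objective) to 1.0 (subjective).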
            st.subheader("📈 Sentiment Analysis")
            blob = TextBlob(combined_text)
            sentiment_score = blob.sentiment.polarity
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Sentiment Score", f"{sentiment_score:.3f}")
                if sentiment_score > 0.1:
                    st.success("Positive sentiment")
                elif sentiment_score < -0.1:
                    st.error("Negative sentiment")
                else:
                    st.info("Neutral sentiment")
            with col2:
                subjectivity = blob.sentiment.subjectivity
                st.metric("Subjectivity", f"{subjectivity:.3f}")
                if subjectivity > 0.5:
                    st.info("Subjective content")
                else:
                    st.info("Objective content")
            # Word frequency analysis
            st.subheader("📊 Word Frequency Analysis")
            # Tokenize and clean text
            stop_words = set(stopwords.words('english'))
            words = word_tokenize(combined_text.lower())
            words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 2]
            # POS tagging
            tagged_words = pos_tag(words)
            filtered_words = [word for word, pos in tagged_words if pos in include_pos]
            word_freq = Counter(filtered_words)
            top_words = word_freq.most_common(20)
            # Create frequency chart
            if top_words:
                df_freq = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
                fig = px.bar(df_freq, x='Word', y='Frequency', title="Top 20 Most Frequent Words")
                st.plotly_chart(fig, use_container_width=True)
                # Word cloud style visualization
                fig_scatter = px.scatter(
                    df_freq,
                    x='Word',
                    y='Frequency',
                    size='Frequency',
                    title="Word Frequency Bubble Chart"
                )
                st.plotly_chart(fig_scatter, use_container_width=True)
    with tab3:
        st.header("Knowledge Graph Generation")
        if st.button("🕸️ Generate Knowledge Graph"):
            # Combine all documents
            combined_text = " ".join([doc['content'] for doc in st.session_state.documents])

            # Extract entities and relationships
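            # Approach: treat frequent content words (filtered by the selected POS tags)
            # as entities, and connect two entities whenever they co-occur in the same
            # sentence; the co-occurrence count becomes the edge weight.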
            stop_words = set(stopwords.words('english'))
            sentences = sent_tokenize(combined_text)

            # Create graph
            G = nx.Graph()
            entity_freq = defaultdict(int)
            relationships = defaultdict(int)
            for sentence in sentences:
                words = word_tokenize(sentence.lower())
                words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 2]
                # POS tagging to identify entities
                tagged_words = pos_tag(words)
                entities = [word for word, pos in tagged_words if pos in include_pos]
                # Count entity frequency
                for entity in entities:
                    entity_freq[entity] += 1
                # Create relationships between entities in the same sentence
                for i in range(len(entities)):
                    for j in range(i + 1, len(entities)):
                        if entities[i] != entities[j]:
                            relationship = tuple(sorted([entities[i], entities[j]]))
                            relationships[relationship] += 1

            # Filter entities by frequency
            filtered_entities = {k: v for k, v in entity_freq.items() if v >= min_word_freq}
            # Add nodes to graph, keeping the most frequent entities up to max_nodes
            top_entities = sorted(filtered_entities.items(), key=lambda x: x[1], reverse=True)[:max_nodes]
            for entity, freq in top_entities:
                G.add_node(entity, frequency=freq, size=freq*10)
            # Add edges to graph
            for (entity1, entity2), weight in relationships.items():
                if entity1 in G.nodes() and entity2 in G.nodes() and weight >= min_word_freq:
                    G.add_edge(entity1, entity2, weight=weight)
            st.session_state.knowledge_graph = G
            # Display graph statistics
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Nodes", G.number_of_nodes())
            with col2:
                st.metric("Edges", G.number_of_edges())
            with col3:
                st.metric("Density", f"{nx.density(G):.3f}")

            # Visualize graph with Plotly
            if G.number_of_nodes() > 0:
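                # spring_layout computes a force-directed (Fruchterman-Reingold) layout;
                # k sets the preferred distance between nodes and iterations bounds the solver.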
                pos = nx.spring_layout(G, k=1, iterations=50)

                # Create edge traces
                edge_x = []
                edge_y = []
                for edge in G.edges():
                    x0, y0 = pos[edge[0]]
                    x1, y1 = pos[edge[1]]
                    # None entries create gaps so each edge is drawn as a separate segment
                    edge_x.extend([x0, x1, None])
                    edge_y.extend([y0, y1, None])
                edge_trace = go.Scatter(
                    x=edge_x, y=edge_y,
                    line=dict(width=0.5, color='#888'),
                    hoverinfo='none',
                    mode='lines'
                )

                # Create node traces
                node_x = []
                node_y = []
                node_text = []
                node_size = []
                for node in G.nodes():
                    x, y = pos[node]
                    node_x.append(x)
                    node_y.append(y)
                    node_text.append(f"{node}<br>Freq: {G.nodes[node]['frequency']}")
                    node_size.append(G.nodes[node]['frequency'] * 5)
                node_trace = go.Scatter(
                    x=node_x, y=node_y,
                    mode='markers+text',
                    hoverinfo='text',
                    text=[node for node in G.nodes()],
                    textposition="middle center",
                    hovertext=node_text,
                    marker=dict(
                        size=node_size,
                        color='lightblue',
                        line=dict(width=2, color='darkblue')
                    )
                )

                fig = go.Figure(
                    data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated titlefont_size attribute
                        title=dict(text='Knowledge Graph', font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Hover over nodes for details",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002
                        )],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                    )
                )
                st.plotly_chart(fig, use_container_width=True)
            # Graph analysis
            st.subheader("Graph Analysis")
            if G.number_of_nodes() > 0:
                # Centrality measures
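                # Degree centrality: fraction of other nodes a node is directly connected to.
                # Betweenness centrality: how often a node lies on shortest paths between others.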
                degree_centrality = nx.degree_centrality(G)
                betweenness_centrality = nx.betweenness_centrality(G)
                # Top central nodes
                col1, col2 = st.columns(2)
                with col1:
                    st.write("**Most Connected Nodes (Degree Centrality)**")
                    top_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                    for node, centrality in top_degree:
                        st.write(f"• {node}: {centrality:.3f}")
                with col2:
                    st.write("**Most Important Nodes (Betweenness Centrality)**")
                    top_betweenness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                    for node, centrality in top_betweenness:
                        st.write(f"• {node}: {centrality:.3f}")
            else:
                st.warning("No nodes found with the current settings. Try lowering the minimum word frequency.")
    with tab4:
        st.header("Export Data")
        if st.session_state.knowledge_graph:
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Export Graph Data")
                # Export nodes
                nodes_data = []
                for node in st.session_state.knowledge_graph.nodes(data=True):
                    nodes_data.append({
                        'node': node[0],
                        'frequency': node[1]['frequency']
                    })
                nodes_df = pd.DataFrame(nodes_data)
                st.download_button(
                    "Download Nodes CSV",
                    nodes_df.to_csv(index=False),
                    "knowledge_graph_nodes.csv",
                    "text/csv"
                )
                # Export edges
                edges_data = []
                for edge in st.session_state.knowledge_graph.edges(data=True):
                    edges_data.append({
                        'source': edge[0],
                        'target': edge[1],
                        'weight': edge[2].get('weight', 1)
                    })
                if edges_data:
                    edges_df = pd.DataFrame(edges_data)
                    st.download_button(
                        "Download Edges CSV",
                        edges_df.to_csv(index=False),
                        "knowledge_graph_edges.csv",
                        "text/csv"
                    )
            with col2:
                st.subheader("Export Analysis Report")
                # Generate report
                combined_text = " ".join([doc['content'] for doc in st.session_state.documents])
                blob = TextBlob(combined_text)
                report = {
                    "document_count": len(st.session_state.documents),
                    "total_words": len(combined_text.split()),
                    "total_characters": len(combined_text),
                    "total_sentences": len(sent_tokenize(combined_text)),
                    "sentiment_score": blob.sentiment.polarity,
                    "subjectivity_score": blob.sentiment.subjectivity,
                    "graph_nodes": st.session_state.knowledge_graph.number_of_nodes(),
                    "graph_edges": st.session_state.knowledge_graph.number_of_edges(),
                    "graph_density": nx.density(st.session_state.knowledge_graph)
                }
                st.download_button(
                    "Download Analysis Report",
                    json.dumps(report, indent=2),
                    "analysis_report.json",
                    "application/json"
                )
        else:
            st.info("Generate a knowledge graph first to export data.")
else:
    st.info("👆 Upload documents using the sidebar to get started!")

    # Sample data section
    st.header("📄 Sample Documents")
    st.markdown("You can use these sample texts to test the application:")
    sample_texts = {
        "Technology Article": """
Artificial intelligence is transforming modern technology. Machine learning algorithms are becoming more sophisticated every day. Neural networks and deep learning models are revolutionizing data analysis. Companies are investing heavily in AI research and development. The future of technology depends on artificial intelligence innovations.
""",
        "Business Report": """
The quarterly earnings report shows significant growth in revenue. Market analysis indicates strong performance across all sectors. Customer satisfaction has improved due to better service quality. The company's strategic initiatives are delivering positive results. Stakeholder confidence remains high despite market volatility.
""",
        "Scientific Abstract": """
This research study examines the effects of climate change on marine ecosystems. Ocean temperature increases are affecting biodiversity patterns. Coral reef degradation is accelerating due to acidification. Marine species are adapting to changing environmental conditions. Conservation efforts are crucial for ecosystem preservation.
"""
    }
    for title, content in sample_texts.items():
        with st.expander(f"📄 {title}"):
            st.text_area("Content", content, height=100, key=f"sample_{title}")
            if st.button(f"Use {title}", key=f"use_{title}"):
                doc_info = {
                    'name': f"{title}.txt",
                    'content': content.strip(),
                    'word_count': len(content.strip().split()),
                    'char_count': len(content.strip())
                }
                st.session_state.documents.append(doc_info)
                st.rerun()
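
The Export tab writes knowledge_graph_nodes.csv and knowledge_graph_edges.csv. As a rough, optional sketch (not part of the app above), the exported files can be reloaded into a NetworkX graph for offline analysis; the column names below match the app's export code:

# Rebuild the exported knowledge graph from the two CSVs produced by the Export tab.
import pandas as pd
import networkx as nx

nodes_df = pd.read_csv("knowledge_graph_nodes.csv")   # columns: node, frequency
edges_df = pd.read_csv("knowledge_graph_edges.csv")   # columns: source, target, weight

G = nx.Graph()
for _, row in nodes_df.iterrows():
    G.add_node(row["node"], frequency=row["frequency"])
for _, row in edges_df.iterrows():
    G.add_edge(row["source"], row["target"], weight=row["weight"])

print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges, density", nx.density(G))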