Ploomber AI Editor | accounting-receipts-4724

To upload files, please first save the app
Code Editor for app.py

import streamlit as st
import pandas as pd
import pdfplumber
import re
from datetime import datetime
import io
from dateutil.parser import parse
import xlsxwriter

st.set_page_config(page_title="Receipt Processor", page_icon="🧾", layout="wide")

st.title("🧾 Accounting Receipt Processor")
st.markdown("Upload PDF receipts and convert them to Excel format for accounting purposes")

# Initialize session state
if 'receipts_data' not in st.session_state:
    st.session_state.receipts_data = []

def extract_text_from_pdf(pdf_file):
    """Extract text from uploaded PDF file"""
    try:
        with pdfplumber.open(pdf_file) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ""

def parse_receipt_data(text, filename):
    """Parse receipt text and extract relevant information"""
    # Initialize data structure
    receipt_data = {
        'filename': filename,
        'date': None,
        'vendor': None,
        'total_amount': None,
        'tax_amount': None,
        'items': [],
        'category': 'General',
        'payment_method': 'Unknown',
        'raw_text': text
    }
    
    # Extract date patterns
    date_patterns = [
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
        r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
        r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b'
    ]
    
    for pattern in date_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            try:
                receipt_data['date'] = parse(matches[0]).strftime('%Y-%m-%d')
                break
            except:
                continue
    
    # Extract total amount
    amount_patterns = [
        r'total[:\s]*\$?(\d+\.?\d*)',
        r'amount[:\s]*\$?(\d+\.?\d*)',
        r'\$(\d+\.\d{2})\s*$'
    ]
    
    for pattern in amount_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            try:
                receipt_data['total_amount'] = float(matches[-1])
                break
            except:
                continue
    
    # Extract tax amount
    tax_patterns = [
        r'tax[:\s]*\$?(\d+\.?\d*)',
        r'gst[:\s]*\$?(\d+\.?\d*)',
        r'hst[:\s]*\$?(\d+\.?\d*)'
    ]
    
    for pattern in tax_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            try:
                receipt_data['tax_amount'] = float(matches[0])
                break
            except:
                continue
    
    # Extract vendor name (first few lines often contain vendor info)
    lines = text.split('\n')
    for i, line in enumerate(lines[:5]):
        line = line.strip()
        if len(line) > 3 and not re.match(r'^\d+', line):
            receipt_data['vendor'] = line
            break
    
    # Determine category based on vendor name or content
    if receipt_data['vendor']:
        vendor_lower = receipt_data['vendor'].lower()
        if any(word in vendor_lower for word in ['gas', 'shell', 'esso', 'petro']):
            receipt_data['category'] = 'Fuel'
        elif any(word in vendor_lower for word in ['restaurant', 'cafe', 'coffee', 'food']):
            receipt_data['category'] = 'Meals'
        elif any(word in vendor_lower for word in ['office', 'supply', 'staples']):
            receipt_data['category'] = 'Office Supplies'
        elif any(word in vendor_lower for word in ['hotel', 'inn', 'lodge']):
            receipt_data['category'] = 'Accommodation'
    
    return receipt_data

# File upload section
st.header("📤 Upload Receipts")
uploaded_files = st.file_uploader(
    "Choose PDF files", 
    type=['pdf'], 
    accept_multiple_files=True,
    help="Upload one or more PDF receipts"
)

if uploaded_files:
    progress_bar = st.progress(0)
    status_text = st.empty()
    
    for i, uploaded_file in enumerate(uploaded_files):
        status_text.text(f"Processing {uploaded_file.name}...")
        
        # Extract text from PDF
        text = extract_text_from_pdf(uploaded_file)
        
        if text:
            # Parse receipt data
            receipt_data = parse_receipt_data(text, uploaded_file.name)
            
            # Check if this receipt is already processed
            existing_receipt = next((r for r in st.session_state.receipts_data if r['filename'] == uploaded_file.name), None)
            if not existing_receipt:
                st.session_state.receipts_data.append(receipt_data)
        
        progress_bar.progress((i + 1) / len(uploaded_files))
    
    status_text.text("✅ Processing complete!")
    st.success(f"Processed {len(uploaded_files)} receipt(s)")

# Display and edit receipt data
if st.session_state.receipts_data:
    st.header("📊 Receipt Data")
    
    # Convert to DataFrame for display
    df_display = pd.DataFrame([
        {
            'Filename': r['filename'],
            'Date': r['date'] or 'Not found',
            'Vendor': r['vendor'] or 'Not found',
            'Total Amount': f"${r['total_amount']:.2f}" if r['total_amount'] else 'Not found',
            'Tax Amount': f"${r['tax_amount']:.2f}" if r['tax_amount'] else 'Not found',
            'Category': r['category'],
            'Payment Method': r['payment_method']
        }
        for r in st.session_state.receipts_data
    ])
    
    st.dataframe(df_display, use_container_width=True)
    
    # Edit section
    st.subheader("✏️ Edit Receipt Data")
    
    if st.session_state.receipts_data:
        selected_receipt = st.selectbox(
            "Select receipt to edit:",
            options=range(len(st.session_state.receipts_data)),
            format_func=lambda x: st.session_state.receipts_data[x]['filename']
        )
        
        receipt = st.session_state.receipts_data[selected_receipt]
        
        col1, col2 = st.columns(2)
        
        with col1:
            new_date = st.date_input(
                "Date:",
                value=datetime.strptime(receipt['date'], '%Y-%m-%d').date() if receipt['date'] else datetime.now().date()
            )
            new_vendor = st.text_input("Vendor:", value=receipt['vendor'] or "")
            new_total = st.number_input("Total Amount:", value=receipt['total_amount'] or 0.0, min_value=0.0, step=0.01)
        
        with col2:
            new_tax = st.number_input("Tax Amount:", value=receipt['tax_amount'] or 0.0, min_value=0.0, step=0.01)
            new_category = st.selectbox(
                "Category:",
                options=['General', 'Meals', 'Fuel', 'Office Supplies', 'Accommodation', 'Travel', 'Equipment', 'Professional Services'],
                index=['General', 'Meals', 'Fuel', 'Office Supplies', 'Accommodation', 'Travel', 'Equipment', 'Professional Services'].index(receipt['category'])
            )
            new_payment = st.selectbox(
                "Payment Method:",
                options=['Credit Card', 'Cash', 'Debit Card', 'Bank Transfer', 'Unknown'],
                index=['Credit Card', 'Cash', 'Debit Card', 'Bank Transfer', 'Unknown'].index(receipt['payment_method'])
            )
        
        if st.button("Update Receipt"):
            st.session_state.receipts_data[selected_receipt].update({
                'date': new_date.strftime('%Y-%m-%d'),
                'vendor': new_vendor,
                'total_amount': new_total,
                'tax_amount': new_tax,
                'category': new_category,
                'payment_method': new_payment
            })
            st.success("Receipt updated successfully!")
            st.rerun()

# Export to Excel
if st.session_state.receipts_data:
    st.header("📥 Export to Excel")
    
    col1, col2 = st.columns(2)
    
    with col1:
        if st.button("Generate Excel File", type="primary"):
            # Create DataFrame for export
            export_data = []
            for receipt in st.session_state.receipts_data:
                export_data.append({
                    'Date': receipt['date'],
                    'Vendor': receipt['vendor'],
                    'Category': receipt['category'],
                    'Total Amount': receipt['total_amount'],
                    'Tax Amount': receipt['tax_amount'],
                    'Net Amount': (receipt['total_amount'] or 0) - (receipt['tax_amount'] or 0),
                    'Payment Method': receipt['payment_method'],
                    'Filename': receipt['filename']
                })
            
            df_export = pd.DataFrame(export_data)
            
            # Create Excel file in memory
            output = io.BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df_export.to_excel(writer, sheet_name='Receipts', index=False)
                
                # Get workbook and worksheet
                workbook = writer.book
                worksheet = writer.sheets['Receipts']
                
                # Add formatting
                money_format = workbook.add_format({'num_format': '$#,##0.00'})
                date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
                
                # Apply formats
                worksheet.set_column('A:A', 12, date_format)  # Date column
                worksheet.set_column('B:B', 25)  # Vendor column
                worksheet.set_column('C:C', 15)  # Category column
                worksheet.set_column('D:F', 12, money_format)  # Amount columns
                worksheet.set_column('G:G', 15)  # Payment method column
                worksheet.set_column('H:H', 30)  # Filename column
            
            output.seek(0)
            
            st.download_button(
                label="📄 Download Excel File",
                data=output.getvalue(),
                file_name=f"receipts_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
    
    with col2:
        # Summary statistics
        st.subheader("📈 Summary")
        total_receipts = len(st.session_state.receipts_data)
        total_amount = sum(r['total_amount'] or 0 for r in st.session_state.receipts_data)
        total_tax = sum(r['tax_amount'] or 0 for r in st.session_state.receipts_data)
        
        st.metric("Total Receipts", total_receipts)
        st.metric("Total Amount", f"${total_amount:.2f}")
        st.metric("Total Tax", f"${total_tax:.2f}")

# Clear data button
if st.session_state.receipts_data:
    st.header("🗑️ Clear Data")
    if st.button("Clear All Receipt Data", type="secondary"):
        st.session_state.receipts_data = []
        st.success("All receipt data cleared!")
        st.rerun()

# Instructions
with st.expander("ℹ️ How to use this application"):
    st.markdown("""
    1. **Upload PDF Receipts**: Click the file uploader and select one or more PDF receipt files
    2. **Review Extracted Data**: The application will automatically extract information like date, vendor, amounts, etc.
    3. **Edit Data**: Select individual receipts to correct or add missing information
    4. **Export to Excel**: Generate an Excel file with all your processed receipts
    5. **Categories**: Receipts are automatically categorized based on vendor names
    
    **Supported Information**:
    - Date of transaction
    - Vendor/merchant name
    - Total amount
    - Tax amount
    - Payment method
    - Expense category
    
    **Note**: OCR accuracy depends on the quality of the PDF. You may need to manually correct some extracted data.
    """)
Loading code editor...
Click Save & Run to preview your app
Terminal