generate an accounting application, that uploads pdf recipts and make it into xls
To upload files, please first save the app
import streamlit as st
import pandas as pd
import pdfplumber
import re
from datetime import datetime
import io
from dateutil.parser import parse
import xlsxwriter
st.set_page_config(page_title="Receipt Processor", page_icon="๐งพ", layout="wide")
st.title("๐งพ Accounting Receipt Processor")
st.markdown("Upload PDF receipts and convert them to Excel format for accounting purposes")
# Initialize session state
if 'receipts_data' not in st.session_state:
st.session_state.receipts_data = []
def extract_text_from_pdf(pdf_file):
"""Extract text from uploaded PDF file"""
try:
with pdfplumber.open(pdf_file) as pdf:
text = ""
for page in pdf.pages:
text += page.extract_text() or ""
return text
except Exception as e:
st.error(f"Error reading PDF: {str(e)}")
return ""
def parse_receipt_data(text, filename):
"""Parse receipt text and extract relevant information"""
# Initialize data structure
receipt_data = {
'filename': filename,
'date': None,
'vendor': None,
'total_amount': None,
'tax_amount': None,
'items': [],
'category': 'General',
'payment_method': 'Unknown',
'raw_text': text
}
# Extract date patterns
date_patterns = [
r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}\b'
]
for pattern in date_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
if matches:
try:
receipt_data['date'] = parse(matches[0]).strftime('%Y-%m-%d')
break
except:
continue
# Extract total amount
amount_patterns = [
r'total[:\s]*\$?(\d+\.?\d*)',
r'amount[:\s]*\$?(\d+\.?\d*)',
r'\$(\d+\.\d{2})\s*$'
]
for pattern in amount_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
if matches:
try:
receipt_data['total_amount'] = float(matches[-1])
break
except:
continue
# Extract tax amount
tax_patterns = [
r'tax[:\s]*\$?(\d+\.?\d*)',
r'gst[:\s]*\$?(\d+\.?\d*)',
r'hst[:\s]*\$?(\d+\.?\d*)'
]
for pattern in tax_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
if matches:
try:
receipt_data['tax_amount'] = float(matches[0])
break
except:
continue
# Extract vendor name (first few lines often contain vendor info)
lines = text.split('\n')
for i, line in enumerate(lines[:5]):
line = line.strip()
if len(line) > 3 and not re.match(r'^\d+', line):
receipt_data['vendor'] = line
break
# Determine category based on vendor name or content
if receipt_data['vendor']:
vendor_lower = receipt_data['vendor'].lower()
if any(word in vendor_lower for word in ['gas', 'shell', 'esso', 'petro']):
receipt_data['category'] = 'Fuel'
elif any(word in vendor_lower for word in ['restaurant', 'cafe', 'coffee', 'food']):
receipt_data['category'] = 'Meals'
elif any(word in vendor_lower for word in ['office', 'supply', 'staples']):
receipt_data['category'] = 'Office Supplies'
elif any(word in vendor_lower for word in ['hotel', 'inn', 'lodge']):
receipt_data['category'] = 'Accommodation'
return receipt_data
# File upload section
st.header("๐ค Upload Receipts")
uploaded_files = st.file_uploader(
"Choose PDF files",
type=['pdf'],
accept_multiple_files=True,
help="Upload one or more PDF receipts"
)
if uploaded_files:
progress_bar = st.progress(0)
status_text = st.empty()
for i, uploaded_file in enumerate(uploaded_files):
status_text.text(f"Processing {uploaded_file.name}...")
# Extract text from PDF
text = extract_text_from_pdf(uploaded_file)
if text:
# Parse receipt data
receipt_data = parse_receipt_data(text, uploaded_file.name)
# Check if this receipt is already processed
existing_receipt = next((r for r in st.session_state.receipts_data if r['filename'] == uploaded_file.name), None)
if not existing_receipt:
st.session_state.receipts_data.append(receipt_data)
progress_bar.progress((i + 1) / len(uploaded_files))
status_text.text("โ
Processing complete!")
st.success(f"Processed {len(uploaded_files)} receipt(s)")
# Display and edit receipt data
if st.session_state.receipts_data:
st.header("๐ Receipt Data")
# Convert to DataFrame for display
df_display = pd.DataFrame([
{
'Filename': r['filename'],
'Date': r['date'] or 'Not found',
'Vendor': r['vendor'] or 'Not found',
'Total Amount': f"${r['total_amount']:.2f}" if r['total_amount'] else 'Not found',
'Tax Amount': f"${r['tax_amount']:.2f}" if r['tax_amount'] else 'Not found',
'Category': r['category'],
'Payment Method': r['payment_method']
}
for r in st.session_state.receipts_data
])
st.dataframe(df_display, use_container_width=True)
# Edit section
st.subheader("โ๏ธ Edit Receipt Data")
if st.session_state.receipts_data:
selected_receipt = st.selectbox(
"Select receipt to edit:",
options=range(len(st.session_state.receipts_data)),
format_func=lambda x: st.session_state.receipts_data[x]['filename']
)
receipt = st.session_state.receipts_data[selected_receipt]
col1, col2 = st.columns(2)
with col1:
new_date = st.date_input(
"Date:",
value=datetime.strptime(receipt['date'], '%Y-%m-%d').date() if receipt['date'] else datetime.now().date()
)
new_vendor = st.text_input("Vendor:", value=receipt['vendor'] or "")
new_total = st.number_input("Total Amount:", value=receipt['total_amount'] or 0.0, min_value=0.0, step=0.01)
with col2:
new_tax = st.number_input("Tax Amount:", value=receipt['tax_amount'] or 0.0, min_value=0.0, step=0.01)
new_category = st.selectbox(
"Category:",
options=['General', 'Meals', 'Fuel', 'Office Supplies', 'Accommodation', 'Travel', 'Equipment', 'Professional Services'],
index=['General', 'Meals', 'Fuel', 'Office Supplies', 'Accommodation', 'Travel', 'Equipment', 'Professional Services'].index(receipt['category'])
)
new_payment = st.selectbox(
"Payment Method:",
options=['Credit Card', 'Cash', 'Debit Card', 'Bank Transfer', 'Unknown'],
index=['Credit Card', 'Cash', 'Debit Card', 'Bank Transfer', 'Unknown'].index(receipt['payment_method'])
)
if st.button("Update Receipt"):
st.session_state.receipts_data[selected_receipt].update({
'date': new_date.strftime('%Y-%m-%d'),
'vendor': new_vendor,
'total_amount': new_total,
'tax_amount': new_tax,
'category': new_category,
'payment_method': new_payment
})
st.success("Receipt updated successfully!")
st.rerun()
# Export to Excel
if st.session_state.receipts_data:
st.header("๐ฅ Export to Excel")
col1, col2 = st.columns(2)
with col1:
if st.button("Generate Excel File", type="primary"):
# Create DataFrame for export
export_data = []
for receipt in st.session_state.receipts_data:
export_data.append({
'Date': receipt['date'],
'Vendor': receipt['vendor'],
'Category': receipt['category'],
'Total Amount': receipt['total_amount'],
'Tax Amount': receipt['tax_amount'],
'Net Amount': (receipt['total_amount'] or 0) - (receipt['tax_amount'] or 0),
'Payment Method': receipt['payment_method'],
'Filename': receipt['filename']
})
df_export = pd.DataFrame(export_data)
# Create Excel file in memory
output = io.BytesIO()
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
df_export.to_excel(writer, sheet_name='Receipts', index=False)
# Get workbook and worksheet
workbook = writer.book
worksheet = writer.sheets['Receipts']
# Add formatting
money_format = workbook.add_format({'num_format': '$#,##0.00'})
date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})
# Apply formats
worksheet.set_column('A:A', 12, date_format) # Date column
worksheet.set_column('B:B', 25) # Vendor column
worksheet.set_column('C:C', 15) # Category column
worksheet.set_column('D:F', 12, money_format) # Amount columns
worksheet.set_column('G:G', 15) # Payment method column
worksheet.set_column('H:H', 30) # Filename column
output.seek(0)
st.download_button(
label="๐ Download Excel File",
data=output.getvalue(),
file_name=f"receipts_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
with col2:
# Summary statistics
st.subheader("๐ Summary")
total_receipts = len(st.session_state.receipts_data)
total_amount = sum(r['total_amount'] or 0 for r in st.session_state.receipts_data)
total_tax = sum(r['tax_amount'] or 0 for r in st.session_state.receipts_data)
st.metric("Total Receipts", total_receipts)
st.metric("Total Amount", f"${total_amount:.2f}")
st.metric("Total Tax", f"${total_tax:.2f}")
# Clear data button
if st.session_state.receipts_data:
st.header("๐๏ธ Clear Data")
if st.button("Clear All Receipt Data", type="secondary"):
st.session_state.receipts_data = []
st.success("All receipt data cleared!")
st.rerun()
# Instructions
with st.expander("โน๏ธ How to use this application"):
st.markdown("""
1. **Upload PDF Receipts**: Click the file uploader and select one or more PDF receipt files
2. **Review Extracted Data**: The application will automatically extract information like date, vendor, amounts, etc.
3. **Edit Data**: Select individual receipts to correct or add missing information
4. **Export to Excel**: Generate an Excel file with all your processed receipts
5. **Categories**: Receipts are automatically categorized based on vendor names
**Supported Information**:
- Date of transaction
- Vendor/merchant name
- Total amount
- Tax amount
- Payment method
- Expense category
**Note**: OCR accuracy depends on the quality of the PDF. You may need to manually correct some extracted data.
""")
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?