# I want an app that is capable of converting Italian fatture PDFs to XML. Consider the most recent Italian documentation about XML and fatture, as this is a complex task.
# To upload files, please first save the app.
import streamlit as st
import PyPDF2
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
from datetime import datetime
from dateutil.parser import parse
import io
# Fatture PDF -> FatturaPA XML converter (Streamlit app).
#
# Layout of this section: text-parsing helpers, XML-generation helpers, then
# the Streamlit UI wrapped in main().  The helpers are defined BEFORE the UI
# code that calls them; a Streamlit script executes top to bottom on every
# rerun, so calling a function above its ``def`` raises NameError.

# ---------------------------------------------------------------------------
# PDF text parsing
# ---------------------------------------------------------------------------

# A number-like token: word characters, "/" or "-", with at least one digit.
_NUMBER_TOKEN = r"((?=[\w/\-]*\d)\w[\w/\-]*)"
# "n.", "n°", "nr.", "no.", "num.", "numero" followed by punctuation/space.
_NUMBER_MARKER = r"(?:numero|num|nr|no|n)[.°:\s]+"
# Tried in order, most specific first.
_INVOICE_NUMBER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\b(?:fattura|invoice)\s+" + _NUMBER_MARKER + _NUMBER_TOKEN,
        r"\b(?:fattura|invoice)[\s:#]+" + _NUMBER_TOKEN,
        r"\b" + _NUMBER_MARKER + _NUMBER_TOKEN,
    )
)

# Year-first (ISO-like) or day-first dates; the digit look-arounds stop the
# pattern from matching in the middle of a longer number.
_DATE_RE = re.compile(
    r"(?<!\d)(?:\d{4}[/\-.]\d{1,2}[/\-.]\d{1,2}"
    r"|\d{1,2}[/\-.]\d{1,2}[/\-.](?:\d{4}|\d{2}))(?!\d)"
)
_DATE_FORMATS = ("%Y/%m/%d", "%d/%m/%Y", "%d/%m/%y")

# Tried in order; the first pattern yielding at least one amount wins.
_AMOUNT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"(?:totale|total|importo)\s*:?\s*€?\s*(\d[\d.,]*)",
        r"€\s*(\d[\d.,]*)",
        r"(\d[\d.,]*)\s*€",
    )
)

# Italian VAT number (11 digits), optionally prefixed by a 2-letter country.
_VAT_RE = re.compile(
    r"\b(?:p\.?\s*iva|partita\s+iva)\s*:?\s*([A-Z]{2}\d{11}|\d{11})(?![A-Za-z0-9])",
    re.IGNORECASE,
)


def _find_invoice_number(text):
    """Return the first invoice-number-like token in *text*, or None."""
    for pattern in _INVOICE_NUMBER_PATTERNS:
        match = pattern.search(text)
        if match:
            number = match.group(1).rstrip("/-")
            if number:
                return number
    return None


def _find_invoice_date(text):
    """Return the first valid calendar date found in *text*, or None.

    Day-first order is assumed for dd/mm/yyyy style dates (Italian
    convention); a leading 4-digit year is read as yyyy/mm/dd.
    """
    for match in _DATE_RE.finditer(text):
        normalized = re.sub(r"[-.]", "/", match.group(0))
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(normalized, fmt).date()
            except ValueError:
                continue  # wrong format or impossible date; try the next one
    return None


def _parse_amount(raw):
    """Convert a printed amount such as ``1.234,56`` to float, or None.

    When both separators appear, the right-most one is the decimal mark.
    A lone comma is a decimal mark (Italian convention).  A lone dot
    followed by exactly three digits, or repeated dots/commas, are read as
    thousands separators.
    """
    cleaned = raw.strip().rstrip(".,")
    if not cleaned:
        return None
    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        decimal_sep = "," if last_comma > last_dot else "."
    elif last_comma != -1:
        decimal_sep = "," if cleaned.count(",") == 1 else None
    elif last_dot != -1:
        digits_after = len(cleaned) - last_dot - 1
        single_dot = cleaned.count(".") == 1
        decimal_sep = "." if single_dot and digits_after != 3 else None
    else:
        decimal_sep = None

    if decimal_sep is None:
        normalized = cleaned.replace(",", "").replace(".", "")
    else:
        thousands_sep = "." if decimal_sep == "," else ","
        without_thousands = cleaned.replace(thousands_sep, "")
        integer_part, _, fraction = without_thousands.rpartition(decimal_sep)
        normalized = f"{integer_part or '0'}.{fraction}"
    try:
        return float(normalized)
    except ValueError:
        return None


def _find_total_amount(text):
    """Return the largest amount matched by the first productive pattern."""
    for pattern in _AMOUNT_PATTERNS:
        amounts = [
            amount
            for amount in (_parse_amount(m.group(1)) for m in pattern.finditer(text))
            if amount is not None
        ]
        if amounts:
            return max(amounts)  # Assume highest amount is total
    return None


def parse_invoice_text(text):
    """Parse invoice data from extracted PDF text.

    This is a best-effort heuristic; every value is meant to be reviewed by
    the user in the form before the XML is generated.

    Args:
        text: Raw text extracted from the PDF (None is treated as empty).

    Returns:
        A dict that always contains ``line_items`` (a single placeholder
        line priced at the detected total) and, when found,
        ``invoice_number`` (str), ``invoice_date`` (datetime.date),
        ``total_amount`` (float), ``seller_vat`` and ``buyer_vat`` (str).
    """
    text = text or ""
    data = {}

    invoice_number = _find_invoice_number(text)
    if invoice_number:
        data['invoice_number'] = invoice_number

    invoice_date = _find_invoice_date(text)
    if invoice_date is not None:
        data['invoice_date'] = invoice_date

    total_amount = _find_total_amount(text)
    if total_amount is not None:
        data['total_amount'] = total_amount

    # Extract VAT numbers (Partita IVA).  The seller's number is often
    # printed twice (header and footer), so duplicates are dropped before
    # assigning the second distinct number to the buyer.
    vat_numbers = list(dict.fromkeys(m.group(1).upper() for m in _VAT_RE.finditer(text)))
    if vat_numbers:
        data['seller_vat'] = vat_numbers[0]
    if len(vat_numbers) >= 2:
        data['buyer_vat'] = vat_numbers[1]

    # Basic line items extraction: one placeholder line for manual review.
    data['line_items'] = [{
        'description': 'Servizio/Prodotto',
        'quantity': 1,
        'price': data.get('total_amount', 0),
        'vat_rate': 22,
    }]
    return data


# ---------------------------------------------------------------------------
# FatturaPA XML generation
# ---------------------------------------------------------------------------

def _clean_text(value, default):
    """Collapse whitespace in *value*; return *default* when it is empty."""
    cleaned = " ".join(str(value).split()) if value is not None else ""
    return cleaned or default


def _to_float(value, default):
    """Coerce *value* to float, returning *default* when not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _split_vat_id(value, default_code="00000000000"):
    """Split a VAT id into (country, code); country defaults to ``IT``."""
    compact = "".join(str(value or "").split()).upper()
    if len(compact) > 2 and compact[:2].isalpha() and compact[:2].isascii():
        return compact[:2], compact[2:]
    return "IT", compact or default_code


def _add_fiscal_id(parent, tag, vat_id):
    """Append ``<tag><IdPaese/><IdCodice/></tag>`` built from *vat_id*."""
    country, code = _split_vat_id(vat_id)
    node = ET.SubElement(parent, tag)
    ET.SubElement(node, "IdPaese").text = country
    ET.SubElement(node, "IdCodice").text = code


def _add_sede(parent, address, cap, comune, provincia):
    """Append a ``Sede`` element with the given address parts."""
    sede = ET.SubElement(parent, "Sede")
    ET.SubElement(sede, "Indirizzo").text = address
    ET.SubElement(sede, "CAP").text = cap
    ET.SubElement(sede, "Comune").text = comune
    ET.SubElement(sede, "Provincia").text = provincia
    ET.SubElement(sede, "Nazione").text = "IT"


def _add_riepilogo(parent, rate, taxable, tax):
    """Append one ``DatiRiepilogo`` (VAT summary) element."""
    riepilogo = ET.SubElement(parent, "DatiRiepilogo")
    ET.SubElement(riepilogo, "AliquotaIVA").text = f"{rate:.2f}"
    ET.SubElement(riepilogo, "ImponibileImporto").text = f"{taxable:.2f}"
    ET.SubElement(riepilogo, "Imposta").text = f"{tax:.2f}"
    ET.SubElement(riepilogo, "EsigibilitaIVA").text = "I"


def generate_fattura_xml(data):
    """Generate FatturaPA XML from invoice data.

    Args:
        data: Dict with the form values (``seller_*``, ``buyer_*``,
            ``invoice_number``, ``invoice_date``, ``doc_type``,
            ``currency``, ``total_amount``, ``vat_amount``, ``line_items``).
            Missing, None or empty values fall back to placeholders.

    Returns:
        The pretty-printed XML document as a str (UTF-8 declaration).

    Notes:
        * The VAT summary (DatiRiepilogo) is derived from the line items,
          one entry per VAT rate.  Only when there are no line items is it
          derived from ``total_amount``/``vat_amount`` at 22%.
        * CAP, Comune and Provincia are fixed placeholders because the form
          does not collect them; a 0% rate also needs a ``Natura`` code.
          Both must be completed manually before sending to SdI.
    """
    # Create root element
    root = ET.Element("p:FatturaElettronica")
    root.set("versione", "FPR12")
    root.set("xmlns:ds", "http://www.w3.org/2000/09/xmldsig#")
    root.set("xmlns:p", "http://ivaservizi.agenziaentrate.gov.it/docs/xsd/fatture/v1.2")
    root.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    root.set("xsi:schemaLocation", "http://ivaservizi.agenziaentrate.gov.it/docs/xsd/fatture/v1.2 http://www.fatturapa.gov.it/export/fatturazione/sdi/fatturapa/v1.2/Schema_del_file_xml_FatturaPA_versione_1.2.xsd")

    # FatturaElettronicaHeader
    header = ET.SubElement(root, "FatturaElettronicaHeader")

    # DatiTrasmissione
    dati_trasmissione = ET.SubElement(header, "DatiTrasmissione")
    _add_fiscal_id(dati_trasmissione, "IdTrasmittente", data.get('seller_vat'))
    ET.SubElement(dati_trasmissione, "ProgressivoInvio").text = "1"
    ET.SubElement(dati_trasmissione, "FormatoTrasmissione").text = "FPR12"
    ET.SubElement(dati_trasmissione, "CodiceDestinatario").text = "0000000"

    # CedentePrestatore (seller)
    cedente = ET.SubElement(header, "CedentePrestatore")
    anagrafici_cedente = ET.SubElement(cedente, "DatiAnagrafici")
    _add_fiscal_id(anagrafici_cedente, "IdFiscaleIVA", data.get('seller_vat'))
    seller_tax_code = _clean_text(data.get('seller_tax_code'), "").upper()
    if seller_tax_code:
        ET.SubElement(anagrafici_cedente, "CodiceFiscale").text = seller_tax_code
    anagrafica_cedente = ET.SubElement(anagrafici_cedente, "Anagrafica")
    ET.SubElement(anagrafica_cedente, "Denominazione").text = _clean_text(
        data.get('seller_name'), 'Denominazione Cedente')
    ET.SubElement(anagrafici_cedente, "RegimeFiscale").text = "RF01"
    _add_sede(cedente, _clean_text(data.get('seller_address'), 'Via Roma 1'),
              "00100", "Roma", "RM")

    # CessionarioCommittente (buyer)
    cessionario = ET.SubElement(header, "CessionarioCommittente")
    anagrafici_cessionario = ET.SubElement(cessionario, "DatiAnagrafici")
    if _clean_text(data.get('buyer_vat'), ""):
        _add_fiscal_id(anagrafici_cessionario, "IdFiscaleIVA", data.get('buyer_vat'))
    buyer_tax_code = _clean_text(data.get('buyer_tax_code'), "").upper()
    if buyer_tax_code:
        ET.SubElement(anagrafici_cessionario, "CodiceFiscale").text = buyer_tax_code
    anagrafica_cessionario = ET.SubElement(anagrafici_cessionario, "Anagrafica")
    ET.SubElement(anagrafica_cessionario, "Denominazione").text = _clean_text(
        data.get('buyer_name'), 'Denominazione Cessionario')
    _add_sede(cessionario, _clean_text(data.get('buyer_address'), 'Via Milano 1'),
              "20100", "Milano", "MI")

    # FatturaElettronicaBody
    body = ET.SubElement(root, "FatturaElettronicaBody")

    # DatiGenerali
    dati_generali = ET.SubElement(body, "DatiGenerali")
    documento = ET.SubElement(dati_generali, "DatiGeneraliDocumento")
    ET.SubElement(documento, "TipoDocumento").text = data.get('doc_type') or 'TD01'
    ET.SubElement(documento, "Divisa").text = data.get('currency') or 'EUR'
    invoice_date = data.get('invoice_date') or datetime.now().date()
    ET.SubElement(documento, "Data").text = invoice_date.strftime('%Y-%m-%d')
    ET.SubElement(documento, "Numero").text = _clean_text(data.get('invoice_number'), '1')
    total_amount = _to_float(data.get('total_amount'), 0.0)
    if total_amount > 0:
        # Optional element; carries the document total entered in the form.
        ET.SubElement(documento, "ImportoTotaleDocumento").text = f"{total_amount:.2f}"

    # DatiBeniServizi: all DettaglioLinee first, then the DatiRiepilogo.
    dati_beni_servizi = ET.SubElement(body, "DatiBeniServizi")
    taxable_by_rate = {}
    for line_number, item in enumerate(data.get('line_items') or [], start=1):
        quantity = _to_float(item.get('quantity'), 1.0)
        price = _to_float(item.get('price'), 0.0)
        rate = round(_to_float(item.get('vat_rate'), 22.0), 2)
        line_total = round(quantity * price, 2)

        dettaglio = ET.SubElement(dati_beni_servizi, "DettaglioLinee")
        ET.SubElement(dettaglio, "NumeroLinea").text = str(line_number)
        ET.SubElement(dettaglio, "Descrizione").text = _clean_text(
            item.get('description'), f'Riga {line_number}')
        ET.SubElement(dettaglio, "Quantita").text = f"{quantity:.2f}"
        ET.SubElement(dettaglio, "PrezzoUnitario").text = f"{price:.2f}"
        ET.SubElement(dettaglio, "PrezzoTotale").text = f"{line_total:.2f}"
        ET.SubElement(dettaglio, "AliquotaIVA").text = f"{rate:.2f}"

        taxable_by_rate[rate] = taxable_by_rate.get(rate, 0.0) + line_total

    if taxable_by_rate:
        # One summary per VAT rate, consistent with the printed line totals.
        for rate in sorted(taxable_by_rate):
            taxable = round(taxable_by_rate[rate], 2)
            _add_riepilogo(dati_beni_servizi, rate, taxable,
                           round(taxable * rate / 100, 2))
    else:
        vat_amount = _to_float(data.get('vat_amount'), 0.0)
        _add_riepilogo(dati_beni_servizi, 22.0, total_amount - vat_amount, vat_amount)

    # Convert to string with pretty formatting
    rough_string = ET.tostring(root, encoding='unicode')
    reparsed = minidom.parseString(rough_string)
    pretty_xml = reparsed.toprettyxml(indent="  ", encoding="UTF-8").decode("utf-8")
    # Remove empty lines
    pretty_lines = [line for line in pretty_xml.split('\n') if line.strip()]
    return '\n'.join(pretty_lines)


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------

# Session-state keys of the header widgets; also the keys of the form data
# handed to generate_fattura_xml().
_HEADER_FIELD_KEYS = (
    'seller_name', 'seller_vat', 'seller_tax_code', 'seller_address',
    'buyer_name', 'buyer_vat', 'buyer_tax_code', 'buyer_address',
    'invoice_number', 'invoice_date', 'doc_type', 'currency',
    'total_amount', 'vat_amount',
)


def _extract_pdf_text(uploaded_file):
    """Return the text of every PDF page, joined by newlines."""
    reader = PyPDF2.PdfReader(uploaded_file)
    # extract_text() may return None for pages without a text layer.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def _new_line_item(**values):
    """Build a line-item dict carrying a session-unique ``uid``.

    The uid (not the row position) keys the row's widgets, so removing a
    row does not make the remaining rows inherit its widget state.
    """
    state = st.session_state
    uid = state.get('next_line_uid', 0)
    state['next_line_uid'] = uid + 1
    item = {'description': '', 'quantity': 1, 'price': 0, 'vat_rate': 22}
    item.update(values)
    item['uid'] = uid
    return item


def _load_invoice(uploaded_file):
    """Parse *uploaded_file* once and seed the session state from it.

    Returns True when parsed data for the current file is available.
    """
    state = st.session_state
    file_id = (uploaded_file.name, uploaded_file.size)
    if state.get('source_file') == file_id:
        return True

    try:
        text = _extract_pdf_text(uploaded_file)
    except Exception as e:  # PyPDF2 raises many error types on bad input
        st.error(f"Error processing PDF: {str(e)}")
        return False

    parsed = parse_invoice_text(text)
    # Keyed widgets keep their previous value and ignore ``value=``; drop
    # the old entries so the form shows the newly uploaded invoice.
    for key in _HEADER_FIELD_KEYS:
        if key in state:
            del state[key]
    state['source_file'] = file_id
    state['pdf_text'] = text
    state['parsed_invoice'] = parsed
    state['line_items'] = [
        _new_line_item(**item) for item in (parsed.get('line_items') or [{}])
    ]
    return True


def _render_sidebar():
    """Render the informational sidebar."""
    with st.sidebar:
        st.header("ℹ️ Information")
        st.markdown(
            "This tool converts Italian invoices (fatture) from PDF format to the official\n"
            "FatturaPA XML format as required by the Italian tax authority.\n\n"
            "**Supported features:**\n"
            "- Extract invoice data from PDF\n"
            "- Generate FatturaPA v1.2.2 compliant XML\n"
            "- Validate basic XML structure\n"
            "- Download generated XML file\n\n"
            "**Note:** This is a basic implementation. Complex invoices may require manual review."
        )


def _render_header_form(parsed):
    """Render seller, buyer and document fields pre-filled from *parsed*."""
    st.header("📊 Parsed Invoice Data")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Seller Information")
        st.text_input("Company Name", value=parsed.get('seller_name', ''), key='seller_name')
        st.text_input("VAT Number", value=parsed.get('seller_vat', ''), key='seller_vat')
        st.text_input("Tax Code", value=parsed.get('seller_tax_code', ''), key='seller_tax_code')
        st.text_area("Address", value=parsed.get('seller_address', ''), key='seller_address')
    with col2:
        st.subheader("Buyer Information")
        st.text_input("Company Name", value=parsed.get('buyer_name', ''), key='buyer_name')
        st.text_input("VAT Number", value=parsed.get('buyer_vat', ''), key='buyer_vat')
        st.text_input("Tax Code", value=parsed.get('buyer_tax_code', ''), key='buyer_tax_code')
        st.text_area("Address", value=parsed.get('buyer_address', ''), key='buyer_address')

    st.subheader("Invoice Details")
    col3, col4, col5 = st.columns(3)
    with col3:
        st.text_input("Invoice Number", value=parsed.get('invoice_number', ''), key='invoice_number')
        st.date_input("Invoice Date",
                      value=parsed.get('invoice_date') or datetime.now().date(),
                      key='invoice_date')
    with col4:
        st.selectbox("Document Type",
                     options=['TD01', 'TD02', 'TD03', 'TD04', 'TD05', 'TD06'],
                     index=0,
                     help="TD01=Invoice, TD02=Advance/Down Payment on invoice, "
                          "TD03=Advance/Down Payment on fee note, TD04=Credit Note, "
                          "TD05=Debit Note, TD06=Fee note (Parcella)",
                     key='doc_type')
        st.selectbox("Currency", options=['EUR', 'USD', 'GBP'], index=0, key='currency')
    with col5:
        st.number_input("Total Amount", value=float(parsed.get('total_amount', 0)), key='total_amount')
        st.number_input("VAT Amount", value=float(parsed.get('vat_amount', 0)), key='vat_amount')


def _render_line_items():
    """Render the editable line-item rows and handle row removal."""
    state = st.session_state
    st.subheader("Invoice Line Items")
    removed_uid = None
    for position, item in enumerate(state.line_items, start=1):
        uid = item['uid']
        cols = st.columns([3, 1, 1, 1, 1])
        with cols[0]:
            item['description'] = st.text_input(
                f"Description {position}", value=item.get('description', ''), key=f'desc_{uid}')
        # Values are cast to float so the inputs accept decimals; an int
        # default would put st.number_input in integer mode.
        with cols[1]:
            item['quantity'] = st.number_input(
                f"Qty {position}", value=float(item.get('quantity', 1)), key=f'qty_{uid}')
        with cols[2]:
            item['price'] = st.number_input(
                f"Price {position}", value=float(item.get('price', 0.0)), key=f'price_{uid}')
        with cols[3]:
            item['vat_rate'] = st.number_input(
                f"VAT% {position}", value=float(item.get('vat_rate', 22)), key=f'vat_{uid}')
        with cols[4]:
            if st.button(f"Remove {position}", key=f'remove_{uid}'):
                removed_uid = uid
    if removed_uid is not None:
        # Rebuild the list after the loop instead of popping while iterating.
        state.line_items = [it for it in state.line_items if it['uid'] != removed_uid]
        st.rerun()


def _render_xml_output():
    """Build the XML from the current form values and offer it for download."""
    state = st.session_state
    form_data = {key: state[key] for key in _HEADER_FIELD_KEYS}
    form_data['line_items'] = state.line_items
    try:
        xml_content = generate_fattura_xml(form_data)
    except Exception as e:  # UI boundary: report instead of crashing the app
        st.error(f"Error generating XML: {str(e)}")
        return

    st.header("📋 Generated FatturaPA XML")
    st.code(xml_content, language='xml')
    # Invoice numbers often contain "/" (e.g. 12/2024), invalid in file names.
    safe_number = re.sub(r'[^\w\-]+', '_', str(form_data['invoice_number'])).strip('_')
    st.download_button(
        label="💾 Download XML",
        data=xml_content,
        file_name=f"Fattura_{safe_number or 'unnumbered'}_{form_data['invoice_date'].strftime('%Y%m%d')}.xml",
        mime="application/xml"
    )


def main():
    """Run the Streamlit app: upload a PDF, review the data, export XML."""
    # Configure page (must be the first Streamlit call of the script).
    st.set_page_config(
        page_title="Fatture PDF to XML Converter",
        page_icon="📄",
        layout="wide"
    )
    st.title("🇮🇹 Fatture PDF to XML Converter")
    st.markdown("Convert Italian invoices (fatture) from PDF to FatturaPA XML format")
    _render_sidebar()

    # File upload
    uploaded_file = st.file_uploader(
        "Upload your Italian invoice PDF",
        type=['pdf'],
        help="Select a PDF file containing an Italian invoice (fattura)"
    )
    if uploaded_file is not None and _load_invoice(uploaded_file):
        state = st.session_state
        with st.expander("📄 Extracted PDF Text", expanded=False):
            st.text_area("Raw text from PDF:", state['pdf_text'], height=200)

        _render_header_form(state['parsed_invoice'])
        _render_line_items()

        col_add, col_gen = st.columns([1, 1])
        with col_add:
            if st.button("Add Line Item"):
                state.line_items.append(_new_line_item())
                st.rerun()
        with col_gen:
            generate_clicked = st.button("🔄 Generate XML", type="primary")
        if generate_clicked:
            _render_xml_output()

    st.write("Fatture PDF to XML Converter is running!")


# ``streamlit run`` executes this file with __name__ == "__main__".
if __name__ == "__main__":
    main()
# Hi! I can help you with any questions about Streamlit and Python. What would you like to know?