class BEExtractor
Belgium-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Belgian invoices in multiple languages (English, French, Dutch).
/tf/active/vicechatdev/invoice_extraction/extractors/be_extractor.py
11 - 463
complex
Purpose
This class extends BaseExtractor to provide specialized extraction of invoice data from Belgian invoices. It handles Belgian-specific formats including VAT numbers (BE format), European date formats (DD/MM/YYYY), European number formats (1.234,56), and standard Belgian VAT rates (21%, 12%, 6%, 0%). The extractor uses a comprehensive LLM-based approach to parse invoice text and tables, with fallback methods for robust extraction. It processes vendor information, invoice metadata, amounts, payment details, and line items while ensuring data validation and format standardization.
Source Code
class BEExtractor(BaseExtractor):
"""Belgium-specific invoice data extractor using pure LLM approach."""
def __init__(self, config=None):
    """
    Set up the extractor with an LLM client and Belgian-specific defaults.

    Args:
        config: Optional configuration dictionary forwarded to the parent
            constructor. Its 'llm' entry (if any) is passed to LLMClient
            when a client has to be created here.
    """
    super().__init__(config)

    # Only create an LLM client when the parent has not already provided one.
    # getattr also covers a parent that never defines the attribute, where a
    # plain `self.llm_client is None` check would raise AttributeError.
    if getattr(self, 'llm_client', None) is None:
        self.llm_client = LLMClient(self.config.get('llm', {}))

    # Belgian-specific configuration
    self.default_currency = 'EUR'

    # Standard VAT rates in Belgium, used when validating extracted rates
    self.vat_rates = [21, 12, 6, 0]
def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
    """
    Extract invoice data from the document with Belgian-specific processing.

    Args:
        document: Processed document from DocumentProcessor. Read for a
            'text' entry and/or a 'pages' list whose items may carry
            'text' and 'tables'.
        language: Detected language of the document ('en', 'fr', 'nl')

    Returns:
        Dict containing the extracted invoice fields plus 'metadata' and
        'confidence'. When the document holds no text, the empty result
        structure is returned instead.
    """
    logger.info(f"Extracting data from Belgian invoice in {language}")

    # Get full text of the document
    full_text = self._get_full_document_text(document)
    if not full_text:
        logger.warning("No text content found in document")
        return self._empty_extraction_result(language)

    # Collect tables from every page. `or []` keeps a 'pages' or 'tables'
    # entry that is present but None from breaking the iteration.
    tables = []
    for page in document.get('pages') or []:
        tables.extend(page.get('tables') or [])
    table_text = self._format_table_content(tables)

    # Extract all data using comprehensive LLM approach
    extraction_result = self._extract_all_invoice_data(full_text, table_text, language)

    # Add metadata
    extraction_result['metadata'] = {
        'language': language,
        'extraction_method': self.__class__.__name__
    }

    # Add confidence scores
    extraction_result['confidence'] = self.calculate_confidence(extraction_result)
    return extraction_result
def _get_full_document_text(self, document: Dict[str, Any]) -> str:
"""Extract full text from document."""
# If text is directly available in the document
if document.get('text'):
return document['text']
# Otherwise, collect text from all pages
full_text = []
for page in document.get('pages', []):
if page.get('text'):
full_text.append(page['text'])
return "\n\n".join(full_text)
def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
"""Format tables as text to provide additional structure to the LLM."""
if not tables:
return ""
table_texts = []
for i, table in enumerate(tables):
rows = []
current_row = []
current_row_number = 0
# Sort cells by row and column
cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
for cell in cells:
row = cell.get('row', 0)
if row > current_row_number:
if current_row:
rows.append(" | ".join(current_row))
current_row = []
current_row_number = row
current_row.append(cell.get('text', '').strip())
if current_row:
rows.append(" | ".join(current_row))
table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
return "\n\n".join(table_texts)
def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
"""Return an empty extraction result structure."""
return {
'invoice': {},
'vendor': {},
'amounts': {},
'payment': {},
'line_items': [],
'metadata': {
'language': language,
'extraction_method': self.__class__.__name__
},
'confidence': 0.0
}
def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
"""
Extract all invoice data using a comprehensive LLM approach.
Args:
full_text: Full text of the document
table_text: Formatted table content if available
language: Detected language of the document
Returns:
Dictionary with all extracted invoice data
"""
# Provide language-specific context to improve extraction
language_context = {
'en': "This is a Belgian invoice in English.",
'fr': "This is a Belgian invoice in French. Common terms: facture (invoice), montant (amount), TVA (VAT), date d'échéance (due date).",
'nl': "This is a Belgian invoice in Dutch. Common terms: factuur (invoice), bedrag (amount), BTW (VAT), vervaldatum (due date)."
}.get(language, "This is a Belgian invoice.")
# Create comprehensive extraction prompt
prompt = f"""# Belgian Invoice Data Extraction
{language_context}
Analyze the following invoice text and extract all required information according to Belgian invoice standards.
## Important Belgian Invoice Characteristics:
- VAT numbers format: BE 0XXX.XXX.XXX or BE0XXXXXXXXX
- Date formats: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY
- Number formats: European (1.234,56) for amounts
- Standard VAT rates: 21% (standard), 12%, 6%, 0% (exempt)
- Belgian IBAN format: BEXX XXXX XXXX XXXX
## Invoice Text:
{full_text}
## Tables Detected:
{table_text}
## Required Output:
Extract and return a valid JSON object with the following structure:
{{
"invoice": {{
"number": "extracted invoice number",
"issue_date": "issue date in YYYY-MM-DD format",
"due_date": "due date in YYYY-MM-DD format",
"reference": "client reference or PO number"
}},
"vendor": {{
"name": "vendor company name",
"vat_number": "BE formatted VAT number",
"address": "complete vendor address",
"contact": "contact information"
}},
"amounts": {{
"subtotal": numeric value (before VAT),
"vat": numeric value (VAT amount),
"total": numeric value (including VAT),
"currency": "currency code (default EUR)",
"vat_rate": numeric value (percentage)
}},
"payment": {{
"bank_name": "bank name",
"iban": "BE formatted IBAN",
"bic": "BIC/SWIFT code",
"payment_terms": "payment terms",
"communication": "payment reference"
}},
"line_items": [
{{
"description": "item description",
"quantity": numeric value,
"unit_price": numeric value,
"vat_rate": numeric value,
"amount": numeric value
}}
...
]
}}
Convert all amounts from Belgian format (1.234,56) to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""
# Call LLM with comprehensive extraction prompt
response = self.llm_client.generate(prompt)
# Parse response
try:
extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
return self._post_process_extraction(extraction_result)
except Exception as e:
logger.error(f"Failed to parse LLM extraction result: {e}")
# Attempt to extract partial results with a more structured prompt
return self._fallback_extraction(full_text, table_text, language)
def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
"""Perform post-processing on the extracted data."""
result = {
'invoice': {},
'vendor': {},
'amounts': {},
'payment': {},
'line_items': []
}
# Copy extracted data
for section in ['invoice', 'vendor', 'amounts', 'payment']:
if section in extraction_result and isinstance(extraction_result[section], dict):
result[section] = extraction_result[section]
if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
result['line_items'] = extraction_result['line_items']
# Process dates to ensure consistent format
for date_field in ['issue_date', 'due_date']:
if result.get('invoice', {}).get(date_field):
try:
date_str = result['invoice'][date_field]
# Check if already in ISO format
if '-' in date_str and len(date_str) == 10:
parts = date_str.split('-')
if len(parts) == 3 and len(parts[0]) == 4:
# Already in YYYY-MM-DD format
continue
# Try to parse and standardize date
parsed_date = self._parse_date(date_str)
if parsed_date:
result['invoice'][date_field] = parsed_date
except Exception as e:
logger.warning(f"Failed to process date {date_field}: {e}")
# Ensure currency defaults to EUR
if 'amounts' in result and not result['amounts'].get('currency'):
result['amounts']['currency'] = self.default_currency
# Validate VAT rate
if result.get('amounts', {}).get('vat_rate') is not None:
vat_rate = result['amounts']['vat_rate']
try:
vat_rate = float(vat_rate)
# Check if close to standard Belgian VAT rates
closest_rate = min(self.vat_rates, key=lambda x: abs(x - vat_rate))
if abs(closest_rate - vat_rate) < 1: # Within 1% tolerance
result['amounts']['vat_rate'] = closest_rate
except:
pass
return result
def _fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
"""
Fallback method to extract invoice data in multiple smaller LLM calls.
Used when comprehensive extraction fails.
"""
logger.info("Using fallback extraction method")
result = {
'invoice': self._extract_invoice_metadata(full_text),
'vendor': self._extract_vendor_data(full_text),
'amounts': self._extract_amounts(full_text),
'payment': self._extract_payment_data(full_text),
'line_items': self._extract_line_items(full_text, table_text)
}
return result
def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
"""Extract invoice metadata using LLM."""
prompt = f"""Extract the following invoice metadata from this Belgian invoice text:
- invoice_number: The invoice number (facture no./factuurnr.)
- issue_date: The date the invoice was issued (date/datum)
- due_date: The date payment is due (échéance/vervaldatum)
- reference: Any client reference or PO number
Return ONLY a valid JSON object with these fields. Format dates as YYYY-MM-DD.
Invoice text:
{text[:2000]}
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract invoice metadata")
return {}
def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
"""Extract vendor data using LLM."""
prompt = f"""Extract the vendor information from this Belgian invoice text:
- name: The vendor/supplier company name
- vat_number: The Belgian VAT number (format: BE 0XXX.XXX.XXX)
- address: The full address of the vendor
- contact: Email or phone for contact
Return ONLY a valid JSON object with these fields.
Invoice text:
{text[:2000]}
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract vendor data")
return {}
def _extract_amounts(self, text: str) -> Dict[str, Any]:
"""Extract amount information using LLM."""
prompt = f"""Extract the financial information from this Belgian invoice:
- subtotal: The amount before VAT/BTW (montant hors TVA/bedrag excl. BTW)
- total: The total amount due including VAT (montant total/totaalbedrag)
- vat: The VAT amount (montant TVA/BTW bedrag)
- vat_rate: The VAT percentage rate (e.g., 21 for 21%)
- currency: The currency code (EUR, USD, etc.)
Convert all amounts from Belgian format (1.234,56) to standard decimal (1234.56).
Return ONLY a valid JSON object with numeric values (not strings) for amounts.
Invoice text:
{text}
"""
response = self.llm_client.generate(prompt)
try:
result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
# Ensure amounts are numeric
for field in ['subtotal', 'total', 'vat', 'vat_rate']:
if field in result and result[field] is not None:
try:
result[field] = float(result[field])
except:
result[field] = None
return result
except:
logger.warning("Failed to extract amounts")
return {}
def _extract_payment_data(self, text: str) -> Dict[str, Any]:
"""Extract payment information using LLM."""
prompt = f"""Extract the payment information from this Belgian invoice:
- bank_name: The name of the bank
- iban: The Belgian IBAN (format: BEXX XXXX XXXX XXXX)
- bic: The BIC/SWIFT code
- payment_terms: Payment terms (e.g., "30 jours/dagen")
- communication: Any payment reference/communication
Return ONLY a valid JSON object with these fields.
Invoice text:
{text}
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract payment data")
return {}
def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
"""Extract line items using LLM."""
# Use table text if available, otherwise use full text
context = table_text if table_text else text
prompt = f"""Extract the line items from this Belgian invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.
Return ONLY a valid JSON array of line items objects with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price
- vat_rate: VAT percentage (numeric)
- amount: Total amount for line item (numeric)
Convert all numbers from Belgian format (1.234,56) to standard decimal (1234.56).
Return an empty array [] if no line items can be identified.
Invoice content:
{context}
"""
response = self.llm_client.generate(prompt)
try:
result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
# Ensure numeric fields are properly formatted
for item in result:
for field in ['quantity', 'unit_price', 'vat_rate', 'amount']:
if field in item and item[field] is not None:
try:
item[field] = float(item[field])
except:
item[field] = None
return result
except:
logger.warning("Failed to extract line items")
return []
def _parse_date(self, date_str: str) -> Optional[str]:
"""Parse a date string in various formats and return ISO format."""
if not date_str:
return None
date_str = date_str.strip()
# Common date formats in Belgium
date_formats = [
'%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
'%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
'%Y/%m/%d', '%Y.%m.%d'
]
# Try all formats
for fmt in date_formats:
try:
date_obj = datetime.datetime.strptime(date_str, fmt)
return date_obj.strftime('%Y-%m-%d')
except ValueError:
continue
# If standard formats fail, rely on LLM to parse the date
prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
If it's already in ISO format, just return it.
Return ONLY the date in YYYY-MM-DD format, nothing else."""
try:
response = self.llm_client.generate(prompt)
date_match = response.strip()
# Validate format with simple regex check
if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
return date_match
except:
pass
# If all parsing attempts fail
return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | BaseExtractor | - | |
Parameter Details
config: Optional configuration dictionary that can contain LLM settings under 'llm' key. If not provided or if llm_client is not initialized by parent BaseExtractor, a new LLMClient will be created. Configuration is passed to parent BaseExtractor constructor.
Return Value
The constructor returns a BEExtractor instance. The main extract() method returns a dictionary with keys: 'invoice' (metadata like number, dates), 'vendor' (name, VAT, address), 'amounts' (subtotal, VAT, total, currency), 'payment' (bank details, IBAN, terms), 'line_items' (array of item details), 'metadata' (language, extraction method), and 'confidence' (float score). All dates are in ISO format (YYYY-MM-DD), amounts are numeric decimals, and Belgian formats are standardized.
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the BEExtractor with configuration, set up LLM client, and define Belgian-specific settings
Parameters:
config: Optional dictionary containing configuration, particularly 'llm' key for LLM settings
Returns: None - initializes instance
extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]
Purpose: Main extraction method that processes a document and extracts all invoice data with Belgian-specific handling
Parameters:
document: Processed document dictionary from DocumentProcessor containing 'text' or 'pages' with text and optional tables
language: Detected language code ('en', 'fr', 'nl') for language-specific processing
Returns: Dictionary with keys: invoice, vendor, amounts, payment, line_items, metadata, confidence
_get_full_document_text(self, document: Dict[str, Any]) -> str
Purpose: Extract and concatenate all text content from document structure
Parameters:
document: Document dictionary with 'text' key or 'pages' array
Returns: Concatenated text string from all pages
_format_table_content(self, tables: List[Dict[str, Any]]) -> str
Purpose: Convert table structures into formatted text representation for LLM processing
Parameters:
tables: List of table dictionaries containing 'cells' with row, column, and text data
Returns: Formatted string representation of tables with pipe-separated values
_empty_extraction_result(self, language: str) -> Dict[str, Any]
Purpose: Generate an empty result structure when extraction fails or no content is found
Parameters:
language: Language code to include in metadata
Returns: Dictionary with empty invoice, vendor, amounts, payment sections and zero confidence
_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]
Purpose: Perform comprehensive LLM-based extraction of all invoice data in a single call
Parameters:
full_text: Complete document text
table_text: Formatted table content
language: Language code for context-specific prompting
Returns: Dictionary with all extracted invoice sections or falls back to multi-step extraction
_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]
Purpose: Validate and standardize extracted data including date formats, currency defaults, and VAT rate validation
Parameters:
extraction_result: Raw extraction result from LLM
Returns: Cleaned and validated extraction result with standardized formats
_fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]
Purpose: Alternative extraction method using multiple smaller LLM calls when comprehensive extraction fails
Parameters:
full_text: Complete document text
table_text: Formatted table content
language: Language code
Returns: Dictionary with extracted data from multiple targeted LLM calls
_extract_invoice_metadata(self, text: str) -> Dict[str, Any]
Purpose: Extract invoice-specific metadata (number, dates, reference) using targeted LLM call
Parameters:
text: Invoice text (only the first 2000 characters are included in the LLM prompt)
Returns: Dictionary with invoice_number, issue_date, due_date, reference
_extract_vendor_data(self, text: str) -> Dict[str, Any]
Purpose: Extract vendor information (name, VAT, address, contact) using targeted LLM call
Parameters:
text: Invoice text (only the first 2000 characters are included in the LLM prompt)
Returns: Dictionary with name, vat_number, address, contact
_extract_amounts(self, text: str) -> Dict[str, Any]
Purpose: Extract financial amounts (subtotal, VAT, total) with format conversion using targeted LLM call
Parameters:
text: Full invoice text
Returns: Dictionary with numeric subtotal, total, vat, vat_rate, currency
_extract_payment_data(self, text: str) -> Dict[str, Any]
Purpose: Extract payment information (bank, IBAN, BIC, terms) using targeted LLM call
Parameters:
text: Full invoice text
Returns: Dictionary with bank_name, iban, bic, payment_terms, communication
_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]
Purpose: Extract invoice line items from tables or text using targeted LLM call
Parameters:
text: Full invoice text
table_text: Formatted table content (preferred if available)
Returns: List of dictionaries with description, quantity, unit_price, vat_rate, amount
_parse_date(self, date_str: str) -> Optional[str]
Purpose: Parse date strings in various Belgian formats and convert to ISO format (YYYY-MM-DD)
Parameters:
date_str: Date string in any common Belgian format (DD/MM/YYYY, DD-MM-YYYY, etc.)
Returns: ISO formatted date string (YYYY-MM-DD) or None if parsing fails
calculate_confidence(self, extraction_result: Dict[str, Any]) -> float
Purpose: Inherited from BaseExtractor - calculates confidence score for extraction quality
Parameters:
extraction_result: Extraction result dictionary
Returns: Float confidence score between 0.0 and 1.0
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| llm_client | LLMClient | Client for making LLM API calls, initialized from config or inherited from parent | instance |
| default_currency | str | Default currency code for Belgian invoices, set to 'EUR' | instance |
| vat_rates | List[int] | Standard Belgian VAT rates [21, 12, 6, 0] used for validation | instance |
| config | Dict | Configuration dictionary inherited from BaseExtractor | instance |
Dependencies
logging, json, typing, datetime
Required Imports
import logging
import json
from typing import Dict, List, Any, Optional
import datetime
from extractors.base_extractor import BaseExtractor
from utils.llm_client import LLMClient
Usage Example
from extractors.be_extractor import BEExtractor
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
# Initialize extractor with optional config
config = {
'llm': {
'model': 'gpt-4',
'api_key': 'your-api-key'
}
}
extractor = BEExtractor(config)
# Prepare document (from DocumentProcessor)
document = {
'text': 'Invoice text content...',
'pages': [
{
'text': 'Page 1 content...',
'tables': [
{
'cells': [
{'row': 0, 'column': 0, 'text': 'Description'},
{'row': 0, 'column': 1, 'text': 'Amount'}
]
}
]
}
]
}
# Extract invoice data
result = extractor.extract(document, language='fr')
# Access extracted data
print(f"Invoice Number: {result['invoice'].get('number')}")
print(f"Vendor: {result['vendor'].get('name')}")
print(f"Total: {result['amounts'].get('total')} {result['amounts'].get('currency')}")
print(f"Confidence: {result['confidence']}")
# Access line items
for item in result['line_items']:
print(f"{item['description']}: {item['amount']}")
Best Practices
- Always initialize with proper LLM configuration to ensure API access
- The extract() method is the main entry point - call it with a properly formatted document dictionary from DocumentProcessor
- Document must contain either 'text' key or 'pages' array with text content
- Language parameter should be 'en', 'fr', or 'nl' for optimal extraction
- Check confidence scores in results to assess extraction quality
- The class uses fallback extraction methods automatically if comprehensive extraction fails
- All dates are standardized to ISO format (YYYY-MM-DD) regardless of input format
- Belgian number formats (1.234,56) are automatically converted to decimal (1234.56)
- VAT rates are validated against standard Belgian rates (21%, 12%, 6%, 0%): an extracted rate within 1 percentage point of a standard rate is snapped to it; other values are left unchanged
- Currency defaults to EUR if not detected
- Handle potential None values in extracted fields gracefully
- The extractor maintains state through instance attributes (default_currency, vat_rates)
- LLM calls may fail - the class includes error handling and fallback mechanisms
- For large documents, extraction may take time due to multiple LLM calls in fallback mode
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class TestBEExtractor (83.9% similar)
- class BEValidator (77.0% similar)
- class AUExtractor (72.5% similar)
- class BaseExtractor (71.7% similar)
- class TestBEValidator (68.6% similar)