BEExtractor - Code Extractor

class BEExtractor

Maturity: 56

Belgium-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Belgian invoices in multiple languages (English, French, Dutch).

File:
/tf/active/vicechatdev/invoice_extraction/extractors/be_extractor.py

Lines:
11 - 463

Complexity:
complex

Purpose

This class extends BaseExtractor to provide specialized extraction of invoice data from Belgian invoices. It handles Belgian-specific formats including VAT numbers (BE format), European date formats (DD/MM/YYYY), European number formats (1.234,56), and standard Belgian VAT rates (21%, 12%, 6%, 0%). The extractor uses a comprehensive LLM-based approach to parse invoice text and tables, with fallback methods for robust extraction. It processes vendor information, invoice metadata, amounts, payment details, and line items while ensuring data validation and format standardization.

Source Code

class BEExtractor(BaseExtractor):
    """Belgium-specific invoice data extractor using pure LLM approach."""
    
    def __init__(self, config=None):
        """
        Initialise the extractor on top of the BaseExtractor setup.

        Args:
            config: Optional configuration handed to the BaseExtractor
                constructor. The 'llm' entry of self.config is used when this
                class has to create the LLM client itself.
        """
        super().__init__(config)

        # The parent may already have supplied a client; build one only if it did not
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)

        # Reference VAT percentages used when validating extracted rates
        self.vat_rates = [21, 12, 6, 0]

        # Currency applied when the invoice does not state one
        self.default_currency = 'EUR'
    
    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Run the Belgian extraction pipeline on a processed document.

        The document text and any page tables are gathered, handed to the
        LLM-based extraction, and the result is completed with metadata and
        a confidence score.

        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document ('en', 'fr', 'nl')

        Returns:
            Dict containing extracted invoice fields; an empty result
            structure when the document holds no text
        """
        logger.info(f"Extracting data from Belgian invoice in {language}")

        document_text = self._get_full_document_text(document)
        if not document_text:
            # Nothing to send to the LLM
            logger.warning("No text content found in document")
            return self._empty_extraction_result(language)

        # Gather the tables of every page into one flat list
        all_tables = [
            table
            for page in document.get('pages', [])
            for table in page.get('tables', [])
        ]
        formatted_tables = self._format_table_content(all_tables)

        invoice_data = self._extract_all_invoice_data(document_text, formatted_tables, language)

        # Record how and for which language the data was produced
        invoice_data['metadata'] = {
            'language': language,
            'extraction_method': self.__class__.__name__
        }

        # Confidence is computed on the result including its metadata
        invoice_data['confidence'] = self.calculate_confidence(invoice_data)

        return invoice_data
    
    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """Extract full text from document."""
        # If text is directly available in the document
        if document.get('text'):
            return document['text']
        
        # Otherwise, collect text from all pages
        full_text = []
        for page in document.get('pages', []):
            if page.get('text'):
                full_text.append(page['text'])
        
        return "\n\n".join(full_text)
    
    def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
        """Format tables as text to provide additional structure to the LLM."""
        if not tables:
            return ""
            
        table_texts = []
        for i, table in enumerate(tables):
            rows = []
            current_row = []
            current_row_number = 0
            
            # Sort cells by row and column
            cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
            
            for cell in cells:
                row = cell.get('row', 0)
                if row > current_row_number:
                    if current_row:
                        rows.append(" | ".join(current_row))
                    current_row = []
                    current_row_number = row
                
                current_row.append(cell.get('text', '').strip())
            
            if current_row:
                rows.append(" | ".join(current_row))
                
            table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
        
        return "\n\n".join(table_texts)
    
    def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
        """Return an empty extraction result structure."""
        return {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': [],
            'metadata': {
                'language': language,
                'extraction_method': self.__class__.__name__
            },
            'confidence': 0.0
        }
    
    def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Extract all invoice data using a comprehensive LLM approach.
        
        Args:
            full_text: Full text of the document
            table_text: Formatted table content if available
            language: Detected language of the document
            
        Returns:
            Dictionary with all extracted invoice data
        """
        # Provide language-specific context to improve extraction
        language_context = {
            'en': "This is a Belgian invoice in English.",
            'fr': "This is a Belgian invoice in French. Common terms: facture (invoice), montant (amount), TVA (VAT), date d'échéance (due date).",
            'nl': "This is a Belgian invoice in Dutch. Common terms: factuur (invoice), bedrag (amount), BTW (VAT), vervaldatum (due date)."
        }.get(language, "This is a Belgian invoice.")
        
        # Create comprehensive extraction prompt
        prompt = f"""# Belgian Invoice Data Extraction

{language_context}

Analyze the following invoice text and extract all required information according to Belgian invoice standards.

## Important Belgian Invoice Characteristics:
- VAT numbers format: BE 0XXX.XXX.XXX or BE0XXXXXXXXX
- Date formats: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY
- Number formats: European (1.234,56) for amounts
- Standard VAT rates: 21% (standard), 12%, 6%, 0% (exempt)
- Belgian IBAN format: BEXX XXXX XXXX XXXX

## Invoice Text:
{full_text}

## Tables Detected:
{table_text}

## Required Output:
Extract and return a valid JSON object with the following structure:

{{
  "invoice": {{
    "number": "extracted invoice number",
    "issue_date": "issue date in YYYY-MM-DD format",
    "due_date": "due date in YYYY-MM-DD format",
    "reference": "client reference or PO number"
  }},
  "vendor": {{
    "name": "vendor company name",
    "vat_number": "BE formatted VAT number",
    "address": "complete vendor address",
    "contact": "contact information"
  }},
  "amounts": {{
    "subtotal": numeric value (before VAT),
    "vat": numeric value (VAT amount),
    "total": numeric value (including VAT),
    "currency": "currency code (default EUR)",
    "vat_rate": numeric value (percentage)
  }},
  "payment": {{
    "bank_name": "bank name",
    "iban": "BE formatted IBAN",
    "bic": "BIC/SWIFT code",
    "payment_terms": "payment terms",
    "communication": "payment reference"
  }},
  "line_items": [
    {{
      "description": "item description",
      "quantity": numeric value,
      "unit_price": numeric value,
      "vat_rate": numeric value,
      "amount": numeric value
    }}
    ...
  ]
}}

Convert all amounts from Belgian format (1.234,56) to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""

        # Call LLM with comprehensive extraction prompt
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return self._post_process_extraction(extraction_result)
        except Exception as e:
            logger.error(f"Failed to parse LLM extraction result: {e}")
            
            # Attempt to extract partial results with a more structured prompt
            return self._fallback_extraction(full_text, table_text, language)
    
    def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """Perform post-processing on the extracted data."""
        result = {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': []
        }
        
        # Copy extracted data
        for section in ['invoice', 'vendor', 'amounts', 'payment']:
            if section in extraction_result and isinstance(extraction_result[section], dict):
                result[section] = extraction_result[section]
        
        if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
            result['line_items'] = extraction_result['line_items']
        
        # Process dates to ensure consistent format
        for date_field in ['issue_date', 'due_date']:
            if result.get('invoice', {}).get(date_field):
                try:
                    date_str = result['invoice'][date_field]
                    # Check if already in ISO format
                    if '-' in date_str and len(date_str) == 10:
                        parts = date_str.split('-')
                        if len(parts) == 3 and len(parts[0]) == 4:
                            # Already in YYYY-MM-DD format
                            continue
                            
                    # Try to parse and standardize date
                    parsed_date = self._parse_date(date_str)
                    if parsed_date:
                        result['invoice'][date_field] = parsed_date
                except Exception as e:
                    logger.warning(f"Failed to process date {date_field}: {e}")
                    
        # Ensure currency defaults to EUR
        if 'amounts' in result and not result['amounts'].get('currency'):
            result['amounts']['currency'] = self.default_currency
            
        # Validate VAT rate
        if result.get('amounts', {}).get('vat_rate') is not None:
            vat_rate = result['amounts']['vat_rate']
            try:
                vat_rate = float(vat_rate)
                # Check if close to standard Belgian VAT rates
                closest_rate = min(self.vat_rates, key=lambda x: abs(x - vat_rate))
                if abs(closest_rate - vat_rate) < 1:  # Within 1% tolerance
                    result['amounts']['vat_rate'] = closest_rate
            except:
                pass
                
        return result
    
    def _fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Fallback method to extract invoice data in multiple smaller LLM calls.
        Used when comprehensive extraction fails.

        The partial results are passed through _post_process_extraction so
        that the fallback applies the same date normalisation, currency
        default and VAT-rate validation as the comprehensive path.

        Args:
            full_text: Full text of the document
            table_text: Formatted table content if available
            language: Detected language of the document (currently unused here)

        Returns:
            Dictionary with 'invoice', 'vendor', 'amounts', 'payment' and
            'line_items' sections
        """
        logger.info("Using fallback extraction method")
        
        raw_result = {
            'invoice': self._extract_invoice_metadata(full_text),
            'vendor': self._extract_vendor_data(full_text),
            'amounts': self._extract_amounts(full_text),
            'payment': self._extract_payment_data(full_text),
            'line_items': self._extract_line_items(full_text, table_text)
        }

        # The targeted metadata prompt names the field 'invoice_number' while the
        # comprehensive prompt uses 'number'; expose it under both keys.
        invoice = raw_result['invoice']
        if (
            isinstance(invoice, dict)
            and invoice.get('invoice_number') is not None
            and not invoice.get('number')
        ):
            invoice['number'] = invoice['invoice_number']

        return self._post_process_extraction(raw_result)
    
    def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
        """Extract invoice metadata using LLM."""
        prompt = f"""Extract the following invoice metadata from this Belgian invoice text:
- invoice_number: The invoice number (facture no./factuurnr.)
- issue_date: The date the invoice was issued (date/datum)
- due_date: The date payment is due (échéance/vervaldatum)
- reference: Any client reference or PO number

Return ONLY a valid JSON object with these fields. Format dates as YYYY-MM-DD.

Invoice text:
{text[:2000]}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract invoice metadata")
            return {}
    
    def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
        """Extract vendor data using LLM."""
        prompt = f"""Extract the vendor information from this Belgian invoice text:
- name: The vendor/supplier company name
- vat_number: The Belgian VAT number (format: BE 0XXX.XXX.XXX)
- address: The full address of the vendor
- contact: Email or phone for contact

Return ONLY a valid JSON object with these fields.

Invoice text:
{text[:2000]}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract vendor data")
            return {}
    
    def _extract_amounts(self, text: str) -> Dict[str, Any]:
        """Extract amount information using LLM."""
        prompt = f"""Extract the financial information from this Belgian invoice:
- subtotal: The amount before VAT/BTW (montant hors TVA/bedrag excl. BTW)
- total: The total amount due including VAT (montant total/totaalbedrag)
- vat: The VAT amount (montant TVA/BTW bedrag)
- vat_rate: The VAT percentage rate (e.g., 21 for 21%)
- currency: The currency code (EUR, USD, etc.)

Convert all amounts from Belgian format (1.234,56) to standard decimal (1234.56).
Return ONLY a valid JSON object with numeric values (not strings) for amounts.

Invoice text:
{text}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure amounts are numeric
            for field in ['subtotal', 'total', 'vat', 'vat_rate']:
                if field in result and result[field] is not None:
                    try:
                        result[field] = float(result[field])
                    except:
                        result[field] = None
            return result
        except:
            logger.warning("Failed to extract amounts")
            return {}
    
    def _extract_payment_data(self, text: str) -> Dict[str, Any]:
        """Extract payment information using LLM."""
        prompt = f"""Extract the payment information from this Belgian invoice:
- bank_name: The name of the bank
- iban: The Belgian IBAN (format: BEXX XXXX XXXX XXXX)
- bic: The BIC/SWIFT code
- payment_terms: Payment terms (e.g., "30 jours/dagen")
- communication: Any payment reference/communication

Return ONLY a valid JSON object with these fields.

Invoice text:
{text}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract payment data")
            return {}
    
    def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
        """Extract line items using LLM."""
        # Use table text if available, otherwise use full text
        context = table_text if table_text else text
        
        prompt = f"""Extract the line items from this Belgian invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.

Return ONLY a valid JSON array of line items objects with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price
- vat_rate: VAT percentage (numeric)
- amount: Total amount for line item (numeric)

Convert all numbers from Belgian format (1.234,56) to standard decimal (1234.56).
Return an empty array [] if no line items can be identified.

Invoice content:
{context}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure numeric fields are properly formatted
            for item in result:
                for field in ['quantity', 'unit_price', 'vat_rate', 'amount']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
            return result
        except:
            logger.warning("Failed to extract line items")
            return []
    
    def _parse_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various formats and return ISO format."""
        if not date_str:
            return None
            
        date_str = date_str.strip()
        
        # Common date formats in Belgium
        date_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
            '%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
            '%Y/%m/%d', '%Y.%m.%d'
        ]
        
        # Try all formats
        for fmt in date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, rely on LLM to parse the date
        prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
If it's already in ISO format, just return it.
Return ONLY the date in YYYY-MM-DD format, nothing else."""

        try:
            response = self.llm_client.generate(prompt)
            date_match = response.strip()
            # Validate format with simple regex check
            if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
                return date_match
        except:
            pass
                
        # If all parsing attempts fail
        return None

Parameters

Name	Type	Default	Kind
`bases`	BaseExtractor	-

Parameter Details

config: Optional configuration dictionary, passed to the parent BaseExtractor constructor. Its 'llm' key may hold LLM settings; a new LLMClient is created from those settings only when the parent BaseExtractor has not already initialized llm_client.

Return Value

The constructor returns a BEExtractor instance. The main extract() method returns a dictionary with keys: 'invoice' (metadata like number, dates), 'vendor' (name, VAT, address), 'amounts' (subtotal, VAT, total, currency), 'payment' (bank details, IBAN, terms), 'line_items' (array of item details), 'metadata' (language, extraction method), and 'confidence' (float score). All dates are in ISO format (YYYY-MM-DD), amounts are numeric decimals, and Belgian formats are standardized.

Class Interface

Methods

`__init__(self, config=None)`

Purpose: Initialize the BEExtractor with configuration, set up LLM client, and define Belgian-specific settings

Parameters:

config: Optional dictionary containing configuration, particularly 'llm' key for LLM settings

Returns: None - initializes instance

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

Purpose: Main extraction method that processes a document and extracts all invoice data with Belgian-specific handling

Parameters:

document: Processed document dictionary from DocumentProcessor containing 'text' or 'pages' with text and optional tables
language: Detected language code ('en', 'fr', 'nl') for language-specific processing

Returns: Dictionary with keys: invoice, vendor, amounts, payment, line_items, metadata, confidence

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

Purpose: Extract and concatenate all text content from document structure

Parameters:

document: Document dictionary with 'text' key or 'pages' array

Returns: Concatenated text string from all pages

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

Purpose: Convert table structures into formatted text representation for LLM processing

Parameters:

tables: List of table dictionaries containing 'cells' with row, column, and text data

Returns: Formatted string representation of tables with pipe-separated values

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

Purpose: Generate an empty result structure when extraction fails or no content is found

Parameters:

language: Language code to include in metadata

Returns: Dictionary with empty invoice, vendor, amounts, payment sections and zero confidence

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

Purpose: Perform comprehensive LLM-based extraction of all invoice data in a single call

Parameters:

full_text: Complete document text
table_text: Formatted table content
language: Language code for context-specific prompting

Returns: Dictionary with all extracted invoice sections or falls back to multi-step extraction

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

Purpose: Validate and standardize extracted data including date formats, currency defaults, and VAT rate validation

Parameters:

extraction_result: Raw extraction result from LLM

Returns: Cleaned and validated extraction result with standardized formats

`_fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

Purpose: Alternative extraction method using multiple smaller LLM calls when comprehensive extraction fails

Parameters:

full_text: Complete document text
table_text: Formatted table content
language: Language code

Returns: Dictionary with extracted data from multiple targeted LLM calls

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

Purpose: Extract invoice-specific metadata (number, dates, reference) using targeted LLM call

Parameters:

text: Invoice text (first 2000 characters)

Returns: Dictionary with invoice_number, issue_date, due_date, reference

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

Purpose: Extract vendor information (name, VAT, address, contact) using targeted LLM call

Parameters:

text: Invoice text (first 2000 characters)

Returns: Dictionary with name, vat_number, address, contact

`_extract_amounts(self, text: str) -> Dict[str, Any]`

Purpose: Extract financial amounts (subtotal, VAT, total) with format conversion using targeted LLM call

Parameters:

text: Full invoice text

Returns: Dictionary with numeric subtotal, total, vat, vat_rate, currency

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

Purpose: Extract payment information (bank, IBAN, BIC, terms) using targeted LLM call

Parameters:

text: Full invoice text

Returns: Dictionary with bank_name, iban, bic, payment_terms, communication

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

Purpose: Extract invoice line items from tables or text using targeted LLM call

Parameters:

text: Full invoice text
table_text: Formatted table content (preferred if available)

Returns: List of dictionaries with description, quantity, unit_price, vat_rate, amount

`_parse_date(self, date_str: str) -> Optional[str]`

Purpose: Parse date strings in various Belgian formats and convert to ISO format (YYYY-MM-DD)

Parameters:

date_str: Date string in any common Belgian format (DD/MM/YYYY, DD-MM-YYYY, etc.)

Returns: ISO formatted date string (YYYY-MM-DD) or None if parsing fails

`calculate_confidence(self, extraction_result: Dict[str, Any]) -> float`

Purpose: Inherited from BaseExtractor - calculates confidence score for extraction quality

Parameters:

extraction_result: Extraction result dictionary

Returns: Float confidence score between 0.0 and 1.0

Attributes

Name	Type	Description	Scope
`llm_client`	LLMClient	Client for making LLM API calls, initialized from config or inherited from parent	instance
`default_currency`	str	Default currency code for Belgian invoices, set to 'EUR'	instance
`vat_rates`	List[int]	Standard Belgian VAT rates [21, 12, 6, 0] used for validation	instance
`config`	Dict	Configuration dictionary inherited from BaseExtractor	instance

Dependencies

logging
json
typing
datetime

Required Imports

import logging
import json
from typing import Dict, List, Any, Optional
import datetime
from extractors.base_extractor import BaseExtractor
from utils.llm_client import LLMClient

Usage Example

# End-to-end example: build a BEExtractor, run it on one document and read the result.
from extractors.be_extractor import BEExtractor
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Initialize extractor with optional config
# (the 'llm' entry is what BEExtractor passes to LLMClient when it creates one)
config = {
    'llm': {
        'model': 'gpt-4',
        'api_key': 'your-api-key'
    }
}
extractor = BEExtractor(config)

# Prepare document (from DocumentProcessor)
# When a non-empty top-level 'text' is present it is used as the document text;
# page 'text' is only consulted otherwise. Page 'tables' are read in both cases.
document = {
    'text': 'Invoice text content...',
    'pages': [
        {
            'text': 'Page 1 content...',
            'tables': [
                {
                    # Each cell gives its row/column position and its text
                    'cells': [
                        {'row': 0, 'column': 0, 'text': 'Description'},
                        {'row': 0, 'column': 1, 'text': 'Amount'}
                    ]
                }
            ]
        }
    ]
}

# Extract invoice data
result = extractor.extract(document, language='fr')

# Access extracted data (.get() is used because individual fields may be missing)
print(f"Invoice Number: {result['invoice'].get('number')}")
print(f"Vendor: {result['vendor'].get('name')}")
print(f"Total: {result['amounts'].get('total')} {result['amounts'].get('currency')}")
print(f"Confidence: {result['confidence']}")

# Access line items
# NOTE(review): direct indexing assumes every item has 'description' and 'amount' — confirm, or use .get()
for item in result['line_items']:
    print(f"{item['description']}: {item['amount']}")

Best Practices

Always initialize with proper LLM configuration to ensure API access
The extract() method is the main entry point - call it with a properly formatted document dictionary from DocumentProcessor
Document must contain either 'text' key or 'pages' array with text content
Language parameter should be 'en', 'fr', or 'nl' for optimal extraction
Check confidence scores in results to assess extraction quality
The class uses fallback extraction methods automatically if comprehensive extraction fails
All dates are standardized to ISO format (YYYY-MM-DD) regardless of input format
Belgian number formats (1.234,56) are automatically converted to decimal (1234.56)
VAT rates are validated against standard Belgian rates (21%, 12%, 6%, 0%) with 1% tolerance
Currency defaults to EUR if not detected
Handle potential None values in extracted fields gracefully
The extractor maintains state through instance attributes (default_currency, vat_rates)
LLM calls may fail - the class includes error handling and fallback mechanisms
For large documents, extraction may take time due to multiple LLM calls in fallback mode

Similar Components

AI-powered semantic similarity - components with related functionality:

class TestBEExtractor 83.9% similar

Unit test class for testing the BEExtractor class, which extracts structured data from Belgian invoices using LLM-based extraction.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_extractors.py
class BEValidator 77.0% similar

Belgium-specific invoice data validator that extends BaseValidator to implement Belgian invoice validation rules including VAT number format, address verification, IBAN validation, and legal requirements.
From: /tf/active/vicechatdev/invoice_extraction/validators/be_validator.py
class AUExtractor 72.5% similar

Australia-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Australian tax invoices, handling ABN, ACN, GST, BSB numbers and Australian date formats.
From: /tf/active/vicechatdev/invoice_extraction/extractors/au_extractor.py
class BaseExtractor 71.7% similar

Abstract base class that defines the interface and shared functionality for entity-specific invoice data extractors (UK, BE, AU), providing a multi-stage extraction pipeline for invoice processing.
From: /tf/active/vicechatdev/invoice_extraction/extractors/base_extractor.py
class TestBEValidator 68.6% similar

Unit test class for validating the BEValidator class, which validates Belgian invoice extraction results including VAT numbers, addresses, IBAN, currency, and legal requirements.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_validators.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class BEExtractor(BaseExtractor):
    """Belgium-specific invoice data extractor using pure LLM approach."""
    
    def __init__(self, config=None):
        """
        Set up the extractor on top of the base-class initialisation.

        Args:
            config: Optional configuration passed through to the parent
                initialiser; its 'llm' entry is used to build an LLM client
                when the parent did not provide one
        """
        super().__init__(config)

        # Belgian defaults: standard VAT rates used for validation and the
        # currency assumed when none is extracted
        self.vat_rates = [21, 12, 6, 0]
        self.default_currency = 'EUR'

        # The parent may already have created a client; only build one if not
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)
    
    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Extract invoice data from the document with Belgian-specific processing.
        
        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document ('en', 'fr', 'nl')
            
        Returns:
            Dict containing extracted invoice fields
        """
        logger.info(f"Extracting data from Belgian invoice in {language}")
        
        # Get full text of the document
        full_text = self._get_full_document_text(document)
        if not full_text:
            logger.warning("No text content found in document")
            return self._empty_extraction_result(language)
        
        # Extract tables if present
        tables = []
        for page in document.get('pages', []):
            tables.extend(page.get('tables', []))
            
        table_text = self._format_table_content(tables)
        
        # Extract all data using comprehensive LLM approach
        extraction_result = self._extract_all_invoice_data(full_text, table_text, language)
        
        # Add metadata
        extraction_result['metadata'] = {
            'language': language,
            'extraction_method': self.__class__.__name__
        }
        
        # Add confidence scores
        extraction_result['confidence'] = self.calculate_confidence(extraction_result)
        
        return extraction_result
    
    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """Extract full text from document."""
        # If text is directly available in the document
        if document.get('text'):
            return document['text']
        
        # Otherwise, collect text from all pages
        full_text = []
        for page in document.get('pages', []):
            if page.get('text'):
                full_text.append(page['text'])
        
        return "\n\n".join(full_text)
    
    def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
        """Format tables as text to provide additional structure to the LLM."""
        if not tables:
            return ""
            
        table_texts = []
        for i, table in enumerate(tables):
            rows = []
            current_row = []
            current_row_number = 0
            
            # Sort cells by row and column
            cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
            
            for cell in cells:
                row = cell.get('row', 0)
                if row > current_row_number:
                    if current_row:
                        rows.append(" | ".join(current_row))
                    current_row = []
                    current_row_number = row
                
                current_row.append(cell.get('text', '').strip())
            
            if current_row:
                rows.append(" | ".join(current_row))
                
            table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
        
        return "\n\n".join(table_texts)
    
    def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
        """Return an empty extraction result structure."""
        return {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': [],
            'metadata': {
                'language': language,
                'extraction_method': self.__class__.__name__
            },
            'confidence': 0.0
        }
    
    def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Extract all invoice data using a comprehensive LLM approach.
        
        Args:
            full_text: Full text of the document
            table_text: Formatted table content if available
            language: Detected language of the document
            
        Returns:
            Dictionary with all extracted invoice data
        """
        # Provide language-specific context to improve extraction
        language_context = {
            'en': "This is a Belgian invoice in English.",
            'fr': "This is a Belgian invoice in French. Common terms: facture (invoice), montant (amount), TVA (VAT), date d'échéance (due date).",
            'nl': "This is a Belgian invoice in Dutch. Common terms: factuur (invoice), bedrag (amount), BTW (VAT), vervaldatum (due date)."
        }.get(language, "This is a Belgian invoice.")
        
        # Create comprehensive extraction prompt
        prompt = f"""# Belgian Invoice Data Extraction

{language_context}

Analyze the following invoice text and extract all required information according to Belgian invoice standards.

## Important Belgian Invoice Characteristics:
- VAT numbers format: BE 0XXX.XXX.XXX or BE0XXXXXXXXX
- Date formats: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY
- Number formats: European (1.234,56) for amounts
- Standard VAT rates: 21% (standard), 12%, 6%, 0% (exempt)
- Belgian IBAN format: BEXX XXXX XXXX XXXX

## Invoice Text:
{full_text}

## Tables Detected:
{table_text}

## Required Output:
Extract and return a valid JSON object with the following structure:

{{
  "invoice": {{
    "number": "extracted invoice number",
    "issue_date": "issue date in YYYY-MM-DD format",
    "due_date": "due date in YYYY-MM-DD format",
    "reference": "client reference or PO number"
  }},
  "vendor": {{
    "name": "vendor company name",
    "vat_number": "BE formatted VAT number",
    "address": "complete vendor address",
    "contact": "contact information"
  }},
  "amounts": {{
    "subtotal": numeric value (before VAT),
    "vat": numeric value (VAT amount),
    "total": numeric value (including VAT),
    "currency": "currency code (default EUR)",
    "vat_rate": numeric value (percentage)
  }},
  "payment": {{
    "bank_name": "bank name",
    "iban": "BE formatted IBAN",
    "bic": "BIC/SWIFT code",
    "payment_terms": "payment terms",
    "communication": "payment reference"
  }},
  "line_items": [
    {{
      "description": "item description",
      "quantity": numeric value,
      "unit_price": numeric value,
      "vat_rate": numeric value,
      "amount": numeric value
    }}
    ...
  ]
}}

Convert all amounts from Belgian format (1.234,56) to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""

        # Call LLM with comprehensive extraction prompt
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return self._post_process_extraction(extraction_result)
        except Exception as e:
            logger.error(f"Failed to parse LLM extraction result: {e}")
            
            # Attempt to extract partial results with a more structured prompt
            return self._fallback_extraction(full_text, table_text, language)
    
    def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """Perform post-processing on the extracted data."""
        result = {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': []
        }
        
        # Copy extracted data
        for section in ['invoice', 'vendor', 'amounts', 'payment']:
            if section in extraction_result and isinstance(extraction_result[section], dict):
                result[section] = extraction_result[section]
        
        if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
            result['line_items'] = extraction_result['line_items']
        
        # Process dates to ensure consistent format
        for date_field in ['issue_date', 'due_date']:
            if result.get('invoice', {}).get(date_field):
                try:
                    date_str = result['invoice'][date_field]
                    # Check if already in ISO format
                    if '-' in date_str and len(date_str) == 10:
                        parts = date_str.split('-')
                        if len(parts) == 3 and len(parts[0]) == 4:
                            # Already in YYYY-MM-DD format
                            continue
                            
                    # Try to parse and standardize date
                    parsed_date = self._parse_date(date_str)
                    if parsed_date:
                        result['invoice'][date_field] = parsed_date
                except Exception as e:
                    logger.warning(f"Failed to process date {date_field}: {e}")
                    
        # Ensure currency defaults to EUR
        if 'amounts' in result and not result['amounts'].get('currency'):
            result['amounts']['currency'] = self.default_currency
            
        # Validate VAT rate
        if result.get('amounts', {}).get('vat_rate') is not None:
            vat_rate = result['amounts']['vat_rate']
            try:
                vat_rate = float(vat_rate)
                # Check if close to standard Belgian VAT rates
                closest_rate = min(self.vat_rates, key=lambda x: abs(x - vat_rate))
                if abs(closest_rate - vat_rate) < 1:  # Within 1% tolerance
                    result['amounts']['vat_rate'] = closest_rate
            except:
                pass
                
        return result
    
    def _fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Fallback method to extract invoice data in multiple smaller LLM calls.
        Used when comprehensive extraction fails.
        """
        logger.info("Using fallback extraction method")
        
        result = {
            'invoice': self._extract_invoice_metadata(full_text),
            'vendor': self._extract_vendor_data(full_text),
            'amounts': self._extract_amounts(full_text),
            'payment': self._extract_payment_data(full_text),
            'line_items': self._extract_line_items(full_text, table_text)
        }
        
        return result
    
    def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
        """Extract invoice metadata using LLM."""
        prompt = f"""Extract the following invoice metadata from this Belgian invoice text:
- invoice_number: The invoice number (facture no./factuurnr.)
- issue_date: The date the invoice was issued (date/datum)
- due_date: The date payment is due (échéance/vervaldatum)
- reference: Any client reference or PO number

Return ONLY a valid JSON object with these fields. Format dates as YYYY-MM-DD.

Invoice text:
{text[:2000]}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract invoice metadata")
            return {}
    
    def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
        """Extract vendor data using LLM."""
        prompt = f"""Extract the vendor information from this Belgian invoice text:
- name: The vendor/supplier company name
- vat_number: The Belgian VAT number (format: BE 0XXX.XXX.XXX)
- address: The full address of the vendor
- contact: Email or phone for contact

Return ONLY a valid JSON object with these fields.

Invoice text:
{text[:2000]}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract vendor data")
            return {}
    
    def _extract_amounts(self, text: str) -> Dict[str, Any]:
        """Extract amount information using LLM."""
        prompt = f"""Extract the financial information from this Belgian invoice:
- subtotal: The amount before VAT/BTW (montant hors TVA/bedrag excl. BTW)
- total: The total amount due including VAT (montant total/totaalbedrag)
- vat: The VAT amount (montant TVA/BTW bedrag)
- vat_rate: The VAT percentage rate (e.g., 21 for 21%)
- currency: The currency code (EUR, USD, etc.)

Convert all amounts from Belgian format (1.234,56) to standard decimal (1234.56).
Return ONLY a valid JSON object with numeric values (not strings) for amounts.

Invoice text:
{text}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure amounts are numeric
            for field in ['subtotal', 'total', 'vat', 'vat_rate']:
                if field in result and result[field] is not None:
                    try:
                        result[field] = float(result[field])
                    except:
                        result[field] = None
            return result
        except:
            logger.warning("Failed to extract amounts")
            return {}
    
    def _extract_payment_data(self, text: str) -> Dict[str, Any]:
        """Extract payment information using LLM."""
        prompt = f"""Extract the payment information from this Belgian invoice:
- bank_name: The name of the bank
- iban: The Belgian IBAN (format: BEXX XXXX XXXX XXXX)
- bic: The BIC/SWIFT code
- payment_terms: Payment terms (e.g., "30 jours/dagen")
- communication: Any payment reference/communication

Return ONLY a valid JSON object with these fields.

Invoice text:
{text}
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract payment data")
            return {}
    
    def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
        """Extract line items using LLM."""
        # Use table text if available, otherwise use full text
        context = table_text if table_text else text
        
        prompt = f"""Extract the line items from this Belgian invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.

Return ONLY a valid JSON array of line items objects with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price
- vat_rate: VAT percentage (numeric)
- amount: Total amount for line item (numeric)

Convert all numbers from Belgian format (1.234,56) to standard decimal (1234.56).
Return an empty array [] if no line items can be identified.

Invoice content:
{context}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure numeric fields are properly formatted
            for item in result:
                for field in ['quantity', 'unit_price', 'vat_rate', 'amount']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
            return result
        except:
            logger.warning("Failed to extract line items")
            return []
    
    def _parse_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various formats and return ISO format."""
        if not date_str:
            return None
            
        date_str = date_str.strip()
        
        # Common date formats in Belgium
        date_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
            '%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
            '%Y/%m/%d', '%Y.%m.%d'
        ]
        
        # Try all formats
        for fmt in date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, rely on LLM to parse the date
        prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
If it's already in ISO format, just return it.
Return ONLY the date in YYYY-MM-DD format, nothing else."""

        try:
            response = self.llm_client.generate(prompt)
            date_match = response.strip()
            # Validate format with simple regex check
            if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
                return date_match
        except:
            pass
                
        # If all parsing attempts fail
        return None
                        

Improved Code

🔍 Code Extractor

class BEExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, config=None)`

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

`_fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

`_extract_amounts(self, text: str) -> Dict[str, Any]`

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

`_parse_date(self, date_str: str) -> Optional[str]`

`calculate_confidence(self, extraction_result: Dict[str, Any]) -> float`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class TestBEExtractor 83.9% similar

class BEValidator 77.0% similar

class AUExtractor 72.5% similar

class BaseExtractor 71.7% similar

class TestBEValidator 68.6% similar

class BEExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, config=None)

extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]

_get_full_document_text(self, document: Dict[str, Any]) -> str

_format_table_content(self, tables: List[Dict[str, Any]]) -> str

_empty_extraction_result(self, language: str) -> Dict[str, Any]

_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]

_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]

_fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]

_extract_invoice_metadata(self, text: str) -> Dict[str, Any]

_extract_vendor_data(self, text: str) -> Dict[str, Any]

_extract_amounts(self, text: str) -> Dict[str, Any]

_extract_payment_data(self, text: str) -> Dict[str, Any]

_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]

_parse_date(self, date_str: str) -> Optional[str]

calculate_confidence(self, extraction_result: Dict[str, Any]) -> float

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class TestBEExtractor 83.9% similar

class BEValidator 77.0% similar

class AUExtractor 72.5% similar

class BaseExtractor 71.7% similar

class TestBEValidator 68.6% similar

✨ Improve Code: BEExtractor

Code Comparison

`__init__(self, config=None)`

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

`_fallback_extraction(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

`_extract_amounts(self, text: str) -> Dict[str, Any]`

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

`_parse_date(self, date_str: str) -> Optional[str]`

`calculate_confidence(self, extraction_result: Dict[str, Any]) -> float`