class RegulatoryExtractor
A class for extracting structured metadata from regulatory guideline PDF documents using LLM-based analysis and storing the results in an Excel tracking spreadsheet.
File: /tf/active/vicechatdev/reg_extractor.py
Lines: 11-383
Complexity: complex
Purpose
RegulatoryExtractor automates the process of analyzing regulatory guideline PDFs (from agencies like FDA, EMA, MHRA, TGA) to extract key metadata such as title, reference number, jurisdiction, adoption date, effective date, and summary. It handles both text-based and scanned PDFs (via OCR), uses OpenAI's GPT models for intelligent data extraction, and maintains a deduplicated Excel tracking table. The class manages token limits for large documents and generates Filecloud URLs for document references.
Source Code
class RegulatoryExtractor:
    """
    Class for extracting structured data from regulatory guideline PDFs
    and storing results in an Excel tracking table.
    """

    def __init__(self,
                 api_key: Optional[str] = None,
                 model: str = "gpt-4o",
                 excel_path: str = "regulatory_tracking.xlsx"):
        """
        Initialize the RegulatoryExtractor.
        Args:
            api_key: OpenAI API key (if None, looks for OPENAI_API_KEY env variable)
            model: LLM model to use for extraction
            excel_path: Path to the Excel tracking file
        """
        load_dotenv()  # Load environment variables
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("API key must be provided or set as OPENAI_API_KEY environment variable")
        self.model = model
        self.excel_path = excel_path
        self.client = OpenAI(api_key=self.api_key)
        self.logger = logging.getLogger(__name__)
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text content from a PDF file with OCR fallback for scanned documents.
        Uses tiktoken for precise token counting.
        Args:
            pdf_path: Path to the PDF file
        Returns:
            Extracted text content
        """
        self.logger.info(f"Extracting text from {pdf_path}")
        try:
            # Initialize tiktoken encoding for token counting
            encoding = tiktoken.get_encoding("cl100k_base")
            # First try standard text extraction
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text() + "\n"
            # Check if the text extraction yielded minimal text
            # Use tiktoken to count tokens per page for more accuracy
            tokens = len(encoding.encode(text))
            avg_tokens_per_page = tokens / len(pdf_reader.pages) if pdf_reader.pages else 0
            if avg_tokens_per_page < 50:  # Very low token count indicates possible scan
                self.logger.info(f"Minimal text extracted ({avg_tokens_per_page:.2f} tokens/page), trying OCR fallback")
                text = self._extract_text_with_ocr(pdf_path)
                tokens = len(encoding.encode(text))
            # Check token limit and truncate if necessary
            if tokens > 70000:  # Reduced from 100k to ensure we stay within model limits
                self.logger.warning(f"PDF text is very long ({tokens} tokens), truncating...")
                # Calculate how many tokens to keep from beginning and end
                half_tokens = 35000  # Half of our target size
                # Extract beginning tokens (up to half_tokens)
                beginning_text = encoding.decode(encoding.encode(text)[:half_tokens])
                # Extract ending tokens
                end_text = encoding.decode(encoding.encode(text)[-half_tokens:])
                # Combine with a note about truncation
                text = beginning_text + "\n\n...[content truncated]...\n\n" + end_text
                # Verify the final token count
                final_tokens = len(encoding.encode(text))
                self.logger.info(f"Truncated text to {final_tokens} tokens")
            return text
        except Exception as e:
            self.logger.error(f"Error extracting text: {str(e)}")
            # Try OCR as fallback for any extraction errors
            try:
                self.logger.info(f"Extraction error, trying OCR fallback for {pdf_path}")
                return self._extract_text_with_ocr(pdf_path)
            except Exception as ocr_err:
                self.logger.error(f"OCR fallback also failed: {str(ocr_err)}")
                raise

    def _extract_text_with_ocr(self, pdf_path: str) -> str:
        """
        Extract text from PDF using OCR via llmsherpa API.
        Based on logic from the offline_docstore_multi_vice module.
        Args:
            pdf_path: Path to the PDF file
        Returns:
            OCR extracted text
        """
        self.logger.info(f"Using OCR to extract text from {pdf_path}")
        try:
            # Use llmsherpa for OCR-based extraction
            llmsherpa_api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes&applyOcr=yes"
            # Import llmsherpa if available
            try:
                from llmsherpa.readers import LayoutPDFReader
                pdf_reader = LayoutPDFReader(llmsherpa_api_url)
            except ImportError:
                self.logger.error("llmsherpa module not found. Install with: pip install llmsherpa")
                raise ImportError("llmsherpa module required for OCR processing")
            # Extract text using llmsherpa
            doc = pdf_reader.read_pdf(pdf_path)
            # Combine all text chunks with formatting improvements
            all_text = []
            text_chunk_interim = ""
            min_chunk_len = 4000  # Similar to offline_docstore_multi_vice
            for chunk in doc.chunks():
                if hasattr(chunk, 'to_text'):
                    # Clean and normalize the text
                    clean_text = chunk.to_text().replace("- ", "").replace("\n", " ")
                    text_chunk_interim = clean_text if text_chunk_interim == "" else text_chunk_interim + "\n" + clean_text
                    # Add chunk when it reaches minimum length
                    if len(text_chunk_interim) > min_chunk_len:
                        all_text.append(text_chunk_interim)
                        text_chunk_interim = ""
            # Add any remaining text
            if text_chunk_interim:
                all_text.append(text_chunk_interim)
            # Join all text with line breaks
            combined_text = "\n\n".join(all_text)
            # Count tokens in the extracted text
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens = len(encoding.encode(combined_text))
            self.logger.info(f"Successfully extracted {len(combined_text)} chars / {tokens} tokens using OCR")
            return combined_text
        except Exception as e:
            self.logger.error(f"OCR extraction error: {str(e)}")
            raise

    def extract_guideline_data(self, text: str) -> Dict[str, Any]:
        """
        Use LLM to extract structured data from guideline text.
        Args:
            text: Text content from the guideline
        Returns:
            Dictionary containing extracted fields
        """
        self.logger.info("Extracting structured data using LLM")
        prompt = """
        You are a specialized data extraction assistant for regulatory documentation. Analyze the provided regulatory guideline document and extract the following specific fields:
        1. Title: The official name of the guideline
        2. Reference Number: Any identifying code, number, or reference ID
        3. Jurisdiction: Whether it originates from EU, US, AU, UK, or other regulatory body
        4. Adoption Date: When the guideline was officially adopted/published
        5. Effective Date: When the guideline comes into force/effect
        6. Summary: A brief summary of the guideline content, maximum 1 paragraph long
        Return ONLY a valid JSON object with these fields. If any information is unclear or missing, use null for that field. Format dates as YYYY-MM-DD when possible.
        Example response format:
        {
            "title": "Full title of the guideline",
            "referenceNumber": "GUID-2023-01",
            "jurisdiction": "EU",
            "adoptionDate": "2023-05-15",
            "effectiveDate": "2024-01-01",
            "summary": "Brief summary of the guideline content."
        }
        Important:
        - Look for dates with context like "adopted on", "published on", "comes into effect on", "effective from"
        - Jurisdiction may be indicated by mentions of agencies (FDA, EMA, MHRA, TGA)
        - If dates appear in different formats, standardize to YYYY-MM-DD
        Here is the document text:
        """ + text
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a precise data extraction tool for regulatory documents."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0,  # Use low temperature for precise extraction
            )
            content = response.choices[0].message.content
            # Try to extract just the JSON part from the response
            # (in case the model includes other text)
            try:
                # Find the first { and the last }
                start_idx = content.find('{')
                end_idx = content.rfind('}') + 1
                if start_idx >= 0 and end_idx > 0:
                    json_str = content[start_idx:end_idx]
                    data = json.loads(json_str)
                else:
                    # If no JSON delimiters found, try parsing the whole response
                    data = json.loads(content)
                return data
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse JSON from response: {content}")
                raise ValueError("LLM did not return valid JSON")
        except Exception as e:
            self.logger.error(f"Error during LLM extraction: {str(e)}")
            raise

    def update_excel_tracking(self, data: Dict[str, Any], pdf_path: str, orig_file: str) -> None:
        """
        Update Excel tracking table with extracted data.
        Only adds new entries if the source file doesn't already exist.
        Args:
            data: Extracted guideline data
            pdf_path: Path to the source PDF file
            orig_file: Original (filecloud) file path used for URL generation
        """
        self.logger.info(f"Updating Excel tracking file: {self.excel_path}")
        # Add source file information
        data['sourceFile'] = os.path.basename(pdf_path)
        source_file = data['sourceFile']
        # Generate document URL using the same logic from OneCo_hybrid_RAG
        data['documentURL'] = self.generate_filecloud_url(orig_file)
        # Load existing Excel file if it exists, or create new DataFrame
        if os.path.exists(self.excel_path):
            try:
                df = pd.read_excel(self.excel_path)
            except Exception as e:
                self.logger.error(f"Error reading Excel file: {str(e)}")
                df = pd.DataFrame()
        else:
            df = pd.DataFrame()
        # Check if this source file already exists in the DataFrame
        if 'sourceFile' in df.columns and source_file in df['sourceFile'].values:
            self.logger.info(f"Entry for {source_file} already exists in tracking file. Skipping.")
            return
        # Convert data dict to DataFrame and append
        new_row = pd.DataFrame([data])
        df = pd.concat([df, new_row], ignore_index=True)
        # Write back to Excel
        try:
            df.to_excel(self.excel_path, index=False)
            self.logger.info(f"Successfully updated tracking file with data for {data.get('title')}")
        except Exception as e:
            self.logger.error(f"Error writing to Excel file: {str(e)}")
            raise

    def generate_filecloud_url(self, filepath: str) -> str:
        """
        Generate a Filecloud URL for the given file path using the same logic
        from OneCo_hybrid_RAG.
        Args:
            filepath: Path to the file
        Returns:
            Filecloud URL for the file
        """
        # Create file basename for display
        filename = os.path.basename(filepath)
        # Escape spaces in filename with + for the first part
        encoded_filename = filename.replace(' ', '+')
        # Extract directory path without filename
        directory_path = os.path.dirname(filepath)
        # Ensure path ends with '/'
        if directory_path and not directory_path.endswith('/'):
            directory_path += '/'
        # Encode path for the second part (after #expl-tabl.)
        encoded_path = directory_path
        encoded_path = encoded_path.replace(' ', '%20')
        # Construct the full URL
        file_url = f"https://filecloud.vicebio.com/ui/core/index.html?filter={encoded_filename}#expl-tabl.{encoded_path}"
        return file_url

    def process_pdf(self, pdf_path: str, orig_file: str) -> Dict[str, Any]:
        """
        Process a single PDF file - extract text, extract data, update Excel.
        Args:
            pdf_path: Path to the PDF file
            orig_file: Original (filecloud) file path used for URL generation
        Returns:
            Extracted data dictionary
        """
        self.logger.info(f"Processing regulatory guideline PDF: {pdf_path}")
        try:
            # Extract text from PDF
            text = self.extract_text_from_pdf(pdf_path)
            # Extract structured data using LLM
            data = self.extract_guideline_data(text)
            # Update Excel tracking
            self.update_excel_tracking(data, pdf_path, orig_file)
            return data
        except Exception as e:
            self.logger.error(f"Error processing {pdf_path}: {str(e)}")
            raise

    def process_directory(self, directory_path: str) -> List[Dict[str, Any]]:
        """
        Process all PDF files in a directory.
        Args:
            directory_path: Path to directory containing PDF files
        Returns:
            List of extracted data dictionaries
        """
        self.logger.info(f"Processing all PDFs in directory: {directory_path}")
        results = []
        for filename in os.listdir(directory_path):
            if filename.lower().endswith('.pdf'):
                pdf_path = os.path.join(directory_path, filename)
                try:
                    # Reuse the local path as orig_file so URL generation has a path to work with
                    data = self.process_pdf(pdf_path, pdf_path)
                    results.append(data)
                except Exception as e:
                    self.logger.error(f"Skipping {filename} due to error: {str(e)}")
                    continue
        self.logger.info(f"Processed {len(results)} PDF files successfully")
        return results
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
api_key: Optional OpenAI API key string. If not provided, the class will attempt to load it from the OPENAI_API_KEY environment variable. Required for LLM-based extraction functionality.
model: The OpenAI model identifier to use for extraction (default: 'gpt-4o'). Should be a model capable of handling structured data extraction tasks with JSON output.
excel_path: Path to the Excel file where extracted regulatory data will be stored (default: 'regulatory_tracking.xlsx'). File will be created if it doesn't exist. Existing entries are preserved and duplicates are prevented based on source filename.
Return Value
Instantiation returns a RegulatoryExtractor object configured with the specified API key, model, and Excel tracking path. Key method return values: extract_text_from_pdf() returns the extracted text as a string; extract_guideline_data() returns a Dict with keys 'title', 'referenceNumber', 'jurisdiction', 'adoptionDate', 'effectiveDate', and 'summary'; process_pdf() returns the extracted data dictionary; process_directory() returns a List of data dictionaries for all processed PDFs.
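For reference, the record returned by extract_guideline_data() and process_pdf() can be modeled roughly as below. This is an illustrative sketch only; the class itself returns plain dicts and does not define such a type, and update_excel_tracking() later adds 'sourceFile' and 'documentURL' keys.
from typing import Optional, TypedDict

class GuidelineRecord(TypedDict):
    # Any field may be None when the LLM cannot find the information in the document.
    title: Optional[str]
    referenceNumber: Optional[str]
    jurisdiction: Optional[str]   # e.g. "EU", "US", "AU", "UK"
    adoptionDate: Optional[str]   # "YYYY-MM-DD" when the model can normalize the date
    effectiveDate: Optional[str]
    summary: Optional[str]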
Class Interface
Methods
__init__(self, api_key: Optional[str] = None, model: str = 'gpt-4o', excel_path: str = 'regulatory_tracking.xlsx')
Purpose: Initialize the RegulatoryExtractor with API credentials, model selection, and Excel tracking file path. Sets up OpenAI client, logger, and loads environment variables.
Parameters:
api_key: Optional OpenAI API key, falls back to OPENAI_API_KEY env var
model: OpenAI model identifier for extraction
excel_path: Path to Excel tracking file
Returns: None (constructor)
extract_text_from_pdf(self, pdf_path: str) -> str
Purpose: Extract text content from a PDF file with automatic OCR fallback for scanned documents. Handles token counting and truncation for large documents.
Parameters:
pdf_path: Path to the PDF file to extract text from
Returns: Extracted text content as string, truncated to ~70k tokens if necessary with middle content removed
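The head-and-tail truncation can be reproduced in isolation with tiktoken. A minimal sketch; the 70,000/35,000 limits mirror the constants hard-coded in the method and are not configurable parameters of the class:
import tiktoken

def truncate_head_tail(text: str, max_tokens: int = 70000) -> str:
    """Keep the first and last max_tokens // 2 tokens, dropping the middle."""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    half = max_tokens // 2
    head = encoding.decode(tokens[:half])
    tail = encoding.decode(tokens[-half:])
    return head + "\n\n...[content truncated]...\n\n" + tail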
_extract_text_with_ocr(self, pdf_path: str) -> str
Purpose: Private method to extract text from PDF using OCR via llmsherpa API. Used as fallback when standard extraction fails or yields minimal text.
Parameters:
pdf_path: Path to the PDF file to process with OCR
Returns: OCR-extracted text content as string with cleaned formatting
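Although private, the OCR path can be exercised directly for a document that is known to be scanned. A sketch, assuming the llmsherpa service is reachable at the URL hard-coded in the method and reusing the module name from the usage example below:
from regulatory_extractor import RegulatoryExtractor

extractor = RegulatoryExtractor()  # assumes OPENAI_API_KEY is set in the environment
# Bypass the tokens-per-page heuristic and force OCR extraction.
ocr_text = extractor._extract_text_with_ocr("/path/to/scanned_guideline.pdf")
print(f"OCR extracted {len(ocr_text)} characters")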
extract_guideline_data(self, text: str) -> Dict[str, Any]
Purpose: Use LLM to extract structured metadata from guideline text. Sends text to OpenAI API with specialized prompt for regulatory document analysis.
Parameters:
text: Text content from the regulatory guideline document
Returns: Dictionary with keys: 'title', 'referenceNumber', 'jurisdiction', 'adoptionDate', 'effectiveDate', 'summary'. Missing fields are null.
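The JSON-recovery step (parsing the substring between the first '{' and the last '}') can be reproduced on its own. A minimal sketch mirroring the logic inside the method; the helper name is illustrative, not part of the class:
import json

def parse_llm_json(content: str) -> dict:
    """Recover a JSON object even if the model wrapped it in extra prose."""
    start = content.find('{')
    end = content.rfind('}') + 1
    candidate = content[start:end] if start >= 0 and end > 0 else content
    return json.loads(candidate)

# Example:
# parse_llm_json('Here you go:\n{"title": "X", "jurisdiction": "EU"}')
# -> {'title': 'X', 'jurisdiction': 'EU'}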
update_excel_tracking(self, data: Dict[str, Any], pdf_path: str, orig_file: str) -> None
Purpose: Update Excel tracking table with extracted data. Prevents duplicates by checking if source file already exists. Adds sourceFile and documentURL fields.
Parameters:
data: Extracted guideline data dictionary
pdf_path: Path to the source PDF file (local)
orig_file: Original file path for URL generation (filecloud path)
Returns: None. Modifies Excel file on disk.
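The duplicate check reduces to a pandas membership test on the sourceFile column. A standalone sketch of the same logic; the helper name is illustrative, not part of the class:
import os
import pandas as pd

def already_tracked(excel_path: str, pdf_path: str) -> bool:
    """Return True if the PDF's basename is already recorded in the tracking sheet."""
    if not os.path.exists(excel_path):
        return False
    df = pd.read_excel(excel_path)
    return 'sourceFile' in df.columns and os.path.basename(pdf_path) in df['sourceFile'].values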
generate_filecloud_url(self, filepath: str) -> str
Purpose: Generate a Filecloud URL for the given file path with proper encoding for spaces and special characters.
Parameters:
filepath: Path to the file in filecloud system
Returns: Formatted Filecloud URL string pointing to the document
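A quick illustration of the encoding rules; the input path and resulting URL are hypothetical examples derived by following the method's logic:
from regulatory_extractor import RegulatoryExtractor

extractor = RegulatoryExtractor()  # assumes OPENAI_API_KEY is set in the environment
url = extractor.generate_filecloud_url("/Regulatory/EMA Guidelines/ICH Q9 Risk Management.pdf")
# Spaces in the filename become '+', spaces in the directory become '%20':
# https://filecloud.vicebio.com/ui/core/index.html?filter=ICH+Q9+Risk+Management.pdf#expl-tabl./Regulatory/EMA%20Guidelines/
print(url)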
process_pdf(self, pdf_path: str, orig_file: str) -> Dict[str, Any]
Purpose: Complete workflow to process a single PDF file: extract text, extract structured data, and update Excel tracking.
Parameters:
pdf_path: Path to the PDF file to process (local path)
orig_file: Original file path for URL generation (filecloud path)
Returns: Dictionary containing all extracted guideline metadata
process_directory(self, directory_path: str) -> List[Dict[str, Any]]
Purpose: Process all PDF files in a directory. Continues processing even if individual files fail.
Parameters:
directory_path: Path to directory containing PDF files
Returns: List of extracted data dictionaries for all successfully processed PDFs
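When the tracking sheet should link to a filecloud location rather than the local path, a manual loop over process_pdf() gives per-file control over orig_file. A sketch; the directory paths and filecloud prefix are assumed examples and must match your own layout:
import os
from regulatory_extractor import RegulatoryExtractor

extractor = RegulatoryExtractor()  # assumes OPENAI_API_KEY is set in the environment
local_dir = "/path/to/guidelines"
filecloud_prefix = "/filecloud/Regulatory/Guidelines"  # assumed mapping, adjust to your environment

for name in os.listdir(local_dir):
    if not name.lower().endswith(".pdf"):
        continue
    local_path = os.path.join(local_dir, name)
    try:
        extractor.process_pdf(local_path, orig_file=f"{filecloud_prefix}/{name}")
    except Exception as exc:  # skip-and-continue, like process_directory
        print(f"Skipping {name}: {exc}")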
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| api_key | str | OpenAI API key used for LLM requests | instance |
| model | str | OpenAI model identifier (e.g., 'gpt-4o') used for extraction | instance |
| excel_path | str | Path to the Excel tracking file where results are stored | instance |
| client | OpenAI | Initialized OpenAI client instance for API calls | instance |
| logger | logging.Logger | Logger instance for tracking operations and errors | instance |
Dependencies
os, json, pandas, PyPDF2, typing, logging, openai, dotenv, tiktoken, llmsherpa
Required Imports
import os
import json
import pandas as pd
import PyPDF2
from typing import List, Dict, Any, Optional, Union
import logging
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
Conditional/Optional Imports
These imports are only needed under specific conditions:
from llmsherpa.readers import LayoutPDFReader
Condition: only needed when processing scanned PDFs or when standard text extraction yields minimal content (< 50 tokens per page). Requires llmsherpa API service running at http://llmsherpa:5001
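To check the OCR dependency up front rather than at extraction time, the same guarded import used inside _extract_text_with_ocr can be run at startup. A small sketch:
try:
    from llmsherpa.readers import LayoutPDFReader  # only needed for the OCR fallback
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False  # scanned PDFs will fail until `pip install llmsherpa`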
Usage Example
# Basic usage
from regulatory_extractor import RegulatoryExtractor
# Initialize with API key from environment
extractor = RegulatoryExtractor(
    model='gpt-4o',
    excel_path='my_tracking.xlsx'
)
# Process a single PDF
data = extractor.process_pdf(
    pdf_path='/path/to/guideline.pdf',
    orig_file='/filecloud/path/to/guideline.pdf'
)
print(f"Extracted: {data['title']}")
# Process entire directory
results = extractor.process_directory('/path/to/guidelines/')
print(f"Processed {len(results)} guidelines")
# Manual workflow for custom processing
text = extractor.extract_text_from_pdf('guideline.pdf')
structured_data = extractor.extract_guideline_data(text)
extractor.update_excel_tracking(structured_data, 'guideline.pdf', '/original/path/guideline.pdf')
Best Practices
- Always provide either api_key parameter or set OPENAI_API_KEY environment variable before instantiation
- The class automatically prevents duplicate entries in Excel based on source filename - safe to reprocess directories
- For scanned PDFs, ensure llmsherpa service is running and accessible at the configured URL
- Large PDFs (>70k tokens) are automatically truncated to first and last 35k tokens to stay within model limits
- Use process_pdf() with both pdf_path (local file) and orig_file (original filecloud path) for proper URL generation
- The class uses logging extensively - configure logging level before instantiation if needed
- Excel file is read and written for each PDF processed - for batch processing, consider the I/O overhead
- Token counting uses tiktoken with the cl100k_base encoding for accuracy with OpenAI models (see the sketch after this list)
- OCR fallback is triggered automatically when standard extraction yields <50 tokens per page
- Method call order for manual processing: extract_text_from_pdf() -> extract_guideline_data() -> update_excel_tracking()
- The extract_guideline_data() method uses temperature=0.0 for deterministic extraction results
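A small sketch for checking ahead of time whether a document will hit the truncation limit, using the same encoding the class uses (the file name is a placeholder for text already extracted from a PDF):
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
with open("guideline_text.txt", encoding="utf-8") as fh:
    text = fh.read()
n_tokens = len(encoding.encode(text))
print(f"{n_tokens} tokens -> {'will be truncated' if n_tokens > 70000 else 'fits in one pass'}")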
Similar Components
AI-powered semantic similarity - components with related functionality:
- class QueryBasedExtractor (61.3% similar)
- class DocumentExtractor (59.5% similar)
- function extract_previous_reports_summary (54.0% similar)
- function test_document_extractor (52.9% similar)
- function test_multiple_files (52.6% similar)