class ImprovedProjectVictoriaGenerator
Improved Project Victoria Disclosure Generator with proper reference management.
File: /tf/active/vicechatdev/improved_project_victoria_generator.py
Lines: 147 - 787
Complexity: moderate
Purpose
Improved Project Victoria Disclosure Generator with proper reference management.
Source Code
class ImprovedProjectVictoriaGenerator:
"""
Improved Project Victoria Disclosure Generator with proper reference management.
"""
def __init__(self, pdf_path: str = "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf"):
"""
Initialize the improved disclosure generator.
Args:
pdf_path: Path to the Project Victoria PDF document
"""
self.pdf_path = pdf_path
self.extracted_text = ""
self.warranty_claims = []
self.disclosures = {}
# Initialize reference manager
self.ref_manager = ReferenceManager()
# LLM configuration
self.api_key = "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"
os.environ["OPENAI_API_KEY"] = self.api_key
# Initialize tokenizer for counting tokens
self.tokenizer = tiktoken.get_encoding("cl100k_base")
# Initialize Chroma DB connection
self.init_chroma_connection()
# Cross-encoder for ranking (if available)
if CROSSENCODER_AVAILABLE:
try:
self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
print("ā
Cross-encoder initialized for document reranking")
except Exception as e:
print(f"ā ļø Cross-encoder initialization failed: {e}")
self.cross_encoder = None
else:
self.cross_encoder = None
print(f"Initialized Improved Project Victoria Disclosure Generator")
print(f"PDF path: {pdf_path}")
def init_chroma_connection(self):
"""Initialize connection to Chroma DB."""
try:
# Connect to Chroma DB (assuming it's running on vice_chroma:8000)
self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)
# Set up embedding function
if EMBEDDING_AVAILABLE:
self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
else:
print("ā ļø Using default Chroma embeddings (may not work with custom collections)")
self.chroma_embedder = None
# Get the 99_edr collection
self.edr_collection = self.chroma_client.get_collection(
"99_edr",
embedding_function=self.chroma_embedder
)
print("ā
Successfully connected to Chroma DB and 99_edr collection")
except Exception as e:
print(f"ā Error connecting to Chroma DB: {e}")
print("Attempting to use local fallback or alternative connection...")
self.chroma_client = None
self.edr_collection = None
def extract_pdf_text(self) -> str:
"""Extract text from the Project Victoria PDF document."""
# First try to use existing extracted text file
try:
with open("project_victoria_extracted.txt", "r", encoding="utf-8") as f:
self.extracted_text = f.read()
print(f"ā
Successfully loaded text from existing file: {len(self.extracted_text)} characters")
return self.extracted_text
except FileNotFoundError:
print("š No existing extracted text file found, attempting PDF extraction...")
except Exception as e:
print(f"ā ļø Error reading existing text file: {e}")
# Try PDF extraction if PyMuPDF is available
if PDF_AVAILABLE and fitz:
try:
print(f"Extracting text from PDF: {self.pdf_path}")
# Open the PDF document
doc = fitz.open(self.pdf_path)
text_content = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
text = page.get_text()
text_content.append(f"\n--- Page {page_num + 1} ---\n{text}")
doc.close()
self.extracted_text = "\n".join(text_content)
# Save extracted text for future use
try:
with open("project_victoria_extracted.txt", "w", encoding="utf-8") as f:
f.write(self.extracted_text)
print("š¾ Saved extracted text to project_victoria_extracted.txt")
except Exception as save_error:
print(f"ā ļø Could not save extracted text: {save_error}")
print(f"ā
Successfully extracted {len(self.extracted_text)} characters from PDF")
print(f"š Total pages processed: {len(doc)}")
return self.extracted_text
except Exception as e:
print(f"ā Error extracting PDF text: {e}")
else:
print("ā PyMuPDF not available for PDF extraction")
# If all else fails, provide manual instructions
raise Exception("""
Could not extract text from PDF. Please do one of the following:
1. Install PyMuPDF: pip install PyMuPDF
2. Manually extract text from the PDF and save it as 'project_victoria_extracted.txt'
3. Provide the extracted text file in the working directory
The script will automatically use the text file if available.
""")
def identify_warranty_claims(self) -> List[Dict[str, Any]]:
"""Identify and extract individual warranty claims from the document."""
print("\n" + "="*60)
print("IDENTIFYING WARRANTY CLAIMS")
print("="*60)
# Use LLM to identify warranty claims
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=16000)
warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.
Your task is to extract ALL individual warranty claims from the text. Each warranty claim should be identified as a distinct legal requirement or representation.
Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4")
- "warranty_title": A short descriptive title for the warranty
- "warranty_text": The complete text of the warranty claim (limit to 500 characters)
- "section_name": The main section name (e.g., "THE SHARES; THE SELLERS", "AUTHORITY AND CAPACITY")
Focus on extracting the actual warranty statements, not procedural text or definitions.
IMPORTANT: Keep warranty_text under 500 characters to ensure the JSON response is not truncated.
Here is the document text to analyze:
{document_text}
Return only a valid JSON array of warranty claims. Ensure the response is complete and valid JSON.
"""
# Split text into chunks if too long
max_chunk_size = 30000
text_chunks = self.split_text_for_processing(self.extracted_text, max_chunk_size)
all_warranties = []
for i, chunk in enumerate(text_chunks):
print(f"Processing warranty extraction chunk {i+1}/{len(text_chunks)}")
prompt = warranty_extraction_prompt.format(document_text=chunk)
try:
response = llm.invoke(prompt)
response_text = response.content.strip()
# Clean up response to ensure valid JSON
if response_text.startswith("```json"):
response_text = response_text[7:]
if response_text.endswith("```"):
response_text = response_text[:-3]
# Try to repair truncated JSON
response_text = response_text.strip()
if not response_text.endswith(']'):
last_complete = response_text.rfind('}')  # array items are flat objects, so the last '}' closes the final complete item
if last_complete > 0:
response_text = response_text[:last_complete + 1] + ']'
else:
response_text = response_text + ']'
chunk_warranties = json.loads(response_text)
if isinstance(chunk_warranties, list):
all_warranties.extend(chunk_warranties)
print(f"ā
Extracted {len(chunk_warranties)} warranties from chunk {i+1}")
else:
print(f"ā ļø Unexpected response format from chunk {i+1}")
except json.JSONDecodeError as je:
print(f"ā JSON decode error in chunk {i+1}: {je}")
print(f"Response was: {response_text[:500]}...")
# Try alternative extraction
try:
warranty_pattern = r'\{\s*"warranty_number"[^}]+\}'
matches = re.findall(warranty_pattern, response_text, re.DOTALL)
for match in matches:
try:
warranty = json.loads(match)
all_warranties.append(warranty)
except json.JSONDecodeError:
continue
if matches:
print(f"ā
Recovered {len(matches)} warranties using regex extraction")
except Exception as re_error:
print(f"ā Regex recovery also failed: {re_error}")
except Exception as e:
print(f"ā Error processing chunk {i+1}: {e}")
# Clean up duplicates
if all_warranties:
all_warranties = self.verify_and_clean_warranties(all_warranties)
self.warranty_claims = all_warranties
print(f"ā
Total warranty claims identified: {len(self.warranty_claims)}")
# Display summary
if self.warranty_claims:
print("\nSample warranty claims:")
for i, warranty in enumerate(self.warranty_claims[:3]):
print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
return self.warranty_claims
def verify_and_clean_warranties(self, warranties: List[Dict]) -> List[Dict]:
"""Use LLM to verify and clean up the warranty list."""
print("\nVerifying and cleaning warranty claims...")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=3000)
verification_prompt = """
You are reviewing a list of warranty claims extracted from a legal document.
Please clean up this list by:
1. Removing any duplicates
2. Ensuring each warranty is a distinct legal claim
3. Fixing any formatting issues
4. Ensuring warranty numbers are correct
5. Making sure warranty titles are concise and descriptive
Here is the warranty list to review:
{warranties_json}
Return the cleaned list as a valid JSON array with the same structure.
"""
try:
prompt = verification_prompt.format(warranties_json=json.dumps(warranties, indent=2))
response = llm.invoke(prompt)
response_text = response.content.strip()
# Clean up response
if response_text.startswith("```json"):
response_text = response_text[7:]
if response_text.endswith("```"):
response_text = response_text[:-3]
cleaned_warranties = json.loads(response_text)
print(f"ā
Warranty verification complete: {len(warranties)} ā {len(cleaned_warranties)} claims")
return cleaned_warranties
except Exception as e:
print(f"ā ļø Warranty verification failed: {e}, using original list")
return warranties
def split_text_for_processing(self, text: str, max_tokens: int) -> List[str]:
"""Split text into chunks that fit within token limits."""
tokens = self.tokenizer.encode(text)
chunks = []
for i in range(0, len(tokens), max_tokens):
chunk_tokens = tokens[i:i + max_tokens]
chunk_text = self.tokenizer.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks
def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
"""Search Chroma DB for documents relevant to a specific warranty."""
if not self.edr_collection:
print("ā ļø Chroma DB not available, skipping search")
return []
# Create search queries based on warranty content
warranty_text = warranty.get('warranty_text', '')
warranty_title = warranty.get('warranty_title', '')
# Generate multiple search queries for comprehensive coverage
search_queries = [
warranty_title,
warranty_text[:500], # First part of warranty text
f"{warranty_title} {warranty_text[:200]}" # Combined query
]
# Extract key terms for additional queries
key_terms = self.extract_key_terms_from_warranty(warranty)
if key_terms:
search_queries.extend(key_terms[:3]) # Add top 3 key terms
all_documents = []
retrieved_ids = set()
for query in search_queries:
if not query.strip():
continue
try:
# Query Chroma DB
results = self.edr_collection.query(
query_texts=[query],
n_results=top_k,
include=["documents", "metadatas", "embeddings", "distances"]
)
# Process results
if results['documents'] and len(results['documents'][0]) > 0:
for i, doc_id in enumerate(results['ids'][0]):
if doc_id not in retrieved_ids:
retrieved_ids.add(doc_id)
document_info = {
'id': doc_id,
'content': results['documents'][0][i],
'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
'distance': results['distances'][0][i] if results['distances'] else 1.0,
'query_used': query
}
all_documents.append(document_info)
except Exception as e:
print(f"ā ļø Error querying Chroma DB with query '{query[:50]}...': {e}")
# Rank documents using cross-encoder if available
if all_documents and self.cross_encoder:
all_documents = self.rerank_documents(warranty_text, all_documents)
# Return top documents
return all_documents[:top_k]
def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
"""Extract key terms from warranty text for targeted searching."""
warranty_text = warranty.get('warranty_text', '')
# Use simple keyword extraction based on legal document patterns
key_terms = []
# Common legal/business terms that might be relevant
important_words = [
'incorporation', 'registered', 'authorized', 'shares', 'capital',
'subsidiaries', 'accounts', 'financial', 'liabilities', 'assets',
'compliance', 'regulatory', 'licenses', 'permits', 'agreements',
'contracts', 'intellectual property', 'employment', 'litigation',
'insurance', 'tax', 'environmental', 'data protection'
]
text_lower = warranty_text.lower()
for term in important_words:
if term in text_lower:
key_terms.append(term)
# Extract specific company/legal entity mentions
entity_patterns = [
r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', # Proper names
r'\b[A-Z]{2,}\b', # Acronyms
]
for pattern in entity_patterns:
matches = re.findall(pattern, warranty_text)
key_terms.extend(matches[:3]) # Limit to avoid too many terms
return list(set(key_terms))[:5] # Return unique terms, max 5
def rerank_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
"""Rerank documents using cross-encoder for better relevance."""
if len(documents) <= 1:
return documents
try:
# Prepare query-document pairs for cross-encoder
pairs = [(query, doc['content'][:500]) for doc in documents] # Limit content length
# Get cross-encoder scores
scores = self.cross_encoder.predict(pairs)
# Add scores to documents and sort
for i, doc in enumerate(documents):
doc['cross_encoder_score'] = float(scores[i])
# Sort by cross-encoder score (higher is better)
documents.sort(key=lambda x: x['cross_encoder_score'], reverse=True)
except Exception as e:
print(f"ā ļø Cross-encoder reranking failed: {e}")
return documents
def generate_disclosure_with_references(self, warranty: Dict[str, Any],
relevant_docs: List[Dict]) -> str:
"""
Generate a detailed disclosure for a specific warranty with proper inline references.
Args:
warranty: Warranty claim dictionary
relevant_docs: List of relevant documents from Chroma DB
Returns:
Generated disclosure text with inline references
"""
if not relevant_docs:
return "No relevant supporting documentation found in the knowledge base."
# Add documents to reference manager and create context
context_blocks = []
doc_references = {} # Map doc_id to reference number
for doc in relevant_docs[:10]: # Limit to top 10 docs
ref_num = self.ref_manager.add_document(
doc_id=doc['id'],
content=doc['content'],
metadata=doc.get('metadata', {})
)
doc_references[doc['id']] = ref_num
# Create context block with reference number
context_blocks.append(f"[{ref_num}] {doc['content']}")
context_text = "\n\n".join(context_blocks)
# Create disclosure generation prompt
llm = ChatOpenAI(model="gpt-4o", temperature=0.1, max_tokens=2000)
disclosure_prompt = """
You are a legal and business expert helping to prepare warranty disclosures for a company acquisition.
**TASK**: Generate a detailed disclosure summary for the warranty claim below, based on the provided supporting documentation.
**WARRANTY CLAIM**:
Section: {warranty_number}
Title: {warranty_title}
Text: {warranty_text}
**SUPPORTING DOCUMENTATION**:
{context_text}
**INSTRUCTIONS**:
1. Analyze the warranty claim to understand what needs to be disclosed
2. Review the supporting documentation for relevant information
3. Create a comprehensive disclosure that addresses the warranty requirements
4. Include specific facts, figures, dates, and details from the documentation
5. Use inline citations [1], [2], [3] etc. to reference specific information sources
6. Structure the disclosure clearly with appropriate headings if needed
7. Focus on factual information that supports or relates to the warranty claim
8. If certain aspects of the warranty cannot be fully addressed from the available documentation, note this clearly
**IMPORTANT**: Use ONLY the reference numbers [1], [2], [3] etc. that are shown in the supporting documentation above. Do not create new reference numbers.
**OUTPUT FORMAT**:
Provide a well-structured disclosure in markdown format with:
- Clear, professional language
- Inline citations using [1], [2], [3] format (matching the numbers in the supporting documentation)
- Specific details and facts from the supporting documents
- Appropriate level of detail for legal/business disclosure purposes
**DISCLOSURE SUMMARY**:
"""
prompt = disclosure_prompt.format(
warranty_number=warranty.get('warranty_number', 'N/A'),
warranty_title=warranty.get('warranty_title', ''),
warranty_text=warranty.get('warranty_text', ''),
context_text=context_text
)
try:
response = llm.invoke(prompt)
disclosure_text = response.content.strip()
return disclosure_text
except Exception as e:
print(f"ā Error generating disclosure: {e}")
return f"Error generating disclosure for warranty {warranty.get('warranty_number', 'N/A')}: {str(e)}"
def generate_all_disclosures(self) -> Dict[str, Dict[str, Any]]:
"""Generate disclosures for all identified warranty claims."""
print("\n" + "="*60)
print("GENERATING DISCLOSURES FOR ALL WARRANTIES")
print("="*60)
# Clear reference manager for fresh start
self.ref_manager.clear()
self.disclosures = {}
for i, warranty in enumerate(self.warranty_claims, 1):
warranty_num = warranty.get('warranty_number', f'W{i}')
warranty_title = warranty.get('warranty_title', 'Untitled Warranty')
print(f"\nProcessing warranty {i}/{len(self.warranty_claims)}: {warranty_num} - {warranty_title}")
# Search for relevant documents
relevant_docs = self.search_chroma_for_warranty(warranty)
if relevant_docs:
print(f"Found {len(relevant_docs)} relevant documents")
else:
print("No relevant documents found")
# Generate disclosure with proper references
disclosure = self.generate_disclosure_with_references(warranty, relevant_docs)
self.disclosures[warranty_num] = {
'warranty': warranty,
'disclosure': disclosure,
'source_documents': len(relevant_docs),
'generated_at': datetime.now().isoformat()
}
print(f"ā
Generated disclosure for {warranty_num}")
print(f"\nā
Completed disclosure generation for {len(self.disclosures)} warranties")
print(f"š Total references in bibliography: {len(self.ref_manager.references)}")
return self.disclosures
def export_to_markdown(self, output_path: str = "./project_victoria_disclosures_improved.md") -> str:
"""Export all disclosures to a comprehensive markdown report with proper references."""
print("\n" + "="*60)
print("EXPORTING DISCLOSURES TO MARKDOWN")
print("="*60)
# Create markdown content
markdown_content = []
# Header
markdown_content.extend([
"# Project Victoria - Warranty Disclosures\\n",
f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}",
f"**Total Warranties Processed**: {len(self.warranty_claims)}",
f"**Total Disclosures Generated**: {len(self.disclosures)}",
f"**Total References**: {len(self.ref_manager.references)}\\n",
"---\\n"
])
# Table of Contents
markdown_content.append("## Table of Contents\\n")
for warranty in sorted(self.warranty_claims, key=lambda x: x.get('warranty_number', '')):
warranty_num = warranty.get('warranty_number', '')
warranty_title = warranty.get('warranty_title', '')
anchor = warranty_num.lower().replace('(', '').replace(')', '').replace('.', '').replace(' ', '-')
markdown_content.append(f"- [{warranty_num} - {warranty_title}](#{anchor}-{warranty_title.lower().replace(' ', '-')})")
markdown_content.append("\\n---\\n")
# Individual warranty disclosures
for warranty in sorted(self.warranty_claims, key=lambda x: x.get('warranty_number', '')):
warranty_num = warranty.get('warranty_number', '')
warranty_title = warranty.get('warranty_title', '')
section_name = warranty.get('section_name', '')
warranty_text = warranty.get('warranty_text', '')
# Get disclosure content
disclosure_info = self.disclosures.get(warranty_num, {})
disclosure_content = disclosure_info.get('disclosure', 'No disclosure generated.')
source_docs_count = disclosure_info.get('source_documents', 0)
# Create warranty section
markdown_content.extend([
f"## {warranty_num} - {warranty_title}\\n",
f"**Section**: {section_name}",
f"**Source Documents Found**: {source_docs_count}\\n",
"### Warranty Text\\n",
warranty_text + "\\n",
"### Disclosure\\n",
disclosure_content + "\\n",
"---\\n"
])
# Add bibliography
bibliography = self.ref_manager.generate_bibliography()
markdown_content.append(bibliography)
# Write to file
markdown_text = "\\n".join(markdown_content)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(markdown_text)
print(f"ā
Markdown report exported to: {output_path}")
print(f"š Report contains {len(self.disclosures)} warranty disclosures")
print(f"š Bibliography contains {len(self.ref_manager.references)} references")
return output_path
def run_complete_pipeline(self) -> str:
"""Run the complete disclosure generation pipeline."""
print("š Starting Project Victoria Disclosure Generation Pipeline\\n")
try:
# Step 1: Extract PDF text
print("Step 1: Extracting PDF text...")
self.extract_pdf_text()
# Step 2: Identify warranty claims
print("\\nStep 2: Identifying warranty claims...")
self.identify_warranty_claims()
# Step 3: Generate disclosures
print("\\nStep 3: Generating disclosures...")
self.generate_all_disclosures()
# Step 4: Export to markdown
print("\\nStep 4: Exporting to markdown...")
output_path = self.export_to_markdown()
print("\\n" + "="*60)
print("ā
PIPELINE COMPLETED SUCCESSFULLY!")
print("="*60)
print(f"š Output file: {output_path}")
print(f"š Total warranties processed: {len(self.warranty_claims)}")
print(f"š Total disclosures generated: {len(self.disclosures)}")
print(f"š Total references: {len(self.ref_manager.references)}")
print("="*60)
return output_path
except Exception as e:
print(f"ā Pipeline failed: {e}")
raise
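Note on ReferenceManager: the listing above relies on a ReferenceManager class defined elsewhere in the module. Its implementation is not shown here; the sketch below is inferred purely from the calls the source makes (add_document, clear, references, generate_bibliography) and is illustrative, not the original code.

class ReferenceManager:
    """Minimal sketch inferred from usage above; not the original implementation."""
    def __init__(self):
        self.references = {}   # ref_num -> {doc_id, content, metadata}
        self._by_doc_id = {}   # doc_id -> ref_num, so repeat citations reuse a number

    def clear(self):
        self.references.clear()
        self._by_doc_id.clear()

    def add_document(self, doc_id, content, metadata):
        # Reuse the existing reference number if this document was already cited
        if doc_id in self._by_doc_id:
            return self._by_doc_id[doc_id]
        ref_num = len(self.references) + 1
        self.references[ref_num] = {"doc_id": doc_id, "content": content, "metadata": metadata}
        self._by_doc_id[doc_id] = ref_num
        return ref_num

    def generate_bibliography(self):
        # Render a numbered reference list in the same markdown style as the report
        lines = ["## References\n"]
        for ref_num in sorted(self.references):
            entry = self.references[ref_num]
            label = entry["metadata"].get("source", entry["doc_id"])
            lines.append(f"[{ref_num}] {label}")
        return "\n".join(lines)

This interface keeps len(self.ref_manager.references) meaningful for the counts printed in generate_all_disclosures and export_to_markdown.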
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| pdf_path | str | "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf" | positional or keyword |
Parameter Details
pdf_path: Filesystem path to the Project Victoria PDF document that the generator analyzes.
Return Value
Instantiation returns an ImprovedProjectVictoriaGenerator instance; run_complete_pipeline returns the path of the exported markdown report.
Class Interface
Methods
__init__(self, pdf_path)
Purpose: Initialize the improved disclosure generator. Args: pdf_path: Path to the Project Victoria PDF document
Parameters:
pdf_path: Type: str
Returns: None
init_chroma_connection(self)
Purpose: Initialize connection to Chroma DB.
Returns: None
extract_pdf_text(self) -> str
Purpose: Extract text from the Project Victoria PDF document.
Returns: Returns str
identify_warranty_claims(self) -> List[Dict[str, Any]]
Purpose: Identify and extract individual warranty claims from the document.
Returns: Returns List[Dict[str, Any]]
verify_and_clean_warranties(self, warranties) -> List[Dict]
Purpose: Use LLM to verify and clean up the warranty list.
Parameters:
warranties: Type: List[Dict]
Returns: Returns List[Dict]
split_text_for_processing(self, text, max_tokens) -> List[str]
Purpose: Split text into chunks that fit within token limits.
Parameters:
text: Type: str
max_tokens: Type: int
Returns: Returns List[str]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
extract_key_terms_from_warranty(self, warranty) -> List[str]
Purpose: Extract key terms from warranty text for targeted searching.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
rerank_documents(self, query, documents) -> List[Dict]
Purpose: Rerank documents using cross-encoder for better relevance.
Parameters:
query: Type: str
documents: Type: List[Dict]
Returns: Returns List[Dict]
generate_disclosure_with_references(self, warranty, relevant_docs) -> str
Purpose: Generate a detailed disclosure for a specific warranty with proper inline references. Args: warranty: Warranty claim dictionary relevant_docs: List of relevant documents from Chroma DB Returns: Generated disclosure text with inline references
Parameters:
warranty: Type: Dict[str, Any]
relevant_docs: Type: List[Dict]
Returns: Returns str
generate_all_disclosures(self) -> Dict[str, Dict[str, Any]]
Purpose: Generate disclosures for all identified warranty claims.
Returns: Returns Dict[str, Dict[str, Any]] mapping warranty numbers to disclosure records
export_to_markdown(self, output_path) -> str
Purpose: Export all disclosures to a comprehensive markdown report with proper references.
Parameters:
output_path: Type: str
Returns: Returns str
run_complete_pipeline(self) -> str
Purpose: Run the complete disclosure generation pipeline.
Returns: Returns str
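The retrieval and drafting methods listed above can also be exercised outside the full pipeline. A brief sketch follows; the warranty dictionary shape matches the output of identify_warranty_claims, but the field values here are purely illustrative:

# Hypothetical single-warranty run, outside run_complete_pipeline
generator = ImprovedProjectVictoriaGenerator()
warranty = {
    "warranty_number": "2.1(a)",
    "warranty_title": "Share Capital",
    "warranty_text": "The Sellers are the legal and beneficial owners of the Shares...",
    "section_name": "THE SHARES; THE SELLERS",
}
docs = generator.search_chroma_for_warranty(warranty, top_k=15)   # Chroma retrieval + optional rerank
disclosure = generator.generate_disclosure_with_references(warranty, docs)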
Required Imports
import os
import re
import json
import tiktoken
from typing import List, Dict, Any
import chromadb
from datetime import datetime
from langchain_openai import ChatOpenAI  # import path may differ across LangChain versions
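The source additionally gates optional dependencies behind module-level flags (PDF_AVAILABLE, CROSSENCODER_AVAILABLE, EMBEDDING_AVAILABLE) that are set outside this class. A plausible sketch of those guards, assuming conventional try/except import checks:

# Assumed optional-import guards; the real module may differ
try:
    import fitz  # PyMuPDF, used by extract_pdf_text
    PDF_AVAILABLE = True
except ImportError:
    fitz = None
    PDF_AVAILABLE = False

try:
    from sentence_transformers import CrossEncoder  # used for reranking
    CROSSENCODER_AVAILABLE = True
except ImportError:
    CROSSENCODER_AVAILABLE = False

# EMBEDDING_AVAILABLE similarly guards the custom MyEmbeddingFunction helper
# that init_chroma_connection passes to Chroma.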
Usage Example
# Example usage:
generator = ImprovedProjectVictoriaGenerator(pdf_path="./20250623_Project Victoria - Disclosure Matrix_WIP.pdf")
output_path = generator.run_complete_pipeline()
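For finer-grained control, the pipeline stages can also be run individually, mirroring run_complete_pipeline (the output filename here is illustrative):

generator = ImprovedProjectVictoriaGenerator()
generator.extract_pdf_text()                        # load cached text or extract from the PDF
claims = generator.identify_warranty_claims()       # LLM-based warranty extraction
disclosures = generator.generate_all_disclosures()  # retrieve, cite, and draft
report_path = generator.export_to_markdown("./my_disclosures.md")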
Similar Components
AI-powered semantic similarity - components with related functionality:
- class FixedProjectVictoriaGenerator (77.8% similar)
- class ProjectVictoriaDisclosureGenerator (74.8% similar)
- function main_v29 (69.6% similar)
- function main_v28 (62.9% similar)
- function main_v14 (58.4% similar)