class ProjectVictoriaDisclosureGenerator
Main class for generating Project Victoria disclosures from warranty claims.
/tf/active/vicechatdev/project_victoria_disclosure_generator.py
63 - 816
moderate
Purpose
Main class for generating Project Victoria disclosures from warranty claims.
Source Code
class ProjectVictoriaDisclosureGenerator:
    """
    Main class for generating Project Victoria disclosures from warranty claims.

    The pipeline (see ``run_complete_analysis``) extracts the PDF text, asks an
    LLM to identify warranty claims, retrieves supporting documents from the
    Chroma "99_edr" collection, generates one disclosure per warranty and
    exports everything as a markdown report.
    """

    # File used to cache the extracted PDF text between runs.
    EXTRACTED_TEXT_FILE = "project_victoria_extracted.txt"

    def __init__(self, pdf_path: str = "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf",
                 api_key: str = None):
        """
        Initialize the disclosure generator.

        Args:
            pdf_path: Path to the Project Victoria PDF document
            api_key: Optional OpenAI API key. When omitted, the key is read
                from the OPENAI_API_KEY environment variable.
        """
        self.pdf_path = pdf_path
        self.extracted_text = ""
        self.warranty_claims = []
        self.disclosures = {}

        # LLM configuration.
        # SECURITY: the key used to be hard-coded here. A key that has been
        # committed to source must be treated as compromised and rotated; it is
        # now supplied by the caller or the environment only.
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
        if self.api_key:
            os.environ["OPENAI_API_KEY"] = self.api_key
        else:
            print("⚠️ OPENAI_API_KEY is not set; LLM and embedding calls will fail")

        # Initialize tokenizer for counting tokens
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

        # Initialize Chroma DB connection
        self.init_chroma_connection()

        # Cross-encoder for ranking (if available)
        self.cross_encoder = None
        if CROSSENCODER_AVAILABLE:
            try:
                self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
                print("✅ Cross-encoder initialized for document reranking")
            except Exception as e:
                print(f"⚠️ Cross-encoder initialization failed: {e}")
                self.cross_encoder = None

        # Block reference counter for inline citations
        self.block_counter = 1
        self.blocks_dict = {}

        print("Initialized Project Victoria Disclosure Generator")
        print(f"PDF path: {pdf_path}")

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a leading markdown code fence (optionally tagged json) and a trailing fence."""
        text = text.strip()
        if text.startswith("```json"):
            text = text[len("```json"):]
        elif text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    @staticmethod
    def _repair_truncated_json_array(text: str) -> str:
        """Close a JSON array that was cut off, keeping only the complete objects."""
        if text.endswith("]"):
            return text
        last_object_end = text.rfind("}")
        if last_object_end >= 0:
            # Drop whatever follows the last closed object and close the array.
            return text[:last_object_end + 1] + "]"
        return text + "]"

    @staticmethod
    def _recover_warranties_with_regex(text: str) -> List[Dict[str, Any]]:
        """Pull individually parseable warranty objects out of malformed JSON text."""
        warranty_pattern = r'\{\s*"warranty_number"[^}]+\}'
        recovered = []
        for candidate in re.findall(warranty_pattern, text, re.DOTALL):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                recovered.append(parsed)
        return recovered

    @staticmethod
    def _natural_sort_key(value: str) -> List[Any]:
        """Sort key that orders embedded numbers numerically ("2.1" before "10.1")."""
        # re.split with a capturing group always alternates text/digits, so the
        # element types line up position by position when two keys are compared.
        return [int(part) if part.isdigit() else part.lower()
                for part in re.split(r"(\d+)", value)]

    @staticmethod
    def _heading_anchor(heading: str) -> str:
        """Build the anchor slug for a markdown heading (lowercase, punctuation dropped, spaces -> '-')."""
        slug = re.sub(r"[^\w\- ]", "", heading.strip().lower())
        return slug.replace(" ", "-")

    # ------------------------------------------------------------------
    # Setup
    # ------------------------------------------------------------------
    def init_chroma_connection(self):
        """
        Initialize connection to Chroma DB.

        On any failure ``chroma_client`` and ``edr_collection`` are set to None
        so that later searches are skipped instead of crashing.
        """
        try:
            # Connect to Chroma DB (assuming it's running on vice_chroma:8000)
            self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)
            # Set up embedding function
            self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
            # Get the 99_EDR collection
            self.edr_collection = self.chroma_client.get_collection(
                "99_edr",
                embedding_function=self.chroma_embedder
            )
            print("✅ Successfully connected to Chroma DB and 99_EDR collection")
        except Exception as e:
            print(f"❌ Error connecting to Chroma DB: {e}")
            print("Attempting to use local fallback or alternative connection...")
            self.chroma_client = None
            self.edr_collection = None

    # ------------------------------------------------------------------
    # Step 1: text extraction
    # ------------------------------------------------------------------
    def extract_pdf_text(self) -> str:
        """
        Extract text from the Project Victoria PDF document.

        A previously saved text file in the working directory is preferred;
        otherwise the PDF is read with PyMuPDF and the result is cached.

        Returns:
            Extracted text from the PDF

        Raises:
            Exception: If neither the cached text file nor the PDF could be read.
        """
        # First try to use existing extracted text file
        try:
            with open(self.EXTRACTED_TEXT_FILE, "r", encoding="utf-8") as f:
                self.extracted_text = f.read()
            print(f"✅ Successfully loaded text from existing file: {len(self.extracted_text)} characters")
            return self.extracted_text
        except FileNotFoundError:
            print("📄 No existing extracted text file found, attempting PDF extraction...")
        except Exception as e:
            print(f"⚠️ Error reading existing text file: {e}")

        # Try PDF extraction if PyMuPDF is available
        if PDF_AVAILABLE and fitz:
            try:
                print(f"Extracting text from PDF: {self.pdf_path}")
                doc = fitz.open(self.pdf_path)
                try:
                    # Read the page count while the document is still open; it
                    # is reported after the document has been closed.
                    page_count = len(doc)
                    text_content = [
                        f"\n--- Page {page_num + 1} ---\n{doc.load_page(page_num).get_text()}"
                        for page_num in range(page_count)
                    ]
                finally:
                    doc.close()
                self.extracted_text = "\n".join(text_content)

                # Save extracted text for future use (best effort)
                try:
                    with open(self.EXTRACTED_TEXT_FILE, "w", encoding="utf-8") as f:
                        f.write(self.extracted_text)
                    print(f"💾 Saved extracted text to {self.EXTRACTED_TEXT_FILE}")
                except Exception as save_error:
                    print(f"⚠️ Could not save extracted text: {save_error}")

                print(f"✅ Successfully extracted {len(self.extracted_text)} characters from PDF")
                print(f"📊 Total pages processed: {page_count}")
                return self.extracted_text
            except Exception as e:
                print(f"❌ Error extracting PDF text: {e}")
        else:
            print("❌ PyMuPDF not available for PDF extraction")

        # If all else fails, provide manual instructions
        raise Exception(
            "\nCould not extract text from PDF. Please do one of the following:\n"
            "1. Install PyMuPDF: pip install PyMuPDF\n"
            f"2. Manually extract text from the PDF and save it as '{self.EXTRACTED_TEXT_FILE}'\n"
            "3. Provide the extracted text file in the working directory\n"
            "The script will automatically use the text file if available.\n"
        )

    # ------------------------------------------------------------------
    # Step 2: warranty identification
    # ------------------------------------------------------------------
    def identify_warranty_claims(self) -> List[Dict[str, Any]]:
        """
        Identify and extract individual warranty claims from the document.

        The extracted text is split into token-bounded chunks, each chunk is
        sent to the LLM, and the JSON replies are merged and then cleaned by
        ``verify_and_clean_warranties``.

        Returns:
            List of warranty claim dictionaries
        """
        print("\n" + "=" * 60)
        print("IDENTIFYING WARRANTY CLAIMS")
        print("=" * 60)

        # Use LLM to identify warranty claims
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=16000)  # Increased token limit
        warranty_extraction_prompt = """
        You are analyzing a legal document containing warranty disclosures for a company acquisition.
        The document is structured with numbered warranty sections, each containing specific warranty claims.
        Your task is to extract ALL individual warranty claims from the text. Each warranty claim should be identified as a distinct legal requirement or representation.
        Please extract and return a JSON list of warranty claims, where each item has:
        - "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4")
        - "warranty_title": A short descriptive title for the warranty
        - "warranty_text": The complete text of the warranty claim (limit to 500 characters)
        - "section_name": The main section name (e.g., "THE SHARES; THE SELLERS", "AUTHORITY AND CAPACITY")
        Focus on extracting the actual warranty statements, not procedural text or definitions.
        IMPORTANT: Keep warranty_text under 500 characters to ensure the JSON response is not truncated.
        Here is the document text to analyze:
        {document_text}
        Return only a valid JSON array of warranty claims. Ensure the response is complete and valid JSON.
        """

        # Split text into chunks if too long
        max_chunk_size = 30000  # Reduced chunk size for better processing
        text_chunks = self.split_text_for_processing(self.extracted_text, max_chunk_size)

        all_warranties = []
        for i, chunk in enumerate(text_chunks, 1):
            print(f"Processing warranty extraction chunk {i}/{len(text_chunks)}")
            prompt = warranty_extraction_prompt.format(document_text=chunk)
            response_text = ""
            try:
                response = llm.invoke(prompt)
                # Clean up the reply and try to repair truncated JSON
                response_text = self._repair_truncated_json_array(
                    self._strip_code_fences(response.content)
                )
                chunk_warranties = json.loads(response_text)
                if isinstance(chunk_warranties, list):
                    # Later steps call .get() on every entry, so keep dicts only.
                    valid = [w for w in chunk_warranties if isinstance(w, dict)]
                    all_warranties.extend(valid)
                    print(f"✅ Extracted {len(valid)} warranties from chunk {i}")
                else:
                    print(f"⚠️ Unexpected response format from chunk {i}")
            except json.JSONDecodeError as je:
                print(f"❌ JSON decode error in chunk {i}: {je}")
                print(f"Response was: {response_text[:500]}...")
                # Try alternative extraction: salvage individual objects
                recovered = self._recover_warranties_with_regex(response_text)
                all_warranties.extend(recovered)
                if recovered:
                    print(f"✅ Recovered {len(recovered)} warranties using regex extraction")
                else:
                    print("❌ Regex recovery also failed: no parseable warranty objects found")
            except Exception as e:
                print(f"❌ Error processing chunk {i}: {e}")

        # Use LLM to verify and clean up warranty list
        if all_warranties:
            all_warranties = self.verify_and_clean_warranties(all_warranties)

        self.warranty_claims = all_warranties
        print(f"✅ Total warranty claims identified: {len(self.warranty_claims)}")

        # Display summary
        if self.warranty_claims:
            print("\nSample warranty claims:")
            for i, warranty in enumerate(self.warranty_claims[:3], 1):
                print(f"{i}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
        return self.warranty_claims

    def verify_and_clean_warranties(self, warranties: List[Dict]) -> List[Dict]:
        """
        Use LLM to verify and clean up the warranty list.

        Args:
            warranties: Raw list of warranty claims

        Returns:
            Cleaned and verified warranty claims. The original list is returned
            unchanged when the LLM call fails or its reply is not a JSON array.
        """
        print("\nVerifying and cleaning warranty claims...")
        verification_prompt = """
        You are reviewing a list of warranty claims extracted from a legal document.
        Please clean up this list by:
        1. Removing any duplicates
        2. Ensuring each warranty is a distinct legal claim
        3. Fixing any formatting issues
        4. Ensuring warranty numbers are correct
        5. Making sure warranty titles are concise and descriptive
        Here is the warranty list to review:
        {warranties_json}
        Return the cleaned list as a valid JSON array with the same structure.
        """
        try:
            llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=3000)
            prompt = verification_prompt.format(warranties_json=json.dumps(warranties, indent=2))
            response = llm.invoke(prompt)
            cleaned_warranties = json.loads(self._strip_code_fences(response.content))
            if not isinstance(cleaned_warranties, list):
                raise ValueError("LLM reply is not a JSON array")
            cleaned_warranties = [w for w in cleaned_warranties if isinstance(w, dict)]
            print(f"✅ Warranty verification complete: {len(warranties)} → {len(cleaned_warranties)} claims")
            return cleaned_warranties
        except Exception as e:
            print(f"⚠️ Warranty verification failed: {e}, using original list")
            return warranties

    def split_text_for_processing(self, text: str, max_tokens: int) -> List[str]:
        """
        Split text into chunks that fit within token limits.

        Args:
            text: Text to split
            max_tokens: Maximum tokens per chunk (must be positive)

        Returns:
            List of text chunks (empty when the text encodes to no tokens)

        Raises:
            ValueError: If max_tokens is not positive.
        """
        if max_tokens <= 0:
            raise ValueError("max_tokens must be a positive integer")
        tokens = self.tokenizer.encode(text)
        return [
            self.tokenizer.decode(tokens[start:start + max_tokens])
            for start in range(0, len(tokens), max_tokens)
        ]

    # ------------------------------------------------------------------
    # Step 3: retrieval
    # ------------------------------------------------------------------
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """
        Search Chroma DB for documents relevant to a specific warranty.

        Several queries (title, text prefix, combined, key terms) are issued
        and their results are merged, de-duplicated by document id and passed
        to ``rerank_documents``.

        Args:
            warranty: Warranty claim dictionary
            top_k: Number of top documents to retrieve

        Returns:
            List of relevant documents (at most top_k)
        """
        if getattr(self, 'edr_collection', None) is None:
            print("⚠️ Chroma DB not available, skipping search")
            return []

        # Create search queries based on warranty content; the LLM may emit
        # null or non-string values, so normalise to str first.
        warranty_text = str(warranty.get('warranty_text') or '')
        warranty_title = str(warranty.get('warranty_title') or '')

        # Generate multiple search queries for comprehensive coverage
        search_queries = [
            warranty_title,
            warranty_text[:500],  # First part of warranty text
            f"{warranty_title} {warranty_text[:200]}"  # Combined query
        ]
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])  # Add top 3 key terms

        all_documents = []
        retrieved_ids = set()
        for query in search_queries:
            if not query.strip():
                continue
            try:
                # Embeddings are not requested: nothing below uses them.
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=top_k,
                    include=["documents", "metadatas", "distances"]
                )
                documents = results.get('documents') or [[]]
                if not documents[0]:
                    continue
                metadatas = results.get('metadatas')
                distances = results.get('distances')
                for i, doc_id in enumerate(results['ids'][0]):
                    if doc_id in retrieved_ids:
                        continue
                    retrieved_ids.add(doc_id)
                    all_documents.append({
                        'id': doc_id,
                        'content': documents[0][i] or '',
                        # Entries without metadata come back as None.
                        'metadata': (metadatas[0][i] if metadatas else None) or {},
                        'distance': distances[0][i] if distances else 1.0,
                        'query_used': query
                    })
            except Exception as e:
                print(f"⚠️ Error querying Chroma DB with query '{query[:50]}...': {e}")

        # Rank documents using cross-encoder if available
        if all_documents:
            all_documents = self.rerank_documents(warranty_text, all_documents)

        # Return top documents
        return all_documents[:top_k]

    def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
        """
        Extract key terms from warranty text for targeted searching.

        Args:
            warranty: Warranty claim dictionary

        Returns:
            List of unique key terms (max 5), in a deterministic order:
            known legal/business keywords first, then proper names, then acronyms.
        """
        warranty_text = str(warranty.get('warranty_text') or '')

        # Common legal/business terms that might be relevant
        important_words = [
            'incorporation', 'registered', 'authorized', 'shares', 'capital',
            'subsidiaries', 'accounts', 'financial', 'liabilities', 'assets',
            'compliance', 'regulatory', 'licenses', 'permits', 'agreements',
            'contracts', 'intellectual property', 'employment', 'litigation',
            'insurance', 'tax', 'environmental', 'data protection'
        ]
        text_lower = warranty_text.lower()
        key_terms = [term for term in important_words if term in text_lower]

        # Extract specific company/legal entity mentions
        entity_patterns = [
            r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # Proper names
            r'\b[A-Z]{2,}\b',  # Acronyms
        ]
        for pattern in entity_patterns:
            matches = re.findall(pattern, warranty_text)
            key_terms.extend(matches[:3])  # Limit to avoid too many terms

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # same warranty always yields the same queries.
        return list(dict.fromkeys(key_terms))[:5]

    def rerank_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
        """
        Rerank documents using cross-encoder for better relevance.

        Args:
            query: Query text
            documents: List of document dictionaries (sorted in place)

        Returns:
            Reranked documents; the input order is kept when no cross-encoder
            is available or scoring fails.
        """
        if len(documents) <= 1:
            return documents
        cross_encoder = getattr(self, 'cross_encoder', None)
        if not cross_encoder:
            print("⚠️ Cross-encoder not available, skipping reranking")
            return documents
        try:
            # Prepare query-document pairs for cross-encoder (limit content length)
            pairs = [(query, doc['content'][:500]) for doc in documents]
            scores = cross_encoder.predict(pairs)
            for doc, score in zip(documents, scores):
                doc['cross_encoder_score'] = float(score)
            # Sort by cross-encoder score (higher is better)
            documents.sort(key=lambda d: d['cross_encoder_score'], reverse=True)
        except Exception as e:
            print(f"⚠️ Cross-encoder reranking failed: {e}")
        return documents

    # ------------------------------------------------------------------
    # Step 4: disclosure generation
    # ------------------------------------------------------------------
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any],
                                         relevant_docs: List[Dict]) -> str:
        """
        Generate a detailed disclosure for a specific warranty using LLM.

        Every document passed to the LLM is registered in ``self.blocks_dict``
        under a globally increasing block number, so the [Block X] citations of
        all warranties can be resolved in the final references section.

        Args:
            warranty: Warranty claim dictionary
            relevant_docs: List of relevant documents from Chroma DB

        Returns:
            Generated disclosure text with inline references, or an error
            message string when the LLM call fails.
        """
        if not relevant_docs:
            return "No relevant supporting documentation found in the knowledge base."

        # Prepare context from relevant documents (top 10 only)
        top_docs = relevant_docs[:10]
        context_blocks = []
        for block_num, doc in enumerate(top_docs, self.block_counter):
            content = doc.get('content') or ''
            metadata = doc.get('metadata') or {}
            context_blocks.append(f"[Block {block_num}] {content}")
            # Store block information for references
            self.blocks_dict[block_num] = {
                'type': 'document',
                'id': doc['id'],
                'content': content[:200] + "..." if len(content) > 200 else content,
                'metadata': metadata,
                'source': metadata.get('source', 'Unknown source')
            }
        self.block_counter += len(top_docs)
        context_text = "\n\n".join(context_blocks)

        disclosure_prompt = """
        You are a legal and business expert helping to prepare warranty disclosures for a company acquisition.
        **TASK**: Generate a detailed disclosure summary for the warranty claim below, based on the provided supporting documentation.
        **WARRANTY CLAIM**:
        Section: {warranty_number}
        Title: {warranty_title}
        Text: {warranty_text}
        **SUPPORTING DOCUMENTATION**:
        {context_text}
        **INSTRUCTIONS**:
        1. Analyze the warranty claim to understand what needs to be disclosed
        2. Review the supporting documentation for relevant information
        3. Create a comprehensive disclosure that addresses the warranty requirements
        4. Include specific facts, figures, dates, and details from the documentation
        5. Use inline citations [Block X] to reference specific information sources
        6. Structure the disclosure clearly with appropriate headings if needed
        7. Focus on factual information that supports or relates to the warranty claim
        8. If certain aspects of the warranty cannot be fully addressed from the available documentation, note this clearly
        **OUTPUT FORMAT**:
        Provide a well-structured disclosure in markdown format with:
        - Clear, professional language
        - Inline citations using [Block X] format
        - Specific details and facts from the supporting documents
        - Appropriate level of detail for legal/business disclosure purposes
        **DISCLOSURE SUMMARY**:
        """
        prompt = disclosure_prompt.format(
            warranty_number=warranty.get('warranty_number', 'N/A'),
            warranty_title=warranty.get('warranty_title', ''),
            warranty_text=warranty.get('warranty_text', ''),
            context_text=context_text
        )
        try:
            # Built inside the try so a client set-up failure is reported the
            # same way as a failed request instead of aborting the whole run.
            llm = ChatOpenAI(model="gpt-4o", temperature=0.1, max_tokens=2000)
            response = llm.invoke(prompt)
            return response.content.strip()
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return f"Error generating disclosure for warranty {warranty.get('warranty_number', 'N/A')}: {str(e)}"

    def generate_all_disclosures(self) -> Dict[str, Dict[str, Any]]:
        """
        Generate disclosures for all identified warranty claims.

        Returns:
            Dictionary mapping warranty numbers to a record with the keys
            'warranty', 'disclosure', 'source_documents' and 'generated_at'.
        """
        print("\n" + "=" * 60)
        print("GENERATING DISCLOSURES FOR ALL WARRANTIES")
        print("=" * 60)

        self.disclosures = {}
        # Start a fresh citation numbering for this run; blocks then accumulate
        # across all warranties so every [Block X] can be listed in the report.
        self.blocks_dict = {}
        self.block_counter = 1

        total = len(self.warranty_claims)
        for i, warranty in enumerate(self.warranty_claims, 1):
            # The LLM may emit numbers (e.g. 1.1) instead of strings; keys must
            # be strings for sorting and anchor generation during export.
            warranty_num = str(warranty.get('warranty_number') or f'W{i}')
            warranty_title = warranty.get('warranty_title', 'Untitled Warranty')
            print(f"\nProcessing warranty {i}/{total}: {warranty_num} - {warranty_title}")

            # Search for relevant documents
            relevant_docs = self.search_chroma_for_warranty(warranty)
            if relevant_docs:
                print(f"Found {len(relevant_docs)} relevant documents")
            else:
                print("No relevant documents found")

            # Generate disclosure
            disclosure = self.generate_disclosure_for_warranty(warranty, relevant_docs)
            self.disclosures[warranty_num] = {
                'warranty': warranty,
                'disclosure': disclosure,
                'source_documents': len(relevant_docs),
                'generated_at': datetime.now().isoformat()
            }
            print(f"✅ Generated disclosure for {warranty_num}")

        print(f"\n✅ Completed disclosure generation for {len(self.disclosures)} warranties")
        return self.disclosures

    # ------------------------------------------------------------------
    # Step 5: export
    # ------------------------------------------------------------------
    def create_references_section(self) -> str:
        """
        Create a references section listing all blocks used in disclosures.

        Returns:
            Formatted references section (markdown)
        """
        if not self.blocks_dict:
            return "## References\n\nNo references available."

        references = ["## References\n"]
        for block_num in sorted(self.blocks_dict):
            block_info = self.blocks_dict[block_num]
            metadata = block_info.get('metadata') or {}
            source = block_info.get('source', metadata.get('source', 'Unknown source'))
            content_preview = block_info.get('content', 'No content preview available')
            references.append(f"**[Block {block_num}]** {source}")
            references.append(f"Content: {content_preview}")
            references.append("")  # Empty line
        return "\n".join(references)

    def export_to_markdown(self, output_path: str = "./project_victoria_disclosures.md") -> str:
        """
        Export all disclosures to a comprehensive markdown report.

        Warranties are listed in natural numeric order (2.1 before 10.1) and
        the table of contents links to the slug of each generated heading.

        Args:
            output_path: Path for the output markdown file

        Returns:
            Path to the generated markdown file

        Raises:
            Exception: Re-raised when the report file cannot be written.
        """
        print("\n" + "=" * 60)
        print("EXPORTING DISCLOSURES TO MARKDOWN")
        print("=" * 60)

        ordered_keys = sorted(self.disclosures, key=lambda k: self._natural_sort_key(str(k)))

        # Header
        markdown_content = [
            "# Project Victoria - Warranty Disclosures",
            "",
            f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}",
            f"**Total Warranties Processed**: {len(self.warranty_claims)}",
            f"**Total Disclosures Generated**: {len(self.disclosures)}",
            "",
            "---",
            "",
            # Table of contents
            "## Table of Contents",
            ""
        ]
        for warranty_num in ordered_keys:
            warranty = self.disclosures[warranty_num]['warranty']
            title = warranty.get('warranty_title', 'Untitled Warranty')
            heading = f"{warranty_num} - {title}"
            markdown_content.append(f"- [{heading}](#{self._heading_anchor(heading)})")
        markdown_content.extend(["", "---", ""])

        # Disclosures
        for warranty_num in ordered_keys:
            disclosure_info = self.disclosures[warranty_num]
            warranty = disclosure_info['warranty']
            markdown_content.extend([
                f"## {warranty_num} - {warranty.get('warranty_title', 'Untitled Warranty')}",
                "",
                f"**Section**: {warranty.get('section_name', 'N/A')}",
                f"**Source Documents Found**: {disclosure_info['source_documents']}",
                "",
                "### Warranty Text",
                "",
                str(warranty.get('warranty_text', 'No warranty text available')),
                "",
                "### Disclosure",
                "",
                str(disclosure_info['disclosure']),
                "",
                "---",
                ""
            ])

        # References section
        markdown_content.append(self.create_references_section())

        # Write to file
        final_content = "\n".join(markdown_content)
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_content)
            print(f"✅ Successfully exported disclosures to: {output_path}")
            print(f"📊 Total content length: {len(final_content)} characters")
            return output_path
        except Exception as e:
            print(f"❌ Error exporting to markdown: {e}")
            raise

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run_complete_analysis(self, output_path: str = "./project_victoria_disclosures.md") -> str:
        """
        Run the complete analysis pipeline.

        Args:
            output_path: Path for the output markdown file

        Returns:
            Path to the generated markdown report, or None when no warranty
            claims were found or any step failed (the error is printed).
        """
        print("\n" + "=" * 80)
        print("PROJECT VICTORIA DISCLOSURE GENERATION")
        print("=" * 80)
        print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        try:
            # Step 1: Extract PDF text
            self.extract_pdf_text()

            # Step 2: Identify warranty claims
            self.identify_warranty_claims()
            if not self.warranty_claims:
                print("❌ No warranty claims found. Cannot proceed.")
                return None

            # Step 3: Generate disclosures
            self.generate_all_disclosures()

            # Step 4: Export to markdown
            output_file = self.export_to_markdown(output_path)

            print("\n" + "=" * 80)
            print("ANALYSIS COMPLETED SUCCESSFULLY!")
            print("=" * 80)
            print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"Output file: {output_file}")
            print(f"Total warranties processed: {len(self.warranty_claims)}")
            print(f"Total disclosures generated: {len(self.disclosures)}")
            return output_file
        except Exception as e:
            print(f"\n❌ Analysis failed with error: {e}")
            import traceback
            traceback.print_exc()
            return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
bases: Base classes of the class; none are declared, so no type is specified.
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, pdf_path)
Purpose: Initialize the disclosure generator. Args: pdf_path: Path to the Project Victoria PDF document
Parameters:
pdf_path: Type: str
Returns: None
init_chroma_connection(self)
Purpose: Initialize connection to Chroma DB.
Returns: None
extract_pdf_text(self) -> str
Purpose: Extract text from the Project Victoria PDF document. Returns: Extracted text from the PDF
Returns: Returns str
identify_warranty_claims(self) -> List[Dict[str, Any]]
Purpose: Identify and extract individual warranty claims from the document. Returns: List of warranty claim dictionaries
Returns: Returns List[Dict[str, Any]]
verify_and_clean_warranties(self, warranties) -> List[Dict]
Purpose: Use LLM to verify and clean up the warranty list. Args: warranties: Raw list of warranty claims Returns: Cleaned and verified warranty claims
Parameters:
warranties: Type: List[Dict]
Returns: Returns List[Dict]
split_text_for_processing(self, text, max_tokens) -> List[str]
Purpose: Split text into chunks that fit within token limits. Args: text: Text to split max_tokens: Maximum tokens per chunk Returns: List of text chunks
Parameters:
text: Type: str
max_tokens: Type: int
Returns: Returns List[str]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty. Args: warranty: Warranty claim dictionary top_k: Number of top documents to retrieve Returns: List of relevant documents
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
extract_key_terms_from_warranty(self, warranty) -> List[str]
Purpose: Extract key terms from warranty text for targeted searching. Args: warranty: Warranty claim dictionary Returns: List of key terms
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
rerank_documents(self, query, documents) -> List[Dict]
Purpose: Rerank documents using cross-encoder for better relevance. Args: query: Query text documents: List of document dictionaries Returns: Reranked documents
Parameters:
query: Type: str
documents: Type: List[Dict]
Returns: Returns List[Dict]
generate_disclosure_for_warranty(self, warranty, relevant_docs) -> str
Purpose: Generate a detailed disclosure for a specific warranty using LLM. Args: warranty: Warranty claim dictionary relevant_docs: List of relevant documents from Chroma DB Returns: Generated disclosure text with inline references
Parameters:
warranty: Type: Dict[str, Any]
relevant_docs: Type: List[Dict]
Returns: Returns str
generate_all_disclosures(self) -> Dict[str, str]
Purpose: Generate disclosures for all identified warranty claims. Returns: Dictionary mapping warranty numbers to disclosure texts
Returns: Returns Dict[str, str]
create_references_section(self) -> str
Purpose: Create a references section listing all blocks used in disclosures. Returns: Formatted references section
Returns: Returns str
export_to_markdown(self, output_path) -> str
Purpose: Export all disclosures to a comprehensive markdown report. Args: output_path: Path for the output markdown file Returns: Path to the generated markdown file
Parameters:
output_path: Type: str
Returns: Returns str
run_complete_analysis(self, output_path) -> str
Purpose: Run the complete analysis pipeline. Args: output_path: Path for the output markdown file Returns: Path to the generated markdown report
Parameters:
output_path: Type: str
Returns: Returns str
Required Imports
import os
import re
import json
import pandas as pd
import numpy as np
Usage Example
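# Example usage:
# generator = ProjectVictoriaDisclosureGenerator(pdf_path="./20250623_Project Victoria - Disclosure Matrix_WIP.pdf")
# report_path = generator.run_complete_analysis(output_path="./project_victoria_disclosures.md")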
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class FixedProjectVictoriaGenerator 82.8% similar
-
class ImprovedProjectVictoriaGenerator 74.8% similar
-
function main_v29 70.6% similar
-
function main_v28 68.3% similar
-
function main_v14 67.2% similar