class FixedProjectVictoriaGenerator
Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.
/tf/active/vicechatdev/fixed_project_victoria_generator.py
219 - 1622
moderate
Purpose
Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.
Source Code
class FixedProjectVictoriaGenerator:
"""
Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.
"""
def __init__(self):
    """Configure file paths, OpenAI access, the tokenizer and the ChromaDB collection.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable rather than being embedded in the source file.

    A remote ChromaDB server at ``vice_chroma:8000`` is tried first. Only when
    that connection attempt raises are the local persistent stores probed.
    ``self.edr_collection`` stays ``None`` when no collection could be opened,
    in which case document search is skipped by the other methods.
    """
    self.pdf_path = "/tf/active/20250623_Project Victoria - Disclosure Matrix_WIP.pdf"
    self.extracted_text_path = "/tf/active/project_victoria_extracted.txt"
    self.chroma_path = "/tf/active/.persist/EDR_collection"
    self.extracted_text = ""
    self.warranty_claims = []
    self.disclosures = []

    # SECURITY: a literal OpenAI key used to be assigned here. Secrets must not
    # live in source control; the key that was committed should be revoked.
    self.api_key = os.environ.get("OPENAI_API_KEY", "")
    if not self.api_key:
        print("WARNING: OPENAI_API_KEY is not set - OpenAI-backed steps will fail until it is exported")

    # Initialize reference manager
    self.ref_manager = ReferenceManager()
    # Initialize tokenizer
    self.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

    # Initialize ChromaDB collection
    self.edr_collection = None
    self.chroma_client = None

    # Try to connect to remote ChromaDB first (vice_chroma:8000)
    try:
        print("š Attempting to connect to remote ChromaDB at vice_chroma:8000...")
        self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)

        # Set up embedding function if available
        if EMBEDDING_AVAILABLE:
            self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
        else:
            print("ā ļø Using default Chroma embeddings (may not work with custom collections)")
            self.chroma_embedder = None

        # Try to get the 99_edr collection
        try:
            self.edr_collection = self.chroma_client.get_collection(
                "99_edr",
                embedding_function=self.chroma_embedder
            )
            print("✅ Successfully connected to remote ChromaDB and 99_edr collection")
        except Exception as collection_error:
            print(f"ā ļø Could not access 99_edr collection: {collection_error}")
            # Fall back to whatever collection the server exposes first.
            try:
                collections = self.chroma_client.list_collections()
                if collections:
                    collection_names = [c.name for c in collections]
                    print(f"Available collections: {collection_names}")
                    first_collection = collection_names[0]
                    self.edr_collection = self.chroma_client.get_collection(first_collection)
                    print(f"✅ Using collection '{first_collection}' instead")
                else:
                    print("ā ļø No collections found in remote ChromaDB")
            except Exception as list_error:
                print(f"ā ļø Could not list collections: {list_error}")
    except Exception as remote_error:
        print(f"ā ļø Could not connect to remote ChromaDB: {remote_error}")
        # Fallback to local ChromaDB paths
        print("š Trying local ChromaDB paths as fallback...")
        possible_chroma_paths = [
            "/tf/active/.persist/EDR_collection",
            "/tf/active/.persist",
            "/tf/active/chroma_db",
            "/tf/active/.chroma"
        ]
        for path in possible_chroma_paths:
            if not os.path.exists(path):
                continue
            try:
                client = chromadb.PersistentClient(path=path)
                collection_names = [c.name for c in client.list_collections()]
                # Prefer the EDR collection, otherwise take the first one found.
                if "EDR_collection" in collection_names:
                    self.edr_collection = client.get_collection(name="EDR_collection")
                    print(f"✅ Connected to ChromaDB collection 'EDR_collection' with {self.edr_collection.count()} documents at {path}")
                    break
                elif collection_names:
                    self.edr_collection = client.get_collection(name=collection_names[0])
                    print(f"✅ Connected to ChromaDB collection '{collection_names[0]}' with {self.edr_collection.count()} documents at {path}")
                    break
                else:
                    print(f"ā ļø No collections found in ChromaDB at {path}")
            except Exception as e:
                print(f"ā ļø Could not connect to ChromaDB at {path}: {e}")
                continue

    if self.edr_collection is None:
        print("ā ļø ChromaDB not available - disclosure generation will proceed without document search")
def extract_pdf_text(self):
    """Load the document text into ``self.extracted_text``.

    A previously saved extraction at ``self.extracted_text_path`` is reused
    when it exists. Otherwise the PDF at ``self.pdf_path`` is read page by
    page and the joined text is written to ``self.extracted_text_path``.

    Raises:
        FileNotFoundError: no saved extraction and the PDF does not exist.
        ImportError: the PDF must be parsed but PyMuPDF is not available.
    """
    # First try to load existing extracted text
    if os.path.exists(self.extracted_text_path):
        print(f"š Loading existing extracted text from: {self.extracted_text_path}")
        with open(self.extracted_text_path, 'r', encoding='utf-8') as f:
            self.extracted_text = f.read()
        print(f"✅ Loaded {len(self.extracted_text)} characters of text")
        return

    # If no extracted text exists, try to extract from PDF
    if not os.path.exists(self.pdf_path):
        raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
    if not PDF_AVAILABLE:
        raise ImportError("PyMuPDF not available for PDF extraction")

    print(f"š Extracting text from PDF: {self.pdf_path}")
    doc = fitz.open(self.pdf_path)
    try:
        text_parts = [page.get_text() for page in doc]
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()
    self.extracted_text = "\n".join(text_parts)

    # Save extracted text for future use
    with open(self.extracted_text_path, 'w', encoding='utf-8') as f:
        f.write(self.extracted_text)
    print(f"✅ Extracted {len(self.extracted_text)} characters and saved to {self.extracted_text_path}")
def split_text_by_warranty_sections(self, text: str) -> List[str]:
    """Split ``text`` into chunks that break only at warranty section headers.

    Section headers are located with three regular expressions. Each header
    is identified by the offset of its captured section number, so a header
    matched by more than one pattern is counted once. Consecutive sections
    are packed into a chunk until adding the next one would exceed 20000
    characters. Text before the first header is not included in any chunk.

    Args:
        text: Full document text.

    Returns:
        Stripped chunk strings, or ``[text]`` when no section number is found.
    """
    print("š Splitting text by warranty sections...")

    header_patterns = (
        # Pattern 1: section number alone on its line, e.g. "1.1" or "2."
        r'\n\s*(\d+\.(?:\d+(?:\([a-z]\))?)?)\s*\n',
        # Pattern 2: section number at start of line
        r'^(\d+\.\d+(?:\([a-z]\))?)\s',
        # Pattern 3: section number after a newline, followed by whitespace
        r'\n(\d+\.\d+(?:\([a-z]\))?)\s+',
    )
    # Key on the start of the captured number: the patterns begin their match
    # at different offsets (newline vs. digit) for the very same header.
    header_offsets = set()
    for pattern in header_patterns:
        for match in re.finditer(pattern, text, re.MULTILINE):
            header_offsets.add(match.start(1))
    section_starts = sorted(header_offsets)

    if not section_starts:
        print("ā ļø No warranty sections found using standard patterns, trying alternative approach")
        alt_pattern = r'(\d+\.\d+(?:\([a-z]\))?)'
        alt_starts = [m.start(1) for m in re.finditer(alt_pattern, text)]
        if alt_starts:
            section_starts = alt_starts[:50]  # Limit to avoid too many matches
            print(f"Found {len(section_starts)} potential warranty sections with alternative pattern")
        else:
            print("ā ļø No warranty sections found, using single chunk")
            return [text]

    print(f"Found {len(section_starts)} warranty sections")

    chunks = []
    max_chunk_size = 20000  # Smaller chunks to avoid token limits
    current_chunk = ""
    # Every section runs up to the next header; the last one runs to the end.
    section_ends = section_starts[1:] + [len(text)]
    for section_start, section_end in zip(section_starts, section_ends):
        section_text = text[section_start:section_end]
        # Start a new chunk rather than split a section across two chunks.
        if current_chunk and len(current_chunk) + len(section_text) > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = section_text
        else:
            current_chunk += section_text

    # Add the last chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    print(f"✅ Split text into {len(chunks)} warranty-based chunks")
    return chunks
def identify_warranty_claims(self) -> List[Dict[str, Any]]:
    """Extract the individual warranty claims from ``self.extracted_text`` with an LLM.

    The text is split with ``split_text_by_warranty_sections`` and every
    chunk is sent to the model, which is asked for a JSON array of claims.
    Claims are de-duplicated on ``warranty_number`` across chunks. When a
    response is not valid JSON, single objects are salvaged with a regex.
    The result is cleaned with ``verify_and_clean_warranties``, stored in
    ``self.warranty_claims`` and returned.

    Raises:
        ImportError: LangChain is not available.
    """
    print("\n" + "="*60)
    print("IDENTIFYING WARRANTY CLAIMS")
    print("="*60)
    if not LANGCHAIN_AVAILABLE:
        raise ImportError("LangChain not available for warranty identification")

    # Use LLM to identify warranty claims with higher max_tokens to avoid truncation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=12000)
    warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.
Your task is to extract ALL individual warranty claims from the provided text section.
Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4", "13.18", "19.24")
- "warranty_title": A short descriptive title for the warranty (max 100 characters)
- "warranty_text": The complete text of the warranty claim (limit to 200 characters)
- "section_name": The main section name or topic (if identifiable)
IMPORTANT INSTRUCTIONS:
1. Extract EVERY warranty section you find, including high-numbered sections (like 18.x, 19.x, 20.x, 21.x, 22.x, 23.x)
2. Look for sections with numbers like: 18.1, 19.24, 20.7, 21.4, 22.12, 23.8, etc.
3. Keep all text fields short to ensure complete JSON response
4. Include sections with letter suffixes like 13.18(a), 2.1(c)(i), etc.
5. If you see "Warranty X" or "Warranty Number X", extract it
6. Do not truncate the JSON response - complete all warranty objects
Here is the document section to analyze:
{document_text}
Return only a valid JSON array of warranty claims. Ensure the JSON is complete and valid.
"""

    # Split text into warranty-based chunks
    text_chunks = self.split_text_by_warranty_sections(self.extracted_text)
    all_warranties = []
    processed_numbers = set()  # Track processed warranty numbers to avoid duplicates
    total_chunks = len(text_chunks)

    for chunk_no, chunk in enumerate(text_chunks, start=1):
        print(f"Processing warranty extraction chunk {chunk_no}/{total_chunks}")
        prompt = warranty_extraction_prompt.format(document_text=chunk)

        # One failing chunk must not abort the remaining chunks.
        try:
            response = llm.invoke(prompt)
            response_text = self._clean_llm_json(response.content)
        except Exception as e:
            print(f"ā Error processing chunk {chunk_no}: {e}")
            continue

        try:
            chunk_warranties = json.loads(response_text)
        except json.JSONDecodeError as je:
            print(f"ā JSON decode error in chunk {chunk_no}: {je}")
            print(f"Response preview: {response_text[:300]}...")
            # Salvage whatever complete objects the response still contains.
            recovered = self._recover_warranty_objects(response_text)
            recovered_count = self._collect_new_warranties(recovered, processed_numbers, all_warranties)
            if recovered_count > 0:
                print(f"✅ Recovered {recovered_count} warranties using regex extraction")
            continue

        if not isinstance(chunk_warranties, list):
            print(f"ā ļø Unexpected response format from chunk {chunk_no}")
            continue

        added = self._collect_new_warranties(chunk_warranties, processed_numbers, all_warranties)
        print(f"✅ Extracted {added} unique warranties from chunk {chunk_no}")

    # Clean up and verify warranties
    if all_warranties:
        all_warranties = self.verify_and_clean_warranties(all_warranties)
    self.warranty_claims = all_warranties
    print(f"✅ Total warranty claims identified: {len(self.warranty_claims)}")

    # Display warranty number range
    if self.warranty_claims:
        warranty_numbers = [w.get('warranty_number', '') for w in self.warranty_claims if w.get('warranty_number')]
        if warranty_numbers:
            # Compare numerically so that "10.1" ranks after "2.1".
            def numeric_order(number):
                return tuple(int(part) for part in re.findall(r'\d+', str(number)))

            lowest = min(warranty_numbers, key=numeric_order)
            highest = max(warranty_numbers, key=numeric_order)
            print(f"š Warranty numbers range: {lowest} to {highest}")
        print("\nSample warranty claims:")
        for i, warranty in enumerate(self.warranty_claims[:5]):
            print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")

    return self.warranty_claims

@staticmethod
def _clean_llm_json(response_text: str) -> str:
    """Strip a Markdown code fence and close a truncated JSON array."""
    cleaned = response_text.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    elif cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()
    if not cleaned.endswith(']'):
        # Cut back to the last closing brace (the end of the last complete
        # flat object) and close the array there.
        last_object_end = cleaned.rfind('}')
        if last_object_end > 0:
            cleaned = cleaned[:last_object_end + 1] + ']'
        else:
            cleaned = cleaned + ']'
    return cleaned

@staticmethod
def _recover_warranty_objects(response_text: str) -> List[Dict[str, Any]]:
    """Return every flat ``{"warranty_number": ...}`` object that parses on its own."""
    recovered = []
    for candidate in re.findall(r'\{\s*"warranty_number"[^}]+?\}', response_text, re.DOTALL):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            recovered.append(parsed)
    return recovered

@staticmethod
def _collect_new_warranties(candidates, processed_numbers, all_warranties) -> int:
    """Append claims whose number is not in ``processed_numbers``; return how many were added."""
    added = 0
    for warranty in candidates:
        if not isinstance(warranty, dict):
            continue
        warranty_num = warranty.get('warranty_number', '')
        # Only hashable scalar numbers can be tracked in the seen-set.
        if not warranty_num or not isinstance(warranty_num, (str, int, float)):
            continue
        if warranty_num in processed_numbers:
            continue
        processed_numbers.add(warranty_num)
        all_warranties.append(warranty)
        added += 1
    return added
def verify_and_clean_warranties(self, warranties: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop unusable or duplicate claims, cap long texts and sort by section number.

    A claim is kept when it is a dict with a non-empty ``warranty_number``
    and ``warranty_title`` and its number has not been seen before.
    ``warranty_text`` longer than 500 characters is cut to 497 characters
    plus ``"..."``. Claims are ordered by the first two integers found in
    the number, then by the number string. Kept claims are shallow copies,
    so the dicts passed in are left untouched.

    Args:
        warranties: Raw claim dicts.

    Returns:
        The cleaned, sorted list, or the original list if cleaning fails.
    """
    def sort_key(claim):
        number = str(claim.get('warranty_number', '0'))
        parts = re.findall(r'\d+', number)
        major = int(parts[0]) if parts else 0
        minor = int(parts[1]) if len(parts) > 1 else 0
        return (major, minor, number)

    try:
        cleaned_warranties = []
        seen_numbers = set()
        for warranty in warranties:
            # The model may emit non-object items; they carry no claim.
            if not isinstance(warranty, dict):
                continue
            # Check if warranty has required fields
            warranty_num = warranty.get('warranty_number')
            if not warranty_num or not warranty.get('warranty_title'):
                continue
            # Skip duplicates
            if warranty_num in seen_numbers:
                continue
            seen_numbers.add(warranty_num)

            entry = dict(warranty)
            warranty_text = entry.get('warranty_text', '')
            if isinstance(warranty_text, str) and len(warranty_text) > 500:
                entry['warranty_text'] = warranty_text[:497] + "..."
            cleaned_warranties.append(entry)

        cleaned_warranties.sort(key=sort_key)
        return cleaned_warranties
    except (TypeError, ValueError, AttributeError) as e:
        # Best effort: malformed input must not lose the extracted claims.
        print(f"ā ļø Warranty verification failed: {e}, using original list")
        return warranties
def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
    """Search Chroma DB for documents relevant to a specific warranty.

    NOTE: a method with this same name is defined again further down in the
    class body; that later definition is the one bound on the class.

    Several queries are built from the warranty title, text, section number,
    extracted key terms and general terms. Each query asks for at most
    ``min(top_k, 10)`` hits. Hits are de-duplicated by document id (the first
    occurrence is kept) and sorted by ascending distance.

    Args:
        warranty: Claim dict; ``warranty_title``, ``warranty_text`` and
            ``warranty_number`` are read.
        top_k: Maximum number of documents to return.

    Returns:
        Dicts with ``id``, ``content``, ``metadata``, ``distance`` and
        ``query_match``; empty when no collection is available.
    """
    import hashlib  # local import: only needed for the content-derived fallback id

    if self.edr_collection is None:
        print("ā ļø Chroma DB not available, skipping search")
        return []

    # Create search queries based on warranty content
    warranty_text = warranty.get('warranty_text', '') or ''
    warranty_title = warranty.get('warranty_title', '') or ''
    warranty_number = warranty.get('warranty_number', '') or ''

    search_queries = []
    if warranty_title:
        search_queries.append(warranty_title)
    if warranty_text:
        search_queries.append(warranty_text[:500])
    if warranty_title and warranty_text:
        search_queries.append(f"{warranty_title} {warranty_text[:200]}")
    if warranty_number:
        search_queries.append(f"section {warranty_number}")
    key_terms = self.extract_key_terms_from_warranty(warranty)
    if key_terms:
        search_queries.extend(key_terms[:3])
    general_terms = self.get_general_warranty_terms(warranty)
    search_queries.extend(general_terms[:2])

    all_documents = []
    retrieved_ids = set()
    print(f"š Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")

    for i, query in enumerate(search_queries):
        if not query.strip():
            continue
        try:
            print(f" Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
            results = self.edr_collection.query(
                query_texts=[query],
                n_results=min(top_k, 10),  # Limit per query
                include=["documents", "metadatas", "distances"]
            )
            document_rows = results.get('documents')
            documents = document_rows[0] if document_rows else []
            if documents:
                metadata_rows = results.get('metadatas')
                metadatas = metadata_rows[0] if metadata_rows else [{}] * len(documents)
                distance_rows = results.get('distances')
                distances = distance_rows[0] if distance_rows else [1.0] * len(documents)
                id_rows = results.get('ids')
                chroma_ids = id_rows[0] if id_rows else [None] * len(documents)

                for doc, metadata, distance, chroma_id in zip(documents, metadatas, distances, chroma_ids):
                    if doc is None:
                        continue
                    # Chroma reports None for records stored without metadata.
                    metadata = metadata or {}
                    # The id must not depend on rank or on the per-process
                    # salted hash(), or the same document is never recognised
                    # as a duplicate across queries.
                    doc_id = (
                        metadata.get('id')
                        or chroma_id
                        or 'doc_' + hashlib.sha1(doc.encode('utf-8')).hexdigest()[:16]
                    )
                    if doc_id in retrieved_ids:
                        continue
                    retrieved_ids.add(doc_id)
                    all_documents.append({
                        'id': doc_id,
                        'content': doc,
                        'metadata': metadata,
                        'distance': distance,
                        'query_match': query
                    })
            print(f" Found {len(documents)} documents")
        except Exception as e:
            print(f"ā ļø ChromaDB search failed for query '{query[:50]}...': {e}")
            continue

    # Sort by relevance (lower distance = more relevant)
    all_documents.sort(key=lambda x: x['distance'])
    final_docs = all_documents[:top_k]
    print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
    return final_docs
def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
    """Pick up to five search terms from a warranty's title and text.

    Up to three capitalised phrases from the warranty text come first,
    followed by the distinct legal/business keywords among the first five
    keyword hits in the lower-cased title plus text. Duplicates are removed
    in first-seen order, so the same input always yields the same list.

    Args:
        warranty: Claim dict; ``warranty_text`` and ``warranty_title`` are read.

    Returns:
        At most five terms.
    """
    warranty_text = warranty.get('warranty_text', '') or ''
    warranty_title = warranty.get('warranty_title', '') or ''
    full_text = f"{warranty_title} {warranty_text}".lower()

    # Capitalised phrases are likely defined terms or proper names.
    caps_terms = re.findall(r'\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b', warranty_text)
    # Look for legal/business terms
    business_terms = re.findall(r'\b(?:agreement|contract|liability|obligation|compliance|breach|default|guarantee|indemnity|insurance|permit|license|employee|consultant|shareholder|subsidiary|acquisition|merger|transaction|disclosure|warranty|representation)\b', full_text)

    key_terms = caps_terms[:3] + business_terms[:5]
    # dict.fromkeys de-duplicates while keeping order; a set would make the
    # truncation below pick a different subset from run to run.
    return list(dict.fromkeys(key_terms))[:5]
def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
    """Produce the disclosure text for one warranty claim.

    NOTE: a method with this same name is defined again further down in the
    class body; that later definition is the one bound on the class.

    Documents found by ``search_chroma_for_warranty`` (at most ten) are
    registered with the reference manager and quoted, truncated to 1000
    characters each, in the prompt sent to the model. The basic disclosure is
    returned instead when no document is found or the model call fails.

    Args:
        warranty: Claim dict (``warranty_number``, ``warranty_title``,
            ``warranty_text``, ``section_name``).

    Returns:
        Markdown disclosure text, or a fixed notice when LangChain is missing.
    """
    if not LANGCHAIN_AVAILABLE:
        print("ā ļø LangChain not available for disclosure generation")
        return "Disclosure generation unavailable - LangChain not installed."

    shown_number = warranty.get('warranty_number', 'Unknown')
    shown_title = warranty.get('warranty_title', 'Unknown')
    print(f"Generating disclosure for: {shown_number} - {shown_title}")

    # Search for relevant documents
    relevant_docs = self.search_chroma_for_warranty(warranty)
    if not relevant_docs:
        print("ā ļø No relevant documents found in ChromaDB")
        return self.generate_basic_disclosure(warranty)

    # Build one context paragraph and one citation per supporting document.
    context_parts = []
    references = []
    for doc in relevant_docs[:10]:
        body = doc['content']
        doc_metadata = doc['metadata']
        ref_num = self.ref_manager.add_document(
            doc_id=doc['id'],
            content=body,
            metadata=doc_metadata
        )
        citation = f"[{ref_num}]"
        references.append(citation)
        excerpt = body[:1000]
        if len(body) > 1000:
            excerpt += "..."
        context_parts.append(f"Document {citation}: {excerpt}")
    context_text = "\n\n".join(context_parts)
    citation_list = ', '.join(references)

    prompt_number = warranty.get('warranty_number', 'N/A')
    prompt_title = warranty.get('warranty_title', 'N/A')
    prompt_text = warranty.get('warranty_text', 'N/A')
    prompt_section = warranty.get('section_name', 'N/A')

    # Generate disclosure using LLM
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
    disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.
**Warranty Claim Details:**
- **Number:** {prompt_number}
- **Title:** {prompt_title}
- **Text:** {prompt_text}
- **Section:** {prompt_section}
**Available Supporting Documentation:**
{context_text}
**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {citation_list} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)
**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]
## Overview
[Brief summary of the warranty and its implications]
## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]
## Conclusion
[Summary of key findings and any qualifications to the warranty]
**Note**: Include inline citations throughout using the format {citation_list} to reference specific supporting documents.
"""
    try:
        answer = llm.invoke(disclosure_prompt).content.strip()
        if answer.startswith('#'):
            return answer
        # Prepend the standard heading when the model omitted it.
        return f"# Disclosure Summary for Warranty Claim: {shown_title}\n\n{answer}"
    except Exception as e:
        print(f"ā Error generating disclosure: {e}")
        return self.generate_basic_disclosure(warranty)
def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
    """Return the fixed placeholder disclosure for a warranty.

    NOTE: a method with this same name is defined again further down in the
    class body; that later definition is the one bound on the class.

    Only the warranty number, title and text are interpolated; a missing
    title shows as ``Unknown`` in the heading and ``N/A`` in the details.
    """
    heading_title = warranty.get('warranty_title', 'Unknown')
    section = warranty.get('warranty_number', 'N/A')
    title = warranty.get('warranty_title', 'N/A')
    text = warranty.get('warranty_text', 'N/A')
    placeholder = f"""# Disclosure Summary for Warranty Claim: {heading_title}
## Warranty Details
- **Section:** {section}
- **Title:** {title}
- **Text:** {text}
## Analysis
This warranty claim requires detailed analysis against the company's records and documentation.
## Conclusion
Further investigation and document review required to provide comprehensive disclosure analysis.
**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.
"""
    return placeholder
def export_to_markdown(self, output_dir: str = "/tf/active") -> str:
    """Write all warranties and their disclosures to a timestamped Markdown file.

    The document holds a header with counts, a table of contents built from
    ``self.warranty_claims``, one section per entry of ``self.disclosures``
    and the bibliography from the reference manager. Parts are joined with
    real newline characters.

    Args:
        output_dir: Directory for the output file; defaults to the location
            that was previously fixed.

    Returns:
        Path of the written file, or ``""`` when there is nothing to export.
    """
    print("\n" + "="*60)
    print("EXPORTING TO MARKDOWN")
    print("="*60)
    if not self.warranty_claims or not self.disclosures:
        print("ā ļø No warranty claims or disclosures available for export")
        return ""

    # Generate output filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(output_dir, f"project_victoria_disclosures_fixed_{timestamp}.md")

    markdown_parts = []
    # Header
    markdown_parts.append("# Project Victoria - Warranty Disclosures\n")
    markdown_parts.append(f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
    markdown_parts.append(f"**Total Warranties Processed**: {len(self.warranty_claims)}")
    markdown_parts.append(f"**Total Disclosures Generated**: {len(self.disclosures)}")
    markdown_parts.append(f"**Total References**: {len(self.ref_manager.references)}")
    markdown_parts.append("\n---\n")

    # Table of contents
    markdown_parts.append("## Table of Contents\n")
    for warranty in self.warranty_claims:
        warranty_num = warranty.get('warranty_number', 'Unknown')
        warranty_title = warranty.get('warranty_title', 'No Title')
        anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
        markdown_parts.append(f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})")
    markdown_parts.append("\n---\n")

    # Warranty sections
    for disclosure_record in self.disclosures:
        warranty = disclosure_record['warranty']
        disclosure_text = disclosure_record['disclosure']
        warranty_num = warranty.get('warranty_number', 'Unknown')
        warranty_title = warranty.get('warranty_title', 'No Title')
        markdown_parts.append(f"## {warranty_num} - {warranty_title}\n")

        section_name = warranty.get('section_name', 'Unknown Section')
        markdown_parts.append(f"**Section**: {section_name}")

        # NOTE(review): this repeats the Chroma search with top_k=1, so the
        # reported count can only be 0 or 1 - confirm that is intended.
        warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
        doc_count = len(warranty_docs) if warranty_docs else 0
        markdown_parts.append(f"**Source Documents Found**: {doc_count}\n")

        markdown_parts.append("### Warranty Text\n")
        warranty_text = warranty.get('warranty_text', 'No warranty text available')
        markdown_parts.append(warranty_text + "\n")

        markdown_parts.append("### Disclosure\n")
        markdown_parts.append(disclosure_text + "\n")
        markdown_parts.append("---\n")

    # References section
    bibliography = self.ref_manager.generate_bibliography()
    markdown_parts.append(bibliography)

    # A real newline is required here: the escaped form "\\n" put a literal
    # backslash-n into the file and collapsed the document onto one line.
    markdown_content = "\n".join(markdown_parts)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    print(f"✅ Markdown exported to: {output_path}")
    print(f"š File size: {len(markdown_content):,} characters")
    return output_path
def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
    """Return up to five generic search terms for a warranty.

    The first category keyword found in the lower-cased title plus text
    contributes its first two related terms. The leading number of
    ``warranty_number`` (the part before the first dot) then contributes the
    terms mapped to that section. Duplicates are removed in first-seen
    order, so the same input always yields the same list.

    Args:
        warranty: Claim dict; ``warranty_text``, ``warranty_title`` and
            ``warranty_number`` are read.

    Returns:
        At most five terms.
    """
    warranty_text = (warranty.get('warranty_text', '') or '').lower()
    warranty_title = (warranty.get('warranty_title', '') or '').lower()
    warranty_number = str(warranty.get('warranty_number', '') or '')
    general_terms = []

    # Map warranty types to general terms
    term_mapping = {
        'incorporation': ['incorporation', 'company formation', 'legal entity'],
        'shares': ['shareholding', 'equity', 'ownership'],
        'authority': ['power', 'authorization', 'legal capacity'],
        'accounts': ['financial statements', 'accounting', 'audited accounts'],
        'business': ['operations', 'trading', 'commercial activity'],
        'assets': ['property', 'equipment', 'resources'],
        'contracts': ['agreements', 'legal obligations', 'commitments'],
        'employment': ['employees', 'staff', 'personnel', 'labour'],
        'litigation': ['legal proceedings', 'disputes', 'claims'],
        'compliance': ['regulatory', 'legal requirements', 'obligations'],
        'insurance': ['policies', 'coverage', 'protection'],
        'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
        'environmental': ['EHS', 'environmental compliance', 'health safety'],
        'tax': ['taxation', 'tax compliance', 'tax obligations'],
        'permits': ['licenses', 'authorizations', 'regulatory approvals']
    }
    # Only the first matching category contributes.
    combined_text = f"{warranty_title} {warranty_text}"
    for category, terms in term_mapping.items():
        if category in combined_text:
            general_terms.extend(terms[:2])
            break

    # Add section-based terms
    if warranty_number:
        section_num = warranty_number.split('.')[0]
        section_mapping = {
            '1': ['shares', 'sellers', 'ownership'],
            '2': ['authority', 'capacity', 'power'],
            '3': ['share capital', 'securities', 'equity'],
            '4': ['accounts', 'financial statements'],
            '5': ['business continuity', 'operations'],
            '6': ['assets', 'property', 'equipment'],
            '7': ['corporate', 'constitutional'],
            '8': ['contracts', 'agreements'],
            '9': ['borrowing', 'debt', 'financing'],
            '10': ['permits', 'licenses'],
            '11': ['insolvency', 'financial distress'],
            '12': ['litigation', 'legal proceedings'],
            '13': ['employment', 'employees'],
            '14': ['competition', 'antitrust'],
            '15': ['environmental', 'health safety'],
            '16': ['real estate', 'property'],
            '17': ['insurance', 'coverage'],
            '18': ['intellectual property', 'IP'],
            '19': ['data protection', 'privacy'],
            '20': ['tax', 'taxation'],
            '21': ['regulatory', 'compliance'],
            '22': ['technology', 'IT systems'],
            '23': ['material contracts', 'key agreements']
        }
        general_terms.extend(section_mapping.get(section_num, []))

    # dict.fromkeys de-duplicates while keeping order; a set would make the
    # truncation pick a different subset from run to run.
    return list(dict.fromkeys(general_terms))[:5]
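# NOTE: the definitions that follow reuse the names of methods already defined above
# (search_chroma_for_warranty and the ones after it). Being later in the class body,
# they are the versions bound on the class; the earlier definitions are overridden.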
def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
    """Search Chroma DB for documents relevant to a specific warranty.

    Several queries are built from the warranty title, text, section number,
    extracted key terms and general terms. Each query asks for at most
    ``min(top_k, 10)`` hits. Hits are de-duplicated by document id (the first
    occurrence is kept) and sorted by ascending distance.

    Args:
        warranty: Claim dict; ``warranty_title``, ``warranty_text`` and
            ``warranty_number`` are read.
        top_k: Maximum number of documents to return.

    Returns:
        Dicts with ``id``, ``content``, ``metadata``, ``distance`` and
        ``query_match``; empty when no collection is available.
    """
    import hashlib  # local import: only needed for the content-derived fallback id

    if self.edr_collection is None:
        print("ā ļø Chroma DB not available, skipping search")
        return []

    # Create search queries based on warranty content
    warranty_text = warranty.get('warranty_text', '') or ''
    warranty_title = warranty.get('warranty_title', '') or ''
    warranty_number = warranty.get('warranty_number', '') or ''

    search_queries = []
    if warranty_title:
        search_queries.append(warranty_title)
    if warranty_text:
        search_queries.append(warranty_text[:500])
    if warranty_title and warranty_text:
        search_queries.append(f"{warranty_title} {warranty_text[:200]}")
    if warranty_number:
        search_queries.append(f"section {warranty_number}")
    key_terms = self.extract_key_terms_from_warranty(warranty)
    if key_terms:
        search_queries.extend(key_terms[:3])
    general_terms = self.get_general_warranty_terms(warranty)
    search_queries.extend(general_terms[:2])

    all_documents = []
    retrieved_ids = set()
    print(f"š Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")

    for i, query in enumerate(search_queries):
        if not query.strip():
            continue
        try:
            print(f" Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
            results = self.edr_collection.query(
                query_texts=[query],
                n_results=min(top_k, 10),  # Limit per query
                include=["documents", "metadatas", "distances"]
            )
            document_rows = results.get('documents')
            documents = document_rows[0] if document_rows else []
            if documents:
                metadata_rows = results.get('metadatas')
                metadatas = metadata_rows[0] if metadata_rows else [{}] * len(documents)
                distance_rows = results.get('distances')
                distances = distance_rows[0] if distance_rows else [1.0] * len(documents)
                id_rows = results.get('ids')
                chroma_ids = id_rows[0] if id_rows else [None] * len(documents)

                for doc, metadata, distance, chroma_id in zip(documents, metadatas, distances, chroma_ids):
                    if doc is None:
                        continue
                    # Chroma reports None for records stored without metadata.
                    metadata = metadata or {}
                    # The id must not depend on rank or on the per-process
                    # salted hash(), or the same document is never recognised
                    # as a duplicate across queries.
                    doc_id = (
                        metadata.get('id')
                        or chroma_id
                        or 'doc_' + hashlib.sha1(doc.encode('utf-8')).hexdigest()[:16]
                    )
                    if doc_id in retrieved_ids:
                        continue
                    retrieved_ids.add(doc_id)
                    all_documents.append({
                        'id': doc_id,
                        'content': doc,
                        'metadata': metadata,
                        'distance': distance,
                        'query_match': query
                    })
            print(f" Found {len(documents)} documents")
        except Exception as e:
            print(f"ā ļø ChromaDB search failed for query '{query[:50]}...': {e}")
            continue

    # Sort by relevance (lower distance = more relevant)
    all_documents.sort(key=lambda x: x['distance'])
    final_docs = all_documents[:top_k]
    print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
    return final_docs
def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
    """Produce the disclosure text for one warranty claim.

    Documents found by ``search_chroma_for_warranty`` (at most ten) are
    registered with the reference manager and quoted, truncated to 1000
    characters each, in the prompt sent to the model. The basic disclosure is
    returned instead when no document is found or the model call fails.

    Args:
        warranty: Claim dict (``warranty_number``, ``warranty_title``,
            ``warranty_text``, ``section_name``).

    Returns:
        Markdown disclosure text, or a fixed notice when LangChain is missing.
    """
    if not LANGCHAIN_AVAILABLE:
        print("ā ļø LangChain not available for disclosure generation")
        return "Disclosure generation unavailable - LangChain not installed."

    shown_number = warranty.get('warranty_number', 'Unknown')
    shown_title = warranty.get('warranty_title', 'Unknown')
    print(f"Generating disclosure for: {shown_number} - {shown_title}")

    # Search for relevant documents
    relevant_docs = self.search_chroma_for_warranty(warranty)
    if not relevant_docs:
        print("ā ļø No relevant documents found in ChromaDB")
        return self.generate_basic_disclosure(warranty)

    # Build one context paragraph and one citation per supporting document.
    context_parts = []
    references = []
    for doc in relevant_docs[:10]:
        body = doc['content']
        doc_metadata = doc['metadata']
        ref_num = self.ref_manager.add_document(
            doc_id=doc['id'],
            content=body,
            metadata=doc_metadata
        )
        citation = f"[{ref_num}]"
        references.append(citation)
        excerpt = body[:1000]
        if len(body) > 1000:
            excerpt += "..."
        context_parts.append(f"Document {citation}: {excerpt}")
    context_text = "\n\n".join(context_parts)
    citation_list = ', '.join(references)

    prompt_number = warranty.get('warranty_number', 'N/A')
    prompt_title = warranty.get('warranty_title', 'N/A')
    prompt_text = warranty.get('warranty_text', 'N/A')
    prompt_section = warranty.get('section_name', 'N/A')

    # Generate disclosure using LLM
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
    disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.
**Warranty Claim Details:**
- **Number:** {prompt_number}
- **Title:** {prompt_title}
- **Text:** {prompt_text}
- **Section:** {prompt_section}
**Available Supporting Documentation:**
{context_text}
**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {citation_list} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)
**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]
## Overview
[Brief summary of the warranty and its implications]
## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]
## Conclusion
[Summary of key findings and any qualifications to the warranty]
**Note**: Include inline citations throughout using the format {citation_list} to reference specific supporting documents.
"""
    try:
        answer = llm.invoke(disclosure_prompt).content.strip()
        if answer.startswith('#'):
            return answer
        # Prepend the standard heading when the model omitted it.
        return f"# Disclosure Summary for Warranty Claim: {shown_title}\n\n{answer}"
    except Exception as e:
        print(f"ā Error generating disclosure: {e}")
        return self.generate_basic_disclosure(warranty)
def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
    """Return a placeholder disclosure built only from the warranty record.

    No documents and no LLM are consulted; the text is a fixed markdown
    template filled with the warranty's number, title and text.

    Args:
        warranty: Warranty record; the keys ``warranty_number``,
            ``warranty_title`` and ``warranty_text`` are read, each with a
            textual default when missing.

    Returns:
        The markdown disclosure, terminated by a newline.
    """
    # NOTE(review): another ``generate_basic_disclosure`` is defined further
    # down; if both sit in the same class body the later one wins - confirm
    # which template is intended.
    heading_title = warranty.get('warranty_title', 'Unknown')
    section = warranty.get('warranty_number', 'N/A')
    title = warranty.get('warranty_title', 'N/A')
    text = warranty.get('warranty_text', 'N/A')

    template_lines = [
        f"# Disclosure Summary for Warranty Claim: {heading_title}",
        "## Warranty Details",
        f"- **Section:** {section}",
        f"- **Title:** {title}",
        f"- **Text:** {text}",
        "## Analysis",
        "This warranty claim requires detailed analysis against the company's records and documentation.",
        "## Conclusion",
        "Further investigation and document review required to provide comprehensive disclosure analysis.",
        "**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.",
        "",  # yields the trailing newline of the template
    ]
    return "\n".join(template_lines)
def export_to_markdown(self, output_path: "str | None" = None) -> str:
    """Write all warranty claims and their disclosures to a markdown file.

    The document consists of a header with summary counts, a table of
    contents (one entry per warranty claim), one section per disclosure
    record and finally the bibliography produced by the reference manager.

    Args:
        output_path: Destination file. When omitted, a timestamped file
            named ``project_victoria_disclosures_fixed_<timestamp>.md`` under
            ``/tf/active`` is used.

    Returns:
        The path that was written, or ``""`` when there are no warranty
        claims or no disclosures to export.
    """
    print("\n" + "=" * 60)
    print("EXPORTING TO MARKDOWN")
    print("=" * 60)

    if not self.warranty_claims or not self.disclosures:
        print("⚠️ No warranty claims or disclosures available for export")
        return ""

    generated_at = datetime.now()
    if output_path is None:
        timestamp = generated_at.strftime('%Y%m%d_%H%M%S')
        output_path = f"/tf/active/project_victoria_disclosures_fixed_{timestamp}.md"

    # Parts are joined with a real newline below; a part that ends in "\n"
    # therefore produces the blank line markdown needs after a heading.
    markdown_parts = [
        "# Project Victoria - Warranty Disclosures\n",
        f"**Generated on**: {generated_at.strftime('%B %d, %Y at %H:%M:%S')}",
        f"**Total Warranties Processed**: {len(self.warranty_claims)}",
        f"**Total Disclosures Generated**: {len(self.disclosures)}",
        f"**Total References**: {len(self.ref_manager.references)}",
        "\n---\n",
        "## Table of Contents\n",
    ]

    # Table of contents
    for warranty in self.warranty_claims:
        warranty_num = str(warranty.get('warranty_number', 'Unknown'))
        warranty_title = str(warranty.get('warranty_title', 'No Title'))
        anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
        markdown_parts.append(
            f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})"
        )
    markdown_parts.append("\n---\n")

    # One section per disclosure record
    for disclosure_record in self.disclosures:
        warranty = disclosure_record['warranty']
        disclosure_text = disclosure_record['disclosure']
        warranty_num = warranty.get('warranty_number', 'Unknown')
        warranty_title = warranty.get('warranty_title', 'No Title')
        section_name = warranty.get('section_name', 'Unknown Section')

        markdown_parts.append(f"## {warranty_num} - {warranty_title}\n")
        markdown_parts.append(f"**Section**: {section_name}")

        # NOTE(review): this re-runs the Chroma search with top_k=1, so the
        # reported count can only ever be 0 or 1 - confirm that is intended.
        warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
        doc_count = len(warranty_docs) if warranty_docs else 0
        markdown_parts.append(f"**Source Documents Found**: {doc_count}\n")

        markdown_parts.append("### Warranty Text\n")
        warranty_text = warranty.get('warranty_text', 'No warranty text available')
        markdown_parts.append(warranty_text + "\n")

        markdown_parts.append("### Disclosure\n")
        markdown_parts.append(disclosure_text + "\n")

        markdown_parts.append("---\n")

    # References section
    markdown_parts.append(self.ref_manager.generate_bibliography())

    # The join separator must be an actual newline; a literal backslash-n
    # would collapse the whole document onto a single line.
    markdown_content = "\n".join(markdown_parts)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"✅ Markdown exported to: {output_path}")
    print(f"š File size: {len(markdown_content):,} characters")
    return output_path
def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
    """Return generic fallback search terms for a warranty.

    Two sources contribute terms:

    * the first category keyword found in the lower-cased title and text
      contributes the first two terms mapped to that category;
    * the top-level section number (the part of ``warranty_number`` before
      the first ``.``) contributes the terms mapped to that section.

    Args:
        warranty: Warranty record; ``warranty_title``, ``warranty_text`` and
            ``warranty_number`` are read. Missing or ``None`` values are
            treated as empty.

    Returns:
        At most five distinct terms, in the order they were collected
        (category terms first, then section terms).
    """
    warranty_title = str(warranty.get('warranty_title') or '').lower()
    warranty_text = str(warranty.get('warranty_text') or '').lower()
    warranty_number = str(warranty.get('warranty_number') or '')

    # Category keyword -> general terms
    term_mapping = {
        'incorporation': ['incorporation', 'company formation', 'legal entity'],
        'shares': ['shareholding', 'equity', 'ownership'],
        'authority': ['power', 'authorization', 'legal capacity'],
        'accounts': ['financial statements', 'accounting', 'audited accounts'],
        'business': ['operations', 'trading', 'commercial activity'],
        'assets': ['property', 'equipment', 'resources'],
        'contracts': ['agreements', 'legal obligations', 'commitments'],
        'employment': ['employees', 'staff', 'personnel', 'labour'],
        'litigation': ['legal proceedings', 'disputes', 'claims'],
        'compliance': ['regulatory', 'legal requirements', 'obligations'],
        'insurance': ['policies', 'coverage', 'protection'],
        'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
        'environmental': ['EHS', 'environmental compliance', 'health safety'],
        'tax': ['taxation', 'tax compliance', 'tax obligations'],
        'permits': ['licenses', 'authorizations', 'regulatory approvals'],
    }

    # Top-level section number -> general terms
    section_mapping = {
        '1': ['shares', 'sellers', 'ownership'],
        '2': ['authority', 'capacity', 'power'],
        '3': ['share capital', 'securities', 'equity'],
        '4': ['accounts', 'financial statements'],
        '5': ['business continuity', 'operations'],
        '6': ['assets', 'property', 'equipment'],
        '7': ['corporate', 'constitutional'],
        '8': ['contracts', 'agreements'],
        '9': ['borrowing', 'debt', 'financing'],
        '10': ['permits', 'licenses'],
        '11': ['insolvency', 'financial distress'],
        '12': ['litigation', 'legal proceedings'],
        '13': ['employment', 'employees'],
        '14': ['competition', 'antitrust'],
        '15': ['environmental', 'health safety'],
        '16': ['real estate', 'property'],
        '17': ['insurance', 'coverage'],
        '18': ['intellectual property', 'IP'],
        '19': ['data protection', 'privacy'],
        '20': ['tax', 'taxation'],
        '21': ['regulatory', 'compliance'],
        '22': ['technology', 'IT systems'],
        '23': ['material contracts', 'key agreements'],
    }

    general_terms: List[str] = []

    # Only the first matching category (in mapping order) contributes.
    combined_text = f"{warranty_title} {warranty_text}"
    for category, terms in term_mapping.items():
        if category in combined_text:
            general_terms.extend(terms[:2])
            break

    if warranty_number:
        section_num = warranty_number.split('.')[0].strip()
        general_terms.extend(section_mapping.get(section_num, []))

    # De-duplicate while keeping insertion order. A set would make the order
    # (and therefore which terms callers pick with a slice) depend on string
    # hashing, which varies between interpreter runs.
    return list(dict.fromkeys(general_terms))[:5]
# Update the search method to include better debugging and error handling
def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
    """Search Chroma DB for documents relevant to a specific warranty.

    Several queries are derived from the warranty (title, leading text,
    title plus text, section number, extracted key terms and general
    fallback terms). Each query is run against ``self.edr_collection`` and
    the hits are merged, de-duplicated by document id and ordered by
    distance.

    Args:
        warranty: Warranty record; ``warranty_title``, ``warranty_text`` and
            ``warranty_number`` are read.
        top_k: Maximum number of documents to return. Each individual query
            asks for at most ``min(top_k, 10)`` results.

    Returns:
        Up to ``top_k`` dicts with the keys ``id``, ``content``,
        ``metadata``, ``distance`` and ``query_match``, closest first. An
        empty list when no collection is available or ``top_k`` is below 1.
    """
    if not self.edr_collection:
        print("⚠️ Chroma DB not available, skipping search")
        return []
    if top_k < 1:
        return []

    warranty_text = warranty.get('warranty_text', '')
    warranty_title = warranty.get('warranty_title', '')
    warranty_number = warranty.get('warranty_number', '')

    # Build several queries for broader coverage.
    search_queries = []
    if warranty_title:
        search_queries.append(warranty_title)
    if warranty_text:
        search_queries.append(warranty_text[:500])
    if warranty_title and warranty_text:
        search_queries.append(f"{warranty_title} {warranty_text[:200]}")
    if warranty_number:
        search_queries.append(f"section {warranty_number}")

    key_terms = self.extract_key_terms_from_warranty(warranty)
    if key_terms:
        search_queries.extend(key_terms[:3])

    general_terms = self.get_general_warranty_terms(warranty)
    search_queries.extend(general_terms[:2])

    all_documents = []
    retrieved_ids = set()
    print(f"š Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")

    for i, query in enumerate(search_queries):
        if not query.strip():
            continue
        try:
            print(f" Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
            results = self.edr_collection.query(
                query_texts=[query],
                n_results=min(top_k, 10),  # Limit per query
                include=["documents", "metadatas", "distances"],
            )

            documents = (results.get('documents') or [[]])[0] or []
            found = len(documents)
            metadatas = (results.get('metadatas') or [None])[0] or [{}] * found
            distances = (results.get('distances') or [None])[0] or [1.0] * found
            # Record ids as reported by the collection, padded so the zip
            # below never drops a document when ids are absent.
            chroma_ids = list((results.get('ids') or [None])[0] or [])
            chroma_ids += [None] * (found - len(chroma_ids))

            for doc, metadata, distance, chroma_id in zip(documents, metadatas, distances, chroma_ids):
                # A metadata entry may be None; treat it as empty rather than
                # letting .get() raise and discard the rest of this query.
                metadata = metadata or {}
                # The id must not depend on the hit's rank, otherwise the same
                # chunk returned by two queries is never recognised as a
                # duplicate.
                doc_id = (
                    metadata.get('id')
                    or chroma_id
                    or f"doc_{hash((doc or '')[:100])}"
                )
                if doc_id in retrieved_ids:
                    continue
                retrieved_ids.add(doc_id)
                all_documents.append({
                    'id': doc_id,
                    'content': doc,
                    'metadata': metadata,
                    'distance': distance,
                    'query_match': query,
                })
            print(f" Found {found} documents")
        except Exception as e:
            # Best effort: one failing query must not abort the others.
            print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
            continue

    # Lower distance = more relevant.
    all_documents.sort(key=lambda x: x['distance'])
    final_docs = all_documents[:top_k]
    print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
    return final_docs
def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
    """Generate a disclosure for a specific warranty.

    Relevant documents are looked up with ``search_chroma_for_warranty``;
    the first ten are registered with the reference manager, truncated to
    1000 characters each and passed to the LLM together with the warranty
    details.

    Args:
        warranty: Warranty record; ``warranty_number``, ``warranty_title``,
            ``warranty_text`` and ``section_name`` are read.

    Returns:
        The disclosure as markdown. Falls back to
        ``generate_basic_disclosure`` when no documents are found or the LLM
        call raises, and to a fixed notice when LangChain is unavailable.
    """
    if not LANGCHAIN_AVAILABLE:
        print("⚠️ LangChain not available for disclosure generation")
        return "Disclosure generation unavailable - LangChain not installed."

    print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")

    # Search for relevant documents
    relevant_docs = self.search_chroma_for_warranty(warranty)
    if not relevant_docs:
        print("⚠️ No relevant documents found in ChromaDB")
        return self.generate_basic_disclosure(warranty)

    # Prepare context from documents
    context_parts = []
    references = []
    for doc in relevant_docs[:10]:  # Limit to top 10 documents
        # Guard against a hit without stored text so the slice below
        # cannot fail on None.
        content = doc.get('content') or ''
        metadata = doc.get('metadata') or {}

        # Register the document and obtain its citation number
        ref_num = self.ref_manager.add_document(
            doc_id=doc['id'],
            content=content,
            metadata=metadata
        )
        citation = f"[{ref_num}]"
        references.append(citation)

        # Truncate content for context
        truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
        context_parts.append(f"Document {citation}: {truncated_content}")

    context_text = "\n\n".join(context_parts)

    # Generate disclosure using LLM
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
    disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.
**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}
**Available Supporting Documentation:**
{context_text}
**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)
**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]
## Overview
[Brief summary of the warranty and its implications]
## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]
## Conclusion
[Summary of key findings and any qualifications to the warranty]
**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
    try:
        response = llm.invoke(disclosure_prompt)
        disclosure_text = response.content.strip()
        # Ensure the result starts with a markdown heading
        if not disclosure_text.startswith('#'):
            disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
        return disclosure_text
    except Exception as e:
        # Best effort: an LLM failure degrades to the basic template.
        print(f"❌ Error generating disclosure: {e}")
        return self.generate_basic_disclosure(warranty)
def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
    """Return a template disclosure that needs neither documents nor an LLM.

    Args:
        warranty: Warranty record; ``warranty_number``, ``warranty_title``
            and ``warranty_text`` are read, each with a textual default when
            the key is missing.

    Returns:
        Markdown text that starts and ends with a newline.
    """
    number = warranty.get('warranty_number', 'Unknown')
    title = warranty.get('warranty_title', 'Unknown')
    text = warranty.get('warranty_text', 'No warranty text available')

    template_lines = (
        "",  # leading newline of the template
        f"# Basic Disclosure for Warranty {number}",
        "## Warranty Overview",
        f"- **Section**: {number}",
        f"- **Title**: {title}",
        f"- **Text**: {text}",
        "## Disclosure Status",
        "This warranty requires detailed analysis based on company documentation and records.",
        "**Note**: Full disclosure analysis requires access to supporting documentation from the company's data room and relevant business records.",
        "## Next Steps",
        "1. Review relevant company documentation",
        "2. Consult with legal and business teams",
        "3. Verify compliance with warranty requirements",
        "4. Update disclosure based on findings",
        "",  # trailing newline of the template
    )
    return "\n".join(template_lines)
def generate_all_disclosures(self):
    """Generate a disclosure record for every entry in ``self.warranty_claims``.

    ``self.disclosures`` is rebuilt from scratch. Each record holds the
    warranty, the disclosure text and an ISO timestamp. When generation for
    a warranty raises, the basic disclosure is stored instead and the record
    additionally carries the error message under ``error``.

    Returns:
        None. Nothing is changed when there are no warranty claims.
    """
    print("\n" + "=" * 60)
    print("GENERATING DISCLOSURES")
    print("=" * 60)

    if not self.warranty_claims:
        print("⚠️ No warranty claims available for disclosure generation")
        return

    self.disclosures = []
    total = len(self.warranty_claims)
    for position, warranty in enumerate(self.warranty_claims, start=1):
        warranty_number = warranty.get('warranty_number', 'Unknown')
        print(f"\nProgress: {position}/{total} - Processing warranty {warranty_number}")
        try:
            disclosure_text = self.generate_disclosure_for_warranty(warranty)
            disclosure_record = {
                'warranty': warranty,
                'disclosure': disclosure_text,
                'generated_at': datetime.now().isoformat(),
            }
        except Exception as e:
            # Best effort: one failing warranty must not stop the batch.
            print(f"❌ Failed to generate disclosure for warranty {warranty_number}: {e}")
            disclosure_record = {
                'warranty': warranty,
                'disclosure': self.generate_basic_disclosure(warranty),
                'generated_at': datetime.now().isoformat(),
                'error': str(e),
            }
        self.disclosures.append(disclosure_record)

    print(f"\n✅ Generated {len(self.disclosures)} disclosures")
def run_complete_pipeline(self) -> str:
    """Run the complete disclosure generation pipeline.

    Runs, in order: PDF text extraction, warranty claim identification,
    disclosure generation and markdown export, printing progress and a
    summary along the way.

    Returns:
        Whatever ``export_to_markdown`` returns (the output file path).

    Raises:
        Exception: Any exception raised by a step is reported and re-raised.
    """
    # "\n" must be a real newline escape; a doubled backslash prints the two
    # characters backslash and n instead of a blank line.
    print("š Starting Fixed Project Victoria Disclosure Generation Pipeline\n")
    try:
        # Step 1: Extract PDF text
        print("Step 1: Extracting PDF text...")
        self.extract_pdf_text()

        # Step 2: Identify warranty claims
        print("\nStep 2: Identifying warranty claims...")
        self.identify_warranty_claims()

        # Step 3: Generate disclosures
        print("\nStep 3: Generating disclosures...")
        self.generate_all_disclosures()

        # Step 4: Export to markdown
        print("\nStep 4: Exporting to markdown...")
        output_path = self.export_to_markdown()

        banner = "=" * 60
        print("\n" + banner)
        print("✅ FIXED PIPELINE COMPLETED SUCCESSFULLY!")
        print(banner)
        print(f"š Output file: {output_path}")
        print(f"š Total warranties processed: {len(self.warranty_claims)}")
        print(f"š Total disclosures generated: {len(self.disclosures)}")
        print(f"š Total references: {len(self.ref_manager.references)}")
        print(banner)
        return output_path
    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        raise
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | | - | - |
Parameter Details
bases: Parameter of unspecified type
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self)
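Purpose: Constructor. Sets the PDF, extracted-text and Chroma paths, creates the reference manager and tokenizer, and attempts to connect to the remote ChromaDB `99_edr` collection.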
Returns: None
extract_pdf_text(self)
Purpose: Extract text from PDF or load from existing file.
Returns: None
split_text_by_warranty_sections(self, text) -> List[str]
Purpose: Split text into chunks based on warranty section boundaries. This ensures warranty sections are not split across chunks.
Parameters:
text: Type: str
Returns: Returns List[str]
identify_warranty_claims(self) -> List[Dict[str, Any]]
Purpose: Identify and extract individual warranty claims from the document.
Returns: Returns List[Dict[str, Any]]
verify_and_clean_warranties(self, warranties) -> List[Dict[str, Any]]
Purpose: Verify and clean warranty claims.
Parameters:
warranties: Type: List[Dict[str, Any]]
Returns: Returns List[Dict[str, Any]]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
extract_key_terms_from_warranty(self, warranty) -> List[str]
Purpose: Extract key terms from warranty for search queries.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
generate_disclosure_for_warranty(self, warranty) -> str
Purpose: Generate a disclosure for a specific warranty claim.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
generate_basic_disclosure(self, warranty) -> str
Purpose: Generate a basic disclosure when no documents or LLM is available.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
export_to_markdown(self) -> str
Purpose: Export warranty claims and disclosures to markdown format.
Returns: Returns str
get_general_warranty_terms(self, warranty) -> List[str]
Purpose: Get general search terms based on warranty type.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
generate_disclosure_for_warranty(self, warranty) -> str
Purpose: Generate a disclosure for a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
generate_basic_disclosure(self, warranty) -> str
Purpose: Generate a basic disclosure when no documents or LLM is available.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
export_to_markdown(self) -> str
Purpose: Export warranty claims and disclosures to markdown format.
Returns: Returns str
get_general_warranty_terms(self, warranty) -> List[str]
Purpose: Get general search terms based on warranty type.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
generate_disclosure_for_warranty(self, warranty) -> str
Purpose: Generate a disclosure for a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
generate_basic_disclosure(self, warranty) -> str
Purpose: Generate a basic disclosure when no documents or LLM is available.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns str
generate_all_disclosures(self)
Purpose: Generate disclosures for all warranty claims.
Returns: None
run_complete_pipeline(self) -> str
Purpose: Run the complete disclosure generation pipeline.
Returns: Returns str
Required Imports
import os
import re
import json
import tiktoken
from typing import List
Usage Example
# Example usage:
# generator = FixedProjectVictoriaGenerator()
# output_path = generator.run_complete_pipeline()
Similar Components
AI-powered semantic similarity - components with related functionality:
- class ProjectVictoriaDisclosureGenerator 82.8% similar
- class ImprovedProjectVictoriaGenerator 77.8% similar
- function main_v28 67.8% similar
- function main_v29 63.0% similar
- function main_v14 56.7% similar