class ImprovedProjectVictoriaGenerator
Improved Project Victoria Disclosure Generator with proper reference management.
File: /tf/active/vicechatdev/improved_project_victoria_generator.py
Lines: 147 - 787
Complexity: moderate
Purpose
Improved Project Victoria Disclosure Generator with proper reference management.
Source Code
class ImprovedProjectVictoriaGenerator:
"""
Improved Project Victoria Disclosure Generator with proper reference management.
"""
def __init__(self, pdf_path: str = "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf"):
"""
Initialize the improved disclosure generator.
Args:
pdf_path: Path to the Project Victoria PDF document
"""
self.pdf_path = pdf_path
self.extracted_text = ""
self.warranty_claims = []
self.disclosures = {}
# Initialize reference manager
self.ref_manager = ReferenceManager()
# LLM configuration
self.api_key = "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"
os.environ["OPENAI_API_KEY"] = self.api_key
# Initialize tokenizer for counting tokens
self.tokenizer = tiktoken.get_encoding("cl100k_base")
# Initialize Chroma DB connection
self.init_chroma_connection()
# Cross-encoder for ranking (if available)
if CROSSENCODER_AVAILABLE:
try:
self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
print("ā
Cross-encoder initialized for document reranking")
except Exception as e:
print(f"ā ļø Cross-encoder initialization failed: {e}")
self.cross_encoder = None
else:
self.cross_encoder = None
print(f"Initialized Improved Project Victoria Disclosure Generator")
print(f"PDF path: {pdf_path}")
def init_chroma_connection(self):
"""Initialize connection to Chroma DB."""
try:
# Connect to Chroma DB (assuming it's running on vice_chroma:8000)
self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)
# Set up embedding function
if EMBEDDING_AVAILABLE:
self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
else:
print("ā ļø Using default Chroma embeddings (may not work with custom collections)")
self.chroma_embedder = None
# Get the 99_edr collection
self.edr_collection = self.chroma_client.get_collection(
"99_edr",
embedding_function=self.chroma_embedder
)
print("ā
Successfully connected to Chroma DB and 99_edr collection")
except Exception as e:
print(f"ā Error connecting to Chroma DB: {e}")
print("Attempting to use local fallback or alternative connection...")
self.chroma_client = None
self.edr_collection = None
def extract_pdf_text(self) -> str:
"""Extract text from the Project Victoria PDF document."""
# First try to use existing extracted text file
try:
with open("project_victoria_extracted.txt", "r", encoding="utf-8") as f:
self.extracted_text = f.read()
print(f"ā
Successfully loaded text from existing file: {len(self.extracted_text)} characters")
return self.extracted_text
except FileNotFoundError:
print("š No existing extracted text file found, attempting PDF extraction...")
except Exception as e:
print(f"ā ļø Error reading existing text file: {e}")
# Try PDF extraction if PyMuPDF is available
if PDF_AVAILABLE and fitz:
try:
print(f"Extracting text from PDF: {self.pdf_path}")
# Open the PDF document
doc = fitz.open(self.pdf_path)
text_content = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
text = page.get_text()
text_content.append(f"\n--- Page {page_num + 1} ---\n{text}")
doc.close()
self.extracted_text = "\n".join(text_content)
# Save extracted text for future use
try:
with open("project_victoria_extracted.txt", "w", encoding="utf-8") as f:
f.write(self.extracted_text)
print("š¾ Saved extracted text to project_victoria_extracted.txt")
except Exception as save_error:
print(f"ā ļø Could not save extracted text: {save_error}")
print(f"ā
Successfully extracted {len(self.extracted_text)} characters from PDF")
print(f"š Total pages processed: {len(doc)}")
return self.extracted_text
except Exception as e:
print(f"ā Error extracting PDF text: {e}")
else:
print("ā PyMuPDF not available for PDF extraction")
# If all else fails, provide manual instructions
raise Exception("""
Could not extract text from PDF. Please do one of the following:
1. Install PyMuPDF: pip install PyMuPDF
2. Manually extract text from the PDF and save it as 'project_victoria_extracted.txt'
3. Provide the extracted text file in the working directory
The script will automatically use the text file if available.
""")
def identify_warranty_claims(self) -> List[Dict[str, Any]]:
"""Identify and extract individual warranty claims from the document."""
print("\n" + "="*60)
print("IDENTIFYING WARRANTY CLAIMS")
print("="*60)
# Use LLM to identify warranty claims
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=16000)
warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.
Your task is to extract ALL individual warranty claims from the text. Each warranty claim should be identified as a distinct legal requirement or representation.
Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4")
- "warranty_title": A short descriptive title for the warranty
- "warranty_text": The complete text of the warranty claim (limit to 500 characters)
- "section_name": The main section name (e.g., "THE SHARES; THE SELLERS", "AUTHORITY AND CAPACITY")
Focus on extracting the actual warranty statements, not procedural text or definitions.
IMPORTANT: Keep warranty_text under 500 characters to ensure the JSON response is not truncated.
Here is the document text to analyze:
{document_text}
Return only a valid JSON array of warranty claims. Ensure the response is complete and valid JSON.
"""
# Split text into chunks if too long
max_chunk_size = 30000
text_chunks = self.split_text_for_processing(self.extracted_text, max_chunk_size)
all_warranties = []
for i, chunk in enumerate(text_chunks):
print(f"Processing warranty extraction chunk {i+1}/{len(text_chunks)}")
prompt = warranty_extraction_prompt.format(document_text=chunk)
try:
response = llm.invoke(prompt)
response_text = response.content.strip()
# Clean up response to ensure valid JSON
if response_text.startswith("```json"):
response_text = response_text[7:]
if response_text.endswith("```"):
response_text = response_text[:-3]
# Try to repair truncated JSON
response_text = response_text.strip()
if not response_text.endswith(']'):
last_complete = response_text.rfind('}')  # array items are flat objects, so the last '}' closes the final complete item
if last_complete > 0:
response_text = response_text[:last_complete + 1] + ']'
else:
response_text = response_text + ']'
chunk_warranties = json.loads(response_text)
if isinstance(chunk_warranties, list):
all_warranties.extend(chunk_warranties)
print(f"ā
Extracted {len(chunk_warranties)} warranties from chunk {i+1}")
else:
print(f"ā ļø Unexpected response format from chunk {i+1}")
except json.JSONDecodeError as je:
print(f"ā JSON decode error in chunk {i+1}: {je}")
print(f"Response was: {response_text[:500]}...")
# Try alternative extraction
try:
warranty_pattern = r'\{\s*"warranty_number"[^}]+\}'
matches = re.findall(warranty_pattern, response_text, re.DOTALL)
for match in matches:
try:
warranty = json.loads(match)
all_warranties.append(warranty)
except json.JSONDecodeError:
continue
if matches:
print(f"ā
Recovered {len(matches)} warranties using regex extraction")
except Exception as re_error:
print(f"ā Regex recovery also failed: {re_error}")
except Exception as e:
print(f"ā Error processing chunk {i+1}: {e}")
# Clean up duplicates
if all_warranties:
all_warranties = self.verify_and_clean_warranties(all_warranties)
self.warranty_claims = all_warranties
print(f"ā
Total warranty claims identified: {len(self.warranty_claims)}")
# Display summary
if self.warranty_claims:
print("\nSample warranty claims:")
for i, warranty in enumerate(self.warranty_claims[:3]):
print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
return self.warranty_claims
def verify_and_clean_warranties(self, warranties: List[Dict]) -> List[Dict]:
"""Use LLM to verify and clean up the warranty list."""
print("\nVerifying and cleaning warranty claims...")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=3000)
verification_prompt = """
You are reviewing a list of warranty claims extracted from a legal document.
Please clean up this list by:
1. Removing any duplicates
2. Ensuring each warranty is a distinct legal claim
3. Fixing any formatting issues
4. Ensuring warranty numbers are correct
5. Making sure warranty titles are concise and descriptive
Here is the warranty list to review:
{warranties_json}
Return the cleaned list as a valid JSON array with the same structure.
"""
try:
prompt = verification_prompt.format(warranties_json=json.dumps(warranties, indent=2))
response = llm.invoke(prompt)
response_text = response.content.strip()
# Clean up response
if response_text.startswith("```json"):
response_text = response_text[7:]
if response_text.endswith("```"):
response_text = response_text[:-3]
cleaned_warranties = json.loads(response_text)
print(f"ā
Warranty verification complete: {len(warranties)} ā {len(cleaned_warranties)} claims")
return cleaned_warranties
except Exception as e:
print(f"ā ļø Warranty verification failed: {e}, using original list")
return warranties
def split_text_for_processing(self, text: str, max_tokens: int) -> List[str]:
"""Split text into chunks that fit within token limits."""
tokens = self.tokenizer.encode(text)
chunks = []
for i in range(0, len(tokens), max_tokens):
chunk_tokens = tokens[i:i + max_tokens]
chunk_text = self.tokenizer.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks
def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
"""Search Chroma DB for documents relevant to a specific warranty."""
if not self.edr_collection:
print("ā ļø Chroma DB not available, skipping search")
return []
# Create search queries based on warranty content
warranty_text = warranty.get('warranty_text', '')
warranty_title = warranty.get('warranty_title', '')
# Generate multiple search queries for comprehensive coverage
search_queries = [
warranty_title,
warranty_text[:500], # First part of warranty text
f"{warranty_title} {warranty_text[:200]}" # Combined query
]
# Extract key terms for additional queries
key_terms = self.extract_key_terms_from_warranty(warranty)
if key_terms:
search_queries.extend(key_terms[:3]) # Add top 3 key terms
all_documents = []
retrieved_ids = set()
for query in search_queries:
if not query.strip():
continue
try:
# Query Chroma DB
results = self.edr_collection.query(
query_texts=[query],
n_results=top_k,
include=["documents", "metadatas", "embeddings", "distances"]
)
# Process results
if results['documents'] and len(results['documents'][0]) > 0:
for i, doc_id in enumerate(results['ids'][0]):
if doc_id not in retrieved_ids:
retrieved_ids.add(doc_id)
document_info = {
'id': doc_id,
'content': results['documents'][0][i],
'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
'distance': results['distances'][0][i] if results['distances'] else 1.0,
'query_used': query
}
all_documents.append(document_info)
except Exception as e:
print(f"ā ļø Error querying Chroma DB with query '{query[:50]}...': {e}")
# Rank documents using cross-encoder if available
if all_documents and self.cross_encoder:
all_documents = self.rerank_documents(warranty_text, all_documents)
# Return top documents
return all_documents[:top_k]
def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
"""Extract key terms from warranty text for targeted searching."""
warranty_text = warranty.get('warranty_text', '')
# Use simple keyword extraction based on legal document patterns
key_terms = []
# Common legal/business terms that might be relevant
important_words = [
'incorporation', 'registered', 'authorized', 'shares', 'capital',
'subsidiaries', 'accounts', 'financial', 'liabilities', 'assets',
'compliance', 'regulatory', 'licenses', 'permits', 'agreements',
'contracts', 'intellectual property', 'employment', 'litigation',
'insurance', 'tax', 'environmental', 'data protection'
]
text_lower = warranty_text.lower()
for term in important_words:
if term in text_lower:
key_terms.append(term)
# Extract specific company/legal entity mentions
entity_patterns = [
r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', # Proper names
r'\b[A-Z]{2,}\b', # Acronyms
]
for pattern in entity_patterns:
matches = re.findall(pattern, warranty_text)
key_terms.extend(matches[:3]) # Limit to avoid too many terms
return list(set(key_terms))[:5] # Return unique terms, max 5
def rerank_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
"""Rerank documents using cross-encoder for better relevance."""
if len(documents) <= 1:
return documents
try:
# Prepare query-document pairs for cross-encoder
pairs = [(query, doc['content'][:500]) for doc in documents] # Limit content length
# Get cross-encoder scores
scores = self.cross_encoder.predict(pairs)
# Add scores to documents and sort
for i, doc in enumerate(documents):
doc['cross_encoder_score'] = float(scores[i])
# Sort by cross-encoder score (higher is better)
documents.sort(key=lambda x: x['cross_encoder_score'], reverse=True)
except Exception as e:
print(f"ā ļø Cross-encoder reranking failed: {e}")
return documents
def generate_disclosure_with_references(self, warranty: Dict[str, Any],
relevant_docs: List[Dict]) -> str:
"""
Generate a detailed disclosure for a specific warranty with proper inline references.
Args:
warranty: Warranty claim dictionary
relevant_docs: List of relevant documents from Chroma DB
Returns:
Generated disclosure text with inline references
"""
if not relevant_docs:
return "No relevant supporting documentation found in the knowledge base."
# Add documents to reference manager and create context
context_blocks = []
doc_references = {} # Map doc_id to reference number
for doc in relevant_docs[:10]: # Limit to top 10 docs
ref_num = self.ref_manager.add_document(
doc_id=doc['id'],
content=doc['content'],
metadata=doc.get('metadata', {})
)
doc_references[doc['id']] = ref_num
# Create context block with reference number
context_blocks.append(f"[{ref_num}] {doc['content']}")
context_text = "\n\n".join(context_blocks)
# Create disclosure generation prompt
llm = ChatOpenAI(model="gpt-4o", temperature=0.1, max_tokens=2000)
disclosure_prompt = """
You are a legal and business expert helping to prepare warranty disclosures for a company acquisition.
**TASK**: Generate a detailed disclosure summary for the warranty claim below, based on the provided supporting documentation.
**WARRANTY CLAIM**:
Section: {warranty_number}
Title: {warranty_title}
Text: {warranty_text}
**SUPPORTING DOCUMENTATION**:
{context_text}
**INSTRUCTIONS**:
1. Analyze the warranty claim to understand what needs to be disclosed
2. Review the supporting documentation for relevant information
3. Create a comprehensive disclosure that addresses the warranty requirements
4. Include specific facts, figures, dates, and details from the documentation
5. Use inline citations [1], [2], [3] etc. to reference specific information sources
6. Structure the disclosure clearly with appropriate headings if needed
7. Focus on factual information that supports or relates to the warranty claim
8. If certain aspects of the warranty cannot be fully addressed from the available documentation, note this clearly
**IMPORTANT**: Use ONLY the reference numbers [1], [2], [3] etc. that are shown in the supporting documentation above. Do not create new reference numbers.
**OUTPUT FORMAT**:
Provide a well-structured disclosure in markdown format with:
- Clear, professional language
- Inline citations using [1], [2], [3] format (matching the numbers in the supporting documentation)
- Specific details and facts from the supporting documents
- Appropriate level of detail for legal/business disclosure purposes
**DISCLOSURE SUMMARY**:
"""
prompt = disclosure_prompt.format(
warranty_number=warranty.get('warranty_number', 'N/A'),
warranty_title=warranty.get('warranty_title', ''),
warranty_text=warranty.get('warranty_text', ''),
context_text=context_text
)
try:
response = llm.invoke(prompt)
disclosure_text = response.content.strip()
return disclosure_text
except Exception as e:
print(f"ā Error generating disclosure: {e}")
return f"Error generating disclosure for warranty {warranty.get('warranty_number', 'N/A')}: {str(e)}"
def generate_all_disclosures(self) -> Dict[str, Dict[str, Any]]:
"""Generate disclosures for all identified warranty claims."""
print("\n" + "="*60)
print("GENERATING DISCLOSURES FOR ALL WARRANTIES")
print("="*60)
# Clear reference manager for fresh start
self.ref_manager.clear()
self.disclosures = {}
for i, warranty in enumerate(self.warranty_claims, 1):
warranty_num = warranty.get('warranty_number', f'W{i}')
warranty_title = warranty.get('warranty_title', 'Untitled Warranty')
print(f"\nProcessing warranty {i}/{len(self.warranty_claims)}: {warranty_num} - {warranty_title}")
# Search for relevant documents
relevant_docs = self.search_chroma_for_warranty(warranty)
if relevant_docs:
print(f"Found {len(relevant_docs)} relevant documents")
else:
print("No relevant documents found")
# Generate disclosure with proper references
disclosure = self.generate_disclosure_with_references(warranty, relevant_docs)
self.disclosures[warranty_num] = {
'warranty': warranty,
'disclosure': disclosure,
'source_documents': len(relevant_docs),
'generated_at': datetime.now().isoformat()
}
print(f"ā
Generated disclosure for {warranty_num}")
print(f"\nā
Completed disclosure generation for {len(self.disclosures)} warranties")
print(f"š Total references in bibliography: {len(self.ref_manager.references)}")
return self.disclosures
def export_to_markdown(self, output_path: str = "./project_victoria_disclosures_improved.md") -> str:
"""Export all disclosures to a comprehensive markdown report with proper references."""
print("\n" + "="*60)
print("EXPORTING DISCLOSURES TO MARKDOWN")
print("="*60)
# Create markdown content
markdown_content = []
# Header
markdown_content.extend([
"# Project Victoria - Warranty Disclosures\\n",
f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}",
f"**Total Warranties Processed**: {len(self.warranty_claims)}",
f"**Total Disclosures Generated**: {len(self.disclosures)}",
f"**Total References**: {len(self.ref_manager.references)}\\n",
"---\\n"
])
# Table of Contents
markdown_content.append("## Table of Contents\\n")
for warranty in sorted(self.warranty_claims, key=lambda x: x.get('warranty_number', '')):
warranty_num = warranty.get('warranty_number', '')
warranty_title = warranty.get('warranty_title', '')
anchor = warranty_num.lower().replace('(', '').replace(')', '').replace('.', '').replace(' ', '-')
markdown_content.append(f"- [{warranty_num} - {warranty_title}](#{anchor}-{warranty_title.lower().replace(' ', '-')})")
markdown_content.append("\\n---\\n")
# Individual warranty disclosures
for warranty in sorted(self.warranty_claims, key=lambda x: x.get('warranty_number', '')):
warranty_num = warranty.get('warranty_number', '')
warranty_title = warranty.get('warranty_title', '')
section_name = warranty.get('section_name', '')
warranty_text = warranty.get('warranty_text', '')
# Get disclosure content
disclosure_info = self.disclosures.get(warranty_num, {})
disclosure_content = disclosure_info.get('disclosure', 'No disclosure generated.')
source_docs_count = disclosure_info.get('source_documents', 0)
# Create warranty section
markdown_content.extend([
f"## {warranty_num} - {warranty_title}\\n",
f"**Section**: {section_name}",
f"**Source Documents Found**: {source_docs_count}\\n",
"### Warranty Text\\n",
warranty_text + "\\n",
"### Disclosure\\n",
disclosure_content + "\\n",
"---\\n"
])
# Add bibliography
bibliography = self.ref_manager.generate_bibliography()
markdown_content.append(bibliography)
# Write to file
markdown_text = "\\n".join(markdown_content)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(markdown_text)
print(f"ā
Markdown report exported to: {output_path}")
print(f"š Report contains {len(self.disclosures)} warranty disclosures")
print(f"š Bibliography contains {len(self.ref_manager.references)} references")
return output_path
def run_complete_pipeline(self) -> str:
"""Run the complete disclosure generation pipeline."""
print("š Starting Project Victoria Disclosure Generation Pipeline\\n")
try:
# Step 1: Extract PDF text
print("Step 1: Extracting PDF text...")
self.extract_pdf_text()
# Step 2: Identify warranty claims
print("\\nStep 2: Identifying warranty claims...")
self.identify_warranty_claims()
# Step 3: Generate disclosures
print("\\nStep 3: Generating disclosures...")
self.generate_all_disclosures()
# Step 4: Export to markdown
print("\\nStep 4: Exporting to markdown...")
output_path = self.export_to_markdown()
print("\\n" + "="*60)
print("ā
PIPELINE COMPLETED SUCCESSFULLY!")
print("="*60)
print(f"š Output file: {output_path}")
print(f"š Total warranties processed: {len(self.warranty_claims)}")
print(f"š Total disclosures generated: {len(self.disclosures)}")
print(f"š Total references: {len(self.ref_manager.references)}")
print("="*60)
return output_path
except Exception as e:
print(f"ā Pipeline failed: {e}")
raise
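Note on ReferenceManager: the listing above relies on a ReferenceManager class defined elsewhere in the module. Its implementation is not shown here; the sketch below is inferred purely from the calls the source makes (add_document, clear, references, generate_bibliography) and is illustrative, not the original code.

class ReferenceManager:
    """Minimal sketch inferred from usage above; not the original implementation."""
    def __init__(self):
        self.references = {}   # ref_num -> {doc_id, content, metadata}
        self._by_doc_id = {}   # doc_id -> ref_num, so repeat citations reuse a number

    def clear(self):
        self.references.clear()
        self._by_doc_id.clear()

    def add_document(self, doc_id, content, metadata):
        # Reuse the existing reference number if this document was already cited
        if doc_id in self._by_doc_id:
            return self._by_doc_id[doc_id]
        ref_num = len(self.references) + 1
        self.references[ref_num] = {"doc_id": doc_id, "content": content, "metadata": metadata}
        self._by_doc_id[doc_id] = ref_num
        return ref_num

    def generate_bibliography(self):
        # Render a numbered reference list in the same markdown style as the report
        lines = ["## References\n"]
        for ref_num in sorted(self.references):
            entry = self.references[ref_num]
            label = entry["metadata"].get("source", entry["doc_id"])
            lines.append(f"[{ref_num}] {label}")
        return "\n".join(lines)

This interface keeps len(self.ref_manager.references) meaningful for the counts printed in generate_all_disclosures and export_to_markdown.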
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| pdf_path | str | "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf" | positional or keyword |
Parameter Details
pdf_path: Filesystem path to the Project Victoria PDF document that the generator analyzes.
Return Value
Instantiation returns an ImprovedProjectVictoriaGenerator instance; run_complete_pipeline returns the path of the exported markdown report.
Class Interface
Methods
__init__(self, pdf_path)
Purpose: Initialize the improved disclosure generator. Args: pdf_path: Path to the Project Victoria PDF document
Parameters:
pdf_path: Type: str
Returns: None
init_chroma_connection(self)
Purpose: Initialize connection to Chroma DB.
Returns: None
extract_pdf_text(self) -> str
Purpose: Extract text from the Project Victoria PDF document.
Returns: Returns str
identify_warranty_claims(self) -> List[Dict[str, Any]]
Purpose: Identify and extract individual warranty claims from the document.
Returns: Returns List[Dict[str, Any]]
verify_and_clean_warranties(self, warranties) -> List[Dict]
Purpose: Use LLM to verify and clean up the warranty list.
Parameters:
warranties: Type: List[Dict]
Returns: Returns List[Dict]
split_text_for_processing(self, text, max_tokens) -> List[str]
Purpose: Split text into chunks that fit within token limits.
Parameters:
text: Type: str
max_tokens: Type: int
Returns: Returns List[str]
search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]
Purpose: Search Chroma DB for documents relevant to a specific warranty.
Parameters:
warranty: Type: Dict[str, Any]
top_k: Type: int
Returns: Returns List[Dict]
extract_key_terms_from_warranty(self, warranty) -> List[str]
Purpose: Extract key terms from warranty text for targeted searching.
Parameters:
warranty: Type: Dict[str, Any]
Returns: Returns List[str]
rerank_documents(self, query, documents) -> List[Dict]
Purpose: Rerank documents using cross-encoder for better relevance.
Parameters:
query: Type: str
documents: Type: List[Dict]
Returns: Returns List[Dict]
generate_disclosure_with_references(self, warranty, relevant_docs) -> str
Purpose: Generate a detailed disclosure for a specific warranty with proper inline references. Args: warranty: Warranty claim dictionary relevant_docs: List of relevant documents from Chroma DB Returns: Generated disclosure text with inline references
Parameters:
warranty: Type: Dict[str, Any]
relevant_docs: Type: List[Dict]
Returns: Returns str
generate_all_disclosures(self) -> Dict[str, Dict[str, Any]]
Purpose: Generate disclosures for all identified warranty claims.
Returns: Returns Dict[str, Dict[str, Any]] mapping warranty numbers to disclosure records
export_to_markdown(self, output_path) -> str
Purpose: Export all disclosures to a comprehensive markdown report with proper references.
Parameters:
output_path: Type: str
Returns: Returns str
run_complete_pipeline(self) -> str
Purpose: Run the complete disclosure generation pipeline.
Returns: Returns str
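The retrieval and drafting methods listed above can also be exercised outside the full pipeline. A brief sketch follows; the warranty dictionary shape matches the output of identify_warranty_claims, but the field values here are purely illustrative:

# Hypothetical single-warranty run, outside run_complete_pipeline
generator = ImprovedProjectVictoriaGenerator()
warranty = {
    "warranty_number": "2.1(a)",
    "warranty_title": "Share Capital",
    "warranty_text": "The Sellers are the legal and beneficial owners of the Shares...",
    "section_name": "THE SHARES; THE SELLERS",
}
docs = generator.search_chroma_for_warranty(warranty, top_k=15)   # Chroma retrieval + optional rerank
disclosure = generator.generate_disclosure_with_references(warranty, docs)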
Required Imports
import os
import re
import json
import tiktoken
from typing import List, Dict, Any
import chromadb
from datetime import datetime
from langchain_openai import ChatOpenAI  # import path may differ across LangChain versions
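The source additionally gates optional dependencies behind module-level flags (PDF_AVAILABLE, CROSSENCODER_AVAILABLE, EMBEDDING_AVAILABLE) that are set outside this class. A plausible sketch of those guards, assuming conventional try/except import checks:

# Assumed optional-import guards; the real module may differ
try:
    import fitz  # PyMuPDF, used by extract_pdf_text
    PDF_AVAILABLE = True
except ImportError:
    fitz = None
    PDF_AVAILABLE = False

try:
    from sentence_transformers import CrossEncoder  # used for reranking
    CROSSENCODER_AVAILABLE = True
except ImportError:
    CROSSENCODER_AVAILABLE = False

# EMBEDDING_AVAILABLE similarly guards the custom MyEmbeddingFunction helper
# that init_chroma_connection passes to Chroma.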
Usage Example
# Example usage:
generator = ImprovedProjectVictoriaGenerator(pdf_path="./20250623_Project Victoria - Disclosure Matrix_WIP.pdf")
output_path = generator.run_complete_pipeline()
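For finer-grained control, the pipeline stages can also be run individually, mirroring run_complete_pipeline (the output filename here is illustrative):

generator = ImprovedProjectVictoriaGenerator()
generator.extract_pdf_text()                        # load cached text or extract from the PDF
claims = generator.identify_warranty_claims()       # LLM-based warranty extraction
disclosures = generator.generate_all_disclosures()  # retrieve, cite, and draft
report_path = generator.export_to_markdown("./my_disclosures.md")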
Similar Components
AI-powered semantic similarity - components with related functionality:
- class FixedProjectVictoriaGenerator (77.8% similar)
- class ProjectVictoriaDisclosureGenerator (74.8% similar)
- function main_v29 (69.6% similar)
- function main_v28 (62.9% similar)
- function main_v14 (58.4% similar)