šŸ” Code Extractor

class ProjectVictoriaDisclosureGenerator

Maturity: 26

Main class for generating Project Victoria disclosures from warranty claims.

File:
/tf/active/vicechatdev/project_victoria_disclosure_generator.py
Lines:
63 - 816
Complexity:
moderate

Purpose

Main class for generating Project Victoria disclosures from warranty claims.

Source Code

class ProjectVictoriaDisclosureGenerator:
    """
    Main class for generating Project Victoria disclosures from warranty claims.
    """
    
    def __init__(self, pdf_path: str = "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf"):
        """
        Initialize the disclosure generator.

        Sets up empty pipeline state, resolves the OpenAI API key from the
        ``OPENAI_API_KEY`` environment variable, creates the token counter,
        connects to Chroma DB and loads the optional cross-encoder.

        Args:
            pdf_path: Path to the Project Victoria PDF document
        """
        self.pdf_path = pdf_path
        self.extracted_text = ""
        self.warranty_claims = []
        self.disclosures = {}

        # LLM configuration.
        # SECURITY: the key is read from the environment and must never be
        # committed to source control. A key that was previously embedded in
        # this file should be treated as leaked and revoked.
        self.api_key = os.environ.get("OPENAI_API_KEY", "")
        if not self.api_key:
            print("Warning: OPENAI_API_KEY is not set; LLM and embedding calls will fail")

        # Initialize tokenizer for counting tokens
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

        # Initialize Chroma DB connection
        self.init_chroma_connection()

        # Cross-encoder for ranking (if available); None means "no reranking"
        self.cross_encoder = None
        if CROSSENCODER_AVAILABLE:
            try:
                self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
                print("āœ… Cross-encoder initialized for document reranking")
            except Exception as e:
                print(f"āš ļø Cross-encoder initialization failed: {e}")
                self.cross_encoder = None

        # Block reference counter for inline citations
        self.block_counter = 1
        self.blocks_dict = {}

        print("Initialized Project Victoria Disclosure Generator")
        print(f"PDF path: {pdf_path}")
    
    def init_chroma_connection(self):
        """Initialize connection to Chroma DB."""
        try:
            # Connect to Chroma DB (assuming it's running on vice_chroma:8000)
            self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)
            
            # Set up embedding function
            self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
            
            # Get the 99_EDR collection
            self.edr_collection = self.chroma_client.get_collection(
                "99_edr", 
                embedding_function=self.chroma_embedder
            )
            
            print("āœ… Successfully connected to Chroma DB and 99_EDR collection")
            
        except Exception as e:
            print(f"āŒ Error connecting to Chroma DB: {e}")
            print("Attempting to use local fallback or alternative connection...")
            self.chroma_client = None
            self.edr_collection = None
    
    def extract_pdf_text(self) -> str:
        """
        Extract text from the Project Victoria PDF document.
        
        Returns:
            Extracted text from the PDF
        """
        # First try to use existing extracted text file
        try:
            with open("project_victoria_extracted.txt", "r", encoding="utf-8") as f:
                self.extracted_text = f.read()
            print(f"āœ… Successfully loaded text from existing file: {len(self.extracted_text)} characters")
            return self.extracted_text
        except FileNotFoundError:
            print("šŸ“„ No existing extracted text file found, attempting PDF extraction...")
        except Exception as e:
            print(f"āš ļø Error reading existing text file: {e}")
        
        # Try PDF extraction if PyMuPDF is available
        if PDF_AVAILABLE and fitz:
            try:
                print(f"Extracting text from PDF: {self.pdf_path}")
                
                # Open the PDF document
                doc = fitz.open(self.pdf_path)
                
                text_content = []
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    text = page.get_text()
                    text_content.append(f"\\n--- Page {page_num + 1} ---\\n{text}")
                
                doc.close()
                
                self.extracted_text = "\\n".join(text_content)
                
                # Save extracted text for future use
                try:
                    with open("project_victoria_extracted.txt", "w", encoding="utf-8") as f:
                        f.write(self.extracted_text)
                    print("šŸ’¾ Saved extracted text to project_victoria_extracted.txt")
                except Exception as save_error:
                    print(f"āš ļø Could not save extracted text: {save_error}")
                
                print(f"āœ… Successfully extracted {len(self.extracted_text)} characters from PDF")
                print(f"šŸ“„ Total pages processed: {len(doc)}")
                
                return self.extracted_text
                
            except Exception as e:
                print(f"āŒ Error extracting PDF text: {e}")
        else:
            print("āŒ PyMuPDF not available for PDF extraction")
        
        # If all else fails, provide manual instructions
        raise Exception("""
        Could not extract text from PDF. Please do one of the following:
        
        1. Install PyMuPDF: pip install PyMuPDF
        2. Manually extract text from the PDF and save it as 'project_victoria_extracted.txt'
        3. Provide the extracted text file in the working directory
        
        The script will automatically use the text file if available.
        """)
    
    def identify_warranty_claims(self) -> List[Dict[str, Any]]:
        """
        Identify and extract individual warranty claims from the document.
        
        Returns:
            List of warranty claim dictionaries
        """
        print("\\n" + "="*60)
        print("IDENTIFYING WARRANTY CLAIMS")
        print("="*60)
        
        # Use LLM to identify warranty claims
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=16000)  # Increased token limit
        
        warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.

Your task is to extract ALL individual warranty claims from the text. Each warranty claim should be identified as a distinct legal requirement or representation.

Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4")
- "warranty_title": A short descriptive title for the warranty
- "warranty_text": The complete text of the warranty claim (limit to 500 characters)
- "section_name": The main section name (e.g., "THE SHARES; THE SELLERS", "AUTHORITY AND CAPACITY")

Focus on extracting the actual warranty statements, not procedural text or definitions.
IMPORTANT: Keep warranty_text under 500 characters to ensure the JSON response is not truncated.

Here is the document text to analyze:

{document_text}

Return only a valid JSON array of warranty claims. Ensure the response is complete and valid JSON.
"""
        
        # Split text into chunks if too long
        max_chunk_size = 30000  # Reduced chunk size for better processing
        text_chunks = self.split_text_for_processing(self.extracted_text, max_chunk_size)
        
        all_warranties = []
        
        for i, chunk in enumerate(text_chunks):
            print(f"Processing warranty extraction chunk {i+1}/{len(text_chunks)}")
            
            prompt = warranty_extraction_prompt.format(document_text=chunk)
            
            try:
                response = llm.invoke(prompt)
                response_text = response.content.strip()
                
                # Clean up response to ensure valid JSON
                if response_text.startswith("```json"):
                    response_text = response_text[7:]
                if response_text.endswith("```"):
                    response_text = response_text[:-3]
                
                # Try to repair truncated JSON
                response_text = response_text.strip()
                if not response_text.endswith(']'):
                    # Try to find the last complete entry
                    last_complete = response_text.rfind('}}')
                    if last_complete > 0:
                        response_text = response_text[:last_complete + 2] + ']'
                    else:
                        response_text = response_text + ']'
                
                chunk_warranties = json.loads(response_text)
                
                if isinstance(chunk_warranties, list):
                    all_warranties.extend(chunk_warranties)
                    print(f"āœ… Extracted {len(chunk_warranties)} warranties from chunk {i+1}")
                else:
                    print(f"āš ļø Unexpected response format from chunk {i+1}")
                    
            except json.JSONDecodeError as je:
                print(f"āŒ JSON decode error in chunk {i+1}: {je}")
                print(f"Response was: {response_text[:500]}...")
                # Try alternative extraction
                try:
                    # Extract individual warranty objects with regex
                    warranty_pattern = r'\{\s*"warranty_number"[^}]+\}'
                    matches = re.findall(warranty_pattern, response_text, re.DOTALL)
                    for match in matches:
                        try:
                            warranty = json.loads(match)
                            all_warranties.append(warranty)
                        except:
                            continue
                    if matches:
                        print(f"āœ… Recovered {len(matches)} warranties using regex extraction")
                except Exception as re_error:
                    print(f"āŒ Regex recovery also failed: {re_error}")
            except Exception as e:
                print(f"āŒ Error processing chunk {i+1}: {e}")
        
        # Use LLM to verify and clean up warranty list
        if all_warranties:
            all_warranties = self.verify_and_clean_warranties(all_warranties)
        
        self.warranty_claims = all_warranties
        
        print(f"āœ… Total warranty claims identified: {len(self.warranty_claims)}")
        
        # Display summary
        if self.warranty_claims:
            print("\\nSample warranty claims:")
            for i, warranty in enumerate(self.warranty_claims[:3]):
                print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
        
        return self.warranty_claims
    
    def verify_and_clean_warranties(self, warranties: List[Dict]) -> List[Dict]:
        """
        Use LLM to verify and clean up the warranty list.
        
        Args:
            warranties: Raw list of warranty claims
            
        Returns:
            Cleaned and verified warranty claims
        """
        print("\\nVerifying and cleaning warranty claims...")
        
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=3000)
        
        verification_prompt = """
You are reviewing a list of warranty claims extracted from a legal document. 
Please clean up this list by:

1. Removing any duplicates
2. Ensuring each warranty is a distinct legal claim
3. Fixing any formatting issues
4. Ensuring warranty numbers are correct
5. Making sure warranty titles are concise and descriptive

Here is the warranty list to review:

{warranties_json}

Return the cleaned list as a valid JSON array with the same structure.
"""
        
        try:
            prompt = verification_prompt.format(warranties_json=json.dumps(warranties, indent=2))
            response = llm.invoke(prompt)
            response_text = response.content.strip()
            
            # Clean up response
            if response_text.startswith("```json"):
                response_text = response_text[7:]
            if response_text.endswith("```"):
                response_text = response_text[:-3]
            
            cleaned_warranties = json.loads(response_text)
            
            print(f"āœ… Warranty verification complete: {len(warranties)} → {len(cleaned_warranties)} claims")
            return cleaned_warranties
            
        except Exception as e:
            print(f"āš ļø Warranty verification failed: {e}, using original list")
            return warranties
    
    def split_text_for_processing(self, text: str, max_tokens: int) -> List[str]:
        """
        Split text into chunks that fit within token limits.
        
        Args:
            text: Text to split
            max_tokens: Maximum tokens per chunk
            
        Returns:
            List of text chunks
        """
        tokens = self.tokenizer.encode(text)
        chunks = []
        
        for i in range(0, len(tokens), max_tokens):
            chunk_tokens = tokens[i:i + max_tokens]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            chunks.append(chunk_text)
        
        return chunks
    
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """
        Search Chroma DB for documents relevant to a specific warranty.
        
        Args:
            warranty: Warranty claim dictionary
            top_k: Number of top documents to retrieve
            
        Returns:
            List of relevant documents
        """
        if not self.edr_collection:
            print("āš ļø Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = [
            warranty_title,
            warranty_text[:500],  # First part of warranty text
            f"{warranty_title} {warranty_text[:200]}"  # Combined query
        ]
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])  # Add top 3 key terms
        
        all_documents = []
        retrieved_ids = set()
        
        for query in search_queries:
            if not query.strip():
                continue
                
            try:
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=top_k,
                    include=["documents", "metadatas", "embeddings", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    for i, doc_id in enumerate(results['ids'][0]):
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            
                            document_info = {
                                'id': doc_id,
                                'content': results['documents'][0][i],
                                'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
                                'distance': results['distances'][0][i] if results['distances'] else 1.0,
                                'query_used': query
                            }
                            all_documents.append(document_info)
                            
            except Exception as e:
                print(f"āš ļø Error querying Chroma DB with query '{query[:50]}...': {e}")
        
        # Rank documents using cross-encoder if available
        if all_documents and hasattr(self, 'cross_encoder'):
            all_documents = self.rerank_documents(warranty_text, all_documents)
        
        # Return top documents
        return all_documents[:top_k]
    
    def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
        """
        Extract key terms from warranty text for targeted searching.
        
        Args:
            warranty: Warranty claim dictionary
            
        Returns:
            List of key terms
        """
        warranty_text = warranty.get('warranty_text', '')
        
        # Use simple keyword extraction based on legal document patterns
        key_terms = []
        
        # Common legal/business terms that might be relevant
        important_words = [
            'incorporation', 'registered', 'authorized', 'shares', 'capital',
            'subsidiaries', 'accounts', 'financial', 'liabilities', 'assets',
            'compliance', 'regulatory', 'licenses', 'permits', 'agreements',
            'contracts', 'intellectual property', 'employment', 'litigation',
            'insurance', 'tax', 'environmental', 'data protection'
        ]
        
        text_lower = warranty_text.lower()
        for term in important_words:
            if term in text_lower:
                key_terms.append(term)
        
        # Extract specific company/legal entity mentions
        entity_patterns = [
            r'\\b[A-Z][a-z]+ [A-Z][a-z]+\\b',  # Proper names
            r'\\b[A-Z]{2,}\\b',  # Acronyms
        ]
        
        for pattern in entity_patterns:
            matches = re.findall(pattern, warranty_text)
            key_terms.extend(matches[:3])  # Limit to avoid too many terms
        
        return list(set(key_terms))[:5]  # Return unique terms, max 5
    
    def rerank_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
        """
        Rerank documents using cross-encoder for better relevance.
        
        Args:
            query: Query text
            documents: List of document dictionaries
            
        Returns:
            Reranked documents
        """
        if len(documents) <= 1:
            return documents
        
        if not self.cross_encoder:
            print("āš ļø Cross-encoder not available, skipping reranking")
            return documents
        
        try:
            # Prepare query-document pairs for cross-encoder
            pairs = [(query, doc['content'][:500]) for doc in documents]  # Limit content length
            
            # Get cross-encoder scores
            scores = self.cross_encoder.predict(pairs)
            
            # Add scores to documents and sort
            for i, doc in enumerate(documents):
                doc['cross_encoder_score'] = float(scores[i])
            
            # Sort by cross-encoder score (higher is better)
            documents.sort(key=lambda x: x['cross_encoder_score'], reverse=True)
            
        except Exception as e:
            print(f"āš ļø Cross-encoder reranking failed: {e}")
        
        return documents
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any], 
                                       relevant_docs: List[Dict]) -> str:
        """
        Generate a detailed disclosure for a specific warranty using LLM.
        
        Args:
            warranty: Warranty claim dictionary
            relevant_docs: List of relevant documents from Chroma DB
            
        Returns:
            Generated disclosure text with inline references
        """
        if not relevant_docs:
            return "No relevant supporting documentation found in the knowledge base."
        
        # Prepare context from relevant documents
        context_blocks = []
        self.blocks_dict = {}  # Reset for this warranty
        
        for i, doc in enumerate(relevant_docs[:10], 1):  # Limit to top 10 docs
            block_num = self.block_counter + i - 1
            context_blocks.append(f"[Block {block_num}] {doc['content']}")
            
            # Store block information for references
            self.blocks_dict[block_num] = {
                'type': 'document',
                'id': doc['id'],
                'content': doc['content'][:200] + "..." if len(doc['content']) > 200 else doc['content'],
                'metadata': doc.get('metadata', {}),
                'source': doc.get('metadata', {}).get('source', 'Unknown source')
            }
        
        self.block_counter += len(relevant_docs[:10])
        
        context_text = "\\n\\n".join(context_blocks)
        
        # Create disclosure generation prompt
        llm = ChatOpenAI(model="gpt-4o", temperature=0.1, max_tokens=2000)
        
        disclosure_prompt = """
You are a legal and business expert helping to prepare warranty disclosures for a company acquisition.

**TASK**: Generate a detailed disclosure summary for the warranty claim below, based on the provided supporting documentation.

**WARRANTY CLAIM**:
Section: {warranty_number}
Title: {warranty_title}
Text: {warranty_text}

**SUPPORTING DOCUMENTATION**:
{context_text}

**INSTRUCTIONS**:
1. Analyze the warranty claim to understand what needs to be disclosed
2. Review the supporting documentation for relevant information
3. Create a comprehensive disclosure that addresses the warranty requirements
4. Include specific facts, figures, dates, and details from the documentation
5. Use inline citations [Block X] to reference specific information sources
6. Structure the disclosure clearly with appropriate headings if needed
7. Focus on factual information that supports or relates to the warranty claim
8. If certain aspects of the warranty cannot be fully addressed from the available documentation, note this clearly

**OUTPUT FORMAT**:
Provide a well-structured disclosure in markdown format with:
- Clear, professional language
- Inline citations using [Block X] format
- Specific details and facts from the supporting documents
- Appropriate level of detail for legal/business disclosure purposes

**DISCLOSURE SUMMARY**:
"""
        
        prompt = disclosure_prompt.format(
            warranty_number=warranty.get('warranty_number', 'N/A'),
            warranty_title=warranty.get('warranty_title', ''),
            warranty_text=warranty.get('warranty_text', ''),
            context_text=context_text
        )
        
        try:
            response = llm.invoke(prompt)
            disclosure_text = response.content.strip()
            
            return disclosure_text
            
        except Exception as e:
            print(f"āŒ Error generating disclosure: {e}")
            return f"Error generating disclosure for warranty {warranty.get('warranty_number', 'N/A')}: {str(e)}"
    
    def generate_all_disclosures(self) -> Dict[str, str]:
        """
        Generate disclosures for all identified warranty claims.
        
        Returns:
            Dictionary mapping warranty numbers to disclosure texts
        """
        print("\\n" + "="*60)
        print("GENERATING DISCLOSURES FOR ALL WARRANTIES")
        print("="*60)
        
        self.disclosures = {}
        
        for i, warranty in enumerate(self.warranty_claims, 1):
            warranty_num = warranty.get('warranty_number', f'W{i}')
            warranty_title = warranty.get('warranty_title', 'Untitled Warranty')
            
            print(f"\\nProcessing warranty {i}/{len(self.warranty_claims)}: {warranty_num} - {warranty_title}")
            
            # Search for relevant documents
            relevant_docs = self.search_chroma_for_warranty(warranty)
            
            if relevant_docs:
                print(f"Found {len(relevant_docs)} relevant documents")
            else:
                print("No relevant documents found")
            
            # Generate disclosure
            disclosure = self.generate_disclosure_for_warranty(warranty, relevant_docs)
            
            self.disclosures[warranty_num] = {
                'warranty': warranty,
                'disclosure': disclosure,
                'source_documents': len(relevant_docs),
                'generated_at': datetime.now().isoformat()
            }
            
            print(f"āœ… Generated disclosure for {warranty_num}")
        
        print(f"\\nāœ… Completed disclosure generation for {len(self.disclosures)} warranties")
        return self.disclosures
    
    def create_references_section(self) -> str:
        """
        Create a references section listing all blocks used in disclosures.
        
        Returns:
            Formatted references section
        """
        if not self.blocks_dict:
            return "## References\\n\\nNo references available."
        
        references = ["## References\\n"]
        
        for block_num in sorted(self.blocks_dict.keys()):
            block_info = self.blocks_dict[block_num]
            source = block_info.get('source', block_info.get('metadata', {}).get('source', 'Unknown source'))
            content_preview = block_info.get('content', 'No content preview available')
            
            references.append(f"**[Block {block_num}]** {source}")
            references.append(f"Content: {content_preview}")
            references.append("")  # Empty line
        
        return "\\n".join(references)
    
    def export_to_markdown(self, output_path: str = "./project_victoria_disclosures.md") -> str:
        """
        Export all disclosures to a comprehensive markdown report.
        
        Args:
            output_path: Path for the output markdown file
            
        Returns:
            Path to the generated markdown file
        """
        print("\\n" + "="*60)
        print("EXPORTING DISCLOSURES TO MARKDOWN")
        print("="*60)
        
        # Create markdown content
        markdown_content = []
        
        # Header
        markdown_content.extend([
            "# Project Victoria - Warranty Disclosures",
            "",
            f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}",
            f"**Total Warranties Processed**: {len(self.warranty_claims)}",
            f"**Total Disclosures Generated**: {len(self.disclosures)}",
            "",
            "---",
            ""
        ])
        
        # Table of contents
        markdown_content.extend([
            "## Table of Contents",
            ""
        ])
        
        for warranty_num in sorted(self.disclosures.keys()):
            warranty = self.disclosures[warranty_num]['warranty']
            title = warranty.get('warranty_title', 'Untitled Warranty')
            markdown_content.append(f"- [{warranty_num} - {title}](#{warranty_num.lower().replace('.', '').replace('(', '').replace(')', '')}-{title.lower().replace(' ', '-').replace('/', '').replace('(', '').replace(')', '')})")
        
        markdown_content.extend(["", "---", ""])
        
        # Disclosures
        for warranty_num in sorted(self.disclosures.keys()):
            disclosure_info = self.disclosures[warranty_num]
            warranty = disclosure_info['warranty']
            disclosure_text = disclosure_info['disclosure']
            
            # Warranty header
            markdown_content.extend([
                f"## {warranty_num} - {warranty.get('warranty_title', 'Untitled Warranty')}",
                "",
                f"**Section**: {warranty.get('section_name', 'N/A')}",
                f"**Source Documents Found**: {disclosure_info['source_documents']}",
                "",
                "### Warranty Text",
                "",
                warranty.get('warranty_text', 'No warranty text available'),
                "",
                "### Disclosure",
                "",
                disclosure_text,
                "",
                "---",
                ""
            ])
        
        # References section
        references_section = self.create_references_section()
        markdown_content.extend([references_section])
        
        # Write to file
        final_content = "\\n".join(markdown_content)
        
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_content)
            
            print(f"āœ… Successfully exported disclosures to: {output_path}")
            print(f"šŸ“„ Total content length: {len(final_content)} characters")
            
            return output_path
            
        except Exception as e:
            print(f"āŒ Error exporting to markdown: {e}")
            raise
    
    def run_complete_analysis(self, output_path: str = "./project_victoria_disclosures.md") -> str:
        """
        Run the complete analysis pipeline.
        
        Args:
            output_path: Path for the output markdown file
            
        Returns:
            Path to the generated markdown report
        """
        print("\\n" + "="*80)
        print("PROJECT VICTORIA DISCLOSURE GENERATION")
        print("="*80)
        print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
        try:
            # Step 1: Extract PDF text
            self.extract_pdf_text()
            
            # Step 2: Identify warranty claims
            self.identify_warranty_claims()
            
            if not self.warranty_claims:
                print("āŒ No warranty claims found. Cannot proceed.")
                return None
            
            # Step 3: Generate disclosures
            self.generate_all_disclosures()
            
            # Step 4: Export to markdown
            output_file = self.export_to_markdown(output_path)
            
            print("\\n" + "="*80)
            print("ANALYSIS COMPLETED SUCCESSFULLY!")
            print("="*80)
            print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"Output file: {output_file}")
            print(f"Total warranties processed: {len(self.warranty_claims)}")
            print(f"Total disclosures generated: {len(self.disclosures)}")
            
            return output_file
            
        except Exception as e:
            print(f"\\nāŒ Analysis failed with error: {e}")
            import traceback
            traceback.print_exc()
            return None

Parameters

Name Type Default Kind
pdf_path str "./20250623_Project Victoria - Disclosure Matrix_WIP.pdf" positional or keyword

Parameter Details

pdf_path: Path to the Project Victoria PDF document (type: str).

Return Value

Instantiating the class returns a ProjectVictoriaDisclosureGenerator instance.

Class Interface

Methods

__init__(self, pdf_path)

Purpose: Initialize the disclosure generator. Args: pdf_path: Path to the Project Victoria PDF document

Parameters:

  • pdf_path: Type: str

Returns: None

init_chroma_connection(self)

Purpose: Initialize connection to Chroma DB.

Returns: None

extract_pdf_text(self) -> str

Purpose: Extract text from the Project Victoria PDF document. Returns: Extracted text from the PDF

Returns: Returns str

identify_warranty_claims(self) -> List[Dict[str, Any]]

Purpose: Identify and extract individual warranty claims from the document. Returns: List of warranty claim dictionaries

Returns: Returns List[Dict[str, Any]]

verify_and_clean_warranties(self, warranties) -> List[Dict]

Purpose: Use LLM to verify and clean up the warranty list. Args: warranties: Raw list of warranty claims Returns: Cleaned and verified warranty claims

Parameters:

  • warranties: Type: List[Dict]

Returns: Returns List[Dict]

split_text_for_processing(self, text, max_tokens) -> List[str]

Purpose: Split text into chunks that fit within token limits. Args: text: Text to split max_tokens: Maximum tokens per chunk Returns: List of text chunks

Parameters:

  • text: Type: str
  • max_tokens: Type: int

Returns: Returns List[str]

search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]

Purpose: Search Chroma DB for documents relevant to a specific warranty. Args: warranty: Warranty claim dictionary top_k: Number of top documents to retrieve Returns: List of relevant documents

Parameters:

  • warranty: Type: Dict[str, Any]
  • top_k: Type: int

Returns: Returns List[Dict]

extract_key_terms_from_warranty(self, warranty) -> List[str]

Purpose: Extract key terms from warranty text for targeted searching. Args: warranty: Warranty claim dictionary Returns: List of key terms

Parameters:

  • warranty: Type: Dict[str, Any]

Returns: Returns List[str]

rerank_documents(self, query, documents) -> List[Dict]

Purpose: Rerank documents using cross-encoder for better relevance. Args: query: Query text documents: List of document dictionaries Returns: Reranked documents

Parameters:

  • query: Type: str
  • documents: Type: List[Dict]

Returns: Returns List[Dict]

generate_disclosure_for_warranty(self, warranty, relevant_docs) -> str

Purpose: Generate a detailed disclosure for a specific warranty using LLM. Args: warranty: Warranty claim dictionary relevant_docs: List of relevant documents from Chroma DB Returns: Generated disclosure text with inline references

Parameters:

  • warranty: Type: Dict[str, Any]
  • relevant_docs: Type: List[Dict]

Returns: Returns str

generate_all_disclosures(self) -> Dict[str, str]

Purpose: Generate disclosures for all identified warranty claims. Returns: Dictionary mapping warranty numbers to disclosure texts

Returns: Returns Dict[str, str]

create_references_section(self) -> str

Purpose: Create a references section listing all blocks used in disclosures. Returns: Formatted references section

Returns: Returns str

export_to_markdown(self, output_path) -> str

Purpose: Export all disclosures to a comprehensive markdown report. Args: output_path: Path for the output markdown file Returns: Path to the generated markdown file

Parameters:

  • output_path: Type: str

Returns: Returns str

run_complete_analysis(self, output_path) -> str

Purpose: Run the complete analysis pipeline. Args: output_path: Path for the output markdown file Returns: Path to the generated markdown report

Parameters:

  • output_path: Type: str

Returns: Returns str

Required Imports

import os
import re
import json
import pandas as pd
import numpy as np

Usage Example

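# Example usage:
# generator = ProjectVictoriaDisclosureGenerator(pdf_path="./20250623_Project Victoria - Disclosure Matrix_WIP.pdf")
# report_path = generator.run_complete_analysis()  # returns the markdown path, or None on failure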

Similar Components

AI-powered semantic similarity - components with related functionality:

  • class FixedProjectVictoriaGenerator 82.8% similar

    Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.

    From: /tf/active/vicechatdev/fixed_project_victoria_generator.py
  • class ImprovedProjectVictoriaGenerator 74.8% similar

    Improved Project Victoria Disclosure Generator with proper reference management.

    From: /tf/active/vicechatdev/improved_project_victoria_generator.py
  • function main_v29 70.6% similar

    Entry point function that instantiates an ImprovedProjectVictoriaGenerator and executes its complete pipeline to generate disclosure documents.

    From: /tf/active/vicechatdev/improved_project_victoria_generator.py
  • function main_v28 68.3% similar

    Entry point function that instantiates a FixedProjectVictoriaGenerator and executes its complete pipeline to generate fixed disclosure documents.

    From: /tf/active/vicechatdev/fixed_project_victoria_generator.py
  • function main_v14 67.2% similar

    Entry point function that orchestrates the Project Victoria disclosure analysis by initializing the generator, running the complete analysis, and displaying results with next steps.

    From: /tf/active/vicechatdev/project_victoria_disclosure_generator.py
← Back to Browse