FixedProjectVictoriaGenerator

class FixedProjectVictoriaGenerator

Maturity: 26

Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.

File:
/tf/active/vicechatdev/fixed_project_victoria_generator.py

Lines:
219 - 1622

Complexity:
moderate

Purpose

Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.

Source Code

class FixedProjectVictoriaGenerator:
    """
    Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.
    """
    
    def __init__(self):
        """
        Set up paths, credentials, the tokenizer and the ChromaDB collection.

        The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
        variable; credentials must never be embedded in the source file.

        ChromaDB is contacted at ``vice_chroma:8000`` first. Only when that
        connection attempt raises are the local persistent stores tried.
        When no collection can be opened, ``self.edr_collection`` stays
        ``None`` and a warning is printed.
        """
        self.pdf_path = "/tf/active/20250623_Project Victoria - Disclosure Matrix_WIP.pdf"
        self.extracted_text_path = "/tf/active/project_victoria_extracted.txt"
        self.chroma_path = "/tf/active/.persist/EDR_collection"

        self.extracted_text = ""
        self.warranty_claims = []
        self.disclosures = []

        # SECURITY: the key comes from the environment, never from a literal.
        # Any key that was previously committed to this file must be revoked.
        self.api_key = os.environ.get("OPENAI_API_KEY", "")
        if not self.api_key:
            print("⚠️ OPENAI_API_KEY is not set - OpenAI-backed features will fail until it is provided")

        # Initialize reference manager
        self.ref_manager = ReferenceManager()

        # Initialize tokenizer
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

        # Always define these attributes, even when the remote connection
        # fails before they would otherwise be assigned.
        self.edr_collection = None
        self.chroma_client = None
        self.chroma_embedder = None

        try:
            self._connect_remote_chroma()
        except Exception as remote_error:
            print(f"⚠️ Could not connect to remote ChromaDB: {remote_error}")
            self._connect_local_chroma()

        if self.edr_collection is None:
            print("⚠️ ChromaDB not available - disclosure generation will proceed without document search")

    def _connect_remote_chroma(self):
        """Open the remote ChromaDB and select the ``99_edr`` collection (or the first one listed)."""
        print("🔗 Attempting to connect to remote ChromaDB at vice_chroma:8000...")
        self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)

        # Set up embedding function if available
        if EMBEDDING_AVAILABLE:
            self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
        else:
            print("⚠️ Using default Chroma embeddings (may not work with custom collections)")
            self.chroma_embedder = None

        try:
            self.edr_collection = self.chroma_client.get_collection(
                "99_edr",
                embedding_function=self.chroma_embedder
            )
            print("✅ Successfully connected to remote ChromaDB and 99_edr collection")
            return
        except Exception as collection_error:
            print(f"⚠️ Could not access 99_edr collection: {collection_error}")

        # The preferred collection is missing: fall back to whatever exists.
        try:
            collections = self.chroma_client.list_collections()
            if not collections:
                print("⚠️ No collections found in remote ChromaDB")
                return
            collection_names = [c.name for c in collections]
            print(f"Available collections: {collection_names}")
            first_collection = collection_names[0]
            self.edr_collection = self.chroma_client.get_collection(first_collection)
            print(f"✅ Using collection '{first_collection}' instead")
        except Exception as list_error:
            print(f"⚠️ Could not list collections: {list_error}")

    def _connect_local_chroma(self):
        """Probe the known local persistence directories and open the first usable collection."""
        print("🔗 Trying local ChromaDB paths as fallback...")
        possible_chroma_paths = [
            "/tf/active/.persist/EDR_collection",
            "/tf/active/.persist",
            "/tf/active/chroma_db",
            "/tf/active/.chroma"
        ]

        for path in possible_chroma_paths:
            if not os.path.exists(path):
                continue
            try:
                client = chromadb.PersistentClient(path=path)
                collection_names = [c.name for c in client.list_collections()]

                if not collection_names:
                    print(f"⚠️ No collections found in ChromaDB at {path}")
                    continue

                # Prefer the EDR collection, otherwise take the first one listed.
                chosen = "EDR_collection" if "EDR_collection" in collection_names else collection_names[0]
                self.edr_collection = client.get_collection(name=chosen)
                print(f"✅ Connected to ChromaDB collection '{chosen}' with {self.edr_collection.count()} documents at {path}")
                return
            except Exception as e:
                print(f"⚠️ Could not connect to ChromaDB at {path}: {e}")
                
    def extract_pdf_text(self):
        """Extract text from PDF or load from existing file."""
        # First try to load existing extracted text
        if os.path.exists(self.extracted_text_path):
            print(f"📄 Loading existing extracted text from: {self.extracted_text_path}")
            with open(self.extracted_text_path, 'r', encoding='utf-8') as f:
                self.extracted_text = f.read()
            print(f"✅ Loaded {len(self.extracted_text)} characters of text")
            return
        
        # If no extracted text exists, try to extract from PDF
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
        
        if not PDF_AVAILABLE:
            raise ImportError("PyMuPDF not available for PDF extraction")
        
        print(f"📄 Extracting text from PDF: {self.pdf_path}")
        doc = fitz.open(self.pdf_path)
        text_parts = []
        
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()
            text_parts.append(text)
        
        self.extracted_text = "\n".join(text_parts)
        
        # Save extracted text for future use
        with open(self.extracted_text_path, 'w', encoding='utf-8') as f:
            f.write(self.extracted_text)
        
        print(f"✅ Extracted {len(self.extracted_text)} characters and saved to {self.extracted_text_path}")
    
    def split_text_by_warranty_sections(self, text: str) -> List[str]:
        """
        Split text into chunks based on warranty section boundaries.
        This ensures warranty sections are not split across chunks.
        """
        print("🔍 Splitting text by warranty sections...")
        
        # Find all warranty section headers using multiple patterns
        # Pattern 1: Basic section numbers like "1.1", "2.1", etc.
        pattern1 = r'\n\s*(\d+\.(?:\d+(?:\([a-z]\))?)?)\s*\n'
        # Pattern 2: Section numbers at start of line
        pattern2 = r'^(\d+\.\d+(?:\([a-z]\))?)\s'
        # Pattern 3: Section numbers with spaces
        pattern3 = r'\n(\d+\.\d+(?:\([a-z]\))?)\s+'
        
        matches = []
        for pattern in [pattern1, pattern2, pattern3]:
            matches.extend(list(re.finditer(pattern, text, re.MULTILINE)))
        
        # Remove duplicates and sort by position
        unique_matches = {}
        for match in matches:
            pos = match.start()
            if pos not in unique_matches:
                unique_matches[pos] = match
        
        matches = sorted(unique_matches.values(), key=lambda x: x.start())
        
        if not matches:
            print("⚠️ No warranty sections found using standard patterns, trying alternative approach")
            # Try alternative pattern matching
            alt_pattern = r'(\d+\.\d+(?:\([a-z]\))?)'
            alt_matches = list(re.finditer(alt_pattern, text))
            if alt_matches:
                matches = alt_matches[:50]  # Limit to avoid too many matches
                print(f"Found {len(matches)} potential warranty sections with alternative pattern")
            else:
                print("⚠️ No warranty sections found, using single chunk")
                return [text]
        
        print(f"Found {len(matches)} warranty sections")
        
        chunks = []
        max_chunk_size = 20000  # Smaller chunks to avoid token limits
        current_chunk = ""
        
        for i, match in enumerate(matches):
            section_start = match.start()
            section_number = match.group(1)
            
            # Find the end of this section (start of next section or end of text)
            if i < len(matches) - 1:
                section_end = matches[i + 1].start()
            else:
                section_end = len(text)
            
            section_text = text[section_start:section_end]
            
            # Check if adding this section would exceed chunk size
            if len(current_chunk) + len(section_text) > max_chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = section_text
            else:
                current_chunk += section_text
        
        # Add the last chunk
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        print(f"✅ Split text into {len(chunks)} warranty-based chunks")
        return chunks
    
    def identify_warranty_claims(self) -> List[Dict[str, Any]]:
        """Identify and extract individual warranty claims from the document."""
        print("\n" + "="*60)
        print("IDENTIFYING WARRANTY CLAIMS")
        print("="*60)
        
        if not LANGCHAIN_AVAILABLE:
            raise ImportError("LangChain not available for warranty identification")
        
        # Use LLM to identify warranty claims with higher max_tokens to avoid truncation
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=12000)
        
        warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.

Your task is to extract ALL individual warranty claims from the provided text section.

Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4", "13.18", "19.24")
- "warranty_title": A short descriptive title for the warranty (max 100 characters)
- "warranty_text": The complete text of the warranty claim (limit to 200 characters)
- "section_name": The main section name or topic (if identifiable)

IMPORTANT INSTRUCTIONS:
1. Extract EVERY warranty section you find, including high-numbered sections (like 18.x, 19.x, 20.x, 21.x, 22.x, 23.x)
2. Look for sections with numbers like: 18.1, 19.24, 20.7, 21.4, 22.12, 23.8, etc.
3. Keep all text fields short to ensure complete JSON response
4. Include sections with letter suffixes like 13.18(a), 2.1(c)(i), etc.
5. If you see "Warranty X" or "Warranty Number X", extract it
6. Do not truncate the JSON response - complete all warranty objects

Here is the document section to analyze:

{document_text}

Return only a valid JSON array of warranty claims. Ensure the JSON is complete and valid.
"""
        
        # Split text into warranty-based chunks
        text_chunks = self.split_text_by_warranty_sections(self.extracted_text)
        
        all_warranties = []
        processed_numbers = set()  # Track processed warranty numbers to avoid duplicates
        
        for i, chunk in enumerate(text_chunks):
            print(f"Processing warranty extraction chunk {i+1}/{len(text_chunks)}")
            
            prompt = warranty_extraction_prompt.format(document_text=chunk)
            
            try:
                response = llm.invoke(prompt)
                response_text = response.content.strip()
                
                # Clean up response to ensure valid JSON
                if response_text.startswith("```json"):
                    response_text = response_text[7:]
                if response_text.endswith("```"):
                    response_text = response_text[:-3]
                
                response_text = response_text.strip()
                
                # Try to repair truncated JSON
                if not response_text.endswith(']'):
                    # Find the last complete warranty object
                    last_complete = response_text.rfind('}}')
                    if last_complete > 0:
                        response_text = response_text[:last_complete + 2] + ']'
                    else:
                        response_text = response_text + ']'
                
                chunk_warranties = json.loads(response_text)
                
                if isinstance(chunk_warranties, list):
                    # Filter out duplicates based on warranty number
                    new_warranties = []
                    for warranty in chunk_warranties:
                        warranty_num = warranty.get('warranty_number', '')
                        if warranty_num and warranty_num not in processed_numbers:
                            processed_numbers.add(warranty_num)
                            new_warranties.append(warranty)
                    
                    all_warranties.extend(new_warranties)
                    print(f"✅ Extracted {len(new_warranties)} unique warranties from chunk {i+1}")
                else:
                    print(f"⚠️ Unexpected response format from chunk {i+1}")
                    
            except json.JSONDecodeError as je:
                print(f"❌ JSON decode error in chunk {i+1}: {je}")
                print(f"Response preview: {response_text[:300]}...")
                
                # Try alternative extraction using regex
                try:
                    warranty_pattern = r'\{\s*"warranty_number"[^}]+?\}'
                    matches = re.findall(warranty_pattern, response_text, re.DOTALL)
                    recovered_count = 0
                    for match in matches:
                        try:
                            warranty = json.loads(match)
                            warranty_num = warranty.get('warranty_number', '')
                            if warranty_num and warranty_num not in processed_numbers:
                                processed_numbers.add(warranty_num)
                                all_warranties.append(warranty)
                                recovered_count += 1
                        except:
                            continue
                    if recovered_count > 0:
                        print(f"✅ Recovered {recovered_count} warranties using regex extraction")
                except Exception as re_error:
                    print(f"❌ Regex recovery also failed: {re_error}")
            except Exception as e:
                print(f"❌ Error processing chunk {i+1}: {e}")
        
        # Clean up and verify warranties
        if all_warranties:
            all_warranties = self.verify_and_clean_warranties(all_warranties)
        
        self.warranty_claims = all_warranties
        
        print(f"✅ Total warranty claims identified: {len(self.warranty_claims)}")
        
        # Display warranty number range
        if self.warranty_claims:
            warranty_numbers = [w.get('warranty_number', '') for w in self.warranty_claims if w.get('warranty_number')]
            if warranty_numbers:
                print(f"📊 Warranty numbers range: {min(warranty_numbers)} to {max(warranty_numbers)}")
            
            print("\nSample warranty claims:")
            for i, warranty in enumerate(self.warranty_claims[:5]):
                print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
        
        return self.warranty_claims
    
    def verify_and_clean_warranties(self, warranties: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Verify and clean warranty claims."""
        try:
            cleaned_warranties = []
            seen_numbers = set()
            
            for warranty in warranties:
                # Check if warranty has required fields
                if not warranty.get('warranty_number') or not warranty.get('warranty_title'):
                    continue
                
                warranty_num = warranty['warranty_number']
                
                # Skip duplicates
                if warranty_num in seen_numbers:
                    continue
                
                seen_numbers.add(warranty_num)
                
                # Clean warranty text
                warranty_text = warranty.get('warranty_text', '')
                if len(warranty_text) > 500:
                    warranty['warranty_text'] = warranty_text[:497] + "..."
                
                cleaned_warranties.append(warranty)
            
            # Sort by warranty number
            def sort_key(w):
                num = w.get('warranty_number', '0')
                try:
                    # Extract main number and sub-number for proper sorting
                    parts = re.findall(r'\d+', num)
                    if parts:
                        return (int(parts[0]), int(parts[1]) if len(parts) > 1 else 0, num)
                    return (0, 0, num)
                except:
                    return (0, 0, num)
            
            cleaned_warranties.sort(key=sort_key)
            return cleaned_warranties
            
        except Exception as e:
            print(f"⚠️ Warranty verification failed: {e}, using original list")
            return warranties
    
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
        """Extract key terms from warranty for search queries."""
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        
        # Combine text
        full_text = f"{warranty_title} {warranty_text}".lower()
        
        # Extract important terms (nouns, key concepts)
        key_terms = []
        
        # Look for capitalized terms (likely important concepts)
        caps_terms = re.findall(r'\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b', warranty_text)
        key_terms.extend(caps_terms[:3])
        
        # Look for legal/business terms
        business_terms = re.findall(r'\b(?:agreement|contract|liability|obligation|compliance|breach|default|guarantee|indemnity|insurance|permit|license|employee|consultant|shareholder|subsidiary|acquisition|merger|transaction|disclosure|warranty|representation)\b', full_text)
        key_terms.extend(set(business_terms[:5]))
        
        # Remove duplicates and return
        return list(set(key_terms))[:5]
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty claim."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Generate a basic disclosure when no documents or LLM is available."""
        return f"""# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}

## Warranty Details
- **Section:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}

## Analysis
This warranty claim requires detailed analysis against the company's records and documentation. 

## Conclusion
Further investigation and document review required to provide comprehensive disclosure analysis.

**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.
"""
    
    
    def export_to_markdown(self) -> str:
        """Export warranty claims and disclosures to markdown format."""
        print("\n" + "="*60)
        print("EXPORTING TO MARKDOWN")
        print("="*60)
        
        if not self.warranty_claims or not self.disclosures:
            print("⚠️ No warranty claims or disclosures available for export")
            return ""
        
        # Generate output filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = f"/tf/active/project_victoria_disclosures_fixed_{timestamp}.md"
        
        # Start building markdown content
        markdown_parts = []
        
        # Add header
        markdown_parts.append("# Project Victoria - Warranty Disclosures\\n")
        markdown_parts.append(f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
        markdown_parts.append(f"**Total Warranties Processed**: {len(self.warranty_claims)}")
        markdown_parts.append(f"**Total Disclosures Generated**: {len(self.disclosures)}")
        markdown_parts.append(f"**Total References**: {len(self.ref_manager.references)}")
        markdown_parts.append("\\n---\\n")
        
        # Add table of contents
        markdown_parts.append("## Table of Contents\\n")
        for warranty in self.warranty_claims:
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
            markdown_parts.append(f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})")
        
        markdown_parts.append("\\n---\\n")
        
        # Add warranty sections
        for disclosure_record in self.disclosures:
            warranty = disclosure_record['warranty']
            disclosure_text = disclosure_record['disclosure']
            
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            
            # Add warranty header
            markdown_parts.append(f"## {warranty_num} - {warranty_title}\\n")
            
            # Add warranty metadata
            section_name = warranty.get('section_name', 'Unknown Section')
            markdown_parts.append(f"**Section**: {section_name}")
            
            # Add document count if available
            warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
            doc_count = len(warranty_docs) if warranty_docs else 0
            markdown_parts.append(f"**Source Documents Found**: {doc_count}\\n")
            
            # Add warranty text
            markdown_parts.append("### Warranty Text\\n")
            warranty_text = warranty.get('warranty_text', 'No warranty text available')
            markdown_parts.append(warranty_text + "\\n")
            
            # Add disclosure
            markdown_parts.append("### Disclosure\\n")
            markdown_parts.append(disclosure_text + "\\n")
            
            # Add separator
            markdown_parts.append("---\\n")
        
        # Add references section
        bibliography = self.ref_manager.generate_bibliography()
        markdown_parts.append(bibliography)
        
        # Combine all parts
        markdown_content = "\\n".join(markdown_parts)
        
        # Write to file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        
        print(f"✅ Markdown exported to: {output_path}")
        print(f"📊 File size: {len(markdown_content):,} characters")
        
        return output_path
    
    def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
        """Return up to five generic search terms for a warranty.

        Terms come from two sources, in this order:

        1. The first category keyword (e.g. ``'tax'``, ``'employment'``) found
           as a substring of the lower-cased title + text contributes its
           first two synonyms.
        2. The leading component of the warranty number (``'20'`` for
           ``'20.1'``) contributes the terms mapped to that section.

        NOTE(review): a later definition of this method in the same class
        replaces this one when the class body is executed.

        Args:
            warranty: Warranty record; the keys ``warranty_text``,
                ``warranty_title`` and ``warranty_number`` are read and may
                be missing or ``None``.

        Returns:
            At most five unique terms in first-seen order, so the result is
            identical from run to run.
        """
        # `or ''` also covers keys that are present but explicitly None.
        warranty_text = (warranty.get('warranty_text') or '').lower()
        warranty_title = (warranty.get('warranty_title') or '').lower()
        warranty_number = str(warranty.get('warranty_number') or '').strip()

        general_terms = []

        # Map warranty types to general terms
        term_mapping = {
            'incorporation': ['incorporation', 'company formation', 'legal entity'],
            'shares': ['shareholding', 'equity', 'ownership'],
            'authority': ['power', 'authorization', 'legal capacity'],
            'accounts': ['financial statements', 'accounting', 'audited accounts'],
            'business': ['operations', 'trading', 'commercial activity'],
            'assets': ['property', 'equipment', 'resources'],
            'contracts': ['agreements', 'legal obligations', 'commitments'],
            'employment': ['employees', 'staff', 'personnel', 'labour'],
            'litigation': ['legal proceedings', 'disputes', 'claims'],
            'compliance': ['regulatory', 'legal requirements', 'obligations'],
            'insurance': ['policies', 'coverage', 'protection'],
            'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
            'environmental': ['EHS', 'environmental compliance', 'health safety'],
            'tax': ['taxation', 'tax compliance', 'tax obligations'],
            'permits': ['licenses', 'authorizations', 'regulatory approvals']
        }

        # Only the first matching category contributes (dict order decides).
        combined_text = f"{warranty_title} {warranty_text}"
        for category, terms in term_mapping.items():
            if category in combined_text:
                general_terms.extend(terms[:2])
                break

        # Section-based terms, keyed by the top-level section number.
        section_mapping = {
            '1': ['shares', 'sellers', 'ownership'],
            '2': ['authority', 'capacity', 'power'],
            '3': ['share capital', 'securities', 'equity'],
            '4': ['accounts', 'financial statements'],
            '5': ['business continuity', 'operations'],
            '6': ['assets', 'property', 'equipment'],
            '7': ['corporate', 'constitutional'],
            '8': ['contracts', 'agreements'],
            '9': ['borrowing', 'debt', 'financing'],
            '10': ['permits', 'licenses'],
            '11': ['insolvency', 'financial distress'],
            '12': ['litigation', 'legal proceedings'],
            '13': ['employment', 'employees'],
            '14': ['competition', 'antitrust'],
            '15': ['environmental', 'health safety'],
            '16': ['real estate', 'property'],
            '17': ['insurance', 'coverage'],
            '18': ['intellectual property', 'IP'],
            '19': ['data protection', 'privacy'],
            '20': ['tax', 'taxation'],
            '21': ['regulatory', 'compliance'],
            '22': ['technology', 'IT systems'],
            '23': ['material contracts', 'key agreements']
        }
        section_num = warranty_number.split('.')[0]
        general_terms.extend(section_mapping.get(section_num, []))

        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make both the order and the surviving five terms depend on
        # per-process string hashing.
        return list(dict.fromkeys(general_terms))[:5]

    # Update the search method to include better debugging and error handling
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Build a placeholder disclosure that needs no documents and no LLM.

        NOTE(review): a later definition of this method in the same class
        replaces this one when the class body is executed.

        Args:
            warranty: Warranty record; ``warranty_number``, ``warranty_title``
                and ``warranty_text`` are read, with fallbacks when absent.

        Returns:
            A Markdown string ending in a newline.
        """
        heading_title = warranty.get('warranty_title', 'Unknown')
        section = warranty.get('warranty_number', 'N/A')
        title = warranty.get('warranty_title', 'N/A')
        text = warranty.get('warranty_text', 'N/A')

        lines = [
            f"# Disclosure Summary for Warranty Claim: {heading_title}",
            "",
            "## Warranty Details",
            f"- **Section:** {section}",
            f"- **Title:** {title}",
            f"- **Text:** {text}",
            "",
            "## Analysis",
            # The trailing space is part of the emitted text.
            "This warranty claim requires detailed analysis against the company's records and documentation. ",
            "",
            "## Conclusion",
            "Further investigation and document review required to provide comprehensive disclosure analysis.",
            "",
            "**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.",
            "",  # yields the final newline
        ]
        return "\n".join(lines)
    
    
    def export_to_markdown(self) -> str:
        """Export warranty claims and disclosures to markdown format."""
        print("\n" + "="*60)
        print("EXPORTING TO MARKDOWN")
        print("="*60)
        
        if not self.warranty_claims or not self.disclosures:
            print("⚠️ No warranty claims or disclosures available for export")
            return ""
        
        # Generate output filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = f"/tf/active/project_victoria_disclosures_fixed_{timestamp}.md"
        
        # Start building markdown content
        markdown_parts = []
        
        # Add header
        markdown_parts.append("# Project Victoria - Warranty Disclosures\\n")
        markdown_parts.append(f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
        markdown_parts.append(f"**Total Warranties Processed**: {len(self.warranty_claims)}")
        markdown_parts.append(f"**Total Disclosures Generated**: {len(self.disclosures)}")
        markdown_parts.append(f"**Total References**: {len(self.ref_manager.references)}")
        markdown_parts.append("\\n---\\n")
        
        # Add table of contents
        markdown_parts.append("## Table of Contents\\n")
        for warranty in self.warranty_claims:
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
            markdown_parts.append(f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})")
        
        markdown_parts.append("\\n---\\n")
        
        # Add warranty sections
        for disclosure_record in self.disclosures:
            warranty = disclosure_record['warranty']
            disclosure_text = disclosure_record['disclosure']
            
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            
            # Add warranty header
            markdown_parts.append(f"## {warranty_num} - {warranty_title}\\n")
            
            # Add warranty metadata
            section_name = warranty.get('section_name', 'Unknown Section')
            markdown_parts.append(f"**Section**: {section_name}")
            
            # Add document count if available
            warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
            doc_count = len(warranty_docs) if warranty_docs else 0
            markdown_parts.append(f"**Source Documents Found**: {doc_count}\\n")
            
            # Add warranty text
            markdown_parts.append("### Warranty Text\\n")
            warranty_text = warranty.get('warranty_text', 'No warranty text available')
            markdown_parts.append(warranty_text + "\\n")
            
            # Add disclosure
            markdown_parts.append("### Disclosure\\n")
            markdown_parts.append(disclosure_text + "\\n")
            
            # Add separator
            markdown_parts.append("---\\n")
        
        # Add references section
        bibliography = self.ref_manager.generate_bibliography()
        markdown_parts.append(bibliography)
        
        # Combine all parts
        markdown_content = "\\n".join(markdown_parts)
        
        # Write to file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        
        print(f"✅ Markdown exported to: {output_path}")
        print(f"📊 File size: {len(markdown_content):,} characters")
        
        return output_path
    
    def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
        """Return up to five generic search terms for a warranty.

        Terms come from two sources, in this order:

        1. The first category keyword (e.g. ``'tax'``, ``'employment'``) found
           as a substring of the lower-cased title + text contributes its
           first two synonyms.
        2. The leading component of the warranty number (``'20'`` for
           ``'20.1'``) contributes the terms mapped to that section.

        Args:
            warranty: Warranty record; the keys ``warranty_text``,
                ``warranty_title`` and ``warranty_number`` are read and may
                be missing or ``None``.

        Returns:
            At most five unique terms in first-seen order, so the result is
            identical from run to run.
        """
        # `or ''` also covers keys that are present but explicitly None.
        warranty_text = (warranty.get('warranty_text') or '').lower()
        warranty_title = (warranty.get('warranty_title') or '').lower()
        warranty_number = str(warranty.get('warranty_number') or '').strip()

        general_terms = []

        # Map warranty types to general terms
        term_mapping = {
            'incorporation': ['incorporation', 'company formation', 'legal entity'],
            'shares': ['shareholding', 'equity', 'ownership'],
            'authority': ['power', 'authorization', 'legal capacity'],
            'accounts': ['financial statements', 'accounting', 'audited accounts'],
            'business': ['operations', 'trading', 'commercial activity'],
            'assets': ['property', 'equipment', 'resources'],
            'contracts': ['agreements', 'legal obligations', 'commitments'],
            'employment': ['employees', 'staff', 'personnel', 'labour'],
            'litigation': ['legal proceedings', 'disputes', 'claims'],
            'compliance': ['regulatory', 'legal requirements', 'obligations'],
            'insurance': ['policies', 'coverage', 'protection'],
            'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
            'environmental': ['EHS', 'environmental compliance', 'health safety'],
            'tax': ['taxation', 'tax compliance', 'tax obligations'],
            'permits': ['licenses', 'authorizations', 'regulatory approvals']
        }

        # Only the first matching category contributes (dict order decides).
        combined_text = f"{warranty_title} {warranty_text}"
        for category, terms in term_mapping.items():
            if category in combined_text:
                general_terms.extend(terms[:2])
                break

        # Section-based terms, keyed by the top-level section number.
        section_mapping = {
            '1': ['shares', 'sellers', 'ownership'],
            '2': ['authority', 'capacity', 'power'],
            '3': ['share capital', 'securities', 'equity'],
            '4': ['accounts', 'financial statements'],
            '5': ['business continuity', 'operations'],
            '6': ['assets', 'property', 'equipment'],
            '7': ['corporate', 'constitutional'],
            '8': ['contracts', 'agreements'],
            '9': ['borrowing', 'debt', 'financing'],
            '10': ['permits', 'licenses'],
            '11': ['insolvency', 'financial distress'],
            '12': ['litigation', 'legal proceedings'],
            '13': ['employment', 'employees'],
            '14': ['competition', 'antitrust'],
            '15': ['environmental', 'health safety'],
            '16': ['real estate', 'property'],
            '17': ['insurance', 'coverage'],
            '18': ['intellectual property', 'IP'],
            '19': ['data protection', 'privacy'],
            '20': ['tax', 'taxation'],
            '21': ['regulatory', 'compliance'],
            '22': ['technology', 'IT systems'],
            '23': ['material contracts', 'key agreements']
        }
        section_num = warranty_number.split('.')[0]
        general_terms.extend(section_mapping.get(section_num, []))

        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make both the order and the surviving five terms depend on
        # per-process string hashing.
        return list(dict.fromkeys(general_terms))[:5]

    # Update the search method to include better debugging and error handling
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Build a placeholder disclosure that needs no documents and no LLM.

        Args:
            warranty: Warranty record; ``warranty_number``, ``warranty_title``
                and ``warranty_text`` are read, with fallbacks when absent.

        Returns:
            A Markdown string that begins and ends with a newline.
        """
        number = warranty.get('warranty_number', 'Unknown')
        title = warranty.get('warranty_title', 'Unknown')
        text = warranty.get('warranty_text', 'No warranty text available')

        lines = [
            "",  # yields the leading newline
            f"# Basic Disclosure for Warranty {number}",
            "",
            "## Warranty Overview",
            f"- **Section**: {number}",
            f"- **Title**: {title}",
            f"- **Text**: {text}",
            "",
            "## Disclosure Status",
            # The trailing space is part of the emitted text.
            "This warranty requires detailed analysis based on company documentation and records. ",
            "",
            "**Note**: Full disclosure analysis requires access to supporting documentation from the company's data room and relevant business records.",
            "",
            "## Next Steps",
            "1. Review relevant company documentation",
            "2. Consult with legal and business teams",
            "3. Verify compliance with warranty requirements",
            "4. Update disclosure based on findings",
            "",  # yields the final newline
        ]
        return "\n".join(lines)
    
    def generate_all_disclosures(self):
        """Generate disclosures for all warranty claims."""
        print("\n" + "="*60)
        print("GENERATING DISCLOSURES")
        print("="*60)
        
        if not self.warranty_claims:
            print("⚠️ No warranty claims available for disclosure generation")
            return
        
        self.disclosures = []
        
        for i, warranty in enumerate(self.warranty_claims):
            print(f"\nProgress: {i+1}/{len(self.warranty_claims)} - Processing warranty {warranty.get('warranty_number', 'Unknown')}")
            
            try:
                disclosure_text = self.generate_disclosure_for_warranty(warranty)
                
                disclosure_record = {
                    'warranty': warranty,
                    'disclosure': disclosure_text,
                    'generated_at': datetime.now().isoformat()
                }
                
                self.disclosures.append(disclosure_record)
                
            except Exception as e:
                print(f"❌ Failed to generate disclosure for warranty {warranty.get('warranty_number', 'Unknown')}: {e}")
                # Add a basic disclosure record
                basic_disclosure = self.generate_basic_disclosure(warranty)
                disclosure_record = {
                    'warranty': warranty,
                    'disclosure': basic_disclosure,
                    'generated_at': datetime.now().isoformat(),
                    'error': str(e)
                }
                self.disclosures.append(disclosure_record)
        
        print(f"\n✅ Generated {len(self.disclosures)} disclosures")
    
    def run_complete_pipeline(self) -> str:
        """Run the complete disclosure generation pipeline."""
        print("🚀 Starting Fixed Project Victoria Disclosure Generation Pipeline\\n")
        
        try:
            # Step 1: Extract PDF text
            print("Step 1: Extracting PDF text...")
            self.extract_pdf_text()
            
            # Step 2: Identify warranty claims
            print("\\nStep 2: Identifying warranty claims...")
            self.identify_warranty_claims()
            
            # Step 3: Generate disclosures
            print("\\nStep 3: Generating disclosures...")
            self.generate_all_disclosures()
            
            # Step 4: Export to markdown
            print("\\nStep 4: Exporting to markdown...")
            output_path = self.export_to_markdown()
            
            print("\\n" + "="*60)
            print("✅ FIXED PIPELINE COMPLETED SUCCESSFULLY!")
            print("="*60)
            print(f"📁 Output file: {output_path}")
            print(f"📊 Total warranties processed: {len(self.warranty_claims)}")
            print(f"📝 Total disclosures generated: {len(self.disclosures)}")
            print(f"📚 Total references: {len(self.ref_manager.references)}")
            print("="*60)
            
            return output_path
            
        except Exception as e:
            print(f"❌ Pipeline failed: {e}")
            raise

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

bases: Base classes of the class. The class definition declares none.

Return Value

Returns unspecified type

Class Interface

Methods

`__init__(self)`

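Purpose: Constructor. Sets the input/output paths and the API key, creates the reference manager and tokenizer, and attempts to connect to the ChromaDB collection.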

Returns: None

`extract_pdf_text(self)`

Purpose: Extract text from PDF or load from existing file.

Returns: None

`split_text_by_warranty_sections(self, text) -> List[str]`

Purpose: Split text into chunks based on warranty section boundaries. This ensures warranty sections are not split across chunks.

Parameters:

text: Type: str

Returns: Returns List[str]

`identify_warranty_claims(self) -> List[Dict[str, Any]]`

Purpose: Identify and extract individual warranty claims from the document.

Returns: Returns List[Dict[str, Any]]

`verify_and_clean_warranties(self, warranties) -> List[Dict[str, Any]]`

Purpose: Verify and clean warranty claims.

Parameters:

warranties: Type: List[Dict[str, Any]]

Returns: Returns List[Dict[str, Any]]

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

Purpose: Search Chroma DB for documents relevant to a specific warranty.

Parameters:

warranty: Type: Dict[str, Any]
top_k: Type: int

Returns: Returns List[Dict]

`extract_key_terms_from_warranty(self, warranty) -> List[str]`

Purpose: Extract key terms from warranty for search queries.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns List[str]

`generate_disclosure_for_warranty(self, warranty) -> str`

Purpose: Generate a disclosure for a specific warranty claim.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`generate_basic_disclosure(self, warranty) -> str`

Purpose: Generate a basic disclosure when no documents or LLM is available.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`export_to_markdown(self) -> str`

Purpose: Export warranty claims and disclosures to markdown format.

Returns: Returns str

`get_general_warranty_terms(self, warranty) -> List[str]`

Purpose: Get general search terms based on warranty type.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns List[str]

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

Purpose: Search Chroma DB for documents relevant to a specific warranty.

Parameters:

warranty: Type: Dict[str, Any]
top_k: Type: int

Returns: Returns List[Dict]

`generate_disclosure_for_warranty(self, warranty) -> str`

Purpose: Generate a disclosure for a specific warranty.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`generate_basic_disclosure(self, warranty) -> str`

Purpose: Generate a basic disclosure when no documents or LLM is available.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`export_to_markdown(self) -> str`

Purpose: Export warranty claims and disclosures to markdown format.

Returns: Returns str

`get_general_warranty_terms(self, warranty) -> List[str]`

Purpose: Get general search terms based on warranty type.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns List[str]

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

Purpose: Search Chroma DB for documents relevant to a specific warranty.

Parameters:

warranty: Type: Dict[str, Any]
top_k: Type: int

Returns: Returns List[Dict]

`generate_disclosure_for_warranty(self, warranty) -> str`

Purpose: Generate a disclosure for a specific warranty.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`generate_basic_disclosure(self, warranty) -> str`

Purpose: Generate a basic disclosure when no documents or LLM is available.

Parameters:

warranty: Type: Dict[str, Any]

Returns: Returns str

`generate_all_disclosures(self)`

Purpose: Generate disclosures for all warranty claims.

Returns: None

`run_complete_pipeline(self) -> str`

Purpose: Run the complete disclosure generation pipeline.

Returns: Returns str

Required Imports

import os
import re
import json
import tiktoken
from typing import List

Usage Example

# Example usage:
# generator = FixedProjectVictoriaGenerator()
# output_path = generator.run_complete_pipeline()

Similar Components

AI-powered semantic similarity - components with related functionality:

class ProjectVictoriaDisclosureGenerator 82.8% similar

Main class for generating Project Victoria disclosures from warranty claims.
From: /tf/active/vicechatdev/project_victoria_disclosure_generator.py
class ImprovedProjectVictoriaGenerator 77.8% similar

Improved Project Victoria Disclosure Generator with proper reference management.
From: /tf/active/vicechatdev/improved_project_victoria_generator.py
function main_v28 67.8% similar

Entry point function that instantiates a FixedProjectVictoriaGenerator and executes its complete pipeline to generate fixed disclosure documents.
From: /tf/active/vicechatdev/fixed_project_victoria_generator.py
function main_v29 63.0% similar

Entry point function that instantiates an ImprovedProjectVictoriaGenerator and executes its complete pipeline to generate disclosure documents.
From: /tf/active/vicechatdev/improved_project_victoria_generator.py
function main_v14 56.7% similar

Entry point function that orchestrates the Project Victoria disclosure analysis by initializing the generator, running the complete analysis, and displaying results with next steps.
From: /tf/active/vicechatdev/project_victoria_disclosure_generator.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class FixedProjectVictoriaGenerator:
    """
    Fixed Project Victoria Disclosure Generator that properly handles all warranty sections.
    """
    
    def __init__(self):
        """Set up paths, result containers, the tokenizer and ChromaDB access.

        The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
        variable. The constructor then tries to obtain a ChromaDB collection:
        first from the remote server ``vice_chroma:8000`` (collection
        ``99_edr``, else the first collection listed), and, only if the remote
        connection itself fails, from a list of local persistent paths.
        ``self.edr_collection`` stays ``None`` when nothing could be opened.
        """
        self.pdf_path = "/tf/active/20250623_Project Victoria - Disclosure Matrix_WIP.pdf"
        self.extracted_text_path = "/tf/active/project_victoria_extracted.txt"
        self.chroma_path = "/tf/active/.persist/EDR_collection"

        self.extracted_text = ""
        self.warranty_claims = []
        self.disclosures = []

        # SECURITY: the API key used to be hard-coded in this file. Secrets
        # must not live in source control, so it is now taken from the
        # environment. The previously committed key should be treated as
        # compromised and revoked.
        self.api_key = os.environ.get("OPENAI_API_KEY", "")
        if not self.api_key:
            print("⚠️ OPENAI_API_KEY is not set - OpenAI-backed steps will fail until it is exported")

        # Initialize reference manager
        self.ref_manager = ReferenceManager()

        # Initialize tokenizer
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

        # Initialize ChromaDB state up front so every attribute exists even
        # when the connection attempts below fail early.
        self.edr_collection = None
        self.chroma_client = None
        self.chroma_embedder = None

        # Try to connect to remote ChromaDB first (vice_chroma:8000)
        try:
            print("🔗 Attempting to connect to remote ChromaDB at vice_chroma:8000...")
            self.chroma_client = chromadb.HttpClient(host='vice_chroma', port=8000)

            # Set up embedding function if available
            if EMBEDDING_AVAILABLE:
                self.chroma_embedder = MyEmbeddingFunction("gpt-4o-mini", "text-embedding-3-small", self.api_key)
            else:
                print("⚠️ Using default Chroma embeddings (may not work with custom collections)")
                self.chroma_embedder = None

            # Try to get the 99_edr collection
            try:
                self.edr_collection = self.chroma_client.get_collection(
                    "99_edr",
                    embedding_function=self.chroma_embedder
                )
                print("✅ Successfully connected to remote ChromaDB and 99_edr collection")
            except Exception as collection_error:
                print(f"⚠️ Could not access 99_edr collection: {collection_error}")
                # Try to list available collections
                try:
                    collections = self.chroma_client.list_collections()
                    if collections:
                        collection_names = [c.name for c in collections]
                        print(f"Available collections: {collection_names}")
                        # Use first available collection
                        first_collection = collection_names[0]
                        self.edr_collection = self.chroma_client.get_collection(first_collection)
                        print(f"✅ Using collection '{first_collection}' instead")
                    else:
                        print("⚠️ No collections found in remote ChromaDB")
                except Exception as list_error:
                    print(f"⚠️ Could not list collections: {list_error}")

        except Exception as remote_error:
            print(f"⚠️ Could not connect to remote ChromaDB: {remote_error}")

            # Fallback to local ChromaDB paths
            print("🔗 Trying local ChromaDB paths as fallback...")
            possible_chroma_paths = [
                "/tf/active/.persist/EDR_collection",
                "/tf/active/.persist",
                "/tf/active/chroma_db",
                "/tf/active/.chroma"
            ]

            for path in possible_chroma_paths:
                if not os.path.exists(path):
                    continue
                try:
                    client = chromadb.PersistentClient(path=path)
                    collection_names = [c.name for c in client.list_collections()]

                    # Prefer the EDR collection; otherwise take the first one
                    if "EDR_collection" in collection_names:
                        self.edr_collection = client.get_collection(name="EDR_collection")
                        print(f"✅ Connected to ChromaDB collection 'EDR_collection' with {self.edr_collection.count()} documents at {path}")
                        break
                    elif collection_names:
                        self.edr_collection = client.get_collection(name=collection_names[0])
                        print(f"✅ Connected to ChromaDB collection '{collection_names[0]}' with {self.edr_collection.count()} documents at {path}")
                        break
                    else:
                        print(f"⚠️ No collections found in ChromaDB at {path}")

                except Exception as e:
                    print(f"⚠️ Could not connect to ChromaDB at {path}: {e}")
                    continue

        if self.edr_collection is None:
            print("⚠️ ChromaDB not available - disclosure generation will proceed without document search")
                
    def extract_pdf_text(self):
        """Extract text from PDF or load from existing file."""
        # First try to load existing extracted text
        if os.path.exists(self.extracted_text_path):
            print(f"📄 Loading existing extracted text from: {self.extracted_text_path}")
            with open(self.extracted_text_path, 'r', encoding='utf-8') as f:
                self.extracted_text = f.read()
            print(f"✅ Loaded {len(self.extracted_text)} characters of text")
            return
        
        # If no extracted text exists, try to extract from PDF
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
        
        if not PDF_AVAILABLE:
            raise ImportError("PyMuPDF not available for PDF extraction")
        
        print(f"📄 Extracting text from PDF: {self.pdf_path}")
        doc = fitz.open(self.pdf_path)
        text_parts = []
        
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()
            text_parts.append(text)
        
        self.extracted_text = "\n".join(text_parts)
        
        # Save extracted text for future use
        with open(self.extracted_text_path, 'w', encoding='utf-8') as f:
            f.write(self.extracted_text)
        
        print(f"✅ Extracted {len(self.extracted_text)} characters and saved to {self.extracted_text_path}")
    
    def split_text_by_warranty_sections(self, text: str) -> List[str]:
        """
        Cut ``text`` into chunks whose boundaries fall on warranty headers.

        Header positions are located with three regular expressions; when
        none match, a looser pattern is tried (first 50 hits only), and when
        that also finds nothing the whole text is returned as one chunk.
        Consecutive sections are packed into a chunk until adding the next
        one would push it past 20000 characters. Text before the first
        header is not part of any chunk.

        Returns:
            The stripped chunks, in document order.
        """
        print("🔍 Splitting text by warranty sections...")

        header_patterns = (
            # Section number alone on its own line, e.g. "1.1" or "2."
            r'\n\s*(\d+\.(?:\d+(?:\([a-z]\))?)?)\s*\n',
            # Section number at the start of a line
            r'^(\d+\.\d+(?:\([a-z]\))?)\s',
            # Section number right after a newline, followed by whitespace
            r'\n(\d+\.\d+(?:\([a-z]\))?)\s+',
        )

        # Only the start offset of each hit is needed; a set removes hits
        # that different patterns report at the same position.
        found_positions = set()
        for pattern in header_patterns:
            for hit in re.finditer(pattern, text, re.MULTILINE):
                found_positions.add(hit.start())
        offsets = sorted(found_positions)

        if not offsets:
            print("⚠️ No warranty sections found using standard patterns, trying alternative approach")
            loose_hits = [hit.start() for hit in re.finditer(r'(\d+\.\d+(?:\([a-z]\))?)', text)]
            if not loose_hits:
                print("⚠️ No warranty sections found, using single chunk")
                return [text]
            offsets = loose_hits[:50]  # cap to avoid too many matches
            print(f"Found {len(offsets)} potential warranty sections with alternative pattern")

        print(f"Found {len(offsets)} warranty sections")

        size_limit = 20000  # keeps chunks small enough to avoid token limits
        finished = []
        pending = ""

        # Each section runs from its own offset to the next one; the last
        # section runs to the end of the text.
        section_ends = offsets[1:] + [len(text)]
        for begin, end in zip(offsets, section_ends):
            piece = text[begin:end]
            if pending and len(pending) + len(piece) > size_limit:
                finished.append(pending.strip())
                pending = piece
            else:
                pending = pending + piece

        # Flush whatever is left
        remainder = pending.strip()
        if remainder:
            finished.append(remainder)

        print(f"✅ Split text into {len(finished)} warranty-based chunks")
        return finished
    
    def identify_warranty_claims(self) -> List[Dict[str, Any]]:
        """Identify and extract individual warranty claims from the document."""
        print("\n" + "="*60)
        print("IDENTIFYING WARRANTY CLAIMS")
        print("="*60)
        
        if not LANGCHAIN_AVAILABLE:
            raise ImportError("LangChain not available for warranty identification")
        
        # Use LLM to identify warranty claims with higher max_tokens to avoid truncation
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=12000)
        
        warranty_extraction_prompt = """
You are analyzing a legal document containing warranty disclosures for a company acquisition.
The document is structured with numbered warranty sections, each containing specific warranty claims.

Your task is to extract ALL individual warranty claims from the provided text section.

Please extract and return a JSON list of warranty claims, where each item has:
- "warranty_number": The section number (e.g., "1.1", "2.1(a)", "3.4", "13.18", "19.24")
- "warranty_title": A short descriptive title for the warranty (max 100 characters)
- "warranty_text": The complete text of the warranty claim (limit to 200 characters)
- "section_name": The main section name or topic (if identifiable)

IMPORTANT INSTRUCTIONS:
1. Extract EVERY warranty section you find, including high-numbered sections (like 18.x, 19.x, 20.x, 21.x, 22.x, 23.x)
2. Look for sections with numbers like: 18.1, 19.24, 20.7, 21.4, 22.12, 23.8, etc.
3. Keep all text fields short to ensure complete JSON response
4. Include sections with letter suffixes like 13.18(a), 2.1(c)(i), etc.
5. If you see "Warranty X" or "Warranty Number X", extract it
6. Do not truncate the JSON response - complete all warranty objects

Here is the document section to analyze:

{document_text}

Return only a valid JSON array of warranty claims. Ensure the JSON is complete and valid.
"""
        
        # Split text into warranty-based chunks
        text_chunks = self.split_text_by_warranty_sections(self.extracted_text)
        
        all_warranties = []
        processed_numbers = set()  # Track processed warranty numbers to avoid duplicates
        
        for i, chunk in enumerate(text_chunks):
            print(f"Processing warranty extraction chunk {i+1}/{len(text_chunks)}")
            
            prompt = warranty_extraction_prompt.format(document_text=chunk)
            
            try:
                response = llm.invoke(prompt)
                response_text = response.content.strip()
                
                # Clean up response to ensure valid JSON
                if response_text.startswith("```json"):
                    response_text = response_text[7:]
                if response_text.endswith("```"):
                    response_text = response_text[:-3]
                
                response_text = response_text.strip()
                
                # Try to repair truncated JSON
                if not response_text.endswith(']'):
                    # Find the last complete warranty object
                    last_complete = response_text.rfind('}}')
                    if last_complete > 0:
                        response_text = response_text[:last_complete + 2] + ']'
                    else:
                        response_text = response_text + ']'
                
                chunk_warranties = json.loads(response_text)
                
                if isinstance(chunk_warranties, list):
                    # Filter out duplicates based on warranty number
                    new_warranties = []
                    for warranty in chunk_warranties:
                        warranty_num = warranty.get('warranty_number', '')
                        if warranty_num and warranty_num not in processed_numbers:
                            processed_numbers.add(warranty_num)
                            new_warranties.append(warranty)
                    
                    all_warranties.extend(new_warranties)
                    print(f"✅ Extracted {len(new_warranties)} unique warranties from chunk {i+1}")
                else:
                    print(f"⚠️ Unexpected response format from chunk {i+1}")
                    
            except json.JSONDecodeError as je:
                print(f"❌ JSON decode error in chunk {i+1}: {je}")
                print(f"Response preview: {response_text[:300]}...")
                
                # Try alternative extraction using regex
                try:
                    warranty_pattern = r'\{\s*"warranty_number"[^}]+?\}'
                    matches = re.findall(warranty_pattern, response_text, re.DOTALL)
                    recovered_count = 0
                    for match in matches:
                        try:
                            warranty = json.loads(match)
                            warranty_num = warranty.get('warranty_number', '')
                            if warranty_num and warranty_num not in processed_numbers:
                                processed_numbers.add(warranty_num)
                                all_warranties.append(warranty)
                                recovered_count += 1
                        except:
                            continue
                    if recovered_count > 0:
                        print(f"✅ Recovered {recovered_count} warranties using regex extraction")
                except Exception as re_error:
                    print(f"❌ Regex recovery also failed: {re_error}")
            except Exception as e:
                print(f"❌ Error processing chunk {i+1}: {e}")
        
        # Clean up and verify warranties
        if all_warranties:
            all_warranties = self.verify_and_clean_warranties(all_warranties)
        
        self.warranty_claims = all_warranties
        
        print(f"✅ Total warranty claims identified: {len(self.warranty_claims)}")
        
        # Display warranty number range
        if self.warranty_claims:
            warranty_numbers = [w.get('warranty_number', '') for w in self.warranty_claims if w.get('warranty_number')]
            if warranty_numbers:
                print(f"📊 Warranty numbers range: {min(warranty_numbers)} to {max(warranty_numbers)}")
            
            print("\nSample warranty claims:")
            for i, warranty in enumerate(self.warranty_claims[:5]):
                print(f"{i+1}. [{warranty.get('warranty_number', 'N/A')}] {warranty.get('warranty_title', 'No title')}")
        
        return self.warranty_claims
    
    def verify_and_clean_warranties(self, warranties: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Verify and clean warranty claims.

        Drops entries without a ``warranty_number`` or ``warranty_title``,
        removes later duplicates of a warranty number, truncates
        ``warranty_text`` longer than 500 characters to 497 characters plus
        ``"..."``, and sorts the result by the first two numeric components
        of the warranty number.

        The input dictionaries are not modified; a truncated entry is
        returned as a shallow copy.

        Args:
            warranties: Raw warranty dictionaries.

        Returns:
            The cleaned, sorted list. If cleaning fails unexpectedly the
            original list is returned unchanged.
        """
        def sort_key(w):
            # Coerce to str so numeric values (e.g. 3.2 parsed from JSON)
            # sort by section like their string counterparts.
            num = str(w.get('warranty_number', '0'))
            parts = re.findall(r'\d+', num)
            major = int(parts[0]) if parts else 0
            minor = int(parts[1]) if len(parts) > 1 else 0
            # The raw text breaks ties such as "2.1" vs "2.1(a)"
            return (major, minor, num)

        try:
            cleaned_warranties = []
            seen_numbers = set()

            for warranty in warranties:
                warranty_num = warranty.get('warranty_number')

                # Check if warranty has required fields
                if not warranty_num or not warranty.get('warranty_title'):
                    continue

                # Skip duplicates
                if warranty_num in seen_numbers:
                    continue
                seen_numbers.add(warranty_num)

                # A null warranty_text must not abort the whole clean-up
                warranty_text = warranty.get('warranty_text') or ''
                if len(warranty_text) > 500:
                    # Copy instead of mutating the caller's dictionary
                    warranty = {**warranty, 'warranty_text': warranty_text[:497] + "..."}

                cleaned_warranties.append(warranty)

            cleaned_warranties.sort(key=sort_key)
            return cleaned_warranties

        except Exception as e:
            print(f"⚠️ Warranty verification failed: {e}, using original list")
            return warranties
    
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def extract_key_terms_from_warranty(self, warranty: Dict[str, Any]) -> List[str]:
        """Extract key terms from warranty for search queries.

        Collects up to three capitalised phrases from the warranty text,
        followed by the legal/business vocabulary words found among the first
        five such matches in the lower-cased title and text.

        Duplicates are removed while keeping first-seen order, so the result
        is the same on every run (de-duplicating through ``set`` made the
        order, and therefore which five terms survived, vary between runs).

        Args:
            warranty: Warranty dictionary; ``warranty_title`` and
                ``warranty_text`` are read, and missing or ``None`` values are
                treated as empty.

        Returns:
            At most five distinct terms.
        """
        warranty_text = str(warranty.get('warranty_text') or '')
        warranty_title = str(warranty.get('warranty_title') or '')

        # Combine text
        full_text = f"{warranty_title} {warranty_text}".lower()

        # Look for capitalized terms (likely important concepts)
        caps_terms = re.findall(r'\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b', warranty_text)
        key_terms = caps_terms[:3]

        # Look for legal/business terms
        business_terms = re.findall(r'\b(?:agreement|contract|liability|obligation|compliance|breach|default|guarantee|indemnity|insurance|permit|license|employee|consultant|shareholder|subsidiary|acquisition|merger|transaction|disclosure|warranty|representation)\b', full_text)
        key_terms.extend(business_terms[:5])

        # dict.fromkeys removes duplicates and preserves insertion order
        return list(dict.fromkeys(key_terms))[:5]
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty claim."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Render the fallback disclosure from the warranty's own fields.

        No documents and no LLM are involved: the warranty number, title and
        text are dropped into a fixed Markdown template, with placeholder
        values for any missing key.

        NOTE: this class defines ``generate_basic_disclosure`` again further
        down; Python keeps only the last definition in a class body.
        """
        # The title is looked up twice on purpose: the heading and the bullet
        # use different placeholders when the key is missing.
        heading_title = warranty.get('warranty_title', 'Unknown')
        section = warranty.get('warranty_number', 'N/A')
        bullet_title = warranty.get('warranty_title', 'N/A')
        body_text = warranty.get('warranty_text', 'N/A')

        template_lines = [
            f"# Disclosure Summary for Warranty Claim: {heading_title}",
            "",
            "## Warranty Details",
            f"- **Section:** {section}",
            f"- **Title:** {bullet_title}",
            f"- **Text:** {body_text}",
            "",
            "## Analysis",
            # The trailing space belongs to the template text.
            "This warranty claim requires detailed analysis against the company's records and documentation. ",
            "",
            "## Conclusion",
            "Further investigation and document review required to provide comprehensive disclosure analysis.",
            "",
            "**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.",
            "",  # produces the final newline
        ]
        return "\n".join(template_lines)
    
    
    def export_to_markdown(self) -> str:
        """Export warranty claims and disclosures to markdown format."""
        print("\n" + "="*60)
        print("EXPORTING TO MARKDOWN")
        print("="*60)
        
        if not self.warranty_claims or not self.disclosures:
            print("⚠️ No warranty claims or disclosures available for export")
            return ""
        
        # Generate output filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = f"/tf/active/project_victoria_disclosures_fixed_{timestamp}.md"
        
        # Start building markdown content
        markdown_parts = []
        
        # Add header
        markdown_parts.append("# Project Victoria - Warranty Disclosures\\n")
        markdown_parts.append(f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
        markdown_parts.append(f"**Total Warranties Processed**: {len(self.warranty_claims)}")
        markdown_parts.append(f"**Total Disclosures Generated**: {len(self.disclosures)}")
        markdown_parts.append(f"**Total References**: {len(self.ref_manager.references)}")
        markdown_parts.append("\\n---\\n")
        
        # Add table of contents
        markdown_parts.append("## Table of Contents\\n")
        for warranty in self.warranty_claims:
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
            markdown_parts.append(f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})")
        
        markdown_parts.append("\\n---\\n")
        
        # Add warranty sections
        for disclosure_record in self.disclosures:
            warranty = disclosure_record['warranty']
            disclosure_text = disclosure_record['disclosure']
            
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            
            # Add warranty header
            markdown_parts.append(f"## {warranty_num} - {warranty_title}\\n")
            
            # Add warranty metadata
            section_name = warranty.get('section_name', 'Unknown Section')
            markdown_parts.append(f"**Section**: {section_name}")
            
            # Add document count if available
            warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
            doc_count = len(warranty_docs) if warranty_docs else 0
            markdown_parts.append(f"**Source Documents Found**: {doc_count}\\n")
            
            # Add warranty text
            markdown_parts.append("### Warranty Text\\n")
            warranty_text = warranty.get('warranty_text', 'No warranty text available')
            markdown_parts.append(warranty_text + "\\n")
            
            # Add disclosure
            markdown_parts.append("### Disclosure\\n")
            markdown_parts.append(disclosure_text + "\\n")
            
            # Add separator
            markdown_parts.append("---\\n")
        
        # Add references section
        bibliography = self.ref_manager.generate_bibliography()
        markdown_parts.append(bibliography)
        
        # Combine all parts
        markdown_content = "\\n".join(markdown_parts)
        
        # Write to file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        
        print(f"✅ Markdown exported to: {output_path}")
        print(f"📊 File size: {len(markdown_content):,} characters")
        
        return output_path
    
    def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
        """Return up to five generic search terms for a warranty.

        Terms come from two sources, in this order:

        1. The first category keyword (in ``term_mapping`` order) that occurs
           as a substring of the lower-cased title and text contributes its
           first two terms.
        2. The part of ``warranty_number`` before the first ``.`` contributes
           the terms mapped to that section number.

        Duplicates are removed while keeping first-seen order, so the result
        is the same on every run (a ``set`` would reorder the terms according
        to per-process string hashing, and callers slice the front of the
        list).

        NOTE: this class defines ``get_general_warranty_terms`` again further
        down; Python keeps only the last definition in a class body.

        Args:
            warranty: Mapping read for ``warranty_text``, ``warranty_title``
                and ``warranty_number``; missing or ``None`` values are
                treated as empty.

        Returns:
            A list of at most five distinct terms.
        """
        warranty_text = (warranty.get('warranty_text') or '').lower()
        warranty_title = (warranty.get('warranty_title') or '').lower()
        warranty_number = str(warranty.get('warranty_number') or '')

        general_terms = []

        # Map warranty types to general terms
        term_mapping = {
            'incorporation': ['incorporation', 'company formation', 'legal entity'],
            'shares': ['shareholding', 'equity', 'ownership'],
            'authority': ['power', 'authorization', 'legal capacity'],
            'accounts': ['financial statements', 'accounting', 'audited accounts'],
            'business': ['operations', 'trading', 'commercial activity'],
            'assets': ['property', 'equipment', 'resources'],
            'contracts': ['agreements', 'legal obligations', 'commitments'],
            'employment': ['employees', 'staff', 'personnel', 'labour'],
            'litigation': ['legal proceedings', 'disputes', 'claims'],
            'compliance': ['regulatory', 'legal requirements', 'obligations'],
            'insurance': ['policies', 'coverage', 'protection'],
            'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
            'environmental': ['EHS', 'environmental compliance', 'health safety'],
            'tax': ['taxation', 'tax compliance', 'tax obligations'],
            'permits': ['licenses', 'authorizations', 'regulatory approvals']
        }

        # Only the first matching category contributes terms.
        combined_text = f"{warranty_title} {warranty_text}"
        for category, terms in term_mapping.items():
            if category in combined_text:
                general_terms.extend(terms[:2])
                break

        # Add section-based terms keyed by the top-level section number.
        section_mapping = {
            '1': ['shares', 'sellers', 'ownership'],
            '2': ['authority', 'capacity', 'power'],
            '3': ['share capital', 'securities', 'equity'],
            '4': ['accounts', 'financial statements'],
            '5': ['business continuity', 'operations'],
            '6': ['assets', 'property', 'equipment'],
            '7': ['corporate', 'constitutional'],
            '8': ['contracts', 'agreements'],
            '9': ['borrowing', 'debt', 'financing'],
            '10': ['permits', 'licenses'],
            '11': ['insolvency', 'financial distress'],
            '12': ['litigation', 'legal proceedings'],
            '13': ['employment', 'employees'],
            '14': ['competition', 'antitrust'],
            '15': ['environmental', 'health safety'],
            '16': ['real estate', 'property'],
            '17': ['insurance', 'coverage'],
            '18': ['intellectual property', 'IP'],
            '19': ['data protection', 'privacy'],
            '20': ['tax', 'taxation'],
            '21': ['regulatory', 'compliance'],
            '22': ['technology', 'IT systems'],
            '23': ['material contracts', 'key agreements']
        }
        section_num = warranty_number.split('.')[0].strip()
        general_terms.extend(section_mapping.get(section_num, []))

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(general_terms))[:5]

    # Update the search method to include better debugging and error handling
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Render the fallback disclosure from the warranty's own fields.

        No documents and no LLM are involved: the warranty number, title and
        text are dropped into a fixed Markdown template, with placeholder
        values for any missing key.

        NOTE: this class defines ``generate_basic_disclosure`` again further
        down; Python keeps only the last definition in a class body.
        """
        # The title is looked up twice on purpose: the heading and the bullet
        # use different placeholders when the key is missing.
        heading_title = warranty.get('warranty_title', 'Unknown')
        section = warranty.get('warranty_number', 'N/A')
        bullet_title = warranty.get('warranty_title', 'N/A')
        body_text = warranty.get('warranty_text', 'N/A')

        template_lines = [
            f"# Disclosure Summary for Warranty Claim: {heading_title}",
            "",
            "## Warranty Details",
            f"- **Section:** {section}",
            f"- **Title:** {bullet_title}",
            f"- **Text:** {body_text}",
            "",
            "## Analysis",
            # The trailing space belongs to the template text.
            "This warranty claim requires detailed analysis against the company's records and documentation. ",
            "",
            "## Conclusion",
            "Further investigation and document review required to provide comprehensive disclosure analysis.",
            "",
            "**Note:** Limited disclosure analysis available due to insufficient supporting documentation or system limitations.",
            "",  # produces the final newline
        ]
        return "\n".join(template_lines)
    
    
    def export_to_markdown(self) -> str:
        """Export warranty claims and disclosures to markdown format."""
        print("\n" + "="*60)
        print("EXPORTING TO MARKDOWN")
        print("="*60)
        
        if not self.warranty_claims or not self.disclosures:
            print("⚠️ No warranty claims or disclosures available for export")
            return ""
        
        # Generate output filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = f"/tf/active/project_victoria_disclosures_fixed_{timestamp}.md"
        
        # Start building markdown content
        markdown_parts = []
        
        # Add header
        markdown_parts.append("# Project Victoria - Warranty Disclosures\\n")
        markdown_parts.append(f"**Generated on**: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
        markdown_parts.append(f"**Total Warranties Processed**: {len(self.warranty_claims)}")
        markdown_parts.append(f"**Total Disclosures Generated**: {len(self.disclosures)}")
        markdown_parts.append(f"**Total References**: {len(self.ref_manager.references)}")
        markdown_parts.append("\\n---\\n")
        
        # Add table of contents
        markdown_parts.append("## Table of Contents\\n")
        for warranty in self.warranty_claims:
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            anchor = warranty_title.lower().replace(' ', '-').replace('(', '').replace(')', '')
            markdown_parts.append(f"- [{warranty_num} - {warranty_title}](#{warranty_num.lower()}-{anchor})")
        
        markdown_parts.append("\\n---\\n")
        
        # Add warranty sections
        for disclosure_record in self.disclosures:
            warranty = disclosure_record['warranty']
            disclosure_text = disclosure_record['disclosure']
            
            warranty_num = warranty.get('warranty_number', 'Unknown')
            warranty_title = warranty.get('warranty_title', 'No Title')
            
            # Add warranty header
            markdown_parts.append(f"## {warranty_num} - {warranty_title}\\n")
            
            # Add warranty metadata
            section_name = warranty.get('section_name', 'Unknown Section')
            markdown_parts.append(f"**Section**: {section_name}")
            
            # Add document count if available
            warranty_docs = self.search_chroma_for_warranty(warranty, top_k=1)
            doc_count = len(warranty_docs) if warranty_docs else 0
            markdown_parts.append(f"**Source Documents Found**: {doc_count}\\n")
            
            # Add warranty text
            markdown_parts.append("### Warranty Text\\n")
            warranty_text = warranty.get('warranty_text', 'No warranty text available')
            markdown_parts.append(warranty_text + "\\n")
            
            # Add disclosure
            markdown_parts.append("### Disclosure\\n")
            markdown_parts.append(disclosure_text + "\\n")
            
            # Add separator
            markdown_parts.append("---\\n")
        
        # Add references section
        bibliography = self.ref_manager.generate_bibliography()
        markdown_parts.append(bibliography)
        
        # Combine all parts
        markdown_content = "\\n".join(markdown_parts)
        
        # Write to file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        
        print(f"✅ Markdown exported to: {output_path}")
        print(f"📊 File size: {len(markdown_content):,} characters")
        
        return output_path
    
    def get_general_warranty_terms(self, warranty: Dict[str, Any]) -> List[str]:
        """Return up to five generic search terms for a warranty.

        Terms come from two sources, in this order:

        1. The first category keyword (in ``term_mapping`` order) that occurs
           as a substring of the lower-cased title and text contributes its
           first two terms.
        2. The part of ``warranty_number`` before the first ``.`` contributes
           the terms mapped to that section number.

        Duplicates are removed while keeping first-seen order, so the result
        is the same on every run (a ``set`` would reorder the terms according
        to per-process string hashing, and callers slice the front of the
        list).

        Args:
            warranty: Mapping read for ``warranty_text``, ``warranty_title``
                and ``warranty_number``; missing or ``None`` values are
                treated as empty.

        Returns:
            A list of at most five distinct terms.
        """
        warranty_text = (warranty.get('warranty_text') or '').lower()
        warranty_title = (warranty.get('warranty_title') or '').lower()
        warranty_number = str(warranty.get('warranty_number') or '')

        general_terms = []

        # Map warranty types to general terms
        term_mapping = {
            'incorporation': ['incorporation', 'company formation', 'legal entity'],
            'shares': ['shareholding', 'equity', 'ownership'],
            'authority': ['power', 'authorization', 'legal capacity'],
            'accounts': ['financial statements', 'accounting', 'audited accounts'],
            'business': ['operations', 'trading', 'commercial activity'],
            'assets': ['property', 'equipment', 'resources'],
            'contracts': ['agreements', 'legal obligations', 'commitments'],
            'employment': ['employees', 'staff', 'personnel', 'labour'],
            'litigation': ['legal proceedings', 'disputes', 'claims'],
            'compliance': ['regulatory', 'legal requirements', 'obligations'],
            'insurance': ['policies', 'coverage', 'protection'],
            'intellectual property': ['IP', 'patents', 'trademarks', 'copyrights'],
            'environmental': ['EHS', 'environmental compliance', 'health safety'],
            'tax': ['taxation', 'tax compliance', 'tax obligations'],
            'permits': ['licenses', 'authorizations', 'regulatory approvals']
        }

        # Only the first matching category contributes terms.
        combined_text = f"{warranty_title} {warranty_text}"
        for category, terms in term_mapping.items():
            if category in combined_text:
                general_terms.extend(terms[:2])
                break

        # Add section-based terms keyed by the top-level section number.
        section_mapping = {
            '1': ['shares', 'sellers', 'ownership'],
            '2': ['authority', 'capacity', 'power'],
            '3': ['share capital', 'securities', 'equity'],
            '4': ['accounts', 'financial statements'],
            '5': ['business continuity', 'operations'],
            '6': ['assets', 'property', 'equipment'],
            '7': ['corporate', 'constitutional'],
            '8': ['contracts', 'agreements'],
            '9': ['borrowing', 'debt', 'financing'],
            '10': ['permits', 'licenses'],
            '11': ['insolvency', 'financial distress'],
            '12': ['litigation', 'legal proceedings'],
            '13': ['employment', 'employees'],
            '14': ['competition', 'antitrust'],
            '15': ['environmental', 'health safety'],
            '16': ['real estate', 'property'],
            '17': ['insurance', 'coverage'],
            '18': ['intellectual property', 'IP'],
            '19': ['data protection', 'privacy'],
            '20': ['tax', 'taxation'],
            '21': ['regulatory', 'compliance'],
            '22': ['technology', 'IT systems'],
            '23': ['material contracts', 'key agreements']
        }
        section_num = warranty_number.split('.')[0].strip()
        general_terms.extend(section_mapping.get(section_num, []))

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(general_terms))[:5]

    # Update the search method to include better debugging and error handling
    def search_chroma_for_warranty(self, warranty: Dict[str, Any], top_k: int = 15) -> List[Dict]:
        """Search Chroma DB for documents relevant to a specific warranty."""
        if not self.edr_collection:
            print("⚠️ Chroma DB not available, skipping search")
            return []
        
        # Create search queries based on warranty content
        warranty_text = warranty.get('warranty_text', '')
        warranty_title = warranty.get('warranty_title', '')
        warranty_number = warranty.get('warranty_number', '')
        
        # Generate multiple search queries for comprehensive coverage
        search_queries = []
        
        # Add warranty-specific terms
        if warranty_title:
            search_queries.append(warranty_title)
        
        if warranty_text:
            search_queries.append(warranty_text[:500])
        
        if warranty_title and warranty_text:
            search_queries.append(f"{warranty_title} {warranty_text[:200]}")
        
        # Add section-specific terms
        if warranty_number:
            search_queries.append(f"section {warranty_number}")
        
        # Extract key terms for additional queries
        key_terms = self.extract_key_terms_from_warranty(warranty)
        if key_terms:
            search_queries.extend(key_terms[:3])
        
        # Add fallback general terms based on warranty type
        general_terms = self.get_general_warranty_terms(warranty)
        search_queries.extend(general_terms[:2])
        
        all_documents = []
        retrieved_ids = set()
        
        print(f"🔍 Searching ChromaDB with {len(search_queries)} queries for warranty {warranty_number}")
        
        for i, query in enumerate(search_queries):
            if not query.strip():
                continue
                
            try:
                print(f"  Query {i+1}: '{query[:60]}{'...' if len(query) > 60 else ''}'")
                
                # Query Chroma DB
                results = self.edr_collection.query(
                    query_texts=[query],
                    n_results=min(top_k, 10),  # Limit per query
                    include=["documents", "metadatas", "distances"]
                )
                
                # Process results
                if results['documents'] and len(results['documents'][0]) > 0:
                    documents = results['documents'][0]
                    metadatas = results['metadatas'][0] if results['metadatas'] else [{}] * len(documents)
                    distances = results['distances'][0] if results['distances'] else [1.0] * len(documents)
                    
                    for j, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                        # Create unique document ID
                        doc_id = metadata.get('id', f'doc_{hash(doc[:100])}_{j}')
                        
                        if doc_id not in retrieved_ids:
                            retrieved_ids.add(doc_id)
                            all_documents.append({
                                'id': doc_id,
                                'content': doc,
                                'metadata': metadata,
                                'distance': distance,
                                'query_match': query
                            })
                
                print(f"    Found {len(results['documents'][0]) if results['documents'] else 0} documents")
                            
            except Exception as e:
                print(f"⚠️ ChromaDB search failed for query '{query[:50]}...': {e}")
                continue
        
        # Sort by relevance (lower distance = more relevant)
        all_documents.sort(key=lambda x: x['distance'])
        
        # Return top results
        final_docs = all_documents[:top_k]
        print(f"✅ Found {len(final_docs)} relevant documents from {len(all_documents)} total matches")
        
        return final_docs
    
    def generate_disclosure_for_warranty(self, warranty: Dict[str, Any]) -> str:
        """Generate a disclosure for a specific warranty."""
        if not LANGCHAIN_AVAILABLE:
            print("⚠️ LangChain not available for disclosure generation")
            return "Disclosure generation unavailable - LangChain not installed."
        
        print(f"Generating disclosure for: {warranty.get('warranty_number', 'Unknown')} - {warranty.get('warranty_title', 'Unknown')}")
        
        # Search for relevant documents
        relevant_docs = self.search_chroma_for_warranty(warranty)
        
        if not relevant_docs:
            print("⚠️ No relevant documents found in ChromaDB")
            return self.generate_basic_disclosure(warranty)
        
        # Prepare context from documents
        context_parts = []
        references = []
        
        for doc in relevant_docs[:10]:  # Limit to top 10 documents
            content = doc['content']
            metadata = doc['metadata']
            
            # Add to reference manager and get citation
            ref_num = self.ref_manager.add_document(
                doc_id=doc['id'],
                content=content,
                metadata=metadata
            )
            
            citation = f"[{ref_num}]"
            references.append(citation)
            
            # Truncate content for context
            truncated_content = content[:1000] + ("..." if len(content) > 1000 else "")
            context_parts.append(f"Document {citation}: {truncated_content}")
        
        context_text = "\n\n".join(context_parts)
        
        # Generate disclosure using LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, max_tokens=4000)
        
        disclosure_prompt = f"""
You are a legal expert creating a comprehensive warranty disclosure summary for a corporate acquisition.

**Warranty Claim Details:**
- **Number:** {warranty.get('warranty_number', 'N/A')}
- **Title:** {warranty.get('warranty_title', 'N/A')}
- **Text:** {warranty.get('warranty_text', 'N/A')}
- **Section:** {warranty.get('section_name', 'N/A')}

**Available Supporting Documentation:**
{context_text}

**Instructions:**
1. Create a comprehensive disclosure summary that addresses this specific warranty claim
2. Use inline citations {', '.join(references)} to reference the supporting documents
3. Include specific details from the documents that support or contradict the warranty
4. Structure the disclosure with clear headings and bullet points
5. Address potential risks, compliance issues, or qualifications to the warranty
6. Use professional legal language appropriate for an M&A context
7. Ensure the disclosure is thorough but concise (aim for 500-1000 words)

**Output Format:**
# Disclosure Summary for Warranty Claim: [Warranty Title]

## Overview
[Brief summary of the warranty and its implications]

## [Relevant Section Headers based on the specific warranty]
[Detailed analysis with inline citations]

## Conclusion
[Summary of key findings and any qualifications to the warranty]

**Note**: Include inline citations throughout using the format {', '.join(references)} to reference specific supporting documents.
"""
        
        try:
            response = llm.invoke(disclosure_prompt)
            disclosure_text = response.content.strip()
            
            # Ensure proper formatting
            if not disclosure_text.startswith('#'):
                disclosure_text = f"# Disclosure Summary for Warranty Claim: {warranty.get('warranty_title', 'Unknown')}\n\n{disclosure_text}"
            
            return disclosure_text
            
        except Exception as e:
            print(f"❌ Error generating disclosure: {e}")
            return self.generate_basic_disclosure(warranty)
    
    def generate_basic_disclosure(self, warranty: Dict[str, Any]) -> str:
        """Build a placeholder markdown disclosure from the warranty's own fields.

        Other methods of this class call it as the fallback when generating
        the full disclosure raises an exception.

        Args:
            warranty: Warranty record. The keys 'warranty_number',
                'warranty_title' and 'warranty_text' are read; each falls
                back to a placeholder string when the key is absent.

        Returns:
            Markdown text that begins and ends with a newline.
        """
        number = warranty.get('warranty_number', 'Unknown')
        title = warranty.get('warranty_title', 'Unknown')
        body = warranty.get('warranty_text', 'No warranty text available')

        # The empty first and last entries produce the leading and trailing
        # newline of the document once the lines are joined.
        lines = [
            "",
            f"# Basic Disclosure for Warranty {number}",
            "",
            "## Warranty Overview",
            f"- **Section**: {number}",
            f"- **Title**: {title}",
            f"- **Text**: {body}",
            "",
            "## Disclosure Status",
            # The trailing space on the next line is part of the emitted text.
            "This warranty requires detailed analysis based on company documentation and records. ",
            "",
            "**Note**: Full disclosure analysis requires access to supporting documentation from the company's data room and relevant business records.",
            "",
            "## Next Steps",
            "1. Review relevant company documentation",
            "2. Consult with legal and business teams",
            "3. Verify compliance with warranty requirements",
            "4. Update disclosure based on findings",
            "",
        ]
        return "\n".join(lines)
    
    def generate_all_disclosures(self):
        """Produce one disclosure record per entry in ``self.warranty_claims``.

        Each record is a dict with the keys 'warranty', 'disclosure' and
        'generated_at' (ISO timestamp). When generating the disclosure for a
        warranty raises, the record instead carries the basic disclosure plus
        an 'error' key holding the exception text, and processing continues
        with the next warranty.

        Results are stored in ``self.disclosures``. If there are no warranty
        claims the method prints a warning and returns without touching
        ``self.disclosures``.
        """
        rule = "="*60
        print("\n" + rule)
        print("GENERATING DISCLOSURES")
        print(rule)

        if not self.warranty_claims:
            print("⚠️ No warranty claims available for disclosure generation")
            return

        self.disclosures = []

        for position, claim in enumerate(self.warranty_claims, start=1):
            label = claim.get('warranty_number', 'Unknown')
            print(f"\nProgress: {position}/{len(self.warranty_claims)} - Processing warranty {label}")

            try:
                text = self.generate_disclosure_for_warranty(claim)
                record = {
                    'warranty': claim,
                    'disclosure': text,
                    'generated_at': datetime.now().isoformat(),
                }
            except Exception as exc:
                print(f"❌ Failed to generate disclosure for warranty {label}: {exc}")
                # Fall back to the basic disclosure and remember why.
                fallback_text = self.generate_basic_disclosure(claim)
                record = {
                    'warranty': claim,
                    'disclosure': fallback_text,
                    'generated_at': datetime.now().isoformat(),
                }
                record['error'] = str(exc)

            self.disclosures.append(record)

        print(f"\n✅ Generated {len(self.disclosures)} disclosures")
    
    def run_complete_pipeline(self) -> str:
        """Run the complete disclosure generation pipeline.

        Calls, in order: ``extract_pdf_text``, ``identify_warranty_claims``,
        ``generate_all_disclosures`` and ``export_to_markdown``, printing a
        progress line before each step and a summary at the end.

        Returns:
            The value returned by ``export_to_markdown`` (reported to the
            user as the output file).

        Raises:
            Exception: Any exception raised by a step is reported on stdout
                and then re-raised unchanged.
        """
        # NOTE: the separators below are real newlines. They were previously
        # written as "\\n", which printed a literal backslash-n instead of
        # the intended blank line.
        print("🚀 Starting Fixed Project Victoria Disclosure Generation Pipeline\n")

        try:
            # Step 1: Extract PDF text
            print("Step 1: Extracting PDF text...")
            self.extract_pdf_text()

            # Step 2: Identify warranty claims
            print("\nStep 2: Identifying warranty claims...")
            self.identify_warranty_claims()

            # Step 3: Generate disclosures
            print("\nStep 3: Generating disclosures...")
            self.generate_all_disclosures()

            # Step 4: Export to markdown
            print("\nStep 4: Exporting to markdown...")
            output_path = self.export_to_markdown()

            print("\n" + "="*60)
            print("✅ FIXED PIPELINE COMPLETED SUCCESSFULLY!")
            print("="*60)
            print(f"📁 Output file: {output_path}")
            print(f"📊 Total warranties processed: {len(self.warranty_claims)}")
            print(f"📝 Total disclosures generated: {len(self.disclosures)}")
            print(f"📚 Total references: {len(self.ref_manager.references)}")
            print("="*60)

            return output_path

        except Exception as e:
            # Report the failure, then let the caller decide how to handle it.
            print(f"❌ Pipeline failed: {e}")
            raise
                        

Improved Code

🔍 Code Extractor

class FixedProjectVictoriaGenerator

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self)`

`extract_pdf_text(self)`

`split_text_by_warranty_sections(self, text) -> List[str]`

`identify_warranty_claims(self) -> List[Dict[str, Any]]`

`verify_and_clean_warranties(self, warranties) -> List[Dict[str, Any]]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`extract_key_terms_from_warranty(self, warranty) -> List[str]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`export_to_markdown(self) -> str`

`get_general_warranty_terms(self, warranty) -> List[str]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`export_to_markdown(self) -> str`

`get_general_warranty_terms(self, warranty) -> List[str]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`generate_all_disclosures(self)`

`run_complete_pipeline(self) -> str`

Required Imports

Usage Example

Tags

Similar Components

class ProjectVictoriaDisclosureGenerator 82.8% similar

class ImprovedProjectVictoriaGenerator 77.8% similar

function main_v28 67.8% similar

function main_v29 63.0% similar

function main_v14 56.7% similar

class FixedProjectVictoriaGenerator

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self)

extract_pdf_text(self)

split_text_by_warranty_sections(self, text) -> List[str]

identify_warranty_claims(self) -> List[Dict[str, Any]]

verify_and_clean_warranties(self, warranties) -> List[Dict[str, Any]]

search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]

extract_key_terms_from_warranty(self, warranty) -> List[str]

generate_disclosure_for_warranty(self, warranty) -> str

generate_basic_disclosure(self, warranty) -> str

export_to_markdown(self) -> str

get_general_warranty_terms(self, warranty) -> List[str]

search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]

generate_disclosure_for_warranty(self, warranty) -> str

generate_basic_disclosure(self, warranty) -> str

export_to_markdown(self) -> str

get_general_warranty_terms(self, warranty) -> List[str]

search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]

generate_disclosure_for_warranty(self, warranty) -> str

generate_basic_disclosure(self, warranty) -> str

generate_all_disclosures(self)

run_complete_pipeline(self) -> str

Required Imports

Usage Example

Tags

Similar Components

class ProjectVictoriaDisclosureGenerator 82.8% similar

class ImprovedProjectVictoriaGenerator 77.8% similar

function main_v28 67.8% similar

function main_v29 63.0% similar

function main_v14 56.7% similar

✨ Improve Code: FixedProjectVictoriaGenerator

Code Comparison

`__init__(self)`

`extract_pdf_text(self)`

`split_text_by_warranty_sections(self, text) -> List[str]`

`identify_warranty_claims(self) -> List[Dict[str, Any]]`

`verify_and_clean_warranties(self, warranties) -> List[Dict[str, Any]]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`extract_key_terms_from_warranty(self, warranty) -> List[str]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`export_to_markdown(self) -> str`

`get_general_warranty_terms(self, warranty) -> List[str]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`export_to_markdown(self) -> str`

`get_general_warranty_terms(self, warranty) -> List[str]`

`search_chroma_for_warranty(self, warranty, top_k) -> List[Dict]`

`generate_disclosure_for_warranty(self, warranty) -> str`

`generate_basic_disclosure(self, warranty) -> str`

`generate_all_disclosures(self)`

`run_complete_pipeline(self) -> str`