class LLMHandler
Handles OpenAI LLM interactions with optimized prompts for analyzing handwritten and drawn content, providing comprehensive responses with optional hybrid graphics integration.
/tf/active/vicechatdev/e-ink-llm/llm_handler.py
8 - 426
complex
Purpose
This class manages the complete lifecycle of analyzing handwritten/drawn images using OpenAI's GPT models. It employs a two-stage approach: first using a small model (gpt-4o-mini) to analyze content type and characteristics, then using a main model (gpt-4o) to generate comprehensive responses. The class supports dynamic prompt generation based on content analysis, hybrid mode with graphics placeholders, usage tracking, and cost estimation. It's designed for e-ink display applications and handles various content types including questions, instructions, diagrams, mathematical notation, and mixed content.
Source Code
class LLMHandler:
    """Handles OpenAI LLM interactions with optimized prompts for handwritten/drawn content.

    Two-stage pipeline: a small model (gpt-4o-mini) first classifies the image
    content, then the main model (gpt-4o) generates the comprehensive response.
    Token usage and an approximate dollar cost are accumulated in ``usage_stats``.
    """

    # Approximate USD cost per 1K tokens as (input_rate, output_rate), 2024 pricing.
    # NOTE(review): verify against current OpenAI pricing before relying on estimates.
    _PRICING = {
        "gpt-4o": (0.005, 0.015),
        "gpt-4o-mini": (0.00015, 0.0006),
    }

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the handler.

        Args:
            api_key: OpenAI API key. Falls back to the OPENAI_API_KEY
                environment variable when omitted.

        Raises:
            ValueError: If no API key is available from either source.
        """
        # Use API key from environment or parameter (matching OneCo_hybrid_RAG pattern)
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
        self.client = OpenAI(api_key=self.api_key)
        # Model configurations (matching OneCo_hybrid_RAG style)
        self.small_model = "gpt-4o-mini"  # For preprocessing tasks
        self.main_model = "gpt-4o"  # For main processing
        # Usage tracking across both models
        self.usage_stats = {
            "preprocessing_calls": 0,
            "main_processing_calls": 0,
            "total_tokens_used": 0,
            "total_cost_estimate": 0.0,
        }

    async def analyze_and_respond(self, image_b64: str, metadata: Dict[str, Any]) -> str:
        """
        Main method to analyze handwritten/drawn content and provide comprehensive response.

        Args:
            image_b64: Base64 encoded image
            metadata: Image metadata from input processor

        Returns:
            Comprehensive response string
        """
        print(f"🧠 Starting LLM analysis for {metadata.get('source_type', 'unknown')} content...")
        # Step 1: Quick content analysis with small model
        content_analysis = await self._analyze_content_type(image_b64)
        print(f"📋 Content analysis: {content_analysis['content_type']}")
        # Step 2: Generate comprehensive response with main model
        response = await self._generate_comprehensive_response(image_b64, content_analysis, metadata)
        print("✅ LLM analysis complete")
        return response

    async def _analyze_content_type(self, image_b64: str) -> Dict[str, Any]:
        """Analyze content type and structure using the small model.

        Returns a dict with keys: content_type, language, complexity, elements,
        response_approach, confidence. Falls back to a default "mixed" analysis
        when the API call or JSON parsing fails.
        """
        print(f"🔍 Analyzing content type with {self.small_model}...")
        prompt = """
You are an expert content analyzer. Analyze this handwritten/drawn image and classify its content type and characteristics.
Determine:
1. Content type (question, instruction, diagram, notes, sketch, etc.)
2. Primary language (if text is present)
3. Complexity level (simple, moderate, complex)
4. Key elements present (text, drawings, diagrams, mathematical notation, etc.)
5. Suggested response approach
Output your analysis in strict JSON format:
```json
{
"content_type": "question|instruction|diagram|notes|sketch|mixed",
"language": "english|spanish|french|other",
"complexity": "simple|moderate|complex",
"elements": ["text", "drawings", "diagrams", "math", "tables"],
"response_approach": "direct_answer|explanation|step_by_step|analysis|interpretation",
"confidence": 0.8
}
```
Focus on accuracy and be concise.
"""
        try:
            # The sync client is run in a worker thread to keep the event loop free.
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self.small_model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_b64}",
                                    "detail": "low"  # Use low detail for analysis to save tokens
                                }
                            }
                        ]
                    }
                ],
                max_tokens=500,
                temperature=0.1
            )
            self.usage_stats["preprocessing_calls"] += 1
            # Bill this call at the small model's rates (fix: previously priced as gpt-4o).
            self._update_usage_stats(response.usage, model=self.small_model)
            # Parse JSON response, stripping an optional Markdown code fence
            content = response.choices[0].message.content
            if '```json' in content:
                content = content.split('```json')[1].split('```')[0].strip()
            elif '```' in content:
                content = content.split('```')[1].split('```')[0].strip()
            return json.loads(content)
        except Exception as e:
            print(f"⚠️ Error in content analysis: {e}")
            # Return default analysis if parsing fails
            return {
                "content_type": "mixed",
                "language": "english",
                "complexity": "moderate",
                "elements": ["text", "drawings"],
                "response_approach": "analysis",
                "confidence": 0.5
            }

    async def _generate_comprehensive_response(self, image_b64: str, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str:
        """Generate comprehensive response using main model with optimized prompts.

        Returns the model's text, or an error message string when the call fails.
        """
        print(f"🎯 Generating comprehensive response with {self.main_model}...")
        # Build dynamic prompt based on content analysis
        prompt = self._build_dynamic_prompt(content_analysis, metadata)
        try:
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self.main_model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_b64}",
                                    "detail": "high"  # High detail for comprehensive analysis
                                }
                            }
                        ]
                    }
                ],
                max_tokens=2500,
                temperature=0.3
            )
            self.usage_stats["main_processing_calls"] += 1
            self._update_usage_stats(response.usage, model=self.main_model)
            return response.choices[0].message.content
        except Exception as e:
            return f"Error processing request: {str(e)}\n\nPlease ensure the image is clear and try again."

    def _build_dynamic_prompt(self, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str:
        """Build an optimized prompt based on the content analysis results.

        Selects type-specific instructions (question / instruction / diagram /
        math / mixed) and optionally appends hybrid-graphics guidance.
        """
        content_type = content_analysis.get("content_type", "mixed")
        complexity = content_analysis.get("complexity", "moderate")
        approach = content_analysis.get("response_approach", "analysis")
        elements = content_analysis.get("elements", [])
        # Check if hybrid mode is available and appropriate
        use_hybrid_mode = self._should_use_hybrid_mode(content_analysis, metadata)
        # Base prompt structure
        base_prompt = f"""
You are an expert AI assistant specialized in analyzing and responding to handwritten and drawn content.
**Content Analysis Results:**
- Content Type: {content_type}
- Complexity: {complexity}
- Approach: {approach}
- Elements Present: {', '.join(elements)}
- Source: {metadata.get('source_type', 'unknown')}
**Your Task:**
Carefully analyze this {content_type} content and provide a comprehensive, well-structured response.
"""
        # Add hybrid graphics instruction if appropriate
        if use_hybrid_mode:
            base_prompt += self._get_hybrid_graphics_instruction(content_analysis)
        # Add specific instructions based on content type
        if content_type == "question":
            specific_instructions = """
**Response Structure for Questions:**
1. **Question Understanding**: Restate what you understand from the handwritten question
2. **Direct Answer**: Provide a clear, direct answer
3. **Detailed Explanation**: Elaborate with context, examples, and supporting information
4. **Additional Context**: Include related information that might be helpful
5. **Summary**: Conclude with key takeaways
**Requirements:**
- Address all parts of the question thoroughly
- Use clear, accessible language
- Provide examples where helpful
- Include relevant context and background information
"""
            if use_hybrid_mode:
                specific_instructions += """
**Graphics Integration for Questions:**
- Include charts for data-related questions
- Add diagrams for process or concept questions
- Use illustrations for educational explanations
- Provide visual examples where they enhance understanding
"""
        elif content_type == "instruction":
            specific_instructions = """
**Response Structure for Instructions:**
1. **Instruction Analysis**: Clarify what the instruction is asking for
2. **Step-by-Step Response**: Break down your response into clear steps
3. **Detailed Guidance**: Provide comprehensive guidance for each step
4. **Best Practices**: Include tips and best practices
5. **Troubleshooting**: Address potential issues or alternatives
**Requirements:**
- Follow the instruction precisely
- Provide actionable, specific guidance
- Include warnings or precautions if relevant
- Offer alternatives or variations where appropriate
"""
            if use_hybrid_mode:
                specific_instructions += """
**Graphics Integration for Instructions:**
- Create flowcharts for step-by-step processes
- Include diagrams for complex procedures
- Add illustrations for technical concepts
- Use charts for comparative information
"""
        elif content_type == "diagram" or "diagrams" in elements:
            specific_instructions = """
**Response Structure for Diagrams:**
1. **Diagram Description**: Describe what you see in the diagram
2. **Component Analysis**: Break down the key components and their relationships
3. **Interpretation**: Explain what the diagram represents or demonstrates
4. **Context and Applications**: Provide relevant context and real-world applications
5. **Additional Information**: Include related concepts or principles
**Requirements:**
- Describe visual elements clearly
- Explain relationships between components
- Provide technical accuracy
- Include practical applications or examples
"""
            if use_hybrid_mode:
                specific_instructions += """
**Graphics Integration for Diagrams:**
- Create enhanced versions of hand-drawn diagrams
- Add professional diagram representations
- Include process flow improvements
- Provide alternative visual perspectives
"""
        elif "math" in elements:
            specific_instructions = """
**Response Structure for Mathematical Content:**
1. **Problem/Expression Recognition**: Identify the mathematical content
2. **Solution Process**: Show step-by-step solution if it's a problem
3. **Explanation**: Explain the mathematical concepts involved
4. **Verification**: Check the solution or explain the concept thoroughly
5. **Related Concepts**: Include related mathematical principles
**Requirements:**
- Show all mathematical steps clearly
- Explain the reasoning behind each step
- Use proper mathematical notation in text form
- Provide conceptual understanding, not just calculations
"""
            if use_hybrid_mode:
                specific_instructions += """
**Graphics Integration for Mathematics:**
- Use illustration graphics for mathematical concepts
- Include charts for data analysis or statistics
- Create diagrams for geometric problems
- Show visual solutions where helpful
"""
        else:  # mixed or other content types
            specific_instructions = """
**Response Structure for Mixed Content:**
1. **Content Overview**: Summarize what you observe in the handwritten/drawn content
2. **Element-by-Element Analysis**: Address each distinct element (text, drawings, etc.)
3. **Synthesis**: Connect the different elements and explain their relationship
4. **Comprehensive Response**: Provide thorough information addressing all aspects
5. **Conclusion**: Summarize key points and implications
**Requirements:**
- Address all visible elements in the content
- Maintain logical flow between different content types
- Provide depth and detail appropriate to the content
- Ensure clarity and accessibility
"""
            if use_hybrid_mode:
                specific_instructions += """
**Graphics Integration for Mixed Content:**
- Add graphics that complement and enhance written explanations
- Use appropriate chart types for any data mentioned
- Include diagrams for processes or workflows
- Provide illustrations for complex concepts
"""
        # Add formatting requirements
        formatting_requirements = """
**Formatting Requirements:**
- Use clear Markdown formatting with appropriate headers
- Structure your response with logical sections
- Use bullet points or numbered lists for clarity
- Include **bold** text for emphasis on key points
- Ensure the response is well-organized and easy to read on an e-ink display
- Keep paragraphs concise but informative
**Quality Standards:**
- Provide accurate, helpful information
- Be thorough but not unnecessarily verbose
- Use examples and analogies where helpful
- Maintain a helpful, educational tone
- Double-check any factual claims or calculations
"""
        return base_prompt + specific_instructions + formatting_requirements

    def _should_use_hybrid_mode(self, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> bool:
        """Determine if hybrid mode with graphics should be used.

        Hybrid mode is opt-out via metadata['enable_hybrid_mode'] (defaults on),
        and is triggered by graphic-friendly content types, graphic elements,
        or moderate/complex content.
        """
        # Check if hybrid mode is enabled in metadata
        if not metadata.get('enable_hybrid_mode', True):
            return False
        # Use hybrid mode for content that would benefit from graphics
        content_type = content_analysis.get("content_type", "mixed")
        elements = content_analysis.get("elements", [])
        complexity = content_analysis.get("complexity", "moderate")
        # Use graphics for these content types
        graphic_friendly_types = ["question", "instruction", "diagram", "mixed"]
        has_graphic_elements = any(elem in elements for elem in ["math", "diagrams", "drawings"])
        is_complex = complexity in ["moderate", "complex"]
        return (content_type in graphic_friendly_types or
                has_graphic_elements or
                is_complex)

    def _get_hybrid_graphics_instruction(self, content_analysis: Dict[str, Any]) -> str:
        """Get the graphics-placeholder instruction text for hybrid responses."""
        graphics_instruction = """
**HYBRID RESPONSE CAPABILITY:**
You can now include graphics in your responses! Use the following placeholder format to request graphics:
[GRAPHIC:type:description:parameters]
**Available Graphic Types:**
- chart: Data visualizations (bar, line, pie, scatter charts)
- diagram: Process flows, organizational charts, concept maps
- illustration: Educational diagrams, mathematical concepts, technical drawings
- sketch: Simple drawings, annotations, visual explanations
**Placeholder Format Examples:**
- [GRAPHIC:chart:Sales Comparison:{"type":"bar","data":[25,40,30,45],"labels":["Q1","Q2","Q3","Q4"],"title":"Quarterly Sales"}]
- [GRAPHIC:diagram:Process Flow:{"steps":["Input","Process","Output"],"style":"flowchart","direction":"horizontal"}]
- [GRAPHIC:illustration:Mathematical Concept:{"concept":"quadratic_function","style":"educational","annotations":true}]
**When to Include Graphics:**
- Data that would benefit from visualization
- Complex processes that need step-by-step diagrams
- Mathematical or scientific concepts
- Comparisons that work better visually
- Any content where a graphic would enhance understanding
**Graphics Integration Guidelines:**
1. Place graphic placeholders exactly where you want them in your text
2. Ensure graphics complement and enhance your written explanation
3. Provide clear, descriptive parameters for graphic generation
4. Use graphics strategically - not every response needs them
5. Consider the e-ink display limitations (high contrast, simple designs work best)
**Response Structure with Graphics:**
- Start with your text explanation
- Insert graphic placeholders at relevant points
- Continue your explanation referencing the graphics
- Ensure the response flows naturally even without the graphics
**Important:** Only include graphics when they genuinely enhance your response. A good text-only response is better than a response with unnecessary graphics. Focus on clarity and helpfulness above all else.
"""
        return graphics_instruction

    def _update_usage_stats(self, usage_data, model: Optional[str] = None) -> None:
        """Update usage statistics for cost tracking.

        Args:
            usage_data: OpenAI usage object exposing total_tokens, and
                optionally prompt_tokens / completion_tokens.
            model: Model that produced this usage. Defaults to the main model
                when omitted (backward compatible with the old signature).

        Fix: the previous implementation priced every call at gpt-4o rates,
        overestimating the cost of gpt-4o-mini preprocessing calls.
        """
        if not hasattr(usage_data, 'total_tokens'):
            return
        self.usage_stats["total_tokens_used"] += usage_data.total_tokens
        # Rough cost estimation (as of 2024 pricing), keyed by the actual model.
        rates = self._PRICING.get(model or self.main_model)
        if rates is not None:
            input_rate, output_rate = rates
            prompt_tokens = getattr(usage_data, 'prompt_tokens', 0)
            completion_tokens = getattr(usage_data, 'completion_tokens', 0)
            cost = (prompt_tokens * input_rate + completion_tokens * output_rate) / 1000
            self.usage_stats["total_cost_estimate"] += cost

    def get_usage_summary(self) -> Dict[str, Any]:
        """Get a copy of the current usage statistics."""
        return self.usage_stats.copy()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
api_key: Optional OpenAI API key string. If not provided, the class will attempt to read from the OPENAI_API_KEY environment variable. If neither is available, raises ValueError during initialization.
Return Value
Instantiation returns an LLMHandler object configured with OpenAI client, model settings, and usage tracking. The main method analyze_and_respond() returns a string containing the comprehensive response with optional graphics placeholders. The get_usage_summary() method returns a dictionary with usage statistics including call counts, token usage, and cost estimates.
Class Interface
Methods
__init__(self, api_key: Optional[str] = None)
Purpose: Initialize the LLMHandler with OpenAI client, model configurations, and usage tracking
Parameters:
api_key: Optional OpenAI API key. Falls back to OPENAI_API_KEY environment variable if not provided
Returns: None (constructor)
async analyze_and_respond(self, image_b64: str, metadata: Dict[str, Any]) -> str
Purpose: Main method to analyze handwritten/drawn content and provide comprehensive response with two-stage processing
Parameters:
image_b64: Base64 encoded image string (PNG format recommended)
metadata: Dictionary containing image metadata including 'source_type' and 'enable_hybrid_mode' flags
Returns: String containing comprehensive response with optional graphics placeholders in format [GRAPHIC:type:description:parameters]
async _analyze_content_type(self, image_b64: str) -> Dict[str, Any]
Purpose: Analyze content type and structure using small model (gpt-4o-mini) for efficient preprocessing
Parameters:
image_b64: Base64 encoded image string
Returns: Dictionary with keys: content_type, language, complexity, elements, response_approach, confidence
async _generate_comprehensive_response(self, image_b64: str, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str
Purpose: Generate comprehensive response using main model (gpt-4o) with optimized prompts based on content analysis
Parameters:
image_b64: Base64 encoded image string
content_analysis: Dictionary from _analyze_content_type containing content characteristics
metadata: Image metadata dictionary
Returns: String containing the comprehensive response text with optional graphics placeholders
_build_dynamic_prompt(self, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str
Purpose: Build optimized prompt dynamically based on content analysis results and metadata
Parameters:
content_analysis: Dictionary containing content type, complexity, elements, and approach
metadata: Image metadata including source_type and hybrid mode settings
Returns: String containing the complete prompt tailored to the specific content type
_should_use_hybrid_mode(self, content_analysis: Dict[str, Any], metadata: Dict[str, Any]) -> bool
Purpose: Determine if hybrid mode with graphics should be enabled based on content characteristics
Parameters:
content_analysis: Dictionary with content type, elements, and complexity
metadata: Metadata dictionary with enable_hybrid_mode flag
Returns: Boolean indicating whether to include graphics instructions in the prompt
_get_hybrid_graphics_instruction(self, content_analysis: Dict[str, Any]) -> str
Purpose: Get detailed graphics instruction text for hybrid mode responses
Parameters:
content_analysis: Dictionary containing content analysis results
Returns: String containing comprehensive instructions for including graphics placeholders
_update_usage_stats(self, usage_data)
Purpose: Update internal usage statistics for cost tracking based on API response usage data
Parameters:
usage_data: OpenAI API usage object containing token counts (total_tokens, prompt_tokens, completion_tokens)
Returns: None (updates internal state)
get_usage_summary(self) -> Dict[str, Any]
Purpose: Get current usage statistics including call counts, token usage, and cost estimates
Returns: Dictionary copy with keys: preprocessing_calls, main_processing_calls, total_tokens_used, total_cost_estimate
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
api_key |
str | OpenAI API key used for authentication | instance |
client |
OpenAI | OpenAI client instance for making API calls | instance |
small_model |
str | Model name for preprocessing tasks (default: 'gpt-4o-mini') | instance |
main_model |
str | Model name for main processing tasks (default: 'gpt-4o') | instance |
usage_stats |
Dict[str, Any] | Dictionary tracking preprocessing_calls, main_processing_calls, total_tokens_used, and total_cost_estimate | instance |
Dependencies
openai, asyncio, json, os, time, typing
Required Imports
import os
import asyncio
import json
from openai import OpenAI
from typing import Dict, Any, Optional
import time
Usage Example
import os
import asyncio
from llm_handler import LLMHandler
# Initialize handler
handler = LLMHandler(api_key=os.getenv('OPENAI_API_KEY'))
# Prepare image and metadata
image_b64 = "base64_encoded_image_string"
metadata = {
'source_type': 'handwritten',
'enable_hybrid_mode': True
}
# Analyze and get response (async)
async def process_image():
response = await handler.analyze_and_respond(image_b64, metadata)
print(response)
# Check usage statistics
stats = handler.get_usage_summary()
print(f"Tokens used: {stats['total_tokens_used']}")
print(f"Estimated cost: ${stats['total_cost_estimate']:.4f}")
# Run async function
asyncio.run(process_image())
Best Practices
- Always provide an API key either through constructor parameter or OPENAI_API_KEY environment variable
- Use async/await pattern when calling analyze_and_respond() and other async methods
- Monitor usage statistics with get_usage_summary() to track API costs
- Ensure base64 encoded images are valid PNG format for best results
- Set enable_hybrid_mode in metadata to control graphics placeholder generation
- The class uses a two-stage processing approach: content analysis with small model, then comprehensive response with main model
- Handle exceptions from API calls as network errors or rate limits may occur
- The class maintains state through usage_stats - create new instances for independent tracking
- Graphics placeholders in responses follow format: [GRAPHIC:type:description:parameters]
- Cost estimates are approximate based on 2024 pricing and may need adjustment
- The class is designed for e-ink displays, so responses are optimized for high contrast and readability
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class LLMClient_v1 66.7% similar
-
function run_demo 63.5% similar
-
class HybridResponseHandler 62.9% similar
-
function main_v68 62.9% similar
-
class OpenAIChatLLM 62.9% similar