class AnnotationDetector
A class that detects various types of annotations in PDF documents including red pen markups, highlights, strikethrough lines, underlines, and insertion marks using computer vision and native PDF annotation extraction.
/tf/active/vicechatdev/e-ink-llm/annotation_detector.py
36 - 348
complex
Purpose
The AnnotationDetector class provides comprehensive annotation detection capabilities for PDF documents. It combines computer vision techniques (color-based detection using OpenCV) with native PDF annotation extraction (using PyMuPDF/fitz) to identify and classify different types of annotations. The class processes PDF pages as images to detect visual annotations like colored highlights and markups, while also extracting native PDF annotations. It's designed for document analysis workflows where identifying user annotations, edits, and markups is important, such as grading systems, document review processes, or collaborative editing analysis.
Source Code
class AnnotationDetector:
    """
    Detects various types of annotations in PDF documents:
    - Red pen markups and corrections
    - Yellow/green highlights
    - Strikethrough lines
    - Underlines
    - Insertion marks

    Detection combines colour and line analysis of rendered page images
    (OpenCV) with the annotation objects stored in the PDF itself (PyMuPDF).

    NOTE(review): no code path in this class currently emits the
    'insertion' type; only its minimum-area threshold is defined.
    """

    # Only the first MAX_IMAGE_PAGES pages are rendered for image analysis.
    MAX_IMAGE_PAGES = 5
    # Zoom factor used when rendering pages; 2.0 doubles each dimension.
    RENDER_ZOOM = 2.0
    # Colour contours scoring at or below this confidence are discarded.
    MIN_CONFIDENCE = 0.3
    # Maps the PDF annotation subtype name (second element of PyMuPDF's
    # ``Annot.type``) onto this detector's categories. Unlisted subtypes
    # fall back to 'markup'.
    NATIVE_TYPE_MAPPING = {
        'Highlight': 'highlight',
        'StrikeOut': 'strikethrough',
        'Underline': 'underline',
        'FreeText': 'markup',
        'Text': 'markup',
        'Ink': 'markup',
    }

    def __init__(self):
        """Set up the HSV colour ranges and per-type minimum contour areas."""
        # Color ranges for different annotation types (HSV format).
        # Red needs two entries because its hue wraps around the ends of
        # OpenCV's 0-180 hue axis.
        self.color_ranges = {
            'red_markup': {
                'lower': np.array([0, 120, 120]),
                'upper': np.array([10, 255, 255]),
                'type': 'markup'
            },
            'red_markup_2': {
                'lower': np.array([170, 120, 120]),
                'upper': np.array([180, 255, 255]),
                'type': 'markup'
            },
            'yellow_highlight': {
                'lower': np.array([20, 100, 100]),
                'upper': np.array([30, 255, 255]),
                'type': 'highlight'
            },
            'green_highlight': {
                'lower': np.array([40, 100, 100]),
                'upper': np.array([80, 255, 255]),
                'type': 'highlight'
            },
            'blue_markup': {
                'lower': np.array([100, 120, 120]),
                'upper': np.array([130, 255, 255]),
                'type': 'markup'
            }
        }
        # Minimum sizes (contour area in rendered pixels) per annotation type
        self.min_areas = {
            'highlight': 100,     # Highlights should be reasonably large
            'markup': 20,         # Markup can be small pen strokes
            'strikethrough': 50,  # Strikethrough lines
            'underline': 30,      # Underlines
            'insertion': 10       # Small insertion marks
        }

    async def detect_annotations_in_pdf(self, pdf_path: str) -> Optional["AnnotationResult"]:
        """
        Detect annotations in a PDF document

        Image-based detection runs on the rendered pages first; native PDF
        annotations are then appended. Bounds of image-based results are in
        rendered-pixel coordinates, whereas native results carry the
        annotation rectangle as stored in the PDF (no render zoom applied).

        Args:
            pdf_path: Path to the PDF file

        Returns:
            AnnotationResult with detected annotations or None if failed
            (including when no page could be rendered)
        """
        try:
            logger.info(f"Starting annotation detection for {pdf_path}")

            # Convert PDF pages to images for processing
            images = self._pdf_to_images(pdf_path)
            if not images:
                logger.warning("Failed to convert PDF to images")
                return None

            all_annotations = []
            # Process each rendered page; page numbers are 1-based
            for page_num, image in enumerate(images, 1):
                page_annotations = self._detect_annotations_in_image(image, page_num)
                all_annotations.extend(page_annotations)
                if page_annotations:
                    logger.info(f"Page {page_num}: Found {len(page_annotations)} annotations")
            pages_processed = len(images)

            # Also check for native PDF annotations
            all_annotations.extend(self._detect_native_pdf_annotations(pdf_path))

            # Generate summary
            total_annotations = len(all_annotations)
            summary = self._generate_detection_summary(all_annotations, pages_processed)
            logger.info(f"Annotation detection complete: {total_annotations} annotations found")

            return AnnotationResult(
                annotations=all_annotations,
                total_annotations=total_annotations,
                pages_processed=pages_processed,
                detection_summary=summary
            )
        except Exception as e:
            # Top-level boundary: callers receive None instead of an exception
            logger.error(f"Error in annotation detection: {e}")
            return None

    def _pdf_to_images(self, pdf_path: str) -> List[np.ndarray]:
        """
        Convert PDF pages to OpenCV images

        Renders at most MAX_IMAGE_PAGES pages at RENDER_ZOOM magnification.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            One BGR image per rendered page, or an empty list on any error
        """
        images = []
        doc = None
        try:
            # Use PyMuPDF for better image quality
            doc = fitz.open(pdf_path)
            # Render above native resolution for better annotation detection
            mat = fitz.Matrix(self.RENDER_ZOOM, self.RENDER_ZOOM)
            for page_num in range(min(doc.page_count, self.MAX_IMAGE_PAGES)):
                page = doc.load_page(page_num)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                # Convert to OpenCV format via an in-memory PPM image
                nparr = np.frombuffer(pix.tobytes("ppm"), np.uint8)
                img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
                if img is not None:
                    images.append(img)
            return images
        except Exception as e:
            logger.error(f"Error converting PDF to images: {e}")
            return []
        finally:
            # Release the document even when rendering failed part-way
            if doc is not None:
                doc.close()

    def _detect_annotations_in_image(self, image: np.ndarray, page_number: int) -> List["AnnotationInfo"]:
        """
        Detect annotations in a single image using color-based detection

        Each configured HSV range is thresholded separately. Every external
        contour that is large enough and confident enough becomes one
        annotation. Line-shaped annotations from
        ``_detect_geometric_patterns`` are appended afterwards.

        Args:
            image: Page image in BGR channel order
            page_number: Page number stored on each result

        Returns:
            Annotations found on this page
        """
        annotations = []
        # Convert to HSV for better color detection
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

        for color_info in self.color_ranges.values():
            annotation_type = color_info['type']
            min_area = self.min_areas.get(annotation_type, 20)
            mask = cv2.inRange(hsv, color_info['lower'], color_info['upper'])
            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            for contour in contours:
                area = cv2.contourArea(contour)
                # Check minimum area threshold
                if area < min_area:
                    continue

                # Calculate confidence based on area and shape
                confidence = self._calculate_annotation_confidence(contour, annotation_type)
                if confidence <= self.MIN_CONFIDENCE:
                    continue

                x, y, w, h = cv2.boundingRect(contour)
                # Average colour over the whole bounding box, so background
                # pixels inside the box contribute as well
                roi = image[y:y+h, x:x+w]
                avg_color = np.mean(roi.reshape(-1, 3), axis=0)

                annotations.append(AnnotationInfo(
                    annotation_type=annotation_type,
                    confidence=confidence,
                    area=int(area),
                    color=tuple(map(int, avg_color[::-1])),  # BGR to RGB
                    bounds=(x, y, w, h),
                    page_number=page_number
                ))

        # Detect geometric patterns (strikethrough, underline)
        annotations.extend(self._detect_geometric_patterns(image, page_number))
        return annotations

    def _detect_geometric_patterns(self, image: np.ndarray, page_number: int) -> List["AnnotationInfo"]:
        """
        Detect strikethrough and underline patterns

        Looks for long horizontal runs of dark ink. The page is binarised
        with ink as foreground before the morphological opening, because
        ``cv2.findContours`` treats every non-zero pixel as foreground and
        would otherwise trace the light page background instead of the lines.

        Args:
            image: Page image in BGR channel order
            page_number: Page number stored on each result

        Returns:
            Line-shaped annotations with a fixed confidence of 0.7
        """
        annotations = []
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Inverted Otsu threshold: dark ink becomes 255, light paper 0
        _, ink_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

        # Opening with a 40x1 kernel keeps only horizontal ink runs that are
        # at least 40 pixels wide (potential strikethrough/underline)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        detected_lines = cv2.morphologyEx(ink_mask, cv2.MORPH_OPEN, kernel)

        contours, _ = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            area = cv2.contourArea(contour)
            if area < 30:  # Too small to be a meaningful line
                continue

            x, y, w, h = cv2.boundingRect(contour)
            aspect_ratio = w / h if h > 0 else 0
            if aspect_ratio <= 5:  # Not a long, thin line
                continue

            # Simplified heuristic: the most elongated lines are reported as
            # strikethrough, the rest as underline. Position relative to the
            # text is not examined.
            annotation_type = 'strikethrough' if aspect_ratio > 10 else 'underline'
            annotations.append(AnnotationInfo(
                annotation_type=annotation_type,
                confidence=0.7,
                area=int(area),
                color=(0, 0, 0),  # Default to black
                bounds=(x, y, w, h),
                page_number=page_number
            ))
        return annotations

    @classmethod
    def _map_native_type(cls, annot) -> str:
        """
        Map a PyMuPDF annotation onto one of this detector's type names

        Args:
            annot: Object exposing ``type`` as a ``(number, name, ...)``
                sequence, as PyMuPDF's ``Annot`` does

        Returns:
            The mapped type, or 'markup' for unknown or missing subtypes
        """
        type_info = getattr(annot, 'type', None) or ()
        type_name = type_info[1] if len(type_info) > 1 else 'unknown'
        return cls.NATIVE_TYPE_MAPPING.get(type_name, 'markup')

    def _detect_native_pdf_annotations(self, pdf_path: str) -> List["AnnotationInfo"]:
        """
        Detect native PDF annotations (comments, highlights, etc.)

        Unlike image-based detection, every page of the document is visited.
        Bounds come from the annotation rectangle as stored in the PDF.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Annotations collected before any error occurred (possibly empty)
        """
        annotations = []
        doc = None
        try:
            doc = fitz.open(pdf_path)
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                for annot in page.annots():
                    rect = annot.rect
                    annotations.append(AnnotationInfo(
                        # The subtype lives in annot.type, not in annot.info
                        annotation_type=self._map_native_type(annot),
                        confidence=0.9,  # High confidence for native annotations
                        area=int(rect.width * rect.height),
                        color=(255, 255, 0),  # Default yellow
                        bounds=(int(rect.x0), int(rect.y0), int(rect.width), int(rect.height)),
                        page_number=page_num + 1,
                        text_content=annot.info.get('content', '')
                    ))
        except Exception as e:
            # Best effort: keep whatever was collected before the failure
            logger.error(f"Error detecting native PDF annotations: {e}")
        finally:
            if doc is not None:
                doc.close()
        return annotations

    def _calculate_annotation_confidence(self, contour: np.ndarray, annotation_type: str) -> float:
        """
        Calculate confidence score for an annotation based on shape and size

        The score is the mean of an area term (area / 1000, capped at 1.0)
        and a shape term that depends on the annotation type.

        Args:
            contour: OpenCV contour of the candidate region
            annotation_type: Detector type name such as 'highlight' or 'markup'

        Returns:
            Confidence in the range 0.0 to 1.0
        """
        area = cv2.contourArea(contour)
        _, _, w, h = cv2.boundingRect(contour)
        aspect_ratio = w / h if h > 0 else 0

        # Base confidence on area
        area_confidence = min(area / 1000, 1.0)  # Normalize by expected area

        # Adjust based on annotation type expectations
        shape_confidence = 0.5
        if annotation_type == 'highlight':
            # Highlights are expected to be wider than tall, but not extreme
            shape_confidence = 0.8 if 2 <= aspect_ratio <= 10 else 0.4
        elif annotation_type == 'markup':
            # Markup can be any shape
            shape_confidence = 0.7

        return min((area_confidence + shape_confidence) / 2, 1.0)

    def _generate_detection_summary(self, annotations: List["AnnotationInfo"], pages_processed: int) -> str:
        """
        Generate a summary of the detection process

        Args:
            annotations: All detected annotations
            pages_processed: Number of pages that were analysed as images

        Returns:
            Sentence fragments joined by ". ": the total count, per-type
            counts in first-seen order, and the mean confidence
        """
        if not annotations:
            return f"No annotations detected across {pages_processed} pages"

        # Count by type (a plain dict keeps first-seen order)
        type_counts = {}
        for ann in annotations:
            type_counts[ann.annotation_type] = type_counts.get(ann.annotation_type, 0) + 1

        summary_parts = [f"Detected {len(annotations)} annotations across {pages_processed} pages"]

        type_summary = ", ".join(f"{count} {type_name}" for type_name, count in type_counts.items())
        summary_parts.append(f"Types: {type_summary}")

        # Add confidence summary
        avg_confidence = sum(ann.confidence for ann in annotations) / len(annotations)
        summary_parts.append(f"Average confidence: {avg_confidence:.2f}")

        return ". ".join(summary_parts)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - |
Parameter Details
No constructor parameters: The __init__ method takes no parameters. All configuration is done through predefined instance attributes that are initialized with default values for color ranges and minimum area thresholds.
Return Value
Instantiating the class yields an AnnotationDetector instance. The main method, detect_annotations_in_pdf, returns an Optional[AnnotationResult] containing all detected annotations, the total count, the number of pages processed, and a summary string; it returns None if detection fails. The helper methods return List[AnnotationInfo] for annotations found in a specific context, or List[np.ndarray] for the page-to-image conversion.
Class Interface
Methods
__init__(self)
Purpose: Initializes the AnnotationDetector with predefined color ranges for different annotation types and minimum area thresholds
Returns: None - initializes instance attributes color_ranges and min_areas
async detect_annotations_in_pdf(self, pdf_path: str) -> Optional[AnnotationResult]
Purpose: Main entry point that detects all annotations in a PDF document by combining image-based and native PDF annotation detection
Parameters:
pdf_path: String path to the PDF file to analyze
Returns: AnnotationResult object containing all detected annotations, counts, and summary, or None if detection fails
_pdf_to_images(self, pdf_path: str) -> List[np.ndarray]
Purpose: Converts PDF pages to OpenCV image arrays for visual processing, limited to first 5 pages at 2x resolution
Parameters:
pdf_path: String path to the PDF file to convert
Returns: List of numpy arrays representing each page as an OpenCV image, empty list on failure
_detect_annotations_in_image(self, image: np.ndarray, page_number: int) -> List[AnnotationInfo]
Purpose: Detects annotations in a single image using color-based detection in HSV color space and geometric pattern detection
Parameters:
image: OpenCV image array (numpy ndarray) to analyze
page_number: Integer page number for annotation metadata
Returns: List of AnnotationInfo objects for all annotations detected in the image
_detect_geometric_patterns(self, image: np.ndarray, page_number: int) -> List[AnnotationInfo]
Purpose: Detects strikethrough and underline patterns using morphological operations on horizontal lines
Parameters:
image: OpenCV image array to analyze for geometric patterns
page_number: Integer page number for annotation metadata
Returns: List of AnnotationInfo objects for detected strikethrough and underline annotations
_detect_native_pdf_annotations(self, pdf_path: str) -> List[AnnotationInfo]
Purpose: Extracts native PDF annotations (comments, highlights, etc.) directly from the PDF structure using PyMuPDF
Parameters:
pdf_path: String path to the PDF file to extract annotations from
Returns: List of AnnotationInfo objects for all native PDF annotations found
_calculate_annotation_confidence(self, contour: np.ndarray, annotation_type: str) -> float
Purpose: Calculates a confidence score (0.0-1.0) for an annotation based on its shape, size, and type expectations
Parameters:
contour: OpenCV contour array representing the annotation boundary
annotation_type: String type of annotation ('highlight', 'markup', etc.) to apply type-specific heuristics
Returns: Float confidence score between 0.0 and 1.0
_generate_detection_summary(self, annotations: List[AnnotationInfo], pages_processed: int) -> str
Purpose: Generates a human-readable summary string of the detection results including counts by type and average confidence
Parameters:
annotations: List of all detected AnnotationInfo objects
pages_processed: Integer number of pages that were processed
Returns: String summary of detection results
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| color_ranges | Dict[str, Dict[str, Any]] | Dictionary mapping color names to HSV color range definitions with 'lower' and 'upper' numpy arrays and 'type' classification. Includes ranges for red_markup, red_markup_2, yellow_highlight, green_highlight, and blue_markup | instance |
| min_areas | Dict[str, int] | Dictionary mapping annotation types to minimum pixel area thresholds for detection. Includes thresholds for highlight (100), markup (20), strikethrough (50), underline (30), and insertion (10) | instance |
Dependencies
cv2, numpy, typing, dataclasses, pathlib, PyPDF2, fitz, logging
Required Imports
import cv2
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
import PyPDF2
import fitz
import logging
Usage Example
# Define required dataclasses
from dataclasses import dataclass
from typing import List, Tuple, Optional
@dataclass
class AnnotationInfo:
    """One detected annotation, as produced by AnnotationDetector."""

    # Category name such as 'markup', 'highlight', 'strikethrough' or 'underline'
    annotation_type: str
    # Detection confidence between 0.0 and 1.0
    confidence: float
    # Area of the annotation (contour area or rectangle width * height)
    area: int
    # Colour as an (R, G, B) tuple of ints
    color: Tuple[int, int, int]
    # Bounding box as (x, y, width, height)
    bounds: Tuple[int, int, int, int]
    # 1-based page number the annotation was found on
    page_number: int
    # Text attached to the annotation; empty when there is none
    text_content: str = ''
@dataclass
class AnnotationResult:
    """Outcome of one AnnotationDetector.detect_annotations_in_pdf call."""

    # Every annotation found, image-based results first, then native ones
    annotations: List[AnnotationInfo]
    # Number of entries in ``annotations``
    total_annotations: int
    # Number of pages that were rendered and analysed as images
    pages_processed: int
    # Human-readable summary of counts per type and average confidence
    detection_summary: str
# Set up logging
import logging

# basicConfig installs a handler on the root logger. Without any handler,
# the detector's INFO-level progress messages would never be displayed.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Instantiate the detector
detector = AnnotationDetector()

# Detect annotations in a PDF
import asyncio


async def analyze_pdf():
    """Run detection on 'document.pdf' and print a per-annotation report."""
    result = await detector.detect_annotations_in_pdf('document.pdf')
    if result:
        print(f"Found {result.total_annotations} annotations")
        print(f"Summary: {result.detection_summary}")
        for ann in result.annotations:
            print(f"Page {ann.page_number}: {ann.annotation_type} (confidence: {ann.confidence:.2f})")
    else:
        # detect_annotations_in_pdf returns None when detection fails
        print("Detection failed")


# Run the async function
if __name__ == "__main__":
    asyncio.run(analyze_pdf())
Best Practices
- Always use the async detect_annotations_in_pdf method as the main entry point rather than calling internal methods directly
- Ensure the AnnotationInfo and AnnotationResult dataclasses are properly defined before instantiating the detector
- The detector processes only the first 5 pages of PDFs by default to manage memory and performance - modify _pdf_to_images if you need more pages
- Color ranges are defined in HSV color space and can be customized by modifying the color_ranges dictionary after instantiation
- Minimum area thresholds in min_areas can be adjusted based on your document resolution and annotation size requirements
- The detector combines both visual (image-based) and native PDF annotation detection for comprehensive coverage
- Handle None return values from detect_annotations_in_pdf to gracefully manage detection failures
- The confidence threshold of 0.3 for visual annotations can be adjusted in _detect_annotations_in_image for stricter or looser detection
- Native PDF annotations receive a confidence of 0.9 as they are explicitly defined in the PDF structure
- The detector renders PDF pages at 2x zoom (roughly 144 DPI, since PyMuPDF renders at 72 DPI by default) for better detail - adjust the Matrix parameters in _pdf_to_images if needed for performance
- Geometric pattern detection (strikethrough/underline) uses morphological operations and may need tuning for different document types
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class AnnotationInfo 62.9% similar
-
class AnnotationResult 61.5% similar
-
class DocumentProcessor_v3 60.6% similar
-
class EditingWorkflowHandler 59.1% similar
-
class DocumentAnalyzer 58.9% similar