class PowerPointProcessor
A class that processes PowerPoint (.pptx) presentations to extract text content and tables, converting tables to markdown format and organizing content by slides.
/tf/active/vicechatdev/leexi/enhanced_meeting_minutes_generator.py
63 - 211
moderate
Purpose
PowerPointProcessor is designed to parse PowerPoint presentations and extract structured content including text from shapes and tables from slides. It separates text and table content into distinct chunks, each associated with slide metadata (slide number and title). Tables are converted to markdown format for easy consumption. The class handles file validation, error recovery, and provides detailed logging throughout the extraction process. It's particularly useful for document processing pipelines, content indexing systems, or any application that needs to extract and structure PowerPoint content programmatically.
Source Code
class PowerPointProcessor:
    """Process PowerPoint presentations to extract text and table content.

    Text and tables are returned as separate chunk lists, each chunk labelled
    with the slide number and slide title it came from. Tables are rendered
    as markdown.

    The class relies on names provided by the enclosing module:
    ``PPTX_AVAILABLE`` (bool flag), ``logger`` and, when available, ``pptx``.
    """

    # Placeholder type value the title lookup matches (the title placeholder).
    _TITLE_PLACEHOLDER_TYPE = 1

    def __init__(self, temp_dir=None):
        """Initialize the PowerPoint processor.

        Args:
            temp_dir: Optional directory for temporary files. When falsy, a
                fresh directory is created with ``tempfile.mkdtemp()``. The
                directory is created if missing and is not removed by this
                class.
        """
        if not PPTX_AVAILABLE:
            logger.warning("python-pptx library not available. PowerPoint processing will be limited.")
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        os.makedirs(self.temp_dir, exist_ok=True)

    def _is_valid_file(self, file_path):
        """Check if a file appears to be valid and processable.

        Args:
            file_path: Path of the file to check.

        Returns:
            True when the file exists, is non-empty and its first bytes can
            be read; False otherwise (including on any error, which is
            logged).
        """
        try:
            path = Path(file_path)
            if not path.exists() or path.stat().st_size == 0:
                logger.warning(f"File doesn't exist or is empty: {file_path}")
                return False
            # Reading a few bytes proves the file can actually be opened.
            with open(file_path, 'rb') as f:
                header = f.read(16)
            if not header:
                logger.warning(f"File appears unreadable: {file_path}")
                return False
            return True
        except Exception as e:
            logger.error(f"Error checking file validity: {file_path} - {str(e)}")
            return False

    def _table_to_markdown(self, table_data):
        """Convert a 2D array to a markdown table.

        Args:
            table_data: Sequence of rows; the first row is used as the
                header. Cells are converted with ``str()``.

        Returns:
            The markdown table, one line per row, each terminated by a
            newline. ``"| |"`` when there is no data or the header row is
            empty.
        """
        if not table_data or not table_data[0]:
            return "| |"

        # Every row is padded to the widest row so the column count stays
        # consistent and the table renders even when rows are ragged.
        width = max(len(row) for row in table_data)

        def render(row):
            # A literal pipe inside a cell would otherwise be read as a
            # column separator, so it is backslash-escaped.
            cells = [str(cell).replace("|", "\\|") for cell in row]
            cells.extend([""] * (width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        lines = [render(table_data[0]), "| " + " | ".join(["---"] * width) + " |"]
        lines.extend(render(row) for row in table_data[1:])
        return "\n".join(lines) + "\n"

    def _process_powerpoint_table(self, table):
        """Extract table data from a PowerPoint table.

        Args:
            table: Object exposing ``rows``; each row exposes ``cells`` and
                each cell exposes ``text``.

        Returns:
            The table rendered as markdown, or ``"Error processing table"``
            when extraction fails (the error is logged).
        """
        try:
            table_data = []
            for row in table.rows:
                row_data = []
                for cell in row.cells:
                    # Flatten the cell to one line: paragraph breaks and
                    # vertical-tab soft line breaks would split the row.
                    cell_text = (
                        cell.text.strip()
                        .replace('\n', ' ')
                        .replace('\r', ' ')
                        .replace('\v', ' ')
                    )
                    row_data.append(cell_text)
                table_data.append(row_data)
            return self._table_to_markdown(table_data)
        except Exception as e:
            logger.error(f"Error processing PowerPoint table: {str(e)}")
            return "Error processing table"

    def _find_slide_title(self, shapes, slide_number):
        """Locate the title placeholder among ``shapes``.

        Returns:
            ``(title_shape, title_text)``. ``title_shape`` is None when no
            title placeholder with text was found; ``title_text`` then falls
            back to ``"Slide N"``.
        """
        fallback = f"Slide {slide_number}"
        try:
            for shape in shapes:
                # Ask is_placeholder first: reading placeholder_format on a
                # non-placeholder shape raises ValueError, which hasattr()
                # does not swallow and which used to abort the search.
                if not getattr(shape, "is_placeholder", False):
                    continue
                if not (hasattr(shape, "text") and shape.text):
                    continue
                if shape.placeholder_format.type == self._TITLE_PLACEHOLDER_TYPE:
                    return shape, shape.text.strip() or fallback
        except Exception as slide_err:
            logger.warning(f"Error getting slide title: {str(slide_err)}")
        return None, fallback

    def process_powerpoint(self, file_path):
        """Process PowerPoint presentations to extract text and table content.

        Args:
            file_path: Path of the presentation to read.

        Returns:
            A dict with two keys:

            * ``"text_chunks"``: list of ``[parent_text, combined_text, uid]``,
              at most one entry per slide.
            * ``"table_chunks"``: list of
              ``[parent_text, table_markdown, "", uid]``, one per table.

            Both lists are empty when python-pptx is unavailable. None is
            returned when the file is invalid, cannot be opened, or an
            unexpected error occurs.
        """
        logger.info(f"Processing PowerPoint: {file_path}")
        if not PPTX_AVAILABLE:
            logger.error("python-pptx library not available. Cannot process PowerPoint files.")
            return {"text_chunks": [], "table_chunks": []}
        if not self._is_valid_file(file_path):
            logger.error(f"Invalid or corrupted file, skipping: {file_path}")
            return None
        try:
            text_chunks = []
            table_chunks = []
            # Try to open the presentation
            try:
                presentation = pptx.Presentation(file_path)
            except Exception as e:
                logger.error(f"Error opening PowerPoint with python-pptx: {str(e)}")
                return None

            # Process each slide
            for i, slide in enumerate(presentation.slides):
                slide_number = i + 1
                # Materialise the shapes once so the same objects are seen by
                # every pass below; identity checks are only meaningful then.
                shapes = list(slide.shapes)

                title_shape, slide_title = self._find_slide_title(shapes, slide_number)

                # First identify all tables to exclude them from text extraction
                tables = []
                try:
                    tables = [shape for shape in shapes if hasattr(shape, "has_table") and shape.has_table]
                except Exception as table_err:
                    logger.warning(f"Error identifying tables in slide {i+1}: {str(table_err)}")
                table_ids = {id(table) for table in tables}

                # Extract all text from shapes on this slide (excluding tables and titles)
                text_content = []
                for shape in shapes:
                    try:
                        # Skip by identity, not by comparing text, so body
                        # text that repeats the title is still kept.
                        if shape is title_shape or id(shape) in table_ids:
                            continue
                        if hasattr(shape, "text") and shape.text.strip():
                            text_content.append(shape.text.strip())
                    except Exception as shape_err:
                        logger.warning(f"Error processing shape in slide {i+1}: {str(shape_err)}")

                # Combine all text from this slide into a single chunk
                combined_text = "\n".join(text_content)
                if combined_text.strip():  # Only add if there's meaningful text
                    newuid = str(uuid4())
                    parent_text = f"Slide {i+1}: {slide_title}"
                    text_chunks.append([parent_text, combined_text, newuid])

                # Process tables separately
                for shape in tables:
                    try:
                        if hasattr(shape, "table"):
                            table_markdown = self._process_powerpoint_table(shape.table)
                            newuid = str(uuid4())
                            parent_text = f"Slide {i+1}: {slide_title} - Table"
                            table_chunks.append([parent_text, table_markdown, "", newuid])
                    except Exception as table_process_err:
                        logger.warning(f"Error processing table in slide {i+1}: {str(table_process_err)}")

            return {"text_chunks": text_chunks, "table_chunks": table_chunks}
        except Exception as e:
            logger.error(f"Error processing PowerPoint {file_path}: {str(e)}")
            return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional path to a temporary directory for file operations. If not provided, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. Type: str or Path-like object, Default: None
Return Value
Instantiation returns a PowerPointProcessor object. The main method process_powerpoint() returns a dictionary with keys 'text_chunks' and 'table_chunks'. text_chunks contains lists of [parent_text, content, uuid], where parent_text is 'Slide N: Title', content is the combined text, and uuid is a unique identifier. table_chunks contains lists of [parent_text, markdown_table, empty_string, uuid], where parent_text is 'Slide N: Title - Table'. If the python-pptx library is not available, the dictionary is returned with both lists empty. Returns None if the file is invalid, cannot be opened, or processing fails.
Class Interface
Methods
__init__(self, temp_dir=None)
Purpose: Initialize the PowerPoint processor with an optional temporary directory
Parameters:
temp_dir: Optional path to temporary directory for file operations. If None, creates a new temp directory
Returns: None (constructor)
_is_valid_file(self, file_path) -> bool
Purpose: Check if a file exists, is readable, and appears to be valid for processing
Parameters:
file_path: Path to the file to validate (str or Path-like)
Returns: Boolean indicating whether the file is valid and processable
_table_to_markdown(self, table_data) -> str
Purpose: Convert a 2D array representing table data into markdown table format
Parameters:
table_data: 2D list where first row is headers and subsequent rows are data
Returns: String containing the markdown-formatted table with headers, separators, and data rows
_process_powerpoint_table(self, table) -> str
Purpose: Extract data from a PowerPoint table object and convert it to markdown format
Parameters:
table: A python-pptx Table object from a shape
Returns: Markdown-formatted string representation of the table, or 'Error processing table' on failure
process_powerpoint(self, file_path) -> dict or None
Purpose: Main method to process a PowerPoint file and extract all text and table content organized by slides
Parameters:
file_path: Path to the PowerPoint (.pptx) file to process
Returns: Dictionary with keys 'text_chunks' (list of [parent_text, content, uuid]) and 'table_chunks' (list of [parent_text, markdown, '', uuid]); both lists are empty if python-pptx is not available. Returns None if the file is invalid or processing fails
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| temp_dir | Path | Path object pointing to the temporary directory used for file operations during processing | instance |
Dependencies
python-pptx, pathlib, uuid, tempfile, os, logging
Required Imports
import os
from pathlib import Path
from uuid import uuid4
import tempfile
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
import pptx
Condition: Required for PowerPoint processing functionality. The class checks PPTX_AVAILABLE flag and logs warnings if not available, but will return empty results or None
Required (conditional)
Usage Example
import logging
from pathlib import Path

# Configure logging; the processor reports its progress through `logger`.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Record whether python-pptx can be imported; the processor reads this flag.
try:
    import pptx
except ImportError:
    PPTX_AVAILABLE = False
else:
    PPTX_AVAILABLE = True

# Create the processor and run it on a presentation.
processor = PowerPointProcessor(temp_dir='/tmp/ppt_processing')
result = processor.process_powerpoint('presentation.pptx')

if not result:
    print('Failed to process PowerPoint file')
else:
    # Text chunks: [parent_text, content, uuid]
    for heading, body, chunk_id in result['text_chunks']:
        print(f'{heading}:\n{body}\n')
    # Table chunks: [parent_text, markdown, '', uuid]
    for heading, table_md, _, chunk_id in result['table_chunks']:
        print(f'{heading}:\n{table_md}\n')
Best Practices
- Always check if the result is None before accessing text_chunks or table_chunks, as processing can fail
- Ensure the python-pptx library is installed before instantiation, or handle the PPTX_AVAILABLE flag appropriately
- The temp_dir is created during initialization but not automatically cleaned up; manage cleanup externally if needed
- File validation is performed automatically via _is_valid_file(), but ensure files are accessible and not corrupted
- Each chunk includes a UUID for tracking and deduplication purposes
- Text and tables are processed separately to maintain structure and allow different handling strategies
- The class logs extensively; configure logging appropriately to capture warnings and errors
- Table markdown format uses standard markdown table syntax with headers and separators
- Slide titles are extracted from placeholder shapes with type 1; if not found, defaults to 'Slide N'
- The processor combines all text from a slide (excluding tables and titles) into a single chunk per slide
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentExtractor (58.3% similar)
- function main_v3 (49.4% similar)
- class DocumentProcessor (48.2% similar)
- class DocumentProcessor_v1 (47.7% similar)
- function main_v2 (47.6% similar)