class DocumentConverter
A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.
/tf/active/vicechatdev/pdfconverter.py
15 - 190
moderate
Purpose
DocumentConverter provides a robust solution for batch converting office documents to PDF format. It recursively scans an input directory for supported document types, converts them to PDF using LibreOffice in headless mode, and preserves the original directory structure in the output location. The class supports concurrent conversions for improved performance, handles PDF files by copying them directly, tracks conversion errors with detailed reporting, and manages temporary files during the conversion process. It's designed for scenarios requiring bulk document conversion with error tracking and progress logging.
Source Code
class DocumentConverter:
    """Convert various document formats to PDF using LibreOffice.

    The converter scans ``input_dir`` recursively, converts every supported
    document with ``libreoffice --headless --convert-to pdf`` and writes the
    result below ``output_dir`` using the same relative folder layout.
    Files that are already PDFs are copied instead of converted.

    Failures are recorded in ``error_details`` (keyed by the input path as a
    string); each entry holds ``'file'``, ``'error'`` and, when LibreOffice
    exited with a non-zero status, ``'return_code'``.

    The methods log through a module-level ``logger`` object that must be
    defined in the module containing this class.
    """

    # Supported file extensions
    SUPPORTED_EXTENSIONS = [
        # Word documents
        '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf',
        # Excel documents
        '.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb',
        # PowerPoint documents
        '.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx',
        # Other formats
        '.odt', '.ods', '.odp', '.vsd', '.vsdx',
        # Include PDF to handle PDF files in the source
        '.pdf'
    ]

    # Scratch directory (relative to output_dir) that receives LibreOffice output
    _TEMP_DIR_NAME = "_temp"

    def __init__(self, input_dir, output_dir, max_workers=1):
        """
        Initialize the converter

        Args:
            input_dir: Directory with source documents
            output_dir: Directory to save PDF files
            max_workers: Maximum number of concurrent conversions
        """
        self.input_dir = Path(input_dir).absolute()
        self.output_dir = Path(output_dir).absolute()
        self.max_workers = max_workers
        self.error_details = {}  # Store detailed error information

        # Create output directory if not exists
        os.makedirs(self.output_dir, exist_ok=True)

    def find_documents(self):
        """Find all supported documents in input directory.

        Extensions are matched case-insensitively, only regular files are
        returned, and when ``output_dir`` lies inside ``input_dir`` the
        output tree is skipped so generated PDFs are not picked up as new
        input on the next run.

        Returns:
            Sorted list of Path objects, one per matching file.
        """
        suffixes = tuple(ext.lower() for ext in self.SUPPORTED_EXTENSIONS)
        # Only prune the output tree when it is strictly nested in the input
        # tree; with identical directories (in-place conversion) nothing is
        # pruned.
        prune_output_tree = self.input_dir in self.output_dir.parents

        documents = set()
        for candidate in self.input_dir.rglob('*'):
            if not candidate.name.lower().endswith(suffixes):
                continue
            if not candidate.is_file():
                continue
            if prune_output_tree and self.output_dir in candidate.parents:
                continue
            documents.add(candidate)
        return sorted(documents)

    def get_relative_output_path(self, input_file):
        """Determine the output path that preserves the original directory structure.

        The target directory is created as a side effect. Note that the
        output name is ``<stem>.pdf``, so two inputs in the same folder that
        differ only by extension map to the same output file.

        Args:
            input_file: Path (or string) of a file located below input_dir

        Returns:
            Path of the PDF inside output_dir.

        Raises:
            ValueError: if input_file is not located below input_dir.
        """
        input_file = Path(input_file)
        # Get the relative path from the input_dir
        rel_path = input_file.relative_to(self.input_dir)

        # Calculate the output directory path preserving folder structure
        output_dir = self.output_dir / rel_path.parent

        # Create the directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Return the full output path with .pdf extension
        return output_dir / f"{input_file.stem}.pdf"

    def _record_error(self, input_file, error, return_code=None):
        """Store failure details for input_file in error_details."""
        details = {
            'file': str(input_file),
            'error': error
        }
        if return_code is not None:
            details['return_code'] = return_code
        self.error_details[str(input_file)] = details

    def convert_document(self, input_file):
        """Convert a document to PDF using LibreOffice.

        PDF inputs are copied. Other inputs are converted into a private
        scratch directory below ``output_dir/_temp`` and then moved to their
        final location, so concurrent conversions of files that share a stem
        cannot overwrite each other's intermediate output.

        Args:
            input_file: Path (or string) of the document to convert

        Returns:
            True if the PDF was produced (or copied); False if the
            conversion failed or the output already existed. Failures, but
            not skips, add an entry to error_details.
        """
        import tempfile  # local import: only needed for the scratch directory

        input_file = Path(input_file)
        file_id = str(input_file)
        output_file = self.get_relative_output_path(input_file)

        # Forget the outcome of an earlier attempt for this file so that a
        # successful retry does not leave a stale error behind.
        self.error_details.pop(file_id, None)

        work_dir = None
        try:
            # Skip if already converted
            if output_file.exists():
                logger.info(f"Skipping {input_file.name} - already exists at {output_file}")
                return False

            logger.info(f"Converting {input_file} to {output_file}")

            # Special handling for PDF files - just copy them
            if input_file.suffix.lower() == '.pdf':
                shutil.copy2(input_file, output_file)
                logger.info(f"Copied PDF file {input_file.name} to {output_file}")
                return True

            # Use LibreOffice for actual conversion.
            # Each conversion gets its own scratch directory inside _temp.
            temp_root = self.output_dir / self._TEMP_DIR_NAME
            os.makedirs(temp_root, exist_ok=True)
            work_dir = Path(tempfile.mkdtemp(prefix="job_", dir=temp_root))

            # NOTE(review): concurrent LibreOffice processes may also contend
            # for the shared user profile; if parallel runs report "no
            # output", consider a per-job -env:UserInstallation - confirm
            # against the LibreOffice documentation.
            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                str(work_dir),
                str(input_file)
            ]

            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=120  # 2 minute timeout
            )

            if process.returncode != 0:
                logger.error(f"Error converting {input_file.name}: {process.stderr}")
                self._record_error(input_file, process.stderr, process.returncode)
                return False

            # LibreOffice names its output after the input stem
            temp_output = work_dir / f"{input_file.stem}.pdf"
            if not temp_output.exists():
                error_msg = f"Conversion produced no output for {input_file.name}"
                logger.error(error_msg)
                self._record_error(input_file, error_msg)
                return False

            # Ensure target directory exists, then move the file into place
            os.makedirs(output_file.parent, exist_ok=True)
            shutil.move(str(temp_output), str(output_file))
            logger.info(f"Successfully converted {input_file.name} to {output_file}")
            return True

        except subprocess.TimeoutExpired as e:
            logger.error(f"Timeout converting {input_file.name}")
            self._record_error(input_file, str(e))
            return False
        except Exception as e:
            logger.error(f"Error converting {input_file.name}: {str(e)}")
            self._record_error(input_file, str(e))
            return False
        finally:
            # Remove this conversion's scratch directory in every outcome
            if work_dir is not None:
                shutil.rmtree(work_dir, ignore_errors=True)

    def convert_all(self):
        """Convert all documents in parallel.

        Runs convert_document for every file returned by find_documents on a
        thread pool of max_workers threads, removes the ``_temp`` scratch
        directory, and logs a summary followed by a report of everything in
        error_details.

        Returns:
            Tuple (success_count, failure_count). Files skipped because
            their PDF already existed are counted in neither number.
        """
        documents = self.find_documents()
        logger.info(f"Found {len(documents)} documents to convert")

        try:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                results = list(executor.map(self.convert_document, documents))
        finally:
            # Clean up any temporary directory
            temp_dir = self.output_dir / self._TEMP_DIR_NAME
            if temp_dir.exists():
                shutil.rmtree(temp_dir, ignore_errors=True)

        success_count = results.count(True)
        # A False result is a failure only if an error was recorded for that
        # file; otherwise the file was skipped because its PDF already exists.
        failure_count = sum(
            1
            for document, converted in zip(documents, results)
            if not converted and str(document) in self.error_details
        )
        skipped_count = len(results) - success_count - failure_count

        logger.info(
            f"Conversion complete: {success_count} succeeded, "
            f"{failure_count} failed, {skipped_count} skipped"
        )

        # Print detailed error information
        if self.error_details:
            logger.info(f"\n{'='*80}\nDETAILED ERROR REPORT\n{'='*80}")
            for idx, details in enumerate(self.error_details.values(), 1):
                logger.info(f"\nError #{idx}:")
                logger.info(f"File: {details['file']}")
                logger.info(f"Error: {details['error']}")
                if 'return_code' in details:
                    logger.info(f"Return code: {details['return_code']}")
                logger.info('-' * 50)

        return success_count, failure_count
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
input_dir: Path to the directory containing source documents to convert. Can be a string or Path object. The converter will recursively search this directory for all supported file types. The path is converted to an absolute path internally.
output_dir: Path to the directory where converted PDF files will be saved. Can be a string or Path object. The directory structure from input_dir is preserved in this location. The directory is created automatically if it doesn't exist. The path is converted to an absolute path internally.
max_workers: Maximum number of concurrent document conversions to run in parallel using ThreadPoolExecutor. Default is 1 (sequential processing). Higher values can improve performance but increase system resource usage. Should be tuned based on available CPU cores and memory.
Return Value
Instantiation returns a DocumentConverter object. The convert_all() method returns a tuple (success_count, failure_count) indicating the number of successfully converted documents and the number of failed conversions. The convert_document() method returns a boolean: True if conversion succeeded, False if it failed or was skipped. The find_documents() method returns a list of Path objects representing all supported documents found. The get_relative_output_path() method returns a Path object representing the output file path with preserved directory structure.
Class Interface
Methods
__init__(self, input_dir, output_dir, max_workers=1)
Purpose: Initialize the DocumentConverter with input/output directories and concurrency settings
Parameters:
input_dir: Directory path containing source documents to convert
output_dir: Directory path where converted PDFs will be saved
max_workers: Maximum number of concurrent conversions (default: 1)
Returns: None - initializes the DocumentConverter instance
find_documents(self) -> list
Purpose: Recursively search the input directory for all supported document types
Returns: List of Path objects representing all found documents with supported extensions
get_relative_output_path(self, input_file) -> Path
Purpose: Calculate the output PDF path that preserves the original directory structure relative to input_dir
Parameters:
input_file: Path object of the input document file
Returns: Path object representing the output PDF file location with preserved directory structure
convert_document(self, input_file) -> bool
Purpose: Convert a single document to PDF using LibreOffice, or copy if already PDF
Parameters:
input_file: Path object of the document to convert
Returns: True if conversion succeeded, False if it failed or was skipped (already exists)
convert_all(self) -> tuple
Purpose: Convert all documents found in input_dir to PDF using parallel processing, with error tracking and cleanup
Returns: Tuple of (success_count, failure_count) indicating conversion results
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| SUPPORTED_EXTENSIONS | list | Class variable containing all supported file extensions for conversion, including Word (.doc, .docx), Excel (.xls, .xlsx), PowerPoint (.ppt, .pptx), OpenDocument (.odt, .ods, .odp), Visio (.vsd, .vsdx), and PDF (.pdf) | class |
| input_dir | Path | Absolute path to the directory containing source documents to convert | instance |
| output_dir | Path | Absolute path to the directory where converted PDF files are saved | instance |
| max_workers | int | Maximum number of concurrent document conversions allowed | instance |
| error_details | dict | Dictionary storing detailed error information for failed conversions, keyed by file path string, with values containing 'file', 'error', and optionally 'return_code' keys | instance |
Dependencies
os, subprocess, logging, concurrent.futures, pathlib, shutil
Required Imports
import os
import subprocess
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import shutil
Usage Example
import os
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import subprocess
import shutil
# Setup logging
# NOTE: DocumentConverter logs through a module-level name `logger`, so this
# assignment must live in the module where the class itself is defined.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Basic usage - sequential conversion
# (the constructor creates output_dir if it does not exist yet)
converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=1
)

# Find all documents first (optional)
# Returns a list of Path objects; convert_all() performs the same scan itself.
documents = converter.find_documents()
print(f'Found {len(documents)} documents')

# Convert all documents
# convert_all() returns a (success_count, failure_count) tuple.
success, failures = converter.convert_all()
print(f'Converted {success} documents, {failures} failed')

# Check for errors
# error_details maps each failed file's path string to a dict that holds
# 'file' and 'error' (plus 'return_code' when LibreOffice exited non-zero).
if converter.error_details:
    for file_id, error_info in converter.error_details.items():
        print(f"Error in {error_info['file']}: {error_info['error']}")

# Parallel conversion with 4 workers
# Files whose PDF already exists in output_dir are skipped, so pointing this
# at the same directories as above only processes what is still missing.
parallel_converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=4
)
success, failures = parallel_converter.convert_all()

# Convert a single document
# The file must be located below input_dir; the call returns True when the
# PDF was produced and False when it failed or the PDF already existed.
single_file = Path('/path/to/documents/report.docx')
result = converter.convert_document(single_file)
Best Practices
- Ensure LibreOffice is installed and its `libreoffice` executable is on the PATH before running conversions; the class invokes that command by name, so every non-PDF conversion will fail without it
- Configure a logger before using the class, as it relies on a module-level 'logger' variable for status reporting
- Start with max_workers=1 for testing, then increase based on system resources and performance needs
- Monitor the error_details attribute after conversion to identify and handle failed conversions
- The class creates a temporary directory (_temp) in output_dir during conversion; convert_all() removes it when the batch finishes, whereas calling convert_document() directly leaves the directory in place
- PDF files in the input directory are copied rather than converted, preserving the original file
- Each conversion has a 2-minute timeout to prevent hanging on problematic files
- The class preserves the original directory structure from input_dir in output_dir
- Already converted files (existing PDFs in output_dir) are skipped to avoid redundant work
- Use absolute paths or ensure relative paths are correct, as the class converts them to absolute paths internally
- For large batch conversions, consider the disk space requirements for both output PDFs and temporary files
- The error_details dictionary persists across the object's lifetime, accumulating errors from all conversions
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class PDFConverter 87.2% similar
-
class DocumentExtractor 65.2% similar
-
function convert_document_to_pdf 55.5% similar
-
function main_v12 51.6% similar
-
function test_document_extractor 49.6% similar