class PDFConverter
A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF format using LibreOffice and ReportLab libraries.
/tf/active/vicechatdev/msg_to_eml.py
262 - 410
moderate
Purpose
PDFConverter provides a unified interface for converting multiple document types to PDF. It handles Word documents (.doc, .docx), PowerPoint presentations (.ppt, .pptx), Excel spreadsheets (.xls, .xlsx), and images (.jpg, .png, etc.) by routing them to appropriate conversion methods. The class uses LibreOffice for office documents and ReportLab/PIL for image conversions. It manages temporary directories, handles file path operations, and provides error handling for conversion failures.
Source Code
class PDFConverter:
    """Convert office documents and images to PDF.

    Images are rendered with ReportLab/PIL; every other file type (Word,
    PowerPoint, Excel and anything unrecognised) is handed to a headless
    LibreOffice process. Inputs that are already PDFs are copied as-is.
    """

    # Supported file extensions by type. Only IMAGE_EXTENSIONS affects
    # routing; the office lists document what LibreOffice is expected to
    # handle (unknown extensions are sent to LibreOffice as well).
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf', '.odt']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx', '.odp']
    EXCEL_EXTENSIONS = ['.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb', '.ods']
    IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif']

    # Seconds to wait for a single LibreOffice conversion before giving up.
    LIBREOFFICE_TIMEOUT = 60

    def __init__(self, temp_dir=None):
        """Initialize the converter.

        Args:
            temp_dir: Directory used for intermediate files. When omitted
                (or empty) a fresh directory is created with
                ``tempfile.mkdtemp()``. The directory is created if missing
                and is not removed automatically.
        """
        self.temp_dir = temp_dir or tempfile.mkdtemp()
        os.makedirs(self.temp_dir, exist_ok=True)

    def convert_to_pdf(self, input_path, output_path):
        """Convert a document to PDF format.

        Args:
            input_path: Path of the file to convert.
            output_path: Path where the PDF should be written. Its parent
                directory is created when it does not exist.

        Returns:
            ``output_path`` on success, or ``None`` when the conversion
            backend fails.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        ext = os.path.splitext(input_path)[1].lower()

        # Already a PDF: just copy it. Copying a file onto itself would
        # raise shutil.SameFileError, so treat that case as already done.
        if ext == '.pdf':
            already_in_place = (
                os.path.exists(output_path)
                and os.path.samefile(input_path, output_path)
            )
            if not already_in_place:
                shutil.copy2(input_path, output_path)
            return output_path

        if ext in self.IMAGE_EXTENSIONS:
            return self._convert_image_to_pdf(input_path, output_path)

        # Word, PowerPoint, Excel and unknown types all go through LibreOffice.
        return self._convert_with_libreoffice(input_path, output_path)

    def _convert_with_libreoffice(self, input_path, output_path):
        """Convert a document to PDF using headless LibreOffice.

        LibreOffice names its output after the input file, so the conversion
        runs in a private scratch directory under ``self.temp_dir`` and the
        result is then moved to ``output_path``. This keeps LibreOffice from
        overwriting an unrelated ``<name>.pdf`` next to the destination and
        prevents a stale, pre-existing ``output_path`` from being reported
        as a fresh result.

        Args:
            input_path: Path of the document to convert.
            output_path: Path where the PDF should be written.

        Returns:
            ``output_path`` on success; ``None`` if LibreOffice fails, times
            out, or produces no output (the reason is logged).
        """
        work_dir = None
        try:
            # Absolute paths to avoid directory issues
            abs_input = os.path.abspath(input_path)
            abs_output = os.path.abspath(output_path)

            os.makedirs(self.temp_dir, exist_ok=True)
            work_dir = os.path.abspath(tempfile.mkdtemp(dir=self.temp_dir))

            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', work_dir,
                abs_input,
            ]

            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.LIBREOFFICE_TIMEOUT,
            )

            if process.returncode != 0:
                logger.error("LibreOffice conversion failed: %s", process.stderr)
                return None

            # LibreOffice creates output with original filename but .pdf extension
            stem = os.path.splitext(os.path.basename(abs_input))[0]
            produced = os.path.join(work_dir, stem + '.pdf')

            # A zero exit status does not guarantee that a file was written.
            if not os.path.exists(produced):
                logger.error("Expected output not found: %s", output_path)
                return None

            os.makedirs(os.path.dirname(abs_output), exist_ok=True)
            shutil.move(produced, abs_output)
            return output_path

        except subprocess.TimeoutExpired:
            logger.error("Timeout while converting: %s", input_path)
            return None
        except Exception as e:
            # Best-effort API: callers expect None rather than an exception.
            logger.error("Error in LibreOffice conversion: %s", e)
            return None
        finally:
            if work_dir is not None:
                shutil.rmtree(work_dir, ignore_errors=True)

    def _convert_image_to_pdf(self, input_path, output_path):
        """Convert an image to a single letter-size PDF page.

        The page carries the file name as a heading followed by the image,
        scaled (preserving aspect ratio) to the space left inside the
        1-inch margins.

        Args:
            input_path: Path of the image to convert.
            output_path: Path where the PDF should be written.

        Returns:
            ``output_path`` on success, or ``None`` on any failure (the
            error is logged).
        """
        try:
            from xml.sax.saxutils import escape

            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph
            from reportlab.lib.styles import getSampleStyleSheet
            from PIL import Image

            # Only the dimensions are needed; close the file handle promptly.
            with Image.open(input_path) as img:
                img_width, img_height = img.size

            margin = 72  # 1 inch, in points
            doc = SimpleDocTemplate(
                output_path,
                pagesize=letter,
                rightMargin=margin,
                leftMargin=margin,
                topMargin=margin,
                bottomMargin=margin,
            )

            # Paragraph text is parsed as markup, so escape &, < and > that
            # may legitimately appear in a file name.
            title_style = getSampleStyleSheet()['Heading2']
            title = Paragraph(escape(os.path.basename(input_path)), title_style)

            # SimpleDocTemplate's frame pads its content by 6pt on every side
            # (ReportLab's default), so the usable area is smaller than the
            # page minus margins.
            frame_padding = 6
            page_width, page_height = letter
            avail_width = page_width - 2 * margin - 2 * frame_padding
            frame_height = page_height - 2 * margin - 2 * frame_padding

            # Reserve room for the heading so heading and image share the
            # page; an image sized to the full frame would not fit.
            _, title_height = title.wrap(avail_width, frame_height)
            avail_height = frame_height - (
                title_height + title_style.spaceBefore + title_style.spaceAfter
            )

            # Scale image to fit available space
            ratio = min(avail_width / img_width, avail_height / img_height)
            picture = RLImage(
                input_path,
                width=img_width * ratio,
                height=img_height * ratio,
            )

            doc.build([title, picture])
            return output_path

        except Exception as e:
            # Best-effort API: callers expect None rather than an exception.
            logger.error("Error converting image to PDF: %s", e)
            return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional string path to a temporary directory for intermediate file operations. If not provided, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. Used for storing temporary files during conversion processes.
Return Value
Instantiation returns a PDFConverter object. The convert_to_pdf method returns the output_path string if conversion succeeds, or None if conversion fails; it raises FileNotFoundError if the input file does not exist. Private methods (_convert_with_libreoffice and _convert_image_to_pdf) also return the output_path string on success or None on failure.
Class Interface
Methods
__init__(self, temp_dir=None) -> None
Purpose: Initialize the PDF converter with an optional temporary directory for intermediate file operations
Parameters:
temp_dir: Optional string path to temporary directory. If None, creates a new temp directory using tempfile.mkdtemp()
Returns: None - initializes the PDFConverter instance
convert_to_pdf(self, input_path, output_path) -> str | None
Purpose: Main public method to convert any supported document format to PDF, routing to appropriate conversion method based on file extension
Parameters:
input_path: String path to the input file to be converted. Must exist or FileNotFoundError is raised
output_path: String path where the output PDF should be saved. Directory is created if it doesn't exist
Returns: String path to the output PDF file on success, or None if conversion fails
_convert_with_libreoffice(self, input_path, output_path) -> str | None
Purpose: Private method to convert office documents (Word, Excel, PowerPoint) to PDF using LibreOffice headless mode
Parameters:
input_path: String path to the input office document
output_path: String path where the output PDF should be saved
Returns: String path to the output PDF file on success, or None if conversion fails or times out (60 second timeout)
_convert_image_to_pdf(self, input_path, output_path) -> str | None
Purpose: Private method to convert image files to PDF using ReportLab and PIL, scaling the image to fit letter-sized pages with margins
Parameters:
input_path: String path to the input image file
output_path: String path where the output PDF should be saved
Returns: String path to the output PDF file on success, or None if conversion fails
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| WORD_EXTENSIONS | list[str] | Class variable containing supported Word document file extensions: .doc, .docx, .docm, .dot, .dotx, .dotm, .rtf, .odt | class |
| PPT_EXTENSIONS | list[str] | Class variable containing supported PowerPoint presentation file extensions: .ppt, .pptx, .pptm, .pot, .potx, .potm, .pps, .ppsx, .odp | class |
| EXCEL_EXTENSIONS | list[str] | Class variable containing supported Excel spreadsheet file extensions: .xls, .xlsx, .xlsm, .xlt, .xltx, .xltm, .xlsb, .ods | class |
| IMAGE_EXTENSIONS | list[str] | Class variable containing supported image file extensions: .jpg, .jpeg, .png, .gif, .bmp, .tiff, .tif | class |
| temp_dir | str | Instance variable storing the path to the temporary directory used for intermediate file operations during conversion | instance |
Dependencies
os, tempfile, shutil, subprocess, reportlab, PIL, logging
Required Imports
import os
import tempfile
import shutil
import subprocess
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from reportlab.lib.pagesizes import letter
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.lib.styles import getSampleStyleSheet
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.lib.units import inch
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from PIL import Image
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
Usage Example
import os
import tempfile
import logging

# PDFConverter must be in scope before running this example, e.g.:
# from msg_to_eml import PDFConverter  # adjust to your project layout

# Set up the module-level logger the class logs to (needed when the class
# is defined in this same module) and make its messages visible
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Create converter with default temp directory
converter = PDFConverter()

# Or specify a custom temp directory
converter = PDFConverter(temp_dir='/tmp/my_conversions')

# Convert a Word document to PDF
input_file = 'document.docx'
output_file = 'output.pdf'
result = converter.convert_to_pdf(input_file, output_file)

# convert_to_pdf returns None on failure, so always check the result
if result:
    print(f'Successfully converted to {result}')
else:
    print('Conversion failed')

# Convert an image to PDF
image_result = converter.convert_to_pdf('photo.jpg', 'photo.pdf')

# Convert a PowerPoint presentation
ppt_result = converter.convert_to_pdf('presentation.pptx', 'presentation.pdf')

# If input is already PDF, it will be copied
pdf_result = converter.convert_to_pdf('existing.pdf', 'copy.pdf')
Best Practices
- Always check the return value of convert_to_pdf() - it returns None on failure
- Ensure LibreOffice is installed and accessible in system PATH before using for office document conversions
- The class creates output directories automatically if they don't exist
- Conversion has a 60-second timeout for LibreOffice operations to prevent hanging
- The temp_dir is created during initialization but not automatically cleaned up - consider manual cleanup if needed
- For image conversions, ensure PIL/Pillow and ReportLab are installed
- The class supports a wide range of file formats but uses LibreOffice as fallback for unknown types
- If input file is already a PDF, it is simply copied to the output location
- Error messages are logged using the module-level logger object, so ensure logging is configured
- File paths are converted to absolute paths internally to avoid directory-related issues
- The class is stateless except for temp_dir, so a single instance can be reused for multiple conversions
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentConverter 87.2% similar
-
class DocumentExtractor 62.2% similar
-
function html_to_pdf 55.6% similar
-
function convert_document_to_pdf 51.9% similar
-
function eml_to_pdf 50.2% similar