function eml_to_pdf
Converts an .eml email file to PDF format, including the email body and all attachments merged into a single PDF document.
/tf/active/vicechatdev/msg_to_eml.py
1150 - 1291
complex
Purpose
This function provides comprehensive email-to-PDF conversion by parsing .eml files, rendering the email content as HTML, converting it to PDF, extracting and converting attachments (images, documents, etc.) to PDF format, and merging everything into a single output PDF file. It handles multiple conversion methods with fallbacks, supports various attachment types, and includes robust error handling for production use.
Source Code
def eml_to_pdf(eml_path, pdf_path):
    """Convert an .eml file to PDF format with attachments included.

    The message is parsed, rendered to HTML with
    ``generate_simple_html_from_eml`` and converted to a body PDF: first with
    the ``wkhtmltopdf`` binary when it is found on PATH, otherwise (or when
    that fails) with ``html_to_pdf``. For multipart messages, every part whose
    content disposition is ``attachment`` is written to a temporary file and
    converted to PDF via ``PDFConverter``; PDFs are copied as-is. The body PDF
    and all attachment PDFs are then merged into ``pdf_path``.

    Args:
        eml_path: Path of the .eml file to convert.
        pdf_path: Path the resulting PDF is written to.

    Returns:
        True when the PDF was written to ``pdf_path``. False when the input
        file is missing, the body cannot be converted, merging fails, or an
        unexpected exception occurs (the error is logged, never raised).
        Attachments that are empty, smaller than 10 bytes, or cannot be
        converted are logged and skipped without failing the conversion.
    """
    try:
        if not os.path.exists(eml_path):
            logger.error(f"Input EML file not found: {eml_path}")
            return False

        # All intermediate files live in a temp dir that is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            with open(eml_path, 'rb') as f:
                parsed_email = email.message_from_binary_file(f)

            # Render the message to a simple HTML document on disk.
            email_html = generate_simple_html_from_eml(parsed_email)
            html_path = os.path.join(temp_dir, "email.html")
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(email_html)

            # Convert the email body to PDF first.
            email_body_pdf = os.path.join(temp_dir, "email_body.pdf")
            body_pdf_created = False
            if shutil.which('wkhtmltopdf'):
                # NOTE(review): the HTML is derived from email content, which
                # may be untrusted; confirm how wkhtmltopdf is configured to
                # treat remote/local resources referenced by that HTML.
                try:
                    cmd = [
                        'wkhtmltopdf',
                        '--encoding', 'utf-8',
                        '--quiet',
                        html_path,
                        email_body_pdf,
                    ]
                    subprocess.run(cmd, check=True, timeout=30)
                    body_pdf_created = os.path.exists(email_body_pdf)
                except Exception as e:
                    logger.warning(f"wkhtmltopdf conversion failed: {str(e)}")

            # Fall back to html_to_pdf when wkhtmltopdf is absent or failed.
            if not body_pdf_created:
                body_pdf_created = html_to_pdf(email_html, email_body_pdf)
            if not body_pdf_created:
                logger.error(f"Failed to convert email body to PDF for {eml_path}")
                return False

            # PDFs to merge, in output order; the body always comes first.
            pdf_files_to_merge = [email_body_pdf]
            pdf_converter = PDFConverter(temp_dir)
            attachment_counter = 0

            if parsed_email.is_multipart():
                for i, part in enumerate(parsed_email.walk()):
                    if part.get_content_disposition() != 'attachment':
                        continue

                    # Bound before the try block so the except handler below
                    # can always name the attachment it is reporting on.
                    filename = f'attachment_{i+1}'
                    try:
                        # Keep only characters that are safe in a file name
                        # (this also strips path separators). Fall back to
                        # the generated name when nothing usable remains.
                        raw_name = part.get_filename() or ''
                        cleaned = ''.join(
                            c for c in raw_name if c.isalnum() or c in '._- '
                        )
                        if cleaned.strip():
                            filename = cleaned
                        logger.info(f"Processing EML attachment: {filename}")

                        attachment_data = part.get_payload(decode=True)
                        if not attachment_data:
                            logger.warning(f"Skipping empty attachment: {filename}")
                            continue
                        # Skip very small payloads (likely empty).
                        if len(attachment_data) < 10:
                            logger.warning(f"Skipping tiny attachment: {filename}")
                            continue

                        # Store the raw attachment under a unique, prefixed
                        # name. Using the attachment's own name directly
                        # could overwrite email.html, email_body.pdf or an
                        # earlier attachment_N.pdf, and names such as "." or
                        # ".." are not valid file targets. The original
                        # extension is preserved at the end of the name.
                        attachment_path = os.path.join(
                            temp_dir, f"src_{i+1}_{filename}"
                        )
                        with open(attachment_path, 'wb') as f:
                            f.write(attachment_data)

                        attachment_counter += 1
                        attachment_pdf = os.path.join(
                            temp_dir, f"attachment_{attachment_counter}.pdf"
                        )

                        # Already a PDF: use it as is.
                        if filename.lower().endswith('.pdf'):
                            shutil.copy2(attachment_path, attachment_pdf)
                            pdf_files_to_merge.append(attachment_pdf)
                            continue

                        conversion_result = pdf_converter.convert_to_pdf(
                            attachment_path, attachment_pdf
                        )
                        if conversion_result and os.path.exists(attachment_pdf):
                            pdf_files_to_merge.append(attachment_pdf)
                            logger.info(f"Successfully converted attachment: {filename}")
                            continue

                        logger.warning(f"Could not convert attachment to PDF: {filename}")
                        # For images that failed normal conversion, try the
                        # converter's direct image-to-PDF method.
                        ext = os.path.splitext(filename.lower())[1]
                        if ext not in pdf_converter.IMAGE_EXTENSIONS:
                            continue
                        try:
                            pdf_converter._convert_image_to_pdf(
                                attachment_path, attachment_pdf
                            )
                            if os.path.exists(attachment_pdf):
                                pdf_files_to_merge.append(attachment_pdf)
                                logger.info(f"Converted image using direct method: {filename}")
                        except Exception as e:
                            logger.error(f"Failed direct image conversion: {str(e)}")
                    except Exception as e:
                        # One bad attachment must not abort the whole email.
                        logger.error(f"Error processing attachment {filename}: {str(e)}")

            # Merge when attachments were converted; otherwise the body PDF
            # alone is the result (the list always holds at least the body).
            if len(pdf_files_to_merge) > 1:
                logger.info(f"Merging {len(pdf_files_to_merge)} PDFs")
                merge_result = merge_pdfs(pdf_files_to_merge, pdf_path)
                if merge_result and os.path.exists(pdf_path):
                    logger.info(f"Successfully merged PDFs to: {pdf_path}")
                    return True
                logger.error("Failed to merge PDFs")
                return False

            shutil.copy2(email_body_pdf, pdf_path)
            logger.info(f"Created PDF without attachments: {pdf_path}")
            return True
    except Exception as e:
        logger.error(f"Error converting {eml_path} to PDF: {str(e)}")
        logger.error(traceback.format_exc())
        return False
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| eml_path | - | - | positional_or_keyword |
| pdf_path | - | - | positional_or_keyword |
Parameter Details
eml_path: String path to the input .eml email file to be converted. Must be a valid file path pointing to an existing .eml file. The function will validate existence before processing.
pdf_path: String path where the output PDF file should be saved. This is the destination path for the merged PDF containing the email body and all converted attachments. The parent directory must already exist and be writable; the function does not create it.
Return Value
Returns a boolean value: True if the conversion was successful and the PDF was created at pdf_path, False if any error occurred during parsing, conversion, or merging. Errors are logged via the logger object.
Dependencies
extract_msg, email, os, mimetypes, logging, traceback, tempfile, sys, base64, shutil, subprocess, pathlib, datetime, argparse, html, re, reportlab, time, PIL, fitz, PyPDF2
Required Imports
import os
import email
import tempfile
import shutil
import subprocess
import traceback
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from reportlab.lib.pagesizes import letter
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from reportlab.lib.styles import getSampleStyleSheet
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from PyPDF2 import PdfMerger
Condition: Required for merging multiple PDFs (email body + attachments)
Required (conditional)
from PIL import Image
Condition: Required for image attachment processing and conversion
Required (conditional)
import fitz
Condition: Required for PDF manipulation (PyMuPDF library)
Required (conditional)
Usage Example
import logging
import os
# Configure root logging at INFO level, then obtain this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Define required helper functions (simplified examples)
def generate_simple_html_from_eml(parsed_email):
    """Placeholder: return a fixed HTML document for any parsed email.

    The argument is ignored in this simplified example; a real
    implementation is still needed.
    """
    placeholder_html = '<html><body>Email content</body></html>'
    return placeholder_html
def html_to_pdf(html_content, output_path):
    """Placeholder: report success without producing any output.

    Both arguments are ignored and nothing is written to ``output_path``
    in this simplified example; a real implementation is still needed.
    """
    succeeded = True
    return succeeded
def merge_pdfs(pdf_list, output_path):
    """Concatenate the given PDF files, in order, into ``output_path``.

    Args:
        pdf_list: Iterable of PDF inputs passed one by one to
            ``PdfMerger.append``.
        output_path: Destination passed to ``PdfMerger.write``.

    Returns:
        True when the merged file was written. Errors raised by PyPDF2
        propagate to the caller.
    """
    from PyPDF2 import PdfMerger

    merger = PdfMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)
        merger.write(output_path)
    finally:
        # Close the merger even when append/write raises, so it is not
        # left open on failure.
        merger.close()
    return True
class PDFConverter:
    """Placeholder converter used by the usage example.

    Only the attributes and methods that ``eml_to_pdf`` touches are
    provided; the conversion methods still need a real implementation.
    """

    # File extensions treated as images by the direct image-to-PDF fallback
    IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']

    def __init__(self, temp_dir):
        """Remember the working directory handed in by the caller."""
        self.temp_dir = temp_dir

    def convert_to_pdf(self, input_path, output_path):
        """Placeholder: report success without writing ``output_path``."""
        return True

    def _convert_image_to_pdf(self, input_path, output_path):
        """Placeholder: do nothing."""
        return None
# Run the conversion on example paths and report the outcome
eml_file = '/path/to/email.eml'
output_pdf = '/path/to/output.pdf'
success = eml_to_pdf(eml_file, output_pdf)
outcome = (
    f'Successfully converted {eml_file} to {output_pdf}'
    if success
    else 'Conversion failed'
)
print(outcome)
Best Practices
- Ensure all helper functions (generate_simple_html_from_eml, html_to_pdf, merge_pdfs) and PDFConverter class are properly implemented before using this function
- Install wkhtmltopdf system binary for better HTML rendering quality, though the function will fall back to ReportLab if unavailable
- Configure logging appropriately to capture detailed error messages and conversion progress
- Ensure sufficient disk space is available as the function creates temporary files during processing
- The function automatically cleans up temporary files using context managers, but ensure the process isn't interrupted during execution
- Handle the boolean return value to implement proper error handling in calling code
- For large emails with many attachments, expect longer processing times
- The function sanitizes attachment filenames to prevent filesystem issues
- Very small attachments (< 10 bytes) are automatically skipped as likely empty or corrupted
- Image attachments have a fallback conversion method if primary conversion fails
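- Each call works in its own temporary directory, so concurrent calls do not share intermediate files; overall thread safety still depends on the helper functions and on each call using a distinct output path. Consider rate limiting for bulk conversions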
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function html_to_pdf 70.9% similar
-
function msg_to_pdf_improved 68.1% similar
-
function msg_to_pdf 67.1% similar
-
function msg_to_eml 63.8% similar
-
function generate_simple_html_from_eml 62.5% similar