function msg_to_pdf
Converts a Microsoft Outlook .msg email file to a single PDF document, including the email body and all attachments merged together.
/tf/active/vicechatdev/msg_to_eml.py
874 - 992
complex
Purpose
This function provides comprehensive email-to-PDF conversion for archival or documentation purposes. It extracts the email content from a .msg file, converts it to HTML and then PDF, processes all attachments (converting them to PDF when possible), and merges everything into a single consolidated PDF file. This is useful for creating permanent records of email communications with all associated files.
Source Code
def _safe_attachment_filename(filename, position):
    """Return a name under which an attachment can safely be stored in the temp dir.

    Args:
        filename: Attachment name as declared in the email (untrusted input).
        position: 1-based position of the attachment in the message.

    Returns:
        A single path component (no directory parts) prefixed with
        ``src_<position>_``; the original extension is preserved.
    """
    # Normalise both separator styles and keep only the last component so that
    # names such as "../../x" or "C:\\x" cannot escape the temporary directory.
    leaf = os.path.basename(str(filename).replace('\\', '/')).replace('\x00', '')
    if leaf in ('', '.', '..'):
        leaf = f'attachment_{position}'
    # The prefix keeps the stored copy from clobbering the working files
    # (email.html, email_body.pdf, attachment_<n>.pdf) and keeps two
    # attachments with the same name apart.
    return f'src_{position}_{leaf}'


def _attachment_to_pdf(attachment, position, temp_dir, pdf_converter):
    """Store one attachment in ``temp_dir`` and try to produce a PDF from it.

    Args:
        attachment: Attachment object taken from ``msg.attachments``.
        position: 1-based position of the attachment in the message.
        temp_dir: Directory in which intermediate files are created.
        pdf_converter: Converter exposing ``convert_to_pdf``,
            ``IMAGE_EXTENSIONS`` and ``_convert_image_to_pdf``.

    Returns:
        Path of the resulting PDF, or ``None`` when the attachment was skipped
        (empty / tiny), could not be converted, or raised while processing.
        Failures are logged and never propagate, so one bad attachment does
        not abort the whole conversion.
    """
    # Bound before the try so the error handler can always name the attachment.
    filename = f'attachment_{position}'
    try:
        if getattr(attachment, 'longFilename', None):
            filename = attachment.longFilename
        elif getattr(attachment, 'shortFilename', None):
            filename = attachment.shortFilename

        logger.info(f"Processing attachment: {filename}")

        # Skip if no data
        data = attachment.data
        if not data:
            logger.warning(f"Skipping empty attachment: {filename}")
            return None

        # Save attachment to a temp file under a sanitised, collision-free name
        attachment_path = os.path.join(
            temp_dir, _safe_attachment_filename(filename, position)
        )
        with open(attachment_path, 'wb') as f:
            f.write(data)

        # Skip conversion for very small files (likely empty)
        if os.path.getsize(attachment_path) < 10:
            logger.warning(f"Skipping tiny attachment: {filename}")
            return None

        attachment_pdf = os.path.join(temp_dir, f"attachment_{position}.pdf")
        extension = os.path.splitext(attachment_path)[1].lower()

        # If already PDF, just use it as is
        if extension == '.pdf':
            shutil.copy2(attachment_path, attachment_pdf)
            return attachment_pdf

        # Try to convert to PDF
        conversion_result = pdf_converter.convert_to_pdf(attachment_path, attachment_pdf)
        if conversion_result and os.path.exists(attachment_pdf):
            logger.info(f"Successfully converted attachment: {filename}")
            return attachment_pdf

        logger.warning(f"Could not convert attachment to PDF: {filename}")

        # For images that failed normal conversion, try direct image-to-pdf
        if extension in pdf_converter.IMAGE_EXTENSIONS:
            try:
                pdf_converter._convert_image_to_pdf(attachment_path, attachment_pdf)
                if os.path.exists(attachment_pdf):
                    logger.info(f"Converted image using direct method: {filename}")
                    return attachment_pdf
            except Exception as e:
                logger.error(f"Failed direct image conversion: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Error processing attachment {filename}: {str(e)}")
        return None


def msg_to_pdf(msg_path, pdf_path):
    """Convert a .msg file to PDF format with all attachments included.

    The email body is rendered to HTML and converted to PDF; every attachment
    that can be converted is appended, and the result is written to
    ``pdf_path`` as a single document.

    Args:
        msg_path: Path to the input Outlook ``.msg`` file.
        pdf_path: Path of the PDF file to create.

    Returns:
        ``True`` when the output PDF was written, ``False`` otherwise. All
        errors are logged and swallowed; the function does not raise.
    """
    try:
        # Check if input file exists
        if not os.path.exists(msg_path):
            logger.error(f"Input file not found: {msg_path}")
            return False

        # All intermediate files live in a directory that is removed on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            msg = extract_msg.Message(msg_path)
            try:
                # Generate HTML from the message for the email body PDF
                html_content = generate_html_from_msg(msg)
                html_path = os.path.join(temp_dir, "email.html")
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)

                # Convert the HTML to PDF for the email body
                pdf_converter = PDFConverter(temp_dir)
                email_body_pdf = os.path.join(temp_dir, "email_body.pdf")
                result = pdf_converter.convert_to_pdf(html_path, email_body_pdf)
                if not result or not os.path.exists(email_body_pdf):
                    logger.error(f"Failed to convert email body HTML to PDF for {msg_path}")
                    return False

                # PDFs to merge, in order, starting with the email body
                pdf_files_to_merge = [email_body_pdf]

                # Process attachments (if any)
                attachments = msg.attachments
                if attachments:
                    logger.info(f"Processing {len(attachments)} attachments")
                    for position, attachment in enumerate(attachments, start=1):
                        attachment_pdf = _attachment_to_pdf(
                            attachment, position, temp_dir, pdf_converter
                        )
                        if attachment_pdf:
                            pdf_files_to_merge.append(attachment_pdf)
            finally:
                # Release the handle on the .msg file; a failure to close must
                # not turn an otherwise successful conversion into an error.
                try:
                    msg.close()
                except Exception as e:
                    logger.warning(f"Could not close {msg_path}: {str(e)}")

            # The list always holds at least the email body PDF at this point.
            if len(pdf_files_to_merge) > 1:
                logger.info(f"Merging {len(pdf_files_to_merge)} PDFs")
                merge_result = merge_pdfs(pdf_files_to_merge, pdf_path)
                if merge_result and os.path.exists(pdf_path):
                    logger.info(f"Successfully merged PDFs to: {pdf_path}")
                    return True
                logger.error("Failed to merge PDFs")
                return False

            # Just copy the single PDF if only email body was converted
            shutil.copy2(pdf_files_to_merge[0], pdf_path)
            logger.info(f"Created PDF without attachments: {pdf_path}")
            return True
    except Exception as e:
        logger.error(f"Error converting {msg_path} to PDF: {str(e)}")
        logger.error(traceback.format_exc())
        return False
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| msg_path | - | - | positional_or_keyword |
| pdf_path | - | - | positional_or_keyword |
Parameter Details
msg_path: String path to the input .msg file (Microsoft Outlook message format). Must be a valid file path that exists on the filesystem. The file should be a properly formatted .msg file containing email data.
pdf_path: String path where the output PDF file should be saved. This is the destination path for the merged PDF containing the email body and all converted attachments. The destination directory must exist and be writable.
Return Value
Returns a boolean value: True if the conversion and PDF creation were successful, False if any error occurred during the process (file not found, conversion failure, merge failure, etc.). The function logs detailed error messages for debugging.
Dependencies
extract_msg, os, tempfile, shutil, traceback, logging, PyPDF2
Required Imports
import extract_msg
import os
import tempfile
import shutil
import traceback
import logging
Usage Example
import logging
import os

# Configure logging up front so diagnostics emitted during conversion are shown
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Source .msg file and destination PDF
msg_file = '/path/to/email.msg'
output_pdf = '/path/to/output.pdf'

# Run the conversion; the boolean result tells whether the PDF was written
success = msg_to_pdf(msg_file, output_pdf)

if not success:
    print('Conversion failed. Check logs for details.')
else:
    print(f'Successfully created PDF: {output_pdf}')
    print(f'File size: {os.path.getsize(output_pdf)} bytes')
Best Practices
- Ensure the logger is properly configured before calling this function to capture detailed error messages
- Verify that the input .msg file exists and is readable before calling the function
- Ensure the output directory has sufficient write permissions and disk space
- The function uses a temporary directory that is automatically cleaned up, so no manual cleanup is needed
- Handle the boolean return value to determine if the conversion was successful
- Be aware that not all attachment types can be converted to PDF - the function will skip unconvertible attachments with warnings
- For large emails with many attachments, the conversion process may take significant time
- The function requires helper functions (generate_html_from_msg, PDFConverter class, merge_pdfs) to be available in the same module
- Very small attachments (< 10 bytes) are automatically skipped as they are likely empty or corrupted
- Image attachments have a fallback conversion method if the primary conversion fails
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function msg_to_pdf_improved 86.3% similar
- function msg_to_eml 81.9% similar
- function msg_to_eml_alternative 75.5% similar
- function eml_to_pdf 67.1% similar
- function generate_html_from_msg 65.6% similar