class FileCloudEmailProcessor
A class that processes email files (.msg format) stored in FileCloud by finding, downloading, converting them to EML and PDF formats, and organizing them into mail_archive folders.
/tf/active/vicechatdev/msg_to_eml.py
1300 - 1516
complex
Purpose
This class provides a complete workflow for managing email files in a FileCloud storage system. It connects to FileCloud, searches for .msg files, downloads them, converts them to both EML and PDF formats, uploads the converted files back to FileCloud in an organized structure (mail_archive folders), and moves the original .msg files to the archive. The class handles the entire lifecycle of email file processing including temporary file management, error handling, and cleanup.
Source Code
class FileCloudEmailProcessor:
    """Process email files in FileCloud - find, download, convert and organize.

    For each ``.msg`` file the processor downloads it into a private temporary
    directory, converts it to EML and PDF, uploads the PDF next to the
    original, uploads the EML into a sibling ``mail_archive`` folder and
    finally moves the original ``.msg`` into that folder as well.

    The temporary directory is created in ``__init__`` and exists until
    ``cleanup()`` is called. The instance can be used as a context manager,
    in which case ``cleanup()`` runs automatically on exit.
    """

    def __init__(self, server_url, username, password):
        """Initialize with FileCloud connection details.

        Args:
            server_url: Base URL of the FileCloud server.
            username: Account name used for the login.
            password: Password belonging to ``username``.
        """
        self.server_url = server_url
        self.username = username
        self.password = password
        self.client = None  # FileCloudAPI instance, created by connect()
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Created temporary directory: {self.temp_dir}")

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Remove the temporary directory; never suppresses exceptions."""
        self.cleanup()
        return False

    def connect(self) -> bool:
        """Connect to the FileCloud server and log in.

        Returns:
            True if the login succeeded, False if authentication failed or
            any exception was raised while connecting.
        """
        try:
            self.client = FileCloudAPI(self.server_url, self.username, self.password)
            if self.client.login():
                logger.info(f"Successfully connected to FileCloud at {self.server_url}")
                return True
            logger.error("Failed to authenticate with FileCloud")
            return False
        except Exception as e:
            logger.error(f"Error connecting to FileCloud: {e}")
            return False

    def find_msg_files(self, start_path="/", recursive=True) -> list:
        """Find all .msg files in FileCloud starting from the specified path.

        Files that already live inside a ``mail_archive`` folder are treated
        as processed and left out of the result.

        Args:
            start_path: FileCloud path where the search starts.
            recursive: Currently ignored; accepted for interface compatibility.

        Returns:
            List of paths that still need processing. Empty when the
            connection cannot be established or nothing was found.
        """
        if not self.client and not self.connect():
            return []

        logger.info(f"Searching for .msg files in {start_path}")

        # Use search functionality to find .msg files
        search_results = self.client.search(
            search_string="**/*.msg",   # Search for .msg extension
            search_scope="1",           # Search in filenames
            search_location=start_path,
            limit='10000'               # High limit to find all files
        )

        # Extract paths from the search results
        msg_paths = []
        if search_results.get("success", False):
            if "paths" in search_results:
                msg_paths = search_results["paths"]
            else:
                logger.warning("No direct paths found in search results")

        # Filter out files that are already in a mail_archive folder
        filtered_paths = []
        for path in msg_paths:
            if "/mail_archive/" in path or path.endswith("/mail_archive"):
                logger.debug(f"Skipping already processed file: {path}")
                continue
            filtered_paths.append(path)

        logger.info(f"Found {len(msg_paths)} .msg files, {len(filtered_paths)} need processing (excluding files in mail_archive folders)")
        return filtered_paths

    def process_msg_file(self, file_path) -> bool:
        """Process a single .msg file: download, convert, and organize.

        Steps, in order: ensure ``<parent>/mail_archive`` exists, download the
        file, convert it to EML and PDF, upload the PDF to the original
        folder, upload the EML to ``mail_archive``, then move (or, failing
        that, copy) the original ``.msg`` into ``mail_archive``.

        Args:
            file_path: FileCloud path of the ``.msg`` file.

        Returns:
            True only if every step succeeded; False as soon as one step
            fails or raises. Local temporary files are removed in all cases.
        """
        if not self.client and not self.connect():
            return False

        # Bound up front so the cleanup in ``finally`` can always iterate
        # them, even when an early step raises before they are assigned.
        temp_msg_path = temp_eml_path = temp_pdf_path = None

        try:
            # Get filename and parent directory info
            filename = os.path.basename(file_path)
            parent_dir = os.path.dirname(file_path)

            # mail_archive lives next to the .msg file. Stripping the trailing
            # slash avoids "//mail_archive" for files located in the root.
            mail_archive_path = f"{parent_dir.rstrip('/')}/mail_archive"

            # Ensure the mail_archive folder exists in FileCloud
            self.client.create_folder(path=parent_dir, subpath="mail_archive")
            logger.info(f"Ensured mail_archive folder exists: {mail_archive_path}")

            # Download the .msg file
            logger.info(f"Downloading {file_path}")
            temp_msg_path = os.path.join(self.temp_dir, filename)
            result = self.client.download_file(file_path, temp_msg_path)

            if isinstance(result, dict) and not result.get('success', False):
                logger.error(f"Failed to download {file_path}: {result.get('message', 'Unknown error')}")
                return False

            if not os.path.exists(temp_msg_path):
                logger.error(f"Downloaded file not found at {temp_msg_path}")
                return False

            # Generate output filenames
            file_basename = os.path.splitext(filename)[0]
            temp_eml_path = os.path.join(self.temp_dir, f"{file_basename}.eml")
            temp_pdf_path = os.path.join(self.temp_dir, f"{file_basename}.pdf")

            # Convert to EML
            logger.info(f"Converting {temp_msg_path} to EML format")
            if not msg_to_eml(temp_msg_path, temp_eml_path):
                logger.error(f"Failed to convert {temp_msg_path} to EML format")
                return False

            # Convert to PDF
            logger.info(f"Converting {temp_msg_path} to PDF format")
            if not msg_to_pdf_improved(temp_msg_path, temp_pdf_path):
                logger.error(f"Failed to convert {temp_msg_path} to PDF format")
                return False

            # Upload PDF to the original location
            logger.info(f"Uploading PDF to {parent_dir}")
            pdf_upload_result = self.client.upload_file(
                local_file_path=temp_pdf_path,
                remote_path=parent_dir,
                filename=f"{file_basename}.pdf",
                overwrite=True
            )
            if not pdf_upload_result.get('success', False):
                logger.error(f"Failed to upload PDF: {pdf_upload_result.get('message', 'Unknown error')}")
                return False

            # Upload EML to mail_archive folder
            logger.info(f"Uploading EML to {mail_archive_path}")
            eml_upload_result = self.client.upload_file(
                local_file_path=temp_eml_path,
                remote_path=mail_archive_path,
                filename=f"{file_basename}.eml",
                overwrite=True
            )
            if not eml_upload_result.get('success', False):
                logger.error(f"Failed to upload EML: {eml_upload_result.get('message', 'Unknown error')}")
                # Stop here so the original is not archived without its EML;
                # the file stays in place and is picked up again next run.
                return False

            # Move the original MSG file to the mail_archive folder
            logger.info(f"Copying MSG file to {mail_archive_path}")
            msg_copy_result = self.client.rename_or_move(
                from_path=file_path,
                to_path=f"{mail_archive_path}/{filename}",
                overwrite=True
            )
            if not msg_copy_result.get('success', False):
                logger.error(f"Failed to move MSG: {msg_copy_result.get('message', 'Unknown error')}")
                # Try copying instead
                msg_copy_result = self.client.copy_file(
                    path=parent_dir,
                    name=filename,
                    copy_to=f"{mail_archive_path}/{filename}",
                    overwrite=True
                )
                if not msg_copy_result.get('success', False):
                    logger.error(f"Failed to copy MSG: {msg_copy_result.get('message', 'Unknown error')}")
                    # Neither move nor copy worked: the original was not
                    # archived, so do not report this file as processed.
                    return False

            logger.info(f"Successfully processed {file_path}")
            return True

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            logger.error(traceback.format_exc())
            return False

        finally:
            # Best-effort removal of the local temporary files.
            for temp_file in (temp_msg_path, temp_eml_path, temp_pdf_path):
                if temp_file is None:
                    continue  # step that creates this file was never reached
                try:
                    os.unlink(temp_file)
                except FileNotFoundError:
                    pass
                except OSError as e:
                    logger.warning(f"Could not remove temporary file {temp_file}: {e}")

    def process_all_msg_files(self, start_path="/") -> int:
        """Find and process all .msg files in FileCloud.

        Args:
            start_path: FileCloud path where the search starts.

        Returns:
            Number of files for which ``process_msg_file`` returned True
            (0 when no files were found).
        """
        msg_files = self.find_msg_files(start_path)

        if not msg_files:
            logger.info(f"No .msg files found in {start_path}")
            return 0

        total = len(msg_files)
        logger.info(f"Processing {total} .msg files")

        # Optional throttling for large batches (disabled): pause every 25
        # files, e.g. ``if i % 25 == 0: time.sleep(3)`` inside the loop below.
        success_count = 0
        for i, file_path in enumerate(msg_files, 1):
            logger.info(f"Processing file {i}/{total}: {file_path}")
            if self.process_msg_file(file_path):
                success_count += 1

        logger.info(f"Completed processing {total} files. Success: {success_count}, Failed: {total - success_count}")
        return success_count

    def cleanup(self) -> None:
        """Clean up temporary files and resources.

        Removes the temporary directory created in ``__init__`` if it still
        exists. Errors are logged, not raised, so this is safe to call from
        ``finally`` blocks and more than once.
        """
        try:
            if os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                logger.info(f"Removed temporary directory: {self.temp_dir}")
        except Exception as e:
            logger.error(f"Error cleaning up temporary directory: {e}")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
server_url: The URL of the FileCloud server to connect to (e.g., 'https://filecloud.example.com'). This is the base URL for all API operations.
username: The username for authenticating with the FileCloud server. Must have permissions to read, write, create folders, and move files.
password: The password for the FileCloud account. Used for authentication during the login process.
Return Value
The constructor returns an instance of FileCloudEmailProcessor. Key method returns: connect() returns boolean (True on success), find_msg_files() returns list of file paths (strings), process_msg_file() returns boolean (True on success), process_all_msg_files() returns integer count of successfully processed files, cleanup() returns None.
Class Interface
Methods
__init__(self, server_url, username, password)
Purpose: Initialize the FileCloudEmailProcessor with connection credentials and create a temporary directory for file operations
Parameters:
server_url: URL of the FileCloud server
username: FileCloud account username
password: FileCloud account password
Returns: None (constructor)
connect(self) -> bool
Purpose: Establish connection to the FileCloud server and authenticate using provided credentials
Returns: Boolean: True if connection and authentication successful, False otherwise
find_msg_files(self, start_path='/', recursive=True) -> list
Purpose: Search for all .msg files in FileCloud starting from specified path, excluding files already in mail_archive folders
Parameters:
start_path: Starting directory path for search (default: '/' for root)
recursive: Whether to search recursively (default: True, currently not used in implementation)
Returns: List of strings containing full paths to .msg files that need processing
process_msg_file(self, file_path) -> bool
Purpose: Process a single .msg file by downloading it, converting to EML and PDF formats, uploading converted files to mail_archive folder, and moving the original
Parameters:
file_path: Full path to the .msg file in FileCloud to process
Returns: Boolean: True if all operations (download, convert, upload, move) succeeded, False if any step failed
process_all_msg_files(self, start_path='/') -> int
Purpose: Find and process all .msg files in FileCloud starting from the specified path, converting and organizing them in batch
Parameters:
start_path: Starting directory path for finding and processing files (default: '/' for root)
Returns: Integer: Count of successfully processed files
cleanup(self) -> None
Purpose: Remove the temporary directory and all its contents created during initialization
Returns: None
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| server_url | str | The URL of the FileCloud server for API operations | instance |
| username | str | Username for FileCloud authentication | instance |
| password | str | Password for FileCloud authentication | instance |
| client | FileCloudAPI or None | FileCloud API client instance, initialized to None and set during connect() | instance |
| temp_dir | str | Path to temporary directory created for storing downloaded and converted files during processing | instance |
Dependencies
extract_msg, email, traceback, tempfile, sys, base64, shutil, subprocess, pathlib, datetime, argparse, FC_api, html, re, reportlab, time, PIL, fitz, PyPDF2, os, mimetypes, logging
Required Imports
import extract_msg
import os
import mimetypes
import logging
import email
from email.message import EmailMessage
from email.utils import formatdate
from email.utils import formataddr
from email.headerregistry import Address
import email.charset
import traceback
import tempfile
import sys
import base64
import shutil
import subprocess
from pathlib import Path
from datetime import datetime
import argparse
from FC_api import FileCloudAPI
import html
import re
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate
from reportlab.platypus import Paragraph
from reportlab.platypus import Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
import time
from reportlab.platypus import Image as RLImage
from PIL import Image
import fitz
from PyPDF2 import PdfMerger
Conditional/Optional Imports
These imports are only needed under specific conditions:
import time
Condition: Used in process_all_msg_files method for rate limiting (currently commented out)
Optional
Usage Example
# Initialize the processor (this also creates its temporary directory)
processor = FileCloudEmailProcessor(
    server_url='https://filecloud.example.com',
    username='user@example.com',
    password='secure_password'
)
try:
    # Connect to FileCloud
    if processor.connect():
        # Find all .msg files starting from root
        msg_files = processor.find_msg_files(start_path='/', recursive=True)
        print(f'Found {len(msg_files)} .msg files')
        # Process a single file
        success = processor.process_msg_file('/path/to/email.msg')
        # Or process all files at once
        success_count = processor.process_all_msg_files(start_path='/emails')
        print(f'Successfully processed {success_count} files')
    else:
        print('Failed to connect to FileCloud')
finally:
    # Clean up temporary files even if connecting or processing failed,
    # because the temporary directory is created in the constructor
    processor.cleanup()
Best Practices
- Call connect() first if you want to verify authentication up front; find_msg_files() and process_msg_file() also connect automatically when no client exists yet
- Always call cleanup() when done to remove temporary files, preferably in a try-finally block
- The class creates a temporary directory on instantiation that persists until cleanup() is called
- Files already in mail_archive folders are automatically skipped to prevent reprocessing
- The class maintains state through the self.client attribute - ensure connection is established before operations
- Error handling is built-in but check return values (boolean for single operations, integer count for batch)
- Large batch operations may benefit from rate limiting (see commented code in process_all_msg_files)
- The class moves original .msg files to mail_archive folders, so ensure you have backups if needed
- Temporary files are cleaned up after each process_msg_file() call, but the temp_dir persists
- The class requires external helper functions msg_to_eml() and msg_to_pdf_improved() to be available
- FileCloud API operations may fail silently - always check success flags in returned dictionaries
- Use in a context manager pattern or ensure cleanup() is called to prevent disk space issues
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function main_v17 65.3% similar
- function msg_to_eml 64.9% similar
- function msg_to_pdf 64.0% similar
- function msg_to_pdf_improved 63.4% similar
- function msg_to_eml_alternative 61.3% similar