DocumentConverter - Code Extractor

class DocumentConverter

Maturity: 55

A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.

File:
/tf/active/vicechatdev/pdfconverter.py

Lines:
15 - 190

Complexity:
moderate

Purpose

DocumentConverter provides a robust solution for batch converting office documents to PDF format. It recursively scans an input directory for supported document types, converts them to PDF using LibreOffice in headless mode, and preserves the original directory structure in the output location. The class supports concurrent conversions for improved performance, handles PDF files by copying them directly, tracks conversion errors with detailed reporting, and manages temporary files during the conversion process. It's designed for scenarios requiring bulk document conversion with error tracking and progress logging.

Source Code

class DocumentConverter:
    """Batch-convert office documents to PDF with headless LibreOffice.

    The converter walks ``input_dir`` recursively, converts every supported
    document to PDF and mirrors the source directory layout below
    ``output_dir``. Sources that already are PDFs are copied instead of
    converted. Per-document failures are recorded in ``error_details``
    (keyed by the source path as a string) rather than raised, so one bad
    file never aborts a batch.

    The module defining this class must provide a module-level ``logger``.
    """

    # Supported file extensions (lower case, leading dot included).
    # find_documents() compares suffixes case-insensitively against this list.
    SUPPORTED_EXTENSIONS = [
        # Word documents
        '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf',
        # Excel documents
        '.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb',
        # PowerPoint documents
        '.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx',
        # Other formats
        '.odt', '.ods', '.odp', '.vsd', '.vsdx',
        # Include PDF to handle PDF files in the source
        '.pdf'
    ]

    # Executable names tried in order; the first one found on PATH is used.
    LIBREOFFICE_BINARIES = ('libreoffice', 'soffice')

    # Seconds a single LibreOffice run may take before it is aborted.
    CONVERSION_TIMEOUT = 120

    # Scratch directory (below output_dir) that receives LibreOffice output.
    TEMP_DIR_NAME = "_temp"

    # Internal per-document outcomes. convert_document() collapses them to a
    # bool; convert_all() needs all three to keep "skipped" out of "failed".
    _CONVERTED = 'converted'
    _SKIPPED = 'skipped'
    _FAILED = 'failed'

    def __init__(self, input_dir, output_dir, max_workers=1):
        """
        Initialize the converter

        Args:
            input_dir: Directory with source documents
            output_dir: Directory to save PDF files (created if missing)
            max_workers: Maximum number of concurrent conversions
        """
        self.input_dir = Path(input_dir).absolute()
        self.output_dir = Path(output_dir).absolute()
        self.max_workers = max_workers
        self.error_details = {}  # Store detailed error information

        # Create output directory if not exists
        os.makedirs(self.output_dir, exist_ok=True)

    def find_documents(self):
        """Find all supported documents in the input directory.

        Suffixes are matched case-insensitively, so ``REPORT.DOCX`` is found
        on case-sensitive filesystems too. Directories whose name happens to
        end in a supported extension are ignored. The scratch directory and,
        when ``output_dir`` lies inside ``input_dir``, the whole output tree
        are excluded so earlier results are never picked up as new sources.

        Returns:
            Sorted list of Path objects.
        """
        supported = {ext.lower() for ext in self.SUPPORTED_EXTENSIONS}

        excluded_roots = [self.output_dir / self.TEMP_DIR_NAME]
        # Only exclude the output tree when it is a strict sub-directory of
        # the input tree; converting in place (output_dir == input_dir) must
        # still see every source file.
        if self.input_dir in self.output_dir.parents:
            excluded_roots.append(self.output_dir)

        documents = []
        for candidate in self.input_dir.rglob('*'):
            if candidate.suffix.lower() not in supported:
                continue
            if not candidate.is_file():
                continue
            if any(root in candidate.parents for root in excluded_roots):
                continue
            documents.append(candidate)

        # Sorted for a deterministic processing order.
        return sorted(documents)

    def get_relative_output_path(self, input_file):
        """Determine the output path that preserves the original directory structure.

        The target directory is created as a side effect.

        Note: only the file stem is kept, so ``report.doc`` and
        ``report.docx`` in the same folder map to the same ``report.pdf``.

        Args:
            input_file: Path (or string) of a document below ``input_dir``

        Returns:
            Path of the PDF inside ``output_dir``.

        Raises:
            ValueError: If ``input_file`` is not located below ``input_dir``.
        """
        input_file = Path(input_file).absolute()

        # Get the relative path from the input_dir
        rel_path = input_file.relative_to(self.input_dir)

        # Calculate the output directory path preserving folder structure
        output_dir = self.output_dir / rel_path.parent

        # Create the directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Return the full output path with .pdf extension
        return output_dir / f"{input_file.stem}.pdf"

    def convert_document(self, input_file):
        """Convert a document to PDF using LibreOffice (PDF sources are copied).

        Args:
            input_file: Path (or string) of the document to convert

        Returns:
            True if a PDF was produced, False if the conversion failed or the
            target already existed and the file was skipped. Failures are
            recorded in ``error_details``; nothing is raised for them.
        """
        return self._process_document(input_file) == self._CONVERTED

    def convert_all(self):
        """Convert all documents found in ``input_dir``, using up to ``max_workers`` threads.

        Returns:
            Tuple ``(success_count, failure_count)``. Documents that were
            skipped because their PDF already exists are logged separately
            and are not counted as failures.
        """
        documents = self.find_documents()
        logger.info(f"Found {len(documents)} documents to convert")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            outcomes = list(executor.map(self._process_document, documents))

        success_count = outcomes.count(self._CONVERTED)
        failure_count = outcomes.count(self._FAILED)
        skipped_count = outcomes.count(self._SKIPPED)

        # Clean up the scratch directory (job folders and worker profiles)
        temp_dir = self.output_dir / self.TEMP_DIR_NAME
        if temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)

        logger.info(
            f"Conversion complete: {success_count} succeeded, "
            f"{failure_count} failed, {skipped_count} skipped"
        )

        # Print detailed error information (includes errors recorded by
        # earlier calls on this instance, since error_details is never reset)
        if self.error_details:
            logger.info(f"\n{'='*80}\nDETAILED ERROR REPORT\n{'='*80}")
            for idx, details in enumerate(self.error_details.values(), 1):
                logger.info(f"\nError #{idx}:")
                logger.info(f"File: {details['file']}")
                logger.info(f"Error: {details['error']}")
                if 'return_code' in details:
                    logger.info(f"Return code: {details['return_code']}")
                logger.info('-' * 50)

        return success_count, failure_count

    def _process_document(self, input_file):
        """Convert or copy one document and return its outcome constant."""
        input_file = Path(input_file).absolute()

        try:
            # Resolved inside the try so a bad path or an unwritable target
            # directory is recorded for this file instead of aborting a batch.
            output_file = self.get_relative_output_path(input_file)

            # Skip if already converted
            if output_file.exists():
                logger.info(f"Skipping {input_file.name} - already exists at {output_file}")
                return self._SKIPPED

            logger.info(f"Converting {input_file} to {output_file}")

            # Special handling for PDF files - just copy them
            if input_file.suffix.lower() == '.pdf':
                shutil.copy2(input_file, output_file)
                logger.info(f"Copied PDF file {input_file.name} to {output_file}")
                return self._CONVERTED

            return self._run_libreoffice(input_file, output_file)

        except subprocess.TimeoutExpired as e:
            return self._record_error(
                input_file, str(e), f"Timeout converting {input_file.name}"
            )
        except Exception as e:
            # Per-document boundary: record the problem and keep going.
            return self._record_error(
                input_file, str(e), f"Error converting {input_file.name}: {str(e)}"
            )

    def _run_libreoffice(self, input_file, output_file):
        """Run one headless LibreOffice conversion and move the PDF into place."""
        import tempfile

        temp_root = self.output_dir / self.TEMP_DIR_NAME
        os.makedirs(temp_root, exist_ok=True)

        # Every conversion gets a private output folder. LibreOffice names its
        # result after the source stem only, so a shared folder would let
        # same-named files from different directories overwrite each other
        # when conversions run in parallel.
        with tempfile.TemporaryDirectory(dir=str(temp_root)) as job_dir:
            job_dir = Path(job_dir)

            cmd = [self._libreoffice_executable()]
            cmd.extend(self._profile_args(temp_root))
            cmd.extend([
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                str(job_dir),
                str(input_file),
            ])

            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.CONVERSION_TIMEOUT
            )

            if process.returncode != 0:
                return self._record_error(
                    input_file,
                    process.stderr,
                    f"Error converting {input_file.name}: {process.stderr}",
                    return_code=process.returncode,
                )

            produced = job_dir / f"{input_file.stem}.pdf"
            if not produced.exists():
                # The folder is private to this job, so any PDF found in it
                # can only be the result of this conversion.
                produced = next(job_dir.glob('*.pdf'), None)

            if produced is None:
                error_msg = f"Conversion produced no output for {input_file.name}"
                return self._record_error(input_file, error_msg, error_msg)

            # Ensure target directory exists, then move the file into place
            os.makedirs(output_file.parent, exist_ok=True)
            shutil.move(str(produced), str(output_file))
            logger.info(f"Successfully converted {input_file.name} to {output_file}")
            return self._CONVERTED

    def _profile_args(self, temp_root):
        """Return the LibreOffice arguments that isolate the user profile.

        Sequential runs keep LibreOffice's default profile. When more than
        one worker is configured, every worker thread gets its own profile
        directory so concurrently running instances do not share one.
        """
        if self.max_workers is not None and self.max_workers <= 1:
            return []

        import threading

        profile_dir = temp_root / f"profile_{threading.get_ident()}"
        return [f"-env:UserInstallation={profile_dir.as_uri()}"]

    def _libreoffice_executable(self):
        """Return the LibreOffice executable to invoke."""
        for name in self.LIBREOFFICE_BINARIES:
            located = shutil.which(name)
            if located:
                return located
        # Nothing found: fall back to the first name so subprocess raises the
        # usual FileNotFoundError, which is then recorded per document.
        return self.LIBREOFFICE_BINARIES[0]

    def _record_error(self, input_file, error, log_message, return_code=None):
        """Log a failure, store it in ``error_details`` and return the failed outcome."""
        logger.error(log_message)
        details = {
            'file': str(input_file),
            'error': error
        }
        if return_code is not None:
            details['return_code'] = return_code
        self.error_details[str(input_file)] = details
        return self._FAILED

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

input_dir: Path to the directory containing source documents to convert. Can be a string or Path object. The converter will recursively search this directory for all supported file types. The path is converted to an absolute path internally.

output_dir: Path to the directory where converted PDF files will be saved. Can be a string or Path object. The directory structure from input_dir is preserved in this location. The directory is created automatically if it doesn't exist. The path is converted to an absolute path internally.

max_workers: Maximum number of concurrent document conversions to run in parallel using ThreadPoolExecutor. Default is 1 (sequential processing). Higher values can improve performance but increase system resource usage. Should be tuned based on available CPU cores and memory.

Return Value

Instantiation returns a DocumentConverter object. The convert_all() method returns a tuple (success_count, failure_count) indicating the number of successfully converted documents and the number of failed conversions. The convert_document() method returns a boolean: True if conversion succeeded, False if it failed or was skipped. The find_documents() method returns a list of Path objects representing all supported documents found. The get_relative_output_path() method returns a Path object representing the output file path with preserved directory structure.

Class Interface

Methods

`__init__(self, input_dir, output_dir, max_workers=1)`

Purpose: Initialize the DocumentConverter with input/output directories and concurrency settings

Parameters:

input_dir: Directory path containing source documents to convert
output_dir: Directory path where converted PDFs will be saved
max_workers: Maximum number of concurrent conversions (default: 1)

Returns: None - initializes the DocumentConverter instance

`find_documents(self) -> list`

Purpose: Recursively search the input directory for all supported document types

Returns: List of Path objects representing all found documents with supported extensions

`get_relative_output_path(self, input_file) -> Path`

Purpose: Calculate the output PDF path that preserves the original directory structure relative to input_dir

Parameters:

input_file: Path object of the input document file

Returns: Path object representing the output PDF file location with preserved directory structure

`convert_document(self, input_file) -> bool`

Purpose: Convert a single document to PDF using LibreOffice, or copy if already PDF

Parameters:

input_file: Path object of the document to convert

Returns: True if conversion succeeded, False if it failed or was skipped (already exists)

`convert_all(self) -> tuple`

Purpose: Convert all documents found in input_dir to PDF using parallel processing, with error tracking and cleanup

Returns: Tuple of (success_count, failure_count) indicating conversion results

Attributes

Name	Type	Description	Scope
`SUPPORTED_EXTENSIONS`	list	Class variable containing all supported file extensions for conversion, including Word (.doc, .docx), Excel (.xls, .xlsx), PowerPoint (.ppt, .pptx), OpenDocument (.odt, .ods, .odp), Visio (.vsd, .vsdx), and PDF (.pdf)	class
`input_dir`	Path	Absolute path to the directory containing source documents to convert	instance
`output_dir`	Path	Absolute path to the directory where converted PDF files are saved	instance
`max_workers`	int	Maximum number of concurrent document conversions allowed	instance
`error_details`	dict	Dictionary storing detailed error information for failed conversions, keyed by file path string, with values containing 'file', 'error', and optionally 'return_code' keys	instance

Dependencies

os
subprocess
logging
concurrent.futures
pathlib
shutil

Required Imports

import os
import subprocess
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import shutil

Usage Example

# These imports (and the module-level `logger` below) are the names the
# DocumentConverter source refers to, so they must exist in the module that
# defines the class.
import os
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import subprocess
import shutil

# Setup logging
# DocumentConverter logs through a module-level variable named `logger`.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Basic usage - sequential conversion
# Note: the constructor creates output_dir if it does not exist yet.
converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=1
)

# Find all documents first (optional)
# Returns a list of Path objects; convert_all() performs the same search itself.
documents = converter.find_documents()
print(f'Found {len(documents)} documents')

# Convert all documents
# Returns the tuple (success_count, failure_count).
success, failures = converter.convert_all()
print(f'Converted {success} documents, {failures} failed')

# Check for errors
# error_details maps the source path (as a string) to a dict with the keys
# 'file', 'error' and, for LibreOffice failures, 'return_code'.
if converter.error_details:
    for file_id, error_info in converter.error_details.items():
        print(f"Error in {error_info['file']}: {error_info['error']}")

# Parallel conversion with 4 workers
parallel_converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=4
)
success, failures = parallel_converter.convert_all()

# Convert a single document
# The file must be located below the converter's input_dir. The result is
# False both when the conversion fails and when the target PDF already exists
# (as it would here if convert_all() above had already converted this file).
single_file = Path('/path/to/documents/report.docx')
result = converter.convert_document(single_file)

Best Practices

Always ensure LibreOffice is installed before instantiating the class, as conversion will fail without it
Configure a logger before using the class, as it relies on a module-level 'logger' variable for status reporting
Start with max_workers=1 for testing, then increase based on system resources and performance needs
Monitor the error_details attribute after conversion to identify and handle failed conversions
The class creates a temporary directory (_temp) in output_dir during conversion; convert_all() removes it when the batch finishes, whereas a direct convert_document() call leaves it in place
PDF files in the input directory are copied rather than converted, preserving the original file
Each conversion has a 2-minute timeout to prevent hanging on problematic files
The class preserves the original directory structure from input_dir in output_dir
Already converted files (existing PDFs in output_dir) are skipped to avoid redundant work
Use absolute paths or ensure relative paths are correct, as the class converts them to absolute paths internally
For large batch conversions, consider the disk space requirements for both output PDFs and temporary files
The error_details dictionary persists across the object's lifetime, accumulating errors from all conversions

Similar Components

AI-powered semantic similarity - components with related functionality:

class PDFConverter 87.2% similar

A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF format using LibreOffice and ReportLab libraries.
From: /tf/active/vicechatdev/msg_to_eml.py
class DocumentExtractor 65.2% similar

A document text extraction class that supports multiple file formats including Word, PowerPoint, PDF, and plain text files, with automatic format detection and conversion capabilities.
From: /tf/active/vicechatdev/leexi/document_extractor.py
function convert_document_to_pdf 55.5% similar

Converts a document version from an editable format (e.g., Word) to PDF without changing the document's status, uploading the result to FileCloud and updating the version record.
From: /tf/active/vicechatdev/document_controller_backup.py
function main_v12 51.6% similar

Main entry point function that reads a markdown file, converts it to an enhanced Word document with preserved heading structure, and saves it with a timestamped filename.
From: /tf/active/vicechatdev/improved_word_converter.py
function test_document_extractor 49.6% similar

A test function that validates the DocumentExtractor class by testing file type support detection, text extraction from various document formats, and error handling.
From: /tf/active/vicechatdev/leexi/test_document_extractor.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class DocumentConverter:
    """Convert various document formats to PDF using LibreOffice.

    Collects supported files below ``input_dir`` and writes one PDF per file
    below ``output_dir``, keeping the relative directory layout. PDF sources
    are copied instead of converted. Errors caught inside
    ``convert_document`` are stored in ``error_details`` instead of raised.

    The code refers to a module-level ``logger`` as well as ``os``,
    ``shutil``, ``subprocess``, ``Path`` and ``ThreadPoolExecutor``; all of
    them must be in scope in the defining module.
    """
    
    # Supported file extensions
    # Each entry is used verbatim as the tail of a glob pattern in
    # find_documents(), so entries are lower case with a leading dot.
    SUPPORTED_EXTENSIONS = [
        # Word documents
        '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf',
        # Excel documents
        '.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb',
        # PowerPoint documents
        '.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx',
        # Other formats
        '.odt', '.ods', '.odp', '.vsd', '.vsdx',
        # Include PDF to handle PDF files in the source
        '.pdf'
    ]
    
    def __init__(self, input_dir, output_dir, max_workers=1):
        """
        Initialize the converter
        
        Args:
            input_dir: Directory with source documents
            output_dir: Directory to save PDF files (created here if missing)
            max_workers: Maximum number of concurrent conversions
        """
        # Both directories are stored as absolute paths so relative_to() in
        # get_relative_output_path() compares like with like.
        self.input_dir = Path(input_dir).absolute()
        self.output_dir = Path(output_dir).absolute()
        self.max_workers = max_workers
        self.error_details = {}  # Store detailed error information
        
        # Create output directory if not exists
        os.makedirs(self.output_dir, exist_ok=True)
        
    def find_documents(self):
        """Find all supported documents in input directory.

        Returns:
            List of Path objects, grouped in SUPPORTED_EXTENSIONS order.
        """
        documents = []
        
        # One recursive glob per extension. Results are not filtered with
        # is_file(), and the patterns are lower case only.
        # NOTE(review): on case-sensitive filesystems upper-case suffixes
        # such as ".DOCX" are presumably not matched - confirm.
        for ext in self.SUPPORTED_EXTENSIONS:
            documents.extend(self.input_dir.glob(f'**/*{ext}'))
            
        return documents
    
    def get_relative_output_path(self, input_file):
        """Determine the output path that preserves the original directory structure.

        Creates the target directory as a side effect.

        Args:
            input_file: Path object of a document below ``input_dir``

        Returns:
            Path of the PDF inside ``output_dir``.

        Raises:
            ValueError: From ``relative_to`` when ``input_file`` is not
                located below ``input_dir``.
        """
        # Get the relative path from the input_dir
        rel_path = input_file.relative_to(self.input_dir)
        
        # Calculate the output directory path preserving folder structure
        output_dir = self.output_dir / rel_path.parent
        
        # Create the directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Return the full output path with .pdf extension
        # Only the stem is kept, so files that differ just by extension map
        # to the same PDF name.
        return output_dir / f"{input_file.stem}.pdf"
    
    def convert_document(self, input_file):
        """Convert a document to PDF using LibreOffice.

        Args:
            input_file: Path object of the document to convert

        Returns:
            True when a PDF was written (converted or copied); False when
            the conversion failed or the target already existed.
        """
        file_id = str(input_file)
        # Called outside the try block: an exception raised here propagates
        # to the caller instead of being recorded in error_details.
        output_file = self.get_relative_output_path(input_file)
        
        try:
            # Skip if already converted
            # A skip returns False, the same value as a failure.
            if output_file.exists():
                logger.info(f"Skipping {input_file.name} - already exists at {output_file}")
                return False
                
            logger.info(f"Converting {input_file} to {output_file}")
            
            # Special handling for PDF files - just copy them
            if input_file.suffix.lower() == '.pdf':
                shutil.copy2(input_file, output_file)
                logger.info(f"Copied PDF file {input_file.name} to {output_file}")
                return True
            
            # Use LibreOffice for actual conversion
            # We'll use a temporary directory for LibreOffice output
            # The same "_temp" directory is used by every conversion.
            temp_dir = self.output_dir / "_temp"
            os.makedirs(temp_dir, exist_ok=True)
            
            cmd = [
                'libreoffice', 
                '--headless', 
                '--convert-to', 
                'pdf',
                '--outdir', 
                str(temp_dir),
                str(input_file)
            ]
            
            process = subprocess.run(
                cmd,
                capture_output=True, 
                text=True,
                timeout=120  # 2 minute timeout
            )
            
            # A non-zero exit status is recorded together with stderr.
            if process.returncode != 0:
                error_msg = f"Error converting {input_file.name}: {process.stderr}"
                logger.error(error_msg)
                self.error_details[file_id] = {
                    'file': str(input_file),
                    'error': process.stderr,
                    'return_code': process.returncode
                }
                return False
            
            # Move the converted file from temp dir to the proper location
            # The result is looked up by source stem inside the shared temp
            # directory.
            temp_output = temp_dir / f"{input_file.stem}.pdf"
            if temp_output.exists():
                # Ensure target directory exists
                os.makedirs(output_file.parent, exist_ok=True)
                # Move the file to preserve structure
                shutil.move(temp_output, output_file)
                logger.info(f"Successfully converted {input_file.name} to {output_file}")
                return True
            else:
                # Exit status was 0 but the expected PDF is missing.
                error_msg = f"Conversion produced no output for {input_file.name}"
                logger.error(error_msg)
                self.error_details[file_id] = {
                    'file': str(input_file),
                    'error': error_msg
                }
                return False
                
        except subprocess.TimeoutExpired as e:
            # Raised by subprocess.run when the timeout above elapses.
            error_msg = f"Timeout converting {input_file.name}"
            logger.error(error_msg)
            self.error_details[file_id] = {
                'file': str(input_file),
                'error': str(e)
            }
            return False
        except Exception as e:
            # Catch-all so a single document cannot raise out of this block.
            error_msg = f"Error converting {input_file.name}: {str(e)}"
            logger.error(error_msg)
            self.error_details[file_id] = {
                'file': str(input_file),
                'error': str(e)
            }
            return False
            
    def convert_all(self):
        """Convert all documents in parallel.

        Returns:
            Tuple ``(success_count, failure_count)`` holding the number of
            True and False results of ``convert_document``. Skipped files
            return False and are therefore part of ``failure_count``.
        """
        documents = self.find_documents()
        logger.info(f"Found {len(documents)} documents to convert")
        
        success_count = 0
        failure_count = 0
        
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # list() drains the iterator, so every conversion has finished
            # before the counts are taken.
            results = list(executor.map(self.convert_document, documents))
            
            success_count = results.count(True)
            failure_count = results.count(False)
        
        # Clean up any temporary directory
        temp_dir = self.output_dir / "_temp"
        if temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
            
        logger.info(f"Conversion complete: {success_count} succeeded, {failure_count} failed")
        
        # Print detailed error information
        # error_details is never cleared here, so entries recorded by earlier
        # calls on the same instance are reported as well.
        if self.error_details:
            logger.info(f"\n{'='*80}\nDETAILED ERROR REPORT\n{'='*80}")
            for idx, (file_id, details) in enumerate(self.error_details.items(), 1):
                logger.info(f"\nError #{idx}:")
                logger.info(f"File: {details['file']}")
                logger.info(f"Error: {details['error']}")
                if 'return_code' in details:
                    logger.info(f"Return code: {details['return_code']}")
                logger.info('-' * 50)
                
        return success_count, failure_count
                        

Improved Code

🔍 Code Extractor

class DocumentConverter

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, input_dir, output_dir, max_workers=1)`

`find_documents(self) -> list`

`get_relative_output_path(self, input_file) -> Path`

`convert_document(self, input_file) -> bool`

`convert_all(self) -> tuple`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class PDFConverter 87.2% similar

class DocumentExtractor 65.2% similar

function convert_document_to_pdf 55.5% similar

function main_v12 51.6% similar

function test_document_extractor 49.6% similar

class DocumentConverter

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, input_dir, output_dir, max_workers=1)

find_documents(self) -> list

get_relative_output_path(self, input_file) -> Path

convert_document(self, input_file) -> bool

convert_all(self) -> tuple

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class PDFConverter 87.2% similar

class DocumentExtractor 65.2% similar

function convert_document_to_pdf 55.5% similar

function main_v12 51.6% similar

function test_document_extractor 49.6% similar

✨ Improve Code: DocumentConverter

Code Comparison

`__init__(self, input_dir, output_dir, max_workers=1)`

`find_documents(self) -> list`

`get_relative_output_path(self, input_file) -> Path`

`convert_document(self, input_file) -> bool`

`convert_all(self) -> tuple`