function create_enhanced_word_document
Converts markdown-formatted warranty disclosure content into a formatted Microsoft Word document with hierarchical headings, styled text, lists, and special formatting for block references.
/tf/active/vicechatdev/improved_word_converter.py
20 - 185
complex
Purpose
This function parses markdown content containing warranty disclosures for 'Project Victoria' and generates a professionally formatted Word document. It handles complex markdown structures including multi-level headings, bold text, lists, section metadata, and special formatting for disclosure sections. The function intelligently skips table of contents, processes section headers with specific patterns (e.g., '## 1.1(a) - Title'), and applies different formatting rules for disclosure vs non-disclosure content. It's designed for legal/compliance documentation where structured warranty information needs to be converted from markdown to Word format.
Source Code
# Ordered-list marker: digits, a dot, then whitespace (or end of line).
# Requiring the whitespace keeps clause references such as "1.1(a) ..." and
# decimals such as "2.5 years" from being styled as numbered list items.
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.(?:\s|$)')

# Splits a line around "[Block ...]" references while keeping the references.
_BLOCK_REF_RE = re.compile(r'(\[Block[^\]]*\])')

# Lines of the source markdown that are never copied into the document.
_SKIPPED_PREFIXES = ('# Project Victoria', '**Generated on**', '**Total Warranties')

# Metadata lines that may directly follow a "## X - Title" section header.
_METADATA_PREFIXES = ('**Section**:', '**Source Documents Found**:')

# Markdown heading prefix -> Word heading level used inside a disclosure.
# There is no '### ' entry: such lines are always consumed as subsection
# headers by the main loop before disclosure handling is reached.
_DISCLOSURE_HEADINGS = (('# ', 3), ('## ', 4), ('#### ', 6), ('##### ', 7))


def _add_label_paragraph(doc, text):
    """Add ``**Label**: value`` as a paragraph with a bold label.

    The text is split at the first colon. If the part before the colon is not
    fully wrapped in ``**``, the text is added unchanged as a single run.
    """
    paragraph = doc.add_paragraph()
    label, _, value = text.partition(':')
    if label.startswith('**') and label.endswith('**'):
        paragraph.add_run(label[2:-2] + ':').bold = True
        value = value.strip()
        if value:
            paragraph.add_run(' ' + value)
    else:
        paragraph.add_run(text)


def _add_section_metadata(doc, lines, start):
    """Consume the non-blank lines that directly follow a section header.

    Only the known metadata lines are written to the document; any other line
    in that run is skipped. Scanning stops at the first blank line or at a
    line starting with ``###``.

    Returns the index of the first line that was not consumed.
    """
    index = start
    while index < len(lines):
        text = lines[index].strip()
        if not text or text.startswith('###'):
            break
        if text.startswith(_METADATA_PREFIXES):
            _add_label_paragraph(doc, text)
        index += 1
    return index


def _add_disclosure_line(doc, line):
    """Render one non-empty, stripped line found inside a Disclosure subsection."""
    # Markdown headings embedded in the disclosure text.
    for prefix, level in _DISCLOSURE_HEADINGS:
        if line.startswith(prefix):
            doc.add_heading(line[len(prefix):], level)
            return

    if (line.startswith('**') and line.endswith('**')
            and len(line) > 4 and ':' not in line):
        # A fully bold line: short, capitalised ones are treated as headings.
        content = line[2:-2]
        if len(content.split()) <= 6 and (content[0].isupper() or content.isupper()):
            doc.add_heading(content, 6)
        else:
            doc.add_paragraph().add_run(content).bold = True
    elif line.startswith('**') and '**:' in line:
        # "**label**: content" -> bold label followed by plain content.
        label, _, value = line.partition('**:')
        paragraph = doc.add_paragraph()
        paragraph.add_run(label[2:] + ':').bold = True
        value = value.strip()
        if value:
            paragraph.add_run(' ' + value)
    elif line.startswith('- '):
        doc.add_paragraph(line[2:], style='List Bullet')
    elif _NUMBERED_ITEM_RE.match(line):
        # NOTE(review): the literal "1." marker is kept in the text, so it
        # presumably shows next to the number generated by the 'List Number'
        # style. Left as-is to avoid dropping the original numbering; confirm
        # the desired rendering.
        doc.add_paragraph(line, style='List Number')
    elif '[Block' in line and ']' in line:
        # Italicise "[Block ...]" references, leave the surrounding text plain.
        paragraph = doc.add_paragraph()
        for piece in _BLOCK_REF_RE.split(line):
            run = paragraph.add_run(piece)
            if piece.startswith('[Block') and piece.endswith(']'):
                run.italic = True
                run.font.color.rgb = None  # Use default color but italic
    elif line.endswith(':') and not line.startswith('-') and len(line.split()) <= 6:
        # A short line ending in a colon is treated as a potential header.
        content = line[:-1].strip()
        if content and (content[0].isupper() or content.count(' ') <= 3):
            doc.add_heading(content, 7)
        else:
            doc.add_paragraph(line)
    elif not line.startswith('---') and line != '```':
        # Regular paragraph; separator lines and bare code fences are dropped.
        doc.add_paragraph(line)


def _add_plain_line(doc, line):
    """Render one non-empty, stripped line outside a Disclosure subsection."""
    if line.startswith('**') and ':' in line and not line.endswith('**'):
        _add_label_paragraph(doc, line)
    elif not line.startswith('---'):
        doc.add_paragraph(line)


def create_enhanced_word_document(markdown_content: str, output_path) -> None:
    """Create a Word document from markdown content with enhanced formatting.

    The markdown is processed line by line (each line stripped, blank lines
    ignored):

    * The document always starts with the centred title
      'Project Victoria - Warranty Disclosures'. Source lines starting with
      '# Project Victoria', '**Generated on**' or '**Total Warranties' are
      skipped.
    * Everything from '## Table of Contents' up to the next '## ' line is
      skipped.
    * '## ... - ...' lines become level-1 headings. The non-blank lines that
      directly follow are scanned for '**Section**:' and
      '**Source Documents Found**:' metadata; other lines in that run are
      dropped.
    * '### ...' lines become level-2 headings. The subsection named exactly
      'Disclosure' switches on disclosure formatting; any other subsection
      switches it off.
    * Inside a disclosure, nested headings, bold labels, bullet and numbered
      lists, '[Block ...]' references and short 'Label:' lines each get their
      own formatting. Elsewhere only '**Label**: value' lines are treated
      specially.

    Args:
        markdown_content: The complete markdown text to convert.
        output_path: Destination passed unchanged to ``Document.save``.

    Returns:
        None. The document is written to ``output_path`` and a confirmation
        line is printed to stdout.
    """
    doc = Document()

    title = doc.add_heading('Project Victoria - Warranty Disclosures', 0)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    lines = markdown_content.split('\n')
    in_disclosure = False
    skip_toc = False
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        i += 1  # from here on, i is the index of the next unread line

        # Blank lines, the main title and the generated metadata are ignored.
        if not line or line.startswith(_SKIPPED_PREFIXES):
            continue

        # Skip the table of contents until the next '## ' header, which is
        # then processed normally below.
        if line == '## Table of Contents':
            skip_toc = True
            continue
        if skip_toc:
            if not line.startswith('## '):
                continue
            skip_toc = False

        if line.startswith('## ') and ' - ' in line:
            # Section header, e.g. "## 1.1(a) - Title".
            doc.add_heading(line[3:], 1)
            in_disclosure = False
            i = _add_section_metadata(doc, lines, i)
        elif line.startswith('### '):
            # Subsection header, e.g. "### Warranty Text" or "### Disclosure".
            subsection = line[4:]
            doc.add_heading(subsection, 2)
            in_disclosure = subsection == 'Disclosure'
        elif in_disclosure:
            _add_disclosure_line(doc, line)
        else:
            _add_plain_line(doc, line)

    doc.save(output_path)
    print(f"Enhanced Word document saved: {output_path}")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| markdown_content | - | - | positional_or_keyword |
| output_path | - | - | positional_or_keyword |
Parameter Details
markdown_content: String containing the complete markdown-formatted text to be converted. Expected to follow a specific structure with '# Project Victoria' as main title, '## Table of Contents' section (which will be skipped), section headers in format '## X.X(x) - Title', subsection headers like '### Warranty Text' and '### Disclosure', and various markdown formatting elements (bold text with **, lists with -, numbered lists, block references like [Block X]). The content should represent warranty disclosure documentation.
output_path: String or Path object specifying the file path where the generated Word document should be saved. Should include the .docx extension. The function does not create the parent directory, so it must already exist before the call. Example: 'output/warranties.docx' or Path('reports/disclosure.docx').
Return Value
This function returns None. It produces a side effect by creating and saving a Word document at the specified output_path. Upon successful completion, it prints a confirmation message to stdout in the format 'Enhanced Word document saved: {output_path}'.
Dependencies
python-docx, re (standard library), pathlib (standard library)
Required Imports
import re
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
Usage Example
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import re
# Sample input, one entry per markdown line. The title line, the
# "Generated on" / "Total Warranties" lines and the table-of-contents
# entries are skipped by the converter.
sample_rows = (
    '# Project Victoria - Warranty Disclosures',
    '**Generated on**: 2024-01-15',
    '**Total Warranties**: 5',
    '## Table of Contents',
    '- Section 1.1(a)',
    '- Section 1.1(b)',
    '## 1.1(a) - Equipment Warranties',
    '**Section**: 1.1(a)',
    '**Source Documents Found**: 3',
    '### Warranty Text',
    'All equipment shall be free from defects.',
    '### Disclosure',
    '**Warranty Period**: 12 months',
    '**Coverage**: Full replacement',
    '- Parts included',
    '- Labor included',
    '[Block 1] contains additional terms.',
    '## 1.1(b) - Service Warranties',
    '### Warranty Text',
    'Services performed with reasonable care.',
)
# Every line, including the last one, is terminated by a newline.
sample_markdown = ''.join(row + '\n' for row in sample_rows)

destination = 'warranty_report.docx'
create_enhanced_word_document(sample_markdown, destination)
# Output: Enhanced Word document saved: warranty_report.docx
Best Practices
- Ensure markdown_content follows the expected structure with 'Project Victoria' title and proper section formatting (## X.X(x) - Title pattern)
- Verify the output directory exists before calling the function, or handle potential file write errors
- The function expects specific markdown patterns; deviations may result in unexpected formatting
- Block references should follow the pattern [Block X] for proper italic formatting
- Section headers must contain ' - ' (space-dash-space) to be recognized as section headers
- The function modifies formatting based on context (in_disclosure flag), so maintain proper ### Disclosure subsection headers
- Large markdown documents may take time to process due to line-by-line parsing
- The function prints to stdout; redirect or capture output if running in production environments
- Test with sample markdown first to ensure formatting meets requirements before processing large documents
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function create_enhanced_word_document_v1 94.4% similar
- function create_word_report 85.1% similar
- function create_word_report_improved 84.9% similar
- function main_v2 79.6% similar
- function main_v1 77.0% similar