class _DictSAXHandler
A SAX (Simple API for XML) event handler that converts XML documents into Python dictionaries, with extensive configuration options for handling attributes, namespaces, CDATA, and structure.
/tf/active/vicechatdev/SPFCsync/venv/lib64/python3.11/site-packages/xmltodict.py
43 - 199
complex
Purpose
This class serves as a SAX parser handler for converting XML to dictionary representations. It processes XML parsing events (startElement, endElement, characters, etc.) and builds a nested dictionary structure. It supports advanced features like namespace handling, attribute prefixing, CDATA management, custom postprocessing, forcing list structures, and streaming parsing with callbacks at specific depths. The handler is designed to work with Python's SAX parser infrastructure and can handle complex XML documents with namespaces, comments, and mixed content.
Source Code
class _DictSAXHandler(object):
    """Event handler that assembles parser callbacks into nested dictionaries.

    The driving parser is expected to call ``startElement``, ``endElement``
    and ``characters`` (and optionally ``comments`` and
    ``startNamespaceDecl``) in document order.  While elements are open the
    handler keeps the following state:

    * ``path``  -- list of ``(name, attrs)`` tuples, one per open element.
    * ``stack`` -- saved ``(item, data)`` pairs of the enclosing elements.
    * ``item``  -- dictionary (or ``None``) built for the current element.
    * ``data``  -- list of text chunks received for the current element.

    With the default ``item_depth`` of 0, ``item`` holds the complete result
    once the outermost element has been closed.

    Args:
        item_depth: ``item_callback`` is invoked when an element closes while
            exactly this many elements are open (the closing one included).
            Only elements deeper than ``item_depth`` are accumulated.
        item_callback: Called as ``item_callback(path, item)``; a falsy
            return value aborts parsing with ``ParsingInterrupted``.
        xml_attribs: When false, attributes are left out of the result.
        attr_prefix: String prepended to every attribute key.
        cdata_key: Key under which text is stored when the element also has
            attributes or children.
        force_cdata: When true, text-only elements are also represented as a
            dictionary holding ``cdata_key``.
        cdata_separator: String used to join the collected text chunks.
        postprocessor: Optional ``postprocessor(path, key, value)``; returns
            a ``(key, value)`` pair to store, or ``None`` to drop the entry.
        dict_constructor: Callable used to create every dictionary.
        strip_whitespace: When true, element text and comments are stripped.
        namespace_separator: String that separates the namespace from the
            local name in the names passed to the handler.
        namespaces: Optional mapping from namespace to the replacement to
            use in keys; an empty replacement drops the namespace.  ``None``
            leaves names untouched.
        force_list: ``None``/``False`` (never), ``True`` (always), a
            container of keys, or a callable ``(path, key, value) -> bool``
            deciding whether a first value is wrapped in a list.
        comment_key: Key under which comments are stored.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=_dict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None,
                 force_list=None,
                 comment_key='#comment'):
        # Parsing state.
        self.path = []
        self.stack = []
        self.data = []
        self.item = None
        # Configuration.
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        # Declarations reported by startNamespaceDecl that have not yet been
        # attached to an element.
        self.namespace_declarations = dict_constructor()
        self.force_list = force_list
        self.comment_key = comment_key

    def _build_name(self, full_name):
        """Return ``full_name`` with its namespace replaced per ``namespaces``.

        The name is split at the last occurrence of ``namespace_separator``.
        The namespace part is looked up in ``namespaces``; an unknown
        namespace is kept as is, and an empty replacement yields the bare
        local name.  When ``namespaces`` is ``None`` or the separator does
        not occur, ``full_name`` is returned unchanged.
        """
        if self.namespaces is None:
            return full_name
        separator = self.namespace_separator
        i = full_name.rfind(separator)
        if i == -1:
            return full_name
        # Skip the whole separator, not just one character, so that a
        # separator longer than one character does not leak into the name.
        namespace, name = full_name[:i], full_name[i + len(separator):]
        try:
            short_namespace = self.namespaces[namespace]
        except KeyError:
            short_namespace = namespace
        if not short_namespace:
            return name
        return separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Return the attributes as a dictionary.

        A ``dict`` is returned unchanged (the same object); anything else is
        treated as a flat sequence ``[name, value, name, value, ...]``.
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startNamespaceDecl(self, prefix, uri):
        """Record a namespace declaration; a falsy prefix is stored as ''."""
        self.namespace_declarations[prefix or ''] = uri

    def startElement(self, full_name, attrs):
        """Open an element: extend ``path`` and start a fresh item for it.

        Args:
            full_name: Element name as reported by the parser.
            attrs: Attributes, as accepted by ``_attrs_to_dict``.
        """
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        # NOTE: pending namespace declarations are only attached when the
        # element has at least one attribute; otherwise they stay pending and
        # end up on the next element that does have attributes.
        if attrs and self.namespace_declarations:
            attrs['xmlns'] = self.namespace_declarations
            self.namespace_declarations = self.dict_constructor()
        self.path.append((name, attrs or None))
        if len(self.path) > self.item_depth:
            # Save the enclosing element's state; endElement restores it.
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attr_entries = []
                for key, value in attrs.items():
                    key = self.attr_prefix+self._build_name(key)
                    if self.postprocessor:
                        entry = self.postprocessor(self.path, key, value)
                    else:
                        entry = (key, value)
                    # A falsy postprocessor result drops the attribute.
                    if entry:
                        attr_entries.append(entry)
                attrs = self.dict_constructor(attr_entries)
            else:
                attrs = None
            self.item = attrs or None
            self.data = []

    def endElement(self, full_name):
        """Close an element and merge its content into the enclosing item.

        Raises:
            ParsingInterrupted: If the element closes at ``item_depth`` and
                ``item_callback`` returns a falsy value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = (None if not self.data
                        else self.cdata_separator.join(self.data))

            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            data = (None if not self.data
                    else self.cdata_separator.join(self.data))
            item = self.item
            # Switch back to the enclosing element's state.
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                # Element with attributes/children: text goes under cdata_key.
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                # Text-only (or empty) element: store the text (or None).
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Collect a chunk of text for the current element."""
        if not self.data:
            self.data = [data]
        else:
            self.data.append(data)

    def comments(self, data):
        """Store a comment on the current item under ``comment_key``."""
        if self.strip_whitespace:
            data = data.strip()
        self.item = self.push_data(self.item, self.comment_key, data)

    def push_data(self, item, key, data):
        """Add ``data`` under ``key`` to ``item`` and return the item.

        The postprocessor, if set, may rewrite the pair or drop it by
        returning ``None`` (``item`` is then returned untouched).  A ``None``
        item is replaced by a new dictionary.  A repeated key turns the
        stored value into a list; a first value is wrapped in a list when
        ``_should_force_list`` says so.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            if self._should_force_list(key, data):
                item[key] = [data]
            else:
                item[key] = data
        return item

    def _should_force_list(self, key, value):
        """Return whether a first value for ``key`` must be wrapped in a list."""
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # force_list does not support ``in``: treat it as a callable.
            return self.force_list(self.path[:-1], key, value)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
object | - |
Parameter Details
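item_depth: Integer specifying the depth at which to trigger item_callback. The callback fires when an element closes while exactly item_depth elements are open (counting the closing element itself), so 1 corresponds to the root element and the default 0 never triggers the callback. Used for streaming large XML documents by processing items at a specific nesting level.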
item_callback: Callable function invoked when an element at item_depth is fully parsed. Receives (path, item) and should return True to continue parsing or False to stop. Default is lambda that always returns True.
xml_attribs: Boolean indicating whether to include XML attributes in the output dictionary. If True, attributes are added with attr_prefix. Default is True.
attr_prefix: String prefix for attribute keys in the output dictionary. Default is '@'. For example, an attribute 'id' becomes '@id'.
cdata_key: String key name for character data (text content) in elements that also have children or attributes. Default is '#text'.
force_cdata: Boolean that forces creation of a dictionary with cdata_key even when element has only text content. Default is False.
cdata_separator: String used to join multiple character data segments within an element. Default is empty string ''.
postprocessor: Optional callable for transforming parsed data. Receives (path, key, data) and should return None to skip or (new_key, new_data) tuple. Default is None.
dict_constructor: Callable that creates dictionary objects. Allows using OrderedDict or custom dict types. Default is _dict (typically OrderedDict).
strip_whitespace: Boolean indicating whether to strip leading/trailing whitespace from text content. Default is True.
namespace_separator: String separator for namespace prefix and local name. Default is ':'. Used when building qualified names.
namespaces: Optional dictionary mapping namespace URIs to prefixes for namespace abbreviation. None means no namespace processing.
force_list: Controls when to force values into lists. Can be: None (no forcing), True (always force), False (never force), iterable of keys to force, or callable(path, key, value) returning boolean.
comment_key: String key name for XML comments in the output dictionary. Default is '#comment'.
Return Value
Instantiation returns a _DictSAXHandler object whose methods serve as parser event callbacks. The handler maintains state during parsing and builds the final dictionary structure in the 'item' attribute. Methods like startElement, endElement, and characters return None (they modify internal state). The push_data method returns the updated item dictionary.
Class Interface
Methods
__init__(self, item_depth=0, item_callback=lambda *args: True, xml_attribs=True, attr_prefix='@', cdata_key='#text', force_cdata=False, cdata_separator='', postprocessor=None, dict_constructor=_dict, strip_whitespace=True, namespace_separator=':', namespaces=None, force_list=None, comment_key='#comment')
Purpose: Initializes the SAX handler with configuration options for XML-to-dict conversion
Parameters:
item_depth: Depth level for triggering item callbacks
item_callback: Function called when item at specified depth is parsed
xml_attribs: Whether to include XML attributes
attr_prefix: Prefix for attribute keys
cdata_key: Key name for character data
force_cdata: Force CDATA key creation
cdata_separator: Separator for joining character data
postprocessor: Optional data transformation function
dict_constructor: Function to create dictionary objects
strip_whitespace: Whether to strip whitespace from text
namespace_separator: Separator for namespace and local name
namespaces: Namespace URI to prefix mapping
force_list: Configuration for forcing list structures
comment_key: Key name for XML comments
Returns: None (constructor)
_build_name(self, full_name) -> str
Purpose: Builds element name by processing namespaces according to configuration
Parameters:
full_name: Full qualified name potentially containing namespace
Returns: Processed name with namespace prefix applied or removed based on namespaces mapping
_attrs_to_dict(self, attrs) -> dict
Purpose: Converts SAX attributes object to dictionary using configured dict_constructor
Parameters:
attrs: SAX attributes object or dictionary
Returns: Dictionary representation of attributes
startNamespaceDecl(self, prefix, uri) -> None
Purpose: SAX event handler for namespace declarations, stores namespace mappings
Parameters:
prefix: Namespace prefix (empty string for default namespace)
uri: Namespace URI
Returns: None
startElement(self, full_name, attrs) -> None
Purpose: SAX event handler for element start, processes element name and attributes, updates parsing state
Parameters:
full_name: Full element name including namespace
attrs: Element attributes
Returns: None
endElement(self, full_name) -> None
Purpose: SAX event handler for element end, finalizes element processing, triggers callbacks, and updates item structure
Parameters:
full_name: Full element name including namespace
Returns: None (may raise ParsingInterrupted if callback returns False)
characters(self, data) -> None
Purpose: SAX event handler for character data, accumulates text content in data list
Parameters:
data: Character data string
Returns: None
comments(self, data) -> None
Purpose: SAX event handler for XML comments, adds comment to current item with comment_key
Parameters:
data: Comment text
Returns: None
push_data(self, item, key, data) -> dict
Purpose: Adds data to item dictionary, handling duplicate keys by creating lists and applying postprocessor
Parameters:
item: Current item dictionary or None
key: Key to add
data: Value to add
Returns: Updated item dictionary with new data added
_should_force_list(self, key, value) -> bool
Purpose: Determines if a key should be forced into a list structure based on force_list configuration
Parameters:
key: Dictionary key being evaluated
value: Value associated with the key
Returns: Boolean indicating whether to force list structure
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| path | list | Stack of (name, attrs) tuples representing current path in XML tree during parsing | instance |
| stack | list | Stack of (item, data) tuples for maintaining parent context during nested element parsing | instance |
| data | list | Accumulator for character data segments within current element | instance |
| item | dict or None | Current item being built or final parsed result after parsing completes | instance |
| item_depth | int | Depth level at which to trigger item_callback | instance |
| xml_attribs | bool | Whether to include XML attributes in output | instance |
| item_callback | callable | Function called when element at item_depth is fully parsed | instance |
| attr_prefix | str | Prefix string for attribute keys in output dictionary | instance |
| cdata_key | str | Key name for character data in output dictionary | instance |
| force_cdata | bool | Whether to force creation of dictionary with cdata_key for text-only elements | instance |
| cdata_separator | str | String used to join multiple character data segments | instance |
| postprocessor | callable or None | Optional function for transforming parsed data | instance |
| dict_constructor | callable | Function used to create dictionary objects (e.g., dict, OrderedDict) | instance |
| strip_whitespace | bool | Whether to strip leading/trailing whitespace from text content | instance |
| namespace_separator | str | Separator between namespace prefix and local name | instance |
| namespaces | dict or None | Mapping of namespace URIs to prefixes for namespace processing | instance |
| namespace_declarations | dict | Temporary storage for namespace declarations in current element | instance |
| force_list | bool, iterable, callable, or None | Configuration for forcing values into list structures | instance |
| comment_key | str | Key name for XML comments in output dictionary | instance |
Dependencies
xml.sax, xml.parsers.expat, defusedexpat
Required Imports
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
from xml.parsers import expat
Conditional/Optional Imports
These imports are only needed under specific conditions:
from defusedexpat import pyexpat as expat
Condition: when defusedexpat is available for secure XML parsing
Optional
from collections import OrderedDict as _dict
Condition: for maintaining element order in parsed dictionaries
Required (conditional)
from cStringIO import StringIO
Condition: Python 2.x only
Optional
from io import StringIO
Condition: Python 3.x only
Optional
Usage Example
from collections import OrderedDict
from xml.parsers import expat

# Create handler with custom configuration
handler = _DictSAXHandler(
    item_depth=0,
    xml_attribs=True,
    attr_prefix='@',
    cdata_key='#text',
    strip_whitespace=True,
    dict_constructor=OrderedDict
)

# Drive the handler with an expat parser. The handler is not an xml.sax
# ContentHandler (it defines no startDocument/setDocumentLocator), and its
# _attrs_to_dict expects attributes as a dict or a flat [name, value, ...]
# list, which is the form expat delivers.
parser = expat.ParserCreate()
parser.ordered_attributes = True
parser.StartElementHandler = handler.startElement
parser.EndElementHandler = handler.endElement
parser.CharacterDataHandler = handler.characters
with open('example.xml', 'rb') as xml_file:
    parser.ParseFile(xml_file)

# Access parsed result
result = handler.item


# Example with streaming callback
def process_item(path, item):
    print(f'Parsed item at {path}: {item}')
    return True  # Continue parsing


streaming_handler = _DictSAXHandler(
    item_depth=2,
    item_callback=process_item
)

# Example with namespace handling
ns_handler = _DictSAXHandler(
    namespaces={'http://example.com/ns': 'ex'},
    namespace_separator=':'
)

# Example with force_list
list_handler = _DictSAXHandler(
    force_list=['item', 'entry']  # Always make these keys lists
)
Best Practices
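- This handler's methods are meant to be registered as callbacks on an xml.parsers.expat parser (StartElementHandler, EndElementHandler, CharacterDataHandler, and optionally CommentHandler and StartNamespaceDeclHandler); it does not implement the full xml.sax ContentHandler interface, so it cannot be passed to make_parser().setContentHandler()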
- The handler maintains internal state (path, stack, data, item) that is modified during parsing - do not reuse the same handler instance for multiple documents
- Use item_depth and item_callback for memory-efficient streaming parsing of large XML documents
- The item attribute contains the final parsed result after parsing completes
- When using postprocessor, return None to skip an element or (key, value) tuple to transform it
- The force_list parameter is crucial for ensuring consistent structure when elements may appear once or multiple times
- Set xml_attribs=False if you don't need attributes in the output to simplify the resulting dictionary
- Use namespace_separator and namespaces parameters together for proper namespace handling
- The handler can raise ParsingInterrupted exception if item_callback returns False - ensure this exception is defined
- Character data is accumulated in the data list and joined with cdata_separator when the element closes
- The stack attribute maintains parent context during nested element parsing
- Path tracking (self.path) provides context for postprocessor and item_callback functions
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function _emit 70.6% similar
- function parse 63.2% similar
- function unparse 59.5% similar
- function _process_namespace 47.5% similar
- class SimpleDataHandle 46.0% similar