# dbtk/writers/xml.py
"""
XML writer for database results using lxml.
"""
import io
import logging
import re
import sys
from typing import Any, BinaryIO, List, Optional, TextIO, Tuple, Union
from pathlib import Path
try:
from lxml import etree
HAS_LXML = True
except ImportError:
HAS_LXML = False
from .base import BaseWriter, BatchWriter
from ..utils import to_string
logger = logging.getLogger(__name__)
def _sanitize_element_name(name: str) -> str:
"""
Sanitize column name to be valid XML element name.
XML element names must start with a letter or underscore, and can only
contain letters, digits, hyphens, underscores, and periods.
Parameters
----------
name : str
Original column name.
Returns
-------
str
Valid XML element name
"""
sanitized = re.sub(r'[^a-zA-Z0-9_.-]', '_', str(name))
# Ensure it doesn't start with a number
if sanitized and sanitized[0].isdigit():
sanitized = 'col_' + sanitized
return sanitized or 'unnamed'
[docs]
class XMLWriter(BaseWriter):
"""
XML writer that builds complete XML tree in memory.
Best for small to medium datasets. For large datasets that don't fit
in memory, use XMLStreamer instead.
Parameters
----------
data : Iterable[RecordLike]
Data to write
file : str, Path, TextIO, or BinaryIO, optional
Output file or file handle. If None, writes to stdout.
columns : List[str], optional
Column names for list-of-lists data
encoding : str, default 'utf-8'
XML encoding declaration
root_element : str, default 'data'
Name of the root XML element
record_element : str, default 'record'
Name of each record element
pretty : bool, default True
Whether to format with indentation
Examples
--------
>>> to_xml(cursor, 'users.xml')
>>> to_xml(records, 'output.xml', root_element='users', record_element='user')
"""
preserve_types = False
[docs]
def __init__(
self,
data=None,
file: Optional[Union[str, Path, TextIO, BinaryIO]] = None,
columns: Optional[List[str]] = None,
encoding: str = 'utf-8',
root_element: str = 'data',
record_element: str = 'record',
pretty: bool = True,
):
"""Initialize XML writer."""
self.root_element = root_element
self.record_element = record_element
self.pretty = pretty
# BOM encodings can break XML parsers
if encoding and '-sig' in encoding:
encoding = encoding.replace('-sig', '')
logger.warning(f'A BOM encoding ({encoding}-sig) is not supported by LXML and most parsers. Using {encoding} instead.')
super().__init__(data, file, columns, encoding)
self._xml_columns = {col: _sanitize_element_name(col) for col in self.columns}
def _write_data(self, file_obj: Union[TextIO, BinaryIO]) -> None:
"""Write XML data by building complete tree in memory."""
root = etree.Element(self.root_element)
for record in self.data_iterator:
record_elem = etree.SubElement(root, self.record_element)
# Convert record to dict and prepare for XML
record_dict = self._row_to_dict(record)
# Add elements for each field
for key, value in record_dict.items():
xml_key = self._xml_columns[key]
elem = etree.SubElement(record_elem, xml_key)
elem.text = value
self._row_num += 1
xml_str = etree.tostring(root, encoding=self.encoding, xml_declaration=True, pretty_print=self.pretty)
# Write output
file_obj.write(xml_str.decode(self.encoding))
[docs]
class XMLStreamer(BatchWriter):
"""
Streaming XML writer that writes records incrementally.
Memory-efficient for large datasets. Writes XML elements as they arrive
without building the entire tree in memory.
Parameters
----------
data : Iterable[RecordLike], optional
Initial data. For streaming mode, use data=None.
file : str, Path, or BinaryIO, optional
Output file or binary file handle. Must be binary mode for streaming.
columns : List[str], optional
Column names for list-of-lists data
encoding : str, default 'utf-8'
XML encoding declaration
root_element : str, default 'data'
Name of the root XML element
record_element : str, default 'record'
Name of each record element
Examples
--------
Streaming mode::
with open('output.xml', 'wb') as f:
with XMLStreamer(data=None, file=f, root_element='data') as writer:
for batch in surge.batched(records):
writer.write_batch(batch)
Single-shot mode::
XMLStreamer(data=records, file='output.xml').write()
Notes
-----
- Requires lxml library
- File must be opened in binary mode ('wb') for streaming
- No pretty-printing (streaming writes compact XML)
- More memory-efficient than XMLWriter for large datasets
"""
[docs]
def __init__(
self,
data=None,
file: Optional[Union[str, Path, BinaryIO]] = None,
columns: Optional[List[str]] = None,
encoding: str = 'utf-8',
root_element: str = 'data',
record_element: str = 'record',
):
"""Initialize streaming XML writer."""
# Set these BEFORE super().__init__() in case _lazy_init is called
self.root_element = root_element
self.record_element = record_element
# XML streaming contexts (set up in _lazy_init)
self._xmlfile_ctx = None
self._xf = None
self._root_ctx = None
self._xml_columns = {}
super().__init__(
data=data,
file=file,
columns=columns,
encoding=encoding,
preserve_types=True, # We'll convert in _prepare_record_for_xml
)
def _open_file_handle(self, mode: str = 'wb') -> Tuple[BinaryIO, bool]:
"""
Override to use binary mode and validate binary streams.
Parameters
----------
mode : str, default 'wb'
File open mode (must be binary)
Returns
-------
Tuple[BinaryIO, bool]
(file_handle, should_close_flag)
Raises
------
ValueError
If a text stream is provided instead of binary
"""
if self.file is None:
return sys.stdout.buffer, False
if hasattr(self.file, 'write'):
# Validate it's a binary stream
if isinstance(self.file, io.TextIOWrapper):
raise ValueError(
"XMLStreamer requires a binary file handle, got TextIOWrapper. "
"Open file in 'wb' mode or use file.buffer"
)
if hasattr(self.file, 'mode') and 'b' not in self.file.mode:
raise ValueError(
f"XMLStreamer requires binary mode, file opened in '{self.file.mode}' mode. "
"Use 'wb' mode instead."
)
return super()._open_file_handle('wb')
def _lazy_init(self, data) -> None:
"""
Set up columns and XML streaming contexts on first use.
Parameters
----------
data : Iterable[RecordLike]
First batch of data
"""
if self._initialized:
return
# Parent handles columns and data_iterator
super()._lazy_init(data)
self._xml_columns = {col: _sanitize_element_name(col) for col in self.columns}
# Set up XML streaming contexts
self._xmlfile_ctx = etree.xmlfile(self._file_obj)
self._xf = self._xmlfile_ctx.__enter__()
self._root_ctx = self._xf.element(self.root_element)
self._root_ctx.__enter__()
# Newline after opening root tag
self._xf.write('\n')
def _write_data(self, file_obj: BinaryIO) -> None:
"""
Write XML records to stream.
Parameters
----------
file_obj : BinaryIO
Binary file handle (managed by parent class)
"""
if not self._initialized:
self._lazy_init(self.data_iterator)
for record in self.data_iterator:
record_dict = self._row_to_dict(record)
with self._xf.element(self.record_element):
for key, value in record_dict.items():
xml_key = self._xml_columns[key]
with self._xf.element(xml_key):
if value != '': # Only write non-empty values
self._xf.write(value)
# Newline after each record
self._xf.write('\n')
self._row_num += 1
# Flush after writing
self._xf.flush()
def __exit__(self, exc_type, exc_val, exc_tb):
"""Close XML contexts, then close file."""
# Close XML contexts
if self._root_ctx:
self._root_ctx.__exit__(exc_type, exc_val, exc_tb)
self._root_ctx = None
if self._xmlfile_ctx:
self._xmlfile_ctx.__exit__(exc_type, exc_val, exc_tb)
self._xmlfile_ctx = None
# Let parent close the file
return super().__exit__(exc_type, exc_val, exc_tb)
[docs]
def to_xml(
data,
file: Optional[Union[str, Path]] = None,
encoding: str = 'utf-8',
root_element: str = 'data',
record_element: str = 'record',
stream: bool = False,
pretty: bool = None,
) -> None:
"""
Export cursor or result set to XML file.
Parameters
----------
data : Iterable[RecordLike]
Cursor object or list of records
file : str or Path, optional
Output file. If None, writes to stdout (limited to 20 rows)
encoding : str, default 'utf-8'
XML encoding declaration
root_element : str, default 'data'
Name of the root XML element
record_element : str, default 'record'
Name of each record element
stream : bool, default False
Whether to use streaming mode (reduces memory usage for large datasets)
pretty : bool, optional
Whether to format with indentation. Defaults to True for tree mode,
False for streaming mode.
Examples
--------
Write to file::
to_xml(cursor, 'users.xml')
Write to stdout (limited to 20 rows)::
to_xml(cursor)
Custom element names with streaming::
to_xml(cursor, 'active_users.xml',
root_element='users',
record_element='user',
stream=True)
Notes
-----
- Tree mode (stream=False): Builds complete XML tree in memory, supports pretty printing
- Streaming mode (stream=True): Memory-efficient, writes incrementally, no pretty printing
- For large datasets (>100K rows), use stream=True
"""
if pretty is None:
pretty = not stream
if stream:
with XMLStreamer(
data=data,
file=file,
encoding=encoding,
root_element=root_element,
record_element=record_element,
) as writer:
writer.write()
else:
with XMLWriter(
data=data,
file=file,
encoding=encoding,
root_element=root_element,
record_element=record_element,
pretty=pretty,
) as writer:
writer.write()
[docs]
def check_dependencies():
"""Check for optional dependencies and issue warnings if missing."""
if not HAS_LXML:
logger.error('lxml is required for XML support. Install with "pip install lxml".')
check_dependencies()