Source code for dbtk.writers.xml

# dbtk/writers/xml.py
"""
XML writer for database results using lxml.
"""

import io
import logging
import re
import sys
from typing import Any, BinaryIO, List, Optional, TextIO, Tuple, Union
from pathlib import Path

try:
    from lxml import etree

    HAS_LXML = True
except ImportError:
    HAS_LXML = False

from .base import BaseWriter, BatchWriter
from ..utils import to_string

logger = logging.getLogger(__name__)


def _sanitize_element_name(name: str) -> str:
    """
    Sanitize column name to be valid XML element name.

    XML element names must start with a letter or underscore, and can only
    contain letters, digits, hyphens, underscores, and periods.

    Parameters
    ----------
    name : str
        Original column name.

    Returns
    -------
    str
        Valid XML element name
    """
    sanitized = re.sub(r'[^a-zA-Z0-9_.-]', '_', str(name))

    # Ensure it doesn't start with a number
    if sanitized and sanitized[0].isdigit():
        sanitized = 'col_' + sanitized

    return sanitized or 'unnamed'


class XMLWriter(BaseWriter):
    """
    XML writer that builds complete XML tree in memory.

    Best for small to medium datasets. For large datasets that don't fit
    in memory, use XMLStreamer instead.

    Parameters
    ----------
    data : Iterable[RecordLike]
        Data to write
    file : str, Path, TextIO, or BinaryIO, optional
        Output file or file handle. If None, writes to stdout.
    columns : List[str], optional
        Column names for list-of-lists data
    encoding : str, default 'utf-8'
        XML encoding declaration
    root_element : str, default 'data'
        Name of the root XML element
    record_element : str, default 'record'
        Name of each record element
    pretty : bool, default True
        Whether to format with indentation

    Examples
    --------
    >>> to_xml(cursor, 'users.xml')
    >>> to_xml(records, 'output.xml', root_element='users', record_element='user')
    """

    preserve_types = False

    def __init__(
        self,
        data=None,
        file: Optional[Union[str, Path, TextIO, BinaryIO]] = None,
        columns: Optional[List[str]] = None,
        encoding: str = 'utf-8',
        root_element: str = 'data',
        record_element: str = 'record',
        pretty: bool = True,
    ):
        """Initialize XML writer."""
        self.root_element = root_element
        self.record_element = record_element
        self.pretty = pretty

        # BOM encodings can break XML parsers
        if encoding and '-sig' in encoding:
            requested = encoding
            encoding = encoding.replace('-sig', '')
            logger.warning(
                'A BOM encoding (%s) is not supported by LXML and most parsers. '
                'Using %s instead.',
                requested,
                encoding,
            )

        super().__init__(data, file, columns, encoding)

        # Column name -> sanitized element name. Columns may not be known
        # yet, so tolerate an empty/None column list here; names that are
        # missing from the map are resolved on demand in _write_data.
        self._xml_columns = {
            col: _sanitize_element_name(col) for col in (self.columns or ())
        }

    def _write_data(self, file_obj: Union[TextIO, BinaryIO]) -> None:
        """
        Write XML data by building complete tree in memory.

        Parameters
        ----------
        file_obj : TextIO or BinaryIO
            Destination handle. Binary handles receive the encoded bytes;
            any other handle receives the document decoded to ``str``.
        """
        root = etree.Element(self.root_element)

        for record in self.data_iterator:
            record_elem = etree.SubElement(root, self.record_element)

            # Convert record to dict and add one child element per field
            for key, value in self._row_to_dict(record).items():
                xml_key = self._xml_columns.get(key)
                if xml_key is None:
                    # Key was not known at construction time; sanitize it
                    # now and cache the result instead of raising KeyError.
                    xml_key = self._xml_columns[key] = _sanitize_element_name(key)
                etree.SubElement(record_elem, xml_key).text = value

            self._row_num += 1

        xml_bytes = etree.tostring(
            root,
            encoding=self.encoding,
            xml_declaration=True,
            pretty_print=self.pretty,
        )

        # etree.tostring() returns bytes when an encoding is given. Hand them
        # to binary handles untouched; decode only for text handles, which
        # would otherwise reject bytes.
        handle_mode = getattr(file_obj, 'mode', None)
        is_binary = isinstance(file_obj, (io.RawIOBase, io.BufferedIOBase)) or (
            isinstance(handle_mode, str) and 'b' in handle_mode
        )
        if is_binary:
            file_obj.write(xml_bytes)
        else:
            file_obj.write(xml_bytes.decode(self.encoding))
class XMLStreamer(BatchWriter):
    """
    Streaming XML writer that writes records incrementally.

    Memory-efficient for large datasets. Writes XML elements as they arrive
    without building the entire tree in memory.

    Parameters
    ----------
    data : Iterable[RecordLike], optional
        Initial data. For streaming mode, use data=None.
    file : str, Path, or BinaryIO, optional
        Output file or binary file handle. Must be binary mode for streaming.
    columns : List[str], optional
        Column names for list-of-lists data
    encoding : str, default 'utf-8'
        Output encoding, also written into the XML declaration
    root_element : str, default 'data'
        Name of the root XML element
    record_element : str, default 'record'
        Name of each record element

    Examples
    --------
    Streaming mode::

        with open('output.xml', 'wb') as f:
            with XMLStreamer(data=None, file=f, root_element='data') as writer:
                for batch in surge.batched(records):
                    writer.write_batch(batch)

    Single-shot mode::

        XMLStreamer(data=records, file='output.xml').write()

    Notes
    -----
    - Requires lxml library
    - File must be opened in binary mode ('wb') for streaming
    - No pretty-printing (streaming writes compact XML)
    - More memory-efficient than XMLWriter for large datasets
    """

    def __init__(
        self,
        data=None,
        file: Optional[Union[str, Path, BinaryIO]] = None,
        columns: Optional[List[str]] = None,
        encoding: str = 'utf-8',
        root_element: str = 'data',
        record_element: str = 'record',
    ):
        """Initialize streaming XML writer."""
        # Set these BEFORE super().__init__() in case _lazy_init is called
        self.root_element = root_element
        self.record_element = record_element

        # XML streaming contexts (set up in _lazy_init)
        self._xmlfile_ctx = None
        self._xf = None
        self._root_ctx = None
        self._xml_columns = {}

        # BOM encodings can break XML parsers (same policy as XMLWriter)
        if encoding and '-sig' in encoding:
            requested = encoding
            encoding = encoding.replace('-sig', '')
            logger.warning(
                'A BOM encoding (%s) is not supported by LXML and most parsers. '
                'Using %s instead.',
                requested,
                encoding,
            )

        # Kept on the instance so _lazy_init can pass it to lxml no matter
        # when the parent class triggers initialization.
        self._xml_encoding = encoding or 'utf-8'

        super().__init__(
            data=data,
            file=file,
            columns=columns,
            encoding=encoding,
            preserve_types=True,  # values are converted to text in _write_data
        )

    def _open_file_handle(self, mode: str = 'wb') -> Tuple[BinaryIO, bool]:
        """
        Override to use binary mode and validate binary streams.

        Parameters
        ----------
        mode : str, default 'wb'
            Accepted for signature compatibility; 'wb' is always used.

        Returns
        -------
        Tuple[BinaryIO, bool]
            (file_handle, should_close_flag)

        Raises
        ------
        ValueError
            If a text stream is provided instead of binary
        """
        if self.file is None:
            return sys.stdout.buffer, False

        if hasattr(self.file, 'write'):
            # Reject every text stream (TextIOWrapper, StringIO, ...): lxml
            # writes bytes and would fail later with a less helpful error.
            if isinstance(self.file, io.TextIOBase):
                raise ValueError(
                    f"XMLStreamer requires a binary file handle, got {type(self.file).__name__}. "
                    "Open file in 'wb' mode or use file.buffer"
                )
            # Some file-like objects expose a non-string ``mode``; only a
            # string mode can be inspected for the 'b' flag.
            file_mode = getattr(self.file, 'mode', None)
            if isinstance(file_mode, str) and 'b' not in file_mode:
                raise ValueError(
                    f"XMLStreamer requires binary mode, file opened in '{file_mode}' mode. "
                    "Use 'wb' mode instead."
                )

        return super()._open_file_handle('wb')

    def _lazy_init(self, data) -> None:
        """
        Set up columns and XML streaming contexts on first use.

        Parameters
        ----------
        data : Iterable[RecordLike]
            First batch of data
        """
        if self._initialized:
            return

        # Parent handles columns and data_iterator
        super()._lazy_init(data)

        self._xml_columns = {
            col: _sanitize_element_name(col) for col in (self.columns or ())
        }

        # Set up XML streaming contexts. The encoding is passed explicitly
        # so the ``encoding`` argument is honored, and the declaration is
        # written so parsers know which encoding the bytes use.
        self._xmlfile_ctx = etree.xmlfile(self._file_obj, encoding=self._xml_encoding)
        self._xf = self._xmlfile_ctx.__enter__()
        self._xf.write_declaration()
        self._root_ctx = self._xf.element(self.root_element)
        self._root_ctx.__enter__()

        # Newline after opening root tag
        self._xf.write('\n')

    def _write_data(self, file_obj: BinaryIO) -> None:
        """
        Write XML records to stream.

        Parameters
        ----------
        file_obj : BinaryIO
            Binary file handle (managed by parent class)
        """
        if not self._initialized:
            self._lazy_init(self.data_iterator)

        for record in self.data_iterator:
            record_dict = self._row_to_dict(record)

            with self._xf.element(self.record_element):
                for key, value in record_dict.items():
                    xml_key = self._xml_columns.get(key)
                    if xml_key is None:
                        # Unknown key: sanitize on demand and cache it
                        xml_key = self._xml_columns[key] = _sanitize_element_name(key)

                    # The incremental writer only accepts strings/elements,
                    # so native values (int, date, ...) are converted first.
                    if value is None or isinstance(value, (str, bytes)):
                        text = value
                    else:
                        text = to_string(value)

                    with self._xf.element(xml_key):
                        if text:  # Only write non-empty values
                            self._xf.write(text)

            # Newline after each record
            self._xf.write('\n')
            self._row_num += 1

        # Flush after writing
        self._xf.flush()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close XML contexts, then close file."""
        root_ctx, xmlfile_ctx = self._root_ctx, self._xmlfile_ctx
        self._root_ctx = self._xmlfile_ctx = self._xf = None

        try:
            try:
                if root_ctx:
                    root_ctx.__exit__(exc_type, exc_val, exc_tb)
            finally:
                if xmlfile_ctx:
                    xmlfile_ctx.__exit__(exc_type, exc_val, exc_tb)
        except BaseException:
            # Closing the XML layer failed: still let the parent release the
            # file handle, then surface the original error.
            super().__exit__(exc_type, exc_val, exc_tb)
            raise

        # Let parent close the file
        return super().__exit__(exc_type, exc_val, exc_tb)
def to_xml(
    data,
    file: Optional[Union[str, Path]] = None,
    encoding: str = 'utf-8',
    root_element: str = 'data',
    record_element: str = 'record',
    stream: bool = False,
    pretty: Optional[bool] = None,
) -> None:
    """
    Export cursor or result set to XML file.

    Parameters
    ----------
    data : Iterable[RecordLike]
        Cursor object or list of records
    file : str or Path, optional
        Output file. If None, writes to stdout.
    encoding : str, default 'utf-8'
        XML encoding declaration
    root_element : str, default 'data'
        Name of the root XML element
    record_element : str, default 'record'
        Name of each record element
    stream : bool, default False
        Whether to use streaming mode (reduces memory usage for large datasets)
    pretty : bool, optional
        Whether to format with indentation. Defaults to True for tree mode.
        Streaming mode never pretty-prints; passing True together with
        ``stream=True`` logs a warning and is otherwise ignored.

    Examples
    --------
    Write to file::

        to_xml(cursor, 'users.xml')

    Write to stdout::

        to_xml(cursor)

    Custom element names with streaming::

        to_xml(cursor, 'active_users.xml',
               root_element='users', record_element='user', stream=True)

    Notes
    -----
    - Tree mode (stream=False): Builds complete XML tree in memory, supports pretty printing
    - Streaming mode (stream=True): Memory-efficient, writes incrementally, no pretty printing
    - For large datasets (>100K rows), use stream=True
    - NOTE(review): earlier docs mention stdout output being limited to
      20 rows; no such limit is applied in this function, presumably the
      writer base class handles it. Confirm.
    """
    # Arguments shared by both writer implementations
    common = dict(
        data=data,
        file=file,
        encoding=encoding,
        root_element=root_element,
        record_element=record_element,
    )

    if stream:
        if pretty:
            # Tell the caller instead of silently dropping the request
            logger.warning(
                'pretty=True is ignored when stream=True; '
                'streaming mode always writes compact XML.'
            )
        writer_cm = XMLStreamer(**common)
    else:
        # An unspecified ``pretty`` means pretty-print in tree mode
        writer_cm = XMLWriter(pretty=True if pretty is None else pretty, **common)

    with writer_cm as writer:
        writer.write()
def check_dependencies():
    """Log an error when the optional lxml dependency could not be imported."""
    if HAS_LXML:
        # lxml imported successfully; nothing to report
        return

    logger.error('lxml is required for XML support. Install with "pip install lxml".')


# Run the check once, when the module is imported
check_dependencies()