Source code for ocx_schema_parser.xparse

#  Copyright (c) 2023-2025. OCX Consortium https://3docx.org. See the LICENSE
"""xparse module."""

# System imports
from typing import Dict

# Third party imports
from loguru import logger
from lxml import etree
from lxml.etree import Element, XMLSyntaxError

# Application imports
from .xelement import LxmlElement


class LxmlParser:
    """A wrapper of the lxml etree document tree and parser.

    Attributes:
        _tree : The ``lxml.etree`` DOM. ``None`` until :meth:`parse` has
            succeeded at least once.
    """

    def __init__(self):
        # No document is loaded until parse() succeeds.
        self._tree: Element = None

    def parse(self, file: str, store_ids: bool = False) -> bool:
        """Parses an XML file.

        Args:
            file: The file name of the xml document to be parsed.
                The parser can only parse from a local file.
            store_ids: If set to True, the parser will create a hash table
                of the xml IDs

        Returns:
            True for success, False otherwise. On failure the error is
            logged and any previously parsed tree is left in place.
        """
        try:
            my_parser = etree.XMLParser(
                remove_comments=False,
                remove_blank_text=True,
                ns_clean=True,
                collect_ids=store_ids,
            )
            self._tree = etree.parse(file, parser=my_parser)
        except XMLSyntaxError as e:
            logger.error(e)
            return False
        except OSError:
            # loguru has no ``exc_info`` keyword: extra keyword arguments are
            # fed to ``str.format`` on the message. ``opt(exception=True)``
            # is the loguru way to attach the active traceback, and passing
            # the file name as a format argument keeps any braces in the
            # path from being interpreted as format fields.
            logger.opt(exception=True).error("Failed to open file {}", file)
            return False
        return True

    def get_root(self) -> Element:
        """The XML root.

        Returns:
            The XML root node, or ``None`` when no document has been
            parsed yet.
        """
        if self._tree is None:
            return None
        return self._tree.getroot()

    def _require_root(self) -> Element:
        """Return the root node, failing loudly when there is none.

        Raises:
            AssertionError: If the root node is ``None``. The check is only
                active when ``__debug__`` is true (i.e. not under ``-O``).
        """
        root = self.get_root()
        if __debug__:
            if root is None:
                raise AssertionError(f"{__name__}: The root node is None")
        return root

    def lxml_version(self) -> tuple:
        """lxml version tag.

        Returns:
            The value of ``etree.LXML_VERSION``, which lxml exposes as a
            tuple of integers rather than a string.
        """
        return etree.LXML_VERSION

    # NOTE: the ``doc_*`` accessors below read ``self._tree.docinfo`` and
    # therefore require a successful ``parse`` beforehand.

    def doc_public_id(self) -> str:
        """
        Returns:
            The XML document type
        """
        return self._tree.docinfo.public_id

    def doc_url(self) -> str:
        """
        Returns:
            The XML document url
        """
        return self._tree.docinfo.URL

    def doc_encoding(self) -> str:
        """
        Returns:
            The XML document encoding
        """
        return self._tree.docinfo.encoding

    def doc_root_name(self) -> str:
        """
        Returns:
            The XML document root name
        """
        return self._tree.docinfo.root_name

    def doc_system_url(self) -> str:
        """
        Returns:
            The XML document system URL
        """
        return self._tree.docinfo.system_url

    def doc_xml_version(self) -> str:
        """
        Returns:
            The XML document version
        """
        return self._tree.docinfo.xml_version

    def get_namespaces(self) -> Dict:
        """The dict of the defined namespaces of (prefix, namespace) as (key,value) pairs.

        Returns:
            (prefix, namespace) as (key,value) pairs

        Raises:
            AssertionError: If there is no root node (debug mode only).
        """
        return self._require_root().nsmap

    def get_target_namespace(self) -> str:
        """The target namespace of the schema.

        Returns:
            The value of the root node's ``targetNamespace`` attribute.

        Raises:
            AssertionError: If there is no root node (debug mode only).
        """
        return self._require_root().get("targetNamespace")

    def get_referenced_files(self) -> Dict:
        """The XML imports (xs:import tags).

        Returns:
            A dict of key, value pairs (namespace: location/URL) of all
            xs:import tags. If several imports share a namespace, the last
            one wins.

        Raises:
            AssertionError: If there is no root node (debug mode only).
        """
        root = self._require_root()
        references = LxmlElement.find_all_children_with_name(root, "import")
        return {ref.get("namespace"): ref.get("schemaLocation") for ref in references}