# Source code for magpie.xml_util
"""
Define a default XML parser that avoids XXE injection.

Package :mod:`lxml` is employed directly even though some linters (e.g.: ``bandit``) recommend using ``defusedxml``
instead, because that package's extension for ``lxml`` is marked as deprecated.

.. seealso::
    https://github.com/tiran/defusedxml#defusedxmllxml

To use the module, import it as if importing ``lxml.etree``:

.. code-block:: python

    from magpie.xml_util import XML  # ElementTree
    from magpie import xml_util
    data = xml_util.fromstring("<xml>content</xml>")
"""
from typing import TYPE_CHECKING
from lxml import etree as lxml_etree # nosec: B410 # flagged known issue, this is what the applied fix below is about
if TYPE_CHECKING:
from lxml.etree._FeedParser import _FeedParser as Parser # noqa # nosec: B410 # pylint: disable=W0212
[docs]
# Default parser shared by this module: tolerant to malformed input, and with entity resolution disabled.
XML_PARSER = lxml_etree.XMLParser(
    # Attempt to keep parsing when some characters are not correctly escaped, rather than failing.
    # This is only an attempt, there is no guarantee that recovery succeeds.
    # Based on:
    #   https://stackoverflow.com/a/57450722/5936364
    recover=True,
    # Security fix against XML external entity (XXE) injection.
    #   https://lxml.de/parsing.html#parser-options
    #   https://nvd.nist.gov/vuln/detail/CVE-2021-39371
    # Based on:
    #   https://github.com/geopython/pywps/pull/616
    resolve_entities=False,
)
[docs]
# Keep a local reference to the original function, which OWSLib employs by calling it directly.
_lxml_fromstring = lxml_etree.fromstring

# Define this type here so that code can use it for actual logic without repeating 'noqa'.
XML = lxml_etree._Element  # noqa # pylint: disable=W0212

# Names exposed so that this module can be imported as if importing 'lxml.etree'.
Element = lxml_etree.Element
ParseError = lxml_etree.ParseError
tostring = lxml_etree.tostring
[docs]
def fromstring(text, parser=XML_PARSER):
    # type: (str, Parser) -> XML
    """
    Parse :term:`XML` contents like :func:`lxml.etree.fromstring`, employing a secure parser by default.

    :param text: :term:`XML` contents to parse.
    :param parser: Parser handed to the underlying function, which defaults to :data:`XML_PARSER`.
    :returns: Result of the underlying :func:`lxml.etree.fromstring` call.
    """
    parsed = _lxml_fromstring(text, parser=parser)  # nosec: B410,B320 # safe use if using secure parser
    return parsed
[docs]
def strip_namespace(tree):
    # type: (XML) -> None
    """
    Remove the leading ``{namespace}`` part from the tag of every node of the :term:`XML` tree.

    Tags are rewritten in place on the nodes returned by ``tree.iter()``, and nothing is returned.
    Nodes whose tag is not a string are left untouched.
    """
    for element in tree.iter():
        try:
            tag = element.tag
            is_qualified = tag.startswith("{")
        except AttributeError:
            # tag is not a string (the node is a comment or similar), nothing to strip
            continue
        if not is_qualified:
            continue
        # keep only what follows the first closing brace of the '{namespace}' prefix
        element.tag = tag.split("}", 1)[1]