From 1e5ce815364ec2c4865a401d44fdc2cc413b3027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Wed, 8 Oct 2025 18:26:18 +0200 Subject: [PATCH] Adding test for XML Entities --- dapytains/tei/document.py | 11 ++++++----- tests/tei/xml_entity.xml | 20 ++++++++++++++++++++ tests/tei/xml_entity_tail.xml | 22 ++++++++++++++++++++++ tests/test_tei.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) create mode 100644 tests/tei/xml_entity.xml create mode 100644 tests/tei/xml_entity_tail.xml diff --git a/dapytains/tei/document.py b/dapytains/tei/document.py index c77ac8f..68934ec 100644 --- a/dapytains/tei/document.py +++ b/dapytains/tei/document.py @@ -7,6 +7,7 @@ from lxml.objectify import Element, SubElement, StringElement, ObjectifiedElement from lxml import objectify import re +from xml.sax.saxutils import unescape from dapytains.errors import UnknownTreeName COPY_UNTIL_END = -1 @@ -108,12 +109,12 @@ def _add_space_tail(element: ElementBase, node: saxonlib.PyXdmNode, processor: s if hasattr(element, "_setText"): element._setText(content) else: - element.text = content + element.text = unescape(content) if element.tail is None or len(element.tail) == 0: tail = _get_text(node, "following-sibling::node()[1]", processor=processor) if tail is not None and not tail.strip(): - element.tail = str(tail) + element.tail = unescape(tail) def _prune(node: saxonlib.PyXdmNode, milestone: str, processor: saxonlib.PySaxonProcessor) -> str: @@ -175,9 +176,9 @@ def copy_node( elif parent is not None: if not parent.getchildren(): if not isinstance(parent, (StringElement, ObjectifiedElement)): - parent.text = (parent.text or "") + element + parent.text = unescape((parent.text or "") + element) else: - parent.getchildren()[-1].tail = element + parent.getchildren()[-1].tail = unescape(element) return parent if node is None: @@ -258,7 +259,7 @@ def _treat_siblings( for node in next_nodes: if node.node_kind_str == "text": if not last_node.tail: - last_node.tail = _get_text(node, ".", processor=processor) + last_node.tail = unescape(_get_text(node, ".", processor=processor)) else: if xpath != "node()": last_node = copy_node( diff --git a/tests/tei/xml_entity.xml b/tests/tei/xml_entity.xml new file mode 100644 index 0000000..f172aa4 --- /dev/null +++ b/tests/tei/xml_entity.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + +
+

Lorem

+

& Ipsum

+

Dolorem

+
+ +
+
diff --git a/tests/tei/xml_entity_tail.xml b/tests/tei/xml_entity_tail.xml new file mode 100644 index 0000000..f237bc3 --- /dev/null +++ b/tests/tei/xml_entity_tail.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + +
+

+ Lorem + & Ipsum + Dolorem & +

+
+ +
+
diff --git a/tests/test_tei.py b/tests/test_tei.py index c7d3287..dd5fe6a 100644 --- a/tests/test_tei.py +++ b/tests/test_tei.py @@ -221,3 +221,34 @@ def test_passage_ranger_simple(): ' \n' ' \n' '') + + +def test_xml_entity(): + """Test that a single range passage matching works""" + doc = Document(f"{local_dir}/xml_entity.xml") + assert tostring( + doc.get_passage("2", "3"), encoding=str + ) == ('\n' + ' \n' + '
\n' + '

& Ipsum

\n' + '

Dolorem

\n' + '
\n' + ' \n' + '
\n' + '
') + doc = Document(f"{local_dir}/xml_entity_tail.xml") + assert tostring( + doc.get_passage("2", "3"), encoding=str + ) == ('\n' + ' \n' + '
\n' + '

\n' + ' & Ipsum\n' + ' Dolorem &\n' + '

\n' + '
\n' + ' \n' + '
\n' + '
') +