Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions dapytains/tei/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from lxml.objectify import Element, SubElement, StringElement, ObjectifiedElement
from lxml import objectify
import re
from xml.sax.saxutils import unescape
from dapytains.errors import UnknownTreeName

COPY_UNTIL_END = -1
Expand Down Expand Up @@ -108,12 +109,12 @@ def _add_space_tail(element: ElementBase, node: saxonlib.PyXdmNode, processor: s
if hasattr(element, "_setText"):
element._setText(content)
else:
element.text = content
element.text = unescape(content)

if element.tail is None or len(element.tail) == 0:
tail = _get_text(node, "following-sibling::node()[1]", processor=processor)
if tail is not None and not tail.strip():
element.tail = str(tail)
element.tail = unescape(tail)


def _prune(node: saxonlib.PyXdmNode, milestone: str, processor: saxonlib.PySaxonProcessor) -> str:
Expand Down Expand Up @@ -175,9 +176,9 @@ def copy_node(
elif parent is not None:
if not parent.getchildren():
if not isinstance(parent, (StringElement, ObjectifiedElement)):
parent.text = (parent.text or "") + element
parent.text = unescape((parent.text or "") + element)
else:
parent.getchildren()[-1].tail = element
parent.getchildren()[-1].tail = unescape(element)
return parent

if node is None:
Expand Down Expand Up @@ -258,7 +259,7 @@ def _treat_siblings(
for node in next_nodes:
if node.node_kind_str == "text":
if not last_node.tail:
last_node.tail = _get_text(node, ".", processor=processor)
last_node.tail = unescape(_get_text(node, ".", processor=processor))
else:
if xpath != "node()":
last_node = copy_node(
Expand Down
20 changes: 20 additions & 0 deletions tests/tei/xml_entity.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<encodingDesc>
<refsDecl>
<citeStructure unit="paragraph" match="//p" use="@n"/>
</refsDecl>
</encodingDesc>
</teiHeader>
<text>
<front>
</front>
<body>
<div>
<p n="1">Lorem</p>
<p n="2">&amp; Ipsum</p>
<p n="3">Dolorem</p>
</div>
</body>
</text>
</TEI>
22 changes: 22 additions & 0 deletions tests/tei/xml_entity_tail.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<encodingDesc>
<refsDecl>
<citeStructure unit="paragraph" match="//lb" use="@n"/>
</refsDecl>
</encodingDesc>
</teiHeader>
<text>
<front>
</front>
<body>
<div>
<p>
<lb n="1"/>Lorem
<lb n="2"/>&amp; Ipsum
<lb n="3"/>Dolorem &amp;
</p>
</div>
</body>
</text>
</TEI>
31 changes: 31 additions & 0 deletions tests/test_tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,34 @@ def test_passage_ranger_simple():
' </body>\n'
' </text>\n'
'</TEI>')


def test_xml_entity():
    """Check that XML entities are preserved when a passage is extracted.

    The passage from "2" to "3" is requested from two fixtures: one where
    ``&amp;`` sits in the text of ``<p>`` elements, and one where it sits in
    the text that follows ``<lb/>`` elements. In both cases the serialized
    passage must still contain ``&amp;``.
    """
    # Case 1: the entity is in element text.
    expected_in_text = (
        '<TEI xmlns="http://www.tei-c.org/ns/1.0"><text>\n'
        ' <body>\n'
        ' <div>\n'
        ' <p n="2">&amp; Ipsum</p>\n'
        ' <p n="3">Dolorem</p>\n'
        ' </div>\n'
        ' </body>\n'
        ' </text>\n'
        '</TEI>'
    )
    text_doc = Document(f"{local_dir}/xml_entity.xml")
    serialized_text = tostring(text_doc.get_passage("2", "3"), encoding=str)
    assert serialized_text == expected_in_text

    # Case 2: the entity is in the text following <lb/> elements.
    expected_in_tail = (
        '<TEI xmlns="http://www.tei-c.org/ns/1.0"><text>\n'
        ' <body>\n'
        ' <div>\n'
        ' <p>\n'
        ' <lb n="2"/>&amp; Ipsum\n'
        ' <lb n="3"/>Dolorem &amp;\n'
        ' </p>\n'
        ' </div>\n'
        ' </body>\n'
        ' </text>\n'
        '</TEI>'
    )
    tail_doc = Document(f"{local_dir}/xml_entity_tail.xml")
    serialized_tail = tostring(tail_doc.get_passage("2", "3"), encoding=str)
    assert serialized_tail == expected_in_tail

Loading