Skip to content

Commit 18ae04f

Browse files
Copilotlfoppiano
andcommitted
Add equation serialization to JSON and Markdown output
- Add formula handling in TEI2Markdown converter (_process_paragraph and _extract_fulltext)
- Add formula handling in TEI2LossyJSON converter (_process_div_with_nested_content)
- Formulas with labels are formatted as code blocks in Markdown
- Formulas without labels are formatted as inline code in Markdown
- JSON output includes formulas as separate entries with type='formula'
- Formulas maintain proper order between paragraphs
- Add comprehensive tests for equation serialization

Co-authored-by: lfoppiano <15426+lfoppiano@users.noreply.github.com>
1 parent 6c1dfc9 commit 18ae04f

File tree

3 files changed

+381
-20
lines changed

3 files changed

+381
-20
lines changed

grobid_client/format/TEI2LossyJSON.py

Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -730,25 +730,32 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
730730
# Generic handling - capitalize and format
731731
head_section = div_type.replace("_", " ").title()
732732

733-
# Process paragraphs in this div
734-
if len(direct_p_nodes) > 0:
735-
for id_p, p in enumerate(direct_p_nodes):
736-
paragraph_id = get_random_id(prefix="p_")
737-
738-
if passage_level == "sentence":
739-
for id_s, sentence in enumerate(p.find_all("s")):
740-
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
733+
# Process direct children of div in order to maintain proper sequence
734+
for child in div.children:
735+
if hasattr(child, 'name') and child.name:
736+
if child.name == "p":
737+
# Process paragraphs
738+
paragraph_id = get_random_id(prefix="p_")
739+
740+
if passage_level == "sentence":
741+
for id_s, sentence in enumerate(child.find_all("s")):
742+
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
743+
if self.validate_refs:
744+
for ref in struct['refs']:
745+
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
746+
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
747+
yield struct
748+
else:
749+
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
741750
if self.validate_refs:
742751
for ref in struct['refs']:
743752
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
744753
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
745754
yield struct
746-
else:
747-
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
748-
if self.validate_refs:
749-
for ref in struct['refs']:
750-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
751-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
755+
elif child.name == "formula":
756+
# Process formulas in their natural position
757+
formula_id = get_random_id(prefix="formula_")
758+
struct = get_formatted_formula(current_head_paragraph or head_paragraph, head_section, formula_id, child)
752759
yield struct
753760

754761
# Update head_paragraph for potential next div
@@ -930,6 +937,62 @@ def _clean_text_local(text: str) -> str:
930937
return passage
931938

932939

940+
def get_formatted_formula(head_paragraph, head_section, formula_id, element):
    """Build the JSON-ready dictionary describing one formula element.

    The text of any ``label`` child (typically the equation number) is kept
    apart from the formula body, so ``text`` holds only the equation itself.

    Args:
        head_paragraph: Value stored under ``head_paragraph``; omitted from
            the result when falsy.
        head_section: Value stored under ``head_section``; omitted from the
            result when falsy.
        formula_id: Value stored under ``id``.
        element: The formula node. It must expose ``children``, ``get`` and
            ``has_attr`` (the interface of a BeautifulSoup ``Tag``).

    Returns:
        A dict with the keys ``id``, ``type`` (always ``"formula"``), ``text``
        and ``coords``, plus ``label``, ``xml_id``, ``head_paragraph`` and
        ``head_section`` when the corresponding value is non-empty.
    """
    # Imported once per call instead of on every helper invocation.
    import html
    import re

    def _clean_text_local(text: str) -> str:
        # Collapse whitespace runs first, then decode leftover HTML entities.
        if not text:
            return ""
        return html.unescape(re.sub(r'\s+', ' ', text.strip()))

    body_parts = []
    label_text = ""
    for child in element.children:
        # getattr with a default works whether or not string nodes expose a
        # ``name`` attribute, so no node type has to be special-cased.
        if getattr(child, "name", None) == "label":
            label_text = _clean_text_local(child.get_text())
            continue
        # Prefer get_text() when the node offers it; fall back to the raw
        # string for plain text nodes that do not.
        get_text = getattr(child, "get_text", None)
        body_parts.append(get_text() if callable(get_text) else str(child))

    formula_text = _clean_text_local("".join(body_parts))

    # Skip blank segments so an empty attribute or a stray ';' yields no
    # bogus box instead of handing an empty coordinate list to box_to_dict.
    raw_coords = element.get("coords", "") if element.has_attr("coords") else ""
    coords = [
        box_to_dict(segment.split(","))
        for segment in (raw_coords or "").split(";")
        if segment.strip()
    ]

    formula_data = {
        "id": formula_id,
        "type": "formula",
        "text": formula_text,
        "coords": coords,
    }

    # Optional metadata is only emitted when it carries a value.
    if label_text:
        formula_data["label"] = label_text

    xml_id = element.get("xml:id")
    if xml_id:
        formula_data["xml_id"] = xml_id

    if head_paragraph:
        formula_data["head_paragraph"] = head_paragraph
    if head_section:
        formula_data["head_section"] = head_section

    return formula_data
994+
995+
933996
def xml_table_to_markdown(table_element):
934997
"""Convert XML table to markdown format."""
935998
if not table_element:

grobid_client/format/TEI2Markdown.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,19 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
213213
section_title = head.get_text().strip()
214214
fulltext_sections.append(f"### {section_title}\n")
215215

216-
# Get paragraphs
217-
paragraphs = div.find_all("p")
218-
for p in paragraphs:
219-
paragraph_text = self._process_paragraph(p)
220-
if paragraph_text.strip():
221-
fulltext_sections.append(f"{paragraph_text}\n\n")
216+
# Process direct children of div in order to maintain structure
217+
for child in div.children:
218+
if hasattr(child, 'name'):
219+
if child.name == "p":
220+
# Process paragraphs
221+
paragraph_text = self._process_paragraph(child)
222+
if paragraph_text.strip():
223+
fulltext_sections.append(f"{paragraph_text}\n\n")
224+
elif child.name == "formula":
225+
# Process formulas that are direct children of div
226+
formula_md = self._formula_to_markdown(child)
227+
if formula_md:
228+
fulltext_sections.append(f"{formula_md}\n\n")
222229

223230
return "".join(fulltext_sections)
224231

@@ -320,6 +327,11 @@ def _process_paragraph(self, p_element: Tag) -> str:
320327
# Handle references - keep the text but don't add special formatting
321328
ref_text = element.get_text()
322329
text_parts.append(ref_text)
330+
elif element.name == "formula":
331+
# Handle formulas/equations
332+
formula_md = self._formula_to_markdown(element)
333+
if formula_md:
334+
text_parts.append(f"\n{formula_md}\n")
323335
elif element.name == "figure":
324336
# Handle figures
325337
fig_desc = element.find("figDesc")
@@ -352,6 +364,35 @@ def _table_to_markdown(self, table_element: Tag) -> str:
352364

353365
return "\n".join(markdown_lines) if markdown_lines else ""
354366

367+
def _formula_to_markdown(self, formula_element: Tag) -> str:
    """Render a formula element as a Markdown code span or fenced block.

    The text of a ``label`` child is separated from the rest of the formula
    content. A formula that has a label is returned as a fenced code block
    with the label appended after the formula text; a formula without a
    label is returned as an inline code span. An empty string is returned
    when the formula has no text of its own.
    """
    body_chunks = []
    label = ""

    for node in formula_element.children:
        if isinstance(node, NavigableString):
            # Bare text directly inside the formula element.
            body_chunks.append(str(node))
            continue
        if node.name == "label":
            label = node.get_text().strip()
            continue
        # Any other nested element contributes its text content.
        body_chunks.append(node.get_text())

    body = "".join(body_chunks).strip()

    # Nothing to render without formula text, even if a label exists.
    if not body:
        return ""
    if not label:
        return f"`{body}`"
    return f"```\n{body} {label}\n```"
395+
355396
def _format_reference(self, bibl_struct: Tag, ref_num: int) -> str:
356397
"""
357398
Format a bibliographic reference with comprehensive TEI element handling.

0 commit comments

Comments
 (0)