Add equation serialization to JSON and Markdown output

Copilot · lfoppiano · Copilot · commit 18ae04fe2739 · 2025-11-16T22:16:57.000Z
- Add formula handling in TEI2Markdown converter (_process_paragraph and _extract_fulltext)
- Add formula handling in TEI2LossyJSON converter (_process_div_with_nested_content)
- Formulas with labels are formatted as code blocks in Markdown
- Formulas without labels are formatted as inline code in Markdown
- JSON output includes formulas as separate entries with type='formula'
- Formulas maintain proper order between paragraphs
- Add comprehensive tests for equation serialization

Co-authored-by: lfoppiano <15426+lfoppiano@users.noreply.github.com>
diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py
@@ -730,25 +730,32 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
                     # Generic handling - capitalize and format
                     head_section = div_type.replace("_", " ").title()
 
-        # Process paragraphs in this div
-        if len(direct_p_nodes) > 0:
-            for id_p, p in enumerate(direct_p_nodes):
-                paragraph_id = get_random_id(prefix="p_")
-
-                if passage_level == "sentence":
-                    for id_s, sentence in enumerate(p.find_all("s")):
-                        struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
+        # Process direct children of div in order to maintain proper sequence
+        for child in div.children:
+            if hasattr(child, 'name') and child.name:
+                if child.name == "p":
+                    # Process paragraphs
+                    paragraph_id = get_random_id(prefix="p_")
+                    
+                    if passage_level == "sentence":
+                        for id_s, sentence in enumerate(child.find_all("s")):
+                            struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
+                            if self.validate_refs:
+                                for ref in struct['refs']:
+                                    assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
+                                    assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
+                            yield struct
+                    else:
+                        struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
                         if self.validate_refs:
                             for ref in struct['refs']:
                                 assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
                                 assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
                         yield struct
-                else:
-                    struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
-                    if self.validate_refs:
-                        for ref in struct['refs']:
-                            assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
-                            assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
+                elif child.name == "formula":
+                    # Process formulas in their natural position
+                    formula_id = get_random_id(prefix="formula_")
+                    struct = get_formatted_formula(current_head_paragraph or head_paragraph, head_section, formula_id, child)
                     yield struct
 
         # Update head_paragraph for potential next div
@@ -930,6 +937,62 @@ def _clean_text_local(text: str) -> str:
     return passage
 
 
+def get_formatted_formula(head_paragraph, head_section, formula_id, element):
+    """Build the JSON entry for a TEI <formula> element.
+
+    Args:
+        head_paragraph: Heading stored under "head_paragraph" when truthy.
+        head_section: Section name stored under "head_section" when truthy.
+        formula_id: Identifier stored under "id".
+        element: Parsed <formula> tag; a <label> child becomes "label".
+
+    Returns:
+        A dict with "id", "type" ("formula"), "text" and "coords", plus
+        "label", "xml_id", "head_paragraph" and "head_section" when set.
+    """
+    def _clean_text_local(text: str) -> str:
+        # Collapse whitespace runs, then decode HTML entities.
+        if not text:
+            return ""
+        import re
+        import html
+        text = re.sub(r'\s+', ' ', text.strip())
+        text = html.unescape(text)
+        return text
+
+    formula_parts = []
+    label_text = ""
+    for child in element.children:
+        # Only tags have a truthy name; hasattr() alone lets text nodes in.
+        if getattr(child, 'name', None):
+            if child.name == "label":
+                label_text = _clean_text_local(child.get_text())
+            else:
+                formula_parts.append(child.get_text())
+        else:
+            formula_parts.append(str(child))
+
+    formula_data = {
+        "id": formula_id,
+        "type": "formula",
+        "text": _clean_text_local("".join(formula_parts)),
+        "coords": [
+            box_to_dict(coord.split(","))
+            for coord in element.get("coords", "").split(";")
+        ] if element.has_attr("coords") else []
+    }
+    if label_text:
+        formula_data["label"] = label_text
+    xml_id = element.get("xml:id")
+    if xml_id:
+        formula_data["xml_id"] = xml_id
+    if head_paragraph:
+        formula_data["head_paragraph"] = head_paragraph
+    if head_section:
+        formula_data["head_section"] = head_section
+    return formula_data
+
+
 def xml_table_to_markdown(table_element):
     """Convert XML table to markdown format."""
     if not table_element:
diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py
@@ -213,12 +213,19 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
                 section_title = head.get_text().strip()
                 fulltext_sections.append(f"### {section_title}\n")
 
-            # Get paragraphs
-            paragraphs = div.find_all("p")
-            for p in paragraphs:
-                paragraph_text = self._process_paragraph(p)
-                if paragraph_text.strip():
-                    fulltext_sections.append(f"{paragraph_text}\n\n")
+            # Process direct children of div in order to maintain structure
+            for child in div.children:
+                if hasattr(child, 'name'):
+                    if child.name == "p":
+                        # Process paragraphs
+                        paragraph_text = self._process_paragraph(child)
+                        if paragraph_text.strip():
+                            fulltext_sections.append(f"{paragraph_text}\n\n")
+                    elif child.name == "formula":
+                        # Process formulas that are direct children of div
+                        formula_md = self._formula_to_markdown(child)
+                        if formula_md:
+                            fulltext_sections.append(f"{formula_md}\n\n")
         
         return "".join(fulltext_sections)
 
@@ -320,6 +327,11 @@ def _process_paragraph(self, p_element: Tag) -> str:
                 # Handle references - keep the text but don't add special formatting
                 ref_text = element.get_text()
                 text_parts.append(ref_text)
+            elif element.name == "formula":
+                # Handle formulas/equations
+                formula_md = self._formula_to_markdown(element)
+                if formula_md:
+                    text_parts.append(f"\n{formula_md}\n")
             elif element.name == "figure":
                 # Handle figures
                 fig_desc = element.find("figDesc")
@@ -352,6 +364,35 @@ def _table_to_markdown(self, table_element: Tag) -> str:
         
         return "\n".join(markdown_lines) if markdown_lines else ""
 
+    def _formula_to_markdown(self, formula_element: Tag) -> str:
+        """Render a TEI formula element as a Markdown code span or block.
+
+        The text of a ``label`` child is kept apart from the formula body.
+        A labelled formula becomes a fenced block ending with its label;
+        an unlabelled one becomes inline code. Returns "" when the body
+        is empty after stripping.
+        """
+        body_parts = []
+        label_text = ""
+
+        for node in formula_element.children:
+            if isinstance(node, NavigableString):
+                # Bare text sitting directly inside the formula
+                body_parts.append(str(node))
+            elif node.name == "label":
+                label_text = node.get_text().strip()
+            else:
+                # Any other nested element contributes its text content
+                body_parts.append(node.get_text())
+
+        formula_text = "".join(body_parts).strip()
+        if not formula_text:
+            return ""
+        if not label_text:
+            return f"`{formula_text}`"
+        # A label (typically the equation number) selects the fenced block
+        return f"```\n{formula_text}  {label_text}\n```"
+
     def _format_reference(self, bibl_struct: Tag, ref_num: int) -> str:
         """
         Format a bibliographic reference with comprehensive TEI element handling.
diff --git a/tests/test_equation_serialization.py b/tests/test_equation_serialization.py