From 6c1dfc91b24f02b5cbb6766863e99b71e657ba08 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 16 Nov 2025 22:07:01 +0000 Subject: [PATCH 1/2] Initial plan From 18ae04fe27394993c1d687a38d9f990022a2415e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 16 Nov 2025 22:16:57 +0000 Subject: [PATCH 2/2] Add equation serialization to JSON and Markdown output - Add formula handling in TEI2Markdown converter (_process_paragraph and _extract_fulltext) - Add formula handling in TEI2LossyJSON converter (_process_div_with_nested_content) - Formulas with labels are formatted as code blocks in Markdown - Formulas without labels are formatted as inline code in Markdown - JSON output includes formulas as separate entries with type='formula' - Formulas maintain proper order between paragraphs - Add comprehensive tests for equation serialization Co-authored-by: lfoppiano <15426+lfoppiano@users.noreply.github.com> --- grobid_client/format/TEI2LossyJSON.py | 91 +++++++-- grobid_client/format/TEI2Markdown.py | 53 +++++- tests/test_equation_serialization.py | 257 ++++++++++++++++++++++++++ 3 files changed, 381 insertions(+), 20 deletions(-) create mode 100644 tests/test_equation_serialization.py diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index 76154ee..60d4ef2 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -730,25 +730,32 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa # Generic handling - capitalize and format head_section = div_type.replace("_", " ").title() - # Process paragraphs in this div - if len(direct_p_nodes) > 0: - for id_p, p in enumerate(direct_p_nodes): - paragraph_id = get_random_id(prefix="p_") - - if passage_level == "sentence": - for id_s, sentence in enumerate(p.find_all("s")): - struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) + # Process direct children of div in order to maintain proper sequence + for child in div.children: + if hasattr(child, 'name') and child.name: + if child.name == "p": + # Process paragraphs + paragraph_id = get_random_id(prefix="p_") + + if passage_level == "sentence": + for id_s, sentence in enumerate(child.find_all("s")): + struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) + if self.validate_refs: + for ref in struct['refs']: + assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] + assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] + yield struct + else: + struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child) if self.validate_refs: for ref in struct['refs']: assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] yield struct - else: - struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p) - if self.validate_refs: - for ref in struct['refs']: - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] + elif child.name == "formula": + # Process formulas in their natural position + formula_id = get_random_id(prefix="formula_") + struct = get_formatted_formula(current_head_paragraph or head_paragraph, head_section, formula_id, child) yield struct # Update head_paragraph for potential next div @@ -930,6 +937,62 @@ def _clean_text_local(text: str) -> str: return passage +def get_formatted_formula(head_paragraph, head_section, formula_id, element): + """Format a formula element with metadata.""" + # Import the clean_text method + def _clean_text_local(text: str) -> str: + if not text: + return "" + import re + import html + text = re.sub(r'\s+', ' ', text.strip()) + text = html.unescape(text) + return text + + # Extract formula text and label separately + formula_text = "" + label_text = "" + + for child in element.children: + if hasattr(child, 'name'): + if child.name == "label": + label_text = _clean_text_local(child.get_text()) + else: + formula_text += child.get_text() + else: + # NavigableString - direct text content + formula_text += str(child) + + formula_text = _clean_text_local(formula_text) + + # Create the formula structure + formula_data = { + "id": formula_id, + "type": "formula", + "text": formula_text, + "coords": [ + box_to_dict(coord.split(",")) + for coord in element.get("coords", "").split(";") + ] if element.has_attr("coords") else [] + } + + # Add label if present + if label_text: + formula_data["label"] = label_text + + # Add xml:id if present + xml_id = element.get("xml:id") + if xml_id: + formula_data["xml_id"] = xml_id + + if head_paragraph: + formula_data["head_paragraph"] = head_paragraph + if head_section: + formula_data["head_section"] = head_section + + return formula_data + + def xml_table_to_markdown(table_element): """Convert XML table to markdown format.""" if not table_element: diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py index 167231e..1b83ea0 100644 --- a/grobid_client/format/TEI2Markdown.py +++ b/grobid_client/format/TEI2Markdown.py @@ -213,12 +213,19 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str: section_title = head.get_text().strip() fulltext_sections.append(f"### {section_title}\n") - # Get paragraphs - paragraphs = div.find_all("p") - for p in paragraphs: - paragraph_text = self._process_paragraph(p) - if paragraph_text.strip(): - fulltext_sections.append(f"{paragraph_text}\n\n") + # Process direct children of div in order to maintain structure + for child in div.children: + if hasattr(child, 'name'): + if child.name == "p": + # Process paragraphs + paragraph_text = self._process_paragraph(child) + if paragraph_text.strip(): + fulltext_sections.append(f"{paragraph_text}\n\n") + elif child.name == "formula": + # Process formulas that are direct children of div + formula_md = self._formula_to_markdown(child) + if formula_md: + fulltext_sections.append(f"{formula_md}\n\n") return "".join(fulltext_sections) @@ -320,6 +327,11 @@ def _process_paragraph(self, p_element: Tag) -> str: # Handle references - keep the text but don't add special formatting ref_text = element.get_text() text_parts.append(ref_text) + elif element.name == "formula": + # Handle formulas/equations + formula_md = self._formula_to_markdown(element) + if formula_md: + text_parts.append(f"\n{formula_md}\n") elif element.name == "figure": # Handle figures fig_desc = element.find("figDesc") @@ -352,6 +364,35 @@ def _table_to_markdown(self, table_element: Tag) -> str: return "\n".join(markdown_lines) if markdown_lines else "" + def _formula_to_markdown(self, formula_element: Tag) -> str: + """Convert a formula element to markdown format.""" + # Extract the formula content (text without label) + formula_text = "" + label_text = "" + + # Get formula text, excluding the label + for child in formula_element.children: + if isinstance(child, NavigableString): + formula_text += str(child) + elif child.name == "label": + label_text = child.get_text().strip() + else: + # For other elements within formula, get their text + formula_text += child.get_text() + + formula_text = formula_text.strip() + + # Format as inline code or block depending on content + if formula_text: + if label_text: + # If there's a label (equation number), format as block equation + return f"```\n{formula_text} {label_text}\n```" + else: + # Otherwise, format as inline code + return f"`{formula_text}`" + + return "" + def _format_reference(self, bibl_struct: Tag, ref_num: int) -> str: """ Format a bibliographic reference with comprehensive TEI element handling. diff --git a/tests/test_equation_serialization.py b/tests/test_equation_serialization.py new file mode 100644 index 0000000..ced0f9b --- /dev/null +++ b/tests/test_equation_serialization.py @@ -0,0 +1,257 @@ +""" +Unit tests for equation/formula serialization in TEI to JSON and Markdown conversions. +""" +import os +from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter +from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter +from tests.resources import TEST_DATA_PATH + + +class TestEquationSerialization: + """Test cases for equation/formula serialization in conversions.""" + + def test_formulas_in_markdown_output(self): + """Test that formulas are included in Markdown output.""" + # Use test file known to contain formulas + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + assert os.path.exists(tei_file), f"Test file should exist: {tei_file}" + + # Convert to Markdown + converter = TEI2MarkdownConverter() + markdown_output = converter.convert_tei_file(tei_file) + + # Verify conversion succeeded + assert markdown_output is not None, "Markdown conversion should not return None" + assert isinstance(markdown_output, str), "Markdown should be a string" + assert len(markdown_output) > 0, "Markdown should have content" + + # Check that formulas are present (they should be in code blocks) + assert '```' in markdown_output, "Formulas should be formatted as code blocks" + + # Check for specific formula content from the test file + # The test file has formulas with "Fext" and equation numbers + assert 'Fext' in markdown_output, "Formula variables should appear in output" + assert 'ð1Þ' in markdown_output or '(1)' in markdown_output, "Equation labels should appear" + + # Count code blocks (each formula uses ``` for opening and closing) + code_block_count = markdown_output.count('```') // 2 + assert code_block_count >= 2, "Should have at least 2 formulas in test file" + + def test_formulas_in_json_output(self): + """Test that formulas are included in JSON output.""" + # Use test file known to contain formulas + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + assert os.path.exists(tei_file), f"Test file should exist: {tei_file}" + + # Convert to JSON + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(tei_file, stream=False) + + # Verify conversion succeeded + assert json_output is not None, "JSON conversion should not return None" + assert isinstance(json_output, dict), "JSON should be a dictionary" + + # Check body_text contains formulas + body_text = json_output.get('body_text', []) + assert len(body_text) > 0, "Should have body_text entries" + + # Find formula entries + formulas = [entry for entry in body_text if entry.get('type') == 'formula'] + assert len(formulas) >= 2, "Should have at least 2 formulas" + + # Verify formula structure + for formula in formulas: + assert 'id' in formula, "Formula should have ID" + assert 'type' in formula, "Formula should have type" + assert formula['type'] == 'formula', "Type should be 'formula'" + assert 'text' in formula, "Formula should have text content" + assert len(formula['text']) > 0, "Formula text should not be empty" + + # Check specific formulas from the test file + formula_texts = [f.get('text', '') for f in formulas] + assert any('Fext' in text for text in formula_texts), "Should have formula with 'Fext'" + + # Check labels + formula_labels = [f.get('label', '') for f in formulas] + assert any(label for label in formula_labels), "At least one formula should have a label" + + def test_formula_ordering_in_json(self): + """Test that formulas appear in correct order relative to paragraphs.""" + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + assert os.path.exists(tei_file), f"Test file should exist: {tei_file}" + + # Convert to JSON + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(tei_file, stream=False) + + body_text = json_output.get('body_text', []) + + # Find entries in "Data analysis" section + data_analysis_entries = [ + entry for entry in body_text + if entry.get('head_section') == 'Data analysis' + ] + + assert len(data_analysis_entries) > 0, "Should have Data analysis section" + + # The first entry should be a paragraph about "Percentage of fingers extensions" + first_entry = data_analysis_entries[0] + assert first_entry.get('type') != 'formula', "First entry should be a paragraph" + assert 'Percentage' in first_entry.get('text', ''), "First paragraph should mention 'Percentage'" + + # A formula should come before the paragraph starting with "Where Fext" + found_formula_before_where = False + for i, entry in enumerate(data_analysis_entries[:-1]): + if entry.get('type') == 'formula': + next_entry = data_analysis_entries[i + 1] + if 'Where' in next_entry.get('text', ''): + found_formula_before_where = True + break + + assert found_formula_before_where, "Formula should appear before explanatory paragraph" + + def test_formula_with_label_structure(self): + """Test that formulas with labels are properly structured.""" + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + # Convert to JSON + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(tei_file, stream=False) + + body_text = json_output.get('body_text', []) + formulas = [entry for entry in body_text if entry.get('type') == 'formula'] + + # Find formula with label + formulas_with_labels = [f for f in formulas if f.get('label')] + assert len(formulas_with_labels) > 0, "Should have formulas with labels" + + # Check that formula text doesn't include the label + for formula in formulas_with_labels: + label = formula.get('label', '') + text = formula.get('text', '') + # The label should be separate from the formula text + assert label, "Label should not be empty" + # Label like "(1)" or "ð1Þ" should not appear at the end of text + assert not text.endswith(label), "Formula text should not end with label" + + def test_formula_coordinates(self): + """Test that formula coordinates are preserved if available.""" + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + # Convert to JSON + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(tei_file, stream=False) + + body_text = json_output.get('body_text', []) + formulas = [entry for entry in body_text if entry.get('type') == 'formula'] + + # Check if formulas have coords + for formula in formulas: + assert 'coords' in formula, "Formula should have coords field" + # coords can be empty list if not available in source + assert isinstance(formula['coords'], list), "Coords should be a list" + + def test_formula_xml_id_preserved(self): + """Test that xml:id attribute is preserved for formulas.""" + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + # Convert to JSON + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(tei_file, stream=False) + + body_text = json_output.get('body_text', []) + formulas = [entry for entry in body_text if entry.get('type') == 'formula'] + + # At least some formulas should have xml_id + formulas_with_xml_id = [f for f in formulas if f.get('xml_id')] + assert len(formulas_with_xml_id) > 0, "Some formulas should have xml_id" + + # Check format of xml_id + for formula in formulas_with_xml_id: + xml_id = formula.get('xml_id') + assert xml_id.startswith('formula_'), f"xml_id should start with 'formula_': {xml_id}" + + def test_formulas_in_other_test_files(self): + """Test formula serialization in other test files.""" + test_files = [ + '10.1371_journal.pone.0218311.grobid.tei.xml', + '10.1038_s41586-023-05895-y.grobid.tei.xml' + ] + + refs_offsets_dir = os.path.join(TEST_DATA_PATH, 'refs_offsets') + + for filename in test_files: + filepath = os.path.join(refs_offsets_dir, filename) + if not os.path.exists(filepath): + continue + + print(f"\nTesting {filename}") + + # Test JSON conversion + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(filepath, stream=False) + + if json_output: + body_text = json_output.get('body_text', []) + formulas = [entry for entry in body_text if entry.get('type') == 'formula'] + + # If file has formulas, verify they're properly structured + if len(formulas) > 0: + print(f" Found {len(formulas)} formulas") + for formula in formulas: + assert 'text' in formula, f"Formula in {filename} should have text" + assert len(formula['text']) > 0, f"Formula text in {filename} should not be empty" + + # Test Markdown conversion + md_converter = TEI2MarkdownConverter() + md_output = md_converter.convert_tei_file(filepath) + + if md_output and len(formulas) > 0: + # If JSON found formulas, Markdown should include them either as: + # - code blocks (```) for formulas with labels + # - inline code (`) for formulas without labels + # So we just check for backticks in general + assert '`' in md_output, f"Markdown for {filename} should contain formula code formatting" + + def test_empty_formula_handling(self): + """Test that empty or malformed formulas don't break conversion.""" + # Create a minimal TEI with an empty formula + import tempfile + tei_content = """ + + + + + Test Document + + + + + +
+ Test Section +

Before formula.

+ +

After formula.

+
+ +
+
""" + + with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as f: + f.write(tei_content) + temp_file = f.name + + try: + # Test JSON conversion + converter = TEI2LossyJSONConverter() + json_output = converter.convert_tei_file(temp_file, stream=False) + assert json_output is not None, "Should handle empty formula gracefully" + + # Test Markdown conversion + md_converter = TEI2MarkdownConverter() + md_output = md_converter.convert_tei_file(temp_file) + assert md_output is not None, "Markdown should handle empty formula gracefully" + + finally: + os.unlink(temp_file)