Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 77 additions & 14 deletions grobid_client/format/TEI2LossyJSON.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,25 +730,32 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
# Generic handling - capitalize and format
head_section = div_type.replace("_", " ").title()

# Process paragraphs in this div
if len(direct_p_nodes) > 0:
for id_p, p in enumerate(direct_p_nodes):
paragraph_id = get_random_id(prefix="p_")

if passage_level == "sentence":
for id_s, sentence in enumerate(p.find_all("s")):
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
# Process direct children of div in order to maintain proper sequence
for child in div.children:
if hasattr(child, 'name') and child.name:
if child.name == "p":
# Process paragraphs
paragraph_id = get_random_id(prefix="p_")

if passage_level == "sentence":
for id_s, sentence in enumerate(child.find_all("s")):
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
if self.validate_refs:
for ref in struct['refs']:
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
yield struct
else:
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
if self.validate_refs:
for ref in struct['refs']:
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
yield struct
else:
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
if self.validate_refs:
for ref in struct['refs']:
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
elif child.name == "formula":
# Process formulas in their natural position
formula_id = get_random_id(prefix="formula_")
struct = get_formatted_formula(current_head_paragraph or head_paragraph, head_section, formula_id, child)
yield struct

# Update head_paragraph for potential next div
Expand Down Expand Up @@ -930,6 +937,62 @@ def _clean_text_local(text: str) -> str:
return passage


def get_formatted_formula(head_paragraph, head_section, formula_id, element):
    """Build the dictionary describing one formula element.

    The element's direct children are walked once: the text of a ``label``
    child is kept separately, everything else (text nodes and other child
    elements) is concatenated into the formula text. Both are whitespace-
    normalised and HTML-unescaped.

    Args:
        head_paragraph: Value stored under ``"head_paragraph"`` when truthy.
        head_section: Value stored under ``"head_section"`` when truthy.
        formula_id: Identifier stored under ``"id"``.
        element: The formula node. It must provide ``children``, ``get`` and
            ``has_attr``; presumably a BeautifulSoup ``Tag`` (confirm against
            the caller).

    Returns:
        A dict with the keys ``"id"``, ``"type"`` (always ``"formula"``),
        ``"text"`` and ``"coords"``, plus ``"label"``, ``"xml_id"``,
        ``"head_paragraph"`` and ``"head_section"`` when the corresponding
        value is non-empty.
    """
    import html
    import re

    def _clean_text_local(text: str) -> str:
        """Collapse whitespace runs to one space and unescape HTML entities."""
        if not text:
            return ""
        return html.unescape(re.sub(r'\s+', ' ', text.strip()))

    formula_parts = []
    label_text = ""
    for child in element.children:
        # Text nodes either have no ``name`` attribute or carry ``name = None``,
        # so ``hasattr`` alone cannot tell them apart from real child elements.
        child_name = getattr(child, "name", None)
        if not child_name:
            # Direct text content of the formula.
            formula_parts.append(str(child))
        elif child_name == "label":
            # The label (e.g. an equation number) is reported on its own.
            label_text = _clean_text_local(child.get_text())
        else:
            formula_parts.append(child.get_text())

    formula_text = _clean_text_local("".join(formula_parts))

    # ``coords`` is split on ";" into boxes and each box on "," before being
    # handed to box_to_dict. Blank segments (empty attribute, trailing ";")
    # are skipped instead of being passed on as [""].
    coords = []
    if element.has_attr("coords"):
        coords = [
            box_to_dict(box.split(","))
            for box in element.get("coords", "").split(";")
            if box.strip()
        ]

    formula_data = {
        "id": formula_id,
        "type": "formula",
        "text": formula_text,
        "coords": coords,
    }

    # Optional fields are only emitted when they carry a value.
    if label_text:
        formula_data["label"] = label_text

    xml_id = element.get("xml:id")
    if xml_id:
        formula_data["xml_id"] = xml_id

    if head_paragraph:
        formula_data["head_paragraph"] = head_paragraph
    if head_section:
        formula_data["head_section"] = head_section

    return formula_data


def xml_table_to_markdown(table_element):
"""Convert XML table to markdown format."""
if not table_element:
Expand Down
53 changes: 47 additions & 6 deletions grobid_client/format/TEI2Markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,19 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
section_title = head.get_text().strip()
fulltext_sections.append(f"### {section_title}\n")

# Get paragraphs
paragraphs = div.find_all("p")
for p in paragraphs:
paragraph_text = self._process_paragraph(p)
if paragraph_text.strip():
fulltext_sections.append(f"{paragraph_text}\n\n")
# Process direct children of div in order to maintain structure
for child in div.children:
if hasattr(child, 'name'):
if child.name == "p":
# Process paragraphs
paragraph_text = self._process_paragraph(child)
if paragraph_text.strip():
fulltext_sections.append(f"{paragraph_text}\n\n")
elif child.name == "formula":
# Process formulas that are direct children of div
formula_md = self._formula_to_markdown(child)
if formula_md:
fulltext_sections.append(f"{formula_md}\n\n")

return "".join(fulltext_sections)

Expand Down Expand Up @@ -320,6 +327,11 @@ def _process_paragraph(self, p_element: Tag) -> str:
# Handle references - keep the text but don't add special formatting
ref_text = element.get_text()
text_parts.append(ref_text)
elif element.name == "formula":
# Handle formulas/equations
formula_md = self._formula_to_markdown(element)
if formula_md:
text_parts.append(f"\n{formula_md}\n")
elif element.name == "figure":
# Handle figures
fig_desc = element.find("figDesc")
Expand Down Expand Up @@ -352,6 +364,35 @@ def _table_to_markdown(self, table_element: Tag) -> str:

return "\n".join(markdown_lines) if markdown_lines else ""

def _formula_to_markdown(self, formula_element: Tag) -> str:
    """Render a formula element as markdown code.

    Text nodes and the text of non-label child elements make up the
    formula body; the text of a ``label`` child is kept apart. A formula
    with a label becomes a fenced code block ending in the label, one
    without a label becomes an inline code span, and a formula whose body
    is empty after stripping yields an empty string.
    """
    fragments = []
    equation_label = ""

    for node in formula_element.children:
        if isinstance(node, NavigableString):
            # Plain text sitting directly inside the formula.
            fragments.append(str(node))
            continue
        if node.name == "label":
            # Kept out of the body so it can be appended at the end.
            equation_label = node.get_text().strip()
            continue
        # Any other nested element contributes its text content.
        fragments.append(node.get_text())

    body = "".join(fragments).strip()

    if not body:
        return ""
    if not equation_label:
        # No label: inline code span.
        return f"`{body}`"
    # Labelled formula: fenced block with the label after the body.
    return f"```\n{body} {equation_label}\n```"

def _format_reference(self, bibl_struct: Tag, ref_num: int) -> str:
"""
Format a bibliographic reference with comprehensive TEI element handling.
Expand Down
Loading