Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 243 additions & 5 deletions src/docc/plugins/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
Plugin that renders to HTML.
"""


import html.parser
import sys
import warnings
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from io import StringIO, TextIOBase
Expand Down Expand Up @@ -822,6 +822,241 @@ def references_definition(
return None


_NON_TRANSPARENT_ELEMENTS: FrozenSet[str] = frozenset(
{
"tr",
"td",
"th",
"tbody",
"thead",
"tfoot",
"col",
"colgroup",
"caption",
"table",
"form",
"fieldset",
"legend",
"button",
"input",
"select",
"textarea",
"label",
"details",
"summary",
"dialog",
"menu",
"menuitem",
"iframe",
"object",
"embed",
"video",
"audio",
"source",
"track",
"canvas",
"map",
"area",
"svg",
"math",
}
)


class _NonTransparentElementVisitor(Visitor):
"""
Visitor that checks if a node or any of its descendants contains HTML
elements that cannot be descendants of an `<a>` tag.
"""

def __init__(self) -> None:
self.found_non_transparent = False

def enter(self, node: Node) -> Visit:
"""
Check if the current node is a non-transparent element.
"""
if isinstance(node, HTMLTag):
if node.tag_name.lower() in _NON_TRANSPARENT_ELEMENTS:
self.found_non_transparent = True
return (
Visit.SkipChildren
) # No need to check children if we found one
return Visit.TraverseChildren

def exit(self, node: Node) -> None:
"""
Called after visiting a node and its children.
"""
pass


def _contains_non_transparent_elements(node: Node) -> bool:
"""
Check if a node or any of its descendants contains HTML elements
that cannot be descendants of an <a> tag.
"""
visitor = _NonTransparentElementVisitor()
node.visit(visitor)
return visitor.found_non_transparent


def _handle_non_transparent_reference(
context: Context,
parent: Union[HTMLRoot, HTMLTag],
reference: references.Reference,
) -> RenderResult:
"""
Handle references that contain non-transparent HTML elements.

Strategy: Try to invert the structure when possible, otherwise
render without the link and warn the user.
"""
# Try to invert the structure. For example:
#
# ```html
# <a>
# <tr>
# <td>foo</td>
# </tr>
# </a>
# ```
#
# Should become:
#
# ```html
# <tr>
# <td>
# <a>foo</a>
# </td>
# </tr>
inverted_content = _try_invert_reference_structure(context, reference)
if inverted_content is not None:
# Successfully inverted, render the inverted structure
visitor = HTMLVisitor(context)
inverted_content.visit(visitor)

for child in visitor.root.children:
parent.append(child)
return None

# If inversion failed, render without the link and warn

warnings.warn(
f"Reference `{reference.identifier}` contains non-transparent HTML "
"elements that cannot be wrapped in an <a> tag. Rendering without "
"link.",
UserWarning,
stacklevel=2,
)

# Render the child content normally without the link
visitor = HTMLVisitor(context)
reference.child.visit(visitor)

# Append all rendered children to the parent
for child in visitor.root.children:
parent.append(child)

return None


def _find_anchor_insertion_point(node: Node, href: str) -> Optional[Node]:
"""
Find a suitable place to insert an anchor tag within the node structure.

This function looks for text nodes or simple HTML elements that can
contain an anchor tag, and creates a modified version of the node
with the anchor inserted.
"""
return _find_anchor_insertion_point_recursive(node, href, False)


def _find_anchor_insertion_point_recursive(
node: Node, href: str, already_has_anchor: bool
) -> Optional[Node]:
"""
Recursive helper that tracks whether we're already inside an anchor tag.
"""
if isinstance(node, HTMLTag):
# Check if we're already inside an anchor tag
if node.tag_name.lower() == "a":
# Already an anchor tag, don't create nested anchors
return node if already_has_anchor else None

# For HTML tags, try to find a suitable child to wrap with an anchor
if node.tag_name.lower() in _NON_TRANSPARENT_ELEMENTS:
# This is a non-transparent element, look at its children
for child in node.children:
insertion_point = _find_anchor_insertion_point_recursive(
child, href, already_has_anchor
)
if insertion_point is not None:
# Found an insertion point, create a modified version.
new_node = HTMLTag(node.tag_name, node.attributes.copy())
# Copy all children to the new node
for original_child in node.children:
assert isinstance(original_child, (HTMLTag, TextNode))
new_node.append(original_child)
# Replace the specific child with the modified version
new_node.replace_child(child, insertion_point)
return new_node
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the following example, where do anchors end up? Not saying there's anything wrong with what you have; I just want to check my intuition.

<table>
    <a href="foo">
        <tr>
            <td>hello</td>
            <td>world</td>
        </tr>
    </a>
</table>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the feedback, the current logic created a nested anchor tag. I have fixed this. The current logic now handles cases where there is already an anchor tag. I will push my changes in a moment.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, my bad! I was unclear. I guess I was asking about:

<table>
    <Reference ...>
        <tr>
            <td>hello</td>
            <td>world</td>
        </tr>
    </Reference>
</table>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function finds the first text node ("hello") and wraps it in an anchor
It then wraps the entire in another anchor to preserve structure
This creates nested anchors - outer anchor around the row, inner anchor around the text
The HTML is valid but creates nested clickable areas
The result looks like this:

<table>
    <a href="...">
        <tr>
            <td><a href="...">hello</a></td>
            <td>world</td>
        </tr>
    </a>
</table>

I will push a fix for this because nested anchor tags was not accounted for if not present in the original HTML.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have pushed a fix for this. Thank you again for the feedback!

return None
else:
# This element can contain an anchor
if already_has_anchor:
# Don't create nested anchors
return None
# Wrap it in a new anchor
anchor = HTMLTag("a", {"href": href})
anchor.append(node)
return anchor
elif isinstance(node, TextNode):
# Text nodes can be wrapped in an anchor, but only if we're not already
# inside one
if already_has_anchor:
return None
anchor = HTMLTag("a", {"href": href})
anchor.append(node)
return anchor
elif hasattr(node, "children"):
# For other node types with children, try to find a suitable child
for child in node.children:
insertion_point = _find_anchor_insertion_point_recursive(
child, href, already_has_anchor
)
if insertion_point is not None:
# Found a suitable insertion point, but we can't create a copy
# of arbitrary node types, so return None to fall back to
# warning
return None
return None


def _try_invert_reference_structure(
context: Context, reference: references.Reference
) -> Optional[Node]:
"""
Try to invert the reference structure to make it HTML-valid.

For example, if we have a reference with a table row containing a cell,
we try to move the link inside the cell instead of wrapping the row.

Returns the inverted structure if successful, None otherwise.
"""
# Get the anchor tag attributes from render_reference
try:
anchor = render_reference(context, reference)
href = anchor.attributes.get("href")
if not href:
return None
except Exception:
return None

# Try to find a suitable place to put the anchor tag
return _find_anchor_insertion_point(reference.child, href)


def references_reference(
context: object,
parent: object,
Expand All @@ -834,15 +1069,18 @@ def references_reference(
assert isinstance(parent, (HTMLRoot, HTMLTag))
assert isinstance(reference, references.Reference)

anchor = render_reference(context, reference)
parent.append(anchor)

if not reference.child:
anchor = render_reference(context, reference)
parent.append(anchor)
anchor.append(TextNode(reference.identifier))
return None

# TODO: handle tr, td, and other elements that can't be wrapped in an <a>.
if _contains_non_transparent_elements(reference.child):
return _handle_non_transparent_reference(context, parent, reference)

# Standard case: child content can be wrapped in an anchor
anchor = render_reference(context, reference)
parent.append(anchor)
return anchor


Expand Down
89 changes: 89 additions & 0 deletions test_non_transaparent_tags.py
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, we'll have to figure out what to do with this. I guess we'll need some plumbing for pytest and GitHub Actions?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably...

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look into this. Thanks!

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""
Test script to demonstrate the fix for non-transparent elements in references.
"""

import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))

from docc.plugins.html import (
HTMLTag,
TextNode,
_contains_non_transparent_elements,
)
from docc.plugins import references


def test_non_transparent_detection():
"""Test that we can detect non-transparent elements."""

# Test case 1: Simple text node (should be fine)
text_node = TextNode("Hello World")
assert not _contains_non_transparent_elements(text_node)
print("✓ Text node detection works")

# Test case 2: Simple HTML tag that can be in anchor
div_tag = HTMLTag("div")
div_tag.append(TextNode("Some content"))
assert not _contains_non_transparent_elements(div_tag)
print("✓ Div tag detection works")

# Test case 3: Table row (should be detected as non-transparent)
tr_tag = HTMLTag("tr")
td_tag = HTMLTag("td")
td_tag.append(TextNode("Cell content"))
tr_tag.append(td_tag)
assert _contains_non_transparent_elements(tr_tag)
print("✓ Table row detection works")

# Test case 4: Table cell (should be detected as non-transparent)
assert _contains_non_transparent_elements(td_tag)
print("✓ Table cell detection works")

# Test case 5: Nested structure with non-transparent element
table_tag = HTMLTag("table")
table_tag.append(tr_tag)
assert _contains_non_transparent_elements(table_tag)
print("✓ Nested non-transparent detection works")

print(
"\nAll tests passed! The non-transparent element detection is working correctly."
)


def test_reference_creation():
"""Test creating references with different content types."""

# Test case 1: Reference with simple text
simple_ref = references.Reference("test_ref", TextNode("Simple text"))
print(f"✓ Created reference with simple text: {simple_ref.identifier}")

# Test case 2: Reference with table content
tr_tag = HTMLTag("tr")
td_tag = HTMLTag("td")
td_tag.append(TextNode("Table cell"))
tr_tag.append(td_tag)

table_ref = references.Reference("table_ref", tr_tag)
print(f"✓ Created reference with table content: {table_ref.identifier}")

print("\nReference creation tests passed!")


if __name__ == "__main__":
print("Testing non-transparent element handling in references...\n")

test_non_transparent_detection()
print()
test_reference_creation()

print("\n" + "=" * 60)
print("SUMMARY:")
print("The solution successfully detects non-transparent HTML elements")
print("that cannot be descendants of <a> tags. When such elements are")
print("found in a reference, the system will:")
print("1. Try to invert the structure (move <a> inside suitable elements)")
print("2. If inversion fails, render without the link and warn the user")
print("3. Maintain HTML validity in all cases")
print("=" * 60)
1 change: 1 addition & 0 deletions whitelist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ rvalue
setext
setitem
src
stacklevel
starttag
stmt
strikethrough
Expand Down