From 9732e193ea86c3311f9a62e433dd983a958e4dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= Date: Mon, 9 Feb 2026 17:46:19 +0100 Subject: [PATCH] Ensure unique nodes in query and query_by_id --- c_src/lazy_html.cpp | 43 ++++++++++++++++++++--------- test/lazy_html_test.exs | 60 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 13 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index bc34a1c..6bfda8f 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -697,17 +697,27 @@ ExLazyHTML query(ErlNifEnv *env, ExLazyHTML ex_lazy_html, LXB_SELECTORS_OPT_MATCH_ROOT)); auto nodes = std::vector(); + auto inserted_nodes = std::unordered_set(); + + struct FindCtx { + std::vector *nodes; + std::unordered_set *inserted_nodes; + }; + + auto ctx = FindCtx{&nodes, &inserted_nodes}; for (auto node : ex_lazy_html.resource->nodes) { status = lxb_selectors_find( selectors, node, css_selector_list, [](lxb_dom_node_t *node, lxb_css_selector_specificity_t spec, void *ctx) -> lxb_status_t { - auto nodes_ptr = static_cast *>(ctx); - nodes_ptr->push_back(node); + auto find_ctx = static_cast(ctx); + if (find_ctx->inserted_nodes->insert(node).second) { + find_ctx->nodes->push_back(node); + } return LXB_STATUS_OK; }, - &nodes); + &ctx); if (status != LXB_STATUS_OK) { throw std::runtime_error("failed to run find"); } @@ -789,22 +799,31 @@ bool matches_id(lxb_dom_node_t *node, ErlNifBinary *id) { ExLazyHTML query_by_id(ErlNifEnv *env, ExLazyHTML ex_lazy_html, ErlNifBinary id) { auto nodes = std::vector(); + auto seen = std::unordered_set(); - auto ctx = std::make_tuple(&nodes, &id); + struct WalkCtx { + std::vector *nodes; + std::unordered_set *seen; + ErlNifBinary *id; + }; + + auto ctx = WalkCtx{&nodes, &seen, &id}; for (auto node : ex_lazy_html.resource->nodes) { if (matches_id(node, &id)) { - nodes.push_back(node); + if (seen.insert(node).second) { + nodes.push_back(node); + } } lxb_dom_node_simple_walk( node, [](lxb_dom_node_t *node, void *ctx) -> lexbor_action_t { - auto [nodes_ptr, id_ptr] = *static_cast< - std::tuple *, ErlNifBinary *> *>( - ctx); - if (matches_id(node, id_ptr)) { - nodes_ptr->push_back(node); + auto walk_ctx = static_cast(ctx); + if (matches_id(node, walk_ctx->id)) { + if (walk_ctx->seen->insert(node).second) { + walk_ctx->nodes->push_back(node); + } } return LEXBOR_ACTION_OK; @@ -843,9 +862,7 @@ ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto parent = lxb_dom_node_parent(node); if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT && (is_document || !lxb_html_tree_node_is(parent, LXB_TAG_HTML))) { - auto inserted_node = inserted_nodes.find(parent); - if (inserted_node == inserted_nodes.end()) { - inserted_nodes.insert(parent); + if (inserted_nodes.insert(parent).second) { nodes.push_back(parent); } } diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 84bee63..59c04fd 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -248,6 +248,36 @@ defmodule LazyHTMLTest do LazyHTML.query(lazy_html, "hover:") end end + + test "does not include duplicated elements in the result set" do + fragment = + LazyHTML.from_fragment(~S""" +
+
1
+
2
+
+ """) + + result = fragment |> LazyHTML.query("div") |> LazyHTML.query("div") + + # If nodes were not deduplicated, the second query would inflate + # the result to 5 nodes. We expect only 3 unique nodes. + + assert inspect(result) == """ + #LazyHTML< + 3 nodes (from selector) + #1 +
+
1
+
2
+
+ #2 +
1
+ #3 +
2
+ >\ + """ + end end describe "parent_node/1" do @@ -379,6 +409,36 @@ defmodule LazyHTMLTest do result = LazyHTML.query_by_id(lazy_html, "root") assert Enum.count(result) == 1 end + + test "does not include duplicated elements in the result set" do + # A proper HTML document should not have duplicated ids, but it + # can be the case. + fragment = + LazyHTML.from_fragment(~S""" +
+
1
+
2
+
+ """) + + result = fragment |> LazyHTML.query_by_id("1") |> LazyHTML.query_by_id("1") + + # If nodes were not deduplicated, the second query would inflate + # the result to 3 nodes. We expect only 2 unique nodes. + + assert inspect(result) == """ + #LazyHTML< + 2 nodes (from selector) + #1 +
+
1
+
2
+
+ #2 +
1
+ >\ + """ + end end describe "text/1" do