Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions c_src/lazy_html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -697,17 +697,27 @@ ExLazyHTML query(ErlNifEnv *env, ExLazyHTML ex_lazy_html,
LXB_SELECTORS_OPT_MATCH_ROOT));

auto nodes = std::vector<lxb_dom_node_t *>();
auto inserted_nodes = std::unordered_set<lxb_dom_node_t *>();

// Context handed to the lxb_selectors_find callback through its void *ctx
// argument, so the captureless lambda can reach both containers.
struct FindCtx {
// Matched nodes; a node is appended only the first time it is reported.
std::vector<lxb_dom_node_t *> *nodes;
// Nodes already appended to `nodes`, used to skip repeated matches.
std::unordered_set<lxb_dom_node_t *> *inserted_nodes;
};

auto ctx = FindCtx{&nodes, &inserted_nodes};

for (auto node : ex_lazy_html.resource->nodes) {
status = lxb_selectors_find(
selectors, node, css_selector_list,
[](lxb_dom_node_t *node, lxb_css_selector_specificity_t spec,
void *ctx) -> lxb_status_t {
auto nodes_ptr = static_cast<std::vector<lxb_dom_node_t *> *>(ctx);
nodes_ptr->push_back(node);
auto find_ctx = static_cast<FindCtx *>(ctx);
if (find_ctx->inserted_nodes->insert(node).second) {
find_ctx->nodes->push_back(node);
}
return LXB_STATUS_OK;
},
&nodes);
&ctx);
if (status != LXB_STATUS_OK) {
throw std::runtime_error("failed to run find");
}
Expand Down Expand Up @@ -789,22 +799,31 @@ bool matches_id(lxb_dom_node_t *node, ErlNifBinary *id) {
ExLazyHTML query_by_id(ErlNifEnv *env, ExLazyHTML ex_lazy_html,
ErlNifBinary id) {
auto nodes = std::vector<lxb_dom_node_t *>();
auto seen = std::unordered_set<lxb_dom_node_t *>();

auto ctx = std::make_tuple(&nodes, &id);
// Context that the walk callback recovers from its void *ctx argument.
struct WalkCtx {
// Nodes whose id matched; a node is appended only the first time it is seen.
std::vector<lxb_dom_node_t *> *nodes;
// Nodes already appended to `nodes`, used to skip repeated matches.
std::unordered_set<lxb_dom_node_t *> *seen;
// The id being searched for, forwarded to matches_id.
ErlNifBinary *id;
};

auto ctx = WalkCtx{&nodes, &seen, &id};

for (auto node : ex_lazy_html.resource->nodes) {
if (matches_id(node, &id)) {
nodes.push_back(node);
if (seen.insert(node).second) {
nodes.push_back(node);
}
}

lxb_dom_node_simple_walk(
node,
[](lxb_dom_node_t *node, void *ctx) -> lexbor_action_t {
auto [nodes_ptr, id_ptr] = *static_cast<
std::tuple<std::vector<lxb_dom_node_t *> *, ErlNifBinary *> *>(
ctx);
if (matches_id(node, id_ptr)) {
nodes_ptr->push_back(node);
auto walk_ctx = static_cast<WalkCtx *>(ctx);
if (matches_id(node, walk_ctx->id)) {
if (walk_ctx->seen->insert(node).second) {
walk_ctx->nodes->push_back(node);
}
}

return LEXBOR_ACTION_OK;
Expand Down Expand Up @@ -843,9 +862,7 @@ ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto parent = lxb_dom_node_parent(node);
if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT &&
(is_document || !lxb_html_tree_node_is(parent, LXB_TAG_HTML))) {
auto inserted_node = inserted_nodes.find(parent);
if (inserted_node == inserted_nodes.end()) {
inserted_nodes.insert(parent);
if (inserted_nodes.insert(parent).second) {
nodes.push_back(parent);
}
}
Expand Down
60 changes: 60 additions & 0 deletions test/lazy_html_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,36 @@ defmodule LazyHTMLTest do
LazyHTML.query(lazy_html, "hover:")
end
end

# Regression test: chaining LazyHTML.query/2 on an earlier query result must
# not return the same node more than once.
test "does not include duplicated elements in the result set" do
fragment =
LazyHTML.from_fragment(~S"""
<div>
<div>1</div>
<div>2</div>
</div>
""")

result = fragment |> LazyHTML.query("div") |> LazyHTML.query("div")

# Without deduplication the second query would return 5 nodes: the outer
# div yields itself and both children, and each child yields itself again.
# We expect only the 3 unique nodes.

assert inspect(result) == """
#LazyHTML<
3 nodes (from selector)
#1
<div>
<div>1</div>
<div>2</div>
</div>
#2
<div>1</div>
#3
<div>2</div>
>\
"""
end
end

describe "parent_node/1" do
Expand Down Expand Up @@ -379,6 +409,36 @@ defmodule LazyHTMLTest do
result = LazyHTML.query_by_id(lazy_html, "root")
assert Enum.count(result) == 1
end

# Regression test: chaining LazyHTML.query_by_id/2 on an earlier result must
# not return the same node more than once.
test "does not include duplicated elements in the result set" do
# A valid HTML document should not contain duplicate ids, but the
# fragment below deliberately does, to exercise that case.
fragment =
LazyHTML.from_fragment(~S"""
<div id="1">
<div id="1">1</div>
<div>2</div>
</div>
""")

result = fragment |> LazyHTML.query_by_id("1") |> LazyHTML.query_by_id("1")

# Without deduplication the second query would return 3 nodes: the outer
# div yields itself and the inner div, and the inner div yields itself
# again. We expect only the 2 unique nodes.

assert inspect(result) == """
#LazyHTML<
2 nodes (from selector)
#1
<div id=\"1\">
<div id=\"1\">1</div>
<div>2</div>
</div>
#2
<div id=\"1\">1</div>
>\
"""
end
end

describe "text/1" do
Expand Down
Loading