From 7b4504c102fddefbac2e828144ee0c2ddf5c336d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= Date: Mon, 9 Feb 2026 14:22:55 +0100 Subject: [PATCH] Rewrite recursions into explicit stack --- c_src/lazy_html.cpp | 404 ++++++++++++++++++++++++++++---------------- mix.exs | 2 +- 2 files changed, 260 insertions(+), 146 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index ce5a8ae..bc34a1c 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -10,10 +10,10 @@ #include #include -#include #include -#include #include +#include +#include namespace lazy_html { @@ -243,69 +243,108 @@ lxb_dom_node_t *template_aware_first_child(lxb_dom_node_t *node) { void append_node_html(lxb_dom_node_t *node, bool skip_whitespace_nodes, std::string &html) { - if (node->type == LXB_DOM_NODE_TYPE_TEXT) { - auto character_data = lxb_dom_interface_character_data(node); + // We use an explicit stack instead of recursion to avoid stack + // overflow on deeply nested trees. + struct StackFrame { + lxb_dom_node_t *next_child; + std::string closing_tag; + }; - auto whitespace_size = leading_whitespace_size(character_data->data.data, - character_data->data.length); + std::vector stack; - if (whitespace_size == character_data->data.length && - skip_whitespace_nodes) { - // Append nothing - } else { - if (is_noescape_text_node(node)) { - html.append(reinterpret_cast(character_data->data.data), - character_data->data.length); - } else { - append_escaping(html, character_data->data.data, - character_data->data.length, whitespace_size); - } - } - } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) { - auto character_data = lxb_dom_interface_character_data(node); - html.append(""); - } else if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { - auto element = lxb_dom_interface_element(node); - size_t name_length; - auto name = lxb_dom_element_qualified_name(element, &name_length); - if (name == NULL) { - throw std::runtime_error("failed to read tag name"); - } - html.append("<"); - html.append(reinterpret_cast(name), name_length); + auto current = node; + + while (true) { + if (current->type == LXB_DOM_NODE_TYPE_TEXT) { + auto character_data = lxb_dom_interface_character_data(current); - for (auto attribute = lxb_dom_element_first_attribute(element); - attribute != NULL; - attribute = lxb_dom_element_next_attribute(attribute)) { - html.append(" "); + auto whitespace_size = leading_whitespace_size( + character_data->data.data, character_data->data.length); + if (whitespace_size == character_data->data.length && + skip_whitespace_nodes) { + // Append nothing + } else { + if (is_noescape_text_node(current)) { + html.append(reinterpret_cast(character_data->data.data), + character_data->data.length); + } else { + append_escaping(html, character_data->data.data, + character_data->data.length, whitespace_size); + } + } + } else if (current->type == LXB_DOM_NODE_TYPE_COMMENT) { + auto character_data = lxb_dom_interface_character_data(current); + html.append(""); + } else if (current->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto element = lxb_dom_interface_element(current); size_t name_length; - auto name = lxb_dom_attr_qualified_name(attribute, &name_length); + auto name = lxb_dom_element_qualified_name(element, &name_length); + if (name == NULL) { + throw std::runtime_error("failed to read tag name"); + } + html.append("<"); html.append(reinterpret_cast(name), name_length); - html.append("=\""); + for (auto attribute = lxb_dom_element_first_attribute(element); + attribute != NULL; + attribute = lxb_dom_element_next_attribute(attribute)) { + html.append(" "); + + size_t name_length; + auto name = lxb_dom_attr_qualified_name(attribute, &name_length); + html.append(reinterpret_cast(name), name_length); - size_t value_length; - auto value = lxb_dom_attr_value(attribute, &value_length); - append_escaping(html, value, value_length); + html.append("=\""); - html.append("\""); + size_t value_length; + auto value = lxb_dom_attr_value(attribute, &value_length); + append_escaping(html, value, value_length); + + html.append("\""); + } + + if (lxb_html_node_is_void(current)) { + html.append("/>"); + } else { + html.append(">"); + + auto closing = std::string("(name), name_length); + closing.append(">"); + + auto first_child = template_aware_first_child(current); + + if (first_child == nullptr) { + html.append(closing); + } else { + stack.push_back({lxb_dom_node_next(first_child), std::move(closing)}); + // Immediately process the child. + current = first_child; + continue; + } + } } - if (lxb_html_node_is_void(node)) { - html.append("/>"); - } else { - html.append(">"); - for (auto child = template_aware_first_child(node); child != NULL; - child = lxb_dom_node_next(child)) { - append_node_html(child, skip_whitespace_nodes, html); + // Advance to the next sibling, or pop frames until we find one. + while (!stack.empty()) { + auto &frame = stack.back(); + + if (frame.next_child != nullptr) { + current = frame.next_child; + frame.next_child = lxb_dom_node_next(current); + break; } - html.append("(name), name_length); - html.append(">"); + + html.append(frame.closing_tag); + stack.pop_back(); + } + + if (stack.empty()) { + return; } } } @@ -357,51 +396,94 @@ ERL_NIF_TERM attributes_to_term(ErlNifEnv *env, lxb_dom_element_t *element, void node_to_tree(ErlNifEnv *env, fine::ResourcePtr &resource, lxb_dom_node_t *node, std::vector &tree, bool sort_attributes, bool skip_whitespace_nodes) { - if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { - auto element = lxb_dom_interface_element(node); + // We use an explicit stack instead of recursion to avoid stack + // overflow on deeply nested trees. + struct StackFrame { + lxb_dom_node_t *next_child; + ERL_NIF_TERM name_term; + ERL_NIF_TERM attrs_term; + std::vector children; + }; + + std::vector stack; + + auto current = node; + + while (true) { + if (current->type == LXB_DOM_NODE_TYPE_TEXT) { + auto character_data = lxb_dom_interface_character_data(current); + + auto whitespace_size = leading_whitespace_size( + character_data->data.data, character_data->data.length); + + if (!(whitespace_size == character_data->data.length && + skip_whitespace_nodes)) { + auto term = fine::make_resource_binary( + env, resource, reinterpret_cast(character_data->data.data), + character_data->data.length); + auto &target = stack.empty() ? tree : stack.back().children; + target.push_back(term); + } + } else if (current->type == LXB_DOM_NODE_TYPE_COMMENT) { + auto character_data = lxb_dom_interface_character_data(current); + auto term = fine::make_resource_binary( + env, resource, reinterpret_cast(character_data->data.data), + character_data->data.length); + auto &target = stack.empty() ? tree : stack.back().children; + target.push_back( + enif_make_tuple2(env, fine::encode(env, atoms::comment), term)); + } else if (current->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto element = lxb_dom_interface_element(current); - size_t name_length; - auto name = lxb_dom_element_qualified_name(element, &name_length); - if (name == NULL) { - throw std::runtime_error("failed to read tag name"); - } - auto name_term = make_new_binary(env, name_length, name); + size_t name_length; + auto name = lxb_dom_element_qualified_name(element, &name_length); + if (name == NULL) { + throw std::runtime_error("failed to read tag name"); + } + auto name_term = make_new_binary(env, name_length, name); + auto attrs_term = attributes_to_term(env, element, sort_attributes); - auto attrs_term = attributes_to_term(env, element, sort_attributes); + auto first_child = template_aware_first_child(current); - auto children = std::vector(); - for (auto child = template_aware_first_child(node); child != NULL; - child = lxb_dom_node_next(child)) { - node_to_tree(env, resource, child, children, sort_attributes, - skip_whitespace_nodes); + if (first_child == nullptr) { + auto children_term = enif_make_list(env, 0); + auto &target = stack.empty() ? tree : stack.back().children; + target.push_back( + enif_make_tuple3(env, name_term, attrs_term, children_term)); + } else { + stack.push_back( + {lxb_dom_node_next(first_child), name_term, attrs_term, {}}); + // Immediately process the child. + current = first_child; + continue; + } } - auto children_term = enif_make_list_from_array( - env, children.data(), static_cast(children.size())); + // Advance to the next sibling, or pop frames until we find one. + while (!stack.empty()) { + auto &frame = stack.back(); - tree.push_back(enif_make_tuple3(env, name_term, attrs_term, children_term)); - } else if (node->type == LXB_DOM_NODE_TYPE_TEXT) { - auto character_data = lxb_dom_interface_character_data(node); + if (frame.next_child != nullptr) { + current = frame.next_child; + frame.next_child = lxb_dom_node_next(current); + break; + } - auto whitespace_size = leading_whitespace_size(character_data->data.data, - character_data->data.length); + auto children_term = enif_make_list_from_array( + env, frame.children.data(), + static_cast(frame.children.size())); + auto element_term = enif_make_tuple3(env, frame.name_term, + frame.attrs_term, children_term); - if (whitespace_size == character_data->data.length && - skip_whitespace_nodes) { - // Append nothing - } else { - auto term = fine::make_resource_binary( - env, resource, reinterpret_cast(character_data->data.data), - character_data->data.length); - tree.push_back(term); + stack.pop_back(); + + auto &target = stack.empty() ? tree : stack.back().children; + target.push_back(element_term); + } + + if (stack.empty()) { + return; } - } else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) { - auto character_data = lxb_dom_interface_character_data(node); - auto term = fine::make_resource_binary( - env, resource, reinterpret_cast(character_data->data.data), - character_data->data.length); - tree.push_back( - enif_make_tuple2(env, fine::encode(env, atoms::comment), term)); } } @@ -432,10 +514,10 @@ std::optional get_tag_namespace(ErlNifBinary name) { return std::nullopt; } -lxb_dom_node_t *node_from_tree_item(ErlNifEnv *env, - lxb_html_document_t *document, - fine::Term item, - std::optional ns) { +void insert_children_from_tree(ErlNifEnv *env, lxb_html_document_t *document, + lxb_dom_node_t *root, + std::vector tree, + std::optional ns) { using ExText = ErlNifBinary; using ExElement = std::tuple>; using ExComment = std::tuple; - auto decoded = - fine::decode>(env, item); + // We use an explicit stack instead of recursion to avoid stack + // overflow on deeply nested trees. + struct StackFrame { + lxb_dom_node_t *parent; + std::vector children; + size_t index; + std::optional ns; + }; - if (auto text_ptr = std::get_if(&decoded)) { - auto text = lxb_dom_document_create_text_node( - &document->dom_document, text_ptr->data, text_ptr->size); - if (text == NULL) { - throw std::runtime_error("failed to create text node"); - } - return lxb_dom_interface_node(text); - } else if (auto element_ptr = std::get_if(&decoded)) { - const auto &[name, attributes, children_tree] = *element_ptr; + std::vector stack; - auto element = lxb_dom_document_create_element(&document->dom_document, - name.data, name.size, NULL); + stack.push_back({root, tree, 0, ns}); - auto node = lxb_dom_interface_node(element); + while (!stack.empty()) { + auto &frame = stack.back(); - if (!ns) { - ns = get_tag_namespace(name); + if (frame.index >= frame.children.size()) { + stack.pop_back(); + continue; } - if (ns) { - node->ns = ns.value(); - } + auto current_item = frame.children[frame.index]; + auto current_ns = frame.ns; + auto current_parent = frame.parent; + frame.index++; + + auto decoded = fine::decode>( + env, current_item); - for (auto &[key, value] : attributes) { - auto attr = lxb_dom_element_set_attribute(element, key.data, key.size, - value.data, value.size); - if (attr == NULL) { - throw std::runtime_error("failed to set element attribute"); + lxb_dom_node_t *node = nullptr; + + if (auto text_ptr = std::get_if(&decoded)) { + auto text = lxb_dom_document_create_text_node( + &document->dom_document, text_ptr->data, text_ptr->size); + if (text == NULL) { + throw std::runtime_error("failed to create text node"); } - } + node = lxb_dom_interface_node(text); + } else if (auto element_ptr = std::get_if(&decoded)) { + auto &[name, attributes, children_tree] = *element_ptr; - if (lxb_html_tree_node_is(node, LXB_TAG_TEMPLATE)) { - //