From e65e1f19761db01a0a460130c59bf51467c33518 Mon Sep 17 00:00:00 2001
From: dmgcodevil
Date: Sun, 21 Sep 2025 22:36:24 -0400
Subject: [PATCH 1/2] utility function hash_code_ for schema and node id

---
 src/core.cpp | 102 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 27 deletions(-)

diff --git a/src/core.cpp b/src/core.cpp
index 082eaa5..0840b80 100644
--- a/src/core.cpp
+++ b/src/core.cpp
@@ -37,6 +37,76 @@ namespace tundradb {
 
 constexpr static uint64_t NODE_MASK = (1ULL << 48) - 1;
 
+// Deterministic 16-bit tag from alias string (SchemaRef::value()).
+// https://www.ietf.org/archive/id/draft-eastlake-fnv-21.html
+static uint16_t compute_tag(const SchemaRef& ref) {
+  // FNV-1a 32-bit, then fold to 16 bits.
+  const std::string& s = ref.value();
+  uint32_t h = 2166136261u;
+  for (unsigned char c : s) {
+    h ^= c;
+    h *= 16777619u;
+  }
+  h ^= (h >> 16);
+  return static_cast<uint16_t>(h & 0xFFFFu);
+}
+
+/**
+ * @brief Creates a packed 64-bit hash code for a schema+node_id pair
+ *
+ * Combines a schema identifier and a node ID into a single 64-bit value for
+ * efficient storage and comparison in hash sets/maps. This eliminates the
+ * expensive string concatenation and hashing previously used to track
+ * visited nodes during graph traversal.
+ *
+ * @param schema The schema reference containing a pre-computed 16-bit tag
+ * @param node_id The node identifier (48-bit max)
+ *
+ * @return A 64-bit packed value with layout:
+ *         - Bits 63-48: Schema tag (16 bits)
+ *         - Bits 47-0:  Node ID (48 bits, masked)
+ *
+ * @details
+ * Memory Layout:
+ * ```
+ * 63      56      48      40      32      24      16       8       0
+ * |    Schema     |               Node ID (48 bits)                |
+ * |   (16 bit)    |                                                |
+ * ```
+ *
+ * Performance Benefits:
+ * - Replaces string operations: "User:12345" → single uint64_t
+ * - Enables fast integer comparison instead of string hashing
+ * - Reduces memory allocations (no temporary strings)
+ * - Compatible with llvm::DenseSet<uint64_t> for O(1) lookups
+ *
+ * Constraints:
+ * - Node IDs must fit in 48 bits (max ~281 trillion nodes)
+ * - Schema tags must be unique within a query context
+ * - NODE_MASK = (1ULL << 48) - 1 = 0x0000FFFFFFFFFFFF
+ *
+ * Example:
+ * ```cpp
+ * SchemaRef user_schema = SchemaRef::parse("u:User");
+ * user_schema.set_tag(0x1234);  // Pre-computed schema tag
+ *
+ * uint64_t packed = hash_code_(user_schema, 98765);
+ * // Result: 0x12340000000181CD (schema=0x1234, node=98765)
+ *
+ * // Usage in visited tracking:
+ * llvm::DenseSet<uint64_t> visited;
+ * visited.insert(packed);  // Fast O(1) integer hash
+ * ```
+ *
+ * @see SchemaRef::tag() for schema tag computation
+ * @see NODE_MASK constant definition
+ */
+static uint64_t hash_code_(const SchemaRef& schema, int64_t node_id) {
+  const uint16_t schema_id16 = schema.tag();
+  return (static_cast<uint64_t>(schema_id16) << 48) |
+         (static_cast<uint64_t>(node_id) & NODE_MASK);
+}
+
 // Utility function to join containers using C++23 ranges
 template <typename Container>
 std::string join_container(const Container& container,
@@ -468,20 +538,6 @@ struct QueryState {
     return true;
   }
 
-  // Deterministic 16-bit tag from alias string (SchemaRef::value()).
-  // https://www.ietf.org/archive/id/draft-eastlake-fnv-21.html
-  static uint16_t compute_alias_tag(const SchemaRef& ref) {
-    // FNV-1a 32-bit, then fold to 16 bits.
-    const std::string& s = ref.value();
-    uint32_t h = 2166136261u;
-    for (unsigned char c : s) {
-      h ^= c;
-      h *= 16777619u;
-    }
-    h ^= (h >> 16);
-    return static_cast<uint16_t>(h & 0xFFFFu);
-  }
-
   const llvm::DenseSet<int64_t>& get_ids(const SchemaRef& schema_ref) {
     return ids[schema_ref.value()];
   }
@@ -1183,10 +1239,7 @@ arrow::Result>> populate_rows_bfs(
                   item.schema_ref.value());
     }
     item.row->set_cell_from_node(it_fq->second, node);
-    // Pack 16-bit schema id (precomputed in SchemaRef) and 48-bit node id.
-    const uint16_t schema_id16 = item.schema_ref.tag();
-    const uint64_t packed = (static_cast<uint64_t>(schema_id16) << 48) |
-                            (static_cast<uint64_t>(item.node_id) & NODE_MASK);
+    const uint64_t packed = hash_code_(item.schema_ref, item.node_id);
     global_visited.insert(item.schema_ref.value() + ":" +
                           std::to_string(item.node_id));
     item.path_visited_nodes.insert(packed);
@@ -1201,10 +1254,7 @@ arrow::Result>> populate_rows_bfs(
     for (const auto& conn :
          query_state.connections.at(item.schema_ref.value())
              .at(item.node_id)) {
-      const uint16_t tgt_schema_id16 = conn.target.tag();
-      const uint64_t tgt_packed =
-          (static_cast<uint64_t>(tgt_schema_id16) << 48) |
-          (static_cast<uint64_t>(conn.target_id) & NODE_MASK);
+      const uint64_t tgt_packed = hash_code_(conn.target, conn.target_id);
       if (!item.path_visited_nodes.contains(tgt_packed)) {
         if (query_state.ids.at(conn.target.value())
                 .contains(conn.target_id)) {
@@ -1771,7 +1821,7 @@ arrow::Result> Database::query(
   }
   // Precompute tag for FROM schema (alias-based hash)
   query_state.from = query.from();
-  query_state.from.set_tag(QueryState::compute_alias_tag(query_state.from));
+  query_state.from.set_tag(compute_tag(query_state.from));
   ARROW_ASSIGN_OR_RAISE(auto source_schema,
                         query_state.resolve_schema(query.from()));
   if (!this->schema_registry_->exists(source_schema)) {
@@ -1863,10 +1913,8 @@ arrow::Result> Database::query(
       auto traverse = std::static_pointer_cast(clause);
       // Precompute and set tags for source/target refs (alias-based,
      // deterministic)
-      traverse->mutable_source().set_tag(
-          QueryState::compute_alias_tag(traverse->source()));
-      traverse->mutable_target().set_tag(
-          QueryState::compute_alias_tag(traverse->target()));
+      traverse->mutable_source().set_tag(compute_tag(traverse->source()));
+      traverse->mutable_target().set_tag(compute_tag(traverse->target()));
 
       ARROW_ASSIGN_OR_RAISE(auto source_schema,
                             query_state.resolve_schema(traverse->source()));

From 71bff7a7837c4101f0f0790c1fc9fd3467a6050f Mon Sep 17 00:00:00 2001
From: dmgcodevil
Date: Sun, 21 Sep 2025 22:53:27 -0400
Subject: [PATCH 2/2] use packed ids in global visited

---
 src/core.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/core.cpp b/src/core.cpp
index 0840b80..eacb8d4 100644
--- a/src/core.cpp
+++ b/src/core.cpp
@@ -1206,11 +1206,10 @@ void log_grouped_connections(
   }
 }
 
-template <typename VisitedSet>
 arrow::Result>> populate_rows_bfs(
     int64_t node_id, const SchemaRef& start_schema,
     const std::shared_ptr& output_schema,
-    const QueryState& query_state, VisitedSet& global_visited) {
+    const QueryState& query_state, llvm::DenseSet<uint64_t>& global_visited) {
   IF_DEBUG_ENABLED {
     log_debug("populate_rows_bfs::node={}:{}", start_schema.value(), node_id);
   }
@@ -1240,8 +1239,7 @@ arrow::Result>> populate_rows_bfs(
     }
     item.row->set_cell_from_node(it_fq->second, node);
     const uint64_t packed = hash_code_(item.schema_ref, item.node_id);
-    global_visited.insert(item.schema_ref.value() + ":" +
-                          std::to_string(item.node_id));
+    global_visited.insert(packed);
     item.path_visited_nodes.insert(packed);
 
     // group connections by target schema (small, stack-friendly)
@@ -1334,15 +1332,14 @@ arrow::Result>> populate_batch_rows(
     const llvm::DenseSet<int64_t>& node_ids, const SchemaRef& schema_ref,
     const std::shared_ptr& output_schema,
     const QueryState& query_state, const TraverseType join_type,
-    tbb::concurrent_unordered_set<std::string>& global_visited) {
+    tbb::concurrent_unordered_set<uint64_t>& global_visited) {
   auto rows = std::make_shared>();
   rows->reserve(node_ids.size());
-  std::set local_visited;
+  llvm::DenseSet local_visited;
   // For INNER join: only process nodes that have connections
   // For LEFT join: process all nodes from the "left" side
   for (const auto node_id : node_ids) {
-    auto key = schema_ref.value() + ":" + std::to_string(node_id);
-    if (!global_visited.insert(key).second) {
+    if (!global_visited.insert(hash_code_(schema_ref, node_id)).second) {
      // Skip if already processed in an earlier traversal
       continue;
     }
@@ -1402,7 +1399,7 @@ arrow::Result>> populate_rows(
     const std::shared_ptr& output_schema) {
   auto rows = std::make_shared>();
   std::mutex rows_mtx;
-  tbb::concurrent_unordered_set<std::string> global_visited;
+  tbb::concurrent_unordered_set<uint64_t> global_visited;
 
   // Map schemas to their join types
   std::unordered_map<std::string, TraverseType> schema_join_types;
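
For reference, a minimal standalone sketch of the tag-and-pack scheme these patches introduce. Only the FNV-1a fold and the 16/48-bit layout mirror the diff; `pack`, the alias string "u", and `main()` are illustrative assumptions, not the `SchemaRef`/`hash_code_`/`set_tag` plumbing from src/core.cpp.

```cpp
// Standalone sketch (not from src/core.cpp): demonstrates the tag fold and
// the packed visited-set key used by the patches above.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <string>

static constexpr uint64_t NODE_MASK = (1ULL << 48) - 1;

// FNV-1a 32-bit over the alias, folded to 16 bits (as in compute_tag()).
static uint16_t compute_tag(const std::string& alias) {
  uint32_t h = 2166136261u;
  for (unsigned char c : alias) {
    h ^= c;
    h *= 16777619u;
  }
  h ^= (h >> 16);
  return static_cast<uint16_t>(h & 0xFFFFu);
}

// Pack: tag in bits 63-48, node id in bits 47-0 (as in hash_code_()).
static uint64_t pack(uint16_t tag, int64_t node_id) {
  return (static_cast<uint64_t>(tag) << 48) |
         (static_cast<uint64_t>(node_id) & NODE_MASK);
}

int main() {
  const uint16_t tag = compute_tag("u");  // alias-based, deterministic
  const int64_t node_id = 98765;
  const uint64_t packed = pack(tag, node_id);

  // Round trip: both fields are recoverable from the packed key.
  assert(static_cast<uint16_t>(packed >> 48) == tag);
  assert(static_cast<int64_t>(packed & NODE_MASK) == node_id);

  std::printf("tag=0x%04x packed=0x%016llx\n", static_cast<unsigned>(tag),
              static_cast<unsigned long long>(packed));
  return 0;
}
```

Two properties worth keeping in mind: a 16-bit fold of an arbitrary alias string can collide, which is why the doc comment requires schema tags to be unique within a query context, and any node ID above 2^48 - 1 is silently truncated by NODE_MASK, so two such (schema, node) pairs could pack to the same visited-set key.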