From 0f6d18ca69fd30df931e363f3b31d523b27317e5 Mon Sep 17 00:00:00 2001 From: jolavillette Date: Wed, 28 Jan 2026 10:10:07 +0100 Subject: [PATCH 1/2] forums: implement FTS5 content search --- src/deep_search/forumsindex.cpp | 210 ---------- src/deep_search/forumsindex.hpp | 81 ---- src/deep_search/forumsindex_fts5.cpp | 584 +++++++++++++++++++++++++++ src/deep_search/forumsindex_fts5.hpp | 162 ++++++++ src/gxs/rsdataservice.cc | 5 + src/gxs/rsdataservice.h | 2 + src/gxs/rsgds.h | 5 + src/libretroshare.pro | 7 +- src/retroshare/rsgxsforums.h | 5 + src/services/p3gxsforums.cc | 263 ++++++++---- src/services/p3gxsforums.h | 6 +- src/util/retrodb.h | 1 + 12 files changed, 960 insertions(+), 371 deletions(-) delete mode 100644 src/deep_search/forumsindex.cpp delete mode 100644 src/deep_search/forumsindex.hpp create mode 100644 src/deep_search/forumsindex_fts5.cpp create mode 100644 src/deep_search/forumsindex_fts5.hpp diff --git a/src/deep_search/forumsindex.cpp b/src/deep_search/forumsindex.cpp deleted file mode 100644 index a04f2ecac..000000000 --- a/src/deep_search/forumsindex.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/******************************************************************************* - * RetroShare full text indexing and search implementation based on Xapian * - * * - * Copyright (C) 2021 Gioacchino Mazzurco * - * Copyright (C) 2021 Asociación Civil Altermundi * - * * - * This program is free software: you can redistribute it and/or modify * - * it under the terms of the GNU Affero General Public License version 3 as * - * published by the Free Software Foundation. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Affero General Public License for more details. * - * * - * You should have received a copy of the GNU Affero General Public License * - * along with this program. If not, see . 
* - * * - *******************************************************************************/ - -#include "deep_search/forumsindex.hpp" -#include "deep_search/commonutils.hpp" -#include "retroshare/rsinit.h" -#include "retroshare/rsgxsforums.h" -#include "util/rsdebuglevel4.h" - -std::error_condition DeepForumsIndex::search( - const std::string& queryStr, - std::vector& results, uint32_t maxResults ) -{ - results.clear(); - - std::unique_ptr dbPtr( - DeepSearch::openReadOnlyDatabase(mDbPath) ); - if(!dbPtr) return std::errc::bad_file_descriptor; - - Xapian::Database& db(*dbPtr); - - // Set up a QueryParser with a stemmer and suitable prefixes. - Xapian::QueryParser queryparser; - //queryparser.set_stemmer(Xapian::Stem("en")); - queryparser.set_stemming_strategy(queryparser.STEM_SOME); - // Start of prefix configuration. - //queryparser.add_prefix("title", "S"); - //queryparser.add_prefix("description", "XD"); - // End of prefix configuration. - - // And parse the query. - using XQP = Xapian::QueryParser; - Xapian::Query query = queryparser.parse_query( - queryStr, XQP::FLAG_WILDCARD | XQP::FLAG_DEFAULT ); - - // Use an Enquire object on the database to run the query. - Xapian::Enquire enquire(db); - enquire.set_query(query); - - Xapian::MSet mset = enquire.get_mset( - 0, maxResults ? 
maxResults : db.get_doccount() ); - - for( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m ) - { - const Xapian::Document& doc = m.get_document(); - DeepForumsSearchResult s; - s.mUrl = doc.get_value(URL_VALUENO); -#if XAPIAN_AT_LEAST(1,3,5) - s.mSnippet = mset.snippet(doc.get_data()); -#endif // XAPIAN_AT_LEAST(1,3,5) - results.push_back(s); - } - - return std::error_condition(); -} - -/*static*/ std::string DeepForumsIndex::forumIndexId(const RsGxsGroupId& grpId) -{ - RsUrl forumIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL); - forumIndexId.setQueryKV( - RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString() ); - return forumIndexId.toString(); -} - -/*static*/ std::string DeepForumsIndex::postIndexId( - const RsGxsGroupId& grpId, const RsGxsMessageId& msgId ) -{ - RsUrl postIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL); - postIndexId.setQueryKV(RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString()); - postIndexId.setQueryKV(RsGxsForums::FORUM_URL_MSG_ID_FIELD, msgId.toStdString()); - return postIndexId.toString(); -} - -std::error_condition DeepForumsIndex::indexForumGroup( - const RsGxsForumGroup& forum ) -{ - // Set up a TermGenerator that we'll use in indexing. - Xapian::TermGenerator termgenerator; - //termgenerator.set_stemmer(Xapian::Stem("en")); - - // We make a document and tell the term generator to use this. - Xapian::Document doc; - termgenerator.set_document(doc); - - // Index each field with a suitable prefix. - termgenerator.index_text(forum.mMeta.mGroupName, 1, "G"); - termgenerator.index_text( - DeepSearch::timetToXapianDate(forum.mMeta.mPublishTs), 1, "D" ); - termgenerator.index_text(forum.mDescription, 1, "XD"); - - // Index fields without prefixes for general search. 
- termgenerator.index_text(forum.mMeta.mGroupName); - termgenerator.increase_termpos(); - termgenerator.index_text(forum.mDescription); - - // store the RS link so we are able to retrive it on matching search - const std::string rsLink(forumIndexId(forum.mMeta.mGroupId)); - doc.add_value(URL_VALUENO, rsLink); - - /* Store some fields for display purposes. Retrieved later to provide the - * matching snippet on search */ - doc.set_data(forum.mMeta.mGroupName + "\n" + forum.mDescription); - - /* We use the identifier to ensure each object ends up in the database only - * once no matter how many times we run the indexer. - * "Q" prefix is a Xapian convention for unique id term. */ - const std::string idTerm("Q" + rsLink); - doc.add_boolean_term(idTerm); - - mWriteQueue.push([idTerm, doc](Xapian::WritableDatabase& db) - { db.replace_document(idTerm, doc); } ); - - return std::error_condition(); -} - -std::error_condition DeepForumsIndex::removeForumFromIndex( - const RsGxsGroupId& grpId ) -{ - mWriteQueue.push([grpId](Xapian::WritableDatabase& db) - { db.delete_document("Q" + forumIndexId(grpId)); }); - - return std::error_condition(); -} - -std::error_condition DeepForumsIndex::indexForumPost(const RsGxsForumMsg& post) -{ - RS_DBG4(post); - - const auto& groupId = post.mMeta.mGroupId; - const auto& msgId = post.mMeta.mMsgId; - - if(groupId.isNull() || msgId.isNull()) - { - RS_ERR("Got post with invalid id ", post); - print_stacktrace(); - return std::errc::invalid_argument; - } - - // Set up a TermGenerator that we'll use in indexing. - Xapian::TermGenerator termgenerator; - //termgenerator.set_stemmer(Xapian::Stem("en")); - - // We make a document and tell the term generator to use this. - Xapian::Document doc; - termgenerator.set_document(doc); - - // Index each field with a suitable prefix. 
- termgenerator.index_text(post.mMeta.mMsgName, 1, "S"); - termgenerator.index_text( - DeepSearch::timetToXapianDate(post.mMeta.mPublishTs), 1, "D" ); - - // Avoid indexing RetroShare-gui HTML tags - const std::string cleanMsg = DeepSearch::simpleTextHtmlExtract(post.mMsg); - termgenerator.index_text(cleanMsg, 1, "XD" ); - - // Index fields without prefixes for general search. - termgenerator.index_text(post.mMeta.mMsgName); - - termgenerator.increase_termpos(); - termgenerator.index_text(cleanMsg); - // store the RS link so we are able to retrive it on matching search - const std::string rsLink(postIndexId(groupId, msgId)); - doc.add_value(URL_VALUENO, rsLink); - - // Store some fields for display purposes. - doc.set_data(post.mMeta.mMsgName + "\n" + cleanMsg); - - // We use the identifier to ensure each object ends up in the - // database only once no matter how many times we run the - // indexer. - const std::string idTerm("Q" + rsLink); - doc.add_boolean_term(idTerm); - - mWriteQueue.push( [idTerm, doc](Xapian::WritableDatabase& db) - { db.replace_document(idTerm, doc); } ); - - - return std::error_condition(); -} - -std::error_condition DeepForumsIndex::removeForumPostFromIndex( - RsGxsGroupId grpId, RsGxsMessageId msgId ) -{ - // "Q" prefix is a Xapian convention for unique id term. 
- std::string idTerm("Q" + postIndexId(grpId, msgId)); - mWriteQueue.push( [idTerm](Xapian::WritableDatabase& db) - { db.delete_document(idTerm); } ); - - return std::error_condition(); -} - -/*static*/ std::string DeepForumsIndex::dbDefaultPath() -{ return RsAccounts::AccountDirectory() + "/deep_forum_index_xapian_db"; } diff --git a/src/deep_search/forumsindex.hpp b/src/deep_search/forumsindex.hpp deleted file mode 100644 index 2955ce323..000000000 --- a/src/deep_search/forumsindex.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************************************* - * RetroShare full text indexing and search implementation based on Xapian * - * * - * Copyright (C) 2021 Gioacchino Mazzurco * - * Copyright (C) 2021 Asociación Civil Altermundi * - * * - * This program is free software: you can redistribute it and/or modify * - * it under the terms of the GNU Affero General Public License version 3 as * - * published by the Free Software Foundation. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Affero General Public License for more details. * - * * - * You should have received a copy of the GNU Affero General Public License * - * along with this program. If not, see . 
* - * * - *******************************************************************************/ -#pragma once - -#include -#include -#include - -#include "util/rstime.h" -#include "retroshare/rsgxsforums.h" -#include "retroshare/rsevents.h" -#include "deep_search/commonutils.hpp" - -struct DeepForumsSearchResult -{ - std::string mUrl; - double mWeight; - std::string mSnippet; -}; - -struct DeepForumsIndex -{ - explicit DeepForumsIndex(const std::string& dbPath) : - mDbPath(dbPath), mWriteQueue(dbPath) {} - - /** - * @brief Search indexed GXS groups and messages - * @param[in] maxResults maximum number of acceptable search results, 0 for - * no limits - * @return search results count - */ - std::error_condition search( const std::string& queryStr, - std::vector& results, - uint32_t maxResults = 100 ); - - std::error_condition indexForumGroup(const RsGxsForumGroup& chan); - - std::error_condition removeForumFromIndex(const RsGxsGroupId& grpId); - - std::error_condition indexForumPost(const RsGxsForumMsg& post); - - std::error_condition removeForumPostFromIndex( - RsGxsGroupId grpId, RsGxsMessageId msgId ); - - static std::string dbDefaultPath(); - -private: - static std::string forumIndexId(const RsGxsGroupId& grpId); - static std::string postIndexId( - const RsGxsGroupId& grpId, const RsGxsMessageId& msgId ); - - enum : Xapian::valueno - { - /// Used to store retroshare url of indexed documents - URL_VALUENO, - - /// @see Xapian::BAD_VALUENO - BAD_VALUENO = Xapian::BAD_VALUENO - }; - - const std::string mDbPath; - - DeepSearch::StubbornWriteOpQueue mWriteQueue; -}; diff --git a/src/deep_search/forumsindex_fts5.cpp b/src/deep_search/forumsindex_fts5.cpp new file mode 100644 index 000000000..cab5e9102 --- /dev/null +++ b/src/deep_search/forumsindex_fts5.cpp @@ -0,0 +1,584 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on SQLite FTS5* + * * + * Copyright (C) 2026 
jolavillette * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . * + * * + *******************************************************************************/ + +#include "deep_search/forumsindex_fts5.hpp" +#include "deep_search/commonutils.hpp" +#include "retroshare/rsinit.h" +#include "retroshare/rsgxsforums.h" +#include "util/rsdebuglevel4.h" +#include "util/rsdebug.h" + +// Constructor +DeepForumsIndexFTS5::DeepForumsIndexFTS5(const std::string& dbPath, const std::string& dbKey) + : mDbPath(dbPath), mDbKey(dbKey), mDb(nullptr), mIsFTS5(true) +{ + RsDbg() << "DEEPSEARCH: Initializing FTS5 index at " << dbPath; + + // Initialize database immediately + auto err = initDatabase(); + if(err) + { + RsErr() << "DEEPSEARCH: Database initialization failed: " << err.message(); + } +} + +// Destructor +DeepForumsIndexFTS5::~DeepForumsIndexFTS5() +{ + RsDbg() << "DEEPSEARCH: Closing FTS5 index"; + if(mDb) + { + mDb->closeDb(); + } + mDb.reset(); +} + +// Search implementation +std::error_condition DeepForumsIndexFTS5::search( + const std::string& queryStr, + std::vector& results, + uint32_t maxResults +) +{ + RsDbg() << "DEEPSEARCH: search('" << queryStr << "')"; + RsDbg() << "DEEPSEARCH: Search query '" << queryStr << "' maxResults=" << maxResults; + results.clear(); + + if(queryStr.empty()) return std::error_condition(); + + // Initialize database if needed + if(!mDb) + { + auto err = initDatabase(); + if(err) return err; + } + 
+ // 1. Prepare Columns + // We want: url, type, group_id, msg_id, title, snippet(content), author_id, publish_ts, forum_name + std::list columns; + columns.push_back("url"); // 0 + columns.push_back("type"); // 1 + columns.push_back("group_id"); // 2 + columns.push_back("msg_id"); // 3 + columns.push_back("title"); // 4 + // Column 5 is content. We use snippet function on it. + // Syntax: snippet(table, col_index, start_match, end_match, ellipsis, max_tokens) + if(mIsFTS5) { + columns.push_back("snippet(forum_index, 5, '', '', '...', 64)"); // 5 + } else { + // FTS4 syntax: snippet(table, start, end, ellip, col, tokens) + // Wait, FTS4 snippet arg order: snippet(table, start, end, ellip, -1, 64) -> column index is implicit or tricky? + // Actually, FTS3/4 snippet: snippet(table, start, end, ellip) - column is auto-selected BEST column? + // Or snippet(table, start, end, ellip, col_index, tokens) provided by FTS4? + // Let's check docs again. FTS3 snippet: snippet(tbl, start, end, ellip). + // It allows 6 args. snippet(tbl, start, end, ellip, col, ntok). + // But in FTS3/4 col=-1 means "any column" or "best snippet". + // Let's use snippet(forum_index, '', '', '...', -1, 64) which is standard for "match anywhere". + // Although we want column 5 (content). + columns.push_back("snippet(forum_index, '', '', '...', 5, 64)"); + } + columns.push_back("author_id"); // 6 + columns.push_back("publish_ts"); // 7 + columns.push_back("forum_name"); // 8 + + // 2. 
Prepare WHERE clause (FTS5 MATCH) + // Must escape single quotes in the query string + auto escapeSQL = [](const std::string& input) -> std::string { + std::string output; + output.reserve(input.size()); + for(char c : input) { + if(c == '\'') output += "''"; + else output += c; + } + return output; + }; + + // Use prefix matching by adding * to the query if it's long enough + std::string ftsQuery = escapeSQL(queryStr); + if (ftsQuery.size() >= 1 && ftsQuery.back() != '*') { + ftsQuery += "*"; + } + + // FTS5 MATCH query: forum_index MATCH 'query*' + // We use a simple match first as multi-column specifiers {col1 col2} : ... + // can sometimes fail if columns are null or have specific tokenizer issues. + std::string where = "forum_index MATCH '" + ftsQuery + "'"; + + // Debug: Check total rows and do a LIKE test on BOTH title and content + std::list testCols = {"count(*)", + "count(case when title like '%" + escapeSQL(queryStr) + "%' then 1 end)", + "count(case when content like '%" + escapeSQL(queryStr) + "%' then 1 end)"}; + RetroCursor* countCursor = mDb->sqlQuery("forum_index", testCols, "", ""); + if (countCursor && countCursor->moveToFirst()) { + int totalRows = countCursor->getInt32(0); + int titleMatches = countCursor->getInt32(1); + int contentMatches = countCursor->getInt32(2); + RsDbg() << "DEEPSEARCH: Table 'forum_index' total rows: " << totalRows; + RsDbg() << "DEEPSEARCH: LIKE test for '%" << queryStr << "%' -> Titles: " << titleMatches << " | Content: " << contentMatches; + } + delete countCursor; + + // Debug: Dump first 3 rows to see actual content + RsDbg() << "DEEPSEARCH: --- Database Sample (First 3 rows) ---"; + // We use a raw query because RetroDb::sqlQuery is too rigid for complex LIMIT/selection tests + std::string sampleSQL = "SELECT rowid, title, content, author_name FROM forum_index LIMIT 3;"; + sqlite3_stmt* sampleStmt = NULL; + if (sqlite3_prepare_v2(mDb->getSqlHandle(), sampleSQL.c_str(), -1, &sampleStmt, NULL) == SQLITE_OK) { + while 
(sqlite3_step(sampleStmt) == SQLITE_ROW) { + const char* t = (const char*)sqlite3_column_text(sampleStmt, 1); + const char* c = (const char*)sqlite3_column_text(sampleStmt, 2); + const char* a = (const char*)sqlite3_column_text(sampleStmt, 3); + RsDbg() << "DEEPSEARCH: RowID: " << sqlite3_column_int64(sampleStmt, 0) + << " | Title: '" << (t?t:"NULL") << "' | Auth: '" << (a?a:"NULL") + << "' | Content Sample: '" << (c ? std::string(c).substr(0, 50) : "NULL") << "...'"; + } + sqlite3_finalize(sampleStmt); + } else { + RsDbg() << "DEEPSEARCH: Sample query failed: " << sqlite3_errmsg(mDb->getSqlHandle()); + } + RsDbg() << "DEEPSEARCH: ------------------------------------"; + + // Debug: Check table structure + RsDbg() << "DEEPSEARCH: --- Table Structure ---"; + if (sqlite3_prepare_v2(mDb->getSqlHandle(), "PRAGMA table_info(forum_index);", -1, &sampleStmt, NULL) == SQLITE_OK) { + while (sqlite3_step(sampleStmt) == SQLITE_ROW) { + RsDbg() << "DEEPSEARCH: Col: " << sqlite3_column_text(sampleStmt, 1) + << " | Type: " << sqlite3_column_text(sampleStmt, 2); + } + sqlite3_finalize(sampleStmt); + } + RsDbg() << "DEEPSEARCH: -----------------------"; + + // 3. Prepare ORDER BY / LIMIT + // FTS5 sorts by relevance automatically if we order by rank + // Note: RetroDb::sqlQuery adds its own " ORDER BY " prefix + std::string orderBy = "rank LIMIT " + std::to_string(maxResults); + + // 4. Execute Query + // Construct the full SQL query string for logging + std::string sql = "SELECT "; + bool firstCol = true; + for (const auto& col : columns) { + if (!firstCol) sql += ", "; + sql += col; + firstCol = false; + } + sql += " FROM forum_index WHERE " + where + " ORDER BY " + orderBy + ";"; + RsDbg() << "DEEPSEARCH: Executing SQL: " << sql; + RsDbg() << "DEEPSEARCH: Executing SQL: " << sql; + RetroCursor* c = mDb->sqlQuery("forum_index", columns, where, orderBy); + + if(!c || !c->moveToFirst()) + { + delete c; + RsDbg() << "DEEPSEARCH: MATCH query returned 0 results. 
Trying LIKE fallback..."; + + // Fallback to LIKE if MATCH fails (e.g. tokenizer issues) + std::string likeWhere = "(title LIKE '%" + escapeSQL(queryStr) + "%' OR " + + "content LIKE '%" + escapeSQL(queryStr) + "%')"; + + c = mDb->sqlQuery("forum_index", columns, likeWhere, ""); + if (!c || !c->moveToFirst()) { + RsDbg() << "DEEPSEARCH: LIKE fallback also failed."; + delete c; + c = nullptr; + } else { + RsDbg() << "DEEPSEARCH: LIKE fallback succeeded."; + } + } + + // 5. Parse Results + int count = 0; + // Note: moveToFirst() was already called and succeeded if c is not null. + bool valid = (c != nullptr); + while(valid) + { + DeepForumsSearchResult res; + + // 0: url -> mUrl + c->getString(0, res.mUrl); + + // 5: snippet -> mSnippet + c->getString(5, res.mSnippet); + + // mWeight (Relevance) + res.mWeight = 1.0; + + results.push_back(res); + count++; + + RsDbg() << "DEEPSEARCH: Result " << count << ": URL=" << res.mUrl << " Snippet='" << res.mSnippet << "'"; + + valid = c->moveToNext(); + } + + RsDbg() << "DEEPSEARCH: Search returned " << count << " results for query '" << queryStr << "'"; + if (count == 0) { + RsDbg() << "DEEPSEARCH: Last SQL Error: " << sqlite3_errmsg(mDb->getSqlHandle()); + } + RsDbg() << "DEEPSEARCH: Search returned " << count << " results for query '" << queryStr << "'"; + delete c; + return std::error_condition(); +} + +// Index forum group +std::error_condition DeepForumsIndexFTS5::indexForumGroup(const RsGxsForumGroup& forum) +{ + if(forum.mMeta.mGroupId.isNull()) + { + RsErr() << "DEEPSEARCH: Cannot index forum group with null ID"; + return std::errc::invalid_argument; + } + + // Initialize database if needed + if(!mDb) + { + auto err = initDatabase(); + if(err) return err; + } + + RsDbg() << "DEEPSEARCH: Indexing forum group " << forum.mMeta.mGroupId; + + // Prepare data + std::string cleanDesc = DeepSearch::simpleTextHtmlExtract(forum.mDescription); + std::string url = forumIndexId(forum.mMeta.mGroupId); + + // Escape SQL string helper 
+ auto escapeSQL = [](const std::string& input) -> std::string { + std::string output; + output.reserve(input.size()); + for(char c : input) { + if(c == '\'') output += "''"; + else output += c; + } + return output; + }; + + // SQL Query + // INSERT OR REPLACE INTO forum_index VALUES(...) + // Order: url, type, group_id, msg_id, title, content, author_id, author_name, forum_name, publish_ts, circle_type + std::string q = "INSERT OR REPLACE INTO forum_index VALUES("; + q += "'" + escapeSQL(url) + "', "; // url + q += "'group', "; // type + q += "'" + escapeSQL(forum.mMeta.mGroupId.toStdString()) + "', "; // group_id + q += "'', "; // msg_id (empty for group) + q += "'" + escapeSQL(forum.mMeta.mGroupName) + "', "; // title + q += "'" + escapeSQL(cleanDesc) + "', "; // content + q += "'" + escapeSQL(forum.mMeta.mAuthorId.toStdString()) + "', "; // author_id + q += "'', "; // author_name (TODO: fetch) + q += "'" + escapeSQL(forum.mMeta.mGroupName) + "', ";// forum_name (same as title) + q += std::to_string(forum.mMeta.mPublishTs) + ", "; // publish_ts + q += std::to_string(forum.mMeta.mCircleType); // circle_type + q += ");"; + + if(!mDb->execSQL(q)) + { + RsErr() << "DEEPSEARCH: Failed to execute INSERT for forum " << forum.mMeta.mGroupId; + return std::errc::io_error; + } + + RsDbg() << "DEEPSEARCH: Inserted forum " << forum.mMeta.mGroupId << " into FTS"; + return std::error_condition(); +} + +// Index forum post +std::error_condition DeepForumsIndexFTS5::indexForumPost(const RsGxsForumMsg& post) +{ + if(post.mMeta.mGroupId.isNull() || post.mMeta.mMsgId.isNull()) + { + RsErr() << "DEEPSEARCH: Cannot index post with null ID"; + return std::errc::invalid_argument; + } + + // Initialize database if needed + if(!mDb) + { + auto err = initDatabase(); + if(err) return err; + } + + RsDbg() << "DEEPSEARCH: Indexing post " << post.mMeta.mMsgId + << " in forum " << post.mMeta.mGroupId; + + // Prepare data + std::string cleanContent = 
DeepSearch::simpleTextHtmlExtract(post.mMsg); + std::string url = postIndexId(post.mMeta.mGroupId, post.mMeta.mMsgId); + + RsDbg() << "DEEPSEARCH: Indexing post " << post.mMeta.mMsgId << " (Title: '" << post.mMeta.mMsgName + << "', Content Size: " << cleanContent.size() << " bytes)"; + if (cleanContent.size() > 0) { + RsDbg() << "DEEPSEARCH: Content sample: " << cleanContent.substr(0, 100) << "..."; + } + + // escapeSQL helper + auto escapeSQL = [](const std::string& input) -> std::string { + std::string output; + output.reserve(input.size()); + for(char c : input) { + if(c == '\'') output += "''"; + else output += c; + } + return output; + }; + + // SQL Query + // INSERT OR REPLACE INTO forum_index VALUES(...) + std::string q = "INSERT OR REPLACE INTO forum_index VALUES("; + q += "'" + escapeSQL(url) + "', "; // url + q += "'post', "; // type + q += "'" + escapeSQL(post.mMeta.mGroupId.toStdString()) + "', "; // group_id + q += "'" + escapeSQL(post.mMeta.mMsgId.toStdString()) + "', "; // msg_id + q += "'" + escapeSQL(post.mMeta.mMsgName) + "', "; // title + q += "'" + escapeSQL(cleanContent) + "', "; // content + q += "'" + escapeSQL(post.mMeta.mAuthorId.toStdString()) + "', "; // author_id + q += "'', "; // author_name (TODO: fetch) + q += "'', "; // forum_name (TODO: fetch) + q += std::to_string(post.mMeta.mPublishTs) + ", "; // publish_ts + q += "0"; // circle_type (TODO: fetch) + q += ");"; + + if(!mDb->execSQL(q)) + { + RsErr() << "DEEPSEARCH: Failed to execute INSERT for post " << post.mMeta.mMsgId; + return std::errc::io_error; + } + + return std::error_condition(); +} + +// Remove forum from index +std::error_condition DeepForumsIndexFTS5::removeForumFromIndex(const RsGxsGroupId& grpId) +{ + if(grpId.isNull()) return std::errc::invalid_argument; + + // Initialize database if needed + if(!mDb) + { + auto err = initDatabase(); + if(err) return err; + } + + RsDbg() << "DEEPSEARCH: Removing forum " << grpId << " and all its posts from index"; + + // Delete 
everything related to this group (the group entry itself + all posts) + // escapeSQL helper + auto escapeSQL = [](const std::string& input) -> std::string { + std::string output; + output.reserve(input.size()); + for(char c : input) { + if(c == '\'') output += "''"; + else output += c; + } + return output; + }; + + std::string q = "DELETE FROM forum_index WHERE group_id='" + escapeSQL(grpId.toStdString()) + "';"; + + if(!mDb->execSQL(q)) + { + RsErr() << "DEEPSEARCH: Failed to remove forum " << grpId; + return std::errc::io_error; + } + + return std::error_condition(); +} + +// Remove post from index +std::error_condition DeepForumsIndexFTS5::removeForumPostFromIndex( + RsGxsGroupId grpId, + RsGxsMessageId msgId +) +{ + if(grpId.isNull() || msgId.isNull()) return std::errc::invalid_argument; + + // Initialize database if needed + if(!mDb) + { + auto err = initDatabase(); + if(err) return err; + } + + RsDbg() << "DEEPSEARCH: Removing post " << msgId + << " from forum " << grpId; + + std::string url = postIndexId(grpId, msgId); + + // escapeSQL helper + auto escapeSQL = [](const std::string& input) -> std::string { + std::string output; + output.reserve(input.size()); + for(char c : input) { + if(c == '\'') output += "''"; + else output += c; + } + return output; + }; + + std::string q = "DELETE FROM forum_index WHERE url='" + escapeSQL(url) + "';"; + + if(!mDb->execSQL(q)) + { + RsErr() << "DEEPSEARCH: Failed to remove post " << msgId; + return std::errc::io_error; + } + + return std::error_condition(); +} + +std::error_condition DeepForumsIndexFTS5::clearIndex() +{ + if(!mDb) initDatabase(); + if(!mDb || !mDb->isOpen()) return std::errc::io_error; + + RsDbg() << "DEEPSEARCH: Clearing all forum index entries..."; + if(!mDb->execSQL("DELETE FROM forum_index;")) + { + RsErr() << "DEEPSEARCH: Failed to clear forum index"; + return std::errc::io_error; + } + + // SQLite DELETE doesn't reclaim disk space by default. 
+ // VACUUM re-packs the database and actually reduces the file size. + RsDbg() << "DEEPSEARCH: Vacuuming database to reclaim space..."; + mDb->execSQL("VACUUM;"); + + return std::error_condition(); +} + +// Initialize database (to be implemented in Step 3) +std::error_condition DeepForumsIndexFTS5::initDatabase() +{ + RsDbg() << "DEEPSEARCH: Initializing FTS5 database at " << mDbPath; + + // Open database with RetroDb (SQLCipher enabled) + try + { + mDb = std::make_unique( + mDbPath, + RetroDb::OPEN_READWRITE_CREATE, + mDbKey + ); + } + catch(const std::exception& e) + { + RsErr() << "DEEPSEARCH: Exception opening database: " << e.what(); + return std::errc::io_error; + } + + if(!mDb || !mDb->isOpen()) + { + RsErr() << "DEEPSEARCH: Failed to open database"; + return std::errc::io_error; + } + + RsDbg() << "DEEPSEARCH: Database opened successfully"; + + // Create FTS5 table if it doesn't exist + // Full schema with author, forum metadata, and security fields + const char* createTableSQL = + "CREATE VIRTUAL TABLE IF NOT EXISTS forum_index USING fts5(" + " url UNINDEXED," // RetroShare URL (retroshare://forum?id=...) + " type UNINDEXED," // 'group' or 'post' + " group_id UNINDEXED," // Forum group ID + " msg_id UNINDEXED," // Message ID (NULL for groups) + " title," // Forum/Post title (indexed for search) + " content," // Forum description or post content (indexed) + " author_id UNINDEXED," // GxsId of the author + " author_name," // Author name (indexed for search by author) + " forum_name," // Forum name (indexed for search in forum names) + " publish_ts UNINDEXED," // Publish timestamp + " circle_type UNINDEXED" // PUBLIC/PRIVATE/RESTRICTED (for security filtering) + ");"; + + if(!mDb->execSQL(createTableSQL)) + { + RsErr() << "DEEPSEARCH: Failed to create FTS5 table. 
Trying fallback to FTS4..."; + + // Fallback to FTS4 + const char* createTableSQL_FTS4 = + "CREATE VIRTUAL TABLE IF NOT EXISTS forum_index USING fts4(" + " url UNINDEXED," // RetroShare URL (retroshare://forum?id=...) + " type UNINDEXED," // 'group' or 'post' + " group_id UNINDEXED," // Forum group ID + " msg_id UNINDEXED," // Message ID (NULL for groups) + " title," // Forum/Post title (indexed for search) + " content," // Forum description or post content (indexed) + " author_id UNINDEXED," // GxsId of the author + " author_name," // Author name (indexed for search by author) + " forum_name," // Forum name (indexed for search in forum names) + " publish_ts UNINDEXED," // Publish timestamp + " circle_type UNINDEXED" // PUBLIC/PRIVATE/RESTRICTED (for security filtering) + ");"; + + if(!mDb->execSQL(createTableSQL_FTS4)) + { + RsErr() << "DEEPSEARCH: Failed to create FTS4 table as well."; + return std::errc::io_error; + } + RsDbg() << "DEEPSEARCH: FTS4 table 'forum_index' created (fallback)."; + mIsFTS5 = false; + } + else + { + RsDbg() << "DEEPSEARCH: FTS5 table 'forum_index' created successfully."; + } + + RsDbg() << "DEEPSEARCH: FTS5 table 'forum_index' ready (isFTS5=" << mIsFTS5 << ")"; + return std::error_condition(); +} + +void DeepForumsIndexFTS5::beginTransaction() +{ + if(!mDb) initDatabase(); + if(mDb) mDb->execSQL("BEGIN;"); +} + +void DeepForumsIndexFTS5::commitTransaction() +{ + if(!mDb) initDatabase(); + if(mDb) mDb->execSQL("COMMIT;"); +} + +// Generate forum index ID +/*static*/ std::string DeepForumsIndexFTS5::forumIndexId(const RsGxsGroupId& grpId) +{ + RsUrl forumIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL); + forumIndexId.setQueryKV( + RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString() + ); + return forumIndexId.toString(); +} + +// Generate post index ID +/*static*/ std::string DeepForumsIndexFTS5::postIndexId( + const RsGxsGroupId& grpId, + const RsGxsMessageId& msgId +) +{ + RsUrl postIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL); + 
postIndexId.setQueryKV(RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString()); + postIndexId.setQueryKV(RsGxsForums::FORUM_URL_MSG_ID_FIELD, msgId.toStdString()); + return postIndexId.toString(); +} + +// Get default database path +/*static*/ std::string DeepForumsIndexFTS5::dbDefaultPath() +{ + return RsAccounts::AccountDirectory() + "/deep_forum_index_fts5.db"; +} diff --git a/src/deep_search/forumsindex_fts5.hpp b/src/deep_search/forumsindex_fts5.hpp new file mode 100644 index 000000000..631163ea0 --- /dev/null +++ b/src/deep_search/forumsindex_fts5.hpp @@ -0,0 +1,162 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on SQLite FTS5* + * * + * Copyright (C) 2026 jolavillette * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + * * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +#include "util/rstime.h" +#include "util/retrodb.h" +#include "retroshare/rsgxsforums.h" +#include "retroshare/rsevents.h" +#include "deep_search/commonutils.hpp" + +struct DeepForumsSearchResult +{ + std::string mUrl; + double mWeight; + std::string mSnippet; +}; + +struct DeepForumsIndexFTS5 +{ + explicit DeepForumsIndexFTS5(const std::string& dbPath, const std::string& dbKey); + virtual ~DeepForumsIndexFTS5(); + + /** + * @brief Search indexed GXS groups and messages using FTS5 + * @param[in] queryStr Search query string + * @param[out] results Vector of search results + * @param[in] maxResults Maximum number of results to return (0 = no limit) + * @return Error condition if search fails + */ + std::error_condition search( + const std::string& queryStr, + std::vector& results, + uint32_t maxResults = 100 + ); + + /** + * @brief Index a forum group (metadata only) + * @param[in] forum Forum group to index + * @return Error condition if indexing fails + */ + std::error_condition indexForumGroup(const RsGxsForumGroup& forum); + + /** + * @brief Remove a forum group from the index + * @param[in] grpId Forum group ID to remove + * @return Error condition if removal fails + */ + std::error_condition removeForumFromIndex(const RsGxsGroupId& grpId); + + /** + * @brief Index a forum post (title + content) + * @param[in] post Forum post to index + * @return Error condition if indexing fails + */ + std::error_condition indexForumPost(const RsGxsForumMsg& post); + + /** + * @brief Remove a forum post from the index + * @param[in] grpId Forum group ID + * @param[in] msgId Message ID to remove + * @return Error condition if removal fails + */ + std::error_condition removeForumPostFromIndex( + RsGxsGroupId grpId, + RsGxsMessageId msgId + ); + + /** + * @brief Clear all entries from the index + * @return Error condition if operation fails + 
*/ + std::error_condition clearIndex(); + + /** + * @brief Start an SQL transaction + */ + void beginTransaction(); + + /** + * @brief Commit an SQL transaction + */ + void commitTransaction(); + + /** + * @brief Get default database path for FTS5 index + * @return Path to encrypted FTS5 database + */ + static std::string dbDefaultPath(); + +private: + /** + * @brief Initialize FTS5 database and create tables if needed + * @return Error condition if initialization fails + */ + std::error_condition initDatabase(); + + /** + * @brief Generate unique index ID for a forum group + * @param[in] grpId Forum group ID + * @return RetroShare URL string + */ + static std::string forumIndexId(const RsGxsGroupId& grpId); + + /** + * @brief Generate unique index ID for a forum post + * @param[in] grpId Forum group ID + * @param[in] msgId Message ID + * @return RetroShare URL string + */ + static std::string postIndexId( + const RsGxsGroupId& grpId, + const RsGxsMessageId& msgId + ); + + const std::string mDbPath; + const std::string mDbKey; + std::unique_ptr mDb; + bool mIsFTS5; +}; + +/* + * FTS5 Table Schema (Complete - 11 columns): + * + * CREATE VIRTUAL TABLE forum_index USING fts5( + * url UNINDEXED, -- RetroShare URL (retroshare://forum?id=...) 
+ * type UNINDEXED, -- 'group' or 'post' + * group_id UNINDEXED, -- Forum group ID + * msg_id UNINDEXED, -- Message ID (NULL for groups) + * title, -- Forum/Post title (INDEXED for search) + * content, -- Forum description or post content (INDEXED) + * author_id UNINDEXED, -- GxsId of the author + * author_name, -- Author name (INDEXED for search by author) + * forum_name, -- Forum name (INDEXED for search in forum names) + * publish_ts UNINDEXED, -- Publish timestamp + * circle_type UNINDEXED -- PUBLIC/PRIVATE/RESTRICTED (for security filtering) + * ); + * + * Indexed columns (FTS5 search): title, content, author_name, forum_name + * Unindexed columns (metadata): url, type, group_id, msg_id, author_id, publish_ts, circle_type + * + * The database is encrypted using SQLCipher with the same key as GXS databases. + */ diff --git a/src/gxs/rsdataservice.cc b/src/gxs/rsdataservice.cc index 61900e171..25fe6b564 100644 --- a/src/gxs/rsdataservice.cc +++ b/src/gxs/rsdataservice.cc @@ -1779,3 +1779,8 @@ void RsDataService::debug_printCacheSize() + +std::string RsDataService::getEncryptionKey() const +{ + return mDb ? mDb->getKey() : ""; +} diff --git a/src/gxs/rsdataservice.h b/src/gxs/rsdataservice.h index e9554f04f..6f1f08ab6 100644 --- a/src/gxs/rsdataservice.h +++ b/src/gxs/rsdataservice.h @@ -259,6 +259,8 @@ class RsDataService : public RsGeneralDataService bool validSize(RsNxsMsg* msg) const override; bool validSize(RsNxsGrp* grp) const override; + std::string getEncryptionKey() const override; + /*! * Convenience function used to only update group keys. This is used when sending * publish keys between peers. diff --git a/src/gxs/rsgds.h b/src/gxs/rsgds.h index 790a68deb..060d3cc56 100644 --- a/src/gxs/rsgds.h +++ b/src/gxs/rsgds.h @@ -277,5 +277,10 @@ class RsGeneralDataService * @return whether the size of grp is valid for storage */ virtual bool validSize(RsNxsGrp* grp) const = 0 ; + + /*! 
+ * @return the encryption key used for current data service + */ + virtual std::string getEncryptionKey() const = 0; }; diff --git a/src/libretroshare.pro b/src/libretroshare.pro index 812c59385..fe17d8805 100644 --- a/src/libretroshare.pro +++ b/src/libretroshare.pro @@ -1043,12 +1043,15 @@ rs_jsonapi { SOURCES += jsonapi/jsonapi.cpp } + rs_deep_forums_index { HEADERS *= deep_search/commonutils.hpp SOURCES *= deep_search/commonutils.cpp - HEADERS += deep_search/forumsindex.hpp - SOURCES += deep_search/forumsindex.cpp + + # FTS5 implementation (parallel to Xapian for now) + HEADERS += deep_search/forumsindex_fts5.hpp + SOURCES += deep_search/forumsindex_fts5.cpp } rs_deep_channels_index { diff --git a/src/retroshare/rsgxsforums.h b/src/retroshare/rsgxsforums.h index ccd16a923..2ecd51b1c 100644 --- a/src/retroshare/rsgxsforums.h +++ b/src/retroshare/rsgxsforums.h @@ -418,6 +418,11 @@ class RsGxsForums: public RsGxsIfaceHelper virtual bool subscribeToForum( const RsGxsGroupId& forumId, bool subscribe ) = 0; + /** + * @brief Re-index all forums content for Full Text Search + */ + virtual void reindexAll() = 0; + /// default base URL used for forums links @see exportForumLink static const std::string DEFAULT_FORUM_BASE_URL; diff --git a/src/services/p3gxsforums.cc b/src/services/p3gxsforums.cc index 14dd808e6..34c1d07d2 100644 --- a/src/services/p3gxsforums.cc +++ b/src/services/p3gxsforums.cc @@ -61,7 +61,7 @@ p3GxsForums::p3GxsForums( RsGeneralDataService *gds, mGenActive(false), mGenCount(0), mKnownForumsMutex("GXS forums known forums timestamp cache") #ifdef RS_DEEP_FORUMS_INDEX - , mDeepIndex(DeepForumsIndex::dbDefaultPath()) + , mDeepIndex(DeepForumsIndexFTS5::dbDefaultPath(), gds->getEncryptionKey()) #endif { // Test Data disabled in Repo. 
@@ -916,6 +916,86 @@ bool p3GxsForums::subscribeToForum(const RsGxsGroupId& groupId, bool subscribe ) return true; } +void p3GxsForums::reindexAll() +{ +#ifdef RS_DEEP_FORUMS_INDEX + std::cerr << "DEEPSEARCH: Starting full re-indexation..." << std::endl; + + // Clear existing index to avoid duplicates + mDeepIndex.clearIndex(); + + // Use a transaction for re-indexing (much faster and safer) + mDeepIndex.beginTransaction(); + + // 1. Get all groups (empty list = request all) + std::vector groupsInfo; + if(!getForumsInfo({}, groupsInfo)) + { + std::cerr << "DEEPSEARCH: Failed to get group list using getForumsInfo" << std::endl; + mDeepIndex.commitTransaction(); + return; + } + + int grpCount = 0; + int msgCount = 0; + + for(const auto& group : groupsInfo) + { + // 1. Index Group + mDeepIndex.indexForumGroup(group); + grpCount++; + + // 2. Get Message IDs using Metadata + std::vector msgMetas; + if(!getForumMsgMetaData(group.mMeta.mGroupId, msgMetas)) + { + std::cerr << "DEEPSEARCH: Failed to get metadata for group " << group.mMeta.mGroupId << std::endl; + continue; + } + + if(msgMetas.empty()) continue; + + // 3. Fetch Message Content in Batches of 50 + std::set batch; + + auto processBatch = [&](const std::set& batchIds) + { + std::vector msgs; + // getForumContent handles token request internally + if(getForumContent(group.mMeta.mGroupId, batchIds, msgs)) + { + for(const auto& msg : msgs) + { + if(!mDeepIndex.indexForumPost(msg)) msgCount++; + } + } + else + { + std::cerr << "DEEPSEARCH: Failed to get content for batch" << std::endl; + } + }; + + for(const auto& meta : msgMetas) + { + batch.insert(meta.mMsgId); + if(batch.size() >= 50) + { + processBatch(batch); + batch.clear(); + } + } + if(!batch.empty()) + { + processBatch(batch); + } + } + + mDeepIndex.commitTransaction(); + + std::cerr << "DEEPSEARCH: Re-indexation completed. 
Groups: " << grpCount << ", Messages: " << msgCount << std::endl; +#endif +} + bool p3GxsForums::exportForumLink( std::string& link, const RsGxsGroupId& forumId, bool includeGxsData, const std::string& baseUrl, std::string& errMsg ) @@ -1972,98 +2052,129 @@ std::error_condition p3GxsForums::distantSearchRequest( std::error_condition p3GxsForums::localSearch( const std::string& matchString, std::vector& searchResults ) -{ return prepareSearchResults(matchString, false, searchResults); } +{ + RsDbg() << "DEEPSEARCH: localSearch entry pattern='" << matchString << "'"; + auto res = prepareSearchResults(matchString, false, searchResults); + RsDbg() << "DEEPSEARCH: localSearch exit. Found " << searchResults.size() << " results."; + return res; +} std::error_condition p3GxsForums::prepareSearchResults( const std::string& matchString, bool publicOnly, std::vector& searchResults ) { - std::vector results; - auto mErr = mDeepIndex.search(matchString, results); - if(mErr) return mErr; + std::vector results; + auto mErr = mDeepIndex.search(matchString, results); + if(mErr) return mErr; - searchResults.clear(); - for(auto uRes: results) - { - RsUrl resUrl(uRes.mUrl); - const auto forumIdStr = resUrl.getQueryV(RsGxsForums::FORUM_URL_ID_FIELD); - if(!forumIdStr) - { - RS_ERR( "Forum URL retrieved from deep index miss ID. ", - "Should never happen! ", uRes.mUrl ); - print_stacktrace(); - return std::errc::address_not_available; - } + searchResults.clear(); - std::vector forumsInfo; - RsGxsGroupId forumId(*forumIdStr); - if(forumId.isNull()) - { - RS_ERR( "Forum ID retrieved from deep index is invalid. ", - "Should never happen! ", uRes.mUrl ); - print_stacktrace(); - return std::errc::bad_address; - } + // 1. 
Group results by ForumId to batch requests + std::map> forumToMsgs; + std::vector> orderedResults; + std::map, std::string> contextMap; - if( !getForumsInfo(std::list{forumId}, forumsInfo) || - forumsInfo.empty() ) - { - RS_ERR( "Forum just parsed from deep index link not found. " - "Should never happen! ", forumId, " ", uRes.mUrl ); - print_stacktrace(); - return std::errc::identifier_removed; - } + for(const auto& uRes : results) + { + RsUrl resUrl(uRes.mUrl); + const auto forumIdStr = resUrl.getQueryV(RsGxsForums::FORUM_URL_ID_FIELD); + if(!forumIdStr) continue; - RsGroupMetaData& fMeta(forumsInfo[0].mMeta); + RsGxsGroupId forumId(*forumIdStr); + if(forumId.isNull()) continue; - // Avoid leaking sensitive information to unkown peers - if( publicOnly && - ( static_cast(fMeta.mCircleType) != - RsGxsCircleType::PUBLIC ) ) continue; + RsGxsMessageId msgId; + const auto postIdStr = resUrl.getQueryV(RsGxsForums::FORUM_URL_MSG_ID_FIELD); + if(postIdStr) + { + msgId = RsGxsMessageId(*postIdStr); + if(!msgId.isNull()) + { + forumToMsgs[forumId].insert(msgId); + } + } - RsGxsSearchResult res; - res.mGroupId = forumId; - res.mGroupName = fMeta.mGroupName; - res.mAuthorId = fMeta.mAuthorId; - res.mPublishTs = fMeta.mPublishTs; - res.mSearchContext = uRes.mSnippet; + orderedResults.push_back({forumId, msgId}); + contextMap[{forumId, msgId}] = uRes.mSnippet; + } - auto postIdStr = - resUrl.getQueryV(RsGxsForums::FORUM_URL_MSG_ID_FIELD); - if(postIdStr) - { - RsGxsMessageId msgId(*postIdStr); - if(msgId.isNull()) - { - RS_ERR( "Post just parsed from deep index link is invalid. " - "Should never happen! ", postIdStr, " ", uRes.mUrl ); - print_stacktrace(); - return std::errc::bad_address; - } + // 2. 
Batch fetch ALL needed forum metadata in one go + std::list forumIdList; + for(auto const& [gid, msgs] : forumToMsgs) { + forumIdList.push_back(gid); + } + // Also include forums that don't have messages but were in orderedResults + for(auto const& resPair : orderedResults) { + if (forumToMsgs.find(resPair.first) == forumToMsgs.end()) { + forumIdList.push_back(resPair.first); + } + } + forumIdList.sort(); + forumIdList.unique(); - std::vector msgSummaries; - auto errc = getContentSummaries( - forumId, std::set{msgId}, msgSummaries); - if(errc) return errc; + std::vector forumsInfo; + if(!forumIdList.empty()) { + getForumsInfo(forumIdList, forumsInfo); + } - if(msgSummaries.size() != 1) - { - RS_ERR( "getContentSummaries returned: ", msgSummaries.size(), - "should never happen!" ); - return std::errc::result_out_of_range; - } + std::map forumDataMap; + for(const auto& f : forumsInfo) { + forumDataMap[f.mMeta.mGroupId] = f; + } - RsMsgMetaData& msgMeta(msgSummaries[0]); - res.mMsgId = msgMeta.mMsgId; - res.mMsgName = msgMeta.mMsgName; - res.mAuthorId = msgMeta.mAuthorId; - } + // 3. Batch fetch message summaries PER forum + std::map> msgDataMap; + for(auto const& [gid, msgIds] : forumToMsgs) + { + std::vector msgSummaries; + getContentSummaries(gid, msgIds, msgSummaries); + for(const auto& m : msgSummaries) { + msgDataMap[gid][m.mMsgId] = m; + } + } - RS_DBG4(res); - searchResults.push_back(res); - } + RsDbg() << "DEEPSEARCH: prepareSearchResults: FTS returned " << results.size() << " raw matches."; - return std::error_condition(); + // 4. 
Assemble final results maintaining search order + for(const auto& resPair : orderedResults) + { + const RsGxsGroupId& forumId = resPair.first; + const RsGxsMessageId& msgId = resPair.second; + + if (forumDataMap.find(forumId) == forumDataMap.end()) { + RsDbg() << "DEEPSEARCH: skipping result: forum metadata not found for " << forumId.toStdString(); + continue; + } + const RsGxsForumGroup& forum = forumDataMap[forumId]; + + // Avoid leaking sensitive information to unknown peers + if( publicOnly && ( static_cast(forum.mMeta.mCircleType) != RsGxsCircleType::PUBLIC ) ) + continue; + + RsGxsSearchResult finalRes; + finalRes.mGroupId = forumId; + finalRes.mGroupName = forum.mMeta.mGroupName; + finalRes.mAuthorId = forum.mMeta.mAuthorId; + finalRes.mPublishTs = forum.mMeta.mPublishTs; + finalRes.mSearchContext = contextMap[resPair]; + + if(!msgId.isNull()) + { + if (msgDataMap[forumId].find(msgId) == msgDataMap[forumId].end()) { + RsDbg() << "DEEPSEARCH: skipping result: message metadata not found for " << msgId.toStdString() << " in forum " << forumId.toStdString(); + continue; + } + const RsMsgMetaData& msgMeta = msgDataMap[forumId][msgId]; + finalRes.mMsgId = msgMeta.mMsgId; + finalRes.mMsgName = msgMeta.mMsgName; + finalRes.mAuthorId = msgMeta.mAuthorId; + } + + RS_DBG4(finalRes); + searchResults.push_back(finalRes); + } + + return std::error_condition(); } std::error_condition p3GxsForums::receiveDistantSearchResult( diff --git a/src/services/p3gxsforums.h b/src/services/p3gxsforums.h index 4cc6efa1e..b7c8ae841 100644 --- a/src/services/p3gxsforums.h +++ b/src/services/p3gxsforums.h @@ -33,7 +33,7 @@ #include "util/rsdebug.h" #ifdef RS_DEEP_FORUMS_INDEX -#include "deep_search/forumsindex.hpp" +#include "deep_search/forumsindex_fts5.hpp" #endif @@ -132,6 +132,8 @@ class p3GxsForums: public RsGenExchange, public RsGxsForums, public p3Config, virtual bool subscribeToForum( const RsGxsGroupId& forumId, bool subscribe ) override; + virtual void reindexAll() override; + 
namespace DeepSearch
{

/**
 * @brief Extract the text of an HTML message by dropping its markup.
 *
 * The input is returned unchanged when it is empty, when it does not both
 * start with '<' and end with '>' (treated as plain text), or when it has no
 * complete opening body tag. Otherwise everything up to the end of the
 * opening body tag is discarded, the first style element is removed together
 * with its content, and every remaining "<...>" sequence is stripped.
 *
 * Tags are stripped in a single left-to-right pass. Each '<' is matched with
 * the first '>' that follows it, so a literal '>' in the text before a tag is
 * kept and cannot cut the rest of the message, and crafted input cannot make
 * the loop spin. Entities such as "&amp;" are not decoded.
 *
 * @param[in] rsHtmlDoc Message as stored, either HTML or plain text
 * @return The message text without markup
 */
std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc)
{
    if(rsHtmlDoc.empty()) return rsHtmlDoc;

    const bool isPlainMsg =
            rsHtmlDoc.front() != '<' || rsHtmlDoc.back() != '>';
    if(isPlainMsg) return rsHtmlDoc;

    const auto bodyTagBegin = rsHtmlDoc.find("<body");
    if(bodyTagBegin == std::string::npos) return rsHtmlDoc;

    const auto bodyTagEnd = rsHtmlDoc.find('>', bodyTagBegin);
    if(bodyTagEnd == std::string::npos) return rsHtmlDoc;

    std::string body(rsHtmlDoc, bodyTagEnd + 1);

    // strip also CSS inside <style></style>
    const std::string styleClose("</style>");
    const auto styleTagBegin = body.find("<style");
    if(styleTagBegin != std::string::npos)
    {
        const auto styleEnd = body.find(styleClose, styleTagBegin);
        if(styleEnd != std::string::npos)
            body.erase( styleTagBegin,
                        styleEnd + styleClose.size() - styleTagBegin );
    }

    std::string retVal;
    retVal.reserve(body.size());

    std::string::size_type pos = 0;
    while(pos < body.size())
    {
        const auto oPos = body.find('<', pos);
        const auto cPos = oPos == std::string::npos ?
                    std::string::npos : body.find('>', oPos + 1);
        if(cPos == std::string::npos)
        {
            // No further complete tag: keep the remaining text as it is
            retVal.append(body, pos, std::string::npos);
            break;
        }

        retVal.append(body, pos, oPos - pos);
        pos = cPos + 1;
    }

    return retVal;
}

}
+#endif // RS_DEEP_CHANNEL_INDEX || RS_DEEP_FILES_INDEX diff --git a/src/deep_search/commonutils.hpp b/src/deep_search/commonutils.hpp index 5f47c39bd..3d629d5f0 100644 --- a/src/deep_search/commonutils.hpp +++ b/src/deep_search/commonutils.hpp @@ -19,14 +19,25 @@ *******************************************************************************/ #pragma once +#include + +#include "util/rstime.h" + +namespace DeepSearch +{ +// Common utilities (always available, used by FTS5) +std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc); +} + +// Xapian-specific code (only for channels/files indexing) +#if defined(RS_DEEP_CHANNEL_INDEX) || defined(RS_DEEP_FILES_INDEX) + #include #include #include #include #include -#include "util/rstime.h" - #ifndef XAPIAN_AT_LEAST #define XAPIAN_AT_LEAST(A,B,C) (XAPIAN_MAJOR_VERSION > (A) || \ (XAPIAN_MAJOR_VERSION == (A) && \ @@ -67,3 +78,5 @@ struct StubbornWriteOpQueue }; } + +#endif // RS_DEEP_CHANNEL_INDEX || RS_DEEP_FILES_INDEX diff --git a/src/use_libretroshare.pri b/src/use_libretroshare.pri index 85377d042..a8e4ccbac 100644 --- a/src/use_libretroshare.pri +++ b/src/use_libretroshare.pri @@ -108,7 +108,7 @@ linux-* { mLibs += dl } -rs_deep_channels_index | rs_deep_files_index | rs_deep_forums_index { +rs_deep_channels_index | rs_deep_files_index { mLibs += xapian win32-g++|win32-clang-g++:mLibs += rpcrt4 }