From 9d08ff5f1fbc0aa460dd03ab9f250c88067d4b64 Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Tue, 26 Aug 2025 16:56:18 +0930 Subject: [PATCH 1/5] feat: add Typesense backend for search Also run `make upgrade` to be able to successfully compile dependencies. This upgrade may need to be refactored out of this commit. Private-ref: https://tasks.opencraft.com/browse/BB-9975 --- forum/search/typesense.py | 377 ++++++++++++++++++++++++++++++++++++++ forum/settings/common.py | 8 +- requirements/base.in | 1 + requirements/base.txt | 3 + requirements/ci.txt | 3 + requirements/dev.txt | 5 + requirements/doc.txt | 3 + requirements/quality.txt | 3 + requirements/test.txt | 3 + 9 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 forum/search/typesense.py diff --git a/forum/search/typesense.py b/forum/search/typesense.py new file mode 100644 index 00000000..eb3fc99b --- /dev/null +++ b/forum/search/typesense.py @@ -0,0 +1,377 @@ +""" +Typesense backend for searching comments and threads. +""" + +from typing import Any, Optional + +from bs4 import BeautifulSoup +from django.conf import settings +from django.core.paginator import Paginator +from typesense import Client +from typesense.types.collection import CollectionCreateSchema +from typesense.types.document import DocumentSchema, SearchParameters +from typesense.exceptions import ObjectNotFound + +from forum.backends.mysql.models import Comment, CommentThread +from forum.constants import FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT +from forum.search.base import ( + BaseDocumentSearchBackend, + BaseIndexSearchBackend, + BaseSearchBackend, + BaseThreadSearchBackend, +) + +_TYPESENSE_CLIENT: Client | None = None + + +def get_typesense_client() -> Client: + """ + Return a singleton Typesense client instance. + """ + global _TYPESENSE_CLIENT + if _TYPESENSE_CLIENT is None: + _TYPESENSE_CLIENT = Client( + { + "api_key": settings.TYPESENSE_API_KEY, + "nodes": [settings.TYPESENSE_URL], + } + ) + return _TYPESENSE_CLIENT + + +class CommentsIndex: + """ + Common data and operations relating to the comments index. + """ + + model = Comment + + @staticmethod + def name() -> str: + """ + Return the Typesense index name for the index. + """ + return settings.TYPESENSE_COLLECTION_PREFIX + "comments" + + @classmethod + def schema(cls) -> CollectionCreateSchema: + return { + "name": cls.name(), + "fields": [ + {"name": "course_id", "type": "string"}, + {"name": "comment_thread_id", "type": "string"}, + {"name": "body", "type": "string"}, + ], + } + + @staticmethod + def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document for this index. + """ + # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". + return { + "id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "comment_thread_id": str(data.get("comment_thread_id", "")), + "body": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + @staticmethod + def build_search_parameters( + *, search_text: str, course_id: str | None + ) -> SearchParameters: + """ + Build Typesense search parameters for this index. + """ + return { + "q": search_text, + "query_by": "body", + "filter_by": ( + f"course_id:={quote_filter_value(course_id)}" if course_id else "" + ), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } + + +class CommentThreadsIndex: + """ + Common data and operations relating to the comments index. 
+ """ + + model = CommentThread + + @staticmethod + def name() -> str: + """ + Return the Typesense index name for the index. + """ + return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads" + + @classmethod + def schema(cls) -> CollectionCreateSchema: + return { + "name": cls.name(), + "fields": [ + {"name": "course_id", "type": "string"}, + {"name": "commentable_id", "type": "string"}, + {"name": "context", "type": "string"}, + {"name": "title", "type": "string"}, + {"name": "body", "type": "string"}, + ], + } + + @staticmethod + def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document for this index. + """ + return { + "id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "commentable_id": str(data.get("commentable_id", "")), + "context": str(data.get("context", "")), + "title": str(data.get("title", "")), + "body": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + @staticmethod + def build_search_parameters( + *, + search_text: str, + course_id: str | None, + context: str, + commentable_ids: list[str] | None, + ) -> SearchParameters: + """ + Build Typesense search parameters for this index. + """ + # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + filters = [f"context:{quote_filter_value(context)}"] + if commentable_ids: + safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) + filters.append(f"commentable_ids:[{safe_ids}]") + if course_id: + filters.append(f"course_id:={quote_filter_value(course_id)}") + + return { + "q": search_text, + "query_by": "title,body", + "filter_by": " && ".join(filters), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } + + +INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = { + "comments": CommentsIndex, + "comment_threads": CommentThreadsIndex, +} + + +class TypesenseDocumentBackend(BaseDocumentSearchBackend): + """ + Document backend implementation for Typesense. + """ + + def index_document( + self, index_name: str, doc_id: str | int, document: dict[str, Any] + ) -> None: + """ + Index a document in Typesense. + """ + client = get_typesense_client() + index = INDICES[index_name] + typesense_document = index.build_document(doc_id, document) + client.collections[index.name()].documents.upsert(typesense_document) + + def update_document( + self, index_name: str, doc_id: str | int, update_data: dict[str, Any] + ) -> None: + """ + Same operation as index_document, because upsert is used. + """ + return self.index_document(index_name, doc_id, update_data) + + def delete_document(self, index_name: str, doc_id: str | int) -> None: + """ + Delete a document from Typesense. + """ + client = get_typesense_client() + index = INDICES[index_name] + client.collections[index.name()].documents[str(doc_id)].delete( + delete_parameters={"ignore_not_found": True}, + ) + + +class TypesenseIndexBackend(BaseIndexSearchBackend): + """ + Manage indexes for the Typesense backend. + + Typesense calls these "collections". https://typesense.org/docs/29.0/api/collections.html + """ + + def initialize_indices(self, force_new_index: bool = False) -> None: + """ + Initialize the indices in Typesense. + + If force_new_index is True, the indexes will be dropped before being recreated. 
+ """ + client = get_typesense_client() + for index in INDICES.values(): + exists: bool = True + try: + client.collections[index.name()].retrieve() + except ObjectNotFound: + exists = False + + if force_new_index and exists: + client.collections[index.name()].delete() + + if force_new_index or not exists: + client.collections.create(index.schema()) + + def rebuild_indices( + self, batch_size: int = 500, extra_catchup_minutes: int = 5 + ) -> None: + """ + Reindex everything in Typesense + + The Typesense collections are dropped and recreated, + and data is reindexed from the MySQL database. + + Only MySQL-backed instances are supported. + Note that the `extra_catchup_minutes` argument is ignored. + """ + client = get_typesense_client() + self.initialize_indices(force_new_index=True) + for index in INDICES.values(): + paginator = Paginator(index.model.objects.all(), per_page=batch_size) + for page_number in paginator.page_range: + page = paginator.get_page(page_number) + documents = [ + index.build_document(obj.pk, obj.doc_to_hash()) + for obj in page.object_list + ] + if documents: + client.collections[index.name()].documents.import_( + documents, {"action": "upsert"} + ) + + def validate_indices(self) -> None: + """ + Check if the indices exist and are valid. + + Raise an exception if any do not exist or if any are not valid. + """ + client = get_typesense_client() + for index in INDICES.values(): + collection = client.collections[index.name()].retrieve() + # TODO: collection returns more information than the initial create schema, + # so we need a better comparison here; this is currently broken + if collection != index.schema(): + print(f"Expected schema: {index.schema()}") + print(f"Found schema: {collection}") + raise AssertionError( + f"Collection {index.name()} exists, but schema does not match expected." + ) + + def refresh_indices(self) -> None: + """ + Noop on Typesense, as all write API operations are synchronous. + + See https://typesense.org/docs/guide/migrating-from-algolia.html#synchronous-write-apis for more information. + """ + return None + + def delete_unused_indices(self) -> int: + """ + Noop on Typesense. + """ + return 0 + + +def quote_filter_value(value: str) -> str: + """ + Sanitize and safely quote a value for use in a Typesense filter. + + https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters + """ + return "`" + value.replace("`", "") + "`" + + +class TypesenseThreadSearchBackend(BaseThreadSearchBackend): + """ + Thread search backend implementation for Typesense. + """ + + def get_thread_ids( + self, + context: str, + # This argument is unsupported. Anyway, its only role was to boost some results, + # which did not have much effect because they are shuffled anyway downstream. + group_ids: list[int], + search_text: str, + # This parameter is unsupported, but as far as we know it's not used anywhere. + sort_criteria: Optional[list[dict[str, str]]] = None, + commentable_ids: Optional[list[str]] = None, + course_id: Optional[str] = None, + ) -> list[str]: + """ + Retrieve thread IDs based on search criteria. + """ + client = get_typesense_client() + thread_ids: set[str] = set() + + # All comments have "course" as their context, and none of them have a commentable_id. 
+ if context == "course" and not commentable_ids: + comment_results = client.collections[CommentsIndex.name()].documents.search( + CommentsIndex.build_search_parameters( + search_text=search_text, course_id=course_id + ) + ) + for hit in comment_results.get("hits", []): + thread_ids.add(hit["document"]["comment_thread_id"]) + + thread_results = client.collections[ + CommentThreadsIndex.name() + ].documents.search( + CommentThreadsIndex.build_search_parameters( + search_text=search_text, + course_id=course_id, + context=context, + commentable_ids=commentable_ids, + ) + ) + for hit in thread_results.get("hits", []): + thread_ids.add(hit["document"]["id"]) + + return list(thread_ids) + + def get_suggested_text(self, search_text: str) -> Optional[str]: + """ + Retrieve text suggestions for a given search query. + + :param search_text: Text to search for suggestions + :return: Suggested text or None + """ + # TODO: https://typesense.org/docs/guide/query-suggestions.html + # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text? + return None + + +class TypesenseBackend(BaseSearchBackend): + """ + Typesense-powered search backend. + """ + + DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend + INDEX_SEARCH_CLASS = TypesenseIndexBackend + THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend diff --git a/forum/settings/common.py b/forum/settings/common.py index dbecd9cd..772fc9e7 100644 --- a/forum/settings/common.py +++ b/forum/settings/common.py @@ -10,7 +10,13 @@ def plugin_settings(settings: Any) -> None: Common settings for forum app """ # Search backend - if getattr(settings, "MEILISEARCH_ENABLED", False): + if getattr(settings, "TYPESENSE_ENABLED", False): + settings.FORUM_SEARCH_BACKEND = getattr( + settings, + "FORUM_SEARCH_BACKEND", + "forum.search.typesense.TypesenseBackend", + ) + elif getattr(settings, "MEILISEARCH_ENABLED", False): settings.FORUM_SEARCH_BACKEND = getattr( settings, "FORUM_SEARCH_BACKEND", diff --git a/requirements/base.in b/requirements/base.in index 7fc1fced..311e9e10 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -11,4 +11,5 @@ requests pymongo elasticsearch edx-search # meilisearch backend +typesense mysqlclient diff --git a/requirements/base.txt b/requirements/base.txt index 449bdd19..689a1d76 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -141,6 +141,7 @@ requests==2.32.5 # via # -r requirements/base.in # meilisearch + # typesense six==1.17.0 # via # edx-ccx-keys @@ -157,6 +158,8 @@ stevedore==5.5.0 # edx-opaque-keys text-unidecode==1.3 # via python-slugify +typesense==1.1.1 + # via -r requirements/base.in typing-extensions==4.15.0 # via # beautifulsoup4 diff --git a/requirements/ci.txt b/requirements/ci.txt index 748a1284..66df4121 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -431,6 +431,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/quality.txt @@ -507,6 +508,8 @@ types-urllib3==1.26.25.14 # via # -r requirements/quality.txt # types-requests +typesense==1.1.1 + # via -r requirements/quality.txt typing-extensions==4.15.0 # via # -r requirements/quality.txt diff --git a/requirements/dev.txt b/requirements/dev.txt index 5e01b88a..14346e94 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -574,6 +574,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/ci.txt @@ -672,6 +673,10 @@ types-urllib3==1.26.25.14 # -r 
requirements/ci.txt # -r requirements/quality.txt # types-requests +typesense==1.1.1 + # via + # -r requirements/ci.txt + # -r requirements/quality.txt typing-extensions==4.15.0 # via # -r requirements/ci.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index e48ac9c0..71dbab6e 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -374,6 +374,7 @@ requests==2.32.5 # requests-toolbelt # sphinx # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/test.txt @@ -446,6 +447,8 @@ tox==4.28.4 # via -r requirements/test.txt twine==6.1.0 # via -r requirements/test.txt +typesense==1.1.1 + # via -r requirements/test.txt typing-extensions==4.15.0 # via # -r requirements/test.txt diff --git a/requirements/quality.txt b/requirements/quality.txt index fe0a2766..15ac1f9a 100644 --- a/requirements/quality.txt +++ b/requirements/quality.txt @@ -403,6 +403,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/test.txt @@ -470,6 +471,8 @@ types-requests==2.31.0.6 # djangorestframework-stubs types-urllib3==1.26.25.14 # via types-requests +typesense==1.1.1 + # via -r requirements/test.txt typing-extensions==4.15.0 # via # -r requirements/test.txt diff --git a/requirements/test.txt b/requirements/test.txt index 87b4932d..6fd4abe8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -305,6 +305,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via twine rfc3986==2.0.0 @@ -343,6 +344,8 @@ tox==4.28.4 # via -r requirements/test.in twine==6.1.0 # via -r requirements/test.in +typesense==1.1.1 + # via -r requirements/base.txt typing-extensions==4.15.0 # via # -r requirements/base.txt From c1a8d654ac4a2fe7b20b57696474593f04c4be6f Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Fri, 29 Aug 2025 09:08:24 +0930 Subject: [PATCH 2/5] fix: switch to TYPESENSE_URLS for cluster support --- forum/search/typesense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index eb3fc99b..556ff36b 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -33,7 +33,7 @@ def get_typesense_client() -> Client: _TYPESENSE_CLIENT = Client( { "api_key": settings.TYPESENSE_API_KEY, - "nodes": [settings.TYPESENSE_URL], + "nodes": settings.TYPESENSE_URLS, } ) return _TYPESENSE_CLIENT From 824e925943d1f01e4af3eba7c5627d4c447d737a Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Fri, 29 Aug 2025 09:09:04 +0930 Subject: [PATCH 3/5] docs: fix missing word Co-authored-by: Braden MacDonald --- forum/search/typesense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index 556ff36b..b7be715d 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -100,7 +100,7 @@ def build_search_parameters( class CommentThreadsIndex: """ - Common data and operations relating to the comments index. + Common data and operations relating to the comment threads index. """ model = CommentThread From d073dcefcb2b151186f2fd90cf49170686cd739e Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Mon, 1 Sep 2025 12:38:02 +0930 Subject: [PATCH 4/5] refactor!: use a single index for all documents Also finish implementing the validation method, and address lint errors. 
--- forum/search/typesense.py | 414 +++++++++++++++++++++----------------- 1 file changed, 230 insertions(+), 184 deletions(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index b7be715d..c0095d90 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -7,7 +7,9 @@ from bs4 import BeautifulSoup from django.conf import settings from django.core.paginator import Paginator -from typesense import Client + +# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"' +from typesense import Client # type: ignore from typesense.types.collection import CollectionCreateSchema from typesense.types.document import DocumentSchema, SearchParameters from typesense.exceptions import ObjectNotFound @@ -39,141 +41,187 @@ def get_typesense_client() -> Client: return _TYPESENSE_CLIENT -class CommentsIndex: - """ - Common data and operations relating to the comments index. +def quote_filter_value(value: str) -> str: """ + Sanitize and safely quote a value for use in a Typesense filter. - model = Comment - - @staticmethod - def name() -> str: - """ - Return the Typesense index name for the index. - """ - return settings.TYPESENSE_COLLECTION_PREFIX + "comments" - - @classmethod - def schema(cls) -> CollectionCreateSchema: - return { - "name": cls.name(), - "fields": [ - {"name": "course_id", "type": "string"}, - {"name": "comment_thread_id", "type": "string"}, - {"name": "body", "type": "string"}, - ], - } - - @staticmethod - def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: - """ - Build a Typesense document for this index. - """ - # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". - return { - "id": str(doc_id), - "course_id": str(data.get("course_id", "")), - "comment_thread_id": str(data.get("comment_thread_id", "")), - "body": ( - BeautifulSoup(data["body"], features="html.parser").get_text() - if data.get("body") - else "" - ), - } - - @staticmethod - def build_search_parameters( - *, search_text: str, course_id: str | None - ) -> SearchParameters: - """ - Build Typesense search parameters for this index. - """ - return { - "q": search_text, - "query_by": "body", - "filter_by": ( - f"course_id:={quote_filter_value(course_id)}" if course_id else "" - ), - "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, - } + https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters + """ + return "`" + value.replace("`", "") + "`" -class CommentThreadsIndex: +def collection_name() -> str: """ - Common data and operations relating to the comment threads index. + Generate the collection name to use in Typesense. """ + return settings.TYPESENSE_COLLECTION_PREFIX + "forum" - model = CommentThread - @staticmethod - def name() -> str: - """ - Return the Typesense index name for the index. - """ - return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads" - - @classmethod - def schema(cls) -> CollectionCreateSchema: - return { - "name": cls.name(), - "fields": [ - {"name": "course_id", "type": "string"}, - {"name": "commentable_id", "type": "string"}, - {"name": "context", "type": "string"}, - {"name": "title", "type": "string"}, - {"name": "body", "type": "string"}, - ], - } +def collection_schema() -> CollectionCreateSchema: + """ + The schema to use for creating the collection. 
+ """ + return { + "name": collection_name(), + # NOTE: there's always an implicit "id" field + "fields": [ + {"name": "thread_id", "type": "string"}, + {"name": "course_id", "type": "string"}, + {"name": "commentable_id", "type": "string"}, + {"name": "context", "type": "string"}, + {"name": "text", "type": "string"}, + ], + } + + +def expected_full_collection_schema() -> dict[str, Any]: + """ + What is expected to be the full collection schema. - @staticmethod - def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: - """ - Build a Typesense document for this index. - """ - return { - "id": str(doc_id), - "course_id": str(data.get("course_id", "")), - "commentable_id": str(data.get("commentable_id", "")), - "context": str(data.get("context", "")), - "title": str(data.get("title", "")), - "body": ( + Use this to validate the actual schema from the server. + """ + return { + "default_sorting_field": "", + "enable_nested_fields": False, + "fields": [ + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "thread_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "course_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "commentable_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "context", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "text", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + ], + "name": collection_name(), + "symbols_to_index": [], + "token_separators": [], + } + + +def document_from_thread(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document from a thread's data. + """ + return { + "id": f"thread-{doc_id}", + "thread_id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "commentable_id": str(data.get("commentable_id", "")), + "context": str(data.get("context", "")), + "text": "{}\n{}".format( + str(data.get("title", "")), + ( BeautifulSoup(data["body"], features="html.parser").get_text() if data.get("body") else "" ), - } + ), + } - @staticmethod - def build_search_parameters( - *, - search_text: str, - course_id: str | None, - context: str, - commentable_ids: list[str] | None, - ) -> SearchParameters: - """ - Build Typesense search parameters for this index. - """ - # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. 
- filters = [f"context:{quote_filter_value(context)}"] - if commentable_ids: - safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) - filters.append(f"commentable_ids:[{safe_ids}]") - if course_id: - filters.append(f"course_id:={quote_filter_value(course_id)}") - - return { - "q": search_text, - "query_by": "title,body", - "filter_by": " && ".join(filters), - "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, - } +def document_from_comment(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document from a comment's data. + """ + # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". + return { + "id": f"comment-{doc_id}", + "thread_id": str(data.get("comment_thread_id", "")), + "course_id": str(data.get("course_id", "")), + "commentable_id": "", + "context": str(data.get("context", "")), + "text": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + +def build_search_parameters( + *, + search_text: str, + course_id: str | None, + context: str, + commentable_ids: list[str] | None, +) -> SearchParameters: + """ + Build Typesense search parameters for searching the index. + """ + # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + filters = [f"context:{quote_filter_value(context)}"] + + if commentable_ids: + safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) + filters.append(f"commentable_ids:[{safe_ids}]") -INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = { - "comments": CommentsIndex, - "comment_threads": CommentThreadsIndex, -} + if course_id: + filters.append(f"course_id:={quote_filter_value(course_id)}") + + return { + "q": search_text, + "query_by": "text", + "filter_by": " && ".join(filters), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } class TypesenseDocumentBackend(BaseDocumentSearchBackend): @@ -188,9 +236,15 @@ def index_document( Index a document in Typesense. """ client = get_typesense_client() - index = INDICES[index_name] - typesense_document = index.build_document(doc_id, document) - client.collections[index.name()].documents.upsert(typesense_document) + + if index_name == "comments": + typesense_document = document_from_comment(doc_id, document) + elif index_name == "comment_threads": + typesense_document = document_from_thread(doc_id, document) + else: + raise NotImplementedError(f"unknown index name: {index_name}") + + client.collections[collection_name()].documents.upsert(typesense_document) def update_document( self, index_name: str, doc_id: str | int, update_data: dict[str, Any] @@ -205,8 +259,14 @@ def delete_document(self, index_name: str, doc_id: str | int) -> None: Delete a document from Typesense. """ client = get_typesense_client() - index = INDICES[index_name] - client.collections[index.name()].documents[str(doc_id)].delete( + if index_name == "comments": + typesense_doc_id = f"comment-{doc_id}" + elif index_name == "comment_threads": + typesense_doc_id = f"thread-{doc_id}" + else: + raise NotImplementedError(f"unknown index name: {index_name}") + + client.collections[collection_name()].documents[typesense_doc_id].delete( delete_parameters={"ignore_not_found": True}, ) @@ -225,18 +285,18 @@ def initialize_indices(self, force_new_index: bool = False) -> None: If force_new_index is True, the indexes will be dropped before being recreated. 
""" client = get_typesense_client() - for index in INDICES.values(): - exists: bool = True - try: - client.collections[index.name()].retrieve() - except ObjectNotFound: - exists = False + name = collection_name() + exists: bool = True + try: + client.collections[name].retrieve() + except ObjectNotFound: + exists = False - if force_new_index and exists: - client.collections[index.name()].delete() + if force_new_index and exists: + client.collections[name].delete() - if force_new_index or not exists: - client.collections.create(index.schema()) + if force_new_index or not exists: + client.collections.create(collection_schema()) def rebuild_indices( self, batch_size: int = 500, extra_catchup_minutes: int = 5 @@ -252,18 +312,28 @@ def rebuild_indices( """ client = get_typesense_client() self.initialize_indices(force_new_index=True) - for index in INDICES.values(): - paginator = Paginator(index.model.objects.all(), per_page=batch_size) + + for model, document_builder in [ + (CommentThread, document_from_thread), + (Comment, document_from_comment), + ]: + paginator = Paginator( + model.objects.order_by("pk").all(), per_page=batch_size + ) for page_number in paginator.page_range: page = paginator.get_page(page_number) documents = [ - index.build_document(obj.pk, obj.doc_to_hash()) + document_builder(obj.pk, obj.doc_to_hash()) for obj in page.object_list ] if documents: - client.collections[index.name()].documents.import_( + response = client.collections[collection_name()].documents.import_( documents, {"action": "upsert"} ) + if not all(result["success"] for result in response): + raise ValueError( + f"Errors while importing documents to Typesense collection: {response}" + ) def validate_indices(self) -> None: """ @@ -272,16 +342,15 @@ def validate_indices(self) -> None: Raise an exception if any do not exist or if any are not valid. """ client = get_typesense_client() - for index in INDICES.values(): - collection = client.collections[index.name()].retrieve() - # TODO: collection returns more information than the initial create schema, - # so we need a better comparison here; this is currently broken - if collection != index.schema(): - print(f"Expected schema: {index.schema()}") - print(f"Found schema: {collection}") - raise AssertionError( - f"Collection {index.name()} exists, but schema does not match expected." - ) + collection = client.collections[collection_name()].retrieve() + collection.pop("created_at") # type: ignore + collection.pop("num_documents") # type: ignore + if collection != expected_full_collection_schema(): + print(f"Expected schema: {expected_full_collection_schema()}") + print(f"Found schema: {collection}") + raise AssertionError( + f"Collection {collection_name()} exists, but schema does not match expected." + ) def refresh_indices(self) -> None: """ @@ -293,20 +362,11 @@ def refresh_indices(self) -> None: def delete_unused_indices(self) -> int: """ - Noop on Typesense. + Noop for this implementation. """ return 0 -def quote_filter_value(value: str) -> str: - """ - Sanitize and safely quote a value for use in a Typesense filter. - - https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters - """ - return "`" + value.replace("`", "") + "`" - - class TypesenseThreadSearchBackend(BaseThreadSearchBackend): """ Thread search backend implementation for Typesense. @@ -328,31 +388,18 @@ def get_thread_ids( Retrieve thread IDs based on search criteria. 
""" client = get_typesense_client() - thread_ids: set[str] = set() - - # All comments have "course" as their context, and none of them have a commentable_id. - if context == "course" and not commentable_ids: - comment_results = client.collections[CommentsIndex.name()].documents.search( - CommentsIndex.build_search_parameters( - search_text=search_text, course_id=course_id - ) - ) - for hit in comment_results.get("hits", []): - thread_ids.add(hit["document"]["comment_thread_id"]) - - thread_results = client.collections[ - CommentThreadsIndex.name() - ].documents.search( - CommentThreadsIndex.build_search_parameters( - search_text=search_text, - course_id=course_id, - context=context, - commentable_ids=commentable_ids, - ) + + params = build_search_parameters( + search_text=search_text, + course_id=course_id, + context=context, + commentable_ids=commentable_ids, ) - for hit in thread_results.get("hits", []): - thread_ids.add(hit["document"]["id"]) + results = client.collections[collection_name()].documents.search(params) + thread_ids: set[str] = { + hit["document"]["thread_id"] for hit in results.get("hits", []) # type: ignore + } return list(thread_ids) def get_suggested_text(self, search_text: str) -> Optional[str]: @@ -362,8 +409,7 @@ def get_suggested_text(self, search_text: str) -> Optional[str]: :param search_text: Text to search for suggestions :return: Suggested text or None """ - # TODO: https://typesense.org/docs/guide/query-suggestions.html - # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text? + # Not implemented, so no suggestions. return None From 98a92fb9a92cf260a5e19ac4f3936aee151862af Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Thu, 4 Sep 2025 11:39:00 +0930 Subject: [PATCH 5/5] fix: improve validation and type checking in response to review --- forum/search/typesense.py | 142 +++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index c0095d90..d191c7b1 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -2,14 +2,13 @@ Typesense backend for searching comments and threads. """ -from typing import Any, Optional +from typing import Any, Optional, cast from bs4 import BeautifulSoup from django.conf import settings from django.core.paginator import Paginator -# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"' -from typesense import Client # type: ignore +from typesense.client import Client from typesense.types.collection import CollectionCreateSchema from typesense.types.document import DocumentSchema, SearchParameters from typesense.exceptions import ObjectNotFound @@ -79,75 +78,44 @@ def expected_full_collection_schema() -> dict[str, Any]: What is expected to be the full collection schema. Use this to validate the actual schema from the server. + Note that Typesense may add new keys to the schema; + this is ok, and validation should still pass. 
""" + field_defaults = { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + } return { "default_sorting_field": "", "enable_nested_fields": False, "fields": [ { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "thread_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "course_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "commentable_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "context", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "text", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, ], "name": collection_name(), @@ -206,7 +174,9 @@ def build_search_parameters( """ Build Typesense search parameters for searching the index. """ - # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + # `context` is always a single word, + # so we can gain performance without losing accuracy by using the faster `:` (non-exact) operator. + # See https://typesense.org/docs/29.0/api/search.html#filter-parameters for more information. filters = [f"context:{quote_filter_value(context)}"] if commentable_ids: @@ -340,18 +310,66 @@ def validate_indices(self) -> None: Check if the indices exist and are valid. Raise an exception if any do not exist or if any are not valid. + Note that the validation is lengthy, + because Typesense may add new keys to the schema. + This is fine - we only want to assert that keys we know about are set as expected. + There are also some fields in the retrieved schema we don't care about - eg. 'created_at' """ client = get_typesense_client() - collection = client.collections[collection_name()].retrieve() - collection.pop("created_at") # type: ignore - collection.pop("num_documents") # type: ignore - if collection != expected_full_collection_schema(): - print(f"Expected schema: {expected_full_collection_schema()}") - print(f"Found schema: {collection}") - raise AssertionError( - f"Collection {collection_name()} exists, but schema does not match expected." + # cast to a wider type, because we want to use it in a more flexible way than TypedDict normally allows. 
+ actual_schema = cast( + dict[str, Any], client.collections[collection_name()].retrieve() + ) + expected_schema = expected_full_collection_schema() + errors: list[str] = [] + + expected_field_names = set( + map(lambda field: field["name"], expected_schema["fields"]) + ) + actual_field_names = set( + map(lambda field: field["name"], actual_schema["fields"]) + ) + + if missing_fields := expected_field_names - actual_field_names: + errors.append( + f"ERROR: '{collection_name()}' collection schema 'fields' has missing field(s): {missing_fields}." ) + if extra_fields := actual_field_names - expected_field_names: + errors.append( + f"ERROR: '{collection_name()}' collection schema 'fields' " + f"has unexpected extra field(s): {extra_fields}." + ) + + if actual_field_names == expected_field_names: + for expected_field, actual_field in zip( + sorted(expected_schema["fields"], key=lambda field: field["name"]), + sorted(actual_schema["fields"], key=lambda field: field["name"]), + ): + for key, expected_value in expected_field.items(): + if expected_value != actual_field[key]: + errors.append( + f"ERROR: in collection '{collection_name()}' fields, field '{expected_field['name']}', " + f"key '{key}' failed to validate. " + f"Expected: '{expected_value}', actual '{actual_field[key]}'." + ) + + for key, expected_value in expected_schema.items(): + if key == "fields": + # we've already validated fields separately above + continue + + if expected_value != actual_schema[key]: + errors.append( + f"ERROR: in collection '{collection_name()}', key '{key}' failed to validate. " + f"Expected: '{expected_value}', actual '{actual_schema[key]}'." + ) + + if errors: + for error in errors: + print(error) + raise AssertionError("\n".join(errors)) + def refresh_indices(self) -> None: """ Noop on Typesense, as all write API operations are synchronous.
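
A minimal usage sketch (not part of the patches above): it shows how a deployment might enable this backend and run a thread search once the patch series is applied. The setting names (TYPESENSE_ENABLED, TYPESENSE_API_KEY, TYPESENSE_URLS, TYPESENSE_COLLECTION_PREFIX) come from the patches; the placeholder values, the node-dict shape (host/port/protocol, as the typesense-python client expects), and the direct no-argument instantiation of the backend classes are assumptions — the forum app normally loads whatever class FORUM_SEARCH_BACKEND points at through its own plumbing, and rebuild_indices requires a MySQL-backed instance and a reachable Typesense node.

    # In the Django settings module (values below are illustrative placeholders):
    TYPESENSE_ENABLED = True
    TYPESENSE_API_KEY = "xyz"
    TYPESENSE_URLS = [{"host": "typesense", "port": "8108", "protocol": "http"}]
    TYPESENSE_COLLECTION_PREFIX = "dev_"   # collection will be named "dev_forum"

    # Elsewhere, with Django configured and forum data in MySQL:
    from forum.search.typesense import (
        TypesenseIndexBackend,
        TypesenseThreadSearchBackend,
    )

    # Drop and recreate the collection, then bulk-import threads and comments.
    TypesenseIndexBackend().rebuild_indices(batch_size=500)

    # Search both thread and comment text; returns the matching thread IDs.
    thread_ids = TypesenseThreadSearchBackend().get_thread_ids(
        context="course",
        group_ids=[],  # ignored by this backend (see the docstring note in the patch)
        search_text="dark matter",
        course_id="course-v1:edX+DemoX+Demo_Course",
    )
    print(thread_ids)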