From 9d08ff5f1fbc0aa460dd03ab9f250c88067d4b64 Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Tue, 26 Aug 2025 16:56:18 +0930 Subject: [PATCH 1/5] feat: add Typesense backend for search Also run `make upgrade` to be able to successfully compile dependencies. This upgrade may need to be refactored out of this commit. Private-ref: https://tasks.opencraft.com/browse/BB-9975 --- forum/search/typesense.py | 377 ++++++++++++++++++++++++++++++++++++++ forum/settings/common.py | 8 +- requirements/base.in | 1 + requirements/base.txt | 3 + requirements/ci.txt | 3 + requirements/dev.txt | 5 + requirements/doc.txt | 3 + requirements/quality.txt | 3 + requirements/test.txt | 3 + 9 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 forum/search/typesense.py diff --git a/forum/search/typesense.py b/forum/search/typesense.py new file mode 100644 index 00000000..eb3fc99b --- /dev/null +++ b/forum/search/typesense.py @@ -0,0 +1,377 @@ +""" +Typesense backend for searching comments and threads. +""" + +from typing import Any, Optional + +from bs4 import BeautifulSoup +from django.conf import settings +from django.core.paginator import Paginator +from typesense import Client +from typesense.types.collection import CollectionCreateSchema +from typesense.types.document import DocumentSchema, SearchParameters +from typesense.exceptions import ObjectNotFound + +from forum.backends.mysql.models import Comment, CommentThread +from forum.constants import FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT +from forum.search.base import ( + BaseDocumentSearchBackend, + BaseIndexSearchBackend, + BaseSearchBackend, + BaseThreadSearchBackend, +) + +_TYPESENSE_CLIENT: Client | None = None + + +def get_typesense_client() -> Client: + """ + Return a singleton Typesense client instance. + """ + global _TYPESENSE_CLIENT + if _TYPESENSE_CLIENT is None: + _TYPESENSE_CLIENT = Client( + { + "api_key": settings.TYPESENSE_API_KEY, + "nodes": [settings.TYPESENSE_URL], + } + ) + return _TYPESENSE_CLIENT + + +class CommentsIndex: + """ + Common data and operations relating to the comments index. + """ + + model = Comment + + @staticmethod + def name() -> str: + """ + Return the Typesense index name for the index. + """ + return settings.TYPESENSE_COLLECTION_PREFIX + "comments" + + @classmethod + def schema(cls) -> CollectionCreateSchema: + return { + "name": cls.name(), + "fields": [ + {"name": "course_id", "type": "string"}, + {"name": "comment_thread_id", "type": "string"}, + {"name": "body", "type": "string"}, + ], + } + + @staticmethod + def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document for this index. + """ + # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". + return { + "id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "comment_thread_id": str(data.get("comment_thread_id", "")), + "body": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + @staticmethod + def build_search_parameters( + *, search_text: str, course_id: str | None + ) -> SearchParameters: + """ + Build Typesense search parameters for this index. + """ + return { + "q": search_text, + "query_by": "body", + "filter_by": ( + f"course_id:={quote_filter_value(course_id)}" if course_id else "" + ), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } + + +class CommentThreadsIndex: + """ + Common data and operations relating to the comments index. 
+ """ + + model = CommentThread + + @staticmethod + def name() -> str: + """ + Return the Typesense index name for the index. + """ + return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads" + + @classmethod + def schema(cls) -> CollectionCreateSchema: + return { + "name": cls.name(), + "fields": [ + {"name": "course_id", "type": "string"}, + {"name": "commentable_id", "type": "string"}, + {"name": "context", "type": "string"}, + {"name": "title", "type": "string"}, + {"name": "body", "type": "string"}, + ], + } + + @staticmethod + def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document for this index. + """ + return { + "id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "commentable_id": str(data.get("commentable_id", "")), + "context": str(data.get("context", "")), + "title": str(data.get("title", "")), + "body": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + @staticmethod + def build_search_parameters( + *, + search_text: str, + course_id: str | None, + context: str, + commentable_ids: list[str] | None, + ) -> SearchParameters: + """ + Build Typesense search parameters for this index. + """ + # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + filters = [f"context:{quote_filter_value(context)}"] + if commentable_ids: + safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) + filters.append(f"commentable_ids:[{safe_ids}]") + if course_id: + filters.append(f"course_id:={quote_filter_value(course_id)}") + + return { + "q": search_text, + "query_by": "title,body", + "filter_by": " && ".join(filters), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } + + +INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = { + "comments": CommentsIndex, + "comment_threads": CommentThreadsIndex, +} + + +class TypesenseDocumentBackend(BaseDocumentSearchBackend): + """ + Document backend implementation for Typesense. + """ + + def index_document( + self, index_name: str, doc_id: str | int, document: dict[str, Any] + ) -> None: + """ + Index a document in Typesense. + """ + client = get_typesense_client() + index = INDICES[index_name] + typesense_document = index.build_document(doc_id, document) + client.collections[index.name()].documents.upsert(typesense_document) + + def update_document( + self, index_name: str, doc_id: str | int, update_data: dict[str, Any] + ) -> None: + """ + Same operation as index_document, because upsert is used. + """ + return self.index_document(index_name, doc_id, update_data) + + def delete_document(self, index_name: str, doc_id: str | int) -> None: + """ + Delete a document from Typesense. + """ + client = get_typesense_client() + index = INDICES[index_name] + client.collections[index.name()].documents[str(doc_id)].delete( + delete_parameters={"ignore_not_found": True}, + ) + + +class TypesenseIndexBackend(BaseIndexSearchBackend): + """ + Manage indexes for the Typesense backend. + + Typesense calls these "collections". https://typesense.org/docs/29.0/api/collections.html + """ + + def initialize_indices(self, force_new_index: bool = False) -> None: + """ + Initialize the indices in Typesense. + + If force_new_index is True, the indexes will be dropped before being recreated. 
+ """ + client = get_typesense_client() + for index in INDICES.values(): + exists: bool = True + try: + client.collections[index.name()].retrieve() + except ObjectNotFound: + exists = False + + if force_new_index and exists: + client.collections[index.name()].delete() + + if force_new_index or not exists: + client.collections.create(index.schema()) + + def rebuild_indices( + self, batch_size: int = 500, extra_catchup_minutes: int = 5 + ) -> None: + """ + Reindex everything in Typesense + + The Typesense collections are dropped and recreated, + and data is reindexed from the MySQL database. + + Only MySQL-backed instances are supported. + Note that the `extra_catchup_minutes` argument is ignored. + """ + client = get_typesense_client() + self.initialize_indices(force_new_index=True) + for index in INDICES.values(): + paginator = Paginator(index.model.objects.all(), per_page=batch_size) + for page_number in paginator.page_range: + page = paginator.get_page(page_number) + documents = [ + index.build_document(obj.pk, obj.doc_to_hash()) + for obj in page.object_list + ] + if documents: + client.collections[index.name()].documents.import_( + documents, {"action": "upsert"} + ) + + def validate_indices(self) -> None: + """ + Check if the indices exist and are valid. + + Raise an exception if any do not exist or if any are not valid. + """ + client = get_typesense_client() + for index in INDICES.values(): + collection = client.collections[index.name()].retrieve() + # TODO: collection returns more information than the initial create schema, + # so we need a better comparison here; this is currently broken + if collection != index.schema(): + print(f"Expected schema: {index.schema()}") + print(f"Found schema: {collection}") + raise AssertionError( + f"Collection {index.name()} exists, but schema does not match expected." + ) + + def refresh_indices(self) -> None: + """ + Noop on Typesense, as all write API operations are synchronous. + + See https://typesense.org/docs/guide/migrating-from-algolia.html#synchronous-write-apis for more information. + """ + return None + + def delete_unused_indices(self) -> int: + """ + Noop on Typesense. + """ + return 0 + + +def quote_filter_value(value: str) -> str: + """ + Sanitize and safely quote a value for use in a Typesense filter. + + https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters + """ + return "`" + value.replace("`", "") + "`" + + +class TypesenseThreadSearchBackend(BaseThreadSearchBackend): + """ + Thread search backend implementation for Typesense. + """ + + def get_thread_ids( + self, + context: str, + # This argument is unsupported. Anyway, its only role was to boost some results, + # which did not have much effect because they are shuffled anyway downstream. + group_ids: list[int], + search_text: str, + # This parameter is unsupported, but as far as we know it's not used anywhere. + sort_criteria: Optional[list[dict[str, str]]] = None, + commentable_ids: Optional[list[str]] = None, + course_id: Optional[str] = None, + ) -> list[str]: + """ + Retrieve thread IDs based on search criteria. + """ + client = get_typesense_client() + thread_ids: set[str] = set() + + # All comments have "course" as their context, and none of them have a commentable_id. 
+ if context == "course" and not commentable_ids: + comment_results = client.collections[CommentsIndex.name()].documents.search( + CommentsIndex.build_search_parameters( + search_text=search_text, course_id=course_id + ) + ) + for hit in comment_results.get("hits", []): + thread_ids.add(hit["document"]["comment_thread_id"]) + + thread_results = client.collections[ + CommentThreadsIndex.name() + ].documents.search( + CommentThreadsIndex.build_search_parameters( + search_text=search_text, + course_id=course_id, + context=context, + commentable_ids=commentable_ids, + ) + ) + for hit in thread_results.get("hits", []): + thread_ids.add(hit["document"]["id"]) + + return list(thread_ids) + + def get_suggested_text(self, search_text: str) -> Optional[str]: + """ + Retrieve text suggestions for a given search query. + + :param search_text: Text to search for suggestions + :return: Suggested text or None + """ + # TODO: https://typesense.org/docs/guide/query-suggestions.html + # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text? + return None + + +class TypesenseBackend(BaseSearchBackend): + """ + Typesense-powered search backend. + """ + + DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend + INDEX_SEARCH_CLASS = TypesenseIndexBackend + THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend diff --git a/forum/settings/common.py b/forum/settings/common.py index dbecd9cd..772fc9e7 100644 --- a/forum/settings/common.py +++ b/forum/settings/common.py @@ -10,7 +10,13 @@ def plugin_settings(settings: Any) -> None: Common settings for forum app """ # Search backend - if getattr(settings, "MEILISEARCH_ENABLED", False): + if getattr(settings, "TYPESENSE_ENABLED", False): + settings.FORUM_SEARCH_BACKEND = getattr( + settings, + "FORUM_SEARCH_BACKEND", + "forum.search.typesense.TypesenseBackend", + ) + elif getattr(settings, "MEILISEARCH_ENABLED", False): settings.FORUM_SEARCH_BACKEND = getattr( settings, "FORUM_SEARCH_BACKEND", diff --git a/requirements/base.in b/requirements/base.in index 7fc1fced..311e9e10 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -11,4 +11,5 @@ requests pymongo elasticsearch edx-search # meilisearch backend +typesense mysqlclient diff --git a/requirements/base.txt b/requirements/base.txt index 449bdd19..689a1d76 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -141,6 +141,7 @@ requests==2.32.5 # via # -r requirements/base.in # meilisearch + # typesense six==1.17.0 # via # edx-ccx-keys @@ -157,6 +158,8 @@ stevedore==5.5.0 # edx-opaque-keys text-unidecode==1.3 # via python-slugify +typesense==1.1.1 + # via -r requirements/base.in typing-extensions==4.15.0 # via # beautifulsoup4 diff --git a/requirements/ci.txt b/requirements/ci.txt index 748a1284..66df4121 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -431,6 +431,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/quality.txt @@ -507,6 +508,8 @@ types-urllib3==1.26.25.14 # via # -r requirements/quality.txt # types-requests +typesense==1.1.1 + # via -r requirements/quality.txt typing-extensions==4.15.0 # via # -r requirements/quality.txt diff --git a/requirements/dev.txt b/requirements/dev.txt index 5e01b88a..14346e94 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -574,6 +574,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/ci.txt @@ -672,6 +673,10 @@ types-urllib3==1.26.25.14 # -r 
requirements/ci.txt # -r requirements/quality.txt # types-requests +typesense==1.1.1 + # via + # -r requirements/ci.txt + # -r requirements/quality.txt typing-extensions==4.15.0 # via # -r requirements/ci.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index e48ac9c0..71dbab6e 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -374,6 +374,7 @@ requests==2.32.5 # requests-toolbelt # sphinx # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/test.txt @@ -446,6 +447,8 @@ tox==4.28.4 # via -r requirements/test.txt twine==6.1.0 # via -r requirements/test.txt +typesense==1.1.1 + # via -r requirements/test.txt typing-extensions==4.15.0 # via # -r requirements/test.txt diff --git a/requirements/quality.txt b/requirements/quality.txt index fe0a2766..15ac1f9a 100644 --- a/requirements/quality.txt +++ b/requirements/quality.txt @@ -403,6 +403,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via # -r requirements/test.txt @@ -470,6 +471,8 @@ types-requests==2.31.0.6 # djangorestframework-stubs types-urllib3==1.26.25.14 # via types-requests +typesense==1.1.1 + # via -r requirements/test.txt typing-extensions==4.15.0 # via # -r requirements/test.txt diff --git a/requirements/test.txt b/requirements/test.txt index 87b4932d..6fd4abe8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -305,6 +305,7 @@ requests==2.32.5 # meilisearch # requests-toolbelt # twine + # typesense requests-toolbelt==1.0.0 # via twine rfc3986==2.0.0 @@ -343,6 +344,8 @@ tox==4.28.4 # via -r requirements/test.in twine==6.1.0 # via -r requirements/test.in +typesense==1.1.1 + # via -r requirements/base.txt typing-extensions==4.15.0 # via # -r requirements/base.txt From c1a8d654ac4a2fe7b20b57696474593f04c4be6f Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Fri, 29 Aug 2025 09:08:24 +0930 Subject: [PATCH 2/5] fix: switch to TYPESENSE_URLS for cluster support --- forum/search/typesense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index eb3fc99b..556ff36b 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -33,7 +33,7 @@ def get_typesense_client() -> Client: _TYPESENSE_CLIENT = Client( { "api_key": settings.TYPESENSE_API_KEY, - "nodes": [settings.TYPESENSE_URL], + "nodes": settings.TYPESENSE_URLS, } ) return _TYPESENSE_CLIENT From 824e925943d1f01e4af3eba7c5627d4c447d737a Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Fri, 29 Aug 2025 09:09:04 +0930 Subject: [PATCH 3/5] docs: fix missing word Co-authored-by: Braden MacDonald --- forum/search/typesense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index 556ff36b..b7be715d 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -100,7 +100,7 @@ def build_search_parameters( class CommentThreadsIndex: """ - Common data and operations relating to the comments index. + Common data and operations relating to the comment threads index. """ model = CommentThread From d073dcefcb2b151186f2fd90cf49170686cd739e Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Mon, 1 Sep 2025 12:38:02 +0930 Subject: [PATCH 4/5] refactor!: use a single index for all documents Also finish implementing the validation method, and address lint errors. 
--- forum/search/typesense.py | 414 +++++++++++++++++++++----------------- 1 file changed, 230 insertions(+), 184 deletions(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index b7be715d..c0095d90 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -7,7 +7,9 @@ from bs4 import BeautifulSoup from django.conf import settings from django.core.paginator import Paginator -from typesense import Client + +# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"' +from typesense import Client # type: ignore from typesense.types.collection import CollectionCreateSchema from typesense.types.document import DocumentSchema, SearchParameters from typesense.exceptions import ObjectNotFound @@ -39,141 +41,187 @@ def get_typesense_client() -> Client: return _TYPESENSE_CLIENT -class CommentsIndex: - """ - Common data and operations relating to the comments index. +def quote_filter_value(value: str) -> str: """ + Sanitize and safely quote a value for use in a Typesense filter. - model = Comment - - @staticmethod - def name() -> str: - """ - Return the Typesense index name for the index. - """ - return settings.TYPESENSE_COLLECTION_PREFIX + "comments" - - @classmethod - def schema(cls) -> CollectionCreateSchema: - return { - "name": cls.name(), - "fields": [ - {"name": "course_id", "type": "string"}, - {"name": "comment_thread_id", "type": "string"}, - {"name": "body", "type": "string"}, - ], - } - - @staticmethod - def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: - """ - Build a Typesense document for this index. - """ - # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". - return { - "id": str(doc_id), - "course_id": str(data.get("course_id", "")), - "comment_thread_id": str(data.get("comment_thread_id", "")), - "body": ( - BeautifulSoup(data["body"], features="html.parser").get_text() - if data.get("body") - else "" - ), - } - - @staticmethod - def build_search_parameters( - *, search_text: str, course_id: str | None - ) -> SearchParameters: - """ - Build Typesense search parameters for this index. - """ - return { - "q": search_text, - "query_by": "body", - "filter_by": ( - f"course_id:={quote_filter_value(course_id)}" if course_id else "" - ), - "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, - } + https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters + """ + return "`" + value.replace("`", "") + "`" -class CommentThreadsIndex: +def collection_name() -> str: """ - Common data and operations relating to the comment threads index. + Generate the collection name to use in Typesense. """ + return settings.TYPESENSE_COLLECTION_PREFIX + "forum" - model = CommentThread - @staticmethod - def name() -> str: - """ - Return the Typesense index name for the index. - """ - return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads" - - @classmethod - def schema(cls) -> CollectionCreateSchema: - return { - "name": cls.name(), - "fields": [ - {"name": "course_id", "type": "string"}, - {"name": "commentable_id", "type": "string"}, - {"name": "context", "type": "string"}, - {"name": "title", "type": "string"}, - {"name": "body", "type": "string"}, - ], - } +def collection_schema() -> CollectionCreateSchema: + """ + The schema to use for creating the collection. 
+ """ + return { + "name": collection_name(), + # NOTE: there's always an implicit "id" field + "fields": [ + {"name": "thread_id", "type": "string"}, + {"name": "course_id", "type": "string"}, + {"name": "commentable_id", "type": "string"}, + {"name": "context", "type": "string"}, + {"name": "text", "type": "string"}, + ], + } + + +def expected_full_collection_schema() -> dict[str, Any]: + """ + What is expected to be the full collection schema. - @staticmethod - def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: - """ - Build a Typesense document for this index. - """ - return { - "id": str(doc_id), - "course_id": str(data.get("course_id", "")), - "commentable_id": str(data.get("commentable_id", "")), - "context": str(data.get("context", "")), - "title": str(data.get("title", "")), - "body": ( + Use this to validate the actual schema from the server. + """ + return { + "default_sorting_field": "", + "enable_nested_fields": False, + "fields": [ + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "thread_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "course_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "commentable_id", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "context", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "name": "text", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + }, + ], + "name": collection_name(), + "symbols_to_index": [], + "token_separators": [], + } + + +def document_from_thread(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document from a thread's data. + """ + return { + "id": f"thread-{doc_id}", + "thread_id": str(doc_id), + "course_id": str(data.get("course_id", "")), + "commentable_id": str(data.get("commentable_id", "")), + "context": str(data.get("context", "")), + "text": "{}\n{}".format( + str(data.get("title", "")), + ( BeautifulSoup(data["body"], features="html.parser").get_text() if data.get("body") else "" ), - } + ), + } - @staticmethod - def build_search_parameters( - *, - search_text: str, - course_id: str | None, - context: str, - commentable_ids: list[str] | None, - ) -> SearchParameters: - """ - Build Typesense search parameters for this index. - """ - # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. 
- filters = [f"context:{quote_filter_value(context)}"] - if commentable_ids: - safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) - filters.append(f"commentable_ids:[{safe_ids}]") - if course_id: - filters.append(f"course_id:={quote_filter_value(course_id)}") - - return { - "q": search_text, - "query_by": "title,body", - "filter_by": " && ".join(filters), - "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, - } +def document_from_comment(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema: + """ + Build a Typesense document from a comment's data. + """ + # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course". + return { + "id": f"comment-{doc_id}", + "thread_id": str(data.get("comment_thread_id", "")), + "course_id": str(data.get("course_id", "")), + "commentable_id": "", + "context": str(data.get("context", "")), + "text": ( + BeautifulSoup(data["body"], features="html.parser").get_text() + if data.get("body") + else "" + ), + } + + +def build_search_parameters( + *, + search_text: str, + course_id: str | None, + context: str, + commentable_ids: list[str] | None, +) -> SearchParameters: + """ + Build Typesense search parameters for searching the index. + """ + # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + filters = [f"context:{quote_filter_value(context)}"] + + if commentable_ids: + safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids) + filters.append(f"commentable_ids:[{safe_ids}]") -INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = { - "comments": CommentsIndex, - "comment_threads": CommentThreadsIndex, -} + if course_id: + filters.append(f"course_id:={quote_filter_value(course_id)}") + + return { + "q": search_text, + "query_by": "text", + "filter_by": " && ".join(filters), + "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, + } class TypesenseDocumentBackend(BaseDocumentSearchBackend): @@ -188,9 +236,15 @@ def index_document( Index a document in Typesense. """ client = get_typesense_client() - index = INDICES[index_name] - typesense_document = index.build_document(doc_id, document) - client.collections[index.name()].documents.upsert(typesense_document) + + if index_name == "comments": + typesense_document = document_from_comment(doc_id, document) + elif index_name == "comment_threads": + typesense_document = document_from_thread(doc_id, document) + else: + raise NotImplementedError(f"unknown index name: {index_name}") + + client.collections[collection_name()].documents.upsert(typesense_document) def update_document( self, index_name: str, doc_id: str | int, update_data: dict[str, Any] @@ -205,8 +259,14 @@ def delete_document(self, index_name: str, doc_id: str | int) -> None: Delete a document from Typesense. """ client = get_typesense_client() - index = INDICES[index_name] - client.collections[index.name()].documents[str(doc_id)].delete( + if index_name == "comments": + typesense_doc_id = f"comment-{doc_id}" + elif index_name == "comment_threads": + typesense_doc_id = f"thread-{doc_id}" + else: + raise NotImplementedError(f"unknown index name: {index_name}") + + client.collections[collection_name()].documents[typesense_doc_id].delete( delete_parameters={"ignore_not_found": True}, ) @@ -225,18 +285,18 @@ def initialize_indices(self, force_new_index: bool = False) -> None: If force_new_index is True, the indexes will be dropped before being recreated. 
""" client = get_typesense_client() - for index in INDICES.values(): - exists: bool = True - try: - client.collections[index.name()].retrieve() - except ObjectNotFound: - exists = False + name = collection_name() + exists: bool = True + try: + client.collections[name].retrieve() + except ObjectNotFound: + exists = False - if force_new_index and exists: - client.collections[index.name()].delete() + if force_new_index and exists: + client.collections[name].delete() - if force_new_index or not exists: - client.collections.create(index.schema()) + if force_new_index or not exists: + client.collections.create(collection_schema()) def rebuild_indices( self, batch_size: int = 500, extra_catchup_minutes: int = 5 @@ -252,18 +312,28 @@ def rebuild_indices( """ client = get_typesense_client() self.initialize_indices(force_new_index=True) - for index in INDICES.values(): - paginator = Paginator(index.model.objects.all(), per_page=batch_size) + + for model, document_builder in [ + (CommentThread, document_from_thread), + (Comment, document_from_comment), + ]: + paginator = Paginator( + model.objects.order_by("pk").all(), per_page=batch_size + ) for page_number in paginator.page_range: page = paginator.get_page(page_number) documents = [ - index.build_document(obj.pk, obj.doc_to_hash()) + document_builder(obj.pk, obj.doc_to_hash()) for obj in page.object_list ] if documents: - client.collections[index.name()].documents.import_( + response = client.collections[collection_name()].documents.import_( documents, {"action": "upsert"} ) + if not all(result["success"] for result in response): + raise ValueError( + f"Errors while importing documents to Typesense collection: {response}" + ) def validate_indices(self) -> None: """ @@ -272,16 +342,15 @@ def validate_indices(self) -> None: Raise an exception if any do not exist or if any are not valid. """ client = get_typesense_client() - for index in INDICES.values(): - collection = client.collections[index.name()].retrieve() - # TODO: collection returns more information than the initial create schema, - # so we need a better comparison here; this is currently broken - if collection != index.schema(): - print(f"Expected schema: {index.schema()}") - print(f"Found schema: {collection}") - raise AssertionError( - f"Collection {index.name()} exists, but schema does not match expected." - ) + collection = client.collections[collection_name()].retrieve() + collection.pop("created_at") # type: ignore + collection.pop("num_documents") # type: ignore + if collection != expected_full_collection_schema(): + print(f"Expected schema: {expected_full_collection_schema()}") + print(f"Found schema: {collection}") + raise AssertionError( + f"Collection {collection_name()} exists, but schema does not match expected." + ) def refresh_indices(self) -> None: """ @@ -293,20 +362,11 @@ def refresh_indices(self) -> None: def delete_unused_indices(self) -> int: """ - Noop on Typesense. + Noop for this implementation. """ return 0 -def quote_filter_value(value: str) -> str: - """ - Sanitize and safely quote a value for use in a Typesense filter. - - https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters - """ - return "`" + value.replace("`", "") + "`" - - class TypesenseThreadSearchBackend(BaseThreadSearchBackend): """ Thread search backend implementation for Typesense. @@ -328,31 +388,18 @@ def get_thread_ids( Retrieve thread IDs based on search criteria. 
""" client = get_typesense_client() - thread_ids: set[str] = set() - - # All comments have "course" as their context, and none of them have a commentable_id. - if context == "course" and not commentable_ids: - comment_results = client.collections[CommentsIndex.name()].documents.search( - CommentsIndex.build_search_parameters( - search_text=search_text, course_id=course_id - ) - ) - for hit in comment_results.get("hits", []): - thread_ids.add(hit["document"]["comment_thread_id"]) - - thread_results = client.collections[ - CommentThreadsIndex.name() - ].documents.search( - CommentThreadsIndex.build_search_parameters( - search_text=search_text, - course_id=course_id, - context=context, - commentable_ids=commentable_ids, - ) + + params = build_search_parameters( + search_text=search_text, + course_id=course_id, + context=context, + commentable_ids=commentable_ids, ) - for hit in thread_results.get("hits", []): - thread_ids.add(hit["document"]["id"]) + results = client.collections[collection_name()].documents.search(params) + thread_ids: set[str] = { + hit["document"]["thread_id"] for hit in results.get("hits", []) # type: ignore + } return list(thread_ids) def get_suggested_text(self, search_text: str) -> Optional[str]: @@ -362,8 +409,7 @@ def get_suggested_text(self, search_text: str) -> Optional[str]: :param search_text: Text to search for suggestions :return: Suggested text or None """ - # TODO: https://typesense.org/docs/guide/query-suggestions.html - # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text? + # Not implemented, so no suggestions. return None From 98a92fb9a92cf260a5e19ac4f3936aee151862af Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Thu, 4 Sep 2025 11:39:00 +0930 Subject: [PATCH 5/5] fix: improve validation and type checking in response to review --- forum/search/typesense.py | 142 +++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/forum/search/typesense.py b/forum/search/typesense.py index c0095d90..d191c7b1 100644 --- a/forum/search/typesense.py +++ b/forum/search/typesense.py @@ -2,14 +2,13 @@ Typesense backend for searching comments and threads. """ -from typing import Any, Optional +from typing import Any, Optional, cast from bs4 import BeautifulSoup from django.conf import settings from django.core.paginator import Paginator -# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"' -from typesense import Client # type: ignore +from typesense.client import Client from typesense.types.collection import CollectionCreateSchema from typesense.types.document import DocumentSchema, SearchParameters from typesense.exceptions import ObjectNotFound @@ -79,75 +78,44 @@ def expected_full_collection_schema() -> dict[str, Any]: What is expected to be the full collection schema. Use this to validate the actual schema from the server. + Note that Typesense may add new keys to the schema; + this is ok, and validation should still pass. 
""" + field_defaults = { + "facet": False, + "index": True, + "infix": False, + "locale": "", + "optional": False, + "sort": False, + "stem": False, + "stem_dictionary": "", + "store": True, + "type": "string", + } return { "default_sorting_field": "", "enable_nested_fields": False, "fields": [ { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "thread_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "course_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "commentable_id", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "context", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, { - "facet": False, - "index": True, - "infix": False, - "locale": "", + **field_defaults, "name": "text", - "optional": False, - "sort": False, - "stem": False, - "stem_dictionary": "", - "store": True, - "type": "string", }, ], "name": collection_name(), @@ -206,7 +174,9 @@ def build_search_parameters( """ Build Typesense search parameters for searching the index. """ - # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. + # `context` is always a single word, + # so we can gain performance without losing accuracy by using the faster `:` (non-exact) operator. + # See https://typesense.org/docs/29.0/api/search.html#filter-parameters for more information. filters = [f"context:{quote_filter_value(context)}"] if commentable_ids: @@ -340,18 +310,66 @@ def validate_indices(self) -> None: Check if the indices exist and are valid. Raise an exception if any do not exist or if any are not valid. + Note that the validation is lengthy, + because Typesense may add new keys to the schema. + This is fine - we only want to assert that keys we know about are set as expected. + There are also some fields in the retrieved schema we don't care about - eg. 'created_at' """ client = get_typesense_client() - collection = client.collections[collection_name()].retrieve() - collection.pop("created_at") # type: ignore - collection.pop("num_documents") # type: ignore - if collection != expected_full_collection_schema(): - print(f"Expected schema: {expected_full_collection_schema()}") - print(f"Found schema: {collection}") - raise AssertionError( - f"Collection {collection_name()} exists, but schema does not match expected." + # cast to a wider type, because we want to use it in a more flexible way than TypedDict normally allows. 
+ actual_schema = cast( + dict[str, Any], client.collections[collection_name()].retrieve() + ) + expected_schema = expected_full_collection_schema() + errors: list[str] = [] + + expected_field_names = set( + map(lambda field: field["name"], expected_schema["fields"]) + ) + actual_field_names = set( + map(lambda field: field["name"], actual_schema["fields"]) + ) + + if missing_fields := expected_field_names - actual_field_names: + errors.append( + f"ERROR: '{collection_name()}' collection schema 'fields' has missing field(s): {missing_fields}." ) + if extra_fields := actual_field_names - expected_field_names: + errors.append( + f"ERROR: '{collection_name()}' collection schema 'fields' " + f"has unexpected extra field(s): {extra_fields}." + ) + + if actual_field_names == expected_field_names: + for expected_field, actual_field in zip( + sorted(expected_schema["fields"], key=lambda field: field["name"]), + sorted(actual_schema["fields"], key=lambda field: field["name"]), + ): + for key, expected_value in expected_field.items(): + if expected_value != actual_field[key]: + errors.append( + f"ERROR: in collection '{collection_name()}' fields, field '{expected_field['name']}', " + f"key '{key}' failed to validate. " + f"Expected: '{expected_value}', actual '{actual_field[key]}'." + ) + + for key, expected_value in expected_schema.items(): + if key == "fields": + # we've already validated fields separately above + continue + + if expected_value != actual_schema[key]: + errors.append( + f"ERROR: in collection '{collection_name()}', key '{key}' failed to validate. " + f"Expected: '{expected_value}', actual '{actual_schema[key]}'." + ) + + if errors: + for error in errors: + print(error) + raise AssertionError("\n".join(errors)) + def refresh_indices(self) -> None: """ Noop on Typesense, as all write API operations are synchronous.
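
A minimal usage sketch (not part of the patches above): it shows how a deployment might enable this backend and run a thread search once the patch series is applied. The setting names (TYPESENSE_ENABLED, TYPESENSE_API_KEY, TYPESENSE_URLS, TYPESENSE_COLLECTION_PREFIX) come from the patches; the placeholder values, the node-dict shape (host/port/protocol, as the typesense-python client expects), and the direct no-argument instantiation of the backend classes are assumptions — the forum app normally loads whatever class FORUM_SEARCH_BACKEND points at through its own plumbing, and rebuild_indices requires a MySQL-backed instance and a reachable Typesense node.

    # In the Django settings module (values below are illustrative placeholders):
    TYPESENSE_ENABLED = True
    TYPESENSE_API_KEY = "xyz"
    TYPESENSE_URLS = [{"host": "typesense", "port": "8108", "protocol": "http"}]
    TYPESENSE_COLLECTION_PREFIX = "dev_"   # collection will be named "dev_forum"

    # Elsewhere, with Django configured and forum data in MySQL:
    from forum.search.typesense import (
        TypesenseIndexBackend,
        TypesenseThreadSearchBackend,
    )

    # Drop and recreate the collection, then bulk-import threads and comments.
    TypesenseIndexBackend().rebuild_indices(batch_size=500)

    # Search both thread and comment text; returns the matching thread IDs.
    thread_ids = TypesenseThreadSearchBackend().get_thread_ids(
        context="course",
        group_ids=[],  # ignored by this backend (see the docstring note in the patch)
        search_text="dark matter",
        course_id="course-v1:edX+DemoX+Demo_Course",
    )
    print(thread_ids)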