Read embeddings from TDA and update OpenSearch mapping

jonavellecuerdo · jonavellecuerdo · commit 7e9459168194 · 2025-12-11T16:27:11.000-05:00
Why these changes are being introduced:
* Now that TDA supports reading embeddings associated with TIMDEX records in the TIMDEX dataset, the stub CLI command can be completed. The OpenSearch mapping requires a new field to store embeddings.

How this addresses that need:
* Add 'embedding_full_record' field to OpenSearch mapping
* Add helper method to format embeddings as input JSON for OpenSearch client
* Update the 'bulk_update_embeddings' CLI command to read embeddings via TDA, dropping the '--run-date' option

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/USE-181
diff --git a/Makefile b/Makefile
@@ -1,4 +1,5 @@
 ### This is the Terraform-generated header for timdex-index-manager-dev ###
+.PHONY: test
 ECR_NAME_DEV:=timdex-index-manager-dev
 ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/timdex-index-manager-dev
 ### End of Terraform-generated header ###
@@ -28,7 +29,7 @@ update: install # Update Python dependencies
 ######################
 
 test: # Run tests and print a coverage report
-	pipenv run coverage run --source=tim -m pytest -vv
+	pipenv run coverage run --source=tim -m pytest -vv 
 	pipenv run coverage report -m
 
 coveralls: test # Write coverage data to an LCOV report
diff --git a/config/opensearch_mappings.json b/config/opensearch_mappings.json
@@ -132,6 +132,9 @@
       "edition": {
         "type": "text"
       },
+      "embedding_full_record": {
+        "type": "rank_features"
+      },
       "file_formats": {
         "type": "keyword",
         "normalizer": "lowercase"
diff --git a/tim/cli.py b/tim/cli.py
@@ -343,15 +343,13 @@ def bulk_update(
         "records with embeddings."
     ),
 )
-@click.option("-d", "--run-date", help="Run date, formatted as YYYY-MM-DD.")
 @click.option("-rid", "--run-id", help="Run ID.")
 @click.argument("dataset_path", type=click.Path())
 @click.pass_context
 def bulk_update_embeddings(
     ctx: click.Context,
     index: str,
     source: str,
-    run_date: str,
     run_id: str,
     dataset_path: str,
 ) -> None:
@@ -378,42 +376,20 @@ def bulk_update_embeddings(
 
     td = TIMDEXDataset(location=dataset_path)
 
-    # TODO @ghukill: https://mitlibraries.atlassian.net/browse/USE-143 # noqa: FIX002
-    # Remove temporary code and replace with TDA
-    # method to read embeddings
-    # ==== START TEMPORARY CODE ====
-    # The code below reads transformed records from
-    # the TIMDEX dataset. To simulate embeddings,
-    # which are added to the record post-creation, a list
-    # of dicts containing only the 'timdex_record_id' and
-    # the new field (i.e., what would be the embedding fields)
-    # is created. For simulation purposes, the 'alternate_titles'
-    # field represents the new field as it is already added
-    # to the OpenSearch mapping in config/opensearch_mappings.json.
-    # When testing, the user is expected to pass in a source that
-    # does not set this field (e.g., libguides).
-    # Once TDA has been updated to read/write embeddings
-    # from/to the TIMDEX dataset, this code should be replaced
-    # with a simple call to read vector embeddings, which should
-    # return an iter of dicts representing the embeddings.
-    transformed_records = td.read_transformed_records_iter(
-        run_date=run_date,
+    # bulk index embeddings
+    embeddings = td.embeddings.read_dicts_iter(
+        table="current_run_embeddings",
+        columns=[
+            "timdex_record_id",
+            "embedding_strategy",
+            "embedding_object",
+        ],
         run_id=run_id,
-        action="index",
     )
+    embeddings_to_index = helpers.format_embeddings(embeddings)
 
-    records_to_update = iter(
-        [
-            {
-                "timdex_record_id": record["timdex_record_id"],
-                "alternate_titles": [{"kind": "Test", "value": "Test Alternate Title"}],
-            }
-            for record in transformed_records
-        ]
-    )
-    # ==== END TEMPORARY CODE ====
     try:
-        update_results.update(tim_os.bulk_update(client, index, records_to_update))
+        update_results.update(tim_os.bulk_update(client, index, embeddings_to_index))
     except BulkIndexingError as exception:
         logger.info(f"Bulk update with embeddings failed: {exception}")
 
diff --git a/tim/helpers.py b/tim/helpers.py
@@ -1,3 +1,4 @@
+import json
 from collections.abc import Generator, Iterator
 from datetime import UTC, datetime
 
@@ -18,6 +19,22 @@ def confirm_action(input_prompt: str) -> bool:
     return confirm_action(input_prompt)
 
 
+def format_embeddings(embeddings: Iterator[dict]) -> Iterator[dict]:
+    """Format embeddings for bulk update command.
+
+    This method yields a dict that maps the embedding to the
+    corresponding field in OpenSearch, using the 'embedding_strategy'
+    to form the field name and assigning 'embedding_object' as the value.
+    """
+    for embedding in embeddings:
+        yield {
+            "timdex_record_id": embedding["timdex_record_id"],
+            f"embedding_{embedding["embedding_strategy"]}": json.loads(
+                embedding["embedding_object"]
+            ),
+        }
+
+
 def generate_index_name(source: str) -> str:
     """Generate a new index name from a source short name.
 
diff --git a/tim/opensearch.py b/tim/opensearch.py
@@ -467,5 +467,4 @@ def bulk_update(
         result["total"] += 1
         if result["total"] % int(os.getenv("STATUS_UPDATE_INTERVAL", "1000")) == 0:
             logger.info("Status update: %s records updated so far!", result["total"])
-    logger.debug(response)
     return result