Skip to content

Commit 7e94591

Browse files
Read embeddings from TDA and update OpenSearch mapping
Why these changes are being introduced:
* Now that TDA supports reading embeddings associated with TIMDEX records in the TIMDEX dataset, the stub CLI command can be completed. The OpenSearch mapping requires a new field to store embeddings.

How this addresses that need:
* Add 'embedding_full_record' field to the OpenSearch mapping
* Add a helper method to format embeddings as input JSON for the OpenSearch client
* Update the 'bulk_update_embeddings' CLI command to read embeddings via TDA, replacing the temporary simulation code and removing the '--run-date' option

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/USE-181
1 parent e85e7be commit 7e94591

File tree

5 files changed

+32
-36
lines changed

5 files changed

+32
-36
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
### This is the Terraform-generated header for timdex-index-manager-dev ###
2+
.PHONY: test
23
ECR_NAME_DEV:=timdex-index-manager-dev
34
ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/timdex-index-manager-dev
45
### End of Terraform-generated header ###
@@ -28,7 +29,7 @@ update: install # Update Python dependencies
2829
######################
2930

3031
test: # Run tests and print a coverage report
31-
pipenv run coverage run --source=tim -m pytest -vv
32+
pipenv run coverage run --source=tim -m pytest -vv
3233
pipenv run coverage report -m
3334

3435
coveralls: test # Write coverage data to an LCOV report

config/opensearch_mappings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@
132132
"edition": {
133133
"type": "text"
134134
},
135+
"embedding_full_record": {
136+
"type": "rank_features"
137+
},
135138
"file_formats": {
136139
"type": "keyword",
137140
"normalizer": "lowercase"

tim/cli.py

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -343,15 +343,13 @@ def bulk_update(
343343
"records with embeddings."
344344
),
345345
)
346-
@click.option("-d", "--run-date", help="Run date, formatted as YYYY-MM-DD.")
347346
@click.option("-rid", "--run-id", help="Run ID.")
348347
@click.argument("dataset_path", type=click.Path())
349348
@click.pass_context
350349
def bulk_update_embeddings(
351350
ctx: click.Context,
352351
index: str,
353352
source: str,
354-
run_date: str,
355353
run_id: str,
356354
dataset_path: str,
357355
) -> None:
@@ -378,42 +376,20 @@ def bulk_update_embeddings(
378376

379377
td = TIMDEXDataset(location=dataset_path)
380378

381-
# TODO @ghukill: https://mitlibraries.atlassian.net/browse/USE-143 # noqa: FIX002
382-
# Remove temporary code and replace with TDA
383-
# method to read embeddings
384-
# ==== START TEMPORARY CODE ====
385-
# The code below reads transformed records from
386-
# the TIMDEX dataset. To simulate embeddings,
387-
# which are added to the record post-creation, a list
388-
# of dicts containing only the 'timdex_record_id' and
389-
# the new field (i.e., what would be the embedding fields)
390-
# is created. For simulation purposes, the 'alternate_titles'
391-
# field represents the new field as it is already added
392-
# to the OpenSearch mapping in config/opensearch_mappings.json.
393-
# When testing, the user is expected to pass in a source that
394-
# does not set this field (e.g., libguides).
395-
# Once TDA has been updated to read/write embeddings
396-
# from/to the TIMDEX dataset, this code should be replaced
397-
# with a simple call to read vector embeddings, which should
398-
# return an iter of dicts representing the embeddings.
399-
transformed_records = td.read_transformed_records_iter(
400-
run_date=run_date,
379+
# bulk index embeddings
380+
embeddings = td.embeddings.read_dicts_iter(
381+
table="current_run_embeddings",
382+
columns=[
383+
"timdex_record_id",
384+
"embedding_strategy",
385+
"embedding_object",
386+
],
401387
run_id=run_id,
402-
action="index",
403388
)
389+
embeddings_to_index = helpers.format_embeddings(embeddings)
404390

405-
records_to_update = iter(
406-
[
407-
{
408-
"timdex_record_id": record["timdex_record_id"],
409-
"alternate_titles": [{"kind": "Test", "value": "Test Alternate Title"}],
410-
}
411-
for record in transformed_records
412-
]
413-
)
414-
# ==== END TEMPORARY CODE ====
415391
try:
416-
update_results.update(tim_os.bulk_update(client, index, records_to_update))
392+
update_results.update(tim_os.bulk_update(client, index, embeddings_to_index))
417393
except BulkIndexingError as exception:
418394
logger.info(f"Bulk update with embeddings failed: {exception}")
419395

tim/helpers.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from collections.abc import Generator, Iterator
23
from datetime import UTC, datetime
34

@@ -18,6 +19,22 @@ def confirm_action(input_prompt: str) -> bool:
1819
return confirm_action(input_prompt)
1920

2021

22+
def format_embeddings(embeddings: Iterator[dict]) -> Iterator[dict]:
    """Format embeddings for bulk update command.

    Lazily yields one dict per input embedding, mapping the embedding to
    the corresponding field in OpenSearch. The field name is formed by
    prefixing the 'embedding_strategy' value with 'embedding_', and the
    field value is the JSON-decoded 'embedding_object'.

    Args:
        embeddings: Iterator of dicts, each providing the keys
            'timdex_record_id', 'embedding_strategy', and
            'embedding_object' (a JSON document accepted by json.loads).

    Yields:
        A dict with 'timdex_record_id' and a single
        'embedding_<embedding_strategy>' key holding the decoded embedding.

    Raises:
        KeyError: If an embedding is missing one of the required keys.
        json.JSONDecodeError: If 'embedding_object' is not valid JSON.
    """
    for embedding in embeddings:
        yield {
            "timdex_record_id": embedding["timdex_record_id"],
            # Single quotes inside the replacement field: reusing the
            # enclosing double quote only parses on Python 3.12+ (PEP 701)
            # and is a SyntaxError on earlier interpreters.
            f"embedding_{embedding['embedding_strategy']}": json.loads(
                embedding["embedding_object"]
            ),
        }
36+
37+
2138
def generate_index_name(source: str) -> str:
2239
"""Generate a new index name from a source short name.
2340

tim/opensearch.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,5 +467,4 @@ def bulk_update(
467467
result["total"] += 1
468468
if result["total"] % int(os.getenv("STATUS_UPDATE_INTERVAL", "1000")) == 0:
469469
logger.info("Status update: %s records updated so far!", result["total"])
470-
logger.debug(response)
471470
return result

0 commit comments

Comments
 (0)