From 72c77639f967ba003878fdc2da261b2366324984 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 29 Jan 2026 11:54:17 +0200 Subject: [PATCH 01/35] Create enrichment table and model --- app/models/enrichment.rb | 34 ++++ app/models/schemas/enrichment/enrichment.json | 162 ++++++++++++++++++ config/routes.rb | 2 + .../20260120083752_create_enrichments.rb | 19 ++ db/schema.rb | 71 ++++---- 5 files changed, 259 insertions(+), 29 deletions(-) create mode 100644 app/models/enrichment.rb create mode 100644 app/models/schemas/enrichment/enrichment.json create mode 100644 db/migrate/20260120083752_create_enrichments.rb diff --git a/app/models/enrichment.rb b/app/models/enrichment.rb new file mode 100644 index 000000000..08ee8f23a --- /dev/null +++ b/app/models/enrichment.rb @@ -0,0 +1,34 @@ +class Enrichment < ApplicationRecord + validate :validate_json_schema + + private + + def validate_json_schema + doc = to_enrichment_hash + error_list = self.class.enrichment_schemer.validate(doc).to_a + + return if error_list.empty? + + errors.add(:base, "Validation failed: #{error_list.map { |e| e['message'] || e.inspect }.join('; ')}") + end + + def to_enrichment_hash + { + "doi" => doi, + "contributors" => contributors, + "resources" => resources, + "field" => field, + "action" => action, + "originalValue" => original_value, + "enrichedValue" => enriched_value + }.compact + end + + def self.enrichment_schemer + @enrichment_schemer ||= begin + schema_path = Rails.root.join("app/models/schemas/enrichment/enrichment.json") + schema = JSON.parse(File.read(schema_path)) + JSONSchemer.schema(schema) + end + end +end diff --git a/app/models/schemas/enrichment/enrichment.json b/app/models/schemas/enrichment/enrichment.json new file mode 100644 index 000000000..d69a12839 --- /dev/null +++ b/app/models/schemas/enrichment/enrichment.json @@ -0,0 +1,162 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://json-schema.org/draft-07/schema#", + "title": "Core schema meta-schema", + "definitions": { + "schemaArray": { + "type": "array", + "minItems": 1, + "items": { "$ref": "#" } + }, + "nonNegativeInteger": { + "type": "integer", + "minimum": 0 + }, + "nonNegativeIntegerDefault0": { + "allOf": [ + { "$ref": "#/definitions/nonNegativeInteger" }, + { "default": 0 } + ] + }, + "simpleTypes": { + "enum": [ + "array", + "boolean", + "integer", + "null", + "number", + "object", + "string" + ] + }, + "stringArray": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true, + "default": [] + } + }, + "type": ["object", "boolean"], + "properties": { + "$id": { + "type": "string", + "format": "uri-reference" + }, + "$schema": { + "type": "string", + "format": "uri" + }, + "$ref": { + "type": "string", + "format": "uri-reference" + }, + "$comment": { + "type": "string" + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "default": true, + "readOnly": { + "type": "boolean", + "default": false + }, + "examples": { + "type": "array", + "items": true + }, + "multipleOf": { + "type": "number", + "exclusiveMinimum": 0 + }, + "maximum": { + "type": "number" + }, + "exclusiveMaximum": { + "type": "number" + }, + "minimum": { + "type": "number" + }, + "exclusiveMinimum": { + "type": "number" + }, + "maxLength": { "$ref": "#/definitions/nonNegativeInteger" }, + "minLength": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, + "pattern": { + "type": "string", + "format": "regex" + }, + "additionalItems": { "$ref": "#" }, + "items": { + "anyOf": [{ "$ref": "#" }, { "$ref": "#/definitions/schemaArray" }], + "default": true + }, + "maxItems": { "$ref": "#/definitions/nonNegativeInteger" }, + "minItems": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, + "uniqueItems": { + "type": "boolean", + "default": false + }, + "contains": { "$ref": "#" }, + "maxProperties": { "$ref": "#/definitions/nonNegativeInteger" }, + "minProperties": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, + "required": { "$ref": "#/definitions/stringArray" }, + "additionalProperties": { "$ref": "#" }, + "definitions": { + "type": "object", + "additionalProperties": { "$ref": "#" }, + "default": {} + }, + "properties": { + "type": "object", + "additionalProperties": { "$ref": "#" }, + "default": {} + }, + "patternProperties": { + "type": "object", + "additionalProperties": { "$ref": "#" }, + "propertyNames": { "format": "regex" }, + "default": {} + }, + "dependencies": { + "type": "object", + "additionalProperties": { + "anyOf": [{ "$ref": "#" }, { "$ref": "#/definitions/stringArray" }] + } + }, + "propertyNames": { "$ref": "#" }, + "const": true, + "enum": { + "type": "array", + "items": true, + "minItems": 1, + "uniqueItems": true + }, + "type": { + "anyOf": [ + { "$ref": "#/definitions/simpleTypes" }, + { + "type": "array", + "items": { "$ref": "#/definitions/simpleTypes" }, + "minItems": 1, + "uniqueItems": true + } + ] + }, + "format": { "type": "string" }, + "contentMediaType": { "type": "string" }, + "contentEncoding": { "type": "string" }, + "if": { "$ref": "#" }, + "then": { "$ref": "#" }, + "else": { "$ref": "#" }, + "allOf": { "$ref": "#/definitions/schemaArray" }, + "anyOf": { "$ref": "#/definitions/schemaArray" }, + "oneOf": { "$ref": "#/definitions/schemaArray" }, + "not": { "$ref": "#" } + }, + "default": true +} diff --git a/config/routes.rb b/config/routes.rb index c6dfce4fa..c9f65f344 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -195,6 +195,8 @@ resources :activities end + resources :enrichments, only: %i[index] + resources :reference_repositories, path: "reference-repositories", only: %i[index show], constraints: { id: /.+/ } resources :client_prefixes, path: "client-prefixes" diff --git a/db/migrate/20260120083752_create_enrichments.rb b/db/migrate/20260120083752_create_enrichments.rb new file mode 100644 index 000000000..d2b403163 --- /dev/null +++ b/db/migrate/20260120083752_create_enrichments.rb @@ -0,0 +1,19 @@ +class CreateEnrichments < ActiveRecord::Migration[7.2] + disable_departure! + + def change + create_table :enrichments, options: "DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci" do |t| + t.string :doi, null: false + t.json :contributors, null: false + t.json :resources, null: false + t.string :field, null: false + t.string :action, null: false + t.json :original_value, null: true + t.json :enriched_value, null: true + + t.timestamps + end + + add_index :enrichments, [:doi, :updated_at, :id], order: { updated_at: :desc, id: :desc } + end +end diff --git a/db/schema.rb b/db/schema.rb index 3a82fe4b7..a0ac03d85 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.2].define(version: 2025_11_10_121400) do +ActiveRecord::Schema[7.2].define(version: 2026_01_20_083752) do create_table "active_storage_attachments", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| t.string "name", limit: 191, null: false t.string "record_type", null: false @@ -28,7 +28,7 @@ t.text "metadata" t.bigint "byte_size", null: false t.string "checksum", limit: 191 - t.datetime "created_at", precision: nil, null: false + t.datetime "created_at", null: false t.string "service_name", null: false t.index ["key"], name: "index_active_storage_blobs_on_key", unique: true end @@ -41,7 +41,7 @@ create_table "allocator", charset: "utf8mb3", force: :cascade do |t| t.string "system_email", null: false - t.datetime "created", precision: nil + t.datetime "created" t.integer "doi_quota_allowed", null: false t.integer "doi_quota_used", null: false t.binary "is_active", limit: 1 @@ -49,7 +49,7 @@ t.string "password" t.string "role_name" t.string "symbol", null: false - t.datetime "updated", precision: nil + t.datetime "updated" t.integer "version" t.text "comments", size: :long t.string "experiments" @@ -57,7 +57,7 @@ t.string "region" t.string "country_code" t.string "website" - t.datetime "deleted_at", precision: nil + t.datetime "deleted_at" t.date "joined" t.string "logo" t.string "focus_area", limit: 191 @@ -81,7 +81,7 @@ t.string "logo_file_name" t.string "logo_content_type" t.bigint "logo_file_size" - t.datetime "logo_updated_at", precision: nil + t.datetime "logo_updated_at" t.integer "doi_estimate", default: 0, null: false t.index ["deleted_at"], name: "index_allocator_deleted_at" t.index ["organization_type"], name: "index_allocator_organization_type" @@ -115,8 +115,8 @@ create_table "client_prefixes", charset: "utf8mb3", force: :cascade do |t| t.bigint "client_id", null: false t.bigint "prefix_id", null: false - t.datetime "created_at", precision: nil - t.datetime "updated_at", precision: nil + t.datetime "created_at" + t.datetime "updated_at" t.bigint "provider_prefix_id" t.string "uid", null: false t.index ["client_id"], name: "FK13A1B3BA47B5F5FF" @@ -132,9 +132,9 @@ t.string "family_name" t.string "email" t.json "role_name" - t.datetime "created_at", precision: nil - t.datetime "updated_at", precision: nil - t.datetime "deleted_at", precision: nil + t.datetime "created_at" + t.datetime "updated_at" + t.datetime "deleted_at" t.index ["deleted_at"], name: "index_contacts_deleted_at" t.index ["email"], name: "index_contacts_email" t.index ["uid"], name: "index_contacts_uid" @@ -143,7 +143,7 @@ create_table "datacentre", charset: "utf8mb3", force: :cascade do |t| t.text "comments", size: :long t.string "system_email", null: false - t.datetime "created", precision: nil + t.datetime "created" t.integer "doi_quota_allowed", null: false t.integer "doi_quota_used", null: false t.text "domains" @@ -152,11 +152,11 @@ t.string "password" t.string "role_name" t.string "symbol", null: false - t.datetime "updated", precision: nil + t.datetime "updated" t.integer "version" t.bigint "allocator", null: false t.string "experiments" - t.datetime "deleted_at", precision: nil + t.datetime "deleted_at" t.string "re3data_id" t.text "url" t.string "software", limit: 191 @@ -182,18 +182,18 @@ end create_table "dataset", charset: "utf8mb3", force: :cascade do |t| - t.datetime "created", precision: nil + t.datetime "created" t.string "doi", null: false t.binary "is_active", limit: 1, null: false t.binary "is_ref_quality", limit: 1 t.integer "last_landing_page_status" - t.datetime "last_landing_page_status_check", precision: nil + t.datetime "last_landing_page_status_check" t.json "last_landing_page_status_result" t.string "last_metadata_status" - t.datetime "updated", precision: nil + t.datetime "updated" t.integer "version" t.bigint "datacentre", null: false - t.datetime "minted", precision: nil + t.datetime "minted" t.text "url" t.text "last_landing_page" t.string "last_landing_page_content_type" @@ -239,6 +239,19 @@ t.index ["url"], name: "index_dataset_on_url", length: 100 end + create_table "enrichments", charset: "utf8mb3", force: :cascade do |t| + t.string "doi", null: false + t.json "contributors", null: false + t.json "resources", null: false + t.string "field", null: false + t.string "action", null: false + t.json "original_value" + t.json "enriched_value" + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["doi", "updated_at", "id"], name: "index_enrichments_on_doi_and_updated_at_and_id", order: { updated_at: :desc, id: :desc } + end + create_table "events", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| t.text "uuid", null: false t.text "subj_id", null: false @@ -251,8 +264,8 @@ t.text "source_token" t.datetime "created_at", precision: 3, null: false t.datetime "updated_at", precision: 3, null: false - t.datetime "indexed_at", precision: nil, default: "1970-01-01 00:00:00", null: false - t.datetime "occurred_at", precision: nil + t.datetime "indexed_at", default: "1970-01-01 00:00:00", null: false + t.datetime "occurred_at" t.string "message_action", limit: 191, default: "create", null: false t.string "relation_type_id", limit: 191 t.text "subj", size: :medium @@ -273,9 +286,9 @@ end create_table "media", charset: "utf8mb3", force: :cascade do |t| - t.datetime "created", precision: nil + t.datetime "created" t.string "media_type", limit: 80 - t.datetime "updated", precision: nil + t.datetime "updated" t.text "url", null: false t.integer "version" t.bigint "dataset", null: false @@ -286,7 +299,7 @@ end create_table "metadata", charset: "utf8mb3", force: :cascade do |t| - t.datetime "created", precision: nil + t.datetime "created" t.integer "metadata_version" t.integer "version" t.binary "xml", size: :medium @@ -299,7 +312,7 @@ end create_table "prefixes", charset: "utf8mb3", force: :cascade do |t| - t.datetime "created_at", precision: nil + t.datetime "created_at" t.string "uid", limit: 80, null: false t.index ["uid"], name: "prefix", unique: true end @@ -307,8 +320,8 @@ create_table "provider_prefixes", charset: "utf8mb3", force: :cascade do |t| t.bigint "provider_id", null: false t.bigint "prefix_id", null: false - t.datetime "created_at", precision: nil - t.datetime "updated_at", precision: nil + t.datetime "created_at" + t.datetime "updated_at" t.string "uid", null: false t.index ["prefix_id"], name: "FKE7FBD674AF86A1C7" t.index ["provider_id"], name: "FKE7FBD67446EBD781" @@ -318,9 +331,9 @@ create_table "reference_repositories", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| t.string "client_id" t.string "re3doi" - t.datetime "created_at", precision: nil, null: false - t.datetime "updated_at", precision: nil, null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false end - add_foreign_key "active_storage_variant_records", "active_storage_blobs", column: "blob_id", name: "_fk_rails_993965df05" + add_foreign_key "active_storage_variant_records", "active_storage_blobs", column: "blob_id" end From bafbb6fe2aa257a671654ec77771e9f4cb9a93cb Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 29 Jan 2026 12:03:20 +0200 Subject: [PATCH 02/35] Appease our rubocop overlords --- app/models/enrichment.rb | 47 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/app/models/enrichment.rb b/app/models/enrichment.rb index 08ee8f23a..f2d949f64 100644 --- a/app/models/enrichment.rb +++ b/app/models/enrichment.rb @@ -2,33 +2,32 @@ class Enrichment < ApplicationRecord validate :validate_json_schema private + def validate_json_schema + doc = to_enrichment_hash + error_list = self.class.enrichment_schemer.validate(doc).to_a - def validate_json_schema - doc = to_enrichment_hash - error_list = self.class.enrichment_schemer.validate(doc).to_a + return if error_list.empty? - return if error_list.empty? - - errors.add(:base, "Validation failed: #{error_list.map { |e| e['message'] || e.inspect }.join('; ')}") - end + errors.add(:base, "Validation failed: #{error_list.map { |e| e['message'] || e.inspect }.join('; ')}") + end - def to_enrichment_hash - { - "doi" => doi, - "contributors" => contributors, - "resources" => resources, - "field" => field, - "action" => action, - "originalValue" => original_value, - "enrichedValue" => enriched_value - }.compact - end + def to_enrichment_hash + { + "doi" => doi, + "contributors" => contributors, + "resources" => resources, + "field" => field, + "action" => action, + "originalValue" => original_value, + "enrichedValue" => enriched_value + }.compact + end - def self.enrichment_schemer - @enrichment_schemer ||= begin - schema_path = Rails.root.join("app/models/schemas/enrichment/enrichment.json") - schema = JSON.parse(File.read(schema_path)) - JSONSchemer.schema(schema) + def self.enrichment_schemer + @enrichment_schemer ||= begin + schema_path = Rails.root.join("app/models/schemas/enrichment/enrichment.json") + schema = JSON.parse(File.read(schema_path)) + JSONSchemer.schema(schema) + end end - end end From b241269a538c2334171a9dc5b09e9837d0676d31 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 29 Jan 2026 12:47:39 +0200 Subject: [PATCH 03/35] Update json schema --- app/models/schemas/enrichment/enrichment.json | 475 +++++++++++++----- 1 file changed, 336 insertions(+), 139 deletions(-) diff --git a/app/models/schemas/enrichment/enrichment.json b/app/models/schemas/enrichment/enrichment.json index d69a12839..ea2382201 100644 --- a/app/models/schemas/enrichment/enrichment.json +++ b/app/models/schemas/enrichment/enrichment.json @@ -1,162 +1,359 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "http://json-schema.org/draft-07/schema#", - "title": "Core schema meta-schema", - "definitions": { - "schemaArray": { + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "type": "string", + "description": "The target DOI of the enrichment record." + }, + "contributors": { "type": "array", "minItems": 1, - "items": { "$ref": "#" } - }, - "nonNegativeInteger": { - "type": "integer", - "minimum": 0 - }, - "nonNegativeIntegerDefault0": { - "allOf": [ - { "$ref": "#/definitions/nonNegativeInteger" }, - { "default": 0 } - ] - }, - "simpleTypes": { - "enum": [ - "array", - "boolean", - "integer", - "null", - "number", - "object", - "string" - ] + "description": "The source entities of the enrichment represented as an array of contributors", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["name", "contributorType"], + "properties": { + "name": { + "type": "string" + }, + "nameType": { + "$ref": "#/definitions/nameType" + }, + "givenName": { + "type": ["string", "null"] + }, + "familyName": { + "type": ["string", "null"] + }, + "lang": { + "type": ["string", "null"] + }, + "affiliation": { + "$ref": "#/definitions/affiliations" + }, + "nameIdentifiers": { + "$ref": "#/definitions/nameIdentifiers" + }, + "contributorType": { + "$ref": "#/definitions/contributorTypes" + } + } + } }, - "stringArray": { + "resources": { "type": "array", - "items": { "type": "string" }, - "uniqueItems": true, - "default": [] - } - }, - "type": ["object", "boolean"], - "properties": { - "$id": { - "type": "string", - "format": "uri-reference" + "minItems": 1, + "description": "The processes that produced the enrichment represented as an array of relatedIdentifiers", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "relatedIdentifier", + "relationType", + "relatedIdentifierType" + ], + "properties": { + "relatedIdentifier": { + "type": "string" + }, + "relationType": { + "$ref": "#/definitions/relationTypes" + }, + "relatedIdentifierType": { + "$ref": "#/definitions/relatedIdentifierTypes" + }, + "relatedMetadataScheme": { + "type": "string" + }, + "schemeUri": { + "type": "string" + }, + "resourceTypeGeneral": { + "$ref": "#/definitions/resourceTypeGeneral" + } + } + } }, - "$schema": { + "field": { "type": "string", - "format": "uri" + "description": "The top-level field to enrich.", + "enum": [ + "alternateIdentifiers", + "creators", + "titles", + "publisher", + "publicationYear", + "subjects", + "contributors", + "dates", + "language", + "types", + "relatedIdentifiers", + "relatedItems", + "sizes", + "formats", + "version", + "rightsList", + "descriptions", + "geoLocations", + "fundingReferences" + ] }, - "$ref": { + "action": { "type": "string", - "format": "uri-reference" - }, - "$comment": { - "type": "string" + "description": "The action that the enrichment performs. The available actions are: update (change the entire top-level \"field\" from the \"originalValue\" to the \"enrichedValue\") updateChild (change a child object within the top-level \"field\" from the \"originalValue\" to the \"enrichedValue\") insert (insert the object in \"enrichedValue\" into the top-level \"field\") deleteChild (remove the object in \"originalValue\" within the top-level field specified in \"field\")", + "enum": ["update", "updateChild", "insert", "deleteChild"] }, - "title": { - "type": "string" + "originalValue": { + "description": "When the action is update, updateChild, or deleteChild, the original value of the field or child value. Otherwise, this field is empty.", + "oneOf": [ + { + "type": "object" + }, + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array" + } + ] }, - "description": { - "type": "string" + "enrichedValue": { + "description": "When the action is update, updateChild, or insert, the enriched value of the field or child value. Otherwise, this field is empty.", + "oneOf": [ + { + "type": "object" + }, + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array" + } + ] + } + }, + "required": ["doi", "contributors", "resources", "action", "field"], + "allOf": [ + { + "if": { + "properties": { + "action": { + "enum": ["update", "updateChild", "deleteChild"] + } + } + }, + "then": { + "required": ["originalValue"] + } }, - "default": true, - "readOnly": { - "type": "boolean", - "default": false + { + "if": { + "properties": { + "action": { + "enum": ["update", "updateChild", "insert"] + } + } + }, + "then": { + "required": ["enrichedValue"] + } + } + ], + "definitions": { + "nameType": { + "anyOf": [ + { + "type": "string", + "enum": ["Organizational", "Personal"] + }, + { + "type": "null" + } + ] }, - "examples": { + "affiliations": { "type": "array", - "items": true - }, - "multipleOf": { - "type": "number", - "exclusiveMinimum": 0 - }, - "maximum": { - "type": "number" - }, - "exclusiveMaximum": { - "type": "number" - }, - "minimum": { - "type": "number" + "items": { + "type": "object", + "required": ["name"], + "additionalProperties": false, + "properties": { + "affiliationIdentifier": { + "type": ["string", "null"] + }, + "affiliationIdentifierScheme": { + "type": ["string", "null"] + }, + "name": { + "type": "string" + }, + "schemeUri": { + "type": ["string", "null"] + } + } + } }, - "exclusiveMinimum": { - "type": "number" + "nameIdentifiers": { + "type": "array", + "items": { + "type": "object", + "required": ["nameIdentifier", "nameIdentifierScheme"], + "additionalProperties": false, + "properties": { + "schemeUri": { + "type": ["string", "null"] + }, + "nameIdentifier": { + "type": "string" + }, + "nameIdentifierScheme": { + "type": "string" + } + } + } }, - "maxLength": { "$ref": "#/definitions/nonNegativeInteger" }, - "minLength": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, - "pattern": { + "relationTypes": { "type": "string", - "format": "regex" - }, - "additionalItems": { "$ref": "#" }, - "items": { - "anyOf": [{ "$ref": "#" }, { "$ref": "#/definitions/schemaArray" }], - "default": true - }, - "maxItems": { "$ref": "#/definitions/nonNegativeInteger" }, - "minItems": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, - "uniqueItems": { - "type": "boolean", - "default": false - }, - "contains": { "$ref": "#" }, - "maxProperties": { "$ref": "#/definitions/nonNegativeInteger" }, - "minProperties": { "$ref": "#/definitions/nonNegativeIntegerDefault0" }, - "required": { "$ref": "#/definitions/stringArray" }, - "additionalProperties": { "$ref": "#" }, - "definitions": { - "type": "object", - "additionalProperties": { "$ref": "#" }, - "default": {} - }, - "properties": { - "type": "object", - "additionalProperties": { "$ref": "#" }, - "default": {} - }, - "patternProperties": { - "type": "object", - "additionalProperties": { "$ref": "#" }, - "propertyNames": { "format": "regex" }, - "default": {} - }, - "dependencies": { - "type": "object", - "additionalProperties": { - "anyOf": [{ "$ref": "#" }, { "$ref": "#/definitions/stringArray" }] - } + "enum": [ + "IsCitedBy", + "Cites", + "IsSupplementTo", + "IsSupplementedBy", + "IsContinuedBy", + "Continues", + "IsDescribedBy", + "Describes", + "HasMetadata", + "IsMetadataFor", + "HasVersion", + "IsVersionOf", + "IsNewVersionOf", + "IsPreviousVersionOf", + "IsPartOf", + "HasPart", + "IsPublishedIn", + "IsReferencedBy", + "References", + "IsDocumentedBy", + "Documents", + "IsCompiledBy", + "Compiles", + "IsVariantFormOf", + "IsOriginalFormOf", + "IsIdenticalTo", + "IsReviewedBy", + "Reviews", + "IsDerivedFrom", + "IsSourceOf", + "IsRequiredBy", + "Requires", + "IsObsoletedBy", + "Obsoletes", + "IsCollectedBy", + "Collects", + "IsTranslationOf", + "HasTranslation" + ] }, - "propertyNames": { "$ref": "#" }, - "const": true, - "enum": { - "type": "array", - "items": true, - "minItems": 1, - "uniqueItems": true + "relatedIdentifierTypes": { + "type": "string", + "enum": [ + "ARK", + "arXiv", + "bibcode", + "CSTR", + "DOI", + "EAN13", + "EISSN", + "Handle", + "IGSN", + "ISBN", + "ISSN", + "ISTC", + "LISSN", + "LSID", + "PMID", + "PURL", + "RRID", + "UPC", + "URL", + "URN", + "w3id" + ] }, - "type": { - "anyOf": [ - { "$ref": "#/definitions/simpleTypes" }, - { - "type": "array", - "items": { "$ref": "#/definitions/simpleTypes" }, - "minItems": 1, - "uniqueItems": true - } + "resourceTypeGeneral": { + "type": "string", + "enum": [ + "Audiovisual", + "Award", + "Book", + "BookChapter", + "Collection", + "ComputationalNotebook", + "ConferencePaper", + "ConferenceProceeding", + "DataPaper", + "Dataset", + "Dissertation", + "Event", + "Image", + "InteractiveResource", + "Instrument", + "Journal", + "JournalArticle", + "Model", + "OutputManagementPlan", + "PeerReview", + "PhysicalObject", + "Preprint", + "Project", + "Report", + "Service", + "Software", + "Sound", + "Standard", + "StudyRegistration", + "Text", + "Workflow", + "Other" ] }, - "format": { "type": "string" }, - "contentMediaType": { "type": "string" }, - "contentEncoding": { "type": "string" }, - "if": { "$ref": "#" }, - "then": { "$ref": "#" }, - "else": { "$ref": "#" }, - "allOf": { "$ref": "#/definitions/schemaArray" }, - "anyOf": { "$ref": "#/definitions/schemaArray" }, - "oneOf": { "$ref": "#/definitions/schemaArray" }, - "not": { "$ref": "#" } - }, - "default": true + "contributorTypes": { + "type": "string", + "enum": [ + "ContactPerson", + "DataCollector", + "DataCurator", + "DataManager", + "Distributor", + "Editor", + "HostingInstitution", + "Producer", + "ProjectLeader", + "ProjectManager", + "ProjectMember", + "RegistrationAgency", + "RegistrationAuthority", + "RelatedPerson", + "Researcher", + "ResearchGroup", + "RightsHolder", + "Sponsor", + "Supervisor", + "Translator", + "WorkPackageLeader", + "Other" + ] + } + } } From 13ab7860c6a8e25a5b5c8558227c96162bba31b6 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 29 Jan 2026 13:20:46 +0200 Subject: [PATCH 04/35] Specs --- spec/models/enrichment_spec.rb | 132 +++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 spec/models/enrichment_spec.rb diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb new file mode 100644 index 000000000..07ae4e881 --- /dev/null +++ b/spec/models/enrichment_spec.rb @@ -0,0 +1,132 @@ +# frozen_string_literal: true + +require "rails_helper" + +RSpec.describe Enrichment, type: :model do + describe "json schema validation" do + let(:valid_attributes) do + { + doi: "10.1234/example", + contributors: [ + { + name: "COMET", + nameType: "Organizational", + contributorType: "ResearchGroup", + affiliation: [], + nameIdentifiers: [] + } + ], + resources: [ + { + relatedIdentifier: "10.1234/example_dataset", + relationType: "IsDerivedFrom", + relatedIdentifierType: "DOI", + resourceTypeGeneral: "Dataset" + } + ], + field: "creators", + action: "updateChild", + original_value: { + name: "Chadly, Duncan", + nameType: "Personal", + givenName: "Duncan", + familyName: "Chadly", + affiliation: [ + { + name: "California Institute of Technology", + affiliationIdentifier: "https://ror.org/05dxps055", + affiliationIdentifierScheme: "ROR" + } + ], + nameIdentifiers: [ + { + nameIdentifier: "https://orcid.org/0000-0002-8417-1522", + nameIdentifierScheme: "ORCID" + } + ] + }, + enriched_value: { + name: "Chadly, Duncan", + nameType: "Personal", + givenName: "Duncan", + familyName: "Chadly", + affiliation: [ + { + name: "California Institute of Technology", + affiliationIdentifier: "https://ror.org/05dxps055", + affiliationIdentifierScheme: "ROR" + } + ], + nameIdentifiers: [ + { + nameIdentifier: "https://orcid.org/0000-0002-8417-1522", + nameIdentifierScheme: "ORCID" + } + ] + } + } + end + + context "when schema validation passes" do + it "is valid" do + enrichment = described_class.new(valid_attributes) + expect(enrichment).to be_valid + end + + it "does not add base errors" do + enrichment = described_class.new(valid_attributes) + enrichment.validate + expect(enrichment.errors[:base]).to be_empty + end + end + + context "when schema validation fails" do + it "adds a base error containing 'Validation failed'" do + enrichment = described_class.new(valid_attributes.except(:field)) + expect(enrichment).not_to be_valid + + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "includes the raw schema error output (message or inspect) in the base error" do + enrichment = described_class.new(valid_attributes.except(:field)) + enrichment.validate + + expect(enrichment.errors[:base].first).to match(/Validation failed:/) + end + end + + describe ".enrichment_schemer" do + it "memoizes the schemer instance" do + schemer1 = described_class.enrichment_schemer + schemer2 = described_class.enrichment_schemer + + expect(schemer1).to be(schemer2) + end + end + + describe "#to_enrichment_hash" do + it "includes the expected keys and uses camelCase for original/enriched values" do + enrichment = described_class.new(valid_attributes) + hash = enrichment.send(:to_enrichment_hash) + + expect(hash).to include( + "doi" => valid_attributes[:doi], + "contributors" => valid_attributes[:contributors], + "resources" => valid_attributes[:resources], + "field" => valid_attributes[:field], + "action" => valid_attributes[:action], + "originalValue" => valid_attributes[:original_value], + "enrichedValue" => valid_attributes[:enriched_value] + ) + end + + it "compacts nil values (omits keys when underlying attributes are nil)" do + enrichment = described_class.new(valid_attributes.merge(enriched_value: nil)) + hash = enrichment.send(:to_enrichment_hash) + + expect(hash).not_to have_key("enrichedValue") + end + end + end +end From 2668140e0db615f0c78c4e2023df65fd824577aa Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 29 Jan 2026 13:38:13 +0200 Subject: [PATCH 05/35] Fix specs --- spec/models/enrichment_spec.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index 07ae4e881..6be874c57 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -110,7 +110,7 @@ enrichment = described_class.new(valid_attributes) hash = enrichment.send(:to_enrichment_hash) - expect(hash).to include( + expected = { "doi" => valid_attributes[:doi], "contributors" => valid_attributes[:contributors], "resources" => valid_attributes[:resources], @@ -118,7 +118,9 @@ "action" => valid_attributes[:action], "originalValue" => valid_attributes[:original_value], "enrichedValue" => valid_attributes[:enriched_value] - ) + }.deep_stringify_keys + + expect(hash).to include(expected) end it "compacts nil values (omits keys when underlying attributes are nil)" do From 9f51eafe487be3729fef64636e4f6848a8f59884 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 13:22:08 +0200 Subject: [PATCH 06/35] Add controller logic and tests --- app/controllers/enrichments_controller.rb | 60 ++++ app/models/enrichment.rb | 21 ++ spec/factories/enrichment.rb | 7 + spec/models/enrichment_spec.rb | 329 ++++++++++++++++++---- 4 files changed, 359 insertions(+), 58 deletions(-) create mode 100644 app/controllers/enrichments_controller.rb create mode 100644 spec/factories/enrichment.rb diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb new file mode 100644 index 000000000..4477c247d --- /dev/null +++ b/app/controllers/enrichments_controller.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +class EnrichmentsController < ApplicationController + PAGE_SIZE = 10 + + def index + doi = params["doi"] + client_id = params["client_id"] + cursor = params["cursor"] + + if doi.blank? && client_id.blank? + return render(json: { message: "Missing doi or client-id query string parameter" }, status: :bad_request) + end + + enrichments = doi.present? ? Enrichment.by_doi(doi) : Enrichment.by_client(client_id) + + if cursor.present? + decoded_cursor = decode_cursor(cursor) + cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) + cursor_id = decoded_cursor.fetch("id").to_i + + enrichments = enrichments.by_cursor(cursor_updated_at, cursor_id) + end + + puts(enrichments.order_by_cursor.limit(PAGE_SIZE).to_sql) + enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a + + current_link = request.original_url + + next_cursor = if enrichments.any? + last = enrichments.last + encode_cursor(updated_at: last.updated_at.iso8601(6), id: last.id) + end + + next_link = doi.present? ? + "#{request.original_url.split("?").first}?doi=#{doi}&cursor=#{next_cursor}" : + "#{request.original_url.split("?").first}?client-id=#{client_id}&cursor=#{next_cursor}" + + render(json: { + data: enrichments, + links: { + self: current_link, + next: enrichments.length == PAGE_SIZE ? next_link : nil + } + }) + end + + private + def encode_cursor(hash) + Base64.urlsafe_encode64(hash.to_json, padding: false) + rescue + raise ActionController::BadRequest + end + + def decode_cursor(token) + JSON.parse(Base64.urlsafe_decode64(token)) + rescue + raise ActionController::BadRequest + end +end diff --git a/app/models/enrichment.rb b/app/models/enrichment.rb index f2d949f64..32596fc56 100644 --- a/app/models/enrichment.rb +++ b/app/models/enrichment.rb @@ -1,6 +1,27 @@ class Enrichment < ApplicationRecord validate :validate_json_schema + belongs_to :doi_record, + class_name: "Doi", + foreign_key: :doi, # enrichments.doi + primary_key: :doi, # dois.doi + optional: false + + has_one :client, through: :doi_record + + scope :by_doi, ->(doi) { where(doi: doi) } + + scope :by_client, ->(client_id) { joins(doi_record: :client).where(datacentre: { symbol: client_id }) } + + scope :by_cursor, ->(updated_at, id) { + where("(enrichments.updated_at < ?) OR (enrichments.updated_at = ? AND enrichments.id < ?)", + updated_at, + updated_at, + id) + } + + scope :order_by_cursor, -> { order(updated_at: :desc, id: :desc) } + private def validate_json_schema doc = to_enrichment_hash diff --git a/spec/factories/enrichment.rb b/spec/factories/enrichment.rb new file mode 100644 index 000000000..60f3a93dd --- /dev/null +++ b/spec/factories/enrichment.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :enrichment do + doi { create(:doi, doi: "10.0000/fake.test.doi", agency: "datacite") } + end +end diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index 6be874c57..832c96ad6 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -3,68 +3,184 @@ require "rails_helper" RSpec.describe Enrichment, type: :model do - describe "json schema validation" do - let(:valid_attributes) do - { - doi: "10.1234/example", - contributors: [ + def valid_enrichment_attrs(doi:) + { + doi: doi, + contributors: [ + { + name: "COMET", + nameType: "Organizational", + contributorType: "ResearchGroup", + affiliation: [], + nameIdentifiers: [] + } + ], + resources: [ + { + relatedIdentifier: "10.1234/example_dataset", + relationType: "IsDerivedFrom", + relatedIdentifierType: "DOI", + resourceTypeGeneral: "Dataset" + } + ], + field: "creators", + action: "updateChild", + original_value: { + name: "Chadly, Duncan", + nameType: "Personal", + givenName: "Duncan", + familyName: "Chadly", + affiliation: [ { - name: "COMET", - nameType: "Organizational", - contributorType: "ResearchGroup", - affiliation: [], - nameIdentifiers: [] + name: "California Institute of Technology", + affiliationIdentifier: "https://ror.org/05dxps055", + affiliationIdentifierScheme: "ROR" } ], - resources: [ + nameIdentifiers: [ + { + nameIdentifier: "https://orcid.org/0000-0002-8417-1522", + nameIdentifierScheme: "ORCID" + } + ] + }, + enriched_value: { + name: "Chadly, Duncan", + nameType: "Personal", + givenName: "Duncan", + familyName: "Chadly", + affiliation: [ { - relatedIdentifier: "10.1234/example_dataset", - relationType: "IsDerivedFrom", - relatedIdentifierType: "DOI", - resourceTypeGeneral: "Dataset" + name: "California Institute of Technology", + affiliationIdentifier: "https://ror.org/05dxps055", + affiliationIdentifierScheme: "ROR" } ], - field: "creators", - action: "updateChild", - original_value: { - name: "Chadly, Duncan", - nameType: "Personal", - givenName: "Duncan", - familyName: "Chadly", - affiliation: [ - { - name: "California Institute of Technology", - affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR" - } - ], - nameIdentifiers: [ - { - nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID" - } - ] - }, - enriched_value: { - name: "Chadly, Duncan", - nameType: "Personal", - givenName: "Duncan", - familyName: "Chadly", - affiliation: [ - { - name: "California Institute of Technology", - affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR" - } - ], - nameIdentifiers: [ - { - nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID" - } - ] - } + nameIdentifiers: [ + { + nameIdentifier: "https://orcid.org/0000-0002-8417-1522", + nameIdentifierScheme: "ORCID" + } + ] } + } + end + + describe "associations" do + it { is_expected.to belong_to(:doi_record).class_name("Doi") } + it { is_expected.to have_one(:client).through(:doi_record) } + end + + describe "scopes" do + before do + allow_any_instance_of(described_class).to receive(:validate_json_schema) + end + + def create_enrichment!(doi:, updated_at: nil) + attrs = valid_enrichment_attrs(doi: doi) + attrs = attrs.merge(updated_at: updated_at) if updated_at + described_class.create!(attrs) + end + + describe ".by_doi" do + it "returns enrichments matching the doi" do + doi_a = create(:doi) + doi_b = create(:doi) + + e1 = create_enrichment!(doi: doi_a.doi) + _e2 = create_enrichment!(doi: doi_b.doi) + + expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) + end + end + + describe ".by_client" do + it "returns enrichments for DOIs belonging to the given client symbol (Client model backed by datacentre table)" do + client_a = create(:client, symbol: "DATACITE.TEST") + client_b = create(:client, symbol: "OTHER.TEST") + + doi_a = create(:doi, client: client_a) + doi_b = create(:doi, client: client_b) + + e1 = create_enrichment!(doi: doi_a.doi) + _e2 = create_enrichment!(doi: doi_b.doi) + + expect(described_class.by_client("DATACITE.TEST")).to contain_exactly(e1) + end + end + + describe ".order_by_cursor" do + it "orders by updated_at desc then id desc" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) + + a = create_enrichment!(doi: doi.doi, updated_at: t) + b = create_enrichment!(doi: doi.doi, updated_at: t) + c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + + ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a + + expect(ordered.first).to eq(c) + expect(ordered[1].updated_at).to eq(t) + expect(ordered[2].updated_at).to eq(t) + expect(ordered[1].id).to be > ordered[2].id + end + end + + describe ".by_cursor" do + it "filters to records before the cursor (updated_at desc, id desc tie-break)" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) + + older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) + newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) + + same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) + same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) + + small, big = [same_time_1, same_time_2].sort_by(&:id) + + results = described_class.by_cursor(t, big.id) + + expect(results).to include(older, small) + expect(results).not_to include(newer, big) + end + end + end + + describe "json schema validation" do + let(:valid_attributes) { valid_enrichment_attrs(doi: "10.1234/example") } + + context "happy paths (action-dependent requirements)" do + it "is valid for action=update with original_value and enriched_value" do + attrs = valid_attributes.merge(action: "update") + enrichment = described_class.new(attrs) + expect(enrichment).to be_valid + end + + it "is valid for action=updateChild with original_value and enriched_value" do + attrs = valid_attributes.merge(action: "updateChild") + enrichment = described_class.new(attrs) + expect(enrichment).to be_valid + end + + it "is valid for action=insert when enriched_value is present (original_value may be omitted)" do + attrs = valid_attributes.deep_dup + attrs[:action] = "insert" + attrs.delete(:original_value) + + enrichment = described_class.new(attrs) + expect(enrichment).to be_valid + end + + it "is valid for action=deleteChild when original_value is present (enriched_value may be omitted)" do + attrs = valid_attributes.deep_dup + attrs[:action] = "deleteChild" + attrs.delete(:enriched_value) + + enrichment = described_class.new(attrs) + expect(enrichment).to be_valid + end end context "when schema validation passes" do @@ -81,18 +197,115 @@ end context "when schema validation fails" do - it "adds a base error containing 'Validation failed'" do + it "fails when a required top-level attribute is missing (field)" do enrichment = described_class.new(valid_attributes.except(:field)) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when a required top-level attribute is missing (doi)" do + enrichment = described_class.new(valid_attributes.except(:doi)) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when contributors is empty (minItems 1)" do + enrichment = described_class.new(valid_attributes.merge(contributors: [])) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when resources is empty (minItems 1)" do + enrichment = described_class.new(valid_attributes.merge(resources: [])) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when contributors contains an unexpected extra property (additionalProperties false)" do + bad = valid_attributes.deep_dup + bad[:contributors][0][:unexpected] = "nope" + + enrichment = described_class.new(bad) + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when resources contains an unexpected extra property (additionalProperties false)" do + bad = valid_attributes.deep_dup + bad[:resources][0][:unexpected] = "nope" + + enrichment = described_class.new(bad) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when contributor is missing required keys (name and contributorType)" do + bad = valid_attributes.deep_dup + bad[:contributors] = [{ name: "Only name" }] # missing contributorType + + enrichment = described_class.new(bad) + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when resource is missing required keys (relatedIdentifier, relationType, relatedIdentifierType)" do + bad = valid_attributes.deep_dup + bad[:resources] = [{ relatedIdentifier: "10.1234/x" }] # missing relationType + relatedIdentifierType + + enrichment = described_class.new(bad) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when action=updateChild but original_value is missing (schema requires originalValue)" do + enrichment = described_class.new(valid_attributes.except(:original_value)) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when action=updateChild but enriched_value is missing (schema requires enrichedValue)" do + enrichment = described_class.new(valid_attributes.except(:enriched_value)) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when action=insert but enriched_value is missing (schema requires enrichedValue)" do + attrs = valid_attributes.deep_dup + attrs[:action] = "insert" + attrs.delete(:enriched_value) + + enrichment = described_class.new(attrs) + + expect(enrichment).not_to be_valid + expect(enrichment.errors[:base].join(" ")).to include("Validation failed") + end + + it "fails when action=deleteChild but original_value is missing (schema requires originalValue)" do + attrs = valid_attributes.deep_dup + attrs[:action] = "deleteChild" + attrs.delete(:original_value) + + enrichment = described_class.new(attrs) + + expect(enrichment).not_to be_valid expect(enrichment.errors[:base].join(" ")).to include("Validation failed") end - it "includes the raw schema error output (message or inspect) in the base error" do + it "adds a base error that starts with 'Validation failed:'" do enrichment = described_class.new(valid_attributes.except(:field)) enrichment.validate - expect(enrichment.errors[:base].first).to match(/Validation failed:/) + expect(enrichment.errors[:base].first).to match(/\AValidation failed:/) end end From da20a0c34f5278587e77984ae23d692d60e0ee52 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 13:28:07 +0200 Subject: [PATCH 07/35] Add controller specs --- spec/requests/enrichments_spec.rb | 170 ++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 spec/requests/enrichments_spec.rb diff --git a/spec/requests/enrichments_spec.rb b/spec/requests/enrichments_spec.rb new file mode 100644 index 000000000..dfcb7fa7c --- /dev/null +++ b/spec/requests/enrichments_spec.rb @@ -0,0 +1,170 @@ +# frozen_string_literal: true + +require "rails_helper" + +RSpec.describe "Enrichments", type: :request do + before do + allow_any_instance_of(Enrichment).to receive(:validate_json_schema) + end + + def create_enrichment!(doi:, updated_at:) + Enrichment.create!( + doi: doi, + contributors: [{ name: "X", contributorType: "ResearchGroup" }], + resources: [{ relatedIdentifier: "10.1234/x", relationType: "IsDerivedFrom", relatedIdentifierType: "DOI" }], + field: "creators", + action: "updateChild", + original_value: { name: "Old" }, + enriched_value: { name: "New" }, + updated_at: updated_at + ) + end + + def json + JSON.parse(response.body) + end + + describe "GET /enrichments" do + it "returns 400 when both doi and client_id are missing" do + get "/enrichments" + + expect(response).to have_http_status(:bad_request) + expect(json).to eq("message" => "Missing doi or client-id query string parameter") + end + + it "filters by doi when doi param is provided" do + doi_a = create(:doi) + doi_b = create(:doi) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + e1 = create_enrichment!(doi: doi_a.doi, updated_at: t) + _e2 = create_enrichment!(doi: doi_b.doi, updated_at: t) + + get "/enrichments", params: { doi: doi_a.doi } + + expect(response).to have_http_status(:ok) + expect(json["data"].map { |h| h["id"] }).to eq([e1.id]) + end + + it "filters by client_id when client_id param is provided" do + client_a = create(:client, symbol: "DATACITE.TEST") + client_b = create(:client, symbol: "OTHER.TEST") + + doi_a = create(:doi, client: client_a) + doi_b = create(:doi, client: client_b) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + e1 = create_enrichment!(doi: doi_a.doi, updated_at: t) + _e2 = create_enrichment!(doi: doi_b.doi, updated_at: t) + + get "/enrichments", params: { client_id: "DATACITE.TEST" } + + expect(response).to have_http_status(:ok) + expect(json["data"].map { |h| h["id"] }).to eq([e1.id]) + end + + it "orders results by updated_at desc then id desc" do + doi = create(:doi) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + a = create_enrichment!(doi: doi.doi, updated_at: t) + b = create_enrichment!(doi: doi.doi, updated_at: t) + c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + + get "/enrichments", params: { doi: doi.doi } + + ids = json["data"].map { |h| h["id"] } + expected_same_time = [a.id, b.id].sort.reverse + expect(ids).to eq([c.id] + expected_same_time) + end + + it "includes links.self and links.next is nil when fewer than PAGE_SIZE results" do + doi = create(:doi) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + 3.times { |i| create_enrichment!(doi: doi.doi, updated_at: t + i.seconds) } + + get "/enrichments", params: { doi: doi.doi } + + expect(response).to have_http_status(:ok) + expect(json.dig("links", "self")).to be_present + expect(json.dig("links", "next")).to be_nil + end + + it "sets links.next when exactly PAGE_SIZE results are returned" do + stub_const("EnrichmentsController::PAGE_SIZE", 2) + + doi = create(:doi) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + create_enrichment!(doi: doi.doi, updated_at: t) + create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + + get "/enrichments", params: { doi: doi.doi } + + expect(response).to have_http_status(:ok) + expect(json.dig("links", "next")).to be_present + expect(json.dig("links", "next")).to include("cursor=") + expect(json.dig("links", "next")).to include("doi=#{doi.doi}") + end + + it "paginates with cursor so the next page excludes the cursor record and newer ones" do + stub_const("EnrichmentsController::PAGE_SIZE", 2) + + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) + + # Newest -> Oldest in page 1 + newest = create_enrichment!(doi: doi.doi, updated_at: t + 3.seconds) + mid = create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) + older = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + + # Page 1 + get "/enrichments", params: { doi: doi.doi } + + expect(response).to have_http_status(:ok) + ids1 = json["data"].map { |h| h["id"] } + expect(ids1).to eq([newest.id, mid.id]) + + next_link = json.dig("links", "next") + expect(next_link).to be_present + + cursor = CGI.parse(URI.parse(next_link).query).fetch("cursor").first + + # Page 2 using cursor + get "/enrichments", params: { doi: doi.doi, cursor: cursor } + + expect(response).to have_http_status(:ok) + ids2 = json["data"].map { |h| h["id"] } + expect(ids2).to eq([older.id]) + end + + it "returns 400 for an invalid cursor" do + doi = create(:doi) + create_enrichment!(doi: doi.doi, updated_at: Time.utc(2026, 1, 29, 10, 0, 0)) + + get "/enrichments", params: { doi: doi.doi, cursor: "not-a-valid-base64" } + + expect(response).to have_http_status(:bad_request) + end + + it "builds next link with client-id param name (hyphen) when paginating by client_id" do + stub_const("EnrichmentsController::PAGE_SIZE", 2) + + client = create(:client, symbol: "DATACITE.TEST") + doi = create(:doi, client: client) + + t = Time.utc(2026, 1, 29, 10, 0, 0) + create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) + create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + + get "/enrichments", params: { client_id: "DATACITE.TEST" } + + expect(response).to have_http_status(:ok) + next_link = json.dig("links", "next") + expect(next_link).to be_present + expect(next_link).to include("client-id=DATACITE.TEST") + expect(next_link).to include("cursor=") + end + end +end From 95d7f81708bdbae04aad2d6a4b437c01b92cdf1a Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 13:38:02 +0200 Subject: [PATCH 08/35] Fix specs --- spec/models/enrichment_spec.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index 832c96ad6..c80c0446d 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -149,7 +149,9 @@ def create_enrichment!(doi:, updated_at: nil) end describe "json schema validation" do - let(:valid_attributes) { valid_enrichment_attrs(doi: "10.1234/example") } + # Create the required associated DOI record so "Doi record must exist" passes. + let!(:doi_record) { create(:doi) } + let(:valid_attributes) { valid_enrichment_attrs(doi: doi_record.doi) } context "happy paths (action-dependent requirements)" do it "is valid for action=update with original_value and enriched_value" do From 3e0adcb733f6abeb15c29af2b705cf5cab0b132b Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 13:46:24 +0200 Subject: [PATCH 09/35] Fix specs --- spec/requests/enrichments_spec.rb | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/spec/requests/enrichments_spec.rb b/spec/requests/enrichments_spec.rb index dfcb7fa7c..71c4431a6 100644 --- a/spec/requests/enrichments_spec.rb +++ b/spec/requests/enrichments_spec.rb @@ -47,8 +47,9 @@ def json end it "filters by client_id when client_id param is provided" do - client_a = create(:client, symbol: "DATACITE.TEST") - client_b = create(:client, symbol: "OTHER.TEST") + provider = create(:provider) + client_a = create(:client, provider: provider, symbol: "#{provider.symbol}.DATACITE.TEST") + client_b = create(:client, provider: provider, symbol: "#{provider.symbol}.OTHER.TEST") doi_a = create(:doi, client: client_a) doi_b = create(:doi, client: client_b) @@ -57,7 +58,7 @@ def json e1 = create_enrichment!(doi: doi_a.doi, updated_at: t) _e2 = create_enrichment!(doi: doi_b.doi, updated_at: t) - get "/enrichments", params: { client_id: "DATACITE.TEST" } + get "/enrichments", params: { client_id: client_a.symbol } expect(response).to have_http_status(:ok) expect(json["data"].map { |h| h["id"] }).to eq([e1.id]) @@ -114,12 +115,10 @@ def json doi = create(:doi) t = Time.utc(2026, 1, 29, 10, 0, 0) - # Newest -> Oldest in page 1 newest = create_enrichment!(doi: doi.doi, updated_at: t + 3.seconds) mid = create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) older = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - # Page 1 get "/enrichments", params: { doi: doi.doi } expect(response).to have_http_status(:ok) @@ -131,7 +130,6 @@ def json cursor = CGI.parse(URI.parse(next_link).query).fetch("cursor").first - # Page 2 using cursor get "/enrichments", params: { doi: doi.doi, cursor: cursor } expect(response).to have_http_status(:ok) @@ -151,19 +149,20 @@ def json it "builds next link with client-id param name (hyphen) when paginating by client_id" do stub_const("EnrichmentsController::PAGE_SIZE", 2) - client = create(:client, symbol: "DATACITE.TEST") + provider = create(:provider) + client = create(:client, provider: provider, symbol: "#{provider.symbol}.DATACITE.TEST") doi = create(:doi, client: client) t = Time.utc(2026, 1, 29, 10, 0, 0) create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - get "/enrichments", params: { client_id: "DATACITE.TEST" } + get "/enrichments", params: { client_id: client.symbol } expect(response).to have_http_status(:ok) next_link = json.dig("links", "next") expect(next_link).to be_present - expect(next_link).to include("client-id=DATACITE.TEST") + expect(next_link).to include("client-id=#{CGI.escape(client.symbol)}") expect(next_link).to include("cursor=") end end From bcd494bdd2b1cb0436c3b8032b65809d3a52ecbe Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 13:56:42 +0200 Subject: [PATCH 10/35] Fix specs --- spec/requests/enrichments_spec.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spec/requests/enrichments_spec.rb b/spec/requests/enrichments_spec.rb index 71c4431a6..5a71297bf 100644 --- a/spec/requests/enrichments_spec.rb +++ b/spec/requests/enrichments_spec.rb @@ -119,17 +119,21 @@ def json mid = create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) older = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + # Page 1 get "/enrichments", params: { doi: doi.doi } expect(response).to have_http_status(:ok) ids1 = json["data"].map { |h| h["id"] } expect(ids1).to eq([newest.id, mid.id]) - next_link = json.dig("links", "next") - expect(next_link).to be_present - - cursor = CGI.parse(URI.parse(next_link).query).fetch("cursor").first + # Build a cursor that represents "mid" (the last record of page 1) + cursor_payload = { + updated_at: mid.updated_at.iso8601(6), + id: mid.id + } + cursor = Base64.urlsafe_encode64(cursor_payload.to_json, padding: false) + # Page 2 using cursor get "/enrichments", params: { doi: doi.doi, cursor: cursor } expect(response).to have_http_status(:ok) From 5488ba666bc9b82ad8c6965c93844534cb6111bb Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 14:03:03 +0200 Subject: [PATCH 11/35] Fix specs --- spec/models/enrichment_spec.rb | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index c80c0446d..4d7107bb6 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -95,18 +95,19 @@ def create_enrichment!(doi:, updated_at: nil) end describe ".by_client" do - it "returns enrichments for DOIs belonging to the given client symbol (Client model backed by datacentre table)" do - client_a = create(:client, symbol: "DATACITE.TEST") - client_b = create(:client, symbol: "OTHER.TEST") + it "returns enrichments for DOIs belonging to the given client symbol" do + provider = create(:provider) - doi_a = create(:doi, client: client_a) - doi_b = create(:doi, client: client_b) + client_a = create(:client, provider: provider) + client_b = create(:client, provider: provider) - e1 = create_enrichment!(doi: doi_a.doi) - _e2 = create_enrichment!(doi: doi_b.doi) + doi_a = create(:doi, client: client_a) + doi_b = create(:doi, client: client_b) - expect(described_class.by_client("DATACITE.TEST")).to contain_exactly(e1) - end + e1 = create_enrichment!(doi: doi_a.doi) + e2 = create_enrichment!(doi: doi_b.doi) + + expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) end describe ".order_by_cursor" do From ac65de24bf5bc7f5bfdc7aff90f0ce5567cd9cbd Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 14:16:21 +0200 Subject: [PATCH 12/35] Fix specs --- spec/models/enrichment_spec.rb | 107 +++++++++++++++++---------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index 4d7107bb6..f14618dfd 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -72,82 +72,82 @@ def valid_enrichment_attrs(doi:) end describe "scopes" do - before do - allow_any_instance_of(described_class).to receive(:validate_json_schema) - end + before do + allow_any_instance_of(described_class).to receive(:validate_json_schema) + end - def create_enrichment!(doi:, updated_at: nil) - attrs = valid_enrichment_attrs(doi: doi) - attrs = attrs.merge(updated_at: updated_at) if updated_at - described_class.create!(attrs) - end + def create_enrichment!(doi:, updated_at: nil) + attrs = valid_enrichment_attrs(doi: doi) + attrs = attrs.merge(updated_at: updated_at) if updated_at + described_class.create!(attrs) + end - describe ".by_doi" do - it "returns enrichments matching the doi" do - doi_a = create(:doi) - doi_b = create(:doi) + describe ".by_doi" do + it "returns enrichments matching the doi" do + doi_a = create(:doi) + doi_b = create(:doi) - e1 = create_enrichment!(doi: doi_a.doi) - _e2 = create_enrichment!(doi: doi_b.doi) + e1 = create_enrichment!(doi: doi_a.doi) + _e2 = create_enrichment!(doi: doi_b.doi) - expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) - end + expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) end + end - describe ".by_client" do - it "returns enrichments for DOIs belonging to the given client symbol" do - provider = create(:provider) + describe ".by_client" do + it "returns enrichments for DOIs belonging to the given client symbol" do + provider = create(:provider) - client_a = create(:client, provider: provider) - client_b = create(:client, provider: provider) + client_a = create(:client, provider: provider) + client_b = create(:client, provider: provider) - doi_a = create(:doi, client: client_a) - doi_b = create(:doi, client: client_b) + doi_a = create(:doi, client: client_a) + doi_b = create(:doi, client: client_b) - e1 = create_enrichment!(doi: doi_a.doi) - e2 = create_enrichment!(doi: doi_b.doi) + e1 = create_enrichment!(doi: doi_a.doi) + create_enrichment!(doi: doi_b.doi) - expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) - end + expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) +end - describe ".order_by_cursor" do - it "orders by updated_at desc then id desc" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) + describe ".order_by_cursor" do + it "orders by updated_at desc then id desc" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) - a = create_enrichment!(doi: doi.doi, updated_at: t) - b = create_enrichment!(doi: doi.doi, updated_at: t) - c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + a = create_enrichment!(doi: doi.doi, updated_at: t) + b = create_enrichment!(doi: doi.doi, updated_at: t) + c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a + ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a - expect(ordered.first).to eq(c) - expect(ordered[1].updated_at).to eq(t) - expect(ordered[2].updated_at).to eq(t) - expect(ordered[1].id).to be > ordered[2].id - end + expect(ordered.first).to eq(c) + expect(ordered[1].updated_at).to eq(t) + expect(ordered[2].updated_at).to eq(t) + expect(ordered[1].id).to be > ordered[2].id end + end - describe ".by_cursor" do - it "filters to records before the cursor (updated_at desc, id desc tie-break)" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) + describe ".by_cursor" do + it "filters to records before the cursor (updated_at desc, id desc tie-break)" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) - older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) - newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) + older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) + newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) - same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) - same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) + same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) + same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) - small, big = [same_time_1, same_time_2].sort_by(&:id) + small, big = [same_time_1, same_time_2].sort_by(&:id) - results = described_class.by_cursor(t, big.id) + results = described_class.by_cursor(t, big.id) - expect(results).to include(older, small) - expect(results).not_to include(newer, big) - end + expect(results).to include(older, small) + expect(results).not_to include(newer, big) end end +end describe "json schema validation" do # Create the required associated DOI record so "Doi record must exist" passes. @@ -348,3 +348,4 @@ def create_enrichment!(doi:, updated_at: nil) end end end +end From 86a97d79a6d4b6cb3164b3758cf1e9d8b55a4ba4 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 30 Jan 2026 14:33:21 +0200 Subject: [PATCH 13/35] Fix specs --- spec/models/enrichment_spec.rb | 145 ++++++++++++++++----------------- 1 file changed, 72 insertions(+), 73 deletions(-) diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb index f14618dfd..9683205d3 100644 --- a/spec/models/enrichment_spec.rb +++ b/spec/models/enrichment_spec.rb @@ -12,16 +12,16 @@ def valid_enrichment_attrs(doi:) nameType: "Organizational", contributorType: "ResearchGroup", affiliation: [], - nameIdentifiers: [] - } + nameIdentifiers: [], + }, ], resources: [ { relatedIdentifier: "10.1234/example_dataset", relationType: "IsDerivedFrom", relatedIdentifierType: "DOI", - resourceTypeGeneral: "Dataset" - } + resourceTypeGeneral: "Dataset", + }, ], field: "creators", action: "updateChild", @@ -34,15 +34,15 @@ def valid_enrichment_attrs(doi:) { name: "California Institute of Technology", affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR" - } + affiliationIdentifierScheme: "ROR", + }, ], nameIdentifiers: [ { nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID" - } - ] + nameIdentifierScheme: "ORCID", + }, + ], }, enriched_value: { name: "Chadly, Duncan", @@ -53,16 +53,16 @@ def valid_enrichment_attrs(doi:) { name: "California Institute of Technology", affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR" - } + affiliationIdentifierScheme: "ROR", + }, ], nameIdentifiers: [ { nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID" - } - ] - } + nameIdentifierScheme: "ORCID", + }, + ], + }, } end @@ -72,82 +72,83 @@ def valid_enrichment_attrs(doi:) end describe "scopes" do - before do - allow_any_instance_of(described_class).to receive(:validate_json_schema) - end + before do + allow_any_instance_of(described_class).to receive(:validate_json_schema) + end - def create_enrichment!(doi:, updated_at: nil) - attrs = valid_enrichment_attrs(doi: doi) - attrs = attrs.merge(updated_at: updated_at) if updated_at - described_class.create!(attrs) - end + def create_enrichment!(doi:, updated_at: nil) + attrs = valid_enrichment_attrs(doi: doi) + attrs = attrs.merge(updated_at: updated_at) if updated_at + described_class.create!(attrs) + end - describe ".by_doi" do - it "returns enrichments matching the doi" do - doi_a = create(:doi) - doi_b = create(:doi) + describe ".by_doi" do + it "returns enrichments matching the doi" do + doi_a = create(:doi) + doi_b = create(:doi) - e1 = create_enrichment!(doi: doi_a.doi) - _e2 = create_enrichment!(doi: doi_b.doi) + e1 = create_enrichment!(doi: doi_a.doi) + _e2 = create_enrichment!(doi: doi_b.doi) - expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) + expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) + end end - end - describe ".by_client" do - it "returns enrichments for DOIs belonging to the given client symbol" do - provider = create(:provider) + describe ".by_client" do + it "returns enrichments for DOIs belonging to the given client symbol" do + provider = create(:provider) - client_a = create(:client, provider: provider) - client_b = create(:client, provider: provider) + client_a = create(:client, provider: provider) + client_b = create(:client, provider: provider) - doi_a = create(:doi, client: client_a) - doi_b = create(:doi, client: client_b) + doi_a = create(:doi, client: client_a) + doi_b = create(:doi, client: client_b) - e1 = create_enrichment!(doi: doi_a.doi) - create_enrichment!(doi: doi_b.doi) + e1 = create_enrichment!(doi: doi_a.doi) + create_enrichment!(doi: doi_b.doi) - expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) -end + expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) + end + end - describe ".order_by_cursor" do - it "orders by updated_at desc then id desc" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) + describe ".order_by_cursor" do + it "orders by updated_at desc then id desc" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) - a = create_enrichment!(doi: doi.doi, updated_at: t) - b = create_enrichment!(doi: doi.doi, updated_at: t) - c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) + a = create_enrichment!(doi: doi.doi, updated_at: t) + b = create_enrichment!(doi: doi.doi, updated_at: t) + c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a + ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a - expect(ordered.first).to eq(c) - expect(ordered[1].updated_at).to eq(t) - expect(ordered[2].updated_at).to eq(t) - expect(ordered[1].id).to be > ordered[2].id + expect(ordered.first).to eq(c) + expect(ordered[1].updated_at).to eq(t) + expect(ordered[2].updated_at).to eq(t) + expect(ordered[1].id).to be > ordered[2].id + end end - end - describe ".by_cursor" do - it "filters to records before the cursor (updated_at desc, id desc tie-break)" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) + describe ".by_cursor" do + it "filters to records before the cursor (updated_at desc, id desc tie-break)" do + doi = create(:doi) + t = Time.utc(2026, 1, 29, 10, 0, 0) - older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) - newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) + older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) + newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) - same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) - same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) + same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) + same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) - small, big = [same_time_1, same_time_2].sort_by(&:id) + small, big = [same_time_1, same_time_2].sort_by(&:id) - results = described_class.by_cursor(t, big.id) + results = described_class.by_cursor(t, big.id) - expect(results).to include(older, small) - expect(results).not_to include(newer, big) + expect(results).to include(older, small) + expect(results).not_to include(newer, big) + end end end -end describe "json schema validation" do # Create the required associated DOI record so "Doi record must exist" passes. @@ -260,10 +261,9 @@ def create_enrichment!(doi:, updated_at: nil) it "fails when resource is missing required keys (relatedIdentifier, relationType, relatedIdentifierType)" do bad = valid_attributes.deep_dup - bad[:resources] = [{ relatedIdentifier: "10.1234/x" }] # missing relationType + relatedIdentifierType - - enrichment = described_class.new(bad) + bad[:resources] = [{ relatedIdentifier: "10.1234/x" }] + enrichment = described_class.new(bad.deep_stringify_keys) expect(enrichment).not_to be_valid expect(enrichment.errors[:base].join(" ")).to include("Validation failed") end @@ -333,7 +333,7 @@ def create_enrichment!(doi:, updated_at: nil) "field" => valid_attributes[:field], "action" => valid_attributes[:action], "originalValue" => valid_attributes[:original_value], - "enrichedValue" => valid_attributes[:enriched_value] + "enrichedValue" => valid_attributes[:enriched_value], }.deep_stringify_keys expect(hash).to include(expected) @@ -348,4 +348,3 @@ def create_enrichment!(doi:, updated_at: nil) end end end -end From b9de835e195d0c45fc5a049b334ebd6f9a7319ea Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 2 Feb 2026 11:06:08 +0200 Subject: [PATCH 14/35] Remove controller specs --- spec/requests/enrichments_spec.rb | 173 ------------------------------ 1 file changed, 173 deletions(-) delete mode 100644 spec/requests/enrichments_spec.rb diff --git a/spec/requests/enrichments_spec.rb b/spec/requests/enrichments_spec.rb deleted file mode 100644 index 5a71297bf..000000000 --- a/spec/requests/enrichments_spec.rb +++ /dev/null @@ -1,173 +0,0 @@ -# frozen_string_literal: true - -require "rails_helper" - -RSpec.describe "Enrichments", type: :request do - before do - allow_any_instance_of(Enrichment).to receive(:validate_json_schema) - end - - def create_enrichment!(doi:, updated_at:) - Enrichment.create!( - doi: doi, - contributors: [{ name: "X", contributorType: "ResearchGroup" }], - resources: [{ relatedIdentifier: "10.1234/x", relationType: "IsDerivedFrom", relatedIdentifierType: "DOI" }], - field: "creators", - action: "updateChild", - original_value: { name: "Old" }, - enriched_value: { name: "New" }, - updated_at: updated_at - ) - end - - def json - JSON.parse(response.body) - end - - describe "GET /enrichments" do - it "returns 400 when both doi and client_id are missing" do - get "/enrichments" - - expect(response).to have_http_status(:bad_request) - expect(json).to eq("message" => "Missing doi or client-id query string parameter") - end - - it "filters by doi when doi param is provided" do - doi_a = create(:doi) - doi_b = create(:doi) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - e1 = create_enrichment!(doi: doi_a.doi, updated_at: t) - _e2 = create_enrichment!(doi: doi_b.doi, updated_at: t) - - get "/enrichments", params: { doi: doi_a.doi } - - expect(response).to have_http_status(:ok) - expect(json["data"].map { |h| h["id"] }).to eq([e1.id]) - end - - it "filters by client_id when client_id param is provided" do - provider = create(:provider) - client_a = create(:client, provider: provider, symbol: "#{provider.symbol}.DATACITE.TEST") - client_b = create(:client, provider: provider, symbol: "#{provider.symbol}.OTHER.TEST") - - doi_a = create(:doi, client: client_a) - doi_b = create(:doi, client: client_b) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - e1 = create_enrichment!(doi: doi_a.doi, updated_at: t) - _e2 = create_enrichment!(doi: doi_b.doi, updated_at: t) - - get "/enrichments", params: { client_id: client_a.symbol } - - expect(response).to have_http_status(:ok) - expect(json["data"].map { |h| h["id"] }).to eq([e1.id]) - end - - it "orders results by updated_at desc then id desc" do - doi = create(:doi) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - a = create_enrichment!(doi: doi.doi, updated_at: t) - b = create_enrichment!(doi: doi.doi, updated_at: t) - c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - - get "/enrichments", params: { doi: doi.doi } - - ids = json["data"].map { |h| h["id"] } - expected_same_time = [a.id, b.id].sort.reverse - expect(ids).to eq([c.id] + expected_same_time) - end - - it "includes links.self and links.next is nil when fewer than PAGE_SIZE results" do - doi = create(:doi) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - 3.times { |i| create_enrichment!(doi: doi.doi, updated_at: t + i.seconds) } - - get "/enrichments", params: { doi: doi.doi } - - expect(response).to have_http_status(:ok) - expect(json.dig("links", "self")).to be_present - expect(json.dig("links", "next")).to be_nil - end - - it "sets links.next when exactly PAGE_SIZE results are returned" do - stub_const("EnrichmentsController::PAGE_SIZE", 2) - - doi = create(:doi) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - create_enrichment!(doi: doi.doi, updated_at: t) - create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - - get "/enrichments", params: { doi: doi.doi } - - expect(response).to have_http_status(:ok) - expect(json.dig("links", "next")).to be_present - expect(json.dig("links", "next")).to include("cursor=") - expect(json.dig("links", "next")).to include("doi=#{doi.doi}") - end - - it "paginates with cursor so the next page excludes the cursor record and newer ones" do - stub_const("EnrichmentsController::PAGE_SIZE", 2) - - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) - - newest = create_enrichment!(doi: doi.doi, updated_at: t + 3.seconds) - mid = create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) - older = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - - # Page 1 - get "/enrichments", params: { doi: doi.doi } - - expect(response).to have_http_status(:ok) - ids1 = json["data"].map { |h| h["id"] } - expect(ids1).to eq([newest.id, mid.id]) - - # Build a cursor that represents "mid" (the last record of page 1) - cursor_payload = { - updated_at: mid.updated_at.iso8601(6), - id: mid.id - } - cursor = Base64.urlsafe_encode64(cursor_payload.to_json, padding: false) - - # Page 2 using cursor - get "/enrichments", params: { doi: doi.doi, cursor: cursor } - - expect(response).to have_http_status(:ok) - ids2 = json["data"].map { |h| h["id"] } - expect(ids2).to eq([older.id]) - end - - it "returns 400 for an invalid cursor" do - doi = create(:doi) - create_enrichment!(doi: doi.doi, updated_at: Time.utc(2026, 1, 29, 10, 0, 0)) - - get "/enrichments", params: { doi: doi.doi, cursor: "not-a-valid-base64" } - - expect(response).to have_http_status(:bad_request) - end - - it "builds next link with client-id param name (hyphen) when paginating by client_id" do - stub_const("EnrichmentsController::PAGE_SIZE", 2) - - provider = create(:provider) - client = create(:client, provider: provider, symbol: "#{provider.symbol}.DATACITE.TEST") - doi = create(:doi, client: client) - - t = Time.utc(2026, 1, 29, 10, 0, 0) - create_enrichment!(doi: doi.doi, updated_at: t + 2.seconds) - create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - - get "/enrichments", params: { client_id: client.symbol } - - expect(response).to have_http_status(:ok) - next_link = json.dig("links", "next") - expect(next_link).to be_present - expect(next_link).to include("client-id=#{CGI.escape(client.symbol)}") - expect(next_link).to include("cursor=") - end - end -end From 4fe26ded612b57a38f770b8bc350a8ab4a6aa314 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 2 Feb 2026 15:07:46 +0200 Subject: [PATCH 15/35] Remove specs --- spec/models/enrichment_spec.rb | 350 --------------------------------- 1 file changed, 350 deletions(-) delete mode 100644 spec/models/enrichment_spec.rb diff --git a/spec/models/enrichment_spec.rb b/spec/models/enrichment_spec.rb deleted file mode 100644 index 9683205d3..000000000 --- a/spec/models/enrichment_spec.rb +++ /dev/null @@ -1,350 +0,0 @@ -# frozen_string_literal: true - -require "rails_helper" - -RSpec.describe Enrichment, type: :model do - def valid_enrichment_attrs(doi:) - { - doi: doi, - contributors: [ - { - name: "COMET", - nameType: "Organizational", - contributorType: "ResearchGroup", - affiliation: [], - nameIdentifiers: [], - }, - ], - resources: [ - { - relatedIdentifier: "10.1234/example_dataset", - relationType: "IsDerivedFrom", - relatedIdentifierType: "DOI", - resourceTypeGeneral: "Dataset", - }, - ], - field: "creators", - action: "updateChild", - original_value: { - name: "Chadly, Duncan", - nameType: "Personal", - givenName: "Duncan", - familyName: "Chadly", - affiliation: [ - { - name: "California Institute of Technology", - affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR", - }, - ], - nameIdentifiers: [ - { - nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID", - }, - ], - }, - enriched_value: { - name: "Chadly, Duncan", - nameType: "Personal", - givenName: "Duncan", - familyName: "Chadly", - affiliation: [ - { - name: "California Institute of Technology", - affiliationIdentifier: "https://ror.org/05dxps055", - affiliationIdentifierScheme: "ROR", - }, - ], - nameIdentifiers: [ - { - nameIdentifier: "https://orcid.org/0000-0002-8417-1522", - nameIdentifierScheme: "ORCID", - }, - ], - }, - } - end - - describe "associations" do - it { is_expected.to belong_to(:doi_record).class_name("Doi") } - it { is_expected.to have_one(:client).through(:doi_record) } - end - - describe "scopes" do - before do - allow_any_instance_of(described_class).to receive(:validate_json_schema) - end - - def create_enrichment!(doi:, updated_at: nil) - attrs = valid_enrichment_attrs(doi: doi) - attrs = attrs.merge(updated_at: updated_at) if updated_at - described_class.create!(attrs) - end - - describe ".by_doi" do - it "returns enrichments matching the doi" do - doi_a = create(:doi) - doi_b = create(:doi) - - e1 = create_enrichment!(doi: doi_a.doi) - _e2 = create_enrichment!(doi: doi_b.doi) - - expect(described_class.by_doi(doi_a.doi)).to contain_exactly(e1) - end - end - - describe ".by_client" do - it "returns enrichments for DOIs belonging to the given client symbol" do - provider = create(:provider) - - client_a = create(:client, provider: provider) - client_b = create(:client, provider: provider) - - doi_a = create(:doi, client: client_a) - doi_b = create(:doi, client: client_b) - - e1 = create_enrichment!(doi: doi_a.doi) - create_enrichment!(doi: doi_b.doi) - - expect(described_class.by_client(client_a.symbol)).to contain_exactly(e1) - end - end - - describe ".order_by_cursor" do - it "orders by updated_at desc then id desc" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) - - a = create_enrichment!(doi: doi.doi, updated_at: t) - b = create_enrichment!(doi: doi.doi, updated_at: t) - c = create_enrichment!(doi: doi.doi, updated_at: t + 1.second) - - ordered = described_class.where(id: [a.id, b.id, c.id]).order_by_cursor.to_a - - expect(ordered.first).to eq(c) - expect(ordered[1].updated_at).to eq(t) - expect(ordered[2].updated_at).to eq(t) - expect(ordered[1].id).to be > ordered[2].id - end - end - - describe ".by_cursor" do - it "filters to records before the cursor (updated_at desc, id desc tie-break)" do - doi = create(:doi) - t = Time.utc(2026, 1, 29, 10, 0, 0) - - older = create_enrichment!(doi: doi.doi, updated_at: t - 10.seconds) - newer = create_enrichment!(doi: doi.doi, updated_at: t + 10.seconds) - - same_time_1 = create_enrichment!(doi: doi.doi, updated_at: t) - same_time_2 = create_enrichment!(doi: doi.doi, updated_at: t) - - small, big = [same_time_1, same_time_2].sort_by(&:id) - - results = described_class.by_cursor(t, big.id) - - expect(results).to include(older, small) - expect(results).not_to include(newer, big) - end - end - end - - describe "json schema validation" do - # Create the required associated DOI record so "Doi record must exist" passes. - let!(:doi_record) { create(:doi) } - let(:valid_attributes) { valid_enrichment_attrs(doi: doi_record.doi) } - - context "happy paths (action-dependent requirements)" do - it "is valid for action=update with original_value and enriched_value" do - attrs = valid_attributes.merge(action: "update") - enrichment = described_class.new(attrs) - expect(enrichment).to be_valid - end - - it "is valid for action=updateChild with original_value and enriched_value" do - attrs = valid_attributes.merge(action: "updateChild") - enrichment = described_class.new(attrs) - expect(enrichment).to be_valid - end - - it "is valid for action=insert when enriched_value is present (original_value may be omitted)" do - attrs = valid_attributes.deep_dup - attrs[:action] = "insert" - attrs.delete(:original_value) - - enrichment = described_class.new(attrs) - expect(enrichment).to be_valid - end - - it "is valid for action=deleteChild when original_value is present (enriched_value may be omitted)" do - attrs = valid_attributes.deep_dup - attrs[:action] = "deleteChild" - attrs.delete(:enriched_value) - - enrichment = described_class.new(attrs) - expect(enrichment).to be_valid - end - end - - context "when schema validation passes" do - it "is valid" do - enrichment = described_class.new(valid_attributes) - expect(enrichment).to be_valid - end - - it "does not add base errors" do - enrichment = described_class.new(valid_attributes) - enrichment.validate - expect(enrichment.errors[:base]).to be_empty - end - end - - context "when schema validation fails" do - it "fails when a required top-level attribute is missing (field)" do - enrichment = described_class.new(valid_attributes.except(:field)) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when a required top-level attribute is missing (doi)" do - enrichment = described_class.new(valid_attributes.except(:doi)) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when contributors is empty (minItems 1)" do - enrichment = described_class.new(valid_attributes.merge(contributors: [])) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when resources is empty (minItems 1)" do - enrichment = described_class.new(valid_attributes.merge(resources: [])) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when contributors contains an unexpected extra property (additionalProperties false)" do - bad = valid_attributes.deep_dup - bad[:contributors][0][:unexpected] = "nope" - - enrichment = described_class.new(bad) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when resources contains an unexpected extra property (additionalProperties false)" do - bad = valid_attributes.deep_dup - bad[:resources][0][:unexpected] = "nope" - - enrichment = described_class.new(bad) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when contributor is missing required keys (name and contributorType)" do - bad = valid_attributes.deep_dup - bad[:contributors] = [{ name: "Only name" }] # missing contributorType - - enrichment = described_class.new(bad) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when resource is missing required keys (relatedIdentifier, relationType, relatedIdentifierType)" do - bad = valid_attributes.deep_dup - bad[:resources] = [{ relatedIdentifier: "10.1234/x" }] - - enrichment = described_class.new(bad.deep_stringify_keys) - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when action=updateChild but original_value is missing (schema requires originalValue)" do - enrichment = described_class.new(valid_attributes.except(:original_value)) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when action=updateChild but enriched_value is missing (schema requires enrichedValue)" do - enrichment = described_class.new(valid_attributes.except(:enriched_value)) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when action=insert but enriched_value is missing (schema requires enrichedValue)" do - attrs = valid_attributes.deep_dup - attrs[:action] = "insert" - attrs.delete(:enriched_value) - - enrichment = described_class.new(attrs) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "fails when action=deleteChild but original_value is missing (schema requires originalValue)" do - attrs = valid_attributes.deep_dup - attrs[:action] = "deleteChild" - attrs.delete(:original_value) - - enrichment = described_class.new(attrs) - - expect(enrichment).not_to be_valid - expect(enrichment.errors[:base].join(" ")).to include("Validation failed") - end - - it "adds a base error that starts with 'Validation failed:'" do - enrichment = described_class.new(valid_attributes.except(:field)) - enrichment.validate - - expect(enrichment.errors[:base].first).to match(/\AValidation failed:/) - end - end - - describe ".enrichment_schemer" do - it "memoizes the schemer instance" do - schemer1 = described_class.enrichment_schemer - schemer2 = described_class.enrichment_schemer - - expect(schemer1).to be(schemer2) - end - end - - describe "#to_enrichment_hash" do - it "includes the expected keys and uses camelCase for original/enriched values" do - enrichment = described_class.new(valid_attributes) - hash = enrichment.send(:to_enrichment_hash) - - expected = { - "doi" => valid_attributes[:doi], - "contributors" => valid_attributes[:contributors], - "resources" => valid_attributes[:resources], - "field" => valid_attributes[:field], - "action" => valid_attributes[:action], - "originalValue" => valid_attributes[:original_value], - "enrichedValue" => valid_attributes[:enriched_value], - }.deep_stringify_keys - - expect(hash).to include(expected) - end - - it "compacts nil values (omits keys when underlying attributes are nil)" do - enrichment = described_class.new(valid_attributes.merge(enriched_value: nil)) - hash = enrichment.send(:to_enrichment_hash) - - expect(hash).not_to have_key("enrichedValue") - end - end - end -end From 97f2279e2f8e07192a353bde0f5321e43e8e14cd Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 2 Feb 2026 16:07:54 +0200 Subject: [PATCH 16/35] Build initial rake task --- config/application.rb | 1 + lib/tasks/enrichment.rake | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 lib/tasks/enrichment.rake diff --git a/config/application.rb b/config/application.rb index b537f6269..253a9caf2 100644 --- a/config/application.rb +++ b/config/application.rb @@ -69,6 +69,7 @@ ENV["METADATA_STORAGE_BUCKET_NAME"] ||= "" ENV["MONTHLY_DATAFILE_BUCKET"] ||= "monthly-datafile.stage.datacite.org" ENV["MONTHLY_DATAFILE_ACCESS_ROLE"] ||= "" +ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] ||= "stage-enrichments-ingestion-files" module Lupo class Application < Rails::Application diff --git a/lib/tasks/enrichment.rake b/lib/tasks/enrichment.rake new file mode 100644 index 000000000..94ebbf64e --- /dev/null +++ b/lib/tasks/enrichment.rake @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +namespace :enrichment do + desc "Ingest Enrichment File from S3" + task ingest_file: :environment do + bucket = ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] + key = ENV["KEY"] + + if bucket.blank? + puts("bucket environment variable is not set") + exit + end + + if key.blank? + puts("bucket environment variable is not set") + exit + end + + s3 = AWS::S3::Client.new + + buffer = +"" + + s3.get_object(bucket: bucket, key: key) do |chunk| + buffer << chunk + + # Consume complete lines from buffer + while (newline_index = buffer.index("\n")) + # Read line + line = buffer.slice!(0..newline_index) + + # Remove the newline character at the end of line + line = line.strip + + # Exit the loop, if this line is empty + next if line.empty? + + puts(line) + + # # Parse the enrichment record to json + # enrichment = JSON.parse(line) + + # # Queue up the job that processes the record + # enrichment_process_job.perform_later(enrichment) + end + end + end +end From 7515e5e1f77859b5ada8c3aea3e1928d889b92ff Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 5 Feb 2026 13:20:46 +0200 Subject: [PATCH 17/35] Implement rake and jobs --- app/jobs/enrichment_batch_process_job.rb | 38 +++++++++++ config/application.rb | 2 +- config/shoryuken.yml | 4 ++ docker-compose.localstack.yml | 5 ++ lib/tasks/enrichment.rake | 86 +++++++++++++++++------- 5 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 app/jobs/enrichment_batch_process_job.rb diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb new file mode 100644 index 000000000..56aa0d6ed --- /dev/null +++ b/app/jobs/enrichment_batch_process_job.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +class EnrichmentBatchProcessJob < ApplicationJob + queue_as :enrichment_batch_process_job + + def perform(lines) + log_prefix = "EnrichmentBathProcessJob" + Rails.logger.info("#{log_prefix}: Job started") + + Parallel.each(lines, in_threads: 10) do |line| + parsed_line = JSON.parse(line) + + doi = Doi.find_by(doi: parsed_line["doi"], agency: "datacite") + + if doi.blank? + Rails.logger.error( + "#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") + else + enrichment = Enrichment.new( + doi: parsed_line["doi"], + contributors: parsed_line["contributors"], + resources: parsed_line["resources"], + field: parsed_line["field"], + action: parsed_line["action"], + original_value: parsed_line["originalValue"], + enriched_value: parsed_line["enrichedValue"] + ) + + unless enrichment.save + Rails.logger.warn( + "#{log_prefix}: Enrichment failed to save: #{enrichment.errors.full_messages.join(";")}") + end + end + + Rails.logger.info("#{log_prefix}: Job completed") + end + end +end diff --git a/config/application.rb b/config/application.rb index 253a9caf2..5c790a924 100644 --- a/config/application.rb +++ b/config/application.rb @@ -69,7 +69,7 @@ ENV["METADATA_STORAGE_BUCKET_NAME"] ||= "" ENV["MONTHLY_DATAFILE_BUCKET"] ||= "monthly-datafile.stage.datacite.org" ENV["MONTHLY_DATAFILE_ACCESS_ROLE"] ||= "" -ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] ||= "stage-enrichments-ingestion-files" +ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] ||= "" module Lupo class Application < Rails::Application diff --git a/config/shoryuken.yml b/config/shoryuken.yml index 201cad06c..a5276ee01 100644 --- a/config/shoryuken.yml +++ b/config/shoryuken.yml @@ -24,3 +24,7 @@ groups: queues: - events_other_doi_job - events_other_doi_by_id_job + enrichments_group: + concurrency: 2 + queues: + - enrichment_batch_process_job diff --git a/docker-compose.localstack.yml b/docker-compose.localstack.yml index 0dc4aa53f..da930eb29 100644 --- a/docker-compose.localstack.yml +++ b/docker-compose.localstack.yml @@ -9,6 +9,11 @@ services: - ELASTIC_PASSWORD=AnUnsecurePassword123 - PASSENGER_MAX_POOL_SIZE=10 - PASSENGER_MIN_INSTANCES=1 + - ENRICHMENTS_INGESTION_FILES_BUCKET_NAME=enrichments-ingestion-files + - AWS_REGION=us-east-1 + - AWS_ACCESS_KEY_ID=test + - AWS_SECRET_ACCESS_KEY=test + - AWS_ENDPOINT_URL=http://localstack:4566 image: ghcr.io/datacite/lupo:main build: . ports: diff --git a/lib/tasks/enrichment.rake b/lib/tasks/enrichment.rake index 94ebbf64e..230cda471 100644 --- a/lib/tasks/enrichment.rake +++ b/lib/tasks/enrichment.rake @@ -1,47 +1,83 @@ # frozen_string_literal: true namespace :enrichment do - desc "Ingest Enrichment File from S3" - task ingest_file: :environment do + desc "Process JSONL from S3 and enqueue batches sized by bytes (256KB message size limit)" + # "Example command: bundle exec rake enrichment:batch_process_file KEY=02022026_test_ingestion_file.jsonl + task batch_process_file: :environment do bucket = ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] - key = ENV["KEY"] + key = ENV["KEY"] - if bucket.blank? - puts("bucket environment variable is not set") - exit - end + abort("ENRICHMENTS_INGESTION_FILES_BUCKET_NAME is not set") if bucket.blank? + abort("KEY is not set") if key.blank? - if key.blank? - puts("bucket environment variable is not set") - exit - end + # SQS limit is 256KB. + max_batch_bytes = Integer(ENV.fetch("MAX_BATCH_BYTES", "200000")) + + puts("Begin ingestion for s3://#{bucket}/#{key} (max_batch_bytes=#{max_batch_bytes})") + + s3 = Aws::S3::Client.new(force_path_style: true) - s3 = AWS::S3::Client.new + buffer = +"" + line_no = 0 - buffer = +"" + batch_lines = [] + batch_bytes = 0 + + flush = lambda do + return if batch_lines.empty? + + EnrichmentBatchProcessJob.perform_later(batch_lines) + + batch_lines.clear + batch_bytes = 0 + end s3.get_object(bucket: bucket, key: key) do |chunk| buffer << chunk - # Consume complete lines from buffer - while (newline_index = buffer.index("\n")) - # Read line - line = buffer.slice!(0..newline_index) + while (idx = buffer.index("\n")) + raw = buffer.slice!(0..idx).delete_suffix("\n") + line_no += 1 - # Remove the newline character at the end of line - line = line.strip + line = raw.strip - # Exit the loop, if this line is empty next if line.empty? - puts(line) + # +1 for the newline we removed, and some slack for JSON array encoding. + line_bytes = line.bytesize + 1 - # # Parse the enrichment record to json - # enrichment = JSON.parse(line) + # If a single line is too big to ever fit in one message we need to process differently. + if line_bytes > max_batch_bytes + raise "Single JSONL line at #{line_no} is #{line_bytes} bytes, exceeds MAX_BATCH_BYTES=#{max_batch_bytes}. " + end - # # Queue up the job that processes the record - # enrichment_process_job.perform_later(enrichment) + # If adding this line would exceed the cap, flush current batch first. + if (batch_bytes + line_bytes) > max_batch_bytes + flush.call + end + + batch_lines << line + batch_bytes += line_bytes + end + end + + # File might not end with newline + tail = buffer.strip + + unless tail.empty? + line_no += 1 + line_bytes = tail.bytesize + 1 + + if line_bytes > max_batch_bytes + raise "Single JSONL tail line at #{line_no} is #{line_bytes} bytes, exceeds MAX_BATCH_BYTES=#{max_batch_bytes}." end + + flush.call if (batch_bytes + line_bytes) > max_batch_bytes + batch_lines << tail + batch_bytes += line_bytes end + + flush.call + puts("Finished ingestion for s3://#{bucket}/#{key} (lines_seen=#{line_no})") end end From 0035b0e1bfd3f21e2ab847f87c05b05706794f65 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 12 Feb 2026 11:37:24 +0200 Subject: [PATCH 18/35] Finish up the batch process job --- app/jobs/enrichment_batch_process_job.rb | 78 ++++++++++++++++++------ 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index 56aa0d6ed..f1cf4e686 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -1,38 +1,78 @@ # frozen_string_literal: true class EnrichmentBatchProcessJob < ApplicationJob + include ErrorSerializable + queue_as :enrichment_batch_process_job def perform(lines) log_prefix = "EnrichmentBathProcessJob" - Rails.logger.info("#{log_prefix}: Job started") + # We will process the lines in parallel to speed up ingestion. Parallel.each(lines, in_threads: 10) do |line| parsed_line = JSON.parse(line) + # We only create enrichments for DOIs that exist and which have an agency of 'datacite'. doi = Doi.find_by(doi: parsed_line["doi"], agency: "datacite") if doi.blank? - Rails.logger.error( - "#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") - else - enrichment = Enrichment.new( - doi: parsed_line["doi"], - contributors: parsed_line["contributors"], - resources: parsed_line["resources"], - field: parsed_line["field"], - action: parsed_line["action"], - original_value: parsed_line["originalValue"], - enriched_value: parsed_line["enrichedValue"] - ) - - unless enrichment.save - Rails.logger.warn( - "#{log_prefix}: Enrichment failed to save: #{enrichment.errors.full_messages.join(";")}") - end + Rails.logger.error("#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") + next + end + + # We set the only_validate flag on the DOI model to true such that we + # ensure that validation functions as expected when not persisting the record. + doi.only_validate = true + + enrichment = Enrichment.new( + doi: "#{parsed_line["doi"]}", + contributors: parsed_line["contributors"], + resources: parsed_line["resources"], + field: parsed_line["field"], + action: parsed_line["action"], + original_value: parsed_line["originalValue"], + enriched_value: parsed_line["enrichedValue"] + ) + + enrich_doi(enrichment, doi) + + unless doi.valid? + errors = serialize_errors(doi.errors, uid: enrichment.doi) + Rails.logger.error("#{log_prefix}: Enrichment does not generate valid metadata: #{errors}") + next end - Rails.logger.info("#{log_prefix}: Job completed") + unless enrichment.save + errors = enrichment.errors.full_messages.join(";") + Rails.logger.error("#{log_prefix}: Enrichment failed to save: #{errors}") + end + end + end + + def enrich_doi(enrichment, doi) + action = enrichment["action"] + case action + when "insert" + doi[enrichment["field"]] ||= [] + doi[enrichment["field"]] << enrichment["enriched_value"] + when "update" + doi[enrichment["field"]] = enrichment["enriched_value"] + when "update_child" + field = enrichment["field"] + doi[field].each_with_index do |item, index| + if item == enrichment["original_value"] + doi[field][index] = enrichment["enriched_value"] + end + end + when "delete_child" + field = enrichment["field"] + doi[field] ||= [] + doi[field].each_with_index do |item, index| + if item == enrichment["original_value"] + doi[field].delete_at(index) + break + end + end end end end From 8a3a4d200fc079351029063b74a7a674d17ef7b1 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Thu, 12 Feb 2026 13:46:22 +0200 Subject: [PATCH 19/35] Reduce batch size further --- app/controllers/enrichments_controller.rb | 1 - app/jobs/enrichment_batch_process_job.rb | 12 ++++++------ lib/tasks/enrichment.rake | 6 ++++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 4477c247d..35cacfe56 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -22,7 +22,6 @@ def index enrichments = enrichments.by_cursor(cursor_updated_at, cursor_id) end - puts(enrichments.order_by_cursor.limit(PAGE_SIZE).to_sql) enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a current_link = request.original_url diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index f1cf4e686..1595be9ba 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -6,7 +6,7 @@ class EnrichmentBatchProcessJob < ApplicationJob queue_as :enrichment_batch_process_job def perform(lines) - log_prefix = "EnrichmentBathProcessJob" + log_prefix = "EnrichmentBatchProcessJob" # We will process the lines in parallel to speed up ingestion. Parallel.each(lines, in_threads: 10) do |line| @@ -51,21 +51,21 @@ def perform(lines) def enrich_doi(enrichment, doi) action = enrichment["action"] + field = enrichment["field"].underscore + case action when "insert" - doi[enrichment["field"]] ||= [] - doi[enrichment["field"]] << enrichment["enriched_value"] + doi[field] ||= [] + doi[field] << enrichment["enriched_value"] when "update" - doi[enrichment["field"]] = enrichment["enriched_value"] + doi[field] = enrichment["enriched_value"] when "update_child" - field = enrichment["field"] doi[field].each_with_index do |item, index| if item == enrichment["original_value"] doi[field][index] = enrichment["enriched_value"] end end when "delete_child" - field = enrichment["field"] doi[field] ||= [] doi[field].each_with_index do |item, index| if item == enrichment["original_value"] diff --git a/lib/tasks/enrichment.rake b/lib/tasks/enrichment.rake index 230cda471..78aca7c4f 100644 --- a/lib/tasks/enrichment.rake +++ b/lib/tasks/enrichment.rake @@ -3,6 +3,7 @@ namespace :enrichment do desc "Process JSONL from S3 and enqueue batches sized by bytes (256KB message size limit)" # "Example command: bundle exec rake enrichment:batch_process_file KEY=02022026_test_ingestion_file.jsonl + # bundle exec rake enrichment:batch_process_file KEY=preprint_matching_enrichments_datacite_format_1000.jsonl task batch_process_file: :environment do bucket = ENV["ENRICHMENTS_INGESTION_FILES_BUCKET_NAME"] key = ENV["KEY"] @@ -10,8 +11,9 @@ namespace :enrichment do abort("ENRICHMENTS_INGESTION_FILES_BUCKET_NAME is not set") if bucket.blank? abort("KEY is not set") if key.blank? - # SQS limit is 256KB. - max_batch_bytes = Integer(ENV.fetch("MAX_BATCH_BYTES", "200000")) + # SQS limit is 256KB so we'll set the batch size to be more conservative to allow for some + # overhead and ensure we don't exceed limits. + max_batch_bytes = 150000 puts("Begin ingestion for s3://#{bucket}/#{key} (max_batch_bytes=#{max_batch_bytes})") From 61fd6600a247a7a71fabd8ad3dccfd47bdd8b725 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Fri, 13 Feb 2026 14:34:39 +0200 Subject: [PATCH 20/35] PR comment changes --- app/controllers/enrichments_controller.rb | 12 +++--- app/jobs/enrichment_batch_process_job.rb | 52 +++++++++++------------ app/models/doi.rb | 27 ++++++++++++ 3 files changed, 60 insertions(+), 31 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 35cacfe56..c3925a90e 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -1,19 +1,21 @@ # frozen_string_literal: true class EnrichmentsController < ApplicationController - PAGE_SIZE = 10 + PAGE_SIZE = 25 def index doi = params["doi"] client_id = params["client_id"] cursor = params["cursor"] - if doi.blank? && client_id.blank? - return render(json: { message: "Missing doi or client-id query string parameter" }, status: :bad_request) + enrichments = if doi.present? + Enrichment.by_doi(doi) + elsif client_id.present? + Enrichment.by_client(client_id) + else + Enrichment.all end - enrichments = doi.present? ? Enrichment.by_doi(doi) : Enrichment.by_client(client_id) - if cursor.present? decoded_cursor = decode_cursor(cursor) cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index 1595be9ba..a59b82c0f 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -34,7 +34,7 @@ def perform(lines) enriched_value: parsed_line["enrichedValue"] ) - enrich_doi(enrichment, doi) + doi.apply_enrichment(enrichment) unless doi.valid? errors = serialize_errors(doi.errors, uid: enrichment.doi) @@ -49,30 +49,30 @@ def perform(lines) end end - def enrich_doi(enrichment, doi) - action = enrichment["action"] - field = enrichment["field"].underscore + # def enrich_doi(enrichment, doi) + # action = enrichment["action"] + # field = enrichment["field"].underscore - case action - when "insert" - doi[field] ||= [] - doi[field] << enrichment["enriched_value"] - when "update" - doi[field] = enrichment["enriched_value"] - when "update_child" - doi[field].each_with_index do |item, index| - if item == enrichment["original_value"] - doi[field][index] = enrichment["enriched_value"] - end - end - when "delete_child" - doi[field] ||= [] - doi[field].each_with_index do |item, index| - if item == enrichment["original_value"] - doi[field].delete_at(index) - break - end - end - end - end + # case action + # when "insert" + # doi[field] ||= [] + # doi[field] << enrichment["enriched_value"] + # when "update" + # doi[field] = enrichment["enriched_value"] + # when "update_child" + # doi[field].each_with_index do |item, index| + # if item == enrichment["original_value"] + # doi[field][index] = enrichment["enriched_value"] + # end + # end + # when "delete_child" + # doi[field] ||= [] + # doi[field].each_with_index do |item, index| + # if item == enrichment["original_value"] + # doi[field].delete_at(index) + # break + # end + # end + # end + # end end diff --git a/app/models/doi.rb b/app/models/doi.rb index a08b6608d..fe965a2b1 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -2770,6 +2770,33 @@ def handle_resource_type(types) end end + def apply_enrichment(enrichment) + action = enrichment["action"] + field = enrichment["field"].underscore + + case action + when "insert" + self[field] ||= [] + self[field] << enrichment["enriched_value"] + when "update" + self[field] = enrichment["enriched_value"] + when "update_child" + self[field].each_with_index do |item, index| + if item == enrichment["original_value"] + self[field][index] = enrichment["enriched_value"] + end + end + when "delete_child" + self[field] ||= [] + self[field].each_with_index do |item, index| + if item == enrichment["original_value"] + self[field].delete_at(index) + break + end + end + end + end + private def update_publisher_from_hash symbolized_publisher_hash = publisher_before_type_cast.symbolize_keys From 266a2061b82dc734664463691afd476f42dae48b Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 10:50:05 +0200 Subject: [PATCH 21/35] Add Enrichable concern --- app/jobs/enrichment_batch_process_job.rb | 36 +++----------- app/models/concerns/enrichable.rb | 61 ++++++++++++++++++++++++ app/models/doi.rb | 29 +---------- lib/tasks/enrichment.rake | 2 +- 4 files changed, 71 insertions(+), 57 deletions(-) create mode 100644 app/models/concerns/enrichable.rb diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index a59b82c0f..f339c749e 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -5,8 +5,8 @@ class EnrichmentBatchProcessJob < ApplicationJob queue_as :enrichment_batch_process_job - def perform(lines) - log_prefix = "EnrichmentBatchProcessJob" + def perform(lines, file_name) + log_prefix = "EnrichmentBatchProcessJob (#{file_name})" # We will process the lines in parallel to speed up ingestion. Parallel.each(lines, in_threads: 10) do |line| @@ -20,6 +20,11 @@ def perform(lines) next end + if doi.enrichment_field(parsed_line["field"]).nil? + Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{parsed_line["doi"]}") + next + end + # We set the only_validate flag on the DOI model to true such that we # ensure that validation functions as expected when not persisting the record. doi.only_validate = true @@ -48,31 +53,4 @@ def perform(lines) end end end - - # def enrich_doi(enrichment, doi) - # action = enrichment["action"] - # field = enrichment["field"].underscore - - # case action - # when "insert" - # doi[field] ||= [] - # doi[field] << enrichment["enriched_value"] - # when "update" - # doi[field] = enrichment["enriched_value"] - # when "update_child" - # doi[field].each_with_index do |item, index| - # if item == enrichment["original_value"] - # doi[field][index] = enrichment["enriched_value"] - # end - # end - # when "delete_child" - # doi[field] ||= [] - # doi[field].each_with_index do |item, index| - # if item == enrichment["original_value"] - # doi[field].delete_at(index) - # break - # end - # end - # end - # end end diff --git a/app/models/concerns/enrichable.rb b/app/models/concerns/enrichable.rb new file mode 100644 index 000000000..ece3d33f5 --- /dev/null +++ b/app/models/concerns/enrichable.rb @@ -0,0 +1,61 @@ +# frozen_string_literal: true + +module Enrichable + extend ActiveSupport::Concern + + def apply_enrichment(enrichment) + action = enrichment["action"] + + field = enrichment_field(enrichment["field"]) + + raise ArgumentError, "Invalid enrichment field #{enrichment["field"]}" if field.nil? + + case action + when "insert" + self[field] ||= [] + self[field] << enrichment["enriched_value"] + when "update" + self[field] = enrichment["enriched_value"] + when "update_child" + self[field].each_with_index do |item, index| + if item == enrichment["original_value"] + self[field][index] = enrichment["enriched_value"] + end + end + when "delete_child" + self[field] ||= [] + self[field].each_with_index do |item, index| + if item == enrichment["original_value"] + self[field].delete_at(index) + break + end + end + end + end + + def enrichment_field(field) + field_mapping = { + "alternateIdentifiers" => "alternate_identifiers", + "creators" => "creators", + "titles" => "titles", + "publisher" => "publisher", + "publicationYear" => "publication_year", + "subjects" => "subjects", + "contributors" => "contributors", + "dates" => "dates", + "language" => "language", + "types" => "types", + "relatedIdentifiers" => "related_identifiers", + "relatedItems" => "related_items", + "sizes" => "sizes", + "formats" => "formats", + "version" => "version", + "rightsList" => "rights_list", + "descriptions" => "descriptions", + "geoLocations" => "geo_locations", + "fundingReferences" => "funding_references" + } + + field_mapping.fetch(field, nil) + end +end diff --git a/app/models/doi.rb b/app/models/doi.rb index fe965a2b1..d4452e337 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -35,6 +35,8 @@ class Doi < ApplicationRecord include Elasticsearch::Model + include Enrichable + aasm whiny_transitions: false do # draft is initial state for new DOIs. state :draft, initial: true @@ -2770,33 +2772,6 @@ def handle_resource_type(types) end end - def apply_enrichment(enrichment) - action = enrichment["action"] - field = enrichment["field"].underscore - - case action - when "insert" - self[field] ||= [] - self[field] << enrichment["enriched_value"] - when "update" - self[field] = enrichment["enriched_value"] - when "update_child" - self[field].each_with_index do |item, index| - if item == enrichment["original_value"] - self[field][index] = enrichment["enriched_value"] - end - end - when "delete_child" - self[field] ||= [] - self[field].each_with_index do |item, index| - if item == enrichment["original_value"] - self[field].delete_at(index) - break - end - end - end - end - private def update_publisher_from_hash symbolized_publisher_hash = publisher_before_type_cast.symbolize_keys diff --git a/lib/tasks/enrichment.rake b/lib/tasks/enrichment.rake index 78aca7c4f..4316d937a 100644 --- a/lib/tasks/enrichment.rake +++ b/lib/tasks/enrichment.rake @@ -28,7 +28,7 @@ namespace :enrichment do flush = lambda do return if batch_lines.empty? - EnrichmentBatchProcessJob.perform_later(batch_lines) + EnrichmentBatchProcessJob.perform_later(batch_lines, key) batch_lines.clear batch_bytes = 0 From 5a1804eb9a212ecc3a2bb80f5a4b56c69999c5b8 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 13:22:59 +0200 Subject: [PATCH 22/35] Add error handling when decoding paging cursor --- app/controllers/enrichments_controller.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index c3925a90e..533c1f0a5 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -17,9 +17,13 @@ def index end if cursor.present? - decoded_cursor = decode_cursor(cursor) - cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) - cursor_id = decoded_cursor.fetch("id").to_i + begin + decoded_cursor = decode_cursor(cursor) + cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) + cursor_id = decoded_cursor.fetch("id").to_i + rescue + raise ActionController::BadRequest, "Invalid cursor" + end enrichments = enrichments.by_cursor(cursor_updated_at, cursor_id) end @@ -55,7 +59,5 @@ def encode_cursor(hash) def decode_cursor(token) JSON.parse(Base64.urlsafe_decode64(token)) - rescue - raise ActionController::BadRequest end end From fe0c20cc679b62f43f42a5388e884a323441695d Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 14:33:29 +0200 Subject: [PATCH 23/35] Some refactoring --- app/controllers/enrichments_controller.rb | 62 ++++++++++------- app/jobs/enrichment_batch_process_job.rb | 81 +++++++++++++---------- 2 files changed, 84 insertions(+), 59 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 533c1f0a5..b443fde19 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -8,26 +8,8 @@ def index client_id = params["client_id"] cursor = params["cursor"] - enrichments = if doi.present? - Enrichment.by_doi(doi) - elsif client_id.present? - Enrichment.by_client(client_id) - else - Enrichment.all - end - - if cursor.present? - begin - decoded_cursor = decode_cursor(cursor) - cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) - cursor_id = decoded_cursor.fetch("id").to_i - rescue - raise ActionController::BadRequest, "Invalid cursor" - end - - enrichments = enrichments.by_cursor(cursor_updated_at, cursor_id) - end - + enrichments = base_page_enrichments(doi, client_id) + enrichments = cursor.present? ? filter_enrichments_with_cursor(enrichments, cursor) : enrichments enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a current_link = request.original_url @@ -37,9 +19,7 @@ def index encode_cursor(updated_at: last.updated_at.iso8601(6), id: last.id) end - next_link = doi.present? ? - "#{request.original_url.split("?").first}?doi=#{doi}&cursor=#{next_cursor}" : - "#{request.original_url.split("?").first}?client-id=#{client_id}&cursor=#{next_cursor}" + next_link = build_next_link(doi, client_id, next_cursor) render(json: { data: enrichments, @@ -51,6 +31,28 @@ def index end private + def base_page_enrichments(doi, client_id) + enrichments = if doi.present? + Enrichment.by_doi(doi) + elsif client_id.present? + Enrichment.by_client(client_id) + else + Enrichment.all + end + end + + def filter_enrichments_with_cursor(enrichments, cursor) + begin + decoded_cursor = decode_cursor(cursor) + cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) + cursor_id = decoded_cursor.fetch("id").to_i + rescue ArgumentError, KeyError + raise ActionController::BadRequest, "Invalid cursor" + end + + enrichments.by_cursor(cursor_updated_at, cursor_id) + end + def encode_cursor(hash) Base64.urlsafe_encode64(hash.to_json, padding: false) rescue @@ -60,4 +62,18 @@ def encode_cursor(hash) def decode_cursor(token) JSON.parse(Base64.urlsafe_decode64(token)) end + + def build_next_link(doi, client_id, next_cursor) + base_link = request.original_url.split("?").first + + query_string = if doi.present? + "doi=#{doi}&cursor=#{next_cursor}" + elsif client_id.present? + "client-id=#{client_id}&cursor=#{next_cursor}" + else + "cursor=#{next_cursor}" + end + + "#{base_link}?#{query_string}" + end end diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index f339c749e..1a2bc5f0c 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -9,47 +9,56 @@ def perform(lines, file_name) log_prefix = "EnrichmentBatchProcessJob (#{file_name})" # We will process the lines in parallel to speed up ingestion. - Parallel.each(lines, in_threads: 10) do |line| - parsed_line = JSON.parse(line) + Parallel.each(lines, in_threads: 5) do |line| + # with_connection ensures the connection is explicitly checked out and returned to the pool after + # each iteration, preventing connection pool exhaustion. + ActiveRecord::Base.connection_pool.with_connection do + begin + parsed_line = JSON.parse(line) + rescue JSON::ParserError => e + Rails.logger.error("#{log_prefix}: Failed to parse line: #{e.message}") + next + end - # We only create enrichments for DOIs that exist and which have an agency of 'datacite'. - doi = Doi.find_by(doi: parsed_line["doi"], agency: "datacite") + # We only create enrichments for DOIs that exist and which have an agency of 'datacite'. + doi = Doi.find_by(doi: parsed_line["doi"], agency: "datacite") - if doi.blank? - Rails.logger.error("#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") - next - end + if doi.blank? + Rails.logger.error("#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") + next + end - if doi.enrichment_field(parsed_line["field"]).nil? - Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{parsed_line["doi"]}") - next - end + if doi.enrichment_field(parsed_line["field"]).nil? + Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{parsed_line["doi"]}") + next + end - # We set the only_validate flag on the DOI model to true such that we - # ensure that validation functions as expected when not persisting the record. - doi.only_validate = true - - enrichment = Enrichment.new( - doi: "#{parsed_line["doi"]}", - contributors: parsed_line["contributors"], - resources: parsed_line["resources"], - field: parsed_line["field"], - action: parsed_line["action"], - original_value: parsed_line["originalValue"], - enriched_value: parsed_line["enrichedValue"] - ) - - doi.apply_enrichment(enrichment) - - unless doi.valid? - errors = serialize_errors(doi.errors, uid: enrichment.doi) - Rails.logger.error("#{log_prefix}: Enrichment does not generate valid metadata: #{errors}") - next - end + # We set the only_validate flag on the DOI model to true such that we + # ensure that validation functions as expected when not persisting the record. + doi.only_validate = true + + enrichment = Enrichment.new( + doi: "#{parsed_line["doi"]}", + contributors: parsed_line["contributors"], + resources: parsed_line["resources"], + field: parsed_line["field"], + action: parsed_line["action"], + original_value: parsed_line["originalValue"], + enriched_value: parsed_line["enrichedValue"] + ) + + doi.apply_enrichment(enrichment) + + unless doi.valid? + errors = serialize_errors(doi.errors, uid: enrichment.doi) + Rails.logger.error("#{log_prefix}: Enrichment does not generate valid metadata: #{errors}") + next + end - unless enrichment.save - errors = enrichment.errors.full_messages.join(";") - Rails.logger.error("#{log_prefix}: Enrichment failed to save: #{errors}") + unless enrichment.save + errors = enrichment.errors.full_messages.join(";") + Rails.logger.error("#{log_prefix}: Enrichment failed to save: #{errors}") + end end end end From 39033dcf0ff33807752f41085755d37225e23344 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 14:44:12 +0200 Subject: [PATCH 24/35] Fix rubocop complaint --- app/controllers/enrichments_controller.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index b443fde19..278e2c089 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -32,7 +32,7 @@ def index private def base_page_enrichments(doi, client_id) - enrichments = if doi.present? + if doi.present? Enrichment.by_doi(doi) elsif client_id.present? Enrichment.by_client(client_id) From 28c2ffe3cbc77a99e47ae19137ec30373d38a721 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 15:33:44 +0200 Subject: [PATCH 25/35] Refactoring --- app/controllers/enrichments_controller.rb | 4 ++-- app/jobs/enrichment_batch_process_job.rb | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 278e2c089..2951b0615 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -4,8 +4,8 @@ class EnrichmentsController < ApplicationController PAGE_SIZE = 25 def index - doi = params["doi"] - client_id = params["client_id"] + doi = params["doi"]&.upcase + client_id = params["client_id"]&.upcase cursor = params["cursor"] enrichments = base_page_enrichments(doi, client_id) diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index 1a2bc5f0c..757ba765b 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -21,15 +21,16 @@ def perform(lines, file_name) end # We only create enrichments for DOIs that exist and which have an agency of 'datacite'. - doi = Doi.find_by(doi: parsed_line["doi"], agency: "datacite") + uid = parsed_line["doi"]&.upcase + doi = Doi.find_by(doi: uid, agency: "datacite") if doi.blank? - Rails.logger.error("#{log_prefix}: Doi #{parsed_line["doi"]} does not exist") + Rails.logger.error("#{log_prefix}: Doi #{uid} does not exist") next end if doi.enrichment_field(parsed_line["field"]).nil? - Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{parsed_line["doi"]}") + Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{uid}") next end @@ -38,7 +39,7 @@ def perform(lines, file_name) doi.only_validate = true enrichment = Enrichment.new( - doi: "#{parsed_line["doi"]}", + doi: "#{uid}", contributors: parsed_line["contributors"], resources: parsed_line["resources"], field: parsed_line["field"], From 3d15c0c139444601c7aa91a85dca80af295d2e32 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 15:55:45 +0200 Subject: [PATCH 26/35] Refactor --- app/jobs/enrichment_batch_process_job.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index 757ba765b..e8cfc3d62 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -39,7 +39,7 @@ def perform(lines, file_name) doi.only_validate = true enrichment = Enrichment.new( - doi: "#{uid}", + doi: uid, contributors: parsed_line["contributors"], resources: parsed_line["resources"], field: parsed_line["field"], From 4bba1174a26059e67fed40b95d5c9b744fc81051 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 16:36:45 +0200 Subject: [PATCH 27/35] Make cursor decoding error handling more generic since any cursor decode error should result in an invalid cursor response --- app/controllers/enrichments_controller.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 2951b0615..b114bd9a5 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -46,7 +46,7 @@ def filter_enrichments_with_cursor(enrichments, cursor) decoded_cursor = decode_cursor(cursor) cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) cursor_id = decoded_cursor.fetch("id").to_i - rescue ArgumentError, KeyError + rescue raise ActionController::BadRequest, "Invalid cursor" end From d54f58191f0bb85292269572011ff2634952701b Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 16 Feb 2026 18:15:33 +0200 Subject: [PATCH 28/35] Add enrichments serializer --- app/controllers/enrichments_controller.rb | 4 ++-- app/serializers/enrichment_serializer.rb | 29 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 app/serializers/enrichment_serializer.rb diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index b114bd9a5..686d7b539 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -5,7 +5,7 @@ class EnrichmentsController < ApplicationController def index doi = params["doi"]&.upcase - client_id = params["client_id"]&.upcase + client_id = params["client_id"] cursor = params["cursor"] enrichments = base_page_enrichments(doi, client_id) @@ -22,7 +22,7 @@ def index next_link = build_next_link(doi, client_id, next_cursor) render(json: { - data: enrichments, + data: EnrichmentSerializer.new(enrichments, {}).serializable_hash, links: { self: current_link, next: enrichments.length == PAGE_SIZE ? next_link : nil diff --git a/app/serializers/enrichment_serializer.rb b/app/serializers/enrichment_serializer.rb new file mode 100644 index 000000000..566d74f88 --- /dev/null +++ b/app/serializers/enrichment_serializer.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +class EnrichmentSerializer + include JSONAPI::Serializer + + set_key_transform :camel_lower + set_type :enrichments + set_id :id + + attributes :id, + :doi, + :contributors, + :resources, + :field, + :action, + :original_value, + :enriched_value + + # Ensure the DOI value is always serialized in lowercase + attribute :doi do |object| + object.doi&.downcase + end + + # Map created_at to created + attribute :created, &:created_at + + # Map updated_at to updated + attribute :updated, &:updated_at +end From 08f49efc960a8ab6d9184543a39d5b5fee257ff7 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Tue, 17 Feb 2026 13:51:42 +0200 Subject: [PATCH 29/35] Refactor serializer --- app/serializers/enrichment_serializer.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/serializers/enrichment_serializer.rb b/app/serializers/enrichment_serializer.rb index 566d74f88..84a7d122e 100644 --- a/app/serializers/enrichment_serializer.rb +++ b/app/serializers/enrichment_serializer.rb @@ -7,8 +7,7 @@ class EnrichmentSerializer set_type :enrichments set_id :id - attributes :id, - :doi, + attributes :doi, :contributors, :resources, :field, From 7a4d60c87bec6a8078941680e8e1e463a6a9deca Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Tue, 17 Feb 2026 18:38:35 +0200 Subject: [PATCH 30/35] A few fixes --- app/controllers/enrichments_controller.rb | 7 ++++--- app/jobs/enrichment_batch_process_job.rb | 5 +++-- db/migrate/20260120083752_create_enrichments.rb | 1 + db/schema.rb | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 686d7b539..144fce4db 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -21,13 +21,14 @@ def index next_link = build_next_link(doi, client_id, next_cursor) - render(json: { - data: EnrichmentSerializer.new(enrichments, {}).serializable_hash, + options = { links: { self: current_link, next: enrichments.length == PAGE_SIZE ? next_link : nil } - }) + } + + render(json: EnrichmentSerializer.new(enrichments, options).serializable_hash, status: :ok) end private diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index e8cfc3d62..0b8457c37 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -5,8 +5,8 @@ class EnrichmentBatchProcessJob < ApplicationJob queue_as :enrichment_batch_process_job - def perform(lines, file_name) - log_prefix = "EnrichmentBatchProcessJob (#{file_name})" + def perform(lines, filename) + log_prefix = "EnrichmentBatchProcessJob (#{filename})" # We will process the lines in parallel to speed up ingestion. Parallel.each(lines, in_threads: 5) do |line| @@ -39,6 +39,7 @@ def perform(lines, file_name) doi.only_validate = true enrichment = Enrichment.new( + filename: filename, doi: uid, contributors: parsed_line["contributors"], resources: parsed_line["resources"], diff --git a/db/migrate/20260120083752_create_enrichments.rb b/db/migrate/20260120083752_create_enrichments.rb index d2b403163..d66fe779b 100644 --- a/db/migrate/20260120083752_create_enrichments.rb +++ b/db/migrate/20260120083752_create_enrichments.rb @@ -10,6 +10,7 @@ def change t.string :action, null: false t.json :original_value, null: true t.json :enriched_value, null: true + t.string :filename, null: true t.timestamps end diff --git a/db/schema.rb b/db/schema.rb index c5618a2f3..f67b504d5 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -249,6 +249,7 @@ t.json "enriched_value" t.datetime "created_at", null: false t.datetime "updated_at", null: false + t.string "filename" t.index ["doi", "updated_at", "id"], name: "index_enrichments_on_doi_and_updated_at_and_id", order: { updated_at: :desc, id: :desc } end From b10b0441a23348599c64e271ba17ec2e4e8aa506 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 18 Feb 2026 16:57:45 +0200 Subject: [PATCH 31/35] Fix links object and add meta object to the json payload --- app/controllers/enrichments_controller.rb | 75 +++++++++++++++-------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 144fce4db..4f01f7e28 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -6,26 +6,24 @@ class EnrichmentsController < ApplicationController def index doi = params["doi"]&.upcase client_id = params["client_id"] - cursor = params["cursor"] + cursor = params.dig("page", "cursor") - enrichments = base_page_enrichments(doi, client_id) - enrichments = cursor.present? ? filter_enrichments_with_cursor(enrichments, cursor) : enrichments - enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a - - current_link = request.original_url + base_enrichments = base_page_enrichments(doi, client_id) - next_cursor = if enrichments.any? - last = enrichments.last - encode_cursor(updated_at: last.updated_at.iso8601(6), id: last.id) + enrichments = if cursor.present? + cursor_updated_at, cursor_id, cursor_page = decode_cursor(cursor) + base_enrichments.by_cursor(cursor_updated_at, cursor_id) + else + base_enrichments end - next_link = build_next_link(doi, client_id, next_cursor) + enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a + + cursor_page ||= 1 options = { - links: { - self: current_link, - next: enrichments.length == PAGE_SIZE ? next_link : nil - } + meta: build_meta(base_enrichments, cursor_page), + links: build_paging_links(enrichments, doi, client_id, cursor_page) } render(json: EnrichmentSerializer.new(enrichments, options).serializable_hash, status: :ok) @@ -42,26 +40,35 @@ def base_page_enrichments(doi, client_id) end end - def filter_enrichments_with_cursor(enrichments, cursor) + def encode_cursor(hash) + Base64.urlsafe_encode64(hash.to_json, padding: false) + rescue + raise ActionController::InternalServerError, "Failed to encode cursor" + end + + def decode_cursor(token) begin - decoded_cursor = decode_cursor(cursor) + decoded_cursor = JSON.parse(Base64.urlsafe_decode64(token)) cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) cursor_id = decoded_cursor.fetch("id").to_i + cursor_page = decoded_cursor.fetch("page", nil).to_i || 0 + + Rails.logger.info("cursor_page: #{cursor_page}") + + [cursor_updated_at, cursor_id, cursor_page] rescue raise ActionController::BadRequest, "Invalid cursor" end - - enrichments.by_cursor(cursor_updated_at, cursor_id) end - def encode_cursor(hash) - Base64.urlsafe_encode64(hash.to_json, padding: false) - rescue - raise ActionController::BadRequest - end + def build_meta(enrichments, cursor_page) + enrichments_total = enrichments.count - def decode_cursor(token) - JSON.parse(Base64.urlsafe_decode64(token)) + { + total: enrichments_total, + totalPages: (enrichments_total / PAGE_SIZE.to_f).ceil, + page: cursor_page + } end def build_next_link(doi, client_id, next_cursor) @@ -72,9 +79,25 @@ def build_next_link(doi, client_id, next_cursor) elsif client_id.present? "client-id=#{client_id}&cursor=#{next_cursor}" else - "cursor=#{next_cursor}" + "page[cursor]=#{next_cursor}" end "#{base_link}?#{query_string}" end + + def build_paging_links(enrichments, doi, client_id, cursor_page) + current_link = request.original_url + + next_cursor = if enrichments.any? + last = enrichments.last + encode_cursor(updated_at: last.updated_at.iso8601(6), id: last.id, page: cursor_page + 1) + end + + next_link = build_next_link(doi, client_id, next_cursor) + + { + self: current_link, + next: enrichments.length == PAGE_SIZE ? next_link : nil + } + end end From dfa3547d4872a62f30b67919d64975144cfa6946 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 18 Feb 2026 17:06:40 +0200 Subject: [PATCH 32/35] Rubocop fixes --- app/controllers/enrichments_controller.rb | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/app/controllers/enrichments_controller.rb b/app/controllers/enrichments_controller.rb index 4f01f7e28..4deb6f740 100644 --- a/app/controllers/enrichments_controller.rb +++ b/app/controllers/enrichments_controller.rb @@ -47,18 +47,16 @@ def encode_cursor(hash) end def decode_cursor(token) - begin - decoded_cursor = JSON.parse(Base64.urlsafe_decode64(token)) - cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) - cursor_id = decoded_cursor.fetch("id").to_i - cursor_page = decoded_cursor.fetch("page", nil).to_i || 0 + decoded_cursor = JSON.parse(Base64.urlsafe_decode64(token)) + cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at")) + cursor_id = decoded_cursor.fetch("id").to_i + cursor_page = decoded_cursor.fetch("page", nil).to_i || 0 - Rails.logger.info("cursor_page: #{cursor_page}") + Rails.logger.info("cursor_page: #{cursor_page}") - [cursor_updated_at, cursor_id, cursor_page] - rescue - raise ActionController::BadRequest, "Invalid cursor" - end + [cursor_updated_at, cursor_id, cursor_page] + rescue + raise ActionController::BadRequest, "Invalid cursor" end def build_meta(enrichments, cursor_page) From 871b7e38724d3fddd8b627eb1f0668c5e9175ec7 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 23 Feb 2026 12:44:28 +0200 Subject: [PATCH 33/35] Add failure logic for invalid field mappings and add some initial specs --- app/jobs/enrichment_batch_process_job.rb | 9 ++- app/models/concerns/enrichable.rb | 30 ++++++-- spec/models/doi_spec.rb | 98 ++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/app/jobs/enrichment_batch_process_job.rb b/app/jobs/enrichment_batch_process_job.rb index 0b8457c37..1b8567ea9 100644 --- a/app/jobs/enrichment_batch_process_job.rb +++ b/app/jobs/enrichment_batch_process_job.rb @@ -49,7 +49,12 @@ def perform(lines, filename) enriched_value: parsed_line["enrichedValue"] ) - doi.apply_enrichment(enrichment) + apply_error = doi.apply_enrichment(parsed_line) + + if apply_error.present? + Rails.logger.error("#{log_prefix}: Failed to apply enrichment for DOI #{uid}, #{apply_error}") + next + end unless doi.valid? errors = serialize_errors(doi.errors, uid: enrichment.doi) @@ -59,7 +64,7 @@ def perform(lines, filename) unless enrichment.save errors = enrichment.errors.full_messages.join(";") - Rails.logger.error("#{log_prefix}: Enrichment failed to save: #{errors}") + Rails.logger.error("#{log_prefix}: Failed to save enrichment for DOI #{uid}: #{errors}") end end end diff --git a/app/models/concerns/enrichable.rb b/app/models/concerns/enrichable.rb index ece3d33f5..f10ae3017 100644 --- a/app/models/concerns/enrichable.rb +++ b/app/models/concerns/enrichable.rb @@ -3,34 +3,54 @@ module Enrichable extend ActiveSupport::Concern + # Returns a non-nil error message if application of the enrichment failed, otherwise returns nil. def apply_enrichment(enrichment) action = enrichment["action"] - field = enrichment_field(enrichment["field"]) raise ArgumentError, "Invalid enrichment field #{enrichment["field"]}" if field.nil? + error = nil + + # A return of true indicates that the enrichment was applied. + # This is important in the case of the update_child and delete_child actions. case action when "insert" self[field] ||= [] self[field] << enrichment["enriched_value"] when "update" - self[field] = enrichment["enriched_value"] - when "update_child" + if self[field] == enrichment["original_value"] + self[field] = enrichment["enriched_value"] + else + error = "Original value does not match current value for update action" + end + when "updateChild" + success = false + self[field].each_with_index do |item, index| if item == enrichment["original_value"] self[field][index] = enrichment["enriched_value"] + success = true + break end end - when "delete_child" + + error = "Original value not found for updateChild action" unless success + when "deleteChild" + success = false + self[field] ||= [] self[field].each_with_index do |item, index| if item == enrichment["original_value"] self[field].delete_at(index) - break + success = true end end + + error = "Original value not found for deleteChild action" unless success end + + error end def enrichment_field(field) diff --git a/spec/models/doi_spec.rb b/spec/models/doi_spec.rb index a2359634e..2dac17058 100644 --- a/spec/models/doi_spec.rb +++ b/spec/models/doi_spec.rb @@ -2363,4 +2363,102 @@ expect(doi.as_indexed_json["funder_parent_rors"]).to eq(["https://ror.org/019w4f821", "https://ror.org/04cw6st05"]) end end + + describe "enrichable" do + describe "#enrichment_field" do + before do + let(:doi) { create(:doi, aasm_state: "findable", agency: "datacite") } + end + + it "maps alternatveIdentifiers to alternate_identifiers" do + expect(doi.enrichment_field("alternateIdentifiers")).to(eq("alternate_identifiers")) + end + + it "maps creators to creators" do + expect(doi.enrichment_field("creators")).to(eq("creators")) + end + + it "maps titles to titles" do + expect(doi.enrichment_field("titles")).to(eq("titles")) + end + + it "maps publisher to publisher" do + expect(doi.enrichment_field("publisher")).to(eq("publisher")) + end + + it "maps publicationYear to publication_year" do + expect(doi.enrichment_field("publicationYear")).to(eq("publication_year")) + end + + it "maps subjects to subjects" do + expect(doi.enrichment_field("subjects")).to(eq("subjects")) + end + + it "maps contributors to contributors" do + expect(doi.enrichment_field("contributors")).to(eq("contributors")) + end + + it "maps dates to dates" do + expect(doi.enrichment_field("dates")).to(eq("dates")) + end + + it "maps language to language" do + expect(doi.enrichment_field("language")).to(eq("language")) + end + + it "maps types to types" do + expect(doi.enrichment_field("types")).to(eq("types")) + end + + it "maps relatedIdentifiers to related_identifiers" do + expect(doi.enrichment_field("relatedIdentifiers")).to(eq("related_identifiers")) + end + + it "maps relatedItems to related_items" do + expect(doi.enrichment_field("relatedItems")).to(eq("related_items")) + end + + it "maps sizes to sizes" do + expect(doi.enrichment_field("sizes")).to(eq("sizes")) + end + + it "maps formats to formats" do + expect(doi.enrichment_field("formats")).to(eq("formats")) + end + + it "maps version to version" do + expect(doi.enrichment_field("version")).to(eq("version")) + end + + it "maps rightsList to rights_list" do + expect(doi.enrichment_field("rightsList")).to(eq("rights_list")) + end + + it "maps descriptions to descriptions" do + expect(doi.enrichment_field("descriptions")).to(eq("descriptions")) + end + + it "maps geoLocations to geo_locations" do + expect(doi.enrichment_field("geoLocations")).to(eq("geo_locations")) + end + + it "maps fundingReferences to funding_references" do + expect(doi.enrichment_field("fundingReferences")).to(eq("funding_references")) + end + end + + describe "#apply_enrichment" do + describe "when field is invalid" do + it "raises ArgumentError" do + enrichment = { + "action": "insert", + "field": "invalid_field", + "enriched_value": "foo" + } + expect { doi.apply_enrichment(enrichment) } + .to(raise_error(ArgumentError, "Invalid enrichment field: invalid_field")) + end + end + end + end end From f4de16dc57cdcd6f2639c33bc066c44a7ac0ad4f Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 23 Feb 2026 13:00:52 +0200 Subject: [PATCH 34/35] Fix spec --- spec/models/doi_spec.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spec/models/doi_spec.rb b/spec/models/doi_spec.rb index 2dac17058..9d8081ac7 100644 --- a/spec/models/doi_spec.rb +++ b/spec/models/doi_spec.rb @@ -2366,9 +2366,7 @@ describe "enrichable" do describe "#enrichment_field" do - before do - let(:doi) { create(:doi, aasm_state: "findable", agency: "datacite") } - end + let(:doi) { create(:doi, aasm_state: "findable", agency: "datacite") } it "maps alternatveIdentifiers to alternate_identifiers" do expect(doi.enrichment_field("alternateIdentifiers")).to(eq("alternate_identifiers")) From 16d9fac1efab515c5ea06a42f1c040174160d9fc Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Mon, 23 Feb 2026 13:07:47 +0200 Subject: [PATCH 35/35] Fix spec --- spec/models/doi_spec.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spec/models/doi_spec.rb b/spec/models/doi_spec.rb index 9d8081ac7..0da32549a 100644 --- a/spec/models/doi_spec.rb +++ b/spec/models/doi_spec.rb @@ -2447,6 +2447,8 @@ describe "#apply_enrichment" do describe "when field is invalid" do + let(:doi) { create(:doi, aasm_state: "findable", agency: "datacite") } + it "raises ArgumentError" do enrichment = { "action": "insert",