Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
72c7763
Create enrichment table and model
wendelfabianchinsamy Jan 29, 2026
bafbb6f
Appease our rubocop overlords
wendelfabianchinsamy Jan 29, 2026
b241269
Update json schema
wendelfabianchinsamy Jan 29, 2026
13ab786
Specs
wendelfabianchinsamy Jan 29, 2026
2668140
Fix specs
wendelfabianchinsamy Jan 29, 2026
9f51eaf
Add controller logic and tests
wendelfabianchinsamy Jan 30, 2026
da20a0c
Add controller specs
wendelfabianchinsamy Jan 30, 2026
95d7f81
Fix specs
wendelfabianchinsamy Jan 30, 2026
3e0adcb
Fix specs
wendelfabianchinsamy Jan 30, 2026
bcd494b
Fix specs
wendelfabianchinsamy Jan 30, 2026
5488ba6
Fix specs
wendelfabianchinsamy Jan 30, 2026
ac65de2
Fix specs
wendelfabianchinsamy Jan 30, 2026
86a97d7
Fix specs
wendelfabianchinsamy Jan 30, 2026
b9de835
Remove controller specs
wendelfabianchinsamy Feb 2, 2026
4fe26de
Remove specs
wendelfabianchinsamy Feb 2, 2026
97f2279
Build initial rake task
wendelfabianchinsamy Feb 2, 2026
7515e5e
Implement rake and jobs
wendelfabianchinsamy Feb 5, 2026
7ae2e12
Merge branch 'master' into create-enrichments-table
wendelfabianchinsamy Feb 5, 2026
0035b0e
Finish up the batch process job
wendelfabianchinsamy Feb 12, 2026
8a3a4d2
Reduce batch size further
wendelfabianchinsamy Feb 12, 2026
61fd660
PR comment changes
wendelfabianchinsamy Feb 13, 2026
266a206
Add Enrichable concern
wendelfabianchinsamy Feb 16, 2026
5a1804e
Add error handling when decoding paging cursor
wendelfabianchinsamy Feb 16, 2026
fe0c20c
Some refactoring
wendelfabianchinsamy Feb 16, 2026
39033dc
Fix rubocop complaint
wendelfabianchinsamy Feb 16, 2026
28c2ffe
Refactoring
wendelfabianchinsamy Feb 16, 2026
3d15c0c
Refactor
wendelfabianchinsamy Feb 16, 2026
4bba117
Make cursor decoding error handling more generic since any cursor dec…
wendelfabianchinsamy Feb 16, 2026
d54f581
Add enrichments serializer
wendelfabianchinsamy Feb 16, 2026
08f49ef
Refactor serializer
wendelfabianchinsamy Feb 17, 2026
7a4d60c
A few fixes
wendelfabianchinsamy Feb 17, 2026
b10b044
Fix links object and add meta object to the json payload
wendelfabianchinsamy Feb 18, 2026
dfa3547
Rubocop fixes
wendelfabianchinsamy Feb 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions app/controllers/enrichments_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# frozen_string_literal: true

class EnrichmentsController < ApplicationController
PAGE_SIZE = 25

def index
doi = params["doi"]&.upcase
client_id = params["client_id"]
cursor = params.dig("page", "cursor")

base_enrichments = base_page_enrichments(doi, client_id)

enrichments = if cursor.present?
cursor_updated_at, cursor_id, cursor_page = decode_cursor(cursor)
base_enrichments.by_cursor(cursor_updated_at, cursor_id)
else
base_enrichments
end

enrichments = enrichments.order_by_cursor.limit(PAGE_SIZE).to_a

cursor_page ||= 1

options = {
meta: build_meta(base_enrichments, cursor_page),
links: build_paging_links(enrichments, doi, client_id, cursor_page)
}

render(json: EnrichmentSerializer.new(enrichments, options).serializable_hash, status: :ok)
end

private
def base_page_enrichments(doi, client_id)
if doi.present?
Enrichment.by_doi(doi)
elsif client_id.present?
Enrichment.by_client(client_id)
else
Enrichment.all
end
end

def encode_cursor(hash)
Base64.urlsafe_encode64(hash.to_json, padding: false)
rescue
raise ActionController::InternalServerError, "Failed to encode cursor"
end

def decode_cursor(token)
decoded_cursor = JSON.parse(Base64.urlsafe_decode64(token))
cursor_updated_at = Time.iso8601(decoded_cursor.fetch("updated_at"))
cursor_id = decoded_cursor.fetch("id").to_i
cursor_page = decoded_cursor.fetch("page", nil).to_i || 0

Rails.logger.info("cursor_page: #{cursor_page}")

[cursor_updated_at, cursor_id, cursor_page]
rescue
raise ActionController::BadRequest, "Invalid cursor"
end

def build_meta(enrichments, cursor_page)
enrichments_total = enrichments.count

{
total: enrichments_total,
totalPages: (enrichments_total / PAGE_SIZE.to_f).ceil,
page: cursor_page
}
end

def build_next_link(doi, client_id, next_cursor)
base_link = request.original_url.split("?").first

query_string = if doi.present?
"doi=#{doi}&cursor=#{next_cursor}"
elsif client_id.present?
"client-id=#{client_id}&cursor=#{next_cursor}"
else
"page[cursor]=#{next_cursor}"
end

"#{base_link}?#{query_string}"
end

def build_paging_links(enrichments, doi, client_id, cursor_page)
current_link = request.original_url

next_cursor = if enrichments.any?
last = enrichments.last
encode_cursor(updated_at: last.updated_at.iso8601(6), id: last.id, page: cursor_page + 1)
end

next_link = build_next_link(doi, client_id, next_cursor)

{
self: current_link,
next: enrichments.length == PAGE_SIZE ? next_link : nil
}
end
end
67 changes: 67 additions & 0 deletions app/jobs/enrichment_batch_process_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# frozen_string_literal: true

class EnrichmentBatchProcessJob < ApplicationJob
include ErrorSerializable

queue_as :enrichment_batch_process_job

def perform(lines, filename)
log_prefix = "EnrichmentBatchProcessJob (#{filename})"

# We will process the lines in parallel to speed up ingestion.
Parallel.each(lines, in_threads: 5) do |line|
# with_connection ensures the connection is explicitly checked out and returned to the pool after
# each iteration, preventing connection pool exhaustion.
ActiveRecord::Base.connection_pool.with_connection do
begin
parsed_line = JSON.parse(line)
rescue JSON::ParserError => e
Rails.logger.error("#{log_prefix}: Failed to parse line: #{e.message}")
next
end

# We only create enrichments for DOIs that exist and which have an agency of 'datacite'.
uid = parsed_line["doi"]&.upcase
doi = Doi.find_by(doi: uid, agency: "datacite")

if doi.blank?
Rails.logger.error("#{log_prefix}: Doi #{uid} does not exist")
next
end

if doi.enrichment_field(parsed_line["field"]).nil?
Rails.logger.error("#{log_prefix}: Unsupported enrichment field #{parsed_line["field"]} for DOI #{uid}")
next
end

# We set the only_validate flag on the DOI model to true such that we
# ensure that validation functions as expected when not persisting the record.
doi.only_validate = true

enrichment = Enrichment.new(
filename: filename,
doi: uid,
contributors: parsed_line["contributors"],
resources: parsed_line["resources"],
field: parsed_line["field"],
action: parsed_line["action"],
original_value: parsed_line["originalValue"],
enriched_value: parsed_line["enrichedValue"]
)

doi.apply_enrichment(enrichment)

unless doi.valid?
errors = serialize_errors(doi.errors, uid: enrichment.doi)
Rails.logger.error("#{log_prefix}: Enrichment does not generate valid metadata: #{errors}")
next
end

unless enrichment.save
errors = enrichment.errors.full_messages.join(";")
Rails.logger.error("#{log_prefix}: Enrichment failed to save: #{errors}")
end
end
end
end
end
61 changes: 61 additions & 0 deletions app/models/concerns/enrichable.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

module Enrichable
extend ActiveSupport::Concern

def apply_enrichment(enrichment)
action = enrichment["action"]

field = enrichment_field(enrichment["field"])

raise ArgumentError, "Invalid enrichment field #{enrichment["field"]}" if field.nil?

case action
when "insert"
self[field] ||= []
self[field] << enrichment["enriched_value"]
when "update"
self[field] = enrichment["enriched_value"]
when "update_child"
self[field].each_with_index do |item, index|
if item == enrichment["original_value"]
self[field][index] = enrichment["enriched_value"]
end
end
when "delete_child"
self[field] ||= []
self[field].each_with_index do |item, index|
if item == enrichment["original_value"]
self[field].delete_at(index)
break
end
end
end
Comment on lines +19 to +33
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Open question that doesn't necessarily need to be answered now: should the enrichment be considered valid if the no item is found for update_child and delete_child? At that point, the record wouldn't actually "do" anything, right?

end

def enrichment_field(field)
field_mapping = {
"alternateIdentifiers" => "alternate_identifiers",
"creators" => "creators",
"titles" => "titles",
"publisher" => "publisher",
"publicationYear" => "publication_year",
"subjects" => "subjects",
"contributors" => "contributors",
"dates" => "dates",
"language" => "language",
"types" => "types",
"relatedIdentifiers" => "related_identifiers",
"relatedItems" => "related_items",
"sizes" => "sizes",
"formats" => "formats",
"version" => "version",
"rightsList" => "rights_list",
"descriptions" => "descriptions",
"geoLocations" => "geo_locations",
"fundingReferences" => "funding_references"
}

field_mapping.fetch(field, nil)
end
end
2 changes: 2 additions & 0 deletions app/models/doi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class Doi < ApplicationRecord

include Elasticsearch::Model

include Enrichable

aasm whiny_transitions: false do
# draft is initial state for new DOIs.
state :draft, initial: true
Expand Down
54 changes: 54 additions & 0 deletions app/models/enrichment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
class Enrichment < ApplicationRecord
validate :validate_json_schema

belongs_to :doi_record,
class_name: "Doi",
foreign_key: :doi, # enrichments.doi
primary_key: :doi, # dois.doi
optional: false

has_one :client, through: :doi_record

scope :by_doi, ->(doi) { where(doi: doi) }

scope :by_client, ->(client_id) { joins(doi_record: :client).where(datacentre: { symbol: client_id }) }

scope :by_cursor, ->(updated_at, id) {
where("(enrichments.updated_at < ?) OR (enrichments.updated_at = ? AND enrichments.id < ?)",
updated_at,
updated_at,
id)
}

scope :order_by_cursor, -> { order(updated_at: :desc, id: :desc) }

private
def validate_json_schema
doc = to_enrichment_hash
error_list = self.class.enrichment_schemer.validate(doc).to_a

return if error_list.empty?

errors.add(:base, "Validation failed: #{error_list.map { |e| e['message'] || e.inspect }.join('; ')}")
end

def to_enrichment_hash
{
"doi" => doi,
"contributors" => contributors,
"resources" => resources,
"field" => field,
"action" => action,
"originalValue" => original_value,
"enrichedValue" => enriched_value
}.compact
end

def self.enrichment_schemer
@enrichment_schemer ||= begin
schema_path = Rails.root.join("app/models/schemas/enrichment/enrichment.json")
schema = JSON.parse(File.read(schema_path))
JSONSchemer.schema(schema)
end
end
end
Loading