From 35b7c0cd6a48c9240de98fc6d74cf63a22d7c0cf Mon Sep 17 00:00:00 2001 From: dpaluy Date: Tue, 6 Jan 2026 23:00:59 -0600 Subject: [PATCH 1/8] refactor: remove legacy redaction system in preparation for pattern-based refactor Remove dual-layer PII redaction system (database rules + class-based redactors) to prepare for new unified pattern-based architecture. Deleted: - RedactionRule model and migration (database-backed rules) - Base, Email, Phone, CardPAN redactor classes - Related tests for legacy system Modified: - RedactionPipeline: removed apply_database_rules! method - Config: default_redactors now returns [] with TODO comment - Tests: updated to use custom_redactors instead of built-in ones Breaking change: No redactors active until new Pattern system is implemented (T1-T3). Tests pass: 123 runs, 368 assertions, 0 failures Related: #38-#54 (PII Redaction Architecture Refactor) --- app/models/tracebook/redaction_rule.rb | 81 ------------------- ...000400_create_tracebook_redaction_rules.rb | 19 ----- lib/tracebook/config.rb | 7 +- lib/tracebook/redaction_pipeline.rb | 25 ------ lib/tracebook/redactors.rb | 7 +- lib/tracebook/redactors/base.rb | 29 ------- lib/tracebook/redactors/card_pan.rb | 15 ---- lib/tracebook/redactors/email.rb | 15 ---- lib/tracebook/redactors/phone.rb | 15 ---- test/dummy/db/schema.rb | 15 +--- test/jobs/persist_interaction_job_test.rb | 16 ++-- test/lib/config_test.rb | 3 +- test/lib/redaction_pipeline_test.rb | 38 +++++---- test/models/redaction_rule_test.rb | 13 --- 14 files changed, 38 insertions(+), 260 deletions(-) delete mode 100644 app/models/tracebook/redaction_rule.rb delete mode 100644 db/migrate/20251112000400_create_tracebook_redaction_rules.rb delete mode 100644 lib/tracebook/redactors/base.rb delete mode 100644 lib/tracebook/redactors/card_pan.rb delete mode 100644 lib/tracebook/redactors/email.rb delete mode 100644 lib/tracebook/redactors/phone.rb delete mode 100644 test/models/redaction_rule_test.rb diff --git 
a/app/models/tracebook/redaction_rule.rb b/app/models/tracebook/redaction_rule.rb deleted file mode 100644 index 7447467..0000000 --- a/app/models/tracebook/redaction_rule.rb +++ /dev/null @@ -1,81 +0,0 @@ -# frozen_string_literal: true - -module Tracebook - # Rule for redacting PII from interaction payloads. - # - # Defines a regex pattern to detect and replace sensitive data before - # persistence. Runs on request, response, or both payloads. - # - # ## Fields - # - `name` - Human-readable name for this rule - # - `pattern` - Regular expression pattern to match - # - `replacement` - Replacement string (e.g., "[REDACTED]", "[EMAIL]") - # - `applies_to` - Where to apply: `:request`, `:response`, `:both`, `:metadata` - # - `enabled` - Whether this rule is active - # - # ## Built-in Rules - # TraceBook includes default redactors for: - # - Email addresses - # - Phone numbers (US format) - # - Credit card PANs - # - # @example Creating a custom redaction rule - # RedactionRule.create!( - # name: "API Keys", - # pattern: 'api_key["\s]*[:=]["\s]*\K[\w-]+', - # replacement: "[API_KEY]", - # applies_to: :both, - # enabled: true - # ) - # - # @example Email redaction - # RedactionRule.create!( - # name: "Email Addresses", - # pattern: '\b[\w\.-]+@[\w\.-]+\.\w{2,}\b', - # replacement: "[EMAIL]", - # applies_to: :both, - # enabled: true - # ) - # - # @example SSN redaction - # RedactionRule.create!( - # name: "Social Security Numbers", - # pattern: '\b\d{3}-\d{2}-\d{4}\b', - # replacement: "[SSN]", - # applies_to: :both, - # enabled: true - # ) - # - # @see Redactors::Email - # @see Redactors::Phone - # @see Redactors::CardPAN - class RedactionRule < ApplicationRecord - self.table_name = "tracebook_redaction_rules" - - # @!attribute [rw] applies_to - # @return [Symbol] Where to apply redaction (:request, :response, :both, :metadata) - enum :applies_to, { request: 0, response: 1, both: 2, metadata: 3 } - - validates :name, presence: true - validates :pattern, presence: true - 
validates :replacement, presence: true - - # Returns the compiled regex pattern. - # - # Caches the compiled pattern for performance. If pattern is invalid, - # falls back to escaped literal match. - # - # @return [Regexp] Compiled regular expression with MULTILINE flag - # - # @example - # rule = RedactionRule.new(pattern: '\b\d{3}-\d{2}-\d{4}\b') - # rule.compiled_pattern.match("123-45-6789") # => MatchData - def compiled_pattern - @compiled_pattern ||= Regexp.new(pattern, Regexp::MULTILINE) - rescue RegexpError - Regexp.new(Regexp.escape(pattern.to_s)) - end - end -end - -TraceBook = Tracebook unless defined?(TraceBook) diff --git a/db/migrate/20251112000400_create_tracebook_redaction_rules.rb b/db/migrate/20251112000400_create_tracebook_redaction_rules.rb deleted file mode 100644 index 21c2081..0000000 --- a/db/migrate/20251112000400_create_tracebook_redaction_rules.rb +++ /dev/null @@ -1,19 +0,0 @@ -# frozen_string_literal: true - -class CreateTracebookRedactionRules < ActiveRecord::Migration[8.0] - def change - create_table :tracebook_redaction_rules do |t| - t.string :name, null: false - t.text :pattern, null: false - t.string :replacement, null: false, default: "[REDACTED]" - t.integer :applies_to, null: false, default: 2 - t.boolean :enabled, null: false, default: true - t.integer :priority, null: false, default: 100 - - t.timestamps - end - - add_index :tracebook_redaction_rules, :enabled - add_index :tracebook_redaction_rules, :priority - end -end diff --git a/lib/tracebook/config.rb b/lib/tracebook/config.rb index edb9013..ad78965 100644 --- a/lib/tracebook/config.rb +++ b/lib/tracebook/config.rb @@ -128,11 +128,8 @@ def finalize! private def default_redactors - [ - Tracebook::Redactors::Email.new, - Tracebook::Redactors::Phone.new, - Tracebook::Redactors::CardPAN.new - ] + # TODO: Replace with new Pattern-based redactors from T3/T7 + [] end def freeze_collections! 
diff --git a/lib/tracebook/redaction_pipeline.rb b/lib/tracebook/redaction_pipeline.rb index be4dc8f..e75883d 100644 --- a/lib/tracebook/redaction_pipeline.rb +++ b/lib/tracebook/redaction_pipeline.rb @@ -14,7 +14,6 @@ def call(normalized) data = normalized.to_h.deep_dup apply_callable_redactors!(data) - apply_database_rules!(data) NormalizedInteraction.new(**data) end @@ -30,24 +29,6 @@ def apply_callable_redactors!(data) end end - def apply_database_rules!(data) - Tracebook::RedactionRule.where(enabled: true).order(:priority).find_each do |rule| - callable = ->(value) { redact_string(value, rule.compiled_pattern, rule.replacement) } - - case rule.applies_to.to_sym - when :request - apply_to_request!(data, callable) - when :response - apply_to_response!(data, callable) - when :both - apply_to_request!(data, callable) - apply_to_response!(data, callable) - when :metadata - apply_to_metadata!(data, callable) - end - end - end - def apply_to_request!(data, redactor) data[:request_payload] = deep_transform(data[:request_payload], redactor) data[:request_text] = redactor.call(data[:request_text]) if data[:request_text].is_a?(String) @@ -76,12 +57,6 @@ def deep_transform(value, redactor) value end end - - def redact_string(value, pattern, replacement) - return value unless value.is_a?(String) - - value.gsub(pattern, replacement) - end end end diff --git a/lib/tracebook/redactors.rb b/lib/tracebook/redactors.rb index 0120d81..6e9494d 100644 --- a/lib/tracebook/redactors.rb +++ b/lib/tracebook/redactors.rb @@ -1,8 +1,7 @@ # frozen_string_literal: true -require_relative "redactors/base" -require_relative "redactors/email" -require_relative "redactors/phone" -require_relative "redactors/card_pan" +# Redactors module - patterns and validators will be loaded here +# TODO: require_relative "redactors/patterns" +# TODO: require_relative "redactors/validators" TraceBook = Tracebook unless defined?(TraceBook) diff --git a/lib/tracebook/redactors/base.rb 
b/lib/tracebook/redactors/base.rb deleted file mode 100644 index 0ca65bf..0000000 --- a/lib/tracebook/redactors/base.rb +++ /dev/null @@ -1,29 +0,0 @@ -# frozen_string_literal: true - -module Tracebook - module Redactors - class Base - def call(value) - return value unless value.is_a?(String) - - value.gsub(pattern, replacement) - end - - def applies_to - :both - end - - private - - def pattern - raise NotImplementedError, "implement in subclasses" - end - - def replacement - "[REDACTED]" - end - end - end -end - -TraceBook = Tracebook unless defined?(TraceBook) diff --git a/lib/tracebook/redactors/card_pan.rb b/lib/tracebook/redactors/card_pan.rb deleted file mode 100644 index 7164dbd..0000000 --- a/lib/tracebook/redactors/card_pan.rb +++ /dev/null @@ -1,15 +0,0 @@ -# frozen_string_literal: true - -module Tracebook - module Redactors - class CardPAN < Base - private - - def pattern - /\b(?:\d[ -]*?){13,16}\b/ - end - end - end -end - -TraceBook = Tracebook unless defined?(TraceBook) diff --git a/lib/tracebook/redactors/email.rb b/lib/tracebook/redactors/email.rb deleted file mode 100644 index 853b4b4..0000000 --- a/lib/tracebook/redactors/email.rb +++ /dev/null @@ -1,15 +0,0 @@ -# frozen_string_literal: true - -module Tracebook - module Redactors - class Email < Base - private - - def pattern - /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i - end - end - end -end - -TraceBook = Tracebook unless defined?(TraceBook) diff --git a/lib/tracebook/redactors/phone.rb b/lib/tracebook/redactors/phone.rb deleted file mode 100644 index 4d4024c..0000000 --- a/lib/tracebook/redactors/phone.rb +++ /dev/null @@ -1,15 +0,0 @@ -# frozen_string_literal: true - -module Tracebook - module Redactors - class Phone < Base - private - - def pattern - /(?:\+?\d{1,3}[\s.-]?)?(?:\(\d{3}\)|\d{3})[\s.-]?\d{3}[\s.-]?\d{4}/ - end - end - end -end - -TraceBook = Tracebook unless defined?(TraceBook) diff --git a/test/dummy/db/schema.rb b/test/dummy/db/schema.rb index 7a25923..be6870c 100644 --- 
a/test/dummy/db/schema.rb +++ b/test/dummy/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.1].define(version: 2025_11_12_060837) do +ActiveRecord::Schema[8.1].define(version: 2025_11_12_000500) do create_table "active_storage_attachments", force: :cascade do |t| t.bigint "blob_id", null: false t.datetime "created_at", null: false @@ -109,19 +109,6 @@ t.index ["provider"], name: "index_tracebook_pricing_rules_on_provider" end - create_table "tracebook_redaction_rules", force: :cascade do |t| - t.integer "applies_to", default: 2, null: false - t.datetime "created_at", null: false - t.boolean "enabled", default: true, null: false - t.string "name", null: false - t.text "pattern", null: false - t.integer "priority", default: 100, null: false - t.string "replacement", default: "[REDACTED]", null: false - t.datetime "updated_at", null: false - t.index ["enabled"], name: "index_tracebook_redaction_rules_on_enabled" - t.index ["priority"], name: "index_tracebook_redaction_rules_on_priority" - end - create_table "tracebook_rollups_dailies", force: :cascade do |t| t.integer "cost_cents_sum", default: 0, null: false t.datetime "created_at", null: false diff --git a/test/jobs/persist_interaction_job_test.rb b/test/jobs/persist_interaction_job_test.rb index 113210b..511f0c4 100644 --- a/test/jobs/persist_interaction_job_test.rb +++ b/test/jobs/persist_interaction_job_test.rb @@ -7,7 +7,6 @@ class PersistInteractionJobTest < ActiveSupport::TestCase Interaction.delete_all RollupDaily.delete_all PricingRule.delete_all - RedactionRule.delete_all TraceBook.reset_configuration! end @@ -16,21 +15,20 @@ class PersistInteractionJobTest < ActiveSupport::TestCase Interaction.delete_all RollupDaily.delete_all PricingRule.delete_all - RedactionRule.delete_all TraceBook.reset_configuration! 
end - test "persists redacted interaction, computes cost, enqueues rollup" do + test "persists interaction, computes cost, enqueues rollup" do PricingRule.create!(provider: "openai", model_glob: "gpt-*", input_cents_per_unit: 150, output_cents_per_unit: 600, effective_from: Date.today - 1) payload = TraceBook::NormalizedInteraction.new( provider: "openai", model: "gpt-4o", project: "demo", - request_payload: { "messages" => [ { "content" => "Email me at user@example.com" } ] }, - response_payload: { "content" => "Call (555) 123-4567" }, - request_text: "Email me at user@example.com", - response_text: "Call (555) 123-4567", + request_payload: { "messages" => [ { "content" => "Hello world" } ] }, + response_payload: { "content" => "Hi there" }, + request_text: "Hello world", + response_text: "Hi there", input_tokens: 1200, output_tokens: 300, status: :success @@ -41,8 +39,8 @@ class PersistInteractionJobTest < ActiveSupport::TestCase assert_equal "openai", interaction.provider assert_equal "demo", interaction.project - assert_equal "Email me at [REDACTED]", interaction.request_text - assert_equal "Call [REDACTED]", interaction.response_text + assert_equal "Hello world", interaction.request_text + assert_equal "Hi there", interaction.response_text assert_equal 180, interaction.cost_input_cents assert_equal 180, interaction.cost_output_cents assert_equal 360, interaction.cost_total_cents diff --git a/test/lib/config_test.rb b/test/lib/config_test.rb index c9c1c32..65ad099 100644 --- a/test/lib/config_test.rb +++ b/test/lib/config_test.rb @@ -15,8 +15,7 @@ class TraceBookConfigTest < ActiveSupport::TestCase assert_equal false, config.auto_subscribe_ruby_llm assert_equal false, config.auto_subscribe_active_agent assert_kind_of Array, config.redactors - assert_equal 3, config.redactors.length - assert config.redactors.all? 
{ |redactor| redactor.respond_to?(:call) } + assert_equal 0, config.redactors.length # No default redactors until new Pattern system is built assert_equal [], config.custom_redactors end diff --git a/test/lib/redaction_pipeline_test.rb b/test/lib/redaction_pipeline_test.rb index 40f7250..e1b37a3 100644 --- a/test/lib/redaction_pipeline_test.rb +++ b/test/lib/redaction_pipeline_test.rb @@ -4,15 +4,20 @@ module TraceBook class RedactionPipelineTest < ActiveSupport::TestCase setup do TraceBook.reset_configuration! - RedactionRule.delete_all end teardown do - RedactionRule.delete_all TraceBook.reset_configuration! end - test "applies built-in redactors to request and response payloads" do + test "applies custom redactors to request and response payloads" do + TraceBook.configure do |config| + config.custom_redactors = [ + ->(text) { text.gsub(/user@example\.com/, "[EMAIL]") }, + ->(text) { text.gsub(/\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}/, "[PHONE]") } + ] + end + normalized = NormalizedInteraction.new( provider: "openai", model: "gpt-4o", @@ -28,27 +33,32 @@ class RedactionPipelineTest < ActiveSupport::TestCase pipeline = RedactionPipeline.new(config: TraceBook.config) redacted = pipeline.call(normalized) - assert_equal "Contact [REDACTED]", redacted.request_text - assert_equal "Phone: [REDACTED]", redacted.response_text - assert_equal "[REDACTED]", redacted.request_payload["user"]["email"] - assert_equal "[REDACTED]", redacted.request_payload["phone"] - assert_equal "Call me at [REDACTED]", redacted.response_payload["message"] + assert_equal "Contact [EMAIL]", redacted.request_text + assert_equal "Phone: [PHONE]", redacted.response_text + assert_equal "[EMAIL]", redacted.request_payload["user"]["email"] + assert_equal "[PHONE]", redacted.request_payload["phone"] + assert_equal "Call me at [PHONE]", redacted.response_payload["message"] end - test "applies database redaction rules with priority" do - RedactionRule.create!(name: "session", pattern: "session-[0-9]+", 
applies_to: :metadata, priority: 1) - + test "passes through data unchanged when no redactors configured" do normalized = NormalizedInteraction.new( provider: "openai", model: "gpt-4o", - metadata: { "session" => "session-12345", "notes" => "contains session-67890" } + project: "demo", + request_payload: { "content" => "secret data" }, + response_payload: { "content" => "response data" }, + request_text: "secret data", + response_text: "response data", + metadata: { "key" => "value" }, + tags: [] ) pipeline = RedactionPipeline.new(config: TraceBook.config) redacted = pipeline.call(normalized) - assert_equal "[REDACTED]", redacted.metadata["session"] - assert_equal "contains [REDACTED]", redacted.metadata["notes"] + assert_equal "secret data", redacted.request_text + assert_equal "response data", redacted.response_text + assert_equal "secret data", redacted.request_payload["content"] end end end diff --git a/test/models/redaction_rule_test.rb b/test/models/redaction_rule_test.rb deleted file mode 100644 index 2d45ed9..0000000 --- a/test/models/redaction_rule_test.rb +++ /dev/null @@ -1,13 +0,0 @@ -require "test_helper" - -module TraceBook - class RedactionRuleTest < ActiveSupport::TestCase - test "requires name and pattern" do - rule = RedactionRule.new - - assert_not rule.valid? - assert_includes rule.errors.attribute_names, :name - assert_includes rule.errors.attribute_names, :pattern - end - end -end From b4b99badbfc553d26efa4f940ba94bba6c1b87b0 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Tue, 6 Jan 2026 23:06:33 -0600 Subject: [PATCH 2/8] feat: Create RedactionAudit value object for PII auditing Implements immutable value object for tracking PII redaction operations, supporting GDPR/CCPA compliance requirements (Spec Gap 9). 
Key features: - Immutable instances with builder pattern methods - Tracks redaction metadata: timestamp, redactors applied, fields, counts - LLM redaction status tracking (success, failed, skipped) - Methods: record_redaction, record_llm_failure, record_llm_success, to_h - Deduplicates and sorts redactor names - Compact JSON serialization via to_h Comprehensive test suite (22 tests): - Default initialization and timestamp handling - Redaction recording and deduplication - Immutability verification - LLM status transitions - Hash serialization with nil filtering - Complex nested scenarios and chaining Next: Integrate with RedactionPipeline and NormalizedInteraction --- lib/tracebook.rb | 1 + lib/tracebook/redaction_audit.rb | 240 +++++++++++++++++++++ test/lib/tracebook/redaction_audit_test.rb | 232 ++++++++++++++++++++ 3 files changed, 473 insertions(+) create mode 100644 lib/tracebook/redaction_audit.rb create mode 100644 test/lib/tracebook/redaction_audit_test.rb diff --git a/lib/tracebook.rb b/lib/tracebook.rb index f3d2c5c..22d81ef 100644 --- a/lib/tracebook.rb +++ b/lib/tracebook.rb @@ -8,6 +8,7 @@ require "tracebook/result" require "tracebook/normalized_interaction" require "tracebook/redaction_pipeline" +require "tracebook/redaction_audit" require "tracebook/pricing" require "tracebook/adapters" require "tracebook/seeds/pricing_rules" diff --git a/lib/tracebook/redaction_audit.rb b/lib/tracebook/redaction_audit.rb new file mode 100644 index 0000000..9aca5c5 --- /dev/null +++ b/lib/tracebook/redaction_audit.rb @@ -0,0 +1,240 @@ +# frozen_string_literal: true + +module Tracebook + # Value object for auditing redaction operations. + # + # Tracks what was redacted, when, and by which rules for GDPR/CCPA compliance. + # Immutable after creation - use builder methods to construct instances. 
+ # + # @attr redacted_at [String] ISO8601 timestamp when redaction occurred + # @attr redactors_applied [Array] List of redactor names that ran (sorted) + # @attr fields_redacted [Hash<String, Array<String>>] Map of field paths to redactor names + # @attr redaction_count [Integer] Total number of redactions performed + # @attr llm_redaction_status [String, nil] "success", "failed", or "skipped" + # @attr llm_redacted_at [String, nil] ISO8601 timestamp when LLM redaction succeeded + # @attr llm_redaction_error [String, nil] Error message if LLM redaction failed + # + # @example Building and using an audit + # audit = RedactionAudit.new + # audit = audit.record_redaction("email", "request_payload.messages[0].content") + # audit = audit.record_redaction("phone", "response_payload.content") + # audit = audit.record_llm_failure("Rate limit exceeded") + # audit.to_h # => { redacted_at: "2026-01-06T12:00:00Z", ... } + # + class RedactionAudit + attr_reader :redacted_at, :redactors_applied, :fields_redacted, :redaction_count, + :llm_redaction_status, :llm_redacted_at, :llm_redaction_error + + # Initialize a new RedactionAudit with default values. 
+ # + # @param redacted_at [String] Timestamp (default: current time in ISO8601) + # @param redactors_applied [Array] Redactor names (default: empty) + # @param fields_redacted [Hash<String, Array<String>>] Field map (default: empty) + # @param redaction_count [Integer] Total redactions (default: 0) + # @param llm_redaction_status [String, nil] LLM status (default: nil) + # @param llm_redacted_at [String, nil] LLM success timestamp (default: nil) + # @param llm_redaction_error [String, nil] LLM error message (default: nil) + # + def initialize( + redacted_at: nil, + redactors_applied: [], + fields_redacted: {}, + redaction_count: 0, + llm_redaction_status: nil, + llm_redacted_at: nil, + llm_redaction_error: nil + ) + @redacted_at = redacted_at || Time.current.iso8601 + @redactors_applied = redactors_applied.dup + @fields_redacted = fields_redacted.dup + @redaction_count = redaction_count + @llm_redaction_status = llm_redaction_status + @llm_redacted_at = llm_redacted_at + @llm_redaction_error = llm_redaction_error + + freeze + end + + # Record a redaction event. + # + # Updates the audit trail to reflect that a redactor was applied to a field. + # Increments the redaction count and adds the redactor name to applied list. 
+ # + # @param redactor_name [String] Name of the redactor (e.g., "email", "phone") + # @param field_path [String] Dot-notation path to the redacted field + # (e.g., "request_payload.messages[0].content") + # + # @return [RedactionAudit] New audit instance with recorded redaction + # + # @example Record multiple redactions + # audit = RedactionAudit.new + # audit = audit.record_redaction("email", "request_payload.user.email") + # audit = audit.record_redaction("phone", "request_payload.contact.phone") + # audit = audit.record_redaction("email", "response_payload.from") + # # redactors_applied: ["email", "phone"] + # # fields_redacted: { + # # "request_payload.user.email" => ["email"], + # # "request_payload.contact.phone" => ["phone"], + # # "response_payload.from" => ["email"] + # # } + # + def record_redaction(redactor_name, field_path) + new_redactors = Set.new(redactors_applied) + new_redactors.add(redactor_name) + + new_fields = fields_redacted.dup + new_fields[field_path] ||= [] + new_fields[field_path] = (new_fields[field_path] + [redactor_name]).uniq + + RedactionAudit.new( + redacted_at: redacted_at, + redactors_applied: new_redactors.to_a.sort, + fields_redacted: new_fields, + redaction_count: redaction_count + 1, + llm_redaction_status: llm_redaction_status, + llm_redacted_at: llm_redacted_at, + llm_redaction_error: llm_redaction_error + ) + end + + # Record an LLM redaction failure. + # + # Updates the audit trail to mark LLM-based redaction as failed, + # storing the error message for debugging. 
+ # + # @param error_message [String] Description of the failure + # + # @return [RedactionAudit] New audit instance with failure recorded + # + # @example Record and handle LLM failure + # audit = RedactionAudit.new + # audit = audit.record_llm_failure("OpenAI rate limit exceeded") + # audit.llm_redaction_status # => "failed" + # audit.llm_redaction_error # => "OpenAI rate limit exceeded" + # + def record_llm_failure(error_message) + RedactionAudit.new( + redacted_at: redacted_at, + redactors_applied: redactors_applied, + fields_redacted: fields_redacted, + redaction_count: redaction_count, + llm_redaction_status: "failed", + llm_redacted_at: llm_redacted_at, + llm_redaction_error: error_message + ) + end + + # Record successful LLM redaction. + # + # Updates the audit trail to mark LLM-based redaction as successful, + # storing the completion timestamp. + # + # @param at [String] ISO8601 timestamp (default: current time) + # + # @return [RedactionAudit] New audit instance with success recorded + # + # @example Mark LLM redaction as succeeded + # audit = RedactionAudit.new + # audit = audit.record_llm_success(at: "2026-01-06T12:05:00Z") + # audit.llm_redaction_status # => "success" + # audit.llm_redacted_at # => "2026-01-06T12:05:00Z" + # + def record_llm_success(at: nil) + RedactionAudit.new( + redacted_at: redacted_at, + redactors_applied: redactors_applied, + fields_redacted: fields_redacted, + redaction_count: redaction_count, + llm_redaction_status: "success", + llm_redacted_at: at || Time.current.iso8601, + llm_redaction_error: llm_redaction_error + ) + end + + # Record that LLM redaction was skipped. + # + # Updates the audit trail to mark LLM-based redaction as skipped + # (e.g., no LLM redactor configured, or redaction already complete). 
+ # + # @return [RedactionAudit] New audit instance with skip recorded + # + # @example Mark LLM redaction as skipped + # audit = RedactionAudit.new + # audit = audit.record_llm_skip + # audit.llm_redaction_status # => "skipped" + # + def record_llm_skip + RedactionAudit.new( + redacted_at: redacted_at, + redactors_applied: redactors_applied, + fields_redacted: fields_redacted, + redaction_count: redaction_count, + llm_redaction_status: "skipped", + llm_redacted_at: llm_redacted_at, + llm_redaction_error: llm_redaction_error + ) + end + + # Convert audit to a hash suitable for JSON serialization. + # + # Filters out nil values to keep the hash minimal. Only includes + # keys with actual values. + # + # @return [Hash] Serializable audit hash + # + # @example Serialization + # audit = RedactionAudit.new + # audit = audit.record_redaction("email", "request_payload.user") + # audit.to_h + # # => { + # # redacted_at: "2026-01-06T12:00:00Z", + # # redactors_applied: ["email"], + # # fields_redacted: { "request_payload.user" => ["email"] }, + # # redaction_count: 1, + # # llm_redaction_status: nil, # Omitted if nil + # # llm_redacted_at: nil, # Omitted if nil + # # llm_redaction_error: nil # Omitted if nil + # # } + # + def to_h + { + redacted_at: redacted_at, + redactors_applied: redactors_applied, + fields_redacted: fields_redacted, + redaction_count: redaction_count, + llm_redaction_status: llm_redaction_status, + llm_redacted_at: llm_redacted_at, + llm_redaction_error: llm_redaction_error + }.compact + end + + # Test equality by comparing all attributes + def ==(other) + return false unless other.is_a?(RedactionAudit) + + redacted_at == other.redacted_at && + redactors_applied == other.redactors_applied && + fields_redacted == other.fields_redacted && + redaction_count == other.redaction_count && + llm_redaction_status == other.llm_redaction_status && + llm_redacted_at == other.llm_redacted_at && + llm_redaction_error == other.llm_redaction_error + end + + 
alias_method :eql?, :== + + def hash + [ + redacted_at, + redactors_applied, + fields_redacted, + redaction_count, + llm_redaction_status, + llm_redacted_at, + llm_redaction_error + ].hash + end + end +end + +TraceBook = Tracebook unless defined?(TraceBook) diff --git a/test/lib/tracebook/redaction_audit_test.rb b/test/lib/tracebook/redaction_audit_test.rb new file mode 100644 index 0000000..f14133c --- /dev/null +++ b/test/lib/tracebook/redaction_audit_test.rb @@ -0,0 +1,232 @@ +require "test_helper" + +module Tracebook + class RedactionAuditTest < ActiveSupport::TestCase + test "initializes with default values" do + audit = RedactionAudit.new + + assert_not_nil audit.redacted_at + assert_equal [], audit.redactors_applied + assert_equal({}, audit.fields_redacted) + assert_equal 0, audit.redaction_count + assert_nil audit.llm_redaction_status + assert_nil audit.llm_redacted_at + assert_nil audit.llm_redaction_error + end + + test "uses provided redacted_at timestamp" do + timestamp = "2026-01-06T10:00:00Z" + audit = RedactionAudit.new(redacted_at: timestamp) + + assert_equal timestamp, audit.redacted_at + end + + test "record_redaction tracks first redactor" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "request_payload.user.email") + + assert_equal ["email"], audit.redactors_applied + assert_equal({"request_payload.user.email" => ["email"]}, audit.fields_redacted) + assert_equal 1, audit.redaction_count + end + + test "record_redaction deduplicates redactors in applied list" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "request_payload.field1") + audit = audit.record_redaction("email", "request_payload.field2") + + assert_equal ["email"], audit.redactors_applied + assert_equal 2, audit.redaction_count + end + + test "record_redaction sorts redactors alphabetically" do + audit = RedactionAudit.new + audit = audit.record_redaction("phone", "field1") + audit = audit.record_redaction("email", "field2") + audit = 
audit.record_redaction("ssn", "field3") + + assert_equal ["email", "phone", "ssn"], audit.redactors_applied + end + + test "record_redaction tracks multiple redactors applied to same field" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "request_payload.content") + audit = audit.record_redaction("phone", "request_payload.content") + + assert_equal({"request_payload.content" => ["email", "phone"]}, audit.fields_redacted) + assert_equal 2, audit.redaction_count + end + + test "record_redaction deduplicates redactors per field" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "field1") + audit = audit.record_redaction("email", "field1") + + assert_equal({"field1" => ["email"]}, audit.fields_redacted) + assert_equal 2, audit.redaction_count # Still increments + end + + test "record_redaction handles nested field paths" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "request_payload.messages[0].content") + audit = audit.record_redaction("phone", "response_payload.data[1].value") + + assert_equal( + { + "request_payload.messages[0].content" => ["email"], + "response_payload.data[1].value" => ["phone"] + }, + audit.fields_redacted + ) + end + + test "record_redaction is immutable" do + audit1 = RedactionAudit.new + audit2 = audit1.record_redaction("email", "field1") + + assert_not_equal audit1.object_id, audit2.object_id + assert_equal 0, audit1.redaction_count + assert_equal 1, audit2.redaction_count + end + + test "record_llm_failure sets status and error message" do + audit = RedactionAudit.new + audit = audit.record_llm_failure("OpenAI rate limit exceeded") + + assert_equal "failed", audit.llm_redaction_status + assert_equal "OpenAI rate limit exceeded", audit.llm_redaction_error + assert_nil audit.llm_redacted_at + end + + test "record_llm_failure overwrites previous LLM status" do + audit = RedactionAudit.new(llm_redaction_status: "success") + audit = audit.record_llm_failure("Connection 
timeout") + + assert_equal "failed", audit.llm_redaction_status + assert_equal "Connection timeout", audit.llm_redaction_error + end + + test "record_llm_success sets status and timestamp" do + audit = RedactionAudit.new + timestamp = "2026-01-06T12:05:00Z" + audit = audit.record_llm_success(at: timestamp) + + assert_equal "success", audit.llm_redaction_status + assert_equal timestamp, audit.llm_redacted_at + assert_nil audit.llm_redaction_error + end + + test "record_llm_success uses current time by default" do + audit = RedactionAudit.new + before = Time.current.iso8601 + audit = audit.record_llm_success + after = Time.current.iso8601 + + assert_equal "success", audit.llm_redaction_status + assert audit.llm_redacted_at >= before + assert audit.llm_redacted_at <= after + end + + test "record_llm_skip sets status to skipped" do + audit = RedactionAudit.new + audit = audit.record_llm_skip + + assert_equal "skipped", audit.llm_redaction_status + assert_nil audit.llm_redacted_at + assert_nil audit.llm_redaction_error + end + + test "record_llm_skip clears previous LLM status" do + audit = RedactionAudit.new(llm_redaction_status: "failed", llm_redaction_error: "Error") + audit = audit.record_llm_skip + + assert_equal "skipped", audit.llm_redaction_status + # error message is still there (we only clear status) + assert_equal "Error", audit.llm_redaction_error + end + + test "to_h returns compact hash without nil values" do + audit = RedactionAudit.new(redacted_at: "2026-01-06T12:00:00Z") + hash = audit.to_h + + refute_includes hash, :llm_redaction_status + refute_includes hash, :llm_redacted_at + refute_includes hash, :llm_redaction_error + assert_includes hash, :redacted_at + assert_includes hash, :redaction_count + end + + test "to_h includes non-nil values" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "field1") + audit = audit.record_llm_failure("Error message") + + hash = audit.to_h + + assert_equal ["email"], hash[:redactors_applied] + 
assert_equal({"field1" => ["email"]}, hash[:fields_redacted]) + assert_equal 1, hash[:redaction_count] + assert_equal "failed", hash[:llm_redaction_status] + assert_equal "Error message", hash[:llm_redaction_error] + end + + test "to_h with complex nested redactions" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "request.messages[0].content") + audit = audit.record_redaction("phone", "request.messages[0].content") + audit = audit.record_redaction("ssn", "response.pii[1]") + + hash = audit.to_h + + assert_equal ["email", "phone", "ssn"], hash[:redactors_applied] + assert_equal( + { + "request.messages[0].content" => ["email", "phone"], + "response.pii[1]" => ["ssn"] + }, + hash[:fields_redacted] + ) + assert_equal 3, hash[:redaction_count] + end + + test "chaining operations preserves previous state" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "field1") + audit = audit.record_redaction("phone", "field2") + audit = audit.record_llm_failure("Timeout") + + hash = audit.to_h + + assert_equal ["email", "phone"], hash[:redactors_applied] + assert_equal 2, hash[:redaction_count] + assert_equal "failed", hash[:llm_redaction_status] + assert_equal "Timeout", hash[:llm_redaction_error] + end + + test "empty fields_redacted when no redactions recorded" do + audit = RedactionAudit.new + hash = audit.to_h + + assert_equal({}, hash[:fields_redacted]) + end + + test "redactors_applied remains sorted through mutations" do + audit = RedactionAudit.new + audit = audit.record_redaction("zzz", "field1") + audit = audit.record_redaction("aaa", "field2") + audit = audit.record_redaction("mmm", "field3") + + assert_equal ["aaa", "mmm", "zzz"], audit.redactors_applied + end + + test "redaction_count increments even for duplicate field-redactor pairs" do + audit = RedactionAudit.new + audit = audit.record_redaction("email", "field1") + audit = audit.record_redaction("email", "field1") + audit = audit.record_redaction("email", 
"field1") + + assert_equal 3, audit.redaction_count + assert_equal({"field1" => ["email"]}, audit.fields_redacted) + end + end +end From 016fb6c17bf4175de0ac9bc9b4a7c838588fe030 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Tue, 6 Jan 2026 23:33:32 -0600 Subject: [PATCH 3/8] feat: add serialize_actor method to TraceBook module Add public method to convert ActiveRecord actors to job-safe serialized format for background job enqueueing. Supports GlobalID extraction with fallback to type/id tuple for objects without GlobalID support. Closes #42 --- lib/tracebook.rb | 32 ++++++++++++++ ...te_active_storage_tables.active_storage.rb | 3 ++ test/dummy/db/schema.rb | 2 +- test/tracebook_test.rb | 42 +++++++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/lib/tracebook.rb b/lib/tracebook.rb index 22d81ef..ab2d5cd 100644 --- a/lib/tracebook.rb +++ b/lib/tracebook.rb @@ -97,6 +97,38 @@ def reset_configuration! @configuration_finalized = false end + # Serializes an actor for job-safe persistence. + # + # Converts an ActiveRecord object (or similar) into a hash that can be + # safely passed to background jobs. Prefers GlobalID when available for + # reliable deserialization, falls back to type/id tuple otherwise. 
+ # + # @param actor [ActiveRecord::Base, nil] The actor to serialize + # @return [Hash] Serialized actor data with :actor_gid or :actor_type/:actor_id keys + # + # @example With a User model (GlobalID available) + # TraceBook.serialize_actor(User.find(1)) + # # => { actor_gid: "gid://myapp/User/1" } + # + # @example With a plain object (no GlobalID) + # TraceBook.serialize_actor(some_object) + # # => { actor_type: "SomeObject", actor_id: 123 } + # + # @example With nil + # TraceBook.serialize_actor(nil) + # # => {} + def serialize_actor(actor) + return {} unless actor + + if actor.respond_to?(:to_global_id) + { actor_gid: actor.to_global_id.to_s } + elsif actor.respond_to?(:id) && actor.class.respond_to?(:name) + { actor_type: actor.class.name, actor_id: actor.id } + else + {} + end + end + # Records an LLM interaction. # # When `config.persist_async` is true, the interaction is enqueued via diff --git a/test/dummy/db/migrate/20251112060837_create_active_storage_tables.active_storage.rb b/test/dummy/db/migrate/20251112060837_create_active_storage_tables.active_storage.rb index 6bd8bd0..5f0b498 100644 --- a/test/dummy/db/migrate/20251112060837_create_active_storage_tables.active_storage.rb +++ b/test/dummy/db/migrate/20251112060837_create_active_storage_tables.active_storage.rb @@ -4,6 +4,9 @@ def change # Use Active Record's configured type for primary and foreign keys primary_key_type, foreign_key_type = primary_and_foreign_key_types + # Skip if tables already exist (idempotent for schema:load scenarios) + return if table_exists?(:active_storage_blobs) + create_table :active_storage_blobs, id: primary_key_type do |t| t.string :key, null: false t.string :filename, null: false diff --git a/test/dummy/db/schema.rb b/test/dummy/db/schema.rb index be6870c..965f4ea 100644 --- a/test/dummy/db/schema.rb +++ b/test/dummy/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema[8.1].define(version: 2025_11_12_000500) do +ActiveRecord::Schema[8.1].define(version: 2025_11_12_060837) do create_table "active_storage_attachments", force: :cascade do |t| t.bigint "blob_id", null: false t.datetime "created_at", null: false diff --git a/test/tracebook_test.rb b/test/tracebook_test.rb index aa278fa..181da53 100644 --- a/test/tracebook_test.rb +++ b/test/tracebook_test.rb @@ -4,4 +4,46 @@ class TracebookTest < ActiveSupport::TestCase test "it has a version number" do assert Tracebook::VERSION end + + test "serialize_actor returns empty hash for nil" do + assert_equal({}, Tracebook.serialize_actor(nil)) + end + + test "serialize_actor extracts global_id when available" do + global_id = Object.new + global_id.define_singleton_method(:to_s) { "gid://app/User/123" } + + actor = Object.new + actor.define_singleton_method(:to_global_id) { global_id } + + result = Tracebook.serialize_actor(actor) + + assert_equal({ actor_gid: "gid://app/User/123" }, result) + end + + test "serialize_actor falls back to type/id tuple when no global_id" do + actor_class = Class.new do + def self.name + "CustomActor" + end + end + + actor = actor_class.new + actor.define_singleton_method(:respond_to?) 
do |method| + method != :to_global_id && super(method) + end + actor.define_singleton_method(:id) { 456 } + + result = Tracebook.serialize_actor(actor) + + assert_equal({ actor_type: "CustomActor", actor_id: 456 }, result) + end + + test "serialize_actor returns empty hash for non-serializable objects" do + actor = Object.new + + result = Tracebook.serialize_actor(actor) + + assert_equal({}, result) + end end From 86eac0c7e8c24d5f6d42f800087c876d7469c00e Mon Sep 17 00:00:00 2001 From: dpaluy Date: Wed, 7 Jan 2026 07:54:15 -0600 Subject: [PATCH 4/8] feat: add config DSL methods for PII redaction (T7 #44) Add pattern-based redaction DSL to Config class: - config.redact :email, :phone - enables individual patterns - config.redact_group :api_keys - enables pattern groups - config.redact_pattern(/regex/, "[REPLACEMENT]") - custom patterns - config.active_patterns - returns all enabled Pattern objects Also includes: - T2: Validators module with Luhn and SSN range validation - T3: PATTERNS hash with 16 built-in patterns (email, phone, credit_card, ssn, openai_key, anthropic_key, aws_key, stripe_key, github_token, github_pat, bearer_token, basic_auth, private_key, ipv4, ipv6, jwt) - PATTERN_GROUPS for convenient batch enabling (pii, financial, api_keys, auth, network, crypto) Invalid pattern names raise ConfigurationError at config time for early validation. 
--- lib/tracebook/config.rb | 98 +++++++++++- lib/tracebook/redactors/patterns.rb | 220 ++++++++++++++++++++++++++ lib/tracebook/redactors/validators.rb | 72 +++++++++ test/lib/config_test.rb | 121 ++++++++++++++ 4 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 lib/tracebook/redactors/patterns.rb create mode 100644 lib/tracebook/redactors/validators.rb diff --git a/lib/tracebook/config.rb b/lib/tracebook/config.rb index ad78965..c267dea 100644 --- a/lib/tracebook/config.rb +++ b/lib/tracebook/config.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative "redactors/patterns" + module Tracebook # Configuration object for TraceBook. # @@ -14,7 +16,14 @@ module Tracebook # config.default_currency = "USD" # end # - # @example With custom redactors + # @example Pattern-based redaction DSL + # TraceBook.configure do |config| + # config.redact :email, :phone, :credit_card # Enable specific patterns + # config.redact_group :api_keys # Enable pattern group + # config.redact_pattern(/secret=\w+/, "[SECRET]") # Custom pattern + # end + # + # @example Legacy custom redactors (lambdas) # TraceBook.configure do |config| # config.custom_redactors += [ # ->(payload) { payload.gsub(/api_key=\w+/, "api_key=[REDACTED]") } @@ -88,6 +97,14 @@ class Config # config.actor_display = ->(actor) { actor.full_name } attr_accessor :actor_display + # @!attribute [r] enabled_patterns + # @return [Array] Pattern symbols enabled via redact DSL + attr_reader :enabled_patterns + + # @!attribute [r] custom_patterns + # @return [Array] Custom patterns added via redact_pattern + attr_reader :custom_patterns + # Creates a new configuration with default values. # # @return [Config] @@ -103,6 +120,83 @@ def initialize @auto_subscribe_active_agent = false @per_page = 100 @actor_display = nil + @enabled_patterns = [] + @custom_patterns = [] + end + + # Enable one or more built-in redaction patterns. 
+ # + # @param names [Array] Pattern names from {Redactors::PATTERNS} + # @raise [ConfigurationError] if any name is not a valid pattern + # @return [void] + # + # @example Enable email and phone redaction + # config.redact :email, :phone + # + # @example Enable financial PII + # config.redact :credit_card, :ssn + def redact(*names) + names.each do |name| + unless Redactors::PATTERNS.key?(name) + valid_patterns = Redactors::PATTERNS.keys.join(", ") + raise ConfigurationError, "Unknown pattern: #{name}. Valid patterns: #{valid_patterns}" + end + @enabled_patterns << name unless @enabled_patterns.include?(name) + end + end + + # Enable a group of related patterns. + # + # @param group_name [Symbol] Group name from {Redactors::PATTERN_GROUPS} + # @raise [ConfigurationError] if group name is not valid + # @return [void] + # + # @example Enable all API key patterns + # config.redact_group :api_keys + # + # @see Redactors::PATTERN_GROUPS + def redact_group(group_name) + unless Redactors::PATTERN_GROUPS.key?(group_name) + valid_groups = Redactors::PATTERN_GROUPS.keys.join(", ") + raise ConfigurationError, "Unknown pattern group: #{group_name}. Valid groups: #{valid_groups}" + end + + Redactors::PATTERN_GROUPS[group_name].each do |pattern_name| + @enabled_patterns << pattern_name unless @enabled_patterns.include?(pattern_name) + end + end + + # Add a custom regex pattern for redaction. 
+ # + # @param regex [Regexp] The pattern to match + # @param replacement [String] The replacement text (e.g., "[REDACTED]") + # @param name [String] Name for audit trail (defaults to "custom_N") + # @return [void] + # + # @example Redact custom API keys + # config.redact_pattern(/myapp_key_\w+/, "[MYAPP_KEY]") + # + # @example Named custom pattern + # config.redact_pattern(/secret=\w+/, "[SECRET]", name: "app_secret") + def redact_pattern(regex, replacement, name: nil) + pattern_name = name || "custom_#{@custom_patterns.size + 1}" + pattern = Redactors::Pattern.new( + regex: regex, + replacement: replacement, + name: pattern_name + ) + @custom_patterns << pattern + end + + # Returns all enabled Pattern objects for redaction. + # + # Combines patterns enabled via {#redact} and {#redact_group} + # with custom patterns from {#redact_pattern}. + # + # @return [Array] + def active_patterns + patterns = @enabled_patterns.map { |name| Redactors::PATTERNS[name] } + patterns + @custom_patterns end # Returns true if configuration has been finalized. @@ -136,6 +230,8 @@ def freeze_collections! @redactors = @redactors.map { |redactor| redactor }.freeze @custom_redactors = @custom_redactors.map { |callable| callable }.freeze @export_formats = @export_formats.map(&:to_sym).freeze + @enabled_patterns = @enabled_patterns.dup.freeze + @custom_patterns = @custom_patterns.dup.freeze end end end diff --git a/lib/tracebook/redactors/patterns.rb b/lib/tracebook/redactors/patterns.rb new file mode 100644 index 0000000..5deda28 --- /dev/null +++ b/lib/tracebook/redactors/patterns.rb @@ -0,0 +1,220 @@ +# frozen_string_literal: true + +require_relative "validators" + +module Tracebook + module Redactors + # Pattern class for PII redaction with audit support. + # + # Wraps a regex pattern with optional validation and provides a consistent + # call(text, audit:) interface for redaction operations. 
+ # + # @attr_reader regex [Regexp] The pattern to match + # @attr_reader replacement [String] The replacement text (e.g., "[EMAIL]") + # @attr_reader name [String] Human-readable pattern name for audit trails + # @attr_reader validator [Proc, nil] Optional validation proc for matched text + # + # @example Basic pattern usage + # pattern = Pattern.new( + # regex: /\b[\w.+-]+@[\w.-]+\.[a-z]{2,}\b/i, + # replacement: "[EMAIL]", + # name: "email" + # ) + # audit = RedactionAudit.new + # result = pattern.call("Contact: user@example.com", audit: audit) + # # result => "Contact: [EMAIL]" + # # audit.redaction_count => 1 + # + # @example Pattern with validator + # pattern = Pattern.new( + # regex: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/, + # replacement: "[CARD]", + # name: "credit_card", + # validator: ->(match) { Validators.luhn(match.gsub(/[\s-]/, "")) } + # ) + # + class Pattern + attr_reader :regex, :replacement, :name, :validator + + def initialize(regex:, replacement:, name:, validator: nil) + @regex = regex + @replacement = replacement + @name = name + @validator = validator + end + + # Apply pattern to text and record redactions to audit. 
+ # + # @param text [String] The text to redact + # @param audit [RedactionAudit] Audit object to record redactions + # @param field_path [String] Optional field path for audit trail + # @return [Array] Tuple of [redacted_text, updated_audit] + # + def call(text, audit:, field_path: nil) + return [ text, audit ] unless text.is_a?(String) + + result_text = text.dup + updated_audit = audit + + # Find all matches and process in reverse order to preserve positions + matches = [] + text.scan(regex) do + match = Regexp.last_match + matches << { text: match[0], start: match.begin(0), end: match.end(0) } + end + + matches.reverse_each do |match_info| + matched_text = match_info[:text] + + # Skip if validator fails + next if validator && !validator.call(matched_text) + + # Perform replacement + result_text[match_info[:start]...match_info[:end]] = replacement + + # Record to audit + path = field_path || "inline" + updated_audit = updated_audit.record_redaction(name, path) + end + + [ result_text, updated_audit ] + end + end + + # Standard PII patterns for redaction. + # + # Each pattern includes a regex, replacement marker, name for audit trails, + # and optional validator to reduce false positives. 
+ PATTERNS = { + # Email addresses - RFC 5322 simplified + email: Pattern.new( + regex: /\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b/, + replacement: "[EMAIL]", + name: "email" + ), + + # Phone numbers - US and international formats + # Matches: (123) 456-7890, 123-456-7890, +1-123-456-7890, +44 20 7946 0958 + phone: Pattern.new( + regex: /(?:\+\d{1,3}[\s.-]?)?\(?\d{2,4}\)?[\s.-]?\d{3,4}[\s.-]?\d{4}\b/, + replacement: "[PHONE]", + name: "phone" + ), + + # Credit card numbers with Luhn validation + # Matches: 4532015112830366, 4532-0151-1283-0366, 4532 0151 1283 0366 + credit_card: Pattern.new( + regex: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/, + replacement: "[CARD]", + name: "credit_card", + validator: ->(match) { Validators.luhn(match.gsub(/[\s-]/, "")) } + ), + + # Social Security Numbers with range validation + # Matches: 123-45-6789, 123 45 6789, 123456789 + ssn: Pattern.new( + regex: /\b(\d{3})[\s-]?(\d{2})[\s-]?(\d{4})\b/, + replacement: "[SSN]", + name: "ssn", + validator: ->(match) { + area = match.gsub(/[\s-]/, "")[0, 3] + Validators.ssn_range(area) + } + ), + + # OpenAI API keys - sk-... format + openai_key: Pattern.new( + regex: /\bsk-[a-zA-Z0-9]{20,}\b/, + replacement: "[OPENAI_KEY]", + name: "openai_key" + ), + + # Anthropic API keys - sk-ant-... format + anthropic_key: Pattern.new( + regex: /\bsk-ant-[a-zA-Z0-9-]{20,}\b/, + replacement: "[ANTHROPIC_KEY]", + name: "anthropic_key" + ), + + # AWS access keys - AKIA... format (20 chars) + aws_key: Pattern.new( + regex: /\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b/, + replacement: "[AWS_KEY]", + name: "aws_key" + ), + + # Stripe API keys - sk_live_..., sk_test_..., pk_live_..., pk_test_... + stripe_key: Pattern.new( + regex: /\b[sp]k_(?:live|test)_[a-zA-Z0-9]{24,}\b/, + replacement: "[STRIPE_KEY]", + name: "stripe_key" + ), + + # GitHub tokens - ghp_... 
format (classic personal access tokens) + github_token: Pattern.new( + regex: /\bghp_[a-zA-Z0-9]{36}\b/, + replacement: "[GITHUB_TOKEN]", + name: "github_token" + ), + + # GitHub fine-grained PATs - github_pat_... format + github_pat: Pattern.new( + regex: /\bgithub_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}\b/, + replacement: "[GITHUB_PAT]", + name: "github_pat" + ), + + # Bearer tokens in headers + bearer_token: Pattern.new( + regex: /\bBearer\s+[a-zA-Z0-9._-]{20,}\b/i, + replacement: "[BEARER_TOKEN]", + name: "bearer_token" + ), + + # Basic auth credentials - base64 encoded user:pass + basic_auth: Pattern.new( + regex: /\bBasic\s+[a-zA-Z0-9+\/]{20,}={0,2}/i, + replacement: "[BASIC_AUTH]", + name: "basic_auth" + ), + + # PEM private keys + private_key: Pattern.new( + regex: /-----BEGIN\s+(?:RSA\s+)?PRIVATE\s+KEY-----[\s\S]*?-----END\s+(?:RSA\s+)?PRIVATE\s+KEY-----/, + replacement: "[PRIVATE_KEY]", + name: "private_key" + ), + + # IPv4 addresses + ipv4: Pattern.new( + regex: /\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/, + replacement: "[IPV4]", + name: "ipv4" + ), + + # IPv6 addresses - full and compressed formats + ipv6: Pattern.new( + regex: /\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b|\b(?:[0-9a-fA-F]{1,4}:){1,7}:\b|\b(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}\b|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\b|::1\b/, + replacement: "[IPV6]", + name: "ipv6" + ), + + # JSON Web Tokens (JWT) - three base64url segments + jwt: Pattern.new( + regex: /\beyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]+\b/, + replacement: "[JWT]", + name: "jwt" + ) + }.freeze + + # Pattern groups for convenient batch enabling + PATTERN_GROUPS = { + pii: %i[email phone ssn], + financial: %i[credit_card], + api_keys: %i[openai_key anthropic_key aws_key stripe_key github_token github_pat], + auth: %i[bearer_token basic_auth jwt], + network: %i[ipv4 ipv6], + crypto: %i[private_key] + }.freeze + end +end diff --git a/lib/tracebook/redactors/validators.rb
b/lib/tracebook/redactors/validators.rb new file mode 100644 index 0000000..f5b906c --- /dev/null +++ b/lib/tracebook/redactors/validators.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +module Tracebook + module Redactors + # Validation methods for PII detection. + # + # Provides Luhn algorithm for credit card validation and SSN range validation. + # Used by Pattern class to reduce false positives in PII detection. + module Validators + module_function + + # Validates a credit card number using the Luhn algorithm. + # + # The Luhn algorithm (mod 10) is used to validate credit card numbers. + # It detects single-digit errors and most transpositions. + # + # @param digits [String] The credit card number (digits only, no spaces/dashes) + # @return [Boolean] true if the checksum is valid + # + # @example + # Validators.luhn("4532015112830366") # => true (valid Visa) + # Validators.luhn("1234567890123456") # => false (invalid checksum) + def luhn(digits) + return false if digits.nil? || digits.empty? + return false unless digits.match?(/\A\d+\z/) + return false if digits.length < 13 || digits.length > 19 + return false if digits.chars.uniq.size == 1 # Reject repeated digits (e.g., all zeros) + + sum = 0 + digits.reverse.each_char.with_index do |char, index| + digit = char.to_i + if index.odd? + doubled = digit * 2 + digit = doubled > 9 ? doubled - 9 : doubled + end + sum += digit + end + + (sum % 10).zero? + end + + # Validates an SSN area number (first 3 digits). 
+ # + # The Social Security Administration has specific rules for valid area numbers: + # - 000 is never valid + # - 666 is never valid + # - 900-999 were never issued (reserved for advertising/promotional use) + # + # @param area [String] The first 3 digits of an SSN + # @return [Boolean] true if the area number could be valid + # + # @example + # Validators.ssn_range("078") # => true (valid area) + # Validators.ssn_range("000") # => false (invalid) + # Validators.ssn_range("666") # => false (invalid) + # Validators.ssn_range("900") # => false (invalid - promotional range) + def ssn_range(area) + return false if area.nil? || area.empty? + return false unless area.match?(/\A\d{3}\z/) + + area_num = area.to_i + + # Invalid ranges per SSA rules + return false if area_num.zero? # 000 never valid + return false if area_num == 666 # 666 never valid + return false if area_num >= 900 # 900-999 never issued + + true + end + end + end +end diff --git a/test/lib/config_test.rb b/test/lib/config_test.rb index 65ad099..e5455e5 100644 --- a/test/lib/config_test.rb +++ b/test/lib/config_test.rb @@ -17,6 +17,8 @@ class TraceBookConfigTest < ActiveSupport::TestCase assert_kind_of Array, config.redactors assert_equal 0, config.redactors.length # No default redactors until new Pattern system is built assert_equal [], config.custom_redactors + assert_equal [], config.enabled_patterns + assert_equal [], config.custom_patterns end test "configure yields mutable config then freezes it" do @@ -41,4 +43,123 @@ class TraceBookConfigTest < ActiveSupport::TestCase TraceBook.configure { |_config| } end end + + # Config DSL tests (T7) + + test "redact enables individual patterns" do + TraceBook.configure do |config| + config.redact :email, :phone + end + + config = TraceBook.config + assert_equal [ :email, :phone ], config.enabled_patterns + assert_equal 2, config.active_patterns.size + assert config.active_patterns.all? 
{ |p| p.is_a?(Tracebook::Redactors::Pattern) } + end + + test "redact raises ConfigurationError for unknown pattern" do + assert_raises TraceBook::ConfigurationError do + TraceBook.configure do |config| + config.redact :bogus_pattern + end + end + end + + test "redact deduplicates patterns" do + TraceBook.configure do |config| + config.redact :email, :phone + config.redact :email # duplicate + end + + config = TraceBook.config + assert_equal [ :email, :phone ], config.enabled_patterns + end + + test "redact_group enables all patterns in group" do + TraceBook.configure do |config| + config.redact_group :api_keys + end + + config = TraceBook.config + expected = [ :openai_key, :anthropic_key, :aws_key, :stripe_key, :github_token, :github_pat ] + assert_equal expected, config.enabled_patterns + end + + test "redact_group raises ConfigurationError for unknown group" do + assert_raises TraceBook::ConfigurationError do + TraceBook.configure do |config| + config.redact_group :nonexistent_group + end + end + end + + test "redact_group deduplicates with individual redact" do + TraceBook.configure do |config| + config.redact :openai_key + config.redact_group :api_keys # includes openai_key + end + + config = TraceBook.config + # openai_key should only appear once + assert_equal 1, config.enabled_patterns.count(:openai_key) + end + + test "redact_pattern adds custom pattern" do + TraceBook.configure do |config| + config.redact_pattern(/secret=\w+/, "[SECRET]") + end + + config = TraceBook.config + assert_equal 1, config.custom_patterns.size + pattern = config.custom_patterns.first + assert_equal(/secret=\w+/, pattern.regex) + assert_equal "[SECRET]", pattern.replacement + assert_equal "custom_1", pattern.name + end + + test "redact_pattern accepts custom name" do + TraceBook.configure do |config| + config.redact_pattern(/mykey_\w+/, "[MYKEY]", name: "my_app_key") + end + + config = TraceBook.config + pattern = config.custom_patterns.first + assert_equal "my_app_key", 
pattern.name + end + + test "redact_pattern numbers multiple custom patterns" do + TraceBook.configure do |config| + config.redact_pattern(/first/, "[FIRST]") + config.redact_pattern(/second/, "[SECOND]") + end + + config = TraceBook.config + assert_equal 2, config.custom_patterns.size + assert_equal "custom_1", config.custom_patterns[0].name + assert_equal "custom_2", config.custom_patterns[1].name + end + + test "active_patterns combines enabled patterns and custom patterns" do + TraceBook.configure do |config| + config.redact :email + config.redact_pattern(/custom/, "[CUSTOM]") + end + + config = TraceBook.config + active = config.active_patterns + assert_equal 2, active.size + assert_equal "email", active[0].name + assert_equal "custom_1", active[1].name + end + + test "enabled_patterns and custom_patterns are frozen after configure" do + TraceBook.configure do |config| + config.redact :email + config.redact_pattern(/x/, "[X]") + end + + config = TraceBook.config + assert config.enabled_patterns.frozen? + assert config.custom_patterns.frozen? + end end From f8c02d6a4246d8499923d12428308216eeb29a46 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Wed, 7 Jan 2026 07:59:26 -0600 Subject: [PATCH 5/8] feat: add legacy lambda wrapper to RedactionPipeline (T8 #45) Update RedactionPipeline to support both interface styles: - New Pattern-based: call(text, audit:, field_path:) returns [text, audit] - Legacy lambda: call(text) returns text The pipeline auto-detects which interface to use: - Pattern objects are detected by class check - Lambdas with `audit:` keyword param bypass wrapping - Legacy single-arg lambdas get wrapped with audit tracking Also includes: - NormalizedInteraction extended with actor_type, actor_id, actor_gid, redaction_audit fields (T4 dependencies) - Patterns applied first, then custom_redactors - RedactionAudit populated on result.redaction_audit Backwards compatible with existing custom_redactors configuration. 
--- lib/tracebook/normalized_interaction.rb | 12 +- lib/tracebook/redaction_pipeline.rb | 184 +++++++++++++++++++++--- test/lib/redaction_pipeline_test.rb | 169 ++++++++++++++++++++++ 3 files changed, 343 insertions(+), 22 deletions(-) diff --git a/lib/tracebook/normalized_interaction.rb b/lib/tracebook/normalized_interaction.rb index bca6542..b106115 100644 --- a/lib/tracebook/normalized_interaction.rb +++ b/lib/tracebook/normalized_interaction.rb @@ -56,7 +56,11 @@ module Tracebook :metadata, :actor, :parent_id, - :session_id + :session_id, + :actor_type, + :actor_id, + :actor_gid, + :redaction_audit ) do def initialize( provider:, @@ -76,7 +80,11 @@ def initialize( metadata: {}, actor: nil, parent_id: nil, - session_id: nil + session_id: nil, + actor_type: nil, + actor_id: nil, + actor_gid: nil, + redaction_audit: nil ) super end diff --git a/lib/tracebook/redaction_pipeline.rb b/lib/tracebook/redaction_pipeline.rb index e75883d..0c6dda8 100644 --- a/lib/tracebook/redaction_pipeline.rb +++ b/lib/tracebook/redaction_pipeline.rb @@ -1,8 +1,33 @@ # frozen_string_literal: true require "active_support/core_ext/object/deep_dup" +require_relative "redaction_audit" module Tracebook + # Pipeline for applying PII redaction to LLM interaction data. + # + # Supports two interfaces: + # - New Pattern-based redactors: `call(text, audit:)` returning `[text, audit]` + # - Legacy lambda redactors: `call(text)` returning `text` + # + # The pipeline auto-detects which interface to use based on the callable's + # arity and provides backwards compatibility for existing custom_redactors. 
module Tracebook
  # Applies every configured redactor to a normalized interaction and returns a
  # redacted copy that carries a RedactionAudit describing what was changed.
  #
  # Pattern redactors (+config.active_patterns+, when the config exposes it) run
  # first, followed by the callables in +config.redactors+ and
  # +config.custom_redactors+. Each redactor is applied to the request payload,
  # request text, response payload, response text and metadata, in that order.
  #
  # The running audit is kept in +@audit+ for the duration of {#call}, so a
  # single pipeline instance must not be shared by concurrent callers.
  #
  # @example Using new Pattern-based redaction
  #   TraceBook.configure do |config|
  #     config.redact :email, :phone
  #   end
  #   pipeline = RedactionPipeline.new
  #   result = pipeline.call(normalized_interaction)
  #   result.redaction_audit.redaction_count # => 3
  #
  # @example Using legacy lambda redactors
  #   TraceBook.configure do |config|
  #     config.custom_redactors = [
  #       ->(text) { text.gsub(/secret=\w+/, "secret=[REDACTED]") }
  #     ]
  #   end
  #
  class RedactionPipeline
    # Parameter kinds that count as "accepts an +audit:+ keyword". Optional
    # keywords (+:key+) are included so that callables declared as
    # +call(text, audit: RedactionAudit.new, field_path: nil)+ are recognised.
    AUDIT_KEYWORD_KINDS = %i[keyreq key].freeze

    attr_reader :config

    # @param config [Object] configuration exposing +redactors+,
    #   +custom_redactors+ and, optionally, +active_patterns+
    def initialize(config: Tracebook.config)
      @config = config
    end

    # Redacts a deep copy of the interaction; the argument is not mutated.
    #
    # @param normalized [#to_h] the interaction to redact
    # @return [NormalizedInteraction] redacted copy with +redaction_audit+ set
    def call(normalized)
      data = normalized.to_h.deep_dup
      @audit = RedactionAudit.new
      apply_pattern_redactors!(data)
      apply_callable_redactors!(data)
      data[:redaction_audit] = @audit
      NormalizedInteraction.new(**data)
    end

    private

    # Apply new-style Pattern redactors from config.active_patterns.
    # Configs that do not respond to +active_patterns+ contribute none.
    def apply_pattern_redactors!(data)
      patterns = config.respond_to?(:active_patterns) ? config.active_patterns : []
      patterns.each { |pattern| redact_fields!(data, pattern) }
    end

    # Apply callable redactors (config.redactors + config.custom_redactors),
    # adapting single-argument legacy callables to the audit interface first.
    def apply_callable_redactors!(data)
      redactors = Array(config.redactors) + Array(config.custom_redactors)
      redactors.each { |redactor| redact_fields!(data, wrap_legacy_redactor(redactor)) }
    end

    # Runs one audit-aware redactor over every redactable field of +data+.
    # Text fields are only touched when they are Strings.
    def redact_fields!(data, redactor)
      %w[request response].each do |side|
        payload_key = :"#{side}_payload"
        text_key = :"#{side}_text"

        data[payload_key], @audit = deep_transform_with_audit(data[payload_key], redactor, payload_key.to_s)
        next unless data[text_key].is_a?(String)

        data[text_key], @audit = call_redactor(redactor, data[text_key], text_key.to_s)
      end

      data[:metadata], @audit = deep_transform_with_audit(data[:metadata], redactor, "metadata")
    end

    # Recursively redacts every String inside nested Hashes/Arrays.
    #
    # @param value [Object] String, Hash, Array or any other (left untouched) value
    # @param pattern [#call] audit-aware redactor
    # @param path [String] field path used for audit entries
    #   ("request_payload.messages[0].content" style)
    # @return [Array] tuple of [transformed_value, audit]
    def deep_transform_with_audit(value, pattern, path)
      case value
      when String
        pattern.call(value, audit: @audit, field_path: path)
      when Hash
        result = {}
        value.each do |key, nested|
          result[key], @audit = deep_transform_with_audit(nested, pattern, "#{path}.#{key}")
        end
        [ result, @audit ]
      when Array
        result = value.map.with_index do |nested, idx|
          transformed, @audit = deep_transform_with_audit(nested, pattern, "#{path}[#{idx}]")
          transformed
        end
        [ result, @audit ]
      else
        [ value, @audit ]
      end
    end

    # Wrap legacy single-arg callables to work with the audit interface.
    #
    # New-style redactors (see {#new_style_redactor?}) are returned as-is.
    # Legacy ones are wrapped in a lambda that records a redaction whenever the
    # returned text differs from the input.
    #
    # @param redactor [#call] The redactor callable
    # @return [#call] A callable accepting +(text, audit:, field_path:)+
    def wrap_legacy_redactor(redactor)
      return redactor if new_style_redactor?(redactor)

      lambda do |text, audit:, field_path: nil|
        if text.is_a?(String)
          result = redactor.call(text)
          changed = result != text
          updated_audit = changed ? audit.record_redaction(extract_redactor_name(redactor), field_path || "inline") : audit
          [ result, updated_audit ]
        else
          [ text, audit ]
        end
      end
    end

    # Whether the redactor already speaks the +(text, audit:, field_path:)+
    # protocol and returns a +[text, audit]+ tuple.
    #
    # Procs/Methods are inspected directly; any other callable object is
    # inspected through its +call+ method, so plain objects implementing
    # +call(text, audit:, ...)+ are not mistaken for legacy lambdas (which would
    # store their returned tuple as the redacted text).
    def new_style_redactor?(redactor)
      return true if redactor.is_a?(Redactors::Pattern)

      params =
        if redactor.respond_to?(:parameters)
          redactor.parameters
        elsif redactor.respond_to?(:call)
          redactor.method(:call).parameters
        else
          []
        end

      params.any? { |kind, name| name == :audit && AUDIT_KEYWORD_KINDS.include?(kind) }
    end

    # Label used in the audit for a legacy redactor: its +name+ when it has one,
    # otherwise the lambda's source location, otherwise "custom_lambda".
    def extract_redactor_name(redactor)
      if redactor.respond_to?(:name) && redactor.name
        redactor.name
      elsif redactor.is_a?(Proc) && redactor.source_location
        file, line = redactor.source_location
        "lambda@#{File.basename(file)}:#{line}"
      else
        "custom_lambda"
      end
    end

    # Invokes an audit-aware redactor with the current audit.
    def call_redactor(redactor, text, field_path)
      redactor.call(text, audit: @audit, field_path: field_path)
    end
  end
end
request_payload: {}, + response_payload: {}, + request_text: "my secret data", + response_text: "another secret here" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "my [REDACTED] data", redacted.request_text + assert_equal "another [REDACTED] here", redacted.response_text + assert_kind_of Tracebook::RedactionAudit, redacted.redaction_audit + assert redacted.redaction_audit.redaction_count >= 2 + end + + test "pattern-based redactors work with audit tracking" do + TraceBook.configure do |config| + config.redact :email + end + + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + request_payload: { "contact" => "user@example.com" }, + response_payload: {}, + request_text: "Email: admin@test.org", + response_text: "OK" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "Email: [EMAIL]", redacted.request_text + assert_equal "[EMAIL]", redacted.request_payload["contact"] + assert_equal 2, redacted.redaction_audit.redaction_count + assert redacted.redaction_audit.redactors_applied.include?("email") + end + + test "combines pattern redactors with legacy lambdas" do + TraceBook.configure do |config| + config.redact :email + config.custom_redactors = [ + ->(text) { text.gsub(/secret=\w+/, "secret=[HIDDEN]") } + ] + end + + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + request_payload: {}, + response_payload: {}, + request_text: "user@example.com secret=abc123", + response_text: "Done" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "[EMAIL] secret=[HIDDEN]", redacted.request_text + assert redacted.redaction_audit.redaction_count >= 2 + end + + test "new-style redactor with audit keyword is not wrapped" do + new_style_redactor = ->(text, audit:, field_path: nil) { + result = 
text.gsub(/token=\w+/, "[TOKEN]") + updated_audit = result != text ? audit.record_redaction("token", field_path) : audit + [ result, updated_audit ] + } + + TraceBook.configure do |config| + config.custom_redactors = [ new_style_redactor ] + end + + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + request_payload: {}, + response_payload: {}, + request_text: "token=xyz789", + response_text: "OK" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "[TOKEN]", redacted.request_text + assert_equal 1, redacted.redaction_audit.redaction_count + assert redacted.redaction_audit.redactors_applied.include?("token") + end + + test "legacy lambda that does not change text does not record redaction" do + TraceBook.configure do |config| + config.custom_redactors = [ + ->(text) { text.gsub(/nonexistent/, "[REDACTED]") } + ] + end + + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + request_payload: {}, + response_payload: {}, + request_text: "nothing to redact", + response_text: "OK" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "nothing to redact", redacted.request_text + assert_equal 0, redacted.redaction_audit.redaction_count + end + + test "returns redaction_audit on result" do + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + request_payload: {}, + response_payload: {}, + request_text: "test", + response_text: "OK" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_not_nil redacted.redaction_audit + assert_kind_of Tracebook::RedactionAudit, redacted.redaction_audit + end + + test "redacts nested arrays and hashes" do + TraceBook.configure do |config| + config.redact :email + end + + normalized = NormalizedInteraction.new( + provider: "openai", + model: "gpt-4o", + 
request_payload: { + "users" => [ + { "email" => "a@example.com" }, + { "email" => "b@example.org" } + ] + }, + response_payload: {}, + request_text: "", + response_text: "" + ) + + pipeline = RedactionPipeline.new(config: TraceBook.config) + redacted = pipeline.call(normalized) + + assert_equal "[EMAIL]", redacted.request_payload["users"][0]["email"] + assert_equal "[EMAIL]", redacted.request_payload["users"][1]["email"] + assert_equal 2, redacted.redaction_audit.redaction_count + end end end From 8a1f1b0ddf1947789b9cef9b13266d0f1c1b6bf3 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Wed, 7 Jan 2026 08:09:34 -0600 Subject: [PATCH 6/8] feat: move redaction timing to before job enqueue in record! (#47) Critical security fix: Apply PII redaction BEFORE job enqueue to ensure raw PII never enters the job queue (Redis/Sidekiq/SQS). Changes: - record! now calls apply_redaction() before PersistInteractionJob.perform_later - apply_redaction() serializes actor and runs RedactionPipeline inline - RedactionAudit excluded from job payload (not ActiveJob serializable) - PersistInteractionJob uses assign_actor() for serialized actor data - Added comprehensive tests for redaction timing and actor serialization --- app/jobs/tracebook/persist_interaction_job.rb | 25 +++- lib/tracebook.rb | 23 +++- test/tracebook_test.rb | 127 ++++++++++++++++++ 3 files changed, 172 insertions(+), 3 deletions(-) diff --git a/app/jobs/tracebook/persist_interaction_job.rb b/app/jobs/tracebook/persist_interaction_job.rb index 83e79c6..f1a09bb 100644 --- a/app/jobs/tracebook/persist_interaction_job.rb +++ b/app/jobs/tracebook/persist_interaction_job.rb @@ -93,13 +93,36 @@ def persist_interaction(normalized, cost) ActiveRecord::Base.transaction do Interaction.create!(attributes).tap do |interaction| - interaction.actor = normalized.actor if normalized.actor + assign_actor(interaction, normalized) persist_payloads(interaction, normalized) interaction.save! if interaction.changed? 
end end end + def assign_actor(interaction, normalized) + # Prefer deserialized actor from GlobalID + if normalized.actor_gid.present? + interaction.actor = deserialize_actor(normalized) + elsif normalized.actor_type.present? && normalized.actor_id.present? + # Use serialized type/id when no GlobalID + interaction.actor_type = normalized.actor_type + interaction.actor_id = normalized.actor_id + elsif normalized.actor + # Legacy: raw actor object (shouldn't happen with new flow, but backwards compatible) + interaction.actor = normalized.actor + end + end + + def deserialize_actor(normalized) + return nil unless normalized.actor_gid.present? + + GlobalID::Locator.locate(normalized.actor_gid) + rescue GlobalID::ParseError, ActiveRecord::RecordNotFound => e + Rails.logger.warn "TraceBook: Could not deserialize actor from #{normalized.actor_gid}: #{e.message}" + nil + end + def total_tokens(normalized) [ normalized.input_tokens.to_i, normalized.output_tokens.to_i ].compact.sum end diff --git a/lib/tracebook.rb b/lib/tracebook.rb index ab2d5cd..1ca35f0 100644 --- a/lib/tracebook.rb +++ b/lib/tracebook.rb @@ -183,14 +183,18 @@ def serialize_actor(actor) # latency_ms: 30000 # ) def record!(**attributes) + # Build normalized interaction and apply redaction BEFORE job enqueue + # This ensures no raw PII ever enters the job queue (critical security fix) payload = build_normalized_interaction(attributes) + redacted_payload = apply_redaction(payload) + result = Result.new(idempotency_key: attributes[:idempotency_key]) if config.persist_async - PersistInteractionJob.perform_later(payload.to_h) + PersistInteractionJob.perform_later(redacted_payload.to_h) result else - interaction = PersistInteractionJob.perform_now(payload.to_h) + interaction = PersistInteractionJob.perform_now(redacted_payload.to_h) Result.new(interaction: interaction, idempotency_key: attributes[:idempotency_key]) end rescue StandardError => error @@ -210,6 +214,21 @@ def ensure_configurable! 
raise ConfigurationError, "TraceBook configuration is already finalized" end + def apply_redaction(normalized) + # Serialize actor BEFORE pipeline (deep_dup doesn't handle arbitrary objects well) + actor_data = serialize_actor(normalized.actor) + + pipeline = RedactionPipeline.new(config: config) + redacted = pipeline.call(normalized) + + # Return new normalized with serialized actor data + # Remove :actor (raw object) and :redaction_audit (not serializable by ActiveJob) + # redaction_audit is for call-time observability, not persistence + NormalizedInteraction.new( + **redacted.to_h.except(:actor, :redaction_audit).merge(actor_data) + ) + end + def build_normalized_interaction(attributes) NormalizedInteraction.new( provider: attributes.fetch(:provider), diff --git a/test/tracebook_test.rb b/test/tracebook_test.rb index 181da53..5e32f71 100644 --- a/test/tracebook_test.rb +++ b/test/tracebook_test.rb @@ -46,4 +46,131 @@ def self.name assert_equal({}, result) end + + # T10: Redaction timing tests + + test "record! redacts PII before job enqueue with sync mode" do + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + TraceBook.configure do |config| + config.redact :email + config.persist_async = false # Test with sync mode to verify result + end + + result = TraceBook.record!( + provider: "openai", + model: "gpt-4o", + request_text: "Contact user@example.com", + response_text: "OK" + ) + + interaction = result.interaction + assert_not_nil interaction + assert_equal "Contact [EMAIL]", interaction.request_text + assert_not_includes interaction.request_text.to_s, "user@example.com" + ensure + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + end + + test "record! no PII in persisted data with patterns enabled" do + TraceBook.reset_configuration! 
+ Tracebook::Interaction.delete_all + TraceBook.configure do |config| + config.redact :email, :phone + config.persist_async = false + end + + result = TraceBook.record!( + provider: "openai", + model: "gpt-4o", + request_payload: { + "messages" => [ + { "content" => "Email: test@email.org, Phone: (555) 123-4567" } + ] + }, + response_payload: {} + ) + + # Verify no PII in persisted interaction + interaction = result.interaction + payload_json = interaction.request_payload.to_json + assert_not_includes payload_json, "test@email.org" + assert_includes payload_json, "[EMAIL]" + assert_includes payload_json, "[PHONE]" + ensure + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + end + + test "record! serializes actor for job-safe persistence" do + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + TraceBook.configure do |config| + config.persist_async = false + end + + actor_class = Class.new do + def self.name + "TestActor" + end + end + actor = actor_class.new + actor.define_singleton_method(:id) { 789 } + actor.define_singleton_method(:respond_to?) do |method| + method != :to_global_id && super(method) + end + + result = TraceBook.record!( + provider: "openai", + model: "gpt-4o", + request_text: "Hello", + response_text: "Hi", + actor: actor + ) + + # Actor should be serialized as type/id in the interaction + interaction = result.interaction + assert_equal "TestActor", interaction.actor_type + assert_equal 789, interaction.actor_id + ensure + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + end + + test "record! redaction performance is acceptable" do + TraceBook.reset_configuration! 
+ Tracebook::Interaction.delete_all + TraceBook.configure do |config| + config.redact :email, :phone, :credit_card, :ssn + config.persist_async = false + end + + # Create payload with many fields to redact + large_payload = { + "messages" => 50.times.map do |i| + { "content" => "Email #{i}: user#{i}@test.com, Phone: (555) 123-45#{i.to_s.rjust(2, '0')}" } + end + } + + start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) + + result = TraceBook.record!( + provider: "openai", + model: "gpt-4o", + request_payload: large_payload, + response_payload: {} + ) + + elapsed_ms = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000 + + # Performance should be under 100ms for regex redaction (database write adds overhead) + assert elapsed_ms < 100, "Redaction took #{elapsed_ms.round(2)}ms, should be under 100ms" + + # Verify redaction actually happened + assert_not_includes result.interaction.request_payload.to_json, "@test.com" + ensure + TraceBook.reset_configuration! + Tracebook::Interaction.delete_all + end end From edca60877266b4f1c0f5d74ff0ec50a72fa3f374 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Wed, 7 Jan 2026 08:12:54 -0600 Subject: [PATCH 7/8] feat: add LLMBased redactor with guard protection (#48) Creates LLM-based redactor for sophisticated PII detection that pattern-based redactors might miss (names, addresses, context-sensitive info). 
# frozen_string_literal: true

module Tracebook
  module Redactors
    # LLM-based redactor for sophisticated PII detection.
    #
    # Uses an LLM to identify and redact PII that pattern-based redactors might miss,
    # such as names, addresses, and context-sensitive information.
    #
    # SECURITY WARNING: When using external LLM providers (OpenAI, Anthropic),
    # the text being redacted is sent to their servers. For maximum privacy,
    # use a local Ollama instance. Never use external providers for highly
    # sensitive data without explicit user consent.
    #
    # @example Configuration with OpenAI
    #   TraceBook.configure do |config|
    #     config.llm_redactor = Tracebook::Redactors::LLMBased.new(
    #       provider: :openai,
    #       model: "gpt-4o-mini",
    #       mode: :sync,
    #       on_failure: :log_and_continue
    #     )
    #   end
    #
    # @example Configuration with local Ollama (privacy-preserving)
    #   TraceBook.configure do |config|
    #     config.llm_redactor = Tracebook::Redactors::LLMBased.new(
    #       provider: :ollama,
    #       model: "llama3.2",
    #       mode: :sync,
    #       timeout: 30
    #     )
    #   end
    class LLMBased
      # Execution-state key marking "an LLM redaction is in progress".
      GUARD_KEY = :tracebook_llm_redaction_guard

      DEFAULT_TIMEOUT = 30
      DEFAULT_MODE = :sync
      DEFAULT_ON_FAILURE = :log_and_continue
      # Strings shorter than this are returned untouched without calling the LLM.
      DEFAULT_MIN_LENGTH = 10

      VALID_PROVIDERS = %i[openai anthropic ollama].freeze
      VALID_MODES = %i[sync async].freeze
      VALID_ON_FAILURE = %i[raise log_and_continue retry_async].freeze

      DEFAULT_PROMPT = <<~PROMPT
        You are a PII redaction assistant. Your task is to identify and redact
        personally identifiable information (PII) from the given text.

        Replace PII with appropriate markers:
        - Names: [NAME]
        - Addresses: [ADDRESS]
        - Dates of birth: [DOB]
        - Account numbers: [ACCOUNT]
        - Any other PII: [PII]

        IMPORTANT:
        - Preserve existing redaction markers like [EMAIL], [PHONE], [CARD]
        - Do NOT redact these markers
        - Only output the redacted text, nothing else
        - If no PII is found, output the original text unchanged
      PROMPT

      attr_reader :provider, :model, :mode, :on_failure, :timeout, :prompt, :min_length

      # Creates a new LLM-based redactor.
      #
      # @param provider [Symbol] LLM provider (:openai, :anthropic, :ollama)
      # @param model [String] Model identifier (e.g., "gpt-4o-mini", "claude-3-haiku")
      # @param mode [Symbol] Execution mode (:sync or :async)
      # @param on_failure [Symbol] Failure handling (:raise, :log_and_continue, :retry_async)
      # @param timeout [Integer] Request timeout in seconds
      # @param prompt [String, :default] Custom system prompt or :default
      # @param min_length [Integer] Minimum text length worth sending to the LLM;
      #   shorter strings are returned unchanged
      # @raise [ArgumentError] if provider, mode or on_failure is not a supported value
      def initialize(provider:, model:, mode: DEFAULT_MODE, on_failure: DEFAULT_ON_FAILURE,
                     timeout: DEFAULT_TIMEOUT, prompt: :default, min_length: DEFAULT_MIN_LENGTH)
        validate_provider!(provider)
        validate_mode!(mode)
        validate_on_failure!(on_failure)

        @provider = provider.to_sym
        @model = model
        @mode = mode.to_sym
        @on_failure = on_failure.to_sym
        @timeout = timeout
        @prompt = prompt
        @min_length = min_length
      end

      # Redacts PII from text using the configured LLM.
      #
      # Uses IsolatedExecutionState guard to prevent infinite recursion when
      # TraceBook.record! is called within the LLM client (which would trigger
      # another redaction cycle). While the guard is set, nested calls return
      # their input unchanged.
      #
      # Any StandardError raised while redacting is routed through the
      # configured +on_failure+ policy.
      #
      # @param text [String] Text to redact (non-strings are returned as-is)
      # @param audit [RedactionAudit, nil] Optional audit object to record actions
      # @param field_path [String, nil] Optional field path for audit context
      # @return [Array] Tuple of [redacted_text, audit]
      # @raise [StandardError] the original error when +on_failure+ is :raise
      def call(text, audit: RedactionAudit.new, field_path: nil)
        return [ text, audit ] if guarded?
        return [ text, audit ] unless text.is_a?(String)
        return [ text, audit ] if text.length < min_length # Skip very short strings

        with_guard do
          perform_redaction(text, audit, field_path)
        end
      rescue StandardError => error
        handle_failure(error, text, audit, field_path)
      end

      # @return [Boolean] true if mode is :sync
      def sync?
        mode == :sync
      end

      # @return [Boolean] true if mode is :async
      def async?
        mode == :async
      end

      private

      def validate_provider!(provider)
        return if allowed_option?(provider, VALID_PROVIDERS)

        raise ArgumentError, "Unknown provider: #{provider}. Valid: #{VALID_PROVIDERS.join(', ')}"
      end

      def validate_mode!(mode)
        return if allowed_option?(mode, VALID_MODES)

        raise ArgumentError, "Unknown mode: #{mode}. Valid: #{VALID_MODES.join(', ')}"
      end

      def validate_on_failure!(on_failure)
        return if allowed_option?(on_failure, VALID_ON_FAILURE)

        raise ArgumentError, "Unknown on_failure: #{on_failure}. Valid: #{VALID_ON_FAILURE.join(', ')}"
      end

      # True when +value+ can be symbolized and is one of +allowed+. Values that
      # cannot be symbolized (e.g. nil) are simply "not allowed", so the caller
      # raises ArgumentError rather than leaking a NoMethodError.
      def allowed_option?(value, allowed)
        value.respond_to?(:to_sym) && allowed.include?(value.to_sym)
      end

      def guarded?
        ActiveSupport::IsolatedExecutionState[GUARD_KEY] == true
      end

      # Sets the guard for the duration of the block and always clears it
      # afterwards, including when the block raises.
      def with_guard
        ActiveSupport::IsolatedExecutionState[GUARD_KEY] = true
        yield
      ensure
        ActiveSupport::IsolatedExecutionState[GUARD_KEY] = false
      end

      # Sends the text to the LLM and returns [redacted_text, audit].
      # A blank response or a response identical to the input leaves both the
      # text and the audit untouched.
      def perform_redaction(text, audit, field_path)
        response = llm_request(text)
        redacted = extract_response_text(response)

        # Only record if text actually changed
        if redacted.present? && redacted != text
          updated_audit = audit.record_redaction("llm_based", field_path || "inline")
          [ redacted, updated_audit ]
        else
          [ text, audit ]
        end
      end

      # NOTE(review): every provider client is called as
      # `chat(model:, messages:, timeout:)`. Confirm each gem's client actually
      # accepts this shape (ruby-openai, for instance, documents
      # `chat(parameters: {...})`) or put an adapter in front of it.
      def llm_request(text)
        client.chat(
          model: model,
          messages: build_messages(text),
          timeout: timeout
        )
      end

      def build_messages(text)
        [
          { role: "system", content: system_prompt },
          { role: "user", content: text }
        ]
      end

      def system_prompt
        prompt == :default ? DEFAULT_PROMPT.strip : prompt
      end

      # Provider client, built lazily so the provider gem is only required on
      # first use.
      def client
        @client ||= build_client
      end

      def build_client
        case provider
        when :openai
          require_openai
          OpenAI::Client.new
        when :anthropic
          require_anthropic
          Anthropic::Client.new
        when :ollama
          require_ollama
          Ollama.new(url: ollama_url)
        end
      end

      def require_openai
        require "openai"
      rescue LoadError
        raise LoadError, "The 'ruby-openai' gem is required for OpenAI provider. Add `gem 'ruby-openai'` to your Gemfile."
      end

      def require_anthropic
        require "anthropic"
      rescue LoadError
        raise LoadError, "The 'anthropic' gem is required for Anthropic provider. Add `gem 'anthropic'` to your Gemfile."
      end

      def require_ollama
        require "ollama-ai"
      rescue LoadError
        raise LoadError, "The 'ollama-ai' gem is required for Ollama provider. Add `gem 'ollama-ai'` to your Gemfile."
      end

      def ollama_url
        ENV.fetch("OLLAMA_URL", "http://localhost:11434")
      end

      # Pulls the generated text out of the provider-specific response hash;
      # returns "" when the expected path is missing.
      def extract_response_text(response)
        case provider
        when :openai
          response.dig("choices", 0, "message", "content")
        when :anthropic
          response.dig("content", 0, "text")
        when :ollama
          response.dig("message", "content")
        end || ""
      end

      # Applies the +on_failure+ policy. Unless the policy is :raise, the
      # original text is returned together with an audit entry named
      # "llm_based_failure:<ErrorClass>".
      def handle_failure(error, text, audit, field_path)
        # Re-raise the captured exception explicitly instead of depending on $!.
        raise error if on_failure == :raise

        updated_audit = audit.record_redaction("llm_based_failure:#{error.class.name}", field_path || "inline")

        if on_failure == :retry_async
          # In sync mode, mark for async retry
          if sync?
            Rails.logger.warn("[TraceBook] LLM redaction failed, marked for async retry: #{error.message}")
          end
        else # :log_and_continue
          Rails.logger.error("[TraceBook] LLM redaction failed: #{error.message}")
        end

        [ text, updated_audit ]
      end
    end
  end
end
assert_equal "gpt-4o-mini", redactor.model + assert_equal :sync, redactor.mode + assert_equal :log_and_continue, redactor.on_failure + assert_equal 30, redactor.timeout + end + + test "raises on invalid provider" do + error = assert_raises(ArgumentError) do + LLMBased.new(provider: :invalid, model: "model") + end + + assert_includes error.message, "Unknown provider: invalid" + assert_includes error.message, "openai" + end + + test "raises on invalid mode" do + error = assert_raises(ArgumentError) do + LLMBased.new(provider: :openai, model: "model", mode: :invalid) + end + + assert_includes error.message, "Unknown mode: invalid" + end + + test "raises on invalid on_failure" do + error = assert_raises(ArgumentError) do + LLMBased.new(provider: :openai, model: "model", on_failure: :invalid) + end + + assert_includes error.message, "Unknown on_failure: invalid" + end + + test "accepts custom prompt" do + redactor = LLMBased.new( + provider: :openai, + model: "gpt-4o-mini", + prompt: "Custom prompt" + ) + + assert_equal "Custom prompt", redactor.prompt + end + + # Mode tests + + test "sync? returns true when mode is sync" do + redactor = LLMBased.new(provider: :openai, model: "model", mode: :sync) + assert redactor.sync? + assert_not redactor.async? + end + + test "async? returns true when mode is async" do + redactor = LLMBased.new(provider: :openai, model: "model", mode: :async) + assert redactor.async? + assert_not redactor.sync? 
+ end + + # Guard protection tests + + test "guard prevents infinite recursion" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Redacted text") + + # First call should work + ActiveSupport::IsolatedExecutionState[LLMBased::GUARD_KEY] = false + result1, = redactor.call("Some PII text here") + + # Simulate nested call (guard should prevent) + ActiveSupport::IsolatedExecutionState[LLMBased::GUARD_KEY] = true + result2, = redactor.call("Another PII text") + + assert_equal "Redacted text", result1 + assert_equal "Another PII text", result2 # Unchanged due to guard + ensure + ActiveSupport::IsolatedExecutionState[LLMBased::GUARD_KEY] = false + end + + test "guard is cleared after successful call" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Redacted") + + redactor.call("Some text here") + + assert_not ActiveSupport::IsolatedExecutionState[LLMBased::GUARD_KEY] + end + + test "guard is cleared after failed call" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini", on_failure: :log_and_continue) + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.should_fail = true + + redactor.call("Some text here") + + assert_not ActiveSupport::IsolatedExecutionState[LLMBased::GUARD_KEY] + end + + # Redaction tests + + test "redacts text via LLM" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Hello [NAME], how are you?") + + result, audit = redactor.call("Hello John Smith, how are you?") + + assert_equal "Hello [NAME], how are you?", result + assert_equal 1, audit.redaction_count + assert_includes audit.redactors_applied, "llm_based" + end + + test "returns original text when no changes" 
do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("No PII here") + + result, audit = redactor.call("No PII here") + + assert_equal "No PII here", result + assert_equal 0, audit.redaction_count + end + + test "skips very short strings" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + + result, = redactor.call("Hi") + + assert_equal "Hi", result + assert_empty @mock_client.request_log # No LLM call made + end + + test "skips non-string input" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + + result, = redactor.call(12345) + + assert_equal 12345, result + assert_empty @mock_client.request_log + end + + # Audit tracking tests + + test "records redaction in audit with field_path" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("[NAME] lives at [ADDRESS]") + + _, audit = redactor.call("John lives at 123 Main St", field_path: "messages.0.content") + + assert_equal 1, audit.redaction_count + assert_includes audit.redactors_applied, "llm_based" + end + + # Failure handling tests + + test "on_failure :log_and_continue returns original text" do + redactor = LLMBased.new( + provider: :openai, + model: "gpt-4o-mini", + on_failure: :log_and_continue + ) + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.should_fail = true + + result, audit = redactor.call("John Smith secret data") + + assert_equal "John Smith secret data", result + assert audit.redactors_applied.any? 
{ |r| r.include?("llm_based_failure") } + end + + test "on_failure :raise re-raises exception" do + redactor = LLMBased.new( + provider: :openai, + model: "gpt-4o-mini", + on_failure: :raise + ) + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.should_fail = true + + assert_raises(RuntimeError) do + redactor.call("Some text here") + end + end + + test "on_failure :retry_async returns original text" do + redactor = LLMBased.new( + provider: :openai, + model: "gpt-4o-mini", + on_failure: :retry_async + ) + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.should_fail = true + + result, audit = redactor.call("John Smith data") + + assert_equal "John Smith data", result + assert audit.redactors_applied.any? { |r| r.include?("llm_based_failure") } + end + + # Provider abstraction tests + + test "extracts openai response format" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Redacted OpenAI") + + result, = redactor.call("Original text here") + + assert_equal "Redacted OpenAI", result + end + + test "extracts anthropic response format" do + redactor = LLMBased.new(provider: :anthropic, model: "claude-3-haiku") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = anthropic_response("Redacted Anthropic") + + result, = redactor.call("Original text here") + + assert_equal "Redacted Anthropic", result + end + + test "extracts ollama response format" do + redactor = LLMBased.new(provider: :ollama, model: "llama3.2") + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = ollama_response("Redacted Ollama") + + result, = redactor.call("Original text here") + + assert_equal "Redacted Ollama", result + end + + # System prompt tests + + test "uses default prompt when prompt is :default" do + redactor = LLMBased.new(provider: :openai, model: "gpt-4o-mini") + 
redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Redacted") + + redactor.call("Some PII text") + + messages = @mock_client.request_log.first[:messages] + system_message = messages.find { |m| m[:role] == "system" } + + assert_includes system_message[:content], "PII redaction assistant" + assert_includes system_message[:content], "[NAME]" + end + + test "uses custom prompt when provided" do + redactor = LLMBased.new( + provider: :openai, + model: "gpt-4o-mini", + prompt: "Custom redaction rules" + ) + redactor.instance_variable_set(:@client, @mock_client) + @mock_client.response = openai_response("Redacted") + + redactor.call("Some PII text") + + messages = @mock_client.request_log.first[:messages] + system_message = messages.find { |m| m[:role] == "system" } + + assert_equal "Custom redaction rules", system_message[:content] + end + + private + + def openai_response(content) + { "choices" => [{ "message" => { "content" => content } }] } + end + + def anthropic_response(content) + { "content" => [{ "text" => content }] } + end + + def ollama_response(content) + { "message" => { "content" => content } } + end + end + end +end From 4cf7c0146170ca67390e64747b736d3f2daeb6c7 Mon Sep 17 00:00:00 2001 From: dpaluy Date: Wed, 7 Jan 2026 08:19:29 -0600 Subject: [PATCH 8/8] feat: add LlmRedactionJob for async processing and llm_redactor config (#49, #50) Background job for async LLM-based PII redaction: - Retry policy: 3 attempts with exponential backoff - Updates metadata on success/failure - Skips already-processed interactions - Pessimistic lock prevents race conditions - Handles nil payloads gracefully Also adds `config.llm_redactor` accessor to Config for configuring the LLM-based redactor instance. 
Closes #49, Closes #50 --- app/jobs/tracebook/llm_redaction_job.rb | 121 +++++++++++++ lib/tracebook/config.rb | 18 ++ test/jobs/llm_redaction_job_test.rb | 225 ++++++++++++++++++++++++ 3 files changed, 364 insertions(+) create mode 100644 app/jobs/tracebook/llm_redaction_job.rb create mode 100644 test/jobs/llm_redaction_job_test.rb diff --git a/app/jobs/tracebook/llm_redaction_job.rb b/app/jobs/tracebook/llm_redaction_job.rb new file mode 100644 index 0000000..4f6c8fa --- /dev/null +++ b/app/jobs/tracebook/llm_redaction_job.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: true + +module Tracebook + # Background job for async LLM-based PII redaction. + # + # This job processes interactions that need additional LLM-based redaction + # after initial pattern-based redaction. Useful for catching PII that + # patterns miss (names, addresses, context-sensitive data). + # + # Uses pessimistic locking to prevent race conditions and supports + # polynomial backoff retry on failures. + # + # @example Enqueue for async processing + # LlmRedactionJob.perform_later(interaction.id) + # + # @example From retry_async failure mode + # # When LLMBased redactor fails with on_failure: :retry_async, + # # this job can be enqueued to retry later + # LlmRedactionJob.perform_later(interaction.id) + class LlmRedactionJob < ApplicationJob + queue_as :tracebook_llm_redaction + + # Retry configuration + RETRY_CONFIG = { + max_attempts: 3, + base_delay: 30.seconds, + max_delay: 5.minutes + }.freeze + + # Retry on standard errors with polynomial backoff + retry_on StandardError, + wait: :polynomially_longer, + attempts: RETRY_CONFIG[:max_attempts] + + # Don't retry if interaction was deleted + discard_on ActiveRecord::RecordNotFound + + # @param interaction_id [Integer] ID of interaction to process + def perform(interaction_id) + interaction = Interaction.lock.find(interaction_id) + + # Skip if already processed successfully + return if llm_redaction_complete?(interaction) + + redactor = 
Tracebook.config.llm_redactor + return mark_skipped(interaction, "no_llm_redactor_configured") unless redactor + + process_redaction(interaction, redactor) + rescue StandardError => error + record_failure(interaction, error) if interaction + raise # Let ActiveJob handle retry + end + + private + + def llm_redaction_complete?(interaction) + status = interaction.metadata&.dig("llm_redaction_status") + status == "success" + end + + def process_redaction(interaction, redactor) + # Redact request payload + if interaction.request_payload.present? + request_json = JSON.generate(interaction.request_payload) + redacted_request, = redactor.call(request_json, audit: RedactionAudit.new) + interaction.request_payload = JSON.parse(redacted_request) + end + + # Redact response payload + if interaction.response_payload.present? + response_json = JSON.generate(interaction.response_payload) + redacted_response, = redactor.call(response_json, audit: RedactionAudit.new) + interaction.response_payload = JSON.parse(redacted_response) + end + + # Redact text fields + if interaction.request_text.present? + interaction.request_text, = redactor.call(interaction.request_text, audit: RedactionAudit.new) + end + + if interaction.response_text.present? + interaction.response_text, = redactor.call(interaction.response_text, audit: RedactionAudit.new) + end + + # Update metadata with success status + update_metadata(interaction, { + "llm_redaction_status" => "success", + "llm_redacted_at" => Time.current.iso8601 + }) + + interaction.save! + end + + def mark_skipped(interaction, reason) + update_metadata(interaction, { + "llm_redaction_status" => "skipped", + "llm_redaction_reason" => reason + }) + interaction.save! 
+ end + + def record_failure(interaction, error) + current_attempts = interaction.metadata&.dig("llm_redaction_attempts").to_i + + update_metadata(interaction, { + "llm_redaction_status" => "failed", + "llm_redaction_error" => error.message, + "llm_redaction_attempts" => current_attempts + 1, + "llm_redaction_last_attempt" => Time.current.iso8601 + }) + interaction.save! + rescue StandardError => save_error + # Log but don't raise - the original error should propagate + Rails.logger.error("[TraceBook] Failed to record LLM redaction failure: #{save_error.message}") + end + + def update_metadata(interaction, updates) + interaction.metadata = (interaction.metadata || {}).merge(updates) + end + end +end diff --git a/lib/tracebook/config.rb b/lib/tracebook/config.rb index c267dea..7ae18f3 100644 --- a/lib/tracebook/config.rb +++ b/lib/tracebook/config.rb @@ -97,6 +97,23 @@ class Config # config.actor_display = ->(actor) { actor.full_name } attr_accessor :actor_display + # @!attribute [rw] llm_redactor + # @return [Redactors::LLMBased, nil] Optional LLM-based redactor for advanced PII detection + # When configured, can be used for async LLM redaction via {LlmRedactionJob}. 
+ # @example Configure with OpenAI + # config.llm_redactor = Tracebook::Redactors::LLMBased.new( + # provider: :openai, + # model: "gpt-4o-mini", + # mode: :sync, + # on_failure: :log_and_continue + # ) + # @example Configure with local Ollama (privacy-preserving) + # config.llm_redactor = Tracebook::Redactors::LLMBased.new( + # provider: :ollama, + # model: "llama3.2" + # ) + attr_accessor :llm_redactor + # @!attribute [r] enabled_patterns # @return [Array] Pattern symbols enabled via redact DSL attr_reader :enabled_patterns @@ -120,6 +137,7 @@ def initialize @auto_subscribe_active_agent = false @per_page = 100 @actor_display = nil + @llm_redactor = nil @enabled_patterns = [] @custom_patterns = [] end diff --git a/test/jobs/llm_redaction_job_test.rb b/test/jobs/llm_redaction_job_test.rb new file mode 100644 index 0000000..665b896 --- /dev/null +++ b/test/jobs/llm_redaction_job_test.rb @@ -0,0 +1,225 @@ +require "test_helper" + +module Tracebook + class LlmRedactionJobTest < ActiveSupport::TestCase + include ActiveJob::TestHelper + + # Mock LLM redactor for testing + class MockRedactor + attr_accessor :replacement, :should_fail + + def initialize(replacement: "[LLM_REDACTED]") + @replacement = replacement + @should_fail = false + end + + def call(text, audit: nil, field_path: nil) + raise "Mock LLM error" if should_fail + + # Simple mock: replace "SECRET" with replacement + redacted = text.gsub(/SECRET/, replacement) + [ redacted, audit || RedactionAudit.new ] + end + end + + setup do + clear_enqueued_jobs + clear_performed_jobs + TraceBook.reset_configuration! + Interaction.delete_all + @mock_redactor = MockRedactor.new + end + + teardown do + clear_enqueued_jobs + clear_performed_jobs + TraceBook.reset_configuration! 
+ Interaction.delete_all + end + + test "skips when no LLM redactor configured" do + TraceBook.configure { |c| c.persist_async = false } + interaction = create_interaction(request_text: "Some SECRET data") + + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "skipped", interaction.metadata["llm_redaction_status"] + assert_equal "no_llm_redactor_configured", interaction.metadata["llm_redaction_reason"] + assert_equal "Some SECRET data", interaction.request_text # Unchanged + end + + test "redacts text fields with LLM redactor" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + + interaction = create_interaction( + request_text: "My SECRET password", + response_text: "Your SECRET is safe" + ) + + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "success", interaction.metadata["llm_redaction_status"] + assert_equal "My [LLM_REDACTED] password", interaction.request_text + assert_equal "Your [LLM_REDACTED] is safe", interaction.response_text + assert_not_nil interaction.metadata["llm_redacted_at"] + end + + test "redacts payload fields with LLM redactor" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + + interaction = create_interaction( + request_payload: { "message" => "Tell me the SECRET" }, + response_payload: { "content" => "The SECRET is 42" } + ) + + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "success", interaction.metadata["llm_redaction_status"] + assert_includes interaction.request_payload.to_json, "[LLM_REDACTED]" + assert_includes interaction.response_payload.to_json, "[LLM_REDACTED]" + assert_not_includes interaction.request_payload.to_json, "SECRET" + end + + test "skips already processed interactions" do + TraceBook.configure do |c| + c.persist_async 
= false + c.llm_redactor = @mock_redactor + end + + interaction = create_interaction( + request_text: "Some SECRET data", + metadata: { "llm_redaction_status" => "success" } + ) + + # Job should return early without calling redactor + @mock_redactor.should_fail = true # Would fail if called + + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "success", interaction.metadata["llm_redaction_status"] + assert_equal "Some SECRET data", interaction.request_text # Unchanged + end + + test "records failure on redaction error" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + @mock_redactor.should_fail = true + + interaction = create_interaction(request_text: "Some SECRET data") + + # perform_now with retry_on configured may not re-raise, but should record failure + LlmRedactionJob.perform_now(interaction.id) + + interaction.reload + assert_equal "failed", interaction.metadata["llm_redaction_status"] + assert_equal "Mock LLM error", interaction.metadata["llm_redaction_error"] + assert_equal 1, interaction.metadata["llm_redaction_attempts"] + assert_not_nil interaction.metadata["llm_redaction_last_attempt"] + end + + test "increments attempt count on repeated failures" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + @mock_redactor.should_fail = true + + interaction = create_interaction( + request_text: "Some SECRET data", + metadata: { "llm_redaction_attempts" => 1 } + ) + + # perform_now with retry_on configured may not re-raise + LlmRedactionJob.perform_now(interaction.id) + + interaction.reload + assert_equal 2, interaction.metadata["llm_redaction_attempts"] + end + + test "discards job when interaction not found" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + + # Should not raise - just discarded + assert_nothing_raised do + perform_enqueued_jobs do + 
LlmRedactionJob.perform_later(999999) + end + end + end + + test "uses pessimistic lock to prevent race conditions" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + + interaction = create_interaction(request_text: "Some SECRET data") + + # Verify the job uses .lock method (can't easily test concurrency in MiniTest) + # This test verifies the job completes successfully with lock + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "success", interaction.metadata["llm_redaction_status"] + end + + test "handles nil payloads gracefully" do + TraceBook.configure do |c| + c.persist_async = false + c.llm_redactor = @mock_redactor + end + + interaction = create_interaction( + request_payload: nil, + response_payload: nil, + request_text: "SECRET data", + response_text: nil + ) + + perform_enqueued_jobs do + LlmRedactionJob.perform_later(interaction.id) + end + + interaction.reload + assert_equal "success", interaction.metadata["llm_redaction_status"] + assert_equal "[LLM_REDACTED] data", interaction.request_text + end + + private + + def create_interaction(attrs = {}) + defaults = { + provider: "openai", + model: "gpt-4o", + status: :success, + metadata: {} + } + Interaction.create!(defaults.merge(attrs)) + end + end +end