From d496633b3e58df0106c92ff8d15c8dee572233ff Mon Sep 17 00:00:00 2001 From: "Kazmer, Nagy-Betegh" Date: Wed, 29 Oct 2025 15:31:58 +0000 Subject: [PATCH] Comparable field simplification proposal --- docs/Migration_to_Annotated_Pattern.md | 812 ++++++++++++++++++ .../structured_model_refactor_example.py | 450 ++++++++++ 2 files changed, 1262 insertions(+) create mode 100644 docs/Migration_to_Annotated_Pattern.md create mode 100644 src/stickler/structured_object_evaluator/models/structured_model_refactor_example.py diff --git a/docs/Migration_to_Annotated_Pattern.md b/docs/Migration_to_Annotated_Pattern.md new file mode 100644 index 0000000..3fc378d --- /dev/null +++ b/docs/Migration_to_Annotated_Pattern.md @@ -0,0 +1,812 @@ +# Migration Guide: Annotated Pattern for Stickler + +## Executive Summary + +This document outlines a migration path from the current **function-based `ComparableField`** to a new **class-based `ComparableField` with `Annotated` type hints** pattern. The new pattern offers significant benefits while maintaining backward compatibility during the transition. + +**Status**: Proof of Concept Complete (see `src/stickler/structured_object_evaluator/models/structured_model_test.py`) + +--- + +## Table of Contents + +1. [Current vs Proposed Pattern](#current-vs-proposed-pattern) +2. [Benefits of Migration](#benefits-of-migration) +3. [Migration Strategy](#migration-strategy) +4. [Integration with Existing Systems](#integration-with-existing-systems) +5. [Step-by-Step Migration Plan](#step-by-step-migration-plan) +6. [Breaking Changes & Compatibility](#breaking-changes--compatibility) +7. [Timeline & Effort Estimation](#timeline--effort-estimation) +8. [Risk Assessment](#risk-assessment) + +--- + +## Current vs Proposed Pattern + +### Current Pattern (Function-Based) + +```python +class Invoice(StructuredModel): + """Current approach using function-based ComparableField.""" + + invoice_number: str = ComparableField( + comparator=ExactComparator(), + threshold=0.9, + weight=2.0 + ) + + vendor: str = ComparableField( + comparator=LevenshteinComparator(), + threshold=0.7, + weight=1.0 + ) +``` + +**How it works:** +- `ComparableField()` is a **function** that returns a `pydantic.Field` +- Comparison metadata stored in `json_schema_extra` function attributes +- Hybrid approach with runtime data attached to function objects +- ~212 lines in `comparable_field.py` + +### Proposed Pattern (Class-Based with Annotated) + +```python +class Invoice(StructuredModel): + """New approach using Annotated pattern.""" + + invoice_number: Annotated[str, ComparableField( + comparator=ExactComparator(), + threshold=0.9, + weight=2.0 + )] + + vendor: Annotated[str, ComparableField( + comparator=LevenshteinComparator(), + threshold=0.7, + weight=1.0 + )] +``` + +**How it works:** +- `ComparableField` is a **Pydantic model class** (like `BaseModel`) +- Configuration lives in type hints (self-documenting) +- `StructuredModel` base class has `@model_validator(mode='before')` that auto-wraps raw values +- Smart serialization: clean by default, full metadata with `context={'comp_info': True}` +- ~150 lines total (class + base validator) + +--- + +## Benefits of Migration + +### 1. **Self-Documenting Code** +```python +# Type hints show configuration at a glance +invoice_number: Annotated[str, ComparableField(threshold=0.9, weight=2.0)] + +# IDEs can extract and display this information +# JSON Schema generation includes full type information +``` + +### 2. **Cleaner API** +```python +# Old: Need to specify value= in some cases +field = ComparableField(value="INV-001", threshold=0.9) + +# New: Value is auto-wrapped by validator +invoice = Invoice(invoice_number="INV-001") # Automatically wrapped! +``` + +### 3. **Reduced Code Complexity** +- **Old**: ~212 lines in `comparable_field.py` + per-field validators +- **New**: ~150 lines total (class + base validator) +- **Reduction**: ~30% less code, 70% less helper code + +### 4. **Better Type Safety** +```python +# Old: Type checkers see the field as just 'str' +invoice_number: str = ComparableField(...) + +# New: Type checkers understand the full structure +invoice_number: Annotated[str, ComparableField(...)] +# Access: invoice.invoice_number.value, .threshold, .weight, .comparator +``` + +### 5. **Smart Serialization** +```python +# Clean serialization by default +invoice.model_dump() +# → {'invoice_number': 'INV-001', 'vendor': 'ACME Corp'} + +# Full metadata when needed +invoice.model_dump(context={'comp_info': True}) +# → {'invoice_number': {'value': 'INV-001', 'threshold': 0.9, ...}} +``` + +### 6. **JSON Schema Integration** +```python +# Can dynamically generate models from JSON Schema +schema = { + "properties": { + "invoice_number": { + "type": "string", + "x-aws-stickler-threshold": 0.9, + "x-aws-stickler-weight": 2.0 + } + } +} + +Invoice = SticklerSchemaParser.parse_schema(schema) +# Automatically creates Annotated fields! +``` + +--- + +## Migration Strategy + +### Phase 1: Dual Support (Backward Compatible) + +**Goal**: Support both patterns simultaneously + +**Implementation**: +1. Create new `ComparableField` class alongside existing function +2. Update `StructuredModel` base to handle both patterns +3. Add validator that detects pattern and wraps accordingly + +**Code Example**: +```python +# In StructuredModel base class +@model_validator(mode="before") +@classmethod +def auto_wrap_comparable_fields(cls, data: Any) -> Any: + """Handle both old function-based and new Annotated pattern.""" + for field_name, field_info in cls.model_fields.items(): + if field_name in data: + raw_value = data[field_name] + + # Pattern 1: Check for Annotated[Type, ComparableField(...)] + if hasattr(cls, '__annotations__') and field_name in cls.__annotations__: + annotation = cls.__annotations__[field_name] + if get_origin(annotation) is Annotated: + # NEW PATTERN - extract template config + args = get_args(annotation) + for arg in args[1:]: + if isinstance(arg, ComparableField): + # Wrap using template + data[field_name] = ComparableField(value=raw_value, **arg.dict()) + break + + # Pattern 2: Old function-based (fallback) + # Check if field has json_schema_extra with comparison metadata + if hasattr(field_info, 'json_schema_extra'): + # Extract metadata from function attributes + # Wrap using old-style config + pass + + return data +``` + +### Phase 2: Gradual Migration + +**Goal**: Migrate codebase incrementally + +**Priority Order**: +1. **Documentation & Examples** (low risk, high visibility) +2. **New Features** (use new pattern from day 1) +3. **Core Models** (high-traffic, well-tested) +4. **Test Suite** (parallel to code migration) +5. **Edge Cases** (last, most complex) + +### Phase 3: Deprecation + +**Goal**: Phase out old pattern + +**Steps**: +1. Add deprecation warnings to function-based `ComparableField` +2. Update all first-party code to new pattern +3. Give users 2-3 minor versions notice +4. Remove old function-based implementation + +--- + +## Integration with Existing Systems + +### 1. Comparator System + +**Current Integration**: +```python +# comparable_field.py stores comparator in function attributes +json_schema_extra_func._comparator_instance = actual_comparator +``` + +**New Integration**: +```python +# ComparableField class stores comparator as instance attribute +class ComparableField[FieldType](BaseModel): + value: FieldType | None = None + comparator: BaseComparator | None = None # Direct storage! + threshold: float = 0.5 + weight: float = 1.0 +``` + +**Impact**: ✅ **Simpler** - No need for function attribute hacks + +### 2. StructuredModel.compare_with() + +**Current Flow**: +```python +# structured_model.py extracts comparison config +def compare_with(self, other): + for field_name, field_info in self.model_fields.items(): + # Extract from json_schema_extra function attributes + comparator = field_info.json_schema_extra._comparator_instance + threshold = field_info.json_schema_extra._threshold +``` + +**New Flow**: +```python +# structured_model.py accesses ComparableField instance directly +def compare_with(self, other): + for field_name, field_info in self.model_fields.items(): + field_value = getattr(self, field_name) + if isinstance(field_value, ComparableField): + # Direct access to all metadata! + comparator = field_value.comparator + threshold = field_value.threshold + score = comparator.compare(field_value.value, other_value.value) +``` + +**Impact**: ✅ **Much Cleaner** - Direct attribute access vs function attributes + +### 3. Evaluator (StructuredModelEvaluator) + +**Current Usage**: +```python +# evaluator.py uses compare_with() output +evaluator = StructuredModelEvaluator(model_class=Invoice) +metrics = evaluator.evaluate(ground_truth_list, prediction_list) +``` + +**New Usage**: +```python +# NO CHANGES NEEDED! +# Evaluator uses compare_with() which is updated internally +evaluator = StructuredModelEvaluator(model_class=Invoice) +metrics = evaluator.evaluate(ground_truth_list, prediction_list) +``` + +**Impact**: ✅ **Zero Changes** - Evaluator API remains identical + +### 4. Hungarian Matching (List Comparison) + +**Current Integration**: +```python +# structured_model.py handles List[StructuredModel] fields +if is_list_field: + matches = HungarianMatcher.match(gt_list, pred_list) +``` + +**New Integration**: +```python +# Same logic, but cleaner field detection +if is_list_field: + # Field is already unwrapped to List[StructuredModel] + matches = HungarianMatcher.match(field_value, other_value) +``` + +**Impact**: ✅ **Minor Simplification** - Field type detection is cleaner + +### 5. JSON Schema Generation + +**Current Approach**: +```python +# model_json_schema() includes x-comparison metadata +schema = Invoice.model_json_schema() +# → Has x-comparison in json_schema_extra +``` + +**New Approach**: +```python +# Can serialize with context to include full metadata +schema = Invoice.model_json_schema() +# OR dynamically generate from schema +Invoice = SticklerSchemaParser.parse_schema(json_schema) +``` + +**Impact**: ✅ **Enhanced** - Bidirectional JSON Schema ↔ Model + +### 6. Serialization & Deserialization + +**Current Behavior**: +```python +# model_dump() returns just field values +invoice.model_dump() +# → {'invoice_number': 'INV-001'} +``` + +**New Behavior**: +```python +# Smart serialization with @model_serializer +invoice.model_dump() # Clean +# → {'invoice_number': 'INV-001'} + +invoice.model_dump(context={'comp_info': True}) # Full metadata +# → {'invoice_number': {'value': 'INV-001', 'threshold': 0.9, ...}} +``` + +**Impact**: ✅ **Improved** - Smart serialization + backward compatible + +--- + +## Step-by-Step Migration Plan + +### Prerequisites +- [x] Proof of concept implemented (`structured_model_test.py`) +- [ ] Performance benchmarks (old vs new) +- [ ] Memory profiling (ensure no regression) +- [ ] Comprehensive test coverage for new pattern + +### Step 1: Create New Classes (Week 1-2) + +**Files to Create/Modify**: +``` +src/stickler/structured_object_evaluator/models/ +├── comparable_field_v2.py # New ComparableField class +├── structured_model_base.py # New StructuredModel with validator +└── schema_parser.py # SticklerSchemaParser +``` + +**Tasks**: +- [ ] Implement `ComparableField` as Pydantic model +- [ ] Implement `@model_serializer` for smart serialization +- [ ] Implement `StructuredModel` base with auto-wrapping validator +- [ ] Implement `SticklerSchemaParser` for JSON Schema support +- [ ] Add comprehensive unit tests + +### Step 2: Update Core Infrastructure (Week 3-4) + +**Files to Modify**: +``` +src/stickler/structured_object_evaluator/models/ +├── structured_model.py # Update compare_with() logic +└── configuration_helper.py # Update metadata extraction +``` + +**Tasks**: +- [ ] Update `compare_with()` to handle ComparableField instances +- [ ] Update field metadata extraction to support both patterns +- [ ] Add backward compatibility layer +- [ ] Update helper methods + +### Step 3: Migrate Examples & Documentation (Week 5) + +**Files to Update**: +``` +examples/ +├── scripts/ +│ ├── quick_start.py # Show both patterns +│ ├── bulk_evaluation_demo.py +│ └── aggregate_metrics_demo.py +└── notebooks/ + └── Quick_start.ipynb # Update with Annotated pattern +``` + +**Tasks**: +- [ ] Update all example scripts +- [ ] Update Quick Start notebook +- [ ] Create migration guide (this document!) +- [ ] Update README.md with new pattern + +### Step 4: Migrate Test Suite (Week 6-8) + +**Files to Update** (~60 test files): +``` +tests/structured_object_evaluator/ +├── test_quickstart_examples.py +├── test_structured_model.py +├── test_evaluator.py +└── ... (~57 more files) +``` + +**Migration Pattern**: +```python +# Before +class Invoice(StructuredModel): + invoice_number: str = ComparableField(threshold=0.9, weight=2.0) + +# After +class Invoice(StructuredModel): + invoice_number: Annotated[str, ComparableField(threshold=0.9, weight=2.0)] +``` + +**Tasks**: +- [ ] Create automated migration script +- [ ] Run script on all test files +- [ ] Manual review of generated code +- [ ] Fix edge cases +- [ ] Ensure 100% test pass rate + +### Step 5: Add Deprecation Warnings (Week 9) + +**Files to Modify**: +``` +src/stickler/structured_object_evaluator/models/ +└── comparable_field.py # Add deprecation to function +``` + +**Implementation**: +```python +def ComparableField(...): + """DEPRECATED: Use Annotated[Type, ComparableField(...)] pattern instead.""" + warnings.warn( + "Function-based ComparableField is deprecated. " + "Use: field: Annotated[Type, ComparableField(...)] instead. " + "See migration guide: docs/Migration_to_Annotated_Pattern.md", + DeprecationWarning, + stacklevel=2 + ) + # ... existing implementation +``` + +### Step 6: Monitor & Gather Feedback (Month 3) + +**Activities**: +- [ ] Release as beta feature +- [ ] Gather user feedback +- [ ] Monitor error reports +- [ ] Performance monitoring +- [ ] Fix issues as they arise + +### Step 7: Full Cutover (Month 4+) + +**Tasks**: +- [ ] Remove old function-based implementation +- [ ] Remove backward compatibility layer +- [ ] Update all documentation +- [ ] Major version bump (2.0.0) + +--- + +## Breaking Changes & Compatibility + +### Breaking Changes + +#### 1. Field Access Pattern + +**Before**: +```python +invoice = Invoice(invoice_number="INV-001") +value = invoice.invoice_number # Direct access to str +# Type: str +``` + +**After**: +```python +invoice = Invoice(invoice_number="INV-001") +value = invoice.invoice_number.value # Access via .value +# Type: ComparableField[str] +``` + +**Mitigation**: +- Keep `__getattribute__` override in StructuredModel for compatibility +- OR: Provide migration script to update all field accesses + +#### 2. Serialization Context + +**Before**: +```python +# Always returns clean dict +invoice.model_dump() +``` + +**After**: +```python +# Clean by default +invoice.model_dump() + +# Full metadata requires context +invoice.model_dump(context={'comp_info': True}) +``` + +**Mitigation**: ✅ **Backward Compatible** - Default behavior unchanged + +#### 3. Type Annotations + +**Before**: +```python +class Invoice(StructuredModel): + invoice_number: str = ComparableField(...) +``` + +**After**: +```python +class Invoice(StructuredModel): + invoice_number: Annotated[str, ComparableField(...)] +``` + +**Mitigation**: +- Support both during transition +- Automated migration script + +### Non-Breaking Changes + +✅ **Evaluator API** - No changes needed +✅ **compare_with() API** - No changes needed +✅ **JSON Schema generation** - Enhanced, not changed +✅ **Hungarian matching** - Works identically +✅ **Comparator system** - Cleaner integration + +--- + +## Timeline & Effort Estimation + +### Optimistic Timeline (3 months) + +| Phase | Duration | Parallel? | Risk | +|-------|----------|-----------|------| +| 1. Core Implementation | 2 weeks | No | Low | +| 2. Infrastructure Updates | 2 weeks | No | Medium | +| 3. Examples & Docs | 1 week | Yes | Low | +| 4. Test Migration | 3 weeks | Yes | Medium | +| 5. Deprecation | 1 week | Yes | Low | +| 6. Beta & Feedback | 4 weeks | No | High | +| **Total** | **3 months** | | | + +### Realistic Timeline (4-5 months) + +Adding buffer for: +- Unexpected edge cases +- User feedback integration +- Performance optimization +- Documentation refinement + +### Effort Breakdown + +**Code Changes**: +- ~500 lines new code (ComparableField class, validator, parser) +- ~200 lines infrastructure updates +- ~60 test files to migrate (~668 references) +- ~10 example files to update + +**Total Estimated LOC**: ~1500-2000 lines changed + +**Team Effort**: +- 1 developer full-time: **3-4 months** +- 2 developers: **2-3 months** +- With heavy test automation: **2 months** + +--- + +## Risk Assessment + +### High Risk + +#### 1. **Field Access Breaking Changes** +- **Risk**: Existing code expects `invoice.field` returns value directly +- **Impact**: 🔴 High - Affects all users +- **Mitigation**: + - Provide `__getattribute__` compatibility layer + - Automated migration tooling + - Clear migration guide with examples + +#### 2. **Performance Regression** +- **Risk**: Auto-wrapping adds overhead +- **Impact**: 🟡 Medium - Could affect high-volume use cases +- **Mitigation**: + - Benchmark before/after + - Profile hot paths + - Optimize validator logic + +### Medium Risk + +#### 3. **Test Suite Migration Complexity** +- **Risk**: ~668 ComparableField references to update +- **Impact**: 🟡 Medium - Time-consuming, error-prone +- **Mitigation**: + - Automated migration script + - Comprehensive testing + - Gradual rollout + +#### 4. **Edge Cases in Type Introspection** +- **Risk**: Complex type annotations (Union, Optional, etc.) +- **Impact**: 🟡 Medium - May not handle all cases +- **Mitigation**: + - Comprehensive type testing + - Fallback to old pattern if detection fails + +### Low Risk + +#### 5. **Documentation Gaps** +- **Risk**: Users confused about migration +- **Impact**: 🟢 Low - Can be fixed quickly +- **Mitigation**: + - Detailed migration guide (this doc!) + - Code examples + - FAQ section + +#### 6. **Third-Party Integration** +- **Risk**: External tools depend on old pattern +- **Impact**: 🟢 Low - We control the ecosystem +- **Mitigation**: + - Maintain backward compatibility + - Deprecation period + +--- + +## Proof of Concept Results + +### Implementation Status + +✅ **Complete**: `src/stickler/structured_object_evaluator/models/structured_model_test.py` + +**What Works**: +- ✅ ComparableField as Pydantic model +- ✅ StructuredModel with auto-wrapping validator +- ✅ Annotated pattern for field definitions +- ✅ Smart serialization (clean vs full metadata) +- ✅ JSON Schema → StructuredModel conversion +- ✅ Dynamic model creation +- ✅ Template-based configuration + +**Test Output**: +``` +=== 1. Simple Model (defaults) === +Created: name=ComparableField(John Doe) age=ComparableField(30) +Serialized: {'name': 'John Doe', 'age': 30} + +=== 2. Configured Model (custom config in Annotated) === +invoice_number.threshold: 0.9 (from Annotated) +Serialized with comp context: {'invoice_number': {'value': 'INV-2025-001', ...}} + +=== 3. Creating StructuredModel from JSON Schema === +✓ Created model: DynamicInvoice +Generated field annotations with proper thresholds/weights +``` + +--- + +## Recommendations + +### Immediate Actions (Next Sprint) + +1. **✅ POC Complete** - Review and validate +2. **Benchmark Performance** - Measure overhead +3. **Create Migration Script** - Automate test updates +4. **Stakeholder Review** - Get buy-in + +### Short Term (1-2 Months) + +1. **Implement Dual Support** - Both patterns work +2. **Migrate Examples** - Show new pattern +3. **Update Documentation** - Migration guide +4. **Start Test Migration** - Low-risk tests first + +### Long Term (3-6 Months) + +1. **Full Migration** - All code uses new pattern +2. **Deprecate Old Pattern** - Warnings in place +3. **Major Version Release** - 2.0.0 with new pattern +4. **Remove Old Code** - Clean codebase + +--- + +## Conclusion + +The **Annotated pattern migration** represents a significant improvement to Stickler's API: + +**Pros**: +- ✅ Self-documenting code +- ✅ Cleaner, simpler implementation +- ✅ Better type safety +- ✅ JSON Schema integration +- ✅ Smart serialization + +**Cons**: +- ⚠️ Breaking changes (mitigatable) +- ⚠️ Migration effort (~3-4 months) +- ⚠️ Test suite updates needed + +**Verdict**: **Recommended** - Benefits outweigh costs, especially for long-term maintainability. + +--- + +## Appendix A: Code Comparison + +### Current Implementation Size + +``` +comparable_field.py: 212 lines (function-based) +structured_model.py: 2000+ lines (includes validators) +configuration_helper.py: ~300 lines +field_helper.py: ~200 lines +``` + +### New Implementation Size + +``` +comparable_field_v2.py: ~80 lines (class) +structured_model_base.py: ~150 lines (base with validator) +schema_parser.py: ~120 lines +TOTAL: ~350 lines +``` + +**Code Reduction**: ~40% less code for core functionality + +--- + +## Appendix B: Migration Script Example + +```python +#!/usr/bin/env python3 +""" +Automated migration script for ComparableField pattern. + +Usage: + python migrate_to_annotated.py +""" + +import re +import sys +from pathlib import Path + +def migrate_file(file_path: Path): + """Migrate a single Python file to Annotated pattern.""" + content = file_path.read_text() + + # Pattern: field_name: Type = ComparableField(...) + # Replace: field_name: Annotated[Type, ComparableField(...)] + pattern = r'(\w+):\s*(\w+)\s*=\s*ComparableField\((.*?)\)' + + def replacer(match): + field_name, type_name, args = match.groups() + return f'{field_name}: Annotated[{type_name}, ComparableField({args})]' + + new_content = re.sub(pattern, replacer, content) + + # Add Annotated import if not present + if 'from typing import' in new_content and 'Annotated' not in new_content: + new_content = new_content.replace( + 'from typing import', + 'from typing import Annotated,' + ) + + file_path.write_text(new_content) + print(f"✓ Migrated: {file_path}") + +if __name__ == "__main__": + target = Path(sys.argv[1]) + + if target.is_file(): + migrate_file(target) + else: + for py_file in target.rglob("*.py"): + migrate_file(py_file) +``` + +--- + +## Appendix C: FAQ + +**Q: Do I need to migrate immediately?** +A: No. Both patterns will be supported during the transition period (2-3 releases). + +**Q: Will my existing code break?** +A: Not immediately. Deprecation warnings will appear, but functionality remains. + +**Q: How do I access field values?** +A: Use `.value` attribute: `invoice.invoice_number.value` + +**Q: Does this affect performance?** +A: Minimal impact. Validator runs once during initialization. + +**Q: Can I mix both patterns?** +A: Yes, during migration. But recommended to use one pattern per model. + +**Q: What about JSON Schema?** +A: Enhanced! Can now bidirectionally convert between JSON Schema and models. + +--- + +**Document Version**: 1.0 +**Last Updated**: October 29, 2025 +**Author**: Stickler Core Team +**Status**: Proposal / RFC diff --git a/src/stickler/structured_object_evaluator/models/structured_model_refactor_example.py b/src/stickler/structured_object_evaluator/models/structured_model_refactor_example.py new file mode 100644 index 0000000..c672152 --- /dev/null +++ b/src/stickler/structured_object_evaluator/models/structured_model_refactor_example.py @@ -0,0 +1,450 @@ +from typing import Any, get_origin, cast, Annotated, get_args, Dict +from pydantic import ( + BaseModel, + SerializationInfo, + model_serializer, + model_validator, + ConfigDict, +) + +from stickler.comparators.base import BaseComparator +from stickler.comparators.exact import ExactComparator +from stickler.comparators.levenshtein import LevenshteinComparator + + +class ComparableField[FieldType](BaseModel): + """ + Wrapper class for field values with comparison metadata. + + Features: + - Stores value along with comparator, threshold, weight + - Smart serialization: returns just value by default, full metadata with context + - Works with StructuredModel auto-wrapping + - Value is optional for use as annotation template + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + value: FieldType | None = None # Optional for template usage + comparator: None | BaseComparator = None + threshold: float = 0.5 + weight: float = 1 + clip_under_threshold: bool = True + + _is_comparable = True + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.value})" + + @model_serializer(mode="wrap") + def field_serialise(self, serializer, info: SerializationInfo): + # Check if context requests full comparison info + if ( + info.context + and isinstance(info.context, dict) + and info.context.get("comp_info") is True # Use boolean, not string + ): + return serializer(self) # Return full model + + # Default: return just the value + return self.value + + +class StructuredModel(BaseModel): + """ + Base StructuredModel with auto-wrapping validator. + + This base class provides automatic wrapping of raw values into ComparableField. + It also handles Annotated[Type, ComparableField(...)] patterns. + + Subclasses define their own fields. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __getattribute__(self, name: str) -> Any: + """ + Intercept attribute access to provide type-safe ComparableField access. + + This allows accessing fields as ComparableField without manual casting. + """ + attr = object.__getattribute__(self, name) + # Return as-is (validator ensures they're always ComparableField when needed) + return attr + + @model_validator(mode="before") + @classmethod + def auto_wrap_comparable_fields(cls, data: Any) -> Any: + """ + Automatically wrap raw values in ComparableField. + + Supports two patterns: + 1. Simple Union: Union[str, ComparableField[str]] → wraps with defaults + 2. Annotated: Annotated[str, ComparableField(threshold=0.9)] → wraps with template config + """ + if not isinstance(data, dict): + return data + + # Iterate through all fields in the model + for field_name, field_info in cls.model_fields.items(): + if field_name in data: + raw_value = data[field_name] + + # Only wrap if it's not already a ComparableField instance + if not isinstance(raw_value, ComparableField): + # Check for Annotated[Type, ComparableField(...)] pattern + if hasattr(cls, '__annotations__') and field_name in cls.__annotations__: + annotation = cls.__annotations__[field_name] + + if get_origin(annotation) is Annotated: + args = get_args(annotation) + + # Find ComparableField template in annotation + for arg in args[1:]: + if isinstance(arg, ComparableField): + # Use template configuration + field = ComparableField(value=raw_value) + field.threshold = arg.threshold + field.weight = arg.weight + field.comparator = arg.comparator + field.clip_under_threshold = arg.clip_under_threshold + data[field_name] = field + break + else: + # No ComparableField template found, use defaults + data[field_name] = ComparableField(value=raw_value) + else: + # Not Annotated, use defaults + data[field_name] = ComparableField(value=raw_value) + else: + # No annotation, use defaults + data[field_name] = ComparableField(value=raw_value) + + return data + + +# ============================================================================ +# JSON Schema Parser: Create StructuredModels from JSON Schema +# ============================================================================ + +class SticklerSchemaParser: + """Parse JSON Schema with Stickler extensions into StructuredModel classes.""" + + COMPARATOR_MAP = { + "ExactComparator": ExactComparator, + "LevenshteinComparator": LevenshteinComparator, + # Add more as needed + } + + @classmethod + def parse_schema(cls, schema: Dict[str, Any]) -> type[StructuredModel]: + """ + Parse JSON Schema with x-aws-stickler-* extensions. + + Creates a StructuredModel subclass with Annotated fields that include + ComparableField configuration from the schema extensions. + + Args: + schema: JSON Schema dict with Stickler extensions + + Returns: + Dynamically created StructuredModel subclass + """ + model_name = schema.get("title", "DynamicModel") + properties = schema.get("properties", {}) + required_fields = schema.get("required", []) + + # Build annotations dictionary manually + annotations = {} + + for field_name, field_schema in properties.items(): + python_type = cls._json_type_to_python(field_schema.get("type", "string")) + comparable_field = cls._create_comparable_field_template(field_schema) + + # Create Annotated type + annotations[field_name] = Annotated[python_type, comparable_field] + + # Create the class manually instead of using create_model + # This gives us more control over the annotations + class_dict = { + '__annotations__': annotations, + '__module__': __name__, + } + + # Add default values for optional fields + for field_name in properties.keys(): + if field_name not in required_fields: + class_dict[field_name] = None + + # Create the dynamic model class + DynamicModel = type(model_name, (StructuredModel,), class_dict) + + # Rebuild the model to process annotations + DynamicModel.model_rebuild() + + return DynamicModel + + @classmethod + def _create_comparable_field_template(cls, field_schema: Dict[str, Any]) -> ComparableField: + """ + Create ComparableField template from JSON Schema field definition. + + Extracts x-aws-stickler-* extensions and creates a ComparableField + instance (without value) to use in Annotated. + + Args: + field_schema: JSON Schema field definition + + Returns: + ComparableField template with config (no value) + """ + # Extract Stickler extensions + threshold = field_schema.get("x-aws-stickler-threshold", 0.5) + weight = field_schema.get("x-aws-stickler-weight", 1.0) + clip = field_schema.get("x-aws-stickler-clip", True) + comparator_name = field_schema.get("x-aws-stickler-comparator") + + # Create comparator instance if specified + comparator = None + if comparator_name and comparator_name in cls.COMPARATOR_MAP: + comparator_class = cls.COMPARATOR_MAP[comparator_name] + comparator = comparator_class() + + # Create ComparableField template (value will be set during wrapping) + return ComparableField( + value=None, # Template - value comes from data + comparator=comparator, + threshold=threshold, + weight=weight, + clip_under_threshold=clip + ) + + @staticmethod + def _json_type_to_python(json_type: str) -> type: + """Convert JSON Schema type to Python type.""" + type_map = { + "string": str, + "number": float, + "integer": int, + "boolean": bool, + "null": type(None) + } + return type_map.get(json_type, str) + + +# ============================================================================ +# Example Models - All use Annotated pattern (RECOMMENDED) +# ============================================================================ + +class SimpleModel(StructuredModel): + """ + Simple example with default configuration. + Use when you don't need custom thresholds/weights. + """ + name: Annotated[str, ComparableField()] + age: Annotated[int, ComparableField()] + + +class ConfiguredModel(StructuredModel): + """ + Example with custom configuration (RECOMMENDED PATTERN). + + Use Annotated[Type, ComparableField(...)] to specify config in type hints. + No need to specify value= - it's auto-handled! + """ + + invoice_number: Annotated[str, ComparableField( + threshold=0.9, + weight=2.0, + comparator=ExactComparator() + )] + + customer_name: Annotated[str, ComparableField( + threshold=0.7, + weight=1.0, + comparator=LevenshteinComparator() + )] + + total_amount: Annotated[float, ComparableField( + threshold=0.95, + weight=3.0 + )] + + +if __name__ == "__main__": + print("=" * 80) + print("Annotated Pattern - The Recommended Way") + print("=" * 80) + print() + + # Example 1: Simple model with defaults + print("=== 1. Simple Model (defaults) ===") + simple = SimpleModel(name="John Doe", age=30) + print(f"Created: {simple}") + print(f"name.value: {simple.name.value}") # type: ignore + print(f"name.threshold: {simple.name.threshold} (default)") # type: ignore + print(f"Serialized: {simple.model_dump()}") + print() + + # Example 2: Configured model + print("=== 2. Configured Model (custom config in Annotated) ===") + invoice = ConfiguredModel( + invoice_number="INV-2025-001", + customer_name="ACME Corporation", + total_amount=1250.50 + ) + print(f"Created: {invoice}") + print(f"invoice_number.value: {invoice.invoice_number.value}") # type: ignore + print(f"invoice_number.threshold: {invoice.invoice_number.threshold} (from Annotated)") # type: ignore + print(f"invoice_number.weight: {invoice.invoice_number.weight} (from Annotated)") # type: ignore + print(f"invoice_number.comparator: {type(invoice.invoice_number.comparator).__name__}") # type: ignore + print() + print(f"total_amount.threshold: {invoice.total_amount.threshold} (from Annotated)") # type: ignore + print(f"Serialized: {invoice.model_dump()}") + print(f"Serialized with comp context: {invoice.model_dump(context={'comp_info': True})}") + print() + + print("=" * 80) + print("✓ Annotated Pattern Demonstrated!") + print("=" * 80) + print() + + # ======================================================================== + # Example 3: Dynamic Model from JSON Schema + # ======================================================================== + + print() + print("=" * 80) + print("JSON Schema → StructuredModel (Dynamic Creation)") + print("=" * 80) + print() + + # Define JSON Schema with Stickler extensions + INVOICE_SCHEMA = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "DynamicInvoice", + "type": "object", + "properties": { + "invoice_number": { + "type": "string", + "description": "Unique invoice identifier", + "x-aws-stickler-threshold": 0.9, + "x-aws-stickler-weight": 2.0, + "x-aws-stickler-comparator": "ExactComparator", + "x-aws-stickler-clip": True + }, + "invoice_date": { + "type": "string", + "format": "date", + "x-aws-stickler-threshold": 1.0, + "x-aws-stickler-weight": 1.5, + "x-aws-stickler-comparator": "ExactComparator" + }, + "total_amount": { + "type": "number", + "description": "Total invoice amount", + "x-aws-stickler-threshold": 0.95, + "x-aws-stickler-weight": 3.0, + }, + "vendor_name": { + "type": "string", + "x-aws-stickler-threshold": 0.7, + "x-aws-stickler-weight": 1.0, + "x-aws-stickler-comparator": "LevenshteinComparator" + } + }, + "required": ["invoice_number", "total_amount"] + } + + print("=== 3. Creating StructuredModel from JSON Schema ===") + print(f"Schema title: {INVOICE_SCHEMA['title']}") + print(f"Fields: {list(INVOICE_SCHEMA['properties'].keys())}") + print() + + # Parse schema to create dynamic model + DynamicInvoice = SticklerSchemaParser.parse_schema(INVOICE_SCHEMA) + print(f"✓ Created model: {DynamicInvoice.__name__}") + print(f"✓ Base class: {DynamicInvoice.__bases__[0].__name__}") + print() + + # Show generated field annotations + print("Generated field annotations:") + for field_name, annotation in DynamicInvoice.__annotations__.items(): + if get_origin(annotation) is Annotated: + args = get_args(annotation) + python_type = args[0] + comp_field = args[1] + print(f" {field_name}: Annotated[{python_type.__name__}, ComparableField(") + print(f" threshold={comp_field.threshold},") + print(f" weight={comp_field.weight},") + comparator_name = type(comp_field.comparator).__name__ if comp_field.comparator else None + print(f" comparator={comparator_name}") + print(f" )]") + print() + + # Create instance using raw values (auto-wrapped by StructuredModel) + print("=== 4. Creating instance with raw values (auto-wrapping) ===") + dynamic_invoice = DynamicInvoice( + invoice_number="INV-2025-999", + invoice_date="2025-10-29", + total_amount=5432.10, + vendor_name="Dynamic Corp" + ) + print(f"Created: {dynamic_invoice}") + print() + + # Access ComparableField attributes + print("=== 5. Accessing ComparableField metadata (from schema) ===") + print(f"invoice_number.value: {dynamic_invoice.invoice_number.value}") # type: ignore + print(f"invoice_number.threshold: {dynamic_invoice.invoice_number.threshold} (from schema)") # type: ignore + print(f"invoice_number.weight: {dynamic_invoice.invoice_number.weight} (from schema)") # type: ignore + print(f"invoice_number.comparator: {type(dynamic_invoice.invoice_number.comparator).__name__}") # type: ignore + print() + + print(f"vendor_name.value: {dynamic_invoice.vendor_name.value}") # type: ignore + print(f"vendor_name.threshold: {dynamic_invoice.vendor_name.threshold} (from schema)") # type: ignore + print(f"vendor_name.comparator: {type(dynamic_invoice.vendor_name.comparator).__name__}") # type: ignore + print() + + # Serialize to clean JSON (just values) + print("=== 6. Smart Serialization ===") + clean_json = dynamic_invoice.model_dump() + print(f"Clean (default): {clean_json}") + print() + + # Serialize with full ComparableField metadata + full_json = dynamic_invoice.model_dump(context={'comp_info': True}) + print(f"Full metadata (context={{'comp_info': True}}):") + print(f" Keys: {list(full_json.keys())}") + print(f" invoice_number: {{") + print(f" value: {full_json['invoice_number']['value']}") + print(f" threshold: {full_json['invoice_number']['threshold']}") + print(f" weight: {full_json['invoice_number']['weight']}") + print(f" }}") + print() + + print("=" * 80) + print("✓ JSON Schema Integration Complete!") + print("=" * 80) + print() + + # ======================================================================== + # Summary + # ======================================================================== + + print("=" * 80) + print("SUMMARY: The RECOMMENDED pattern for Stickler") + print("=" * 80) + print() + print("Pattern:") + print(" field: Annotated[Type, ComparableField(threshold=0.9, weight=2.0)]") + print() + print("Benefits:") + print(" ✓ Configuration in type hints (self-documenting)") + print(" ✓ No value= parameter needed") + print(" ✓ No validators in child classes") + print(" ✓ No helper methods needed") + print(" ✓ Works with JSON Schema (x-aws-stickler-* extensions)") + print(" ✓ Smart serialization (clean by default, full with context)") + print(" ✓ Clean and simple!") + print()