From bdac725b5855125a7a2a4121601e1617bd625bd9 Mon Sep 17 00:00:00 2001
From: Agam More <agam@apprais.ai>
Date: Mon, 11 Aug 2025 15:34:12 -0500
Subject: [PATCH 1/2] Fix Citation serialization issue causing truncated JSON
 files

- Replace asdict() with manual dict conversion in JobResult.to_dict()
- Fix both citations and citation_mappings serialization
- Resolve "TypeError: Object of type Citation is not JSON serializable"
- Ensure complete job output files instead of truncated ones
- Tested with a comprehensive verification script
---
 batchata/core/job_result.py | 14 ++++++++++++--
 uv.lock                     |  2 +-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/batchata/core/job_result.py b/batchata/core/job_result.py
index b0077dd..3f57ba1 100644
--- a/batchata/core/job_result.py
+++ b/batchata/core/job_result.py
@@ -61,7 +61,12 @@ def to_dict(self) -> Dict[str, Any]:
         citation_mappings = None
         if self.citation_mappings:
             citation_mappings = {
-                field: [asdict(c) for c in citations]
+                field: [{
+                    'text': c.text,
+                    'source': c.source, 
+                    'page': c.page,
+                    'metadata': c.metadata
+                } for c in citations]
                 for field, citations in self.citation_mappings.items()
             }
         
@@ -69,7 +74,12 @@ def to_dict(self) -> Dict[str, Any]:
             "job_id": self.job_id,
             "raw_response": self.raw_response,
             "parsed_response": parsed_response,
-            "citations": [asdict(c) for c in self.citations] if self.citations else None,
+            "citations": [{
+                'text': c.text,
+                'source': c.source, 
+                'page': c.page,
+                'metadata': c.metadata
+            } for c in self.citations] if self.citations else None,
             "citation_mappings": citation_mappings,
             "input_tokens": self.input_tokens,
             "output_tokens": self.output_tokens,
diff --git a/uv.lock b/uv.lock
index 5dfb0b1..4165959 100644
--- a/uv.lock
+++ b/uv.lock
@@ -130,7 +130,7 @@ wheels = [
 
 [[package]]
 name = "batchata"
-version = "0.4.2"
+version = "0.4.3"
 source = { editable = "." }
 dependencies = [
     { name = "anthropic" },

From 4855e8b28a157c187f040e0a4c8647aa7d824e8d Mon Sep 17 00:00:00 2001
From: Agam More <agam@apprais.ai>
Date: Mon, 11 Aug 2025 16:37:10 -0500
Subject: [PATCH 2/2] Add unit test for citation_mappings JSON serialization

- Test specifically verifies the fix for the Citation serialization issue
- Covers to_dict() and a json.dumps/json.loads round trip
- Ensures citations inside citation_mappings are serialized as dicts, not Citation objects
- Prevents regression of the truncated JSON file issue
---
 tests/core/test_job_result.py | 85 ++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/tests/core/test_job_result.py b/tests/core/test_job_result.py
index 0ad8c70..77509fa 100644
--- a/tests/core/test_job_result.py
+++ b/tests/core/test_job_result.py
@@ -233,4 +233,87 @@ def test_default_values_in_deserialization(self):
         
         # Verify computed properties work with defaults
         assert result.total_tokens == 0
-        assert result.is_success is True  # No error = success
\ No newline at end of file
+        assert result.is_success is True  # No error = success
+    
+    def test_citation_mappings_json_serialization(self):
+        """Check that JobResult.to_dict() output with citation_mappings is JSON serializable.
+        
+        Regression test for the issue where Citation objects in citation_mappings
+        caused 'Object of type Citation is not JSON serializable' errors and truncated
+        output files. Covers to_dict(), a json.dumps/json.loads round trip and from_dict().
+        """
+        # Two citations with nested metadata dicts, similar to the ones that were causing issues
+        citations = [
+            Citation(
+                text='EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS',
+                source='test.pdf',
+                page=8,
+                metadata={'type': 'page_location', 'document_index': 0}
+            ),
+            Citation(
+                text='Market Extraction 6.21% - 7.25%',
+                source='test.pdf',
+                page=72,
+                metadata={'type': 'page_location', 'start_page_number': 72}
+            )
+        ]
+        
+        # Field name -> citations; the same Citation objects are reused across several fields
+        citation_mappings = {
+            'cap_rate': citations,
+            'occupancy': [citations[0]],
+            'address': citations
+        }
+        
+        # JobResult carrying both the flat citations list and the per-field citation_mappings
+        result = JobResult(
+            job_id="citation-mappings-test",
+            raw_response="Response with citation mappings",
+            parsed_response={'cap_rate': 7.0, 'occupancy': 99.0, 'address': '123 Test St'},
+            citations=citations,
+            citation_mappings=citation_mappings,
+            input_tokens=1000,
+            output_tokens=200,
+            cost_usd=0.15
+        )
+        
+        # Step 1: to_dict() must complete without raising
+        data = result.to_dict()
+        
+        # Step 2: the dict must survive json.dumps; all later checks use the re-parsed copy
+        json_str = json.dumps(data)
+        parsed_data = json.loads(json_str)
+        
+        # Step 3: every mapped field name is still present after the JSON round trip
+        assert 'citation_mappings' in parsed_data
+        assert 'cap_rate' in parsed_data['citation_mappings']
+        assert 'occupancy' in parsed_data['citation_mappings']
+        assert 'address' in parsed_data['citation_mappings']
+        
+        # Step 4: each mapped citation is a plain dict exposing text/source/page/metadata
+        cap_rate_citations = parsed_data['citation_mappings']['cap_rate']
+        assert len(cap_rate_citations) == 2
+        assert isinstance(cap_rate_citations[0], dict)  # Plain dict, not a Citation object
+        assert cap_rate_citations[0]['text'] == 'EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS'
+        assert cap_rate_citations[0]['source'] == 'test.pdf'
+        assert cap_rate_citations[0]['page'] == 8
+        assert cap_rate_citations[0]['metadata']['type'] == 'page_location'
+        
+        # Step 5: a field mapped to a single citation (occupancy) keeps exactly that one entry
+        occupancy_citations = parsed_data['citation_mappings']['occupancy']
+        assert len(occupancy_citations) == 1
+        assert isinstance(occupancy_citations[0], dict)
+        assert occupancy_citations[0]['text'] == 'EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS'
+        
+        # Step 6: the top-level citations list is serialized to dicts as well
+        assert 'citations' in parsed_data
+        assert len(parsed_data['citations']) == 2
+        assert isinstance(parsed_data['citations'][0], dict)
+        
+        # Step 7: from_dict() rebuilds a JobResult with the same job_id and citation counts
+        restored = JobResult.from_dict(parsed_data)
+        assert restored.job_id == result.job_id
+        assert len(restored.citations) == 2
+        assert len(restored.citation_mappings) == 3
+        assert len(restored.citation_mappings['cap_rate']) == 2
+        assert len(restored.citation_mappings['occupancy']) == 1
\ No newline at end of file