From bdac725b5855125a7a2a4121601e1617bd625bd9 Mon Sep 17 00:00:00 2001 From: Agam More Date: Mon, 11 Aug 2025 15:34:12 -0500 Subject: [PATCH 1/2] Fix Citation serialization issue causing truncated JSON files - Replace asdict() with manual dict conversion in JobResult.to_dict() - Fix both citations and citation_mappings serialization - Resolves TypeError: Object of type Citation is not JSON serializable - Ensures complete job output files instead of truncated ones - Tested with comprehensive verification script --- batchata/core/job_result.py | 14 ++++++++++++-- uv.lock | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/batchata/core/job_result.py b/batchata/core/job_result.py index b0077dd..3f57ba1 100644 --- a/batchata/core/job_result.py +++ b/batchata/core/job_result.py @@ -61,7 +61,12 @@ def to_dict(self) -> Dict[str, Any]: citation_mappings = None if self.citation_mappings: citation_mappings = { - field: [asdict(c) for c in citations] + field: [{ + 'text': c.text, + 'source': c.source, + 'page': c.page, + 'metadata': c.metadata + } for c in citations] for field, citations in self.citation_mappings.items() } @@ -69,7 +74,12 @@ def to_dict(self) -> Dict[str, Any]: "job_id": self.job_id, "raw_response": self.raw_response, "parsed_response": parsed_response, - "citations": [asdict(c) for c in self.citations] if self.citations else None, + "citations": [{ + 'text': c.text, + 'source': c.source, + 'page': c.page, + 'metadata': c.metadata + } for c in self.citations] if self.citations else None, "citation_mappings": citation_mappings, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, diff --git a/uv.lock b/uv.lock index 5dfb0b1..4165959 100644 --- a/uv.lock +++ b/uv.lock @@ -130,7 +130,7 @@ wheels = [ [[package]] name = "batchata" -version = "0.4.2" +version = "0.4.3" source = { editable = "." } dependencies = [ { name = "anthropic" }, From 4855e8b28a157c187f040e0a4c8647aa7d824e8d Mon Sep 17 00:00:00 2001 From: Agam More Date: Mon, 11 Aug 2025 16:37:10 -0500 Subject: [PATCH 2/2] Add unit test for citation_mappings JSON serialization - Test specifically verifies the fix for Citation serialization issue - Covers to_dict() and json.dumps/loads round-trip serialization - Ensures citation_mappings are properly converted to dicts, not Citation objects - Prevents regression of truncated JSON file issue --- tests/core/test_job_result.py | 85 ++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/tests/core/test_job_result.py b/tests/core/test_job_result.py index 0ad8c70..77509fa 100644 --- a/tests/core/test_job_result.py +++ b/tests/core/test_job_result.py @@ -233,4 +233,87 @@ def test_default_values_in_deserialization(self): # Verify computed properties work with defaults assert result.total_tokens == 0 - assert result.is_success is True # No error = success \ No newline at end of file + assert result.is_success is True # No error = success + + def test_citation_mappings_json_serialization(self): + """Test that citation_mappings are properly JSON serializable. + + This test specifically verifies the fix for the issue where Citation objects + in citation_mappings were causing 'Object of type Citation is not JSON serializable' + errors and truncated output files. + """ + # Create citations similar to the ones that were causing issues + citations = [ + Citation( + text='EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS', + source='test.pdf', + page=8, + metadata={'type': 'page_location', 'document_index': 0} + ), + Citation( + text='Market Extraction 6.21% - 7.25%', + source='test.pdf', + page=72, + metadata={'type': 'page_location', 'start_page_number': 72} + ) + ] + + # Create citation mappings - this was causing the serialization issue + citation_mappings = { + 'cap_rate': citations, + 'occupancy': [citations[0]], + 'address': citations + } + + # Create JobResult with both citations and citation_mappings + result = JobResult( + job_id="citation-mappings-test", + raw_response="Response with citation mappings", + parsed_response={'cap_rate': 7.0, 'occupancy': 99.0, 'address': '123 Test St'}, + citations=citations, + citation_mappings=citation_mappings, + input_tokens=1000, + output_tokens=200, + cost_usd=0.15 + ) + + # Test 1: to_dict() should not fail (was failing before the fix) + data = result.to_dict() + + # Test 2: The result should be JSON serializable (was failing before) + json_str = json.dumps(data) + parsed_data = json.loads(json_str) + + # Test 3: Verify citation_mappings structure is correct + assert 'citation_mappings' in parsed_data + assert 'cap_rate' in parsed_data['citation_mappings'] + assert 'occupancy' in parsed_data['citation_mappings'] + assert 'address' in parsed_data['citation_mappings'] + + # Test 4: Verify citation_mappings contain proper dict structures, not Citation objects + cap_rate_citations = parsed_data['citation_mappings']['cap_rate'] + assert len(cap_rate_citations) == 2 + assert isinstance(cap_rate_citations[0], dict) # Should be dict, not Citation object + assert cap_rate_citations[0]['text'] == 'EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS' + assert cap_rate_citations[0]['source'] == 'test.pdf' + assert cap_rate_citations[0]['page'] == 8 + assert cap_rate_citations[0]['metadata']['type'] == 'page_location' + + # Test 5: Verify single citation mapping (occupancy) + occupancy_citations = parsed_data['citation_mappings']['occupancy'] + assert len(occupancy_citations) == 1 + assert isinstance(occupancy_citations[0], dict) + assert occupancy_citations[0]['text'] == 'EXTRAORDINARY ASSUMPTION(S) AND FINANCIAL INDICATORS' + + # Test 6: Verify citations list is also properly serialized + assert 'citations' in parsed_data + assert len(parsed_data['citations']) == 2 + assert isinstance(parsed_data['citations'][0], dict) + + # Test 7: Full round-trip serialization + restored = JobResult.from_dict(parsed_data) + assert restored.job_id == result.job_id + assert len(restored.citations) == 2 + assert len(restored.citation_mappings) == 3 + assert len(restored.citation_mappings['cap_rate']) == 2 + assert len(restored.citation_mappings['occupancy']) == 1 \ No newline at end of file