From 760b8ddb8c3677f7a2a731fcf1db42220ba7b987 Mon Sep 17 00:00:00 2001 From: Agam More Date: Mon, 11 Aug 2025 20:02:51 -0500 Subject: [PATCH 1/2] Add save_to_json() convenience method to JobResult - Add simple save_to_json() method that handles Citation serialization automatically - Creates directories as needed and uses existing to_dict() method internally - Updated README.md with usage examples - Added comprehensive test with tmp_path fixture - Minimal code addition (~10 lines) for maximum user convenience Fixes the need for users to manually handle Citation serialization when saving results. --- README.md | 14 ++++++++ batchata/core/job_result.py | 16 ++++++++- batchata/types.py | 13 ++++++++ batchata/utils/__init__.py | 3 +- batchata/utils/json_encoder.py | 47 ++++++++++++++++++++++++++ pyproject.toml | 2 +- tests/core/test_job_result.py | 61 +++++++++++++++++++++++++++++++++- uv.lock | 2 +- 8 files changed, 153 insertions(+), 5 deletions(-) create mode 100644 batchata/utils/json_encoder.py diff --git a/README.md b/README.md index c45a6ef..e09dab4 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,20 @@ for result in results["cancelled"]: print(f"\nJob {result.job_id} was cancelled: {result.error}") ``` +## Saving Results to JSON + +You can easily save individual job results to JSON files: + +```python +# Save individual results to JSON files +for result in results["completed"]: + result.save_to_json(f"output/{result.job_id}.json") + +# Or save with custom formatting +result.save_to_json("my_result.json", indent=4) +``` + +This automatically handles Citation serialization and creates any necessary directories. ## Interactive Progress Display diff --git a/batchata/core/job_result.py b/batchata/core/job_result.py index 3f57ba1..eaeba3d 100644 --- a/batchata/core/job_result.py +++ b/batchata/core/job_result.py @@ -1,6 +1,6 @@ """JobResult data model.""" -from dataclasses import asdict, dataclass +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel @@ -88,6 +88,20 @@ def to_dict(self) -> Dict[str, Any]: "batch_id": self.batch_id } + def save_to_json(self, filepath: str, indent: int = 2) -> None: + """Save JobResult to JSON file. + + Args: + filepath: Path to save the JSON file + indent: JSON indentation (default: 2) + """ + import json + from pathlib import Path + + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + with open(filepath, 'w') as f: + json.dump(self.to_dict(), f, indent=indent) + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'JobResult': """Deserialize from state.""" diff --git a/batchata/types.py b/batchata/types.py index 909624e..5c8e773 100644 --- a/batchata/types.py +++ b/batchata/types.py @@ -12,6 +12,19 @@ class Citation: source: str # Source identifier (e.g., page number, section) page: Optional[int] = None # Page number if applicable metadata: Optional[Dict[str, Any]] = None # Additional metadata + + def __json__(self): + """Make Citation JSON serializable. + + This method is called by json.dumps() when using the default encoder. + Returns a dictionary representation that can be serialized to JSON. + """ + return { + 'text': self.text, + 'source': self.source, + 'page': self.page, + 'metadata': self.metadata + } @dataclass diff --git a/batchata/utils/__init__.py b/batchata/utils/__init__.py index 81c926d..974c2dd 100644 --- a/batchata/utils/__init__.py +++ b/batchata/utils/__init__.py @@ -6,5 +6,6 @@ from .logging import get_logger, set_log_level from .pdf import create_pdf from .rich_progress import RichBatchProgressDisplay +from .json_encoder import BatchataJSONEncoder -__all__ = ["CostTracker", "to_dict", "StateManager", "get_logger", "set_log_level", "create_pdf", "RichBatchProgressDisplay"] \ No newline at end of file +__all__ = ["CostTracker", "to_dict", "StateManager", "get_logger", "set_log_level", "create_pdf", "RichBatchProgressDisplay", "BatchataJSONEncoder"] \ No newline at end of file diff --git a/batchata/utils/json_encoder.py b/batchata/utils/json_encoder.py new file mode 100644 index 0000000..8afcb4c --- /dev/null +++ b/batchata/utils/json_encoder.py @@ -0,0 +1,47 @@ +"""Custom JSON encoder for batchata objects.""" + +import json +from dataclasses import asdict, is_dataclass +from typing import Any +from pydantic import BaseModel + +from ..types import Citation + + +class BatchataJSONEncoder(json.JSONEncoder): + """Custom JSON encoder that handles batchata objects like Citation. + + This encoder automatically converts Citation objects and other dataclasses + to dictionaries for JSON serialization. + + Usage: + ```python + import json + from batchata.utils import BatchataJSONEncoder + + # Now JobResult objects with Citation objects can be serialized directly + json.dump(job_result, f, cls=BatchataJSONEncoder, indent=2) + ``` + """ + + def default(self, obj: Any) -> Any: + """Convert objects to JSON-serializable format.""" + # Handle Citation objects specifically + if isinstance(obj, Citation): + return { + 'text': obj.text, + 'source': obj.source, + 'page': obj.page, + 'metadata': obj.metadata + } + + # Handle other dataclasses + if is_dataclass(obj): + return asdict(obj) + + # Handle Pydantic models + if isinstance(obj, BaseModel): + return obj.model_dump() + + # Let the base class handle other types + return super().default(obj) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 9384c1a..ef7ffbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "batchata" -version = "0.4.4" +version = "0.4.5" description = "Unified Python API for AI batch requests with 50% cost savings on OpenAI and Anthropic" readme = "README.md" requires-python = ">=3.12" diff --git a/tests/core/test_job_result.py b/tests/core/test_job_result.py index 77509fa..e7eb687 100644 --- a/tests/core/test_job_result.py +++ b/tests/core/test_job_result.py @@ -316,4 +316,63 @@ def test_citation_mappings_json_serialization(self): assert len(restored.citations) == 2 assert len(restored.citation_mappings) == 3 assert len(restored.citation_mappings['cap_rate']) == 2 - assert len(restored.citation_mappings['occupancy']) == 1 \ No newline at end of file + assert len(restored.citation_mappings['occupancy']) == 1 + + def test_save_to_json(self, tmp_path): + """Test that save_to_json() correctly saves JobResult to a JSON file.""" + # Create a JobResult with citations and citation_mappings + citations = [ + Citation( + text='Test citation text', + source='test.pdf', + page=1, + metadata={'type': 'page_location', 'document_index': 0} + ) + ] + + citation_mappings = { + 'test_field': citations + } + + result = JobResult( + job_id="test-save-json", + raw_response="Test response", + parsed_response={'test_field': 'test_value'}, + citations=citations, + citation_mappings=citation_mappings, + input_tokens=100, + output_tokens=50, + cost_usd=0.05 + ) + + # Save to JSON file + json_file = tmp_path / "subdir" / "test_result.json" + result.save_to_json(str(json_file)) + + # Verify file was created + assert json_file.exists() + + # Verify content is correct by loading and comparing + import json + with open(json_file, 'r') as f: + saved_data = json.load(f) + + # Should match the result of to_dict() + expected_data = result.to_dict() + assert saved_data == expected_data + + # Verify specific fields + assert saved_data['job_id'] == 'test-save-json' + assert saved_data['input_tokens'] == 100 + assert saved_data['output_tokens'] == 50 + assert saved_data['cost_usd'] == 0.05 + + # Verify citations are properly serialized (not Citation objects) + assert isinstance(saved_data['citations'][0], dict) + assert saved_data['citations'][0]['text'] == 'Test citation text' + assert saved_data['citations'][0]['source'] == 'test.pdf' + assert saved_data['citations'][0]['page'] == 1 + + # Verify citation_mappings are properly serialized + assert isinstance(saved_data['citation_mappings']['test_field'][0], dict) + assert saved_data['citation_mappings']['test_field'][0]['text'] == 'Test citation text' \ No newline at end of file diff --git a/uv.lock b/uv.lock index 4165959..65688ec 100644 --- a/uv.lock +++ b/uv.lock @@ -130,7 +130,7 @@ wheels = [ [[package]] name = "batchata" -version = "0.4.3" +version = "0.4.5" source = { editable = "." } dependencies = [ { name = "anthropic" }, From d685fb2c48a82c498f57edca104f9e0bdf2af3fa Mon Sep 17 00:00:00 2001 From: Agam More Date: Mon, 11 Aug 2025 20:08:28 -0500 Subject: [PATCH 2/2] Clean up implementation and revert version change - Remove unused __json__() method from Citation class - Remove unused BatchataJSONEncoder and json_encoder.py file - Clean up utils/__init__.py imports - Revert version back to 0.4.4 (no version bump yet) - Keep minimal, clean save_to_json() implementation --- README.md | 18 +++---------- batchata/types.py | 13 ---------- batchata/utils/__init__.py | 3 +-- batchata/utils/json_encoder.py | 47 ---------------------------------- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 78 deletions(-) delete mode 100644 batchata/utils/json_encoder.py diff --git a/README.md b/README.md index e09dab4..3c9d80a 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,9 @@ for result in results["completed"]: print(f" Vendor: {analysis.vendor} (page: {citations.get("vendor").page})") print(f" Total: ${analysis.total_amount:.2f} (page: {citations.get("total_amount").page})") print(f" Status: {analysis.payment_status} (page: {citations.get("payment_status").page})") + + # Save each result to JSON file + result.save_to_json(f"./invoice_results/{result.job_id}.json") # Process failed/cancelled results for result in results["failed"]: @@ -121,21 +124,6 @@ for result in results["cancelled"]: print(f"\nJob {result.job_id} was cancelled: {result.error}") ``` -## Saving Results to JSON - -You can easily save individual job results to JSON files: - -```python -# Save individual results to JSON files -for result in results["completed"]: - result.save_to_json(f"output/{result.job_id}.json") - -# Or save with custom formatting -result.save_to_json("my_result.json", indent=4) -``` - -This automatically handles Citation serialization and creates any necessary directories. - ## Interactive Progress Display Batchata provides an interactive real-time progress display when using `print_status=True`: diff --git a/batchata/types.py b/batchata/types.py index 5c8e773..909624e 100644 --- a/batchata/types.py +++ b/batchata/types.py @@ -12,19 +12,6 @@ class Citation: source: str # Source identifier (e.g., page number, section) page: Optional[int] = None # Page number if applicable metadata: Optional[Dict[str, Any]] = None # Additional metadata - - def __json__(self): - """Make Citation JSON serializable. - - This method is called by json.dumps() when using the default encoder. - Returns a dictionary representation that can be serialized to JSON. - """ - return { - 'text': self.text, - 'source': self.source, - 'page': self.page, - 'metadata': self.metadata - } @dataclass diff --git a/batchata/utils/__init__.py b/batchata/utils/__init__.py index 974c2dd..81c926d 100644 --- a/batchata/utils/__init__.py +++ b/batchata/utils/__init__.py @@ -6,6 +6,5 @@ from .logging import get_logger, set_log_level from .pdf import create_pdf from .rich_progress import RichBatchProgressDisplay -from .json_encoder import BatchataJSONEncoder -__all__ = ["CostTracker", "to_dict", "StateManager", "get_logger", "set_log_level", "create_pdf", "RichBatchProgressDisplay", "BatchataJSONEncoder"] \ No newline at end of file +__all__ = ["CostTracker", "to_dict", "StateManager", "get_logger", "set_log_level", "create_pdf", "RichBatchProgressDisplay"] \ No newline at end of file diff --git a/batchata/utils/json_encoder.py b/batchata/utils/json_encoder.py deleted file mode 100644 index 8afcb4c..0000000 --- a/batchata/utils/json_encoder.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Custom JSON encoder for batchata objects.""" - -import json -from dataclasses import asdict, is_dataclass -from typing import Any -from pydantic import BaseModel - -from ..types import Citation - - -class BatchataJSONEncoder(json.JSONEncoder): - """Custom JSON encoder that handles batchata objects like Citation. - - This encoder automatically converts Citation objects and other dataclasses - to dictionaries for JSON serialization. - - Usage: - ```python - import json - from batchata.utils import BatchataJSONEncoder - - # Now JobResult objects with Citation objects can be serialized directly - json.dump(job_result, f, cls=BatchataJSONEncoder, indent=2) - ``` - """ - - def default(self, obj: Any) -> Any: - """Convert objects to JSON-serializable format.""" - # Handle Citation objects specifically - if isinstance(obj, Citation): - return { - 'text': obj.text, - 'source': obj.source, - 'page': obj.page, - 'metadata': obj.metadata - } - - # Handle other dataclasses - if is_dataclass(obj): - return asdict(obj) - - # Handle Pydantic models - if isinstance(obj, BaseModel): - return obj.model_dump() - - # Let the base class handle other types - return super().default(obj) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ef7ffbc..9384c1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "batchata" -version = "0.4.5" +version = "0.4.4" description = "Unified Python API for AI batch requests with 50% cost savings on OpenAI and Anthropic" readme = "README.md" requires-python = ">=3.12"