diff --git a/docs/admin_client_guide.md b/docs/admin_client_guide.md index 42e35e2..417fee5 100644 --- a/docs/admin_client_guide.md +++ b/docs/admin_client_guide.md @@ -354,12 +354,12 @@ The schema client validates SQL queries and returns their output schemas without ```python # Validate a query and get its schema schema_response = client.schema.get_output_schema( - sql_query='SELECT block_num, hash, timestamp FROM eth.blocks WHERE block_num > 1000000', - is_sql_dataset=True + tables={'query_analysis': 'SELECT block_num, hash, timestamp FROM eth.blocks WHERE block_num > 1000000'}, + dependencies={'eth': '_/eth_firehose@1.0.0'} ) # Inspect the Arrow schema -print(schema_response.schema) +print(schema_response.schemas['query_analysis'].schema) ``` This is particularly useful for: @@ -651,8 +651,11 @@ with Client(query_url=..., admin_url=..., auth_token=...) as client: ```python # Validate before registration -schema = client.schema.get_output_schema(sql_query, True) -print(f"Query will produce {len(schema.schema['fields'])} columns") +response = client.schema.get_output_schema( + tables={'t': sql_query}, + dependencies={...} +) +print(f"Query will produce {len(response.schemas['t'].schema['fields'])} columns") ``` ### 3. Version Your Datasets diff --git a/docs/api/client_api.md b/docs/api/client_api.md index d03763a..8ed634c 100644 --- a/docs/api/client_api.md +++ b/docs/api/client_api.md @@ -598,17 +598,21 @@ Validate SQL query and get its output Arrow schema without executing it. ```python get_output_schema( - sql_query: str, - is_sql_dataset: bool = True -) -> models.OutputSchemaResponse + tables: Optional[dict[str, str]] = None, + dependencies: Optional[dict[str, str]] = None, + functions: Optional[dict[str, Any]] = None +) -> models.SchemaResponse ``` **Parameters:** -- `sql_query` (str): SQL query to analyze. -- `is_sql_dataset` (bool, optional): Whether this is for a SQL dataset. Default: True. +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `tables` | `dict[str, str]` | `None` | Optional map of table names to SQL queries | +| `dependencies` | `dict[str, str]` | `None` | Optional map of alias -> dataset reference | +| `functions` | `dict[str, Any]` | `None` | Optional map of function definitions | -**Returns:** `OutputSchemaResponse` with Arrow schema. +**Returns:** `SchemaResponse` containing schemas for all requested tables. **Raises:** @@ -619,11 +623,11 @@ get_output_schema( ```python response = client.schema.get_output_schema( - 'SELECT block_num, hash FROM eth.blocks WHERE block_num > 1000000', - is_sql_dataset=True + tables={'my_table': 'SELECT block_num FROM eth.blocks'}, + dependencies={'eth': '_/eth_firehose@1.0.0'} ) -print(response.schema) # Arrow schema dict +print(response.schemas['my_table'].schema) # Arrow schema dict ``` --- @@ -712,13 +716,22 @@ Response from deploying a dataset. - `job_id` (int): ID of the created job -#### `OutputSchemaResponse` +#### `SchemaResponse` -Response containing Arrow schema for a query. +Response containing schemas for one or more tables. + +**Fields:** + +- `schemas` (dict[str, TableSchemaWithNetworks]): Map of table names to their schemas + +#### `TableSchemaWithNetworks` + +Response containing Arrow schema for a query and associated networks. **Fields:** - `schema` (dict): Arrow schema dictionary +- `networks` (list[str]): List of referenced networks ### Request Models @@ -733,14 +746,15 @@ Request to register a dataset. 
- `version` (str, optional): Version string - `manifest` (dict): Dataset manifest -#### `OutputSchemaRequest` +#### `SchemaRequest` -Request to get output schema for a query. +Request for schema analysis with dependencies, tables, and functions. **Fields:** -- `sql_query` (str): SQL query -- `is_sql_dataset` (bool): Whether this is for a SQL dataset +- `dependencies` (dict[str, str], optional): External dataset dependencies +- `tables` (dict[str, str], optional): Table definitions +- `functions` (dict[str, Any], optional): User-defined functions --- @@ -976,7 +990,10 @@ try: print(f"Query returns {len(df)} rows") # Validate schema - schema = client.schema.get_output_schema(query.query, True) + schema = client.schema.get_output_schema( + tables={'my_table': query.query}, + dependencies={'eth': '_/eth_firehose@1.0.0'} + ) - print(f"Schema: {schema.schema}") + print(f"Schema: {schema.schemas['my_table'].schema}") # Register and deploy diff --git a/performance_benchmarks.json b/performance_benchmarks.json index 8b63ef4..54f8a35 100644 --- a/performance_benchmarks.json +++ b/performance_benchmarks.json @@ -2,232 +2,232 @@ "postgresql_large_table_loading_performance": { "test_name": "large_table_loading_performance", "loader_type": "postgresql", - "throughput_rows_per_sec": 128032.82091356427, - "memory_mb": 450.359375, - "duration_seconds": 0.39052486419677734, + "throughput_rows_per_sec": 227144.69378847073, + "memory_mb": 373.84375, + "duration_seconds": 0.2201240062713623, "dataset_size": 50000, - "timestamp": "2025-10-27T23:59:34.602321", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:50:58.654194", + "git_commit": "08a868c5", "environment": "local" }, "redis_pipeline_performance": { "test_name": "pipeline_performance", "loader_type": "redis", - "throughput_rows_per_sec": 43232.59035152331, + "throughput_rows_per_sec": 38539.89058184217, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:03.930037", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:10.837943", + "git_commit": "08a868c5", "environment": "local" }, "redis_data_structure_performance_hash": { "test_name": "data_structure_performance_hash", "loader_type": "redis", - "throughput_rows_per_sec": 34689.0009927911, + "throughput_rows_per_sec": 37345.95465065899, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:06.866695", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:13.753274", + "git_commit": "08a868c5", "environment": "local" }, "redis_data_structure_performance_string": { "test_name": "data_structure_performance_string", "loader_type": "redis", - "throughput_rows_per_sec": 74117.79882204712, + "throughput_rows_per_sec": 49109.77710179646, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:06.892124", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:13.767213", + "git_commit": "08a868c5", "environment": "local" }, "redis_data_structure_performance_sorted_set": { "test_name": "data_structure_performance_sorted_set", "loader_type": "redis", - "throughput_rows_per_sec": 72130.90621426176, + "throughput_rows_per_sec": 104821.38421175483, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:06.915461", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:13.777390", + "git_commit": "08a868c5", "environment": "local" }, "redis_memory_efficiency": { "test_name": "memory_efficiency", "loader_type": "redis", - "throughput_rows_per_sec": 37452.955032923, - "memory_mb": 14.465019226074219,
- "duration_seconds": 1.335008144378662, + "throughput_rows_per_sec": 37107.23940379341, + "memory_mb": 14.457672119140625, + "duration_seconds": 1.3474459648132324, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:08.312561", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:15.176147", + "git_commit": "08a868c5", "environment": "local" }, "delta_lake_large_file_write_performance": { "test_name": "large_file_write_performance", "loader_type": "delta_lake", - "throughput_rows_per_sec": 378063.45308981824, - "memory_mb": 485.609375, - "duration_seconds": 0.13225293159484863, + "throughput_rows_per_sec": 591561.8979382868, + "memory_mb": 280.546875, + "duration_seconds": 0.08452200889587402, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:08.528047", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:15.321486", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_large_table_loading_performance": { "test_name": "large_table_loading_performance", "loader_type": "lmdb", - "throughput_rows_per_sec": 68143.20147805117, - "memory_mb": 1272.359375, - "duration_seconds": 0.7337489128112793, + "throughput_rows_per_sec": 78945.8983105974, + "memory_mb": 840.875, + "duration_seconds": 0.6333451271057129, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:12.347292", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:17.051522", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_key_generation_strategy_performance_pattern_based": { "test_name": "key_generation_strategy_performance_pattern_based", "loader_type": "lmdb", - "throughput_rows_per_sec": 94096.5745855362, + "throughput_rows_per_sec": 77106.25074223634, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:14.592329", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:19.164620", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_key_generation_strategy_performance_single_column": { "test_name": "key_generation_strategy_performance_single_column", "loader_type": "lmdb", - "throughput_rows_per_sec": 78346.86278487406, + "throughput_rows_per_sec": 86566.45444218424, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:14.639451", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:19.196420", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_key_generation_strategy_performance_composite_key": { "test_name": "key_generation_strategy_performance_composite_key", "loader_type": "lmdb", - "throughput_rows_per_sec": 64687.24273107819, + "throughput_rows_per_sec": 78268.90829691064, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:14.686219", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:19.208121", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_writemap_performance_with": { "test_name": "writemap_performance_with", "loader_type": "lmdb", - "throughput_rows_per_sec": 87847.98917248333, + "throughput_rows_per_sec": 86608.13917088526, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:19.439505", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:23.502083", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_writemap_performance_without": { "test_name": "writemap_performance_without", "loader_type": "lmdb", - "throughput_rows_per_sec": 104290.05352869684, + "throughput_rows_per_sec": 90709.69896251542, "memory_mb": 0, 
"duration_seconds": 0, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:19.466225", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:23.523911", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_memory_efficiency": { "test_name": "memory_efficiency", "loader_type": "lmdb", - "throughput_rows_per_sec": 61804.62313406004, + "throughput_rows_per_sec": 86586.57679160737, "memory_mb": 120.21875, - "duration_seconds": 0.8090009689331055, + "duration_seconds": 0.5774567127227783, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:20.360722", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:24.158544", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_concurrent_read_performance": { "test_name": "concurrent_read_performance", "loader_type": "lmdb", - "throughput_rows_per_sec": 226961.30898591253, + "throughput_rows_per_sec": 234797.9618730078, "memory_mb": 0, - "duration_seconds": 0.22030186653137207, + "duration_seconds": 0.21294903755187988, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:21.415388", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.005213", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_large_value_performance": { "test_name": "large_value_performance", "loader_type": "lmdb", - "throughput_rows_per_sec": 98657.00710354236, + "throughput_rows_per_sec": 76336.40913640914, "memory_mb": 0.03125, - "duration_seconds": 0.010136127471923828, + "duration_seconds": 0.013099908828735352, "dataset_size": 1000, - "timestamp": "2025-10-28T00:00:21.772304", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.314350", + "git_commit": "08a868c5", "environment": "local" }, "postgresql_throughput_comparison": { "test_name": "throughput_comparison", "loader_type": "postgresql", - "throughput_rows_per_sec": 114434.94678369434, + "throughput_rows_per_sec": 149951.87873154337, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 10000, - "timestamp": "2025-10-28T00:00:22.506677", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.818210", + "git_commit": "08a868c5", "environment": "local" }, "redis_throughput_comparison": { "test_name": "throughput_comparison", "loader_type": "redis", - "throughput_rows_per_sec": 39196.31876614371, + "throughput_rows_per_sec": 43536.38460372721, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 10000, - "timestamp": "2025-10-28T00:00:22.550799", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.831876", + "git_commit": "08a868c5", "environment": "local" }, "lmdb_throughput_comparison": { "test_name": "throughput_comparison", "loader_type": "lmdb", - "throughput_rows_per_sec": 64069.99835024838, + "throughput_rows_per_sec": 93945.16454890999, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 10000, - "timestamp": "2025-10-28T00:00:22.593882", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.842671", + "git_commit": "08a868c5", "environment": "local" }, "delta_lake_throughput_comparison": { "test_name": "throughput_comparison", "loader_type": "delta_lake", - "throughput_rows_per_sec": 74707.64780586681, + "throughput_rows_per_sec": 288599.56100816745, "memory_mb": 0, "duration_seconds": 0, "dataset_size": 10000, - "timestamp": "2025-10-28T00:00:22.641513", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:25.854085", + "git_commit": "08a868c5", "environment": "local" }, "iceberg_large_file_write_performance": { "test_name": "large_file_write_performance", "loader_type": "iceberg", - 
"throughput_rows_per_sec": 565892.4099818668, - "memory_mb": 1144.453125, - "duration_seconds": 0.08835601806640625, + "throughput_rows_per_sec": 568064.4680706982, + "memory_mb": 729.890625, + "duration_seconds": 0.08801817893981934, "dataset_size": 50000, - "timestamp": "2025-10-28T00:00:22.874880", - "git_commit": "e38e5aab", + "timestamp": "2025-12-13T14:51:26.029000", + "git_commit": "08a868c5", "environment": "local" } } \ No newline at end of file diff --git a/scripts/generate_models.py b/scripts/generate_models.py index 13a69b0..c67b495 100644 --- a/scripts/generate_models.py +++ b/scripts/generate_models.py @@ -15,7 +15,7 @@ def main(): """Generate Pydantic models from OpenAPI spec.""" - from datamodel_code_generator import InputFileType, generate + from datamodel_code_generator import DataModelType, InputFileType, generate # Define paths spec_file = Path('specs/admin.spec.json') @@ -35,6 +35,7 @@ def main(): input_=spec_file, output=output_file, input_file_type=InputFileType.OpenAPI, + output_model_type=DataModelType.PydanticV2BaseModel, use_schema_description=True, use_field_description=True, field_constraints=True, diff --git a/specs/admin.spec.json b/specs/admin.spec.json index 71225af..54616c1 100644 --- a/specs/admin.spec.json +++ b/specs/admin.spec.json @@ -2,7 +2,7 @@ "openapi": "3.1.0", "info": { "title": "Amp Admin API", - "description": "Administration API for Amp, a high-performance ETL system for blockchain data services on The Graph.\n\n## About\n\nThe Admin API provides a RESTful HTTP interface for managing Amp's ETL operations. This API serves as the primary administrative interface for monitoring and controlling the Amp data pipeline, allowing you to deploy datasets, trigger data extraction jobs, monitor job progress, manage distributed worker locations, configure external data providers, and perform operations on Parquet files and their metadata.\n\n## Key Capabilities\n\n### Dataset Management\nHandle the lifecycle of data extraction configurations and access dataset information:\n- List all registered datasets from the metadata database registry\n- Register new dataset configurations with versioning support\n- Trigger data extraction jobs for specific datasets or dataset versions\n- Retrieve dataset details including tables and active storage locations\n\n### Job Control\nControl and monitor data extraction and processing jobs:\n- List and retrieve job information with pagination\n- Trigger extraction jobs with optional end block configuration\n- Stop running jobs gracefully\n- Delete jobs in terminal states (Completed, Stopped, Failed)\n- Bulk cleanup operations for finalized jobs\n\n### Storage Management\nManage locations where dataset tables are stored:\n- Supports local filesystem, S3, GCS, and Azure Blob Storage\n- List storage locations and their associated files\n- Delete locations with comprehensive cleanup (removes files and metadata)\n- Query file information including Parquet metadata and statistics\n\n### Provider Configuration\nConfigure external blockchain data sources:\n- Create, retrieve, and delete provider configurations\n- Support for EVM RPC endpoints and Firehose streams\n- Providers are reusable across multiple dataset definitions\n- **Security Note**: Provider configurations may contain connection details; ensure sensitive information is properly managed\n\n### Schema Analysis\nValidate SQL queries and infer output schemas:\n- Validate queries against registered datasets without execution\n- Determine output schema using DataFusion's 
query planner\n- Useful for building dynamic query tools and validating dataset definitions\n\n## Pagination\n\nMost list endpoints use cursor-based pagination for efficient data retrieval:\n\n### Paginated Endpoints\nThe following endpoints support pagination:\n- Jobs: `/jobs`\n- Locations: `/locations`\n- Files: `/locations/{location_id}/files`\n\n### Non-Paginated Endpoints\nThe following endpoints return all results without pagination:\n- Datasets: `/datasets` (returns all datasets)\n- Dataset Versions: `/datasets/{name}/versions` (returns all versions for a dataset)\n\n### Query Parameters (Paginated Endpoints Only)\n- `limit`: Maximum items per page (default: 50, max: 1000)\n- `last_*_id`: Cursor from previous page's `next_cursor` field\n\n### Response Format\nPaginated responses include:\n- Array of items (e.g., `jobs`, `locations`, `files`)\n- `next_cursor`: Cursor for the next page (absent when no more results)\n\n### Usage Pattern\n\n**First Page Request:**\n```\nGET /jobs?limit=100\n```\n\n**First Page Response:**\n```json\n{\n \"jobs\": [...],\n \"next_cursor\": 12345\n}\n```\n\n**Next Page Request:**\n```\nGET /jobs?limit=100&last_job_id=12345\n```\n\n**Last Page Response:**\n```json\n{\n \"jobs\": [...]\n // No next_cursor field = end of results\n}\n```\n\n### Cursor Formats\n\nEndpoints use different cursor formats based on their data type:\n\n**Integer ID Cursors (64-bit integers):**\nMost paginated endpoints use simple integer IDs as cursors:\n- Jobs: `last_job_id=12345`\n- Locations: `last_location_id=67890`\n- Files: `last_file_id=54321`\n\n## Error Handling\n\nAll error responses follow a consistent format with:\n- `error_code`: Stable, machine-readable code (SCREAMING_SNAKE_CASE)\n- `error_message`: Human-readable error description\n\nError codes are stable across API versions and suitable for programmatic error handling. Messages may change and should only be used for display or logging.\n\n## Important Notes\n\n### Dataset Registration\nSupports two main scenarios:\n- **Derived datasets** (kind=\"manifest\"): Registered in both object store and metadata database\n- **SQL datasets** (other kinds): Dataset definitions stored in object store\n\n### Job Lifecycle\nJobs have the following terminal states that allow deletion:\n- **Completed**: Job finished successfully\n- **Stopped**: Job was manually stopped\n- **Failed**: Job encountered an error\n\nNon-terminal jobs (Scheduled, Running, StopRequested, Stopping) are protected from deletion.\n\n### Storage Locations\n- Locations can be active or inactive for queries\n- Deleting a location performs comprehensive cleanup including file removal from object store\n- Each location is associated with a specific dataset table and storage URL\n", + "description": "Administration API for Amp, a high-performance ETL system for blockchain data services on The Graph.\n\n## About\n\nThe Admin API provides a RESTful HTTP interface for managing Amp's ETL operations. 
This API serves as the primary administrative interface for monitoring and controlling the Amp data pipeline, allowing you to deploy datasets, trigger data extraction jobs, monitor job progress, manage distributed worker locations, configure external data providers, and perform operations on Parquet files and their metadata.\n\n## Key Capabilities\n\n### Dataset Management\nHandle the lifecycle of data extraction configurations and access dataset information:\n- List all registered datasets from the metadata database registry\n- Register new dataset configurations with versioning support\n- Trigger data extraction jobs for specific datasets or dataset versions\n- Retrieve dataset details including tables and active storage locations\n\n### Job Control\nControl and monitor data extraction and processing jobs:\n- List and retrieve job information with pagination\n- Trigger extraction jobs with optional end block configuration\n- Stop running jobs gracefully\n- Delete jobs in terminal states (Completed, Stopped, Failed)\n- Bulk cleanup operations for finalized jobs\n\n### Storage Management\nManage locations where dataset tables are stored:\n- Supports local filesystem, S3, GCS, and Azure Blob Storage\n- List storage locations and their associated files\n- Delete locations with comprehensive cleanup (removes files and metadata)\n- Query file information including Parquet metadata and statistics\n\n### Provider Configuration\nConfigure external blockchain data sources:\n- Create, retrieve, and delete provider configurations\n- Support for EVM RPC endpoints and Firehose streams\n- Providers are reusable across multiple dataset definitions\n- **Security Note**: Provider configurations may contain connection details; ensure sensitive information is properly managed\n\n### Schema Analysis\nValidate SQL queries and infer output schemas:\n- Validate queries against registered datasets without execution\n- Determine output schema using DataFusion's query planner\n- Useful for building dynamic query tools and validating dataset definitions\n\n## Pagination\n\nMost list endpoints use cursor-based pagination for efficient data retrieval:\n\n### Paginated Endpoints\nThe following endpoints support pagination:\n- Jobs: `/jobs`\n\n### Non-Paginated Endpoints\nThe following endpoints return all results without pagination:\n- Datasets: `/datasets` (returns all datasets)\n- Dataset Versions: `/datasets/{name}/versions` (returns all versions for a dataset)\n\n### Query Parameters (Paginated Endpoints Only)\n- `limit`: Maximum items per page (default: 50, max: 1000)\n- `last_*_id`: Cursor from previous page's `next_cursor` field\n\n### Response Format\nPaginated responses include:\n- Array of items (e.g., `jobs`, `locations`, `files`)\n- `next_cursor`: Cursor for the next page (absent when no more results)\n\n### Usage Pattern\n\n**First Page Request:**\n```\nGET /jobs?limit=100\n```\n\n**First Page Response:**\n```json\n{\n \"jobs\": [...],\n \"next_cursor\": 12345\n}\n```\n\n**Next Page Request:**\n```\nGET /jobs?limit=100&last_job_id=12345\n```\n\n**Last Page Response:**\n```json\n{\n \"jobs\": [...]\n // No next_cursor field = end of results\n}\n```\n\n### Cursor Formats\n\nEndpoints use different cursor formats based on their data type:\n\n**Integer ID Cursors (64-bit integers):**\nMost paginated endpoints use simple integer IDs as cursors:\n- Jobs: `last_job_id=12345`\n- Locations: `last_location_id=67890`\n- Files: `last_file_id=54321`\n\n## Error Handling\n\nAll error responses follow a consistent 
format with:\n- `error_code`: Stable, machine-readable code (SCREAMING_SNAKE_CASE)\n- `error_message`: Human-readable error description\n\nError codes are stable across API versions and suitable for programmatic error handling. Messages may change and should only be used for display or logging.\n\n## Important Notes\n\n### Dataset Registration\nSupports two main scenarios:\n- **Derived datasets** (kind=\"manifest\"): Registered in both object store and metadata database\n- **SQL datasets** (other kinds): Dataset definitions stored in object store\n\n### Job Lifecycle\nJobs have the following terminal states that allow deletion:\n- **Completed**: Job finished successfully\n- **Stopped**: Job was manually stopped\n- **Failed**: Job encountered an error\n\nNon-terminal jobs (Scheduled, Running, StopRequested, Stopping) are protected from deletion.\n\n### Storage Locations\n- Locations can be active or inactive for queries\n- Deleting a location performs comprehensive cleanup including file removal from object store\n- Each location is associated with a specific dataset table and storage URL\n", "license": { "name": "" }, @@ -288,7 +288,7 @@ "datasets" ], "summary": "Handler for the `POST /datasets/{namespace}/{name}/versions/{revision}/deploy` endpoint", - "description": "Schedules a data extraction job for the specified dataset revision.\n\n## Response\n- **202 Accepted**: Job successfully scheduled\n- **400 Bad Request**: Invalid path parameters or request body\n- **404 Not Found**: Dataset or revision not found\n- **500 Internal Server Error**: Database or scheduler error\n\n## Error Codes\n- `INVALID_PATH`: Invalid path parameters (namespace, name, or revision)\n- `INVALID_BODY`: Invalid request body (malformed JSON or missing required fields)\n- `DATASET_NOT_FOUND`: The specified dataset or revision does not exist\n- `LIST_VERSION_TAGS_ERROR`: Failed to list version tags from dataset store\n- `RESOLVE_REVISION_ERROR`: Failed to resolve revision to manifest hash\n- `GET_DATASET_ERROR`: Failed to load dataset from store\n- `SCHEDULER_ERROR`: Failed to schedule extraction job\n\n## Behavior\nThis endpoint schedules a data extraction job for a dataset:\n1. Resolves the revision to find the corresponding version tag\n2. Loads the full dataset configuration from the dataset store\n3. Schedules an extraction job with the specified parameters\n4. Returns job ID for tracking\n\nThe revision parameter supports four types:\n- Semantic version (e.g., \"1.2.3\") - uses that specific version\n- \"latest\" - resolves to the highest semantic version\n- \"dev\" - resolves to the development version tag\n- Manifest hash (SHA256 hash) - finds the version that points to this hash\n\nJobs are executed asynchronously by worker nodes. 
Use the returned job ID\nto track progress via the jobs endpoints.", + "description": "Schedules a data extraction job for the specified dataset revision.\n\n## Response\n- **202 Accepted**: Job successfully scheduled\n- **400 Bad Request**: Invalid path parameters or request body\n- **404 Not Found**: Dataset or revision not found\n- **500 Internal Server Error**: Database or scheduler error\n\n## Error Codes\n- `INVALID_PATH`: Invalid path parameters (namespace, name, or revision)\n- `INVALID_BODY`: Invalid request body (malformed JSON or missing required fields)\n- `DATASET_NOT_FOUND`: The specified dataset or revision does not exist\n- `LIST_VERSION_TAGS_ERROR`: Failed to list version tags from dataset store\n- `RESOLVE_REVISION_ERROR`: Failed to resolve revision to manifest hash\n- `GET_DATASET_ERROR`: Failed to load dataset from store\n- `WORKER_NOT_AVAILABLE`: Specified worker not found or inactive\n- `SCHEDULER_ERROR`: Failed to schedule extraction job\n\n## Behavior\nThis endpoint schedules a data extraction job for a dataset:\n1. Resolves the revision to find the corresponding version tag\n2. Loads the full dataset configuration from the dataset store\n3. Schedules an extraction job with the specified parameters\n4. Returns job ID for tracking\n\nThe revision parameter supports four types:\n- Semantic version (e.g., \"1.2.3\") - uses that specific version\n- \"latest\" - resolves to the highest semantic version\n- \"dev\" - resolves to the development version tag\n- Manifest hash (SHA256 hash) - finds the version that points to this hash\n\nJobs are executed asynchronously by worker nodes. Use the returned job ID\nto track progress via the jobs endpoints.", "operationId": "deploy_dataset", "parameters": [ { @@ -373,6 +373,87 @@ } } }, + "/datasets/{namespace}/{name}/versions/{revision}/jobs": { + "get": { + "tags": [ + "datasets" + ], + "summary": "Handler for the `GET /datasets/{namespace}/{name}/versions/{revision}/jobs` endpoint", + "description": "Retrieves and returns all jobs for a specific dataset revision.\n\n## Path Parameters\n- `namespace`: Dataset namespace\n- `name`: Dataset name\n- `revision`: Dataset revision (version, hash, \"latest\", or \"dev\")\n\n## Response\n- **200 OK**: Returns all jobs for the dataset\n- **400 Bad Request**: Invalid path parameters\n- **404 Not Found**: Dataset or revision not found\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_PATH`: Invalid path parameters (namespace, name, or revision malformed)\n- `DATASET_NOT_FOUND`: Dataset revision does not exist\n- `RESOLVE_REVISION_ERROR`: Failed to resolve dataset revision (database error)\n- `LIST_JOBS_ERROR`: Failed to list jobs from metadata database (database error)\n\nThis handler:\n- Validates and extracts the dataset reference from the URL path\n- Resolves the revision to a manifest hash using the dataset store\n- Queries all jobs filtered by the manifest hash\n- Returns all matching jobs", + "operationId": "list_dataset_jobs", + "parameters": [ + { + "name": "namespace", + "in": "path", + "description": "Dataset namespace", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "name", + "in": "path", + "description": "Dataset name", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "revision", + "in": "path", + "description": "Revision (version, hash, latest, or dev)", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": 
"Successfully retrieved jobs", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/JobsResponse" + } + } + } + }, + "400": { + "description": "Invalid path parameters", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "404": { + "description": "Dataset or revision not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + } + } + } + }, "/datasets/{namespace}/{name}/versions/{revision}/manifest": { "get": { "tags": [ @@ -444,6 +525,87 @@ } } }, + "/datasets/{namespace}/{name}/versions/{revision}/restore": { + "post": { + "tags": [ + "datasets" + ], + "summary": "Handler for the `POST /datasets/{namespace}/{name}/versions/{revision}/restore` endpoint", + "description": "Restores physical table locations from object storage into the metadata database.\n\n## Path Parameters\n- `namespace`: Dataset namespace\n- `name`: Dataset name\n- `revision`: Revision (version, hash, latest, or dev)\n\n## Response\n- **202 Accepted**: Physical tables successfully restored from storage\n- **400 Bad Request**: Invalid path parameters\n- **404 Not Found**: Dataset or revision not found, or no tables found in storage\n- **500 Internal Server Error**: Database or storage error\n\n## Error Codes\n- `INVALID_PATH`: Invalid path parameters (namespace, name, or revision)\n- `DATASET_NOT_FOUND`: The specified dataset or revision does not exist\n- `GET_DATASET_ERROR`: Failed to load dataset from store\n- `RESTORE_TABLE_ERROR`: Failed to restore a table from storage\n- `TABLE_NOT_FOUND`: Table data not found in object storage\n\n## Behavior\nThis endpoint restores dataset physical tables from object storage:\n1. Resolves the revision to find the corresponding dataset\n2. Scans object storage for existing physical table files\n3. Re-indexes all Parquet file metadata from storage\n4. Registers the physical table locations in the metadata database\n5. 
Marks the restored locations as active\n\nThis is useful for:\n- Recovering from metadata database loss\n- Setting up a new system with pre-existing data\n- Re-syncing metadata after storage restoration", + "operationId": "restore_dataset", + "parameters": [ + { + "name": "namespace", + "in": "path", + "description": "Dataset namespace", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "name", + "in": "path", + "description": "Dataset name", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "revision", + "in": "path", + "description": "Revision (version, hash, latest, or dev)", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "202": { + "description": "Physical tables successfully restored", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RestoreResponse" + } + } + } + }, + "400": { + "description": "Bad request (invalid parameters)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "404": { + "description": "Dataset or revision not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + } + } + } + }, "/datasets/{namespace}/{name}/versions/{version}": { "delete": { "tags": [ @@ -578,7 +740,7 @@ "jobs" ], "summary": "Handler for the `GET /jobs` endpoint", - "description": "Retrieves and returns a paginated list of jobs from the metadata database.\n\n## Query Parameters\n- `limit`: Maximum number of jobs to return (default: 50, max: 1000)\n- `last_job_id`: ID of the last job from previous page for cursor-based pagination\n\n## Response\n- **200 OK**: Returns paginated job data with next cursor\n- **400 Bad Request**: Invalid limit parameter (0, negative, or > 1000)\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_QUERY_PARAMETERS`: Invalid query parameters (malformed or unparseable)\n- `LIMIT_TOO_LARGE`: Limit exceeds maximum allowed value (>1000)\n- `LIMIT_INVALID`: Limit is zero\n- `LIST_JOBS_ERROR`: Failed to list jobs from scheduler (database error)", + "description": "Retrieves and returns a paginated list of jobs from the metadata database.\n\n## Query Parameters\n- `limit`: Maximum number of jobs to return (default: 50, max: 1000)\n- `last_job_id`: ID of the last job from previous page for cursor-based pagination\n- `status`: Status filter - \"active\" (default, shows non-terminal jobs), \"all\" (shows all jobs), or comma-separated status values (e.g., \"scheduled,running\")\n\n## Response\n- **200 OK**: Returns paginated job data with next cursor\n- **400 Bad Request**: Invalid limit parameter (0, negative, or > 1000)\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_QUERY_PARAMETERS`: Invalid query parameters (malformed or unparseable)\n- `LIMIT_TOO_LARGE`: Limit exceeds maximum allowed value (>1000)\n- `LIMIT_INVALID`: Limit is zero\n- `LIST_JOBS_ERROR`: Failed to list jobs from scheduler (database error)", "operationId": "jobs_list", "parameters": [ { @@ -599,6 +761,15 @@ "schema": { "type": "string" } + }, + { + "name": "status", + "in": "query", + "description": "Status filter: 'active' (default, non-terminal jobs), 'all' (all jobs), or comma-separated 
status values", + "required": false, + "schema": { + "type": "string" + } } ], "responses": { @@ -831,209 +1002,8 @@ } } }, - "404": { - "description": "Job not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - } - } - } - }, - "/locations": { - "get": { - "tags": [ - "locations" - ], - "summary": "Handler for the `GET /locations` endpoint", - "description": "Retrieves and returns a paginated list of locations from the metadata database.\n\n## Query Parameters\n- `limit`: Maximum number of locations to return (default: 50, max: 1000)\n- `last_location_id`: ID of the last location from previous page for cursor-based pagination\n\n## Response\n- **200 OK**: Returns paginated location data with next cursor\n- **400 Bad Request**: Invalid limit parameter (0, negative, or > 1000)\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_REQUEST`: Invalid query parameters (limit out of range)\n- `METADATA_DB_ERROR`: Internal database error occurred\n\nThis handler:\n- Accepts query parameters for pagination (limit, last_location_id)\n- Validates the limit parameter (max 1000)\n- Calls the metadata DB to list locations with pagination\n- Returns a structured response with locations and next cursor", - "operationId": "locations_list", - "parameters": [ - { - "name": "limit", - "in": "query", - "description": "Maximum number of locations to return (default: 50, max: 1000)", - "required": false, - "schema": { - "type": "integer", - "minimum": 0 - } - }, - { - "name": "last_location_id", - "in": "query", - "description": "ID of the last location from the previous page for pagination", - "required": false, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Successfully retrieved locations", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/LocationsResponse" - } - } - } - }, - "400": { - "description": "Invalid query parameters", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - } - } - } - }, - "/locations/{id}": { - "get": { - "tags": [ - "locations" - ], - "summary": "Handler for the `GET /locations/{id}` endpoint", - "description": "Retrieves and returns a specific location by its ID from the metadata database.\n\n## Path Parameters\n- `id`: The unique identifier of the location to retrieve (must be a positive integer)\n\n## Response\n- **200 OK**: Returns the location information as JSON\n- **400 Bad Request**: Invalid location ID format (not a number, zero, or negative)\n- **404 Not Found**: Location with the given ID does not exist\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_LOCATION_ID`: The provided ID is not a valid positive integer\n- `LOCATION_NOT_FOUND`: No location exists with the given ID\n- `METADATA_DB_ERROR`: Internal database error occurred\n\nThis handler:\n- Validates and extracts the location ID from the URL path\n- Queries the metadata database for the location\n- Returns appropriate HTTP status codes and error messages", - "operationId": 
"locations_get", - "parameters": [ - { - "name": "id", - "in": "path", - "description": "Location ID", - "required": true, - "schema": { - "type": "integer", - "format": "int64" - } - } - ], - "responses": { - "200": { - "description": "Successfully retrieved location information", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/LocationInfoWithDetails" - } - } - } - }, - "400": { - "description": "Invalid location ID", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "404": { - "description": "Location not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - } - } - }, - "delete": { - "tags": [ - "locations" - ], - "summary": "Handler for the `DELETE /locations/{id}` endpoint", - "description": "Deletes a specific location by its ID from the metadata database.\n\n## Path Parameters\n- `id`: The unique identifier of the location to delete (must be a positive integer)\n\n## Query Parameters\n- `force`: (optional, default: false) Force deletion even if location is active\n\n## Response\n- **204 No Content**: Location successfully deleted\n- **400 Bad Request**: Invalid location ID format or invalid query parameters\n- **404 Not Found**: Location with the given ID does not exist\n- **409 Conflict**: Location is active (without force=true) or has an ongoing job\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_LOCATION_ID`: The provided ID is not a valid positive integer\n- `INVALID_QUERY_PARAMETERS`: The query parameters cannot be parsed\n- `LOCATION_NOT_FOUND`: No location exists with the given ID\n- `ACTIVE_LOCATION_CONFLICT`: Location is active and cannot be deleted without force=true\n- `ONGOING_JOB_CONFLICT`: Location has an ongoing job and cannot be deleted\n- `METADATA_DB_ERROR`: Internal database error occurred\n\n## Safety Checks\n- Active locations require `force=true` to be deleted\n- Locations with ongoing jobs cannot be deleted (even with force=true)\n- Users must stop active jobs before deleting associated locations\n\nThis handler:\n- Validates and extracts the location ID from the URL path\n- Validates optional query parameters (force flag)\n- Performs safety checks for active locations and ongoing jobs\n- Deletes associated files from object store\n- Deletes the location from the metadata database\n- Returns appropriate HTTP status codes and error messages", - "operationId": "locations_delete", - "parameters": [ - { - "name": "id", - "in": "path", - "description": "Location ID", - "required": true, - "schema": { - "type": "integer", - "format": "int64" - } - }, - { - "name": "force", - "in": "query", - "description": "Force deletion even if location is active", - "required": false, - "schema": { - "type": "boolean" - } - } - ], - "responses": { - "204": { - "description": "Location successfully deleted" - }, - "400": { - "description": "Invalid location ID or query parameters", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "404": { - "description": "Location not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - }, - "409": { - "description": 
"Location is active or has ongoing job", + "404": { + "description": "Job not found", "content": { "application/json": { "schema": { @@ -1055,43 +1025,21 @@ } } }, - "/locations/{location_id}/files": { + "/manifests": { "get": { "tags": [ - "locations" - ], - "summary": "Handler for the `GET /locations/{location_id}/files` endpoint", - "description": "Retrieves and returns a paginated list of files for a specific location from the metadata database.\n\n## Path Parameters\n- `location_id`: The unique identifier of the location (must be a positive integer)\n\n## Query Parameters\n- `limit`: Maximum number of files to return (default: 50, max: 1000)\n- `last_file_id`: ID of the last file from previous page for cursor-based pagination\n\n## Response\n- **200 OK**: Returns paginated file data with next cursor\n- **400 Bad Request**: Invalid location ID format or invalid limit parameter\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_LOCATION_ID`: Invalid location ID format\n- `INVALID_QUERY_PARAMETERS`: Invalid query parameters (limit out of range)\n- `LIMIT_TOO_LARGE`: Limit exceeds maximum allowed value\n- `LIMIT_INVALID`: Limit is zero or negative\n- `METADATA_DB_ERROR`: Internal database error occurred\n\nThis handler:\n- Validates and extracts the location ID from the URL path\n- Accepts query parameters for pagination (limit, last_file_id)\n- Validates the limit parameter (max 1000)\n- Calls the metadata DB to list files with pagination for the specified location\n- Returns a structured response with minimal file info and next cursor", - "operationId": "locations_list_files", - "parameters": [ - { - "name": "location_id", - "in": "path", - "description": "Location ID", - "required": true, - "schema": { - "type": "integer", - "format": "int64" - } - } + "manifests" ], + "summary": "Handler for the `GET /manifests` endpoint", + "description": "Returns all registered manifests in the system.\n\n## Response\n- **200 OK**: Successfully retrieved all manifests\n- **500 Internal Server Error**: Database query error\n\n## Error Codes\n- `LIST_ALL_MANIFESTS_ERROR`: Failed to list all manifests from metadata database\n\n## Behavior\nThis handler returns a comprehensive list of all manifests registered in the system.\nFor each manifest, it includes:\n- The content-addressable hash (SHA-256)\n- The object store path where the manifest is stored\n\nResults are ordered by hash (lexicographical).", + "operationId": "list_all_manifests", "responses": { "200": { - "description": "Successfully retrieved location files", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/LocationFilesResponse" - } - } - } - }, - "400": { - "description": "Invalid location ID or query parameters", + "description": "Successfully retrieved all manifests", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/ManifestsResponse" } } } @@ -1107,9 +1055,7 @@ } } } - } - }, - "/manifests": { + }, "post": { "tags": [ "manifests" @@ -1556,14 +1502,14 @@ "tags": [ "schema" ], - "summary": "Handler for the `/schema` endpoint that provides SQL schema analysis.", - "description": "This endpoint performs comprehensive SQL validation and schema inference by:\n1. **Parsing SQL**: Validates syntax using DataFusion's SQL parser\n2. **Loading Datasets**: Retrieves actual dataset definitions from the registry\n3. 
**Schema Resolution**: Creates planning context with real table schemas from stored datasets\n4. **Schema Inference**: Uses DataFusion's query planner to determine output schema without execution\n5. **Special Fields**: Optionally prepends `SPECIAL_BLOCK_NUM` field for SQL datasets\n6. **Network Extraction**: Identifies which blockchain networks the query references\n\nThe validation works with real registered datasets and their actual schemas,\nensuring datasets exist, tables are valid, and column references are correct.\nThis enables accurate schema introspection for query builders and dataset development tools.\n\n## Request Body\n- `sql_query`: The SQL query to analyze\n- `is_sql_dataset`: (optional) Whether this is a SQL dataset (affects block number field inclusion)\n\n## Response\n- **200 OK**: Returns the schema and networks used by the query\n- **400 Bad Request**: SQL parse error\n- **500 Internal Server Error**: Dataset store or planning error\n\n## Error Codes\n- `SQL_PARSE_ERROR`: Failed to parse the SQL query\n- `DATASET_STORE_ERROR`: Failed to load datasets from store\n- `PLANNING_ERROR`: Failed to determine output schema", + "summary": "Handler for the `POST /schema` endpoint", + "description": "Analyzes SQL queries and returns the output schema without executing the query.\nPerforms comprehensive validation and schema inference using real registered datasets\nand their actual schemas.\n\n## Request Body\n- `dependencies`: External dataset dependencies mapped by alias\n- `tables`: Table definitions mapped by table name (optional if functions provided)\n- `functions`: Function names defined in dataset config (optional if tables provided)\n\n## Response\n- **200 OK**: Returns the inferred schema and networks referenced by the query\n- **400 Bad Request**: Invalid SQL syntax, table references, or function format\n- **404 Not Found**: Referenced dataset does not exist\n- **500 Internal Server Error**: Dataset store, planning, or internal errors\n\n## Error Codes\n- `INVALID_PAYLOAD_FORMAT`: Request JSON is malformed or missing required fields\n- `EMPTY_TABLES_AND_FUNCTIONS`: No tables or functions provided (at least one is required)\n- `INVALID_TABLE_SQL`: SQL syntax error in table definition\n- `TABLE_REFERENCE_RESOLUTION`: Failed to extract table references from SQL\n- `FUNCTION_REFERENCE_RESOLUTION`: Failed to extract function references from SQL\n- `DEPENDENCY_NOT_FOUND`: Referenced dependency does not exist\n- `DEPENDENCY_RESOLUTION`: Failed to resolve dependency\n- `CATALOG_QUALIFIED_TABLE`: Table uses unsupported catalog qualification\n- `UNQUALIFIED_TABLE`: Table missing required dataset qualification\n- `INVALID_TABLE_NAME`: Table name violates SQL identifier rules\n- `INVALID_DEPENDENCY_ALIAS_FOR_TABLE_REF`: Dependency alias in table reference is invalid\n- `INVALID_DEPENDENCY_ALIAS_FOR_FUNCTION_REF`: Dependency alias in function reference is invalid\n- `CATALOG_QUALIFIED_FUNCTION`: Function uses unsupported catalog qualification\n- `DEPENDENCY_ALIAS_NOT_FOUND`: Referenced alias not in dependencies\n- `DATASET_NOT_FOUND`: Referenced dataset does not exist\n- `GET_DATASET_ERROR`: Failed to retrieve dataset from store\n- `ETH_CALL_UDF_CREATION_ERROR`: Failed to create eth_call UDF\n- `TABLE_NOT_FOUND_IN_DATASET`: Table not found in referenced dataset\n- `FUNCTION_NOT_FOUND_IN_DATASET`: Function not found in referenced dataset\n- `ETH_CALL_NOT_AVAILABLE`: eth_call function not available for dataset\n- `SCHEMA_INFERENCE`: Failed to infer output schema from query\n\n## 
Schema Analysis Process\n1. **Parse SQL**: Validates syntax using DataFusion's SQL parser\n2. **Load Datasets**: Retrieves dataset definitions from the registry for all referenced datasets\n3. **Create Planning Context**: Builds planning context with real table schemas from stored datasets\n4. **Infer Schema**: Uses DataFusion's query planner to determine output schema without executing the query\n5. **Prepend Special Fields**: Adds `SPECIAL_BLOCK_NUM` field to the output schema\n6. **Extract Networks**: Identifies which blockchain networks are referenced by the query", "operationId": "schema_analyze", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OutputSchemaRequest" + "$ref": "#/components/schemas/SchemaRequest" } } }, @@ -1575,13 +1521,23 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OutputSchemaResponse" + "$ref": "#/components/schemas/SchemaResponse" } } } }, "400": { - "description": "SQL parse error", + "description": "Client error: Invalid SQL, table references, or function syntax", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "404": { + "description": "Dataset not found", "content": { "application/json": { "schema": { @@ -1591,7 +1547,7 @@ } }, "500": { - "description": "Dataset store or planning error", + "description": "Server error: Dataset store, planning, or internal failures", "content": { "application/json": { "schema": { @@ -1609,7 +1565,7 @@ "workers" ], "summary": "Handler for the `GET /workers` endpoint", - "description": "Retrieves and returns a list of all workers from the metadata database.\n\n## Response\n- **200 OK**: Returns all workers with their information\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `METADATA_DB_LIST_ERROR`: Failed to retrieve workers list from database\n\nThis handler:\n- Fetches all workers from the metadata database\n- Converts worker records to API response format with ISO 8601 RFC3339 timestamps\n- Returns a structured response with worker information including node IDs and last heartbeat times", + "description": "Retrieves and returns a list of all workers from the scheduler.\n\n## Response\n- **200 OK**: Returns all workers with their information\n- **500 Internal Server Error**: Scheduler query error\n\n## Error Codes\n- `SCHEDULER_LIST_WORKERS_ERROR`: Failed to retrieve workers list from scheduler\n\nThis handler:\n- Fetches all workers from the scheduler\n- Converts worker records to API response format with ISO 8601 RFC3339 timestamps\n- Returns a structured response with worker information including node IDs and last heartbeat times", "operationId": "workers_list", "responses": { "200": { @@ -1641,7 +1597,7 @@ "workers" ], "summary": "Handler for the `GET /workers/{id}` endpoint", - "description": "Retrieves and returns a specific worker by its node ID from the metadata database.\n\n## Path Parameters\n- `id`: The unique node identifier of the worker to retrieve\n\n## Response\n- **200 OK**: Returns the worker information as JSON with detailed metadata\n- **400 Bad Request**: Invalid node ID format (not parseable as NodeId)\n- **404 Not Found**: Worker with the given node ID does not exist\n- **500 Internal Server Error**: Database connection or query error\n\n## Error Codes\n- `INVALID_WORKER_ID`: The provided ID is not a valid worker node identifier\n- `WORKER_NOT_FOUND`: No worker exists with the given node ID\n- 
`METADATA_DB_GET_BY_ID_ERROR`: Failed to retrieve worker from database", + "description": "Retrieves and returns a specific worker by its node ID from the scheduler.\n\n## Path Parameters\n- `id`: The unique node identifier of the worker to retrieve\n\n## Response\n- **200 OK**: Returns the worker information as JSON with detailed metadata\n- **400 Bad Request**: Invalid node ID format (not parseable as NodeId)\n- **404 Not Found**: Worker with the given node ID does not exist\n- **500 Internal Server Error**: Scheduler query error\n\n## Error Codes\n- `INVALID_WORKER_ID`: The provided ID is not a valid worker node identifier\n- `WORKER_NOT_FOUND`: No worker exists with the given node ID\n- `SCHEDULER_GET_WORKER_ERROR`: Failed to retrieve worker from scheduler", "operationId": "workers_get", "parameters": [ { @@ -1819,6 +1775,13 @@ "format": "int32", "description": "Number of parallel workers to run\n\nEach worker will be responsible for an equal number of blocks.\nFor example, if extracting blocks 0-10,000,000 with parallelism=10,\neach worker will handle a contiguous section of 1 million blocks.\n\nOnly applicable to raw datasets (EVM RPC, Firehose, etc.).\nDerived datasets ignore this parameter.\n\nDefaults to 1 if not specified.", "minimum": 0 + }, + "worker_id": { + "type": [ + "string", + "null" + ], + "description": "Optional worker ID to assign the job to\n\nIf specified, the job will be assigned to this specific worker.\nIf not specified, a worker will be selected randomly from available workers.\n\nThe worker must be active (has sent heartbeats recently) for the deployment to succeed." } } }, @@ -1917,50 +1880,29 @@ } } }, - "FileListInfo": { - "type": "object", - "description": "Minimal file information for location file listings\n\nThis struct represents essential file metadata for list endpoints,\ncontaining only the most relevant information needed for file browsing\nwithin a location context.", - "required": [ - "id", - "file_name" - ], - "properties": { - "file_name": { - "type": "string", - "description": "Name of the file (e.g., \"blocks_0000000000_0000099999.parquet\")" - }, - "id": { - "type": "integer", - "format": "int64", - "description": "Unique identifier for this file (64-bit integer)" - }, - "object_size": { - "type": [ - "integer", - "null" - ], - "format": "int64", - "description": "Size of the file object in bytes" - } - } - }, "JobInfo": { "type": "object", - "description": "Job information returned by the API\n\nThis struct represents job metadata in a format suitable for API responses.\nIt contains essential information about a job without exposing internal\ndatabase implementation details.", + "description": "Represents job information for the API response", "required": [ "id", + "created_at", + "updated_at", "node_id", "status", "descriptor" ], "properties": { + "created_at": { + "type": "string", + "description": "Job creation timestamp in ISO 8601 / RFC 3339 format" + }, "descriptor": { "description": "Job descriptor containing job-specific parameters as JSON" }, "id": { "type": "integer", "format": "int64", - "description": "Unique identifier for this job (64-bit integer)" + "description": "Unique identifier for the job (64-bit integer)" }, "node_id": { "type": "string", @@ -1969,6 +1911,10 @@ "status": { "type": "string", "description": "Current status of the job (Scheduled, Running, Completed, Stopped, Failed, etc.)" + }, + "updated_at": { + "type": "string", + "description": "Job last update timestamp in ISO 8601 / RFC 3339 format" } } }, @@ 
-1996,151 +1942,6 @@ } } }, - "LocationFilesResponse": { - "type": "object", - "description": "Collection response for location file listings\n\nThis response structure provides paginated file data with\ncursor-based pagination support for efficient traversal.", - "required": [ - "files" - ], - "properties": { - "files": { - "type": "array", - "items": { - "$ref": "#/components/schemas/FileListInfo" - }, - "description": "List of files in this page with minimal information" - }, - "next_cursor": { - "type": [ - "integer", - "null" - ], - "format": "int64", - "description": "Cursor for the next page of results - use as last_file_id in next request (None if no more results)" - } - } - }, - "LocationInfo": { - "type": "object", - "description": "Location information returned by the API\n\nThis struct represents location metadata from the database in a format\nsuitable for API responses. It contains all the essential information\nabout where dataset table data is stored.", - "required": [ - "id", - "dataset", - "dataset_version", - "table", - "url", - "active" - ], - "properties": { - "active": { - "type": "boolean", - "description": "Whether this location is currently active for queries" - }, - "dataset": { - "type": "string", - "description": "Name of the dataset this location belongs to" - }, - "dataset_version": { - "type": "string", - "description": "Version of the dataset using semantic versioning (e.g., \"1.0.0\", or empty string for unversioned)" - }, - "id": { - "type": "integer", - "format": "int64", - "description": "Unique identifier for this location (64-bit integer)" - }, - "table": { - "type": "string", - "description": "Name of the table within the dataset (e.g., \"blocks\", \"transactions\")" - }, - "url": { - "type": "string", - "description": "Full URL to the storage location (e.g., \"s3://bucket/path/table.parquet\", \"file:///local/path/table.parquet\")" - }, - "writer": { - "type": [ - "integer", - "null" - ], - "format": "int64", - "description": "Writer job ID (64-bit integer, if one exists)" - } - } - }, - "LocationInfoWithDetails": { - "type": "object", - "description": "Location information with writer job details", - "required": [ - "id", - "dataset", - "dataset_version", - "table", - "url", - "active" - ], - "properties": { - "active": { - "type": "boolean", - "description": "Whether this location is currently active for queries" - }, - "dataset": { - "type": "string", - "description": "Name of the dataset this location belongs to" - }, - "dataset_version": { - "type": "string", - "description": "Version of the dataset using semantic versioning (e.g., \"1.0.0\", or empty string for unversioned)" - }, - "id": { - "type": "integer", - "format": "int64", - "description": "Unique identifier for this location (64-bit integer)" - }, - "table": { - "type": "string", - "description": "Name of the table within the dataset (e.g., \"blocks\", \"transactions\")" - }, - "url": { - "type": "string", - "description": "Full URL to the storage location (e.g., \"s3://bucket/path/table.parquet\", \"file:///local/path/table.parquet\")" - }, - "writer": { - "oneOf": [ - { - "type": "null" - }, - { - "$ref": "#/components/schemas/JobInfo", - "description": "Writer job information with full details (if one exists)" - } - ] - } - } - }, - "LocationsResponse": { - "type": "object", - "description": "API response containing location information\n\nThis response structure provides paginated location data with\ncursor-based pagination support for efficient traversal.", - "required": [ - 
"locations" - ], - "properties": { - "locations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/LocationInfo" - }, - "description": "List of locations in this page" - }, - "next_cursor": { - "type": [ - "integer", - "null" - ], - "format": "int64", - "description": "Cursor for the next page of results (None if no more results)" - } - } - }, "ManifestDatasetsResponse": { "type": "object", "description": "Response for listing datasets using a manifest", @@ -2162,44 +1963,43 @@ } } }, - "ManifestResponse": { - "type": "object", - "description": "Response wrapper for manifest content" - }, - "OutputSchemaRequest": { + "ManifestInfo": { "type": "object", - "description": "Request payload for output schema analysis\n\nContains the SQL query to analyze and optional configuration flags.", + "description": "Summary information for a single manifest", "required": [ - "sql_query" + "hash", + "dataset_count" ], "properties": { - "is_sql_dataset": { - "type": "boolean", - "description": "Whether this is a SQL dataset (affects block number field inclusion)\n\nWhen true, a special block number field is prepended to the schema.\nThis field tracks the block number for each row in SQL datasets." + "dataset_count": { + "type": "integer", + "format": "int64", + "description": "Number of datasets using this manifest", + "minimum": 0 }, - "sql_query": { + "hash": { "type": "string", - "description": "The SQL query to analyze for output schema determination" + "description": "Content-addressable hash (SHA-256)" } } }, - "OutputSchemaResponse": { + "ManifestResponse": { + "type": "object", + "description": "Response wrapper for manifest content" + }, + "ManifestsResponse": { "type": "object", - "description": "Response returned by the output schema endpoint\n\nContains the determined schema and list of networks referenced by the query.", + "description": "Response for listing all manifests", "required": [ - "schema", - "networks" + "manifests" ], "properties": { - "networks": { + "manifests": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/ManifestInfo" }, - "description": "List of networks referenced by the query\n\nContains the network names of all datasets/tables referenced\nin the SQL query (e.g., \"mainnet\", \"polygon\", etc.)." - }, - "schema": { - "description": "The output schema for the SQL query\n\nDescribes the structure and types of columns that will be returned\nwhen executing the provided SQL query against the dataset." 
+ "description": "List of all manifests in the system" } } }, @@ -2311,6 +2111,99 @@ } } }, + "RestoreResponse": { + "type": "object", + "description": "Response for restore operation", + "required": [ + "tables" + ], + "properties": { + "tables": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RestoredTableInfo" + }, + "description": "List of restored physical tables" + } + } + }, + "RestoredTableInfo": { + "type": "object", + "description": "Information about a restored physical table", + "required": [ + "table_name", + "location_id", + "url" + ], + "properties": { + "location_id": { + "type": "integer", + "format": "int64", + "description": "Unique location ID assigned in the metadata database" + }, + "table_name": { + "type": "string", + "description": "Name of the table within the dataset" + }, + "url": { + "type": "string", + "description": "Full URL to the storage location" + } + } + }, + "SchemaRequest": { + "type": "object", + "description": "Request for schema analysis with dependencies, tables, and functions", + "properties": { + "dependencies": { + "type": "object", + "description": "External dataset dependencies mapped by alias\n\nMaps alias names to dataset references (namespace/name@version or namespace/name@hash).\nThese aliases are used in SQL queries to reference external datasets.\nSymbolic references like \"latest\" or \"dev\" are not allowed.", + "additionalProperties": { + "type": "string" + }, + "propertyNames": { + "type": "string" + } + }, + "functions": { + "type": "object", + "description": "User-defined function definitions mapped by function name\n\nMaps function names to their complete definitions including input/output types\nand implementation source code. These functions can be referenced in SQL queries\nas bare function calls (e.g., `my_function(args)` without dataset qualification).\n\nAt least one of `tables` or `functions` must be provided.\n\nFunction names must follow DataFusion UDF identifier rules:\n- Start with a letter (a-z, A-Z) or underscore (_)\n- Contain only letters, digits (0-9), underscores (_), and dollar signs ($)\n- Maximum length of 255 bytes", + "additionalProperties": {}, + "propertyNames": { + "type": "string" + } + }, + "tables": { + "type": "object", + "description": "Table definitions mapped by table name\n\nEach table is defined by a SQL query that may reference\ntables from dependencies using the alias names.", + "additionalProperties": { + "type": "string" + }, + "propertyNames": { + "type": "string" + } + } + } + }, + "SchemaResponse": { + "type": "object", + "description": "Response returned by the schema endpoint\n\nContains schemas and networks for one or more tables.", + "required": [ + "schemas" + ], + "properties": { + "schemas": { + "type": "object", + "description": "Schemas for each table\n\nMaps table names to their schemas and networks.\nContains one entry per table definition.", + "additionalProperties": { + "$ref": "#/components/schemas/TableSchemaWithNetworks" + }, + "propertyNames": { + "type": "string" + } + } + } + }, "SpecialTags": { "type": "object", "description": "Special tags pointing to versions or hashes", @@ -2341,6 +2234,26 @@ "Error" ] }, + "TableSchemaWithNetworks": { + "type": "object", + "description": "Table schema with associated networks\n\nContains the output schema for a table and the list of networks referenced by its query.", + "required": [ + "schema", + "networks" + ], + "properties": { + "networks": { + "type": "array", + "items": { + "type": "string" + }, + 
"description": "List of networks referenced by this table's query\n\nContains the network names of all datasets/tables referenced\nin this specific table's SQL query (e.g., \"mainnet\", \"polygon\", etc.)." + }, + "schema": { + "description": "The output schema for the table\n\nDescribes the structure and types of columns that will be returned\nwhen executing the SQL query for this table." + } + } + }, "Value": {}, "VersionInfo": { "type": "object", @@ -2558,10 +2471,6 @@ "name": "jobs", "description": "Job management endpoints" }, - { - "name": "locations", - "description": "Location management endpoints" - }, { "name": "manifests", "description": "Manifest management endpoints" diff --git a/src/amp/admin/models.py b/src/amp/admin/models.py index 4e1273d..b5397ec 100644 --- a/src/amp/admin/models.py +++ b/src/amp/admin/models.py @@ -1,14 +1,13 @@ # generated by datamodel-codegen: # filename: admin.spec.json -# timestamp: 2025-11-06T23:57:02+00:00 +# timestamp: 2025-12-13T12:27:34+00:00 from __future__ import annotations -from datetime import datetime from enum import Enum from typing import Annotated, Any, Optional, Union -from pydantic import BaseModel, Field +from pydantic import AwareDatetime, BaseModel, Field, RootModel class Dataset(BaseModel): @@ -104,7 +103,8 @@ class DeployResponse(BaseModel): """ -class EndBlock(BaseModel): +class EndBlock(RootModel[Optional[str]]): + root: Optional[str] """ End block configuration for API requests. @@ -114,14 +114,9 @@ class EndBlock(BaseModel): - `null` (or omitted): Continuous dumping - never stops, keeps extracting new blocks as they arrive - `"latest"`: Stop at the latest available block at the time the dump starts - A positive number as a string (e.g., `"1000000"`): Stop at the specified absolute block number - - A negative number as a string (e.g., `"-100"`): Stop at (latest block - N), useful for staying N - blocks behind the chain tip - - Note: This is a simple wrapper around Optional[str] for documentation purposes. + - A negative number as a string (e.g., `"-100"`): Stop at (latest block - N), useful for staying N blocks behind the chain tip """ - value: Optional[str] = None - class ErrorResponse(BaseModel): """ @@ -205,45 +200,22 @@ class FileInfo(BaseModel): """ -class FileListInfo(BaseModel): - """ - Minimal file information for location file listings - - This struct represents essential file metadata for list endpoints, - containing only the most relevant information needed for file browsing - within a location context. - """ - - file_name: str - """ - Name of the file (e.g., "blocks_0000000000_0000099999.parquet") - """ - id: int - """ - Unique identifier for this file (64-bit integer) - """ - object_size: Optional[int] = None +class JobInfo(BaseModel): """ - Size of the file object in bytes + Represents job information for the API response """ - -class JobInfo(BaseModel): + created_at: str """ - Job information returned by the API - - This struct represents job metadata in a format suitable for API responses. - It contains essential information about a job without exposing internal - database implementation details. + Job creation timestamp in ISO 8601 / RFC 3339 format """ - descriptor: Any """ Job descriptor containing job-specific parameters as JSON """ id: int """ - Unique identifier for this job (64-bit integer) + Unique identifier for the job (64-bit integer) """ node_id: str """ @@ -253,6 +225,10 @@ class JobInfo(BaseModel): """ Current status of the job (Scheduled, Running, Completed, Stopped, Failed, etc.) 
""" + updated_at: str + """ + Job last update timestamp in ISO 8601 / RFC 3339 format + """ class JobsResponse(BaseModel): @@ -270,113 +246,6 @@ class JobsResponse(BaseModel): """ -class LocationFilesResponse(BaseModel): - """ - Collection response for location file listings - - This response structure provides paginated file data with - cursor-based pagination support for efficient traversal. - """ - - files: list[FileListInfo] - """ - List of files in this page with minimal information - """ - next_cursor: Optional[int] = None - """ - Cursor for the next page of results - use as last_file_id in next request (None if no more results) - """ - - -class LocationInfo(BaseModel): - """ - Location information returned by the API - - This struct represents location metadata from the database in a format - suitable for API responses. It contains all the essential information - about where dataset table data is stored. - """ - - active: bool - """ - Whether this location is currently active for queries - """ - dataset: str - """ - Name of the dataset this location belongs to - """ - dataset_version: str - """ - Version of the dataset using semantic versioning (e.g., "1.0.0", or empty string for unversioned) - """ - id: int - """ - Unique identifier for this location (64-bit integer) - """ - table: str - """ - Name of the table within the dataset (e.g., "blocks", "transactions") - """ - url: str - """ - Full URL to the storage location (e.g., "s3://bucket/path/table.parquet", "file:///local/path/table.parquet") - """ - writer: Optional[int] = None - """ - Writer job ID (64-bit integer, if one exists) - """ - - -class LocationInfoWithDetails(BaseModel): - """ - Location information with writer job details - """ - - active: bool - """ - Whether this location is currently active for queries - """ - dataset: str - """ - Name of the dataset this location belongs to - """ - dataset_version: str - """ - Version of the dataset using semantic versioning (e.g., "1.0.0", or empty string for unversioned) - """ - id: int - """ - Unique identifier for this location (64-bit integer) - """ - table: str - """ - Name of the table within the dataset (e.g., "blocks", "transactions") - """ - url: str - """ - Full URL to the storage location (e.g., "s3://bucket/path/table.parquet", "file:///local/path/table.parquet") - """ - writer: Optional[JobInfo] = None - - -class LocationsResponse(BaseModel): - """ - API response containing location information - - This response structure provides paginated location data with - cursor-based pagination support for efficient traversal. - """ - - locations: list[LocationInfo] - """ - List of locations in this page - """ - next_cursor: Optional[int] = None - """ - Cursor for the next page of results (None if no more results) - """ - - class ManifestDatasetsResponse(BaseModel): """ Response for listing datasets using a manifest @@ -392,52 +261,35 @@ class ManifestDatasetsResponse(BaseModel): """ -class ManifestResponse(BaseModel): - """ - Response wrapper for manifest content - """ - - -class OutputSchemaRequest(BaseModel): +class ManifestInfo(BaseModel): """ - Request payload for output schema analysis - - Contains the SQL query to analyze and optional configuration flags. + Summary information for a single manifest """ - is_sql_dataset: Optional[bool] = None + dataset_count: Annotated[int, Field(ge=0)] """ - Whether this is a SQL dataset (affects block number field inclusion) - - When true, a special block number field is prepended to the schema. 
- This field tracks the block number for each row in SQL datasets. + Number of datasets using this manifest """ - sql_query: str + hash: str """ - The SQL query to analyze for output schema determination + Content-addressable hash (SHA-256) """ -class OutputSchemaResponse(BaseModel): +class ManifestResponse(BaseModel): """ - Response returned by the output schema endpoint - - Contains the determined schema and list of networks referenced by the query. + Response wrapper for manifest content """ - networks: list[str] - """ - List of networks referenced by the query - Contains the network names of all datasets/tables referenced - in the SQL query (e.g., "mainnet", "polygon", etc.). +class ManifestsResponse(BaseModel): """ - schema_: Annotated[Any, Field(alias='schema')] + Response for listing all manifests """ - The output schema for the SQL query - Describes the structure and types of columns that will be returned - when executing the provided SQL query against the dataset. + manifests: list[ManifestInfo] + """ + List of all manifests in the system """ @@ -510,13 +362,12 @@ class RegisterManifestResponse(BaseModel): """ -class Manifest(BaseModel): +class Manifest(RootModel[str]): + root: Annotated[str, Field(max_length=64, min_length=64, pattern='[0-9a-fA-F]{64}')] """ A manifest hash (64-character SHA-256 hex string) """ - hash: Annotated[str, Field(max_length=64, min_length=64, pattern='[0-9a-fA-F]{64}')] - class RegisterRequest(BaseModel): """ @@ -549,6 +400,62 @@ class RegisterRequest(BaseModel): """ +class RestoredTableInfo(BaseModel): + """ + Information about a restored physical table + """ + + location_id: int + """ + Unique location ID assigned in the metadata database + """ + table_name: str + """ + Name of the table within the dataset + """ + url: str + """ + Full URL to the storage location + """ + + +class SchemaRequest(BaseModel): + """ + Request for schema analysis with dependencies, tables, and functions + """ + + dependencies: Optional[dict[str, str]] = None + """ + External dataset dependencies mapped by alias + + Maps alias names to dataset references (namespace/name@version or namespace/name@hash). + These aliases are used in SQL queries to reference external datasets. + Symbolic references like "latest" or "dev" are not allowed. + """ + functions: Optional[dict[str, Any]] = None + """ + User-defined function definitions mapped by function name + + Maps function names to their complete definitions including input/output types + and implementation source code. These functions can be referenced in SQL queries + as bare function calls (e.g., `my_function(args)` without dataset qualification). + + At least one of `tables` or `functions` must be provided. + + Function names must follow DataFusion UDF identifier rules: + - Start with a letter (a-z, A-Z) or underscore (_) + - Contain only letters, digits (0-9), underscores (_), and dollar signs ($) + - Maximum length of 255 bytes + """ + tables: Optional[dict[str, str]] = None + """ + Table definitions mapped by table name + + Each table is defined by a SQL query that may reference + tables from dependencies using the alias names. + """ + + class SpecialTags(BaseModel): """ Special tags pointing to versions or hashes @@ -575,10 +482,31 @@ class String(Enum): Error = 'Error' -class Value(BaseModel): - """Generic value wrapper for Any type""" +class TableSchemaWithNetworks(BaseModel): + """ + Table schema with associated networks + + Contains the output schema for a table and the list of networks referenced by its query. 
+ """ - value: Any + networks: list[str] + """ + List of networks referenced by this table's query + + Contains the network names of all datasets/tables referenced + in this specific table's SQL query (e.g., "mainnet", "polygon", etc.). + """ + schema_: Annotated[Any, Field(alias='schema')] + """ + The output schema for the table + + Describes the structure and types of columns that will be returned + when executing the SQL query for this table. + """ + + +class Value(RootModel[Any]): + root: Any class VersionInfo(BaseModel): @@ -635,7 +563,9 @@ class WorkerInfo(BaseModel): This is a lightweight summary view suitable for list endpoints. """ - heartbeat_at: Annotated[datetime, Field(examples=['2025-01-15T17:20:15.456789Z'])] + heartbeat_at: Annotated[ + AwareDatetime, Field(examples=['2025-01-15T17:20:15.456789Z']) + ] """ Last heartbeat timestamp (RFC3339 format) @@ -674,8 +604,10 @@ class WorkerMetadata(BaseModel): """ build_date: Annotated[ - datetime, - Field(examples=['2025-01-15T15:45:30Z', '2025-01-15T10:45:30-05:00', 'unknown']), + AwareDatetime, + Field( + examples=['2025-01-15T15:45:30Z', '2025-01-15T10:45:30-05:00', 'unknown'] + ), ] """ Date and time when the worker binary was built (RFC3339 format) @@ -685,7 +617,9 @@ class WorkerMetadata(BaseModel): Returns "unknown" if build date is not available. """ - commit_sha: Annotated[str, Field(examples=['8b065bde9c1a2f3e4d5c6b7a8e9f0a1b2c3d4e5f', 'unknown'])] + commit_sha: Annotated[ + str, Field(examples=['8b065bde9c1a2f3e4d5c6b7a8e9f0a1b2c3d4e5f', 'unknown']) + ] """ Full Git commit SHA (40-character hexadecimal) @@ -695,8 +629,10 @@ class WorkerMetadata(BaseModel): Returns "unknown" if commit information is not available. """ commit_timestamp: Annotated[ - datetime, - Field(examples=['2025-01-15T14:30:00Z', '2025-01-15T09:30:00-05:00', 'unknown']), + AwareDatetime, + Field( + examples=['2025-01-15T14:30:00Z', '2025-01-15T09:30:00-05:00', 'unknown'] + ), ] """ Timestamp when the commit was created (RFC3339 format) @@ -775,6 +711,42 @@ class DeployRequest(BaseModel): Defaults to 1 if not specified. """ + worker_id: Optional[str] = None + """ + Optional worker ID to assign the job to + + If specified, the job will be assigned to this specific worker. + If not specified, a worker will be selected randomly from available workers. + + The worker must be active (has sent heartbeats recently) for the deployment to succeed. + """ + + +class RestoreResponse(BaseModel): + """ + Response for restore operation + """ + + tables: list[RestoredTableInfo] + """ + List of restored physical tables + """ + + +class SchemaResponse(BaseModel): + """ + Response returned by the schema endpoint + + Contains schemas and networks for one or more tables. + """ + + schemas: dict[str, TableSchemaWithNetworks] + """ + Schemas for each table + + Maps table names to their schemas and networks. + Contains one entry per table definition. + """ class WorkerDetailResponse(BaseModel): @@ -786,14 +758,18 @@ class WorkerDetailResponse(BaseModel): worker health, version tracking, and operational status. """ - created_at: Annotated[datetime, Field(examples=['2025-01-15T14:30:00.123456Z'])] + created_at: Annotated[ + AwareDatetime, Field(examples=['2025-01-15T14:30:00.123456Z']) + ] """ Timestamp when the worker was first created in the system (RFC3339 format) The initial registration time of this worker. This timestamp never changes and represents when the worker first appeared in the system. 
""" - heartbeat_at: Annotated[datetime, Field(examples=['2025-01-15T17:20:15.456789Z'])] + heartbeat_at: Annotated[ + AwareDatetime, Field(examples=['2025-01-15T17:20:15.456789Z']) + ] """ Last heartbeat timestamp (RFC3339 format) @@ -828,7 +804,9 @@ class WorkerDetailResponse(BaseModel): Must start with a letter and contain only alphanumeric characters, underscores, hyphens, and dots. """ - registered_at: Annotated[datetime, Field(examples=['2025-01-15T16:45:30.789012Z'])] + registered_at: Annotated[ + AwareDatetime, Field(examples=['2025-01-15T16:45:30.789012Z']) + ] """ Timestamp when the worker last registered (RFC3339 format) diff --git a/src/amp/admin/schema.py b/src/amp/admin/schema.py index 01bcf87..a48f289 100644 --- a/src/amp/admin/schema.py +++ b/src/amp/admin/schema.py @@ -4,7 +4,7 @@ of SQL queries without executing them. """ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional from . import models @@ -23,7 +23,10 @@ class SchemaClient: Example: >>> client = AdminClient('http://localhost:8080') - >>> schema = client.schema.get_output_schema('SELECT * FROM eth.blocks', True) + >>> response = client.schema.get_output_schema( + ... tables={'t1': 'SELECT * FROM eth.blocks'}, + ... dependencies={'eth': '_/eth_firehose@0.0.0'} + ... ) """ def __init__(self, admin_client: 'AdminClient'): @@ -34,31 +37,42 @@ def __init__(self, admin_client: 'AdminClient'): """ self._admin = admin_client - def get_output_schema(self, sql_query: str, is_sql_dataset: bool = True) -> models.OutputSchemaResponse: - """Get output schema for a SQL query. + def get_output_schema( + self, + tables: Optional[dict[str, str]] = None, + dependencies: Optional[dict[str, str]] = None, + functions: Optional[dict[str, Any]] = None, + ) -> models.SchemaResponse: + """Get output schema for tables and functions. - Validates the query and returns the Arrow schema that would be produced, - without actually executing the query. + Validates the queries and returns the Arrow schemas that would be produced, + without actually executing the queries. Args: - sql_query: SQL query to analyze - is_sql_dataset: Whether this is for a SQL dataset (default: True) + tables: Optional map of table_name -> sql_query + dependencies: Optional map of alias -> dataset reference + functions: Optional map of function_name -> function_definition Returns: - OutputSchemaResponse with Arrow schema + SchemaResponse containing schemas for all requested tables Raises: GetOutputSchemaError: If schema analysis fails DependencyValidationError: If query references invalid dependencies Example: - >>> schema_resp = client.schema.get_output_schema( - ... 'SELECT block_num, hash FROM eth.blocks WHERE block_num > 1000000', - ... is_sql_dataset=True + >>> response = client.schema.get_output_schema( + ... tables={'my_table': 'SELECT block_num FROM eth.blocks'}, + ... dependencies={'eth': '_/eth_firehose@0.0.0'} ... 
) - >>> print(schema_resp.schema) + >>> print(response.schemas['my_table'].schema) """ - request_data = models.OutputSchemaRequest(sql_query=sql_query, is_sql_dataset=is_sql_dataset) + request_data = models.SchemaRequest( + tables=tables, + dependencies=dependencies, + functions=functions, + ) - response = self._admin._request('POST', '/schema', json=request_data.model_dump(mode='json')) - return models.OutputSchemaResponse.model_validate(response.json()) + response = self._admin._request('POST', '/schema', json=request_data.model_dump(mode='json', exclude_none=True)) + + return models.SchemaResponse.model_validate(response.json()) diff --git a/src/amp/client.py b/src/amp/client.py index 2eee462..6f81530 100644 --- a/src/amp/client.py +++ b/src/amp/client.py @@ -201,7 +201,15 @@ def to_manifest(self, table_name: str, network: str = 'mainnet') -> dict: 'manifest' """ # Get schema from Admin API - schema_response = self.client.schema.get_output_schema(self.query, is_sql_dataset=True) + schema_response = self.client.schema.get_output_schema( + tables={table_name: self.query}, dependencies=self._dependencies + ) + + # Extract the schema for our table + if table_name not in schema_response.schemas: + raise KeyError(f"Server did not return schema for table '{table_name}'") + + table_schema = schema_response.schemas[table_name] # Build manifest structure matching tests/config/manifests/*.json format manifest = { @@ -210,7 +218,7 @@ def to_manifest(self, table_name: str, network: str = 'mainnet') -> dict: 'tables': { table_name: { 'input': {'sql': self.query}, - 'schema': schema_response.schema_, # Use schema_ field (schema is aliased in Pydantic) + 'schema': table_schema.schema_, # Use schema_ field (schema is aliased in Pydantic) 'network': network, } }, @@ -250,7 +258,6 @@ def register_as(self, namespace: str, name: str, version: str, table_name: str, # Generate manifest manifest = self.to_manifest(table_name, network) - # Register with Admin API self.client.datasets.register(namespace, name, version, manifest) @@ -481,7 +488,10 @@ def schema(self): Example: >>> client = Client(query_url='...', admin_url='http://localhost:8080') - >>> schema_resp = client.schema.get_output_schema('SELECT * FROM eth.blocks', True) + >>> schema_resp = client.schema.get_output_schema( + ... tables={'t1': 'SELECT * FROM eth.blocks'}, + ... dependencies={'eth': '_/eth_firehose@1.0.0'} + ... 
) """ if not self._admin_client: raise ValueError( diff --git a/tests/integration/admin/test_jobs_client.py b/tests/integration/admin/test_jobs_client.py index 330940f..7a72393 100644 --- a/tests/integration/admin/test_jobs_client.py +++ b/tests/integration/admin/test_jobs_client.py @@ -15,7 +15,14 @@ class TestJobsClient: @respx.mock def test_get_job(self): """Test getting job by ID.""" - job_response = {'id': 123, 'status': 'Running', 'descriptor': {}, 'node_id': 'worker-1'} + job_response = { + 'id': 123, + 'status': 'Running', + 'descriptor': {}, + 'node_id': 'worker-1', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:01Z', + } respx.get('http://localhost:8080/jobs/123').mock(return_value=Response(200, json=job_response)) client = AdminClient('http://localhost:8080') @@ -41,8 +48,22 @@ def test_list_jobs(self): """Test listing jobs with pagination.""" jobs_response = { 'jobs': [ - {'id': 123, 'status': 'Running', 'descriptor': {}, 'node_id': 'worker-1'}, - {'id': 124, 'status': 'Completed', 'descriptor': {}, 'node_id': 'worker-2'}, + { + 'id': 123, + 'status': 'Running', + 'descriptor': {}, + 'node_id': 'worker-1', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:01Z', + }, + { + 'id': 124, + 'status': 'Completed', + 'descriptor': {}, + 'node_id': 'worker-2', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:01Z', + }, ], 'next_cursor': 125, } @@ -73,8 +94,22 @@ def test_wait_for_completion_success(self): """Test waiting for job completion.""" # First call: job is Running # Second call: job is Completed - job_running = {'id': 123, 'status': 'Running', 'descriptor': {}, 'node_id': 'worker-1'} - job_completed = {'id': 123, 'status': 'Completed', 'descriptor': {}, 'node_id': 'worker-1'} + job_running = { + 'id': 123, + 'status': 'Running', + 'descriptor': {}, + 'node_id': 'worker-1', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:01Z', + } + job_completed = { + 'id': 123, + 'status': 'Completed', + 'descriptor': {}, + 'node_id': 'worker-1', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:02Z', + } route = respx.get('http://localhost:8080/jobs/123') route.side_effect = [Response(200, json=job_running), Response(200, json=job_completed)] @@ -87,7 +122,14 @@ def test_wait_for_completion_success(self): @respx.mock def test_wait_for_completion_timeout(self): """Test waiting for job with timeout.""" - job_running = {'id': 123, 'status': 'Running', 'descriptor': {}, 'node_id': 'worker-1'} + job_running = { + 'id': 123, + 'status': 'Running', + 'descriptor': {}, + 'node_id': 'worker-1', + 'created_at': '2025-01-01T00:00:00Z', + 'updated_at': '2025-01-01T00:00:01Z', + } respx.get('http://localhost:8080/jobs/123').mock(return_value=Response(200, json=job_running)) client = AdminClient('http://localhost:8080') diff --git a/tests/unit/admin/test_models.py b/tests/unit/admin/test_models.py index f14580c..fae8688 100644 --- a/tests/unit/admin/test_models.py +++ b/tests/unit/admin/test_models.py @@ -59,19 +59,25 @@ def test_deploy_response_job_id(self): class TestSchemaModels: """Test schema-related models.""" - def test_output_schema_request(self): - """Test OutputSchemaRequest model.""" - request = models.OutputSchemaRequest(sql_query='SELECT * FROM eth.blocks', is_sql_dataset=True) + def test_schema_request(self): + """Test SchemaRequest model.""" + request = models.SchemaRequest( + tables={'t1': 'SELECT * FROM eth.blocks'}, + dependencies={'eth': 'ns/eth@1.0.0'}, + 
functions={}, + ) - assert request.sql_query == 'SELECT * FROM eth.blocks' - assert request.is_sql_dataset is True + assert request.tables == {'t1': 'SELECT * FROM eth.blocks'} + assert request.dependencies == {'eth': 'ns/eth@1.0.0'} + assert request.functions == {} - def test_output_schema_request_defaults(self): - """Test OutputSchemaRequest with default values.""" - request = models.OutputSchemaRequest(sql_query='SELECT 1') + def test_schema_request_defaults(self): + """Test SchemaRequest with default values.""" + request = models.SchemaRequest() - assert request.sql_query == 'SELECT 1' - # is_sql_dataset should have a default if defined in the model + assert request.tables is None + assert request.dependencies is None + assert request.functions is None class TestEndBlockModel: @@ -79,18 +85,19 @@ class TestEndBlockModel: def test_end_block_with_value(self): """Test EndBlock with a value.""" - end_block = models.EndBlock(value='latest') + end_block = models.EndBlock(root='latest') - assert end_block.value == 'latest' + assert end_block.root == 'latest' def test_end_block_none(self): """Test EndBlock with None (continuous).""" - end_block = models.EndBlock(value=None) + end_block = models.EndBlock(root=None) - assert end_block.value is None + assert end_block.root is None def test_end_block_default(self): """Test EndBlock with default (no value provided).""" - end_block = models.EndBlock() + # Pydantic V2 RootModel with Optional[str] requires explicit root=None if no default + end_block = models.EndBlock(root=None) - assert end_block.value is None + assert end_block.root is None diff --git a/tests/unit/admin/test_schema.py b/tests/unit/admin/test_schema.py new file mode 100644 index 0000000..5d4e8ca --- /dev/null +++ b/tests/unit/admin/test_schema.py @@ -0,0 +1,58 @@ +"""Unit tests for SchemaClient.""" + +from unittest.mock import Mock + +import pytest +from amp.admin import models +from amp.admin.schema import SchemaClient + + +@pytest.mark.unit +class TestSchemaClient: + """Test SchemaClient operations.""" + + def test_get_output_schema_tables(self): + """Test get_output_schema with tables.""" + mock_admin = Mock() + client = SchemaClient(mock_admin) + + # Mock response + mock_response = Mock() + expected_response = { + 'schemas': {'t1': {'schema': {'fields': [{'name': 'col1', 'type': 'int64'}]}, 'networks': ['mainnet']}} + } + mock_response.json.return_value = expected_response + mock_admin._request.return_value = mock_response + + # Call method + response = client.get_output_schema(tables={'t1': 'SELECT * FROM t1'}) + + # Verify request + mock_admin._request.assert_called_once() + call_kwargs = mock_admin._request.call_args[1] + assert call_kwargs['json']['tables'] == {'t1': 'SELECT * FROM t1'} + + # Verify response + assert isinstance(response, models.SchemaResponse) + assert 't1' in response.schemas + assert response.schemas['t1'].networks == ['mainnet'] + + def test_get_output_schema_full_request(self): + """Test get_output_schema with all parameters.""" + mock_admin = Mock() + client = SchemaClient(mock_admin) + + mock_response = Mock() + mock_response.json.return_value = {'schemas': {}} + mock_admin._request.return_value = mock_response + + client.get_output_schema( + tables={'t1': 'SELECT 1'}, dependencies={'d1': 'ns/d1@1.0.0'}, functions={'f1': {'body': '...'}} + ) + + # Verify request structure + call_kwargs = mock_admin._request.call_args[1] + request_json = call_kwargs['json'] + assert request_json['tables'] == {'t1': 'SELECT 1'} + assert request_json['dependencies'] == 
{'d1': 'ns/d1@1.0.0'} + assert request_json['functions'] == {'f1': {'body': '...'}} diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index f9fd98b..e2a9c57 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -11,7 +11,7 @@ import pytest -from src.amp.admin.models import OutputSchemaResponse +from src.amp.admin.models import TableSchemaWithNetworks from src.amp.client import Client, QueryBuilder @@ -244,9 +244,12 @@ def test_to_manifest_basic_structure(self): """Test that to_manifest generates correct manifest structure""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['mainnet'], schema={'fields': [{'name': 'block_num', 'type': 'int64'}]} ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder and generate manifest @@ -265,9 +268,12 @@ def test_to_manifest_with_dependencies(self): """Test that to_manifest includes dependencies""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['mainnet'], schema={'fields': [{'name': 'block_num', 'type': 'int64'}]} ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder with dependencies @@ -279,13 +285,21 @@ def test_to_manifest_with_dependencies(self): # Verify dependencies are included assert manifest['dependencies'] == {'eth': '_/eth_firehose@0.0.0'} + # Verify schema API was called with dependencies + mock_client.schema.get_output_schema.assert_called_once_with( + tables={'blocks': 'SELECT block_num FROM eth.blocks'}, dependencies={'eth': '_/eth_firehose@0.0.0'} + ) + def test_to_manifest_with_multiple_dependencies(self): """Test that to_manifest includes multiple dependencies""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['mainnet'], schema={'fields': [{'name': 'block_num', 'type': 'int64'}]} ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder with multiple dependencies @@ -297,13 +311,22 @@ def test_to_manifest_with_multiple_dependencies(self): # Verify all dependencies are included assert manifest['dependencies'] == {'eth': '_/eth_firehose@0.0.0', 'btc': '_/btc_firehose@1.2.3'} + # Verify schema API was called with dependencies + mock_client.schema.get_output_schema.assert_called_once_with( + tables={'blocks': 'SELECT e.block_num FROM eth.blocks e JOIN btc.blocks b'}, + dependencies={'eth': '_/eth_firehose@0.0.0', 'btc': '_/btc_firehose@1.2.3'}, + ) + def test_to_manifest_custom_network(self): """Test that to_manifest respects custom network parameter""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['polygon'], schema={'fields': [{'name': 'block_num', 'type': 'int64'}]} ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder @@ -317,9 +340,12 @@ 
def test_to_manifest_calls_schema_api(self): """Test that to_manifest calls the schema API with correct parameters""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['mainnet'], schema={'fields': [{'name': 'block_num', 'type': 'int64'}]} ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder @@ -328,7 +354,7 @@ def test_to_manifest_calls_schema_api(self): qb.to_manifest('blocks') # Verify schema API was called correctly - mock_client.schema.get_output_schema.assert_called_once_with(query, is_sql_dataset=True) + mock_client.schema.get_output_schema.assert_called_once_with(tables={'blocks': query}, dependencies={}) def test_to_manifest_matches_expected_format(self): """ @@ -347,7 +373,10 @@ def test_to_manifest_matches_expected_format(self): # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse(networks=['mainnet'], schema=expected_schema) + table_schema = TableSchemaWithNetworks(networks=['mainnet'], schema=expected_schema) + mock_schema_response = Mock() + mock_schema_response.schemas = {'erc20_transfers': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder with the same query and dependency @@ -374,14 +403,22 @@ def test_to_manifest_matches_expected_format(self): # Verify schema fields match exactly assert generated_table['schema']['arrow']['fields'] == expected_table['schema']['arrow']['fields'] + # Verify schema API was called with dependencies + mock_client.schema.get_output_schema.assert_called_once_with( + tables={'erc20_transfers': expected_query}, dependencies={'eth_firehose': '_/eth_firehose@0.0.0'} + ) + def test_to_manifest_serializes_to_valid_json(self): """Test that to_manifest generates a manifest that serializes to valid JSON with double quotes""" # Create a mock client with admin API mock_client = Mock() - mock_schema_response = OutputSchemaResponse( + table_schema = TableSchemaWithNetworks( networks=['mainnet'], schema={'arrow': {'fields': [{'name': 'block_num', 'type': 'UInt64', 'nullable': False}]}}, ) + mock_schema_response = Mock() + mock_schema_response.schemas = {'blocks': table_schema} + mock_client.schema.get_output_schema.return_value = mock_schema_response # Create QueryBuilder