diff --git a/.env.example b/.env.example index c90c0f0..23660e5 100644 --- a/.env.example +++ b/.env.example @@ -35,6 +35,35 @@ BLOBSTORE_URL=https://appdev.kbase.us/services/shock-api # Enable debug mode (true/false) DEBUG=false +# ============================================================================= +# AI PROVIDER CONFIGURATION (for automatic config generation) +# ============================================================================= +# Preferred AI provider: auto, openai, argo, ollama, claude-code, rules-only +AI_PROVIDER=auto + +# Fallback chain (comma-separated, tried in order) +AI_FALLBACK_CHAIN=openai,argo,ollama,rules-only + +# OpenAI Configuration +# OPENAI_API_KEY=sk-your-api-key-here +OPENAI_MODEL=gpt-4o-mini +OPENAI_TEMPERATURE=0.1 + +# Argo Configuration (ANL internal) +# ARGO_USER=your-anl-username +ARGO_MODEL=gpt4o +ARGO_PROXY_PORT=1080 + +# Ollama Configuration (local LLM) +OLLAMA_HOST=http://localhost:11434 +OLLAMA_MODEL=llama3 + +# Claude Code Configuration +CLAUDE_CODE_EXECUTABLE=claude + +# Generated Config Storage +GENERATED_CONFIG_DIR=/tmp/tablescanner_configs + # ============================================================================= # TEST DATA (AppDev) # ============================================================================= diff --git a/.gitignore b/.gitignore index 5ec0315..6e4db2d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,10 @@ trash/ docs/DEMO_SCRIPT.md docs/QUICKSTART.md docs/internal/ +DATABASE_SCHEMA.md +docs/personal/ +archive/ +docs/archive .DS_Store .idea @@ -31,3 +35,8 @@ lib/ # Cache directory cache/ + +# Project-specific artifacts +DATABASE_SCHEMA.md +*.webp +*.png diff --git a/README.md b/README.md index 4fc1c96..e6ec70a 100644 --- a/README.md +++ b/README.md @@ -1,64 +1,156 @@ # TableScanner -TableScanner is a microservice for providing filtered and paginated access to tabular data stored in KBase. It uses local SQLite caching and indexing to provide fast access to large datasets without loading them entirely into memory. +TableScanner is a production-grade microservice for querying tabular data from KBase SQLite databases. It provides a comprehensive DataTables Viewer-compatible API with advanced query capabilities, type-aware filtering, and performance optimizations. -## Functionality +## Features -The service provides two methods for data access: -1. **Hierarchical REST**: Path-based endpoints for navigating objects and tables using GET requests. -2. **Flat POST**: A single endpoint (`/table-data`) that accepts a JSON payload for all query parameters. +- **Data Access**: Query SQLite databases from KBase objects and handles +- **Type-Aware Filtering**: Automatic numeric conversion for proper filtering +- **Advanced Operators**: Support for eq, ne, gt, gte, lt, lte, like, ilike, in, not_in, between, is_null, is_not_null +- **Aggregations**: GROUP BY support with count, sum, avg, min, max, stddev, variance, distinct_count +- **Full-Text Search**: FTS5 support with automatic virtual table creation +- **Performance**: Connection pooling, query caching, automatic indexing +- **Statistics**: Pre-computed column statistics (min, max, mean, median, stddev) +- **Schema Information**: Detailed table and column schema with indexes -## Architecture - -TableScanner operates as a bridge between KBase storage and client applications: -1. **Data Fetching**: Retrieves SQLite databases from the KBase Blobstore. -2. **Local Caching**: Stores databases locally to avoid repeated downloads. -3. **Indexing**: Creates indices on-the-fly for all table columns to optimize query performance. -4. **API Layer**: A FastAPI application that handles requests and executes SQL queries against the local cache. - -Technical details on race conditions and concurrency handling are available in [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). - -## Setup +## Quick Start ### Production + ```bash docker compose up --build -d ``` + The service will be available at `http://localhost:8000`. API documentation is at `/docs`. ### Development + ```bash cp .env.example .env -bash scripts/dev.sh +# Edit .env and set KB_SERVICE_AUTH_TOKEN +./scripts/dev.sh ``` +The helper script `scripts/dev.sh` automates the environment setup: +1. Activates the virtual environment (`.venv` or `venv`) +2. Loads environment variables from `.env` +3. Sets `PYTHONPATH` +4. Starts the FastAPI development server with hot-reload via `fastapi dev` + ## API Usage -### Path-based REST -List tables: -`GET /object/{upa}/tables` +### List Tables -Query table data: -`GET /object/{upa}/tables/{table_name}/data?limit=100` +```bash +curl -H "Authorization: Bearer $KB_TOKEN" \ + "http://localhost:8000/object/76990/7/2/tables" +``` -### Flat POST -Query table data: -`POST /table-data` +### Query Table Data -Payload example: -```json -{ - "berdl_table_id": "76990/7/2", - "table_name": "Genes", - "limit": 100 -} +```bash +curl -H "Authorization: Bearer $KB_TOKEN" \ + "http://localhost:8000/object/76990/7/2/tables/Genes/data?limit=10" +``` + +### Enhanced Query with Filters + +```bash +curl -X POST -H "Authorization: Bearer $KB_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "berdl_table_id": "local/76990_7_2", + "table_name": "Genes", + "limit": 100, + "filters": [ + {"column": "contigs", "operator": "gt", "value": "50"} + ] + }' \ + "http://localhost:8000/table-data" ``` +### Aggregation Query + +```bash +curl -X POST -H "Authorization: Bearer $KB_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "group_by": ["category"], + "aggregations": [ + {"column": "value", "function": "sum", "alias": "total"} + ] + }' \ + "http://localhost:8000/api/aggregate/local/76990_7_2/tables/Data" +``` + +## Documentation + +- **[API Reference](docs/API.md)** - Complete API documentation with examples +- **[Architecture Dictionary](docs/ARCHITECTURE.md)** - System design and technical overview +- **[Contributing Guide](docs/CONTRIBUTING.md)** - Setup, testing, and contribution guidelines + +## Architecture + +TableScanner operates as a bridge between KBase storage and client applications: + +1. **Data Fetching**: Retrieves SQLite databases from KBase Blobstore +2. **Local Caching**: Stores databases locally to avoid repeated downloads +3. **Connection Pooling**: Manages database connections with automatic lifecycle +4. **Query Execution**: Type-aware filtering with automatic numeric conversion +5. **Performance**: Query caching, automatic indexing, SQLite optimizations +6. **API Layer**: FastAPI application with comprehensive endpoints + ## Project Structure -- `app/`: Application logic and routes. -- `app/utils/`: Utilities for caching, SQLite operations, and Workspace integration. -- `docs/`: Technical documentation. -- `scripts/`: Client examples and utility scripts. + +``` +TableScanner/ +├── app/ +│ ├── main.py # FastAPI application +│ ├── routes.py # API endpoints +│ ├── models.py # Pydantic models +│ ├── config.py # Configuration settings +│ ├── services/ +│ │ ├── data/ +│ │ │ ├── connection_pool.py # Connection pooling +│ │ │ ├── query_service.py # Query execution +│ │ │ └── ... +│ │ └── db_helper.py # Database resolution +│ └── utils/ # Utilities (SQLite, KBase Client) +├── docs/ # Documentation (API, Architecture, Contributing) +├── tests/ # Test suite (Unit & Integration) +├── scripts/ # Helper scripts (dev.sh) +└── static/ # Static files +``` + +## Configuration + +Create a `.env` file with: + +```env +KB_SERVICE_AUTH_TOKEN=your_token_here +CACHE_DIR=/tmp/tablescanner_cache +CACHE_MAX_AGE_HOURS=24 +DEBUG=false +``` + +## Performance + +- Query execution: < 100ms for typical queries +- Cache hit rate: > 80% for repeated queries +- Database connection: Reused for 30 minutes +- Query cache: 5-minute TTL, max 1000 entries +- Automatic indexing: One-time cost, cached thereafter + +## Testing + +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov=app --cov-report=html +``` ## License -MIT License. + +MIT License diff --git a/app/config.py b/app/config.py index 37fb984..525a056 100644 --- a/app/config.py +++ b/app/config.py @@ -59,9 +59,23 @@ class Settings(BaseSettings): default=False, description="Enable debug mode with verbose logging" ) + KB_ENV: str = Field( + default="appdev", + description="KBase environment (appdev, ci, prod)" + ) # Root path for proxy deployment (e.g., "/services/berdl_table_scanner") ROOT_PATH: str = "" + + # Timeout settings + DOWNLOAD_TIMEOUT_SECONDS: float = Field( + default=30.0, + description="Timeout in seconds for downloading databases" + ) + KBASE_API_TIMEOUT_SECONDS: float = Field( + default=10.0, + description="Timeout in seconds for KBase API calls" + ) class Config: env_file = ".env" diff --git a/app/config_constants.py b/app/config_constants.py new file mode 100644 index 0000000..2f5124e --- /dev/null +++ b/app/config_constants.py @@ -0,0 +1,20 @@ +""" +Configuration constants for TableScanner. +""" + +# Default values +DEFAULT_LIMIT = 100 +MAX_LIMIT = 500000 +DEFAULT_OFFSET = 0 +DEFAULT_SORT_ORDER = "ASC" + +# Cache settings +CACHE_TTL_SECONDS = 300 # 5 minutes +CACHE_MAX_ENTRIES = 1000 +INDEX_CACHE_TTL = 3600 # 1 hour + +# Timeout settings +KBASE_API_TIMEOUT_SECONDS = 30 + +# API Version +API_VERSION = "2.0" diff --git a/app/db/__init__.py b/app/db/__init__.py new file mode 100644 index 0000000..3038256 --- /dev/null +++ b/app/db/__init__.py @@ -0,0 +1,5 @@ +""" +Database module for Config Control Plane. + +Provides SQLite-based persistent storage for configuration records. +""" diff --git a/app/db/schema.sql b/app/db/schema.sql new file mode 100644 index 0000000..db58d0a --- /dev/null +++ b/app/db/schema.sql @@ -0,0 +1,107 @@ +-- ============================================================================= +-- Config Control Plane Database Schema +-- ============================================================================= +-- +-- Stores configuration records with full lifecycle support: +-- - draft: Work in progress, modifiable +-- - proposed: Ready for review, read-only +-- - published: Production-ready, locked +-- - deprecated: Marked for removal +-- - archived: Historical reference +-- +-- ============================================================================= + +-- Config records with full lifecycle support +CREATE TABLE IF NOT EXISTS config_records ( + id TEXT PRIMARY KEY, + source_type TEXT NOT NULL CHECK(source_type IN ('object', 'handle', 'builtin', 'custom')), + source_ref TEXT NOT NULL, + fingerprint TEXT, + version INTEGER NOT NULL DEFAULT 1, + + -- Lifecycle + state TEXT NOT NULL DEFAULT 'draft' CHECK(state IN ('draft', 'proposed', 'published', 'deprecated', 'archived')), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + created_by TEXT NOT NULL, + published_at TIMESTAMP, + published_by TEXT, + + -- Content + config_json TEXT NOT NULL, -- Full DataTypeConfig JSON + extends_id TEXT REFERENCES config_records(id), + overlays_json TEXT, + + -- Metadata + object_type TEXT, + ai_provider TEXT, + confidence REAL DEFAULT 1.0, + generation_time_ms REAL, + + -- Audit + change_summary TEXT, + change_author TEXT, + + -- Unique constraint on source_ref + fingerprint + version + UNIQUE(source_ref, fingerprint, version) +); + +-- Audit log for all changes +CREATE TABLE IF NOT EXISTS config_audit_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_id TEXT NOT NULL REFERENCES config_records(id) ON DELETE CASCADE, + action TEXT NOT NULL, + old_state TEXT, + new_state TEXT, + changed_by TEXT NOT NULL, + changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + diff_json TEXT, + reason TEXT +); + +-- User overrides for personalized config preferences +CREATE TABLE IF NOT EXISTS user_config_overrides ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id TEXT NOT NULL, + source_ref TEXT NOT NULL, + override_config_json TEXT NOT NULL, -- Partial or full config override + priority INTEGER DEFAULT 100, -- Lower = higher priority + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + is_active BOOLEAN DEFAULT 1, + UNIQUE(user_id, source_ref) +); + +-- Config version history for diff visualization +CREATE TABLE IF NOT EXISTS config_version_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_id TEXT NOT NULL REFERENCES config_records(id) ON DELETE CASCADE, + version INTEGER NOT NULL, + config_json TEXT NOT NULL, + snapshot_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE(config_id, version) +); + +-- Config test results for validation against real data +CREATE TABLE IF NOT EXISTS config_test_results ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + config_id TEXT NOT NULL REFERENCES config_records(id) ON DELETE CASCADE, + test_type TEXT NOT NULL, -- 'schema', 'data', 'performance', 'integration' + test_status TEXT NOT NULL, -- 'passed', 'failed', 'warning' + test_details_json TEXT, -- Detailed test results + tested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + tested_by TEXT, + execution_time_ms REAL +); + +-- Indexes for fast lookups +CREATE INDEX IF NOT EXISTS idx_config_source ON config_records(source_type, source_ref); +CREATE INDEX IF NOT EXISTS idx_config_state ON config_records(state); +CREATE INDEX IF NOT EXISTS idx_config_fingerprint ON config_records(fingerprint); +CREATE INDEX IF NOT EXISTS idx_config_object_type ON config_records(object_type); +CREATE INDEX IF NOT EXISTS idx_config_extends ON config_records(extends_id); +CREATE INDEX IF NOT EXISTS idx_audit_config_id ON config_audit_log(config_id); +CREATE INDEX IF NOT EXISTS idx_audit_changed_at ON config_audit_log(changed_at); +CREATE INDEX IF NOT EXISTS idx_user_override_user ON user_config_overrides(user_id, source_ref); +CREATE INDEX IF NOT EXISTS idx_version_history_config ON config_version_history(config_id, version); +CREATE INDEX IF NOT EXISTS idx_test_results_config ON config_test_results(config_id, test_type); \ No newline at end of file diff --git a/app/exceptions.py b/app/exceptions.py new file mode 100644 index 0000000..1b707e9 --- /dev/null +++ b/app/exceptions.py @@ -0,0 +1,29 @@ +""" +Custom exceptions for TableScanner. +""" + +class TableScannerError(Exception): + """Base exception for TableScanner.""" + pass + +class TableNotFoundError(TableScannerError): + """Raised when a requested table does not exist.""" + def __init__(self, table_name: str, available_tables: list[str] | None = None): + msg = f"Table '{table_name}' not found" + if available_tables: + msg += f". Available: {available_tables}" + super().__init__(msg) + self.table_name = table_name + +class ColumnNotFoundError(TableScannerError): + """Raised when a requested column does not exist.""" + def __init__(self, column_name: str, table_name: str): + super().__init__(f"Column '{column_name}' not found in table '{table_name}'") + +class InvalidFilterError(TableScannerError): + """Raised when a filter configuration is invalid.""" + pass + +class DatabaseAccessError(TableScannerError): + """Raised when database file cannot be accessed or opened.""" + pass diff --git a/app/main.py b/app/main.py index 8ed4284..b20519d 100644 --- a/app/main.py +++ b/app/main.py @@ -7,6 +7,7 @@ Run with: uv run fastapi dev app/main.py """ +import os from pathlib import Path from fastapi import FastAPI from fastapi.staticfiles import StaticFiles @@ -30,20 +31,28 @@ def create_app() -> FastAPI: # Configure root_path for KBase dynamic services # KBase services are often deployed at /services/service_name # Pydantic Settings management or manual environ check can handle this. - import os + # Pydantic Settings management or manual environ check can handle this. root_path = os.environ.get("KB_SERVICE_ROOT_PATH", "") description = """ ## TableScanner API - A FastAPI service for querying BERDL table data from KBase. + A FastAPI service for querying tabular data from KBase SQLite databases. + Provides a comprehensive DataTables Viewer-compatible API with advanced + query capabilities, type-aware filtering, and performance optimizations. ### Features - - List pangenomes from BERDLTables objects - - List tables within a pangenome + - List tables in KBase objects - Query table data with filtering, sorting, and pagination - - Local caching for performance + - Type-aware filtering with automatic numeric conversion + - Advanced filter operators (eq, ne, gt, gte, lt, lte, like, ilike, in, not_in, between, is_null, is_not_null) + - Aggregations with GROUP BY support + - Full-text search (FTS5) + - Column statistics and schema information + - Query result caching for performance + - Local database caching + - Connection pooling with automatic lifecycle management ### Authentication Pass your KBase auth token in the `Authorization` header. @@ -87,7 +96,7 @@ def create_app() -> FastAPI: app.add_middleware( CORSMiddleware, allow_origins=["*"], - allow_credentials=True, + allow_credentials=False, allow_methods=["*"], allow_headers=["*"], ) diff --git a/app/models.py b/app/models.py index f24fbfd..9dee55c 100644 --- a/app/models.py +++ b/app/models.py @@ -1,4 +1,6 @@ from __future__ import annotations +from datetime import datetime +from enum import Enum from typing import Any, Literal from pydantic import BaseModel, Field @@ -109,6 +111,7 @@ class TableListResponse(BaseModel): """Response for listing tables in a database.""" berdl_table_id: str | None = Field(None, description="BERDLTable object reference", examples=["76990/7/2"]) handle_ref: str | None = Field(None, description="Blobstore handle reference", examples=["KBH_248028"]) + object_type: str | None = Field(None, description="KBase object type", examples=["KBaseGeneDataLakes.BERDLTables-1.0"]) tables: list[TableInfo] = Field( default_factory=list, description="List of available tables", @@ -118,52 +121,59 @@ class TableListResponse(BaseModel): ]] ) source: str | None = Field(None, description="Data source", examples=["Cache"]) - - -class PangenomeInfo(BaseModel): - """Information about a pangenome found in the SQLite file.""" - pangenome_taxonomy: str | None = Field(None, description="Taxonomy of the pangenome", examples=["Escherichia coli"]) - genome_count: int = Field(..., description="Number of genomes in the pangenome", examples=[42]) - source_berdl_id: str = Field(..., description="Source BERDL Table ID", examples=["76990/7/2"]) - user_genomes: list[str] = Field( - default_factory=list, - description="List of user-provided genome references", - examples=[["76990/1/1", "76990/2/1"]] + + # Viewer integration fields + config_fingerprint: str | None = Field( + None, + description="Fingerprint of cached viewer config (if exists)", + examples=["v1_auto_abc123def456"] ) - berdl_genomes: list[str] = Field( - default_factory=list, - description="List of BERDL/Datalake genome identifiers", - examples=[["GLM4:EC_G1", "GLM4:EC_G2"]] + config_url: str | None = Field( + None, + description="URL to retrieve generated viewer config", + examples=["/config/generated/v1_auto_abc123def456"] ) - handle_ref: str | None = Field( + has_cached_config: bool = Field( + False, + description="Whether a viewer config is cached for this database" + ) + + # Schema information for immediate viewer use + schemas: dict | None = Field( None, - description="Blobstore handle reference for SQLite database", - examples=["KBH_248028"] + description="Column types per table: {table_name: {column: sql_type}}" ) - - -class PangenomesResponse(BaseModel): - """Response for listing pangenomes from a BERDLTables object.""" - berdl_table_id: str | None = Field(None, description="BERDLTable object reference", examples=["76990/7/2"]) - pangenomes: list[PangenomeInfo] = Field( - default_factory=list, - description="List of available pangenomes", - examples=[[ - { - "pangenome_taxonomy": "Escherichia coli", - "genome_count": 42, - "source_berdl_id": "76990/7/2", - "handle_ref": "KBH_248028" - } - ]] + + # Fallback config availability + has_builtin_config: bool = Field( + False, + description="Whether a built-in fallback config exists for this object type" ) - pangenome_count: int = Field( - 1, - description="Total number of pangenomes", - examples=[1] + builtin_config_id: str | None = Field( + None, + description="ID of the matching built-in config" + ) + + # Database metadata + database_size_bytes: int | None = Field( + None, + description="Size of the SQLite database file in bytes" + ) + total_rows: int = Field( + 0, + description="Total rows across all tables" + ) + + # Versioning for backward compatibility + api_version: str = Field( + "2.0", + description="API version for response format compatibility" ) + + + class TableDataResponse(BaseModel): """ Response for table data queries. @@ -218,6 +228,11 @@ class TableDataResponse(BaseModel): None, description="Path to SQLite database" ) + object_type: str | None = Field( + None, + description="KBase object type", + examples=["KBaseGeneDataLakes.BERDLTables-1.0"] + ) model_config = { "json_schema_extra": { @@ -296,4 +311,131 @@ class ServiceStatus(BaseModel): ..., description="Service status" ) - cache_dir: str = Field(..., description="Cache directory path") \ No newline at end of file + cache_dir: str = Field(..., description="Cache directory path") + + + + +# ============================================================================= +# DATATABLES VIEWER API MODELS +# ============================================================================= + + +class FilterRequest(BaseModel): + """Filter specification for DataTables Viewer API.""" + column: str = Field(..., description="Column name to filter") + operator: str = Field( + ..., + description="Filter operator: eq, ne, gt, gte, lt, lte, like, ilike, in, not_in, between, is_null, is_not_null" + ) + value: Any = Field(None, description="Filter value (or first value for 'between')") + value2: Any = Field(None, description="Second value for 'between' operator") + + +class AggregationRequest(BaseModel): + """Aggregation specification for DataTables Viewer API.""" + column: str = Field(..., description="Column name to aggregate") + function: str = Field( + ..., + description="Aggregation function: count, sum, avg, min, max, stddev, variance, distinct_count" + ) + alias: str | None = Field(None, description="Alias for aggregated column") + + +class TableDataQueryRequest(BaseModel): + """Enhanced table data query request for DataTables Viewer API.""" + berdl_table_id: str = Field(..., description="Database identifier (local/db_name format)") + table_name: str = Field(..., description="Table name") + limit: int = Field(100, ge=1, le=500000, description="Maximum rows to return") + offset: int = Field(0, ge=0, description="Number of rows to skip") + columns: list[str] | None = Field(None, description="List of columns to select (None = all)") + sort_column: str | None = Field(None, description="Column to sort by") + sort_order: Literal["ASC", "DESC"] = Field("ASC", description="Sort direction") + search_value: str | None = Field(None, description="Global search term") + col_filter: dict[str, str] | None = Field(None, description="Simple column filters (legacy)") + filters: list[FilterRequest] | None = Field(None, description="Advanced filter specifications") + aggregations: list[AggregationRequest] | None = Field(None, description="Aggregation specifications") + group_by: list[str] | None = Field(None, description="Columns for GROUP BY clause") + + +class AggregationQueryRequest(BaseModel): + """Aggregation query request.""" + group_by: list[str] = Field(..., description="Columns for GROUP BY") + aggregations: list[AggregationRequest] = Field(..., description="Aggregation specifications") + filters: list[FilterRequest] | None = Field(None, description="Filter specifications") + limit: int = Field(100, ge=1, le=500000, description="Maximum rows to return") + offset: int = Field(0, ge=0, description="Number of rows to skip") + + +class ColumnTypeInfo(BaseModel): + """Column type information.""" + name: str = Field(..., description="Column name") + type: str = Field(..., description="SQLite type (INTEGER, REAL, TEXT, etc.)") + notnull: bool = Field(False, description="Whether column is NOT NULL") + pk: bool = Field(False, description="Whether column is PRIMARY KEY") + dflt_value: Any = Field(None, description="Default value") + + +class QueryMetadata(BaseModel): + """Query execution metadata.""" + query_type: str = Field(..., description="Type of query: select, aggregate") + sql: str = Field(..., description="Executed SQL query") + filters_applied: int = Field(0, description="Number of filters applied") + has_search: bool = Field(False, description="Whether search was applied") + has_sort: bool = Field(False, description="Whether sorting was applied") + has_group_by: bool = Field(False, description="Whether GROUP BY was applied") + has_aggregations: bool = Field(False, description="Whether aggregations were applied") + + +class TableDataQueryResponse(BaseModel): + """Enhanced table data query response for DataTables Viewer API.""" + headers: list[str] = Field(..., description="Column names") + data: list[list[str]] = Field(..., description="Row data as list of lists") + total_count: int = Field(..., description="Total rows in table (before filtering)") + column_types: list[ColumnTypeInfo] = Field(..., description="Column type information") + query_metadata: QueryMetadata = Field(..., description="Query execution metadata") + cached: bool = Field(False, description="Whether result was from cache") + execution_time_ms: float = Field(..., description="Query execution time in milliseconds") + limit: int = Field(..., description="Limit applied") + offset: int = Field(..., description="Offset applied") + table_name: str = Field(..., description="Table name") + database_path: str = Field(..., description="Path to database file") + + +class TableSchemaInfo(BaseModel): + """Table schema information.""" + table: str = Field(..., description="Table name") + columns: list[ColumnTypeInfo] = Field(..., description="Column information") + indexes: list[dict[str, str]] = Field(default_factory=list, description="Index information") + + +class ColumnStatistic(BaseModel): + """Column statistics.""" + column: str = Field(..., description="Column name") + type: str = Field(..., description="Column type") + null_count: int = Field(0, description="Number of NULL values") + distinct_count: int = Field(0, description="Number of distinct values") + min: Any = Field(None, description="Minimum value") + max: Any = Field(None, description="Maximum value") + mean: float | None = Field(None, description="Mean value") + median: float | None = Field(None, description="Median value") + stddev: float | None = Field(None, description="Standard deviation") + sample_values: list[Any] = Field(default_factory=list, description="Sample values") + + +class TableStatisticsResponse(BaseModel): + """Table statistics response.""" + table: str = Field(..., description="Table name") + row_count: int = Field(..., description="Total row count") + columns: list[ColumnStatistic] = Field(..., description="Column statistics") + last_updated: int = Field(..., description="Last update timestamp (milliseconds since epoch)") + + +class HealthResponse(BaseModel): + """Health check response.""" + status: str = Field("ok", description="Service status") + timestamp: str = Field(..., description="ISO8601 timestamp") + mode: str = Field("cached_sqlite", description="Service mode") + data_dir: str = Field(..., description="Data directory path") + config_dir: str = Field(..., description="Config directory path") + cache: dict[str, Any] = Field(..., description="Cache information") \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index 12abb08..c5fbe8a 100644 --- a/app/routes.py +++ b/app/routes.py @@ -12,10 +12,10 @@ """ -import time +import asyncio import logging +from datetime import datetime from pathlib import Path -from uuid import uuid4 from app.utils.workspace import KBaseClient from fastapi import APIRouter, HTTPException, Header, Query @@ -23,32 +23,40 @@ from app.models import ( TableDataRequest, TableDataResponse, - PangenomesResponse, - PangenomeInfo, TableListResponse, TableInfo, CacheResponse, ServiceStatus, TableSchemaResponse, + TableDataQueryRequest, + TableDataQueryResponse, + TableSchemaInfo, + TableStatisticsResponse, + AggregationQueryRequest, + HealthResponse, + FilterRequest, + AggregationRequest, ) from app.utils.workspace import ( - list_pangenomes_from_object, download_pangenome_db, + get_object_type, ) from app.utils.sqlite import ( list_tables, - get_table_data, get_table_columns, get_table_row_count, validate_table_exists, - ensure_indices, ) -from app.utils.cache import ( - is_cached, - clear_cache, - list_cached_items, +from app.services.data.schema_service import get_schema_service +from app.services.data.connection_pool import get_connection_pool +from app.services.db_helper import ( + get_object_db_path, + ensure_table_accessible, ) +from app.utils.async_utils import run_sync_in_thread +from app.utils.request_utils import TableRequestProcessor from app.config import settings +from app.config_constants import MAX_LIMIT, DEFAULT_LIMIT # Configure module logger logger = logging.getLogger(__name__) @@ -97,256 +105,46 @@ async def root(): ) -# ============================================================================= -# HANDLE-BASED ENDPOINTS (Primary REST API per diagram) -# /{handle_ref}/tables - List tables -# /{handle_ref}/tables/{table}/schema - Table schema -# /{handle_ref}/tables/{table}/data - Table data with pagination -# ============================================================================= - -@router.get("/handle/{handle_ref}/tables", tags=["Handle Access"], response_model=TableListResponse) -async def list_tables_by_handle( - handle_ref: str, - kb_env: str = Query("appdev", description="KBase environment"), - authorization: str | None = Header(None) -): +@router.get("/health", response_model=HealthResponse, tags=["General"]) +async def health_check(): """ - List all tables in a SQLite database accessed via handle reference. + Health check endpoint for DataTables Viewer API. - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/handle/KBH_248028/tables" - ``` - """ - try: - token = get_auth_token(authorization) - cache_dir = get_cache_dir() - - # Download SQLite from handle - client = KBaseClient(token, kb_env, cache_dir) - - # Cache path based on handle - safe_handle = handle_ref.replace(":", "_").replace("/", "_") - db_dir = cache_dir / "handles" - db_dir.mkdir(parents=True, exist_ok=True) - db_path = db_dir / f"{safe_handle}.db" - - # Atomic download to prevent race conditions - if not db_path.exists(): - temp_path = db_path.with_suffix(f".{uuid4().hex}.tmp") - try: - client.download_blob_file(handle_ref, temp_path) - temp_path.rename(db_path) - except Exception: - temp_path.unlink(missing_ok=True) - raise - - # List tables - table_names = list_tables(db_path) - tables = [] - for name in table_names: - try: - columns = get_table_columns(db_path, name) - row_count = get_table_row_count(db_path, name) - tables.append({ - "name": name, - "row_count": row_count, - "column_count": len(columns) - }) - except Exception as e: - logger.warning("Error getting table info for %s", name, exc_info=True) - tables.append({"name": name}) - - return { - "handle_ref": handle_ref, - "tables": tables, - "db_path": str(db_path) - } - - except Exception as e: - logger.error(f"Error listing tables from handle: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/handle/{handle_ref}/tables/{table_name}/schema", tags=["Handle Access"], response_model=TableSchemaResponse) -async def get_table_schema_by_handle( - handle_ref: str, - table_name: str, - kb_env: str = Query("appdev"), - authorization: str | None = Header(None) -): + Returns service status, cache information, and connection pool stats. """ - Get schema (columns) for a table accessed via handle reference. - - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/handle/KBH_248028/tables/Genes/schema" - ``` - """ - try: - token = get_auth_token(authorization) - cache_dir = get_cache_dir() - - client = KBaseClient(token, kb_env, cache_dir) - - safe_handle = handle_ref.replace(":", "_").replace("/", "_") - db_dir = cache_dir / "handles" - db_dir.mkdir(parents=True, exist_ok=True) - db_path = db_dir / f"{safe_handle}.db" - - if not db_path.exists(): - temp_path = db_path.with_suffix(f".{uuid4().hex}.tmp") - try: - client.download_blob_file(handle_ref, temp_path) - temp_path.rename(db_path) - except Exception: - temp_path.unlink(missing_ok=True) - raise - - if not validate_table_exists(db_path, table_name): - available = list_tables(db_path) - raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") - - columns = get_table_columns(db_path, table_name) - row_count = get_table_row_count(db_path, table_name) - - return { - "handle_ref": handle_ref, - "table_name": table_name, - "columns": columns, - "row_count": row_count - } - - except HTTPException: - raise - except Exception as e: - logger.error(f"Error getting schema: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.get("/handle/{handle_ref}/tables/{table_name}/data", tags=["Handle Access"], response_model=TableDataResponse) -async def get_table_data_by_handle( - handle_ref: str, - table_name: str, - limit: int = Query(100, ge=1, le=500000), - offset: int = Query(0, ge=0), - sort_column: str | None = Query(None), - sort_order: str | None = Query("ASC"), - search: str | None = Query(None, description="Global search term"), - kb_env: str = Query("appdev"), - authorization: str | None = Header(None) -): - """ - Query table data from SQLite via handle reference. - - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/handle/KBH_248028/tables/Genes/data?limit=5" - ``` - """ - start_time = time.time() try: - token = get_auth_token(authorization) - cache_dir = get_cache_dir() - - client = KBaseClient(token, kb_env, cache_dir) - - safe_handle = handle_ref.replace(":", "_").replace("/", "_") - db_dir = cache_dir / "handles" - db_dir.mkdir(parents=True, exist_ok=True) - db_path = db_dir / f"{safe_handle}.db" - - if not db_path.exists(): - temp_path = db_path.with_suffix(f".{uuid4().hex}.tmp") - try: - client.download_blob_file(handle_ref, temp_path) - temp_path.rename(db_path) - except Exception: - temp_path.unlink(missing_ok=True) - raise - - if not validate_table_exists(db_path, table_name): - available = list_tables(db_path) - raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") - - # Query data - headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( - sqlite_file=db_path, - table_name=table_name, - limit=limit, - offset=offset, - sort_column=sort_column, - sort_order=sort_order, - search_value=search, + # Get connection pool stats (non-blocking) + try: + pool = get_connection_pool() + cache_stats = pool.get_stats() + except Exception as pool_error: + logger.warning(f"Error getting pool stats: {pool_error}") + cache_stats = {"total_connections": 0, "connections": []} + + return HealthResponse( + status="ok", + timestamp=datetime.utcnow().isoformat() + "Z", + mode="cached_sqlite", + data_dir=str(settings.CACHE_DIR), + config_dir=str(Path(settings.CACHE_DIR) / "configs"), + cache={ + "databases_cached": cache_stats.get("total_connections", 0), + "databases": cache_stats.get("connections", []) + } ) - - response_time_ms = (time.time() - start_time) * 1000 - - return { - "handle_ref": handle_ref, - "table_name": table_name, - "headers": headers, - "data": data, - "row_count": len(data), - "total_count": total_count, - "filtered_count": filtered_count, - "response_time_ms": response_time_ms, - "db_query_ms": db_query_ms - } - - except HTTPException: - raise except Exception as e: - logger.error(f"Error querying data: {e}") + logger.error(f"Error in health check: {e}") raise HTTPException(status_code=500, detail=str(e)) # ============================================================================= # OBJECT-BASED ENDPOINTS (via KBase workspace object reference) -# /object/{ws_ref}/pangenomes - List pangenomes from BERDLTables object -# /object/{ws_ref}/pangenomes/{pg_id}/tables - List tables for a pangenome -# /object/{ws_ref}/pangenomes/{pg_id}/tables/{table}/data - Query data +# /object/{ws_ref}/tables - List tables from KBase object +# /object/{ws_ref}/tables/{table}/data - Query data # ============================================================================= -@router.get("/object/{ws_ref:path}/pangenomes", tags=["Object Access"], response_model=PangenomesResponse) -async def list_pangenomes_by_object( - ws_ref: str, - kb_env: str = Query("appdev"), - authorization: str | None = Header(None) -): - """ - List pangenomes from a BERDLTables/GenomeDataLakeTables object. - - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/object/76990/7/2/pangenomes" - ``` - """ - try: - token = get_auth_token(authorization) - berdl_table_id = ws_ref - - pangenomes = list_pangenomes_from_object( - berdl_table_id=berdl_table_id, - auth_token=token, - kb_env=kb_env - ) - - return { - "berdl_table_id": berdl_table_id, - "pangenomes": pangenomes - } - - except Exception as e: - logger.error(f"Error listing pangenomes: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - @router.get("/object/{ws_ref:path}/tables", tags=["Object Access"], response_model=TableListResponse) async def list_tables_by_object( ws_ref: str, @@ -355,44 +153,105 @@ async def list_tables_by_object( ): """ List tables for a BERDLTables object. - - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/object/76990/7/2/tables" - ``` """ + + try: token = get_auth_token(authorization) cache_dir = get_cache_dir() berdl_table_id = ws_ref - db_path = download_pangenome_db( - berdl_table_id=berdl_table_id, - auth_token=token, - cache_dir=cache_dir, - kb_env=kb_env - ) + # Get database path (handles caching, download timeouts via helper) + db_path = await get_object_db_path(berdl_table_id, token, kb_env, cache_dir) + + # List tables (run in thread) + table_names = await run_sync_in_thread(list_tables, db_path) - table_names = list_tables(db_path) tables = [] + schemas = {} + total_rows = 0 + + # Use schema service for better column type information + schema_service = get_schema_service() + + # Process tables for name in table_names: try: - columns = get_table_columns(db_path, name) - row_count = get_table_row_count(db_path, name) + # Run lightweight checks in thread + columns = await run_sync_in_thread(get_table_columns, db_path, name) + row_count = await run_sync_in_thread(get_table_row_count, db_path, name) + + # Get display name (use table name as default) + display_name = name.replace("_", " ").title() + tables.append({ "name": name, + "displayName": display_name, "row_count": row_count, "column_count": len(columns) }) - except Exception as e: + total_rows += row_count or 0 + + # Build schema map with actual types + try: + table_schema = await run_sync_in_thread( + schema_service.get_table_schema, db_path, name + ) + schemas[name] = { + col["name"]: col["type"] + for col in table_schema["columns"] + } + except Exception: + # Fallback to default type + schemas[name] = {col: "TEXT" for col in columns} + except Exception: logger.warning("Error getting table info for %s", name, exc_info=True) - tables.append({"name": name}) + tables.append({"name": name, "displayName": name}) + + # Get object type (non-blocking) + try: + # Use specific timeout for API call + object_type = await asyncio.wait_for( + run_sync_in_thread(get_object_type, berdl_table_id, token, kb_env), + timeout=settings.KBASE_API_TIMEOUT_SECONDS + ) + except (asyncio.TimeoutError, Exception) as e: + logger.warning(f"Could not get object type (non-critical): {e}") + object_type = None + + # Config-related fields (deprecated, kept for backward compatibility) + config_fingerprint = None + config_url = None + has_cached_config = False + has_builtin_config = False + builtin_config_id = None + + # Get database size + database_size = None + try: + database_size = db_path.stat().st_size if db_path.exists() else None + except Exception: + pass + + # Format berdl_table_id for DataTables Viewer API (local/db_name format) + berdl_table_id_formatted = f"local/{berdl_table_id.replace('/', '_')}" return { - "berdl_table_id": berdl_table_id, + "berdl_table_id": berdl_table_id_formatted, + "object_type": object_type or "LocalDatabase", "tables": tables, - "source": "Cache" if (db_path.exists() and db_path.stat().st_size > 0) else "Downloaded" + "source": "Local", + "has_config": has_cached_config, + "config_source": "static" if has_cached_config else None, + "config_fingerprint": config_fingerprint, + "config_url": config_url, + "has_cached_config": has_cached_config, + "schemas": schemas, + "has_builtin_config": has_builtin_config, + "builtin_config_id": builtin_config_id, + "database_size_bytes": database_size, + "total_rows": total_rows, + "api_version": "2.0", } except Exception as e: @@ -404,7 +263,7 @@ async def list_tables_by_object( async def get_table_data_by_object( ws_ref: str, table_name: str, - limit: int = Query(100, ge=1, le=500000), + limit: int = Query(DEFAULT_LIMIT, ge=1, le=MAX_LIMIT), offset: int = Query(0, ge=0), sort_column: str | None = Query(None), sort_order: str | None = Query("ASC"), @@ -414,55 +273,28 @@ async def get_table_data_by_object( ): """ Query table data from a BERDLTables object. - - **Example:** - ```bash - curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/object/76990/7/2/tables/Genes/data?limit=5" - ``` """ - start_time = time.time() - try: token = get_auth_token(authorization) cache_dir = get_cache_dir() berdl_table_id = ws_ref - db_path = download_pangenome_db( - berdl_table_id=berdl_table_id, - auth_token=token, - cache_dir=cache_dir, - kb_env=kb_env - ) - - if not validate_table_exists(db_path, table_name): - available = list_tables(db_path) - raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") + # Get and validate DB access + db_path = await get_object_db_path(berdl_table_id, token, kb_env, cache_dir) + await ensure_table_accessible(db_path, table_name) - headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( - sqlite_file=db_path, + result = await TableRequestProcessor.process_data_request( + db_path=db_path, table_name=table_name, limit=limit, offset=offset, sort_column=sort_column, - sort_order=sort_order, + sort_order=sort_order or "ASC", search_value=search, + handle_ref_or_id=berdl_table_id ) - - response_time_ms = (time.time() - start_time) * 1000 - - return { - "berdl_table_id": berdl_table_id, - "table_name": table_name, - "headers": headers, - "data": data, - "row_count": len(data), - "total_count": total_count, - "filtered_count": filtered_count, - "response_time_ms": response_time_ms, - "db_query_ms": db_query_ms, - "sqlite_file": str(db_path) - } + + return result except HTTPException: raise @@ -472,112 +304,24 @@ async def get_table_data_by_object( # ============================================================================= -# LEGACY ENDPOINTS (for backwards compatibility) +# DATA ACCESS ENDPOINTS # ============================================================================= -@router.get("/pangenomes", response_model=PangenomesResponse, tags=["Legacy"]) -async def get_pangenomes( - berdl_table_id: str = Query(..., description="BERDLTables object reference"), - kb_env: str = Query("appdev"), - authorization: str | None = Header(None) -): - """ - List pangenomes from BERDLTables object. - - Returns: - - pangenomes: List of pangenome info - - pangenome_count: Total number of pangenomes - """ - try: - token = get_auth_token(authorization) - - # Support comma-separated list of IDs - berdl_ids = [bid.strip() for bid in berdl_table_id.split(",") if bid.strip()] - - all_pangenomes: list[dict] = [] - - for bid in berdl_ids: - try: - pangenomes = list_pangenomes_from_object(bid, token, kb_env) - # Tag each pangenome with its source ID - for pg in pangenomes: - pg["source_berdl_id"] = bid - all_pangenomes.extend(pangenomes) - except Exception as e: - logger.error(f"Error fetching pangenomes for {bid}: {e}") - # Continue fetching others even if one fails - continue - - pangenome_list = [PangenomeInfo(**pg) for pg in all_pangenomes] - - return PangenomesResponse( - pangenomes=pangenome_list, - pangenome_count=len(pangenome_list) - ) - except Exception as e: - logger.error(f"Error in get_pangenomes: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/tables", response_model=TableListResponse, tags=["Legacy"]) -async def get_tables( - berdl_table_id: str = Query(..., description="BERDLTables object reference"), - kb_env: str = Query("appdev"), - authorization: str | None = Header(None) -): - """List tables for a BERDLTable object (auto-resolves pangenome).""" - try: - token = get_auth_token(authorization) - cache_dir = get_cache_dir() - - db_path = download_pangenome_db(berdl_table_id, token, cache_dir, kb_env) - table_names = list_tables(db_path) - - tables = [] - for name in table_names: - try: - columns = get_table_columns(db_path, name) - row_count = get_table_row_count(db_path, name) - tables.append(TableInfo(name=name, row_count=row_count, column_count=len(columns))) - except Exception: - tables.append(TableInfo(name=name)) - - return TableListResponse(tables=tables) - except Exception as e: - logger.error(f"Error listing tables: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/table-data", response_model=TableDataResponse, tags=["Legacy"]) +@router.post("/table-data", response_model=TableDataResponse, tags=["Data Access"]) async def query_table_data( request: TableDataRequest, authorization: str | None = Header(None) ): """ Query table data using a JSON body. Recommended for programmatic access. - - **Example:** - ```bash - curl -X POST -H "Authorization: $KB_TOKEN" -H "Content-Type: application/json" \ - -d '{ - "berdl_table_id": "76990/7/2", - "table_name": "Metadata_Conditions", - "limit": 5" - }' \ - "https://appdev.kbase.us/services/berdl_table_scanner/table-data" - ``` """ - start_time = time.time() - try: token = get_auth_token(authorization) cache_dir = get_cache_dir() kb_env = getattr(request, 'kb_env', 'appdev') or 'appdev' - # Determine filters (support both query_filters and col_filter) filters = request.col_filter if request.col_filter else request.query_filters - # Download (or get cached) DB - auto-resolves ID if None try: db_path = download_pangenome_db( request.berdl_table_id, token, cache_dir, kb_env @@ -588,69 +332,35 @@ async def query_table_data( if not validate_table_exists(db_path, request.table_name): available = list_tables(db_path) raise ValueError(f"Table '{request.table_name}' not found. Available: {available}") - - try: - ensure_indices(db_path, request.table_name) - except: - pass - - headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( - sqlite_file=db_path, + + columns_list = None + if request.columns and request.columns != "all": + columns_list = [c.strip() for c in request.columns.split(",") if c.strip()] + + effective_sort_col = request.sort_column + effective_sort_dir = request.sort_order + + if not effective_sort_col and request.order_by: + first_sort = request.order_by[0] + effective_sort_col = first_sort.get("column") + effective_sort_dir = first_sort.get("direction", "ASC").upper() + + return await TableRequestProcessor.process_data_request( + db_path=db_path, table_name=request.table_name, limit=request.limit, offset=request.offset, - sort_column=request.sort_column, - sort_order=request.sort_order, + sort_column=effective_sort_col, + sort_order=effective_sort_dir or "ASC", search_value=request.search_value, - query_filters=filters, - columns=request.columns, - order_by=request.order_by - ) - - response_time_ms = (time.time() - start_time) * 1000 - - return TableDataResponse( - headers=headers, - data=data, - row_count=len(data), - total_count=total_count, - filtered_count=filtered_count, - table_name=request.table_name, - response_time_ms=response_time_ms, - db_query_ms=db_query_ms, - conversion_ms=conversion_ms, - source="Cache" if is_cached(db_path) else "Downloaded", - cache_file=str(db_path), - sqlite_file=str(db_path) + columns=columns_list, + filters=filters, + handle_ref_or_id=request.berdl_table_id ) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) + except HTTPException: + raise except Exception as e: - logger.error(f"Error querying table data: {e}") + logger.error(f"Error querying data: {e}") raise HTTPException(status_code=500, detail=str(e)) - -# ============================================================================= -# CACHE MANAGEMENT -# ============================================================================= - -@router.post("/clear-cache", response_model=CacheResponse, tags=["Cache Management"]) -async def clear_pangenome_cache( - berdl_table_id: str | None = Query(None) -): - """Clear cached databases.""" - try: - cache_dir = get_cache_dir() - result = clear_cache(cache_dir, berdl_table_id) - return CacheResponse(status="success", message=result.get("message", "Cache cleared")) - except Exception as e: - return CacheResponse(status="error", message=str(e)) - - -@router.get("/cache", tags=["Cache Management"]) -async def list_cache(): - """List cached items.""" - cache_dir = get_cache_dir() - items = list_cached_items(cache_dir) - return {"cache_dir": str(cache_dir), "items": items, "total": len(items)} diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..c05a668 --- /dev/null +++ b/app/services/__init__.py @@ -0,0 +1,25 @@ +""" +TableScanner Services Package. + +This package contains data query and schema analysis services. + +Modules: + - connection_pool: Database connection pooling and management + - query_service: Enhanced query execution with type-aware filtering + - schema_service: Schema information retrieval + - statistics_service: Column statistics computation + - schema_analyzer: Database schema introspection and profiling + - fingerprint: Database fingerprinting for caching +""" + +from .data.schema_analyzer import SchemaAnalyzer, ColumnProfile, TableProfile +from .data.fingerprint import DatabaseFingerprint + +__all__ = [ + # Schema analysis + "SchemaAnalyzer", + "ColumnProfile", + "TableProfile", + # Fingerprinting + "DatabaseFingerprint", +] diff --git a/app/services/data/__init__.py b/app/services/data/__init__.py new file mode 100644 index 0000000..dd828f1 --- /dev/null +++ b/app/services/data/__init__.py @@ -0,0 +1,19 @@ +""" +Data Analysis Services. + +Schema analysis, fingerprinting, and validation. +""" + +from .schema_analyzer import SchemaAnalyzer +from .fingerprint import DatabaseFingerprint +from .type_inference import TypeInferenceEngine, InferredType, DataType +from .validation import validate_config + +__all__ = [ + "SchemaAnalyzer", + "DatabaseFingerprint", + "TypeInferenceEngine", + "InferredType", + "DataType", + "validate_config", +] diff --git a/app/services/data/connection_pool.py b/app/services/data/connection_pool.py new file mode 100644 index 0000000..4bfc0f4 --- /dev/null +++ b/app/services/data/connection_pool.py @@ -0,0 +1,268 @@ +""" +Database Connection Pool Manager. + +Manages a pool of SQLite database connections with: +- Automatic lifecycle management (30-minute inactivity timeout) +- Connection reuse for performance +- SQLite performance optimizations (WAL mode, cache size, etc.) +- Prepared statement caching +- Automatic cleanup of expired connections +""" + +from __future__ import annotations + +import sqlite3 +import logging +import threading +import time +from pathlib import Path +from typing import Any +from collections import OrderedDict +from dataclasses import dataclass, field + +logger = logging.getLogger(__name__) + + +@dataclass +class ConnectionInfo: + """Information about a cached database connection.""" + + connection: sqlite3.Connection + db_path: Path + last_access: float = field(default_factory=time.time) + access_count: int = 0 + file_mtime: float = 0.0 + prepared_statements: dict[str, sqlite3.Cursor] = field(default_factory=dict) + + def touch(self) -> None: + """Update last access time and increment access count.""" + self.last_access = time.time() + self.access_count += 1 + + +class ConnectionPool: + """ + Manages a pool of SQLite database connections. + + Features: + - Opens databases on first access + - Caches connections in memory + - Tracks last access time and access count + - Automatically closes databases after 30 minutes of inactivity + - Cleans up expired connections every 5 minutes + - Reloads database if file modification time changes + - Applies SQLite performance optimizations + - Caches prepared statements for reuse + """ + + # Connection timeout: 30 minutes of inactivity + CONNECTION_TIMEOUT_SECONDS = 30 * 60 + + # Cleanup interval: run cleanup every 5 minutes + CLEANUP_INTERVAL_SECONDS = 5 * 60 + + def __init__(self) -> None: + """Initialize the connection pool.""" + self._connections: dict[str, ConnectionInfo] = OrderedDict() + self._lock = threading.RLock() + self._last_cleanup = time.time() + + logger.info("Initialized SQLite connection pool") + + def get_connection(self, db_path: Path) -> sqlite3.Connection: + """ + Get a connection to a SQLite database. + + Opens the database if not already cached, or returns existing connection. + Automatically applies performance optimizations and checks for file changes. + + Args: + db_path: Path to the SQLite database file + + Returns: + SQLite connection object + + Raises: + sqlite3.Error: If database cannot be opened + """ + db_key = str(db_path.absolute()) + + with self._lock: + # Check if connection exists and is still valid + if db_key in self._connections: + conn_info = self._connections[db_key] + + # Check if file has been modified + try: + current_mtime = db_path.stat().st_mtime + if current_mtime != conn_info.file_mtime: + logger.info(f"Database file modified, reloading: {db_path}") + self._close_connection(db_key, conn_info) + # Will create new connection below + else: + # Connection is valid, update access time + conn_info.touch() + # Move to end (LRU) + self._connections.move_to_end(db_key) + return conn_info.connection + except OSError: + # File no longer exists, remove connection + logger.warning(f"Database file no longer exists: {db_path}") + self._close_connection(db_key, conn_info) + del self._connections[db_key] + + # Create new connection + logger.debug(f"Opening new database connection: {db_path}") + conn = sqlite3.connect(str(db_path), check_same_thread=False) + conn.row_factory = sqlite3.Row + + # Apply performance optimizations + self._optimize_connection(conn) + + # Store connection info + try: + file_mtime = db_path.stat().st_mtime + except OSError: + file_mtime = 0.0 + + conn_info = ConnectionInfo( + connection=conn, + db_path=db_path, + file_mtime=file_mtime + ) + conn_info.touch() + + self._connections[db_key] = conn_info + + # Run cleanup if needed + self._maybe_cleanup() + + return conn + + def _optimize_connection(self, conn: sqlite3.Connection) -> None: + """ + Apply SQLite performance optimizations. + + Sets pragmas for better performance: + - journal_mode=WAL: Write-Ahead Logging for better concurrency + - synchronous=NORMAL: Balance between safety and performance + - cache_size=-64000: 64MB cache (negative = KB) + - temp_store=MEMORY: Store temporary tables in memory + - mmap_size=268435456: 256MB memory-mapped I/O + """ + try: + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") + conn.execute("PRAGMA cache_size=-64000") + conn.execute("PRAGMA temp_store=MEMORY") + conn.execute("PRAGMA mmap_size=268435456") + logger.debug("Applied SQLite performance optimizations") + except sqlite3.Error as e: + logger.warning(f"Failed to apply some SQLite optimizations: {e}") + + def _close_connection(self, db_key: str, conn_info: ConnectionInfo) -> None: + """Close a connection and clean up resources.""" + try: + # Close prepared statements + for stmt in conn_info.prepared_statements.values(): + try: + stmt.close() + except Exception: + pass + + # Close connection + conn_info.connection.close() + logger.debug(f"Closed database connection: {conn_info.db_path}") + except Exception as e: + logger.warning(f"Error closing connection: {e}") + + def _maybe_cleanup(self) -> None: + """Run cleanup if enough time has passed.""" + now = time.time() + if now - self._last_cleanup < self.CLEANUP_INTERVAL_SECONDS: + return + + self._last_cleanup = now + self.cleanup_expired() + + def cleanup_expired(self) -> None: + """ + Close and remove connections that have been inactive for too long. + + Connections are closed if they haven't been accessed in the last + 30 minutes (CONNECTION_TIMEOUT_SECONDS). + """ + now = time.time() + expired_keys = [] + + with self._lock: + for db_key, conn_info in list(self._connections.items()): + age = now - conn_info.last_access + if age > self.CONNECTION_TIMEOUT_SECONDS: + expired_keys.append((db_key, conn_info)) + + for db_key, conn_info in expired_keys: + logger.info( + f"Closing expired connection (inactive {age:.0f}s): {conn_info.db_path}" + ) + self._close_connection(db_key, conn_info) + del self._connections[db_key] + + if expired_keys: + logger.info(f"Cleaned up {len(expired_keys)} expired connections") + + def close_all(self) -> None: + """Close all connections in the pool.""" + with self._lock: + for db_key, conn_info in list(self._connections.items()): + self._close_connection(db_key, conn_info) + self._connections.clear() + + logger.info("Closed all database connections") + + def get_stats(self) -> dict[str, Any]: + """ + Get statistics about the connection pool. + + Returns: + Dictionary with pool statistics + """ + with self._lock: + now = time.time() + connections = [] + + for db_key, conn_info in self._connections.items(): + age = now - conn_info.last_access + connections.append({ + "db_path": str(conn_info.db_path), + "last_access_seconds_ago": age, + "access_count": conn_info.access_count, + "prepared_statements": len(conn_info.prepared_statements) + }) + + return { + "total_connections": len(self._connections), + "connections": connections + } + + +# Global connection pool instance +_global_pool: ConnectionPool | None = None +_pool_lock = threading.Lock() + + +def get_connection_pool() -> ConnectionPool: + """ + Get the global connection pool instance. + + Returns: + Global ConnectionPool instance + """ + global _global_pool + + if _global_pool is None: + with _pool_lock: + if _global_pool is None: + _global_pool = ConnectionPool() + + return _global_pool diff --git a/app/services/data/fingerprint.py b/app/services/data/fingerprint.py new file mode 100644 index 0000000..04051ed --- /dev/null +++ b/app/services/data/fingerprint.py @@ -0,0 +1,231 @@ +""" +Database Fingerprinting. + +Creates unique fingerprints from database schema structure for cache +invalidation. Fingerprints are based on schema characteristics, not data, +to enable efficient caching of generated configs. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .schema_analyzer import SchemaAnalyzer, TableProfile + +logger = logging.getLogger(__name__) + + +class DatabaseFingerprint: + """ + Creates unique fingerprints from database schema structure. + + The fingerprint is based on: + - Table names (sorted) + - Column names and types for each table + - Row counts (optional, for change detection) + + This allows caching generated configs and detecting when + a database schema has changed. + """ + + def __init__(self, config_dir: str | Path | None = None) -> None: + """ + Initialize fingerprinting service. + + Args: + config_dir: Directory for storing cached configs + """ + default_dir = os.getenv("GENERATED_CONFIG_DIR", "/tmp/tablescanner_configs") + self.config_dir = Path(config_dir or default_dir) + self.config_dir.mkdir(parents=True, exist_ok=True) + + def compute(self, db_path: Path, include_row_counts: bool = False) -> str: + """ + Compute fingerprint for a database. + + Args: + db_path: Path to the SQLite database + include_row_counts: Whether to include row counts in fingerprint + (makes fingerprint change when data changes) + + Returns: + SHA256 hex string (first 16 characters) + """ + analyzer = SchemaAnalyzer(sample_size=0) # No samples needed + profiles = analyzer.analyze_database(db_path) + + return self.compute_from_profiles(profiles, include_row_counts) + + def compute_from_profiles( + self, + profiles: list[TableProfile], + include_row_counts: bool = False + ) -> str: + """ + Compute fingerprint from table profiles. + + Args: + profiles: List of TableProfile objects + include_row_counts: Whether to include row counts + + Returns: + SHA256 hex string (first 16 characters) + """ + # Build deterministic schema representation + schema_data: list[dict[str, Any]] = [] + + for table in sorted(profiles, key=lambda t: t.name): + table_data: dict[str, Any] = { + "name": table.name, + "columns": [ + {"name": col.name, "type": col.sqlite_type} + for col in sorted(table.columns, key=lambda c: c.name) + ], + } + if include_row_counts: + table_data["row_count"] = table.row_count + + schema_data.append(table_data) + + # Create deterministic JSON string + schema_json = json.dumps(schema_data, sort_keys=True, separators=(",", ":")) + + # Compute SHA256 hash + hash_bytes = hashlib.sha256(schema_json.encode()).hexdigest() + + # Return first 16 characters for reasonable uniqueness + readability + return hash_bytes[:16] + + def compute_for_handle(self, handle_ref: str, db_path: Path) -> str: + """ + Compute fingerprint incorporating handle reference. + + This creates a unique ID that includes both the source + handle and the schema structure. + + Args: + handle_ref: The KBase handle reference + db_path: Path to the SQLite database + + Returns: + Combined fingerprint string + """ + schema_fp = self.compute(db_path) + # Sanitize handle ref for use in filenames + safe_handle = handle_ref.replace("/", "_").replace(":", "_") + return f"{safe_handle}_{schema_fp}" + + # ─── Cache Management ─────────────────────────────────────────────────── + + def is_cached(self, fingerprint: str) -> bool: + """Check if a config is cached for this fingerprint.""" + config_path = self._get_cache_path(fingerprint) + return config_path.exists() + + def get_cached_config(self, fingerprint: str) -> dict | None: + """ + Retrieve cached config for a fingerprint. + + Args: + fingerprint: Database fingerprint + + Returns: + Cached config dict or None if not found + """ + config_path = self._get_cache_path(fingerprint) + + if not config_path.exists(): + return None + + try: + with open(config_path, "r") as f: + return json.load(f) + except (json.JSONDecodeError, OSError) as e: + logger.warning(f"Failed to load cached config {fingerprint}: {e}") + return None + + def cache_config(self, fingerprint: str, config: dict) -> Path: + """ + Cache a generated config. + + Args: + fingerprint: Database fingerprint + config: Generated config to cache + + Returns: + Path to the cached config file + """ + config_path = self._get_cache_path(fingerprint) + + # Add metadata + config_with_meta = { + "_fingerprint": fingerprint, + "_cached_at": self._get_timestamp(), + **config, + } + + with open(config_path, "w") as f: + json.dump(config_with_meta, f, indent=2) + + logger.info(f"Cached config to {config_path}") + return config_path + + def clear_cache(self, fingerprint: str | None = None) -> int: + """ + Clear cached configs. + + Args: + fingerprint: Specific fingerprint to clear, or None for all + + Returns: + Number of configs cleared + """ + if fingerprint: + config_path = self._get_cache_path(fingerprint) + if config_path.exists(): + config_path.unlink() + return 1 + return 0 + + # Clear all + count = 0 + for config_file in self.config_dir.glob("*.json"): + config_file.unlink() + count += 1 + return count + + def list_cached(self) -> list[dict[str, Any]]: + """List all cached configs with metadata.""" + cached: list[dict[str, Any]] = [] + + for config_file in self.config_dir.glob("*.json"): + try: + with open(config_file, "r") as f: + config = json.load(f) + cached.append({ + "fingerprint": config.get("_fingerprint", config_file.stem), + "cached_at": config.get("_cached_at"), + "id": config.get("id"), + "name": config.get("name"), + "path": str(config_file), + }) + except (json.JSONDecodeError, OSError): + continue + + return cached + + # ─── Private Methods ──────────────────────────────────────────────────── + + def _get_cache_path(self, fingerprint: str) -> Path: + """Get cache file path for a fingerprint.""" + return self.config_dir / f"{fingerprint}.json" + + def _get_timestamp(self) -> str: + """Get current ISO timestamp.""" + return datetime.now(timezone.utc).isoformat() diff --git a/app/services/data/query_service.py b/app/services/data/query_service.py new file mode 100644 index 0000000..6efba77 --- /dev/null +++ b/app/services/data/query_service.py @@ -0,0 +1,621 @@ +""" +Enhanced Query Service for DataTables Viewer API. + +Provides comprehensive query execution with: +- Type-aware filtering with proper numeric conversion +- Advanced filter operators (eq, ne, gt, gte, lt, lte, like, ilike, in, not_in, between, is_null, is_not_null) +- Aggregations with GROUP BY +- Full-text search (FTS5) +- Automatic indexing +- Query result caching +- Comprehensive metadata in responses +""" + +from __future__ import annotations + +import sqlite3 +import logging +import time +import hashlib +import json +import threading +from pathlib import Path +from typing import Any, Literal +from collections import OrderedDict +from dataclasses import dataclass + +from app.services.data.connection_pool import get_connection_pool +from app.config_constants import ( + CACHE_TTL_SECONDS, + CACHE_MAX_ENTRIES, + INDEX_CACHE_TTL +) +from app.exceptions import ( + TableNotFoundError, + ColumnNotFoundError, + InvalidFilterError +) + +logger = logging.getLogger(__name__) + + +@dataclass +class FilterSpec: + """Filter specification for query building.""" + + column: str + operator: str + value: Any = None + value2: Any = None # For 'between' operator + + +@dataclass +class AggregationSpec: + """Aggregation specification for query building.""" + + column: str + function: str # count, sum, avg, min, max, stddev, variance, distinct_count + alias: str | None = None + + +@dataclass +class ColumnType: + """Column type information from schema.""" + + name: str + type: str # INTEGER, REAL, TEXT, etc. + notnull: bool = False + pk: bool = False + dflt_value: Any = None + + +class QueryCache: + """ + Query result cache with 5-minute TTL and LRU eviction. + + Cache key format: {dbPath}:{tableName}:{JSON.stringify(queryParams)} + Invalidates when table modification time changes. + """ + + def __init__(self) -> None: + """Initialize the query cache.""" + self._cache: OrderedDict[str, tuple[Any, float]] = OrderedDict() + self._lock = threading.Lock() + + def get(self, cache_key: str, table_mtime: float) -> Any | None: + """ + Get cached query result. + + Args: + cache_key: Cache key for the query + table_mtime: Table file modification time + + Returns: + Cached result if valid, None otherwise + """ + with self._lock: + if cache_key not in self._cache: + return None + + result, cached_mtime = self._cache[cache_key] + + # Check if table has been modified + if cached_mtime != table_mtime: + del self._cache[cache_key] + return None + + # Move to end (LRU) + self._cache.move_to_end(cache_key) + return result + + def set(self, cache_key: str, result: Any, table_mtime: float) -> None: + """ + Store query result in cache. + + Args: + cache_key: Cache key for the query + result: Query result to cache + table_mtime: Table file modification time + """ + with self._lock: + # Evict oldest if at capacity + if len(self._cache) >= CACHE_MAX_ENTRIES: + self._cache.popitem(last=False) + + self._cache[cache_key] = (result, table_mtime) + # Move to end (LRU) + self._cache.move_to_end(cache_key) + + def clear(self) -> None: + """Clear all cached results.""" + with self._lock: + self._cache.clear() + + +# Global query cache instance +_query_cache: QueryCache | None = None +_cache_lock = threading.Lock() + + +def get_query_cache() -> QueryCache: + """Get the global query cache instance.""" + global _query_cache + + if _query_cache is None: + with _cache_lock: + if _query_cache is None: + _query_cache = QueryCache() + + return _query_cache + + +class QueryService: + """ + Enhanced query service for DataTables Viewer API. + + Provides comprehensive query execution with type-aware filtering, + aggregations, full-text search, and result caching. + """ + + def __init__(self) -> None: + """Initialize the query service.""" + self.pool = get_connection_pool() + self.cache = get_query_cache() + # In-memory cache for index existence to avoid frequent sqlite_master queries + # Key: {db_path}:{table_name}:{column_name}, Value: timestamp + self._index_cache: dict[str, float] = {} + self._index_lock = threading.Lock() + + def get_column_types(self, db_path: Path, table_name: str) -> list[ColumnType]: + """ + Get column type information from table schema. + """ + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + try: + # Validate table existence + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,)) + if not cursor.fetchone(): + raise TableNotFoundError(table_name) + + cursor.execute(f"PRAGMA table_info(\"{table_name}\")") + rows = cursor.fetchall() + + column_types = [] + for row in rows: + # PRAGMA table_info returns: cid, name, type, notnull, dflt_value, pk + column_types.append(ColumnType( + name=row[1], + type=row[2] or "TEXT", # Default to TEXT if type is NULL + notnull=bool(row[3]), + pk=bool(row[5]), + dflt_value=row[4] + )) + + return column_types + + except sqlite3.Error as e: + logger.error(f"Error getting column types: {e}") + raise + + def is_numeric_column(self, column_type: str) -> bool: + """Check if a column type is numeric.""" + if not column_type: + return False + type_upper = column_type.upper() + return any(numeric_type in type_upper for numeric_type in ["INT", "REAL", "NUMERIC"]) + + def convert_numeric_value(self, value: Any, column_type: str) -> float | int: + """Convert a value to numeric type based on column type.""" + if value is None: + return 0 + + type_upper = column_type.upper() + + if "INT" in type_upper: + try: + return int(float(str(value))) + except (ValueError, TypeError): + return 0 + else: + try: + return float(str(value)) + except (ValueError, TypeError): + return 0.0 + + def ensure_index(self, db_path: Path, table_name: str, column: str) -> None: + """Ensure an index exists on a column. Optimized with in-memory cache.""" + cache_key = f"{db_path}:{table_name}:{column}" + + with self._index_lock: + # Check cache with TTL + if cache_key in self._index_cache: + if time.time() - self._index_cache[cache_key] < INDEX_CACHE_TTL: + return + + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + try: + index_name = f"idx_{table_name}_{column}".replace(" ", "_").replace("-", "_") + safe_table = f'"{table_name}"' + safe_column = f'"{column}"' + + cursor.execute( + f'CREATE INDEX IF NOT EXISTS "{index_name}" ON {safe_table}({safe_column})' + ) + conn.commit() + + with self._index_lock: + self._index_cache[cache_key] = time.time() + + except sqlite3.Error as e: + logger.warning(f"Error creating index on {table_name}.{column}: {e}") + + def ensure_fts5_table(self, db_path: Path, table_name: str, text_columns: list[str]) -> bool: + """Ensure FTS5 virtual table exists for full-text search.""" + if not text_columns: + return False + + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + try: + fts5_table_name = f"{table_name}_fts5" + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (fts5_table_name,)) + if cursor.fetchone(): + return True + + # Check capabilities + cursor.execute("PRAGMA compile_options") + if "ENABLE_FTS5" not in [row[0] for row in cursor.fetchall()]: + return False + + safe_columns = ", ".join(f'"{col}"' for col in text_columns) + cursor.execute(f""" + CREATE VIRTUAL TABLE IF NOT EXISTS "{fts5_table_name}" + USING fts5({safe_columns}, content="{table_name}", content_rowid="rowid") + """) + + # Populate + cursor.execute(f"PRAGMA table_info(\"{table_name}\")") + # If table has integer PK, use it as rowid implicitly + + cursor.execute(f""" + INSERT INTO "{fts5_table_name}"(rowid, {safe_columns}) + SELECT rowid, {safe_columns} FROM "{table_name}" + """) + + conn.commit() + return True + except sqlite3.Error: + return False + + def _build_select_clause( + self, + columns: list[str] | None, + aggregations: list[AggregationSpec] | None, + group_by: list[str] | None, + column_types: dict[str, ColumnType] + ) -> tuple[str, list[str]]: + """ + Build SELECT clause and return logic for headers. + + Returns: + Tuple of (select_sql, headers_list) + """ + select_parts = [] + headers = [] + + if aggregations: + # GROUP BY columns in SELECT + if group_by: + for col in group_by: + if col in column_types: + select_parts.append(f'"{col}"') + headers.append(col) + + # Aggregation columns + for agg in aggregations: + if agg.column != "*" and agg.column not in column_types: + continue + + safe_col = f'"{agg.column}"' if agg.column != "*" else "*" + + if agg.function == "count": + expr = f"COUNT({safe_col})" + elif agg.function == "distinct_count": + expr = f"COUNT(DISTINCT {safe_col})" + elif agg.function in ["sum", "avg", "min", "max"]: + expr = f"{agg.function.upper()}({safe_col})" + else: + continue + + alias = agg.alias or f"{agg.function}_{agg.column}" + safe_alias = alias.replace('"', '') + select_parts.append(f'{expr} AS "{safe_alias}"') + headers.append(safe_alias) + + if not select_parts: + select_parts = ["*"] + else: + # Regular columns + if columns: + valid_cols = [] + for col in columns: + if col in column_types: + valid_cols.append(f'"{col}"') + headers.append(col) + if valid_cols: + select_parts = valid_cols + else: + select_parts = ["*"] + # If columns were requested but none valid, we return all? + # Existing logic implies strict checking but fallback to * if empty list? + # The legacy logic: if columns list provided, only use valid ones. If none valid, maybe *? + # Let's assume if columns is empty list, we default to * + else: + select_parts = ["*"] + headers = list(column_types.keys()) + + return ", ".join(select_parts), headers + + def _build_where_clause( + self, + db_path: Path, + table_name: str, + filters: list[FilterSpec] | None, + search_value: str | None, + column_types_list: list[ColumnType], + column_types_map: dict[str, ColumnType], + params: list[Any] + ) -> str: + """Build WHERE clause including global search and field filters.""" + where_conditions = [] + + # Global Search + if search_value: + text_columns = [ + col.name for col in column_types_list + if not self.is_numeric_column(col.type) + ] + + if text_columns and self.ensure_fts5_table(db_path, table_name, text_columns): + fts5_table = f"{table_name}_fts5" + where_conditions.append( + f'rowid IN (SELECT rowid FROM "{fts5_table}" WHERE "{fts5_table}" MATCH ?)' + ) + params.append(search_value) + elif text_columns: + search_conditions = [] + for col in text_columns: + search_conditions.append(f'"{col}" LIKE ?') + params.append(f"%{search_value}%") + if search_conditions: + where_conditions.append(f"({' OR '.join(search_conditions)})") + + # Filters + if filters: + for filter_spec in filters: + condition = self._build_single_filter(filter_spec, column_types_map, params) + if condition: + where_conditions.append(condition) + + return f" WHERE {' AND '.join(where_conditions)}" if where_conditions else "" + + def _build_single_filter( + self, + filter_spec: FilterSpec, + column_types: dict[str, ColumnType], + params: list[Any] + ) -> str: + """Build SQL condition for a single filter.""" + column = filter_spec.column + operator = filter_spec.operator.lower() + value = filter_spec.value + + if column not in column_types: + logger.warning(f"Column '{column}' not found, skipping filter") + return "" + + col_type = column_types[column] + is_numeric = self.is_numeric_column(col_type.type) + safe_column = f'"{column}"' + + if operator == "is_null": + return f"{safe_column} IS NULL" + if operator == "is_not_null": + return f"{safe_column} IS NOT NULL" + + if value is None: + return "" + + # Numeric handling + if is_numeric and operator in ["eq", "ne", "gt", "gte", "lt", "lte", "between", "in", "not_in"]: + if operator == "between": + if filter_spec.value2 is None: return "" + params.append(self.convert_numeric_value(value, col_type.type)) + params.append(self.convert_numeric_value(filter_spec.value2, col_type.type)) + return f"{safe_column} BETWEEN ? AND ?" + elif operator in ["in", "not_in"]: + if not isinstance(value, list): return "" + vals = [self.convert_numeric_value(v, col_type.type) for v in value] + placeholders = ",".join(["?"] * len(vals)) + params.extend(vals) + op = "IN" if operator == "in" else "NOT IN" + return f"{safe_column} {op} ({placeholders})" + else: + params.append(self.convert_numeric_value(value, col_type.type)) + else: + # Text handling + if operator in ["like", "ilike"]: + params.append(f"%{value}%") + elif operator in ["in", "not_in"]: + if not isinstance(value, list): return "" + placeholders = ",".join(["?"] * len(value)) + params.extend(value) + op = "IN" if operator == "in" else "NOT IN" + return f"{safe_column} {op} ({placeholders})" + else: + params.append(value) + + operator_map = { + "eq": "=", "ne": "!=", "gt": ">", "gte": ">=", + "lt": "<", "lte": "<=", "like": "LIKE", "ilike": "LIKE" + } + + sql_op = operator_map.get(operator) + return f"{safe_column} {sql_op} ?" if sql_op else "" + + def execute_query( + self, + db_path: Path, + table_name: str, + limit: int = 100, + offset: int = 0, + columns: list[str] | None = None, + sort_column: str | None = None, + sort_order: str = "ASC", + search_value: str | None = None, + filters: list[FilterSpec] | None = None, + aggregations: list[AggregationSpec] | None = None, + group_by: list[str] | None = None, + use_cache: bool = True + ) -> dict[str, Any]: + """Execute a comprehensive query with all features.""" + try: + table_mtime = db_path.stat().st_mtime + except OSError: + table_mtime = 0.0 + + # 1. Cache Check + cache_key = self._build_cache_key( + db_path, table_name, limit, offset, columns, sort_column, + sort_order, search_value, filters, aggregations, group_by + ) + + if use_cache: + cached = self.cache.get(cache_key, table_mtime) + if cached: + cached["cached"] = True + return cached + + # 2. Schema & Validation + column_types_list = self.get_column_types(db_path, table_name) + column_types_map = {col.name: col for col in column_types_list} + + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + # 3. Indices + if filters: + for f in filters: + if f.column in column_types_map: + self.ensure_index(db_path, table_name, f.column) + if sort_column and sort_column in column_types_map: + self.ensure_index(db_path, table_name, sort_column) + + # 4. Query Construction + select_clause, headers = self._build_select_clause(columns, aggregations, group_by, column_types_map) + + where_params: list[Any] = [] + where_clause = self._build_where_clause( + db_path, table_name, filters, search_value, + column_types_list, column_types_map, where_params + ) + + group_by_clause = "" + if group_by: + valid_groups = [f'"{col}"' for col in group_by if col in column_types_map] + if valid_groups: + group_by_clause = " GROUP BY " + ", ".join(valid_groups) + + order_by_clause = "" + if sort_column and sort_column in column_types_map: + direction = "DESC" if sort_order.upper() == "DESC" else "ASC" + order_by_clause = f' ORDER BY "{sort_column}" {direction}' + elif not aggregations and column_types_list: + order_by_clause = f' ORDER BY "{column_types_list[0].name}" ASC' + + limit_clause = f" LIMIT {int(limit)}" + offset_clause = f" OFFSET {int(offset)}" if offset > 0 else "" + + # 5. Execution + # Count Query + count_query = f'SELECT COUNT(*) FROM "{table_name}"{where_clause}' + cursor.execute(count_query, where_params) + total_count = cursor.fetchone()[0] + + # Data Query + query = f'SELECT {select_clause} FROM "{table_name}"{where_clause}{group_by_clause}{order_by_clause}{limit_clause}{offset_clause}' + + start_time = time.time() + cursor.execute(query, where_params) + rows = cursor.fetchall() + execution_time_ms = (time.time() - start_time) * 1000 + + # 6. Formatting + data = [[str(val) if val is not None else "" for val in row] for row in rows] + + response_column_types = [] + for col_name in headers: + if col_name in column_types_map: + ct = column_types_map[col_name] + response_column_types.append({ + "name": ct.name, "type": ct.type, + "notnull": ct.notnull, "pk": ct.pk, "dflt_value": ct.dflt_value + }) + else: + response_column_types.append({ + "name": col_name, "type": "REAL", "notnull": False, + "pk": False, "dflt_value": None + }) + + result = { + "headers": headers, + "data": data, + "total_count": total_count, + "column_types": response_column_types, + "query_metadata": { + "query_type": "aggregate" if aggregations else "select", + "sql": query, + "filters_applied": len(filters) if filters else 0, + "has_search": bool(search_value) + }, + "cached": False, + "execution_time_ms": execution_time_ms, + "limit": limit, + "offset": offset, + "table_name": table_name, + "database_path": str(db_path) + } + + if use_cache: + self.cache.set(cache_key, result, table_mtime) + + return result + + def _build_cache_key(self, db_path, table_name, limit, offset, columns, sort_column, + sort_order, search_value, filters, aggregations, group_by) -> str: + """Build precise cache key.""" + params = { + "db": str(db_path), "tbl": table_name, "l": limit, "o": offset, + "cols": columns, "sc": sort_column, "so": sort_order, "q": search_value, + "f": [(f.column, f.operator, f.value, f.value2) for f in (filters or [])], + "a": [(a.column, a.function, a.alias) for a in (aggregations or [])], + "gb": group_by + } + return hashlib.md5(json.dumps(params, sort_keys=True, default=str).encode()).hexdigest() + +_query_service: QueryService | None = None +_service_lock = threading.Lock() + +def get_query_service() -> QueryService: + """Get the global query service instance.""" + global _query_service + if _query_service is None: + with _service_lock: + if _query_service is None: + _query_service = QueryService() + return _query_service diff --git a/app/services/data/schema_analyzer.py b/app/services/data/schema_analyzer.py new file mode 100644 index 0000000..6bd3908 --- /dev/null +++ b/app/services/data/schema_analyzer.py @@ -0,0 +1,376 @@ +""" +Schema Analyzer. + +Comprehensive database schema introspection with sample value analysis. +Profiles tables and columns to provide input for type inference and AI analysis. +""" + +from __future__ import annotations + +import logging +import sqlite3 +import sys +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +logger = logging.getLogger(__name__) + + +@dataclass +class ColumnProfile: + """Detailed profile of a database column.""" + + name: str + sqlite_type: str # INTEGER, TEXT, REAL, BLOB, NULL + sample_values: list[Any] = field(default_factory=list) + null_count: int = 0 + total_count: int = 0 + unique_count: int = 0 + avg_length: float = 0.0 # For TEXT columns + min_value: Any = None # For numeric columns + max_value: Any = None + detected_patterns: list[str] = field(default_factory=list) + + @property + def null_ratio(self) -> float: + """Percentage of NULL values.""" + return self.null_count / self.total_count if self.total_count > 0 else 0.0 + + @property + def unique_ratio(self) -> float: + """Cardinality indicator (unique / total).""" + return self.unique_count / self.total_count if self.total_count > 0 else 0.0 + + @property + def is_likely_id(self) -> bool: + """Check if column is likely an identifier.""" + # High cardinality + low nulls + ID-like name pattern + return ( + self.unique_ratio > 0.9 and + self.null_ratio < 0.01 and + any(p in self.name.lower() for p in ["id", "key", "ref"]) + ) + + +@dataclass +class TableProfile: + """Complete profile of a database table.""" + + name: str + row_count: int = 0 + columns: list[ColumnProfile] = field(default_factory=list) + primary_key: str | None = None + foreign_keys: list[str] = field(default_factory=list) + + @property + def column_count(self) -> int: + return len(self.columns) + + def get_column(self, name: str) -> ColumnProfile | None: + """Get a column profile by name.""" + for col in self.columns: + if col.name == name: + return col + return None + + +class SchemaAnalyzer: + """ + Database schema introspection and profiling. + + Analyzes SQLite databases to extract: + - Table metadata (row counts, column counts) + - Column details (types, nullability, cardinality) + - Sample values for type inference + - Statistical summaries + """ + + def __init__(self, sample_size: int = 10) -> None: + """ + Initialize the schema analyzer. + + Args: + sample_size: Number of sample values to collect per column + """ + self.sample_size = sample_size + + def analyze_database(self, db_path: Path) -> list[TableProfile]: + """ + Analyze all tables in a SQLite database. + + Args: + db_path: Path to the SQLite database file + + Returns: + List of TableProfile objects for each table + """ + profiles: list[TableProfile] = [] + + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Get list of user tables + cursor.execute(""" + SELECT name FROM sqlite_master + WHERE type='table' + AND name NOT LIKE 'sqlite_%' + ORDER BY name + """) + tables = [row[0] for row in cursor.fetchall()] + + for table_name in tables: + try: + profile = self._analyze_table(cursor, table_name) + profiles.append(profile) + except Exception as e: + logger.warning(f"Error analyzing table {table_name}: {e}") + + conn.close() + + except sqlite3.Error as e: + logger.error(f"Error opening database {db_path}: {e}") + raise + + logger.info(f"Analyzed {len(profiles)} tables from {db_path}") + return profiles + + def analyze_table(self, db_path: Path, table_name: str) -> TableProfile: + """ + Analyze a single table in a SQLite database. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table to analyze + + Returns: + TableProfile for the specified table + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + profile = self._analyze_table(cursor, table_name) + + conn.close() + return profile + + except sqlite3.Error as e: + logger.error(f"Error analyzing table {table_name}: {e}") + raise + + def get_sample_values( + self, + db_path: Path, + table_name: str, + column_name: str, + n: int | None = None + ) -> list[Any]: + """ + Get sample values from a specific column. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table + column_name: Name of the column + n: Number of samples (defaults to self.sample_size) + + Returns: + List of sample values (distinct, non-null when possible) + """ + if n is None: + n = self.sample_size + + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Validate table exists + cursor.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name=?", + (table_name,) + ) + if not cursor.fetchone(): + raise ValueError(f"Table not found: {table_name}") + + # Get distinct non-null samples first + safe_col = column_name.replace('"', '""') + cursor.execute(f''' + SELECT DISTINCT "{safe_col}" + FROM "{table_name}" + WHERE "{safe_col}" IS NOT NULL + LIMIT ? + ''', (n,)) + + samples = [row[0] for row in cursor.fetchall()] + conn.close() + + return samples + + except sqlite3.Error as e: + logger.error(f"Error getting samples from {table_name}.{column_name}: {e}") + raise + + # ─── Private Methods ──────────────────────────────────────────────────── + + def _analyze_table(self, cursor: sqlite3.Cursor, table_name: str) -> TableProfile: + """Analyze a single table using an open cursor.""" + + profile = TableProfile(name=table_name) + + # Get row count + cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"') + profile.row_count = cursor.fetchone()[0] + + # Get column info + cursor.execute(f'PRAGMA table_info("{table_name}")') + columns_info = cursor.fetchall() + + # Get primary key + for col_info in columns_info: + if col_info[5] == 1: # pk column in PRAGMA result + profile.primary_key = col_info[1] + break + + # Analyze each column + for col_info in columns_info: + col_name = col_info[1] + col_type = col_info[2] or "TEXT" + + col_profile = self._analyze_column( + cursor, table_name, col_name, col_type, profile.row_count + ) + profile.columns.append(col_profile) + + return profile + + def _analyze_column( + self, + cursor: sqlite3.Cursor, + table_name: str, + col_name: str, + col_type: str, + row_count: int + ) -> ColumnProfile: + """Analyze a single column.""" + + safe_col = col_name.replace('"', '""') + safe_table = table_name.replace('"', '""') + + profile = ColumnProfile( + name=col_name, + sqlite_type=col_type.upper(), + total_count=row_count, + ) + + if row_count == 0: + return profile + + # Get null count + cursor.execute(f''' + SELECT COUNT(*) FROM "{safe_table}" WHERE "{safe_col}" IS NULL + ''') + profile.null_count = cursor.fetchone()[0] + + # Get unique count (limit to avoid performance issues on large tables) + try: + cursor.execute(f''' + SELECT COUNT(DISTINCT "{safe_col}") FROM "{safe_table}" + ''') + profile.unique_count = cursor.fetchone()[0] + except sqlite3.Error: + profile.unique_count = 0 + + # Get sample values (distinct, non-null) + cursor.execute(f''' + SELECT DISTINCT "{safe_col}" + FROM "{safe_table}" + WHERE "{safe_col}" IS NOT NULL + LIMIT {self.sample_size} + ''') + profile.sample_values = [row[0] for row in cursor.fetchall()] + + # Get statistics for numeric columns + if col_type.upper() in ("INTEGER", "REAL", "NUMERIC"): + try: + cursor.execute(f''' + SELECT MIN("{safe_col}"), MAX("{safe_col}"), AVG(LENGTH(CAST("{safe_col}" AS TEXT))) + FROM "{safe_table}" + WHERE "{safe_col}" IS NOT NULL + ''') + result = cursor.fetchone() + if result: + profile.min_value = result[0] + profile.max_value = result[1] + profile.avg_length = result[2] or 0.0 + except sqlite3.Error: + pass + + # Get average length for text columns + elif col_type.upper() in ("TEXT", "VARCHAR", "CHAR", ""): + try: + cursor.execute(f''' + SELECT AVG(LENGTH("{safe_col}")) + FROM "{safe_table}" + WHERE "{safe_col}" IS NOT NULL + ''') + result = cursor.fetchone() + if result and result[0]: + profile.avg_length = float(result[0]) + except sqlite3.Error: + pass + + # Detect patterns in sample values + profile.detected_patterns = self._detect_patterns(profile.sample_values) + + return profile + + def _detect_patterns(self, values: list[Any]) -> list[str]: + """Detect common patterns in sample values.""" + patterns: list[str] = [] + + if not values: + return patterns + + str_values = [str(v) for v in values if v is not None] + if not str_values: + return patterns + + # Check for URL pattern + if all(v.startswith(("http://", "https://")) for v in str_values): + patterns.append("url") + + # Check for email pattern + if all("@" in v and "." in v for v in str_values): + patterns.append("email") + + # Check for GO term pattern + if all(v.startswith("GO:") for v in str_values): + patterns.append("go_term") + + + # Check for ISO date pattern + date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}") + if all(date_pattern.match(v) for v in str_values): + patterns.append("iso_date") + + # Check for UUID pattern + uuid_pattern = re.compile( + r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + re.IGNORECASE + ) + if all(uuid_pattern.match(v) for v in str_values): + patterns.append("uuid") + + # Check for sequence pattern (DNA/RNA/Protein) + seq_pattern = re.compile(r"^[ATCGUN]+$", re.IGNORECASE) + protein_pattern = re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]+$", re.IGNORECASE) + if all(len(v) > 20 for v in str_values): + if all(seq_pattern.match(v) for v in str_values): + patterns.append("nucleotide_sequence") + elif all(protein_pattern.match(v) for v in str_values): + patterns.append("protein_sequence") + + return patterns diff --git a/app/services/data/schema_service.py b/app/services/data/schema_service.py new file mode 100644 index 0000000..4db9985 --- /dev/null +++ b/app/services/data/schema_service.py @@ -0,0 +1,154 @@ +""" +Schema Information Service. + +Provides table and column schema information including: +- Column names, types, constraints (NOT NULL, PRIMARY KEY) +- Default values +- Indexes +""" + +from __future__ import annotations + +import sqlite3 +import logging +import threading +from pathlib import Path +from typing import Any + +from app.services.data.connection_pool import get_connection_pool +from app.services.data.query_service import QueryService +from app.utils.sqlite import list_tables + +logger = logging.getLogger(__name__) + + +class SchemaService: + """ + Service for retrieving database schema information. + """ + + def __init__(self) -> None: + """Initialize the schema service.""" + self.pool = get_connection_pool() + self.query_service = QueryService() + + def get_table_schema( + self, + db_path: Path, + table_name: str + ) -> dict[str, Any]: + """ + Get schema information for a single table. + + Args: + db_path: Path to SQLite database + table_name: Name of the table + + Returns: + Dictionary with table schema information + """ + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + # Get column information + column_types = self.query_service.get_column_types(db_path, table_name) + + columns = [] + for col_type in column_types: + columns.append({ + "name": col_type.name, + "type": col_type.type, + "notnull": col_type.notnull, + "pk": col_type.pk, + "dflt_value": col_type.dflt_value + }) + + # Get indexes + indexes = self._get_table_indexes(cursor, table_name) + + return { + "table": table_name, + "columns": columns, + "indexes": indexes + } + + def get_all_tables_schema( + self, + db_path: Path + ) -> dict[str, Any]: + """ + Get schema information for all tables in the database. + + Args: + db_path: Path to SQLite database + + Returns: + Dictionary mapping table names to schema information + """ + """ + + table_names = list_tables(db_path) + schemas = {} + + for table_name in table_names: + try: + schemas[table_name] = self.get_table_schema(db_path, table_name) + except Exception as e: + logger.warning(f"Error getting schema for {table_name}: {e}") + + return schemas + + def _get_table_indexes( + self, + cursor: sqlite3.Cursor, + table_name: str + ) -> list[dict[str, str]]: + """ + Get all indexes for a table. + + Args: + cursor: Database cursor + table_name: Name of the table + + Returns: + List of index information dictionaries + """ + indexes = [] + + try: + # Get indexes for this table + cursor.execute(""" + SELECT name, sql + FROM sqlite_master + WHERE type='index' + AND tbl_name=? + AND name NOT LIKE 'sqlite_%' + """, (table_name,)) + + for row in cursor.fetchall(): + indexes.append({ + "name": row[0], + "sql": row[1] or "" + }) + + except sqlite3.Error as e: + logger.warning(f"Error getting indexes for {table_name}: {e}") + + return indexes + + +# Global schema service instance +_schema_service: SchemaService | None = None +_schema_service_lock = threading.Lock() + + +def get_schema_service() -> SchemaService: + """Get the global schema service instance.""" + global _schema_service + + if _schema_service is None: + with _schema_service_lock: + if _schema_service is None: + _schema_service = SchemaService() + + return _schema_service diff --git a/app/services/data/statistics_service.py b/app/services/data/statistics_service.py new file mode 100644 index 0000000..11ef552 --- /dev/null +++ b/app/services/data/statistics_service.py @@ -0,0 +1,326 @@ +""" +Column Statistics Service. + +Pre-computes and caches column statistics including: +- null_count, distinct_count, min, max, mean, median, stddev +- Sample values for data exploration +""" + +from __future__ import annotations + +import sqlite3 +import logging +import time +import threading +import math +from pathlib import Path +from typing import Any +from collections import OrderedDict +from dataclasses import dataclass + +from app.services.data.connection_pool import get_connection_pool +from app.services.data.query_service import QueryService + +logger = logging.getLogger(__name__) + + +@dataclass +class ColumnStatistics: + """Statistics for a single column.""" + + column: str + type: str + null_count: int = 0 + distinct_count: int = 0 + min: Any = None + max: Any = None + mean: float | None = None + median: float | None = None + stddev: float | None = None + sample_values: list[Any] = None + + def __post_init__(self): + """Initialize sample_values if None.""" + if self.sample_values is None: + self.sample_values = [] + + +class StatisticsCache: + """ + Cache for pre-computed column statistics. + + Invalidates when table modification time changes. + """ + + def __init__(self) -> None: + """Initialize the statistics cache.""" + self._cache: dict[str, tuple[dict[str, Any], float]] = {} + self._lock = threading.Lock() + + def get(self, cache_key: str, table_mtime: float) -> dict[str, Any] | None: + """ + Get cached statistics. + + Args: + cache_key: Cache key (db_path:table_name) + table_mtime: Table file modification time + + Returns: + Cached statistics if valid, None otherwise + """ + with self._lock: + if cache_key not in self._cache: + return None + + stats, cached_mtime = self._cache[cache_key] + + # Check if table has been modified + if cached_mtime != table_mtime: + del self._cache[cache_key] + return None + + return stats + + def set(self, cache_key: str, stats: dict[str, Any], table_mtime: float) -> None: + """ + Store statistics in cache. + + Args: + cache_key: Cache key (db_path:table_name) + stats: Statistics dictionary + table_mtime: Table file modification time + """ + with self._lock: + self._cache[cache_key] = (stats, table_mtime) + + def clear(self) -> None: + """Clear all cached statistics.""" + with self._lock: + self._cache.clear() + + +# Global statistics cache instance +_stats_cache: StatisticsCache | None = None +_stats_cache_lock = threading.Lock() + + +def get_statistics_cache() -> StatisticsCache: + """Get the global statistics cache instance.""" + global _stats_cache + + if _stats_cache is None: + with _stats_cache_lock: + if _stats_cache is None: + _stats_cache = StatisticsCache() + + return _stats_cache + + +class StatisticsService: + """ + Service for computing and caching column statistics. + """ + + def __init__(self) -> None: + """Initialize the statistics service.""" + self.pool = get_connection_pool() + self.query_service = QueryService() + self.cache = get_statistics_cache() + + def get_table_statistics( + self, + db_path: Path, + table_name: str, + use_cache: bool = True + ) -> dict[str, Any]: + """ + Get comprehensive statistics for all columns in a table. + + Args: + db_path: Path to SQLite database + table_name: Name of the table + use_cache: Whether to use cached statistics + + Returns: + Dictionary with table and column statistics + """ + # Get table modification time for cache invalidation + try: + table_mtime = db_path.stat().st_mtime + except OSError: + table_mtime = 0.0 + + cache_key = f"{db_path.absolute()}:{table_name}" + + # Check cache + if use_cache: + cached_stats = self.cache.get(cache_key, table_mtime) + if cached_stats is not None: + logger.debug(f"Cache hit for statistics: {table_name}") + return cached_stats + + # Get connection + conn = self.pool.get_connection(db_path) + cursor = conn.cursor() + + # Get row count + cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"') + row_count = cursor.fetchone()[0] + + # Get column types + column_types = self.query_service.get_column_types(db_path, table_name) + + # Compute statistics for each column + column_stats_list = [] + + for col_type in column_types: + stats = self._compute_column_statistics( + cursor, table_name, col_type, row_count + ) + column_stats_list.append(stats) + + # Build response + result = { + "table": table_name, + "row_count": row_count, + "columns": [ + { + "column": stats.column, + "type": stats.type, + "null_count": stats.null_count, + "distinct_count": stats.distinct_count, + "min": stats.min, + "max": stats.max, + "mean": stats.mean, + "median": stats.median, + "stddev": stats.stddev, + "sample_values": stats.sample_values + } + for stats in column_stats_list + ], + "last_updated": int(time.time() * 1000) # Milliseconds since epoch + } + + # Cache result + if use_cache: + self.cache.set(cache_key, result, table_mtime) + + return result + + def _compute_column_statistics( + self, + cursor: sqlite3.Cursor, + table_name: str, + col_type: Any, # ColumnType from query_service + row_count: int + ) -> ColumnStatistics: + """ + Compute statistics for a single column. + + Args: + cursor: Database cursor + table_name: Name of the table + col_type: ColumnType object + row_count: Total row count + + Returns: + ColumnStatistics object + """ + column = col_type.name + sql_type = col_type.type + is_numeric = self.query_service.is_numeric_column(sql_type) + + safe_column = f'"{column}"' + + stats = ColumnStatistics(column=column, type=sql_type) + + try: + # Null count + cursor.execute(f'SELECT COUNT(*) FROM "{table_name}" WHERE {safe_column} IS NULL') + stats.null_count = cursor.fetchone()[0] + + # Distinct count + cursor.execute(f'SELECT COUNT(DISTINCT {safe_column}) FROM "{table_name}"') + stats.distinct_count = cursor.fetchone()[0] + + if is_numeric: + # Numeric statistics + try: + # Min, max, mean + cursor.execute(f''' + SELECT + MIN({safe_column}), + MAX({safe_column}), + AVG({safe_column}) + FROM "{table_name}" + WHERE {safe_column} IS NOT NULL + ''') + row = cursor.fetchone() + if row and row[0] is not None: + stats.min = float(row[0]) if "REAL" in sql_type.upper() else int(row[0]) + stats.max = float(row[1]) if "REAL" in sql_type.upper() else int(row[1]) + stats.mean = float(row[2]) if row[2] is not None else None + + # Median (approximate using ORDER BY and LIMIT) + if row_count > 0: + cursor.execute(f''' + SELECT {safe_column} + FROM "{table_name}" + WHERE {safe_column} IS NOT NULL + ORDER BY {safe_column} + LIMIT 1 OFFSET ? + ''', (row_count // 2,)) + median_row = cursor.fetchone() + if median_row and median_row[0] is not None: + stats.median = float(median_row[0]) if "REAL" in sql_type.upper() else int(median_row[0]) + + # Standard deviation (approximate) + if stats.mean is not None: + cursor.execute(f''' + SELECT AVG(({safe_column} - ?) * ({safe_column} - ?)) + FROM "{table_name}" + WHERE {safe_column} IS NOT NULL + ''', (stats.mean, stats.mean)) + variance_row = cursor.fetchone() + if variance_row and variance_row[0] is not None: + variance = float(variance_row[0]) + stats.stddev = math.sqrt(variance) if variance >= 0 else None + + except sqlite3.Error as e: + logger.warning(f"Error computing numeric statistics for {column}: {e}") + + # Sample values (always compute) + try: + cursor.execute(f''' + SELECT DISTINCT {safe_column} + FROM "{table_name}" + WHERE {safe_column} IS NOT NULL + LIMIT 5 + ''') + sample_rows = cursor.fetchall() + stats.sample_values = [row[0] for row in sample_rows if row[0] is not None] + + except sqlite3.Error as e: + logger.warning(f"Error getting sample values for {column}: {e}") + + except sqlite3.Error as e: + logger.warning(f"Error computing statistics for {column}: {e}") + + return stats + + +# Global statistics service instance +_stats_service: StatisticsService | None = None +_stats_service_lock = threading.Lock() + + +def get_statistics_service() -> StatisticsService: + """Get the global statistics service instance.""" + global _stats_service + + if _stats_service is None: + with _stats_service_lock: + if _stats_service is None: + _stats_service = StatisticsService() + + return _stats_service diff --git a/app/services/data/type_inference.py b/app/services/data/type_inference.py new file mode 100644 index 0000000..bdd97e2 --- /dev/null +++ b/app/services/data/type_inference.py @@ -0,0 +1,550 @@ +""" +Type Inference Engine. + +Rule-based pattern detection for inferring column data types and rendering +configurations. This module provides fast, deterministic type inference +without requiring AI, and serves as the foundation for hybrid inference. + +Works independently of AI providers and can serve as a fallback. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import Any, Literal +from enum import Enum + + +class DataType(str, Enum): + """Column data types matching DataTables_Viewer ColumnDataType.""" + STRING = "string" + NUMBER = "number" + INTEGER = "integer" + FLOAT = "float" + BOOLEAN = "boolean" + DATE = "date" + DATETIME = "datetime" + TIMESTAMP = "timestamp" + JSON = "json" + ARRAY = "array" + SEQUENCE = "sequence" + ID = "id" + URL = "url" + EMAIL = "email" + ONTOLOGY = "ontology" + PERCENTAGE = "percentage" + FILESIZE = "filesize" + DURATION = "duration" + CURRENCY = "currency" + COLOR = "color" + IMAGE = "image" + CUSTOM = "custom" + + +@dataclass +class TransformConfig: + """Transform configuration for cell rendering.""" + type: str + options: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class InferredType: + """Result of type inference for a column.""" + data_type: DataType + display_name: str + categories: list[str] + transform: TransformConfig | None = None + width: str = "auto" + pin: Literal["left", "right"] | None = None + sortable: bool = True + filterable: bool = True + copyable: bool = False + confidence: float = 1.0 + source: Literal["rules", "ai", "hybrid"] = "rules" + + +# ============================================================================= +# PATTERN DEFINITIONS +# ============================================================================= + +# Column name patterns mapped to inference results +NAME_PATTERNS: list[tuple[re.Pattern, dict[str, Any]]] = [ + # IDs - typically pinned left + (re.compile(r"^(ID|id)$"), { + "data_type": DataType.ID, + "categories": ["core"], + "pin": "left", + "copyable": True, + "width": "100px", + }), + (re.compile(r".*_ID$|.*_id$|.*Id$"), { + "data_type": DataType.ID, + "categories": ["core"], + "copyable": True, + "width": "120px", + }), + (re.compile(r"^Database_ID$|^database_id$"), { + "data_type": DataType.ID, + "categories": ["core"], + "copyable": True, + "width": "130px", + }), + + # UniRef IDs - need chain transformer to strip prefix + (re.compile(r"^uniref_\d+$|^UniRef_\d+$|^uniref\d+$"), { + "data_type": DataType.ID, + "categories": ["external"], + "copyable": True, + "width": "140px", + "transform": TransformConfig( + type="chain", + options={ + "transforms": [ + {"type": "replace", "options": {"find": "UniRef:", "replace": ""}}, + {"type": "link", "options": { + "urlTemplate": "https://www.uniprot.org/uniref/{value}", + "target": "_blank", + "icon": "bi-link-45deg" + }} + ] + } + ), + }), + + # External database references with link transforms + (re.compile(r"^Uniprot.*|^uniprot.*|.*UniProt.*"), { + "data_type": DataType.ID, + "categories": ["external"], + "width": "100px", + "transform": TransformConfig( + type="link", + options={ + "urlTemplate": "https://www.uniprot.org/uniprotkb/{value}", + "target": "_blank", + "icon": "bi-link-45deg" + } + ), + }), + (re.compile(r"^KEGG.*|^kegg.*"), { + "data_type": DataType.ID, + "categories": ["external"], + "width": "90px", + "transform": TransformConfig( + type="link", + options={ + "urlTemplate": "https://www.genome.jp/entry/{value}", + "target": "_blank" + } + ), + }), + (re.compile(r"^GO_.*|^go_.*"), { + "data_type": DataType.ONTOLOGY, + "categories": ["functional"], + "width": "180px", + "transform": TransformConfig( + type="ontology", + options={ + "prefix": "GO", + "urlTemplate": "https://amigo.geneontology.org/amigo/term/{value}", + "style": "badge" + } + ), + }), + + # Pfam domain IDs + (re.compile(r"^pfam.*|^Pfam.*|^PF\d+"), { + "data_type": DataType.ID, + "categories": ["ontology"], + "width": "100px", + "transform": TransformConfig( + type="chain", + options={ + "transforms": [ + {"type": "replace", "options": {"find": "pfam:", "replace": ""}}, + {"type": "link", "options": { + "urlTemplate": "https://www.ebi.ac.uk/interpro/entry/pfam/{value}", + "target": "_blank", + "icon": "bi-link-45deg" + }} + ] + } + ), + }), + + # NCBI protein IDs (RefSeq) + (re.compile(r"^ncbi.*|.*_ncbi.*|^NP_.*|^WP_.*|^XP_.*"), { + "data_type": DataType.ID, + "categories": ["external"], + "copyable": True, + "width": "120px", + "transform": TransformConfig( + type="link", + options={ + "urlTemplate": "https://www.ncbi.nlm.nih.gov/protein/{value}", + "target": "_blank", + "icon": "bi-link-45deg" + } + ), + }), + + # Strand indicator (+/-) + (re.compile(r"^strand$|^Strand$|.*_strand$"), { + "data_type": DataType.STRING, + "categories": ["core"], + "width": "80px", + "transform": TransformConfig( + type="badge", + options={ + "colorMap": { + "+": {"color": "#22c55e", "bgColor": "#dcfce7"}, + "-": {"color": "#ef4444", "bgColor": "#fee2e2"}, + ".": {"color": "#94a3b8", "bgColor": "#f1f5f9"} + } + } + ), + }), + + # Sequences + (re.compile(r".*Sequence.*|.*_seq$|.*_Seq$"), { + "data_type": DataType.SEQUENCE, + "categories": ["sequence"], + "sortable": False, + "filterable": False, + "copyable": True, + "width": "150px", + "transform": TransformConfig( + type="sequence", + options={"maxLength": 20, "showCopyButton": True} + ), + }), + + # Function/product descriptions + (re.compile(r".*function.*|.*Function.*|.*product.*|.*Product.*"), { + "data_type": DataType.STRING, + "categories": ["functional"], + "width": "300px", + }), + + # Statistical measures with special formatting + (re.compile(r"^Log2FC$|.*log2.*fold.*|.*Log2.*Fold.*"), { + "data_type": DataType.FLOAT, + "categories": ["expression"], + "width": "130px", + "transform": TransformConfig( + type="heatmap", + options={ + "min": -4, "max": 4, + "colorScale": "diverging", + "showValue": True, + "decimals": 2 + } + ), + }), + (re.compile(r"^P[_-]?[Vv]alue$|^pvalue$|^p_value$"), { + "data_type": DataType.FLOAT, + "categories": ["statistics"], + "width": "100px", + "transform": TransformConfig( + type="number", + options={"notation": "scientific", "decimals": 2} + ), + }), + (re.compile(r"^FDR$|^fdr$|^q[_-]?value$"), { + "data_type": DataType.FLOAT, + "categories": ["statistics"], + "width": "100px", + "transform": TransformConfig( + type="number", + options={"notation": "scientific", "decimals": 2} + ), + }), + + # Boolean indicators + (re.compile(r"^Significant$|^is_.*|^has_.*"), { + "data_type": DataType.BOOLEAN, + "categories": ["statistics"], + "width": "90px", + "transform": TransformConfig( + type="boolean", + options={ + "trueIcon": "bi-check-circle-fill", + "falseIcon": "bi-x-circle", + "trueColor": "#22c55e", + "falseColor": "#94a3b8" + } + ), + }), + + # Temperature with unit + (re.compile(r".*Temperature.*|.*_in_C$"), { + "data_type": DataType.FLOAT, + "categories": ["experimental"], + "width": "120px", + "transform": TransformConfig( + type="number", + options={"decimals": 1, "suffix": "°C"} + ), + }), + + # Concentration fields + (re.compile(r".*Concentration.*|.*_in_mM$|.*_in_mg.*"), { + "data_type": DataType.FLOAT, + "categories": ["media"], + "width": "120px", + "transform": TransformConfig( + type="number", + options={"decimals": 2} + ), + }), + + # Name fields + (re.compile(r"^Name$|^name$|.*_Name$|.*_name$"), { + "data_type": DataType.STRING, + "categories": ["core"], + "width": "200px", + }), + + # URL fields + (re.compile(r".*_URL$|.*_url$|.*Link$|.*link$"), { + "data_type": DataType.URL, + "categories": ["external"], + "width": "150px", + }), +] + +# Value patterns for detecting types from sample data +VALUE_PATTERNS: list[tuple[re.Pattern, DataType]] = [ + # URLs + (re.compile(r"^https?://"), DataType.URL), + # Email + (re.compile(r"^[\w.+-]+@[\w-]+\.[\w.-]+$"), DataType.EMAIL), + # GO terms + (re.compile(r"^GO:\d{7}"), DataType.ONTOLOGY), + # ISO dates + (re.compile(r"^\d{4}-\d{2}-\d{2}$"), DataType.DATE), + (re.compile(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}"), DataType.DATETIME), + # Colors + (re.compile(r"^#[0-9a-fA-F]{6}$|^rgb\("), DataType.COLOR), + # DNA/RNA sequences (long strings of ATCGU only) + (re.compile(r"^[ATCGU]{20,}$", re.IGNORECASE), DataType.SEQUENCE), + # Protein sequences (amino acid codes) + (re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]{20,}$", re.IGNORECASE), DataType.SEQUENCE), +] + + +# ============================================================================= +# TYPE INFERENCE ENGINE +# ============================================================================= + +class TypeInferenceEngine: + """ + Rule-based type inference engine. + + Analyzes column names and sample values to infer data types, + display configurations, and rendering transforms without AI. + """ + + def __init__(self) -> None: + self._name_patterns = NAME_PATTERNS + self._value_patterns = VALUE_PATTERNS + + def infer_from_name(self, column_name: str) -> InferredType | None: + """ + Infer column type from column name patterns. + + Args: + column_name: The name of the column + + Returns: + InferredType if a pattern matches, None otherwise + """ + for pattern, config in self._name_patterns: + if pattern.match(column_name): + return InferredType( + data_type=config.get("data_type", DataType.STRING), + display_name=self._format_display_name(column_name), + categories=config.get("categories", []), + transform=config.get("transform"), + width=config.get("width", "auto"), + pin=config.get("pin"), + sortable=config.get("sortable", True), + filterable=config.get("filterable", True), + copyable=config.get("copyable", False), + confidence=0.9, # High confidence for name pattern match + source="rules", + ) + return None + + def infer_from_values( + self, + column_name: str, + sample_values: list[Any], + sqlite_type: str = "TEXT" + ) -> InferredType: + """ + Infer column type from sample values. + + Args: + column_name: The name of the column + sample_values: List of sample values from the column + sqlite_type: The SQLite column type + + Returns: + InferredType with inferred configuration + """ + # First, try name-based inference + name_inference = self.infer_from_name(column_name) + if name_inference: + return name_inference + + # Filter out None/empty values for analysis + valid_values = [v for v in sample_values if v is not None and str(v).strip()] + + if not valid_values: + return self._default_inference(column_name, sqlite_type) + + # Check for boolean values + if self._is_boolean(valid_values): + return InferredType( + data_type=DataType.BOOLEAN, + display_name=self._format_display_name(column_name), + categories=["metadata"], + confidence=0.95, + ) + + # Check for numeric types based on SQLite type and values + if sqlite_type in ("INTEGER", "REAL") or self._is_numeric(valid_values): + return self._infer_numeric(column_name, valid_values, sqlite_type) + + # Check value patterns + str_values = [str(v) for v in valid_values] + for pattern, data_type in self._value_patterns: + matches = sum(1 for v in str_values if pattern.match(v)) + if matches / len(str_values) > 0.5: # >50% match threshold + return InferredType( + data_type=data_type, + display_name=self._format_display_name(column_name), + categories=self._default_category(data_type), + confidence=0.8, + ) + + # Default to string + return self._default_inference(column_name, sqlite_type) + + def infer( + self, + column_name: str, + sample_values: list[Any] | None = None, + sqlite_type: str = "TEXT" + ) -> InferredType: + """ + Full inference combining name and value analysis. + + Args: + column_name: The name of the column + sample_values: Optional list of sample values + sqlite_type: The SQLite column type + + Returns: + InferredType with best inference + """ + if sample_values: + return self.infer_from_values(column_name, sample_values, sqlite_type) + + name_inference = self.infer_from_name(column_name) + if name_inference: + return name_inference + + return self._default_inference(column_name, sqlite_type) + + # ─── Helper Methods ───────────────────────────────────────────────────── + + def _format_display_name(self, column_name: str) -> str: + """Convert column name to human-readable display name.""" + # Replace underscores and handle camelCase + name = re.sub(r"_", " ", column_name) + name = re.sub(r"([a-z])([A-Z])", r"\1 \2", name) + # Title case but preserve acronyms + words = name.split() + formatted = [] + for word in words: + if word.isupper() and len(word) <= 4: # Likely acronym + formatted.append(word) + else: + formatted.append(word.capitalize()) + return " ".join(formatted) + + def _is_boolean(self, values: list[Any]) -> bool: + """Check if values represent boolean data.""" + bool_values = {"true", "false", "yes", "no", "1", "0", "t", "f", "y", "n"} + str_values = {str(v).lower() for v in values} + return str_values.issubset(bool_values) and len(str_values) <= 2 + + def _is_numeric(self, values: list[Any]) -> bool: + """Check if all values are numeric.""" + for v in values: + if v is None: + continue + try: + float(v) + except (ValueError, TypeError): + return False + return True + + def _infer_numeric( + self, + column_name: str, + values: list[Any], + sqlite_type: str + ) -> InferredType: + """Infer numeric type details.""" + # Check if all values are integers + is_integer = all( + isinstance(v, int) or (isinstance(v, float) and v.is_integer()) + for v in values if v is not None + ) + + data_type = DataType.INTEGER if (sqlite_type == "INTEGER" or is_integer) else DataType.FLOAT + + return InferredType( + data_type=data_type, + display_name=self._format_display_name(column_name), + categories=["data"], + width="100px", + transform=TransformConfig( + type="number", + options={"decimals": 0 if is_integer else 2} + ) if data_type == DataType.FLOAT else None, + confidence=0.85, + ) + + def _default_inference(self, column_name: str, sqlite_type: str) -> InferredType: + """Return default string inference.""" + # Map SQLite types to data types + type_map = { + "INTEGER": DataType.INTEGER, + "REAL": DataType.FLOAT, + "BLOB": DataType.CUSTOM, + } + + return InferredType( + data_type=type_map.get(sqlite_type, DataType.STRING), + display_name=self._format_display_name(column_name), + categories=["data"], + confidence=0.5, + ) + + def _default_category(self, data_type: DataType) -> list[str]: + """Get default categories for a data type.""" + category_map = { + DataType.ID: ["core"], + DataType.URL: ["external"], + DataType.EMAIL: ["external"], + DataType.ONTOLOGY: ["functional"], + DataType.SEQUENCE: ["sequence"], + DataType.DATE: ["metadata"], + DataType.DATETIME: ["metadata"], + } + return category_map.get(data_type, ["data"]) diff --git a/app/services/data/validation.py b/app/services/data/validation.py new file mode 100644 index 0000000..553c038 --- /dev/null +++ b/app/services/data/validation.py @@ -0,0 +1,396 @@ +""" +Configuration Validation Module. + +Provides JSON schema validation for generated DataTables Viewer configurations +to ensure compatibility with the frontend viewer. +""" + +from __future__ import annotations + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +try: + from jsonschema import validate, ValidationError, Draft7Validator + HAS_JSONSCHEMA = True +except ImportError: + HAS_JSONSCHEMA = False + # Dummy objects if needed + validate = None + ValidationError = Exception + Draft7Validator = None + + +# ============================================================================= +# JSON SCHEMAS +# ============================================================================= + +# Schema for individual column configuration +COLUMN_SCHEMA = { + "type": "object", + "required": ["column", "displayName"], + "properties": { + "column": {"type": "string", "minLength": 1}, + "displayName": {"type": "string", "minLength": 1}, + "dataType": { + "type": "string", + "enum": [ + "string", "number", "integer", "float", "boolean", + "date", "datetime", "timestamp", "duration", + "id", "url", "email", "phone", + "percentage", "currency", "filesize", + "sequence", "ontology", "json", "array" + ] + }, + "visible": {"type": "boolean"}, + "sortable": {"type": "boolean"}, + "filterable": {"type": "boolean"}, + "searchable": {"type": "boolean"}, + "copyable": {"type": "boolean"}, + "width": {"type": "string"}, + "align": {"type": "string", "enum": ["left", "center", "right"]}, + "pin": {"type": ["string", "null"], "enum": ["left", "right", None]}, + "categories": { + "type": "array", + "items": {"type": "string"} + }, + "transform": { + "type": ["object", "null"], + "properties": { + "type": {"type": "string"}, + "options": {"type": "object"} + } + } + }, + "additionalProperties": True # Allow future extensions +} + +# Schema for table configuration +TABLE_SCHEMA = { + "type": "object", + "required": ["displayName", "columns"], + "properties": { + "displayName": {"type": "string", "minLength": 1}, + "description": {"type": "string"}, + "icon": {"type": "string"}, + "settings": {"type": "object"}, + "categories": { + "type": "array", + "items": {"type": "object"} + }, + "columns": { + "type": "array", + "items": COLUMN_SCHEMA, + "minItems": 1 + } + } +} + +# Schema for complete DataTypeConfig +DATATYPE_CONFIG_SCHEMA = { + "type": "object", + "required": ["id", "name", "tables"], + "properties": { + "id": {"type": "string", "minLength": 1}, + "name": {"type": "string", "minLength": 1}, + "description": {"type": "string"}, + "version": {"type": "string", "pattern": r"^\d+\.\d+\.\d+$"}, + "icon": {"type": "string"}, + "color": {"type": "string"}, + "objectType": {"type": "string"}, + "defaults": { + "type": "object", + "properties": { + "pageSize": {"type": "integer", "minimum": 1, "maximum": 1000}, + "density": {"type": "string", "enum": ["compact", "default", "comfortable"]}, + "showRowNumbers": {"type": "boolean"}, + "enableSelection": {"type": "boolean"}, + "enableExport": {"type": "boolean"} + } + }, + "sharedCategories": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "name"], + "properties": { + "id": {"type": "string"}, + "name": {"type": "string"}, + "icon": {"type": "string"}, + "color": {"type": "string"}, + "defaultVisible": {"type": "boolean"}, + "order": {"type": "integer"} + } + } + }, + "tables": { + "type": "object", + "additionalProperties": TABLE_SCHEMA, + "minProperties": 1 + } + } +} + +# Schema for AI-generated column response (single table) +AI_RESPONSE_SCHEMA = { + "type": "object", + "required": ["columns"], + "properties": { + "columns": { + "type": "array", + "items": COLUMN_SCHEMA, + "minItems": 1 + } + } +} + + +# ============================================================================= +# VALIDATION FUNCTIONS +# ============================================================================= + +def validate_config(config: dict[str, Any]) -> tuple[bool, str | None]: + """ + Validate a complete DataTypeConfig against the schema. + + Args: + config: The configuration dictionary to validate + + Returns: + Tuple of (is_valid, error_message) + """ + try: + if not HAS_JSONSCHEMA: + raise ImportError("jsonschema not available") + + validator = Draft7Validator(DATATYPE_CONFIG_SCHEMA) + errors = list(validator.iter_errors(config)) + + if not errors: + return True, None + + # Format first error + first_error = errors[0] + path = ".".join(str(p) for p in first_error.absolute_path) or "root" + return False, f"Validation error at '{path}': {first_error.message}" + + except ImportError: + # jsonschema not available, do basic validation + return _basic_validation(config) + except Exception as e: + logger.warning(f"Validation error: {e}") + return False, str(e) + + +def validate_table_config(table_config: dict[str, Any]) -> tuple[bool, str | None]: + """ + Validate a single table configuration. + + Args: + table_config: Table configuration dictionary + + Returns: + Tuple of (is_valid, error_message) + """ + try: + if not HAS_JSONSCHEMA: + raise ImportError("jsonschema not available") + + validate(instance=table_config, schema=TABLE_SCHEMA) + return True, None + + except ImportError: + return _basic_table_validation(table_config) + except Exception as e: + return False, str(e) + + +def validate_ai_response(response: dict[str, Any]) -> tuple[bool, str | None]: + """ + Validate AI-generated column response. + + Args: + response: AI response dictionary + + Returns: + Tuple of (is_valid, error_message) + """ + try: + if not HAS_JSONSCHEMA: + raise ImportError("jsonschema not available") + + validate(instance=response, schema=AI_RESPONSE_SCHEMA) + return True, None + + except ImportError: + # Basic validation + if not isinstance(response, dict): + return False, "Response must be a dictionary" + if "columns" not in response: + return False, "Response must have 'columns' key" + if not isinstance(response["columns"], list): + return False, "'columns' must be an array" + if len(response["columns"]) == 0: + return False, "'columns' array must not be empty" + return True, None + + except Exception as e: + return False, str(e) + + +def validate_column_config(column: dict[str, Any]) -> tuple[bool, str | None]: + """ + Validate a single column configuration. + + Args: + column: Column configuration dictionary + + Returns: + Tuple of (is_valid, error_message) + """ + if not isinstance(column, dict): + return False, "Column must be a dictionary" + + if "column" not in column: + return False, "Column must have 'column' key" + + if "displayName" not in column: + return False, "Column must have 'displayName' key" + + # Validate transform structure if present + if "transform" in column and column["transform"] is not None: + transform = column["transform"] + if not isinstance(transform, dict): + return False, "Transform must be a dictionary" + if "type" not in transform: + return False, "Transform must have 'type' key" + + return True, None + + +# ============================================================================= +# BASIC VALIDATION (fallback when jsonschema unavailable) +# ============================================================================= + +def _basic_validation(config: dict[str, Any]) -> tuple[bool, str | None]: + """Basic validation without jsonschema library.""" + if not isinstance(config, dict): + return False, "Config must be a dictionary" + + # Check required fields + for field in ["id", "name", "tables"]: + if field not in config: + return False, f"Missing required field: {field}" + + if not isinstance(config["tables"], dict): + return False, "'tables' must be a dictionary" + + if len(config["tables"]) == 0: + return False, "'tables' must not be empty" + + # Validate each table + for table_name, table_config in config["tables"].items(): + is_valid, error = _basic_table_validation(table_config) + if not is_valid: + return False, f"Table '{table_name}': {error}" + + return True, None + + +def _basic_table_validation(table_config: dict[str, Any]) -> tuple[bool, str | None]: + """Basic table validation without jsonschema library.""" + if not isinstance(table_config, dict): + return False, "Table config must be a dictionary" + + if "displayName" not in table_config: + return False, "Missing 'displayName'" + + if "columns" not in table_config: + return False, "Missing 'columns'" + + if not isinstance(table_config["columns"], list): + return False, "'columns' must be an array" + + if len(table_config["columns"]) == 0: + return False, "'columns' must not be empty" + + # Validate each column + for i, column in enumerate(table_config["columns"]): + is_valid, error = validate_column_config(column) + if not is_valid: + return False, f"Column {i}: {error}" + + return True, None + + +# ============================================================================= +# SANITIZATION +# ============================================================================= + +def sanitize_config(config: dict[str, Any]) -> dict[str, Any]: + """ + Sanitize and normalize a config, fixing common issues. + + Args: + config: Raw configuration dictionary + + Returns: + Sanitized configuration + """ + sanitized = dict(config) + + # Ensure version format + if "version" not in sanitized or not sanitized["version"]: + sanitized["version"] = "1.0.0" + + # Normalize tables + if "tables" in sanitized: + for table_name, table_config in sanitized["tables"].items(): + sanitized["tables"][table_name] = _sanitize_table(table_config) + + return sanitized + + +def _sanitize_table(table_config: dict[str, Any]) -> dict[str, Any]: + """Sanitize a table configuration.""" + sanitized = dict(table_config) + + # Ensure columns exist + if "columns" not in sanitized: + sanitized["columns"] = [] + + # Sanitize each column + sanitized["columns"] = [ + _sanitize_column(col) for col in sanitized["columns"] + ] + + return sanitized + + +def _sanitize_column(column: dict[str, Any]) -> dict[str, Any]: + """Sanitize a column configuration.""" + sanitized = dict(column) + + # Default display name to column name + if "displayName" not in sanitized and "column" in sanitized: + col_name = sanitized["column"] + # Convert snake_case to Title Case + sanitized["displayName"] = col_name.replace("_", " ").title() + + # Default data type + if "dataType" not in sanitized: + sanitized["dataType"] = "string" + + # Ensure categories is a list + if "categories" not in sanitized: + sanitized["categories"] = [] + elif not isinstance(sanitized["categories"], list): + sanitized["categories"] = [sanitized["categories"]] + + # Normalize null transform + if "transform" in sanitized and sanitized["transform"] is None: + del sanitized["transform"] + + return sanitized diff --git a/app/services/db_helper.py b/app/services/db_helper.py new file mode 100644 index 0000000..f514889 --- /dev/null +++ b/app/services/db_helper.py @@ -0,0 +1,120 @@ +""" +Database helper service to consolidate retrieval and validation logic. +Reduces code duplication in API routes. +""" +import logging +from pathlib import Path +from uuid import uuid4 + +from fastapi import HTTPException + +from app.config import settings +from app.utils.workspace import KBaseClient, download_pangenome_db +from app.utils.sqlite import validate_table_exists, list_tables +from app.utils.async_utils import run_sync_in_thread + +logger = logging.getLogger(__name__) + +async def get_handle_db_path( + handle_ref: str, + token: str, + kb_env: str, + cache_dir: Path +) -> Path: + """ + Get (and download if needed) a SQLite database from a handle reference. + + Args: + handle_ref: Handle reference string + token: KBase auth token + kb_env: KBase environment + cache_dir: Cache directory path + + Returns: + Path to the local SQLite database file + """ + def _download_handle_db(): + # Cache path based on handle + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_dir = cache_dir / "handles" + db_dir.mkdir(parents=True, exist_ok=True) + db_path = db_dir / f"{safe_handle}.db" + + # Atomic download if missing + if not db_path.exists(): + client = KBaseClient(token, kb_env, cache_dir) + temp_path = db_path.with_suffix(f".{uuid4().hex}.tmp") + try: + client.download_blob_file(handle_ref, temp_path) + temp_path.rename(db_path) + except Exception: + temp_path.unlink(missing_ok=True) + raise + return db_path + + try: + return await run_sync_in_thread(_download_handle_db) + except Exception as e: + logger.error(f"Error accessing handle database {handle_ref}: {e}") + raise HTTPException(status_code=500, detail=f"Failed to access database: {str(e)}") + + +async def get_object_db_path( + berdl_table_id: str, + token: str, + kb_env: str, + cache_dir: Path +) -> Path: + """ + Get (and download if needed) a SQLite database from a BERDL object. + + Args: + berdl_table_id: KBase workspace reference + token: KBase auth token + kb_env: KBase environment + cache_dir: Cache directory path + + Returns: + Path to the local SQLite database file + """ + try: + # download_pangenome_db already handles caching logic + return await run_sync_in_thread( + download_pangenome_db, + berdl_table_id, + token, + cache_dir, + kb_env + ) + except TimeoutError: + logger.error(f"Database download timed out for {berdl_table_id}") + raise HTTPException( + status_code=504, + detail="Database download timed out. Please try again later." + ) + except Exception as e: + logger.error(f"Error accessing object database {berdl_table_id}: {e}") + raise HTTPException(status_code=500, detail=f"Failed to access database: {str(e)}") + + +async def ensure_table_accessible(db_path: Path, table_name: str) -> bool: + """ + Validate that a table exists in the database. + Raises HTTPException 404 if not found. + + Args: + db_path: Path to SQLite database + table_name: Name of table to check + + Returns: + True if exists + """ + exists = await run_sync_in_thread(validate_table_exists, db_path, table_name) + + if not exists: + available = await run_sync_in_thread(list_tables, db_path) + raise HTTPException( + status_code=404, + detail=f"Table '{table_name}' not found. Available: {available}" + ) + return True diff --git a/app/utils/async_utils.py b/app/utils/async_utils.py new file mode 100644 index 0000000..0cd0d03 --- /dev/null +++ b/app/utils/async_utils.py @@ -0,0 +1,27 @@ +""" +Async utilities for standardized execution. +""" +import asyncio +from typing import TypeVar, Any, Callable + +T = TypeVar("T") + +async def run_sync_in_thread(func: Callable[..., T], *args: Any) -> T: + """ + Run a synchronous function in a separate thread. + + Handles compatibility between Python 3.9+ (asyncio.to_thread) + and older versions (loop.run_in_executor). + + Args: + func: The synchronous function to run + *args: Arguments to pass to the function + + Returns: + The result of the function call + """ + if hasattr(asyncio, 'to_thread'): + return await asyncio.to_thread(func, *args) + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, func, *args) diff --git a/app/utils/request_utils.py b/app/utils/request_utils.py new file mode 100644 index 0000000..716c614 --- /dev/null +++ b/app/utils/request_utils.py @@ -0,0 +1,101 @@ +""" +Request processing utilities for TableScanner routes. +""" + +from __future__ import annotations + +import time +import logging +from typing import Any +from pathlib import Path + +from fastapi import HTTPException +from app.services.data.query_service import get_query_service, FilterSpec +from app.utils.async_utils import run_sync_in_thread +from app.exceptions import TableNotFoundError + +logger = logging.getLogger(__name__) + +class TableRequestProcessor: + """ + Handles common logic for table data requests: + - Parameter extraction + - Database access (via helper/callback) + - Query execution via QueryService + - Response formatting + """ + + @staticmethod + async def process_data_request( + db_path: Path, + table_name: str, + limit: int, + offset: int, + sort_column: str | None = None, + sort_order: str = "ASC", + search_value: str | None = None, + columns: list[str] | None = None, + filters: dict[str, Any] | None = None, + handle_ref_or_id: str | None = None + ) -> dict[str, Any]: + """ + Process a generic table data request. + """ + start_time = time.time() + + # Prepare filters + service_filters = [] + if filters: + for col, val in filters.items(): + service_filters.append(FilterSpec(column=col, operator="like", value=val)) + + # Determine sort direction + direction = "ASC" + if sort_order and sort_order.lower() == "desc": + direction = "DESC" + + def _execute(): + query_service = get_query_service() + try: + return query_service.execute_query( + db_path=db_path, + table_name=table_name, + limit=limit, + offset=offset, + columns=columns, + sort_column=sort_column, + sort_order=direction, + search_value=search_value, + filters=service_filters, + use_cache=True + ) + except TableNotFoundError as e: + # Re-raise to be handled by caller or global handler + raise ValueError(str(e)) + + try: + result = await run_sync_in_thread(_execute) + except ValueError as e: + # Map TableNotFoundError/ValueError to 404 for this context + raise HTTPException(status_code=404, detail=str(e)) + except Exception as e: + logger.error(f"Query execution failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + response_time_ms = (time.time() - start_time) * 1000 + + # Format response + return { + "berdl_table_id": handle_ref_or_id, # Context dependent + "handle_ref": handle_ref_or_id, # Context dependent + "table_name": table_name, + "headers": result["headers"], + "data": result["data"], + "row_count": len(result["data"]), + "total_count": result["total_count"], + "filtered_count": result["total_count"], # Matches logic in routes.py + "response_time_ms": response_time_ms, + "db_query_ms": result["execution_time_ms"], + "conversion_ms": 0.0, # Deprecated metric + "sqlite_file": str(db_path) + } diff --git a/app/utils/sqlite.py b/app/utils/sqlite.py index f304265..70c26dd 100644 --- a/app/utils/sqlite.py +++ b/app/utils/sqlite.py @@ -1,9 +1,10 @@ +""" +Low-level SQLite utilities. +""" from __future__ import annotations import sqlite3 import logging -import time from pathlib import Path -from typing import Any # Configure module logger logger = logging.getLogger(__name__) @@ -12,37 +13,20 @@ def _validate_table_name(cursor, table_name: str) -> None: """ Validate that table_name corresponds to an existing table in the database. - Prevents SQL injection by ensuring table_name is a valid identifier. """ - # Parameterized query is safe from injection cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,)) if not cursor.fetchone(): - # Check for case-insensitive match or just fail raise ValueError(f"Invalid table name: {table_name}") -# ============================================================================= -# TABLE LISTING & METADATA -# ============================================================================= - def list_tables(db_path: Path) -> list[str]: """ List all user tables in a SQLite database. - - Args: - db_path: Path to the SQLite database file - - Returns: - List of table names (excludes sqlite_ system tables) - - Raises: - sqlite3.Error: If database access fails """ try: conn = sqlite3.connect(str(db_path)) cursor = conn.cursor() - # Query for user tables (exclude sqlite_ system tables) cursor.execute(""" SELECT name FROM sqlite_master WHERE type='table' @@ -52,8 +36,6 @@ def list_tables(db_path: Path) -> list[str]: tables = [row[0] for row in cursor.fetchall()] conn.close() - - logger.info(f"Found {len(tables)} tables in database: {tables}") return tables except sqlite3.Error as e: @@ -64,23 +46,14 @@ def list_tables(db_path: Path) -> list[str]: def get_table_columns(db_path: Path, table_name: str) -> list[str]: """ Get column names for a specific table. - - Args: - db_path: Path to the SQLite database file - table_name: Name of the table to query - - Returns: - List of column names """ try: conn = sqlite3.connect(str(db_path)) cursor = conn.cursor() - # Validate table name to prevent injection _validate_table_name(cursor, table_name) - # Use PRAGMA to get table info - cursor.execute(f"PRAGMA table_info({table_name})") + cursor.execute(f"PRAGMA table_info(\"{table_name}\")") columns = [row[1] for row in cursor.fetchall()] conn.close() @@ -94,13 +67,6 @@ def get_table_columns(db_path: Path, table_name: str) -> list[str]: def get_table_row_count(db_path: Path, table_name: str) -> int: """ Get the total row count for a table. - - Args: - db_path: Path to the SQLite database file - table_name: Name of the table - - Returns: - Number of rows in the table """ try: conn = sqlite3.connect(str(db_path)) @@ -108,7 +74,7 @@ def get_table_row_count(db_path: Path, table_name: str) -> int: _validate_table_name(cursor, table_name) - cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + cursor.execute(f"SELECT COUNT(*) FROM \"{table_name}\"") count = cursor.fetchone()[0] conn.close() @@ -122,303 +88,9 @@ def get_table_row_count(db_path: Path, table_name: str) -> int: def validate_table_exists(db_path: Path, table_name: str) -> bool: """ Check if a table exists in the database. - - Args: - db_path: Path to the SQLite database file - table_name: Name of the table to check - - Returns: - True if table exists, False otherwise - """ - tables = list_tables(db_path) - return table_name in tables - - -# ============================================================================= -# INDEX OPTIMIZATION -# ============================================================================= - -def ensure_indices(db_path: Path, table_name: str) -> None: - """ - Ensure indices exist for all columns in the table to optimize filtering. - - This is an optimization step - failures are logged but not raised. - - Args: - db_path: Path to the SQLite database file - table_name: Name of the table - """ - try: - conn = sqlite3.connect(str(db_path)) - cursor = conn.cursor() - - _validate_table_name(cursor, table_name) - - # Get columns - cursor.execute(f"PRAGMA table_info({table_name})") - columns = [row[1] for row in cursor.fetchall()] - - # Create index for each column - for col in columns: - index_name = f"idx_{table_name}_{col}" - # Sanitize column name for SQL safety - safe_col = col.replace('"', '""') - cursor.execute( - f'CREATE INDEX IF NOT EXISTS "{index_name}" ON "{table_name}" ("{safe_col}")' - ) - - conn.commit() - conn.close() - logger.info(f"Ensured indices for table {table_name}") - - except sqlite3.Error as e: - # Don't raise, just log warning as this is an optimization step - logger.warning(f"Error creating indices for {table_name}: {e}") - - -# ============================================================================= -# DATA RETRIEVAL - SIMPLE QUERY -# ============================================================================= - -def query_sqlite(sqlite_file: Path, query_id: str) -> dict[str, Any]: - """ - Query SQLite database by ID. Legacy compatibility function. - - Args: - sqlite_file: Path to SQLite database - query_id: Query identifier - - Returns: - Query results as dictionary - """ - return { - "stub": "SQLite query results would go here", - "query_id": query_id, - "sqlite_file": str(sqlite_file) - } - - -# ============================================================================= -# DATA RETRIEVAL - FULL FEATURED -# ============================================================================= - -def get_table_data( - sqlite_file: Path, - table_name: str, - limit: int = 100, - offset: int = 0, - sort_column: str | None = None, - sort_order: str = "ASC", - search_value: str | None = None, - query_filters: dict[str, str] | None = None, - columns: str | None = "all", - order_by: list[dict[str, str]] | None = None -) -> tuple[list[str], list[Any], int, int, float, float]: - """ - Get paginated and filtered data from a table. - - Supports two filtering APIs for flexibility: - 1. `filters`: List of FilterSpec-style dicts with column, op, value - 2. `query_filters`: Simple dict of column -> search_value (LIKE matching) - - Args: - sqlite_file: Path to SQLite database - table_name: Name of the table to query - limit: Maximum number of rows to return - offset: Number of rows to skip - sort_column: Single column to sort by (alternative to order_by) - sort_order: Sort direction 'asc' or 'desc' (with sort_column) - search_value: Global search term for all columns - query_filters: Dict of column-specific search terms - columns: Comma-separated list of columns to select - order_by: List of order specifications [{column, direction}] - - Returns: - Tuple of (headers, data, total_count, filtered_count, db_query_ms, conversion_ms) - - Raises: - sqlite3.Error: If database query fails - ValueError: If invalid operator is specified """ - start_time = time.time() - - # Initialize legacy filters to None since removed from signature - filters = None - try: - conn = sqlite3.connect(str(sqlite_file)) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - # Validate table name - _validate_table_name(cursor, table_name) - - # Get all column names first for validation - all_headers = get_table_columns(sqlite_file, table_name) - - if not all_headers: - logger.warning(f"Table {table_name} has no columns or doesn't exist") - return [], [], 0, 0, 0.0, 0.0 - - # Parse requested columns - selected_headers = all_headers - select_clause = "*" - - if columns and columns.lower() != "all": - requested = [c.strip() for c in columns.split(',') if c.strip()] - valid = [c for c in requested if c in all_headers] - if valid: - selected_headers = valid - safe_cols = [f'"{c}"' for c in selected_headers] - select_clause = ", ".join(safe_cols) - - headers = selected_headers - - # 1. Get total count (before filtering) - cursor.execute(f"SELECT COUNT(*) FROM {table_name}") - total_count = cursor.fetchone()[0] - - # 2. Build WHERE clause - conditions = [] - params = [] - - # 2a. Global Search (OR logic across all columns) - if search_value: - search_conditions = [] - term = f"%{search_value}%" - for col in headers: - search_conditions.append(f'"{col}" LIKE ?') - params.append(term) - - if search_conditions: - conditions.append(f"({' OR '.join(search_conditions)})") - - # 2b. Column Filters via query_filters dict (AND logic) - if query_filters: - for col, val in query_filters.items(): - if col in headers and val: - conditions.append(f'"{col}" LIKE ?') - params.append(f"%{val}%") - - # 2c. Structured filters via filters list (AND logic) - if filters: - allowed_ops = ["=", "!=", "<", ">", "<=", ">=", "LIKE", "IN"] - for filter_spec in filters: - column = filter_spec.get("column") - op = filter_spec.get("op", "LIKE") - value = filter_spec.get("value") - - if not column or column not in headers: - continue - - if op not in allowed_ops: - raise ValueError(f"Invalid operator: {op}") - - conditions.append(f'"{column}" {op} ?') - params.append(value) - - where_clause = "" - if conditions: - where_clause = " WHERE " + " AND ".join(conditions) - - # 3. Get filtered count - if where_clause: - cursor.execute(f"SELECT COUNT(*) FROM {table_name} {where_clause}", params) - filtered_count = cursor.fetchone()[0] - else: - filtered_count = total_count - - # 4. Build final query - query = f"SELECT {select_clause} FROM {table_name}{where_clause}" - - # Add ORDER BY clause - order_clauses = [] - - # Handle order_by list - if order_by: - for order_spec in order_by: - col = order_spec.get("column") - direction = order_spec.get("direction", "ASC").upper() - - if col and col in headers: - if direction not in ["ASC", "DESC"]: - direction = "ASC" - order_clauses.append(f'"{col}" {direction}') - - # Handle single sort_column (alternative API) - if sort_column and sort_column in headers: - direction = "DESC" if sort_order and sort_order.lower() == "desc" else "ASC" - order_clauses.append(f'"{sort_column}" {direction}') - - if order_clauses: - query += " ORDER BY " + ", ".join(order_clauses) - elif headers: - # Default sort for consistent pagination - query += f' ORDER BY "{headers[0]}" ASC' - - # Add LIMIT clause - if limit is not None: - query += f" LIMIT {int(limit)}" - - # Add OFFSET clause - if offset is not None: - query += f" OFFSET {int(offset)}" - - # Execute query with timing - query_start = time.time() - cursor.execute(query, params) - rows = cursor.fetchall() - db_query_ms = (time.time() - query_start) * 1000 - - conn.close() - - # Convert rows to string arrays with timing - conversion_start = time.time() - data = [] - for row in rows: - string_row = [ - str(value) if value is not None else "" - for value in row - ] - data.append(string_row) - conversion_ms = (time.time() - conversion_start) * 1000 - - return headers, data, total_count, filtered_count, db_query_ms, conversion_ms - - except sqlite3.Error as e: - logger.error(f"Error extracting data from {table_name}: {e}") - raise - - -# ============================================================================= -# CONVERSION (PLACEHOLDER) -# ============================================================================= - -def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None: - """ - Convert binary file to SQLite database. - - This function handles conversion of various binary formats - to SQLite for efficient querying. - - Args: - binary_file: Path to binary file - sqlite_file: Path to output SQLite file - - Raises: - NotImplementedError: Conversion logic depends on binary format - """ - # Check if file is already a SQLite database - if binary_file.suffix == '.db': - # Just copy/link the file - import shutil - shutil.copy2(binary_file, sqlite_file) - logger.info(f"Copied SQLite database to {sqlite_file}") - return - - # TODO: Implement conversion logic based on binary file format - # The BERDLTables object stores SQLite directly, so this may not be needed - raise NotImplementedError( - f"SQLite conversion not implemented for format: {binary_file.suffix}" - ) - + tables = list_tables(db_path) + return table_name in tables + except Exception: + return False diff --git a/app/utils/workspace.py b/app/utils/workspace.py index b5cf86b..4c4e0f2 100644 --- a/app/utils/workspace.py +++ b/app/utils/workspace.py @@ -12,6 +12,19 @@ if str(LIB_PATH) not in sys.path: sys.path.insert(0, str(LIB_PATH)) +# Try conditional imports at top level +try: + from kbutillib.kb_ws_utils import KBWSUtils + from kbutillib.notebook_utils import NotebookUtils + HAS_KBUTILLIB = True +except ImportError: + HAS_KBUTILLIB = False + # Define dummy classes if needed for type hinting or logic check + KBWSUtils = object + NotebookUtils = object + +from app.config import settings + # Configure module logger logger = logging.getLogger(__name__) @@ -54,8 +67,8 @@ def __init__( def _init_client(self): """Initialize the appropriate client.""" try: - from kbutillib.kb_ws_utils import KBWSUtils - from kbutillib.notebook_utils import NotebookUtils + if not HAS_KBUTILLIB: + raise ImportError("KBUtilLib not found") # Create a proper combined class cache_dir = self.cache_dir @@ -130,6 +143,15 @@ def download_blob_file(self, handle_ref: str, target_path: Path) -> Path: def _get_endpoints(self) -> dict[str, str]: """Get endpoints for current environment.""" + # If the requested env matches the configured env, use the configured URLs + if self.kb_env == settings.KB_ENV: + return { + "workspace": settings.WORKSPACE_URL, + "shock": settings.BLOBSTORE_URL, + "handle": f"{settings.KBASE_ENDPOINT}/handle_service", + } + + # Fallback for other environments endpoints = { "appdev": { "workspace": "https://appdev.kbase.us/services/ws", @@ -172,7 +194,7 @@ def _get_object_fallback(self, ref: str, ws: int | None = None) -> dict[str, Any endpoints["workspace"], json=payload, headers=headers, - timeout=60 + timeout=30 # Reduced from 60 to fail faster ) response.raise_for_status() result = response.json() @@ -185,6 +207,85 @@ def _get_object_fallback(self, ref: str, ws: int | None = None) -> dict[str, Any raise ValueError(f"No data for: {ref}") return data_list[0] + + def get_object_with_type(self, ref: str, ws: int | None = None) -> tuple[dict[str, Any], str]: + """ + Get workspace object data along with its type. + + Args: + ref: Object reference or name + ws: Workspace ID (optional if ref is full reference) + + Returns: + Tuple of (object_data, object_type) + object_type is the full KBase type string (e.g., "KBaseFBA.GenomeDataLakeTables-2.0") + """ + # Build reference + if ws and "/" not in str(ref): + ref = f"{ws}/{ref}" + + # First get the object type using get_object_info3 + object_type = self._get_object_type(ref) + + # Then get the data using standard method + obj_data = self.get_object(ref) + + return obj_data, object_type + + def _get_object_type(self, ref: str) -> str: + """ + Get the KBase object type using Workspace.get_object_info3. + + Args: + ref: Object reference + + Returns: + Object type string (e.g., "KBaseFBA.GenomeDataLakeTables-2.0") + """ + headers = { + "Authorization": self.token, + "Content-Type": "application/json" + } + + payload = { + "method": "Workspace.get_object_info3", + "params": [{"objects": [{"ref": ref}]}], + "version": "1.1", + "id": "tablescanner-type" + } + + endpoints = self._get_endpoints() + response = requests.post( + endpoints["workspace"], + json=payload, + headers=headers, + timeout=30 + ) + response.raise_for_status() + result = response.json() + + if "error" in result: + logger.warning(f"Error getting object type: {result['error']}") + return "Unknown" + + # get_object_info3 returns: {"result": [{"infos": [[objid, name, type, ...]]}]} + infos = result.get("result", [{}])[0].get("infos", []) + if infos and infos[0] and len(infos[0]) > 2: + return infos[0][2] + + return "Unknown" + + def get_object_type_only(self, ref: str) -> str: + """ + Public method to get object type without fetching full data. + + Args: + ref: Object reference + + Returns: + Object type string + """ + return self._get_object_type(ref) def _download_blob_fallback(self, handle_ref: str, target_path: str) -> str: """Download from blobstore via direct API.""" @@ -275,45 +376,25 @@ def get_berdl_table_data( return obj -def list_pangenomes_from_object( +def get_object_type( berdl_table_id: str, auth_token: str, kb_env: str = "appdev" -) -> list[dict[str, Any]]: +) -> str: """ - List all pangenomes from a BERDLTables object. + Get the KBase object type for a workspace object. Args: - berdl_table_id: KBase workspace reference + berdl_table_id: KBase workspace reference (e.g., "76990/7/2") auth_token: KBase authentication token kb_env: KBase environment Returns: - List of pangenome info dictionaries with: - - pangenome_id - - pangenome_taxonomy - - handle_ref - - user_genomes - - berdl_genomes + Object type string (e.g., "KBaseGeneDataLakes.BERDLTables-1.0") """ - obj_data = get_berdl_table_data(berdl_table_id, auth_token, kb_env) - - pangenome_data = obj_data.get("pangenome_data", []) - - pangenomes = [] - for pg in pangenome_data: - pangenomes.append({ - - "pangenome_taxonomy": pg.get("pangenome_taxonomy", ""), - "user_genomes": pg.get("user_genomes", []), - "berdl_genomes": pg.get("berdl_genomes", []), - "genome_count": len(pg.get("user_genomes", [])) + len(pg.get("berdl_genomes", [])), - "handle_ref": pg.get("sqllite_tables_handle_ref", ""), - }) - - return pangenomes - - + """ + client = KBaseClient(auth_token, kb_env) + return client.get_object_type_only(berdl_table_id) @@ -353,12 +434,16 @@ def download_pangenome_db( return db_path # Fetch object metadata to get handle reference - pangenomes = list_pangenomes_from_object(berdl_table_id, auth_token, kb_env) - if not pangenomes: - raise ValueError(f"No pangenomes found in {berdl_table_id}") + obj_data = get_berdl_table_data(berdl_table_id, auth_token, kb_env) + pangenome_data = obj_data.get("pangenome_data", []) + if not pangenome_data: + raise ValueError(f"No pangenomes found in {berdl_table_id}") + # Take the first (and only expected) pangenome's handle - handle_ref = pangenomes[0]["handle_ref"] + handle_ref = pangenome_data[0].get("sqllite_tables_handle_ref") + if not handle_ref: + raise ValueError(f"No handle reference found in {berdl_table_id}") # Create cache directory db_dir.mkdir(parents=True, exist_ok=True) diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..751dbf6 --- /dev/null +++ b/docs/API.md @@ -0,0 +1,60 @@ +# TableScanner API + +The **TableScanner** service provides read-only access to SQLite databases stored in KBase (via Workspace objects). It supports listing tables, inspecting schemas, and querying data with filtering, sorting, and pagination. + +## Base URL +- **Development**: `http://localhost:8000` +- **Production**: `https://kbase.us/services/berdl_table_scanner` (or similar) + +## Authentication +All endpoints require a KBase authentication token. +- **Header**: `Authorization: ` or `Authorization: Bearer ` + +--- + +## 1. Service Status + +### `GET /` +Basic service check. +- **Response**: `{"service": "TableScanner", "version": "1.0.0", "status": "running"}` + +### `GET /health` +Detailed health check including connection pool stats. + +--- + +## 2. Object Access +Access databases via KBase Workspace Object Reference (UPA, e.g., `76990/7/2`). + +### `GET /object/{ws_ref}/tables` +List tables for a BERDLTables object. +- **Response**: Table list with schema overviews. + +### `GET /object/{ws_ref}/tables/{table_name}/data` +Query table data. +- **Query Params**: + - `limit` (default: 100) + - `offset` (default: 0) + - `sort_column`, `sort_order` (`ASC`/`DESC`) + - `search` (Global text search) +- **Response**: Headers, data rows, total count. + +--- + +## 3. Data Access + +### `POST /table-data` +Complex query endpoint supporting advanced filtering. +- **Body**: + ```json + { + "berdl_table_id": "...", + "table_name": "Genes", + "limit": 100, + "filters": [ + {"column": "contigs", "operator": "gt", "value": 50}, + {"column": "gene_name", "operator": "like", "value": "kinase"} + ] + } + ``` +- **Supported Operators**: `eq`, `ne`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, `not_in`, `between`, `is_null`, `is_not_null`. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 8d3cc5a..80cc143 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,85 +1,74 @@ # TableScanner Architecture -TableScanner is a high-performance middleware service designed to provide fast, filtered, and paginated access to large tabular data stored in KBase. It solves the performance bottleneck of loading massive objects into memory by leveraging local SQLite caching and efficient indexing. +## Overview +TableScanner is a high-performance, read-only microservice designed to provide efficient access to tabular data stored in KBase (Workspace Objects or Blobstore Handles). It serves as a backend for the DataTables Viewer and other applications requiring filtered, paginated, and aggregated views of large datasets. ---- - -## High-Level Architecture +## System Architecture ```mermaid graph TD - User([User / API Client]) - TS[TableScanner Service] - KBaseWS[KBase Workspace] - KBaseBlob[KBase Blobstore] - LocalCache[(Local SQLite Cache)] - - User -->|API Requests| TS - TS -->|1. Resolve Metadata| KBaseWS - TS -->|2. Download Blob| KBaseBlob - TS -->|3. Store & Index| LocalCache - TS -->|4. SQL Query| LocalCache - LocalCache -->|5. Result| TS - TS -->|6. JSON Response| User + Client[Client Application] --> API[FastAPI Layer] + API --> Service[Query Service] + API --> DBHelper[DB Helper] + + subgraph Core Services + Service --> Pool[Connection Pool] + Pool --> SQLite[SQLite Cache] + Service --> FTS[FTS5 Search] + end + + subgraph Infrastructure + DBHelper --> WS[Workspace Client] + WS --> KBase[KBase Services] + WS --> Blob[Blobstore] + end ``` ---- - -## Caching Strategy: One DB per UPA - -TableScanner employs a strict **one-database-per-object** caching policy. Each KBase object reference (UPA, e.g., `76990/7/2`) is mapped to a unique local directory. - -- **Path Structure**: `{CACHE_DIR}/{sanitized_UPA}/tables.db` -- **Sanitization**: Special characters like `/`, `:`, and spaces are replaced with underscores to ensure filesystem compatibility. -- **Granularity**: Caching is performed at the object level. If multiple tables exist within a single SQLite blob, they are all cached together, improving subsequent access to related data. - ---- - -## Race Condition and Atomic Handling - -To ensure reliability in high-concurrency environments (multiple users requesting the same data simultaneously), TableScanner implements **Atomic File Operations**: - -### 1. Atomic Downloads -When a database needs to be downloaded, TableScanner does **not** download directly to the final path. -1. A unique temporary filename is generated using a UUID: `tables.db.{uuid}.tmp`. -2. The file is downloaded from the KBase Blobstore into this temporary file. -3. Once the download is successful and verified, a **filesystem-level atomic rename** (`os.rename`) is performed to move it to `tables.db`. -4. This ensures that if a process crashes or a network error occurs, the cache directory will not contain a partially-downloaded, corrupt database. - -### 2. Concurrent Request Handling -If two requests for the same UPA arrive at the same time: -- Both will check for the existence of `tables.db`. -- If it's missing, both may start a download to their own unique `temp` files. -- The first one to finish will atomically rename its temp file to `tables.db`. -- The second one to finish will also rename its file, overwriting the first. Since the content is identical (same UPA), the final state remains consistent and the database is never in a corrupt state during the swap. - ---- - -## Performance Optimization: Automatic Indexing - -TableScanner doesn't just store the data; it optimizes it. Upon the **first access** to any table: -- The service scans the table schema. -- It automatically generates a `idx_{table}_{column}` index for **every single column** in the table. -- This "Indexing on Demand" strategy ensures that even complex global searches or specific column filters remain sub-millisecond, regardless of the table size. - ---- - -## Data Lifecycle in Detail - -1. **Request**: User provides a KBase UPA and query parameters. -2. **Cache Verification**: Service checks if `{sanitized_UPA}/tables.db` exists and is valid. -3. **Metadata Resolution**: If not cached, `KBUtilLib` fetches the object from KBase to extract the Blobstore handle. -4. **Secure Download**: The blob is streamed to a temporary UUID file and then atomically renamed. -5. **Schema Check**: TableScanner verifies the requested table exists in the SQLite file. -6. **Index Check**: If it's the first time this table is being queried, indices are created for all columns. -7. **SQL Execution**: A standard SQL query with `LIMIT`, `OFFSET`, and `LIKE` filters is executed. -8. **Streaming Serialization**: Results are converted into a compact JSON list-of-lists and returned to the user. - ---- - -## Tech Stack and Key Components - -- **FastAPI**: Provides the high-performance async web layer. -- **SQLite**: The storage engine for tabular data, chosen for its zero-configuration and high performance with indices. -- **KBUtilLib**: Handles complex KBase Workspace and Blobstore interactions. -- **UUID-based Temp Storage**: Prevents race conditions during file I/O. +## Core Components + +### 1. API Layer (`app/routes.py`) +The entry point for all requests. It handles: +- **Object Access**: `/object/{ws_ref}/tables` +- **Data Queries**: `/table-data` (Advanced filtering) + +### 2. Query Service (`app/services/data/query_service.py`) +The heart of the application. It orchestrates query execution: +- **Type-Aware Filtering**: Automatically detects column types (text vs numeric) and applies correct SQL operators. +- **Advanced Aggregations**: Supports `GROUP BY`, `SUM`, `AVG`, `COUNT`, etc. +- **Full-Text Search**: Leverages SQLite FTS5 for fast global searching. +- **Result Caching**: Caches query results to minimize database I/O for repeated requests. + +### 3. Connection Pool (`app/services/data/connection_pool.py`) +Manages SQLite database connections efficiently: +- **Pooling**: Reuses connections to avoid open/close overhead. +- **Lifecycle**: Automatically closes idle connections after a timeout. +- **Optimization**: Configures PRAGMAs (WAL mode, memory mapping) for performance. + +### 4. Infrastructure Layer +- **DB Helper (`app/services/db_helper.py`)**: Resolves "Handle Refs" or "Workspace Refs" into local file paths, handling download and caching transparently. +- **Workspace Client (`app/utils/workspace.py`)**: Interacts with KBase services, falling back to direct HTTP queries if SDK clients are unavailable. + +## Data Flow + +1. **Request**: Client requests data (e.g., `GET /object/123/1/1/tables/Genes/data?limit=100`). +2. **Resolution**: `DB Helper` checks if the database for `123/1/1` is in the local cache. + - *Miss*: Downloads file from KBase Blobstore/Workspace. + - *Hit*: Returns path to local `.db` file. +3. **Connection**: `QueryService` requests a connection from `ConnectionPool`. +4. **Query Plan**: + - Checks schema for column types. + - Builds SQL query with parameterized filters. + - Ensures necessary indexes exist. +5. **Execution**: SQLite executes the query (using FTS or B-Tree indexes). +6. **Response**: Data is returned to the client as JSON. + +## Design Decisions + +- **Read-Only**: The service never modifies the source SQLite files. This simplifies concurrency control (WAL mode). +- **Synchronous I/O in Async App**: We use `run_sync_in_thread` to offload blocking SQLite operations to a thread pool, keeping the FastAPI event loop responsive. +- **Local Caching**: We aggressively cache database files locally to avoid the high latency of downloading multi-GB files from KBase for every request. + +## Security +- **Authentication**: All data access endpoints require a valid KBase Auth Token (`Authorization` header). +- **Authorization**: The service relies on KBase Services to validate if the token has access to the requested Workspace Object or Handle. +- **Input Validation**: Strict validation of table and column names prevents SQL injection. Parameterized queries are used for all values. diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 0000000..ef06c56 --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,81 @@ +# Contributing to TableScanner + +## Development Setup + +### Prerequisites +- Python 3.10+ +- KBase authentication token +- Access to KBase services (Workspace, Blobstore) + +### Quick Start +1. **Clone & Venv**: + ```bash + git clone + cd tablescanner + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + ``` + +2. **Configuration**: + Copy `.env.example` to `.env` and set `KB_SERVICE_AUTH_TOKEN`. + +3. **Run Locally**: + You can use the provided helper script: + ```bash + ./scripts/dev.sh + ``` + This script handles: + - Activating the virtual environment (`.venv`) + - Loading environment variables from `.env` + - Setting `PYTHONPATH` + - Starting the server via `fastapi dev` + + Alternatively, run manually: + ```bash + uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 + ``` + +4. **Run with Docker**: + ```bash + docker-compose up --build + ``` + +--- + +## Project Structure +- `app/`: Core application code. + - `main.py`: Entry point. + - `routes.py`: API endpoints. + - `services/`: Business logic (Data queries, schema). + - `utils/`: Helpers (SQLite, KBase Client). + - `models.py`: Pydantic data models. +- `tests/`: Test suite. +- `docs/`: Documentation. + +--- + +## Testing + +### Running Tests +We use `unittest` (compatible with `pytest`). + +```bash +# Run all tests +python -m unittest discover tests + +# Or using pytest (recommended) +pytest tests/ -v +``` + +### Writing Tests +- Place unit tests in `tests/unit/`. +- Place integration tests in `tests/integration/`. +- Use `app/services/data/query_service.py` tests as a reference for mocking SQLite. + +--- + +## Code Style +- Follow PEP 8. +- Use type hints. +- Ensure purely synchronous I/O (like `sqlite3`) is wrapped in `run_sync_in_thread`. diff --git a/docs/QUICKSTART_DEMO.md b/docs/QUICKSTART_DEMO.md deleted file mode 100644 index b06de7d..0000000 --- a/docs/QUICKSTART_DEMO.md +++ /dev/null @@ -1,50 +0,0 @@ -# Quickstart Demo - -This guide walks you through running the TableScanner demo locally. - -## Prerequisites - -- Python 3.9+ -- KBase Auth Token (for accessing workspace objects) - -## Setup - -1. **Install Dependencies** - ```bash - pip install -r requirements.txt - ``` - -2. **Start the Service** - ```bash - uv run fastapi dev app/main.py - ``` - Server will start at `http://localhost:8000`. - -## Running the Demo - -1. Open the [Viewer](http://localhost:8000/static/viewer.html) in your browser. - -2. **Configuration:** - - **Environment**: Select `AppDev` (or appropriate env). - - **Auth Token**: Enter your KBase token. - -3. **Load Data:** - - **BERDL Table ID**: Enter `76990/ADP1Test`. - - Click the **Search** icon. - -4. **Explore:** - - Since `76990/ADP1Test` contains only one pangenome, it will be **auto-selected**. - - Tables will load automatically. - - Select a table (e.g., "Genome attributes") to view data. - - Hover over cells with IDs (UniProt, KEGG, etc.) to see tooltips. - - Click IDs to visit external databases. - -## Multi-Pangenome Demo - -To test loading multiple identifiers: - -1. **BERDL Table ID**: Enter `76990/ADP1Test, 76990/ADP1Test` (simulating two sources). -2. Click **Search**. -3. The **Pangenome** dropdown will appear. -4. Options will show as: `ADP1 [76990/ADP1Test]`. -5. Select different options to toggle between datasets (if they were different). diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..5eee708 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,31 @@ +# TableScanner + +**TableScanner** is a high-performance, read-only API service for querying SQLite databases stored in [KBase](https://kbase.us). It powers the DataTables Viewer and other applications requiring fast access to tabular data. + +## Documentation + +- **[API Reference](API.md)**: Endpoints, authentication, and usage examples. +- **[Architecture](ARCHITECTURE.md)**: System design and technical overview. +- **[Contributing Guide](CONTRIBUTING.md)**: Setup, testing, and development standards. + +## Quick Start + +### Run with Docker +```bash +docker-compose up --build +``` +The API will be available at `http://localhost:8000`. + +### Run Locally +```bash +# 1. Setup environment +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +cp .env.example .env # Edit with your KBase Token + +# 2. Run using helper script +./scripts/dev.sh +``` + +The `./scripts/dev.sh` script is the recommended way to run locally as it handles environment loading and PYTHONPATH setup automatically. diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md deleted file mode 100644 index 6cb87b4..0000000 --- a/docs/USAGE_GUIDE.md +++ /dev/null @@ -1,107 +0,0 @@ -# Usage Guide - -This guide covers production usage of the TableScanner service. - -## API Endpoint -The service is deployed at: -``` -https://appdev.kbase.us/services/berdl_table_scanner -``` - -## Authentication -All requests require a valid KBase authentication token passed in the `Authorization` header. - -```bash -Authorization: -``` - ---- - -## 1. Using the Hierarchical REST API (Browser-friendly) - -This style uses hierarchical paths and standard GET requests. It is ideal for web applications or simple data navigation. - -### List Available Tables -Get a list of all tables found in a KBase object. - -**Endpoint:** `GET /object/{upa}/tables` - -**Example:** -```bash -curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/object/76990/7/2/tables" -``` - -### Query Table Data -Retrieve paginated data from a specific table. - -**Endpoint:** `GET /object/{upa}/tables/{table_name}/data` - -**Parameters:** -- `limit`: (int) Maximum rows (default 100) -- `offset`: (int) Skip rows (default 0) -- `search`: (string) Global search term -- `sort_column`: (string) Column to sort by -- `sort_order`: (string) "ASC" or "DESC" - -**Example:** -```bash -curl -H "Authorization: $KB_TOKEN" \ - "https://appdev.kbase.us/services/berdl_table_scanner/object/76990/7/2/tables/Genes/data?limit=5" -``` - ---- - -## 2. Using the Flat POST API (Script-friendly) - -The Flat POST API is recommended for Python scripts and programmatic access. It allows sending complex query parameters in a single JSON body. - -**Endpoint:** `POST /table-data` - -### Implementation Example (Python) - -```python -import requests -import json - -url = "https://appdev.kbase.us/services/berdl_table_scanner/table-data" -headers = {"Authorization": "YOUR_KBASE_TOKEN"} - -payload = { - "berdl_table_id": "76990/7/2", - "table_name": "Metadata_Conditions", - "limit": 50, - "offset": 0, - "search_value": "glucose", - "col_filter": { - "organism": "E. coli" - }, - "sort_column": "yield", - "sort_order": "DESC" -} - -response = requests.post(url, json=payload, headers=headers) -data = response.json() - -print(f"Retrieved {len(data['data'])} rows.") -``` - ---- - -## Pro Tips - -### Multi-Source Search -The metadata endpoints support comma-separated IDs to aggregate pangenomes across multiple objects. - -```bash -GET /pangenomes?berdl_table_id=76990/7/2,76990/8/1 -``` - -### Performance -The first request for a large dataset may take a few seconds as the service downloads and indexes the database. Subsequent requests will be near-instant. - ---- - -## Web Viewer -Access the interactive viewer at: -`https://appdev.kbase.us/services/berdl_table_scanner/static/viewer.html` # TODO: implement this diff --git a/scripts/api_client.py b/scripts/api_client.py deleted file mode 100644 index 4143abc..0000000 --- a/scripts/api_client.py +++ /dev/null @@ -1,86 +0,0 @@ -import requests -import json -import os - -# Set your KBase authentication token -TOKEN = os.environ.get("KBASE_TOKEN") -if not TOKEN: - raise RuntimeError("KBASE_TOKEN environment variable is not set.") -HEADERS = {"Authorization": TOKEN} -BASE_URL = "http://127.0.0.1:8000" - -# ---------------------------------------------------------- -# STYLE 1: HIERARCHICAL REST (GET) -# Ideal for simple navigation and web viewers -# ---------------------------------------------------------- - -print("\n--- REST: List Tables ---") -# Literal path: /object/{upa}/tables -res = requests.get(f"{BASE_URL}/object/76990/7/2/tables", headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["tables"][:3], indent=2)) - - - -print("\n--- REST: Get Top 3 Genes ---") -# Literal path: /object/{upa}/tables/{table_name}/data -res = requests.get(f"{BASE_URL}/object/76990/7/2/tables/Genes/data", params={"limit": 3}, headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["data"], indent=2)) - - - -print("\n--- REST: Filtered Search (kinase) ---") -# Literal path with query parameters -params = {"limit": 3, "search": "kinase"} -res = requests.get(f"{BASE_URL}/object/76990/7/2/tables/Genes/data", params=params, headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["data"], indent=2)) - - -# ---------------------------------------------------------- -# STYLE 2: FLAT POST -# Ideal for complex queries and production scripts -# ---------------------------------------------------------- - -print("\n--- POST: Basic Fetch (3 rows) ---") -# Single endpoint for all data: /table-data -payload = { - "berdl_table_id": "76990/7/2", - "table_name": "Conditions", - "limit": 3 -} -res = requests.post(f"{BASE_URL}/table-data", json=payload, headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["data"], indent=2)) - - - -print("\n--- POST: Column-Specific Filter (Carbon_source=pyruvate) ---") -# Precise AND-logic filtering via col_filter -payload = { - "berdl_table_id": "76990/7/2", - "table_name": "Conditions", - "limit": 3, - "col_filter": {"Carbon_source": "pyruvate"} -} -res = requests.post(f"{BASE_URL}/table-data", json=payload, headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["data"], indent=2)) - - - -print("\n--- POST: Sorted Multi-column Query ---") -# Support for complex ordering -payload = { - "berdl_table_id": "76990/7/2", - "table_name": "Genes", - "limit": 3, - "order_by": [ - {"column": "Length", "direction": "DESC"}, - {"column": "ID", "direction": "ASC"} - ] -} -res = requests.post(f"{BASE_URL}/table-data", json=payload, headers=HEADERS) -res.raise_for_status() -print(json.dumps(res.json()["data"], indent=2)) diff --git a/static/viewer.html b/static/viewer.html index 463bb62..8732990 100644 --- a/static/viewer.html +++ b/static/viewer.html @@ -1,962 +1,1310 @@ - + - TableScanner - BERDL Table Viewer + TableScanner - Research Data Explorer - - - + - -
-
-
-

+
+ +

-
- + TableScanner
+ v2.0
-
-
- -
- -
-
Connection
-
-
- -
- - + +
+ +
-
-
-
+
+
+
diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..3a8be3e --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests package for TableScanner diff --git a/tests/integration/test_routes.py b/tests/integration/test_routes.py new file mode 100644 index 0000000..13c03f9 --- /dev/null +++ b/tests/integration/test_routes.py @@ -0,0 +1,36 @@ +import unittest +from fastapi.testclient import TestClient +from app.main import app + +class TestRoutes(unittest.TestCase): + def setUp(self): + self.client = TestClient(app) + + def test_health_check(self): + response = self.client.get("/health") + # 500/503 is NOT acceptable. Integration tests must ensure the application can start. + # The ConnectionPool does not require external connectivity to initialize. + self.assertEqual(response.status_code, 200) + + def test_api_docs_accessible(self): + response = self.client.get("/docs") + self.assertEqual(response.status_code, 200) + + def test_openapi_schema_structure(self): + response = self.client.get("/openapi.json") + self.assertEqual(response.status_code, 200) + data = response.json() + self.assertIn("paths", data) + # Verify Key Endpoints exist + self.assertIn("/object/{ws_ref}/tables", data["paths"]) + self.assertIn("/table-data", data["paths"]) + + # Verify Deprecated Endpoints are GONE + self.assertNotIn("/handle/{handle_ref}/tables", data["paths"]) + self.assertNotIn("/pangenomes", data["paths"]) + self.assertNotIn("/tables", data["paths"]) + self.assertNotIn("/config/providers", data["paths"]) + self.assertNotIn("/config/resolve", data["paths"]) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/test_query_service.py b/tests/unit/test_query_service.py new file mode 100644 index 0000000..c57b719 --- /dev/null +++ b/tests/unit/test_query_service.py @@ -0,0 +1,107 @@ +import unittest +import sqlite3 +import tempfile +import shutil +import logging +from pathlib import Path +from app.services.data.query_service import QueryService, FilterSpec, AggregationSpec +from app.exceptions import TableNotFoundError + +# Configure logging +logging.basicConfig(level=logging.ERROR) + +class TestQueryService(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + self.db_path = Path(self.temp_dir) / "test.db" + self.service = QueryService() + + # Create a test database + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + cursor.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER, salary REAL, status TEXT)") + data = [ + (1, "Alice", 30, 50000.0, "active"), + (2, "Bob", 25, 45000.5, "inactive"), + (3, "Charlie", 35, 70000.0, "active"), + (4, "David", 30, 52000.0, "active"), + (5, "Eve", 28, 49000.0, "inactive"), + ] + cursor.executemany("INSERT INTO users VALUES (?, ?, ?, ?, ?)", data) + conn.commit() + conn.close() + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_simple_select(self): + result = self.service.execute_query(self.db_path, "users", limit=10) + self.assertEqual(len(result["data"]), 5) + self.assertEqual(result["total_count"], 5) + self.assertEqual(result["headers"], ["id", "name", "age", "salary", "status"]) + + def test_filter_numeric(self): + filters = [FilterSpec(column="age", operator="gt", value=28)] + result = self.service.execute_query(self.db_path, "users", filters=filters) + # Should be Alice(30), Charlie(35), David(30) + self.assertEqual(len(result["data"]), 3) + self.assertEqual(result["total_count"], 3) + + def test_filter_text(self): + filters = [FilterSpec(column="status", operator="eq", value="active")] + result = self.service.execute_query(self.db_path, "users", filters=filters) + self.assertEqual(len(result["data"]), 3) + + def test_sorting(self): + # Sort by age DESC + result = self.service.execute_query(self.db_path, "users", sort_column="age", sort_order="DESC") + data = result["data"] + # Charlie(35) first + self.assertEqual(data[0][1], "Charlie") + # Bob(25) last + self.assertEqual(data[4][1], "Bob") + + def test_aggregation(self): + aggs = [ + AggregationSpec(column="salary", function="avg", alias="avg_salary"), + AggregationSpec(column="status", function="count", alias="count") + ] + result = self.service.execute_query( + self.db_path, "users", + aggregations=aggs, + group_by=["status"], + sort_column="status" + ) + + self.assertEqual(len(result["data"]), 2) + row_active = next(r for r in result["data"] if r[0] == "active") + + # Active: Alice(50k), Charlie(70k), David(52k) -> Avg 57333.33 + self.assertAlmostEqual(float(row_active[1]), 57333.33, delta=0.1) + self.assertEqual(int(row_active[2]), 3) + + def test_sql_injection_sort_ignored(self): + """Ensure sort column injection attacks are ignored (fallback to default).""" + bad_col = "age; DROP TABLE users; --" + result = self.service.execute_query(self.db_path, "users", sort_column=bad_col) + self.assertEqual(len(result["data"]), 5) + + # Verify table still exists + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + cursor.execute("SELECT count(*) FROM users") + self.assertEqual(cursor.fetchone()[0], 5) + conn.close() + + def test_sql_injection_filter_safe(self): + """Ensure filter value injection is handled safely as literal string.""" + filters = [FilterSpec(column="name", operator="eq", value="Alice' OR '1'='1")] + result = self.service.execute_query(self.db_path, "users", filters=filters) + self.assertEqual(len(result["data"]), 0) + + def test_missing_table(self): + with self.assertRaises(TableNotFoundError): + self.service.execute_query(self.db_path, "non_existent_table") + +if __name__ == "__main__": + unittest.main()