Enter the BERDLTable ID and click Search.
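For orientation, here is a minimal sketch of the HTTP calls behind that step, assuming the service is running locally at `http://localhost:8000` with an empty `ROOT_PATH`. The endpoint names, parameters, response fields, and the `76990/ADP1Test` test object are taken from the routes, models, and `.env.example` in the diff below; the base URL and token value are placeholders.

```python
import requests

BASE = "http://localhost:8000"                         # assumed local dev URL (ROOT_PATH empty)
HEADERS = {"Authorization": "YOUR_KBASE_TOKEN_HERE"}   # KBase auth token (placeholder)

# 1. List pangenomes in a BERDLTables object (test object from .env.example)
pg = requests.get(
    f"{BASE}/pangenomes",
    params={"berdl_table_id": "76990/ADP1Test", "kb_env": "appdev"},
    headers=HEADERS,
).json()
print(pg["pangenome_count"], pg.get("auto_selected"))

# 2. Query rows from one table (e.g. "Genes") with pagination via /table-data
rows = requests.post(
    f"{BASE}/table-data",
    json={"berdl_table_id": "76990/ADP1Test", "table_name": "Genes", "limit": 10},
    headers=HEADERS,
).json()
print(rows["headers"], rows["row_count"])
```

The same data is also reachable through the REST-style routes added below (`/object/{ws_id}/{obj_name}/pangenomes/.../tables/{table}/data`); the `/pangenomes` + `/table-data` pair shown here is the legacy-compatible path the viewer uses.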
+diff --git a/.env.example b/.env.example index a5d5dc7..c90c0f0 100644 --- a/.env.example +++ b/.env.example @@ -1,15 +1,48 @@ # TableScanner Environment Variables # Copy this file to .env and fill in your actual values +# ============================================================================= +# AUTHENTICATION +# ============================================================================= # KBase Service Authentication Token -KB_SERVICE_AUTH_TOKEN=your_kbase_token_here +# For development testing, use your personal token from KBase +KB_SERVICE_AUTH_TOKEN=YOUR_KBASE_TOKEN_HERE -# Cache directory for storing downloaded files and SQLite databases +# ============================================================================= +# CACHE SETTINGS +# ============================================================================= +# Cache directory for storing downloaded SQLite databases CACHE_DIR=/tmp/tablescanner_cache +# Maximum age of cached files in hours (default: 24) +CACHE_MAX_AGE_HOURS=24 + +# ============================================================================= +# KBASE SERVICE URLS +# ============================================================================= # KBase Workspace Service URL +WORKSPACE_URL=https://appdev.kbase.us/services/ws + +# Base URL for KBase services +KBASE_ENDPOINT=https://appdev.kbase.us/services + +# KBase Blobstore/Shock service URL +BLOBSTORE_URL=https://appdev.kbase.us/services/shock-api + +# ============================================================================= +# APPLICATION SETTINGS +# ============================================================================= +# Enable debug mode (true/false) +DEBUG=false + +# ============================================================================= +# TEST DATA (AppDev) +# ============================================================================= +# Test BERDLTable object: 76990/ADP1Test +# Test pangenome: GCF_000368685.1 +# Narrative: https://appdev.kbase.us/narrative/76990 WORKSPACE_URL=https://kbase.us/services/ws # Root path for proxy deployment (e.g., "/services/berdl_table_scanner") # Leave empty if running at root path (i.e., "/") for local dev -ROOT_PATH=/services/berdl_table_scanner \ No newline at end of file +ROOT_PATH=/services/berdl_table_scanner diff --git a/.gitignore b/.gitignore index 18db92a..172f46b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +trash/ + .DS_Store .idea test/test.cfg @@ -20,3 +22,9 @@ venv/ # Environment variables .env + +# External libraries (cloned) +lib/ + +# Cache directory +cache/ diff --git a/app/config.py b/app/config.py index 00b1743..37fb984 100644 --- a/app/config.py +++ b/app/config.py @@ -1,18 +1,64 @@ """ Configuration settings for TableScanner application. + +Loads configuration from environment variables and .env file. +All KBase service URLs and authentication settings are managed here. """ from pydantic_settings import BaseSettings +from pydantic import Field class Settings(BaseSettings): - """Application settings.""" + """ + Application settings loaded from environment variables. + + Create a .env file based on .env.example to configure locally. 
+ """ + + # ========================================================================== + # AUTHENTICATION + # ========================================================================== + KB_SERVICE_AUTH_TOKEN: str = Field( + ..., + description="KBase authentication token for API access" + ) + + # ========================================================================== + # CACHE SETTINGS + # ========================================================================== + CACHE_DIR: str = Field( + default="/tmp/tablescanner_cache", + description="Directory for caching downloaded files and SQLite databases" + ) + CACHE_MAX_AGE_HOURS: int = Field( + default=24, + description="Maximum age of cached files in hours before re-download" + ) - KB_SERVICE_AUTH_TOKEN: str - CACHE_DIR: str + # ========================================================================== + # KBASE SERVICE URLS + # ========================================================================== + WORKSPACE_URL: str = Field( + default="https://kbase.us/services/ws", + description="KBase Workspace service URL" + ) + KBASE_ENDPOINT: str = Field( + default="https://kbase.us/services", + description="Base URL for KBase services" + ) + BLOBSTORE_URL: str = Field( + default="https://kbase.us/services/shock-api", + description="KBase blobstore/shock service URL" + ) - # KBase Workspace settings - WORKSPACE_URL: str + # ========================================================================== + # APPLICATION SETTINGS + # ========================================================================== + DEBUG: bool = Field( + default=False, + description="Enable debug mode with verbose logging" + ) # Root path for proxy deployment (e.g., "/services/berdl_table_scanner") ROOT_PATH: str = "" @@ -23,5 +69,5 @@ class Config: case_sensitive = True -# Global settings instance +# Global settings instance - loaded at module import settings = Settings() \ No newline at end of file diff --git a/app/main.py b/app/main.py index 5f86a7f..b816777 100644 --- a/app/main.py +++ b/app/main.py @@ -1,10 +1,17 @@ """ TableScanner FastAPI Application -Main application factory module. +Main application factory module for the TableScanner service. +Provides REST API endpoints for querying BERDL table data. + +Run with: uv run fastapi dev app/main.py """ +from pathlib import Path from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware + from app.routes import router from app.config import settings @@ -13,22 +20,55 @@ def create_app() -> FastAPI: """ Application factory function. + Creates and configures the FastAPI application with: + - Static file serving for viewer.html + - API routes + Returns: FastAPI: Configured FastAPI application instance """ + # Configure root_path for KBase dynamic services + # KBase services are often deployed at /services/service_name + # Pydantic Settings management or manual environ check can handle this. + import os + root_path = os.environ.get("KB_SERVICE_ROOT_PATH", "") + + + description = """ + ## TableScanner API + + A FastAPI service for querying BERDL table data from KBase. + + ### Features + - List pangenomes from BERDLTables objects + - List tables within a pangenome + - Query table data with filtering, sorting, and pagination + - Local caching for performance + + ### Authentication + Pass your KBase auth token in the `Authorization` header. 
+ """ + app = FastAPI( title="TableScanner", - description="API for table scanning operations", + root_path=root_path, + description=description, version="1.0.0", - root_path=settings.ROOT_PATH + docs_url="/docs", + redoc_url="/redoc", ) # Store settings in app state for access throughout the application app.state.settings = settings - # Include routes + # Include API routes app.include_router(router) + # Mount static files directory for viewer.html + static_dir = Path(__file__).parent.parent / "static" + if static_dir.exists(): + app.mount("/static", StaticFiles(directory=static_dir), name="static") + return app diff --git a/app/models.py b/app/models.py index 2aeb79e..ae78965 100644 --- a/app/models.py +++ b/app/models.py @@ -1,15 +1,299 @@ """ -Pydantic models for request/response schemas. +Pydantic models for TableScanner API. + +Defines strictly typed request/response schemas for clean /docs output. +All models use Field with descriptions and examples for documentation. """ -from typing import Optional, List, Dict, Any -from pydantic import BaseModel +from typing import List, Dict, Optional, Any, Literal +from pydantic import BaseModel, Field + + +# ============================================================================= +# REQUEST MODELS +# ============================================================================= + +class OrderSpec(BaseModel): + """Specification for ordering/sorting query results.""" + column: str = Field(..., description="Column name to sort by") + order: Literal["ASC", "DESC"] = Field( + "ASC", + description="Sort direction: ASC (ascending) or DESC (descending)" + ) + + +class FilterSpec(BaseModel): + """Specification for column-specific filtering.""" + column: str = Field(..., description="Column name to filter") + value: str = Field(..., description="Filter value (uses LIKE matching)") + operator: Literal["LIKE", "=", ">", "<", ">=", "<="] = Field( + "LIKE", + description="Filter operator" + ) class SearchRequest(BaseModel): - """Search request with query parameters.""" - pangenome_id: str - table_name: str - limit: Optional[int] = None - order_by: Optional[List[Dict[str, str]]] = None - filters: Optional[List[Dict[str, Any]]] = None \ No newline at end of file + """ + Request model for /search endpoint. + + Provides a flexible interface for searching table data with + optional filtering, sorting, and pagination. + """ + berdl_table_id: str = Field( + ..., + description="BERDLTables object reference (e.g., '76990/ADPITest')", + examples=["76990/ADPITest"] + ) + pangenome_id: Optional[str] = Field( + None, + description="Pangenome ID within the BERDLTables object. Uses first available if not specified." 
+ ) + table_name: str = Field( + ..., + description="Name of the table to query", + examples=["Genes", "Organisms"] + ) + limit: int = Field( + 100, + ge=1, + le=500000, + description="Maximum number of rows to return" + ) + offset: int = Field( + 0, + ge=0, + description="Number of rows to skip (for pagination)" + ) + search_value: Optional[str] = Field( + None, + description="Global search term (searches all columns)" + ) + order_by: Optional[List[Dict[str, str]]] = Field( + None, + description="List of {column, order} dicts for sorting", + examples=[[{"column": "gene_name", "order": "ASC"}]] + ) + filters: Optional[List[Dict[str, str]]] = Field( + None, + description="List of column filters [{column, value}]" + ) + kb_env: str = Field( + "appdev", + description="KBase environment: appdev, ci, or prod" + ) + + +class TableDataRequest(BaseModel): + """ + Request model for /table-data endpoint. + + Mirrors the parameters from the original BERDLTable_conversion_service + for API compatibility. + """ + berdl_table_id: str = Field( + ..., + description="BERDLTables object reference", + examples=["76990/ADPITest"] + ) + columns: Optional[str] = Field( + "all", + description="Comma-separated list of columns to select or 'all'", + examples=["gene_id, gene_name"] + ) + col_filter: Optional[Dict[str, str]] = Field( + None, + description="Column-specific filters (alias for query_filters)", + examples=[{"gene_name": "kinase"}] + ) + table_name: str = Field( + ..., + description="Table name within the SQLite database", + examples=["Genes"] + ) + limit: int = Field( + 100, + ge=1, + le=500000, + description="Maximum rows to return" + ) + offset: int = Field( + 0, + ge=0, + description="Offset for pagination" + ) + sort_column: Optional[str] = Field( + None, + description="Column to sort by" + ) + sort_order: Optional[Literal["ASC", "DESC"]] = Field( + "ASC", + description="Sort direction" + ) + order_by: Optional[List[Dict[str, str]]] = Field( + None, + description="Multi-column sort specifications [{'column': 'col_name', 'direction': 'asc'}]", + examples=[[{"column": "gene_name", "direction": "asc"}, {"column": "score", "direction": "desc"}]] + ) + search_value: Optional[str] = Field( + None, + description="Global search term" + ) + query_filters: Optional[Dict[str, str]] = Field( + None, + description="Column-specific filters {column_name: filter_value}", + examples=[{"gene_name": "kinase", "organism": "E. 
coli"}] + ) + pangenome_id: Optional[str] = Field( + None, + description="Specific pangenome ID (optimizes cache lookup)", + examples=["pg_123"] + ) + kb_env: str = Field( + "appdev", + description="KBase environment" + ) + + +# ============================================================================= +# RESPONSE MODELS +# ============================================================================= + +class TableColumn(BaseModel): + """Information about a table column.""" + name: str = Field(..., description="Column name") + type: Optional[str] = Field(None, description="Column data type") + + +class TableInfo(BaseModel): + """Information about a database table.""" + name: str = Field(..., description="Table name") + row_count: Optional[int] = Field(None, description="Number of rows") + column_count: Optional[int] = Field(None, description="Number of columns") + + +class TableListResponse(BaseModel): + """Response for listing tables in a pangenome database.""" + pangenome_id: str = Field(..., description="Pangenome identifier") + tables: List[TableInfo] = Field( + default_factory=list, + description="List of available tables" + ) + + +class PangenomeInfo(BaseModel): + """Information about a pangenome found in the SQLite file.""" + pangenome_id: str = Field(..., description="ID of the pangenome") + pangenome_taxonomy: Optional[str] = Field(None, description="Taxonomy of the pangenome") + genome_count: int = Field(..., description="Number of genomes in the pangenome") + source_berdl_id: str = Field(..., description="Source BERDL Table ID") + user_genomes: List[str] = Field( + default_factory=list, + description="List of user-provided genome references" + ) + berdl_genomes: List[str] = Field( + default_factory=list, + description="List of BERDL/Datalake genome identifiers" + ) + handle_ref: Optional[str] = Field( + None, + description="Blobstore handle reference for SQLite database" + ) + + +class PangenomesResponse(BaseModel): + """Response for listing pangenomes from a BERDLTables object.""" + pangenomes: List[PangenomeInfo] = Field( + default_factory=list, + description="List of available pangenomes" + ) + pangenome_count: int = Field( + 0, + description="Total number of pangenomes" + ) + auto_selected: Optional[str] = Field( + None, + description="Auto-selected pangenome ID when only one exists" + ) + + +class TableDataResponse(BaseModel): + """ + Response for table data queries. + + Includes the data, metadata, and performance metrics. 
+ """ + headers: List[str] = Field( + ..., + description="Column names in order" + ) + data: List[List[str]] = Field( + ..., + description="Row data as list of lists" + ) + row_count: int = Field( + ..., + description="Number of rows in this response" + ) + total_count: int = Field( + ..., + description="Total rows in table (before filtering)" + ) + filtered_count: int = Field( + ..., + description="Rows matching filter criteria" + ) + table_name: str = Field( + ..., + description="Name of the queried table" + ) + pangenome_id: str = Field( + ..., + description="Pangenome identifier" + ) + response_time_ms: float = Field( + ..., + description="Total response time in milliseconds" + ) + db_query_ms: Optional[float] = Field( + None, + description="Database query time in milliseconds" + ) + conversion_ms: Optional[float] = Field( + None, + description="Data conversion time in milliseconds" + ) + source: Optional[str] = Field( + None, + description="Data source (Cache or Downloaded)" + ) + cache_file: Optional[str] = Field( + None, + description="Path to cached file" + ) + sqlite_file: Optional[str] = Field( + None, + description="Path to SQLite database" + ) + + +class CacheResponse(BaseModel): + """Response for cache operations.""" + status: Literal["success", "error"] = Field( + ..., + description="Operation status" + ) + message: str = Field( + ..., + description="Status message" + ) + + +class ServiceStatus(BaseModel): + """Service health check response.""" + service: str = Field(..., description="Service name") + version: str = Field(..., description="Service version") + status: Literal["running", "degraded", "error"] = Field( + ..., + description="Service status" + ) + cache_dir: str = Field(..., description="Cache directory path") \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index c05a16b..2a778bf 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,92 +1,634 @@ """ -TableScanner API Routes +TableScanner API Routes. -Contains all API endpoint definitions. +REST API Structure (per architecture diagram): +- GET /{handle_ref}/tables - List tables in SQLite from handle +- GET /{handle_ref}/tables/{table}/schema - Get table schema +- GET /{handle_ref}/tables/{table}/data - Query table data with pagination +- GET /object/{ws_ref}/tables - List tables from KBase object +- GET /object/{ws_ref}/tables/{table}/data - Query via KBase object ref + +Also supports legacy endpoints for backwards compatibility. 
""" +import time +import logging from pathlib import Path -from fastapi import APIRouter, Request, HTTPException +from typing import Optional + +from fastapi import APIRouter, HTTPException, Header, Query +from fastapi.responses import JSONResponse -from app.models import SearchRequest -from app.utils.workspace import get_object_info -from app.utils.download import download_from_handle -from app.utils.cache import get_cache_paths, save_to_cache, is_cached -from app.utils.sqlite import convert_to_sqlite +from app.models import ( + SearchRequest, + TableDataRequest, + TableDataResponse, + PangenomesResponse, + PangenomeInfo, + TableListResponse, + TableInfo, + CacheResponse, + ServiceStatus, +) +from app.utils.workspace import ( + get_berdl_table_data, + list_pangenomes_from_object, + download_pangenome_db, +) +from app.utils.sqlite import ( + list_tables, + get_table_data, + get_table_columns, + get_table_row_count, + validate_table_exists, + ensure_indices, +) +from app.utils.cache import ( + is_cached, + get_cache_paths, + clear_cache, + list_cached_items, + cleanup_old_caches, +) +from app.config import settings +# Configure module logger +logger = logging.getLogger(__name__) + +# Create router router = APIRouter() -@router.get("/") -async def root(request: Request): - """Root endpoint returning service information.""" - settings = request.app.state.settings - return { - "service": "TableScanner", - "version": "1.0.0", - "status": "running", - "cache_dir": settings.CACHE_DIR - } +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +def get_auth_token(authorization: Optional[str] = None) -> str: + """Extract auth token from header or settings.""" + if authorization: + if authorization.startswith("Bearer "): + return authorization[7:] + return authorization + + if settings.KB_SERVICE_AUTH_TOKEN: + return settings.KB_SERVICE_AUTH_TOKEN + + raise HTTPException( + status_code=401, + detail="Authorization token required" + ) + + +def get_cache_dir() -> Path: + """Get configured cache directory.""" + return Path(settings.CACHE_DIR) + + +# ============================================================================= +# SERVICE STATUS +# ============================================================================= + +@router.get("/", response_model=ServiceStatus) +async def root(): + """Service health check.""" + return ServiceStatus( + service="TableScanner", + version="1.0.0", + status="running", + cache_dir=str(settings.CACHE_DIR) + ) + + +# ============================================================================= +# HANDLE-BASED ENDPOINTS (Primary REST API per diagram) +# /{handle_ref}/tables - List tables +# /{handle_ref}/tables/{table}/schema - Table schema +# /{handle_ref}/tables/{table}/data - Table data with pagination +# ============================================================================= +@router.get("/handle/{handle_ref}/tables") +async def list_tables_by_handle( + handle_ref: str, + kb_env: str = Query("appdev", description="KBase environment"), + authorization: Optional[str] = Header(None) +): + """ + List all tables in a SQLite database accessed via handle reference. + + The handle_ref is the KBase blobstore handle (e.g., KBH_248028). 
+ """ + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + # Download SQLite from handle + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + # Cache path based on handle + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + # List tables + table_names = list_tables(db_path) + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append({ + "name": name, + "row_count": row_count, + "column_count": len(columns) + }) + except Exception as e: + logger.warning(f"Error getting info for {name}: {e}") + tables.append({"name": name}) + + return { + "handle_ref": handle_ref, + "tables": tables, + "db_path": str(db_path) + } + + except Exception as e: + logger.error(f"Error listing tables from handle: {e}") + raise HTTPException(status_code=500, detail=str(e)) -@router.post("/search") -def search(request: Request, search_request: SearchRequest): + +@router.get("/handle/{handle_ref}/tables/{table_name}/schema") +async def get_table_schema_by_handle( + handle_ref: str, + table_name: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + Get schema (columns) for a table accessed via handle reference. """ - Search endpoint with flexible querying. + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") + + columns = get_table_columns(db_path, table_name) + row_count = get_table_row_count(db_path, table_name) + + return { + "handle_ref": handle_ref, + "table_name": table_name, + "columns": columns, + "row_count": row_count + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error getting schema: {e}") + raise HTTPException(status_code=500, detail=str(e)) - Args: - search_request: Search parameters including pangenome_id, table_name, limit, order_by, filters +@router.get("/handle/{handle_ref}/tables/{table_name}/data") +async def get_table_data_by_handle( + handle_ref: str, + table_name: str, + limit: int = Query(100, ge=1, le=500000), + offset: int = Query(0, ge=0), + sort_column: Optional[str] = Query(None), + sort_order: Optional[str] = Query("ASC"), + search: Optional[str] = Query(None, description="Global search term"), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + Query table data from SQLite via handle reference. 
+ + Supports: + - Pagination: limit, offset + - Sorting: sort_column, sort_order + - Search: global search across all columns + """ + start_time = time.time() + + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") + + # Query data + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=table_name, + limit=limit, + offset=offset, + sort_column=sort_column, + sort_order=sort_order, + search_value=search, + ) + + response_time_ms = (time.time() - start_time) * 1000 + + return { + "handle_ref": handle_ref, + "table_name": table_name, + "headers": headers, + "data": data, + "row_count": len(data), + "total_count": total_count, + "filtered_count": filtered_count, + "response_time_ms": response_time_ms, + "db_query_ms": db_query_ms + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error querying data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# OBJECT-BASED ENDPOINTS (via KBase workspace object reference) +# /object/{ws_ref}/pangenomes - List pangenomes from BERDLTables object +# /object/{ws_ref}/pangenomes/{pg_id}/tables - List tables for a pangenome +# /object/{ws_ref}/pangenomes/{pg_id}/tables/{table}/data - Query data +# ============================================================================= + +@router.get("/object/{ws_id}/{obj_name}/pangenomes") +async def list_pangenomes_by_object( + ws_id: str, + obj_name: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + List pangenomes from a BERDLTables/GenomeDataLakeTables object. + """ + try: + token = get_auth_token(authorization) + berdl_table_id = f"{ws_id}/{obj_name}" + + pangenomes = list_pangenomes_from_object( + berdl_table_id=berdl_table_id, + auth_token=token, + kb_env=kb_env + ) + + return { + "berdl_table_id": berdl_table_id, + "pangenomes": pangenomes + } + + except Exception as e: + logger.error(f"Error listing pangenomes: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/object/{ws_id}/{obj_name}/pangenomes/{pangenome_id}/tables") +async def list_tables_by_object( + ws_id: str, + obj_name: str, + pangenome_id: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + List tables for a specific pangenome within a BERDLTables object. 
+ """ + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + berdl_table_id = f"{ws_id}/{obj_name}" + + db_path = download_pangenome_db( + berdl_table_id=berdl_table_id, + pangenome_id=pangenome_id, + auth_token=token, + cache_dir=cache_dir, + kb_env=kb_env + ) + + table_names = list_tables(db_path) + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append({ + "name": name, + "row_count": row_count, + "column_count": len(columns) + }) + except Exception as e: + tables.append({"name": name}) + + return { + "berdl_table_id": berdl_table_id, + "pangenome_id": pangenome_id, + "tables": tables + } + + except Exception as e: + logger.error(f"Error listing tables: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/object/{ws_id}/{obj_name}/pangenomes/{pangenome_id}/tables/{table_name}/data") +async def get_table_data_by_object( + ws_id: str, + obj_name: str, + pangenome_id: str, + table_name: str, + limit: int = Query(100, ge=1, le=500000), + offset: int = Query(0, ge=0), + sort_column: Optional[str] = Query(None), + sort_order: Optional[str] = Query("ASC"), + search: Optional[str] = Query(None), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + Query table data from a pangenome within a BERDLTables object. + """ + start_time = time.time() + + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + berdl_table_id = f"{ws_id}/{obj_name}" + + db_path = download_pangenome_db( + berdl_table_id=berdl_table_id, + pangenome_id=pangenome_id, + auth_token=token, + cache_dir=cache_dir, + kb_env=kb_env + ) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") + + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=table_name, + limit=limit, + offset=offset, + sort_column=sort_column, + sort_order=sort_order, + search_value=search, + ) + + response_time_ms = (time.time() - start_time) * 1000 + + return { + "berdl_table_id": berdl_table_id, + "pangenome_id": pangenome_id, + "table_name": table_name, + "headers": headers, + "data": data, + "row_count": len(data), + "total_count": total_count, + "filtered_count": filtered_count, + "response_time_ms": response_time_ms, + "db_query_ms": db_query_ms, + "sqlite_file": str(db_path) + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error querying data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# LEGACY ENDPOINTS (for backwards compatibility) +# ============================================================================= + +@router.get("/pangenomes", response_model=PangenomesResponse) +async def get_pangenomes( + berdl_table_id: str = Query(..., description="BERDLTables object reference"), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + List pangenomes from BERDLTables object. 
+ Returns: - A dictionary with search results + - pangenomes: List of pangenome info + - pangenome_count: Total number of pangenomes + - auto_selected: The pangenome_id if only one exists (for auto-selection) """ - settings = request.app.state.settings - token = settings.KB_SERVICE_AUTH_TOKEN - cache_dir = Path(settings.CACHE_DIR) - workspace_url = settings.WORKSPACE_URL - - # TODO: Use the users token instead of a static one - - # Get object info from KBase Workspace - object_info = get_object_info(search_request.pangenome_id, token, workspace_url) - filename = object_info.get('filename', f'{search_request.pangenome_id}.bin') - handle_url = object_info.get('handle_url') or object_info.get('blobstore_url') - - if not handle_url: - raise HTTPException( - status_code=404, - detail=f"No handle/blobstore URL found for id: {search_request.pangenome_id}" + try: + token = get_auth_token(authorization) + + # Support comma-separated list of IDs + berdl_ids = [bid.strip() for bid in berdl_table_id.split(",") if bid.strip()] + + all_pangenomes: list[dict] = [] + + for bid in berdl_ids: + try: + pangenomes = list_pangenomes_from_object(bid, token, kb_env) + # Tag each pangenome with its source ID + for pg in pangenomes: + pg["source_berdl_id"] = bid + all_pangenomes.extend(pangenomes) + except Exception as e: + logger.error(f"Error fetching pangenomes for {bid}: {e}") + # Continue fetching others even if one fails + continue + + pangenome_list = [PangenomeInfo(**pg) for pg in all_pangenomes] + + # Auto-select if only one pangenome total + auto_selected = None + if len(pangenome_list) == 1: + auto_selected = pangenome_list[0].pangenome_id + + return PangenomesResponse( + pangenomes=pangenome_list, + pangenome_count=len(pangenome_list), + auto_selected=auto_selected ) + except Exception as e: + logger.error(f"Error in get_pangenomes: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/tables", response_model=TableListResponse) +async def get_tables( + berdl_table_id: str = Query(..., description="BERDLTables object reference"), + pangenome_id: Optional[str] = Query(None, description="Legacy parameter (ignored)"), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """List tables for a BERDLTable object (auto-resolves pangenome).""" + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + # 1. 
Resolve pangenome_id from BERDL ID + pangenomes = list_pangenomes_from_object(berdl_table_id, token, kb_env) + if not pangenomes: + raise HTTPException(status_code=404, detail="No pangenomes found in object") + + # 1:1 relationship assumed as per user requirement + # Always pick the first one associated with this object + target_pangenome = pangenomes[0]["pangenome_id"] + + db_path = download_pangenome_db(berdl_table_id, target_pangenome, token, cache_dir, kb_env) + table_names = list_tables(db_path) + + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append(TableInfo(name=name, row_count=row_count, column_count=len(columns))) + except: + tables.append(TableInfo(name=name)) + + return TableListResponse(pangenome_id=target_pangenome, tables=tables) + except Exception as e: + logger.error(f"Error listing tables: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +# Legacy route redirect/alias if needed, but for now we replace logic +@router.get("/tables/{pangenome_id}", include_in_schema=False) +async def get_tables_legacy(pangenome_id: str, berdl_table_id: str = Query(...), kb_env: str = Query("appdev"), authorization: Optional[str] = Header(None)): + return await get_tables(berdl_table_id=berdl_table_id, pangenome_id=pangenome_id, kb_env=kb_env, authorization=authorization) + + +@router.post("/table-data", response_model=TableDataResponse) +async def query_table_data( + request: TableDataRequest, + authorization: Optional[str] = Header(None) +): + """Query table data.""" + start_time = time.time() + + try: + # Debugging log + print(f"Received request: {request} col_filter={request.col_filter}") + + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + kb_env = getattr(request, 'kb_env', 'appdev') or 'appdev' + + # Determine filters (support both query_filters and col_filter) + filters = request.col_filter if request.col_filter else request.query_filters + print(f"Filters determined: {filters}") + + # Download (or get cached) DB - auto-resolves ID if None + try: + db_path = download_pangenome_db( + request.berdl_table_id, request.pangenome_id, token, cache_dir, kb_env + ) + except ValueError as e: + # Handle cases where pangenome not found or resolution failed + raise HTTPException(status_code=404, detail=str(e)) + + if not validate_table_exists(db_path, request.table_name): + available = list_tables(db_path) + raise ValueError(f"Table '{request.table_name}' not found. 
Available: {available}") + + try: + ensure_indices(db_path, request.table_name) + except: + pass + + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=request.table_name, + limit=request.limit, + offset=request.offset, + sort_column=request.sort_column, + sort_order=request.sort_order, + search_value=request.search_value, + query_filters=filters, + columns=request.columns, + order_by=request.order_by + ) + + response_time_ms = (time.time() - start_time) * 1000 + + # Extract the resolved pangenome ID from filename if possible, or just return what we have + # Since pangenome_id in response model is just for context, we can derive it from db_path + # db_path is .../cache/berdl_id/pangenome_id.db + resolved_pangenome_id = db_path.stem + + return TableDataResponse( + headers=headers, + data=data, + row_count=len(data), + total_count=total_count, + filtered_count=filtered_count, + table_name=request.table_name, + pangenome_id=resolved_pangenome_id, + response_time_ms=response_time_ms, + db_query_ms=db_query_ms, + conversion_ms=conversion_ms, + source="Cache" if is_cached(db_path) else "Downloaded", + cache_file=str(db_path), + sqlite_file=str(db_path) + ) + + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Error querying table data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# CACHE MANAGEMENT +# ============================================================================= + +@router.post("/clear-cache", response_model=CacheResponse) +async def clear_pangenome_cache( + berdl_table_id: Optional[str] = Query(None) +): + """Clear cached databases.""" + try: + cache_dir = get_cache_dir() + result = clear_cache(cache_dir, berdl_table_id) + return CacheResponse(status="success", message=result.get("message", "Cache cleared")) + except Exception as e: + return CacheResponse(status="error", message=str(e)) - # Get cache paths - cache_file_path, sqlite_file_path = get_cache_paths(cache_dir, search_request.pangenome_id, filename) - - # Download and cache if not already cached - if not is_cached(cache_file_path): - # Download from handle/blobstore service - binary_data = download_from_handle(handle_url, token) - save_to_cache(cache_file_path, binary_data) - - # Convert to SQLite if not already converted - if not is_cached(sqlite_file_path): - convert_to_sqlite(cache_file_path, sqlite_file_path) - - # Query the SQLite file with parameters - from app.utils.sqlite import get_table_data - results = get_table_data( - sqlite_file_path, - table_name=search_request.table_name, - limit=search_request.limit, - order_by=search_request.order_by, - filters=search_request.filters, - ) - #TODO use a return model when we figure out what we want to return - return { - "pangenome_id": search_request.pangenome_id, - "table_name": search_request.table_name, - "status": "success", - "cache_file": str(cache_file_path), - "sqlite_file": str(sqlite_file_path), - "row_count": len(results), - "results": results - } +@router.get("/cache") +async def list_cache(): + """List cached items.""" + cache_dir = get_cache_dir() + items = list_cached_items(cache_dir) + return {"cache_dir": str(cache_dir), "items": items, "total": len(items)} diff --git a/app/utils/__init__.py b/app/utils/__init__.py index 4521156..efa6608 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -1,22 
+1,66 @@ """ Utils module for TableScanner. -Contains business logic separated from route handlers. +Contains business logic for: +- KBase Workspace API interactions via KBUtilLib +- Blobstore/Shock downloading +- Local file caching with age-based expiration +- SQLite database querying with filtering/sorting/pagination """ -from app.utils.download import download_from_handle -from app.utils.workspace import get_object_info -from app.utils.cache import get_cache_paths, ensure_cache_dir, save_to_cache, is_cached -from app.utils.sqlite import convert_to_sqlite, query_sqlite, get_table_data +from app.utils.workspace import ( + get_berdl_table_data, + list_pangenomes_from_object, + find_pangenome_handle, + download_pangenome_db, + get_object_info, + KBaseClient, +) +from app.utils.cache import ( + get_cache_paths, + ensure_cache_dir, + save_to_cache, + is_cached, + clear_cache, + list_cached_items, + cleanup_old_caches, +) +from app.utils.sqlite import ( + convert_to_sqlite, + query_sqlite, + get_table_data, + list_tables, + get_table_columns, + get_table_row_count, + validate_table_exists, + ensure_indices, +) __all__ = [ - "download_from_handle", + # Workspace utilities + "get_berdl_table_data", + "list_pangenomes_from_object", + "find_pangenome_handle", + "download_pangenome_db", "get_object_info", + "KBaseClient", + + # Cache utilities "get_cache_paths", "ensure_cache_dir", "save_to_cache", "is_cached", + "clear_cache", + "list_cached_items", + "cleanup_old_caches", + + # SQLite utilities "convert_to_sqlite", "query_sqlite", "get_table_data", + "list_tables", + "get_table_columns", + "get_table_row_count", + "validate_table_exists", + "ensure_indices", ] diff --git a/app/utils/cache.py b/app/utils/cache.py index cb4cc0f..e3f30d1 100644 --- a/app/utils/cache.py +++ b/app/utils/cache.py @@ -1,27 +1,78 @@ """ Cache utilities for managing local file caching. + +Implements efficient caching for downloaded BERDLTables SQLite databases +with age-based expiration and cleanup. + +Cache Structure: + {CACHE_DIR}/ + {berdl_table_id}/ + {pangenome_id}.db # SQLite database + metadata.json # Cache metadata (timestamps, checksums) """ +import json +import time +import shutil +import logging from pathlib import Path -from typing import Tuple +from typing import Tuple, Optional, Dict, Any, List +from datetime import datetime + +# Configure module logger +logger = logging.getLogger(__name__) + + +# ============================================================================= +# CACHE PATH UTILITIES +# ============================================================================= + +def sanitize_id(id_string: str) -> str: + """ + Sanitize an ID string for use as a filesystem path. + + Args: + id_string: Raw ID (may contain / : and other special chars) + + Returns: + Safe string for filesystem use + """ + return id_string.replace("/", "_").replace(":", "_").replace(" ", "_") -def get_cache_paths(cache_dir: Path, id: str, filename: str) -> Tuple[Path, Path]: +def get_cache_paths( + cache_dir: Path, + berdl_table_id: str, + pangenome_id: str +) -> Tuple[Path, Path]: """ - Get cache file paths for a given ID and filename. + Get cache file paths for a given BERDLTable and pangenome. 
Args: cache_dir: Base cache directory - id: Object ID - filename: Original filename + berdl_table_id: BERDLTables object reference + pangenome_id: Pangenome identifier Returns: - Tuple of (cache_file_path, sqlite_file_path) + Tuple of (cache_subdir, sqlite_file_path) """ - cache_file_path = cache_dir / id / filename - sqlite_file_path = cache_dir / id / f"{Path(filename).stem}.db" - return cache_file_path, sqlite_file_path + safe_berdl = sanitize_id(berdl_table_id) + safe_pg = sanitize_id(pangenome_id) + + cache_subdir = cache_dir / safe_berdl + sqlite_path = cache_subdir / f"{safe_pg}.db" + + return cache_subdir, sqlite_path + +def get_metadata_path(cache_subdir: Path) -> Path: + """Get path to cache metadata file.""" + return cache_subdir / "metadata.json" + + +# ============================================================================= +# CACHE MANAGEMENT +# ============================================================================= def ensure_cache_dir(cache_path: Path) -> None: """ @@ -43,16 +94,234 @@ def save_to_cache(cache_path: Path, data: bytes) -> None: """ ensure_cache_dir(cache_path) cache_path.write_bytes(data) + logger.info(f"Saved {len(data)} bytes to cache: {cache_path}") -def is_cached(cache_path: Path) -> bool: +def is_cached(cache_path: Path, max_age_hours: int = 24) -> bool: """ - Check if file exists in cache. + Check if file exists in cache and is not expired. Args: cache_path: Path to cache file + max_age_hours: Maximum age in hours before cache expires + + Returns: + True if valid cache exists, False otherwise + """ + if not cache_path.exists(): + return False + + # Check age + mtime = cache_path.stat().st_mtime + age_hours = (time.time() - mtime) / 3600 + + if age_hours > max_age_hours: + logger.info(f"Cache expired ({age_hours:.1f}h > {max_age_hours}h): {cache_path}") + return False + + logger.debug(f"Valid cache ({age_hours:.1f}h old): {cache_path}") + return True + + +def get_cache_info(cache_path: Path) -> Optional[Dict[str, Any]]: + """ + Get information about a cached file. + + Args: + cache_path: Path to cache file + + Returns: + Dictionary with cache info, or None if not cached + """ + if not cache_path.exists(): + return None + + stat = cache_path.stat() + return { + "path": str(cache_path), + "size_bytes": stat.st_size, + "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "age_hours": (time.time() - stat.st_mtime) / 3600 + } + + +# ============================================================================= +# CACHE METADATA +# ============================================================================= + +def save_cache_metadata( + cache_subdir: Path, + berdl_table_id: str, + pangenome_id: str, + handle_ref: str, + **extra +) -> None: + """ + Save metadata about cached files. 
+ + Args: + cache_subdir: Cache subdirectory for this berdl_table + berdl_table_id: Original BERDLTable reference + pangenome_id: Pangenome identifier + handle_ref: Blobstore handle reference + **extra: Additional metadata to store + """ + metadata_path = get_metadata_path(cache_subdir) + + # Load existing metadata if present + if metadata_path.exists(): + with open(metadata_path) as f: + metadata = json.load(f) + else: + metadata = {"berdl_table_id": berdl_table_id, "pangenomes": {}} + + # Update pangenome entry + metadata["pangenomes"][pangenome_id] = { + "handle_ref": handle_ref, + "cached_at": datetime.now().isoformat(), + **extra + } + + ensure_cache_dir(metadata_path) + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + + +def load_cache_metadata(cache_subdir: Path) -> Optional[Dict[str, Any]]: + """ + Load cache metadata. + + Args: + cache_subdir: Cache subdirectory + + Returns: + Metadata dictionary, or None if not found + """ + metadata_path = get_metadata_path(cache_subdir) + if not metadata_path.exists(): + return None + + with open(metadata_path) as f: + return json.load(f) + +# ============================================================================= +# CACHE CLEANUP +# ============================================================================= + +def clear_cache(cache_dir: Path, berdl_table_id: Optional[str] = None) -> Dict[str, Any]: + """ + Clear cached files. + + Args: + cache_dir: Base cache directory + berdl_table_id: Specific BERDLTable to clear (None for all) + + Returns: + Summary of cleanup operation + """ + if berdl_table_id: + # Clear specific cache + safe_id = sanitize_id(berdl_table_id) + cache_path = cache_dir / safe_id + + if cache_path.exists(): + shutil.rmtree(cache_path) + return { + "status": "success", + "message": f"Cleared cache for {berdl_table_id}", + "path": str(cache_path) + } + else: + return { + "status": "success", + "message": "Cache already empty" + } + else: + # Clear all caches + if cache_dir.exists(): + count = sum(1 for _ in cache_dir.iterdir() if _.is_dir()) + shutil.rmtree(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) + return { + "status": "success", + "message": f"Cleared {count} cached items", + "path": str(cache_dir) + } + else: + return { + "status": "success", + "message": "Cache directory does not exist" + } + + +def cleanup_old_caches(cache_dir: Path, max_age_days: int = 7) -> Dict[str, Any]: + """ + Remove cache directories older than max_age_days. + + Args: + cache_dir: Base cache directory + max_age_days: Maximum age in days + + Returns: + Summary of cleanup operation + """ + if not cache_dir.exists(): + return {"status": "success", "removed": 0} + + now = time.time() + max_age_seconds = max_age_days * 24 * 3600 + removed = [] + + for subdir in cache_dir.iterdir(): + if not subdir.is_dir(): + continue + + try: + mtime = subdir.stat().st_mtime + if now - mtime > max_age_seconds: + shutil.rmtree(subdir) + removed.append(subdir.name) + logger.info(f"Removed old cache: {subdir.name}") + except Exception as e: + logger.warning(f"Failed to clean {subdir}: {e}") + + return { + "status": "success", + "removed": len(removed), + "items": removed + } + + +def list_cached_items(cache_dir: Path) -> List[Dict[str, Any]]: + """ + List all cached BERDLTable items. 
+ + Args: + cache_dir: Base cache directory + Returns: - True if file exists, False otherwise + List of cached item info """ - return cache_path.exists() + items = [] + + if not cache_dir.exists(): + return items + + for subdir in sorted(cache_dir.iterdir()): + if not subdir.is_dir(): + continue + + metadata = load_cache_metadata(subdir) + db_files = list(subdir.glob("*.db")) + + item = { + "id": subdir.name, + "berdl_table_id": metadata.get("berdl_table_id") if metadata else subdir.name, + "databases": len(db_files), + "total_size_bytes": sum(f.stat().st_size for f in db_files), + "pangenomes": list(metadata.get("pangenomes", {}).keys()) if metadata else [] + } + items.append(item) + + return items diff --git a/app/utils/download.py b/app/utils/download.py deleted file mode 100644 index 3ee24b1..0000000 --- a/app/utils/download.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -Handle/Blobstore utilities for downloading files. -""" - -import requests - - -def download_from_handle(handle_url: str, auth_token: str) -> bytes: - """ - Download binary file from KBase Handle/Blobstore service. - - Args: - handle_url: URL to the handle/blobstore service - auth_token: KBase authentication token - - Returns: - Binary data - - Raises: - requests.HTTPError: If download fails - """ - headers = {"Authorization": auth_token} - response = requests.get(handle_url, headers=headers) - response.raise_for_status() - return response.content diff --git a/app/utils/sqlite.py b/app/utils/sqlite.py index 0ef7e99..5ae3785 100644 --- a/app/utils/sqlite.py +++ b/app/utils/sqlite.py @@ -1,56 +1,199 @@ """ SQLite utilities for database conversion and querying. + +This module provides efficient functions for: +- Extracting table data from SQLite databases +- Converting data to 2D array format for JSON serialization +- Filtering, sorting, and pagination +- Index optimization for query performance + +Migrated from: BERDLTable_conversion_service/db_utils.py """ import sqlite3 +import logging +import time from pathlib import Path -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, Tuple +# Configure module logger +logger = logging.getLogger(__name__) -def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None: + +def _validate_table_name(cursor, table_name: str) -> None: """ - Convert binary file to SQLite database. + Validate that table_name corresponds to an existing table in the database. + Prevents SQL injection by ensuring table_name is a valid identifier. + """ + # Parameterized query is safe from injection + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,)) + if not cursor.fetchone(): + # Check for case-insensitive match or just fail + raise ValueError(f"Invalid table name: {table_name}") + + +# ============================================================================= +# TABLE LISTING & METADATA +# ============================================================================= + +def list_tables(db_path: Path) -> List[str]: + """ + List all user tables in a SQLite database. 
Args: - binary_file: Path to binary file - sqlite_file: Path to output SQLite file + db_path: Path to the SQLite database file + + Returns: + List of table names (excludes sqlite_ system tables) Raises: - NotImplementedError: This function is not yet implemented + sqlite3.Error: If database access fails """ - # TODO: Implement conversion logic based on binary file format - # - # Example implementation for a specific binary format: - # import sqlite3 - # - # # Read and parse binary file - # with open(binary_file, 'rb') as f: - # data = parse_binary_format(f.read()) - # - # # Create SQLite database - # conn = sqlite3.connect(sqlite_file) - # cursor = conn.cursor() - # - # # Create tables - # cursor.execute(''' - # CREATE TABLE IF NOT EXISTS data ( - # id INTEGER PRIMARY KEY, - # column1 TEXT, - # column2 TEXT - # ) - # ''') - # - # # Insert data - # cursor.executemany('INSERT INTO data VALUES (?, ?, ?)', data) - # conn.commit() - # conn.close() - - raise NotImplementedError("SQLite conversion not yet implemented") + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Query for user tables (exclude sqlite_ system tables) + cursor.execute(""" + SELECT name FROM sqlite_master + WHERE type='table' + AND name NOT LIKE 'sqlite_%' + ORDER BY name + """) + + tables = [row[0] for row in cursor.fetchall()] + conn.close() + + logger.info(f"Found {len(tables)} tables in database: {tables}") + return tables + + except sqlite3.Error as e: + logger.error(f"Error listing tables from {db_path}: {e}") + raise + + +def get_table_columns(db_path: Path, table_name: str) -> List[str]: + """ + Get column names for a specific table. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table to query + + Returns: + List of column names + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Validate table name to prevent injection + _validate_table_name(cursor, table_name) + + # Use PRAGMA to get table info + cursor.execute(f"PRAGMA table_info({table_name})") + columns = [row[1] for row in cursor.fetchall()] + conn.close() + + return columns + + except sqlite3.Error as e: + logger.error(f"Error getting columns for {table_name}: {e}") + raise + + +def get_table_row_count(db_path: Path, table_name: str) -> int: + """ + Get the total row count for a table. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table + + Returns: + Number of rows in the table + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + _validate_table_name(cursor, table_name) + + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + count = cursor.fetchone()[0] + conn.close() + + return count + + except sqlite3.Error as e: + logger.error(f"Error counting rows in {table_name}: {e}") + raise + + +def validate_table_exists(db_path: Path, table_name: str) -> bool: + """ + Check if a table exists in the database. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table to check + + Returns: + True if table exists, False otherwise + """ + tables = list_tables(db_path) + return table_name in tables + +# ============================================================================= +# INDEX OPTIMIZATION +# ============================================================================= + +def ensure_indices(db_path: Path, table_name: str) -> None: + """ + Ensure indices exist for all columns in the table to optimize filtering. + + This is an optimization step - failures are logged but not raised. 
+ + Args: + db_path: Path to the SQLite database file + table_name: Name of the table + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + _validate_table_name(cursor, table_name) + + # Get columns + cursor.execute(f"PRAGMA table_info({table_name})") + columns = [row[1] for row in cursor.fetchall()] + + # Create index for each column + for col in columns: + index_name = f"idx_{table_name}_{col}" + # Sanitize column name for SQL safety + safe_col = col.replace('"', '""') + cursor.execute( + f'CREATE INDEX IF NOT EXISTS "{index_name}" ON "{table_name}" ("{safe_col}")' + ) + + conn.commit() + conn.close() + logger.info(f"Ensured indices for table {table_name}") + + except sqlite3.Error as e: + # Don't raise, just log warning as this is an optimization step + logger.warning(f"Error creating indices for {table_name}: {e}") + + +# ============================================================================= +# DATA RETRIEVAL - SIMPLE QUERY +# ============================================================================= def query_sqlite(sqlite_file: Path, query_id: str) -> dict: """ - Query SQLite database. + Query SQLite database by ID. Legacy compatibility function. Args: sqlite_file: Path to SQLite database @@ -58,29 +201,7 @@ def query_sqlite(sqlite_file: Path, query_id: str) -> dict: Returns: Query results as dictionary - - Note: - This is currently a stub implementation that returns placeholder data. - """ - # TODO: Implement SQLite query logic - # - # Example implementation: - # import sqlite3 - # - # conn = sqlite3.connect(sqlite_file) - # conn.row_factory = sqlite3.Row # Enable column access by name - # cursor = conn.cursor() - # - # # Execute query - # cursor.execute("SELECT * FROM data WHERE id = ?", (query_id,)) - # rows = cursor.fetchall() - # - # # Convert to list of dicts - # results = [dict(row) for row in rows] - # - # conn.close() - # return {"data": results, "count": len(results)} - + """ return { "stub": "SQLite query results would go here", "query_id": query_id, @@ -88,88 +209,226 @@ def query_sqlite(sqlite_file: Path, query_id: str) -> dict: } +# ============================================================================= +# DATA RETRIEVAL - FULL FEATURED +# ============================================================================= + def get_table_data( sqlite_file: Path, table_name: str, - limit: Optional[int] = None, - order_by: Optional[List[Dict[str, str]]] = None, - filters: Optional[List[Dict[str, Any]]] = None, -) -> List[Dict[str, Any]]: + limit: int = 100, + offset: int = 0, + sort_column: Optional[str] = None, + sort_order: str = "ASC", + search_value: Optional[str] = None, + query_filters: Optional[Dict[str, str]] = None, + columns: Optional[str] = "all", + order_by: Optional[List[Dict[str, str]]] = None +) -> Tuple[List[str], List[Any], int, int, float, float]: """ - Query SQLite database with flexible filtering, ordering, and pagination. + Get paginated and filtered data from a table. + + Supports two filtering APIs for flexibility: + 1. `filters`: List of FilterSpec-style dicts with column, op, value + 2. 
`query_filters`: Simple dict of column -> search_value (LIKE matching) Args: sqlite_file: Path to SQLite database table_name: Name of the table to query limit: Maximum number of rows to return - order_by: List of order specifications, e.g., - [{"column": "gene_id", "direction": "ASC"}] - filters: List of filter specifications, e.g., - [{"column": "function", "op": "LIKE", "value": "%kinase%"}] + offset: Number of rows to skip + sort_column: Single column to sort by (alternative to order_by) + sort_order: Sort direction 'asc' or 'desc' (with sort_column) + search_value: Global search term for all columns + query_filters: Dict of column-specific search terms + columns: Comma-separated list of columns to select + order_by: List of order specifications [{column, direction}] Returns: - List of rows as dictionaries - - Example: - rows = get_table_data( - db_path, - "Genes", - limit=20, - order_by=[{"column": "gene_id", "direction": "ASC"}], - filters=[{"column": "function", "op": "LIKE", "value": "%kinase%"}], - ) - """ - conn = sqlite3.connect(sqlite_file) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - # Build SELECT query - query = f"SELECT * FROM {table_name}" - params = [] - - # Add WHERE clause for filters - if filters: - where_clauses = [] - for filter_spec in filters: - column = filter_spec["column"] - op = filter_spec["op"] - value = filter_spec["value"] - - # Sanitize operator + Tuple of (headers, data, total_count, filtered_count, db_query_ms, conversion_ms) + + Raises: + sqlite3.Error: If database query fails + ValueError: If invalid operator is specified + """ + start_time = time.time() + + # Initialize legacy filters to None since removed from signature + filters = None + + try: + conn = sqlite3.connect(str(sqlite_file)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + # Validate table name + _validate_table_name(cursor, table_name) + + # Get all column names first for validation + all_headers = get_table_columns(sqlite_file, table_name) + + if not all_headers: + logger.warning(f"Table {table_name} has no columns or doesn't exist") + return [], [], 0, 0, 0.0, 0.0 + + # Parse requested columns + selected_headers = all_headers + select_clause = "*" + + if columns and columns.lower() != "all": + requested = [c.strip() for c in columns.split(',') if c.strip()] + valid = [c for c in requested if c in all_headers] + if valid: + selected_headers = valid + safe_cols = [f'"{c}"' for c in selected_headers] + select_clause = ", ".join(safe_cols) + + headers = selected_headers + + # 1. Get total count (before filtering) + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + total_count = cursor.fetchone()[0] + + # 2. Build WHERE clause + conditions = [] + params = [] + + # 2a. Global Search (OR logic across all columns) + if search_value: + search_conditions = [] + term = f"%{search_value}%" + for col in headers: + search_conditions.append(f'"{col}" LIKE ?') + params.append(term) + + if search_conditions: + conditions.append(f"({' OR '.join(search_conditions)})") + + # 2b. Column Filters via query_filters dict (AND logic) + if query_filters: + for col, val in query_filters.items(): + if col in headers and val: + conditions.append(f'"{col}" LIKE ?') + params.append(f"%{val}%") + + # 2c. 
Structured filters via filters list (AND logic) + if filters: allowed_ops = ["=", "!=", "<", ">", "<=", ">=", "LIKE", "IN"] - if op not in allowed_ops: - raise ValueError(f"Invalid operator: {op}") + for filter_spec in filters: + column = filter_spec.get("column") + op = filter_spec.get("op", "LIKE") + value = filter_spec.get("value") + + if not column or column not in headers: + continue + + if op not in allowed_ops: + raise ValueError(f"Invalid operator: {op}") + + conditions.append(f'"{column}" {op} ?') + params.append(value) + + where_clause = "" + if conditions: + where_clause = " WHERE " + " AND ".join(conditions) - where_clauses.append(f"{column} {op} ?") - params.append(value) + # 3. Get filtered count + if where_clause: + cursor.execute(f"SELECT COUNT(*) FROM {table_name} {where_clause}", params) + filtered_count = cursor.fetchone()[0] + else: + filtered_count = total_count - query += " WHERE " + " AND ".join(where_clauses) + # 4. Build final query + query = f"SELECT {select_clause} FROM {table_name}{where_clause}" - # Add ORDER BY clause - if order_by: + # Add ORDER BY clause order_clauses = [] - for order_spec in order_by: - column = order_spec["column"] - direction = order_spec.get("direction", "ASC").upper() - if direction not in ["ASC", "DESC"]: - raise ValueError(f"Invalid direction: {direction}") + # Handle order_by list + if order_by: + for order_spec in order_by: + col = order_spec.get("column") + direction = order_spec.get("direction", "ASC").upper() + + if col and col in headers: + if direction not in ["ASC", "DESC"]: + direction = "ASC" + order_clauses.append(f'"{col}" {direction}') + + # Handle single sort_column (alternative API) + if sort_column and sort_column in headers: + direction = "DESC" if sort_order and sort_order.lower() == "desc" else "ASC" + order_clauses.append(f'"{sort_column}" {direction}') + + if order_clauses: + query += " ORDER BY " + ", ".join(order_clauses) + elif headers: + # Default sort for consistent pagination + query += f' ORDER BY "{headers[0]}" ASC' + + # Add LIMIT clause + if limit is not None: + query += f" LIMIT {int(limit)}" + + # Add OFFSET clause + if offset is not None: + query += f" OFFSET {int(offset)}" + + # Execute query with timing + query_start = time.time() + cursor.execute(query, params) + rows = cursor.fetchall() + db_query_ms = (time.time() - query_start) * 1000 - order_clauses.append(f"{column} {direction}") + conn.close() - query += " ORDER BY " + ", ".join(order_clauses) + # Convert rows to string arrays with timing + conversion_start = time.time() + data = [] + for row in rows: + string_row = [ + str(value) if value is not None else "" + for value in row + ] + data.append(string_row) + conversion_ms = (time.time() - conversion_start) * 1000 - # Add LIMIT clause - if limit is not None: - query += f" LIMIT {int(limit)}" + return headers, data, total_count, filtered_count, db_query_ms, conversion_ms - # Execute query - cursor.execute(query, params) - rows = cursor.fetchall() + except sqlite3.Error as e: + logger.error(f"Error extracting data from {table_name}: {e}") + raise - # Convert to list of dicts - results = [dict(row) for row in rows] - conn.close() +# ============================================================================= +# CONVERSION (PLACEHOLDER) +# ============================================================================= - return results +def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None: + """ + Convert binary file to SQLite database. 
+ + This function handles conversion of various binary formats + to SQLite for efficient querying. + + Args: + binary_file: Path to binary file + sqlite_file: Path to output SQLite file + + Raises: + NotImplementedError: Conversion logic depends on binary format + """ + # Check if file is already a SQLite database + if binary_file.suffix == '.db': + # Just copy/link the file + import shutil + shutil.copy2(binary_file, sqlite_file) + logger.info(f"Copied SQLite database to {sqlite_file}") + return + + # TODO: Implement conversion logic based on binary file format + # The BERDLTables object stores SQLite directly, so this may not be needed + raise NotImplementedError( + f"SQLite conversion not implemented for format: {binary_file.suffix}" + ) diff --git a/app/utils/workspace.py b/app/utils/workspace.py index 7713010..89a2e94 100644 --- a/app/utils/workspace.py +++ b/app/utils/workspace.py @@ -1,39 +1,447 @@ """ -KBase Workspace utilities for retrieving object information. +KBase Workspace and Blobstore utilities for retrieving BERDLTables objects. + +This module uses KBUtilLib to interact with KBase services for: +- Fetching BERDLTables objects from Workspace +- Downloading SQLite databases from Blobstore +- Caching databases locally + +Key Flow: +1. User provides berdl_table_id (workspace ref like "76990/ADP1Test") +2. Fetch object from Workspace API via KBUtilLib +3. Extract pangenome_data with handle_ref +4. Download SQLite from Blobstore using download_blob_file +5. Cache locally for efficient repeated queries + +Requires: lib/KBUtilLib cloned locally """ +import os +import sys +import logging +from pathlib import Path +from typing import Dict, Any, List, Optional import requests -from typing import Dict, Any +# Add KBUtilLib to path +LIB_PATH = Path(__file__).parent.parent.parent / "lib" / "KBUtilLib" / "src" +if str(LIB_PATH) not in sys.path: + sys.path.insert(0, str(LIB_PATH)) + +# Configure module logger +logger = logging.getLogger(__name__) + + +# ============================================================================= +# KBASE UTILITY CLASS (USING KBUtilLib) +# ============================================================================= -def get_object_info(workspace_id: str, auth_token: str, workspace_url: str) -> Dict[str, Any]: +class KBaseClient: """ - Get object information from KBase Workspace API. + KBase API client using KBUtilLib. + + Uses NotebookUtils and KBWSUtils with kb_version parameter + to target the correct KBase environment (appdev, ci, prod). + """ + + def __init__( + self, + token: str, + kb_env: str = "appdev", + cache_dir: Optional[Path] = None + ): + """ + Initialize KBase client. 
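+
+        A minimal construction sketch (the token value is a placeholder, and the
+        object reference is the AppDev test object used elsewhere in this repo):
+
+            client = KBaseClient(
+                token="YOUR_KBASE_TOKEN",
+                kb_env="appdev",
+                cache_dir=Path("/tmp/tablescanner_cache"),
+            )
+            obj = client.get_object("76990/ADP1Test")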
+ + Args: + token: KBase authentication token + kb_env: Environment (appdev, ci, prod) + cache_dir: Local cache directory + """ + self.token = token + self.kb_env = kb_env + self.cache_dir = cache_dir or Path("/tmp/tablescanner_cache") + self._client = None + self._use_kbutillib = False + + # Try to initialize KBUtilLib + self._init_client() + + def _init_client(self): + """Initialize the appropriate client.""" + try: + from kbutillib.kb_ws_utils import KBWSUtils + from kbutillib.notebook_utils import NotebookUtils + + # Create a proper combined class + cache_dir = self.cache_dir + kb_env = self.kb_env + token = self.token + + class NotebookUtil(NotebookUtils, KBWSUtils): + def __init__(self): + super().__init__( + notebook_folder=str(cache_dir), + name="TableScanner", + kb_version=kb_env, + token=token + ) + + self._client = NotebookUtil() + self._use_kbutillib = True + logger.info(f"KBUtilLib client initialized for {self.kb_env}") + + except Exception as e: + logger.warning(f"KBUtilLib not available: {e}. Using fallback.") + self._use_kbutillib = False + + def get_object(self, ref: str, ws: Optional[int] = None) -> Dict[str, Any]: + """ + Get workspace object data. + + Args: + ref: Object reference or name + ws: Workspace ID (optional if ref is full reference) + + Returns: + Object data dictionary + """ + if self._use_kbutillib and self._client: + try: + return self._client.get_object(ref, ws=ws) + except Exception as e: + logger.warning(f"KBUtilLib get_object failed: {e}. Using fallback.") + return self._get_object_fallback(ref, ws) + else: + return self._get_object_fallback(ref, ws) + + def download_blob_file(self, handle_ref: str, target_path: Path) -> Path: + """ + Download file from blobstore using handle reference. + + Args: + handle_ref: Handle ID (KBH_xxxxx format) + target_path: Where to save the file + + Returns: + Path to downloaded file + """ + # Ensure directory exists + target_path = Path(target_path) + target_path.parent.mkdir(parents=True, exist_ok=True) + + if self._use_kbutillib and self._client: + try: + result = self._client.download_blob_file(handle_ref, str(target_path)) + if result: + return Path(result) + except Exception as e: + logger.warning(f"KBUtilLib download_blob_file failed: {e}. 
Using fallback.") + + return Path(self._download_blob_fallback(handle_ref, str(target_path))) + + # ========================================================================= + # FALLBACK METHODS (Direct API calls) + # ========================================================================= + + def _get_endpoints(self) -> Dict[str, str]: + """Get endpoints for current environment.""" + endpoints = { + "appdev": { + "workspace": "https://appdev.kbase.us/services/ws", + "shock": "https://appdev.kbase.us/services/shock-api", + "handle": "https://appdev.kbase.us/services/handle_service", + }, + "ci": { + "workspace": "https://ci.kbase.us/services/ws", + "shock": "https://ci.kbase.us/services/shock-api", + "handle": "https://ci.kbase.us/services/handle_service", + }, + "prod": { + "workspace": "https://kbase.us/services/ws", + "shock": "https://kbase.us/services/shock-api", + "handle": "https://kbase.us/services/handle_service", + }, + } + return endpoints.get(self.kb_env, endpoints["appdev"]) + + def _get_object_fallback(self, ref: str, ws: Optional[int] = None) -> Dict[str, Any]: + """Get workspace object via direct API call.""" + # Build reference + if ws and "/" not in str(ref): + ref = f"{ws}/{ref}" + + headers = { + "Authorization": self.token, + "Content-Type": "application/json" + } + + payload = { + "method": "Workspace.get_objects2", + "params": [{"objects": [{"ref": ref}]}], + "version": "1.1", + "id": "tablescanner-1" + } + + endpoints = self._get_endpoints() + response = requests.post( + endpoints["workspace"], + json=payload, + headers=headers, + timeout=60 + ) + response.raise_for_status() + result = response.json() + + if "error" in result: + raise ValueError(result["error"].get("message", "Unknown error")) + + data_list = result.get("result", [{}])[0].get("data", []) + if not data_list: + raise ValueError(f"No data for: {ref}") + + return data_list[0] + + def _download_blob_fallback(self, handle_ref: str, target_path: str) -> str: + """Download from blobstore via direct API.""" + endpoints = self._get_endpoints() + headers = {"Authorization": f"OAuth {self.token}"} + + # Resolve handle to shock ID + handle_payload = { + "method": "AbstractHandle.hids_to_handles", + "params": [[handle_ref]], + "version": "1.1", + "id": "tablescanner-2" + } + + shock_id = handle_ref # Default to handle_ref + try: + resp = requests.post( + endpoints["handle"], + json=handle_payload, + headers={"Authorization": self.token, "Content-Type": "application/json"}, + timeout=30 + ) + resp.raise_for_status() + handles = resp.json().get("result", [[]])[0] + if handles: + shock_id = handles[0].get("id", handle_ref) + except Exception as e: + logger.warning(f"Handle resolution failed, using handle_ref directly: {e}") + + # Download from shock + download_url = f"{endpoints['shock']}/node/{shock_id}?download_raw" + + response = requests.get( + download_url, + headers=headers, + stream=True, + timeout=300 + ) + response.raise_for_status() + + Path(target_path).parent.mkdir(parents=True, exist_ok=True) + with open(target_path, 'wb') as f: + for chunk in response.iter_content(8192): + f.write(chunk) + + logger.info(f"Downloaded {handle_ref} to {target_path}") + return target_path + + +# ============================================================================= +# HIGH-LEVEL FUNCTIONS +# ============================================================================= + +def get_berdl_table_data( + berdl_table_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> Dict[str, Any]: + """ + Fetch BERDLTables 
object and extract pangenome information. + + BERDLTables structure: + { + "pangenome_data": [ + { + "pangenome_id": "pg_123", + "pangenome_taxonomy": "Escherichia coli", + "sqllite_tables_handle_ref": "KBH_xxxxx", + ... + } + ] + } Args: - workspace_id: The workspace object ID + berdl_table_id: KBase workspace reference (e.g., "76990/ADP1Test") auth_token: KBase authentication token - workspace_url: URL to the KBase Workspace service + kb_env: KBase environment Returns: - Dictionary containing object info including handle/blobstore URLs + Object data dictionary with pangenome_data + """ + client = KBaseClient(auth_token, kb_env) + obj = client.get_object(berdl_table_id) + + # Handle nested data structures + if isinstance(obj, dict) and "data" in obj: + return obj["data"] + return obj + + +def list_pangenomes_from_object( + berdl_table_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> List[Dict[str, Any]]: + """ + List all pangenomes from a BERDLTables object. + + Args: + berdl_table_id: KBase workspace reference + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + List of pangenome info dictionaries with: + - pangenome_id + - pangenome_taxonomy + - handle_ref + - user_genomes + - berdl_genomes + """ + obj_data = get_berdl_table_data(berdl_table_id, auth_token, kb_env) + + pangenome_data = obj_data.get("pangenome_data", []) + + pangenomes = [] + for pg in pangenome_data: + pangenomes.append({ + "pangenome_id": pg.get("pangenome_id", ""), + "pangenome_taxonomy": pg.get("pangenome_taxonomy", ""), + "user_genomes": pg.get("user_genomes", []), + "berdl_genomes": pg.get("berdl_genomes", []), + "genome_count": len(pg.get("user_genomes", [])) + len(pg.get("berdl_genomes", [])), + "handle_ref": pg.get("sqllite_tables_handle_ref", ""), + }) + + return pangenomes + + +def find_pangenome_handle( + berdl_table_id: str, + pangenome_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> str: + """ + Find the handle_ref for a specific pangenome. + + Args: + berdl_table_id: KBase workspace reference + pangenome_id: ID of pangenome to find + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + Handle reference string (KBH_xxxxx) Raises: - HTTPException: If the workspace API call fails - """ - # TODO: Implement actual KBase Workspace API call - # Example: - # headers = {"Authorization": auth_token} - # payload = { - # "method": "Workspace.get_objects2", - # "params": [{ - # "objects": [{"ref": workspace_id}] - # }], - # "version": "1.1" - # } - # response = requests.post(workspace_url, json=payload, headers=headers) - # response.raise_for_status() - # data = response.json() - # return data["result"][0]["data"][0] - - raise NotImplementedError("KBase Workspace API integration not yet implemented") + ValueError: If pangenome not found + """ + pangenomes = list_pangenomes_from_object(berdl_table_id, auth_token, kb_env) + + for pg in pangenomes: + if pg["pangenome_id"] == pangenome_id: + return pg["handle_ref"] + + available = [pg["pangenome_id"] for pg in pangenomes] + raise ValueError(f"Pangenome '{pangenome_id}' not found. Available: {available}") + + +def download_pangenome_db( + berdl_table_id: str, + pangenome_id: Optional[str], + auth_token: str, + cache_dir: Path, + kb_env: str = "appdev" +) -> Path: + """ + Download the SQLite database for a pangenome. + If pangenome_id is None, it is auto-resolved from the BERDL object (1:1 mapping assumed). + + Checks cache first, downloads only if not present. 
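+
+    A minimal usage sketch (the token value is a placeholder; the object ref and
+    cache path are the AppDev test values used elsewhere in this repo):
+
+        db_path = download_pangenome_db(
+            berdl_table_id="76990/ADP1Test",
+            pangenome_id=None,   # auto-resolve the object's single pangenome
+            auth_token="YOUR_KBASE_TOKEN",
+            cache_dir=Path("/tmp/tablescanner_cache"),
+        )
+        # The cached SQLite file can then be queried, e.g.
+        # get_table_data(db_path, "Conditions", limit=5)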
+ """ + from app.utils.cache import is_cached, get_cache_paths + + cache_dir = Path(cache_dir) + safe_id = berdl_table_id.replace("/", "_").replace(":", "_") + db_dir = cache_dir / safe_id + + # 1. Resolve ID and Handle if not provided + target_id = pangenome_id + handle_ref = None + + # We always need the ID for the filename. + # If pangenome_id is missing, we must fetch the object metadata to get it. + # If pangenome_id IS provided, we might still need to fetch object to get the handle (unless cached). + + # Optimization: If pangenome_id is provided, check if file exists. + # If so, we don't need to fetch metadata. + if target_id: + db_path = db_dir / f"{target_id}.db" + if db_path.exists(): + logger.info(f"Using cached database: {db_path}") + return db_path + + # If not cached or ID unknown, we must fetch metadata + pangenomes = list_pangenomes_from_object(berdl_table_id, auth_token, kb_env) + if not pangenomes: + raise ValueError(f"No pangenomes found in {berdl_table_id}") + + if target_id: + # Verify and find handle + found = next((p for p in pangenomes if p["pangenome_id"] == target_id), None) + if not found: + raise ValueError(f"Pangenome '{target_id}' not found in {berdl_table_id}") + handle_ref = found["handle_ref"] + else: + # Auto-resolve: take the first one + found = pangenomes[0] + target_id = found["pangenome_id"] + handle_ref = found["handle_ref"] + + # Re-check cache with resolved ID + db_path = db_dir / f"{target_id}.db" + if db_path.exists(): + logger.info(f"Using cached database: {db_path} (resolved ID: {target_id})") + return db_path + + # Download + client = KBaseClient(auth_token, kb_env, cache_dir) + db_path = client.download_blob_file(handle_ref, db_path) + + logger.info(f"Downloaded database to: {db_path}") + return db_path + + +def get_object_info( + object_ref: str, + auth_token: str, + kb_env: str = "appdev" +) -> Dict[str, Any]: + """ + Get basic object info without full data. + + Args: + object_ref: KBase workspace reference + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + Object metadata + """ + client = KBaseClient(auth_token, kb_env) + return client.get_object(object_ref) diff --git a/docs/QUICKSTART_DEMO.md b/docs/QUICKSTART_DEMO.md new file mode 100644 index 0000000..b06de7d --- /dev/null +++ b/docs/QUICKSTART_DEMO.md @@ -0,0 +1,50 @@ +# Quickstart Demo + +This guide walks you through running the TableScanner demo locally. + +## Prerequisites + +- Python 3.9+ +- KBase Auth Token (for accessing workspace objects) + +## Setup + +1. **Install Dependencies** + ```bash + pip install -r requirements.txt + ``` + +2. **Start the Service** + ```bash + uv run fastapi dev app/main.py + ``` + Server will start at `http://localhost:8000`. + +## Running the Demo + +1. Open the [Viewer](http://localhost:8000/static/viewer.html) in your browser. + +2. **Configuration:** + - **Environment**: Select `AppDev` (or appropriate env). + - **Auth Token**: Enter your KBase token. + +3. **Load Data:** + - **BERDL Table ID**: Enter `76990/ADP1Test`. + - Click the **Search** icon. + +4. **Explore:** + - Since `76990/ADP1Test` contains only one pangenome, it will be **auto-selected**. + - Tables will load automatically. + - Select a table (e.g., "Genome attributes") to view data. + - Hover over cells with IDs (UniProt, KEGG, etc.) to see tooltips. + - Click IDs to visit external databases. + +## Multi-Pangenome Demo + +To test loading multiple identifiers: + +1. **BERDL Table ID**: Enter `76990/ADP1Test, 76990/ADP1Test` (simulating two sources). +2. 
Click **Search**. +3. The **Pangenome** dropdown will appear. +4. Options will show as: `ADP1 [76990/ADP1Test]`. +5. Select different options to toggle between datasets (if they were different). diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md new file mode 100644 index 0000000..e7e56f9 --- /dev/null +++ b/docs/USAGE_GUIDE.md @@ -0,0 +1,87 @@ +# Usage Guide + +This guide covers production usage of the TableScanner service. + +## API Endpoint +The service is deployed at: +``` +https://appdev.kbase.us/services/berdl_table_scanner +``` + +## Python API Usage + +You can interact with the service programmatically using Python's `requests` library. + +### 1. Listing Pangenomes +```python +import requests + +service_url = "https://appdev.kbase.us/services/berdl_table_scanner" +token = "YOUR_KBASE_TOKEN" +berdl_id = "76990/ADP1Test" + +headers = {"Authorization": token} +params = {"berdl_table_id": berdl_id} + +response = requests.get(f"{service_url}/pangenomes", headers=headers, params=params) +data = response.json() + +print(f"Found {data['pangenome_count']} pangenomes") +for pg in data['pangenomes']: + print(f"- {pg['pangenome_id']} (Source: {pg['source_berdl_id']})") +``` + +### 2. Querying Table Data + +Query table data with filtering and column selection. + +```python +headers = {"Authorization": token} + +# Get data from "Conditions" table +berdl_id = "76990/ADP1Test" +table_name = "Conditions" + +payload = { + "berdl_table_id": berdl_id, + "table_name": table_name, + "columns": "Database_ID, Name", + "col_filter": { + "Name": "test" + }, + "order_by": [ + {"column": "Name", "direction": "ASC"} + ], + "limit": 5, + "offset": 0 +} + +response = requests.post(f"{service_url}/table-data", json=payload, headers=headers) +data = response.json() + +print(f"Loaded {data['row_count']} rows from {table_name}") +print(f"Headers: {data['headers']}") +``` + +## Multi-Source Querying + +The `/pangenomes` endpoint supports multiple comma-separated BERDL IDs. + +```python +multi_params = { + "berdl_table_id": "76990/ADP1Test, 12345/AnotherTable" +} + +response = requests.get(f"{service_url}/pangenomes", headers=headers, params=multi_params) +# Returns pangenomes from BOTH objects in a single list +``` + +## Viewer Usage + +The web viewer is available at: +`https://appdev.kbase.us/services/berdl_table_scanner/static/viewer.html` + +1. Enter **Auth Token**. +2. Enter **BERDL Table ID(s)** (comma-separated). +3. Click **Search**. +4. Use the interface to filter, sort, and export data. diff --git a/pyproject.toml b/pyproject.toml index fd3c46f..2e6923c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,16 @@ dependencies = [ "minio>=7.2.20", "pydantic-settings>=2.0.0", "requests>=2.31.0", + "pandas>=2.2.0", + "PyYAML>=6.0", + "tqdm>=4.64.0", + "itables>=1.5.0", + "ipywidgets>=8.0.0", ] [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["app*"] diff --git a/static/viewer.html b/static/viewer.html new file mode 100644 index 0000000..98d7449 --- /dev/null +++ b/static/viewer.html @@ -0,0 +1,1519 @@ + + + +
+<!-- 1,519 lines of viewer.html markup omitted here; the page's initial
+     placeholder text reads "Enter BERDLTable ID and click search". -->