From eab280c06193c6dc1d5a314f825306049ab55592 Mon Sep 17 00:00:00 2001 From: VibhavSetlur Date: Mon, 15 Dec 2025 18:31:11 -0600 Subject: [PATCH 1/5] Implemented local service --- .env.example | 38 +- .gitignore | 8 + app/config.py | 58 +- app/main.py | 49 +- app/models.py | 277 ++++++++- app/routes.py | 618 ++++++++++++++++--- app/utils/__init__.py | 56 +- app/utils/cache.py | 295 ++++++++- app/utils/download.py | 25 - app/utils/sqlite.py | 450 ++++++++++---- app/utils/workspace.py | 430 ++++++++++++- pyproject.toml | 3 + static/viewer.html | 1295 ++++++++++++++++++++++++++++++++++++++++ 13 files changed, 3322 insertions(+), 280 deletions(-) delete mode 100644 app/utils/download.py create mode 100644 static/viewer.html diff --git a/.env.example b/.env.example index 55428ec..49fbbf7 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,43 @@ # TableScanner Environment Variables # Copy this file to .env and fill in your actual values +# ============================================================================= +# AUTHENTICATION +# ============================================================================= # KBase Service Authentication Token -KB_SERVICE_AUTH_TOKEN=your_kbase_token_here +# For development testing, use your personal token from KBase +KB_SERVICE_AUTH_TOKEN=YOUR_KBASE_TOKEN_HERE -# Cache directory for storing downloaded files and SQLite databases +# ============================================================================= +# CACHE SETTINGS +# ============================================================================= +# Cache directory for storing downloaded SQLite databases CACHE_DIR=/tmp/tablescanner_cache +# Maximum age of cached files in hours (default: 24) +CACHE_MAX_AGE_HOURS=24 + +# ============================================================================= +# KBASE SERVICE URLS +# ============================================================================= # KBase Workspace Service URL -WORKSPACE_URL=https://kbase.us/services/ws \ No newline at end of file +WORKSPACE_URL=https://appdev.kbase.us/services/ws + +# Base URL for KBase services +KBASE_ENDPOINT=https://appdev.kbase.us/services + +# KBase Blobstore/Shock service URL +BLOBSTORE_URL=https://appdev.kbase.us/services/shock-api + +# ============================================================================= +# APPLICATION SETTINGS +# ============================================================================= +# Enable debug mode (true/false) +DEBUG=false + +# ============================================================================= +# TEST DATA (AppDev) +# ============================================================================= +# Test BERDLTable object: 76990/ADP1Test +# Test pangenome: GCF_000368685.1 +# Narrative: https://appdev.kbase.us/narrative/76990 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 18db92a..172f46b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +trash/ + .DS_Store .idea test/test.cfg @@ -20,3 +22,9 @@ venv/ # Environment variables .env + +# External libraries (cloned) +lib/ + +# Cache directory +cache/ diff --git a/app/config.py b/app/config.py index 8de5aaf..20cb648 100644 --- a/app/config.py +++ b/app/config.py @@ -1,18 +1,64 @@ """ Configuration settings for TableScanner application. + +Loads configuration from environment variables and .env file. +All KBase service URLs and authentication settings are managed here. 
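A minimal illustration of how the rest of the application consumes these settings (a sketch mirroring the usage in app/routes.py; the cache path shown is just the documented default):

    from pathlib import Path
    from app.config import settings

    cache_dir = Path(settings.CACHE_DIR)   # /tmp/tablescanner_cache by default
    if settings.DEBUG:
        print("debug logging enabled")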
""" from pydantic_settings import BaseSettings +from pydantic import Field class Settings(BaseSettings): - """Application settings.""" + """ + Application settings loaded from environment variables. + + Create a .env file based on .env.example to configure locally. + """ + + # ========================================================================== + # AUTHENTICATION + # ========================================================================== + KB_SERVICE_AUTH_TOKEN: str = Field( + ..., + description="KBase authentication token for API access" + ) + + # ========================================================================== + # CACHE SETTINGS + # ========================================================================== + CACHE_DIR: str = Field( + default="/tmp/tablescanner_cache", + description="Directory for caching downloaded files and SQLite databases" + ) + CACHE_MAX_AGE_HOURS: int = Field( + default=24, + description="Maximum age of cached files in hours before re-download" + ) - KB_SERVICE_AUTH_TOKEN: str - CACHE_DIR: str + # ========================================================================== + # KBASE SERVICE URLS + # ========================================================================== + WORKSPACE_URL: str = Field( + default="https://kbase.us/services/ws", + description="KBase Workspace service URL" + ) + KBASE_ENDPOINT: str = Field( + default="https://kbase.us/services", + description="Base URL for KBase services" + ) + BLOBSTORE_URL: str = Field( + default="https://kbase.us/services/shock-api", + description="KBase blobstore/shock service URL" + ) - # KBase Workspace settings - WORKSPACE_URL: str + # ========================================================================== + # APPLICATION SETTINGS + # ========================================================================== + DEBUG: bool = Field( + default=False, + description="Enable debug mode with verbose logging" + ) class Config: env_file = ".env" @@ -20,5 +66,5 @@ class Config: case_sensitive = True -# Global settings instance +# Global settings instance - loaded at module import settings = Settings() \ No newline at end of file diff --git a/app/main.py b/app/main.py index 70aef20..2ad5f90 100644 --- a/app/main.py +++ b/app/main.py @@ -1,10 +1,17 @@ """ TableScanner FastAPI Application -Main application factory module. +Main application factory module for the TableScanner service. +Provides REST API endpoints for querying BERDL table data. + +Run with: uv run fastapi dev app/main.py """ +from pathlib import Path from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware + from app.routes import router from app.config import settings @@ -13,21 +20,55 @@ def create_app() -> FastAPI: """ Application factory function. + Creates and configures the FastAPI application with: + - CORS middleware for browser access + - Static file serving for viewer.html + - API routes + Returns: FastAPI: Configured FastAPI application instance """ app = FastAPI( title="TableScanner", - description="API for table scanning operations", - version="1.0.0" + description=""" +## TableScanner API + +A FastAPI service for querying BERDL table data from KBase. + +### Features +- List pangenomes from BERDLTables objects +- List tables within a pangenome +- Query table data with filtering, sorting, and pagination +- Local caching for performance + +### Authentication +Pass your KBase auth token in the `Authorization` header. 
+ """, + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", + ) + + # Enable CORS for browser-based access + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Allow all origins for development + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], ) # Store settings in app state for access throughout the application app.state.settings = settings - # Include routes + # Include API routes app.include_router(router) + # Mount static files directory for viewer.html + static_dir = Path(__file__).parent.parent / "static" + if static_dir.exists(): + app.mount("/static", StaticFiles(directory=static_dir), name="static") + return app diff --git a/app/models.py b/app/models.py index 2aeb79e..a4b7ddf 100644 --- a/app/models.py +++ b/app/models.py @@ -1,15 +1,274 @@ """ -Pydantic models for request/response schemas. +Pydantic models for TableScanner API. + +Defines strictly typed request/response schemas for clean /docs output. +All models use Field with descriptions and examples for documentation. """ -from typing import Optional, List, Dict, Any -from pydantic import BaseModel +from typing import List, Dict, Optional, Any, Literal +from pydantic import BaseModel, Field + + +# ============================================================================= +# REQUEST MODELS +# ============================================================================= + +class OrderSpec(BaseModel): + """Specification for ordering/sorting query results.""" + column: str = Field(..., description="Column name to sort by") + order: Literal["ASC", "DESC"] = Field( + "ASC", + description="Sort direction: ASC (ascending) or DESC (descending)" + ) + + +class FilterSpec(BaseModel): + """Specification for column-specific filtering.""" + column: str = Field(..., description="Column name to filter") + value: str = Field(..., description="Filter value (uses LIKE matching)") + operator: Literal["LIKE", "=", ">", "<", ">=", "<="] = Field( + "LIKE", + description="Filter operator" + ) class SearchRequest(BaseModel): - """Search request with query parameters.""" - pangenome_id: str - table_name: str - limit: Optional[int] = None - order_by: Optional[List[Dict[str, str]]] = None - filters: Optional[List[Dict[str, Any]]] = None \ No newline at end of file + """ + Request model for /search endpoint. + + Provides a flexible interface for searching table data with + optional filtering, sorting, and pagination. + """ + berdl_table_id: str = Field( + ..., + description="BERDLTables object reference (e.g., '76990/ADPITest')", + examples=["76990/ADPITest"] + ) + pangenome_id: Optional[str] = Field( + None, + description="Pangenome ID within the BERDLTables object. Uses first available if not specified." 
+ ) + table_name: str = Field( + ..., + description="Name of the table to query", + examples=["Genes", "Organisms"] + ) + limit: int = Field( + 100, + ge=1, + le=500000, + description="Maximum number of rows to return" + ) + offset: int = Field( + 0, + ge=0, + description="Number of rows to skip (for pagination)" + ) + search_value: Optional[str] = Field( + None, + description="Global search term (searches all columns)" + ) + order_by: Optional[List[Dict[str, str]]] = Field( + None, + description="List of {column, order} dicts for sorting", + examples=[[{"column": "gene_name", "order": "ASC"}]] + ) + filters: Optional[List[Dict[str, str]]] = Field( + None, + description="List of column filters [{column, value}]" + ) + kb_env: str = Field( + "appdev", + description="KBase environment: appdev, ci, or prod" + ) + + +class TableDataRequest(BaseModel): + """ + Request model for /table-data endpoint. + + Mirrors the parameters from the original BERDLTable_conversion_service + for API compatibility. + """ + berdl_table_id: str = Field( + ..., + description="BERDLTables object reference", + examples=["76990/ADPITest"] + ) + pangenome_id: str = Field( + ..., + description="Pangenome ID to query", + examples=["pg_default"] + ) + table_name: str = Field( + ..., + description="Table name within the SQLite database", + examples=["Genes"] + ) + limit: int = Field( + 100, + ge=1, + le=500000, + description="Maximum rows to return" + ) + offset: int = Field( + 0, + ge=0, + description="Offset for pagination" + ) + sort_column: Optional[str] = Field( + None, + description="Column to sort by" + ) + sort_order: Optional[Literal["ASC", "DESC"]] = Field( + "ASC", + description="Sort direction" + ) + search_value: Optional[str] = Field( + None, + description="Global search term" + ) + query_filters: Optional[Dict[str, str]] = Field( + None, + description="Column-specific filters {column_name: filter_value}", + examples=[{"gene_name": "kinase", "organism": "E. 
coli"}] + ) + kb_env: str = Field( + "appdev", + description="KBase environment" + ) + + +# ============================================================================= +# RESPONSE MODELS +# ============================================================================= + +class TableColumn(BaseModel): + """Information about a table column.""" + name: str = Field(..., description="Column name") + type: Optional[str] = Field(None, description="Column data type") + + +class TableInfo(BaseModel): + """Information about a database table.""" + name: str = Field(..., description="Table name") + row_count: Optional[int] = Field(None, description="Number of rows") + column_count: Optional[int] = Field(None, description="Number of columns") + + +class TableListResponse(BaseModel): + """Response for listing tables in a pangenome database.""" + pangenome_id: str = Field(..., description="Pangenome identifier") + tables: List[TableInfo] = Field( + default_factory=list, + description="List of available tables" + ) + + +class PangenomeInfo(BaseModel): + """Information about a pangenome within a BERDLTables object.""" + pangenome_id: Optional[str] = Field(None, description="Unique pangenome identifier") + pangenome_taxonomy: Optional[str] = Field(None, description="Taxonomic classification") + user_genomes: List[str] = Field( + default_factory=list, + description="List of user-provided genome references" + ) + berdl_genomes: List[str] = Field( + default_factory=list, + description="List of BERDL/Datalake genome identifiers" + ) + handle_ref: Optional[str] = Field( + None, + description="Blobstore handle reference for SQLite database" + ) + + +class PangenomesResponse(BaseModel): + """Response for listing pangenomes from a BERDLTables object.""" + pangenomes: List[PangenomeInfo] = Field( + default_factory=list, + description="List of available pangenomes" + ) + + +class TableDataResponse(BaseModel): + """ + Response for table data queries. + + Includes the data, metadata, and performance metrics. 
+ """ + headers: List[str] = Field( + ..., + description="Column names in order" + ) + data: List[List[str]] = Field( + ..., + description="Row data as list of lists" + ) + row_count: int = Field( + ..., + description="Number of rows in this response" + ) + total_count: int = Field( + ..., + description="Total rows in table (before filtering)" + ) + filtered_count: int = Field( + ..., + description="Rows matching filter criteria" + ) + table_name: str = Field( + ..., + description="Name of the queried table" + ) + pangenome_id: str = Field( + ..., + description="Pangenome identifier" + ) + response_time_ms: float = Field( + ..., + description="Total response time in milliseconds" + ) + db_query_ms: Optional[float] = Field( + None, + description="Database query time in milliseconds" + ) + conversion_ms: Optional[float] = Field( + None, + description="Data conversion time in milliseconds" + ) + source: Optional[str] = Field( + None, + description="Data source (Cache or Downloaded)" + ) + cache_file: Optional[str] = Field( + None, + description="Path to cached file" + ) + sqlite_file: Optional[str] = Field( + None, + description="Path to SQLite database" + ) + + +class CacheResponse(BaseModel): + """Response for cache operations.""" + status: Literal["success", "error"] = Field( + ..., + description="Operation status" + ) + message: str = Field( + ..., + description="Status message" + ) + + +class ServiceStatus(BaseModel): + """Service health check response.""" + service: str = Field(..., description="Service name") + version: str = Field(..., description="Service version") + status: Literal["running", "degraded", "error"] = Field( + ..., + description="Service status" + ) + cache_dir: str = Field(..., description="Cache directory path") \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index c05a16b..08e461f 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,92 +1,564 @@ """ -TableScanner API Routes +TableScanner API Routes. -Contains all API endpoint definitions. +REST API Structure (per architecture diagram): +- GET /{handle_ref}/tables - List tables in SQLite from handle +- GET /{handle_ref}/tables/{table}/schema - Get table schema +- GET /{handle_ref}/tables/{table}/data - Query table data with pagination +- GET /object/{ws_ref}/tables - List tables from KBase object +- GET /object/{ws_ref}/tables/{table}/data - Query via KBase object ref + +Also supports legacy endpoints for backwards compatibility. 
""" +import time +import logging from pathlib import Path -from fastapi import APIRouter, Request, HTTPException +from typing import Optional + +from fastapi import APIRouter, HTTPException, Header, Query +from fastapi.responses import JSONResponse -from app.models import SearchRequest -from app.utils.workspace import get_object_info -from app.utils.download import download_from_handle -from app.utils.cache import get_cache_paths, save_to_cache, is_cached -from app.utils.sqlite import convert_to_sqlite +from app.models import ( + SearchRequest, + TableDataRequest, + TableDataResponse, + PangenomesResponse, + PangenomeInfo, + TableListResponse, + TableInfo, + CacheResponse, + ServiceStatus, +) +from app.utils.workspace import ( + get_berdl_table_data, + list_pangenomes_from_object, + download_pangenome_db, +) +from app.utils.sqlite import ( + list_tables, + get_table_data, + get_table_columns, + get_table_row_count, + validate_table_exists, + ensure_indices, +) +from app.utils.cache import ( + is_cached, + get_cache_paths, + clear_cache, + list_cached_items, + cleanup_old_caches, +) +from app.config import settings +# Configure module logger +logger = logging.getLogger(__name__) + +# Create router router = APIRouter() -@router.get("/") -async def root(request: Request): - """Root endpoint returning service information.""" - settings = request.app.state.settings - return { - "service": "TableScanner", - "version": "1.0.0", - "status": "running", - "cache_dir": settings.CACHE_DIR - } +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +def get_auth_token(authorization: Optional[str] = None) -> str: + """Extract auth token from header or settings.""" + if authorization: + if authorization.startswith("Bearer "): + return authorization[7:] + return authorization + + if settings.KB_SERVICE_AUTH_TOKEN: + return settings.KB_SERVICE_AUTH_TOKEN + + raise HTTPException( + status_code=401, + detail="Authorization token required" + ) + + +def get_cache_dir() -> Path: + """Get configured cache directory.""" + return Path(settings.CACHE_DIR) -@router.post("/search") -def search(request: Request, search_request: SearchRequest): +# ============================================================================= +# SERVICE STATUS +# ============================================================================= + +@router.get("/", response_model=ServiceStatus) +async def root(): + """Service health check.""" + return ServiceStatus( + service="TableScanner", + version="1.0.0", + status="running", + cache_dir=str(settings.CACHE_DIR) + ) + + +# ============================================================================= +# HANDLE-BASED ENDPOINTS (Primary REST API per diagram) +# /{handle_ref}/tables - List tables +# /{handle_ref}/tables/{table}/schema - Table schema +# /{handle_ref}/tables/{table}/data - Table data with pagination +# ============================================================================= + +@router.get("/handle/{handle_ref}/tables") +async def list_tables_by_handle( + handle_ref: str, + kb_env: str = Query("appdev", description="KBase environment"), + authorization: Optional[str] = Header(None) +): + """ + List all tables in a SQLite database accessed via handle reference. + + The handle_ref is the KBase blobstore handle (e.g., KBH_248028). """ - Search endpoint with flexible querying. 
+ try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + # Download SQLite from handle + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + # Cache path based on handle + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + # List tables + table_names = list_tables(db_path) + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append({ + "name": name, + "row_count": row_count, + "column_count": len(columns) + }) + except Exception as e: + logger.warning(f"Error getting info for {name}: {e}") + tables.append({"name": name}) + + return { + "handle_ref": handle_ref, + "tables": tables, + "db_path": str(db_path) + } + + except Exception as e: + logger.error(f"Error listing tables from handle: {e}") + raise HTTPException(status_code=500, detail=str(e)) - Args: - search_request: Search parameters including pangenome_id, table_name, limit, order_by, filters - Returns: - A dictionary with search results +@router.get("/handle/{handle_ref}/tables/{table_name}/schema") +async def get_table_schema_by_handle( + handle_ref: str, + table_name: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): """ - settings = request.app.state.settings - token = settings.KB_SERVICE_AUTH_TOKEN - cache_dir = Path(settings.CACHE_DIR) - workspace_url = settings.WORKSPACE_URL - - # TODO: Use the users token instead of a static one - - # Get object info from KBase Workspace - object_info = get_object_info(search_request.pangenome_id, token, workspace_url) - filename = object_info.get('filename', f'{search_request.pangenome_id}.bin') - handle_url = object_info.get('handle_url') or object_info.get('blobstore_url') - - if not handle_url: - raise HTTPException( - status_code=404, - detail=f"No handle/blobstore URL found for id: {search_request.pangenome_id}" + Get schema (columns) for a table accessed via handle reference. + """ + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. 
Available: {available}") + + columns = get_table_columns(db_path, table_name) + row_count = get_table_row_count(db_path, table_name) + + return { + "handle_ref": handle_ref, + "table_name": table_name, + "columns": columns, + "row_count": row_count + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error getting schema: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/handle/{handle_ref}/tables/{table_name}/data") +async def get_table_data_by_handle( + handle_ref: str, + table_name: str, + limit: int = Query(100, ge=1, le=500000), + offset: int = Query(0, ge=0), + sort_column: Optional[str] = Query(None), + sort_order: Optional[str] = Query("ASC"), + search: Optional[str] = Query(None, description="Global search term"), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + Query table data from SQLite via handle reference. + + Supports: + - Pagination: limit, offset + - Sorting: sort_column, sort_order + - Search: global search across all columns + """ + start_time = time.time() + + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + from app.utils.workspace import KBaseClient + client = KBaseClient(token, kb_env, cache_dir) + + safe_handle = handle_ref.replace(":", "_").replace("/", "_") + db_path = cache_dir / "handles" / f"{safe_handle}.db" + + if not db_path.exists(): + client.download_blob_file(handle_ref, db_path) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. Available: {available}") + + # Query data + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=table_name, + limit=limit, + offset=offset, + sort_column=sort_column, + sort_order=sort_order, + search_value=search, ) + + response_time_ms = (time.time() - start_time) * 1000 + + return { + "handle_ref": handle_ref, + "table_name": table_name, + "headers": headers, + "data": data, + "row_count": len(data), + "total_count": total_count, + "filtered_count": filtered_count, + "response_time_ms": response_time_ms, + "db_query_ms": db_query_ms + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error querying data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# OBJECT-BASED ENDPOINTS (via KBase workspace object reference) +# /object/{ws_ref}/pangenomes - List pangenomes from BERDLTables object +# /object/{ws_ref}/pangenomes/{pg_id}/tables - List tables for a pangenome +# /object/{ws_ref}/pangenomes/{pg_id}/tables/{table}/data - Query data +# ============================================================================= + +@router.get("/object/{ws_id}/{obj_name}/pangenomes") +async def list_pangenomes_by_object( + ws_id: str, + obj_name: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + List pangenomes from a BERDLTables/GenomeDataLakeTables object. 
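    Example (illustrative): GET /object/76990/ADP1Test/pangenomes returns a payload such as

        {
            "berdl_table_id": "76990/ADP1Test",
            "pangenomes": [
                {
                    "pangenome_id": "GCF_000368685.1",
                    "pangenome_taxonomy": null,
                    "user_genomes": [],
                    "berdl_genomes": ["GCF_000368685.1"],
                    "handle_ref": "KBH_248028"
                }
            ]
        }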
+ """ + try: + token = get_auth_token(authorization) + berdl_table_id = f"{ws_id}/{obj_name}" + + pangenomes = list_pangenomes_from_object( + berdl_table_id=berdl_table_id, + auth_token=token, + kb_env=kb_env + ) + + return { + "berdl_table_id": berdl_table_id, + "pangenomes": pangenomes + } + + except Exception as e: + logger.error(f"Error listing pangenomes: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/object/{ws_id}/{obj_name}/pangenomes/{pangenome_id}/tables") +async def list_tables_by_object( + ws_id: str, + obj_name: str, + pangenome_id: str, + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + List tables for a specific pangenome within a BERDLTables object. + """ + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + berdl_table_id = f"{ws_id}/{obj_name}" + + db_path = download_pangenome_db( + berdl_table_id=berdl_table_id, + pangenome_id=pangenome_id, + auth_token=token, + cache_dir=cache_dir, + kb_env=kb_env + ) + + table_names = list_tables(db_path) + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append({ + "name": name, + "row_count": row_count, + "column_count": len(columns) + }) + except Exception as e: + tables.append({"name": name}) + + return { + "berdl_table_id": berdl_table_id, + "pangenome_id": pangenome_id, + "tables": tables + } + + except Exception as e: + logger.error(f"Error listing tables: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/object/{ws_id}/{obj_name}/pangenomes/{pangenome_id}/tables/{table_name}/data") +async def get_table_data_by_object( + ws_id: str, + obj_name: str, + pangenome_id: str, + table_name: str, + limit: int = Query(100, ge=1, le=500000), + offset: int = Query(0, ge=0), + sort_column: Optional[str] = Query(None), + sort_order: Optional[str] = Query("ASC"), + search: Optional[str] = Query(None), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """ + Query table data from a pangenome within a BERDLTables object. + """ + start_time = time.time() + + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + berdl_table_id = f"{ws_id}/{obj_name}" + + db_path = download_pangenome_db( + berdl_table_id=berdl_table_id, + pangenome_id=pangenome_id, + auth_token=token, + cache_dir=cache_dir, + kb_env=kb_env + ) + + if not validate_table_exists(db_path, table_name): + available = list_tables(db_path) + raise HTTPException(404, f"Table '{table_name}' not found. 
Available: {available}") + + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=table_name, + limit=limit, + offset=offset, + sort_column=sort_column, + sort_order=sort_order, + search_value=search, + ) + + response_time_ms = (time.time() - start_time) * 1000 + + return { + "berdl_table_id": berdl_table_id, + "pangenome_id": pangenome_id, + "table_name": table_name, + "headers": headers, + "data": data, + "row_count": len(data), + "total_count": total_count, + "filtered_count": filtered_count, + "response_time_ms": response_time_ms, + "db_query_ms": db_query_ms, + "sqlite_file": str(db_path) + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error querying data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# LEGACY ENDPOINTS (for backwards compatibility) +# ============================================================================= + +@router.get("/pangenomes", response_model=PangenomesResponse) +async def get_pangenomes( + berdl_table_id: str = Query(..., description="BERDLTables object reference"), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """List pangenomes from BERDLTables object (legacy endpoint).""" + try: + token = get_auth_token(authorization) + pangenomes = list_pangenomes_from_object(berdl_table_id, token, kb_env) + return PangenomesResponse(pangenomes=[PangenomeInfo(**pg) for pg in pangenomes]) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/tables/{pangenome_id}", response_model=TableListResponse) +async def get_tables( + pangenome_id: str, + berdl_table_id: str = Query(...), + kb_env: str = Query("appdev"), + authorization: Optional[str] = Header(None) +): + """List tables for a pangenome (legacy endpoint).""" + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + + db_path = download_pangenome_db(berdl_table_id, pangenome_id, token, cache_dir, kb_env) + table_names = list_tables(db_path) + + tables = [] + for name in table_names: + try: + columns = get_table_columns(db_path, name) + row_count = get_table_row_count(db_path, name) + tables.append(TableInfo(name=name, row_count=row_count, column_count=len(columns))) + except: + tables.append(TableInfo(name=name)) + + return TableListResponse(pangenome_id=pangenome_id, tables=tables) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/table-data", response_model=TableDataResponse) +async def query_table_data( + request: TableDataRequest, + authorization: Optional[str] = Header(None) +): + """Query table data (legacy POST endpoint).""" + start_time = time.time() + + try: + token = get_auth_token(authorization) + cache_dir = get_cache_dir() + kb_env = getattr(request, 'kb_env', 'appdev') or 'appdev' + + db_path = download_pangenome_db( + request.berdl_table_id, request.pangenome_id, token, cache_dir, kb_env + ) + + if not validate_table_exists(db_path, request.table_name): + available = list_tables(db_path) + raise ValueError(f"Table '{request.table_name}' not found. 
Available: {available}") + + try: + ensure_indices(db_path, request.table_name) + except: + pass + + headers, data, total_count, filtered_count, db_query_ms, conversion_ms = get_table_data( + sqlite_file=db_path, + table_name=request.table_name, + limit=request.limit, + offset=request.offset, + sort_column=request.sort_column, + sort_order=request.sort_order, + search_value=request.search_value, + query_filters=request.query_filters, + ) + + response_time_ms = (time.time() - start_time) * 1000 + + return TableDataResponse( + headers=headers, + data=data, + row_count=len(data), + total_count=total_count, + filtered_count=filtered_count, + table_name=request.table_name, + pangenome_id=request.pangenome_id, + response_time_ms=response_time_ms, + db_query_ms=db_query_ms, + conversion_ms=conversion_ms, + source="Cache" if is_cached(db_path) else "Downloaded", + cache_file=str(db_path), + sqlite_file=str(db_path) + ) + + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Error querying table data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# CACHE MANAGEMENT +# ============================================================================= + +@router.post("/clear-cache", response_model=CacheResponse) +async def clear_pangenome_cache( + berdl_table_id: Optional[str] = Query(None) +): + """Clear cached databases.""" + try: + cache_dir = get_cache_dir() + result = clear_cache(cache_dir, berdl_table_id) + return CacheResponse(status="success", message=result.get("message", "Cache cleared")) + except Exception as e: + return CacheResponse(status="error", message=str(e)) - # Get cache paths - cache_file_path, sqlite_file_path = get_cache_paths(cache_dir, search_request.pangenome_id, filename) - - # Download and cache if not already cached - if not is_cached(cache_file_path): - # Download from handle/blobstore service - binary_data = download_from_handle(handle_url, token) - save_to_cache(cache_file_path, binary_data) - - # Convert to SQLite if not already converted - if not is_cached(sqlite_file_path): - convert_to_sqlite(cache_file_path, sqlite_file_path) - - # Query the SQLite file with parameters - from app.utils.sqlite import get_table_data - results = get_table_data( - sqlite_file_path, - table_name=search_request.table_name, - limit=search_request.limit, - order_by=search_request.order_by, - filters=search_request.filters, - ) - #TODO use a return model when we figure out what we want to return - return { - "pangenome_id": search_request.pangenome_id, - "table_name": search_request.table_name, - "status": "success", - "cache_file": str(cache_file_path), - "sqlite_file": str(sqlite_file_path), - "row_count": len(results), - "results": results - } +@router.get("/cache") +async def list_cache(): + """List cached items.""" + cache_dir = get_cache_dir() + items = list_cached_items(cache_dir) + return {"cache_dir": str(cache_dir), "items": items, "total": len(items)} diff --git a/app/utils/__init__.py b/app/utils/__init__.py index 4521156..efa6608 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -1,22 +1,66 @@ """ Utils module for TableScanner. -Contains business logic separated from route handlers. 
+Contains business logic for: +- KBase Workspace API interactions via KBUtilLib +- Blobstore/Shock downloading +- Local file caching with age-based expiration +- SQLite database querying with filtering/sorting/pagination """ -from app.utils.download import download_from_handle -from app.utils.workspace import get_object_info -from app.utils.cache import get_cache_paths, ensure_cache_dir, save_to_cache, is_cached -from app.utils.sqlite import convert_to_sqlite, query_sqlite, get_table_data +from app.utils.workspace import ( + get_berdl_table_data, + list_pangenomes_from_object, + find_pangenome_handle, + download_pangenome_db, + get_object_info, + KBaseClient, +) +from app.utils.cache import ( + get_cache_paths, + ensure_cache_dir, + save_to_cache, + is_cached, + clear_cache, + list_cached_items, + cleanup_old_caches, +) +from app.utils.sqlite import ( + convert_to_sqlite, + query_sqlite, + get_table_data, + list_tables, + get_table_columns, + get_table_row_count, + validate_table_exists, + ensure_indices, +) __all__ = [ - "download_from_handle", + # Workspace utilities + "get_berdl_table_data", + "list_pangenomes_from_object", + "find_pangenome_handle", + "download_pangenome_db", "get_object_info", + "KBaseClient", + + # Cache utilities "get_cache_paths", "ensure_cache_dir", "save_to_cache", "is_cached", + "clear_cache", + "list_cached_items", + "cleanup_old_caches", + + # SQLite utilities "convert_to_sqlite", "query_sqlite", "get_table_data", + "list_tables", + "get_table_columns", + "get_table_row_count", + "validate_table_exists", + "ensure_indices", ] diff --git a/app/utils/cache.py b/app/utils/cache.py index cb4cc0f..e3f30d1 100644 --- a/app/utils/cache.py +++ b/app/utils/cache.py @@ -1,27 +1,78 @@ """ Cache utilities for managing local file caching. + +Implements efficient caching for downloaded BERDLTables SQLite databases +with age-based expiration and cleanup. + +Cache Structure: + {CACHE_DIR}/ + {berdl_table_id}/ + {pangenome_id}.db # SQLite database + metadata.json # Cache metadata (timestamps, checksums) """ +import json +import time +import shutil +import logging from pathlib import Path -from typing import Tuple +from typing import Tuple, Optional, Dict, Any, List +from datetime import datetime + +# Configure module logger +logger = logging.getLogger(__name__) + + +# ============================================================================= +# CACHE PATH UTILITIES +# ============================================================================= + +def sanitize_id(id_string: str) -> str: + """ + Sanitize an ID string for use as a filesystem path. + + Args: + id_string: Raw ID (may contain / : and other special chars) + + Returns: + Safe string for filesystem use + """ + return id_string.replace("/", "_").replace(":", "_").replace(" ", "_") -def get_cache_paths(cache_dir: Path, id: str, filename: str) -> Tuple[Path, Path]: +def get_cache_paths( + cache_dir: Path, + berdl_table_id: str, + pangenome_id: str +) -> Tuple[Path, Path]: """ - Get cache file paths for a given ID and filename. + Get cache file paths for a given BERDLTable and pangenome. 
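    Example (illustrative values; sanitize_id above replaces '/' with '_'):

        from pathlib import Path
        subdir, db = get_cache_paths(Path("/tmp/tablescanner_cache"), "76990/ADP1Test", "GCF_000368685.1")
        # subdir -> /tmp/tablescanner_cache/76990_ADP1Test
        # db     -> /tmp/tablescanner_cache/76990_ADP1Test/GCF_000368685.1.db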
Args: cache_dir: Base cache directory - id: Object ID - filename: Original filename + berdl_table_id: BERDLTables object reference + pangenome_id: Pangenome identifier Returns: - Tuple of (cache_file_path, sqlite_file_path) + Tuple of (cache_subdir, sqlite_file_path) """ - cache_file_path = cache_dir / id / filename - sqlite_file_path = cache_dir / id / f"{Path(filename).stem}.db" - return cache_file_path, sqlite_file_path + safe_berdl = sanitize_id(berdl_table_id) + safe_pg = sanitize_id(pangenome_id) + + cache_subdir = cache_dir / safe_berdl + sqlite_path = cache_subdir / f"{safe_pg}.db" + + return cache_subdir, sqlite_path + +def get_metadata_path(cache_subdir: Path) -> Path: + """Get path to cache metadata file.""" + return cache_subdir / "metadata.json" + + +# ============================================================================= +# CACHE MANAGEMENT +# ============================================================================= def ensure_cache_dir(cache_path: Path) -> None: """ @@ -43,16 +94,234 @@ def save_to_cache(cache_path: Path, data: bytes) -> None: """ ensure_cache_dir(cache_path) cache_path.write_bytes(data) + logger.info(f"Saved {len(data)} bytes to cache: {cache_path}") -def is_cached(cache_path: Path) -> bool: +def is_cached(cache_path: Path, max_age_hours: int = 24) -> bool: """ - Check if file exists in cache. + Check if file exists in cache and is not expired. Args: cache_path: Path to cache file + max_age_hours: Maximum age in hours before cache expires + + Returns: + True if valid cache exists, False otherwise + """ + if not cache_path.exists(): + return False + + # Check age + mtime = cache_path.stat().st_mtime + age_hours = (time.time() - mtime) / 3600 + + if age_hours > max_age_hours: + logger.info(f"Cache expired ({age_hours:.1f}h > {max_age_hours}h): {cache_path}") + return False + + logger.debug(f"Valid cache ({age_hours:.1f}h old): {cache_path}") + return True + + +def get_cache_info(cache_path: Path) -> Optional[Dict[str, Any]]: + """ + Get information about a cached file. + + Args: + cache_path: Path to cache file + + Returns: + Dictionary with cache info, or None if not cached + """ + if not cache_path.exists(): + return None + + stat = cache_path.stat() + return { + "path": str(cache_path), + "size_bytes": stat.st_size, + "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "age_hours": (time.time() - stat.st_mtime) / 3600 + } + + +# ============================================================================= +# CACHE METADATA +# ============================================================================= + +def save_cache_metadata( + cache_subdir: Path, + berdl_table_id: str, + pangenome_id: str, + handle_ref: str, + **extra +) -> None: + """ + Save metadata about cached files. 
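    The resulting metadata.json has roughly this shape (values illustrative):

        {
            "berdl_table_id": "76990/ADP1Test",
            "pangenomes": {
                "GCF_000368685.1": {
                    "handle_ref": "KBH_248028",
                    "cached_at": "2025-12-15T18:31:11"
                }
            }
        }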
+ + Args: + cache_subdir: Cache subdirectory for this berdl_table + berdl_table_id: Original BERDLTable reference + pangenome_id: Pangenome identifier + handle_ref: Blobstore handle reference + **extra: Additional metadata to store + """ + metadata_path = get_metadata_path(cache_subdir) + + # Load existing metadata if present + if metadata_path.exists(): + with open(metadata_path) as f: + metadata = json.load(f) + else: + metadata = {"berdl_table_id": berdl_table_id, "pangenomes": {}} + + # Update pangenome entry + metadata["pangenomes"][pangenome_id] = { + "handle_ref": handle_ref, + "cached_at": datetime.now().isoformat(), + **extra + } + + ensure_cache_dir(metadata_path) + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + + +def load_cache_metadata(cache_subdir: Path) -> Optional[Dict[str, Any]]: + """ + Load cache metadata. + + Args: + cache_subdir: Cache subdirectory + + Returns: + Metadata dictionary, or None if not found + """ + metadata_path = get_metadata_path(cache_subdir) + if not metadata_path.exists(): + return None + + with open(metadata_path) as f: + return json.load(f) + +# ============================================================================= +# CACHE CLEANUP +# ============================================================================= + +def clear_cache(cache_dir: Path, berdl_table_id: Optional[str] = None) -> Dict[str, Any]: + """ + Clear cached files. + + Args: + cache_dir: Base cache directory + berdl_table_id: Specific BERDLTable to clear (None for all) + + Returns: + Summary of cleanup operation + """ + if berdl_table_id: + # Clear specific cache + safe_id = sanitize_id(berdl_table_id) + cache_path = cache_dir / safe_id + + if cache_path.exists(): + shutil.rmtree(cache_path) + return { + "status": "success", + "message": f"Cleared cache for {berdl_table_id}", + "path": str(cache_path) + } + else: + return { + "status": "success", + "message": "Cache already empty" + } + else: + # Clear all caches + if cache_dir.exists(): + count = sum(1 for _ in cache_dir.iterdir() if _.is_dir()) + shutil.rmtree(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) + return { + "status": "success", + "message": f"Cleared {count} cached items", + "path": str(cache_dir) + } + else: + return { + "status": "success", + "message": "Cache directory does not exist" + } + + +def cleanup_old_caches(cache_dir: Path, max_age_days: int = 7) -> Dict[str, Any]: + """ + Remove cache directories older than max_age_days. + + Args: + cache_dir: Base cache directory + max_age_days: Maximum age in days + + Returns: + Summary of cleanup operation + """ + if not cache_dir.exists(): + return {"status": "success", "removed": 0} + + now = time.time() + max_age_seconds = max_age_days * 24 * 3600 + removed = [] + + for subdir in cache_dir.iterdir(): + if not subdir.is_dir(): + continue + + try: + mtime = subdir.stat().st_mtime + if now - mtime > max_age_seconds: + shutil.rmtree(subdir) + removed.append(subdir.name) + logger.info(f"Removed old cache: {subdir.name}") + except Exception as e: + logger.warning(f"Failed to clean {subdir}: {e}") + + return { + "status": "success", + "removed": len(removed), + "items": removed + } + + +def list_cached_items(cache_dir: Path) -> List[Dict[str, Any]]: + """ + List all cached BERDLTable items. 
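    Each returned item looks like (values illustrative):

        {
            "id": "76990_ADP1Test",
            "berdl_table_id": "76990/ADP1Test",
            "databases": 1,
            "total_size_bytes": 1048576,
            "pangenomes": ["GCF_000368685.1"]
        }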
+ + Args: + cache_dir: Base cache directory + Returns: - True if file exists, False otherwise + List of cached item info """ - return cache_path.exists() + items = [] + + if not cache_dir.exists(): + return items + + for subdir in sorted(cache_dir.iterdir()): + if not subdir.is_dir(): + continue + + metadata = load_cache_metadata(subdir) + db_files = list(subdir.glob("*.db")) + + item = { + "id": subdir.name, + "berdl_table_id": metadata.get("berdl_table_id") if metadata else subdir.name, + "databases": len(db_files), + "total_size_bytes": sum(f.stat().st_size for f in db_files), + "pangenomes": list(metadata.get("pangenomes", {}).keys()) if metadata else [] + } + items.append(item) + + return items diff --git a/app/utils/download.py b/app/utils/download.py deleted file mode 100644 index 3ee24b1..0000000 --- a/app/utils/download.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -Handle/Blobstore utilities for downloading files. -""" - -import requests - - -def download_from_handle(handle_url: str, auth_token: str) -> bytes: - """ - Download binary file from KBase Handle/Blobstore service. - - Args: - handle_url: URL to the handle/blobstore service - auth_token: KBase authentication token - - Returns: - Binary data - - Raises: - requests.HTTPError: If download fails - """ - headers = {"Authorization": auth_token} - response = requests.get(handle_url, headers=headers) - response.raise_for_status() - return response.content diff --git a/app/utils/sqlite.py b/app/utils/sqlite.py index 0ef7e99..ed4aae9 100644 --- a/app/utils/sqlite.py +++ b/app/utils/sqlite.py @@ -1,56 +1,180 @@ """ SQLite utilities for database conversion and querying. + +This module provides efficient functions for: +- Extracting table data from SQLite databases +- Converting data to 2D array format for JSON serialization +- Filtering, sorting, and pagination +- Index optimization for query performance + +Migrated from: BERDLTable_conversion_service/db_utils.py """ import sqlite3 +import logging +import time from pathlib import Path -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, Tuple +# Configure module logger +logger = logging.getLogger(__name__) -def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None: + +# ============================================================================= +# TABLE LISTING & METADATA +# ============================================================================= + +def list_tables(db_path: Path) -> List[str]: """ - Convert binary file to SQLite database. + List all user tables in a SQLite database. 
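    Example (illustrative path and table names):

        from pathlib import Path
        tables = list_tables(Path("/tmp/tablescanner_cache/76990_ADP1Test/GCF_000368685.1.db"))
        # -> ["Genes", "Organisms"]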
Args: - binary_file: Path to binary file - sqlite_file: Path to output SQLite file + db_path: Path to the SQLite database file + + Returns: + List of table names (excludes sqlite_ system tables) Raises: - NotImplementedError: This function is not yet implemented + sqlite3.Error: If database access fails """ - # TODO: Implement conversion logic based on binary file format - # - # Example implementation for a specific binary format: - # import sqlite3 - # - # # Read and parse binary file - # with open(binary_file, 'rb') as f: - # data = parse_binary_format(f.read()) - # - # # Create SQLite database - # conn = sqlite3.connect(sqlite_file) - # cursor = conn.cursor() - # - # # Create tables - # cursor.execute(''' - # CREATE TABLE IF NOT EXISTS data ( - # id INTEGER PRIMARY KEY, - # column1 TEXT, - # column2 TEXT - # ) - # ''') - # - # # Insert data - # cursor.executemany('INSERT INTO data VALUES (?, ?, ?)', data) - # conn.commit() - # conn.close() - - raise NotImplementedError("SQLite conversion not yet implemented") + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Query for user tables (exclude sqlite_ system tables) + cursor.execute(""" + SELECT name FROM sqlite_master + WHERE type='table' + AND name NOT LIKE 'sqlite_%' + ORDER BY name + """) + + tables = [row[0] for row in cursor.fetchall()] + conn.close() + + logger.info(f"Found {len(tables)} tables in database: {tables}") + return tables + + except sqlite3.Error as e: + logger.error(f"Error listing tables from {db_path}: {e}") + raise + + +def get_table_columns(db_path: Path, table_name: str) -> List[str]: + """ + Get column names for a specific table. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table to query + + Returns: + List of column names + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Use PRAGMA to get table info + cursor.execute(f"PRAGMA table_info({table_name})") + columns = [row[1] for row in cursor.fetchall()] + conn.close() + + return columns + + except sqlite3.Error as e: + logger.error(f"Error getting columns for {table_name}: {e}") + raise + + +def get_table_row_count(db_path: Path, table_name: str) -> int: + """ + Get the total row count for a table. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table + + Returns: + Number of rows in the table + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + count = cursor.fetchone()[0] + conn.close() + + return count + + except sqlite3.Error as e: + logger.error(f"Error counting rows in {table_name}: {e}") + raise + + +def validate_table_exists(db_path: Path, table_name: str) -> bool: + """ + Check if a table exists in the database. + + Args: + db_path: Path to the SQLite database file + table_name: Name of the table to check + + Returns: + True if table exists, False otherwise + """ + tables = list_tables(db_path) + return table_name in tables + + +# ============================================================================= +# INDEX OPTIMIZATION +# ============================================================================= + +def ensure_indices(db_path: Path, table_name: str) -> None: + """ + Ensure indices exist for all columns in the table to optimize filtering. + + This is an optimization step - failures are logged but not raised. 
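    One index is created per column, named idx_{table}_{column}; for a hypothetical
    Genes.gene_name column the generated statement is:

        CREATE INDEX IF NOT EXISTS "idx_Genes_gene_name" ON "Genes" ("gene_name")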
+ + Args: + db_path: Path to the SQLite database file + table_name: Name of the table + """ + try: + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Get columns + cursor.execute(f"PRAGMA table_info({table_name})") + columns = [row[1] for row in cursor.fetchall()] + + # Create index for each column + for col in columns: + index_name = f"idx_{table_name}_{col}" + # Sanitize column name for SQL safety + safe_col = col.replace('"', '""') + cursor.execute( + f'CREATE INDEX IF NOT EXISTS "{index_name}" ON "{table_name}" ("{safe_col}")' + ) + conn.commit() + conn.close() + logger.info(f"Ensured indices for table {table_name}") + + except sqlite3.Error as e: + # Don't raise, just log warning as this is an optimization step + logger.warning(f"Error creating indices for {table_name}: {e}") + + +# ============================================================================= +# DATA RETRIEVAL - SIMPLE QUERY +# ============================================================================= def query_sqlite(sqlite_file: Path, query_id: str) -> dict: """ - Query SQLite database. + Query SQLite database by ID. Legacy compatibility function. Args: sqlite_file: Path to SQLite database @@ -58,29 +182,7 @@ def query_sqlite(sqlite_file: Path, query_id: str) -> dict: Returns: Query results as dictionary - - Note: - This is currently a stub implementation that returns placeholder data. - """ - # TODO: Implement SQLite query logic - # - # Example implementation: - # import sqlite3 - # - # conn = sqlite3.connect(sqlite_file) - # conn.row_factory = sqlite3.Row # Enable column access by name - # cursor = conn.cursor() - # - # # Execute query - # cursor.execute("SELECT * FROM data WHERE id = ?", (query_id,)) - # rows = cursor.fetchall() - # - # # Convert to list of dicts - # results = [dict(row) for row in rows] - # - # conn.close() - # return {"data": results, "count": len(results)} - + """ return { "stub": "SQLite query results would go here", "query_id": query_id, @@ -88,88 +190,204 @@ def query_sqlite(sqlite_file: Path, query_id: str) -> dict: } +# ============================================================================= +# DATA RETRIEVAL - FULL FEATURED +# ============================================================================= + def get_table_data( sqlite_file: Path, table_name: str, limit: Optional[int] = None, + offset: Optional[int] = None, order_by: Optional[List[Dict[str, str]]] = None, filters: Optional[List[Dict[str, Any]]] = None, -) -> List[Dict[str, Any]]: + sort_column: Optional[str] = None, + sort_order: Optional[str] = None, + search_value: Optional[str] = None, + query_filters: Optional[Dict[str, str]] = None, +) -> Tuple[List[str], List[List[str]], int, int, float, float]: """ - Query SQLite database with flexible filtering, ordering, and pagination. + Extract table data with pagination, sorting, and filtering. + + Supports two filtering APIs for flexibility: + 1. `filters`: List of FilterSpec-style dicts with column, op, value + 2. 
`query_filters`: Simple dict of column -> search_value (LIKE matching) Args: sqlite_file: Path to SQLite database table_name: Name of the table to query limit: Maximum number of rows to return - order_by: List of order specifications, e.g., - [{"column": "gene_id", "direction": "ASC"}] - filters: List of filter specifications, e.g., - [{"column": "function", "op": "LIKE", "value": "%kinase%"}] + offset: Number of rows to skip + order_by: List of order specifications [{column, direction}] + filters: List of filter specifications [{column, op, value}] + sort_column: Single column to sort by (alternative to order_by) + sort_order: Sort direction 'asc' or 'desc' (with sort_column) + search_value: Global search term for all columns + query_filters: Dict of column-specific search terms Returns: - List of rows as dictionaries - - Example: - rows = get_table_data( - db_path, - "Genes", - limit=20, - order_by=[{"column": "gene_id", "direction": "ASC"}], - filters=[{"column": "function", "op": "LIKE", "value": "%kinase%"}], - ) - """ - conn = sqlite3.connect(sqlite_file) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - # Build SELECT query - query = f"SELECT * FROM {table_name}" - params = [] - - # Add WHERE clause for filters - if filters: - where_clauses = [] - for filter_spec in filters: - column = filter_spec["column"] - op = filter_spec["op"] - value = filter_spec["value"] - - # Sanitize operator + Tuple of (headers, data, total_count, filtered_count, db_query_ms, conversion_ms) + + Raises: + sqlite3.Error: If database query fails + ValueError: If invalid operator is specified + """ + try: + conn = sqlite3.connect(str(sqlite_file)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + # Get column names first + headers = get_table_columns(sqlite_file, table_name) + + if not headers: + logger.warning(f"Table {table_name} has no columns or doesn't exist") + return [], [], 0, 0, 0.0, 0.0 + + # 1. Get total count (before filtering) + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + total_count = cursor.fetchone()[0] + + # 2. Build WHERE clause + conditions = [] + params = [] + + # 2a. Global Search (OR logic across all columns) + if search_value: + search_conditions = [] + term = f"%{search_value}%" + for col in headers: + search_conditions.append(f'"{col}" LIKE ?') + params.append(term) + + if search_conditions: + conditions.append(f"({' OR '.join(search_conditions)})") + + # 2b. Column Filters via query_filters dict (AND logic) + if query_filters: + for col, val in query_filters.items(): + if col in headers and val: + conditions.append(f'"{col}" LIKE ?') + params.append(f"%{val}%") + + # 2c. Structured filters via filters list (AND logic) + if filters: allowed_ops = ["=", "!=", "<", ">", "<=", ">=", "LIKE", "IN"] - if op not in allowed_ops: - raise ValueError(f"Invalid operator: {op}") + for filter_spec in filters: + column = filter_spec.get("column") + op = filter_spec.get("op", "LIKE") + value = filter_spec.get("value") + + if not column or column not in headers: + continue + + if op not in allowed_ops: + raise ValueError(f"Invalid operator: {op}") - where_clauses.append(f"{column} {op} ?") - params.append(value) + conditions.append(f'"{column}" {op} ?') + params.append(value) - query += " WHERE " + " AND ".join(where_clauses) + where_clause = "" + if conditions: + where_clause = " WHERE " + " AND ".join(conditions) - # Add ORDER BY clause - if order_by: + # 3. 
Get filtered count + if where_clause: + cursor.execute(f"SELECT COUNT(*) FROM {table_name} {where_clause}", params) + filtered_count = cursor.fetchone()[0] + else: + filtered_count = total_count + + # 4. Build final query + query = f"SELECT * FROM {table_name}{where_clause}" + + # Add ORDER BY clause order_clauses = [] - for order_spec in order_by: - column = order_spec["column"] - direction = order_spec.get("direction", "ASC").upper() - if direction not in ["ASC", "DESC"]: - raise ValueError(f"Invalid direction: {direction}") + # Handle order_by list + if order_by: + for order_spec in order_by: + col = order_spec.get("column") + direction = order_spec.get("direction", "ASC").upper() + + if col and col in headers: + if direction not in ["ASC", "DESC"]: + direction = "ASC" + order_clauses.append(f'"{col}" {direction}') + + # Handle single sort_column (alternative API) + if sort_column and sort_column in headers: + direction = "DESC" if sort_order and sort_order.lower() == "desc" else "ASC" + order_clauses.append(f'"{sort_column}" {direction}') + + if order_clauses: + query += " ORDER BY " + ", ".join(order_clauses) + elif headers: + # Default sort for consistent pagination + query += f' ORDER BY "{headers[0]}" ASC' + + # Add LIMIT clause + if limit is not None: + query += f" LIMIT {int(limit)}" + + # Add OFFSET clause + if offset is not None: + query += f" OFFSET {int(offset)}" + + # Execute query with timing + query_start = time.time() + cursor.execute(query, params) + rows = cursor.fetchall() + db_query_ms = (time.time() - query_start) * 1000 + + conn.close() - order_clauses.append(f"{column} {direction}") + # Convert rows to string arrays with timing + conversion_start = time.time() + data = [] + for row in rows: + string_row = [ + str(value) if value is not None else "" + for value in row + ] + data.append(string_row) + conversion_ms = (time.time() - conversion_start) * 1000 - query += " ORDER BY " + ", ".join(order_clauses) + return headers, data, total_count, filtered_count, db_query_ms, conversion_ms - # Add LIMIT clause - if limit is not None: - query += f" LIMIT {int(limit)}" + except sqlite3.Error as e: + logger.error(f"Error extracting data from {table_name}: {e}") + raise - # Execute query - cursor.execute(query, params) - rows = cursor.fetchall() - # Convert to list of dicts - results = [dict(row) for row in rows] +# ============================================================================= +# CONVERSION (PLACEHOLDER) +# ============================================================================= - conn.close() +def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None: + """ + Convert binary file to SQLite database. + + This function handles conversion of various binary formats + to SQLite for efficient querying. 
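    Current behaviour in brief (paths illustrative): an input that already has a .db
    suffix is simply copied to sqlite_file; any other suffix raises NotImplementedError.

        convert_to_sqlite(Path("cache/pg.db"), Path("cache/pg_copy.db"))  # copies the file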
+ + Args: + binary_file: Path to binary file + sqlite_file: Path to output SQLite file + + Raises: + NotImplementedError: Conversion logic depends on binary format + """ + # Check if file is already a SQLite database + if binary_file.suffix == '.db': + # Just copy/link the file + import shutil + shutil.copy2(binary_file, sqlite_file) + logger.info(f"Copied SQLite database to {sqlite_file}") + return - return results + # TODO: Implement conversion logic based on binary file format + # The BERDLTables object stores SQLite directly, so this may not be needed + raise NotImplementedError( + f"SQLite conversion not implemented for format: {binary_file.suffix}" + ) diff --git a/app/utils/workspace.py b/app/utils/workspace.py index 7713010..e115664 100644 --- a/app/utils/workspace.py +++ b/app/utils/workspace.py @@ -1,39 +1,419 @@ """ -KBase Workspace utilities for retrieving object information. +KBase Workspace and Blobstore utilities for retrieving BERDLTables objects. + +This module uses KBUtilLib to interact with KBase services for: +- Fetching BERDLTables objects from Workspace +- Downloading SQLite databases from Blobstore +- Caching databases locally + +Key Flow: +1. User provides berdl_table_id (workspace ref like "76990/ADP1Test") +2. Fetch object from Workspace API via KBUtilLib +3. Extract pangenome_data with handle_ref +4. Download SQLite from Blobstore using download_blob_file +5. Cache locally for efficient repeated queries + +Requires: lib/KBUtilLib cloned locally """ +import os +import sys +import logging +from pathlib import Path +from typing import Dict, Any, List, Optional import requests -from typing import Dict, Any +# Add KBUtilLib to path +LIB_PATH = Path(__file__).parent.parent.parent / "lib" / "KBUtilLib" / "src" +if str(LIB_PATH) not in sys.path: + sys.path.insert(0, str(LIB_PATH)) + +# Configure module logger +logger = logging.getLogger(__name__) -def get_object_info(workspace_id: str, auth_token: str, workspace_url: str) -> Dict[str, Any]: + +# ============================================================================= +# KBASE UTILITY CLASS (USING KBUtilLib) +# ============================================================================= + +class KBaseClient: + """ + KBase API client using KBUtilLib. + + Uses NotebookUtils and KBWSUtils with kb_version parameter + to target the correct KBase environment (appdev, ci, prod). """ - Get object information from KBase Workspace API. + + def __init__( + self, + token: str, + kb_env: str = "appdev", + cache_dir: Optional[Path] = None + ): + """ + Initialize KBase client. 
+ + Args: + token: KBase authentication token + kb_env: Environment (appdev, ci, prod) + cache_dir: Local cache directory + """ + self.token = token + self.kb_env = kb_env + self.cache_dir = cache_dir or Path("/tmp/tablescanner_cache") + self._client = None + self._use_kbutillib = False + + # Try to initialize KBUtilLib + self._init_client() + + def _init_client(self): + """Initialize the appropriate client.""" + try: + from kbutillib.kb_ws_utils import KBWSUtils + from kbutillib.notebook_utils import NotebookUtils + + # Create a proper combined class + cache_dir = self.cache_dir + kb_env = self.kb_env + token = self.token + + class NotebookUtil(NotebookUtils, KBWSUtils): + def __init__(self): + super().__init__( + notebook_folder=str(cache_dir), + name="TableScanner", + kb_version=kb_env, + token=token + ) + + self._client = NotebookUtil() + self._use_kbutillib = True + logger.info(f"KBUtilLib client initialized for {self.kb_env}") + + except Exception as e: + logger.warning(f"KBUtilLib not available: {e}. Using fallback.") + self._use_kbutillib = False + + def get_object(self, ref: str, ws: Optional[int] = None) -> Dict[str, Any]: + """ + Get workspace object data. + + Args: + ref: Object reference or name + ws: Workspace ID (optional if ref is full reference) + + Returns: + Object data dictionary + """ + if self._use_kbutillib and self._client: + return self._client.get_object(ref, ws=ws) + else: + return self._get_object_fallback(ref, ws) + + def download_blob_file(self, handle_ref: str, target_path: Path) -> Path: + """ + Download file from blobstore using handle reference. + + Args: + handle_ref: Handle ID (KBH_xxxxx format) + target_path: Where to save the file + + Returns: + Path to downloaded file + """ + # Ensure directory exists + target_path = Path(target_path) + target_path.parent.mkdir(parents=True, exist_ok=True) + + if self._use_kbutillib and self._client: + result = self._client.download_blob_file(handle_ref, str(target_path)) + if result: + return Path(result) + raise ValueError(f"Failed to download from handle: {handle_ref}") + else: + return Path(self._download_blob_fallback(handle_ref, str(target_path))) + + # ========================================================================= + # FALLBACK METHODS (Direct API calls) + # ========================================================================= + + def _get_endpoints(self) -> Dict[str, str]: + """Get endpoints for current environment.""" + endpoints = { + "appdev": { + "workspace": "https://appdev.kbase.us/services/ws", + "shock": "https://appdev.kbase.us/services/shock-api", + "handle": "https://appdev.kbase.us/services/handle_service", + }, + "ci": { + "workspace": "https://ci.kbase.us/services/ws", + "shock": "https://ci.kbase.us/services/shock-api", + "handle": "https://ci.kbase.us/services/handle_service", + }, + "prod": { + "workspace": "https://kbase.us/services/ws", + "shock": "https://kbase.us/services/shock-api", + "handle": "https://kbase.us/services/handle_service", + }, + } + return endpoints.get(self.kb_env, endpoints["appdev"]) + + def _get_object_fallback(self, ref: str, ws: Optional[int] = None) -> Dict[str, Any]: + """Get workspace object via direct API call.""" + # Build reference + if ws and "/" not in str(ref): + ref = f"{ws}/{ref}" + + headers = { + "Authorization": self.token, + "Content-Type": "application/json" + } + + payload = { + "method": "Workspace.get_objects2", + "params": [{"objects": [{"ref": ref}]}], + "version": "1.1", + "id": "tablescanner-1" + } + + endpoints = 
self._get_endpoints() + response = requests.post( + endpoints["workspace"], + json=payload, + headers=headers, + timeout=60 + ) + response.raise_for_status() + result = response.json() + + if "error" in result: + raise ValueError(result["error"].get("message", "Unknown error")) + + data_list = result.get("result", [{}])[0].get("data", []) + if not data_list: + raise ValueError(f"No data for: {ref}") + + return data_list[0] + + def _download_blob_fallback(self, handle_ref: str, target_path: str) -> str: + """Download from blobstore via direct API.""" + endpoints = self._get_endpoints() + headers = {"Authorization": f"OAuth {self.token}"} + + # Resolve handle to shock ID + handle_payload = { + "method": "AbstractHandle.hids_to_handles", + "params": [[handle_ref]], + "version": "1.1", + "id": "tablescanner-2" + } + + shock_id = handle_ref # Default to handle_ref + try: + resp = requests.post( + endpoints["handle"], + json=handle_payload, + headers={"Authorization": self.token, "Content-Type": "application/json"}, + timeout=30 + ) + resp.raise_for_status() + handles = resp.json().get("result", [[]])[0] + if handles: + shock_id = handles[0].get("id", handle_ref) + except Exception as e: + logger.warning(f"Handle resolution failed, using handle_ref directly: {e}") + + # Download from shock + download_url = f"{endpoints['shock']}/node/{shock_id}?download_raw" + + response = requests.get( + download_url, + headers=headers, + stream=True, + timeout=300 + ) + response.raise_for_status() + + Path(target_path).parent.mkdir(parents=True, exist_ok=True) + with open(target_path, 'wb') as f: + for chunk in response.iter_content(8192): + f.write(chunk) + + logger.info(f"Downloaded {handle_ref} to {target_path}") + return target_path + + +# ============================================================================= +# HIGH-LEVEL FUNCTIONS +# ============================================================================= + +def get_berdl_table_data( + berdl_table_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> Dict[str, Any]: + """ + Fetch BERDLTables object and extract pangenome information. + + BERDLTables structure: + { + "pangenome_data": [ + { + "pangenome_id": "pg_123", + "pangenome_taxonomy": "Escherichia coli", + "sqllite_tables_handle_ref": "KBH_xxxxx", + ... + } + ] + } Args: - workspace_id: The workspace object ID + berdl_table_id: KBase workspace reference (e.g., "76990/ADP1Test") auth_token: KBase authentication token - workspace_url: URL to the KBase Workspace service + kb_env: KBase environment Returns: - Dictionary containing object info including handle/blobstore URLs + Object data dictionary with pangenome_data + """ + client = KBaseClient(auth_token, kb_env) + obj = client.get_object(berdl_table_id) + + # Handle nested data structures + if isinstance(obj, dict) and "data" in obj: + return obj["data"] + return obj + + +def list_pangenomes_from_object( + berdl_table_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> List[Dict[str, Any]]: + """ + List all pangenomes from a BERDLTables object. 
+ + Args: + berdl_table_id: KBase workspace reference + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + List of pangenome info dictionaries with: + - pangenome_id + - pangenome_taxonomy + - handle_ref + - user_genomes + - berdl_genomes + """ + obj_data = get_berdl_table_data(berdl_table_id, auth_token, kb_env) + + pangenome_data = obj_data.get("pangenome_data", []) + + pangenomes = [] + for pg in pangenome_data: + pangenomes.append({ + "pangenome_id": pg.get("pangenome_id", ""), + "pangenome_taxonomy": pg.get("pangenome_taxonomy", ""), + "user_genomes": pg.get("user_genomes", []), + "berdl_genomes": pg.get("berdl_genomes", []), + "handle_ref": pg.get("sqllite_tables_handle_ref", ""), + }) + + return pangenomes + + +def find_pangenome_handle( + berdl_table_id: str, + pangenome_id: str, + auth_token: str, + kb_env: str = "appdev" +) -> str: + """ + Find the handle_ref for a specific pangenome. + + Args: + berdl_table_id: KBase workspace reference + pangenome_id: ID of pangenome to find + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + Handle reference string (KBH_xxxxx) Raises: - HTTPException: If the workspace API call fails - """ - # TODO: Implement actual KBase Workspace API call - # Example: - # headers = {"Authorization": auth_token} - # payload = { - # "method": "Workspace.get_objects2", - # "params": [{ - # "objects": [{"ref": workspace_id}] - # }], - # "version": "1.1" - # } - # response = requests.post(workspace_url, json=payload, headers=headers) - # response.raise_for_status() - # data = response.json() - # return data["result"][0]["data"][0] - - raise NotImplementedError("KBase Workspace API integration not yet implemented") + ValueError: If pangenome not found + """ + pangenomes = list_pangenomes_from_object(berdl_table_id, auth_token, kb_env) + + for pg in pangenomes: + if pg["pangenome_id"] == pangenome_id: + return pg["handle_ref"] + + available = [pg["pangenome_id"] for pg in pangenomes] + raise ValueError(f"Pangenome '{pangenome_id}' not found. Available: {available}") + + +def download_pangenome_db( + berdl_table_id: str, + pangenome_id: str, + auth_token: str, + cache_dir: Path, + kb_env: str = "appdev" +) -> Path: + """ + Download the SQLite database for a pangenome by ID. + + Checks cache first, downloads only if not present. + + Args: + berdl_table_id: KBase workspace reference + pangenome_id: Pangenome ID + auth_token: KBase authentication token + cache_dir: Local cache directory + kb_env: KBase environment + + Returns: + Path to the local SQLite database file + """ + from app.utils.cache import is_cached, get_cache_paths + + # Get cache path + cache_dir = Path(cache_dir) + safe_id = berdl_table_id.replace("/", "_").replace(":", "_") + db_dir = cache_dir / safe_id + db_path = db_dir / f"{pangenome_id}.db" + + # Check cache + if db_path.exists(): + logger.info(f"Using cached database: {db_path}") + return db_path + + # Find handle and download + handle_ref = find_pangenome_handle(berdl_table_id, pangenome_id, auth_token, kb_env) + + client = KBaseClient(auth_token, kb_env, cache_dir) + db_path = client.download_blob_file(handle_ref, db_path) + + logger.info(f"Downloaded database to: {db_path}") + return db_path + + +def get_object_info( + object_ref: str, + auth_token: str, + kb_env: str = "appdev" +) -> Dict[str, Any]: + """ + Get basic object info without full data. 
+ + Args: + object_ref: KBase workspace reference + auth_token: KBase authentication token + kb_env: KBase environment + + Returns: + Object metadata + """ + client = KBaseClient(auth_token, kb_env) + return client.get_object(object_ref) diff --git a/pyproject.toml b/pyproject.toml index fd3c46f..b11cc95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,3 +15,6 @@ dependencies = [ [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["app*"] diff --git a/static/viewer.html b/static/viewer.html new file mode 100644 index 0000000..9922608 --- /dev/null +++ b/static/viewer.html @@ -0,0 +1,1295 @@ + + + + + + + TableScanner - BERDL Table Viewer + + + + + + + + + + +
[static/viewer.html (new file, 1295 lines): the page markup was not preserved in this extraction. Recoverable structure: a header badge row (TableScanner, v1.1, Local), a Connection panel, a Data Selection panel, a View Controls panel, the data table with row counters (shown range, filtered count, total), pagination controls, a status bar (ready state, response time in ms, data source, DB file), a cell tooltip, and the empty-state prompt "Enter BERDLTable ID and click search".]
+ + + + + \ No newline at end of file From 2a85d24d4e822efad26e6d65e3fc6e86452caa7b Mon Sep 17 00:00:00 2001 From: VibhavSetlur Date: Tue, 16 Dec 2025 11:27:37 -0600 Subject: [PATCH 2/5] viewer adjustments --- app/models.py | 8 ++ app/routes.py | 22 ++++- static/viewer.html | 195 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 216 insertions(+), 9 deletions(-) diff --git a/app/models.py b/app/models.py index a4b7ddf..183ccc6 100644 --- a/app/models.py +++ b/app/models.py @@ -189,6 +189,14 @@ class PangenomesResponse(BaseModel): default_factory=list, description="List of available pangenomes" ) + pangenome_count: int = Field( + 0, + description="Total number of pangenomes" + ) + auto_selected: Optional[str] = Field( + None, + description="Auto-selected pangenome ID when only one exists" + ) class TableDataResponse(BaseModel): diff --git a/app/routes.py b/app/routes.py index 08e461f..e7edb04 100644 --- a/app/routes.py +++ b/app/routes.py @@ -439,11 +439,29 @@ async def get_pangenomes( kb_env: str = Query("appdev"), authorization: Optional[str] = Header(None) ): - """List pangenomes from BERDLTables object (legacy endpoint).""" + """ + List pangenomes from BERDLTables object. + + Returns: + - pangenomes: List of pangenome info + - pangenome_count: Total number of pangenomes + - auto_selected: The pangenome_id if only one exists (for auto-selection) + """ try: token = get_auth_token(authorization) pangenomes = list_pangenomes_from_object(berdl_table_id, token, kb_env) - return PangenomesResponse(pangenomes=[PangenomeInfo(**pg) for pg in pangenomes]) + pangenome_list = [PangenomeInfo(**pg) for pg in pangenomes] + + # Auto-select if only one pangenome + auto_selected = None + if len(pangenome_list) == 1: + auto_selected = pangenome_list[0].pangenome_id + + return PangenomesResponse( + pangenomes=pangenome_list, + pangenome_count=len(pangenome_list), + auto_selected=auto_selected + ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/static/viewer.html b/static/viewer.html index 9922608..34ab700 100644 --- a/static/viewer.html +++ b/static/viewer.html @@ -244,6 +244,27 @@ display: none; } + /* Cell links - for transformed identifiers (UniProt, KEGG, etc.) */ + .ts-cell-link { + color: var(--ts-info); + text-decoration: none; + border-bottom: 1px dotted var(--ts-info); + transition: all 0.15s; + } + + .ts-cell-link:hover { + color: var(--ts-primary); + border-bottom-color: var(--ts-primary); + border-bottom-style: solid; + } + + .ts-cell-link::after { + content: ' ↗'; + font-size: 0.55em; + opacity: 0.6; + vertical-align: super; + } + .ts-table tbody tr:hover { background: rgba(99, 102, 241, 0.1); } @@ -801,6 +822,142 @@

return h; } + // ================================================================= + // CELL VALUE TRANSFORMATIONS + // Transform cell values into clickable links for known identifiers + // ================================================================= + + /** + * Pattern definitions for biological database identifiers. + * Each pattern has: regex to match, URL template, and display formatter. + */ + const LINK_PATTERNS = [ + // UniProt - matches UniProt:P12345 or column context + { + name: 'uniprot', + regex: /^(?:UniProt:)?([A-Z][0-9][A-Z0-9]{3}[0-9]|[A-Z][A-Z0-9]{5}[0-9](?:\-\d+)?)$/i, + url: (id) => `https://www.uniprot.org/uniprotkb/${id}`, + columns: ['uniprot_id', 'uniprot', 'swissprot_id'] + }, + // KEGG - matches KEGG:eco:b0001 or eco:b0001 + { + name: 'kegg', + regex: /^(?:KEGG:)?([a-z]{2,4}:[a-zA-Z0-9_]+)$/i, + url: (id) => `https://www.genome.jp/entry/${id}`, + columns: ['kegg_id', 'kegg', 'kegg_gene'] + }, + // Gene Ontology - matches GO:0008150 + { + name: 'go', + regex: /^(GO:\d{7})$/i, + url: (id) => `https://amigo.geneontology.org/amigo/term/${id.toUpperCase()}`, + columns: ['go_id', 'go_term', 'gene_ontology'] + }, + // Enzyme Commission - matches EC:1.2.3.4 or 1.2.3.4 + { + name: 'ec', + regex: /^(?:EC:)?(\d+\.\d+\.\d+\.\d+)$/, + url: (id) => `https://enzyme.expasy.org/EC/${id}`, + columns: ['ec_number', 'ec_id', 'ec'] + }, + // PDB - matches PDB:1ABC or 1ABC (4 chars) + { + name: 'pdb', + regex: /^(?:PDB:)?([0-9][A-Z0-9]{3})$/i, + url: (id) => `https://www.rcsb.org/structure/${id.toUpperCase()}`, + columns: ['pdb_id', 'pdb', 'structure_id'] + }, + // RefSeq - matches NC_, NP_, NM_, XP_, etc. + { + name: 'refseq', + regex: /^([ANXYW][CGMPRTW]_\d+(?:\.\d+)?)$/, + url: (id) => `https://www.ncbi.nlm.nih.gov/nuccore/${id}`, + columns: ['refseq_id', 'refseq', 'accession'] + }, + // GenBank - matches typical accession patterns + { + name: 'genbank', + regex: /^([A-Z]{1,2}\d{5,6}(?:\.\d+)?)$/, + url: (id) => `https://www.ncbi.nlm.nih.gov/nuccore/${id}`, + columns: ['genbank_id', 'genbank', 'accession'] + }, + // NCBI Gene ID - numeric only in gene_id columns + { + name: 'ncbi_gene', + regex: /^(\d{4,})$/, + url: (id) => `https://www.ncbi.nlm.nih.gov/gene/${id}`, + columns: ['gene_id', 'ncbi_gene_id', 'entrez_id'] + }, + // URLs - http/https links + { + name: 'url', + regex: /^(https?:\/\/[^\s]+)$/i, + url: (match) => match, + columns: [] // Match in any column + } + ]; + + /** + * Transform a cell value into HTML, creating links where appropriate. 
+ * @param {string} value - The cell value + * @param {string} columnName - The column header name (for context) + * @returns {string} HTML string with links if applicable + */ + function transformCellValue(value, columnName) { + if (value === null || value === undefined || value === '') { + return ''; + } + + const strVal = String(value).trim(); + if (!strVal) return ''; + + // Escape HTML first + const escaped = strVal + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"'); + + const colLower = columnName.toLowerCase(); + + // Check each pattern + for (const pattern of LINK_PATTERNS) { + // Check if column matches known columns for this pattern + const columnMatch = pattern.columns.length === 0 || + pattern.columns.some(c => colLower.includes(c)); + + if (!columnMatch && pattern.columns.length > 0) continue; + + const match = strVal.match(pattern.regex); + if (match) { + const id = match[1] || match[0]; + const url = pattern.url(id); + return `${escaped}`; + } + } + + return escaped; + } + + /** + * Check if a value looks like it could be linkified. + * Used for tooltip display. + */ + function isLinkableValue(value, columnName) { + if (!value) return false; + const strVal = String(value).trim(); + const colLower = columnName.toLowerCase(); + + for (const pattern of LINK_PATTERNS) { + const columnMatch = pattern.columns.length === 0 || + pattern.columns.some(c => colLower.includes(c)); + if (columnMatch && pattern.regex.test(strVal)) { + return pattern.name; + } + } + return null; + } + // ================================================================= // UI STATE // ================================================================= @@ -829,6 +986,11 @@

// ================================================================= // API CALLS // ================================================================= + + /** + * Load pangenomes from BERDLTable object. + * Handles auto-selection when only one pangenome exists. + */ async function loadPangenomes() { const berdlId = dom.berdlTableId.value.trim(); if (!berdlId) { showAlert('Enter BERDLTable ID', 'error'); return; } @@ -846,14 +1008,23 @@

const data = await res.json(); const pgs = data.pangenomes || []; + const autoSelected = data.auto_selected; + // Populate dropdown dom.pangenomeSelect.innerHTML = pgs.length ? pgs.map(p => ``).join('') : ''; dom.pangenomeSelect.disabled = pgs.length === 0; dom.refreshBtn.disabled = pgs.length === 0; - setStatus(`Found ${pgs.length} pangenomes`, 'success'); + + // Auto-select if only one pangenome + if (autoSelected) { + dom.pangenomeSelect.value = autoSelected; + setStatus(`Auto-selected: ${autoSelected}`, 'success'); + } else { + setStatus(`Found ${pgs.length} pangenomes`, 'success'); + } if (pgs.length) await loadTables(); @@ -1053,12 +1224,14 @@

dom.tableHead.innerHTML = headerHtml + filterHtml; - // Body + // Body - apply cell value transformations for links dom.tableBody.innerHTML = data.map((row, ri) => { return '' + headers.map((h, ci) => { const hidden = !state.visibleColumns.has(h) ? ' hidden-col' : ''; - const val = row[ci] ?? ''; - return `${val}`; + const rawVal = row[ci] ?? ''; + // Transform cell value to add links for known identifiers + const displayVal = transformCellValue(rawVal, h); + return `${displayVal}`; }).join('') + ''; }).join(''); @@ -1135,7 +1308,13 @@

// TOOLTIP // ================================================================= function showTooltip(e, col, value) { - dom.tooltipHeader.textContent = col; + // Check if this value is linkable + const linkType = isLinkableValue(value, col); + const header = linkType + ? `${col} [${linkType.toUpperCase()}]` + : col; + + dom.tooltipHeader.textContent = header; dom.tooltipValue.textContent = value || '(empty)'; dom.cellTooltip.style.left = (e.clientX + 10) + 'px'; dom.cellTooltip.style.top = (e.clientY + 10) + 'px'; @@ -1265,12 +1444,14 @@

} }); - // Cell tooltip + // Cell tooltip - use raw value from data attribute document.addEventListener('mouseover', e => { if (e.target.matches('.ts-table td')) { const colIdx = parseInt(e.target.dataset.col); const col = state.allHeaders[colIdx]; - showTooltip(e, col, e.target.textContent); + // Use raw value from data attribute (not transformed HTML) + const rawValue = e.target.dataset.raw || e.target.textContent; + showTooltip(e, col, rawValue); } }); From 626c50e15eaf4083b23663ad2d9888693d58b4f0 Mon Sep 17 00:00:00 2001 From: VibhavSetlur Date: Wed, 17 Dec 2025 11:10:15 -0600 Subject: [PATCH 3/5] docs, caching, viewer updates --- app/main.py | 7 + app/models.py | 31 +++- app/routes.py | 82 ++++++++-- app/utils/sqlite.py | 47 ++++-- app/utils/workspace.py | 60 +++++--- docs/QUICKSTART_DEMO.md | 50 ++++++ docs/USAGE_GUIDE.md | 87 +++++++++++ pyproject.toml | 5 + static/viewer.html | 328 ++++++++++++++++++++++------------------ 9 files changed, 496 insertions(+), 201 deletions(-) create mode 100644 docs/QUICKSTART_DEMO.md create mode 100644 docs/USAGE_GUIDE.md diff --git a/app/main.py b/app/main.py index 2ad5f90..5f98a5c 100644 --- a/app/main.py +++ b/app/main.py @@ -28,8 +28,15 @@ def create_app() -> FastAPI: Returns: FastAPI: Configured FastAPI application instance """ + # Configure root_path for KBase dynamic services + # KBase services are often deployed at /services/service_name + # Pydantic Settings management or manual environ check can handle this. + import os + root_path = os.environ.get("KB_SERVICE_ROOT_PATH", "") + app = FastAPI( title="TableScanner", + root_path=root_path, description=""" ## TableScanner API diff --git a/app/models.py b/app/models.py index 183ccc6..ae78965 100644 --- a/app/models.py +++ b/app/models.py @@ -95,10 +95,15 @@ class TableDataRequest(BaseModel): description="BERDLTables object reference", examples=["76990/ADPITest"] ) - pangenome_id: str = Field( - ..., - description="Pangenome ID to query", - examples=["pg_default"] + columns: Optional[str] = Field( + "all", + description="Comma-separated list of columns to select or 'all'", + examples=["gene_id, gene_name"] + ) + col_filter: Optional[Dict[str, str]] = Field( + None, + description="Column-specific filters (alias for query_filters)", + examples=[{"gene_name": "kinase"}] ) table_name: str = Field( ..., @@ -124,6 +129,11 @@ class TableDataRequest(BaseModel): "ASC", description="Sort direction" ) + order_by: Optional[List[Dict[str, str]]] = Field( + None, + description="Multi-column sort specifications [{'column': 'col_name', 'direction': 'asc'}]", + examples=[[{"column": "gene_name", "direction": "asc"}, {"column": "score", "direction": "desc"}]] + ) search_value: Optional[str] = Field( None, description="Global search term" @@ -133,6 +143,11 @@ class TableDataRequest(BaseModel): description="Column-specific filters {column_name: filter_value}", examples=[{"gene_name": "kinase", "organism": "E. 
coli"}] ) + pangenome_id: Optional[str] = Field( + None, + description="Specific pangenome ID (optimizes cache lookup)", + examples=["pg_123"] + ) kb_env: str = Field( "appdev", description="KBase environment" @@ -166,9 +181,11 @@ class TableListResponse(BaseModel): class PangenomeInfo(BaseModel): - """Information about a pangenome within a BERDLTables object.""" - pangenome_id: Optional[str] = Field(None, description="Unique pangenome identifier") - pangenome_taxonomy: Optional[str] = Field(None, description="Taxonomic classification") + """Information about a pangenome found in the SQLite file.""" + pangenome_id: str = Field(..., description="ID of the pangenome") + pangenome_taxonomy: Optional[str] = Field(None, description="Taxonomy of the pangenome") + genome_count: int = Field(..., description="Number of genomes in the pangenome") + source_berdl_id: str = Field(..., description="Source BERDL Table ID") user_genomes: List[str] = Field( default_factory=list, description="List of user-provided genome references" diff --git a/app/routes.py b/app/routes.py index e7edb04..2a778bf 100644 --- a/app/routes.py +++ b/app/routes.py @@ -449,10 +449,27 @@ async def get_pangenomes( """ try: token = get_auth_token(authorization) - pangenomes = list_pangenomes_from_object(berdl_table_id, token, kb_env) - pangenome_list = [PangenomeInfo(**pg) for pg in pangenomes] - # Auto-select if only one pangenome + # Support comma-separated list of IDs + berdl_ids = [bid.strip() for bid in berdl_table_id.split(",") if bid.strip()] + + all_pangenomes: list[dict] = [] + + for bid in berdl_ids: + try: + pangenomes = list_pangenomes_from_object(bid, token, kb_env) + # Tag each pangenome with its source ID + for pg in pangenomes: + pg["source_berdl_id"] = bid + all_pangenomes.extend(pangenomes) + except Exception as e: + logger.error(f"Error fetching pangenomes for {bid}: {e}") + # Continue fetching others even if one fails + continue + + pangenome_list = [PangenomeInfo(**pg) for pg in all_pangenomes] + + # Auto-select if only one pangenome total auto_selected = None if len(pangenome_list) == 1: auto_selected = pangenome_list[0].pangenome_id @@ -463,22 +480,32 @@ async def get_pangenomes( auto_selected=auto_selected ) except Exception as e: + logger.error(f"Error in get_pangenomes: {e}") raise HTTPException(status_code=500, detail=str(e)) -@router.get("/tables/{pangenome_id}", response_model=TableListResponse) +@router.get("/tables", response_model=TableListResponse) async def get_tables( - pangenome_id: str, - berdl_table_id: str = Query(...), + berdl_table_id: str = Query(..., description="BERDLTables object reference"), + pangenome_id: Optional[str] = Query(None, description="Legacy parameter (ignored)"), kb_env: str = Query("appdev"), authorization: Optional[str] = Header(None) ): - """List tables for a pangenome (legacy endpoint).""" + """List tables for a BERDLTable object (auto-resolves pangenome).""" try: token = get_auth_token(authorization) cache_dir = get_cache_dir() - db_path = download_pangenome_db(berdl_table_id, pangenome_id, token, cache_dir, kb_env) + # 1. 
Resolve pangenome_id from BERDL ID + pangenomes = list_pangenomes_from_object(berdl_table_id, token, kb_env) + if not pangenomes: + raise HTTPException(status_code=404, detail="No pangenomes found in object") + + # 1:1 relationship assumed as per user requirement + # Always pick the first one associated with this object + target_pangenome = pangenomes[0]["pangenome_id"] + + db_path = download_pangenome_db(berdl_table_id, target_pangenome, token, cache_dir, kb_env) table_names = list_tables(db_path) tables = [] @@ -490,9 +517,15 @@ async def get_tables( except: tables.append(TableInfo(name=name)) - return TableListResponse(pangenome_id=pangenome_id, tables=tables) + return TableListResponse(pangenome_id=target_pangenome, tables=tables) except Exception as e: + logger.error(f"Error listing tables: {e}") raise HTTPException(status_code=500, detail=str(e)) + +# Legacy route redirect/alias if needed, but for now we replace logic +@router.get("/tables/{pangenome_id}", include_in_schema=False) +async def get_tables_legacy(pangenome_id: str, berdl_table_id: str = Query(...), kb_env: str = Query("appdev"), authorization: Optional[str] = Header(None)): + return await get_tables(berdl_table_id=berdl_table_id, pangenome_id=pangenome_id, kb_env=kb_env, authorization=authorization) @router.post("/table-data", response_model=TableDataResponse) @@ -500,17 +533,29 @@ async def query_table_data( request: TableDataRequest, authorization: Optional[str] = Header(None) ): - """Query table data (legacy POST endpoint).""" + """Query table data.""" start_time = time.time() try: + # Debugging log + print(f"Received request: {request} col_filter={request.col_filter}") + token = get_auth_token(authorization) cache_dir = get_cache_dir() kb_env = getattr(request, 'kb_env', 'appdev') or 'appdev' - db_path = download_pangenome_db( - request.berdl_table_id, request.pangenome_id, token, cache_dir, kb_env - ) + # Determine filters (support both query_filters and col_filter) + filters = request.col_filter if request.col_filter else request.query_filters + print(f"Filters determined: {filters}") + + # Download (or get cached) DB - auto-resolves ID if None + try: + db_path = download_pangenome_db( + request.berdl_table_id, request.pangenome_id, token, cache_dir, kb_env + ) + except ValueError as e: + # Handle cases where pangenome not found or resolution failed + raise HTTPException(status_code=404, detail=str(e)) if not validate_table_exists(db_path, request.table_name): available = list_tables(db_path) @@ -529,11 +574,18 @@ async def query_table_data( sort_column=request.sort_column, sort_order=request.sort_order, search_value=request.search_value, - query_filters=request.query_filters, + query_filters=filters, + columns=request.columns, + order_by=request.order_by ) response_time_ms = (time.time() - start_time) * 1000 + # Extract the resolved pangenome ID from filename if possible, or just return what we have + # Since pangenome_id in response model is just for context, we can derive it from db_path + # db_path is .../cache/berdl_id/pangenome_id.db + resolved_pangenome_id = db_path.stem + return TableDataResponse( headers=headers, data=data, @@ -541,7 +593,7 @@ async def query_table_data( total_count=total_count, filtered_count=filtered_count, table_name=request.table_name, - pangenome_id=request.pangenome_id, + pangenome_id=resolved_pangenome_id, response_time_ms=response_time_ms, db_query_ms=db_query_ms, conversion_ms=conversion_ms, diff --git a/app/utils/sqlite.py b/app/utils/sqlite.py index ed4aae9..59c8d09 100644 --- 
a/app/utils/sqlite.py +++ b/app/utils/sqlite.py @@ -197,18 +197,18 @@ def query_sqlite(sqlite_file: Path, query_id: str) -> dict: def get_table_data( sqlite_file: Path, table_name: str, - limit: Optional[int] = None, - offset: Optional[int] = None, - order_by: Optional[List[Dict[str, str]]] = None, - filters: Optional[List[Dict[str, Any]]] = None, + limit: int = 100, + offset: int = 0, sort_column: Optional[str] = None, - sort_order: Optional[str] = None, + sort_order: str = "ASC", search_value: Optional[str] = None, query_filters: Optional[Dict[str, str]] = None, -) -> Tuple[List[str], List[List[str]], int, int, float, float]: + columns: Optional[str] = "all", + order_by: Optional[List[Dict[str, str]]] = None +) -> Tuple[List[str], List[Any], int, int, float, float]: """ - Extract table data with pagination, sorting, and filtering. - + Get paginated and filtered data from a table. + Supports two filtering APIs for flexibility: 1. `filters`: List of FilterSpec-style dicts with column, op, value 2. `query_filters`: Simple dict of column -> search_value (LIKE matching) @@ -218,12 +218,12 @@ def get_table_data( table_name: Name of the table to query limit: Maximum number of rows to return offset: Number of rows to skip - order_by: List of order specifications [{column, direction}] - filters: List of filter specifications [{column, op, value}] sort_column: Single column to sort by (alternative to order_by) sort_order: Sort direction 'asc' or 'desc' (with sort_column) search_value: Global search term for all columns query_filters: Dict of column-specific search terms + columns: Comma-separated list of columns to select + order_by: List of order specifications [{column, direction}] Returns: Tuple of (headers, data, total_count, filtered_count, db_query_ms, conversion_ms) @@ -232,18 +232,37 @@ def get_table_data( sqlite3.Error: If database query fails ValueError: If invalid operator is specified """ + start_time = time.time() + + # Initialize legacy filters to None since removed from signature + filters = None + try: conn = sqlite3.connect(str(sqlite_file)) conn.row_factory = sqlite3.Row cursor = conn.cursor() - # Get column names first - headers = get_table_columns(sqlite_file, table_name) + # Get all column names first for validation + all_headers = get_table_columns(sqlite_file, table_name) - if not headers: + if not all_headers: logger.warning(f"Table {table_name} has no columns or doesn't exist") return [], [], 0, 0, 0.0, 0.0 + # Parse requested columns + selected_headers = all_headers + select_clause = "*" + + if columns and columns.lower() != "all": + requested = [c.strip() for c in columns.split(',') if c.strip()] + valid = [c for c in requested if c in all_headers] + if valid: + selected_headers = valid + safe_cols = [f'"{c}"' for c in selected_headers] + select_clause = ", ".join(safe_cols) + + headers = selected_headers + # 1. Get total count (before filtering) cursor.execute(f"SELECT COUNT(*) FROM {table_name}") total_count = cursor.fetchone()[0] @@ -299,7 +318,7 @@ def get_table_data( filtered_count = total_count # 4. 
Build final query - query = f"SELECT * FROM {table_name}{where_clause}" + query = f"SELECT {select_clause} FROM {table_name}{where_clause}" # Add ORDER BY clause order_clauses = [] diff --git a/app/utils/workspace.py b/app/utils/workspace.py index e115664..7f27d26 100644 --- a/app/utils/workspace.py +++ b/app/utils/workspace.py @@ -317,6 +317,7 @@ def list_pangenomes_from_object( "pangenome_taxonomy": pg.get("pangenome_taxonomy", ""), "user_genomes": pg.get("user_genomes", []), "berdl_genomes": pg.get("berdl_genomes", []), + "genome_count": len(pg.get("user_genomes", [])) + len(pg.get("berdl_genomes", [])), "handle_ref": pg.get("sqllite_tables_handle_ref", ""), }) @@ -356,42 +357,63 @@ def find_pangenome_handle( def download_pangenome_db( berdl_table_id: str, - pangenome_id: str, + pangenome_id: Optional[str], auth_token: str, cache_dir: Path, kb_env: str = "appdev" ) -> Path: """ - Download the SQLite database for a pangenome by ID. + Download the SQLite database for a pangenome. + If pangenome_id is None, it is auto-resolved from the BERDL object (1:1 mapping assumed). Checks cache first, downloads only if not present. - - Args: - berdl_table_id: KBase workspace reference - pangenome_id: Pangenome ID - auth_token: KBase authentication token - cache_dir: Local cache directory - kb_env: KBase environment - - Returns: - Path to the local SQLite database file """ from app.utils.cache import is_cached, get_cache_paths - # Get cache path cache_dir = Path(cache_dir) safe_id = berdl_table_id.replace("/", "_").replace(":", "_") db_dir = cache_dir / safe_id - db_path = db_dir / f"{pangenome_id}.db" - # Check cache + # 1. Resolve ID and Handle if not provided + target_id = pangenome_id + handle_ref = None + + # We always need the ID for the filename. + # If pangenome_id is missing, we must fetch the object metadata to get it. + # If pangenome_id IS provided, we might still need to fetch object to get the handle (unless cached). + + # Optimization: If pangenome_id is provided, check if file exists. + # If so, we don't need to fetch metadata. 
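    # Illustrative cache layout for this check (sample values; the pangenome ID is
    # hypothetical): <CACHE_DIR>/76990_ADP1Test/pg_123.db, i.e.
    # cache_dir / safe_id / f"{pangenome_id}.db". A caller that already knows the
    # pangenome_id can therefore be served from cache with no Workspace call.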
+ if target_id: + db_path = db_dir / f"{target_id}.db" + if db_path.exists(): + logger.info(f"Using cached database: {db_path}") + return db_path + + # If not cached or ID unknown, we must fetch metadata + pangenomes = list_pangenomes_from_object(berdl_table_id, auth_token, kb_env) + if not pangenomes: + raise ValueError(f"No pangenomes found in {berdl_table_id}") + + if target_id: + # Verify and find handle + found = next((p for p in pangenomes if p["pangenome_id"] == target_id), None) + if not found: + raise ValueError(f"Pangenome '{target_id}' not found in {berdl_table_id}") + handle_ref = found["handle_ref"] + else: + # Auto-resolve: take the first one + found = pangenomes[0] + target_id = found["pangenome_id"] + handle_ref = found["handle_ref"] + + # Re-check cache with resolved ID + db_path = db_dir / f"{target_id}.db" if db_path.exists(): - logger.info(f"Using cached database: {db_path}") + logger.info(f"Using cached database: {db_path} (resolved ID: {target_id})") return db_path - # Find handle and download - handle_ref = find_pangenome_handle(berdl_table_id, pangenome_id, auth_token, kb_env) - + # Download client = KBaseClient(auth_token, kb_env, cache_dir) db_path = client.download_blob_file(handle_ref, db_path) diff --git a/docs/QUICKSTART_DEMO.md b/docs/QUICKSTART_DEMO.md new file mode 100644 index 0000000..b06de7d --- /dev/null +++ b/docs/QUICKSTART_DEMO.md @@ -0,0 +1,50 @@ +# Quickstart Demo + +This guide walks you through running the TableScanner demo locally. + +## Prerequisites + +- Python 3.9+ +- KBase Auth Token (for accessing workspace objects) + +## Setup + +1. **Install Dependencies** + ```bash + pip install -r requirements.txt + ``` + +2. **Start the Service** + ```bash + uv run fastapi dev app/main.py + ``` + Server will start at `http://localhost:8000`. + +## Running the Demo + +1. Open the [Viewer](http://localhost:8000/static/viewer.html) in your browser. + +2. **Configuration:** + - **Environment**: Select `AppDev` (or appropriate env). + - **Auth Token**: Enter your KBase token. + +3. **Load Data:** + - **BERDL Table ID**: Enter `76990/ADP1Test`. + - Click the **Search** icon. + +4. **Explore:** + - Since `76990/ADP1Test` contains only one pangenome, it will be **auto-selected**. + - Tables will load automatically. + - Select a table (e.g., "Genome attributes") to view data. + - Hover over cells with IDs (UniProt, KEGG, etc.) to see tooltips. + - Click IDs to visit external databases. + +## Multi-Pangenome Demo + +To test loading multiple identifiers: + +1. **BERDL Table ID**: Enter `76990/ADP1Test, 76990/ADP1Test` (simulating two sources). +2. Click **Search**. +3. The **Pangenome** dropdown will appear. +4. Options will show as: `ADP1 [76990/ADP1Test]`. +5. Select different options to toggle between datasets (if they were different). diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md new file mode 100644 index 0000000..e7e56f9 --- /dev/null +++ b/docs/USAGE_GUIDE.md @@ -0,0 +1,87 @@ +# Usage Guide + +This guide covers production usage of the TableScanner service. + +## API Endpoint +The service is deployed at: +``` +https://appdev.kbase.us/services/berdl_table_scanner +``` + +## Python API Usage + +You can interact with the service programmatically using Python's `requests` library. + +### 1. 
Listing Pangenomes +```python +import requests + +service_url = "https://appdev.kbase.us/services/berdl_table_scanner" +token = "YOUR_KBASE_TOKEN" +berdl_id = "76990/ADP1Test" + +headers = {"Authorization": token} +params = {"berdl_table_id": berdl_id} + +response = requests.get(f"{service_url}/pangenomes", headers=headers, params=params) +data = response.json() + +print(f"Found {data['pangenome_count']} pangenomes") +for pg in data['pangenomes']: + print(f"- {pg['pangenome_id']} (Source: {pg['source_berdl_id']})") +``` + +### 2. Querying Table Data + +Query table data with filtering and column selection. + +```python +headers = {"Authorization": token} + +# Get data from "Conditions" table +berdl_id = "76990/ADP1Test" +table_name = "Conditions" + +payload = { + "berdl_table_id": berdl_id, + "table_name": table_name, + "columns": "Database_ID, Name", + "col_filter": { + "Name": "test" + }, + "order_by": [ + {"column": "Name", "direction": "ASC"} + ], + "limit": 5, + "offset": 0 +} + +response = requests.post(f"{service_url}/table-data", json=payload, headers=headers) +data = response.json() + +print(f"Loaded {data['row_count']} rows from {table_name}") +print(f"Headers: {data['headers']}") +``` + +## Multi-Source Querying + +The `/pangenomes` endpoint supports multiple comma-separated BERDL IDs. + +```python +multi_params = { + "berdl_table_id": "76990/ADP1Test, 12345/AnotherTable" +} + +response = requests.get(f"{service_url}/pangenomes", headers=headers, params=multi_params) +# Returns pangenomes from BOTH objects in a single list +``` + +## Viewer Usage + +The web viewer is available at: +`https://appdev.kbase.us/services/berdl_table_scanner/static/viewer.html` + +1. Enter **Auth Token**. +2. Enter **BERDL Table ID(s)** (comma-separated). +3. Click **Search**. +4. Use the interface to filter, sort, and export data. diff --git a/pyproject.toml b/pyproject.toml index b11cc95..2e6923c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,11 @@ dependencies = [ "minio>=7.2.20", "pydantic-settings>=2.0.0", "requests>=2.31.0", + "pandas>=2.2.0", + "PyYAML>=6.0", + "tqdm>=4.64.0", + "itables>=1.5.0", + "ipywidgets>=8.0.0", ] [build-system] diff --git a/static/viewer.html b/static/viewer.html index 34ab700..145d5b3 100644 --- a/static/viewer.html +++ b/static/viewer.html @@ -522,12 +522,7 @@

[viewer.html header markup hunk (12 → 7 lines, around the "Local" badge): HTML not preserved in this extraction; the reduction matches the environment-selector removal noted in the script changes below.]
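The script hunks that follow point the viewer at the reworked endpoints: `GET /tables` now takes only `berdl_table_id` and auto-resolves the pangenome, and `POST /table-data` performs paging, filtering, and sorting server-side. For reference, the equivalent `/tables` call from Python (a minimal sketch using the AppDev base URL and test object from the docs above; the listed table names are only an example):

```python
import requests

service_url = "https://appdev.kbase.us/services/berdl_table_scanner"
token = "YOUR_KBASE_TOKEN"

# GET /tables auto-resolves the pangenome for the given BERDLTables object
resp = requests.get(
    f"{service_url}/tables",
    headers={"Authorization": token},
    params={"berdl_table_id": "76990/ADP1Test"},
    timeout=60,
)
resp.raise_for_status()
payload = resp.json()

print(payload["pangenome_id"])                 # resolved automatically (1:1 mapping assumed)
print([t["name"] for t in payload["tables"]])  # e.g. ['Conditions', ...]
```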
@@ -730,7 +725,8 @@

currentPage: 0, pageSize: 100, filteredData: [], - pagedData: [] + pagedData: [], + showFilters: false }; // ================================================================= @@ -742,7 +738,7 @@

authToken: $('authToken'), apiUrlSelect: $('apiUrlSelect'), apiModeLabel: $('apiModeLabel'), - kbEnvSelect: $('kbEnvSelect'), + // kbEnvSelect removed loadBerdlBtn: $('loadBerdlBtn'), pangenomeSelect: $('pangenomeSelect'), tableSelect: $('tableSelect'), @@ -808,12 +804,15 @@

} function showAlert(msg, type = 'info') { - dom.alertArea.innerHTML = `
${msg}
`; + dom.alertArea.innerHTML = `
${msg}
`; setTimeout(() => dom.alertArea.innerHTML = '', 4000); } function getCacheKey() { - return `${dom.berdlTableId.value}|${dom.pangenomeSelect.value}|${dom.tableSelect.value}`; + const selectedOption = dom.pangenomeSelect.options[dom.pangenomeSelect.selectedIndex]; + const pangenomeId = selectedOption ? selectedOption.value : ''; + const berdlId = selectedOption ? selectedOption.dataset.berdl : ''; + return `${berdlId}|${pangenomeId}|${dom.tableSelect.value}`; } function getHeaders() { @@ -992,16 +991,21 @@

* Handles auto-selection when only one pangenome exists. */ async function loadPangenomes() { - const berdlId = dom.berdlTableId.value.trim(); - if (!berdlId) { showAlert('Enter BERDLTable ID', 'error'); return; } + const berdlIdInput = dom.berdlTableId.value.trim(); + + // Allow multiple IDs (split by comma, clean whitespace) + // Filter out empty strings + const berdlIds = berdlIdInput.split(',').map(s => s.trim()).filter(Boolean); + + if (berdlIds.length === 0) { showAlert('Enter BERDLTable ID(s)', 'error'); return; } if (!dom.authToken.value) { showAlert('Enter Auth Token', 'error'); return; } setStatus('Loading pangenomes...', 'loading'); try { const url = new URL(`${getApiUrl()}/pangenomes`); - url.searchParams.set('berdl_table_id', berdlId); - url.searchParams.set('kb_env', dom.kbEnvSelect.value); + // Join back with commas for the API call + url.searchParams.set('berdl_table_id', berdlIds.join(',')); const res = await fetch(url, { headers: getHeaders() }); if (!res.ok) throw new Error((await res.json()).detail || res.statusText); @@ -1010,20 +1014,31 @@

const pgs = data.pangenomes || []; const autoSelected = data.auto_selected; - // Populate dropdown + // Populate dropdown with group context if multiple IDs used + // Format: "PangenomeID - SourceID" + // VALUE: source_berdl_id (as pangenome_id is now inferred) dom.pangenomeSelect.innerHTML = pgs.length - ? pgs.map(p => ``).join('') + ? pgs.map(p => { + const label = `${p.pangenome_id}${p.pangenome_taxonomy ? ' - ' + p.pangenome_taxonomy : ''} [${p.source_berdl_id}]`; + return ``; + }).join('') : ''; dom.pangenomeSelect.disabled = pgs.length === 0; dom.refreshBtn.disabled = pgs.length === 0; - // Auto-select if only one pangenome + // Auto-select if ONLY one pangenome + // Since value is now source_berdl_id, we need to find it from the data if (autoSelected) { - dom.pangenomeSelect.value = autoSelected; - setStatus(`Auto-selected: ${autoSelected}`, 'success'); + // Find the corresponding source_berdl_id for the autoSelected pangenome_id + const target = pgs.find(p => p.pangenome_id === autoSelected); + if (target) { + dom.pangenomeSelect.value = target.pangenome_id; + setStatus(`Auto-selected: ${autoSelected}`, 'success'); + } } else { - setStatus(`Found ${pgs.length} pangenomes`, 'success'); + // If multiple IDs, just show count + setStatus(`Found ${pgs.length} pangenomes from ${berdlIds.length} source(s)`, 'success'); } if (pgs.length) await loadTables(); @@ -1035,29 +1050,38 @@

} async function loadTables() { - const pgId = dom.pangenomeSelect.value; - if (!pgId) return; + const selectedOption = dom.pangenomeSelect.options[dom.pangenomeSelect.selectedIndex]; + if (!selectedOption) return; + + const berdlId = selectedOption.dataset.berdl; + if (!berdlId) return; setStatus('Loading tables...', 'loading'); try { - const url = new URL(`${getApiUrl()}/tables/${pgId}`); - url.searchParams.set('berdl_table_id', dom.berdlTableId.value); - url.searchParams.set('kb_env', dom.kbEnvSelect.value); + const url = new URL(`${getApiUrl()}/tables`); + url.searchParams.set('berdl_table_id', berdlId); const res = await fetch(url, { headers: getHeaders() }); if (!res.ok) throw new Error((await res.json()).detail || res.statusText); const data = await res.json(); - const tables = data.tables || []; - dom.tableSelect.innerHTML = tables.length - ? tables.map(t => ``).join('') - : ''; + // Populate table select + dom.tableSelect.innerHTML = data.tables.length + ? data.tables.map(t => ``).join('') + : ''; - dom.tableSelect.disabled = tables.length === 0; - dom.loadDataBtn.disabled = tables.length === 0; - setStatus(`Found ${tables.length} tables`, 'success'); + dom.tableSelect.disabled = data.tables.length === 0; + + // Auto-load first table if no specific selection + if (data.tables.length) { + dom.tableSelect.selectedIndex = 0; + await loadTableData(); + } else { + showEmpty(); + } + setStatus(`Loaded ${data.tables.length} tables`, 'success'); } catch (err) { setStatus(`Error: ${err.message}`, 'error'); @@ -1065,6 +1089,10 @@

} } + function refreshData() { + loadTableData(); + } + async function loadTableData(forceRefresh = false) { const key = getCacheKey(); @@ -1088,18 +1116,46 @@

return; } + // ================================================================= + // STATE MANAGEMENT & SERVER-SIDE DATA FETCHING + // ================================================================= + + // Trigger a data refresh based on current state + + if (!dom.tableSelect.value) return; + + // If loading from distinct cache key (pangenome+table), reset view only if needed + // But generally we just fetch with current state + showLoading(); - try { - const body = { - berdl_table_id: dom.berdlTableId.value, - pangenome_id: dom.pangenomeSelect.value, - table_name: dom.tableSelect.value, - limit: 500000, - offset: 0, - kb_env: dom.kbEnvSelect.value - }; + const selectedOption = dom.pangenomeSelect.options[dom.pangenomeSelect.selectedIndex]; + const berdlId = selectedOption.dataset.berdl; + const pangenomeId = selectedOption.value; + + // Construct request based on STATE + const body = { + berdl_table_id: berdlId, + pangenome_id: pangenomeId, + table_name: dom.tableSelect.value, + limit: state.pageSize, + offset: state.currentPage * state.pageSize, + columns: 'all', + col_filter: state.columnFilters, + search_value: state.clientSearch, + sort_column: state.sortColumn, + sort_order: state.sortOrder ? state.sortOrder.toUpperCase() : 'ASC' + }; + // Add order_by if sorting is active (backend expects list for strictly complex sorts, but simple fields work too) + if (state.sortColumn) { + body.order_by = [{ + column: state.sortColumn, + direction: state.sortOrder ? state.sortOrder.toUpperCase() : 'ASC' + }]; + } + + try { const res = await fetch(`${getApiUrl()}/table-data`, { method: 'POST', headers: getHeaders(), @@ -1109,152 +1165,124 @@

if (!res.ok) throw new Error((await res.json()).detail || res.statusText); const data = await res.json(); - - // Store in client cache - state.cache.set(key, { headers: data.headers, data: data.data, meta: data }); - state.currentKey = key; + state.allData = data.data || []; // Current page data state.allHeaders = data.headers || []; - state.allData = data.data || []; + state.totalRows = data.filtered_count; // Use filtered count for pagination + state.serverTotal = data.row_count; - // Reset view state - state.visibleColumns = new Set(state.allHeaders); - state.sortColumn = null; - state.sortOrder = 'asc'; - state.columnFilters = {}; - state.clientSearch = ''; - state.currentPage = 0; - dom.clientSearch.value = ''; + // Init visible columns if first load + if (state.visibleColumns.size === 0) { + state.visibleColumns = new Set(state.allHeaders); + renderHeaders(); // Only render headers on first load or schema change + } - // Update UI + // Update UI components dom.cacheBadge.style.display = 'none'; dom.apiTime.textContent = data.response_time_ms?.toFixed(0) || '--'; dom.dataSource.textContent = data.source || 'Server'; dom.dbFile.textContent = data.sqlite_file ? data.sqlite_file.split('/').pop() : '--'; hideLoading(); - setStatus(`Loaded ${state.allData.length} rows`, 'success'); - applyFiltersAndRender(); + // Render ONLY body and updates + renderBody(); + updatePagination(); + updateHeaderIcons(); // Update sort icons without destroying inputs + updateColumnPanel(); + updateCounts(); } catch (err) { hideLoading(); showEmpty(); setStatus(`Error: ${err.message}`, 'error'); - showAlert(err.message, 'error'); + console.error(err); } } - // ================================================================= - // CLIENT-SIDE FILTERING, SORTING, PAGINATION - // ================================================================= - function applyFiltersAndRender() { - let data = [...state.allData]; - const headers = state.allHeaders; - - // Column filters - for (const [col, val] of Object.entries(state.columnFilters)) { - if (!val) continue; - const idx = headers.indexOf(col); - if (idx < 0) continue; - const lowerVal = val.toLowerCase(); - data = data.filter(row => String(row[idx] || '').toLowerCase().includes(lowerVal)); - } - - // Global search - if (state.clientSearch) { - const term = state.clientSearch.toLowerCase(); - data = data.filter(row => row.some(cell => String(cell || '').toLowerCase().includes(term))); - } - - // Sorting - if (state.sortColumn) { - const idx = headers.indexOf(state.sortColumn); - if (idx >= 0) { - data.sort((a, b) => { - const aVal = a[idx] ?? ''; - const bVal = b[idx] ?? ''; - const aNum = parseFloat(aVal); - const bNum = parseFloat(bVal); - if (!isNaN(aNum) && !isNaN(bNum)) { - return state.sortOrder === 'asc' ? aNum - bNum : bNum - aNum; - } - const cmp = String(aVal).localeCompare(String(bVal)); - return state.sortOrder === 'asc' ? 
cmp : -cmp; - }); - } - } - - state.filteredData = data; - - // Paginate - const start = state.currentPage * state.pageSize; - state.pagedData = data.slice(start, start + state.pageSize); - - renderTable(); - updatePagination(); - updateColumnPanel(); - } - // ================================================================= // RENDERING // ================================================================= - function renderTable() { - const headers = state.allHeaders; - const data = state.pagedData; - if (!headers.length) { showEmpty(); return; } - showTable(); + function renderHeaders() { + const headers = state.allHeaders; // Header row let headerHtml = ''; headers.forEach(h => { const hidden = !state.visibleColumns.has(h) ? ' hidden-col' : ''; - const sortCls = state.sortColumn === h ? (state.sortOrder === 'asc' ? ' sort-asc' : ' sort-desc') : ''; - headerHtml += `${h}`; + headerHtml += `${h}`; }); headerHtml += ''; - // Filter row - let filterHtml = ''; + // Filter row (Persist values if re-rendering) + // Note: We use state.columnFilters to populate, but usually we just toggle visibility + let filterHtml = ''; headers.forEach(h => { const hidden = !state.visibleColumns.has(h) ? ' hidden-col' : ''; - filterHtml += ``; + const val = state.columnFilters[h] || ''; + filterHtml += ``; }); filterHtml += ''; dom.tableHead.innerHTML = headerHtml + filterHtml; + } + + function renderBody() { + const headers = state.allHeaders; + const data = state.allData; // This is now just the current page + + if (!headers.length) { showEmpty(); return; } + showTable(); - // Body - apply cell value transformations for links dom.tableBody.innerHTML = data.map((row, ri) => { return '' + headers.map((h, ci) => { const hidden = !state.visibleColumns.has(h) ? ' hidden-col' : ''; const rawVal = row[ci] ?? ''; - // Transform cell value to add links for known identifiers const displayVal = transformCellValue(rawVal, h); return `${displayVal}`; }).join('') + ''; }).join(''); + } + + function updateHeaderIcons() { + // update sort classes + document.querySelectorAll('.ts-table thead th[data-col]').forEach(th => { + th.classList.remove('sort-asc', 'sort-desc'); + if (th.dataset.col === state.sortColumn) { + th.classList.add(state.sortOrder === 'asc' ? 'sort-asc' : 'sort-desc'); + } + }); + + // update visibility classes + const headers = state.allHeaders; + const visibleHeaders = headers.filter(h => state.visibleColumns.has(h)); + + // We'd need to complex match indices to update hidden classes dynamically + // For now, if visibility changes, we might need full re-render, + // but for sorting/filtering we rely on updateHeaderIcons. + } - // Counts + function updateCounts() { dom.tableBadge.textContent = dom.tableSelect.value; - dom.totalCount.textContent = state.allData.length; - dom.filteredCount.textContent = state.filteredData.length; + dom.totalCount.textContent = state.serverTotal || 0; + dom.filteredCount.textContent = state.totalRows || 0; const start = state.currentPage * state.pageSize; - dom.showStart.textContent = data.length ? start + 1 : 0; - dom.showEnd.textContent = start + data.length; - - dom.exportCsvBtn.disabled = false; + const count = state.allData.length; + dom.showStart.textContent = count ? 
start + 1 : 0; + dom.showEnd.textContent = start + count; } function updatePagination() { - const totalPages = Math.ceil(state.filteredData.length / state.pageSize); + const totalPages = Math.ceil((state.totalRows || 0) / state.pageSize); + if (totalPages <= 1) { dom.pagination.innerHTML = ''; return; } let html = ''; + // First/Prev html += ``; html += ``; @@ -1267,6 +1295,7 @@

html += ``; } + // Next/Last html += ``; html += ``; @@ -1292,14 +1321,14 @@
const visibleIndices = visibleHeaders.map(h => state.allHeaders.indexOf(h)); let csv = visibleHeaders.map(h => `"${h.replace(/"/g, '""')}"`).join(',') + '\n'; - state.filteredData.forEach(row => { + state.allData.forEach(row => { csv += visibleIndices.map(i => `"${String(row[i] ?? '').replace(/"/g, '""')}"`).join(',') + '\n'; }); const blob = new Blob([csv], { type: 'text/csv' }); const url = URL.createObjectURL(blob); const a = document.createElement('a'); - a.href = url; a.download = `${dom.tableSelect.value}_export.csv`; + a.href = url; a.download = `${dom.tableSelect.value}_page_${state.currentPage + 1}.csv`; document.body.appendChild(a); a.click(); document.body.removeChild(a); URL.revokeObjectURL(url); } @@ -1345,20 +1374,23 @@
dom.clientSearch.addEventListener('input', debounce(() => { state.clientSearch = dom.clientSearch.value; state.currentPage = 0; - applyFiltersAndRender(); - }, 200)); + refreshData(); + }, 500)); dom.pageSizeSelect.addEventListener('change', () => { state.pageSize = parseInt(dom.pageSizeSelect.value); state.currentPage = 0; - applyFiltersAndRender(); + refreshData(); }); dom.columnsBtn.addEventListener('click', () => dom.columnPanel.classList.toggle('show')); dom.filtersBtn.addEventListener('click', () => { + // Toggle state + state.showFilters = !state.showFilters; + // Update UI visibility document.querySelectorAll('.ts-filter-row').forEach(r => { - r.style.display = r.style.display === 'none' ? '' : 'none'; + r.style.display = state.showFilters ? '' : 'none'; }); }); @@ -1370,7 +1402,9 @@
state.clientSearch = ''; state.currentPage = 0; dom.clientSearch.value = ''; - applyFiltersAndRender(); + // Render headers to clear filter inputs physically + renderHeaders(); + refreshData(); }); dom.exportCsvBtn.addEventListener('click', exportCsv); @@ -1399,11 +1433,11 @@
state.sortOrder = 'asc'; } state.currentPage = 0; - applyFiltersAndRender(); + refreshData(); } }); - // Column filter input + // Column filter input (Debounced Server Call) document.addEventListener('input', debounce(e => { if (e.target.dataset.filter) { const col = e.target.dataset.filter; @@ -1411,16 +1445,16 @@
if (val) state.columnFilters[col] = val; else delete state.columnFilters[col]; state.currentPage = 0; - applyFiltersAndRender(); + refreshData(); } - }, 200)); + }, 500)); // Pagination document.addEventListener('click', e => { const btn = e.target.closest('.ts-page-btn[data-page]'); if (btn && !btn.disabled) { state.currentPage = parseInt(btn.dataset.page); - applyFiltersAndRender(); + refreshData(); } }); @@ -1430,26 +1464,28 @@
const col = e.target.dataset.toggleCol; if (e.target.checked) state.visibleColumns.add(col); else state.visibleColumns.delete(col); - applyFiltersAndRender(); + renderHeaders(); + renderBody(); } }); document.addEventListener('click', e => { if (e.target.id === 'selectAllCols') { state.visibleColumns = new Set(state.allHeaders); - applyFiltersAndRender(); + renderHeaders(); + renderBody(); } else if (e.target.id === 'selectNoneCols') { state.visibleColumns.clear(); - applyFiltersAndRender(); + renderHeaders(); + renderBody(); } }); - // Cell tooltip - use raw value from data attribute + // Cell tooltip document.addEventListener('mouseover', e => { if (e.target.matches('.ts-table td')) { const colIdx = parseInt(e.target.dataset.col); const col = state.allHeaders[colIdx]; - // Use raw value from data attribute (not transformed HTML) const rawValue = e.target.dataset.raw || e.target.textContent; showTooltip(e, col, rawValue); } From 6ac5cf2606be903941e531e1f8b8844fbbaf0860 Mon Sep 17 00:00:00 2001 From: VibhavSetlur Date: Wed, 17 Dec 2025 11:12:23 -0600 Subject: [PATCH 4/5] copilot issue fix --- app/utils/sqlite.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/app/utils/sqlite.py b/app/utils/sqlite.py index 59c8d09..5ae3785 100644 --- a/app/utils/sqlite.py +++ b/app/utils/sqlite.py @@ -20,6 +20,18 @@ logger = logging.getLogger(__name__) +def _validate_table_name(cursor, table_name: str) -> None: + """ + Validate that table_name corresponds to an existing table in the database. + Prevents SQL injection by ensuring table_name is a valid identifier. + """ + # Parameterized query is safe from injection + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,)) + if not cursor.fetchone(): + # Check for case-insensitive match or just fail + raise ValueError(f"Invalid table name: {table_name}") + + # ============================================================================= # TABLE LISTING & METADATA # ============================================================================= @@ -75,6 +87,9 @@ def get_table_columns(db_path: Path, table_name: str) -> List[str]: conn = sqlite3.connect(str(db_path)) cursor = conn.cursor() + # Validate table name to prevent injection + _validate_table_name(cursor, table_name) + # Use PRAGMA to get table info cursor.execute(f"PRAGMA table_info({table_name})") columns = [row[1] for row in cursor.fetchall()] @@ -102,6 +117,8 @@ def get_table_row_count(db_path: Path, table_name: str) -> int: conn = sqlite3.connect(str(db_path)) cursor = conn.cursor() + _validate_table_name(cursor, table_name) + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") count = cursor.fetchone()[0] conn.close() @@ -146,6 +163,8 @@ def ensure_indices(db_path: Path, table_name: str) -> None: conn = sqlite3.connect(str(db_path)) cursor = conn.cursor() + _validate_table_name(cursor, table_name) + # Get columns cursor.execute(f"PRAGMA table_info({table_name})") columns = [row[1] for row in cursor.fetchall()] @@ -242,6 +261,9 @@ def get_table_data( conn.row_factory = sqlite3.Row cursor = conn.cursor() + # Validate table name + _validate_table_name(cursor, table_name) + # Get all column names first for validation all_headers = get_table_columns(sqlite_file, table_name) From 1cd5a02c9feb3ac1713477f08becff3b9952e6b8 Mon Sep 17 00:00:00 2001 From: VibhavSetlur Date: Wed, 17 Dec 2025 12:13:36 -0600 Subject: [PATCH 5/5] viewer and token handling --- app/utils/workspace.py | 20 +++++++++++++------- static/viewer.html | 17 ++++++++++++----- 
2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/app/utils/workspace.py b/app/utils/workspace.py index 7f27d26..89a2e94 100644 --- a/app/utils/workspace.py +++ b/app/utils/workspace.py @@ -107,7 +107,11 @@ def get_object(self, ref: str, ws: Optional[int] = None) -> Dict[str, Any]: Object data dictionary """ if self._use_kbutillib and self._client: - return self._client.get_object(ref, ws=ws) + try: + return self._client.get_object(ref, ws=ws) + except Exception as e: + logger.warning(f"KBUtilLib get_object failed: {e}. Using fallback.") + return self._get_object_fallback(ref, ws) else: return self._get_object_fallback(ref, ws) @@ -127,12 +131,14 @@ def download_blob_file(self, handle_ref: str, target_path: Path) -> Path: target_path.parent.mkdir(parents=True, exist_ok=True) if self._use_kbutillib and self._client: - result = self._client.download_blob_file(handle_ref, str(target_path)) - if result: - return Path(result) - raise ValueError(f"Failed to download from handle: {handle_ref}") - else: - return Path(self._download_blob_fallback(handle_ref, str(target_path))) + try: + result = self._client.download_blob_file(handle_ref, str(target_path)) + if result: + return Path(result) + except Exception as e: + logger.warning(f"KBUtilLib download_blob_file failed: {e}. Using fallback.") + + return Path(self._download_blob_fallback(handle_ref, str(target_path))) # ========================================================================= # FALLBACK METHODS (Direct API calls) diff --git a/static/viewer.html b/static/viewer.html index 145d5b3..98d7449 100644 --- a/static/viewer.html +++ b/static/viewer.html @@ -518,7 +518,7 @@
                 TableScanner
-                v1.1
+                v1.0 Local
@@ -726,7 +726,9 @@
pageSize: 100, filteredData: [], pagedData: [], - showFilters: false + pagedData: [], + showFilters: false, + lastTable: null }; // ================================================================= @@ -1170,10 +1172,13 @@
state.totalRows = data.filtered_count; // Use filtered count for pagination state.serverTotal = data.row_count; - // Init visible columns if first load - if (state.visibleColumns.size === 0) { + state.serverTotal = data.row_count; + + // Reset visible columns if table changed or first load + if (state.lastTable !== body.table_name || state.visibleColumns.size === 0) { state.visibleColumns = new Set(state.allHeaders); - renderHeaders(); // Only render headers on first load or schema change + state.lastTable = body.table_name; + renderHeaders(); } // Update UI components @@ -1474,10 +1479,12 @@
state.visibleColumns = new Set(state.allHeaders); renderHeaders(); renderBody(); + updateColumnPanel(); } else if (e.target.id === 'selectNoneCols') { state.visibleColumns.clear(); renderHeaders(); renderBody(); + updateColumnPanel(); } });
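
Note on the table-name guard added in PATCH 4/5: _validate_table_name is the only check standing between user-supplied table names and the f-string queries that follow it (PRAGMA table_info, SELECT COUNT(*), and the data query), so every cursor that interpolates an identifier calls it first. Below is a minimal standalone sketch of that pattern; the in-memory database, the sample genes table, and the fetch_row_count wrapper are illustrative only, not part of the service code.

import sqlite3


def _validate_table_name(cursor: sqlite3.Cursor, table_name: str) -> None:
    # Parameterized lookup against sqlite_master is injection-safe; anything
    # that is not an existing table name is rejected before interpolation.
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    )
    if not cursor.fetchone():
        raise ValueError(f"Invalid table name: {table_name}")


def fetch_row_count(conn: sqlite3.Connection, table_name: str) -> int:
    # Illustrative wrapper: validate first, then the f-string is safe because
    # the name is known to be a real table identifier.
    cursor = conn.cursor()
    _validate_table_name(cursor, table_name)
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    return cursor.fetchone()[0]


if __name__ == "__main__":
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE genes (id INTEGER)")
    conn.executemany("INSERT INTO genes VALUES (?)", [(1,), (2,)])
    print(fetch_row_count(conn, "genes"))                  # -> 2
    try:
        fetch_row_count(conn, "genes; DROP TABLE genes")   # rejected
    except ValueError as err:
        print(err)

One caveat: the guard proves the name exists, but a table whose own name contains SQL metacharacters would still reach the f-string, so quoting the identifier in the final query is a cheap extra safety margin.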
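
Note on the viewer changes in PATCH 3/5 and 5/5: filtering, sorting, and paging now go through refreshData, which fetches a single page from the API on every change (debounced to 500 ms) instead of slicing a fully downloaded table in the browser, so the backend query has to validate identifiers and bind the LIMIT/OFFSET window itself. The sketch below shows one way such a page query can be built; fetch_page and its signature are assumptions for illustration, not the service's actual get_table_data.

import sqlite3
from typing import Any, Dict, List, Optional


def fetch_page(
    conn: sqlite3.Connection,
    table_name: str,
    columns: List[str],
    sort_column: Optional[str] = None,
    sort_order: str = "asc",
    page: int = 0,
    page_size: int = 100,
) -> List[Dict[str, Any]]:
    # Identifiers (table and columns) are checked against the schema; values
    # (page size and offset) travel as bound parameters.
    cursor = conn.cursor()
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    )
    if not cursor.fetchone():
        raise ValueError(f"Invalid table name: {table_name}")

    cursor.execute(f"PRAGMA table_info({table_name})")
    known = {row[1] for row in cursor.fetchall()}
    if not set(columns) <= known or (sort_column and sort_column not in known):
        raise ValueError("Unknown column requested")

    order_sql = ""
    if sort_column:
        direction = "DESC" if sort_order.lower() == "desc" else "ASC"
        order_sql = f' ORDER BY "{sort_column}" {direction}'

    quoted_cols = ", ".join(f'"{c}"' for c in columns)
    sql = f"SELECT {quoted_cols} FROM {table_name}{order_sql} LIMIT ? OFFSET ?"
    cursor.execute(sql, (page_size, page * page_size))
    return [dict(zip(columns, row)) for row in cursor.fetchall()]


if __name__ == "__main__":
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE genes (id INTEGER, name TEXT)")
    conn.executemany(
        "INSERT INTO genes VALUES (?, ?)", [(i, f"g{i}") for i in range(250)]
    )
    rows = fetch_page(conn, "genes", ["id", "name"],
                      sort_column="id", sort_order="desc", page=1)
    print(len(rows), rows[0])   # -> 100 {'id': 149, 'name': 'g149'}

This mirrors the division of labor the viewer now assumes: the client only tracks currentPage, pageSize, sortColumn, and columnFilters, while row counts come back from the server as row_count and filtered_count.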