11 changes: 11 additions & 0 deletions .env.example
@@ -0,0 +1,11 @@
# TableScanner Environment Variables
# Copy this file to .env and fill in your actual values

# KBase Service Authentication Token
KB_SERVICE_AUTH_TOKEN=your_kbase_token_here

# Cache directory for storing downloaded files and SQLite databases
CACHE_DIR=/tmp/tablescanner_cache

# KBase Workspace Service URL
WORKSPACE_URL=https://kbase.us/services/ws
3 changes: 3 additions & 0 deletions .gitignore
@@ -17,3 +17,6 @@ uv.lock
.venv
venv/
*.log

# Environment variables
.env
24 changes: 24 additions & 0 deletions app/config.py
@@ -0,0 +1,24 @@
"""
Configuration settings for TableScanner application.
"""

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
"""Application settings."""

KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str
Comment on lines +9 to +12
Copilot AI Dec 15, 2025
Missing validation for required environment variables: If KB_SERVICE_AUTH_TOKEN, CACHE_DIR, or WORKSPACE_URL are not set in the environment, pydantic-settings will raise a validation error. Consider providing default values or adding clear documentation about required environment variables, especially for CACHE_DIR which could have a sensible default like "./cache".

Suggested change
"""Application settings."""
KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str
"""
Application settings.
Required environment variables:
- KB_SERVICE_AUTH_TOKEN
- WORKSPACE_URL
Optional environment variables:
- CACHE_DIR (default: "./cache")
"""
KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str = "./cache"


# KBase Workspace settings
WORKSPACE_URL: str

class Config:
env_file = ".env"
env_file_encoding = "utf-8"
case_sensitive = True


# Global settings instance
settings = Settings()
10 changes: 7 additions & 3 deletions app/main.py
@@ -6,12 +6,13 @@

from fastapi import FastAPI
from app.routes import router
from app.config import settings


def create_app() -> FastAPI:
"""
Application factory function.

Returns:
FastAPI: Configured FastAPI application instance
"""
@@ -20,10 +21,13 @@ def create_app() -> FastAPI:
description="API for table scanning operations",
version="1.0.0"
)


# Store settings in app state for access throughout the application
app.state.settings = settings

# Include routes
app.include_router(router)

return app


15 changes: 15 additions & 0 deletions app/models.py
@@ -0,0 +1,15 @@
"""
Pydantic models for request/response schemas.
"""

from typing import Optional, List, Dict, Any
from pydantic import BaseModel


class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
Comment on lines +6 to +12
Copilot AI Dec 15, 2025
Missing input validation: The pangenome_id and table_name fields accept any string without validation. Consider adding constraints (e.g., min_length, pattern validation) to prevent empty strings or malicious inputs, especially since table_name is directly used in SQL queries.

Suggested change
from pydantic import BaseModel
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
from pydantic import BaseModel, constr
class SearchRequest(BaseModel):
"""Search request with query parameters."""
    pangenome_id: constr(min_length=1, pattern=r"^[A-Za-z0-9_\-]+$")
    table_name: constr(min_length=1, pattern=r"^[A-Za-z_][A-Za-z0-9_]*$")

limit: Optional[int] = None
Comment on lines +6 to +13
Copilot AI Dec 15, 2025
Missing validation for limit field: The limit field should have a minimum value (at least 1) and possibly a maximum value to prevent excessive memory usage or denial of service from extremely large result sets. Consider adding Field validators with appropriate constraints.

Suggested change
from pydantic import BaseModel
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
limit: Optional[int] = None
from pydantic import BaseModel, Field
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
limit: Optional[int] = Field(
default=None,
ge=1,
le=1000,
description="Maximum number of results to return (min 1, max 1000)"
)

order_by: Optional[List[Dict[str, str]]] = None
filters: Optional[List[Dict[str, Any]]] = None
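
Beyond the pattern check suggested in the comment on lines +6 to +12 above, the SQLite layer could also verify that the requested table actually exists before its name is interpolated into a query. A minimal sketch; get_table_data's internals are not part of this diff, so the helper below is purely illustrative:

import sqlite3


def assert_table_exists(conn: sqlite3.Connection, table_name: str) -> None:
    """Raise if table_name is not a real table (parameterized lookup, no string interpolation)."""
    row = conn.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
        (table_name,),
    ).fetchone()
    if row is None:
        raise ValueError(f"Unknown table: {table_name!r}")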
77 changes: 66 additions & 11 deletions app/routes.py
@@ -4,34 +4,89 @@
Contains all API endpoint definitions.
"""

-from fastapi import APIRouter, Query
+from pathlib import Path
+from fastapi import APIRouter, Request, HTTPException

from app.models import SearchRequest
from app.utils.workspace import get_object_info
from app.utils.download import download_from_handle
from app.utils.cache import get_cache_paths, save_to_cache, is_cached
from app.utils.sqlite import convert_to_sqlite

router = APIRouter()


@router.get("/")
-async def root():
+async def root(request: Request):
"""Root endpoint returning service information."""
settings = request.app.state.settings
    return {
        "service": "TableScanner",
        "version": "1.0.0",
-        "status": "running"
+        "status": "running",
+        "cache_dir": settings.CACHE_DIR
Copilot AI Dec 15, 2025
The cache_dir field is exposed in the root endpoint response but not documented in the function's docstring. Consider updating the docstring to describe what information is returned in the response, including the cache_dir field.

}
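
One way to address the docstring note above, sketched with the fields the handler currently returns:

@router.get("/")
async def root(request: Request):
    """Return basic service information: service name, version, status, and the configured cache_dir."""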


-@router.get("/search")
-def search(id: str = Query(..., description="ID to search for")):
+@router.post("/search")
+def search(request: Request, search_request: SearchRequest):
Copilot AI Dec 15, 2025
Inconsistent async pattern: The search endpoint is a synchronous function while the root endpoint is async. For consistency and to properly handle blocking I/O operations, consider making this endpoint async and using appropriate async libraries for I/O operations.

Suggested change
def search(request: Request, search_request: SearchRequest):
async def search(request: Request, search_request: SearchRequest):

"""
-    Search endpoint that takes an ID parameter.
+    Search endpoint with flexible querying.
    Args:
-        id: The ID to search for (required)
+        search_request: Search parameters including pangenome_id, table_name, limit, order_by, filters
Returns:
A dictionary with search results
"""
settings = request.app.state.settings
token = settings.KB_SERVICE_AUTH_TOKEN
cache_dir = Path(settings.CACHE_DIR)
workspace_url = settings.WORKSPACE_URL

    # TODO: Use the user's token instead of a static one

# Get object info from KBase Workspace
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
Copilot AI Dec 15, 2025
The get_object_info function always raises NotImplementedError (as seen in workspace.py line 39), which will cause this endpoint to fail on every request. The function call will crash before reaching the subsequent logic. This should be implemented or the endpoint should handle the NotImplementedError appropriately.

Suggested change
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
try:
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="get_object_info is not implemented."
)

filename = object_info.get('filename', f'{search_request.pangenome_id}.bin')
handle_url = object_info.get('handle_url') or object_info.get('blobstore_url')

if not handle_url:
raise HTTPException(
status_code=404,
detail=f"No handle/blobstore URL found for id: {search_request.pangenome_id}"
)

# Get cache paths
cache_file_path, sqlite_file_path = get_cache_paths(cache_dir, search_request.pangenome_id, filename)

# Download and cache if not already cached
if not is_cached(cache_file_path):
# Download from handle/blobstore service
binary_data = download_from_handle(handle_url, token)
save_to_cache(cache_file_path, binary_data)
Comment on lines +66 to +67
Copilot AI Dec 15, 2025
Blocking I/O in async context: The download_from_handle function performs blocking network I/O using the synchronous requests library. Since this is called from a FastAPI endpoint, it blocks the event loop. Consider using an async HTTP client (like httpx) or running the download in a thread pool executor to maintain async benefits.
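
A minimal async alternative for the download step, assuming httpx were added as a dependency (illustrative only; the PR's download_from_handle keeps the synchronous requests call):

import httpx


async def download_from_handle_async(handle_url: str, auth_token: str) -> bytes:
    """Fetch binary data without blocking the event loop (sketch)."""
    headers = {"Authorization": auth_token}
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(handle_url, headers=headers)
        response.raise_for_status()
        return response.content

The endpoint would also need to be async (see the comment on the search definition above) for this to pay off.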


# Convert to SQLite if not already converted
if not is_cached(sqlite_file_path):
convert_to_sqlite(cache_file_path, sqlite_file_path)
Copilot AI Dec 15, 2025
The convert_to_sqlite function raises NotImplementedError, but this is called without a try-except block. This will cause the endpoint to fail with an unhandled exception. Either implement the function or add proper error handling to return a meaningful HTTP error response.

Suggested change
convert_to_sqlite(cache_file_path, sqlite_file_path)
try:
convert_to_sqlite(cache_file_path, sqlite_file_path)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="The convert_to_sqlite function is not implemented."
)


Comment on lines +71 to +72
Copilot AI Dec 15, 2025
The convert_to_sqlite function always raises NotImplementedError (as seen in sqlite.py line 48), which will cause the endpoint to fail when attempting to convert uncached files. This should be implemented or the endpoint should handle the NotImplementedError appropriately.

Suggested change
convert_to_sqlite(cache_file_path, sqlite_file_path)
try:
convert_to_sqlite(cache_file_path, sqlite_file_path)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="Conversion to SQLite is not implemented yet."
)

# Query the SQLite file with parameters
from app.utils.sqlite import get_table_data
Copilot AI Dec 15, 2025
The import for get_table_data should be moved to the top of the file with the other imports from app.utils.sqlite (line 14) for better code organization and consistency. Inline imports should be avoided unless there's a specific reason like circular imports.
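
Concretely, the existing import on line 14 could simply be extended:

from app.utils.sqlite import convert_to_sqlite, get_table_data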

results = get_table_data(
sqlite_file_path,
table_name=search_request.table_name,
limit=search_request.limit,
order_by=search_request.order_by,
filters=search_request.filters,
)

#TODO use a return model when we figure out what we want to return
Copilot AI Dec 15, 2025
The TODO comment is missing a space after the hash and colon. The standard format is '# TODO:' with proper spacing for consistency with other TODO comments in the codebase.

Suggested change
#TODO use a return model when we figure out what we want to return
# TODO: use a return model when we figure out what we want to return

Copilot AI Dec 15, 2025
Inconsistent comment formatting: The TODO comment should have a space after the colon for consistency with the TODO on line 47.

Suggested change
#TODO use a return model when we figure out what we want to return
# TODO: use a return model when we figure out what we want to return

    return {
-        "query_id": id,
+        "pangenome_id": search_request.pangenome_id,
+        "table_name": search_request.table_name,
        "status": "success",
-        "message": f"Search completed for ID: {id}"
+        "cache_file": str(cache_file_path),
+        "sqlite_file": str(sqlite_file_path),
Comment on lines +88 to +89
Copilot AI Dec 15, 2025
Potential information disclosure: The response exposes internal file system paths (cache_file and sqlite_file). This could reveal sensitive information about the server's directory structure to clients. Consider removing these fields or making them optional for debugging purposes only.

Suggested change
"cache_file": str(cache_file_path),
"sqlite_file": str(sqlite_file_path),

"row_count": len(results),
"results": results
}
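
If the paths are still useful for troubleshooting, an alternative to dropping them outright is to gate them behind a setting. A sketch; the DEBUG flag is hypothetical and not defined in app/config.py:

    response = {
        "pangenome_id": search_request.pangenome_id,
        "table_name": search_request.table_name,
        "status": "success",
        "row_count": len(results),
        "results": results,
    }
    if getattr(settings, "DEBUG", False):  # hypothetical flag, not part of this PR
        response["cache_file"] = str(cache_file_path)
        response["sqlite_file"] = str(sqlite_file_path)
    return response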
22 changes: 22 additions & 0 deletions app/utils/__init__.py
@@ -0,0 +1,22 @@
"""
Utils module for TableScanner.

Contains business logic separated from route handlers.
"""

from app.utils.download import download_from_handle
from app.utils.workspace import get_object_info
from app.utils.cache import get_cache_paths, ensure_cache_dir, save_to_cache, is_cached
from app.utils.sqlite import convert_to_sqlite, query_sqlite, get_table_data

__all__ = [
"download_from_handle",
"get_object_info",
"get_cache_paths",
"ensure_cache_dir",
"save_to_cache",
"is_cached",
"convert_to_sqlite",
"query_sqlite",
"get_table_data",
]
58 changes: 58 additions & 0 deletions app/utils/cache.py
@@ -0,0 +1,58 @@
"""
Cache utilities for managing local file caching.
"""

from pathlib import Path
from typing import Tuple


def get_cache_paths(cache_dir: Path, id: str, filename: str) -> Tuple[Path, Path]:
"""
Get cache file paths for a given ID and filename.
Args:
cache_dir: Base cache directory
id: Object ID
filename: Original filename
Returns:
Tuple of (cache_file_path, sqlite_file_path)
"""
cache_file_path = cache_dir / id / filename
sqlite_file_path = cache_dir / id / f"{Path(filename).stem}.db"
Comment on lines +21 to +22
Copilot AI Dec 15, 2025
Path traversal vulnerability: The id and filename parameters are used directly in path construction without validation. An attacker could provide values like "../../../etc/passwd" to access or create files outside the intended cache directory. Consider sanitizing these inputs by removing path separators or validating against a safe pattern.
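
A small validation helper along those lines; a sketch, where the allowed character set is an assumption rather than something defined elsewhere in this PR:

import re

_SAFE_COMPONENT = re.compile(r"^[A-Za-z0-9._\-]+$")


def safe_component(value: str) -> str:
    """Reject separators and parent references so id and filename stay inside cache_dir."""
    if not _SAFE_COMPONENT.fullmatch(value) or value in (".", ".."):
        raise ValueError(f"Unsafe path component: {value!r}")
    return value

get_cache_paths could then build paths as cache_dir / safe_component(id) / safe_component(filename).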

return cache_file_path, sqlite_file_path


def ensure_cache_dir(cache_path: Path) -> None:
"""
Ensure cache directory exists.
Args:
cache_path: Path to cache file (directory will be created from parent)
"""
cache_path.parent.mkdir(parents=True, exist_ok=True)


def save_to_cache(cache_path: Path, data: bytes) -> None:
"""
Save binary data to cache file.
Args:
cache_path: Path where file should be saved
data: Binary data to save
"""
ensure_cache_dir(cache_path)
cache_path.write_bytes(data)


def is_cached(cache_path: Path) -> bool:
"""
Check if file exists in cache.
Args:
cache_path: Path to cache file
Returns:
True if file exists, False otherwise
"""
return cache_path.exists()
25 changes: 25 additions & 0 deletions app/utils/download.py
@@ -0,0 +1,25 @@
"""
Handle/Blobstore utilities for downloading files.
"""

import requests


def download_from_handle(handle_url: str, auth_token: str) -> bytes:
"""
Download binary file from KBase Handle/Blobstore service.
Args:
handle_url: URL to the handle/blobstore service
auth_token: KBase authentication token
Returns:
Binary data
Raises:
requests.HTTPError: If download fails
"""
headers = {"Authorization": auth_token}
response = requests.get(handle_url, headers=headers)
Copilot AI Dec 15, 2025
Missing timeout parameter: The requests.get call does not specify a timeout, which could cause the application to hang indefinitely if the remote server is unresponsive. Consider adding a timeout parameter (e.g., timeout=30) to prevent resource exhaustion.

Suggested change
response = requests.get(handle_url, headers=headers)
response = requests.get(handle_url, headers=headers, timeout=30)

response.raise_for_status()
Copilot AI Dec 15, 2025
Poor error handling: The raise_for_status() call will raise a generic HTTPError without additional context. Consider catching this exception and raising a more informative error that includes details about what failed (e.g., which URL, what status code) to help with debugging and provide better user feedback.

Suggested change
response.raise_for_status()
try:
response.raise_for_status()
except requests.HTTPError as e:
raise requests.HTTPError(
f"Failed to download from {handle_url} (status code: {response.status_code}): {response.text}"
) from e

return response.content