11 changes: 11 additions & 0 deletions .env.example
@@ -0,0 +1,11 @@
# TableScanner Environment Variables
# Copy this file to .env and fill in your actual values

# KBase Service Authentication Token
KB_SERVICE_AUTH_TOKEN=your_kbase_token_here

# Cache directory for storing downloaded files and SQLite databases
CACHE_DIR=/tmp/tablescanner_cache

# KBase Workspace Service URL
WORKSPACE_URL=https://kbase.us/services/ws
3 changes: 3 additions & 0 deletions .gitignore
@@ -17,3 +17,6 @@ uv.lock
.venv
venv/
*.log

# Environment variables
.env
24 changes: 24 additions & 0 deletions app/config.py
@@ -0,0 +1,24 @@
"""
Configuration settings for TableScanner application.
"""

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
"""Application settings."""

KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str
Comment on lines +9 to +12
Copilot AI Dec 15, 2025
Missing validation for required environment variables: If KB_SERVICE_AUTH_TOKEN, CACHE_DIR, or WORKSPACE_URL are not set in the environment, pydantic-settings will raise a validation error. Consider providing default values or adding clear documentation about required environment variables, especially for CACHE_DIR which could have a sensible default like "./cache".

Suggested change
"""Application settings."""
KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str
"""
Application settings.
Required environment variables:
- KB_SERVICE_AUTH_TOKEN
- WORKSPACE_URL
Optional environment variables:
- CACHE_DIR (default: "./cache")
"""
KB_SERVICE_AUTH_TOKEN: str
CACHE_DIR: str = "./cache"


# KBase Workspace settings
WORKSPACE_URL: str

class Config:
env_file = ".env"
env_file_encoding = "utf-8"
case_sensitive = True


# Global settings instance
settings = Settings()
10 changes: 7 additions & 3 deletions app/main.py
@@ -6,12 +6,13 @@

from fastapi import FastAPI
from app.routes import router
from app.config import settings


def create_app() -> FastAPI:
"""
Application factory function.

Returns:
FastAPI: Configured FastAPI application instance
"""
@@ -20,10 +21,13 @@ def create_app() -> FastAPI:
description="API for table scanning operations",
version="1.0.0"
)


# Store settings in app state for access throughout the application
app.state.settings = settings

# Include routes
app.include_router(router)

return app


15 changes: 15 additions & 0 deletions app/models.py
@@ -0,0 +1,15 @@
"""
Pydantic models for request/response schemas.
"""

from typing import Optional, List, Dict, Any
from pydantic import BaseModel


class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
Comment on lines +6 to +12
Copilot AI Dec 15, 2025
Missing input validation: The pangenome_id and table_name fields accept any string without validation. Consider adding constraints (e.g., min_length, pattern validation) to prevent empty strings or malicious inputs, especially since table_name is directly used in SQL queries.

Suggested change
from pydantic import BaseModel
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
from pydantic import BaseModel, constr
class SearchRequest(BaseModel):
"""Search request with query parameters."""
    pangenome_id: constr(min_length=1, pattern=r"^[A-Za-z0-9_\-]+$")
    table_name: constr(min_length=1, pattern=r"^[A-Za-z_][A-Za-z0-9_]*$")

limit: Optional[int] = None
Comment on lines +6 to +13
Copilot AI Dec 15, 2025
Missing validation for limit field: The limit field should have a minimum value (at least 1) and possibly a maximum value to prevent excessive memory usage or denial of service from extremely large result sets. Consider adding Field validators with appropriate constraints.

Suggested change
from pydantic import BaseModel
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
limit: Optional[int] = None
from pydantic import BaseModel, Field
class SearchRequest(BaseModel):
"""Search request with query parameters."""
pangenome_id: str
table_name: str
limit: Optional[int] = Field(
default=None,
ge=1,
le=1000,
description="Maximum number of results to return (min 1, max 1000)"
)

order_by: Optional[List[Dict[str, str]]] = None
filters: Optional[List[Dict[str, Any]]] = None
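
Beyond the pattern check suggested in the comment on lines +6 to +12 above, the SQLite layer could also verify that the requested table actually exists before its name is interpolated into a query. A minimal sketch; get_table_data's internals are not part of this diff, so the helper below is purely illustrative:

import sqlite3


def assert_table_exists(conn: sqlite3.Connection, table_name: str) -> None:
    """Raise if table_name is not a real table (parameterized lookup, no string interpolation)."""
    row = conn.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?",
        (table_name,),
    ).fetchone()
    if row is None:
        raise ValueError(f"Unknown table: {table_name!r}")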
77 changes: 66 additions & 11 deletions app/routes.py
@@ -4,34 +4,89 @@
Contains all API endpoint definitions.
"""

-from fastapi import APIRouter, Query
+from pathlib import Path
+from fastapi import APIRouter, Request, HTTPException

from app.models import SearchRequest
from app.utils.workspace import get_object_info
from app.utils.download import download_from_handle
from app.utils.cache import get_cache_paths, save_to_cache, is_cached
from app.utils.sqlite import convert_to_sqlite

router = APIRouter()


@router.get("/")
-async def root():
+async def root(request: Request):
"""Root endpoint returning service information."""
settings = request.app.state.settings
    return {
        "service": "TableScanner",
        "version": "1.0.0",
-        "status": "running"
+        "status": "running",
+        "cache_dir": settings.CACHE_DIR
Copilot AI Dec 15, 2025
The cache_dir field is exposed in the root endpoint response but not documented in the function's docstring. Consider updating the docstring to describe what information is returned in the response, including the cache_dir field.

}
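
One way to address the docstring note above, sketched with the fields the handler currently returns:

@router.get("/")
async def root(request: Request):
    """Return basic service information: service name, version, status, and the configured cache_dir."""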


-@router.get("/search")
-def search(id: str = Query(..., description="ID to search for")):
+@router.post("/search")
+def search(request: Request, search_request: SearchRequest):
Copilot AI Dec 15, 2025
Inconsistent async pattern: The search endpoint is a synchronous function while the root endpoint is async. For consistency and to properly handle blocking I/O operations, consider making this endpoint async and using appropriate async libraries for I/O operations.

Suggested change
def search(request: Request, search_request: SearchRequest):
async def search(request: Request, search_request: SearchRequest):

"""
-    Search endpoint that takes an ID parameter.
+    Search endpoint with flexible querying.
    Args:
-        id: The ID to search for (required)
+        search_request: Search parameters including pangenome_id, table_name, limit, order_by, filters
Returns:
A dictionary with search results
"""
settings = request.app.state.settings
token = settings.KB_SERVICE_AUTH_TOKEN
cache_dir = Path(settings.CACHE_DIR)
workspace_url = settings.WORKSPACE_URL

    # TODO: Use the user's token instead of a static one

# Get object info from KBase Workspace
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
Copilot AI Dec 15, 2025
The get_object_info function always raises NotImplementedError (as seen in workspace.py line 39), which will cause this endpoint to fail on every request. The function call will crash before reaching the subsequent logic. This should be implemented or the endpoint should handle the NotImplementedError appropriately.

Suggested change
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
try:
object_info = get_object_info(search_request.pangenome_id, token, workspace_url)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="get_object_info is not implemented."
)

filename = object_info.get('filename', f'{search_request.pangenome_id}.bin')
handle_url = object_info.get('handle_url') or object_info.get('blobstore_url')

if not handle_url:
raise HTTPException(
status_code=404,
detail=f"No handle/blobstore URL found for id: {search_request.pangenome_id}"
)

# Get cache paths
cache_file_path, sqlite_file_path = get_cache_paths(cache_dir, search_request.pangenome_id, filename)

# Download and cache if not already cached
if not is_cached(cache_file_path):
# Download from handle/blobstore service
binary_data = download_from_handle(handle_url, token)
save_to_cache(cache_file_path, binary_data)
Comment on lines +66 to +67
Copilot AI Dec 15, 2025
Blocking I/O in async context: The download_from_handle function performs blocking network I/O using the synchronous requests library. Since this is called from a FastAPI endpoint, it blocks the event loop. Consider using an async HTTP client (like httpx) or running the download in a thread pool executor to maintain async benefits.
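
A minimal async alternative for the download step, assuming httpx were added as a dependency (illustrative only; the PR's download_from_handle keeps the synchronous requests call):

import httpx


async def download_from_handle_async(handle_url: str, auth_token: str) -> bytes:
    """Fetch binary data without blocking the event loop (sketch)."""
    headers = {"Authorization": auth_token}
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(handle_url, headers=headers)
        response.raise_for_status()
        return response.content

The endpoint would also need to be async (see the comment on the search definition above) for this to pay off.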


# Convert to SQLite if not already converted
if not is_cached(sqlite_file_path):
convert_to_sqlite(cache_file_path, sqlite_file_path)
Copilot AI Dec 15, 2025
The convert_to_sqlite function raises NotImplementedError, but this is called without a try-except block. This will cause the endpoint to fail with an unhandled exception. Either implement the function or add proper error handling to return a meaningful HTTP error response.

Suggested change
convert_to_sqlite(cache_file_path, sqlite_file_path)
try:
convert_to_sqlite(cache_file_path, sqlite_file_path)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="The convert_to_sqlite function is not implemented."
)


Comment on lines +71 to +72
Copilot AI Dec 15, 2025
The convert_to_sqlite function always raises NotImplementedError (as seen in sqlite.py line 48), which will cause the endpoint to fail when attempting to convert uncached files. This should be implemented or the endpoint should handle the NotImplementedError appropriately.

Suggested change
convert_to_sqlite(cache_file_path, sqlite_file_path)
try:
convert_to_sqlite(cache_file_path, sqlite_file_path)
except NotImplementedError:
raise HTTPException(
status_code=501,
detail="Conversion to SQLite is not implemented yet."
)

# Query the SQLite file with parameters
from app.utils.sqlite import get_table_data
Copilot AI Dec 15, 2025
The import for get_table_data should be moved to the top of the file with the other imports from app.utils.sqlite (line 14) for better code organization and consistency. Inline imports should be avoided unless there's a specific reason like circular imports.
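
Concretely, the existing import on line 14 could simply be extended:

from app.utils.sqlite import convert_to_sqlite, get_table_data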

results = get_table_data(
sqlite_file_path,
table_name=search_request.table_name,
limit=search_request.limit,
order_by=search_request.order_by,
filters=search_request.filters,
)

#TODO use a return model when we figure out what we want to return
Copilot AI Dec 15, 2025
The TODO comment is missing a space after the hash and colon. The standard format is '# TODO:' with proper spacing for consistency with other TODO comments in the codebase.

Suggested change
#TODO use a return model when we figure out what we want to return
# TODO: use a return model when we figure out what we want to return

Copilot AI Dec 15, 2025
Inconsistent comment formatting: The TODO comment should have a space after the colon for consistency with the TODO on line 47.

Suggested change
#TODO use a return model when we figure out what we want to return
# TODO: use a return model when we figure out what we want to return

    return {
-        "query_id": id,
+        "pangenome_id": search_request.pangenome_id,
+        "table_name": search_request.table_name,
        "status": "success",
-        "message": f"Search completed for ID: {id}"
+        "cache_file": str(cache_file_path),
+        "sqlite_file": str(sqlite_file_path),
Comment on lines +88 to +89
Copilot AI Dec 15, 2025
Potential information disclosure: The response exposes internal file system paths (cache_file and sqlite_file). This could reveal sensitive information about the server's directory structure to clients. Consider removing these fields or making them optional for debugging purposes only.

Suggested change
"cache_file": str(cache_file_path),
"sqlite_file": str(sqlite_file_path),

"row_count": len(results),
"results": results
}
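
If the paths are still useful for troubleshooting, an alternative to dropping them outright is to gate them behind a setting. A sketch; the DEBUG flag is hypothetical and not defined in app/config.py:

    response = {
        "pangenome_id": search_request.pangenome_id,
        "table_name": search_request.table_name,
        "status": "success",
        "row_count": len(results),
        "results": results,
    }
    if getattr(settings, "DEBUG", False):  # hypothetical flag, not part of this PR
        response["cache_file"] = str(cache_file_path)
        response["sqlite_file"] = str(sqlite_file_path)
    return response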
22 changes: 22 additions & 0 deletions app/utils/__init__.py
@@ -0,0 +1,22 @@
"""
Utils module for TableScanner.

Contains business logic separated from route handlers.
"""

from app.utils.download import download_from_handle
from app.utils.workspace import get_object_info
from app.utils.cache import get_cache_paths, ensure_cache_dir, save_to_cache, is_cached
from app.utils.sqlite import convert_to_sqlite, query_sqlite, get_table_data

__all__ = [
"download_from_handle",
"get_object_info",
"get_cache_paths",
"ensure_cache_dir",
"save_to_cache",
"is_cached",
"convert_to_sqlite",
"query_sqlite",
"get_table_data",
]
58 changes: 58 additions & 0 deletions app/utils/cache.py
@@ -0,0 +1,58 @@
"""
Cache utilities for managing local file caching.
"""

from pathlib import Path
from typing import Tuple


def get_cache_paths(cache_dir: Path, id: str, filename: str) -> Tuple[Path, Path]:
"""
Get cache file paths for a given ID and filename.
Args:
cache_dir: Base cache directory
id: Object ID
filename: Original filename
Returns:
Tuple of (cache_file_path, sqlite_file_path)
"""
cache_file_path = cache_dir / id / filename
sqlite_file_path = cache_dir / id / f"{Path(filename).stem}.db"
Comment on lines +21 to +22
Copilot AI Dec 15, 2025
Path traversal vulnerability: The id and filename parameters are used directly in path construction without validation. An attacker could provide values like "../../../etc/passwd" to access or create files outside the intended cache directory. Consider sanitizing these inputs by removing path separators or validating against a safe pattern.
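
A small validation helper along those lines; a sketch, where the allowed character set is an assumption rather than something defined elsewhere in this PR:

import re

_SAFE_COMPONENT = re.compile(r"^[A-Za-z0-9._\-]+$")


def safe_component(value: str) -> str:
    """Reject separators and parent references so id and filename stay inside cache_dir."""
    if not _SAFE_COMPONENT.fullmatch(value) or value in (".", ".."):
        raise ValueError(f"Unsafe path component: {value!r}")
    return value

get_cache_paths could then build paths as cache_dir / safe_component(id) / safe_component(filename).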

return cache_file_path, sqlite_file_path


def ensure_cache_dir(cache_path: Path) -> None:
"""
Ensure cache directory exists.
Args:
cache_path: Path to cache file (directory will be created from parent)
"""
cache_path.parent.mkdir(parents=True, exist_ok=True)


def save_to_cache(cache_path: Path, data: bytes) -> None:
"""
Save binary data to cache file.
Args:
cache_path: Path where file should be saved
data: Binary data to save
"""
ensure_cache_dir(cache_path)
cache_path.write_bytes(data)


def is_cached(cache_path: Path) -> bool:
"""
Check if file exists in cache.
Args:
cache_path: Path to cache file
Returns:
True if file exists, False otherwise
"""
return cache_path.exists()
25 changes: 25 additions & 0 deletions app/utils/download.py
@@ -0,0 +1,25 @@
"""
Handle/Blobstore utilities for downloading files.
"""

import requests


def download_from_handle(handle_url: str, auth_token: str) -> bytes:
"""
Download binary file from KBase Handle/Blobstore service.
Args:
handle_url: URL to the handle/blobstore service
auth_token: KBase authentication token
Returns:
Binary data
Raises:
requests.HTTPError: If download fails
"""
headers = {"Authorization": auth_token}
response = requests.get(handle_url, headers=headers)
Copilot AI Dec 15, 2025
Missing timeout parameter: The requests.get call does not specify a timeout, which could cause the application to hang indefinitely if the remote server is unresponsive. Consider adding a timeout parameter (e.g., timeout=30) to prevent resource exhaustion.

Suggested change
response = requests.get(handle_url, headers=headers)
response = requests.get(handle_url, headers=headers, timeout=30)

response.raise_for_status()
Copilot AI Dec 15, 2025
Poor error handling: The raise_for_status() call will raise a generic HTTPError without additional context. Consider catching this exception and raising a more informative error that includes details about what failed (e.g., which URL, what status code) to help with debugging and provide better user feedback.

Suggested change
response.raise_for_status()
try:
response.raise_for_status()
except requests.HTTPError as e:
raise requests.HTTPError(
f"Failed to download from {handle_url} (status code: {response.status_code}): {response.text}"
) from e

return response.content