class Settings(BaseSettings):
    """
    Application settings for TableScanner.

    Every field is required and has no default, so instantiating this class
    fails with a validation error unless each value is supplied through the
    process environment or the ``.env`` file.

    Attributes:
        KB_SERVICE_AUTH_TOKEN: KBase service authentication token.
        CACHE_DIR: Directory for downloaded files and SQLite databases.
        WORKSPACE_URL: URL of the KBase Workspace service.
    """

    # pydantic-settings v2 reads its configuration from ``model_config``;
    # the nested ``class Config`` form is the deprecated v1 style.  A plain
    # dict is used so no additional import is required.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        # Environment variable names must match the field names exactly.
        "case_sensitive": True,
    }

    KB_SERVICE_AUTH_TOKEN: str
    CACHE_DIR: str

    # KBase Workspace settings
    WORKSPACE_URL: str
# Shapes of the individual ordering / filtering entries accepted by the API.
_OrderSpec = Dict[str, str]
_FilterSpec = Dict[str, Any]


class SearchRequest(BaseModel):
    """
    Request body for the search endpoint.

    Attributes:
        pangenome_id: Identifier of the object whose data is queried.
        table_name: Name of the table to read.
        limit: Optional maximum number of rows to return.
        order_by: Optional list of ordering entries, each a string-to-string
            mapping.
        filters: Optional list of filter entries, each a mapping with string
            keys and arbitrary values.
    """

    # Required fields.
    pangenome_id: str
    table_name: str

    # Optional query modifiers; ``None`` means "not specified".
    limit: Optional[int] = None
    order_by: Optional[List[_OrderSpec]] = None
    filters: Optional[List[_FilterSpec]] = None
@router.post("/search")
def search(request: Request, search_request: SearchRequest):
    """
    Search endpoint with flexible querying.

    Resolves the requested object through the workspace helper, downloads
    and converts it on first use (both results are cached on disk), then
    queries the requested table in the resulting SQLite file.

    Args:
        request: Incoming request; used to reach the application settings
            stored on ``app.state``.
        search_request: Search parameters including pangenome_id, table_name,
            limit, order_by, filters

    Returns:
        A dictionary with the request identifiers, the cache file locations,
        the row count and the matching rows.

    Raises:
        HTTPException: 404 when the object has no handle/blobstore URL;
            400 when the filter or ordering specification is rejected.
    """
    from app.utils.sqlite import get_table_data

    settings = request.app.state.settings
    token = settings.KB_SERVICE_AUTH_TOKEN
    cache_dir = Path(settings.CACHE_DIR)
    workspace_url = settings.WORKSPACE_URL
    pangenome_id = search_request.pangenome_id

    # TODO: Use the user's token instead of a static one

    # Get object info from KBase Workspace
    object_info = get_object_info(pangenome_id, token, workspace_url)
    filename = object_info.get('filename', f'{pangenome_id}.bin')
    handle_url = object_info.get('handle_url') or object_info.get('blobstore_url')

    if not handle_url:
        raise HTTPException(
            status_code=404,
            detail=f"No handle/blobstore URL found for id: {pangenome_id}"
        )

    # Get cache paths
    cache_file_path, sqlite_file_path = get_cache_paths(cache_dir, pangenome_id, filename)

    # Download and cache if not already cached
    if not is_cached(cache_file_path):
        binary_data = download_from_handle(handle_url, token)
        save_to_cache(cache_file_path, binary_data)

    # Convert to SQLite if not already converted
    if not is_cached(sqlite_file_path):
        convert_to_sqlite(cache_file_path, sqlite_file_path)

    # Query the SQLite file with parameters.  The filter and ordering
    # specifications come straight from the request body, so a rejected
    # operator/direction (ValueError) or a missing spec key (KeyError) is a
    # client error and must not surface as an HTTP 500.
    try:
        results = get_table_data(
            sqlite_file_path,
            table_name=search_request.table_name,
            limit=search_request.limit,
            order_by=search_request.order_by,
            filters=search_request.filters,
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except KeyError as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Missing key in filter/order specification: {exc}"
        ) from exc

    # TODO: use a return model when we figure out what we want to return
    return {
        "pangenome_id": pangenome_id,
        "table_name": search_request.table_name,
        "status": "success",
        "cache_file": str(cache_file_path),
        "sqlite_file": str(sqlite_file_path),
        "row_count": len(results),
        "results": results
    }
def get_cache_paths(cache_dir: Path, id: str, filename: str) -> Tuple[Path, Path]:
    """
    Get cache file paths for a given ID and filename.

    Both ``id`` and ``filename`` may contain path separators (they simply
    produce nested directories), but neither may lead outside ``cache_dir``.

    Args:
        cache_dir: Base cache directory
        id: Object ID
        filename: Original filename

    Returns:
        Tuple of (cache_file_path, sqlite_file_path)

    Raises:
        ValueError: If either resulting path would fall outside ``cache_dir``
            (for example through ``..`` components or an absolute path).
    """
    base = Path(cache_dir)
    cache_file_path = base / id / filename
    sqlite_file_path = base / id / f"{Path(filename).stem}.db"

    # Joining an absolute component replaces everything before it, and ".."
    # components climb upwards, so verify containment on the resolved paths.
    root = base.resolve()
    for candidate in (cache_file_path, sqlite_file_path):
        if root not in candidate.resolve().parents:
            raise ValueError(
                f"Cache path escapes cache directory: id={id!r}, filename={filename!r}"
            )

    return cache_file_path, sqlite_file_path


def ensure_cache_dir(cache_path: Path) -> None:
    """
    Ensure cache directory exists.

    Args:
        cache_path: Path to cache file (directory will be created from parent)
    """
    cache_path.parent.mkdir(parents=True, exist_ok=True)


def save_to_cache(cache_path: Path, data: bytes) -> None:
    """
    Save binary data to cache file.

    The data is written to a temporary sibling file and then renamed into
    place, so an interrupted write never leaves a truncated file at
    ``cache_path`` that ``is_cached`` would report as present.

    Args:
        cache_path: Path where file should be saved
        data: Binary data to save
    """
    ensure_cache_dir(cache_path)
    partial_path = cache_path.with_name(cache_path.name + ".part")
    try:
        partial_path.write_bytes(data)
        # Same-directory rename: replaces any existing file in one step.
        partial_path.replace(cache_path)
    finally:
        # Only still present when the write or the rename failed.
        if partial_path.exists():
            partial_path.unlink()


def is_cached(cache_path: Path) -> bool:
    """
    Check if file exists in cache.

    Args:
        cache_path: Path to cache file

    Returns:
        True if file exists, False otherwise
    """
    return cache_path.exists()
def convert_to_sqlite(binary_file: Path, sqlite_file: Path) -> None:
    """
    Placeholder for the binary-to-SQLite conversion step.

    No conversion is performed yet; calling this function always fails.

    Args:
        binary_file: Path to binary file
        sqlite_file: Path to output SQLite file

    Raises:
        NotImplementedError: Always, until the conversion is implemented.
    """
    # TODO: Implement conversion logic based on binary file format.
    # Rough outline of the intended implementation:
    #   1. read and parse the binary file,
    #   2. create the SQLite database and its tables,
    #   3. bulk-insert the parsed rows and commit.
    message = "SQLite conversion not yet implemented"
    raise NotImplementedError(message)
def get_table_data(
    sqlite_file: Path,
    table_name: str,
    limit: Optional[int] = None,
    order_by: Optional[List[Dict[str, str]]] = None,
    filters: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """
    Query SQLite database with flexible filtering, ordering, and pagination.

    Table and column names cannot be bound as SQL parameters, so they are
    checked against the database schema and emitted as quoted identifiers;
    every value is bound as a parameter.

    Args:
        sqlite_file: Path to SQLite database
        table_name: Name of the table (or view) to query
        limit: Maximum number of rows to return
        order_by: List of order specifications, e.g.,
            [{"column": "gene_id", "direction": "ASC"}]
        filters: List of filter specifications, e.g.,
            [{"column": "function", "op": "LIKE", "value": "%kinase%"}].
            Allowed operators: =, !=, <, >, <=, >=, LIKE, IN (case-insensitive).
            For IN, "value" is a list/tuple/set of candidates; a scalar is
            treated as a single candidate.

    Returns:
        List of rows as dictionaries

    Raises:
        ValueError: If the table or a column does not exist, or an operator
            or sort direction is not allowed.
        KeyError: If a filter or order specification lacks a required key.

    Example:
        rows = get_table_data(
            db_path,
            "Genes",
            limit=20,
            order_by=[{"column": "gene_id", "direction": "ASC"}],
            filters=[{"column": "function", "op": "LIKE", "value": "%kinase%"}],
        )
    """
    allowed_ops = ("=", "!=", "<", ">", "<=", ">=", "LIKE", "IN")

    def quote(identifier: str) -> str:
        # Standard SQL identifier quoting: wrap in double quotes, double any
        # embedded double quote.
        return '"' + identifier.replace('"', '""') + '"'

    conn = sqlite3.connect(sqlite_file)
    try:
        conn.row_factory = sqlite3.Row

        # Resolve the table name against the schema instead of trusting it.
        table_row = conn.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type IN ('table', 'view') AND name = ? COLLATE NOCASE",
            (table_name,),
        ).fetchone()
        if table_row is None:
            raise ValueError(f"Unknown table: {table_name}")
        table_sql = quote(table_row["name"])

        # Map lower-cased column names to their spelling in the schema.
        known_columns = {
            info["name"].lower(): info["name"]
            for info in conn.execute(f"PRAGMA table_info({table_sql})")
        }

        def column_sql(name: Any) -> str:
            canonical = known_columns.get(str(name).lower())
            if canonical is None:
                raise ValueError(f"Unknown column: {name}")
            return quote(canonical)

        query = f"SELECT * FROM {table_sql}"
        params: List[Any] = []

        # Add WHERE clause for filters
        where_clauses = []
        for filter_spec in filters or []:
            column = filter_spec["column"]
            op = filter_spec["op"]
            value = filter_spec["value"]

            normalized_op = str(op).upper()
            if normalized_op not in allowed_ops:
                raise ValueError(f"Invalid operator: {op}")

            if normalized_op == "IN":
                # IN needs one placeholder per candidate value.
                if isinstance(value, (list, tuple, set, frozenset)):
                    candidates = list(value)
                else:
                    candidates = [value]
                placeholders = ", ".join("?" * len(candidates))
                where_clauses.append(f"{column_sql(column)} IN ({placeholders})")
                params.extend(candidates)
            else:
                where_clauses.append(f"{column_sql(column)} {normalized_op} ?")
                params.append(value)

        if where_clauses:
            query += " WHERE " + " AND ".join(where_clauses)

        # Add ORDER BY clause
        order_clauses = []
        for order_spec in order_by or []:
            column = order_spec["column"]
            direction = order_spec.get("direction", "ASC").upper()

            if direction not in ("ASC", "DESC"):
                raise ValueError(f"Invalid direction: {direction}")

            order_clauses.append(f"{column_sql(column)} {direction}")

        if order_clauses:
            query += " ORDER BY " + ", ".join(order_clauses)

        # Add LIMIT clause
        if limit is not None:
            query += " LIMIT ?"
            params.append(int(limit))

        rows = conn.execute(query, params).fetchall()
        return [dict(row) for row in rows]
    finally:
        # Close the connection even when validation or execution fails.
        conn.close()
#!/bin/bash
# Development server startup script
# This script loads environment variables and starts the FastAPI dev server.
# Run it from the repository root: .venv and .env are looked up relative to
# the current directory.

# Stop at the first failing command instead of starting a half-configured server.
set -e

# Activate virtual environment
if [ ! -f .venv/bin/activate ]; then
    echo "Error: virtual environment not found at .venv" >&2
    exit 1
fi
source .venv/bin/activate

# Check if .env file exists, exit if not
if [ ! -f .env ]; then
    echo "Error: .env file not found!" >&2
    echo "Please copy .env.example to .env and fill in your values:" >&2
    echo "  cp .env.example .env" >&2
    exit 1
fi

# Load environment variables from .env file.
# "set -a" exports every variable assigned while sourcing, which keeps quoted
# values and values containing spaces intact (word-splitting the file through
# xargs does not).
set -a
source ./.env
set +a

# Add current directory to PYTHONPATH so app module can be imported.
# The ":+" expansion avoids a leading ":" when PYTHONPATH was previously unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"

# Replace the shell with the server process and forward any extra arguments.
exec fastapi dev "$@"