Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 30 additions & 38 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,48 +1,40 @@
# TableScanner Environment Variables
# Copy this file to .env and fill in your actual values

# =============================================================================
# AUTHENTICATION
# =============================================================================
# KBase Service Authentication Token
# For development testing, use your personal token from KBase
KB_SERVICE_AUTH_TOKEN=YOUR_KBASE_TOKEN_HERE

# =============================================================================
# CACHE SETTINGS
# =============================================================================
# Cache directory for storing downloaded SQLite databases
# TableScanner Environment Configuration
# Copy this file to .env and fill in your values

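# OPTIONAL: KBase authentication token used as a fallback for local testing.
# In normal use each request supplies its own token (Authorization header or kbase_session cookie).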
# Get your token from: https://narrative.kbase.us/#auth/account
KB_SERVICE_AUTH_TOKEN=your_token_here

# Cache directory for downloaded SQLite databases
# Default: /tmp/tablescanner_cache
CACHE_DIR=/tmp/tablescanner_cache

# Maximum age of cached files in hours (default: 24)
# Maximum age of cached files in hours before re-download
# Default: 24
CACHE_MAX_AGE_HOURS=24

# =============================================================================
# KBASE SERVICE URLS
# =============================================================================
# KBase Workspace Service URL
WORKSPACE_URL=https://appdev.kbase.us/services/ws

# Base URL for KBase services
KBASE_ENDPOINT=https://appdev.kbase.us/services
# Enable debug mode with verbose logging
# Default: false
DEBUG=false

# KBase Blobstore/Shock service URL
BLOBSTORE_URL=https://appdev.kbase.us/services/shock-api
# KBase environment (appdev, ci, prod)
# Default: appdev
KB_ENV=appdev

# =============================================================================
# APPLICATION SETTINGS
# =============================================================================
# Enable debug mode (true/false)
DEBUG=false
# CORS allowed origins (JSON array format)
# Use ["*"] for all origins (development only)
# For production, specify exact origins: ["https://kbase.us", "https://narrative.kbase.us"]
CORS_ORIGINS=["*"]

# =============================================================================
# TEST DATA (AppDev)
# =============================================================================
# Test BERDLTable object: 76990/ADP1Test
# Test pangenome: GCF_000368685.1
# Narrative: https://appdev.kbase.us/narrative/76990
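# KBase service URLs (these point at production; keep them consistent with KB_ENV,
# e.g. use the https://appdev.kbase.us/services/... hosts when KB_ENV=appdev)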
WORKSPACE_URL=https://kbase.us/services/ws
KBASE_ENDPOINT=https://kbase.us/services
BLOBSTORE_URL=https://kbase.us/services/shock-api

# Timeout settings (seconds)
DOWNLOAD_TIMEOUT_SECONDS=30.0
KBASE_API_TIMEOUT_SECONDS=10.0

# Root path for proxy deployment (e.g., "/services/berdl_table_scanner")
# Leave empty if running at root path (i.e., "/") for local dev
ROOT_PATH=/services/berdl_table_scanner
# Leave empty for standalone deployment
KB_SERVICE_ROOT_PATH=
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ trash/
docs/DEMO_SCRIPT.md
docs/QUICKSTART.md
docs/internal/
DATABASE_SCHEMA.md
docs/personal/
archive/
docs/archive
dummy.db

.DS_Store
.idea
Expand Down Expand Up @@ -31,3 +36,8 @@ lib/

# Cache directory
cache/

# Project-specific artifacts
DATABASE_SCHEMA.md
*.webp
*.png
17 changes: 15 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,21 @@
FROM ghcr.io/astral-sh/uv:python3.13-alpine
RUN apk --no-cache add curl
RUN apk --no-cache add curl git
WORKDIR /app

# Clone KBUtilLib (required external dependency)
# This creates /app/lib/KBUtilLib/ which is referenced by app/utils/workspace.py
RUN mkdir -p lib && \
cd lib && \
git clone https://github.com/cshenry/KBUtilLib.git && \
Comment on lines +7 to +9
Copy link

Copilot AI Jan 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Docker build clones KBUtilLib directly from GitHub using git clone https://github.com/cshenry/KBUtilLib.git without pinning to a specific commit, tag, or verifying integrity. This means every build will execute whatever code is currently on the default branch of an external repository, so a compromise of that repo or the GitHub supply chain could inject arbitrary code into your image and run it with access to any secrets or data inside the container. To mitigate this supply-chain risk, fetch KBUtilLib via a pinned, immutable reference (e.g., specific commit or release) and/or vendor it into the repo so builds cannot be silently altered upstream.

Copilot uses AI. Check for mistakes.
cd ..

# Add KBUtilLib to PYTHONPATH so it can be imported.
# PYTHONPATH is not defined earlier in this Dockerfile, so set it outright:
# appending `${PYTHONPATH}` would expand to nothing, leave a stray trailing ':'
# in the value, and trigger the UndefinedVar build check.
ENV PYTHONPATH=/app/lib/KBUtilLib/src

Check warning on line 13 in Dockerfile

View workflow job for this annotation

GitHub Actions / build-develop-open / build-image

Variables should be defined before their use

UndefinedVar: Usage of undefined variable '$PYTHONPATH' More info: https://docs.docker.com/go/dockerfile/rule/undefined-var/

# Copy application code and dependencies
COPY app ./app
COPY pyproject.toml /app/pyproject.toml
RUN uv sync

EXPOSE 8000
CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
149 changes: 110 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,64 +1,135 @@
# TableScanner

TableScanner is a microservice for providing filtered and paginated access to tabular data stored in KBase. It uses local SQLite caching and indexing to provide fast access to large datasets without loading them entirely into memory.
TableScanner is a production-grade microservice for querying tabular data from KBase SQLite databases. It provides a comprehensive DataTables Viewer-compatible API with advanced query capabilities, type-aware filtering, and performance optimizations.

## Functionality
## Features

The service provides two methods for data access:
1. **Hierarchical REST**: Path-based endpoints for navigating objects and tables using GET requests.
2. **Flat POST**: A single endpoint (`/table-data`) that accepts a JSON payload for all query parameters.
- **Data Access**: Query SQLite databases from KBase objects and handles.
- **Local Uploads**: Upload local SQLite files (`.db`, `.sqlite`) for temporary access and testing.
- **User-Driven Auth**: Secure access where each user provides their own KBase token.
- **Type-Aware Filtering**: Automatic numeric conversion for proper filtering results.
- **Advanced Operators**: Support for `eq`, `ne`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, `not_in`, `between`, `is_null`, `is_not_null`.
- **Aggregations**: `GROUP BY` support with `count`, `sum`, `avg`, `min`, `max`, `stddev`, `variance`, `distinct_count`.
- **Table Statistics**: Rich column statistics including null counts, distinct counts, min/max/mean, and sample values.
- **Full-Text Search**: FTS5 support with automatic virtual table creation.
- **Automatic Operations**: Lifecycle management for connection pooling, query caching, and automatic disk cleanup.

## Architecture
## Quick Start

TableScanner operates as a bridge between KBase storage and client applications:
1. **Data Fetching**: Retrieves SQLite databases from the KBase Blobstore.
2. **Local Caching**: Stores databases locally to avoid repeated downloads.
3. **Indexing**: Creates indices on-the-fly for all table columns to optimize query performance.
4. **API Layer**: A FastAPI application that handles requests and executes SQL queries against the local cache.
### Production (Docker)

Technical details on race conditions and concurrency handling are available in [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).

## Setup

### Production
```bash
docker compose up --build -d
```
The service will be available at `http://localhost:8000`. API documentation is at `/docs`.
The service will be available at `http://localhost:8000`. API documentation is available at `/docs`.

### Development

```bash
cp .env.example .env
bash scripts/dev.sh
# Edit .env and set local development parameters
./scripts/dev.sh
```

## Authentication

**Each user must provide their own KBase authentication token.** The service prioritizes user-provided tokens over shared service tokens.

- **Header (Recommended)**: `Authorization: Bearer <token>`
- **Cookie**: `kbase_session=<token>` (Used by DataTables Viewer)
- **Legacy Fallback**: `KB_SERVICE_AUTH_TOKEN` in `.env` is for **local testing only**.

## API Usage Examples

### 1. Upload a Local Database
Upload a SQLite file to receive a temporary handle.

```bash
curl -X POST "http://localhost:8000/upload" \
-F "file=@/path/to/my_data.db"
# Returns: {"handle": "local:a1b2-c3d4", ...}
```

### 2. List Tables
Works with KBase UPA or the local handle returned above.

```bash
curl -H "Authorization: Bearer $KB_TOKEN" \
"http://localhost:8000/object/76990/7/2/tables"
```

## API Usage
### 3. Get Table Statistics
Retrieve detailed column metrics and sample values.

### Path-based REST
List tables:
`GET /object/{upa}/tables`
```bash
curl -H "Authorization: Bearer $KB_TOKEN" \
"http://localhost:8000/object/76990/7/2/tables/Genes/stats"
```

### 4. Advanced Query (POST)
Comprehensive filtering and pagination.

```bash
curl -X POST -H "Authorization: Bearer $KB_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"berdl_table_id": "76990/7/2",
"table_name": "Genes",
"limit": 100,
"filters": [
{"column": "gene_length", "operator": "gt", "value": 1000}
]
}' \
"http://localhost:8000/table-data"
```

## Performance & Optimization

- **Gzip Compression**: Compresses large responses (>1KB) to reduce bandwidth usage.
- **High-Performance JSON**: Uses `orjson` for fast JSON serialization.
- **Parallel Metadata Fetching**: Retrieves table metadata concurrently for fast listing.
- **Metadata Caching**: Caches object types locally to minimize KBase API calls.
- **Connection Pooling**: Reuses database connections for up to 10 minutes of inactivity.
- **Automatic Cleanup**: Expired caches are purged on startup. Uploaded databases automatically expire after **1 hour**.
- **Query Caching**: 5-minute TTL, max 1000 entries per instance.
- **Atomic Renaming**: Ensures file integrity during downloads and uploads.

Query table data:
`GET /object/{upa}/tables/{table_name}/data?limit=100`
## Documentation

### Flat POST
Query table data:
`POST /table-data`
- **[API Reference](docs/API.md)** - Complete API documentation with examples
- **[Architecture Dictionary](docs/ARCHITECTURE.md)** - System design and technical overview
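- **[Deployment Readiness](docs/internal/DEPLOYMENT_READINESS.md)** - Checklist for production deployment (internal document; `docs/internal/` is git-ignored, so this link may not resolve in a fresh clone)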
- **[Contributing Guide](docs/CONTRIBUTING.md)** - Setup, testing, and contribution guidelines

Payload example:
```json
{
"berdl_table_id": "76990/7/2",
"table_name": "Genes",
"limit": 100
}
## Testing

```bash
# Set PYTHONPATH and run all tests
PYTHONPATH=. pytest

# Run integration tests for local upload
PYTHONPATH=. pytest tests/integration/test_local_upload.py
```

## Project Structure
- `app/`: Application logic and routes.
- `app/utils/`: Utilities for caching, SQLite operations, and Workspace integration.
- `docs/`: Technical documentation.
- `scripts/`: Client examples and utility scripts.

```
TableScanner/
├── app/
│ ├── main.py # FastAPI application & Lifecycle handlers
│ ├── routes.py # API endpoints & Auth logic
│ ├── models.py # Pydantic (V2) models
│ ├── config.py # Configuration (BaseSettings)
│ ├── services/
│ │ ├── data/ # Query & Connection pooling logic
│ │ └── db_helper.py # Secure handle resolution
│ └── utils/ # SQLite, KBase Client, and Cache utilities
├── docs/ # API and Architectural documentation
├── tests/ # Unit & Integration tests
├── scripts/ # Development helper scripts
└── static/ # Static assets for the viewer
```

## License
MIT License.

MIT License
37 changes: 28 additions & 9 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
All KBase service URLs and authentication settings are managed here.
"""

from pydantic_settings import BaseSettings
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field


Expand All @@ -15,13 +15,19 @@ class Settings(BaseSettings):

Create a .env file based on .env.example to configure locally.
"""

model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=True
)

# ==========================================================================
# AUTHENTICATION
# ==========================================================================
KB_SERVICE_AUTH_TOKEN: str = Field(
...,
description="KBase authentication token for API access"
KB_SERVICE_AUTH_TOKEN: str | None = Field(
default=None,
description="KBase authentication token for service-to-service API access (optional if using header/cookie auth)"
)

# ==========================================================================
Expand Down Expand Up @@ -59,14 +65,27 @@ class Settings(BaseSettings):
default=False,
description="Enable debug mode with verbose logging"
)
KB_ENV: str = Field(
default="appdev",
description="KBase environment (appdev, ci, prod)"
)
CORS_ORIGINS: list[str] = Field(
default=["*"],
description="List of allowed origins for CORS. Use ['*'] for all."
)

# Root path for proxy deployment (e.g., "/services/berdl_table_scanner")
ROOT_PATH: str = ""

class Config:
env_file = ".env"
env_file_encoding = "utf-8"
case_sensitive = True

# Timeout settings
DOWNLOAD_TIMEOUT_SECONDS: float = Field(
default=30.0,
description="Timeout in seconds for downloading databases"
)
KBASE_API_TIMEOUT_SECONDS: float = Field(
default=10.0,
description="Timeout in seconds for KBase API calls"
)


# Global settings instance - loaded at module import
Expand Down
20 changes: 20 additions & 0 deletions app/config_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""
Configuration constants for TableScanner.
"""

# Pagination and sorting defaults
DEFAULT_LIMIT = 100
MAX_LIMIT = 500_000
DEFAULT_OFFSET = 0
DEFAULT_SORT_ORDER = "ASC"

# Cache lifetimes (in seconds) and capacity
CACHE_TTL_SECONDS = 5 * 60
CACHE_MAX_ENTRIES = 1_000
INDEX_CACHE_TTL = 60 * 60

# Timeout (in seconds) for KBase API calls
# NOTE(review): Settings.KBASE_API_TIMEOUT_SECONDS in app/config.py defaults
# to 10.0 -- confirm which of the two values callers are meant to use.
KBASE_API_TIMEOUT_SECONDS = 30

# Version string reported for the API
API_VERSION = "2.0"
5 changes: 5 additions & 0 deletions app/db/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Database module for Config Control Plane.

Provides SQLite-based persistent storage for configuration records.
"""
Loading
Loading