Commit 9bf9b1b

Merge pull request #154 from MITLibraries/TIMX-515-static-duckdb-file-prep
TIMX 515 - static DuckDB file prep
2 parents: ed1cf59 + d1af981

File tree: 5 files changed (+94, -23 lines)


Makefile

Lines changed: 16 additions & 1 deletion
```diff
@@ -1,3 +1,5 @@
+-include .env
+
 SHELL=/bin/bash
 DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)

@@ -53,4 +55,17 @@ black-apply: # Apply changes with 'black'
 	pipenv run black .

 ruff-apply: # Resolve 'fixable errors' with 'ruff'
-	pipenv run ruff check --fix .
+	pipenv run ruff check --fix .
+
+
+######################
+# Minio S3 Instance
+######################
+minio-start:
+	docker run \
+		-p 9000:9000 \
+		-p 9001:9001 \
+		-v $(MINIO_DATA):/data \
+		-e "MINIO_ROOT_USER=$(MINIO_USERNAME)" \
+		-e "MINIO_ROOT_PASSWORD=$(MINIO_PASSWORD)" \
+		quay.io/minio/minio server /data --console-address ":9001"
```
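As a quick smoke test that the container is reachable, something like the following boto3 snippet should work (a sketch: the credentials mirror the example values in the README section below, and the `timdex` bucket name is purely hypothetical):

```python
import boto3

# Connect to the local MinIO instance rather than AWS S3; endpoint and
# credentials here mirror the example env vars and should match your .env.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="admin",
    aws_secret_access_key="password",
    region_name="us-east-1",
)

s3.create_bucket(Bucket="timdex")  # hypothetical bucket name
print(s3.list_buckets()["Buckets"])
```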

README.md

Lines changed: 23 additions & 0 deletions
````diff
@@ -45,8 +45,31 @@ None at this time.
 ```shell
 TDA_LOG_LEVEL=# log level for timdex-dataset-api, accepts [DEBUG, INFO, WARNING, ERROR], default INFO
 WARNING_ONLY_LOGGERS=# Comma-separated list of logger names to set as WARNING only, e.g. 'botocore,charset_normalizer,smart_open'
+
+MINIO_S3_ENDPOINT_URL=# If set, instructs the library to use this MinIO S3 instance; must include the http(s):// protocol
+MINIO_USERNAME=# Username / AWS key for MinIO; required when MINIO_S3_ENDPOINT_URL is set
+MINIO_PASSWORD=# Password / AWS secret for MinIO; required when MINIO_S3_ENDPOINT_URL is set
+MINIO_DATA=# Path to persist MinIO data if started via the Makefile command
+```
+
+## Local S3 via MinIO
+
+Set env vars:
+```shell
+MINIO_S3_ENDPOINT_URL=http://localhost:9000
+MINIO_USERNAME="admin"
+MINIO_PASSWORD="password"
+MINIO_DATA=<path to persist MinIO data, e.g. /tmp/minio>
 ```

+Use the `Makefile` command that starts a MinIO instance:
+
+```shell
+make minio-start
+```
+
+With the env var `MINIO_S3_ENDPOINT_URL` set, this library will configure `pyarrow` and DuckDB connections to point at this local MinIO S3 instance.
+
 ## Usage

 Currently, the most common use cases are:
````
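That last behavior is implemented in the `dataset.py` and `metadata.py` diffs below. As a minimal standalone sketch of the `pyarrow` half (assuming the env vars above are set; the `timdex/dataset` prefix is hypothetical):

```python
import os

from pyarrow import fs

# Build a pyarrow S3 filesystem against the MinIO endpoint, mirroring the
# library's MinIO branch in get_s3_filesystem().
s3_fs = fs.S3FileSystem(
    access_key=os.environ["MINIO_USERNAME"],
    secret_key=os.environ["MINIO_PASSWORD"],
    endpoint_override=os.environ["MINIO_S3_ENDPOINT_URL"],
)

# List a (hypothetical) dataset prefix to confirm connectivity.
for info in s3_fs.get_file_info(fs.FileSelector("timdex/dataset", recursive=True)):
    print(info.path, info.size)
```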

tests/conftest.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -21,6 +21,7 @@ def _test_env(monkeypatch):
     monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "fake_secret_key")
     monkeypatch.setenv("AWS_SESSION_TOKEN", "fake_session_token")
     monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1")
+    monkeypatch.delenv("MINIO_S3_ENDPOINT_URL", raising=False)


 @pytest.fixture
```
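Deleting the env var (with `raising=False`, so its absence is not an error) pins the default test environment to the AWS code path. A test that wants the MinIO branch can opt back in per test; a sketch, assuming the class in `dataset.py` below is importable as `TIMDEXDataset`:

```python
from timdex_dataset_api.dataset import TIMDEXDataset  # import path assumed


def test_minio_filesystem_branch(monkeypatch):
    # Opt this test into the MinIO code path (values are illustrative);
    # the fake AWS credentials from _test_env still satisfy boto3's lookup.
    monkeypatch.setenv("MINIO_S3_ENDPOINT_URL", "http://localhost:9000")
    monkeypatch.setenv("MINIO_USERNAME", "admin")
    monkeypatch.setenv("MINIO_PASSWORD", "password")

    # With the env var set, this should return a MinIO-backed filesystem.
    assert TIMDEXDataset.get_s3_filesystem() is not None
```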

timdex_dataset_api/dataset.py

Lines changed: 17 additions & 7 deletions
```diff
@@ -128,7 +128,7 @@ def __init__(

         # reading
         self._current_records: bool = False
-        self.timdex_dataset_metadata: TIMDEXDatasetMetadata = None  # type: ignore[assignment]
+        self.metadata: TIMDEXDatasetMetadata = None  # type: ignore[assignment]

     @property
     def row_count(self) -> int:
@@ -173,8 +173,8 @@ def load(
         # read dataset metadata if only current records are requested
         self._current_records = current_records
         if current_records:
-            self.timdex_dataset_metadata = TIMDEXDatasetMetadata(timdex_dataset=self)
-            self.paths = self.timdex_dataset_metadata.get_current_parquet_files(**filters)
+            self.metadata = TIMDEXDatasetMetadata(timdex_dataset=self)
+            self.paths = self.metadata.get_current_parquet_files(**filters)

         # perform initial load of full dataset
         self.dataset = self._load_pyarrow_dataset()
@@ -285,11 +285,23 @@ def _parse_date_filters(self, run_date: str | date | None) -> DatasetFilters:

     @staticmethod
     def get_s3_filesystem() -> fs.FileSystem:
-        """Instantiate a pyarrow S3 Filesystem for dataset loading."""
+        """Instantiate a pyarrow S3 Filesystem for dataset loading.
+
+        If the env var 'MINIO_S3_ENDPOINT_URL' is present, assume a local MinIO S3
+        instance and configure accordingly, otherwise assume normal AWS S3.
+        """
         session = boto3.session.Session()
         credentials = session.get_credentials()
         if not credentials:
             raise RuntimeError("Could not locate AWS credentials")
+
+        if os.getenv("MINIO_S3_ENDPOINT_URL"):
+            return fs.S3FileSystem(
+                access_key=os.environ["MINIO_USERNAME"],
+                secret_key=os.environ["MINIO_PASSWORD"],
+                endpoint_override=os.environ["MINIO_S3_ENDPOINT_URL"],
+            )
+
         return fs.S3FileSystem(
             secret_key=credentials.secret_key,
             access_key=credentials.access_key,
@@ -509,9 +521,7 @@ def _yield_current_record_batches(
         - filters: pairs of column:value to filter the dataset metadata required
         """
         # get map of timdex_record_id to run_id for current version of that record
-        record_to_run_map = self.timdex_dataset_metadata.get_current_record_to_run_map(
-            **filters
-        )
+        record_to_run_map = self.metadata.get_current_record_to_run_map(**filters)

         # loop through batches, yielding only current records
         for batch in batches:
```
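Callers are unchanged between environments: the same static method returns an AWS- or MinIO-backed filesystem depending on the env var. A usage sketch, again assuming the `TIMDEXDataset` class name and a hypothetical dataset prefix:

```python
import pyarrow.dataset as ds

from timdex_dataset_api.dataset import TIMDEXDataset  # import path assumed

# The env var, not the caller, decides whether this is AWS S3 or local MinIO.
s3_fs = TIMDEXDataset.get_s3_filesystem()

# Open a (hypothetical) parquet dataset through whichever backend was chosen.
dataset = ds.dataset("timdex/dataset", format="parquet", filesystem=s3_fs)
print(dataset.count_rows())
```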

timdex_dataset_api/metadata.py

Lines changed: 37 additions & 15 deletions
```diff
@@ -1,7 +1,9 @@
 """timdex_dataset_api/metadata.py"""

+import os
 import time
 from typing import TYPE_CHECKING, Unpack
+from urllib.parse import urlparse

 import duckdb

@@ -87,8 +89,8 @@ def _setup_database(self) -> None:
         # bump threads for high parallelization of lightweight data calls for metadata
         self.set_database_thread_usage(64)

-        # setup AWS credentials chain
-        self._create_aws_credential_chain()
+        # configure s3 connection
+        self._configure_s3_connection()

         # create a table of metadata about all rows in dataset
         self._create_full_dataset_table()
@@ -101,21 +103,41 @@ def _setup_database(self) -> None:
             f"path: '{self.db_path}'"
         )

-    def _create_aws_credential_chain(self) -> None:
-        """Setup AWS credentials chain in database connection.
+    def _configure_s3_connection(self) -> None:
+        """Configure S3 connection for DuckDB access.

-        https://duckdb.org/docs/stable/core_extensions/aws.html
+        If the env var 'MINIO_S3_ENDPOINT_URL' is present, assume a local MinIO S3
+        instance and configure accordingly, otherwise assume normal AWS S3 and setup a
+        credentials chain in DuckDB.
         """
-        logger.info("setting up AWS credentials chain")
-        query = """
-        create or replace secret secret (
-            type s3,
-            provider credential_chain,
-            chain 'sso;env;config',
-            refresh true
-        );
-        """
-        self.conn.execute(query)
+        logger.info("configuring S3 connection")
+
+        if os.getenv("MINIO_S3_ENDPOINT_URL"):
+            self.conn.execute(
+                f"""
+                create or replace secret minio_s3_secret (
+                    type s3,
+                    endpoint '{urlparse(os.environ["MINIO_S3_ENDPOINT_URL"]).netloc}',
+                    key_id '{os.environ["MINIO_USERNAME"]}',
+                    secret '{os.environ["MINIO_PASSWORD"]}',
+                    region 'us-east-1',
+                    url_style 'path',
+                    use_ssl false
+                );
+                """
+            )
+
+        else:
+            self.conn.execute(
+                """
+                create or replace secret aws_s3_secret (
+                    type s3,
+                    provider credential_chain,
+                    chain 'sso;env;config',
+                    refresh true
+                );
+                """
+            )

     def _create_full_dataset_table(self) -> None:
         """Create a table of metadata about all records in the parquet dataset.
```
