@@ -128,7 +128,7 @@ def __init__(
128128
129129 # reading
130130 self ._current_records : bool = False
131- self .timdex_dataset_metadata : TIMDEXDatasetMetadata = None # type: ignore[assignment]
131+ self .metadata : TIMDEXDatasetMetadata = None # type: ignore[assignment]
132132
133133 @property
134134 def row_count (self ) -> int :
@@ -173,8 +173,8 @@ def load(
173173 # read dataset metadata if only current records are requested
174174 self ._current_records = current_records
175175 if current_records :
176- self .timdex_dataset_metadata = TIMDEXDatasetMetadata (timdex_dataset = self )
177- self .paths = self .timdex_dataset_metadata .get_current_parquet_files (** filters )
176+ self .metadata = TIMDEXDatasetMetadata (timdex_dataset = self )
177+ self .paths = self .metadata .get_current_parquet_files (** filters )
178178
179179 # perform initial load of full dataset
180180 self .dataset = self ._load_pyarrow_dataset ()
@@ -285,11 +285,23 @@ def _parse_date_filters(self, run_date: str | date | None) -> DatasetFilters:
285285
286286 @staticmethod
287287 def get_s3_filesystem () -> fs .FileSystem :
288- """Instantiate a pyarrow S3 Filesystem for dataset loading."""
288+ """Instantiate a pyarrow S3 Filesystem for dataset loading.
289+
290+ If the env var 'MINIO_S3_ENDPOINT_URL' is set and non-empty, assume a local
291+ MinIO S3 instance and configure accordingly; otherwise assume normal AWS S3.
292+ """
289293 session = boto3 .session .Session ()
290294 credentials = session .get_credentials ()
291295 if not credentials :
292296 raise RuntimeError ("Could not locate AWS credentials" )
297+
298+ if os .getenv ("MINIO_S3_ENDPOINT_URL" ):
299+ return fs .S3FileSystem (
300+ access_key = os .environ ["MINIO_USERNAME" ],
301+ secret_key = os .environ ["MINIO_PASSWORD" ],
302+ endpoint_override = os .environ ["MINIO_S3_ENDPOINT_URL" ],
303+ )
304+
293305 return fs .S3FileSystem (
294306 secret_key = credentials .secret_key ,
295307 access_key = credentials .access_key ,
@@ -509,9 +521,7 @@ def _yield_current_record_batches(
509521 - filters: pairs of column:value to filter the dataset metadata required
510522 """
511523 # get map of timdex_record_id to run_id for current version of that record
512- record_to_run_map = self .timdex_dataset_metadata .get_current_record_to_run_map (
513- ** filters
514- )
524+ record_to_run_map = self .metadata .get_current_record_to_run_map (** filters )
515525
516526 # loop through batches, yielding only current records
517527 for batch in batches :
0 commit comments