From 25e4f20fb286fb98ce5a02fe9c22ac11db6cf44a Mon Sep 17 00:00:00 2001 From: Jennings Anderson Date: Tue, 3 Feb 2026 11:13:47 -0800 Subject: [PATCH 1/5] Adding PMTiles as links in the catalogs --- pyproject.toml | 2 +- src/overture_stac/cli.py | 24 +-- src/overture_stac/overture_stac.py | 286 +++++++++-------------------- uv.lock | 11 +- 4 files changed, 102 insertions(+), 221 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8152fa7..47d9ac6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "pyyaml>=6.0.2", ] -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest>=7.4.0", "ruff>=0.1.0", diff --git a/src/overture_stac/cli.py b/src/overture_stac/cli.py index 29cdc64..844252e 100644 --- a/src/overture_stac/cli.py +++ b/src/overture_stac/cli.py @@ -30,13 +30,6 @@ def main(): help="Debug flag to only generate 1 item per collection", ) - parser.add_argument( - "--no-parallel", - dest="parallel", - action="store_false", - help="Disable parallel processing (default: parallel enabled)", - ) - parser.add_argument( "--workers", type=int, @@ -47,15 +40,16 @@ def main(): args = parser.parse_args() filesystem = fs.S3FileSystem(anonymous=True, region="us-west-2") - public_releases = filesystem.get_file_info( + available_releases = filesystem.get_file_info( fs.FileSelector("overturemaps-us-west-2/release") ) # TODO: These should be stored elsewhere, but for now we'll hardcode them here - schema_version_mapping = { + schema_version_mapping: dict[str, str] = { + "2026-03-18.0": "TBD", + "2026-02-18.0": "1.15.0", "2026-01-21.0": "1.15.0", "2025-12-17.0": "1.15.0", - "2025-11-19.0": "1.14.0", } overture_releases_catalog = pystac.Catalog( @@ -67,11 +61,13 @@ def main(): output.mkdir(parents=True, exist_ok=True) for idx, release_info in enumerate( - sorted(public_releases, key=lambda p: p.path, reverse=True) + sorted(available_releases, key=lambda p: p.path, reverse=True) ): release = release_info.path.split("/")[-1] - 
title = f"{release} Overture Release" if idx > 0 else "Latest Overture Release" + title: str = ( + f"{release} Overture Release" if idx > 0 else "Latest Overture Release" + ) this_release = OvertureRelease( release=release, @@ -80,9 +76,7 @@ def main(): debug=args.debug, ) - this_release.build_release_catalog( - title=title, parallel=args.parallel, max_workers=args.workers - ) + this_release.build_release_catalog(title=title, max_workers=args.workers) child = overture_releases_catalog.add_child( child=this_release.release_catalog, title=title diff --git a/src/overture_stac/overture_stac.py b/src/overture_stac/overture_stac.py index 7b0ab3a..c1acef7 100644 --- a/src/overture_stac/overture_stac.py +++ b/src/overture_stac/overture_stac.py @@ -10,7 +10,7 @@ import pystac import stac_geoparquet -TYPE_LICENSE_MAP = { +TYPE_LICENSE_MAP: dict[str, str] = { "bathymetry": "CC0-1.0", "land_cover": " CC-BY-4.0", "infrastructure": "ODbL-1.0", @@ -20,15 +20,23 @@ "building": "ODbL-1.0", "division": "ODbL-1.0", "division_area": "ODbL-1.0", - "division_bopundary": "ODbL-1.0", + "division_boundary": "ODbL-1.0", "segment": "ODbL-1.0", "connector": "ODbL-1.0", - "place": "CDLA-Permissive-2.0", + "place": "CDLA-Permissive-2.0, Apache 2.0, CC0 1.0.", "address": "Multiple Open Licenses", } -def process_theme_worker(theme_path, release_path, s3_region, debug, release_datetime): +def process_theme_worker( + theme_path: str, + release_path: str, + s3_region: str, + debug: bool, + release_datetime: datetime, + release: str, + available_pmtiles: dict[str, str], +) -> tuple[pystac.Catalog, list[dict], dict[str, list[pystac.Item]], str]: """ Worker function to process a single theme independently. 
@@ -41,6 +49,8 @@ def process_theme_worker(theme_path, release_path, s3_region, debug, release_dat s3_region: AWS region debug: Debug mode flag release_datetime: Release datetime + release: Release version string + available_pmtiles: Dict of available PMTiles files for this release Returns: tuple: (theme_catalog, manifest_items, type_collections, theme_name) @@ -57,6 +67,18 @@ def process_theme_worker(theme_path, release_path, s3_region, debug, release_dat id=theme_name, description=f"Overture's {theme_name} theme" ) + # Add PMTiles link if available for this theme + if theme_name in available_pmtiles: + logger.info(f"Adding PMTiles link for theme {theme_name}") + theme_catalog.add_link( + pystac.Link( + rel="pmtiles", + target=f"https://tiles.overturemaps.org/{release}/{theme_name}.pmtiles", + media_type="application/vnd.pmtiles", + title=f"{theme_name} PMTiles", + ) + ) + # Get theme types theme_path_selector = fs.FileSelector(theme_path) theme_types = filesystem.get_file_info(theme_path_selector) @@ -139,20 +161,20 @@ def process_theme_worker(theme_path, release_path, s3_region, debug, release_dat media_type="application/vnd.apache.parquet", ), ) - stac_item.add_asset( key="aws-https", asset=pystac.Asset( href=f"https://overturemaps-us-west-2.s3.us-west-2.amazonaws.com/{rel_path}", media_type="application/vnd.apache.parquet", + extra_fields={"storage:refs": ["aws-s3"]}, ), ) - stac_item.add_asset( key="azure-https", asset=pystac.Asset( href=f"https://overturemapswestus2.blob.core.windows.net/{rel_path}", media_type="application/vnd.apache.parquet", + extra_fields={"storage:refs": ["azure"]}, ), ) @@ -223,7 +245,41 @@ def __init__( self.release_datetime = datetime.strptime(release.split(".")[0], "%Y-%m-%d") - def make_release_catalog(self, title: Optional[str]): + # Discover available PMTiles for this release + self.available_pmtiles = self._get_available_pmtiles() + + def _get_available_pmtiles(self) -> dict[str, str]: + """ + Discover available PMTiles files 
for this release. + + Returns: + dict: Mapping of base names (without .pmtiles) to full S3 paths + """ + pmtiles_path: str = f"overturemaps-extras-us-west-2/tiles/{self.release}" + available_pmtiles: dict[str, str] = {} + + try: + pmtiles_selector = fs.FileSelector(pmtiles_path) + pmtiles_files = self.filesystem.get_file_info(pmtiles_selector) + + for file_info in pmtiles_files: + if file_info.path.endswith(".pmtiles"): + filename = file_info.path.split("/")[-1] + base_name = filename.replace(".pmtiles", "") + available_pmtiles[base_name] = file_info.path + self.logger.debug(f"Found PMTiles: {filename}") + + self.logger.info( + f"Discovered {len(available_pmtiles)} PMTiles files for release {self.release}" + ) + except Exception as e: + self.logger.warning( + f"Could not access PMTiles bucket for release {self.release}: {e}" + ) + + return available_pmtiles + + def make_release_catalog(self, title: Optional[str]) -> None: self.logger.info( f"Creating Release Catalog for {self.release} with schema {self.schema}" ) @@ -243,228 +299,43 @@ def make_release_catalog(self, title: Optional[str]): "storage:schemes": { "aws": { "type": "aws-s3", - "platform": "https://{bucket}.s3.{region}.amazonaws.com/release/{release_version}", - "release_version": self.release, + "platform": "https://{bucket}.s3.{region}.amazonaws.com", "bucket": "overturemaps-us-west-2", "region": "us-west-2", "requester_pays": "false", }, "azure": { "type": "ms-azure", - "platform": "https://{account}.blob.core.windows.net/release/{release_version}", + "platform": "https://{account}.blob.core.windows.net/", "account": "overturemapswestus2", "requester_pays": "false", }, }, } - def get_release_themes(self): + def get_release_themes(self) -> None: release_path_selector = fs.FileSelector(self.release_path.replace("s3://", "")) self.themes = self.filesystem.get_file_info(release_path_selector) - def create_stac_item_from_fragment(self, fragment, schema=None, type_name=None): - if schema is None: - schema = 
fragment.metadata.schema.to_arrow_schema() - - filename = fragment.path.split("/")[-1] - rel_path = ("/").join(fragment.path.split("/")[1:]) - - self.logger.info(f"Creating STAC item from: {filename}") - - # Build bbox from metadata: - geo = json.loads(schema.metadata[b"geo"].decode("utf-8")) - - xmin, ymin, xmax, ymax = geo.get("columns").get("geometry").get("bbox") - - geojson_bbox_geometry = { - "type": "Polygon", - "coordinates": [ - [ - [xmin, ymin], - [xmax, ymin], - [xmax, ymax], - [xmin, ymax], - [xmin, ymin], - ] - ], - } - - filename = fragment.path.split("/")[-1] - rel_path = ("/").join(fragment.path.split("/")[1:]) - - stac_item = pystac.Item( - id=filename.split("-")[1], - geometry=geojson_bbox_geometry, - bbox=[xmin, ymin, xmax, ymax], - properties={ - "num_rows": fragment.count_rows(), - "num_row_groups": fragment.num_row_groups, - }, - datetime=self.release_datetime, - ) - - self.manifest_items.append( - { - "type": "Feature", - "properties": { - "ovt_type": type_name, - "rel_path": rel_path, - }, - "geometry": geojson_bbox_geometry, - "bbox": [xmin, ymin, xmax, ymax], - } - ) - - # Add GeoParquet from s3 - stac_item.add_asset( - key="aws-s3", - asset=pystac.Asset( - href=f"s3://{fragment.path}", - media_type="application/vnd.apache.parquet", # application/x-parquet ? - ), - ) - - # Add s3 http link - stac_item.add_asset( - key="aws-https", - asset=pystac.Asset( - href=f"https://overturemaps-us-west-2.s3.us-west-2.amazonaws.com/{rel_path}", - media_type="application/vnd.apache.parquet", # application/x-parquet ? - ), - ) - - # Add Azure https link - stac_item.add_asset( - key="azure-https", - asset=pystac.Asset( - href=f"https://overturemapswestus2.blob.core.windows.net/{rel_path}", - media_type="application/vnd.apache.parquet", # application/x-parquet ? 
- ), - ) - - return stac_item - - def process_type(self, theme_type: fs.FileInfo): - type_name = theme_type.path.split("=")[-1] - self.logger.info(f"Opening Type: {type_name}") - - type_dataset = ds.dataset( - theme_type.path, filesystem=self.filesystem, format="parquet" - ) - - self.type_collections[type_name] = [] - schema = None - - for fragment in ( - list(type_dataset.get_fragments())[:1] - if self.debug - else type_dataset.get_fragments() - ): - schema = fragment.metadata.schema.to_arrow_schema() - - item = self.create_stac_item_from_fragment( - fragment, schema, type_name=type_name - ) - - self.type_collections[type_name].append(item) - - type_collection = pystac.Collection( - id=type_name, - description=f"Overture's {type_name} collection", - extent=pystac.Extent( - spatial=pystac.SpatialExtent( - bboxes=[i.bbox for i in self.type_collections[type_name]] - ), - temporal=pystac.TemporalExtent(intervals=[None, None]), - ), - license=TYPE_LICENSE_MAP.get(type_name), - ) - - type_collection.add_items(self.type_collections[type_name]) - - type_collection.summaries = pystac.Summaries( - { - "schema": ( - json.loads(schema.metadata[b"geo"]).get("version") - if schema is not None - else None - ), - "columns": schema.names if schema is not None else None, - } - ) - - if not self.debug: - type_collection.extra_fields = {"features": type_dataset.count_rows()} - - return type_collection - - def add_theme_to_catalog(self, theme: fs.FileSelector): - theme_name = theme.path.split("=")[-1] - self.logger.info(f"Processing Theme: {theme_name}") - theme_path_selector = fs.FileSelector(theme.path) - theme_types = self.filesystem.get_file_info(theme_path_selector) - - theme_catalog = pystac.Catalog( - id=theme_name, description=f"Overture's {theme_name} theme" - ) - - for theme_type in theme_types: - theme_catalog.add_child(self.process_type(theme_type)) - - # Ensure - theme_path = Path(self.output, theme_name) - theme_path.mkdir(parents=True, exist_ok=True) - - 
self.release_catalog.add_child(child=theme_catalog, title=theme_name) - - def build_release_catalog(self, title, parallel=True, max_workers=4): + def build_release_catalog(self, title: str, max_workers: int = 4) -> None: """ - Build release catalog with optional parallelization. + Build release catalog using parallel processing. Args: title: Title for the release catalog - parallel: Whether to process themes in parallel (default: True) max_workers: Number of parallel workers (default: 4) """ self.make_release_catalog(title=title) self.get_release_themes() - if parallel and len(self.themes) > 1: - self._build_parallel(max_workers) - else: - self._build_sequential() - - # Write outputs - with open(f"{self.output}/manifest.geojson", "w") as f: - json.dump({"type": "FeatureCollection", "features": self.manifest_items}, f) - - # Write GeoParquet Collections - all_items = [] - for _ovt_type, items in self.type_collections.items(): - all_items += items - - stac_geoparquet.arrow.to_parquet( - table=stac_geoparquet.arrow.parse_stac_items_to_arrow(all_items), - output_path=f"{self.output}/collections.parquet", - ) - - def _build_sequential(self): - """Build catalog sequentially (original implementation).""" - self.logger.info("Building catalog sequentially...") - for theme in self.themes: - self.add_theme_to_catalog(theme) - - def _build_parallel(self, max_workers): - """Build catalog in parallel using ProcessPoolExecutor.""" self.logger.info(f"Building catalog in parallel with {max_workers} workers...") # Prepare arguments for worker processes theme_paths = [theme.path for theme in self.themes] - s3_region = "us-west-2" # Extract from filesystem if needed + s3_region = "us-west-2" # Process themes in parallel with ProcessPoolExecutor(max_workers=max_workers) as executor: - # Submit all tasks future_to_theme = { executor.submit( process_theme_worker, @@ -473,11 +344,12 @@ def _build_parallel(self, max_workers): s3_region, self.debug, self.release_datetime, + self.release, + 
self.available_pmtiles, ): theme_path for theme_path in theme_paths } - # Collect results as they complete for future in as_completed(future_to_theme): theme_path = future_to_theme[future] try: @@ -488,7 +360,6 @@ def _build_parallel(self, max_workers): theme_name, ) = future.result() - # Merge results into main catalog self.logger.info(f"Merging results for theme: {theme_name}") self.release_catalog.add_child( child=theme_catalog, title=theme_name @@ -496,7 +367,6 @@ def _build_parallel(self, max_workers): self.manifest_items.extend(manifest_items) self.type_collections.update(type_collections) - # Ensure theme directory exists theme_path_dir = Path(self.output, theme_name) theme_path_dir.mkdir(parents=True, exist_ok=True) @@ -505,3 +375,17 @@ def _build_parallel(self, max_workers): f"Theme {theme_path} generated an exception: {exc}" ) raise + + # Write outputs + with open(f"{self.output}/manifest.geojson", "w") as f: + json.dump({"type": "FeatureCollection", "features": self.manifest_items}, f) + + # Write GeoParquet Collections + all_items = [] + for _ovt_type, items in self.type_collections.items(): + all_items += items + + stac_geoparquet.arrow.to_parquet( + table=stac_geoparquet.arrow.parse_stac_items_to_arrow(all_items), + output_path=f"{self.output}/collections.parquet", + ) diff --git a/uv.lock b/uv.lock index 35f03f6..e016808 100644 --- a/uv.lock +++ b/uv.lock @@ -587,7 +587,7 @@ dependencies = [ { name = "stac-geoparquet" }, ] -[package.optional-dependencies] +[package.dev-dependencies] dev = [ { name = "pytest" }, { name = "ruff" }, @@ -597,12 +597,15 @@ dev = [ requires-dist = [ { name = "pyarrow", specifier = ">=14.0.1" }, { name = "pystac", specifier = ">=1.8.4" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.4.0" }, { name = "pyyaml", specifier = ">=6.0.2" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "stac-geoparquet", specifier = ">=0.7.0" }, ] -provides-extras = ["dev"] + 
+[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=7.4.0" }, + { name = "ruff", specifier = ">=0.1.0" }, +] [[package]] name = "packaging" From 21945545f3b0cb2d0b09c55529c90458c8ee5042 Mon Sep 17 00:00:00 2001 From: Jennings Anderson Date: Tue, 3 Feb 2026 14:12:32 -0800 Subject: [PATCH 2/5] Update overture_stac.py --- .github/workflows/publish-stac.yaml | 2 +- .github/workflows/staging.yaml | 6 +-- src/overture_stac/overture_stac.py | 57 +++++++++++++++-------------- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/.github/workflows/publish-stac.yaml b/.github/workflows/publish-stac.yaml index 769bd53..865301b 100644 --- a/.github/workflows/publish-stac.yaml +++ b/.github/workflows/publish-stac.yaml @@ -1,4 +1,4 @@ -name: Publish STAC +name: Publish STAC to GH-Pages on: # Run daily at 6 AM UTC diff --git a/.github/workflows/staging.yaml b/.github/workflows/staging.yaml index 24066b0..7748636 100644 --- a/.github/workflows/staging.yaml +++ b/.github/workflows/staging.yaml @@ -44,7 +44,7 @@ jobs: needs: build environment: name: staging - url: https://staging.overturemaps.org/${{github.repository}}/pr/${{ github.event.number }}/index.html + url: https://staging.overturemaps.org/${{ github.event.repository.name }}/pr/${{ github.event.number }}/index.html steps: - name: Configure AWS credentials 🔐 @@ -60,7 +60,7 @@ jobs: - name: Copy to S3 run: | - aws s3 cp --recursive . s3://overture-managed-staging-usw2/gh-pages/${{ github.repository }}/pr/${{ github.event.number }}/ + aws s3 sync --delete . 
s3://overture-managed-staging-usw2/gh-pages/${{ github.event.repository.name }}/pr/${{ github.event.number }}/ - name: Bust the Cache - run: aws cloudfront create-invalidation --distribution-id E1KP2IN0H2RGGT --paths "/${{ github.repository }}/pr/${{ github.event.number }}/*" + run: aws cloudfront create-invalidation --distribution-id E1KP2IN0H2RGGT --paths "/${{ github.event.repository.name }}/pr/${{ github.event.number }}/*" diff --git a/src/overture_stac/overture_stac.py b/src/overture_stac/overture_stac.py index c1acef7..1af771e 100644 --- a/src/overture_stac/overture_stac.py +++ b/src/overture_stac/overture_stac.py @@ -75,7 +75,7 @@ def process_theme_worker( rel="pmtiles", target=f"https://tiles.overturemaps.org/{release}/{theme_name}.pmtiles", media_type="application/vnd.pmtiles", - title=f"{theme_name} PMTiles", + title=f"PMTiles", ) ) @@ -137,6 +137,21 @@ def process_theme_worker( properties={ "num_rows": fragment.count_rows(), "num_row_groups": fragment.num_row_groups, + "storage:schemes": { + "aws": { + "type": "aws-s3", + "platform": "https://{bucket}.s3.{region}.amazonaws.com", + "bucket": "overturemaps-us-west-2", + "region": "us-west-2", + "requester_pays": "false", + }, + "azure": { + "type": "ms-azure", + "platform": "https://{account}.blob.core.windows.net/", + "account": "overturemapswestus2", + "requester_pays": "false", + }, + }, }, datetime=release_datetime, ) @@ -155,22 +170,22 @@ def process_theme_worker( # Add assets stac_item.add_asset( - key="aws-s3", - asset=pystac.Asset( - href=f"s3://{fragment.path}", - media_type="application/vnd.apache.parquet", - ), - ) - stac_item.add_asset( - key="aws-https", + key="aws", asset=pystac.Asset( href=f"https://overturemaps-us-west-2.s3.us-west-2.amazonaws.com/{rel_path}", media_type="application/vnd.apache.parquet", - extra_fields={"storage:refs": ["aws-s3"]}, + extra_fields={ + "storage:refs": ["aws"], + "alternate": { + "href": f"s3://{fragment.path}", + "storage:refs": ["aws"], + "name": "S3", + }, 
+ }, ), ) stac_item.add_asset( - key="azure-https", + key="azure", asset=pystac.Asset( href=f"https://overturemapswestus2.blob.core.windows.net/{rel_path}", media_type="application/vnd.apache.parquet", @@ -274,7 +289,7 @@ def _get_available_pmtiles(self) -> dict[str, str]: ) except Exception as e: self.logger.warning( - f"Could not access PMTiles bucket for release {self.release}: {e}" + f"Couldn't find PMTiles for release: {self.release}: {e}" ) return available_pmtiles @@ -289,28 +304,14 @@ def make_release_catalog(self, title: Optional[str]) -> None: title=title if title is not None else self.release, description=f"Geoparquet data released in the Overture {self.release} release", stac_extensions=[ - "https://stac-extensions.github.io/storage/v2.0.0/schema.json" + "https://stac-extensions.github.io/storage/v2.0.0/schema.json", + "https://stac-extensions.github.io/alternate-assets/v1.1.0/schema.json", ], ) self.release_catalog.extra_fields = { "release:version": self.release, "schema:version": self.schema, "schema:tag": f"https://github.com/OvertureMaps/schema/releases/tag/v{self.schema}", - "storage:schemes": { - "aws": { - "type": "aws-s3", - "platform": "https://{bucket}.s3.{region}.amazonaws.com", - "bucket": "overturemaps-us-west-2", - "region": "us-west-2", - "requester_pays": "false", - }, - "azure": { - "type": "ms-azure", - "platform": "https://{account}.blob.core.windows.net/", - "account": "overturemapswestus2", - "requester_pays": "false", - }, - }, } def get_release_themes(self) -> None: From cc270cf37ff3183e9c2e6f6bb1559aa32953ad13 Mon Sep 17 00:00:00 2001 From: Jennings Anderson Date: Tue, 3 Feb 2026 14:17:52 -0800 Subject: [PATCH 3/5] Update cli.py --- .github/workflows/staging.yaml | 4 ++-- src/overture_stac/cli.py | 1 - src/overture_stac/overture_stac.py | 24 ++++++++++++++++-------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/staging.yaml b/.github/workflows/staging.yaml index 7748636..4cebb63 100644 --- 
a/.github/workflows/staging.yaml +++ b/.github/workflows/staging.yaml @@ -36,7 +36,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: stac-catalog - path: build-artifact + path: public_releases deploy: name: Deploy @@ -56,7 +56,7 @@ jobs: - name: Download artifacts 📥 uses: actions/download-artifact@v4 with: - name: build-artifact + name: stac-catalog - name: Copy to S3 run: | diff --git a/src/overture_stac/cli.py b/src/overture_stac/cli.py index 844252e..ce5dec2 100644 --- a/src/overture_stac/cli.py +++ b/src/overture_stac/cli.py @@ -5,7 +5,6 @@ import pyarrow.fs as fs import pystac - from overture_stac.overture_stac import OvertureRelease from overture_stac.registry_manifest import RegistryManifest diff --git a/src/overture_stac/overture_stac.py b/src/overture_stac/overture_stac.py index 1af771e..cabbeb8 100644 --- a/src/overture_stac/overture_stac.py +++ b/src/overture_stac/overture_stac.py @@ -98,20 +98,28 @@ def process_theme_worker( local_type_collections[type_name] = [] schema = None - for fragment in ( - list(type_dataset.get_fragments())[:1] - if debug - else type_dataset.get_fragments() - ): + # Get all fragments to calculate progress + all_fragments = list(type_dataset.get_fragments()) + if debug: + all_fragments = all_fragments[:1] + + total_fragments = len(all_fragments) + last_reported_percent = -10 # Start at -10 so 0% is reported + + for idx, fragment in enumerate(all_fragments): schema = fragment.metadata.schema.to_arrow_schema() # Create STAC item from fragment filename = fragment.path.split("/")[-1] rel_path = ("/").join(fragment.path.split("/")[1:]) - logger.info( - f" [ {fragment.path.split('/')[-2]} : {'.' 
* len(local_manifest_items)} ]" - ) + # Log progress at 10% increments + current_percent = int((idx / total_fragments) * 100) if total_fragments > 0 else 100 + if current_percent >= last_reported_percent + 10: + last_reported_percent = (current_percent // 10) * 10 + logger.info( + f" [ {fragment.path.split('/')[-2]} : {last_reported_percent}% complete ]" + ) # Build bbox from metadata geo = json.loads(schema.metadata[b"geo"].decode("utf-8")) From 7afd66b11a5c5670b1f3c62574d4ee7703a061e8 Mon Sep 17 00:00:00 2001 From: Jennings Anderson Date: Tue, 3 Feb 2026 17:03:20 -0800 Subject: [PATCH 4/5] Better tests and such --- .github/workflows/staging.yaml | 2 +- .gitignore | 2 + pyproject.toml | 6 +- src/overture_stac/cli.py | 1 + src/overture_stac/overture_stac.py | 30 +- tests/setup_test_catalog.py | 311 ++++++++++++++++++++ tests/test_e2e_stac_catalog.py | 444 +++++++++++++++++++++++++++++ tests/test_overture_stac.py | 24 -- tests/test_registry_manifest.py | 117 -------- 9 files changed, 779 insertions(+), 158 deletions(-) create mode 100644 tests/setup_test_catalog.py create mode 100644 tests/test_e2e_stac_catalog.py delete mode 100644 tests/test_overture_stac.py delete mode 100644 tests/test_registry_manifest.py diff --git a/.github/workflows/staging.yaml b/.github/workflows/staging.yaml index 4cebb63..9fe2252 100644 --- a/.github/workflows/staging.yaml +++ b/.github/workflows/staging.yaml @@ -44,7 +44,7 @@ jobs: needs: build environment: name: staging - url: https://staging.overturemaps.org/${{ github.event.repository.name }}/pr/${{ github.event.number }}/index.html + url: https://staging.overturemaps.org/${{ github.event.repository.name }}/pr/${{ github.event.number }}/catalog.json steps: - name: Configure AWS credentials 🔐 diff --git a/.gitignore b/.gitignore index 8d7a591..7fcde2c 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,5 @@ output/ *.swp *.swo *~ + +tests/data diff --git a/pyproject.toml b/pyproject.toml index 47d9ac6..28f051a 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,15 @@ testpaths = ["tests"] python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] -addopts = ["--verbose"] +addopts = ["--verbose", "-W", "default"] markers = [ "integration: marks tests as integration tests (connects to real services)", "slow: marks tests as slow (deselect with '-m \"not slow\"')", ] +filterwarnings = [ + "default", + "ignore::DeprecationWarning:pystac.*", +] [tool.ruff] line-length = 88 diff --git a/src/overture_stac/cli.py b/src/overture_stac/cli.py index ce5dec2..844252e 100644 --- a/src/overture_stac/cli.py +++ b/src/overture_stac/cli.py @@ -5,6 +5,7 @@ import pyarrow.fs as fs import pystac + from overture_stac.overture_stac import OvertureRelease from overture_stac.registry_manifest import RegistryManifest diff --git a/src/overture_stac/overture_stac.py b/src/overture_stac/overture_stac.py index cabbeb8..37bd5d2 100644 --- a/src/overture_stac/overture_stac.py +++ b/src/overture_stac/overture_stac.py @@ -75,7 +75,7 @@ def process_theme_worker( rel="pmtiles", target=f"https://tiles.overturemaps.org/{release}/{theme_name}.pmtiles", media_type="application/vnd.pmtiles", - title=f"PMTiles", + title="PMTiles", ) ) @@ -98,13 +98,12 @@ def process_theme_worker( local_type_collections[type_name] = [] schema = None - # Get all fragments to calculate progress + # Get all fragments all_fragments = list(type_dataset.get_fragments()) if debug: - all_fragments = all_fragments[:1] + all_fragments = all_fragments[:2] - total_fragments = len(all_fragments) - last_reported_percent = -10 # Start at -10 so 0% is reported + total_fragments: int = len(all_fragments) for idx, fragment in enumerate(all_fragments): schema = fragment.metadata.schema.to_arrow_schema() @@ -113,12 +112,10 @@ def process_theme_worker( filename = fragment.path.split("/")[-1] rel_path = ("/").join(fragment.path.split("/")[1:]) - # Log progress at 10% increments - current_percent = int((idx / total_fragments) 
* 100) if total_fragments > 0 else 100 - if current_percent >= last_reported_percent + 10: - last_reported_percent = (current_percent // 10) * 10 + # Log progress every 10 fragments + if idx % 10 == 0 or idx == total_fragments - 1: logger.info( - f" [ {fragment.path.split('/')[-2]} : {last_reported_percent}% complete ]" + f" [ {fragment.path.split('/')[-2]} : {idx + 1}/{total_fragments} fragments ]" ) # Build bbox from metadata @@ -184,10 +181,13 @@ def process_theme_worker( media_type="application/vnd.apache.parquet", extra_fields={ "storage:refs": ["aws"], + "alternate:name": "HTTPS", "alternate": { - "href": f"s3://{fragment.path}", - "storage:refs": ["aws"], - "name": "S3", + "s3": { + "href": f"s3://{fragment.path}", + "alternate:name": "S3", + "description": "Access the files via regular Amazon AWS S3 tooling.", + } }, }, ), @@ -211,7 +211,7 @@ def process_theme_worker( spatial=pystac.SpatialExtent( bboxes=[i.bbox for i in local_type_collections[type_name]] ), - temporal=pystac.TemporalExtent(intervals=[None, None]), + temporal=pystac.TemporalExtent(intervals=[[None, None]]), ), license=TYPE_LICENSE_MAP.get(type_name), ) @@ -313,7 +313,7 @@ def make_release_catalog(self, title: Optional[str]) -> None: description=f"Geoparquet data released in the Overture {self.release} release", stac_extensions=[ "https://stac-extensions.github.io/storage/v2.0.0/schema.json", - "https://stac-extensions.github.io/alternate-assets/v1.1.0/schema.json", + "https://stac-extensions.github.io/alternate-assets/v1.2.0/schema.json", ], ) self.release_catalog.extra_fields = { diff --git a/tests/setup_test_catalog.py b/tests/setup_test_catalog.py new file mode 100644 index 0000000..0ff22e9 --- /dev/null +++ b/tests/setup_test_catalog.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +""" +Setup script for building test STAC catalogs. + +This script builds a STAC catalog in debug mode using the same discovery +mechanism as the CLI. Run this before running integration tests. 
+ +Usage: + python tests/setup_test_catalog.py + python tests/setup_test_catalog.py --output tests/data + python tests/setup_test_catalog.py --release 2025-01-22.0 + python tests/setup_test_catalog.py --serve # Build and serve on localhost +""" + +import argparse +import logging +import os +import sys +import threading +from functools import partial +from http.server import HTTPServer, SimpleHTTPRequestHandler +from pathlib import Path + +import pyarrow.fs as fs +import pystac + +from overture_stac.overture_stac import OvertureRelease + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Schema version mapping (same as CLI) +SCHEMA_VERSION_MAPPING: dict[str, str] = { + "2026-03-18.0": "TBD", + "2026-02-18.0": "1.15.0", + "2026-01-21.0": "1.15.0", + "2025-12-17.0": "1.15.0", +} + +DEFAULT_PORT = 8888 + + +def discover_releases() -> list[str]: + """ + Discover available releases from S3 using the same mechanism as the CLI. + + Returns: + List of release names sorted by date (newest first) + """ + logger.info("Discovering available releases from S3...") + filesystem = fs.S3FileSystem(anonymous=True, region="us-west-2") + available_releases = filesystem.get_file_info( + fs.FileSelector("overturemaps-us-west-2/release") + ) + + releases = [info.path.split("/")[-1] for info in available_releases] + releases.sort(reverse=True) + + logger.info(f"Found {len(releases)} releases: {releases[:5]}...") + return releases + + +def get_latest_release() -> str: + """Get the latest available release.""" + releases = discover_releases() + if not releases: + raise RuntimeError("No releases found in S3 bucket") + return releases[0] + + +def build_test_catalog( + output_dir: Path, + release: str | None = None, + workers: int = 2, +) -> Path: + """ + Build a STAC catalog in debug mode for testing. 
+ + Args: + output_dir: Directory to output the catalog + release: Specific release to build (None = latest) + workers: Number of parallel workers + + Returns: + Path to the built catalog directory + """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Discover or use specified release + if release is None: + release = get_latest_release() + logger.info(f"Using latest release: {release}") + else: + logger.info(f"Using specified release: {release}") + + schema = SCHEMA_VERSION_MAPPING.get(release, "unknown") + logger.info(f"Schema version: {schema}") + + # Build the release catalog + logger.info(f"Building catalog for {release} in debug mode...") + + overture_release = OvertureRelease( + release=release, + schema=schema, + output=output_dir, + debug=True, # Only process first 2 fragments per type + ) + + title = f"Test Release {release}" + overture_release.build_release_catalog(title=title, max_workers=workers) + + # Create root catalog to match CLI structure + root_catalog = pystac.Catalog( + id="Overture Releases", + description="All Overture Releases (Test)", + ) + + child = root_catalog.add_child( + child=overture_release.release_catalog, + title=title, + ) + child.extra_fields = {"latest": True} + overture_release.release_catalog.extra_fields["latest"] = True + root_catalog.extra_fields = {"latest": release} + + # # Add registry manifest + # try: + # registry_manifest = RegistryManifest() + # root_catalog.extra_fields["registry"] = { + # "path": "s3://overturemaps-us-west-2/registry", + # "manifest": registry_manifest.create_manifest(), + # } + # except Exception as e: + # logger.warning(f"Could not create registry manifest: {e}") + + # Normalize and save + logger.info(f"Saving catalog to {output_dir}...") + root_catalog.normalize_and_save( + root_href=str(output_dir), + catalog_type=pystac.CatalogType.SELF_CONTAINED, + ) + + catalog_path = output_dir / release + logger.info(f"Catalog built successfully at {catalog_path}") + + return catalog_path + + 
+class CORSRequestHandler(SimpleHTTPRequestHandler): + """HTTP request handler with CORS support for STAC browser compatibility.""" + + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "*") + super().end_headers() + + def do_OPTIONS(self): + self.send_response(200) + self.end_headers() + + def log_message(self, format, *args): + logger.debug(f"HTTP: {args[0]}") + + +def serve_catalog(directory: Path, port: int = DEFAULT_PORT) -> HTTPServer: + """ + Start an HTTP server to serve the catalog directory. + + Args: + directory: Directory to serve + port: Port to serve on + + Returns: + HTTPServer instance + """ + os.chdir(directory) + handler = partial(CORSRequestHandler, directory=directory) + server = HTTPServer(("localhost", port), handler) + return server + + +def run_server_blocking(directory: Path, port: int = DEFAULT_PORT): + """Run the HTTP server in blocking mode (for CLI use).""" + server = serve_catalog(directory, port) + print(f"Serving catalog at http://localhost:{port}") + print(f"Root catalog: http://localhost:{port}/catalog.json") + print("Press Ctrl+C to stop...") + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down server...") + finally: + server.shutdown() + server.server_close() + print("Server stopped.") + + +def start_server_background(directory: Path, port: int = DEFAULT_PORT) -> HTTPServer: + """ + Start the HTTP server in a background thread. 
+ + Args: + directory: Directory to serve + port: Port to serve on + + Returns: + HTTPServer instance (call shutdown() to stop) + """ + server = serve_catalog(directory, port) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + logger.info(f"Server started at http://localhost:{port}") + return server + + +def main(): + """Main entry point for the setup script.""" + parser = argparse.ArgumentParser( + description="Build a test STAC catalog in debug mode" + ) + + parser.add_argument( + "--output", + type=str, + default=str(Path(__file__).parent / "data"), + help="Output directory for the test catalog (default: tests/data)", + ) + + parser.add_argument( + "--release", + type=str, + default=None, + help="Specific release to build (default: latest)", + ) + + parser.add_argument( + "--workers", + type=int, + default=2, + help="Number of parallel workers (default: 2)", + ) + + parser.add_argument( + "--list-releases", + action="store_true", + help="List available releases and exit", + ) + + parser.add_argument( + "--serve", + action="store_true", + help="Serve the catalog on localhost after building", + ) + + parser.add_argument( + "--serve-only", + action="store_true", + help="Only serve an existing catalog (don't build)", + ) + + parser.add_argument( + "--port", + type=int, + default=DEFAULT_PORT, + help=f"Port for HTTP server (default: {DEFAULT_PORT})", + ) + + args = parser.parse_args() + + if args.list_releases: + releases = discover_releases() + print("Available releases:") + for release in releases: + schema = SCHEMA_VERSION_MAPPING.get(release, "unknown") + print(f" {release} (schema: {schema})") + return + + output_dir = Path(args.output) + + if args.serve_only: + if not output_dir.exists(): + print(f"Error: Output directory {output_dir} does not exist.") + print("Run without --serve-only to build the catalog first.") + sys.exit(1) + run_server_blocking(output_dir, args.port) + return + + # Build the catalog + catalog_path = 
build_test_catalog( + output_dir=output_dir, + release=args.release, + workers=args.workers, + ) + + print("\nāœ“ Test catalog built successfully!") + print(f" Location: {catalog_path}") + + if args.serve: + print("\nStarting HTTP server...") + run_server_blocking(output_dir, args.port) + else: + print("\nTo serve the catalog:") + print(" python tests/setup_test_catalog.py --serve-only") + print("\nRun integration tests with:") + print(" pytest tests/test_e2e_stac_catalog.py -m integration") + + +if __name__ == "__main__": + main() diff --git a/tests/test_e2e_stac_catalog.py b/tests/test_e2e_stac_catalog.py new file mode 100644 index 0000000..87d1605 --- /dev/null +++ b/tests/test_e2e_stac_catalog.py @@ -0,0 +1,444 @@ +"""End-to-end integration tests for STAC catalog validation. + +These tests validate a pre-built STAC catalog served via HTTP using pystac. +Run the setup script first to build the test catalog: + + python tests/setup_test_catalog.py + +Then run the tests (server will start automatically): + + pytest tests/test_e2e_stac_catalog.py -m integration +""" + +import json +import socket +import time +from pathlib import Path + +import pystac +import pytest + +from tests.setup_test_catalog import start_server_background + +DEFAULT_PORT = 8888 + + +def get_test_data_dir() -> Path: + """Get the test data directory.""" + return Path(__file__).parent / "data" + + +def find_release_name() -> str | None: + """ + Find a release directory name in the test data directory. 
+ + Returns: + Release name (e.g., '2025-01-22.0'), or None if not found + """ + data_dir = get_test_data_dir() + if not data_dir.exists(): + return None + + for item in data_dir.iterdir(): + if item.is_dir() and item.name[0].isdigit(): + catalog_path = item / "catalog.json" + if catalog_path.exists(): + return item.name + + return None + + +def is_port_in_use(port: int) -> bool: + """Check if a port is already in use.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(("localhost", port)) == 0 + + +def get_all_items_from_catalog(catalog: pystac.Catalog) -> list[pystac.Item]: + """ + Recursively collect all items from a catalog. + + This replaces the deprecated get_all_items() method. + """ + items = [] + for child in catalog.get_children(): + if isinstance(child, pystac.Collection): + items.extend(child.get_items()) + elif isinstance(child, pystac.Catalog): + items.extend(get_all_items_from_catalog(child)) + return items + + +def get_all_collections_from_catalog( + catalog: pystac.Catalog, +) -> list[pystac.Collection]: + """ + Recursively collect all collections from a catalog. + + This replaces the deprecated get_all_collections() method. + """ + collections = [] + for child in catalog.get_children(): + if isinstance(child, pystac.Collection): + collections.append(child) + elif isinstance(child, pystac.Catalog): + collections.extend(get_all_collections_from_catalog(child)) + return collections + + +@pytest.fixture(scope="module") +def catalog_server(): + """ + Start an HTTP server to serve the test catalog. + + Yields the base URL for the catalog. + """ + data_dir = get_test_data_dir() + + if not data_dir.exists() or not (data_dir / "catalog.json").exists(): + pytest.skip( + "Test catalog not found. Run 'python tests/setup_test_catalog.py' first." 
+ ) + + port = DEFAULT_PORT + + # Check if server is already running + if is_port_in_use(port): + # Assume it's our server already running + yield f"http://localhost:{port}" + return + + # Start the server + server = start_server_background(data_dir, port) + time.sleep(0.5) # Give server time to start + + try: + yield f"http://localhost:{port}" + finally: + # Properly shutdown and close the server + server.shutdown() + server.server_close() + + +@pytest.fixture(scope="module") +def root_catalog_url(catalog_server) -> str: + """Get the URL for the root catalog.""" + return f"{catalog_server}/catalog.json" + + +@pytest.fixture(scope="module") +def release_catalog_url(catalog_server) -> str: + """Get the URL for a release catalog.""" + release_name = find_release_name() + if release_name is None: + pytest.skip("No release catalog found in test data.") + return f"{catalog_server}/{release_name}/catalog.json" + + +@pytest.fixture(scope="module") +def release_name() -> str: + """Get the release name from the test data directory.""" + name = find_release_name() + if name is None: + pytest.skip("No release found in test data.") + return name + + +@pytest.fixture(scope="module") +def root_catalog(root_catalog_url) -> pystac.Catalog: + """Load the root catalog from the HTTP server.""" + return pystac.Catalog.from_file(root_catalog_url) + + +@pytest.fixture(scope="module") +def release_catalog(release_catalog_url) -> pystac.Catalog: + """Load a release catalog from the HTTP server.""" + return pystac.Catalog.from_file(release_catalog_url) + + +class TestServerSetup: + """Tests that verify the test server is running correctly.""" + + @pytest.mark.integration + def test_server_is_running(self, catalog_server): + """Test that the HTTP server is running.""" + import urllib.request + + url = f"{catalog_server}/catalog.json" + response = urllib.request.urlopen(url) + assert response.status == 200 + + @pytest.mark.integration + def test_catalog_json_is_valid(self, catalog_server): + 
"""Test that catalog.json is valid JSON.""" + import urllib.request + + url = f"{catalog_server}/catalog.json" + response = urllib.request.urlopen(url) + data = json.loads(response.read().decode()) + assert "type" in data + assert data["type"] == "Catalog" + + +class TestRootCatalogValidation: + """Tests for validating the root catalog structure via HTTP.""" + + @pytest.mark.integration + def test_root_catalog_readable(self, root_catalog): + """Test that pystac can read the root catalog from HTTP.""" + assert root_catalog is not None + assert root_catalog.id is not None + assert root_catalog.description is not None + + @pytest.mark.integration + def test_root_catalog_has_children(self, root_catalog): + """Test that the root catalog has child releases.""" + children = list(root_catalog.get_children()) + assert len(children) > 0, "Root catalog should have release children" + + @pytest.mark.integration + def test_root_catalog_has_latest_field(self, root_catalog): + """Test that the root catalog has a 'latest' field.""" + assert "latest" in root_catalog.extra_fields + + +class TestReleaseCatalogValidation: + """Tests for validating release catalog with pystac via HTTP.""" + + @pytest.mark.integration + def test_release_catalog_readable(self, release_catalog): + """Test that pystac can read the release catalog from HTTP.""" + assert release_catalog is not None + assert release_catalog.id is not None + assert release_catalog.description is not None + + @pytest.mark.integration + def test_release_catalog_has_themes(self, release_catalog): + """Test that the release catalog has theme children.""" + children = list(release_catalog.get_children()) + assert len(children) > 0, "Release catalog should have theme children" + + @pytest.mark.integration + def test_release_catalog_has_collections(self, release_catalog): + """Test that the release catalog contains collections.""" + collections = get_all_collections_from_catalog(release_catalog) + assert len(collections) > 0, 
"Release catalog should have collections" + + @pytest.mark.integration + def test_release_catalog_has_items(self, release_catalog): + """Test that the release catalog contains items.""" + items = get_all_items_from_catalog(release_catalog) + assert len(items) > 0, "Release catalog should have items" + + @pytest.mark.integration + def test_release_catalog_extra_fields(self, release_catalog): + """Test that the release catalog has expected extra fields.""" + assert "release:version" in release_catalog.extra_fields + assert "schema:version" in release_catalog.extra_fields + assert "schema:tag" in release_catalog.extra_fields + + @pytest.mark.integration + def test_release_catalog_stac_extensions(self, release_catalog): + """Test that the release catalog declares STAC extensions.""" + assert release_catalog.stac_extensions is not None + assert len(release_catalog.stac_extensions) > 0 + + extension_urls = release_catalog.stac_extensions + assert any("storage" in ext for ext in extension_urls) + assert any("alternate" in ext for ext in extension_urls) + + +class TestCollectionValidation: + """Tests for validating STAC collections via HTTP.""" + + @pytest.mark.integration + def test_collections_have_valid_extents(self, release_catalog): + """Test that collections have valid spatial and temporal extents.""" + for collection in get_all_collections_from_catalog(release_catalog): + assert collection.extent is not None, ( + f"Collection {collection.id} should have extent" + ) + assert collection.extent.spatial is not None + assert collection.extent.temporal is not None + + # Spatial extent should have bboxes + assert len(collection.extent.spatial.bboxes) > 0, ( + f"Collection {collection.id} should have spatial bboxes" + ) + + # Temporal extent should have intervals + assert len(collection.extent.temporal.intervals) > 0, ( + f"Collection {collection.id} should have temporal intervals" + ) + + @pytest.mark.integration + def test_collections_have_licenses(self, release_catalog): 
+ """Test that collections have licenses where expected.""" + for collection in get_all_collections_from_catalog(release_catalog): + if collection.license is not None: + assert isinstance(collection.license, str) + + @pytest.mark.integration + def test_collections_have_summaries(self, release_catalog): + """Test that collections have summaries.""" + for collection in get_all_collections_from_catalog(release_catalog): + assert collection.summaries is not None, ( + f"Collection {collection.id} should have summaries" + ) + + +class TestItemValidation: + """Tests for validating STAC items via HTTP.""" + + @pytest.mark.integration + def test_items_have_valid_geometry(self, release_catalog): + """Test that items have valid geometry and bbox.""" + for item in get_all_items_from_catalog(release_catalog): + assert item.geometry is not None, f"Item {item.id} should have geometry" + assert item.geometry["type"] == "Polygon" + assert "coordinates" in item.geometry + + assert item.bbox is not None, f"Item {item.id} should have bbox" + assert len(item.bbox) == 4 + + @pytest.mark.integration + def test_items_have_datetime(self, release_catalog): + """Test that items have datetime.""" + for item in get_all_items_from_catalog(release_catalog): + assert item.datetime is not None, f"Item {item.id} should have datetime" + + @pytest.mark.integration + def test_items_have_assets(self, release_catalog): + """Test that items have assets.""" + for item in get_all_items_from_catalog(release_catalog): + assert len(item.assets) > 0, f"Item {item.id} should have assets" + assert "aws" in item.assets or "azure" in item.assets, ( + f"Item {item.id} should have aws or azure asset" + ) + + @pytest.mark.integration + def test_items_have_required_properties(self, release_catalog): + """Test that items have required custom properties.""" + for item in get_all_items_from_catalog(release_catalog): + assert "num_rows" in item.properties, ( + f"Item {item.id} should have num_rows property" + ) + assert 
"num_row_groups" in item.properties, ( + f"Item {item.id} should have num_row_groups property" + ) + + @pytest.mark.integration + def test_items_have_storage_schemes(self, release_catalog): + """Test that items have storage:schemes property.""" + for item in get_all_items_from_catalog(release_catalog): + assert "storage:schemes" in item.properties, ( + f"Item {item.id} should have storage:schemes property" + ) + schemes = item.properties["storage:schemes"] + assert "aws" in schemes or "azure" in schemes + + +class TestCatalogWalk: + """Tests for walking through the catalog structure via HTTP.""" + + @pytest.mark.integration + def test_walk_catalog_hierarchy(self, release_catalog): + """Test walking through the entire catalog hierarchy.""" + theme_count = 0 + collection_count = 0 + item_count = 0 + + for theme_catalog in release_catalog.get_children(): + theme_count += 1 + + for collection in theme_catalog.get_children(): + if isinstance(collection, pystac.Collection): + collection_count += 1 + + for _item in collection.get_items(): + item_count += 1 + + assert theme_count > 0, "Should have at least one theme" + assert collection_count > 0, "Should have at least one collection" + assert item_count > 0, "Should have at least one item" + + print( + f"\nCatalog structure: {theme_count} themes, " + f"{collection_count} collections, {item_count} items" + ) + + @pytest.mark.integration + def test_all_links_resolve(self, release_catalog): + """Test that all catalog links can be resolved via HTTP.""" + for child in release_catalog.get_children(): + assert child is not None + for grandchild in child.get_children(): + assert grandchild is not None + + +class TestAssetValidation: + """Tests for validating STAC assets.""" + + @pytest.mark.integration + def test_assets_have_valid_hrefs(self, release_catalog): + """Test that assets have valid href URLs.""" + for item in get_all_items_from_catalog(release_catalog): + for asset_key, asset in item.assets.items(): + assert asset.href 
is not None, ( + f"Asset {asset_key} in item {item.id} should have href" + ) + assert asset.href.startswith("http"), ( + f"Asset {asset_key} href should be a URL" + ) + + @pytest.mark.integration + def test_assets_have_media_type(self, release_catalog): + """Test that assets have media type.""" + for item in get_all_items_from_catalog(release_catalog): + for asset_key, asset in item.assets.items(): + assert asset.media_type is not None, ( + f"Asset {asset_key} in item {item.id} should have media_type" + ) + assert "parquet" in asset.media_type, "Asset should be parquet type" + + +class TestManifestValidation: + """Tests for validating the manifest.geojson file.""" + + @pytest.mark.integration + def test_manifest_accessible(self, catalog_server, release_name): + """Test that manifest.geojson is accessible via HTTP.""" + import urllib.request + + url = f"{catalog_server}/{release_name}/manifest.geojson" + response = urllib.request.urlopen(url) + assert response.status == 200 + + @pytest.mark.integration + def test_manifest_is_valid_geojson(self, catalog_server, release_name): + """Test that manifest.geojson is valid GeoJSON.""" + import urllib.request + + url = f"{catalog_server}/{release_name}/manifest.geojson" + response = urllib.request.urlopen(url) + manifest = json.loads(response.read().decode()) + + assert manifest["type"] == "FeatureCollection" + assert "features" in manifest + assert len(manifest["features"]) > 0 + + @pytest.mark.integration + def test_manifest_features_have_properties(self, catalog_server, release_name): + """Test that manifest features have expected properties.""" + import urllib.request + + url = f"{catalog_server}/{release_name}/manifest.geojson" + response = urllib.request.urlopen(url) + manifest = json.loads(response.read().decode()) + + for feature in manifest["features"]: + assert "properties" in feature + assert "ovt_type" in feature["properties"] + assert "rel_path" in feature["properties"] + assert "geometry" in feature + assert 
"bbox" in feature diff --git a/tests/test_overture_stac.py b/tests/test_overture_stac.py deleted file mode 100644 index b97ea50..0000000 --- a/tests/test_overture_stac.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Tests for overture_stac module.""" - -from overture_stac.overture_stac import TYPE_LICENSE_MAP - - -def test_license_map_exists(): - """Test that TYPE_LICENSE_MAP is defined.""" - assert TYPE_LICENSE_MAP is not None - assert isinstance(TYPE_LICENSE_MAP, dict) - - -def test_common_types_have_licenses(): - """Test that common types have license mappings.""" - common_types = ["building", "place", "address", "water", "land"] - - for type_name in common_types: - assert type_name in TYPE_LICENSE_MAP - - -def test_license_values_are_strings(): - """Test that all license values are strings.""" - for license_value in TYPE_LICENSE_MAP.values(): - assert isinstance(license_value, str) - assert len(license_value) > 0 diff --git a/tests/test_registry_manifest.py b/tests/test_registry_manifest.py deleted file mode 100644 index 0a79c41..0000000 --- a/tests/test_registry_manifest.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Tests for registry_manifest module.""" - -import json - -import pytest - -from overture_stac.registry_manifest import RegistryManifest - - -def test_init_default_params(): - """Test initialization with default parameters.""" - manifest = RegistryManifest() - assert manifest.registry_path == "overturemaps-us-west-2/registry" - assert manifest.filesystem is not None - - -def test_init_custom_params(): - """Test initialization with custom parameters.""" - custom_path = "custom-bucket/registry" - custom_region = "eu-west-1" - - manifest = RegistryManifest(registry_path=custom_path, s3_region=custom_region) - assert manifest.registry_path == custom_path - - -def test_manifest_sorting(): - """Test that manifest entries are sorted by max_id.""" - entries = [ - ["file2.parquet", "id_200"], - ["file1.parquet", "id_100"], - ["file3.parquet", "id_300"], - ] - - 
entries.sort(key=lambda x: x[1]) - - assert entries[0][1] == "id_100" - assert entries[1][1] == "id_200" - assert entries[2][1] == "id_300" - - -def test_manifest_format(): - """Test that manifest returns list of [filename, max_id] tuples.""" - expected_entry = ["test.parquet", "test_id"] - assert isinstance(expected_entry, list) - assert len(expected_entry) == 2 - assert isinstance(expected_entry[0], str) - assert isinstance(expected_entry[1], str) - - -@pytest.mark.integration -@pytest.mark.slow -def test_create_registry_manifest_integration(): - """ - Integration test: Actually connect to S3 and create registry manifest. - - This test connects to the real S3 bucket and generates the manifest. - It's marked as 'integration' and 'slow' so it can be skipped in CI. - - Run with: pytest -v -m integration - Skip with: pytest -v -m "not integration" - """ - print("\n" + "=" * 80) - print("INTEGRATION TEST: Creating Registry Manifest from S3") - print("=" * 80) - - # Create the manifest - registry = RegistryManifest() - manifest_data = registry.create_manifest() - - # Pretty print the results - print(f"\nāœ“ Successfully created manifest with {len(manifest_data)} files\n") - - if manifest_data: - print("First 10 entries:") - print("-" * 80) - for i, entry in enumerate(manifest_data[:10], 1): - filename, max_id = entry - print(f"{i:2d}. {filename:50s} | max_id: {max_id}") - - if len(manifest_data) > 10: - print(f"\n... and {len(manifest_data) - 10} more files") - - print("\n" + "-" * 80) - print("Last 5 entries:") - print("-" * 80) - for i, entry in enumerate(manifest_data[-5:], len(manifest_data) - 4): - filename, max_id = entry - print(f"{i:2d}. 
{filename:50s} | max_id: {max_id}") - - # Print as JSON - print("\n" + "=" * 80) - print("Manifest as JSON (first 5 entries):") - print("=" * 80) - print(json.dumps(manifest_data[:5], indent=2)) - - # Verify structure - print("\n" + "=" * 80) - print("Verification:") - print("=" * 80) - assert isinstance(manifest_data, list), "Manifest should be a list" - assert len(manifest_data) > 0, "Manifest should contain entries" - - # Verify each entry has the correct structure - for entry in manifest_data: - assert isinstance(entry, list), f"Entry should be a list: {entry}" - assert len(entry) == 2, f"Entry should have 2 elements: {entry}" - assert isinstance(entry[0], str), f"Filename should be string: {entry[0]}" - assert isinstance(entry[1], str), f"min_id should be string: {entry[1]}" - - # Verify sorting (each min_id should be >= previous) - for i in range(1, len(manifest_data)): - prev_id = manifest_data[i - 1][1] - curr_id = manifest_data[i][1] - assert curr_id >= prev_id, f"Manifest not sorted: {prev_id} > {curr_id}" - - print("āœ“ All verifications passed!") - print("=" * 80 + "\n") From aa4fe4a87305500a05c73fa802ddbf5d7281ac64 Mon Sep 17 00:00:00 2001 From: Jennings Anderson Date: Tue, 3 Feb 2026 17:08:07 -0800 Subject: [PATCH 5/5] Update ci.yaml --- .github/workflows/ci.yaml | 4 ++-- README.md | 41 ++++++++++++--------------------------- pyproject.toml | 2 +- uv.lock | 2 +- 4 files changed, 16 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fe807c0..cf9ee17 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -50,11 +50,11 @@ jobs: - name: Install dependencies run: | - uv pip install --system -e ".[dev]" + uv sync --all-groups - name: Run tests run: | - pytest + uv run pytest test-package-install: name: Test Package Installation diff --git a/README.md b/README.md index aed6614..99a5266 100644 --- a/README.md +++ b/README.md @@ -4,47 +4,30 @@ [![Python 
3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -Generate STAC (SpatioTemporal Asset Catalog) catalogs for all public Overture Maps releases. +Generate STAC catalogs for all public Overture Maps releases. -See it in action here: - +**[Browse the catalog](https://radiantearth.github.io/stac-browser/#/external/labs.overturemaps.org/stac/catalog.json?.language=en)** -### Installing/Updating Dependencies +## Setup ```bash -# Install package in editable mode with dev dependencies -uv pip install -e ".[dev]" - -# Install just the package (no dev dependencies) -uv pip install -e . - -# Update dependencies -uv pip install --upgrade -e ".[dev]" - -# Add a new dependency (manually edit pyproject.toml, then): -uv pip install -e ".[dev]" +uv sync ``` -### Running the Application +## Usage ```bash -# Run the STAC generator (parallel mode with 4 workers by default) -gen-stac --output ./public_releases - -# Run in debug mode (generates only 1 item per collection) -gen-stac --output ./public_releases --debug +gen-stac --output ./releases -# Control parallelization -gen-stac --output ./public_releases --workers 8 # Use 8 parallel workers -gen-stac --output ./public_releases --no-parallel # Disable parallelization +# Debug mode (2 items per collection) +gen-stac --output ./releases --debug -# Recommended for production (balance speed and resource usage) -gen-stac --output ./public_releases --workers 4 +# Custom worker count (default: 4) +gen-stac --output ./releases --workers 8 ``` -### Before Committing +## Development ```bash -# Run the full CI check locally -ruff format . && ruff check . && pytest +uv run ruff format . && uv run ruff check . 
&& uv run pytest ``` diff --git a/pyproject.toml b/pyproject.toml index 28f051a..605c175 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "overture-stac" -version = "1.0.2" +version = "1.0.3" description = "Generate STAC catalogs for Overture Maps Releases" authors = [ {name = "Overture Maps Foundation"} diff --git a/uv.lock b/uv.lock index e016808..7bfd367 100644 --- a/uv.lock +++ b/uv.lock @@ -576,7 +576,7 @@ wheels = [ [[package]] name = "overture-stac" -version = "1.0.2" +version = "1.0.3" source = { editable = "." } dependencies = [ { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },