From f90e827e87fbbeac9e64c89c204e74d6d306f402 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 24 Sep 2025 11:26:51 +0200 Subject: [PATCH 01/19] Updated publisher logic to support multi mode parameter --- CHANGES.md | 9 +- deep_code/cli/publish.py | 44 +++- deep_code/tools/publish.py | 235 ++++++++++++++-------- deep_code/utils/dataset_stac_generator.py | 17 +- deep_code/utils/ogc_api_record.py | 2 +- deep_code/version.py | 2 +- 6 files changed, 211 insertions(+), 98 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f57e678..53eeac1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -33,4 +33,11 @@ - Introduced build_link_to_jnb method for creating STAC-compatible notebook links with metadata on kernel, environment, and containerization. - Added originating application platform metadata to generated OGC API records for - DeepESDL experiments and workflows. \ No newline at end of file + DeepESDL experiments and workflows. + +## Changes in 0.1.6 + +- Publisher now supports `mode` parameter, This allows more flexible publishing: + - `"dataset"` → publish dataset only + - `"workflow"` → publish workflow only + - `"all"` → publish both (default) \ No newline at end of file diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index d3c9001..3ec680f 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -4,11 +4,43 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. +from pathlib import Path + import click from deep_code.tools.publish import Publisher +def _validate_inputs(dataset_config, workflow_config, mode): + mode = mode.lower() + + def ensure_file(path: str, label: str): + if path is None: + raise click.UsageError(f"{label} is required but was not provided.") + if not Path(path).is_file(): + raise click.UsageError(f"{label} not found: {path}") + + if mode == "dataset": + # Need dataset only + ensure_file(dataset_config, "DATASET_CONFIG") + if workflow_config is not None: + click.echo("ℹ️ Ignoring WORKFLOW_CONFIG since mode=dataset.", err=True) + + elif mode == "workflow": + # Need workflow config only + ensure_file(workflow_config, "WORKFLOW_CONFIG") + + elif mode == "all": + # Need both + ensure_file(dataset_config, "DATASET_CONFIG") + ensure_file(workflow_config, "WORKFLOW_CONFIG") + + else: + raise click.UsageError( + "Invalid mode. Choose one of: all, dataset, workflow_experiment." + ) + + @click.command(name="publish") @click.argument("dataset_config", type=click.Path(exists=True)) @click.argument("workflow_config", type=click.Path(exists=True)) @@ -19,7 +51,14 @@ default="production", help="Target environment for publishing (production, staging, testing)", ) -def publish(dataset_config, workflow_config, environment): +@click.option( + "--mode", + "-m", + type=click.Choice(["all", "dataset", "workflow"], case_sensitive=False), + default="all", + help="Publishing mode: dataset only, workflow only, or both", +) +def publish(dataset_config, workflow_config, environment, mode): """Request publishing a dataset along with experiment and workflow metadata to the open science catalogue. 
""" @@ -28,4 +67,5 @@ def publish(dataset_config, workflow_config, environment): workflow_config_path=workflow_config, environment=environment.lower(), ) - publisher.publish_all() + result = publisher.publish(mode=mode.lower()) + click.echo(f"Pull request created: {result}") diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 15b3699..3ed9362 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -7,6 +7,7 @@ import logging from datetime import datetime from pathlib import Path +from typing import Any, Literal import fsspec import jsonpickle @@ -98,13 +99,14 @@ def publish_files( class Publisher: - """Publishes products (datasets) to the OSC GitHub repository. + """Publishes products (datasets), workflows and experiments to the OSC GitHub + repository. """ def __init__( self, - dataset_config_path: str, - workflow_config_path: str, + dataset_config_path: str | None = None, + workflow_config_path: str | None = None, environment: str = "production", ): self.environment = environment @@ -121,24 +123,37 @@ def __init__( self.collection_id = "" self.workflow_title = "" - # Paths to configuration files + # Paths to configuration files, may be optional self.dataset_config_path = dataset_config_path self.workflow_config_path = workflow_config_path + # Config dicts (loaded lazily) + self.dataset_config: dict[str, Any] = {} + self.workflow_config: dict[str, Any] = {} + + # Values that may be set from configs (lazy) + self.collection_id: str | None = None + self.workflow_title: str | None = None + self.workflow_id: str | None = None + # Load configuration files self._read_config_files() - self.collection_id = self.dataset_config.get("collection_id") - self.workflow_title = self.workflow_config.get("properties", {}).get("title") - self.workflow_id = self.workflow_config.get("workflow_id") - if not self.collection_id: - raise ValueError("collection_id is missing in dataset config.") + if self.dataset_config: + self.collection_id = self.dataset_config.get("collection_id") + if self.workflow_config: + self.workflow_title = self.workflow_config.get("properties", {}).get( + "title" + ) + self.workflow_id = self.workflow_config.get("workflow_id") def _read_config_files(self) -> None: - with fsspec.open(self.dataset_config_path, "r") as file: - self.dataset_config = yaml.safe_load(file) or {} - with fsspec.open(self.workflow_config_path, "r") as file: - self.workflow_config = yaml.safe_load(file) or {} + if self.dataset_config_path: + with fsspec.open(self.dataset_config_path, "r") as file: + self.dataset_config = yaml.safe_load(file) or {} + if self.workflow_config_path: + with fsspec.open(self.workflow_config_path, "r") as file: + self.workflow_config = yaml.safe_load(file) or {} @staticmethod def _write_to_file(file_path: str, data: dict): @@ -206,10 +221,18 @@ def _update_variable_catalogs(self, generator, file_dict, variable_ids): ) file_dict[var_file_path] = updated_catalog.to_dict() - def publish_dataset(self, write_to_file: bool = False): + def publish_dataset( + self, + write_to_file: bool = False, + mode: Literal["all", "dataset", "workflow"] = "all", + ) -> dict[str, Any]: """Prepare dataset/product collection for publishing to the specified GitHub repository.""" + if not self.dataset_config: + raise ValueError( + "No dataset config loaded. Provide dataset_config_path to publish dataset." 
+ ) dataset_id = self.dataset_config.get("dataset_id") self.collection_id = self.dataset_config.get("collection_id") documentation_link = self.dataset_config.get("documentation_link") @@ -240,7 +263,7 @@ def publish_dataset(self, write_to_file: bool = False): ) variable_ids = generator.get_variable_ids() - ds_collection = generator.build_dataset_stac_collection() + ds_collection = generator.build_dataset_stac_collection(mode=mode) # Prepare a dictionary of file paths and content file_dict = {} @@ -315,9 +338,19 @@ def _update_base_catalog( return base_catalog - def generate_workflow_experiment_records(self, write_to_file: bool = False) -> None: + def generate_workflow_experiment_records( + self, + write_to_file: bool = False, + mode: Literal["all", "dataset", "workflow"] = "all", + ) -> dict[str, Any]: """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" + + file_dict = {} + + if mode not in {"workflow", "all"}: + return file_dict # nothing to do for mode="dataset" + workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) if not workflow_id: raise ValueError("workflow_id is missing in workflow config.") @@ -367,41 +400,48 @@ def generate_workflow_experiment_records(self, write_to_file: bool = False) -> N exp_record_properties.type = "experiment" exp_record_properties.osc_workflow = workflow_id - dataset_link = link_builder.build_link_to_dataset(self.collection_id) - - experiment_record = ExperimentAsOgcRecord( - id=workflow_id, - title=self.workflow_title, - type="Feature", - jupyter_notebook_url=jupyter_notebook_url, - collection_id=self.collection_id, - properties=exp_record_properties, - links=links + theme_links + dataset_link, - ) - # Convert to dictionary and cleanup - experiment_dict = experiment_record.to_dict() - if "jupyter_notebook_url" in experiment_dict: - del experiment_dict["jupyter_notebook_url"] - if "collection_id" in experiment_dict: - del experiment_dict["collection_id"] - if "osc:project" in experiment_dict["properties"]: - del experiment_dict["properties"]["osc:project"] - # add experiment record to file_dict - exp_file_path = f"experiments/{workflow_id}/record.json" - file_dict[exp_file_path] = experiment_dict - - # Update base catalogs of experiments and workflows with links - file_dict["experiments/catalog.json"] = self._update_base_catalog( - catalog_path="experiments/catalog.json", - item_id=workflow_id, - self_href=EXPERIMENT_BASE_CATALOG_SELF_HREF, - ) + if mode in ["all"]: + if not getattr(self, "collection_id", None): + raise ValueError( + "collection_id is required to generate the experiment record when mode='all' " + "(the experiment links to the output dataset)." 
+ ) + # generate experiment record only if there is an output dataset + dataset_link = link_builder.build_link_to_dataset(self.collection_id) + + experiment_record = ExperimentAsOgcRecord( + id=workflow_id, + title=self.workflow_title, + type="Feature", + jupyter_notebook_url=jupyter_notebook_url, + collection_id=self.collection_id, + properties=exp_record_properties, + links=links + theme_links + dataset_link, + ) + # Convert to dictionary and cleanup + experiment_dict = experiment_record.to_dict() + if "jupyter_notebook_url" in experiment_dict: + del experiment_dict["jupyter_notebook_url"] + if "collection_id" in experiment_dict: + del experiment_dict["collection_id"] + if "osc:project" in experiment_dict["properties"]: + del experiment_dict["properties"]["osc:project"] + # add experiment record to file_dict + exp_file_path = f"experiments/{workflow_id}/record.json" + file_dict[exp_file_path] = experiment_dict + + # Update base catalogs of experiments and workflows with links + file_dict["experiments/catalog.json"] = self._update_base_catalog( + catalog_path="experiments/catalog.json", + item_id=workflow_id, + self_href=EXPERIMENT_BASE_CATALOG_SELF_HREF, + ) - file_dict["workflows/catalog.json"] = self._update_base_catalog( - catalog_path="workflows/catalog.json", - item_id=workflow_id, - self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, - ) + file_dict["workflows/catalog.json"] = self._update_base_catalog( + catalog_path="workflows/catalog.json", + item_id=workflow_id, + self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, + ) # Write to files if testing if write_to_file: for file_path, data in file_dict.items(): @@ -409,43 +449,68 @@ def generate_workflow_experiment_records(self, write_to_file: bool = False) -> N return {} return file_dict - def publish_all(self, write_to_file: bool = False): - """Publish both dataset and workflow/experiment in a single PR.""" - # Get file dictionaries from both methods - dataset_files = self.publish_dataset(write_to_file=write_to_file) - workflow_files = self.generate_workflow_experiment_records( - write_to_file=write_to_file - ) + def publish( + self, + write_to_file: bool = False, + mode: Literal["all", "dataset", "workflow"] = "all", + ) -> dict[str, Any] | str: + """ + Publish both dataset and workflow/experiment in a single PR. + Args: + write_to_file: If True, write JSON files locally and return the generated dict(s). + If False, open a PR and return the PR URL. + mode: Select which artifacts to publish: + - "dataset": only dataset collection & related catalogs + - "workflow": only workflow records + - "all": both + Returns: + dict[str, Any] when write_to_file=True (the files written), + or str when write_to_file=False (the PR URL). + """ - # Combine the file dictionaries - combined_files = {**dataset_files, **workflow_files} + files: dict[str, Any] = {} - if not write_to_file: - # Create branch name, commit message, PR info - branch_name = ( - f"{OSC_BRANCH_NAME}-{self.collection_id}" - f"-{datetime.now().strftime('%Y%m%d%H%M%S')}" - ) - commit_message = ( - f"Add new dataset collection: {self.collection_id} and " - f"workflow/experiment: {self.workflow_config.get('workflow_id')}" - ) - pr_title = ( - f"Add new dataset collection: {self.collection_id} and " - f"workflow/experiment: {self.workflow_config.get('workflow_id')}" - ) - pr_body = ( - f"This PR adds a new dataset collection: {self.collection_id} and " - f"its corresponding workflow/experiment to the repository." 
- ) + if mode in ("dataset", "all"): + ds_files = self.publish_dataset(write_to_file=False, mode=mode) + files.update(ds_files) - # Publish all files in one go - pr_url = self.gh_publisher.publish_files( - branch_name=branch_name, - file_dict=combined_files, - commit_message=commit_message, - pr_title=pr_title, - pr_body=pr_body, + if mode in ("workflow", "all"): + wf_files = self.generate_workflow_experiment_records(write_to_file=False) + files.update(wf_files) + + if not files: + raise ValueError( + "Nothing to publish. Choose mode='dataset', 'workflow', or 'all'." ) - logger.info(f"Pull request created: {pr_url}") + if write_to_file: + for file_path, data in files.items(): + # file_path might be a Path (from _update_and_add_to_file_dict) – normalize to str + out_path = str(file_path) + self._write_to_file(out_path, data) + return {} # consistent with existing write_to_file behavior + + # Prepare PR + mode_label = { + "dataset": f"dataset: {self.collection_id or 'unknown'}", + "workflow": f"workflow: {self.workflow_id or 'unknown'}", + "all": f"dataset: {self.collection_id or 'unknown'} + workflow/experiment: {self.workflow_id or 'unknown'}", + }[mode] + + branch_name = ( + f"{OSC_BRANCH_NAME}-{(self.collection_id or self.workflow_id or 'osc')}" + f"-{datetime.now().strftime('%Y%m%d%H%M%S')}" + ) + commit_message = f"Publish {mode_label}" + pr_title = f"Publish {mode_label}" + pr_body = f"This PR publishes {mode_label} to the repository." + + pr_url = self.gh_publisher.publish_files( + branch_name=branch_name, + file_dict=files, + commit_message=commit_message, + pr_title=pr_title, + pr_body=pr_body, + ) + logger.info(f"Pull request created: {pr_url}") + return pr_url diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index a30a817..29ff3b7 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -377,7 +377,7 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_dataset_stac_collection(self) -> Collection: + def build_dataset_stac_collection(self, mode: str) -> Collection: """Build an OSC STAC Collection for the dataset. Returns: @@ -484,14 +484,15 @@ def build_dataset_stac_collection(self) -> Collection: ) ) - collection.add_link( - Link( - rel="related", - target=f"../../experiments/{self.workflow_id}/record.json", - media_type="application/json", - title=f"Experiment: {self.workflow_title}", + if mode in "all": + collection.add_link( + Link( + rel="related", + target=f"../../experiments/{self.workflow_id}/record.json", + media_type="application/json", + title=f"Experiment: {self.workflow_title}", + ) ) - ) collection.license = self.license_type diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 22bc5eb..1ab260d 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Tuple, List, Dict +from typing import Any, Dict, List, Optional, Tuple from urllib.parse import quote, urlencode, urlparse from xrlint.util.constructible import MappingConstructible diff --git a/deep_code/version.py b/deep_code/version.py index c80384e..e81a4d9 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-version = "0.1.5" +version = "0.1.6.dev" From 8e5c02537067e1e20c3a8f5354fecfb1e5e4c5a8 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 24 Sep 2025 14:25:06 +0200 Subject: [PATCH 02/19] added test cases for publishing modes --- deep_code/tests/tools/test_publish.py | 40 +++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index d5cace9..4d58155 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -110,6 +110,46 @@ def test_read_config_files(self): self.assertEqual(self.publisher.dataset_config, dataset_config) self.assertEqual(self.publisher.workflow_config, workflow_config) + @patch("deep_code.tools.publish.GitHubPublisher") + def test_environment_repo_selection(self, mock_gp): + Publisher(environment="production") + assert mock_gp.call_args.kwargs["repo_name"] == "open-science-catalog-metadata" + Publisher(environment="staging") + assert ( + mock_gp.call_args.kwargs["repo_name"] + == "open-science-catalog-metadata-staging" + ) + Publisher(environment="testing") + assert ( + mock_gp.call_args.kwargs["repo_name"] + == "open-science-catalog-metadata-testing" + ) + + @patch.object(Publisher, "publish_dataset", return_value={"a": {}}) + @patch.object( + Publisher, "generate_workflow_experiment_records", return_value={"b": {}} + ) + def test_publish_mode_routing(self, mock_wf, mock_ds): + # dataset only + self.publisher.publish(write_to_file=True, mode="dataset") + mock_ds.assert_called() + mock_wf.assert_not_called() + + mock_ds.reset_mock() + mock_wf.reset_mock() + self.publisher.publish(write_to_file=True, mode="workflow") + mock_ds.assert_not_called() + mock_wf.assert_called() + + @patch.object(Publisher, "generate_workflow_experiment_records", return_value={}) + @patch.object(Publisher, "publish_dataset", return_value={}) + def test_publish_nothing_to_publish_raises( + self, mock_publish_dataset, mock_generate_workflow_experiment_records + ): + with pytest.raises(ValueError): + self.publisher.publish(write_to_file=False, mode="dataset") + mock_publish_dataset.assert_called_once() + mock_generate_workflow_experiment_records.assert_not_called() class TestParseGithubNotebookUrl: @pytest.mark.parametrize( From d54cba2ac5819b8f2dcc8cd05529c4af87d3a312 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 24 Sep 2025 14:55:06 +0200 Subject: [PATCH 03/19] additional test case --- deep_code/tests/tools/test_publish.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 4d58155..42dbc47 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -151,6 +151,29 @@ def test_publish_nothing_to_publish_raises( mock_publish_dataset.assert_called_once() mock_generate_workflow_experiment_records.assert_not_called() + @patch.object(Publisher, "publish_dataset", return_value={"x": {}}) + @patch.object( + Publisher, "generate_workflow_experiment_records", return_value={"y": {}} + ) + def test_publish_builds_pr_params(self, mock_wf, mock_ds): + # Make PR creation return a fixed URL + self.publisher.gh_publisher.publish_files.return_value = "PR_URL" + + # Provide IDs for commit/PR labels + self.publisher.collection_id = "col" + self.publisher.workflow_id = "wf" + + url = self.publisher.publish(write_to_file=False, mode="all") + assert url == "PR_URL" + + # Inspect the call arguments to publish_files + _, kwargs = 
self.publisher.gh_publisher.publish_files.call_args + assert "dataset: col" in kwargs["commit_message"] + assert "workflow/experiment: wf" in kwargs["commit_message"] + assert "dataset: col" in kwargs["pr_title"] + assert "workflow/experiment: wf" in kwargs["pr_title"] + + class TestParseGithubNotebookUrl: @pytest.mark.parametrize( "url,repo_url,repo_name,branch,file_path", From 62ee7bf6005b02456cff21bea764f0878d413b9a Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 24 Sep 2025 16:27:15 +0200 Subject: [PATCH 04/19] enhanced cli to auto detect the config type if not explicitly mentioned --- deep_code/cli/publish.py | 225 ++++++++++++++++++++++++++++++++++--- deep_code/tools/publish.py | 4 +- 2 files changed, 212 insertions(+), 17 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 3ec680f..56561bd 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -5,45 +5,211 @@ # https://opensource.org/licenses/MIT. from pathlib import Path +from typing import Literal, Optional, Tuple import click +import yaml from deep_code.tools.publish import Publisher +Mode = Literal["all", "dataset", "workflow"] -def _validate_inputs(dataset_config, workflow_config, mode): +DATASET_MARKERS = { + "stac_version", + "extent", + "license", + "summaries", + "assets", + "providers", + "collection", + "collection_id", + "id", +} +WORKFLOW_MARKERS = { + "workflow", + "workflow_id", + "workflow_title", + "experiment", + "jupyter_notebook_url", + "notebook", + "parameters", + "input_datasets", +} + + +def _validate_inputs( + dataset_config: Optional[str], workflow_config: Optional[str], mode: str +): mode = mode.lower() - def ensure_file(path: str, label: str): + def ensure_file(path: Optional[str], label: str): if path is None: raise click.UsageError(f"{label} is required but was not provided.") if not Path(path).is_file(): raise click.UsageError(f"{label} not found: {path}") if mode == "dataset": - # Need dataset only ensure_file(dataset_config, "DATASET_CONFIG") if workflow_config is not None: - click.echo("ℹ️ Ignoring WORKFLOW_CONFIG since mode=dataset.", err=True) + click.echo("Ignoring WORKFLOW_CONFIG since mode=dataset.", err=True) elif mode == "workflow": - # Need workflow config only ensure_file(workflow_config, "WORKFLOW_CONFIG") elif mode == "all": - # Need both ensure_file(dataset_config, "DATASET_CONFIG") ensure_file(workflow_config, "WORKFLOW_CONFIG") else: + raise click.UsageError("Invalid mode. Choose one of: all, dataset, workflow.") + + +def _detect_config_type(path: Path) -> Literal["dataset", "workflow"]: + """Detect config type via filename hints and YAML top-level keys.""" + name = path.name.lower() + if "workflow" in name or "experiment" in name: + return "workflow" + if "dataset" in name or "collection" in name: + return "dataset" + + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except Exception as e: + raise ValueError(f"Cannot read YAML from {path}: {e}") + + if not isinstance(data, dict): + raise ValueError(f"YAML in {path} must be a mapping/object at the top level.") + + keys = set(data.keys()) + ds_score = len(keys & DATASET_MARKERS) + wf_score = len(keys & WORKFLOW_MARKERS) + + if ds_score > wf_score: + return "dataset" + if wf_score > ds_score: + return "workflow" + + raise ValueError( + f"Ambiguous config type for {path}. " + "Rename to include 'dataset' or 'workflow', or pass the missing file explicitly." 
+ ) + + +def _assign_configs( + pos_first: Optional[str], + pos_second: Optional[str], + mode: Mode, + explicit_dataset: Optional[str], + explicit_workflow: Optional[str], +) -> Tuple[Optional[str], Optional[str]]: + """ + Decide which file is dataset vs workflow. + Precedence: explicit flags > positional + detection. + Returns (dataset_config, workflow_config). + """ + ds = explicit_dataset + wf = explicit_workflow + + # If both explicit provided, we're done; warn if extra positionals are passed. + pos_args = [p for p in (pos_first, pos_second) if p] + if ds and wf: + if pos_args: + click.echo( + "Positional config paths ignored because explicit flags were provided.", + err=True, + ) + return ds, wf + + # Helper to assign a single positional file to the missing slot + def _assign_single(p: str): + nonlocal ds, wf + if ds and wf: + raise click.UsageError( + "Both dataset and workflow configs already provided; remove extra positional files." + ) + # Use mode as a strong hint when only one is missing + if not ds and mode == "dataset": + ds = p + return + if not wf and mode == "workflow": + wf = p + return + # Otherwise detect + kind = _detect_config_type(Path(p)) + if kind == "dataset": + if ds and Path(ds).resolve() != Path(p).resolve(): + raise click.UsageError( + f"Multiple dataset configs supplied: {ds} and {p}" + ) + ds = p + else: + if wf and Path(wf).resolve() != Path(p).resolve(): + raise click.UsageError( + f"Multiple workflow configs supplied: {wf} and {p}" + ) + wf = p + + # If exactly one explicit provided, try to fill the other via positionals + if ds and not wf: + if len(pos_args) > 1: + raise click.UsageError( + "Provide at most one positional file when using --dataset-config." + ) + if pos_args: + _assign_single(pos_args[0]) + return ds, wf + + if wf and not ds: + if len(pos_args) > 1: + raise click.UsageError( + "Provide at most one positional file when using --workflow-config." + ) + if pos_args: + _assign_single(pos_args[0]) + return ds, wf + + # No explicit flags: rely on positionals + detection + if not pos_args: + return None, None + if len(pos_args) == 1: + p = pos_args[0] + if mode == "dataset": + return p, None + if mode == "workflow": + return None, p + # mode == "all": detect and require the other later in validation + kind = _detect_config_type(Path(p)) + return (p, None) if kind == "dataset" else (None, p) + + # Two positionals: detect both and assign + p1, p2 = pos_args[0], pos_args[1] + k1 = _detect_config_type(Path(p1)) + k2 = _detect_config_type(Path(p2)) + if k1 == k2: raise click.UsageError( - "Invalid mode. Choose one of: all, dataset, workflow_experiment." + f"Both files look like '{k1}' configs: {p1} and {p2}. " + "Please rename one or use --dataset-config/--workflow-config." 
) + ds = p1 if k1 == "dataset" else p2 + wf = p1 if k1 == "workflow" else p2 + return ds, wf @click.command(name="publish") -@click.argument("dataset_config", type=click.Path(exists=True)) -@click.argument("workflow_config", type=click.Path(exists=True)) +@click.argument("dataset_config", type=click.Path(exists=True), required=False) +@click.argument("workflow_config", type=click.Path(exists=True), required=False) +@click.option( + "--dataset-config", + "dataset_config_opt", + type=click.Path(exists=True), + help="Explicit path to dataset config (overrides positional detection).", +) +@click.option( + "--workflow-config", + "workflow_config_opt", + type=click.Path(exists=True), + help="Explicit path to workflow config (overrides positional detection).", +) @click.option( "--environment", "-e", @@ -58,14 +224,41 @@ def ensure_file(path: str, label: str): default="all", help="Publishing mode: dataset only, workflow only, or both", ) -def publish(dataset_config, workflow_config, environment, mode): - """Request publishing a dataset along with experiment and workflow metadata to the - open science catalogue. +def publish( + dataset_config, + workflow_config, + dataset_config_opt, + workflow_config_opt, + environment, + mode, +): """ + Publish dataset and/or workflow/experiment metadata. + + Examples: + deep-code publish workflow.yaml -e staging -m workflow + deep-code publish dataset.yaml -e staging -m dataset + deep-code publish dataset.yaml workflow.yaml -m all + deep-code publish --dataset-config dataset.yaml --workflow-config wf.yaml -m all + deep-code publish --dataset-config dataset.yaml -m dataset + deep-code publish --workflow-config wf.yaml -m workflow + """ + mode = mode.lower() + ds_path, wf_path = _assign_configs( + dataset_config, + workflow_config, + mode, # type: ignore[arg-type] + dataset_config_opt, + workflow_config_opt, + ) + + _validate_inputs(ds_path, wf_path, mode) + publisher = Publisher( - dataset_config_path=dataset_config, - workflow_config_path=workflow_config, + dataset_config_path=ds_path, + workflow_config_path=wf_path, environment=environment.lower(), ) - result = publisher.publish(mode=mode.lower()) - click.echo(f"Pull request created: {result}") + result = publisher.publish(mode=mode) + + click.echo(result if isinstance(result, str) else "Wrote files locally.") diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 3ed9362..cc7880a 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -475,7 +475,9 @@ def publish( files.update(ds_files) if mode in ("workflow", "all"): - wf_files = self.generate_workflow_experiment_records(write_to_file=False) + wf_files = self.generate_workflow_experiment_records( + write_to_file=False, mode=mode + ) files.update(wf_files) if not files: From 4e2a06894c5d6ead68966d15b97d849d451aca35 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 25 Sep 2025 17:08:41 +0200 Subject: [PATCH 05/19] Contacts in OGC API records no longer include default --- deep_code/utils/ogc_api_record.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 1ab260d..3f12b66 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -17,19 +17,32 @@ class Contact(MappingConstructible["Contact"], JsonSerializable): def __init__( self, - name: str, - organization: str, - position: str | None = "", + name: str | None = None, + organization: str | None = None, + position: str | None 
= None, links: list[dict[str, Any]] | None = None, - contactInstructions: str | None = "", - roles: list[str] = None, + contactInstructions: str | None = None, + roles: list[str] | None = None, ): self.name = name self.organization = organization self.position = position - self.links = links or [] + self.links = links self.contactInstructions = contactInstructions - self.roles = roles or ["principal investigator"] + self.roles = roles + + def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: + """Serialize to JSON, dropping None values.""" + data = { + "name": self.name, + "organization": self.organization, + "position": self.position, + "links": self.links, + "contactInstructions": self.contactInstructions, + "roles": self.roles, + } + # keep only explicitly set fields + return {k: v for k, v in data.items() if v is not None} class ThemeConcept(MappingConstructible["ThemeConcept"], JsonSerializable): From ce6838c6d427e388e8382257b0144b3792757a6c Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 25 Sep 2025 17:09:00 +0200 Subject: [PATCH 06/19] updated change log --- CHANGES.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 53eeac1..cdc934f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -40,4 +40,11 @@ - Publisher now supports `mode` parameter, This allows more flexible publishing: - `"dataset"` → publish dataset only - `"workflow"` → publish workflow only - - `"all"` → publish both (default) \ No newline at end of file + - `"all"` → publish both (default) + +- CLI: publish now auto-detects dataset vs workflow configs and also accepts + --dataset-config / --workflow-config; single-file calls use -m to disambiguate + (e.g., deep-code publish workflow.yaml -m workflow). + +- Contacts in OGC API records no longer include default or empty fields, only + properties explicitly defined in the workflow configuration will now be generated. \ No newline at end of file From e5254c25fb29c27119eafca2ae90006efb37f5b3 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 26 Sep 2025 11:53:38 +0200 Subject: [PATCH 07/19] updated example notebook --- examples/notebooks/publish_to_EarthCODE.ipynb | 262 ++++++++++++++---- 1 file changed, 205 insertions(+), 57 deletions(-) diff --git a/examples/notebooks/publish_to_EarthCODE.ipynb b/examples/notebooks/publish_to_EarthCODE.ipynb index b2872df..125b56d 100644 --- a/examples/notebooks/publish_to_EarthCODE.ipynb +++ b/examples/notebooks/publish_to_EarthCODE.ipynb @@ -59,6 +59,7 @@ "import warnings\n", "\n", "from xcube.core.store import new_data_store\n", + "\n", "from deep_code.tools.lint import LintDataset\n", "from deep_code.tools.publish import Publisher" ] @@ -105,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "dcc6255f-b680-419c-acf1-38968c104790", "metadata": { "tags": [] @@ -115,19 +116,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING - 2025-09-12T13:17:36Z - COPERNICUS_MARINE_SERVICE_USERNAME is deprecated. Please use COPERNICUSMARINE_SERVICE_USERNAME instead.\n", + "WARNING - 2025-09-23T14:37:42Z - COPERNICUS_MARINE_SERVICE_USERNAME is deprecated. Please use COPERNICUSMARINE_SERVICE_USERNAME instead.\n", "WARNING:copernicusmarine:COPERNICUS_MARINE_SERVICE_USERNAME is deprecated. Please use COPERNICUSMARINE_SERVICE_USERNAME instead.\n", - "WARNING - 2025-09-12T13:17:36Z - COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. 
Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n", - "WARNING:copernicusmarine:COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n" + "WARNING - 2025-09-23T14:37:42Z - COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n", + "WARNING:copernicusmarine:COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n", + "/home/tejas/micromamba/envs/deep-code/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -139,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "8003fe01-2186-4a60-9ec6-b6ce51d2a185", "metadata": { "tags": [] @@ -149,9 +152,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO - 2025-09-12T13:17:42Z - Selected dataset version: \"201904\"\n", + "INFO - 2025-09-23T14:37:44Z - Selected dataset version: \"201904\"\n", "INFO:copernicusmarine:Selected dataset version: \"201904\"\n", - "INFO - 2025-09-12T13:17:42Z - Selected dataset part: \"default\"\n", + "INFO - 2025-09-23T14:37:44Z - Selected dataset part: \"default\"\n", "INFO:copernicusmarine:Selected dataset part: \"default\"\n" ] }, @@ -538,14 +541,14 @@ " sea_surface_temperature (time, latitude, longitude) float64 10MB dask.array<chunksize=(1, 451, 512), meta=np.ndarray>\n", "Attributes:\n", " Conventions: CF-1.6\n", - " comment: IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ...\n", - " source: VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n", " title: DMI North Sea/Baltic Sea SST, L3S, (SST_BAL_SST_L3S_NRT_OBS...\n", - " references: Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias ...\n", + " source: VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n", " history: Version 1.0\n", - " institution: Danish Meteorological Institute, DMI
  • " ], "text/plain": [ " Size: 10MB\n", @@ -666,15 +669,15 @@ " sea_surface_temperature (time, latitude, longitude) float64 10MB dask.array\n", "Attributes:\n", " Conventions: CF-1.6\n", - " comment: IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ...\n", - " source: VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n", " title: DMI North Sea/Baltic Sea SST, L3S, (SST_BAL_SST_L3S_NRT_OBS...\n", - " references: Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias ...\n", + " source: VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n", " history: Version 1.0\n", - " institution: Danish Meteorological Institute, DMI" + " institution: Danish Meteorological Institute, DMI\n", + " references: Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias ...\n", + " comment: IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ..." ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -701,12 +704,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "16598aff-6967-4690-85b9-efb672369cf1", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
    <dataset> - 5 problems (2 errors and 3 warnings)

    ds.coords['latitude']                      warn   Unexpected encoding '_FillValue', coordinates must not have missing data.   var-missing-data
    ds.coords['longitude']                     warn   Unexpected encoding '_FillValue', coordinates must not have missing data.   var-missing-data
    ds.data_vars['sea_surface_temperature']    warn   Valid ranges are not recognized by xarray (as of Feb 2025).                 var-missing-data
    ds.data_vars['sea_surface_temperature']    error  Variable 'sea_surface_temperature' missing 'gcmd_keyword_url' attribute.    deepcode/variable-gcmd-keyword-url
    ds                                         error  Dataset missing required 'description' attribute.                           deepcode/dataset-description
    \n", + "
    " + ], + "text/plain": [ + "Result(file_path='', config_object=ConfigObject(name=None, files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins={'__core__': Plugin(meta=PluginMeta(name='__core__', version='0.5.1', ref='xrlint.plugins.core:export_plugin', docs_url='https://bcdev.github.io/xrlint/rule-ref'), rules={'access-latency': Rule(meta=RuleMeta(name='access-latency', version='1.0.0', description='Ensure that the time it takes to open a dataset from its source does a exceed a given `threshold` in seconds. The default threshold is `2.5`.', schema={'type': 'object', 'properties': {'threshold': {'type': 'number', 'default': 2.5, 'exclusiveMinimum': 0, 'title': 'Threshold time in seconds'}}}, ref=None, docs_url=None, type='problem'), op_class=), 'var-units': Rule(meta=RuleMeta(name='var-units', version='1.0.0', description='Every variable should provide a description of its units.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#units', type='suggestion'), op_class=), 'var-flags': Rule(meta=RuleMeta(name='var-flags', version='1.0.0', description=\"Validate attributes 'flag_values', 'flag_masks' and 'flag_meanings' that make variables that contain flag values self describing. \", schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#flags', type='suggestion'), op_class=), 'conventions': Rule(meta=RuleMeta(name='conventions', version='1.0.0', description='Datasets should identify the applicable conventions using the `Conventions` attribute.\\n The rule has an optional configuration parameter `match` which is a regex pattern that the value of the `Conventions` attribute must match, if any. If not provided, the rule just verifies that the attribute exists and whether it is a character string.', schema={'type': 'object', 'properties': {'match': {'type': 'string', 'title': 'Regex pattern'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#identification-of-conventions', type='suggestion'), op_class=), 'var-missing-data': Rule(meta=RuleMeta(name='var-missing-data', version='1.0.0', description='Checks the recommended use of missing data, i.e., coordinate variables should not define missing data, but packed data should. Notifies about the use of valid ranges to indicate missing data, which is currently not supported by xarray.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#units', type='suggestion'), op_class=), 'time-coordinate': Rule(meta=RuleMeta(name='time-coordinate', version='1.0.0', description='Time coordinates should have valid and unambiguous time units encoding.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate', type='problem'), op_class=), 'coords-for-dims': Rule(meta=RuleMeta(name='coords-for-dims', version='1.0.0', description='Dimensions of data variables should have corresponding coordinates.', schema=None, ref=None, docs_url=None, type='problem'), op_class=), 'content-desc': Rule(meta=RuleMeta(name='content-desc', version='1.0.0', description=\"A dataset should provide information about where the data came from and what has been done to it. This information is mainly for the benefit of human readers. The rule accepts the following configuration parameters:\\n\\n- `globals`: list of names of required global attributes. 
Defaults to `['title', 'history']`.\\n- `commons`: list of names of required variable attributes that can also be defined globally. Defaults to `['institution', 'source', 'references', 'comment']`.\\n- `no_vars`: do not check variables at all. Defaults to `False`.\\n- `ignored_vars`: list of ignored variables (regex patterns). Defaults to `['crs', 'spatial_ref']`.\\n\", schema={'type': 'object', 'properties': {'globals': {'type': 'array', 'default': ['title', 'history'], 'items': {'type': 'string'}, 'title': 'Global attribute names'}, 'commons': {'type': 'array', 'default': ['institution', 'source', 'references', 'comment'], 'items': {'type': 'string'}, 'title': 'Common attribute names'}, 'skip_vars': {'type': 'boolean', 'default': False, 'title': 'Do not check variables'}, 'ignored_vars': {'type': 'array', 'default': ['crs', 'spatial_ref'], 'items': {'type': 'string'}, 'title': 'Ignored variables (regex name patterns)'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#description-of-file-contents', type='suggestion'), op_class=), 'grid-mappings': Rule(meta=RuleMeta(name='grid-mappings', version='1.0.0', description='Grid mappings, if any, shall have valid grid mapping coordinate variables.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#grid-mappings-and-projections', type='problem'), op_class=), 'no-empty-chunks': Rule(meta=RuleMeta(name='no-empty-chunks', version='1.0.0', description='Empty chunks should not be encoded and written. The rule currently applies to Zarr format only.', schema=None, ref=None, docs_url='https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html#xarray-dataset-to-zarr', type='suggestion'), op_class=), 'var-desc': Rule(meta=RuleMeta(name='var-desc', version='1.0.0', description=\"Check that each data variable provides an identification and description of the content. The rule can be configured by parameter `attrs` which is a list of names of attributes that provides descriptive information. 
It defaults to `['standard_name', 'long_name']`.\", schema={'type': 'object', 'properties': {'attrs': {'type': 'array', 'default': ['standard_name', 'long_name'], 'items': {'type': 'string'}, 'title': 'Attribute names to check'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#standard-name', type='suggestion'), op_class=), 'lat-coordinate': Rule(meta=RuleMeta(name='lat-coordinate', version='1.0.0', description='Latitude coordinate should have standard units and standard names.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#latitude-coordinate', type='problem'), op_class=), 'lon-coordinate': Rule(meta=RuleMeta(name='lon-coordinate', version='1.0.0', description='Longitude coordinate should have standard units and standard names.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#longitude-coordinate', type='problem'), op_class=), 'no-empty-attrs': Rule(meta=RuleMeta(name='no-empty-attrs', version='1.0.0', description='Every dataset element should have metadata that describes it.', schema=None, ref=None, docs_url=None, type='suggestion'), op_class=)}, processors={}, configs={'recommended': [ConfigObject(name='recommended', files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'access-latency': RuleConfig(severity=1, args=(), kwargs={}), 'content-desc': RuleConfig(severity=1, args=(), kwargs={}), 'conventions': RuleConfig(severity=1, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=1, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=0, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'var-desc': RuleConfig(severity=1, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=1, args=(), kwargs={}), 'var-units': RuleConfig(severity=1, args=(), kwargs={})}, settings=None)], 'all': [ConfigObject(name='all', files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'access-latency': RuleConfig(severity=2, args=(), kwargs={}), 'var-units': RuleConfig(severity=2, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'conventions': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=2, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'content-desc': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=2, args=(), kwargs={}), 'var-desc': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=2, args=(), kwargs={})}, settings=None)]}), 'deepcode': Plugin(meta=PluginMeta(name='deepcode', version='1.0.0', ref=None, docs_url=None), rules={'dataset-description': Rule(meta=RuleMeta(name='dataset-description', version='0.0.0', description=\"Ensures the dataset has a 'description' attribute.\", schema=None, ref=None, docs_url=None, 
type='problem'), op_class=), 'variable-gcmd-keyword-url': Rule(meta=RuleMeta(name='variable-gcmd-keyword-url', version='0.0.0', description=\"Ensures all variables have a 'gcmd_keyword_url' attribute.\", schema=None, ref=None, docs_url=None, type='problem'), op_class=)}, processors={}, configs={'recommended': [ConfigObject(name=None, files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'deepcode/variable-gcmd-keyword-url': RuleConfig(severity=2, args=(), kwargs={}), 'deepcode/dataset-description': RuleConfig(severity=2, args=(), kwargs={})}, settings=None)]})}, rules={'access-latency': RuleConfig(severity=1, args=(), kwargs={}), 'content-desc': RuleConfig(severity=0, args=(), kwargs={}), 'conventions': RuleConfig(severity=0, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=0, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=0, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=0, args=(), kwargs={}), 'var-desc': RuleConfig(severity=1, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=1, args=(), kwargs={}), 'var-units': RuleConfig(severity=1, args=(), kwargs={}), 'deepcode/variable-gcmd-keyword-url': RuleConfig(severity=2, args=(), kwargs={}), 'deepcode/dataset-description': RuleConfig(severity=2, args=(), kwargs={})}, settings=None), messages=[Message(message=\"Unexpected encoding '_FillValue', coordinates must not have missing data.\", node_path=\"ds.coords['latitude']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message=\"Unexpected encoding '_FillValue', coordinates must not have missing data.\", node_path=\"ds.coords['longitude']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message='Valid ranges are not recognized by xarray (as of Feb 2025).', node_path=\"ds.data_vars['sea_surface_temperature']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message=\"Variable 'sea_surface_temperature' missing 'gcmd_keyword_url' attribute.\", node_path=\"ds.data_vars['sea_surface_temperature']\", rule_id='deepcode/variable-gcmd-keyword-url', severity=2, fatal=None, fix=None, suggestions=None), Message(message=\"Dataset missing required 'description' attribute.\", node_path='ds', rule_id='deepcode/dataset-description', severity=2, fatal=None, fix=None, suggestions=[Suggestion(desc=\"Add a 'description' attribute to dataset.attrs.\", data=None, fix=None)])])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "linter = LintDataset(dataset=ds)\n", "linter.lint_dataset()" @@ -740,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "f7b34a8b-447b-4be5-b0f2-0c70cc9352e3", "metadata": { "tags": [] @@ -831,7 +883,7 @@ "id": "b7aa7bc9-83d5-4e43-9f7f-a4da18e8cacd", "metadata": {}, "source": [ - "Once the dataset and workflow metadata are prepared and validated, users can initiate the publishing process using the deep-code CLI. 
The following command automates the entire workflow:" + "Once the dataset and workflow metadata are prepared and validated, users can initiate the publishing process using the deep-code CLI or the python function. The following command automates the entire workflow:" ] }, { @@ -850,17 +902,52 @@ "4. Creates a Pull Request (PR) for review by the Open Science Catalog steward" ] }, + { + "cell_type": "markdown", + "id": "505ec89b-ff01-4deb-8bac-ef5bb99aa9ee", + "metadata": {}, + "source": [ + "Execute the below cell to confirm if you are in the correct directory, the current directory should have the .gitaccess file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0adca69e-31a5-4ff6-8466-1a129ee85779", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Working Directory: /home/tejas/bc/projects/deepesdl/deep-code/examples/notebooks\n", + "Current Working Directory: /home/tejas/bc/projects/deepesdl/deep-code/examples/notebooks\n" + ] + } + ], + "source": [ + "notebook_path = os.path.abspath(\"publish_to_EarthCODE.ipynb\")\n", + "notebook_dir = os.path.dirname(notebook_path)\n", + "print(\"Current Working Directory:\", notebook_dir)\n", + "# Change working directory\n", + "os.chdir(notebook_dir)\n", + "\n", + "# Confirm\n", + "print(\"Current Working Directory:\", os.getcwd())" + ] + }, { "cell_type": "markdown", "id": "e763ef5b-c4c2-4991-a612-7f2bff9a676d", "metadata": {}, "source": [ - "## publish using the python function" + "## publish using the python function\n", + "Before Publish, Please prepare dataset and/or workflow configuration file for your use case" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "c7e3a5a8-d74d-4bd4-9b76-45ace4b2d341", "metadata": { "tags": [] @@ -881,25 +968,25 @@ "INFO:deep_code.utils.dataset_stac_generator:Successfully opened dataset 'cmems_sst_v2.zarr' with configuration: Public store\n", "INFO:deep_code.tools.publish:Variable catalog already exists for sea-surface-temperature, adding product link.\n", "INFO:deep_code.tools.publish:Generating OGC API Record for the workflow...\n", - "INFO:root:Creating new branch: add-new-collection-cmems-sst-20250912151834...\n", - "Switched to a new branch 'add-new-collection-cmems-sst-20250912151834'\n", - "INFO:deep_code.tools.publish:Adding products/cmems-sst/collection.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:root:Creating new branch: add-new-collection-cmems-sst-20250924111300...\n", + "Switched to a new branch 'add-new-collection-cmems-sst-20250924111300'\n", + "INFO:deep_code.tools.publish:Adding products/cmems-sst/collection.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: products/cmems-sst/collection.json...\n", - "INFO:deep_code.tools.publish:Adding variables/sea-surface-temperature/catalog.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding variables/sea-surface-temperature/catalog.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: variables/sea-surface-temperature/catalog.json...\n", - "INFO:deep_code.tools.publish:Adding /home/tejas/temp_repo/variables/catalog.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding /home/tejas/temp_repo/variables/catalog.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: /home/tejas/temp_repo/variables/catalog.json...\n", - "INFO:deep_code.tools.publish:Adding 
/home/tejas/temp_repo/products/catalog.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding /home/tejas/temp_repo/products/catalog.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: /home/tejas/temp_repo/products/catalog.json...\n", - "INFO:deep_code.tools.publish:Adding /home/tejas/temp_repo/projects/deep-earth-system-data-lab/collection.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding /home/tejas/temp_repo/projects/deep-earth-system-data-lab/collection.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: /home/tejas/temp_repo/projects/deep-earth-system-data-lab/collection.json...\n", - "INFO:deep_code.tools.publish:Adding workflows/lps-demo-cmems-sst-workflow/record.json to add-new-collection-cmems-sst-20250912151834\n", - "INFO:root:Adding new file: workflows/lps-demo-cmems-sst-workflow/record.json...\n", - "INFO:deep_code.tools.publish:Adding experiments/lps-demo-cmems-sst-workflow/record.json to add-new-collection-cmems-sst-20250912151834\n", - "INFO:root:Adding new file: experiments/lps-demo-cmems-sst-workflow/record.json...\n", - "INFO:deep_code.tools.publish:Adding experiments/catalog.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding workflows/demo-cmems-sst-workflow/record.json to add-new-collection-cmems-sst-20250924111300\n", + "INFO:root:Adding new file: workflows/demo-cmems-sst-workflow/record.json...\n", + "INFO:deep_code.tools.publish:Adding experiments/demo-cmems-sst-workflow/record.json to add-new-collection-cmems-sst-20250924111300\n", + "INFO:root:Adding new file: experiments/demo-cmems-sst-workflow/record.json...\n", + "INFO:deep_code.tools.publish:Adding experiments/catalog.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: experiments/catalog.json...\n", - "INFO:deep_code.tools.publish:Adding workflows/catalog.json to add-new-collection-cmems-sst-20250912151834\n", + "INFO:deep_code.tools.publish:Adding workflows/catalog.json to add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Adding new file: workflows/catalog.json...\n", "INFO:root:Committing and pushing changes...\n" ] @@ -908,11 +995,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "[add-new-collection-cmems-sst-20250912151834 c3381005] Add new dataset collection: cmems-sst and workflow/experiment: lps-demo-cmems-sst-workflow\n", + "[add-new-collection-cmems-sst-20250924111300 5093f608] Publish dataset: cmems-sst + workflow/experiment: demo-cmems-sst-workflow\n", " 9 files changed, 434 insertions(+), 6 deletions(-)\n", - " create mode 100644 experiments/lps-demo-cmems-sst-workflow/record.json\n", + " create mode 100644 experiments/demo-cmems-sst-workflow/record.json\n", " create mode 100644 products/cmems-sst/collection.json\n", - " create mode 100644 workflows/lps-demo-cmems-sst-workflow/record.json\n" + " create mode 100644 workflows/demo-cmems-sst-workflow/record.json\n" ] }, { @@ -920,11 +1007,11 @@ "output_type": "stream", "text": [ "remote: \n", - "remote: Create a pull request for 'add-new-collection-cmems-sst-20250912151834' on GitHub by visiting: \n", - "remote: https://github.com/tejasmharish/open-science-catalog-metadata-staging/pull/new/add-new-collection-cmems-sst-20250912151834 \n", + "remote: Create a pull request for 'add-new-collection-cmems-sst-20250924111300' on GitHub by visiting: \n", + "remote: 
https://github.com/tejasmharish/open-science-catalog-metadata-staging/pull/new/add-new-collection-cmems-sst-20250924111300 \n", "remote: \n", "To https://github.com/tejasmharish/open-science-catalog-metadata-staging.git\n", - " * [new branch] add-new-collection-cmems-sst-20250912151834 -> add-new-collection-cmems-sst-20250912151834\n", + " * [new branch] add-new-collection-cmems-sst-20250924111300 -> add-new-collection-cmems-sst-20250924111300\n", "INFO:root:Creating a pull request...\n" ] }, @@ -932,14 +1019,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Branch 'add-new-collection-cmems-sst-20250912151834' set up to track remote branch 'add-new-collection-cmems-sst-20250912151834' from 'origin'.\n" + "Branch 'add-new-collection-cmems-sst-20250924111300' set up to track remote branch 'add-new-collection-cmems-sst-20250924111300' from 'origin'.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:Pull request created: https://github.com/ESA-EarthCODE/open-science-catalog-metadata-staging/pull/156\n", + "INFO:root:Pull request created: https://github.com/ESA-EarthCODE/open-science-catalog-metadata-staging/pull/162\n", "INFO:deep_code.tools.publish:Pull request created: None\n", "INFO:root:Cleaning up local repository...\n", "INFO:deep_code.tools.publish:Pull request created: None\n" @@ -947,13 +1034,24 @@ } ], "source": [ - "# publish using the python function\n", + "# publish dataset and workflow using the python function\n", "publisher = Publisher(\n", " dataset_config_path=\"dataset-config.yaml\",\n", - " workflow_config_path=\"workflow-config.yaml\",\n", - " environment=\"staging\",\n", + " workflow_config_path=\"workflow-config.yaml\"\n", ")\n", - "publisher.publish_all()" + "publisher.publish()\n", + "\n", + "# # To publish only workflow\n", + "# publisher = Publisher(\n", + "# workflow_config_path=\"workflow-config.yaml\"\n", + "# )\n", + "# publisher.publish(mode=\"worflow\")\n", + "\n", + "# # To publish only dataset\n", + "# publisher = Publisher(\n", + "# dataset_config_path=\"dataset-config.yaml\",\n", + "# )\n", + "# publisher.publish(mode=\"dataset\")" ] }, { @@ -961,7 +1059,11 @@ "id": "09bcacf5-f435-4af3-8984-5ddf421504b8", "metadata": {}, "source": [ - "## publish using cli" + "## publish using cli\n", + "- Publisher now supports `mode` parameter, This allows more flexible publishing:\n", + " - `\"dataset\"` → publish dataset only\n", + " - `\"workflow\"` → publish workflow only\n", + " - `\"all\"` → publish both (default)" ] }, { @@ -973,7 +1075,15 @@ }, "outputs": [], "source": [ - "!deep-code publish dataset-config.yaml workflow-config.yaml -e staging" + "!deep-code publish dataset-config.yaml workflow-config.yaml " + ] + }, + { + "cell_type": "markdown", + "id": "635aafd0-6058-4c13-9679-3351aea7b173", + "metadata": {}, + "source": [ + "publish dataset with -m dataset (dataset mode)" ] }, { @@ -982,7 +1092,45 @@ "id": "406f5d35-03eb-4a6c-ba99-f70ee1101038", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "!deep-code publish dataset-config.yaml -m dataset" + ] + }, + { + "cell_type": "markdown", + "id": "e2110051-2b36-493e-912f-e5666ad8398a", + "metadata": {}, + "source": [ + "publish workflow with -m workflow (workflow mode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "739f1046-71ee-4388-a14a-b2af53f986e9", + "metadata": {}, + "outputs": [], + "source": [ + "!deep-code publish workflow-config.yaml -m workflow" + ] + }, + { + "cell_type": "markdown", + "id": 
"749eaef3-ccf1-4756-96a9-f279ab640a4c", + "metadata": {}, + "source": [ + "explicitly pass workflow config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1e20403-95b5-4cee-8730-6c458e13c19d", + "metadata": {}, + "outputs": [], + "source": [ + "!deep-code publish --workflow-config workflow-config.yaml " + ] } ], "metadata": { From ee51b57db633b4b29d47a3979e84f6248ac26c50 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 26 Sep 2025 11:57:16 +0200 Subject: [PATCH 08/19] updated README.md --- README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d82ecce..ea15223 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,8 @@ catalog ### Usage ``` -deep-code publish DATASET_CONFIG WORKFLOW_CONFIG [--environment ENVIRONMENT] +deep-code publish DATASET_CONFIG WORKFLOW_CONFIG [--environment ENVIRONMENT] [--mode +all|dataset|workflow] ``` #### Arguments @@ -104,8 +105,12 @@ deep-code publish DATASET_CONFIG WORKFLOW_CONFIG [--environment ENVIRONMENT] (e.g., workflow-config.yaml) #### Options + --dataset-config, - Explict path to dataset config + --workflow-config, - Explicit path to workflow config --environment, -e - Target catalog environment: production (default) | staging | testing + --mode, -m Publishing mode: + all (default) | dataset | workflow #### Examples: 1. Publish to staging catalog @@ -120,6 +125,18 @@ deep-code publish dataset-config.yaml workflow-config.yaml -e testing ``` deep-code publish dataset-config.yaml workflow-config.yaml ``` +4. Publish Dataset only +``` +deep-code publish dataset-config.yaml -m dataset + +deep-code publish --dataset-config dataset.yaml -m dataset +``` +5. Publish Workflow only +``` +deep-code publish dataset-config.yaml -m workflow + +deep-code publish --workflow-config workflow.yaml -m dataset +``` #### dataset-config.yaml example ``` From d4419c34f10396959f1fcc85172ad1d9aba78ece Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 26 Sep 2025 14:30:38 +0200 Subject: [PATCH 09/19] updated unint test for ogc api record class --- deep_code/tests/utils/test_ogc_api_record.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index 6cc3f3f..e302aa1 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -38,10 +38,10 @@ def test_contact_initialization(self): def test_contact_default_values(self): contact = Contact(name="Jane Doe", organization="DeepESDL") - self.assertEqual(contact.position, "") - self.assertEqual(contact.links, []) - self.assertEqual(contact.contactInstructions, "") - self.assertEqual(contact.roles, ["principal investigator"]) + self.assertEqual(contact.position, None) + self.assertEqual(contact.links, None) + self.assertEqual(contact.contactInstructions, None) + self.assertEqual(contact.roles, None) class TestThemeConcept(unittest.TestCase): From 96692f623b4073225e45816eb6c5202f0febeb61 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 26 Sep 2025 17:19:06 +0200 Subject: [PATCH 10/19] build child link for experiment in workflow record according to modes --- deep_code/tools/publish.py | 16 ++++++++++------ deep_code/utils/ogc_api_record.py | 19 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index cc7880a..5c68750 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -386,6 +386,8 
@@ def generate_workflow_experiment_records( jupyter_notebook_url=jupyter_notebook_url, themes=osc_themes, ) + if mode == "all": + link_builder.build_child_link_to_related_experiment() # Convert to dictionary and cleanup workflow_dict = workflow_record.to_dict() if "jupyter_notebook_url" in workflow_dict: @@ -400,6 +402,13 @@ def generate_workflow_experiment_records( exp_record_properties.type = "experiment" exp_record_properties.osc_workflow = workflow_id + # Update base catalogs of workflows with links + file_dict["workflows/catalog.json"] = self._update_base_catalog( + catalog_path="workflows/catalog.json", + item_id=workflow_id, + self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, + ) + if mode in ["all"]: if not getattr(self, "collection_id", None): raise ValueError( @@ -430,18 +439,13 @@ def generate_workflow_experiment_records( exp_file_path = f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict - # Update base catalogs of experiments and workflows with links + # Update base catalogs of experiments with links file_dict["experiments/catalog.json"] = self._update_base_catalog( catalog_path="experiments/catalog.json", item_id=workflow_id, self_href=EXPERIMENT_BASE_CATALOG_SELF_HREF, ) - file_dict["workflows/catalog.json"] = self._update_base_catalog( - catalog_path="workflows/catalog.json", - item_id=workflow_id, - self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, - ) # Write to files if testing if write_to_file: for file_path, data in file_dict.items(): diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 3f12b66..e95ec10 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -128,7 +128,7 @@ def build_theme_links_for_records(self): return self.theme_links @staticmethod - def build_link_to_dataset(collection_id): + def build_link_to_dataset(collection_id) -> list[dict[str, str]]: return [ { "rel": "child", @@ -138,6 +138,17 @@ def build_link_to_dataset(collection_id): } ] + def build_child_link_to_related_experiment(self) -> list[dict[str, str]]: + """Build a link to the related experiment record if publishing mode is 'all'.""" + return [ + { + "rel": "child", + "href": f"../../experiments/{self.id}/record.json", + "type": "application/json", + "title": self.title, + } + ] + def build_link_to_jnb(self, workflow_title, jupyter_nb_url) -> List[Dict[str, Any]]: return [ { @@ -278,12 +289,6 @@ def _generate_static_links(self): "type": "application/json", "title": "Workflows", }, - { - "rel": "child", - "href": f"../../experiments/{self.id}/record.json", - "type": "application/json", - "title": f"{self.title}", - }, { "rel": "jupyter-notebook", "type": "application/json", From 834578e0994f1697b0823d4fa15d739ca0011794 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 29 Sep 2025 17:21:47 +0200 Subject: [PATCH 11/19] Enhanced GitHub automation to automatically fork synchronize with upstream before committing and opening a PR to ensure branches are always up-to-date. --- CHANGES.md | 5 +- .../tests/utils/test_github_automation.py | 394 +++++++++++++----- deep_code/tools/publish.py | 13 +- deep_code/utils/github_automation.py | 271 ++++++++---- 4 files changed, 508 insertions(+), 175 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index cdc934f..4bdc254 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -47,4 +47,7 @@ (e.g., deep-code publish workflow.yaml -m workflow). 
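  As an illustrative sketch only (the actual `_detect_config_type` helper in
  `deep_code/cli/publish.py` may use different heuristics), the auto-detection can be
  pictured as a small check on the YAML keys, assuming dataset configs carry a
  top-level `collection_id` and workflow configs carry `workflow_id` or `properties`,
  the same keys the `Publisher` reads:

```python
# Hypothetical sketch of config auto-detection, not the shipped implementation.
from typing import Literal

import yaml


def detect_config_type(path: str) -> Literal["dataset", "workflow"]:
    """Guess whether a YAML config file describes a dataset or a workflow."""
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}
    if "collection_id" in cfg:
        # dataset configs define collection_id (read by Publisher)
        return "dataset"
    if "workflow_id" in cfg or "properties" in cfg:
        # workflow configs define workflow_id and properties.title
        return "workflow"
    raise ValueError(f"Cannot tell whether {path} is a dataset or a workflow config")
```

  With a check like this, `deep-code publish some-config.yaml` can route a single
  file to the right publishing path, while `-m` stays available as an explicit
  override.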
- Contacts in OGC API records no longer include default or empty fields, only - properties explicitly defined in the workflow configuration will now be generated. \ No newline at end of file + properties explicitly defined in the workflow configuration will now be generated. + +- Enhanced GitHub automation to automatically fork synchronize with upstream before + committing and opening a PR to ensure branches are always up-to-date. \ No newline at end of file diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index efa284c..7c40147 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -1,171 +1,363 @@ +# tests/test_github_automation.py import logging import unittest from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, call, patch from deep_code.utils.github_automation import GitHubAutomation +def make_cp(stdout: str = ""): + """Helper to mimic subprocess.CompletedProcess-like return for our mocks.""" + m = MagicMock() + m.stdout = stdout + return m + + class TestGitHubAutomation(unittest.TestCase): def setUp(self): - # Set up test data self.username = "testuser" self.token = "testtoken" self.repo_owner = "testowner" self.repo_name = "testrepo" - self.github_automation = GitHubAutomation( - self.username, self.token, self.repo_owner, self.repo_name + self.gha = GitHubAutomation( + self.username, + self.token, + self.repo_owner, + self.repo_name, + local_clone_dir="/tmp/temp_repo", ) - logging.disable(logging.CRITICAL) # Disable logging during tests + logging.disable(logging.CRITICAL) def tearDown(self): - logging.disable(logging.NOTSET) # Re-enable logging after tests + logging.disable(logging.NOTSET) @patch("requests.post") def test_fork_repository(self, mock_post): - # Mock the response from GitHub API - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_post.return_value = mock_response + mock_post.return_value = MagicMock(**{"raise_for_status.return_value": None}) + self.gha.fork_repository() - # Call the method - self.github_automation.fork_repository() - - # Assertions mock_post.assert_called_once_with( f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks", headers={"Authorization": f"token {self.token}"}, + timeout=60, ) @patch("subprocess.run") - def test_clone_repository_new(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() + def test_clone_sync_repository_new(self, mock_run): + """ + No .git directory → we clone and then ensure upstream remote gets added. + """ + # Simulate: "git remote -v" returns nothing so we add 'upstream' + def run_side_effect(args, cwd, check, capture_output=False, text=True): + if args[:3] == ["git", "remote", "-v"]: + return make_cp(stdout="") + return make_cp() + + mock_run.side_effect = run_side_effect + + with patch.object(Path, "mkdir") as _mk, patch( + "pathlib.Path.exists", side_effect=lambda p=None: False + ): + self.gha.clone_sync_repository() + + # Expect a clone, then a 'git remote -v', then 'git remote add upstream ...' 
+ origin_url = f"https://{self.username}:{self.token}@github.com/{self.username}/{self.repo_name}.git" + + expected_calls = [ + # git clone /tmp/temp_repo (cwd=".") + call( + ["git", "clone", origin_url, "/tmp/temp_repo"], + cwd=".", + check=True, + capture_output=False, + text=True, + ), + # ensure_upstream_remote -> remote -v (capture_output=True) + call( + ["git", "remote", "-v"], + cwd="/tmp/temp_repo", + check=True, + capture_output=True, + text=True, + ), + # since not present -> add upstream + call( + [ + "git", + "remote", + "add", + "upstream", + f"https://github.com/{self.repo_owner}/{self.repo_name}.git", + ], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), + ] + # Only assert these key calls occurred in order (there can be other internal calls) + mock_run.assert_has_calls(expected_calls, any_order=False) + + @patch("subprocess.run") + def test_clone_sync_repository_existing(self, mock_run): + """ + .git exists → we fetch/prune and still ensure upstream remote. + """ + + def run_side_effect(args, cwd, check, capture_output=False, text=True): + if args[:3] == ["git", "remote", "-v"]: + # Pretend we already have no 'upstream' to force add + return make_cp(stdout="") + return make_cp() + + mock_run.side_effect = run_side_effect - # Mock os.path.exists to return False (directory does not exist) - with patch("os.path.exists", return_value=False): - self.github_automation.clone_sync_repository() + # Path.exists should return True when asked about /tmp/temp_repo/.git + def exists_side_effect(self): + return str(self).endswith("/tmp/temp_repo/.git") - # Assertions - mock_run.assert_called_once_with( + with patch("pathlib.Path.exists", new=exists_side_effect): + self.gha.clone_sync_repository() + + mock_run.assert_any_call( + ["git", "fetch", "--all", "--prune"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) + mock_run.assert_any_call( + ["git", "remote", "-v"], + cwd="/tmp/temp_repo", + check=True, + capture_output=True, + text=True, + ) + mock_run.assert_any_call( [ "git", - "clone", - f"https://{self.username}:{self.token}@github.com/{self.username}/{self.repo_name}.git", - self.github_automation.local_clone_dir, + "remote", + "add", + "upstream", + f"https://github.com/{self.repo_owner}/{self.repo_name}.git", ], + cwd="/tmp/temp_repo", check=True, + capture_output=False, + text=True, ) @patch("subprocess.run") - def test_clone_repository_existing(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() + def test_sync_fork_with_upstream_merge_strategy(self, mock_run): + """ + Ensure sequence: fetch upstream+origin, checkout/ensure base branch, merge, push. 
+ """ + + def run_side_effect(args, cwd, check, capture_output=False, text=True): + # 'git branch' to check if base exists locally → return no branches, so it creates + if args[:2] == ["git", "branch"]: + return make_cp(stdout="") + if args[:3] == ["git", "remote", "-v"]: + return make_cp( + stdout="upstream\t{url} (fetch)\n".format( + url=f"https://github.com/{self.repo_owner}/{self.repo_name}.git" + ) + ) + return make_cp() + + mock_run.side_effect = run_side_effect + + # Pretend .git exists so we operate in repo + def exists_side_effect(self): + return str(self).endswith("/tmp/temp_repo/.git") - # Mock os.path.exists to return True (directory exists) - with patch("os.path.exists", return_value=True): - with patch("os.chdir"): - self.github_automation.clone_sync_repository() + with patch("pathlib.Path.exists", new=exists_side_effect): + self.gha.sync_fork_with_upstream(base_branch="main", strategy="merge") - # Assertions - mock_run.assert_called_once_with(["git", "pull"], check=True) + # Key steps + mock_run.assert_any_call( + ["git", "fetch", "upstream", "main"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) + mock_run.assert_any_call( + ["git", "fetch", "origin", "main"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) + # It may try to create local branch from origin first, then upstream + # We just ensure a merge with upstream and push occurred + mock_run.assert_any_call( + ["git", "merge", "upstream/main"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) + mock_run.assert_any_call( + ["git", "push", "origin", "main"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) @patch("subprocess.run") def test_create_branch(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() - - # Mock os.chdir - with patch("os.chdir"): - self.github_automation.create_branch("test-branch") + mock_run.return_value = make_cp() + # Ensure .git exists + with patch( + "pathlib.Path.exists", + new=lambda self: str(self).endswith("/tmp/temp_repo/.git"), + ): + self.gha.create_branch("feature/x", from_branch="main") - # Assertions - mock_run.assert_called_once_with( - ["git", "checkout", "-b", "test-branch"], check=True + mock_run.assert_has_calls( + [ + call( + ["git", "checkout", "main"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), + call( + ["git", "checkout", "-B", "feature/x"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), + ], + any_order=False, ) @patch("subprocess.run") def test_add_file(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() + mock_run.return_value = make_cp() + with patch.object(Path, "mkdir") as _mk, patch.object( + Path, "write_text" + ) as _wt: + # Ensure .git exists + with patch( + "pathlib.Path.exists", + new=lambda self: str(self).endswith("/tmp/temp_repo/.git"), + ): + self.gha.add_file("dir/file.json", {"k": "v"}) + + _wt.assert_called_once() # JSON written + mock_run.assert_any_call( + ["git", "add", "/tmp/temp_repo/dir/file.json"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ) - # Mock os.chdir and Path - with patch("os.chdir"), patch("pathlib.Path.mkdir"), patch( - "builtins.open", unittest.mock.mock_open() + @patch("subprocess.run") + def test_commit_and_push(self, mock_run): + mock_run.return_value = make_cp() + with patch( + "pathlib.Path.exists", + new=lambda self: 
str(self).endswith("/tmp/temp_repo/.git"), ): - self.github_automation.add_file("test/file.json", {"key": "value"}) + self.gha.commit_and_push("feat", "my message") - # Assertions - mock_run.assert_called_once_with( + mock_run.assert_has_calls( [ - "git", - "add", - str(Path(self.github_automation.local_clone_dir) / "test/file.json"), + call( + ["git", "checkout", "feat"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), + call( + ["git", "commit", "-m", "my message"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), + call( + ["git", "push", "-u", "origin", "feat"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, + ), ], - check=True, + any_order=False, ) @patch("subprocess.run") - def test_commit_and_push(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() + def test_commit_and_push_nothing_to_commit(self, mock_run): + # Make the commit call raise like 'git' does when nothing to commit + def run_side_effect(args, cwd, check, capture_output=False, text=True): + if args[:2] == ["git", "commit"]: + e = Exception("nothing to commit, working tree clean") + # Mimic our _run raising RuntimeError + raise RuntimeError(str(e)) + return make_cp() - # Mock os.chdir - with patch("os.chdir"): - self.github_automation.commit_and_push("test-branch", "Test commit message") + mock_run.side_effect = run_side_effect + + with patch( + "pathlib.Path.exists", + new=lambda self: str(self).endswith("/tmp/temp_repo/.git"), + ): + # Should not raise; should still push + self.gha.commit_and_push("feat", "msg") - # Assertions - mock_run.assert_any_call( - ["git", "commit", "-m", "Test commit message"], check=True - ) mock_run.assert_any_call( - ["git", "push", "-u", "origin", "test-branch"], check=True + ["git", "push", "-u", "origin", "feat"], + cwd="/tmp/temp_repo", + check=True, + capture_output=False, + text=True, ) @patch("requests.post") def test_create_pull_request(self, mock_post): - # Mock the response from GitHub API - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_response.json.return_value = {"html_url": "https://github.com/test/pull/1"} - mock_post.return_value = mock_response - - # Mock os.chdir - with patch("os.chdir"): - self.github_automation.create_pull_request( - "test-branch", "Test PR", "Test body" - ) - - # Assertions + mock_post.return_value = MagicMock( + **{ + "raise_for_status.return_value": None, + "json.return_value": {"html_url": "https://github.com/test/pull/1"}, + } + ) + url = self.gha.create_pull_request( + "feat", "PR title", "Body", base_branch="main" + ) + + self.assertEqual(url, "https://github.com/test/pull/1") mock_post.assert_called_once_with( f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls", headers={"Authorization": f"token {self.token}"}, json={ - "title": "Test PR", - "head": f"{self.username}:test-branch", + "title": "PR title", + "head": f"{self.username}:feat", "base": "main", - "body": "Test body", + "body": "Body", }, + timeout=60, ) - @patch("subprocess.run") - def test_clean_up(self, mock_run): - # Mock the subprocess.run method - mock_run.return_value = MagicMock() - - # Mock os.chdir - with patch("os.chdir"): - self.github_automation.clean_up() - - # Assertions - mock_run.assert_called_once_with( - ["rm", "-rf", self.github_automation.local_clone_dir] - ) + @patch("shutil.rmtree") + def test_clean_up(self, mock_rm): + # Simulate repo path exists + with 
patch("pathlib.Path.exists", return_value=True): + self.gha.clean_up() + mock_rm.assert_called_once_with(Path("/tmp/temp_repo")) - def test_file_exists(self): - # Mock os.path.isfile - with patch("os.path.isfile", return_value=True): - result = self.github_automation.file_exists("test/file.json") + def test_file_exists_true(self): + with patch("pathlib.Path.is_file", return_value=True): + self.assertTrue(self.gha.file_exists("a/b.json")) - # Assertions - self.assertTrue(result) + def test_file_exists_false(self): + with patch("pathlib.Path.is_file", return_value=False): + self.assertFalse(self.gha.file_exists("a/b.json")) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 5c68750..edd0c7a 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -62,6 +62,8 @@ def publish_files( commit_message: str, pr_title: str, pr_body: str, + base_branch: str = "main", + sync_strategy: str = "merge", # 'ff' | 'rebase' | 'merge' ) -> str: """Publish multiple files to a new branch and open a PR. @@ -71,12 +73,21 @@ def publish_files( commit_message: Commit message for all changes. pr_title: Title of the pull request. pr_body: Description/body of the pull request. + base_branch: base branch, default main + sync_strategy: git sync strategy Returns: URL of the created pull request. """ try: - self.github_automation.create_branch(branch_name) + # Ensure local clone and remotes are ready + self.github_automation.clone_sync_repository() + # *** Sync fork with upstream before creating the branch/committing *** + self.github_automation.sync_fork_with_upstream( + base_branch=base_branch, strategy=sync_strategy + ) + + self.github_automation.create_branch(branch_name, from_branch=base_branch) # Add each file to the branch for file_path, content in file_dict.items(): diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 8090069..e52eb75 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -1,14 +1,17 @@ #!/usr/bin/env python3 - # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. +from __future__ import annotations + import json import logging import os +import shutil import subprocess from pathlib import Path +from typing import Any import requests @@ -23,99 +26,218 @@ class GitHubAutomation: token: Personal access token for GitHub. repo_owner: Owner of the repository to fork. repo_name: Name of the repository to fork. + local_clone_dir: Optional path to use for local clone (defaults to ~/temp_repo). 
""" - def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): + def __init__( + self, + username: str, + token: str, + repo_owner: str, + repo_name: str, + local_clone_dir: str | None = None, + ): self.username = username self.token = token self.repo_owner = repo_owner self.repo_name = repo_name + self.base_repo_url = f"https://github.com/{repo_owner}/{repo_name}.git" - self.fork_repo_url = ( + # Tokenized origin URL for pushes to the fork + self.origin_repo_url = ( f"https://{username}:{token}@github.com/{username}/{repo_name}.git" ) - self.local_clone_dir = os.path.join(os.path.expanduser("~"), "temp_repo") + self.local_clone_dir = ( + os.path.join(os.path.expanduser("~"), "temp_repo") + if local_clone_dir is None + else local_clone_dir + ) + + def _run( + self, + cmd: list[str], + cwd: str | Path | None = None, + check: bool = True, + capture_output: bool = False, + text: bool = True, + ) -> subprocess.CompletedProcess: + """Run a subprocess command with consistent logging and error handling.""" + cwd_str = str(cwd) if cwd is not None else str(self.local_clone_dir) + logging.debug("RUN: %s (cwd=%s)", " ".join(cmd), cwd_str) + try: + return subprocess.run( + cmd, cwd=cwd_str, check=check, capture_output=capture_output, text=text + ) + except subprocess.CalledProcessError as e: + stdout = e.stdout or "" + stderr = e.stderr or "" + raise RuntimeError( + f"Command failed: {' '.join(cmd)}\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + ) from e + + def _run_git(self, args: list[str], cwd: str | Path | None = None) -> None: + self._run(["git", *args], cwd=cwd) + + def _git_output(self, args: list[str]) -> str: + res = self._run(["git", *args], capture_output=True) + return res.stdout - def fork_repository(self): + def _ensure_repo_dir(self) -> Path: + path = Path(self.local_clone_dir) + path.mkdir(parents=True, exist_ok=True) + return path + + def _ensure_upstream_remote(self) -> None: + """Ensure a remote named 'upstream' points to the base repo.""" + repo = self._ensure_repo_dir() + remotes = self._git_output(["remote", "-v"]) + upstream_url = self.base_repo_url + + if "upstream" not in remotes: + logging.info("Adding 'upstream' remote -> %s", upstream_url) + self._run_git(["remote", "add", "upstream", upstream_url], cwd=repo) + else: + # If the URL mismatches, fix it + if f"upstream\t{upstream_url} (fetch)" not in remotes: + logging.info("Updating 'upstream' remote URL -> %s", upstream_url) + self._run_git(["remote", "set-url", "upstream", upstream_url], cwd=repo) + + def fork_repository(self) -> None: """Fork the repository to the user's GitHub account.""" logging.info("Forking repository...") url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks" headers = {"Authorization": f"token {self.token}"} - response = requests.post(url, headers=headers) + response = requests.post(url, headers=headers, timeout=60) response.raise_for_status() - logging.info(f"Repository forked to {self.username}/{self.repo_name}") + logging.info("Repository forked to %s/%s", self.username, self.repo_name) + + def clone_sync_repository(self) -> None: + """Clone the forked repository locally if missing; otherwise fetch & fast-forward origin.""" + repo = self._ensure_repo_dir() + git_dir = repo / ".git" + + if not git_dir.exists(): + logging.info("Cloning fork: %s -> %s", self.origin_repo_url, repo) + self._run(["git", "clone", self.origin_repo_url, str(repo)], cwd=".") + # Ensure default branch is tracked locally (we don't assume 'main') + # This will be handled by later sync 
step. + else: + logging.info("Local clone exists; fetching latest from origin.") + self._run_git(["fetch", "--all", "--prune"], cwd=repo) + + # Always ensure we have the upstream remote configured + self._ensure_upstream_remote() + + def sync_fork_with_upstream( + self, base_branch: str = "main", strategy: str = "merge" + ) -> None: + """Sync local and origin base branch from upstream base branch. + strategy: + - 'ff' : fast-forward only + - 'rebase' : rebase local base_branch onto upstream/base_branch + - 'merge' : merge upstream/base_branch into local/base_branch (default) + """ + repo = self._ensure_repo_dir() + logging.info( + "Syncing fork with upstream (%s) on branch '%s'...", strategy, base_branch + ) + + # Make sure remotes are present and fresh + self._ensure_upstream_remote() + self._run_git(["fetch", "upstream", base_branch], cwd=repo) + self._run_git(["fetch", "origin", base_branch], cwd=repo) - def clone_sync_repository(self): - """Clone the forked repository locally if it doesn't exist, or pull updates if it does.""" - logging.info("Checking local repository...") - if not os.path.exists(self.local_clone_dir): - logging.info("Cloning forked repository...") + # Ensure we have a local branch for base_branch + local_branches = self._git_output(["branch"]) + if f" {base_branch}\n" not in (local_branches + "\n"): + # Create local base branch tracking origin/base_branch if it exists, otherwise upstream try: - subprocess.run( - ["git", "clone", self.fork_repo_url, self.local_clone_dir], - check=True, + self._run_git( + ["checkout", "-b", base_branch, f"origin/{base_branch}"], cwd=repo + ) + except RuntimeError: + self._run_git( + ["checkout", "-b", base_branch, f"upstream/{base_branch}"], cwd=repo ) - logging.info(f"Repository cloned to {self.local_clone_dir}") - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to clone repository: {e}") else: - logging.info("Local repository already exists. 
Pulling latest changes...") - try: - os.chdir(self.local_clone_dir) - subprocess.run(["git", "pull"], check=True) - logging.info("Repository updated with latest changes.") - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to pull latest changes: {e}") - - def create_branch(self, branch_name: str): - """Create a new branch in the local repository.""" - logging.info(f"Creating new branch: {branch_name}...") - try: - os.chdir(self.local_clone_dir) - subprocess.run(["git", "checkout", "-b", branch_name], check=True) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed Creating branch: '{branch_name}': {e}") + self._run_git(["checkout", base_branch], cwd=repo) - def add_file(self, file_path: str, content): - """Add a new file to the local repository.""" - logging.info(f"Adding new file: {file_path}...") - os.chdir(self.local_clone_dir) - full_path = Path(self.local_clone_dir) / file_path + if strategy == "ff": + self._run_git(["merge", "--ff-only", f"upstream/{base_branch}"], cwd=repo) + elif strategy == "rebase": + self._run_git(["rebase", f"upstream/{base_branch}"], cwd=repo) + elif strategy == "merge": + self._run_git(["merge", f"upstream/{base_branch}"], cwd=repo) + else: + raise ValueError("strategy must be one of: 'ff', 'rebase', 'merge'") + + # Push updated base branch to fork + self._run_git(["push", "origin", base_branch], cwd=repo) + logging.info( + "Fork origin/%s is now aligned with upstream/%s.", base_branch, base_branch + ) + + def create_branch(self, branch_name: str, from_branch: str = "main") -> None: + """Create or reset a local branch from a given base branch.""" + repo = self._ensure_repo_dir() + logging.info("Creating branch '%s' from '%s'...", branch_name, from_branch) + + # Ensure base exists locally (caller should have synced beforehand) + self._run_git(["checkout", from_branch], cwd=repo) + # -B creates or resets the branch to the current HEAD of from_branch + self._run_git(["checkout", "-B", branch_name], cwd=repo) + + def add_file(self, file_path: str, content: Any) -> None: + """Add a new file (serialized to JSON) to the local repository and stage it.""" + repo = self._ensure_repo_dir() + full_path = Path(repo) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) - # Ensure content is serializable + + # Normalize content to something JSON serializable if hasattr(content, "to_dict"): content = content.to_dict() if not isinstance(content, (dict, list, str, int, float, bool, type(None))): raise TypeError(f"Cannot serialize content of type {type(content)}") + try: json_content = json.dumps( content, indent=2, ensure_ascii=False, default=serialize ) except TypeError as e: - raise RuntimeError(f"JSON serialization failed: {e}") - with open(full_path, "w", encoding="utf-8") as f: - f.write(json_content) - try: - subprocess.run(["git", "add", str(full_path)], check=True) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to add file '{file_path}': {e}") + raise RuntimeError( + f"JSON serialization failed for '{file_path}': {e}" + ) from e + + full_path.write_text(json_content, encoding="utf-8") + self._run_git(["add", str(full_path)], cwd=repo) + logging.info("Added and staged file: %s", file_path) + + def commit_and_push(self, branch_name: str, commit_message: str) -> None: + """Commit staged changes on the branch and push to origin.""" + repo = self._ensure_repo_dir() + logging.info("Committing and pushing changes on '%s'...", branch_name) - def commit_and_push(self, branch_name: str, 
commit_message: str): - """Commit changes and push to the forked repository.""" - logging.info("Committing and pushing changes...") - os.chdir(self.local_clone_dir) + self._run_git(["checkout", branch_name], cwd=repo) try: - subprocess.run(["git", "commit", "-m", commit_message], check=True) - subprocess.run(["git", "push", "-u", "origin", branch_name], check=True) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to commit and push: {e}") + self._run_git(["commit", "-m", commit_message], cwd=repo) + except RuntimeError as e: + if "nothing to commit" in str(e).lower(): + logging.info("Nothing to commit on '%s'; pushing anyway.", branch_name) + else: + raise + self._run_git(["push", "-u", "origin", branch_name], cwd=repo) def create_pull_request( self, branch_name: str, pr_title: str, pr_body: str, base_branch: str = "main" - ): - """Create a pull request from the forked repository to the base repository.""" - logging.info("Creating a pull request...") - os.chdir(self.local_clone_dir) + ) -> str: + """Create a pull request from fork:branch_name → base_repo:base_branch. + + Returns: + The created PR URL. + """ + logging.info( + "Creating pull request '%s' -> base:%s ...", branch_name, base_branch + ) url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls" headers = {"Authorization": f"token {self.token}"} data = { @@ -124,22 +246,27 @@ def create_pull_request( "base": base_branch, "body": pr_body, } - response = requests.post(url, headers=headers, json=data) + response = requests.post(url, headers=headers, json=data, timeout=60) response.raise_for_status() - pr_url = response.json()["html_url"] - logging.info(f"Pull request created: {pr_url}") + pr_url = response.json().get("html_url", "") + logging.info("Pull request created: %s", pr_url) + return pr_url - def clean_up(self): - """Clean up the local cloned repository.""" - logging.info("Cleaning up local repository...") - os.chdir("..") + def clean_up(self) -> None: + """Remove the local cloned repository directory.""" + repo = Path(self.local_clone_dir) + logging.info("Cleaning up local repository at %s ...", repo) try: - subprocess.run(["rm", "-rf", self.local_clone_dir]) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to clean-up local repository: {e}") + if repo.exists(): + shutil.rmtree(repo) + except Exception as e: + raise RuntimeError( + f"Failed to clean up local repository '{repo}': {e}" + ) from e - def file_exists(self, file_path) -> bool: + def file_exists(self, file_path: str) -> bool: + """Check if a file exists within the local clone directory.""" full_path = Path(self.local_clone_dir) / file_path - exists = os.path.isfile(full_path) - logging.debug(f"Checking existence of {full_path}: {exists}") + exists = full_path.is_file() + logging.debug("Checking existence of %s: %s", full_path, exists) return exists From 6cda01236d261d669634b9774caad6b6b633e234 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 29 Sep 2025 17:24:59 +0200 Subject: [PATCH 12/19] remove unused import --- deep_code/tests/utils/test_github_automation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index 7c40147..671adb3 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -1,8 +1,7 @@ -# tests/test_github_automation.py import logging import unittest from pathlib import Path -from unittest.mock import 
ANY, MagicMock, call, patch +from unittest.mock import MagicMock, call, patch from deep_code.utils.github_automation import GitHubAutomation From 704ff9c7e9340309a93539f196db201f86dc4131 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Tue, 30 Sep 2025 13:32:20 +0200 Subject: [PATCH 13/19] Update CHANGES.md Co-authored-by: Thomas Storm --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 4bdc254..980e892 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -42,7 +42,7 @@ - `"workflow"` → publish workflow only - `"all"` → publish both (default) -- CLI: publish now auto-detects dataset vs workflow configs and also accepts +- CLI: the `publish` command now auto-detects dataset vs workflow configs and also accepts --dataset-config / --workflow-config; single-file calls use -m to disambiguate (e.g., deep-code publish workflow.yaml -m workflow). From 3fd117ea3286ee3caafb2687e708a68223e79591 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Tue, 30 Sep 2025 13:32:37 +0200 Subject: [PATCH 14/19] Update deep_code/cli/publish.py Co-authored-by: Thomas Storm --- deep_code/cli/publish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 56561bd..49da430 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -46,7 +46,7 @@ def ensure_file(path: Optional[str], label: str): if path is None: raise click.UsageError(f"{label} is required but was not provided.") if not Path(path).is_file(): - raise click.UsageError(f"{label} not found: {path}") + raise click.UsageError(f"{label} not found: {path} is not a file") if mode == "dataset": ensure_file(dataset_config, "DATASET_CONFIG") From 8abb34597d71c0681701e9a274201c3dbb4479eb Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Tue, 30 Sep 2025 13:34:00 +0200 Subject: [PATCH 15/19] Update examples/notebooks/publish_to_EarthCODE.ipynb Co-authored-by: Thomas Storm --- examples/notebooks/publish_to_EarthCODE.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks/publish_to_EarthCODE.ipynb b/examples/notebooks/publish_to_EarthCODE.ipynb index 125b56d..47faf85 100644 --- a/examples/notebooks/publish_to_EarthCODE.ipynb +++ b/examples/notebooks/publish_to_EarthCODE.ipynb @@ -942,7 +942,7 @@ "metadata": {}, "source": [ "## publish using the python function\n", - "Before Publish, Please prepare dataset and/or workflow configuration file for your use case" + "Before publishing, please prepare dataset and/or workflow configuration file for your use case" ] }, { From 3b65e0649b828f157c60eab8cf82201c2a874d58 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 30 Sep 2025 14:07:27 +0200 Subject: [PATCH 16/19] refactor to address comments --- deep_code/cli/publish.py | 18 +++++++++--------- deep_code/tools/publish.py | 28 +++++++++++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 49da430..19f1a0d 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -5,7 +5,7 @@ # https://opensource.org/licenses/MIT. 
from pathlib import Path -from typing import Literal, Optional, Tuple +from typing import Literal import click import yaml @@ -38,11 +38,11 @@ def _validate_inputs( - dataset_config: Optional[str], workflow_config: Optional[str], mode: str + dataset_config: str | None, workflow_config: str | None, mode: str ): mode = mode.lower() - def ensure_file(path: Optional[str], label: str): + def ensure_file(path: str | None, label: str): if path is None: raise click.UsageError(f"{label} is required but was not provided.") if not Path(path).is_file(): @@ -96,12 +96,12 @@ def _detect_config_type(path: Path) -> Literal["dataset", "workflow"]: def _assign_configs( - pos_first: Optional[str], - pos_second: Optional[str], + pos_first: str | None, + pos_second: str | None, mode: Mode, - explicit_dataset: Optional[str], - explicit_workflow: Optional[str], -) -> Tuple[Optional[str], Optional[str]]: + explicit_dataset: str | None, + explicit_workflow: str | None, +) -> tuple[str | None, str | None]: """ Decide which file is dataset vs workflow. Precedence: explicit flags > positional + detection. @@ -121,7 +121,7 @@ def _assign_configs( return ds, wf # Helper to assign a single positional file to the missing slot - def _assign_single(p: str): + def _assign_single(p: str) -> tuple[str | None, str | None]: nonlocal ds, wf if ds and wf: raise click.UsageError( diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index edd0c7a..b9a5e2d 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -2,7 +2,6 @@ # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. - import copy import logging from datetime import datetime @@ -63,7 +62,9 @@ def publish_files( pr_title: str, pr_body: str, base_branch: str = "main", - sync_strategy: str = "merge", # 'ff' | 'rebase' | 'merge' + sync_strategy: Literal[ + "ff", "rebase", "merge" + ] = "merge", # 'ff' | 'rebase' | 'merge' ) -> str: """Publish multiple files to a new branch and open a PR. @@ -73,16 +74,29 @@ def publish_files( commit_message: Commit message for all changes. pr_title: Title of the pull request. pr_body: Description/body of the pull request. - base_branch: base branch, default main - sync_strategy: git sync strategy + base_branch: Base branch to branch from and open the PR against (default: "main") + sync_strategy: How to sync local with the remote base before pushing: + - "ff": Fast-forward only (no merge commits; fails if FF not possible). + - "rebase": Rebase local changes onto the updated base branch. + - "merge": Create a merge commit (default). Returns: URL of the created pull request. + + Raises: + ValueError: If an unsupported sync_strategy is provided. """ + + if sync_strategy not in {"ff", "rebase", "merge"}: + raise ValueError( + f'Invalid sync_strategy="{sync_strategy}". ' + 'Accepted values are "ff", "rebase", "merge".' 
+ ) + try: # Ensure local clone and remotes are ready self.github_automation.clone_sync_repository() - # *** Sync fork with upstream before creating the branch/committing *** + # Sync fork with upstream before creating the branch/committing self.github_automation.sync_fork_with_upstream( base_branch=base_branch, strategy=sync_strategy ) @@ -134,7 +148,7 @@ def __init__( self.collection_id = "" self.workflow_title = "" - # Paths to configuration files, may be optional + # Paths to configuration files, can be optional self.dataset_config_path = dataset_config_path self.workflow_config_path = workflow_config_path @@ -142,7 +156,7 @@ def __init__( self.dataset_config: dict[str, Any] = {} self.workflow_config: dict[str, Any] = {} - # Values that may be set from configs (lazy) + # Values that may be set from configs self.collection_id: str | None = None self.workflow_title: str | None = None self.workflow_id: str | None = None From 360f5e0811fe14be7a5052116c17193cd6094713 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 30 Sep 2025 14:39:52 +0200 Subject: [PATCH 17/19] Prevented duplicate item and self links when updating base catalogs of workflows and experiments. --- CHANGES.md | 5 ++++- deep_code/tools/publish.py | 43 ++++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 980e892..5a88e93 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -50,4 +50,7 @@ properties explicitly defined in the workflow configuration will now be generated. - Enhanced GitHub automation to automatically fork synchronize with upstream before - committing and opening a PR to ensure branches are always up-to-date. \ No newline at end of file + committing and opening a PR to ensure branches are always up-to-date. + +- Prevented duplicate item and self links when updating base catalogs of workflows and + experiments. \ No newline at end of file diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index b9a5e2d..9207999 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -343,24 +343,49 @@ def _update_base_catalog( Returns: Updated Catalog object. 
""" - # Load the base catalog base_catalog = Catalog.from_file( Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path ) - # Add a link to the new item - base_catalog.add_link( - Link( - rel="item", - target=f"./{item_id}/record.json", - media_type="application/json", - title=f"{self.workflow_title}", + item_href = f"./{item_id}/record.json" + + # Ensure the "item" link is unique + def link_href(link: Link) -> str | None: + # PySTAC Link may store href in .target or .href; .get_href() resolves if base HREF set + return ( + getattr(link, "href", None) + or getattr(link, "target", None) + or (link.get_href() if hasattr(link, "get_href") else None) ) + + has_item = any( + (l.rel == "item") and (link_href(l) == item_href) + for l in base_catalog.links ) + if not has_item: + base_catalog.add_link( + Link( + rel="item", + target=item_href, + media_type="application/json", + title=self.workflow_title, + ) + ) - # Set the self-href for the base catalog + # Ensure there is exactly one "self" link + base_catalog.links = [l for l in base_catalog.links if l.rel != "self"] base_catalog.set_self_href(self_href) + # deduplicate by (rel, href) + seen = set() + unique_links = [] + for l in base_catalog.links: + key = (l.rel, link_href(l)) + if key not in seen: + unique_links.append(l) + seen.add(key) + base_catalog.links = unique_links + return base_catalog def generate_workflow_experiment_records( From 5d1f3a8e8188643ab35c5943565dca02eb52f01b Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 30 Sep 2025 15:22:08 +0200 Subject: [PATCH 18/19] cleared the ouput cells --- examples/notebooks/publish_to_EarthCODE.ipynb | 721 +----------------- 1 file changed, 13 insertions(+), 708 deletions(-) diff --git a/examples/notebooks/publish_to_EarthCODE.ipynb b/examples/notebooks/publish_to_EarthCODE.ipynb index 47faf85..eb8d520 100644 --- a/examples/notebooks/publish_to_EarthCODE.ipynb +++ b/examples/notebooks/publish_to_EarthCODE.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "dc774651-bef8-4483-9a83-7fcacdc88797", "metadata": { "tags": [] @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "626af94e-9397-4e1f-83e8-9fb1de5d9e0b", "metadata": { "tags": [] @@ -106,35 +106,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "dcc6255f-b680-419c-acf1-38968c104790", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING - 2025-09-23T14:37:42Z - COPERNICUS_MARINE_SERVICE_USERNAME is deprecated. Please use COPERNICUSMARINE_SERVICE_USERNAME instead.\n", - "WARNING:copernicusmarine:COPERNICUS_MARINE_SERVICE_USERNAME is deprecated. Please use COPERNICUSMARINE_SERVICE_USERNAME instead.\n", - "WARNING - 2025-09-23T14:37:42Z - COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n", - "WARNING:copernicusmarine:COPERNICUS_MARINE_SERVICE_PASSWORD is deprecated. Please use COPERNICUSMARINE_SERVICE_PASSWORD instead.\n", - "/home/tejas/micromamba/envs/deep-code/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "store = new_data_store(\"cmems\")\n", "store" @@ -142,546 +119,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "8003fe01-2186-4a60-9ec6-b6ce51d2a185", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO - 2025-09-23T14:37:44Z - Selected dataset version: \"201904\"\n", - "INFO:copernicusmarine:Selected dataset version: \"201904\"\n", - "INFO - 2025-09-23T14:37:44Z - Selected dataset part: \"default\"\n", - "INFO:copernicusmarine:Selected dataset part: \"default\"\n" - ] - }, - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
    <xarray.Dataset> Size: 10MB\n",
    -       "Dimensions:                  (time: 5, latitude: 451, longitude: 551)\n",
    -       "Coordinates:\n",
    -       "  * latitude                 (latitude) float32 2kB 53.0 53.02 ... 61.98 62.0\n",
    -       "  * longitude                (longitude) float32 2kB 9.0 9.02 ... 19.98 20.0\n",
    -       "  * time                     (time) datetime64[ns] 40B 2022-01-01 ... 2022-01-05\n",
    -       "Data variables:\n",
    -       "    sea_surface_temperature  (time, latitude, longitude) float64 10MB dask.array<chunksize=(1, 451, 512), meta=np.ndarray>\n",
    -       "Attributes:\n",
    -       "    Conventions:  CF-1.6\n",
    -       "    title:        DMI North Sea/Baltic Sea SST, L3S, (SST_BAL_SST_L3S_NRT_OBS...\n",
    -       "    source:       VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n",
    -       "    history:      Version 1.0\n",
    -       "    institution:  Danish Meteorological Institute, DMI\n",
    -       "    references:   Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias ...\n",
    -       "    comment:      IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ...
    " - ], - "text/plain": [ - " Size: 10MB\n", - "Dimensions: (time: 5, latitude: 451, longitude: 551)\n", - "Coordinates:\n", - " * latitude (latitude) float32 2kB 53.0 53.02 ... 61.98 62.0\n", - " * longitude (longitude) float32 2kB 9.0 9.02 ... 19.98 20.0\n", - " * time (time) datetime64[ns] 40B 2022-01-01 ... 2022-01-05\n", - "Data variables:\n", - " sea_surface_temperature (time, latitude, longitude) float64 10MB dask.array\n", - "Attributes:\n", - " Conventions: CF-1.6\n", - " title: DMI North Sea/Baltic Sea SST, L3S, (SST_BAL_SST_L3S_NRT_OBS...\n", - " source: VIIRS_SST_NPP_NAR-OSISAF-L3C, SEVIRI_SST-OSISAF-L3C, EUR-L2...\n", - " history: Version 1.0\n", - " institution: Danish Meteorological Institute, DMI\n", - " references: Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias ...\n", - " comment: IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ..." - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ds = store.open_data(\n", " \"DMI-BALTIC-SST-L3S-NRT-OBS_FULL_TIME_SERIE\",\n", @@ -704,61 +147,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "16598aff-6967-4690-85b9-efb672369cf1", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "

    <dataset> - 5 problems (2 errors and 3 warnings)

    \n", - "
    \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    ds.coords['latitude'] | warn | Unexpected encoding '_FillValue', coordinates must not have missing data. | var-missing-data
    ds.coords['longitude'] | warn | Unexpected encoding '_FillValue', coordinates must not have missing data. | var-missing-data
    ds.data_vars['sea_surface_temperature'] | warn | Valid ranges are not recognized by xarray (as of Feb 2025). | var-missing-data
    ds.data_vars['sea_surface_temperature'] | error | Variable 'sea_surface_temperature' missing 'gcmd_keyword_url' attribute. | deepcode/variable-gcmd-keyword-url
    ds | error | Dataset missing required 'description' attribute. | deepcode/dataset-description
    \n", - "
    " - ], - "text/plain": [ - "Result(file_path='', config_object=ConfigObject(name=None, files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins={'__core__': Plugin(meta=PluginMeta(name='__core__', version='0.5.1', ref='xrlint.plugins.core:export_plugin', docs_url='https://bcdev.github.io/xrlint/rule-ref'), rules={'access-latency': Rule(meta=RuleMeta(name='access-latency', version='1.0.0', description='Ensure that the time it takes to open a dataset from its source does a exceed a given `threshold` in seconds. The default threshold is `2.5`.', schema={'type': 'object', 'properties': {'threshold': {'type': 'number', 'default': 2.5, 'exclusiveMinimum': 0, 'title': 'Threshold time in seconds'}}}, ref=None, docs_url=None, type='problem'), op_class=), 'var-units': Rule(meta=RuleMeta(name='var-units', version='1.0.0', description='Every variable should provide a description of its units.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#units', type='suggestion'), op_class=), 'var-flags': Rule(meta=RuleMeta(name='var-flags', version='1.0.0', description=\"Validate attributes 'flag_values', 'flag_masks' and 'flag_meanings' that make variables that contain flag values self describing. \", schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#flags', type='suggestion'), op_class=), 'conventions': Rule(meta=RuleMeta(name='conventions', version='1.0.0', description='Datasets should identify the applicable conventions using the `Conventions` attribute.\\n The rule has an optional configuration parameter `match` which is a regex pattern that the value of the `Conventions` attribute must match, if any. If not provided, the rule just verifies that the attribute exists and whether it is a character string.', schema={'type': 'object', 'properties': {'match': {'type': 'string', 'title': 'Regex pattern'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#identification-of-conventions', type='suggestion'), op_class=), 'var-missing-data': Rule(meta=RuleMeta(name='var-missing-data', version='1.0.0', description='Checks the recommended use of missing data, i.e., coordinate variables should not define missing data, but packed data should. Notifies about the use of valid ranges to indicate missing data, which is currently not supported by xarray.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#units', type='suggestion'), op_class=), 'time-coordinate': Rule(meta=RuleMeta(name='time-coordinate', version='1.0.0', description='Time coordinates should have valid and unambiguous time units encoding.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate', type='problem'), op_class=), 'coords-for-dims': Rule(meta=RuleMeta(name='coords-for-dims', version='1.0.0', description='Dimensions of data variables should have corresponding coordinates.', schema=None, ref=None, docs_url=None, type='problem'), op_class=), 'content-desc': Rule(meta=RuleMeta(name='content-desc', version='1.0.0', description=\"A dataset should provide information about where the data came from and what has been done to it. This information is mainly for the benefit of human readers. The rule accepts the following configuration parameters:\\n\\n- `globals`: list of names of required global attributes. 
Defaults to `['title', 'history']`.\\n- `commons`: list of names of required variable attributes that can also be defined globally. Defaults to `['institution', 'source', 'references', 'comment']`.\\n- `no_vars`: do not check variables at all. Defaults to `False`.\\n- `ignored_vars`: list of ignored variables (regex patterns). Defaults to `['crs', 'spatial_ref']`.\\n\", schema={'type': 'object', 'properties': {'globals': {'type': 'array', 'default': ['title', 'history'], 'items': {'type': 'string'}, 'title': 'Global attribute names'}, 'commons': {'type': 'array', 'default': ['institution', 'source', 'references', 'comment'], 'items': {'type': 'string'}, 'title': 'Common attribute names'}, 'skip_vars': {'type': 'boolean', 'default': False, 'title': 'Do not check variables'}, 'ignored_vars': {'type': 'array', 'default': ['crs', 'spatial_ref'], 'items': {'type': 'string'}, 'title': 'Ignored variables (regex name patterns)'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#description-of-file-contents', type='suggestion'), op_class=), 'grid-mappings': Rule(meta=RuleMeta(name='grid-mappings', version='1.0.0', description='Grid mappings, if any, shall have valid grid mapping coordinate variables.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#grid-mappings-and-projections', type='problem'), op_class=), 'no-empty-chunks': Rule(meta=RuleMeta(name='no-empty-chunks', version='1.0.0', description='Empty chunks should not be encoded and written. The rule currently applies to Zarr format only.', schema=None, ref=None, docs_url='https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html#xarray-dataset-to-zarr', type='suggestion'), op_class=), 'var-desc': Rule(meta=RuleMeta(name='var-desc', version='1.0.0', description=\"Check that each data variable provides an identification and description of the content. The rule can be configured by parameter `attrs` which is a list of names of attributes that provides descriptive information. 
It defaults to `['standard_name', 'long_name']`.\", schema={'type': 'object', 'properties': {'attrs': {'type': 'array', 'default': ['standard_name', 'long_name'], 'items': {'type': 'string'}, 'title': 'Attribute names to check'}}}, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#standard-name', type='suggestion'), op_class=), 'lat-coordinate': Rule(meta=RuleMeta(name='lat-coordinate', version='1.0.0', description='Latitude coordinate should have standard units and standard names.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#latitude-coordinate', type='problem'), op_class=), 'lon-coordinate': Rule(meta=RuleMeta(name='lon-coordinate', version='1.0.0', description='Longitude coordinate should have standard units and standard names.', schema=None, ref=None, docs_url='https://cfconventions.org/cf-conventions/cf-conventions.html#longitude-coordinate', type='problem'), op_class=), 'no-empty-attrs': Rule(meta=RuleMeta(name='no-empty-attrs', version='1.0.0', description='Every dataset element should have metadata that describes it.', schema=None, ref=None, docs_url=None, type='suggestion'), op_class=)}, processors={}, configs={'recommended': [ConfigObject(name='recommended', files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'access-latency': RuleConfig(severity=1, args=(), kwargs={}), 'content-desc': RuleConfig(severity=1, args=(), kwargs={}), 'conventions': RuleConfig(severity=1, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=1, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=0, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'var-desc': RuleConfig(severity=1, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=1, args=(), kwargs={}), 'var-units': RuleConfig(severity=1, args=(), kwargs={})}, settings=None)], 'all': [ConfigObject(name='all', files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'access-latency': RuleConfig(severity=2, args=(), kwargs={}), 'var-units': RuleConfig(severity=2, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'conventions': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=2, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'content-desc': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=2, args=(), kwargs={}), 'var-desc': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=2, args=(), kwargs={})}, settings=None)]}), 'deepcode': Plugin(meta=PluginMeta(name='deepcode', version='1.0.0', ref=None, docs_url=None), rules={'dataset-description': Rule(meta=RuleMeta(name='dataset-description', version='0.0.0', description=\"Ensures the dataset has a 'description' attribute.\", schema=None, ref=None, docs_url=None, 
type='problem'), op_class=), 'variable-gcmd-keyword-url': Rule(meta=RuleMeta(name='variable-gcmd-keyword-url', version='0.0.0', description=\"Ensures all variables have a 'gcmd_keyword_url' attribute.\", schema=None, ref=None, docs_url=None, type='problem'), op_class=)}, processors={}, configs={'recommended': [ConfigObject(name=None, files=None, ignores=None, linter_options=None, opener_options=None, processor=None, plugins=None, rules={'deepcode/variable-gcmd-keyword-url': RuleConfig(severity=2, args=(), kwargs={}), 'deepcode/dataset-description': RuleConfig(severity=2, args=(), kwargs={})}, settings=None)]})}, rules={'access-latency': RuleConfig(severity=1, args=(), kwargs={}), 'content-desc': RuleConfig(severity=0, args=(), kwargs={}), 'conventions': RuleConfig(severity=0, args=(), kwargs={}), 'coords-for-dims': RuleConfig(severity=2, args=(), kwargs={}), 'grid-mappings': RuleConfig(severity=2, args=(), kwargs={}), 'lat-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'lon-coordinate': RuleConfig(severity=2, args=(), kwargs={}), 'no-empty-attrs': RuleConfig(severity=0, args=(), kwargs={}), 'no-empty-chunks': RuleConfig(severity=0, args=(), kwargs={}), 'time-coordinate': RuleConfig(severity=0, args=(), kwargs={}), 'var-desc': RuleConfig(severity=1, args=(), kwargs={}), 'var-flags': RuleConfig(severity=2, args=(), kwargs={}), 'var-missing-data': RuleConfig(severity=1, args=(), kwargs={}), 'var-units': RuleConfig(severity=1, args=(), kwargs={}), 'deepcode/variable-gcmd-keyword-url': RuleConfig(severity=2, args=(), kwargs={}), 'deepcode/dataset-description': RuleConfig(severity=2, args=(), kwargs={})}, settings=None), messages=[Message(message=\"Unexpected encoding '_FillValue', coordinates must not have missing data.\", node_path=\"ds.coords['latitude']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message=\"Unexpected encoding '_FillValue', coordinates must not have missing data.\", node_path=\"ds.coords['longitude']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message='Valid ranges are not recognized by xarray (as of Feb 2025).', node_path=\"ds.data_vars['sea_surface_temperature']\", rule_id='var-missing-data', severity=1, fatal=None, fix=None, suggestions=None), Message(message=\"Variable 'sea_surface_temperature' missing 'gcmd_keyword_url' attribute.\", node_path=\"ds.data_vars['sea_surface_temperature']\", rule_id='deepcode/variable-gcmd-keyword-url', severity=2, fatal=None, fix=None, suggestions=None), Message(message=\"Dataset missing required 'description' attribute.\", node_path='ds', rule_id='deepcode/dataset-description', severity=2, fatal=None, fix=None, suggestions=[Suggestion(desc=\"Add a 'description' attribute to dataset.attrs.\", data=None, fix=None)])])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "linter = LintDataset(dataset=ds)\n", "linter.lint_dataset()" @@ -792,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "f7b34a8b-447b-4be5-b0f2-0c70cc9352e3", "metadata": { "tags": [] @@ -912,19 +306,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "0adca69e-31a5-4ff6-8466-1a129ee85779", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current Working Directory: /home/tejas/bc/projects/deepesdl/deep-code/examples/notebooks\n", - "Current Working Directory: 
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "notebook_path = os.path.abspath(\"publish_to_EarthCODE.ipynb\")\n",
     "notebook_dir = os.path.dirname(notebook_path)\n",
@@ -947,92 +332,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "c7e3a5a8-d74d-4bd4-9b76-45ace4b2d341",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    [… cached stderr/stdout publishing log removed: forking and cloning
-       open-science-catalog-metadata-staging, generating the STAC collection for
-       'cmems_sst_v2.zarr', creating branch add-new-collection-cmems-sst-20250924111300,
-       committing and pushing the product, variable, workflow and experiment files,
-       and opening pull request #162; the log also records
-       "INFO:deep_code.tools.publish:Pull request created: None" …]
-   ],
+   "outputs": [],
    "source": [
     "# publish dataset and workflow using the python function\n",
     "publisher = Publisher(\n",

From bd6bcd64595f5338b6b024a6f21acf639b7f8552 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 30 Sep 2025 15:24:25 +0200
Subject: [PATCH 19/19] modified to pass flake8/ruff

---
 deep_code/tools/publish.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 9207999..fe3f9eb 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -333,15 +333,15 @@ def _normalize_name(name: str | None) -> str | None:
     def _update_base_catalog(
         self, catalog_path: str, item_id: str, self_href: str
     ) -> Catalog:
-        """Update a base catalog by adding a link to a new item.
+        """Update a base catalog by adding a unique item link and a single self link.
 
         Args:
-            catalog_path: Path to the base catalog JSON file.
-            item_id: ID of the new item (experiment or workflow).
-            self_href: Self-href for the base catalog.
+            catalog_path: Path to the base catalog JSON file, relative to the repo root.
+            item_id: ID (directory name) of the new item (workflow/experiment).
+            self_href: Absolute self-href for the base catalog.
 
         Returns:
-            Updated Catalog object.
+            The updated PySTAC Catalog object (in-memory).
""" base_catalog = Catalog.from_file( Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path @@ -349,18 +349,18 @@ def _update_base_catalog( item_href = f"./{item_id}/record.json" - # Ensure the "item" link is unique - def link_href(link: Link) -> str | None: - # PySTAC Link may store href in .target or .href; .get_href() resolves if base HREF set + def resolve_href(link: Link) -> str | None: + # PySTAC keeps raw targets; get_href() may resolve relative paths if a base HREF is set return ( getattr(link, "href", None) or getattr(link, "target", None) or (link.get_href() if hasattr(link, "get_href") else None) ) + # 1) Add the "item" link only if it's not already present has_item = any( - (l.rel == "item") and (link_href(l) == item_href) - for l in base_catalog.links + (link.rel == "item") and (resolve_href(link) == item_href) + for link in base_catalog.links ) if not has_item: base_catalog.add_link( @@ -372,17 +372,17 @@ def link_href(link: Link) -> str | None: ) ) - # Ensure there is exactly one "self" link - base_catalog.links = [l for l in base_catalog.links if l.rel != "self"] + # 2) Ensure there is exactly one "self" link + base_catalog.links = [link for link in base_catalog.links if link.rel != "self"] base_catalog.set_self_href(self_href) - # deduplicate by (rel, href) - seen = set() - unique_links = [] - for l in base_catalog.links: - key = (l.rel, link_href(l)) + # 3) Defense-in-depth: deduplicate by (rel, href) + seen: set[tuple[str, str | None]] = set() + unique_links: list[Link] = [] + for link in base_catalog.links: + key = (link.rel, resolve_href(link)) if key not in seen: - unique_links.append(l) + unique_links.append(link) seen.add(key) base_catalog.links = unique_links