diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 2bceb9c1b..5d48df387 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -41,10 +41,22 @@ jobs: - '3.13' mode: - normal + vendored_dandiapi: + # Allow vendor information for the dandi-api instance to default to + # the default values specified in + # dandi/tests/data/dandiarchive-docker/docker-compose.yml + - default include: - os: ubuntu-latest python: '3.10' mode: dandi-api + - os: ubuntu-latest + python: '3.10' + mode: dandi-api + vendored_dandiapi: ember-dandi + instance_name: EMBER-DANDI + instance_identifier: 'RRID:SCR_026700' + doi_prefix: '10.82754' - os: ubuntu-latest python: 3.13 mode: obolibrary-only @@ -76,6 +88,21 @@ jobs: python -m pip install --upgrade pip wheel pip install ".[extras,test]" + # Set only if matrix.instance_name is defined + - name: Set DANDI_TESTS_INSTANCE_NAME + if: ${{ matrix.instance_name }} + run: echo "DANDI_TESTS_INSTANCE_NAME=${{ matrix.instance_name }}" >> "$GITHUB_ENV" + + # Set only if matrix.instance_identifier is defined + - name: Set DANDI_TESTS_INSTANCE_IDENTIFIER + if: ${{ matrix.instance_identifier }} + run: echo "DANDI_TESTS_INSTANCE_IDENTIFIER=${{ matrix.instance_identifier }}" >> "$GITHUB_ENV" + + # Set only if matrix.doi_prefix is defined + - name: Set DANDI_TESTS_DOI_PREFIX + if: ${{ matrix.doi_prefix }} + run: echo "DANDI_TESTS_DOI_PREFIX=${{ matrix.doi_prefix }}" >> "$GITHUB_ENV" + - name: Install dev versions of select dependencies if: matrix.mode == 'dev-deps' run: | diff --git a/dandi/cli/cmd_service_scripts.py b/dandi/cli/cmd_service_scripts.py index a989b1307..8e55645c0 100644 --- a/dandi/cli/cmd_service_scripts.py +++ b/dandi/cli/cmd_service_scripts.py @@ -18,6 +18,8 @@ from requests.auth import HTTPBasicAuth from requests.exceptions import HTTPError +from dandi.consts import known_instances + from .base import ChoiceList, instance_option, map_to_click_exceptions from .. 
import __version__, lgr from ..dandiapi import DandiAPIClient, RemoteBlobAsset, RESTFullAPIClient @@ -246,8 +248,15 @@ def update_dandiset_from_doi( Update the metadata for the draft version of a Dandiset with information from a given DOI record. """ - if dandiset.startswith("DANDI:"): - dandiset = dandiset[6:] + known_instance_names = [k.upper() for k in known_instances.keys()] + + # Strip instance name prefix from dandiset ID, if present + for name in known_instance_names: + dandiset_id_prefix = f"{name}:" + if dandiset.startswith(dandiset_id_prefix): + dandiset = dandiset[len(dandiset_id_prefix) :] + break + start_time = datetime.now().astimezone() with DandiAPIClient.for_dandi_instance(dandi_instance, authenticate=True) as client: with RESTFullAPIClient( diff --git a/dandi/cli/tests/test_service_scripts.py b/dandi/cli/tests/test_service_scripts.py index 5cb2e9c38..9c9ced5be 100644 --- a/dandi/cli/tests/test_service_scripts.py +++ b/dandi/cli/tests/test_service_scripts.py @@ -10,6 +10,7 @@ import anys from click.testing import CliRunner from dandischema.consts import DANDI_SCHEMA_VERSION +from dandischema.models import ID_PATTERN import pytest from dandi import __version__ @@ -105,13 +106,13 @@ def test_update_dandiset_from_doi( metadata = new_dandiset.dandiset.get_raw_metadata() with (DATA_DIR / "update_dandiset_from_doi" / f"{name}.json").open() as fp: expected = json.load(fp) - expected["id"] = f"DANDI:{dandiset_id}/draft" + expected["id"] = anys.AnyFullmatch(rf"{ID_PATTERN}:{dandiset_id}/draft") expected["url"] = f"{repository}/dandiset/{dandiset_id}/draft" expected["@context"] = ( "https://raw.githubusercontent.com/dandi/schema/master/releases" f"/{DANDI_SCHEMA_VERSION}/context.json" ) - expected["identifier"] = f"DANDI:{dandiset_id}" + expected["identifier"] = anys.AnyFullmatch(rf"{ID_PATTERN}:{dandiset_id}") expected["repository"] = repository expected["dateCreated"] = anys.ANY_AWARE_DATETIME_STR expected["schemaVersion"] = DANDI_SCHEMA_VERSION diff --git 
a/dandi/dandiarchive.py b/dandi/dandiarchive.py index 1cc7ac5d7..2a9472463 100644 --- a/dandi/dandiarchive.py +++ b/dandi/dandiarchive.py @@ -743,7 +743,8 @@ def parse( continue groups = match.groupdict() if "instance_name" in groups: - # map to lower case so we could immediately map DANDI: into "dandi" instance + # map to lower case so we could immediately map an instance name to + # an instance in `dandi.consts.known_instances` groups["instance_name"] = groups["instance_name"].lower() lgr.log(5, "Matched %r into %s", url, groups) rewrite = settings.get("rewrite", False) diff --git a/dandi/dandiset.py b/dandi/dandiset.py index 7d5e7f935..02a5fbf72 100644 --- a/dandi/dandiset.py +++ b/dandi/dandiset.py @@ -127,17 +127,7 @@ def _get_identifier(metadata: dict) -> str | None: id_ = metadata["identifier"] lgr.debug("Found identifier %s in top level 'identifier'", str(id_)) - if isinstance(id_, dict): - # New formalized model, but see below DANDI: way - # TODO: add schemaVersion handling but only after we have them provided - # in all metadata records from dandi-api server - if id_.get("propertyID") != "DANDI": - raise ValueError( - f"Got following identifier record when was expecting a record " - f"with 'propertyID: DANDI': {id_}" - ) - id_ = str(id_.get("value", "")) - elif id_ is not None: + if id_ is not None: assert isinstance(id_, str) # result of https://github.com/dandi/dandi-cli/pull/348 which ??? # TODO: RF to avoid this evil!!! 
diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 07f83b57e..5f982d43b 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -16,6 +16,7 @@ import time from unittest import mock +from dandischema.models import ID_PATTERN import numpy as np import pytest from pytest_mock import MockerFixture @@ -258,7 +259,7 @@ def test_download_dandiset_yaml(text_dandiset: SampleDandiset, tmp_path: Path) - assert list_paths(tmp_path, dirs=True) == [tmp_path / dandiset_metadata_file] with (tmp_path / dandiset_metadata_file).open(encoding="utf-8") as fp: metadata = yaml_load(fp) - assert metadata["id"] == f"DANDI:{dandiset_id}/draft" + assert re.match(rf"^{ID_PATTERN}:{dandiset_id}/draft$", metadata["id"]) def test_download_asset_id(text_dandiset: SampleDandiset, tmp_path: Path) -> None: diff --git a/dandi/tests/test_utils.py b/dandi/tests/test_utils.py index 9f7eb8dfc..3a15043c5 100644 --- a/dandi/tests/test_utils.py +++ b/dandi/tests/test_utils.py @@ -29,6 +29,7 @@ get_utcnow_datetime, is_page2_url, is_same_time, + is_url, on_windows, post_upload_size_check, under_paths, @@ -589,3 +590,67 @@ def test_post_upload_size_check_erroring( logging.ERROR, f"Size of {p} was 42 at start of upload but is now 19 after upload", ) in caplog.record_tuples + + +class TestIsUrl: + @pytest.mark.parametrize( + "s", + [ + # Standard valid HTTP/FTP URLs + "http://example.com", + "https://example.com", + "https://example.com/path", + "https://example.com/path?query=1#frag", + "https://example.com:8443/path", + "http://127.0.0.1:8000", + "ftp://example.com/path/file.txt", + "ftp://user:pass@example.com/dir", + # These pass pydantic validation but are not very useful URLs + "http:/example.com", + # Typical DANDI Archive dandiset URLs (also valid HTTP URLs) + "https://dandiarchive.org/dandiset/000027", + "https://dandiarchive.org/dandiset/000027/draft", + "https://dandiarchive.org/dandiset/000027/0.210428.2206", + # DANDI identifiers and ids + 
"DANDI:123456", + "DANDI:123456/draft", + "DANDI:123456/1.123456.1234", + "DANDI-SANDBOX:123456", + "DANDI-SANDBOX:123456/draft", + "DANDI-SANDBOX:123456/1.123456.1234", + # Customized DANDI URLs + "dandi://dandi/123456", + "dandi://dandi/123456/draft", + "dandi://dandi/123456/1.123456.1234", + "dandi://dandi-sandbox/123456", + "dandi://dandi-sandbox/123456/draft", + "dandi://dandi-sandbox/123456/1.123456.1234", + ], + ) + def test_valid_urls(self, s: str) -> None: + assert is_url(s) is True + + @pytest.mark.parametrize( + "s", + [ + # Clearly invalid URLs + "not a url", + "example", + "example .com", + "://example.com", + "", + " ", + # DANDI-like string that should not be treated as a valid DANDI URL + "dandi://not-a-real-dandiset", + # Invalid DANDI identifiers and ids because of unknown instance name + "FAKEDANDI:123456", + "FAKEDANDI:123456/draft", + "FAKEDANDI:123456/1.123456.1234", + # Customized DANDI URLs + "dandi://fakedandi/123456", + "dandi://fakedandi/123456/draft", + "dandi://fakedandi/123456/1.123456.1234", + ], + ) + def test_invalid_urls(self, s: str) -> None: + assert is_url(s) is False diff --git a/dandi/utils.py b/dandi/utils.py index 9d77f33cc..cd6ea7afd 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -28,7 +28,7 @@ import dateutil.parser from multidict import MultiDict # dependency of yarl -from pydantic import BaseModel, Field +from pydantic import AnyHttpUrl, BaseModel, Field, FtpUrl, TypeAdapter, ValidationError import requests import ruamel.yaml from semantic_version import Version @@ -648,13 +648,36 @@ def _get_instance( ) -def is_url(s: str) -> bool: - """Very primitive url detection for now +# This is defined in module level because repeated invocations of +# TypeAdapter creation is expensive +_url_adapter: TypeAdapter[AnyHttpUrl | FtpUrl] = TypeAdapter(AnyHttpUrl | FtpUrl) + - TODO: redo +def is_url(s: str) -> bool: """ - return s.lower().startswith(("http://", "https://", "dandi:", "ftp://")) - # Slashes are not required after 
"dandi:" so as to support "DANDI:" + Determines whether the input string `s` is a valid URL (standard URL or DANDI URL). + """ + + # Importing from within function to avoid possible circular imports + # since this is a utility module + from dandi.dandiarchive import parse_dandi_url + from dandi.exceptions import UnknownURLError + + try: + _url_adapter.validate_python(s) + except ValidationError: + # `s` is not a standard URL, try parsing it as DANDI URL + try: + parse_dandi_url(s) + except UnknownURLError: + # `s` is neither a standard URL nor a DANDI URL, returning `False` + return False + + # `s` is a DANDI URL, returning `True` + return True + + # `s` is a standard URL + return True def get_module_version(module: str | types.ModuleType) -> str | None: