diff --git a/dandi/cli/base.py b/dandi/cli/base.py
index 9d8bc6ace..298bfbbb2 100644
--- a/dandi/cli/base.py
+++ b/dandi/cli/base.py
@@ -1,5 +1,7 @@
 from functools import wraps
 import os
+import re
+from typing import Optional
 
 import click
 
@@ -147,3 +149,61 @@ def wrapper(obj, *args, **kwargs):
 map_to_click_exceptions._do_map = not bool(  # type: ignore[attr-defined]
     os.environ.get("DANDI_DEVEL", None)
 )
+
+
+def _compile_regex(regex: str) -> re.Pattern:
+    """
+    Helper to compile a regex pattern expressed as a `str` into a `re.Pattern`
+
+    Parameters
+    ----------
+    regex : str
+        The regex pattern expressed as a string.
+
+    Returns
+    -------
+    re.Pattern
+        The compiled regex pattern.
+    """
+    try:
+        compiled_regex = re.compile(regex)
+    except re.error as e:
+        raise click.BadParameter(f"Invalid regex pattern {regex!r}: {e}") from e
+
+    return compiled_regex
+
+
+def parse_regexes(
+    _ctx: click.Context, _param: click.Parameter, value: Optional[str]
+) -> Optional[set[re.Pattern]]:
+    """
+    Callback to parse a string of comma-separated regex patterns
+
+    Parameters
+    ----------
+    _ctx : click.Context
+        The Click context (not used).
+
+    _param : click.Parameter
+        The Click parameter (not used).
+
+    value : str | None
+        The input string containing comma-separated regex patterns. It is assumed
+        that none of the patterns contain commas themselves.
+
+    Returns
+    -------
+    set[re.Pattern]
+        A set of compiled regex patterns, or `None` if no value was provided.
+
+    Notes
+    -----
+    This callback is only suitable for parsing patterns that do not contain commas.
+    """
+    if value is None:
+        # Handle the case where no value is provided
+        return None
+
+    regexes = set(value.split(","))
+
+    return {_compile_regex(regex) for regex in regexes}
diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py
index b0b5012f5..dbebe96c9 100644
--- a/dandi/cli/cmd_validate.py
+++ b/dandi/cli/cmd_validate.py
@@ -3,14 +3,20 @@
 from collections.abc import Iterable
 import logging
 import os
+from pathlib import Path
 import re
-from typing import cast
+from typing import Optional, cast
 import warnings
 
 import click
 
-from .base import devel_debug_option, devel_option, map_to_click_exceptions
-from ..utils import pluralize
+from .base import (
+    devel_debug_option,
+    devel_option,
+    map_to_click_exceptions,
+    parse_regexes,
+)
+from ..utils import filter_by_id_patterns, filter_by_paths, pluralize
 from ..validate import validate as validate_
 from ..validate_types import Severity, ValidationResult
 
@@ -80,6 +86,26 @@ def validate_bids(
     default="none",
 )
 @click.option("--ignore", metavar="REGEX", help="Regex matching error IDs to ignore")
+@click.option(
+    "--match",
+    metavar="REGEX,REGEX,...",
+    help=(
+        "Comma-separated regex patterns used to filter issues in validation results "
+        "by their ID. Only issues with an ID matching at least one of the given "
+        "patterns are included in the eventual result. "
+        "(No pattern should contain a comma.)"
+    ),
+    callback=parse_regexes,
+)
+@click.option(
+    "--include-path",
+    multiple=True,
+    type=click.Path(exists=True, resolve_path=True, path_type=Path),
+    help=(
+        "Filter issues in the validation results to only those associated with the "
+        "given path(s). This option can be specified multiple times."
+    ),
+)
 @click.option(
     "--min-severity",
     help="Only display issues with severities above this level.",
@@ -92,6 +118,8 @@ def validate(
     paths: tuple[str, ...],
     ignore: str | None,
+    match: Optional[set[re.Pattern]],
+    include_path: tuple[Path, ...],
     grouping: str,
     min_severity: str,
     schema: str | None = None,
@@ -134,17 +162,28 @@ def validate(
         if i.severity is not None and i.severity.value >= min_severity_value
     ]
 
-    _process_issues(filtered_results, grouping, ignore)
+    _process_issues(filtered_results, grouping, ignore, match, include_path)
 
 
 def _process_issues(
     validator_result: Iterable[ValidationResult],
     grouping: str,
     ignore: str | None = None,
+    match: Optional[set[re.Pattern]] = None,
+    include_path: tuple[Path, ...] = (),
 ) -> None:
     issues = [i for i in validator_result if i.severity is not None]
     if ignore is not None:
         issues = [i for i in issues if not re.search(ignore, i.id)]
+
+    # Filter issues by ID patterns if provided
+    if match is not None:
+        issues = filter_by_id_patterns(issues, match)
+
+    # Filter issues by included paths if provided
+    if include_path:
+        issues = filter_by_paths(issues, include_path)
+
     purviews = [i.purview for i in issues]
     if grouping == "none":
         display_errors(
diff --git a/dandi/cli/tests/test_base.py b/dandi/cli/tests/test_base.py
new file mode 100644
index 000000000..9fa4971eb
--- /dev/null
+++ b/dandi/cli/tests/test_base.py
@@ -0,0 +1,71 @@
+import re
+
+import click
+import pytest
+
+from dandi.cli.base import _compile_regex, parse_regexes
+
+DUMMY_CTX = click.Context(click.Command("dummy"))
+DUMMY_PARAM = click.Option(["--dummy"])
+
+
+class TestCompileRegex:
+    @pytest.mark.parametrize(
+        "pattern",
+        [
+            "abc",
+            "[a-z]+",
+            "^start$",
+            r"a\.b",
+        ],
+    )
+    def test_valid_patterns_return_pattern(self, pattern):
+        compiled = _compile_regex(pattern)
+        assert isinstance(compiled, re.Pattern)
+        assert compiled.pattern == pattern
+
+    @pytest.mark.parametrize("pattern", ["(", "[a-z", "\\"])
+    def test_invalid_patterns_raise_bad_parameter(self, pattern):
+        with pytest.raises(click.BadParameter) as exc_info:
+            _compile_regex(pattern)
+        msg = str(exc_info.value)
+        assert "Invalid regex pattern" in msg
+        assert repr(pattern) in msg
+
+
+class TestParseRegexes:
+    def test_none_returns_none(self):
+        assert parse_regexes(DUMMY_CTX, DUMMY_PARAM, None) is None
+
+    @pytest.mark.parametrize(
+        "value, expected_patterns_in_strs",
+        [
+            # Single patterns
+            ("abc", {"abc"}),
+            ("[a-z]+", {"[a-z]+"}),
+            (r"a\.b", {r"a\.b"}),
+            (r"", {r""}),
+            # Multiple patterns
+            ("foo,,bar", {"foo", "", "bar"}),
+            ("^start$,end$", {"^start$", "end$"}),
+            (r"a\.b,c+d", {r"a\.b", r"c+d"}),
+            # Duplicates should be collapsed by the internal set()
+            ("foo,foo,bar", {"foo", "bar"}),
+        ],
+    )
+    def test_parse_patterns(
+        self, value: str, expected_patterns_in_strs: set[str]
+    ) -> None:
+        result = parse_regexes(DUMMY_CTX, DUMMY_PARAM, value)
+        assert isinstance(result, set)
+
+        assert {p.pattern for p in result} == expected_patterns_in_strs
+
+    @pytest.mark.parametrize(
+        "value, bad_pattern", [("(", "("), ("foo,(", "("), ("good,[a-z", "[a-z")]
+    )
+    def test_invalid_pattern_raises_bad_parameter(
+        self, value: str, bad_pattern: str
+    ) -> None:
+        with pytest.raises(click.BadParameter, match=re.escape(bad_pattern)):
+            parse_regexes(DUMMY_CTX, DUMMY_PARAM, value)
diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py
index 6694ef3ec..a380c37f7 100644
--- a/dandi/cli/tests/test_cmd_validate.py
+++ b/dandi/cli/tests/test_cmd_validate.py
@@ -2,6 +2,7 @@
 from click.testing import CliRunner
 import pytest
+import pytest_mock
 
 from ..cmd_validate import _process_issues, validate
 from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess
@@ -149,3 +150,240 @@ def test_validate_bids_error_grouping_notification(
     # Does it notify the user correctly?
     notification_substring = "Invalid value for '--grouping'"
     assert notification_substring in r.output
+
+
+def _mock_validate(*paths, **kwargs):
+    """Mock validation function that returns controlled ValidationResult objects."""
+    origin = Origin(
+        type=OriginType.VALIDATION,
+        validator=Validator.dandi,
+        validator_version="test",
+    )
+
+    # Return a list of validation results with different IDs
+    results = [
+        ValidationResult(
+            id="BIDS.DATATYPE_MISMATCH",
+            origin=origin,
+            severity=Severity.ERROR,
+            scope=Scope.FILE,
+            message="Datatype mismatch error",
+            path=Path(paths[0]) / "file1.nii",
+        ),
+        ValidationResult(
+            id="BIDS.EXTENSION_MISMATCH",
+            origin=origin,
+            severity=Severity.ERROR,
+            scope=Scope.FILE,
+            message="Extension mismatch error",
+            path=Path(paths[0]) / "file2.jpg",
+        ),
+        ValidationResult(
+            id="DANDI.NO_DANDISET_FOUND",
+            origin=origin,
+            severity=Severity.ERROR,
+            scope=Scope.DANDISET,
+            message="No dandiset found",
+            path=Path(paths[0]),
+        ),
+        ValidationResult(
+            id="NWBI.check_data_orientation",
+            origin=origin,
+            severity=Severity.WARNING,
+            scope=Scope.FILE,
+            message="Data orientation warning",
+            path=Path(paths[0]) / "file3.nwb",
+        ),
+    ]
+    return iter(results)
+
+
+class TestValidateMatchOption:
+    """Test the --match option for filtering validation results."""
+
+    @pytest.mark.parametrize(
+        "match_patterns,parsed_patterns,should_contain,should_not_contain",
+        [
+            # Single pattern matching specific validation ID
+            (
+                r"BIDS\.DATATYPE_MISMATCH",
+                {r"BIDS\.DATATYPE_MISMATCH"},
+                ["BIDS.DATATYPE_MISMATCH"],
+                [
+                    "BIDS.EXTENSION_MISMATCH",
+                    "DANDI.NO_DANDISET_FOUND",
+                    "NWBI.check_data_orientation",
+                ],
+            ),
+            # Single pattern matching by prefix
+            (
+                r"^BIDS\.",
+                {r"^BIDS\."},
+                ["BIDS.DATATYPE_MISMATCH", "BIDS.EXTENSION_MISMATCH"],
+                ["DANDI.NO_DANDISET_FOUND", "NWBI.check_data_orientation"],
+            ),
+            # Single pattern that matches nothing (should show "No errors found")
+            (
+                r"NONEXISTENT_ID",
+                {r"NONEXISTENT_ID"},
+                ["No errors found"],
+                ["BIDS", "DANDI", "NWBI"],
+            ),
+            # Multiple patterns separated by comma
+            (
+                r"BIDS\.DATATYPE_MISMATCH,BIDS\.EXTENSION_MISMATCH",
+                {r"BIDS\.DATATYPE_MISMATCH", r"BIDS\.EXTENSION_MISMATCH"},
+                ["BIDS.DATATYPE_MISMATCH", "BIDS.EXTENSION_MISMATCH"],
+                ["DANDI.NO_DANDISET_FOUND", "NWBI.check_data_orientation"],
+            ),
+            # Multiple patterns with wildcard
+            (
+                r"BIDS\.\S+,DANDI\.\S+",
+                {r"BIDS\.\S+", r"DANDI\.\S+"},
+                [
+                    "BIDS.DATATYPE_MISMATCH",
+                    "BIDS.EXTENSION_MISMATCH",
+                    "DANDI.NO_DANDISET_FOUND",
+                ],
+                ["NWBI"],
+            ),
+        ],
+    )
+    def test_match_patterns(
+        self,
+        tmp_path: Path,
+        match_patterns: str,
+        parsed_patterns: set[str],
+        should_contain: list[str],
+        should_not_contain: list[str],
+        monkeypatch: pytest.MonkeyPatch,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """Test --match option with single or multiple comma-separated patterns."""
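+        # Import the module object so that mocker.spy/monkeypatch below can
+        # patch its attributes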
+        from .. import cmd_validate
+
+        # Used to monitor which compiled patterns are passed by the CLI
+        process_issues_spy = mocker.spy(cmd_validate, "_process_issues")
+
+        monkeypatch.setattr(cmd_validate, "validate_", _mock_validate)
+
+        r = CliRunner().invoke(validate, [f"--match={match_patterns}", str(tmp_path)])
+
+        process_issues_spy.assert_called_once()
+        call_args = process_issues_spy.call_args
+
+        # Ensure the patterns are parsed and passed correctly
+        passed_patterns = call_args.kwargs.get(
+            "match", call_args.args[3] if len(call_args.args) > 3 else None
+        )
+        assert (
+            passed_patterns is not None
+        ), "No match patterns were passed to _process_issues"
+        assert {p.pattern for p in passed_patterns} == parsed_patterns
+
+        for text in should_contain:
+            assert text in r.output, f"Expected '{text}' in output but not found"
+
+        for text in should_not_contain:
+            assert text not in r.output, f"Expected '{text}' NOT in output but found"
+
+    def test_match_invalid_regex(self, tmp_path: Path) -> None:
+        """Test --match option with invalid regex pattern."""
+        # Invalid regex pattern with unmatched parenthesis
+        r = CliRunner(mix_stderr=False).invoke(
+            validate, [r"--match=(INVALID", str(tmp_path)], catch_exceptions=False
+        )
+
+        # Should fail with an error about invalid regex
+        assert r.exit_code != 0
+        assert "error" in r.stderr.lower() and "--match" in r.stderr
+
+    def test_match_with_ignore_combination(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """Test --match and --ignore options used together."""
+        from .. import cmd_validate
+
+        monkeypatch.setattr(cmd_validate, "validate_", _mock_validate)
+
+        # Use --match and --ignore together; --ignore should filter out the
+        # ID that --match kept
+        r = CliRunner().invoke(
+            validate,
+            [
+                r"--match=BIDS\.DATATYPE_MISMATCH",
+                r"--ignore=DATATYPE_MISMATCH",
+                str(tmp_path),
+            ],
+        )
+
+        assert "BIDS.DATATYPE_MISMATCH" not in r.output
+        assert "No errors found" in r.output
+
+
+class TestValidateIncludePathOption:
+    """Test the --include-path option for filtering validation results."""
+
+    def test_nonexistent_include_path(
+        self,
+        tmp_path: Path,
+    ) -> None:
+        """Test --include-path option with a non-existent path."""
+
+        non_existent_path = tmp_path / "nonexistent.nwb"
+
+        r = CliRunner().invoke(
+            validate,
+            [f"--include-path={non_existent_path}", str(tmp_path)],
+        )
+
+        # Should exit with an error about `--include-path`
+        assert r.exit_code != 0
+        assert "--include-path" in r.output
+
+    @pytest.mark.parametrize(
+        "include_paths",
+        [
+            ["path1.nwb"],
+            ["path1.nwb", "path2.nwb"],
+        ],
+    )
+    def test_validated_option_args(
+        self,
+        include_paths: list[str],
+        tmp_path: Path,
+        monkeypatch: pytest.MonkeyPatch,
+        mocker: pytest_mock.MockerFixture,
+    ) -> None:
+        """
+        Test that the validated arguments for --include-path are correct
+        """
+        from .. import cmd_validate
+
+        # Used to monitor which validated paths are passed by the CLI
+        process_issues_spy = mocker.spy(cmd_validate, "_process_issues")
+
+        # We actually don't care about validation results here.
+        # We are only mocking to avoid running actual validation logic.
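+        # Note: --include-path is declared with click.Path(exists=True), which
+        # is why the paths are touch()ed into existence below.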
+        monkeypatch.setattr(cmd_validate, "validate_", _mock_validate)
+
+        paths = [tmp_path / p for p in include_paths]
+        for p in paths:
+            p.touch()
+
+        cli_args = [f"--include-path={p}" for p in paths] + [str(tmp_path)]
+        CliRunner().invoke(
+            validate,
+            cli_args,
+        )
+
+        process_issues_spy.assert_called_once()
+        call_args = process_issues_spy.call_args
+
+        # Ensure the paths are parsed and passed correctly
+        passed_paths = call_args.kwargs.get(
+            "include_path",
+            call_args.args[4] if len(call_args.args) > 4 else None,
+        )
+        assert len(passed_paths) == len(include_paths)
+        assert all(isinstance(p, Path) for p in passed_paths)
+        assert all(p.is_absolute() for p in passed_paths)
diff --git a/dandi/tests/test_utils.py b/dandi/tests/test_utils.py
index 9f7eb8dfc..66ca5304b 100644
--- a/dandi/tests/test_utils.py
+++ b/dandi/tests/test_utils.py
@@ -5,6 +5,7 @@
 import logging
 import os.path as op
 from pathlib import Path
+import re
 import time
 
 import pytest
@@ -20,6 +21,8 @@
     _get_instance,
     ensure_datetime,
     ensure_strtime,
+    filter_by_id_patterns,
+    filter_by_paths,
     find_files,
     flatten,
     flattened,
@@ -33,6 +36,14 @@
     post_upload_size_check,
     under_paths,
 )
+from ..validate_types import (
+    Origin,
+    OriginType,
+    Scope,
+    Severity,
+    ValidationResult,
+    Validator,
+)
 
 
 def test_find_files() -> None:
@@ -589,3 +600,376 @@ def test_post_upload_size_check_erroring(
         logging.ERROR,
         f"Size of {p} was 42 at start of upload but is now 19 after upload",
     ) in caplog.record_tuples
+
+
+class TestFilterByIdPatterns:
+    """Test the filter_by_id_patterns function."""
+
+    @pytest.fixture
+    def sample_validation_results(self) -> list[ValidationResult]:
+        """Create sample validation results for testing."""
+        origin_validation_nwbinspector = Origin(
+            type=OriginType.VALIDATION,
+            validator=Validator.nwbinspector,
+            validator_version="",
+        )
+        origin_validation_dandi = Origin(
+            type=OriginType.VALIDATION,
+            validator=Validator.dandi,
+            validator_version="",
+        )
+
+        return [
+            ValidationResult(
+                id="NWBI.check_data_orientation",
+                origin=origin_validation_nwbinspector,
+                scope=Scope.FILE,
+                message="Data may be in the wrong orientation.",
+                path=Path("sub-mouse001/sub-mouse001.nwb"),
+                severity=Severity.WARNING,
+            ),
+            ValidationResult(
+                id="NWBI.check_missing_unit",
+                origin=origin_validation_nwbinspector,
+                scope=Scope.FILE,
+                message="Missing text for attribute 'unit'.",
+                path=Path("sub-mouse002/sub-mouse002.nwb"),
+                severity=Severity.WARNING,
+            ),
+            ValidationResult(
+                id="DANDI.NO_DANDISET_FOUND",
+                origin=origin_validation_dandi,
+                scope=Scope.FILE,
+                message="No dandiset.yaml found.",
+                path=Path("sub-mouse003/sub-mouse003.nwb"),
+                severity=Severity.ERROR,
+            ),
+            ValidationResult(
+                id="DANDI.METADATA_MISSING",
+                origin=origin_validation_dandi,
+                scope=Scope.FILE,
+                message="Required metadata is missing.",
+                path=Path("sub-mouse004/sub-mouse004.nwb"),
+                severity=Severity.ERROR,
+            ),
+        ]
+
+    @pytest.mark.parametrize(
+        "pattern_strs,expected_ids",
+        [
+            # Single pattern matching one result
+            ([r"NWBI\.check_data_orientation"], ["NWBI.check_data_orientation"]),
+            ([r".NO_DANDISET_FOUND"], ["DANDI.NO_DANDISET_FOUND"]),
+            # Single pattern matching multiple results with prefix
+            (
+                [r"NWBI\.\S+"],
+                ["NWBI.check_data_orientation", "NWBI.check_missing_unit"],
+            ),
+            # Single pattern matching multiple results with different prefix
+            ([r"DANDI"], ["DANDI.NO_DANDISET_FOUND", "DANDI.METADATA_MISSING"]),
+            # Multiple patterns matching different results
+            (
+                [r"NWBI\.check_data_orientation", r"DANDI\.NO_DANDISET_FOUND"],
+ ["NWBI.check_data_orientation", "DANDI.NO_DANDISET_FOUND"], + ), + # Multiple patterns matching different subsets + ( + [r"NWBI\.\S+", r"DANDI\.\S+"], + [ + "NWBI.check_data_orientation", + "NWBI.check_missing_unit", + "DANDI.NO_DANDISET_FOUND", + "DANDI.METADATA_MISSING", + ], + ), + # Multiple patterns with some overlap + ( + [r".*check.*", r"NWBI\..*"], + [ + "NWBI.check_data_orientation", + "NWBI.check_missing_unit", + ], + ), + # Multiple patterns with complete overlap (should not duplicate) + ( + [r"NWBI\.check_data_orientation", r"data_orientation"], + [ + "NWBI.check_data_orientation", + ], + ), + # Multiple patterns where one is subset of other + ( + [r"\S+", r"NWBI\.\S+"], + [ + "NWBI.check_data_orientation", + "NWBI.check_missing_unit", + "DANDI.NO_DANDISET_FOUND", + "DANDI.METADATA_MISSING", + ], + ), + # Pattern that matches nothing + ([r"NONEXISTENT_PATTERN"], []), + # Empty patterns list + ([], []), + ], + ) + def test_pattern_filtering( + self, + sample_validation_results: list[ValidationResult], + pattern_strs: list[str], + expected_ids: list[str], + ) -> None: + """Test filtering with various pattern combinations and edge cases.""" + patterns = {re.compile(p) for p in pattern_strs} + filtered = filter_by_id_patterns(sample_validation_results, patterns) + filtered_ids = [result.id for result in filtered] + assert filtered_ids == expected_ids + + def test_empty_validation_results(self) -> None: + """Test filtering with an empty validation results list.""" + patterns = {re.compile(r".*")} + filtered = filter_by_id_patterns([], patterns) + assert filtered == [] + + def test_preserves_order( + self, sample_validation_results: list[ValidationResult] + ) -> None: + """Test that filtering preserves the original order of results.""" + patterns = {re.compile(r".*")} + filtered = filter_by_id_patterns(sample_validation_results, patterns) + assert filtered == sample_validation_results + + +class TestFilterByPaths: + """Test the filter_by_paths function.""" + + @pytest.fixture + def sample_validation_results(self, tmp_path: Path) -> list[ValidationResult]: + """Create sample validation results with file paths for testing.""" + # Create a directory structure for testing + (tmp_path / "sub-mouse001").mkdir() + (tmp_path / "sub-mouse002").mkdir() + (tmp_path / "sub-mouse003").mkdir() + (tmp_path / "data" / "nested").mkdir(parents=True) + (tmp_path / "dataset").mkdir() + + # Create test files + file1 = tmp_path / "sub-mouse001" / "data.nwb" + file2 = tmp_path / "sub-mouse002" / "data.nwb" + file3 = tmp_path / "sub-mouse003" / "data.nwb" + file4 = tmp_path / "data" / "nested" / "experiment.nwb" + file5 = tmp_path / "file_with_both.nwb" + file1.touch() + file2.touch() + file3.touch() + file4.touch() + file5.touch() + + origin_nwbinspector = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + origin_dandi = Origin( + type=OriginType.VALIDATION, + validator=Validator.dandi, + validator_version="", + ) + + return [ + # Results with only 'path' + ValidationResult( + id="NWBI.check_1", + origin=origin_nwbinspector, + scope=Scope.FILE, + message="Issue 1", + path=file1, + severity=Severity.WARNING, + ), + ValidationResult( + id="NWBI.check_2", + origin=origin_nwbinspector, + scope=Scope.FILE, + message="Issue 2", + path=file2, + severity=Severity.WARNING, + ), + ValidationResult( + id="NWBI.check_3", + origin=origin_nwbinspector, + scope=Scope.FILE, + message="Issue 3", + path=file3, + severity=Severity.ERROR, + ), + ValidationResult( + 
id="NWBI.check_4", + origin=origin_nwbinspector, + scope=Scope.FILE, + message="Issue 4", + path=file4, + severity=Severity.WARNING, + ), + # Result with only 'dataset_path' + ValidationResult( + id="DANDI.dataset_only", + origin=origin_dandi, + scope=Scope.DANDISET, + message="Dataset issue", + dataset_path=tmp_path / "dataset", + severity=Severity.ERROR, + ), + # Result with both 'path' and 'dataset_path' + ValidationResult( + id="DANDI.both_paths", + origin=origin_dandi, + scope=Scope.FILE, + message="Issue with both paths", + path=file5, + dataset_path=tmp_path / "dataset", + severity=Severity.WARNING, + ), + ] + + @pytest.mark.parametrize( + "path_specs,expected_ids", + [ + # Single file path + (["sub-mouse001/data.nwb"], ["NWBI.check_1"]), + # Multiple file paths + ( + ["sub-mouse001/data.nwb", "sub-mouse002/data.nwb"], + ["NWBI.check_1", "NWBI.check_2"], + ), + # Directory path matching all files under it + (["sub-mouse001"], ["NWBI.check_1"]), + # Parent directory path matching multiple subdirectories + (["data"], ["NWBI.check_4"]), + # Multiple directories + ( + ["sub-mouse001", "sub-mouse003"], + ["NWBI.check_1", "NWBI.check_3"], + ), + # Nested directory + (["data/nested"], ["NWBI.check_4"]), + # All paths via parent directory (includes all results) + ( + ["."], + [ + "NWBI.check_1", + "NWBI.check_2", + "NWBI.check_3", + "NWBI.check_4", + "DANDI.dataset_only", + "DANDI.both_paths", + ], + ), + # Empty paths tuple + ([], []), + # No duplicates when result matches multiple filter paths (file + parent dir) + (["sub-mouse001/data.nwb", "sub-mouse001"], ["NWBI.check_1"]), + # Result with dataset_path + (["dataset"], ["DANDI.dataset_only", "DANDI.both_paths"]), + # Result with both path and dataset_path - filter by file path + (["file_with_both.nwb"], ["DANDI.both_paths"]), + ], + ) + def test_path_filtering( + self, + sample_validation_results: list[ValidationResult], + tmp_path: Path, + path_specs: list[str], + expected_ids: list[str], + ) -> None: + """Test filtering with various path combinations and edge cases.""" + paths = tuple(tmp_path / p for p in path_specs) + filtered = filter_by_paths(sample_validation_results, paths) + filtered_ids = [result.id for result in filtered] + assert filtered_ids == expected_ids + + def test_nonexistent_path_raises_error( + self, sample_validation_results: list[ValidationResult], tmp_path: Path + ) -> None: + """Test that filtering with a non-existent path raises ValueError.""" + nonexistent_path = tmp_path / "nonexistent" + with pytest.raises(ValueError, match="does not exist"): + filter_by_paths(sample_validation_results, (nonexistent_path,)) + + def test_empty_validation_results(self, tmp_path: Path) -> None: + """Test filtering with an empty validation results list.""" + filtered = filter_by_paths([], (tmp_path,)) + assert filtered == [] + + def test_preserves_order( + self, sample_validation_results: list[ValidationResult], tmp_path: Path + ) -> None: + """Test that filtering preserves the original order of results.""" + filtered = filter_by_paths(sample_validation_results, (tmp_path,)) + assert filtered == sample_validation_results + + def test_result_with_nonexistent_path(self, tmp_path: Path) -> None: + """Test that results with non-existent paths are ignored during filtering.""" + existing_file = tmp_path / "existing.nwb" + nonexistent_file = tmp_path / "nonexistent.nwb" + existing_file.touch() + + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + + results = [ + 
+            ValidationResult(
+                id="NWBI.check_1",
+                origin=origin,
+                scope=Scope.FILE,
+                message="Issue 1",
+                path=existing_file,
+                severity=Severity.WARNING,
+            ),
+            ValidationResult(
+                id="NWBI.check_2",
+                origin=origin,
+                scope=Scope.FILE,
+                message="Issue 2",
+                path=nonexistent_file,  # This file doesn't exist
+                severity=Severity.WARNING,
+            ),
+        ]
+
+        # Filter by parent directory - should only get the existing file's result
+        filtered = filter_by_paths(results, (tmp_path,))
+        assert len(filtered) == 1
+        assert filtered[0].id == "NWBI.check_1"
+
+    def test_relative_paths_resolved(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """Test that relative paths are properly resolved to absolute paths."""
+        monkeypatch.chdir(tmp_path)
+
+        subdir = tmp_path / "subdir"
+        subdir.mkdir()
+        file1 = subdir / "data.nwb"
+        file1.touch()
+
+        origin = Origin(
+            type=OriginType.VALIDATION,
+            validator=Validator.nwbinspector,
+            validator_version="",
+        )
+
+        result = ValidationResult(
+            id="NWBI.check_1",
+            origin=origin,
+            scope=Scope.FILE,
+            message="Issue",
+            path=file1,
+            severity=Severity.WARNING,
+        )
+
+        # Use relative path
+        filtered = filter_by_paths([result], (Path("subdir"),))
+        assert len(filtered) == 1
+        assert filtered[0].id == "NWBI.check_1"
diff --git a/dandi/utils.py b/dandi/utils.py
index 93a0e138c..3b37e93f2 100644
--- a/dandi/utils.py
+++ b/dandi/utils.py
@@ -24,7 +24,7 @@
 from time import sleep
 import traceback
 import types
-from typing import IO, Any, List, Optional, Protocol, TypeVar, Union
+from typing import IO, TYPE_CHECKING, Any, List, Optional, Protocol, TypeVar, Union
 
 import dateutil.parser
 from multidict import MultiDict  # dependency of yarl
@@ -38,6 +38,9 @@
 from .consts import DandiInstance, known_instances, known_instances_rev
 from .exceptions import BadCliVersionError, CliVersionTooOldError
 
+if TYPE_CHECKING:
+    from .validate_types import ValidationResult
+
 AnyPath = Union[str, Path]
 
 
@@ -972,3 +975,76 @@ class StrEnum(str, Enum):
     @staticmethod
     def _generate_next_value_(name, _start, _count, _last_values):
         return name
+
+
+def filter_by_id_patterns(
+    validation_results: Iterable[ValidationResult], patterns: set[re.Pattern]
+) -> list[ValidationResult]:
+    """
+    Filter validation results by matching their IDs against provided regex patterns.
+
+    Parameters
+    ----------
+    validation_results : Iterable[ValidationResult]
+        The iterable of validation results to filter.
+    patterns : set[re.Pattern]
+        The set of regex patterns to match validation result IDs against.
+
+    Returns
+    -------
+    list[ValidationResult]
+        The filtered list of validation results whose IDs match any of the provided patterns.
+    """
+
+    filtered_results = []
+    for result in validation_results:
+        if any(re.search(pattern, result.id) for pattern in patterns):
+            filtered_results.append(result)
+    return filtered_results
+
+
+def filter_by_paths(
+    validation_results: Iterable[ValidationResult], paths: tuple[Path, ...]
+) -> list[ValidationResult]:
+    """
+    Filter validation results by matching their associated paths against provided paths.
+
+    Parameters
+    ----------
+    validation_results : Iterable[ValidationResult]
+        The iterable of validation results to filter.
+    paths : tuple[Path, ...]
+        The tuple of paths to match validation result paths against.
+
+    Returns
+    -------
+    list[ValidationResult]
+        The filtered list of validation results whose associated paths match the
+        provided paths.
+        'Matching' refers to cases where an associated path of a validation
+        result is the same path as, or falls under, one of the provided paths.
+
+    Raises
+    ------
+    ValueError
+        If any of the provided paths does not exist in the filesystem.
+    """
+    for p in paths:
+        if not p.exists():
+            raise ValueError(f"Provided path {p} does not exist")
+
+    # Ensure all provided paths are resolved to their absolute forms
+    paths = tuple(p.resolve(strict=True) for p in paths)
+
+    filtered_results = []
+    for r in validation_results:
+        associated_paths = [
+            p.resolve(strict=True)
+            for p in (r.path, r.dataset_path)
+            if p is not None and p.exists()
+        ]
+        for assoc_path in associated_paths:
+            if any(assoc_path.is_relative_to(p) for p in paths):
+                filtered_results.append(r)
+                break
+    return filtered_results
diff --git a/docs/source/cmdline/validate.rst b/docs/source/cmdline/validate.rst
index 111ccd441..372153f19 100644
--- a/docs/source/cmdline/validate.rst
+++ b/docs/source/cmdline/validate.rst
@@ -22,6 +22,20 @@ Options
     Ignore any validation errors & warnings whose ID matches the given regular
     expression
 
+.. option:: --match REGEX,REGEX,...
+
+    Comma-separated regex patterns used to filter issues in validation results by
+    their ID. Only issues with an ID matching at least one of the given patterns
+    are included in the eventual result. Note: since the patterns are separated
+    by commas (``,``), no pattern may itself contain a comma.
+
+.. option:: --include-path PATH
+
+    Filter issues in the validation results to only those associated with the
+    given path(s). An issue is associated with a given path if the issue's own
+    path or dataset path is the same as, or falls under, that path. This option
+    can be specified multiple times to include multiple paths.
+
 .. option:: --min-severity [HINT|WARNING|ERROR]
 
     Only display issues with severities above this level (HINT by default)
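+
+For example, to keep only BIDS- and DANDI-related issues, or to restrict the
+report to particular subject directories (the dandiset path and subject
+directories below are illustrative)::
+
+    dandi validate --match "BIDS\..*,DANDI\..*" path/to/dandiset
+    dandi validate --include-path sub-mouse001 --include-path sub-mouse002 path/to/dandiset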