diff --git a/dvc/repo/index.py b/dvc/repo/index.py index f28db66b6d..3090517b39 100644 --- a/dvc/repo/index.py +++ b/dvc/repo/index.py @@ -69,7 +69,16 @@ def collect_files( def is_ignored(path: str) -> bool: # apply only for the local fs - return is_local_fs and scm.is_ignored(path) + if not is_local_fs: + return False + + # For DVC files, use DVC's ignore system which properly handles + # ** globbing patterns with negations (e.g., data/** + !data/**/*.dvc) + if is_valid_filename(path): + return repo.dvcignore.is_ignored_file(path) + + # For other files, use Git's ignore system + return scm.is_ignored(path) def is_dvcfile_and_not_ignored(root: str, file: str) -> bool: return is_valid_filename(file) and not is_ignored(f"{root}{sep}{file}") diff --git a/test_issue_reproduction.py b/test_issue_reproduction.py new file mode 100644 index 0000000000..791b886640 --- /dev/null +++ b/test_issue_reproduction.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +""" +Reproduction test for issue #10987: +dvc status reports "no data tracked" when using ** globbing patterns in .gitignore +""" + +import os +import tempfile +from pathlib import Path + +from dvc.repo import Repo + + +def test_gitignore_globbing_reproduction(): + """Reproduce the ** globbing pattern issue from #10987""" + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + + # Initialize git and dvc + os.chdir(tmp_path) + os.system("git init") + os.system("dvc init --no-scm") + + # Create directory structure + data_dir = tmp_path / "data" / "raw" + data_dir.mkdir(parents=True) + + # Create data file + data_file = data_dir / "example.nc" + data_file.write_text("test data") + + # Create .gitignore with problematic ** patterns + gitignore_content = """ +# Ignore all data files +data/raw/** +data/interim/** +data/processed/** + +# But keep DVC metafiles +!data/raw/**/*.dvc +!data/interim/**/*.dvc +!data/processed/**/*.dvc + +.dvc/cache/ +""" + gitignore = tmp_path / ".gitignore" + gitignore.write_text(gitignore_content.strip()) + + # Add data file to DVC + os.system(f"dvc add {data_file}") + + # Add to git + os.system(f"git add {data_file}.dvc .gitignore") + os.system('git commit -m "Add data file"') + + # Now test the issue + repo = Repo(".") + + print(f"Number of stages in index: {len(repo.index.stages)}") + print(f"DVC files in git: {list(tmp_path.rglob('*.dvc'))}") + + # Check if the .dvc file is being ignored + dvc_file_path = str(data_file) + ".dvc" + print( + f"Is {dvc_file_path} ignored by git? {repo.scm.is_ignored(dvc_file_path)}" + ) + + # Check collect_files output + from dvc.repo.index import collect_files + + collected_files = list(collect_files(repo)) + print(f"Collected files: {collected_files}") + + # The bug: index should have stages but doesn't + assert len(repo.index.stages) > 0, ( + f"Expected stages in index, but got {len(repo.index.stages)}" + ) + + +if __name__ == "__main__": + test_gitignore_globbing_reproduction() + print("Test passed!") diff --git a/tests/func/test_gitignore_globbing_fix.py b/tests/func/test_gitignore_globbing_fix.py new file mode 100644 index 0000000000..97ae71ca76 --- /dev/null +++ b/tests/func/test_gitignore_globbing_fix.py @@ -0,0 +1,218 @@ +def test_gitignore_globbing_with_dvc_files(tmp_dir, scm, dvc): + """Test that ** globbing patterns in .gitignore with negations work correctly. + + This is a regression test for issue #10987: dvc status reports "no data tracked" + when using ** globbing patterns in .gitignore. + + The issue occurs when .gitignore has patterns like: + - data/raw/** (ignore everything in data/raw/) + - !data/raw/**/*.dvc (except .dvc files) + """ + # Create directory structure + data_dir = tmp_dir / "data" / "raw" + data_dir.mkdir(parents=True) + + # Create data file + data_file = data_dir / "example.nc" + data_file.write_text("test data") + + # Create .gitignore with ** globbing patterns + gitignore_content = """ +# Ignore all data files +data/raw/** +data/interim/** +data/processed/** + +# But keep DVC metafiles +!data/raw/**/*.dvc +!data/interim/**/*.dvc +!data/processed/**/*.dvc + +.dvc/cache/ +""".strip() + + gitignore = tmp_dir / ".gitignore" + gitignore.write_text(gitignore_content) + + # Add data file to DVC + dvc.add(str(data_file)) + + # The .dvc file should exist + dvc_file = data_dir / "example.nc.dvc" + assert dvc_file.exists() + + # Add to git + scm.add([str(dvc_file), str(gitignore)]) + scm.commit("Add data file and gitignore") + + # Refresh DVC to re-read gitignore + dvc._reset() + + # The key test: DVC should recognize the .dvc file even with ** patterns + # Before the fix, this would return an empty list + assert len(dvc.index.stages) > 0, ( + "DVC should find stages even with ** globbing patterns" + ) + + # The .dvc file should not be ignored by DVC's ignore system + assert not dvc.dvcignore.is_ignored_file(str(dvc_file)) + + # The data file itself should be ignored by git (as expected) + assert scm.is_ignored(str(data_file)) + + # But the .dvc file should not be ignored by git (due to negation pattern) + assert not scm.is_ignored(str(dvc_file)) + + # DVC status should work correctly + status = dvc.status() + # status should be empty (up to date) or have status info, but not fail + assert isinstance(status, dict) + + +def test_gitignore_globbing_specific_vs_double_star(tmp_dir, scm, dvc): + """Test the difference between specific patterns and ** patterns. + + This verifies that the workaround mentioned in the issue + (using specific patterns instead of **) works. + """ + # Create test files + (tmp_dir / "data").mkdir() + (tmp_dir / "data" / "file1.txt").write_text("content") + (tmp_dir / "data" / "file2.csv").write_text("content") + + # Add to DVC + dvc.add("data/file1.txt") + dvc.add("data/file2.csv") + + # Test 1: With ** patterns (the problematic case) + gitignore_star = """ +data/** +!data/**/*.dvc +""".strip() + + gitignore = tmp_dir / ".gitignore" + gitignore.write_text(gitignore_star) + scm.add([".dvc", "data/file1.txt.dvc", "data/file2.csv.dvc", ".gitignore"]) + scm.commit("Test with ** patterns") + + dvc._reset() + + # Should work with the fix + assert len(dvc.index.stages) == 2 + + # Test 2: With specific patterns (the workaround) + gitignore_specific = """ +data/*.txt +data/*.csv +""".strip() + + gitignore.write_text(gitignore_specific) + scm.add([".gitignore"]) + scm.commit("Test with specific patterns") + + dvc._reset() + + # Should also work + assert len(dvc.index.stages) == 2 + + +def test_collect_files_with_complex_gitignore(tmp_dir, scm, dvc): + """Test collect_files function directly with complex gitignore patterns.""" + from dvc.repo.index import collect_files + + # Create nested structure + nested_dir = tmp_dir / "project" / "data" / "raw" / "subdir" + nested_dir.mkdir(parents=True) + + # Create multiple data files + files = [ + nested_dir / "file1.nc", + nested_dir / "file2.nc", + (tmp_dir / "project" / "data" / "processed" / "result.csv"), + ] + + # Ensure processed dir exists + files[2].parent.mkdir(parents=True) + + for f in files: + f.write_text(f"data in {f.name}") + + # Add all to DVC + for f in files: + dvc.add(str(f)) + + # Complex gitignore with nested ** patterns + gitignore_content = """ +# Ignore data directories with ** patterns +project/data/raw/** +project/data/interim/** +project/data/processed/** + +# Keep DVC files +!project/data/raw/**/*.dvc +!project/data/interim/**/*.dvc +!project/data/processed/**/*.dvc + +# Also ignore some other patterns +*.log +temp/ +.cache/ +""".strip() + + gitignore = tmp_dir / ".gitignore" + gitignore.write_text(gitignore_content) + + # Add all .dvc files and gitignore to git + dvc_files = list(tmp_dir.rglob("*.dvc")) + scm.add([str(f) for f in dvc_files] + [str(gitignore)]) + scm.commit("Add complex gitignore with nested structure") + + dvc._reset() + + # Test collect_files function + collected = list(collect_files(dvc)) + + # Should find all 3 DVC files + assert len(collected) == 3 + + # Verify the paths are correct + collected_paths = [path for path, _ in collected] + for dvc_file in dvc_files: + assert str(dvc_file) in collected_paths + + +def test_is_ignored_function_behavior(tmp_dir, scm, dvc): + """Test the is_ignored function behavior directly.""" + from dvc.repo.index import collect_files + + # Create test structure + data_dir = tmp_dir / "data" + data_dir.mkdir() + test_file = data_dir / "test.txt" + test_file.write_text("test") + + dvc.add(str(test_file)) + dvc_file = data_dir / "test.txt.dvc" + + # Gitignore that ignores data dir but keeps .dvc files + gitignore = tmp_dir / ".gitignore" + gitignore.write_text("data/**\n!data/**/*.dvc") + + scm.add([str(dvc_file), str(gitignore)]) + scm.commit("Test ignore behavior") + + dvc._reset() + + # Test ignore behavior + assert dvc.scm.is_ignored(str(test_file)) # Data file should be ignored by git + assert not dvc.scm.is_ignored( + str(dvc_file) + ) # DVC file should not be ignored by git + assert not dvc.dvcignore.is_ignored_file( + str(dvc_file) + ) # DVC file should not be ignored by DVC + + # The key test: collect_files should find the DVC file + collected = list(collect_files(dvc)) + assert len(collected) == 1 + assert str(dvc_file) in [path for path, _ in collected] diff --git a/verify_fix.py b/verify_fix.py new file mode 100644 index 0000000000..6402af172f --- /dev/null +++ b/verify_fix.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +""" +Verification script for the gitignore ** globbing fix. + +This script simulates the exact issue scenario to verify the fix works. +""" + +import os +import tempfile +from pathlib import Path + + +def verify_fix(): + """Verify that the fix works for the original issue scenario.""" + + print("๐Ÿ” Testing gitignore ** globbing fix...") + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + + # Change to test directory + original_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + # Initialize git and dvc + print(" ๐Ÿ“ Setting up test repository...") + os.system("git init >/dev/null 2>&1") + os.system("dvc init --no-scm >/dev/null 2>&1") + + # Create directory structure exactly like the issue + data_dir = tmp_path / "data" / "raw" + data_dir.mkdir(parents=True) + + # Create data file + data_file = data_dir / "example.nc" + data_file.write_text("test data") + + # Create .gitignore with problematic ** patterns from the issue + gitignore_content = """# Ignore all data files +data/raw/** +data/interim/** +data/processed/** + +# But keep DVC metafiles +!data/raw/**/*.dvc +!data/interim/**/*.dvc +!data/processed/**/*.dvc + +.dvc/cache/ +""" + gitignore = tmp_path / ".gitignore" + gitignore.write_text(gitignore_content.strip()) + + # Add data file to DVC + print(" ๐Ÿ“ฆ Adding file to DVC...") + result = os.system(f"dvc add {data_file} >/dev/null 2>&1") + if result != 0: + print(" โŒ Failed to add file to DVC") + return False + + # Add to git + dvc_file = data_file.with_suffix(".nc.dvc") + os.system(f"git add {dvc_file} .gitignore >/dev/null 2>&1") + os.system('git commit -m "Add data file" >/dev/null 2>&1') + + # Now test the fix by importing DVC and checking status + print(" ๐Ÿงช Testing DVC index recognition...") + + # Import here to use our modified version + try: + from dvc.repo import Repo + + repo = Repo(".") + + # The critical test: repo.index.stages should NOT be empty + stages_count = len(repo.index.stages) + print(f" ๐Ÿ“Š Found {stages_count} stages in DVC index") + + if stages_count == 0: + print(" โŒ FAIL: No stages found (original bug persists)") + return False + print( + " โœ… SUCCESS: DVC correctly recognizes .dvc files with ** patterns!" + ) + + # Additional verification: check ignore behavior + dvc_file_str = str(dvc_file) + data_file_str = str(data_file) + + print(" ๐Ÿ” Verifying ignore behavior:") + + # Data file should be ignored by git + if repo.scm.is_ignored(data_file_str): + print(" โœ… Data file correctly ignored by git") + else: + print(" โš ๏ธ Data file not ignored by git (unexpected)") + + # DVC file should NOT be ignored by git (due to negation) + if not repo.scm.is_ignored(dvc_file_str): + print(" โœ… DVC file correctly NOT ignored by git") + else: + print(" โŒ DVC file incorrectly ignored by git") + return False + + # DVC file should NOT be ignored by DVC + if not repo.dvcignore.is_ignored_file(dvc_file_str): + print(" โœ… DVC file correctly NOT ignored by DVC") + else: + print(" โŒ DVC file incorrectly ignored by DVC") + return False + + return True + + except ImportError as e: + print(f" โŒ Could not import DVC: {e}") + return False + except Exception as e: + print(f" โŒ Error during test: {e}") + return False + + finally: + os.chdir(original_cwd) + + +if __name__ == "__main__": + success = verify_fix() + + if success: + print("\n๐ŸŽ‰ Fix verification PASSED!") + print(" The gitignore ** globbing issue has been resolved.") + else: + print("\n๐Ÿ’ฅ Fix verification FAILED!") + print(" The issue may still exist.") + exit(1)