Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion dvc/repo/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,16 @@ def collect_files(

def is_ignored(path: str) -> bool:
    """Return whether ``path`` should be skipped while collecting files.

    Ignore rules are applied only when walking the local filesystem.
    Paths accepted by ``is_valid_filename`` are checked with
    ``repo.dvcignore.is_ignored_file``; every other path is checked with
    ``scm.is_ignored``.

    Args:
        path: Path of the file or directory being considered.

    Returns:
        ``False`` when not on the local filesystem, otherwise the answer of
        the ignore system selected for ``path``.
    """
    # apply only for the local fs
    if not is_local_fs:
        return False

    # For DVC files, use DVC's ignore system, which is intended to handle
    # ** globbing patterns with negations (e.g., data/** + !data/**/*.dvc).
    if is_valid_filename(path):
        return repo.dvcignore.is_ignored_file(path)

    # For other files, use Git's ignore system
    return scm.is_ignored(path)

def is_dvcfile_and_not_ignored(root: str, file: str) -> bool:
    """Tell whether ``file`` under ``root`` is a non-ignored DVC file.

    The file name alone is checked with ``is_valid_filename``; the ignore
    check is performed on ``root`` and ``file`` joined with ``sep``.
    """
    candidate = f"{root}{sep}{file}"
    return is_valid_filename(file) and not is_ignored(candidate)
Expand Down
84 changes: 84 additions & 0 deletions test_issue_reproduction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

"""
Reproduction test for issue #10987:
dvc status reports "no data tracked" when using ** globbing patterns in .gitignore
"""

import os
import tempfile
from pathlib import Path

from dvc.repo import Repo


def test_gitignore_globbing_reproduction():
    """Reproduce the ** globbing pattern issue from #10987.

    Builds a throwaway repository in a temporary directory, writes a
    .gitignore that ignores the data directories with ``**`` patterns while
    re-including ``*.dvc`` metafiles, tracks one data file with DVC and then
    checks that the DVC index contains at least one stage.

    Raises:
        subprocess.CalledProcessError: If a ``git`` or ``dvc`` command exits
            with a non-zero status.
        AssertionError: If the index contains no stages.
    """
    # Imported locally so the function does not rely on a module-level import.
    import subprocess

    def run(*args):
        # Argument lists avoid shell quoting problems with the temp path, and
        # check=True surfaces a failed setup step instead of hiding it.
        subprocess.run(args, check=True)

    original_cwd = os.getcwd()

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        try:
            # Initialize git and dvc
            os.chdir(tmp_path)
            run("git", "init")
            # NOTE(review): `--no-scm` presumably disables DVC's Git
            # integration, which may keep the .gitignore handling from being
            # exercised at all - confirm this is what the reproduction needs.
            run("dvc", "init", "--no-scm")

            # Create directory structure
            data_dir = tmp_path / "data" / "raw"
            data_dir.mkdir(parents=True)

            # Create data file
            data_file = data_dir / "example.nc"
            data_file.write_text("test data")

            # Create .gitignore with problematic ** patterns
            gitignore_content = """
# Ignore all data files
data/raw/**
data/interim/**
data/processed/**

# But keep DVC metafiles
!data/raw/**/*.dvc
!data/interim/**/*.dvc
!data/processed/**/*.dvc

.dvc/cache/
"""
            gitignore = tmp_path / ".gitignore"
            gitignore.write_text(gitignore_content.strip())

            # Add data file to DVC
            run("dvc", "add", str(data_file))

            # Add to git; the identity is passed explicitly so the commit
            # does not depend on the machine's global git configuration.
            run("git", "add", f"{data_file}.dvc", ".gitignore")
            run(
                "git",
                "-c",
                "user.name=dvc-test",
                "-c",
                "user.email=dvc-test@example.com",
                "commit",
                "-m",
                "Add data file",
            )

            # Now test the issue
            repo = Repo(".")

            print(f"Number of stages in index: {len(repo.index.stages)}")
            print(f"DVC files in git: {list(tmp_path.rglob('*.dvc'))}")

            # Check if the .dvc file is being ignored
            dvc_file_path = str(data_file) + ".dvc"
            print(
                f"Is {dvc_file_path} ignored by git? {repo.scm.is_ignored(dvc_file_path)}"
            )

            # Check collect_files output
            from dvc.repo.index import collect_files

            collected_files = list(collect_files(repo))
            print(f"Collected files: {collected_files}")

            # The bug: index should have stages but doesn't
            assert len(repo.index.stages) > 0, (
                f"Expected stages in index, but got {len(repo.index.stages)}"
            )
        finally:
            # Leave the temporary directory before it is removed so the
            # process is not left with a deleted working directory.
            os.chdir(original_cwd)


if __name__ == "__main__":
    # Allow running the reproduction directly as a script; the success
    # message is printed only if the function returns without raising.
    test_gitignore_globbing_reproduction()
    print("Test passed!")
218 changes: 218 additions & 0 deletions tests/func/test_gitignore_globbing_fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
def test_gitignore_globbing_with_dvc_files(tmp_dir, scm, dvc):
    """Regression test for issue #10987 (``**`` patterns with negations).

    The .gitignore ignores everything below the data directories with
    ``<dir>/**`` and re-includes DVC metafiles with ``!<dir>/**/*.dvc``.
    The issue reported that ``dvc status`` said "no data tracked" with such
    patterns; here the tracked ``.dvc`` file must still appear in the index.
    """
    # Lay out data/raw/example.nc
    raw_dir = tmp_dir / "data" / "raw"
    raw_dir.mkdir(parents=True)

    tracked_data = raw_dir / "example.nc"
    tracked_data.write_text("test data")

    # Write the .gitignore that uses ** globbing plus negations
    ignore_rules = """
# Ignore all data files
data/raw/**
data/interim/**
data/processed/**

# But keep DVC metafiles
!data/raw/**/*.dvc
!data/interim/**/*.dvc
!data/processed/**/*.dvc

.dvc/cache/
""".strip()
    ignore_file = tmp_dir / ".gitignore"
    ignore_file.write_text(ignore_rules)

    # Track the data file; this must produce the metafile next to it
    dvc.add(str(tracked_data))
    metafile = raw_dir / "example.nc.dvc"
    assert metafile.exists()

    # Commit the metafile together with the ignore rules
    scm.add([str(metafile), str(ignore_file)])
    scm.commit("Add data file and gitignore")

    # Reset DVC so the ignore rules are read again
    dvc._reset()

    # Core expectation: the index is not empty despite the ** patterns
    stage_count = len(dvc.index.stages)
    assert stage_count > 0, "DVC should find stages even with ** globbing patterns"

    # Neither DVC's nor Git's ignore system may ignore the metafile,
    # while Git is expected to ignore the data file itself.
    metafile_path = str(metafile)
    assert not dvc.dvcignore.is_ignored_file(metafile_path)
    assert scm.is_ignored(str(tracked_data))
    assert not scm.is_ignored(metafile_path)

    # dvc status only has to succeed and return a dict (empty or not)
    status = dvc.status()
    assert isinstance(status, dict)


def test_gitignore_globbing_specific_vs_double_star(tmp_dir, scm, dvc):
    """Compare ``**`` ignore patterns with the extension-specific workaround.

    Both the ``data/**`` + ``!data/**/*.dvc`` combination and the workaround
    from the issue (``data/*.txt`` / ``data/*.csv``) must leave two stages
    in the DVC index.
    """
    # Two data files, each tracked by DVC
    data_dir = tmp_dir / "data"
    data_dir.mkdir()
    for file_name in ("file1.txt", "file2.csv"):
        (data_dir / file_name).write_text("content")

    for target in ("data/file1.txt", "data/file2.csv"):
        dvc.add(target)

    ignore_file = tmp_dir / ".gitignore"

    # Case 1: ** patterns (the problematic case)
    ignore_file.write_text(
        """
data/**
!data/**/*.dvc
""".strip()
    )
    scm.add([".dvc", "data/file1.txt.dvc", "data/file2.csv.dvc", ".gitignore"])
    scm.commit("Test with ** patterns")

    dvc._reset()

    # Expected to hold with the fix in place
    assert len(dvc.index.stages) == 2

    # Case 2: specific patterns (the workaround)
    ignore_file.write_text(
        """
data/*.txt
data/*.csv
""".strip()
    )
    scm.add([".gitignore"])
    scm.commit("Test with specific patterns")

    dvc._reset()

    # The workaround must keep working as well
    assert len(dvc.index.stages) == 2


def test_collect_files_with_complex_gitignore(tmp_dir, scm, dvc):
    """Test collect_files function directly with complex gitignore patterns.

    Three data files in a nested layout are tracked with DVC, a .gitignore
    with ``**`` patterns and ``*.dvc`` negations is committed, and
    ``collect_files`` must then yield exactly the three ``.dvc`` metafiles.
    """
    from dvc.repo.index import collect_files

    # Create nested structure
    nested_dir = tmp_dir / "project" / "data" / "raw" / "subdir"
    nested_dir.mkdir(parents=True)

    # Create multiple data files
    files = [
        nested_dir / "file1.nc",
        nested_dir / "file2.nc",
        (tmp_dir / "project" / "data" / "processed" / "result.csv"),
    ]

    # Ensure processed dir exists
    files[2].parent.mkdir(parents=True)

    for f in files:
        f.write_text(f"data in {f.name}")

    # Add all to DVC
    for f in files:
        dvc.add(str(f))

    # Complex gitignore with nested ** patterns
    gitignore_content = """
# Ignore data directories with ** patterns
project/data/raw/**
project/data/interim/**
project/data/processed/**

# Keep DVC files
!project/data/raw/**/*.dvc
!project/data/interim/**/*.dvc
!project/data/processed/**/*.dvc

# Also ignore some other patterns
*.log
temp/
.cache/
""".strip()

    gitignore = tmp_dir / ".gitignore"
    gitignore.write_text(gitignore_content)

    # Add all .dvc files and gitignore to git.
    # pathlib's "*" also matches names that start with a dot, so the
    # pattern "*.dvc" matches the ".dvc" directory too; keep regular files
    # only, otherwise the path check below would look for a directory that
    # collect_files is not expected to yield.
    dvc_files = [path for path in tmp_dir.rglob("*.dvc") if path.is_file()]
    scm.add([str(f) for f in dvc_files] + [str(gitignore)])
    scm.commit("Add complex gitignore with nested structure")

    dvc._reset()

    # Test collect_files function
    collected = list(collect_files(dvc))

    # Should find all 3 DVC files
    assert len(collected) == 3

    # Verify the paths are correct
    collected_paths = [path for path, _ in collected]
    for dvc_file in dvc_files:
        assert str(dvc_file) in collected_paths


def test_is_ignored_function_behavior(tmp_dir, scm, dvc):
    """Check ignore decisions for a data file and its ``.dvc`` metafile.

    With ``data/**`` ignored and ``*.dvc`` files re-included, Git must ignore
    the data file, neither Git nor DVC may ignore the metafile, and
    ``collect_files`` must yield exactly that metafile.
    """
    from dvc.repo.index import collect_files

    # One tracked file inside data/
    data_dir = tmp_dir / "data"
    data_dir.mkdir()
    payload = data_dir / "test.txt"
    payload.write_text("test")

    dvc.add(str(payload))
    metafile = data_dir / "test.txt.dvc"

    # Ignore the data directory but keep .dvc files
    ignore_file = tmp_dir / ".gitignore"
    ignore_file.write_text("data/**\n!data/**/*.dvc")

    scm.add([str(metafile), str(ignore_file)])
    scm.commit("Test ignore behavior")

    dvc._reset()

    payload_path = str(payload)
    metafile_path = str(metafile)

    # Git ignores the data file ...
    assert dvc.scm.is_ignored(payload_path)
    # ... but not the metafile ...
    assert not dvc.scm.is_ignored(metafile_path)
    # ... and DVC's own ignore system does not ignore the metafile either.
    assert not dvc.dvcignore.is_ignored_file(metafile_path)

    # Core expectation: collect_files finds the metafile and nothing else
    found = list(collect_files(dvc))
    assert len(found) == 1
    found_paths = [path for path, _ in found]
    assert metafile_path in found_paths
Loading
Loading