diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index 754bb9cd..0530a95f 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -459,6 +459,10 @@ certain conditions. See the wiki for an example to use it with git hooks. +If you’re working with nested repos, you can pass `--recursive`/`-r` so that +the `vectorise` command will honour the `.gitignore`s and `vectorcode.exclude`s +in the nested repos. + MAKING A QUERY ~ diff --git a/docs/cli.md b/docs/cli.md index 55a6b3ae..f16f5034 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -426,6 +426,10 @@ on certain conditions. See [the wiki](https://github.com/Davidyz/VectorCode/wiki/Tips-and-Tricks#git-hooks) for an example to use it with git hooks. +If you're working with nested repos, you can pass `--recursive`/`-r` so that +the `vectorise` command will honour the `.gitignore`s and `vectorcode.exclude`s +in the nested repos. + ### Making a Query To retrieve a list of documents from the database, you can use the following command: diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index 00e9dc48..6c839f1b 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -8,11 +8,12 @@ from datetime import datetime from enum import Enum, StrEnum from pathlib import Path -from typing import Any, Optional, Sequence, Union +from typing import Any, Generator, Iterable, Optional, Sequence, Union import json5 import shtab from filelock import AsyncFileLock +from pathspec import GitIgnoreSpec from vectorcode import __version__ @@ -671,3 +672,58 @@ def get_lock(self, path: str | os.PathLike) -> AsyncFileLock: if self.__locks.get(path) is None: self.__locks[path] = AsyncFileLock(path) # pyright: ignore[reportArgumentType] return self.__locks[path] + + +class SpecResolver: + """ + This class is a wrapper around filespec that makes it easier to work with file specs that are not in cwd. + """ + + @classmethod + def from_path(cls, spec_path: str, project_root: Optional[str] = None): + """ + Automatically determine the appropriate `base_dir` for resolving file specs that are outside of the project root. + Only supports `.gitignore` and `.vectorcode/vectorcode.{include,exclude}`. + Raises `ValueError` if the spec path is not one of them. + """ + base_dir = "." + if spec_path.endswith(".gitignore"): + base_dir = spec_path.replace(".gitignore", "") + else: + path_obj = Path(spec_path) + if path_obj.name in {"vectorcode.include", "vectorcode.exclude"}: + if path_obj.parent.name == ".vectorcode": + # project config + base_dir = str(path_obj.parent.parent) + else: + # assume to be global config + base_dir = project_root or "." + else: # pragma: nocover + raise ValueError(f"Unsupported spec path: {spec_path}") + return cls(spec_path, base_dir) + + def __init__(self, spec: str | GitIgnoreSpec, base_dir: str = "."): + if isinstance(spec, str): + with open(spec) as fin: + self.spec = GitIgnoreSpec.from_lines( + (i.strip() for i in fin.readlines()) + ) + else: + self.spec = spec + self.base_dir = base_dir + + def match( + self, paths: Iterable[str], negated: bool = False + ) -> Generator[str, None, None]: + # get paths relative to `base_dir` + + base = Path(self.base_dir).resolve() + for p in paths: + if base in Path(p).resolve().parents: + should_yield = self.spec.match_file(os.path.relpath(p, self.base_dir)) + if negated: + should_yield = not should_yield + if should_yield: + yield p + else: + yield p diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index a0bea88f..31e6420e 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -1,4 +1,5 @@ import asyncio +import glob import hashlib import json import logging @@ -20,6 +21,7 @@ GLOBAL_EXCLUDE_SPEC, GLOBAL_INCLUDE_SPEC, Config, + SpecResolver, expand_globs, expand_path, ) @@ -187,22 +189,13 @@ def show_stats(configs: Config, stats: VectoriseStats): def exclude_paths_by_spec( - paths: Iterable[str], specs: pathspec.PathSpec | str + paths: Iterable[str], spec_path: str, project_root: Optional[str] = None ) -> list[str]: """ Files matched by the specs will be excluded. """ - if isinstance(specs, str): - with open(specs) as fin: - specs = pathspec.GitIgnoreSpec.from_lines(fin.readlines()) - return [path for path in paths if not specs.match_file(path)] - -def include_paths_by_spec(paths: Iterable[str], specs: pathspec.PathSpec) -> list[str]: - """ - Only include paths matched by the specs. - """ - return [path for path in paths if specs.match_file(path)] + return list(SpecResolver.from_path(spec_path, project_root).match(paths, True)) def load_files_from_include(project_root: str) -> list[str]: @@ -235,10 +228,16 @@ def find_exclude_specs(configs: Config) -> list[str]: Load a list of paths to exclude specs. Can be `.gitignore` or local/global `vectorcode.exclude` """ - gitignore_path = os.path.join(str(configs.project_root), ".gitignore") - specs = [ - gitignore_path, - ] + if configs.recursive: + specs = glob.glob( + os.path.join(str(configs.project_root), "**", ".gitignore"), recursive=True + ) + glob.glob( + os.path.join(str(configs.project_root), "**", "vectorcode.exclude"), + recursive=True, + ) + else: + specs = [os.path.join(str(configs.project_root), ".gitignore")] + exclude_spec_path = os.path.join( str(configs.project_root), ".vectorcode", "vectorcode.exclude" ) @@ -246,6 +245,8 @@ def find_exclude_specs(configs: Config) -> list[str]: specs.append(exclude_spec_path) elif os.path.isfile(GLOBAL_EXCLUDE_SPEC): specs.append(GLOBAL_EXCLUDE_SPEC) + specs = [i for i in specs if os.path.isfile(i)] + logger.debug(f"Loaded exclude specs: {specs}") return specs @@ -272,7 +273,10 @@ async def vectorise(configs: Config) -> int: for spec_path in find_exclude_specs(configs): if os.path.isfile(spec_path): logger.info(f"Loading ignore specs from {spec_path}.") - files = exclude_paths_by_spec((str(i) for i in files), spec_path) + files = exclude_paths_by_spec( + (str(i) for i in files), spec_path, str(configs.project_root) + ) + logger.debug(f"Files after excluding: {files}") else: # pragma: nocover logger.info("Ignoring exclude specs.") diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index 6b2287bf..ca64e1a5 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -6,21 +6,20 @@ from contextlib import ExitStack from unittest.mock import AsyncMock, MagicMock, mock_open, patch -import pathspec import pytest from chromadb.api.models.AsyncCollection import AsyncCollection from tree_sitter import Point from vectorcode.chunking import Chunk -from vectorcode.cli_utils import Config +from vectorcode.cli_utils import CliAction, Config from vectorcode.subcommands.vectorise import ( VectoriseStats, chunked_add, exclude_paths_by_spec, + find_exclude_specs, get_uuid, hash_file, hash_str, - include_paths_by_spec, load_files_from_include, show_stats, vectorise, @@ -223,19 +222,44 @@ def test_show_stats_pipe_true(capsys): def test_exclude_paths_by_spec(): - paths = ["file1.py", "file2.py", "exclude.py"] - specs = pathspec.GitIgnoreSpec.from_lines(lines=["exclude.py"]) - excluded_paths = exclude_paths_by_spec(paths, specs) - assert "exclude.py" not in excluded_paths - assert len(excluded_paths) == 2 + with tempfile.TemporaryDirectory() as dir: + paths = list( + os.path.join(dir, i) for i in ["file1.py", "file2.py", "exclude.py"] + ) + spec_path = os.path.join(dir, ".gitignore") + with open(spec_path, mode="w") as spec_file: + spec_file.writelines(["exclude.py"]) + + paths_after_exclude = exclude_paths_by_spec(paths, spec_path) + assert "exclude.py" not in paths_after_exclude + assert len(paths_after_exclude) == 2 + os.remove(spec_path) + + +def test_nested_exclude_paths_by_spec(): + paths = [ + "file1.py", + "file2.py", + "exclude.py", + os.path.join("nested", "nested_exclude.py"), + ] + with tempfile.TemporaryDirectory() as project_root: + paths = [os.path.join(project_root, i) for i in paths] + with open(os.path.join(project_root, ".gitignore"), mode="w") as fin: + fin.writelines(["/exclude.py"]) + nested_git_dir = os.path.join(project_root, "nested") + os.makedirs(nested_git_dir, exist_ok=True) + with open(os.path.join(nested_git_dir, ".gitignore"), mode="w") as fin: + fin.writelines(["/nested_exclude.py"]) -def test_include_paths_by_spec(): - paths = ["file1.py", "file2.py", "include.py"] - specs = pathspec.GitIgnoreSpec.from_lines(lines=["include.py", "file1.py"]) - included_paths = include_paths_by_spec(paths, specs) - assert "file2.py" not in included_paths - assert len(included_paths) == 2 + specs = find_exclude_specs(Config(project_root=project_root, recursive=True)) + paths_after_exclude = paths[:] + for spec in specs: + paths_after_exclude = exclude_paths_by_spec(paths_after_exclude, spec) + assert "exclude.py" not in paths_after_exclude + assert "nested/nested_exclude.py" not in paths_after_exclude + assert len(paths_after_exclude) == 2 @patch("os.path.isfile") @@ -615,151 +639,174 @@ async def test_vectorise_gitignore(): @pytest.mark.asyncio -async def test_vectorise_exclude_file(tmpdir): +async def test_vectorise_exclude_file(): # Create a temporary .vectorcode directory and vectorcode.exclude file - exclude_dir = tmpdir.mkdir(".vectorcode") - exclude_file = exclude_dir.join("vectorcode.exclude") - exclude_file.write("excluded_file.py\n") - - configs = Config( - db_url="http://test_host:1234", - db_path="test_db", - embedding_function="SentenceTransformerEmbeddingFunction", - embedding_params={}, - project_root=str(tmpdir), - files=["test_file.py", "excluded_file.py"], - recursive=False, - force=False, - pipe=False, - ) - mock_client = AsyncMock() - mock_collection = AsyncMock() - mock_collection.get.return_value = {"ids": []} - - with ( - patch("vectorcode.subcommands.vectorise.ClientManager") as MockClientManager, - patch( - "vectorcode.subcommands.vectorise.get_collection", - return_value=mock_collection, - ), - patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True), - patch( - "os.path.isfile", - side_effect=lambda path: True if path == str(exclude_file) else False, - ), - patch("builtins.open", return_value=open(str(exclude_file), "r")), - patch( - "vectorcode.subcommands.vectorise.expand_globs", - return_value=["test_file.py", "excluded_file.py"], - ), - patch("vectorcode.subcommands.vectorise.chunked_add") as mock_chunked_add, - ): - MockClientManager.return_value._create_client.return_value = mock_client - await vectorise(configs) - # Assert that chunked_add is only called for test_file.py, not excluded_file.py - call_args = [call[0][0] for call in mock_chunked_add.call_args_list] - assert "excluded_file.py" not in call_args - assert "test_file.py" in call_args - assert mock_chunked_add.call_count == 1 - + with tempfile.TemporaryDirectory() as tmpdir: + exclude_dir = os.path.join(tmpdir, ".vectorcode") + nested_dir = os.path.join(tmpdir, "nested") + + os.makedirs(exclude_dir, exist_ok=True) + os.makedirs(nested_dir, exist_ok=True) + + exclude_spec = os.path.join(exclude_dir, "vectorcode.exclude") + with open(exclude_spec, mode="w") as fin: + fin.writelines(["excluded_file.py"]) + with open(os.path.join(nested_dir, ".gitignore"), "w") as fin: + fin.writelines(["excluded_file.py"]) + nested_file_path = os.path.join(nested_dir, "nested_excluded_file.py") + with open(nested_file_path, "w") as fin: + # non-recursive case. This file should be included. + fin.writelines(['print("hello world")']) + + configs = Config( + db_url="http://test_host:1234", + db_path="test_db", + embedding_function="SentenceTransformerEmbeddingFunction", + embedding_params={}, + project_root=str(tmpdir), + files=[ + os.path.join(tmpdir, "test_file.py"), + os.path.join(tmpdir, "excluded_file.py"), + nested_file_path, + ], + recursive=False, + force=False, + pipe=False, + ) + mock_client = AsyncMock() + mock_collection = AsyncMock() + mock_collection.get.return_value = {"ids": []} -MOCK_GLOBAL_EXCLUDE_PATH = "/mock/global/.config/vectorcode/vectorcode.exclude" + with ( + patch( + "vectorcode.subcommands.vectorise.ClientManager" + ) as MockClientManager, + patch( + "vectorcode.subcommands.vectorise.get_collection", + return_value=mock_collection, + ), + patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True), + patch( + "vectorcode.subcommands.vectorise.expand_globs", + return_value=configs.files, + ), + patch("vectorcode.subcommands.vectorise.chunked_add") as mock_chunked_add, + ): + MockClientManager.return_value._create_client.return_value = mock_client + await vectorise(configs) + # Assert that chunked_add is only called for test_file.py, not excluded_file.py + call_args = [call[0][0] for call in mock_chunked_add.call_args_list] + assert str(os.path.join(tmpdir, "excluded_file.py")) not in call_args + assert os.path.join(tmpdir, "test_file.py") in call_args + assert mock_chunked_add.call_count == 2 @pytest.mark.asyncio -@patch("vectorcode.subcommands.vectorise.get_collection", new_callable=AsyncMock) -@patch("vectorcode.subcommands.vectorise.expand_globs", new_callable=AsyncMock) -@patch("vectorcode.subcommands.vectorise.chunked_add", new_callable=AsyncMock) -@patch("os.path.isfile") -@patch("builtins.open", new_callable=mock_open) -@patch("vectorcode.subcommands.vectorise.GLOBAL_EXCLUDE_SPEC", MOCK_GLOBAL_EXCLUDE_PATH) -@patch("pathspec.GitIgnoreSpec") -@patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True) -async def test_vectorise_uses_global_exclude_when_local_missing( - mock_verify_ef, # Add argument for the new patch - mock_gitignore_spec, - mock_open_builtin, - mock_isfile, - mock_chunked_add, - mock_expand_globs, - mock_get_collection, - tmp_path, -): - """ - Tests that vectorise uses the global exclude file if the local one - and .gitignore are missing. - """ - project_root = str(tmp_path) - configs = Config(project_root=project_root, force=False, pipe=True) - local_gitignore = tmp_path / ".gitignore" - local_exclude_file = tmp_path / ".vectorcode" / "vectorcode.exclude" - - initial_files = [str(tmp_path / "file1.py"), str(tmp_path / "ignored.bin")] - mock_expand_globs.return_value = initial_files - - def isfile_side_effect(p): - path_str = str(p) - if path_str == str(local_gitignore): - return False - if path_str == str(local_exclude_file): - return False - if path_str == MOCK_GLOBAL_EXCLUDE_PATH: - return True - if path_str == str(tmp_path / "file1.py"): - return True - return False - - mock_isfile.side_effect = isfile_side_effect - - global_exclude_content = "*.bin" - m_open = mock_open(read_data=global_exclude_content) - with ( - patch("builtins.open", m_open), - patch("vectorcode.subcommands.vectorise.ClientManager") as MockClientManager, - ): - mock_spec_instance = MagicMock() - mock_spec_instance.match_file = lambda path: str(path).endswith(".bin") - mock_gitignore_spec.from_lines.return_value = mock_spec_instance - - mock_client_instance = AsyncMock() - mock_client_instance.get_max_batch_size = AsyncMock(return_value=100) - - MockClientManager.return_value._create_client.return_value = ( - mock_client_instance - ) - - mock_collection_instance = AsyncMock() - mock_collection_instance.get = AsyncMock( - return_value={ - "ids": ["id1"], - "metadatas": [ - {"path": str(tmp_path / "file1.py")} - ], # Simulate file1.py is in DB - } +async def test_vectorise_exclude_file_recursive(): + # Create a temporary .vectorcode directory and vectorcode.exclude file + with tempfile.TemporaryDirectory() as tmpdir: + exclude_dir = os.path.join(tmpdir, ".vectorcode") + nested_dir = os.path.join(tmpdir, "nested") + + os.makedirs(exclude_dir, exist_ok=True) + os.makedirs(nested_dir, exist_ok=True) + + exclude_spec = os.path.join(exclude_dir, "vectorcode.exclude") + with open(exclude_spec, mode="w") as fin: + fin.writelines(["excluded_file.py"]) + with open(os.path.join(nested_dir, ".gitignore"), "w") as fin: + fin.writelines(["excluded_file.py"]) + with open(os.path.join(nested_dir, "excluded_file.py"), "w") as fin: + # recursive case. This file should be skipped. + fin.writelines(['print("hello world")']) + + configs = Config( + db_url="http://test_host:1234", + db_path="test_db", + embedding_function="SentenceTransformerEmbeddingFunction", + embedding_params={}, + project_root=str(tmpdir), + files=[ + os.path.join(tmpdir, "test_file.py"), + os.path.join(tmpdir, "excluded_file.py"), + ], + recursive=True, + force=False, + pipe=False, ) - mock_collection_instance.delete = AsyncMock() - mock_get_collection.return_value = mock_collection_instance + mock_client = AsyncMock() + mock_collection = AsyncMock() + mock_collection.get.return_value = {"ids": []} - await vectorise(configs) - - mock_verify_ef.assert_called_once_with(mock_collection_instance, configs) - - mock_isfile.assert_any_call(str(local_gitignore)) - mock_isfile.assert_any_call(str(local_exclude_file)) - mock_isfile.assert_any_call(MOCK_GLOBAL_EXCLUDE_PATH) + with ( + patch( + "vectorcode.subcommands.vectorise.ClientManager" + ) as MockClientManager, + patch( + "vectorcode.subcommands.vectorise.get_collection", + return_value=mock_collection, + ), + patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True), + patch( + "vectorcode.subcommands.vectorise.expand_globs", + return_value=configs.files, + ), + patch("vectorcode.subcommands.vectorise.chunked_add") as mock_chunked_add, + ): + MockClientManager.return_value._create_client.return_value = mock_client + await vectorise(configs) + # Assert that chunked_add is only called for test_file.py, not excluded_file.py + call_args = [call[0][0] for call in mock_chunked_add.call_args_list] + assert str(os.path.join(tmpdir, "excluded_file.py")) not in call_args + assert os.path.join(tmpdir, "test_file.py") in call_args + assert mock_chunked_add.call_count == 1 - mock_isfile.assert_any_call(str(tmp_path / "file1.py")) - m_open.assert_called_with(MOCK_GLOBAL_EXCLUDE_PATH) - mock_gitignore_spec.from_lines.assert_called_once_with([global_exclude_content]) +@pytest.mark.asyncio +async def test_vectorise_uses_global_exclude_when_local_missing(): + mock_client = AsyncMock() + mock_collection = AsyncMock() + mock_collection.get.return_value = {"ids": []} - found_correct_call = False - for call in mock_chunked_add.call_args_list: - args, _ = call - if args[0] == str(tmp_path / "file1.py"): - found_correct_call = True - break - assert found_correct_call, ( - f"chunked_add not called with {str(tmp_path / 'file1.py')}" + with tempfile.TemporaryDirectory() as temp_home: + os.environ["HOME"] = temp_home + global_config_dir = os.path.join(temp_home, ".config", "vectorcode") + os.makedirs(global_config_dir, exist_ok=True) + with open( + os.path.join(global_config_dir, "vectorcode.exclude"), mode="w" + ) as fin: + fin.writelines(["exclude.py"]) + + project_root = os.path.join(temp_home, "project") + os.makedirs(project_root, exist_ok=True) + files = list( + os.path.join(project_root, i) for i in ("include.py", "exclude.py") ) - assert mock_chunked_add.call_count == 1 + for f_name in files: + full_path = os.path.join(project_root, f_name) + with open(full_path, mode="w") as fin: + pass + with ( + patch( + "vectorcode.subcommands.vectorise.ClientManager" + ) as MockClientManager, + patch( + "vectorcode.subcommands.vectorise.get_collection", + return_value=mock_collection, + ), + patch("vectorcode.subcommands.vectorise.verify_ef", return_value=True), + patch("vectorcode.subcommands.vectorise.chunked_add") as mock_chunked_add, + patch( + "vectorcode.subcommands.vectorise.GLOBAL_EXCLUDE_SPEC", + os.path.join(temp_home, ".config", "vectorcode", "vectorcode.exclude"), + ), + ): + MockClientManager.return_value._create_client.return_value = mock_client + await vectorise( + Config( + project_root=project_root, + files=list(os.path.join(project_root, i) for i in files), + action=CliAction.vectorise, + ) + ) + mock_chunked_add.assert_called_once() diff --git a/tests/test_cli_utils.py b/tests/test_cli_utils.py index d93bcb8f..e8e79f2b 100644 --- a/tests/test_cli_utils.py +++ b/tests/test_cli_utils.py @@ -6,6 +6,7 @@ from unittest.mock import patch import pytest +from pathspec import GitIgnoreSpec from vectorcode import cli_utils from vectorcode.cli_utils import ( @@ -15,6 +16,7 @@ LockManager, PromptCategory, QueryInclude, + SpecResolver, cleanup_path, expand_envs_in_dict, expand_globs, @@ -576,3 +578,55 @@ async def test_filelock(): with tempfile.TemporaryDirectory() as tmp_dir: manager.get_lock(tmp_dir) assert os.path.isfile(os.path.join(tmp_dir, "vectorcode.lock")) + + +def test_specresolver(): + spec = GitIgnoreSpec.from_lines(["file1.txt"]) + nested_path = "nested/file1.txt" + assert nested_path in list( + SpecResolver(spec, base_dir="nested").match([nested_path]) + ) + assert nested_path not in list( + SpecResolver(spec, base_dir="nested").match([nested_path], negated=True) + ) + + with tempfile.TemporaryDirectory() as dir: + nested_dir = os.path.join(dir, "nested") + nested_path = os.path.join(nested_dir, "file1.txt") + os.makedirs(nested_dir, exist_ok=True) + nested_path = os.path.join(dir, "nested", "file1.txt") + with tempfile.NamedTemporaryFile(mode="w", delete=False, dir=nested_dir) as f: + f.writelines(["file1.txt"]) + spec_filename = f.name + + assert nested_path in list( + SpecResolver(spec_filename, base_dir=nested_dir).match([nested_path]) + ) + + +def test_specresolver_builder(): + with ( + patch("vectorcode.cli_utils.GitIgnoreSpec"), + patch("vectorcode.cli_utils.open"), + ): + base_dir = os.path.normpath(os.path.join("foo", "bar")) + assert ( + os.path.normpath( + SpecResolver.from_path(os.path.join(base_dir, ".gitignore")).base_dir + ) + == base_dir + ) + + assert ( + os.path.normpath( + SpecResolver.from_path( + os.path.join(base_dir, ".vectorcode", "vectorcode.exclude") + ).base_dir + ) + == base_dir + ) + assert os.path.normpath( + SpecResolver.from_path( + os.path.join(base_dir, "vectorcode", "vectorcode.exclude") + ).base_dir + ) == os.path.normpath(".")