diff --git a/.github/workflows/test_and_cov.yml b/.github/workflows/test_and_cov.yml index 53325bd8..4ff9b509 100644 --- a/.github/workflows/test_and_cov.yml +++ b/.github/workflows/test_and_cov.yml @@ -44,9 +44,9 @@ jobs: - name: Set custom HF cache directory run: | export HF_HOME="$GITHUB_WORKSPACE/hf_cache" - export SENTENCE_TRANSFORMERS_HOME=$HF_HOME - mkdir -p $HF_HOME - [ -z "$(ls $HF_HOME)" ] || rm $HF_HOME/* -rf && true + export SENTENCE_TRANSFORMERS_HOME="$HF_HOME" + mkdir -p "$HF_HOME" + [ -z "$(ls "$HF_HOME")" ] || rm -rf "${HF_HOME:?}"/* && true - name: run tests run: pdm run pytest --enable-coredumpy --coredumpy-dir ${{ env.COREDUMPY_DUMP_DIR }} @@ -57,6 +57,9 @@ jobs: pdm run coverage report -m pdm run coverage xml -i + - name: static analysis by basedpyright + run: pdm run basedpyright + - name: upload coverage reports to codecov uses: codecov/codecov-action@v5 with: diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 723304ab..7a7c4939 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -20,6 +20,10 @@ run tests, and `make coverage` to generate a coverage report. The testing and coverage report are also in the CI configuration, but it might still help to run them locally before you open the PR. +This project also runs static analysis with +[basedpyright](https://docs.basedpyright.com). GitHub Action will also run the +check when a PR is submitted. + You may also find it helpful to [enable logging](https://github.com/Davidyz/VectorCode/blob/main/docs/cli.md#debugging-and-diagnosing) for the CLI when developing new features or working on fixes. 
diff --git a/pyproject.toml b/pyproject.toml index 1860b3ee..31837117 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,14 +39,6 @@ vectorcode-mcp-server = "vectorcode.mcp_main:main" requires = ["pdm-backend"] build-backend = "pdm.backend" -[tool.pdm] -distribution = true - -[tool.pdm.version] -source = "scm" -write_to = "./vectorcode/_version.py" -write_template = "__version__ = '{}' # pragma: no cover" - [tool.coverage.run] omit = [ "./tests/*", @@ -56,6 +48,15 @@ omit = [ ] include = ['src/vectorcode/**/*.py'] + +[tool.pdm] +distribution = true + +[tool.pdm.version] +source = "scm" +write_to = "./vectorcode/_version.py" +write_template = "__version__ = '{}' # pragma: no cover" + [dependency-groups] dev = [ "ipython>=8.31.0", @@ -68,6 +69,7 @@ dev = [ "pytest-asyncio>=0.25.3", "debugpy>=1.8.12", "coredumpy>=0.4.1", + "basedpyright>=1.29.2", ] [project.optional-dependencies] @@ -75,3 +77,7 @@ legacy = ["numpy<2.0.0", "torch==2.2.2", "transformers<=4.49.0"] intel = ['optimum[openvino]', 'openvino'] lsp = ['pygls<2.0.0', 'lsprotocol'] mcp = ['mcp<2.0.0', 'pydantic'] + +[tool.basedpyright] +typeCheckingMode = "standard" +ignore = ["./tests/"] diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 0a62b3c8..2f8ab751 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -5,13 +5,13 @@ from dataclasses import dataclass from functools import cache from io import TextIOWrapper -from typing import Generator, Optional +from typing import Generator, Optional, cast from pygments.lexer import Lexer from pygments.lexers import get_lexer_for_filename from pygments.util import ClassNotFound from tree_sitter import Node, Point -from tree_sitter_language_pack import get_parser +from tree_sitter_language_pack import SupportedLanguage, get_parser from vectorcode.cli_utils import Config @@ -279,7 +279,6 @@ def __load_file_lines(self, path: str) -> list[str]: lines = fin.readlines() return lines - def __get_parser_from_config(self, 
file_path: str): """ Get parser based on filetype_map config. @@ -291,23 +290,31 @@ def __get_parser_from_config(self, file_path: str): filename = os.path.basename(file_path) extension = os.path.splitext(file_path)[1] - if extension.startswith('.'): + if extension.startswith("."): extension = extension[1:] logger.debug(f"Checking filetype map for extension '{extension}' in {filename}") for _language, patterns in filetype_map.items(): - language = _language.lower() + language = _language.lower() for pattern in patterns: try: if re.search(pattern, extension): - logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.") - parser = get_parser(language) - logger.debug(f"Found parser for language '{language}' from config.") + logger.debug( + f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser." + ) + parser = get_parser(cast(SupportedLanguage, language)) + logger.debug( + f"Found parser for language '{language}' from config." + ) return parser except re.error as e: - e.add_note(f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map") + e.add_note( + f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map" + ) raise except LookupError as e: - e.add_note(f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config.") + e.add_note( + f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config." 
+ ) raise logger.debug(f"No matching filetype map entry found for {filename}.") @@ -336,11 +343,12 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]: lang_names.extend(lexer.aliases) for name in lang_names: try: - parser = get_parser(name.lower()) + parser = get_parser(cast(SupportedLanguage, name.lower())) if parser is not None: language = name.lower() logger.debug( - "Detected %s filetype for treesitter chunking.", language + "Detected %s filetype for treesitter chunking.", + language, ) break except LookupError: # pragma: nocover diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index 36db2703..5957fe80 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -18,8 +18,6 @@ logger = logging.getLogger(name=__name__) -PathLike = Union[str, Path] - GLOBAL_CONFIG_PATH = os.path.join( os.path.expanduser("~"), ".config", "vectorcode", "config.json" ) @@ -70,8 +68,8 @@ class Config: to_be_deleted: list[str] = field(default_factory=list) pipe: bool = False action: Optional[CliAction] = None - files: list[PathLike] = field(default_factory=list) - project_root: Optional[PathLike] = None + files: list[Union[str, os.PathLike]] = field(default_factory=list) + project_root: Optional[Union[str, Path]] = None query: Optional[list[str]] = None db_url: str = "http://127.0.0.1:8000" embedding_function: str = "SentenceTransformerEmbeddingFunction" # This should fallback to whatever the default is. 
@@ -84,7 +82,7 @@ class Config: chunk_size: int = 2500 overlap_ratio: float = 0.2 query_multiplier: int = -1 - query_exclude: list[PathLike] = field(default_factory=list) + query_exclude: list[Union[str, os.PathLike]] = field(default_factory=list) reranker: Optional[str] = "CrossEncoderReranker" reranker_params: dict[str, Any] = field(default_factory=lambda: {}) check_item: Optional[str] = None @@ -214,7 +212,7 @@ def get_cli_parser(): "--project_root", default=None, help="Project root to be used as an identifier of the project.", - ).complete = shtab.DIRECTORY + ).complete = shtab.DIRECTORY # type:ignore shared_parser.add_argument( "--pipe", "-p", @@ -226,7 +224,7 @@ def get_cli_parser(): "--no_stderr", action="store_true", default=False, - help="Supress all STDERR messages.", + help="Suppress all STDERR messages.", ) main_parser = argparse.ArgumentParser( "vectorcode", @@ -253,7 +251,7 @@ def get_cli_parser(): ) vectorise_parser.add_argument( "file_paths", nargs="*", help="Paths to files to be vectorised." - ).complete = shtab.FILE + ).complete = shtab.FILE # type:ignore vectorise_parser.add_argument( "--recursive", "-r", @@ -297,7 +295,7 @@ def get_cli_parser(): ) query_parser.add_argument( "--exclude", nargs="*", help="Files to exclude from query results." - ).complete = shtab.FILE + ).complete = shtab.FILE # type:ignore query_parser.add_argument( "--absolute", default=False, @@ -382,7 +380,7 @@ def get_cli_parser(): ) chunks_parser.add_argument( "file_paths", nargs="*", help="Paths to files to be chunked." 
- ).complete = shtab.FILE + ).complete = shtab.FILE # type:ignore return main_parser @@ -443,7 +441,7 @@ def expand_envs_in_dict(d: dict): stack.append(curr[k]) -async def load_config_file(path: Optional[PathLike] = None): +async def load_config_file(path: Optional[Union[str, Path]] = None): """Load config file from ~/.config/vectorcode/config.json""" if path is None: path = GLOBAL_CONFIG_PATH @@ -466,7 +464,7 @@ async def load_config_file(path: Optional[PathLike] = None): return Config() -async def find_project_config_dir(start_from: PathLike = "."): +async def find_project_config_dir(start_from: Union[str, Path] = "."): """Returns the project-local config directory.""" current_dir = Path(start_from).resolve() project_root_anchors = [".vectorcode", ".git"] @@ -486,7 +484,7 @@ async def find_project_config_dir(start_from: PathLike = "."): def find_project_root( - start_from: PathLike, root_anchor: PathLike = ".vectorcode" + start_from: Union[str, Path], root_anchor: Union[str, Path] = ".vectorcode" ) -> str | None: start_from = Path(start_from) if os.path.isfile(start_from): @@ -500,7 +498,7 @@ def find_project_root( start_from = start_from.parent -async def get_project_config(project_root: PathLike) -> Config: +async def get_project_config(project_root: Union[str, Path]) -> Config: """ Load config file for `project_root`. Fallback to global config, and then default config. 
@@ -520,7 +518,7 @@ async def get_project_config(project_root: PathLike) -> Config: return config -def expand_path(path: PathLike, absolute: bool = False) -> PathLike: +def expand_path(path: Union[str, Path], absolute: bool = False) -> Union[str, Path]: expanded = os.path.expanduser(os.path.expandvars(path)) if absolute: return os.path.abspath(expanded) @@ -528,21 +526,21 @@ def expand_path(path: PathLike, absolute: bool = False) -> PathLike: async def expand_globs( - paths: list[PathLike], recursive: bool = False, include_hidden: bool = False -) -> list[PathLike]: + paths: Sequence[os.PathLike | str], + recursive: bool = False, + include_hidden: bool = False, +) -> list[str]: result = set() - stack = paths + stack = list(str(i) for i in paths) while stack: curr = stack.pop() if os.path.isfile(curr): result.add(expand_path(curr)) elif "**" in str(curr): - stack.extend( - glob.glob(str(curr), recursive=True, include_hidden=include_hidden) - ) + stack.extend(glob.glob(curr, recursive=True, include_hidden=include_hidden)) elif "*" in str(curr): stack.extend( - glob.glob(str(curr), recursive=recursive, include_hidden=include_hidden) + glob.glob(curr, recursive=recursive, include_hidden=include_hidden) ) elif os.path.isdir(curr) and recursive: stack.extend( diff --git a/src/vectorcode/common.py b/src/vectorcode/common.py index 8ebe5ba5..bfde24a0 100644 --- a/src/vectorcode/common.py +++ b/src/vectorcode/common.py @@ -158,7 +158,7 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None logger.warning( f"Failed to use {configs.embedding_function}. 
Falling back to Sentence Transformer.", ) - return embedding_functions.SentenceTransformerEmbeddingFunction() + return embedding_functions.SentenceTransformerEmbeddingFunction() # type:ignore except Exception as e: e.add_note( "\nFor errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment." diff --git a/src/vectorcode/subcommands/ls.py b/src/vectorcode/subcommands/ls.py index 754ea2a0..246eb85b 100644 --- a/src/vectorcode/subcommands/ls.py +++ b/src/vectorcode/subcommands/ls.py @@ -19,7 +19,7 @@ async def get_collection_list(client: AsyncClientAPI) -> list[dict]: meta = collection.metadata document_meta = await collection.get(include=[IncludeEnum.metadatas]) unique_files = set( - i.get("path") for i in document_meta["metadatas"] if i is not None + i.get("path") for i in (document_meta["metadatas"] or []) if i is not None ) result.append( { diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index f22a2e16..e29eab2f 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -1,8 +1,9 @@ import json import logging import os +from typing import cast -from chromadb import GetResult +from chromadb import GetResult, Where from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException @@ -70,7 +71,7 @@ async def get_query_result_files( IncludeEnum.distances, IncludeEnum.documents, ], - where=filter or None, + where=cast(Where, filter) or None, ) except IndexError: # no results found @@ -99,21 +100,23 @@ async def build_query_results( {str(key): full_result[str(key)] for key in configs.include} ) elif QueryInclude.chunk in configs.include: - chunk: GetResult = await collection.get( + chunks: GetResult 
= await collection.get( identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents] ) - meta = chunk.get( + meta = chunks.get( "metadatas", ) if meta is not None and len(meta) != 0: - full_result: dict[str, str | int] = { - "chunk": str(chunk.get("documents", [""])[0]) - } + chunk_texts = chunks.get("documents") + assert chunk_texts is not None, ( + "QueryResult does not contain `documents`!" + ) + full_result: dict[str, str | int] = {"chunk": str(chunk_texts[0])} if meta[0].get("start") is not None and meta[0].get("end") is not None: path = str(meta[0].get("path")) with open(path) as fin: - start: int = meta[0]["start"] - end: int = meta[0]["end"] + start: int = int(meta[0]["start"]) + end: int = int(meta[0]["end"]) full_result["chunk"] = "".join(fin.readlines()[start : end + 1]) full_result["start_line"] = start full_result["end_line"] = end @@ -122,7 +125,7 @@ async def build_query_results( meta[0]["path"] if configs.use_absolute_path else os.path.relpath( - meta[0]["path"], str(configs.project_root) + str(meta[0]["path"]), str(configs.project_root) ) ) diff --git a/src/vectorcode/subcommands/query/reranker/naive.py b/src/vectorcode/subcommands/query/reranker/naive.py index 4ec6a8e3..6e6b4336 100644 --- a/src/vectorcode/subcommands/query/reranker/naive.py +++ b/src/vectorcode/subcommands/query/reranker/naive.py @@ -24,4 +24,8 @@ async def compute_similarity( assert self._raw_results.get("distances") is not None assert self.configs.query, "Expecting query messages in self.configs" idx = self.configs.query.index(query_message) - return list(-i for i in self._raw_results.get("distances")[idx]) + dist = self._raw_results.get("distances") + if dist is None: # pragma: nocover + raise ValueError("QueryResult should contain distances!") + else: + return list(-i for i in dist[idx]) diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index b0c50d8c..a838124f 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ 
b/src/vectorcode/subcommands/vectorise.py @@ -78,7 +78,7 @@ async def chunked_add( logger.debug(f"Chunked into {len(chunks)} pieces.") metas = [] for chunk in chunks: - meta: dict[str, str | dict[str, int]] = {"path": full_path_str} + meta: dict[str, str | int] = {"path": full_path_str} if isinstance(chunk, Chunk): meta["start"] = chunk.start.row meta["end"] = chunk.end.row @@ -231,7 +231,7 @@ async def vectorise(configs: Config) -> int: async with collection_lock: all_results = await collection.get(include=[IncludeEnum.metadatas]) if all_results is not None and all_results.get("metadatas"): - paths = (meta["path"] for meta in all_results["metadatas"]) + paths = (meta["path"] for meta in (all_results["metadatas"] or [])) orphans = set() for path in paths: if isinstance(path, str) and not os.path.isfile(path):