Davidyz · Davidyz · May 22, 2025 · May 22, 2025
diff --git a/.github/workflows/test_and_cov.yml b/.github/workflows/test_and_cov.yml
@@ -44,9 +44,9 @@ jobs:
       - name: Set custom HF cache directory
         run: |
           export HF_HOME="$GITHUB_WORKSPACE/hf_cache"
-          export SENTENCE_TRANSFORMERS_HOME=$HF_HOME
-          mkdir -p $HF_HOME
-          [ -z "$(ls $HF_HOME)" ] || rm $HF_HOME/* -rf && true
+          export SENTENCE_TRANSFORMERS_HOME="$HF_HOME"
+          mkdir -p "$HF_HOME"
+          [ -z "$(ls "$HF_HOME")" ] || rm -rf "${HF_HOME:?}"/* && true  # keep /* outside the quotes so the glob expands
 
       - name: run tests 
         run: pdm run pytest --enable-coredumpy --coredumpy-dir ${{ env.COREDUMPY_DUMP_DIR }}
@@ -57,6 +57,9 @@ jobs:
           pdm run coverage report -m
           pdm run coverage xml -i
 
+      - name: static analysis by basedpyright
+        run: pdm run basedpyright
+
       - name: upload coverage reports to codecov
         uses: codecov/codecov-action@v5
         with:

diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
@@ -20,6 +20,10 @@ run tests, and `make coverage` to generate a coverage report. The testing and
 coverage report are also in the CI configuration, but it might still help to run
 them locally before you open the PR.
 
+This project also runs static analysis with
+[basedpyright](https://docs.basedpyright.com). The GitHub Actions workflow
+runs the same check on every pull request.
+
 You may also find it helpful to 
 [enable logging](https://github.com/Davidyz/VectorCode/blob/main/docs/cli.md#debugging-and-diagnosing) 
 for the CLI when developing new features or working on fixes.

diff --git a/pyproject.toml b/pyproject.toml
@@ -39,14 +39,6 @@ vectorcode-mcp-server = "vectorcode.mcp_main:main"
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 
-[tool.pdm]
-distribution = true
-
-[tool.pdm.version]
-source = "scm"
-write_to = "./vectorcode/_version.py"
-write_template = "__version__ = '{}' # pragma: no cover"
-
 [tool.coverage.run]
 omit = [
     "./tests/*",
@@ -56,6 +48,15 @@ omit = [
 ]
 include = ['src/vectorcode/**/*.py']
 
+
+[tool.pdm]
+distribution = true
+
+[tool.pdm.version]
+source = "scm"
+write_to = "./vectorcode/_version.py"
+write_template = "__version__ = '{}' # pragma: no cover"
+
 [dependency-groups]
 dev = [
     "ipython>=8.31.0",
@@ -68,10 +69,15 @@ dev = [
     "pytest-asyncio>=0.25.3",
     "debugpy>=1.8.12",
     "coredumpy>=0.4.1",
+    "basedpyright>=1.29.2",
 ]
 
 [project.optional-dependencies]
 legacy = ["numpy<2.0.0", "torch==2.2.2", "transformers<=4.49.0"]
 intel = ['optimum[openvino]', 'openvino']
 lsp = ['pygls<2.0.0', 'lsprotocol']
 mcp = ['mcp<2.0.0', 'pydantic']
+
+[tool.basedpyright]
+typeCheckingMode = "standard"
+ignore = ["./tests/"]
diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
@@ -5,13 +5,13 @@
 from dataclasses import dataclass
 from functools import cache
 from io import TextIOWrapper
-from typing import Generator, Optional
+from typing import Generator, Optional, cast
 
 from pygments.lexer import Lexer
 from pygments.lexers import get_lexer_for_filename
 from pygments.util import ClassNotFound
 from tree_sitter import Node, Point
-from tree_sitter_language_pack import get_parser
+from tree_sitter_language_pack import SupportedLanguage, get_parser
 
 from vectorcode.cli_utils import Config
 
@@ -279,7 +279,6 @@ def __load_file_lines(self, path: str) -> list[str]:
             lines = fin.readlines()
         return lines
 
-
     def __get_parser_from_config(self, file_path: str):
         """
         Get parser based on filetype_map config.
@@ -291,23 +290,31 @@ def __get_parser_from_config(self, file_path: str):
 
         filename = os.path.basename(file_path)
         extension = os.path.splitext(file_path)[1]
-        if extension.startswith('.'):
+        if extension.startswith("."):
             extension = extension[1:]
         logger.debug(f"Checking filetype map for extension '{extension}' in {filename}")
         for _language, patterns in filetype_map.items():
-            language =  _language.lower()
+            language = _language.lower()
             for pattern in patterns:
                 try:
                     if re.search(pattern, extension):
-                        logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.")
-                        parser = get_parser(language)
-                        logger.debug(f"Found parser for language '{language}' from config.")
+                        logger.debug(
+                            f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser."
+                        )
+                        parser = get_parser(cast(SupportedLanguage, language))
+                        logger.debug(
+                            f"Found parser for language '{language}' from config."
+                        )
                         return parser
                 except re.error as e:
-                    e.add_note(f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map")
+                    e.add_note(
+                        f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map"
+                    )
                     raise
                 except LookupError as e:
-                    e.add_note(f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config.")
+                    e.add_note(
+                        f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config."
+                    )
                     raise
 
         logger.debug(f"No matching filetype map entry found for {filename}.")
@@ -336,11 +343,12 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
                 lang_names.extend(lexer.aliases)
                 for name in lang_names:
                     try:
-                        parser = get_parser(name.lower())
+                        parser = get_parser(cast(SupportedLanguage, name.lower()))
                         if parser is not None:
                             language = name.lower()
                             logger.debug(
-                                "Detected %s filetype for treesitter chunking.", language
+                                "Detected %s filetype for treesitter chunking.",
+                                language,
                             )
                             break
                     except LookupError:  # pragma: nocover

diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py
@@ -18,8 +18,6 @@
 logger = logging.getLogger(name=__name__)
 
 
-PathLike = Union[str, Path]
-
 GLOBAL_CONFIG_PATH = os.path.join(
     os.path.expanduser("~"), ".config", "vectorcode", "config.json"
 )
@@ -70,8 +68,8 @@ class Config:
     to_be_deleted: list[str] = field(default_factory=list)
     pipe: bool = False
     action: Optional[CliAction] = None
-    files: list[PathLike] = field(default_factory=list)
-    project_root: Optional[PathLike] = None
+    files: list[Union[str, os.PathLike]] = field(default_factory=list)
+    project_root: Optional[Union[str, Path]] = None
     query: Optional[list[str]] = None
     db_url: str = "http://127.0.0.1:8000"
     embedding_function: str = "SentenceTransformerEmbeddingFunction"  # This should fallback to whatever the default is.
@@ -84,7 +82,7 @@ class Config:
     chunk_size: int = 2500
     overlap_ratio: float = 0.2
     query_multiplier: int = -1
-    query_exclude: list[PathLike] = field(default_factory=list)
+    query_exclude: list[Union[str, os.PathLike]] = field(default_factory=list)
     reranker: Optional[str] = "CrossEncoderReranker"
     reranker_params: dict[str, Any] = field(default_factory=lambda: {})
     check_item: Optional[str] = None
@@ -214,7 +212,7 @@ def get_cli_parser():
         "--project_root",
         default=None,
         help="Project root to be used as an identifier of the project.",
-    ).complete = shtab.DIRECTORY
+    ).complete = shtab.DIRECTORY  # type:ignore
     shared_parser.add_argument(
         "--pipe",
         "-p",
@@ -226,7 +224,7 @@ def get_cli_parser():
         "--no_stderr",
         action="store_true",
         default=False,
-        help="Supress all STDERR messages.",
+        help="Suppress all STDERR messages.",
     )
     main_parser = argparse.ArgumentParser(
         "vectorcode",
@@ -253,7 +251,7 @@ def get_cli_parser():
     )
     vectorise_parser.add_argument(
         "file_paths", nargs="*", help="Paths to files to be vectorised."
-    ).complete = shtab.FILE
+    ).complete = shtab.FILE  # type:ignore
     vectorise_parser.add_argument(
         "--recursive",
         "-r",
@@ -297,7 +295,7 @@ def get_cli_parser():
     )
     query_parser.add_argument(
         "--exclude", nargs="*", help="Files to exclude from query results."
-    ).complete = shtab.FILE
+    ).complete = shtab.FILE  # type:ignore
     query_parser.add_argument(
         "--absolute",
         default=False,
@@ -382,7 +380,7 @@ def get_cli_parser():
     )
     chunks_parser.add_argument(
         "file_paths", nargs="*", help="Paths to files to be chunked."
-    ).complete = shtab.FILE
+    ).complete = shtab.FILE  # type:ignore
     return main_parser
 
 
@@ -443,7 +441,7 @@ def expand_envs_in_dict(d: dict):
                 stack.append(curr[k])
 
 
-async def load_config_file(path: Optional[PathLike] = None):
+async def load_config_file(path: Optional[Union[str, Path]] = None):
     """Load config file from ~/.config/vectorcode/config.json"""
     if path is None:
         path = GLOBAL_CONFIG_PATH
@@ -466,7 +464,7 @@ async def load_config_file(path: Optional[PathLike] = None):
     return Config()
 
 
-async def find_project_config_dir(start_from: PathLike = "."):
+async def find_project_config_dir(start_from: Union[str, Path] = "."):
     """Returns the project-local config directory."""
     current_dir = Path(start_from).resolve()
     project_root_anchors = [".vectorcode", ".git"]
@@ -486,7 +484,7 @@ async def find_project_config_dir(start_from: PathLike = "."):
 
 
 def find_project_root(
-    start_from: PathLike, root_anchor: PathLike = ".vectorcode"
+    start_from: Union[str, Path], root_anchor: Union[str, Path] = ".vectorcode"
 ) -> str | None:
     start_from = Path(start_from)
     if os.path.isfile(start_from):
@@ -500,7 +498,7 @@ def find_project_root(
         start_from = start_from.parent
 
 
-async def get_project_config(project_root: PathLike) -> Config:
+async def get_project_config(project_root: Union[str, Path]) -> Config:
     """
     Load config file for `project_root`.
     Fallback to global config, and then default config.
@@ -520,29 +518,29 @@ async def get_project_config(project_root: PathLike) -> Config:
     return config
 
 
-def expand_path(path: PathLike, absolute: bool = False) -> PathLike:
+def expand_path(path: Union[str, Path], absolute: bool = False) -> Union[str, Path]:
     expanded = os.path.expanduser(os.path.expandvars(path))
     if absolute:
         return os.path.abspath(expanded)
     return expanded
 
 
 async def expand_globs(
-    paths: list[PathLike], recursive: bool = False, include_hidden: bool = False
-) -> list[PathLike]:
+    paths: Sequence[os.PathLike | str],
+    recursive: bool = False,
+    include_hidden: bool = False,
+) -> list[str]:
     result = set()
-    stack = paths
+    stack = list(str(i) for i in paths)
     while stack:
         curr = stack.pop()
         if os.path.isfile(curr):
             result.add(expand_path(curr))
         elif "**" in str(curr):
-            stack.extend(
-                glob.glob(str(curr), recursive=True, include_hidden=include_hidden)
-            )
+            stack.extend(glob.glob(curr, recursive=True, include_hidden=include_hidden))
         elif "*" in str(curr):
             stack.extend(
-                glob.glob(str(curr), recursive=recursive, include_hidden=include_hidden)
+                glob.glob(curr, recursive=recursive, include_hidden=include_hidden)
             )
         elif os.path.isdir(curr) and recursive:
             stack.extend(

diff --git a/src/vectorcode/common.py b/src/vectorcode/common.py
@@ -158,7 +158,7 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None
         logger.warning(
             f"Failed to use {configs.embedding_function}. Falling back to Sentence Transformer.",
         )
-        return embedding_functions.SentenceTransformerEmbeddingFunction()
+        return embedding_functions.SentenceTransformerEmbeddingFunction()  # type:ignore
     except Exception as e:
         e.add_note(
             "\nFor errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment."

diff --git a/src/vectorcode/subcommands/ls.py b/src/vectorcode/subcommands/ls.py
@@ -19,7 +19,7 @@ async def get_collection_list(client: AsyncClientAPI) -> list[dict]:
         meta = collection.metadata
         document_meta = await collection.get(include=[IncludeEnum.metadatas])
         unique_files = set(
-            i.get("path") for i in document_meta["metadatas"] if i is not None
+            i.get("path") for i in (document_meta["metadatas"] or []) if i is not None
         )
         result.append(
             {

diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py
@@ -1,8 +1,9 @@
 import json
 import logging
 import os
+from typing import cast
 
-from chromadb import GetResult
+from chromadb import GetResult, Where
 from chromadb.api.models.AsyncCollection import AsyncCollection
 from chromadb.api.types import IncludeEnum
 from chromadb.errors import InvalidCollectionException, InvalidDimensionException
@@ -70,7 +71,7 @@ async def get_query_result_files(
                 IncludeEnum.distances,
                 IncludeEnum.documents,
             ],
-            where=filter or None,
+            where=cast(Where, filter) or None,
         )
     except IndexError:
         # no results found
@@ -99,21 +100,23 @@ async def build_query_results(
                 {str(key): full_result[str(key)] for key in configs.include}
             )
         elif QueryInclude.chunk in configs.include:
-            chunk: GetResult = await collection.get(
+            chunks: GetResult = await collection.get(
                 identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
             )
-            meta = chunk.get(
+            meta = chunks.get(
                 "metadatas",
             )
             if meta is not None and len(meta) != 0:
-                full_result: dict[str, str | int] = {
-                    "chunk": str(chunk.get("documents", [""])[0])
-                }
+                chunk_texts = chunks.get("documents")
+                assert chunk_texts is not None, (
+                    "QueryResult does not contain `documents`!"
+                )
+                full_result: dict[str, str | int] = {"chunk": str(chunk_texts[0])}
                 if meta[0].get("start") is not None and meta[0].get("end") is not None:
                     path = str(meta[0].get("path"))
                     with open(path) as fin:
-                        start: int = meta[0]["start"]
-                        end: int = meta[0]["end"]
+                        start: int = int(meta[0]["start"])
+                        end: int = int(meta[0]["end"])
                         full_result["chunk"] = "".join(fin.readlines()[start : end + 1])
                     full_result["start_line"] = start
                     full_result["end_line"] = end
@@ -122,7 +125,7 @@ async def build_query_results(
                             meta[0]["path"]
                             if configs.use_absolute_path
                             else os.path.relpath(
-                                meta[0]["path"], str(configs.project_root)
+                                str(meta[0]["path"]), str(configs.project_root)
                             )
                         )
 

diff --git a/src/vectorcode/subcommands/query/reranker/naive.py b/src/vectorcode/subcommands/query/reranker/naive.py
@@ -24,4 +24,8 @@ async def compute_similarity(
         assert self._raw_results.get("distances") is not None
         assert self.configs.query, "Expecting query messages in self.configs"
         idx = self.configs.query.index(query_message)
-        return list(-i for i in self._raw_results.get("distances")[idx])
+        dist = self._raw_results.get("distances")
+        if dist is None:  # pragma: nocover
+            raise ValueError("QueryResult should contain distances!")
+        else:
+            return list(-i for i in dist[idx])