Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .github/workflows/test_and_cov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ jobs:
- name: Set custom HF cache directory
run: |
export HF_HOME="$GITHUB_WORKSPACE/hf_cache"
export SENTENCE_TRANSFORMERS_HOME=$HF_HOME
mkdir -p $HF_HOME
[ -z "$(ls $HF_HOME)" ] || rm $HF_HOME/* -rf && true
export SENTENCE_TRANSFORMERS_HOME="$HF_HOME"
mkdir -p "$HF_HOME"
# Empty the cache directory if it has any content. The glob must stay OUTSIDE
# the quotes, otherwise the shell passes a literal "*" to rm and nothing is
# removed. "${HF_HOME:?}" aborts the command if the variable is unset or empty,
# so this can never expand to "rm -rf /*".
[ -z "$(ls "$HF_HOME")" ] || rm -rf "${HF_HOME:?}"/* && true

- name: run tests
run: pdm run pytest --enable-coredumpy --coredumpy-dir ${{ env.COREDUMPY_DUMP_DIR }}
Expand All @@ -57,6 +57,9 @@ jobs:
pdm run coverage report -m
pdm run coverage xml -i

- name: static analysis by basedpyright
run: pdm run basedpyright

- name: upload coverage reports to codecov
uses: codecov/codecov-action@v5
with:
Expand Down
4 changes: 4 additions & 0 deletions docs/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ run tests, and `make coverage` to generate a coverage report. The testing and
coverage report are also in the CI configuration, but it might still help to run
them locally before you open the PR.

This project also runs static analysis with
[basedpyright](https://docs.basedpyright.com). You can run it locally with
`pdm run basedpyright`; the GitHub Actions workflow also runs this check when a
PR is submitted.

You may also find it helpful to
[enable logging](https://github.com/Davidyz/VectorCode/blob/main/docs/cli.md#debugging-and-diagnosing)
for the CLI when developing new features or working on fixes.
Expand Down
22 changes: 14 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,6 @@ vectorcode-mcp-server = "vectorcode.mcp_main:main"
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[tool.pdm]
distribution = true

[tool.pdm.version]
source = "scm"
write_to = "./vectorcode/_version.py"
write_template = "__version__ = '{}' # pragma: no cover"

[tool.coverage.run]
omit = [
"./tests/*",
Expand All @@ -56,6 +48,15 @@ omit = [
]
include = ['src/vectorcode/**/*.py']


[tool.pdm]
distribution = true

[tool.pdm.version]
source = "scm"
write_to = "./vectorcode/_version.py"
write_template = "__version__ = '{}' # pragma: no cover"

[dependency-groups]
dev = [
"ipython>=8.31.0",
Expand All @@ -68,10 +69,15 @@ dev = [
"pytest-asyncio>=0.25.3",
"debugpy>=1.8.12",
"coredumpy>=0.4.1",
"basedpyright>=1.29.2",
]

[project.optional-dependencies]
legacy = ["numpy<2.0.0", "torch==2.2.2", "transformers<=4.49.0"]
intel = ['optimum[openvino]', 'openvino']
lsp = ['pygls<2.0.0', 'lsprotocol']
mcp = ['mcp<2.0.0', 'pydantic']

[tool.basedpyright]
typeCheckingMode = "standard"
ignore = ["./tests/"]
32 changes: 20 additions & 12 deletions src/vectorcode/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from dataclasses import dataclass
from functools import cache
from io import TextIOWrapper
from typing import Generator, Optional
from typing import Generator, Optional, cast

from pygments.lexer import Lexer
from pygments.lexers import get_lexer_for_filename
from pygments.util import ClassNotFound
from tree_sitter import Node, Point
from tree_sitter_language_pack import get_parser
from tree_sitter_language_pack import SupportedLanguage, get_parser

from vectorcode.cli_utils import Config

Expand Down Expand Up @@ -279,7 +279,6 @@ def __load_file_lines(self, path: str) -> list[str]:
lines = fin.readlines()
return lines


def __get_parser_from_config(self, file_path: str):
"""
Get parser based on filetype_map config.
Expand All @@ -291,23 +290,31 @@ def __get_parser_from_config(self, file_path: str):

filename = os.path.basename(file_path)
extension = os.path.splitext(file_path)[1]
if extension.startswith('.'):
if extension.startswith("."):
extension = extension[1:]
logger.debug(f"Checking filetype map for extension '{extension}' in {filename}")
for _language, patterns in filetype_map.items():
language = _language.lower()
language = _language.lower()
for pattern in patterns:
try:
if re.search(pattern, extension):
logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.")
parser = get_parser(language)
logger.debug(f"Found parser for language '{language}' from config.")
logger.debug(
f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser."
)
parser = get_parser(cast(SupportedLanguage, language))
logger.debug(
f"Found parser for language '{language}' from config."
)
return parser
except re.error as e:
e.add_note(f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map")
e.add_note(
f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map"
)
raise
except LookupError as e:
e.add_note(f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config.")
e.add_note(
f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config."
)
raise

logger.debug(f"No matching filetype map entry found for {filename}.")
Expand Down Expand Up @@ -336,11 +343,12 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
lang_names.extend(lexer.aliases)
for name in lang_names:
try:
parser = get_parser(name.lower())
parser = get_parser(cast(SupportedLanguage, name.lower()))
if parser is not None:
language = name.lower()
logger.debug(
"Detected %s filetype for treesitter chunking.", language
"Detected %s filetype for treesitter chunking.",
language,
)
break
except LookupError: # pragma: nocover
Expand Down
42 changes: 20 additions & 22 deletions src/vectorcode/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
logger = logging.getLogger(name=__name__)


PathLike = Union[str, Path]

GLOBAL_CONFIG_PATH = os.path.join(
os.path.expanduser("~"), ".config", "vectorcode", "config.json"
)
Expand Down Expand Up @@ -70,8 +68,8 @@ class Config:
to_be_deleted: list[str] = field(default_factory=list)
pipe: bool = False
action: Optional[CliAction] = None
files: list[PathLike] = field(default_factory=list)
project_root: Optional[PathLike] = None
files: list[Union[str, os.PathLike]] = field(default_factory=list)
project_root: Optional[Union[str, Path]] = None
query: Optional[list[str]] = None
db_url: str = "http://127.0.0.1:8000"
embedding_function: str = "SentenceTransformerEmbeddingFunction" # This should fallback to whatever the default is.
Expand All @@ -84,7 +82,7 @@ class Config:
chunk_size: int = 2500
overlap_ratio: float = 0.2
query_multiplier: int = -1
query_exclude: list[PathLike] = field(default_factory=list)
query_exclude: list[Union[str, os.PathLike]] = field(default_factory=list)
reranker: Optional[str] = "CrossEncoderReranker"
reranker_params: dict[str, Any] = field(default_factory=lambda: {})
check_item: Optional[str] = None
Expand Down Expand Up @@ -214,7 +212,7 @@ def get_cli_parser():
"--project_root",
default=None,
help="Project root to be used as an identifier of the project.",
).complete = shtab.DIRECTORY
).complete = shtab.DIRECTORY # type:ignore
shared_parser.add_argument(
"--pipe",
"-p",
Expand All @@ -226,7 +224,7 @@ def get_cli_parser():
"--no_stderr",
action="store_true",
default=False,
help="Supress all STDERR messages.",
help="Suppress all STDERR messages.",
)
main_parser = argparse.ArgumentParser(
"vectorcode",
Expand All @@ -253,7 +251,7 @@ def get_cli_parser():
)
vectorise_parser.add_argument(
"file_paths", nargs="*", help="Paths to files to be vectorised."
).complete = shtab.FILE
).complete = shtab.FILE # type:ignore
vectorise_parser.add_argument(
"--recursive",
"-r",
Expand Down Expand Up @@ -297,7 +295,7 @@ def get_cli_parser():
)
query_parser.add_argument(
"--exclude", nargs="*", help="Files to exclude from query results."
).complete = shtab.FILE
).complete = shtab.FILE # type:ignore
query_parser.add_argument(
"--absolute",
default=False,
Expand Down Expand Up @@ -382,7 +380,7 @@ def get_cli_parser():
)
chunks_parser.add_argument(
"file_paths", nargs="*", help="Paths to files to be chunked."
).complete = shtab.FILE
).complete = shtab.FILE # type:ignore
return main_parser


Expand Down Expand Up @@ -443,7 +441,7 @@ def expand_envs_in_dict(d: dict):
stack.append(curr[k])


async def load_config_file(path: Optional[PathLike] = None):
async def load_config_file(path: Optional[Union[str, Path]] = None):
"""Load config file from ~/.config/vectorcode/config.json"""
if path is None:
path = GLOBAL_CONFIG_PATH
Expand All @@ -466,7 +464,7 @@ async def load_config_file(path: Optional[PathLike] = None):
return Config()


async def find_project_config_dir(start_from: PathLike = "."):
async def find_project_config_dir(start_from: Union[str, Path] = "."):
"""Returns the project-local config directory."""
current_dir = Path(start_from).resolve()
project_root_anchors = [".vectorcode", ".git"]
Expand All @@ -486,7 +484,7 @@ async def find_project_config_dir(start_from: PathLike = "."):


def find_project_root(
start_from: PathLike, root_anchor: PathLike = ".vectorcode"
start_from: Union[str, Path], root_anchor: Union[str, Path] = ".vectorcode"
) -> str | None:
start_from = Path(start_from)
if os.path.isfile(start_from):
Expand All @@ -500,7 +498,7 @@ def find_project_root(
start_from = start_from.parent


async def get_project_config(project_root: PathLike) -> Config:
async def get_project_config(project_root: Union[str, Path]) -> Config:
"""
Load config file for `project_root`.
Fallback to global config, and then default config.
Expand All @@ -520,29 +518,29 @@ async def get_project_config(project_root: PathLike) -> Config:
return config


def expand_path(path: PathLike, absolute: bool = False) -> PathLike:
def expand_path(path: Union[str, Path], absolute: bool = False) -> Union[str, Path]:
expanded = os.path.expanduser(os.path.expandvars(path))
if absolute:
return os.path.abspath(expanded)
return expanded


async def expand_globs(
paths: list[PathLike], recursive: bool = False, include_hidden: bool = False
) -> list[PathLike]:
paths: Sequence[os.PathLike | str],
recursive: bool = False,
include_hidden: bool = False,
) -> list[str]:
result = set()
stack = paths
stack = list(str(i) for i in paths)
while stack:
curr = stack.pop()
if os.path.isfile(curr):
result.add(expand_path(curr))
elif "**" in str(curr):
stack.extend(
glob.glob(str(curr), recursive=True, include_hidden=include_hidden)
)
stack.extend(glob.glob(curr, recursive=True, include_hidden=include_hidden))
elif "*" in str(curr):
stack.extend(
glob.glob(str(curr), recursive=recursive, include_hidden=include_hidden)
glob.glob(curr, recursive=recursive, include_hidden=include_hidden)
)
elif os.path.isdir(curr) and recursive:
stack.extend(
Expand Down
2 changes: 1 addition & 1 deletion src/vectorcode/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None
logger.warning(
f"Failed to use {configs.embedding_function}. Falling back to Sentence Transformer.",
)
return embedding_functions.SentenceTransformerEmbeddingFunction()
return embedding_functions.SentenceTransformerEmbeddingFunction() # type:ignore
except Exception as e:
e.add_note(
"\nFor errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment."
Expand Down
2 changes: 1 addition & 1 deletion src/vectorcode/subcommands/ls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ async def get_collection_list(client: AsyncClientAPI) -> list[dict]:
meta = collection.metadata
document_meta = await collection.get(include=[IncludeEnum.metadatas])
unique_files = set(
i.get("path") for i in document_meta["metadatas"] if i is not None
i.get("path") for i in (document_meta["metadatas"] or []) if i is not None
)
result.append(
{
Expand Down
23 changes: 13 additions & 10 deletions src/vectorcode/subcommands/query/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json
import logging
import os
from typing import cast

from chromadb import GetResult
from chromadb import GetResult, Where
from chromadb.api.models.AsyncCollection import AsyncCollection
from chromadb.api.types import IncludeEnum
from chromadb.errors import InvalidCollectionException, InvalidDimensionException
Expand Down Expand Up @@ -70,7 +71,7 @@ async def get_query_result_files(
IncludeEnum.distances,
IncludeEnum.documents,
],
where=filter or None,
where=cast(Where, filter) or None,
)
except IndexError:
# no results found
Expand Down Expand Up @@ -99,21 +100,23 @@ async def build_query_results(
{str(key): full_result[str(key)] for key in configs.include}
)
elif QueryInclude.chunk in configs.include:
chunk: GetResult = await collection.get(
chunks: GetResult = await collection.get(
identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
)
meta = chunk.get(
meta = chunks.get(
"metadatas",
)
if meta is not None and len(meta) != 0:
full_result: dict[str, str | int] = {
"chunk": str(chunk.get("documents", [""])[0])
}
chunk_texts = chunks.get("documents")
assert chunk_texts is not None, (
"QueryResult does not contain `documents`!"
)
full_result: dict[str, str | int] = {"chunk": str(chunk_texts[0])}
if meta[0].get("start") is not None and meta[0].get("end") is not None:
path = str(meta[0].get("path"))
with open(path) as fin:
start: int = meta[0]["start"]
end: int = meta[0]["end"]
start: int = int(meta[0]["start"])
end: int = int(meta[0]["end"])
full_result["chunk"] = "".join(fin.readlines()[start : end + 1])
full_result["start_line"] = start
full_result["end_line"] = end
Expand All @@ -122,7 +125,7 @@ async def build_query_results(
meta[0]["path"]
if configs.use_absolute_path
else os.path.relpath(
meta[0]["path"], str(configs.project_root)
str(meta[0]["path"]), str(configs.project_root)
)
)

Expand Down
6 changes: 5 additions & 1 deletion src/vectorcode/subcommands/query/reranker/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,8 @@ async def compute_similarity(
assert self._raw_results.get("distances") is not None
assert self.configs.query, "Expecting query messages in self.configs"
idx = self.configs.query.index(query_message)
return list(-i for i in self._raw_results.get("distances")[idx])
dist = self._raw_results.get("distances")
if dist is None: # pragma: nocover
raise ValueError("QueryResult should contain distances!")
else:
return list(-i for i in dist[idx])
Loading