From a1ea70b61e45b35702bda8bfcd7be1bb4451bab6 Mon Sep 17 00:00:00 2001 From: Jufralice Date: Wed, 14 May 2025 17:25:19 +0200 Subject: [PATCH 1/8] feat(treesitter): Handle lexers from pygments.lexers.templates modules (lexers with '+' in their language names) when looking up parsers When looking up a tree-sitter parser for a language name containing '+', such as 'javascript+genshitext', we also add the primary name (before '+') to the list of names to attempt. This ensures that we fall back to the primary language parser (e.g. javascript) if none of the 'hybrid' language-names exists in 'tree-sitter-language-pack' --- src/vectorcode/chunking.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 906c68fb..00677b60 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -309,6 +309,11 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]: break except LookupError: # pragma: nocover pass + if '+' in name: + primary_name = name.split('+')[0] + if primary_name not in lang_names: + lang_names.append(primary_name) + logger.debug("Added primary lang_name: %s to the list of lang_names to test", primary_name) if parser is None: logger.debug( From 699d36050b39bded3e59d3bdaaebf182b4c2663f Mon Sep 17 00:00:00 2001 From: Jufralice Date: Wed, 14 May 2025 22:58:23 +0200 Subject: [PATCH 2/8] Add tests to cover my fix for the 'https://github.com/Davidyz/VectorCode/issues/139' issue => 'tree_sitter_language_pack won't find a parser for 'hybrid' lexer language names (like 'javascript+genshitext') falling back to naive chunking method instead of chunking with the 'javascript' treesitter parser' --- tests/test_chunking.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_chunking.py b/tests/test_chunking.py index d7d5bda2..7d6a2bad 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -223,6 +223,50 @@ def bar(): assert chunks == 
['def 测试():\n return "foo"', 'def bar():\n return "bar"'] os.remove(test_file) +def test_treesitter_chunker_javascript(): + """Test TreeSitterChunker with a sample javascript file using tempfile.""" + chunker = TreeSitterChunker(Config(chunk_size=60)) + + test_content = r""" +function foo() { + return "foo"; +} + +function bar() { + return "bar"; +} + """ + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + chunks = list(str(i) for i in chunker.chunk(test_file)) + assert chunks == ['function foo() {\n return "foo";\n}', 'function bar() {\n return "bar";\n}'] + os.remove(test_file) + +def test_treesitter_chunker_javascript_genshi(): + """Test TreeSitterChunker with a sample javascript + genshi file using tempfile.""" + chunker = TreeSitterChunker(Config(chunk_size=60)) + + test_content = r""" +function foo() { + return `foo with ${genshi}`; +} + +function bar() { + return "bar"; +} + """ + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + chunks = list(str(i) for i in chunker.chunk(test_file)) + assert chunks == ['function foo() {\n return `foo with ${genshi}`;\n}', 'function bar() {\n return "bar";\n}'] + os.remove(test_file) + def test_treesitter_chunker_filter(): chunker = TreeSitterChunker( From 8f0d4a3171c471833409beb40a6950742f7a9e4a Mon Sep 17 00:00:00 2001 From: Jufralice Date: Fri, 16 May 2025 13:19:58 +0200 Subject: [PATCH 3/8] feat(chunking): Add filetype_map config option Introduce `filetype_map` configuration option to allow users to explicitly map file extensions (via regex patterns) to specific tree-sitter languages. This configuration is checked before attempting automatic language detection based on Pygments lexers, providing a way to handle files with unusual extensions or mixed languages where automatic detection might fail or pick the wrong parser. 
Also updates Pygments lexer lookup to use `get_lexer_for_filename`. --- docs/cli.md | 13 +++++++ src/vectorcode/chunking.py | 78 ++++++++++++++++++++++++++----------- src/vectorcode/cli_utils.py | 4 ++ tests/test_chunking.py | 7 ++-- 4 files changed, 77 insertions(+), 25 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 3609c811..bea7012f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -324,6 +324,19 @@ The JSON configuration file may hold the following values: "hnsw:construction_ef": 100 } ``` +- `filetype_map`: `dict[str, list[str]]`, a dictionary where keys are + [language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages) + and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html) + that will match file extensions. This allows overriding automatic language + detection and specifying a treesitter parser for certain file types for which the language parser cannot be + correctly identified (e.g., `.phtml` files containing both php and html). 
+ Example configuration: + ```json5 + "filetype_map": { + "php": ["^phtml$"] + } + ``` + - `chunk_filters`: `dict[str, list[str]]`, a dictionary where the keys are [language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages) and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 00677b60..643c2658 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -8,7 +8,7 @@ from typing import Generator, Optional from pygments.lexer import Lexer -from pygments.lexers import guess_lexer_for_filename +from pygments.lexers import get_lexer_for_filename from pygments.util import ClassNotFound from tree_sitter import Node, Point from tree_sitter_language_pack import get_parser @@ -240,7 +240,7 @@ def __chunk_node( @cache def __guess_type(self, path: str, content: str) -> Optional[Lexer]: try: - return guess_lexer_for_filename(path, content) + return get_lexer_for_filename(path, content) except ClassNotFound: return None @@ -279,6 +279,43 @@ def __load_file_lines(self, path: str) -> list[str]: lines = fin.readlines() return lines + + def __get_parser_from_config(self, file_path: str): + """ + Get parser based on filetype_map config. + """ + filetype_map = self.config.filetype_map + if not filetype_map: + logger.debug("filetype_map is empty in config.") + return None + + filename = os.path.basename(file_path) + extension = os.path.splitext(file_path)[1] + if extension.startswith('.'): + extension = extension[1:] + logger.debug(f"Checking filetype map for extension '{extension}' in {filename}") + for _language, patterns in filetype_map.items(): + language = _language.lower() + for pattern in patterns: + try: + if re.search(pattern, extension): + logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. 
Attempting to load parser.") + parser = get_parser(language) + if parser is None: + raise LookupError(f"Parser not found for language '{language}'. Please check your filetype_map config.") + logger.debug(f"Found parser for language '{language}' from config.") + return parser + except re.error as e: + logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}") + except LookupError: + logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.") + raise ValueError(f"Configured parser for language '{language}' not found.") from None + except Exception as e: + logger.error(f"An unexpected error occurred while processing filetype_map for language '{language}' and pattern '{pattern}': {e}") + + logger.debug(f"No matching filetype map entry found for {filename}.") + return None + def chunk(self, data: str) -> Generator[Chunk, None, None]: """ data: path to the file @@ -294,26 +331,23 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]: return parser = None language = None - lexer = self.__guess_type(data, content) - if lexer is not None: - lang_names = [lexer.name] - lang_names.extend(lexer.aliases) - for name in lang_names: - try: - parser = get_parser(name.lower()) - if parser is not None: - language = name.lower() - logger.debug( - "Detected %s filetype for treesitter chunking.", language - ) - break - except LookupError: # pragma: nocover - pass - if '+' in name: - primary_name = name.split('+')[0] - if primary_name not in lang_names: - lang_names.append(primary_name) - logger.debug("Added primary lang_name: %s to the list of lang_names to test", primary_name) + parser = self.__get_parser_from_config(data) + if parser is None: + lexer = self.__guess_type(data, content) + if lexer is not None: + lang_names = [lexer.name] + lang_names.extend(lexer.aliases) + for name in lang_names: + try: + parser = get_parser(name.lower()) + if parser is not None: + 
language = name.lower() + logger.debug( + "Detected %s filetype for treesitter chunking.", language + ) + break + except LookupError: # pragma: nocover + pass if parser is None: logger.debug( diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index d33f3752..36db2703 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -94,6 +94,7 @@ class Config: ) hnsw: dict[str, str | int] = field(default_factory=dict) chunk_filters: dict[str, list[str]] = field(default_factory=dict) + filetype_map: dict[str, list[str]] = field(default_factory=dict) encoding: str = "utf8" hooks: bool = False @@ -156,6 +157,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config": "chunk_filters": config_dict.get( "chunk_filters", default_config.chunk_filters ), + "filetype_map": config_dict.get( + "filetype_map", default_config.filetype_map + ), "encoding": config_dict.get("encoding", default_config.encoding), } ) diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 7d6a2bad..281b9d24 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -246,8 +246,9 @@ def test_treesitter_chunker_javascript(): os.remove(test_file) def test_treesitter_chunker_javascript_genshi(): - """Test TreeSitterChunker with a sample javascript + genshi file using tempfile.""" - chunker = TreeSitterChunker(Config(chunk_size=60)) + """Test TreeSitterChunker with a sample javascript + genshi file using tempfile. 
(bypassing lexers via the filetype_map config param)""" + chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["^kid$"]})) + # chunker = TreeSitterChunker(Config(chunk_size=60)) test_content = r""" function foo() { @@ -259,7 +260,7 @@ def test_treesitter_chunker_javascript_genshi(): } """ - with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file: + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file: tmp_file.write(test_content) test_file = tmp_file.name From fef41646620c1b9b0f0d619bbe66f3ce7a271efd Mon Sep 17 00:00:00 2001 From: Jufralice Date: Fri, 16 May 2025 15:55:54 +0200 Subject: [PATCH 4/8] test(chunking): Add more tests for the chunker when using the filetype_map parameter. covering the error handling also remove the check of parser being None since get_parser will never return None --- src/vectorcode/chunking.py | 5 +--- tests/test_chunking.py | 48 +++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 643c2658..903e227f 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -301,17 +301,14 @@ def __get_parser_from_config(self, file_path: str): if re.search(pattern, extension): logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.") parser = get_parser(language) - if parser is None: - raise LookupError(f"Parser not found for language '{language}'. 
Please check your filetype_map config.") logger.debug(f"Found parser for language '{language}' from config.") return parser except re.error as e: logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}") + raise ValueError(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}") except LookupError: logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.") raise ValueError(f"Configured parser for language '{language}' not found.") from None - except Exception as e: - logger.error(f"An unexpected error occurred while processing filetype_map for language '{language}' and pattern '{pattern}': {e}") logger.debug(f"No matching filetype map entry found for {filename}.") return None diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 281b9d24..7946f14f 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -248,7 +248,6 @@ def test_treesitter_chunker_javascript(): def test_treesitter_chunker_javascript_genshi(): """Test TreeSitterChunker with a sample javascript + genshi file using tempfile. 
(bypassing lexers via the filetype_map config param)""" chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["^kid$"]})) - # chunker = TreeSitterChunker(Config(chunk_size=60)) test_content = r""" function foo() { @@ -268,6 +267,53 @@ def test_treesitter_chunker_javascript_genshi(): assert chunks == ['function foo() {\n return `foo with ${genshi}`;\n}', 'function bar() {\n return "bar";\n}'] os.remove(test_file) +def test_treesitter_chunker_parser_from_config_no_parser_found_error(): + """Test TreeSitterChunker filetype_map: should raise an error if no parser is found""" + chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"unknown_parser": ["^kid$"]})) + + test_content = r""" +function foo() { + return `foo with ${genshi}`; +} + +function bar() { + return "bar"; +} + """ + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + + with pytest.raises(ValueError): + chunks = list(str(i) for i in chunker.chunk(test_file)) + os.remove(test_file) + +def test_treesitter_chunker_parser_from_config_regex_error(): + """Test TreeSitterChunker filetype_map: should raise an error if a regex is invalid""" + chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["\\"]})) + + test_content = r""" +function foo() { + return `foo with ${genshi}`; +} + +function bar() { + return "bar"; +} + """ + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + + with pytest.raises(ValueError): + chunks = list(str(i) for i in chunker.chunk(test_file)) + os.remove(test_file) + + def test_treesitter_chunker_filter(): chunker = TreeSitterChunker( From 203b5b41196e361d41414127fdd2538ffc2555f1 Mon Sep 17 00:00:00 2001 From: Jufralice Date: Fri, 16 May 2025 16:02:25 +0200 Subject: [PATCH 5/8] test(chunking): forgot to assert the emptiness of chunks --- 
tests/test_chunking.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 7946f14f..80459cf6 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -288,6 +288,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error(): with pytest.raises(ValueError): chunks = list(str(i) for i in chunker.chunk(test_file)) + assert chunks == [] os.remove(test_file) def test_treesitter_chunker_parser_from_config_regex_error(): @@ -311,6 +312,7 @@ def test_treesitter_chunker_parser_from_config_regex_error(): with pytest.raises(ValueError): chunks = list(str(i) for i in chunker.chunk(test_file)) + assert chunks == [] os.remove(test_file) From 4c18e823854f21d604c9b9ae70ba4bd7fe1e2b61 Mon Sep 17 00:00:00 2001 From: Jufralice Date: Fri, 16 May 2025 16:35:54 +0200 Subject: [PATCH 6/8] (test) chunker. add yet another test to get full coverage --- tests/test_chunking.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 80459cf6..bd4e327c 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -315,6 +315,28 @@ def test_treesitter_chunker_parser_from_config_regex_error(): assert chunks == [] os.remove(test_file) +def test_treesitter_chunker_parser_from_config_no_language_match(): + """Test TreeSitterChunker filetype_map: should continue with the lexer parser checks if no language matches a regex""" + chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"php": ["^jsx$"]})) + + test_content = r""" +function foo() { + return "foo"; +} + +function bar() { + return "bar"; +} + """ + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + chunks = list(str(i) for i in chunker.chunk(test_file)) + assert chunks == ['function foo() {\n return "foo";\n}', 'function bar() {\n return "bar";\n}'] + os.remove(test_file) + def 
test_treesitter_chunker_filter(): From 87f586639040db0c12db201ca6f3e4cbd5766c96 Mon Sep 17 00:00:00 2001 From: Jufralice Date: Tue, 20 May 2025 16:07:21 +0200 Subject: [PATCH 7/8] Re-raise original exceptions (`re.error`, `LookupError`) with added notes instead of raising a new ValueError. This preserves the original exception type and traceback for better debugging. Update tests to expect `Exception` instead of `ValueError` due to the change in re-raised exception types. --- src/vectorcode/chunking.py | 10 +++++----- tests/test_chunking.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index 903e227f..0a62b3c8 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -304,11 +304,11 @@ def __get_parser_from_config(self, file_path: str): logger.debug(f"Found parser for language '{language}' from config.") return parser except re.error as e: - logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}") - raise ValueError(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}") - except LookupError: - logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.") - raise ValueError(f"Configured parser for language '{language}' not found.") from None + e.add_note(f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map") + raise + except LookupError as e: + e.add_note(f"\nTreeSitter Parser for language '{language}' not found. 
Please check your filetype_map config.") + raise logger.debug(f"No matching filetype map entry found for {filename}.") return None diff --git a/tests/test_chunking.py b/tests/test_chunking.py index bd4e327c..75d02935 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -286,7 +286,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error(): test_file = tmp_file.name - with pytest.raises(ValueError): + with pytest.raises(Exception): chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == [] os.remove(test_file) @@ -310,7 +310,7 @@ def test_treesitter_chunker_parser_from_config_regex_error(): test_file = tmp_file.name - with pytest.raises(ValueError): + with pytest.raises(Exception): chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == [] os.remove(test_file) From e15967d8bd1393431f50cac33e7007475fe0b18d Mon Sep 17 00:00:00 2001 From: Jufralice Date: Tue, 20 May 2025 16:17:17 +0200 Subject: [PATCH 8/8] test(chunking): Correct test to expect LookupError instead of generic Exception --- tests/test_chunking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 75d02935..5154696a 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -286,7 +286,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error(): test_file = tmp_file.name - with pytest.raises(Exception): + with pytest.raises(LookupError): chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == [] os.remove(test_file)