From a1ea70b61e45b35702bda8bfcd7be1bb4451bab6 Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Wed, 14 May 2025 17:25:19 +0200
Subject: [PATCH 1/8] feat(treesitter): Handle lexers from
 pygments.lexers.templates modules (lexers with '+' in their language names)
 when looking up parsers

When looking up a tree-sitter parser for a language name containing '+', such as 'javascript+genshitext',
we also add the primary name (the part before '+') to the list of names to attempt. This ensures that we fall back to the primary language parser (e.g. javascript) if none of the 'hybrid' language names exists in 'tree-sitter-language-pack'.
---
 src/vectorcode/chunking.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
index 906c68fb..00677b60 100644
--- a/src/vectorcode/chunking.py
+++ b/src/vectorcode/chunking.py
@@ -309,6 +309,11 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
                         break
                 except LookupError:  # pragma: nocover
                     pass
+                if '+' in name:
+                    primary_name = name.split('+')[0]
+                    if primary_name not in lang_names:
+                        lang_names.append(primary_name)
+                        logger.debug("Added primary lang_name: %s to the list of lang_names to test", primary_name)
 
         if parser is None:
             logger.debug(

From 699d36050b39bded3e59d3bdaaebf182b4c2663f Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Wed, 14 May 2025 22:58:23 +0200
Subject: [PATCH 2/8] Add tests to cover my fix for the
 'https://github.com/Davidyz/VectorCode/issues/139' issue =>
 'tree_sitter_language_pack won't find a parser for 'hybrid' lexer language
 names (like 'javascript+genshitext') falling back to naive chunking method
 instead of chunking with the 'javascript' treesitter parser'

---
 tests/test_chunking.py | 44 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index d7d5bda2..7d6a2bad 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -223,6 +223,50 @@ def bar():
     assert chunks == ['def 测试():\n    return "foo"', 'def bar():\n    return "bar"']
     os.remove(test_file)
 
+def test_treesitter_chunker_javascript():
+    """Test TreeSitterChunker with a sample javascript file using tempfile."""
+    chunker = TreeSitterChunker(Config(chunk_size=60))
+
+    test_content = r"""
+function foo() {
+    return "foo";
+}
+
+function bar() {
+    return "bar";
+}
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+    chunks = list(str(i) for i in chunker.chunk(test_file))
+    assert chunks == ['function foo() {\n    return "foo";\n}', 'function bar() {\n    return "bar";\n}']
+    os.remove(test_file)
+
+def test_treesitter_chunker_javascript_genshi():
+    """Test TreeSitterChunker with a sample javascript + genshi file using tempfile."""
+    chunker = TreeSitterChunker(Config(chunk_size=60))
+
+    test_content = r"""
+function foo() {
+    return `foo with ${genshi}`;
+}
+
+function bar() {
+    return "bar";
+}
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+    chunks = list(str(i) for i in chunker.chunk(test_file))
+    assert chunks == ['function foo() {\n    return `foo with ${genshi}`;\n}', 'function bar() {\n    return "bar";\n}']
+    os.remove(test_file)
+
 
 def test_treesitter_chunker_filter():
     chunker = TreeSitterChunker(

From 8f0d4a3171c471833409beb40a6950742f7a9e4a Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Fri, 16 May 2025 13:19:58 +0200
Subject: [PATCH 3/8] feat(chunking): Add filetype_map config option

Introduce a `filetype_map` configuration option that allows users to
explicitly map file extensions (via regex patterns) to specific
tree-sitter languages.

This configuration is checked before attempting automatic language
detection based on Pygments lexers, providing a way to handle files
with unusual extensions or mixed languages where automatic detection
might fail or pick the wrong parser.

Also updates Pygments lexer lookup to use `get_lexer_for_filename`.
---
 docs/cli.md                 | 13 +++++++
 src/vectorcode/chunking.py  | 78 ++++++++++++++++++++++++++-----------
 src/vectorcode/cli_utils.py |  4 ++
 tests/test_chunking.py      |  7 ++--
 4 files changed, 77 insertions(+), 25 deletions(-)

diff --git a/docs/cli.md b/docs/cli.md
index 3609c811..bea7012f 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -324,6 +324,19 @@ The JSON configuration file may hold the following values:
     "hnsw:construction_ef": 100
   }
   ```
+- `filetype_map`: `dict[str, list[str]]`, a dictionary where keys are
+    [language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages)
+    and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html)
+    that will match file extensions. This allows overriding automatic language
+    detection and specifying a treesitter parser for certain file types for which the language parser cannot be
+    correctly identified (e.g., `.phtml` files containing both php and html).
+    Example configuration:
+    ```json5
+    "filetype_map": {
+      "php": ["^phtml$"]
+    }
+    ```
+
 - `chunk_filters`: `dict[str, list[str]]`, a dictionary where the keys are
   [language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages)
   and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html) 
diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
index 00677b60..643c2658 100644
--- a/src/vectorcode/chunking.py
+++ b/src/vectorcode/chunking.py
@@ -8,7 +8,7 @@
 from typing import Generator, Optional
 
 from pygments.lexer import Lexer
-from pygments.lexers import guess_lexer_for_filename
+from pygments.lexers import get_lexer_for_filename
 from pygments.util import ClassNotFound
 from tree_sitter import Node, Point
 from tree_sitter_language_pack import get_parser
@@ -240,7 +240,7 @@ def __chunk_node(
     @cache
     def __guess_type(self, path: str, content: str) -> Optional[Lexer]:
         try:
-            return guess_lexer_for_filename(path, content)
+            return get_lexer_for_filename(path, content)
 
         except ClassNotFound:
             return None
@@ -279,6 +279,43 @@ def __load_file_lines(self, path: str) -> list[str]:
             lines = fin.readlines()
         return lines
 
+
+    def __get_parser_from_config(self, file_path: str):
+        """
+        Get parser based on filetype_map config.
+        """
+        filetype_map = self.config.filetype_map
+        if not filetype_map:
+            logger.debug("filetype_map is empty in config.")
+            return None
+
+        filename = os.path.basename(file_path)
+        extension = os.path.splitext(file_path)[1]
+        if extension.startswith('.'):
+            extension = extension[1:]
+        logger.debug(f"Checking filetype map for extension '{extension}' in {filename}")
+        for _language, patterns in filetype_map.items():
+            language =  _language.lower()
+            for pattern in patterns:
+                try:
+                    if re.search(pattern, extension):
+                        logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.")
+                        parser = get_parser(language)
+                        if parser is None:
+                            raise LookupError(f"Parser not found for language '{language}'. Please check your filetype_map config.")
+                        logger.debug(f"Found parser for language '{language}' from config.")
+                        return parser
+                except re.error as e:
+                    logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}")
+                except LookupError:
+                    logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.")
+                    raise ValueError(f"Configured parser for language '{language}' not found.") from None
+                except Exception as e:
+                    logger.error(f"An unexpected error occurred while processing filetype_map for language '{language}' and pattern '{pattern}': {e}")
+
+        logger.debug(f"No matching filetype map entry found for {filename}.")
+        return None
+
     def chunk(self, data: str) -> Generator[Chunk, None, None]:
         """
         data: path to the file
@@ -294,26 +331,23 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
             return
         parser = None
         language = None
-        lexer = self.__guess_type(data, content)
-        if lexer is not None:
-            lang_names = [lexer.name]
-            lang_names.extend(lexer.aliases)
-            for name in lang_names:
-                try:
-                    parser = get_parser(name.lower())
-                    if parser is not None:
-                        language = name.lower()
-                        logger.debug(
-                            "Detected %s filetype for treesitter chunking.", language
-                        )
-                        break
-                except LookupError:  # pragma: nocover
-                    pass
-                if '+' in name:
-                    primary_name = name.split('+')[0]
-                    if primary_name not in lang_names:
-                        lang_names.append(primary_name)
-                        logger.debug("Added primary lang_name: %s to the list of lang_names to test", primary_name)
+        parser = self.__get_parser_from_config(data)
+        if parser is None:
+            lexer = self.__guess_type(data, content)
+            if lexer is not None:
+                lang_names = [lexer.name]
+                lang_names.extend(lexer.aliases)
+                for name in lang_names:
+                    try:
+                        parser = get_parser(name.lower())
+                        if parser is not None:
+                            language = name.lower()
+                            logger.debug(
+                                "Detected %s filetype for treesitter chunking.", language
+                            )
+                            break
+                    except LookupError:  # pragma: nocover
+                        pass
 
         if parser is None:
             logger.debug(
diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py
index d33f3752..36db2703 100644
--- a/src/vectorcode/cli_utils.py
+++ b/src/vectorcode/cli_utils.py
@@ -94,6 +94,7 @@ class Config:
     )
     hnsw: dict[str, str | int] = field(default_factory=dict)
     chunk_filters: dict[str, list[str]] = field(default_factory=dict)
+    filetype_map: dict[str, list[str]] = field(default_factory=dict)
     encoding: str = "utf8"
     hooks: bool = False
 
@@ -156,6 +157,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
                 "chunk_filters": config_dict.get(
                     "chunk_filters", default_config.chunk_filters
                 ),
+                "filetype_map": config_dict.get(
+                    "filetype_map", default_config.filetype_map
+                ),
                 "encoding": config_dict.get("encoding", default_config.encoding),
             }
         )
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 7d6a2bad..281b9d24 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -246,8 +246,9 @@ def test_treesitter_chunker_javascript():
     os.remove(test_file)
 
 def test_treesitter_chunker_javascript_genshi():
-    """Test TreeSitterChunker with a sample javascript + genshi file using tempfile."""
-    chunker = TreeSitterChunker(Config(chunk_size=60))
+    """Test TreeSitterChunker with a sample javascript + genshi file using tempfile. (bypassing lexers via the filetype_map config param)"""
+    chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["^kid$"]}))
+    # chunker = TreeSitterChunker(Config(chunk_size=60))
 
     test_content = r"""
 function foo() {
@@ -259,7 +260,7 @@ def test_treesitter_chunker_javascript_genshi():
 }
     """
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file:
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file:
         tmp_file.write(test_content)
         test_file = tmp_file.name
 

From fef41646620c1b9b0f0d619bbe66f3ce7a271efd Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Fri, 16 May 2025 15:55:54 +0200
Subject: [PATCH 4/8] test(chunking): Add more tests for the chunker when using
 the filetype_map parameter, covering the error handling. Also remove the check
 of parser being None, since get_parser will never return None

---
 src/vectorcode/chunking.py |  5 +---
 tests/test_chunking.py     | 48 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
index 643c2658..903e227f 100644
--- a/src/vectorcode/chunking.py
+++ b/src/vectorcode/chunking.py
@@ -301,17 +301,14 @@ def __get_parser_from_config(self, file_path: str):
                     if re.search(pattern, extension):
                         logger.debug(f"'{filename}' extension matches pattern '{pattern}' for language '{language}'. Attempting to load parser.")
                         parser = get_parser(language)
-                        if parser is None:
-                            raise LookupError(f"Parser not found for language '{language}'. Please check your filetype_map config.")
                         logger.debug(f"Found parser for language '{language}' from config.")
                         return parser
                 except re.error as e:
                     logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}")
+                    raise ValueError(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}")
                 except LookupError:
                     logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.")
                     raise ValueError(f"Configured parser for language '{language}' not found.") from None
-                except Exception as e:
-                    logger.error(f"An unexpected error occurred while processing filetype_map for language '{language}' and pattern '{pattern}': {e}")
 
         logger.debug(f"No matching filetype map entry found for {filename}.")
         return None
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 281b9d24..7946f14f 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -248,7 +248,6 @@ def test_treesitter_chunker_javascript():
 def test_treesitter_chunker_javascript_genshi():
     """Test TreeSitterChunker with a sample javascript + genshi file using tempfile. (bypassing lexers via the filetype_map config param)"""
     chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["^kid$"]}))
-    # chunker = TreeSitterChunker(Config(chunk_size=60))
 
     test_content = r"""
 function foo() {
@@ -268,6 +267,53 @@ def test_treesitter_chunker_javascript_genshi():
     assert chunks == ['function foo() {\n    return `foo with ${genshi}`;\n}', 'function bar() {\n    return "bar";\n}']
     os.remove(test_file)
 
+def test_treesitter_chunker_parser_from_config_no_parser_found_error():
+    """Test TreeSitterChunker filetype_map: should raise an error if no parser is found"""
+    chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"unknown_parser": ["^kid$"]}))
+
+    test_content = r"""
+function foo() {
+    return `foo with ${genshi}`;
+}
+
+function bar() {
+    return "bar";
+}
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+
+    with pytest.raises(ValueError):
+        chunks = list(str(i) for i in chunker.chunk(test_file))
+    os.remove(test_file)
+
+def test_treesitter_chunker_parser_from_config_regex_error():
+    """Test TreeSitterChunker filetype_map: should raise an error if a regex is invalid"""
+    chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"javascript": ["\\"]}))
+
+    test_content = r"""
+function foo() {
+    return `foo with ${genshi}`;
+}
+
+function bar() {
+    return "bar";
+}
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+
+    with pytest.raises(ValueError):
+        chunks = list(str(i) for i in chunker.chunk(test_file))
+    os.remove(test_file)
+
+
 
 def test_treesitter_chunker_filter():
     chunker = TreeSitterChunker(

From 203b5b41196e361d41414127fdd2538ffc2555f1 Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Fri, 16 May 2025 16:02:25 +0200
Subject: [PATCH 5/8] test(chunking): forgot to assert the emptiness of chunks

---
 tests/test_chunking.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 7946f14f..80459cf6 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -288,6 +288,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error():
 
     with pytest.raises(ValueError):
         chunks = list(str(i) for i in chunker.chunk(test_file))
+        assert chunks == []
     os.remove(test_file)
 
 def test_treesitter_chunker_parser_from_config_regex_error():
@@ -311,6 +312,7 @@ def test_treesitter_chunker_parser_from_config_regex_error():
 
     with pytest.raises(ValueError):
         chunks = list(str(i) for i in chunker.chunk(test_file))
+        assert chunks == []
     os.remove(test_file)
 
 

From 4c18e823854f21d604c9b9ae70ba4bd7fe1e2b61 Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Fri, 16 May 2025 16:35:54 +0200
Subject: [PATCH 6/8] test(chunking): Add yet another test to get full coverage

---
 tests/test_chunking.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 80459cf6..bd4e327c 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -315,6 +315,28 @@ def test_treesitter_chunker_parser_from_config_regex_error():
         assert chunks == []
     os.remove(test_file)
 
+def test_treesitter_chunker_parser_from_config_no_language_match():
+    """Test TreeSitterChunker filetype_map: should continue with the lexer parser checks if no language matches a regex"""
+    chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"php": ["^jsx$"]}))
+
+    test_content = r"""
+function foo() {
+    return "foo";
+}
+
+function bar() {
+    return "bar";
+}
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+    chunks = list(str(i) for i in chunker.chunk(test_file))
+    assert chunks == ['function foo() {\n    return "foo";\n}', 'function bar() {\n    return "bar";\n}']
+    os.remove(test_file)
+
 
 
 def test_treesitter_chunker_filter():

From 87f586639040db0c12db201ca6f3e4cbd5766c96 Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Tue, 20 May 2025 16:07:21 +0200
Subject: [PATCH 7/8] Re-raise original exceptions (`re.error`, `LookupError`)
 with added notes instead of raising a new ValueError. This preserves the
 original exception type and traceback for better debugging.

Update tests to expect `Exception` instead of `ValueError` due to the change in re-raised exception types.
---
 src/vectorcode/chunking.py | 10 +++++-----
 tests/test_chunking.py     |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
index 903e227f..0a62b3c8 100644
--- a/src/vectorcode/chunking.py
+++ b/src/vectorcode/chunking.py
@@ -304,11 +304,11 @@ def __get_parser_from_config(self, file_path: str):
                         logger.debug(f"Found parser for language '{language}' from config.")
                         return parser
                 except re.error as e:
-                    logger.error(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}")
-                    raise ValueError(f"Invalid regex pattern '{pattern}' for language '{language}' in filetype_map: {e}")
-                except LookupError:
-                    logger.error(f"Configured parser for language '{language}' not found or failed to load. Please check your filetype_map config.")
-                    raise ValueError(f"Configured parser for language '{language}' not found.") from None
+                    e.add_note(f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map")
+                    raise
+                except LookupError as e:
+                    e.add_note(f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config.")
+                    raise
 
         logger.debug(f"No matching filetype map entry found for {filename}.")
         return None
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index bd4e327c..75d02935 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -286,7 +286,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error():
         test_file = tmp_file.name
 
 
-    with pytest.raises(ValueError):
+    with pytest.raises(Exception):
         chunks = list(str(i) for i in chunker.chunk(test_file))
         assert chunks == []
     os.remove(test_file)
@@ -310,7 +310,7 @@ def test_treesitter_chunker_parser_from_config_regex_error():
         test_file = tmp_file.name
 
 
-    with pytest.raises(ValueError):
+    with pytest.raises(Exception):
         chunks = list(str(i) for i in chunker.chunk(test_file))
         assert chunks == []
     os.remove(test_file)

From e15967d8bd1393431f50cac33e7007475fe0b18d Mon Sep 17 00:00:00 2001
From: Jufralice <france.julien@gmail.com>
Date: Tue, 20 May 2025 16:17:17 +0200
Subject: [PATCH 8/8] test(chunking): Correct test to expect LookupError
 instead of generic Exception

---
 tests/test_chunking.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 75d02935..5154696a 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -286,7 +286,7 @@ def test_treesitter_chunker_parser_from_config_no_parser_found_error():
         test_file = tmp_file.name
 
 
-    with pytest.raises(Exception):
+    with pytest.raises(LookupError):
         chunks = list(str(i) for i in chunker.chunk(test_file))
         assert chunks == []
     os.remove(test_file)