Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/vectorcode/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def __init__(self, config: Optional[Config] = None):
if config is None:
config = Config()
super().__init__(config)
self._fallback_chunker = StringChunker(config)

def __chunk_node(
self, node: Node, text_bytes: bytes
Expand All @@ -153,6 +154,12 @@ def __chunk_node(
prev_node = None
current_start = None

logger.debug("nbr children: %s", len(node.children))
# If the node has no children, fall back to the string chunker.
if len(node.children) == 0 and node.text:
logger.debug("No children, falling back to string chunker")
yield from self._fallback_chunker.chunk(node.text.decode())

for child in node.children:
child_bytes = text_bytes[child.start_byte : child.end_byte]
child_text = child_bytes.decode()
Expand Down Expand Up @@ -307,7 +314,7 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
logger.debug(
"Unable to pick a suitable parser. Fall back to naive chunking"
)
yield from StringChunker(self.config).chunk(content)
yield from self._fallback_chunker.chunk(content)
else:
pattern_str = self.__build_pattern(language=language)
content_bytes = content.encode()
Expand Down
19 changes: 19 additions & 0 deletions tests/test_chunking.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import tempfile
from unittest.mock import MagicMock

import pytest
from tree_sitter import Point
Expand Down Expand Up @@ -159,6 +160,24 @@ def bar():
os.remove(test_file)


def test_treesitter_chunker_fallback_on_long_node():
    """Check that TreeSitterChunker hands an oversized node to its fallback chunker.

    The fixture is a tiny Python function whose string literal is longer than
    the configured ``chunk_size`` of 15. The fallback chunker's ``chunk``
    method is replaced with a ``MagicMock`` so the test can assert it was
    invoked exactly once while chunking the file.
    """
    test_content = r"""
def foo():
    return "a very very very very very long string"
"""
    config = Config(chunk_size=15)
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".py"
    ) as temp_py_file:
        temp_py_file.write(test_content)
    # Leaving the ``with`` block closes the file, so the content is flushed
    # to disk before the chunker reads it back by path.
    try:
        ts_chunker = TreeSitterChunker(config)
        ts_chunker._fallback_chunker.chunk = MagicMock()
        # Drain the generator; chunking is lazy and does nothing until iterated.
        list(ts_chunker.chunk(temp_py_file.name))
        ts_chunker._fallback_chunker.chunk.assert_called_once()
    finally:
        # ``delete=False`` means the file outlives the context manager, so it
        # has to be removed explicitly, even when the assertion fails.
        os.remove(temp_py_file.name)


def test_treesitter_chunker_python_encoding():
"""Test TreeSitterChunker with a sample file using tempfile."""
chunker = TreeSitterChunker(Config(chunk_size=30, encoding="gbk"))
Expand Down