diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt index 25589b47..9d675453 100644 --- a/doc/VectorCode-cli.txt +++ b/doc/VectorCode-cli.txt @@ -660,17 +660,21 @@ If you used `--include chunk path` parameters, the array will look like this: "chunk": "foo", "start_line": 1, "end_line": 1, + "chunk_id": "chunk_id_1" }, { "path": "path_to_another_file.py", "chunk": "bar", "start_line": 1, "end_line": 1, + "chunk_id": "chunk_id_2" } ] < -Keep in mind that both `start_line` and `end_line` are inclusive. +Keep in mind that both `start_line` and `end_line` are inclusive. The +`chunk_id` is a random string that can be used as a unique identifier to +distinguish between chunks. These are the same IDs used in the database. VECTORCODE VECTORISE diff --git a/docs/cli.md b/docs/cli.md index 393ee0a3..8d619b71 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -595,16 +595,20 @@ If you used `--include chunk path` parameters, the array will look like this: "chunk": "foo", "start_line": 1, "end_line": 1, + "chunk_id": "chunk_id_1" }, { "path": "path_to_another_file.py", "chunk": "bar", "start_line": 1, "end_line": 1, + "chunk_id": "chunk_id_2" } ] ``` -Keep in mind that both `start_line` and `end_line` are inclusive. +Keep in mind that both `start_line` and `end_line` are inclusive. The `chunk_id` +is a random string that can be used as a unique identifier to distinguish +between chunks. These are the same IDs used in the database. #### `vectorcode vectorise` The output is in JSON format. 
It contains a dictionary with the following fields: diff --git a/lua/vectorcode/integrations/codecompanion/common.lua b/lua/vectorcode/integrations/codecompanion/common.lua index 4bcc7f1b..987bfcb5 100644 --- a/lua/vectorcode/integrations/codecompanion/common.lua +++ b/lua/vectorcode/integrations/codecompanion/common.lua @@ -1,3 +1,5 @@ +---@module "codecompanion" + local job_runner local vc_config = require("vectorcode.config") local notify_opts = vc_config.notify_opts @@ -17,8 +19,10 @@ local default_ls_options = {} ---@type VectorCode.CodeCompanion.VectoriseToolOpts local default_vectorise_options = {} +local TOOL_RESULT_SOURCE = "VectorCodeToolResult" + return { - tool_result_source = "VectorCodeToolResult", + tool_result_source = TOOL_RESULT_SOURCE, ---@param t table|string ---@return string flatten_table_to_string = function(t) @@ -122,6 +126,7 @@ return { end return llm_message end, + ---@param use_lsp boolean ---@return VectorCode.JobRunner initialise_runner = function(use_lsp) diff --git a/lua/vectorcode/integrations/codecompanion/query_tool.lua b/lua/vectorcode/integrations/codecompanion/query_tool.lua index 03ac3ff8..a98b0624 100644 --- a/lua/vectorcode/integrations/codecompanion/query_tool.lua +++ b/lua/vectorcode/integrations/codecompanion/query_tool.lua @@ -9,6 +9,77 @@ local job_runner = nil ---@alias QueryToolArgs { project_root:string, count: integer, query: string[] } +---@alias chat_id integer +---@alias result_id string +---@type table<chat_id, table<result_id, boolean>> +local result_tracker = {} + +---@param results VectorCode.QueryResult[] +---@param chat CodeCompanion.Chat +---@return VectorCode.QueryResult[] +local filter_results = function(results, chat) + local existing_refs = chat.refs or {} + + existing_refs = vim + .iter(existing_refs) + :filter( + ---@param ref CodeCompanion.Chat.Ref + function(ref) + return ref.source == cc_common.tool_result_source or ref.path or ref.bufnr + end + ) + :map( + ---@param ref CodeCompanion.Chat.Ref + function(ref) + if ref.source ==
cc_common.tool_result_source then + return ref.id + elseif ref.path then + return ref.path + elseif ref.bufnr then + return vim.api.nvim_buf_get_name(ref.bufnr) + end + end + ) + :totable() + + ---@type VectorCode.QueryResult[] + local filtered_results = vim + .iter(results) + :filter( + ---@param res VectorCode.QueryResult + function(res) + -- return true if res should be kept + if res.chunk then + if res.chunk_id == nil then + -- no chunk_id, always include + return true + end + if + result_tracker[chat.id] ~= nil and result_tracker[chat.id][res.chunk_id] + then + return false + end + return not vim.tbl_contains(existing_refs, res.chunk_id) + else + if result_tracker[chat.id] ~= nil and result_tracker[chat.id][res.path] then + return false + end + return not vim.tbl_contains(existing_refs, res.path) + end + end + ) + :totable() + + for _, res in pairs(filtered_results) do + if result_tracker[chat.id] == nil then + result_tracker[chat.id] = {} + end + result_tracker[chat.id][res.chunk_id or res.path] = true + end + + return filtered_results +end + ---@param opts VectorCode.CodeCompanion.QueryToolOpts? ---@return CodeCompanion.Agent.Tool return check_cli_wrap(function(opts) @@ -148,6 +219,7 @@ You may include multiple keywords in the command. description = [[ Query messages used for the search. They should also contain relevant keywords. For example, you should include `parameter`, `arguments` and `return value` for the query `function`. +If a query returned empty or repeated results, you should avoid using these query keywords, unless the user instructed otherwise. 
]], }, count = { @@ -219,6 +291,9 @@ For example, you should include `parameter`, `arguments` and `return value` for if opts.max_num > 0 then max_result = math.min(opts.max_num or 1, max_result) end + if opts.no_duplicate then + stdout = filter_results(stdout, agent.chat) + end for i, file in pairs(stdout) do if i <= max_result then if i == 1 then @@ -240,14 +315,14 @@ For example, you should include `parameter`, `arguments` and `return value` for user_message ) if not opts.chunk_mode then - -- skip referencing because there will be multiple chunks with the same path (id). - -- TODO: figure out a way to deduplicate. - agent.chat.references:add({ + -- only add to reference if running in full document mode + local ref = { source = cc_common.tool_result_source, id = file.path, path = file.path, opts = { visible = false }, - }) + } + agent.chat.references:add(ref) end end end diff --git a/lua/vectorcode/types.lua b/lua/vectorcode/types.lua index 7421a343..779493bf 100644 --- a/lua/vectorcode/types.lua +++ b/lua/vectorcode/types.lua @@ -5,6 +5,7 @@ ---@field chunk string? ---@field start_line integer? ---@field end_line integer? +---@field chunk_id string? ---@class VectorCode.LsResult ---@field project-root string diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 4f4b507a..51c3a550 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -114,7 +114,10 @@ async def build_query_results( assert chunk_texts is not None, ( "QueryResult does not contain `documents`!" 
) - full_result: dict[str, str | int] = {"chunk": str(chunk_texts[0])} + full_result: dict[str, str | int] = { + "chunk": str(chunk_texts[0]), + "chunk_id": identifier, + } if meta[0].get("start") is not None and meta[0].get("end") is not None: path = str(meta[0].get("path")) with open(path) as fin: diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index a6b17689..4a54de9d 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -173,6 +173,7 @@ async def test_build_query_results_chunk_mode_success(mock_collection, mock_conf "chunk": expected_chunk_content, "start_line": start_line, "end_line": end_line, + "chunk_id": identifier, } assert results[0] == expected_full_result