From 0a39773e5c2a0f25ee3ba7e6bf8e82d1274e2af4 Mon Sep 17 00:00:00 2001
From: nicholaspsmith
Date: Tue, 27 Jan 2026 11:02:31 -0500
Subject: [PATCH 1/2] perf: increase Ollama parallelism for better GPU utilization

- Increase batch size from 10 to 50 texts per request
- Increase concurrency from 4 to 20 parallel requests
- Processes 1000 texts at a time instead of 40

Co-Authored-By: Claude
---
 src/__tests__/embeddings/ollama.test.ts | 19 ++++++++++---------
 src/embeddings/ollama.ts                | 10 +++++-----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/__tests__/embeddings/ollama.test.ts b/src/__tests__/embeddings/ollama.test.ts
index a06093c..61d254e 100644
--- a/src/__tests__/embeddings/ollama.test.ts
+++ b/src/__tests__/embeddings/ollama.test.ts
@@ -250,7 +250,7 @@ describe('OllamaBackend', () => {
     expect(result).toEqual([[0.1], [0.2], [0.3], [0.4], [0.5]]);
   });
 
-  it('should use default batch size of 10', async () => {
+  it('should use default batch size of 50', async () => {
     const mockFetch = vi.fn().mockImplementation(async (_url, options) => {
       const body = JSON.parse(options.body as string);
       const inputLen = body.input.length;
@@ -260,14 +260,14 @@
     });
     vi.stubGlobal('fetch', mockFetch);
 
-    // Create backend without custom batchSize (should use default 10)
+    // Create backend without custom batchSize (should use default 50)
     const backend = new OllamaBackend({ backend: 'ollama' });
-    const texts = Array.from({ length: 25 }, (_, i) => `text${i}`);
+    const texts = Array.from({ length: 120 }, (_, i) => `text${i}`);
     const result = await backend.embedBatch(texts);
 
-    // Should make 3 batch requests (10+10+5)
+    // Should make 3 batch requests (50+50+20)
     expect(mockFetch).toHaveBeenCalledTimes(3);
-    expect(result).toHaveLength(25);
+    expect(result).toHaveLength(120);
   });
 
   it('should process batches in parallel based on concurrency', async () => {
@@ -292,16 +292,17 @@
     expect(result).toHaveLength(4);
   });
 
-  it('should use default concurrency of 4', async () => {
+  it('should use default concurrency of 20', async () => {
     const mockFetch = vi.fn().mockResolvedValue(createOllamaBatchEmbeddingResponse([[0.1]]));
     vi.stubGlobal('fetch', mockFetch);
 
     // Create backend with small batchSize to trigger multiple batches
     const backend = new OllamaBackend({ backend: 'ollama', batchSize: 1 });
-    await backend.embedBatch(['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8']);
+    const texts = Array.from({ length: 25 }, (_, i) => `t${i}`);
+    await backend.embedBatch(texts);
 
-    // With concurrency=4 and 8 items with batchSize=1, should still make 8 requests
-    expect(mockFetch).toHaveBeenCalledTimes(8);
+    // With concurrency=20 and 25 items with batchSize=1, should make 25 requests
+    expect(mockFetch).toHaveBeenCalledTimes(25);
   });
 
   it('should preserve result order with parallel processing', async () => {
diff --git a/src/embeddings/ollama.ts b/src/embeddings/ollama.ts
index b45495a..0912d70 100644
--- a/src/embeddings/ollama.ts
+++ b/src/embeddings/ollama.ts
@@ -2,14 +2,14 @@ import type { EmbeddingBackend, EmbeddingConfig } from './types.js';
 import { chunkArray } from './types.js';
 import { fetchWithRetry } from './retry.js';
 
-/** Default batch size for Ollama (texts per batch request) */
-const DEFAULT_BATCH_SIZE = 10;
+/** Default batch size for Ollama (texts per request) */
+const DEFAULT_BATCH_SIZE = 50;
 
 /** Default concurrency for Ollama (parallel batch requests) */
-const DEFAULT_CONCURRENCY = 4;
+const DEFAULT_CONCURRENCY = 20;
 
-/** Default timeout for embedding requests (2 minutes per batch) */
-const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
+/** Default timeout for embedding requests (5 minutes per batch) */
+const DEFAULT_TIMEOUT_MS = 5 * 60 * 1000;
 
 /** Default Ollama model optimized for code search */
 export const DEFAULT_OLLAMA_MODEL = 'qwen3-embedding:0.6b';

From 41bbec79ea20795812ccee1222aa01e7ba484f8a Mon Sep 17 00:00:00 2001
From: nicholaspsmith
Date: Tue, 27 Jan 2026 11:03:00 -0500
Subject: [PATCH 2/2] fix: add diagnostic log before embedBatch call

Co-Authored-By: Claude
---
 src/search/indexer.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/search/indexer.ts b/src/search/indexer.ts
index 8ac2ad1..d610bc7 100644
--- a/src/search/indexer.ts
+++ b/src/search/indexer.ts
@@ -1438,6 +1438,9 @@ export class CodeIndexer {
     for (let i = 0; i < chunks.length; i += embeddingBatchSize) {
       const batch = chunks.slice(i, i + embeddingBatchSize);
       const texts = batch.map((c) => c.content);
+      console.error(
+        `[lance-context] Sending ${texts.length} texts to embedding backend (batch ${Math.floor(i / embeddingBatchSize) + 1}/${Math.ceil(chunks.length / embeddingBatchSize)})...`
+      );
       const embeddings = await this.embeddingBackend.embedBatch(texts);
       batch.forEach((chunk, idx) => {
         chunk.embedding = embeddings[idx];
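
Note on the batching pattern tuned by PATCH 1/2: with DEFAULT_BATCH_SIZE = 50 and DEFAULT_CONCURRENCY = 20, up to 50 * 20 = 1000 texts can be in flight per wave, which is the "1000 texts at a time instead of 40" figure in the commit message. The TypeScript below is a minimal sketch of that chunk-then-fan-out shape, not the actual OllamaBackend.embedBatch (its body is not shown in this diff): embedOne() is a hypothetical stand-in for a single Ollama embedding request, and chunk() is a local helper assumed to behave like chunkArray from './types.js'. The real code may use a worker-pool style limiter rather than fixed waves.

// Sketch only, under the assumptions above; not the library's implementation.
const BATCH_SIZE = 50;   // texts per request (raised from 10)
const CONCURRENCY = 20;  // parallel requests (raised from 4)

// Split an array into consecutive slices of at most `size` elements.
function chunk<T>(items: T[], size: number): T[][] {
  const out: T[][] = [];
  for (let i = 0; i < items.length; i += size) {
    out.push(items.slice(i, i + size));
  }
  return out;
}

async function embedBatchSketch(
  texts: string[],
  embedOne: (batch: string[]) => Promise<number[][]>
): Promise<number[][]> {
  const batches = chunk(texts, BATCH_SIZE);
  const results: number[][][] = new Array(batches.length);

  // Fan out up to CONCURRENCY batch requests per wave; writing results by
  // index keeps the output aligned with the input order.
  for (let i = 0; i < batches.length; i += CONCURRENCY) {
    const wave = batches.slice(i, i + CONCURRENCY);
    const waveResults = await Promise.all(wave.map((b) => embedOne(b)));
    waveResults.forEach((r, j) => {
      results[i + j] = r;
    });
  }
  return results.flat();
}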