From 0a39773e5c2a0f25ee3ba7e6bf8e82d1274e2af4 Mon Sep 17 00:00:00 2001
From: nicholaspsmith
Date: Tue, 27 Jan 2026 11:02:31 -0500
Subject: [PATCH 1/2] perf: increase Ollama parallelism for better GPU utilization

- Increase batch size from 10 to 50 texts per request
- Increase concurrency from 4 to 20 parallel requests
- Processes 1000 texts at a time instead of 40

Co-Authored-By: Claude
---
 src/__tests__/embeddings/ollama.test.ts | 19 ++++++++++---------
 src/embeddings/ollama.ts                | 10 +++++-----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/__tests__/embeddings/ollama.test.ts b/src/__tests__/embeddings/ollama.test.ts
index a06093c..61d254e 100644
--- a/src/__tests__/embeddings/ollama.test.ts
+++ b/src/__tests__/embeddings/ollama.test.ts
@@ -250,7 +250,7 @@ describe('OllamaBackend', () => {
     expect(result).toEqual([[0.1], [0.2], [0.3], [0.4], [0.5]]);
   });
 
-  it('should use default batch size of 10', async () => {
+  it('should use default batch size of 50', async () => {
     const mockFetch = vi.fn().mockImplementation(async (_url, options) => {
       const body = JSON.parse(options.body as string);
       const inputLen = body.input.length;
@@ -260,14 +260,14 @@
     });
     vi.stubGlobal('fetch', mockFetch);
 
-    // Create backend without custom batchSize (should use default 10)
+    // Create backend without custom batchSize (should use default 50)
     const backend = new OllamaBackend({ backend: 'ollama' });
-    const texts = Array.from({ length: 25 }, (_, i) => `text${i}`);
+    const texts = Array.from({ length: 120 }, (_, i) => `text${i}`);
     const result = await backend.embedBatch(texts);
 
-    // Should make 3 batch requests (10+10+5)
+    // Should make 3 batch requests (50+50+20)
     expect(mockFetch).toHaveBeenCalledTimes(3);
-    expect(result).toHaveLength(25);
+    expect(result).toHaveLength(120);
   });
 
   it('should process batches in parallel based on concurrency', async () => {
@@ -292,16 +292,17 @@
     expect(result).toHaveLength(4);
   });
 
-  it('should use default concurrency of 4', async () => {
+  it('should use default concurrency of 20', async () => {
     const mockFetch = vi.fn().mockResolvedValue(createOllamaBatchEmbeddingResponse([[0.1]]));
     vi.stubGlobal('fetch', mockFetch);
 
     // Create backend with small batchSize to trigger multiple batches
     const backend = new OllamaBackend({ backend: 'ollama', batchSize: 1 });
-    await backend.embedBatch(['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8']);
+    const texts = Array.from({ length: 25 }, (_, i) => `t${i}`);
+    await backend.embedBatch(texts);
 
-    // With concurrency=4 and 8 items with batchSize=1, should still make 8 requests
-    expect(mockFetch).toHaveBeenCalledTimes(8);
+    // With concurrency=20 and 25 items with batchSize=1, should make 25 requests
+    expect(mockFetch).toHaveBeenCalledTimes(25);
   });
 
   it('should preserve result order with parallel processing', async () => {
diff --git a/src/embeddings/ollama.ts b/src/embeddings/ollama.ts
index b45495a..0912d70 100644
--- a/src/embeddings/ollama.ts
+++ b/src/embeddings/ollama.ts
@@ -2,14 +2,14 @@ import type { EmbeddingBackend, EmbeddingConfig } from './types.js';
 import { chunkArray } from './types.js';
 import { fetchWithRetry } from './retry.js';
 
-/** Default batch size for Ollama (texts per batch request) */
-const DEFAULT_BATCH_SIZE = 10;
+/** Default batch size for Ollama (texts per request) */
+const DEFAULT_BATCH_SIZE = 50;
 
 /** Default concurrency for Ollama (parallel batch requests) */
-const DEFAULT_CONCURRENCY = 4;
+const DEFAULT_CONCURRENCY = 20;
 
-/** Default timeout for embedding requests (2 minutes per batch) */
-const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
+/** Default timeout for embedding requests (5 minutes per batch) */
+const DEFAULT_TIMEOUT_MS = 5 * 60 * 1000;
 
 /** Default Ollama model optimized for code search */
 export const DEFAULT_OLLAMA_MODEL = 'qwen3-embedding:0.6b';

From 41bbec79ea20795812ccee1222aa01e7ba484f8a Mon Sep 17 00:00:00 2001
From: nicholaspsmith
Date: Tue, 27 Jan 2026 11:03:00 -0500
Subject: [PATCH 2/2] fix: add diagnostic log before embedBatch call

Co-Authored-By: Claude
---
 src/search/indexer.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/search/indexer.ts b/src/search/indexer.ts
index 8ac2ad1..d610bc7 100644
--- a/src/search/indexer.ts
+++ b/src/search/indexer.ts
@@ -1438,6 +1438,9 @@ export class CodeIndexer {
     for (let i = 0; i < chunks.length; i += embeddingBatchSize) {
       const batch = chunks.slice(i, i + embeddingBatchSize);
       const texts = batch.map((c) => c.content);
+      console.error(
+        `[lance-context] Sending ${texts.length} texts to embedding backend (batch ${Math.floor(i / embeddingBatchSize) + 1}/${Math.ceil(chunks.length / embeddingBatchSize)})...`
+      );
       const embeddings = await this.embeddingBackend.embedBatch(texts);
       batch.forEach((chunk, idx) => {
         chunk.embedding = embeddings[idx];
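
Note on the batching pattern tuned by PATCH 1/2: with DEFAULT_BATCH_SIZE = 50 and DEFAULT_CONCURRENCY = 20, up to 50 * 20 = 1000 texts can be in flight per wave, which is the "1000 texts at a time instead of 40" figure in the commit message. The TypeScript below is a minimal sketch of that chunk-then-fan-out shape, not the actual OllamaBackend.embedBatch (its body is not shown in this diff): embedOne() is a hypothetical stand-in for a single Ollama embedding request, and chunk() is a local helper assumed to behave like chunkArray from './types.js'. The real code may use a worker-pool style limiter rather than fixed waves.

// Sketch only, under the assumptions above; not the library's implementation.
const BATCH_SIZE = 50;   // texts per request (raised from 10)
const CONCURRENCY = 20;  // parallel requests (raised from 4)

// Split an array into consecutive slices of at most `size` elements.
function chunk<T>(items: T[], size: number): T[][] {
  const out: T[][] = [];
  for (let i = 0; i < items.length; i += size) {
    out.push(items.slice(i, i + size));
  }
  return out;
}

async function embedBatchSketch(
  texts: string[],
  embedOne: (batch: string[]) => Promise<number[][]>
): Promise<number[][]> {
  const batches = chunk(texts, BATCH_SIZE);
  const results: number[][][] = new Array(batches.length);

  // Fan out up to CONCURRENCY batch requests per wave; writing results by
  // index keeps the output aligned with the input order.
  for (let i = 0; i < batches.length; i += CONCURRENCY) {
    const wave = batches.slice(i, i + CONCURRENCY);
    const waveResults = await Promise.all(wave.map((b) => embedOne(b)));
    waveResults.forEach((r, j) => {
      results[i + j] = r;
    });
  }
  return results.flat();
}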