genesis-ai-dev · dadukhankevin · Feb 6, 2026 · Feb 9, 2026
diff --git a/src/copilotSettings/copilotSettings.ts b/src/copilotSettings/copilotSettings.ts
@@ -351,7 +351,7 @@ export async function generateChatSystemMessage(
 
         const prompt = `Generate a concise, one-paragraph set of linguistic instructions critical for a linguistically informed translator to keep in mind at all times when translating from ${sourceLanguage.refName} to ${targetLanguage.refName}. Keep it to a single plaintext paragraph. Note key lexicosemantic, information structuring, register-relevant and other key distinctions necessary for grammatical, natural text in ${targetLanguage.refName} if the starting place is ${sourceLanguage.refName}. ${htmlInstruction} Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters. Do not include XML in your answer.`;
 
-        const response = await callLLM(
+        const result = await callLLM(
             [
                 {
                     role: "user",
@@ -361,7 +361,7 @@ export async function generateChatSystemMessage(
             llmConfig
         );
 
-        return response;
+        return result.text;
     } catch (error) {
         debug("[generateChatSystemMessage] Error generating message:", error);
         return null;

diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -836,6 +836,15 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
         const typedEvent = event as Extract<EditorPostMessages, { command: "llmCompletion"; }>;
         debug("llmCompletion message received", { event, document, provider, webviewPanel });
 
+        // Fire-and-forget: record single-cell translation telemetry
+        import("../../utils/abTestingAnalytics").then(({ recordAbResult }) =>
+            recordAbResult({
+                category: "batch_vs_single",
+                options: ["single", "batch"],
+                winner: 0,
+            })
+        ).catch(() => { /* analytics must never block translation */ });
+
         const cellId = typedEvent.content.currentLineId;
         const addContentToValue = typedEvent.content.addContentToValue;
 
@@ -1442,10 +1451,19 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
 
     selectABTestVariant: async ({ event, document, webviewPanel, provider }) => {
         const typedEvent = event as Extract<EditorPostMessages, { command: "selectABTestVariant"; }>;
-        const { cellId, selectedIndex, selectedContent, testId, testName, selectionTimeMs, variants } = typedEvent.content || {};
-        const variantNames: string[] | undefined = variants;
+        const { cellId, selectedIndex, selectedContent, testId, testName, variants, models } = typedEvent.content || {};
         const isRecovery = testName === "Recovery" || (typeof testId === "string" && testId.includes("-recovery-"));
 
+        // Decrement pending A/B test count so normal source highlighting can resume
+        if (provider.pendingABTestCount > 0) {
+            provider.pendingABTestCount--;
+        }
+
+        // For model comparison tests, use model names as the analytics options;
+        // otherwise fall back to the variant text (existing behavior).
+        const isModelComparison = testName === "model_comparison" && Array.isArray(models) && models.length > 0;
+        const variantNames: string[] | undefined = isModelComparison ? models : variants;
+
         // Check if this was a pending attention check
         const attentionCheck = getAttentionCheck(testId);
 
@@ -1459,7 +1477,6 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                     testId,
                     cellId,
                     passed: !pickedWrong,
-                    selectionTimeMs,
                     correctIndex: attentionCheck.correctIndex,
                     decoyCellId: attentionCheck.decoyCellId
                 });
@@ -1481,17 +1498,18 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                             testName: "Recovery",
                         },
                     });
+                    provider.pendingABTestCount++;
                 }
                 return;
             }
 
             // User picked correctly - apply and clear
             clearAttentionCheck(testId);
         } else {
-            // Regular A/B test
+            // Regular A/B test (including model comparison)
             if (!isRecovery) {
                 const { recordVariantSelection } = await import("../../utils/abTestingUtils");
-                await recordVariantSelection(testId, cellId, selectedIndex, selectionTimeMs, variantNames, testName);
+                await recordVariantSelection(testId, cellId, selectedIndex, variantNames, testName);
             }
         }
 
@@ -1507,7 +1525,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
             }
         }
 
-        debug(`A/B test feedback recorded: Cell ${cellId}, variant ${selectedIndex}, test ${testId}, took ${selectionTimeMs}ms`);
+        debug(`A/B test feedback recorded: Cell ${cellId}, variant ${selectedIndex}, test ${testId}`);
     },
 
     updateCellDisplayMode: async ({ event, document, webviewPanel, provider }) => {

diff --git a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
@@ -2,7 +2,7 @@ import * as vscode from "vscode";
 import { fetchCompletionConfig } from "@/utils/llmUtils";
 import { CodexNotebookReader } from "../../serializer";
 import { workspaceStoreListener } from "../../utils/workspaceEventListener";
-import { llmCompletion } from "../translationSuggestions/llmCompletion";
+import { llmCompletion, LLMCompletionResult } from "../translationSuggestions/llmCompletion";
 import { CodexCellTypes, EditType } from "../../../types/enums";
 import {
     QuillCellContent,
@@ -143,6 +143,10 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
     }[] = [];
     private isProcessingQueue: boolean = false;
 
+    // Number of A/B tests awaiting user selection. While > 0 during batch autocompletion,
+    // source highlighting is driven by the webview's A/B queue instead of normal cell navigation.
+    public pendingABTestCount: number = 0;
+
     // New state for autocompletion process
     public autocompletionState: {
         isProcessing: boolean;
@@ -335,6 +339,15 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                         // Only send highlight messages to source files when a codex file is active
                         const valueIsCodexFile = this.isCodexFile(value.uri);
                         if (valueIsCodexFile) {
+                            // When A/B tests are queued during batch, let the webview's
+                            // A/B queue drive source highlighting instead of batch navigation.
+                            const suppressSourceHighlight =
+                                this.pendingABTestCount > 0 &&
+                                this.autocompletionState.isProcessing;
+                            if (suppressSourceHighlight) {
+                                debug("Suppressing source highlight during A/B test queue");
+                                return;
+                            }
                             debug("Processing codex file highlight");
                             // Send highlight using cellId (primary) or globalReferences (if available)
                             for (const [panelUri, panel] of this.webviewPanels.entries()) {
@@ -1641,6 +1654,15 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
             // Send state to webview
             this.broadcastAutocompletionState();
 
+            // Fire-and-forget: record batch translation telemetry (once per batch initiation)
+            import("../../utils/abTestingAnalytics").then(({ recordAbResult }) =>
+                recordAbResult({
+                    category: "batch_vs_single",
+                    options: ["single", "batch"],
+                    winner: 1,
+                })
+            ).catch(() => { /* analytics must never block translation */ });
+
             // Determine if LLM is ready (API key or auth token). We still run transcriptions even if not ready.
             let llmReady = true;
             try {
@@ -2541,10 +2563,19 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                 const sourceUri = getCorrespondingSourceUri(codexUri);
 
                 // Send highlight/clear messages and milestone jump to source files when a codex file is active
+                // When A/B tests are pending, source highlighting is driven by the
+                // webview's A/B queue (via setCurrentIdToGlobalState for the active test).
+                // Skip source-panel updates here so batch processing doesn't override it.
+                const suppressSourceHighlight = this.pendingABTestCount > 0 && this.autocompletionState.isProcessing;
+
                 for (const [panelUri, panel] of this.webviewPanels.entries()) {
                     const isSourceFile = this.isSourceText(panelUri);
                     // copy this to update target with merged cells
                     if (isSourceFile) {
+                        if (suppressSourceHighlight) {
+                            continue;
+                        }
+
                         // Check if this is the matching source file
                         const isMatchingSource = sourceUri && panelUri === sourceUri.toString();
 
@@ -3386,7 +3417,7 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                         new vscode.CancellationTokenSource().token;
 
                     // Determine if this is a batch operation (chapter autocomplete or multiple cells queued)
-                    // A/B testing is disabled during batch operations to avoid interrupting the workflow
+                    // During batch, A/B tests are non-blocking: variants are sent to the webview for later user selection (the cell is not written yet) and the queue continues
                     const isBatchOperation = this.autocompletionState.isProcessing ||
                         (this.singleCellQueueState.isProcessing && this.singleCellQueueState.totalCells > 1);
 
@@ -3407,8 +3438,8 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                     }
 
                     // If multiple variants are present, send to the webview for selection
-                    if (completionResult && Array.isArray((completionResult as any).variants) && (completionResult as any).variants.length > 1) {
-                        const { variants, testId, testName, isAttentionCheck, correctIndex, decoyCellId } = completionResult as any;
+                    if (completionResult && Array.isArray(completionResult.variants) && completionResult.variants.length > 1) {
+                        const { variants, testId, testName, isAttentionCheck, correctIndex, decoyCellId, models } = completionResult;
 
                         // If variants are identical (ignoring whitespace), treat as single completion
                         try {
@@ -3432,21 +3463,21 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                             debug("Error comparing variants for identity; proceeding with A/B UI", { error: e });
                         }
 
-                        if (webviewPanel) {
-                            const actualTestId = testId || `${currentCellId}-${Date.now()}`;
+                        const actualTestId = testId || `${currentCellId}-${Date.now()}`;
 
-                            // If this is an attention check, register it so we can handle the response
-                            if (isAttentionCheck && typeof correctIndex === 'number') {
-                                const { registerAttentionCheck } = await import("./codexCellEditorMessagehandling");
-                                registerAttentionCheck(actualTestId, {
-                                    cellId: currentCellId,
-                                    correctIndex,
-                                    correctVariant: variants[correctIndex],
-                                    decoyCellId,
-                                });
-                                console.log(`[Attention Check] Registered for testId ${actualTestId}, correctIndex ${correctIndex}`);
-                            }
+                        // If this is an attention check, register it so we can handle the response
+                        if (isAttentionCheck && typeof correctIndex === 'number') {
+                            const { registerAttentionCheck } = await import("./codexCellEditorMessagehandling");
+                            registerAttentionCheck(actualTestId, {
+                                cellId: currentCellId,
+                                correctIndex,
+                                correctVariant: variants[correctIndex],
+                                decoyCellId,
+                            });
+                            console.log(`[Attention Check] Registered for testId ${actualTestId}, correctIndex ${correctIndex}`);
+                        }
 
+                        if (webviewPanel) {
                             // Send variants to webview - frontend doesn't need attention check details
                             this.postMessageToWebview(webviewPanel, {
                                 type: "providerSendsABTestVariants",
@@ -3455,21 +3486,38 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
                                     cellId: currentCellId,
                                     testId: actualTestId,
                                     testName,
+                                    // Include model identifiers for server-initiated model comparison tests
+                                    ...(Array.isArray(models) && models.length > 0 ? { models } : {}),
                                 },
                             });
+                            this.pendingABTestCount++;
                         }
 
-                        // Mark single cell translation as complete so UI progress/spinners stop
-                        this.updateSingleCellTranslation(1.0);
+                        if (isBatchOperation) {
+                            // NON-BLOCKING path: don't write to the cell yet — the user
+                            // needs to pick a variant first via the A/B selector. The
+                            // selectABTestVariant handler will persist their choice.
+                            // The queue continues immediately without waiting.
+                            // Source panel scrolling is driven by the webview based on
+                            // which A/B test is currently displayed (not queued).
+                            this.updateSingleCellTranslation(1.0);
+
+                            debug("LLM completion A/B variants sent (batch non-blocking, awaiting user selection)", {
+                                cellId: currentCellId,
+                                variantsCount: variants?.length,
+                            });
+                            return "";
+                        }
 
-                        // Do not update the cell value now; the frontend will apply the chosen variant
-                        // Return an empty string for consistency with callers expecting a string
+                        // BLOCKING path (single-cell): do not update the cell value now;
+                        // the frontend will apply the chosen variant when the user selects one.
+                        this.updateSingleCellTranslation(1.0);
                         debug("LLM completion A/B variants sent", { cellId: currentCellId, variantsCount: variants?.length });
                         return "";
                     }
 
                     // Otherwise, handle as a single completion using the first variant
-                    const singleCompletion = (completionResult as any)?.variants?.[0] ?? "";
+                    const singleCompletion = completionResult?.variants?.[0] ?? "";
 
                     progress.report({ message: "Updating document...", increment: 40 });
 

diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts
@@ -90,6 +90,8 @@ export interface LLMCompletionResult {
     isAttentionCheck?: boolean;
     correctIndex?: number;
     decoyCellId?: string;
+    /** Model identifiers for server-initiated model comparison A/B tests. */
+    models?: string[];
 }
 
 export async function llmCompletion(
@@ -246,10 +248,10 @@ export async function llmCompletion(
             );
 
             // Unified AB testing via registry with random test selection (global gating)
-            // A/B testing is disabled during batch operations (chapter autocomplete, batch transcription)
-            // to avoid interrupting the user with variant selection UI
+            // A/B tests can fire during batch operations — the caller queues them
+            // non-blockingly for user selection
             const extConfig = vscode.workspace.getConfiguration("codex-editor-extension");
-            const abEnabled = Boolean(extConfig.get("abTestingEnabled") ?? true) && !isBatchOperation;
+            const abEnabled = Boolean(extConfig.get("abTestingEnabled") ?? true);
             const abProbabilityRaw = extConfig.get<number>("abTestingProbability");
             const abProbability = Math.max(0, Math.min(1, typeof abProbabilityRaw === "number" ? abProbabilityRaw : 0.15));
             const randomValue = Math.random();
@@ -260,9 +262,7 @@ export async function llmCompletion(
             }
 
             if (!triggerAB && completionConfig.debugMode) {
-                if (isBatchOperation) {
-                    console.debug(`[llmCompletion] A/B testing disabled during batch operation`);
-                } else if (!abEnabled) {
+                if (!abEnabled) {
                     console.debug(`[llmCompletion] A/B testing disabled in settings`);
                 } else {
                     console.debug(`[llmCompletion] A/B test not triggered (random ${randomValue.toFixed(3)} >= probability ${abProbability})`);
@@ -313,10 +313,33 @@ export async function llmCompletion(
                 }
             }
 
-            // A/B testing not triggered (or failed): call LLM once, return two identical variants
-            const completion = await callLLM(messages, completionConfig, token);
+            // A/B testing not triggered (or failed): call LLM with ab_eligible flag
+            // so the server can optionally return a multi-model A/B test response.
+            const llmResult = await callLLM(messages, completionConfig, token, /* abEligible */ true);
             const allowHtml = Boolean(completionConfig.allowHtmlPredictions);
 
+            // If the server returned a multi-model A/B test, build the result from it
+            if (llmResult.abTest) {
+                const serverVariants = llmResult.abTest.variants.map((txt) =>
+                    postProcessABTestResult(txt, allowHtml, returnHTML)
+                );
+                if (completionConfig.debugMode) {
+                    console.debug(
+                        `[llmCompletion] Server returned model A/B test: models=${llmResult.abTest.models.join(", ")}, variants=${serverVariants.length}`
+                    );
+                }
+                return {
+                    variants: serverVariants,
+                    isABTest: true,
+                    testId: `${currentCellId}-model-${Date.now()}`,
+                    testName: "model_comparison",
+                    models: llmResult.abTest.models,
+                };
+            }
+
+            // Standard single-completion path
+            const completion = llmResult.text;
+
             // Preserve multi-line completions: strip any leading "->" markers per line, then join with <br/>
             const lines = (completion || "").split(/\r?\n/);
             const processed = lines