diff --git a/src/providers/translationSuggestions/llmCompletion.ts b/src/providers/translationSuggestions/llmCompletion.ts index ec5bfbf64..0b22704f9 100644 --- a/src/providers/translationSuggestions/llmCompletion.ts +++ b/src/providers/translationSuggestions/llmCompletion.ts @@ -60,8 +60,29 @@ export async function llmCompletion( throw new Error(`No source content found for cell ${currentCellId}. The search index may be incomplete. Try running "Force Complete Rebuild" from the command palette.`); } + // Sanitize HTML content to extract plain text (handles transcription spans, etc.) + const sanitizeHtmlContent = (html: string): string => { + if (!html) return ''; + return html + .replace(/]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '') + .replace(/]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '') + .replace(/]*>[\s\S]*?<\/sup>/gi, '') + .replace(/<\/p>/gi, ' ') + .replace(/<[^>]*>/g, '') + .replace(/ /g, ' ') + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&#\d+;/g, ' ') + .replace(/&[a-zA-Z]+;/g, ' ') + .replace(/\s+/g, ' ') + .trim(); + }; + const sourceContent = validSourceCells - .map((cell) => cell!.content) + .map((cell) => sanitizeHtmlContent(cell!.content || "")) .join(" "); // Get few-shot examples (existing behavior encapsulated) @@ -117,20 +138,17 @@ export async function llmCompletion( // Create the prompt const userMessageInstructions = [ - "1. Analyze the provided reference data to understand the translation patterns and style.", + "1. Analyze any provided reference data to understand the translation patterns and style.", "2. Complete the partial or complete translation of the line.", "3. Ensure your translation fits seamlessly with the existing partial translation.", "4. Provide only the completed translation without any additional commentary or metadata.", `5. Translate only into the target language ${targetLanguage}.`, - "6. Pay careful attention to the provided reference data.", + "6. Use reference data and context when available to match the style.", "7. If in doubt, err on the side of literalness.", (completionConfig.allowHtmlPredictions ? "8. If the project has any styles, return HTML with the appropriate tags or classes as per the examples in the translation memory." : null) ].join("\n"); - let systemMessage = chatSystemMessage || `You are a helpful assistant`; - systemMessage += `\n\nAlways translate from the source language to the target language, ${targetLanguage}, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`; - systemMessage += `\n\n${userMessageInstructions}`; - // Note: Do not attempt to reduce reasoning via prompt text to avoid unintended behavior + const systemMessage = chatSystemMessage || `You are a helpful assistant`; // Note: Validation filtering is now implemented via the useOnlyValidatedExamples setting // This controls whether only validated translation pairs are used in few-shot examples diff --git a/src/providers/translationSuggestions/shared.ts b/src/providers/translationSuggestions/shared.ts index efb388ef1..55f4df249 100644 --- a/src/providers/translationSuggestions/shared.ts +++ b/src/providers/translationSuggestions/shared.ts @@ -29,6 +29,27 @@ export async function fetchFewShotExamples( console.error(`[fetchFewShotExamples] Query was: "${sourceContent}", candidates: ${initialCandidateCount}, validated: ${useOnlyValidatedExamples}`); } + // Sanitize HTML content for consistent comparison (handles transcription spans, etc.) + const sanitizeHtmlContent = (html: string): string => { + if (!html) return ''; + return html + .replace(/]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '') + .replace(/]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '') + .replace(/]*>[\s\S]*?<\/sup>/gi, '') + .replace(/<\/p>/gi, ' ') + .replace(/<[^>]*>/g, '') + .replace(/ /g, ' ') + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&#\d+;/g, ' ') + .replace(/&[a-zA-Z]+;/g, ' ') + .replace(/\s+/g, ' ') + .trim(); + }; + // Instead of filtering, rank all valid complete pairs by relevance const currentTokens = tokenizeText({ method: "whitespace_and_punctuation", text: sourceContent }); @@ -54,8 +75,10 @@ export async function fetchFewShotExamples( }) .map((pair) => { // Calculate relevance score based on token overlap - const pairSourceContent = pair.sourceCell?.content || ""; - const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContent }); + // Sanitize pair source content to match the sanitized query content + const pairSourceContentRaw = pair.sourceCell?.content || ""; + const pairSourceContentSanitized = sanitizeHtmlContent(pairSourceContentRaw); + const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContentSanitized }); // Calculate overlap ratio const overlapCount = currentTokens.filter(token => pairTokens.includes(token)).length; @@ -142,8 +165,10 @@ export async function getPrecedingTranslationPairs( const maybeHtmlOrPlain = allowHtml ? (cellContent || "").trim() : stripHtmlTags(cellContent).trim(); const safeContent = maybeHtmlOrPlain || notTranslatedYetMessage; + // Always strip HTML from source to avoid teaching LLM about transcription markup + const sanitizedSourceContent = stripHtmlTags(combinedSourceContent).trim(); + const sourceInner = allowHtml ? wrapCdata(sanitizedSourceContent) : xmlEscape(sanitizedSourceContent); const targetInner = allowHtml ? wrapCdata(safeContent) : xmlEscape(safeContent); - const sourceInner = allowHtml ? wrapCdata(combinedSourceContent) : xmlEscape(combinedSourceContent); return `${sourceInner}${targetInner}`; }) ); @@ -162,9 +187,11 @@ export function buildFewShotExamplesText( .map((pair) => { const sourceRaw = pair.sourceCell?.content ?? ""; const targetRaw = pair.targetCell?.content ?? ""; + // Always strip HTML from source to avoid teaching LLM about transcription markup + const source = stripHtmlTags(sourceRaw).trim(); const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim(); + const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source); const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target); - const sourceInner = allowHtml ? wrapCdata(sourceRaw) : xmlEscape(sourceRaw); // Format examples based on the setting if (exampleFormat === "target-only") { @@ -201,8 +228,8 @@ export function buildMessages( } else { systemMessage += `\n\nReturn plain text only (no XML/HTML). Preserve original line breaks from by returning text with the same number of lines separated by newline characters.`; } - systemMessage += `\n\nAlways translate from the source language to the target language, ${targetLanguage || "" - }, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`; + systemMessage += `\n\nAlways translate from the source language to the target language ${targetLanguage || "" + }. Use reference data and context when provided to match patterns and style. If no reference data is available, provide a natural, accurate translation.`; systemMessage += `\n\n${userInstructions.join("\n")}`; const contextXml = `\n${precedingContextPairs.filter(Boolean).join("\n")}\n`; diff --git a/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts b/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts index 9e7fd82f1..112ee24a2 100644 --- a/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts +++ b/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts @@ -91,7 +91,11 @@ export class WhisperTranscriptionClient { case 'error': { this.cleanup(); - let errorMsg = message.message || 'Transcription failed'; + // Handle empty or missing error messages from server + let errorMsg = message.message?.trim() || 'Transcription failed'; + if (!errorMsg || errorMsg === '') { + errorMsg = 'Transcription failed: Server returned an error with no details. This may indicate a temporary service issue or network problem.'; + } // Enhance error messages for common DNS/connection issues if (errorMsg.includes('Name or service not known') || errorMsg.includes('Errno -2')) { errorMsg = `Transcription failed: Unable to resolve ASR endpoint hostname. This usually means:\n1. You may be logged out - please check your authentication status\n2. The endpoint URL is invalid or unreachable\n3. There may be a network connectivity issue\n\nOriginal error: ${errorMsg}`;