Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions src/providers/translationSuggestions/llmCompletion.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,29 @@ export async function llmCompletion(
throw new Error(`No source content found for cell ${currentCellId}. The search index may be incomplete. Try running "Force Complete Rebuild" from the command palette.`);
}

/**
 * Reduce an HTML fragment to plain text (handles transcription spans, etc.).
 *
 * Steps, in order:
 *  1. Remove every `<sup>` element together with its content (footnote markers).
 *  2. Turn closing `</p>` tags into a space so adjacent paragraphs do not fuse.
 *  3. Drop all remaining tags.
 *  4. Decode character references in a SINGLE pass: `&nbsp;`, `&amp;`, `&lt;`,
 *     `&gt;`, `&quot;` and `&#39;` map to their characters; any other decimal
 *     (`&#123;`) or named (`&copy;`) reference becomes a space. Hexadecimal
 *     references (`&#x27;`) are left untouched.
 *  5. Collapse whitespace runs to one space and trim.
 *
 * @param html - Raw cell content; may be empty.
 * @returns The plain-text rendering, or `''` for empty input.
 */
const sanitizeHtmlContent = (html: string): string => {
    if (!html) return '';
    return html
        .replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<\/p>/gi, ' ')
        .replace(/<[^>]*>/g, '')
        // Decode each reference exactly once. Chained replaces would decode the
        // output of `&amp;` a second time (e.g. `&amp;lt;` -> `&lt;` -> `<`).
        .replace(/&(?:#\d+|[a-zA-Z]+);/g, (entity) => {
            switch (entity) {
                case '&nbsp;':
                    return ' ';
                case '&amp;':
                    return '&';
                case '&lt;':
                    return '<';
                case '&gt;':
                    return '>';
                case '&quot;':
                    return '"';
                case '&#39;':
                    return "'";
                default:
                    // Unknown named or numeric reference: treat as a word break.
                    return ' ';
            }
        })
        .replace(/\s+/g, ' ')
        .trim();
};

const sourceContent = validSourceCells
.map((cell) => cell!.content)
.map((cell) => sanitizeHtmlContent(cell!.content || ""))
.join(" ");

// Get few-shot examples (existing behavior encapsulated)
Expand Down Expand Up @@ -117,20 +138,17 @@ export async function llmCompletion(

// Create the prompt
const userMessageInstructions = [
"1. Analyze the provided reference data to understand the translation patterns and style.",
"1. Analyze any provided reference data to understand the translation patterns and style.",
"2. Complete the partial or complete translation of the line.",
"3. Ensure your translation fits seamlessly with the existing partial translation.",
"4. Provide only the completed translation without any additional commentary or metadata.",
`5. Translate only into the target language ${targetLanguage}.`,
"6. Pay careful attention to the provided reference data.",
"6. Use reference data and context when available to match the style.",
"7. If in doubt, err on the side of literalness.",
(completionConfig.allowHtmlPredictions ? "8. If the project has any styles, return HTML with the appropriate tags or classes as per the examples in the translation memory." : null)
].join("\n");

let systemMessage = chatSystemMessage || `You are a helpful assistant`;
systemMessage += `\n\nAlways translate from the source language to the target language, ${targetLanguage}, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`;
systemMessage += `\n\n${userMessageInstructions}`;
// Note: Do not attempt to reduce reasoning via prompt text to avoid unintended behavior
const systemMessage = chatSystemMessage || `You are a helpful assistant`;

// Note: Validation filtering is now implemented via the useOnlyValidatedExamples setting
// This controls whether only validated translation pairs are used in few-shot examples
Expand Down
39 changes: 33 additions & 6 deletions src/providers/translationSuggestions/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,27 @@ export async function fetchFewShotExamples(
console.error(`[fetchFewShotExamples] Query was: "${sourceContent}", candidates: ${initialCandidateCount}, validated: ${useOnlyValidatedExamples}`);
}

/**
 * Reduce an HTML fragment to plain text so that pair content can be compared
 * consistently with the sanitized query (handles transcription spans, etc.).
 *
 * Steps, in order:
 *  1. Remove every `<sup>` element together with its content (footnote markers).
 *  2. Turn closing `</p>` tags into a space so adjacent paragraphs do not fuse.
 *  3. Drop all remaining tags.
 *  4. Decode character references in a SINGLE pass: `&nbsp;`, `&amp;`, `&lt;`,
 *     `&gt;`, `&quot;` and `&#39;` map to their characters; any other decimal
 *     (`&#123;`) or named (`&copy;`) reference becomes a space. Hexadecimal
 *     references (`&#x27;`) are left untouched.
 *  5. Collapse whitespace runs to one space and trim.
 *
 * @param html - Raw cell content; may be empty.
 * @returns The plain-text rendering, or `''` for empty input.
 */
const sanitizeHtmlContent = (html: string): string => {
    if (!html) return '';
    return html
        .replace(/<sup[^>]*class=["']footnote-marker["'][^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<sup[^>]*data-footnote[^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<sup[^>]*>[\s\S]*?<\/sup>/gi, '')
        .replace(/<\/p>/gi, ' ')
        .replace(/<[^>]*>/g, '')
        // Decode each reference exactly once. Chained replaces would decode the
        // output of `&amp;` a second time (e.g. `&amp;lt;` -> `&lt;` -> `<`).
        .replace(/&(?:#\d+|[a-zA-Z]+);/g, (entity) => {
            switch (entity) {
                case '&nbsp;':
                    return ' ';
                case '&amp;':
                    return '&';
                case '&lt;':
                    return '<';
                case '&gt;':
                    return '>';
                case '&quot;':
                    return '"';
                case '&#39;':
                    return "'";
                default:
                    // Unknown named or numeric reference: treat as a word break.
                    return ' ';
            }
        })
        .replace(/\s+/g, ' ')
        .trim();
};

// Instead of filtering, rank all valid complete pairs by relevance
const currentTokens = tokenizeText({ method: "whitespace_and_punctuation", text: sourceContent });

Expand All @@ -54,8 +75,10 @@ export async function fetchFewShotExamples(
})
.map((pair) => {
// Calculate relevance score based on token overlap
const pairSourceContent = pair.sourceCell?.content || "";
const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContent });
// Sanitize pair source content to match the sanitized query content
const pairSourceContentRaw = pair.sourceCell?.content || "";
const pairSourceContentSanitized = sanitizeHtmlContent(pairSourceContentRaw);
const pairTokens = tokenizeText({ method: "whitespace_and_punctuation", text: pairSourceContentSanitized });

// Calculate overlap ratio
const overlapCount = currentTokens.filter(token => pairTokens.includes(token)).length;
Expand Down Expand Up @@ -142,8 +165,10 @@ export async function getPrecedingTranslationPairs(
const maybeHtmlOrPlain = allowHtml ? (cellContent || "").trim() : stripHtmlTags(cellContent).trim();
const safeContent = maybeHtmlOrPlain || notTranslatedYetMessage;

// Always strip HTML from source to avoid teaching LLM about transcription markup
const sanitizedSourceContent = stripHtmlTags(combinedSourceContent).trim();
const sourceInner = allowHtml ? wrapCdata(sanitizedSourceContent) : xmlEscape(sanitizedSourceContent);
const targetInner = allowHtml ? wrapCdata(safeContent) : xmlEscape(safeContent);
const sourceInner = allowHtml ? wrapCdata(combinedSourceContent) : xmlEscape(combinedSourceContent);
return `<contextItem><source>${sourceInner}</source><target>${targetInner}</target></contextItem>`;
})
);
Expand All @@ -162,9 +187,11 @@ export function buildFewShotExamplesText(
.map((pair) => {
const sourceRaw = pair.sourceCell?.content ?? "";
const targetRaw = pair.targetCell?.content ?? "";
// Always strip HTML from source to avoid teaching LLM about transcription markup
const source = stripHtmlTags(sourceRaw).trim();
const target = allowHtml ? targetRaw.trim() : stripHtmlTags(targetRaw).trim();
const sourceInner = allowHtml ? wrapCdata(source) : xmlEscape(source);
const targetInner = allowHtml ? wrapCdata(target) : xmlEscape(target);
const sourceInner = allowHtml ? wrapCdata(sourceRaw) : xmlEscape(sourceRaw);

// Format examples based on the setting
if (exampleFormat === "target-only") {
Expand Down Expand Up @@ -201,8 +228,8 @@ export function buildMessages(
} else {
systemMessage += `\n\nReturn plain text only (no XML/HTML). Preserve original line breaks from <currentTask><source> by returning text with the same number of lines separated by newline characters.`;
}
systemMessage += `\n\nAlways translate from the source language to the target language, ${targetLanguage || ""
}, relying strictly on reference data and context provided by the user. The language may be an ultra-low resource language, so it is critical to follow the patterns and style of the provided reference data closely.`;
systemMessage += `\n\nAlways translate from the source language to the target language ${targetLanguage || ""
}. Use reference data and context when provided to match patterns and style. If no reference data is available, provide a natural, accurate translation.`;
systemMessage += `\n\n${userInstructions.join("\n")}`;

const contextXml = `<context>\n${precedingContextPairs.filter(Boolean).join("\n")}\n</context>`;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,11 @@ export class WhisperTranscriptionClient {

case 'error': {
this.cleanup();
let errorMsg = message.message || 'Transcription failed';
// Handle empty or missing error messages from server
let errorMsg = message.message?.trim() || 'Transcription failed';
if (!errorMsg || errorMsg === '') {
errorMsg = 'Transcription failed: Server returned an error with no details. This may indicate a temporary service issue or network problem.';
}
// Enhance error messages for common DNS/connection issues
if (errorMsg.includes('Name or service not known') || errorMsg.includes('Errno -2')) {
errorMsg = `Transcription failed: Unable to resolve ASR endpoint hostname. This usually means:\n1. You may be logged out - please check your authentication status\n2. The endpoint URL is invalid or unreachable\n3. There may be a network connectivity issue\n\nOriginal error: ${errorMsg}`;
Expand Down