diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts
index 73c7bc8c2..7bedf94ad 100644
--- a/src/exportHandler/exportHandler.ts
+++ b/src/exportHandler/exportHandler.ts
@@ -853,6 +853,198 @@ async function exportCodexContentAsObsRoundtrip(
     );
 }
 
+/**
+ * USFM (Unified Standard Format Marker) round-trip export.
+ *
+ * For each selected codex notebook that came from a USFM importer, loads the
+ * original USFM text, merges the translated cells back in via
+ * `exportUsfmRoundtrip`, and writes `<name>_<timestamp>_translated.<ext>` into
+ * the export folder. A failure in one file is reported and does not stop the rest.
+ *
+ * @param userSelectedPath - Folder the rebuilt USFM files are written to.
+ * @param filesToExport - Paths of the codex notebooks to export.
+ * @param _options - Unused; accepted for parity with the other exporters.
+ */
+async function exportCodexContentAsUsfmRoundtrip(
+    userSelectedPath: string,
+    filesToExport: string[],
+    _options?: ExportOptions
+) {
+    const workspaceFolder = vscode.workspace.workspaceFolders?.[0];
+    if (!workspaceFolder) {
+        vscode.window.showErrorMessage("No workspace folder found.");
+        return;
+    }
+
+    const exportFolder = vscode.Uri.file(userSelectedPath);
+    const originalsDir = vscode.Uri.joinPath(
+        workspaceFolder.uri,
+        ".project",
+        "attachments",
+        "originals"
+    );
+    const usfmExtensions = [".usfm", ".sfm", ".USFM", ".SFM"];
+    const usfmExtensionPattern = /\.(usfm|sfm)$/i;
+
+    return vscode.window.withProgress(
+        {
+            location: vscode.ProgressLocation.Notification,
+            title: "Exporting USFM Round-trip",
+            cancellable: false,
+        },
+        async (progress) => {
+            const increment = filesToExport.length > 0 ? 100 / filesToExport.length : 100;
+            let exportedCount = 0;
+
+            // Standalone USFM exporter, loaded on demand
+            const { exportUsfmRoundtrip } = await import(
+                "../../webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmExporter"
+            );
+
+            for (const [index, filePath] of filesToExport.entries()) {
+                progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment });
+                try {
+                    const file = vscode.Uri.file(filePath);
+                    const fileName = basename(file.fsPath);
+                    const bookCode = fileName.split(".")[0] || "";
+
+                    console.log(`[USFM Export] Processing ${fileName} using USFM round-trip exporter`);
+
+                    const codexNotebook = await readCodexNotebookFromUri(file);
+                    // Notebook metadata is importer-specific, so it is read untyped in one place
+                    const metadata = (codexNotebook.metadata ?? {}) as any;
+                    const { importerType, corpusMarker, structureMetadata } = metadata;
+
+                    // Mirror the rebuild router's USFM classification so that every file
+                    // it routes here is accepted instead of being skipped with a warning
+                    const isUsfmNotebook =
+                        corpusMarker === 'usfm' ||
+                        importerType === 'usfm-experimental' ||
+                        importerType === 'usfm' ||
+                        (typeof metadata.originalFileName === 'string' &&
+                            usfmExtensionPattern.test(metadata.originalFileName));
+                    if (!isUsfmNotebook) {
+                        console.warn(`[USFM Export] Skipping ${fileName} - not imported with USFM importer (importerType: ${importerType}, corpusMarker: ${corpusMarker})`);
+                        vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with USFM importer`);
+                        continue;
+                    }
+
+                    const finalBookCode = metadata.bookCode || bookCode;
+                    let originalFileName: string | undefined = metadata.originalFileName;
+
+                    // Fallback 1: the paired source notebook may carry the original file name
+                    if (!originalFileName) {
+                        try {
+                            const sourceFileUri = vscode.Uri.joinPath(
+                                workspaceFolder.uri,
+                                ".project",
+                                "sourceTexts",
+                                fileName.replace('.codex', '.source')
+                            );
+                            const sourceNotebook = await readCodexNotebookFromUri(sourceFileUri);
+                            originalFileName = (sourceNotebook.metadata as any)?.originalFileName;
+                            if (originalFileName) {
+                                console.log(`[USFM Export] Found originalFileName in source notebook: ${originalFileName}`);
+                            }
+                        } catch {
+                            // Best effort only: fall through to the remaining fallbacks
+                            console.log(`[USFM Export] Could not read source notebook for originalFileName`);
+                        }
+                    }
+
+                    // Fallback 2: probe the originals folder for <bookCode><ext>
+                    if (!originalFileName && finalBookCode) {
+                        for (const ext of usfmExtensions) {
+                            const candidate = `${finalBookCode}${ext}`;
+                            try {
+                                await vscode.workspace.fs.stat(vscode.Uri.joinPath(originalsDir, candidate));
+                                originalFileName = candidate;
+                                console.log(`[USFM Export] Found original file: ${candidate}`);
+                                break;
+                            } catch {
+                                // Not present with this extension; try the next one
+                            }
+                        }
+                    }
+
+                    // Fallback 3: assume the conventional <bookCode>.usfm name
+                    if (!originalFileName) {
+                        originalFileName = `${finalBookCode}.usfm`;
+                        console.log(`[USFM Export] Using fallback filename: ${originalFileName}`);
+                    }
+
+                    const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName);
+                    let originalUsfmContent: string;
+                    try {
+                        const originalFileData = await vscode.workspace.fs.readFile(originalFileUri);
+                        originalUsfmContent = new TextDecoder('utf-8').decode(originalFileData);
+                        console.log(`[USFM Export] Loaded original USFM file: ${originalFileName}`);
+                    } catch {
+                        // The original may instead be embedded in the notebook's structureMetadata
+                        if (!structureMetadata?.originalUsfmContent) {
+                            throw new Error(`Original USFM file not found at ${originalFileUri.fsPath} and no original content in metadata`);
+                        }
+                        originalUsfmContent = structureMetadata.originalUsfmContent;
+                        console.log(`[USFM Export] Using original USFM content from metadata (file not found at ${originalFileUri.fsPath})`);
+                    }
+
+                    // Cells may carry their id at top level (ProcessedCell) or in metadata.id;
+                    // forward the top-level id when it exists
+                    const codexCells = codexNotebook.cells.map((cell) => {
+                        const cellData: any = {
+                            kind: cell.kind,
+                            value: cell.value,
+                            metadata: cell.metadata,
+                        };
+                        if ((cell as any).id) {
+                            cellData.id = (cell as any).id;
+                        }
+                        return cellData;
+                    });
+
+                    // lineMappings enable the precise round-trip; without them the exporter
+                    // is called with its backward-compatible two-argument form
+                    const lineMappings = structureMetadata?.lineMappings;
+                    let updatedUsfmContent: string;
+                    if (lineMappings) {
+                        console.log(`[USFM Export] Found lineMappings: ${lineMappings.length} entries`);
+                        updatedUsfmContent = await exportUsfmRoundtrip(originalUsfmContent, lineMappings, codexCells);
+                    } else {
+                        console.warn(`[USFM Export] No lineMappings found in structureMetadata`);
+                        updatedUsfmContent = await exportUsfmRoundtrip(originalUsfmContent, codexCells);
+                    }
+
+                    // Always stamp the output name, even when the original has no USFM
+                    // extension. basename() keeps the written file inside the export folder.
+                    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
+                    const baseName = basename(originalFileName);
+                    const exportedName = usfmExtensionPattern.test(baseName)
+                        ? baseName.replace(usfmExtensionPattern, `_${timestamp}_translated.$1`)
+                        : `${baseName}_${timestamp}_translated.usfm`;
+                    const exportedUri = vscode.Uri.joinPath(exportFolder, exportedName);
+
+                    await vscode.workspace.fs.writeFile(exportedUri, new TextEncoder().encode(updatedUsfmContent));
+                    exportedCount++;
+
+                    console.log(`[USFM Export] ✓ Exported ${exportedName}`);
+                } catch (error) {
+                    console.error(`[USFM Export] Error exporting ${filePath}:`, error);
+                    vscode.window.showErrorMessage(`Failed to export ${basename(filePath)}: ${error instanceof Error ? error.message : 'Unknown error'}`);
+                }
+            }
+
+            // Only claim full success when every selected file was actually written
+            if (exportedCount === filesToExport.length) {
+                vscode.window.showInformationMessage(`USFM round-trip export completed to ${userSelectedPath}`);
+            } else {
+                vscode.window.showWarningMessage(
+                    `USFM round-trip export finished: ${exportedCount} of ${filesToExport.length} file(s) exported to ${userSelectedPath}`
+                );
+            }
+        }
+    );
+}
+
 /**
  * TMS (Translation Memory System) Round-trip export
  * Supports both TMX and XLIFF formats
@@ -1074,6 +1266,19 @@ async function exportCodexContentAsRebuild(
                 // TMS (Translation Memory System) files use the TMS exporter
                 filesByType['tms'] = filesByType['tms'] || [];
                 filesByType['tms'].push(filePath);
+            } else if (
+                corpusMarker === 'usfm' ||
+                importerType === 'usfm-experimental' ||
+                importerType === 'usfm' ||
+                // Also check for NT/OT corpus markers with USFM file extensions (Bible books imported as USFM)
+                ((corpusMarker === 'NT' || corpusMarker === 'OT') &&
+                    originalFileName &&
+                    (originalFileName.endsWith('.usfm') || originalFileName.endsWith('.sfm') || originalFileName.endsWith('.USFM') || originalFileName.endsWith('.SFM'))) ||
+                (originalFileName && (originalFileName.endsWith('.usfm') || originalFileName.endsWith('.sfm') || originalFileName.endsWith('.USFM') || originalFileName.endsWith('.SFM')))
+            ) {
+                // USFM files use the USFM round-trip exporter
+                filesByType['usfm'] = filesByType['usfm'] || [];
+                filesByType['usfm'].push(filePath);
             } else {
                 unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || 'unknown' });
             }
@@ -1193,6 +1398,22 @@ async function exportCodexContentAsRebuild(
                 }
             }
 
+            // Export USFM files
+            if (filesByType['usfm']?.length > 0) {
+                console.log(`[Rebuild Export] Exporting ${filesByType['usfm'].length} USFM file(s)...`);
+                progress.report({
+                    message: `Exporting ${filesByType['usfm'].length} USFM file(s)...`,
+                    increment: 20
+                });
+                try {
+                    await exportCodexContentAsUsfmRoundtrip(userSelectedPath, filesByType['usfm'], options);
+                    processedCount += filesByType['usfm'].length;
+                } catch (error) {
+                    console.error('[Rebuild Export] USFM export failed:', error);
+                    vscode.window.showErrorMessage(`USFM export failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
+                }
+            }
+
             progress.report({ message: "Complete", increment: 30 });
 
             // Show summary
diff --git a/src/projectManager/projectExportView.ts b/src/projectManager/projectExportView.ts
index 6c278e743..48c6b13da 100644
--- a/src/projectManager/projectExportView.ts
+++ b/src/projectManager/projectExportView.ts
@@ -334,13 +334,14 @@ function getWebviewContent(
Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS)
+Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS, USFM)
+ Importing translation for:{" "} + {selectedSource.name} +
+ )} ++ {files.length} file{files.length > 1 ? "s" : ""}{" "} + selected +
++ {Array.from(files) + .map((f) => f.name) + .slice(0, 3) + .join(", ")} + {files.length > 3 && ` and ${files.length - 3} more...`} +
+
+ {file.preview}
+ {file.preview.length >= 300 && "..."}
+
+ reference: text
+ let footnoteContentHtml = ''; + if (reference) { + footnoteContentHtml = `${reference}: ${footnoteHtml}
`; + } else { + footnoteContentHtml = `${footnoteHtml}
`; + } + + // Escape HTML for use in data attribute + const escapedFootnote = footnoteContentHtml + .replace(/"/g, '"') + .replace(/'/g, '''); + + footnotes.push({ + caller: caller || '+', + content: escapedFootnote, + position, + }); + } + + // Replace footnotes in reverse order to preserve positions + for (let i = footnotes.length - 1; i >= 0; i--) { + const footnote = footnotes[i]; + const footnoteRegex2 = /\\f\s+([+\-*]|\w+)\s*(.*?)\\f\*/s; + const footnoteMatch = processedText.substring(footnote.position).match(footnoteRegex2); + if (footnoteMatch) { + const footnoteNumber = i + 1; // Use 1-based numbering + const replacement = `${footnoteNumber}`; + processedText = processedText.substring(0, footnote.position) + + replacement + + processedText.substring(footnote.position + footnoteMatch[0].length); + } + } + + // Now process other inline markers + type StackEntry = { marker: string; closers: string[]; }; + const stack: StackEntry[] = []; + + const openFor = (marker: string): { openers: string[]; closers: string[]; } => { + switch (marker) { + case 'bd': + return { openers: [``], closers: [''] }; + case 'it': + return { openers: [``], closers: [''] }; + case 'bdit': + return { openers: ['', ``], closers: ['', ''] }; + case 'sup': + return { openers: [``], closers: [''] }; + case 'sc': + return { openers: [``], closers: [''] }; + default: + return { openers: [``], closers: [''] }; + } + }; + + let i = 0; + let out = ''; + while (i < processedText.length) { + const ch = processedText[i]; + if (ch === '\\') { + let j = i + 1; + let name = ''; + // Support plus-prefixed note-internal markers like \+xt + if (processedText[j] === '+') { + name += '+'; + j++; + } + while (j < processedText.length && isAlphaNum(processedText[j])) { + name += processedText[j]; + j++; + } + // Milestones \qt-s/\qt-e are treated as inline spans with data-tag; we ignore -s/-e in HTML + if (processedText[j] === '-' && (processedText[j + 1] === 's' || processedText[j + 1] === 'e')) { + j += 
2; + } + // Closing marker + if (processedText[j] === '*') { + let idx = stack.length - 1; + while (idx >= 0 && stack[idx].marker !== name) idx--; + if (idx >= 0) { + const entry = stack.splice(idx, 1)[0]; + for (const closer of entry.closers) out += closer; + } else { + out += ''; + } + j += 1; + i = j; + continue; + } + if (processedText[j] === ' ') j += 1; + const { openers, closers } = openFor(name); + openers.forEach(op => (out += op)); + stack.push({ marker: name, closers: [...closers] }); + i = j; + } else { + out += ch; + i++; + } + } + // Close any dangling tags + while (stack.length > 0) { + const entry = stack.pop()!; + for (const closer of entry.closers) out += closer; + } + return out; +}; + +// Convert HTML back to USFM inline markers +export const htmlInlineToUsfm = (html: string): string => { + // Check if DOMParser is available (browser context) + if (typeof DOMParser !== 'undefined') { + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(`reference: text
ortext
+ const parser = new DOMParser(); + const footnoteDoc = parser.parseFromString(unescaped, 'text/html'); + const footnotePara = footnoteDoc.body.querySelector('p'); + + if (footnotePara) { + let reference = ''; + let footnoteText = ''; + + // Check for tag (reference) + const emTag = footnotePara.querySelector('em'); + if (emTag) { + reference = emTag.textContent?.trim() || ''; + // Remove the reference from the paragraph + const textNodes = Array.from(footnotePara.childNodes) + .filter(n => { + if (n.nodeType === Node.TEXT_NODE) return true; + if (n.nodeType === Node.ELEMENT_NODE) { + const el = n as Element; + return el.tagName.toLowerCase() !== 'em'; + } + return false; + }) + .map(n => { + if (n.nodeType === Node.TEXT_NODE) return n.textContent || ''; + if (n.nodeType === Node.ELEMENT_NODE) { + return htmlInlineToUsfm((n as HTMLElement).outerHTML); + } + return ''; + }) + .join('') + .trim(); + footnoteText = textNodes.replace(/^:?\s*/, ''); + } else { + // No reference, just text + footnoteText = htmlInlineToUsfm(footnotePara.innerHTML); + } + + // Build USFM footnote: \f + \fr reference \ft text\f* + let usfmFootnote = '\\f +'; + if (reference) { + usfmFootnote += ` \\fr ${reference}`; + } + if (footnoteText) { + usfmFootnote += ` \\ft ${footnoteText}`; + } + usfmFootnote += '\\f*'; + + return usfmFootnote; + } + } + + const tag = inferMarkerFromElement(el); + const inner = Array.from(el.childNodes).map(walk).join(''); + if (tag) { + return `\\${tag} ${inner}\\${tag}*`; + } + return inner; + } + return ''; + }; + + return Array.from(container.childNodes).map(walk).join(''); + } catch (error) { + console.warn('DOMParser failed, using regex fallback:', error); + } + } + + // Fallback: Regex-based approach for Node.js context + let result = html; + let changed = true; + let iterations = 0; + const maxIterations = 20; + + while (changed && iterations < maxIterations) { + iterations++; + changed = false; + const before = result; + + // Match innermost tags with 
data-tag first + result = result.replace(/<(\w+)[^>]*data-tag="([^"]+)"[^>]*>([^<]*)<\/\1>/gi, (match, tagName, dataTag, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\${dataTag} ${innerUsfm}\\${dataTag}*` : ''; + }); + + // Handle semantic tags without data-tag + result = result.replace(/]*>([^<]*)<\/strong>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\bd ${innerUsfm}\\bd*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/b>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\bd ${innerUsfm}\\bd*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/em>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\it ${innerUsfm}\\it*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/i>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\it ${innerUsfm}\\it*` : ''; + }); + // Handle footnotes BEFORE regular sup tags + result = result.replace(/]*data-footnote="([^"]+)"[^>]*class="footnote-marker"[^>]*>(\d+)<\/sup>/gi, (match, footnoteContent, footnoteNum) => { + changed = true; + // Unescape HTML entities + const unescaped = footnoteContent + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' '); + + // Parse footnote HTML:reference: text
ortext
+ // Use regex to extract reference and text + const refMatch = unescaped.match(/([^<]+):\s*<\/em> (.*?)<\/p>/);
+ let usfmFootnote = '\\f +';
+
+ if (refMatch) {
+ const [, reference, text] = refMatch;
+ usfmFootnote += ` \\fr ${reference.trim()}`;
+ // Convert HTML in text back to USFM
+ const textUsfm = htmlInlineToUsfm(text);
+ if (textUsfm) {
+ usfmFootnote += ` \\ft ${textUsfm}`;
+ }
+ } else {
+ // No reference, just text
+ const textMatch = unescaped.match(/ (.*?)<\/p>/);
+ if (textMatch) {
+ const textUsfm = htmlInlineToUsfm(textMatch[1]);
+ if (textUsfm) {
+ usfmFootnote += ` \\ft ${textUsfm}`;
+ }
+ }
+ }
+ usfmFootnote += '\\f*';
+ return usfmFootnote;
+ });
+
+ result = result.replace(/]*>([^<]*)<\/sup>/gi, (match, content) => {
+ // Skip if this was already processed as a footnote
+ if (match.includes('data-footnote')) return match;
+ changed = true;
+ const innerUsfm = content.trim();
+ return innerUsfm ? `\\sup ${innerUsfm}\\sup*` : '';
+ });
+ result = result.replace(/]*style="[^"]*small-caps[^"]*"[^>]*>([^<]*)<\/span>/gi, (match, content) => {
+ changed = true;
+ const innerUsfm = content.trim();
+ return innerUsfm ? `\\sc ${innerUsfm}\\sc*` : '';
+ });
+
+ // Handle nested tags with data-tag (process recursively)
+ result = result.replace(/<(\w+)[^>]*data-tag="([^"]+)"[^>]*>(.*?)<\/\1>/gi, (match, tagName, dataTag, content) => {
+ if (content.includes('<')) {
+ const innerUsfm = htmlInlineToUsfm(content);
+ changed = true;
+ return innerUsfm ? `\\${dataTag} ${innerUsfm}\\${dataTag}*` : '';
+ }
+ return match;
+ });
+
+ if (result === before) {
+ changed = false;
+ }
+ }
+
+ // Clean up any remaining HTML tags
+ result = result.replace(/<[^>]+>/g, '');
+
+ return result.trim();
+};
+
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts
new file mode 100644
index 000000000..5696b9b9b
--- /dev/null
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts
@@ -0,0 +1,466 @@
+/**
+ * Standalone USFM Parser
+ * Reads all USFM content, including header tags (\id, \toc, etc.) as part of chapter 1
+ * Splits file into bible chapters
+ * Skips empty paragraphs during import
+ */
+
+import { ProcessedCell } from '../../../types/common';
+import { createProcessedCell } from '../../../utils/workflowHelpers';
+import { convertUsfmInlineMarkersToHtml } from './usfmInlineMapper';
+
+export interface ParsedUsfmDocument { // Result returned by parseUsfmFile for a single USFM file
+ bookCode: string; // Book code taken from the \id line (upper-cased by the parser)
+ bookName?: string; // Text following the book code on the \id line, when present
+ fileName: string; // Name of the parsed File (file.name)
+ cells: ProcessedCell[]; // Cells created by the parser: verses, plus other marker lines with text unless versesOnly
+ verseCount: number; // Number of verse cells created
+ paratextCount: number; // Number of non-verse cells created
+ chapters: number[]; // Distinct chapter numbers seen, sorted ascending
+ footnoteCount: number; // Currently always 0 (footnote extraction is still a TODO in the parser)
+ footnotes: any[]; // Currently always an empty array
+ // Original USFM content, preserved so the round-trip export can rebuild the file
+ originalUsfmContent: string;
+ // One entry per input line (blank and skipped lines included), used by the export
+ lineMappings: Array<{
+ lineIndex: number; // 0-based index of the line in the parser's line array
+ cellId: string; // Id of the cell this line belongs to, or '' when it maps to no cell
+ originalLine: string; // The line as read, untrimmed
+ marker: string; // Marker with its leading backslash (e.g. '\v'), or '' for blank/continuation/unparsed lines
+ hasContent: boolean; // Set by the parser; false for blank lines, empty markers and lines skipped by versesOnly
+ }>;
+}
+
+/**
+ * Parse a USFM file line by line into cells plus per-line mappings for round-trip export.
+ * - Markers before the first \c line are assigned to chapter 1
+ * - A verse (\v) absorbs following \li1-4, \q1-4, \b and plain continuation lines into one cell; markers without text create no cell
+ * - Every input line gets a lineMappings entry; throws an Error when no cell was produced
+ * @param file - The USFM file to parse
+ * @param versesOnly - If true, only verse cells are created (headers, sections, etc. are recorded in lineMappings only) - used for target imports
+ */
+export async function parseUsfmFile(
+ file: File,
+ versesOnly: boolean = false
+): Promise
+ // The blank line effect comes from \b followed by empty \li1, not from \b itself
+ htmlParts.push('
');
+ if (text.trim()) {
+ htmlParts.push(text.trim());
+ }
+ breakTagMetadataParts.push('\\b');
+ } else if (breakTag && (breakTag.startsWith('\\li') || breakTag.startsWith('\\q'))) {
+ // Regular break marker (\li1, \q1, etc.) - add single
+ htmlParts.push('
');
+ if (text.trim()) {
+ htmlParts.push(text.trim());
+ }
+ // Always include break tag in metadata, even if text is empty (for empty \li1 lines)
+ breakTagMetadataParts.push(breakTag);
+ } else if (text.trim()) {
+ // Text without specific break tag - add single
+ htmlParts.push('
');
+ htmlParts.push(text.trim());
+ }
+ }
+ }
+
+ // Check if we have any content
+ const hasContent = htmlParts.some(part => part && part !== '
' && part !== '
');
+ if (!hasContent) {
+ // Empty verse - skip
+ currentVerse = null;
+ return;
+ }
+
+ const htmlContent = htmlParts.join('').trim();
+ const cellId = `${bookCode} ${cellChapter}:${verseNumber}`;
+
+ // Store break tags in metadata (for export) - include \b tags
+ const breakTagMetadata = breakTagMetadataParts.length > 0
+ ? breakTagMetadataParts.join('|')
+ : undefined;
+
+ const cellMetadata: any = {
+ bookCode,
+ bookName,
+ fileName: file.name,
+ chapter: cellChapter,
+ marker: '\\v',
+ originalLine: lines[startLineIndex]?.trim() || '',
+ originalText: verseText.join(' ').trim(), // Store original text for reference
+ lineIndex: startLineIndex,
+ verse: verseNumber,
+ cellLabel: `${bookCode} ${cellChapter}:${verseNumber}`,
+ breakTag: breakTagMetadata, // Store original break tags for export (including \b)
+ };
+
+ // Convert USFM inline markers to HTML (but keep
tags as-is)
+ const finalHtmlContent = convertUsfmInlineMarkersToHtml(htmlContent);
+
+ // Create cell
+ const cell = createProcessedCell(cellId, finalHtmlContent, {
+ type: 'text',
+ id: cellId,
+ ...cellMetadata,
+ } as any);
+
+ cells.push(cell);
+ verseCount++;
+ currentVerse = null;
+ }
+
+ // Parse each line
+ for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
+ const line = lines[lineIndex];
+ const trimmedLine = line.trim();
+
+ // Keep empty lines in mappings but don't create cells for them
+ if (!trimmedLine) {
+ // If we're building a verse, add empty line as break
+ if (currentVerse) {
+ currentVerse.verseText.push('');
+ currentVerse.breakTags.push(''); // Empty line break
+ }
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: '',
+ hasContent: false,
+ });
+ continue;
+ }
+
+ // Process lines that start with \
+ if (trimmedLine.startsWith('\\')) {
+ // Extract marker and text
+ // Match: \marker text or \marker (without text)
+ const markerMatch = trimmedLine.match(/^\\([a-zA-Z]+\d*(?:-[se])?)\s*(.*)$/);
+
+ if (markerMatch) {
+ const [, marker, text] = markerMatch;
+ const textContent = text.trim();
+
+ // Extract book code from \id marker - ALWAYS process this even if versesOnly
+ if (marker === 'id' && textContent) {
+ // Try multiple patterns to extract book code
+ // Pattern 1: "MAT" or "MAT - Book Name" or "MAT Book Name"
+ const idMatch = textContent.match(/^([A-Z0-9]{2,4})\b/);
+ if (idMatch) {
+ bookCode = idMatch[1].toUpperCase();
+ bookCodeExtracted = true;
+ console.log(`[USFM Parser] Extracted book code: ${bookCode} from line: ${trimmedLine}`);
+ }
+ // Extract book name (everything after book code and optional dash)
+ const nameMatch = textContent.match(/^[A-Z0-9]{2,4}\s*-\s*(.+)$/);
+ if (nameMatch) {
+ bookName = nameMatch[1].trim();
+ } else {
+ // If no dash, try to extract name after book code
+ const nameMatch2 = textContent.match(/^[A-Z0-9]{2,4}\s+(.+)$/);
+ if (nameMatch2) {
+ bookName = nameMatch2[1].trim();
+ }
+ }
+ // If versesOnly, store in mappings but don't create a cell
+ if (versesOnly) {
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: false,
+ });
+ continue;
+ }
+ }
+
+ // Track chapters - but headers before first chapter stay in chapter 1. The \c line is not consumed here; it continues through the handling below
+ if (marker === 'c' && textContent) {
+ const chapterNum = parseInt(textContent, 10);
+ if (!isNaN(chapterNum)) {
+ currentChapter = chapterNum;
+ chapters.add(chapterNum);
+ seenFirstChapter = true;
+ }
+ // Finish current verse if any (chapter change)
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+ }
+
+ // Determine cell type and metadata
+ // Headers before first chapter marker are assigned to chapter 1
+ const cellChapter = seenFirstChapter ? currentChapter : 1;
+
+ // Handle verse markers specially - collect multi-line verses. NOTE(review): startsWith('v') also treats any other marker beginning with "v" (e.g. \va, \vp at line start) as a verse start - confirm this is intended
+ if (marker === 'v' || marker.startsWith('v')) {
+ // Finish previous verse if any
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+
+ // Extract verse number
+ const verseMatch = textContent.match(/^(\d+[a-z]?)\s*(.*)$/);
+ if (verseMatch) {
+ const [, verseNum, verseText] = verseMatch;
+ const verseNumber = /^\d+$/.test(verseNum) ? parseInt(verseNum, 10) : verseNum;
+
+ // Start new verse
+ currentVerse = {
+ verseNumber,
+ verseText: verseText ? [verseText] : [],
+ breakTags: [''],
+ startLineIndex: lineIndex,
+ chapter: cellChapter,
+ };
+ // Store verse marker line in mappings
+ lineMappings.push({
+ lineIndex,
+ cellId: `${bookCode} ${cellChapter}:${verseNumber}`,
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: true,
+ });
+ } else {
+ // Verse marker without number - shouldn't happen but handle it
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: true,
+ });
+ }
+ continue;
+ }
+
+ // Handle break markers that continue a verse (li1, q1, q2, etc.)
+ const breakMarkers = ['li1', 'li2', 'li3', 'li4', 'q1', 'q2', 'q3', 'q4'];
+ if (currentVerse && breakMarkers.includes(marker)) {
+ // Add text to current verse with break tag
+ currentVerse.verseText.push(textContent);
+ currentVerse.breakTags.push(`\\${marker}`);
+ // Store break line in mappings (linked to verse)
+ lineMappings.push({
+ lineIndex,
+ cellId: `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}`,
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: true,
+ });
+ continue;
+ }
+
+ // Handle \b (blank line) marker within a verse - treat as double break
+ if (currentVerse && marker === 'b') {
+ // \b creates a blank line - add empty text with special break tag
+ currentVerse.verseText.push(''); // Empty text for the blank line
+ currentVerse.breakTags.push('\\b'); // Store \b marker
+ // Store \b line in mappings (linked to verse)
+ lineMappings.push({
+ lineIndex,
+ cellId: `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}`,
+ originalLine: line,
+ marker: '\\b',
+ hasContent: false, // \b itself has no content, it's just a blank line marker
+ });
+ continue;
+ }
+
+ // SKIP empty markers (like \p, \q1, etc. without text)
+ // Store them in mappings but don't create cells
+ if (!textContent) {
+ // Finish current verse if any (empty marker ends verse)
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: false,
+ });
+ continue;
+ }
+
+ // If versesOnly is true, skip non-verse markers (headers, sections, etc.)
+ if (versesOnly) {
+ // Finish current verse if any
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+ // Store in mappings but don't create a cell
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: false,
+ });
+ continue;
+ }
+
+ // Finish current verse if any (non-verse marker ends verse)
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+
+ // All other markers (headers, sections, paragraphs with text, etc.)
+ const cellMetadata: any = {
+ bookCode,
+ bookName,
+ fileName: file.name,
+ chapter: cellChapter,
+ marker: `\\${marker}`, // Store the full marker (e.g., \id, \s1, \v)
+ originalLine: trimmedLine, // Store the full original line for matching
+ originalText: textContent, // Store just the text part
+ lineIndex, // Store line index for export
+ };
+
+ // Use marker name and index for unique ID
+ const cellId = `${bookCode} ${cellChapter}:${marker}:${lineIndex}`;
+ cellMetadata.originalText = textContent; // same value as already set in the literal above
+ paratextCount++;
+
+ // Convert text content to HTML for display
+ const htmlContent = convertUsfmInlineMarkersToHtml(textContent);
+
+ // Create cell
+ // Ensure id is in metadata for VS Code notebook compatibility
+ const cell = createProcessedCell(cellId, htmlContent, {
+ type: 'text',
+ id: cellId, // Store id in metadata for VS Code notebook compatibility
+ ...cellMetadata,
+ } as any);
+
+ cells.push(cell);
+
+ // Store mapping for export
+ lineMappings.push({
+ lineIndex,
+ cellId,
+ originalLine: line,
+ marker: `\\${marker}`,
+ hasContent: true,
+ });
+ } else {
+ // Line starts with \ but doesn't match pattern - store in mappings
+ // Finish current verse if any
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+ lineMappings.push({
+ lineIndex,
+ cellId: '',
+ originalLine: line,
+ marker: '',
+ hasContent: false,
+ });
+ }
+ } else {
+ // Line doesn't start with \ - continuation line
+ // If we're building a verse, add as continuation
+ if (currentVerse) {
+ currentVerse.verseText.push(trimmedLine);
+ currentVerse.breakTags.push(''); // Continuation line (no break tag)
+ }
+ // Store in mappings but don't create a cell (continuation lines are part of previous cell)
+ lineMappings.push({
+ lineIndex,
+ cellId: currentVerse ? `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}` : '',
+ originalLine: line,
+ marker: '',
+ hasContent: currentVerse ? true : false,
+ });
+ }
+ }
+
+ // Finish any remaining verse
+ if (currentVerse) {
+ finishCurrentVerse();
+ }
+
+ // Ensure chapter 1 is in the chapters set if we have headers
+ if (!seenFirstChapter && cells.length > 0) {
+ chapters.add(1);
+ }
+
+ if (cells.length === 0) {
+ throw new Error(`No content found in USFM file: ${file.name}`);
+ }
+
+ // Warn if book code wasn't extracted
+ if (!bookCodeExtracted && bookCode === 'XXX') {
+ console.warn(`[USFM Parser] Book code not extracted from file ${file.name}, using default 'XXX'`);
+ }
+
+ return {
+ bookCode,
+ bookName,
+ fileName: file.name,
+ cells,
+ verseCount,
+ paratextCount,
+ chapters: Array.from(chapters).sort((a, b) => a - b),
+ footnoteCount: 0, // TODO: Extract footnotes if needed
+ footnotes: [],
+ originalUsfmContent: originalContent,
+ lineMappings,
+ };
+}