diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts index 73c7bc8c2..7bedf94ad 100644 --- a/src/exportHandler/exportHandler.ts +++ b/src/exportHandler/exportHandler.ts @@ -853,6 +853,206 @@ async function exportCodexContentAsObsRoundtrip( ); } +/** + * USFM (Unified Standard Format Marker) Round-trip export + * Rebuilds original USFM file with translated content + */ +async function exportCodexContentAsUsfmRoundtrip( + userSelectedPath: string, + filesToExport: string[], + _options?: ExportOptions +) { + const workspaceFolders = vscode.workspace.workspaceFolders; + if (!workspaceFolders) { + vscode.window.showErrorMessage("No workspace folder found."); + return; + } + + const exportFolder = vscode.Uri.file(userSelectedPath); + + return vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: "Exporting USFM Round-trip", + cancellable: false, + }, + async (progress) => { + const increment = filesToExport.length > 0 ? 100 / filesToExport.length : 100; + + // Import USFM exporter from experimental (now standalone implementation) + const experimentalExporter = await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmExporter"); + const exportUsfmRoundtrip = experimentalExporter.exportUsfmRoundtrip; + + // For each selected codex file, reconstruct the USFM with translations + for (const [index, filePath] of filesToExport.entries()) { + progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment }); + try { + const file = vscode.Uri.file(filePath); + const fileName = basename(file.fsPath); + const bookCode = fileName.split(".")[0] || ""; + + console.log(`[USFM Export] Processing ${fileName} using USFM round-trip exporter`); + + // Read codex notebook + const codexNotebook = await readCodexNotebookFromUri(file); + + // Check if this is a USFM file (experimental or standalone) + const importerType = (codexNotebook.metadata as 
any)?.importerType; + const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; + + if (importerType !== 'usfm-experimental' && corpusMarker !== 'usfm') { + console.warn(`[USFM Export] Skipping ${fileName} - not imported with USFM importer (importerType: ${importerType}, corpusMarker: ${corpusMarker})`); + vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with USFM importer`); + continue; + } + + // Get original file name from metadata with fallback + // Try multiple sources: codex metadata, source notebook metadata, or construct from bookCode + let metadataOriginalFileName = (codexNotebook.metadata as any)?.originalFileName; + const metadataBookCode = (codexNotebook.metadata as any)?.bookCode; + const finalBookCode = metadataBookCode || bookCode; + + // If not found in codex metadata, try source notebook metadata + if (!metadataOriginalFileName) { + try { + const sourceFileName = fileName.replace('.codex', '.source'); + const sourceFileUri = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "sourceTexts", + sourceFileName + ); + const sourceNotebook = await readCodexNotebookFromUri(sourceFileUri); + metadataOriginalFileName = (sourceNotebook.metadata as any)?.originalFileName; + if (metadataOriginalFileName) { + console.log(`[USFM Export] Found originalFileName in source notebook: ${metadataOriginalFileName}`); + } + } catch (error) { + // Source notebook not found or error reading it, continue with fallbacks + console.log(`[USFM Export] Could not read source notebook for originalFileName`); + } + } + + // Try common USFM file extensions + const possibleExtensions = ['.usfm', '.sfm', '.USFM', '.SFM']; + let originalFileName = metadataOriginalFileName; + + // If no originalFileName, try to find it in originals folder + if (!originalFileName && finalBookCode) { + const originalsDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "originals" + ); + + // Try each extension + for (const ext 
of possibleExtensions) { + const testFileName = `${finalBookCode}${ext}`; + const testUri = vscode.Uri.joinPath(originalsDir, testFileName); + try { + await vscode.workspace.fs.stat(testUri); + originalFileName = testFileName; + console.log(`[USFM Export] Found original file: ${testFileName}`); + break; + } catch { + // File doesn't exist, try next extension + } + } + } + + // Final fallback: construct filename from bookCode + if (!originalFileName) { + originalFileName = `${finalBookCode}.usfm`; + console.log(`[USFM Export] Using fallback filename: ${originalFileName}`); + } + + // Load original USFM file from attachments/originals + const originalsDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "originals" + ); + const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + + let originalUsfmContent: string; + try { + const originalFileData = await vscode.workspace.fs.readFile(originalFileUri); + originalUsfmContent = new TextDecoder('utf-8').decode(originalFileData); + console.log(`[USFM Export] Loaded original USFM file: ${originalFileName}`); + } catch (error) { + // Fallback: try to get from structureMetadata if available + const structureMetadata = (codexNotebook.metadata as any)?.structureMetadata; + if (structureMetadata?.originalUsfmContent) { + originalUsfmContent = structureMetadata.originalUsfmContent; + console.log(`[USFM Export] Using original USFM content from metadata (file not found at ${originalFileUri.fsPath})`); + } else { + throw new Error(`Original USFM file not found at ${originalFileUri.fsPath} and no original content in metadata`); + } + } + + // Build codex cells array + // Include id property if it exists (some cells have id at top level, others in metadata.id) + const codexCells = codexNotebook.cells.map(cell => { + const cellData: any = { + kind: cell.kind, + value: cell.value, + metadata: cell.metadata, + }; + // Include id if it exists at top level (for ProcessedCell structure) 
+ if ((cell as any).id) { + cellData.id = (cell as any).id; + } + return cellData; + }); + + // Get lineMappings from structureMetadata if available (for standalone exporter) + const structureMetadata = (codexNotebook.metadata as any)?.structureMetadata; + const lineMappings = structureMetadata?.lineMappings; + + // Debug: Log structureMetadata and lineMappings + if (lineMappings) { + console.log(`[USFM Export] Found lineMappings: ${lineMappings.length} entries`); + const sampleMapping = lineMappings.find((m: any) => m.cellId); + console.log(`[USFM Export] Sample mapping with cellId:`, sampleMapping); + console.log(`[USFM Export] Mappings with cellId count:`, lineMappings.filter((m: any) => m.cellId && m.cellId !== '').length); + } else { + console.warn(`[USFM Export] No lineMappings found in structureMetadata`); + console.log(`[USFM Export] structureMetadata keys:`, structureMetadata ? Object.keys(structureMetadata) : 'null'); + } + + // Export USFM with translations + // If we have lineMappings, use them for precise round-trip export + let updatedUsfmContent: string; + if (lineMappings) { + updatedUsfmContent = await exportUsfmRoundtrip(originalUsfmContent, lineMappings, codexCells); + } else { + // Use backward-compatible signature (no lineMappings - fallback mode) + updatedUsfmContent = await exportUsfmRoundtrip(originalUsfmContent, codexCells); + } + + // Save to export folder + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const exportedName = originalFileName.replace(/\.(usfm|sfm|USFM|SFM)$/i, `_${timestamp}_translated.$1`); + const exportedUri = vscode.Uri.joinPath(exportFolder, exportedName); + + const encoder = new TextEncoder(); + await vscode.workspace.fs.writeFile(exportedUri, encoder.encode(updatedUsfmContent)); + + console.log(`[USFM Export] ✓ Exported ${exportedName}`); + + } catch (error) { + console.error(`[USFM Export] Error exporting ${filePath}:`, error); + vscode.window.showErrorMessage(`Failed to export 
${basename(filePath)}: ${error instanceof Error ? error.message : 'Unknown error'}`); + } + } + + vscode.window.showInformationMessage(`USFM round-trip export completed to ${userSelectedPath}`); + } + ); +} + /** * TMS (Translation Memory System) Round-trip export * Supports both TMX and XLIFF formats @@ -1074,6 +1274,19 @@ async function exportCodexContentAsRebuild( // TMS (Translation Memory System) files use the TMS exporter filesByType['tms'] = filesByType['tms'] || []; filesByType['tms'].push(filePath); + } else if ( + corpusMarker === 'usfm' || + importerType === 'usfm-experimental' || + importerType === 'usfm' || + // Also check for NT/OT corpus markers with USFM file extensions (Bible books imported as USFM) + ((corpusMarker === 'NT' || corpusMarker === 'OT') && + originalFileName && + (originalFileName.endsWith('.usfm') || originalFileName.endsWith('.sfm') || originalFileName.endsWith('.USFM') || originalFileName.endsWith('.SFM'))) || + (originalFileName && (originalFileName.endsWith('.usfm') || originalFileName.endsWith('.sfm') || originalFileName.endsWith('.USFM') || originalFileName.endsWith('.SFM'))) + ) { + // USFM files use the USFM round-trip exporter + filesByType['usfm'] = filesByType['usfm'] || []; + filesByType['usfm'].push(filePath); } else { unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || 'unknown' }); } @@ -1193,6 +1406,22 @@ async function exportCodexContentAsRebuild( } } + // Export USFM files + if (filesByType['usfm']?.length > 0) { + console.log(`[Rebuild Export] Exporting ${filesByType['usfm'].length} USFM file(s)...`); + progress.report({ + message: `Exporting ${filesByType['usfm'].length} USFM file(s)...`, + increment: 20 + }); + try { + await exportCodexContentAsUsfmRoundtrip(userSelectedPath, filesByType['usfm'], options); + processedCount += filesByType['usfm'].length; + } catch (error) { + console.error('[Rebuild Export] USFM export failed:', error); + vscode.window.showErrorMessage(`USFM 
export failed: ${error instanceof Error ? error.message : 'Unknown error'}`); + } + } + progress.report({ message: "Complete", increment: 30 }); // Show summary diff --git a/src/projectManager/projectExportView.ts b/src/projectManager/projectExportView.ts index 6c278e743..48c6b13da 100644 --- a/src/projectManager/projectExportView.ts +++ b/src/projectManager/projectExportView.ts @@ -334,13 +334,14 @@ function getWebviewContent(
Rebuild Export -

Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS)

+

Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS, USFM)

DOCX IDML Biblica OBS TMS + USFM
diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index 89bf98b51..5af432a53 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -5,7 +5,7 @@ import { promisify } from "util"; import { exec } from "child_process"; import { getWebviewHtml } from "../../utils/webviewTemplate"; import { createNoteBookPair } from "./codexFIleCreateUtils"; -import { WriteNotebooksMessage, WriteTranslationMessage, OverwriteResponseMessage, WriteNotebooksWithAttachmentsMessage, SelectAudioFileMessage, ReprocessAudioFileMessage, RequestAudioSegmentMessage, FinalizeAudioImportMessage, UpdateAudioSegmentsMessage } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/plugin"; +import { WriteNotebooksMessage, WriteTranslationMessage, OverwriteResponseMessage, WriteNotebooksWithAttachmentsMessage, SelectAudioFileMessage, ReprocessAudioFileMessage, RequestAudioSegmentMessage, FinalizeAudioImportMessage, UpdateAudioSegmentsMessage, SaveFileMessage } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/plugin"; import { handleSelectAudioFile, handleReprocessAudioFile, @@ -317,8 +317,12 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide message as FinalizeAudioImportMessage, token, webviewPanel, - (msg, tok, pan) => this.handleWriteNotebooks(msg as WriteNotebooksMessage, tok, pan) + async (msg, tok, pan) => { + await this.handleWriteNotebooks(msg as WriteNotebooksMessage, tok, pan); + } ); + } else if (message.command === "saveFile") { + await this.handleSaveFile(message as SaveFileMessage, webviewPanel); } } catch (error) { console.error("Error handling message:", error); @@ -1328,6 +1332,76 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } } + /** + * Handle saveFile command from webview - saves a file using VS Code's save 
dialog + */ + private async handleSaveFile(message: SaveFileMessage, webviewPanel: vscode.WebviewPanel): Promise { + try { + const { fileName, dataBase64, mime } = message; + + // Extract base64 data (handle data: URL format) + let base64Data = dataBase64; + if (base64Data.includes(',')) { + // Remove data: URL prefix if present + base64Data = base64Data.split(',')[1]; + } + + // Convert base64 to Buffer + const buffer = Buffer.from(base64Data, 'base64'); + + if (buffer.length === 0) { + throw new Error('File data is empty'); + } + + // Show save dialog + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; + const defaultUri = workspaceFolder + ? vscode.Uri.joinPath(workspaceFolder.uri, fileName) + : undefined; + + const saveUri = await vscode.window.showSaveDialog({ + defaultUri, + saveLabel: 'Save', + filters: mime + ? { + 'All Files': ['*'], + [mime]: [fileName.split('.').pop() || '*'] + } + : undefined + }); + + if (!saveUri) { + // User cancelled + webviewPanel.webview.postMessage({ + command: "notification", + type: "info", + message: "File save cancelled" + }); + return; + } + + // Write file + await vscode.workspace.fs.writeFile(saveUri, buffer); + + // Send success notification + webviewPanel.webview.postMessage({ + command: "notification", + type: "success", + message: `File saved successfully: ${path.basename(saveUri.fsPath)}` + }); + + console.log(`[NEW SOURCE UPLOADER] File saved: ${saveUri.fsPath} (${buffer.length} bytes)`); + + } catch (error) { + console.error("[NEW SOURCE UPLOADER] Error saving file:", error); + webviewPanel.webview.postMessage({ + command: "notification", + type: "error", + message: error instanceof Error ? 
error.message : "Failed to save file" + }); + } + } + private checkIfBibleContent(document: vscode.NotebookDocument): boolean { // Check first few cells to see if they contain Bible verse references const cellsToCheck = Math.min(5, document.cellCount); diff --git a/src/test/suite/integration/project-healing.test.ts b/src/test/suite/integration/project-healing.test.ts index f9a286719..72f03f66f 100644 --- a/src/test/suite/integration/project-healing.test.ts +++ b/src/test/suite/integration/project-healing.test.ts @@ -360,7 +360,9 @@ suite("Integration: Project healing", () => { fs.rmSync(tempBackupDir, { recursive: true, force: true }); }); - test("Healing preserves .project directory structure via merge", async () => { + test("Healing preserves .project directory structure via merge", async function () { + this.timeout(10000); // Increase timeout for file operations + // Add additional files to .project const attachmentsDir = path.join(tempDir, ".project", "attachments"); fs.mkdirSync(attachmentsDir, { recursive: true }); diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx index b9723644d..aec7147b0 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx @@ -16,7 +16,8 @@ import { // import { docxImporterPlugin } from "./docx/index.tsx"; // Old mammoth.js importer import { docxRoundtripImporterPlugin as docxImporterPlugin } from "./docx/experiment/index.tsx"; // New round-trip importer import { markdownImporterPlugin } from "./markdown/index.tsx"; -import { usfmImporterPlugin } from "./usfm/index.tsx"; +import { usfmImporterPlugin } from "./usfm/index.tsx"; // Original USFM importer +import { usfmExperimentalImporterPlugin } from "./usfm/experimental/index.tsx"; // Experimental round-trip importer (standalone with headers in chapter 1) import { 
ebibleDownloadImporterPlugin } from "./ebibleCorpus/index.tsx"; import { maculaBibleImporterPlugin } from "./maculaBible/index.tsx"; import { subtitlesImporterPlugin } from "./subtitles/index.tsx"; @@ -26,6 +27,7 @@ import { paratextImporterPlugin } from "./paratext/index.tsx"; import { spreadsheetImporterPlugin } from "./spreadsheet/index.tsx"; import { audioImporterPlugin } from "./audio/index.tsx"; import { biblicaImporterPlugin } from "./biblica/index.tsx"; +import { biblicaSwapperImporterPlugin } from "./biblica-swapper/index.tsx"; import { tmsImporterPlugin } from "./tms/index.tsx"; // import { rtfImporterPlugin } from "./rtf/index.tsx"; import { pdfImporterPlugin } from "./pdf/index.tsx"; @@ -118,6 +120,12 @@ export const importerPlugins: ImporterPlugin[] = [ description: "Unified Standard Format Marker files", tags: [...(usfmImporterPlugin.tags || []), "Specialized", "Bible", "USFM"], }, + { + ...usfmExperimentalImporterPlugin, + name: "USFM Experimental", + description: "USFM files with round-trip export support (headers in chapter 1, verse-only target imports)", + tags: [...(usfmExperimentalImporterPlugin.tags || []), "Specialized", "Bible", "USFM", "Experimental", "Round-trip"], + }, { ...paratextImporterPlugin, name: "Paratext Projects", @@ -147,6 +155,12 @@ export const importerPlugins: ImporterPlugin[] = [ description: "Biblica IDML importer with Study Bible + Translated Bible support", tags: [...(biblicaImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], }, + { + ...biblicaSwapperImporterPlugin, + name: "Biblica Bible Swapper", + description: "Swap Bible text between two IDML files while preserving notes", + tags: [...(biblicaSwapperImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], + }, { ...obsImporterPlugin, name: "Bible Stories", diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/UsfmImporterForm.tsx 
b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/UsfmImporterForm.tsx new file mode 100644 index 000000000..b0269caa8 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/UsfmImporterForm.tsx @@ -0,0 +1,409 @@ +import React, { useState, useCallback } from "react"; +import { + ImporterComponentProps, + AlignedCell, + CellAligner, + ImportedContent, + sequentialCellAligner, +} from "../../../types/plugin"; +import { usfmCellAligner } from "./usfmCellAligner"; +import { NotebookPair, ImportProgress } from "../../../types/common"; +import { Button } from "../../../../components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardHeader, + CardTitle, +} from "../../../../components/ui/card"; +import { Progress } from "../../../../components/ui/progress"; +import { Alert, AlertDescription } from "../../../../components/ui/alert"; +import { Upload, FileText, CheckCircle, XCircle, ArrowLeft, Eye, Hash } from "lucide-react"; +import { Badge } from "../../../../components/ui/badge"; +import { usfmExperimentalImporter } from "./index"; +import { handleImportCompletion, notebookToImportedContent } from "../../common/translationHelper"; +import { AlignmentPreview } from "../../../components/AlignmentPreview"; + +// Use the experimental parser functions +const { validateFile, parseFile } = usfmExperimentalImporter; + +export const UsfmImporterForm: React.FC = (props) => { + const { onCancel, onTranslationComplete, alignContent, wizardContext } = props; + const [files, setFiles] = useState(null); + const [isDirty, setIsDirty] = useState(false); + const [isProcessing, setIsProcessing] = useState(false); + const [isAligning, setIsAligning] = useState(false); + const [isRetrying, setIsRetrying] = useState(false); + const [progress, setProgress] = useState([]); + const [error, setError] = useState(null); + const [results, setResults] = useState([]); + const [alignedCells, setAlignedCells] = 
useState(null); + const [importedContent, setImportedContent] = useState([]); + const [targetCells, setTargetCells] = useState([]); + const [previewFiles, setPreviewFiles] = useState>([]); + + const isTranslationImport = wizardContext?.intent === "target"; + const selectedSource = wizardContext?.selectedSource; + + const handleFileSelect = useCallback(async (e: React.ChangeEvent) => { + const selectedFiles = e.target.files; + if (selectedFiles && selectedFiles.length > 0) { + setFiles(selectedFiles); + setIsDirty(true); + setError(null); + setProgress([]); + setResults([]); + + // Show preview of first few files + const previews: Array<{ name: string; preview: string }> = []; + for (let i = 0; i < Math.min(3, selectedFiles.length); i++) { + const file = selectedFiles[i]; + try { + const text = await file.text(); + previews.push({ + name: file.name, + preview: text.substring(0, 300), + }); + } catch (err) { + console.warn("Could not preview file:", file.name, err); + } + } + setPreviewFiles(previews); + } + }, []); + + const handleImport = async () => { + if (!files || files.length === 0) return; + + setIsProcessing(true); + setError(null); + setProgress([]); + setResults([]); + setAlignedCells(null); + + try { + const notebookPairs: NotebookPair[] = []; + + // Progress callback + const onProgress = (progress: ImportProgress) => { + setProgress((prev) => [ + ...prev.filter((p) => p.stage !== progress.stage), + progress, + ]); + }; + + // Process each file + for (let i = 0; i < files.length; i++) { + const file = files[i]; + + onProgress({ + stage: "Processing", + message: `Processing ${file.name} (${i + 1}/${files.length})...`, + progress: (i / files.length) * 60, + }); + + // Validate file + const validation = await validateFile(file); + if (!validation.isValid) { + console.warn(`Skipping invalid file ${file.name}:`, validation.errors); + continue; + } + + // Parse file + // For target imports, only parse verses (skip headers, sections, etc.) 
+ const importResult = await parseFile(file, onProgress, isTranslationImport); + + if (importResult.success && importResult.notebookPair) { + notebookPairs.push(importResult.notebookPair); + } else { + console.warn(`Failed to parse ${file.name}:`, importResult.error); + } + } + + if (notebookPairs.length === 0) { + throw new Error("No valid USFM files could be processed"); + } + + setResults(notebookPairs); + + // For translation imports, perform alignment + if (isTranslationImport && alignContent && selectedSource) { + onProgress({ + stage: "Alignment", + message: "Aligning USFM content with target cells...", + progress: 80, + }); + + setIsAligning(true); + + try { + // For multi-file imports, we'll use the first file for now + const primaryNotebook = notebookPairs[0]; + const importedContent = notebookToImportedContent(primaryNotebook); + setImportedContent(importedContent); + + // Use USFM-specific cell aligner that matches verses by verse number and chapter + // This ensures verses end up in the correct cells even if the target file is shorter + const aligned = await alignContent( + importedContent, + selectedSource.path, + usfmCellAligner + ); + + setAlignedCells(aligned); + setIsAligning(false); + + onProgress({ + stage: "Complete", + message: "Alignment complete - review and confirm", + progress: 100, + }); + } catch (err) { + setIsAligning(false); + throw new Error( + `Alignment failed: ${err instanceof Error ? err.message : "Unknown error"}` + ); + } + } else { + // For source imports, complete normally + onProgress({ + stage: "Complete", + message: `Successfully processed ${notebookPairs.length} file(s)`, + progress: 100, + }); + + setIsDirty(false); + + setTimeout(async () => { + try { + // For multi-file imports, pass all notebook pairs for batch import + const notebooks = + notebookPairs.length === 1 ? notebookPairs[0] : notebookPairs; + await handleImportCompletion(notebooks, props); + } catch (err) { + setError(err instanceof Error ? 
err.message : "Failed to complete import"); + } + }, 2000); + } + } catch (err) { + setError(err instanceof Error ? err.message : "Unknown error occurred"); + } finally { + setIsProcessing(false); + } + }; + + const handleConfirmAlignment = () => { + if (!alignedCells || !selectedSource || !onTranslationComplete) return; + onTranslationComplete(alignedCells, selectedSource.path); + }; + + const handleRetryAlignment = async (aligner: CellAligner) => { + if (!alignContent || !selectedSource || !importedContent) return; + + setIsRetrying(true); + setError(null); + + try { + const aligned = await alignContent(importedContent, selectedSource.path, aligner); + setAlignedCells(aligned); + } catch (err) { + setError(err instanceof Error ? err.message : "Alignment retry failed"); + } finally { + setIsRetrying(false); + } + }; + + const handleCancel = () => { + if (isDirty && !window.confirm("Cancel import? Any unsaved changes will be lost.")) { + return; + } + onCancel(); + }; + + const totalProgress = + progress.length > 0 + ? Math.round(progress.reduce((sum, p) => sum + (p.progress || 0), 0) / progress.length) + : 0; + + const totalCells = results.reduce((sum, pair) => sum + pair.source.cells.length, 0); + + // Render alignment preview for translation imports + if (alignedCells && isTranslationImport) { + return ( + + ); + } + + return ( +
+
+
+

+ + Import USFM Files {isTranslationImport && "(Translation)"} +

+ {isTranslationImport && selectedSource && ( +

+ Importing translation for:{" "} + {selectedSource.name} +

+ )} +
+ +
+ + + + Select USFM Files + + {isTranslationImport + ? "Import USFM translation files that will be aligned with existing cells. Content will be matched by verse references or inserted sequentially." + : "Import Unified Standard Format Marker (USFM) files used for biblical texts. Supports round-trip export with structure preservation."} + + + +
+ .usfm + .sfm + .SFM + .USFM + Round-trip +
+ +
+ + +
+ + {files && files.length > 0 && ( +
+
+
+ +
+

+ {files.length} file{files.length > 1 ? "s" : ""}{" "} + selected +

+

+ {Array.from(files) + .map((f) => f.name) + .slice(0, 3) + .join(", ")} + {files.length > 3 && ` and ${files.length - 3} more...`} +

+
+
+ +
+ + {previewFiles.length > 0 && ( + + + + + File Preview{" "} + {previewFiles.length < files.length && + `(showing first ${previewFiles.length})`} + + + + {previewFiles.map((file, index) => ( +
+

{file.name}

+
+                                                    {file.preview}
+                                                    {file.preview.length >= 300 && "..."}
+                                                
+
+ ))} +
+
+ )} +
+ )} + + {progress.length > 0 && ( +
+ + {progress.map((item, index) => ( +
+ {item.stage}: {item.message} +
+ ))} +
+ )} + + {error && ( + + + {error} + + )} + + {results.length > 0 && ( + + + + Successfully imported {results.length} book + {results.length > 1 ? "s" : ""} + with {totalCells} total cells! + + + )} +
+
+
+ ); +}; + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts new file mode 100644 index 000000000..91dba0a79 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts @@ -0,0 +1,193 @@ +/** + * Experimental USFM Importer with round-trip export support + * Preserves original file structure and saves to attachments/originals + * Standalone implementation - doesn't rely on common/usfmUtils.ts + */ + +import { + ImporterPlugin, + FileValidationResult, + ImportResult, + ProgressCallback, +} from '../../../types/common'; +import { + createProgress, + validateFileExtension, +} from '../../../utils/workflowHelpers'; +import { parseUsfmFile } from './usfmParser'; +import { ProcessedNotebook, NotebookPair } from '../../../types/common'; +import { getCorpusMarkerForBook } from '../../../utils/corpusUtils'; + +const SUPPORTED_EXTENSIONS = ['usfm', 'sfm', 'SFM', 'USFM']; +const SUPPORTED_MIME_TYPES = ['text/plain', 'application/octet-stream']; + +/** + * Validates a USFM file + */ +export const validateFile = async (file: File): Promise => { + const errors: string[] = []; + const warnings: string[] = []; + + // Check file extension + if (!validateFileExtension(file.name, SUPPORTED_EXTENSIONS)) { + errors.push('File must have .usfm, .sfm, .SFM, or .USFM extension'); + } + + // Check file size (warn if > 10MB) + if (file.size > 10 * 1024 * 1024) { + warnings.push('Large USFM files may take longer to process'); + } + + // Basic content validation + try { + const content = await file.text(); + // Basic USFM validation + if (!content.includes('\\')) { + errors.push('File does not appear to contain USFM markers'); + } + if (!content.match(/\\id\s+/i)) { + warnings.push('No \\id marker found - some USFM files may not include this'); + } + } catch (error) { + errors.push('Could not read file content'); + } + + 
return { + isValid: errors.length === 0, + fileType: 'usfm', + errors, + warnings, + metadata: { + fileSize: file.size, + lastModified: new Date(file.lastModified).toISOString(), + }, + }; +}; + +/** + * Parses a USFM file into notebook cells with round-trip support + * @param file - The USFM file to parse + * @param onProgress - Optional progress callback + * @param versesOnly - If true, only parse verses (skip headers, sections, etc.) - used for target imports + */ +export const parseFile = async ( + file: File, + onProgress?: ProgressCallback, + versesOnly: boolean = false +): Promise => { + try { + onProgress?.(createProgress('Reading File', 'Reading USFM file...', 10)); + + // Read original file as ArrayBuffer for saving to attachments + const arrayBuffer = await file.arrayBuffer(); + + onProgress?.(createProgress('Parsing USFM', versesOnly ? 'Parsing USFM verses only...' : 'Parsing USFM content...', 30)); + + // Parse USFM file with structure preservation + // If versesOnly is true, only parse verses (for target imports) + const parsedDocument = await parseUsfmFile(file, versesOnly); + + onProgress?.(createProgress('Creating Notebooks', 'Converting to notebook pairs...', 80)); + + // Create notebook pair with proper metadata + const baseName = file.name.replace(/\.[^/.]+$/, ''); + + const sourceNotebook: ProcessedNotebook = { + name: baseName, + cells: parsedDocument.cells, + metadata: { + id: `usfm-experimental-source-${Date.now()}`, + originalFileName: file.name, + // Store original file data as ArrayBuffer for saving to attachments/originals + originalFileData: arrayBuffer, + importerType: 'usfm-experimental', + fileType: 'usfm', + corpusMarker: getCorpusMarkerForBook(baseName) || 'usfm', + createdAt: new Date().toISOString(), + bookCode: parsedDocument.bookCode, + bookName: parsedDocument.bookName, + totalVerses: parsedDocument.verseCount, + totalParatext: parsedDocument.paratextCount, + chapters: parsedDocument.chapters, + footnoteCount: 
parsedDocument.footnoteCount, + // Store structure metadata for export + structureMetadata: { + originalUsfmContent: parsedDocument.originalUsfmContent, + lineMappings: parsedDocument.lineMappings, // Store line mappings for round-trip + }, + }, + }; + + // Create codex notebook (empty cells for translation) + const codexCells = parsedDocument.cells.map(sourceCell => { + const isStyleCell = sourceCell.metadata?.type === 'style'; + return { + id: sourceCell.id, + content: isStyleCell ? sourceCell.content : '', // Keep style cells, empty others + images: sourceCell.images, + metadata: sourceCell.metadata, + }; + }); + + const codexNotebook: ProcessedNotebook = { + name: baseName, + cells: codexCells, + metadata: { + ...sourceNotebook.metadata, + id: `usfm-experimental-codex-${Date.now()}`, + // Don't duplicate original file data in codex metadata + originalFileData: undefined, + }, + }; + + const notebookPair: NotebookPair = { + source: sourceNotebook, + codex: codexNotebook, + }; + + onProgress?.(createProgress('Complete', 'USFM processing complete', 100)); + + return { + success: true, + notebookPair, + metadata: { + bookCode: parsedDocument.bookCode, + bookName: parsedDocument.bookName, + segmentCount: parsedDocument.cells.length, + verseCount: parsedDocument.verseCount, + paratextCount: parsedDocument.paratextCount, + chapters: parsedDocument.chapters, + footnoteCount: parsedDocument.footnoteCount, + }, + }; + + } catch (error) { + onProgress?.(createProgress('Error', 'USFM processing failed')); + + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error occurred', + }; + } +}; + +export const usfmExperimentalImporter: ImporterPlugin = { + name: 'USFM Importer (Experimental)', + supportedExtensions: SUPPORTED_EXTENSIONS, + supportedMimeTypes: SUPPORTED_MIME_TYPES, + description: 'Import Unified Standard Format Marker (USFM) biblical text files with round-trip export support. 
Headers are included in chapter 1.', + validateFile, + parseFile, + exportFile: async (originalUsfmContent: string, codexCells: Array<{ kind: number; value: string; metadata: any; }>, metadata?: any) => { + const { exportUsfmRoundtrip } = await import('./usfmExporter'); + const structureMetadata = metadata?.structureMetadata; + const lineMappings = structureMetadata?.lineMappings; + if (lineMappings) { + return exportUsfmRoundtrip(originalUsfmContent, lineMappings, codexCells); + } else { + // Fallback for old imports without lineMappings + return exportUsfmRoundtrip(originalUsfmContent, codexCells); + } + }, +}; + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.tsx new file mode 100644 index 000000000..bd527cfff --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.tsx @@ -0,0 +1,14 @@ +import { ImporterPlugin } from "../../../types/plugin"; +import { BookOpen } from "lucide-react"; +import { UsfmImporterForm } from "./UsfmImporterForm"; + +export const usfmExperimentalImporterPlugin: ImporterPlugin = { + id: "usfm-experimental", + name: "USFM Biblical Texts (Experimental)", + description: "Import Unified Standard Format Marker files for biblical texts with round-trip export support", + icon: BookOpen, + component: UsfmImporterForm, + supportedExtensions: ["usfm", "sfm", "SFM", "USFM"], + enabled: true, +}; + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmCellAligner.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmCellAligner.ts new file mode 100644 index 000000000..b025815c8 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmCellAligner.ts @@ -0,0 +1,211 @@ +/** + * Custom USFM Cell Aligner + * Matches verses by verse number and chapter (cell ID format: "BOOK 
import type { CellAligner, AlignedCell, ImportedContent } from '../../../types/plugin';

/**
 * Shape of a verse cell ID: book code (2+ alphanumerics), whitespace, chapter,
 * colon, verse number with an optional single-letter suffix.
 * Examples: "GEN 1:1", "1PE 1:1", "PSA 3:4a".
 */
const VERSE_ID_PATTERN = /^([A-Z0-9]{2,})\s+(\d+):(\d+[a-z]?)$/i;

/**
 * Splits a verse cell ID into its two lookup keys.
 *
 * @param rawId - Candidate cell ID (stringified and trimmed before matching)
 * @returns `withBook` ("BOOK CHAPTER:VERSE", book code upper-cased) and `bare`
 *          ("CHAPTER:VERSE"), or `null` when the ID is not a verse ID
 */
const parseVerseId = (rawId: unknown): { withBook: string; bare: string } | null => {
    const match = String(rawId).trim().match(VERSE_ID_PATTERN);
    if (!match) {
        return null;
    }
    const [, bookCode, chapter, verse] = match;
    const bare = `${chapter}:${verse}`;
    return { withBook: `${bookCode.toUpperCase()} ${bare}`, bare };
};

/**
 * USFM cell aligner: places imported content into the existing target cell it
 * belongs to, instead of aligning sequentially.
 *
 * Matching strategies, tried in order for every non-empty imported item:
 *  1. cellLabel (exact case, then upper-cased) - only when the label identifies
 *     exactly one target cell. A label shared by several target cells is
 *     ambiguous, so it is ignored and the item falls through to ID matching
 *     rather than landing on whichever cell was indexed last.
 *  2. Exact cell ID (trimmed, then upper-cased).
 *  3. Verse reference parsed from the ID: "BOOK CHAPTER:VERSE" first, then the
 *     bare "CHAPTER:VERSE" (first target cell seen with that reference).
 *
 * Imported items with no match are skipped (logged, never turned into new
 * cells). Items whose `content` is missing, not a string, or whitespace-only
 * are skipped silently. Target cells that received no imported content are
 * appended after the matched ones, carrying their existing content.
 *
 * @param targetCells - Existing notebook cells to align against
 * @param sourceCells - Unused; present to satisfy the CellAligner signature
 * @param importedContent - Items parsed from the imported USFM file
 * @returns Matched cells in imported order, followed by preserved target cells
 */
export const usfmCellAligner: CellAligner = async (
    targetCells: any[],
    sourceCells: any[],
    importedContent: ImportedContent[]
): Promise<AlignedCell[]> => {
    const alignedCells: AlignedCell[] = [];

    // Lookup by exact cell ID; holds both the trimmed original and the upper-cased form
    const targetCellsById = new Map<string, any>();
    // Lookup by verse reference; holds "BOOK CHAPTER:VERSE" and "CHAPTER:VERSE" keys
    const targetVersesByRef = new Map<string, any>();
    // Label lookups are kept per casing so a case-insensitive collision does not
    // poison an exact-case hit
    const exactLabelIndex = new Map<string, any>();
    const upperLabelIndex = new Map<string, any>();
    const ambiguousExactLabels = new Set<string>();
    const ambiguousUpperLabels = new Set<string>();

    const indexLabel = (
        index: Map<string, any>,
        ambiguous: Set<string>,
        key: string,
        cell: any
    ): void => {
        const existing = index.get(key);
        if (existing !== undefined && existing !== cell) {
            ambiguous.add(key);
        }
        index.set(key, cell);
    };

    const findByLabel = (
        index: Map<string, any>,
        ambiguous: Set<string>,
        key: string
    ): any | undefined => (ambiguous.has(key) ? undefined : index.get(key));

    // Build lookup maps from target cells
    for (const cell of targetCells) {
        const cellId = cell.metadata?.id || cell.id;
        if (cellId) {
            const trimmedId = String(cellId).trim();
            targetCellsById.set(trimmedId.toUpperCase(), cell);
            targetCellsById.set(trimmedId, cell);

            const verseKeys = parseVerseId(cellId);
            if (verseKeys) {
                targetVersesByRef.set(verseKeys.withBook, cell);
                // The bare reference is first-come: a book-specific key always wins when present
                if (!targetVersesByRef.has(verseKeys.bare)) {
                    targetVersesByRef.set(verseKeys.bare, cell);
                }
            }
        }

        const cellLabel = cell.metadata?.cellLabel;
        if (cellLabel) {
            const trimmedLabel = String(cellLabel).trim();
            indexLabel(exactLabelIndex, ambiguousExactLabels, trimmedLabel, cell);
            indexLabel(upperLabelIndex, ambiguousUpperLabels, trimmedLabel.toUpperCase(), cell);
        }
    }

    let exactMatches = 0;
    let verseMatches = 0;
    let labelMatches = 0;
    let unmatched = 0;

    // Target cells that received imported content
    const matchedTargetCells = new Set<any>();

    for (const importedItem of importedContent) {
        // Nothing to place: missing, non-string, or whitespace-only content
        const content = (importedItem as any)?.content;
        if (typeof content !== 'string' || !content.trim()) {
            continue;
        }

        const importedId = importedItem.id;
        let matchedCell: any | null = null;
        let alignmentMethod: AlignedCell['alignmentMethod'] = 'custom';
        let confidence = 0.0;

        // Strategy 1: cellLabel, but only when it pins down a single target cell
        const cellLabel = importedItem.cellLabel || (importedItem as any).metadata?.cellLabel;
        if (cellLabel) {
            const labelStr = String(cellLabel).trim();
            const byLabel =
                findByLabel(exactLabelIndex, ambiguousExactLabels, labelStr) ??
                findByLabel(upperLabelIndex, ambiguousUpperLabels, labelStr.toUpperCase());
            if (byLabel) {
                matchedCell = byLabel;
                alignmentMethod = 'custom';
                confidence = 0.95;
                labelMatches++;
            }
        }

        // Strategy 2: exact ID (original case first, then upper-cased)
        if (!matchedCell && importedId) {
            const originalId = String(importedId).trim();
            const byId =
                targetCellsById.get(originalId) ?? targetCellsById.get(originalId.toUpperCase());
            if (byId) {
                matchedCell = byId;
                alignmentMethod = 'exact-id';
                confidence = 1.0;
                exactMatches++;
            }
        }

        // Strategy 3: verse reference - book-specific first, then bare chapter:verse
        if (!matchedCell && importedId) {
            const verseKeys = parseVerseId(importedId);
            if (verseKeys) {
                if (targetVersesByRef.has(verseKeys.withBook)) {
                    matchedCell = targetVersesByRef.get(verseKeys.withBook);
                    alignmentMethod = 'custom';
                    confidence = 0.9;
                    verseMatches++;
                } else if (targetVersesByRef.has(verseKeys.bare)) {
                    // Book codes differ; accept the chapter:verse match at lower confidence
                    matchedCell = targetVersesByRef.get(verseKeys.bare);
                    alignmentMethod = 'custom';
                    confidence = 0.85;
                    verseMatches++;
                }
            }
        }

        if (matchedCell) {
            matchedTargetCells.add(matchedCell);
            alignedCells.push({
                notebookCell: matchedCell,
                importedContent: importedItem,
                alignmentMethod,
                confidence,
            });
        } else {
            // No home for this item: skip it rather than creating a new cell
            console.warn(`[USFM Aligner] No match found for verse: ${importedId || 'unknown'}`);
            unmatched++;
        }
    }

    // Keep every target cell that received nothing (headers, preface, untouched verses)
    for (const targetCell of targetCells) {
        if (matchedTargetCells.has(targetCell)) {
            continue;
        }
        alignedCells.push({
            notebookCell: targetCell,
            importedContent: {
                id: (targetCell.metadata?.id || targetCell.id) || '',
                content: targetCell.value || targetCell.content || '',
                cellLabel: targetCell.metadata?.cellLabel,
                metadata: targetCell.metadata || {},
            },
            alignmentMethod: 'custom', // Preserved existing cell
            confidence: 1.0,
        });
    }

    const preservedCount = targetCells.length - matchedTargetCells.size;
    console.log(
        `USFM aligner: ${labelMatches} label matches, ${exactMatches} exact matches, ${verseMatches} verse matches, ` +
        `${unmatched} unmatched imported verses (skipped), ${preservedCount} existing cells preserved`
    );

    // Troubleshooting aid: show a sample of both sides when matching looks off
    if (unmatched > 0 || labelMatches === 0) {
        const sampleTargetLabels = Array.from(exactLabelIndex.keys()).slice(0, 10);
        const sampleImportedLabels = importedContent.slice(0, 10).map(item =>
            item.cellLabel || (item as any).metadata?.cellLabel || item.id
        );
        console.log(`[USFM Aligner] Sample target cell labels:`, sampleTargetLabels);
        console.log(`[USFM Aligner] Sample imported labels/IDs:`, sampleImportedLabels);
    }

    return alignedCells;
};
b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmExporter.ts new file mode 100644 index 000000000..fab1cc16d --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmExporter.ts @@ -0,0 +1,392 @@ +/** + * Standalone USFM Exporter for round-trip functionality + * Rebuilds original USFM file with translated content + * Only replaces text content after markers, preserving all markers and structure + * Skips empty paragraphs (they're not in cells, so they stay as-is) + */ + +import { htmlInlineToUsfm } from './usfmInlineMapper'; + +export interface LineMapping { + lineIndex: number; + cellId: string; + originalLine: string; + marker: string; + hasContent: boolean; +} + +/** + * Export USFM with updated content from Codex cells + * + * @param originalUsfmContent - The original USFM file content as string + * @param lineMappingsOrCells - Either line mappings array OR codex cells (for backward compatibility) + * @param codexCells - Array of Codex cell data with translations (optional if lineMappings provided) + * @returns Updated USFM content as string + */ +export async function exportUsfmRoundtrip( + originalUsfmContent: string, + lineMappingsOrCells: LineMapping[] | Array<{ kind: number; value: string; metadata: any; }>, + codexCells?: Array<{ + kind: number; + value: string; + metadata: any; + }> +): Promise { + // Determine if first param is lineMappings or codexCells (backward compatibility) + let lineMappings: LineMapping[]; + let cells: Array<{ kind: number; value: string; metadata: any; }>; + + if (lineMappingsOrCells.length > 0 && 'lineIndex' in lineMappingsOrCells[0]) { + // First param is lineMappings + lineMappings = lineMappingsOrCells as LineMapping[]; + cells = codexCells || []; + console.log(`[USFM Export] Received lineMappings array with ${lineMappings.length} entries`); + const sampleWithCellId = lineMappings.find(m => m.cellId && m.cellId !== ''); + console.log(`[USFM Export] 
Sample lineMapping with cellId:`, sampleWithCellId); + console.log(`[USFM Export] LineMappings with cellId: ${lineMappings.filter(m => m.cellId && m.cellId !== '').length}`); + } else { + // First param is codexCells (backward compatibility - old experimental exporter) + cells = lineMappingsOrCells as Array<{ kind: number; value: string; metadata: any; }>; + // Generate lineMappings from original content (less precise but works) + const lines = originalUsfmContent.split(/\r?\n/); + lineMappings = lines.map((line, index) => { + const trimmedLine = line.trim(); + if (!trimmedLine || !trimmedLine.startsWith('\\')) { + return { + lineIndex: index, + cellId: '', + originalLine: line, + marker: '', + hasContent: false, + }; + } + const markerMatch = trimmedLine.match(/^\\([a-zA-Z]+\d*(?:-[se])?)\s*(.*)$/); + const textContent = markerMatch?.[2]?.trim() || ''; + return { + lineIndex: index, + cellId: '', // Will be matched by originalLine/originalText + originalLine: line, + marker: markerMatch ? `\\${markerMatch[1]}` : '', + hasContent: !!textContent, + }; + }); + } + // Build mapping from cellId to translated content (keep as HTML for
handling) + const cellTranslations = new Map(); + + for (const cell of cells) { + const metadata = cell.metadata as any; + const translatedContent = cell.value.trim(); + + // Try to get cellId from multiple possible locations + // VS Code notebooks store id in metadata.id, but our ProcessedCell has id as top-level + const cellId = (cell as any).id || metadata?.id; + + // Skip empty cells or cells without ID + if (!translatedContent || !cellId) { + if (!cellId && translatedContent) { + console.warn(`[USFM Export] Cell has content but no ID:`, { + value: translatedContent.substring(0, 50), + metadataKeys: Object.keys(metadata || {}), + cellKeys: Object.keys(cell || {}) + }); + } + continue; + } + + // Store HTML content as-is (we'll handle
splitting and conversion later) + cellTranslations.set(cellId, translatedContent); + } + + console.log(`[USFM Export] Found ${cellTranslations.size} cell translations`); + console.log(`[USFM Export] Sample cell IDs from translations:`, Array.from(cellTranslations.keys()).slice(0, 5)); + console.log(`[USFM Export] Total line mappings: ${lineMappings.length}`); + const mappingsWithCellId = lineMappings.filter(m => m.cellId && m.cellId !== ''); + console.log(`[USFM Export] Line mappings with cellId: ${mappingsWithCellId.length}`); + console.log(`[USFM Export] Sample line mapping cell IDs:`, mappingsWithCellId.slice(0, 5).map(m => m.cellId)); + + // If lineMappings don't have cellIds, build a fallback mapping by originalLine/originalText + // This handles notebooks imported before cellIds were stored in lineMappings + const fallbackMapping = new Map(); + if (mappingsWithCellId.length === 0 && cellTranslations.size > 0) { + console.log(`[USFM Export] No cellIds in lineMappings, building fallback mapping by originalLine/originalText`); + for (const cell of cells) { + const metadata = cell.metadata as any; + const translatedContent = cell.value.trim(); + if (!translatedContent) continue; + + // Store HTML content as-is (we'll handle conversion later) + + // Try to match by originalLine first + if (metadata?.originalLine) { + const normalizedLine = String(metadata.originalLine).trim().replace(/\s+/g, ' '); + fallbackMapping.set(normalizedLine, translatedContent); + } + + // Also try by originalText + if (metadata?.originalText) { + const normalizedText = String(metadata.originalText).trim().replace(/\s+/g, ' '); + fallbackMapping.set(normalizedText, translatedContent); + } + } + console.log(`[USFM Export] Built fallback mapping with ${fallbackMapping.size} entries`); + } + + // Split original content into lines + const lines = originalUsfmContent.split(/\r?\n/); + const updatedLines: string[] = []; + let translationCount = 0; + let skippedCount = 0; + let fallbackCount = 
0; + + // Track which lines to skip (they're part of a multi-line verse we already processed) + const linesToSkip = new Set(); + + // Process each line using the mappings + for (let i = 0; i < lines.length; i++) { + // Skip lines that are part of a multi-line verse we already processed + if (linesToSkip.has(i)) { + skippedCount++; + continue; + } + + const mapping = lineMappings[i]; + + // If no mapping for this line, keep original + if (!mapping) { + updatedLines.push(lines[i]); + continue; + } + + // If line has no content (empty marker or continuation), keep as-is + if (!mapping.hasContent) { + updatedLines.push(mapping.originalLine || lines[i]); + skippedCount++; + continue; + } + + let translation: string | undefined; + + // First try to match by cellId (most precise) + if (mapping.cellId && cellTranslations.has(mapping.cellId)) { + translation = cellTranslations.get(mapping.cellId); + translationCount++; + } + // Fallback: match by originalLine or originalText if cellIds aren't available + else if (fallbackMapping.size > 0 && mapping.originalLine) { + const originalLine = String(mapping.originalLine).trim(); + if (originalLine) { + const normalizedLine = originalLine.replace(/\s+/g, ' '); + if (fallbackMapping.has(normalizedLine)) { + translation = fallbackMapping.get(normalizedLine); + fallbackCount++; + } else { + // Try matching just the text part (after marker) + const markerMatch = originalLine.match(/^\\([a-zA-Z]+\d*(?:-[se])?)\s*(.*)$/); + if (markerMatch) { + const textPart = markerMatch[2]?.trim().replace(/\s+/g, ' '); + if (textPart && fallbackMapping.has(textPart)) { + translation = fallbackMapping.get(textPart); + fallbackCount++; + } + } + } + } + } + + if (translation) { + // Extract marker from original line + const originalLine = mapping.originalLine || lines[i]; + const markerMatch = originalLine.match(/^\\([a-zA-Z]+\d*(?:-[se])?)\s*(.*)$/); + + if (markerMatch) { + const [, marker, originalText] = markerMatch; + + // Handle verse markers 
specially (need to preserve verse number and break tags) + if (marker === 'v' || marker.startsWith('v')) { + const verseMatch = originalText.match(/^(\d+[a-z]?)\s*(.*)$/); + if (verseMatch) { + const [, verseNum] = verseMatch; + const trimmedTranslation = String(translation).trim(); + + if (trimmedTranslation) { + // Check if this verse has break tags (multi-line verse) + // Find the cell that contains this translation + let breakTagMetadata: string | undefined; + if (mapping.cellId) { + const cell = cells.find(c => { + const cellId = (c as any).id || (c.metadata as any)?.id; + return cellId === mapping.cellId; + }); + if (cell) { + breakTagMetadata = (cell.metadata as any)?.breakTag; + } + } + + // Check if translation contains
tags (multi-line verse) + // Split by
tags BEFORE converting to USFM (since
is structural) + const hasBrTags = //i.test(trimmedTranslation); + + if (hasBrTags) { + // Multi-line verse - split by
and map each part to corresponding USFM line + // Handle

(double break) as \b tag + // Split by
tags - consecutive
tags will create empty parts + const parts = trimmedTranslation.split(//i).map(p => p.trim()); + + // Note: When we split by
,

creates one empty part + // So if breakTags has \b, the corresponding empty part should map to \b + // We'll use breakTags array to determine which break tag to use for each part + + // Find all subsequent break tag lines (\li1, \li2, \b, etc.) that belong to this verse + const breakLines: Array<{ index: number; mapping: LineMapping; originalMarker: string }> = []; + for (let j = i + 1; j < lineMappings.length; j++) { + const nextMapping = lineMappings[j]; + // Check if this line belongs to the same verse (same cellId) + if (nextMapping.cellId === mapping.cellId) { + const breakMarkers = ['\\li1', '\\li2', '\\li3', '\\li4', '\\q1', '\\q2', '\\q3', '\\q4', '\\b']; + if (breakMarkers.includes(nextMapping.marker)) { + breakLines.push({ + index: j, + mapping: nextMapping, + originalMarker: nextMapping.marker + }); + } else { + // We've reached a different marker (next verse or section) - stop + break; + } + } else if (nextMapping.cellId && nextMapping.cellId !== '') { + // Different cellId - stop looking + break; + } + } + + // Get break tags from metadata if available + const breakTags = breakTagMetadata ? breakTagMetadata.split('|').filter(t => t) : []; + + // First part goes to the \v line + const firstPart = parts[0] || ''; + const firstPartUsfm = firstPart ? 
htmlInlineToUsfm(firstPart) : ''; + if (firstPartUsfm) { + updatedLines.push(`\\${marker} ${verseNum} ${firstPartUsfm}`); + } else { + updatedLines.push(`\\${marker} ${verseNum}`); + } + + // Map each subsequent part to the corresponding break tag line + // Use breakTagMetadata to determine the correct break tag for each position + let breakTagIdx = 0; // Index into breakTags array + let breakLineIdx = 0; // Index into breakLines array + + // Process parts sequentially, matching each to its corresponding break tag + for (let partIdx = 1; partIdx < parts.length; partIdx++) { + const part = parts[partIdx] || ''; + + // Check if we've run out of break tags + if (breakTagIdx >= breakTags.length) { + // No more break tags in metadata - use original markers from breakLines + const partUsfm = part ? htmlInlineToUsfm(part) : ''; + if (breakLineIdx < breakLines.length) { + const breakLine = breakLines[breakLineIdx]; + linesToSkip.add(breakLine.index); + const breakTag = breakLine.originalMarker || '\\li1'; + if (partUsfm) { + updatedLines.push(`${breakTag} ${partUsfm}`); + } else { + updatedLines.push(breakTag); + } + breakLineIdx++; + } else { + updatedLines.push(`\\li1${partUsfm ? ' ' + partUsfm : ''}`); + } + continue; + } + + const currentBreakTag = breakTags[breakTagIdx]; + + if (currentBreakTag === '\\b') { + // \b tag - output blank line marker + // \b produces one
, so it consumes one empty part (if empty) or processes text (if any) + if (breakLineIdx < breakLines.length) { + const breakLine = breakLines[breakLineIdx]; + linesToSkip.add(breakLine.index); + updatedLines.push('\\b'); + breakLineIdx++; + } else { + updatedLines.push('\\b'); + } + breakTagIdx++; + } else { + // Regular break tag (\li1, \q1, etc.) + const partUsfm = part ? htmlInlineToUsfm(part) : ''; + + if (breakLineIdx < breakLines.length) { + const breakLine = breakLines[breakLineIdx]; + linesToSkip.add(breakLine.index); + + // Use break tag from metadata + const breakTag = currentBreakTag || breakLine.originalMarker || '\\li1'; + if (partUsfm) { + updatedLines.push(`${breakTag} ${partUsfm}`); + } else { + // Empty part - output just the break tag (empty \li1 line) + updatedLines.push(breakTag); + } + breakLineIdx++; + } else { + // More parts than break lines - add new break line + const breakTag = currentBreakTag || '\\li1'; + if (partUsfm) { + updatedLines.push(`${breakTag} ${partUsfm}`); + } else { + updatedLines.push(breakTag); + } + } + breakTagIdx++; + } + } + + // Mark any remaining break lines (if we have fewer parts than break lines) to skip + for (let remainingIdx = breakLineIdx; remainingIdx < breakLines.length; remainingIdx++) { + linesToSkip.add(breakLines[remainingIdx].index); + } + } else { + // Single-line verse - convert HTML to USFM and output + const usfmText = htmlInlineToUsfm(trimmedTranslation); + updatedLines.push(`\\${marker} ${verseNum} ${usfmText}`); + } + } else { + // Empty translation - output just marker and verse number + updatedLines.push(`\\${marker} ${verseNum}`); + } + } else { + // Verse without number - shouldn't happen but handle it + const usfmText = htmlInlineToUsfm(String(translation)); + updatedLines.push(`\\${marker} ${usfmText}`); + } + } else { + // All other markers - preserve marker, replace text + const trimmedTranslation = String(translation).trim(); + if (trimmedTranslation) { + const usfmText = 
htmlInlineToUsfm(trimmedTranslation); + updatedLines.push(`\\${marker} ${usfmText}`); + } else { + // Empty translation - output just marker (will be skipped on re-import) + updatedLines.push(`\\${marker}`); + } + } + } else { + // Couldn't parse marker - keep original + updatedLines.push(originalLine); + } + } else { + // No translation found - keep original + updatedLines.push(mapping.originalLine || lines[i]); + if (mapping.cellId && cellTranslations.has(mapping.cellId)) { + console.warn(`[USFM Export] No translation found for cellId: ${mapping.cellId}`); + } + } + } + + console.log(`[USFM Export] Applied ${translationCount} translations by cellId, ${fallbackCount} by fallback matching, skipped ${skippedCount} empty/continuation lines`); + return updatedLines.join('\n'); +} diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmInlineMapper.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmInlineMapper.ts new file mode 100644 index 000000000..daa3016eb --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmInlineMapper.ts @@ -0,0 +1,445 @@ +/** + * Standalone USFM Inline Marker Converter + * Converts USFM inline markers to HTML and back + * Copied from common/usfmHtmlMapper.ts to be standalone + */ + +const isAlphaNum = (ch: string) => /[A-Za-z0-9]/.test(ch); + +// Helper function to convert inline markers in footnote text (without processing footnotes recursively) +function convertUsfmInlineMarkersInText(usfmText: string): string { + type StackEntry = { marker: string; closers: string[]; }; + const stack: StackEntry[] = []; + + const openFor = (marker: string): { openers: string[]; closers: string[]; } => { + switch (marker) { + case 'bd': + return { openers: [``], closers: [''] }; + case 'it': + return { openers: [``], closers: [''] }; + case 'bdit': + return { openers: ['', ``], closers: ['', ''] }; + case 'sup': + return { openers: [``], closers: 
[''] }; + case 'sc': + return { openers: [``], closers: [''] }; + default: + return { openers: [``], closers: [''] }; + } + }; + + let i = 0; + let out = ''; + while (i < usfmText.length) { + const ch = usfmText[i]; + if (ch === '\\') { + let j = i + 1; + let name = ''; + // Support plus-prefixed note-internal markers like \+xt + if (usfmText[j] === '+') { + name += '+'; + j++; + } + while (j < usfmText.length && isAlphaNum(usfmText[j])) { + name += usfmText[j]; + j++; + } + // Milestones \qt-s/\qt-e are treated as inline spans with data-tag; we ignore -s/-e in HTML + if (usfmText[j] === '-' && (usfmText[j + 1] === 's' || usfmText[j + 1] === 'e')) { + j += 2; + } + // Closing marker + if (usfmText[j] === '*') { + let idx = stack.length - 1; + while (idx >= 0 && stack[idx].marker !== name) idx--; + if (idx >= 0) { + const entry = stack.splice(idx, 1)[0]; + for (const closer of entry.closers) out += closer; + } else { + out += ''; + } + j += 1; + i = j; + continue; + } + if (usfmText[j] === ' ') j += 1; + const { openers, closers } = openFor(name); + openers.forEach(op => (out += op)); + stack.push({ marker: name, closers: [...closers] }); + i = j; + } else { + out += ch; + i++; + } + } + // Close any dangling tags + while (stack.length > 0) { + const entry = stack.pop()!; + for (const closer of entry.closers) out += closer; + } + return out; +} + +// Converts USFM inline markers (character-level styling) to HTML +export const convertUsfmInlineMarkersToHtml = (usfmText: string): string => { + // First, handle footnotes (\f...\f*) + // USFM footnote format: \f + \fr reference \ft footnote text\f* + // or simpler: \f + \ft footnote text\f* + let processedText = usfmText; + let footnoteCounter = 0; + + // Match footnote pattern: \f + \fr? ... \ft ... 
\f* + const footnoteRegex = /\\f\s+([+\-*]|\w+)\s*(.*?)\\f\*/gs; + const footnotes: Array<{ caller: string; content: string; position: number }> = []; + + let match; + while ((match = footnoteRegex.exec(usfmText)) !== null) { + footnoteCounter++; + const [fullMatch, caller, footnoteContent] = match; + const position = match.index; + + // Parse footnote content + let reference = ''; + let footnoteText = ''; + + // Extract \fr reference if present + const frMatch = footnoteContent.match(/\\fr\s+([^\\]+)/); + if (frMatch) { + reference = frMatch[1].trim(); + } + + // Extract \ft text (everything after \fr or the main content) + const ftMatch = footnoteContent.match(/\\ft\s+(.*)/s); + if (ftMatch) { + footnoteText = ftMatch[1].trim(); + } else { + // No \ft marker, use content directly (after removing \fr if present) + footnoteText = footnoteContent.replace(/\\fr\s+[^\\]+/g, '').trim(); + } + + // Convert footnote text to HTML (handle inline markers within footnote) + // Use a helper function to avoid recursion + const footnoteHtml = convertUsfmInlineMarkersInText(footnoteText); + + // Build footnote HTML in the format:

reference: text

+ let footnoteContentHtml = ''; + if (reference) { + footnoteContentHtml = `

${reference}:  ${footnoteHtml}

`; + } else { + footnoteContentHtml = `

${footnoteHtml}

`; + } + + // Escape HTML for use in data attribute + const escapedFootnote = footnoteContentHtml + .replace(/"/g, '"') + .replace(/'/g, '''); + + footnotes.push({ + caller: caller || '+', + content: escapedFootnote, + position, + }); + } + + // Replace footnotes in reverse order to preserve positions + for (let i = footnotes.length - 1; i >= 0; i--) { + const footnote = footnotes[i]; + const footnoteRegex2 = /\\f\s+([+\-*]|\w+)\s*(.*?)\\f\*/s; + const footnoteMatch = processedText.substring(footnote.position).match(footnoteRegex2); + if (footnoteMatch) { + const footnoteNumber = i + 1; // Use 1-based numbering + const replacement = `${footnoteNumber}`; + processedText = processedText.substring(0, footnote.position) + + replacement + + processedText.substring(footnote.position + footnoteMatch[0].length); + } + } + + // Now process other inline markers + type StackEntry = { marker: string; closers: string[]; }; + const stack: StackEntry[] = []; + + const openFor = (marker: string): { openers: string[]; closers: string[]; } => { + switch (marker) { + case 'bd': + return { openers: [``], closers: [''] }; + case 'it': + return { openers: [``], closers: [''] }; + case 'bdit': + return { openers: ['', ``], closers: ['', ''] }; + case 'sup': + return { openers: [``], closers: [''] }; + case 'sc': + return { openers: [``], closers: [''] }; + default: + return { openers: [``], closers: [''] }; + } + }; + + let i = 0; + let out = ''; + while (i < processedText.length) { + const ch = processedText[i]; + if (ch === '\\') { + let j = i + 1; + let name = ''; + // Support plus-prefixed note-internal markers like \+xt + if (processedText[j] === '+') { + name += '+'; + j++; + } + while (j < processedText.length && isAlphaNum(processedText[j])) { + name += processedText[j]; + j++; + } + // Milestones \qt-s/\qt-e are treated as inline spans with data-tag; we ignore -s/-e in HTML + if (processedText[j] === '-' && (processedText[j + 1] === 's' || processedText[j + 1] === 'e')) { + j += 
2; + } + // Closing marker + if (processedText[j] === '*') { + let idx = stack.length - 1; + while (idx >= 0 && stack[idx].marker !== name) idx--; + if (idx >= 0) { + const entry = stack.splice(idx, 1)[0]; + for (const closer of entry.closers) out += closer; + } else { + out += ''; + } + j += 1; + i = j; + continue; + } + if (processedText[j] === ' ') j += 1; + const { openers, closers } = openFor(name); + openers.forEach(op => (out += op)); + stack.push({ marker: name, closers: [...closers] }); + i = j; + } else { + out += ch; + i++; + } + } + // Close any dangling tags + while (stack.length > 0) { + const entry = stack.pop()!; + for (const closer of entry.closers) out += closer; + } + return out; +}; + +// Convert HTML back to USFM inline markers +// Uses DOMParser when it is defined; if it is missing or parsing throws, falls back to +// repeated regex replacement (at most maxIterations passes), then strips leftover tags and trims. +// NOTE(review): some HTML tag literals in this function look stripped in this copy (e.g. the parseFromString template and the heads of the tag regexes) - confirm against the repository file. +export const htmlInlineToUsfm = (html: string): string => { + // Check if DOMParser is available (browser context) + if (typeof DOMParser !== 'undefined') { + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(`
${html}
`, 'text/html'); + const container = doc.body.firstElementChild as HTMLElement | null; + if (!container) return html; + + const inferMarkerFromElement = (el: HTMLElement): string | null => { + if (el.hasAttribute('data-tag')) return el.getAttribute('data-tag'); + const tag = el.tagName.toLowerCase(); + if (tag === 'strong' || tag === 'b') return 'bd'; + if (tag === 'em' || tag === 'i') return 'it'; + if (tag === 'sup') return 'sup'; + const style = el.getAttribute('style') || ''; + if (style.includes('small-caps')) return 'sc'; + return null; + }; + + const walk = (node: Node): string => { + if (node.nodeType === Node.TEXT_NODE) return node.textContent || ''; + if (node.nodeType === Node.ELEMENT_NODE) { + const el = node as HTMLElement; + + // Handle footnotes: N + if (el.tagName.toLowerCase() === 'sup' && + el.hasAttribute('data-footnote') && + el.classList.contains('footnote-marker')) { + const footnoteContent = el.getAttribute('data-footnote') || ''; + // Unescape HTML entities + const unescaped = footnoteContent + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' '); + + // Parse footnote HTML back to USFM + // Format:

reference: text

or

text

+ const parser = new DOMParser(); + const footnoteDoc = parser.parseFromString(unescaped, 'text/html'); + const footnotePara = footnoteDoc.body.querySelector('p'); + + if (footnotePara) { + let reference = ''; + let footnoteText = ''; + + // Check for tag (reference) + const emTag = footnotePara.querySelector('em'); + if (emTag) { + reference = emTag.textContent?.trim() || ''; + // Remove the reference from the paragraph + const textNodes = Array.from(footnotePara.childNodes) + .filter(n => { + if (n.nodeType === Node.TEXT_NODE) return true; + if (n.nodeType === Node.ELEMENT_NODE) { + const el = n as Element; + return el.tagName.toLowerCase() !== 'em'; + } + return false; + }) + .map(n => { + if (n.nodeType === Node.TEXT_NODE) return n.textContent || ''; + if (n.nodeType === Node.ELEMENT_NODE) { + return htmlInlineToUsfm((n as HTMLElement).outerHTML); + } + return ''; + }) + .join('') + .trim(); + footnoteText = textNodes.replace(/^:?\s*/, ''); + } else { + // No reference, just text + footnoteText = htmlInlineToUsfm(footnotePara.innerHTML); + } + + // Build USFM footnote: \f + \fr reference \ft text\f* + let usfmFootnote = '\\f +'; + if (reference) { + usfmFootnote += ` \\fr ${reference}`; + } + if (footnoteText) { + usfmFootnote += ` \\ft ${footnoteText}`; + } + usfmFootnote += '\\f*'; + + return usfmFootnote; + } + } + + const tag = inferMarkerFromElement(el); + const inner = Array.from(el.childNodes).map(walk).join(''); + if (tag) { + return `\\${tag} ${inner}\\${tag}*`; + } + return inner; + } + return ''; + }; + + return Array.from(container.childNodes).map(walk).join(''); + } catch (error) { + console.warn('DOMParser failed, using regex fallback:', error); + } + } + + // Fallback: Regex-based approach for Node.js context + let result = html; + let changed = true; + let iterations = 0; + const maxIterations = 20; + + while (changed && iterations < maxIterations) { + iterations++; + changed = false; + const before = result; + + // Match innermost tags with 
data-tag first + result = result.replace(/<(\w+)[^>]*data-tag="([^"]+)"[^>]*>([^<]*)<\/\1>/gi, (match, tagName, dataTag, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\${dataTag} ${innerUsfm}\\${dataTag}*` : ''; + }); + + // Handle semantic tags without data-tag + result = result.replace(/]*>([^<]*)<\/strong>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\bd ${innerUsfm}\\bd*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/b>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\bd ${innerUsfm}\\bd*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/em>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\it ${innerUsfm}\\it*` : ''; + }); + result = result.replace(/]*>([^<]*)<\/i>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\it ${innerUsfm}\\it*` : ''; + }); + // Handle footnotes BEFORE regular sup tags + result = result.replace(/]*data-footnote="([^"]+)"[^>]*class="footnote-marker"[^>]*>(\d+)<\/sup>/gi, (match, footnoteContent, footnoteNum) => { + changed = true; + // Unescape HTML entities + const unescaped = footnoteContent + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' '); + + // Parse footnote HTML:

reference: text

or

text

+ // Use regex to extract reference and text + const refMatch = unescaped.match(/

([^<]+):\s*<\/em> (.*?)<\/p>/); + let usfmFootnote = '\\f +'; + + if (refMatch) { + const [, reference, text] = refMatch; + usfmFootnote += ` \\fr ${reference.trim()}`; + // Convert HTML in text back to USFM + const textUsfm = htmlInlineToUsfm(text); + if (textUsfm) { + usfmFootnote += ` \\ft ${textUsfm}`; + } + } else { + // No reference, just text + const textMatch = unescaped.match(/

(.*?)<\/p>/); + if (textMatch) { + const textUsfm = htmlInlineToUsfm(textMatch[1]); + if (textUsfm) { + usfmFootnote += ` \\ft ${textUsfm}`; + } + } + } + usfmFootnote += '\\f*'; + return usfmFootnote; + }); + + result = result.replace(/]*>([^<]*)<\/sup>/gi, (match, content) => { + // Skip if this was already processed as a footnote + if (match.includes('data-footnote')) return match; + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\sup ${innerUsfm}\\sup*` : ''; + }); + result = result.replace(/]*style="[^"]*small-caps[^"]*"[^>]*>([^<]*)<\/span>/gi, (match, content) => { + changed = true; + const innerUsfm = content.trim(); + return innerUsfm ? `\\sc ${innerUsfm}\\sc*` : ''; + }); + + // Handle nested tags with data-tag (process recursively) + result = result.replace(/<(\w+)[^>]*data-tag="([^"]+)"[^>]*>(.*?)<\/\1>/gi, (match, tagName, dataTag, content) => { + if (content.includes('<')) { + const innerUsfm = htmlInlineToUsfm(content); + changed = true; + return innerUsfm ? `\\${dataTag} ${innerUsfm}\\${dataTag}*` : ''; + } + return match; + }); + + if (result === before) { + changed = false; + } + } + + // Clean up any remaining HTML tags + result = result.replace(/<[^>]+>/g, ''); + + return result.trim(); +}; + diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts new file mode 100644 index 000000000..5696b9b9b --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/usfmParser.ts @@ -0,0 +1,466 @@ +/** + * Standalone USFM Parser + * Reads all USFM content, including header tags (\id, \toc, etc.) 
as part of chapter 1 + * Splits file into bible chapters + * Skips empty paragraphs during import + */ + +import { ProcessedCell } from '../../../types/common'; +import { createProcessedCell } from '../../../utils/workflowHelpers'; +import { convertUsfmInlineMarkersToHtml } from './usfmInlineMapper'; + +/** + * Result returned by parseUsfmFile for one USFM file: the cells that were created, + * verse/paratext counters, the chapter numbers seen, the original file content, and one + * lineMappings entry per input line (cellId is '' for lines that did not produce or join a cell). + */ +export interface ParsedUsfmDocument { + bookCode: string; + bookName?: string; + fileName: string; + cells: ProcessedCell[]; + verseCount: number; + paratextCount: number; + chapters: number[]; + footnoteCount: number; + footnotes: any[]; + // Preserve original USFM content for round-trip export + originalUsfmContent: string; + // Store line mappings for export + lineMappings: Array<{ + lineIndex: number; + cellId: string; + originalLine: string; + marker: string; + hasContent: boolean; + }>; +} + +/** + * Parse USFM file line-by-line + * - Includes header tags (\id, \toc, etc.) as part of chapter 1 + * - Creates cells only for lines with content (skips empty markers like \p) + * - Preserves all structure for round-trip export + * @param file - The USFM file to parse + * @param versesOnly - If true, only parse verses (skip headers, sections, etc.) 
/**
 * Parse a USFM file line by line into cells plus a per-line mapping for export.
 *
 * Behaviour, as implemented below:
 * - `\id` is always inspected for the book code and name, even when `versesOnly` is set.
 * - Markers that appear before the first `\c` are assigned to chapter 1.
 * - A `\v` line starts a verse; following `\li1`-`\li4`, `\q1`-`\q4`, `\b`, blank and
 *   unmarked lines are folded into it, and any other marker ends it.
 * - Markers without text are recorded in `lineMappings` but produce no cell.
 * - A verse that ends up with no text at all produces no cell.
 *
 * @param file - The USFM file to parse
 * @param versesOnly - If true, only verse cells are created; every other marker is recorded
 *   in `lineMappings` only (used for target imports)
 * @returns The parsed document, including the unmodified file content
 * @throws Error if no cell could be created from the file
 */
export async function parseUsfmFile(
    file: File,
    versesOnly: boolean = false
): Promise<ParsedUsfmDocument> {
    // NOTE(review): the line-break literal is assumed to be '<br>' - confirm it matches what
    // the round-trip exporter splits on before merging.
    const LINE_BREAK = '<br>';
    // Markers that continue the current verse on a new line instead of ending it.
    const breakMarkers = new Set(['li1', 'li2', 'li3', 'li4', 'q1', 'q2', 'q3', 'q4']);

    const originalContent = await file.text();
    const lines = originalContent.split(/\r?\n/);
    const cells: ProcessedCell[] = [];
    const lineMappings: ParsedUsfmDocument['lineMappings'] = [];

    let bookCode = 'XXX';
    let bookName: string | undefined;
    let bookCodeExtracted = false;
    let currentChapter = 1; // headers before the first \c belong to chapter 1
    let seenFirstChapter = false;
    let verseCount = 0;
    let paratextCount = 0;
    const chapters = new Set<number>();

    // Verse currently being assembled. `verseText[i]` and `breakTags[i]` always describe the
    // same source line, so both arrays must grow together.
    let currentVerse: {
        verseNumber: string | number;
        verseText: string[];
        breakTags: string[]; // original break tags like \li1, \q1, \b ('' when there is none)
        startLineIndex: number;
        chapter: number;
    } | null = null;

    /** Append one entry to `lineMappings`; by default the line is not linked to any cell. */
    const recordLine = (
        lineIndex: number,
        originalLine: string,
        marker: string,
        cellId = '',
        hasContent = false
    ): void => {
        lineMappings.push({ lineIndex, cellId, originalLine, marker, hasContent });
    };

    /** Turn the verse being assembled into a cell (or drop it if it has no text) and reset it. */
    function finishCurrentVerse(): void {
        if (!currentVerse) return;

        const { verseNumber, verseText, breakTags, startLineIndex, chapter } = currentVerse;
        currentVerse = null;

        const htmlParts: string[] = [];
        const breakTagMetadataParts: string[] = [];

        verseText.forEach((rawText, idx) => {
            const text = rawText.trim();
            if (idx === 0) {
                // Text from the \v line itself: no break in front of it.
                if (text) htmlParts.push(text);
                return;
            }

            const breakTag = breakTags[idx] ?? '';
            const isBreakMarker =
                breakTag === '\\b' || breakTag.startsWith('\\li') || breakTag.startsWith('\\q');

            if (isBreakMarker) {
                // The tag is kept in the metadata even when its line has no text, so empty
                // \li1 / \q1 / \b lines can be restored on export.
                htmlParts.push(LINE_BREAK);
                if (text) htmlParts.push(text);
                breakTagMetadataParts.push(breakTag);
            } else if (text) {
                // Blank-line or unmarked continuation: only a break plus the text.
                htmlParts.push(LINE_BREAK, text);
            }
        });

        // A verse consisting only of breaks is treated as empty and skipped.
        const hasContent = htmlParts.some(
            part => part !== LINE_BREAK && part !== LINE_BREAK + LINE_BREAK
        );
        if (!hasContent) return;

        const cellId = `${bookCode} ${chapter}:${verseNumber}`;
        const htmlContent = htmlParts.join('').trim();

        const cell = createProcessedCell(cellId, convertUsfmInlineMarkersToHtml(htmlContent), {
            type: 'text',
            id: cellId,
            bookCode,
            bookName,
            fileName: file.name,
            chapter,
            marker: '\\v',
            originalLine: lines[startLineIndex]?.trim() ?? '',
            originalText: verseText.join(' ').trim(),
            lineIndex: startLineIndex,
            verse: verseNumber,
            cellLabel: cellId,
            // Break tags joined with '|', or undefined when the verse had none.
            breakTag:
                breakTagMetadataParts.length > 0 ? breakTagMetadataParts.join('|') : undefined,
        } as any);

        cells.push(cell);
        verseCount++;
    }

    for (const [lineIndex, line] of lines.entries()) {
        const trimmedLine = line.trim();

        // Blank line: kept in the mappings; inside a verse it is stored as an empty part.
        if (!trimmedLine) {
            if (currentVerse) {
                currentVerse.verseText.push('');
                currentVerse.breakTags.push('');
            }
            recordLine(lineIndex, line, '');
            continue;
        }

        // Unmarked line: continuation of the current verse, if there is one.
        if (!trimmedLine.startsWith('\\')) {
            if (currentVerse) {
                currentVerse.verseText.push(trimmedLine);
                currentVerse.breakTags.push('');
                recordLine(
                    lineIndex,
                    line,
                    '',
                    `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}`,
                    true
                );
            } else {
                recordLine(lineIndex, line, '');
            }
            continue;
        }

        // Match: \marker text, or \marker without text.
        const markerMatch = trimmedLine.match(/^\\([a-zA-Z]+\d*(?:-[se])?)\s*(.*)$/);
        if (!markerMatch) {
            // Starts with a backslash but is not a recognisable marker: ends the verse.
            if (currentVerse) finishCurrentVerse();
            recordLine(lineIndex, line, '');
            continue;
        }

        const [, marker = '', rawText = ''] = markerMatch;
        const textContent = rawText.trim();
        const fullMarker = `\\${marker}`;

        // \id carries the book code and, optionally, the book name.
        if (marker === 'id' && textContent) {
            const idMatch = textContent.match(/^([A-Z0-9]{2,4})\b/);
            if (idMatch) {
                bookCode = (idMatch[1] ?? '').toUpperCase();
                bookCodeExtracted = true;
                console.log(`[USFM Parser] Extracted book code: ${bookCode} from line: ${trimmedLine}`);
            }
            // "MAT - Name" first, then "MAT Name".
            const nameMatch =
                textContent.match(/^[A-Z0-9]{2,4}\s*-\s*(.+)$/) ??
                textContent.match(/^[A-Z0-9]{2,4}\s+(.+)$/);
            if (nameMatch) {
                bookName = (nameMatch[1] ?? '').trim();
            }
            if (versesOnly) {
                recordLine(lineIndex, line, fullMarker);
                continue;
            }
        }

        // \c switches chapter and always closes the verse in progress.
        if (marker === 'c' && textContent) {
            const chapterNum = parseInt(textContent, 10);
            if (!Number.isNaN(chapterNum)) {
                currentChapter = chapterNum;
                chapters.add(chapterNum);
                seenFirstChapter = true;
            }
            if (currentVerse) finishCurrentVerse();
        }

        const cellChapter = seenFirstChapter ? currentChapter : 1;

        // Verse marker: close the previous verse and start collecting a new one.
        if (marker.startsWith('v')) {
            if (currentVerse) finishCurrentVerse();

            const verseMatch = textContent.match(/^(\d+[a-z]?)\s*(.*)$/);
            if (verseMatch) {
                const verseNum = verseMatch[1] ?? '';
                const firstText = verseMatch[2] ?? '';
                const verseNumber = /^\d+$/.test(verseNum) ? parseInt(verseNum, 10) : verseNum;

                currentVerse = {
                    verseNumber,
                    // Always seed one (possibly empty) entry so verseText stays index-aligned
                    // with breakTags; otherwise every later line gets the wrong break tag.
                    verseText: [firstText],
                    breakTags: [''],
                    startLineIndex: lineIndex,
                    chapter: cellChapter,
                };
                recordLine(lineIndex, line, fullMarker, `${bookCode} ${cellChapter}:${verseNumber}`, true);
            } else {
                // Verse marker without a number: mapped, but no verse is started.
                recordLine(lineIndex, line, fullMarker, '', true);
            }
            continue;
        }

        // Break markers (\li1, \q1, ...) inside a verse extend it.
        if (currentVerse && breakMarkers.has(marker)) {
            currentVerse.verseText.push(textContent);
            currentVerse.breakTags.push(fullMarker);
            recordLine(
                lineIndex,
                line,
                fullMarker,
                `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}`,
                true
            );
            continue;
        }

        // \b inside a verse: stored as an empty part tagged '\b'.
        if (currentVerse && marker === 'b') {
            currentVerse.verseText.push('');
            currentVerse.breakTags.push('\\b');
            recordLine(
                lineIndex,
                line,
                '\\b',
                `${bookCode} ${currentVerse.chapter}:${currentVerse.verseNumber}`,
                false
            );
            continue;
        }

        // Markers without text (\p, \q1, ...) and, in versesOnly mode, every remaining
        // marker: end the verse and keep only the mapping.
        if (!textContent || versesOnly) {
            if (currentVerse) finishCurrentVerse();
            recordLine(lineIndex, line, fullMarker);
            continue;
        }

        // Any other marker with text (headers, sections, paragraphs, ...) ends the verse
        // and becomes its own cell.
        if (currentVerse) finishCurrentVerse();

        // Marker name plus line index keeps the id unique.
        const cellId = `${bookCode} ${cellChapter}:${marker}:${lineIndex}`;
        const cell = createProcessedCell(cellId, convertUsfmInlineMarkersToHtml(textContent), {
            type: 'text',
            id: cellId,
            bookCode,
            bookName,
            fileName: file.name,
            chapter: cellChapter,
            marker: fullMarker,
            originalLine: trimmedLine,
            originalText: textContent,
            lineIndex,
        } as any);

        cells.push(cell);
        paratextCount++;
        recordLine(lineIndex, line, fullMarker, cellId, true);
    }

    // Flush a verse that runs to the end of the file.
    if (currentVerse) {
        finishCurrentVerse();
    }

    // Files without any \c marker still report chapter 1 when they produced cells.
    if (!seenFirstChapter && cells.length > 0) {
        chapters.add(1);
    }

    if (cells.length === 0) {
        throw new Error(`No content found in USFM file: ${file.name}`);
    }

    if (!bookCodeExtracted && bookCode === 'XXX') {
        console.warn(`[USFM Parser] Book code not extracted from file ${file.name}, using default 'XXX'`);
    }

    return {
        bookCode,
        bookName,
        fileName: file.name,
        cells,
        verseCount,
        paratextCount,
        chapters: Array.from(chapters).sort((a, b) => a - b),
        footnoteCount: 0, // TODO: Extract footnotes if needed
        footnotes: [],
        originalUsfmContent: originalContent,
        lineMappings,
    };
}