diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts index a80293249..a6e221bc7 100644 --- a/src/exportHandler/exportHandler.ts +++ b/src/exportHandler/exportHandler.ts @@ -366,14 +366,14 @@ async function exportCodexContentAsIdmlRoundtrip( importerType === 'biblica' || fileType === 'biblica' || importerType === 'biblica-experimental' || // Backward compatibility - fileType === 'biblica-experimental' || // Backward compatibility - fileName.toLowerCase().endsWith('-biblica.codex'); + fileType === 'biblica-experimental'; // Backward compatibility + // Note: We no longer check filename suffix since importer type is stored in metadata const exporterType = isBiblicaFile ? 'Biblica' : 'Standard'; console.log(`[IDML Export] Processing ${fileName} (corpusMarker: ${corpusMarker}) using ${exporterType} exporter`); // Lookup original attachment by originalFileName or originalName metadata on the notebook (fallback to {bookCode}.idml) - // Note: NewSourceUploaderProvider stores it as "originalName", but some importers use "originalFileName" + // Note: originalFileName now points to the actual deduplicated file in attachments/originals const originalFileName = (codexNotebook.metadata as any)?.originalFileName || (codexNotebook.metadata as any)?.originalName || `${bookCode}.idml`; @@ -463,21 +463,24 @@ async function exportCodexContentAsDocxRoundtrip( continue; } - // Lookup original attachment by originalFileName metadata - const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${bookCode}.docx`; - // Originals are stored under `.project/attachments/files/originals/` (preferred). - // Fallback to legacy `.project/attachments/originals/` if needed. + // Lookup original attachment by originalFileName or originalName metadata + // Note: originalFileName now points to the actual deduplicated file in attachments/originals + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName || + `${bookCode}.docx`; + // Originals are stored under `.project/attachments/originals/` (preferred). + // Fallback to legacy `.project/attachments/files/originals/` if needed. const originalsDirPreferred = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", - "files", "originals" ); const originalsDirLegacy = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", + "files", "originals" ); const preferredUri = vscode.Uri.joinPath(originalsDirPreferred, originalFileName); @@ -516,7 +519,7 @@ async function exportCodexContentAsDocxRoundtrip( ); } -// PDF Round-trip export +// PDF Round-trip export: Uses DOCX exporter then converts DOCX→PDF async function exportCodexContentAsPdfRoundtrip( userSelectedPath: string, filesToExport: string[], @@ -540,10 +543,10 @@ async function exportCodexContentAsPdfRoundtrip( async (progress) => { const increment = filesToExport.length > 0 ? 
100 / filesToExport.length : 100; - // Import PDF exporter - const { exportPdfWithTranslations } = await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/pdf/pdfExporter"); + // Import DOCX exporter (we'll use it to create DOCX, then convert to PDF) + const { exportDocxWithTranslations } = await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter"); - // For each selected codex file, find its original attachment and create a translated copy in export folder + // For each selected codex file, export as DOCX then convert to PDF for (const [index, filePath] of filesToExport.entries()) { progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment }); try { @@ -551,46 +554,136 @@ async function exportCodexContentAsPdfRoundtrip( const fileName = basename(file.fsPath); const bookCode = fileName.split(".")[0] || ""; - console.log(`[PDF Export] Processing ${fileName} using PDF exporter`); + console.log(`[PDF Export] Processing ${fileName} using DOCX exporter + docx2pdf`); // Read codex notebook const codexNotebook = await readCodexNotebookFromUri(file); // Check if this is a PDF file const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker || ''; - const isPdfFile = corpusMarker === 'pdf' || corpusMarker === 'pdf-importer' || corpusMarker === 'pdf-sentence'; + const isPdfFile = corpusMarker === 'pdf'; if (!isPdfFile) { console.warn(`[PDF Export] Skipping ${fileName} - not imported with PDF importer (corpusMarker: ${corpusMarker})`); vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with PDF importer`); continue; } - // Lookup original attachment by originalFileName metadata - const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${bookCode}.pdf`; - const originalsDir = vscode.Uri.joinPath( + // Lookup original attachment by originalFileName or originalName metadata + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName || + `${bookCode}.pdf`; + + // Check both preferred and legacy locations for converted DOCX + const originalsDirPreferred = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", + "files", "originals" ); - const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + const originalsDirLegacy = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "originals" + ); + + // Get converted DOCX filename from metadata or derive from PDF filename + const pdfMetadata = (codexNotebook.metadata as any)?.pdfDocumentMetadata; + const convertedDocxFileName = pdfMetadata?.convertedDocxFileName || originalFileName.replace(/\.pdf$/i, '.docx'); + + // Try preferred location first, then legacy + const convertedDocxUriPreferred = vscode.Uri.joinPath(originalsDirPreferred, convertedDocxFileName); + const convertedDocxUriLegacy = vscode.Uri.joinPath(originalsDirLegacy, convertedDocxFileName); + + let docxUri = convertedDocxUriPreferred; - // Load original PDF - const pdfData = await vscode.workspace.fs.readFile(originalFileUri); + try { + // Try preferred location first + await vscode.workspace.fs.stat(convertedDocxUriPreferred); + } catch { + // Fall back to legacy location + try { + await vscode.workspace.fs.stat(convertedDocxUriLegacy); + docxUri = convertedDocxUriLegacy; + } catch { + // If no converted DOCX exists, we need to convert PDF→DOCX first + // This should have been done during import, but handle gracefully + 
console.warn(`[PDF Export] No converted DOCX found at ${convertedDocxUriPreferred.fsPath} or ${convertedDocxUriLegacy.fsPath}`); + throw new Error(`No converted DOCX file found. Please re-import the PDF file.`); + } + } + + // Read converted DOCX + const docxBytes = await vscode.workspace.fs.readFile(docxUri); + const docxData = docxBytes.buffer.slice(docxBytes.byteOffset, docxBytes.byteOffset + docxBytes.byteLength) as ArrayBuffer; + console.log(`[PDF Export] Using converted DOCX: ${docxUri.fsPath}`); + + progress.report({ message: `Exporting DOCX for ${fileName}...`, increment: increment * 0.5 }); + + // Debug: Check cell metadata structure + console.log(`[PDF Export] Codex notebook has ${codexNotebook.cells.length} cells`); + if (codexNotebook.cells.length > 0) { + const firstCell = codexNotebook.cells[0]; + const cellMeta = firstCell.metadata as any; + console.log(`[PDF Export] First cell metadata:`, JSON.stringify({ + hasValue: !!firstCell.value, + valueLength: firstCell.value?.length || 0, + valuePreview: firstCell.value?.substring(0, 100) || '', + paragraphIndex: cellMeta?.paragraphIndex, + paragraphId: cellMeta?.paragraphId, + hasData: !!cellMeta?.data, + dataKeys: cellMeta?.data ? Object.keys(cellMeta.data) : [] + }, null, 2)); + } + + // Step 1: Use DOCX exporter to create translated DOCX + const updatedDocxData = await exportDocxWithTranslations( + docxData, + codexNotebook.cells + ); + + // Step 2: Save translated DOCX to attachments/files/temporary folder + const temporaryDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "files", + "temporary" + ); + + // Ensure temporary directory exists + try { + await vscode.workspace.fs.createDirectory(temporaryDir); + } catch { + // Directory may already exist + } - // Use PDF exporter to create translated PDF - // Convert Uint8Array to proper ArrayBuffer for pdf-lib - const pdfBuffer = new Uint8Array(pdfData).buffer as ArrayBuffer; - const updatedPdfData = await exportPdfWithTranslations(pdfBuffer, codexNotebook.cells); + // Get original PDF filename from metadata or derive from codex filename + const originalPdfFileName = originalFileName || fileName.replace(/\.codex$/i, '.pdf'); + const translatedDocxFileName = originalPdfFileName.replace(/\.pdf$/i, '_translated.docx'); + const translatedDocxUri = vscode.Uri.joinPath(temporaryDir, translatedDocxFileName); - // Save updated PDF into the chosen export folder + await vscode.workspace.fs.writeFile( + translatedDocxUri, + new Uint8Array(updatedDocxData) + ); + console.log(`[PDF Export] Saved translated DOCX to: ${translatedDocxUri.fsPath}`); + + progress.report({ message: `Converting DOCX to PDF for ${fileName}...`, increment: increment * 0.5 }); + + // Step 3: Convert DOCX → PDF using docx2pdf via extension host + const pdfData = await convertDocxToPdfViaExtension(translatedDocxUri.fsPath); + + // Step 4: Save translated PDF to user's selected destination const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); - const translatedName = originalFileName.replace(/\.pdf$/i, `_${timestamp}_translated.pdf`); - const translatedUri = vscode.Uri.joinPath(exportFolder, translatedName); + const translatedPdfName = originalFileName.replace(/\.pdf$/i, `_${timestamp}_translated.pdf`); + const translatedPdfUri = vscode.Uri.joinPath(exportFolder, translatedPdfName); - await vscode.workspace.fs.writeFile(translatedUri, new Uint8Array(updatedPdfData)); + await vscode.workspace.fs.writeFile(translatedPdfUri, new Uint8Array(pdfData)); + console.log(`[PDF Export] 
Saved translated PDF to: ${translatedPdfUri.fsPath}`);
-            console.log(`[PDF Export] ✓ Exported ${translatedName}`);
+            console.log(`[PDF Export] ✓ Exported ${translatedPdfName}`);
         } catch (error) {
             console.error(`[PDF Export] Error exporting ${filePath}:`, error);
@@ -603,6 +696,127 @@ async function exportCodexContentAsPdfRoundtrip(
     );
 }
 
+/**
+ * Converts a DOCX file to PDF using docx2pdf via a Python script
+ */
+async function convertDocxToPdfViaExtension(docxPath: string): Promise<ArrayBuffer> {
+    try {
+        // Get extension path
+        const extension = vscode.extensions.getExtension('project-accelerate.codex-editor-extension');
+        if (!extension) {
+            throw new Error('Could not find Codex Editor extension');
+        }
+        const scriptPath = path.join(extension.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'docx_to_pdf.py');
+        const tempDir = path.join(extension.extensionPath, '.temp');
+        if (!fs.existsSync(tempDir)) {
+            fs.mkdirSync(tempDir, { recursive: true });
+        }
+        const pdfPath = path.join(tempDir, `converted_${Date.now()}.pdf`);
+
+        // Normalize path separators for the shell command (forward slashes also work on Windows)
+        const escapedScriptPath = scriptPath.replace(/\\/g, '/');
+        const escapedDocxPath = docxPath.replace(/\\/g, '/');
+        const escapedPdfPath = pdfPath.replace(/\\/g, '/');
+
+        // Run Python script with file paths (no base64 in command line)
+        const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';
+        const command = `${pythonCmd} "${escapedScriptPath}" "${escapedDocxPath}" "${escapedPdfPath}"`;
+
+        console.log(`[DOCX→PDF] Converting DOCX to PDF...`);
+        console.log(`[DOCX→PDF] Command: ${command}`);
+        console.log(`[DOCX→PDF] DOCX path: ${docxPath}`);
+        console.log(`[DOCX→PDF] PDF output path: ${pdfPath}`);
+
+        // Verify DOCX file exists
+        if (!fs.existsSync(docxPath)) {
+            throw new Error(`DOCX file not found: ${docxPath}`);
+        }
+
+        let stdout: string;
+        let stderr: string;
+        try {
+            const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 });
+            stdout = result.stdout;
+            stderr = result.stderr;
+        } catch (execError: any) {
+            // execAsync throws an error if exit code is non-zero
+            stdout = execError.stdout || '';
+            stderr = execError.stderr || '';
+            console.error(`[DOCX→PDF] Python script execution failed: ${execError.message}`);
+            console.error(`[DOCX→PDF] Exit code: ${execError.code}`);
+            console.error(`[DOCX→PDF] Stdout: ${stdout}`);
+            console.error(`[DOCX→PDF] Stderr: ${stderr}`);
+
+            // Try to parse a structured error from stdout if it's JSON
+            if (stdout.trim()) {
+                try {
+                    const errorResult = JSON.parse(stdout);
+                    if (errorResult.error) {
+                        throw new Error(`DOCX to PDF conversion failed: ${errorResult.error}`);
+                    }
+                } catch (jsonErr) {
+                    // Re-throw our structured error; anything else means stdout was not JSON,
+                    // so fall through and use stderr/stdout as the error message
+                    if (jsonErr instanceof Error && jsonErr.message.startsWith('DOCX to PDF conversion failed')) {
+                        throw jsonErr;
+                    }
+                }
+            }
+
+            throw new Error(`DOCX to PDF conversion failed: ${stderr || stdout || execError.message}`);
+        }
+
+        // Log stderr for debugging
+        if (stderr) {
+            console.log(`[DOCX→PDF] Python stderr: ${stderr}`);
+        }
+
+        // Log stdout for debugging
+        console.log(`[DOCX→PDF] Python stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}`);
+
+        if (!stdout.trim()) {
+            throw new Error('Python script returned no output');
+        }
+
+        let result;
+        try {
+            result = JSON.parse(stdout);
+        } catch (parseError) {
+            console.error(`[DOCX→PDF] Failed to parse Python output as JSON: ${parseError}`);
+            console.error(`[DOCX→PDF] Raw stdout: ${stdout}`);
+            throw new Error(`Failed to parse conversion result: ${parseError instanceof Error ? parseError.message : 'Unknown error'}.
Output: ${stdout.substring(0, 200)}`); + } + + if (result.success) { + // Verify PDF file exists + if (!fs.existsSync(pdfPath)) { + throw new Error(`PDF file was not created at: ${pdfPath}`); + } + + // Read the generated PDF + const pdfData = fs.readFileSync(pdfPath); + console.log(`[DOCX→PDF] Read PDF file: ${pdfData.length} bytes`); + + // Clean up temp PDF file + try { + fs.unlinkSync(pdfPath); + } catch (e) { + console.warn(`[DOCX→PDF] Could not delete temp PDF: ${e}`); + } + + console.log(`[DOCX→PDF] ✓ Successfully converted DOCX to PDF`); + return pdfData.buffer.slice(pdfData.byteOffset, pdfData.byteOffset + pdfData.byteLength) as ArrayBuffer; + } else { + const errorMsg = result.error || 'DOCX to PDF conversion failed'; + console.error(`[DOCX→PDF] Conversion failed: ${errorMsg}`); + console.error(`[DOCX→PDF] Full result object:`, JSON.stringify(result, null, 2)); + throw new Error(errorMsg); + } + } catch (err) { + if (err instanceof Error && err.message.includes('DOCX to PDF conversion failed')) { + throw err; // Re-throw our custom errors + } + console.error(`[DOCX→PDF] Unexpected error: ${err}`); + throw err instanceof Error ? err : new Error(`Failed to convert DOCX to PDF: ${err}`); + } +} + /** * RTF Round-trip export using Pandoc * COMMENTED OUT - RTF importer disabled @@ -676,7 +890,9 @@ async function exportCodexContentAsPdfRoundtrip( : vscode.Uri.file(filePath.replace(/\.codex$/, '.source').replace(/files[/\\]target/, '.project/sourceTexts')); const sourceNotebook = await readCodexNotebookFromUri(sourceFile); - const originalFileName = (sourceNotebook.metadata as any)?.originalFileName || `${bookCode}.rtf`; + const originalFileName = (sourceNotebook.metadata as any)?.originalFileName || + (sourceNotebook.metadata as any)?.originalName || + `${bookCode}.rtf`; // Build translation map from codex cells const translations: { [paragraphIndex: number]: string; } = {}; @@ -840,7 +1056,9 @@ async function exportCodexContentAsObsRoundtrip( console.log('[OBS Export] Generated markdown with translations, length:', updatedMarkdown.length); // Determine output filename - const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${fileName.split('.')[0]}.md`; + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName || + `${fileName.split('.')[0]}.md`; const baseFileName = originalFileName.replace(/\.md$/i, ''); // Create timestamped filename @@ -920,7 +1138,8 @@ async function exportCodexContentAsUsfmRoundtrip( // Get original file name from metadata with fallback // Try multiple sources: codex metadata, source notebook metadata, or construct from bookCode - let metadataOriginalFileName = (codexNotebook.metadata as any)?.originalFileName; + let metadataOriginalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName; const metadataBookCode = (codexNotebook.metadata as any)?.bookCode; const finalBookCode = metadataBookCode || bookCode; @@ -935,7 +1154,8 @@ async function exportCodexContentAsUsfmRoundtrip( sourceFileName ); const sourceNotebook = await readCodexNotebookFromUri(sourceFileUri); - metadataOriginalFileName = (sourceNotebook.metadata as any)?.originalFileName; + metadataOriginalFileName = (sourceNotebook.metadata as any)?.originalFileName || + (sourceNotebook.metadata as any)?.originalName; if (metadataOriginalFileName) { console.log(`[USFM Export] Found originalFileName in source notebook: ${metadataOriginalFileName}`); } @@ 
-1070,6 +1290,173 @@ async function exportCodexContentAsUsfmRoundtrip( ); } +/** + * Spreadsheet (CSV/TSV) Round-trip export + * Exports codex notebooks back to CSV/TSV format with translations + */ +async function exportCodexContentAsSpreadsheetRoundtrip( + userSelectedPath: string, + filesToExport: string[], + _options?: ExportOptions +) { + const workspaceFolders = vscode.workspace.workspaceFolders; + if (!workspaceFolders) { + vscode.window.showErrorMessage("No workspace folder found."); + return; + } + + const exportFolder = vscode.Uri.file(userSelectedPath); + await vscode.workspace.fs.createDirectory(exportFolder); + + return vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: "Exporting Spreadsheet Round-trip", + cancellable: false, + }, + async (progress) => { + const increment = filesToExport.length > 0 ? 100 / filesToExport.length : 100; + + // Import spreadsheet exporter + const { exportSpreadsheetWithTranslations, getDelimiterFromMetadata, getSpreadsheetExtension } = + await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter"); + + for (const [index, filePath] of filesToExport.entries()) { + progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment }); + try { + const file = vscode.Uri.file(filePath); + const fileName = basename(file.fsPath); + + console.log(`[Spreadsheet Export] Processing ${fileName}`); + + // Read codex notebook + const codexNotebook = await readCodexNotebookFromUri(file); + + // Check if this is a spreadsheet file + const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; + const importerType = (codexNotebook.metadata as any)?.importerType; + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName || + ''; + + // Check for any spreadsheet importer type + const isSpreadsheet = + importerType === 'spreadsheet' || + importerType === 'spreadsheet-csv' || + importerType === 'spreadsheet-tsv' || + corpusMarker === 'spreadsheet' || + corpusMarker === 'spreadsheet-csv' || + corpusMarker === 'spreadsheet-tsv'; + + if (!isSpreadsheet) { + console.warn(`[Spreadsheet Export] Skipping ${fileName} - not imported with spreadsheet importer (importerType: ${importerType}, corpusMarker: ${corpusMarker})`); + vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with spreadsheet importer`); + continue; + } + + // Get importer type and delimiter + const notebookImporterType = (codexNotebook.metadata as any)?.importerType; + const delimiter = getDelimiterFromMetadata(codexNotebook.metadata); + const extension = getSpreadsheetExtension(originalFileName, delimiter, notebookImporterType); + const columnHeaders = (codexNotebook.metadata as any)?.columnHeaders; + const sourceColumnIndex = (codexNotebook.metadata as any)?.sourceColumnIndex; + + console.log(`[Spreadsheet Export] Processing ${fileName}`); + console.log(`[Spreadsheet Export] - importerType: ${notebookImporterType}`); + console.log(`[Spreadsheet Export] - originalFileName: ${originalFileName}`); + console.log(`[Spreadsheet Export] - extension: ${extension}`); + console.log(`[Spreadsheet Export] - sourceColumnIndex: ${sourceColumnIndex}`); + console.log(`[Spreadsheet Export] - columnHeaders: ${columnHeaders ? 
columnHeaders.join(', ') : 'none'}`); + + // Get original file content from metadata (stored during import) + let originalFileContent: string | undefined = (codexNotebook.metadata as any)?.originalFileContent; + + if (originalFileContent) { + console.log(`[Spreadsheet Export] ✓ Found originalFileContent in metadata (${originalFileContent.length} chars)`); + console.log(`[Spreadsheet Export] First 200 chars: ${originalFileContent.substring(0, 200)}`); + } else { + console.log(`[Spreadsheet Export] No originalFileContent in metadata, trying file system...`); + + // Fallback: try to read from attachments folder (for older imports) + const originalsDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + '.project', + 'attachments', + 'files', + 'originals' + ); + const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + + console.log(`[Spreadsheet Export] Looking for original file at: ${originalFileUri.fsPath}`); + + try { + const fileData = await vscode.workspace.fs.readFile(originalFileUri); + originalFileContent = Buffer.from(fileData).toString('utf-8'); + console.log(`[Spreadsheet Export] ✓ Loaded original file (${originalFileContent.length} chars)`); + } catch (err) { + console.warn(`[Spreadsheet Export] File not found at preferred location: ${err}`); + // Try legacy location + try { + const legacyDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + '.project', + 'attachments', + 'originals' + ); + const legacyUri = vscode.Uri.joinPath(legacyDir, originalFileName); + console.log(`[Spreadsheet Export] Trying legacy location: ${legacyUri.fsPath}`); + const fileData = await vscode.workspace.fs.readFile(legacyUri); + originalFileContent = Buffer.from(fileData).toString('utf-8'); + console.log(`[Spreadsheet Export] ✓ Loaded from legacy location (${originalFileContent.length} chars)`); + } catch (legacyErr) { + console.warn(`[Spreadsheet Export] ✗ Could not find original file anywhere. Will use fallback reconstruction.`); + } + } + } + + console.log(`[Spreadsheet Export] Metadata: importerType="${notebookImporterType}", delimiter="${delimiter === '\t' ? 'TAB' : delimiter}", sourceColumnIndex=${sourceColumnIndex}, hasOriginalContent=${!!originalFileContent}`); + + // Export with translations - true round-trip using original file content + const exportedContent = exportSpreadsheetWithTranslations( + codexNotebook.cells as any, + { + delimiter, + originalFileName, + originalFileContent, + columnHeaders, + sourceColumnIndex, + importerType: notebookImporterType, + } + ); + + // Generate output filename + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const baseName = originalFileName + ? originalFileName.replace(/\.(csv|tsv)$/i, '') + : fileName.replace(/\.codex$/i, ''); + const outputFileName = `${baseName}_translated_${timestamp}.${extension}`; + const outputUri = vscode.Uri.joinPath(exportFolder, outputFileName); + + // Write the file + await vscode.workspace.fs.writeFile( + outputUri, + Buffer.from(exportedContent, 'utf-8') + ); + + console.log(`[Spreadsheet Export] ✓ Exported ${outputFileName}`); + } catch (error) { + console.error(`[Spreadsheet Export] Error exporting ${filePath}:`, error); + vscode.window.showErrorMessage( + `Failed to export ${basename(filePath)}: ${error instanceof Error ? 
error.message : 'Unknown error'}` + ); + } + } + + vscode.window.showInformationMessage(`Spreadsheet round-trip export completed to ${userSelectedPath}`); + } + ); +} + /** * TMS (Translation Memory System) Round-trip export * Supports both TMX and XLIFF formats @@ -1116,7 +1503,9 @@ async function exportCodexContentAsTmsRoundtrip( const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; const fileFormat = (codexNotebook.metadata as any)?.fileFormat || corpusMarker; // Fallback to corpusMarker for old files const fileType = (codexNotebook.metadata as any)?.fileType; // Direct file type field (tmx or xliff) - const originalFileName = (codexNotebook.metadata as any)?.originalFileName; // Get original filename (stored as originalFileName in metadata) + // Get original filename - this now points to the actual deduplicated file in attachments/originals + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName; if (corpusMarker !== 'tms' && fileFormat !== 'tms-tmx' && fileFormat !== 'tms-xliff') { console.warn(`[TMS Export] Skipping ${fileName} - not imported with TMS importer (corpusMarker: ${corpusMarker}, fileFormat: ${fileFormat})`); @@ -1233,12 +1622,16 @@ async function exportCodexContentAsRebuild( try { const file = vscode.Uri.file(filePath); const codexNotebook = await readCodexNotebookFromUri(file); - const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; - const importerType = (codexNotebook.metadata as any)?.importerType; - const fileType = (codexNotebook.metadata as any)?.fileType; - const originalFileName = (codexNotebook.metadata as any)?.originalFileName; + const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker ? String((codexNotebook.metadata as any).corpusMarker).trim() : ''; + const importerType = (codexNotebook.metadata as any)?.importerType ? String((codexNotebook.metadata as any).importerType).trim() : ''; + const fileType = (codexNotebook.metadata as any)?.fileType ? String((codexNotebook.metadata as any).fileType).trim() : ''; + const originalFileName = (codexNotebook.metadata as any)?.originalFileName + ? String((codexNotebook.metadata as any).originalFileName).trim() + : (codexNotebook.metadata as any)?.originalName + ? 
String((codexNotebook.metadata as any).originalName).trim() + : ''; - console.log(`[Rebuild Export] File: ${basename(filePath)}, corpusMarker: ${corpusMarker}, importerType: ${importerType}, fileType: ${fileType}`); + console.log(`[Rebuild Export] File: ${basename(filePath)}, corpusMarker: "${corpusMarker}", importerType: "${importerType}", fileType: "${fileType}"`); // Group by supported types if (corpusMarker === 'docx-roundtrip') { @@ -1248,7 +1641,7 @@ async function exportCodexContentAsRebuild( corpusMarker === 'biblica' || corpusMarker === 'biblica-idml' || corpusMarker === 'idml-roundtrip' || - corpusMarker.startsWith('idml-') || + (corpusMarker && corpusMarker.startsWith('idml-')) || importerType === 'biblica' || fileType === 'biblica' || importerType === 'biblica-experimental' || // Backward compatibility @@ -1258,22 +1651,18 @@ async function exportCodexContentAsRebuild( // Includes Biblica importer which uses the same IDML format filesByType['idml'] = filesByType['idml'] || []; filesByType['idml'].push(filePath); - // } else if ( - // corpusMarker === 'pdf' || - // corpusMarker === 'pdf-importer' || // Backward compatibility - // corpusMarker === 'pdf-sentence' // Backward compatibility - // ) { - // // PDF files use the PDF exporter - // filesByType['pdf'] = filesByType['pdf'] || []; - // filesByType['pdf'].push(filePath); - // } else if ( - // corpusMarker === 'rtf' || - // corpusMarker === 'rtf-pandoc' || // Backward compatibility - // importerType === 'rtf-pandoc' - // ) { - // // RTF files use the Pandoc RTF exporter - // filesByType['rtf'] = filesByType['rtf'] || []; - // filesByType['rtf'].push(filePath); + } else if ( + corpusMarker === 'pdf' || + corpusMarker === 'pdf-importer' || // Backward compatibility + corpusMarker === 'pdf-sentence' || // Backward compatibility + importerType === 'pdf' || + fileType === 'pdf' || + (originalFileName && /\.pdf$/i.test(originalFileName)) // Fallback: check filename extension + ) { + // PDF files use the PDF exporter (DOCX exporter + docx2pdf conversion) + console.log(`[Rebuild Export] ✓ Detected PDF file: ${basename(filePath)} (corpusMarker: "${corpusMarker}", importerType: "${importerType}", fileType: "${fileType}")`); + filesByType['pdf'] = filesByType['pdf'] || []; + filesByType['pdf'].push(filePath); } else if (corpusMarker === 'obs' || importerType === 'obs') { // OBS (Open Bible Stories) markdown files use the OBS exporter // Fallback: also detect by importerType for older files @@ -1304,8 +1693,23 @@ async function exportCodexContentAsRebuild( // USFM files use the USFM round-trip exporter filesByType['usfm'] = filesByType['usfm'] || []; filesByType['usfm'].push(filePath); + } else if ( + corpusMarker === 'spreadsheet' || + corpusMarker === 'spreadsheet-csv' || + corpusMarker === 'spreadsheet-tsv' || + importerType === 'spreadsheet' || + importerType === 'spreadsheet-csv' || + importerType === 'spreadsheet-tsv' || + (originalFileName && /\.(csv|tsv)$/i.test(originalFileName)) + ) { + // Spreadsheet files (CSV/TSV) use the spreadsheet round-trip exporter + console.log(`[Rebuild Export] ✓ Detected Spreadsheet file: ${basename(filePath)} (corpusMarker: "${corpusMarker}", importerType: "${importerType}")`); + filesByType['spreadsheet'] = filesByType['spreadsheet'] || []; + filesByType['spreadsheet'].push(filePath); } else { - unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || 'unknown' }); + // Log what we detected for debugging + console.log(`[Rebuild Export] Unsupported file detected: 
${basename(filePath)}, corpusMarker: ${corpusMarker}, importerType: ${importerType}, fileType: ${fileType}, originalFileName: ${originalFileName}`); + unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || fileType || 'unknown' }); } } catch (error) { console.error(`[Rebuild Export] Error analyzing ${filePath}:`, error); @@ -1356,23 +1760,21 @@ async function exportCodexContentAsRebuild( } } - // Export PDF files - // COMMENTED OUT - PDF exporter disabled (not working properly) - /* if (filesByType['pdf']?.length > 0) { - console.log(`[Rebuild Export] Exporting ${filesByType['pdf'].length} PDF file(s) to DOCX...`); + // Export PDF files (uses DOCX exporter + docx2pdf conversion) + if (filesByType['pdf']?.length > 0) { + console.log(`[Rebuild Export] Exporting ${filesByType['pdf'].length} PDF file(s)...`); progress.report({ - message: `Exporting ${filesByType['pdf'].length} PDF file(s) to DOCX...`, + message: `Exporting ${filesByType['pdf'].length} PDF file(s)...`, increment: 20 }); try { - const { exportPdfAsDocx } = await import("./pdfDocxExporter"); - await exportPdfAsDocx(userSelectedPath, filesByType['pdf']); + await exportCodexContentAsPdfRoundtrip(userSelectedPath, filesByType['pdf'], options); processedCount += filesByType['pdf'].length; } catch (error) { console.error('[Rebuild Export] PDF export failed:', error); vscode.window.showErrorMessage(`PDF export failed: ${error instanceof Error ? error.message : 'Unknown error'}`); } - } */ + } // Export RTF files using Pandoc // COMMENTED OUT - RTF importer disabled @@ -1439,6 +1841,22 @@ async function exportCodexContentAsRebuild( } } + // Export Spreadsheet (CSV/TSV) files + if (filesByType['spreadsheet']?.length > 0) { + console.log(`[Rebuild Export] Exporting ${filesByType['spreadsheet'].length} Spreadsheet file(s)...`); + progress.report({ + message: `Exporting ${filesByType['spreadsheet'].length} Spreadsheet file(s)...`, + increment: 20 + }); + try { + await exportCodexContentAsSpreadsheetRoundtrip(userSelectedPath, filesByType['spreadsheet'], options); + processedCount += filesByType['spreadsheet'].length; + } catch (error) { + console.error('[Rebuild Export] Spreadsheet export failed:', error); + vscode.window.showErrorMessage(`Spreadsheet export failed: ${error instanceof Error ? error.message : 'Unknown error'}`); + } + } + progress.report({ message: "Complete", increment: 30 }); // Show summary @@ -1462,7 +1880,7 @@ async function exportCodexContentAsRebuild( .join('\n'); vscode.window.showWarningMessage( - `The following files were skipped (unsupported or coming soon):\n${unsupportedList}\n\nSupported types: DOCX, IDML, Biblica, PDF`, + `The following files were skipped (unsupported or coming soon):\n${unsupportedList}\n\nSupported types: DOCX, IDML, Biblica, PDF, OBS, TMS, USFM, CSV/TSV`, { modal: false } ); } diff --git a/src/projectManager/projectExportView.ts b/src/projectManager/projectExportView.ts index 48c6b13da..22118b896 100644 --- a/src/projectManager/projectExportView.ts +++ b/src/projectManager/projectExportView.ts @@ -279,6 +279,13 @@ function getWebviewContent( opacity: 0.8; align-self: flex-start; } + .format-tag.format-tag-roundtrip { + background-color: rgba(34, 197, 94, 0.15) !important; + color: var(--vscode-charts-green, #16a34a) !important; + border: 1px solid rgba(34, 197, 94, 0.3) !important; + border-radius: 4px; + opacity: 1; + } @@ -287,6 +294,27 @@ function getWebviewContent( ? `

                Select Export Format
+
+                [Round-trip Export option card - HTML markup garbled in this view; recoverable content:]
+                Title: Round-trip Export
+                Description: Intelligently detects file type and exports back the original file you imported with applied translations
+                Format tags: USFM, DOCX, OBS, TMS, Markdown, CSV/TSV, IDML, Biblica Study Notes
+
@@ -327,25 +355,6 @@ function getWebviewContent(
-
-                [Rebuild Export option card removed - HTML markup garbled in this view; recoverable content:]
-                Title: Rebuild Export
-                Description: Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS, USFM)
-                Format tags: DOCX, IDML, Biblica, OBS, TMS, USFM
-
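
Note: both the exportHandler changes above (convertDocxToPdfViaExtension) and the provider handlers below shell out to Python through an `execAsync` helper whose definition is not part of this diff. A minimal sketch of the presumed setup follows; the helper name and the JSON-over-stdout contract with the Python scripts are inferred from how the diff uses them, not confirmed definitions:

import { exec } from 'child_process';
import { promisify } from 'util';

// Promisified child_process.exec. On exit code 0 it resolves with
// { stdout, stderr }; on a non-zero exit it rejects with an Error that
// still carries .stdout, .stderr, and .code, which is what the catch
// blocks in this diff rely on to recover the Python script's output.
const execAsync = promisify(exec);

// The pdf_to_docx.py / docx_to_pdf.py scripts appear to print one JSON
// object to stdout, e.g. {"success": true} or {"error": "..."}.
interface PythonScriptResult {
    success?: boolean;
    error?: string;
    [key: string]: unknown;
}

async function runConversionScript(command: string): Promise<PythonScriptResult> {
    const { stdout } = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 });
    return JSON.parse(stdout) as PythonScriptResult;
}
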
diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index ac0291bef..c389a96d8 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -14,6 +14,7 @@ import { handleFinalizeAudioImport, } from "./importers/audioSplitter"; import { ProcessedNotebook } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/common"; +import type { SpreadsheetNotebookMetadata } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata"; import { NotebookPreview, CustomNotebookMetadata } from "../../../types"; import { CodexCell } from "../../utils/codexNotebookUtils"; import { CodexCellTypes } from "../../../types/enums"; @@ -318,6 +319,333 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide error: err instanceof Error ? err.message : 'Unknown error' }); } + } else if (message.command === "convertPdfToDocx") { + const { requestId, pdfBase64, outputPath } = message as { requestId: string; pdfBase64: string; outputPath?: string; }; + try { + const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'pdf_to_docx.py'); + + // Verify script exists + if (!fs.existsSync(scriptPath)) { + throw new Error(`Python script not found at: ${scriptPath}`); + } + + // Create temp directory + const tempDir = path.join(this.context.extensionPath, '.temp'); + if (!fs.existsSync(tempDir)) { + fs.mkdirSync(tempDir, { recursive: true }); + } + + // Write base64 PDF to temporary file to avoid command line length limits + const tempPdfPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.pdf`); + const pdfBuffer = Buffer.from(pdfBase64, 'base64'); + fs.writeFileSync(tempPdfPath, pdfBuffer); + + // Use temp file if outputPath not provided + const docxPath = outputPath || path.join(tempDir, `converted_${Date.now()}.docx`); + + // Verify PDF file was written + if (!fs.existsSync(tempPdfPath)) { + throw new Error(`Failed to write PDF file to: ${tempPdfPath}`); + } + + // Run Python script with file paths + // On Windows, use proper quoting; on Unix, paths should work as-is + const pythonCmd = process.platform === 'win32' ? 
'python' : 'python3'; + + // Quote paths properly for Windows (use double quotes and escape inner quotes) + const quotePath = (p: string) => { + if (process.platform === 'win32') { + // Windows: use double quotes and escape any existing quotes + return `"${p.replace(/"/g, '\\"')}"`; + } else { + // Unix: use single quotes and escape any existing quotes + return `'${p.replace(/'/g, "\\'")}'`; + } + }; + + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempPdfPath)} ${quotePath(docxPath)}`; + + console.log(`[PDF→DOCX] Converting PDF to DOCX...`); + console.log(`[PDF→DOCX] Command: ${command}`); + + let stdout = ''; + let stderr = ''; + try { + const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 }); + stdout = result.stdout || ''; + stderr = result.stderr || ''; + } catch (execErr: any) { + // execAsync throws an error when command fails, but stdout/stderr are in the error object + stdout = execErr.stdout || ''; + stderr = execErr.stderr || ''; + const errorMessage = execErr.message || 'Unknown error'; + + // If we have stdout that might be JSON, try to parse it + if (stdout.trim()) { + try { + const result = JSON.parse(stdout); + if (result.error) { + throw new Error(`Python script error: ${result.error}`); + } + } catch (parseErr) { + // Not JSON, use the exec error + } + } + + // Include both stdout and stderr in error message + const fullError = [ + errorMessage, + stdout ? `\nStdout: ${stdout}` : '', + stderr ? `\nStderr: ${stderr}` : '' + ].filter(Boolean).join(''); + + throw new Error(fullError); + } + + // Clean up temp PDF file + try { + if (fs.existsSync(tempPdfPath)) { + fs.unlinkSync(tempPdfPath); + } + } catch (cleanupErr) { + console.warn(`[PDF→DOCX] Could not delete temp PDF: ${cleanupErr}`); + } + + // Log progress messages from stderr (Python script sends progress updates there) + if (stderr) { + try { + // Try to parse JSON progress messages + const stderrLines = stderr.split('\n').filter(line => line.trim()); + for (const line of stderrLines) { + try { + const progressMsg = JSON.parse(line); + if (progressMsg.info) { + console.log(`[PDF→DOCX] ${progressMsg.info}`); + } + } catch { + // Not JSON, log as-is if it's not a success message + if (line.trim() && !line.includes('"success":true')) { + console.log(`[PDF→DOCX] ${line}`); + } + } + } + } catch { + // If parsing fails, just log the stderr + if (!stdout.includes('"success":true')) { + console.warn(`[PDF→DOCX] Python stderr: ${stderr}`); + } + } + } + + // Parse JSON result + let result; + try { + result = JSON.parse(stdout); + } catch (parseErr) { + throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. 
Stderr: ${stderr}`); + } + + if (result.success) { + console.log(`[PDF→DOCX] ✓ Successfully converted PDF to DOCX`); + + // Verify the DOCX file exists and has content + if (!fs.existsSync(docxPath)) { + throw new Error(`DOCX file not found at: ${docxPath}`); + } + + const fileStats = fs.statSync(docxPath); + if (fileStats.size === 0) { + throw new Error(`DOCX file is empty at: ${docxPath}`); + } + + console.log(`[PDF→DOCX] Reading DOCX file (${fileStats.size} bytes)...`); + + // For large files (>50MB), save directly to workspace and send file path instead of base64 + // This avoids memory issues and webview message size limits + const LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50MB + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; + + if (fileStats.size > LARGE_FILE_THRESHOLD && workspaceFolder) { + console.log(`[PDF→DOCX] Large file detected (${fileStats.size} bytes), saving to workspace instead of sending via message...`); + + // Save DOCX to temporary location in workspace + const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); + await vscode.workspace.fs.createDirectory(tempDir); + + const tempDocxUri = vscode.Uri.joinPath(tempDir, `pdf_conversion_${requestId}.docx`); + const docxBuffer = fs.readFileSync(docxPath); + await vscode.workspace.fs.writeFile(tempDocxUri, new Uint8Array(docxBuffer)); + + console.log(`[PDF→DOCX] Saved large DOCX to workspace: ${tempDocxUri.fsPath}`); + + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: true, + docxFilePath: tempDocxUri.fsPath, // Send file path instead of base64 + outputPath: docxPath, + isLargeFile: true + }); + } else { + // For smaller files, send base64 as before + const docxBuffer = fs.readFileSync(docxPath); + const docxBase64 = docxBuffer.toString('base64'); + + // Verify base64 encoding is valid + if (!docxBase64 || docxBase64.length === 0) { + throw new Error('Failed to encode DOCX file to base64'); + } + + console.log(`[PDF→DOCX] Sending DOCX data to webview (${docxBase64.length} base64 chars)...`); + + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: true, + docxBase64: docxBase64, + outputPath: docxPath, + isLargeFile: false + }); + } + } else { + throw new Error(result.error || 'Conversion failed'); + } + } catch (err) { + const errorMessage = err instanceof Error ? 
err.message : 'Unknown error'; + console.error('[NEW SOURCE UPLOADER] PDF→DOCX conversion failed:', err); + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: false, + error: errorMessage + }); + } + } else if (message.command === "convertDocxToPdf") { + const { requestId, docxBase64, outputPath } = message as { requestId: string; docxBase64: string; outputPath?: string; }; + try { + const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'docx_to_pdf.py'); + + // Verify script exists + if (!fs.existsSync(scriptPath)) { + throw new Error(`Python script not found at: ${scriptPath}`); + } + + // Create temp directory + const tempDir = path.join(this.context.extensionPath, '.temp'); + if (!fs.existsSync(tempDir)) { + fs.mkdirSync(tempDir, { recursive: true }); + } + + // Write base64 DOCX to temporary file to avoid command line length limits + const tempDocxPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.docx`); + const docxBuffer = Buffer.from(docxBase64, 'base64'); + fs.writeFileSync(tempDocxPath, docxBuffer); + + // Use temp file if outputPath not provided + const pdfPath = outputPath || path.join(tempDir, `converted_${Date.now()}.pdf`); + + // Verify DOCX file was written + if (!fs.existsSync(tempDocxPath)) { + throw new Error(`Failed to write DOCX file to: ${tempDocxPath}`); + } + + // Run Python script with file paths + // On Windows, use proper quoting; on Unix, paths should work as-is + const pythonCmd = process.platform === 'win32' ? 'python' : 'python3'; + + // Quote paths properly for Windows (use double quotes and escape inner quotes) + const quotePath = (p: string) => { + if (process.platform === 'win32') { + // Windows: use double quotes and escape any existing quotes + return `"${p.replace(/"/g, '\\"')}"`; + } else { + // Unix: use single quotes and escape any existing quotes + return `'${p.replace(/'/g, "\\'")}'`; + } + }; + + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempDocxPath)} ${quotePath(pdfPath)}`; + + console.log(`[DOCX→PDF] Converting DOCX to PDF...`); + console.log(`[DOCX→PDF] Command: ${command}`); + + let stdout = ''; + let stderr = ''; + try { + const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 }); + stdout = result.stdout || ''; + stderr = result.stderr || ''; + } catch (execErr: any) { + // execAsync throws an error when command fails, but stdout/stderr are in the error object + stdout = execErr.stdout || ''; + stderr = execErr.stderr || ''; + const errorMessage = execErr.message || 'Unknown error'; + + // If we have stdout that might be JSON, try to parse it + if (stdout.trim()) { + try { + const result = JSON.parse(stdout); + if (result.error) { + throw new Error(`Python script error: ${result.error}`); + } + } catch (parseErr) { + // Not JSON, use the exec error + } + } + + // Include both stdout and stderr in error message + const fullError = [ + errorMessage, + stdout ? `\nStdout: ${stdout}` : '', + stderr ? 
`\nStderr: ${stderr}` : '' + ].filter(Boolean).join(''); + + throw new Error(fullError); + } + + // Clean up temp DOCX file + try { + if (fs.existsSync(tempDocxPath)) { + fs.unlinkSync(tempDocxPath); + } + } catch (cleanupErr) { + console.warn(`[DOCX→PDF] Could not delete temp DOCX: ${cleanupErr}`); + } + + if (stderr && !stdout.includes('"success":true')) { + console.warn(`[DOCX→PDF] Python stderr: ${stderr}`); + } + + // Parse JSON result + let result; + try { + result = JSON.parse(stdout); + } catch (parseErr) { + throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. Stderr: ${stderr}`); + } + + if (result.success) { + console.log(`[DOCX→PDF] ✓ Successfully converted DOCX to PDF`); + webviewPanel.webview.postMessage({ + command: 'convertDocxToPdfResult', + requestId, + success: true, + pdfBase64: result.pdfBase64, + outputPath: pdfPath + }); + } else { + throw new Error(result.error || 'Conversion failed'); + } + } catch (err) { + const errorMessage = err instanceof Error ? err.message : 'Unknown error'; + console.error('[NEW SOURCE UPLOADER] DOCX→PDF conversion failed:', err); + webviewPanel.webview.postMessage({ + command: 'convertDocxToPdfResult', + requestId, + success: false, + error: errorMessage + }); + } } else if (message.command === "fetchTargetFile") { // Fetch target file content for translation imports const { sourceFilePath } = message; @@ -605,6 +933,28 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide ...(processedNotebook.metadata?.importerType && { importerType: processedNotebook.metadata.importerType }), + // Spreadsheet-specific metadata for round-trip export + ...(processedNotebook.metadata.importerType === "spreadsheet" || + processedNotebook.metadata.importerType === "spreadsheet-csv" || + processedNotebook.metadata.importerType === "spreadsheet-tsv" + ? (() => { + const spreadsheetMetadata = processedNotebook.metadata as SpreadsheetNotebookMetadata; + return { + ...(spreadsheetMetadata.originalFileContent && { + originalFileContent: spreadsheetMetadata.originalFileContent + }), + ...(spreadsheetMetadata.columnHeaders && { + columnHeaders: spreadsheetMetadata.columnHeaders + }), + ...(spreadsheetMetadata.sourceColumnIndex !== undefined && { + sourceColumnIndex: spreadsheetMetadata.sourceColumnIndex + }), + ...(spreadsheetMetadata.delimiter && { + delimiter: spreadsheetMetadata.delimiter + }), + }; + })() + : {}), // Preserve USFM round-trip structure metadata (original content + line mappings) ...('structureMetadata' in processedNotebook.metadata && processedNotebook.metadata.structureMetadata ? 
{ structureMetadata: processedNotebook.metadata.structureMetadata as CustomNotebookMetadata['structureMetadata'] }
@@ -652,38 +1002,115 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
         token: vscode.CancellationToken,
         webviewPanel: vscode.WebviewPanel
     ): Promise<void> {
-        // Save original files if provided in metadata
+        // Import the original file utilities
+        const { saveOriginalFileWithDeduplication } = await import('./originalFileUtils');
+
+        // Save original files if provided in metadata (with hash-based deduplication)
         const workspaceFolder = vscode.workspace.workspaceFolders?.[0];
         if (workspaceFolder) {
             for (const pair of message.notebookPairs) {
                 if ("originalFileData" in pair.source.metadata && pair.source.metadata.originalFileData) {
-                    // Save the original file in attachments
-                    const originalFileName = pair.source.metadata.originalFileName || 'document.docx';
-                    // Store originals under attachments/files/originals for consistency with other attachment storage.
-                    // (Some existing projects may have originals under attachments/originals; exporter will fallback.)
-                    const originalsDir = vscode.Uri.joinPath(
-                        workspaceFolder.uri,
-                        '.project',
-                        'attachments',
-                        'files',
-                        'originals'
-                    );
-                    await vscode.workspace.fs.createDirectory(originalsDir);
-
-                    const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName);
+                    // Save the original file with deduplication
+                    const requestedFileName = pair.source.metadata.originalFileName || 'document.docx';
                     const fileData = pair.source.metadata.originalFileData;
-                    // Convert ArrayBuffer to Uint8Array if needed
+                    // Convert to Uint8Array if needed
                     const buffer = fileData instanceof ArrayBuffer
                         ? new Uint8Array(fileData)
                         : Buffer.from(fileData);
-                    await vscode.workspace.fs.writeFile(originalFileUri, buffer);
+                    // Use hash-based deduplication to save the file
+                    // This handles:
+                    // 1. Same name, same hash: Keep existing file
+                    // 2. Different name, same hash: Return existing filename
+                    // 3. Same name, different hash: Rename to sample(1).idml etc.
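
For context on the three deduplication cases listed just above: the sketch below shows how they plausibly map onto the registry helpers declared later in this diff (originalFileUtils.ts). saveOriginalFileWithDeduplication's actual body is not included in the diff, so the wiring here is an assumption built from the exported types (OriginalFileEntry, OriginalFileResult) and the helpers that are shown (computeFileHash, generateUniqueFileName, loadOriginalFilesRegistry, saveOriginalFilesRegistry):

// Sketch only, not part of this diff.
async function saveOriginalFileWithDeduplicationSketch(
    workspaceFolder: vscode.WorkspaceFolder,
    requestedFileName: string,
    data: Uint8Array
): Promise<OriginalFileResult> {
    const hash = computeFileHash(data);
    const registry = await loadOriginalFilesRegistry(workspaceFolder);

    // Cases 1 and 2: identical content is already stored, under the same or a
    // different name. Reuse the stored file; just record the requested alias.
    const existing = registry.files[hash];
    if (existing) {
        if (!existing.originalNames.includes(requestedFileName)) {
            existing.originalNames.push(requestedFileName);
            await saveOriginalFilesRegistry(workspaceFolder, registry);
        }
        return { fileName: existing.fileName, savedNewFile: false, hash, message: `Reused existing ${existing.fileName}` };
    }

    // Case 3: new content. If the requested name is already taken by different
    // content, rename to sample(1).idml, sample(2).idml, etc., then write it.
    const fileName = generateUniqueFileName(
        requestedFileName,
        new Set(Object.keys(registry.fileNameToHash))
    );
    const fileUri = vscode.Uri.joinPath(
        workspaceFolder.uri, '.project', 'attachments', 'originals', fileName
    );
    await vscode.workspace.fs.writeFile(fileUri, data);

    registry.files[hash] = { hash, fileName, originalNames: [requestedFileName], referencedBy: [], addedAt: new Date().toISOString() };
    registry.fileNameToHash[fileName] = hash;
    await saveOriginalFilesRegistry(workspaceFolder, registry);

    return { fileName, savedNewFile: true, hash, message: `Saved new original ${fileName}` };
}
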
+ const result = await saveOriginalFileWithDeduplication( + workspaceFolder, + requestedFileName, + buffer + ); + + console.log(`[NewSourceUploader] Original file: ${result.message}`); + + // Store the file hash in metadata for integrity verification and deduplication tracking + (pair.source.metadata as any).originalFileHash = result.hash; + if (pair.codex?.metadata) { + (pair.codex.metadata as any).originalFileHash = result.hash; + } + + // IMPORTANT: Preserve user's original filename as fileDisplayName before updating originalFileName + // This ensures the display name reflects what the user imported, while originalFileName + // points to the actual deduplicated file in attachments/originals + if (result.fileName !== requestedFileName) { + // Set fileDisplayName to user's original name (without extension) if not already set + if (!pair.source.metadata.fileDisplayName) { + const displayName = requestedFileName.replace(/\.[^/.]+$/, ''); // Remove extension + (pair.source.metadata as any).fileDisplayName = displayName; + console.log(`[NewSourceUploader] Set fileDisplayName: "${displayName}" (from original "${requestedFileName}")`); + } + if (pair.codex?.metadata && !pair.codex.metadata.fileDisplayName) { + const displayName = requestedFileName.replace(/\.[^/.]+$/, ''); + (pair.codex.metadata as any).fileDisplayName = displayName; + } + + // Update originalFileName to point to the actual stored file (deduplicated) + pair.source.metadata.originalFileName = result.fileName; + if (pair.codex?.metadata) { + pair.codex.metadata.originalFileName = result.fileName; + } + console.log(`[NewSourceUploader] Updated originalFileName to deduplicated file: "${result.fileName}"`); + } // CRITICAL: Do not persist original binary content into JSON notebooks. - // The original template is stored in `.project/attachments/originals/`. + // The original template is stored in `.project/attachments/originals/`. delete pair.source.metadata.originalFileData; } + + // For PDF imports: Also save the converted DOCX file for round-trip export (with deduplication) + const pdfMetadata = (pair.source.metadata as any)?.pdfDocumentMetadata; + if (pdfMetadata?.convertedDocxFileName) { + let docxBuffer: Uint8Array | null = null; + + // If convertedDocxData is present (small files), use it directly + if (pdfMetadata.convertedDocxData) { + const docxData = pdfMetadata.convertedDocxData; + docxBuffer = docxData instanceof ArrayBuffer + ? 
new Uint8Array(docxData) + : Buffer.from(docxData); + // Remove from metadata to avoid persisting in JSON + delete pdfMetadata.convertedDocxData; + } else if (pdfMetadata.isLargeFile) { + // For large files, check if temp file exists and read it + const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); + try { + const tempFiles = await vscode.workspace.fs.readDirectory(tempDir); + const matchingFile = tempFiles.find(([name]) => name.startsWith('pdf_conversion_') && name.endsWith('.docx')); + if (matchingFile) { + const tempFileUri = vscode.Uri.joinPath(tempDir, matchingFile[0]); + docxBuffer = await vscode.workspace.fs.readFile(tempFileUri); + await vscode.workspace.fs.delete(tempFileUri); // Clean up temp file + } + } catch (err) { + console.warn(`[PDF Importer] Could not find/copy temp DOCX file: ${err}`); + } + } + + // Save with deduplication if we have data + if (docxBuffer) { + const docxResult = await saveOriginalFileWithDeduplication( + workspaceFolder, + pdfMetadata.convertedDocxFileName, + docxBuffer + ); + console.log(`[PDF Importer] Converted DOCX: ${docxResult.message}`); + + // Update convertedDocxFileName to point to the actual stored file (deduplicated) + if (docxResult.fileName !== pdfMetadata.convertedDocxFileName) { + console.log(`[PDF Importer] Updated convertedDocxFileName: "${pdfMetadata.convertedDocxFileName}" -> "${docxResult.fileName}"`); + pdfMetadata.convertedDocxFileName = docxResult.fileName; + } + } + } } } @@ -710,6 +1137,28 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide codexNotebooks, }); + // Register notebook references in the original files registry + // This tracks which notebooks use each original file, so we know when it's safe to delete + if (workspaceFolder) { + const { addNotebookReference } = await import('./originalFileUtils'); + for (const createdFile of createdFiles) { + try { + // Read the source notebook to get originalFileName from metadata + const sourceContent = await vscode.workspace.fs.readFile(createdFile.sourceUri); + const sourceNotebook = JSON.parse(new TextDecoder().decode(sourceContent)); + const originalFileName = sourceNotebook?.metadata?.originalName || sourceNotebook?.metadata?.originalFileName; + + if (originalFileName) { + // Use the source filename (without extension) as the notebook base name + const notebookBaseName = path.basename(createdFile.sourceUri.fsPath).replace(/\.[^/.]+$/, ''); + await addNotebookReference(workspaceFolder, originalFileName, notebookBaseName); + } + } catch (err) { + console.warn(`[NewSourceUploader] Could not register notebook reference: ${err}`); + } + } + } + // Migrate localized-books.json to codex metadata before deleting the file // Pass the newly created codex URIs directly to avoid search issues const createdCodexUris = createdFiles.map(f => f.codexUri); diff --git a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts index b10b9661b..c7d41038f 100644 --- a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts +++ b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts @@ -91,6 +91,75 @@ async function collectExistingCorpusMarkers(workspaceFolder: vscode.WorkspaceFol return existingMarkers; } +/** + * Collects existing fileDisplayName values from source notebooks in the workspace. + * Returns an array of display names (including any with number suffixes like "Sample (1)"). 
+ */
+async function collectExistingDisplayNames(workspaceFolder: vscode.WorkspaceFolder): Promise<string[]> {
+    const existingDisplayNames: string[] = [];
+
+    try {
+        const sourceFiles = await vscode.workspace.findFiles(
+            ".project/sourceTexts/*.source",
+            "**/node_modules/**"
+        );
+
+        const serializer = new CodexContentSerializer();
+
+        for (const file of sourceFiles) {
+            try {
+                const content = await vscode.workspace.fs.readFile(file);
+                const notebookData = await serializer.deserializeNotebook(
+                    content,
+                    new vscode.CancellationTokenSource().token
+                );
+
+                const metadata = notebookData.metadata as CustomNotebookMetadata | undefined;
+                if (metadata?.fileDisplayName) {
+                    existingDisplayNames.push(metadata.fileDisplayName);
+                }
+            } catch (error) {
+                // Skip files that can't be read
+                console.warn(`[DISPLAY NAME] Could not read file ${file.fsPath}:`, error);
+            }
+        }
+    } catch (error) {
+        console.warn(`[DISPLAY NAME] Error collecting existing display names:`, error);
+    }
+
+    return existingDisplayNames;
+}
+
+/**
+ * Generates a unique display name by adding a number suffix if needed.
+ * Example: If "ACT-REV" exists, returns "ACT-REV (1)". If "ACT-REV (1)" also exists, returns "ACT-REV (2)".
+ */
+function getUniqueDisplayName(baseName: string, existingNames: string[]): string {
+    // Check if the base name already exists
+    if (!existingNames.includes(baseName)) {
+        return baseName;
+    }
+
+    // Find the highest existing number suffix for this base name
+    // Pattern matches: "baseName (N)" where N is a number
+    const escapedBaseName = baseName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    const suffixPattern = new RegExp(`^${escapedBaseName} \\((\\d+)\\)$`);
+
+    let maxNumber = 0;
+    for (const name of existingNames) {
+        const match = name.match(suffixPattern);
+        if (match) {
+            const num = parseInt(match[1], 10);
+            if (num > maxNumber) {
+                maxNumber = num;
+            }
+        }
+    }
+
+    // Return the base name with the next number
+    return `${baseName} (${maxNumber + 1})`;
+}
+
 export async function createNoteBookPair({
     token,
     sourceNotebooks,
@@ -114,6 +183,9 @@
     // Collect existing corpusMarkers from the workspace
     const existingMarkers = await collectExistingCorpusMarkers(workspaceFolder);
 
+    // Collect existing display names for non-biblical imports to avoid duplicates
+    const existingDisplayNames = await collectExistingDisplayNames(workspaceFolder);
+
     for (let i = 0; i < sourceNotebooks.length; i++) {
         checkCancellation(token);
@@ -130,6 +202,47 @@
         console.log(`[CODEX FILE CREATE] Importer type: "${importerType}", Biblical: ${isBiblical}`);
 
+        // For non-biblical imports, use the metadata id (UUID) to create unique filenames
+        // This allows users to import changed source files multiple times and merge translations later
+        let notebookName = sourceNotebook.name;
+        let uniqueId: string | undefined;
+
+        if (!isBiblical) {
+            // Use the metadata id (UUID) that was generated during import
+            uniqueId = sourceNotebook.metadata?.id;
+
+            if (!uniqueId) {
+                // Fallback: generate a short unique id if metadata.id is missing
+                uniqueId = Math.random().toString(36).substring(2, 10);
+                console.warn(`[CODEX FILE CREATE] No metadata.id found, generated fallback id: "${uniqueId}"`);
+            }
+
+            notebookName = `${sourceNotebook.name}-(${uniqueId})`;
+
+            console.log(`[CODEX FILE CREATE] Non-biblical import: adding id "${uniqueId}" to filename`);
+
+            // IMPORTANT: Do NOT modify originalFileName here.
+ // originalFileName must point to the actual file stored in attachments/originals/ + // (which may be deduplicated). The notebook filename uses UUIDs for uniqueness, + // but the original file reference should remain unchanged for round-trip export. + + // Generate unique display name for non-biblical imports + // If a file with the same display name already exists, add a number suffix + const baseDisplayName = sourceNotebook.metadata?.fileDisplayName || sourceNotebook.name; + const uniqueDisplayName = getUniqueDisplayName(baseDisplayName, existingDisplayNames); + + if (uniqueDisplayName !== baseDisplayName) { + console.log(`[CODEX FILE CREATE] Display name "${baseDisplayName}" already exists, using "${uniqueDisplayName}"`); + } + + // Update display name in metadata + sourceNotebook.metadata.fileDisplayName = uniqueDisplayName; + codexNotebook.metadata.fileDisplayName = uniqueDisplayName; + + // Add this display name to existing names for subsequent files in the same batch + existingDisplayNames.push(uniqueDisplayName); + } + // Use corpusMarker as-is from the importer (no normalization) // This matches how other importers like Docx and Biblica work const incomingCorpusMarker = sourceNotebook.metadata?.corpusMarker; @@ -150,8 +263,9 @@ export async function createNoteBookPair({ } // Create standardized filenames - only use USFM codes for biblical content - const sourceFilename = await createStandardizedFilename(sourceNotebook.name, ".source", isBiblical); - const codexFilename = await createStandardizedFilename(codexNotebook.name, ".codex", isBiblical); + // For non-biblical content, notebookName already includes the unique id + const sourceFilename = await createStandardizedFilename(notebookName, ".source", isBiblical); + const codexFilename = await createStandardizedFilename(notebookName, ".codex", isBiblical); // Create final URIs with standardized filenames const sourceUri = vscode.Uri.joinPath( diff --git a/src/providers/NewSourceUploader/originalFileUtils.ts b/src/providers/NewSourceUploader/originalFileUtils.ts new file mode 100644 index 000000000..e8d0546de --- /dev/null +++ b/src/providers/NewSourceUploader/originalFileUtils.ts @@ -0,0 +1,453 @@ +/** + * Original File Utilities + * + * Handles hash-based deduplication of original files stored in .project/attachments/originals/ + * + * Storage Structure: + * - .project/attachments/originals/ + * - file-hashes.json (registry of all imported files with their hashes) + * - sample.idml (actual original file) + * - sample(1).idml (renamed file if same name but different content) + * - other-document.docx (another original file) + * + * Features: + * - Computes SHA-256 hash of file content + * - Maintains a registry (file-hashes.json) of original files with their hashes + * - Saves actual original files to the originals folder + * - Prevents duplicate storage of identical files (same content = reuse existing file) + * - Handles filename conflicts by renaming (e.g., sample(1).idml, sample(2).idml) + */ + +import * as vscode from 'vscode'; +import * as crypto from 'crypto'; + +/** + * Registry entry for an original file + */ +export interface OriginalFileEntry { + /** SHA-256 hash of the file content */ + hash: string; + /** The filename stored in attachments/originals/ */ + fileName: string; + /** Original filename(s) that mapped to this file (for reference) */ + originalNames: string[]; + /** Notebook base names (without extension) that reference this original file */ + referencedBy: string[]; + /** Timestamp when first added */ + 
addedAt: string;
+}
+
+/**
+ * Registry structure for original files
+ */
+export interface OriginalFilesRegistry {
+    /** Version for future migrations */
+    version: number;
+    /** Map of hash -> file entry */
+    files: { [hash: string]: OriginalFileEntry; };
+    /** Map of filename -> hash (for quick filename lookup) */
+    fileNameToHash: { [fileName: string]: string; };
+}
+
+/**
+ * Result of checking/adding an original file
+ */
+export interface OriginalFileResult {
+    /** The filename to use in metadata (may be different from requested) */
+    fileName: string;
+    /** Whether a new file was saved (false if deduplicated) */
+    savedNewFile: boolean;
+    /** The hash of the file */
+    hash: string;
+    /** Message describing what happened */
+    message: string;
+}
+
+const REGISTRY_FILENAME = 'file-hashes.json';
+
+/**
+ * Compute SHA-256 hash of file data
+ */
+export function computeFileHash(data: Uint8Array | ArrayBuffer | Buffer): string {
+    const buffer = data instanceof ArrayBuffer
+        ? Buffer.from(data)
+        : data instanceof Uint8Array
+            ? Buffer.from(data)
+            : data;
+    return crypto.createHash('sha256').update(buffer).digest('hex');
+}
+
+/**
+ * Get the path to the originals directory
+ */
+function getOriginalsDir(workspaceFolder: vscode.WorkspaceFolder): vscode.Uri {
+    return vscode.Uri.joinPath(
+        workspaceFolder.uri,
+        '.project',
+        'attachments',
+        'originals'
+    );
+}
+
+/**
+ * Get the path to the registry file
+ */
+function getRegistryPath(workspaceFolder: vscode.WorkspaceFolder): vscode.Uri {
+    return vscode.Uri.joinPath(getOriginalsDir(workspaceFolder), REGISTRY_FILENAME);
+}
+
+/**
+ * Load the original files registry, creating an empty one if it doesn't exist
+ */
+export async function loadOriginalFilesRegistry(
+    workspaceFolder: vscode.WorkspaceFolder
+): Promise<OriginalFilesRegistry> {
+    const registryPath = getRegistryPath(workspaceFolder);
+
+    try {
+        const data = await vscode.workspace.fs.readFile(registryPath);
+        const registry = JSON.parse(new TextDecoder().decode(data)) as OriginalFilesRegistry;
+
+        // Ensure all required fields exist (migration safety)
+        if (!registry.files) registry.files = {};
+        if (!registry.fileNameToHash) registry.fileNameToHash = {};
+        if (!registry.version) registry.version = 1;
+
+        // Migration: ensure all entries have referencedBy array
+        for (const entry of Object.values(registry.files)) {
+            if (!entry.referencedBy) {
+                entry.referencedBy = [];
+            }
+        }
+
+        return registry;
+    } catch {
+        // Registry doesn't exist, create empty one
+        return {
+            version: 1,
+            files: {},
+            fileNameToHash: {},
+        };
+    }
+}
+
+/**
+ * Save the original files registry
+ */
+export async function saveOriginalFilesRegistry(
+    workspaceFolder: vscode.WorkspaceFolder,
+    registry: OriginalFilesRegistry
+): Promise<void> {
+    const originalsDir = getOriginalsDir(workspaceFolder);
+    await vscode.workspace.fs.createDirectory(originalsDir);
+
+    const registryPath = getRegistryPath(workspaceFolder);
+    const data = new TextEncoder().encode(JSON.stringify(registry, null, 2));
+    await vscode.workspace.fs.writeFile(registryPath, data);
+}
+
+/**
+ * Generate a unique filename by adding (1), (2), etc. suffix
+ */
+function generateUniqueFileName(
+    baseName: string,
+    existingFileNames: Set<string>
+): string {
+    if (!existingFileNames.has(baseName)) {
+        return baseName;
+    }
+
+    // Split filename into name and extension
+    const lastDotIndex = baseName.lastIndexOf('.');
+    const nameWithoutExt = lastDotIndex > 0 ? baseName.slice(0, lastDotIndex) : baseName;
+    const extension = lastDotIndex > 0 ? baseName.slice(lastDotIndex) : '';
+
+    // Try incrementing numbers until we find a unique name
+    let counter = 1;
+    let newName: string;
+    do {
+        newName = `${nameWithoutExt}(${counter})${extension}`;
+        counter++;
+    } while (existingFileNames.has(newName));
+
+    return newName;
+}
+
+/**
+ * Save an original file with hash-based deduplication
+ *
+ * Handles three scenarios:
+ * 1. Same name, same hash: Keep existing file, return existing filename
+ * 2. Different name, same hash: Keep existing file, return existing filename
+ * 3. Same name, different hash: Save with new name (e.g., sample(1).idml)
+ *
+ * @param workspaceFolder The workspace folder
+ * @param requestedFileName The desired filename for the original file
+ * @param fileData The file content
+ * @param notebookBaseName Optional base name of the notebook referencing this file (e.g., "test-(uuid)")
+ * @returns Result with the actual filename to use in metadata
+ */
+export async function saveOriginalFileWithDeduplication(
+    workspaceFolder: vscode.WorkspaceFolder,
+    requestedFileName: string,
+    fileData: Uint8Array | ArrayBuffer | Buffer,
+    notebookBaseName?: string
+): Promise<OriginalFileResult> {
+    // Compute hash of the file
+    const hash = computeFileHash(fileData);
+
+    // Load existing registry
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+
+    // Check if we already have a file with this hash
+    const existingEntry = registry.files[hash];
+
+    if (existingEntry) {
+        // We already have a file with the same content
+        console.log(`[OriginalFiles] File with hash ${hash.slice(0, 8)}... already exists as "${existingEntry.fileName}"`);
+
+        let registryChanged = false;
+
+        // Track this original name if it's new
+        if (!existingEntry.originalNames.includes(requestedFileName)) {
+            existingEntry.originalNames.push(requestedFileName);
+            registryChanged = true;
+        }
+
+        // Track notebook reference
+        if (notebookBaseName && !existingEntry.referencedBy.includes(notebookBaseName)) {
+            existingEntry.referencedBy.push(notebookBaseName);
+            registryChanged = true;
+        }
+
+        if (registryChanged) {
+            await saveOriginalFilesRegistry(workspaceFolder, registry);
+        }
+
+        return {
+            fileName: existingEntry.fileName,
+            savedNewFile: false,
+            hash,
+            message: `Deduplicated: using existing file "${existingEntry.fileName}" (same content as "${requestedFileName}")`,
+        };
+    }
+
+    // No existing file with this hash - need to save
+    const originalsDir = getOriginalsDir(workspaceFolder);
+    await vscode.workspace.fs.createDirectory(originalsDir);
+
+    // Check if the filename is already taken (by a different file with different hash)
+    const existingFileNames = new Set(Object.keys(registry.fileNameToHash));
+    let actualFileName = requestedFileName;
+
+    if (existingFileNames.has(requestedFileName)) {
+        // Filename conflict - need to generate a unique name
+        actualFileName = generateUniqueFileName(requestedFileName, existingFileNames);
+        console.log(`[OriginalFiles] Filename "${requestedFileName}" exists with different content, saving as "${actualFileName}"`);
+    }
+
+    // Save the file
+    const fileUri = vscode.Uri.joinPath(originalsDir, actualFileName);
+    const buffer = fileData instanceof ArrayBuffer
+        ? new Uint8Array(fileData)
+        : fileData instanceof Buffer
+            ? new Uint8Array(fileData)
+            : fileData;
+    await vscode.workspace.fs.writeFile(fileUri, buffer);
+
+    // Update registry
+    registry.files[hash] = {
+        hash,
+        fileName: actualFileName,
+        originalNames: [requestedFileName],
+        referencedBy: notebookBaseName ? [notebookBaseName] : [],
+        addedAt: new Date().toISOString(),
+    };
+    registry.fileNameToHash[actualFileName] = hash;
+
+    await saveOriginalFilesRegistry(workspaceFolder, registry);
+
+    const message = actualFileName !== requestedFileName
+        ? `Saved as "${actualFileName}" (renamed from "${requestedFileName}" due to filename conflict)`
+        : `Saved new file "${actualFileName}"`;
+
+    console.log(`[OriginalFiles] ${message}`);
+
+    return {
+        fileName: actualFileName,
+        savedNewFile: true,
+        hash,
+        message,
+    };
+}
+
+/**
+ * Check if an original file exists by hash
+ */
+export async function findOriginalFileByHash(
+    workspaceFolder: vscode.WorkspaceFolder,
+    hash: string
+): Promise<OriginalFileEntry | null> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+    return registry.files[hash] || null;
+}
+
+/**
+ * Check if an original file exists by filename
+ */
+export async function findOriginalFileByName(
+    workspaceFolder: vscode.WorkspaceFolder,
+    fileName: string
+): Promise<OriginalFileEntry | null> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+    const hash = registry.fileNameToHash[fileName];
+    if (hash) {
+        return registry.files[hash] || null;
+    }
+    return null;
+}
+
+/**
+ * Get all original files in the registry
+ */
+export async function getAllOriginalFiles(
+    workspaceFolder: vscode.WorkspaceFolder
+): Promise<OriginalFileEntry[]> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+    return Object.values(registry.files);
+}
+
+/**
+ * Remove a notebook reference from the registry.
+ * If no other notebooks reference the original file, deletes the file from disk and registry.
+ *
+ * @param workspaceFolder The workspace folder
+ * @param notebookBaseName The base name of the notebook being deleted (e.g., "test-(uuid)")
+ * @param originalFileName The originalFileName from the notebook's metadata (points to file in originals/)
+ * @returns Whether the original file was deleted from disk
+ */
+export async function removeNotebookReference(
+    workspaceFolder: vscode.WorkspaceFolder,
+    notebookBaseName: string,
+    originalFileName?: string
+): Promise<{ originalFileDeleted: boolean; fileName: string | null }> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+
+    // Find the entry by originalFileName or by scanning referencedBy
+    let targetHash: string | null = null;
+    let targetEntry: OriginalFileEntry | null = null;
+
+    if (originalFileName) {
+        // Look up by filename first
+        const hash = registry.fileNameToHash[originalFileName];
+        if (hash && registry.files[hash]) {
+            targetHash = hash;
+            targetEntry = registry.files[hash];
+        }
+    }
+
+    // If not found by filename, scan all entries for this notebook reference
+    if (!targetEntry) {
+        for (const [hash, entry] of Object.entries(registry.files)) {
+            if (entry.referencedBy.includes(notebookBaseName)) {
+                targetHash = hash;
+                targetEntry = entry;
+                break;
+            }
+        }
+    }
+
+    if (!targetEntry || !targetHash) {
+        console.log(`[OriginalFiles] No registry entry found for notebook "${notebookBaseName}"`);
+        return { originalFileDeleted: false, fileName: null };
+    }
+
+    // Remove this notebook from referencedBy
+    targetEntry.referencedBy = targetEntry.referencedBy.filter(ref => ref !== notebookBaseName);
+    console.log(`[OriginalFiles] Removed reference "${notebookBaseName}" from "${targetEntry.fileName}" (${targetEntry.referencedBy.length} references remaining)`);
+
+    if (targetEntry.referencedBy.length === 0) {
+        // No more references - delete the original file and registry entry
+        const originalsDir = getOriginalsDir(workspaceFolder);
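+        // Best-effort cleanup: remove the unreferenced file from disk, then drop both
+        // registry mappings (by hash and by filename) so later lookups stay consistent.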
+        const fileUri = vscode.Uri.joinPath(originalsDir, targetEntry.fileName);
+        const deletedFileName = targetEntry.fileName;
+
+        try {
+            await vscode.workspace.fs.delete(fileUri);
+            console.log(`[OriginalFiles] Deleted unreferenced original file: ${targetEntry.fileName}`);
+        } catch (err) {
+            console.warn(`[OriginalFiles] Could not delete original file "${targetEntry.fileName}": ${err}`);
+        }
+
+        // Remove from registry
+        delete registry.files[targetHash];
+        delete registry.fileNameToHash[targetEntry.fileName];
+        await saveOriginalFilesRegistry(workspaceFolder, registry);
+
+        return { originalFileDeleted: true, fileName: deletedFileName };
+    }
+
+    // Still has references, just save the updated registry
+    await saveOriginalFilesRegistry(workspaceFolder, registry);
+    return { originalFileDeleted: false, fileName: targetEntry.fileName };
+}
+
+/**
+ * Add a notebook reference to an existing registry entry (by originalFileName).
+ * Used when the notebook base name isn't known at import time but is known after file creation.
+ *
+ * @param workspaceFolder The workspace folder
+ * @param originalFileName The originalFileName stored in metadata
+ * @param notebookBaseName The base name of the notebook (e.g., "test-(uuid)")
+ */
+export async function addNotebookReference(
+    workspaceFolder: vscode.WorkspaceFolder,
+    originalFileName: string,
+    notebookBaseName: string
+): Promise<void> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+
+    const hash = registry.fileNameToHash[originalFileName];
+    if (!hash || !registry.files[hash]) {
+        console.warn(`[OriginalFiles] Cannot add reference: no registry entry for "${originalFileName}"`);
+        return;
+    }
+
+    const entry = registry.files[hash];
+    if (!entry.referencedBy.includes(notebookBaseName)) {
+        entry.referencedBy.push(notebookBaseName);
+        await saveOriginalFilesRegistry(workspaceFolder, registry);
+        console.log(`[OriginalFiles] Added reference "${notebookBaseName}" to "${originalFileName}" (${entry.referencedBy.length} total)`);
+    }
+}
+
+/**
+ * Clean up orphaned registry entries (files that no longer exist on disk)
+ */
+export async function cleanupOrphanedEntries(
+    workspaceFolder: vscode.WorkspaceFolder
+): Promise<number> {
+    const registry = await loadOriginalFilesRegistry(workspaceFolder);
+    const originalsDir = getOriginalsDir(workspaceFolder);
+
+    let removedCount = 0;
+
+    for (const [hash, entry] of Object.entries(registry.files)) {
+        const fileUri = vscode.Uri.joinPath(originalsDir, entry.fileName);
+        try {
+            await vscode.workspace.fs.stat(fileUri);
+        } catch {
+            // File doesn't exist, remove from registry
+            delete registry.files[hash];
+            delete registry.fileNameToHash[entry.fileName];
+            removedCount++;
+            console.log(`[OriginalFiles] Removed orphaned registry entry: ${entry.fileName}`);
+        }
+    }
+
+    if (removedCount > 0) {
+        await saveOriginalFilesRegistry(workspaceFolder, registry);
+    }
+
+    return removedCount;
+}
diff --git a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
index 2275a53f3..505d913e8 100755
--- a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
@@ -707,6 +707,11 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider
+            if (deletedFiles.length > 0 && message.type === "codexDocument") {
+                await this.recordFileDeletionToEditHistory(normalizedPath, message.label);
+            }
+
             // Show appropriate message based on results
             if (deletedFiles.length > 0 && errors.length === 0) {
                 vscode.window.showInformationMessage(
@@ -399,6 +442,27 @@
                 }
                 break;
             }
+            case "deleteCorpusMarker": {
+                try {
+                    const content = message.content ?? {};
+                    const { corpusLabel, displayName, children } = content;
+
+                    const fileCount = children?.length ?? 0;
+                    const confirmed = await vscode.window.showWarningMessage(
+                        `Are you sure you want to delete the folder "${displayName}"? This will permanently delete ${fileCount} file(s) and cannot be undone.`,
+                        { modal: true },
+                        "Delete"
+                    );
+
+                    if (confirmed === "Delete" && children?.length > 0) {
+                        await this.deleteCorpusMarker(corpusLabel, displayName, children);
+                    }
+                } catch (error) {
+                    console.error("Error deleting corpus marker:", error);
+                    vscode.window.showErrorMessage(`Failed to delete folder: ${error}`);
+                }
+                break;
+            }
         }
     }
 
@@ -1219,6 +1283,206 @@ export class NavigationWebviewProvider extends BaseWebviewProvider {
         }
     }
 
+    private async recordFileDeletionToEditHistory(filePath: string, label: string): Promise<void> {
+        const workspaceFolder = vscode.workspace.workspaceFolders?.[0]?.uri;
+        if (!workspaceFolder) return;
+
+        try {
+            const author = await this.getCurrentUser();
+            await MetadataManager.safeUpdateMetadata(
+                workspaceFolder,
+                (metadata: { edits?: unknown[] }) => {
+                    if (!metadata.edits) metadata.edits = [];
+                    addProjectMetadataEdit(
+                        metadata,
+                        EditMapUtils.deletedFile(),
+                        { filePath, label },
+                        author
+                    );
+                    return metadata;
+                },
+                { author }
+            );
+        } catch (err) {
+            console.warn(`[Navigation] Could not record file deletion to edit history: ${err}`);
+        }
+    }
+
+    private async recordCorpusDeletionToEditHistory(
+        corpusMarker: string,
+        deletedFiles: Array<{ filePath: string; label: string }>
+    ): Promise<void> {
+        const workspaceFolder = vscode.workspace.workspaceFolders?.[0]?.uri;
+        if (!workspaceFolder) return;
+
+        try {
+            const author = await this.getCurrentUser();
+            await MetadataManager.safeUpdateMetadata(
+                workspaceFolder,
+                (metadata: { edits?: unknown[] }) => {
+                    if (!metadata.edits) metadata.edits = [];
+                    addProjectMetadataEdit(
+                        metadata,
+                        EditMapUtils.deletedCorpusMarker(),
+                        { corpusMarker, deletedFiles },
+                        author
+                    );
+                    return metadata;
+                },
+                { author }
+            );
+        } catch (err) {
+            console.warn(`[Navigation] Could not record corpus deletion to edit history: ${err}`);
+        }
+    }
+
+    private async getCurrentUser(): Promise<string> {
+        try {
+            const authApi = getAuthApi();
+            const userInfo = await authApi?.getUserInfo();
+            return userInfo?.username || "anonymous";
+        } catch {
+            return "anonymous";
+        }
+    }
+
+    private async deleteCorpusMarker(
+        corpusLabel: string,
+        displayName: string,
+        children: Array<{ uri: string; label: string; type: string }>
+    ): Promise<void> {
+        const workspaceFolder = vscode.workspace.workspaceFolders?.[0];
+        if (!workspaceFolder) {
+            vscode.window.showErrorMessage("No workspace folder found");
+            return;
+        }
+
+        const codexEditorProvider = CodexCellEditorProvider.getInstance();
+        const closePanelByUri = (uri: vscode.Uri) => {
+            if (!codexEditorProvider) return;
+            const webviewPanels = codexEditorProvider.getWebviewPanels();
+            let panelToClose = webviewPanels.get(uri.toString());
+            if (!panelToClose) {
+                for (const [panelUri, panel] of webviewPanels.entries()) {
+                    const panelUriObj = vscode.Uri.parse(panelUri);
+                    if (panelUriObj.fsPath === uri.fsPath) {
+                        panelToClose = panel;
+                        break;
+                    }
+                }
+            }
+            if (panelToClose) panelToClose.dispose();
+        };
+
+        const allDeletedFiles: Array<{ filePath: string; label: string }> = [];
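+        // Track successes and failures separately so the edit-history record and the
+        // final notification can report exactly which files were removed.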
const errors: string[] = []; + + await vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: `Deleting folder "${displayName}"`, + cancellable: false, + }, + async (progress) => { + const total = children.length; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + progress.report({ + increment: (100 / total), + message: `Deleting ${child.label}...`, + }); + + const normalizedPath = (child.uri as string).replace(/\\/g, "/"); + const codexUri = vscode.Uri.file(normalizedPath); + + // Close webviews + closePanelByUri(codexUri); + if (child.type === "codexDocument") { + const baseFileName = path.basename(normalizedPath); + const sourceFileName = baseFileName.replace(".codex", ".source"); + const sourceUri = vscode.Uri.joinPath( + workspaceFolder.uri, + ".project", + "sourceTexts", + sourceFileName + ); + closePanelByUri(sourceUri); + } + + let originalFileName: string | undefined; + let notebookBaseName: string | undefined; + try { + const codexContent = await vscode.workspace.fs.readFile(codexUri); + const codexNotebook = JSON.parse(new TextDecoder().decode(codexContent)); + originalFileName = codexNotebook?.metadata?.originalName || codexNotebook?.metadata?.originalFileName; + notebookBaseName = path.basename(normalizedPath).replace(/\.[^/.]+$/, ""); + } catch { + // File may already be gone + } + + try { + await vscode.workspace.fs.delete(codexUri); + allDeletedFiles.push({ filePath: normalizedPath, label: child.label }); + } catch (error) { + console.error("Error deleting codex file:", error); + errors.push(`Failed to delete ${child.label}: ${error}`); + } + + if (child.type === "codexDocument") { + try { + const baseFileName = path.basename(normalizedPath); + const sourceFileName = baseFileName.replace(".codex", ".source"); + const sourceUri = vscode.Uri.joinPath( + workspaceFolder.uri, + ".project", + "sourceTexts", + sourceFileName + ); + try { + await vscode.workspace.fs.delete(sourceUri); + } catch (deleteError: unknown) { + const err = deleteError as { code?: string }; + if (err.code !== "FileNotFound" && err.code !== "ENOENT") { + errors.push(`Failed to delete source for ${child.label}`); + } + } + } catch (error) { + errors.push(`Failed to delete source for ${child.label}`); + } + } + + if (notebookBaseName) { + try { + const { removeNotebookReference } = await import("../NewSourceUploader/originalFileUtils"); + await removeNotebookReference(workspaceFolder, notebookBaseName, originalFileName); + } catch { + // Non-fatal + } + } + } + } + ); + + // Record folder and all deleted files to edit history + if (allDeletedFiles.length > 0) { + await this.recordCorpusDeletionToEditHistory(corpusLabel, allDeletedFiles); + } + + if (allDeletedFiles.length > 0 && errors.length === 0) { + vscode.window.showInformationMessage( + `Successfully deleted folder "${displayName}" and ${allDeletedFiles.length} file(s)` + ); + } else if (allDeletedFiles.length > 0 && errors.length > 0) { + vscode.window.showWarningMessage( + `Partially deleted: ${allDeletedFiles.length} file(s). 
Errors: ${errors.join("; ")}` + ); + } else { + vscode.window.showErrorMessage(`Failed to delete folder "${displayName}": ${errors.join("; ")}`); + } + + await this.buildInitialData(); + } + public dispose(): void { this.disposables.forEach((d) => d.dispose()); } diff --git a/src/utils/bookNameUtils.ts b/src/utils/bookNameUtils.ts index 53ee44ab8..8d93284b5 100644 --- a/src/utils/bookNameUtils.ts +++ b/src/utils/bookNameUtils.ts @@ -117,20 +117,32 @@ export async function getBookDisplayName(usfmCode: string): Promise { export function isBiblicalImporterType(importerType: string | undefined): boolean { if (!importerType) return false; const normalizedType = importerType.toLowerCase().trim(); + + // Exact matches for biblical importers const bibleTypeImporters = [ 'usfm', + 'usfm-experimental', 'paratext', 'ebiblecorpus', 'ebible', 'ebible-download', 'maculabible', 'macula', - 'biblica', 'obs', - 'pdf', // PDF can contain Bible content - 'indesign', // InDesign can contain Bible content + // Note: 'pdf', 'docx', 'indesign', and 'biblica' are NOT included here + // because they are generic document formats that should preserve + // their original filenames rather than being converted to Bible book codes. + // The importer type is stored in metadata, so filename suffixes are not needed. ]; - return bibleTypeImporters.includes(normalizedType); + + // Check exact match first + if (bibleTypeImporters.includes(normalizedType)) { + return true; + } + + // Also check prefixes for variations (e.g., 'usfm-*' matches any USFM variant) + const biblicalPrefixes = ['usfm', 'paratext', 'ebible', 'macula']; + return biblicalPrefixes.some(prefix => normalizedType.startsWith(prefix)); } /** diff --git a/src/utils/editMapUtils.ts b/src/utils/editMapUtils.ts index fe0c00285..f74ffeaba 100644 --- a/src/utils/editMapUtils.ts +++ b/src/utils/editMapUtils.ts @@ -26,6 +26,8 @@ type MetaEditMap = ["meta"]; type MetaFieldEditMap = ["meta", string]; type LanguagesEditMap = ["languages"]; type SpellcheckIsEnabledEditMap = ["spellcheckIsEnabled"]; +type DeletedCorpusMarkerEditMap = ["deletedCorpusMarker"]; +type DeletedFileEditMap = ["deletedFile"]; import { EditType } from "../../types/enums"; @@ -144,6 +146,14 @@ export const EditMapUtils = { return ["spellcheckIsEnabled"]; }, + deletedCorpusMarker(): DeletedCorpusMarkerEditMap { + return ["deletedCorpusMarker"]; + }, + + deletedFile(): DeletedFileEditMap { + return ["deletedFile"]; + }, + // Compare editMaps equals(editMap1: readonly string[], editMap2: readonly string[]): boolean { return JSON.stringify(editMap1) === JSON.stringify(editMap2); diff --git a/types/index.d.ts b/types/index.d.ts index 8f06aba6e..a61d2ea7f 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -998,6 +998,12 @@ export interface CustomNotebookMetadata { * Stored at notebook-level (not per-cell). For most importers this matches originalFileName. */ sourceFile?: string; + /** + * Timestamp added to non-biblical imports to ensure unique filenames. + * Format: "YYYYMMDD_HHmmss" (e.g., "20260127_143025") + * This allows importing changed source files multiple times without overwriting. + */ + importTimestamp?: string; /** * One-time import context derived from the import process. * This is the canonical home for attributes that do not vary per-cell. 
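// Aside, not part of the diff: a minimal sketch of producing the "YYYYMMDD_HHmmss"
// importTimestamp documented above. The helper name is illustrative and does not
// exist in the codebase.
function makeImportTimestamp(d: Date = new Date()): string {
    const pad = (n: number) => String(n).padStart(2, "0");
    return `${d.getFullYear()}${pad(d.getMonth() + 1)}${pad(d.getDate())}_${pad(d.getHours())}${pad(d.getMinutes())}${pad(d.getSeconds())}`;
}
// makeImportTimestamp(new Date(2026, 0, 27, 14, 30, 25)) === "20260127_143025"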
@@ -1033,6 +1039,8 @@ type FileImporterType = | "markdown" | "subtitles" | "spreadsheet" + | "spreadsheet-csv" + | "spreadsheet-tsv" | "tms" | "pdf" | "indesign" @@ -1591,6 +1599,14 @@ type ProjectManagerMessageFromWebview = | { command: "triggerSync"; } | { command: "editBookName"; content: { bookAbbr: string; newBookName: string; }; } | { command: "editCorpusMarker"; content: { corpusLabel: string; newCorpusName: string; }; } + | { + command: "deleteCorpusMarker"; + content: { + corpusLabel: string; + displayName: string; + children: Array<{ uri: string; label: string; type: string }>; + }; + } | { command: "openCellLabelImporter"; } | { command: "openCodexMigrationTool"; } | { command: "navigateToMainMenu"; } diff --git a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx index b3960bf3b..8f2cca040 100644 --- a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx +++ b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx @@ -508,11 +508,15 @@ const CellList: React.FC = ({ allCells: QuillCellContent[], fallbackCells?: QuillCellContent[] ): number => { + // Use cellMarkers[0] (UUID) for finding the cell's position, not getCellIdentifier + // getCellIdentifier may return non-unique values (e.g., Biblica imports where multiple + // cells share the same first globalReference due to verse array accumulation) + const cellUuid = cell.cellMarkers?.[0]; const cellIdentifier = getCellIdentifier(cell); - if (!cellIdentifier) return 1; // Fallback if no identifier + if (!cellUuid) return 1; // Fallback if no UUID const cellIndex = allCells.findIndex( - (unit) => getCellIdentifier(unit) === cellIdentifier + (unit) => unit.cellMarkers?.[0] === cellUuid ); // If not found in full document (e.g. state out of sync after opening a cell), use visible list @@ -663,6 +667,21 @@ const CellList: React.FC = ({ } } + // For cells with cellLabel but no verse-level global references (e.g., Biblica importer cells before verses), + // use the cellLabel instead of chapter-based verse number + // Biblica cells before verses have globalReferences like ["GEN"] (book only), while cells with verses have ["GEN 1:34"] (book chapter:verse) + const globalRefs = cell.data?.globalReferences; + const hasGlobalRefs = globalRefs && Array.isArray(globalRefs) && globalRefs.length > 0; + const hasVerseLevelRefs = hasGlobalRefs && globalRefs.some((ref: string) => { + // Check if reference contains chapter:verse format (e.g., "GEN 1:34" or "GEN 1:1") + return typeof ref === 'string' && /\d+:\d+/.test(ref); + }); + + // If cell has a label but no verse-level references, use the label (for Biblica cells before verses) + if (cell.cellLabel && !hasVerseLevelRefs) { + return cell.cellLabel; + } + // Get chapter-based verse number (skipping paratext cells). // Pass currentCellsArray as fallback so line numbers stay correct when fullDocumentTranslationUnits // is temporarily out of sync (e.g. after clicking a cell before prev/next refreshes state). 
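// Aside, not part of the diff: the verse-level reference check above in miniature,
// with invented sample references.
const hasVerseLevelRef = (refs: unknown): boolean =>
    Array.isArray(refs) && refs.some((r) => typeof r === "string" && /\d+:\d+/.test(r));

hasVerseLevelRef(["GEN"]);      // false -> a labeled cell falls back to its cellLabel
hasVerseLevelRef(["GEN 1:34"]); // true  -> chapter-based verse numbering applies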
diff --git a/webviews/codex-webviews/src/NavigationView/index.tsx b/webviews/codex-webviews/src/NavigationView/index.tsx index d5a17cff6..0ac9d99d3 100644 --- a/webviews/codex-webviews/src/NavigationView/index.tsx +++ b/webviews/codex-webviews/src/NavigationView/index.tsx @@ -456,6 +456,24 @@ function NavigationView() { })); }; + const handleDeleteCorpusMarker = (item: CodexItem) => { + const displayName = + item.children?.[0]?.corpusMarker || + formatLabel(item.label, state.bibleBookMap || new Map()); + vscode.postMessage({ + command: "deleteCorpusMarker", + content: { + corpusLabel: item.label, + displayName, + children: item.children?.map((c) => ({ + uri: c.uri, + label: c.label, + type: c.type, + })) ?? [], + }, + }); + }; + const handleRenameModalClose = () => { setState((prev) => ({ ...prev, @@ -762,18 +780,32 @@ function NavigationView() { )} {item.type === "corpus" && ( - + <> + + + )} {!isGroup && ( - -
+ )} - + {isTranslationImport ? `Tell us which column contains the translations for "${selectedSource?.name}"` - : "Tell us which columns contain your content. Optional: add an Attachments column with audio URLs to auto-attach audio to each cell."} + : <> + Tell us which columns contain your content. Explanation of column types: + {"\n\n"} + • Verse References: (ID column) used for cross-references and annotations + {"\n\n"} + • Source Content: your source text per row + {"\n\n"} + • Attachments: audio URLs separated by comma, semicolon, or space. + } @@ -877,7 +919,7 @@ export const SpreadsheetImporterForm: React.FC = (props)
- Global References + Verse References
{!isTranslationImport && ( @@ -933,7 +975,7 @@ export const SpreadsheetImporterForm: React.FC = (props) {getColumnTypeCount("globalReferences") > 0 && ( - Global References + Verse References )} {getColumnTypeCount("source") > 0 && ( @@ -964,7 +1006,7 @@ export const SpreadsheetImporterForm: React.FC = (props) )}
- diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/cellMetadata.ts similarity index 66% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/cellMetadata.ts index 5ee0f903b..c3e7b0b9e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/cellMetadata.ts @@ -14,7 +14,10 @@ import { v4 as uuidv4 } from 'uuid'; export interface SpreadsheetCellMetadataParams { originalContent: string; rowIndex: number; - originalRow: string[]; + /** The full original row values (all columns) */ + originalRowValues: string[]; + /** The index of the source content column */ + sourceColumnIndex: number; fileName: string; globalReferences?: string[]; } @@ -22,9 +25,10 @@ export interface SpreadsheetCellMetadataParams { /** * Creates metadata for a Spreadsheet cell * Always generates a UUID for the cell ID. + * Stores the full original row to enable round-trip export. */ export function createSpreadsheetCellMetadata(params: SpreadsheetCellMetadataParams): { metadata: any; cellId: string; } { - const { originalContent, rowIndex, originalRow, fileName, globalReferences } = params; + const { originalContent, rowIndex, originalRowValues, sourceColumnIndex, fileName, globalReferences } = params; const finalCellId = uuidv4(); @@ -36,7 +40,10 @@ export function createSpreadsheetCellMetadata(params: SpreadsheetCellMetadataPar edits: [], data: { rowIndex, - originalRow, + /** Full original row values for round-trip export */ + originalRowValues, + /** Index of the source column that contains the translatable content */ + sourceColumnIndex, originalContent, globalReferences: (globalReferences || []).map((r) => String(r).trim()).filter(Boolean), }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx similarity index 92% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx index eada7eeb5..24008bac1 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx @@ -11,7 +11,6 @@ export const spreadsheetImporterPlugin: ImporterPlugin = { component: SpreadsheetImporterForm, supportedExtensions: ["csv", "tsv"], supportedMimeTypes: ["text/csv", "text/tab-separated-values", "application/csv"], - tags: ["Structured", "Data", "Translation"], enabled: true, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/parser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/parser.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/parser.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/parser.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter.ts new file mode 100644 index 000000000..b37059bb6 --- /dev/null +++ 
b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter.ts
@@ -0,0 +1,325 @@
+/**
+ * Spreadsheet Exporter - True Round-trip Export
+ *
+ * Exports codex notebooks back to CSV/TSV format with translations.
+ * Uses the original file content stored during import, only replacing
+ * the source column content with translations while keeping everything
+ * else exactly the same.
+ *
+ * Supports both spreadsheet-csv and spreadsheet-tsv importer types.
+ */
+
+export interface SpreadsheetCell {
+    id: string;
+    value: string;
+    metadata: {
+        id?: string;
+        data?: {
+            rowIndex?: number;
+            originalRowValues?: string[];
+            sourceColumnIndex?: number;
+            originalContent?: string;
+            globalReferences?: string[];
+        };
+    };
+}
+
+export interface SpreadsheetNotebookMetadata {
+    delimiter?: string;
+    originalFileName?: string;
+    originalFileContent?: string;
+    columnHeaders?: string[];
+    sourceColumnIndex?: number;
+    columnCount?: number;
+    importerType?: string;
+}
+
+/**
+ * Parse a CSV/TSV line with proper quote handling
+ */
+function parseCSVLine(line: string, delimiter: string): string[] {
+    const result: string[] = [];
+    let current = '';
+    let inQuotes = false;
+    let i = 0;
+
+    while (i < line.length) {
+        const char = line[i];
+        const nextChar = line[i + 1];
+
+        if (char === '"') {
+            if (inQuotes && nextChar === '"') {
+                current += '"';
+                i += 2;
+            } else {
+                inQuotes = !inQuotes;
+                i++;
+            }
+        } else if (char === delimiter && !inQuotes) {
+            result.push(current);
+            current = '';
+            i++;
+        } else {
+            current += char;
+            i++;
+        }
+    }
+
+    result.push(current);
+    return result;
+}
+
+/**
+ * Escape a field value for CSV/TSV output
+ */
+function escapeField(value: string, delimiter: string): string {
+    if (value === null || value === undefined) return '';
+    const strValue = String(value);
+
+    const needsQuotes = strValue.includes(delimiter) ||
+        strValue.includes('"') ||
+        strValue.includes('\n') ||
+        strValue.includes('\r');
+
+    if (needsQuotes) {
+        const escaped = strValue.replace(/"/g, '""');
+        return `"${escaped}"`;
+    }
+
+    return strValue;
+}
+
+/**
+ * Remove HTML tags from content (translations might have HTML)
+ */
+function stripHtmlTags(html: string): string {
+    if (!html) return '';
+    return html
+        .replace(/<[^>]*>/g, '')
+        .replace(/&nbsp;/g, ' ')
+        .replace(/&amp;/g, '&')
+        .replace(/&lt;/g, '<')
+        .replace(/&gt;/g, '>')
+        .replace(/&quot;/g, '"')
+        .replace(/&#39;/g, "'")
+        .trim();
+}
+
+/**
+ * Export codex cells to spreadsheet format (CSV or TSV)
+ *
+ * TRUE ROUND-TRIP EXPORT:
+ * - Uses the original file content stored during import
+ * - Keeps the HEADER ROW exactly as it was (no changes)
+ * - Replaces ONLY the source column in DATA ROWS with translations
+ * - Preserves everything else exactly as it was
+ */
+export function exportSpreadsheetWithTranslations(
+    cells: SpreadsheetCell[],
+    metadata: SpreadsheetNotebookMetadata
+): string {
+    const originalFileContent = metadata.originalFileContent;
+    const sourceColumnIndex = metadata.sourceColumnIndex;
+
+    // Determine delimiter from importerType or metadata
+    let delimiter = metadata.delimiter || ',';
+    if (metadata.importerType === 'spreadsheet-tsv') {
+        delimiter = '\t';
+    } else if (metadata.importerType === 'spreadsheet-csv') {
+        delimiter = ',';
+    }
+
+    console.log(`[Spreadsheet Export] importerType: ${metadata.importerType}, delimiter: "${delimiter === '\t' ? 'TAB' : delimiter}"`);
+
+    // Build a map of rowIndex -> translation
+    const translationsByRow = new Map<number, string>();
+    for (const cell of cells) {
+        const rowIndex = cell.metadata?.data?.rowIndex;
+        const translation = stripHtmlTags(cell.value || '');
+
+        if (typeof rowIndex === 'number' && translation) {
+            translationsByRow.set(rowIndex, translation);
+        }
+    }
+
+    console.log(`[Spreadsheet Export] Built translation map with ${translationsByRow.size} translations`);
+    console.log(`[Spreadsheet Export] originalFileContent: ${originalFileContent ? 'found' : 'missing'}, sourceColumnIndex: ${sourceColumnIndex}`);
+
+    // If we have the original file content, do true round-trip
+    if (originalFileContent) {
+        // Default to column index 2 (third column, typically "Transcrição") if not specified
+        const effectiveSourceColumnIndex = typeof sourceColumnIndex === 'number' ? sourceColumnIndex : 2;
+
+        console.log(`[Spreadsheet Export] Using original file content for true round-trip export`);
+        console.log(`[Spreadsheet Export] Effective source column index: ${effectiveSourceColumnIndex}`);
+        console.log(`[Spreadsheet Export] Original content length: ${originalFileContent.length} chars`);
+
+        // Remove BOM if present (UTF-8 BOM: EF BB BF)
+        let cleanContent = originalFileContent;
+        if (cleanContent.charCodeAt(0) === 0xFEFF) {
+            cleanContent = cleanContent.substring(1);
+            console.log(`[Spreadsheet Export] Removed BOM from content`);
+        }
+
+        // Handle both Unix (\n) and Windows (\r\n) line endings
+        const lines = cleanContent.split(/\r?\n/);
+        const outputLines: string[] = [];
+
+        console.log(`[Spreadsheet Export] File has ${lines.length} lines`);
+
+        // First line is ALWAYS the header - keep it EXACTLY as is
+        if (lines.length > 0) {
+            const headerLine = lines[0];
+            // Keep header line unchanged - DO NOT parse or modify it
+            outputLines.push(headerLine);
+            console.log(`[Spreadsheet Export] Preserved header (${headerLine.length} chars): "${headerLine.substring(0, 100)}${headerLine.length > 100 ? '...'
: ''}"`); + } + + // Process data rows (skip first line which is header) + for (let i = 1; i < lines.length; i++) { + const line = lines[i]; + + // Skip empty lines at the end + if (!line.trim() && i === lines.length - 1) { + continue; + } + + // Skip completely empty lines + if (!line.trim()) { + outputLines.push(line); + continue; + } + + // Data row index (0-based, excluding header) + const dataRowIndex = i - 1; + + // Check if we have a translation for this row + const translation = translationsByRow.get(dataRowIndex); + + if (translation) { + // Parse the line to replace the source column + const fields = parseCSVLine(line, delimiter); + + if (effectiveSourceColumnIndex < fields.length) { + // Replace the source column with the translation + fields[effectiveSourceColumnIndex] = translation; + } + + // Rebuild the line with proper escaping + const outputLine = fields.map(f => escapeField(f, delimiter)).join(delimiter); + outputLines.push(outputLine); + } else { + // No translation for this row - keep it exactly as is + outputLines.push(line); + } + } + + console.log(`[Spreadsheet Export] Output ${outputLines.length} lines (1 header + ${outputLines.length - 1} data rows)`); + return outputLines.join('\n'); + } + + // Fallback: reconstruct from cell metadata (for legacy imports without originalFileContent) + console.log(`[Spreadsheet Export] Fallback: reconstructing from cell metadata`); + + const rows: string[] = []; + const columnHeaders = metadata.columnHeaders; + + // Add header row if available + if (columnHeaders && columnHeaders.length > 0) { + const headerRow = columnHeaders.map(h => escapeField(h, delimiter)); + rows.push(headerRow.join(delimiter)); + } + + // Sort cells by rowIndex + const sortedCells = [...cells].sort((a, b) => { + const aIndex = a.metadata?.data?.rowIndex ?? 0; + const bIndex = b.metadata?.data?.rowIndex ?? 0; + return aIndex - bIndex; + }); + + // Build data rows + for (const cell of sortedCells) { + const cellData = cell.metadata?.data; + const originalRowValues = cellData?.originalRowValues; + const cellSourceColumnIndex = cellData?.sourceColumnIndex ?? 
sourceColumnIndex; + const translation = stripHtmlTags(cell.value || ''); + + if (originalRowValues && originalRowValues.length > 0) { + const rowValues = [...originalRowValues]; + + if (typeof cellSourceColumnIndex === 'number' && cellSourceColumnIndex < rowValues.length) { + if (translation) { + rowValues[cellSourceColumnIndex] = translation; + } + } + + const escapedRow = rowValues.map(v => escapeField(v, delimiter)); + rows.push(escapedRow.join(delimiter)); + } else { + // Minimal fallback + const originalContent = cellData?.originalContent || ''; + const globalRefs = cellData?.globalReferences || []; + + const simpleRow: string[] = []; + if (globalRefs.length > 0) { + simpleRow.push(escapeField(globalRefs.join('; '), delimiter)); + } + simpleRow.push(escapeField(translation || originalContent, delimiter)); + + rows.push(simpleRow.join(delimiter)); + } + } + + return rows.join('\n'); +} + +/** + * Determine the appropriate file extension based on importer type or original file + */ +export function getSpreadsheetExtension(originalFileName: string | undefined, delimiter: string, importerType?: string): string { + // Check importer type first + if (importerType === 'spreadsheet-tsv') { + return 'tsv'; + } + if (importerType === 'spreadsheet-csv') { + return 'csv'; + } + + // Check original filename + if (originalFileName) { + const ext = originalFileName.toLowerCase().split('.').pop(); + if (ext === 'csv' || ext === 'tsv') { + return ext; + } + } + + // Default based on delimiter + return delimiter === '\t' ? 'tsv' : 'csv'; +} + +/** + * Determine delimiter from importer type, original file extension, or metadata + */ +export function getDelimiterFromMetadata(metadata: any): string { + // Check importer type first + if (metadata?.importerType === 'spreadsheet-tsv') { + return '\t'; + } + if (metadata?.importerType === 'spreadsheet-csv') { + return ','; + } + + // Check explicit delimiter in metadata + if (metadata?.delimiter) { + return metadata.delimiter; + } + + // Check original filename extension + const originalFileName = metadata?.originalFileName || ''; + if (originalFileName.toLowerCase().endsWith('.tsv')) { + return '\t'; + } + + // Default to comma (CSV) + return ','; +} diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/types.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/types.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/types.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/types.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx index 159c4b8d9..f73e9dcdc 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx @@ -25,6 +25,7 @@ import { ArrowLeft, BookOpen } from 'lucide-react'; +import { v4 as uuidv4 } from 'uuid'; import { IDMLParser } from './biblicaParser'; import { HTMLMapper } from './htmlMapper'; import { createProcessedCell, sanitizeFileName, createStandardCellId, addMilestoneCellsToNotebookPair } from '../../utils/workflowHelpers'; @@ -724,16 +725,14 @@ export const BiblicaImporterForm: React.FC = ({ addDebugLog(`Simplified note cells count: ${simplifiedNoteCells.length}`); - const baseName = 
sanitizeFileName(studyBibleFile.name.replace(/\.idml$/i, ''));
-            const notesNotebookName = sanitizeFileName(`${baseName}-notes`);
-            // Add -biblica suffix to originalFileName to match naming convention (e.g., "mat-john.idml" -> "mat-john-biblica.idml")
-            // This ensures the saved file in attachments matches what the exporter will look for
-            const originalFileName = studyBibleFile.name.replace(
-                /\.idml$/i,
-                "-biblica.idml"
-            );
-            addDebugLog(`Base name: "${baseName}"`);
-            addDebugLog(`Notes notebook name: "${notesNotebookName}"`);
+            // Remove .idml extension and any "-notes" or "_notes" suffix from filename
+            const rawBaseName = studyBibleFile.name.replace(/\.idml$/i, '');
+            const cleanBaseName = rawBaseName.replace(/[-_]?notes$/i, '');
+            const baseName = sanitizeFileName(cleanBaseName);
+            // Use the original file name as-is - importer type is stored in metadata
+            const originalFileName = studyBibleFile.name;
+            addDebugLog(`Raw base name: "${rawBaseName}"`);
+            addDebugLog(`Clean base name (notes removed): "${baseName}"`);
             addDebugLog(`Original file name: "${originalFileName}"`);
 
             // Create notebook pair for notes only
@@ -744,10 +743,10 @@
             if (simplifiedNoteCells.length > 0) {
                 notebookPairs.push({
                     source: {
-                        name: notesNotebookName,
+                        name: baseName,
                         cells: simplifiedNoteCells,
                         metadata: {
-                            id: `biblica-notes-source-${Date.now()}`,
+                            id: uuidv4(),
                             originalFileName: originalFileName,
                             sourceFile: originalFileName,
                             originalFileData: arrayBuffer,
@@ -771,7 +770,7 @@
                 }
             },
             codex: {
-                name: notesNotebookName,
+                name: baseName,
                 cells: simplifiedNoteCells.map(cell => ({
                     id: cell.id,
                     content: '', // Empty codex for notes
@@ -782,7 +781,7 @@
                 }
             })),
             metadata: {
-                id: `biblica-notes-codex-${Date.now()}`,
+                id: uuidv4(),
                 originalFileName: originalFileName,
                 sourceFile: originalFileName,
                 importerType: 'biblica',
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx
index 53a8cd2ff..3a357e6d8 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx
@@ -17,5 +17,4 @@
     supportedExtensions: ['idml'],
     supportedMimeTypes: ['application/vnd.adobe.indesign-idml-package'],
     enabled: true,
-    tags: ['Bible', 'Biblica', 'Round-trip'],
 };
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts
index 64bf6deb4..d93d47330 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts
@@ -1,3 +1,4 @@
+import { v4 as uuidv4 } from 'uuid';
 import {
     ProcessedCell,
     ProcessedNotebook,
@@ -544,7 +545,7 @@ export const createNotebookPair =
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/InDesignImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/InDesignImporterForm.tsx
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/InDesignImporterForm.tsx
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/InDesignImporterForm.tsx
@@ ... @@ export const InDesignImporterForm: React.FC = ({
                     name: baseName,
                     cells: simplifiedCells,
                     metadata: {
-                        id: `indesign-source-${Date.now()}`,
+                        id: uuidv4(),
                         originalFileName: selectedFile.name,
                         sourceFile: selectedFile.name,
                         // Pass the original file bytes so the provider can persist it under .project/attachments/originals
@@ -253,7 +254,7 @@ export const InDesignImporterForm: React.FC = ({
                     }
                 })),
                 metadata: {
-                    id: `indesign-codex-${Date.now()}`,
+                    id: uuidv4(),
                     originalFileName: selectedFile.name,
                     sourceFile:
selectedFile.name, importerType: 'indesign', diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx index 88c044881..71c716f86 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx @@ -17,5 +17,4 @@ export const indesignImporterPlugin: ImporterPlugin = { supportedExtensions: ['idml'], supportedMimeTypes: ['application/vnd.adobe.indesign-idml-package'], enabled: true, - tags: ['Essential', 'Documents', 'Adobe', 'Professional', 'RoundTrip'], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx index ab34a1e44..919e17c8f 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx @@ -1,4 +1,5 @@ import React, { useState, useCallback, useEffect } from "react"; +import { v4 as uuidv4 } from 'uuid'; import { ImporterComponentProps, AlignedCell, @@ -245,7 +246,7 @@ export const MaculaBibleImporterForm: React.FC = (props) }; }), metadata: { - id: notebookName, + id: uuidv4(), originalFileName: `${fullBookName}.macula`, // Use full name instead of code sourceFile: `${fullBookName}.macula`, importerType: "macula", @@ -268,6 +269,10 @@ export const MaculaBibleImporterForm: React.FC = (props) ...cell, content: "", // Empty for codex })), + metadata: { + ...sourceNotebook.metadata, + id: uuidv4(), + }, }; const notebookPair = { diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx index aa58d6a7b..afdace697 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx @@ -10,5 +10,4 @@ export const maculaBibleImporterPlugin: ImporterPlugin = { component: MaculaBibleImporterForm, supportedExtensions: [], // No file extensions - this downloads remotely enabled: true, - tags: ["Specialized", "Biblical", "Original Languages", "Hebrew", "Greek"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts index a1b49e7ae..1c44c0392 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -315,7 +316,7 @@ export const parseFile = async ( name: baseName, cells, metadata: { - id: `source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'markdown', @@ -359,7 +360,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `codex-${Date.now()}`, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts index 905a02d94..9ff5096f7 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts +++ 
b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -278,7 +279,7 @@ const downloadObsRepository = async ( name: storyName, cells: storyCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-source`, + id: uuidv4(), originalFileName: storyFile.name, sourceFile: storyFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -308,7 +309,7 @@ const downloadObsRepository = async ( name: storyName, cells: codexCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-codex`, + id: uuidv4(), originalFileName: storyFile.name, sourceFile: storyFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -535,7 +536,7 @@ const parseObsMarkdown = async ( name: baseName, cells, metadata: { - id: `obs-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, originalFileData: arrayBuffer, // Store original file for export - system will save to .project/attachments/originals/ @@ -567,7 +568,7 @@ const parseObsMarkdown = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `obs-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate the original file data in codex originalFileData: undefined, }, @@ -818,7 +819,7 @@ const parseObsZip = async ( name: storyName, cells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-source`, + id: uuidv4(), originalFileName: markdownFile.name, sourceFile: markdownFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -842,7 +843,7 @@ const parseObsZip = async ( name: storyName, cells: codexCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-codex`, + id: uuidv4(), originalFileName: markdownFile.name, sourceFile: markdownFile.name, corpusMarker: 'obs', // Enable round-trip export diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx index c7d9d2074..0194e0400 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx @@ -11,5 +11,4 @@ export const obsImporterPlugin: ImporterPlugin = { component: ObsImporterForm, supportedExtensions: ["md", "zip"], enabled: true, - tags: ["stories", "download", "repository"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts index 3c488dd65..77c9acb0b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts @@ -71,7 +71,6 @@ export function createPdfCellMetadata(params: PdfCellMetadataParams): { metadata // Import metadata importTimestamp: new Date().toISOString(), - corpusMarker: 'pdf', importerVersion: '1.0.0', }, } diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts index cb3d4636a..677e69d63 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -404,195 +405,239 @@ export const validateFile = async (file: File): 
Promise => }; /** - * Parses a PDF file for non-Bible text content + * Converts PDF to DOCX via extension host */ -export const parseFile = async ( - file: File, - onProgress?: ProgressCallback -): Promise => { - try { - onProgress?.(createProgress('Reading File', 'Reading PDF file...', 10)); - - // Read file as ArrayBuffer to store original for round-trip export - const arrayBuffer = await file.arrayBuffer(); - - onProgress?.(createProgress('Extracting Text', 'Extracting text from PDF...', 30)); - - const textContent = await extractTextViaExtension(file); - - onProgress?.(createProgress('Processing Content', 'Processing extracted text...', 50)); - - // Split content by paragraphs (double newlines) and HTML breaks - // PDFs preserve paragraph breaks which represent natural text units - // Falls back to sentence splitting only if no paragraph breaks are found - const segments = splitPdfContentIntoSegments(textContent); - - // Validate that we have segments - if (!segments || segments.length === 0) { - throw new Error('No content segments found in PDF. The PDF may be empty or contain only images.'); - } - - // Log for debugging - console.log(`[PDF Importer] Split PDF into ${segments.length} segments`); - - onProgress?.(createProgress('Creating Cells', 'Creating cells from text segments...', 70)); - - // Filter out empty segments and create cells - const validSegments = segments.filter(segment => segment && segment.trim().length > 0); +async function convertPdfToDocxViaExtension(file: File): Promise { + return new Promise((resolve, reject) => { + try { + const requestId = `pdf-to-docx-${Date.now()}-${Math.random().toString(36).slice(2)}`; - if (validSegments.length === 0) { - throw new Error('No valid content segments found in PDF after filtering.'); - } + const cleanup = () => window.removeEventListener('message', onMessage as any); - // Create cells for each segment - const cells = await Promise.all( - validSegments.map(async (segment, index) => { - // Ensure we have valid content - const cleanText = segment - .replace(/[\r\n]+/g, ' ') - .replace(/\s+/g, ' ') - .trim(); - - if (!cleanText || cleanText.length === 0) { - console.warn(`[PDF Importer] Skipping empty segment at index ${index}`); - return null; + const onMessage = (event: MessageEvent) => { + const data = (event && event.data) || {}; + if (data && data.command === 'convertPdfToDocxResult' && data.requestId === requestId) { + cleanup(); + if (data.success) { + try { + // For large files, the extension host saves the file and sends the path + // For smaller files, it sends base64 data + if (data.isLargeFile && data.docxFilePath) { + // Request the file from extension host using file path + (window as any).vscodeApi?.postMessage({ + command: 'readFileFromPath', + requestId: `read-docx-${requestId}`, + filePath: data.docxFilePath + }); + + // Set up listener for file data + const fileReaderCleanup = () => window.removeEventListener('message', fileReaderHandler as any); + const fileReaderHandler = (fileEvent: MessageEvent) => { + const fileData = (fileEvent && fileEvent.data) || {}; + if (fileData.command === 'readFileFromPathResult' && fileData.requestId === `read-docx-${requestId}`) { + fileReaderCleanup(); + if (fileData.success && fileData.fileData) { + // Convert base64 to File object + const base64 = fileData.fileData; + const binaryString = atob(base64); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + const docxFileName = 
file.name.replace(/\.pdf$/i, '.docx'); + const docxFile = new File([bytes], docxFileName, { + type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + lastModified: file.lastModified + }); + + resolve(docxFile); + } else { + reject(new Error(fileData.error || 'Failed to read DOCX file from path')); + } + } + }; + + window.addEventListener('message', fileReaderHandler as any); + + // Timeout for file read + setTimeout(() => { + fileReaderCleanup(); + reject(new Error('Timeout reading DOCX file from workspace')); + }, 60000); + } else { + // Standard base64 path for smaller files + const base64 = data.docxBase64; + + if (!base64 || typeof base64 !== 'string') { + throw new Error('Invalid base64 data received from conversion'); + } + + // Validate base64 string (basic check) + if (!/^[A-Za-z0-9+/]*={0,2}$/.test(base64.replace(/\s/g, ''))) { + throw new Error('Invalid base64 encoding format'); + } + + const binaryString = atob(base64); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + // Create File object with .docx extension + const docxFileName = file.name.replace(/\.pdf$/i, '.docx'); + const docxFile = new File([bytes], docxFileName, { + type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + lastModified: file.lastModified + }); + + resolve(docxFile); + } + } catch (decodeError) { + reject(new Error(`Failed to decode DOCX file: ${decodeError instanceof Error ? decodeError.message : 'Unknown error'}`)); + } + } else { + reject(new Error(data.error || 'Failed to convert PDF to DOCX')); + } } + }; - // Create cell metadata (generates UUID internally) - const { cellId, metadata: cellMetadata } = createPdfCellMetadata({ - originalContent: segment, - cellLabel: (index + 1).toString(), - segmentIndex: index, - fileName: file.name, - fileSize: file.size, - }); + window.addEventListener('message', onMessage as any); - // Get cleaned text from metadata - const cleanedText = cellMetadata.originalText || cleanText; + // Read PDF as base64 + const reader = new FileReader(); + reader.onerror = () => { + cleanup(); + reject(new Error('Failed to read PDF file')); + }; + reader.onload = () => { + const dataUrl = (reader.result as string) || ''; + const base64 = dataUrl.includes(',') ? dataUrl.split(',')[1] : dataUrl; + + (window as any).vscodeApi?.postMessage({ + command: 'convertPdfToDocx', + requestId, + pdfBase64: base64, + }); + }; + + setTimeout(() => reader.readAsDataURL(file), 0); - // Create HTML content with paragraph semantics - const htmlContent = `
-                        <p>
-                            ${escapeHtml(cleanedText)}
-                        </p>
`; + // Safety timeout - increased for large PDFs with CMYK conversion + setTimeout(() => { + cleanup(); + reject(new Error('PDF to DOCX conversion timed out after 10 minutes. Large PDFs with CMYK images may take longer. Please try again or use a smaller file.')); + }, 600000); // 10 minutes timeout for large files with CMYK conversion + } catch (err) { + reject(err instanceof Error ? err : new Error('Failed to request PDF to DOCX conversion')); + } + }); +} - const cell = createProcessedCell(cellId, htmlContent, { - type: 'text', - ...cellMetadata, - } as any); +/** + * Parses a PDF file by converting it to DOCX first, then using DOCX importer + * This approach provides better layout preservation and round-trip fidelity + */ +export const parseFile = async ( + file: File, + onProgress?: ProgressCallback +): Promise => { + try { + onProgress?.(createProgress('Converting PDF', 'Converting PDF to DOCX format...', 10)); - // Extract and process images from this cell (if any) - const images = await extractImagesFromHtml(htmlContent); - cell.images = images; + // Step 1: Convert PDF to DOCX using pdf2docx + const docxFile = await convertPdfToDocxViaExtension(file); - return cell; - }) - ); + onProgress?.(createProgress('Importing DOCX', 'Importing converted DOCX file...', 30)); - // Filter out any null cells (from empty segments) - const validCells = cells.filter((cell): cell is NonNullable => cell !== null); + // Step 2: Import the DOCX file using DOCX importer + const { parseFile: parseDocxFile } = await import('../docx/index'); + const docxResult = await parseDocxFile(docxFile, (progress) => { + // Map DOCX import progress (30-90%) to overall progress (30-90%) + const mappedProgress = 30 + (progress.progress || 0) * 0.6; + onProgress?.(createProgress(progress.stage || 'Importing DOCX', progress.message || '', mappedProgress)); + }); - if (validCells.length === 0) { - throw new Error('No valid cells created from PDF content. 
All segments were empty.'); + if (!docxResult.success || !docxResult.notebookPair) { + throw new Error('DOCX import failed after PDF conversion'); } - onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 90)); + // Step 3: Override corpusMarker to "pdf" while keeping all DOCX structure + const sourceNotebook = docxResult.notebookPair.source; + const codexNotebook = docxResult.notebookPair.codex; + + // For large files, don't store ArrayBuffers in metadata to avoid memory issues + // Instead, we'll save them during the write process + // Only store ArrayBuffers for smaller files (< 50MB) + const LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50MB + const shouldStoreBuffers = file.size < LARGE_FILE_THRESHOLD && docxFile.size < LARGE_FILE_THRESHOLD; + + let originalPdfArrayBuffer: ArrayBuffer | undefined; + let convertedDocxArrayBuffer: ArrayBuffer | undefined; + + if (shouldStoreBuffers) { + originalPdfArrayBuffer = await file.arrayBuffer(); + convertedDocxArrayBuffer = await docxFile.arrayBuffer(); + } - // Create source notebook - const sourceNotebook = { - name: sanitizeFileName(file.name), - cells: validCells, - metadata: { - id: `pdf-${Date.now()}`, + // Override metadata to indicate PDF origin + sourceNotebook.metadata = { + ...sourceNotebook.metadata, + id: uuidv4(), + corpusMarker: 'pdf', + importerType: 'pdf', + originalFileName: file.name, // Keep original PDF filename + originalFileData: originalPdfArrayBuffer, // Store original PDF only if small (will be saved to attachments/originals) + fileType: 'pdf', + importContext: { + ...sourceNotebook.metadata.importContext, + importerType: 'pdf', + fileName: file.name, originalFileName: file.name, - originalFileData: arrayBuffer, // Store original PDF for round-trip export - corpusMarker: 'pdf', - importerType: 'pdf', // Alias for corpusMarker (type requirement) - createdAt: new Date().toISOString(), - importContext: { - importerType: 'pdf', - fileName: file.name, - originalFileName: file.name, - fileSize: file.size, - importTimestamp: new Date().toISOString(), - }, - sourceFile: file.name, - totalCells: cells.length, - fileType: 'pdf', - importDate: new Date().toISOString(), - - // Segmentation info - segmentationType: 'sentences', - - // Round-trip metadata - pdfDocumentMetadata: { - originalFileName: file.name, - fileSize: file.size, - totalSentences: cells.length, - importerVersion: '1.0.0', - - // Placeholder for future PDF metadata enhancements - totalPages: undefined, // Will be populated when available - pdfVersion: undefined, - author: undefined, - title: undefined, - creationDate: undefined, - }, - } + fileSize: file.size, + }, + // Preserve DOCX metadata but mark as PDF + pdfDocumentMetadata: { + originalFileName: file.name, + fileSize: file.size, + convertedFromPdf: true, + convertedDocxFileName: docxFile.name, + // Store converted DOCX data for export only if small (will be saved separately) + convertedDocxData: convertedDocxArrayBuffer, + isLargeFile: !shouldStoreBuffers, // Flag to indicate files need to be saved from temp location + }, }; - // Create codex notebook (empty for translation) - const codexNotebook = { - name: `${sanitizeFileName(file.name)}`, - cells: validCells.map(sourceCell => - createProcessedCell(sourceCell.id, '', { - ...sourceCell.metadata, - originalContent: sourceCell.content - }) - ), - metadata: { - id: `pdf-codex-${Date.now()}`, + codexNotebook.metadata = { + ...codexNotebook.metadata, + id: uuidv4(), + corpusMarker: 'pdf', + importerType: 'pdf', + 
originalFileName: file.name, + fileType: 'pdf', + importContext: { + ...codexNotebook.metadata.importContext, + importerType: 'pdf', + fileName: file.name, originalFileName: file.name, - // Don't duplicate the original file data in codex - originalFileData: undefined, - corpusMarker: 'pdf', - importerType: 'pdf', // Alias for corpusMarker (type requirement) - createdAt: new Date().toISOString(), - importContext: { - importerType: 'pdf', - fileName: file.name, - originalFileName: file.name, - fileSize: file.size, - importTimestamp: new Date().toISOString(), - }, - sourceFile: file.name, - totalCells: cells.length, - fileType: 'pdf', - importDate: new Date().toISOString(), - isCodex: true, - - // Segmentation info - segmentationType: 'sentences', - - // Link to source metadata for round-trip - sourceMetadata: sourceNotebook.metadata, - } + fileSize: file.size, + }, }; - // Add milestone cells to the notebook pair - const notebookPairWithMilestones = addMilestoneCellsToNotebookPair({ - source: sourceNotebook, - codex: codexNotebook, - }); + // Note: corpusMarker is only set at notebook-level metadata, not in individual cells + // This keeps the notebook structure clean and avoids duplication onProgress?.(createProgress('Complete', 'PDF import completed successfully!', 100)); return { success: true, - notebookPair: notebookPairWithMilestones, + notebookPair: { + source: sourceNotebook, + codex: codexNotebook, + }, metadata: { - totalCells: validCells.length, + ...docxResult.metadata, fileType: 'pdf', - importDate: new Date().toISOString(), } }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md new file mode 100644 index 000000000..0f0eb00db --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md @@ -0,0 +1,129 @@ +# PDF Conversion Scripts + +These scripts handle PDF↔DOCX conversion with a focus on preserving document formatting, layout, images, and structure. + +## Overview + +Both scripts use a **hybrid approach** that tries multiple conversion methods in order of quality/availability: + +### PDF to DOCX (`pdf_to_docx.py`) +1. **LibreOffice headless** (best quality, free) +2. **pdf2docx** (good for most PDFs) +3. **Rich text extraction** (fallback when others fail) + +### DOCX to PDF (`docx_to_pdf.py`) +1. **LibreOffice headless** (free, no MS Office needed) +2. 
**docx2pdf** (requires Microsoft Word) + +## Installation + +### Required (Basic functionality) +```bash +pip install PyMuPDF python-docx Pillow +``` + +### Recommended (Better quality) +```bash +pip install pdf2docx docx2pdf +``` + +### Highly Recommended (Best quality - FREE) + +**LibreOffice** provides the best conversion quality and is completely free: + +- **Windows**: Download from https://www.libreoffice.org/download/download/ +- **macOS**: `brew install --cask libreoffice` +- **Linux**: `sudo apt install libreoffice` or `sudo dnf install libreoffice` + +## Quality Comparison + +| Method | Layout | Fonts | Images | Tables | Page Breaks | Headers/Footers | +|--------|--------|-------|--------|--------|-------------|-----------------| +| LibreOffice | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | +| pdf2docx | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | +| Rich text extraction | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐ | ❌ | +| docx2pdf (MS Word) | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | + +## Usage + +### Command Line +```bash +# PDF to DOCX +python pdf_to_docx.py input.pdf output.docx + +# DOCX to PDF +python docx_to_pdf.py input.docx output.pdf +``` + +### From Python +```python +import json +from pdf_to_docx import convert_pdf_to_docx +from docx_to_pdf import convert_docx_to_pdf + +# PDF to DOCX +result = convert_pdf_to_docx("input.pdf", "output.docx") +if result["success"]: + print(f"Converted using: {result['method']}") +else: + print(f"Error: {result['error']}") + +# DOCX to PDF +result = convert_docx_to_pdf("input.docx", "output.pdf") +if result["success"]: + print(f"Converted using: {result['method']}") +else: + print(f"Error: {result['error']}") +``` + +## What Gets Preserved + +### PDF → DOCX +- ✅ Text content and flow +- ✅ Font names, sizes, colors +- ✅ Bold, italic, underline +- ✅ Images (with CMYK→RGB conversion) +- ✅ Tables (when using LibreOffice/pdf2docx) +- ✅ Page breaks +- ✅ Line breaks within paragraphs +- ✅ Multi-column layouts (LibreOffice) +- ✅ Headers/footers (LibreOffice) +- ⚠️ Complex vector graphics may be rasterized +- ⚠️ Form fields may not be editable + +### DOCX → PDF +- ✅ All text formatting +- ✅ Images +- ✅ Tables +- ✅ Page layout +- ✅ Headers/footers +- ✅ Hyperlinks + +## Troubleshooting + +### "LibreOffice not found" +Install LibreOffice from https://www.libreoffice.org/ + +### "docx2pdf requires Microsoft Word" +Either: +1. Install LibreOffice (free alternative) +2. Install Microsoft Word (Windows/macOS only) + +### CMYK image errors +The scripts automatically handle CMYK images by: +1. Converting CMYK to RGB using PIL +2. Saving as PNG for compatibility + +### Large file timeouts +Increase the timeout in the Python scripts if working with very large PDFs (default is 10 minutes for PDF→DOCX). 
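+ 
+ ### Example: CMYK→RGB conversion 
+ 
+ A minimal sketch of the CMYK handling described above, using Pillow. The helper name `cmyk_safe_png` is illustrative and not part of these scripts; the same conversion logic lives inside the image extraction in `pdf_to_docx.py`: 
+ 
+ ```python 
+ import io 
+ from PIL import Image 
+ 
+ def cmyk_safe_png(image_bytes: bytes) -> bytes: 
+     """Re-encode image bytes as an RGB PNG, converting from CMYK if needed.""" 
+     img = Image.open(io.BytesIO(image_bytes)) 
+     if img.mode == "CMYK": 
+         img = img.convert("RGB") 
+     buffer = io.BytesIO() 
+     img.save(buffer, format="PNG") 
+     return buffer.getvalue() 
+ ```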
+ +## Dependencies + +| Package | Purpose | Required | +|---------|---------|----------| +| PyMuPDF | PDF parsing and text extraction | Yes | +| python-docx | DOCX creation | Yes | +| Pillow | Image handling (CMYK conversion) | Yes | +| pdf2docx | Direct PDF→DOCX conversion | Recommended | +| docx2pdf | DOCX→PDF via MS Word | Optional | +| LibreOffice | High-quality conversion | **Highly Recommended** | diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py new file mode 100644 index 000000000..a448f45d4 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +DOCX to PDF Converter - Hybrid Approach + +Tries multiple conversion methods in order of availability: +1. LibreOffice headless (free, cross-platform, no MS Office required) +2. docx2pdf (requires Microsoft Word on Windows/macOS) + +This ensures the conversion works on systems without Microsoft Office. +""" + +import sys +import os +import json +import base64 +import tempfile +import shutil +import subprocess +from pathlib import Path + + +def find_libreoffice() -> str | None: + """Find LibreOffice executable on the system.""" + if sys.platform == 'win32': + possible_paths = [ + r"C:\Program Files\LibreOffice\program\soffice.exe", + r"C:\Program Files (x86)\LibreOffice\program\soffice.exe", + os.path.expandvars(r"%PROGRAMFILES%\LibreOffice\program\soffice.exe"), + os.path.expandvars(r"%PROGRAMFILES(X86)%\LibreOffice\program\soffice.exe"), + ] + for path in possible_paths: + if os.path.exists(path): + return path + try: + result = subprocess.run(['where', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip().split('\n')[0] + except: + pass + else: + possible_paths = [ + '/usr/bin/soffice', + '/usr/bin/libreoffice', + '/Applications/LibreOffice.app/Contents/MacOS/soffice', + '/opt/libreoffice/program/soffice', + ] + for path in possible_paths: + if os.path.exists(path): + return path + try: + result = subprocess.run(['which', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + try: + result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + + return None + + +def convert_with_libreoffice(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using LibreOffice headless mode. + This is the preferred method as it doesn't require Microsoft Office. 
+ """ + soffice = find_libreoffice() + if not soffice: + return { + "success": False, + "error": "LibreOffice not found", + "method": "libreoffice" + } + + try: + print(json.dumps({"info": "Converting with LibreOffice..."}), file=sys.stderr) + + # Create temp directory for output + temp_dir = tempfile.mkdtemp(prefix="lo_pdf_") + + try: + # LibreOffice command for DOCX to PDF conversion + cmd = [ + soffice, + "--headless", + "--convert-to", "pdf", + "--outdir", temp_dir, + docx_path + ] + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300 # 5 minute timeout + ) + + if result.returncode != 0: + error_msg = result.stderr or result.stdout or "Unknown error" + return { + "success": False, + "error": f"LibreOffice conversion failed: {error_msg}", + "method": "libreoffice" + } + + # Find the output file + docx_basename = os.path.splitext(os.path.basename(docx_path))[0] + temp_output = os.path.join(temp_dir, f"{docx_basename}.pdf") + + if not os.path.exists(temp_output): + # Try finding any PDF in the output dir + for f in os.listdir(temp_dir): + if f.endswith('.pdf'): + temp_output = os.path.join(temp_dir, f) + break + + if not os.path.exists(temp_output): + return { + "success": False, + "error": "LibreOffice did not create PDF output", + "method": "libreoffice" + } + + # Verify file has content + file_size = os.path.getsize(temp_output) + if file_size == 0: + return { + "success": False, + "error": "LibreOffice created empty PDF", + "method": "libreoffice" + } + + # Move to final location + shutil.move(temp_output, output_path) + + # Read and encode PDF + with open(output_path, 'rb') as f: + pdf_bytes = f.read() + + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + print(json.dumps({"info": f"LibreOffice PDF conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "pdfBase64": pdf_base64, + "outputPath": output_path, + "method": "libreoffice" + } + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + except subprocess.TimeoutExpired: + return { + "success": False, + "error": "LibreOffice conversion timed out", + "method": "libreoffice" + } + except Exception as e: + return { + "success": False, + "error": f"LibreOffice error: {str(e)}", + "method": "libreoffice" + } + + +def convert_with_docx2pdf(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using docx2pdf library. + Requires Microsoft Word on Windows/macOS. + """ + try: + from docx2pdf import convert + except ImportError: + return { + "success": False, + "error": "docx2pdf not installed. Install with: pip install docx2pdf", + "method": "docx2pdf" + } + + try: + print(json.dumps({"info": "Converting with docx2pdf (requires MS Word)..."}), file=sys.stderr) + + convert(docx_path, output_path) + + if not os.path.exists(output_path): + return { + "success": False, + "error": "docx2pdf did not create PDF. 
Is Microsoft Word installed?", + "method": "docx2pdf" + } + + file_size = os.path.getsize(output_path) + if file_size == 0: + return { + "success": False, + "error": "docx2pdf created empty PDF", + "method": "docx2pdf" + } + + # Read and encode PDF + with open(output_path, 'rb') as f: + pdf_bytes = f.read() + + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + print(json.dumps({"info": f"docx2pdf conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "pdfBase64": pdf_base64, + "outputPath": output_path, + "method": "docx2pdf" + } + + except Exception as e: + error_msg = str(e) if str(e) else repr(e) + + # Provide helpful error messages + if any(x in error_msg for x in ["COM", "Word", "win32com", "Microsoft"]): + error_msg += ". docx2pdf requires Microsoft Word to be installed." + + return { + "success": False, + "error": error_msg, + "method": "docx2pdf" + } + + +def convert_docx_to_pdf(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using the best available method. + + Tries methods in order: + 1. LibreOffice headless (preferred, free, no MS Office needed) + 2. docx2pdf (requires Microsoft Word) + """ + print(json.dumps({"info": "Starting DOCX to PDF conversion..."}), file=sys.stderr) + + # Verify input file exists + if not os.path.exists(docx_path): + return { + "success": False, + "error": f"Input DOCX file not found: {docx_path}" + } + + # Ensure output directory exists + output_dir = os.path.dirname(output_path) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + methods_tried = [] + + # Method 1: Try LibreOffice (preferred, free) + print(json.dumps({"info": "Attempting Method 1: LibreOffice..."}), file=sys.stderr) + result = convert_with_libreoffice(docx_path, output_path) + methods_tried.append(f"LibreOffice: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ LibreOffice PDF conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"LibreOffice failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 2: Try docx2pdf (requires MS Word) + print(json.dumps({"info": "Attempting Method 2: docx2pdf..."}), file=sys.stderr) + result = convert_with_docx2pdf(docx_path, output_path) + methods_tried.append(f"docx2pdf: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ docx2pdf conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"docx2pdf failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # All methods failed + return { + "success": False, + "error": f"All conversion methods failed. Install LibreOffice (free) from https://www.libreoffice.org/ or Microsoft Word. 
Tried: {'; '.join(methods_tried)}" + } + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(json.dumps({ + "success": False, + "error": "Usage: docx_to_pdf.py " + })) + sys.exit(1) + + docx_path = sys.argv[1] + output_path = sys.argv[2] + + result = convert_docx_to_pdf(docx_path, output_path) + print(json.dumps(result)) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py new file mode 100644 index 000000000..ef03d9c58 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py @@ -0,0 +1,694 @@ +#!/usr/bin/env python3 +""" +PDF to DOCX Converter - Hybrid Approach + +Tries multiple conversion methods in order of quality: +1. LibreOffice headless (best layout preservation, free) +2. pdf2docx library (good for most PDFs) +3. Rich text extraction fallback (preserves content when others fail) + +Preserves fonts, sizes, colors, images, line breaks, page breaks, +tables, headers/footers, and document structure. +""" + +import sys +import os +import json +import io +import tempfile +import shutil +import subprocess +from pathlib import Path + +# Check for required libraries +try: + import fitz # PyMuPDF + HAS_PYMUPDF = True +except ImportError: + HAS_PYMUPDF = False + +try: + from pdf2docx import Converter + HAS_PDF2DOCX = True +except ImportError: + HAS_PDF2DOCX = False + +try: + from docx import Document + from docx.shared import Pt, Inches, RGBColor, Emu, Twips + from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK + from docx.oxml.ns import qn + from docx.oxml import OxmlElement + HAS_PYTHON_DOCX = True +except ImportError: + HAS_PYTHON_DOCX = False + +try: + from PIL import Image + HAS_PIL = True +except ImportError: + HAS_PIL = False + + +def find_libreoffice() -> str | None: + """Find LibreOffice executable on the system.""" + if sys.platform == 'win32': + # Common Windows paths + possible_paths = [ + r"C:\Program Files\LibreOffice\program\soffice.exe", + r"C:\Program Files (x86)\LibreOffice\program\soffice.exe", + os.path.expandvars(r"%PROGRAMFILES%\LibreOffice\program\soffice.exe"), + os.path.expandvars(r"%PROGRAMFILES(X86)%\LibreOffice\program\soffice.exe"), + ] + for path in possible_paths: + if os.path.exists(path): + return path + # Try PATH + try: + result = subprocess.run(['where', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip().split('\n')[0] + except: + pass + else: + # macOS / Linux + possible_paths = [ + '/usr/bin/soffice', + '/usr/bin/libreoffice', + '/Applications/LibreOffice.app/Contents/MacOS/soffice', + '/opt/libreoffice/program/soffice', + ] + for path in possible_paths: + if os.path.exists(path): + return path + # Try PATH + try: + result = subprocess.run(['which', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + try: + result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + + return None + + +def convert_with_libreoffice(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using LibreOffice headless mode. + This method provides the best layout preservation for most PDFs. 
+ """ + soffice = find_libreoffice() + if not soffice: + return { + "success": False, + "error": "LibreOffice not found. Install from https://www.libreoffice.org/", + "method": "libreoffice" + } + + try: + print(json.dumps({"info": "Converting with LibreOffice (best quality)..."}), file=sys.stderr) + + # Create temp directory for output + temp_dir = tempfile.mkdtemp(prefix="lo_convert_") + + try: + # LibreOffice command for PDF to DOCX conversion + # --infilter specifies PDF import filter + # --convert-to specifies output format + cmd = [ + soffice, + "--headless", + "--infilter=writer_pdf_import", + "--convert-to", "docx:Office Open XML Text", + "--outdir", temp_dir, + pdf_path + ] + + print(json.dumps({"info": f"Running: {' '.join(cmd[:4])}..."}), file=sys.stderr) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=600 # 10 minute timeout for large PDFs + ) + + if result.returncode != 0: + error_msg = result.stderr or result.stdout or "Unknown error" + return { + "success": False, + "error": f"LibreOffice conversion failed: {error_msg}", + "method": "libreoffice" + } + + # Find the output file + pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0] + temp_output = os.path.join(temp_dir, f"{pdf_basename}.docx") + + if not os.path.exists(temp_output): + # Try alternative naming + for f in os.listdir(temp_dir): + if f.endswith('.docx'): + temp_output = os.path.join(temp_dir, f) + break + + if not os.path.exists(temp_output): + return { + "success": False, + "error": "LibreOffice did not create output file", + "method": "libreoffice" + } + + # Check file size + file_size = os.path.getsize(temp_output) + if file_size == 0: + return { + "success": False, + "error": "LibreOffice created empty output file", + "method": "libreoffice" + } + + # Move to final location + shutil.move(temp_output, output_path) + + print(json.dumps({"info": f"LibreOffice conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "libreoffice" + } + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + except subprocess.TimeoutExpired: + return { + "success": False, + "error": "LibreOffice conversion timed out after 10 minutes", + "method": "libreoffice" + } + except Exception as e: + return { + "success": False, + "error": f"LibreOffice error: {str(e)}", + "method": "libreoffice" + } + + +def convert_with_pdf2docx(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using pdf2docx library. + Good for most text-based PDFs. + """ + if not HAS_PDF2DOCX: + return { + "success": False, + "error": "pdf2docx not installed. 
Install with: pip install pdf2docx", + "method": "pdf2docx" + } + + try: + print(json.dumps({"info": "Converting with pdf2docx..."}), file=sys.stderr) + + cv = Converter(pdf_path) + cv.convert(output_path, start=0, end=None) + cv.close() + + # Verify output + if not os.path.exists(output_path): + return { + "success": False, + "error": "pdf2docx did not create output file", + "method": "pdf2docx" + } + + file_size = os.path.getsize(output_path) + if file_size == 0: + return { + "success": False, + "error": "pdf2docx created empty output file", + "method": "pdf2docx" + } + + print(json.dumps({"info": f"pdf2docx conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "pdf2docx" + } + + except Exception as e: + error_msg = str(e) + # Check for known issues + is_recoverable = any(x in error_msg.lower() for x in [ + "pixmap must be grayscale or rgb", + "code=4", + "colorspace", + "cmyk" + ]) + + return { + "success": False, + "error": error_msg, + "method": "pdf2docx", + "recoverable": is_recoverable + } + + +def sanitize_text(text: str) -> str: + """ + Remove control characters that are not valid in XML/DOCX. + Keeps normal whitespace (space, tab, newline, carriage return). + """ + if not text: + return "" + + result = [] + for char in text: + code = ord(char) + # Valid XML chars: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] + if code == 0x9 or code == 0xA or code == 0xD or (code >= 0x20 and code <= 0xD7FF) or (code >= 0xE000 and code <= 0xFFFD): + result.append(char) + elif code < 0x20: + result.append(' ') + + return ''.join(result) + + +def get_rgb_from_color(color_value) -> tuple: + """Convert PyMuPDF color value to RGB tuple.""" + if color_value is None: + return (0, 0, 0) + + if isinstance(color_value, (list, tuple)): + if len(color_value) == 3: + return tuple(int(c * 255) for c in color_value) + elif len(color_value) == 1: + gray = int(color_value[0] * 255) + return (gray, gray, gray) + elif len(color_value) == 4: + c, m, y, k = color_value + r = int(255 * (1 - c) * (1 - k)) + g = int(255 * (1 - m) * (1 - k)) + b = int(255 * (1 - y) * (1 - k)) + return (r, g, b) + elif isinstance(color_value, (int, float)): + if isinstance(color_value, float): + gray = int(color_value * 255) + return (gray, gray, gray) + else: + if color_value == 0: + return (0, 0, 0) + r = (color_value >> 16) & 0xFF + g = (color_value >> 8) & 0xFF + b = color_value & 0xFF + return (r, g, b) + + return (0, 0, 0) + + +def extract_images_from_page(page, page_num: int, temp_dir: str) -> list: + """Extract all images from a PDF page and save them to temp files.""" + images = [] + + try: + image_list = page.get_images(full=True) + + for img_index, img_info in enumerate(image_list): + try: + xref = img_info[0] + base_image = page.parent.extract_image(xref) + if not base_image: + continue + + image_bytes = base_image.get("image") + image_ext = base_image.get("ext", "png") + + if not image_bytes: + continue + + # Convert CMYK to RGB if needed + if HAS_PIL and image_ext in ["jpeg", "jpg"]: + try: + img = Image.open(io.BytesIO(image_bytes)) + if img.mode == "CMYK": + img = img.convert("RGB") + buffer = io.BytesIO() + img.save(buffer, format="PNG") + image_bytes = buffer.getvalue() + image_ext = "png" + except Exception as pil_err: + print(json.dumps({"warning": f"PIL conversion failed: {pil_err}"}), file=sys.stderr) + + image_filename = f"page{page_num}_img{img_index}.{image_ext}" + image_path = os.path.join(temp_dir, image_filename) + + with 
open(image_path, "wb") as img_file: + img_file.write(image_bytes) + + img_rects = page.get_image_rects(xref) + if img_rects: + bbox = img_rects[0] + images.append((image_path, bbox, base_image.get("width", 100), base_image.get("height", 100))) + else: + images.append((image_path, None, base_image.get("width", 100), base_image.get("height", 100))) + + except Exception as img_err: + print(json.dumps({"warning": f"Failed to extract image {img_index}: {img_err}"}), file=sys.stderr) + continue + + except Exception as e: + print(json.dumps({"warning": f"Image extraction error on page {page_num}: {e}"}), file=sys.stderr) + + images.sort(key=lambda x: x[1][1] if x[1] else float('inf')) + return images + + +def add_image_to_doc(doc, image_path: str, width: int = None, height: int = None, max_width_inches: float = 6.0): + """Add an image to the document with appropriate sizing.""" + try: + if width and height: + aspect_ratio = height / width + img_width = min(max_width_inches, width / 96) + img_height = img_width * aspect_ratio + else: + img_width = max_width_inches / 2 + img_height = None + + para = doc.add_paragraph() + run = para.add_run() + + if img_height: + run.add_picture(image_path, width=Inches(img_width), height=Inches(img_height)) + else: + run.add_picture(image_path, width=Inches(img_width)) + + return True + except Exception as e: + print(json.dumps({"warning": f"Failed to add image: {e}"}), file=sys.stderr) + return False + + +def add_formatted_paragraph_with_breaks(doc, runs: list, page_avg_size: float): + """Add a paragraph to the document with formatted runs and line breaks.""" + if not runs: + return + + full_text = ''.join(r[0] for r in runs if not r[6]).strip() + if not full_text: + return + + para = doc.add_paragraph() + + first_non_break = next((r for r in runs if not r[6]), None) + first_size = first_non_break[2] if first_non_break else 12 + is_heading = first_size > page_avg_size * 1.3 + + for run_data in runs: + text, font_name, font_size, color, is_bold, is_italic, is_line_break = run_data + + if is_line_break: + run = para.add_run() + run.add_break(WD_BREAK.LINE) + continue + + if not text: + continue + + run = para.add_run(text) + run.font.size = Pt(font_size) + + try: + run.font.name = font_name + run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + except: + run.font.name = 'Arial' + + if color != (0, 0, 0): + try: + run.font.color.rgb = RGBColor(color[0], color[1], color[2]) + except: + pass + + if is_bold or is_heading: + run.font.bold = True + if is_italic: + run.font.italic = True + + +def extract_text_to_docx(pdf_path: str, output_path: str) -> dict: + """ + Extract text from PDF and create a DOCX with preserved formatting. + Fallback method when other converters fail. + """ + if not HAS_PYTHON_DOCX: + return { + "success": False, + "error": "python-docx not installed. Install with: pip install python-docx", + "method": "rich_text_extraction" + } + + if not HAS_PYMUPDF: + return { + "success": False, + "error": "PyMuPDF not installed. 
Install with: pip install PyMuPDF", + "method": "rich_text_extraction" + } + + temp_dir = tempfile.mkdtemp(prefix="pdf_images_") + + try: + print(json.dumps({"info": "Using rich text extraction fallback..."}), file=sys.stderr) + + pdf_doc = fitz.open(pdf_path) + total_pages = len(pdf_doc) + + doc = Document() + + print(json.dumps({"info": f"Extracting from {total_pages} pages..."}), file=sys.stderr) + + total_images = 0 + + for page_num in range(total_pages): + page = pdf_doc[page_num] + page_height = page.rect.height + + page_images = extract_images_from_page(page, page_num, temp_dir) + total_images += len(page_images) + + text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE) + blocks = text_dict.get("blocks", []) + + # Calculate average font size + all_sizes = [] + for block in blocks: + if block.get("type") == 0: + for line in block.get("lines", []): + for span in line.get("spans", []): + all_sizes.append(span.get("size", 12)) + + page_avg_size = sum(all_sizes) / len(all_sizes) if all_sizes else 12 + + # Combine content items sorted by position + content_items = [] + + for block in blocks: + if block.get("type") == 0: + bbox = block.get("bbox", [0, 0, 0, 0]) + content_items.append(("text", bbox[1], block)) + + for img_path, img_bbox, img_width, img_height in page_images: + y_pos = img_bbox[1] if img_bbox else page_height + content_items.append(("image", y_pos, (img_path, img_width, img_height))) + + content_items.sort(key=lambda x: x[1]) + + # Process content + for item_type, y_pos, item_data in content_items: + if item_type == "text": + block = item_data + lines = block.get("lines", []) + if not lines: + continue + + current_para_runs = [] + last_y1 = None + last_size = None + + for line in lines: + spans = line.get("spans", []) + if not spans: + continue + + line_bbox = line.get("bbox", [0, 0, 0, 0]) + line_y0 = line_bbox[1] + line_y1 = line_bbox[3] + line_height = line_y1 - line_y0 if line_y1 > line_y0 else 12 + + start_new_para = False + add_line_break = False + + if last_y1 is not None: + gap = line_y0 - last_y1 + if gap > line_height * 1.0: + start_new_para = True + elif gap > line_height * 0.2: + add_line_break = True + + first_span = spans[0] + current_size = first_span.get("size", 12) + if last_size is not None and abs(current_size - last_size) > 3: + start_new_para = True + + if start_new_para and current_para_runs: + add_formatted_paragraph_with_breaks(doc, current_para_runs, page_avg_size) + current_para_runs = [] + add_line_break = False + + if add_line_break and current_para_runs: + last_run = current_para_runs[-1] + current_para_runs.append(('\n', last_run[1], last_run[2], last_run[3], last_run[4], last_run[5], True)) + + for span in spans: + text = span.get("text", "") + if not text: + continue + + text = sanitize_text(text) + if not text: + continue + + font_name = span.get("font", "Arial") + font_size = span.get("size", 12) + color = get_rgb_from_color(span.get("color")) + flags = span.get("flags", 0) + is_bold = bool(flags & (1 << 4)) + is_italic = bool(flags & (1 << 1)) + + if '+' in font_name: + font_name = font_name.split('+', 1)[1] + + current_para_runs.append((text, font_name, font_size, color, is_bold, is_italic, False)) + + last_y1 = line_y1 + last_size = current_size + + if current_para_runs: + add_formatted_paragraph_with_breaks(doc, current_para_runs, page_avg_size) + + elif item_type == "image": + img_path, img_width, img_height = item_data + add_image_to_doc(doc, img_path, img_width, img_height) + + # Page break between pages + if page_num 
< total_pages - 1: + doc.add_page_break() + + if (page_num + 1) % 10 == 0 or (page_num + 1) == total_pages: + progress = int((page_num + 1) / total_pages * 100) + print(json.dumps({"info": f"Progress: {page_num + 1}/{total_pages} ({progress}%)"}), file=sys.stderr) + + pdf_doc.close() + doc.save(output_path) + + file_size = os.path.getsize(output_path) + print(json.dumps({"info": f"Rich text extraction complete: {total_pages} pages, {total_images} images, {file_size} bytes"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "rich_text_extraction" + } + + except Exception as e: + import traceback + error_details = traceback.format_exc() + print(json.dumps({"error": f"Text extraction error: {error_details}"}), file=sys.stderr) + return { + "success": False, + "error": f"Text extraction failed: {str(e)}", + "method": "rich_text_extraction" + } + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +def convert_pdf_to_docx(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using the best available method. + + Tries methods in order of quality: + 1. LibreOffice headless (best layout preservation) + 2. pdf2docx (good for most PDFs) + 3. Rich text extraction (fallback) + """ + print(json.dumps({"info": "Starting PDF to DOCX conversion (hybrid approach)..."}), file=sys.stderr) + + # Verify input file exists + if not os.path.exists(pdf_path): + return { + "success": False, + "error": f"Input PDF file not found: {pdf_path}" + } + + methods_tried = [] + + # Method 1: Try LibreOffice (best quality) + print(json.dumps({"info": "Attempting Method 1: LibreOffice..."}), file=sys.stderr) + result = convert_with_libreoffice(pdf_path, output_path) + methods_tried.append(f"LibreOffice: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ LibreOffice conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"LibreOffice failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 2: Try pdf2docx + print(json.dumps({"info": "Attempting Method 2: pdf2docx..."}), file=sys.stderr) + result = convert_with_pdf2docx(pdf_path, output_path) + methods_tried.append(f"pdf2docx: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ pdf2docx conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"pdf2docx failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 3: Rich text extraction fallback + print(json.dumps({"info": "Attempting Method 3: Rich text extraction..."}), file=sys.stderr) + result = extract_text_to_docx(pdf_path, output_path) + methods_tried.append(f"Rich text: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ Rich text extraction successful"}), file=sys.stderr) + return result + + # All methods failed + return { + "success": False, + "error": f"All conversion methods failed. 
Tried: {'; '.join(methods_tried)}" + } + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(json.dumps({ + "success": False, + "error": "Usage: pdf_to_docx.py " + })) + sys.exit(1) + + pdf_path = sys.argv[1] + output_path = sys.argv[2] + + result = convert_pdf_to_docx(pdf_path, output_path) + print(json.dumps(result)) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts index 351b1d7cb..51d247b72 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -150,7 +151,7 @@ export const parseFile = async (file: File, onProgress?: ProgressCallback, optio name: baseName, cells, metadata: { - id: `plaintext-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'plaintext', @@ -186,7 +187,7 @@ export const parseFile = async (file: File, onProgress?: ProgressCallback, optio cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `plaintext-codex-${Date.now()}`, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx index 7b7a08e6c..560b397e5 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx @@ -510,7 +510,7 @@ export const RecursiveTextSplitterForm: React.FC = ({ name: cleanFileName, cells: sourceCells, metadata: { - id: `source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: "smart-segmenter", @@ -528,7 +528,7 @@ export const RecursiveTextSplitterForm: React.FC = ({ name: cleanFileName, cells: codexCells, metadata: { - id: `codex-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: "smart-segmenter", diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx index c185fa8c9..bcccfae1f 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx @@ -16,21 +16,19 @@ import { // import { docxImporterPlugin } from "./docx/index.tsx"; // Old mammoth.js importer import { docxRoundtripImporterPlugin as docxImporterPlugin } from "./docx/experiment/index.tsx"; // New round-trip importer import { markdownImporterPlugin } from "./markdown/index.tsx"; -import { usfmImporterPlugin } from "./usfm/index.tsx"; // Original USFM importer +// import { usfmImporterPlugin } from "./usfm/index.tsx"; // Original USFM importer import { usfmExperimentalImporterPlugin } from "./usfm/experimental/index.tsx"; // Experimental round-trip importer (standalone with headers in chapter 1) import { ebibleDownloadImporterPlugin } from "./ebibleCorpus/index.tsx"; import { maculaBibleImporterPlugin } from "./maculaBible/index.tsx"; import { subtitlesImporterPlugin } from "./subtitles/index.tsx"; import { obsImporterPlugin } from "./obs/index.tsx"; -import { smartSegmenterPlugin } from 
"./recursiveTextSplitter/index.tsx"; +// import { smartSegmenterPlugin } from "./recursiveTextSplitter/index.tsx"; import { paratextImporterPlugin } from "./paratext/index.tsx"; -import { spreadsheetImporterPlugin } from "./spreadsheet/index.tsx"; +import { spreadsheetImporterPlugin } from "./bibleSpredSheet/index.tsx"; import { audioImporterPlugin } from "./audio/index.tsx"; import { biblicaImporterPlugin } from "./biblica/index.tsx"; -// import { biblicaSwapperImporterPlugin } from "./biblica-swapper/index.tsx"; import { tmsImporterPlugin } from "./tms/index.tsx"; -// import { rtfImporterPlugin } from "./rtf/index.tsx"; -import { pdfImporterPlugin } from "./pdf/index.tsx"; +// import { pdfImporterPlugin } from "./pdf/index.tsx"; import { indesignImporterPlugin } from "./indesign/index.tsx"; // Import placeholder components - these will be created for each importer @@ -52,118 +50,105 @@ const createPlaceholderComponent = (name: string) => { export const importerPlugins: ImporterPlugin[] = [ // Essential Tools - General purpose importers for broad appeal // Non-beta importers first - // { - // ...smartSegmenterPlugin, - // name: "Smart Segmenter", - // description: "Works with any text file", - // tags: [...(smartSegmenterPlugin.tags || []), "Essential", "Universal", "Text"], - // }, + // { + // ...smartSegmenterPlugin, + // name: "Smart Segmenter", + // description: "Works with any text file", + // tags: [...(smartSegmenterPlugin.tags || []), "Essential", "Universal", "Text"], + // }, { ...audioImporterPlugin, name: "Audio", description: "Import audio files with backend processing - supports large files", - tags: [...(audioImporterPlugin.tags || []), "Essential", "Media", "Audio"], + tags: ["Essential", "Media", "Audio"], }, { ...markdownImporterPlugin, name: "Markdown", - description: "GitHub-style markdown files", - tags: [...(markdownImporterPlugin.tags || []), "Essential", "Documentation", "GitHub"], + description: "GitHub-style markdown files with round-trip export support", + tags: ["Essential", "Documentation", "GitHub", "Round-trip"], }, { ...subtitlesImporterPlugin, name: "Subtitles", description: "Video captions with timestamps", - tags: [...(subtitlesImporterPlugin.tags || []), "Essential", "Media", "Video"], + tags: ["Essential", "Media", "Video"], }, { ...tmsImporterPlugin, name: "TMS Files", - description: "Translation memory and localization files (TMX/XLIFF)", - tags: [...(tmsImporterPlugin.tags || []), "Essential", "Translation", "Localization"], + description: "Translation memory and localization files (TMX/XLIFF) with round-trip export support", + tags: ["Essential", "Translation", "Localization", "Round-trip"], }, { ...docxImporterPlugin, name: "Word Documents", description: "Microsoft Word files with round-trip export support", - tags: [...(docxImporterPlugin.tags || []), "Essential", "Documents", "Microsoft"], - }, - { - ...spreadsheetImporterPlugin, - name: "Spreadsheets", - description: "Excel and Google Sheets", - tags: [...(spreadsheetImporterPlugin.tags || []), "Essential", "Spreadsheet", "Excel"], - }, - { - ...pdfImporterPlugin, - name: "PDF Documents", - description: "Portable Document Format files with Bible text", - icon: FileText, - tags: ["Essential", "Documents", "PDF"], + tags: ["Essential", "Documents", "Microsoft", "Round-trip"], }, { ...indesignImporterPlugin, name: "InDesign Files", description: "Adobe InDesign IDML files with round-trip loss-free editing", - tags: [...(indesignImporterPlugin.tags || []), "Essential", "Documents", "Adobe", 
"Professional", "Bible"], + tags: ["Essential", "Documents", "Adobe", "Round-trip"], }, + // { + // ...pdfImporterPlugin, + // name: "PDF Documents", + // description: "Portable Document Format files with Bible text", + // icon: FileText, + // tags: ["Essential", "Documents", "PDF"], + // }, // Specialized Tools - Domain-specific importers for Bible translation // Non-beta importers first + // { + // ...usfmImporterPlugin, + // name: "USFM Files", + // description: "Unified Standard Format Marker files", + // tags: [...(usfmImporterPlugin.tags || []), "Specialized", "Bible", "USFM"], + // }, { - ...usfmImporterPlugin, - name: "USFM Files", - description: "Unified Standard Format Marker files", - tags: [...(usfmImporterPlugin.tags || []), "Specialized", "Bible", "USFM"], + ...usfmExperimentalImporterPlugin, + name: "USFM New", + description: "USFM files with round-trip export support (headers in chapter 1, verse-only target imports)", + tags: ["Specialized", "Bible", "USFM", "Round-trip"], }, { ...paratextImporterPlugin, name: "Paratext Projects", description: "Translation projects with settings", - tags: [...(paratextImporterPlugin.tags || []), "Specialized", "Bible", "Paratext"], + tags: ["Specialized", "Bible", "Paratext"], }, { ...ebibleDownloadImporterPlugin, name: "eBible Download", description: "Download directly from eBible.org", - tags: [...(ebibleDownloadImporterPlugin.tags || []), "Specialized", "Bible", "Download"], + tags: ["Specialized", "Bible", "Download"], }, { ...maculaBibleImporterPlugin, name: "Macula Bible", description: "Hebrew and Greek with annotations", - tags: [ - ...(maculaBibleImporterPlugin.tags || []), - "Specialized", - "Bible", - "Original Languages", - ], + tags: ["Specialized", "Bible", "Original Languages"], }, { ...obsImporterPlugin, name: "Bible Stories", description: "Open Bible Stories format with round-trip export support", - tags: [...(obsImporterPlugin.tags || []), "Specialized", "Bible", "Stories", "Round-trip"], + tags: ["Specialized", "Bible", "Stories", "Round-trip"], }, - // { - // ...biblicaSwapperImporterPlugin, - // name: "Biblica Bible Swapper", - // description: "Swap Bible text between two IDML files while preserving notes", - // tags: [...(biblicaSwapperImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], - // }, - - // Beta importers at the end of Specialized section { - ...usfmExperimentalImporterPlugin, - name: "USFM Experimental", - description: "USFM files with round-trip export support (headers in chapter 1, verse-only target imports)", - tags: [...(usfmExperimentalImporterPlugin.tags || []), "Specialized", "Bible", "USFM", "Experimental", "Round-trip"], + ...biblicaImporterPlugin, + name: "Biblica Study Notes", + description: "Biblica IDML importer with Study Bible notes", + tags: ["Specialized", "Bible", "Biblica", "Round-trip"], }, { - ...biblicaImporterPlugin, - name: "Biblica Files", - description: "Biblica IDML importer with Study Bible", - tags: [...(biblicaImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], + ...spreadsheetImporterPlugin, + name: "Bible Spreadsheet with Audio data", + description: "CSV and TSV files with audio URLs", + tags: ["Specialized", "Bible", "Spreadsheet", "CSV", "TSV", "Round-trip"], }, ]; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts index f4eb034c8..ffd18c202 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts +++ 
b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -221,7 +222,7 @@ export const parseFile = async ( name: baseName, cells, metadata: { - id: baseNameAsId, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'subtitles', @@ -257,6 +258,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx index d662daf3b..a145a983d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx @@ -13,5 +13,4 @@ export const subtitlesImporterPlugin: ImporterPlugin = { cellAligner: subtitlesCellAligner, supportedExtensions: ["vtt", "srt"], enabled: true, - tags: ["Media", "Timed"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts index e9131ac31..dbfd60114 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -323,7 +324,7 @@ export const parseFile = async ( name: file.name.replace(/\.(tmx|xliff|xlf)$/, ''), cells: cells, metadata: { - id: `translation-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, originalFileData: arrayBuffer, // Store original file for round-trip export @@ -353,7 +354,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `translation-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate the original file data in codex originalFileData: undefined, }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx index 19bc7add0..f59f6182e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx @@ -10,5 +10,4 @@ export const tmsImporterPlugin: ImporterPlugin = { component: TmxImporterForm, supportedExtensions: ["tmx", "xliff", "xlf"], enabled: true, - tags: ["Essential", "Translation", "Localization"], }; \ No newline at end of file diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts index 29102d096..3b3c7f89d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts @@ -4,6 +4,7 @@ * Standalone implementation - doesn't rely on common/usfmUtils.ts */ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -97,7 +98,7 @@ export const parseFile = async ( name: baseName, cells: parsedDocument.cells, metadata: { - id: `usfm-experimental-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, // Store original file data as ArrayBuffer for saving to attachments/originals @@ -143,7 +144,7 @@ 
export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `usfm-experimental-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate original file data in codex metadata originalFileData: undefined, }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts index 403427287..ffefa0e05 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts @@ -70,10 +70,13 @@ export interface PlaintextNotebookMetadata extends ProcessedNotebookMetadataBase } export interface SpreadsheetNotebookMetadata extends ProcessedNotebookMetadataBase { - importerType: "spreadsheet"; + importerType: "spreadsheet" | "spreadsheet-csv" | "spreadsheet-tsv"; delimiter?: string; columnCount?: number; rowCount?: number; + columnHeaders?: string[]; + sourceColumnIndex?: number; + originalFileContent?: string; } export interface SmartSegmenterNotebookMetadata extends ProcessedNotebookMetadataBase { @@ -243,6 +246,8 @@ export type ProcessedNotebookMetadataByImporter = { subtitles: SubtitlesNotebookMetadata; plaintext: PlaintextNotebookMetadata; spreadsheet: SpreadsheetNotebookMetadata; + "spreadsheet-csv": SpreadsheetNotebookMetadata; + "spreadsheet-tsv": SpreadsheetNotebookMetadata; "smart-segmenter": SmartSegmenterNotebookMetadata; audio: AudioNotebookMetadata; tms: TmsNotebookMetadata; diff --git a/webviews/codex-webviews/src/lib/types.ts b/webviews/codex-webviews/src/lib/types.ts index a6958c935..1fb1ae5d4 100644 --- a/webviews/codex-webviews/src/lib/types.ts +++ b/webviews/codex-webviews/src/lib/types.ts @@ -35,6 +35,14 @@ export interface CustomNotebookMetadata { corpusMarker: string; validationMigrationComplete?: boolean; fontSize?: number; + importerType?: string; + originalFileName?: string; + sourceFile?: string; + /** + * Timestamp added to non-biblical imports to ensure unique filenames. + * Format: "YYYYMMDD_HHmmss" (e.g., "20260127_143025") + */ + importTimestamp?: string; } export interface ProgressPercentages {