From 205864a2a914ae0e6274a1e50b95ab539232c90f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?= Date: Sat, 24 Jan 2026 10:32:29 +0100 Subject: [PATCH 1/6] Biblica importer labeling fix, Spreasheet Roundtrip and PDF experimental Fixed labeling issue on Biblica importer. Implemented the same rebuild round-trip logic for Spreadsheet importer and refined it. Tried new approach with PDF importer (still not great, but works a little bit better) --- src/exportHandler/exportHandler.ts | 501 ++++++++++++++++-- src/projectManager/projectExportView.ts | 4 +- .../NewSourceUploaderProvider.ts | 388 ++++++++++++++ src/utils/bookNameUtils.ts | 7 +- types/index.d.ts | 2 + .../src/CodexCellEditor/CellList.tsx | 15 + .../importers/docx/experiment/docxExporter.ts | 29 +- .../NewSourceUploader/importers/docx/index.ts | 3 + .../importers/pdf/cellMetadata.ts | 1 - .../NewSourceUploader/importers/pdf/index.ts | 364 +++++++------ .../spreadsheet/SpreadsheetImporterForm.tsx | 43 +- .../importers/spreadsheet/cellMetadata.ts | 13 +- .../types/processedNotebookMetadata.ts | 7 +- 13 files changed, 1144 insertions(+), 233 deletions(-) diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts index 8efb12259..a93f3aff4 100644 --- a/src/exportHandler/exportHandler.ts +++ b/src/exportHandler/exportHandler.ts @@ -516,7 +516,7 @@ async function exportCodexContentAsDocxRoundtrip( ); } -// PDF Round-trip export +// PDF Round-trip export: Uses DOCX exporter then converts DOCX→PDF async function exportCodexContentAsPdfRoundtrip( userSelectedPath: string, filesToExport: string[], @@ -540,10 +540,10 @@ async function exportCodexContentAsPdfRoundtrip( async (progress) => { const increment = filesToExport.length > 0 ? 100 / filesToExport.length : 100; - // Import PDF exporter - const { exportPdfWithTranslations } = await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/pdf/pdfExporter"); + // Import DOCX exporter (we'll use it to create DOCX, then convert to PDF) + const { exportDocxWithTranslations } = await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter"); - // For each selected codex file, find its original attachment and create a translated copy in export folder + // For each selected codex file, export as DOCX then convert to PDF for (const [index, filePath] of filesToExport.entries()) { progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment }); try { @@ -551,14 +551,14 @@ async function exportCodexContentAsPdfRoundtrip( const fileName = basename(file.fsPath); const bookCode = fileName.split(".")[0] || ""; - console.log(`[PDF Export] Processing ${fileName} using PDF exporter`); + console.log(`[PDF Export] Processing ${fileName} using DOCX exporter + docx2pdf`); // Read codex notebook const codexNotebook = await readCodexNotebookFromUri(file); // Check if this is a PDF file const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker || ''; - const isPdfFile = corpusMarker === 'pdf' || corpusMarker === 'pdf-importer' || corpusMarker === 'pdf-sentence'; + const isPdfFile = corpusMarker === 'pdf'; if (!isPdfFile) { console.warn(`[PDF Export] Skipping ${fileName} - not imported with PDF importer (corpusMarker: ${corpusMarker})`); vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with PDF importer`); @@ -567,30 +567,118 @@ async function exportCodexContentAsPdfRoundtrip( // Lookup original attachment by originalFileName metadata const originalFileName = 
(codexNotebook.metadata as any)?.originalFileName || `${bookCode}.pdf`; - const originalsDir = vscode.Uri.joinPath( + + // Check both preferred and legacy locations for converted DOCX + const originalsDirPreferred = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", + "files", "originals" ); - const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + const originalsDirLegacy = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "originals" + ); + + // Get converted DOCX filename from metadata or derive from PDF filename + const pdfMetadata = (codexNotebook.metadata as any)?.pdfDocumentMetadata; + const convertedDocxFileName = pdfMetadata?.convertedDocxFileName || originalFileName.replace(/\.pdf$/i, '.docx'); + + // Try preferred location first, then legacy + const convertedDocxUriPreferred = vscode.Uri.joinPath(originalsDirPreferred, convertedDocxFileName); + const convertedDocxUriLegacy = vscode.Uri.joinPath(originalsDirLegacy, convertedDocxFileName); + + let docxUri = convertedDocxUriPreferred; + + try { + // Try preferred location first + await vscode.workspace.fs.stat(convertedDocxUriPreferred); + } catch { + // Fall back to legacy location + try { + await vscode.workspace.fs.stat(convertedDocxUriLegacy); + docxUri = convertedDocxUriLegacy; + } catch { + // If no converted DOCX exists, we need to convert PDF→DOCX first + // This should have been done during import, but handle gracefully + console.warn(`[PDF Export] No converted DOCX found at ${convertedDocxUriPreferred.fsPath} or ${convertedDocxUriLegacy.fsPath}`); + throw new Error(`No converted DOCX file found. Please re-import the PDF file.`); + } + } + + // Read converted DOCX + const docxBytes = await vscode.workspace.fs.readFile(docxUri); + const docxData = docxBytes.buffer.slice(docxBytes.byteOffset, docxBytes.byteOffset + docxBytes.byteLength) as ArrayBuffer; + console.log(`[PDF Export] Using converted DOCX: ${docxUri.fsPath}`); + + progress.report({ message: `Exporting DOCX for ${fileName}...`, increment: increment * 0.5 }); + + // Debug: Check cell metadata structure + console.log(`[PDF Export] Codex notebook has ${codexNotebook.cells.length} cells`); + if (codexNotebook.cells.length > 0) { + const firstCell = codexNotebook.cells[0]; + const cellMeta = firstCell.metadata as any; + console.log(`[PDF Export] First cell metadata:`, JSON.stringify({ + hasValue: !!firstCell.value, + valueLength: firstCell.value?.length || 0, + valuePreview: firstCell.value?.substring(0, 100) || '', + paragraphIndex: cellMeta?.paragraphIndex, + paragraphId: cellMeta?.paragraphId, + hasData: !!cellMeta?.data, + dataKeys: cellMeta?.data ? 
Object.keys(cellMeta.data) : [] + }, null, 2)); + } + + // Step 1: Use DOCX exporter to create translated DOCX + const updatedDocxData = await exportDocxWithTranslations( + docxData, + codexNotebook.cells + ); + + // Step 2: Save translated DOCX to attachments/files/temporary folder + const temporaryDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + ".project", + "attachments", + "files", + "temporary" + ); + + // Ensure temporary directory exists + try { + await vscode.workspace.fs.createDirectory(temporaryDir); + } catch { + // Directory may already exist + } + + // Get original PDF filename from metadata or derive from codex filename + const originalPdfFileName = originalFileName || fileName.replace(/\.codex$/i, '.pdf'); + const translatedDocxFileName = originalPdfFileName.replace(/\.pdf$/i, '_translated.docx'); + const translatedDocxUri = vscode.Uri.joinPath(temporaryDir, translatedDocxFileName); + + await vscode.workspace.fs.writeFile( + translatedDocxUri, + new Uint8Array(updatedDocxData) + ); + console.log(`[PDF Export] Saved translated DOCX to: ${translatedDocxUri.fsPath}`); - // Load original PDF - const pdfData = await vscode.workspace.fs.readFile(originalFileUri); + progress.report({ message: `Converting DOCX to PDF for ${fileName}...`, increment: increment * 0.5 }); - // Use PDF exporter to create translated PDF - // Convert Uint8Array to proper ArrayBuffer for pdf-lib - const pdfBuffer = new Uint8Array(pdfData).buffer as ArrayBuffer; - const updatedPdfData = await exportPdfWithTranslations(pdfBuffer, codexNotebook.cells); + // Step 3: Convert DOCX → PDF using docx2pdf via extension host + const pdfData = await convertDocxToPdfViaExtension(translatedDocxUri.fsPath); - // Save updated PDF into the chosen export folder + // Step 4: Save translated PDF to user's selected destination const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); - const translatedName = originalFileName.replace(/\.pdf$/i, `_${timestamp}_translated.pdf`); - const translatedUri = vscode.Uri.joinPath(exportFolder, translatedName); + const translatedPdfName = originalFileName.replace(/\.pdf$/i, `_${timestamp}_translated.pdf`); + const translatedPdfUri = vscode.Uri.joinPath(exportFolder, translatedPdfName); - await vscode.workspace.fs.writeFile(translatedUri, new Uint8Array(updatedPdfData)); + await vscode.workspace.fs.writeFile(translatedPdfUri, new Uint8Array(pdfData)); + console.log(`[PDF Export] Saved translated PDF to: ${translatedPdfUri.fsPath}`); - console.log(`[PDF Export] ✓ Exported ${translatedName}`); + console.log(`[PDF Export] ✓ Exported ${translatedPdfName}`); } catch (error) { console.error(`[PDF Export] Error exporting ${filePath}:`, error); @@ -603,6 +691,127 @@ async function exportCodexContentAsPdfRoundtrip( ); } +/** + * Converts DOCX file to PDF using docx2pdf via Python script + */ +async function convertDocxToPdfViaExtension(docxPath: string): Promise { + try { + // Get extension path + const extension = vscode.extensions.getExtension('project-accelerate.codex-editor-extension'); + if (!extension) { + throw new Error('Could not find Codex Editor extension'); + } + const scriptPath = path.join(extension.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'docx_to_pdf.py'); + const tempDir = path.join(extension.extensionPath, '.temp'); + if (!fs.existsSync(tempDir)) { + fs.mkdirSync(tempDir, { recursive: true }); + } + const pdfPath = path.join(tempDir, `converted_${Date.now()}.pdf`); + + // Escape paths for shell + const 
escapedScriptPath = scriptPath.replace(/\\/g, '/'); + const escapedDocxPath = docxPath.replace(/\\/g, '/'); + const escapedPdfPath = pdfPath.replace(/\\/g, '/'); + + // Run Python script with file paths (no base64 in command line) + const pythonCmd = process.platform === 'win32' ? 'python' : 'python3'; + const command = `${pythonCmd} "${escapedScriptPath}" "${escapedDocxPath}" "${escapedPdfPath}"`; + + console.log(`[DOCX→PDF] Converting DOCX to PDF...`); + console.log(`[DOCX→PDF] Command: ${command}`); + console.log(`[DOCX→PDF] DOCX path: ${docxPath}`); + console.log(`[DOCX→PDF] PDF output path: ${pdfPath}`); + + // Verify DOCX file exists + if (!fs.existsSync(docxPath)) { + throw new Error(`DOCX file not found: ${docxPath}`); + } + + let stdout: string; + let stderr: string; + try { + const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 }); + stdout = result.stdout; + stderr = result.stderr; + } catch (execError: any) { + // execAsync throws an error if exit code is non-zero + stdout = execError.stdout || ''; + stderr = execError.stderr || ''; + console.error(`[DOCX→PDF] Python script execution failed: ${execError.message}`); + console.error(`[DOCX→PDF] Exit code: ${execError.code}`); + console.error(`[DOCX→PDF] Stdout: ${stdout}`); + console.error(`[DOCX→PDF] Stderr: ${stderr}`); + + // Try to parse error from stdout if it's JSON + if (stdout.trim()) { + try { + const errorResult = JSON.parse(stdout); + if (errorResult.error) { + throw new Error(`DOCX to PDF conversion failed: ${errorResult.error}`); + } + } catch { + // Not JSON, use the stderr/stdout as error message + } + } + + throw new Error(`DOCX to PDF conversion failed: ${stderr || stdout || execError.message}`); + } + + // Log stderr for debugging + if (stderr) { + console.log(`[DOCX→PDF] Python stderr: ${stderr}`); + } + + // Log stdout for debugging + console.log(`[DOCX→PDF] Python stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}`); + + if (!stdout.trim()) { + throw new Error('Python script returned no output'); + } + + let result; + try { + result = JSON.parse(stdout); + } catch (parseError) { + console.error(`[DOCX→PDF] Failed to parse Python output as JSON: ${parseError}`); + console.error(`[DOCX→PDF] Raw stdout: ${stdout}`); + throw new Error(`Failed to parse conversion result: ${parseError instanceof Error ? parseError.message : 'Unknown error'}. Output: ${stdout.substring(0, 200)}`); + } + + if (result.success) { + // Verify PDF file exists + if (!fs.existsSync(pdfPath)) { + throw new Error(`PDF file was not created at: ${pdfPath}`); + } + + // Read the generated PDF + const pdfData = fs.readFileSync(pdfPath); + console.log(`[DOCX→PDF] Read PDF file: ${pdfData.length} bytes`); + + // Clean up temp PDF file + try { + fs.unlinkSync(pdfPath); + } catch (e) { + console.warn(`[DOCX→PDF] Could not delete temp PDF: ${e}`); + } + + console.log(`[DOCX→PDF] ✓ Successfully converted DOCX to PDF`); + return pdfData.buffer.slice(pdfData.byteOffset, pdfData.byteOffset + pdfData.byteLength) as ArrayBuffer; + } else { + const errorMsg = result.error || 'DOCX to PDF conversion failed'; + console.error(`[DOCX→PDF] Conversion failed: ${errorMsg}`); + console.error(`[DOCX→PDF] Full result object:`, JSON.stringify(result, null, 2)); + throw new Error(errorMsg); + } + } catch (err) { + if (err instanceof Error && err.message.includes('DOCX to PDF conversion failed')) { + throw err; // Re-throw our custom errors + } + console.error(`[DOCX→PDF] Unexpected error: ${err}`); + throw err instanceof Error ? 
err : new Error(`Failed to convert DOCX to PDF: ${err}`); + } +} + /** * RTF Round-trip export using Pandoc * COMMENTED OUT - RTF importer disabled @@ -1065,6 +1274,171 @@ async function exportCodexContentAsUsfmRoundtrip( ); } +/** + * Spreadsheet (CSV/TSV) Round-trip export + * Exports codex notebooks back to CSV/TSV format with translations + */ +async function exportCodexContentAsSpreadsheetRoundtrip( + userSelectedPath: string, + filesToExport: string[], + _options?: ExportOptions +) { + const workspaceFolders = vscode.workspace.workspaceFolders; + if (!workspaceFolders) { + vscode.window.showErrorMessage("No workspace folder found."); + return; + } + + const exportFolder = vscode.Uri.file(userSelectedPath); + await vscode.workspace.fs.createDirectory(exportFolder); + + return vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: "Exporting Spreadsheet Round-trip", + cancellable: false, + }, + async (progress) => { + const increment = filesToExport.length > 0 ? 100 / filesToExport.length : 100; + + // Import spreadsheet exporter + const { exportSpreadsheetWithTranslations, getDelimiterFromMetadata, getSpreadsheetExtension } = + await import("../../webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter"); + + for (const [index, filePath] of filesToExport.entries()) { + progress.report({ message: `Processing ${index + 1}/${filesToExport.length}`, increment }); + try { + const file = vscode.Uri.file(filePath); + const fileName = basename(file.fsPath); + + console.log(`[Spreadsheet Export] Processing ${fileName}`); + + // Read codex notebook + const codexNotebook = await readCodexNotebookFromUri(file); + + // Check if this is a spreadsheet file + const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; + const importerType = (codexNotebook.metadata as any)?.importerType; + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || ''; + + // Check for any spreadsheet importer type + const isSpreadsheet = + importerType === 'spreadsheet' || + importerType === 'spreadsheet-csv' || + importerType === 'spreadsheet-tsv' || + corpusMarker === 'spreadsheet' || + corpusMarker === 'spreadsheet-csv' || + corpusMarker === 'spreadsheet-tsv'; + + if (!isSpreadsheet) { + console.warn(`[Spreadsheet Export] Skipping ${fileName} - not imported with spreadsheet importer (importerType: ${importerType}, corpusMarker: ${corpusMarker})`); + vscode.window.showWarningMessage(`Skipping ${fileName} - not imported with spreadsheet importer`); + continue; + } + + // Get importer type and delimiter + const notebookImporterType = (codexNotebook.metadata as any)?.importerType; + const delimiter = getDelimiterFromMetadata(codexNotebook.metadata); + const extension = getSpreadsheetExtension(originalFileName, delimiter, notebookImporterType); + const columnHeaders = (codexNotebook.metadata as any)?.columnHeaders; + const sourceColumnIndex = (codexNotebook.metadata as any)?.sourceColumnIndex; + + console.log(`[Spreadsheet Export] Processing ${fileName}`); + console.log(`[Spreadsheet Export] - importerType: ${notebookImporterType}`); + console.log(`[Spreadsheet Export] - originalFileName: ${originalFileName}`); + console.log(`[Spreadsheet Export] - extension: ${extension}`); + console.log(`[Spreadsheet Export] - sourceColumnIndex: ${sourceColumnIndex}`); + console.log(`[Spreadsheet Export] - columnHeaders: ${columnHeaders ? 
columnHeaders.join(', ') : 'none'}`); + + // Get original file content from metadata (stored during import) + let originalFileContent: string | undefined = (codexNotebook.metadata as any)?.originalFileContent; + + if (originalFileContent) { + console.log(`[Spreadsheet Export] ✓ Found originalFileContent in metadata (${originalFileContent.length} chars)`); + console.log(`[Spreadsheet Export] First 200 chars: ${originalFileContent.substring(0, 200)}`); + } else { + console.log(`[Spreadsheet Export] No originalFileContent in metadata, trying file system...`); + + // Fallback: try to read from attachments folder (for older imports) + const originalsDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + '.project', + 'attachments', + 'files', + 'originals' + ); + const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + + console.log(`[Spreadsheet Export] Looking for original file at: ${originalFileUri.fsPath}`); + + try { + const fileData = await vscode.workspace.fs.readFile(originalFileUri); + originalFileContent = Buffer.from(fileData).toString('utf-8'); + console.log(`[Spreadsheet Export] ✓ Loaded original file (${originalFileContent.length} chars)`); + } catch (err) { + console.warn(`[Spreadsheet Export] File not found at preferred location: ${err}`); + // Try legacy location + try { + const legacyDir = vscode.Uri.joinPath( + workspaceFolders[0].uri, + '.project', + 'attachments', + 'originals' + ); + const legacyUri = vscode.Uri.joinPath(legacyDir, originalFileName); + console.log(`[Spreadsheet Export] Trying legacy location: ${legacyUri.fsPath}`); + const fileData = await vscode.workspace.fs.readFile(legacyUri); + originalFileContent = Buffer.from(fileData).toString('utf-8'); + console.log(`[Spreadsheet Export] ✓ Loaded from legacy location (${originalFileContent.length} chars)`); + } catch (legacyErr) { + console.warn(`[Spreadsheet Export] ✗ Could not find original file anywhere. Will use fallback reconstruction.`); + } + } + } + + console.log(`[Spreadsheet Export] Metadata: importerType="${notebookImporterType}", delimiter="${delimiter === '\t' ? 'TAB' : delimiter}", sourceColumnIndex=${sourceColumnIndex}, hasOriginalContent=${!!originalFileContent}`); + + // Export with translations - true round-trip using original file content + const exportedContent = exportSpreadsheetWithTranslations( + codexNotebook.cells as any, + { + delimiter, + originalFileName, + originalFileContent, + columnHeaders, + sourceColumnIndex, + importerType: notebookImporterType, + } + ); + + // Generate output filename + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const baseName = originalFileName + ? originalFileName.replace(/\.(csv|tsv)$/i, '') + : fileName.replace(/\.codex$/i, ''); + const outputFileName = `${baseName}_translated_${timestamp}.${extension}`; + const outputUri = vscode.Uri.joinPath(exportFolder, outputFileName); + + // Write the file + await vscode.workspace.fs.writeFile( + outputUri, + Buffer.from(exportedContent, 'utf-8') + ); + + console.log(`[Spreadsheet Export] ✓ Exported ${outputFileName}`); + } catch (error) { + console.error(`[Spreadsheet Export] Error exporting ${filePath}:`, error); + vscode.window.showErrorMessage( + `Failed to export ${basename(filePath)}: ${error instanceof Error ? 
error.message : 'Unknown error'}` + ); + } + } + + vscode.window.showInformationMessage(`Spreadsheet round-trip export completed to ${userSelectedPath}`); + } + ); +} + /** * TMS (Translation Memory System) Round-trip export * Supports both TMX and XLIFF formats @@ -1228,12 +1602,12 @@ async function exportCodexContentAsRebuild( try { const file = vscode.Uri.file(filePath); const codexNotebook = await readCodexNotebookFromUri(file); - const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; - const importerType = (codexNotebook.metadata as any)?.importerType; - const fileType = (codexNotebook.metadata as any)?.fileType; - const originalFileName = (codexNotebook.metadata as any)?.originalFileName; + const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker ? String((codexNotebook.metadata as any).corpusMarker).trim() : ''; + const importerType = (codexNotebook.metadata as any)?.importerType ? String((codexNotebook.metadata as any).importerType).trim() : ''; + const fileType = (codexNotebook.metadata as any)?.fileType ? String((codexNotebook.metadata as any).fileType).trim() : ''; + const originalFileName = (codexNotebook.metadata as any)?.originalFileName ? String((codexNotebook.metadata as any).originalFileName).trim() : ''; - console.log(`[Rebuild Export] File: ${basename(filePath)}, corpusMarker: ${corpusMarker}, importerType: ${importerType}, fileType: ${fileType}`); + console.log(`[Rebuild Export] File: ${basename(filePath)}, corpusMarker: "${corpusMarker}", importerType: "${importerType}", fileType: "${fileType}"`); // Group by supported types if (corpusMarker === 'docx-roundtrip') { @@ -1243,7 +1617,7 @@ async function exportCodexContentAsRebuild( corpusMarker === 'biblica' || corpusMarker === 'biblica-idml' || corpusMarker === 'idml-roundtrip' || - corpusMarker.startsWith('idml-') || + (corpusMarker && corpusMarker.startsWith('idml-')) || importerType === 'biblica' || fileType === 'biblica' || importerType === 'biblica-experimental' || // Backward compatibility @@ -1253,22 +1627,18 @@ async function exportCodexContentAsRebuild( // Includes Biblica importer which uses the same IDML format filesByType['idml'] = filesByType['idml'] || []; filesByType['idml'].push(filePath); - // } else if ( - // corpusMarker === 'pdf' || - // corpusMarker === 'pdf-importer' || // Backward compatibility - // corpusMarker === 'pdf-sentence' // Backward compatibility - // ) { - // // PDF files use the PDF exporter - // filesByType['pdf'] = filesByType['pdf'] || []; - // filesByType['pdf'].push(filePath); - // } else if ( - // corpusMarker === 'rtf' || - // corpusMarker === 'rtf-pandoc' || // Backward compatibility - // importerType === 'rtf-pandoc' - // ) { - // // RTF files use the Pandoc RTF exporter - // filesByType['rtf'] = filesByType['rtf'] || []; - // filesByType['rtf'].push(filePath); + } else if ( + corpusMarker === 'pdf' || + corpusMarker === 'pdf-importer' || // Backward compatibility + corpusMarker === 'pdf-sentence' || // Backward compatibility + importerType === 'pdf' || + fileType === 'pdf' || + (originalFileName && /\.pdf$/i.test(originalFileName)) // Fallback: check filename extension + ) { + // PDF files use the PDF exporter (DOCX exporter + docx2pdf conversion) + console.log(`[Rebuild Export] ✓ Detected PDF file: ${basename(filePath)} (corpusMarker: "${corpusMarker}", importerType: "${importerType}", fileType: "${fileType}")`); + filesByType['pdf'] = filesByType['pdf'] || []; + filesByType['pdf'].push(filePath); } else if (corpusMarker === 'obs' || importerType === 
'obs') { // OBS (Open Bible Stories) markdown files use the OBS exporter // Fallback: also detect by importerType for older files @@ -1299,8 +1669,23 @@ async function exportCodexContentAsRebuild( // USFM files use the USFM round-trip exporter filesByType['usfm'] = filesByType['usfm'] || []; filesByType['usfm'].push(filePath); + } else if ( + corpusMarker === 'spreadsheet' || + corpusMarker === 'spreadsheet-csv' || + corpusMarker === 'spreadsheet-tsv' || + importerType === 'spreadsheet' || + importerType === 'spreadsheet-csv' || + importerType === 'spreadsheet-tsv' || + (originalFileName && /\.(csv|tsv)$/i.test(originalFileName)) + ) { + // Spreadsheet files (CSV/TSV) use the spreadsheet round-trip exporter + console.log(`[Rebuild Export] ✓ Detected Spreadsheet file: ${basename(filePath)} (corpusMarker: "${corpusMarker}", importerType: "${importerType}")`); + filesByType['spreadsheet'] = filesByType['spreadsheet'] || []; + filesByType['spreadsheet'].push(filePath); } else { - unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || 'unknown' }); + // Log what we detected for debugging + console.log(`[Rebuild Export] Unsupported file detected: ${basename(filePath)}, corpusMarker: ${corpusMarker}, importerType: ${importerType}, fileType: ${fileType}, originalFileName: ${originalFileName}`); + unsupportedFiles.push({ file: basename(filePath), marker: corpusMarker || importerType || fileType || 'unknown' }); } } catch (error) { console.error(`[Rebuild Export] Error analyzing ${filePath}:`, error); @@ -1351,23 +1736,21 @@ async function exportCodexContentAsRebuild( } } - // Export PDF files - // COMMENTED OUT - PDF exporter disabled (not working properly) - /* if (filesByType['pdf']?.length > 0) { - console.log(`[Rebuild Export] Exporting ${filesByType['pdf'].length} PDF file(s) to DOCX...`); + // Export PDF files (uses DOCX exporter + docx2pdf conversion) + if (filesByType['pdf']?.length > 0) { + console.log(`[Rebuild Export] Exporting ${filesByType['pdf'].length} PDF file(s)...`); progress.report({ - message: `Exporting ${filesByType['pdf'].length} PDF file(s) to DOCX...`, + message: `Exporting ${filesByType['pdf'].length} PDF file(s)...`, increment: 20 }); try { - const { exportPdfAsDocx } = await import("./pdfDocxExporter"); - await exportPdfAsDocx(userSelectedPath, filesByType['pdf']); + await exportCodexContentAsPdfRoundtrip(userSelectedPath, filesByType['pdf'], options); processedCount += filesByType['pdf'].length; } catch (error) { console.error('[Rebuild Export] PDF export failed:', error); vscode.window.showErrorMessage(`PDF export failed: ${error instanceof Error ? error.message : 'Unknown error'}`); } - } */ + } // Export RTF files using Pandoc // COMMENTED OUT - RTF importer disabled @@ -1434,6 +1817,22 @@ async function exportCodexContentAsRebuild( } } + // Export Spreadsheet (CSV/TSV) files + if (filesByType['spreadsheet']?.length > 0) { + console.log(`[Rebuild Export] Exporting ${filesByType['spreadsheet'].length} Spreadsheet file(s)...`); + progress.report({ + message: `Exporting ${filesByType['spreadsheet'].length} Spreadsheet file(s)...`, + increment: 20 + }); + try { + await exportCodexContentAsSpreadsheetRoundtrip(userSelectedPath, filesByType['spreadsheet'], options); + processedCount += filesByType['spreadsheet'].length; + } catch (error) { + console.error('[Rebuild Export] Spreadsheet export failed:', error); + vscode.window.showErrorMessage(`Spreadsheet export failed: ${error instanceof Error ? 
error.message : 'Unknown error'}`); + } + } + progress.report({ message: "Complete", increment: 30 }); // Show summary @@ -1457,7 +1856,7 @@ async function exportCodexContentAsRebuild( .join('\n'); vscode.window.showWarningMessage( - `The following files were skipped (unsupported or coming soon):\n${unsupportedList}\n\nSupported types: DOCX, IDML, Biblica, PDF`, + `The following files were skipped (unsupported or coming soon):\n${unsupportedList}\n\nSupported types: DOCX, IDML, Biblica, PDF, OBS, TMS, USFM, CSV/TSV`, { modal: false } ); } diff --git a/src/projectManager/projectExportView.ts b/src/projectManager/projectExportView.ts index 48c6b13da..85675a65e 100644 --- a/src/projectManager/projectExportView.ts +++ b/src/projectManager/projectExportView.ts @@ -334,14 +334,16 @@ function getWebviewContent(
Rebuild Export
-Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, OBS, TMS, USFM)
+Intelligently detects file type and exports back to original format (DOCX, IDML, Biblica, PDF, OBS, TMS, USFM, CSV/TSV)
 DOCX IDML Biblica
+PDF
 OBS TMS USFM
+CSV/TSV
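The PDF pipeline in this patch keeps all conversion work (PDF→DOCX on import, DOCX→PDF on export) in the extension host and pairs each webview request with its result message by requestId — the convertPdfToDocx / convertPdfToDocxResult handlers below follow exactly this shape. A minimal sketch of that correlation pattern from the webview side; the wrapper name requestFromExtension is illustrative and not part of the patch, while vscodeApi, the command/requestId message fields, and the 10-minute ceiling for large CMYK-heavy PDFs are taken from the importer code further down:

// Correlate one posted command with its "<command>Result" reply by requestId.
// Sketch only: requestFromExtension is a hypothetical helper distilled from
// convertPdfToDocxViaExtension in this patch.
function requestFromExtension<T>(
    command: string,
    payload: Record<string, unknown>,
    timeoutMs = 600_000 // importer allows 10 min for large PDFs with CMYK images
): Promise<T> {
    return new Promise((resolve, reject) => {
        const requestId = `${command}-${Date.now()}-${Math.random().toString(36).slice(2)}`;

        const onMessage = (event: MessageEvent) => {
            const data = (event && event.data) || {};
            if (data.command === `${command}Result` && data.requestId === requestId) {
                cleanup();
                if (data.success) {
                    resolve(data as T);
                } else {
                    reject(new Error(data.error || `${command} failed`));
                }
            }
        };

        const timer = setTimeout(() => {
            cleanup();
            reject(new Error(`${command} timed out after ${timeoutMs} ms`));
        }, timeoutMs);

        const cleanup = () => {
            clearTimeout(timer);
            window.removeEventListener('message', onMessage);
        };

        window.addEventListener('message', onMessage);
        (window as any).vscodeApi?.postMessage({ command, requestId, ...payload });
    });
}

// e.g. const res = await requestFromExtension<{ docxBase64?: string; docxFilePath?: string }>(
//     'convertPdfToDocx', { pdfBase64 });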
diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index b8e5a743f..b7caefe0e 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -262,6 +262,333 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide error: err instanceof Error ? err.message : 'Unknown error' }); } + } else if (message.command === "convertPdfToDocx") { + const { requestId, pdfBase64, outputPath } = message as { requestId: string; pdfBase64: string; outputPath?: string; }; + try { + const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'pdf_to_docx.py'); + + // Verify script exists + if (!fs.existsSync(scriptPath)) { + throw new Error(`Python script not found at: ${scriptPath}`); + } + + // Create temp directory + const tempDir = path.join(this.context.extensionPath, '.temp'); + if (!fs.existsSync(tempDir)) { + fs.mkdirSync(tempDir, { recursive: true }); + } + + // Write base64 PDF to temporary file to avoid command line length limits + const tempPdfPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.pdf`); + const pdfBuffer = Buffer.from(pdfBase64, 'base64'); + fs.writeFileSync(tempPdfPath, pdfBuffer); + + // Use temp file if outputPath not provided + const docxPath = outputPath || path.join(tempDir, `converted_${Date.now()}.docx`); + + // Verify PDF file was written + if (!fs.existsSync(tempPdfPath)) { + throw new Error(`Failed to write PDF file to: ${tempPdfPath}`); + } + + // Run Python script with file paths + // On Windows, use proper quoting; on Unix, paths should work as-is + const pythonCmd = process.platform === 'win32' ? 'python' : 'python3'; + + // Quote paths properly for Windows (use double quotes and escape inner quotes) + const quotePath = (p: string) => { + if (process.platform === 'win32') { + // Windows: use double quotes and escape any existing quotes + return `"${p.replace(/"/g, '\\"')}"`; + } else { + // Unix: use single quotes and escape any existing quotes + return `'${p.replace(/'/g, "\\'")}'`; + } + }; + + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempPdfPath)} ${quotePath(docxPath)}`; + + console.log(`[PDF→DOCX] Converting PDF to DOCX...`); + console.log(`[PDF→DOCX] Command: ${command}`); + + let stdout = ''; + let stderr = ''; + try { + const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 }); + stdout = result.stdout || ''; + stderr = result.stderr || ''; + } catch (execErr: any) { + // execAsync throws an error when command fails, but stdout/stderr are in the error object + stdout = execErr.stdout || ''; + stderr = execErr.stderr || ''; + const errorMessage = execErr.message || 'Unknown error'; + + // If we have stdout that might be JSON, try to parse it + if (stdout.trim()) { + try { + const result = JSON.parse(stdout); + if (result.error) { + throw new Error(`Python script error: ${result.error}`); + } + } catch (parseErr) { + // Not JSON, use the exec error + } + } + + // Include both stdout and stderr in error message + const fullError = [ + errorMessage, + stdout ? `\nStdout: ${stdout}` : '', + stderr ? 
`\nStderr: ${stderr}` : '' + ].filter(Boolean).join(''); + + throw new Error(fullError); + } + + // Clean up temp PDF file + try { + if (fs.existsSync(tempPdfPath)) { + fs.unlinkSync(tempPdfPath); + } + } catch (cleanupErr) { + console.warn(`[PDF→DOCX] Could not delete temp PDF: ${cleanupErr}`); + } + + // Log progress messages from stderr (Python script sends progress updates there) + if (stderr) { + try { + // Try to parse JSON progress messages + const stderrLines = stderr.split('\n').filter(line => line.trim()); + for (const line of stderrLines) { + try { + const progressMsg = JSON.parse(line); + if (progressMsg.info) { + console.log(`[PDF→DOCX] ${progressMsg.info}`); + } + } catch { + // Not JSON, log as-is if it's not a success message + if (line.trim() && !line.includes('"success":true')) { + console.log(`[PDF→DOCX] ${line}`); + } + } + } + } catch { + // If parsing fails, just log the stderr + if (!stdout.includes('"success":true')) { + console.warn(`[PDF→DOCX] Python stderr: ${stderr}`); + } + } + } + + // Parse JSON result + let result; + try { + result = JSON.parse(stdout); + } catch (parseErr) { + throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. Stderr: ${stderr}`); + } + + if (result.success) { + console.log(`[PDF→DOCX] ✓ Successfully converted PDF to DOCX`); + + // Verify the DOCX file exists and has content + if (!fs.existsSync(docxPath)) { + throw new Error(`DOCX file not found at: ${docxPath}`); + } + + const fileStats = fs.statSync(docxPath); + if (fileStats.size === 0) { + throw new Error(`DOCX file is empty at: ${docxPath}`); + } + + console.log(`[PDF→DOCX] Reading DOCX file (${fileStats.size} bytes)...`); + + // For large files (>50MB), save directly to workspace and send file path instead of base64 + // This avoids memory issues and webview message size limits + const LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50MB + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; + + if (fileStats.size > LARGE_FILE_THRESHOLD && workspaceFolder) { + console.log(`[PDF→DOCX] Large file detected (${fileStats.size} bytes), saving to workspace instead of sending via message...`); + + // Save DOCX to temporary location in workspace + const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); + await vscode.workspace.fs.createDirectory(tempDir); + + const tempDocxUri = vscode.Uri.joinPath(tempDir, `pdf_conversion_${requestId}.docx`); + const docxBuffer = fs.readFileSync(docxPath); + await vscode.workspace.fs.writeFile(tempDocxUri, new Uint8Array(docxBuffer)); + + console.log(`[PDF→DOCX] Saved large DOCX to workspace: ${tempDocxUri.fsPath}`); + + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: true, + docxFilePath: tempDocxUri.fsPath, // Send file path instead of base64 + outputPath: docxPath, + isLargeFile: true + }); + } else { + // For smaller files, send base64 as before + const docxBuffer = fs.readFileSync(docxPath); + const docxBase64 = docxBuffer.toString('base64'); + + // Verify base64 encoding is valid + if (!docxBase64 || docxBase64.length === 0) { + throw new Error('Failed to encode DOCX file to base64'); + } + + console.log(`[PDF→DOCX] Sending DOCX data to webview (${docxBase64.length} base64 chars)...`); + + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: true, + docxBase64: docxBase64, + outputPath: docxPath, + isLargeFile: false + }); + } + } else { + throw new 
Error(result.error || 'Conversion failed'); + } + } catch (err) { + const errorMessage = err instanceof Error ? err.message : 'Unknown error'; + console.error('[NEW SOURCE UPLOADER] PDF→DOCX conversion failed:', err); + webviewPanel.webview.postMessage({ + command: 'convertPdfToDocxResult', + requestId, + success: false, + error: errorMessage + }); + } + } else if (message.command === "convertDocxToPdf") { + const { requestId, docxBase64, outputPath } = message as { requestId: string; docxBase64: string; outputPath?: string; }; + try { + const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'docx_to_pdf.py'); + + // Verify script exists + if (!fs.existsSync(scriptPath)) { + throw new Error(`Python script not found at: ${scriptPath}`); + } + + // Create temp directory + const tempDir = path.join(this.context.extensionPath, '.temp'); + if (!fs.existsSync(tempDir)) { + fs.mkdirSync(tempDir, { recursive: true }); + } + + // Write base64 DOCX to temporary file to avoid command line length limits + const tempDocxPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.docx`); + const docxBuffer = Buffer.from(docxBase64, 'base64'); + fs.writeFileSync(tempDocxPath, docxBuffer); + + // Use temp file if outputPath not provided + const pdfPath = outputPath || path.join(tempDir, `converted_${Date.now()}.pdf`); + + // Verify DOCX file was written + if (!fs.existsSync(tempDocxPath)) { + throw new Error(`Failed to write DOCX file to: ${tempDocxPath}`); + } + + // Run Python script with file paths + // On Windows, use proper quoting; on Unix, paths should work as-is + const pythonCmd = process.platform === 'win32' ? 'python' : 'python3'; + + // Quote paths properly for Windows (use double quotes and escape inner quotes) + const quotePath = (p: string) => { + if (process.platform === 'win32') { + // Windows: use double quotes and escape any existing quotes + return `"${p.replace(/"/g, '\\"')}"`; + } else { + // Unix: use single quotes and escape any existing quotes + return `'${p.replace(/'/g, "\\'")}'`; + } + }; + + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempDocxPath)} ${quotePath(pdfPath)}`; + + console.log(`[DOCX→PDF] Converting DOCX to PDF...`); + console.log(`[DOCX→PDF] Command: ${command}`); + + let stdout = ''; + let stderr = ''; + try { + const result = await execAsync(command, { maxBuffer: 50 * 1024 * 1024 }); + stdout = result.stdout || ''; + stderr = result.stderr || ''; + } catch (execErr: any) { + // execAsync throws an error when command fails, but stdout/stderr are in the error object + stdout = execErr.stdout || ''; + stderr = execErr.stderr || ''; + const errorMessage = execErr.message || 'Unknown error'; + + // If we have stdout that might be JSON, try to parse it + if (stdout.trim()) { + try { + const result = JSON.parse(stdout); + if (result.error) { + throw new Error(`Python script error: ${result.error}`); + } + } catch (parseErr) { + // Not JSON, use the exec error + } + } + + // Include both stdout and stderr in error message + const fullError = [ + errorMessage, + stdout ? `\nStdout: ${stdout}` : '', + stderr ? 
`\nStderr: ${stderr}` : '' + ].filter(Boolean).join(''); + + throw new Error(fullError); + } + + // Clean up temp DOCX file + try { + if (fs.existsSync(tempDocxPath)) { + fs.unlinkSync(tempDocxPath); + } + } catch (cleanupErr) { + console.warn(`[DOCX→PDF] Could not delete temp DOCX: ${cleanupErr}`); + } + + if (stderr && !stdout.includes('"success":true')) { + console.warn(`[DOCX→PDF] Python stderr: ${stderr}`); + } + + // Parse JSON result + let result; + try { + result = JSON.parse(stdout); + } catch (parseErr) { + throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. Stderr: ${stderr}`); + } + + if (result.success) { + console.log(`[DOCX→PDF] ✓ Successfully converted DOCX to PDF`); + webviewPanel.webview.postMessage({ + command: 'convertDocxToPdfResult', + requestId, + success: true, + pdfBase64: result.pdfBase64, + outputPath: pdfPath + }); + } else { + throw new Error(result.error || 'Conversion failed'); + } + } catch (err) { + const errorMessage = err instanceof Error ? err.message : 'Unknown error'; + console.error('[NEW SOURCE UPLOADER] DOCX→PDF conversion failed:', err); + webviewPanel.webview.postMessage({ + command: 'convertDocxToPdfResult', + requestId, + success: false, + error: errorMessage + }); + } } else if (message.command === "fetchTargetFile") { // Fetch target file content for translation imports const { sourceFilePath } = message; @@ -455,6 +782,19 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide ...(processedNotebook.metadata?.importerType && { importerType: processedNotebook.metadata.importerType }), + // Spreadsheet-specific metadata for round-trip export + ...(processedNotebook.metadata?.originalFileContent && { + originalFileContent: processedNotebook.metadata.originalFileContent + }), + ...(processedNotebook.metadata?.columnHeaders && { + columnHeaders: processedNotebook.metadata.columnHeaders + }), + ...(processedNotebook.metadata?.sourceColumnIndex !== undefined && { + sourceColumnIndex: processedNotebook.metadata.sourceColumnIndex + }), + ...(processedNotebook.metadata?.delimiter && { + delimiter: processedNotebook.metadata.delimiter + }), }; return { @@ -530,6 +870,54 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide // The original template is stored in `.project/attachments/originals/`. delete pair.source.metadata.originalFileData; } + + // For PDF imports: Also save the converted DOCX file for round-trip export + const pdfMetadata = (pair.source.metadata as any)?.pdfDocumentMetadata; + if (pdfMetadata?.convertedDocxFileName) { + const originalsDir = vscode.Uri.joinPath( + workspaceFolder.uri, + '.project', + 'attachments', + 'files', + 'originals' + ); + await vscode.workspace.fs.createDirectory(originalsDir); + + const convertedDocxUri = vscode.Uri.joinPath(originalsDir, pdfMetadata.convertedDocxFileName); + + // If convertedDocxData is present (small files), save it directly + // If isLargeFile flag is set, the file should already be saved in temp location + if (pdfMetadata.convertedDocxData) { + const docxData = pdfMetadata.convertedDocxData; + // Convert ArrayBuffer to Uint8Array if needed + const docxBuffer = docxData instanceof ArrayBuffer + ? 
new Uint8Array(docxData) + : Buffer.from(docxData); + await vscode.workspace.fs.writeFile(convertedDocxUri, docxBuffer); + console.log(`[PDF Importer] Saved converted DOCX file: ${pdfMetadata.convertedDocxFileName}`); + // Remove from metadata to avoid persisting in JSON + delete pdfMetadata.convertedDocxData; + } else if (pdfMetadata.isLargeFile) { + // For large files, check if temp file exists and copy it + const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); + const tempDocxUri = vscode.Uri.joinPath(tempDir, `pdf_conversion_*.docx`); + // Note: We'd need the actual requestId to find the temp file + // For now, try to find any matching temp file + try { + const tempFiles = await vscode.workspace.fs.readDirectory(tempDir); + const matchingFile = tempFiles.find(([name]) => name.startsWith('pdf_conversion_') && name.endsWith('.docx')); + if (matchingFile) { + const tempFileUri = vscode.Uri.joinPath(tempDir, matchingFile[0]); + const tempData = await vscode.workspace.fs.readFile(tempFileUri); + await vscode.workspace.fs.writeFile(convertedDocxUri, tempData); + await vscode.workspace.fs.delete(tempFileUri); // Clean up temp file + console.log(`[PDF Importer] Saved large converted DOCX file: ${pdfMetadata.convertedDocxFileName}`); + } + } catch (err) { + console.warn(`[PDF Importer] Could not find/copy temp DOCX file: ${err}`); + } + } + } } } diff --git a/src/utils/bookNameUtils.ts b/src/utils/bookNameUtils.ts index 53ee44ab8..a98e2036f 100644 --- a/src/utils/bookNameUtils.ts +++ b/src/utils/bookNameUtils.ts @@ -127,8 +127,11 @@ export function isBiblicalImporterType(importerType: string | undefined): boolea 'macula', 'biblica', 'obs', - 'pdf', // PDF can contain Bible content - 'indesign', // InDesign can contain Bible content + // Note: 'pdf', 'docx', and 'indesign' are NOT included here + // because they are generic document formats that should preserve + // their original filenames rather than being converted to Bible book codes. + // If specific biblical content detection is needed, it should be done + // at the importer level with explicit flags. 
]; return bibleTypeImporters.includes(normalizedType); } diff --git a/types/index.d.ts b/types/index.d.ts index 95275f088..bc85cb64f 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -1000,6 +1000,8 @@ type FileImporterType = | "markdown" | "subtitles" | "spreadsheet" + | "spreadsheet-csv" + | "spreadsheet-tsv" | "tms" | "pdf" | "indesign" diff --git a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx index c19246a7a..28b99f82e 100644 --- a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx +++ b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx @@ -598,6 +598,21 @@ const CellList: React.FC = ({ } } + // For cells with cellLabel but no verse-level global references (e.g., Biblica importer cells before verses), + // use the cellLabel instead of chapter-based verse number + // Biblica cells before verses have globalReferences like ["GEN"] (book only), while cells with verses have ["GEN 1:34"] (book chapter:verse) + const globalRefs = cell.data?.globalReferences; + const hasGlobalRefs = globalRefs && Array.isArray(globalRefs) && globalRefs.length > 0; + const hasVerseLevelRefs = hasGlobalRefs && globalRefs.some((ref: string) => { + // Check if reference contains chapter:verse format (e.g., "GEN 1:34" or "GEN 1:1") + return typeof ref === 'string' && /\d+:\d+/.test(ref); + }); + + // If cell has a label but no verse-level references, use the label (for Biblica cells before verses) + if (cell.cellLabel && !hasVerseLevelRefs) { + return cell.cellLabel; + } + // Get chapter-based verse number (skipping paratext cells) return getChapterBasedVerseNumber(cell, fullDocumentTranslationUnits).toString(); }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter.ts index 999126963..11cd851c5 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/docxExporter.ts @@ -109,26 +109,34 @@ function collectTranslations( console.log(`[Exporter] Processing ${codexCells.length} cells for translations`); + // Debug: Log first few cells to understand structure + let emptyCount = 0; + let nonEmptyCount = 0; + for (let i = 0; i < codexCells.length; i++) { const cell = codexCells[i]; const meta = cell.metadata; - // Only DOCX cells have paragraphIndex/paragraphId; everything else is skipped naturally. - // (Don't rely on kind/type here; it varies by host and we only need the mapping fields.) - // Get translated content (strip HTML tags) - const translated = removeHtmlTags(cell.value).trim(); + const translated = removeHtmlTags(cell.value || '').trim(); if (!translated) { + emptyCount++; + // Debug first 3 empty cells + if (emptyCount <= 3) { + console.log(`[Exporter] Cell ${i} is empty. Raw value: "${(cell.value || '').substring(0, 50)}"`); + } continue; } + nonEmptyCount++; - // Get paragraph identifier + // Get paragraph identifier - check multiple locations for compatibility + // Priority: paragraphIndex > paragraphId > data.segmentIndex > cell index const paragraphId = meta?.paragraphId; const paragraphIndex = meta?.paragraphIndex; + const segmentIndex = meta?.data?.segmentIndex; if (typeof paragraphIndex === 'number') { translations.set(paragraphIndex, translated); - // Keep logs light; large documents can have thousands of cells. 
} else if (typeof paragraphId === 'string') { const m = paragraphId.match(/^p-(\d+)$/); if (m) { @@ -137,11 +145,16 @@ function collectTranslations( } else { console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`); } + } else if (typeof segmentIndex === 'number') { + // Fallback for older files that only have segmentIndex in data + translations.set(segmentIndex, translated); + } else { + // Last resort: use cell index (works if cells match paragraph order) + translations.set(i, translated); } } - console.log(`[Exporter] Collected ${translations.size} translations total`); - // Avoid dumping thousands of IDs in logs. + console.log(`[Exporter] Collected ${translations.size} translations total (${nonEmptyCount} non-empty cells, ${emptyCount} empty cells)`); return translations; } diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 9ac364734..6a64686bc 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -276,7 +276,10 @@ export const parseFile = async ( segmentToIdMap.set(segment, cellId); // Create cell with enhanced metadata including structure data + // Include paragraphIndex and paragraphId for DOCX exporter compatibility const cell = createProcessedCell(cellId, segment, { + paragraphIndex: index, + paragraphId: `p-${index}`, data: { originalOffset: { start: segmentStart, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts index 3c488dd65..77c9acb0b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/cellMetadata.ts @@ -71,7 +71,6 @@ export function createPdfCellMetadata(params: PdfCellMetadataParams): { metadata // Import metadata importTimestamp: new Date().toISOString(), - corpusMarker: 'pdf', importerVersion: '1.0.0', }, } diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts index cb3d4636a..0b04b8a7e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts @@ -404,195 +404,237 @@ export const validateFile = async (file: File): Promise => }; /** - * Parses a PDF file for non-Bible text content + * Converts PDF to DOCX via extension host */ -export const parseFile = async ( - file: File, - onProgress?: ProgressCallback -): Promise => { - try { - onProgress?.(createProgress('Reading File', 'Reading PDF file...', 10)); - - // Read file as ArrayBuffer to store original for round-trip export - const arrayBuffer = await file.arrayBuffer(); - - onProgress?.(createProgress('Extracting Text', 'Extracting text from PDF...', 30)); - - const textContent = await extractTextViaExtension(file); - - onProgress?.(createProgress('Processing Content', 'Processing extracted text...', 50)); - - // Split content by paragraphs (double newlines) and HTML breaks - // PDFs preserve paragraph breaks which represent natural text units - // Falls back to sentence splitting only if no paragraph breaks are found - const segments = splitPdfContentIntoSegments(textContent); - - // Validate that we have segments - if (!segments || segments.length === 0) { - throw new 
Error('No content segments found in PDF. The PDF may be empty or contain only images.'); - } - - // Log for debugging - console.log(`[PDF Importer] Split PDF into ${segments.length} segments`); - - onProgress?.(createProgress('Creating Cells', 'Creating cells from text segments...', 70)); - - // Filter out empty segments and create cells - const validSegments = segments.filter(segment => segment && segment.trim().length > 0); +async function convertPdfToDocxViaExtension(file: File): Promise { + return new Promise((resolve, reject) => { + try { + const requestId = `pdf-to-docx-${Date.now()}-${Math.random().toString(36).slice(2)}`; - if (validSegments.length === 0) { - throw new Error('No valid content segments found in PDF after filtering.'); - } + const cleanup = () => window.removeEventListener('message', onMessage as any); - // Create cells for each segment - const cells = await Promise.all( - validSegments.map(async (segment, index) => { - // Ensure we have valid content - const cleanText = segment - .replace(/[\r\n]+/g, ' ') - .replace(/\s+/g, ' ') - .trim(); - - if (!cleanText || cleanText.length === 0) { - console.warn(`[PDF Importer] Skipping empty segment at index ${index}`); - return null; + const onMessage = (event: MessageEvent) => { + const data = (event && event.data) || {}; + if (data && data.command === 'convertPdfToDocxResult' && data.requestId === requestId) { + cleanup(); + if (data.success) { + try { + // For large files, the extension host saves the file and sends the path + // For smaller files, it sends base64 data + if (data.isLargeFile && data.docxFilePath) { + // Request the file from extension host using file path + (window as any).vscodeApi?.postMessage({ + command: 'readFileFromPath', + requestId: `read-docx-${requestId}`, + filePath: data.docxFilePath + }); + + // Set up listener for file data + const fileReaderCleanup = () => window.removeEventListener('message', fileReaderHandler as any); + const fileReaderHandler = (fileEvent: MessageEvent) => { + const fileData = (fileEvent && fileEvent.data) || {}; + if (fileData.command === 'readFileFromPathResult' && fileData.requestId === `read-docx-${requestId}`) { + fileReaderCleanup(); + if (fileData.success && fileData.fileData) { + // Convert base64 to File object + const base64 = fileData.fileData; + const binaryString = atob(base64); + const bytes = new Uint8Array(binaryString.length); + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + const docxFileName = file.name.replace(/\.pdf$/i, '.docx'); + const docxFile = new File([bytes], docxFileName, { + type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + lastModified: file.lastModified + }); + + resolve(docxFile); + } else { + reject(new Error(fileData.error || 'Failed to read DOCX file from path')); + } + } + }; + + window.addEventListener('message', fileReaderHandler as any); + + // Timeout for file read + setTimeout(() => { + fileReaderCleanup(); + reject(new Error('Timeout reading DOCX file from workspace')); + }, 60000); + } else { + // Standard base64 path for smaller files + const base64 = data.docxBase64; + + if (!base64 || typeof base64 !== 'string') { + throw new Error('Invalid base64 data received from conversion'); + } + + // Validate base64 string (basic check) + if (!/^[A-Za-z0-9+/]*={0,2}$/.test(base64.replace(/\s/g, ''))) { + throw new Error('Invalid base64 encoding format'); + } + + const binaryString = atob(base64); + const bytes = new Uint8Array(binaryString.length); + 
for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + + // Create File object with .docx extension + const docxFileName = file.name.replace(/\.pdf$/i, '.docx'); + const docxFile = new File([bytes], docxFileName, { + type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + lastModified: file.lastModified + }); + + resolve(docxFile); + } + } catch (decodeError) { + reject(new Error(`Failed to decode DOCX file: ${decodeError instanceof Error ? decodeError.message : 'Unknown error'}`)); + } + } else { + reject(new Error(data.error || 'Failed to convert PDF to DOCX')); + } } + }; - // Create cell metadata (generates UUID internally) - const { cellId, metadata: cellMetadata } = createPdfCellMetadata({ - originalContent: segment, - cellLabel: (index + 1).toString(), - segmentIndex: index, - fileName: file.name, - fileSize: file.size, - }); + window.addEventListener('message', onMessage as any); - // Get cleaned text from metadata - const cleanedText = cellMetadata.originalText || cleanText; + // Read PDF as base64 + const reader = new FileReader(); + reader.onerror = () => { + cleanup(); + reject(new Error('Failed to read PDF file')); + }; + reader.onload = () => { + const dataUrl = (reader.result as string) || ''; + const base64 = dataUrl.includes(',') ? dataUrl.split(',')[1] : dataUrl; + + (window as any).vscodeApi?.postMessage({ + command: 'convertPdfToDocx', + requestId, + pdfBase64: base64, + }); + }; + + setTimeout(() => reader.readAsDataURL(file), 0); - // Create HTML content with paragraph semantics - const htmlContent = `
-    ${escapeHtml(cleanedText)}
`; + // Safety timeout - increased for large PDFs with CMYK conversion + setTimeout(() => { + cleanup(); + reject(new Error('PDF to DOCX conversion timed out after 10 minutes. Large PDFs with CMYK images may take longer. Please try again or use a smaller file.')); + }, 600000); // 10 minutes timeout for large files with CMYK conversion + } catch (err) { + reject(err instanceof Error ? err : new Error('Failed to request PDF to DOCX conversion')); + } + }); +} - const cell = createProcessedCell(cellId, htmlContent, { - type: 'text', - ...cellMetadata, - } as any); +/** + * Parses a PDF file by converting it to DOCX first, then using DOCX importer + * This approach provides better layout preservation and round-trip fidelity + */ +export const parseFile = async ( + file: File, + onProgress?: ProgressCallback +): Promise => { + try { + onProgress?.(createProgress('Converting PDF', 'Converting PDF to DOCX format...', 10)); - // Extract and process images from this cell (if any) - const images = await extractImagesFromHtml(htmlContent); - cell.images = images; + // Step 1: Convert PDF to DOCX using pdf2docx + const docxFile = await convertPdfToDocxViaExtension(file); - return cell; - }) - ); + onProgress?.(createProgress('Importing DOCX', 'Importing converted DOCX file...', 30)); - // Filter out any null cells (from empty segments) - const validCells = cells.filter((cell): cell is NonNullable => cell !== null); + // Step 2: Import the DOCX file using DOCX importer + const { parseFile: parseDocxFile } = await import('../docx/index'); + const docxResult = await parseDocxFile(docxFile, (progress) => { + // Map DOCX import progress (30-90%) to overall progress (30-90%) + const mappedProgress = 30 + (progress.progress || 0) * 0.6; + onProgress?.(createProgress(progress.stage || 'Importing DOCX', progress.message || '', mappedProgress)); + }); - if (validCells.length === 0) { - throw new Error('No valid cells created from PDF content. 
All segments were empty.'); + if (!docxResult.success || !docxResult.notebookPair) { + throw new Error('DOCX import failed after PDF conversion'); } - onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 90)); + // Step 3: Override corpusMarker to "pdf" while keeping all DOCX structure + const sourceNotebook = docxResult.notebookPair.source; + const codexNotebook = docxResult.notebookPair.codex; + + // For large files, don't store ArrayBuffers in metadata to avoid memory issues + // Instead, we'll save them during the write process + // Only store ArrayBuffers for smaller files (< 50MB) + const LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50MB + const shouldStoreBuffers = file.size < LARGE_FILE_THRESHOLD && docxFile.size < LARGE_FILE_THRESHOLD; + + let originalPdfArrayBuffer: ArrayBuffer | undefined; + let convertedDocxArrayBuffer: ArrayBuffer | undefined; + + if (shouldStoreBuffers) { + originalPdfArrayBuffer = await file.arrayBuffer(); + convertedDocxArrayBuffer = await docxFile.arrayBuffer(); + } - // Create source notebook - const sourceNotebook = { - name: sanitizeFileName(file.name), - cells: validCells, - metadata: { - id: `pdf-${Date.now()}`, + // Override metadata to indicate PDF origin + sourceNotebook.metadata = { + ...sourceNotebook.metadata, + corpusMarker: 'pdf', + importerType: 'pdf', + originalFileName: file.name, // Keep original PDF filename + originalFileData: originalPdfArrayBuffer, // Store original PDF only if small (will be saved to attachments/originals) + fileType: 'pdf', + importContext: { + ...sourceNotebook.metadata.importContext, + importerType: 'pdf', + fileName: file.name, originalFileName: file.name, - originalFileData: arrayBuffer, // Store original PDF for round-trip export - corpusMarker: 'pdf', - importerType: 'pdf', // Alias for corpusMarker (type requirement) - createdAt: new Date().toISOString(), - importContext: { - importerType: 'pdf', - fileName: file.name, - originalFileName: file.name, - fileSize: file.size, - importTimestamp: new Date().toISOString(), - }, - sourceFile: file.name, - totalCells: cells.length, - fileType: 'pdf', - importDate: new Date().toISOString(), - - // Segmentation info - segmentationType: 'sentences', - - // Round-trip metadata - pdfDocumentMetadata: { - originalFileName: file.name, - fileSize: file.size, - totalSentences: cells.length, - importerVersion: '1.0.0', - - // Placeholder for future PDF metadata enhancements - totalPages: undefined, // Will be populated when available - pdfVersion: undefined, - author: undefined, - title: undefined, - creationDate: undefined, - }, - } + fileSize: file.size, + }, + // Preserve DOCX metadata but mark as PDF + pdfDocumentMetadata: { + originalFileName: file.name, + fileSize: file.size, + convertedFromPdf: true, + convertedDocxFileName: docxFile.name, + // Store converted DOCX data for export only if small (will be saved separately) + convertedDocxData: convertedDocxArrayBuffer, + isLargeFile: !shouldStoreBuffers, // Flag to indicate files need to be saved from temp location + }, }; - // Create codex notebook (empty for translation) - const codexNotebook = { - name: `${sanitizeFileName(file.name)}`, - cells: validCells.map(sourceCell => - createProcessedCell(sourceCell.id, '', { - ...sourceCell.metadata, - originalContent: sourceCell.content - }) - ), - metadata: { - id: `pdf-codex-${Date.now()}`, + codexNotebook.metadata = { + ...codexNotebook.metadata, + corpusMarker: 'pdf', + importerType: 'pdf', + originalFileName: file.name, + fileType: 
'pdf', + importContext: { + ...codexNotebook.metadata.importContext, + importerType: 'pdf', + fileName: file.name, originalFileName: file.name, - // Don't duplicate the original file data in codex - originalFileData: undefined, - corpusMarker: 'pdf', - importerType: 'pdf', // Alias for corpusMarker (type requirement) - createdAt: new Date().toISOString(), - importContext: { - importerType: 'pdf', - fileName: file.name, - originalFileName: file.name, - fileSize: file.size, - importTimestamp: new Date().toISOString(), - }, - sourceFile: file.name, - totalCells: cells.length, - fileType: 'pdf', - importDate: new Date().toISOString(), - isCodex: true, - - // Segmentation info - segmentationType: 'sentences', - - // Link to source metadata for round-trip - sourceMetadata: sourceNotebook.metadata, - } + fileSize: file.size, + }, }; - // Add milestone cells to the notebook pair - const notebookPairWithMilestones = addMilestoneCellsToNotebookPair({ - source: sourceNotebook, - codex: codexNotebook, - }); + // Note: corpusMarker is only set at notebook-level metadata, not in individual cells + // This keeps the notebook structure clean and avoids duplication onProgress?.(createProgress('Complete', 'PDF import completed successfully!', 100)); return { success: true, - notebookPair: notebookPairWithMilestones, + notebookPair: { + source: sourceNotebook, + codex: codexNotebook, + }, metadata: { - totalCells: validCells.length, + ...docxResult.metadata, fileType: 'pdf', - importDate: new Date().toISOString(), } }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx index 29067f193..f3d123cc7 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx @@ -401,11 +401,18 @@ export const SpreadsheetImporterForm: React.FC = (props) ? parseGlobalReferencesField(row[parseInt(globalReferencesColumnIndex)]) : []; + // Build the full row values array + const originalRowValues: string[] = []; + for (let i = 0; i < parsedData.columns.length; i++) { + originalRowValues.push(row[i] || ''); + } + // Create cell metadata (always generates UUID) const { cellId, metadata: cellMetadata } = createSpreadsheetCellMetadata({ originalContent: row[parseInt(sourceColumnIndex!)], rowIndex: index, - originalRow: Object.keys(row), + originalRowValues, + sourceColumnIndex: parseInt(sourceColumnIndex!), fileName: selectedFile!.name, globalReferences, }); @@ -418,6 +425,19 @@ export const SpreadsheetImporterForm: React.FC = (props) }; }); + // Extract column headers for round-trip export + const columnHeaders = parsedData.columns.map(col => col.name); + + // Read the original file as TEXT for round-trip export (more reliable than ArrayBuffer for text files) + // This will be stored directly in metadata for the exporter to use + const originalFileContent = await selectedFile!.text(); + + // Determine specific importer type based on file extension/delimiter + const fileExtension = selectedFile!.name.toLowerCase().split('.').pop(); + const spreadsheetType = fileExtension === 'tsv' || parsedData.delimiter === '\t' + ? 
'spreadsheet-tsv' + : 'spreadsheet-csv'; + const notebookPair: NotebookPair = { source: { name: parsedData.filename, @@ -426,10 +446,11 @@ export const SpreadsheetImporterForm: React.FC = (props) id: parsedData.filename, originalFileName: selectedFile!.name, sourceFile: selectedFile!.name, - importerType: "spreadsheet", + importerType: spreadsheetType, + corpusMarker: spreadsheetType, // For rebuild export detection createdAt: new Date().toISOString(), importContext: { - importerType: "spreadsheet", + importerType: spreadsheetType, fileName: selectedFile!.name, originalFileName: selectedFile!.name, fileSize: selectedFile!.size, @@ -438,6 +459,12 @@ export const SpreadsheetImporterForm: React.FC = (props) delimiter: parsedData.delimiter, columnCount: parsedData.columns.length, rowCount: parsedData.rows.length, + // Store for round-trip export + columnHeaders, + sourceColumnIndex: parseInt(sourceColumnIndex!), + // Store original file content as text for round-trip export + // This is passed directly to the exporter (no file system intermediary needed for text) + originalFileContent, }, }, codex: { @@ -450,15 +477,21 @@ export const SpreadsheetImporterForm: React.FC = (props) id: parsedData.filename, originalFileName: selectedFile!.name, sourceFile: selectedFile!.name, - importerType: "spreadsheet", + importerType: spreadsheetType, + corpusMarker: spreadsheetType, // For rebuild export detection createdAt: new Date().toISOString(), importContext: { - importerType: "spreadsheet", + importerType: spreadsheetType, fileName: selectedFile!.name, originalFileName: selectedFile!.name, fileSize: selectedFile!.size, importTimestamp: new Date().toISOString(), }, + delimiter: parsedData.delimiter, + columnHeaders, + sourceColumnIndex: parseInt(sourceColumnIndex!), + // Store original file content for round-trip export + originalFileContent, }, }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts index 5ee0f903b..c3e7b0b9e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts @@ -14,7 +14,10 @@ import { v4 as uuidv4 } from 'uuid'; export interface SpreadsheetCellMetadataParams { originalContent: string; rowIndex: number; - originalRow: string[]; + /** The full original row values (all columns) */ + originalRowValues: string[]; + /** The index of the source content column */ + sourceColumnIndex: number; fileName: string; globalReferences?: string[]; } @@ -22,9 +25,10 @@ export interface SpreadsheetCellMetadataParams { /** * Creates metadata for a Spreadsheet cell * Always generates a UUID for the cell ID. + * Stores the full original row to enable round-trip export. 
*/ export function createSpreadsheetCellMetadata(params: SpreadsheetCellMetadataParams): { metadata: any; cellId: string; } { - const { originalContent, rowIndex, originalRow, fileName, globalReferences } = params; + const { originalContent, rowIndex, originalRowValues, sourceColumnIndex, fileName, globalReferences } = params; const finalCellId = uuidv4(); @@ -36,7 +40,10 @@ export function createSpreadsheetCellMetadata(params: SpreadsheetCellMetadataPar edits: [], data: { rowIndex, - originalRow, + /** Full original row values for round-trip export */ + originalRowValues, + /** Index of the source column that contains the translatable content */ + sourceColumnIndex, originalContent, globalReferences: (globalReferences || []).map((r) => String(r).trim()).filter(Boolean), }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts index 585c29981..13e9fc5c3 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata.ts @@ -70,10 +70,13 @@ export interface PlaintextNotebookMetadata extends ProcessedNotebookMetadataBase } export interface SpreadsheetNotebookMetadata extends ProcessedNotebookMetadataBase { - importerType: "spreadsheet"; + importerType: "spreadsheet" | "spreadsheet-csv" | "spreadsheet-tsv"; delimiter?: string; columnCount?: number; rowCount?: number; + columnHeaders?: string[]; + sourceColumnIndex?: number; + originalFileContent?: string; } export interface SmartSegmenterNotebookMetadata extends ProcessedNotebookMetadataBase { @@ -243,6 +246,8 @@ export type ProcessedNotebookMetadataByImporter = { subtitles: SubtitlesNotebookMetadata; plaintext: PlaintextNotebookMetadata; spreadsheet: SpreadsheetNotebookMetadata; + "spreadsheet-csv": SpreadsheetNotebookMetadata; + "spreadsheet-tsv": SpreadsheetNotebookMetadata; "smart-segmenter": SmartSegmenterNotebookMetadata; audio: AudioNotebookMetadata; tms: TmsNotebookMetadata; From 1d6e3e1e72779fe511cf6ebf74a54360bae03f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?= Date: Sat, 24 Jan 2026 11:15:16 +0100 Subject: [PATCH 2/6] Extra files I forgot to commit --- .../importers/pdf/scripts/README.md | 129 ++++ .../importers/pdf/scripts/docx_to_pdf.py | 300 ++++++++ .../importers/pdf/scripts/pdf_to_docx.py | 694 ++++++++++++++++++ .../spreadsheet/spreadsheetExporter.ts | 325 ++++++++ 4 files changed, 1448 insertions(+) create mode 100644 webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md create mode 100644 webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py create mode 100644 webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py create mode 100644 webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md new file mode 100644 index 000000000..0f0eb00db --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/README.md @@ -0,0 +1,129 @@ +# PDF Conversion Scripts + +These scripts handle PDF↔DOCX conversion with a focus on preserving document formatting, layout, images, and structure. 
+ +## Overview + +Both scripts use a **hybrid approach** that tries multiple conversion methods in order of quality/availability: + +### PDF to DOCX (`pdf_to_docx.py`) +1. **LibreOffice headless** (best quality, free) +2. **pdf2docx** (good for most PDFs) +3. **Rich text extraction** (fallback when others fail) + +### DOCX to PDF (`docx_to_pdf.py`) +1. **LibreOffice headless** (free, no MS Office needed) +2. **docx2pdf** (requires Microsoft Word) + +## Installation + +### Required (Basic functionality) +```bash +pip install PyMuPDF python-docx Pillow +``` + +### Recommended (Better quality) +```bash +pip install pdf2docx docx2pdf +``` + +### Highly Recommended (Best quality - FREE) + +**LibreOffice** provides the best conversion quality and is completely free: + +- **Windows**: Download from https://www.libreoffice.org/download/download/ +- **macOS**: `brew install --cask libreoffice` +- **Linux**: `sudo apt install libreoffice` or `sudo dnf install libreoffice` + +## Quality Comparison + +| Method | Layout | Fonts | Images | Tables | Page Breaks | Headers/Footers | +|--------|--------|-------|--------|--------|-------------|-----------------| +| LibreOffice | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | +| pdf2docx | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | +| Rich text extraction | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐ | ❌ | +| docx2pdf (MS Word) | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | + +## Usage + +### Command Line +```bash +# PDF to DOCX +python pdf_to_docx.py input.pdf output.docx + +# DOCX to PDF +python docx_to_pdf.py input.docx output.pdf +``` + +### From Python +```python +import json +from pdf_to_docx import convert_pdf_to_docx +from docx_to_pdf import convert_docx_to_pdf + +# PDF to DOCX +result = convert_pdf_to_docx("input.pdf", "output.docx") +if result["success"]: + print(f"Converted using: {result['method']}") +else: + print(f"Error: {result['error']}") + +# DOCX to PDF +result = convert_docx_to_pdf("input.docx", "output.pdf") +if result["success"]: + print(f"Converted using: {result['method']}") +else: + print(f"Error: {result['error']}") +``` + +## What Gets Preserved + +### PDF → DOCX +- ✅ Text content and flow +- ✅ Font names, sizes, colors +- ✅ Bold, italic, underline +- ✅ Images (with CMYK→RGB conversion) +- ✅ Tables (when using LibreOffice/pdf2docx) +- ✅ Page breaks +- ✅ Line breaks within paragraphs +- ✅ Multi-column layouts (LibreOffice) +- ✅ Headers/footers (LibreOffice) +- ⚠️ Complex vector graphics may be rasterized +- ⚠️ Form fields may not be editable + +### DOCX → PDF +- ✅ All text formatting +- ✅ Images +- ✅ Tables +- ✅ Page layout +- ✅ Headers/footers +- ✅ Hyperlinks + +## Troubleshooting + +### "LibreOffice not found" +Install LibreOffice from https://www.libreoffice.org/ + +### "docx2pdf requires Microsoft Word" +Either: +1. Install LibreOffice (free alternative) +2. Install Microsoft Word (Windows/macOS only) + +### CMYK image errors +The scripts automatically handle CMYK images by: +1. Converting CMYK to RGB using PIL +2. Saving as PNG for compatibility + +### Large file timeouts +Increase the timeout in the Python scripts if working with very large PDFs (default is 10 minutes for PDF→DOCX). 
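+
+## Calling from Node.js
+
+For reference, a minimal, hypothetical sketch of driving `pdf_to_docx.py` from a Node.js host (the function name and the assumption that `python` is on `PATH` are illustrative). The only contract the scripts provide is progress JSON on stderr and a single result JSON object on stdout:
+
+```typescript
+import { execFile } from "node:child_process";
+
+interface ConversionResult {
+    success: boolean;
+    method?: string;
+    outputPath?: string;
+    error?: string;
+}
+
+// Spawn the converter and parse the final JSON line printed to stdout.
+function convertPdfToDocx(pdfPath: string, docxPath: string): Promise<ConversionResult> {
+    return new Promise((resolve, reject) => {
+        execFile(
+            "python",
+            ["pdf_to_docx.py", pdfPath, docxPath],
+            { timeout: 600_000 }, // mirror the scripts' 10-minute ceiling
+            (error, stdout) => {
+                const lastLine = stdout.trim().split("\n").pop();
+                if (!lastLine) {
+                    return reject(error ?? new Error("No output from converter"));
+                }
+                try {
+                    resolve(JSON.parse(lastLine) as ConversionResult);
+                } catch {
+                    reject(error ?? new Error(`Unexpected converter output: ${lastLine}`));
+                }
+            }
+        );
+    });
+}
+```
+
+The same pattern applies to `docx_to_pdf.py`; only the argument order and the result payload (`pdfBase64` in addition to the output path) differ.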
+ +## Dependencies + +| Package | Purpose | Required | +|---------|---------|----------| +| PyMuPDF | PDF parsing and text extraction | Yes | +| python-docx | DOCX creation | Yes | +| Pillow | Image handling (CMYK conversion) | Yes | +| pdf2docx | Direct PDF→DOCX conversion | Recommended | +| docx2pdf | DOCX→PDF via MS Word | Optional | +| LibreOffice | High-quality conversion | **Highly Recommended** | diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py new file mode 100644 index 000000000..a448f45d4 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/docx_to_pdf.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +DOCX to PDF Converter - Hybrid Approach + +Tries multiple conversion methods in order of availability: +1. LibreOffice headless (free, cross-platform, no MS Office required) +2. docx2pdf (requires Microsoft Word on Windows/macOS) + +This ensures the conversion works on systems without Microsoft Office. +""" + +import sys +import os +import json +import base64 +import tempfile +import shutil +import subprocess +from pathlib import Path + + +def find_libreoffice() -> str | None: + """Find LibreOffice executable on the system.""" + if sys.platform == 'win32': + possible_paths = [ + r"C:\Program Files\LibreOffice\program\soffice.exe", + r"C:\Program Files (x86)\LibreOffice\program\soffice.exe", + os.path.expandvars(r"%PROGRAMFILES%\LibreOffice\program\soffice.exe"), + os.path.expandvars(r"%PROGRAMFILES(X86)%\LibreOffice\program\soffice.exe"), + ] + for path in possible_paths: + if os.path.exists(path): + return path + try: + result = subprocess.run(['where', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip().split('\n')[0] + except: + pass + else: + possible_paths = [ + '/usr/bin/soffice', + '/usr/bin/libreoffice', + '/Applications/LibreOffice.app/Contents/MacOS/soffice', + '/opt/libreoffice/program/soffice', + ] + for path in possible_paths: + if os.path.exists(path): + return path + try: + result = subprocess.run(['which', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + try: + result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + + return None + + +def convert_with_libreoffice(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using LibreOffice headless mode. + This is the preferred method as it doesn't require Microsoft Office. 
+ """ + soffice = find_libreoffice() + if not soffice: + return { + "success": False, + "error": "LibreOffice not found", + "method": "libreoffice" + } + + try: + print(json.dumps({"info": "Converting with LibreOffice..."}), file=sys.stderr) + + # Create temp directory for output + temp_dir = tempfile.mkdtemp(prefix="lo_pdf_") + + try: + # LibreOffice command for DOCX to PDF conversion + cmd = [ + soffice, + "--headless", + "--convert-to", "pdf", + "--outdir", temp_dir, + docx_path + ] + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300 # 5 minute timeout + ) + + if result.returncode != 0: + error_msg = result.stderr or result.stdout or "Unknown error" + return { + "success": False, + "error": f"LibreOffice conversion failed: {error_msg}", + "method": "libreoffice" + } + + # Find the output file + docx_basename = os.path.splitext(os.path.basename(docx_path))[0] + temp_output = os.path.join(temp_dir, f"{docx_basename}.pdf") + + if not os.path.exists(temp_output): + # Try finding any PDF in the output dir + for f in os.listdir(temp_dir): + if f.endswith('.pdf'): + temp_output = os.path.join(temp_dir, f) + break + + if not os.path.exists(temp_output): + return { + "success": False, + "error": "LibreOffice did not create PDF output", + "method": "libreoffice" + } + + # Verify file has content + file_size = os.path.getsize(temp_output) + if file_size == 0: + return { + "success": False, + "error": "LibreOffice created empty PDF", + "method": "libreoffice" + } + + # Move to final location + shutil.move(temp_output, output_path) + + # Read and encode PDF + with open(output_path, 'rb') as f: + pdf_bytes = f.read() + + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + print(json.dumps({"info": f"LibreOffice PDF conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "pdfBase64": pdf_base64, + "outputPath": output_path, + "method": "libreoffice" + } + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + except subprocess.TimeoutExpired: + return { + "success": False, + "error": "LibreOffice conversion timed out", + "method": "libreoffice" + } + except Exception as e: + return { + "success": False, + "error": f"LibreOffice error: {str(e)}", + "method": "libreoffice" + } + + +def convert_with_docx2pdf(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using docx2pdf library. + Requires Microsoft Word on Windows/macOS. + """ + try: + from docx2pdf import convert + except ImportError: + return { + "success": False, + "error": "docx2pdf not installed. Install with: pip install docx2pdf", + "method": "docx2pdf" + } + + try: + print(json.dumps({"info": "Converting with docx2pdf (requires MS Word)..."}), file=sys.stderr) + + convert(docx_path, output_path) + + if not os.path.exists(output_path): + return { + "success": False, + "error": "docx2pdf did not create PDF. 
Is Microsoft Word installed?", + "method": "docx2pdf" + } + + file_size = os.path.getsize(output_path) + if file_size == 0: + return { + "success": False, + "error": "docx2pdf created empty PDF", + "method": "docx2pdf" + } + + # Read and encode PDF + with open(output_path, 'rb') as f: + pdf_bytes = f.read() + + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + print(json.dumps({"info": f"docx2pdf conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "pdfBase64": pdf_base64, + "outputPath": output_path, + "method": "docx2pdf" + } + + except Exception as e: + error_msg = str(e) if str(e) else repr(e) + + # Provide helpful error messages + if any(x in error_msg for x in ["COM", "Word", "win32com", "Microsoft"]): + error_msg += ". docx2pdf requires Microsoft Word to be installed." + + return { + "success": False, + "error": error_msg, + "method": "docx2pdf" + } + + +def convert_docx_to_pdf(docx_path: str, output_path: str) -> dict: + """ + Convert DOCX to PDF using the best available method. + + Tries methods in order: + 1. LibreOffice headless (preferred, free, no MS Office needed) + 2. docx2pdf (requires Microsoft Word) + """ + print(json.dumps({"info": "Starting DOCX to PDF conversion..."}), file=sys.stderr) + + # Verify input file exists + if not os.path.exists(docx_path): + return { + "success": False, + "error": f"Input DOCX file not found: {docx_path}" + } + + # Ensure output directory exists + output_dir = os.path.dirname(output_path) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + methods_tried = [] + + # Method 1: Try LibreOffice (preferred, free) + print(json.dumps({"info": "Attempting Method 1: LibreOffice..."}), file=sys.stderr) + result = convert_with_libreoffice(docx_path, output_path) + methods_tried.append(f"LibreOffice: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ LibreOffice PDF conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"LibreOffice failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 2: Try docx2pdf (requires MS Word) + print(json.dumps({"info": "Attempting Method 2: docx2pdf..."}), file=sys.stderr) + result = convert_with_docx2pdf(docx_path, output_path) + methods_tried.append(f"docx2pdf: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ docx2pdf conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"docx2pdf failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # All methods failed + return { + "success": False, + "error": f"All conversion methods failed. Install LibreOffice (free) from https://www.libreoffice.org/ or Microsoft Word. 
Tried: {'; '.join(methods_tried)}" + } + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(json.dumps({ + "success": False, + "error": "Usage: docx_to_pdf.py " + })) + sys.exit(1) + + docx_path = sys.argv[1] + output_path = sys.argv[2] + + result = convert_docx_to_pdf(docx_path, output_path) + print(json.dumps(result)) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py new file mode 100644 index 000000000..ef03d9c58 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/scripts/pdf_to_docx.py @@ -0,0 +1,694 @@ +#!/usr/bin/env python3 +""" +PDF to DOCX Converter - Hybrid Approach + +Tries multiple conversion methods in order of quality: +1. LibreOffice headless (best layout preservation, free) +2. pdf2docx library (good for most PDFs) +3. Rich text extraction fallback (preserves content when others fail) + +Preserves fonts, sizes, colors, images, line breaks, page breaks, +tables, headers/footers, and document structure. +""" + +import sys +import os +import json +import io +import tempfile +import shutil +import subprocess +from pathlib import Path + +# Check for required libraries +try: + import fitz # PyMuPDF + HAS_PYMUPDF = True +except ImportError: + HAS_PYMUPDF = False + +try: + from pdf2docx import Converter + HAS_PDF2DOCX = True +except ImportError: + HAS_PDF2DOCX = False + +try: + from docx import Document + from docx.shared import Pt, Inches, RGBColor, Emu, Twips + from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK + from docx.oxml.ns import qn + from docx.oxml import OxmlElement + HAS_PYTHON_DOCX = True +except ImportError: + HAS_PYTHON_DOCX = False + +try: + from PIL import Image + HAS_PIL = True +except ImportError: + HAS_PIL = False + + +def find_libreoffice() -> str | None: + """Find LibreOffice executable on the system.""" + if sys.platform == 'win32': + # Common Windows paths + possible_paths = [ + r"C:\Program Files\LibreOffice\program\soffice.exe", + r"C:\Program Files (x86)\LibreOffice\program\soffice.exe", + os.path.expandvars(r"%PROGRAMFILES%\LibreOffice\program\soffice.exe"), + os.path.expandvars(r"%PROGRAMFILES(X86)%\LibreOffice\program\soffice.exe"), + ] + for path in possible_paths: + if os.path.exists(path): + return path + # Try PATH + try: + result = subprocess.run(['where', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip().split('\n')[0] + except: + pass + else: + # macOS / Linux + possible_paths = [ + '/usr/bin/soffice', + '/usr/bin/libreoffice', + '/Applications/LibreOffice.app/Contents/MacOS/soffice', + '/opt/libreoffice/program/soffice', + ] + for path in possible_paths: + if os.path.exists(path): + return path + # Try PATH + try: + result = subprocess.run(['which', 'soffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + try: + result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except: + pass + + return None + + +def convert_with_libreoffice(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using LibreOffice headless mode. + This method provides the best layout preservation for most PDFs. 
+ """ + soffice = find_libreoffice() + if not soffice: + return { + "success": False, + "error": "LibreOffice not found. Install from https://www.libreoffice.org/", + "method": "libreoffice" + } + + try: + print(json.dumps({"info": "Converting with LibreOffice (best quality)..."}), file=sys.stderr) + + # Create temp directory for output + temp_dir = tempfile.mkdtemp(prefix="lo_convert_") + + try: + # LibreOffice command for PDF to DOCX conversion + # --infilter specifies PDF import filter + # --convert-to specifies output format + cmd = [ + soffice, + "--headless", + "--infilter=writer_pdf_import", + "--convert-to", "docx:Office Open XML Text", + "--outdir", temp_dir, + pdf_path + ] + + print(json.dumps({"info": f"Running: {' '.join(cmd[:4])}..."}), file=sys.stderr) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=600 # 10 minute timeout for large PDFs + ) + + if result.returncode != 0: + error_msg = result.stderr or result.stdout or "Unknown error" + return { + "success": False, + "error": f"LibreOffice conversion failed: {error_msg}", + "method": "libreoffice" + } + + # Find the output file + pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0] + temp_output = os.path.join(temp_dir, f"{pdf_basename}.docx") + + if not os.path.exists(temp_output): + # Try alternative naming + for f in os.listdir(temp_dir): + if f.endswith('.docx'): + temp_output = os.path.join(temp_dir, f) + break + + if not os.path.exists(temp_output): + return { + "success": False, + "error": "LibreOffice did not create output file", + "method": "libreoffice" + } + + # Check file size + file_size = os.path.getsize(temp_output) + if file_size == 0: + return { + "success": False, + "error": "LibreOffice created empty output file", + "method": "libreoffice" + } + + # Move to final location + shutil.move(temp_output, output_path) + + print(json.dumps({"info": f"LibreOffice conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "libreoffice" + } + + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + except subprocess.TimeoutExpired: + return { + "success": False, + "error": "LibreOffice conversion timed out after 10 minutes", + "method": "libreoffice" + } + except Exception as e: + return { + "success": False, + "error": f"LibreOffice error: {str(e)}", + "method": "libreoffice" + } + + +def convert_with_pdf2docx(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using pdf2docx library. + Good for most text-based PDFs. + """ + if not HAS_PDF2DOCX: + return { + "success": False, + "error": "pdf2docx not installed. 
Install with: pip install pdf2docx", + "method": "pdf2docx" + } + + try: + print(json.dumps({"info": "Converting with pdf2docx..."}), file=sys.stderr) + + cv = Converter(pdf_path) + cv.convert(output_path, start=0, end=None) + cv.close() + + # Verify output + if not os.path.exists(output_path): + return { + "success": False, + "error": "pdf2docx did not create output file", + "method": "pdf2docx" + } + + file_size = os.path.getsize(output_path) + if file_size == 0: + return { + "success": False, + "error": "pdf2docx created empty output file", + "method": "pdf2docx" + } + + print(json.dumps({"info": f"pdf2docx conversion successful ({file_size} bytes)"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "pdf2docx" + } + + except Exception as e: + error_msg = str(e) + # Check for known issues + is_recoverable = any(x in error_msg.lower() for x in [ + "pixmap must be grayscale or rgb", + "code=4", + "colorspace", + "cmyk" + ]) + + return { + "success": False, + "error": error_msg, + "method": "pdf2docx", + "recoverable": is_recoverable + } + + +def sanitize_text(text: str) -> str: + """ + Remove control characters that are not valid in XML/DOCX. + Keeps normal whitespace (space, tab, newline, carriage return). + """ + if not text: + return "" + + result = [] + for char in text: + code = ord(char) + # Valid XML chars: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] + if code == 0x9 or code == 0xA or code == 0xD or (code >= 0x20 and code <= 0xD7FF) or (code >= 0xE000 and code <= 0xFFFD): + result.append(char) + elif code < 0x20: + result.append(' ') + + return ''.join(result) + + +def get_rgb_from_color(color_value) -> tuple: + """Convert PyMuPDF color value to RGB tuple.""" + if color_value is None: + return (0, 0, 0) + + if isinstance(color_value, (list, tuple)): + if len(color_value) == 3: + return tuple(int(c * 255) for c in color_value) + elif len(color_value) == 1: + gray = int(color_value[0] * 255) + return (gray, gray, gray) + elif len(color_value) == 4: + c, m, y, k = color_value + r = int(255 * (1 - c) * (1 - k)) + g = int(255 * (1 - m) * (1 - k)) + b = int(255 * (1 - y) * (1 - k)) + return (r, g, b) + elif isinstance(color_value, (int, float)): + if isinstance(color_value, float): + gray = int(color_value * 255) + return (gray, gray, gray) + else: + if color_value == 0: + return (0, 0, 0) + r = (color_value >> 16) & 0xFF + g = (color_value >> 8) & 0xFF + b = color_value & 0xFF + return (r, g, b) + + return (0, 0, 0) + + +def extract_images_from_page(page, page_num: int, temp_dir: str) -> list: + """Extract all images from a PDF page and save them to temp files.""" + images = [] + + try: + image_list = page.get_images(full=True) + + for img_index, img_info in enumerate(image_list): + try: + xref = img_info[0] + base_image = page.parent.extract_image(xref) + if not base_image: + continue + + image_bytes = base_image.get("image") + image_ext = base_image.get("ext", "png") + + if not image_bytes: + continue + + # Convert CMYK to RGB if needed + if HAS_PIL and image_ext in ["jpeg", "jpg"]: + try: + img = Image.open(io.BytesIO(image_bytes)) + if img.mode == "CMYK": + img = img.convert("RGB") + buffer = io.BytesIO() + img.save(buffer, format="PNG") + image_bytes = buffer.getvalue() + image_ext = "png" + except Exception as pil_err: + print(json.dumps({"warning": f"PIL conversion failed: {pil_err}"}), file=sys.stderr) + + image_filename = f"page{page_num}_img{img_index}.{image_ext}" + image_path = os.path.join(temp_dir, image_filename) + + with 
open(image_path, "wb") as img_file: + img_file.write(image_bytes) + + img_rects = page.get_image_rects(xref) + if img_rects: + bbox = img_rects[0] + images.append((image_path, bbox, base_image.get("width", 100), base_image.get("height", 100))) + else: + images.append((image_path, None, base_image.get("width", 100), base_image.get("height", 100))) + + except Exception as img_err: + print(json.dumps({"warning": f"Failed to extract image {img_index}: {img_err}"}), file=sys.stderr) + continue + + except Exception as e: + print(json.dumps({"warning": f"Image extraction error on page {page_num}: {e}"}), file=sys.stderr) + + images.sort(key=lambda x: x[1][1] if x[1] else float('inf')) + return images + + +def add_image_to_doc(doc, image_path: str, width: int = None, height: int = None, max_width_inches: float = 6.0): + """Add an image to the document with appropriate sizing.""" + try: + if width and height: + aspect_ratio = height / width + img_width = min(max_width_inches, width / 96) + img_height = img_width * aspect_ratio + else: + img_width = max_width_inches / 2 + img_height = None + + para = doc.add_paragraph() + run = para.add_run() + + if img_height: + run.add_picture(image_path, width=Inches(img_width), height=Inches(img_height)) + else: + run.add_picture(image_path, width=Inches(img_width)) + + return True + except Exception as e: + print(json.dumps({"warning": f"Failed to add image: {e}"}), file=sys.stderr) + return False + + +def add_formatted_paragraph_with_breaks(doc, runs: list, page_avg_size: float): + """Add a paragraph to the document with formatted runs and line breaks.""" + if not runs: + return + + full_text = ''.join(r[0] for r in runs if not r[6]).strip() + if not full_text: + return + + para = doc.add_paragraph() + + first_non_break = next((r for r in runs if not r[6]), None) + first_size = first_non_break[2] if first_non_break else 12 + is_heading = first_size > page_avg_size * 1.3 + + for run_data in runs: + text, font_name, font_size, color, is_bold, is_italic, is_line_break = run_data + + if is_line_break: + run = para.add_run() + run.add_break(WD_BREAK.LINE) + continue + + if not text: + continue + + run = para.add_run(text) + run.font.size = Pt(font_size) + + try: + run.font.name = font_name + run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + except: + run.font.name = 'Arial' + + if color != (0, 0, 0): + try: + run.font.color.rgb = RGBColor(color[0], color[1], color[2]) + except: + pass + + if is_bold or is_heading: + run.font.bold = True + if is_italic: + run.font.italic = True + + +def extract_text_to_docx(pdf_path: str, output_path: str) -> dict: + """ + Extract text from PDF and create a DOCX with preserved formatting. + Fallback method when other converters fail. + """ + if not HAS_PYTHON_DOCX: + return { + "success": False, + "error": "python-docx not installed. Install with: pip install python-docx", + "method": "rich_text_extraction" + } + + if not HAS_PYMUPDF: + return { + "success": False, + "error": "PyMuPDF not installed. 
Install with: pip install PyMuPDF", + "method": "rich_text_extraction" + } + + temp_dir = tempfile.mkdtemp(prefix="pdf_images_") + + try: + print(json.dumps({"info": "Using rich text extraction fallback..."}), file=sys.stderr) + + pdf_doc = fitz.open(pdf_path) + total_pages = len(pdf_doc) + + doc = Document() + + print(json.dumps({"info": f"Extracting from {total_pages} pages..."}), file=sys.stderr) + + total_images = 0 + + for page_num in range(total_pages): + page = pdf_doc[page_num] + page_height = page.rect.height + + page_images = extract_images_from_page(page, page_num, temp_dir) + total_images += len(page_images) + + text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE) + blocks = text_dict.get("blocks", []) + + # Calculate average font size + all_sizes = [] + for block in blocks: + if block.get("type") == 0: + for line in block.get("lines", []): + for span in line.get("spans", []): + all_sizes.append(span.get("size", 12)) + + page_avg_size = sum(all_sizes) / len(all_sizes) if all_sizes else 12 + + # Combine content items sorted by position + content_items = [] + + for block in blocks: + if block.get("type") == 0: + bbox = block.get("bbox", [0, 0, 0, 0]) + content_items.append(("text", bbox[1], block)) + + for img_path, img_bbox, img_width, img_height in page_images: + y_pos = img_bbox[1] if img_bbox else page_height + content_items.append(("image", y_pos, (img_path, img_width, img_height))) + + content_items.sort(key=lambda x: x[1]) + + # Process content + for item_type, y_pos, item_data in content_items: + if item_type == "text": + block = item_data + lines = block.get("lines", []) + if not lines: + continue + + current_para_runs = [] + last_y1 = None + last_size = None + + for line in lines: + spans = line.get("spans", []) + if not spans: + continue + + line_bbox = line.get("bbox", [0, 0, 0, 0]) + line_y0 = line_bbox[1] + line_y1 = line_bbox[3] + line_height = line_y1 - line_y0 if line_y1 > line_y0 else 12 + + start_new_para = False + add_line_break = False + + if last_y1 is not None: + gap = line_y0 - last_y1 + if gap > line_height * 1.0: + start_new_para = True + elif gap > line_height * 0.2: + add_line_break = True + + first_span = spans[0] + current_size = first_span.get("size", 12) + if last_size is not None and abs(current_size - last_size) > 3: + start_new_para = True + + if start_new_para and current_para_runs: + add_formatted_paragraph_with_breaks(doc, current_para_runs, page_avg_size) + current_para_runs = [] + add_line_break = False + + if add_line_break and current_para_runs: + last_run = current_para_runs[-1] + current_para_runs.append(('\n', last_run[1], last_run[2], last_run[3], last_run[4], last_run[5], True)) + + for span in spans: + text = span.get("text", "") + if not text: + continue + + text = sanitize_text(text) + if not text: + continue + + font_name = span.get("font", "Arial") + font_size = span.get("size", 12) + color = get_rgb_from_color(span.get("color")) + flags = span.get("flags", 0) + is_bold = bool(flags & (1 << 4)) + is_italic = bool(flags & (1 << 1)) + + if '+' in font_name: + font_name = font_name.split('+', 1)[1] + + current_para_runs.append((text, font_name, font_size, color, is_bold, is_italic, False)) + + last_y1 = line_y1 + last_size = current_size + + if current_para_runs: + add_formatted_paragraph_with_breaks(doc, current_para_runs, page_avg_size) + + elif item_type == "image": + img_path, img_width, img_height = item_data + add_image_to_doc(doc, img_path, img_width, img_height) + + # Page break between pages + if page_num 
< total_pages - 1: + doc.add_page_break() + + if (page_num + 1) % 10 == 0 or (page_num + 1) == total_pages: + progress = int((page_num + 1) / total_pages * 100) + print(json.dumps({"info": f"Progress: {page_num + 1}/{total_pages} ({progress}%)"}), file=sys.stderr) + + pdf_doc.close() + doc.save(output_path) + + file_size = os.path.getsize(output_path) + print(json.dumps({"info": f"Rich text extraction complete: {total_pages} pages, {total_images} images, {file_size} bytes"}), file=sys.stderr) + + return { + "success": True, + "outputPath": output_path, + "method": "rich_text_extraction" + } + + except Exception as e: + import traceback + error_details = traceback.format_exc() + print(json.dumps({"error": f"Text extraction error: {error_details}"}), file=sys.stderr) + return { + "success": False, + "error": f"Text extraction failed: {str(e)}", + "method": "rich_text_extraction" + } + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +def convert_pdf_to_docx(pdf_path: str, output_path: str) -> dict: + """ + Convert PDF to DOCX using the best available method. + + Tries methods in order of quality: + 1. LibreOffice headless (best layout preservation) + 2. pdf2docx (good for most PDFs) + 3. Rich text extraction (fallback) + """ + print(json.dumps({"info": "Starting PDF to DOCX conversion (hybrid approach)..."}), file=sys.stderr) + + # Verify input file exists + if not os.path.exists(pdf_path): + return { + "success": False, + "error": f"Input PDF file not found: {pdf_path}" + } + + methods_tried = [] + + # Method 1: Try LibreOffice (best quality) + print(json.dumps({"info": "Attempting Method 1: LibreOffice..."}), file=sys.stderr) + result = convert_with_libreoffice(pdf_path, output_path) + methods_tried.append(f"LibreOffice: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ LibreOffice conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"LibreOffice failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 2: Try pdf2docx + print(json.dumps({"info": "Attempting Method 2: pdf2docx..."}), file=sys.stderr) + result = convert_with_pdf2docx(pdf_path, output_path) + methods_tried.append(f"pdf2docx: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ pdf2docx conversion successful"}), file=sys.stderr) + return result + else: + print(json.dumps({"warning": f"pdf2docx failed: {result.get('error', 'unknown')}"}), file=sys.stderr) + + # Method 3: Rich text extraction fallback + print(json.dumps({"info": "Attempting Method 3: Rich text extraction..."}), file=sys.stderr) + result = extract_text_to_docx(pdf_path, output_path) + methods_tried.append(f"Rich text: {result.get('error', 'success') if not result['success'] else 'success'}") + + if result["success"]: + print(json.dumps({"info": "✓ Rich text extraction successful"}), file=sys.stderr) + return result + + # All methods failed + return { + "success": False, + "error": f"All conversion methods failed. 
Tried: {'; '.join(methods_tried)}"
    }


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(json.dumps({
            "success": False,
            "error": "Usage: pdf_to_docx.py <input.pdf> <output.docx>"
        }))
        sys.exit(1)

    pdf_path = sys.argv[1]
    output_path = sys.argv[2]

    result = convert_pdf_to_docx(pdf_path, output_path)
    print(json.dumps(result))
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts
new file mode 100644
index 000000000..b37059bb6
--- /dev/null
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts
@@ -0,0 +1,325 @@
+/**
+ * Spreadsheet Exporter - True Round-trip Export
+ *
+ * Exports codex notebooks back to CSV/TSV format with translations.
+ * Uses the original file content stored during import, only replacing
+ * the source column content with translations while keeping everything
+ * else exactly the same.
+ *
+ * Supports both spreadsheet-csv and spreadsheet-tsv importer types.
+ */
+
+export interface SpreadsheetCell {
+    id: string;
+    value: string;
+    metadata: {
+        id?: string;
+        data?: {
+            rowIndex?: number;
+            originalRowValues?: string[];
+            sourceColumnIndex?: number;
+            originalContent?: string;
+            globalReferences?: string[];
+        };
+    };
+}
+
+export interface SpreadsheetNotebookMetadata {
+    delimiter?: string;
+    originalFileName?: string;
+    originalFileContent?: string;
+    columnHeaders?: string[];
+    sourceColumnIndex?: number;
+    columnCount?: number;
+    importerType?: string;
+}
+
+/**
+ * Parse a CSV/TSV line with proper quote handling
+ */
+function parseCSVLine(line: string, delimiter: string): string[] {
+    const result: string[] = [];
+    let current = '';
+    let inQuotes = false;
+    let i = 0;
+
+    while (i < line.length) {
+        const char = line[i];
+        const nextChar = line[i + 1];
+
+        if (char === '"') {
+            if (inQuotes && nextChar === '"') {
+                current += '"';
+                i += 2;
+            } else {
+                inQuotes = !inQuotes;
+                i++;
+            }
+        } else if (char === delimiter && !inQuotes) {
+            result.push(current);
+            current = '';
+            i++;
+        } else {
+            current += char;
+            i++;
+        }
+    }
+
+    result.push(current);
+    return result;
+}
+
+/**
+ * Escape a field value for CSV/TSV output
+ */
+function escapeField(value: string, delimiter: string): string {
+    if (value === null || value === undefined) return '';
+    const strValue = String(value);
+
+    const needsQuotes = strValue.includes(delimiter) ||
+        strValue.includes('"') ||
+        strValue.includes('\n') ||
+        strValue.includes('\r');
+
+    if (needsQuotes) {
+        const escaped = strValue.replace(/"/g, '""');
+        return `"${escaped}"`;
+    }
+
+    return strValue;
+}
+
+/**
+ * Remove HTML tags from content (translations might have HTML)
+ */
+function stripHtmlTags(html: string): string {
+    if (!html) return '';
+    return html
+        .replace(/<[^>]*>/g, '')
+        .replace(/&nbsp;/g, ' ')
+        .replace(/&amp;/g, '&')
+        .replace(/&lt;/g, '<')
+        .replace(/&gt;/g, '>')
+        .replace(/&quot;/g, '"')
+        .replace(/&#39;/g, "'")
+        .trim();
+}
+
+/**
+ * Export codex cells to spreadsheet format (CSV or TSV)
+ *
+ * TRUE ROUND-TRIP EXPORT:
+ * - Uses the original file content stored during import
+ * - Keeps the HEADER ROW exactly as it was (no changes)
+ * - Replaces ONLY the source column in DATA ROWS with translations
+ * - Preserves everything else exactly as it was
+ */
+export function exportSpreadsheetWithTranslations(
+    cells: SpreadsheetCell[],
+    metadata: SpreadsheetNotebookMetadata
+): string {
+    const originalFileContent =
metadata.originalFileContent; + const sourceColumnIndex = metadata.sourceColumnIndex; + + // Determine delimiter from importerType or metadata + let delimiter = metadata.delimiter || ','; + if (metadata.importerType === 'spreadsheet-tsv') { + delimiter = '\t'; + } else if (metadata.importerType === 'spreadsheet-csv') { + delimiter = ','; + } + + console.log(`[Spreadsheet Export] importerType: ${metadata.importerType}, delimiter: "${delimiter === '\t' ? 'TAB' : delimiter}"`); + + // Build a map of rowIndex -> translation + const translationsByRow = new Map(); + for (const cell of cells) { + const rowIndex = cell.metadata?.data?.rowIndex; + const translation = stripHtmlTags(cell.value || ''); + + if (typeof rowIndex === 'number' && translation) { + translationsByRow.set(rowIndex, translation); + } + } + + console.log(`[Spreadsheet Export] Built translation map with ${translationsByRow.size} translations`); + console.log(`[Spreadsheet Export] originalFileContent: ${originalFileContent ? 'found' : 'missing'}, sourceColumnIndex: ${sourceColumnIndex}`); + + // If we have the original file content, do true round-trip + if (originalFileContent) { + // Default to column index 2 (third column, typically "Transcrição") if not specified + const effectiveSourceColumnIndex = typeof sourceColumnIndex === 'number' ? sourceColumnIndex : 2; + + console.log(`[Spreadsheet Export] Using original file content for true round-trip export`); + console.log(`[Spreadsheet Export] Effective source column index: ${effectiveSourceColumnIndex}`); + console.log(`[Spreadsheet Export] Original content length: ${originalFileContent.length} chars`); + + // Remove BOM if present (UTF-8 BOM: EF BB BF) + let cleanContent = originalFileContent; + if (cleanContent.charCodeAt(0) === 0xFEFF) { + cleanContent = cleanContent.substring(1); + console.log(`[Spreadsheet Export] Removed BOM from content`); + } + + // Handle both Unix (\n) and Windows (\r\n) line endings + const lines = cleanContent.split(/\r?\n/); + const outputLines: string[] = []; + + console.log(`[Spreadsheet Export] File has ${lines.length} lines`); + + // First line is ALWAYS the header - keep it EXACTLY as is + if (lines.length > 0) { + const headerLine = lines[0]; + // Keep header line unchanged - DO NOT parse or modify it + outputLines.push(headerLine); + console.log(`[Spreadsheet Export] Preserved header (${headerLine.length} chars): "${headerLine.substring(0, 100)}${headerLine.length > 100 ? '...' 
: ''}"`); + } + + // Process data rows (skip first line which is header) + for (let i = 1; i < lines.length; i++) { + const line = lines[i]; + + // Skip empty lines at the end + if (!line.trim() && i === lines.length - 1) { + continue; + } + + // Skip completely empty lines + if (!line.trim()) { + outputLines.push(line); + continue; + } + + // Data row index (0-based, excluding header) + const dataRowIndex = i - 1; + + // Check if we have a translation for this row + const translation = translationsByRow.get(dataRowIndex); + + if (translation) { + // Parse the line to replace the source column + const fields = parseCSVLine(line, delimiter); + + if (effectiveSourceColumnIndex < fields.length) { + // Replace the source column with the translation + fields[effectiveSourceColumnIndex] = translation; + } + + // Rebuild the line with proper escaping + const outputLine = fields.map(f => escapeField(f, delimiter)).join(delimiter); + outputLines.push(outputLine); + } else { + // No translation for this row - keep it exactly as is + outputLines.push(line); + } + } + + console.log(`[Spreadsheet Export] Output ${outputLines.length} lines (1 header + ${outputLines.length - 1} data rows)`); + return outputLines.join('\n'); + } + + // Fallback: reconstruct from cell metadata (for legacy imports without originalFileContent) + console.log(`[Spreadsheet Export] Fallback: reconstructing from cell metadata`); + + const rows: string[] = []; + const columnHeaders = metadata.columnHeaders; + + // Add header row if available + if (columnHeaders && columnHeaders.length > 0) { + const headerRow = columnHeaders.map(h => escapeField(h, delimiter)); + rows.push(headerRow.join(delimiter)); + } + + // Sort cells by rowIndex + const sortedCells = [...cells].sort((a, b) => { + const aIndex = a.metadata?.data?.rowIndex ?? 0; + const bIndex = b.metadata?.data?.rowIndex ?? 0; + return aIndex - bIndex; + }); + + // Build data rows + for (const cell of sortedCells) { + const cellData = cell.metadata?.data; + const originalRowValues = cellData?.originalRowValues; + const cellSourceColumnIndex = cellData?.sourceColumnIndex ?? 
sourceColumnIndex; + const translation = stripHtmlTags(cell.value || ''); + + if (originalRowValues && originalRowValues.length > 0) { + const rowValues = [...originalRowValues]; + + if (typeof cellSourceColumnIndex === 'number' && cellSourceColumnIndex < rowValues.length) { + if (translation) { + rowValues[cellSourceColumnIndex] = translation; + } + } + + const escapedRow = rowValues.map(v => escapeField(v, delimiter)); + rows.push(escapedRow.join(delimiter)); + } else { + // Minimal fallback + const originalContent = cellData?.originalContent || ''; + const globalRefs = cellData?.globalReferences || []; + + const simpleRow: string[] = []; + if (globalRefs.length > 0) { + simpleRow.push(escapeField(globalRefs.join('; '), delimiter)); + } + simpleRow.push(escapeField(translation || originalContent, delimiter)); + + rows.push(simpleRow.join(delimiter)); + } + } + + return rows.join('\n'); +} + +/** + * Determine the appropriate file extension based on importer type or original file + */ +export function getSpreadsheetExtension(originalFileName: string | undefined, delimiter: string, importerType?: string): string { + // Check importer type first + if (importerType === 'spreadsheet-tsv') { + return 'tsv'; + } + if (importerType === 'spreadsheet-csv') { + return 'csv'; + } + + // Check original filename + if (originalFileName) { + const ext = originalFileName.toLowerCase().split('.').pop(); + if (ext === 'csv' || ext === 'tsv') { + return ext; + } + } + + // Default based on delimiter + return delimiter === '\t' ? 'tsv' : 'csv'; +} + +/** + * Determine delimiter from importer type, original file extension, or metadata + */ +export function getDelimiterFromMetadata(metadata: any): string { + // Check importer type first + if (metadata?.importerType === 'spreadsheet-tsv') { + return '\t'; + } + if (metadata?.importerType === 'spreadsheet-csv') { + return ','; + } + + // Check explicit delimiter in metadata + if (metadata?.delimiter) { + return metadata.delimiter; + } + + // Check original filename extension + const originalFileName = metadata?.originalFileName || ''; + if (originalFileName.toLowerCase().endsWith('.tsv')) { + return '\t'; + } + + // Default to comma (CSV) + return ','; +} From 9fe3bcfca69cc7e0ad0b5f43ef2361e47d270c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?= Date: Sat, 24 Jan 2026 11:49:36 +0100 Subject: [PATCH 3/6] Fixed issues with push --- .../NewSourceUploaderProvider.ts | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index b7caefe0e..899001be5 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -14,6 +14,7 @@ import { handleFinalizeAudioImport, } from "./importers/audioSplitter"; import { ProcessedNotebook } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/common"; +import type { SpreadsheetNotebookMetadata } from "../../../webviews/codex-webviews/src/NewSourceUploader/types/processedNotebookMetadata"; import { NotebookPreview, CustomNotebookMetadata } from "../../../types"; import { CodexCell } from "../../utils/codexNotebookUtils"; import { CodexCellTypes } from "../../../types/enums"; @@ -783,18 +784,27 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide importerType: processedNotebook.metadata.importerType }), // Spreadsheet-specific metadata 
for round-trip export
-                ...(processedNotebook.metadata?.originalFileContent && {
-                    originalFileContent: processedNotebook.metadata.originalFileContent
-                }),
-                ...(processedNotebook.metadata?.columnHeaders && {
-                    columnHeaders: processedNotebook.metadata.columnHeaders
-                }),
-                ...(processedNotebook.metadata?.sourceColumnIndex !== undefined && {
-                    sourceColumnIndex: processedNotebook.metadata.sourceColumnIndex
-                }),
-                ...(processedNotebook.metadata?.delimiter && {
-                    delimiter: processedNotebook.metadata.delimiter
-                }),
+                ...(processedNotebook.metadata.importerType === "spreadsheet" ||
+                    processedNotebook.metadata.importerType === "spreadsheet-csv" ||
+                    processedNotebook.metadata.importerType === "spreadsheet-tsv"
+                    ? (() => {
+                        const spreadsheetMetadata = processedNotebook.metadata as SpreadsheetNotebookMetadata;
+                        return {
+                            ...(spreadsheetMetadata.originalFileContent && {
+                                originalFileContent: spreadsheetMetadata.originalFileContent
+                            }),
+                            ...(spreadsheetMetadata.columnHeaders && {
+                                columnHeaders: spreadsheetMetadata.columnHeaders
+                            }),
+                            ...(spreadsheetMetadata.sourceColumnIndex !== undefined && {
+                                sourceColumnIndex: spreadsheetMetadata.sourceColumnIndex
+                            }),
+                            ...(spreadsheetMetadata.delimiter && {
+                                delimiter: spreadsheetMetadata.delimiter
+                            }),
+                        };
+                    })()
+                    : {}),
         };

         return {

From 79c37c0024c253b7c7ab2f829477a4aab6d64204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?=
Date: Sat, 31 Jan 2026 12:47:48 +0100
Subject: [PATCH 4/6] Importer name handling update

We can now import multiple files with the same name: a UUID is appended
to each source and codex file name, and display names disambiguate as
"sample", "sample (1)", and so on. Thanks to the content hashes we store
in the originals folder, a given file content is only stored once. If we
import a file with the same name but different content, it is saved to
the originals folder as "sample(1)" and the imported source/codex file's
original-file path points to that new file. If we import a file with a
different name but content identical to an existing original, the new
source/codex file is still created under the appropriate name, but its
original-file path points to the already existing file.
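
For illustration, a minimal sketch of the dedup decision described
above; the helper name and shapes are hypothetical, not the actual code
in originalFileUtils.ts:

```typescript
import { createHash } from "node:crypto";

// Decide which file name in attachments/originals an imported file
// should use. `existing` maps stored original names to SHA-256 content
// hashes (the hash choice is illustrative).
function resolveOriginalFileName(
    fileName: string,
    content: Uint8Array,
    existing: Map<string, string>
): string {
    const hash = createHash("sha256").update(content).digest("hex");

    // Identical content already stored (possibly under another name):
    // reuse that file instead of writing a duplicate.
    for (const [storedName, storedHash] of existing) {
        if (storedHash === hash) return storedName;
    }

    // Name taken by different content: probe "sample(1).csv", "sample(2).csv", ...
    const dot = fileName.lastIndexOf(".");
    const stem = dot === -1 ? fileName : fileName.slice(0, dot);
    const ext = dot === -1 ? "" : fileName.slice(dot);
    let candidate = fileName;
    for (let i = 1; existing.has(candidate); i++) {
        candidate = `${stem}(${i})${ext}`;
    }
    return candidate;
}
```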
--- src/exportHandler/exportHandler.ts | 16 +- .../NewSourceUploaderProvider.ts | 237 +++++++------ .../NewSourceUploader/codexFIleCreateUtils.ts | 154 ++++++++- .../NewSourceUploader/originalFileUtils.ts | 325 ++++++++++++++++++ src/utils/bookNameUtils.ts | 19 +- types/index.d.ts | 6 + .../src/CodexCellEditor/CellList.tsx | 11 +- .../importers/audio/AudioImporterForm.tsx | 5 +- .../importers/audio2/AudioImporter2Form.tsx | 5 +- .../importers/biblica/BiblicaImporterForm.tsx | 27 +- .../importers/common/usfmUtils.ts | 5 +- .../importers/docx/experiment/index.ts | 5 +- .../NewSourceUploader/importers/docx/index.ts | 5 +- .../importers/ebibleCorpus/index.ts | 5 +- .../indesign/InDesignImporterForm.tsx | 5 +- .../maculaBible/MaculaBibleImporterForm.tsx | 7 +- .../importers/markdown/index.ts | 5 +- .../NewSourceUploader/importers/obs/index.ts | 13 +- .../NewSourceUploader/importers/pdf/index.ts | 3 + .../importers/plaintext/index.ts | 5 +- .../RecursiveTextSplitterForm.tsx | 4 +- .../spreadsheet/SpreadsheetImporterForm.tsx | 4 +- .../importers/subtitles/index.ts | 4 +- .../NewSourceUploader/importers/tms/index.ts | 5 +- .../importers/usfm/experimental/index.ts | 5 +- webviews/codex-webviews/src/lib/types.ts | 8 + 26 files changed, 723 insertions(+), 170 deletions(-) create mode 100644 src/providers/NewSourceUploader/originalFileUtils.ts diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts index a93f3aff4..b5c415bba 100644 --- a/src/exportHandler/exportHandler.ts +++ b/src/exportHandler/exportHandler.ts @@ -366,14 +366,14 @@ async function exportCodexContentAsIdmlRoundtrip( importerType === 'biblica' || fileType === 'biblica' || importerType === 'biblica-experimental' || // Backward compatibility - fileType === 'biblica-experimental' || // Backward compatibility - fileName.toLowerCase().endsWith('-biblica.codex'); + fileType === 'biblica-experimental'; // Backward compatibility + // Note: We no longer check filename suffix since importer type is stored in metadata const exporterType = isBiblicaFile ? 'Biblica' : 'Standard'; console.log(`[IDML Export] Processing ${fileName} (corpusMarker: ${corpusMarker}) using ${exporterType} exporter`); // Lookup original attachment by originalFileName or originalName metadata on the notebook (fallback to {bookCode}.idml) - // Note: NewSourceUploaderProvider stores it as "originalName", but some importers use "originalFileName" + // Note: originalFileName now points to the actual deduplicated file in attachments/originals const originalFileName = (codexNotebook.metadata as any)?.originalFileName || (codexNotebook.metadata as any)?.originalName || `${bookCode}.idml`; @@ -464,20 +464,21 @@ async function exportCodexContentAsDocxRoundtrip( } // Lookup original attachment by originalFileName metadata + // Note: originalFileName now points to the actual deduplicated file in attachments/originals const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${bookCode}.docx`; - // Originals are stored under `.project/attachments/files/originals/` (preferred). - // Fallback to legacy `.project/attachments/originals/` if needed. + // Originals are stored under `.project/attachments/originals/` (preferred). + // Fallback to legacy `.project/attachments/files/originals/` if needed. 
const originalsDirPreferred = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", - "files", "originals" ); const originalsDirLegacy = vscode.Uri.joinPath( workspaceFolders[0].uri, ".project", "attachments", + "files", "originals" ); const preferredUri = vscode.Uri.joinPath(originalsDirPreferred, originalFileName); @@ -1485,7 +1486,8 @@ async function exportCodexContentAsTmsRoundtrip( const corpusMarker = (codexNotebook.metadata as any)?.corpusMarker; const fileFormat = (codexNotebook.metadata as any)?.fileFormat || corpusMarker; // Fallback to corpusMarker for old files const fileType = (codexNotebook.metadata as any)?.fileType; // Direct file type field (tmx or xliff) - const originalFileName = (codexNotebook.metadata as any)?.originalFileName; // Get original filename (stored as originalFileName in metadata) + // Get original filename - this now points to the actual deduplicated file in attachments/originals + const originalFileName = (codexNotebook.metadata as any)?.originalFileName; if (corpusMarker !== 'tms' && fileFormat !== 'tms-tmx' && fileFormat !== 'tms-xliff') { console.warn(`[TMS Export] Skipping ${fileName} - not imported with TMS importer (corpusMarker: ${corpusMarker}, fileFormat: ${fileFormat})`); diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index 899001be5..164b6147b 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -267,35 +267,35 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide const { requestId, pdfBase64, outputPath } = message as { requestId: string; pdfBase64: string; outputPath?: string; }; try { const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'pdf_to_docx.py'); - + // Verify script exists if (!fs.existsSync(scriptPath)) { throw new Error(`Python script not found at: ${scriptPath}`); } - + // Create temp directory const tempDir = path.join(this.context.extensionPath, '.temp'); if (!fs.existsSync(tempDir)) { fs.mkdirSync(tempDir, { recursive: true }); } - + // Write base64 PDF to temporary file to avoid command line length limits const tempPdfPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.pdf`); const pdfBuffer = Buffer.from(pdfBase64, 'base64'); fs.writeFileSync(tempPdfPath, pdfBuffer); - + // Use temp file if outputPath not provided const docxPath = outputPath || path.join(tempDir, `converted_${Date.now()}.docx`); - + // Verify PDF file was written if (!fs.existsSync(tempPdfPath)) { throw new Error(`Failed to write PDF file to: ${tempPdfPath}`); } - + // Run Python script with file paths // On Windows, use proper quoting; on Unix, paths should work as-is const pythonCmd = process.platform === 'win32' ? 
'python' : 'python3'; - + // Quote paths properly for Windows (use double quotes and escape inner quotes) const quotePath = (p: string) => { if (process.platform === 'win32') { @@ -306,12 +306,12 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide return `'${p.replace(/'/g, "\\'")}'`; } }; - + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempPdfPath)} ${quotePath(docxPath)}`; - + console.log(`[PDF→DOCX] Converting PDF to DOCX...`); console.log(`[PDF→DOCX] Command: ${command}`); - + let stdout = ''; let stderr = ''; try { @@ -323,7 +323,7 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide stdout = execErr.stdout || ''; stderr = execErr.stderr || ''; const errorMessage = execErr.message || 'Unknown error'; - + // If we have stdout that might be JSON, try to parse it if (stdout.trim()) { try { @@ -335,17 +335,17 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide // Not JSON, use the exec error } } - + // Include both stdout and stderr in error message const fullError = [ errorMessage, stdout ? `\nStdout: ${stdout}` : '', stderr ? `\nStderr: ${stderr}` : '' ].filter(Boolean).join(''); - + throw new Error(fullError); } - + // Clean up temp PDF file try { if (fs.existsSync(tempPdfPath)) { @@ -354,7 +354,7 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } catch (cleanupErr) { console.warn(`[PDF→DOCX] Could not delete temp PDF: ${cleanupErr}`); } - + // Log progress messages from stderr (Python script sends progress updates there) if (stderr) { try { @@ -380,7 +380,7 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } } } - + // Parse JSON result let result; try { @@ -388,40 +388,40 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } catch (parseErr) { throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. 
Stderr: ${stderr}`); } - + if (result.success) { console.log(`[PDF→DOCX] ✓ Successfully converted PDF to DOCX`); - + // Verify the DOCX file exists and has content if (!fs.existsSync(docxPath)) { throw new Error(`DOCX file not found at: ${docxPath}`); } - + const fileStats = fs.statSync(docxPath); if (fileStats.size === 0) { throw new Error(`DOCX file is empty at: ${docxPath}`); } - + console.log(`[PDF→DOCX] Reading DOCX file (${fileStats.size} bytes)...`); - + // For large files (>50MB), save directly to workspace and send file path instead of base64 // This avoids memory issues and webview message size limits const LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50MB const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; - + if (fileStats.size > LARGE_FILE_THRESHOLD && workspaceFolder) { console.log(`[PDF→DOCX] Large file detected (${fileStats.size} bytes), saving to workspace instead of sending via message...`); - + // Save DOCX to temporary location in workspace const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); await vscode.workspace.fs.createDirectory(tempDir); - + const tempDocxUri = vscode.Uri.joinPath(tempDir, `pdf_conversion_${requestId}.docx`); const docxBuffer = fs.readFileSync(docxPath); await vscode.workspace.fs.writeFile(tempDocxUri, new Uint8Array(docxBuffer)); - + console.log(`[PDF→DOCX] Saved large DOCX to workspace: ${tempDocxUri.fsPath}`); - + webviewPanel.webview.postMessage({ command: 'convertPdfToDocxResult', requestId, @@ -434,14 +434,14 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide // For smaller files, send base64 as before const docxBuffer = fs.readFileSync(docxPath); const docxBase64 = docxBuffer.toString('base64'); - + // Verify base64 encoding is valid if (!docxBase64 || docxBase64.length === 0) { throw new Error('Failed to encode DOCX file to base64'); } - + console.log(`[PDF→DOCX] Sending DOCX data to webview (${docxBase64.length} base64 chars)...`); - + webviewPanel.webview.postMessage({ command: 'convertPdfToDocxResult', requestId, @@ -468,35 +468,35 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide const { requestId, docxBase64, outputPath } = message as { requestId: string; docxBase64: string; outputPath?: string; }; try { const scriptPath = path.join(this.context.extensionPath, 'webviews', 'codex-webviews', 'src', 'NewSourceUploader', 'importers', 'pdf', 'scripts', 'docx_to_pdf.py'); - + // Verify script exists if (!fs.existsSync(scriptPath)) { throw new Error(`Python script not found at: ${scriptPath}`); } - + // Create temp directory const tempDir = path.join(this.context.extensionPath, '.temp'); if (!fs.existsSync(tempDir)) { fs.mkdirSync(tempDir, { recursive: true }); } - + // Write base64 DOCX to temporary file to avoid command line length limits const tempDocxPath = path.join(tempDir, `input_${Date.now()}_${Math.random().toString(36).slice(2)}.docx`); const docxBuffer = Buffer.from(docxBase64, 'base64'); fs.writeFileSync(tempDocxPath, docxBuffer); - + // Use temp file if outputPath not provided const pdfPath = outputPath || path.join(tempDir, `converted_${Date.now()}.pdf`); - + // Verify DOCX file was written if (!fs.existsSync(tempDocxPath)) { throw new Error(`Failed to write DOCX file to: ${tempDocxPath}`); } - + // Run Python script with file paths // On Windows, use proper quoting; on Unix, paths should work as-is const pythonCmd = process.platform === 'win32' ? 
'python' : 'python3'; - + // Quote paths properly for Windows (use double quotes and escape inner quotes) const quotePath = (p: string) => { if (process.platform === 'win32') { @@ -507,12 +507,12 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide return `'${p.replace(/'/g, "\\'")}'`; } }; - + const command = `${pythonCmd} ${quotePath(scriptPath)} ${quotePath(tempDocxPath)} ${quotePath(pdfPath)}`; - + console.log(`[DOCX→PDF] Converting DOCX to PDF...`); console.log(`[DOCX→PDF] Command: ${command}`); - + let stdout = ''; let stderr = ''; try { @@ -524,7 +524,7 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide stdout = execErr.stdout || ''; stderr = execErr.stderr || ''; const errorMessage = execErr.message || 'Unknown error'; - + // If we have stdout that might be JSON, try to parse it if (stdout.trim()) { try { @@ -536,17 +536,17 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide // Not JSON, use the exec error } } - + // Include both stdout and stderr in error message const fullError = [ errorMessage, stdout ? `\nStdout: ${stdout}` : '', stderr ? `\nStderr: ${stderr}` : '' ].filter(Boolean).join(''); - + throw new Error(fullError); } - + // Clean up temp DOCX file try { if (fs.existsSync(tempDocxPath)) { @@ -555,11 +555,11 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } catch (cleanupErr) { console.warn(`[DOCX→PDF] Could not delete temp DOCX: ${cleanupErr}`); } - + if (stderr && !stdout.includes('"success":true')) { console.warn(`[DOCX→PDF] Python stderr: ${stderr}`); } - + // Parse JSON result let result; try { @@ -567,7 +567,7 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide } catch (parseErr) { throw new Error(`Failed to parse Python script output as JSON. Stdout: ${stdout.substring(0, 500)}${stdout.length > 500 ? '...' : ''}. Stderr: ${stderr}`); } - + if (result.success) { console.log(`[DOCX→PDF] ✓ Successfully converted DOCX to PDF`); webviewPanel.webview.postMessage({ @@ -788,22 +788,22 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide processedNotebook.metadata.importerType === "spreadsheet-csv" || processedNotebook.metadata.importerType === "spreadsheet-tsv" ? 
(() => { - const spreadsheetMetadata = processedNotebook.metadata as SpreadsheetNotebookMetadata; - return { - ...(spreadsheetMetadata.originalFileContent && { - originalFileContent: spreadsheetMetadata.originalFileContent - }), - ...(spreadsheetMetadata.columnHeaders && { - columnHeaders: spreadsheetMetadata.columnHeaders - }), - ...(spreadsheetMetadata.sourceColumnIndex !== undefined && { - sourceColumnIndex: spreadsheetMetadata.sourceColumnIndex - }), - ...(spreadsheetMetadata.delimiter && { - delimiter: spreadsheetMetadata.delimiter - }), - }; - })() + const spreadsheetMetadata = processedNotebook.metadata as SpreadsheetNotebookMetadata; + return { + ...(spreadsheetMetadata.originalFileContent && { + originalFileContent: spreadsheetMetadata.originalFileContent + }), + ...(spreadsheetMetadata.columnHeaders && { + columnHeaders: spreadsheetMetadata.columnHeaders + }), + ...(spreadsheetMetadata.sourceColumnIndex !== undefined && { + sourceColumnIndex: spreadsheetMetadata.sourceColumnIndex + }), + ...(spreadsheetMetadata.delimiter && { + delimiter: spreadsheetMetadata.delimiter + }), + }; + })() : {}), }; @@ -848,85 +848,114 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide token: vscode.CancellationToken, webviewPanel: vscode.WebviewPanel ): Promise { - // Save original files if provided in metadata + // Import the original file utilities + const { saveOriginalFileWithDeduplication } = await import('./originalFileUtils'); + + // Save original files if provided in metadata (with hash-based deduplication) const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; if (workspaceFolder) { for (const pair of message.notebookPairs) { if ("originalFileData" in pair.source.metadata && pair.source.metadata.originalFileData) { - // Save the original file in attachments - const originalFileName = pair.source.metadata.originalFileName || 'document.docx'; - // Store originals under attachments/files/originals for consistency with other attachment storage. - // (Some existing projects may have originals under attachments/originals; exporter will fallback.) - const originalsDir = vscode.Uri.joinPath( - workspaceFolder.uri, - '.project', - 'attachments', - 'files', - 'originals' - ); - await vscode.workspace.fs.createDirectory(originalsDir); - - const originalFileUri = vscode.Uri.joinPath(originalsDir, originalFileName); + // Save the original file with deduplication + const requestedFileName = pair.source.metadata.originalFileName || 'document.docx'; const fileData = pair.source.metadata.originalFileData; - // Convert ArrayBuffer to Uint8Array if needed + // Convert to Uint8Array if needed const buffer = fileData instanceof ArrayBuffer ? new Uint8Array(fileData) : Buffer.from(fileData); - await vscode.workspace.fs.writeFile(originalFileUri, buffer); + // Use hash-based deduplication to save the file + // This handles: + // 1. Same name, same hash: Keep existing file + // 2. Different name, same hash: Return existing filename + // 3. Same name, different hash: Rename to sample(1).idml etc. 
+ const result = await saveOriginalFileWithDeduplication( + workspaceFolder, + requestedFileName, + buffer + ); + + console.log(`[NewSourceUploader] Original file: ${result.message}`); + + // Store the file hash in metadata for integrity verification and deduplication tracking + (pair.source.metadata as any).originalFileHash = result.hash; + if (pair.codex?.metadata) { + (pair.codex.metadata as any).originalFileHash = result.hash; + } + + // IMPORTANT: Preserve user's original filename as fileDisplayName before updating originalFileName + // This ensures the display name reflects what the user imported, while originalFileName + // points to the actual deduplicated file in attachments/originals + if (result.fileName !== requestedFileName) { + // Set fileDisplayName to user's original name (without extension) if not already set + if (!pair.source.metadata.fileDisplayName) { + const displayName = requestedFileName.replace(/\.[^/.]+$/, ''); // Remove extension + (pair.source.metadata as any).fileDisplayName = displayName; + console.log(`[NewSourceUploader] Set fileDisplayName: "${displayName}" (from original "${requestedFileName}")`); + } + if (pair.codex?.metadata && !pair.codex.metadata.fileDisplayName) { + const displayName = requestedFileName.replace(/\.[^/.]+$/, ''); + (pair.codex.metadata as any).fileDisplayName = displayName; + } + + // Update originalFileName to point to the actual stored file (deduplicated) + pair.source.metadata.originalFileName = result.fileName; + if (pair.codex?.metadata) { + pair.codex.metadata.originalFileName = result.fileName; + } + console.log(`[NewSourceUploader] Updated originalFileName to deduplicated file: "${result.fileName}"`); + } // CRITICAL: Do not persist original binary content into JSON notebooks. - // The original template is stored in `.project/attachments/originals/`. + // The original template is stored in `.project/attachments/originals/`. delete pair.source.metadata.originalFileData; } - // For PDF imports: Also save the converted DOCX file for round-trip export + // For PDF imports: Also save the converted DOCX file for round-trip export (with deduplication) const pdfMetadata = (pair.source.metadata as any)?.pdfDocumentMetadata; if (pdfMetadata?.convertedDocxFileName) { - const originalsDir = vscode.Uri.joinPath( - workspaceFolder.uri, - '.project', - 'attachments', - 'files', - 'originals' - ); - await vscode.workspace.fs.createDirectory(originalsDir); + let docxBuffer: Uint8Array | null = null; - const convertedDocxUri = vscode.Uri.joinPath(originalsDir, pdfMetadata.convertedDocxFileName); - - // If convertedDocxData is present (small files), save it directly - // If isLargeFile flag is set, the file should already be saved in temp location + // If convertedDocxData is present (small files), use it directly if (pdfMetadata.convertedDocxData) { const docxData = pdfMetadata.convertedDocxData; - // Convert ArrayBuffer to Uint8Array if needed - const docxBuffer = docxData instanceof ArrayBuffer + docxBuffer = docxData instanceof ArrayBuffer ? 
new Uint8Array(docxData) : Buffer.from(docxData); - await vscode.workspace.fs.writeFile(convertedDocxUri, docxBuffer); - console.log(`[PDF Importer] Saved converted DOCX file: ${pdfMetadata.convertedDocxFileName}`); // Remove from metadata to avoid persisting in JSON delete pdfMetadata.convertedDocxData; } else if (pdfMetadata.isLargeFile) { - // For large files, check if temp file exists and copy it + // For large files, check if temp file exists and read it const tempDir = vscode.Uri.joinPath(workspaceFolder.uri, '.project', 'temp'); - const tempDocxUri = vscode.Uri.joinPath(tempDir, `pdf_conversion_*.docx`); - // Note: We'd need the actual requestId to find the temp file - // For now, try to find any matching temp file try { const tempFiles = await vscode.workspace.fs.readDirectory(tempDir); const matchingFile = tempFiles.find(([name]) => name.startsWith('pdf_conversion_') && name.endsWith('.docx')); if (matchingFile) { const tempFileUri = vscode.Uri.joinPath(tempDir, matchingFile[0]); - const tempData = await vscode.workspace.fs.readFile(tempFileUri); - await vscode.workspace.fs.writeFile(convertedDocxUri, tempData); + docxBuffer = await vscode.workspace.fs.readFile(tempFileUri); await vscode.workspace.fs.delete(tempFileUri); // Clean up temp file - console.log(`[PDF Importer] Saved large converted DOCX file: ${pdfMetadata.convertedDocxFileName}`); } } catch (err) { console.warn(`[PDF Importer] Could not find/copy temp DOCX file: ${err}`); } } + + // Save with deduplication if we have data + if (docxBuffer) { + const docxResult = await saveOriginalFileWithDeduplication( + workspaceFolder, + pdfMetadata.convertedDocxFileName, + docxBuffer + ); + console.log(`[PDF Importer] Converted DOCX: ${docxResult.message}`); + + // Update convertedDocxFileName to point to the actual stored file (deduplicated) + if (docxResult.fileName !== pdfMetadata.convertedDocxFileName) { + console.log(`[PDF Importer] Updated convertedDocxFileName: "${pdfMetadata.convertedDocxFileName}" -> "${docxResult.fileName}"`); + pdfMetadata.convertedDocxFileName = docxResult.fileName; + } + } } } } diff --git a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts index b10b9661b..7788f1c16 100644 --- a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts +++ b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts @@ -7,6 +7,21 @@ import { CodexContentSerializer } from "../../serializer"; import { CustomNotebookMetadata } from "../../../types"; import { formatJsonForNotebookFile } from "../../utils/notebookFileFormattingUtils"; +/** + * Adds a unique identifier to a filename, preserving the extension. + * Example: "document.idml" -> "document-(abc123).idml" + */ +function addIdToFilename(filename: string, id: string): string { + const lastDotIndex = filename.lastIndexOf('.'); + if (lastDotIndex === -1) { + // No extension + return `${filename}-(${id})`; + } + const baseName = filename.substring(0, lastDotIndex); + const extension = filename.substring(lastDotIndex); + return `${baseName}-(${id})${extension}`; +} + export function checkCancellation(token?: vscode.CancellationToken): void { if (token?.isCancellationRequested) { throw new vscode.CancellationError(); @@ -91,6 +106,75 @@ async function collectExistingCorpusMarkers(workspaceFolder: vscode.WorkspaceFol return existingMarkers; } +/** + * Collects existing fileDisplayName values from source notebooks in the workspace. 
+ * Returns an array of display names (including any with number suffixes like "Sample (1)"). + */ +async function collectExistingDisplayNames(workspaceFolder: vscode.WorkspaceFolder): Promise { + const existingDisplayNames: string[] = []; + + try { + const sourceFiles = await vscode.workspace.findFiles( + ".project/sourceTexts/*.source", + "**/node_modules/**" + ); + + const serializer = new CodexContentSerializer(); + + for (const file of sourceFiles) { + try { + const content = await vscode.workspace.fs.readFile(file); + const notebookData = await serializer.deserializeNotebook( + content, + new vscode.CancellationTokenSource().token + ); + + const metadata = notebookData.metadata as CustomNotebookMetadata | undefined; + if (metadata?.fileDisplayName) { + existingDisplayNames.push(metadata.fileDisplayName); + } + } catch (error) { + // Skip files that can't be read + console.warn(`[DISPLAY NAME] Could not read file ${file.fsPath}:`, error); + } + } + } catch (error) { + console.warn(`[DISPLAY NAME] Error collecting existing display names:`, error); + } + + return existingDisplayNames; +} + +/** + * Generates a unique display name by adding a number suffix if needed. + * Example: If "ACT-REV" exists, returns "ACT-REV (1)". If "ACT-REV (1)" also exists, returns "ACT-REV (2)". + */ +function getUniqueDisplayName(baseName: string, existingNames: string[]): string { + // Check if the base name already exists + if (!existingNames.includes(baseName)) { + return baseName; + } + + // Find the highest existing number suffix for this base name + // Pattern matches: "baseName (N)" where N is a number + const escapedBaseName = baseName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const suffixPattern = new RegExp(`^${escapedBaseName} \\((\\d+)\\)$`); + + let maxNumber = 0; + for (const name of existingNames) { + const match = name.match(suffixPattern); + if (match) { + const num = parseInt(match[1], 10); + if (num > maxNumber) { + maxNumber = num; + } + } + } + + // Return the base name with the next number + return `${baseName} (${maxNumber + 1})`; +} + export async function createNoteBookPair({ token, sourceNotebooks, @@ -114,6 +198,9 @@ export async function createNoteBookPair({ // Collect existing corpusMarkers from the workspace const existingMarkers = await collectExistingCorpusMarkers(workspaceFolder); + // Collect existing display names for non-biblical imports to avoid duplicates + const existingDisplayNames = await collectExistingDisplayNames(workspaceFolder); + for (let i = 0; i < sourceNotebooks.length; i++) { checkCancellation(token); @@ -130,6 +217,68 @@ export async function createNoteBookPair({ console.log(`[CODEX FILE CREATE] Importer type: "${importerType}", Biblical: ${isBiblical}`); + // For non-biblical imports, use the metadata id (UUID) to create unique filenames + // This allows users to import changed source files multiple times and merge translations later + let notebookName = sourceNotebook.name; + let uniqueId: string | undefined; + + if (!isBiblical) { + // Use the metadata id (UUID) that was generated during import + uniqueId = sourceNotebook.metadata?.id; + + if (!uniqueId) { + // Fallback: generate a short unique id if metadata.id is missing + uniqueId = Math.random().toString(36).substring(2, 10); + console.warn(`[CODEX FILE CREATE] No metadata.id found, generated fallback id: "${uniqueId}"`); + } + + notebookName = `${sourceNotebook.name}-(${uniqueId})`; + + console.log(`[CODEX FILE CREATE] Non-biblical import: adding id "${uniqueId}" to filename`); + + // Update 
originalFileName in metadata to include id for attachment tracking + // This ensures the original file saved in attachments/originals matches the notebook + if (sourceNotebook.metadata?.originalFileName) { + const idOriginalFileName = addIdToFilename( + sourceNotebook.metadata.originalFileName, + uniqueId + ); + sourceNotebook.metadata.originalFileName = idOriginalFileName; + // Also update sourceFile if it exists + if (sourceNotebook.metadata.sourceFile) { + sourceNotebook.metadata.sourceFile = idOriginalFileName; + } + console.log(`[CODEX FILE CREATE] Updated originalFileName to: "${idOriginalFileName}"`); + } + + // Update codex metadata to match + if (codexNotebook.metadata?.originalFileName) { + codexNotebook.metadata.originalFileName = addIdToFilename( + codexNotebook.metadata.originalFileName, + uniqueId + ); + } + if (codexNotebook.metadata?.sourceFile) { + codexNotebook.metadata.sourceFile = sourceNotebook.metadata.originalFileName; + } + + // Generate unique display name for non-biblical imports + // If a file with the same display name already exists, add a number suffix + const baseDisplayName = sourceNotebook.metadata?.fileDisplayName || sourceNotebook.name; + const uniqueDisplayName = getUniqueDisplayName(baseDisplayName, existingDisplayNames); + + if (uniqueDisplayName !== baseDisplayName) { + console.log(`[CODEX FILE CREATE] Display name "${baseDisplayName}" already exists, using "${uniqueDisplayName}"`); + } + + // Update display name in metadata + sourceNotebook.metadata.fileDisplayName = uniqueDisplayName; + codexNotebook.metadata.fileDisplayName = uniqueDisplayName; + + // Add this display name to existing names for subsequent files in the same batch + existingDisplayNames.push(uniqueDisplayName); + } + // Use corpusMarker as-is from the importer (no normalization) // This matches how other importers like Docx and Biblica work const incomingCorpusMarker = sourceNotebook.metadata?.corpusMarker; @@ -150,8 +299,9 @@ export async function createNoteBookPair({ } // Create standardized filenames - only use USFM codes for biblical content - const sourceFilename = await createStandardizedFilename(sourceNotebook.name, ".source", isBiblical); - const codexFilename = await createStandardizedFilename(codexNotebook.name, ".codex", isBiblical); + // For non-biblical content, notebookName already includes the unique id + const sourceFilename = await createStandardizedFilename(notebookName, ".source", isBiblical); + const codexFilename = await createStandardizedFilename(notebookName, ".codex", isBiblical); // Create final URIs with standardized filenames const sourceUri = vscode.Uri.joinPath( diff --git a/src/providers/NewSourceUploader/originalFileUtils.ts b/src/providers/NewSourceUploader/originalFileUtils.ts new file mode 100644 index 000000000..d4c358616 --- /dev/null +++ b/src/providers/NewSourceUploader/originalFileUtils.ts @@ -0,0 +1,325 @@ +/** + * Original File Utilities + * + * Handles hash-based deduplication of original files stored in .project/attachments/originals/ + * + * Storage Structure: + * - .project/attachments/originals/ + * - file-hashes.json (registry of all imported files with their hashes) + * - sample.idml (actual original file) + * - sample(1).idml (renamed file if same name but different content) + * - other-document.docx (another original file) + * + * Features: + * - Computes SHA-256 hash of file content + * - Maintains a registry (file-hashes.json) of original files with their hashes + * - Saves actual original files to the originals folder + * - 
Prevents duplicate storage of identical files (same content = reuse existing file) + * - Handles filename conflicts by renaming (e.g., sample(1).idml, sample(2).idml) + */ + +import * as vscode from 'vscode'; +import * as crypto from 'crypto'; + +/** + * Registry entry for an original file + */ +export interface OriginalFileEntry { + /** SHA-256 hash of the file content */ + hash: string; + /** The filename stored in attachments/originals/ */ + fileName: string; + /** Original filename(s) that mapped to this file (for reference) */ + originalNames: string[]; + /** Timestamp when first added */ + addedAt: string; +} + +/** + * Registry structure for original files + */ +export interface OriginalFilesRegistry { + /** Version for future migrations */ + version: number; + /** Map of hash -> file entry */ + files: { [hash: string]: OriginalFileEntry; }; + /** Map of filename -> hash (for quick filename lookup) */ + fileNameToHash: { [fileName: string]: string; }; +} + +/** + * Result of checking/adding an original file + */ +export interface OriginalFileResult { + /** The filename to use in metadata (may be different from requested) */ + fileName: string; + /** Whether a new file was saved (false if deduplicated) */ + savedNewFile: boolean; + /** The hash of the file */ + hash: string; + /** Message describing what happened */ + message: string; +} + +const REGISTRY_FILENAME = 'file-hashes.json'; + +/** + * Compute SHA-256 hash of file data + */ +export function computeFileHash(data: Uint8Array | ArrayBuffer | Buffer): string { + const buffer = data instanceof ArrayBuffer + ? Buffer.from(data) + : data instanceof Uint8Array + ? Buffer.from(data) + : data; + return crypto.createHash('sha256').update(buffer).digest('hex'); +} + +/** + * Get the path to the originals directory + */ +function getOriginalsDir(workspaceFolder: vscode.WorkspaceFolder): vscode.Uri { + return vscode.Uri.joinPath( + workspaceFolder.uri, + '.project', + 'attachments', + 'originals' + ); +} + +/** + * Get the path to the registry file + */ +function getRegistryPath(workspaceFolder: vscode.WorkspaceFolder): vscode.Uri { + return vscode.Uri.joinPath(getOriginalsDir(workspaceFolder), REGISTRY_FILENAME); +} + +/** + * Load the original files registry, creating an empty one if it doesn't exist + */ +export async function loadOriginalFilesRegistry( + workspaceFolder: vscode.WorkspaceFolder +): Promise { + const registryPath = getRegistryPath(workspaceFolder); + + try { + const data = await vscode.workspace.fs.readFile(registryPath); + const registry = JSON.parse(new TextDecoder().decode(data)) as OriginalFilesRegistry; + + // Ensure all required fields exist (migration safety) + if (!registry.files) registry.files = {}; + if (!registry.fileNameToHash) registry.fileNameToHash = {}; + if (!registry.version) registry.version = 1; + + return registry; + } catch { + // Registry doesn't exist, create empty one + return { + version: 1, + files: {}, + fileNameToHash: {}, + }; + } +} + +/** + * Save the original files registry + */ +export async function saveOriginalFilesRegistry( + workspaceFolder: vscode.WorkspaceFolder, + registry: OriginalFilesRegistry +): Promise { + const originalsDir = getOriginalsDir(workspaceFolder); + await vscode.workspace.fs.createDirectory(originalsDir); + + const registryPath = getRegistryPath(workspaceFolder); + const data = new TextEncoder().encode(JSON.stringify(registry, null, 2)); + await vscode.workspace.fs.writeFile(registryPath, data); +} + +/** + * Generate a unique filename by adding (1), (2), 
etc. suffix + */ +function generateUniqueFileName( + baseName: string, + existingFileNames: Set +): string { + if (!existingFileNames.has(baseName)) { + return baseName; + } + + // Split filename into name and extension + const lastDotIndex = baseName.lastIndexOf('.'); + const nameWithoutExt = lastDotIndex > 0 ? baseName.slice(0, lastDotIndex) : baseName; + const extension = lastDotIndex > 0 ? baseName.slice(lastDotIndex) : ''; + + // Try incrementing numbers until we find a unique name + let counter = 1; + let newName: string; + do { + newName = `${nameWithoutExt}(${counter})${extension}`; + counter++; + } while (existingFileNames.has(newName)); + + return newName; +} + +/** + * Save an original file with hash-based deduplication + * + * Handles three scenarios: + * 1. Same name, same hash: Keep existing file, return existing filename + * 2. Different name, same hash: Keep existing file, return existing filename + * 3. Same name, different hash: Save with new name (e.g., sample(1).idml) + * + * @param workspaceFolder The workspace folder + * @param requestedFileName The desired filename for the original file + * @param fileData The file content + * @returns Result with the actual filename to use in metadata + */ +export async function saveOriginalFileWithDeduplication( + workspaceFolder: vscode.WorkspaceFolder, + requestedFileName: string, + fileData: Uint8Array | ArrayBuffer | Buffer +): Promise { + // Compute hash of the file + const hash = computeFileHash(fileData); + + // Load existing registry + const registry = await loadOriginalFilesRegistry(workspaceFolder); + + // Check if we already have a file with this hash + const existingEntry = registry.files[hash]; + + if (existingEntry) { + // We already have a file with the same content + console.log(`[OriginalFiles] File with hash ${hash.slice(0, 8)}... already exists as "${existingEntry.fileName}"`); + + // Track this original name if it's new + if (!existingEntry.originalNames.includes(requestedFileName)) { + existingEntry.originalNames.push(requestedFileName); + await saveOriginalFilesRegistry(workspaceFolder, registry); + } + + return { + fileName: existingEntry.fileName, + savedNewFile: false, + hash, + message: `Deduplicated: using existing file "${existingEntry.fileName}" (same content as "${requestedFileName}")`, + }; + } + + // No existing file with this hash - need to save + const originalsDir = getOriginalsDir(workspaceFolder); + await vscode.workspace.fs.createDirectory(originalsDir); + + // Check if the filename is already taken (by a different file with different hash) + const existingFileNames = new Set(Object.keys(registry.fileNameToHash)); + let actualFileName = requestedFileName; + + if (existingFileNames.has(requestedFileName)) { + // Filename conflict - need to generate a unique name + actualFileName = generateUniqueFileName(requestedFileName, existingFileNames); + console.log(`[OriginalFiles] Filename "${requestedFileName}" exists with different content, saving as "${actualFileName}"`); + } + + // Save the file + const fileUri = vscode.Uri.joinPath(originalsDir, actualFileName); + const buffer = fileData instanceof ArrayBuffer + ? new Uint8Array(fileData) + : fileData instanceof Buffer + ? 
new Uint8Array(fileData) + : fileData; + await vscode.workspace.fs.writeFile(fileUri, buffer); + + // Update registry + registry.files[hash] = { + hash, + fileName: actualFileName, + originalNames: [requestedFileName], + addedAt: new Date().toISOString(), + }; + registry.fileNameToHash[actualFileName] = hash; + + await saveOriginalFilesRegistry(workspaceFolder, registry); + + const message = actualFileName !== requestedFileName + ? `Saved as "${actualFileName}" (renamed from "${requestedFileName}" due to filename conflict)` + : `Saved new file "${actualFileName}"`; + + console.log(`[OriginalFiles] ${message}`); + + return { + fileName: actualFileName, + savedNewFile: true, + hash, + message, + }; +} + +/** + * Check if an original file exists by hash + */ +export async function findOriginalFileByHash( + workspaceFolder: vscode.WorkspaceFolder, + hash: string +): Promise { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + return registry.files[hash] || null; +} + +/** + * Check if an original file exists by filename + */ +export async function findOriginalFileByName( + workspaceFolder: vscode.WorkspaceFolder, + fileName: string +): Promise { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + const hash = registry.fileNameToHash[fileName]; + if (hash) { + return registry.files[hash] || null; + } + return null; +} + +/** + * Get all original files in the registry + */ +export async function getAllOriginalFiles( + workspaceFolder: vscode.WorkspaceFolder +): Promise { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + return Object.values(registry.files); +} + +/** + * Clean up orphaned registry entries (files that no longer exist on disk) + */ +export async function cleanupOrphanedEntries( + workspaceFolder: vscode.WorkspaceFolder +): Promise { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + const originalsDir = getOriginalsDir(workspaceFolder); + + let removedCount = 0; + + for (const [hash, entry] of Object.entries(registry.files)) { + const fileUri = vscode.Uri.joinPath(originalsDir, entry.fileName); + try { + await vscode.workspace.fs.stat(fileUri); + } catch { + // File doesn't exist, remove from registry + delete registry.files[hash]; + delete registry.fileNameToHash[entry.fileName]; + removedCount++; + console.log(`[OriginalFiles] Removed orphaned registry entry: ${entry.fileName}`); + } + } + + if (removedCount > 0) { + await saveOriginalFilesRegistry(workspaceFolder, registry); + } + + return removedCount; +} diff --git a/src/utils/bookNameUtils.ts b/src/utils/bookNameUtils.ts index a98e2036f..8d93284b5 100644 --- a/src/utils/bookNameUtils.ts +++ b/src/utils/bookNameUtils.ts @@ -117,23 +117,32 @@ export async function getBookDisplayName(usfmCode: string): Promise { export function isBiblicalImporterType(importerType: string | undefined): boolean { if (!importerType) return false; const normalizedType = importerType.toLowerCase().trim(); + + // Exact matches for biblical importers const bibleTypeImporters = [ 'usfm', + 'usfm-experimental', 'paratext', 'ebiblecorpus', 'ebible', 'ebible-download', 'maculabible', 'macula', - 'biblica', 'obs', - // Note: 'pdf', 'docx', and 'indesign' are NOT included here + // Note: 'pdf', 'docx', 'indesign', and 'biblica' are NOT included here // because they are generic document formats that should preserve // their original filenames rather than being converted to Bible book codes. 
- // If specific biblical content detection is needed, it should be done - // at the importer level with explicit flags. + // The importer type is stored in metadata, so filename suffixes are not needed. ]; - return bibleTypeImporters.includes(normalizedType); + + // Check exact match first + if (bibleTypeImporters.includes(normalizedType)) { + return true; + } + + // Also check prefixes for variations (e.g., 'usfm-*' matches any USFM variant) + const biblicalPrefixes = ['usfm', 'paratext', 'ebible', 'macula']; + return biblicalPrefixes.some(prefix => normalizedType.startsWith(prefix)); } /** diff --git a/types/index.d.ts b/types/index.d.ts index c1cb0888f..945432267 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -979,6 +979,12 @@ export interface CustomNotebookMetadata { * Stored at notebook-level (not per-cell). For most importers this matches originalFileName. */ sourceFile?: string; + /** + * Timestamp added to non-biblical imports to ensure unique filenames. + * Format: "YYYYMMDD_HHmmss" (e.g., "20260127_143025") + * This allows importing changed source files multiple times without overwriting. + */ + importTimestamp?: string; /** * One-time import context derived from the import process. * This is the canonical home for attributes that do not vary per-cell. diff --git a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx index 2348d5629..1d8240815 100644 --- a/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx +++ b/webviews/codex-webviews/src/CodexCellEditor/CellList.tsx @@ -517,11 +517,14 @@ const CellList: React.FC = ({ // Now uses globalReferences and includes offset for pagination const getChapterBasedVerseNumber = useCallback( (cell: QuillCellContent, allCells: QuillCellContent[]): number => { - const cellIdentifier = getCellIdentifier(cell); - if (!cellIdentifier) return 1; // Fallback if no identifier + // Use cellMarkers[0] (UUID) for finding the cell's position, not getCellIdentifier + // getCellIdentifier may return non-unique values (e.g., Biblica imports where multiple + // cells share the same first globalReference due to verse array accumulation) + const cellUuid = cell.cellMarkers?.[0]; + if (!cellUuid) return 1; // Fallback if no UUID const cellIndex = allCells.findIndex( - (unit) => getCellIdentifier(unit) === cellIdentifier + (unit) => unit.cellMarkers?.[0] === cellUuid ); if (cellIndex === -1) return 1; // Fallback if not found @@ -544,7 +547,7 @@ const CellList: React.FC = ({ const offset = calculateLineNumberOffset(); return visibleCellCount + offset; }, - [getCellIdentifier, isChildCell, calculateLineNumberOffset] + [isChildCell, calculateLineNumberOffset] ); const generateCellLabel = useCallback( diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/audio/AudioImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/audio/AudioImporterForm.tsx index af7039492..18d4b07a1 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/audio/AudioImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/audio/AudioImporterForm.tsx @@ -1,4 +1,5 @@ import React, { useState, useCallback, useEffect, useRef } from "react"; +import { v4 as uuidv4 } from 'uuid'; import { ImporterComponentProps, SelectAudioFileMessage, ReprocessAudioFileMessage, FinalizeAudioImportMessage, AudioFileSelectedMessage, AudioFilesSelectedMessage, AudioImportProgressMessage, AudioImportCompleteMessage, UpdateAudioSegmentsMessage, AudioSegmentsUpdatedMessage 
} from "../../types/plugin"; import { Button } from "../../../components/ui/button"; import { Card, CardContent, CardHeader, CardTitle } from "../../../components/ui/card"; @@ -658,7 +659,7 @@ export const AudioImporterForm: React.FC = ({ name: docId, cells: sourceCells, metadata: { - id: docId, + id: uuidv4(), originalFileName: file.fileName, sourceFile: file.fileName, importerType: "audio", @@ -678,7 +679,7 @@ export const AudioImporterForm: React.FC = ({ name: docId, cells: codexCells, metadata: { - id: docId, + id: uuidv4(), originalFileName: file.fileName, sourceFile: file.fileName, importerType: "audio", diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/AudioImporter2Form.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/AudioImporter2Form.tsx index 716d41e53..1ebbe0c8b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/AudioImporter2Form.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/AudioImporter2Form.tsx @@ -1,4 +1,5 @@ import React, { useState, useCallback, useEffect, useRef } from "react"; +import { v4 as uuidv4 } from 'uuid'; import { ImporterComponentProps, SelectAudioFileMessage, ReprocessAudioFileMessage, FinalizeAudioImportMessage, AudioFileSelectedMessage, AudioFilesSelectedMessage, AudioImportProgressMessage, AudioImportCompleteMessage, UpdateAudioSegmentsMessage, AudioSegmentsUpdatedMessage } from "../../types/plugin"; import { Button } from "../../../components/ui/button"; import { Card, CardContent, CardHeader, CardTitle } from "../../../components/ui/card"; @@ -658,7 +659,7 @@ export const AudioImporterForm: React.FC = ({ name: docId, cells: sourceCells, metadata: { - id: docId, + id: uuidv4(), originalFileName: file.fileName, sourceFile: file.fileName, importerType: "audio", @@ -678,7 +679,7 @@ export const AudioImporterForm: React.FC = ({ name: docId, cells: codexCells, metadata: { - id: docId, + id: uuidv4(), originalFileName: file.fileName, sourceFile: file.fileName, importerType: "audio", diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx index 159c4b8d9..f73e9dcdc 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/BiblicaImporterForm.tsx @@ -25,6 +25,7 @@ import { ArrowLeft, BookOpen } from 'lucide-react'; +import { v4 as uuidv4 } from 'uuid'; import { IDMLParser } from './biblicaParser'; import { HTMLMapper } from './htmlMapper'; import { createProcessedCell, sanitizeFileName, createStandardCellId, addMilestoneCellsToNotebookPair } from '../../utils/workflowHelpers'; @@ -724,16 +725,14 @@ export const BiblicaImporterForm: React.FC = ({ addDebugLog(`Simplified note cells count: ${simplifiedNoteCells.length}`); - const baseName = sanitizeFileName(studyBibleFile.name.replace(/\.idml$/i, '')); - const notesNotebookName = sanitizeFileName(`${baseName}-notes`); - // Add -biblica suffix to originalFileName to match naming convention (e.g., "mat-john.idml" -> "mat-john-biblica.idml") - // This ensures the saved file in attachments matches what the exporter will look for - const originalFileName = studyBibleFile.name.replace( - /\.idml$/i, - "-biblica.idml" - ); - addDebugLog(`Base name: "${baseName}"`); - addDebugLog(`Notes notebook name: "${notesNotebookName}"`); + // Remove .idml extension and any "-notes" or "_notes" 
suffix from filename + const rawBaseName = studyBibleFile.name.replace(/\.idml$/i, ''); + const cleanBaseName = rawBaseName.replace(/[-_]?notes$/i, ''); + const baseName = sanitizeFileName(cleanBaseName); + // Use the original file name as-is - importer type is stored in metadata + const originalFileName = studyBibleFile.name; + addDebugLog(`Raw base name: "${rawBaseName}"`); + addDebugLog(`Clean base name (notes removed): "${baseName}"`); addDebugLog(`Original file name: "${originalFileName}"`); // Create notebook pair for notes only @@ -744,10 +743,10 @@ export const BiblicaImporterForm: React.FC = ({ if (simplifiedNoteCells.length > 0) { notebookPairs.push({ source: { - name: notesNotebookName, + name: baseName, cells: simplifiedNoteCells, metadata: { - id: `biblica-notes-source-${Date.now()}`, + id: uuidv4(), originalFileName: originalFileName, sourceFile: originalFileName, originalFileData: arrayBuffer, @@ -771,7 +770,7 @@ export const BiblicaImporterForm: React.FC = ({ } }, codex: { - name: notesNotebookName, + name: baseName, cells: simplifiedNoteCells.map(cell => ({ id: cell.id, content: '', // Empty codex for notes @@ -782,7 +781,7 @@ export const BiblicaImporterForm: React.FC = ({ } })), metadata: { - id: `biblica-notes-codex-${Date.now()}`, + id: uuidv4(), originalFileName: originalFileName, sourceFile: originalFileName, importerType: 'biblica', diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts index 64bf6deb4..d93d47330 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/common/usfmUtils.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ProcessedCell, ProcessedNotebook, @@ -544,7 +545,7 @@ export const createNotebookPair = = ({ name: baseName, cells: simplifiedCells, metadata: { - id: `indesign-source-${Date.now()}`, + id: uuidv4(), originalFileName: selectedFile.name, sourceFile: selectedFile.name, // Pass the original file bytes so the provider can persist it under .project/attachments/originals @@ -253,7 +254,7 @@ export const InDesignImporterForm: React.FC = ({ } })), metadata: { - id: `indesign-codex-${Date.now()}`, + id: uuidv4(), originalFileName: selectedFile.name, sourceFile: selectedFile.name, importerType: 'indesign', diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx index ab34a1e44..919e17c8f 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/MaculaBibleImporterForm.tsx @@ -1,4 +1,5 @@ import React, { useState, useCallback, useEffect } from "react"; +import { v4 as uuidv4 } from 'uuid'; import { ImporterComponentProps, AlignedCell, @@ -245,7 +246,7 @@ export const MaculaBibleImporterForm: React.FC = (props) }; }), metadata: { - id: notebookName, + id: uuidv4(), originalFileName: `${fullBookName}.macula`, // Use full name instead of code sourceFile: `${fullBookName}.macula`, importerType: "macula", @@ -268,6 +269,10 @@ export const MaculaBibleImporterForm: React.FC = (props) ...cell, content: "", // Empty for codex })), + metadata: { + ...sourceNotebook.metadata, + id: uuidv4(), + }, }; const notebookPair = { diff --git 
a/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts index a1b49e7ae..1c44c0392 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/markdown/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -315,7 +316,7 @@ export const parseFile = async ( name: baseName, cells, metadata: { - id: `source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'markdown', @@ -359,7 +360,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `codex-${Date.now()}`, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts index 905a02d94..9ff5096f7 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -278,7 +279,7 @@ const downloadObsRepository = async ( name: storyName, cells: storyCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-source`, + id: uuidv4(), originalFileName: storyFile.name, sourceFile: storyFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -308,7 +309,7 @@ const downloadObsRepository = async ( name: storyName, cells: codexCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-codex`, + id: uuidv4(), originalFileName: storyFile.name, sourceFile: storyFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -535,7 +536,7 @@ const parseObsMarkdown = async ( name: baseName, cells, metadata: { - id: `obs-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, originalFileData: arrayBuffer, // Store original file for export - system will save to .project/attachments/originals/ @@ -567,7 +568,7 @@ const parseObsMarkdown = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `obs-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate the original file data in codex originalFileData: undefined, }, @@ -818,7 +819,7 @@ const parseObsZip = async ( name: storyName, cells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-source`, + id: uuidv4(), originalFileName: markdownFile.name, sourceFile: markdownFile.name, corpusMarker: 'obs', // Enable round-trip export @@ -842,7 +843,7 @@ const parseObsZip = async ( name: storyName, cells: codexCells, metadata: { - id: `obs-${obsStory.storyNumber.toString().padStart(2, '0')}-codex`, + id: uuidv4(), originalFileName: markdownFile.name, sourceFile: markdownFile.name, corpusMarker: 'obs', // Enable round-trip export diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts index 0b04b8a7e..677e69d63 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/pdf/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -582,6 +583,7 @@ export const parseFile = async ( // Override metadata to indicate PDF origin 
sourceNotebook.metadata = { ...sourceNotebook.metadata, + id: uuidv4(), corpusMarker: 'pdf', importerType: 'pdf', originalFileName: file.name, // Keep original PDF filename @@ -608,6 +610,7 @@ export const parseFile = async ( codexNotebook.metadata = { ...codexNotebook.metadata, + id: uuidv4(), corpusMarker: 'pdf', importerType: 'pdf', originalFileName: file.name, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts index 351b1d7cb..51d247b72 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/plaintext/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -150,7 +151,7 @@ export const parseFile = async (file: File, onProgress?: ProgressCallback, optio name: baseName, cells, metadata: { - id: `plaintext-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'plaintext', @@ -186,7 +187,7 @@ export const parseFile = async (file: File, onProgress?: ProgressCallback, optio cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `plaintext-codex-${Date.now()}`, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx index 7b7a08e6c..560b397e5 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/recursiveTextSplitter/RecursiveTextSplitterForm.tsx @@ -510,7 +510,7 @@ export const RecursiveTextSplitterForm: React.FC = ({ name: cleanFileName, cells: sourceCells, metadata: { - id: `source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: "smart-segmenter", @@ -528,7 +528,7 @@ export const RecursiveTextSplitterForm: React.FC = ({ name: cleanFileName, cells: codexCells, metadata: { - id: `codex-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: "smart-segmenter", diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx index f3d123cc7..0ed709abe 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx @@ -443,7 +443,7 @@ export const SpreadsheetImporterForm: React.FC = (props) name: parsedData.filename, cells: sourceCells, metadata: { - id: parsedData.filename, + id: uuidv4(), originalFileName: selectedFile!.name, sourceFile: selectedFile!.name, importerType: spreadsheetType, @@ -474,7 +474,7 @@ export const SpreadsheetImporterForm: React.FC = (props) content: "", // Empty target cells })), metadata: { - id: parsedData.filename, + id: uuidv4(), originalFileName: selectedFile!.name, sourceFile: selectedFile!.name, importerType: spreadsheetType, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts index f9a534488..259cbd385 100644 --- 
a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -221,7 +222,7 @@ const parseFile = async ( name: baseName, cells, metadata: { - id: baseNameAsId, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, importerType: 'subtitles', @@ -257,6 +258,7 @@ const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, + id: uuidv4(), }, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts index e9131ac31..dbfd60114 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.ts @@ -1,3 +1,4 @@ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -323,7 +324,7 @@ export const parseFile = async ( name: file.name.replace(/\.(tmx|xliff|xlf)$/, ''), cells: cells, metadata: { - id: `translation-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, originalFileData: arrayBuffer, // Store original file for round-trip export @@ -353,7 +354,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `translation-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate the original file data in codex originalFileData: undefined, }, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts index 29102d096..3b3c7f89d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/usfm/experimental/index.ts @@ -4,6 +4,7 @@ * Standalone implementation - doesn't rely on common/usfmUtils.ts */ +import { v4 as uuidv4 } from 'uuid'; import { ImporterPlugin, FileValidationResult, @@ -97,7 +98,7 @@ export const parseFile = async ( name: baseName, cells: parsedDocument.cells, metadata: { - id: `usfm-experimental-source-${Date.now()}`, + id: uuidv4(), originalFileName: file.name, sourceFile: file.name, // Store original file data as ArrayBuffer for saving to attachments/originals @@ -143,7 +144,7 @@ export const parseFile = async ( cells: codexCells, metadata: { ...sourceNotebook.metadata, - id: `usfm-experimental-codex-${Date.now()}`, + id: uuidv4(), // Don't duplicate original file data in codex metadata originalFileData: undefined, }, diff --git a/webviews/codex-webviews/src/lib/types.ts b/webviews/codex-webviews/src/lib/types.ts index 145319027..38c0b4ac9 100644 --- a/webviews/codex-webviews/src/lib/types.ts +++ b/webviews/codex-webviews/src/lib/types.ts @@ -37,6 +37,14 @@ export interface CustomNotebookMetadata { cellDisplayMode?: "inline" | "one-line-per-cell"; validationMigrationComplete?: boolean; fontSize?: number; + importerType?: string; + originalFileName?: string; + sourceFile?: string; + /** + * Timestamp added to non-biblical imports to ensure unique filenames. 
+     * Format: "YYYYMMDD_HHmmss" (e.g., "20260127_143025")
+     */
+    importTimestamp?: string;
 }

 export interface ProgressPercentages {

From 98324a7fdaecb522916ada486cf87cf8e2eff788 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Pacanovsk=C3=BD?=
Date: Wed, 11 Feb 2026 19:03:32 +0100
Subject: [PATCH 5/6] All-around Importer Update

Got rid of some unused, unfinished importers; made the webview clearer and
more readable with colored tags for round-trip importers, matching the green
tags in "Round-trip Exporter" in the export window. Moved the spreadsheet
importer into the specialized importers section and renamed it to Bible
Spreadsheet for its specific use case.
---
 src/exportHandler/exportHandler.ts            |  41 ++-
 src/projectManager/projectExportView.ts       |  49 ++--
 .../NewSourceUploaderProvider.ts              |  22 ++
 .../NewSourceUploader/codexFIleCreateUtils.ts |  44 +--
 .../NewSourceUploader/originalFileUtils.ts    | 130 ++++++++-
 .../navigationWebviewProvider.ts              | 266 +++++++++++++++++-
 src/utils/editMapUtils.ts                     |  10 +
 types/index.d.ts                              |   8 +
 .../src/NavigationView/index.tsx              |  51 +++-
 .../components/PluginSelection.tsx            |  13 +-
 .../importers/audio/index.tsx                 |   1 -
 .../importers/audio2/index.tsx                |   1 -
 .../SpreadsheetImporterForm.tsx               |  47 ++--
 .../cellMetadata.ts                           |   0
 .../index.tsx                                 |   1 -
 .../parser.ts                                 |   0
 .../spreadsheetExporter.ts                    |   0
 .../{spreadsheet => bibleSpredSheet}/types.ts |   0
 .../importers/biblica/index.tsx               |   1 -
 .../importers/docx/experiment/cellMetadata.ts |   2 +-
 .../importers/docx/experiment/index.tsx       |   1 -
 .../importers/ebibleCorpus/index.tsx          |   1 -
 .../importers/indesign/index.tsx              |   1 -
 .../importers/maculaBible/index.tsx           |   1 -
 .../NewSourceUploader/importers/obs/index.tsx |   1 -
 .../NewSourceUploader/importers/registry.tsx  | 109 ++++---
 .../importers/subtitles/index.tsx             |   1 -
 .../NewSourceUploader/importers/tms/index.tsx |   1 -
 28 files changed, 622 insertions(+), 181 deletions(-)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/SpreadsheetImporterForm.tsx (97%)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/cellMetadata.ts (100%)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/index.tsx (92%)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/parser.ts (100%)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/spreadsheetExporter.ts (100%)
 rename webviews/codex-webviews/src/NewSourceUploader/importers/{spreadsheet => bibleSpredSheet}/types.ts (100%)

diff --git a/src/exportHandler/exportHandler.ts b/src/exportHandler/exportHandler.ts
index b5c415bba..8b2578a21 100644
--- a/src/exportHandler/exportHandler.ts
+++ b/src/exportHandler/exportHandler.ts
@@ -463,9 +463,11 @@ async function exportCodexContentAsDocxRoundtrip(
                     continue;
                 }

-                // Lookup original attachment by originalFileName metadata
+                // Lookup original attachment by originalFileName or originalName metadata
                 // Note: originalFileName now points to the actual deduplicated file in attachments/originals
-                const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${bookCode}.docx`;
+                const originalFileName = (codexNotebook.metadata as any)?.originalFileName ||
+                    (codexNotebook.metadata as any)?.originalName ||
+                    `${bookCode}.docx`;

                 // Originals are stored under `.project/attachments/originals/` (preferred).
                 // Fallback to legacy `.project/attachments/files/originals/` if needed.
const originalsDirPreferred = vscode.Uri.joinPath( @@ -566,8 +568,10 @@ async function exportCodexContentAsPdfRoundtrip( continue; } - // Lookup original attachment by originalFileName metadata - const originalFileName = (codexNotebook.metadata as any)?.originalFileName || `${bookCode}.pdf`; + // Lookup original attachment by originalFileName or originalName metadata + const originalFileName = (codexNotebook.metadata as any)?.originalFileName || + (codexNotebook.metadata as any)?.originalName || + `${bookCode}.pdf`; // Check both preferred and legacy locations for converted DOCX const originalsDirPreferred = vscode.Uri.joinPath( @@ -886,7 +890,9 @@ async function convertDocxToPdfViaExtension(docxPath: string): Promise @@ -287,6 +294,27 @@ function getWebviewContent( ? `

Select Export Format

+                <!-- markup garbled in extraction — added: a "Round-trip Export" card with the
+                     heading "Round-trip Export", the description "Intelligently detects file type
+                     and exports back the original file you imported with applied translations",
+                     and green format tags: USFM, DOCX, OBS, TMS, Markdown, CSV/TSV, IDML,
+                     Biblica Study Notes -->
@@ -327,27 +355,6 @@ function getWebviewContent(
-                <!-- markup garbled in extraction — removed: the old "Rebuild Export" card with the
-                     description "Intelligently detects file type and exports back to original
-                     format (DOCX, IDML, Biblica, PDF, OBS, TMS, USFM, CSV/TSV)" and format tags:
-                     DOCX, IDML, Biblica, PDF, OBS, TMS, USFM, CSV/TSV -->
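A note on the card copy above: the "intelligent detection" is driven by the corpusMarker each round-trip importer stamps into notebook metadata elsewhere in this series. Below is a minimal TypeScript sketch of that dispatch, for orientation only — the full marker-to-format mapping is an assumption; only the 'pdf' and 'obs' cases are confirmed by this patch series:

    // Sketch only: route a notebook to a round-trip export format by corpusMarker.
    function detectRoundtripFormat(metadata: { corpusMarker?: string }): string | null {
        switch (metadata.corpusMarker) {
            case "pdf": // set by the PDF importer; re-exported via the converted DOCX
                return "docx";
            case "obs": // set by the OBS importer (see obs/index.ts above)
                return "obs";
            // ...one case per round-trip importer (USFM, DOCX, TMS, Markdown, CSV/TSV, IDML)
            default:
                return null; // not imported with a round-trip importer
        }
    }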
diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index f03a50b0d..10b87bfd9 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -983,6 +983,28 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide codexNotebooks, }); + // Register notebook references in the original files registry + // This tracks which notebooks use each original file, so we know when it's safe to delete + if (workspaceFolder) { + const { addNotebookReference } = await import('./originalFileUtils'); + for (const createdFile of createdFiles) { + try { + // Read the source notebook to get originalFileName from metadata + const sourceContent = await vscode.workspace.fs.readFile(createdFile.sourceUri); + const sourceNotebook = JSON.parse(new TextDecoder().decode(sourceContent)); + const originalFileName = sourceNotebook?.metadata?.originalName || sourceNotebook?.metadata?.originalFileName; + + if (originalFileName) { + // Use the source filename (without extension) as the notebook base name + const notebookBaseName = path.basename(createdFile.sourceUri.fsPath).replace(/\.[^/.]+$/, ''); + await addNotebookReference(workspaceFolder, originalFileName, notebookBaseName); + } + } catch (err) { + console.warn(`[NewSourceUploader] Could not register notebook reference: ${err}`); + } + } + } + // Migrate localized-books.json to codex metadata before deleting the file // Pass the newly created codex URIs directly to avoid search issues const createdCodexUris = createdFiles.map(f => f.codexUri); diff --git a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts index 7788f1c16..c7d41038f 100644 --- a/src/providers/NewSourceUploader/codexFIleCreateUtils.ts +++ b/src/providers/NewSourceUploader/codexFIleCreateUtils.ts @@ -7,21 +7,6 @@ import { CodexContentSerializer } from "../../serializer"; import { CustomNotebookMetadata } from "../../../types"; import { formatJsonForNotebookFile } from "../../utils/notebookFileFormattingUtils"; -/** - * Adds a unique identifier to a filename, preserving the extension. 
- * Example: "document.idml" -> "document-(abc123).idml" - */ -function addIdToFilename(filename: string, id: string): string { - const lastDotIndex = filename.lastIndexOf('.'); - if (lastDotIndex === -1) { - // No extension - return `${filename}-(${id})`; - } - const baseName = filename.substring(0, lastDotIndex); - const extension = filename.substring(lastDotIndex); - return `${baseName}-(${id})${extension}`; -} - export function checkCancellation(token?: vscode.CancellationToken): void { if (token?.isCancellationRequested) { throw new vscode.CancellationError(); @@ -236,31 +221,10 @@ export async function createNoteBookPair({ console.log(`[CODEX FILE CREATE] Non-biblical import: adding id "${uniqueId}" to filename`); - // Update originalFileName in metadata to include id for attachment tracking - // This ensures the original file saved in attachments/originals matches the notebook - if (sourceNotebook.metadata?.originalFileName) { - const idOriginalFileName = addIdToFilename( - sourceNotebook.metadata.originalFileName, - uniqueId - ); - sourceNotebook.metadata.originalFileName = idOriginalFileName; - // Also update sourceFile if it exists - if (sourceNotebook.metadata.sourceFile) { - sourceNotebook.metadata.sourceFile = idOriginalFileName; - } - console.log(`[CODEX FILE CREATE] Updated originalFileName to: "${idOriginalFileName}"`); - } - - // Update codex metadata to match - if (codexNotebook.metadata?.originalFileName) { - codexNotebook.metadata.originalFileName = addIdToFilename( - codexNotebook.metadata.originalFileName, - uniqueId - ); - } - if (codexNotebook.metadata?.sourceFile) { - codexNotebook.metadata.sourceFile = sourceNotebook.metadata.originalFileName; - } + // IMPORTANT: Do NOT modify originalFileName here. + // originalFileName must point to the actual file stored in attachments/originals/ + // (which may be deduplicated). The notebook filename uses UUIDs for uniqueness, + // but the original file reference should remain unchanged for round-trip export. 
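To make the invariant above concrete, here is a sketch of the registry entry it protects — the fields mirror the OriginalFileEntry interface added to originalFileUtils.ts just below, while the values and notebook names are hypothetical:

    // Sketch only: two notebooks that imported byte-identical content share one registry entry.
    interface OriginalFileEntrySketch {
        hash: string;            // content hash of the original file
        fileName: string;        // actual filename on disk in .project/attachments/originals/
        originalNames: string[]; // requested filenames that mapped to this content
        referencedBy: string[];  // notebook base names that still use this original
        addedAt: string;         // ISO timestamp when first added
    }

    const entry: OriginalFileEntrySketch = {
        hash: "ab12cd34",                                   // hypothetical hash value
        fileName: "notes.docx",                             // hypothetical original
        originalNames: ["notes.docx"],
        referencedBy: ["notes-(uuid-1)", "notes-(uuid-2)"], // hypothetical notebook base names
        addedAt: "2026-02-11T19:03:32.000Z",
    };

    // Both notebooks keep metadata.originalFileName === "notes.docx" unchanged, so round-trip
    // export still resolves the shared file; deleting one notebook only removes its
    // referencedBy entry, and the file is deleted from disk only once referencedBy is empty
    // (see removeNotebookReference below).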
// Generate unique display name for non-biblical imports // If a file with the same display name already exists, add a number suffix diff --git a/src/providers/NewSourceUploader/originalFileUtils.ts b/src/providers/NewSourceUploader/originalFileUtils.ts index d4c358616..e8d0546de 100644 --- a/src/providers/NewSourceUploader/originalFileUtils.ts +++ b/src/providers/NewSourceUploader/originalFileUtils.ts @@ -31,6 +31,8 @@ export interface OriginalFileEntry { fileName: string; /** Original filename(s) that mapped to this file (for reference) */ originalNames: string[]; + /** Notebook base names (without extension) that reference this original file */ + referencedBy: string[]; /** Timestamp when first added */ addedAt: string; } @@ -111,6 +113,13 @@ export async function loadOriginalFilesRegistry( if (!registry.fileNameToHash) registry.fileNameToHash = {}; if (!registry.version) registry.version = 1; + // Migration: ensure all entries have referencedBy array + for (const entry of Object.values(registry.files)) { + if (!entry.referencedBy) { + entry.referencedBy = []; + } + } + return registry; } catch { // Registry doesn't exist, create empty one @@ -175,12 +184,14 @@ function generateUniqueFileName( * @param workspaceFolder The workspace folder * @param requestedFileName The desired filename for the original file * @param fileData The file content + * @param notebookBaseName Optional base name of the notebook referencing this file (e.g., "test-(uuid)") * @returns Result with the actual filename to use in metadata */ export async function saveOriginalFileWithDeduplication( workspaceFolder: vscode.WorkspaceFolder, requestedFileName: string, - fileData: Uint8Array | ArrayBuffer | Buffer + fileData: Uint8Array | ArrayBuffer | Buffer, + notebookBaseName?: string ): Promise { // Compute hash of the file const hash = computeFileHash(fileData); @@ -195,9 +206,21 @@ export async function saveOriginalFileWithDeduplication( // We already have a file with the same content console.log(`[OriginalFiles] File with hash ${hash.slice(0, 8)}... already exists as "${existingEntry.fileName}"`); + let registryChanged = false; + // Track this original name if it's new if (!existingEntry.originalNames.includes(requestedFileName)) { existingEntry.originalNames.push(requestedFileName); + registryChanged = true; + } + + // Track notebook reference + if (notebookBaseName && !existingEntry.referencedBy.includes(notebookBaseName)) { + existingEntry.referencedBy.push(notebookBaseName); + registryChanged = true; + } + + if (registryChanged) { await saveOriginalFilesRegistry(workspaceFolder, registry); } @@ -237,6 +260,7 @@ export async function saveOriginalFileWithDeduplication( hash, fileName: actualFileName, originalNames: [requestedFileName], + referencedBy: notebookBaseName ? [notebookBaseName] : [], addedAt: new Date().toISOString(), }; registry.fileNameToHash[actualFileName] = hash; @@ -293,6 +317,110 @@ export async function getAllOriginalFiles( return Object.values(registry.files); } +/** + * Remove a notebook reference from the registry. + * If no other notebooks reference the original file, deletes the file from disk and registry. 
+ * + * @param workspaceFolder The workspace folder + * @param notebookBaseName The base name of the notebook being deleted (e.g., "test-(uuid)") + * @param originalFileName The originalFileName from the notebook's metadata (points to file in originals/) + * @returns Whether the original file was deleted from disk + */ +export async function removeNotebookReference( + workspaceFolder: vscode.WorkspaceFolder, + notebookBaseName: string, + originalFileName?: string +): Promise<{ originalFileDeleted: boolean; fileName: string | null }> { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + + // Find the entry by originalFileName or by scanning referencedBy + let targetHash: string | null = null; + let targetEntry: OriginalFileEntry | null = null; + + if (originalFileName) { + // Look up by filename first + const hash = registry.fileNameToHash[originalFileName]; + if (hash && registry.files[hash]) { + targetHash = hash; + targetEntry = registry.files[hash]; + } + } + + // If not found by filename, scan all entries for this notebook reference + if (!targetEntry) { + for (const [hash, entry] of Object.entries(registry.files)) { + if (entry.referencedBy.includes(notebookBaseName)) { + targetHash = hash; + targetEntry = entry; + break; + } + } + } + + if (!targetEntry || !targetHash) { + console.log(`[OriginalFiles] No registry entry found for notebook "${notebookBaseName}"`); + return { originalFileDeleted: false, fileName: null }; + } + + // Remove this notebook from referencedBy + targetEntry.referencedBy = targetEntry.referencedBy.filter(ref => ref !== notebookBaseName); + console.log(`[OriginalFiles] Removed reference "${notebookBaseName}" from "${targetEntry.fileName}" (${targetEntry.referencedBy.length} references remaining)`); + + if (targetEntry.referencedBy.length === 0) { + // No more references - delete the original file and registry entry + const originalsDir = getOriginalsDir(workspaceFolder); + const fileUri = vscode.Uri.joinPath(originalsDir, targetEntry.fileName); + const deletedFileName = targetEntry.fileName; + + try { + await vscode.workspace.fs.delete(fileUri); + console.log(`[OriginalFiles] Deleted unreferenced original file: ${targetEntry.fileName}`); + } catch (err) { + console.warn(`[OriginalFiles] Could not delete original file "${targetEntry.fileName}": ${err}`); + } + + // Remove from registry + delete registry.files[targetHash]; + delete registry.fileNameToHash[targetEntry.fileName]; + await saveOriginalFilesRegistry(workspaceFolder, registry); + + return { originalFileDeleted: true, fileName: deletedFileName }; + } + + // Still has references, just save the updated registry + await saveOriginalFilesRegistry(workspaceFolder, registry); + return { originalFileDeleted: false, fileName: targetEntry.fileName }; +} + +/** + * Add a notebook reference to an existing registry entry (by originalFileName). + * Used when the notebook base name isn't known at import time but is known after file creation. 
+ * + * @param workspaceFolder The workspace folder + * @param originalFileName The originalFileName stored in metadata + * @param notebookBaseName The base name of the notebook (e.g., "test-(uuid)") + */ +export async function addNotebookReference( + workspaceFolder: vscode.WorkspaceFolder, + originalFileName: string, + notebookBaseName: string +): Promise { + const registry = await loadOriginalFilesRegistry(workspaceFolder); + + const hash = registry.fileNameToHash[originalFileName]; + if (!hash || !registry.files[hash]) { + console.warn(`[OriginalFiles] Cannot add reference: no registry entry for "${originalFileName}"`); + return; + } + + const entry = registry.files[hash]; + if (!entry.referencedBy.includes(notebookBaseName)) { + entry.referencedBy.push(notebookBaseName); + await saveOriginalFilesRegistry(workspaceFolder, registry); + console.log(`[OriginalFiles] Added reference "${notebookBaseName}" to "${originalFileName}" (${entry.referencedBy.length} total)`); + } +} + /** * Clean up orphaned registry entries (files that no longer exist on disk) */ diff --git a/src/providers/navigationWebview/navigationWebviewProvider.ts b/src/providers/navigationWebview/navigationWebviewProvider.ts index 26b90f72a..d50d3ed1b 100644 --- a/src/providers/navigationWebview/navigationWebviewProvider.ts +++ b/src/providers/navigationWebview/navigationWebviewProvider.ts @@ -9,7 +9,8 @@ import { safePostMessageToView } from "../../utils/webviewUtils"; import { CodexItem } from "types"; import { getCellValueData, cellHasAudioUsingAttachments, computeValidationStats, computeProgressPercents } from "../../../sharedUtils"; import { normalizeCorpusMarker } from "../../utils/corpusMarkerUtils"; -import { addMetadataEdit } from "../../utils/editMapUtils"; +import { addMetadataEdit, addProjectMetadataEdit, EditMapUtils } from "../../utils/editMapUtils"; +import { MetadataManager } from "../../utils/metadataManager"; import { getAuthApi } from "../../extension"; import { CustomNotebookMetadata } from "../../../types"; import { getCorrespondingSourceUri, findCodexFilesByBookAbbr } from "../../utils/codexNotebookUtils"; @@ -259,6 +260,19 @@ export class NavigationWebviewProvider extends BaseWebviewProvider { } } + // Read the codex notebook metadata before deletion to get originalFileName + let originalFileName: string | undefined; + let notebookBaseName: string | undefined; + try { + const codexContent = await vscode.workspace.fs.readFile(codexUri); + const codexNotebook = JSON.parse(new TextDecoder().decode(codexContent)); + originalFileName = codexNotebook?.metadata?.originalName || codexNotebook?.metadata?.originalFileName; + // Derive notebook base name from codex filename (without extension) + notebookBaseName = path.basename(normalizedPath).replace(/\.[^/.]+$/, ''); + } catch (err) { + console.warn(`[Navigation] Could not read codex metadata for original file cleanup: ${err}`); + } + // Delete the codex file try { await vscode.workspace.fs.delete(codexUri); @@ -302,6 +316,35 @@ export class NavigationWebviewProvider extends BaseWebviewProvider { } } + // Clean up original file in attachments/originals (if applicable) + // Remove this notebook's reference; delete the original file only if no other notebooks use it + if (notebookBaseName) { + try { + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]; + if (workspaceFolder) { + const { removeNotebookReference } = await import('../NewSourceUploader/originalFileUtils'); + const result = await removeNotebookReference( + workspaceFolder, + 
notebookBaseName, + originalFileName + ); + if (result.originalFileDeleted && result.fileName) { + deletedFiles.push(`original: ${result.fileName}`); + console.log(`[Navigation] Deleted unreferenced original file: ${result.fileName}`); + } else if (result.fileName) { + console.log(`[Navigation] Original file "${result.fileName}" still referenced by other notebooks, kept`); + } + } + } catch (err) { + console.warn(`[Navigation] Could not clean up original file: ${err}`); + } + } + + // Record single file deletion to project edit history + if (deletedFiles.length > 0 && message.type === "codexDocument") { + await this.recordFileDeletionToEditHistory(normalizedPath, message.label); + } + // Show appropriate message based on results if (deletedFiles.length > 0 && errors.length === 0) { vscode.window.showInformationMessage( @@ -399,6 +442,27 @@ export class NavigationWebviewProvider extends BaseWebviewProvider { } break; } + case "deleteCorpusMarker": { + try { + const content = message.content ?? {}; + const { corpusLabel, displayName, children } = content; + + const fileCount = children?.length ?? 0; + const confirmed = await vscode.window.showWarningMessage( + `Are you sure you want to delete the folder "${displayName}"? This will permanently delete ${fileCount} file(s) and cannot be undone.`, + { modal: true }, + "Delete" + ); + + if (confirmed === "Delete" && children?.length > 0) { + await this.deleteCorpusMarker(corpusLabel, displayName, children); + } + } catch (error) { + console.error("Error deleting corpus marker:", error); + vscode.window.showErrorMessage(`Failed to delete folder: ${error}`); + } + break; + } } } @@ -1193,6 +1257,206 @@ export class NavigationWebviewProvider extends BaseWebviewProvider { } } + private async recordFileDeletionToEditHistory(filePath: string, label: string): Promise { + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]?.uri; + if (!workspaceFolder) return; + + try { + const author = await this.getCurrentUser(); + await MetadataManager.safeUpdateMetadata( + workspaceFolder, + (metadata: { edits?: unknown[] }) => { + if (!metadata.edits) metadata.edits = []; + addProjectMetadataEdit( + metadata, + EditMapUtils.deletedFile(), + { filePath, label }, + author + ); + return metadata; + }, + { author } + ); + } catch (err) { + console.warn(`[Navigation] Could not record file deletion to edit history: ${err}`); + } + } + + private async recordCorpusDeletionToEditHistory( + corpusMarker: string, + deletedFiles: Array<{ filePath: string; label: string }> + ): Promise { + const workspaceFolder = vscode.workspace.workspaceFolders?.[0]?.uri; + if (!workspaceFolder) return; + + try { + const author = await this.getCurrentUser(); + await MetadataManager.safeUpdateMetadata( + workspaceFolder, + (metadata: { edits?: unknown[] }) => { + if (!metadata.edits) metadata.edits = []; + addProjectMetadataEdit( + metadata, + EditMapUtils.deletedCorpusMarker(), + { corpusMarker, deletedFiles }, + author + ); + return metadata; + }, + { author } + ); + } catch (err) { + console.warn(`[Navigation] Could not record corpus deletion to edit history: ${err}`); + } + } + + private async getCurrentUser(): Promise { + try { + const authApi = getAuthApi(); + const userInfo = await authApi?.getUserInfo(); + return userInfo?.username || "anonymous"; + } catch { + return "anonymous"; + } + } + + private async deleteCorpusMarker( + corpusLabel: string, + displayName: string, + children: Array<{ uri: string; label: string; type: string }> + ): Promise { + const workspaceFolder = 
vscode.workspace.workspaceFolders?.[0]; + if (!workspaceFolder) { + vscode.window.showErrorMessage("No workspace folder found"); + return; + } + + const codexEditorProvider = CodexCellEditorProvider.getInstance(); + const closePanelByUri = (uri: vscode.Uri) => { + if (!codexEditorProvider) return; + const webviewPanels = codexEditorProvider.getWebviewPanels(); + let panelToClose = webviewPanels.get(uri.toString()); + if (!panelToClose) { + for (const [panelUri, panel] of webviewPanels.entries()) { + const panelUriObj = vscode.Uri.parse(panelUri); + if (panelUriObj.fsPath === uri.fsPath) { + panelToClose = panel; + break; + } + } + } + if (panelToClose) panelToClose.dispose(); + }; + + const allDeletedFiles: Array<{ filePath: string; label: string }> = []; + const errors: string[] = []; + + await vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: `Deleting folder "${displayName}"`, + cancellable: false, + }, + async (progress) => { + const total = children.length; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + progress.report({ + increment: (100 / total), + message: `Deleting ${child.label}...`, + }); + + const normalizedPath = (child.uri as string).replace(/\\/g, "/"); + const codexUri = vscode.Uri.file(normalizedPath); + + // Close webviews + closePanelByUri(codexUri); + if (child.type === "codexDocument") { + const baseFileName = path.basename(normalizedPath); + const sourceFileName = baseFileName.replace(".codex", ".source"); + const sourceUri = vscode.Uri.joinPath( + workspaceFolder.uri, + ".project", + "sourceTexts", + sourceFileName + ); + closePanelByUri(sourceUri); + } + + let originalFileName: string | undefined; + let notebookBaseName: string | undefined; + try { + const codexContent = await vscode.workspace.fs.readFile(codexUri); + const codexNotebook = JSON.parse(new TextDecoder().decode(codexContent)); + originalFileName = codexNotebook?.metadata?.originalName || codexNotebook?.metadata?.originalFileName; + notebookBaseName = path.basename(normalizedPath).replace(/\.[^/.]+$/, ""); + } catch { + // File may already be gone + } + + try { + await vscode.workspace.fs.delete(codexUri); + allDeletedFiles.push({ filePath: normalizedPath, label: child.label }); + } catch (error) { + console.error("Error deleting codex file:", error); + errors.push(`Failed to delete ${child.label}: ${error}`); + } + + if (child.type === "codexDocument") { + try { + const baseFileName = path.basename(normalizedPath); + const sourceFileName = baseFileName.replace(".codex", ".source"); + const sourceUri = vscode.Uri.joinPath( + workspaceFolder.uri, + ".project", + "sourceTexts", + sourceFileName + ); + try { + await vscode.workspace.fs.delete(sourceUri); + } catch (deleteError: unknown) { + const err = deleteError as { code?: string }; + if (err.code !== "FileNotFound" && err.code !== "ENOENT") { + errors.push(`Failed to delete source for ${child.label}`); + } + } + } catch (error) { + errors.push(`Failed to delete source for ${child.label}`); + } + } + + if (notebookBaseName) { + try { + const { removeNotebookReference } = await import("../NewSourceUploader/originalFileUtils"); + await removeNotebookReference(workspaceFolder, notebookBaseName, originalFileName); + } catch { + // Non-fatal + } + } + } + } + ); + + // Record folder and all deleted files to edit history + if (allDeletedFiles.length > 0) { + await this.recordCorpusDeletionToEditHistory(corpusLabel, allDeletedFiles); + } + + if (allDeletedFiles.length > 0 && errors.length 
=== 0) { + vscode.window.showInformationMessage( + `Successfully deleted folder "${displayName}" and ${allDeletedFiles.length} file(s)` + ); + } else if (allDeletedFiles.length > 0 && errors.length > 0) { + vscode.window.showWarningMessage( + `Partially deleted: ${allDeletedFiles.length} file(s). Errors: ${errors.join("; ")}` + ); + } else { + vscode.window.showErrorMessage(`Failed to delete folder "${displayName}": ${errors.join("; ")}`); + } + + await this.buildInitialData(); + } + public dispose(): void { this.disposables.forEach((d) => d.dispose()); } diff --git a/src/utils/editMapUtils.ts b/src/utils/editMapUtils.ts index 3f59752bb..f16bd709b 100644 --- a/src/utils/editMapUtils.ts +++ b/src/utils/editMapUtils.ts @@ -27,6 +27,8 @@ type MetaEditMap = ["meta"]; type MetaFieldEditMap = ["meta", string]; type LanguagesEditMap = ["languages"]; type SpellcheckIsEnabledEditMap = ["spellcheckIsEnabled"]; +type DeletedCorpusMarkerEditMap = ["deletedCorpusMarker"]; +type DeletedFileEditMap = ["deletedFile"]; import { EditType } from "../../types/enums"; @@ -149,6 +151,14 @@ export const EditMapUtils = { return ["spellcheckIsEnabled"]; }, + deletedCorpusMarker(): DeletedCorpusMarkerEditMap { + return ["deletedCorpusMarker"]; + }, + + deletedFile(): DeletedFileEditMap { + return ["deletedFile"]; + }, + // Compare editMaps equals(editMap1: readonly string[], editMap2: readonly string[]): boolean { return JSON.stringify(editMap1) === JSON.stringify(editMap2); diff --git a/types/index.d.ts b/types/index.d.ts index a6e274c44..1641efd0d 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -1578,6 +1578,14 @@ type ProjectManagerMessageFromWebview = | { command: "triggerSync"; } | { command: "editBookName"; content: { bookAbbr: string; newBookName: string; }; } | { command: "editCorpusMarker"; content: { corpusLabel: string; newCorpusName: string; }; } + | { + command: "deleteCorpusMarker"; + content: { + corpusLabel: string; + displayName: string; + children: Array<{ uri: string; label: string; type: string }>; + }; + } | { command: "openCellLabelImporter"; } | { command: "openCodexMigrationTool"; } | { command: "navigateToMainMenu"; } diff --git a/webviews/codex-webviews/src/NavigationView/index.tsx b/webviews/codex-webviews/src/NavigationView/index.tsx index 05e082d5a..22942bfe5 100644 --- a/webviews/codex-webviews/src/NavigationView/index.tsx +++ b/webviews/codex-webviews/src/NavigationView/index.tsx @@ -461,6 +461,24 @@ function NavigationView() { })); }; + const handleDeleteCorpusMarker = (item: CodexItem) => { + const displayName = + item.children?.[0]?.corpusMarker || + formatLabel(item.label, state.bibleBookMap || new Map()); + vscode.postMessage({ + command: "deleteCorpusMarker", + content: { + corpusLabel: item.label, + displayName, + children: item.children?.map((c) => ({ + uri: c.uri, + label: c.label, + type: c.type, + })) ?? [], + }, + }); + }; + const handleRenameModalClose = () => { setState((prev) => ({ ...prev, @@ -847,16 +865,29 @@ function NavigationView() {
                                )}
                                {item.type === "corpus" && (
-                                   {/* markup garbled in extraction — removed: the single "Rename Group"
-                                       menu item; its onClick stopped propagation and called
-                                       handleEditCorpusMarker(item) */}
+                                   <>
+                                   {/* markup garbled in extraction — added: the same "Rename Group" item
+                                       plus a new "Delete Folder" item whose onClick prevents default,
+                                       stops propagation, and calls handleDeleteCorpusMarker(item) */}
+                                   </>
                                )}
                                {!isGroup && (
[remainder of this hunk lost in extraction]
diff --git a/webviews/codex-webviews/src/NewSourceUploader/components/PluginSelection.tsx b/webviews/codex-webviews/src/NewSourceUploader/components/PluginSelection.tsx
= ({ plugin, onSelect, className }) => { const Icon = plugin.icon; const isEnabled = plugin.enabled !== false; - const isBetaPlugin = plugin.id === "docx-roundtrip" || plugin.id === "pdf-importer" || plugin.id === "usfm-experimental" || plugin.id === "indesign-importer" || plugin.id === "biblica-importer" || plugin.id === "spreadsheet"; + const isBetaPlugin = plugin.id === "pdf-importer" || plugin.id === "indesign-importer" || plugin.id === "biblica-importer" || plugin.id === "spreadsheet"; return ( {plugin.tags .filter((tag) => !["Essential", "Specialized"].includes(tag)) - .slice(0, 2) .map((tag) => ( - + {tag} ))} diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/audio/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/audio/index.tsx index d3ab7611c..e7c82de95 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/audio/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/audio/index.tsx @@ -28,6 +28,5 @@ export const audioImporterPlugin: ImporterPlugin = { "audio/flac", ], enabled: true, - tags: ["Essential", "Media", "Audio"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/index.tsx index d3ab7611c..e7c82de95 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/audio2/index.tsx @@ -28,6 +28,5 @@ export const audioImporterPlugin: ImporterPlugin = { "audio/flac", ], enabled: true, - tags: ["Essential", "Media", "Audio"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/SpreadsheetImporterForm.tsx similarity index 97% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/SpreadsheetImporterForm.tsx index 0ed709abe..498f5519f 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/SpreadsheetImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/SpreadsheetImporterForm.tsx @@ -841,6 +841,15 @@ export const SpreadsheetImporterForm: React.FC = (props) onCancel(); }; + const handleBackToFileSelection = () => { + setParsedData(null); + setColumnMapping({}); + setIsDirty(false); + setError(null); + setPendingImport(null); + setPendingNotebookPair(null); + }; + const renderColumnMappingCard = () => { if (!parsedData) return null; @@ -853,26 +862,26 @@ export const SpreadsheetImporterForm: React.FC = (props) Choose Your Columns {!isTranslationImport && ( -
-                            {/* markup garbled in extraction — removed: the old "Choose Your Columns" header-row markup */}
+                            {/* markup garbled in extraction — added: a back button that calls
+                                handleBackToFileSelection to return to file selection */}
                        )}
                     {isTranslationImport
                         ? `Tell us which column contains the translations for "${selectedSource?.name}"`
-                        : "Tell us which columns contain your content. Optional: add an Attachments column with audio URLs to auto-attach audio to each cell."}
+                        : <>
+                            Tell us which columns contain your content. Explanation of column types:
+                            {"\n\n"}
+                            • Verse References: (ID column) used for cross-references and annotations
+                            {"\n\n"}
+                            • Source Content: your source text per row
+                            {"\n\n"}
+                            • Attachments: audio URLs separated by comma, semicolon, or space.
+                        </>}
@@ -910,7 +919,7 @@ export const SpreadsheetImporterForm: React.FC = (props)
- Global References + Verse References
{!isTranslationImport && ( @@ -966,7 +975,7 @@ export const SpreadsheetImporterForm: React.FC = (props) {getColumnTypeCount("globalReferences") > 0 && ( - Global References + Verse References )} {getColumnTypeCount("source") > 0 && ( @@ -997,7 +1006,7 @@ export const SpreadsheetImporterForm: React.FC = (props) )}
- diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/cellMetadata.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/cellMetadata.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/cellMetadata.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx similarity index 92% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx index eada7eeb5..24008bac1 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/index.tsx @@ -11,7 +11,6 @@ export const spreadsheetImporterPlugin: ImporterPlugin = { component: SpreadsheetImporterForm, supportedExtensions: ["csv", "tsv"], supportedMimeTypes: ["text/csv", "text/tab-separated-values", "application/csv"], - tags: ["Structured", "Data", "Translation"], enabled: true, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/parser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/parser.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/parser.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/parser.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/spreadsheetExporter.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/spreadsheetExporter.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/types.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/types.ts similarity index 100% rename from webviews/codex-webviews/src/NewSourceUploader/importers/spreadsheet/types.ts rename to webviews/codex-webviews/src/NewSourceUploader/importers/bibleSpredSheet/types.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx index 53a8cd2ff..3a357e6d8 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/biblica/index.tsx @@ -17,5 +17,4 @@ export const biblicaImporterPlugin: ImporterPlugin = { supportedExtensions: ['idml'], supportedMimeTypes: ['application/vnd.adobe.indesign-idml-package'], enabled: true, - tags: ['Bible', 'Biblica', 'Round-trip'], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/cellMetadata.ts index 3b9fccf56..4f4098b3b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/cellMetadata.ts @@ -54,7 +54,7 @@ export function createDocxCellMetadata(params: 
DocxCellMetadataParams): { metada }, // Cell label (paragraph number) - cellLabel: `¶${paragraphIndex + 1}`, + cellLabel: `${paragraphIndex + 1}`, }; return { diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/index.tsx index e4f2ddb27..e41303931 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/experiment/index.tsx @@ -14,7 +14,6 @@ export const docxRoundtripImporterPlugin: ImporterPlugin = { component: DocxImporterForm, supportedExtensions: ["docx"], supportedMimeTypes: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], - tags: ["Round-trip", "Export"], enabled: true, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/ebibleCorpus/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/ebibleCorpus/index.tsx index 1e995c71f..88e7f1775 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/ebibleCorpus/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/ebibleCorpus/index.tsx @@ -10,5 +10,4 @@ export const ebibleDownloadImporterPlugin: ImporterPlugin = { component: EbibleDownloadImporterForm, supportedExtensions: [], // No file extensions - this downloads remotely enabled: true, - tags: ["Remote", "Biblical"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx index 88c044881..71c716f86 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/indesign/index.tsx @@ -17,5 +17,4 @@ export const indesignImporterPlugin: ImporterPlugin = { supportedExtensions: ['idml'], supportedMimeTypes: ['application/vnd.adobe.indesign-idml-package'], enabled: true, - tags: ['Essential', 'Documents', 'Adobe', 'Professional', 'RoundTrip'], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx index aa58d6a7b..afdace697 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/maculaBible/index.tsx @@ -10,5 +10,4 @@ export const maculaBibleImporterPlugin: ImporterPlugin = { component: MaculaBibleImporterForm, supportedExtensions: [], // No file extensions - this downloads remotely enabled: true, - tags: ["Specialized", "Biblical", "Original Languages", "Hebrew", "Greek"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx index c7d9d2074..0194e0400 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/obs/index.tsx @@ -11,5 +11,4 @@ export const obsImporterPlugin: ImporterPlugin = { component: ObsImporterForm, supportedExtensions: ["md", "zip"], enabled: true, - tags: ["stories", "download", "repository"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx index c185fa8c9..bcccfae1f 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx +++ 
b/webviews/codex-webviews/src/NewSourceUploader/importers/registry.tsx @@ -16,21 +16,19 @@ import { // import { docxImporterPlugin } from "./docx/index.tsx"; // Old mammoth.js importer import { docxRoundtripImporterPlugin as docxImporterPlugin } from "./docx/experiment/index.tsx"; // New round-trip importer import { markdownImporterPlugin } from "./markdown/index.tsx"; -import { usfmImporterPlugin } from "./usfm/index.tsx"; // Original USFM importer +// import { usfmImporterPlugin } from "./usfm/index.tsx"; // Original USFM importer import { usfmExperimentalImporterPlugin } from "./usfm/experimental/index.tsx"; // Experimental round-trip importer (standalone with headers in chapter 1) import { ebibleDownloadImporterPlugin } from "./ebibleCorpus/index.tsx"; import { maculaBibleImporterPlugin } from "./maculaBible/index.tsx"; import { subtitlesImporterPlugin } from "./subtitles/index.tsx"; import { obsImporterPlugin } from "./obs/index.tsx"; -import { smartSegmenterPlugin } from "./recursiveTextSplitter/index.tsx"; +// import { smartSegmenterPlugin } from "./recursiveTextSplitter/index.tsx"; import { paratextImporterPlugin } from "./paratext/index.tsx"; -import { spreadsheetImporterPlugin } from "./spreadsheet/index.tsx"; +import { spreadsheetImporterPlugin } from "./bibleSpredSheet/index.tsx"; import { audioImporterPlugin } from "./audio/index.tsx"; import { biblicaImporterPlugin } from "./biblica/index.tsx"; -// import { biblicaSwapperImporterPlugin } from "./biblica-swapper/index.tsx"; import { tmsImporterPlugin } from "./tms/index.tsx"; -// import { rtfImporterPlugin } from "./rtf/index.tsx"; -import { pdfImporterPlugin } from "./pdf/index.tsx"; +// import { pdfImporterPlugin } from "./pdf/index.tsx"; import { indesignImporterPlugin } from "./indesign/index.tsx"; // Import placeholder components - these will be created for each importer @@ -52,118 +50,105 @@ const createPlaceholderComponent = (name: string) => { export const importerPlugins: ImporterPlugin[] = [ // Essential Tools - General purpose importers for broad appeal // Non-beta importers first - // { - // ...smartSegmenterPlugin, - // name: "Smart Segmenter", - // description: "Works with any text file", - // tags: [...(smartSegmenterPlugin.tags || []), "Essential", "Universal", "Text"], - // }, + // { + // ...smartSegmenterPlugin, + // name: "Smart Segmenter", + // description: "Works with any text file", + // tags: [...(smartSegmenterPlugin.tags || []), "Essential", "Universal", "Text"], + // }, { ...audioImporterPlugin, name: "Audio", description: "Import audio files with backend processing - supports large files", - tags: [...(audioImporterPlugin.tags || []), "Essential", "Media", "Audio"], + tags: ["Essential", "Media", "Audio"], }, { ...markdownImporterPlugin, name: "Markdown", - description: "GitHub-style markdown files", - tags: [...(markdownImporterPlugin.tags || []), "Essential", "Documentation", "GitHub"], + description: "GitHub-style markdown files with round-trip export support", + tags: ["Essential", "Documentation", "GitHub", "Round-trip"], }, { ...subtitlesImporterPlugin, name: "Subtitles", description: "Video captions with timestamps", - tags: [...(subtitlesImporterPlugin.tags || []), "Essential", "Media", "Video"], + tags: ["Essential", "Media", "Video"], }, { ...tmsImporterPlugin, name: "TMS Files", - description: "Translation memory and localization files (TMX/XLIFF)", - tags: [...(tmsImporterPlugin.tags || []), "Essential", "Translation", "Localization"], + description: "Translation memory and localization 
files (TMX/XLIFF) with round-trip export support", + tags: ["Essential", "Translation", "Localization", "Round-trip"], }, { ...docxImporterPlugin, name: "Word Documents", description: "Microsoft Word files with round-trip export support", - tags: [...(docxImporterPlugin.tags || []), "Essential", "Documents", "Microsoft"], - }, - { - ...spreadsheetImporterPlugin, - name: "Spreadsheets", - description: "Excel and Google Sheets", - tags: [...(spreadsheetImporterPlugin.tags || []), "Essential", "Spreadsheet", "Excel"], - }, - { - ...pdfImporterPlugin, - name: "PDF Documents", - description: "Portable Document Format files with Bible text", - icon: FileText, - tags: ["Essential", "Documents", "PDF"], + tags: ["Essential", "Documents", "Microsoft", "Round-trip"], }, { ...indesignImporterPlugin, name: "InDesign Files", description: "Adobe InDesign IDML files with round-trip loss-free editing", - tags: [...(indesignImporterPlugin.tags || []), "Essential", "Documents", "Adobe", "Professional", "Bible"], + tags: ["Essential", "Documents", "Adobe", "Round-trip"], }, + // { + // ...pdfImporterPlugin, + // name: "PDF Documents", + // description: "Portable Document Format files with Bible text", + // icon: FileText, + // tags: ["Essential", "Documents", "PDF"], + // }, // Specialized Tools - Domain-specific importers for Bible translation // Non-beta importers first + // { + // ...usfmImporterPlugin, + // name: "USFM Files", + // description: "Unified Standard Format Marker files", + // tags: [...(usfmImporterPlugin.tags || []), "Specialized", "Bible", "USFM"], + // }, { - ...usfmImporterPlugin, - name: "USFM Files", - description: "Unified Standard Format Marker files", - tags: [...(usfmImporterPlugin.tags || []), "Specialized", "Bible", "USFM"], + ...usfmExperimentalImporterPlugin, + name: "USFM New", + description: "USFM files with round-trip export support (headers in chapter 1, verse-only target imports)", + tags: ["Specialized", "Bible", "USFM", "Round-trip"], }, { ...paratextImporterPlugin, name: "Paratext Projects", description: "Translation projects with settings", - tags: [...(paratextImporterPlugin.tags || []), "Specialized", "Bible", "Paratext"], + tags: ["Specialized", "Bible", "Paratext"], }, { ...ebibleDownloadImporterPlugin, name: "eBible Download", description: "Download directly from eBible.org", - tags: [...(ebibleDownloadImporterPlugin.tags || []), "Specialized", "Bible", "Download"], + tags: ["Specialized", "Bible", "Download"], }, { ...maculaBibleImporterPlugin, name: "Macula Bible", description: "Hebrew and Greek with annotations", - tags: [ - ...(maculaBibleImporterPlugin.tags || []), - "Specialized", - "Bible", - "Original Languages", - ], + tags: ["Specialized", "Bible", "Original Languages"], }, { ...obsImporterPlugin, name: "Bible Stories", description: "Open Bible Stories format with round-trip export support", - tags: [...(obsImporterPlugin.tags || []), "Specialized", "Bible", "Stories", "Round-trip"], + tags: ["Specialized", "Bible", "Stories", "Round-trip"], }, - // { - // ...biblicaSwapperImporterPlugin, - // name: "Biblica Bible Swapper", - // description: "Swap Bible text between two IDML files while preserving notes", - // tags: [...(biblicaSwapperImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], - // }, - - // Beta importers at the end of Specialized section { - ...usfmExperimentalImporterPlugin, - name: "USFM Experimental", - description: "USFM files with round-trip export support (headers in chapter 1, verse-only target imports)", - tags: 
[...(usfmExperimentalImporterPlugin.tags || []), "Specialized", "Bible", "USFM", "Experimental", "Round-trip"], + ...biblicaImporterPlugin, + name: "Biblica Study Notes", + description: "Biblica IDML importer with Study Bible notes", + tags: ["Specialized", "Bible", "Biblica", "Round-trip"], }, { - ...biblicaImporterPlugin, - name: "Biblica Files", - description: "Biblica IDML importer with Study Bible", - tags: [...(biblicaImporterPlugin.tags || []), "Specialized", "Bible", "Biblica"], + ...spreadsheetImporterPlugin, + name: "Bible Spreadsheet with Audio data", + description: "CSV and TSV files with audio URLs", + tags: ["Specialized", "Bible", "Spreadsheet", "CSV", "TSV", "Round-trip"], }, ]; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx index d662daf3b..a145a983d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/subtitles/index.tsx @@ -13,5 +13,4 @@ export const subtitlesImporterPlugin: ImporterPlugin = { cellAligner: subtitlesCellAligner, supportedExtensions: ["vtt", "srt"], enabled: true, - tags: ["Media", "Timed"], }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx index 19bc7add0..f59f6182e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/tms/index.tsx @@ -10,5 +10,4 @@ export const tmsImporterPlugin: ImporterPlugin = { component: TmxImporterForm, supportedExtensions: ["tmx", "xliff", "xlf"], enabled: true, - tags: ["Essential", "Translation", "Localization"], }; \ No newline at end of file From b93bda72c3b4341346fa394045ed3983fce1bb20 Mon Sep 17 00:00:00 2001 From: LeviXIII Date: Tue, 17 Feb 2026 13:10:59 -0500 Subject: [PATCH 6/6] - Use fileDisplayName for the webview tab instead of having uuid and filename shown when there are duplicate file names. --- .../codexCellEditorProvider/codexCellEditorProvider.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts index 2275a53f3..505d913e8 100755 --- a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts +++ b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts @@ -707,6 +707,11 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider