Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/spec-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ name: OOXML Spec Compliance Review
on:
pull_request:
paths:
- 'packages/super-editor/src/core/super-converter/v2/importer/**'
- 'packages/super-editor/src/core/super-converter/v2/exporter/**'
- 'packages/super-editor/src/core/super-converter/v3/handlers/**'

concurrency:
Expand Down Expand Up @@ -31,7 +33,7 @@ jobs:
- name: Get changed files
id: changed
run: |
FILES=$(gh pr diff ${{ github.event.pull_request.number }} --name-only | grep 'v3/handlers/' || true)
FILES=$(gh pr diff ${{ github.event.pull_request.number }} --name-only | grep -E 'v2/(importer|exporter)/|v3/handlers/' || true)
echo "files<<EOF" >> $GITHUB_OUTPUT
echo "$FILES" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ export const createDocumentJson = (docx, converter, editor) => {

// Safety: drop any inline-only nodes that accidentally landed at the doc root
parsedContent = filterOutRootInlineNodes(parsedContent);
parsedContent = normalizeTableBookmarksInContent(parsedContent, editor);
collapseWhitespaceNextToInlinePassthrough(parsedContent);

const result = {
Expand Down Expand Up @@ -841,6 +842,210 @@ export function filterOutRootInlineNodes(content = []) {
return result;
}

/**
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ECMA spec doesn't actually show bookmarks as valid direct children of <w:tbl>. So this whole normalization is handling documents that aren't strictly conformant.

Would be nice to add a comment explaining that - something like:

/**                                                                                                                                                                                                                                              
   * Some non-conformant DOCX producers place bookmarks as direct table children.                                                                                                                                                                  
   * Per ECMA-376 §17.13.6.2, they should be inside cells (bookmarkStart) or                                                                                                                                                                       
   * as children of rows (bookmarkEnd). We relocate them for PM compatibility.                                                                                                                                                                     
   */

* Normalize bookmark nodes that appear as direct table children.
* Moves bookmarkStart/End into the first/last cell textblock of the table.
*
* Some non-conformant DOCX producers place bookmarks as direct table children.
* Per ECMA-376 §17.13.6.2, they should be inside cells (bookmarkStart) or
* as children of rows (bookmarkEnd).
* PM can't accept bookmarks as a direct child of table row and that is why
* we relocate them for compatibility.
*
* @param {Array<{type: string, content?: any[], attrs?: any}>} content
* @param {Editor} [editor]
* @returns {Array}
*/
export function normalizeTableBookmarksInContent(content = [], editor) {
if (!Array.isArray(content) || content.length === 0) return content;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Table bookmarks can have colFirst and colLast attributes that specify which columns they cover:

<w:bookMarkStart w:colFirst="0" w:colLast="1" w:id="0" w:name="table"/>

I didn't see these being preserved when we relocate the bookmarks. If someone has a column-range bookmark, we might lose that info. Worth checking if this matters for our use cases, or at least adding a comment about it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will check it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@caio-pizzol aded support for colFirst/colLast attributes when calculating the cell for normalisation


return content.map((node) => normalizeTableBookmarksInNode(node, editor));
}

function normalizeTableBookmarksInNode(node, editor) {
if (!node || typeof node !== 'object') return node;

if (node.type === 'table') {
node = normalizeTableBookmarksInTable(node, editor);
}

if (Array.isArray(node.content)) {
node.content = normalizeTableBookmarksInContent(node.content, editor);
}

return node;
}

function parseColIndex(val) {
if (val == null || val === '') return null;
const n = parseInt(String(val), 10);
return Number.isNaN(n) ? null : Math.max(0, n);
}

/** colFirst/colLast apply only to bookmarkStart; bookmarkEnd always uses first/last cell by position. */
function getCellIndexForBookmark(bookmarkNode, position, rowCellCount) {
if (!rowCellCount) return 0;
if (bookmarkNode?.type === 'bookmarkEnd') {
return position === 'start' ? 0 : rowCellCount - 1;
}
const attrs = bookmarkNode?.attrs ?? {};
const col = parseColIndex(position === 'start' ? attrs.colFirst : attrs.colLast);
if (col == null) return position === 'start' ? 0 : rowCellCount - 1;
return Math.min(col, rowCellCount - 1);
}

function addBookmarkToRowCellInlines(rowCellInlines, rowIndex, position, bookmarkNode, rowCellCount) {
const cellIndex = getCellIndexForBookmark(bookmarkNode, position, rowCellCount);
const bucket = rowCellInlines[rowIndex][position];
if (!bucket[cellIndex]) bucket[cellIndex] = [];
bucket[cellIndex].push(bookmarkNode);
}

function normalizeTableBookmarksInTable(tableNode, editor) {
if (!tableNode || tableNode.type !== 'table' || !Array.isArray(tableNode.content)) return tableNode;

const rows = tableNode.content.filter((child) => child?.type === 'tableRow');
if (!rows.length) return tableNode;

/** @type {{ start: Record<number, unknown[]>, end: Record<number, unknown[]> }[]} */
const rowCellInlines = rows.map(() => ({
start: /** @type {Record<number, unknown[]>} */ ({}),
end: /** @type {Record<number, unknown[]>} */ ({}),
}));
const updatedRows = rows.slice();
let rowCursor = 0;

const normalizedContent = tableNode.content.reduce((acc, child) => {
if (child?.type === 'tableRow') {
acc.push(updatedRows[rowCursor] ?? child);
rowCursor += 1;
return acc;
}

if (isBookmarkNode(child)) {
const prevRowIndex = rowCursor > 0 ? rowCursor - 1 : null;
const nextRowIndex = rowCursor < rows.length ? rowCursor : null;
const row = (nextRowIndex ?? prevRowIndex) != null ? rows[nextRowIndex ?? prevRowIndex] : null;
const rowCellCount = row?.content?.length ?? 0;

if (child.type === 'bookmarkStart') {
if (nextRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, nextRowIndex, 'start', child, rowCellCount);
else if (prevRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, prevRowIndex, 'end', child, rowCellCount);
} else {
if (prevRowIndex != null) addBookmarkToRowCellInlines(rowCellInlines, prevRowIndex, 'end', child, rowCellCount);
else if (nextRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, nextRowIndex, 'start', child, rowCellCount);
}
return acc;
}

acc.push(child);
return acc;
}, []);

updatedRows.forEach((row, index) => {
const { start: startByCell, end: endByCell } = rowCellInlines[index];
const cellIndices = [
...new Set([...Object.keys(startByCell).map(Number), ...Object.keys(endByCell).map(Number)]),
].sort((a, b) => a - b);
for (const cellIndex of cellIndices) {
const startNodes = startByCell[cellIndex];
const endNodes = endByCell[cellIndex];
if (startNodes?.length)
updatedRows[index] = insertInlineIntoRow(updatedRows[index], startNodes, editor, 'start', cellIndex);
if (endNodes?.length)
updatedRows[index] = insertInlineIntoRow(updatedRows[index], endNodes, editor, 'end', cellIndex);
}
});

// Rebuild content with updated rows (preserve any non-row nodes in place).
let updatedRowIndex = 0;
const rebuiltContent = normalizedContent.map((child) => {
if (child?.type === 'tableRow') {
const replacement = updatedRows[updatedRowIndex] ?? child;
updatedRowIndex += 1;
return replacement;
}
return child;
});

return {
...tableNode,
content: rebuiltContent,
};
}

/**
* @param {number} [cellIndex] - If set, insert into this cell; otherwise first (start) or last (end) cell.
*/
function insertInlineIntoRow(rowNode, inlineNodes, editor, position, cellIndex) {
if (!rowNode || !inlineNodes?.length) return rowNode;

if (!Array.isArray(rowNode.content) || rowNode.content.length === 0) {
const paragraph = { type: 'paragraph', content: inlineNodes };
const newCell = { type: 'tableCell', content: [paragraph], attrs: {}, marks: [] };
return { ...rowNode, content: [newCell] };
}

const lastCellIndex = rowNode.content.length - 1;
const targetIndex =
cellIndex != null ? Math.min(Math.max(0, cellIndex), lastCellIndex) : position === 'end' ? lastCellIndex : 0;
const targetCell = rowNode.content[targetIndex];
const updatedCell = insertInlineIntoCell(targetCell, inlineNodes, editor, position);

if (updatedCell === targetCell) return rowNode;

const nextContent = rowNode.content.slice();
nextContent[targetIndex] = updatedCell;
return { ...rowNode, content: nextContent };
}

function findTextblockIndex(content, editor, fromEnd) {
const start = fromEnd ? content.length - 1 : 0;
const end = fromEnd ? -1 : content.length;
const step = fromEnd ? -1 : 1;
for (let i = start; fromEnd ? i > end : i < end; i += step) {
if (isTextblockNode(content[i], editor)) return i;
}
return -1;
}

function insertInlineIntoCell(cellNode, inlineNodes, editor, position) {
if (!cellNode || !inlineNodes?.length) return cellNode;

const content = Array.isArray(cellNode.content) ? cellNode.content.slice() : [];
const targetIndex = findTextblockIndex(content, editor, position === 'end');

if (targetIndex === -1) {
const paragraph = { type: 'paragraph', content: inlineNodes };
if (position === 'end') content.push(paragraph);
else content.unshift(paragraph);
return { ...cellNode, content };
}

const targetBlock = content[targetIndex] || { type: 'paragraph', content: [] };
const blockContent = Array.isArray(targetBlock.content) ? targetBlock.content.slice() : [];
const nextBlockContent = position === 'end' ? blockContent.concat(inlineNodes) : inlineNodes.concat(blockContent);

content[targetIndex] = { ...targetBlock, content: nextBlockContent };
return { ...cellNode, content };
}

function isBookmarkNode(node) {
const typeName = node?.type;
return typeName === 'bookmarkStart' || typeName === 'bookmarkEnd';
}

function isTextblockNode(node, editor) {
const typeName = node?.type;
if (!typeName) return false;
const nodeType = editor?.schema?.nodes?.[typeName];
if (nodeType && typeof nodeType.isTextblock === 'boolean') return nodeType.isTextblock;
return typeName === 'paragraph';
}

/**
* Reconstruct original OOXML for preservable inline nodes using their attribute decoders.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { describe, it, expect } from 'vitest';
import { collapseWhitespaceNextToInlinePassthrough, filterOutRootInlineNodes } from './docxImporter.js';
import {
collapseWhitespaceNextToInlinePassthrough,
filterOutRootInlineNodes,
normalizeTableBookmarksInContent,
} from './docxImporter.js';

const n = (type, attrs = {}) => ({ type, attrs, marks: [] });

Expand Down Expand Up @@ -178,3 +182,114 @@ describe('collapseWhitespaceNextToInlinePassthrough', () => {
expect(tree[0].content[2].content[0].text).toBe('bar');
});
});

describe('normalizeTableBookmarksInContent', () => {
const table = (content) => ({ type: 'table', content, attrs: {}, marks: [] });
const row = (cells) => ({ type: 'tableRow', content: cells, attrs: {}, marks: [] });
const cell = (content) => ({ type: 'tableCell', content, attrs: {}, marks: [] });
const paragraph = (content) => ({ type: 'paragraph', content, attrs: {}, marks: [] });
const text = (value) => ({ type: 'text', text: value, marks: [] });
const bookmarkStart = (id, attrs = {}) => ({ type: 'bookmarkStart', attrs: { id, ...attrs } });
const bookmarkEnd = (id, attrs = {}) => ({ type: 'bookmarkEnd', attrs: { id, ...attrs } });

it('moves leading bookmarkStart into the first cell paragraph', () => {
const input = [table([bookmarkStart('b1'), row([cell([paragraph([text('Cell')])])])])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent[0]).toMatchObject({ type: 'bookmarkStart', attrs: { id: 'b1' } });
expect(paraContent[1]).toMatchObject({ type: 'text', text: 'Cell' });
});

it('moves trailing bookmarkEnd into the last cell paragraph', () => {
const input = [table([row([cell([paragraph([text('Cell')])])]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);
const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent[0]).toMatchObject({ type: 'text', text: 'Cell' });
expect(paraContent[1]).toMatchObject({ type: 'bookmarkEnd', attrs: { id: 'b1' } });
});

it('moves bookmarkStart and bookmarkEnd into the same cell when no textblocks exist', () => {
const input = [table([bookmarkStart('b1'), row([cell([])]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);

const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);
});

it('anchors bookmark boundaries to adjacent rows when markers appear between rows', () => {
const input = [
table([
bookmarkStart('b1'),
row([cell([paragraph([text('R1')])])]),
bookmarkEnd('b1'),
row([cell([paragraph([text('R2')])])]),
]),
];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

const row1Content = normalizedTable.content[0].content[0].content[0].content;
expect(row1Content).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'text', text: 'R1', marks: [] },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);

const row2Content = normalizedTable.content[1].content[0].content[0].content;
expect(row2Content).toEqual([{ type: 'text', text: 'R2', marks: [] }]);
});

it('creates a cell when a row is empty', () => {
const input = [table([bookmarkStart('b1'), row([]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

const rowContent = normalizedTable.content[0].content;
expect(rowContent).toHaveLength(1);
expect(rowContent[0].type).toBe('tableCell');

const paraContent = rowContent[0].content[0].content;
expect(paraContent).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);
});

it('places bookmarkStart in the cell indicated by colFirst when present; bookmarkEnd uses first/last cell only', () => {
const twoCells = row([cell([paragraph([text('A')])]), cell([paragraph([text('B')])])]);
const input = [table([bookmarkStart('b1', { colFirst: '1' }), twoCells, bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];
const rowContent = normalizedTable.content[0].content;

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);

const firstCellContent = rowContent[0].content[0].content;
expect(firstCellContent).toEqual([{ type: 'text', text: 'A', marks: [] }]);

const secondCellContent = rowContent[1].content[0].content;
expect(secondCellContent[0]).toMatchObject({ type: 'bookmarkStart', attrs: { id: 'b1', colFirst: '1' } });
expect(secondCellContent[1]).toMatchObject({ type: 'text', text: 'B', marks: [] });
expect(secondCellContent[2]).toMatchObject({ type: 'bookmarkEnd', attrs: { id: 'b1' } });
});
});
Loading