diff --git a/pyproject.toml b/pyproject.toml index 38e7174..81b8902 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ nupunkt = [ ] levenshtein = ["Levenshtein>=0.27.1"] pdf = ["pdfplumber>=0.10.0"] +docx = ["lxml>=4.9.0"] [project.scripts] redlines = 'redlines.cli:cli' @@ -86,3 +87,7 @@ ignore_missing_imports = true [[tool.mypy.overrides]] module = "pdfplumber" ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = ["lxml", "lxml.*"] +ignore_missing_imports = true diff --git a/redlines/__init__.py b/redlines/__init__.py index 1d0f930..e4bb5a2 100644 --- a/redlines/__init__.py +++ b/redlines/__init__.py @@ -77,6 +77,7 @@ """ +from .docx import * from .document import * from .enums import * from .pdf import * diff --git a/redlines/cli.py b/redlines/cli.py index b80aa32..5fa20ec 100644 --- a/redlines/cli.py +++ b/redlines/cli.py @@ -328,7 +328,7 @@ def cli() -> None: pass -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.option( @@ -365,7 +365,7 @@ def compare(ctx: click.Context, source: str, test: str, pretty: bool) -> None: _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.option( @@ -417,7 +417,7 @@ def text(ctx: click.Context, source: str, test: str, quiet: bool) -> None: _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.pass_context @@ -441,7 +441,7 @@ def simple_text(ctx: click.Context, source: str, test: str) -> None: _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: 
ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.option( @@ -488,7 +488,7 @@ def markdown( _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.option( @@ -522,7 +522,7 @@ def json(ctx: click.Context, source: str, test: str, pretty: bool) -> None: _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.argument("source", required=True) @click.argument("test", required=True) @click.option( @@ -573,7 +573,7 @@ def stats(ctx: click.Context, source: str, test: str, quiet: bool) -> None: _set_exit_code(ctx, redlines) -@cli.command() # type: ignore[no-untyped-call] +@cli.command() # type: ignore[no-untyped-call, untyped-decorator] @click.option( "--open", "-o", diff --git a/redlines/docx.py b/redlines/docx.py new file mode 100644 index 0000000..20021df --- /dev/null +++ b/redlines/docx.py @@ -0,0 +1,219 @@ +"""DOCX document support for redlines. + +Provides format-aware comparison of ``.docx`` files. Each word-level token +carries both its text and formatting properties (bold, italic, font, paragraph +style, etc.). The diff algorithm treats tokens as different when *either* text +or formatting changes, so a word going from normal to bold shows up as a +replacement even though the text is identical. 
+ +Installation +------------ + +DOCX support requires ``lxml``:: + + pip install lxml + + # or install redlines with the docx extra + pip install redlines[docx] + +Usage +----- + +:: + + from redlines import Redlines + from redlines.docx import DocxFile + + source = DocxFile("contract_v1.docx") + test = DocxFile("contract_v2.docx") + + diff = Redlines(source, test) + print(diff.output_json(pretty=True)) +""" + +from __future__ import annotations + +import os +import re +from difflib import SequenceMatcher +from typing import Any + +from .document import Document +from .processor import Chunk, DiffOperation, RedlinesProcessor, RichToken + +__all__: tuple[str, ...] = ("DocxFile", "DocxProcessor", "DOCX_AVAILABLE") + +try: + from lxml import etree as _etree # type: ignore[attr-defined] # noqa: F401 + + from .docx_parser import parse_docx + + DOCX_AVAILABLE = True +except ImportError: + DOCX_AVAILABLE = False + parse_docx = None # type: ignore[assignment] + +# Re-use the same word-level tokenizer as the plain-text processor. +_tokenizer = re.compile(r"((?:[^()\s]+|[().?!-])\s*)") + + +# ── Helpers ─────────────────────────────────────────────────────────── + +def _tokenize_run_text(text: str) -> list[str]: + """Split run text into word-level tokens.""" + return re.findall(_tokenizer, text) + + +def _build_rich_tokens( + paragraphs: list[dict[str, Any]], +) -> list[RichToken]: + """Convert parsed paragraph data into a flat list of ``RichToken`` objects. + + Paragraph properties are merged into every word token so that each token + carries a complete flat snapshot of both character and paragraph formatting. + Paragraphs are separated by a ``¶`` marker token (matching the convention + used by the plain-text processors). 
+ """ + tokens: list[RichToken] = [] + + for para in paragraphs: + para_props: dict[str, str] = para["properties"] + runs: list[dict[str, Any]] = para["runs"] + + if not runs: + continue + + # Paragraph separator + if tokens: + tokens.append(RichToken(text=" ¶ ", formatting=())) + + for run in runs: + run_text: str = run["text"] + run_props: dict[str, str] = run["properties"] + + # Merge paragraph + run props into a flat dict + merged = {**para_props, **run_props} + formatting = tuple(sorted(merged.items())) + + for word in _tokenize_run_text(run_text): + tokens.append(RichToken(text=word, formatting=formatting)) + + return tokens + + +# ── DocxFile ────────────────────────────────────────────────────────── + +class DocxFile(Document): + """Document class for ``.docx`` files with rich formatting metadata. + + Implements the ``Document`` interface so it can be passed directly to + ``Redlines``. When both source and test are ``DocxFile`` objects the + ``Redlines`` constructor auto-selects ``DocxProcessor`` for format-aware + comparison. + + :param file_path: Path to a ``.docx`` file. + :raises ImportError: If ``lxml`` is not installed. 
+ """ + + _text: str + _rich_tokens: list[RichToken] + _paragraphs: list[dict[str, Any]] + + def __init__(self, file_path: str | bytes | os.PathLike[str]) -> None: + if not DOCX_AVAILABLE: + raise ImportError( + "Missing required package: lxml.\n" + "\n" + "Cause: The lxml package is required for DOCX support but is not installed.\n" + "\n" + "To fix: Install lxml:\n" + " pip install lxml\n" + "\n" + " # Install redlines with DOCX support\n" + " pip install redlines[docx]\n" + ) + + self._paragraphs = parse_docx(file_path) + self._rich_tokens = _build_rich_tokens(self._paragraphs) + + # Plain text for backward-compatible .text property + text_parts: list[str] = [] + for para in self._paragraphs: + para_text = "".join(run["text"] for run in para["runs"]) + if para_text: + text_parts.append(para_text) + self._text = "\n\n".join(text_parts) + + @property + def text(self) -> str: + """Plain text extracted from the document (no formatting).""" + return self._text + + @property + def rich_tokens(self) -> list[RichToken]: + """Flat list of word-level tokens with formatting metadata.""" + return self._rich_tokens + + @property + def paragraphs(self) -> list[dict[str, Any]]: + """Raw paragraph structure as returned by the parser.""" + return self._paragraphs + + +# ── DocxProcessor ───────────────────────────────────────────────────── + +class DocxProcessor(RedlinesProcessor): + """Processor that compares two ``DocxFile`` documents at word level. + + Comparison uses ``RichToken`` objects so that both text *and* formatting + are considered. A word whose text is unchanged but whose formatting + differs will appear as a ``replace`` operation. 
+ """ + + def process( + self, source: Document | str, test: Document | str + ) -> list[DiffOperation]: + if not isinstance(source, DocxFile) or not isinstance(test, DocxFile): + raise TypeError( + "DocxProcessor requires DocxFile inputs.\n" + "\n" + "Cause: Both source and test must be DocxFile instances.\n" + "\n" + "To fix:\n" + " source = DocxFile('old.docx')\n" + " test = DocxFile('new.docx')\n" + " diff = Redlines(source, test)\n" + ) + + source_tokens = source.rich_tokens + test_tokens = test.rich_tokens + + # Normalize text whitespace for comparison, keeping formatting intact. + source_norm = [rt.normalized() for rt in source_tokens] + test_norm = [rt.normalized() for rt in test_tokens] + + matcher = SequenceMatcher(None, source_norm, test_norm) + + # Plain-text token lists for backward-compatible Chunk.text + source_text_tokens = [rt.text for rt in source_tokens] + test_text_tokens = [rt.text for rt in test_tokens] + + source_chunk = Chunk( + text=source_text_tokens, + chunk_location=None, + rich_tokens=source_tokens, + ) + test_chunk = Chunk( + text=test_text_tokens, + chunk_location=None, + rich_tokens=test_tokens, + ) + + return [ + DiffOperation( + source_chunk=source_chunk, + test_chunk=test_chunk, + opcodes=opcode, + ) + for opcode in matcher.get_opcodes() + ] diff --git a/redlines/docx_parser.py b/redlines/docx_parser.py new file mode 100644 index 0000000..5c90b68 --- /dev/null +++ b/redlines/docx_parser.py @@ -0,0 +1,251 @@ +"""Low-level DOCX XML parsing using zipfile + lxml. + +Extracts paragraph and run structure from ``word/document.xml`` inside a +``.docx`` file. Each paragraph is returned as a dict with ``properties`` +(paragraph-level formatting) and ``runs`` (list of text segments with +character-level formatting). + +Only the main document body is parsed — headers, footers, comments and +footnotes are ignored. Track-change markup (````/````) is +read as-is (accepted state) rather than resolved. 
+""" + +from __future__ import annotations + +import os +import zipfile +from io import BytesIO +from typing import Any + +from lxml import etree # type: ignore[attr-defined] + +# ── OOXML namespaces ───────────────────────────────────────────────── +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +W = f"{{{WORD_NS}}}" + + +# ── Public API ──────────────────────────────────────────────────────── + +def parse_docx( + source: str | bytes | os.PathLike[str] | BytesIO, +) -> list[dict[str, Any]]: + """Parse a DOCX file and return a list of paragraph dicts. + + Each paragraph dict has: + - ``properties``: ``dict[str, str]`` of paragraph-level formatting. + - ``runs``: ``list[dict]`` where each run has ``text`` (str) and + ``properties`` (``dict[str, str]``). + + :param source: File path, raw bytes of the ``.docx``, or a file-like + ``BytesIO`` object. + :returns: List of paragraph dicts. + """ + if isinstance(source, bytes): + buf = BytesIO(source) + elif isinstance(source, BytesIO): + buf = source + else: + buf = str(source) # type: ignore[assignment] + + with zipfile.ZipFile(buf) as zf: + doc_xml = zf.read("word/document.xml") + + root = etree.fromstring(doc_xml) + body = root.find(f"{W}body") + if body is None: + return [] + + return [_parse_paragraph(p) for p in body.findall(f"{W}p")] + + +# ── Paragraph parsing ───────────────────────────────────────────────── + +def _parse_paragraph(p_elem: etree._Element) -> dict[str, Any]: + ppr = p_elem.find(f"{W}pPr") + props = _extract_paragraph_properties(ppr) + + runs: list[dict[str, Any]] = [] + for child in p_elem: + tag = _local_tag(child) + if tag == "r": + run = _parse_run(child) + if run["text"]: + runs.append(run) + # Track-change wrappers — read the accepted content inside them. 
+ elif tag in ("ins", "del"): + for inner in child.findall(f"{W}r"): + run = _parse_run(inner) + if run["text"]: + runs.append(run) + + return {"properties": props, "runs": runs} + + +# ── Run parsing ─────────────────────────────────────────────────────── + +def _parse_run(r_elem: etree._Element) -> dict[str, Any]: + rpr = r_elem.find(f"{W}rPr") + props = _extract_run_properties(rpr) + + text_parts: list[str] = [] + for child in r_elem: + tag = _local_tag(child) + if tag == "t": + text_parts.append(child.text or "") + elif tag == "tab": + text_parts.append("\t") + elif tag == "br": + text_parts.append("\n") + + return {"text": "".join(text_parts), "properties": props} + + +# ── Attribute helpers ───────────────────────────────────────────────── + +def _local_tag(elem: etree._Element) -> str: + """Return the local element name without namespace.""" + tag = elem.tag + if isinstance(tag, str) and "}" in tag: + return tag.split("}", 1)[1] + return str(tag) + + +def _get_val(elem: etree._Element) -> str | None: + """Get the ``val`` attribute, trying both namespaced and plain.""" + val: str | None = elem.get(f"{W}val") # type: ignore[assignment] # lxml returns Any + if val is None: + val = elem.get("val") # type: ignore[assignment] + return val + + +def _is_toggle_on(elem: etree._Element | None) -> bool | None: + """Interpret an OOXML toggle element. 
+ + * ```` (no val) → True + * ```` → True + * ```` → False + * element absent → None + """ + if elem is None: + return None + val = _get_val(elem) + if val is None: + return True + return val.lower() not in ("false", "0", "off", "none") + + +# ── Property extraction ────────────────────────────────────────────── + +def _extract_paragraph_properties( + ppr: etree._Element | None, +) -> dict[str, str]: + props: dict[str, str] = {} + if ppr is None: + return props + + # Style name + pstyle = ppr.find(f"{W}pStyle") + if pstyle is not None: + val = _get_val(pstyle) + if val: + props["paragraph_style"] = val + + # Justification / alignment + jc = ppr.find(f"{W}jc") + if jc is not None: + val = _get_val(jc) + if val: + props["alignment"] = val + + # Indentation + ind = ppr.find(f"{W}ind") + if ind is not None: + for attr in ("left", "right", "hanging", "firstLine"): + val = ind.get(f"{W}{attr}") or ind.get(attr) + if val: + props[f"indent_{attr}"] = val + + # Spacing + spacing = ppr.find(f"{W}spacing") + if spacing is not None: + for attr in ("before", "after", "line", "lineRule"): + val = spacing.get(f"{W}{attr}") or spacing.get(attr) + if val: + props[f"spacing_{attr}"] = val + + # Numbering (lists) + numpr = ppr.find(f"{W}numPr") + if numpr is not None: + ilvl = numpr.find(f"{W}ilvl") + if ilvl is not None: + val = _get_val(ilvl) + if val: + props["num_level"] = val + numid = numpr.find(f"{W}numId") + if numid is not None: + val = _get_val(numid) + if val: + props["num_id"] = val + + return props + + +def _extract_run_properties( + rpr: etree._Element | None, +) -> dict[str, str]: + props: dict[str, str] = {} + if rpr is None: + return props + + # Boolean toggles + for prop_name in ("b", "i", "strike", "dstrike", "caps", "smallCaps"): + elem = rpr.find(f"{W}{prop_name}") + toggle = _is_toggle_on(elem) + if toggle is not None: + props[prop_name] = str(toggle).lower() + + # Underline + u_elem = rpr.find(f"{W}u") + if u_elem is not None: + u_val = _get_val(u_elem) + if 
u_val and u_val.lower() != "none": + props["u"] = u_val + + # Font family + rfonts = rpr.find(f"{W}rFonts") + if rfonts is not None: + for attr in ("ascii", "hAnsi", "eastAsia", "cs"): + font_val: str | None = rfonts.get(f"{W}{attr}") or rfonts.get(attr) # type: ignore[assignment] + if font_val: + props["font"] = font_val + break + + # Font size (half-points) + sz = rpr.find(f"{W}sz") + if sz is not None: + sz_val = _get_val(sz) + if sz_val: + props["sz"] = sz_val + + # Color + color = rpr.find(f"{W}color") + if color is not None: + color_val = _get_val(color) + if color_val: + props["color"] = color_val + + # Highlight + highlight = rpr.find(f"{W}highlight") + if highlight is not None: + hl_val = _get_val(highlight) + if hl_val: + props["highlight"] = hl_val + + # Superscript / subscript + vert_align = rpr.find(f"{W}vertAlign") + if vert_align is not None: + va_val = _get_val(vert_align) + if va_val: + props["vertAlign"] = va_val + + return props diff --git a/redlines/processor.py b/redlines/processor.py index ab287fe..80ca030 100644 --- a/redlines/processor.py +++ b/redlines/processor.py @@ -32,6 +32,7 @@ "Stats", "DiffOperation", "Chunk", + "RichToken", ) tokenizer = re.compile(r"((?:[^()\s]+|[().?!-])\s*)") @@ -183,6 +184,36 @@ def concatenate_sentences_and_add_chr_182(text: str) -> str: return "".join(result) +@dataclass(frozen=True) +class RichToken: + """A word-level token carrying text and formatting metadata. + + Used for format-aware comparison (e.g. DOCX documents). Two tokens are + equal only if both their text and formatting match, so "Hello" in bold + is different from "Hello" in normal. + + The formatting field is a sorted tuple of (key, value) string pairs, + making it hashable and usable with SequenceMatcher. + """ + + text: str + """The word text including any trailing whitespace.""" + formatting: tuple[tuple[str, str], ...] 
@dataclass(frozen=True)
class RichToken:
    """A word-level token carrying text and formatting metadata.

    Used for format-aware comparison (e.g. DOCX documents). Two tokens are
    equal only if both their text and formatting match, so "Hello" in bold
    is different from "Hello" in normal weight.

    ``formatting`` is a sorted tuple of (key, value) string pairs, which
    keeps the token hashable and usable with SequenceMatcher.
    """

    # The word text, including any trailing whitespace.
    text: str
    # Sorted (key, value) pairs of formatting properties.
    formatting: tuple[tuple[str, str], ...] = ()

    def __str__(self) -> str:
        return self.text

    @property
    def formatting_dict(self) -> dict[str, str]:
        """Return formatting as a plain dict (for JSON serialization)."""
        return dict(self.formatting)

    def normalized(self) -> "RichToken":
        """Return a copy with stripped text (for comparison), preserving formatting."""
        return RichToken(text=self.text.strip(), formatting=self.formatting)


def _formatting_diff(
    source_tokens: "list[RichToken]", test_tokens: "list[RichToken]"
) -> "dict[str, dict[str, str | None]]":
    """Compare formatting across two slices of RichTokens.

    Returns ``{property: {"from": old_val, "to": new_val}}`` for every
    property that differs between the two slices. Properties are aggregated
    across all tokens in a slice -- when a property has several distinct
    values within one slice, the most common value represents the slice.
    """
    from collections import Counter

    def _aggregate(tokens: "list[RichToken]") -> "dict[str, str]":
        # Tally each property value, then keep the plurality winner per key.
        tallies: "dict[str, Counter[str]]" = {}
        for tok in tokens:
            for key, value in tok.formatting:
                tallies.setdefault(key, Counter())[value] += 1
        return {key: ctr.most_common(1)[0][0] for key, ctr in tallies.items()}

    before = _aggregate(source_tokens)
    after = _aggregate(test_tokens)

    diff: "dict[str, dict[str, str | None]]" = {}
    for key in sorted(set(before) | set(after)):
        old = before.get(key)
        new = after.get(key)
        if old != new:
            diff[key] = {"from": old, "to": new}
    return diff


# Friendly display names for formatting properties.
_PROP_LABELS: dict = {
    "b": "bold",
    "i": "italic",
    "u": "underline",
    "strike": "strikethrough",
    "dstrike": "double-strikethrough",
    "caps": "caps",
    "smallCaps": "small-caps",
    "sz": "font-size",
    "font": "font",
    "color": "color",
    "highlight": "highlight",
    "vertAlign": "vertical-align",
    "paragraph_style": "style",
    "alignment": "alignment",
}


def _describe_formatting_changes(
    source_rich: "list[RichToken]",
    test_rich: "list[RichToken]",
) -> str:
    """Return a compact human-readable summary of formatting differences.

    Examples:
        ``"+bold"``  ``"-italic"``  ``"style: Normal→Heading2"``
        ``"+bold, alignment: center→left"``
    """
    changes = _formatting_diff(source_rich, test_rich)
    if not changes:
        return ""

    parts: list = []
    for key, change in changes.items():
        label = _PROP_LABELS.get(key, key)
        old = change["from"]
        new = change["to"]
        if new == "true" and old is None:
            # Toggle switched on: "+bold"
            parts.append(f"+{label}")
        elif old == "true" and new is None:
            # Toggle switched off: "-bold"
            parts.append(f"-{label}")
        elif old is not None and new is not None:
            # Value changed: "style: Normal→Heading2"
            parts.append(f"{label}: {old}\u2192{new}")
        elif old is None and new is not None:
            parts.append(f"+{label}: {new}")
        elif old is not None and new is None:
            parts.append(f"-{label}: {old}")
    return ", ".join(parts)
Defaults to 'redline-formatting-note'.""" class Redlines: @@ -185,6 +275,18 @@ def __init__( ``` + For DOCX comparison with formatting awareness: + + ```python + from redlines import Redlines + from redlines.docx import DocxFile + + source = DocxFile("old.docx") + test = DocxFile("new.docx") + diff = Redlines(source, test) + print(diff.output_json(pretty=True)) + ``` + For advanced use cases, you can specify a custom processor for different tokenization strategies: ```python @@ -209,10 +311,19 @@ def __init__( :param options: Additional options for comparison and output formatting. :type options: RedlinesOptions """ - self.processor = processor if processor is not None else WholeDocumentProcessor() - self.source = source.text if isinstance(source, Document) else source self.options = options self._diff_operations = None + + # Auto-detect DOCX inputs and use format-aware processor. + if isinstance(source, DocxFile) and isinstance(test, DocxFile): + self.processor = processor if processor is not None else DocxProcessor() + self._source = source.text + self._test = test.text + self._diff_operations = self.processor.process(source, test) + return + + self.processor = processor if processor is not None else WholeDocumentProcessor() + self.source = source.text if isinstance(source, Document) else source if test is not None: self.test = test.text if isinstance(test, Document) else test # self.compare() @@ -516,6 +627,11 @@ def output_markdown(self) -> str: You can also set your own CSS classes by specifying the name of the CSS class in the options `ins_class` and `del_class` respectively in the constructor or compare function. + When comparing DOCX files, formatting-only changes (where text is unchanged but formatting differs) + are rendered using separate `fmt` and `fmt_note` style pairs. 
For `custom_css`, these use the + "redline-formatting" and "redline-formatting-note" CSS classes by default, which can be overridden + using the `fmt_class` and `fmt_note_class` options. + ## Markdown output in specific environments Users have reported that the output doesn't display correctly in their environments. @@ -553,32 +669,56 @@ def output_markdown(self) -> str: # default_style = "red_green" - md_styles = { + # Formatting-change styles used alongside ins/del when rich tokens + # are available. "fmt" wraps the text of a formatting-only change, + # "fmt_note" wraps the human-readable annotation like "[+bold]". + md_styles: dict[str, tuple[str, str]] = { "ins": ( - f"", + "", "", ), "del": ( - f"", + "", "", ), + "fmt": ( + "", + "", + ), + "fmt_note": ( + "", + "", + ), } if "markdown_style" in self.options: style = self.options["markdown_style"] if style == "none" or style is None: - md_styles = {"ins": ("", ""), "del": ("", "")} + md_styles = { + "ins": ("", ""), + "del": ("", ""), + "fmt": ("", ""), + "fmt_note": ("", ""), + } elif style == "red": md_styles = { "ins": ( - f"", + "", "", ), "del": ( - f"", + "", + "", + ), + "fmt": ( + "", "", ), + "fmt_note": ( + "", + "", + ), } elif style == "custom_css": ins_class = ( @@ -591,36 +731,63 @@ def output_markdown(self) -> str: if "del_class" in self.options else "redline-deleted" ) - - elem_attributes = { - "ins": f"class='{ins_class}'", - "del": f"class='{del_class}'", - } + fmt_class = ( + self.options["fmt_class"] + if "fmt_class" in self.options + else "redline-formatting" + ) + fmt_note_class = ( + self.options["fmt_note_class"] + if "fmt_note_class" in self.options + else "redline-formatting-note" + ) md_styles = { "ins": ( - f"", + f"", "", ), "del": ( - f"", + f"", + "", + ), + "fmt": ( + f"", "", ), + "fmt_note": ( + f"", + "", + ), } elif style == "ghfm": - md_styles = {"ins": ("**", "**"), "del": ("~~", "~~")} + md_styles = { + "ins": ("**", "**"), + "del": ("~~", "~~"), + "fmt": ("***", "***"), 
+ "fmt_note": (" _", "_"), + } elif style == "bbcode": md_styles = { "ins": ("[b][color=green]", "[/color][/b]"), "del": ("[s][color=red]", "[/color][/s]"), + "fmt": ("[u][color=blue]", "[/color][/u]"), + "fmt_note": ("[i][color=blue]", "[/color][/i]"), } elif style == "streamlit": - md_styles = {"ins": ("**:green[", "]** "), "del": ("~~:red[", "]~~ ")} + md_styles = { + "ins": ("**:green[", "]** "), + "del": ("~~:red[", "]~~ "), + "fmt": ("**:blue[", "]** "), + "fmt_note": ("_:blue[", "]_ "), + } for diff_op in self._diff_ops: tag, i1, i2, j1, j2 = diff_op.opcodes source_tokens = diff_op.source_chunk.text test_tokens = diff_op.test_chunk.text + source_rich = diff_op.source_chunk.rich_tokens + test_rich = diff_op.test_chunk.rich_tokens if tag == "equal": temp_str = "".join(source_tokens[i1:i2]) @@ -643,16 +810,45 @@ def output_markdown(self) -> str: # for 'delete', we make no change, because otherwise there will be two times '\n\n' than the original # text. elif tag == "replace": - result.append( - f"{md_styles['del'][0]}{''.join(source_tokens[i1:i2])}{md_styles['del'][1]}" - ) - temp_str = "".join(test_tokens[j1:j2]) - splits = re.split("¶ ", temp_str) - for split in splits: - result.append(f"{md_styles['ins'][0]}{split}{md_styles['ins'][1]}") - result.append("\n\n") - if len(splits) > 0: - result.pop() + src_text = "".join(source_tokens[i1:i2]) + tst_text = "".join(test_tokens[j1:j2]) + text_same = src_text.strip() == tst_text.strip() + + if text_same and source_rich is not None and test_rich is not None: + # Formatting-only change — show text once with annotation. + desc = _describe_formatting_changes( + source_rich[i1:i2], test_rich[j1:j2] + ) + clean = re.sub("¶ ", "\n\n", tst_text) + result.append( + f"{md_styles['fmt'][0]}{clean}{md_styles['fmt'][1]}" + ) + if desc: + result.append( + f"{md_styles['fmt_note'][0]}[{desc}]{md_styles['fmt_note'][1]}" + ) + else: + # Text changed — standard del + ins. 
+ result.append( + f"{md_styles['del'][0]}{src_text}{md_styles['del'][1]}" + ) + temp_str = tst_text + splits = re.split("¶ ", temp_str) + for split in splits: + result.append(f"{md_styles['ins'][0]}{split}{md_styles['ins'][1]}") + result.append("\n\n") + if len(splits) > 0: + result.pop() + + # If there's also a formatting diff, append a note. + if source_rich is not None and test_rich is not None: + desc = _describe_formatting_changes( + source_rich[i1:i2], test_rich[j1:j2] + ) + if desc: + result.append( + f"{md_styles['fmt_note'][0]}[{desc}]{md_styles['fmt_note'][1]}" + ) return "".join(result) @@ -670,6 +866,8 @@ def output_rich(self) -> Text: tag, i1, i2, j1, j2 = diff_op.opcodes source_tokens = diff_op.source_chunk.text test_tokens = diff_op.test_chunk.text + source_rich = diff_op.source_chunk.rich_tokens + test_rich = diff_op.test_chunk.rich_tokens if tag == "equal": temp_str = "".join(source_tokens[i1:i2]) @@ -683,11 +881,31 @@ def output_rich(self) -> Text: elif tag == "delete": console_text.append("".join(source_tokens[i1:i2]), "strike red") elif tag == "replace": - console_text.append("".join(source_tokens[i1:i2]), "strike red") - temp_str = "".join(test_tokens[j1:j2]) - splits = re.split("¶ ", temp_str) - for split in splits: - console_text.append(split, "green") + src_text = "".join(source_tokens[i1:i2]) + tst_text = "".join(test_tokens[j1:j2]) + text_same = src_text.strip() == tst_text.strip() + + if text_same and source_rich is not None and test_rich is not None: + desc = _describe_formatting_changes( + source_rich[i1:i2], test_rich[j1:j2] + ) + clean = re.sub("¶ ", "\n\n", tst_text) + console_text.append(clean, "bold blue") + if desc: + console_text.append(f"[{desc}]", "blue") + else: + console_text.append(src_text, "strike red") + temp_str = tst_text + splits = re.split("¶ ", temp_str) + for split in splits: + console_text.append(split, "green") + + if source_rich is not None and test_rich is not None: + desc = _describe_formatting_changes( + 
source_rich[i1:i2], test_rich[j1:j2] + ) + if desc: + console_text.append(f"[{desc}]", "blue") return console_text @@ -737,12 +955,18 @@ def output_json(self, pretty: bool = False) -> str: # All operations share the same source and test tokens if not self._diff_ops: # Handle edge case of empty diff (e.g., both texts are empty or whitespace-only) - source_tokens = [] - test_tokens = [] + source_tokens: list[str] = [] + test_tokens: list[str] = [] + source_rich: list[RichToken] | None = None + test_rich: list[RichToken] | None = None else: first_op = self._diff_ops[0] source_tokens = first_op.source_chunk.text test_tokens = first_op.test_chunk.text + source_rich = first_op.source_chunk.rich_tokens + test_rich = first_op.test_chunk.rich_tokens + + has_rich = source_rich is not None and test_rich is not None # Don't clean tokens individually - we need to join them first then replace ¶ # This matches the approach in output_markdown (line 470) @@ -783,6 +1007,10 @@ def output_json(self, pretty: bool = False) -> str: "source_token_position": [i1, i2], "test_token_position": [j1, j2], } + if has_rich: + change["source_formatting"] = [ + rt.formatting_dict for rt in source_rich[i1:i2] # type: ignore[index] + ] source_char_offset += len(source_text) test_char_offset += len(test_text) elif tag == "delete": @@ -797,6 +1025,10 @@ def output_json(self, pretty: bool = False) -> str: "source_token_position": [i1, i2], "test_token_position": None, } + if has_rich: + change["source_formatting"] = [ + rt.formatting_dict for rt in source_rich[i1:i2] # type: ignore[index] + ] source_char_offset += len(source_text) elif tag == "insert": change = { @@ -810,6 +1042,10 @@ def output_json(self, pretty: bool = False) -> str: "source_token_position": None, "test_token_position": [j1, j2], } + if has_rich: + change["test_formatting"] = [ + rt.formatting_dict for rt in test_rich[j1:j2] # type: ignore[index] + ] test_char_offset += len(test_text) elif tag == "replace": change = { @@ -827,6 
+1063,20 @@ def output_json(self, pretty: bool = False) -> str: "source_token_position": [i1, i2], "test_token_position": [j1, j2], } + if has_rich: + src_rich_slice = source_rich[i1:i2] # type: ignore[index] + tst_rich_slice = test_rich[j1:j2] # type: ignore[index] + change["source_formatting"] = [ + rt.formatting_dict for rt in src_rich_slice + ] + change["test_formatting"] = [ + rt.formatting_dict for rt in tst_rich_slice + ] + # Summarise what actually changed between source and test. + change["text_changed"] = source_text.strip() != test_text.strip() + fmt_changes = _formatting_diff(src_rich_slice, tst_rich_slice) + if fmt_changes: + change["formatting_changes"] = fmt_changes source_char_offset += len(source_text) test_char_offset += len(test_text) else: @@ -844,12 +1094,24 @@ def output_json(self, pretty: bool = False) -> str: # Build final JSON structure # Clean tokens for output by replacing paragraph markers - output_source_tokens = [ - token.replace("¶ ", "\n\n") for token in cleaned_source_tokens - ] - output_test_tokens = [ - token.replace("¶ ", "\n\n") for token in cleaned_test_tokens - ] + output_source_tokens: list[t.Any] + output_test_tokens: list[t.Any] + if has_rich: + output_source_tokens = [ + {"text": rt.text.replace("¶ ", "\n\n"), "formatting": rt.formatting_dict} + for rt in (source_rich or []) + ] + output_test_tokens = [ + {"text": rt.text.replace("¶ ", "\n\n"), "formatting": rt.formatting_dict} + for rt in (test_rich or []) + ] + else: + output_source_tokens = [ + token.replace("¶ ", "\n\n") for token in cleaned_source_tokens + ] + output_test_tokens = [ + token.replace("¶ ", "\n\n") for token in cleaned_test_tokens + ] result = { "source": self.source, diff --git a/tests/documents/DocxFile/source.docx b/tests/documents/DocxFile/source.docx new file mode 100644 index 0000000..d0ce69d Binary files /dev/null and b/tests/documents/DocxFile/source.docx differ diff --git a/tests/documents/DocxFile/test.docx b/tests/documents/DocxFile/test.docx 
new file mode 100644 index 0000000..2415074 Binary files /dev/null and b/tests/documents/DocxFile/test.docx differ diff --git a/tests/test_docx.py b/tests/test_docx.py new file mode 100644 index 0000000..fb31485 --- /dev/null +++ b/tests/test_docx.py @@ -0,0 +1,883 @@ +"""Tests for DOCX document comparison with formatting awareness.""" + +from __future__ import annotations + +import json +import os +import tempfile +import zipfile +from typing import Any + +import pytest + +# ── DOCX fixture builder ───────────────────────────────────────────── +# Creates minimal valid .docx files from paragraph/run descriptions so +# tests are self-contained (no binary fixture files needed). + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships" +CT_NS = "http://schemas.openxmlformats.org/package/2006/content-types" +OFFREL_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + + +def _build_run_xml(text: str, props: dict[str, str] | None = None) -> str: + """Build a element string.""" + rpr = "" + if props: + rpr_parts: list[str] = [] + for key, val in props.items(): + if key in ("b", "i", "strike", "dstrike", "caps", "smallCaps"): + if val == "true": + rpr_parts.append(f'') + else: + rpr_parts.append(f'') + elif key == "u": + rpr_parts.append(f'') + elif key == "sz": + rpr_parts.append(f'') + elif key == "color": + rpr_parts.append(f'') + elif key == "font": + rpr_parts.append(f'') + elif key == "highlight": + rpr_parts.append(f'') + elif key == "vertAlign": + rpr_parts.append(f'') + if rpr_parts: + rpr = "" + "".join(rpr_parts) + "" + + # xml:space="preserve" keeps leading/trailing spaces + return f'{rpr}{text}' + + +def _build_para_xml( + runs: list[dict[str, Any]], + props: dict[str, str] | None = None, +) -> str: + """Build a element string. + + Each run dict has ``text`` and optional ``props``. 
+ """ + ppr = "" + if props: + ppr_parts: list[str] = [] + if "paragraph_style" in props: + ppr_parts.append(f'') + if "alignment" in props: + ppr_parts.append(f'') + if ppr_parts: + ppr = "" + "".join(ppr_parts) + "" + + run_xml = "".join( + _build_run_xml(r["text"], r.get("props")) for r in runs + ) + return f"{ppr}{run_xml}" + + +def build_docx(paragraphs: list[dict[str, Any]]) -> bytes: + """Build a minimal .docx (ZIP) from a list of paragraph descriptions. + + Each paragraph dict: + - ``runs``: list of ``{"text": str, "props": dict | None}`` + - ``props``: optional dict of paragraph-level properties + """ + body_parts = [_build_para_xml(p["runs"], p.get("props")) for p in paragraphs] + body_xml = "".join(body_parts) + + document_xml = ( + f'' + f'' + f"{body_xml}" + f"" + ) + + content_types = ( + f'' + f'' + f'' + f'' + f'' + f'' + ) + + rels = ( + f'' + f'' + f'' + f'' + ) + + import io + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", content_types) + zf.writestr("_rels/.rels", rels) + zf.writestr("word/document.xml", document_xml) + return buf.getvalue() + + +def write_docx(path: str, paragraphs: list[dict[str, Any]]) -> None: + """Write a .docx to *path*.""" + with open(path, "wb") as f: + f.write(build_docx(paragraphs)) + + +# ── Skip if lxml not available ──────────────────────────────────────── + +pytest.importorskip("lxml") + +from redlines import Redlines # noqa: E402 +from redlines.docx import DocxFile, DocxProcessor, DOCX_AVAILABLE # noqa: E402 +from redlines.processor import RichToken # noqa: E402 + + +# ── Parser tests ────────────────────────────────────────────────────── + +class TestDocxParser: + """Test the low-level XML parser.""" + + def test_simple_paragraph(self, tmp_path: Any) -> None: + path = str(tmp_path / "simple.docx") + write_docx(path, [ + {"runs": [{"text": "Hello world"}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + 
+ assert len(paragraphs) == 1 + assert len(paragraphs[0]["runs"]) == 1 + assert paragraphs[0]["runs"][0]["text"] == "Hello world" + + def test_bold_run(self, tmp_path: Any) -> None: + path = str(tmp_path / "bold.docx") + write_docx(path, [ + {"runs": [{"text": "Bold text", "props": {"b": "true"}}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + run = paragraphs[0]["runs"][0] + assert run["properties"]["b"] == "true" + + def test_multiple_runs(self, tmp_path: Any) -> None: + path = str(tmp_path / "multi.docx") + write_docx(path, [ + {"runs": [ + {"text": "Normal "}, + {"text": "bold ", "props": {"b": "true"}}, + {"text": "italic", "props": {"i": "true"}}, + ]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + runs = paragraphs[0]["runs"] + assert len(runs) == 3 + assert runs[0]["text"] == "Normal " + assert "b" not in runs[0]["properties"] + assert runs[1]["properties"]["b"] == "true" + assert runs[2]["properties"]["i"] == "true" + + def test_paragraph_style(self, tmp_path: Any) -> None: + path = str(tmp_path / "styled.docx") + write_docx(path, [ + { + "runs": [{"text": "Heading"}], + "props": {"paragraph_style": "Heading1"}, + }, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + assert paragraphs[0]["properties"]["paragraph_style"] == "Heading1" + + def test_paragraph_alignment(self, tmp_path: Any) -> None: + path = str(tmp_path / "aligned.docx") + write_docx(path, [ + { + "runs": [{"text": "Centered"}], + "props": {"alignment": "center"}, + }, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + assert paragraphs[0]["properties"]["alignment"] == "center" + + def test_font_and_size(self, tmp_path: Any) -> None: + path = str(tmp_path / "font.docx") + write_docx(path, [ + {"runs": [{"text": "Big text", "props": {"sz": "48", "font": "Arial"}}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = 
parse_docx(path) + + run = paragraphs[0]["runs"][0] + assert run["properties"]["sz"] == "48" + assert run["properties"]["font"] == "Arial" + + def test_color_and_highlight(self, tmp_path: Any) -> None: + path = str(tmp_path / "color.docx") + write_docx(path, [ + {"runs": [{"text": "Colored", "props": {"color": "FF0000", "highlight": "yellow"}}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + run = paragraphs[0]["runs"][0] + assert run["properties"]["color"] == "FF0000" + assert run["properties"]["highlight"] == "yellow" + + def test_empty_document(self, tmp_path: Any) -> None: + path = str(tmp_path / "empty.docx") + write_docx(path, []) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + assert paragraphs == [] + + def test_multiple_paragraphs(self, tmp_path: Any) -> None: + path = str(tmp_path / "multi_para.docx") + write_docx(path, [ + {"runs": [{"text": "First paragraph"}]}, + {"runs": [{"text": "Second paragraph"}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + assert len(paragraphs) == 2 + assert paragraphs[0]["runs"][0]["text"] == "First paragraph" + assert paragraphs[1]["runs"][0]["text"] == "Second paragraph" + + +# ── RichToken tests ─────────────────────────────────────────────────── + +class TestRichToken: + def test_equality_same(self) -> None: + a = RichToken("hello", (("b", "true"),)) + b = RichToken("hello", (("b", "true"),)) + assert a == b + assert hash(a) == hash(b) + + def test_inequality_different_text(self) -> None: + a = RichToken("hello", (("b", "true"),)) + b = RichToken("world", (("b", "true"),)) + assert a != b + + def test_inequality_different_formatting(self) -> None: + a = RichToken("hello", (("b", "true"),)) + b = RichToken("hello", (("b", "false"),)) + assert a != b + + def test_inequality_formatting_vs_none(self) -> None: + a = RichToken("hello", (("b", "true"),)) + b = RichToken("hello", ()) + assert a != b + + def 
test_str(self) -> None: + t = RichToken("word ", (("i", "true"),)) + assert str(t) == "word " + + def test_formatting_dict(self) -> None: + t = RichToken("x", (("b", "true"), ("sz", "24"))) + assert t.formatting_dict == {"b": "true", "sz": "24"} + + def test_normalized(self) -> None: + t = RichToken("word ", (("b", "true"),)) + n = t.normalized() + assert n.text == "word" + assert n.formatting == t.formatting + + +# ── DocxFile tests ──────────────────────────────────────────────────── + +class TestDocxFile: + def test_text_property(self, tmp_path: Any) -> None: + path = str(tmp_path / "test.docx") + write_docx(path, [ + {"runs": [{"text": "Hello world"}]}, + ]) + + doc = DocxFile(path) + assert doc.text == "Hello world" + + def test_text_multiple_paragraphs(self, tmp_path: Any) -> None: + path = str(tmp_path / "test.docx") + write_docx(path, [ + {"runs": [{"text": "First"}]}, + {"runs": [{"text": "Second"}]}, + ]) + + doc = DocxFile(path) + assert doc.text == "First\n\nSecond" + + def test_rich_tokens(self, tmp_path: Any) -> None: + path = str(tmp_path / "test.docx") + write_docx(path, [ + {"runs": [ + {"text": "Hello ", "props": {"b": "true"}}, + {"text": "world"}, + ]}, + ]) + + doc = DocxFile(path) + tokens = doc.rich_tokens + + # "Hello " is one token, "world" is another + assert len(tokens) == 2 + hello_tok = tokens[0] + assert hello_tok.text == "Hello " + assert ("b", "true") in hello_tok.formatting + + world_tok = tokens[1] + assert world_tok.text == "world" + assert ("b", "true") not in world_tok.formatting + + def test_rich_tokens_paragraph_separator(self, tmp_path: Any) -> None: + path = str(tmp_path / "test.docx") + write_docx(path, [ + {"runs": [{"text": "First"}]}, + {"runs": [{"text": "Second"}]}, + ]) + + doc = DocxFile(path) + tokens = doc.rich_tokens + + # Should be: "First", " ¶ ", "Second" + assert len(tokens) == 3 + assert tokens[1].text == " ¶ " + + def test_rich_tokens_carry_paragraph_props(self, tmp_path: Any) -> None: + path = str(tmp_path 
/ "test.docx") + write_docx(path, [ + { + "runs": [{"text": "Heading text"}], + "props": {"paragraph_style": "Heading1"}, + }, + ]) + + doc = DocxFile(path) + tokens = doc.rich_tokens + + # Each word token carries paragraph_style + for tok in tokens: + if tok.text.strip(): + assert ("paragraph_style", "Heading1") in tok.formatting + + +# ── DocxProcessor tests ────────────────────────────────────────────── + +class TestDocxProcessor: + def test_identical_docs(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + paras = [{"runs": [{"text": "Hello world"}]}] + write_docx(path1, paras) + write_docx(path2, paras) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + proc = DocxProcessor() + ops = proc.process(doc1, doc2) + + # Only equal operations + assert all(op.opcodes[0] == "equal" for op in ops) + + def test_text_change_detected(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + proc = DocxProcessor() + ops = proc.process(doc1, doc2) + + tags = [op.opcodes[0] for op in ops] + assert "replace" in tags + + def test_formatting_change_detected(self, tmp_path: Any) -> None: + """Same text but different formatting should be a replace.""" + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello world", "props": {"b": "true"}}]}]) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + proc = DocxProcessor() + ops = proc.process(doc1, doc2) + + tags = [op.opcodes[0] for op in ops] + assert "replace" in tags + + def test_paragraph_style_change_detected(self, tmp_path: Any) -> None: + """Changing paragraph style should be a replace.""" + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / 
"b.docx") + write_docx(path1, [ + {"runs": [{"text": "Some text"}], "props": {"paragraph_style": "Normal"}}, + ]) + write_docx(path2, [ + {"runs": [{"text": "Some text"}], "props": {"paragraph_style": "Heading1"}}, + ]) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + proc = DocxProcessor() + ops = proc.process(doc1, doc2) + + tags = [op.opcodes[0] for op in ops] + assert "replace" in tags + + def test_rich_tokens_on_chunks(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello"}]}]) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + proc = DocxProcessor() + ops = proc.process(doc1, doc2) + + # Chunks should carry rich_tokens + assert ops[0].source_chunk.rich_tokens is not None + assert ops[0].test_chunk.rich_tokens is not None + + +# ── Integration tests via Redlines ──────────────────────────────────── + +class TestDocxRedlinesIntegration: + def test_auto_selects_docx_processor(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + doc1 = DocxFile(path1) + doc2 = DocxFile(path2) + diff = Redlines(doc1, doc2) + + assert isinstance(diff.processor, DocxProcessor) + + def test_changes_property(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "The quick brown fox"}]}]) + write_docx(path2, [{"runs": [{"text": "The slow brown fox"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + changes = diff.changes + + assert len(changes) == 1 + assert changes[0].operation == "replace" + assert "quick" in (changes[0].source_text or "") + assert "slow" in (changes[0].test_text or "") + + def test_stats(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + 
path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + stats = diff.stats() + + assert stats.total_changes == 1 + assert stats.replacements == 1 + + def test_output_json_plain_text_change(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json()) + + assert "changes" in data + assert "stats" in data + + # Source tokens should be rich (dicts with "text" and "formatting") + assert isinstance(data["source_tokens"][0], dict) + assert "text" in data["source_tokens"][0] + + def test_output_json_formatting_change(self, tmp_path: Any) -> None: + """Same text, different formatting → replace with formatting_changes.""" + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello", "props": {"b": "true"}}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json(pretty=True)) + + # Find the replace change + replaces = [c for c in data["changes"] if c["type"] == "replace"] + assert len(replaces) >= 1 + + replace = replaces[0] + assert replace["text_changed"] is False + assert "formatting_changes" in replace + assert "b" in replace["formatting_changes"] + + def test_output_json_formatting_and_text_change(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth", "props": {"b": "true"}}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json()) + + 
replaces = [c for c in data["changes"] if c["type"] == "replace"] + assert len(replaces) >= 1 + + def test_output_json_insert_and_delete(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello beautiful world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello world"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json()) + + types = {c["type"] for c in data["changes"]} + assert "equal" in types + assert "delete" in types or "replace" in types + + def test_output_json_identical_docs(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + paras = [{"runs": [{"text": "Hello world"}]}] + write_docx(path1, paras) + write_docx(path2, paras) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json()) + + assert data["stats"]["total_changes"] == 0 + types = {c["type"] for c in data["changes"]} + assert types == {"equal"} + + def test_output_json_paragraph_style_change(self, tmp_path: Any) -> None: + """Paragraph style change shows in formatting_changes.""" + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [ + {"runs": [{"text": "Title"}], "props": {"paragraph_style": "Normal"}}, + ]) + write_docx(path2, [ + {"runs": [{"text": "Title"}], "props": {"paragraph_style": "Heading1"}}, + ]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + data = json.loads(diff.output_json(pretty=True)) + + replaces = [c for c in data["changes"] if c["type"] == "replace"] + assert len(replaces) >= 1 + replace = replaces[0] + assert replace["text_changed"] is False + assert "paragraph_style" in replace.get("formatting_changes", {}) + + def test_multi_paragraph_change(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [ + {"runs": [{"text": "First paragraph"}]}, + {"runs": 
[{"text": "Second paragraph"}]}, + ]) + write_docx(path2, [ + {"runs": [{"text": "First paragraph"}]}, + {"runs": [{"text": "Modified paragraph"}]}, + ]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + changes = diff.changes + + assert len(changes) == 1 + assert changes[0].operation == "replace" + + def test_markdown_output_still_works(self, tmp_path: Any) -> None: + """Markdown output uses plain text tokens and shouldn't break.""" + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + md = diff.output_markdown + + assert "world" in md + assert "earth" in md + + def test_opcodes(self, tmp_path: Any) -> None: + path1 = str(tmp_path / "a.docx") + path2 = str(tmp_path / "b.docx") + write_docx(path1, [{"runs": [{"text": "Hello world"}]}]) + write_docx(path2, [{"runs": [{"text": "Hello earth"}]}]) + + diff = Redlines(DocxFile(path1), DocxFile(path2)) + opcodes = diff.opcodes + + assert isinstance(opcodes, list) + assert all(isinstance(op, tuple) and len(op) == 5 for op in opcodes) + + +# ── Edge cases ──────────────────────────────────────────────────────── + +class TestDocxEdgeCases: + def test_empty_runs_ignored(self, tmp_path: Any) -> None: + """Paragraphs with empty text should not produce tokens.""" + path = str(tmp_path / "empty_run.docx") + write_docx(path, [ + {"runs": [{"text": ""}]}, + {"runs": [{"text": "Real text"}]}, + ]) + + doc = DocxFile(path) + # Only "Real text" tokens, no separator for empty paragraph + assert all("¶" not in tok.text for tok in doc.rich_tokens if tok.text.strip()) + + def test_docx_processor_rejects_strings(self) -> None: + proc = DocxProcessor() + with pytest.raises(TypeError, match="DocxProcessor requires DocxFile"): + proc.process("hello", "world") + + def test_underline_property(self, tmp_path: Any) -> None: + path = str(tmp_path / 
"underline.docx") + write_docx(path, [ + {"runs": [{"text": "Underlined", "props": {"u": "single"}}]}, + ]) + + from redlines.docx_parser import parse_docx + paragraphs = parse_docx(path) + + assert paragraphs[0]["runs"][0]["properties"]["u"] == "single" + + +# ── File-based comparison (tests/documents/DocxFile/) ───────────────── + +FIXTURES = os.path.join(os.path.dirname(__file__), "documents", "DocxFile") + + +class TestDocxFileComparison: + """Compare the persistent source.docx and test.docx fixture files. + + The fixtures represent a short "Service Agreement" document where + ``test.docx`` has deliberate text, formatting, and structural changes + relative to ``source.docx``: + + * Text change: "hereby agrees" → "consents" + * Formatting-only change: "in good faith" gains bold (was italic-only) + * Paragraph style change: "Terms and Conditions" Normal → Heading2 + * Alignment change: CONFIDENTIAL centered → left, red color removed + * New paragraph appended + """ + + @pytest.fixture() + def diff(self) -> Redlines: + source = DocxFile(os.path.join(FIXTURES, "source.docx")) + test = DocxFile(os.path.join(FIXTURES, "test.docx")) + return Redlines(source, test) + + @pytest.fixture() + def diff_json(self, diff: Redlines) -> dict[str, Any]: + return json.loads(diff.output_json(pretty=True)) + + # ── basic properties ────────────────────────────────────────────── + + def test_source_text_extracted(self, diff: Redlines) -> None: + assert "Service Agreement" in diff.source + assert "CONFIDENTIAL" in diff.source + + def test_test_text_extracted(self, diff: Redlines) -> None: + assert "Service Agreement" in diff.test + assert "date of signing" in diff.test + + def test_processor_is_docx(self, diff: Redlines) -> None: + assert isinstance(diff.processor, DocxProcessor) + + # ── stats ───────────────────────────────────────────────────────── + + def test_total_changes(self, diff: Redlines) -> None: + stats = diff.stats() + assert stats.total_changes == 4 + assert 
stats.replacements == 4 + + # ── text change: "hereby agrees" → "consents" ──────────────────── + + def test_text_change_detected(self, diff_json: dict[str, Any]) -> None: + replaces = [c for c in diff_json["changes"] if c["type"] == "replace"] + text_changes = [ + c for c in replaces + if c.get("text_changed") is True + and "agrees" in (c.get("source_text") or "") + ] + assert len(text_changes) == 1 + assert "consents" in text_changes[0]["test_text"] + + def test_text_change_has_underline_formatting_diff( + self, diff_json: dict[str, Any] + ) -> None: + """'consents' also gained underline, so formatting_changes includes u.""" + replaces = [c for c in diff_json["changes"] if c["type"] == "replace"] + text_changes = [ + c for c in replaces if "agrees" in (c.get("source_text") or "") + ] + fmt = text_changes[0].get("formatting_changes", {}) + assert "u" in fmt + assert fmt["u"]["to"] == "single" + + # ── formatting-only: "in good faith" italic → bold+italic ──────── + + def test_formatting_only_change(self, diff_json: dict[str, Any]) -> None: + replaces = [c for c in diff_json["changes"] if c["type"] == "replace"] + fmt_only = [ + c for c in replaces + if c.get("text_changed") is False + and "good faith" in (c.get("source_text") or "") + ] + assert len(fmt_only) == 1 + fmt = fmt_only[0]["formatting_changes"] + assert fmt["b"]["from"] is None + assert fmt["b"]["to"] == "true" + + # ── paragraph style change: Normal → Heading2 ──────────────────── + + def test_paragraph_style_change(self, diff_json: dict[str, Any]) -> None: + replaces = [c for c in diff_json["changes"] if c["type"] == "replace"] + style_changes = [ + c for c in replaces + if c.get("text_changed") is False + and "Terms" in (c.get("source_text") or "") + ] + assert len(style_changes) == 1 + fmt = style_changes[0]["formatting_changes"] + assert "paragraph_style" in fmt + assert fmt["paragraph_style"]["to"] == "Heading2" + + # ── alignment + color change on CONFIDENTIAL ───────────────────── + + def 
test_alignment_and_color_change(self, diff_json: dict[str, Any]) -> None: + replaces = [c for c in diff_json["changes"] if c["type"] == "replace"] + conf = [ + c for c in replaces if "CONFIDENTIAL" in (c.get("source_text") or "") + ] + assert len(conf) == 1 + fmt = conf[0].get("formatting_changes", {}) + assert fmt["alignment"]["from"] == "center" + assert fmt["alignment"]["to"] == "left" + assert fmt["color"]["from"] == "FF0000" + assert fmt["color"]["to"] is None + + # ── JSON structure validation ───────────────────────────────────── + + def test_tokens_are_rich(self, diff_json: dict[str, Any]) -> None: + """source_tokens and test_tokens carry formatting dicts.""" + for tok in diff_json["source_tokens"]: + assert isinstance(tok, dict) + assert "text" in tok + assert "formatting" in tok + + def test_changes_cover_full_text(self, diff_json: dict[str, Any]) -> None: + """Equal + changed spans should cover all source tokens.""" + total_source_span = 0 + for c in diff_json["changes"]: + if c["type"] in ("equal", "delete", "replace"): + i1, i2 = c["source_token_position"] + total_source_span += i2 - i1 + assert total_source_span == len(diff_json["source_tokens"]) + + def test_stats_section(self, diff_json: dict[str, Any]) -> None: + stats = diff_json["stats"] + assert stats["replacements"] == 4 + assert stats["total_changes"] == 4 + assert stats["insertions"] == 0 + assert stats["deletions"] == 0 + assert 0 < stats["change_ratio"] < 1 + + # ── markdown output ────────────────────────────────────────────── + + def test_markdown_output(self, diff: Redlines) -> None: + md = diff.output_markdown + assert isinstance(md, str) + assert "Service Agreement" in md + + def test_markdown_formatting_only_not_del_ins(self, diff: Redlines) -> None: + """Formatting-only changes should NOT show del+ins of same text.""" + md = diff.output_markdown + # "in good faith" is formatting-only (italic → bold+italic). + # It should appear once in blue, not as strikethrough + insertion. 
+ assert "line-through;'>in good faith" not in md + + def test_markdown_formatting_only_has_annotation(self, diff: Redlines) -> None: + md = diff.output_markdown + assert "[+bold]" in md + + def test_markdown_style_change_annotation(self, diff: Redlines) -> None: + md = diff.output_markdown + assert "+style: Heading2" in md + + def test_markdown_text_change_still_del_ins(self, diff: Redlines) -> None: + """Text changes should still use standard del+ins.""" + md = diff.output_markdown + assert "hereby agrees" in md + assert "consents" in md + + def test_markdown_text_change_with_formatting_note(self, diff: Redlines) -> None: + """Text change that also has formatting diff should include a note.""" + md = diff.output_markdown + assert "+underline: single" in md + + # ── rich (terminal) output ─────────────────────────────────────── + + def test_rich_output(self, diff: Redlines) -> None: + from rich.text import Text + rich = diff.output_rich + assert isinstance(rich, Text) + plain = rich.plain + assert "Service Agreement" in plain + + def test_rich_formatting_only_has_annotation(self, diff: Redlines) -> None: + plain = diff.output_rich.plain + assert "[+bold]" in plain + + def test_rich_style_change_annotation(self, diff: Redlines) -> None: + plain = diff.output_rich.plain + assert "+style: Heading2" in plain + + # ── custom_css with fmt_class / fmt_note_class ─────────────────── + + def test_custom_css_default_fmt_classes(self) -> None: + source = DocxFile(os.path.join(FIXTURES, "source.docx")) + test = DocxFile(os.path.join(FIXTURES, "test.docx")) + r = Redlines(source, test, markdown_style="custom_css") + md = r.output_markdown + assert "class='redline-formatting'" in md + assert "class='redline-formatting-note'" in md + + def test_custom_css_overridden_fmt_classes(self) -> None: + source = DocxFile(os.path.join(FIXTURES, "source.docx")) + test = DocxFile(os.path.join(FIXTURES, "test.docx")) + r = Redlines( + source, + test, + markdown_style="custom_css", + 
fmt_class="my-fmt", + fmt_note_class="my-fmt-note", + ) + md = r.output_markdown + assert "class='my-fmt'" in md + assert "class='my-fmt-note'" in md