diff --git a/examples/query/README.md b/examples/query/README.md index 0b2218bc..e26f63c1 100644 --- a/examples/query/README.md +++ b/examples/query/README.md @@ -20,6 +20,35 @@ uv run query.py "What do we have here?" --score-threshold 0.5 mv data/ data.bak/ # or rm -rf if you want ``` +## Add Directory + +`add.py` supports adding an entire directory of documents at once. Files are automatically classified and parsed by their type (PDF, Markdown, Text, code, etc.). A summary table is printed after import showing which files were processed, failed, unsupported, or filtered. + +```bash +# Add all supported files in a directory +uv run add.py ~/Documents/research/ + +# Only include specific file types +uv run add.py ~/project/ --include '*.md' --include '*.pdf' + +# Exclude certain files +uv run add.py ~/project/ --exclude 'test_*' --exclude '*.pyc' + +# Skip specific sub-directories +uv run add.py ~/project/ --ignore-dirs node_modules --ignore-dirs .git + +# Combine options +uv run add.py ~/project/ --include '*.md' --exclude 'draft_*' --ignore-dirs vendor +``` + +### Directory Options + +| Option | Description | +|--------|-------------| +| `--include PATTERN` | Glob pattern for files to include (can be repeated) | +| `--exclude PATTERN` | Glob pattern for files to exclude (can be repeated) | +| `--ignore-dirs NAME` | Directory names to skip (can be repeated) | + ### Query Options | Option | Default | Description | @@ -50,7 +79,7 @@ Edit `ov.conf` to configure: ``` rag.py # RAG pipeline library -add.py # Add documents CLI +add.py # Add documents/directories CLI query.py # Query CLI q # Quick query wrapper logging_config.py # Logging configuration @@ -64,3 +93,4 @@ data/ # Database storage - Use `uv run query.py` for more control - Set `OV_DEBUG=1` only when debugging - Resources are indexed once, query unlimited times +- When adding directories, use `--include` / `--exclude` to control which files are imported diff --git a/examples/query/add.py 
b/examples/query/add.py index 68640008..b8997d45 100644 --- a/examples/query/add.py +++ b/examples/query/add.py @@ -7,12 +7,107 @@ import json import sys from pathlib import Path +from typing import Any, Dict, List + +from rich import box +from rich.console import Console +from rich.table import Table import openviking as ov from openviking_cli.utils.config.open_viking_config import OpenVikingConfig +console = Console() + + +# ── Table helpers ────────────────────────────────────────────────── + + +def _print_directory_summary(meta: Dict[str, Any], errors: List[str]) -> None: + """Print a rich-table summary for a directory import.""" + processed: List[Dict[str, str]] = meta.get("processed_files", []) + failed: List[Dict[str, str]] = meta.get("failed_files", []) + unsupported: List[Dict[str, str]] = meta.get("unsupported_files", []) + skipped: List[Dict[str, str]] = meta.get("skipped_files", []) -def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path: str = "./data"): + n_total = len(processed) + len(failed) + len(unsupported) + len(skipped) + + if n_total == 0: + console.print(" (no files found)", style="dim") + return + + # Build a single combined table (ROUNDED box style, same as query.py) + table = Table( + title=f"Directory Import ({n_total} files)", + box=box.ROUNDED, + show_header=True, + header_style="bold magenta", + title_style="bold magenta", + ) + table.add_column("#", style="cyan", width=4) + table.add_column("Status", no_wrap=True) + table.add_column("File", style="bold white", no_wrap=True) + table.add_column("Detail") + + # Match failed files to their warning messages + fail_reasons: Dict[str, str] = {} + for err in errors: + for f in failed: + if f["path"] in err: + fail_reasons[f["path"]] = err + break + + idx = 0 + for f in processed: + idx += 1 + table.add_row( + str(idx), + "[green]processed[/green]", + f["path"], + f"[dim]{f.get('parser', '')}[/dim]", + ) + + for f in failed: + idx += 1 + reason = 
fail_reasons.get(f["path"], "parse error") + table.add_row( + str(idx), + "[red]failed[/red]", + f["path"], + f"[red]{reason}[/red]", + ) + + for f in unsupported: + idx += 1 + table.add_row( + str(idx), + "[yellow]unsupported[/yellow]", + f["path"], + "", + ) + + for f in skipped: + idx += 1 + status = f.get("status", "skip") + table.add_row( + str(idx), + f"[dim]{status}[/dim]", + f"[dim]{f['path']}[/dim]", + "", + ) + + console.print() + console.print(table) + + +# ── Main logic ───────────────────────────────────────────────────── + + +def add_resource( + resource_path: str, + config_path: str = "./ov.conf", + data_path: str = "./data", + **kwargs, +): """ Add a resource to OpenViking database @@ -20,6 +115,8 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path: resource_path: Path to file, directory, or URL config_path: Path to config file data_path: Path to data directory + **kwargs: Extra options forwarded to ``add_resource`` (e.g. + ``include``, ``exclude``, ``ignore_dirs``). 
""" # Load config print(f"📋 Loading config from: {config_path}") @@ -34,25 +131,37 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path: client.initialize() print("✓ Initialized\n") - print(f"📂 Adding resource: {resource_path}") - - # Check if it's a file and exists - if not resource_path.startswith("http"): + # Check if it's a local path and exists + is_local = not resource_path.startswith("http") + is_directory = False + if is_local: path = Path(resource_path).expanduser() if not path.exists(): - print(f"❌ Error: File not found: {path}") + print(f"❌ Error: Path not found: {path}") return False + is_directory = path.is_dir() - result = client.add_resource(path=resource_path) + if is_directory: + print(f"📂 Adding directory: {resource_path}") + else: + print(f"📄 Adding resource: {resource_path}") + + result = client.add_resource(path=resource_path, **kwargs) # Check result if result and "root_uri" in result: root_uri = result["root_uri"] - print(f"✓ Resource added: {root_uri}\n") + meta = result.get("meta", {}) + errors = result.get("errors", []) + print(f"✓ Resource added: {root_uri}") + + # Show directory-specific table + if is_directory: + _print_directory_summary(meta, errors) # Wait for processing - print("⏳ Processing and indexing...") - client.wait_processed(timeout=300) + print("\n⏳ Processing and indexing...") + client.wait_processed(timeout=600 if is_directory else 300) print("✓ Processing complete!\n") print("🎉 Resource is now searchable in the database!") @@ -61,7 +170,7 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path: elif result and result.get("status") == "error": print("\n⚠️ Resource had parsing issues:") if "errors" in result: - for error in result["errors"][:3]: + for error in result["errors"][:5]: print(f" - {error}") print("\n💡 Some content may still be searchable.") return False @@ -123,6 +232,31 @@ def main(): "--data", type=str, default="./data", help="Path to data directory 
(default: ./data)" ) + # Directory-specific options + dir_group = parser.add_argument_group("directory options") + dir_group.add_argument( + "--include", + type=str, + action="append", + default=None, + help="Glob pattern for files to include (can be repeated, e.g. --include '*.md')", + ) + dir_group.add_argument( + "--exclude", + type=str, + action="append", + default=None, + help="Glob pattern for files to exclude (can be repeated, e.g. --exclude 'test_*')", + ) + dir_group.add_argument( + "--ignore-dirs", + type=str, + action="append", + default=None, + dest="ignore_dirs", + help="Directory names to skip (can be repeated, e.g. --ignore-dirs node_modules)", + ) + args = parser.parse_args() # Expand user paths @@ -132,8 +266,19 @@ def main(): else args.resource ) + # Build kwargs for directory options + # scan_directory expects include/exclude as comma-separated strings, + # and ignore_dirs as a Set[str]. + dir_kwargs = {} + if args.include: + dir_kwargs["include"] = ",".join(args.include) + if args.exclude: + dir_kwargs["exclude"] = ",".join(args.exclude) + if args.ignore_dirs: + dir_kwargs["ignore_dirs"] = set(args.ignore_dirs) + # Add the resource - success = add_resource(resource_path, args.config, args.data) + success = add_resource(resource_path, args.config, args.data, **dir_kwargs) sys.exit(0 if success else 1) diff --git a/openviking/async_client.py b/openviking/async_client.py index 84e3c7e1..1dc1f1a8 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -143,12 +143,15 @@ async def add_resource( instruction: str = "", wait: bool = False, timeout: float = None, + **kwargs, ) -> Dict[str, Any]: """Add resource to OpenViking (only supports resources scope). Args: wait: Whether to wait for semantic extraction and vectorization to complete timeout: Wait timeout in seconds + **kwargs: Extra options forwarded to the parser chain, e.g. + ``strict``, ``ignore_dirs``, ``include``, ``exclude``. 
""" await self._ensure_initialized() return await self._client.add_resource( @@ -158,6 +161,7 @@ async def add_resource( instruction=instruction, wait=wait, timeout=timeout, + **kwargs, ) async def wait_processed(self, timeout: float = None) -> Dict[str, Any]: diff --git a/openviking/client/local.py b/openviking/client/local.py index 389439ac..749069e6 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -58,6 +58,7 @@ async def add_resource( instruction: str = "", wait: bool = False, timeout: Optional[float] = None, + **kwargs, ) -> Dict[str, Any]: """Add resource to OpenViking.""" return await self._service.resources.add_resource( @@ -67,6 +68,7 @@ async def add_resource( instruction=instruction, wait=wait, timeout=timeout, + **kwargs, ) async def add_skill( diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py new file mode 100644 index 00000000..5f2f05fa --- /dev/null +++ b/openviking/parse/parsers/directory.py @@ -0,0 +1,388 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Directory parser for OpenViking. + +Handles local directories containing mixed document types (PDF, Markdown, +Text, code, etc.). Follows the same three-phase pattern as +CodeRepositoryParser: + +1. Scan → classify files with ``scan_directory()`` +2. For each file: + - Files WITH a dedicated parser → ``parser.parse()`` handles conversion + and VikingFS temp creation; results are merged into the main temp. + - Files WITHOUT a parser (code, config, …) → written directly to VikingFS. +3. Return ``ParseResult`` so that ``TreeBuilder.finalize_from_temp`` + can move the content to AGFS and enqueue semantic processing. 
+""" + +import time +from pathlib import Path, PurePosixPath +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from openviking.parse.base import ( + NodeType, + ParseResult, + ResourceNode, + create_parse_result, +) +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.logger import get_logger + +if TYPE_CHECKING: + from openviking.parse.directory_scan import ClassifiedFile + from openviking.parse.registry import ParserRegistry + +logger = get_logger(__name__) + + +class DirectoryParser(BaseParser): + """ + Parser for local directories. + + Scans the directory, delegates each file to its registered parser via + ``parser.parse()``, and merges all results into a single VikingFS temp. + Files without a dedicated parser are written directly. + + The resulting ``ParseResult.temp_dir_path`` is consumed by + ``TreeBuilder.finalize_from_temp`` exactly like any other parser. + """ + + @property + def supported_extensions(self) -> List[str]: + # Directories have no file extension; routing is handled + # by ``is_dir()`` checks in the registry / media processor. + return [] + + def can_parse(self, path: Union[str, Path]) -> bool: # type: ignore[override] + """Return *True* when *path* is an existing directory.""" + return Path(path).is_dir() + + # ------------------------------------------------------------------ + # Main entry point + # ------------------------------------------------------------------ + + async def parse( + self, + source: Union[str, Path], + instruction: str = "", + **kwargs, + ) -> ParseResult: + """Parse a local directory. + + Args: + source: Path to the directory. + instruction: Processing instruction (forwarded where applicable). + **kwargs: Extra options forwarded to ``scan_directory``: + ``strict``, ``ignore_dirs``, ``include``, ``exclude``. + + Returns: + ``ParseResult`` with ``temp_dir_path`` pointing to VikingFS temp. 
+ """ + start_time = time.time() + source_path = Path(source).resolve() + + if not source_path.is_dir(): + raise NotADirectoryError(f"Not a directory: {source_path}") + + dir_name = source_path.name + warnings: List[str] = [] + + try: + # ── Phase 1: scan directory ─────────────────────────────── + from openviking.parse.directory_scan import scan_directory + from openviking.parse.registry import get_registry + + registry = get_registry() + + scan_result = scan_directory( + root=str(source_path), + registry=registry, + strict=kwargs.get("strict", False), + ignore_dirs=kwargs.get("ignore_dirs"), + include=kwargs.get("include"), + exclude=kwargs.get("exclude"), + ) + processable_files = scan_result.all_processable_files() + warnings.extend(scan_result.warnings) + + viking_fs = self._get_viking_fs() + temp_uri = self._create_temp_uri() + target_uri = f"{temp_uri}/{dir_name}" + await viking_fs.mkdir(temp_uri) + await viking_fs.mkdir(target_uri) + + if not processable_files: + root = ResourceNode( + type=NodeType.ROOT, + title=dir_name, + meta={"file_count": 0, "type": "directory"}, + ) + result = create_parse_result( + root=root, + source_path=str(source_path), + source_format="directory", + parser_name="DirectoryParser", + parse_time=time.time() - start_time, + warnings=warnings, + ) + result.temp_dir_path = temp_uri + return result + + # ── Phase 2: process each file ──────────────────────────── + file_count = 0 + processed_files: List[Dict[str, str]] = [] + failed_files: List[Dict[str, str]] = [] + + for cf in processable_files: + file_parser = self._assign_parser(cf, registry) + parser_name = type(file_parser).__name__ if file_parser else "direct" + ok = await self._process_single_file( + cf, + file_parser, + target_uri, + viking_fs, + warnings, + ) + if ok: + file_count += 1 + processed_files.append( + { + "path": cf.rel_path, + "parser": parser_name, + } + ) + else: + failed_files.append( + { + "path": cf.rel_path, + "parser": parser_name, + } + ) + + # Collect 
unsupported files from scan result + unsupported_files = [ + { + "path": uf.rel_path, + "status": "unsupported", + "reason": uf.classification, + } + for uf in scan_result.unsupported + ] + + # Parse skipped entries: format is "path (reason)" + skipped_files = self._parse_skipped(scan_result.skipped) + + # ── Phase 3: build ParseResult ──────────────────────────── + root = ResourceNode( + type=NodeType.ROOT, + title=dir_name, + meta={ + "file_count": file_count, + "type": "directory", + }, + ) + + result = create_parse_result( + root=root, + source_path=str(source_path), + source_format="directory", + parser_name="DirectoryParser", + parse_time=time.time() - start_time, + warnings=warnings, + ) + result.temp_dir_path = temp_uri + result.meta["file_count"] = file_count + result.meta["dir_name"] = dir_name + result.meta["total_processable"] = len(processable_files) + result.meta["processed_files"] = processed_files + result.meta["failed_files"] = failed_files + result.meta["unsupported_files"] = unsupported_files + result.meta["skipped_files"] = skipped_files + + return result + + except Exception as exc: + logger.error( + f"[DirectoryParser] Failed to parse directory {source_path}: {exc}", + exc_info=True, + ) + return create_parse_result( + root=ResourceNode(type=NodeType.ROOT), + source_path=str(source_path), + source_format="directory", + parser_name="DirectoryParser", + parse_time=time.time() - start_time, + warnings=[f"Failed to parse directory: {exc}"], + ) + + # ------------------------------------------------------------------ + # parse_content – not applicable for directories + # ------------------------------------------------------------------ + + async def parse_content( + self, + content: str, + source_path: Optional[str] = None, + instruction: str = "", + **kwargs, + ) -> ParseResult: + raise NotImplementedError("DirectoryParser does not support parse_content") + + # ------------------------------------------------------------------ + # Skipped entries 
parsing + # ------------------------------------------------------------------ + + _REASON_TO_STATUS = { + "dot directory": "ignore", + "dot file": "ignore", + "symlink": "ignore", + "empty file": "ignore", + "os error": "ignore", + "IGNORE_DIRS": "ignore", + "ignore_dirs": "ignore", + "excluded by include filter": "exclude", + "excluded by exclude filter": "exclude", + } + + @staticmethod + def _parse_skipped(skipped: List[str]) -> List[Dict[str, str]]: + """Parse skipped entry strings into structured dicts. + + Each entry has the format ``"rel_path (reason)"``. + Returns a list of ``{"path": ..., "status": ...}``. + """ + result: List[Dict[str, str]] = [] + for entry in skipped: + # Extract "path (reason)" + paren_idx = entry.rfind(" (") + if paren_idx != -1 and entry.endswith(")"): + path = entry[:paren_idx] + reason = entry[paren_idx + 2 : -1] + else: + path = entry + reason = "skip" + status = DirectoryParser._REASON_TO_STATUS.get(reason, "skip") + result.append({"path": path, "status": status}) + return result + + # ------------------------------------------------------------------ + # Parser assignment + # ------------------------------------------------------------------ + + @staticmethod + def _assign_parser( + classified_file: "ClassifiedFile", + registry: "ParserRegistry", + ) -> Optional[BaseParser]: + """Look up the parser for a file via the registry. + + Returns: + The ``BaseParser`` instance for the file's extension, or + ``None`` for text-fallback files with no dedicated parser. 
+ """ + return registry.get_parser_for_file(classified_file.path) + + # ------------------------------------------------------------------ + # Per-file processing + # ------------------------------------------------------------------ + + @staticmethod + async def _process_single_file( + classified_file: "ClassifiedFile", + parser: Optional[BaseParser], + target_uri: str, + viking_fs: Any, + warnings: List[str], + ) -> bool: + """Process one file into the VikingFS directory temp. + + - Files WITH a parser → ``parser.parse()`` → merge output into + *target_uri* at the correct relative location. + - Files WITHOUT a parser → read and write directly to VikingFS. + + Returns: + *True* on success, *False* on failure. + """ + rel_path = classified_file.rel_path + src_file = classified_file.path + + if parser: + try: + sub_result = await parser.parse(str(src_file)) + if sub_result.temp_dir_path: + parent = str(PurePosixPath(rel_path).parent) + dest = f"{target_uri}/{parent}" if parent != "." else target_uri + await DirectoryParser._merge_temp( + viking_fs, + sub_result.temp_dir_path, + dest, + ) + return True + except Exception as exc: + warnings.append(f"Failed to parse {rel_path}: {exc}") + return False + else: + try: + content = src_file.read_bytes() + dst_uri = f"{target_uri}/{rel_path}" + await viking_fs.write_file(dst_uri, content) + return True + except Exception as exc: + warnings.append(f"Failed to upload {rel_path}: {exc}") + return False + + # ------------------------------------------------------------------ + # VikingFS merge helpers + # ------------------------------------------------------------------ + + @staticmethod + def _is_dir_entry(entry: Dict[str, Any]) -> bool: + """Check whether an AGFS ``ls`` entry represents a directory.""" + return bool(entry.get("isDir", False)) or entry.get("type") == "directory" + + @staticmethod + async def _merge_temp( + viking_fs: Any, + src_temp_uri: str, + dest_uri: str, + ) -> None: + """Move all content from a parser's 
temp directory into *dest_uri*. + + After the move the source temp is deleted. + """ + entries = await viking_fs.ls(src_temp_uri) + for entry in entries: + name = entry.get("name", "") + if not name or name in (".", ".."): + continue + src = entry.get("uri", f"{src_temp_uri.rstrip('/')}/{name}") + dst = f"{dest_uri.rstrip('/')}/{name}" + if DirectoryParser._is_dir_entry(entry): + await DirectoryParser._recursive_move(viking_fs, src, dst) + else: + await viking_fs.move_file(src, dst) + try: + await viking_fs.delete_temp(src_temp_uri) + except Exception: + pass + + @staticmethod + async def _recursive_move( + viking_fs: Any, + src_uri: str, + dst_uri: str, + ) -> None: + """Recursively move a VikingFS directory tree.""" + await viking_fs.mkdir(dst_uri, exist_ok=True) + entries = await viking_fs.ls(src_uri) + for entry in entries: + name = entry.get("name", "") + if not name or name in (".", ".."): + continue + s = f"{src_uri.rstrip('/')}/{name}" + d = f"{dst_uri.rstrip('/')}/{name}" + if DirectoryParser._is_dir_entry(entry): + await DirectoryParser._recursive_move(viking_fs, s, d) + else: + await viking_fs.move_file(s, d) diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py index dd35062c..526b2272 100644 --- a/openviking/parse/registry.py +++ b/openviking/parse/registry.py @@ -71,6 +71,14 @@ def __init__(self, register_optional: bool = True): except ImportError as e: logger.warning(f"CodeRepositoryParser not available: {e}") + # Register directory parser + try: + from openviking.parse.parsers.directory import DirectoryParser + + self.register("directory", DirectoryParser()) + except ImportError as e: + logger.warning(f"DirectoryParser not available: {e}") + # Register optional media parsers if register_optional: try: @@ -245,6 +253,15 @@ async def parse(self, source: Union[str, Path], **kwargs) -> ParseResult: if is_potential_path: path = Path(source) if path.exists(): + # Directory → route to DirectoryParser + if path.is_dir(): + dir_parser = 
self._parsers.get("directory") + if dir_parser: + return await dir_parser.parse(path, **kwargs) + raise ValueError( + f"Source is a directory but DirectoryParser is not registered: {path}" + ) + parser = self.get_parser_for_file(path) if parser: return await parser.parse(path, **kwargs) diff --git a/openviking/service/resource_service.py b/openviking/service/resource_service.py index dd6e5dcb..b8a5026e 100644 --- a/openviking/service/resource_service.py +++ b/openviking/service/resource_service.py @@ -74,6 +74,7 @@ async def add_resource( instruction: str = "", wait: bool = False, timeout: Optional[float] = None, + **kwargs, ) -> Dict[str, Any]: """Add resource to OpenViking (only supports resources scope). @@ -84,6 +85,9 @@ async def add_resource( instruction: Processing instruction wait: Whether to wait for semantic extraction and vectorization to complete timeout: Wait timeout in seconds + **kwargs: Extra options forwarded to the parser chain, e.g. + ``strict``, ``ignore_dirs``, ``include``, ``exclude`` + (used by ``DirectoryParser``). Returns: Processing result @@ -104,6 +108,7 @@ async def add_resource( instruction=instruction, scope="resources", target=target, + **kwargs, ) if wait: diff --git a/openviking/sync_client.py b/openviking/sync_client.py index 632c9d7a..8dc1f60b 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -64,10 +64,24 @@ def add_resource( instruction: str = "", wait: bool = False, timeout: float = None, + **kwargs, ) -> Dict[str, Any]: - """Add resource to OpenViking (resources scope only)""" + """Add resource to OpenViking (resources scope only) + + Args: + **kwargs: Extra options forwarded to the parser chain, e.g. + ``strict``, ``ignore_dirs``, ``include``, ``exclude``. 
+ """ return run_async( - self._async_client.add_resource(path, target, reason, instruction, wait, timeout) + self._async_client.add_resource( + path, + target, + reason, + instruction, + wait, + timeout, + **kwargs, + ) ) def add_skill( diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 5fa06ca6..3ff58b2e 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -44,6 +44,7 @@ async def process( self, source: str, instruction: str = "", + **kwargs, ) -> ParseResult: """Process any source (file/URL/content) with appropriate strategy.""" # Check if URL @@ -55,6 +56,8 @@ async def process( if is_potential_path: path = Path(source) if path.exists(): + if path.is_dir(): + return await self._process_directory(path, instruction, **kwargs) return await self._process_file(path, instruction) else: logger.warning(f"Path {path} does not exist") @@ -74,6 +77,26 @@ async def _process_url(self, url: str, instruction: str) -> ParseResult: parser = HTMLParser() return await parser.parse(url, instruction=instruction) + async def _process_directory( + self, + dir_path: Path, + instruction: str, + **kwargs, + ) -> ParseResult: + """Process directory source via DirectoryParser. + + Args: + dir_path: Path to the directory. + instruction: Processing instruction. + **kwargs: Forwarded to ``DirectoryParser.parse()`` → + ``scan_directory()``: ``strict``, ``ignore_dirs``, + ``include``, ``exclude``. 
+ """ + from openviking.parse.parsers.directory import DirectoryParser + + parser = DirectoryParser() + return await parser.parse(str(dir_path), instruction=instruction, **kwargs) + async def _process_file( self, file_path: Path, diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index ed946f55..156c48dd 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -76,6 +76,7 @@ async def process_resource( scope: str = "resources", user: Optional[str] = None, target: Optional[str] = None, + **kwargs, ) -> Dict[str, Any]: """ Process and store a new resource. @@ -94,14 +95,27 @@ async def process_resource( # ============ Phase 1: Parse source (Parser generates L0/L1 and writes to temp) ============ try: media_processor = self._get_media_processor() - parse_result = await media_processor.process(source=path, instruction=instruction) + parse_result = await media_processor.process( + source=path, + instruction=instruction, + **kwargs, + ) result["source_path"] = parse_result.source_path or path + result["meta"] = parse_result.meta - if not parse_result.success: + # Only abort when no temp content was produced at all. + # For directory imports partial success (some files failed) is + # normal – finalization should still proceed. + if not parse_result.temp_dir_path: result["status"] = "error" - result["errors"].extend(parse_result.warnings) + result["errors"].extend( + parse_result.warnings or ["Parse failed: no content generated"], + ) return result + if parse_result.warnings: + result["errors"].extend(parse_result.warnings) + except Exception as e: result["status"] = "error" result["errors"].append(f"Parse error: {e}") diff --git a/tests/parse/test_add_directory.py b/tests/parse/test_add_directory.py new file mode 100644 index 00000000..0402adf3 --- /dev/null +++ b/tests/parse/test_add_directory.py @@ -0,0 +1,463 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for DirectoryParser. + +Verifies that: +- DirectoryParser correctly scans directories and classifies files; +- Files WITH a parser are delegated via ``parser.parse()`` and their + VikingFS temp output is merged into the main directory temp; +- Files WITHOUT a parser are written directly to VikingFS; +- Empty directories are handled gracefully; +- PDF files are converted via PDFParser; +- The directory structure is preserved; +- Errors during parsing are captured as warnings. +""" + +from pathlib import Path +from typing import Any, Dict, List +from unittest.mock import AsyncMock, patch + +import pytest + +from openviking.parse.base import ( + NodeType, + ResourceNode, + create_parse_result, +) +from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.directory import DirectoryParser + +# --------------------------------------------------------------------------- +# Fake VikingFS – records mkdir / write / move / ls operations +# --------------------------------------------------------------------------- + + +class FakeVikingFS: + """Minimal VikingFS mock that records calls and supports merge ops.""" + + def __init__(self): + self.dirs: List[str] = [] + self.files: Dict[str, bytes] = {} + self._temp_counter = 0 + + # ---- write operations ------------------------------------------------ + + async def mkdir(self, uri: str, exist_ok: bool = False, **kw) -> None: + if uri not in self.dirs: + self.dirs.append(uri) + + async def write(self, uri: str, data: Any) -> str: + if isinstance(data, str): + data = data.encode("utf-8") + self.files[uri] = data + return uri + + async def write_file(self, uri: str, content: Any) -> None: + if isinstance(content, str): + content = content.encode("utf-8") + self.files[uri] = content + + async def write_file_bytes(self, uri: str, content: bytes) -> None: + self.files[uri] = content + + # ---- read / list operations 
------------------------------------------ + + async def read(self, uri: str, offset: int = 0, size: int = -1) -> bytes: + return self.files.get(uri, b"") + + async def ls(self, uri: str) -> List[Dict[str, Any]]: + """List direct children of *uri* (mirrors real AGFS entry format).""" + prefix = uri.rstrip("/") + "/" + children: Dict[str, bool] = {} # name → is_dir + for key in list(self.files.keys()) + self.dirs: + if key.startswith(prefix): + rest = key[len(prefix) :] + if rest: + child_name = rest.split("/")[0] + is_deeper = "/" in rest[len(child_name) :] + child_full = f"{prefix}{child_name}" + is_dir = children.get(child_name, False) or is_deeper or child_full in self.dirs + children[child_name] = is_dir + result = [] + for name in sorted(children): + child_uri = f"{uri.rstrip('/')}/{name}" + is_dir = children[name] + result.append( + { + "name": name, + "uri": child_uri, + # Match real AGFS format: "isDir" boolean field + "isDir": is_dir, + "type": "directory" if is_dir else "file", + } + ) + return result + + # ---- move / delete operations ---------------------------------------- + + async def move_file(self, from_uri: str, to_uri: str) -> None: + if from_uri in self.files: + self.files[to_uri] = self.files.pop(from_uri) + + async def delete_temp(self, temp_uri: str) -> None: + prefix = temp_uri.rstrip("/") + "/" + to_del = [k for k in self.files if k == temp_uri or k.startswith(prefix)] + for k in to_del: + del self.files[k] + self.dirs = [d for d in self.dirs if d != temp_uri and not d.startswith(prefix)] + + # ---- temp URI -------------------------------------------------------- + + def create_temp_uri(self) -> str: + self._temp_counter += 1 + return f"viking://temp/dir_{self._temp_counter}" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fake_fs(): + return FakeVikingFS() + + +@pytest.fixture +def 
@pytest.fixture
def parser(fake_fs):
    """DirectoryParser with VikingFS patched for ALL BaseParser instances."""
    with patch.object(BaseParser, "_get_viking_fs", return_value=fake_fs):
        yield DirectoryParser()


# ---- directory fixtures --------------------------------------------------


@pytest.fixture
def tmp_code(tmp_path: Path) -> Path:
    """Flat directory with code files (no dedicated parser)."""
    (tmp_path / "main.py").write_text("print('hello')", encoding="utf-8")
    (tmp_path / "util.py").write_text("def add(a, b): return a + b", encoding="utf-8")
    (tmp_path / "app.js").write_text("console.log('hi')", encoding="utf-8")
    return tmp_path


@pytest.fixture
def tmp_nested_code(tmp_path: Path) -> Path:
    """Nested directory with code files only (no dedicated parser)::

        tmp_path/
            a/
                b/
                    c.py
                    d.py
                x.py
            top.py
    """
    ab = tmp_path / "a" / "b"
    ab.mkdir(parents=True)
    (ab / "c.py").write_text("# c", encoding="utf-8")
    (ab / "d.py").write_text("# d", encoding="utf-8")
    (tmp_path / "a" / "x.py").write_text("# x", encoding="utf-8")
    (tmp_path / "top.py").write_text("# top", encoding="utf-8")
    return tmp_path


@pytest.fixture
def tmp_empty(tmp_path: Path) -> Path:
    """Directory with no processable files (hidden + zero-byte entries only)."""
    (tmp_path / ".hidden").write_text("hidden", encoding="utf-8")
    (tmp_path / "empty.txt").write_bytes(b"")
    return tmp_path


@pytest.fixture
def tmp_mixed(tmp_path: Path) -> Path:
    """Directory with processable and unsupported files."""
    (tmp_path / "main.py").write_text("print(1)", encoding="utf-8")
    (tmp_path / "data.xyz").write_text("unknown", encoding="utf-8")
    (tmp_path / "archive.rar").write_bytes(b"RAR\x00")
    return tmp_path


# ---------------------------------------------------------------------------
# Tests: basic properties
# ---------------------------------------------------------------------------


class TestDirectoryParserBasic:
    """Basic DirectoryParser properties."""

    def test_supported_extensions_empty(self):
        p = DirectoryParser()
        assert p.supported_extensions == []

    def test_can_parse_directory(self, tmp_path: Path):
        p = DirectoryParser()
        assert p.can_parse(tmp_path) is True

    def test_can_parse_file(self, tmp_path: Path):
        f = tmp_path / "test.md"
        f.write_text("hello")
        p = DirectoryParser()
        assert p.can_parse(f) is False

    @pytest.mark.asyncio
    async def test_parse_content_not_implemented(self):
        p = DirectoryParser()
        with pytest.raises(NotImplementedError):
            await p.parse_content("some content")

    @pytest.mark.asyncio
    async def test_not_a_directory_raises(self, tmp_path: Path, parser):
        f = tmp_path / "file.txt"
        f.write_text("hello")
        with pytest.raises(NotADirectoryError):
            await parser.parse(str(f))


# ---------------------------------------------------------------------------
# Tests: empty directory
# ---------------------------------------------------------------------------


class TestEmptyDirectory:
    """Empty directories should be handled gracefully."""

    @pytest.mark.asyncio
    async def test_empty_dir_returns_zero_files(self, tmp_empty: Path, parser, fake_fs) -> None:
        result = await parser.parse(str(tmp_empty))

        assert result.parser_name == "DirectoryParser"
        assert result.source_format == "directory"
        assert result.temp_dir_path is not None
        # FIX: the previous disjunction (`file_count == 0 or len(files) == 0`)
        # passed whenever EITHER side held, so a wrong file_count could slip
        # through as long as nothing was uploaded (and vice versa).  An empty
        # import must satisfy both invariants.
        assert result.meta.get("file_count", 0) == 0
        assert len(fake_fs.files) == 0


# ---------------------------------------------------------------------------
# Tests: files without a parser (direct write)
# ---------------------------------------------------------------------------


class TestDirectWriteFiles:
    """Code files with no dedicated parser should be written directly."""

    @pytest.mark.asyncio
    async def test_all_files_uploaded(self, tmp_code: Path, parser, fake_fs) -> None:
        result = await parser.parse(str(tmp_code))

        assert result.parser_name == "DirectoryParser"
        assert result.temp_dir_path is not None

        uploaded_names = {uri.split("/")[-1] for uri in fake_fs.files}
        assert "main.py" in uploaded_names
        assert "util.py" in uploaded_names
        assert "app.js" in uploaded_names

    @pytest.mark.asyncio
    async def test_dir_name_in_uri(self, tmp_code: Path, parser, fake_fs) -> None:
        await parser.parse(str(tmp_code))

        # Every uploaded URI must embed the source directory's name.
        dir_name = tmp_code.name
        for uri in fake_fs.files:
            assert f"/{dir_name}/" in uri

    @pytest.mark.asyncio
    async def test_content_preserved(self, tmp_path: Path, parser, fake_fs) -> None:
        (tmp_path / "hello.py").write_text("print('world')", encoding="utf-8")
        await parser.parse(str(tmp_path))

        for uri, content in fake_fs.files.items():
            if uri.endswith("hello.py"):
                assert content == b"print('world')"
                break
        else:
            pytest.fail("hello.py not found in uploaded files")
# ---------------------------------------------------------------------------
# Tests: nested directory structure
# ---------------------------------------------------------------------------


class TestNestedDirectory:
    """Importing must keep the on-disk directory hierarchy intact."""

    @pytest.mark.asyncio
    async def test_structure_preserved(self, tmp_nested_code: Path, parser, fake_fs) -> None:
        await parser.parse(str(tmp_nested_code))

        # Collect each uploaded URI's path relative to the imported dir.
        marker = f"/{tmp_nested_code.name}/"
        rel_paths = set()
        for uri in fake_fs.files:
            pos = uri.find(marker)
            if pos >= 0:
                rel_paths.add(uri[pos + len(marker):])

        for expected in ("top.py", "a/x.py", "a/b/c.py", "a/b/d.py"):
            assert expected in rel_paths

    @pytest.mark.asyncio
    async def test_file_count(self, tmp_nested_code: Path, parser, fake_fs) -> None:
        await parser.parse(str(tmp_nested_code))
        assert len(fake_fs.files) == 4


# ---------------------------------------------------------------------------
# Tests: unsupported files handled
# ---------------------------------------------------------------------------


class TestMixedDirectory:
    """Unsupported files should be skipped with warnings (non-strict)."""

    @pytest.mark.asyncio
    async def test_only_processable_uploaded(self, tmp_mixed: Path, parser, fake_fs) -> None:
        await parser.parse(str(tmp_mixed))

        basenames = {uri.rsplit("/", 1)[-1] for uri in fake_fs.files}
        assert "main.py" in basenames
        assert "data.xyz" not in basenames
        assert "archive.rar" not in basenames

    @pytest.mark.asyncio
    async def test_warnings_for_unsupported(self, tmp_mixed: Path, parser, fake_fs) -> None:
        outcome = await parser.parse(str(tmp_mixed))
        assert len(outcome.warnings) > 0


# ---------------------------------------------------------------------------
# Tests: files with a parser (parser.parse() path)
# ---------------------------------------------------------------------------


class TestParserDelegation:
    """Files with a dedicated parser should be processed via parser.parse()."""

    @pytest.mark.asyncio
    async def test_md_file_goes_through_parser(self, tmp_path: Path, parser, fake_fs) -> None:
        """Markdown files should be processed by MarkdownParser.parse()."""
        (tmp_path / "readme.md").write_text("# Hello\nworld", encoding="utf-8")

        outcome = await parser.parse(str(tmp_path))

        # MarkdownParser writes its output to a temp dir; after the merge step
        # that content must be visible under the directory's own temp tree.
        assert outcome.meta["file_count"] == 1
        assert len(fake_fs.files) > 0

    @pytest.mark.asyncio
    async def test_txt_file_goes_through_parser(self, tmp_path: Path, parser, fake_fs) -> None:
        """Text files should be processed by TextParser (delegates to Markdown)."""
        (tmp_path / "notes.txt").write_text("some notes here", encoding="utf-8")

        outcome = await parser.parse(str(tmp_path))

        assert outcome.meta["file_count"] == 1
        assert len(fake_fs.files) > 0
# ---------------------------------------------------------------------------
# Tests: PDF conversion via parser.parse()
# ---------------------------------------------------------------------------


class TestPDFConversion:
    """PDF files should be processed via PDFParser.parse()."""

    @pytest.mark.asyncio
    async def test_pdf_processed_by_parser(self, tmp_path: Path, parser, fake_fs) -> None:
        pdf_file = tmp_path / "document.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake pdf")

        # Simulate a completed conversion: pre-seed VikingFS with the markdown
        # output and hand back a ParseResult that points at it.
        mock_temp = fake_fs.create_temp_uri()  # e.g. viking://temp/dir_2
        doc_dir = f"{mock_temp}/document"
        await fake_fs.mkdir(mock_temp)
        await fake_fs.mkdir(doc_dir)
        await fake_fs.write_file(f"{doc_dir}/document.md", "# Converted PDF")

        fake_result = create_parse_result(
            root=ResourceNode(type=NodeType.ROOT),
            source_path=str(pdf_file),
            source_format="pdf",
            parser_name="PDFParser",
            parse_time=0.1,
        )
        fake_result.temp_dir_path = mock_temp

        with patch(
            "openviking.parse.parsers.directory.DirectoryParser._assign_parser",
        ) as mock_assign:
            from openviking.parse.parsers.pdf import PDFParser as _PDF

            mock_pdf = AsyncMock(spec=_PDF)
            mock_pdf.parse = AsyncMock(return_value=fake_result)

            def assign_side_effect(cf, registry):
                # Route only PDFs to the mock; everything else uses the
                # registry's normal lookup.
                if cf.path.suffix == ".pdf":
                    return mock_pdf
                return registry.get_parser_for_file(cf.path)

            mock_assign.side_effect = assign_side_effect

            await parser.parse(str(tmp_path))

        # The converted .md must land under the directory's own temp tree.
        dir_name = tmp_path.name
        found_md = any(
            uri.endswith("document.md") and f"/{dir_name}/" in uri for uri in fake_fs.files
        )
        assert found_md, f"document.md not found. Files: {list(fake_fs.files.keys())}"
Files: {list(fake_fs.files.keys())}" + + @pytest.mark.asyncio + async def test_pdf_parse_failure_adds_warning(self, tmp_path: Path, parser, fake_fs) -> None: + pdf_file = tmp_path / "bad.pdf" + pdf_file.write_bytes(b"%PDF-1.4 broken") + + with patch( + "openviking.parse.parsers.directory.DirectoryParser._assign_parser", + ) as mock_assign: + from openviking.parse.parsers.pdf import PDFParser as _PDF + + mock_pdf = AsyncMock(spec=_PDF) + mock_pdf.parse = AsyncMock(side_effect=RuntimeError("conversion failed")) + + def assign_side_effect(cf, registry): + if cf.path.suffix == ".pdf": + return mock_pdf + return registry.get_parser_for_file(cf.path) + + mock_assign.side_effect = assign_side_effect + + result = await parser.parse(str(tmp_path)) + + # Should have a warning, not a crash + assert any("bad.pdf" in w for w in result.warnings) + + +# --------------------------------------------------------------------------- +# Tests: ParseResult metadata +# --------------------------------------------------------------------------- + + +class TestParseResultMetadata: + """ParseResult should contain correct metadata.""" + + @pytest.mark.asyncio + async def test_result_fields(self, tmp_code: Path, parser, fake_fs) -> None: + result = await parser.parse(str(tmp_code)) + + assert result.parser_name == "DirectoryParser" + assert result.source_format == "directory" + assert result.source_path == str(tmp_code.resolve()) + assert result.temp_dir_path is not None + assert result.parse_time is not None + assert result.parse_time > 0 + assert result.meta["dir_name"] == tmp_code.name + assert result.meta["total_processable"] == 3 + assert result.meta["file_count"] == 3 diff --git a/tests/parse/test_directory_parser_routing.py b/tests/parse/test_directory_parser_routing.py new file mode 100644 index 00000000..88ae61bb --- /dev/null +++ b/tests/parse/test_directory_parser_routing.py @@ -0,0 +1,366 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0
"""Isolated unit tests for directory-import parser routing and path mapping.

Two independent concerns are verified without invoking the full
``ResourceService`` pipeline:

1. **Parser selection** -- given a set of file extensions, the
   ``ParserRegistry`` (and ``scan_directory``) classifies each file correctly
   and selects the expected parser type (MarkdownParser, HTMLParser,
   PDFParser, TextParser, or the text-fallback path for code / config files).

2. **Path mapping** -- the ``_process_directory_file`` helper in
   ``ResourceService`` converts each file's relative path into the correct
   Viking target URI so the imported directory structure is preserved.
   For example, ``a/b/c.md`` with base target ``viking://resources/mydir``
   produces target ``viking://resources/mydir/a/b`` and the parser names the
   document ``c``, yielding final URI ``viking://resources/mydir/a/b/c``.
"""

from pathlib import Path, PurePosixPath
from typing import Dict, List, Tuple

import pytest

from openviking.parse.directory_scan import (
    DirectoryScanResult,
    scan_directory,
)
from openviking.parse.parsers.html import HTMLParser
from openviking.parse.parsers.markdown import MarkdownParser
from openviking.parse.parsers.pdf import PDFParser
from openviking.parse.parsers.text import TextParser
from openviking.parse.registry import ParserRegistry

# ═══════════════════════════════════════════════════════════════════════════
# Part 1 – Parser selection
# ═══════════════════════════════════════════════════════════════════════════


@pytest.fixture
def registry() -> ParserRegistry:
    """Default registry (no optional parsers such as ImageParser)."""
    return ParserRegistry(register_optional=False)


# -- directory tree that covers every parser type ----------------------------


@pytest.fixture
def tmp_all_parsers(tmp_path: Path) -> Path:
    """Directory tree whose files exercise every built-in parser.

    Layout::

        tmp_path/
            docs/        guide.md, spec.markdown, readme.mdown -> MarkdownParser
            web/         index.html, page.htm                  -> HTMLParser
            pdfs/        paper.pdf                             -> PDFParser
            text/        notes.txt, log.text                   -> TextParser
            code/        app.py, main.js, style.css            -> text-fallback
            config/      settings.yaml, data.json, rules.toml  -> text-fallback
            unsupported/ image.bmp, archive.rar                -> unsupported
    """
    text_payloads = {
        "docs/guide.md": "# Guide",
        "docs/spec.markdown": "# Spec",
        "docs/readme.mdown": "# Readme",
        # NOTE(review): the HTML payloads appear empty in the source at hand —
        # confirm they were not meant to contain minimal markup (an empty
        # file may be filtered before classification).
        "web/index.html": "",
        "web/page.htm": "",
        "text/notes.txt": "plain text",
        "text/log.text": "log entry",
        "code/app.py": "print(1)",
        "code/main.js": "console.log(1)",
        "code/style.css": "body{}",
        "config/settings.yaml": "k: v",
        "config/data.json": "{}",
        "config/rules.toml": "[section]",
    }
    binary_payloads = {
        # Minimal PDF header so the file is not empty.
        "pdfs/paper.pdf": b"%PDF-1.4 minimal",
        "unsupported/image.bmp": b"BM\x00\x00",
        "unsupported/archive.rar": b"RAR\x00",
    }
    for rel, text in text_payloads.items():
        dest = tmp_path / rel
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(text, encoding="utf-8")
    for rel, blob in binary_payloads.items():
        dest = tmp_path / rel
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(blob)
    return tmp_path
class TestParserSelection:
    """Each file extension must be resolved to the correct parser class."""

    # Extension -> dedicated parser class.  Extensions with no dedicated
    # parser fall back to TextParser at parse time via ``ParserRegistry.parse``.
    DEDICATED_PARSER_MAP: Dict[str, type] = {
        ".md": MarkdownParser,
        ".markdown": MarkdownParser,
        ".mdown": MarkdownParser,
        ".html": HTMLParser,
        ".htm": HTMLParser,
        ".pdf": PDFParser,
        ".txt": TextParser,
        ".text": TextParser,
    }

    # Processable through ``is_text_file`` but with no dedicated parser in
    # the registry.
    TEXT_FALLBACK_EXTENSIONS = {".py", ".js", ".css", ".yaml", ".json", ".toml"}

    def test_dedicated_parsers_resolve(self, registry: ParserRegistry) -> None:
        """get_parser_for_file returns the correct class for each extension."""
        for ext, want in self.DEDICATED_PARSER_MAP.items():
            candidate = registry.get_parser_for_file(Path(f"/tmp/file{ext}"))
            assert candidate is not None, f"No parser returned for {ext}"
            assert isinstance(candidate, want), (
                f"{ext}: expected {want.__name__}, got {type(candidate).__name__}"
            )

    def test_text_fallback_returns_none_from_registry(self, registry: ParserRegistry) -> None:
        """Code / config extensions have no *dedicated* parser, so
        ``get_parser_for_file`` returns None.  The registry's ``parse()``
        falls back to TextParser internally."""
        for ext in self.TEXT_FALLBACK_EXTENSIONS:
            candidate = registry.get_parser_for_file(Path(f"/tmp/file{ext}"))
            assert candidate is None, (
                f"{ext}: expected None (text-fallback), got {type(candidate).__name__}"
            )

    def test_scan_classifies_all_files_correctly(
        self, tmp_all_parsers: Path, registry: ParserRegistry
    ) -> None:
        """scan_directory should mark dedicated-parser and text-fallback
        files as processable, and truly unknown formats as unsupported."""
        scanned: DirectoryScanResult = scan_directory(
            tmp_all_parsers, registry=registry, strict=False
        )

        processable_exts = {Path(f.rel_path).suffix.lower() for f in scanned.processable}
        unsupported_exts = {Path(f.rel_path).suffix.lower() for f in scanned.unsupported}

        # Dedicated-parser and text-fallback extensions are all processable.
        for ext in self.DEDICATED_PARSER_MAP:
            assert ext in processable_exts, f"{ext} should be processable"
        for ext in self.TEXT_FALLBACK_EXTENSIONS:
            assert ext in processable_exts, f"{ext} should be processable (text-fallback)"

        # Truly unknown binary formats are unsupported.
        assert ".bmp" in unsupported_exts
        assert ".rar" in unsupported_exts

    def test_each_processable_file_has_a_parser_or_is_text(
        self, tmp_all_parsers: Path, registry: ParserRegistry
    ) -> None:
        """Every processable file must either have a dedicated parser or pass
        ``is_text_file``."""
        from openviking.parse.parsers.upload_utils import is_text_file

        scanned = scan_directory(tmp_all_parsers, registry=registry, strict=False)
        for cf in scanned.processable:
            has_parser = registry.get_parser_for_file(cf.path) is not None
            text_like = is_text_file(cf.path)
            assert has_parser or text_like, (
                f"{cf.rel_path}: not a known parser type and not a text file"
            )


class TestParserCanParse:
    """Parser.can_parse must accept its own supported extensions."""

    @pytest.mark.parametrize(
        "parser_cls,filenames",
        [
            (MarkdownParser, ["doc.md", "spec.markdown", "x.mdown", "y.mkd"]),
            (HTMLParser, ["page.html", "site.htm"]),
            (PDFParser, ["paper.pdf"]),
            (TextParser, ["notes.txt", "log.text"]),
        ],
    )
    def test_can_parse_returns_true(self, parser_cls: type, filenames: List[str]) -> None:
        checker = parser_cls()
        for name in filenames:
            assert checker.can_parse(Path(name)), (
                f"{parser_cls.__name__}.can_parse('{name}') should be True"
            )

    @pytest.mark.parametrize(
        "parser_cls,filenames",
        [
            (MarkdownParser, ["file.py", "file.html", "file.pdf"]),
            (HTMLParser, ["file.md", "file.pdf", "file.txt"]),
            (PDFParser, ["file.md", "file.txt", "file.html"]),
            (TextParser, ["file.md", "file.html", "file.pdf"]),
        ],
    )
    def test_can_parse_returns_false_for_wrong_extension(
        self, parser_cls: type, filenames: List[str]
    ) -> None:
        checker = parser_cls()
        for name in filenames:
            assert not checker.can_parse(Path(name)), (
                f"{parser_cls.__name__}.can_parse('{name}') should be False"
            )
@pytest.mark.parametrize( + "parser_cls,filenames", + [ + (MarkdownParser, ["doc.md", "spec.markdown", "x.mdown", "y.mkd"]), + (HTMLParser, ["page.html", "site.htm"]), + (PDFParser, ["paper.pdf"]), + (TextParser, ["notes.txt", "log.text"]), + ], + ) + def test_can_parse_returns_true(self, parser_cls: type, filenames: List[str]) -> None: + parser = parser_cls() + for name in filenames: + assert parser.can_parse(Path(name)), ( + f"{parser_cls.__name__}.can_parse('{name}') should be True" + ) + + @pytest.mark.parametrize( + "parser_cls,filenames", + [ + (MarkdownParser, ["file.py", "file.html", "file.pdf"]), + (HTMLParser, ["file.md", "file.pdf", "file.txt"]), + (PDFParser, ["file.md", "file.txt", "file.html"]), + (TextParser, ["file.md", "file.html", "file.pdf"]), + ], + ) + def test_can_parse_returns_false_for_wrong_extension( + self, parser_cls: type, filenames: List[str] + ) -> None: + parser = parser_cls() + for name in filenames: + assert not parser.can_parse(Path(name)), ( + f"{parser_cls.__name__}.can_parse('{name}') should be False" + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Part 2 – Relative-path → Viking URI mapping +# ═══════════════════════════════════════════════════════════════════════════ + +# The mapping logic lives in ``ResourceService._process_directory_file``. +# Instead of pulling in the full service we replicate the *pure* path +# computation here so tests stay isolated and fast. + + +def _compute_file_target(rel_path: str, base_target: str) -> str: + """Replicate the target-URI computation from _process_directory_file.""" + parent_rel = str(PurePosixPath(rel_path).parent) + if parent_rel == ".": + return base_target + return f"{base_target}/{parent_rel}" + + +def _expected_final_uri(rel_path: str, base_target: str) -> str: + """Expected final URI after the parser names the document by file stem. 
class TestPathMapping:
    """Relative file paths must map onto the correct Viking URIs."""

    BASE = "viking://resources/mydir"

    # (relative_path, expected_target_for_process_resource, expected_final_uri)
    CASES: List[Tuple[str, str, str]] = [
        # Root-level file
        ("top.md", "viking://resources/mydir", "viking://resources/mydir/top"),
        ("README.txt", "viking://resources/mydir", "viking://resources/mydir/README"),
        # One level deep
        (
            "docs/guide.md",
            "viking://resources/mydir/docs",
            "viking://resources/mydir/docs/guide",
        ),
        (
            "src/app.py",
            "viking://resources/mydir/src",
            "viking://resources/mydir/src/app",
        ),
        # Two levels deep
        (
            "a/b/c.md",
            "viking://resources/mydir/a/b",
            "viking://resources/mydir/a/b/c",
        ),
        (
            "a/b/d.txt",
            "viking://resources/mydir/a/b",
            "viking://resources/mydir/a/b/d",
        ),
        # Three levels deep
        (
            "x/y/z/deep.md",
            "viking://resources/mydir/x/y/z",
            "viking://resources/mydir/x/y/z/deep",
        ),
    ]

    @pytest.mark.parametrize("rel_path,expected_target,_", CASES)
    def test_target_uri_computation(self, rel_path: str, expected_target: str, _: str) -> None:
        """_compute_file_target produces the correct parent-based target."""
        assert _compute_file_target(rel_path, self.BASE) == expected_target

    @pytest.mark.parametrize("rel_path,_,expected_uri", CASES)
    def test_final_uri_matches_rel_path_structure(
        self, rel_path: str, _: str, expected_uri: str
    ) -> None:
        """The final URI (target + file stem) preserves the directory tree."""
        assert _expected_final_uri(rel_path, self.BASE) == expected_uri


class TestPathMappingFromScan:
    """End-to-end: scan a real directory, then check that every processable
    file's relative path maps to the expected Viking URI."""

    @pytest.fixture
    def tmp_deep(self, tmp_path: Path) -> Path:
        """Three-level nested directory::

            tmp_path/
                a/
                    b/
                        c.md
                    x.md
                top.md
                src/
                    main.py
        """
        nested = tmp_path / "a" / "b"
        nested.mkdir(parents=True)
        (nested / "c.md").write_text("# C", encoding="utf-8")
        (tmp_path / "a" / "x.md").write_text("# X", encoding="utf-8")
        (tmp_path / "top.md").write_text("# Top", encoding="utf-8")
        (tmp_path / "src").mkdir()
        (tmp_path / "src" / "main.py").write_text("pass", encoding="utf-8")
        return tmp_path

    def test_scan_then_map_preserves_structure(self, tmp_deep: Path) -> None:
        """Every processable file's computed final URI must embed the same
        directory hierarchy as the original relative path."""
        scanned = scan_directory(tmp_deep, strict=False)
        base = f"viking://resources/{tmp_deep.name}"

        for cf in scanned.processable:
            rel = cf.rel_path.replace("\\", "/")  # normalize for Windows
            final_uri = _expected_final_uri(rel, base)

            # Everything after "viking://resources/" must equal
            # <dir_name>/<rel_path_without_extension>.
            uri_path = final_uri[len("viking://resources/"):]
            expected_path = f"{tmp_deep.name}/{str(PurePosixPath(rel).with_suffix(''))}"
            assert uri_path == expected_path, (
                f"Mapping mismatch for {rel}: got URI path '{uri_path}', expected '{expected_path}'"
            )

    def test_empty_directory_produces_no_mappings(self, tmp_path: Path) -> None:
        """An empty directory has no processable files → zero URI mappings."""
        (tmp_path / ".gitkeep").write_text("", encoding="utf-8")  # skipped: empty
        scanned = scan_directory(tmp_path, strict=False)
        assert len(scanned.processable) == 0