Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion examples/query/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,35 @@ uv run query.py "What do we have here?" --score-threshold 0.5
mv data/ data.bak/ # or rm -rf if you want
```

## Add Directory

`add.py` supports adding an entire directory of documents at once. Files are automatically classified and parsed by their type (PDF, Markdown, Text, code, etc.). A summary table is printed after import showing which files were processed, failed, unsupported, or filtered.

```bash
# Add all supported files in a directory
uv run add.py ~/Documents/research/

# Only include specific file types
uv run add.py ~/project/ --include '*.md' --include '*.pdf'

# Exclude certain files
uv run add.py ~/project/ --exclude 'test_*' --exclude '*.pyc'

# Skip specific sub-directories
uv run add.py ~/project/ --ignore-dirs node_modules --ignore-dirs .git

# Combine options
uv run add.py ~/project/ --include '*.md' --exclude 'draft_*' --ignore-dirs vendor
```

### Directory Options

| Option | Description |
|--------|-------------|
| `--include PATTERN` | Glob pattern for files to include (can be repeated) |
| `--exclude PATTERN` | Glob pattern for files to exclude (can be repeated) |
| `--ignore-dirs NAME` | Directory names to skip (can be repeated) |

### Query Options

| Option | Default | Description |
Expand Down Expand Up @@ -50,7 +79,7 @@ Edit `ov.conf` to configure:

```
rag.py # RAG pipeline library
add.py # Add documents CLI
add.py # Add documents/directories CLI
query.py # Query CLI
q # Quick query wrapper
logging_config.py # Logging configuration
Expand All @@ -64,3 +93,4 @@ data/ # Database storage
- Use `uv run query.py` for more control
- Set `OV_DEBUG=1` only when debugging
- Resources are indexed once and can then be queried an unlimited number of times
- When adding directories, use `--include` / `--exclude` to control which files are imported
169 changes: 157 additions & 12 deletions examples/query/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,116 @@
import json
import sys
from pathlib import Path
from typing import Any, Dict, List

from rich import box
from rich.console import Console
from rich.table import Table

import openviking as ov
from openviking_cli.utils.config.open_viking_config import OpenVikingConfig

console = Console()


# ── Table helpers ──────────────────────────────────────────────────


def _print_directory_summary(meta: Dict[str, Any], errors: List[str]) -> None:
    """Print a rich-table summary for a directory import.

    Args:
        meta: Import metadata. The lists stored under ``processed_files``,
            ``failed_files``, ``unsupported_files`` and ``skipped_files`` are
            read; every entry is expected to be a dict with a ``path`` key
            (processed entries may carry ``parser``, skipped entries may
            carry ``status``).
        errors: Error/warning messages from the import. A message is shown as
            the failure reason of a failed file when it contains that file's
            path.
    """
    # Function-scope import: only this helper needs it, and the file already
    # depends on rich.
    from rich.markup import escape

    def _cell(value: Any) -> str:
        # File names and error text are arbitrary strings; escape them so
        # square brackets are shown literally instead of being parsed as
        # Rich markup tags.
        return escape(str(value))

    # ``or []`` also covers a key that is present but set to None.
    processed: List[Dict[str, str]] = meta.get("processed_files") or []
    failed: List[Dict[str, str]] = meta.get("failed_files") or []
    unsupported: List[Dict[str, str]] = meta.get("unsupported_files") or []
    skipped: List[Dict[str, str]] = meta.get("skipped_files") or []

    n_total = len(processed) + len(failed) + len(unsupported) + len(skipped)
    if n_total == 0:
        console.print(" (no files found)", style="dim")
        return

    # Build a single combined table (ROUNDED box style, same as query.py)
    table = Table(
        title=f"Directory Import ({n_total} files)",
        box=box.ROUNDED,
        show_header=True,
        header_style="bold magenta",
        title_style="bold magenta",
    )
    table.add_column("#", style="cyan", width=4)
    table.add_column("Status", no_wrap=True)
    table.add_column("File", style="bold white", no_wrap=True)
    table.add_column("Detail")

    # Match failed files to their warning messages. Paths are tried longest
    # first so that a short path which happens to be a substring of a longer
    # one (``a.md`` vs ``data.md``) does not claim the longer file's message.
    failed_paths = sorted((str(f["path"]) for f in failed), key=len, reverse=True)
    fail_reasons: Dict[str, str] = {}
    for err in errors or []:
        message = str(err)
        for candidate in failed_paths:
            if candidate in message:
                fail_reasons[candidate] = message
                break

    # Collect (status, file, detail) cells in display order, then number them.
    rows = []
    for entry in processed:
        rows.append(
            (
                "[green]processed[/green]",
                _cell(entry["path"]),
                f"[dim]{_cell(entry.get('parser', ''))}[/dim]",
            )
        )
    for entry in failed:
        reason = fail_reasons.get(str(entry["path"]), "parse error")
        rows.append(
            (
                "[red]failed[/red]",
                _cell(entry["path"]),
                f"[red]{_cell(reason)}[/red]",
            )
        )
    for entry in unsupported:
        rows.append(("[yellow]unsupported[/yellow]", _cell(entry["path"]), ""))
    for entry in skipped:
        status = entry.get("status", "skip")
        rows.append(
            (
                f"[dim]{_cell(status)}[/dim]",
                f"[dim]{_cell(entry['path'])}[/dim]",
                "",
            )
        )

    for idx, (status_cell, file_cell, detail_cell) in enumerate(rows, start=1):
        table.add_row(str(idx), status_cell, file_cell, detail_cell)

    console.print()
    console.print(table)


# ── Main logic ─────────────────────────────────────────────────────


def add_resource(
resource_path: str,
config_path: str = "./ov.conf",
data_path: str = "./data",
**kwargs,
):
"""
Add a resource to OpenViking database

Args:
resource_path: Path to file, directory, or URL
config_path: Path to config file
data_path: Path to data directory
**kwargs: Extra options forwarded to ``add_resource`` (e.g.
``include``, ``exclude``, ``ignore_dirs``).
"""
# Load config
print(f"📋 Loading config from: {config_path}")
Expand All @@ -34,25 +131,37 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path:
client.initialize()
print("✓ Initialized\n")

print(f"📂 Adding resource: {resource_path}")

# Check if it's a file and exists
if not resource_path.startswith("http"):
# Check if it's a local path and exists
is_local = not resource_path.startswith("http")
is_directory = False
if is_local:
path = Path(resource_path).expanduser()
if not path.exists():
print(f"❌ Error: File not found: {path}")
print(f"❌ Error: Path not found: {path}")
return False
is_directory = path.is_dir()

result = client.add_resource(path=resource_path)
if is_directory:
print(f"📂 Adding directory: {resource_path}")
else:
print(f"📄 Adding resource: {resource_path}")

result = client.add_resource(path=resource_path, **kwargs)

# Check result
if result and "root_uri" in result:
root_uri = result["root_uri"]
print(f"✓ Resource added: {root_uri}\n")
meta = result.get("meta", {})
errors = result.get("errors", [])
print(f"✓ Resource added: {root_uri}")

# Show directory-specific table
if is_directory:
_print_directory_summary(meta, errors)

# Wait for processing
print("⏳ Processing and indexing...")
client.wait_processed(timeout=300)
print("\n⏳ Processing and indexing...")
client.wait_processed(timeout=600 if is_directory else 300)
print("✓ Processing complete!\n")

print("🎉 Resource is now searchable in the database!")
Expand All @@ -61,7 +170,7 @@ def add_resource(resource_path: str, config_path: str = "./ov.conf", data_path:
elif result and result.get("status") == "error":
print("\n⚠️ Resource had parsing issues:")
if "errors" in result:
for error in result["errors"][:3]:
for error in result["errors"][:5]:
print(f" - {error}")
print("\n💡 Some content may still be searchable.")
return False
Expand Down Expand Up @@ -123,6 +232,31 @@ def main():
"--data", type=str, default="./data", help="Path to data directory (default: ./data)"
)

# Directory-specific options
dir_group = parser.add_argument_group("directory options")
dir_group.add_argument(
"--include",
type=str,
action="append",
default=None,
help="Glob pattern for files to include (can be repeated, e.g. --include '*.md')",
)
dir_group.add_argument(
"--exclude",
type=str,
action="append",
default=None,
help="Glob pattern for files to exclude (can be repeated, e.g. --exclude 'test_*')",
)
dir_group.add_argument(
"--ignore-dirs",
type=str,
action="append",
default=None,
dest="ignore_dirs",
help="Directory names to skip (can be repeated, e.g. --ignore-dirs node_modules)",
)

args = parser.parse_args()

# Expand user paths
Expand All @@ -132,8 +266,19 @@ def main():
else args.resource
)

# Build kwargs for directory options
# scan_directory expects include/exclude as comma-separated strings,
# and ignore_dirs as a Set[str].
dir_kwargs = {}
if args.include:
dir_kwargs["include"] = ",".join(args.include)
if args.exclude:
dir_kwargs["exclude"] = ",".join(args.exclude)
if args.ignore_dirs:
dir_kwargs["ignore_dirs"] = set(args.ignore_dirs)

# Add the resource
success = add_resource(resource_path, args.config, args.data)
success = add_resource(resource_path, args.config, args.data, **dir_kwargs)

sys.exit(0 if success else 1)

Expand Down
4 changes: 4 additions & 0 deletions openviking/async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,12 +143,15 @@ async def add_resource(
instruction: str = "",
wait: bool = False,
timeout: float = None,
**kwargs,
) -> Dict[str, Any]:
"""Add resource to OpenViking (only supports resources scope).

Args:
wait: Whether to wait for semantic extraction and vectorization to complete
timeout: Wait timeout in seconds
**kwargs: Extra options forwarded to the parser chain, e.g.
``strict``, ``ignore_dirs``, ``include``, ``exclude``.
"""
await self._ensure_initialized()
return await self._client.add_resource(
Expand All @@ -158,6 +161,7 @@ async def add_resource(
instruction=instruction,
wait=wait,
timeout=timeout,
**kwargs,
)

async def wait_processed(self, timeout: float = None) -> Dict[str, Any]:
Expand Down
2 changes: 2 additions & 0 deletions openviking/client/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ async def add_resource(
instruction: str = "",
wait: bool = False,
timeout: Optional[float] = None,
**kwargs,
) -> Dict[str, Any]:
"""Add resource to OpenViking."""
return await self._service.resources.add_resource(
Expand All @@ -67,6 +68,7 @@ async def add_resource(
instruction=instruction,
wait=wait,
timeout=timeout,
**kwargs,
)

async def add_skill(
Expand Down
Loading
Loading