diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 66430907..56586218 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -460,7 +460,7 @@ async fn main() { } async fn handle_add_resource( - path: String, + mut path: String, to: Option<String>, reason: String, instruction: String, @@ -468,6 +468,34 @@ async fn handle_add_resource( timeout: Option<u64>, ctx: CliContext, ) -> Result<()> { + // Validate path: if it's a local path, check if it exists + if !path.starts_with("http://") && !path.starts_with("https://") { + use std::path::Path; + + // Unescape path: replace backslash followed by space with just space + let unescaped_path = path.replace("\\ ", " "); + let path_obj = Path::new(&unescaped_path); + if !path_obj.exists() { + eprintln!("Error: Path '{}' does not exist.", path); + + // Check if there might be unquoted spaces + use std::env; + let args: Vec<String> = env::args().collect(); + + if let Some(add_resource_pos) = args.iter().position(|s| s == "add-resource" || s == "add") { + if args.len() > add_resource_pos + 2 { + let extra_args = &args[add_resource_pos + 2..]; + let suggested_path = format!("{} {}", path, extra_args.join(" ")); + eprintln!("\nIt looks like you may have forgotten to quote a path with spaces."); + eprintln!("Suggested command: ov add-resource \"{}\"", suggested_path); + } + } + + std::process::exit(1); + } + path = unescaped_path; + } + let client = ctx.get_client(); commands::resources::add_resource( &client, &path, to, reason, instruction, wait, timeout, ctx.output_format, ctx.compact diff --git a/examples/chatmem/ov.conf.example b/examples/chatmem/ov.conf.example index 2e9a40ae..6a085e5d 100644 --- a/examples/chatmem/ov.conf.example +++ b/examples/chatmem/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/mcp-query/ov.conf.example b/examples/mcp-query/ov.conf.example index bf4a45de..fc40ea92 100644 --- a/examples/mcp-query/ov.conf.example +++ b/examples/mcp-query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "", "provider" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/memex/ov.conf.example b/examples/memex/ov.conf.example index d5187cd3..42697b72 100644 --- a/examples/memex/ov.conf.example +++ b/examples/memex/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark.cn-beijing.volces.com/api/v3", "api_key" : "your-volcengine-api-key", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 205cd7d9..34cbc6a4 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -45,7 +45,8 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false }, "rerank": { "ak": null, diff --git a/examples/query/ov.conf.example b/examples/query/ov.conf.example index 58d034c0..fdc7cb55 100644 --- a/examples/query/ov.conf.example +++ b/examples/query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "provider" : "volcengine", - "model" : 
"doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example index 582d79b8..13eb55db 100644 --- a/examples/server_client/ov.conf.example +++ b/examples/server_client/ov.conf.example @@ -34,6 +34,7 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false } } diff --git a/openviking/models/vlm/backends/litellm_vlm.py b/openviking/models/vlm/backends/litellm_vlm.py index f1efa562..2373e5dd 100644 --- a/openviking/models/vlm/backends/litellm_vlm.py +++ b/openviking/models/vlm/backends/litellm_vlm.py @@ -139,21 +139,31 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]: return kwargs - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = completion(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking last_error = None for attempt in range(max_retries + 1): @@ -164,7 +174,7 @@ async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: except Exception as e: last_error = e if attempt < max_retries: - await asyncio.sleep(2 ** attempt) + await asyncio.sleep(2**attempt) if last_error: raise last_error @@ -174,6 +184,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -184,7 +195,11 @@ def get_vision_completion( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = completion(**kwargs) self._update_token_usage_from_response(response) @@ -194,6 +209,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -204,7 +220,11 @@ async def get_vision_completion_async( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = await acompletion(**kwargs) self._update_token_usage_from_response(response) diff --git 
a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index c6c5b230..18a22ff7 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -131,6 +131,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" client = self.get_client() @@ -140,11 +141,16 @@ def get_vision_completion( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" @@ -152,6 +158,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" client = self.get_async_client() @@ -161,10 +168,15 @@ async def get_vision_completion_async( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = await client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = await client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py index f11a289e..b5841cc8 100644 --- a/openviking/models/vlm/backends/volcengine_vlm.py +++ b/openviking/models/vlm/backends/volcengine_vlm.py @@ -54,22 +54,26 @@ def get_async_client(self): ) return self._async_client - def get_completion(self, prompt: str) -> str: - return super().get_completion(prompt) + def get_completion(self, prompt: str, thinking: bool = False) -> str: + return super().get_completion(prompt, thinking) - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: - return await super().get_completion_async(prompt, max_retries) + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: + return await super().get_completion_async(prompt, thinking, max_retries) def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return super().get_vision_completion(prompt, images) + return super().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return await super().get_vision_completion_async(prompt, images) + return await super().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking/models/vlm/base.py 
b/openviking/models/vlm/base.py index ef55f712..cd563f9c 100644 --- a/openviking/models/vlm/base.py +++ b/openviking/models/vlm/base.py @@ -27,12 +27,14 @@ def __init__(self, config: Dict[str, Any]): self._token_tracker = TokenUsageTracker() @abstractmethod - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion""" pass @abstractmethod - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously""" pass @@ -41,6 +43,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" pass @@ -50,6 +53,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" pass @@ -128,16 +132,20 @@ def create(config: Dict[str, Any]) -> VLMBase: if not use_litellm: if provider == "openai": from .backends.openai_vlm import OpenAIVLM + return OpenAIVLM(config) elif provider == "volcengine": from .backends.volcengine_vlm import VolcEngineVLM + return VolcEngineVLM(config) from .backends.litellm_vlm import LiteLLMVLMProvider + return LiteLLMVLMProvider(config) @staticmethod def get_available_providers() -> List[str]: """Get list of available providers""" from .registry import get_all_provider_names + return get_all_provider_names() diff --git a/openviking/models/vlm/llm.py b/openviking/models/vlm/llm.py index 07c52179..6c8b9c56 100644 --- a/openviking/models/vlm/llm.py +++ b/openviking/models/vlm/llm.py @@ -168,34 +168,38 @@ def complete_json( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, ) -> Optional[Dict[str, Any]]: """Get JSON completion from VLM.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = self._get_vlm().get_completion(prompt) + response = self._get_vlm().get_completion(prompt, thinking) return parse_json_from_response(response) async def complete_json_async( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, + max_retries: int = 0, ) -> Optional[Dict[str, Any]]: """Async version of complete_json.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = await self._get_vlm().get_completion_async(prompt) + response = await self._get_vlm().get_completion_async(prompt, thinking, max_retries) return parse_json_from_response(response) def complete_model( self, prompt: str, model_class: Type[T], + thinking: bool = False, ) -> Optional[T]: """Get structured completion validated against a Pydantic model.""" schema = model_class.model_json_schema() - response = self.complete_json(prompt, schema=schema) + response = self.complete_json(prompt, schema=schema, thinking=thinking) if response is None: return None @@ -209,10 +213,14 @@ async def complete_model_async( self, prompt: str, model_class: Type[T], + thinking: bool = False, + max_retries: int = 0, ) -> Optional[T]: """Async version of complete_model.""" schema = model_class.model_json_schema() - response = await self.complete_json_async(prompt, schema=schema) + response = await self.complete_json_async( + prompt, schema=schema, thinking=thinking, max_retries=max_retries + ) if response is None: return None @@ -226,14 +234,16 @@ def get_vision_completion( self, 
prompt: str, images: list, + thinking: bool = False, ) -> str: """Get vision completion.""" - return self._get_vlm().get_vision_completion(prompt, images) + return self._get_vlm().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Async vision completion.""" - return await self._get_vlm().get_vision_completion_async(prompt, images) + return await self._get_vlm().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index b8bcefe0..94fcc23c 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -144,10 +144,66 @@ L1: """ Code parser supporting syntax highlighting and code-structure analysis; it can recognize functions, classes, methods, and other code elements. ### 6. MediaParser (`media.py`) -**Supported formats**: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.mp4`, `.mov`, `.avi`, `.webm`, `.mp3`, `.wav`, `.m4a`, `.flac` + +**Supported formats**: +- Images: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp` +- Video: `.mp4`, `.mov`, `.avi`, `.webm` +- Audio: `.mp3`, `.wav`, `.m4a`, `.flac` A multimedia parser that uses a VLM (vision language model) to analyze image, video, and audio content and generate text descriptions. +#### Storage Organization Strategy + +Multimedia files are stored using the following hierarchy: + +``` +viking://resource/ +├── images/ # image files +│ └── 20240820/ # upload date (YYYYMMDD) +│ └── 20240820_123456_jpg/ # file folder (filename_extension) +│ ├── .abstract.md # L0 abstract +│ ├── .overview.md # L1 overview +│ └── 20240820_123456.jpg # original file +├── audio/ # audio files +│ └── 20240820/ +│ └── my_song_mp3/ +│ ├── .abstract.md +│ ├── .overview.md +│ └── my_song.mp3 +└── video/ # video files + └── 20240820/ + └── my_video_mp4/ + ├── .abstract.md + ├── .overview.md + └── my_video.mp4 +``` + +Details: + +1. **Media subdirectories**: `viking://resource` is divided into three type-specific subdirectories + - `viking://resource/images`: stores image files with no explicitly specified target path + - `viking://resource/audio`: stores audio files with no explicitly specified target path + - `viking://resource/video`: stores video files with no explicitly specified target path + +2. **Date subdirectory**: each upload is organized by the current upload date (format: YYYYMMDD), not by timestamps from the file's internal metadata + - Example: `viking://resource/images/20240820/` holds all images uploaded on 2024-08-20 + +3. **File folder**: each media file gets a dedicated folder, named `filename_extension` (extension without the dot) + - Example: uploading `20240820_123456.jpg` → creates folder `20240820_123456_jpg` + +4. 
**Folder contents**: each file folder contains: + - **Original file**: keeps the original filename, with space characters replaced by underscores `_` (because OpenViking URIs may not contain spaces) + - Example: `photo 1.jpg` → saved as `photo_1.jpg` + - `.abstract.md` (L0 layer): abstract info (<200 tokens) + - Images: filename, content description, visual style, etc. + - Audio: filename, duration, content summary, etc. + - Video: filename, duration, content summary, etc. + - `.overview.md` (L1 layer): overview info + - Images: beyond the abstract, also dimensions, OCR results, scene and subject descriptions, etc. + - Audio: beyond the abstract, also speech/lyrics transcription, chapter timeline, etc. + - Video: beyond the abstract, also usage scenarios, etc. (splitting videos and recursively storing sub-files will be supported in the future) + + ## Core Components ### BaseParser (`base_parser.py`) diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 28e47885..85fd0c0a 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -601,6 +601,10 @@ async def parse_content( def _sanitize_for_path(self, text: str) -> str: """Sanitize text for use in file path.""" - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index e6ddbe3a..1570baf3 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -334,7 +334,11 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]: return parts if parts else [content] def _sanitize_for_path(self, text: str) -> str: - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/media.py b/openviking/parse/parsers/media.py deleted file mode 100644 index cb25a803..00000000 --- a/openviking/parse/parsers/media.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Media parser interfaces for OpenViking - Future expansion. - -This module defines parser interfaces for media types (image, audio, video). -These are placeholder implementations that raise NotImplementedError. -They serve as a design reference for future media parsing capabilities. - -For current document parsing (PDF, Markdown, HTML, Text), see other parser modules. -""" - -from pathlib import Path -from typing import List, Optional, Union - -from PIL import Image - -from openviking.parse.base import NodeType, ParseResult, ResourceNode -from openviking.parse.parsers.base_parser import BaseParser -from openviking_cli.utils.config.parser_config import AudioConfig, ImageConfig, VideoConfig - -# ============================================================================= -# Configuration Classes -# ============================================================================= - - -# ============================================================================= -# Parser Classes -# ============================================================================= - - -class ImageParser(BaseParser): - """ - Image parser - Future implementation. - - Planned Features: - 1. Visual content understanding using VLM (Vision Language Model) - 2. OCR text extraction for images containing text - 3. Metadata extraction (dimensions, format, EXIF data) - 4. Generate semantic description and structured ResourceNode - - Example workflow: - 1. Load image file - 2. (Optional) Perform OCR to extract text - 3. (Optional) Use VLM to generate visual description - 4. 
Create ResourceNode with image metadata and descriptions - 5. Return ParseResult - - Supported formats: PNG, JPG, JPEG, GIF, BMP, WEBP, SVG - """ - - def __init__(self, config: Optional[ImageConfig] = None, **kwargs): - """ - Initialize ImageParser. - - Args: - config: Image parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or ImageConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported image file extensions.""" - return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse image file using three-phase architecture. - - Phase 1: Generate temporary files - - Copy original image to temp_uri/content.{ext} - - Generate description.md using VLM - - (Optional) Generate ocr.md using OCR - - Phase 2: Generate semantic info - - Generate abstract and overview based on description.md - - Overview includes file list and usage instructions - - Phase 3: Build directory structure - - Move all files to final URI - - Generate .abstract.md, .overview.md - - Args: - source: Image file path - **kwargs: Additional parsing parameters - - Returns: - ParseResult with image content - - Raises: - FileNotFoundError: If source file does not exist - IOError: If image processing fails - """ - from openviking.storage.viking_fs import get_viking_fs - - # Convert to Path object - file_path = Path(source) if isinstance(source, str) else source - if not file_path.exists(): - raise FileNotFoundError(f"Image file not found: {source}") - - viking_fs = get_viking_fs() - temp_uri = viking_fs.create_temp_uri() - - # Phase 1: Generate temporary files - image_bytes = file_path.read_bytes() - ext = file_path.suffix - - # 1.1 Save original image - await viking_fs.write_file_bytes(f"{temp_uri}/content{ext}", image_bytes) - - # 1.2 Extract image metadata - try: - img = Image.open(file_path) - width, height = img.size - format_str = img.format or ext[1:].upper() - except Exception: - width, height = 0, 0 - format_str = ext[1:].upper() - - # 1.3 Generate VLM description - description = "" - if self.config.enable_vlm: - description = await self._vlm_describe(image_bytes, self.config.vlm_model) - else: - # Fallback: basic description - description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" - - await viking_fs.write_file(f"{temp_uri}/description.md", description) - - # 1.4 OCR (optional) - ocr_text = None - if self.config.enable_ocr: - ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) - if ocr_text: - await viking_fs.write_file(f"{temp_uri}/ocr.md", ocr_text) - - # Create ResourceNode - root_node = ResourceNode( - type=NodeType.ROOT, - title=file_path.stem, - level=0, - detail_file=None, - content_path=None, - children=[], - meta={ - "width": width, - "height": height, - "format": format_str.lower(), - "content_type": "image", - "source_title": file_path.stem, - "semantic_name": file_path.stem, - }, - ) - - # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, temp_uri, viking_fs, ocr_text is not None) - - # Phase 3: Build directory structure (handled by TreeBuilder) - return ParseResult( - root=root_node, - source_path=str(file_path), - temp_dir_path=temp_uri, - source_format="image", - parser_name="ImageParser", - meta={"content_type": "image", "format": format_str.lower()}, - ) - - async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: - """ 
- Generate image description using VLM. - - Args: - image_bytes: Image binary data - model: VLM model name - - Returns: - Image description in markdown format - - TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.) - """ - # Fallback implementation - returns basic placeholder - return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API." - - async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: - """ - Extract text from image using OCR. - - Args: - image_bytes: Image binary data - lang: OCR language code - - Returns: - Extracted text in markdown format, or None if no text found - - TODO: Integrate with OCR API (Tesseract, PaddleOCR, etc.) - """ - # Not implemented - return None - return None - - async def _generate_semantic_info( - self, node: ResourceNode, temp_uri: str, viking_fs, has_ocr: bool - ): - """ - Phase 2: Generate abstract and overview. - - Args: - node: ResourceNode to update - temp_uri: Temporary URI - viking_fs: VikingFS instance - has_ocr: Whether OCR file exists - """ - # Read description.md - description = await viking_fs.read_file(f"{temp_uri}/description.md") - - # Generate abstract (short summary, < 100 tokens) - abstract = description[:200] if len(description) > 200 else description - - # Generate overview (content summary + file list + usage instructions) - overview_parts = [ - "## Content Summary\n", - description, - "\n\n## Available Files\n", - f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", - "- description.md: Detailed image description generated by VLM\n", - ] - - if has_ocr: - overview_parts.append("- ocr.md: OCR text recognition result from the image\n") - - overview_parts.append("\n## Usage\n") - overview_parts.append("### View Image\n") - overview_parts.append("```python\n") - overview_parts.append("image_bytes = await image_resource.view()\n") - overview_parts.append("# Returns: PNG/JPG format image binary data\n") - overview_parts.append("# Purpose: Display or save the image\n") - overview_parts.append("```\n\n") - - overview_parts.append("### Get VLM-generated Image Description\n") - overview_parts.append("```python\n") - overview_parts.append("description = await image_resource.description()\n") - overview_parts.append("# Returns: FileContent object for further processing\n") - overview_parts.append("# Purpose: Understand image content\n") - overview_parts.append("```\n\n") - - if has_ocr: - overview_parts.append("### Get OCR-recognized Text\n") - overview_parts.append("```python\n") - overview_parts.append("ocr_text = await image_resource.ocr()\n") - overview_parts.append("# Returns: FileContent object or None\n") - overview_parts.append("# Purpose: Extract text information from the image\n") - overview_parts.append("```\n\n") - - overview_parts.append("### Get Image Metadata\n") - overview_parts.append("```python\n") - overview_parts.append( - f"size = image_resource.get_size() # ({node.meta['width']}, {node.meta['height']})\n" - ) - overview_parts.append(f'format = image_resource.get_format() # "{node.meta["format"]}"\n') - overview_parts.append("```\n") - - overview = "".join(overview_parts) - - # Store in node meta - node.meta["abstract"] = abstract - node.meta["overview"] = overview - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse image 
from content string - Not yet implemented. - - Args: - content: Image content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with image content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Image parsing not yet implemented") - - -class AudioParser(BaseParser): - """ - Audio parser - Future implementation. - - Planned Features: - 1. Speech-to-text transcription using ASR models - 2. Audio metadata extraction (duration, sample rate, channels) - 3. Speaker diarization (identify different speakers) - 4. Timestamp alignment for transcribed text - 5. Generate structured ResourceNode with transcript - - Example workflow: - 1. Load audio file - 2. Extract metadata (duration, format, sample rate) - 3. Transcribe speech to text using Whisper or similar - 4. (Optional) Perform speaker diarization - 5. Create ResourceNode with: - - type: NodeType.ROOT - - children: sections for each speaker/timestamp - - meta: audio metadata and timestamps - 6. Return ParseResult - - Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A - """ - - def __init__(self, config: Optional[AudioConfig] = None, **kwargs): - """ - Initialize AudioParser. - - Args: - config: Audio parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or AudioConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported audio file extensions.""" - return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse audio file - Not yet implemented. - - Planned implementation: - 1. Load audio file - 2. Extract metadata using librosa or similar - 3. If enable_transcription: - - Transcribe using Whisper or similar ASR model - - Generate timestamps for each segment - - (Optional) Perform speaker diarization - 4. Create ResourceNode tree: - - Root node with audio metadata - - Child nodes for each transcribed segment - 5. Return ParseResult - - Args: - source: Audio file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Audio parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Speech-to-text transcription (Whisper)" - "\n- Speaker diarization" - "\n- Timestamp alignment" - "\n- Audio metadata extraction" - "\n\nWorkaround: Extract audio manually and add transcripts as " - "text or markdown files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse audio from content string - Not yet implemented. - - Args: - content: Audio content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Audio parsing not yet implemented") - - -class VideoParser(BaseParser): - """ - Video parser - Future implementation. - - Planned Features: - 1. Key frame extraction at regular intervals - 2. Audio track transcription using ASR - 3. VLM-based scene description for key frames - 4. 
Video metadata extraction (duration, resolution, codec) - 5. Generate structured ResourceNode combining visual and audio - - Example workflow: - 1. Load video file - 2. Extract metadata (duration, resolution, fps) - 3. Extract audio track → transcribe using AudioParser - 4. Extract key frames at specified intervals - 5. For each frame: generate VLM description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: timeline nodes (each with frame + transcript) - 7. Return ParseResult - - Supported formats: MP4, AVI, MOV, MKV, WEBM - """ - - def __init__(self, config: Optional[VideoConfig] = None, **kwargs): - """ - Initialize VideoParser. - - Args: - config: Video parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or VideoConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported video file extensions.""" - return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse video file - Not yet implemented. - - Planned implementation: - 1. Load video file using cv2 or similar - 2. Extract metadata (duration, resolution, fps, codec) - 3. Extract audio track: - - Save as temporary audio file - - Parse using AudioParser - 4. Extract key frames: - - At specified intervals (e.g., every 10 seconds) - - Save frames as images - 5. For each frame (if enable_vlm_description): - - Use VLM to generate scene description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: Timeline segments - - Each segment contains: - - Timestamp - - Frame description (VLM) - - Transcript (ASR) - 7. Return ParseResult - - Args: - source: Video file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Video parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Key frame extraction" - "\n- Audio track transcription" - "\n- VLM scene description" - "\n- Timeline-based structured output" - "\n\nWorkaround: Extract frames and audio manually, then process " - "as images and audio files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse video from content string - Not yet implemented. - - Args: - content: Video content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Video parsing not yet implemented") - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def is_media_parser_available(parser_type: str) -> bool: - """ - Check if a media parser type is currently available. - - Args: - parser_type: Type of parser ("image", "audio", "video") - - Returns: - False (all media parsers are future implementations) - - Examples: - >>> is_media_parser_available("image") - False - >>> is_media_parser_available("video") - False - """ - return False - - -def get_media_parser_status() -> dict: - """ - Get status of all media parsers. 
- - Returns: - Dictionary with parser names and their implementation status - - Examples: - >>> status = get_media_parser_status() - >>> print(status) - { - "image": "planned", - "audio": "planned", - "video": "planned" - } - """ - return { - "image": "planned", - "audio": "planned", - "video": "planned", - } diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py new file mode 100644 index 00000000..7fed46b5 --- /dev/null +++ b/openviking/parse/parsers/media/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +from .audio import AudioParser +from .image import ImageParser +from .video import VideoParser + +__all__ = ["ImageParser", "AudioParser", "VideoParser"] diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py new file mode 100644 index 00000000..f0a018c3 --- /dev/null +++ b/openviking/parse/parsers/media/audio.py @@ -0,0 +1,307 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Audio parser - Future implementation. + +Planned Features: +1. Speech-to-text transcription using ASR models +2. Audio metadata extraction (duration, sample rate, channels) +3. Speaker diarization (identify different speakers) +4. Timestamp alignment for transcribed text +5. Generate structured ResourceNode with transcript + +Example workflow: + 1. Load audio file + 2. Extract metadata (duration, format, sample rate) + 3. Transcribe speech to text using Whisper or similar + 4. (Optional) Perform speaker diarization + 5. Create ResourceNode with: + - type: NodeType.ROOT + - children: sections for each speaker/timestamp + - meta: audio metadata and timestamps + 6. Return ParseResult + +Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import AudioConfig + + +class AudioParser(BaseParser): + """ + Audio parser for audio files. + """ + + def __init__(self, config: Optional[AudioConfig] = None, **kwargs): + """ + Initialize AudioParser. + + Args: + config: Audio parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or AudioConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported audio file extensions.""" + return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse audio file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original audio to temp_uri/content.{ext} + - (Optional) Generate transcript with timestamps + + Phase 2: Generate semantic info + - Generate abstract and overview based on description + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Audio file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If audio processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Audio file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + audio_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original audio with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", audio_bytes) + + # 1.2 Validate audio file using magic bytes + # Define magic bytes for supported audio formats + audio_magic_bytes = { + ".mp3": [b"ID3", b"\xff\xfb", b"\xff\xf3", b"\xff\xf2"], + ".wav": [b"RIFF"], + ".ogg": [b"OggS"], + ".flac": [b"fLaC"], + ".aac": [b"\xff\xf1", b"\xff\xf9"], + ".m4a": [b"\x00\x00\x00", b"ftypM4A", b"ftypisom"], + ".opus": [b"OggS"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = audio_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(audio_bytes) >= len(magic) and audio_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid audio file: {file_path}. 
File signature does not match expected format {ext_lower}" + ) + + # Extract audio metadata (placeholder) + duration = 0 + sample_rate = 0 + channels = 0 + format_str = ext[1:].upper() + + # 1.3 Generate ASR description + description = "" + if self.config.enable_transcription: + description = await self._asr_transcribe(audio_bytes, self.config.asr_model) + else: + # Fallback: basic description + description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)" + + # 1.4 Transcript with timestamps (optional) + transcript_text = None + if self.config.enable_transcription and self.config.enable_timestamps: + transcript_text = await self._asr_transcribe_with_timestamps( + audio_bytes, self.config.asr_model + ) + if transcript_text: + await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text) + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "duration": duration, + "sample_rate": sample_rate, + "channels": channels, + "format": format_str.lower(), + "content_type": "audio", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + "original_filename": original_filename, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info( + root_node, description, viking_fs, transcript_text is not None + ) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="audio", + parser_name="AudioParser", + meta={"content_type": "audio", "format": format_str.lower()}, + ) + + async def _asr_transcribe(self, audio_bytes: bytes, model: Optional[str]) -> str: + """ + Generate audio transcription using ASR. + + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Audio transcription in markdown format + + TODO: Integrate with actual ASR API (Whisper, etc.) + """ + # Fallback implementation - returns basic placeholder + return "Audio transcription (ASR integration pending)\n\nThis is an audio. ASR transcription feature has not yet integrated external API." + + async def _asr_transcribe_with_timestamps( + self, audio_bytes: bytes, model: Optional[str] + ) -> Optional[str]: + """ + Extract transcription with timestamps from audio using ASR. + + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Transcript with timestamps in markdown format, or None if not available + + TODO: Integrate with ASR API + """ + # Not implemented - return None + return None + + async def _generate_semantic_info( + self, node: ResourceNode, description: str, viking_fs, has_transcript: bool + ): + """ + Phase 2: Generate abstract and overview. 
+ + Args: + node: ResourceNode to update + description: Audio description + viking_fs: VikingFS instance + has_transcript: Whether transcript file exists + """ + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- {node.meta['original_filename']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n", + ] + + if has_transcript: + overview_parts.append("- transcript.md: Transcript with timestamps from the audio\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### Play Audio\n") + overview_parts.append("```python\n") + overview_parts.append("audio_bytes = await audio_resource.play()\n") + overview_parts.append("# Returns: Audio file binary data\n") + overview_parts.append("# Purpose: Play or save the audio\n") + overview_parts.append("```\n\n") + + if has_transcript: + overview_parts.append("### Get Timestamped Transcript\n") + overview_parts.append("```python\n") + overview_parts.append("timestamps = await audio_resource.timestamps()\n") + overview_parts.append("# Returns: FileContent object or None\n") + overview_parts.append("# Purpose: Extract timestamped transcript from the audio\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Audio Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"duration = audio_resource.get_duration() # {node.meta['duration']}s\n" + ) + overview_parts.append( + f"sample_rate = audio_resource.get_sample_rate() # {node.meta['sample_rate']}Hz\n" + ) + overview_parts.append( + f"channels = audio_resource.get_channels() # {node.meta['channels']}\n" + ) + overview_parts.append(f'format = audio_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse audio from content string - Not yet implemented. + + Args: + content: Audio content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Audio parsing from content not yet implemented") diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py new file mode 100644 index 00000000..c82b9589 --- /dev/null +++ b/openviking/parse/parsers/media/image.py @@ -0,0 +1,285 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Image parser for OpenViking. + +This module implements the image parsing pipeline (file validation, metadata +extraction, and semantic info generation). VLM description and OCR extraction +remain placeholder hooks pending external API integration. + +For document parsing (PDF, Markdown, HTML, Text), see other parser modules. +""" + +from pathlib import Path +from typing import List, Optional, Union + +from PIL import Image + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import ImageConfig + +# ============================================================================= +# Configuration Classes +# ============================================================================= + + +# ============================================================================= +# Parser Classes +# ============================================================================= + + +class ImageParser(BaseParser): + """ + Image parser for image files. + + Features (VLM description and OCR integration still pending): + 1. Visual content understanding using VLM (Vision Language Model) + 2. OCR text extraction for images containing text + 3. Metadata extraction (dimensions, format, EXIF data) + 4. Generate semantic description and structured ResourceNode + + Example workflow: + 1. Load image file + 2. (Optional) Perform OCR to extract text + 3. (Optional) Use VLM to generate visual description + 4. Create ResourceNode with image metadata and descriptions + 5. Return ParseResult + + Supported formats: PNG, JPG, JPEG, GIF, BMP, WEBP, SVG + """ + + def __init__(self, config: Optional[ImageConfig] = None, **kwargs): + """ + Initialize ImageParser. + + Args: + config: Image parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or ImageConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported image file extensions.""" + return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse image file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original image to temp_uri/content.{ext} + - (Optional) Generate ocr.md using OCR + + Phase 2: Generate semantic info + - Generate abstract and overview based on description + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Image file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with image content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If image processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Image file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + image_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original image with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", image_bytes) + + # 1.2 Validate and extract image metadata + try: + img = Image.open(file_path) + img.verify() # Verify that it's a valid image + img.close() # Close and reopen to reset after verify() + img = Image.open(file_path) + width, height = img.size + format_str = img.format or ext[1:].upper() + except Exception as e: + raise ValueError(f"Invalid image file: {file_path}. Error: {e}") from e + + # 1.3 Generate VLM description + description = "" + if self.config.enable_vlm: + description = await self._vlm_describe(image_bytes, self.config.vlm_model) + else: + # Fallback: basic description + description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" + + # 1.4 OCR (optional) + ocr_text = None + if self.config.enable_ocr: + ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) + if ocr_text: + await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text) + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "width": width, + "height": height, + "format": format_str.lower(), + "content_type": "image", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + "original_filename": original_filename, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="image", + parser_name="ImageParser", + meta={"content_type": "image", "format": format_str.lower()}, + ) + + async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: + """ + Generate image description using VLM. 
+ + Args: + image_bytes: Image binary data + model: VLM model name + + Returns: + Image description in markdown format + + TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.) + """ + # Fallback implementation - returns basic placeholder + return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API." + + async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: + """ + Extract text from image using OCR. + + Args: + image_bytes: Image binary data + lang: OCR language code + + Returns: + Extracted text in markdown format, or None if no text found + + TODO: Integrate with OCR API (Tesseract, PaddleOCR, etc.) + """ + # Not implemented - return None + return None + + async def _generate_semantic_info( + self, node: ResourceNode, description: str, viking_fs, has_ocr: bool + ): + """ + Phase 2: Generate abstract and overview. + + Args: + node: ResourceNode to update + description: Image description + viking_fs: VikingFS instance + has_ocr: Whether OCR file exists + """ + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- {node.meta['original_filename']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", + ] + + if has_ocr: + overview_parts.append("- ocr.md: OCR text recognition result from the image\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### View Image\n") + overview_parts.append("```python\n") + overview_parts.append("image_bytes = await image_resource.view()\n") + overview_parts.append("# Returns: PNG/JPG format image binary data\n") + overview_parts.append("# Purpose: Display or save the image\n") + overview_parts.append("```\n\n") + + if has_ocr: + overview_parts.append("### Get OCR-recognized Text\n") + overview_parts.append("```python\n") + overview_parts.append("ocr_text = await image_resource.ocr()\n") + overview_parts.append("# Returns: FileContent object or None\n") + overview_parts.append("# Purpose: Extract text information from the image\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Image Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"size = image_resource.get_size() # ({node.meta['width']}, {node.meta['height']})\n" + ) + overview_parts.append(f'format = image_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse image from content string - Not yet implemented. 
+ + Args: + content: Image content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with image content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Image parsing not yet implemented") diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py new file mode 100644 index 00000000..53cccf67 --- /dev/null +++ b/openviking/parse/parsers/media/video.py @@ -0,0 +1,286 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Video parser - Future implementation. + +Planned Features: +1. Key frame extraction at regular intervals +2. Audio track transcription using ASR +3. VLM-based scene description for key frames +4. Video metadata extraction (duration, resolution, codec) +5. Generate structured ResourceNode combining visual and audio + +Example workflow: + 1. Load video file + 2. Extract metadata (duration, resolution, fps) + 3. Extract audio track → transcribe using AudioParser + 4. Extract key frames at specified intervals + 5. For each frame: generate VLM description + 6. Create ResourceNode tree: + - Root: video metadata + - Children: timeline nodes (each with frame + transcript) + 7. Return ParseResult + +Supported formats: MP4, AVI, MOV, MKV, WEBM +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import VideoConfig + + +class VideoParser(BaseParser): + """ + Video parser for video files. + """ + + def __init__(self, config: Optional[VideoConfig] = None, **kwargs): + """ + Initialize VideoParser. + + Args: + config: Video parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or VideoConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported video file extensions.""" + return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse video file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original video to temp_uri/content.{ext} + - Extract key frames + - Extract audio track and transcribe using ASR + + Phase 2: Generate semantic info + - Generate abstract and overview based on descriptions + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Video file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with video content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If video processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Video file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + video_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original video with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", video_bytes) + + # 1.2 Validate video file using magic bytes + # Define magic bytes for supported video formats + video_magic_bytes = { + ".mp4": [b"\x00\x00\x00", b"ftyp"], + ".avi": [b"RIFF"], + ".mov": [b"\x00\x00\x00", b"ftyp"], + ".mkv": [b"\x1a\x45\xdf\xa3"], + ".webm": [b"\x1a\x45\xdf\xa3"], + ".flv": [b"FLV"], + ".wmv": [b"\x30\x26\xb2\x75\x8e\x66\xcf\x11"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = video_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(video_bytes) >= len(magic) and video_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid video file: {file_path}. 
+
+        # Extract video metadata (placeholder)
+        duration = 0
+        width = 0
+        height = 0
+        fps = 0
+        format_str = ext[1:].upper()
+
+        # 1.3 Generate combined description
+        description = ""
+        if self.config.enable_key_frames or self.config.enable_audio_transcription:
+            description = await self._generate_video_description(file_path, self.config)
+        else:
+            # Fallback: basic description
+            description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)"
+
+        # 1.4 Key frames (optional)
+        key_frames_dir = f"{root_dir_uri}/keyframes"
+        has_key_frames = False
+        if self.config.enable_key_frames:
+            await viking_fs.mkdir(key_frames_dir)
+            has_key_frames = True
+
+        # Create ResourceNode
+        root_node = ResourceNode(
+            type=NodeType.ROOT,
+            title=file_path.stem,
+            level=0,
+            detail_file=None,
+            content_path=None,
+            children=[],
+            meta={
+                "duration": duration,
+                "width": width,
+                "height": height,
+                "fps": fps,
+                "format": format_str.lower(),
+                "content_type": "video",
+                "source_title": file_path.stem,
+                "semantic_name": file_path.stem,
+                "original_filename": original_filename,
+            },
+        )
+
+        # Phase 2: Generate semantic info
+        await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames)
+
+        # Phase 3: Build directory structure (handled by TreeBuilder)
+        return ParseResult(
+            root=root_node,
+            source_path=str(file_path),
+            temp_dir_path=temp_uri,
+            source_format="video",
+            parser_name="VideoParser",
+            meta={"content_type": "video", "format": format_str.lower()},
+        )
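End to end, the parser is awaited like any other `BaseParser`; a minimal usage sketch (assumes a configured VikingFS environment; `demo.mp4` is a placeholder file name):

```python
import asyncio


async def main() -> None:
    parser = VideoParser()
    result = await parser.parse("demo.mp4")  # placeholder path
    print(result.source_format)              # "video"
    print(result.root.meta["duration"])      # 0 until metadata probing lands
    print(result.root.meta["abstract"])      # filled in by Phase 2


asyncio.run(main())
```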
+
+    async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str:
+        """
+        Generate video description using key frames and audio transcription.
+
+        Args:
+            file_path: Video file path
+            config: Video parsing configuration
+
+        Returns:
+            Video description in markdown format
+
+        TODO: Integrate with actual video processing libraries
+        """
+        # Fallback implementation - returns a basic placeholder
+        return (
+            "Video description (video processing integration pending)\n\n"
+            "This is a video file; key-frame and transcription support has not "
+            "yet been integrated with external libraries."
+        )
+
+    async def _generate_semantic_info(
+        self, node: ResourceNode, description: str, viking_fs, has_key_frames: bool
+    ):
+        """
+        Phase 2: Generate abstract and overview.
+
+        Args:
+            node: ResourceNode to update
+            description: Video description
+            viking_fs: VikingFS instance
+            has_key_frames: Whether key frames directory exists
+        """
+        # Generate abstract (short summary, < 100 tokens);
+        # slicing already handles input shorter than 200 characters
+        abstract = description[:200]
+
+        # Generate overview (content summary + file list + usage instructions)
+        overview_parts = [
+            "## Content Summary\n",
+            description,
+            "\n\n## Available Files\n",
+            f"- {node.meta['original_filename']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
+        ]
+
+        if has_key_frames:
+            overview_parts.append("- keyframes/: Directory containing extracted key frames\n")
+
+        overview_parts.append("\n## Usage\n")
+        overview_parts.append("### Play Video\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("video_bytes = await video_resource.play()\n")
+        overview_parts.append("# Returns: Video file binary data\n")
+        overview_parts.append("# Purpose: Play or save the video\n")
+        overview_parts.append("```\n\n")
+
+        if has_key_frames:
+            overview_parts.append("### Get Key Frames\n")
+            overview_parts.append("```python\n")
+            overview_parts.append("keyframes = await video_resource.keyframes()\n")
+            overview_parts.append("# Returns: List of key frame resources\n")
+            overview_parts.append("# Purpose: Analyze video scenes\n")
+            overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get Video Metadata\n")
+        overview_parts.append("```python\n")
+        overview_parts.append(
+            f"duration = video_resource.get_duration()  # {node.meta['duration']}s\n"
+        )
+        overview_parts.append(
+            f"resolution = video_resource.get_resolution()  # ({node.meta['width']}, {node.meta['height']})\n"
+        )
+        overview_parts.append(f"fps = video_resource.get_fps()  # {node.meta['fps']}\n")
+        overview_parts.append(f'format = video_resource.get_format()  # "{node.meta["format"]}"\n')
+        overview_parts.append("```\n")
+
+        overview = "".join(overview_parts)
+
+        # Store in node meta
+        node.meta["abstract"] = abstract
+        node.meta["overview"] = overview
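Once key frames and transcription exist, the `_generate_video_description` placeholder above could wire into the vision API this PR extends. A sketch under heavy assumptions: `_extract_key_frames`, `_transcribe_audio`, and the `config.vlm` handle are all hypothetical names, while `get_vision_completion_async(prompt, images, thinking)` is the signature introduced in this PR's VLM layer:

```python
async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str:
    frames = await self._extract_key_frames(file_path, config)  # hypothetical helper
    transcript = await self._transcribe_audio(file_path)        # hypothetical helper
    prompt = (
        "Describe this video using the attached key frames and the audio "
        f"transcript below:\n\n{transcript}"
    )
    # Pass frames as images; thinking stays at the configured default here.
    return await config.vlm.get_vision_completion_async(prompt, frames, thinking=False)
```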
+
+    async def parse_content(
+        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
+    ) -> ParseResult:
+        """
+        Parse video from content string - Not yet implemented.
+
+        Args:
+            content: Video content (base64 or binary string)
+            source_path: Optional source path for metadata
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with video content
+
+        Raises:
+            NotImplementedError: This feature is not yet implemented
+        """
+        raise NotImplementedError("Video parsing from content not yet implemented")
diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index 526b2272..378bcd7d 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -82,12 +82,18 @@ def __init__(self, register_optional: bool = True):
         # Register optional media parsers
         if register_optional:
             try:
-                from openviking.parse.parsers.media import ImageParser
+                from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser
 
                 self.register("image", ImageParser())
                 logger.info("Registered ImageParser for image formats")
+
+                self.register("audio", AudioParser())
+                logger.info("Registered AudioParser for audio formats")
+
+                self.register("video", VideoParser())
+                logger.info("Registered VideoParser for video formats")
             except ImportError as e:
-                logger.debug(f"ImageParser not registered: {e}")
+                logger.debug(f"Media parsers not registered: {e}")
 
     def register(self, name: str, parser: BaseParser) -> None:
         """
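One trade-off of the widened `try` block: if any one of the three imports raises `ImportError`, none of the media parsers register. If per-format degradation is wanted instead, a variant along these lines (same names as in `registry.py`; a sketch, not the merged behavior) would register whatever imports succeed:

```python
media_parsers = [("image", "ImageParser"), ("audio", "AudioParser"), ("video", "VideoParser")]
for name, cls_name in media_parsers:
    try:
        # Resolve each parser class independently so one missing optional
        # dependency does not disable the others.
        module = __import__("openviking.parse.parsers.media", fromlist=[cls_name])
        self.register(name, getattr(module, cls_name)())
        logger.info(f"Registered {cls_name} for {name} formats")
    except (ImportError, AttributeError) as e:
        logger.debug(f"{cls_name} not registered: {e}")
```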
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index 903e822d..0e582dab 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -108,14 +108,50 @@ async def finalize_from_temp(
         doc_dirs = [e for e in entries if e.get("isDir") and e["name"] not in [".", ".."]]
 
         if len(doc_dirs) != 1:
-            raise ValueError(f"Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}")
+            logger.error(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
+            raise ValueError(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
 
-        doc_name = doc_dirs[0]["name"]
+        from openviking_cli.utils.uri import VikingURI
+
+        doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"])
         doc_uri = f"{temp_uri}/{doc_name}"
 
         # 2. Determine base_uri
         if base_uri is None:
-            base_uri = self._get_base_uri(scope)
+            # Check if it's a media file (image/audio/video)
+            media_type = None
+            if source_format:
+                if source_format in ["image", "audio", "video"]:
+                    media_type = source_format
+            elif source_path:
+                from pathlib import Path
+
+                ext = Path(source_path).suffix.lower()
+                image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
+                audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+                # Keep in sync with VideoParser.supported_extensions
+                video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv", ".flv", ".wmv"]
+                if ext in image_exts:
+                    media_type = "image"
+                elif ext in audio_exts:
+                    media_type = "audio"
+                elif ext in video_exts:
+                    media_type = "video"
+
+            if media_type:
+                # Map singular media types to plural directory names
+                media_dir_map = {"image": "images", "audio": "audio", "video": "video"}
+                media_dir = media_dir_map.get(media_type, media_type)
+                # Get current date in YYYYMMDD format
+                from datetime import datetime
+
+                date_str = datetime.now().strftime("%Y%m%d")
+                base_uri = f"viking://resources/{media_dir}/{date_str}"
+            else:
+                base_uri = self._get_base_uri(scope)
 
         logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}")
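The routing above condenses to a small pure function; a sketch for quick reference (extension sets copied from this hunk; the date in the comment is only an example):

```python
from datetime import datetime
from pathlib import Path
from typing import Optional

MEDIA_DIRS = {"image": "images", "audio": "audio", "video": "video"}
EXT_TO_TYPE = {
    **{e: "image" for e in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg")},
    **{e: "audio" for e in (".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus")},
    **{e: "video" for e in (".mp4", ".mov", ".avi", ".webm", ".mkv", ".flv", ".wmv")},
}


def media_base_uri(source_path: str) -> Optional[str]:
    media_type = EXT_TO_TYPE.get(Path(source_path).suffix.lower())
    if media_type is None:
        return None  # non-media: caller falls back to _get_base_uri(scope)
    return f"viking://resources/{MEDIA_DIRS[media_type]}/{datetime.now():%Y%m%d}"


# media_base_uri("clip.mp4") -> e.g. "viking://resources/video/20260214"
```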
diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py
index 92940dd7..a9bfc28f 100644
--- a/openviking_cli/cli/commands/resources.py
+++ b/openviking_cli/cli/commands/resources.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Resource management commands."""
 
+from pathlib import Path
 from typing import Optional
 
 import typer
@@ -23,10 +24,62 @@ def add_resource_command(
     timeout: Optional[float] = typer.Option(600.0, help="Wait timeout in seconds"),
 ) -> None:
     """Add resources into OpenViking."""
+    # Validate path: if it's a local path, check if it exists
+    final_path = path
+    if not (path.startswith("http://") or path.startswith("https://")):
+        unescaped_path = path.replace("\\ ", " ")
+        local_path = Path(unescaped_path)
+        final_path = unescaped_path
+        if not local_path.exists():
+            # Check if there are extra arguments (possible unquoted path with spaces)
+            import sys
+
+            # Find the index of 'add-resource' in sys.argv
+            try:
+                add_resource_idx = sys.argv.index("add-resource")
+            except ValueError:
+                add_resource_idx = sys.argv.index("add") if "add" in sys.argv else -1
+
+            if add_resource_idx != -1 and len(sys.argv) > add_resource_idx + 2:
+                # There are extra positional arguments - likely an unquoted path with spaces
+                extra_args = sys.argv[add_resource_idx + 2 :]
+                suggested_path = f"{path} {' '.join(extra_args)}"
+                typer.echo(
+                    typer.style(
+                        f"Error: Path '{path}' does not exist.",
+                        fg=typer.colors.RED,
+                        bold=True,
+                    ),
+                    err=True,
+                )
+                typer.echo(
+                    typer.style(
+                        "\nIt looks like you may have forgotten to quote a path with spaces.",
+                        fg=typer.colors.YELLOW,
+                    ),
+                    err=True,
+                )
+                typer.echo(
+                    typer.style(
+                        f'Suggested command: ov add-resource "{suggested_path}"',
+                        fg=typer.colors.CYAN,
+                    ),
+                    err=True,
+                )
+                raise typer.Exit(code=1)
+            else:
+                typer.echo(
+                    typer.style(
+                        f"Error: Path '{path}' does not exist.", fg=typer.colors.RED, bold=True
+                    ),
+                    err=True,
+                )
+                raise typer.Exit(code=1)
+
     run(
         ctx,
         lambda client: client.add_resource(
-            path=path,
+            path=final_path,
             target=to,
             reason=reason,
             instruction=instruction,
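The suggestion path reconstructs the intended filename from the leftover positional arguments; the core of it, runnable in isolation (the argv values are illustrative):

```python
argv = ["ov", "add-resource", "/tmp/My", "Report.pdf"]  # simulated sys.argv

idx = argv.index("add-resource")
path, extra = argv[idx + 1], argv[idx + 2 :]
if extra:
    # Rejoin the shell-split pieces into one quoted path suggestion.
    suggested = f"{path} {' '.join(extra)}"
    print(f'Suggested command: ov add-resource "{suggested}"')
# -> Suggested command: ov add-resource "/tmp/My Report.pdf"
```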
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py
index 39526602..a5cb6903 100644
--- a/openviking_cli/client/http.py
+++ b/openviking_cli/client/http.py
@@ -36,6 +36,7 @@
     load_json_config,
     resolve_config_path,
 )
+from openviking_cli.utils.uri import VikingURI
 
 # Error code to exception class mapping
 ERROR_CODE_TO_EXCEPTION = {
@@ -281,6 +282,7 @@ async def ls(
         node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/ls",
             params={
@@ -304,6 +306,7 @@ async def tree(
         node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/tree",
             params={
@@ -318,6 +321,7 @@ async def tree(
 
     async def stat(self, uri: str) -> Dict[str, Any]:
         """Get resource status."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/stat",
             params={"uri": uri},
@@ -326,6 +330,7 @@ async def stat(self, uri: str) -> Dict[str, Any]:
 
     async def mkdir(self, uri: str) -> None:
         """Create directory."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/fs/mkdir",
             json={"uri": uri},
@@ -334,6 +339,7 @@ async def mkdir(self, uri: str) -> None:
 
     async def rm(self, uri: str, recursive: bool = False) -> None:
         """Remove resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/fs",
@@ -343,6 +349,8 @@ async def rm(self, uri: str, recursive: bool = False) -> None:
 
     async def mv(self, from_uri: str, to_uri: str) -> None:
         """Move resource."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.post(
             "/api/v1/fs/mv",
             json={"from_uri": from_uri, "to_uri": to_uri},
@@ -353,6 +361,7 @@ async def mv(self, from_uri: str, to_uri: str) -> None:
 
     async def read(self, uri: str) -> str:
         """Read file content."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/read",
             params={"uri": uri},
@@ -361,6 +370,7 @@ async def read(self, uri: str) -> str:
 
     async def abstract(self, uri: str) -> str:
         """Read L0 abstract."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/abstract",
             params={"uri": uri},
@@ -369,6 +379,7 @@ async def abstract(self, uri: str) -> str:
 
     async def overview(self, uri: str) -> str:
         """Read L1 overview."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/overview",
             params={"uri": uri},
@@ -386,6 +397,8 @@ async def find(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search without session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         response = await self._http.post(
             "/api/v1/search/find",
             json={
@@ -409,6 +422,8 @@ async def search(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search with optional session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         sid = session_id or (session.session_id if session else None)
         response = await self._http.post(
             "/api/v1/search/search",
@@ -425,6 +440,7 @@ async def search(
 
     async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) -> Dict[str, Any]:
         """Content search with pattern."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/grep",
             json={
@@ -437,6 +453,7 @@ async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) ->
 
     async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
         """File pattern matching."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/glob",
             json={"pattern": pattern, "uri": uri},
@@ -447,6 +464,7 @@ async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
 
     async def relations(self, uri: str) -> List[Any]:
         """Get relations for a resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/relations",
             params={"uri": uri},
@@ -455,6 +473,11 @@ async def relations(self, uri: str) -> List[Any]:
 
     async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str = "") -> None:
         """Create link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        if isinstance(to_uris, str):
+            to_uris = VikingURI.normalize(to_uris)
+        else:
+            to_uris = [VikingURI.normalize(u) for u in to_uris]
         response = await self._http.post(
             "/api/v1/relations/link",
             json={"from_uri": from_uri, "to_uris": to_uris, "reason": reason},
@@ -463,6 +486,8 @@ async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str
 
     async def unlink(self, from_uri: str, to_uri: str) -> None:
         """Remove link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/relations/link",
@@ -512,6 +537,7 @@ async def add_message(self, session_id: str, role: str, content: str) -> Dict[st
 
     async def export_ovpack(self, uri: str, to: str) -> str:
         """Export context as .ovpack file."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/pack/export",
             json={"uri": uri, "to": to},
@@ -527,6 +553,7 @@ async def import_ovpack(
         vectorize: bool = True,
     ) -> str:
         """Import .ovpack file."""
+        parent = VikingURI.normalize(parent)
         response = await self._http.post(
             "/api/v1/pack/import",
             json={
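For callers, the practical effect is that bare and slash-prefixed paths now behave like fully schemed URIs. A sketch, inside an async context and assuming a connected client constructed as elsewhere in this module:

```python
# All three hit the same resource after client-side normalization:
await client.ls("viking://resources/images")
await client.ls("/resources/images")
await client.ls("resources/images")

# link() accepts a single target or a list; every URI is normalized first:
await client.link("notes/a", "/notes/b", reason="related")
await client.link("notes/a", ["notes/b", "viking://notes/c"], reason="related")
```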
diff --git a/openviking_cli/utils/config/vlm_config.py b/openviking_cli/utils/config/vlm_config.py
index ad1bea8f..411c7d76 100644
--- a/openviking_cli/utils/config/vlm_config.py
+++ b/openviking_cli/utils/config/vlm_config.py
@@ -15,17 +15,16 @@ class VLMConfig(BaseModel):
     max_retries: int = Field(default=2, description="Maximum retry attempts")
 
     provider: Optional[str] = Field(default=None, description="Provider type")
-    backend: Optional[str] = Field(default=None, description="Backend provider (Deprecated, use 'provider' instead)")
+    backend: Optional[str] = Field(
+        default=None, description="Backend provider (Deprecated, use 'provider' instead)"
+    )
 
     providers: Dict[str, Dict[str, Any]] = Field(
         default_factory=dict,
-        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}"
+        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}",
     )
-    default_provider: Optional[str] = Field(
-        default=None,
-        description="Default provider name"
-    )
+    default_provider: Optional[str] = Field(default=None, description="Default provider name")
 
     thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models")
 
@@ -141,6 +140,7 @@ def get_vlm_instance(self) -> Any:
         if self._vlm_instance is None:
             config_dict = self._build_vlm_config_dict()
             from openviking.models.vlm import VLMFactory
+
             self._vlm_instance = VLMFactory.create(config_dict)
         return self._vlm_instance
 
@@ -166,13 +166,15 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]:
 
         return result
 
-    def get_completion(self, prompt: str) -> str:
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
         """Get LLM completion."""
-        return self.get_vlm_instance().get_completion(prompt)
+        return self.get_vlm_instance().get_completion(prompt, thinking)
 
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
         """Get LLM completion asynchronously, max_retries=0 means no retry."""
-        return await self.get_vlm_instance().get_completion_async(prompt, max_retries)
+        return await self.get_vlm_instance().get_completion_async(prompt, thinking, max_retries)
 
     def is_available(self) -> bool:
         """Check if LLM is configured."""
@@ -182,14 +184,16 @@ def get_vision_completion(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images."""
-        return self.get_vlm_instance().get_vision_completion(prompt, images)
+        return self.get_vlm_instance().get_vision_completion(prompt, images, thinking)
 
     async def get_vision_completion_async(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images asynchronously."""
-        return await self.get_vlm_instance().get_vision_completion_async(prompt, images)
+        return await self.get_vlm_instance().get_vision_completion_async(prompt, images, thinking)
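With these wrappers, `thinking` defaults to the configured value (`false` in the example configs above) but can be raised per call. A usage sketch, assuming `cfg` is a populated `VLMConfig` and the async call runs inside an async context:

```python
# One-off completion with thinking enabled, overriding the config default:
answer = cfg.get_completion("Summarize the key frames.", thinking=True)

# Async variant with retries; thinking is passed through to the backend:
answer = await cfg.get_completion_async(
    "Summarize the transcript.", thinking=True, max_retries=2
)
```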
diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py
index 9cf6d856..6f50a3a9 100644
--- a/openviking_cli/utils/uri.py
+++ b/openviking_cli/utils/uri.py
@@ -201,7 +201,7 @@ def build_semantic_uri(
         Build a semantic URI based on parent URI.
         """
         # Sanitize semantic name for URI
-        safe_name = VikingURI._sanitize_segment(semantic_name)
+        safe_name = VikingURI.sanitize_segment(semantic_name)
 
         if not is_leaf:
             return f"{parent_uri}/{safe_name}"
@@ -211,11 +211,12 @@ def build_semantic_uri(
         return f"{parent_uri}/{safe_name}/{node_id}"
 
     @staticmethod
-    def _sanitize_segment(text: str) -> str:
+    def sanitize_segment(text: str) -> str:
         """
         Sanitize text for use in URI segment.
 
-        Preserves Chinese characters but replaces special characters.
+        Preserves CJK characters (Chinese, Japanese, Korean) and other common scripts
+        while replacing special characters.
 
         Args:
             text: Original text
@@ -223,8 +224,18 @@ def _sanitize_segment(text: str) -> str:
         Returns:
             URI-safe string
         """
-        # Preserve Chinese characters, letters, numbers, underscores, hyphens
-        safe = re.sub(r"[^\w\u4e00-\u9fff\-]", "_", text)
+        # Preserve:
+        # - Letters, numbers, underscores, hyphens (\w includes [a-zA-Z0-9_])
+        # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja)
+        # - Hiragana and Katakana (Japanese)
+        # - Hangul Syllables (Korean)
+        # - CJK Unified Ideographs Extension A
+        # - CJK Unified Ideographs Extension B
+        safe = re.sub(
+            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]",
+            "_",
+            text,
+        )
         # Merge consecutive underscores
         safe = re.sub(r"_+", "_", safe)
         # Strip leading/trailing underscores and limit length
@@ -245,6 +256,33 @@ def __eq__(self, other) -> bool:
 
     def __hash__(self) -> int:
         return hash(self.uri)
 
+    @staticmethod
+    def normalize(uri: str) -> str:
+        """
+        Normalize URI by ensuring it has the viking:// scheme.
+
+        If the input already starts with viking://, it is returned as-is.
+        Otherwise, leading slashes are stripped first (so "/resources" does not
+        become the invalid "viking:///resources") and viking:// is prepended.
+
+        Examples:
+            "/resources/images" -> "viking://resources/images"
+            "resources/images" -> "viking://resources/images"
+            "viking://resources/images" -> "viking://resources/images"
+
+        Args:
+            uri: Input URI string
+
+        Returns:
+            Normalized URI with viking:// scheme
+        """
+        if uri.startswith(f"{VikingURI.SCHEME}://"):
+            return uri
+        # Strip leading slashes
+        uri = uri.lstrip("/")
+        return f"{VikingURI.SCHEME}://{uri}"
+
     @classmethod
     def create_temp_uri(cls) -> str:
         """Create temp directory URI like viking://temp/MMDDHHMM_XXXXXX"""