diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 66430907..56586218 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -460,7 +460,7 @@ async fn main() { } async fn handle_add_resource( - path: String, + mut path: String, to: Option<String>, reason: String, instruction: String, @@ -468,6 +468,34 @@ async fn handle_add_resource( timeout: Option<u64>, ctx: CliContext, ) -> Result<()> { + // Validate path: if it's a local path, check if it exists + if !path.starts_with("http://") && !path.starts_with("https://") { + use std::path::Path; + + // Unescape path: replace backslash followed by space with just space + let unescaped_path = path.replace("\\ ", " "); + let path_obj = Path::new(&unescaped_path); + if !path_obj.exists() { + eprintln!("Error: Path '{}' does not exist.", path); + + // Check if there might be unquoted spaces + use std::env; + let args: Vec<String> = env::args().collect(); + + if let Some(add_resource_pos) = args.iter().position(|s| s == "add-resource" || s == "add") { + if args.len() > add_resource_pos + 2 { + let extra_args = &args[add_resource_pos + 2..]; + let suggested_path = format!("{} {}", path, extra_args.join(" ")); + eprintln!("\nIt looks like you may have forgotten to quote a path with spaces."); + eprintln!("Suggested command: ov add-resource \"{}\"", suggested_path); + } + } + + std::process::exit(1); + } + path = unescaped_path; + } + let client = ctx.get_client(); commands::resources::add_resource( &client, &path, to, reason, instruction, wait, timeout, ctx.output_format, ctx.compact diff --git a/examples/chatmem/ov.conf.example b/examples/chatmem/ov.conf.example index 2e9a40ae..6a085e5d 100644 --- a/examples/chatmem/ov.conf.example +++ b/examples/chatmem/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/mcp-query/ov.conf.example b/examples/mcp-query/ov.conf.example index bf4a45de..fc40ea92 100644 --- a/examples/mcp-query/ov.conf.example +++ b/examples/mcp-query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "", "provider" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/memex/ov.conf.example b/examples/memex/ov.conf.example index d5187cd3..42697b72 100644 --- a/examples/memex/ov.conf.example +++ b/examples/memex/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark.cn-beijing.volces.com/api/v3", "api_key" : "your-volcengine-api-key", "backend" : "volcengine", - "model" : "doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 205cd7d9..34cbc6a4 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -45,7 +45,8 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false }, "rerank": { "ak": null, diff --git a/examples/query/ov.conf.example b/examples/query/ov.conf.example index 58d034c0..fdc7cb55 100644 --- a/examples/query/ov.conf.example +++ b/examples/query/ov.conf.example @@ -12,6 +12,7 @@ "api_base" : "https://ark-cn-beijing.bytedance.net/api/v3", "api_key" : "not_gonna_give_u_this", "provider" : "volcengine", - "model" : 
"doubao-seed-1-8-251228" + "model" : "doubao-seed-1-8-251228", + "thinking": false } } diff --git a/examples/server_client/ov.conf.example b/examples/server_client/ov.conf.example index 582d79b8..13eb55db 100644 --- a/examples/server_client/ov.conf.example +++ b/examples/server_client/ov.conf.example @@ -34,6 +34,7 @@ "api_base": "https://ark.cn-beijing.volces.com/api/v3", "temperature": 0.0, "max_retries": 2, - "provider": "volcengine" + "provider": "volcengine", + "thinking": false } } diff --git a/openviking/models/vlm/backends/litellm_vlm.py b/openviking/models/vlm/backends/litellm_vlm.py index f1efa562..2373e5dd 100644 --- a/openviking/models/vlm/backends/litellm_vlm.py +++ b/openviking/models/vlm/backends/litellm_vlm.py @@ -139,21 +139,31 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]: return kwargs - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = completion(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") messages = [{"role": "user", "content": prompt}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking last_error = None for attempt in range(max_retries + 1): @@ -164,7 +174,7 @@ async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: except Exception as e: last_error = e if attempt < max_retries: - await asyncio.sleep(2 ** attempt) + await asyncio.sleep(2**attempt) if last_error: raise last_error @@ -174,6 +184,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion synchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -184,7 +195,11 @@ def get_vision_completion( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = completion(**kwargs) self._update_token_usage_from_response(response) @@ -194,6 +209,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously.""" model = self._resolve_model(self.model or "gpt-4o-mini") @@ -204,7 +220,11 @@ async def get_vision_completion_async( content.append({"type": "text", "text": prompt}) messages = [{"role": "user", "content": content}] + original_thinking = self._thinking + if thinking: + self._thinking = thinking kwargs = self._build_kwargs(model, messages) + self._thinking = original_thinking response = await acompletion(**kwargs) self._update_token_usage_from_response(response) diff --git 
a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index c6c5b230..18a22ff7 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -131,6 +131,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" client = self.get_client() @@ -140,11 +141,16 @@ def get_vision_completion( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" @@ -152,6 +158,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" client = self.get_async_client() @@ -161,10 +168,15 @@ async def get_vision_completion_async( content.append(self._prepare_image(img)) content.append({"type": "text", "text": prompt}) - response = await client.chat.completions.create( - model=self.model or "gpt-4o-mini", - messages=[{"role": "user", "content": content}], - temperature=self.temperature, - ) + kwargs = { + "model": self.model or "gpt-4o-mini", + "messages": [{"role": "user", "content": content}], + "temperature": self.temperature, + } + + if self.provider == "volcengine": + kwargs["thinking"] = {"type": "disabled" if not thinking else "enabled"} + + response = await client.chat.completions.create(**kwargs) self._update_token_usage_from_response(response) return response.choices[0].message.content or "" diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py index f11a289e..b5841cc8 100644 --- a/openviking/models/vlm/backends/volcengine_vlm.py +++ b/openviking/models/vlm/backends/volcengine_vlm.py @@ -54,22 +54,26 @@ def get_async_client(self): ) return self._async_client - def get_completion(self, prompt: str) -> str: - return super().get_completion(prompt) + def get_completion(self, prompt: str, thinking: bool = False) -> str: + return super().get_completion(prompt, thinking) - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: - return await super().get_completion_async(prompt, max_retries) + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: + return await super().get_completion_async(prompt, thinking, max_retries) def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return super().get_vision_completion(prompt, images) + return super().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: - return await super().get_vision_completion_async(prompt, images) + return await super().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking/models/vlm/base.py 
b/openviking/models/vlm/base.py index ef55f712..cd563f9c 100644 --- a/openviking/models/vlm/base.py +++ b/openviking/models/vlm/base.py @@ -27,12 +27,14 @@ def __init__(self, config: Dict[str, Any]): self._token_tracker = TokenUsageTracker() @abstractmethod - def get_completion(self, prompt: str) -> str: + def get_completion(self, prompt: str, thinking: bool = False) -> str: """Get text completion""" pass @abstractmethod - async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str: + async def get_completion_async( + self, prompt: str, thinking: bool = False, max_retries: int = 0 + ) -> str: """Get text completion asynchronously""" pass @@ -41,6 +43,7 @@ def get_vision_completion( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion""" pass @@ -50,6 +53,7 @@ async def get_vision_completion_async( self, prompt: str, images: List[Union[str, Path, bytes]], + thinking: bool = False, ) -> str: """Get vision completion asynchronously""" pass @@ -128,16 +132,20 @@ def create(config: Dict[str, Any]) -> VLMBase: if not use_litellm: if provider == "openai": from .backends.openai_vlm import OpenAIVLM + return OpenAIVLM(config) elif provider == "volcengine": from .backends.volcengine_vlm import VolcEngineVLM + return VolcEngineVLM(config) from .backends.litellm_vlm import LiteLLMVLMProvider + return LiteLLMVLMProvider(config) @staticmethod def get_available_providers() -> List[str]: """Get list of available providers""" from .registry import get_all_provider_names + return get_all_provider_names() diff --git a/openviking/models/vlm/llm.py b/openviking/models/vlm/llm.py index 07c52179..6c8b9c56 100644 --- a/openviking/models/vlm/llm.py +++ b/openviking/models/vlm/llm.py @@ -168,34 +168,38 @@ def complete_json( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, ) -> Optional[Dict[str, Any]]: """Get JSON completion from VLM.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = self._get_vlm().get_completion(prompt) + response = self._get_vlm().get_completion(prompt, thinking) return parse_json_from_response(response) async def complete_json_async( self, prompt: str, schema: Optional[Dict[str, Any]] = None, + thinking: bool = False, + max_retries: int = 0, ) -> Optional[Dict[str, Any]]: """Async version of complete_json.""" if schema: prompt = f"{prompt}\n\n{get_json_schema_prompt(schema)}" - response = await self._get_vlm().get_completion_async(prompt) + response = await self._get_vlm().get_completion_async(prompt, thinking, max_retries) return parse_json_from_response(response) def complete_model( self, prompt: str, model_class: Type[T], + thinking: bool = False, ) -> Optional[T]: """Get structured completion validated against a Pydantic model.""" schema = model_class.model_json_schema() - response = self.complete_json(prompt, schema=schema) + response = self.complete_json(prompt, schema=schema, thinking=thinking) if response is None: return None @@ -209,10 +213,14 @@ async def complete_model_async( self, prompt: str, model_class: Type[T], + thinking: bool = False, + max_retries: int = 0, ) -> Optional[T]: """Async version of complete_model.""" schema = model_class.model_json_schema() - response = await self.complete_json_async(prompt, schema=schema) + response = await self.complete_json_async( + prompt, schema=schema, thinking=thinking, max_retries=max_retries + ) if response is None: return None @@ -226,14 +234,16 @@ def get_vision_completion( self, 
prompt: str, images: list, + thinking: bool = False, ) -> str: """Get vision completion.""" - return self._get_vlm().get_vision_completion(prompt, images) + return self._get_vlm().get_vision_completion(prompt, images, thinking) async def get_vision_completion_async( self, prompt: str, images: list, + thinking: bool = False, ) -> str: """Async vision completion.""" - return await self._get_vlm().get_vision_completion_async(prompt, images) + return await self._get_vlm().get_vision_completion_async(prompt, images, thinking) diff --git a/openviking/parse/parsers/README.md b/openviking/parse/parsers/README.md index b8bcefe0..94fcc23c 100644 --- a/openviking/parse/parsers/README.md +++ b/openviking/parse/parsers/README.md @@ -144,10 +144,66 @@ L1: """ Code parser supporting syntax highlighting and code-structure analysis; it can recognize functions, classes, methods, and other code elements. ### 6. MediaParser (`media.py`) -**Supported formats**: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.mp4`, `.mov`, `.avi`, `.webm`, `.mp3`, `.wav`, `.m4a`, `.flac` + +**Supported formats**: +- Images: `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp` +- Video: `.mp4`, `.mov`, `.avi`, `.webm` +- Audio: `.mp3`, `.wav`, `.m4a`, `.flac` A multimedia parser that uses a VLM (vision language model) to analyze image, video, and audio content and generate text descriptions. +#### Storage Organization Strategy + +Multimedia files are stored using the following hierarchy: + +``` +viking://resource/ +├── images/ # image files +│ └── 20240820/ # upload date (YYYYMMDD) +│ └── 20240820_123456_jpg/ # file folder (filename_extension) +│ ├── .abstract.md # L0 abstract +│ ├── .overview.md # L1 overview +│ └── 20240820_123456.jpg # original file +├── audio/ # audio files +│ └── 20240820/ +│ └── my_song_mp3/ +│ ├── .abstract.md +│ ├── .overview.md +│ └── my_song.mp3 +└── video/ # video files + └── 20240820/ + └── my_video_mp4/ + ├── .abstract.md + ├── .overview.md + └── my_video.mp4 +``` + +Details: + +1. **Media subdirectories**: `viking://resource` is divided into three type-specific subdirectories + - `viking://resource/images`: stores image files with no explicitly specified target path + - `viking://resource/audio`: stores audio files with no explicitly specified target path + - `viking://resource/video`: stores video files with no explicitly specified target path + +2. **Date subdirectory**: each upload is organized by the current upload date (format: YYYYMMDD), not by timestamps from the file's internal metadata + - Example: `viking://resource/images/20240820/` holds all images uploaded on 2024-08-20 + +3. **File folder**: each media file gets a dedicated folder, named `filename_extension` (extension without the dot) + - Example: uploading `20240820_123456.jpg` → creates folder `20240820_123456_jpg` + +4. 
**Folder contents**: each file folder contains: + - **Original file**: keeps the original filename, with space characters replaced by underscores `_` (because OpenViking URIs may not contain spaces) + - Example: `photo 1.jpg` → saved as `photo_1.jpg` + - `.abstract.md` (L0 layer): abstract info (<200 tokens) + - Images: filename, content description, visual style, etc. + - Audio: filename, duration, content summary, etc. + - Video: filename, duration, content summary, etc. + - `.overview.md` (L1 layer): overview info + - Images: beyond the abstract, also dimensions, OCR results, scene and subject descriptions, etc. + - Audio: beyond the abstract, also speech/lyrics transcription, chapter timeline, etc. + - Video: beyond the abstract, also usage scenarios, etc. (splitting videos and recursively storing sub-files will be supported in the future) + + ## Core Components ### BaseParser (`base_parser.py`) diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 28e47885..85fd0c0a 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -601,6 +601,10 @@ async def parse_content( def _sanitize_for_path(self, text: str) -> str: """Sanitize text for use in file path.""" - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index e6ddbe3a..1570baf3 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -334,7 +334,11 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]: return parts if parts else [content] def _sanitize_for_path(self, text: str) -> str: - safe = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", text) + safe = re.sub( + r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + "", + text, + ) safe = re.sub(r"\s+", "_", safe) return safe.strip("_")[:50] or "section" diff --git a/openviking/parse/parsers/media.py b/openviking/parse/parsers/media.py deleted file mode 100644 index cb25a803..00000000 --- a/openviking/parse/parsers/media.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Media parser interfaces for OpenViking - Future expansion. - -This module defines parser interfaces for media types (image, audio, video). -These are placeholder implementations that raise NotImplementedError. -They serve as a design reference for future media parsing capabilities. - -For current document parsing (PDF, Markdown, HTML, Text), see other parser modules. -""" - -from pathlib import Path -from typing import List, Optional, Union - -from PIL import Image - -from openviking.parse.base import NodeType, ParseResult, ResourceNode -from openviking.parse.parsers.base_parser import BaseParser -from openviking_cli.utils.config.parser_config import AudioConfig, ImageConfig, VideoConfig - -# ============================================================================= -# Configuration Classes -# ============================================================================= - - -# ============================================================================= -# Parser Classes -# ============================================================================= - - -class ImageParser(BaseParser): - """ - Image parser - Future implementation. - - Planned Features: - 1. Visual content understanding using VLM (Vision Language Model) - 2. OCR text extraction for images containing text - 3. Metadata extraction (dimensions, format, EXIF data) - 4. Generate semantic description and structured ResourceNode - - Example workflow: - 1. Load image file - 2. (Optional) Perform OCR to extract text - 3. (Optional) Use VLM to generate visual description - 4. 
Create ResourceNode with image metadata and descriptions - 5. Return ParseResult - - Supported formats: PNG, JPG, JPEG, GIF, BMP, WEBP, SVG - """ - - def __init__(self, config: Optional[ImageConfig] = None, **kwargs): - """ - Initialize ImageParser. - - Args: - config: Image parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or ImageConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported image file extensions.""" - return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse image file using three-phase architecture. - - Phase 1: Generate temporary files - - Copy original image to temp_uri/content.{ext} - - Generate description.md using VLM - - (Optional) Generate ocr.md using OCR - - Phase 2: Generate semantic info - - Generate abstract and overview based on description.md - - Overview includes file list and usage instructions - - Phase 3: Build directory structure - - Move all files to final URI - - Generate .abstract.md, .overview.md - - Args: - source: Image file path - **kwargs: Additional parsing parameters - - Returns: - ParseResult with image content - - Raises: - FileNotFoundError: If source file does not exist - IOError: If image processing fails - """ - from openviking.storage.viking_fs import get_viking_fs - - # Convert to Path object - file_path = Path(source) if isinstance(source, str) else source - if not file_path.exists(): - raise FileNotFoundError(f"Image file not found: {source}") - - viking_fs = get_viking_fs() - temp_uri = viking_fs.create_temp_uri() - - # Phase 1: Generate temporary files - image_bytes = file_path.read_bytes() - ext = file_path.suffix - - # 1.1 Save original image - await viking_fs.write_file_bytes(f"{temp_uri}/content{ext}", image_bytes) - - # 1.2 Extract image metadata - try: - img = Image.open(file_path) - width, height = img.size - format_str = img.format or ext[1:].upper() - except Exception: - width, height = 0, 0 - format_str = ext[1:].upper() - - # 1.3 Generate VLM description - description = "" - if self.config.enable_vlm: - description = await self._vlm_describe(image_bytes, self.config.vlm_model) - else: - # Fallback: basic description - description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" - - await viking_fs.write_file(f"{temp_uri}/description.md", description) - - # 1.4 OCR (optional) - ocr_text = None - if self.config.enable_ocr: - ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) - if ocr_text: - await viking_fs.write_file(f"{temp_uri}/ocr.md", ocr_text) - - # Create ResourceNode - root_node = ResourceNode( - type=NodeType.ROOT, - title=file_path.stem, - level=0, - detail_file=None, - content_path=None, - children=[], - meta={ - "width": width, - "height": height, - "format": format_str.lower(), - "content_type": "image", - "source_title": file_path.stem, - "semantic_name": file_path.stem, - }, - ) - - # Phase 2: Generate semantic info - await self._generate_semantic_info(root_node, temp_uri, viking_fs, ocr_text is not None) - - # Phase 3: Build directory structure (handled by TreeBuilder) - return ParseResult( - root=root_node, - source_path=str(file_path), - temp_dir_path=temp_uri, - source_format="image", - parser_name="ImageParser", - meta={"content_type": "image", "format": format_str.lower()}, - ) - - async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: - """ 
- Generate image description using VLM. - - Args: - image_bytes: Image binary data - model: VLM model name - - Returns: - Image description in markdown format - - TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.) - """ - # Fallback implementation - returns basic placeholder - return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API." - - async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: - """ - Extract text from image using OCR. - - Args: - image_bytes: Image binary data - lang: OCR language code - - Returns: - Extracted text in markdown format, or None if no text found - - TODO: Integrate with OCR API (Tesseract, PaddleOCR, etc.) - """ - # Not implemented - return None - return None - - async def _generate_semantic_info( - self, node: ResourceNode, temp_uri: str, viking_fs, has_ocr: bool - ): - """ - Phase 2: Generate abstract and overview. - - Args: - node: ResourceNode to update - temp_uri: Temporary URI - viking_fs: VikingFS instance - has_ocr: Whether OCR file exists - """ - # Read description.md - description = await viking_fs.read_file(f"{temp_uri}/description.md") - - # Generate abstract (short summary, < 100 tokens) - abstract = description[:200] if len(description) > 200 else description - - # Generate overview (content summary + file list + usage instructions) - overview_parts = [ - "## Content Summary\n", - description, - "\n\n## Available Files\n", - f"- content.{node.meta['format']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", - "- description.md: Detailed image description generated by VLM\n", - ] - - if has_ocr: - overview_parts.append("- ocr.md: OCR text recognition result from the image\n") - - overview_parts.append("\n## Usage\n") - overview_parts.append("### View Image\n") - overview_parts.append("```python\n") - overview_parts.append("image_bytes = await image_resource.view()\n") - overview_parts.append("# Returns: PNG/JPG format image binary data\n") - overview_parts.append("# Purpose: Display or save the image\n") - overview_parts.append("```\n\n") - - overview_parts.append("### Get VLM-generated Image Description\n") - overview_parts.append("```python\n") - overview_parts.append("description = await image_resource.description()\n") - overview_parts.append("# Returns: FileContent object for further processing\n") - overview_parts.append("# Purpose: Understand image content\n") - overview_parts.append("```\n\n") - - if has_ocr: - overview_parts.append("### Get OCR-recognized Text\n") - overview_parts.append("```python\n") - overview_parts.append("ocr_text = await image_resource.ocr()\n") - overview_parts.append("# Returns: FileContent object or None\n") - overview_parts.append("# Purpose: Extract text information from the image\n") - overview_parts.append("```\n\n") - - overview_parts.append("### Get Image Metadata\n") - overview_parts.append("```python\n") - overview_parts.append( - f"size = image_resource.get_size() # ({node.meta['width']}, {node.meta['height']})\n" - ) - overview_parts.append(f'format = image_resource.get_format() # "{node.meta["format"]}"\n') - overview_parts.append("```\n") - - overview = "".join(overview_parts) - - # Store in node meta - node.meta["abstract"] = abstract - node.meta["overview"] = overview - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse image 
from content string - Not yet implemented. - - Args: - content: Image content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with image content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Image parsing not yet implemented") - - -class AudioParser(BaseParser): - """ - Audio parser - Future implementation. - - Planned Features: - 1. Speech-to-text transcription using ASR models - 2. Audio metadata extraction (duration, sample rate, channels) - 3. Speaker diarization (identify different speakers) - 4. Timestamp alignment for transcribed text - 5. Generate structured ResourceNode with transcript - - Example workflow: - 1. Load audio file - 2. Extract metadata (duration, format, sample rate) - 3. Transcribe speech to text using Whisper or similar - 4. (Optional) Perform speaker diarization - 5. Create ResourceNode with: - - type: NodeType.ROOT - - children: sections for each speaker/timestamp - - meta: audio metadata and timestamps - 6. Return ParseResult - - Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A - """ - - def __init__(self, config: Optional[AudioConfig] = None, **kwargs): - """ - Initialize AudioParser. - - Args: - config: Audio parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or AudioConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported audio file extensions.""" - return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse audio file - Not yet implemented. - - Planned implementation: - 1. Load audio file - 2. Extract metadata using librosa or similar - 3. If enable_transcription: - - Transcribe using Whisper or similar ASR model - - Generate timestamps for each segment - - (Optional) Perform speaker diarization - 4. Create ResourceNode tree: - - Root node with audio metadata - - Child nodes for each transcribed segment - 5. Return ParseResult - - Args: - source: Audio file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Audio parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Speech-to-text transcription (Whisper)" - "\n- Speaker diarization" - "\n- Timestamp alignment" - "\n- Audio metadata extraction" - "\n\nWorkaround: Extract audio manually and add transcripts as " - "text or markdown files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse audio from content string - Not yet implemented. - - Args: - content: Audio content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with transcribed content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Audio parsing not yet implemented") - - -class VideoParser(BaseParser): - """ - Video parser - Future implementation. - - Planned Features: - 1. Key frame extraction at regular intervals - 2. Audio track transcription using ASR - 3. VLM-based scene description for key frames - 4. 
Video metadata extraction (duration, resolution, codec) - 5. Generate structured ResourceNode combining visual and audio - - Example workflow: - 1. Load video file - 2. Extract metadata (duration, resolution, fps) - 3. Extract audio track → transcribe using AudioParser - 4. Extract key frames at specified intervals - 5. For each frame: generate VLM description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: timeline nodes (each with frame + transcript) - 7. Return ParseResult - - Supported formats: MP4, AVI, MOV, MKV, WEBM - """ - - def __init__(self, config: Optional[VideoConfig] = None, **kwargs): - """ - Initialize VideoParser. - - Args: - config: Video parsing configuration - **kwargs: Additional configuration parameters - """ - self.config = config or VideoConfig() - - @property - def supported_extensions(self) -> List[str]: - """Return supported video file extensions.""" - return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] - - async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """ - Parse video file - Not yet implemented. - - Planned implementation: - 1. Load video file using cv2 or similar - 2. Extract metadata (duration, resolution, fps, codec) - 3. Extract audio track: - - Save as temporary audio file - - Parse using AudioParser - 4. Extract key frames: - - At specified intervals (e.g., every 10 seconds) - - Save frames as images - 5. For each frame (if enable_vlm_description): - - Use VLM to generate scene description - 6. Create ResourceNode tree: - - Root: video metadata - - Children: Timeline segments - - Each segment contains: - - Timestamp - - Frame description (VLM) - - Transcript (ASR) - 7. Return ParseResult - - Args: - source: Video file path or URL - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError( - "Video parsing is not yet implemented. " - "This is a placeholder interface for future expansion. " - "\n\nPlanned features:" - "\n- Key frame extraction" - "\n- Audio track transcription" - "\n- VLM scene description" - "\n- Timeline-based structured output" - "\n\nWorkaround: Extract frames and audio manually, then process " - "as images and audio files." - ) - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """ - Parse video from content string - Not yet implemented. - - Args: - content: Video content (base64 or binary string) - source_path: Optional source path for metadata - **kwargs: Additional parsing parameters - - Returns: - ParseResult with video content - - Raises: - NotImplementedError: This feature is not yet implemented - """ - raise NotImplementedError("Video parsing not yet implemented") - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def is_media_parser_available(parser_type: str) -> bool: - """ - Check if a media parser type is currently available. - - Args: - parser_type: Type of parser ("image", "audio", "video") - - Returns: - False (all media parsers are future implementations) - - Examples: - >>> is_media_parser_available("image") - False - >>> is_media_parser_available("video") - False - """ - return False - - -def get_media_parser_status() -> dict: - """ - Get status of all media parsers. 
- - Returns: - Dictionary with parser names and their implementation status - - Examples: - >>> status = get_media_parser_status() - >>> print(status) - { - "image": "planned", - "audio": "planned", - "video": "planned" - } - """ - return { - "image": "planned", - "audio": "planned", - "video": "planned", - } diff --git a/openviking/parse/parsers/media/__init__.py b/openviking/parse/parsers/media/__init__.py new file mode 100644 index 00000000..7fed46b5 --- /dev/null +++ b/openviking/parse/parsers/media/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +from .audio import AudioParser +from .image import ImageParser +from .video import VideoParser + +__all__ = ["ImageParser", "AudioParser", "VideoParser"] diff --git a/openviking/parse/parsers/media/audio.py b/openviking/parse/parsers/media/audio.py new file mode 100644 index 00000000..f0a018c3 --- /dev/null +++ b/openviking/parse/parsers/media/audio.py @@ -0,0 +1,307 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Audio parser - Future implementation. + +Planned Features: +1. Speech-to-text transcription using ASR models +2. Audio metadata extraction (duration, sample rate, channels) +3. Speaker diarization (identify different speakers) +4. Timestamp alignment for transcribed text +5. Generate structured ResourceNode with transcript + +Example workflow: + 1. Load audio file + 2. Extract metadata (duration, format, sample rate) + 3. Transcribe speech to text using Whisper or similar + 4. (Optional) Perform speaker diarization + 5. Create ResourceNode with: + - type: NodeType.ROOT + - children: sections for each speaker/timestamp + - meta: audio metadata and timestamps + 6. Return ParseResult + +Supported formats: MP3, WAV, OGG, FLAC, AAC, M4A +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import AudioConfig + + +class AudioParser(BaseParser): + """ + Audio parser for audio files. + """ + + def __init__(self, config: Optional[AudioConfig] = None, **kwargs): + """ + Initialize AudioParser. + + Args: + config: Audio parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or AudioConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported audio file extensions.""" + return [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse audio file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original audio to temp_uri/content.{ext} + - (Optional) Generate transcript with timestamps + + Phase 2: Generate semantic info + - Generate abstract and overview based on description + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Audio file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If audio processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Audio file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + audio_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original audio with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", audio_bytes) + + # 1.2 Validate audio file using magic bytes + # Define magic bytes for supported audio formats + audio_magic_bytes = { + ".mp3": [b"ID3", b"\xff\xfb", b"\xff\xf3", b"\xff\xf2"], + ".wav": [b"RIFF"], + ".ogg": [b"OggS"], + ".flac": [b"fLaC"], + ".aac": [b"\xff\xf1", b"\xff\xf9"], + ".m4a": [b"\x00\x00\x00", b"ftypM4A", b"ftypisom"], + ".opus": [b"OggS"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = audio_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(audio_bytes) >= len(magic) and audio_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid audio file: {file_path}. 
File signature does not match expected format {ext_lower}" + ) + + # Extract audio metadata (placeholder) + duration = 0 + sample_rate = 0 + channels = 0 + format_str = ext[1:].upper() + + # 1.3 Generate ASR description + description = "" + if self.config.enable_transcription: + description = await self._asr_transcribe(audio_bytes, self.config.asr_model) + else: + # Fallback: basic description + description = f"Audio file: {file_path.name} ({format_str}, {duration}s, {sample_rate}Hz, {channels}ch)" + + # 1.4 Transcript with timestamps (optional) + transcript_text = None + if self.config.enable_transcription and self.config.enable_timestamps: + transcript_text = await self._asr_transcribe_with_timestamps( + audio_bytes, self.config.asr_model + ) + if transcript_text: + await viking_fs.write_file(f"{root_dir_uri}/transcript.md", transcript_text) + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "duration": duration, + "sample_rate": sample_rate, + "channels": channels, + "format": format_str.lower(), + "content_type": "audio", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + "original_filename": original_filename, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info( + root_node, description, viking_fs, transcript_text is not None + ) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="audio", + parser_name="AudioParser", + meta={"content_type": "audio", "format": format_str.lower()}, + ) + + async def _asr_transcribe(self, audio_bytes: bytes, model: Optional[str]) -> str: + """ + Generate audio transcription using ASR. + + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Audio transcription in markdown format + + TODO: Integrate with actual ASR API (Whisper, etc.) + """ + # Fallback implementation - returns basic placeholder + return "Audio transcription (ASR integration pending)\n\nThis is an audio. ASR transcription feature has not yet integrated external API." + + async def _asr_transcribe_with_timestamps( + self, audio_bytes: bytes, model: Optional[str] + ) -> Optional[str]: + """ + Extract transcription with timestamps from audio using ASR. + + Args: + audio_bytes: Audio binary data + model: ASR model name + + Returns: + Transcript with timestamps in markdown format, or None if not available + + TODO: Integrate with ASR API + """ + # Not implemented - return None + return None + + async def _generate_semantic_info( + self, node: ResourceNode, description: str, viking_fs, has_transcript: bool + ): + """ + Phase 2: Generate abstract and overview. 
+ + Args: + node: ResourceNode to update + description: Audio description + viking_fs: VikingFS instance + has_transcript: Whether transcript file exists + """ + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- {node.meta['original_filename']}: Original audio file ({node.meta['duration']}s, {node.meta['sample_rate']}Hz, {node.meta['channels']}ch, {node.meta['format'].upper()} format)\n", + ] + + if has_transcript: + overview_parts.append("- transcript.md: Transcript with timestamps from the audio\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### Play Audio\n") + overview_parts.append("```python\n") + overview_parts.append("audio_bytes = await audio_resource.play()\n") + overview_parts.append("# Returns: Audio file binary data\n") + overview_parts.append("# Purpose: Play or save the audio\n") + overview_parts.append("```\n\n") + + if has_transcript: + overview_parts.append("### Get Timestamped Transcript\n") + overview_parts.append("```python\n") + overview_parts.append("timestamps = await audio_resource.timestamps()\n") + overview_parts.append("# Returns: FileContent object or None\n") + overview_parts.append("# Purpose: Extract timestamped transcript from the audio\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Audio Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"duration = audio_resource.get_duration() # {node.meta['duration']}s\n" + ) + overview_parts.append( + f"sample_rate = audio_resource.get_sample_rate() # {node.meta['sample_rate']}Hz\n" + ) + overview_parts.append( + f"channels = audio_resource.get_channels() # {node.meta['channels']}\n" + ) + overview_parts.append(f'format = audio_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse audio from content string - Not yet implemented. + + Args: + content: Audio content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with audio content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Audio parsing from content not yet implemented") diff --git a/openviking/parse/parsers/media/image.py b/openviking/parse/parsers/media/image.py new file mode 100644 index 00000000..c82b9589 --- /dev/null +++ b/openviking/parse/parsers/media/image.py @@ -0,0 +1,285 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Image parser for OpenViking. + +This module implements the image parsing pipeline (file validation, metadata +extraction, and semantic info generation). VLM description and OCR extraction +remain placeholder hooks pending external API integration. + +For document parsing (PDF, Markdown, HTML, Text), see other parser modules. +""" + +from pathlib import Path +from typing import List, Optional, Union + +from PIL import Image + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import ImageConfig + +# ============================================================================= +# Configuration Classes +# ============================================================================= + + +# ============================================================================= +# Parser Classes +# ============================================================================= + + +class ImageParser(BaseParser): + """ + Image parser for image files. + + Features (VLM description and OCR integration still pending): + 1. Visual content understanding using VLM (Vision Language Model) + 2. OCR text extraction for images containing text + 3. Metadata extraction (dimensions, format, EXIF data) + 4. Generate semantic description and structured ResourceNode + + Example workflow: + 1. Load image file + 2. (Optional) Perform OCR to extract text + 3. (Optional) Use VLM to generate visual description + 4. Create ResourceNode with image metadata and descriptions + 5. Return ParseResult + + Supported formats: PNG, JPG, JPEG, GIF, BMP, WEBP, SVG + """ + + def __init__(self, config: Optional[ImageConfig] = None, **kwargs): + """ + Initialize ImageParser. + + Args: + config: Image parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or ImageConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported image file extensions.""" + return [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse image file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original image to temp_uri/content.{ext} + - (Optional) Generate ocr.md using OCR + + Phase 2: Generate semantic info + - Generate abstract and overview based on description + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Image file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with image content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If image processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Image file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + image_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original image with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", image_bytes) + + # 1.2 Validate and extract image metadata + try: + img = Image.open(file_path) + img.verify() # Verify that it's a valid image + img.close() # Close and reopen to reset after verify() + img = Image.open(file_path) + width, height = img.size + format_str = img.format or ext[1:].upper() + except Exception as e: + raise ValueError(f"Invalid image file: {file_path}. Error: {e}") from e + + # 1.3 Generate VLM description + description = "" + if self.config.enable_vlm: + description = await self._vlm_describe(image_bytes, self.config.vlm_model) + else: + # Fallback: basic description + description = f"Image file: {file_path.name} ({format_str}, {width}x{height})" + + # 1.4 OCR (optional) + ocr_text = None + if self.config.enable_ocr: + ocr_text = await self._ocr_extract(image_bytes, self.config.ocr_lang) + if ocr_text: + await viking_fs.write_file(f"{root_dir_uri}/ocr.md", ocr_text) + + # Create ResourceNode + root_node = ResourceNode( + type=NodeType.ROOT, + title=file_path.stem, + level=0, + detail_file=None, + content_path=None, + children=[], + meta={ + "width": width, + "height": height, + "format": format_str.lower(), + "content_type": "image", + "source_title": file_path.stem, + "semantic_name": file_path.stem, + "original_filename": original_filename, + }, + ) + + # Phase 2: Generate semantic info + await self._generate_semantic_info(root_node, description, viking_fs, ocr_text is not None) + + # Phase 3: Build directory structure (handled by TreeBuilder) + return ParseResult( + root=root_node, + source_path=str(file_path), + temp_dir_path=temp_uri, + source_format="image", + parser_name="ImageParser", + meta={"content_type": "image", "format": format_str.lower()}, + ) + + async def _vlm_describe(self, image_bytes: bytes, model: Optional[str]) -> str: + """ + Generate image description using VLM. 
+ + Args: + image_bytes: Image binary data + model: VLM model name + + Returns: + Image description in markdown format + + TODO: Integrate with actual VLM API (OpenAI GPT-4V, Claude Vision, etc.) + """ + # Fallback implementation - returns basic placeholder + return "Image description (VLM integration pending)\n\nThis is an image. VLM description feature has not yet integrated external API." + + async def _ocr_extract(self, image_bytes: bytes, lang: str) -> Optional[str]: + """ + Extract text from image using OCR. + + Args: + image_bytes: Image binary data + lang: OCR language code + + Returns: + Extracted text in markdown format, or None if no text found + + TODO: Integrate with OCR API (Tesseract, PaddleOCR, etc.) + """ + # Not implemented - return None + return None + + async def _generate_semantic_info( + self, node: ResourceNode, description: str, viking_fs, has_ocr: bool + ): + """ + Phase 2: Generate abstract and overview. + + Args: + node: ResourceNode to update + description: Image description + viking_fs: VikingFS instance + has_ocr: Whether OCR file exists + """ + # Generate abstract (short summary, < 100 tokens) + abstract = description[:200] if len(description) > 200 else description + + # Generate overview (content summary + file list + usage instructions) + overview_parts = [ + "## Content Summary\n", + description, + "\n\n## Available Files\n", + f"- {node.meta['original_filename']}: Original image file ({node.meta['width']}x{node.meta['height']}, {node.meta['format'].upper()} format)\n", + ] + + if has_ocr: + overview_parts.append("- ocr.md: OCR text recognition result from the image\n") + + overview_parts.append("\n## Usage\n") + overview_parts.append("### View Image\n") + overview_parts.append("```python\n") + overview_parts.append("image_bytes = await image_resource.view()\n") + overview_parts.append("# Returns: PNG/JPG format image binary data\n") + overview_parts.append("# Purpose: Display or save the image\n") + overview_parts.append("```\n\n") + + if has_ocr: + overview_parts.append("### Get OCR-recognized Text\n") + overview_parts.append("```python\n") + overview_parts.append("ocr_text = await image_resource.ocr()\n") + overview_parts.append("# Returns: FileContent object or None\n") + overview_parts.append("# Purpose: Extract text information from the image\n") + overview_parts.append("```\n\n") + + overview_parts.append("### Get Image Metadata\n") + overview_parts.append("```python\n") + overview_parts.append( + f"size = image_resource.get_size() # ({node.meta['width']}, {node.meta['height']})\n" + ) + overview_parts.append(f'format = image_resource.get_format() # "{node.meta["format"]}"\n') + overview_parts.append("```\n") + + overview = "".join(overview_parts) + + # Store in node meta + node.meta["abstract"] = abstract + node.meta["overview"] = overview + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """ + Parse image from content string - Not yet implemented. 
+ + Args: + content: Image content (base64 or binary string) + source_path: Optional source path for metadata + **kwargs: Additional parsing parameters + + Returns: + ParseResult with image content + + Raises: + NotImplementedError: This feature is not yet implemented + """ + raise NotImplementedError("Image parsing not yet implemented") diff --git a/openviking/parse/parsers/media/video.py b/openviking/parse/parsers/media/video.py new file mode 100644 index 00000000..53cccf67 --- /dev/null +++ b/openviking/parse/parsers/media/video.py @@ -0,0 +1,286 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Video parser - Future implementation. + +Planned Features: +1. Key frame extraction at regular intervals +2. Audio track transcription using ASR +3. VLM-based scene description for key frames +4. Video metadata extraction (duration, resolution, codec) +5. Generate structured ResourceNode combining visual and audio + +Example workflow: + 1. Load video file + 2. Extract metadata (duration, resolution, fps) + 3. Extract audio track → transcribe using AudioParser + 4. Extract key frames at specified intervals + 5. For each frame: generate VLM description + 6. Create ResourceNode tree: + - Root: video metadata + - Children: timeline nodes (each with frame + transcript) + 7. Return ParseResult + +Supported formats: MP4, AVI, MOV, MKV, WEBM +""" + +from pathlib import Path +from typing import List, Optional, Union + +from openviking.parse.base import NodeType, ParseResult, ResourceNode +from openviking.parse.parsers.base_parser import BaseParser +from openviking_cli.utils.config.parser_config import VideoConfig + + +class VideoParser(BaseParser): + """ + Video parser for video files. + """ + + def __init__(self, config: Optional[VideoConfig] = None, **kwargs): + """ + Initialize VideoParser. + + Args: + config: Video parsing configuration + **kwargs: Additional configuration parameters + """ + self.config = config or VideoConfig() + + @property + def supported_extensions(self) -> List[str]: + """Return supported video file extensions.""" + return [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv"] + + async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: + """ + Parse video file using three-phase architecture. 
+ + Phase 1: Generate temporary files + - Copy original video to temp_uri/content.{ext} + - Extract key frames + - Extract audio track and transcribe using ASR + + Phase 2: Generate semantic info + - Generate abstract and overview based on descriptions + - Overview includes file list and usage instructions + + Phase 3: Build directory structure + - Move all files to final URI + - Generate .abstract.md, .overview.md + + Args: + source: Video file path + **kwargs: Additional parsing parameters + + Returns: + ParseResult with video content + + Raises: + FileNotFoundError: If source file does not exist + IOError: If video processing fails + """ + from openviking.storage.viking_fs import get_viking_fs + + # Convert to Path object + file_path = Path(source) if isinstance(source, str) else source + if not file_path.exists(): + raise FileNotFoundError(f"Video file not found: {source}") + + viking_fs = get_viking_fs() + temp_uri = viking_fs.create_temp_uri() + + # Phase 1: Generate temporary files + video_bytes = file_path.read_bytes() + ext = file_path.suffix + + from openviking_cli.utils.uri import VikingURI + + # Sanitize original filename (replace spaces with underscores) + original_filename = file_path.name.replace(" ", "_") + # Root directory name: filename stem + _ + extension (without dot) + stem = file_path.stem.replace(" ", "_") + ext_no_dot = ext[1:] if ext else "" + root_dir_name = VikingURI.sanitize_segment(f"{stem}_{ext_no_dot}") + root_dir_uri = f"{temp_uri}/{root_dir_name}" + await viking_fs.mkdir(root_dir_uri) + + # 1.1 Save original video with original filename (sanitized) + await viking_fs.write_file_bytes(f"{root_dir_uri}/{original_filename}", video_bytes) + + # 1.2 Validate video file using magic bytes + # Define magic bytes for supported video formats + video_magic_bytes = { + ".mp4": [b"\x00\x00\x00", b"ftyp"], + ".avi": [b"RIFF"], + ".mov": [b"\x00\x00\x00", b"ftyp"], + ".mkv": [b"\x1a\x45\xdf\xa3"], + ".webm": [b"\x1a\x45\xdf\xa3"], + ".flv": [b"FLV"], + ".wmv": [b"\x30\x26\xb2\x75\x8e\x66\xcf\x11"], + } + + # Check magic bytes + valid = False + ext_lower = ext.lower() + magic_list = video_magic_bytes.get(ext_lower, []) + for magic in magic_list: + if len(video_bytes) >= len(magic) and video_bytes.startswith(magic): + valid = True + break + + if not valid: + raise ValueError( + f"Invalid video file: {file_path}. 
+
+        # Extract video metadata (placeholder)
+        duration = 0
+        width = 0
+        height = 0
+        fps = 0
+        format_str = ext[1:].upper()
+
+        # 1.3 Generate combined description
+        description = ""
+        if self.config.enable_key_frames or self.config.enable_audio_transcription:
+            description = await self._generate_video_description(file_path, self.config)
+        else:
+            # Fallback: basic description
+            description = f"Video file: {file_path.name} ({format_str}, {duration}s, {width}x{height}, {fps}fps)"
+
+        # 1.4 Key frames (optional)
+        key_frames_dir = f"{root_dir_uri}/keyframes"
+        has_key_frames = False
+        if self.config.enable_key_frames:
+            await viking_fs.mkdir(key_frames_dir)
+            has_key_frames = True
+
+        # Create ResourceNode
+        root_node = ResourceNode(
+            type=NodeType.ROOT,
+            title=file_path.stem,
+            level=0,
+            detail_file=None,
+            content_path=None,
+            children=[],
+            meta={
+                "duration": duration,
+                "width": width,
+                "height": height,
+                "fps": fps,
+                "format": format_str.lower(),
+                "content_type": "video",
+                "source_title": file_path.stem,
+                "semantic_name": file_path.stem,
+                "original_filename": original_filename,
+            },
+        )
+
+        # Phase 2: Generate semantic info
+        await self._generate_semantic_info(root_node, description, viking_fs, has_key_frames)
+
+        # Phase 3: Build directory structure (handled by TreeBuilder)
+        return ParseResult(
+            root=root_node,
+            source_path=str(file_path),
+            temp_dir_path=temp_uri,
+            source_format="video",
+            parser_name="VideoParser",
+            meta={"content_type": "video", "format": format_str.lower()},
+        )
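End to end, the parser is awaited like any other `BaseParser`; a minimal usage sketch (assumes a configured VikingFS environment; `demo.mp4` is a placeholder file name):

```python
import asyncio


async def main() -> None:
    parser = VideoParser()
    result = await parser.parse("demo.mp4")  # placeholder path
    print(result.source_format)              # "video"
    print(result.root.meta["duration"])      # 0 until metadata probing lands
    print(result.root.meta["abstract"])      # filled in by Phase 2


asyncio.run(main())
```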
+
+    async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str:
+        """
+        Generate video description using key frames and audio transcription.
+
+        Args:
+            file_path: Video file path
+            config: Video parsing configuration
+
+        Returns:
+            Video description in markdown format
+
+        TODO: Integrate with actual video processing libraries
+        """
+        # Fallback implementation - returns a basic placeholder
+        return (
+            "Video description (video processing integration pending)\n\n"
+            "This is a video file; key-frame and transcription support has not "
+            "yet been integrated with external libraries."
+        )
+
+    async def _generate_semantic_info(
+        self, node: ResourceNode, description: str, viking_fs, has_key_frames: bool
+    ):
+        """
+        Phase 2: Generate abstract and overview.
+
+        Args:
+            node: ResourceNode to update
+            description: Video description
+            viking_fs: VikingFS instance
+            has_key_frames: Whether key frames directory exists
+        """
+        # Generate abstract (short summary, < 100 tokens);
+        # slicing already handles input shorter than 200 characters
+        abstract = description[:200]
+
+        # Generate overview (content summary + file list + usage instructions)
+        overview_parts = [
+            "## Content Summary\n",
+            description,
+            "\n\n## Available Files\n",
+            f"- {node.meta['original_filename']}: Original video file ({node.meta['duration']}s, {node.meta['width']}x{node.meta['height']}, {node.meta['fps']}fps, {node.meta['format'].upper()} format)\n",
+        ]
+
+        if has_key_frames:
+            overview_parts.append("- keyframes/: Directory containing extracted key frames\n")
+
+        overview_parts.append("\n## Usage\n")
+        overview_parts.append("### Play Video\n")
+        overview_parts.append("```python\n")
+        overview_parts.append("video_bytes = await video_resource.play()\n")
+        overview_parts.append("# Returns: Video file binary data\n")
+        overview_parts.append("# Purpose: Play or save the video\n")
+        overview_parts.append("```\n\n")
+
+        if has_key_frames:
+            overview_parts.append("### Get Key Frames\n")
+            overview_parts.append("```python\n")
+            overview_parts.append("keyframes = await video_resource.keyframes()\n")
+            overview_parts.append("# Returns: List of key frame resources\n")
+            overview_parts.append("# Purpose: Analyze video scenes\n")
+            overview_parts.append("```\n\n")
+
+        overview_parts.append("### Get Video Metadata\n")
+        overview_parts.append("```python\n")
+        overview_parts.append(
+            f"duration = video_resource.get_duration()  # {node.meta['duration']}s\n"
+        )
+        overview_parts.append(
+            f"resolution = video_resource.get_resolution()  # ({node.meta['width']}, {node.meta['height']})\n"
+        )
+        overview_parts.append(f"fps = video_resource.get_fps()  # {node.meta['fps']}\n")
+        overview_parts.append(f'format = video_resource.get_format()  # "{node.meta["format"]}"\n')
+        overview_parts.append("```\n")
+
+        overview = "".join(overview_parts)
+
+        # Store in node meta
+        node.meta["abstract"] = abstract
+        node.meta["overview"] = overview
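Once key frames and transcription exist, the `_generate_video_description` placeholder above could wire into the vision API this PR extends. A sketch under heavy assumptions: `_extract_key_frames`, `_transcribe_audio`, and the `config.vlm` handle are all hypothetical names, while `get_vision_completion_async(prompt, images, thinking)` is the signature introduced in this PR's VLM layer:

```python
async def _generate_video_description(self, file_path: Path, config: VideoConfig) -> str:
    frames = await self._extract_key_frames(file_path, config)  # hypothetical helper
    transcript = await self._transcribe_audio(file_path)        # hypothetical helper
    prompt = (
        "Describe this video using the attached key frames and the audio "
        f"transcript below:\n\n{transcript}"
    )
    # Pass frames as images; thinking stays at the configured default here.
    return await config.vlm.get_vision_completion_async(prompt, frames, thinking=False)
```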
+
+    async def parse_content(
+        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
+    ) -> ParseResult:
+        """
+        Parse video from content string - Not yet implemented.
+
+        Args:
+            content: Video content (base64 or binary string)
+            source_path: Optional source path for metadata
+            **kwargs: Additional parsing parameters
+
+        Returns:
+            ParseResult with video content
+
+        Raises:
+            NotImplementedError: This feature is not yet implemented
+        """
+        raise NotImplementedError("Video parsing from content not yet implemented")
diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index 526b2272..378bcd7d 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -82,12 +82,18 @@ def __init__(self, register_optional: bool = True):
         # Register optional media parsers
         if register_optional:
             try:
-                from openviking.parse.parsers.media import ImageParser
+                from openviking.parse.parsers.media import AudioParser, ImageParser, VideoParser
 
                 self.register("image", ImageParser())
                 logger.info("Registered ImageParser for image formats")
+
+                self.register("audio", AudioParser())
+                logger.info("Registered AudioParser for audio formats")
+
+                self.register("video", VideoParser())
+                logger.info("Registered VideoParser for video formats")
             except ImportError as e:
-                logger.debug(f"ImageParser not registered: {e}")
+                logger.debug(f"Media parsers not registered: {e}")
 
     def register(self, name: str, parser: BaseParser) -> None:
         """
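One trade-off of the widened `try` block: if any one of the three imports raises `ImportError`, none of the media parsers register. If per-format degradation is wanted instead, a variant along these lines (same names as in `registry.py`; a sketch, not the merged behavior) would register whatever imports succeed:

```python
media_parsers = [("image", "ImageParser"), ("audio", "AudioParser"), ("video", "VideoParser")]
for name, cls_name in media_parsers:
    try:
        # Resolve each parser class independently so one missing optional
        # dependency does not disable the others.
        module = __import__("openviking.parse.parsers.media", fromlist=[cls_name])
        self.register(name, getattr(module, cls_name)())
        logger.info(f"Registered {cls_name} for {name} formats")
    except (ImportError, AttributeError) as e:
        logger.debug(f"{cls_name} not registered: {e}")
```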
diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py
index 903e822d..0e582dab 100644
--- a/openviking/parse/tree_builder.py
+++ b/openviking/parse/tree_builder.py
@@ -108,14 +108,50 @@ async def finalize_from_temp(
         doc_dirs = [e for e in entries if e.get("isDir") and e["name"] not in [".", ".."]]
 
         if len(doc_dirs) != 1:
-            raise ValueError(f"Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}")
+            logger.error(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
+            raise ValueError(
+                f"[TreeBuilder] Expected 1 document directory in {temp_uri}, found {len(doc_dirs)}"
+            )
 
-        doc_name = doc_dirs[0]["name"]
+        from openviking_cli.utils.uri import VikingURI
+
+        doc_name = VikingURI.sanitize_segment(doc_dirs[0]["name"])
         doc_uri = f"{temp_uri}/{doc_name}"
 
         # 2. Determine base_uri
         if base_uri is None:
-            base_uri = self._get_base_uri(scope)
+            # Check if it's a media file (image/audio/video)
+            media_type = None
+            if source_format:
+                if source_format in ["image", "audio", "video"]:
+                    media_type = source_format
+            elif source_path:
+                from pathlib import Path
+
+                ext = Path(source_path).suffix.lower()
+                image_exts = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]
+                audio_exts = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus"]
+                # Keep in sync with VideoParser.supported_extensions
+                video_exts = [".mp4", ".mov", ".avi", ".webm", ".mkv", ".flv", ".wmv"]
+                if ext in image_exts:
+                    media_type = "image"
+                elif ext in audio_exts:
+                    media_type = "audio"
+                elif ext in video_exts:
+                    media_type = "video"
+
+            if media_type:
+                # Map singular media types to plural directory names
+                media_dir_map = {"image": "images", "audio": "audio", "video": "video"}
+                media_dir = media_dir_map.get(media_type, media_type)
+                # Get current date in YYYYMMDD format
+                from datetime import datetime
+
+                date_str = datetime.now().strftime("%Y%m%d")
+                base_uri = f"viking://resources/{media_dir}/{date_str}"
+            else:
+                base_uri = self._get_base_uri(scope)
 
         logger.info(f"Finalizing from temp: {temp_uri} -> {base_uri}")
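The routing above condenses to a small pure function; a sketch for quick reference (extension sets copied from this hunk; the date in the comment is only an example):

```python
from datetime import datetime
from pathlib import Path
from typing import Optional

MEDIA_DIRS = {"image": "images", "audio": "audio", "video": "video"}
EXT_TO_TYPE = {
    **{e: "image" for e in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg")},
    **{e: "audio" for e in (".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".opus")},
    **{e: "video" for e in (".mp4", ".mov", ".avi", ".webm", ".mkv", ".flv", ".wmv")},
}


def media_base_uri(source_path: str) -> Optional[str]:
    media_type = EXT_TO_TYPE.get(Path(source_path).suffix.lower())
    if media_type is None:
        return None  # non-media: caller falls back to _get_base_uri(scope)
    return f"viking://resources/{MEDIA_DIRS[media_type]}/{datetime.now():%Y%m%d}"


# media_base_uri("clip.mp4") -> e.g. "viking://resources/video/20260214"
```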
diff --git a/openviking_cli/cli/commands/resources.py b/openviking_cli/cli/commands/resources.py
index 92940dd7..a9bfc28f 100644
--- a/openviking_cli/cli/commands/resources.py
+++ b/openviking_cli/cli/commands/resources.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 """Resource management commands."""
 
+from pathlib import Path
 from typing import Optional
 
 import typer
@@ -23,10 +24,62 @@ def add_resource_command(
     timeout: Optional[float] = typer.Option(600.0, help="Wait timeout in seconds"),
 ) -> None:
     """Add resources into OpenViking."""
+    # Validate path: if it's a local path, check if it exists
+    final_path = path
+    if not (path.startswith("http://") or path.startswith("https://")):
+        unescaped_path = path.replace("\\ ", " ")
+        local_path = Path(unescaped_path)
+        final_path = unescaped_path
+        if not local_path.exists():
+            # Check if there are extra arguments (possible unquoted path with spaces)
+            import sys
+
+            # Find the index of 'add-resource' in sys.argv
+            try:
+                add_resource_idx = sys.argv.index("add-resource")
+            except ValueError:
+                add_resource_idx = sys.argv.index("add") if "add" in sys.argv else -1
+
+            if add_resource_idx != -1 and len(sys.argv) > add_resource_idx + 2:
+                # There are extra positional arguments - likely an unquoted path with spaces
+                extra_args = sys.argv[add_resource_idx + 2 :]
+                suggested_path = f"{path} {' '.join(extra_args)}"
+                typer.echo(
+                    typer.style(
+                        f"Error: Path '{path}' does not exist.",
+                        fg=typer.colors.RED,
+                        bold=True,
+                    ),
+                    err=True,
+                )
+                typer.echo(
+                    typer.style(
+                        "\nIt looks like you may have forgotten to quote a path with spaces.",
+                        fg=typer.colors.YELLOW,
+                    ),
+                    err=True,
+                )
+                typer.echo(
+                    typer.style(
+                        f'Suggested command: ov add-resource "{suggested_path}"',
+                        fg=typer.colors.CYAN,
+                    ),
+                    err=True,
+                )
+                raise typer.Exit(code=1)
+            else:
+                typer.echo(
+                    typer.style(
+                        f"Error: Path '{path}' does not exist.", fg=typer.colors.RED, bold=True
+                    ),
+                    err=True,
+                )
+                raise typer.Exit(code=1)
+
     run(
         ctx,
         lambda client: client.add_resource(
-            path=path,
+            path=final_path,
             target=to,
             reason=reason,
             instruction=instruction,
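The suggestion path reconstructs the intended filename from the leftover positional arguments; the core of it, runnable in isolation (the argv values are illustrative):

```python
argv = ["ov", "add-resource", "/tmp/My", "Report.pdf"]  # simulated sys.argv

idx = argv.index("add-resource")
path, extra = argv[idx + 1], argv[idx + 2 :]
if extra:
    # Rejoin the shell-split pieces into one quoted path suggestion.
    suggested = f"{path} {' '.join(extra)}"
    print(f'Suggested command: ov add-resource "{suggested}"')
# -> Suggested command: ov add-resource "/tmp/My Report.pdf"
```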
diff --git a/openviking_cli/client/http.py b/openviking_cli/client/http.py
index 39526602..a5cb6903 100644
--- a/openviking_cli/client/http.py
+++ b/openviking_cli/client/http.py
@@ -36,6 +36,7 @@
     load_json_config,
     resolve_config_path,
 )
+from openviking_cli.utils.uri import VikingURI
 
 # Error code to exception class mapping
 ERROR_CODE_TO_EXCEPTION = {
@@ -281,6 +282,7 @@ async def ls(
         node_limit: int = 1000,
     ) -> List[Any]:
         """List directory contents."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/ls",
             params={
@@ -304,6 +306,7 @@ async def tree(
         node_limit: int = 1000,
     ) -> List[Dict[str, Any]]:
         """Get directory tree."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/tree",
             params={
@@ -318,6 +321,7 @@ async def tree(
 
     async def stat(self, uri: str) -> Dict[str, Any]:
         """Get resource status."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/fs/stat",
             params={"uri": uri},
@@ -326,6 +330,7 @@ async def stat(self, uri: str) -> Dict[str, Any]:
 
     async def mkdir(self, uri: str) -> None:
         """Create directory."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/fs/mkdir",
             json={"uri": uri},
@@ -334,6 +339,7 @@ async def mkdir(self, uri: str) -> None:
 
     async def rm(self, uri: str, recursive: bool = False) -> None:
         """Remove resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/fs",
@@ -343,6 +349,8 @@ async def rm(self, uri: str, recursive: bool = False) -> None:
 
     async def mv(self, from_uri: str, to_uri: str) -> None:
         """Move resource."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.post(
             "/api/v1/fs/mv",
             json={"from_uri": from_uri, "to_uri": to_uri},
@@ -353,6 +361,7 @@ async def mv(self, from_uri: str, to_uri: str) -> None:
 
     async def read(self, uri: str) -> str:
         """Read file content."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/read",
             params={"uri": uri},
@@ -361,6 +370,7 @@ async def read(self, uri: str) -> str:
 
     async def abstract(self, uri: str) -> str:
         """Read L0 abstract."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/abstract",
             params={"uri": uri},
@@ -369,6 +379,7 @@ async def abstract(self, uri: str) -> str:
 
     async def overview(self, uri: str) -> str:
         """Read L1 overview."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/content/overview",
             params={"uri": uri},
@@ -386,6 +397,8 @@ async def find(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search without session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         response = await self._http.post(
             "/api/v1/search/find",
             json={
@@ -409,6 +422,8 @@ async def search(
         filter: Optional[Dict[str, Any]] = None,
     ) -> FindResult:
         """Semantic search with optional session context."""
+        if target_uri:
+            target_uri = VikingURI.normalize(target_uri)
         sid = session_id or (session.session_id if session else None)
         response = await self._http.post(
             "/api/v1/search/search",
@@ -425,6 +440,7 @@ async def search(
 
     async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) -> Dict[str, Any]:
         """Content search with pattern."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/grep",
             json={
@@ -437,6 +453,7 @@ async def grep(self, uri: str, pattern: str, case_insensitive: bool = False) ->
 
     async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
         """File pattern matching."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/search/glob",
             json={"pattern": pattern, "uri": uri},
@@ -447,6 +464,7 @@ async def glob(self, pattern: str, uri: str = "viking://") -> Dict[str, Any]:
 
     async def relations(self, uri: str) -> List[Any]:
         """Get relations for a resource."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.get(
             "/api/v1/relations",
             params={"uri": uri},
@@ -455,6 +473,11 @@ async def relations(self, uri: str) -> List[Any]:
 
     async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str = "") -> None:
         """Create link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        if isinstance(to_uris, str):
+            to_uris = VikingURI.normalize(to_uris)
+        else:
+            to_uris = [VikingURI.normalize(u) for u in to_uris]
         response = await self._http.post(
             "/api/v1/relations/link",
             json={"from_uri": from_uri, "to_uris": to_uris, "reason": reason},
@@ -463,6 +486,8 @@ async def link(self, from_uri: str, to_uris: Union[str, List[str]], reason: str
 
     async def unlink(self, from_uri: str, to_uri: str) -> None:
         """Remove link between resources."""
+        from_uri = VikingURI.normalize(from_uri)
+        to_uri = VikingURI.normalize(to_uri)
         response = await self._http.request(
             "DELETE",
             "/api/v1/relations/link",
@@ -512,6 +537,7 @@ async def add_message(self, session_id: str, role: str, content: str) -> Dict[st
 
     async def export_ovpack(self, uri: str, to: str) -> str:
         """Export context as .ovpack file."""
+        uri = VikingURI.normalize(uri)
         response = await self._http.post(
             "/api/v1/pack/export",
             json={"uri": uri, "to": to},
@@ -527,6 +553,7 @@ async def import_ovpack(
         vectorize: bool = True,
     ) -> str:
         """Import .ovpack file."""
+        parent = VikingURI.normalize(parent)
         response = await self._http.post(
             "/api/v1/pack/import",
             json={
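For callers, the practical effect is that bare and slash-prefixed paths now behave like fully schemed URIs. A sketch, inside an async context and assuming a connected client constructed as elsewhere in this module:

```python
# All three hit the same resource after client-side normalization:
await client.ls("viking://resources/images")
await client.ls("/resources/images")
await client.ls("resources/images")

# link() accepts a single target or a list; every URI is normalized first:
await client.link("notes/a", "/notes/b", reason="related")
await client.link("notes/a", ["notes/b", "viking://notes/c"], reason="related")
```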
diff --git a/openviking_cli/utils/config/vlm_config.py b/openviking_cli/utils/config/vlm_config.py
index ad1bea8f..411c7d76 100644
--- a/openviking_cli/utils/config/vlm_config.py
+++ b/openviking_cli/utils/config/vlm_config.py
@@ -15,17 +15,16 @@ class VLMConfig(BaseModel):
     max_retries: int = Field(default=2, description="Maximum retry attempts")
 
     provider: Optional[str] = Field(default=None, description="Provider type")
-    backend: Optional[str] = Field(default=None, description="Backend provider (Deprecated, use 'provider' instead)")
+    backend: Optional[str] = Field(
+        default=None, description="Backend provider (Deprecated, use 'provider' instead)"
+    )
 
     providers: Dict[str, Dict[str, Any]] = Field(
         default_factory=dict,
-        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}"
+        description="Multi-provider configuration, e.g. {'deepseek': {'api_key': 'xxx', 'api_base': 'xxx'}}",
     )
-    default_provider: Optional[str] = Field(
-        default=None,
-        description="Default provider name"
-    )
+    default_provider: Optional[str] = Field(default=None, description="Default provider name")
 
     thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models")
 
@@ -141,6 +140,7 @@ def get_vlm_instance(self) -> Any:
         if self._vlm_instance is None:
             config_dict = self._build_vlm_config_dict()
             from openviking.models.vlm import VLMFactory
+
             self._vlm_instance = VLMFactory.create(config_dict)
         return self._vlm_instance
 
@@ -166,13 +166,15 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]:
 
         return result
 
-    def get_completion(self, prompt: str) -> str:
+    def get_completion(self, prompt: str, thinking: bool = False) -> str:
         """Get LLM completion."""
-        return self.get_vlm_instance().get_completion(prompt)
+        return self.get_vlm_instance().get_completion(prompt, thinking)
 
-    async def get_completion_async(self, prompt: str, max_retries: int = 0) -> str:
+    async def get_completion_async(
+        self, prompt: str, thinking: bool = False, max_retries: int = 0
+    ) -> str:
         """Get LLM completion asynchronously, max_retries=0 means no retry."""
-        return await self.get_vlm_instance().get_completion_async(prompt, max_retries)
+        return await self.get_vlm_instance().get_completion_async(prompt, thinking, max_retries)
 
     def is_available(self) -> bool:
         """Check if LLM is configured."""
@@ -182,14 +184,16 @@ def get_vision_completion(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images."""
-        return self.get_vlm_instance().get_vision_completion(prompt, images)
+        return self.get_vlm_instance().get_vision_completion(prompt, images, thinking)
 
     async def get_vision_completion_async(
         self,
         prompt: str,
         images: list,
+        thinking: bool = False,
     ) -> str:
         """Get LLM completion with images asynchronously."""
-        return await self.get_vlm_instance().get_vision_completion_async(prompt, images)
+        return await self.get_vlm_instance().get_vision_completion_async(prompt, images, thinking)
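With these wrappers, `thinking` defaults to the configured value (`false` in the example configs above) but can be raised per call. A usage sketch, assuming `cfg` is a populated `VLMConfig` and the async call runs inside an async context:

```python
# One-off completion with thinking enabled, overriding the config default:
answer = cfg.get_completion("Summarize the key frames.", thinking=True)

# Async variant with retries; thinking is passed through to the backend:
answer = await cfg.get_completion_async(
    "Summarize the transcript.", thinking=True, max_retries=2
)
```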
diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py
index 9cf6d856..6f50a3a9 100644
--- a/openviking_cli/utils/uri.py
+++ b/openviking_cli/utils/uri.py
@@ -201,7 +201,7 @@ def build_semantic_uri(
         Build a semantic URI based on parent URI.
         """
         # Sanitize semantic name for URI
-        safe_name = VikingURI._sanitize_segment(semantic_name)
+        safe_name = VikingURI.sanitize_segment(semantic_name)
 
         if not is_leaf:
             return f"{parent_uri}/{safe_name}"
@@ -211,11 +211,12 @@ def build_semantic_uri(
         return f"{parent_uri}/{safe_name}/{node_id}"
 
     @staticmethod
-    def _sanitize_segment(text: str) -> str:
+    def sanitize_segment(text: str) -> str:
         """
         Sanitize text for use in URI segment.
 
-        Preserves Chinese characters but replaces special characters.
+        Preserves CJK characters (Chinese, Japanese, Korean) and other common scripts
+        while replacing special characters.
 
         Args:
             text: Original text
@@ -223,8 +224,18 @@ def _sanitize_segment(text: str) -> str:
         Returns:
             URI-safe string
         """
-        # Preserve Chinese characters, letters, numbers, underscores, hyphens
-        safe = re.sub(r"[^\w\u4e00-\u9fff\-]", "_", text)
+        # Preserve:
+        # - Letters, numbers, underscores, hyphens (\w includes [a-zA-Z0-9_])
+        # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja)
+        # - Hiragana and Katakana (Japanese)
+        # - Hangul Syllables (Korean)
+        # - CJK Unified Ideographs Extension A
+        # - CJK Unified Ideographs Extension B
+        safe = re.sub(
+            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]",
+            "_",
+            text,
+        )
         # Merge consecutive underscores
         safe = re.sub(r"_+", "_", safe)
         # Strip leading/trailing underscores and limit length
@@ -245,6 +256,33 @@ def __eq__(self, other) -> bool:
 
     def __hash__(self) -> int:
         return hash(self.uri)
 
+    @staticmethod
+    def normalize(uri: str) -> str:
+        """
+        Normalize URI by ensuring it has the viking:// scheme.
+
+        If the input already starts with viking://, it is returned as-is.
+        Otherwise, leading slashes are stripped first (so "/resources" does not
+        become the invalid "viking:///resources") and viking:// is prepended.
+
+        Examples:
+            "/resources/images" -> "viking://resources/images"
+            "resources/images" -> "viking://resources/images"
+            "viking://resources/images" -> "viking://resources/images"
+
+        Args:
+            uri: Input URI string
+
+        Returns:
+            Normalized URI with viking:// scheme
+        """
+        if uri.startswith(f"{VikingURI.SCHEME}://"):
+            return uri
+        # Strip leading slashes
+        uri = uri.lstrip("/")
+        return f"{VikingURI.SCHEME}://{uri}"
+
     @classmethod
     def create_temp_uri(cls) -> str:
         """Create temp directory URI like viking://temp/MMDDHHMM_XXXXXX"""