diff --git a/challenge/onsite_competition/onsite_competition_rules_en-US.md b/challenge/onsite_competition/onsite_competition_rules_en-US.md
index d9c241c9..ff4e3568 100644
--- a/challenge/onsite_competition/onsite_competition_rules_en-US.md
+++ b/challenge/onsite_competition/onsite_competition_rules_en-US.md
@@ -38,13 +38,12 @@ The challenge centers on vision-language fusion for cross-room end-to-end naviga
 - A standardized debugging time and environment will be provided onsite. Teams may use model weights different from the online stage and make environment-specific configuration adjustments, but modifications to the core algorithm logic are strictly prohibited.
 
 ### 4.2 Procedure
-Each team will receive 10 fixed-sequence instructions. Start from the first one.
+Each team will receive 10 instructions. Start from any instruction.
 For each instruction:
 - Move the robot to the given starting position.
 - Provide the instruction to the robot and raise your hand to signal the start.
-- If execution fails, teams may retry (up to 3 attempts) or skip the instruction.
-- After 3 failed attempts (timeout, collision, human intervention, etc.), the instruction must be skipped.
-- Skipped instructions cannot be retried later.
+- If execution fails, teams may retry or skip the instruction.
+- Instruction can be repeated if failed (timeout, collision, human intervention, etc.).
 - Before each attempt, the algorithm state must be reset and previous observations cleared.
 
 ### 4.3 Time Limit
@@ -82,7 +81,7 @@ Successfully completing one instruction will be one success, and the completion
 | Action | Score Impact |
 |:--|:--|
 | Successfully reach goal | success |
-| Minor scrape more than 5 times or Collision with obstacle | fail |
+| Minor scrape or Collision with obstacle | fail |
 
 If there is a trend of continuous collisions, the referee has the right to terminate the robot’s current action, with the severity of the impact determined by the on-site referee.
 
diff --git a/challenge/onsite_competition/onsite_competition_rules_zh-CN.md b/challenge/onsite_competition/onsite_competition_rules_zh-CN.md
index 9edabee0..6f3ed366 100644
--- a/challenge/onsite_competition/onsite_competition_rules_zh-CN.md
+++ b/challenge/onsite_competition/onsite_competition_rules_zh-CN.md
@@ -32,11 +32,10 @@
 - 比赛现场将提供统一的调试时间和环境，允许使用与线上赛不同的模型权重，允许进行环境适配性配置修改，但禁止修改核心算法逻辑。
 
 ### 4.2 比赛过程
-每支队伍开始挑战时将会提供十条固定顺序的指令，从第一条指令开始。
+每支队伍开始挑战时将会提供十条导航指令，选手可选择指令进行。
   - 移动机器人到达该指令的起始位置后，将指令输入给机器人后举手示意作为开始挑战的信号。
-  - 指令执行过程中若中途失败，可选择重试，也可以选择跳过该指令，顺序执行下一条未执行指令。
-  - 每条指令可重试3次，若3次失败，则必须跳过该指令。（失败指选手选择重新开始，或其他条件，如超时，冲撞障碍物，人为干预等）
-  - 已经跳过的指令不能重复执行。
+  - 指令执行过程中若中途失败，可选择重试，也可以选择跳过该指令。
+  - 失败时可选择重新开始（如超时，冲撞障碍物，人为干预等）
   - 每次测试前需重置算法状态并清除所有前序执行过程中获取的观测缓存。
 
 ### 4.3 时间限制
@@ -60,7 +59,7 @@
 - 成功完成1条指令，算成功一次，并记录该指令的完成时间。（成功条件：在比赛中，每条指令均对应一个目标位置。目标位置定义为半径 2 米的圆形区域（不穿墙），机器人最终停止在该范围即可判定为该条指令导航成功。）
 
 **扣分规则**：
-- 若机器人频繁刮蹭（5次）或直接撞击障碍物，本次导航将被强制终止，该条指令失败。
+- 若机器人连续刮蹭或直接撞击障碍物，本次导航将被强制终止，该条指令失败。
 
 | Action | Score |
 |:--|:--:|
diff --git a/challenge/onsite_competition/sdk/cam.py b/challenge/onsite_competition/sdk/cam.py
index 9526c23d..b76caa68 100644
--- a/challenge/onsite_competition/sdk/cam.py
+++ b/challenge/onsite_competition/sdk/cam.py
@@ -99,7 +99,8 @@ def get_observation(self, timeout_ms: int = 1000) -> Dict:
         if not color or not depth:
             raise RuntimeError("can not align color/depth frame")
 
-        rgb = np.asanyarray(color.get_data())  # HxWx3, uint8 (BGR)
+        bgr = np.asanyarray(color.get_data())  # HxWx3, uint8 (BGR)
+        rgb = bgr[..., ::-1]  # HxWx3, uint8 (convert to RGB)
         depth_raw = np.asanyarray(depth.get_data())  # HxW, uint16
         if depth_raw.shape != rgb.shape[:2]:
             # Extreme fallback (theoretically should be consistent after alignment).
diff --git a/challenge/onsite_competition/sdk/control.py b/challenge/onsite_competition/sdk/control.py
index 5040946d..062a13c4 100644
--- a/challenge/onsite_competition/sdk/control.py
+++ b/challenge/onsite_competition/sdk/control.py
@@ -35,6 +35,10 @@ def odom_callback(self, msg):
             self.start_yaw = yaw
             rospy.loginfo(f"start yaw: {math.degrees(self.start_yaw):.2f}")
 
+        # --- position (new)
+        p = msg.pose.pose.position
+        self.current_xy = (p.x, p.y)
+
     def execute_turn(self):
         if self.start_yaw is None:
             rospy.loginfo("wait for init yaw state...")
@@ -87,7 +91,7 @@ class DiscreteRobotController(Turn90Degrees):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)  # initialize parent class
 
-    def stand_still(self, duration=0.5):
+    def stand_still(self, duration=0.2):
         twist = Twist()
         twist.linear.x = 0.0
         twist.angular.z = 0.0
@@ -95,9 +99,9 @@ def stand_still(self, duration=0.5):
         rospy.sleep(duration)  # Maintain stand still for a short duration
         rospy.loginfo("Stand still command executed.")
 
-    def move_forward(self, distance=0.25):
+    def move_forward(self, distance=0.5, speed=0.2):
         twist = Twist()
-        twist.linear.x = 0.2  # Forward speed
+        twist.linear.x = speed  # Forward speed
         twist.angular.z = 0.0
         duration = distance / twist.linear.x  # Time to move forward the specified distance
 
@@ -108,34 +112,103 @@ def move_forward(self, distance=0.25):
             self.cmd_vel_pub.publish(twist)
             self.rate.sleep()
 
-        self.stand_still()  # Stop after moving forward
+        # self.stand_still()  # Stop after moving forward
         rospy.loginfo("Move forward command executed.")
 
-    def turn_left(self, angle=15, speed=0.2):
-        self.turn_angle = math.radians(angle)  # update angle
-        self.angular_speed = speed  # Set positive angular speed for left turn
-        self.start_yaw = None  # Reset start yaw to current position
-        self.turning = False  # Reset turning flag
-        self.run()
-        self.stand_still()  # Stop after moving forward
-        rospy.loginfo("Turn left command executed.")
+    # ---- NEW: feedback-controlled forward motion
+
+
+def move_feedback(self, distance=0.25, speed=0.5, tol=0.02, timeout=None):
+    """
+    里程计闭环直线移动：支持正/负 distance。
+    - distance: 目标路程（米）。>0 前进，<0 倒车。
+    - speed: 名义线速度（m/s），可给正/负，最终会按正数取绝对值。
+    - tol: 终止距离容差（米）
+    - timeout: 超时（秒）；默认 max(3*|distance|/speed, 3.0)
+    """
+    # 等待位姿
+    while self.current_xy is None and not rospy.is_shutdown():
+        rospy.loginfo_throttle(2.0, "Waiting for /odom...")
+        self.rate.sleep()
+    if rospy.is_shutdown():
+        return
+
+    # 方向与目标
+    direction = 1.0 if distance >= 0.0 else -1.0
+    target = abs(distance)
+    speed = abs(speed) if speed is not None else 0.5
+
+    start_xy = self.current_xy
+    start_time = rospy.Time.now()
+
+    if timeout is None:
+        timeout = max(3.0, 3.0 * (target / max(speed, 0.05)))
 
-    def turn_right(self, angle=15, speed=-0.2):
+    twist = Twist()
+
+    # 简单 P 控制，越靠近越慢
+    Kp = 1.5
+    min_speed = 0.06  # 防止轮子停转
+
+    rospy.loginfo(
+        f"move_linear: target={target:.3f} m, dir={'forward' if direction>0 else 'backward'}, "
+        f"speed≈{speed:.2f} m/s, tol={tol:.3f} m"
+    )
+
+    try:
+        while not rospy.is_shutdown():
+            # 超时
+            if (rospy.Time.now() - start_time).to_sec() > timeout:
+                rospy.logwarn("move_linear timeout reached; stopping.")
+                break
+
+            # 走过的距离（欧式距离，不区分前后，目标用 abs）
+            cx, cy = self.current_xy
+            sx, sy = start_xy
+            traveled = math.hypot(cx - sx, cy - sy)
+            remaining = target - traveled
+
+            # 达标退出
+            if remaining <= tol:
+                rospy.loginfo(f"Reached distance: traveled={traveled:.3f} m (tol {tol} m)")
+                break
+
+            # 速度控制（带方向）
+            v = Kp * remaining
+            v = max(min(v, speed), min_speed)  # [min_speed, speed]
+            twist.linear.x = direction * v  # 关键：按方向加符号
+            twist.angular.z = 0.0
+            self.cmd_vel_pub.publish(twist)
+
+            rospy.loginfo_throttle(
+                1.0, f"traveled={traveled:.3f} m, remaining={remaining:.3f} m, v={twist.linear.x:.2f} m/s"
+            )
+            self.rate.sleep()
+
+    finally:
+        # 停车
+        twist.linear.x = 0.0
+        twist.angular.z = 0.0
+        self.cmd_vel_pub.publish(twist)
+
+    rospy.loginfo("Move linear command executed.")
+
+    def turn(self, angle=15, speed=0.5):
         self.turn_angle = math.radians(angle)  # update angle
         self.angular_speed = speed  # Set positive angular speed for left turn
         self.start_yaw = None  # Reset start yaw to current position
         self.turning = False  # Reset turning flag
         self.run()
-        self.stand_still()  # Stop after moving forward
-        rospy.loginfo("Turn right command executed.")
+        rospy.loginfo("Turn left command executed.")
 
 
 if __name__ == '__main__':
     try:
         control = DiscreteRobotController()
-        control.turn_left(10)
-        control.move_forward(0.1)
-        control.turn_right(10)
+        control.turn(15, 0.5)  # left
+        control.turn(15, -0.5)  # right
+        control.move_feedback(0.25, 0.5)
+        control.move_feedback(-0.25, 0.5)
 
     except rospy.ROSInterruptException:
         pass
diff --git a/challenge/onsite_competition/sdk/main.py b/challenge/onsite_competition/sdk/main.py
index 705329cd..f93b908b 100644
--- a/challenge/onsite_competition/sdk/main.py
+++ b/challenge/onsite_competition/sdk/main.py
@@ -3,10 +3,9 @@
 import sys
 
 from real_world_env import RealWorldEnv
-from stream import app, start_env
+from stream import run, set_instruction
 
-from internnav.agent.utils.client import AgentClient
-from internnav.configs.evaluator.default_config import get_config
+from internnav.utils.comm_utils.client import AgentClient
 
 
 def parse_args():
@@ -14,15 +13,16 @@ def parse_args():
     parser.add_argument(
         "--config",
         type=str,
-        default='scripts/eval/configs/h1_cma_cfg.py',
+        default='challenge/onsite_competition/configs/h1_internvla_n1_cfg.py',
         help='eval config file path, e.g. scripts/eval/configs/h1_cma_cfg.py',
     )
     parser.add_argument(
         "--instruction",
         type=str,
+        default='Go straight and pass the sofa and turn right into the hallway. Keep walking down, pass the kitchen and the bathroom, then enter the study room at the far end on the right with a desk, stop next to the white shelf.',
         help='current instruction to follow',
     )
-    parser.add_argument("--tag", type=str, help="tag for the run, saved by the tag name which is team-task-trail")
+    parser.add_argument("--uninteractive_mode", action='store_true', help="whether to confirm each step")
     return parser.parse_args()
 
 
@@ -34,46 +34,87 @@ def load_eval_cfg(config_path, attr_name='eval_cfg'):
     return getattr(config_module, attr_name)
 
 
-# TODO add logging for each step, saved by the tag name which is team-task-trail
+def confirm(msg: str) -> bool:
+    """
+    Ask user to confirm. Return True if user types 'y' (case-insensitive),
+    False for anything else (including empty input).
+    """
+    try:
+        answer = input(f"{msg} [y/N]: ").strip().lower()
+    except (EOFError, KeyboardInterrupt):
+        print("\nCancelled.")
+        return False
+    return answer in ("", "y")
+
+
+def get_instruction() -> int:
+    try:
+        import json
+
+        instruction_lst = json.load(open("challenge/onsite_competition/instructions.json"))
+        print("Available instructions:")
+        for i, item in enumerate(instruction_lst):
+            print(f"{i}: {item['instruction_title']}")
+        answer = input("input instruction id: ").strip().lower()
+        answer = int(answer)
+    except (EOFError, KeyboardInterrupt):
+        print("\nCancelled.")
+        sys.exit()
+    return instruction_lst[answer]['instruction'][0]
+
+
+def action_to_word(action: int) -> str:
+    action = max(0, min(3, action))
+    wl = ["stand still", "move forward", "turn left", "turn right"]
+    return wl[action]
+
+
 def main():
     args = parse_args()
     print("--- Loading config from:", args.config, "---")
-    evaluator_cfg = load_eval_cfg(args.config, attr_name='eval_cfg')
-    cfg = get_config(evaluator_cfg)
-    print(cfg)
+    cfg = load_eval_cfg(args.config, attr_name='eval_cfg')
+    agent_cfg = cfg.agent
 
-    # initialize user agent
-    agent = AgentClient(cfg.agent)
+    # initialize user's agent
+    agent = AgentClient(agent_cfg)
 
     # initialize real world env
-    env = RealWorldEnv()
+    env = RealWorldEnv(fps=30, duration=0.1, distance=0.3, angle=15, move_speed=0.5, turn_speed=0.5)
+    env.reverse()  # reverse move direction if using a rear camera
+    env.step(0)
+    obs = env.get_observation()
 
     # start stream
-    start_env(env)
-    app.run(host="0.0.0.0", port=8080, threaded=True)
+    print("--- start running steam app ---")
+    run(env=env)
+
+    while True:
+        instruction = get_instruction()
+        print("\nNew instruction:", instruction)
+        set_instruction(instruction)
 
-    try:
         while True:
             # print("get observation...")
             # obs contains {rgb, depth, instruction}
             obs = env.get_observation()
-            obs["instruction"] = args.instruction
+            # print(obs)
+            obs["instruction"] = instruction
 
-            # print("agent step...")
+            print("agent step...")
             # action is a integer in [0, 3], agent return [{'action': [int], 'ideal_flag': bool}] (same to internvla_n1 agent)
-            try:
-                action = agent.step(obs)[0]['action'][0]
-                print(f"agent step success, action is {action}")
-            except Exception as e:
-                print(f"agent step error {e}")
-                continue
-
-            # print("env step...")
-            try:
+            action = agent.step([obs])[0]['action'][0]
+            print("agent step success, action:", action)
+
+            if args.uninteractive_mode or confirm(f"Execute this action [{action_to_word(action)}]?"):
+                print("env step...")
                 env.step(action)
                 print("env step success")
-            except Exception as e:
-                print(f"env step error {e}")
-                continue
-    finally:
-        env.close()
+            else:
+                print("Stop requested. Exiting loop.")
+                print("agent reset...")
+                agent.reset()
+                break
+
+
+if __name__ == "__main__":
+    main()
diff --git a/challenge/onsite_competition/sdk/real_world_env.py b/challenge/onsite_competition/sdk/real_world_env.py
index e004bd64..ef176a58 100644
--- a/challenge/onsite_competition/sdk/real_world_env.py
+++ b/challenge/onsite_competition/sdk/real_world_env.py
@@ -4,11 +4,20 @@
 from cam import AlignedRealSense
 from control import DiscreteRobotController
 
-from internnav.env import Env
+# from internnav.env import Env
 
 
-class RealWorldEnv(Env):
-    def __init__(self, fps: int = 30):
+class RealWorldEnv:
+    def __init__(
+        self,
+        fps: int = 30,
+        duration: float = 0.5,
+        distance: float = 0.25,
+        angle: int = 15,
+        turn_speed: float = 0.5,
+        move_speed: float = 0.3,
+    ):
+
         self.node = DiscreteRobotController()
         self.cam = AlignedRealSense()
         self.latest_obs = None
@@ -22,6 +31,16 @@ def __init__(self, fps: int = 30):
         self.thread = threading.Thread(target=self._capture_loop, daemon=True)
         self.thread.start()
 
+        # control setting
+        self.duration = duration
+        self.distance = distance
+        self.angle = angle
+        self.turn_speed = turn_speed  # rad/s
+        self.move_speed = move_speed  # m/s
+
+    def reverse(self):
+        self.distance = -self.distance
+
     def _capture_loop(self):
         """keep capturing frames"""
         interval = 1.0 / self.fps
@@ -52,13 +71,13 @@ def step(self, action: int):
             3: turn right
         """
         if action == 0:
-            self.node.stand_still()
+            self.node.stand_still(self.duration)
         elif action == 1:
-            self.node.move_forward()
+            self.node.move_feedback(self.distance, self.move_speed)
         elif action == 2:
-            self.node.turn_left()
+            self.node.turn(self.angle, self.turn_speed)
         elif action == 3:
-            self.node.turn_right()
+            self.node.turn(self.angle, -self.turn_speed)
 
     def close(self):
         self.stop_flag.set()
diff --git a/challenge/onsite_competition/sdk/stream.py b/challenge/onsite_competition/sdk/stream.py
index b045cf12..cfbad940 100644
--- a/challenge/onsite_competition/sdk/stream.py
+++ b/challenge/onsite_competition/sdk/stream.py
@@ -1,13 +1,16 @@
 # stream_server.py
+import threading
 import time
+from typing import Optional
 
 import cv2
-from flask import Flask, Response
+import numpy as np
+from flask import Flask, Response, make_response, stream_with_context
 
 app = Flask(__name__)
 
-# 由主程序注入
 _env = None
+_stop = threading.Event()  # graceful stop for Ctrl+C
 
 
 def set_env(env):
@@ -16,23 +19,173 @@ def set_env(env):
     _env = env
 
 
-def _mjpeg_generator(jpeg_quality: int = 80):
+_instruction = ""
+_instruction_lock = threading.Lock()
+
+
+def set_instruction(text: str) -> None:
+    """
+    Set the instruction text that will be displayed on the viewer page.
+    Thread-safe; can be called from your main/control loop at any time.
+    """
+    global _instruction
+    with _instruction_lock:
+        _instruction = str(text) if text is not None else ""
+
+
+def get_instruction() -> str:
+    """
+    Get the current instruction text (thread-safe).
+    """
+    with _instruction_lock:
+        return _instruction
+
+
+def _encode_jpeg(frame_bgr: np.ndarray, quality: int = 80) -> Optional[bytes]:
+    ok, jpg = cv2.imencode(".jpg", frame_bgr, [cv2.IMWRITE_JPEG_QUALITY, quality])
+    return jpg.tobytes() if ok else None
+
+
+def _mjpeg_generator(jpeg_quality: int = 80, fps_limit: float = 30.0):
     boundary = b"--frame"
-    while True:
+    min_interval = 1.0 / fps_limit if fps_limit > 0 else 0.0
+
+    last = 0.0
+    while not _stop.is_set():
         if _env is None:
-            time.sleep(0.1)
+            time.sleep(0.05)
             continue
+
         obs = _env.get_observation()
-        if obs is None:
+        if not obs or "rgb" not in obs:
             time.sleep(0.01)
             continue
-        frame_bgr = obs["rgb"]
-        ok, jpg = cv2.imencode(".jpg", frame_bgr, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality])
-        if not ok:
+
+        frame = obs["rgb"]  # Expect BGR for OpenCV; convert if your source is RGB
+        if frame is None:
+            time.sleep(0.01)
+            continue
+
+        # If your env returns RGB, uncomment the next line:
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+        jpg = _encode_jpeg(frame, quality=jpeg_quality)
+        if jpg is None:
             continue
-        yield (boundary + b"\r\n" b"Content-Type: image/jpeg\r\n\r\n" + jpg.tobytes() + b"\r\n")
+
+        # multipart chunk
+        yield (
+            boundary + b"\r\n"
+            b"Content-Type: image/jpeg\r\n"
+            b"Cache-Control: no-cache\r\n"
+            b"Pragma: no-cache\r\n\r\n" + jpg + b"\r\n"
+        )
+
+        # simple pacing (prevents 100% CPU + helps Ctrl+C responsiveness)
+        now = time.time()
+        sleep_t = min_interval - (now - last)
+        if sleep_t > 0:
+            time.sleep(sleep_t)
+        last = now
 
 
 @app.route("/stream")
 def stream():
-    return Response(_mjpeg_generator(), mimetype="multipart/x-mixed-replace; boundary=frame")
+    # stream_with_context ensures Flask doesn’t buffer the generator
+    gen = stream_with_context(_mjpeg_generator())
+    resp = Response(gen, mimetype="multipart/x-mixed-replace; boundary=frame", direct_passthrough=True)
+    # extra headers to avoid caching
+    resp.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+    resp.headers["Pragma"] = "no-cache"
+    resp.headers["Connection"] = "keep-alive"
+    return resp
+
+
+@app.route("/instruction")
+def instruction():
+    text = get_instruction()
+    r = make_response(text)
+    r.headers["Content-Type"] = "text/plain; charset=utf-8"
+    r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+    r.headers["Pragma"] = "no-cache"
+    return r
+
+
+@app.route("/")
+def index():
+    # simple viewer page with instruction overlay
+    html = """
+    <!doctype html>
+    <title>MJPEG Stream</title>
+    <meta charset="utf-8" />
+    <style>
+      :root { color-scheme: dark; }
+      body {
+        margin:0; background:#111; height:100vh; overflow:hidden;
+        display:grid; place-items:center; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Apple Color Emoji, Segoe UI Emoji;
+      }
+      .wrap { position:relative; width:min(100vw, 100vh*16/9); height:min(100vh, 100vw*9/16); }
+      .wrap img { width:100%; height:100%; object-fit:contain; display:block; background:#000; }
+      #instr {
+        position:absolute; left:1rem; right:1rem; bottom:1rem;
+        background:rgba(0,0,0,0.55); backdrop-filter: blur(4px);
+        border:1px solid rgba(255,255,255,0.15);
+        border-radius:12px; padding:0.75rem 1rem;
+        color:#eaeaea; line-height:1.4; white-space:pre-wrap; word-break:break-word;
+        font-size:clamp(14px, 1.8vw, 18px);
+      }
+      #instr .label { opacity:0.7; font-size:0.9em; margin-right:0.5rem; }
+    </style>
+    <div class="wrap">
+      <img src="/stream" alt="stream"/>
+      <div id="instr"><span class="label">Instruction:</span><span id="instr-text">(none)</span></div>
+    </div>
+    <script>
+      async function refreshInstruction() {
+        try {
+          const res = await fetch("/instruction", { cache: "no-store" });
+          if (!res.ok) return;
+          const text = await res.text();
+          // Use textContent to avoid injecting HTML
+          document.getElementById("instr-text").textContent = text || "(none)";
+        } catch (e) {
+          // ignore fetch errors (server might be restarting)
+        }
+      }
+      // Initial fetch and periodic refresh
+      refreshInstruction();
+      setInterval(refreshInstruction, 500); // update 2x/sec; adjust if you want slower
+      // Also refresh when page becomes visible again
+      document.addEventListener("visibilitychange", () => {
+        if (document.visibilityState === "visible") refreshInstruction();
+      });
+    </script>
+    """
+    r = make_response(html)
+    r.headers["Cache-Control"] = "no-cache"
+    return r
+
+
+def _handle_sigint(sig, frame):
+    _stop.set()
+
+
+# start the HTTP server in a background thread (non-blocking)
+def run_server(env, host, port):
+    # make env accessible to the server
+    set_env(env)
+    # IMPORTANT: disable reloader so it doesn't spawn another process
+    app.run(host="0.0.0.0", port=8080, threaded=True, use_reloader=False)
+
+
+def run(host="0.0.0.0", port=8080, env=None):
+    server_thread = threading.Thread(target=run_server, args=(env, host, port), daemon=True)
+    server_thread.start()
+    time.sleep(0.3)  # tiny delay to let the server bind the port
+    print("--- stream app is running on http://0.0.0.0:8080 ---")
+
+
+if __name__ == "__main__":
+    # Example: plug your env here before run()
+    # set_env(MyEnv())
+    run()
diff --git a/challenge/onsite_competition/sdk/test_robot.py b/challenge/onsite_competition/sdk/test_robot.py
index 9069ebee..67de86ef 100644
--- a/challenge/onsite_competition/sdk/test_robot.py
+++ b/challenge/onsite_competition/sdk/test_robot.py
@@ -1,5 +1,5 @@
 from real_world_env import RealWorldEnv
-from save_obs import save_obs
+from stream import run, set_env
 
 env = RealWorldEnv()
 
@@ -9,5 +9,13 @@
 env.step(0)  # 0: no movement (stand still)
 
 obs = env.get_observation()  # {rgb: array, depth: array, instruction: str}
-meta = save_obs(obs, outdir="./captures", prefix="test")
-print("Saved observation metadata:", meta)
+obs["instruction"] = "red"
+print(obs["instruction"])
+# meta = save_obs(obs, outdir="./captures1", prefix="test")
+# print("Saved observation metadata:", meta)
+
+# test stream
+
+set_env(env)
+print("--- start running steam app ---")
+run()
diff --git a/internnav/evaluator/base.py b/internnav/evaluator/base.py
index 1e899d06..f397ae04 100644
--- a/internnav/evaluator/base.py
+++ b/internnav/evaluator/base.py
@@ -1,6 +1,6 @@
-from internnav.agent.utils.client import AgentClient
 from internnav.configs.evaluator import EvalCfg
 from internnav.env import Env
+from internnav.utils.comm_utils.client import AgentClient
 
 
 class Evaluator:
diff --git a/internnav/evaluator/utils/eval.py b/internnav/evaluator/utils/eval.py
index b5308320..c370461b 100644
--- a/internnav/evaluator/utils/eval.py
+++ b/internnav/evaluator/utils/eval.py
@@ -1,18 +1,16 @@
-import base64
-import pickle
+from internutopia.core.config.robot import ControllerCfg
+from internutopia_extension.configs.robots.h1 import H1RobotCfg
+from internutopia_extension.configs.sensors import RepCameraCfg
 
-from internnav.evaluator.utils.common import load_scene_usd, load_kujiale_scene_usd
-from internnav.projects.dataloader.resumable import ResumablePathKeyDataloader
-from internnav.projects.internutopia_vln_extension.configs.tasks.vln_eval_task import (
-    VLNEvalTaskCfg,
-)
 from internnav.configs.evaluator import EvalCfg
-from internutopia_extension.configs.robots.h1 import H1RobotCfg
-from internutopia.core.config.robot import ControllerCfg
+from internnav.evaluator.utils.common import load_kujiale_scene_usd, load_scene_usd
+from internnav.projects.dataloader.resumable import ResumablePathKeyDataloader
 from internnav.projects.internutopia_vln_extension.configs.metrics.vln_pe_metrics import (
     VLNPEMetricCfg,
 )
-from internutopia_extension.configs.sensors import RepCameraCfg
+from internnav.projects.internutopia_vln_extension.configs.tasks.vln_eval_task import (
+    VLNEvalTaskCfg,
+)
 
 
 def generate_episode(dataloader: ResumablePathKeyDataloader, config: EvalCfg):
@@ -21,7 +19,7 @@ def generate_episode(dataloader: ResumablePathKeyDataloader, config: EvalCfg):
     eval_path_key_list = dataloader.resumed_path_key_list
     path_key_data = dataloader.path_key_data
     episodes = []
-    
+
     robot = H1RobotCfg(
         **config.task.robot.robot_settings,
         controllers=[ControllerCfg(**cfg.controller_settings) for cfg in config.task.robot.controllers],
@@ -75,9 +73,3 @@ def generate_episode(dataloader: ResumablePathKeyDataloader, config: EvalCfg):
             )
         )
     return episodes
-
-
-def serialize_obs(obs):
-    serialized = pickle.dumps(obs)
-    encoded = base64.b64encode(serialized).decode('utf-8')
-    return encoded
diff --git a/internnav/agent/utils/client.py b/internnav/utils/comm_utils/client.py
similarity index 90%
rename from internnav/agent/utils/client.py
rename to internnav/utils/comm_utils/client.py
index aa00f6ef..032c6b6e 100644
--- a/internnav/agent/utils/client.py
+++ b/internnav/utils/comm_utils/client.py
@@ -1,9 +1,16 @@
+import base64
+import pickle
 from typing import Any, Dict, List, Optional
 
 import requests
 
 from internnav.configs.agent import AgentCfg, InitRequest, ResetRequest, StepRequest
-from internnav.evaluator.utils.eval import serialize_obs
+
+
+def serialize_obs(obs):
+    serialized = pickle.dumps(obs)
+    encoded = base64.b64encode(serialized).decode('utf-8')
+    return encoded
 
 
 class AgentClient:
diff --git a/internnav/agent/utils/server.py b/internnav/utils/comm_utils/server.py
similarity index 98%
rename from internnav/agent/utils/server.py
rename to internnav/utils/comm_utils/server.py
index 0f39dda5..a2090768 100644
--- a/internnav/agent/utils/server.py
+++ b/internnav/utils/comm_utils/server.py
@@ -31,6 +31,7 @@ def _register_routes(self):
             ('/init', self.init_agent, ['POST'], status.HTTP_201_CREATED),
             ('/{agent_name}/step', self.step_agent, ['POST'], None),
             ('/{agent_name}/reset', self.reset_agent, ['POST'], None),
+            # TODO: Add stop server route
         ]
 
         for path, handler, methods, status_code in route_config:
diff --git a/scripts/eval/configs/h1_internvla_n1_cfg.py b/scripts/eval/configs/h1_internvla_n1_cfg.py
index e918bb04..d6a3b0ab 100644
--- a/scripts/eval/configs/h1_internvla_n1_cfg.py
+++ b/scripts/eval/configs/h1_internvla_n1_cfg.py
@@ -10,35 +10,35 @@
 
 eval_cfg = EvalCfg(
     agent=AgentCfg(
-        server_port=8023,
+        server_port=8087,
         model_name='internvla_n1',
         ckpt_path='',
         model_settings={
-            'env_num': 1, 'sim_num': 1,
-            'model_path': "checkpoints/InternVLA-N1-Preview",
-            'camera_intrinsic': [
-                [585.0, 0.0, 320.0], [0.0, 585.0, 240.0], [0.0, 0.0, 1.0]
-            ],
-            
-            'width': 640, 'height': 480, 'hfov': 79,
-            'resize_w': 384, 'resize_h': 384,
+            'env_num': 1,
+            'sim_num': 1,
+            'model_path': "checkpoints/InternVLA-N1",
+            'camera_intrinsic': [[585.0, 0.0, 320.0], [0.0, 585.0, 240.0], [0.0, 0.0, 1.0]],
+            'width': 640,
+            'height': 480,
+            'hfov': 79,
+            'resize_w': 384,
+            'resize_h': 384,
             'max_new_tokens': 1024,
             'num_frames': 32,
             'num_history': 8,
             'num_future_steps': 4,
-            
             'device': 'cuda:0',
             'predict_step_nums': 32,
-            'continuous_traj': True, 
+            'continuous_traj': True,
             # debug
-            'vis_debug': True, # If vis_debug=True, you can get visualization results
-            'vis_debug_path': './logs/test/vis_debug' 
+            'vis_debug': True,  # If vis_debug=True, you can get visualization results
+            'vis_debug_path': './logs/test/vis_debug',
         },
     ),
     env=EnvCfg(
         env_type='vln_multi',
         env_settings={
-            'use_fabric': False, # Please set use_fabric=False due to the render delay;
+            'use_fabric': False,  # Please set use_fabric=False due to the render delay;
             'headless': True,
         },
     ),
@@ -46,7 +46,7 @@
         task_name='test',
         task_settings={
             'env_num': 1,
-            'use_distributed': False, # If the others setting in task_settings, please set use_distributed = False.
+            'use_distributed': False,  # If the others setting in task_settings, please set use_distributed = False.
             'proc_num': 1,
         },
         scene=SceneCfg(
@@ -54,18 +54,18 @@
             scene_data_dir='data/scene_data/mp3d_pe',
         ),
         robot_name='h1',
-        robot_flash=True, # If robot_flash is True, the mode is flash (set world_pose directly); else you choose physical mode.
+        robot_flash=True,  # If robot_flash is True, the mode is flash (set world_pose directly); else you choose physical mode.
         robot_usd_path='data/Embodiments/vln-pe/h1/h1_internvla.usd',
-        camera_resolution=[640, 480], # (W,H)
+        camera_resolution=[640, 480],  # (W,H)
         camera_prim_path='torso_link/h1_1_25_down_30',
-        one_step_stand_still = True, #For dual-system, please keep this param True.
+        one_step_stand_still=True,  # For dual-system, please keep this param True.
     ),
     dataset=EvalDatasetCfg(
         dataset_type="mp3d",
         dataset_settings={
             'base_data_dir': 'data/vln_pe/raw_data/r2r',
             'split_data_types': ['val_unseen'],  # 'val_seen'
-            'filter_stairs': False,      
+            'filter_stairs': False,
             # 'selected_scans': ['zsNo4HB9uLZ'],
             # 'selected_scans': ['8194nk5LbLH', 'pLe4wQe7qrG'],
         },
diff --git a/tests/function_test/test_server.py b/tests/function_test/test_server.py
index f279cfd4..885bf366 100644
--- a/tests/function_test/test_server.py
+++ b/tests/function_test/test_server.py
@@ -9,7 +9,7 @@
 def start_server():
     server_cmd = [
         sys.executable,
-        "internnav/agent/utils/server.py",
+        "internnav/utils/comm_utils/server.py",
     ]
 
     proc = subprocess.Popen(