diff --git a/README.md b/README.md index 5aa1c37a..40919191 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,9 @@ The toolbox supports the most comprehensive 6 datasets \& benchmarks and 10+ pop The toolbox supports the most advanced high-quality navigation dataset, InternData-N1, which includes 3k+ scenes and 830k VLN data covering diverse embodiments and scenes, and the first dual-system navigation foundation model with leading performance on all the benchmarks and zero-shot generalization capability in the real world, InternVLA-N1. ## 🔥 -- [2025/10] Add a simple [inference-only demo](scripts/eval/inference_only_demo.ipynb) of InternVLA-N1. +- [2025/10] Add a simple [inference-only demo](scripts/notebooks/inference_only_demo.ipynb) of InternVLA-N1. - [2025/10] InternVLA-N1 [technique report](https://internrobotics.github.io/internvla-n1.github.io/static/pdfs/InternVLA_N1.pdf) is released. Please check our [homepage](https://internrobotics.github.io/internvla-n1.github.io/). -- [2025/09] Real-world deployment code of InternVLA-N1 is released.- [2025/09] Real-world deployment code of InternVLA-N1 is released. +- [2025/09] Real-world deployment code of InternVLA-N1 is released. - [2025/07] We are hosting 🏆IROS 2025 Grand Challenge, stay tuned at [official website](https://internrobotics.shlab.org.cn/challenge/2025/). - [2025/07] InternNav v0.1.1 released. diff --git a/challenge/onsite_competition/captures/rs_depth_mm.png b/challenge/onsite_competition/captures/rs_depth_mm.png deleted file mode 100644 index 7c46c358..00000000 Binary files a/challenge/onsite_competition/captures/rs_depth_mm.png and /dev/null differ diff --git a/challenge/onsite_competition/captures/rs_depth_vis.png b/challenge/onsite_competition/captures/rs_depth_vis.png deleted file mode 100644 index 85a629b8..00000000 Binary files a/challenge/onsite_competition/captures/rs_depth_vis.png and /dev/null differ diff --git a/internnav/habitat_extensions/evaluator.py b/internnav/projects/habitat_extensions/evaluator.py similarity index 100% rename from internnav/habitat_extensions/evaluator.py rename to internnav/projects/habitat_extensions/evaluator.py diff --git a/internnav/habitat_extensions/evaluator_single.py b/internnav/projects/habitat_extensions/evaluator_single.py similarity index 75% rename from internnav/habitat_extensions/evaluator_single.py rename to internnav/projects/habitat_extensions/evaluator_single.py index 26f01620..da2a1574 100644 --- a/internnav/habitat_extensions/evaluator_single.py +++ b/internnav/projects/habitat_extensions/evaluator_single.py @@ -1,55 +1,45 @@ import argparse -import os import copy -import json -from typing import Any -from collections import OrderedDict -from PIL import Image, ImageDraw, ImageFont -import numpy as np -import tqdm -import quaternion import itertools +import os import random import re -import cv2 import sys -import time +from collections import OrderedDict +from typing import Any +import cv2 +import habitat +import numpy as np +import quaternion import torch -from torch import Tensor -from transformers.image_utils import to_numpy_array from depth_camera_filtering import filter_depth - - -from omegaconf import OmegaConf -import habitat -from habitat import logger, Env +from habitat import Env from habitat.config.default import get_agent_config -from habitat_baselines.config.default import get_config as get_habitat_config -from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower from habitat.config.default_structured_configs import ( 
CollisionsMeasurementConfig, FogOfWarConfig, TopDownMapMeasurementConfig, ) +from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower from habitat.utils.visualizations.utils import images_to_video, observations_to_image -from transformers import ( - AutoTokenizer, - AutoProcessor, - Qwen2_5_VLForConditionalGeneration, -) - - +from habitat_baselines.config.default import get_config as get_habitat_config +from omegaconf import OmegaConf +from PIL import Image +from transformers import AutoProcessor +from transformers.image_utils import to_numpy_array PROJECT_ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(PROJECT_ROOT_PATH) print(f"PROJECT_ROOT_PATH {PROJECT_ROOT_PATH}") from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM -from internnav.model.utils.vln_utils import rho_theta, image_resize, traj_to_actions, chunk_token, split_and_clean, open_image -from internnav.habitat_extensions import measures +from internnav.model.utils.vln_utils import ( + chunk_token, + split_and_clean, + traj_to_actions, +) from internnav.utils.dist import * - DEFAULT_IMAGE_TOKEN = "" @@ -119,52 +109,53 @@ def __init__( camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov) self._camera_fov = camera_fov_rad self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2)) - + self.model = model self.processor = processor - - - prompt = f"You are an autonomous navigation assistant. Your task is to . Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task." + prompt = f"You are an autonomous navigation assistant. Your task is to . Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task." 
answer = "" self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}] - + self.conjunctions = [ - 'you can see ', - 'in front of you is ', - 'there is ', - 'you can spot ', - 'you are toward the ', - 'ahead of you is ', - 'in your sight is ' - ] - - self.actions2idx = OrderedDict({ - 'STOP': [0], - "↑": [1], - "←": [2], - "→": [3], - "↓": [5], - }) - + 'you can see ', + 'in front of you is ', + 'there is ', + 'you can spot ', + 'you are toward the ', + 'ahead of you is ', + 'in your sight is ', + ] + + self.actions2idx = OrderedDict( + { + 'STOP': [0], + "↑": [1], + "←": [2], + "→": [3], + "↓": [5], + } + ) + self.num_frames = args.num_frames self.num_future_steps = args.num_future_steps self.num_history = args.num_history - - - def preprocess_depth_image_v2(self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None): + + def preprocess_depth_image_v2( + self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None + ): if target_height is None: target_height = self.image_processor.crop_size['height'] # 384 - target_width = self.image_processor.crop_size['width'] # 384 - + target_width = self.image_processor.crop_size['width'] # 384 + resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST) - + img = to_numpy_array(resized_depth_image) if do_depth_scale: img = img / depth_scale - + return img, (target_width, target_height) - + def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray: width = sensor_cfg.width height = sensor_cfg.height @@ -174,19 +165,16 @@ def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray: cx = (width - 1.0) / 2.0 cy = (height - 1.0) / 2.0 - intrinsic_matrix = np.array([ - [fx, 0.0, cx, 0.0], - [ 0.0, fy, cy, 0.0], - [ 0.0, 0.0, 1.0, 0.0], - [ 0.0, 0.0, 0.0, 1.0] - ]) + intrinsic_matrix = np.array( + [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]] + ) return intrinsic_matrix - + def preprocess_instrinsic(self, intrinsic, ori_size, target_size): # (V, 4, 4) (resize_shape) (h, w) intrinsic = copy.deepcopy(intrinsic) if len(intrinsic.shape) == 2: intrinsic = intrinsic[None, :, :] # (1, 4, 4) or (B, 4, 4) - + intrinsic[:, 0] /= ori_size[0] / target_size[0] # width intrinsic[:, 1] /= ori_size[1] / target_size[1] # height @@ -197,11 +185,11 @@ def preprocess_instrinsic(self, intrinsic, ori_size, target_size): # (V, 4, 4) intrinsic = intrinsic.squeeze(0) return intrinsic - + def get_axis_align_matrix(self): ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]]) return ma - + def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray: x, y, z = xyz transformation_matrix = np.array( @@ -225,7 +213,7 @@ def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray: ] ) return transformation_matrix - + def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray: """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix. 
@@ -243,7 +231,7 @@ def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) transformation_matrix[:3, :3] = rot1 @ rot2 transformation_matrix[:3, 3] = xyz return transformation_matrix - + def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic): ''' Args: @@ -269,15 +257,15 @@ def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic): x = point_episodic[0] y = point_episodic[1] - return (x, y) # same as habitat gps - + return (x, y) # same as habitat gps + def config_env(self) -> Env: env = Env(config=self.config) # env.episodes = env.episodes[0:1] return env - + def run_single_eval(self): - + self.model.eval() self.env = self.config_env() self.scene_episode_dict = {} @@ -285,31 +273,41 @@ def run_single_eval(self): if episode.scene_id not in self.scene_episode_dict: self.scene_episode_dict[episode.scene_id] = [] self.scene_episode_dict[episode.scene_id].append(episode) - intrinsic_matrix = self.get_intrinsic_matrix(self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor) + intrinsic_matrix = self.get_intrinsic_matrix( + self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor + ) sucs, spls, oss, nes = [], [], [], [] done_res = [] - if True: # fixme + if True: # fixme scenes_keys = list(sorted(self.scene_episode_dict.keys())) self.infer_success = False self.infer_data_ready = False - print('---------------current infer scence:', scenes_keys[self.infer_scene_id]) + print('---------------current infer scene:', scenes_keys[self.infer_scene_id]) selected_scenes = ['17DRP5sb8fy', 'r1Q1Z4BcV1o', 'dhjEzFoUFzH'] - key_name = 'data/scene_datasets/mp3d/' + selected_scenes[self.infer_scene_id] + '/' + selected_scenes[self.infer_scene_id] + '.glb' + key_name = ( + 'data/scene_datasets/mp3d/' + + selected_scenes[self.infer_scene_id] + + '/' + + selected_scenes[self.infer_scene_id] + + '.glb' + ) episodes = self.scene_episode_dict[key_name] step_size = len(episodes) // 6 # episode_id = 0 - + episode = episodes[self.infer_episode_id * step_size] - episode_instruction = self.infer_instruction# episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category + episode_instruction = ( + self.infer_instruction + ) # episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category print("episode start", episode_instruction) episode_id = int(episode.episode_id) env = self.env env.current_episode = episode observations = env.reset() - + agent_state = env.sim.get_agent_state() rotation = agent_state.rotation translation = agent_state.position @@ -317,16 +315,14 @@ def run_single_eval(self): transformation_matrix = np.eye(4) transformation_matrix[:3, :3] = rotation_matrix transformation_matrix[:3, 3] = translation - - agent = ShortestPathFollower( - env.sim, 0.25, False - ) - + + agent = ShortestPathFollower(env.sim, 0.25, False) + os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True) - + vis_frames = [] step_id = 0 - + if self.save_video: os.makedirs(self.output_path, exist_ok=True) initial_height = env.sim.get_agent_state().position[1] @@ -336,17 +332,17 @@ def run_single_eval(self): action_seq = [] past_key_values = None output_ids = None - + goal = None action = None look_down_observations = None look_down_id_list = [] messages = [] - last_action=None + last_action = None local_actions = [] - + # begin evaluation main loop - while not env.episode_over and step_id <=70: + while not env.episode_over and step_id <= 70: rgb 
= observations["rgb"] depth = observations["depth"] x, y = observations["gps"] @@ -356,23 +352,32 @@ def run_single_eval(self): depth = depth * 1000 agent_state = env.sim.get_agent_state() - height = agent_state.position[1] - initial_height # Habitat GPS makes west negative, so flip y + height = agent_state.position[1] - initial_height # Habitat GPS makes west negative, so flip y camera_position = np.array([x, -y, self._camera_height + height]) robot_xy = camera_position[:2] # tf_camera_to_episodic = self.xyz_yaw_to_tf_matrix(camera_position, camera_yaw) @ self.get_axis_align_matrix() - tf_camera_to_episodic = self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30)) @ self.get_axis_align_matrix() - - image = Image.fromarray(rgb).convert('RGB') # raw observation image - image_size = image.size #640*480 + tf_camera_to_episodic = ( + self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30)) + @ self.get_axis_align_matrix() + ) + + image = Image.fromarray(rgb).convert('RGB') # raw observation image + image_size = image.size # 640*480 save_raw_image = image.copy() - + if action == 5: - look_down_image = image #Image.fromarray(look_down_observations['rgb']).convert('RGB') + look_down_image = image # Image.fromarray(look_down_observations['rgb']).convert('RGB') save_raw_image = look_down_image.copy() # rgb_list.append(look_down_image) - look_down_depth, resize_shape = self.preprocess_depth_image_v2(Image.fromarray(depth.astype(np.uint16), mode='I;16'), do_depth_scale=True, depth_scale=1000, target_height=224, target_width=224) - look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() # [H, W] + look_down_depth, resize_shape = self.preprocess_depth_image_v2( + Image.fromarray(depth.astype(np.uint16), mode='I;16'), + do_depth_scale=True, + depth_scale=1000, + target_height=224, + target_width=224, + ) + look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() # [H, W] ### depth clip to 5m look_down_depth[look_down_depth > 5.0] = 5.0 else: @@ -387,131 +392,131 @@ def run_single_eval(self): depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) depth = depth * (self._max_depth - self._min_depth) + self._min_depth depth = depth * 1000 - look_down_depth, resize_shape = self.preprocess_depth_image_v2(Image.fromarray(depth.astype(np.uint16), mode='I;16'), do_depth_scale=True, depth_scale=1000, target_height=224, target_width=224) - look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() # [H, W] + look_down_depth, resize_shape = self.preprocess_depth_image_v2( + Image.fromarray(depth.astype(np.uint16), mode='I;16'), + do_depth_scale=True, + depth_scale=1000, + target_height=224, + target_width=224, + ) + look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() # [H, W] ### depth clip to 5m look_down_depth[look_down_depth > 5.0] = 5.0 env.step(4) env.step(4) - info = env.get_metrics() current_frame_infer_pixel = False if len(action_seq) == 0 and goal is None: # only run model inference again after every action in the previously predicted action_seq has been executed - if action != 5: + if action != 5: sources = copy.deepcopy(self.conversation) if 'objectnav' in self.config_path: - sources[0]["value"] = sources[0]["value"].replace('.', random.choice(self.objectnav_instructions).format(target_object=episode.object_category.replace('_', ' '))) - else: - sources[0]["value"] = sources[0]["value"].replace('.', episode_instruction[:-1]) - cur_images = rgb_list[-1:] # current observation + sources[0]["value"] =
sources[0]["value"].replace( + '.', + random.choice(self.objectnav_instructions).format( + target_object=episode.object_category.replace('_', ' ') + ), + ) + else: + sources[0]["value"] = sources[0]["value"].replace( + '.', episode_instruction[:-1] + ) + cur_images = rgb_list[-1:] # current observation if step_id == 0: history_id = [] else: - history_id = np.unique(np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)).tolist() + history_id = np.unique( + np.linspace(0, step_id - 1, self.num_history, dtype=np.int32) + ).tolist() placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id) sources[0]["value"] += f' These are your historical observations: {placeholder}.' - + history_id = sorted(history_id) print('history_idddddddd', step_id, history_id) input_images = [rgb_list[i] for i in history_id] + cur_images input_img_id = 0 - else: - assert action == 5 # last action is look down + else: + assert action == 5 # last action is look down sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}] input_images += [look_down_image] - messages.append({ - 'role': 'assistant', - 'content': [ - { - 'type': 'text', - 'text': llm_outputs - } - ] - }) + messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}) input_img_id = -1 - + prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN sources[0]["value"] += f" {prompt}." print('sources', step_id, sources) prompt_instruction = copy.deepcopy(sources[0]["value"]) parts = split_and_clean(prompt_instruction) - + content = [] - for i in range (len(parts)): + for i in range(len(parts)): if parts[i] == "": content.append({"type": "image", "image": input_images[input_img_id]}) - input_img_id +=1 + input_img_id += 1 else: - content.append({"type": "text", "text": parts[i]}) - - messages.append({ - 'role': 'user', - 'content': content - }) - + content.append({"type": "text", "text": parts[i]}) + + messages.append({'role': 'user', 'content': content}) + print('step_id', step_id, 'messages:', messages) - - text = self.processor.apply_chat_template( - messages,tokenize=False, add_generation_prompt=True - ) + + text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(self.model.device) with torch.no_grad(): output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False) - - llm_outputs = self.processor.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + llm_outputs = self.processor.tokenizer.decode( + output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True + ) print('step_id:', step_id, 'output text:', llm_outputs) - - if bool(re.search(r'\d', llm_outputs)): # output pixel goal + + if bool(re.search(r'\d', llm_outputs)): # output pixel goal current_frame_infer_pixel = True forward_action = 0 coord = [int(c) for c in re.findall(r'\d+', llm_outputs)] pixel_goal = [int(coord[1]), int(coord[0])] # switch the goal o - + goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic) print('before', goal, depth.shape) - goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3] - + goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3] + if not env.sim.pathfinder.is_navigable(np.array(goal)): goal = np.array(env.sim.pathfinder.snap_point(np.array(goal))) - + # look down --> horizontal env.step(4) env.step(4) - + # action = agent.get_next_action(goal) 
local_actions = [] pixel_values = inputs.pixel_values - image_grid_thw = torch.cat( - [thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0 - ) - + image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0) + with torch.no_grad(): traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw) - + image_dp = torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) pix_goal_image = copy.copy(image_dp) images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0) depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16) pix_goal_depth = copy.copy(depth_dp) depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0) - + with torch.no_grad(): - dp_actions = self.model.generate_traj(traj_latents) - + dp_actions = self.model.generate_traj(traj_latents) + random_choice = np.random.choice(dp_actions.shape[0]) if self.args.continuous_traj: action_list = traj_to_actions(dp_actions) if len(action_list) < 8: - action_list += [0] * (8-len(action_list)) + action_list += [0] * (8 - len(action_list)) else: action_list = chunk_token(dp_actions[random_choice]) print("first action_list", action_list) - local_actions = action_list + local_actions = action_list action = local_actions[0] if action == 0: goal = None @@ -524,10 +529,10 @@ def run_single_eval(self): continue print('predicted goal', pixel_goal, goal, flush=True) - else: + else: action_seq = self.parse_actions(llm_outputs) print('actions', action_seq, flush=True) - + if len(action_seq) != 0: action = action_seq[0] action_seq.pop(0) @@ -536,14 +541,14 @@ def run_single_eval(self): action = local_actions.pop(0) else: action = 0 - forward_action +=1 + forward_action += 1 print('forward_action', forward_action, flush=True) if forward_action > 8: - goal = None + goal = None output_ids = None messages = [] step_id += 1 - forward_action =0 + forward_action = 0 local_actions = [] continue if action == 0: @@ -551,28 +556,38 @@ def run_single_eval(self): output_ids = None messages = [] step_id += 1 - forward_action =0 + forward_action = 0 local_actions = [] continue else: action = 0 - + if info['top_down_map'] is not None: frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info) if current_frame_infer_pixel: - frame = cv2.putText(frame, f"{pixel_goal[1], pixel_goal[0]}", (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2) + frame = cv2.putText( + frame, + f"{pixel_goal[1], pixel_goal[0]}", + (50, 80), + cv2.FONT_HERSHEY_SIMPLEX, + 1, + (255, 0, 0), + 2, + ) frame = cv2.circle(frame, (pixel_goal[1], pixel_goal[0]), 5, (255, 0, 0), -1) else: output_str = str(action) - output_str = output_str.replace('1', "Go forward").replace('2', 'Turn left').replace('3', 'Turn right') + output_str = ( + output_str.replace('1', "Go forward").replace('2', 'Turn left').replace('3', 'Turn right') + ) output_str = output_str.replace('5', 'Look down').replace('0', 'Stop!') frame = cv2.putText(frame, output_str, (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2) vis_frames.append(frame) if action == 5: vis_frames.append(frame) - + print("step_id", step_id, "action", action) - + if action == 5: env.step(action) observations = env.step(action) @@ -580,19 +595,23 @@ def run_single_eval(self): observations = env.step(action) step_id += 1 messages = [] - + self.infer_success_cnt += 1 - + metrics = env.get_metrics() - if self.save_video : - images_to_video( - vis_frames, self.output_path, f"res_{self.infer_success_cnt}",fps=6, quality=9 - ) + if self.save_video: + 
images_to_video(vis_frames, self.output_path, f"res_{self.infer_success_cnt}", fps=6, quality=9) self.infer_success = True vis_frames.clear() env.close() - return torch.tensor(sucs).to(self.device), torch.tensor(spls).to(self.device), torch.tensor(oss).to(self.device), torch.tensor(nes).to(self.device), torch.tensor(len(sucs)).to(self.device) + return ( + torch.tensor(sucs).to(self.device), + torch.tensor(spls).to(self.device), + torch.tensor(oss).to(self.device), + torch.tensor(nes).to(self.device), + torch.tensor(len(sucs)).to(self.device), + ) def parse_actions(self, output): action_patterns = '|'.join(re.escape(action) for action in self.actions2idx) @@ -607,13 +626,14 @@ def preprocess_qwenvl(self, source): prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN if len(source[0]["value"]) != 0: source[0]["value"] += f" {prompt}." - else: - source[0]["value"] = f"{prompt}." # Please output the next waypoint\'s coordinates in the image." + else: + source[0]["value"] = f"{prompt}." # Please output the next waypoint\'s coordinates in the image." return source - + + def eval(): global local_rank - + parser = argparse.ArgumentParser() parser.add_argument("--local_rank", default=0, type=int, help="node rank") parser.add_argument("--model_path", type=str, default="") @@ -629,18 +649,14 @@ def eval(): parser.add_argument("--predict_step_nums", type=int, default=16) parser.add_argument("--continuous_traj", action="store_true", default=False) parser.add_argument("--max_new_tokens", type=int, default=1024) - - parser.add_argument('--world_size', default=1, type=int, - help='number of distributed processes') - parser.add_argument('--rank', default=0, type=int, - help='rank') - parser.add_argument('--gpu', default=0, type=int, - help='gpu') + + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--rank', default=0, type=int, help='rank') + parser.add_argument('--gpu', default=0, type=int, help='gpu') parser.add_argument('--port', default='2333') parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') - parser.add_argument('--device', default='cuda', - help='device to use for training / testing') - + parser.add_argument('--device', default='cuda', help='device to use for training / testing') + args = parser.parse_args() init_distributed_mode(args) local_rank = args.local_rank @@ -651,10 +667,9 @@ def eval(): device = torch.device(f"cuda:{local_rank}") model = InternVLAN1ForCausalLM.from_pretrained( - args.model_path, torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", device_map={"": device} + args.model_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map={"": device} ) - + # start the evaluation evaluate(model, processor, args) @@ -670,9 +685,9 @@ def evaluate(model, processor, args): model=model, processor=processor, epoch=0, - args=args + args=args, ) - sucs, spls, oss, nes, ep_num = evaluator.eval_action(idx=get_rank()) + sucs, spls, oss, nes, ep_num = evaluator.eval_action(idx=get_rank()) if __name__ == "__main__": diff --git a/internnav/habitat_extensions/fonts/arial.ttf b/internnav/projects/habitat_extensions/fonts/arial.ttf similarity index 100% rename from internnav/habitat_extensions/fonts/arial.ttf rename to internnav/projects/habitat_extensions/fonts/arial.ttf diff --git a/internnav/habitat_extensions/maps.py b/internnav/projects/habitat_extensions/maps.py similarity index 90% rename from 
internnav/habitat_extensions/maps.py rename to internnav/projects/habitat_extensions/maps.py index 2ba9112a..68f62d73 100644 --- a/internnav/habitat_extensions/maps.py +++ b/internnav/projects/habitat_extensions/maps.py @@ -28,9 +28,7 @@ MAP_SHORTEST_PATH_WAYPOINT = 14 TOP_DOWN_MAP_COLORS = np.full((256, 3), 150, dtype=np.uint8) -TOP_DOWN_MAP_COLORS[15:] = cv2.applyColorMap( - np.arange(241, dtype=np.uint8), cv2.COLORMAP_JET -).squeeze(1)[:, ::-1] +TOP_DOWN_MAP_COLORS[15:] = cv2.applyColorMap(np.arange(241, dtype=np.uint8), cv2.COLORMAP_JET).squeeze(1)[:, ::-1] TOP_DOWN_MAP_COLORS[MAP_INVALID_POINT] = [255, 255, 255] # White TOP_DOWN_MAP_COLORS[MAP_VALID_POINT] = [150, 150, 150] # Light Grey TOP_DOWN_MAP_COLORS[MAP_BORDER_INDICATOR] = [50, 50, 50] # Grey @@ -73,9 +71,7 @@ def colorize_top_down_map( # Only desaturate valid points as only valid points get revealed desat_mask = top_down_map != MAP_INVALID_POINT - _map[desat_mask] = ( - _map * fog_of_war_desat_values[fog_of_war_mask] - ).astype(np.uint8)[desat_mask] + _map[desat_mask] = (_map * fog_of_war_desat_values[fog_of_war_mask]).astype(np.uint8)[desat_mask] return _map @@ -205,9 +201,7 @@ def draw_reference_path( pt_from = pt_to for pt in shortest_path_points: - drawpoint( - img, (pt[1], pt[0]), MAP_SHORTEST_PATH_WAYPOINT, meters_per_px - ) + drawpoint(img, (pt[1], pt[0]), MAP_SHORTEST_PATH_WAYPOINT, meters_per_px) def draw_straight_shortest_path_points( @@ -219,10 +213,7 @@ def draw_straight_shortest_path_points( """Draws the shortest path from start to goal assuming a standard discrete action space. """ - shortest_path_points = [ - habitat_maps.to_grid(p[2], p[0], img.shape[0:2], sim)[::-1] - for p in shortest_path_points - ] + shortest_path_points = [habitat_maps.to_grid(p[2], p[0], img.shape[0:2], sim)[::-1] for p in shortest_path_points] habitat_maps.draw_path( img, @@ -232,9 +223,7 @@ def draw_straight_shortest_path_points( ) -def draw_source_and_target( - img: np.ndarray, sim: Simulator, episode: VLNEpisode, meters_per_px: float -) -> None: +def draw_source_and_target(img: np.ndarray, sim: Simulator, episode: VLNEpisode, meters_per_px: float) -> None: s_x, s_y = habitat_maps.to_grid( episode.start_position[2], episode.start_position[0], @@ -285,18 +274,14 @@ def get_nearest_node(graph: nx.Graph, current_position: List[float]) -> str: for node in graph: node_pos = graph.nodes[node]["position"] node_pos = np.take(node_pos, (0, 2)) - cur_dist = np.linalg.norm( - np.array(node_pos) - np.array(current_position), ord=2 - ) + cur_dist = np.linalg.norm(np.array(node_pos) - np.array(current_position), ord=2) if cur_dist < dist: dist = cur_dist nearest = node return nearest -def update_nearest_node( - graph: nx.Graph, nearest_node: str, current_position: np.ndarray -) -> str: +def update_nearest_node(graph: nx.Graph, nearest_node: str, current_position: np.ndarray) -> str: """Determine the closest MP3D node to the agent's current position as given by a [x,z] position vector. The selected node must be reachable from the previous MP3D node as specified in the nav-graph edges. 
@@ -309,9 +294,7 @@ def update_nearest_node( for node in [nearest_node] + [e[1] for e in graph.edges(nearest_node)]: node_pos = graph.nodes[node]["position"] node_pos = np.take(node_pos, (0, 2)) - cur_dist = np.linalg.norm( - np.array(node_pos) - np.array(current_position), ord=2 - ) + cur_dist = np.linalg.norm(np.array(node_pos) - np.array(current_position), ord=2) if cur_dist < dist: dist = cur_dist nearest = node @@ -325,18 +308,14 @@ def draw_mp3d_nodes( graph: nx.Graph, meters_per_px: float, ) -> None: - n = get_nearest_node( - graph, (episode.start_position[0], episode.start_position[2]) - ) + n = get_nearest_node(graph, (episode.start_position[0], episode.start_position[2])) starting_height = graph.nodes[n]["position"][1] for node in graph: pos = graph.nodes[node]["position"] # no obvious way to differentiate between floors. Use this for now. if abs(pos[1] - starting_height) < 1.0: - r_x, r_y = habitat_maps.to_grid( - pos[2], pos[0], img.shape[0:2], sim - ) + r_x, r_y = habitat_maps.to_grid(pos[2], pos[0], img.shape[0:2], sim) # only paint if over a valid point if img[r_x, r_y]: @@ -388,4 +367,4 @@ def image_resize( img = img.permute(0, 1, 3, 4, 2) if no_batch_dim: img = img.squeeze(dim=0) # Removes the batch dimension - return img \ No newline at end of file + return img diff --git a/internnav/habitat_extensions/measures.py b/internnav/projects/habitat_extensions/measures.py similarity index 94% rename from internnav/habitat_extensions/measures.py rename to internnav/projects/habitat_extensions/measures.py index 048d4473..751bbf00 100644 --- a/internnav/habitat_extensions/measures.py +++ b/internnav/projects/habitat_extensions/measures.py @@ -1,23 +1,15 @@ -import gzip -import json -import pickle from typing import Any, List, Union import numpy as np + # from dtw import dtw # from fastdtw import fastdtw # from habitat.config import Config -from habitat.core.dataset import Episode -from habitat.core.embodied_task import Action, EmbodiedTask, Measure -from habitat.core.logging import logger +from habitat.core.embodied_task import EmbodiedTask, Measure from habitat.core.registry import registry from habitat.core.simulator import Simulator from habitat.core.utils import try_cv2_import -from habitat.tasks.nav.nav import DistanceToGoal, Success -from habitat.tasks.utils import cartesian_to_polar -from habitat.utils.geometry_utils import quaternion_rotate_vector -from habitat.utils.visualizations import fog_of_war -from habitat.utils.visualizations import maps as habitat_maps +from habitat.tasks.nav.nav import DistanceToGoal from numpy import ndarray # from utils import maps @@ -26,9 +18,7 @@ cv2 = try_cv2_import() -def euclidean_distance( - pos_a: Union[List[float], ndarray], pos_b: Union[List[float], ndarray] -) -> float: +def euclidean_distance(pos_a: Union[List[float], ndarray], pos_b: Union[List[float], ndarray]) -> float: return np.linalg.norm(np.array(pos_b) - np.array(pos_a), ord=2) @@ -54,9 +44,7 @@ def reset_metric(self, *args: Any, **kwargs: Any): def update_metric(self, *args: Any, **kwargs: Any): current_position = self._sim.get_agent_state().position - self._metric += euclidean_distance( - current_position, self._previous_position - ) + self._metric += euclidean_distance(current_position, self._previous_position) self._previous_position = current_position @@ -73,16 +61,12 @@ def _get_uuid(self, *args: Any, **kwargs: Any) -> str: return self.cls_uuid def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): - task.measurements.check_measure_dependencies( - 
self.uuid, [DistanceToGoal.cls_uuid] - ) + task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid]) self._metric = float("inf") self.update_metric(task=task) def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): - distance_to_target = task.measurements.measures[ - DistanceToGoal.cls_uuid - ].get_metric() + distance_to_target = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric() self._metric = min(self._metric, distance_to_target) @@ -105,9 +89,7 @@ def _get_uuid(self, *args: Any, **kwargs: Any) -> str: return self.cls_uuid def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): - task.measurements.check_measure_dependencies( - self.uuid, [DistanceToGoal.cls_uuid] - ) + task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid]) self._metric = 0.0 self.update_metric(task=task) diff --git a/scripts/eval/navigation_ui.py b/scripts/demo/navigation_ui.py similarity index 77% rename from scripts/eval/navigation_ui.py rename to scripts/demo/navigation_ui.py index 73d522f2..2f13bfda 100644 --- a/scripts/eval/navigation_ui.py +++ b/scripts/demo/navigation_ui.py @@ -1,32 +1,19 @@ -import base64 import json -import logging import os -import subprocess -import sys -import threading import time import uuid +from collections import defaultdict from datetime import datetime, timedelta -from typing import Dict, List, Optional +from typing import Optional import gradio as gr -import numpy as np -import open3d as o3d -import plotly.graph_objects as go import requests -from fastapi import APIRouter, FastAPI, HTTPException, status, BackgroundTasks, Response -from pydantic import BaseModel - -import asyncio -import uvicorn -from collections import defaultdict -BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8001") # fastapi server +BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8001") # fastapi server API_ENDPOINTS = { "submit_task": f"{BACKEND_URL}/predict/video", "query_status": f"{BACKEND_URL}/predict/task", - "get_result": f"{BACKEND_URL}/predict" + "get_result": f"{BACKEND_URL}/predict", } @@ -34,17 +21,17 @@ "scene_1": { "description": "Modern Apartment", "name": "17DRP5sb8fy", - "glb_path": "scene_assets/scene1_no_ceiling.glb" # PLY file path + "glb_path": "scene_assets/scene1_no_ceiling.glb", # PLY file path }, "scene_2": { "description": "Office Building", "name": "r1Q1Z4BcV1o", - "glb_path": "scene_assets/scene2_no_ceiling.glb" + "glb_path": "scene_assets/scene2_no_ceiling.glb", }, "scene_3": { "description": "University Campus", "name": "dhjEzFoUFzH", - "glb_path": "scene_assets/scene3_no_ceiling.glb" + "glb_path": "scene_assets/scene3_no_ceiling.glb", }, } @@ -60,17 +47,18 @@ }, "episode_4": { "description": "4", - } + }, } -MODEL_CHOICES = [] +MODEL_CHOICES = [] ############################################################################### SESSION_TASKS = {} IP_REQUEST_RECORDS = defaultdict(list) -IP_LIMIT = 5 +IP_LIMIT = 5 + def is_request_allowed(ip: str) -> bool: now = datetime.now() @@ -80,6 +68,7 @@ def is_request_allowed(ip: str) -> bool: return True return False + ############################################################################### @@ -89,18 +78,20 @@ def is_request_allowed(ip: str) -> bool: ACCESS_LOG = os.path.join(LOG_DIR, "access.log") SUBMISSION_LOG = os.path.join(LOG_DIR, "submissions.log") + def log_access(user_ip: str = None, user_agent: str = None): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_entry = { "timestamp": timestamp, "type": 
"access", "user_ip": user_ip or "unknown", - "user_agent": user_agent or "unknown" + "user_agent": user_agent or "unknown", } - + with open(ACCESS_LOG, "a") as f: f.write(json.dumps(log_entry) + "\n") + def log_submission(scene: str, prompt: str, model: str, user: str = "anonymous", res: str = "unknown"): timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_entry = { @@ -110,16 +101,17 @@ def log_submission(scene: str, prompt: str, model: str, user: str = "anonymous", "scene": scene, "prompt": prompt, "model": model, - #"max_step": str(max_step), - "res": res + # "max_step": str(max_step), + "res": res, } - + with open(SUBMISSION_LOG, "a") as f: f.write(json.dumps(log_entry) + "\n") + def read_logs(log_type: str = "all", max_entries: int = 50) -> list: logs = [] - + if log_type in ["all", "access"]: try: with open(ACCESS_LOG, "r") as f: @@ -127,7 +119,7 @@ def read_logs(log_type: str = "all", max_entries: int = 50) -> list: logs.append(json.loads(line.strip())) except FileNotFoundError: pass - + if log_type in ["all", "submission"]: try: with open(SUBMISSION_LOG, "r") as f: @@ -135,36 +127,37 @@ def read_logs(log_type: str = "all", max_entries: int = 50) -> list: logs.append(json.loads(line.strip())) except FileNotFoundError: pass - - # Sorted by timestemp + + # Sorted by timestamp logs.sort(key=lambda x: x["timestamp"], reverse=True) return logs[:max_entries] + def format_logs_for_display(logs: list) -> str: if not logs: return "No log record" - + markdown = "### System Access Log\n\n" markdown += "| Time | Type | User/IP | Details |\n" markdown += "|------|------|---------|----------|\n" - + for log in logs: timestamp = log.get("timestamp", "unknown") log_type = "Access" if log.get("type") == "access" else "Submission" - + if log_type == "Access": user = log.get("user_ip", "unknown") details = f"User-Agent: {log.get('user_agent', 'unknown')}" else: user = log.get("user", "anonymous") result = log.get('res', 'unknown') - if result != "success": + if result != "success": if len(result) > 40: # Adjust this threshold as needed result = f"{result[:20]}...{result[-20:]}" details = f"Scene: {log.get('scene', 'unknown')}, Prompt: {log.get('prompt', '')}, Model: {log.get('model', 'unknown')}, result: {result}" - + markdown += f"| {timestamp} | {log_type} | {user} | {details} |\n" - + return markdown @@ -185,31 +178,22 @@ def submit_to_backend( "scene_index": scene_index, "episode_index": episode_index, } - - payload = { - "user": user, - "task": "robot_navigation", - "job_id": job_id, - "data": json.dumps(data) - } - + + payload = {"user": user, "task": "robot_navigation", "job_id": job_id, "data": json.dumps(data)} + try: headers = {"Content-Type": "application/json"} - response = requests.post( - API_ENDPOINTS["submit_task"], - json=payload, - headers=headers, - timeout=600 - ) + response = requests.post(API_ENDPOINTS["submit_task"], json=payload, headers=headers, timeout=600) return response.json() except Exception as e: return {"status": "error", "message": str(e)} + def get_task_status(task_id: str) -> dict: try: response = requests.get(f"{API_ENDPOINTS['query_status']}/{task_id}", timeout=600) try: - return response.json() + return response.json() except json.JSONDecodeError: return {"status": "error", "message": response.text} except Exception as e: @@ -218,22 +202,14 @@ def get_task_status(task_id: str) -> dict: def get_task_result(task_id: str) -> Optional[dict]: try: - response = requests.get( - f"{API_ENDPOINTS['get_result']}/{task_id}", - timeout=5 - ) + response = 
requests.get(f"{API_ENDPOINTS['get_result']}/{task_id}", timeout=5) return response.json() except Exception as e: print(f"Error fetching result: {e}") return None -def run_simulation( - scene: str, - prompt: str, - episode: str, - history: list, - request: gr.Request -) -> dict: + +def run_simulation(scene: str, prompt: str, episode: str, history: list, request: gr.Request) -> dict: model = "InternNav-VLA" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -245,14 +221,14 @@ def run_simulation( if not is_request_allowed(user_ip): log_submission(scene, prompt, model, user_ip, "IP blocked temporarily") raise gr.Error("Too many requests from this IP. Please wait and try again one minute later.") - + submission_result = submit_to_backend(scene, prompt, episode) print("submission_result: ", submission_result) if submission_result.get("status") != "pending": log_submission(scene, prompt, model, user_ip, "Submission failed") raise gr.Error(f"Submission failed: {submission_result.get('message', 'unknown issue')}") - + try: task_id = submission_result["task_id"] SESSION_TASKS[session_id] = task_id @@ -276,24 +252,19 @@ def run_simulation( time.sleep(1) if status.get("status") == "completed": import base64 + video_bytes = base64.b64decode(status.get("video")) receive_time = time.time() with open(f"received_video_{receive_time}.mp4", "wb") as f: f.write(video_bytes) video_path = f"received_video_{receive_time}.mp4" - new_entry = { - "timestamp": timestamp, - "scene": scene, - "model": model, - "prompt": prompt, - "video_path": video_path - } - + new_entry = {"timestamp": timestamp, "scene": scene, "model": model, "prompt": prompt, "video_path": video_path} + updated_history = history + [new_entry] - + if len(updated_history) > 10: updated_history = updated_history[:10] - + print("updated_history:", updated_history) log_submission(scene, prompt, model, user_ip, "success") gr.Info("Simulation completed successfully!") @@ -316,32 +287,42 @@ def run_simulation( log_submission(scene, prompt, model, user_ip, "missing task's status from backend") yield None, history + ################################################################################################################### def update_history_display(history: list) -> list: print("update_history_display") updates = [] - + for i in range(10): if i < len(history): entry = history[i] - updates.extend([ - gr.update(visible=True), - gr.update(visible=True, label=f"Simulation {i+1} scene: {entry['scene']}, prompt: {entry['prompt']}", open=False), - gr.update(value=entry['video_path'], visible=True), - gr.update(value=f"{entry['timestamp']}") - ]) + updates.extend( + [ + gr.update(visible=True), + gr.update( + visible=True, + label=f"Simulation {i+1} scene: {entry['scene']}, prompt: {entry['prompt']}", + open=False, + ), + gr.update(value=entry['video_path'], visible=True), + gr.update(value=f"{entry['timestamp']}"), + ] + ) print(f'update video') print(entry['video_path']) else: - updates.extend([ - gr.update(visible=False), - gr.update(visible=False), - gr.update(value=None, visible=False), - gr.update(value="") - ]) + updates.extend( + [ + gr.update(visible=False), + gr.update(visible=False), + gr.update(value=None, visible=False), + gr.update(value=""), + ] + ) print("update_history_display end!!") return updates + def update_scene_display(scene: str): print(f"update_scene_display {scene}") config = SCENE_CONFIGS.get(scene, {}) @@ -353,6 +334,7 @@ def update_scene_display(scene: str): return None, glb_path + def update_episode_display(scene: 
str, episode: str): print(f"update_episode_display {scene} {episode}") config = SCENE_CONFIGS.get(scene, {}) @@ -360,14 +342,18 @@ def update_episode_display(scene: str, episode: str): episode_id = int(episode[-1]) image_path = os.path.join("scene_assets", f"{scene_name}_{episode_id-1}.jpg") print(f"image_path {image_path}") - # vaild if file path exists + # valid if file path exists if not os.path.exists(image_path): return None return image_path + + def update_log_display(): logs = read_logs() return format_logs_for_display(logs) + + ############################################################################## @@ -382,7 +368,6 @@ def cleanup_session(request: gr.Request): print(f"Task Termination Failed: {task_id}: {e}") - ############################################################################### custom_css = """ @@ -419,11 +404,13 @@ def cleanup_session(request: gr.Request): """ with gr.Blocks(title="Robot Navigation Inference", css=custom_css) as demo: - gr.Markdown(""" + gr.Markdown( + """ # 🧭 Habitat Robot Navigation Demo ### Simulation Test Based on Habitat Framework - """) - + """ + ) + history_state = gr.State([]) with gr.Row(): @@ -435,7 +422,7 @@ def cleanup_session(request: gr.Request): choices=list(SCENE_CONFIGS.keys()), value="scene_1", interactive=True, - ) + ) episode_dropdown = gr.Dropdown( label="Select Start Position", choices=list(EPISODE_CONFIGS.keys()), @@ -444,41 +431,33 @@ def cleanup_session(request: gr.Request): ) with gr.Row(): - scene_preview = gr.Model3D(elem_classes=["scene-preview"], - camera_position=(90.0, 120, 20000.0), - #display_mode="solid" + scene_preview = gr.Model3D( + elem_classes=["scene-preview"], + camera_position=(90.0, 120, 20000.0), + # display_mode="solid" ) fps_preview = gr.Image(label="FPS Preview") - + scene_description = gr.Markdown("### Scene preview") - + prompt_input = gr.Textbox( label="Navigation Instruction", value="Exit the bedroom and turn left. Walk straight passing the gray couch and stop near the rug.", placeholder="e.g.: 'Exit the bedroom and turn left. 
Walk straight passing the gray couch and stop near the rug.'", lines=2, - max_lines=4 + max_lines=4, ) - + scene_dropdown.change( - update_scene_display, - inputs=scene_dropdown, - outputs=[scene_description, scene_preview] - ).then( - update_episode_display, - inputs=[scene_dropdown, episode_dropdown], - outputs=[fps_preview] - ) - + update_scene_display, inputs=scene_dropdown, outputs=[scene_description, scene_preview] + ).then(update_episode_display, inputs=[scene_dropdown, episode_dropdown], outputs=[fps_preview]) + episode_dropdown.change( - update_episode_display, - inputs=[scene_dropdown, episode_dropdown], - outputs=[fps_preview] + update_episode_display, inputs=[scene_dropdown, episode_dropdown], outputs=[fps_preview] ) - + submit_btn = gr.Button("Start Navigation Simulation", variant="primary") - with gr.Column(elem_id="result-panel"): gr.Markdown("### Latest Simulation Result") @@ -490,86 +469,72 @@ def cleanup_session(request: gr.Request): autoplay=True, # streaming=True ) - + with gr.Column() as history_container: gr.Markdown("### History") gr.Markdown("#### History will be reset after refresh") - + history_slots = [] for i in range(10): with gr.Column(visible=False) as slot: with gr.Accordion(visible=False, open=False) as accordion: - video = gr.Video(interactive=False) - detail_md = gr.Markdown() - history_slots.append((slot, accordion, video, detail_md)) - + video = gr.Video(interactive=False) + detail_md = gr.Markdown() + history_slots.append((slot, accordion, video, detail_md)) + with gr.Accordion("View System Log (DEV ONLY)", open=False): logs_display = gr.Markdown() refresh_logs_btn = gr.Button("Refresh Log", variant="secondary") - - refresh_logs_btn.click( - update_log_display, - outputs=logs_display - ) + + refresh_logs_btn.click(update_log_display, outputs=logs_display) gr.Examples( examples=[ - ["scene_1", "Exit the bedroom and turn left. Walk straight passing the gray couch and stop near the rug.", "episode_0"], + [ + "scene_1", + "Exit the bedroom and turn left. 
Walk straight passing the gray couch and stop near the rug.", + "episode_0", + ], ["scene_2", "Go from reception to conference room passing the water cooler.", "episode_1"], ["scene_3", "From the classroom, go to the library via the main hall.", "episode_2"], - ["scene_4", "From emergency room to pharmacy passing nurse station.", "episode_3"] + ["scene_4", "From emergency room to pharmacy passing nurse station.", "episode_3"], ], inputs=[scene_dropdown, prompt_input, episode_dropdown], - label="Navigation Task Example" + label="Navigation Task Example", ) - + submit_btn.click( fn=run_simulation, inputs=[scene_dropdown, prompt_input, episode_dropdown, history_state], outputs=[video_output, history_state], queue=True, - api_name="run_simulation" + api_name="run_simulation", ).then( fn=update_history_display, inputs=history_state, outputs=[comp for slot in history_slots for comp in slot], - queue=True + queue=True, ).then( fn=update_log_display, outputs=logs_display, ) - - demo.load( - fn=lambda: update_scene_display("scene_1"), - outputs=[scene_description, scene_preview] - ).then( - fn=update_log_display, - outputs=logs_display - ) - demo.load( - fn=lambda: update_episode_display("scene_1", "episode_1"), - outputs=[fps_preview] + demo.load(fn=lambda: update_scene_display("scene_1"), outputs=[scene_description, scene_preview]).then( + fn=update_log_display, outputs=logs_display ) + demo.load(fn=lambda: update_episode_display("scene_1", "episode_1"), outputs=[fps_preview]) def record_access(request: gr.Request): user_ip = request.client.host if request else "unknown" user_agent = request.headers.get("user-agent", "unknown") log_access(user_ip, user_agent) - return update_log_display() + return update_log_display() - demo.load( - fn=record_access, - inputs=None, - outputs=logs_display, - queue=False - ) + demo.load(fn=record_access, inputs=None, outputs=logs_display, queue=False) demo.queue(default_concurrency_limit=8) demo.unload(fn=cleanup_session) if __name__ == "__main__": - demo.launch(server_name="0.0.0.0", server_port=5750, debug=True, share = True, allowed_paths=["/mnt"]) - - + demo.launch(server_name="0.0.0.0", server_port=5750, debug=True, share=True, allowed_paths=["/mnt"]) diff --git a/scripts/eval/vln_gradio_backend.py b/scripts/demo/vln_gradio_backend.py similarity index 80% rename from scripts/eval/vln_gradio_backend.py rename to scripts/demo/vln_gradio_backend.py index 28bdb0b3..b6f89095 100644 --- a/scripts/eval/vln_gradio_backend.py +++ b/scripts/demo/vln_gradio_backend.py @@ -1,33 +1,24 @@ -from concurrent.futures import ThreadPoolExecutor import argparse -import asyncio import base64 -import threading -import subprocess -import multiprocessing -import numpy as np +import json import os -import torch -import uvicorn import sys -import time -import json import uuid -from typing import Dict, Optional -from fastapi import APIRouter, FastAPI, HTTPException, status, BackgroundTasks, Response -from pydantic import BaseModel + # from utils.log_util import logger -import logging from enum import Enum -from internnav.utils.dist import * -from internnav.habitat_extensions.evaluator_single import VLNEvaluator -from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM -from transformers import ( - AutoTokenizer, - AutoProcessor, - Qwen2_5_VLForConditionalGeneration, -) +from typing import Dict, Optional +import numpy as np +import torch +import uvicorn +from fastapi import APIRouter, BackgroundTasks, FastAPI, HTTPException, status +from pydantic import 
BaseModel +from transformers import AutoProcessor + +from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM +from internnav.projects.habitat_extensions.evaluator_single import VLNEvaluator +from internnav.utils.dist import * PROJECT_ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(PROJECT_ROOT_PATH) @@ -36,14 +27,16 @@ global instruction + class VideoRequest(BaseModel): """ Frontend post json object template """ + user: str task: str job_id: str - data: str + data: str """ data json template @@ -63,21 +56,22 @@ class VideoRequest(BaseModel): } """ + class TaskInfo: def __init__(self, task_id, status, result_path): self.task_id: str = task_id self.status: str = status self.result_path: str = result_path + class TaskStatus(str, Enum): - pending = 'pending', + pending = 'pending' completed = 'completed' failed = 'failed' terminated = 'terminated' - - -class BackendServer: + +class BackendServer: def __init__(self, host: str, port: int): self.host = host self.port = port @@ -93,7 +87,7 @@ def __init__(self, host: str, port: int): def _register_routes(self): route_config = [ ('/video', self.predict, ['POST'], None), - ('/task/{task_id}', self.get_task_status, ['GET'], Dict[str, Optional[str]]) + ('/task/{task_id}', self.get_task_status, ['GET'], Dict[str, Optional[str]]), ] for path, handler, methods, response_model in route_config: @@ -102,22 +96,15 @@ def _register_routes(self): endpoint=handler, methods=methods, response_model=response_model if 'GET' in methods else None, - status_code=status.HTTP_200_OK if 'GET' in methods else None + status_code=status.HTTP_200_OK if 'GET' in methods else None, ) - - async def predict( - self, - request: VideoRequest, - background_tasks: BackgroundTasks - ) -> Dict[str, str]: + + async def predict(self, request: VideoRequest, background_tasks: BackgroundTasks) -> Dict[str, str]: # Safety: allow tasks pending to MAX_TASK_LIMIT # TODO: may need to improve if sum(task.status == "pending" for task in self.tasks.values()) >= self.MAX_TASK_LIMIT: print(f"Failed to START Task: reach to limit") - raise HTTPException( - status_code=429, - detail=f"Failed to start new task: reach to limit" - ) + raise HTTPException(status_code=429, detail=f"Failed to start new task: reach to limit") task_id = str(uuid.uuid4()) path = os.path.join(output_path, task_id) @@ -127,11 +114,11 @@ async def predict( background_tasks.add_task(self._submit_task, task_id, request.data, path) print(f"Start Task: {task_id} for user: {request.user}, task: {request.task}") - + return {"task_id": task_id, "status": "pending"} def _submit_task(self, task_id: str, data: str, path: str): - + print(f"process task: ID={task_id}") print(f"receive data: {data}...") # print only the first 100 characters try: @@ -140,17 +127,18 @@ def _submit_task(self, task_id: str, data: str, path: str): print("=======VLN Eval Task=======") cache_dir = f"/tmp/InternNav/.triton" os.makedirs(cache_dir, exist_ok=True) - os.chmod(cache_dir, 0o777) + os.chmod(cache_dir, 0o777) evaluator.infer_scene_id = int(data_dict["scene_index"]) - 1 evaluator.infer_episode_id = int(data_dict["episode_index"]) - 1 - evaluator.infer_instruction = (data_dict["instruction"]) + evaluator.infer_instruction = data_dict["instruction"] evaluator.output_path = path evaluator.infer_data_ready = True evaluator.run_single_eval() - + except Exception as e: import traceback + print(traceback.format_exc()) self.tasks[task_id].status = "failed" print(f"Task {task_id} failed: {e}") @@ -168,14
+156,8 @@ async def get_task_status(self, task_id: str) -> Dict[str, Optional[str]]: return {"status": 'completed', "result": task.result_path, "video": video_data} - def run(self): - uvicorn.run( - '__main__:server.app', - host=self.host, - port=self.port - ) - + uvicorn.run('__main__:server.app', host=self.host, port=self.port) if __name__ == "__main__": @@ -196,17 +178,13 @@ def run(self): parser.add_argument("--predict_step_nums", type=int, default=32) parser.add_argument("--continuous_traj", action="store_true", default=True) parser.add_argument("--max_new_tokens", type=int, default=1024) - - parser.add_argument('--world_size', default=1, type=int, - help='number of distributed processes') - parser.add_argument('--rank', default=0, type=int, - help='rank') - parser.add_argument('--gpu', default=0, type=int, - help='gpu') + + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--rank', default=0, type=int, help='rank') + parser.add_argument('--gpu', default=0, type=int, help='gpu') parser.add_argument('--port', default='2443') parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') - parser.add_argument('--device', default='cuda', - help='device to use for training / testing') + parser.add_argument('--device', default='cuda', help='device to use for training / testing') args = parser.parse_args() init_distributed_mode(args) local_rank = args.local_rank @@ -217,8 +195,7 @@ def run(self): device = torch.device(f"cuda:{local_rank}") model = InternVLAN1ForCausalLM.from_pretrained( - args.model_path, torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", device_map={"": device} + args.model_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map={"": device} ) model.eval() world_size = get_world_size() @@ -230,11 +207,9 @@ def run(self): model=model, processor=processor, epoch=0, - args=args + args=args, ) # evaluator.eval_action(0) server = BackendServer(host="0.0.0.0", port=8001) server.run() - - diff --git a/scripts/eval/eval_dual_system.sh b/scripts/eval/bash/eval_dual_system.sh similarity index 93% rename from scripts/eval/eval_dual_system.sh rename to scripts/eval/bash/eval_dual_system.sh index 2bc91a13..ef4be1eb 100755 --- a/scripts/eval/eval_dual_system.sh +++ b/scripts/eval/bash/eval_dual_system.sh @@ -1,5 +1,5 @@ export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet -export NCCL_SOCKET_IFNAME=bond0 +export NCCL_SOCKET_IFNAME=bond0 export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5 MID_RUN_NAME="InternVLA-N1" @@ -16,4 +16,4 @@ srun -p efm_t \ --model_path checkpoints/${MID_RUN_NAME} \ --predict_step_nums 32 \ --continuous_traj \ - --output_path results/$MID_RUN_NAME/val_unseen_32traj_8steps \ \ No newline at end of file + --output_path results/$MID_RUN_NAME/val_unseen_32traj_8steps \ diff --git a/scripts/eval/eval_system2.sh b/scripts/eval/bash/eval_system2.sh similarity index 85% rename from scripts/eval/eval_system2.sh rename to scripts/eval/bash/eval_system2.sh index 2c4dbd80..bedfbb3b 100755 --- a/scripts/eval/eval_system2.sh +++ b/scripts/eval/bash/eval_system2.sh @@ -1,5 +1,5 @@ export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet -export NCCL_SOCKET_IFNAME=bond0 +export NCCL_SOCKET_IFNAME=bond0 export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5 @@ -16,4 +16,4 @@ srun -p efm_t \ python scripts/eval/eval_habitat.py \ --model_path /path/to/${MID_RUN_NAME} \ --mode system2 \ - --output_path results/$MID_RUN_NAME/val_unseen \ \ No newline at end 
diff --git a/scripts/eval/start_aliyun_dlc.sh b/scripts/eval/bash/start_aliyun_dlc.sh
similarity index 100%
rename from scripts/eval/start_aliyun_dlc.sh
rename to scripts/eval/bash/start_aliyun_dlc.sh
diff --git a/scripts/eval/start_eval.sh b/scripts/eval/bash/start_eval.sh
similarity index 99%
rename from scripts/eval/start_eval.sh
rename to scripts/eval/bash/start_eval.sh
index 478c3bf9..1f472a8a 100755
--- a/scripts/eval/start_eval.sh
+++ b/scripts/eval/bash/start_eval.sh
@@ -112,4 +112,4 @@ while true; do
             fi
         fi
     fi
-done
\ No newline at end of file
+done
diff --git a/scripts/eval/start_eval_iros.sh b/scripts/eval/bash/start_eval_iros.sh
similarity index 99%
rename from scripts/eval/start_eval_iros.sh
rename to scripts/eval/bash/start_eval_iros.sh
index 0615c105..12d819dd 100644
--- a/scripts/eval/start_eval_iros.sh
+++ b/scripts/eval/bash/start_eval_iros.sh
@@ -105,4 +105,4 @@ if [ -n "$processes" ]; then
         kill -9 $pid
         echo "kill: $pid"
     done
-fi
\ No newline at end of file
+fi
diff --git a/scripts/eval/eval_habitat.py b/scripts/eval/eval_habitat.py
index b125b873..46d54370 100644
--- a/scripts/eval/eval_habitat.py
+++ b/scripts/eval/eval_habitat.py
@@ -1,21 +1,15 @@
 import argparse
-import os
 import json
+import os
+import numpy as np
 import torch
-from transformers import (
-    AutoTokenizer,
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
-)
-
-import habitat
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-import numpy as np
 from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
+from internnav.projects.habitat_extensions.evaluator import VLNEvaluator
 from internnav.utils.dist import *
-from internnav.habitat_extensions.evaluator import VLNEvaluator
-from internnav.habitat_extensions import measures
+

 def parse_args():
@@ -25,7 +19,7 @@ def parse_args():
     parser.add_argument("--model_path", type=str, default="")
     parser.add_argument("--habitat_config_path", type=str, default='scripts/eval/configs/vln_r2r.yaml')
     parser.add_argument("--eval_split", type=str, default='val_unseen')
-    parser.add_argument("--output_path", type=str, default='./logs/habitat/test') #!
+    parser.add_argument("--output_path", type=str, default='./logs/habitat/test')  #!
parser.add_argument("--num_future_steps", type=int, default=4) parser.add_argument("--num_frames", type=int, default=32) parser.add_argument("--save_video", action="store_true", default=False) @@ -35,50 +29,50 @@ def parse_args(): parser.add_argument("--predict_step_nums", type=int, default=16) parser.add_argument("--continuous_traj", action="store_true", default=False) parser.add_argument("--max_new_tokens", type=int, default=1024) - - parser.add_argument('--world_size', default=1, type=int, - help='number of distributed processes') - parser.add_argument('--rank', default=0, type=int, - help='rank') - parser.add_argument('--gpu', default=0, type=int, - help='gpu') + + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--rank', default=0, type=int, help='rank') + parser.add_argument('--gpu', default=0, type=int, help='gpu') parser.add_argument('--port', default='2333') parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') - parser.add_argument('--device', default='cuda', - help='device to use for training / testing') - + parser.add_argument('--device', default='cuda', help='device to use for training / testing') + return parser.parse_args() def main(): args = parse_args() - + init_distributed_mode(args) local_rank = args.local_rank np.random.seed(local_rank) - - #* 1. Load model and tokenizer. Currently, we support two modes: dual_system and system2 in Habitat. + + # * 1. Load model and tokenizer. Currently, we support two modes: dual_system and system2 in Habitat. processor = AutoProcessor.from_pretrained(args.model_path) processor.tokenizer.padding_side = 'left' - + device = torch.device(f"cuda:{local_rank}") if args.mode == 'dual_system': model = InternVLAN1ForCausalLM.from_pretrained( - args.model_path, torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", device_map={"": device} + args.model_path, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + device_map={"": device}, ) elif args.mode == 'system2': model = Qwen2_5_VLForConditionalGeneration.from_pretrained( - args.model_path, torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", device_map={"": device} + args.model_path, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + device_map={"": device}, ) else: raise ValueError(f"Invalid mode: {args.mode}") model.eval() world_size = get_world_size() - - #* 2. initialize evaluator + + # * 2. initialize evaluator evaluator = VLNEvaluator( config_path=args.habitat_config_path, split=args.eval_split, @@ -87,11 +81,11 @@ def main(): model=model, processor=processor, epoch=0, - args=args + args=args, ) - - #* 3. do eval - sucs, spls, oss, nes, ep_num = evaluator.eval_action(idx=get_rank()) + + # * 3. 
+    sucs, spls, oss, nes, ep_num = evaluator.eval_action(idx=get_rank())

     ep_num_all = [torch.zeros_like(ep_num) for _ in range(world_size)]
     # import ipdb; ipdb.set_trace()
@@ -105,24 +99,24 @@ def main():
     dist.all_gather(spls_all, spls)
     dist.all_gather(oss_all, oss)
     dist.all_gather(nes_all, nes)
-
+
     sucs_all = torch.cat(sucs_all, dim=0)
     spls_all = torch.cat(spls_all, dim=0)
     oss_all = torch.cat(oss_all, dim=0)
     nes_all = torch.cat(nes_all, dim=0)
     result_all = {
-        "sucs_all": (sum(sucs_all)/len(sucs_all)).item(),
-        "spls_all": (sum(spls_all)/len(spls_all)).item(),
-        "oss_all": (sum(oss_all)/len(oss_all)).item(),
-        "nes_all": (sum(nes_all)/len(nes_all)).item(),
-        'length': len(sucs_all)
-    }
+        "sucs_all": (sum(sucs_all) / len(sucs_all)).item(),
+        "spls_all": (sum(spls_all) / len(spls_all)).item(),
+        "oss_all": (sum(oss_all) / len(oss_all)).item(),
+        "nes_all": (sum(nes_all) / len(nes_all)).item(),
+        'length': len(sucs_all),
+    }
     print(result_all)
     if get_rank() == 0:
         with open(os.path.join(args.output_path, f'result.json'), 'a') as f:
             f.write(json.dumps(result_all))
-
-
+
+
 if __name__ == '__main__':
     main()
diff --git a/scripts/eval/start_server.py b/scripts/eval/start_server.py
new file mode 100644
index 00000000..eb001127
--- /dev/null
+++ b/scripts/eval/start_server.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+import sys
+
+sys.path.append('.')
+
+import argparse
+import glob
+import importlib
+import importlib.util
+import os
+import sys
+
+from internnav.utils.comm_utils.server import AgentServer
+
+
+# import all agents to register them
+def auto_register_agents(agent_dir: str):
+    # Get all Python files in the agents directory
+    agent_modules = glob.glob(os.path.join(agent_dir, '*.py'))
+
+    # Import each module to trigger the registration
+    for module in agent_modules:
+        if not module.endswith('__init__.py'):  # Avoid importing __init__.py itself
+            module_name = os.path.basename(module)[:-3]  # Remove the .py extension
+            importlib.import_module(
+                f'internnav_baselines.agents.{module_name}'
+            )  # Replace 'agents' with your module's package
+
+
+def load_eval_cfg(config_path, attr_name='eval_cfg'):
+    spec = importlib.util.spec_from_file_location("eval_config_module", config_path)
+    config_module = importlib.util.module_from_spec(spec)
+    sys.modules["eval_config_module"] = config_module
+    spec.loader.exec_module(config_module)
+    return getattr(config_module, attr_name)
+
+
+if __name__ == '__main__':
+    print("Starting Agent Server...")
+
+    print("Registering agents...")
+    auto_register_agents('internnav_baselines/agents')
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--host', type=str, default='localhost')
+    parser.add_argument(
+        '--config',
+        type=str,
+        default=None,
+        help='eval config file path, e.g. scripts/eval/configs/h1_cma_cfg.py',
+    )
+    parser.add_argument('--port', type=int, default=8087)
+    parser.add_argument('--reload', action='store_true')
+    args = parser.parse_args()
+    if args.config:
+        eval_cfg = load_eval_cfg(args.config)
+        args.port = eval_cfg.agent.server_port
+    else:
+        print(f"Warning: No config file provided, using port {args.port}")
+
+    server = AgentServer(args.host, args.port)
+    server.run()
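The `--config` option of `start_server.py` points at a Python module exposing a module-level `eval_cfg` object whose `agent.server_port` attribute overrides `--port` (see `load_eval_cfg` and the `if args.config:` branch). Below is a minimal hypothetical sketch of such a config module under that assumption; the dataclass names are illustrative stand-ins, not the repo's real config types:

```python
# Hypothetical eval config module, e.g. a file like scripts/eval/configs/my_agent_cfg.py.
# start_server.py only requires a module-level `eval_cfg` with `agent.server_port`.
from dataclasses import dataclass, field


@dataclass
class AgentCfg:
    server_port: int = 8087  # port read by start_server.py via eval_cfg.agent.server_port


@dataclass
class EvalCfg:
    agent: AgentCfg = field(default_factory=AgentCfg)


eval_cfg = EvalCfg()
```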
diff --git a/challenge/README.md b/scripts/iros_challenge/README.md
similarity index 100%
rename from challenge/README.md
rename to scripts/iros_challenge/README.md
diff --git a/challenge/demo.gif b/scripts/iros_challenge/demo.gif
similarity index 100%
rename from challenge/demo.gif
rename to scripts/iros_challenge/demo.gif
diff --git a/scripts/eval/eval_iros.py b/scripts/iros_challenge/eval_iros.py
similarity index 100%
rename from scripts/eval/eval_iros.py
rename to scripts/iros_challenge/eval_iros.py
diff --git a/challenge/health_check.sh b/scripts/iros_challenge/health_check.sh
similarity index 99%
rename from challenge/health_check.sh
rename to scripts/iros_challenge/health_check.sh
index 63c7c7ce..86c23399 100644
--- a/challenge/health_check.sh
+++ b/scripts/iros_challenge/health_check.sh
@@ -86,4 +86,4 @@ while true; do
             fi
         fi
     fi
-done
\ No newline at end of file
+done
diff --git a/challenge/onsite_competition/README.md b/scripts/iros_challenge/onsite_competition/README.md
similarity index 100%
rename from challenge/onsite_competition/README.md
rename to scripts/iros_challenge/onsite_competition/README.md
diff --git a/challenge/onsite_competition/captures/rs_meta.json b/scripts/iros_challenge/onsite_competition/captures/rs_meta.json
similarity index 100%
rename from challenge/onsite_competition/captures/rs_meta.json
rename to scripts/iros_challenge/onsite_competition/captures/rs_meta.json
diff --git a/challenge/onsite_competition/captures/rs_rgb.jpg b/scripts/iros_challenge/onsite_competition/captures/rs_rgb.jpg
similarity index 100%
rename from challenge/onsite_competition/captures/rs_rgb.jpg
rename to scripts/iros_challenge/onsite_competition/captures/rs_rgb.jpg
diff --git a/challenge/onsite_competition/captures/sim_depth.npy b/scripts/iros_challenge/onsite_competition/captures/sim_depth.npy
similarity index 100%
rename from challenge/onsite_competition/captures/sim_depth.npy
rename to scripts/iros_challenge/onsite_competition/captures/sim_depth.npy
diff --git a/challenge/onsite_competition/captures/sim_rgb.npy b/scripts/iros_challenge/onsite_competition/captures/sim_rgb.npy
similarity index 100%
rename from challenge/onsite_competition/captures/sim_rgb.npy
rename to scripts/iros_challenge/onsite_competition/captures/sim_rgb.npy
diff --git a/challenge/onsite_competition/onsite_competition_rules_en-US.md b/scripts/iros_challenge/onsite_competition/onsite_competition_rules_en-US.md
similarity index 100%
rename from challenge/onsite_competition/onsite_competition_rules_en-US.md
rename to scripts/iros_challenge/onsite_competition/onsite_competition_rules_en-US.md
diff --git a/challenge/onsite_competition/onsite_competition_rules_zh-CN.md b/scripts/iros_challenge/onsite_competition/onsite_competition_rules_zh-CN.md
similarity index 100%
rename from challenge/onsite_competition/onsite_competition_rules_zh-CN.md
rename to scripts/iros_challenge/onsite_competition/onsite_competition_rules_zh-CN.md
diff --git a/challenge/onsite_competition/sdk/cam.py b/scripts/iros_challenge/onsite_competition/sdk/cam.py
similarity index 100%
rename from challenge/onsite_competition/sdk/cam.py
rename to scripts/iros_challenge/onsite_competition/sdk/cam.py
diff --git a/challenge/onsite_competition/sdk/control.py b/scripts/iros_challenge/onsite_competition/sdk/control.py
similarity index 100%
rename from challenge/onsite_competition/sdk/control.py
rename to scripts/iros_challenge/onsite_competition/sdk/control.py
diff --git a/challenge/onsite_competition/sdk/main.py b/scripts/iros_challenge/onsite_competition/sdk/main.py
similarity index 100%
rename from challenge/onsite_competition/sdk/main.py
rename to scripts/iros_challenge/onsite_competition/sdk/main.py
diff --git a/challenge/onsite_competition/sdk/real_world_env.py b/scripts/iros_challenge/onsite_competition/sdk/real_world_env.py
similarity index 100%
rename from challenge/onsite_competition/sdk/real_world_env.py
rename to scripts/iros_challenge/onsite_competition/sdk/real_world_env.py
diff --git a/challenge/onsite_competition/sdk/save_obs.py b/scripts/iros_challenge/onsite_competition/sdk/save_obs.py
similarity index 100%
rename from challenge/onsite_competition/sdk/save_obs.py
rename to scripts/iros_challenge/onsite_competition/sdk/save_obs.py
diff --git a/challenge/onsite_competition/sdk/stream.py b/scripts/iros_challenge/onsite_competition/sdk/stream.py
similarity index 100%
rename from challenge/onsite_competition/sdk/stream.py
rename to scripts/iros_challenge/onsite_competition/sdk/stream.py
diff --git a/challenge/onsite_competition/sdk/test_agent.py b/scripts/iros_challenge/onsite_competition/sdk/test_agent.py
similarity index 100%
rename from challenge/onsite_competition/sdk/test_agent.py
rename to scripts/iros_challenge/onsite_competition/sdk/test_agent.py
diff --git a/challenge/onsite_competition/sdk/test_robot.py b/scripts/iros_challenge/onsite_competition/sdk/test_robot.py
similarity index 100%
rename from challenge/onsite_competition/sdk/test_robot.py
rename to scripts/iros_challenge/onsite_competition/sdk/test_robot.py
diff --git a/challenge/output.gif b/scripts/iros_challenge/output.gif
similarity index 100%
rename from challenge/output.gif
rename to scripts/iros_challenge/output.gif
diff --git a/challenge/start_eval_iros.sh b/scripts/iros_challenge/start_eval_iros.sh
similarity index 100%
rename from challenge/start_eval_iros.sh
rename to scripts/iros_challenge/start_eval_iros.sh
diff --git a/scripts/eval/inference_only_demo.ipynb b/scripts/notebooks/inference_only_demo.ipynb
similarity index 100%
rename from scripts/eval/inference_only_demo.ipynb
rename to scripts/notebooks/inference_only_demo.ipynb
diff --git a/setup.cfg b/setup.cfg
index c055fee2..4118ae4d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,4 +43,4 @@ per-file-ignores=*/__init__.py:F401
 ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117
 max-line-length = 120
 max-complexity = 30
-exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,internnav/model/**,*.ipynb
+exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,scripts/**,internnav/projects/**
diff --git a/tests/function_test/test_server.py b/tests/function_test/test_server.py
index 885bf366..7e4d7b4e 100644
--- a/tests/function_test/test_server.py
+++ b/tests/function_test/test_server.py
@@ -9,7 +9,7 @@ def start_server():
     server_cmd = [
         sys.executable,
-        "internnav/utils/comm_utils/server.py",
+        "scripts/eval/start_server.py",
     ]
     proc = subprocess.Popen(