From b01b7b752dc141687fdec51c349bdf4a57853948 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Tue, 9 Sep 2025 08:27:17 +0000
Subject: [PATCH] add draw action arrow by pil; no distributed visualize
 allowed

---
 internnav/evaluator/utils/common.py     | 246 +++++++++++++++++-------
 internnav/evaluator/vln_pe_evaluator.py |   3 +
 2 files changed, 176 insertions(+), 73 deletions(-)

diff --git a/internnav/evaluator/utils/common.py b/internnav/evaluator/utils/common.py
index cd595a34..5e6011b7 100644
--- a/internnav/evaluator/utils/common.py
+++ b/internnav/evaluator/utils/common.py
@@ -1,12 +1,13 @@
+import copy
+import gzip
 import json
 import math
-import gzip
-import copy
 import os
 from collections import defaultdict
 
 import numpy as np
 from internutopia.core.util import is_in_container
+from PIL import Image, ImageDraw
 from scipy.ndimage import binary_dilation
 
 from internnav.utils.common_log_util import common_logger as log
@@ -155,26 +156,21 @@ def different_height(item):
             break
     return different_height
 
+
 def transform_rotation_z_90degrees(rotation):
-    z_rot_90 = [np.cos(np.pi/4), 0, 0, np.sin(np.pi/4)]  # 90 degrees = pi/2 radians
+    z_rot_90 = [np.cos(np.pi / 4), 0, 0, np.sin(np.pi / 4)]  # 90 degrees = pi/2 radians
     w1, x1, y1, z1 = rotation
     w2, x2, y2, z2 = z_rot_90
     revised_rotation = [
-        w1*w2 - x1*x2 - y1*y2 - z1*z2,  # w
-        w1*x2 + x1*w2 + y1*z2 - z1*y2,  # x
-        w1*y2 - x1*z2 + y1*w2 + z1*x2,  # y
-        w1*z2 + x1*y2 - y1*x2 + z1*w2   # z
+        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,  # w
+        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,  # x
+        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,  # y
+        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,  # z
     ]
     return revised_rotation
 
 
-def load_data(
-    dataset_root_dir,
-    split,
-    filter_same_trajectory=True,
-    filter_stairs=True,
-    dataset_type='mp3d'
-):
+def load_data(dataset_root_dir, split, filter_same_trajectory=True, filter_stairs=True, dataset_type='mp3d'):
     with gzip.open(os.path.join(dataset_root_dir, split, f"{split}.json.gz"), 'rt', encoding='utf-8') as f:
         data = json.load(f)['episodes']
 
@@ -208,10 +204,10 @@ def load_data(
             new_scene_data.append(new_item)
 
         new_data[scan] = new_scene_data
-        
+
     data = copy.deepcopy(new_data)
     new_data = defaultdict(list)
-    
+
     # filter_same_trajectory
     if filter_same_trajectory:
         total_count = 0
@@ -262,6 +258,7 @@ def load_scene_usd(mp3d_data_dir, scan):
         return None
     return scene_usd_path
 
+
 def load_kujiale_scene_usd(kujiale_iros_data_dir, scan):
     """Load scene USD based on the scan"""
     scene_usd_path = os.path.join(kujiale_iros_data_dir, scan, f'{scan}.usda')
@@ -270,6 +267,7 @@ def load_kujiale_scene_usd(kujiale_iros_data_dir, scan):
         return None
     return scene_usd_path
 
+
 def get_new_position_and_rotation(robot_position, robot_rotation, action):
     from omni.isaac.core.utils.rotations import (
         euler_angles_to_quat,
@@ -336,15 +334,17 @@ def norm_depth(depth_info, min_depth=0, max_depth=10):
     depth_info = (depth_info - min_depth) / (max_depth - min_depth)
     return depth_info
 
+
 def draw_trajectory(array, obs_lst, reference_path):
     """
     Draw the globalgps path and orientation arrows onto the depth array.
     """
-    from internnav.evaluator.utils.path_plan import world_to_pixel
-    from omni.isaac.core.utils.rotations import quat_to_euler_angles
-    import numpy as np
     import matplotlib.pyplot as plt
+    import numpy as np
     from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+    from omni.isaac.core.utils.rotations import quat_to_euler_angles
+
+    from internnav.evaluator.utils.path_plan import world_to_pixel
 
     points = []
     arrows = []
@@ -393,9 +393,9 @@ def draw_trajectory(array, obs_lst, reference_path):
 
     # Draw orientation arrows
     for start, end in arrows:
-        ax.arrow(start[0], start[1],
-                 end[0] - start[0], end[1] - start[1],
-                 head_width=5, head_length=5, fc='red', ec='red')
+        ax.arrow(
+            start[0], start[1], end[0] - start[0], end[1] - start[1], head_width=5, head_length=5, fc='red', ec='red'
+        )
 
     ax.axis("off")
     fig.tight_layout(pad=0)
@@ -405,118 +405,213 @@ def draw_trajectory(array, obs_lst, reference_path):
     plt.close(fig)
     return img
 
-from PIL import Image
+
 from internnav import PROJECT_ROOT_PATH
-def draw_action(array, action, arrow_color=(255, 0, 0)):  # Default to blue
+
+
+def draw_action_with_image(array, action, arrow_color=(255, 0, 0)):  # Default to blue
     """
     Draw colored arrow on the bottom of the numpy array while:
     1. Maintaining original image shape
     2. Removing white backgrounds from icons
     3. Coloring the arrow (default blue)
-    
+
     Args:
         array: Input numpy array (H,W,3) RGB image
         action: Integer action (0=stop, 1=forward, 2=left, 3=right)
         arrow_color: Tuple (R,G,B) for arrow color (default: blue)
-    
+
     Returns:
         Numpy array with same shape as input, with colored icon at bottom center
     """
     if 'move_by_discrete' in action:
         move = action['move_by_discrete'][0]  # Extract the movement value
     elif 'move_by_flash' in action:
-        move = action['move_by_flash'][0] 
+        move = action['move_by_flash'][0]
     else:
         move = 1
     action = move
 
     # Load action icon
-    action_icons = {
-        1: "forward.png",
-        2: "left.png",
-        3: "right.png",
-        0: "stop.png"
-    }
+    action_icons = {1: "forward.png", 2: "left.png", 3: "right.png", 0: "stop.png"}
     icon_path = os.path.join(PROJECT_ROOT_PATH, "internnav/utils/images/")
     icon_path = os.path.join(icon_path, (action_icons.get(action, "stop.png")))
-    
+
     # Convert array to PIL Image
     img = Image.fromarray(array.copy())  # Keep original unchanged
-    
+
     try:
         # Load icon and convert to RGBA if not already
         icon = Image.open(icon_path).convert('RGBA').resize((40, 40))
-        
+
         # Process icon:
         # 1. Convert white background to transparent
         # 2. Convert black arrow to specified color
         data = np.array(icon)
         r, g, b, a = data.T
-        
+
         # Identify white background (high RGB values)
         white_areas = (r > 200) & (g > 200) & (b > 200)
         # Identify arrow (non-white areas)
         arrow_mask = ~white_areas
-        
+
         # Set white areas to transparent
         data[..., -1][white_areas.T] = 0
-        
+
         # Color the arrow
         data[..., 0][arrow_mask.T] = arrow_color[0]  # R
         data[..., 1][arrow_mask.T] = arrow_color[1]  # G
         data[..., 2][arrow_mask.T] = arrow_color[2]  # B
-        
+
         icon = Image.fromarray(data)
-        
+
         # Calculate position (bottom center)
         icon_pos = (
             (img.width - icon.width) // 2,  # Center horizontally
-            img.height - icon.height - 10   # 10px from bottom
+            img.height - icon.height - 10,  # 10px from bottom
         )
-        
+
         # Paste icon onto image using alpha channel as mask
         img.paste(icon, icon_pos, icon)
-        
+
     except Exception as e:
         print(f"Couldn't process icon: {e}")
         return array  # Return original if icon fails
-    
+
     return np.array(img)  # Return with same shape as input
 
+
+def draw_action_pil(array, action, arrow_color=(255, 0, 0)):  # default: red
+    """
+    Draw a colored arrow (or stop icon) on the bottom-center of the image.
+
+    Args:
+        array: np.ndarray (H, W, 3) RGB image
+        action: int or dict with 'move_by_discrete'/'move_by_flash' (0=stop, 1=forward, 2=left, 3=right)
+        arrow_color: (R, G, B)
+    Returns:
+        np.ndarray with same shape as input
+    """
+    # Normalize action to int code
+    if isinstance(action, dict):
+        if 'move_by_discrete' in action:
+            move = action['move_by_discrete'][0]
+        elif 'move_by_flash' in action:
+            move = action['move_by_flash'][0]
+        else:
+            move = 1
+        action_code = int(move)
+    else:
+        action_code = int(action)
+
+    img = Image.fromarray(array.copy())
+
+    # Icon size relative to image; anti-aliased via supersampling
+    base = min(img.width, img.height)
+    size = max(32, min(128, int(base * 0.1)))  # 10% of min dim, clamp 32..128
+    scale = 3  # supersample for smoother edges
+    W, H = size * scale, size * scale
+
+    # Transparent overlay we’ll paste onto the image
+    overlay = Image.new('RGBA', (W, H), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    color = tuple(arrow_color) + (255,)
+    # Optional subtle shadow for contrast
+    shadow = (0, 0, 0, 120)
+
+    cx, cy = W // 2, H // 2
+
+    def draw_up_arrow():
+        # Shaft
+        shaft_w = int(W * 0.22)
+        shaft_h = int(H * 0.48)
+        head_h = int(H * 0.36)
+        y1 = cy + shaft_h // 2
+        y0 = y1 - shaft_h
+        # Shadow
+        draw.rounded_rectangle(
+            [cx - shaft_w // 2 + 2, y0 + 2, cx + shaft_w // 2 + 2, y1 + 2], radius=shaft_w // 2, fill=shadow
+        )
+        # Color shaft
+        draw.rounded_rectangle([cx - shaft_w // 2, y0, cx + shaft_w // 2, y1], radius=shaft_w // 2, fill=color)
+        # Head triangle
+        apex = (cx, y0 - head_h)
+        left = (cx - int(W * 0.32), y0)
+        right = (cx + int(W * 0.32), y0)
+        # Shadow
+        draw.polygon(
+            [(apex[0] + 2, apex[1] + 2), (left[0] + 2, left[1] + 2), (right[0] + 2, right[1] + 2)], fill=shadow
+        )
+        # Color
+        draw.polygon([apex, left, right], fill=color)
+
+    def draw_stop_icon():
+        r = int(min(W, H) * 0.38)
+        bbox = [cx - r, cy - r, cx + r, cy + r]
+        # Shadow
+        s_off = 2
+        draw.ellipse([bbox[0] + s_off, bbox[1] + s_off, bbox[2] + s_off, bbox[3] + s_off], fill=shadow)
+        # Red circle (or any arrow_color)
+        draw.ellipse(bbox, fill=color)
+        # White square inside
+        ir = int(r * 0.55)
+        draw.rectangle([cx - ir, cy - int(ir * 0.6), cx + ir, cy + int(ir * 0.6)], fill=(255, 255, 255, 255))
+
+    if action_code == 0:
+        draw_stop_icon()
+        rotated = overlay
+    else:
+        draw_up_arrow()
+        # Rotate according to action: forward=up, left=+90°, right=-90°
+        angle = {1: 0, 2: 90, 3: -90}.get(action_code, 0)
+        rotated = overlay.rotate(angle, resample=Image.BICUBIC, expand=True)
+
+    # Downsample (anti-alias) to final icon size
+    icon = rotated.resize((size, size), Image.LANCZOS)
+
+    # Paste at bottom center with a small margin
+    margin = max(6, size // 8)
+    x = (img.width - icon.width) // 2
+    y = img.height - icon.height - margin
+    img.paste(icon, (x, y), icon)
+
+    return np.array(img)
+
+
 def crop(array):
     # Crop 256x256 (as in your original code)
     height, width = array.shape[:2]
     start_x = (width - 256) // 2
     start_y = (height - 256) // 2
-    return array[start_y:start_y+256, start_x:start_x+256, :]
+    return array[start_y : start_y + 256, start_x : start_x + 256, :]
+
 
 def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: bool = True):
     """
     Load .npy file and save as image
-    
+
     Args:
         npy_path: Path to input .npy file
         output_path: Output image path (extension determines format)
         normalize: Scale values to 0-255 if True
     """
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    
+
     first_obs = obs_lst[-1]
-    if 'rgb' not in first_obs: return
+    if 'rgb' not in first_obs:
+        return
     rgb_array = first_obs['rgb']
     topdown_array = first_obs['topdown_rgb']
-    depth = first_obs['topdown_depth']
 
     # draw array on rgb array
-    # rgb_array = draw_action(rgb_array, action)
+    rgb_array = draw_action_pil(rgb_array, action)
 
     # draw trajectory on depth
     topdown_array = crop(draw_trajectory(topdown_array, obs_lst, reference_path))
-    # topdown_array = crop(draw_trajectory(depth, obs_lst, reference_path))
 
     # Combine horizontally (256x256 + 256x256 = 512x256)
     array = np.concatenate((rgb_array, topdown_array), axis=1)
-    
+
     # Handle different array types
     if array.dtype == np.bool_:
         array = array.astype(np.uint8) * 255
@@ -525,9 +620,9 @@ def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: b
         array = array.astype(np.uint8)
     elif np.issubdtype(array.dtype, np.integer) and normalize:
         array = ((array - array.min()) * (255 / (array.max() - array.min()))).astype(np.uint8)
-    
+
     # Upscaling using interpolation, improve resolution
-    array = cv2.resize(array, (array.shape[1]*2, array.shape[0]*2), interpolation=cv2.INTER_CUBIC)
+    array = cv2.resize(array, (array.shape[1] * 2, array.shape[0] * 2), interpolation=cv2.INTER_CUBIC)
 
     # Create and save image
     if array.ndim == 2:  # Grayscale
@@ -539,10 +634,12 @@ def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: b
 
     print(f"Saved to {output_path}")
 
-import cv2
-import os
+
 from glob import glob
 
+import cv2
+
+
 def images_to_video(image_folder, output_path, fps=10):
     """
     Generate a video from a folder of images.
@@ -582,11 +679,14 @@ def images_to_video(image_folder, output_path, fps=10):
     out.release()
     print(f"Video saved to: {output_path}")
 
-from tqdm import tqdm 
+
+from tqdm import tqdm
+
+
 def obs_to_video(obs_lst, output_video_path, fps=30):
     """
     Convert a list of observations (with 'rgb' and 'topdown_rgb') directly into a video.
-    
+
     Args:
         obs_lst (list): List of observations, each containing 'rgb' and 'topdown_rgb' arrays.
         output_video_path (str): Path to save the output video (e.g., 'output.mp4').
@@ -594,10 +694,10 @@ def obs_to_video(obs_lst, output_video_path, fps=30):
     """
     if not obs_lst:
         raise ValueError("Empty observation list!")
-    
+
     # Get the first frame to determine video dimensions
     first_obs = obs_lst[0]
-    
+
     # Process the first frame to get dimensions
     rgb_array = first_obs['rgb']
     topdown_array = first_obs['topdown_rgb']
@@ -608,41 +708,41 @@ def obs_to_video(obs_lst, output_video_path, fps=30):
     # npy_to_image(rgb_array, output_path + '/rgb.png')
     # np.save(output_path + '/topdown.npy', topdown_array)
     # np.save(output_path + '/rgb.npy', rgb_array)
-    
+
     # Crop topdown to 256x256 (as in your original code)
     height, width = topdown_array.shape[:2]
     start_x = (width - 256) // 2
     start_y = (height - 256) // 2
-    topdown_array = topdown_array[start_y:start_y+256, start_x:start_x+256, :]
-    
+    topdown_array = topdown_array[start_y : start_y + 256, start_x : start_x + 256, :]
+
     # Combine horizontally (256x256 + 256x256 = 512x256)
     combined_array = np.concatenate((rgb_array, topdown_array), axis=1)
     height, width, _ = combined_array.shape
-    
+
     # Initialize VideoWriter
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # MP4 codec
     video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-    
+
     # Process and write each frame
     for i, obs in enumerate(tqdm(obs_lst, desc="Generating video")):
         if 'rgb' not in obs or 'topdown_rgb' not in obs:
             print(f"Warning: Observation {i} missing 'rgb' or 'topdown_rgb'")
             continue
-        
+
         rgb_array = obs['rgb']
         topdown_array = obs['topdown_rgb']
-        
+
         # Crop topdown to 256x256
         height_td, width_td = topdown_array.shape[:2]
         start_x = (width_td - 256) // 2
         start_y = (height_td - 256) // 2
-        topdown_array = topdown_array[start_y:start_y+256, start_x:start_x+256, :]
-        
+        topdown_array = topdown_array[start_y : start_y + 256, start_x : start_x + 256, :]
+
         # Ensure correct shape and type
         if rgb_array.shape != (256, 256, 3) or topdown_array.shape != (256, 256, 3):
             print(f"Warning: Observation {i} has incorrect dimensions")
             continue
-        
+
         # Convert float arrays (0-1) to uint8 (0-255)
         if rgb_array.dtype == np.float32 or rgb_array.dtype == np.float64:
             if rgb_array.max() <= 1.0:
@@ -650,10 +750,10 @@ def obs_to_video(obs_lst, output_video_path, fps=30):
         if topdown_array.dtype == np.float32 or topdown_array.dtype == np.float64:
             if topdown_array.max() <= 1.0:
                 topdown_array = (topdown_array * 255).astype(np.uint8)
-        
+
         # Combine and write frame
         combined_frame = np.concatenate((rgb_array, topdown_array), axis=1)
         video_writer.write(combined_frame)
-    
+
     video_writer.release()
     print(f"Video saved to: {output_video_path}")
diff --git a/internnav/evaluator/vln_pe_evaluator.py b/internnav/evaluator/vln_pe_evaluator.py
index 468f49d9..4c4acf64 100644
--- a/internnav/evaluator/vln_pe_evaluator.py
+++ b/internnav/evaluator/vln_pe_evaluator.py
@@ -286,6 +286,9 @@ def eval(self):
 
             # save step obs
             if self.vis_output:
+                if self.config.task.task_settings['use_distributed']:
+                    raise Exception('visualization not support distributed mode!')
+
                 for ob, info, act in zip(obs, reset_info, action):
                     if info is None or 'rgb' not in ob or ob['fail_reason']:
                         continue