From b01b7b752dc141687fdec51c349bdf4a57853948 Mon Sep 17 00:00:00 2001 From: wangyukai Date: Tue, 9 Sep 2025 08:27:17 +0000 Subject: [PATCH] add draw action arrow by pil; no distributed visualize allowed --- internnav/evaluator/utils/common.py | 246 +++++++++++++++++------- internnav/evaluator/vln_pe_evaluator.py | 3 + 2 files changed, 176 insertions(+), 73 deletions(-) diff --git a/internnav/evaluator/utils/common.py b/internnav/evaluator/utils/common.py index cd595a34..5e6011b7 100644 --- a/internnav/evaluator/utils/common.py +++ b/internnav/evaluator/utils/common.py @@ -1,12 +1,13 @@ +import copy +import gzip import json import math -import gzip -import copy import os from collections import defaultdict import numpy as np from internutopia.core.util import is_in_container +from PIL import Image, ImageDraw from scipy.ndimage import binary_dilation from internnav.utils.common_log_util import common_logger as log @@ -155,26 +156,21 @@ def different_height(item): break return different_height + def transform_rotation_z_90degrees(rotation): - z_rot_90 = [np.cos(np.pi/4), 0, 0, np.sin(np.pi/4)] # 90 degrees = pi/2 radians + z_rot_90 = [np.cos(np.pi / 4), 0, 0, np.sin(np.pi / 4)] # 90 degrees = pi/2 radians w1, x1, y1, z1 = rotation w2, x2, y2, z2 = z_rot_90 revised_rotation = [ - w1*w2 - x1*x2 - y1*y2 - z1*z2, # w - w1*x2 + x1*w2 + y1*z2 - z1*y2, # x - w1*y2 - x1*z2 + y1*w2 + z1*x2, # y - w1*z2 + x1*y2 - y1*x2 + z1*w2 # z + w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2, # w + w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2, # x + w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2, # y + w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2, # z ] return revised_rotation -def load_data( - dataset_root_dir, - split, - filter_same_trajectory=True, - filter_stairs=True, - dataset_type='mp3d' -): +def load_data(dataset_root_dir, split, filter_same_trajectory=True, filter_stairs=True, dataset_type='mp3d'): with gzip.open(os.path.join(dataset_root_dir, split, f"{split}.json.gz"), 'rt', encoding='utf-8') as f: data = json.load(f)['episodes'] @@ -208,10 +204,10 @@ def load_data( new_scene_data.append(new_item) new_data[scan] = new_scene_data - + data = copy.deepcopy(new_data) new_data = defaultdict(list) - + # filter_same_trajectory if filter_same_trajectory: total_count = 0 @@ -262,6 +258,7 @@ def load_scene_usd(mp3d_data_dir, scan): return None return scene_usd_path + def load_kujiale_scene_usd(kujiale_iros_data_dir, scan): """Load scene USD based on the scan""" scene_usd_path = os.path.join(kujiale_iros_data_dir, scan, f'{scan}.usda') @@ -270,6 +267,7 @@ def load_kujiale_scene_usd(kujiale_iros_data_dir, scan): return None return scene_usd_path + def get_new_position_and_rotation(robot_position, robot_rotation, action): from omni.isaac.core.utils.rotations import ( euler_angles_to_quat, @@ -336,15 +334,17 @@ def norm_depth(depth_info, min_depth=0, max_depth=10): depth_info = (depth_info - min_depth) / (max_depth - min_depth) return depth_info + def draw_trajectory(array, obs_lst, reference_path): """ Draw the globalgps path and orientation arrows onto the depth array. """ - from internnav.evaluator.utils.path_plan import world_to_pixel - from omni.isaac.core.utils.rotations import quat_to_euler_angles - import numpy as np import matplotlib.pyplot as plt + import numpy as np from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas + from omni.isaac.core.utils.rotations import quat_to_euler_angles + + from internnav.evaluator.utils.path_plan import world_to_pixel points = [] arrows = [] @@ -393,9 +393,9 @@ def draw_trajectory(array, obs_lst, reference_path): # Draw orientation arrows for start, end in arrows: - ax.arrow(start[0], start[1], - end[0] - start[0], end[1] - start[1], - head_width=5, head_length=5, fc='red', ec='red') + ax.arrow( + start[0], start[1], end[0] - start[0], end[1] - start[1], head_width=5, head_length=5, fc='red', ec='red' + ) ax.axis("off") fig.tight_layout(pad=0) @@ -405,118 +405,213 @@ def draw_trajectory(array, obs_lst, reference_path): plt.close(fig) return img -from PIL import Image + from internnav import PROJECT_ROOT_PATH -def draw_action(array, action, arrow_color=(255, 0, 0)): # Default to blue + + +def draw_action_with_image(array, action, arrow_color=(255, 0, 0)): # Default to blue """ Draw colored arrow on the bottom of the numpy array while: 1. Maintaining original image shape 2. Removing white backgrounds from icons 3. Coloring the arrow (default blue) - + Args: array: Input numpy array (H,W,3) RGB image action: Integer action (0=stop, 1=forward, 2=left, 3=right) arrow_color: Tuple (R,G,B) for arrow color (default: blue) - + Returns: Numpy array with same shape as input, with colored icon at bottom center """ if 'move_by_discrete' in action: move = action['move_by_discrete'][0] # Extract the movement value elif 'move_by_flash' in action: - move = action['move_by_flash'][0] + move = action['move_by_flash'][0] else: move = 1 action = move # Load action icon - action_icons = { - 1: "forward.png", - 2: "left.png", - 3: "right.png", - 0: "stop.png" - } + action_icons = {1: "forward.png", 2: "left.png", 3: "right.png", 0: "stop.png"} icon_path = os.path.join(PROJECT_ROOT_PATH, "internnav/utils/images/") icon_path = os.path.join(icon_path, (action_icons.get(action, "stop.png"))) - + # Convert array to PIL Image img = Image.fromarray(array.copy()) # Keep original unchanged - + try: # Load icon and convert to RGBA if not already icon = Image.open(icon_path).convert('RGBA').resize((40, 40)) - + # Process icon: # 1. Convert white background to transparent # 2. Convert black arrow to specified color data = np.array(icon) r, g, b, a = data.T - + # Identify white background (high RGB values) white_areas = (r > 200) & (g > 200) & (b > 200) # Identify arrow (non-white areas) arrow_mask = ~white_areas - + # Set white areas to transparent data[..., -1][white_areas.T] = 0 - + # Color the arrow data[..., 0][arrow_mask.T] = arrow_color[0] # R data[..., 1][arrow_mask.T] = arrow_color[1] # G data[..., 2][arrow_mask.T] = arrow_color[2] # B - + icon = Image.fromarray(data) - + # Calculate position (bottom center) icon_pos = ( (img.width - icon.width) // 2, # Center horizontally - img.height - icon.height - 10 # 10px from bottom + img.height - icon.height - 10, # 10px from bottom ) - + # Paste icon onto image using alpha channel as mask img.paste(icon, icon_pos, icon) - + except Exception as e: print(f"Couldn't process icon: {e}") return array # Return original if icon fails - + return np.array(img) # Return with same shape as input + +def draw_action_pil(array, action, arrow_color=(255, 0, 0)): # default: red + """ + Draw a colored arrow (or stop icon) on the bottom-center of the image. + + Args: + array: np.ndarray (H, W, 3) RGB image + action: int or dict with 'move_by_discrete'/'move_by_flash' (0=stop, 1=forward, 2=left, 3=right) + arrow_color: (R, G, B) + Returns: + np.ndarray with same shape as input + """ + # Normalize action to int code + if isinstance(action, dict): + if 'move_by_discrete' in action: + move = action['move_by_discrete'][0] + elif 'move_by_flash' in action: + move = action['move_by_flash'][0] + else: + move = 1 + action_code = int(move) + else: + action_code = int(action) + + img = Image.fromarray(array.copy()) + + # Icon size relative to image; anti-aliased via supersampling + base = min(img.width, img.height) + size = max(32, min(128, int(base * 0.1))) # 10% of min dim, clamp 32..128 + scale = 3 # supersample for smoother edges + W, H = size * scale, size * scale + + # Transparent overlay we’ll paste onto the image + overlay = Image.new('RGBA', (W, H), (0, 0, 0, 0)) + draw = ImageDraw.Draw(overlay) + color = tuple(arrow_color) + (255,) + # Optional subtle shadow for contrast + shadow = (0, 0, 0, 120) + + cx, cy = W // 2, H // 2 + + def draw_up_arrow(): + # Shaft + shaft_w = int(W * 0.22) + shaft_h = int(H * 0.48) + head_h = int(H * 0.36) + y1 = cy + shaft_h // 2 + y0 = y1 - shaft_h + # Shadow + draw.rounded_rectangle( + [cx - shaft_w // 2 + 2, y0 + 2, cx + shaft_w // 2 + 2, y1 + 2], radius=shaft_w // 2, fill=shadow + ) + # Color shaft + draw.rounded_rectangle([cx - shaft_w // 2, y0, cx + shaft_w // 2, y1], radius=shaft_w // 2, fill=color) + # Head triangle + apex = (cx, y0 - head_h) + left = (cx - int(W * 0.32), y0) + right = (cx + int(W * 0.32), y0) + # Shadow + draw.polygon( + [(apex[0] + 2, apex[1] + 2), (left[0] + 2, left[1] + 2), (right[0] + 2, right[1] + 2)], fill=shadow + ) + # Color + draw.polygon([apex, left, right], fill=color) + + def draw_stop_icon(): + r = int(min(W, H) * 0.38) + bbox = [cx - r, cy - r, cx + r, cy + r] + # Shadow + s_off = 2 + draw.ellipse([bbox[0] + s_off, bbox[1] + s_off, bbox[2] + s_off, bbox[3] + s_off], fill=shadow) + # Red circle (or any arrow_color) + draw.ellipse(bbox, fill=color) + # White square inside + ir = int(r * 0.55) + draw.rectangle([cx - ir, cy - int(ir * 0.6), cx + ir, cy + int(ir * 0.6)], fill=(255, 255, 255, 255)) + + if action_code == 0: + draw_stop_icon() + rotated = overlay + else: + draw_up_arrow() + # Rotate according to action: forward=up, left=+90°, right=-90° + angle = {1: 0, 2: 90, 3: -90}.get(action_code, 0) + rotated = overlay.rotate(angle, resample=Image.BICUBIC, expand=True) + + # Downsample (anti-alias) to final icon size + icon = rotated.resize((size, size), Image.LANCZOS) + + # Paste at bottom center with a small margin + margin = max(6, size // 8) + x = (img.width - icon.width) // 2 + y = img.height - icon.height - margin + img.paste(icon, (x, y), icon) + + return np.array(img) + + def crop(array): # Crop 256x256 (as in your original code) height, width = array.shape[:2] start_x = (width - 256) // 2 start_y = (height - 256) // 2 - return array[start_y:start_y+256, start_x:start_x+256, :] + return array[start_y : start_y + 256, start_x : start_x + 256, :] + def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: bool = True): """ Load .npy file and save as image - + Args: npy_path: Path to input .npy file output_path: Output image path (extension determines format) normalize: Scale values to 0-255 if True """ os.makedirs(os.path.dirname(output_path), exist_ok=True) - + first_obs = obs_lst[-1] - if 'rgb' not in first_obs: return + if 'rgb' not in first_obs: + return rgb_array = first_obs['rgb'] topdown_array = first_obs['topdown_rgb'] - depth = first_obs['topdown_depth'] # draw array on rgb array - # rgb_array = draw_action(rgb_array, action) + rgb_array = draw_action_pil(rgb_array, action) # draw trajectory on depth topdown_array = crop(draw_trajectory(topdown_array, obs_lst, reference_path)) - # topdown_array = crop(draw_trajectory(depth, obs_lst, reference_path)) # Combine horizontally (256x256 + 256x256 = 512x256) array = np.concatenate((rgb_array, topdown_array), axis=1) - + # Handle different array types if array.dtype == np.bool_: array = array.astype(np.uint8) * 255 @@ -525,9 +620,9 @@ def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: b array = array.astype(np.uint8) elif np.issubdtype(array.dtype, np.integer) and normalize: array = ((array - array.min()) * (255 / (array.max() - array.min()))).astype(np.uint8) - + # Upscaling using interpolation, improve resolution - array = cv2.resize(array, (array.shape[1]*2, array.shape[0]*2), interpolation=cv2.INTER_CUBIC) + array = cv2.resize(array, (array.shape[1] * 2, array.shape[0] * 2), interpolation=cv2.INTER_CUBIC) # Create and save image if array.ndim == 2: # Grayscale @@ -539,10 +634,12 @@ def obs_to_image(obs_lst, action, output_path: str, reference_path, normalize: b print(f"Saved to {output_path}") -import cv2 -import os + from glob import glob +import cv2 + + def images_to_video(image_folder, output_path, fps=10): """ Generate a video from a folder of images. @@ -582,11 +679,14 @@ def images_to_video(image_folder, output_path, fps=10): out.release() print(f"Video saved to: {output_path}") -from tqdm import tqdm + +from tqdm import tqdm + + def obs_to_video(obs_lst, output_video_path, fps=30): """ Convert a list of observations (with 'rgb' and 'topdown_rgb') directly into a video. - + Args: obs_lst (list): List of observations, each containing 'rgb' and 'topdown_rgb' arrays. output_video_path (str): Path to save the output video (e.g., 'output.mp4'). @@ -594,10 +694,10 @@ def obs_to_video(obs_lst, output_video_path, fps=30): """ if not obs_lst: raise ValueError("Empty observation list!") - + # Get the first frame to determine video dimensions first_obs = obs_lst[0] - + # Process the first frame to get dimensions rgb_array = first_obs['rgb'] topdown_array = first_obs['topdown_rgb'] @@ -608,41 +708,41 @@ def obs_to_video(obs_lst, output_video_path, fps=30): # npy_to_image(rgb_array, output_path + '/rgb.png') # np.save(output_path + '/topdown.npy', topdown_array) # np.save(output_path + '/rgb.npy', rgb_array) - + # Crop topdown to 256x256 (as in your original code) height, width = topdown_array.shape[:2] start_x = (width - 256) // 2 start_y = (height - 256) // 2 - topdown_array = topdown_array[start_y:start_y+256, start_x:start_x+256, :] - + topdown_array = topdown_array[start_y : start_y + 256, start_x : start_x + 256, :] + # Combine horizontally (256x256 + 256x256 = 512x256) combined_array = np.concatenate((rgb_array, topdown_array), axis=1) height, width, _ = combined_array.shape - + # Initialize VideoWriter fourcc = cv2.VideoWriter_fourcc(*'mp4v') # MP4 codec video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height)) - + # Process and write each frame for i, obs in enumerate(tqdm(obs_lst, desc="Generating video")): if 'rgb' not in obs or 'topdown_rgb' not in obs: print(f"Warning: Observation {i} missing 'rgb' or 'topdown_rgb'") continue - + rgb_array = obs['rgb'] topdown_array = obs['topdown_rgb'] - + # Crop topdown to 256x256 height_td, width_td = topdown_array.shape[:2] start_x = (width_td - 256) // 2 start_y = (height_td - 256) // 2 - topdown_array = topdown_array[start_y:start_y+256, start_x:start_x+256, :] - + topdown_array = topdown_array[start_y : start_y + 256, start_x : start_x + 256, :] + # Ensure correct shape and type if rgb_array.shape != (256, 256, 3) or topdown_array.shape != (256, 256, 3): print(f"Warning: Observation {i} has incorrect dimensions") continue - + # Convert float arrays (0-1) to uint8 (0-255) if rgb_array.dtype == np.float32 or rgb_array.dtype == np.float64: if rgb_array.max() <= 1.0: @@ -650,10 +750,10 @@ def obs_to_video(obs_lst, output_video_path, fps=30): if topdown_array.dtype == np.float32 or topdown_array.dtype == np.float64: if topdown_array.max() <= 1.0: topdown_array = (topdown_array * 255).astype(np.uint8) - + # Combine and write frame combined_frame = np.concatenate((rgb_array, topdown_array), axis=1) video_writer.write(combined_frame) - + video_writer.release() print(f"Video saved to: {output_video_path}") diff --git a/internnav/evaluator/vln_pe_evaluator.py b/internnav/evaluator/vln_pe_evaluator.py index 468f49d9..4c4acf64 100644 --- a/internnav/evaluator/vln_pe_evaluator.py +++ b/internnav/evaluator/vln_pe_evaluator.py @@ -286,6 +286,9 @@ def eval(self): # save step obs if self.vis_output: + if self.config.task.task_settings['use_distributed']: + raise Exception('visualization not support distributed mode!') + for ob, info, act in zip(obs, reset_info, action): if info is None or 'rgb' not in ob or ob['fail_reason']: continue