From c0bebb99f81598a4d71b6541197c95c8064f43eb Mon Sep 17 00:00:00 2001
From: "xinjie.wang"
Date: Wed, 14 Jan 2026 14:44:10 +0800
Subject: [PATCH] update

---
 README.md                         | 14 +++++---
 docs/tutorials/layout_gen.md      | 16 ++++-----
 embodied_gen/scripts/imageto3d.py |  8 +++++
 embodied_gen/utils/gpt_clients.py |  3 +-
 embodied_gen/utils/simulation.py  | 58 +++++++++++++++++++++++++------
 5 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 81d4cb0..2d16b48 100644
--- a/README.md
+++ b/README.md
@@ -223,14 +223,20 @@ See our paper published in NeurIPS 2025.
 Text-to-image model based on SD3.5 Medium, usage requires agreement to the [model license](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). All models auto-downloaded at the first run.
 
-You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/bg_scenes/scene_list.txt`.
+You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/example_gen_scenes/scene_part_list.txt`.
 
-We provided some sample background assets created with `scene3d-cli`. Download them(~4G) using `hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs`.
+We provide some sample background assets created with `scene3d-cli`. Download them (~2 GB) using:
+```sh
+hf download HorizonRobotics/EmbodiedGenData \
+    --repo-type dataset --local-dir outputs \
+    --include "example_gen_scenes/scene_00[01][0-9]/**" \
+    "example_gen_scenes/scene_part_list.txt"
+```
 
 Generating one interactive 3D scene from task description with `layout-cli` takes approximately 30 minutes.
 ```sh
 layout-cli --task_descs "Place the pen in the mug on the desk" "Put the fruit on the table on the plate" \
---bg_list "outputs/bg_scenes/scene_list.txt" --output_root "outputs/layouts_gen" --insert_robot
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" --output_root "outputs/layouts_gen" --insert_robot
 ```
@@ -245,7 +251,7 @@ Remove `--insert_robot` if you don't consider the robot pose in layout generatio
 ```sh
 CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
 --task_descs "apps/assets/example_layout/task_list.txt" \
---bg_list "outputs/bg_scenes/scene_list.txt" \
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
 --n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
 --output_root "outputs/layouts_gens" --insert_robot > layouts_gens.log &
 ```
diff --git a/docs/tutorials/layout_gen.md b/docs/tutorials/layout_gen.md
index 0f56543..3fb4e85 100644
--- a/docs/tutorials/layout_gen.md
+++ b/docs/tutorials/layout_gen.md
@@ -19,14 +19,14 @@ Layout Generation enables the generation of diverse, physically realistic, and s
 Before running `layout-cli`, you need to prepare background 3D scenes. You can either **generate your own** using the [`scene3d-cli`](scene_gen.md), or **download pre-generated backgrounds** for convenience.
 
-Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/bg_scenes/scene_list.txt`.
+Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/example_gen_scenes/scene_part_list.txt`.
 
 ```bash
-# Option 1: Download pre-generated backgrounds (~4 GB)
-hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs
-
-# Option 2: Download a larger background set (~14 GB)
-hf download xinjjj..RLv2-BG --repo-type dataset --local-dir outputs
+# Download pre-generated backgrounds (~2 GB)
+hf download HorizonRobotics/EmbodiedGenData \
+    --repo-type dataset --local-dir outputs \
+    --include "example_gen_scenes/scene_00[01][0-9]/**" \
+    "example_gen_scenes/scene_part_list.txt"
 ```
 
 ## Generate Interactive Layout Scenes
@@ -37,7 +37,7 @@ Use the `layout-cli` to create interactive 3D scenes based on task descriptions.
 layout-cli \
     --task_descs "Place the pen in the mug on the desk" \
         "Put the fruit on the table on the plate" \
-    --bg_list "outputs/bg_scenes/scene_list.txt" \
+    --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
     --output_root "outputs/layouts_gen" \
     --insert_robot
 ```
@@ -59,7 +59,7 @@ You can also run multiple tasks via a task list file in the backend.
 ```sh
 CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
     --task_descs "apps/assets/example_layout/task_list.txt" \
-    --bg_list "outputs/bg_scenes/scene_list.txt" \
+    --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
     --n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
     --output_root "outputs/layouts_gens" \
     --insert_robot > layouts_gens.log &
diff --git a/embodied_gen/scripts/imageto3d.py b/embodied_gen/scripts/imageto3d.py
index 9d930ab..13a1193 100644
--- a/embodied_gen/scripts/imageto3d.py
+++ b/embodied_gen/scripts/imageto3d.py
@@ -153,6 +153,7 @@ def entrypoint(**kwargs):
         seed = args.seed
         asset_node = "unknown"
+        gs_model = None
         if isinstance(args.asset_type, list) and args.asset_type[idx]:
             asset_node = args.asset_type[idx]
         for try_idx in range(args.n_retry):
@@ -165,6 +166,9 @@
                 logger.error(
                     f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}"
                 )
+                seed = (
+                    random.randint(0, 100000) if seed is not None else None
+                )
                 continue
 
             gs_model = outputs["gaussian"][0]
@@ -208,6 +212,10 @@
             seed = random.randint(0, 100000) if seed is not None else None
 
+        if gs_model is None:
+            logger.error(f"Exceeded image3d retry limit, skipping {image_path}.")
+            continue
+
         # Render the video for generated 3D asset.
         color_images = render_video(gs_model, r=1.85)["color"]
         normal_images = render_video(mesh_model, r=1.85)["normal"]
diff --git a/embodied_gen/utils/gpt_clients.py b/embodied_gen/utils/gpt_clients.py
index 32a9ea9..830dc77 100644
--- a/embodied_gen/utils/gpt_clients.py
+++ b/embodied_gen/utils/gpt_clients.py
@@ -42,7 +42,8 @@
     "GPTclient",
 ]
 
-CONFIG_FILE = "embodied_gen/utils/gpt_config.yaml"
+_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+CONFIG_FILE = os.path.join(_CURRENT_DIR, "gpt_config.yaml")
 
 
 class GPTclient:
diff --git a/embodied_gen/utils/simulation.py b/embodied_gen/utils/simulation.py
index 5ff13b6..7f828fb 100644
--- a/embodied_gen/utils/simulation.py
+++ b/embodied_gen/utils/simulation.py
@@ -25,6 +25,7 @@
 import sapien.core as sapien
 import sapien.physx as physx
 import torch
+import trimesh
 from mani_skill.agents.base_agent import BaseAgent
 from mani_skill.envs.scene import ManiSkillScene
 from mani_skill.examples.motionplanning.panda.utils import (
@@ -57,9 +58,24 @@
     "load_assets_from_layout_file",
     "load_mani_skill_robot",
     "render_images",
+    "is_urdf_articulated",
 ]
 
 
+def is_urdf_articulated(urdf_path: str) -> bool:
+    try:
+        tree = ET.parse(urdf_path)
+        root = tree.getroot()
+        for joint in root.findall(".//joint"):
+            j_type = joint.get("type")
+            if j_type in ["prismatic", "revolute", "continuous", "planar"]:
+                return True
+        return False
+    except Exception as e:
+        logger.warning(f"Error parsing URDF {urdf_path}: {e}.")
+        return False
+
+
 def load_actor_from_urdf(
     scene: sapien.Scene | ManiSkillScene,
     file_path: str,
@@ -203,14 +219,21 @@
         # Combine initial quaternion with object quaternion
         x, y, z, qx, qy, qz, qw = position
         qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
-        actor = load_actor_from_urdf(
-            scene,
-            urdf_file,
-            sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz]),
-            env_idx,
-            use_static=use_static,
-            update_mass=False,
-        )
+        target_pose = sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz])
+        if is_urdf_articulated(urdf_file):
+            loader = scene.create_urdf_loader()
+            loader.fix_root_link = use_static
+            actor = loader.load(urdf_file)
+            actor.set_root_pose(target_pose)
+        else:
+            actor = load_actor_from_urdf(
+                scene,
+                urdf_file,
+                target_pose,
+                env_idx,
+                use_static=use_static,
+                update_mass=False,
+            )
         actors[node] = actor
 
     return actors
@@ -725,8 +748,23 @@
         Returns:
             np.ndarray: Array of grasp actions.
         """
-        physx_rigid = actor.components[1]
-        mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+        if isinstance(actor, physx.PhysxArticulation):
+            meshes = []
+            for link in actor.links:
+                link_mesh = get_component_mesh(link, to_world_frame=True)
+                if link_mesh is not None and not link_mesh.is_empty:
+                    meshes.append(link_mesh)
+            if meshes:
+                mesh = trimesh.util.concatenate(meshes)
+            else:
+                logger.warning(
+                    f"Articulation {actor.name} has no valid meshes."
+                )
+                return None
+        else:
+            physx_rigid = actor.components[1]
+            mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+
         obb = mesh.bounding_box_oriented
         approaching = np.array([0, 0, -1])
         tcp_pose = self.agent.tcp.pose[env_idx]
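
For reference, below is a minimal standalone sketch (not part of the patch) that mirrors the movable-joint scan `is_urdf_articulated` introduces above. The two URDF snippets, `mug.urdf` and `cabinet.urdf`, are hypothetical fixtures, and `has_movable_joint` is a local re-implementation of the same `xml.etree.ElementTree` logic, so the rigid-vs-articulated branch can be sanity-checked without a SAPIEN/ManiSkill install.

```python
# Standalone sketch: re-implements the joint-type check used by
# is_urdf_articulated() so it can be tried without SAPIEN/ManiSkill.
# The two URDF snippets below are hypothetical test fixtures.
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path

MOVABLE_JOINT_TYPES = {"prismatic", "revolute", "continuous", "planar"}


def has_movable_joint(urdf_path: str) -> bool:
    """Return True if the URDF declares any non-fixed joint."""
    root = ET.parse(urdf_path).getroot()
    return any(
        joint.get("type") in MOVABLE_JOINT_TYPES
        for joint in root.findall(".//joint")
    )


RIGID_URDF = """<robot name="mug"><link name="body"/></robot>"""

ARTICULATED_URDF = """<robot name="cabinet">
  <link name="base"/>
  <link name="door"/>
  <joint name="hinge" type="revolute">
    <parent link="base"/>
    <child link="door"/>
  </joint>
</robot>"""

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        for name, text in [("mug.urdf", RIGID_URDF), ("cabinet.urdf", ARTICULATED_URDF)]:
            path = Path(tmp) / name
            path.write_text(text)
            # A rigid asset keeps the load_actor_from_urdf() path; an
            # articulated one would go through scene.create_urdf_loader() instead.
            print(f"{name}: articulated={has_movable_joint(str(path))}")
```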