From c0bebb99f81598a4d71b6541197c95c8064f43eb Mon Sep 17 00:00:00 2001
From: "xinjie.wang"
Date: Wed, 14 Jan 2026 14:44:10 +0800
Subject: [PATCH] update

---
 README.md                         | 14 +++++---
 docs/tutorials/layout_gen.md      | 16 ++++-----
 embodied_gen/scripts/imageto3d.py |  8 +++++
 embodied_gen/utils/gpt_clients.py |  3 +-
 embodied_gen/utils/simulation.py  | 58 +++++++++++++++++++++++++------
 5 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 81d4cb0..2d16b48 100644
--- a/README.md
+++ b/README.md
@@ -223,14 +223,20 @@ See our paper published in NeurIPS 2025.
 Text-to-image model based on SD3.5 Medium, usage requires agreement to the [model license](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). All models auto-downloaded at the first run.
 
-You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/bg_scenes/scene_list.txt`.
+You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/example_gen_scenes/scene_part_list.txt`.
 
-We provided some sample background assets created with `scene3d-cli`. Download them(~4G) using `hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs`.
+We provide some sample background assets created with `scene3d-cli`. Download them (~2 GB) using:
+```sh
+hf download HorizonRobotics/EmbodiedGenData \
+    --repo-type dataset --local-dir outputs \
+    --include "example_gen_scenes/scene_00[01][0-9]/**" \
+    "example_gen_scenes/scene_part_list.txt"
+```
 
 Generating one interactive 3D scene from task description with `layout-cli` takes approximately 30 minutes.
 ```sh
 layout-cli --task_descs "Place the pen in the mug on the desk" "Put the fruit on the table on the plate" \
---bg_list "outputs/bg_scenes/scene_list.txt" --output_root "outputs/layouts_gen" --insert_robot
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" --output_root "outputs/layouts_gen" --insert_robot
 ```
@@ -245,7 +251,7 @@ Remove `--insert_robot` if you don't consider the robot pose in layout generatio
 ```sh
 CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
 --task_descs "apps/assets/example_layout/task_list.txt" \
---bg_list "outputs/bg_scenes/scene_list.txt" \
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
 --n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
 --output_root "outputs/layouts_gens" --insert_robot > layouts_gens.log &
 ```
diff --git a/docs/tutorials/layout_gen.md b/docs/tutorials/layout_gen.md
index 0f56543..3fb4e85 100644
--- a/docs/tutorials/layout_gen.md
+++ b/docs/tutorials/layout_gen.md
@@ -19,14 +19,14 @@ Layout Generation enables the generation of diverse, physically realistic, and s
 Before running `layout-cli`, you need to prepare background 3D scenes. You can either **generate your own** using the [`scene3d-cli`](scene_gen.md), or **download pre-generated backgrounds** for convenience.
 
-Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/bg_scenes/scene_list.txt`.
+Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/example_gen_scenes/scene_part_list.txt`.
 
 ```bash
-# Option 1: Download pre-generated backgrounds (~4 GB)
-hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs
-
-# Option 2: Download a larger background set (~14 GB)
-hf download xinjjj..RLv2-BG --repo-type dataset --local-dir outputs
+# Download pre-generated backgrounds (~2 GB)
+hf download HorizonRobotics/EmbodiedGenData \
+    --repo-type dataset --local-dir outputs \
+    --include "example_gen_scenes/scene_00[01][0-9]/**" \
+    "example_gen_scenes/scene_part_list.txt"
 ```
 
 ## Generate Interactive Layout Scenes
@@ -37,7 +37,7 @@ Use the `layout-cli` to create interactive 3D scenes based on task descriptions.
 layout-cli \
     --task_descs "Place the pen in the mug on the desk" \
         "Put the fruit on the table on the plate" \
-    --bg_list "outputs/bg_scenes/scene_list.txt" \
+    --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
     --output_root "outputs/layouts_gen" \
     --insert_robot
 ```
@@ -59,7 +59,7 @@ You can also run multiple tasks via a task list file in the backend.
 ```sh
 CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
     --task_descs "apps/assets/example_layout/task_list.txt" \
-    --bg_list "outputs/bg_scenes/scene_list.txt" \
+    --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
     --n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
     --output_root "outputs/layouts_gens" \
     --insert_robot > layouts_gens.log &
diff --git a/embodied_gen/scripts/imageto3d.py b/embodied_gen/scripts/imageto3d.py
index 9d930ab..13a1193 100644
--- a/embodied_gen/scripts/imageto3d.py
+++ b/embodied_gen/scripts/imageto3d.py
@@ -153,6 +153,7 @@ def entrypoint(**kwargs):
         seed = args.seed
         asset_node = "unknown"
+        gs_model = None
         if isinstance(args.asset_type, list) and args.asset_type[idx]:
             asset_node = args.asset_type[idx]
         for try_idx in range(args.n_retry):
@@ -165,6 +166,9 @@
                 logger.error(
                     f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}"
                 )
+                seed = (
+                    random.randint(0, 100000) if seed is not None else None
+                )
                 continue
 
             gs_model = outputs["gaussian"][0]
@@ -208,6 +212,10 @@
             seed = random.randint(0, 100000) if seed is not None else None
 
+        if gs_model is None:
+            logger.error(f"Exceeded image3d retry limit, skipping {image_path}.")
+            continue
+
         # Render the video for generated 3D asset.
         color_images = render_video(gs_model, r=1.85)["color"]
         normal_images = render_video(mesh_model, r=1.85)["normal"]
diff --git a/embodied_gen/utils/gpt_clients.py b/embodied_gen/utils/gpt_clients.py
index 32a9ea9..830dc77 100644
--- a/embodied_gen/utils/gpt_clients.py
+++ b/embodied_gen/utils/gpt_clients.py
@@ -42,7 +42,8 @@
     "GPTclient",
 ]
 
-CONFIG_FILE = "embodied_gen/utils/gpt_config.yaml"
+_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+CONFIG_FILE = os.path.join(_CURRENT_DIR, "gpt_config.yaml")
 
 
 class GPTclient:
diff --git a/embodied_gen/utils/simulation.py b/embodied_gen/utils/simulation.py
index 5ff13b6..7f828fb 100644
--- a/embodied_gen/utils/simulation.py
+++ b/embodied_gen/utils/simulation.py
@@ -25,6 +25,7 @@
 import sapien.core as sapien
 import sapien.physx as physx
 import torch
+import trimesh
 from mani_skill.agents.base_agent import BaseAgent
 from mani_skill.envs.scene import ManiSkillScene
 from mani_skill.examples.motionplanning.panda.utils import (
@@ -57,9 +58,24 @@
     "load_assets_from_layout_file",
     "load_mani_skill_robot",
     "render_images",
+    "is_urdf_articulated",
 ]
 
 
+def is_urdf_articulated(urdf_path: str) -> bool:
+    try:
+        tree = ET.parse(urdf_path)
+        root = tree.getroot()
+        for joint in root.findall(".//joint"):
+            j_type = joint.get("type")
+            if j_type in ["prismatic", "revolute", "continuous", "planar"]:
+                return True
+        return False
+    except Exception as e:
+        logger.warning(f"Error parsing URDF {urdf_path}: {e}.")
+        return False
+
+
 def load_actor_from_urdf(
     scene: sapien.Scene | ManiSkillScene,
     file_path: str,
@@ -203,14 +219,21 @@
         # Combine initial quaternion with object quaternion
         x, y, z, qx, qy, qz, qw = position
         qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
-        actor = load_actor_from_urdf(
-            scene,
-            urdf_file,
-            sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz]),
-            env_idx,
-            use_static=use_static,
-            update_mass=False,
-        )
+        target_pose = sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz])
+        if is_urdf_articulated(urdf_file):
+            loader = scene.create_urdf_loader()
+            loader.fix_root_link = use_static
+            actor = loader.load(urdf_file)
+            actor.set_root_pose(target_pose)
+        else:
+            actor = load_actor_from_urdf(
+                scene,
+                urdf_file,
+                target_pose,
+                env_idx,
+                use_static=use_static,
+                update_mass=False,
+            )
         actors[node] = actor
 
     return actors
@@ -725,8 +748,23 @@
         Returns:
             np.ndarray: Array of grasp actions.
         """
-        physx_rigid = actor.components[1]
-        mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+        if isinstance(actor, physx.PhysxArticulation):
+            meshes = []
+            for link in actor.links:
+                link_mesh = get_component_mesh(link, to_world_frame=True)
+                if link_mesh is not None and not link_mesh.is_empty:
+                    meshes.append(link_mesh)
+            if meshes:
+                mesh = trimesh.util.concatenate(meshes)
+            else:
+                logger.warning(
+                    f"Articulation {actor.name} has no valid meshes."
+                )
+                return None
+        else:
+            physx_rigid = actor.components[1]
+            mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+
         obb = mesh.bounding_box_oriented
         approaching = np.array([0, 0, -1])
         tcp_pose = self.agent.tcp.pose[env_idx]
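
For reference, below is a minimal standalone sketch (not part of the patch) that mirrors the movable-joint scan `is_urdf_articulated` introduces above. The two URDF snippets, `mug.urdf` and `cabinet.urdf`, are hypothetical fixtures, and `has_movable_joint` is a local re-implementation of the same `xml.etree.ElementTree` logic, so the rigid-vs-articulated branch can be sanity-checked without a SAPIEN/ManiSkill install.

```python
# Standalone sketch: re-implements the joint-type check used by
# is_urdf_articulated() so it can be tried without SAPIEN/ManiSkill.
# The two URDF snippets below are hypothetical test fixtures.
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path

MOVABLE_JOINT_TYPES = {"prismatic", "revolute", "continuous", "planar"}


def has_movable_joint(urdf_path: str) -> bool:
    """Return True if the URDF declares any non-fixed joint."""
    root = ET.parse(urdf_path).getroot()
    return any(
        joint.get("type") in MOVABLE_JOINT_TYPES
        for joint in root.findall(".//joint")
    )


RIGID_URDF = """<robot name="mug"><link name="body"/></robot>"""

ARTICULATED_URDF = """<robot name="cabinet">
  <link name="base"/>
  <link name="door"/>
  <joint name="hinge" type="revolute">
    <parent link="base"/>
    <child link="door"/>
  </joint>
</robot>"""

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        for name, text in [("mug.urdf", RIGID_URDF), ("cabinet.urdf", ARTICULATED_URDF)]:
            path = Path(tmp) / name
            path.write_text(text)
            # A rigid asset keeps the load_actor_from_urdf() path; an
            # articulated one would go through scene.create_urdf_loader() instead.
            print(f"{name}: articulated={has_movable_joint(str(path))}")
```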