diff --git a/README.md b/README.md
index 81d4cb0..2d16b48 100644
--- a/README.md
+++ b/README.md
@@ -223,14 +223,20 @@ See our paper published in NeurIPS 2025.
Text-to-image model based on SD3.5 Medium, usage requires agreement to the [model license](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). All models auto-downloaded at the first run.
-You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/bg_scenes/scene_list.txt`.
+You can generate any desired room as a background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/example_gen_scenes/scene_part_list.txt`.
-We provided some sample background assets created with `scene3d-cli`. Download them(~4G) using `hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs`.
+We provide some sample background assets created with `scene3d-cli`. Download them (~2 GB) using:
+```sh
+hf download HorizonRobotics/EmbodiedGenData \
+ --repo-type dataset --local-dir outputs \
+ --include "example_gen_scenes/scene_00[01][0-9]/**" \
+ "example_gen_scenes/scene_part_list.txt"
+```
Generating one interactive 3D scene from a task description with `layout-cli` takes approximately 30 minutes.
```sh
layout-cli --task_descs "Place the pen in the mug on the desk" "Put the fruit on the table on the plate" \
---bg_list "outputs/bg_scenes/scene_list.txt" --output_root "outputs/layouts_gen" --insert_robot
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" --output_root "outputs/layouts_gen" --insert_robot
```
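The `--include` filter above pulls only scenes `scene_0000` through `scene_0019` plus the scene list. To confirm the download is complete, you can check every entry of the list against disk; a minimal sketch, assuming `scene_part_list.txt` stores one scene path per line relative to `outputs/` (an assumption about the file format):

```python
# Hypothetical sanity check: every scene referenced by the list should exist
# on disk. Assumes one path per line, relative to `outputs/`.
from pathlib import Path

outputs = Path("outputs")
scene_list = outputs / "example_gen_scenes" / "scene_part_list.txt"
for line in scene_list.read_text().splitlines():
    entry = line.strip()
    if not entry:
        continue
    scene_path = outputs / entry
    print(("ok  " if scene_path.exists() else "MISS"), scene_path)
```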
@@ -245,7 +251,7 @@ Remove `--insert_robot` if you don't consider the robot pose in layout generatio
```sh
CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
--task_descs "apps/assets/example_layout/task_list.txt" \
---bg_list "outputs/bg_scenes/scene_list.txt" \
+--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
--output_root "outputs/layouts_gens" --insert_robot > layouts_gens.log &
```
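If you prefer launching the same backend run from Python (e.g., for scheduling or multi-GPU orchestration), a rough `subprocess` sketch using only the flags shown above; the wrapper itself is illustrative, not part of the repo:

```python
# Illustrative wrapper around the backend layout-cli invocation above.
import os
import subprocess

env = {**os.environ, "CUDA_VISIBLE_DEVICES": "0"}
with open("layouts_gens.log", "w") as log:
    proc = subprocess.Popen(
        [
            "layout-cli",
            "--task_descs", "apps/assets/example_layout/task_list.txt",
            "--bg_list", "outputs/example_gen_scenes/scene_part_list.txt",
            "--n_image_retry", "4",
            "--n_asset_retry", "3",
            "--n_pipe_retry", "3",
            "--output_root", "outputs/layouts_gens",
            "--insert_robot",
        ],
        env=env,
        stdout=log,
        stderr=subprocess.STDOUT,
    )
print(f"layout-cli started, PID {proc.pid}, logging to layouts_gens.log")
```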
diff --git a/docs/tutorials/layout_gen.md b/docs/tutorials/layout_gen.md
index 0f56543..3fb4e85 100644
--- a/docs/tutorials/layout_gen.md
+++ b/docs/tutorials/layout_gen.md
@@ -19,14 +19,14 @@ Layout Generation enables the generation of diverse, physically realistic, and s
Before running `layout-cli`, you need to prepare background 3D scenes.
You can either **generate your own** using the [`scene3d-cli`](scene_gen.md), or **download pre-generated backgrounds** for convenience.
-Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/bg_scenes/scene_list.txt`.
+Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/example_gen_scenes/scene_part_list.txt`.
```bash
-# Option 1: Download pre-generated backgrounds (~4 GB)
-hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs
-
-# Option 2: Download a larger background set (~14 GB)
-hf download xinjjj..RLv2-BG --repo-type dataset --local-dir outputs
+# Download pre-generated backgrounds (~2 GB)
+hf download HorizonRobotics/EmbodiedGenData \
+ --repo-type dataset --local-dir outputs \
+ --include "example_gen_scenes/scene_00[01][0-9]/**" \
+ "example_gen_scenes/scene_part_list.txt"
```
## Generate Interactive Layout Scenes
@@ -37,7 +37,7 @@ Use the `layout-cli` to create interactive 3D scenes based on task descriptions.
layout-cli \
--task_descs "Place the pen in the mug on the desk" \
"Put the fruit on the table on the plate" \
- --bg_list "outputs/bg_scenes/scene_list.txt" \
+ --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--output_root "outputs/layouts_gen" \
--insert_robot
```
@@ -59,7 +59,7 @@ You can also run multiple tasks via a task list file in the backend.
```sh
CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
--task_descs "apps/assets/example_layout/task_list.txt" \
- --bg_list "outputs/bg_scenes/scene_list.txt" \
+ --bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
--output_root "outputs/layouts_gens" \
--insert_robot > layouts_gens.log &
diff --git a/embodied_gen/scripts/imageto3d.py b/embodied_gen/scripts/imageto3d.py
index 9d930ab..13a1193 100644
--- a/embodied_gen/scripts/imageto3d.py
+++ b/embodied_gen/scripts/imageto3d.py
@@ -153,6 +153,7 @@ def entrypoint(**kwargs):
         seed = args.seed
         asset_node = "unknown"
+        gs_model = None
         if isinstance(args.asset_type, list) and args.asset_type[idx]:
             asset_node = args.asset_type[idx]
 
         for try_idx in range(args.n_retry):
@@ -165,6 +166,9 @@ def entrypoint(**kwargs):
                 logger.error(
                     f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}"
                 )
+                seed = (
+                    random.randint(0, 100000) if seed is not None else None
+                )
                 continue
 
             gs_model = outputs["gaussian"][0]
@@ -208,6 +212,10 @@ def entrypoint(**kwargs):
         seed = random.randint(0, 100000) if seed is not None else None
 
+        if gs_model is None:
+            logger.error(f"Exceeded image3d retry limit, skipping {image_path}.")
+            continue
+
         # Render the video for generated 3D asset.
         color_images = render_video(gs_model, r=1.85)["color"]
         normal_images = render_video(mesh_model, r=1.85)["normal"]
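The two hunks above implement a common retry idiom: on each failure, reroll the seed (only when seeding was requested) and retry; once retries are exhausted, skip the item instead of crashing on an unset `gs_model`. The control flow in isolation, with a hypothetical `generate` standing in for the image-to-3D call:

```python
# Sketch of the retry pattern above; `generate` is a hypothetical stand-in.
import random


def generate(path, seed):
    # Placeholder for the real image-to-3D pipeline call.
    if random.random() < 0.7:
        raise RuntimeError("generation failed")
    return {"gaussian": [f"gs_model({path}, seed={seed})"]}


def process(paths, seed=42, n_retry=3):
    for path in paths:
        result = None
        for try_idx in range(n_retry):
            try:
                result = generate(path, seed)
                break
            except Exception as e:
                print(f"failed {path}: {e}, retry {try_idx + 1}/{n_retry}")
                # Reroll only when seeding was requested, as in the patch.
                seed = random.randint(0, 100000) if seed is not None else None
        if result is None:
            print(f"retry limit reached, skipping {path}")
            continue
        print("done:", result["gaussian"][0])


process(["a.png", "b.png"])
```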
diff --git a/embodied_gen/utils/gpt_clients.py b/embodied_gen/utils/gpt_clients.py
index 32a9ea9..830dc77 100644
--- a/embodied_gen/utils/gpt_clients.py
+++ b/embodied_gen/utils/gpt_clients.py
@@ -42,7 +42,8 @@
"GPTclient",
]
-CONFIG_FILE = "embodied_gen/utils/gpt_config.yaml"
+_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+CONFIG_FILE = os.path.join(_CURRENT_DIR, "gpt_config.yaml")
class GPTclient:
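Resolving the config next to the module file makes `GPTclient` independent of the caller's working directory. An `importlib.resources` variant achieves the same and also works for zipped installs; a sketch assuming `gpt_config.yaml` is shipped as package data of `embodied_gen.utils` (a packaging assumption, not confirmed by the repo):

```python
# Alternative sketch: locate the packaged config via importlib.resources.
# Assumes gpt_config.yaml is declared as package data of embodied_gen.utils.
from importlib import resources

CONFIG_FILE = str(resources.files("embodied_gen.utils") / "gpt_config.yaml")
```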
diff --git a/embodied_gen/utils/simulation.py b/embodied_gen/utils/simulation.py
index 5ff13b6..7f828fb 100644
--- a/embodied_gen/utils/simulation.py
+++ b/embodied_gen/utils/simulation.py
@@ -25,6 +25,7 @@
import sapien.core as sapien
import sapien.physx as physx
import torch
+import trimesh
from mani_skill.agents.base_agent import BaseAgent
from mani_skill.envs.scene import ManiSkillScene
from mani_skill.examples.motionplanning.panda.utils import (
@@ -57,9 +58,24 @@
"load_assets_from_layout_file",
"load_mani_skill_robot",
"render_images",
+ "is_urdf_articulated",
]
+def is_urdf_articulated(urdf_path: str) -> bool:
+    """Check whether a URDF defines any movable (non-fixed) joint."""
+    try:
+        tree = ET.parse(urdf_path)
+        root = tree.getroot()
+        for joint in root.findall(".//joint"):
+            j_type = joint.get("type")
+            if j_type in ["prismatic", "revolute", "continuous", "planar"]:
+                return True
+        return False
+    except Exception as e:
+        logger.error(f"Error parsing URDF {urdf_path}: {e}.")
+        return False
+
+
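Note that URDFs whose joints are all `fixed` still count as plain rigid bodies here. A throwaway check of the helper against a tiny hand-written URDF (the file content and temp path are illustrative):

```python
# Illustrative check: a URDF with a revolute joint is detected as articulated.
import tempfile

ARTICULATED_URDF = """<robot name="cabinet">
  <link name="base"/>
  <link name="door"/>
  <joint name="hinge" type="revolute">
    <parent link="base"/>
    <child link="door"/>
  </joint>
</robot>"""

with tempfile.NamedTemporaryFile("w", suffix=".urdf", delete=False) as f:
    f.write(ARTICULATED_URDF)
assert is_urdf_articulated(f.name)  # revolute joint -> True
```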
 def load_actor_from_urdf(
     scene: sapien.Scene | ManiSkillScene,
     file_path: str,
@@ -203,14 +219,21 @@ def load_assets_from_layout_file(
         # Combine initial quaternion with object quaternion
         x, y, z, qx, qy, qz, qw = position
         qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
-        actor = load_actor_from_urdf(
-            scene,
-            urdf_file,
-            sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz]),
-            env_idx,
-            use_static=use_static,
-            update_mass=False,
-        )
+        target_pose = sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz])
+        if is_urdf_articulated(urdf_file):
+            # Movable joints: keep the articulation intact via the URDF loader.
+            loader = scene.create_urdf_loader()
+            loader.fix_root_link = use_static
+            actor = loader.load(urdf_file)
+            actor.set_root_pose(target_pose)
+        else:
+            actor = load_actor_from_urdf(
+                scene,
+                urdf_file,
+                target_pose,
+                env_idx,
+                use_static=use_static,
+                update_mass=False,
+            )
         actors[node] = actor
 
     return actors
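The new branch routes movable URDFs through SAPIEN's URDF loader, because flattening them into a single rigid actor (what `load_actor_from_urdf` does) would freeze their joints. A sketch of inspecting such a load result, assuming SAPIEN 3.x-style scene creation and a placeholder `cabinet.urdf`:

```python
# Sketch: load an articulated URDF and confirm its joints survive loading.
# Assumes SAPIEN 3.x (`sapien.Scene()`); "cabinet.urdf" is a placeholder path.
import sapien

scene = sapien.Scene()
loader = scene.create_urdf_loader()
loader.fix_root_link = True  # mirrors use_static in the patch
articulation = loader.load("cabinet.urdf")
articulation.set_root_pose(sapien.Pose(p=[0.0, 0.0, 0.5]))
print([joint.name for joint in articulation.get_active_joints()])
```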
@@ -725,8 +748,23 @@ def compute_grasp_action(
     Returns:
-        np.ndarray: Array of grasp actions.
+        np.ndarray | None: Array of grasp actions, or None if the actor
+            has no valid collision mesh.
     """
-        physx_rigid = actor.components[1]
-        mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+        if isinstance(actor, physx.PhysxArticulation):
+            # Merge all link meshes so the OBB covers the whole articulation.
+            meshes = []
+            for link in actor.links:
+                link_mesh = get_component_mesh(link, to_world_frame=True)
+                if link_mesh is not None and not link_mesh.is_empty:
+                    meshes.append(link_mesh)
+            if meshes:
+                mesh = trimesh.util.concatenate(meshes)
+            else:
+                logger.warning(
+                    f"Articulation {actor.name} has no valid meshes."
+                )
+                return None
+        else:
+            physx_rigid = actor.components[1]
+            mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+
         obb = mesh.bounding_box_oriented
         approaching = np.array([0, 0, -1])
         tcp_pose = self.agent.tcp.pose[env_idx]
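Merging the link meshes before calling `bounding_box_oriented` makes the grasp OBB cover the articulation as a whole rather than a single link. The geometric core in isolation, with two trimesh boxes standing in for link meshes:

```python
# Sketch of the merge-then-OBB step with stand-in geometry.
import trimesh

link_a = trimesh.creation.box(extents=(0.2, 0.1, 0.1))
link_b = trimesh.creation.box(extents=(0.1, 0.1, 0.3))
link_b.apply_translation((0.15, 0.0, 0.1))

merged = trimesh.util.concatenate([link_a, link_b])
obb = merged.bounding_box_oriented
print(obb.primitive.extents)    # size of the tight oriented box
print(obb.primitive.transform)  # pose of the box in the mesh frame
```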