14 changes: 10 additions & 4 deletions README.md
@@ -223,14 +223,20 @@ See our paper published in NeurIPS 2025.

The text-to-image model is based on SD3.5 Medium; usage requires agreement to the [model license](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). All models are auto-downloaded on first run.

You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/bg_scenes/scene_list.txt`.
You can generate any desired room as background using `scene3d-cli`. As each scene takes approximately 30 minutes to generate, we recommend pre-generating them for efficiency and adding them to `outputs/example_gen_scenes/scene_part_list.txt`.

We provided some sample background assets created with `scene3d-cli`. Download them(~4G) using `hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs`.
We provide some sample background assets created with `scene3d-cli`. Download them (~2 GB) using:
```sh
hf download HorizonRobotics/EmbodiedGenData \
--repo-type dataset --local-dir outputs \
--include "example_gen_scenes/scene_00[01][0-9]/**" \
"example_gen_scenes/scene_part_list.txt"
```
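
A quick way to confirm the assets landed where `layout-cli` expects them (a sketch; it assumes `scene_part_list.txt` lists one scene entry per line):

```python
# Sketch: verify the sample backgrounds fetched by the command above.
# Assumes scene_part_list.txt lists one scene entry per line.
from pathlib import Path

scene_list = Path("outputs/example_gen_scenes/scene_part_list.txt")
assert scene_list.is_file(), "Run the hf download command above first."
print(f"{len(scene_list.read_text().splitlines())} background scenes available.")
```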

Generating one interactive 3D scene from a task description with `layout-cli` takes approximately 30 minutes.
```sh
layout-cli --task_descs "Place the pen in the mug on the desk" "Put the fruit on the table on the plate" \
--bg_list "outputs/bg_scenes/scene_list.txt" --output_root "outputs/layouts_gen" --insert_robot
--bg_list "outputs/example_gen_scenes/scene_part_list.txt" --output_root "outputs/layouts_gen" --insert_robot
```

<table>
@@ -245,7 +251,7 @@ Remove `--insert_robot` if you don't consider the robot pose in layout generation.
```sh
CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
--task_descs "apps/assets/example_layout/task_list.txt" \
--bg_list "outputs/bg_scenes/scene_list.txt" \
--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
--output_root "outputs/layouts_gens" --insert_robot > layouts_gens.log &
```
16 changes: 8 additions & 8 deletions docs/tutorials/layout_gen.md
@@ -19,14 +19,14 @@ Layout Generation enables the generation of diverse, physically realistic, and s
Before running `layout-cli`, you need to prepare background 3D scenes.
You can either **generate your own** using the [`scene3d-cli`](scene_gen.md), or **download pre-generated backgrounds** for convenience.

Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/bg_scenes/scene_list.txt`.
Each scene takes approximately **30 minutes** to generate. For efficiency, we recommend pre-generating and listing them in `outputs/example_gen_scenes/scene_part_list.txt`.

```bash
# Option 1: Download pre-generated backgrounds (~4 GB)
hf download xinjjj/scene3d-bg --repo-type dataset --local-dir outputs

# Option 2: Download a larger background set (~14 GB)
hf download xinjjj..RLv2-BG --repo-type dataset --local-dir outputs
# Download pre-generated backgrounds (~2 GB)
hf download HorizonRobotics/EmbodiedGenData \
--repo-type dataset --local-dir outputs \
--include "example_gen_scenes/scene_00[01][0-9]/**" \
"example_gen_scenes/scene_part_list.txt"
```
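
The same download can be scripted with `huggingface_hub` (a minimal sketch; `allow_patterns` plays the role of the CLI's `--include`):

```python
from huggingface_hub import snapshot_download

# Mirrors the CLI command above: fetch only the first 20 sample scenes
# (scene_0000..scene_0019) and the scene list file into outputs/.
snapshot_download(
    repo_id="HorizonRobotics/EmbodiedGenData",
    repo_type="dataset",
    local_dir="outputs",
    allow_patterns=[
        "example_gen_scenes/scene_00[01][0-9]/**",
        "example_gen_scenes/scene_part_list.txt",
    ],
)
```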

## Generate Interactive Layout Scenes
@@ -37,7 +37,7 @@ Use the `layout-cli` to create interactive 3D scenes based on task descriptions.
```sh
layout-cli \
--task_descs "Place the pen in the mug on the desk" \
"Put the fruit on the table on the plate" \
--bg_list "outputs/bg_scenes/scene_list.txt" \
--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--output_root "outputs/layouts_gen" \
--insert_robot
```
@@ -59,7 +59,7 @@ You can also run multiple tasks via a task list file in the background.
```sh
CUDA_VISIBLE_DEVICES=0 nohup layout-cli \
--task_descs "apps/assets/example_layout/task_list.txt" \
--bg_list "outputs/bg_scenes/scene_list.txt" \
--bg_list "outputs/example_gen_scenes/scene_part_list.txt" \
--n_image_retry 4 --n_asset_retry 3 --n_pipe_retry 3 \
--output_root "outputs/layouts_gens" \
--insert_robot > layouts_gens.log &
```
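
The format of `task_list.txt` is not shown here; assuming one task description per line, such a file can be generated with a few lines of Python (a hypothetical helper, not part of the repo):

```python
# Hypothetical sketch: write one task description per line, the assumed
# format of apps/assets/example_layout/task_list.txt.
tasks = [
    "Place the pen in the mug on the desk",
    "Put the fruit on the table on the plate",
]
with open("my_task_list.txt", "w") as f:
    f.write("\n".join(tasks) + "\n")
```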
8 changes: 8 additions & 0 deletions embodied_gen/scripts/imageto3d.py
@@ -153,6 +153,7 @@ def entrypoint(**kwargs):

seed = args.seed
asset_node = "unknown"
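# gs_model stays None unless a retry below succeeds; checked after the loop to skip failed images.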
gs_model = None
if isinstance(args.asset_type, list) and args.asset_type[idx]:
asset_node = args.asset_type[idx]
for try_idx in range(args.n_retry):
@@ -165,6 +166,9 @@
logger.error(
f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}"
)
seed = (
random.randint(0, 100000) if seed is not None else None
)
continue

gs_model = outputs["gaussian"][0]
@@ -208,6 +212,10 @@ def entrypoint(**kwargs):

seed = random.randint(0, 100000) if seed is not None else None

if gs_model is None:
logger.error(f"Exceed image3d retry num, skip {image_path}.")
continue

# Render the video for generated 3D asset.
color_images = render_video(gs_model, r=1.85)["color"]
normal_images = render_video(mesh_model, r=1.85)["normal"]
3 changes: 2 additions & 1 deletion embodied_gen/utils/gpt_clients.py
@@ -42,7 +42,8 @@
"GPTclient",
]

CONFIG_FILE = "embodied_gen/utils/gpt_config.yaml"
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_FILE = os.path.join(_CURRENT_DIR, "gpt_config.yaml")


class GPTclient:
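
The new `CONFIG_FILE` resolves relative to the module itself rather than the process working directory, so the YAML is found regardless of where the CLI is launched from. An equivalent `pathlib` spelling (a sketch, not the repo's code):

```python
from pathlib import Path

# Resolve gpt_config.yaml next to this module file, independent of the CWD.
CONFIG_FILE = str(Path(__file__).resolve().parent / "gpt_config.yaml")
```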
58 changes: 48 additions & 10 deletions embodied_gen/utils/simulation.py
@@ -25,6 +25,7 @@
import sapien.core as sapien
import sapien.physx as physx
import torch
import trimesh
from mani_skill.agents.base_agent import BaseAgent
from mani_skill.envs.scene import ManiSkillScene
from mani_skill.examples.motionplanning.panda.utils import (
@@ -57,9 +58,24 @@
"load_assets_from_layout_file",
"load_mani_skill_robot",
"render_images",
"is_urdf_articulated",
]


def is_urdf_articulated(urdf_path: str) -> bool:
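"""Return True if the URDF file defines any movable joint (prismatic, revolute, continuous, or planar)."""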
try:
tree = ET.parse(urdf_path)
root = tree.getroot()
for joint in root.findall(".//joint"):
j_type = joint.get("type")
if j_type in ["prismatic", "revolute", "continuous", "planar"]:
return True
return False
except Exception as e:
print(f"Error parsing URDF {urdf_path}: {e}.")
return False
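
A quick self-check of the joint-type heuristic (a sketch; the toy URDF is made up for illustration):

```python
import tempfile

# Toy URDF with one revolute joint: should be reported as articulated.
_URDF = """<robot name="toy">
  <link name="base"/><link name="lid"/>
  <joint name="hinge" type="revolute">
    <parent link="base"/><child link="lid"/>
  </joint>
</robot>"""

with tempfile.NamedTemporaryFile("w", suffix=".urdf", delete=False) as f:
    f.write(_URDF)
print(is_urdf_articulated(f.name))  # True: contains a revolute joint
```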


def load_actor_from_urdf(
scene: sapien.Scene | ManiSkillScene,
file_path: str,
@@ -203,14 +219,21 @@ def load_assets_from_layout_file(
# Combine initial quaternion with object quaternion
x, y, z, qx, qy, qz, qw = position
qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
actor = load_actor_from_urdf(
scene,
urdf_file,
sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz]),
env_idx,
use_static=use_static,
update_mass=False,
)
target_pose = sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz])
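# Articulated URDFs (movable joints) go through the articulation loader; plain rigid objects load as single actors.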
if is_urdf_articulated(urdf_file):
loader = scene.create_urdf_loader()
loader.fix_root_link = use_static
actor = loader.load(urdf_file)
actor.set_root_pose(target_pose)
else:
actor = load_actor_from_urdf(
scene,
urdf_file,
target_pose,
env_idx,
use_static=use_static,
update_mass=False,
)
actors[node] = actor

return actors
@@ -725,8 +748,23 @@ def compute_grasp_action(
Returns:
np.ndarray: Array of grasp actions.
"""
physx_rigid = actor.components[1]
mesh = get_component_mesh(physx_rigid, to_world_frame=True)
if isinstance(actor, physx.PhysxArticulation):
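# An articulation has no single rigid body; merge its per-link meshes so the OBB spans the whole object.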
meshes = []
for link in actor.links:
link_mesh = get_component_mesh(link, to_world_frame=True)
if link_mesh is not None and not link_mesh.is_empty:
meshes.append(link_mesh)
if meshes:
mesh = trimesh.util.concatenate(meshes)
else:
logger.warning(
f"Articulation {actor.name} has no valid meshes."
)
return None
else:
physx_rigid = actor.components[1]
mesh = get_component_mesh(physx_rigid, to_world_frame=True)

obb = mesh.bounding_box_oriented
approaching = np.array([0, 0, -1])
tcp_pose = self.agent.tcp.pose[env_idx]