From 9f32b43049e2a8290859263db42c23ca9e37ff7b Mon Sep 17 00:00:00 2001
From: Hoang Tran
Date: Tue, 25 Nov 2025 21:09:54 +0000
Subject: [PATCH 1/2] add placeholder

---
 sections/05_foundation_models.tex   | 5 +++++
 snippets/ch5/03_training_smolvla.py | 1 +
 2 files changed, 6 insertions(+)
 create mode 100644 snippets/ch5/03_training_smolvla.py

diff --git a/sections/05_foundation_models.tex b/sections/05_foundation_models.tex
index 884d4ac..b6524fc 100644
--- a/sections/05_foundation_models.tex
+++ b/sections/05_foundation_models.tex
@@ -239,3 +239,8 @@ \subsubsection{Code Example: Using SmolVLA}
 \begin{pbox}[label={ex:using-smolvla}]{Using SmolVLA \\ \url{https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/02_using_smolvla.py}}
     \lstinputlisting[language=python]{snippets/ch5/02_using_smolvla.py}
 \end{pbox}
+
+\subsubsection{Code Example: Training SmolVLA}
+\begin{pbox}[label={ex:training-smolvla}]{Training SmolVLA \\ \url{https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/03_training_smolvla.py}}
+    \lstinputlisting[language=python]{snippets/ch5/03_training_smolvla.py}
+\end{pbox}
\ No newline at end of file
diff --git a/snippets/ch5/03_training_smolvla.py b/snippets/ch5/03_training_smolvla.py
new file mode 100644
index 0000000..aaa3285
--- /dev/null
+++ b/snippets/ch5/03_training_smolvla.py
@@ -0,0 +1 @@
+import torch

From 30acd45691cfe1032606d713a4c5f9feef1cc9c0 Mon Sep 17 00:00:00 2001
From: Hoang Tran
Date: Sun, 14 Dec 2025 15:47:33 +0000
Subject: [PATCH 2/2] add complete training script

---
 snippets/ch5/03_training_smolvla.py | 205 ++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)

diff --git a/snippets/ch5/03_training_smolvla.py b/snippets/ch5/03_training_smolvla.py
index aaa3285..a5115ed 100644
--- a/snippets/ch5/03_training_smolvla.py
+++ b/snippets/ch5/03_training_smolvla.py
@@ -1 +1,206 @@
+from pathlib import Path
+
 import torch
+
+from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
+from lerobot.policies.factory import make_pre_post_processors
+from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
+from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
+
+
+# Output directory for saving the trained model
+output_directory = Path("outputs/train/my_smolvla")
+output_directory.mkdir(parents=True, exist_ok=True)
+
+device = torch.device("mps")  # or "cuda" or "cpu"
+
+# Replace with your custom dataset repo_id (e.g., "${HF_USER}/mydataset")
+dataset_id = "lerobot/svla_so100_pickplace"
+
+# Model configuration
+# Option 1: Load from a pretrained SmolVLA checkpoint (recommended for fine-tuning)
+pretrained_model_id = "lerobot/smolvla_base"
+load_from_pretrained = True  # Set to False to train from scratch
+
+# Option 2: Train from scratch (only if you have a large dataset and sufficient compute)
+# This will initialize the model with a pretrained VLM backbone but a random action expert
+# load_from_pretrained = False
+
+# Load dataset metadata to get features and statistics
+print(f"Loading dataset metadata from {dataset_id}...")
+dataset_metadata = LeRobotDatasetMetadata(dataset_id)
+
+if load_from_pretrained:
+    print(f"Loading pretrained model from {pretrained_model_id}...")
+    policy = SmolVLAPolicy.from_pretrained(pretrained_model_id)
+
+    # Create a rename map to match dataset keys to the model's expected keys
+    rename_map = {
+        "observation.images.top": "observation.images.camera1",
+        "observation.images.wrist": "observation.images.camera2",
+    }
+
+    # Create preprocessor and postprocessor with dataset statistics
+    # This is important for normalizing inputs/outputs to match your dataset
+    preprocessor, postprocessor = make_pre_post_processors(
+        policy.config,
+        pretrained_path=pretrained_model_id,
+        preprocessor_overrides={
+            "device_processor": {"device": str(device)},
+            "rename_observations_processor": {"rename_map": rename_map},
+            "normalizer_processor": {
+                "stats": dataset_metadata.stats,
+                "features": {**policy.config.input_features, **policy.config.output_features},
+                "norm_map": policy.config.normalization_mapping,
+            },
+        },
+        postprocessor_overrides={
+            "unnormalizer_processor": {
+                "stats": dataset_metadata.stats,
+                "features": policy.config.output_features,
+                "norm_map": policy.config.normalization_mapping,
+            },
+        },
+    )
+else:
+    print("Initializing new SmolVLA model from scratch...")
+    # Note: Training from scratch requires careful configuration;
+    # the input/output features must match your dataset structure
+    from lerobot.configs.types import FeatureType
+    from lerobot.datasets.utils import dataset_to_policy_features
+
+    features = dataset_to_policy_features(dataset_metadata.features)
+    output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
+    input_features = {key: ft for key, ft in features.items() if key not in output_features}
+
+    cfg = SmolVLAConfig(input_features=input_features, output_features=output_features)
+    cfg.load_vlm_weights = True  # Load pretrained VLM backbone
+    policy = SmolVLAPolicy(cfg)
+
+    preprocessor, postprocessor = make_pre_post_processors(
+        cfg, dataset_stats=dataset_metadata.stats
+    )
+
+# Set the policy to training mode and move it to the target device
+policy.train()
+policy.to(device)
+
+
+def make_delta_timestamps(delta_indices: list[int] | None, fps: int) -> list[float]:
+    """Convert delta indices to delta timestamps based on the dataset FPS."""
+    if delta_indices is None:
+        return [0.0]
+    return [i / fps for i in delta_indices]
+
+
+# SmolVLA expects action sequences of length chunk_size (default 50)
+# and single observation frames (n_obs_steps=1)
+delta_timestamps = {
+    "action": make_delta_timestamps(policy.config.action_delta_indices, dataset_metadata.fps),
+}
+
+# Add delta timestamps for image features that actually exist in the dataset
+dataset_image_keys = [k for k in dataset_metadata.features.keys() if "image" in k.lower()]
+delta_timestamps |= {
+    k: make_delta_timestamps(policy.config.observation_delta_indices, dataset_metadata.fps)
+    for k in dataset_image_keys
+}
+
+# Add a delta timestamp for the state if present
+if "observation.state" in dataset_metadata.features:
+    delta_timestamps["observation.state"] = make_delta_timestamps(
+        policy.config.observation_delta_indices, dataset_metadata.fps
+    )
+
+# Load the dataset with the appropriate delta timestamps
+print(f"Loading dataset {dataset_id}...")
+dataset = LeRobotDataset(dataset_id, delta_timestamps=delta_timestamps)
+print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+
+# Training configuration
+batch_size = 64  # Adjust based on your GPU memory
+training_steps = 20000  # Number of training steps (increase for better performance)
+log_freq = 100  # Log every N steps
+
+# Create optimizer and scheduler using SmolVLA's preset configurations
+optimizer = policy.config.get_optimizer_preset().build(policy.parameters())
+lr_scheduler = policy.config.get_scheduler_preset().build(optimizer, num_training_steps=training_steps)
+
+# Create dataloader for offline training
+dataloader = torch.utils.data.DataLoader(
+    dataset,
+    batch_size=batch_size,
+    shuffle=True,
+    pin_memory=device.type == "cuda",
+    drop_last=True,
+    num_workers=4,  # Adjust based on your system
+)
+
+print(f"\nStarting training for {training_steps} steps...")
+print(f"Batch size: {batch_size}")
+print(f"Device: {device}")
+print(f"Output directory: {output_directory}")
+print("-" * 80)
+
+# Training loop
+step = 0
+done = False
+while not done:
+    for batch in dataloader:
+        # Preprocess the batch (normalization, tokenization, etc.)
+        batch = preprocessor(batch)
+
+        # Forward pass: compute the loss
+        loss, output_dict = policy.forward(batch)
+
+        # Backward pass and optimization
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        # Update the learning rate
+        if lr_scheduler is not None:
+            lr_scheduler.step()
+
+        # Logging
+        if step % log_freq == 0:
+            current_lr = optimizer.param_groups[0]["lr"]
+            print(
+                f"Step: {step}/{training_steps} | "
+                f"Loss: {loss.item():.4f} | "
+                f"LR: {current_lr:.2e}"
+            )
+
+        step += 1
+        if step >= training_steps:
+            done = True
+            break
+
+print("-" * 80)
+print("Training completed!")
+
+# Save the trained model
+print(f"\nSaving model to {output_directory}...")
+policy.save_pretrained(output_directory)
+preprocessor.save_pretrained(output_directory)
+postprocessor.save_pretrained(output_directory)
+print("Model saved successfully!")
+
+# Optional: Push to the Hugging Face Hub
+# Update with your Hugging Face username
+push_to_hub = False  # Set to True to push to the Hub
+hub_repo_id = "YOUR_HF_USERNAME/my_smolvla_so101"  # Replace with your repo ID
+
+if push_to_hub:
+    print(f"\nPushing model to the Hugging Face Hub: {hub_repo_id}...")
+    policy.push_to_hub(hub_repo_id)
+    preprocessor.push_to_hub(hub_repo_id)
+    postprocessor.push_to_hub(hub_repo_id)
+    print(f"Model pushed to: https://huggingface.co/{hub_repo_id}")
+
+print("\n" + "=" * 80)
+print("Training complete! Next steps:")
+print("1. Test the model with: snippets/ch5/02_using_smolvla.py")
+print(f"2. Update model_id in the script to: {output_directory}")
+print("3. Deploy on your SO101 robot!")
+print("=" * 80)
\ No newline at end of file
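
Usage note (not part of the patches themselves): once training finishes, the
checkpoint saved in outputs/train/my_smolvla can be loaded back for inference.
Below is a minimal sketch; it reuses only calls that appear in the script
above, and the checkpoint path and device string are assumptions to adapt to
your setup.

    from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

    # Assumed checkpoint path: matches output_directory in the training script.
    checkpoint_dir = "outputs/train/my_smolvla"

    # Load the fine-tuned weights from the local checkpoint directory
    policy = SmolVLAPolicy.from_pretrained(checkpoint_dir)
    policy.eval()
    policy.to("cpu")  # assumption: use "cuda" or "mps" to match your hardware

    # At inference time, observations must pass through the saved preprocessor
    # (key renaming, normalization, device placement) before calling
    # policy.select_action(...); snippets/ch5/02_using_smolvla.py shows the
    # full observation-building and control loop.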