From e5e298b8e3b6a11ad1bd7d8f4bc14646283cc916 Mon Sep 17 00:00:00 2001
From: Abhik Singla
Date: Tue, 28 Jun 2022 17:14:39 -0700
Subject: [PATCH 1/5] Adding support for adaptive behavior cloning

---
 rllib/agents/dqn/dqn.py           | 2 ++
 rllib/agents/sac/sac.py           | 2 ++
 rllib/agents/sac/sac_tf_policy.py | 8 +++++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/rllib/agents/dqn/dqn.py b/rllib/agents/dqn/dqn.py
index 696633d96459..cdf34d7e52e0 100644
--- a/rllib/agents/dqn/dqn.py
+++ b/rllib/agents/dqn/dqn.py
@@ -234,6 +234,8 @@ def execution_plan(workers: WorkerSet,
         # which means that we don't need the sampling pipeline setup
         for batch in input_reader.get_all():
             local_replay_buffer.add_batch(batch)
+        config["bc_iters"] = input_reader.total_iterations_count
+        workers.local_worker().policy_map['default_policy'].update_config(config)
     else:
         parallel_rollouts_mode = config.get("parallel_rollouts_mode", "bulk_sync")
         num_async = config.get("parallel_rollouts_num_async")
diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py
index 87f6c21254ae..9da70f7d6da5 100644
--- a/rllib/agents/sac/sac.py
+++ b/rllib/agents/sac/sac.py
@@ -84,6 +84,8 @@
     "normalize_actions": True,
     # Number of iterations to perform in the Behavior Cloning Pretraining
     "bc_iters": None,
+    # Whether to use adaptive Behavior Cloning Pretraining learning according to the data size.
+    "adaptive_bc": False,
 
     # === Learning ===
     # Disable setting done=True at end of episode. This should be set to True
diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py
index c7a14bd3dbdd..a03d2527f527 100644
--- a/rllib/agents/sac/sac_tf_policy.py
+++ b/rllib/agents/sac/sac_tf_policy.py
@@ -228,7 +228,13 @@ def sac_actor_critic_loss(
     # Should be True only for debugging purposes (e.g. test cases)!
     deterministic = policy.config["_deterministic_loss"]
     bc_iters = policy.config["bc_iters"]
-    bc_iters_const = (tf.constant(bc_iters, dtype=policy.global_step.dtype)
+    adaptive_bc = policy.config.get("adaptive_bc", False)
+    if adaptive_bc:
+        bc_iters_const = (tf1.placeholder(dtype=policy.global_step.dtype, shape=None, name="bc_iters_const")
+                          if bc_iters else None)
+        policy.bc_iters_const = bc_iters_const #TODO
+    else:
+        bc_iters_const = (tf.constant(bc_iters, dtype=policy.global_step.dtype)
                       if bc_iters else None)
 
     # Get the base model output from the train batch.
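Note (not part of the patch series): the point of the `adaptive_bc` path above is that the behavior-cloning cutoff is no longer baked into the graph as a constant but arrives through a placeholder, so it can be set from the offline dataset size (`input_reader.total_iterations_count`) after the policy graph is built. The snippet below is a minimal, self-contained sketch of how such a run-time cutoff can gate a loss in TF1 graph mode; the loss stand-ins, the feed value, and the scalar placeholder shape are illustrative and not the exact RLlib internals.

import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

global_step = tf1.train.get_or_create_global_step()
# Run-time BC cutoff: fed per session.run() instead of frozen as a graph constant.
bc_iters_const = tf1.placeholder(dtype=global_step.dtype, shape=(), name="bc_iters_const")

bc_loss = tf.constant(1.0)         # stand-in for the behavior-cloning (log-prob) loss
sac_actor_loss = tf.constant(2.0)  # stand-in for the regular SAC actor loss
# While global_step < bc_iters_const, train the actor with the BC objective.
actor_loss = tf.cond(global_step < bc_iters_const,
                     lambda: bc_loss,
                     lambda: sac_actor_loss)

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    # e.g. a cutoff derived from the offline dataset's iteration count.
    print(sess.run(actor_loss, feed_dict={bc_iters_const: 1000}))  # -> 1.0 (still in the BC phase)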
From ef8147bc3aff8b3c889f0d9231266440b5534a2c Mon Sep 17 00:00:00 2001
From: Abhik Singla
Date: Tue, 28 Jun 2022 17:24:25 -0700
Subject: [PATCH 2/5] cleanup

---
 rllib/agents/sac/sac_tf_policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py
index a03d2527f527..0dab161d26d9 100644
--- a/rllib/agents/sac/sac_tf_policy.py
+++ b/rllib/agents/sac/sac_tf_policy.py
@@ -232,7 +232,7 @@ def sac_actor_critic_loss(
     if adaptive_bc:
         bc_iters_const = (tf1.placeholder(dtype=policy.global_step.dtype, shape=None, name="bc_iters_const")
                           if bc_iters else None)
-        policy.bc_iters_const = bc_iters_const #TODO
+        policy.bc_iters_const = bc_iters_const
     else:
         bc_iters_const = (tf.constant(bc_iters, dtype=policy.global_step.dtype)
                       if bc_iters else None)

From b3a46ac56fb6a686bb2608123a26144777c981de Mon Sep 17 00:00:00 2001
From: Abhik Singla
Date: Tue, 28 Jun 2022 17:52:39 -0700
Subject: [PATCH 3/5] Support for bc epochs

---
 rllib/agents/sac/sac.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py
index 9da70f7d6da5..cb7e9921f7d3 100644
--- a/rllib/agents/sac/sac.py
+++ b/rllib/agents/sac/sac.py
@@ -86,6 +86,8 @@
     "bc_iters": None,
     # Whether to use adaptive Behavior Cloning Pretraining learning according to the data size.
     "adaptive_bc": False,
+    # Number of epochs to perform in the Behavior Cloning Pretraining
+    "bc_epochs": 1,
 
     # === Learning ===
     # Disable setting done=True at end of episode. This should be set to True

From f1d5c98c845e56e25bf057b769c8d84b2326f3cb Mon Sep 17 00:00:00 2001
From: Abhik Singla
Date: Tue, 28 Jun 2022 18:53:35 -0700
Subject: [PATCH 4/5] fix

---
 rllib/policy/tf_policy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rllib/policy/tf_policy.py b/rllib/policy/tf_policy.py
index 1479499b9274..c9be9473ec23 100644
--- a/rllib/policy/tf_policy.py
+++ b/rllib/policy/tf_policy.py
@@ -211,6 +211,7 @@ def __init__(self,
                 self.dist_class is not None:
             self._log_likelihood = self.dist_class(
                 self._dist_inputs, self.model).logp(self._action_input)
+        self.bc_iters_const: Optional[tf.Tensor] = None
 
     def variables(self):
         """Return the list of all savable variables for this policy."""
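Note (not part of the patch series): patch 3 only adds the `bc_epochs` option; the hunks shown in this series set `config["bc_iters"]` directly to `input_reader.total_iterations_count`, and the place where `bc_epochs` feeds into that cutoff is not part of this diff. One plausible derivation, purely as an illustration (the helper name below is hypothetical, not an RLlib API), is to treat one epoch as one pass over the offline dataset:

def derive_bc_iters(total_iterations_count: int, bc_epochs: int = 1) -> int:
    """Hypothetical helper: BC pretraining updates = epochs * iterations per pass."""
    return bc_epochs * total_iterations_count

# With 500 offline batches and bc_epochs=2, BC pretraining would cover 1000 updates.
assert derive_bc_iters(total_iterations_count=500, bc_epochs=2) == 1000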
From 4c8a8cc2e07b033214c8c70d0d820a15642b3296 Mon Sep 17 00:00:00 2001
From: Abhik Singla
Date: Mon, 11 Jul 2022 12:04:01 -0700
Subject: [PATCH 5/5] remove redundant experimental tag

---
 rllib/agents/sac/sac.py           |  2 --
 rllib/agents/sac/sac_tf_policy.py | 13 +++++--------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py
index cb7e9921f7d3..b0b3389ce018 100644
--- a/rllib/agents/sac/sac.py
+++ b/rllib/agents/sac/sac.py
@@ -84,8 +84,6 @@
     "normalize_actions": True,
     # Number of iterations to perform in the Behavior Cloning Pretraining
     "bc_iters": None,
-    # Whether to use adaptive Behavior Cloning Pretraining learning according to the data size.
-    "adaptive_bc": False,
     # Number of epochs to perform in the Behavior Cloning Pretraining
     "bc_epochs": 1,
 
diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py
index 0dab161d26d9..17a3a36e7433 100644
--- a/rllib/agents/sac/sac_tf_policy.py
+++ b/rllib/agents/sac/sac_tf_policy.py
@@ -228,14 +228,11 @@ def sac_actor_critic_loss(
     # Should be True only for debugging purposes (e.g. test cases)!
     deterministic = policy.config["_deterministic_loss"]
     bc_iters = policy.config["bc_iters"]
-    adaptive_bc = policy.config.get("adaptive_bc", False)
-    if adaptive_bc:
-        bc_iters_const = (tf1.placeholder(dtype=policy.global_step.dtype, shape=None, name="bc_iters_const")
-                          if bc_iters else None)
-        policy.bc_iters_const = bc_iters_const
-    else:
-        bc_iters_const = (tf.constant(bc_iters, dtype=policy.global_step.dtype)
-                      if bc_iters else None)
+    bc_iters_const = tf1.placeholder_with_default(
+        tf.constant(bc_iters, dtype=policy.global_step.dtype),
+        shape=None,
+        name="bc_iters_const")
+    policy.bc_iters_const = bc_iters_const
 
     # Get the base model output from the train batch.
     model_out_t, _ = model({
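Note (not part of the patch series): the final revision replaces the `adaptive_bc` branch with a single `tf1.placeholder_with_default`, so the graph falls back to the configured `bc_iters` when nothing is fed, while still letting callers override the cutoff at run time through `policy.bc_iters_const`. Below is a minimal sketch of that behavior; the value 500 and the int64 dtype are illustrative stand-ins for the config value and `policy.global_step.dtype`.

import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

bc_iters = 500  # stand-in for policy.config["bc_iters"]
bc_iters_const = tf1.placeholder_with_default(
    tf.constant(bc_iters, dtype=tf.int64), shape=None, name="bc_iters_const")

with tf1.Session() as sess:
    print(sess.run(bc_iters_const))                                    # -> 500 (config default)
    print(sess.run(bc_iters_const, feed_dict={bc_iters_const: 1000}))  # -> 1000 (overridden at run time)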