From 8c0f7c6dbfa7906e7a28a596420aec9cf760e4ac Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Sun, 30 Jun 2019 22:46:38 -0800 Subject: [PATCH 1/6] Committing code to use the feedback from safety monitors in a variety of ways (excluding use of CPO), In particular, the monitor is worked into the environment and used to provide: *activation of a safety fallback policy *modification of the reward to encorporate the monitor signal *provide additional features to the observations received by the controller. --- contracts.py | 5 + envs/monitorEncorporated_env.py | 278 ++++++++++++++++++ test/test_monitorEncorporated_straight.py | 160 ++++++++++ ...in_monitorEncorporated_straight_planner.py | 234 +++++++++++++++ 4 files changed, 677 insertions(+) create mode 100644 contracts.py create mode 100644 envs/monitorEncorporated_env.py create mode 100644 test/test_monitorEncorporated_straight.py create mode 100644 train/train_monitorEncorporated_straight_planner.py diff --git a/contracts.py b/contracts.py new file mode 100644 index 0000000..fd2fdbe --- /dev/null +++ b/contracts.py @@ -0,0 +1,5 @@ +def requires(booleanStatement): + assert(booleanStatement); + +def ensures(booleanStatement): + assert(booleanStatement); diff --git a/envs/monitorEncorporated_env.py b/envs/monitorEncorporated_env.py new file mode 100644 index 0000000..5d9513f --- /dev/null +++ b/envs/monitorEncorporated_env.py @@ -0,0 +1,278 @@ +from aa_simulation.contracts import *; +from rllab.envs.proxy_env import ProxyEnv; +from aa_simulation.envs.base_env import VehicleEnv; +from rllab.policies.base import Policy; +import numpy as np; + +from rllab.core.serializable import Serializable; +from rllab.spaces import Box; + +from rllab.envs.base import Step; + +def isProperMonitorSubFormula(thisProposedSubFormula): + return isinstance(thisProposedSubFormula, str); + +# In triple quotes below: the original implementation of +# isProperMonitorSubFormula prior to running into +# substantial problems pickling functions between components +""" +super(MonitorEncorporatedEnv, self).__init__(wrapped_env); + if(str(type(thisProposedSubFormula)) != ""): + return False; + if(thisProposedSubFormula.__code__.co_argcount != 2): + return False; + return True; +""" + + +class MonitorEncorporatedEnv(ProxyEnv): + """ + MonitorEncorporatedEnv: this class provides a way for transforming an instance of the + VehicleEnv class (the "wrapped environment") to an environment where the monitor is used + in a variety of ways. The monitor information is provided as a list of the quantitative + subformulas - for instance if the monitor is ((A < B) AND (C < D) AND (E < F)), then the + monitor is provided as [B -A, D -C, E -F]. The monitor information can be used in any subset + of the following: + (1) Activate fallback controller: in the case the monitor is violated, a fallback controller + is used to dictate the actions to be take as opposed to the agent interacting with + the environment. This occurs in all and only the situations where the monitor is + violated. To disable this functionality, simply pass in None for codeForFallbackController; + the actions provided through the step-function (see the code below) to the environment + then will always be acted out. + (2) Influence the reward returned; the reward returned by the environment is a weighted + combination of the reward given by the wrapped-environment and the value given by the + quantitative monitor. 
Specifically, the reward given is:
+ reward = rewardFromWrappedEnvironment + \
+ weightForQuantMonitorValueInReward * min(B -A, D -C, E -F)
+ To effectively disable this functionality, set weightForQuantMonitorValueInReward to 0.0 .
+ (3) Additional features in observations: in addition to the features provided by the
+ wrapped-environment, the quantitative-monitor subformulas can be provided as additional
+ features for a state. For instance, if the initial feature vector is:
+ [f_1, f_2, ..., f_{n-1}, f_n]
+ the features can be expanded to include:
+ [f_1, f_2, ..., f_{n-1}, f_n, B -A, D -C, E -F]
+ Note that since we use the quantitative monitor subformulas, the features vary over the
+ portion of the state-space where the monitor is not violated. This is in contrast to using
+ the boolean subformulas of the original monitor, whose binary values would not vary over
+ the safe set - the moment any one of them changed, the environment would trigger the
+ fallback controller to take over the vehicle, so having those features in such an
+ arrangement would be of little utility. To enable these additional features, set
+ useQuantMonitorSubformulasAsFeatures to true, and to disable, set
+ useQuantMonitorSubformulasAsFeatures to false.
+ Again, any subset of the above three options is valid - so there are at least 8 general modes of
+ operation for this class.
+
+
+ A Note on Some Unfortunate Hacks Made To Get The rllab Infrastructure to Work With This Code:
+ Unfortunately, various parts of the rllab code try to do clever things with pickling when
+ saving results and passing parameters around in the infrastructure. This limits the extent to
+ which plain functions can be passed around as parameters - while cloudpickle can be substituted
+ for pickle in many places in the rllab code, at least three challenges remain there: (1) rllab uses
+ some functionality of pickle not supported by cloudpickle (specific attributes pickle has that
+ cloudpickle does not), (2) rllab is a project outside the general control of the aa-group, and
+ the code base for it has been frozen for some time in favor of developing a new platform; as
+ such, we would have to modify our own local copy of rllab and distribute it to anyone in the
+ aa-group who wants to use it, (3) in addition to the python package "pickle", rllab also takes
+ advantage of numpy pickle functions that apparently have some similar issues.
+
+ As a work-around to the difficulties listed above, the code was changed to accept source text
+ for functions in place of the python functions themselves. That is, instead of passing in, say,
+ lambda x: x + 2
+ the code requires that the string
+ "x + 2"
+ be passed in. Specifically, the elements of quantitativeMonitorSubFormulas must be strings that can
+ be evaluated by the python built-in eval , and codeForFallbackController must be text
+ evaluatable by the python built-in eval and must define the function fallbackController .
+ Plans for near-future development include investigating better ways to handle the circumstances.
+ For the first swing at developing these functionalities, this arrangement should be sufficient
+ and not overly brittle nor overly complex.
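
    For concreteness, construction along the following lines is the intended usage. The
    particular subformula string, weight, and fallback code shown here are illustrative
    values only, not defaults of this class (the subformula is a quantitative counterpart
    of the boolean y-position check used in train_monitorEncorporated_straight_planner.py):

        baseEnv = StraightEnv(target_velocity=1.0, dt=0.1,
                              model_type='BrushTireModel', robot_type='RCCar')
        env = MonitorEncorporatedEnv(
            wrapped_env=baseEnv,
            # positive while the car stays above y = -0.01, negative on violation
            quantitativeMonitorSubFormulas=["state[1] + 0.01"],
            weightForQuantMonitorValueInReward=0.5,
            codeForFallbackController="fallbackController = (lambda state: np.array([0, 0]))",
            useQuantMonitorSubformulasAsFeatures=True)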
+ """ + + def __init__(self, wrapped_env, quantitativeMonitorSubFormulas, \ + weightForQuantMonitorValueInReward, codeForFallbackController, useQuantMonitorSubformulasAsFeatures): + requires(isinstance(wrapped_env, VehicleEnv)); + requires(isinstance(quantitativeMonitorSubFormulas, list)); + # NOTE: we allow quantitativeMonitorSubFormulas to be an empty list, + # in which case no monitor violations should ever occur + requires(all([isProperMonitorSubFormula(x) for x in quantitativeMonitorSubFormulas])); + requires(isinstance(weightForQuantMonitorValueInReward, float)); + # NOTE: we allow weightForQuantMonitorValueInReward to be negative, in + # case the agent would be rewarded for violating the monitor condition. + # This might be useful for testing or to empirically judge the + # influence of the monitor signal encorporated via the reward function. + requires(codeForFallbackController == None or isinstance(codeForFallbackController, str)); + requires(isinstance(useQuantMonitorSubformulasAsFeatures, bool)); + + # NOTE: we cannot do + # ProxyEnv.__init__(self, wrapped_env); + # or + # super(MonitorEncorporatedEnv, self).__init__(wrapped_env); + # since the init function here (unlike the ProxyEnv class) takes in multiple + # arguments and results in local() not being able to find all of them if + # we try calling as listed above. + Serializable.quick_init(self, locals()) + self._wrapped_env = wrapped_env + + # TODO: consider including python-ic leading underscore as necessary... + self.quantitativeMonitorSubFormulas = quantitativeMonitorSubFormulas; + self.weightForQuantMonitorValueInReward = weightForQuantMonitorValueInReward; + if(codeForFallbackController != None): + exec(codeForFallbackController); + self.fallbackController = locals()["fallbackController"]; + else: + self.fallbackController = None; + + self.useQuantMonitorSubformulasAsFeatures = useQuantMonitorSubformulasAsFeatures; + + # TODO: Select better informed values of self._action for prior to + # the when the controller makes it first decision Grep over this + # file to see where self._action is used and why the value prior to + # the first choice might have some impact. + self._action = np.array([0,0]); # Note that this is the actual action performed on the + # environment, not necessarly the same as self._wrapped_env.action - + # in the case of a monitor violation, and if a fallback controller is + # specified, then the action is dictated by the fallback-controller, not + # the initial policy. + + # NOTE: Setting the two _state variable below is important for calculating + # the acceleration fed into the quantitative-monitor subformulas. See + # the function evaluate_quantitativeMonitorSubFormulas . + self._state = np.zeros(self.observation_space.flat_dim,) + + return; + + + def getAxiluraryInformation(self, state): + fakeTime = 0.0; #the time is not actually used in the dynamics in question... + # Note below we use self._action not self._wrapped_env.action, since we want the + # actual action performed, not the one the wrapped-controller would have done.... + state_dot = self._wrapped_env._model._dynamics(state, fakeTime, self._action); + """ + recall: + state_dot[0] = pos_x_dot + state_dot[1] = pos_y_dot + state_dot[2] = yaw_rate + state_dot[3] = v_x_dot + state_dot[4] = v_y_dot + state_dot[5] = yaw_rate_dot + """ + return state_dot[3:6]; # returning the accelerations. 
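    # Given the layout documented above, the subformula strings see these accelerations as
    # axiluraryInformation[0] = v_x_dot, axiluraryInformation[1] = v_y_dot and
    # axiluraryInformation[2] = yaw_rate_dot (the eval namespace is assembled in
    # evaluate_quantitativeMonitorSubFormulas below). For example, the straight-line
    # training script's original subformula over axiluraryInformation[1] is a check on the
    # acceleration in y.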
+ + + def evaluate_quantitativeMonitorSubFormulas(self, state, action): + axiluraryInformation = self.getAxiluraryInformation(state) + listToReturn = [\ + eval(x, {"state" : state, "axiluraryInformation" : axiluraryInformation}) \ + for x in self.quantitativeMonitorSubFormulas]; + ensures(isinstance(listToReturn, list)); + ensures(len(listToReturn) == len(self.quantitativeMonitorSubFormulas)); + return listToReturn; + + def getMin_evaluate_quantitativeMonitorSubFormulas(self, state, action): + # This function handles the edge case where self.quantitativeMonitorSubFormulas is an + # empty list - helps avoid silly errors that might result from the more + # straight-forward use of min(self.evaluate_quantitativeMonitorSubFormulas(state, action)) + # at various locations. + if(len(self.quantitativeMonitorSubFormulas) == 0): + return 0.0; # NOTE: we consider the monitor to be violated when the value from + # the quantitative monitor is negative, so returning zero should not consistute + # a monitor violation. + else: + return min(self.evaluate_quantitativeMonitorSubFormulas(state, action)); + raise Exception("Control should never reach here"); + return; + + + def reset(self): + """ + Reset environment back to original state. + """ + self._action = np.array([0,0]); # None + self._wrapped_env._state = self._action; + self._state = self._wrapped_env.get_initial_state + self._wrapped_env._state = self._state; + observation = self.state_to_observation(self._state) + + # Reset renderer if available + if self._wrapped_env._renderer is not None: + self._wrapped_env._renderer.reset() + + return observation + + + def helper_step(self, action): + """ + Move one iteration forward in simulation. + """ + if action[0] < 0: # Only allow forward direction + action[0] = 0 + nextstate = self._wrapped_env._model.state_transition(self._state, action, + self._wrapped_env._dt) + self._state = nextstate + # Notice below that we use the + # state_to_observation and get_reward functions defined in this class as oppossed to the + # ones defined in the self._wrapped_class, hence the need to reimplement this + # function (helper_step) as oppossed to simply calling self._wrapped_class.step + reward, info = self.get_reward(nextstate, action) + observation = self.state_to_observation(nextstate) + return Step(observation=observation, reward=reward, done=False, + dist=info['dist'], vel=info['vel'], kappa=self._wrapped_env._model.kappa) + + + def step(self, action): + if(self.fallbackController != None): + monitorHasBeenViolated = (\ + self.getMin_evaluate_quantitativeMonitorSubFormulas(self._wrapped_env._state, action) < 0.0 ); + action = self.fallbackController(self._wrapped_env._state); + self._action = action; + # TODO: consider whether we should also set self._wrapped_env._action or make + # the opion of whether or not to do that a variable passed in to the + # init function of this class. + return self.helper_step(action); + + + def get_reward(self, state, action): + reward ,info = self._wrapped_env.get_reward(state, action); + if(self.weightForQuantMonitorValueInReward != 0.0): # this conditional prevents unnecessary + # computation, but is not strictly needed. 
+ minimumQuantMonitorValue = self.getMin_evaluate_quantitativeMonitorSubFormulas(state, action); + reward = reward + self.weightForQuantMonitorValueInReward * minimumQuantMonitorValue; + return reward, info; + + + def state_to_observation(self, state): + originalObs = self._wrapped_env.state_to_observation(state); + if(self.useQuantMonitorSubformulasAsFeatures): + quantMonitorInput = np.array(self.evaluate_quantitativeMonitorSubFormulas(state, self._action)); + originalObs = np.concatenate([originalObs, quantMonitorInput]); + return originalObs; + + + @property + def observation_space(self): + """ + Define the shape of input vector to the neural network. + """ + if(not self.useQuantMonitorSubformulasAsFeatures): + return Box(low=-np.inf, high=np.inf, shape=(5,)); + else: + return Box(low=-np.inf, high=np.inf, shape=(5 +len(self.quantitativeMonitorSubFormulas))); + raise Exception("Control should never reach here"); + return; + + + @property + def get_initial_state(self): + state = self._wrapped_env.get_initial_state; + # NOTE: Setting the two state variables below are important for calculating + # the acceleration fed into the quantitative-monitor subformulas. See + # the function evaluate_quantitativeMonitorSubFormulas + self._state = state; + self._wrapped_env._state = state; + return state + + + def get_action(observation): + return self._wrapped_env.get_action(observation); + + diff --git a/test/test_monitorEncorporated_straight.py b/test/test_monitorEncorporated_straight.py new file mode 100644 index 0000000..6c33610 --- /dev/null +++ b/test/test_monitorEncorporated_straight.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on test_monitorEncorporated_straight.py by edwardahn + +Demonstrations of testing with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. +""" + +import argparse +import sys + +import joblib +import matplotlib.pyplot as plt +import numpy as np + +from rllab.misc import tensor_utils + +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; +from aa_simulation.envs.straight.straight_env import StraightEnv; +from aa_simulation.misc.utils import normalize_angle + + +def rollout(env, agent, line_params, max_path_length=np.inf, + animated=False): + """ + Modified rollout function from rllab.sampler.utils to run + arbitrary straight trajectories. + """ + observations = [] + rewards = [] + actions = [] + agent_infos = [] + env_infos = [] + + projected_trajectory = [] + x0, y0, angle = line_params + env.reset() + agent.reset() + + # Force start state to be zeros + # Note: Because env is an instance of NormalizedEnv, there is no + # way of writing a custom function that I can use to set the + # initial state. Consequently we just force set it here. 
+ start_yaw = angle + start_state = np.array([x0, y0, start_yaw, 0, 0, 0]) + env._wrapped_env._state = start_state + o = np.zeros(5) + + path_length = 0 + if animated: + env.render() + print('--------------------') + while path_length < max_path_length: + print('') + state = env._wrapped_env._state + print('State = ', state) + projected_o = StraightEnv.project_line(state, x0, y0, angle) + print('Projected state = ', projected_o) + _, agent_info = agent.get_action(env.state_to_observation(projected_o)); + a = agent_info['mean'] + print('Computed action = ', a) + next_o, r, d, env_info = env.step(a) + print('Next observation = ', next_o) + observations.append(env.observation_space.flatten(o)) + rewards.append(r) + actions.append(env.action_space.flatten(a)) + agent_infos.append(agent_info) + env_infos.append(env_info) + projected_trajectory.append(projected_o) + path_length += 1 + if d: + break + o = next_o + if animated: + env.render() + print('--------------------') + + return dict( + observations=tensor_utils.stack_tensor_list(observations), + actions=tensor_utils.stack_tensor_list(actions), + rewards=tensor_utils.stack_tensor_list(rewards), + agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), + env_infos=tensor_utils.stack_tensor_dict_list(env_infos), + ), projected_trajectory + + +def plot_trajectories(trajectory1, trajectory2): + """ + Plot trajectory of unprojected path and projected path. + """ + y1 = trajectory1[:,0] + y2 = trajectory2[:,0] + t = np.arange(len(y1)) + + diff = abs(y2 - y1) + max_diff = max(diff) + mean_diff = np.mean(diff) + print('\nMaximum absolute difference =\t', max_diff) + print('Mean absolute difference =\t', mean_diff) + + plt.figure() + plt.title('Trajectories: Relative y-values') + plt.xlabel('Time step') + plt.ylabel('y (m)') + plt.plot(t, y1, 'b', t, y2, 'r') + plt.legend(['Unprojected', 'Projected']) + plt.show() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('file', type=str, + help='path to the snapshot file') + parser.add_argument('--max_path_length', type=int, default=100, + help='Max length of rollout') + parser.add_argument('--render', dest='render', + action='store_true', help='Rendering') + parser.add_argument('--no-render', dest='render', + action='store_false', help='Rendering') + parser.set_defaults(render=False) + args = parser.parse_args() + return args + + +def main(): + args = parse_arguments() + data = joblib.load(args.file) + policy = data['policy'] + env = data['env'] + plt.ion() + + + #np.set_printoptions(precision=4, suppress=True) + + + # Set fixed random seed + np.random.seed(9) + + # Sample rollouts with different projections (change line_params2) + line_params1 = np.array([0, 0, 0]) + line_params2 = np.array([3, 0, np.pi/2]) + path1, projected_states1 = rollout(env, policy, line_params1, + max_path_length=args.max_path_length, animated=args.render) + path2, projected_states2 = rollout(env, policy, line_params2, + max_path_length=args.max_path_length, animated=args.render) + + # Plot projected trajectories on graph + projected_states1 = np.array(projected_states1) + projected_states2 = np.array(projected_states2) + plot_trajectories(projected_states1, projected_states2) + + # Block until key is pressed + sys.stdout.write("Press to continue: ") + input() + + +# if __name__ == "__main__": +main() diff --git a/train/train_monitorEncorporated_straight_planner.py b/train/train_monitorEncorporated_straight_planner.py new file mode 100644 index 0000000..4a7b850 --- /dev/null +++ 
b/train/train_monitorEncorporated_straight_planner.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on train_straight_planner.py by edwardahn + +Demonstrations of training with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. +""" + +import argparse + +import joblib +import lasagne.init as LI +import lasagne.layers as L +import lasagne.nonlinearities as LN +import numpy as np + +from rllab.algos.trpo import TRPO +from rllab.core.lasagne_layers import ParamLayer +from rllab.core.lasagne_powered import LasagnePowered +from rllab.core.network import MLP +# from rllab.envs.base import Env +from rllab.misc import ext, logger +from rllab.misc.instrument import run_experiment_lite, VariantGenerator +from rllab.misc.resolve import load_class +from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy +from sandbox.cpo.baselines.linear_feature_baseline import LinearFeatureBaseline + +from aa_simulation.envs.straight.straight_env import StraightEnv +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; + +# Pre-trained policy and baseline +policy = None +baseline = None + + + +from rllab.policies.base import Policy; + + +# We keep in the below dead-code as a reminder that the ideal way the +# fallback-controller would be specified is with an instance of the policy class. +""" +class DummbyFallbackPolicy(Policy): + def get_action(self, observation): + return np.array([0,0]); +""" + + + +def run_task(vv, log_dir=None, exp_name=None): + global policy + global baseline + + # Check if variant is available + if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: + raise ValueError('Unrecognized model type for simulating robot') + if vv['robot_type'] not in ['MRZR', 'RCCar']: + raise ValueError('Unrecognized robot type') + + + # Load environment + baseEnv = StraightEnv( + target_velocity=vv['target_velocity'], + dt=vv['dt'], + model_type=vv['model_type'], + robot_type=vv['robot_type'] + ); + env = MonitorEncorporatedEnv(\ + wrapped_env = baseEnv, \ + quantitativeMonitorSubFormulas = vv["quantitativeMonitorSubFormulas"], \ + weightForQuantMonitorValueInReward = vv["weightForQuantMonitorValueInReward"], \ + codeForFallbackController = vv["codeForFallbackController"], \ + whetherUsedMonitorFeaturesOfNot = vv["whetherUsedMonitorFeaturesOfNot"]\ + ); + + # Save variant information for comparison plots + variant_file = logger.get_snapshot_dir() + '/variant.json' + logger.log_variant(variant_file, vv) + + # Set variance for each action component separately for exploration + # Note: We set the variance manually because we are not scaling our + # action space during training. + init_std_speed = vv['target_velocity'] / 4 + init_std_steer = np.pi / 6 + init_std = [init_std_speed, init_std_steer] + + # Build policy and baseline networks + # Note: Mean of policy network set to analytically computed values for + # faster training (rough estimates for RL to fine-tune). + if policy is None or baseline is None: + target_velocity = vv['target_velocity'] + target_steering = 0 + output_mean = np.array([target_velocity, target_steering]) + hidden_sizes = (32, 32) + + # In mean network, allow output b values to dominate final output + # value by constraining the magnitude of the output W matrix. This is + # to allow faster learning. These numbers are arbitrarily chosen. 
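    # (For reference: with the target_velocity of 1.0 swept below, this evaluates to
    # W_gain = min(1.0 / 5, pi / 15) = min(0.2, ~0.209) = 0.2.)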
+ W_gain = min(vv['target_velocity'] / 5, np.pi / 15) + + + # Thankfully, the use of env.spec.observation_space.flat_dim should take care of + # having to specify the input dimension... + mean_network = MLP( + input_shape=(env.spec.observation_space.flat_dim,), + output_dim=env.spec.action_space.flat_dim, + hidden_sizes=hidden_sizes, + hidden_nonlinearity=LN.tanh, + output_nonlinearity=None, + output_W_init=LI.GlorotUniform(gain=W_gain), + output_b_init=output_mean + ) + policy = GaussianMLPPolicy( + env_spec=env.spec, + hidden_sizes=(32, 32), + init_std=init_std, + mean_network=mean_network + ) + baseline = LinearFeatureBaseline(env_spec=env.spec) + + # Reset variance to re-enable exploration when using pre-trained networks + else: + policy._l_log_std = ParamLayer( + policy._mean_network.input_layer, + num_units=env.spec.action_space.flat_dim, + param=LI.Constant(np.log(init_std)), + name='output_log_std', + trainable=True + ) + obs_var = policy._mean_network.input_layer.input_var + mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std]) + policy._log_std_var = log_std_var + LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) + policy._f_dist = ext.compile_function( + inputs=[obs_var], + outputs=[mean_var, log_std_var] + ) + + algo = TRPO( + env=env, + policy=policy, + baseline=baseline, + batch_size=600, + max_path_length=env.horizon, + n_itr=600, + discount=0.99, + step_size=0.01, + plot=False, + ) + algo.train() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--network', type=str, + help='Path to snapshot file of pre-trained network') + args = parser.parse_args() + return args + + +def main(): + global policy + global baseline + + # Load pre-trained network if available + args = parse_arguments() + if args.network is not None: + data = joblib.load(args.network) + policy = data['policy'] + baseline = data['baseline'] + use_pretrained = True + else: + use_pretrained = False + + # Run multiple experiment variants at once + vg = VariantGenerator() + + # Non-configurable parameters (do not change) + vg.add('trajectory', ['Straight']) + vg.add('objective', ['TargetVelocity']) + vg.add('algo', ['TRPO']) + + + # Configurable parameters + # Options for model_type: 'BrushTireModel', 'LinearTireModel' + # Options for robot_type: 'MRZR', 'RCCar' + seeds = [100, 200] + robot_type = 'RCCar' + use_ros = False + vg.add('seed', seeds) + vg.add('target_velocity', [1.0]) + vg.add('dt', [0.1]) + vg.add('model_type', ['BrushTireModel']) + vg.add('robot_type', [robot_type]) + vg.add('weightForQuantMonitorValueInReward', [-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2,0]); + vg.add('whetherUsedMonitorFeaturesOfNot', [True, False]); + + # The values specified in the variables monitorSubformula_yValueMostlyPositive and + # monitorSubformula_acceleratingInYToFast will be used by the MonitorEncorporatedEnv + # environment in a way equivalent to the following commented-out code: + """ + def monitorSubformula_yValueMostlyPositive(state, additionalInfo): + return state[1] > -0.01; + def monitorSubformula_acceleratingInYToFast(state, additionalInfo): + return additionalInfo[1] > 0.1; + """ + monitorSubformula_yValueMostlyPositive = "state[1] > -0.01"; + monitorSubformula_acceleratingInYToFast = "axiluraryInformation[1] > 0.1"; + quantitativeMonitorSubFormulas = \ + [monitorSubformula_yValueMostlyPositive, monitorSubformula_acceleratingInYToFast]; + + codeForFallbackController = """fallbackController = (lambda *x: np.array([0,0]));""" + + 
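    # The string assigned above is what MonitorEncorporatedEnv.__init__ turns into a real
    # callable, roughly as follows (a simplified sketch of that mechanism, not a verbatim
    # excerpt of the environment code):
    #
    #     exec(codeForFallbackController)                       # defines fallbackController
    #     fallbackController = locals()["fallbackController"]   # looked up by name
    #
    # after which step() can call fallbackController(state) in place of the action proposed
    # by the policy.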
vg.add("codeForFallbackController", [codeForFallbackController, None]); + vg.add("quantitativeMonitorSubFormulas", [quantitativeMonitorSubFormulas, []]); + + print('Number of Configurations: ', len(vg.variants())) + + # Run each experiment variant + for vv in vg.variants(): + run_experiment_lite( + stub_method_call=run_task, + variant=vv, + n_parallel=1, + snapshot_mode='last', + seed=vv['seed'] + ) + + +if __name__ == '__main__': + main() + From 99b1ffe8a4702443b07583e5d378f53a104e7944 Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Tue, 2 Jul 2019 23:11:30 -0800 Subject: [PATCH 2/6] Minor bug fix: name updating. --- envs/monitorEncorporated_env.py | 2 +- train/train_monitorEncorporated_straight_planner.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/envs/monitorEncorporated_env.py b/envs/monitorEncorporated_env.py index 5d9513f..1a03ef5 100644 --- a/envs/monitorEncorporated_env.py +++ b/envs/monitorEncorporated_env.py @@ -82,7 +82,7 @@ class MonitorEncorporatedEnv(ProxyEnv): "x + 2" be passed in. Specifically, the elements of quantitativeMonitorSubFormulas must be strings that can be evaluated by the python built-in eval , and codeForFallbackController must be text - evaluatable by the python built-in eval and must define the function fallbackController . + evaluatable by the python built-in exec and must define the function fallbackController . Plans for near-future development include investigating better ways to handle the circumstances. For the first swing at developing these functionalities, this arrangement should be sufficient and not overly brittle nor overly complex. diff --git a/train/train_monitorEncorporated_straight_planner.py b/train/train_monitorEncorporated_straight_planner.py index 4a7b850..ad6a063 100644 --- a/train/train_monitorEncorporated_straight_planner.py +++ b/train/train_monitorEncorporated_straight_planner.py @@ -72,7 +72,7 @@ def run_task(vv, log_dir=None, exp_name=None): quantitativeMonitorSubFormulas = vv["quantitativeMonitorSubFormulas"], \ weightForQuantMonitorValueInReward = vv["weightForQuantMonitorValueInReward"], \ codeForFallbackController = vv["codeForFallbackController"], \ - whetherUsedMonitorFeaturesOfNot = vv["whetherUsedMonitorFeaturesOfNot"]\ + useQuantMonitorSubformulasAsFeatures = vv["useQuantMonitorSubformulasAsFeatures"]\ ); # Save variant information for comparison plots @@ -195,7 +195,7 @@ def main(): vg.add('model_type', ['BrushTireModel']) vg.add('robot_type', [robot_type]) vg.add('weightForQuantMonitorValueInReward', [-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2,0]); - vg.add('whetherUsedMonitorFeaturesOfNot', [True, False]); + vg.add('useQuantMonitorSubformulasAsFeatures', [True, False]); # The values specified in the variables monitorSubformula_yValueMostlyPositive and # monitorSubformula_acceleratingInYToFast will be used by the MonitorEncorporatedEnv From 8f51ce080e628ae8dd888f6bce26452440ffafc6 Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Tue, 9 Jul 2019 22:46:50 -0800 Subject: [PATCH 3/6] Put in a more useful fallback controller and monitor condition for the training in envs/monitorEncorporated_env.py train/train_monitorEncorporated_straight_planner.py . It is still very much a toy, but it matches the non-toy CPO safety constraints Edward is using . Also, made some trivial adjustments in the envs/monitorEncorporated_env.py to allow the quantitative monitor subformulas to the action - this leverages 95% of infulstructure already there, a very trivial change. 
I think I took it out before intentionally since I thought in matched the monitor use-cases better.... we might be abusing terminology to call this stuff a monitor - maybe... --- envs/monitorEncorporated_env.py | 4 +- ...in_monitorEncorporated_straight_planner.py | 42 ++++++++++++------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/envs/monitorEncorporated_env.py b/envs/monitorEncorporated_env.py index 1a03ef5..4476ac7 100644 --- a/envs/monitorEncorporated_env.py +++ b/envs/monitorEncorporated_env.py @@ -19,7 +19,7 @@ def isProperMonitorSubFormula(thisProposedSubFormula): super(MonitorEncorporatedEnv, self).__init__(wrapped_env); if(str(type(thisProposedSubFormula)) != ""): return False; - if(thisProposedSubFormula.__code__.co_argcount != 2): + if(thisProposedSubFormula.__code__.co_argcount != 3): return False; return True; """ @@ -162,7 +162,7 @@ def getAxiluraryInformation(self, state): def evaluate_quantitativeMonitorSubFormulas(self, state, action): axiluraryInformation = self.getAxiluraryInformation(state) listToReturn = [\ - eval(x, {"state" : state, "axiluraryInformation" : axiluraryInformation}) \ + eval(x, {"state" : state, "action": action, "axiluraryInformation" : axiluraryInformation, "np" :np}) \ for x in self.quantitativeMonitorSubFormulas]; ensures(isinstance(listToReturn, list)); ensures(len(listToReturn) == len(self.quantitativeMonitorSubFormulas)); diff --git a/train/train_monitorEncorporated_straight_planner.py b/train/train_monitorEncorporated_straight_planner.py index ad6a063..a89386d 100644 --- a/train/train_monitorEncorporated_straight_planner.py +++ b/train/train_monitorEncorporated_straight_planner.py @@ -183,6 +183,8 @@ def main(): vg.add('algo', ['TRPO']) + targetVelocity = 1.0 + # Configurable parameters # Options for model_type: 'BrushTireModel', 'LinearTireModel' # Options for robot_type: 'MRZR', 'RCCar' @@ -190,28 +192,36 @@ def main(): robot_type = 'RCCar' use_ros = False vg.add('seed', seeds) - vg.add('target_velocity', [1.0]) + vg.add('target_velocity', [targetVelocity]) vg.add('dt', [0.1]) vg.add('model_type', ['BrushTireModel']) vg.add('robot_type', [robot_type]) - vg.add('weightForQuantMonitorValueInReward', [-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2,0]); + # We are mostly uninterested in the negative values below, since they would encourage + # violating the monitor.... + vg.add('weightForQuantMonitorValueInReward', [0.0, 2.0, 0.125, -0.125, 1.0, 0.25, 0.5, -2.0]); vg.add('useQuantMonitorSubformulasAsFeatures', [True, False]); - # The values specified in the variables monitorSubformula_yValueMostlyPositive and - # monitorSubformula_acceleratingInYToFast will be used by the MonitorEncorporatedEnv - # environment in a way equivalent to the following commented-out code: - """ - def monitorSubformula_yValueMostlyPositive(state, additionalInfo): - return state[1] > -0.01; - def monitorSubformula_acceleratingInYToFast(state, additionalInfo): - return additionalInfo[1] > 0.1; - """ - monitorSubformula_yValueMostlyPositive = "state[1] > -0.01"; - monitorSubformula_acceleratingInYToFast = "axiluraryInformation[1] > 0.1"; + # Original monitor code: "action[1] * (np.sign(state[0]) + np.sign(state[1])) > 0" + # This monitor is violated when the car drives away from y=0... 
+ quantMonitorSubformula_carDrivingAwayFromLine = "-action[1] * (np.sign(state[0]) + np.sign(state[1]))"; quantitativeMonitorSubFormulas = \ - [monitorSubformula_yValueMostlyPositive, monitorSubformula_acceleratingInYToFast]; - - codeForFallbackController = """fallbackController = (lambda *x: np.array([0,0]));""" + [quantMonitorSubformula_carDrivingAwayFromLine]; + + codeForFallbackController = """ +import numpy as np; +def fallbackController(observation): + y = observation[0] + yaw = observation[1] + + amountToSteerOffCenter = 0.1; # very slight to avoid oversteering... but completely + # fails to consider drifting. As such, this is VERY MUCH a first-swing attempt + # with a fallback controller that we would not actually use. + velocityToUse = """ + str(targetVelocity) + """; + + # steers toward the line + steeringAngle = amountToSteerOffCenter * (np.sign(y) + np.sign(yaw)); + return np.array([steeringAngle, velocityToUse]); +"""; vg.add("codeForFallbackController", [codeForFallbackController, None]); vg.add("quantitativeMonitorSubFormulas", [quantitativeMonitorSubFormulas, []]); From 76c5ff01a565beef1f61e40297839768b7a32b6f Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Wed, 10 Jul 2019 00:44:36 -0800 Subject: [PATCH 4/6] Committing train_monitorEncorporated_circle_planner.py . Similar to what was noted in the previous commit log for train/train_monitorEncorporated_straight_planner.py , the fallback controller and quantitative monitor used are toy-ish, but they match the non-toy work Edward is performing with CPO. --- envs/monitorEncorporated_env.py | 7 +- ...rain_monitorEncorporated_circle_planner.py | 251 ++++++++++++++++++ 2 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 train/train_monitorEncorporated_circle_planner.py diff --git a/envs/monitorEncorporated_env.py b/envs/monitorEncorporated_env.py index 4476ac7..8d0b15e 100644 --- a/envs/monitorEncorporated_env.py +++ b/envs/monitorEncorporated_env.py @@ -253,10 +253,13 @@ def observation_space(self): """ Define the shape of input vector to the neural network. """ + dimensionOfOriginalObservationSpace = self._wrapped_env.observation_space.flat_dim; if(not self.useQuantMonitorSubformulasAsFeatures): - return Box(low=-np.inf, high=np.inf, shape=(5,)); + return Box(low=-np.inf, high=np.inf, shape=(\ + dimensionOfOriginalObservationSpace,)); else: - return Box(low=-np.inf, high=np.inf, shape=(5 +len(self.quantitativeMonitorSubFormulas))); + return Box(low=-np.inf, high=np.inf, shape=(\ + dimensionOfOriginalObservationSpace +len(self.quantitativeMonitorSubFormulas))); raise Exception("Control should never reach here"); return; diff --git a/train/train_monitorEncorporated_circle_planner.py b/train/train_monitorEncorporated_circle_planner.py new file mode 100644 index 0000000..5551239 --- /dev/null +++ b/train/train_monitorEncorporated_circle_planner.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on train_straight_planner.py by edwardahn + +Demonstrations of training with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. 
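
As a concrete illustration of the reward shaping swept over below (numbers hypothetical):
with weightForQuantMonitorValueInReward = 0.5, and with the script's single quantitative
subformula "0.05 - state[0]" evaluating to -0.05 (reading state[0], as the comments below
do, as the deviation from the target radius and taking that deviation to be 0.10), the
reward handed back to TRPO is rewardFromWrappedEnvironment + 0.5 * (-0.05), i.e. the
reward of the wrapped environment minus 0.025.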
+""" + +import argparse + +import joblib +import lasagne.init as LI +import lasagne.layers as L +import lasagne.nonlinearities as LN +import numpy as np + +from rllab.algos.trpo import TRPO +from rllab.core.lasagne_layers import ParamLayer +from rllab.core.lasagne_powered import LasagnePowered +from rllab.core.network import MLP +# from rllab.envs.base import Env +from rllab.misc import ext, logger +from rllab.misc.instrument import run_experiment_lite, VariantGenerator +from rllab.misc.resolve import load_class +from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy +from sandbox.cpo.baselines.linear_feature_baseline import LinearFeatureBaseline + +from aa_simulation.envs.circle.circle_env import CircleEnv +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; + +# Pre-trained policy and baseline +policy = None +baseline = None + + + +from rllab.policies.base import Policy; + + +# We keep in the below dead-code as a reminder that the ideal way the +# fallback-controller would be specified is with an instance of the policy class. +""" +class DummbyFallbackPolicy(Policy): + def get_action(self, observation): + return np.array([0,0]); +""" + + + +def run_task(vv, log_dir=None, exp_name=None): + global policy + global baseline + + trpo_stepsize = 0.01 + trpo_subsample_factor = 0.2 + + # Check if variant is available + if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: + raise ValueError('Unrecognized model type for simulating robot') + if vv['robot_type'] not in ['MRZR', 'RCCar']: + raise ValueError('Unrecognized robot type') + + + # Load environment + baseEnv = CircleEnv( + target_velocity=vv['target_velocity'], + radius=vv['radius'], + dt=vv['dt'], + model_type=vv['model_type'], + robot_type=vv['robot_type'] + ) + env = MonitorEncorporatedEnv(\ + wrapped_env = baseEnv, \ + quantitativeMonitorSubFormulas = vv["quantitativeMonitorSubFormulas"], \ + weightForQuantMonitorValueInReward = vv["weightForQuantMonitorValueInReward"], \ + codeForFallbackController = vv["codeForFallbackController"], \ + useQuantMonitorSubformulasAsFeatures = vv["useQuantMonitorSubformulasAsFeatures"]\ + ); + + # Save variant information for comparison plots + variant_file = logger.get_snapshot_dir() + '/variant.json' + logger.log_variant(variant_file, vv) + + # Set variance for each action component separately for exploration + # Note: We set the variance manually because we are not scaling our + # action space during training. + init_std_speed = vv['target_velocity'] / 4 + init_std_steer = np.pi / 6 + init_std = [init_std_speed, init_std_steer] + + # Build policy and baseline networks + # Note: Mean of policy network set to analytically computed values for + # faster training (rough estimates for RL to fine-tune). + if policy is None or baseline is None: + wheelbase = 0.257 + target_velocity = vv['target_velocity'] + target_steering = np.arctan(wheelbase / vv['radius']) # CCW + output_mean = np.array([target_velocity, target_steering]) + hidden_sizes = (32, 32) + + # In mean network, allow output b values to dominate final output + # value by constraining the magnitude of the output W matrix. This is + # to allow faster learning. These numbers are arbitrarily chosen. 
+ W_gain = min(vv['target_velocity'] / 5, np.pi / 15) + + mean_network = MLP( + input_shape=(env.spec.observation_space.flat_dim,), + output_dim=env.spec.action_space.flat_dim, + hidden_sizes=hidden_sizes, + hidden_nonlinearity=LN.tanh, + output_nonlinearity=None, + output_W_init=LI.GlorotUniform(gain=W_gain), + output_b_init=output_mean + ) + policy = GaussianMLPPolicy( + env_spec=env.spec, + hidden_sizes=hidden_sizes, + init_std=init_std, + mean_network=mean_network + ) + baseline = LinearFeatureBaseline( + env_spec=env.spec, + target_key='returns' + ) + + # Reset variance to re-enable exploration when using pre-trained networks + else: + policy._l_log_std = ParamLayer( + policy._mean_network.input_layer, + num_units=env.spec.action_space.flat_dim, + param=LI.Constant(np.log(init_std)), + name='output_log_std', + trainable=True + ) + obs_var = policy._mean_network.input_layer.input_var + mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std]) + policy._log_std_var = log_std_var + LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) + policy._f_dist = ext.compile_function( + inputs=[obs_var], + outputs=[mean_var, log_std_var] + ) + + algo = TRPO( + env=env, + policy=policy, + baseline=baseline, + batch_size=600, + max_path_length=env.horizon, + n_itr=600, + discount=0.99, + step_size=trpo_stepsize, + plot=False, + ) + algo.train() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--network', type=str, + help='Path to snapshot file of pre-trained network') + args = parser.parse_args() + return args + + +def main(): + global policy + global baseline + + # Load pre-trained network if available + args = parse_arguments() + if args.network is not None: + data = joblib.load(args.network) + policy = data['policy'] + baseline = data['baseline'] + use_pretrained = True + else: + use_pretrained = False + + # Run multiple experiment variants at once + vg = VariantGenerator() + + # Non-configurable parameters (do not change) + vg.add('trajectory', ['Circle']) + vg.add('objective', ['TargetVelocity']) + vg.add('algo', ['TRPO']) + + + targetVelocity = 1.0 + epsilonValue = 0.05; + + # Configurable parameters + # Options for model_type: 'BrushTireModel', 'LinearTireModel' + # Options for robot_type: 'MRZR', 'RCCar' + seeds = [100, 200] + robot_type = 'RCCar' + use_ros = False + vg.add('seed', seeds) + vg.add('target_velocity', [targetVelocity]) + vg.add('radius', [1.0]) + vg.add('dt', [0.1]) + vg.add('eps', [epsilonValue]) + vg.add('model_type', ['BrushTireModel']) + vg.add('robot_type', [robot_type]) + # We are mostly uninterested in the negative values below, since they would encourage + # violating the monitor.... + vg.add('weightForQuantMonitorValueInReward', [0.0, 2.0, 0.125, -0.125, 1.0, 0.25, 0.5, -2.0]); + vg.add('useQuantMonitorSubformulasAsFeatures', [True, False]); + + quantMonitorSubformula_carDrivingAwayFromLine = str(epsilonValue) + "- state[0]"; # I.e., if more than + # epsilon away from the target raduis, then the quantitative monitor is violated. + quantitativeMonitorSubFormulas = \ + [quantMonitorSubformula_carDrivingAwayFromLine]; + + codeForFallbackController = """ +import numpy as np; +def fallbackController(observation): + deltaRaduis = observation[0] + + amountToSteerOffCenter = 0.1; # very slight to avoid oversteering... but completely + # fails to consider drifting. As such, this is VERY MUCH a first-swing attempt + # with a fallback controller that we would not actually use. 
+ velocityToUse = """ + str(targetVelocity) + """; + + # steers toward the line + steeringAngle = amountToSteerOffCenter * np.sign(deltaRaduis); # If we are outside the circle, we + # stear toward the inside, if we are on the inside, we steer toward the outside.... + return np.array([steeringAngle, velocityToUse]); +"""; + + vg.add("codeForFallbackController", [codeForFallbackController, None]); + vg.add("quantitativeMonitorSubFormulas", [quantitativeMonitorSubFormulas, []]); + + print('Number of Configurations: ', len(vg.variants())) + + # Run each experiment variant + for vv in vg.variants(): + run_experiment_lite( + stub_method_call=run_task, + variant=vv, + n_parallel=1, + snapshot_mode='last', + seed=vv['seed'] + ) + + +if __name__ == '__main__': + main() + From 081c4ea3038254f463f55885390fdea0ff4ed359 Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Mon, 15 Jul 2019 16:30:52 -0800 Subject: [PATCH 5/6] Increased thread use paramter in training. --- train/train_monitorEncorporated_circle_planner.py | 2 +- train/train_monitorEncorporated_straight_planner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/train/train_monitorEncorporated_circle_planner.py b/train/train_monitorEncorporated_circle_planner.py index 5551239..27ca006 100644 --- a/train/train_monitorEncorporated_circle_planner.py +++ b/train/train_monitorEncorporated_circle_planner.py @@ -240,7 +240,7 @@ def fallbackController(observation): run_experiment_lite( stub_method_call=run_task, variant=vv, - n_parallel=1, + n_parallel=2, snapshot_mode='last', seed=vv['seed'] ) diff --git a/train/train_monitorEncorporated_straight_planner.py b/train/train_monitorEncorporated_straight_planner.py index a89386d..c7d1c47 100644 --- a/train/train_monitorEncorporated_straight_planner.py +++ b/train/train_monitorEncorporated_straight_planner.py @@ -233,7 +233,7 @@ def fallbackController(observation): run_experiment_lite( stub_method_call=run_task, variant=vv, - n_parallel=1, + n_parallel=2, snapshot_mode='last', seed=vv['seed'] ) From 5618dcb331315c290a813a9af1e4260a80a9daf5 Mon Sep 17 00:00:00 2001 From: DBay-ani Date: Mon, 5 Aug 2019 04:39:42 -0800 Subject: [PATCH 6/6] Some additions to infulstructure that made it easier to train policies a bit at a time. --- train/train_monitorEncorporated_circle_planner.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/train/train_monitorEncorporated_circle_planner.py b/train/train_monitorEncorporated_circle_planner.py index 27ca006..2a8f95d 100644 --- a/train/train_monitorEncorporated_circle_planner.py +++ b/train/train_monitorEncorporated_circle_planner.py @@ -236,11 +236,16 @@ def fallbackController(observation): print('Number of Configurations: ', len(vg.variants())) # Run each experiment variant + indexOfStartVariant = 126; for vv in vg.variants(): + indexOfStartVariant = indexOfStartVariant - 1; + print("indexOfStartVariant:" + str(indexOfStartVariant), flush=True); + if(indexOfStartVariant > 0): + continue; run_experiment_lite( stub_method_call=run_task, variant=vv, - n_parallel=2, + n_parallel=1, snapshot_mode='last', seed=vv['seed'] ) @@ -249,3 +254,8 @@ def fallbackController(observation): if __name__ == '__main__': main() + + + + +
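
For reference, the variant-skipping loop added in PATCH 6/6 resumes an interrupted sweep
partway through. A sketch of the same idea written with enumerate follows; the name
numberOfVariantsAlreadyTrained is hypothetical, and with the value 125 it should reproduce
the patch's behaviour of resuming at the 126th variant:

    numberOfVariantsAlreadyTrained = 125
    for indexOfVariant, vv in enumerate(vg.variants()):
        if indexOfVariant < numberOfVariantsAlreadyTrained:
            continue  # already trained in an earlier run of this script
        run_experiment_lite(
            stub_method_call=run_task,
            variant=vv,
            n_parallel=1,
            snapshot_mode='last',
            seed=vv['seed']
        )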