diff --git a/contracts.py b/contracts.py
new file mode 100644
index 0000000..fd2fdbe
--- /dev/null
+++ b/contracts.py
@@ -0,0 +1,5 @@
+def requires(booleanStatement):
+    assert(booleanStatement);
+
+def ensures(booleanStatement):
+    assert(booleanStatement);
diff --git a/envs/monitorEncorporated_env.py b/envs/monitorEncorporated_env.py
new file mode 100644
index 0000000..8d0b15e
--- /dev/null
+++ b/envs/monitorEncorporated_env.py
@@ -0,0 +1,281 @@
+from aa_simulation.contracts import *;
+from rllab.envs.proxy_env import ProxyEnv;
+from aa_simulation.envs.base_env import VehicleEnv;
+from rllab.policies.base import Policy;
+import numpy as np;
+
+from rllab.core.serializable import Serializable;
+from rllab.spaces import Box;
+
+from rllab.envs.base import Step;
+
+def isProperMonitorSubFormula(thisProposedSubFormula):
+    return isinstance(thisProposedSubFormula, str);
+
+# The triple-quoted block below is the original implementation of
+# isProperMonitorSubFormula, kept as a record after running into
+# substantial problems pickling functions between components.
+"""
+    if(str(type(thisProposedSubFormula)) != "<class 'function'>"):
+        return False;
+    if(thisProposedSubFormula.__code__.co_argcount != 3):
+        return False;
+    return True;
+"""
+
+
+class MonitorEncorporatedEnv(ProxyEnv):
+    """
+    MonitorEncorporatedEnv: this class provides a way to transform an instance of the
+    VehicleEnv class (the "wrapped environment") into an environment where the monitor is used
+    in a variety of ways. The monitor information is provided as a list of the quantitative
+    subformulas - for instance, if the monitor is ((A < B) AND (C < D) AND (E < F)), then the
+    monitor is provided as [B - A, D - C, F - E]. The monitor information can be used in any
+    subset of the following:
+    (1) Activate fallback controller: in case the monitor is violated, a fallback controller
+        is used to dictate the actions to be taken, as opposed to the agent interacting with
+        the environment. This occurs in all and only the situations where the monitor is
+        violated. To disable this functionality, simply pass in None for codeForFallbackController;
+        the actions provided to the environment through the step function (see the code below)
+        will then always be acted out.
+    (2) Influence the reward returned: the reward returned by the environment is a weighted
+        combination of the reward given by the wrapped environment and the value given by the
+        quantitative monitor. Specifically, the reward given is:
+            reward = rewardFromWrappedEnvironment + \
+                weightForQuantMonitorValueInReward * min(B - A, D - C, F - E)
+        To effectively disable this functionality, set weightForQuantMonitorValueInReward to 0.0.
+    (3) Additional features in observations: in addition to the features provided by the
+        wrapped environment, the quantitative-monitor subformulas can be provided as additional
+        features for a state. For instance, if the initial feature vector is:
+            [f_1, f_2, ..., f_{n-1}, f_n]
+        the features can be expanded to include:
+            [f_1, f_2, ..., f_{n-1}, f_n, B - A, D - C, F - E]
+        Note that since we use the quantitative monitor subformulas, the features vary over the
+        set of the state space where the monitor is not violated.
+        This is in contrast to using the subformulas of the original (boolean) monitor, in
+        which case the binary values would not vary over the safe set - the moment any one of
+        them changes, the environment would trigger the fallback controller to take over the
+        vehicle, which makes having those features in such an arrangement of little utility.
+        To enable these additional features, set useQuantMonitorSubformulasAsFeatures to True,
+        and to disable, set useQuantMonitorSubformulasAsFeatures to False.
+    Again, any subset of the above three options is valid - so there are at least 8 general
+    modes of operation for this class.
+
+
+    A Note on Some Unfortunate Hacks Made To Get The rllab Infrastructure to Work With This Code:
+    Unfortunately, various parts of the rllab code try to do clever things with pickling when
+    saving results and passing parameters around in the infrastructure. This limits how readily
+    plain functions can be passed around as parameters - while cloudpickle can be substituted
+    for pickle in many places in the rllab code, at least three challenges remain: (1) rllab uses
+    some functionality of pickle not supported by cloudpickle (specific attributes pickle has that
+    cloudpickle does not), (2) rllab is a project outside the general control of the aa-group, and
+    its code base has been frozen for some time in favor of developing a new platform; as such,
+    we would have to modify our own local copy of rllab and distribute it to anyone in the
+    aa-group who wants to use it, (3) in addition to the python package "pickle", rllab also
+    takes advantage of numpy pickle functions that apparently have some similar issues.
+
+    As a work-around to the difficulties listed above, the code was changed to use source-code
+    strings in place of python function objects. That is, instead of passing in, say,
+        lambda x: x + 2
+    the code requires that the string
+        "x + 2"
+    be passed in. Specifically, the elements of quantitativeMonitorSubFormulas must be strings
+    that can be evaluated by the python built-in eval, and codeForFallbackController must be text
+    executable by the python built-in exec that defines the function fallbackController.
+    Plans for near-future development include investigating better ways to handle these
+    circumstances. For a first swing at developing these functionalities, this arrangement should
+    be sufficient without being overly brittle or overly complex.
+    """
+
+    def __init__(self, wrapped_env, quantitativeMonitorSubFormulas, \
+        weightForQuantMonitorValueInReward, codeForFallbackController, useQuantMonitorSubformulasAsFeatures):
+        requires(isinstance(wrapped_env, VehicleEnv));
+        requires(isinstance(quantitativeMonitorSubFormulas, list));
+        # NOTE: we allow quantitativeMonitorSubFormulas to be an empty list,
+        # in which case no monitor violations should ever occur.
+        requires(all([isProperMonitorSubFormula(x) for x in quantitativeMonitorSubFormulas]));
+        requires(isinstance(weightForQuantMonitorValueInReward, float));
+        # NOTE: we allow weightForQuantMonitorValueInReward to be negative, in which
+        # case the agent would be rewarded for violating the monitor condition.
+        # This might be useful for testing, or to empirically judge the
+        # influence of the monitor signal incorporated via the reward function.
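+        # For example (illustrative values only, mirroring the training scripts below rather
+        # than prescribing anything): for a monitor requiring state[0] to stay within 0.05 of
+        # its target, a caller might pass
+        #     quantitativeMonitorSubFormulas = ["0.05 - state[0]"],
+        #     weightForQuantMonitorValueInReward = 0.5,
+        #     codeForFallbackController = "def fallbackController(observation): ...", and
+        #     useQuantMonitorSubformulasAsFeatures = True.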
+        requires(codeForFallbackController == None or isinstance(codeForFallbackController, str));
+        requires(isinstance(useQuantMonitorSubformulasAsFeatures, bool));
+
+        # NOTE: we cannot do
+        #     ProxyEnv.__init__(self, wrapped_env);
+        # or
+        #     super(MonitorEncorporatedEnv, self).__init__(wrapped_env);
+        # since the init function here (unlike the one in the ProxyEnv class) takes in multiple
+        # arguments, which results in locals() not capturing all of them if we try calling
+        # as listed above.
+        Serializable.quick_init(self, locals())
+        self._wrapped_env = wrapped_env
+
+        # TODO: consider adding pythonic leading underscores to these attributes as necessary...
+        self.quantitativeMonitorSubFormulas = quantitativeMonitorSubFormulas;
+        self.weightForQuantMonitorValueInReward = weightForQuantMonitorValueInReward;
+        if(codeForFallbackController != None):
+            exec(codeForFallbackController);
+            self.fallbackController = locals()["fallbackController"];
+        else:
+            self.fallbackController = None;
+
+        self.useQuantMonitorSubformulasAsFeatures = useQuantMonitorSubformulasAsFeatures;
+
+        # TODO: Select better-informed values of self._action for the period prior to
+        # when the controller makes its first decision. Grep over this file to see where
+        # self._action is used and why its value prior to the first choice might have
+        # some impact.
+        self._action = np.array([0,0]); # Note that this is the actual action performed on the
+            # environment, not necessarily the same as self._wrapped_env.action -
+            # in the case of a monitor violation, and if a fallback controller is
+            # specified, the action is dictated by the fallback controller, not
+            # the initial policy.
+
+        # NOTE: Setting the _state variable below is important for calculating
+        # the acceleration fed into the quantitative-monitor subformulas. See
+        # the function evaluate_quantitativeMonitorSubFormulas.
+        self._state = np.zeros(self.observation_space.flat_dim,)
+
+        return;
+
+
+    def getAxiluraryInformation(self, state):
+        fakeTime = 0.0; # the time is not actually used in the dynamics in question...
+        # Note that below we use self._action, not self._wrapped_env.action, since we want the
+        # actual action performed, not the one the wrapped controller would have chosen...
+        state_dot = self._wrapped_env._model._dynamics(state, fakeTime, self._action);
+        """
+        recall:
+            state_dot[0] = pos_x_dot
+            state_dot[1] = pos_y_dot
+            state_dot[2] = yaw_rate
+            state_dot[3] = v_x_dot
+            state_dot[4] = v_y_dot
+            state_dot[5] = yaw_rate_dot
+        """
+        return state_dot[3:6]; # returning the accelerations.
+
+
+    def evaluate_quantitativeMonitorSubFormulas(self, state, action):
+        axiluraryInformation = self.getAxiluraryInformation(state)
+        listToReturn = [\
+            eval(x, {"state" : state, "action" : action, "axiluraryInformation" : axiluraryInformation, "np" : np}) \
+            for x in self.quantitativeMonitorSubFormulas];
+        ensures(isinstance(listToReturn, list));
+        ensures(len(listToReturn) == len(self.quantitativeMonitorSubFormulas));
+        return listToReturn;
+
+    def getMin_evaluate_quantitativeMonitorSubFormulas(self, state, action):
+        # This function handles the edge case where self.quantitativeMonitorSubFormulas is an
+        # empty list - it helps avoid silly errors that might result from the more
+        # straightforward use of min(self.evaluate_quantitativeMonitorSubFormulas(state, action))
+        # at various locations.
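+        # For example (illustrative numbers only): with the single subformula
+        # "0.05 - state[0]" and state[0] == 0.10, this returns -0.05, which counts as a
+        # monitor violation; with state[0] == 0.01 it returns 0.04, which does not.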
+        if(len(self.quantitativeMonitorSubFormulas) == 0):
+            return 0.0; # NOTE: we consider the monitor to be violated when the value from
+                # the quantitative monitor is negative, so returning zero does not constitute
+                # a monitor violation.
+        else:
+            return min(self.evaluate_quantitativeMonitorSubFormulas(state, action));
+        raise Exception("Control should never reach here");
+        return;
+
+
+    def reset(self):
+        """
+        Reset environment back to original state.
+        """
+        self._action = np.array([0,0]);
+        self._wrapped_env._state = self._action;
+        self._state = self._wrapped_env.get_initial_state
+        self._wrapped_env._state = self._state;
+        observation = self.state_to_observation(self._state)
+
+        # Reset renderer if available
+        if self._wrapped_env._renderer is not None:
+            self._wrapped_env._renderer.reset()
+
+        return observation
+
+
+    def helper_step(self, action):
+        """
+        Move one iteration forward in simulation.
+        """
+        if action[0] < 0:  # Only allow forward direction
+            action[0] = 0
+        nextstate = self._wrapped_env._model.state_transition(self._state, action,
+                self._wrapped_env._dt)
+        self._state = nextstate
+        # Notice that below we use the state_to_observation and get_reward functions defined
+        # in this class, as opposed to the ones defined in self._wrapped_env - hence the need
+        # to reimplement this function (helper_step) as opposed to simply calling
+        # self._wrapped_env.step .
+        reward, info = self.get_reward(nextstate, action)
+        observation = self.state_to_observation(nextstate)
+        return Step(observation=observation, reward=reward, done=False,
+                dist=info['dist'], vel=info['vel'], kappa=self._wrapped_env._model.kappa)
+
+
+    def step(self, action):
+        if(self.fallbackController != None):
+            monitorHasBeenViolated = (\
+                self.getMin_evaluate_quantitativeMonitorSubFormulas(self._wrapped_env._state, action) < 0.0);
+            if(monitorHasBeenViolated):
+                action = self.fallbackController(self._wrapped_env._state);
+        self._action = action;
+        # TODO: consider whether we should also set self._wrapped_env._action, or make
+        # the option of whether or not to do that a variable passed in to the
+        # init function of this class.
+        return self.helper_step(action);
+
+
+    def get_reward(self, state, action):
+        reward, info = self._wrapped_env.get_reward(state, action);
+        if(self.weightForQuantMonitorValueInReward != 0.0): # this conditional prevents
+                # unnecessary computation, but is not strictly needed.
+            minimumQuantMonitorValue = self.getMin_evaluate_quantitativeMonitorSubFormulas(state, action);
+            reward = reward + self.weightForQuantMonitorValueInReward * minimumQuantMonitorValue;
+        return reward, info;
+
+
+    def state_to_observation(self, state):
+        originalObs = self._wrapped_env.state_to_observation(state);
+        if(self.useQuantMonitorSubformulasAsFeatures):
+            quantMonitorInput = np.array(self.evaluate_quantitativeMonitorSubFormulas(state, self._action));
+            originalObs = np.concatenate([originalObs, quantMonitorInput]);
+        return originalObs;
+
+
+    @property
+    def observation_space(self):
+        """
+        Define the shape of the input vector to the neural network.
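+        When useQuantMonitorSubformulasAsFeatures is True, the dimension is the wrapped
+        environment's observation dimension plus len(self.quantitativeMonitorSubFormulas).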
+ """ + dimensionOfOriginalObservationSpace = self._wrapped_env.observation_space.flat_dim; + if(not self.useQuantMonitorSubformulasAsFeatures): + return Box(low=-np.inf, high=np.inf, shape=(\ + dimensionOfOriginalObservationSpace,)); + else: + return Box(low=-np.inf, high=np.inf, shape=(\ + dimensionOfOriginalObservationSpace +len(self.quantitativeMonitorSubFormulas))); + raise Exception("Control should never reach here"); + return; + + + @property + def get_initial_state(self): + state = self._wrapped_env.get_initial_state; + # NOTE: Setting the two state variables below are important for calculating + # the acceleration fed into the quantitative-monitor subformulas. See + # the function evaluate_quantitativeMonitorSubFormulas + self._state = state; + self._wrapped_env._state = state; + return state + + + def get_action(observation): + return self._wrapped_env.get_action(observation); + + diff --git a/test/test_monitorEncorporated_straight.py b/test/test_monitorEncorporated_straight.py new file mode 100644 index 0000000..6c33610 --- /dev/null +++ b/test/test_monitorEncorporated_straight.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on test_monitorEncorporated_straight.py by edwardahn + +Demonstrations of testing with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. +""" + +import argparse +import sys + +import joblib +import matplotlib.pyplot as plt +import numpy as np + +from rllab.misc import tensor_utils + +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; +from aa_simulation.envs.straight.straight_env import StraightEnv; +from aa_simulation.misc.utils import normalize_angle + + +def rollout(env, agent, line_params, max_path_length=np.inf, + animated=False): + """ + Modified rollout function from rllab.sampler.utils to run + arbitrary straight trajectories. + """ + observations = [] + rewards = [] + actions = [] + agent_infos = [] + env_infos = [] + + projected_trajectory = [] + x0, y0, angle = line_params + env.reset() + agent.reset() + + # Force start state to be zeros + # Note: Because env is an instance of NormalizedEnv, there is no + # way of writing a custom function that I can use to set the + # initial state. Consequently we just force set it here. 
+ start_yaw = angle + start_state = np.array([x0, y0, start_yaw, 0, 0, 0]) + env._wrapped_env._state = start_state + o = np.zeros(5) + + path_length = 0 + if animated: + env.render() + print('--------------------') + while path_length < max_path_length: + print('') + state = env._wrapped_env._state + print('State = ', state) + projected_o = StraightEnv.project_line(state, x0, y0, angle) + print('Projected state = ', projected_o) + _, agent_info = agent.get_action(env.state_to_observation(projected_o)); + a = agent_info['mean'] + print('Computed action = ', a) + next_o, r, d, env_info = env.step(a) + print('Next observation = ', next_o) + observations.append(env.observation_space.flatten(o)) + rewards.append(r) + actions.append(env.action_space.flatten(a)) + agent_infos.append(agent_info) + env_infos.append(env_info) + projected_trajectory.append(projected_o) + path_length += 1 + if d: + break + o = next_o + if animated: + env.render() + print('--------------------') + + return dict( + observations=tensor_utils.stack_tensor_list(observations), + actions=tensor_utils.stack_tensor_list(actions), + rewards=tensor_utils.stack_tensor_list(rewards), + agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), + env_infos=tensor_utils.stack_tensor_dict_list(env_infos), + ), projected_trajectory + + +def plot_trajectories(trajectory1, trajectory2): + """ + Plot trajectory of unprojected path and projected path. + """ + y1 = trajectory1[:,0] + y2 = trajectory2[:,0] + t = np.arange(len(y1)) + + diff = abs(y2 - y1) + max_diff = max(diff) + mean_diff = np.mean(diff) + print('\nMaximum absolute difference =\t', max_diff) + print('Mean absolute difference =\t', mean_diff) + + plt.figure() + plt.title('Trajectories: Relative y-values') + plt.xlabel('Time step') + plt.ylabel('y (m)') + plt.plot(t, y1, 'b', t, y2, 'r') + plt.legend(['Unprojected', 'Projected']) + plt.show() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('file', type=str, + help='path to the snapshot file') + parser.add_argument('--max_path_length', type=int, default=100, + help='Max length of rollout') + parser.add_argument('--render', dest='render', + action='store_true', help='Rendering') + parser.add_argument('--no-render', dest='render', + action='store_false', help='Rendering') + parser.set_defaults(render=False) + args = parser.parse_args() + return args + + +def main(): + args = parse_arguments() + data = joblib.load(args.file) + policy = data['policy'] + env = data['env'] + plt.ion() + + + #np.set_printoptions(precision=4, suppress=True) + + + # Set fixed random seed + np.random.seed(9) + + # Sample rollouts with different projections (change line_params2) + line_params1 = np.array([0, 0, 0]) + line_params2 = np.array([3, 0, np.pi/2]) + path1, projected_states1 = rollout(env, policy, line_params1, + max_path_length=args.max_path_length, animated=args.render) + path2, projected_states2 = rollout(env, policy, line_params2, + max_path_length=args.max_path_length, animated=args.render) + + # Plot projected trajectories on graph + projected_states1 = np.array(projected_states1) + projected_states2 = np.array(projected_states2) + plot_trajectories(projected_states1, projected_states2) + + # Block until key is pressed + sys.stdout.write("Press to continue: ") + input() + + +# if __name__ == "__main__": +main() diff --git a/train/train_monitorEncorporated_circle_planner.py b/train/train_monitorEncorporated_circle_planner.py new file mode 100644 index 0000000..2a8f95d --- /dev/null +++ 
b/train/train_monitorEncorporated_circle_planner.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on train_straight_planner.py by edwardahn + +Demonstrations of training with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. +""" + +import argparse + +import joblib +import lasagne.init as LI +import lasagne.layers as L +import lasagne.nonlinearities as LN +import numpy as np + +from rllab.algos.trpo import TRPO +from rllab.core.lasagne_layers import ParamLayer +from rllab.core.lasagne_powered import LasagnePowered +from rllab.core.network import MLP +# from rllab.envs.base import Env +from rllab.misc import ext, logger +from rllab.misc.instrument import run_experiment_lite, VariantGenerator +from rllab.misc.resolve import load_class +from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy +from sandbox.cpo.baselines.linear_feature_baseline import LinearFeatureBaseline + +from aa_simulation.envs.circle.circle_env import CircleEnv +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; + +# Pre-trained policy and baseline +policy = None +baseline = None + + + +from rllab.policies.base import Policy; + + +# We keep in the below dead-code as a reminder that the ideal way the +# fallback-controller would be specified is with an instance of the policy class. +""" +class DummbyFallbackPolicy(Policy): + def get_action(self, observation): + return np.array([0,0]); +""" + + + +def run_task(vv, log_dir=None, exp_name=None): + global policy + global baseline + + trpo_stepsize = 0.01 + trpo_subsample_factor = 0.2 + + # Check if variant is available + if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: + raise ValueError('Unrecognized model type for simulating robot') + if vv['robot_type'] not in ['MRZR', 'RCCar']: + raise ValueError('Unrecognized robot type') + + + # Load environment + baseEnv = CircleEnv( + target_velocity=vv['target_velocity'], + radius=vv['radius'], + dt=vv['dt'], + model_type=vv['model_type'], + robot_type=vv['robot_type'] + ) + env = MonitorEncorporatedEnv(\ + wrapped_env = baseEnv, \ + quantitativeMonitorSubFormulas = vv["quantitativeMonitorSubFormulas"], \ + weightForQuantMonitorValueInReward = vv["weightForQuantMonitorValueInReward"], \ + codeForFallbackController = vv["codeForFallbackController"], \ + useQuantMonitorSubformulasAsFeatures = vv["useQuantMonitorSubformulasAsFeatures"]\ + ); + + # Save variant information for comparison plots + variant_file = logger.get_snapshot_dir() + '/variant.json' + logger.log_variant(variant_file, vv) + + # Set variance for each action component separately for exploration + # Note: We set the variance manually because we are not scaling our + # action space during training. + init_std_speed = vv['target_velocity'] / 4 + init_std_steer = np.pi / 6 + init_std = [init_std_speed, init_std_steer] + + # Build policy and baseline networks + # Note: Mean of policy network set to analytically computed values for + # faster training (rough estimates for RL to fine-tune). + if policy is None or baseline is None: + wheelbase = 0.257 + target_velocity = vv['target_velocity'] + target_steering = np.arctan(wheelbase / vv['radius']) # CCW + output_mean = np.array([target_velocity, target_steering]) + hidden_sizes = (32, 32) + + # In mean network, allow output b values to dominate final output + # value by constraining the magnitude of the output W matrix. This is + # to allow faster learning. 
These numbers are arbitrarily chosen. + W_gain = min(vv['target_velocity'] / 5, np.pi / 15) + + mean_network = MLP( + input_shape=(env.spec.observation_space.flat_dim,), + output_dim=env.spec.action_space.flat_dim, + hidden_sizes=hidden_sizes, + hidden_nonlinearity=LN.tanh, + output_nonlinearity=None, + output_W_init=LI.GlorotUniform(gain=W_gain), + output_b_init=output_mean + ) + policy = GaussianMLPPolicy( + env_spec=env.spec, + hidden_sizes=hidden_sizes, + init_std=init_std, + mean_network=mean_network + ) + baseline = LinearFeatureBaseline( + env_spec=env.spec, + target_key='returns' + ) + + # Reset variance to re-enable exploration when using pre-trained networks + else: + policy._l_log_std = ParamLayer( + policy._mean_network.input_layer, + num_units=env.spec.action_space.flat_dim, + param=LI.Constant(np.log(init_std)), + name='output_log_std', + trainable=True + ) + obs_var = policy._mean_network.input_layer.input_var + mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std]) + policy._log_std_var = log_std_var + LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) + policy._f_dist = ext.compile_function( + inputs=[obs_var], + outputs=[mean_var, log_std_var] + ) + + algo = TRPO( + env=env, + policy=policy, + baseline=baseline, + batch_size=600, + max_path_length=env.horizon, + n_itr=600, + discount=0.99, + step_size=trpo_stepsize, + plot=False, + ) + algo.train() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--network', type=str, + help='Path to snapshot file of pre-trained network') + args = parser.parse_args() + return args + + +def main(): + global policy + global baseline + + # Load pre-trained network if available + args = parse_arguments() + if args.network is not None: + data = joblib.load(args.network) + policy = data['policy'] + baseline = data['baseline'] + use_pretrained = True + else: + use_pretrained = False + + # Run multiple experiment variants at once + vg = VariantGenerator() + + # Non-configurable parameters (do not change) + vg.add('trajectory', ['Circle']) + vg.add('objective', ['TargetVelocity']) + vg.add('algo', ['TRPO']) + + + targetVelocity = 1.0 + epsilonValue = 0.05; + + # Configurable parameters + # Options for model_type: 'BrushTireModel', 'LinearTireModel' + # Options for robot_type: 'MRZR', 'RCCar' + seeds = [100, 200] + robot_type = 'RCCar' + use_ros = False + vg.add('seed', seeds) + vg.add('target_velocity', [targetVelocity]) + vg.add('radius', [1.0]) + vg.add('dt', [0.1]) + vg.add('eps', [epsilonValue]) + vg.add('model_type', ['BrushTireModel']) + vg.add('robot_type', [robot_type]) + # We are mostly uninterested in the negative values below, since they would encourage + # violating the monitor.... + vg.add('weightForQuantMonitorValueInReward', [0.0, 2.0, 0.125, -0.125, 1.0, 0.25, 0.5, -2.0]); + vg.add('useQuantMonitorSubformulasAsFeatures', [True, False]); + + quantMonitorSubformula_carDrivingAwayFromLine = str(epsilonValue) + "- state[0]"; # I.e., if more than + # epsilon away from the target raduis, then the quantitative monitor is violated. + quantitativeMonitorSubFormulas = \ + [quantMonitorSubformula_carDrivingAwayFromLine]; + + codeForFallbackController = """ +import numpy as np; +def fallbackController(observation): + deltaRaduis = observation[0] + + amountToSteerOffCenter = 0.1; # very slight to avoid oversteering... but completely + # fails to consider drifting. 
As such, this is VERY MUCH a first-swing attempt + # with a fallback controller that we would not actually use. + velocityToUse = """ + str(targetVelocity) + """; + + # steers toward the line + steeringAngle = amountToSteerOffCenter * np.sign(deltaRaduis); # If we are outside the circle, we + # stear toward the inside, if we are on the inside, we steer toward the outside.... + return np.array([steeringAngle, velocityToUse]); +"""; + + vg.add("codeForFallbackController", [codeForFallbackController, None]); + vg.add("quantitativeMonitorSubFormulas", [quantitativeMonitorSubFormulas, []]); + + print('Number of Configurations: ', len(vg.variants())) + + # Run each experiment variant + indexOfStartVariant = 126; + for vv in vg.variants(): + indexOfStartVariant = indexOfStartVariant - 1; + print("indexOfStartVariant:" + str(indexOfStartVariant), flush=True); + if(indexOfStartVariant > 0): + continue; + run_experiment_lite( + stub_method_call=run_task, + variant=vv, + n_parallel=1, + snapshot_mode='last', + seed=vv['seed'] + ) + + +if __name__ == '__main__': + main() + + + + + + diff --git a/train/train_monitorEncorporated_straight_planner.py b/train/train_monitorEncorporated_straight_planner.py new file mode 100644 index 0000000..c7d1c47 --- /dev/null +++ b/train/train_monitorEncorporated_straight_planner.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: DBayani +largely based on train_straight_planner.py by edwardahn + +Demonstrations of training with MonitorEncorporatedEnv +environment wrapped around the straight vehicle environment. +""" + +import argparse + +import joblib +import lasagne.init as LI +import lasagne.layers as L +import lasagne.nonlinearities as LN +import numpy as np + +from rllab.algos.trpo import TRPO +from rllab.core.lasagne_layers import ParamLayer +from rllab.core.lasagne_powered import LasagnePowered +from rllab.core.network import MLP +# from rllab.envs.base import Env +from rllab.misc import ext, logger +from rllab.misc.instrument import run_experiment_lite, VariantGenerator +from rllab.misc.resolve import load_class +from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy +from sandbox.cpo.baselines.linear_feature_baseline import LinearFeatureBaseline + +from aa_simulation.envs.straight.straight_env import StraightEnv +from aa_simulation.envs.monitorEncorporated_env import MonitorEncorporatedEnv; + +# Pre-trained policy and baseline +policy = None +baseline = None + + + +from rllab.policies.base import Policy; + + +# We keep in the below dead-code as a reminder that the ideal way the +# fallback-controller would be specified is with an instance of the policy class. 
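+# (That approach runs into the same pickling limitations described in the
+# MonitorEncorporatedEnv docstring, hence the code-string workaround used below.)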
+""" +class DummbyFallbackPolicy(Policy): + def get_action(self, observation): + return np.array([0,0]); +""" + + + +def run_task(vv, log_dir=None, exp_name=None): + global policy + global baseline + + # Check if variant is available + if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']: + raise ValueError('Unrecognized model type for simulating robot') + if vv['robot_type'] not in ['MRZR', 'RCCar']: + raise ValueError('Unrecognized robot type') + + + # Load environment + baseEnv = StraightEnv( + target_velocity=vv['target_velocity'], + dt=vv['dt'], + model_type=vv['model_type'], + robot_type=vv['robot_type'] + ); + env = MonitorEncorporatedEnv(\ + wrapped_env = baseEnv, \ + quantitativeMonitorSubFormulas = vv["quantitativeMonitorSubFormulas"], \ + weightForQuantMonitorValueInReward = vv["weightForQuantMonitorValueInReward"], \ + codeForFallbackController = vv["codeForFallbackController"], \ + useQuantMonitorSubformulasAsFeatures = vv["useQuantMonitorSubformulasAsFeatures"]\ + ); + + # Save variant information for comparison plots + variant_file = logger.get_snapshot_dir() + '/variant.json' + logger.log_variant(variant_file, vv) + + # Set variance for each action component separately for exploration + # Note: We set the variance manually because we are not scaling our + # action space during training. + init_std_speed = vv['target_velocity'] / 4 + init_std_steer = np.pi / 6 + init_std = [init_std_speed, init_std_steer] + + # Build policy and baseline networks + # Note: Mean of policy network set to analytically computed values for + # faster training (rough estimates for RL to fine-tune). + if policy is None or baseline is None: + target_velocity = vv['target_velocity'] + target_steering = 0 + output_mean = np.array([target_velocity, target_steering]) + hidden_sizes = (32, 32) + + # In mean network, allow output b values to dominate final output + # value by constraining the magnitude of the output W matrix. This is + # to allow faster learning. These numbers are arbitrarily chosen. + W_gain = min(vv['target_velocity'] / 5, np.pi / 15) + + + # Thankfully, the use of env.spec.observation_space.flat_dim should take care of + # having to specify the input dimension... 
+ mean_network = MLP( + input_shape=(env.spec.observation_space.flat_dim,), + output_dim=env.spec.action_space.flat_dim, + hidden_sizes=hidden_sizes, + hidden_nonlinearity=LN.tanh, + output_nonlinearity=None, + output_W_init=LI.GlorotUniform(gain=W_gain), + output_b_init=output_mean + ) + policy = GaussianMLPPolicy( + env_spec=env.spec, + hidden_sizes=(32, 32), + init_std=init_std, + mean_network=mean_network + ) + baseline = LinearFeatureBaseline(env_spec=env.spec) + + # Reset variance to re-enable exploration when using pre-trained networks + else: + policy._l_log_std = ParamLayer( + policy._mean_network.input_layer, + num_units=env.spec.action_space.flat_dim, + param=LI.Constant(np.log(init_std)), + name='output_log_std', + trainable=True + ) + obs_var = policy._mean_network.input_layer.input_var + mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std]) + policy._log_std_var = log_std_var + LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std]) + policy._f_dist = ext.compile_function( + inputs=[obs_var], + outputs=[mean_var, log_std_var] + ) + + algo = TRPO( + env=env, + policy=policy, + baseline=baseline, + batch_size=600, + max_path_length=env.horizon, + n_itr=600, + discount=0.99, + step_size=0.01, + plot=False, + ) + algo.train() + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--network', type=str, + help='Path to snapshot file of pre-trained network') + args = parser.parse_args() + return args + + +def main(): + global policy + global baseline + + # Load pre-trained network if available + args = parse_arguments() + if args.network is not None: + data = joblib.load(args.network) + policy = data['policy'] + baseline = data['baseline'] + use_pretrained = True + else: + use_pretrained = False + + # Run multiple experiment variants at once + vg = VariantGenerator() + + # Non-configurable parameters (do not change) + vg.add('trajectory', ['Straight']) + vg.add('objective', ['TargetVelocity']) + vg.add('algo', ['TRPO']) + + + targetVelocity = 1.0 + + # Configurable parameters + # Options for model_type: 'BrushTireModel', 'LinearTireModel' + # Options for robot_type: 'MRZR', 'RCCar' + seeds = [100, 200] + robot_type = 'RCCar' + use_ros = False + vg.add('seed', seeds) + vg.add('target_velocity', [targetVelocity]) + vg.add('dt', [0.1]) + vg.add('model_type', ['BrushTireModel']) + vg.add('robot_type', [robot_type]) + # We are mostly uninterested in the negative values below, since they would encourage + # violating the monitor.... + vg.add('weightForQuantMonitorValueInReward', [0.0, 2.0, 0.125, -0.125, 1.0, 0.25, 0.5, -2.0]); + vg.add('useQuantMonitorSubformulasAsFeatures', [True, False]); + + # Original monitor code: "action[1] * (np.sign(state[0]) + np.sign(state[1])) > 0" + # This monitor is violated when the car drives away from y=0... + quantMonitorSubformula_carDrivingAwayFromLine = "-action[1] * (np.sign(state[0]) + np.sign(state[1]))"; + quantitativeMonitorSubFormulas = \ + [quantMonitorSubformula_carDrivingAwayFromLine]; + + codeForFallbackController = """ +import numpy as np; +def fallbackController(observation): + y = observation[0] + yaw = observation[1] + + amountToSteerOffCenter = 0.1; # very slight to avoid oversteering... but completely + # fails to consider drifting. As such, this is VERY MUCH a first-swing attempt + # with a fallback controller that we would not actually use. 
+ velocityToUse = """ + str(targetVelocity) + """; + + # steers toward the line + steeringAngle = amountToSteerOffCenter * (np.sign(y) + np.sign(yaw)); + return np.array([steeringAngle, velocityToUse]); +"""; + + vg.add("codeForFallbackController", [codeForFallbackController, None]); + vg.add("quantitativeMonitorSubFormulas", [quantitativeMonitorSubFormulas, []]); + + print('Number of Configurations: ', len(vg.variants())) + + # Run each experiment variant + for vv in vg.variants(): + run_experiment_lite( + stub_method_call=run_task, + variant=vv, + n_parallel=2, + snapshot_mode='last', + seed=vv['seed'] + ) + + +if __name__ == '__main__': + main() +