From e99dacaceac6c5893e5f03f7cc6559511f51ff22 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Mon, 5 Apr 2021 08:48:09 +0000 Subject: [PATCH 1/9] updated and added continuous and test environment --- behavior_studio/brains/f1rl/sampleTest.py | 104 +++++ .../brains/f1rl/trainContinuous.py | 123 +++++ behavior_studio/brains/f1rl/utils/ddpg.py | 0 .../brains/f1rl/utils/settingsDDPG.py | 86 ++++ behavior_studio/default-testrl.yml | 55 +++ gym-gazebo/gym_gazebo/__init__.py | 7 + gym-gazebo/gym_gazebo/envs/f1/__init__.py | 3 + .../envs/f1/env_gazebo_f1_ddpg_camera.py | 442 ++++++++++++++++++ 8 files changed, 820 insertions(+) create mode 100755 behavior_studio/brains/f1rl/sampleTest.py create mode 100755 behavior_studio/brains/f1rl/trainContinuous.py create mode 100644 behavior_studio/brains/f1rl/utils/ddpg.py create mode 100644 behavior_studio/brains/f1rl/utils/settingsDDPG.py create mode 100644 behavior_studio/default-testrl.yml create mode 100755 gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py diff --git a/behavior_studio/brains/f1rl/sampleTest.py b/behavior_studio/brains/f1rl/sampleTest.py new file mode 100755 index 00000000..1aa46b4b --- /dev/null +++ b/behavior_studio/brains/f1rl/sampleTest.py @@ -0,0 +1,104 @@ +import time +from datetime import datetime +import pickle +import gym +from brains.f1rl.utils import liveplot +import gym_gazebo +import numpy as np +from gym import logger, wrappers +from brains.f1rl.utils.qlearn import QLearn +import brains.f1rl.utils.settings as settings +from brains.f1rl.utils.settings import actions_set + + +def render(): + render_skip = 0 # Skip first X episodes. + render_interval = 50 # Show render Every Y episodes. + render_episodes = 10 # Show Z episodes every rendering. + + if (episode % render_interval == 0) and (episode != 0) and (episode > render_skip): + env.render() + elif ((episode - render_episodes) % render_interval == 0) and (episode != 0) and (episode > render_skip) and \ + (render_episodes < episode): + env.render(close=True) + +def save_model(qlearn): + # Tabular RL: Tabular Q-learning basically stores the policy (Q-values) of the agent into a matrix of shape + # (S x A), where s are all states, a are all the possible actions. After the environment is solved, just save this + # matrix as a csv file. I have a quick implementation of this on my GitHub under Reinforcement Learning. 
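As a side note, the CSV variant described in the comment above is simple when the Q-values are held as a dense (S x A) NumPy matrix instead of the pickled dictionary used below; a minimal standalone sketch, with hypothetical sizes:

    import numpy as np

    n_states, n_actions = 64, 6                  # hypothetical sizes; this script uses 6 actions
    q_table = np.zeros((n_states, n_actions))    # dense (S x A) matrix of Q-values
    np.savetxt('q_table.csv', q_table, delimiter=',')   # save the solved policy as CSV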
+ from datetime import datetime + import pickle + date = datetime.now() + format = date.strftime("%Y%m%d_%H%M%S") + file_name = "_qlearn_model_e_{}_a_{}_g_{}".format(qlearn.epsilon, qlearn.alpha, qlearn.gamma) + if os.path.isdir('brains/f1rl/logs') is False: + os.mkdir('brains/f1rl/logs') + if os.path.isdir('brains/f1rl/logs/qlearn_models/') is False: + os.mkdir('brains/f1rl/logs/qlearn_models/') + file = open("brains/f1rl/logs/qlearn_models/" + format + file_name + '.pkl', 'wb') + pickle.dump(qlearn.q, file) + +# if __name__ == '__main__': +print(settings.title) +print(settings.description) + +env = gym.make('GazeboF1CameraEnvDDPG-v0') + +outdir = './logs/f1_qlearn_gym_experiments/' +stats = {} # epoch: steps + + +env = gym.wrappers.Monitor(env, outdir, force=True) +plotter = liveplot.LivePlot(outdir) +last_time_steps = np.ndarray(0) +actions = 6 +stimate_step_per_lap = 4000 +lap_completed = False + +qlearn = QLearn(actions=actions, alpha=0.2, gamma=0.9, epsilon=0.99) + +highest_reward = 0 +initial_epsilon = qlearn.epsilon + +total_episodes = settings.total_episodes +epsilon_discount = settings.epsilon_discount # Default 0.9986 + +start_time = time.time() + +print(settings.lets_go) + +for episode in range(total_episodes): + done = False + lap_completed = False + cumulated_reward = 0 # Should going forward give more reward then L/R z? + + print("Env Reset Calling!") + + observation = env.reset() + + print("Env Resetted!") + + state = observation + + for step in range(20000): + + # Execute the action and get feedback + observation, reward, done, info = env.step(env.action_space.sample()) + cumulated_reward += reward + + if highest_reward < cumulated_reward: + highest_reward = cumulated_reward + + state = observation + + if not done: + break + + if stimate_step_per_lap > 4000 and not lap_completed: + print("LAP COMPLETED!!") + lap_completed = True + + if episode % 100 == 0 and settings.plotter_graphic: + plotter.plot_steps_vs_epoch(stats) + +env.close() diff --git a/behavior_studio/brains/f1rl/trainContinuous.py b/behavior_studio/brains/f1rl/trainContinuous.py new file mode 100755 index 00000000..858b43e4 --- /dev/null +++ b/behavior_studio/brains/f1rl/trainContinuous.py @@ -0,0 +1,123 @@ +import time +from datetime import datetime +import pickle +import gym +from brains.f1rl.utils import liveplot +import gym_gazebo +import numpy as np +from gym import logger, wrappers +from brains.f1rl.utils.ddpg import DDPGAgent +import brains.f1rl.utils.settingsDDPG as settings + + +def render(): + render_skip = 0 # Skip first X episodes. + render_interval = 50 # Show render Every Y episodes. + render_episodes = 10 # Show Z episodes every rendering. + + if (episode % render_interval == 0) and (episode != 0) and (episode > render_skip): + env.render() + elif ((episode - render_episodes) % render_interval == 0) and (episode != 0) and (episode > render_skip) and \ + (render_episodes < episode): + env.render(close=True) + +def save_model(agent): + # Tabular RL: Tabular Q-learning basically stores the policy (Q-values) of the agent into a matrix of shape + # (S x A), where s are all states, a are all the possible actions. After the environment is solved, just save this + # matrix as a csv file. I have a quick implementation of this on my GitHub under Reinforcement Learning. 
+ from datetime import datetime + import pickle + date = datetime.now() + format = date.strftime("%Y%m%d_%H%M%S") + file_name = "_ddpg_model_g_{}_t_{}_h_{}".format(agent.gamma, agent.tau, agent.hidden_size) + if os.path.isdir('brains/f1rl/logs') is False: + os.mkdir('brains/f1rl/logs') + if os.path.isdir('brains/f1rl/logs/ddpg_models/') is False: + os.mkdir('brains/f1rl/logs/ddpg_models/') + file_actor = open("brains/f1rl/logs/ddpg_models/" + format + file_name + '_actor.pkl', 'wb') + file_critic = open("brains/f1rl/logs/ddpg_models/" + format + file_name + '_critic.pkl', 'wb') + pickle.dump(agent.get_actor_weights(), file_actor) + pickle.dump(agent.get_critic_weights(), file_critic) + +# if __name__ == '__main__': +print(settings.title) +print(settings.description) + +current_env = settings.current_env +if current_env == "laser": + env = gym.make('GazeboF1QlearnLaserEnv-v0') +elif current_env == "camera": + env = gym.make('GazeboF1QlearnCameraEnvContinuous-v0') +else: + print("NO correct env selected") + + +outdir = './logs/f1_ddpg_gym_experiments/' +stats = {} # epoch: steps + + +env = gym.wrappers.Monitor(env, outdir, force=True) +plotter = liveplot.LivePlot(outdir) +last_time_steps = np.ndarray(0) +stimate_step_per_lap = 4000 +lap_completed = False + +agent = DDPGAgent(env, settings.hidden_size, settings.gamma, settings.tau) + +if settings.load_model: + exit(1) + + qlearn_file = open('logs/qlearn_models/20200826_154342_qlearn_model_e_0.988614_a_0.2_g_0.9.pkl', 'rb') + model = pickle.load(qlearn_file) + print("Number of (action, state): {}".format(len(model))) + qlearn.q = model + qlearn.alpha = settings.alpha + qlearn.gamma = settings.gamma + qlearn.epsilon = settings.epsilon + highest_reward = 4000 +else: + highest_reward = 0 + +total_episodes = settings.total_episodes + +start_time = time.time() + +print(settings.lets_go) + +for episode in range(total_episodes): + done = False + lap_completed = False + cumulated_reward = 0 # Should going forward give more reward then L/R z? + observation = env.reset() + + state = observation + + for step in range(20000): + + + + if stimate_step_per_lap > 4000 and not lap_completed: + print("LAP COMPLETED!!") + lap_completed = True + + if episode % 100 == 0 and settings.plotter_graphic: + plotter.plot_steps_vs_epoch(stats) + + if episode % 1000 == 0 and settings.save_model: + print("\nSaving model . . 
.\n") + save_model(agent) + + m, s = divmod(int(time.time() - start_time), 60) + h, m = divmod(m, 60) + print ("EP: " + str(episode + 1) + " - Reward: " + str( + cumulated_reward) + " - Time: %d:%02d:%02d" % (h, m, s) + " - steps: " + str(step)) + +print ("\n|" + str(total_episodes) + "|" + str(agent.gamma) + "|" + str(agent.tau) + "|" + str(highest_reward) + "| PICTURE |") + +l = last_time_steps.tolist() +l.sort() + +print("Overall score: {:0.2f}".format(last_time_steps.mean())) +print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:]))) + +env.close() diff --git a/behavior_studio/brains/f1rl/utils/ddpg.py b/behavior_studio/brains/f1rl/utils/ddpg.py new file mode 100644 index 00000000..e69de29b diff --git a/behavior_studio/brains/f1rl/utils/settingsDDPG.py b/behavior_studio/brains/f1rl/utils/settingsDDPG.py new file mode 100644 index 00000000..faaf5ef1 --- /dev/null +++ b/behavior_studio/brains/f1rl/utils/settingsDDPG.py @@ -0,0 +1,86 @@ +# Global variables + +import numpy as np +from utils.configuration import Config + +app_configuration = Config('default-rlddpg.yml') + +debug_level = 0 +telemetry = False +telemetry_mask = False +plotter_graphic = True +my_board = True +save_model = True +load_model = False + +tau = app_configuration.tau +gamma = app_configuration.gamma +hidden_size = app_configuration.hidden_size +total_episodes = app_configuration.total_episodes + +gazebo_positions_set = app_configuration.gazebo_positions_set + +current_env = app_configuration.env + +# === POSES === +if gazebo_positions_set == "pista_simple": + gazebo_positions = [(0, 53.462, -41.988, 0.004, 0, 0, 1.57, -1.57), + (1, 53.462, -8.734, 0.004, 0, 0, 1.57, -1.57), + (2, 39.712, -30.741, 0.004, 0, 0, 1.56, 1.56), + (3, -6.861, -36.481, 0.004, 0, 0.01, -0.858, 0.613), + (4, 20.043, 37.130, 0.003, 0, 0.103, -1.4383, -1.4383)] + +elif gazebo_positions_set == "nurburgring": + gazebo_positions = [(0, -23.0937, -2.9703, 0, 0.0050, 0.0013, -0.9628, 0.2699), + (1, -32.3188, 12.2921, 0, 0.0014, 0.0049, -0.2727, 0.9620), + (2, -17.4155, -24.1243, 0, 0.0001, 0.0051, -0.0192, 1), + (3, 31.3967, -4.6166, 0, 0.0030, 0.0041, 0.6011, 0.7991), + (4, -56.1261, 4.1047, 0, 0.0043, -0.0027, -0.8517, -0.5240)] + +# === CAMERA === +# Images size +width = 640 +height = 480 +center_image = width/2 + +# Coord X ROW +# x_row = [250, 300, 350, 400, 450] +# x_row = [250, 450] +x_row = [350] + +# Maximum distance from the line +ranges = [300, 280, 250] # Line 1, 2 and 3 +reset_range = [-40, 40] +last_center_line = 0 + +lets_go = ''' + ______ __ + / ____/___ / / + / / __/ __ \/ / +/ /_/ / /_/ /_/ +\____/\____(_) +''' + +description = ''' + ___ ___ ____ ____ +| _ \| _ \| _ \ / ___| +| | \ | | \ | (_| | / _ +| |_/ | |_/ | __/| \_| | +|____/|____/|_| \____/ + ____ + / ___|__ _ _ __ ___ ___ _ __ __ _ + _____ | | / _` | '_ ` _ \ / _ \ '__/ _` | +|_____| | |__| (_| | | | | | | __/ | | (_| | + \____\__,_|_| |_| |_|\___|_| \__,_| + +''' + +title = ''' + ___ _ ______ _ _ + |_ | | | | ___ \ | | | | + | | __| | ___| |_/ /___ | |__ ___ | |_ + | |/ _` |/ _ \ // _ \| '_ \ / _ \| __| +/\__/ / (_| | __/ |\ \ (_) | |_) | (_) | |_ +\____/ \__,_|\___\_| \_\___/|_.__/ \___/ \__| + +''' diff --git a/behavior_studio/default-testrl.yml b/behavior_studio/default-testrl.yml new file mode 100644 index 00000000..6a49ebb9 --- /dev/null +++ b/behavior_studio/default-testrl.yml @@ -0,0 +1,55 @@ +Behaviors: + Robot: + Sensors: + Cameras: + Camera_0: + Name: 'camera_0' + Topic: '/F1ROS/cameraL/image_raw' + Pose3D: + Pose3D_0: + Name: 
'pose3d_0' + Topic: '/F1ROS/odom' + Actuators: + Motors: + Motors_0: + Name: 'motors_0' + Topic: '/F1ROS/cmd_vel' + MaxV: 12 + MaxW: 1.5 + RL: True + BrainPath: 'brains/f1rl/sampleTest.py' + Type: 'f1rl' + Parameters: + action_set: 'hard' + gazebo_positions_set: 'pista_simple' + alpha: 0.2 + gamma: 0.9 + epsilon: 0.99 + total_episodes: 20000 + epsilon_discount: 0.9986 + env: 'camera' + Simulation: + World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch + Dataset: + In: '/tmp/my_bag.bag' + Out: '' + Stats: + Out: './' + PerfectLap: 'lap-simple-circuit.bag' + Layout: + Frame_0: + Name: frame_0 + Geometry: [1, 1, 2, 2] + Data: rgbimage + Frame_1: + Name: frame_1 + Geometry: [0, 1, 1, 1] + Data: rgbimage + Frame_2: + Name: frame_2 + Geometry: [0, 2, 1, 1] + Data: rgbimage + Frame_3: + Name: frame_3 + Geometry: [0, 3, 3, 1] + Data: rgbimage diff --git a/gym-gazebo/gym_gazebo/__init__.py b/gym-gazebo/gym_gazebo/__init__.py index 77560a27..9c2db093 100755 --- a/gym-gazebo/gym_gazebo/__init__.py +++ b/gym-gazebo/gym_gazebo/__init__.py @@ -26,6 +26,13 @@ # More arguments here ) +# F1 DDPG +register( + id='GazeboF1CameraEnvDDPG-v0', + entry_point='gym_gazebo.envs.f1:GazeboF1CameraEnvDDPG', + # More arguments here +) + # Turtlebot envs register( id='GazeboMazeTurtlebotLidar-v0', diff --git a/gym-gazebo/gym_gazebo/envs/f1/__init__.py b/gym-gazebo/gym_gazebo/envs/f1/__init__.py index 2bfa8a5a..fb630775 100755 --- a/gym-gazebo/gym_gazebo/envs/f1/__init__.py +++ b/gym-gazebo/gym_gazebo/envs/f1/__init__.py @@ -3,3 +3,6 @@ # DQN from gym_gazebo.envs.f1.env_gazebo_f1_dqn_camera import GazeboF1CameraEnvDQN + +# DDPG +from gym_gazebo.envs.f1.env_gazebo_f1_ddpg_camera import GazeboF1CameraEnvDDPG diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py new file mode 100755 index 00000000..30bcd1e6 --- /dev/null +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py @@ -0,0 +1,442 @@ +import os +import random +import sys +import time + +import cv2 +import gym +from gym import spaces +import numpy as np +import roslaunch +import rospkg +import rospy +import skimage as skimage + +from cv_bridge import CvBridge, CvBridgeError +from gazebo_msgs.msg import ModelState +from gazebo_msgs.srv import GetModelState, SetModelState +from geometry_msgs.msg import Twist +from gym import spaces, utils +from gym.utils import seeding +from sensor_msgs.msg import Image, LaserScan +from skimage import color, exposure, transform +from skimage.transform import rotate +from skimage.viewer import ImageViewer +from std_srvs.srv import Empty + +from gym_gazebo.envs import gazebo_env +from agents.f1.settings import telemetry + +# Images size +witdh = 640 +center_image = int(witdh/2) + +# Coord X ROW +x_row = [260, 360, 450] +# Maximum distance from the line +RANGES = [300, 280, 250] # Line 1, 2 and 3 + +RESET_RANGE = [-40, 40] + +# Deprecated? 
+space_reward = np.flip(np.linspace(0, 1, 300)) + +last_center_line = 0 + +font = cv2.FONT_HERSHEY_COMPLEX + +# OUTPUTS +v_lineal = [3, 8, 15] +w_angular = [-1, -0.6, 0, 1, 0.6] + +# POSES +positions = [(0, 53.462, -41.988, 0.004, 0, 0, 1.57, -1.57), + (1, 53.462, -8.734, 0.004, 0, 0, 1.57, -1.57), + (2, 39.712, -30.741, 0.004, 0, 0, 1.56, 1.56), + (3, -7.894, -39.051, 0.004, 0, 0.01, -2.021, 2.021), + (4, 20.043, 37.130, 0.003, 0, 0.103, -1.4383, -1.4383)] + + +class ImageF1: + def __init__(self): + self.height = 3 # Image height [pixels] + self.width = 3 # Image width [pixels] + self.width = 3 # Image width [pixels] + self.timeStamp = 0 # Time stamp [s] */ + self.format = "" # Image format string (RGB8, BGR,...) + self.data = np.zeros((self.height, self.width, 3), np.uint8) # The image data itself + self.data.shape = self.height, self.width, 3 + + def __str__(self): + s = "Image: {\n height: " + str(self.height) + "\n width: " + str(self.width) + s = s + "\n format: " + self.format + "\n timeStamp: " + str(self.timeStamp) + return s + "\n data: " + str(self.data) + "\n}" + + +class GazeboF1CameraEnvDDPG(gazebo_env.GazeboEnv): + """ + Description: + A Formula 1 car has to complete one lap of a circuit following a red line painted on the ground. Initially it + will not use the complete information of the image but some coordinates that refer to the error made with + respect to the center of the line. + Source: + Master's final project at Universidad Rey Juan Carlos. RoboticsLab Urjc. JdeRobot. Author: Ignacio Arranz + Observation: + Type: Array + Num Observation Min Max + ---------------------------------------- + 0 Vel. Lineal (m/s) 1 10 + 1 Vel. Angular (rad/seg) -2 2 + 2 Error 1 -300 300 + 3 Error 2 -280 280 + 4 Error 3 -250 250 + Actions: + Continuous with Max and Min Linear velocity of 12 and 2 m/s respectively, and, Max Absolute Angular Velocity + of 1.5 rad/s + Reward: + The reward depends on the set of the 3 errors. As long as the lowest point is within range, there will still + be steps. If errors 1 and 2 fall outside the range, a joint but weighted reward will be posted, always giving + more importance to the lower one. + Starting State: + The observations will start from different points in order to prevent you from performing the same actions + initially. This variability will enrich the set of state/actions. + Episode Termination: + The episode ends when the lower error is outside the established range (see table in the observation space). 
+ """ + + def __init__(self): + # Launch the simulation with the given launchfile name + gazebo_env.GazeboEnv.__init__(self, "F1Cameracircuit_v0.launch") + self.vel_pub = rospy.Publisher('/F1ROS/cmd_vel', Twist, queue_size=5) + self.unpause = rospy.ServiceProxy('/gazebo/unpause_physics', Empty) + self.pause = rospy.ServiceProxy('/gazebo/pause_physics', Empty) + self.reset_proxy = rospy.ServiceProxy('/gazebo/reset_simulation', Empty) + # self.state_msg = ModelState() + # self.set_state = rospy.ServiceProxy("/gazebo/set_model_state", SetModelState) + self.position = None + self.reward_range = (-np.inf, np.inf) + self._seed() + self.img_rows = 32 + self.img_cols = 32 + self.img_channels = 1 + # self.bridge = CvBridge() + # self.image_sub = rospy.Subscriber("/F1ROS/cameraL/image_raw", Image, self.callback) + self.action_space = self._generate_continuous_action_space() + + def render(self, mode='human'): + pass + + def _gazebo_pause(self): + rospy.wait_for_service('/gazebo/pause_physics') + try: + # resp_pause = pause.call() + self.pause() + except rospy.ServiceException as e: + print("/gazebo/pause_physics service call failed: {}".format(e)) + + def _gazebo_unpause(self): + rospy.wait_for_service('/gazebo/unpause_physics') + try: + self.unpause() + except rospy.ServiceException as e: + print(e) + print("/gazebo/unpause_physics service call failed") + + def _gazebo_reset(self): + # Resets the state of the environment and returns an initial observation. + rospy.wait_for_service('/gazebo/reset_simulation') + try: + # reset_proxy.call() + self.reset_proxy() + self.unpause() + except rospy.ServiceException as e: + print("/gazebo/reset_simulation service call failed: {}".format(e)) + + @staticmethod + def _generate_continuous_action_space(): + + min_ang_speed = -1.5 + max_ang_speed = 1.5 + min_lin_speed = 2 + max_lin_speed = 12 + + action_space = spaces.Box([min_lin_speed, min_ang_speed], [max_lin_speed, max_ang_speed]) + + return action_space + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def show_telemetry(self, img, point_1, point_2, point_3, action, reward): + # Puntos centrales de la imagen (verde) + cv2.line(img, (320, x_row[0]), (320, x_row[0]), (255, 255, 0), thickness=5) + cv2.line(img, (320, x_row[1]), (320, x_row[1]), (255, 255, 0), thickness=5) + cv2.line(img, (320, x_row[2]), (320, x_row[2]), (255, 255, 0), thickness=5) + # Linea diferencia entre punto central - error (blanco) + cv2.line(img, (center_image, x_row[0]), (int(point_1), x_row[0]), (255, 255, 255), thickness=2) + cv2.line(img, (center_image, x_row[1]), (int(point_2), x_row[1]), (255, 255, 255), thickness=2) + cv2.line(img, (center_image, x_row[2]), (int(point_3), x_row[2]), (255, 255, 255), thickness=2) + # Telemetry + cv2.putText(img, str("action1: {} action2: {}".format(action[0], action[1])), (18, 280), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("w ang: {}".format(w_angular)), (18, 300), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("reward: {}".format(reward)), (18, 320), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("err1: {}".format(center_image - point_1)), (18, 340), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("err2: {}".format(center_image - point_2)), (18, 360), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("err3: {}".format(center_image - point_3)), (18, 380), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("pose: {}".format(self.position)), (18, 400), font, 0.4, (255, 255, 255), 1) + + cv2.imshow("Image 
window", img) + cv2.waitKey(3) + + @staticmethod + def set_new_pose(new_pos): + """ + (pos_number, pose_x, pose_y, pose_z, or_x, or_y, or_z, or_z) + """ + pos_number = positions[0] + + state = ModelState() + state.model_name = "f1_renault" + state.pose.position.x = positions[new_pos][1] + state.pose.position.y = positions[new_pos][2] + state.pose.position.z = positions[new_pos][3] + state.pose.orientation.x = positions[new_pos][4] + state.pose.orientation.y = positions[new_pos][5] + state.pose.orientation.z = positions[new_pos][6] + state.pose.orientation.w = positions[new_pos][7] + + rospy.wait_for_service('/gazebo/set_model_state') + + try: + set_state = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) + set_state(state) + except rospy.ServiceException as e: + print("Service call failed: {}".format(e)) + + return pos_number + + @staticmethod + def image_msg_to_image(img, cv_image): + + image = ImageF1() + image.width = img.width + image.height = img.height + image.format = "RGB8" + image.timeStamp = img.header.stamp.secs + (img.header.stamp.nsecs * 1e-9) + image.data = cv_image + + return image + + @staticmethod + def get_center(image_line): + try: + coords = np.divide(np.max(np.nonzero(image_line)) - np.min(np.nonzero(image_line)), 2) + coords = np.min(np.nonzero(image_line)) + coords + except: + coords = -1 + + return coords + + def processed_image(self, img): + """ + Conver img to HSV. Get the image processed. Get 3 lines from the image. + + :parameters: input image 640x480 + :return: x, y, z: 3 coordinates + """ + + img_proc = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + line_pre_proc = cv2.inRange(img_proc, (0, 30, 30), (0, 255, 200)) + + # gray = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY) + _, mask = cv2.threshold(line_pre_proc, 240, 255, cv2.THRESH_BINARY) + + line_1 = mask[x_row[0], :] + line_2 = mask[x_row[1], :] + line_3 = mask[x_row[2], :] + + central_1 = self.get_center(line_1) + central_2 = self.get_center(line_2) + central_3 = self.get_center(line_3) + + # print(central_1, central_2, central_3) + + return central_1, central_2, central_3 + + def callback(self, data): + + # print("CALLBACK!!!!: ", ros_data.height, ros_data.width) + # np_arr = np.fromstring(ros_data.data, np.uint8) + # image_np = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) + + # self.my_image = image_np + # rospy.loginfo(rospy.get_caller_id() + "I see %s", data.data) + + try: + cv_image = self.bridge.imgmsg_to_cv2(data, "bgr8") + except CvBridgeError as e: + print(e) + + (rows, cols,channels) = cv_image.shape + if cols > 60 and rows > 60: + cv2.circle(cv_image, (50, 50), 10, 255) + + cv2.imshow("Image window", cv_image) + cv2.waitKey(3) + + # try: + # self.image_pub.publish(self.bridge.cv2_to_imgmsg(cv_image, "bgr8")) + # except CvBridgeError as e: + # print(e) + + @staticmethod + def calculate_error(point_1, point_2, point_3): + + error_1 = abs(center_image - point_1) + error_2 = abs(center_image - point_2) + error_3 = abs(center_image - point_3) + + return error_1, error_2, error_3 + + @staticmethod + def calculate_reward(error_1, error_2, error_3): + + global center_image + alpha = 0 + beta = 0 + gamma = 1 + + # if error_1 > RANGES[0] and error_2 > RANGES[1]: + # ALPHA = 0.1 + # BETA = 0.2 + # GAMMA = 0.7 + # elif error_1 > RANGES[0]: + # ALPHA = 0.1 + # BETA = 0 + # GAMMA = 0.9 + # elif error_2 > RANGES[1]: + # ALPHA = 0 + # BETA = 0.1 + # GAMMA = 0.9 + + # d = ALPHA * np.true_divide(error_1, center_image) + \ + # beta * np.true_divide(error_2, center_image) + \ + # gamma * np.true_divide(error_3, 
center_image) + d = np.true_divide(error_3, center_image) + reward = np.round(np.exp(-d), 4) + + return reward + + @staticmethod + def is_game_over(point_1, point_2, point_3): + + done = False + + if center_image-RANGES[2] < point_3 < center_image+RANGES[2]: + if center_image-RANGES[0] < point_1 < center_image+RANGES[0] or \ + center_image-RANGES[1] < point_2 < center_image+RANGES[1]: + pass # In Line + else: + done = True + + return done + + def step(self, action): + + self._gazebo_unpause() + + # === ACTIONS === - 5 actions + vel_cmd = Twist() + vel_cmd.linear.x = action[0] + vel_cmd.angular.z = action[1] + self.vel_pub.publish(vel_cmd) + # print("Action: {} - V_Lineal: {} - W_Angular: {}".format(action, vel_cmd.linear.x, vel_cmd.angular.z)) + + # === IMAGE === + image_data = None + success = False + cv_image = None + f1_image_camera = None + while image_data is None or success is False: + image_data = rospy.wait_for_message('/F1ROS/cameraL/image_raw', Image, timeout=5) + # Transform the image data from ROS to CVMat + cv_image = CvBridge().imgmsg_to_cv2(image_data, "bgr8") + f1_image_camera = self.image_msg_to_image(image_data, cv_image) + + if f1_image_camera: + success = True + + point_1, point_2, point_3 = self.processed_image(f1_image_camera.data) + + # DONE + done = self.is_game_over(point_1, point_2, point_3) + + self._gazebo_pause() + + # action_sum = sum(self.last50actions) + + # == CALCULATE ERROR == + error_1, error_2, error_3 = self.calculate_error(point_1, point_2, point_3) + + # == REWARD == + if not done: + reward = self.calculate_reward(error_1, error_2, error_3) + else: + reward = -200 + + # == TELEMETRY == + if telemetry: + self.show_telemetry(f1_image_camera.data, point_1, point_2, point_3, action, reward) + + cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) + cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) + observation = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) + + # info = [vel_cmd.linear.x, vel_cmd.angular.z, error_1, error_2, error_3] + # OpenAI standard return: observation, reward, done, info + return observation, reward, done, {} + + # test STACK 4 + # cv_image = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) + # self.s_t = np.append(cv_image, self.s_t[:, :3, :, :], axis=1) + # return self.s_t, reward, done, {} # observation, reward, done, info + + def reset(self): + + # === POSE === + pos = random.choice(list(enumerate(positions)))[0] + self.position = pos + self.set_new_pose(pos) + + # === RESET === + # Resets the state of the environment and returns an initial observation. 
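For intuition about the calculate_reward() method defined earlier in this file, a small standalone check of the exponential reward curve (assuming center_image = 320, i.e. half of the 640-pixel image width):

    import numpy as np

    center_image = 320
    for error_3 in (0, 160, 320):
        d = np.true_divide(error_3, center_image)
        print(error_3, np.round(np.exp(-d), 4))   # prints 1.0, 0.6065, 0.3679

A perfectly centered line yields reward 1.0, and the reward decays smoothly toward exp(-1) as the error approaches the image half-width.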
+ time.sleep(0.05) + # self._gazebo_reset() + self._gazebo_unpause() + + image_data = None + cv_image = None + success = False + while image_data is None or success is False: + image_data = rospy.wait_for_message('/F1ROS/cameraL/image_raw', Image, timeout=5) + # Transform the image data from ROS to CVMat + cv_image = CvBridge().imgmsg_to_cv2(image_data, "bgr8") + f1_image_camera = self.image_msg_to_image(image_data, cv_image) + if f1_image_camera: + success = True + + self._gazebo_pause() + + cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) + cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) + print(cv_image.shape) + + state = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) + + return state, pos + + # test STACK 4 + # self.s_t = np.stack((cv_image, cv_image, cv_image, cv_image), axis=0) + # self.s_t = self.s_t.reshape(1, self.s_t.shape[0], self.s_t.shape[1], self.s_t.shape[2]) + # return self.s_t From 8046fb712094a68d274b8c6d1993551fa2f88a5e Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Thu, 8 Apr 2021 20:57:00 +0000 Subject: [PATCH 2/9] added basic support of continuous action spaces with DDPG --- behavior_studio/brains/f1rl/sampleTest.py | 104 -------------- .../brains/f1rl/trainContinuous.py | 88 ++++++++---- behavior_studio/brains/f1rl/utils/ddpg.py | 132 ++++++++++++++++++ .../brains/f1rl/utils/ddpg_utils/__init__.py | 0 .../brains/f1rl/utils/ddpg_utils/actor.py | 76 ++++++++++ .../brains/f1rl/utils/ddpg_utils/critic.py | 112 +++++++++++++++ .../utils/{ => ddpg_utils}/settingsDDPG.py | 4 +- .../brains/f1rl/utils/ddpg_utils/utils.py | 68 +++++++++ behavior_studio/default-rl.yml | 1 + ...{default-testrl.yml => default-rlddpg.yml} | 14 +- behavior_studio/utils/configuration.py | 12 +- .../envs/f1/env_gazebo_f1_ddpg_camera.py | 7 +- 12 files changed, 474 insertions(+), 144 deletions(-) delete mode 100755 behavior_studio/brains/f1rl/sampleTest.py create mode 100644 behavior_studio/brains/f1rl/utils/ddpg_utils/__init__.py create mode 100644 behavior_studio/brains/f1rl/utils/ddpg_utils/actor.py create mode 100644 behavior_studio/brains/f1rl/utils/ddpg_utils/critic.py rename behavior_studio/brains/f1rl/utils/{ => ddpg_utils}/settingsDDPG.py (97%) create mode 100644 behavior_studio/brains/f1rl/utils/ddpg_utils/utils.py rename behavior_studio/{default-testrl.yml => default-rlddpg.yml} (85%) diff --git a/behavior_studio/brains/f1rl/sampleTest.py b/behavior_studio/brains/f1rl/sampleTest.py deleted file mode 100755 index 1aa46b4b..00000000 --- a/behavior_studio/brains/f1rl/sampleTest.py +++ /dev/null @@ -1,104 +0,0 @@ -import time -from datetime import datetime -import pickle -import gym -from brains.f1rl.utils import liveplot -import gym_gazebo -import numpy as np -from gym import logger, wrappers -from brains.f1rl.utils.qlearn import QLearn -import brains.f1rl.utils.settings as settings -from brains.f1rl.utils.settings import actions_set - - -def render(): - render_skip = 0 # Skip first X episodes. - render_interval = 50 # Show render Every Y episodes. - render_episodes = 10 # Show Z episodes every rendering. 
- - if (episode % render_interval == 0) and (episode != 0) and (episode > render_skip): - env.render() - elif ((episode - render_episodes) % render_interval == 0) and (episode != 0) and (episode > render_skip) and \ - (render_episodes < episode): - env.render(close=True) - -def save_model(qlearn): - # Tabular RL: Tabular Q-learning basically stores the policy (Q-values) of the agent into a matrix of shape - # (S x A), where s are all states, a are all the possible actions. After the environment is solved, just save this - # matrix as a csv file. I have a quick implementation of this on my GitHub under Reinforcement Learning. - from datetime import datetime - import pickle - date = datetime.now() - format = date.strftime("%Y%m%d_%H%M%S") - file_name = "_qlearn_model_e_{}_a_{}_g_{}".format(qlearn.epsilon, qlearn.alpha, qlearn.gamma) - if os.path.isdir('brains/f1rl/logs') is False: - os.mkdir('brains/f1rl/logs') - if os.path.isdir('brains/f1rl/logs/qlearn_models/') is False: - os.mkdir('brains/f1rl/logs/qlearn_models/') - file = open("brains/f1rl/logs/qlearn_models/" + format + file_name + '.pkl', 'wb') - pickle.dump(qlearn.q, file) - -# if __name__ == '__main__': -print(settings.title) -print(settings.description) - -env = gym.make('GazeboF1CameraEnvDDPG-v0') - -outdir = './logs/f1_qlearn_gym_experiments/' -stats = {} # epoch: steps - - -env = gym.wrappers.Monitor(env, outdir, force=True) -plotter = liveplot.LivePlot(outdir) -last_time_steps = np.ndarray(0) -actions = 6 -stimate_step_per_lap = 4000 -lap_completed = False - -qlearn = QLearn(actions=actions, alpha=0.2, gamma=0.9, epsilon=0.99) - -highest_reward = 0 -initial_epsilon = qlearn.epsilon - -total_episodes = settings.total_episodes -epsilon_discount = settings.epsilon_discount # Default 0.9986 - -start_time = time.time() - -print(settings.lets_go) - -for episode in range(total_episodes): - done = False - lap_completed = False - cumulated_reward = 0 # Should going forward give more reward then L/R z? - - print("Env Reset Calling!") - - observation = env.reset() - - print("Env Resetted!") - - state = observation - - for step in range(20000): - - # Execute the action and get feedback - observation, reward, done, info = env.step(env.action_space.sample()) - cumulated_reward += reward - - if highest_reward < cumulated_reward: - highest_reward = cumulated_reward - - state = observation - - if not done: - break - - if stimate_step_per_lap > 4000 and not lap_completed: - print("LAP COMPLETED!!") - lap_completed = True - - if episode % 100 == 0 and settings.plotter_graphic: - plotter.plot_steps_vs_epoch(stats) - -env.close() diff --git a/behavior_studio/brains/f1rl/trainContinuous.py b/behavior_studio/brains/f1rl/trainContinuous.py index 858b43e4..ab11fe13 100755 --- a/behavior_studio/brains/f1rl/trainContinuous.py +++ b/behavior_studio/brains/f1rl/trainContinuous.py @@ -7,8 +7,8 @@ import numpy as np from gym import logger, wrappers from brains.f1rl.utils.ddpg import DDPGAgent -import brains.f1rl.utils.settingsDDPG as settings - +import brains.f1rl.utils.ddpg_utils.settingsDDPG as settings +import tensorflow as tf def render(): render_skip = 0 # Skip first X episodes. 
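The training-loop hunk further below rescales the agent's tanh-bounded output from [-1, 1] into the velocity limits max_action = [12., 1.5] and min_action = [2., -1.5]; an equivalent standalone sketch of that affine mapping:

    import numpy as np

    min_action = np.array([2.0, -1.5])    # [min linear vel, min angular vel]
    max_action = np.array([12.0, 1.5])    # [max linear vel, max angular vel]

    def rescale(action):
        # map an action in [-1, 1] to [min_action, max_action]: -1 -> min, +1 -> max
        return min_action + 0.5 * (max_action - min_action) * (action + 1.0)

    print(rescale(np.array([-1.0, 1.0])))  # -> [ 2.   1.5]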
@@ -43,40 +43,45 @@ def save_model(agent): print(settings.title) print(settings.description) -current_env = settings.current_env -if current_env == "laser": - env = gym.make('GazeboF1QlearnLaserEnv-v0') -elif current_env == "camera": - env = gym.make('GazeboF1QlearnCameraEnvContinuous-v0') -else: - print("NO correct env selected") - +env = gym.make('GazeboF1CameraEnvDDPG-v0') outdir = './logs/f1_ddpg_gym_experiments/' stats = {} # epoch: steps - env = gym.wrappers.Monitor(env, outdir, force=True) plotter = liveplot.LivePlot(outdir) last_time_steps = np.ndarray(0) stimate_step_per_lap = 4000 lap_completed = False -agent = DDPGAgent(env, settings.hidden_size, settings.gamma, settings.tau) - -if settings.load_model: - exit(1) - - qlearn_file = open('logs/qlearn_models/20200826_154342_qlearn_model_e_0.988614_a_0.2_g_0.9.pkl', 'rb') - model = pickle.load(qlearn_file) - print("Number of (action, state): {}".format(len(model))) - qlearn.q = model - qlearn.alpha = settings.alpha - qlearn.gamma = settings.gamma - qlearn.epsilon = settings.epsilon - highest_reward = 4000 -else: - highest_reward = 0 +sess = tf.Session() + +agent = DDPGAgent(sess=sess, + state_dim=(32,32), + action_dim=2, + training_batch=settings.batch_size, + epsilon_decay=settings.epsdecay, + gamma=settings.gamma, + tau=settings.tau) + +sess.run(tf.global_variables_initializer()) + +agent._hard_update() + +#if settings.load_model: +# exit(1) +# +# qlearn_file = open('logs/qlearn_models/20200826_154342_qlearn_model_e_0.988614_a_0.2_g_0.9.pkl', 'rb') +# model = pickle.load(qlearn_file) +# print("Number of (action, state): {}".format(len(model))) +# qlearn.q = model +# qlearn.alpha = settings.alpha +# qlearn.gamma = settings.gamma +# qlearn.epsilon = settings.epsilon +# highest_reward = 4000 +#else: + +highest_reward = 0 total_episodes = settings.total_episodes @@ -84,17 +89,46 @@ def save_model(agent): print(settings.lets_go) +max_action = [12., 1.5] +min_action = [2., -1.5] + for episode in range(total_episodes): done = False lap_completed = False cumulated_reward = 0 # Should going forward give more reward then L/R z? 
observation = env.reset() - state = observation + state, _ = observation for step in range(20000): + action = agent.predict(state) + + mod_action = action + for itr in range(len(action)): + mod_action[itr] = min_action[itr] + 0.5*(max_action[itr] - min_action[itr])*(action[itr]+1) + + # Execute the action and get feedback + nextState, reward, done, info = env.step(mod_action) + cumulated_reward += reward + + agent.memorize(state, action, reward, done, nextState) + + agent.train() + + if highest_reward < cumulated_reward: + highest_reward = cumulated_reward + + env._flush(force=True) + + if not done: + state = nextState + else: + last_time_steps = np.append(last_time_steps, [int(step + 1)]) + stats[episode] = step + break + if stimate_step_per_lap > 4000 and not lap_completed: print("LAP COMPLETED!!") diff --git a/behavior_studio/brains/f1rl/utils/ddpg.py b/behavior_studio/brains/f1rl/utils/ddpg.py index e69de29b..cad5ab58 100644 --- a/behavior_studio/brains/f1rl/utils/ddpg.py +++ b/behavior_studio/brains/f1rl/utils/ddpg.py @@ -0,0 +1,132 @@ +import numpy as np +import tensorflow as tf +from brains.f1rl.utils.ddpg_utils.actor import Actor +from brains.f1rl.utils.ddpg_utils.critic import Critic +from brains.f1rl.utils.ddpg_utils.utils import OrnsteinUhlenbeckActionNoise, ReplayBuffer +from copy import deepcopy + +class DDPGAgent: + + def __init__(self, + sess = None, + state_dim = None, + action_dim = None, + training_batch = None, + epsilon_decay = None, + gamma = 0.99, + tau = 0.001): + + self._sess = sess + self._state_dim = state_dim + self._action_dim = action_dim + + self._actor = Actor(sess=self._sess, + state_dim=self._state_dim, + action_dim=self._action_dim, + hidden_sizes=[32, 16, 16], + output_activation=tf.nn.tanh, + scope = 'actor') + + self._critic = Critic(sess=sess, + state_dim=self._state_dim, + action_dim=self._action_dim, + latent_dim=2*self._action_dim, + hidden_sizes=[32, 16, 32, 32], + output_activation=tf.nn.relu, + scope = 'critic') + + self._target_actor = Actor(sess=self._sess, + state_dim=self._state_dim, + action_dim=self._action_dim, + hidden_sizes=[32, 16, 16], + output_activation=tf.nn.tanh, + scope = 'target_actor') + + self._target_critic = Critic(sess=sess, + state_dim=self._state_dim, + action_dim=self._action_dim, + latent_dim=2*self._action_dim, + hidden_sizes=[32, 16, 32, 32], + output_activation=tf.nn.relu, + scope = 'target_critic') + + self._actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self._action_dim)) + + self._memory_limit = 20000 + self._memory_batch = training_batch + self._memory = ReplayBuffer(buffer_size=self._memory_limit) + + self._epsilon = 1.0 + self._epsilon_decay = epsilon_decay + + self._discount_factor = gamma + self._tau = tau + + self._hard_update_actor = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=1.0) + self._hard_update_critic = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=1.0) + + self._soft_update_actor = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=self._tau) + self._soft_update_critic = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=self._tau) + + def memorize(self, state, action, reward, terminal, next_state): + + self._memory.add(state, action, reward, terminal, next_state) + + def bellman(self, rewards, q_values, terminal, gamma): + + critic_target = np.random.rand(1) + + if terminal: + critic_target[0] = rewards + else: 
+ critic_target[0] = rewards + gamma * q_values + return critic_target + + def predict(self, state): + + action = self._actor.predict(state=state)[0] + + action += self._actor_noise() + + action = np.clip(action, -1.0, 1.0) + + return action + + def train(self): + + if self._memory.size() > self._memory_batch: + + s_batch, a_batch, r_batch, t_batch, next_s_batch = self._memory.sample_batch(self._memory_batch) + + for i in range(self._memory_batch): + + q_value = self._target_critic.predict(state=next_s_batch[i], + action=self._target_actor.predict(state=next_s_batch[i])) + + critic_target = self.bellman(r_batch[i], q_value, t_batch[i], self._discount_factor) + + self._critic.update(state=s_batch[i], action=a_batch[i], critic_target=critic_target) + + actions = self._actor.predict(state=s_batch[i]) + grads = self._critic.gradients(state=s_batch[i], action=actions) + + self._actor.update(state=s_batch[i], action_gradients=grads) + + self._sess.run(self._soft_update_actor) + self._sess.run(self._soft_update_critic) + + else: + pass + + def update_target_network(self, network_params, target_network_params, tau): + + op_holder = [] + for from_var,to_var in zip(network_params, target_network_params): + op_holder.append(to_var.assign((tf.multiply(from_var, tau) + tf.multiply(to_var, 1. - tau)))) + + return op_holder + + def _hard_update(self): + + self._sess.run(self._hard_update_actor) + self._sess.run(self._hard_update_critic) diff --git a/behavior_studio/brains/f1rl/utils/ddpg_utils/__init__.py b/behavior_studio/brains/f1rl/utils/ddpg_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/behavior_studio/brains/f1rl/utils/ddpg_utils/actor.py b/behavior_studio/brains/f1rl/utils/ddpg_utils/actor.py new file mode 100644 index 00000000..96032190 --- /dev/null +++ b/behavior_studio/brains/f1rl/utils/ddpg_utils/actor.py @@ -0,0 +1,76 @@ +import numpy as np +import tensorflow as tf + + +class Actor: + + def __init__(self, + sess=None, + state_dim = None, + action_dim = None, + hidden_sizes = None, + learning_rate = 0.0003, + hidden_activation = tf.nn.relu, + output_activation = tf.nn.tanh, + w_init=tf.contrib.layers.xavier_initializer(), + b_init=tf.zeros_initializer(), + scope = 'actor' + ): + + self._sess = sess + self._state = tf.placeholder(dtype=tf.float32, shape=state_dim, name='state') + self._action_grads = tf.placeholder(dtype=tf.float32, shape=action_dim, name='action_grads') + + ############################# Actor Layer ########################### + + with tf.variable_scope(scope): + + layer = tf.layers.conv1d(inputs=tf.expand_dims(self._state, 0), + filters=hidden_sizes[0], + kernel_size=4, + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_in') + + for i in range(len(hidden_sizes)-1): + + layer = tf.layers.conv1d(inputs=layer, + filters=hidden_sizes[i+1], + kernel_size=3, + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_'+str(i+1)) + + layer = tf.layers.flatten(inputs=layer, + name='layer_flatten') + + layer = tf.layers.dense(inputs=layer, + units=action_dim, + activation=output_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_out') + + self.layer_out = tf.layers.flatten(layer) + + self.predicted_action = tf.squeeze(self.layer_out) + + self.network_params = tf.trainable_variables() + + self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + + params_grad = tf.gradients(self.predicted_action, 
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), -self._action_grads) + grads = zip(params_grad, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)) + + self._train_op = self._optimizer.apply_gradients(grads) + + def predict(self, state): + return self._sess.run(self.predicted_action, {self._state: state}) + + def update(self, state, action_gradients): + + self._sess.run(self._train_op, + {self._state: state, + self._action_grads: action_gradients}) diff --git a/behavior_studio/brains/f1rl/utils/ddpg_utils/critic.py b/behavior_studio/brains/f1rl/utils/ddpg_utils/critic.py new file mode 100644 index 00000000..549f1c25 --- /dev/null +++ b/behavior_studio/brains/f1rl/utils/ddpg_utils/critic.py @@ -0,0 +1,112 @@ +import numpy as np +import tensorflow as tf + + +class Critic: + + def __init__(self, + sess=None, + state_dim = None, + action_dim = None, + latent_dim = None, + hidden_sizes = None, + learning_rate = 0.003, + hidden_activation = tf.nn.relu, + output_activation = tf.nn.relu, + w_init=tf.contrib.layers.xavier_initializer(), + b_init=tf.zeros_initializer(), + scope = 'critic' + ): + + self._sess = sess + + ############################# Critic Layer ########################### + + with tf.variable_scope(scope): + + self._state = tf.placeholder(dtype=tf.float32, shape=state_dim, name='state') + self._action = tf.placeholder(dtype=tf.float32, shape=action_dim, name='action') + self._return_target = tf.placeholder(dtype=tf.float32, shape=1, name='target') + + with tf.GradientTape() as tape: + + tape.watch([self._state, self._action]) + + layer = tf.layers.conv1d(inputs=tf.expand_dims(self._state, 0), + filters=hidden_sizes[0], + kernel_size=4, + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_in') + + for i in range(len(hidden_sizes)-3): + + layer = tf.layers.conv1d(inputs=layer, + filters=hidden_sizes[i+1], + kernel_size=3, + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_'+str(i+1)) + + layer = tf.layers.flatten(inputs=layer, + name='layer_flatten') + + layer = tf.layers.dense(inputs=layer, + units=latent_dim, + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='layer_out_latent') + + layer = tf.layers.flatten(layer) + + action_layer = tf.concat([layer, tf.reshape(self._action, (1,action_dim))], 1) + + action_layer = tf.layers.dense(inputs=action_layer, + units=hidden_sizes[i+2], + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='action_layer_'+str(i+2)) + + action_layer = tf.layers.dense(inputs=action_layer, + units=hidden_sizes[i+3], + activation=hidden_activation, + kernel_initializer=w_init, + bias_initializer=b_init, + name='action_layer_'+str(i+3)) + + action_layer = tf.layers.dense(inputs=action_layer, + units=1, + activation=None, + kernel_initializer=w_init, + bias_initializer=b_init, + name='action_layer_out') + + self.layer_out = tf.layers.flatten(action_layer) + + self.predicted_qvalue = tf.squeeze(self.layer_out) + + self._loss = tf.reduce_mean(tf.square(self.predicted_qvalue - self._return_target)) + + self._action_gradients = tape.gradient(self.predicted_qvalue, self._action) + + self.network_params = tf.trainable_variables() + + self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + self._train_op = self._optimizer.minimize(self._loss) + + def gradients(self, state, action): + return self._sess.run(self._action_gradients, {self._state: state, self._action: 
action}) + + def predict(self, state, action): + return self._sess.run(self.predicted_qvalue, {self._state: state, self._action: action}) + + def update(self, state, action, critic_target): + + self._sess.run(self._train_op, + {self._state: state, + self._action: action, + self._return_target: critic_target}) diff --git a/behavior_studio/brains/f1rl/utils/settingsDDPG.py b/behavior_studio/brains/f1rl/utils/ddpg_utils/settingsDDPG.py similarity index 97% rename from behavior_studio/brains/f1rl/utils/settingsDDPG.py rename to behavior_studio/brains/f1rl/utils/ddpg_utils/settingsDDPG.py index faaf5ef1..2e426dc4 100644 --- a/behavior_studio/brains/f1rl/utils/settingsDDPG.py +++ b/behavior_studio/brains/f1rl/utils/ddpg_utils/settingsDDPG.py @@ -10,12 +10,14 @@ telemetry_mask = False plotter_graphic = True my_board = True -save_model = True +save_model = False load_model = False tau = app_configuration.tau gamma = app_configuration.gamma +epsdecay = app_configuration.epsilon_decay hidden_size = app_configuration.hidden_size +batch_size = 64 total_episodes = app_configuration.total_episodes gazebo_positions_set = app_configuration.gazebo_positions_set diff --git a/behavior_studio/brains/f1rl/utils/ddpg_utils/utils.py b/behavior_studio/brains/f1rl/utils/ddpg_utils/utils.py new file mode 100644 index 00000000..c40ee4c4 --- /dev/null +++ b/behavior_studio/brains/f1rl/utils/ddpg_utils/utils.py @@ -0,0 +1,68 @@ +import sys +import argparse +import numpy as np +import gym +import pandas as pd +import math +from collections import deque +import random + +class OrnsteinUhlenbeckActionNoise: + def __init__(self, mu, sigma=0.25, theta=.05, dt=1e-2, x0=None): + self.theta = theta + self.mu = mu + self.sigma = sigma + self.dt = dt + self.x0 = x0 + self.reset() + + def __call__(self): + x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) + self.x_prev = x + return x + + def reset(self): + self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) + + def __repr__(self): + return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) + +class ReplayBuffer(object): + + def __init__(self, buffer_size, random_seed=123): + self.buffer_size = buffer_size + self.count = 0 + self.buffer = deque() + random.seed(random_seed) + + def add(self, state, action, reward, terminal, next_state): + experience = (state, action, reward, terminal, next_state) + if self.count < self.buffer_size: + self.buffer.append(experience) + self.count += 1 + else: + self.buffer.popleft() + self.buffer.append(experience) + + def size(self): + return self.count + + def sample_batch(self, batch_size): + batch = [] + + if self.count < batch_size: + batch = random.sample(self.buffer, self.count) + else: + batch = random.sample(self.buffer, batch_size) + + state_batch = np.array([_[0] for _ in batch]) + action_batch = np.array([_[1] for _ in batch]) + reward_batch = np.array([_[2] for _ in batch]) + terminal_batch = np.array([_[3] for _ in batch]) + next_state_batch = np.array([_[4] for _ in batch]) + + return state_batch, action_batch, reward_batch, terminal_batch, next_state_batch + + def clear(self): + self.buffer.clear() + self.count = 0 diff --git a/behavior_studio/default-rl.yml b/behavior_studio/default-rl.yml index 02f813ab..38080f5f 100644 --- a/behavior_studio/default-rl.yml +++ b/behavior_studio/default-rl.yml @@ -19,6 +19,7 @@ Behaviors: RL: True BrainPath: 'brains/f1rl/train.py' Type: 'f1rl' + Action: 'discrete' 
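As an aside on the exploration noise introduced in ddpg_utils/utils.py above: DDPGAgent.predict() perturbs the actor output with an Ornstein-Uhlenbeck sample and clips the result back to [-1, 1]; a minimal standalone usage sketch (import path as used in ddpg.py):

    import numpy as np
    from brains.f1rl.utils.ddpg_utils.utils import OrnsteinUhlenbeckActionNoise

    noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(2))       # one noise component per action dimension
    raw_action = np.array([0.2, -0.5])                         # e.g. a tanh-bounded actor output
    noisy_action = np.clip(raw_action + noise(), -1.0, 1.0)    # same perturb-and-clip as DDPGAgent.predict()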
Parameters: action_set: 'hard' gazebo_positions_set: 'pista_simple' diff --git a/behavior_studio/default-testrl.yml b/behavior_studio/default-rlddpg.yml similarity index 85% rename from behavior_studio/default-testrl.yml rename to behavior_studio/default-rlddpg.yml index 6a49ebb9..14fe2cdc 100644 --- a/behavior_studio/default-testrl.yml +++ b/behavior_studio/default-rlddpg.yml @@ -14,19 +14,19 @@ Behaviors: Motors_0: Name: 'motors_0' Topic: '/F1ROS/cmd_vel' - MaxV: 12 - MaxW: 1.5 + MaxV: 3 + MaxW: 0.3 RL: True - BrainPath: 'brains/f1rl/sampleTest.py' + BrainPath: 'brains/f1rl/trainContinuous.py' Type: 'f1rl' + Action: 'continuous' Parameters: - action_set: 'hard' gazebo_positions_set: 'pista_simple' - alpha: 0.2 + tau: 0.2 gamma: 0.9 - epsilon: 0.99 + epsilon_decay: 0.98 + hidden_size: 0.99 total_episodes: 20000 - epsilon_discount: 0.9986 env: 'camera' Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch diff --git a/behavior_studio/utils/configuration.py b/behavior_studio/utils/configuration.py index 1ccb4244..5d06c5fa 100644 --- a/behavior_studio/utils/configuration.py +++ b/behavior_studio/utils/configuration.py @@ -93,6 +93,7 @@ def initialize_configuration(self, config_data): robot = config_data['Behaviors']['Robot'] self.brain_path = robot['BrainPath'] self.robot_type = robot['Type'] + self.action_type = robot['Action'] self.current_world = config_data['Behaviors']['Simulation']['World'] self.actuators = robot['Actuators'] @@ -115,7 +116,7 @@ def initialize_configuration(self, config_data): self.experiment_timeouts = config_data['Behaviors']['Experiment']['Timeout'] self.experiment_repetitions = config_data['Behaviors']['Experiment']['Repetitions'] - if self.robot_type == 'f1rl': + if self.robot_type == 'f1rl' and self.action_type == 'discrete': self.action_set = robot['Parameters']['action_set'] self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] self.alpha = robot['Parameters']['alpha'] @@ -125,6 +126,15 @@ def initialize_configuration(self, config_data): self.epsilon_discount = robot['Parameters']['epsilon_discount'] self.env = robot['Parameters']['env'] + if self.robot_type == 'f1rl' and self.action_type == 'continuous': + self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] + self.tau = robot['Parameters']['tau'] + self.gamma = robot['Parameters']['gamma'] + self.epsilon_decay = robot['Parameters']['epsilon_decay'] + self.hidden_size = robot['Parameters']['hidden_size'] + self.total_episodes = robot['Parameters']['total_episodes'] + self.env = robot['Parameters']['env'] + def create_layout_from_cfg(self, cfg): """Creates the configuration for the layout of the sensors view panels specified from configuration file diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py index 30bcd1e6..aed28b21 100755 --- a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py @@ -160,7 +160,7 @@ def _generate_continuous_action_space(): min_lin_speed = 2 max_lin_speed = 12 - action_space = spaces.Box([min_lin_speed, min_ang_speed], [max_lin_speed, max_ang_speed]) + action_space = spaces.Box(np.array([min_lin_speed, min_ang_speed]),np.array([max_lin_speed, max_ang_speed])) return action_space @@ -391,7 +391,7 @@ def step(self, action): cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) - observation = 
cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) + observation = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) # info = [vel_cmd.linear.x, vel_cmd.angular.z, error_1, error_2, error_3] # OpenAI standard return: observation, reward, done, info @@ -430,9 +430,8 @@ def reset(self): cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) - print(cv_image.shape) - state = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) + state = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) return state, pos From 9335eb13aee3afc2edcd0f2b2c9030cf33f5a435 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Mon, 19 Apr 2021 11:53:06 +0000 Subject: [PATCH 3/9] added algorithm support for continuous control, making it modular w.r.t. RL algorithms --- .../brains/f1rl/trainContinuous.py | 27 +++---------------- .../envs/f1/env_gazebo_f1_ddpg_camera.py | 22 +++++++++------ requirements.txt | 2 +- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/behavior_studio/brains/f1rl/trainContinuous.py b/behavior_studio/brains/f1rl/trainContinuous.py index ab11fe13..717ee5fb 100755 --- a/behavior_studio/brains/f1rl/trainContinuous.py +++ b/behavior_studio/brains/f1rl/trainContinuous.py @@ -6,9 +6,8 @@ import gym_gazebo import numpy as np from gym import logger, wrappers -from brains.f1rl.utils.ddpg import DDPGAgent +# from brains.f1rl.utils.ddpg import DDPGAgent import brains.f1rl.utils.ddpg_utils.settingsDDPG as settings -import tensorflow as tf def render(): render_skip = 0 # Skip first X episodes. @@ -54,20 +53,6 @@ def save_model(agent): stimate_step_per_lap = 4000 lap_completed = False -sess = tf.Session() - -agent = DDPGAgent(sess=sess, - state_dim=(32,32), - action_dim=2, - training_batch=settings.batch_size, - epsilon_decay=settings.epsdecay, - gamma=settings.gamma, - tau=settings.tau) - -sess.run(tf.global_variables_initializer()) - -agent._hard_update() - #if settings.load_model: # exit(1) # @@ -96,13 +81,11 @@ def save_model(agent): done = False lap_completed = False cumulated_reward = 0 # Should going forward give more reward then L/R z? 
- observation = env.reset() - - state, _ = observation + state = env.reset() for step in range(20000): - action = agent.predict(state) + action = 2*np.random.rand(2)-1 mod_action = action @@ -113,10 +96,6 @@ def save_model(agent): nextState, reward, done, info = env.step(mod_action) cumulated_reward += reward - agent.memorize(state, action, reward, done, nextState) - - agent.train() - if highest_reward < cumulated_reward: highest_reward = cumulated_reward diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py index aed28b21..717ee026 100755 --- a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py @@ -168,7 +168,7 @@ def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] - def show_telemetry(self, img, point_1, point_2, point_3, action, reward): + def show_telemetry(self, img, vel_cmd, point_1, point_2, point_3, action, reward): # Puntos centrales de la imagen (verde) cv2.line(img, (320, x_row[0]), (320, x_row[0]), (255, 255, 0), thickness=5) cv2.line(img, (320, x_row[1]), (320, x_row[1]), (255, 255, 0), thickness=5) @@ -178,8 +178,8 @@ def show_telemetry(self, img, point_1, point_2, point_3, action, reward): cv2.line(img, (center_image, x_row[1]), (int(point_2), x_row[1]), (255, 255, 255), thickness=2) cv2.line(img, (center_image, x_row[2]), (int(point_3), x_row[2]), (255, 255, 255), thickness=2) # Telemetry - cv2.putText(img, str("action1: {} action2: {}".format(action[0], action[1])), (18, 280), font, 0.4, (255, 255, 255), 1) - cv2.putText(img, str("w ang: {}".format(w_angular)), (18, 300), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("action1: {}, action2: {}".format(action[0], action[1])), (18, 280), font, 0.4, (255, 255, 255), 1) + cv2.putText(img, str("l_vel: {}, w_ang: {}".format(vel_cmd.linear.x, vel_cmd.angular.z)), (18, 300), font, 0.4, (255, 255, 255), 1) cv2.putText(img, str("reward: {}".format(reward)), (18, 320), font, 0.4, (255, 255, 255), 1) cv2.putText(img, str("err1: {}".format(center_image - point_1)), (18, 340), font, 0.4, (255, 255, 255), 1) cv2.putText(img, str("err2: {}".format(center_image - point_2)), (18, 360), font, 0.4, (255, 255, 255), 1) @@ -387,15 +387,19 @@ def step(self, action): # == TELEMETRY == if telemetry: - self.show_telemetry(f1_image_camera.data, point_1, point_2, point_3, action, reward) + self.show_telemetry(f1_image_camera.data, vel_cmd, point_1, point_2, point_3, action, reward) cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) observation = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) - # info = [vel_cmd.linear.x, vel_cmd.angular.z, error_1, error_2, error_3] + vehicle_state = [vel_cmd.linear.x, vel_cmd.angular.z] + info = {'error_1': error_1, 'error_2': error_2, 'error_3': error_3} + + stacked_obs = {'image':observation, 'state':vehicle_state} + # OpenAI standard return: observation, reward, done, info - return observation, reward, done, {} + return stacked_obs, reward, done, {} # test STACK 4 # cv_image = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) @@ -432,8 +436,10 @@ def reset(self): cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) state = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) - - return state, pos + + stacked_obs = {'image': state, 'state':[0., 0.]} + + return stacked_obs # test STACK 4 # self.s_t = np.stack((cv_image, 
cv_image, cv_image, cv_image), axis=0) diff --git a/requirements.txt b/requirements.txt index ea4ad79f..8c2d5eb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ pathlib==1.0.1 jderobot-jderobottypes==1.0.0 catkin_pkg==0.4.23 numpy==1.20.1 -matplotlib==3.3.0 +matplotlib==3.2.1 kiwisolver==1.2 defusedxml==0.6.0 netifaces==0.10.9 From db54b3633a8a2faee2d168942784c3025401eda4 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Sat, 24 Apr 2021 00:08:17 +0000 Subject: [PATCH 4/9] adding default-rl changes --- behavior_studio/brains/f1rl/utils/memory.py | 2 +- behavior_studio/default-rl.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/behavior_studio/brains/f1rl/utils/memory.py b/behavior_studio/brains/f1rl/utils/memory.py index 3bf5b306..ab4f11f2 100644 --- a/behavior_studio/brains/f1rl/utils/memory.py +++ b/behavior_studio/brains/f1rl/utils/memory.py @@ -46,4 +46,4 @@ def addMemory(self, state, action, reward, newState, isFinal) : self.newStates.append(newState) self.finals.append(isFinal) - self.currentPosition += 1 \ No newline at end of file + self.currentPosition += 1 diff --git a/behavior_studio/default-rl.yml b/behavior_studio/default-rl.yml index 38080f5f..42d1d306 100644 --- a/behavior_studio/default-rl.yml +++ b/behavior_studio/default-rl.yml @@ -17,7 +17,7 @@ Behaviors: MaxV: 3 MaxW: 0.3 RL: True - BrainPath: 'brains/f1rl/train.py' + BrainPath: 'brains/f1rl/f1_follow_line_camera_dqn.py' Type: 'f1rl' Action: 'discrete' Parameters: From 9c4ca0bc1e6124019601a1926d65a77587a196b8 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Wed, 28 Apr 2021 22:05:25 +0000 Subject: [PATCH 5/9] added continuous environment with Laser, fixed minor Qlearn Laser --- .../brains/f1rl/trainContinuous.py | 6 +- behavior_metrics/default-rl.yml | 4 +- behavior_metrics/default.yml | 1 + gym-gazebo/gym_gazebo/__init__.py | 9 +- gym-gazebo/gym_gazebo/envs/f1/__init__.py | 1 + .../envs/f1/env_gazebo_f1_ddpg_laser.py | 222 ++++++++++++++++++ .../envs/f1/env_gazebo_f1_qlearn_laser.py | 2 +- 7 files changed, 238 insertions(+), 7 deletions(-) create mode 100644 gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_laser.py diff --git a/behavior_metrics/brains/f1rl/trainContinuous.py b/behavior_metrics/brains/f1rl/trainContinuous.py index 717ee5fb..3132baea 100755 --- a/behavior_metrics/brains/f1rl/trainContinuous.py +++ b/behavior_metrics/brains/f1rl/trainContinuous.py @@ -42,7 +42,7 @@ def save_model(agent): print(settings.title) print(settings.description) -env = gym.make('GazeboF1CameraEnvDDPG-v0') +env = gym.make('GazeboF1LaserEnvDDPG-v0') outdir = './logs/f1_ddpg_gym_experiments/' stats = {} # epoch: steps @@ -74,8 +74,8 @@ def save_model(agent): print(settings.lets_go) -max_action = [12., 1.5] -min_action = [2., -1.5] +max_action = [12., 2] +min_action = [2., -2] for episode in range(total_episodes): done = False diff --git a/behavior_metrics/default-rl.yml b/behavior_metrics/default-rl.yml index 42d1d306..e8549d4e 100644 --- a/behavior_metrics/default-rl.yml +++ b/behavior_metrics/default-rl.yml @@ -17,7 +17,7 @@ Behaviors: MaxV: 3 MaxW: 0.3 RL: True - BrainPath: 'brains/f1rl/f1_follow_line_camera_dqn.py' + BrainPath: 'brains/f1rl/train.py' Type: 'f1rl' Action: 'discrete' Parameters: @@ -28,7 +28,7 @@ Behaviors: epsilon: 0.99 total_episodes: 20000 epsilon_discount: 0.9986 - env: 'camera' + env: 'laser' Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch Dataset: diff --git a/behavior_metrics/default.yml b/behavior_metrics/default.yml 
index f09e236d..794d732d 100644 --- a/behavior_metrics/default.yml +++ b/behavior_metrics/default.yml @@ -19,6 +19,7 @@ Behaviors: BrainPath: 'brains/f1/brain_f1_explicit.py' Type: 'f1' + Action: None Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch Dataset: diff --git a/gym-gazebo/gym_gazebo/__init__.py b/gym-gazebo/gym_gazebo/__init__.py index 9c2db093..4ce5dd25 100755 --- a/gym-gazebo/gym_gazebo/__init__.py +++ b/gym-gazebo/gym_gazebo/__init__.py @@ -26,13 +26,20 @@ # More arguments here ) -# F1 DDPG +# F1 DDPG CAMERA register( id='GazeboF1CameraEnvDDPG-v0', entry_point='gym_gazebo.envs.f1:GazeboF1CameraEnvDDPG', # More arguments here ) +# F1 DDPG LASER +register( + id='GazeboF1LaserEnvDDPG-v0', + entry_point='gym_gazebo.envs.f1:GazeboF1LaserEnvDDPG', + # More arguments here +) + # Turtlebot envs register( id='GazeboMazeTurtlebotLidar-v0', diff --git a/gym-gazebo/gym_gazebo/envs/f1/__init__.py b/gym-gazebo/gym_gazebo/envs/f1/__init__.py index fb630775..9fa3d036 100755 --- a/gym-gazebo/gym_gazebo/envs/f1/__init__.py +++ b/gym-gazebo/gym_gazebo/envs/f1/__init__.py @@ -6,3 +6,4 @@ # DDPG from gym_gazebo.envs.f1.env_gazebo_f1_ddpg_camera import GazeboF1CameraEnvDDPG +from gym_gazebo.envs.f1.env_gazebo_f1_ddpg_laser import GazeboF1LaserEnvDDPG diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_laser.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_laser.py new file mode 100644 index 00000000..1bd12069 --- /dev/null +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_laser.py @@ -0,0 +1,222 @@ +import rospy +import time +import random +import numpy as np + +from gym import spaces +from gym.utils import seeding + +from gym_gazebo.envs import gazebo_env +from gazebo_msgs.msg import ModelState +from gazebo_msgs.srv import SetModelState + +from geometry_msgs.msg import Twist +from std_srvs.srv import Empty +from sensor_msgs.msg import LaserScan + +from agents.f1.settings import gazebo_positions + + +class GazeboF1LaserEnvDDPG(gazebo_env.GazeboEnv): + + def __init__(self): + # Launch the simulation with the given launchfile name + gazebo_env.GazeboEnv.__init__(self, "F1Lasercircuit_v0.launch") + # gazebo_env.GazeboEnv.__init__(self, "f1_1_nurburgrinlineROS_laser.launch") + self.vel_pub = rospy.Publisher('/F1ROS/cmd_vel', Twist, queue_size=5) + self.unpause = rospy.ServiceProxy('/gazebo/unpause_physics', Empty) + self.pause = rospy.ServiceProxy('/gazebo/pause_physics', Empty) + self.reset_proxy = rospy.ServiceProxy('/gazebo/reset_simulation', Empty) + self.action_space = self._generate_continuous_action_space() # spaces.Discrete(5) # F, L, R + self.reward_range = (-np.inf, np.inf) + self.position = None + self._seed() + + def render(self, mode='human'): + pass + + def _gazebo_pause(self): + rospy.wait_for_service('/gazebo/pause_physics') + try: + # resp_pause = pause.call() + self.pause() + except rospy.ServiceException as e: + print("/gazebo/pause_physics service call failed: {}".format(e)) + + def _gazebo_unpause(self): + rospy.wait_for_service('/gazebo/unpause_physics') + try: + self.unpause() + except rospy.ServiceException as e: + print(e) + print("/gazebo/unpause_physics service call failed") + + def _gazebo_reset(self): + # Resets the state of the environment and returns an initial observation. 
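+        # Calls the /gazebo/reset_simulation service and then unpauses physics so
+        # that the next laser reading comes from the restarted world.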
+ rospy.wait_for_service('/gazebo/reset_simulation') + try: + # reset_proxy.call() + self.reset_proxy() + self.unpause() + except rospy.ServiceException as e: + print("/gazebo/reset_simulation service call failed: {}".format(e)) + + def set_new_pose(self): + """ + (pos_number, pose_x, pose_y, pose_z, or_x, or_y, or_z, or_z) + """ + pos = random.choice(list(enumerate(gazebo_positions)))[0] + self.position = pos + + pos_number = gazebo_positions[0] + + state = ModelState() + state.model_name = "f1_renault" + state.pose.position.x = gazebo_positions[pos][1] + state.pose.position.y = gazebo_positions[pos][2] + state.pose.position.z = gazebo_positions[pos][3] + state.pose.orientation.x = gazebo_positions[pos][4] + state.pose.orientation.y = gazebo_positions[pos][5] + state.pose.orientation.z = gazebo_positions[pos][6] + state.pose.orientation.w = gazebo_positions[pos][7] + + rospy.wait_for_service('/gazebo/set_model_state') + try: + set_state = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) + set_state(state) + except rospy.ServiceException as e: + print("Service call failed: {}".format(e)) + return pos_number + + @staticmethod + def _generate_continuous_action_space(): + + min_ang_speed = -2.0 + max_ang_speed = 2.0 + min_lin_speed = 2 + max_lin_speed = 12 + + action_space = spaces.Box(np.array([min_lin_speed, min_ang_speed]),np.array([max_lin_speed, max_ang_speed])) + + return action_space + + @staticmethod + def discrete_observation(data, new_ranges): + + discrete_ranges = [] + min_range = 0.05 + done = False + mod = len(data.ranges) / new_ranges + filter_data = data.ranges[10:-10] + for i, item in enumerate(filter_data): + if i % mod == 0: + if filter_data[i] == float('Inf') or np.isinf(filter_data[i]): + discrete_ranges.append(6) + elif np.isnan(filter_data[i]): + discrete_ranges.append(0) + else: + discrete_ranges.append(int(filter_data[i])) + if min_range > filter_data[i] > 0: + # print("Data ranges: {}".format(data.ranges[i])) + done = True + break + + return discrete_ranges, done + + @staticmethod + def calculate_observation(data): + min_range = 0.5 # Default: 0.21 + done = False + for i, item in enumerate(data.ranges): + if min_range > data.ranges[i] > 0: + done = True + return done + + @staticmethod + def get_center_of_laser(data): + + laser_len = len(data.ranges) + left_sum = sum(data.ranges[laser_len - (laser_len / 5):laser_len - (laser_len / 10)]) # 80-90 + right_sum = sum(data.ranges[(laser_len / 10):(laser_len / 5)]) # 10-20 + + center_detour = (right_sum - left_sum) / 5 + + return center_detour + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, actions): + + self._gazebo_unpause() + + vel_cmd = Twist() + vel_cmd.linear.x = actions[0] + vel_cmd.angular.z = actions[1] + self.vel_pub.publish(vel_cmd) + + laser_data = None + success = False + while laser_data is None or not success: + try: + laser_data = rospy.wait_for_message('/F1ROS/laser/scan', LaserScan, timeout=5) + finally: + success = True + + if laser_data is None: + state = [0]*5 + return state, -200, True, {} + + self._gazebo_pause() + + state, _ = self.discrete_observation(laser_data, 5) + + laser_len = len(laser_data.ranges) + left_sum = sum(laser_data.ranges[laser_len - (laser_len // 5):laser_len - (laser_len // 10)]) # 80-90 + right_sum = sum(laser_data.ranges[(laser_len // 10):(laser_len // 5)]) # 10-20 + # center_detour = (right_sum - left_sum) / 5 + left_boundary = left_sum / 5 + right_boundary = right_sum / 5 + center_detour = 
right_boundary - left_boundary + # print("LEFT: {} - RIGHT: {}".format(left_boundary, right_boundary)) + + done = False + if abs(center_detour) > 2 or left_boundary < 2 or right_boundary < 2: + done = True + # print("center: {}".format(center_detour)) + + if not done: + if abs(center_detour) < 4: + reward = 5 + elif abs(center_detour < 2) and action == 1: + reward = 10 + else: # L or R no looping + reward = 2 + else: + reward = -200 + + return state, reward, done, {} + + def reset(self): + # === POSE === + self.set_new_pose() + time.sleep(0.1) + + # self._gazebo_reset() + self._gazebo_unpause() + + # Read laser data + laser_data = None + success = False + while laser_data is None or not success: + try: + laser_data = rospy.wait_for_message('/F1ROS/laser/scan', LaserScan, timeout=5) + finally: + success = True + + self._gazebo_pause() + + state = self.discrete_observation(laser_data, 5) + + return state diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_qlearn_laser.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_qlearn_laser.py index a668935b..b3cb4a97 100644 --- a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_qlearn_laser.py +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_qlearn_laser.py @@ -28,7 +28,7 @@ def __init__(self): self.unpause = rospy.ServiceProxy('/gazebo/unpause_physics', Empty) self.pause = rospy.ServiceProxy('/gazebo/pause_physics', Empty) self.reset_proxy = rospy.ServiceProxy('/gazebo/reset_simulation', Empty) - self.action_space = actions # spaces.Discrete(5) # F, L, R + self.action_space = spaces.Discrete(len(actions.keys())) # F, L, R self.reward_range = (-np.inf, np.inf) self.position = None self._seed() From f39f369845fef41138331e966140908231f612ed Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Wed, 28 Apr 2021 22:12:57 +0000 Subject: [PATCH 6/9] resolved wuth action type in yml files --- behavior_metrics/default.yml | 1 - behavior_metrics/utils/configuration.py | 40 +++++++++++++------------ 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/behavior_metrics/default.yml b/behavior_metrics/default.yml index 794d732d..f09e236d 100644 --- a/behavior_metrics/default.yml +++ b/behavior_metrics/default.yml @@ -19,7 +19,6 @@ Behaviors: BrainPath: 'brains/f1/brain_f1_explicit.py' Type: 'f1' - Action: None Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch Dataset: diff --git a/behavior_metrics/utils/configuration.py b/behavior_metrics/utils/configuration.py index 5d06c5fa..ef433ca2 100644 --- a/behavior_metrics/utils/configuration.py +++ b/behavior_metrics/utils/configuration.py @@ -93,7 +93,6 @@ def initialize_configuration(self, config_data): robot = config_data['Behaviors']['Robot'] self.brain_path = robot['BrainPath'] self.robot_type = robot['Type'] - self.action_type = robot['Action'] self.current_world = config_data['Behaviors']['Simulation']['World'] self.actuators = robot['Actuators'] @@ -116,24 +115,27 @@ def initialize_configuration(self, config_data): self.experiment_timeouts = config_data['Behaviors']['Experiment']['Timeout'] self.experiment_repetitions = config_data['Behaviors']['Experiment']['Repetitions'] - if self.robot_type == 'f1rl' and self.action_type == 'discrete': - self.action_set = robot['Parameters']['action_set'] - self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] - self.alpha = robot['Parameters']['alpha'] - self.gamma = robot['Parameters']['gamma'] - self.epsilon = robot['Parameters']['epsilon'] - self.total_episodes = robot['Parameters']['total_episodes'] - 
self.epsilon_discount = robot['Parameters']['epsilon_discount'] - self.env = robot['Parameters']['env'] - - if self.robot_type == 'f1rl' and self.action_type == 'continuous': - self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] - self.tau = robot['Parameters']['tau'] - self.gamma = robot['Parameters']['gamma'] - self.epsilon_decay = robot['Parameters']['epsilon_decay'] - self.hidden_size = robot['Parameters']['hidden_size'] - self.total_episodes = robot['Parameters']['total_episodes'] - self.env = robot['Parameters']['env'] + if self.robot_type == 'f1rl': + self.action_type = robot['Action'] + + if self.action_type == 'discrete': + self.action_set = robot['Parameters']['action_set'] + self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] + self.alpha = robot['Parameters']['alpha'] + self.gamma = robot['Parameters']['gamma'] + self.epsilon = robot['Parameters']['epsilon'] + self.total_episodes = robot['Parameters']['total_episodes'] + self.epsilon_discount = robot['Parameters']['epsilon_discount'] + self.env = robot['Parameters']['env'] + + elif self.action_type == 'continuous': + self.gazebo_positions_set = robot['Parameters']['gazebo_positions_set'] + self.tau = robot['Parameters']['tau'] + self.gamma = robot['Parameters']['gamma'] + self.epsilon_decay = robot['Parameters']['epsilon_decay'] + self.hidden_size = robot['Parameters']['hidden_size'] + self.total_episodes = robot['Parameters']['total_episodes'] + self.env = robot['Parameters']['env'] def create_layout_from_cfg(self, cfg): """Creates the configuration for the layout of the sensors view panels specified from configuration file From 7bc84d656aa4137408210c263d38ecbc68122cf7 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Mon, 3 May 2021 19:43:43 +0000 Subject: [PATCH 7/9] added torch ddpg with cuda integration --- .../brains/f1rl/trainContinuous.py | 216 +++++++++------ behavior_metrics/brains/f1rl/utils/ddpg.py | 261 +++++++++--------- .../f1rl/utils/ddpg_utils/Policy_ddpg.py | 58 ++++ .../f1rl/utils/ddpg_utils/Value_ddpg.py | 62 +++++ .../brains/f1rl/utils/ddpg_utils/actor.py | 76 ----- .../brains/f1rl/utils/ddpg_utils/critic.py | 112 -------- .../brains/f1rl/utils/ddpg_utils/ddpg_step.py | 47 ++++ .../f1rl/utils/ddpg_utils/ddpg_utils.py | 199 +++++++++++++ .../brains/f1rl/utils/ddpg_utils/utils.py | 68 ----- behavior_metrics/default-rl.yml | 2 +- behavior_metrics/default-rlddpg.yml | 2 +- .../envs/f1/env_gazebo_f1_ddpg_camera.py | 23 +- 12 files changed, 646 insertions(+), 480 deletions(-) create mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/Policy_ddpg.py create mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/Value_ddpg.py delete mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/actor.py delete mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/critic.py create mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_step.py create mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_utils.py delete mode 100644 behavior_metrics/brains/f1rl/utils/ddpg_utils/utils.py diff --git a/behavior_metrics/brains/f1rl/trainContinuous.py b/behavior_metrics/brains/f1rl/trainContinuous.py index 3132baea..c8edc868 100755 --- a/behavior_metrics/brains/f1rl/trainContinuous.py +++ b/behavior_metrics/brains/f1rl/trainContinuous.py @@ -1,13 +1,16 @@ import time from datetime import datetime import pickle +import torch +from torch.utils.tensorboard import SummaryWriter import gym from brains.f1rl.utils import liveplot import gym_gazebo import numpy as 
np from gym import logger, wrappers -# from brains.f1rl.utils.ddpg import DDPGAgent +from brains.f1rl.utils.ddpg import DDPG import brains.f1rl.utils.ddpg_utils.settingsDDPG as settings +from PIL import Image def render(): render_skip = 0 # Skip first X episodes. @@ -20,32 +23,24 @@ def render(): (render_episodes < episode): env.render(close=True) -def save_model(agent): - # Tabular RL: Tabular Q-learning basically stores the policy (Q-values) of the agent into a matrix of shape - # (S x A), where s are all states, a are all the possible actions. After the environment is solved, just save this - # matrix as a csv file. I have a quick implementation of this on my GitHub under Reinforcement Learning. - from datetime import datetime - import pickle - date = datetime.now() - format = date.strftime("%Y%m%d_%H%M%S") - file_name = "_ddpg_model_g_{}_t_{}_h_{}".format(agent.gamma, agent.tau, agent.hidden_size) - if os.path.isdir('brains/f1rl/logs') is False: - os.mkdir('brains/f1rl/logs') - if os.path.isdir('brains/f1rl/logs/ddpg_models/') is False: - os.mkdir('brains/f1rl/logs/ddpg_models/') - file_actor = open("brains/f1rl/logs/ddpg_models/" + format + file_name + '_actor.pkl', 'wb') - file_critic = open("brains/f1rl/logs/ddpg_models/" + format + file_name + '_critic.pkl', 'wb') - pickle.dump(agent.get_actor_weights(), file_actor) - pickle.dump(agent.get_critic_weights(), file_critic) - # if __name__ == '__main__': print(settings.title) print(settings.description) -env = gym.make('GazeboF1LaserEnvDDPG-v0') +current_env = settings.current_env +if current_env == "laser": + env_id = "GazeboF1LaserEnvDDPG-v0" + env = gym.make('GazeboF1LaserEnvDDPG-v0') +elif current_env == "camera": + env_id = "GazeboF1CameraEnvDDPG-v0" + env = gym.make('GazeboF1CameraEnvDDPG-v0') +else: + print("NO correct env selected") outdir = './logs/f1_ddpg_gym_experiments/' -stats = {} # epoch: steps + +if not os.path.exists(outdir): + os.makedirs(outdir+'images/') env = gym.wrappers.Monitor(env, outdir, force=True) plotter = liveplot.LivePlot(outdir) @@ -53,18 +48,12 @@ def save_model(agent): stimate_step_per_lap = 4000 lap_completed = False -#if settings.load_model: -# exit(1) -# -# qlearn_file = open('logs/qlearn_models/20200826_154342_qlearn_model_e_0.988614_a_0.2_g_0.9.pkl', 'rb') -# model = pickle.load(qlearn_file) -# print("Number of (action, state): {}".format(len(model))) -# qlearn.q = model -# qlearn.alpha = settings.alpha -# qlearn.gamma = settings.gamma -# qlearn.epsilon = settings.epsilon -# highest_reward = 4000 -#else: +if settings.load_model: + save_path = outdir+'model/' + model_path = save_path +else: + save_path = outdir+'model/' + model_path = None highest_reward = 0 @@ -72,65 +61,122 @@ def save_model(agent): start_time = time.time() +seed = 123 +save_iter = int(total_episodes/20) +render = False +writer = SummaryWriter(outdir) +env.seed(seed) + +ddpg = DDPG(env_id, + 32, + 2, + render=False, + num_process=1, + memory_size=1000000, + lr_p=1e-3, + lr_v=1e-3, + gamma=0.99, + polyak=0.995, + explore_size=2000, + step_per_iter=1000, + batch_size=256, + min_update_step=1000, + update_step=50, + action_noise=0.1, + seed=seed) + print(settings.lets_go) max_action = [12., 2] min_action = [2., -2] for episode in range(total_episodes): - done = False + + global_steps = (episode - 1) * ddpg.step_per_iter + log = dict() + num_steps = 0 + num_episodes = 0 + total_reward = 0 + min_episode_reward = float('inf') + max_episode_reward = float('-inf') lap_completed = False cumulated_reward = 0 # Should going forward give 
more reward then L/R z? - state = env.reset() - - for step in range(20000): - - action = 2*np.random.rand(2)-1 - - mod_action = action - - for itr in range(len(action)): - mod_action[itr] = min_action[itr] + 0.5*(max_action[itr] - min_action[itr])*(action[itr]+1) - - # Execute the action and get feedback - nextState, reward, done, info = env.step(mod_action) - cumulated_reward += reward - - if highest_reward < cumulated_reward: - highest_reward = cumulated_reward - - env._flush(force=True) - - if not done: - state = nextState - else: - last_time_steps = np.append(last_time_steps, [int(step + 1)]) - stats[episode] = step - break - - - if stimate_step_per_lap > 4000 and not lap_completed: - print("LAP COMPLETED!!") - lap_completed = True - - if episode % 100 == 0 and settings.plotter_graphic: - plotter.plot_steps_vs_epoch(stats) - - if episode % 1000 == 0 and settings.save_model: - print("\nSaving model . . .\n") - save_model(agent) - - m, s = divmod(int(time.time() - start_time), 60) - h, m = divmod(m, 60) - print ("EP: " + str(episode + 1) + " - Reward: " + str( - cumulated_reward) + " - Time: %d:%02d:%02d" % (h, m, s) + " - steps: " + str(step)) - -print ("\n|" + str(total_episodes) + "|" + str(agent.gamma) + "|" + str(agent.tau) + "|" + str(highest_reward) + "| PICTURE |") - -l = last_time_steps.tolist() -l.sort() - -print("Overall score: {:0.2f}".format(last_time_steps.mean())) -print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:]))) + while num_steps < ddpg.step_per_iter: + state = env.reset() + # state = self.running_state(state) + episode_reward = 0 + + for t in range(1000): + + if global_steps < ddpg.explore_size: # explore + action = env.action_space.sample() + else: # action with noise + action = ddpg.choose_action(state, ddpg.action_noise) + + mod_action = action + + for itr in range(len(action)): + mod_action[itr] = min_action[itr] + 0.5*(max_action[itr] - min_action[itr])*(action[itr]+1) + + next_state, reward, done, info = env.step(action) + # next_state = self.running_state(next_state) + mask = 0 if done else 1 + # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob') + ddpg.memory.push(state, action, reward, next_state, mask, None) + + # print("Points:", info['points']) + # print("Errors:", info['errors']) + # observation_image = Image.fromarray(info['image'].reshape(32,32)) + # observation_image.save(outdir+'/images/obs'+str(episode)+str(t)+'.jpg') + + episode_reward += reward + cumulated_reward += reward + global_steps += 1 + num_steps += 1 + + if global_steps >= ddpg.min_update_step and global_steps % ddpg.update_step == 0: + for _ in range(ddpg.update_step): + batch = ddpg.memory.sample( + ddpg.batch_size) # random sample batch + ddpg.update(batch) + + if done or num_steps >= ddpg.step_per_iter: + if highest_reward < cumulated_reward: + highest_reward = cumulated_reward + break + + state = next_state + + if num_steps > 4000 and not lap_completed: + print("LAP COMPLETED!!") + lap_completed = True + + num_episodes += 1 + total_reward += episode_reward + min_episode_reward = min(episode_reward, min_episode_reward) + max_episode_reward = max(episode_reward, max_episode_reward) + + log['num_steps'] = num_steps + log['num_episodes'] = num_episodes + log['total_reward'] = total_reward + log['avg_reward'] = total_reward / num_episodes + log['max_episode_reward'] = max_episode_reward + log['min_episode_reward'] = min_episode_reward + + print(f"Iter: {episode}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, 
" + f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, " + f"average reward: {log['avg_reward']: .4f}") + + # record reward information + writer.add_scalar("total reward", log['total_reward'], episode) + writer.add_scalar("average reward", log['avg_reward'], episode) + writer.add_scalar("min reward", log['min_episode_reward'], episode) + writer.add_scalar("max reward", log['max_episode_reward'], episode) + writer.add_scalar("num steps", log['num_steps'], episode) + + if episode % save_iter == 0: + ddpg.save(save_path) + + torch.cuda.empty_cache() env.close() diff --git a/behavior_metrics/brains/f1rl/utils/ddpg.py b/behavior_metrics/brains/f1rl/utils/ddpg.py index cad5ab58..4fad5c53 100644 --- a/behavior_metrics/brains/f1rl/utils/ddpg.py +++ b/behavior_metrics/brains/f1rl/utils/ddpg.py @@ -1,132 +1,137 @@ -import numpy as np -import tensorflow as tf -from brains.f1rl.utils.ddpg_utils.actor import Actor -from brains.f1rl.utils.ddpg_utils.critic import Critic -from brains.f1rl.utils.ddpg_utils.utils import OrnsteinUhlenbeckActionNoise, ReplayBuffer -from copy import deepcopy - -class DDPGAgent: - - def __init__(self, - sess = None, - state_dim = None, - action_dim = None, - training_batch = None, - epsilon_decay = None, - gamma = 0.99, - tau = 0.001): - - self._sess = sess - self._state_dim = state_dim - self._action_dim = action_dim - - self._actor = Actor(sess=self._sess, - state_dim=self._state_dim, - action_dim=self._action_dim, - hidden_sizes=[32, 16, 16], - output_activation=tf.nn.tanh, - scope = 'actor') - - self._critic = Critic(sess=sess, - state_dim=self._state_dim, - action_dim=self._action_dim, - latent_dim=2*self._action_dim, - hidden_sizes=[32, 16, 32, 32], - output_activation=tf.nn.relu, - scope = 'critic') - - self._target_actor = Actor(sess=self._sess, - state_dim=self._state_dim, - action_dim=self._action_dim, - hidden_sizes=[32, 16, 16], - output_activation=tf.nn.tanh, - scope = 'target_actor') - - self._target_critic = Critic(sess=sess, - state_dim=self._state_dim, - action_dim=self._action_dim, - latent_dim=2*self._action_dim, - hidden_sizes=[32, 16, 32, 32], - output_activation=tf.nn.relu, - scope = 'target_critic') - - self._actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self._action_dim)) - - self._memory_limit = 20000 - self._memory_batch = training_batch - self._memory = ReplayBuffer(buffer_size=self._memory_limit) - - self._epsilon = 1.0 - self._epsilon_decay = epsilon_decay - - self._discount_factor = gamma - self._tau = tau - - self._hard_update_actor = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=1.0) - self._hard_update_critic = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=1.0) - - self._soft_update_actor = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=self._tau) - self._soft_update_critic = self.update_target_network(self._actor.network_params, self._target_actor.network_params, tau=self._tau) - - def memorize(self, state, action, reward, terminal, next_state): - - self._memory.add(state, action, reward, terminal, next_state) - - def bellman(self, rewards, q_values, terminal, gamma): - - critic_target = np.random.rand(1) +#!/usr/bin/env python +import pickle - if terminal: - critic_target[0] = rewards - else: - critic_target[0] = rewards + gamma * q_values - return critic_target - - def predict(self, state): - - action = self._actor.predict(state=state)[0] - - action 
+= self._actor_noise() - - action = np.clip(action, -1.0, 1.0) - - return action - - def train(self): - - if self._memory.size() > self._memory_batch: - - s_batch, a_batch, r_batch, t_batch, next_s_batch = self._memory.sample_batch(self._memory_batch) - - for i in range(self._memory_batch): - - q_value = self._target_critic.predict(state=next_s_batch[i], - action=self._target_actor.predict(state=next_s_batch[i])) - - critic_target = self.bellman(r_batch[i], q_value, t_batch[i], self._discount_factor) - - self._critic.update(state=s_batch[i], action=a_batch[i], critic_target=critic_target) - - actions = self._actor.predict(state=s_batch[i]) - grads = self._critic.gradients(state=s_batch[i], action=actions) - - self._actor.update(state=s_batch[i], action_gradients=grads) - - self._sess.run(self._soft_update_actor) - self._sess.run(self._soft_update_critic) - - else: - pass - - def update_target_network(self, network_params, target_network_params, tau): +import numpy as np +import torch +import torch.optim as optim - op_holder = [] - for from_var,to_var in zip(network_params, target_network_params): - op_holder.append(to_var.assign((tf.multiply(from_var, tau) + tf.multiply(to_var, 1. - tau)))) - - return op_holder +from brains.f1rl.utils.ddpg_utils.ddpg_step import ddpg_step +from brains.f1rl.utils.ddpg_utils.Policy_ddpg import Policy +from brains.f1rl.utils.ddpg_utils.Value_ddpg import Value +from brains.f1rl.utils.ddpg_utils.ddpg_utils import Memory, get_env_info, check_path, device, FLOAT, ZFilter - def _hard_update(self): - self._sess.run(self._hard_update_actor) - self._sess.run(self._hard_update_critic) +class DDPG: + def __init__(self, + env_id, + num_states, + num_actions, + render=False, + num_process=1, + memory_size=1000000, + lr_p=1e-3, + lr_v=1e-3, + gamma=0.99, + polyak=0.995, + explore_size=10000, + step_per_iter=3000, + batch_size=100, + min_update_step=1000, + update_step=50, + action_noise=0.1, + seed=1, + model_path=None + ): + self.env_id = env_id + self.num_states = num_states + self.num_actions = num_actions + self.gamma = gamma + self.polyak = polyak + self.memory = Memory(memory_size) + self.explore_size = explore_size + self.step_per_iter = step_per_iter + self.render = render + self.num_process = num_process + self.lr_p = lr_p + self.lr_v = lr_v + self.batch_size = batch_size + self.min_update_step = min_update_step + self.update_step = update_step + self.action_noise = action_noise + self.model_path = model_path + self.seed = seed + + self._init_model() + + def _init_model(self): + """init model from parameters""" + + self.action_low = -1 + self.action_high = 1 + + # seeding + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.policy_net = Policy( + self.num_actions, self.action_high).to(device) + self.policy_net_target = Policy( + self.num_actions, self.action_high).to(device) + + self.value_net = Value(64, self.num_actions).to(device) + self.value_net_target = Value(64, self.num_actions).to(device) + + self.running_state = ZFilter((self.num_states,), clip=5) + + if self.model_path: + print("Loading Saved Model {}_ddpg.p".format(self.env_id)) + self.policy_net, self.value_net, self.running_state = pickle.load( + open('{}/{}_ddpg.p'.format(self.model_path, self.env_id), "rb")) + + self.policy_net_target.load_state_dict(self.policy_net.state_dict()) + self.value_net_target.load_state_dict(self.value_net.state_dict()) + + self.optimizer_p = optim.Adam( + self.policy_net.parameters(), lr=self.lr_p) + self.optimizer_v = optim.Adam( + 
self.value_net.parameters(), lr=self.lr_v) + + def choose_action(self, state, noise_scale): + """select action""" + state = np.random.rand(1,32,32) + state = FLOAT(state).unsqueeze(0).to(device) + with torch.no_grad(): + action, log_prob = self.policy_net.get_action_log_prob(state) + action = action.cpu().numpy()[0] + # add noise + noise = noise_scale * np.random.randn(self.num_actions) + action += noise + action = np.clip(action, -self.action_high, self.action_high) + return action + + def eval(self, i_iter, render=False): + """evaluate model""" + state = self.env.reset() + state = np.random.rand(1,32,32) + test_reward = 0 + while True: + if render: + self.env.render() + # state = self.running_state(state) + action = self.choose_action(state, 0) + state, reward, done, _ = self.env.step(action) + state = np.random.rand(1,32,32) + test_reward += reward + if done: + break + print(f"Iter: {i_iter}, test Reward: {test_reward}") + self.env.close() + + def update(self, batch): + """learn model""" + batch_state = FLOAT(batch.state).to(device) + batch_action = FLOAT(batch.action).to(device) + batch_reward = FLOAT(batch.reward).to(device) + batch_next_state = FLOAT(batch.next_state).to(device) + batch_mask = FLOAT(batch.mask).to(device) + + # update by DDPG + alg_step_stats = ddpg_step(self.policy_net, self.policy_net_target, self.value_net, self.value_net_target, self.optimizer_p, + self.optimizer_v, batch_state, batch_action, batch_reward, batch_next_state, batch_mask, + self.gamma, self.polyak) + + def save(self, save_path): + """save model""" + check_path(save_path) + pickle.dump((self.policy_net, self.value_net, self.running_state), + open('{}/{}_ddpg.p'.format(save_path, self.env_id), 'wb')) diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/Policy_ddpg.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/Policy_ddpg.py new file mode 100644 index 00000000..b6feb84a --- /dev/null +++ b/behavior_metrics/brains/f1rl/utils/ddpg_utils/Policy_ddpg.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F + +# if gpu is to be used +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def init_weight(m): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0.0) + + +class Policy(nn.Module): + def __init__( + self, + dim_action, + max_action=None, + activation=nn.LeakyReLU + ): + super(Policy, self).__init__() + self.dim_action = dim_action + self.max_action = max_action + + self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2) + self.bn1 = nn.BatchNorm2d(16) + self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) + self.bn2 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(32) + + # Number of Linear input connections depends on output of conv2d layers + # and therefore the input image size, so compute it. 
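+        # With the 32x32 grayscale input used here, each 5x5 stride-2 conv maps
+        # size -> (size - 5) // 2 + 1, i.e. 32 -> 14 -> 5 -> 1, so the flattened
+        # feature vector fed to the linear head has 1 * 1 * 32 = 32 elements.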
+ def conv2d_size_out(size, kernel_size = 5, stride = 2): + return (size - (kernel_size - 1) - 1) // stride + 1 + + convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(32))) + convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(32))) + linear_input_size = convw * convh * 32 + self.head = nn.Linear(linear_input_size, self.dim_action) + + self.apply(init_weight) + + def forward(self, x): + x = x.to(device) + x = F.relu(self.bn1(self.conv1(x))) + x = F.relu(self.bn2(self.conv2(x))) + x = F.relu(self.bn3(self.conv3(x))) + action = self.head(x.view(x.size(0), -1)) + return action * self.max_action + + def get_action_log_prob(self, states): + action = self.forward(states) + return action, None + diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/Value_ddpg.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/Value_ddpg.py new file mode 100644 index 00000000..4106e331 --- /dev/null +++ b/behavior_metrics/brains/f1rl/utils/ddpg_utils/Value_ddpg.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F + +# if gpu is to be used +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def init_weight(m): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0.0) + + +class Value(nn.Module): + def __init__(self, dim_state, dim_action, dim_hidden=32, activation=nn.LeakyReLU): + super(Value, self).__init__() + + self.dim_state = dim_state + self.dim_action = dim_action + self.dim_hidden = dim_hidden + + self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2) + self.bn1 = nn.BatchNorm2d(16) + self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) + self.bn2 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(32) + + # Number of Linear input connections depends on output of conv2d layers + # and therefore the input image size, so compute it. 
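+        # Same 32 -> 14 -> 5 -> 1 spatial reduction as the policy encoder; the
+        # flattened features are projected to dim_state by self.head and then
+        # concatenated with the action before the value MLP.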
+ def conv2d_size_out(size, kernel_size = 5, stride = 2): + return (size - (kernel_size - 1) - 1) // stride + 1 + + convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(32))) + convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(32))) + linear_input_size = convw * convh * 32 + self.head = nn.Linear(linear_input_size, self.dim_state) + + self.apply(init_weight) + + self.value = nn.Sequential( + nn.Linear(self.dim_state + self.dim_action, self.dim_hidden), + activation(), + nn.Linear(self.dim_hidden, 1) + ) + + self.value.apply(init_weight) + + def forward(self, states, actions): + + x = states.to(device) + x = F.relu(self.bn1(self.conv1(x))) + x = F.relu(self.bn2(self.conv2(x))) + x = F.relu(self.bn3(self.conv3(x))) + encoded_state = self.head(x.view(x.size(0), -1)) + state_actions = torch.cat([encoded_state, actions], dim=1) + value = self.value(state_actions) + return value + diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/actor.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/actor.py deleted file mode 100644 index 96032190..00000000 --- a/behavior_metrics/brains/f1rl/utils/ddpg_utils/actor.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np -import tensorflow as tf - - -class Actor: - - def __init__(self, - sess=None, - state_dim = None, - action_dim = None, - hidden_sizes = None, - learning_rate = 0.0003, - hidden_activation = tf.nn.relu, - output_activation = tf.nn.tanh, - w_init=tf.contrib.layers.xavier_initializer(), - b_init=tf.zeros_initializer(), - scope = 'actor' - ): - - self._sess = sess - self._state = tf.placeholder(dtype=tf.float32, shape=state_dim, name='state') - self._action_grads = tf.placeholder(dtype=tf.float32, shape=action_dim, name='action_grads') - - ############################# Actor Layer ########################### - - with tf.variable_scope(scope): - - layer = tf.layers.conv1d(inputs=tf.expand_dims(self._state, 0), - filters=hidden_sizes[0], - kernel_size=4, - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_in') - - for i in range(len(hidden_sizes)-1): - - layer = tf.layers.conv1d(inputs=layer, - filters=hidden_sizes[i+1], - kernel_size=3, - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_'+str(i+1)) - - layer = tf.layers.flatten(inputs=layer, - name='layer_flatten') - - layer = tf.layers.dense(inputs=layer, - units=action_dim, - activation=output_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_out') - - self.layer_out = tf.layers.flatten(layer) - - self.predicted_action = tf.squeeze(self.layer_out) - - self.network_params = tf.trainable_variables() - - self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) - - params_grad = tf.gradients(self.predicted_action, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), -self._action_grads) - grads = zip(params_grad, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)) - - self._train_op = self._optimizer.apply_gradients(grads) - - def predict(self, state): - return self._sess.run(self.predicted_action, {self._state: state}) - - def update(self, state, action_gradients): - - self._sess.run(self._train_op, - {self._state: state, - self._action_grads: action_gradients}) diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/critic.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/critic.py deleted file mode 100644 index 549f1c25..00000000 --- a/behavior_metrics/brains/f1rl/utils/ddpg_utils/critic.py +++ /dev/null @@ -1,112 +0,0 @@ -import 
numpy as np -import tensorflow as tf - - -class Critic: - - def __init__(self, - sess=None, - state_dim = None, - action_dim = None, - latent_dim = None, - hidden_sizes = None, - learning_rate = 0.003, - hidden_activation = tf.nn.relu, - output_activation = tf.nn.relu, - w_init=tf.contrib.layers.xavier_initializer(), - b_init=tf.zeros_initializer(), - scope = 'critic' - ): - - self._sess = sess - - ############################# Critic Layer ########################### - - with tf.variable_scope(scope): - - self._state = tf.placeholder(dtype=tf.float32, shape=state_dim, name='state') - self._action = tf.placeholder(dtype=tf.float32, shape=action_dim, name='action') - self._return_target = tf.placeholder(dtype=tf.float32, shape=1, name='target') - - with tf.GradientTape() as tape: - - tape.watch([self._state, self._action]) - - layer = tf.layers.conv1d(inputs=tf.expand_dims(self._state, 0), - filters=hidden_sizes[0], - kernel_size=4, - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_in') - - for i in range(len(hidden_sizes)-3): - - layer = tf.layers.conv1d(inputs=layer, - filters=hidden_sizes[i+1], - kernel_size=3, - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_'+str(i+1)) - - layer = tf.layers.flatten(inputs=layer, - name='layer_flatten') - - layer = tf.layers.dense(inputs=layer, - units=latent_dim, - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='layer_out_latent') - - layer = tf.layers.flatten(layer) - - action_layer = tf.concat([layer, tf.reshape(self._action, (1,action_dim))], 1) - - action_layer = tf.layers.dense(inputs=action_layer, - units=hidden_sizes[i+2], - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='action_layer_'+str(i+2)) - - action_layer = tf.layers.dense(inputs=action_layer, - units=hidden_sizes[i+3], - activation=hidden_activation, - kernel_initializer=w_init, - bias_initializer=b_init, - name='action_layer_'+str(i+3)) - - action_layer = tf.layers.dense(inputs=action_layer, - units=1, - activation=None, - kernel_initializer=w_init, - bias_initializer=b_init, - name='action_layer_out') - - self.layer_out = tf.layers.flatten(action_layer) - - self.predicted_qvalue = tf.squeeze(self.layer_out) - - self._loss = tf.reduce_mean(tf.square(self.predicted_qvalue - self._return_target)) - - self._action_gradients = tape.gradient(self.predicted_qvalue, self._action) - - self.network_params = tf.trainable_variables() - - self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) - self._train_op = self._optimizer.minimize(self._loss) - - def gradients(self, state, action): - return self._sess.run(self._action_gradients, {self._state: state, self._action: action}) - - def predict(self, state, action): - return self._sess.run(self.predicted_qvalue, {self._state: state, self._action: action}) - - def update(self, state, action, critic_target): - - self._sess.run(self._train_op, - {self._state: state, - self._action: action, - self._return_target: critic_target}) diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_step.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_step.py new file mode 100644 index 00000000..2404aad6 --- /dev/null +++ b/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_step.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +import torch +import torch.nn as nn + +from brains.f1rl.utils.ddpg_utils.ddpg_utils import get_flat_params, set_flat_params + 
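+# A single DDPG update:
+#   critic      - regress Q(s, a) towards r + gamma * mask * Q_target(s', pi_target(s'))
+#   actor       - improve pi by minimising the negative mean of Q(s, pi(s))
+#   target nets - polyak averaging: theta_target <- polyak * theta_target + (1 - polyak) * theta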
+ +def ddpg_step(policy_net, policy_net_target, value_net, value_net_target, optimizer_policy, optimizer_value, + states, actions, rewards, next_states, masks, gamma, polyak): + masks = masks.unsqueeze(-1) + rewards = rewards.unsqueeze(-1) + """update critic""" + + values = value_net(states, actions) + + with torch.no_grad(): + target_next_values = value_net_target( + next_states, policy_net_target(next_states)) + target_values = rewards + gamma * masks * target_next_values + value_loss = nn.MSELoss()(values, target_values) + + optimizer_value.zero_grad() + value_loss.backward() + optimizer_value.step() + + """update actor""" + + policy_loss = - value_net(states, policy_net(states)).mean() + optimizer_policy.zero_grad() + policy_loss.backward() + optimizer_policy.step() + + """soft update target nets""" + policy_net_flat_params = get_flat_params(policy_net) + policy_net_target_flat_params = get_flat_params(policy_net_target) + set_flat_params(policy_net_target, polyak * + policy_net_target_flat_params + (1 - polyak) * policy_net_flat_params) + + value_net_flat_params = get_flat_params(value_net) + value_net_target_flat_params = get_flat_params(value_net_target) + set_flat_params(value_net_target, polyak * + value_net_target_flat_params + (1 - polyak) * value_net_flat_params) + + return {"critic_loss": value_loss, + "policy_loss": policy_loss + } + diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_utils.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_utils.py new file mode 100644 index 00000000..dfc87c65 --- /dev/null +++ b/behavior_metrics/brains/f1rl/utils/ddpg_utils/ddpg_utils.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python + +import random +from collections import namedtuple, deque +import gym +from gym.spaces import Discrete +import os +import numpy as np +import torch +import torch.nn as nn + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +FLOAT = torch.FloatTensor +DOUBLE = torch.DoubleTensor +LONG = torch.LongTensor + + +def to_device(*args): + return [arg.to(device) for arg in args] + + +def get_flat_params(model: nn.Module): + """ + get tensor flatted parameters from model + :param model: + :return: tensor + """ + return torch.cat([param.view(-1) for param in model.parameters()]) + + +def get_flat_grad_params(model: nn.Module): + """ + get flatted grad of parameters from the model + :param model: + :return: tensor + """ + return torch.cat( + [param.grad.view(-1) if param.grad is not None else torch.zeros(param.view(-1).shape) for param in + model.parameters()]) + + +def set_flat_params(model, flat_params): + """ + set tensor flatted parameters to model + :param model: + :param flat_params: tensor + :return: + """ + prev_ind = 0 + for param in model.parameters(): + flat_size = param.numel() + param.data.copy_( + flat_params[prev_ind:prev_ind + flat_size].view(param.size())) + prev_ind += flat_size + + +def resolve_activate_function(name): + if name.lower() == "relu": + return nn.ReLU + if name.lower() == "sigmoid": + return nn.Sigmoid + if name.lower() == "leakyrelu": + return nn.LeakyReLU + if name.lower() == "prelu": + return nn.PReLU + if name.lower() == "softmax": + return nn.Softmax + if name.lower() == "tanh": + return nn.Tanh + + +Transition = namedtuple( + 'Transition', ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')) + + +def check_path(path): + if not os.path.exists(path): + print(f"{path} not exist") + os.makedirs(path) + print(f"Create {path} success") + +def get_env_space(env_id): + env = gym.make(env_id) + 
num_states = env.observation_space.shape[0] + if type(env.action_space) == Discrete: + num_actions = env.action_space.n + else: + num_actions = env.action_space.shape[0] + + return env, num_states, num_actions + + +def get_env_info(env_id, unwrap=False): + env = gym.make(env_id) + if unwrap: + env = env.unwrapped + num_states = env.observation_space.shape[0] + env_continuous = False + if type(env.action_space) == Discrete: + num_actions = env.action_space.n + else: + num_actions = env.action_space.shape[0] + env_continuous = True + + return env, env_continuous, num_states, num_actions + + +class Memory(object): + def __init__(self, size=None): + self.memory = deque(maxlen=size) + + # save item + def push(self, *args): + self.memory.append(Transition(*args)) + + def clear(self): + self.memory.clear() + + def append(self, other): + self.memory += other.memory + + # sample a mini_batch + def sample(self, batch_size=None): + # sample all transitions + if batch_size is None: + return Transition(*zip(*self.memory)) + else: # sample with size: batch_size + random_batch = random.sample(self.memory, batch_size) + return Transition(*zip(*random_batch)) + + def __len__(self): + return len(self.memory) + +# from https://github.com/joschu/modular_rl +# http://www.johndcook.com/blog/standard_deviation/ + + +class RunningStat(object): + def __init__(self, shape): + self._n = 0 + self._M = np.zeros(shape) + self._S = np.zeros(shape) + + def push(self, x): + x = np.asarray(x) + assert x.shape == self._M.shape + self._n += 1 + if self._n == 1: + self._M[...] = x + else: + oldM = self._M.copy() + self._M[...] = oldM + (x - oldM) / self._n + self._S[...] = self._S + (x - oldM) * (x - self._M) + + @property + def n(self): + return self._n + + @property + def mean(self): + return self._M + + @property + def var(self): + return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) + + @property + def std(self): + return np.sqrt(self.var) + + @property + def shape(self): + return self._M.shape + +class ZFilter: + """ + y = (x-mean)/std + using running estimates of mean,std + """ + + def __init__(self, shape, demean=True, destd=True, clip=10.0): + self.demean = demean + self.destd = destd + self.clip = clip + + self.rs = RunningStat(shape) + self.fix = False + + def __call__(self, x, update=True): + if update and not self.fix: + self.rs.push(x) + if self.demean: + x = x - self.rs.mean + if self.destd: + x = x / (self.rs.std + 1e-8) + if self.clip: + x = np.clip(x, -self.clip, self.clip) + return x diff --git a/behavior_metrics/brains/f1rl/utils/ddpg_utils/utils.py b/behavior_metrics/brains/f1rl/utils/ddpg_utils/utils.py deleted file mode 100644 index c40ee4c4..00000000 --- a/behavior_metrics/brains/f1rl/utils/ddpg_utils/utils.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys -import argparse -import numpy as np -import gym -import pandas as pd -import math -from collections import deque -import random - -class OrnsteinUhlenbeckActionNoise: - def __init__(self, mu, sigma=0.25, theta=.05, dt=1e-2, x0=None): - self.theta = theta - self.mu = mu - self.sigma = sigma - self.dt = dt - self.x0 = x0 - self.reset() - - def __call__(self): - x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) - self.x_prev = x - return x - - def reset(self): - self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) - - def __repr__(self): - return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) - -class 
ReplayBuffer(object): - - def __init__(self, buffer_size, random_seed=123): - self.buffer_size = buffer_size - self.count = 0 - self.buffer = deque() - random.seed(random_seed) - - def add(self, state, action, reward, terminal, next_state): - experience = (state, action, reward, terminal, next_state) - if self.count < self.buffer_size: - self.buffer.append(experience) - self.count += 1 - else: - self.buffer.popleft() - self.buffer.append(experience) - - def size(self): - return self.count - - def sample_batch(self, batch_size): - batch = [] - - if self.count < batch_size: - batch = random.sample(self.buffer, self.count) - else: - batch = random.sample(self.buffer, batch_size) - - state_batch = np.array([_[0] for _ in batch]) - action_batch = np.array([_[1] for _ in batch]) - reward_batch = np.array([_[2] for _ in batch]) - terminal_batch = np.array([_[3] for _ in batch]) - next_state_batch = np.array([_[4] for _ in batch]) - - return state_batch, action_batch, reward_batch, terminal_batch, next_state_batch - - def clear(self): - self.buffer.clear() - self.count = 0 diff --git a/behavior_metrics/default-rl.yml b/behavior_metrics/default-rl.yml index e8549d4e..38080f5f 100644 --- a/behavior_metrics/default-rl.yml +++ b/behavior_metrics/default-rl.yml @@ -28,7 +28,7 @@ Behaviors: epsilon: 0.99 total_episodes: 20000 epsilon_discount: 0.9986 - env: 'laser' + env: 'camera' Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch Dataset: diff --git a/behavior_metrics/default-rlddpg.yml b/behavior_metrics/default-rlddpg.yml index 14fe2cdc..868b940e 100644 --- a/behavior_metrics/default-rlddpg.yml +++ b/behavior_metrics/default-rlddpg.yml @@ -26,7 +26,7 @@ Behaviors: gamma: 0.9 epsilon_decay: 0.98 hidden_size: 0.99 - total_episodes: 20000 + total_episodes: 2000000 env: 'camera' Simulation: World: /opt/jderobot/share/jderobot/gazebo/launch/simple_circuit.launch diff --git a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py index 717ee026..fb0aab4a 100755 --- a/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py +++ b/gym-gazebo/gym_gazebo/envs/f1/env_gazebo_f1_ddpg_camera.py @@ -380,10 +380,12 @@ def step(self, action): error_1, error_2, error_3 = self.calculate_error(point_1, point_2, point_3) # == REWARD == - if not done: - reward = self.calculate_reward(error_1, error_2, error_3) - else: - reward = -200 + #if not done: + # reward = self.calculate_reward(error_1, error_2, error_3) + #else: + # reward = -200 + + reward = self.calculate_reward(error_1, error_2, error_3) # == TELEMETRY == if telemetry: @@ -391,15 +393,18 @@ def step(self, action): cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) - observation = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) + observation = cv_image.reshape(1, cv_image.shape[0], cv_image.shape[1]) vehicle_state = [vel_cmd.linear.x, vel_cmd.angular.z] info = {'error_1': error_1, 'error_2': error_2, 'error_3': error_3} - stacked_obs = {'image':observation, 'state':vehicle_state} + stacked_obs = {'image':observation, + 'state':vehicle_state, + 'points': [point_1, point_2, point_3], + 'errors': [error_1, error_2, error_3]} # OpenAI standard return: observation, reward, done, info - return stacked_obs, reward, done, {} + return observation, reward, done, stacked_obs # test STACK 4 # cv_image = cv_image.reshape(1, 1, cv_image.shape[0], cv_image.shape[1]) @@ -435,11 +440,11 @@ def reset(self): 
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) cv_image = cv2.resize(cv_image, (self.img_rows, self.img_cols)) - state = cv_image.reshape(cv_image.shape[0], cv_image.shape[1]) + state = cv_image.reshape(1, cv_image.shape[0], cv_image.shape[1]) stacked_obs = {'image': state, 'state':[0., 0.]} - return stacked_obs + return state # test STACK 4 # self.s_t = np.stack((cv_image, cv_image, cv_image, cv_image), axis=0) From 3b586c27cfd02297ad671030127911c9647a77ca Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Mon, 3 May 2021 19:47:39 +0000 Subject: [PATCH 8/9] added torch installation in requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 8c2d5eb3..2227a79e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,3 +29,4 @@ vcstool==0.2.14 scikit-image==0.17.2 bagpy==0.4.2 pycryptodomex==3.9.9 +torch==1.8.1 From a0cda6fd537ba50d48f3fbd9b4a0c95ae92a8b57 Mon Sep 17 00:00:00 2001 From: UtkarshMishra04 Date: Fri, 14 May 2021 03:58:20 +0000 Subject: [PATCH 9/9] updated local repository --- behavior_metrics/default-rl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/behavior_metrics/default-rl.yml b/behavior_metrics/default-rl.yml index 38080f5f..42d1d306 100644 --- a/behavior_metrics/default-rl.yml +++ b/behavior_metrics/default-rl.yml @@ -17,7 +17,7 @@ Behaviors: MaxV: 3 MaxW: 0.3 RL: True - BrainPath: 'brains/f1rl/train.py' + BrainPath: 'brains/f1rl/f1_follow_line_camera_dqn.py' Type: 'f1rl' Action: 'discrete' Parameters:
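A minimal standalone sketch of the action rescaling applied in trainContinuous.py before env.step(), assuming the policy emits actions in [-1, 1] and the [min_action, max_action] bounds defined there; rescale_action is an illustrative helper name, not part of the patch.

import numpy as np

def rescale_action(action, min_action, max_action):
    """Map each component of `action` from [-1, 1] to [min, max]."""
    action = np.clip(np.asarray(action, dtype=float), -1.0, 1.0)
    low = np.asarray(min_action, dtype=float)
    high = np.asarray(max_action, dtype=float)
    # Same formula as the per-component loop in trainContinuous.py:
    # scaled = min + 0.5 * (max - min) * (a + 1)
    return low + 0.5 * (high - low) * (action + 1.0)

# With the bounds used for the F1 DDPG brain (linear velocity in [2, 12] m/s,
# angular velocity in [-2, 2] rad/s), a centred action maps to mid-range speed:
print(rescale_action([0.0, -1.0], [2.0, -2.0], [12.0, 2.0]))  # -> [ 7. -2.]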