Changes from all commits
21 commits
6a674d6
Made code modular, tested. Works
Asar5 Jan 4, 2021
66a0e7c
Separated lift from rest. Tested. Running longer test
Asar5 Jan 4, 2021
c234977
Added lift remove in eval, updating total reward as well. Tested
Asar5 Jan 4, 2021
58d2aae
Reverted arguments to original values
Asar5 Jan 4, 2021
f414006
Removed Lift, tested code. Working.
Asar5 Jan 6, 2021
7927ae3
Added saving agent data during pre-training. Made code more modular t…
Asar5 Jan 6, 2021
f2cb273
Made code modular (pre training and training). Activation key expired…
Asar5 Jan 4, 2021
2ac94dd
Tested and updated expert pid controller
hugheste Jan 13, 2021
25472b1
Merge pull request #15 from OSUrobotics/stephanie
hugheste Jan 13, 2021
2199b29
Testing expert pid controller and profiler
hugheste Jan 15, 2021
1c1deef
Merge pull request #16 from OSUrobotics/stephanie
hugheste Jan 15, 2021
e42050f
Generated expert data, saved output locally
hugheste Jan 20, 2021
0134e73
Pre-trained policy output and plotting code updates
hugheste Jan 21, 2021
3cd16d1
Merged changes from stephanie and anjali's changes for pre-training a…
hugheste Jan 21, 2021
e021273
Testing and updating pre-training, plotting all reward types
hugheste Jan 25, 2021
97ae30e
Corrected lift reward tensorboard plotting
hugheste Jan 26, 2021
00b8e83
Corrected lift reward tensorboard plotting
hugheste Jan 26, 2021
352f5d0
Updated main_DDPGfD so it was ready for random policy training
hugheste Jan 27, 2021
30138c7
Cleaned up some areas of the code, integrated batch training, reward …
hugheste Feb 1, 2021
15fe420
Merge branch 'anjali_remove_lift' into devel
hugheste Feb 1, 2021
3574afb
add state entrypoint, custom state indexing
jimzers Feb 4, 2021
1 change: 0 additions & 1 deletion .idea/KinovaGrasping.iml

(Generated and binary file changes not rendered.)
184 changes: 30 additions & 154 deletions gym-kinova-gripper/DDPGfD.py
@@ -51,7 +51,7 @@ def forward(self, state, action):


class DDPGfD(object):
def __init__(self, state_dim, action_dim, max_action, n, discount=0.995, tau=0.0005):
def __init__(self, state_dim, action_dim, max_action, n, discount=0.995, tau=0.0005, batch_size=64):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = copy.deepcopy(self.actor)
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
@@ -66,106 +66,38 @@ def __init__(self, state_dim, action_dim, max_action, n, discount=0.995, tau=0.0
self.network_repl_freq = 10
self.total_it = 0

# note: parameterize this later!!!
self.batch_size = 64
self.batch_size = batch_size

def select_action(self, state):
state = torch.FloatTensor(state.reshape(1, -1)).to(device)
return self.actor(state).cpu().data.numpy().flatten()


def train(self, episode_step, expert_replay_buffer, replay_buffer=None):
def train(self, episode_step, expert_replay_buffer, replay_buffer=None, prob=0.8):
""" Update policy based on full trajectory of one episode """
self.total_it += 1

# Sample replay buffer
# Determine which replay buffer to sample from
if replay_buffer is not None and expert_replay_buffer is None: # Only use agent replay
expert_or_random = "agent"
elif replay_buffer is None and expert_replay_buffer is not None: # Only use expert replay
expert_or_random = "expert"
else:
print("PROBABILITY EXPERT OR AGENT")
expert_or_random = np.random.choice(np.array(["expert", "agent"]), p=[0.8, 0.2])
#expert_or_random = np.random.choice(np.array(["expert", "agent"]), p=[0.7, 0.3])
expert_or_random = np.random.choice(np.array(["expert", "agent"]), p=[prob, round(1. - prob, 2)])

if expert_or_random == "expert":
state, action, next_state, reward, not_done = expert_replay_buffer.sample()
else:
state, action, next_state, reward, not_done = replay_buffer.sample()
#state, action, next_state, reward, not_done = replay_buffer.sample_wo_expert()

# new sampling procedure for n step rollback
#state, action, next_state, reward, not_done = replay_buffer.sample_batch_nstep()

print("=======================state===================")
print(state.shape)
print("=======================next_state===================")
print(next_state.shape)

print("=====================action====================")
print(action.shape)

# Old implementation of target Q
# Target Q network
target_Q = self.critic_target(next_state, self.actor_target(next_state))
target_Q = reward + (self.discount * target_Q).detach() # bellman equation

''' Newly added N step code
target_Q = self.critic_target(next_state[:, 0], self.actor_target(next_state[:, 0]))
print("before regular")
print(target_Q.shape)
print(reward[:, 0].shape)
target_Q = reward[:, 0] + (self.discount * target_Q).detach() #bellman equation
'''

# print("======================target_Q===================")
# print(target_Q.shape)
#
# print("next state smaller")
# print(next_state[(self.n - 1):].shape)
#
# print("number of episode steps")
# print(episode_step)

"""
This is the updated version, assuming that we're sampling from a batch.
"""
''' Newly added N step code
target_action = self.actor_target(next_state[:, -1])
target_critic_val = self.critic_target(next_state[:, -1], target_action) # shape: (self.batch_size, 1)

n_step_return = torch.zeros(self.batch_size).to(device) # shape: (self.batch_size,)
# note: might need to pass n properly from higher state!!!

print("================================reward shape=======================")
print(reward[:, 0].shape) # idk the shape here, please record

for i in range(self.n):
n_step_return += (self.discount ** i) * reward[:, i].squeeze(-1)

print("====================n step return shape====================")
print(n_step_return.shape)

# this is the n step return with the added value fn estimation
target_QN = n_step_return + (self.discount ** self.n) * target_critic_val.squeeze(-1)
target_QN = target_QN.unsqueeze(dim=-1)

print("=======================target QN")
print(target_QN.shape)
'''

# Old version: Compute the target Q_N value
# Compute the target Q_N value
rollreward = []
target_QN = self.critic_target(next_state[(self.n - 1):], self.actor_target(next_state[(self.n - 1):]))

'''
# Checks episode size versus value of n (In case n is larger than the number of timesteps)
if state.shape[0] < 3:
for i in range(state.shape[0]):
roll_reward = (self.discount) * reward[i].item()
rollreward.append(roll_reward)
else:
'''
print("state.shape[0]: ", state.shape[0])
print("episode_step: ",episode_step)
ep_timesteps = episode_step
if state.shape[0] < episode_step:
ep_timesteps = state.shape[0]
@@ -175,47 +107,19 @@ def train(self, episode_step, expert_replay_buffer, replay_buffer=None):
roll_reward = (self.discount**(self.n - 1)) * reward[i].item() + (self.discount**(self.n - 2)) * reward[i - (self.n - 2)].item() + (self.discount ** 0) * reward[i-(self.n - 1)].item()
rollreward.append(roll_reward)


if len(rollreward) != ep_timesteps - (self.n - 1):
raise ValueError

#print("roll reward before reshape: ")
#print(rollreward)
#print(len(rollreward))

rollreward = torch.FloatTensor(np.array(rollreward).reshape(-1,1)).to(device)
print("rollreward.get_shape(): ", rollreward.size())
print("target_QN.get_shape(): ", target_QN.size())
print("self.discount: ", self.discount)
print("self.n.: ", self.n)

# print("================SHAPE DUMP=============")
# print(rollreward.shape)
# print(((self.discount ** self.n) * target_QN).shape)

# Old code: calculate target network
# Calculate target network
target_QN = rollreward + (self.discount ** self.n) * target_QN #bellman equation <= this is the final N step return

# Old code: Get current Q estimate
# Get current Q estimate
current_Q = self.critic(state, action)

# New implementation
#current_Q = self.critic(state[:, 0], action[:, 0])

# Old code: Get current Q estimate for n-step return
#current_Q_n = self.critic(state[:(episode_step - (self.n - 1))], action[:(episode_step - (self.n - 1))])
current_Q_n = self.critic(state[:(ep_timesteps - (self.n - 1))], action[:(ep_timesteps - (self.n - 1))])

# New Updated for new rollback method
#current_Q_n = self.critic(state[:, -1], action[:, -1])

print("======================Q shapes finallly==============")
print(current_Q.shape)
print(target_Q.shape)
print(current_Q_n.shape)
print(target_QN.shape)
print("==============end printing pain==================")

# L_1 loss (Loss between current state, action and reward, next state, action)
critic_L1loss = F.mse_loss(current_Q, target_Q)
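For reference, a minimal standalone sketch (editor's illustration, not code from this PR) of the n-step target that train() builds in the hunk above: each window of n consecutive rewards is summed with discounts gamma^0 ... gamma^(n-1), then a gamma^n-discounted bootstrap from the target actor/critic is added. The three-term roll_reward expression above appears to assume n = 3; the helper below generalizes it. Names such as n_step_targets and bootstrap_q are illustrative assumptions, not identifiers from DDPGfD.py.

# Editor's illustrative sketch only; not part of this PR's diff.
import torch

def n_step_targets(rewards, bootstrap_q, gamma=0.995, n=3):
    # rewards: (T, 1) per-step rewards for one episode.
    # bootstrap_q: (T - n + 1, 1) target-critic values Q'(s_{t+n}, pi'(s_{t+n})).
    # Returns: (T - n + 1, 1) n-step return targets.
    T = rewards.shape[0]
    windows = []
    for t in range(T - n + 1):
        # Discounted sum of the n rewards in the window starting at timestep t
        g = sum((gamma ** k) * rewards[t + k] for k in range(n))
        windows.append(g)
    roll = torch.stack(windows)               # (T - n + 1, 1), like rollreward above
    return roll + (gamma ** n) * bootstrap_q  # add the discounted bootstrap term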

@@ -248,82 +152,53 @@ def train(self, episode_step, expert_replay_buffer, replay_buffer=None):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
return actor_loss.item(), critic_loss.item(), critic_L1loss.item(), critic_LNloss.item()

def train_batch(self, replay_buffer, episode_step):
def train_batch(self, episode_step, expert_replay_buffer, replay_buffer, prob):
""" Update policy networks based on batch_size of episodes using n-step returns """
self.total_it += 1

# new sampling procedure for n step rollback
state, action, next_state, reward, not_done = replay_buffer.sample_batch_nstep()

# print("=======================state===================")
# print(state.shape)
# print("=======================next_state===================")
# print(next_state.shape)
#
# print("=====================action====================")
# print(action.shape)

# print("==========modified states and actions==============")
# print(state[:, -1])
# print(action[:, -1])


# Sample replay buffer
if replay_buffer is not None and expert_replay_buffer is None: # Only use agent replay
expert_or_random = "agent"
elif replay_buffer is None and expert_replay_buffer is not None: # Only use expert replay
expert_or_random = "expert"
else:
expert_or_random = np.random.choice(np.array(["expert", "agent"]), p=[prob, round(1. - prob, 2)])

# episode_step = len(state) # for varying episode sets
#print("expert_or_random: ",expert_or_random)
if expert_or_random == "expert":
#print("EXPERT")
state, action, next_state, reward, not_done = expert_replay_buffer.sample_batch(self.batch_size)
else:
#print("AGENT")
state, action, next_state, reward, not_done = replay_buffer.sample_batch(self.batch_size)

reward = reward.unsqueeze(-1)
not_done = not_done.unsqueeze(-1)

target_Q = self.critic_target(next_state[:, 0], self.actor_target(next_state[:, 0]))
# print("before regular")
# print(target_Q.shape)
# print(reward[:, 0].shape)
target_Q = reward[:, 0] + (self.discount * target_Q).detach() #bellman equation

# print("======================target_Q===================")
# print(target_Q.shape)
#
# print("next state smaller")
# print(next_state[(self.n - 1):].shape)
#
# print("number of episode steps")
# print(episode_step)

"""
This is the updated version, assuming that we're sampling from a batch.
"""
target_action = self.actor_target(next_state[:, -1])
target_critic_val = self.critic_target(next_state[:, -1], target_action) # shape: (self.batch_size, 1)

n_step_return = torch.zeros(self.batch_size).to(device) # shape: (self.batch_size,)
# note: might need to pass n properly from higher state!!!

# print("================================reward shape=======================")
# print(reward[:, 0].shape) # idk the shape here, please record

for i in range(self.n):
n_step_return += (self.discount ** i) * reward[:, i].squeeze(-1)

# print("====================n step return shape====================")
# print(n_step_return.shape)

# this is the n step return with the added value fn estimation
target_QN = n_step_return + (self.discount ** self.n) * target_critic_val.squeeze(-1)
target_QN = target_QN.unsqueeze(dim=-1)

# print("=======================target QN")
# print(target_QN.shape)

# New implementation
current_Q = self.critic(state[:, 0], action[:, 0])

# New Updated for new rollback method
current_Q_n = self.critic(state[:, -1], action[:, -1])

# print("======================Q shapes finallly==============")
# print(current_Q.shape)
# print(target_Q.shape)
# print(current_Q_n.shape)
# print(target_QN.shape)
# print("==============end printing pain==================")

# L_1 loss (Loss between current state, action and reward, next state, action)
critic_L1loss = F.mse_loss(current_Q, target_Q)

@@ -356,6 +231,7 @@ def train_batch(self, replay_buffer, episode_step):
target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
return actor_loss.item(), critic_loss.item(), critic_L1loss.item(), critic_LNloss.item()


def save(self, filename):
torch.save(self.critic.state_dict(), filename + "_critic")
torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")
@@ -367,4 +243,4 @@ def load(self, filename):
self.critic.load_state_dict(torch.load(filename + "_critic"))
self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
self.actor.load_state_dict(torch.load(filename + "_actor"))
self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
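A hedged companion sketch of the batched path in train_batch() above: a replay source is picked (expert with probability prob, agent otherwise), a (batch, n, ...) slice of transitions is sampled, and the target is the discounted reward sum along the n axis plus a gamma^n bootstrap at the final next_state. The buffer arguments and the selection helper below are assumptions standing in for the repository's replay classes; only the target arithmetic mirrors the code above.

# Editor's illustrative sketch only; not part of this PR's diff.
import numpy as np
import torch

def pick_replay(expert_buffer, agent_buffer, prob=0.8):
    # Choose the sampling source the way train()/train_batch() do:
    # fall back to whichever buffer exists, otherwise draw by probability.
    if agent_buffer is None:
        return expert_buffer
    if expert_buffer is None:
        return agent_buffer
    choice = np.random.choice(["expert", "agent"], p=[prob, round(1.0 - prob, 2)])
    return expert_buffer if choice == "expert" else agent_buffer

def batched_n_step_target(reward, next_state, actor_target, critic_target, n, gamma=0.995):
    # reward: (B, n) tensor; next_state: (B, n, state_dim) tensor.
    # Returns a (B, 1) n-step target bootstrapped at the last next_state.
    n_step_return = torch.zeros(reward.shape[0])
    for i in range(n):
        n_step_return += (gamma ** i) * reward[:, i]
    target_action = actor_target(next_state[:, -1])
    target_q = critic_target(next_state[:, -1], target_action)  # (B, 1)
    return (n_step_return + (gamma ** n) * target_q.squeeze(-1)).unsqueeze(-1)

Sampling mostly from the expert buffer (prob defaults to 0.8 in train()) keeps updates anchored to demonstration data, the usual DDPGfD-style trade-off between imitation and the agent's own experience.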
6 changes: 6 additions & 0 deletions gym-kinova-gripper/MUJOCO_LOG.TXT
@@ -73,3 +73,9 @@ ERROR: Expired activation key
Tue Oct 6 21:21:50 2020
ERROR: Expired activation key

Sun Jan 17 00:02:15 2021
ERROR: Expired activation key

Sun Jan 3 00:02:38 2021
ERROR: Expired activation key

Binary file modified gym-kinova-gripper/__pycache__/DDPGfD.cpython-36.pyc
Binary file modified gym-kinova-gripper/__pycache__/expert_data.cpython-36.pyc
Binary file modified gym-kinova-gripper/__pycache__/utils.cpython-36.pyc
36 changes: 0 additions & 36 deletions gym-kinova-gripper/eval_objects.csv
@@ -1,36 +0,0 @@
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S
C,u,b,e,S