From 2bf6199574a9823c11b7288f1753f9f1b17d27df Mon Sep 17 00:00:00 2001
From: "M.GUNASEKHAR" <95043391+gunasekhar159@users.noreply.github.com>
Date: Sun, 5 May 2024 23:10:12 +0530
Subject: [PATCH] Update README.md

---
 README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index aa97ef9..b6c3e60 100644
--- a/README.md
+++ b/README.md
@@ -2,22 +2,73 @@

 ## AIM

-Write the experiment AIM.
+To develop a Python program that finds the optimal policy for the given RL environment using Q-Learning and compares the resulting state values with those of the Monte Carlo method.

 ## PROBLEM STATEMENT

-Explain the problem statement.
+Derive the optimal policy for the given RL environment using Q-Learning and compare the state values it produces with the state values estimated by the Monte Carlo method.

 ## Q LEARNING ALGORITHM

-Include the steps involved in the Q Learning algorithm
+Step 1:
+Initialize the Q-table and the hyperparameters (discount factor, learning-rate schedule, exploration schedule, number of episodes).
+
+Step 2:
+For each step of an episode, choose an action with the epsilon-greedy policy, execute it, observe the next state and reward, and update the Q-value with the temporal-difference rule Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)); repeat until the episode ends.
+
+Step 3:
+After training, derive the optimal policy from the Q-table by acting greedily in every state.
+
+Step 4:
+Implement the Monte Carlo method to estimate the state values.
+
+Step 5:
+Compare the Q-Learning policy and state values with the Monte Carlo results for the given RL environment.

 ## Q LEARNING FUNCTION

-Include the Q Learning function
+Developed by: M.Gunasekhar
+Ref no: 212221240014
+
+```python
+import numpy as np
+from tqdm import tqdm
+
+# decay_schedule is assumed to be defined elsewhere in the notebook
+# (a sketch is given after the RESULT section); it returns one value
+# per episode, decaying from an initial value to a minimum value.
+
+def q_learning(env,
+               gamma=1.0,
+               init_alpha=0.5,
+               min_alpha=0.01,
+               alpha_decay_ratio=0.5,
+               init_epsilon=1.0,
+               min_epsilon=0.1,
+               epsilon_decay_ratio=0.9,
+               n_episodes=3000):
+    nS, nA = env.observation_space.n, env.action_space.n
+    pi_track = []
+    Q = np.zeros((nS, nA), dtype=np.float64)
+    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
+
+    # Epsilon-greedy selection: exploit the greedy action with
+    # probability 1 - epsilon, otherwise explore a random action.
+    select_action = lambda state, Q, epsilon: (
+        np.argmax(Q[state])
+        if np.random.random() > epsilon
+        else np.random.randint(len(Q[state])))
+
+    # Per-episode schedules for the learning rate and exploration rate.
+    alphas = decay_schedule(init_alpha, min_alpha,
+                            alpha_decay_ratio, n_episodes)
+    epsilons = decay_schedule(init_epsilon, min_epsilon,
+                              epsilon_decay_ratio, n_episodes)
+
+    for e in tqdm(range(n_episodes), leave=False):
+        state, done = env.reset(), False
+        while not done:
+            action = select_action(state, Q, epsilons[e])
+            next_state, reward, done, _ = env.step(action)
+            # TD target bootstraps from the best next-state action;
+            # the (not done) factor removes the bootstrap at terminal states.
+            td_target = reward + gamma * Q[next_state].max() * (not done)
+            td_error = td_target - Q[state][action]
+            Q[state][action] = Q[state][action] + alphas[e] * td_error
+            state = next_state
+        Q_track[e] = Q
+        pi_track.append(np.argmax(Q, axis=1))
+
+    # Greedy value function and policy derived from the learned Q-table.
+    V = np.max(Q, axis=1)
+    pi = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
+    return Q, V, pi, Q_track, pi_track
+```

 ## OUTPUT:
-Mention the optimal policy, optimal value function , success rate for the optimal policy.
-
-Include plot comparing the state value functions of Monte Carlo method and Qlearning.
+(Output screenshots: the optimal policy, the optimal value function, the success rate of the optimal policy, and the plot comparing the state-value functions of the Monte Carlo method and Q-Learning.)

 ## RESULT:

-Write your result here
+Thus, a Python program was successfully developed to find the optimal policy for the given RL environment using Q-Learning, and the resulting state values were compared with those of the Monte Carlo method.
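+
+The `q_learning` function above relies on a `decay_schedule` helper that is not defined in this README. A minimal sketch of one possible implementation is given below; the log-space decay curve and the extra `log_start`/`log_base` parameters are assumptions, not part of the original code.
+
+```python
+import numpy as np
+
+def decay_schedule(init_value, min_value, decay_ratio, max_steps,
+                   log_start=-2, log_base=10):
+    # Decay over the first decay_ratio fraction of the steps,
+    # then hold the minimum value for the remaining steps.
+    decay_steps = int(max_steps * decay_ratio)
+    rem_steps = max_steps - decay_steps
+    # Reversed log-spaced curve, normalised to [0, 1].
+    values = np.logspace(log_start, 0, decay_steps,
+                         base=log_base, endpoint=True)[::-1]
+    values = (values - values.min()) / (values.max() - values.min())
+    # Rescale to [min_value, init_value] and pad the tail with min_value.
+    values = (values * (init_value - min_value)) + min_value
+    values = np.pad(values, (0, rem_steps), 'edge')
+    return values
+```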
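+
+Steps 4 and 5 call for Monte Carlo state-value estimates to compare against the Q-Learning values. A minimal first-visit Monte Carlo prediction sketch is shown below; the function name `mc_prediction`, the constant step size `alpha`, and the `max_steps` episode cap are illustrative assumptions, and the same old-Gym API as in `q_learning` (`reset()` returns a state, `step()` returns a 4-tuple) is assumed.
+
+```python
+import numpy as np
+from tqdm import tqdm
+
+def mc_prediction(env, pi, gamma=1.0, alpha=0.01,
+                  n_episodes=3000, max_steps=200):
+    # First-visit Monte Carlo prediction of V under a fixed policy pi,
+    # using a constant-alpha incremental update.
+    nS = env.observation_space.n
+    V = np.zeros(nS, dtype=np.float64)
+    for _ in tqdm(range(n_episodes), leave=False):
+        # Roll out one episode following pi.
+        trajectory = []
+        state, done = env.reset(), False
+        for _ in range(max_steps):
+            action = pi(state)
+            next_state, reward, done, _ = env.step(action)
+            trajectory.append((state, reward))
+            state = next_state
+            if done:
+                break
+        # Backward pass: return G_t for every time step of the episode.
+        returns = np.zeros(len(trajectory))
+        G = 0.0
+        for t in reversed(range(len(trajectory))):
+            _, r = trajectory[t]
+            G = r + gamma * G
+            returns[t] = G
+        # First-visit update: only the first occurrence of each state counts.
+        seen = set()
+        for t, (s, _) in enumerate(trajectory):
+            if s not in seen:
+                seen.add(s)
+                V[s] += alpha * (returns[t] - V[s])
+    return V
+```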
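+
+A hypothetical usage sketch for the comparison plot mentioned in the OUTPUT section, assuming `env` is an already-created discrete Gym environment:
+
+```python
+import matplotlib.pyplot as plt
+
+# Learn Q-values, then evaluate the greedy policy with Monte Carlo.
+Q, V_q, pi, Q_track, pi_track = q_learning(env)
+V_mc = mc_prediction(env, pi)
+
+# Overlay the two state-value estimates, state by state.
+plt.plot(V_mc, label='Monte Carlo')
+plt.plot(V_q, label='Q-Learning')
+plt.xlabel('State')
+plt.ylabel('Estimated state value')
+plt.legend()
+plt.show()
+```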