From 2bf6199574a9823c11b7288f1753f9f1b17d27df Mon Sep 17 00:00:00 2001
From: "M.GUNASEKHAR" <95043391+gunasekhar159@users.noreply.github.com>
Date: Sun, 5 May 2024 23:10:12 +0530
Subject: [PATCH] Update README.md
---
README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 59 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index aa97ef9..b6c3e60 100644
--- a/README.md
+++ b/README.md
@@ -2,22 +2,73 @@
## AIM
-Write the experiment AIM.
+To develop a Python program that finds the optimal policy for the given RL environment using Q-Learning and compares the resulting state values with those obtained by the Monte Carlo method.
## PROBLEM STATEMENT
-Explain the problem statement.
+Develop a Python program that derives the optimal policy for the given RL environment using Q-Learning and compares the resulting state values with those estimated by the Monte Carlo method.
## Q LEARNING ALGORITHM
-Include the steps involved in the Q Learning algorithm
+Step 1:
+Initialize the Q-table and the hyperparameters (learning rate, discount factor, and exploration schedule).
+
+Step 2:
+For each step of an episode, choose an action with the epsilon-greedy policy, execute it, observe the next state and reward, and update the corresponding Q-value; repeat until the episode ends.
+
+Step 3:
+After training, derive the optimal policy from the Q-table by acting greedily in every state.
+
+Step 4:
+Implement the Monte Carlo method to estimate the state values (see the sketch after these steps).
+
+Step 5:
+Compare the Q-Learning policy and state values with the Monte Carlo results for the given RL environment.
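+For Step 4, the following is a minimal first-visit Monte Carlo prediction sketch, assuming the same tabular, old-style gym environment API used by the Q-Learning function below; the name `mc_prediction` and its arguments are illustrative, not part of the original experiment code.
+
+```python
+import numpy as np
+from tqdm import tqdm
+
+def mc_prediction(env, pi, gamma=1.0, alpha=0.01,
+                  n_episodes=3000, max_steps=200):
+    # First-visit Monte Carlo prediction of the state-value function V
+    # for a fixed policy pi, using a constant step size alpha.
+    nS = env.observation_space.n
+    V = np.zeros(nS, dtype=np.float64)
+    for _ in tqdm(range(n_episodes), leave=False):
+        # Roll out one episode following pi.
+        trajectory = []
+        state, done = env.reset(), False
+        for _ in range(max_steps):
+            action = pi(state)
+            next_state, reward, done, _ = env.step(action)
+            trajectory.append((state, reward))
+            state = next_state
+            if done:
+                break
+        # Walk the episode backwards accumulating the return G; because
+        # later overwrites win, each state keeps the return of its FIRST visit.
+        G, first_visit_return = 0.0, {}
+        for s, r in reversed(trajectory):
+            G = r + gamma * G
+            first_visit_return[s] = G
+        for s, G_s in first_visit_return.items():
+            V[s] = V[s] + alpha * (G_s - V[s])
+    return V
+```
+
+The returned array can then be compared state by state with `V = np.max(Q, axis=1)` produced by the Q-Learning function in the next section.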
## Q LEARNING FUNCTION
-Include the Q Learning function
+Developed by: M.Gunasekhar
+Ref no: 212221240014
+```python
+import numpy as np
+from tqdm import tqdm
+
+def q_learning(env,
+               gamma=1.0,
+               init_alpha=0.5,
+               min_alpha=0.01,
+               alpha_decay_ratio=0.5,
+               init_epsilon=1.0,
+               min_epsilon=0.1,
+               epsilon_decay_ratio=0.9,
+               n_episodes=3000):
+    nS, nA = env.observation_space.n, env.action_space.n
+    pi_track = []
+    Q = np.zeros((nS, nA), dtype=np.float64)
+    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
+
+    # Epsilon-greedy selection: exploit the best known action with
+    # probability 1 - epsilon, otherwise explore a random action.
+    select_action = lambda state, Q, epsilon: (
+        np.argmax(Q[state])
+        if np.random.random() > epsilon
+        else np.random.randint(len(Q[state])))
+
+    # Decaying schedules for the learning rate and exploration rate.
+    alphas = decay_schedule(init_alpha, min_alpha,
+                            alpha_decay_ratio, n_episodes)
+    epsilons = decay_schedule(init_epsilon, min_epsilon,
+                              epsilon_decay_ratio, n_episodes)
+
+    for e in tqdm(range(n_episodes), leave=False):
+        state, done = env.reset(), False
+        while not done:
+            action = select_action(state, Q, epsilons[e])
+            next_state, reward, done, _ = env.step(action)
+            # TD target bootstraps from the greedy value of the next
+            # state and is truncated on terminal transitions.
+            td_target = reward + gamma * Q[next_state].max() * (not done)
+            td_error = td_target - Q[state][action]
+            Q[state][action] = Q[state][action] + alphas[e] * td_error
+            state = next_state
+        Q_track[e] = Q
+        pi_track.append(np.argmax(Q, axis=1))
+
+    V = np.max(Q, axis=1)
+    pi = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
+    return Q, V, pi, Q_track, pi_track
+```
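+
+The function above relies on a `decay_schedule` helper that is not shown in this README. Below is a minimal sketch of such a helper, assuming the exponentially decaying schedule commonly used with this style of tabular RL code; the exact helper used in the experiment may differ.
+
+```python
+import numpy as np
+
+def decay_schedule(init_value, min_value, decay_ratio,
+                   max_steps, log_start=-2, log_base=10):
+    # Decay from init_value to min_value over the first decay_ratio
+    # fraction of max_steps, then hold min_value for the rest.
+    decay_steps = int(max_steps * decay_ratio)
+    rem_steps = max_steps - decay_steps
+    values = np.logspace(log_start, 0, decay_steps,
+                         base=log_base, endpoint=True)[::-1]
+    values = (values - values.min()) / (values.max() - values.min())
+    values = (init_value - min_value) * values + min_value
+    values = np.pad(values, (0, rem_steps), 'edge')
+    return values
+```
+
+A typical call might look like the snippet below, assuming an older `gym` API in which `reset()` returns only the state and `step()` returns four values, as the function above expects; the environment name is illustrative.
+
+```python
+import gym
+
+env = gym.make('FrozenLake-v1')  # illustrative environment choice
+Q, V_ql, pi_ql, Q_track, pi_track = q_learning(env, n_episodes=3000)
+print('State values:', V_ql)
+print('Greedy policy:', [pi_ql(s) for s in range(env.observation_space.n)])
+```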
## OUTPUT:
-Mention the optimal policy, optimal value function , success rate for the optimal policy.
-
-Include plot comparing the state value functions of Monte Carlo method and Qlearning.
+
+
+
+
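+If the comparison plot of the two state-value estimates is to be generated programmatically, a small sketch along the following lines could be used; `V_mc` and `V_ql` stand for the value arrays obtained from the Monte Carlo method and from Q-Learning, and are assumed inputs, not outputs reproduced here.
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+
+def plot_value_comparison(V_mc, V_ql):
+    # V_mc, V_ql: 1-D arrays of state values, e.g. from mc_prediction
+    # and from V = np.max(Q, axis=1) returned by q_learning.
+    states = np.arange(len(V_ql))
+    plt.plot(states, V_mc, marker='o', label='Monte Carlo')
+    plt.plot(states, V_ql, marker='s', label='Q-Learning')
+    plt.xlabel('State')
+    plt.ylabel('Estimated state value')
+    plt.title('State-value comparison: Monte Carlo vs Q-Learning')
+    plt.legend()
+    plt.show()
+```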
## RESULT:
+Thus, a Python program was successfully developed to find the optimal policy for the given RL environment using Q-Learning, and the resulting state values were compared with those obtained by the Monte Carlo method.
-Write your result here