From 2bf6199574a9823c11b7288f1753f9f1b17d27df Mon Sep 17 00:00:00 2001
From: "M.GUNASEKHAR" <95043391+gunasekhar159@users.noreply.github.com>
Date: Sun, 5 May 2024 23:10:12 +0530
Subject: [PATCH] Update README.md

---
 README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index aa97ef9..b6c3e60 100644
--- a/README.md
+++ b/README.md
@@ -2,22 +2,73 @@

 ## AIM

-Write the experiment AIM.
+To develop a Python program that finds the optimal policy for the given RL environment using Q-Learning and compares the resulting state values with those of the Monte Carlo method.

 ## PROBLEM STATEMENT

-Explain the problem statement.
+Derive the optimal policy for the given RL environment using Q-Learning and compare the state values it produces with the state values estimated by the Monte Carlo method.

 ## Q LEARNING ALGORITHM

-Include the steps involved in the Q Learning algorithm
+Step 1:
+Initialize the Q-table and the hyperparameters (discount factor, learning-rate schedule, exploration schedule, number of episodes).
+
+Step 2:
+For each step of an episode, choose an action with the epsilon-greedy policy, execute it, observe the next state and reward, and update the Q-value with the temporal-difference rule Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)); repeat until the episode ends.
+
+Step 3:
+After training, derive the optimal policy from the Q-table by acting greedily in every state.
+
+Step 4:
+Implement the Monte Carlo method to estimate the state values.
+
+Step 5:
+Compare the Q-Learning policy and state values with the Monte Carlo results for the given RL environment.

 ## Q LEARNING FUNCTION

-Include the Q Learning function
+Developed by: M.Gunasekhar
+Ref no: 212221240014
+
+```python
+import numpy as np
+from tqdm import tqdm
+
+# decay_schedule is assumed to be defined elsewhere in the notebook
+# (a sketch is given after the RESULT section); it returns one value
+# per episode, decaying from an initial value to a minimum value.
+
+def q_learning(env,
+               gamma=1.0,
+               init_alpha=0.5,
+               min_alpha=0.01,
+               alpha_decay_ratio=0.5,
+               init_epsilon=1.0,
+               min_epsilon=0.1,
+               epsilon_decay_ratio=0.9,
+               n_episodes=3000):
+    nS, nA = env.observation_space.n, env.action_space.n
+    pi_track = []
+    Q = np.zeros((nS, nA), dtype=np.float64)
+    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
+
+    # Epsilon-greedy selection: exploit the greedy action with
+    # probability 1 - epsilon, otherwise explore a random action.
+    select_action = lambda state, Q, epsilon: (
+        np.argmax(Q[state])
+        if np.random.random() > epsilon
+        else np.random.randint(len(Q[state])))
+
+    # Per-episode schedules for the learning rate and exploration rate.
+    alphas = decay_schedule(init_alpha, min_alpha,
+                            alpha_decay_ratio, n_episodes)
+    epsilons = decay_schedule(init_epsilon, min_epsilon,
+                              epsilon_decay_ratio, n_episodes)
+
+    for e in tqdm(range(n_episodes), leave=False):
+        state, done = env.reset(), False
+        while not done:
+            action = select_action(state, Q, epsilons[e])
+            next_state, reward, done, _ = env.step(action)
+            # TD target bootstraps from the best next-state action;
+            # the (not done) factor removes the bootstrap at terminal states.
+            td_target = reward + gamma * Q[next_state].max() * (not done)
+            td_error = td_target - Q[state][action]
+            Q[state][action] = Q[state][action] + alphas[e] * td_error
+            state = next_state
+        Q_track[e] = Q
+        pi_track.append(np.argmax(Q, axis=1))
+
+    # Greedy value function and policy derived from the learned Q-table.
+    V = np.max(Q, axis=1)
+    pi = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
+    return Q, V, pi, Q_track, pi_track
+```

 ## OUTPUT:
-Mention the optimal policy, optimal value function , success rate for the optimal policy.
-
-Include plot comparing the state value functions of Monte Carlo method and Qlearning.
+(Output screenshots: the optimal policy, the optimal value function, the success rate of the optimal policy, and the plot comparing the state-value functions of the Monte Carlo method and Q-Learning.)

 ## RESULT:

-Write your result here
+Thus, a Python program was successfully developed to find the optimal policy for the given RL environment using Q-Learning, and the resulting state values were compared with those of the Monte Carlo method.
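+
+The `q_learning` function above relies on a `decay_schedule` helper that is not defined in this README. A minimal sketch of one possible implementation is given below; the log-space decay curve and the extra `log_start`/`log_base` parameters are assumptions, not part of the original code.
+
+```python
+import numpy as np
+
+def decay_schedule(init_value, min_value, decay_ratio, max_steps,
+                   log_start=-2, log_base=10):
+    # Decay over the first decay_ratio fraction of the steps,
+    # then hold the minimum value for the remaining steps.
+    decay_steps = int(max_steps * decay_ratio)
+    rem_steps = max_steps - decay_steps
+    # Reversed log-spaced curve, normalised to [0, 1].
+    values = np.logspace(log_start, 0, decay_steps,
+                         base=log_base, endpoint=True)[::-1]
+    values = (values - values.min()) / (values.max() - values.min())
+    # Rescale to [min_value, init_value] and pad the tail with min_value.
+    values = (values * (init_value - min_value)) + min_value
+    values = np.pad(values, (0, rem_steps), 'edge')
+    return values
+```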
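+
+Steps 4 and 5 call for Monte Carlo state-value estimates to compare against the Q-Learning values. A minimal first-visit Monte Carlo prediction sketch is shown below; the function name `mc_prediction`, the constant step size `alpha`, and the `max_steps` episode cap are illustrative assumptions, and the same old-Gym API as in `q_learning` (`reset()` returns a state, `step()` returns a 4-tuple) is assumed.
+
+```python
+import numpy as np
+from tqdm import tqdm
+
+def mc_prediction(env, pi, gamma=1.0, alpha=0.01,
+                  n_episodes=3000, max_steps=200):
+    # First-visit Monte Carlo prediction of V under a fixed policy pi,
+    # using a constant-alpha incremental update.
+    nS = env.observation_space.n
+    V = np.zeros(nS, dtype=np.float64)
+    for _ in tqdm(range(n_episodes), leave=False):
+        # Roll out one episode following pi.
+        trajectory = []
+        state, done = env.reset(), False
+        for _ in range(max_steps):
+            action = pi(state)
+            next_state, reward, done, _ = env.step(action)
+            trajectory.append((state, reward))
+            state = next_state
+            if done:
+                break
+        # Backward pass: return G_t for every time step of the episode.
+        returns = np.zeros(len(trajectory))
+        G = 0.0
+        for t in reversed(range(len(trajectory))):
+            _, r = trajectory[t]
+            G = r + gamma * G
+            returns[t] = G
+        # First-visit update: only the first occurrence of each state counts.
+        seen = set()
+        for t, (s, _) in enumerate(trajectory):
+            if s not in seen:
+                seen.add(s)
+                V[s] += alpha * (returns[t] - V[s])
+    return V
+```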
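+
+A hypothetical usage sketch for the comparison plot mentioned in the OUTPUT section, assuming `env` is an already-created discrete Gym environment:
+
+```python
+import matplotlib.pyplot as plt
+
+# Learn Q-values, then evaluate the greedy policy with Monte Carlo.
+Q, V_q, pi, Q_track, pi_track = q_learning(env)
+V_mc = mc_prediction(env, pi)
+
+# Overlay the two state-value estimates, state by state.
+plt.plot(V_mc, label='Monte Carlo')
+plt.plot(V_q, label='Q-Learning')
+plt.xlabel('State')
+plt.ylabel('Estimated state value')
+plt.legend()
+plt.show()
+```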