Assignment 2
In [1]:
import gym
import random
import matplotlib.pyplot as plt
import numpy as np
In [3]:
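# Create the FrozenLake environment used throughout. The arguments are
# assumptions reconstructed from later cells: is_slippery=False matches the
# clean, deterministic Q-values printed below, and render_mode="ansi" lets
# render() be printed as text inside the notebook.
environment = gym.make("FrozenLake-v1", is_slippery=False, render_mode="ansi")
environment.reset()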
In [4]:
# Q-table for the game: one row per state, one column per action
# Hard-coded alternative: qtable = np.zeros((16, 4))
nb_states = environment.observation_space.n    # = 16
nb_actions = environment.action_space.n        # = 4
qtable = np.zeros((nb_states, nb_actions))
qtable
Out[4]:
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
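Each row of the Q-table corresponds to one of the 16 grid squares (numbered row-major, so state = 4 * row + col), and each column to one of the 4 actions (0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP). Every value starts at zero because the agent has not experienced any reward yet.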
In [5]:
# Sample a random action; conceptually like random.choice(["LEFT", "DOWN", "RIGHT", "UP"])
environment.action_space.sample()
Out[5]:
In [6]:
# Move right (action 2) and render the new position
environment.step(2)
print(environment.render())
In [7]:
# 1. Choose a random action
action = environment.action_space.sample()
# 2. Implement this action and move the agent in the desired direction
new_state, reward, terminated, truncated, info = environment.step(action)
done = terminated or truncated
# 3. Display the results (reward and map)
print(environment.render())
print(f'Reward = {reward}')
Reward = 0.0
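The training cell below applies the tabular Q-learning update after every step:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left( r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right)$$

with learning rate $\alpha = 0.5$ and discount factor $\gamma = 0.9$; the $\max$ term bootstraps the value of the next state $s'$.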
In [8]:
# Hyperparameters
episodes = 1000 # Total number of episodes
alpha = 0.5 # Learning rate
gamma = 0.9 # Discount factor
# Training
for _ in range(episodes):
state = environment.reset()[0]
done = False
# Until the agent gets stuck in a hole or reaches the goal, keep training it
while not done:
# Choose the action with the highest value in the current state
if np.max(qtable[state]) > 0:
action = np.argmax(qtable[state])
# Implement this action and move the agent in the desired direction
new_state, reward, done, info,a = environment.step(action)
# Update Q(s,a)
qtable[state, action] = qtable[state, action] + \
alpha * (reward + gamma * np.max(qtable[new_state]) - q
table[state, action])
print()
print('===========================================')
print('Q-table after training:')
print(qtable)
# Plot outcomes
plt.figure(figsize=(12, 5))
plt.xlabel("Run number")
plt.ylabel("Outcome")
ax = plt.gca()
ax.set_facecolor('#efeeea')
plt.bar(range(len(outcomes)), outcomes, color="#0A047A", width=1.0)
plt.show()
Q-table before training:
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
===========================================
Q-table after training:
[[0. 0. 0.59049 0. ]
[0. 0. 0.6561 0. ]
[0. 0.729 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0.81 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0.2784375 0. ]
[0. 0.9 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 1. 0. ]
[0. 0. 0. 0. ]]
In [9]:
from IPython.display import clear_output
import time
state = environment.reset()[0]
done = False
sequence = []

while not done:
    # Greedy action, with a random fallback while the row is still all zeros
    action = np.argmax(qtable[state]) if np.max(qtable[state]) > 0 else environment.action_space.sample()
    sequence.append(action)
    # Implement this action and move the agent in the desired direction
    new_state, reward, terminated, truncated, info = environment.step(action)
    done = terminated or truncated
    state = new_state
    # Re-render the map after each move
    clear_output(wait=True)
    print(environment.render())
    time.sleep(1)

print(f"Sequence = {sequence}")
Sequence = [2, 2, 1, 1, 1, 2]
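Decoded with the action mapping noted above, 2, 2, 1, 1, 1, 2 reads RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT: a six-step shortest path from the start square to the goal, consistent with the greedy actions in the trained Q-table.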
In [10]:
episodes = 100
nb_success = 0

# Evaluation
for _ in range(episodes):
    state = environment.reset()[0]
    done = False
    # Until the agent gets stuck or reaches the goal, keep evaluating it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If every value is still zero, take a random action instead
        else:
            action = environment.action_space.sample()
        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated
        # Update the current state
        state = new_state
        # The terminal reward is 1 only when the goal is reached
        nb_success += reward

print(f"Success rate = {nb_success / episodes * 100}%")