## Python codes for Chapter 2: Multi-armed Bandits

#################

## Section 2.3: 10-armed testbed

import numpy as np
import matplotlib.pyplot as plt

which = lambda status: np.arange(len(status))[status]


rep = lambda x, k: np.array(list(x)*k)
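
# A minimal sanity check of the two helpers above (illustrative only, not used
# by the experiments): `which` returns the indices where a boolean array is
# True, and `rep` tiles a sequence k times into a NumPy array.
#print(which(np.array([True, False, True])))   # -> [0 2]
#print(rep([5.0], 3))                          # -> [5. 5. 5.]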

# greedy action

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_1 = np.mean(reward_history_1, axis = 0)

percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0)
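
# For reference: the sample-average estimate maintained above via running sums
# (Q = R_accum / K_accum) can equivalently be updated incrementally, which
# avoids storing the reward sums. A minimal sketch, not used in the runs above:
def sample_average_update(Q, K, index, reward):
    K[index] = K[index] + 1                               # pull count of the chosen arm
    Q[index] = Q[index] + (reward - Q[index]) / K[index]  # incremental mean
    return Q, K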

#################

# eps-greedy action: eps = 0.01

eps = 0.01

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_2 = np.mean(reward_history_2, axis = 0)

percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0)

#################

# eps-greedy action: eps = 0.1

eps = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_3 = np.zeros((run_num, step_num))
count_optimal_history_3 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_3[i, j] = reward
        if index == index_optimal:
            count_optimal_history_3[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_3 = np.mean(reward_history_3, axis = 0)

percent_optimal_3 = np.mean(count_optimal_history_3, axis = 0)
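
# The eps-greedy choice used in the two runs above, written as a standalone
# function for reference (a sketch; the loops above inline the same logic):
def eps_greedy_select(Q, eps):
    if np.random.uniform() < eps:
        return np.random.randint(len(Q))   # explore: random arm
    return np.argmax(Q)                    # exploit: greedy arm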

#################

# Figure 2.2(a)

ymin = np.min([ave_reward_1, ave_reward_2, ave_reward_3])


ymax = np.max([ave_reward_1, ave_reward_2, ave_reward_3])

ymin
ymax

plt.figure("Figure 2.2(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), ave_reward_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), ave_reward_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()

#################

# Figure 2.2(b)

ymin = np.min([percent_optimal_1, percent_optimal_2, percent_optimal_3])


ymax = np.max([percent_optimal_1, percent_optimal_2, percent_optimal_3])

ymin
ymax

plt.figure("Figure 2.2(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), percent_optimal_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), percent_optimal_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()

#################

## Section 2.6: Optimistic Initial Values

# different initial values

# greedy action (eps = 0) with Q1 = 5

Q1 = 5.0 # must be a float: rep() below takes its dtype from Q1, and an integer Q would truncate the updates


alpha = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    #Q = np.repeat(Q1, k_bandit) # equivalent alternative to rep()
    Q = rep([Q1], k_bandit)
    #print("data type of Q: ", Q.dtype)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])

ave_reward_1 = np.mean(reward_history_1, axis = 0)

percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0)
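
# For reference: the constant step-size update used in this section,
# Q <- Q + alpha * (R - Q), is an exponential recency-weighted average, so the
# optimistic initial value Q1 is gradually forgotten. A minimal sketch of the
# update as a standalone function (the loops above inline the same logic):
def constant_step_update(Q, index, reward, alpha):
    Q[index] = Q[index] + alpha * (reward - Q[index])
    return Q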

#################

# eps-greedy action: eps = 0.1 with Q1 = 0

Q1 = 0.0 # must be a float: rep() below takes its dtype from Q1


eps = 0.1
alpha = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
q_star = np.zeros(k_bandit) # expected reward
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    #Q = np.repeat(Q1, k_bandit) # equivalent alternative to rep()
    Q = rep([Q1], k_bandit)
    #print("data type of Q: ", Q.dtype)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])

ave_reward_2 = np.mean(reward_history_2, axis = 0)

percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0)

#################

# Figure 2.3(a)

ymin = np.min([ave_reward_1, ave_reward_2])


ymax = np.max([ave_reward_1, ave_reward_2])

ymin
ymax

plt.figure("Figure 2.3(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), ave_reward_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()

#################

# Figure 2.3(b)

ymin = np.min([percent_optimal_1, percent_optimal_2])


ymax = np.max([percent_optimal_1, percent_optimal_2])

ymin
ymax

plt.figure("Figure 2.3(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), percent_optimal_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()

#################

## Section 2.7: Upper-Confidence-Bound Action Selection

# eps-greedy action selection versus upper-confidence-bound action selection

# eps-greedy action: eps = 0.1

eps = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_eps = np.zeros((run_num, step_num))
count_optimal_history_eps = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_eps[i, j] = reward
        if index == index_optimal:
            count_optimal_history_eps[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_eps = np.mean(reward_history_eps, axis = 0)


percent_optimal_eps = np.mean(count_optimal_history_eps, axis = 0)

#################

# UCB action selection

c = 2
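
# The selection rule implemented in the loop below is the UCB rule of
# Section 2.7: pull any arm that has not been tried yet, otherwise choose
# argmax_a [ Q(a) + c * sqrt( ln(t) / N(a) ) ]. A standalone sketch of that
# rule for reference (the loop below inlines the same logic):
def ucb_select(Q, K, t, c):
    untried = which(K == 0)
    if len(untried) > 0:
        return np.random.choice(untried)               # try each arm once first
    return np.argmax(Q + c * np.sqrt(np.log(t) / K))   # upper confidence bound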

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_ucb = np.zeros((run_num, step_num))
count_optimal_history_ucb = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index_tmp = which(K_accum == 0)
            if len(index_tmp) > 0:
                index = np.random.choice(index_tmp, size = 1) # try the untried arms first
            else:
                index = np.argmax(Q + c * np.sqrt(np.log(j) / K_accum))
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_ucb[i, j] = reward
        if index == index_optimal:
            count_optimal_history_ucb[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_ucb = np.mean(reward_history_ucb, axis = 0)

percent_optimal_ucb = np.mean(count_optimal_history_ucb, axis = 0)

#################

# Figure 2.4(a)

ymin = np.min([ave_reward_eps, ave_reward_ucb])


ymax = np.max([ave_reward_eps, ave_reward_ucb])

ymin
ymax

plt.figure("Figure 2.4(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), ave_reward_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()

#################

# Figure 2.4(b)

ymin = np.min([percent_optimal_eps, percent_optimal_ucb])


ymax = np.max([percent_optimal_eps, percent_optimal_ucb])

ymin
ymax

plt.figure("Figure 2.4(a)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), percent_optimal_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()

#################

## Section 2.8: Gradient Bandit Algorithms

# gradient bandit algorithms

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history = np.zeros((4, run_num, step_num))
count_optimal_history = np.zeros((4, run_num, step_num))

seed = 1
np.random.seed(seed)

for setting in range(4):
    print("setting = ", setting)
    if setting == 0:
        alpha = 0.1; R_bar = True
    if setting == 1:
        alpha = 0.4; R_bar = True
    if setting == 2:
        alpha = 0.1; R_bar = False
    if setting == 3:
        alpha = 0.4; R_bar = False
    for i in range(run_num):
        q_star = np.random.normal(loc = 4, scale = 1.0, size = k_bandit)
        index_optimal = np.argmax(q_star) # optimal action
        reward_accum = 0
        H = np.zeros(k_bandit)
        for j in range(step_num):
            sum_all = np.sum(np.exp(H))
            prob = np.exp(H) / sum_all # softmax action probabilities
            if j == 0:
                index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
            else:
                #index = np.argmax(prob) # first version: greedy action
                index = np.random.choice(k_bandit, size = 1, p = prob) # second version: sample the action from the softmax probabilities
            reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
            reward_history[setting, i, j] = reward
            if index == index_optimal:
                count_optimal_history[setting, i, j] = 1
            reward_accum = reward_accum + reward
            reward_ave = reward_accum / (j+1) # running average reward, used as the baseline
            for k in range(k_bandit):
                if k == index:
                    if R_bar == True:
                        H[index] = H[index] + alpha * (reward - reward_ave) * (1 - prob[index])
                    else:
                        H[index] = H[index] + alpha * (reward) * (1 - prob[index])
                else:
                    if R_bar == True:
                        H[k] = H[k] - alpha * (reward - reward_ave) * prob[k]
                    else:
                        H[k] = H[k] - alpha * (reward) * prob[k]

ave_reward = np.mean(reward_history, axis = 1)

percent_optimal = np.mean(count_optimal_history, axis = 1)
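
# For reference: the per-arm preference update in the inner k-loop above can be
# written as a single vectorized step. A minimal sketch, equivalent to the loop
# (here `index` is the chosen arm and `baseline` is 0 when R_bar is False):
def gradient_update(H, prob, index, reward, baseline, alpha):
    one_hot = np.zeros(len(H))
    one_hot[index] = 1.0
    # H_a <- H_a + alpha * (R - baseline) * (1{a = A} - pi(a)) for every arm a
    return H + alpha * (reward - baseline) * (one_hot - prob)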

#################

# Figure 2.5(a)

ymin = np.min(ave_reward)
ymax = np.max(ave_reward)

ymin
ymax

plt.figure("Figure 2.5(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), ave_reward[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()
#################

# Figure 2.5(b)

ymin = np.min(percent_optimal)
ymax = np.max(percent_optimal)

ymin
ymax

plt.figure("Figure 2.5(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), percent_optimal[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()

#################

# Figure 2.6: not reproduced here; generating it is too time-consuming

#################
