## Python codes for Chapter 2: Multi-armed Bandits

#################

## Section 2.3: 10-armed testbed

import numpy as np
import matplotlib.pyplot as plt

which = lambda status: np.arange(len(status))[status]


rep = lambda x, k: np.array(list(x)*k)
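
# A minimal sanity check of the two helpers above (illustrative only, not used
# by the experiments): `which` returns the indices where a boolean array is
# True, and `rep` tiles a sequence k times into a NumPy array.
#print(which(np.array([True, False, True])))   # -> [0 2]
#print(rep([5.0], 3))                          # -> [5. 5. 5.]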

# greedy action

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_1 = np.mean(reward_history_1, axis = 0)

percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0)
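
# For reference: the sample-average estimate maintained above via running sums
# (Q = R_accum / K_accum) can equivalently be updated incrementally, which
# avoids storing the reward sums. A minimal sketch, not used in the runs above:
def sample_average_update(Q, K, index, reward):
    K[index] = K[index] + 1                               # pull count of the chosen arm
    Q[index] = Q[index] + (reward - Q[index]) / K[index]  # incremental mean
    return Q, K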

#################

# eps-greedy action: eps = 0.01

eps = 0.01

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_2 = np.mean(reward_history_2, axis = 0)

percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0)

#################

# eps-greedy action: eps = 0.1

eps = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_3 = np.zeros((run_num, step_num))
count_optimal_history_3 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_3[i, j] = reward
        if index == index_optimal:
            count_optimal_history_3[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_3 = np.mean(reward_history_3, axis = 0)

percent_optimal_3 = np.mean(count_optimal_history_3, axis = 0)
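
# The eps-greedy choice used in the two runs above, written as a standalone
# function for reference (a sketch; the loops above inline the same logic):
def eps_greedy_select(Q, eps):
    if np.random.uniform() < eps:
        return np.random.randint(len(Q))   # explore: random arm
    return np.argmax(Q)                    # exploit: greedy arm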

#################

# Figure 2.2(a)

ymin = np.min([ave_reward_1, ave_reward_2, ave_reward_3])


ymax = np.max([ave_reward_1, ave_reward_2, ave_reward_3])

ymin
ymax

plt.figure("Figure 2.2(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), ave_reward_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), ave_reward_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()

#################

# Figure 2.2(b)

ymin = np.min([percent_optimal_1, percent_optimal_2, percent_optimal_3])


ymax = np.max([percent_optimal_1, percent_optimal_2, percent_optimal_3])

ymin
ymax

plt.figure("Figure 2.2(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), percent_optimal_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), percent_optimal_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()

#################

## Section 2.6: Optimistic Initial Values

# different initial values

# greedy action (eps = 0) with Q1 = 5

Q1 = 5.0 # must be a float: rep() below takes its dtype from Q1, and an integer Q would truncate the updates


alpha = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    #Q = np.repeat(Q1, k_bandit) # equivalent alternative to rep()
    Q = rep([Q1], k_bandit)
    #print("data type of Q: ", Q.dtype)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])

ave_reward_1 = np.mean(reward_history_1, axis = 0)

percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0)
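
# For reference: the constant step-size update used in this section,
# Q <- Q + alpha * (R - Q), is an exponential recency-weighted average, so the
# optimistic initial value Q1 is gradually forgotten. A minimal sketch of the
# update as a standalone function (the loops above inline the same logic):
def constant_step_update(Q, index, reward, alpha):
    Q[index] = Q[index] + alpha * (reward - Q[index])
    return Q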

#################

# eps-greedy action: eps = 0.1 with Q1 = 0

Q1 = 0.0 # must be a float: rep() below takes its dtype from Q1


eps = 0.1
alpha = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
q_star = np.zeros(k_bandit) # expected reward
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    #Q = np.repeat(Q1, k_bandit) # equivalent alternative to rep()
    Q = rep([Q1], k_bandit)
    #print("data type of Q: ", Q.dtype)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])

ave_reward_2 = np.mean(reward_history_2, axis = 0)

percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0)

#################

# Figure 2.3(a)

ymin = np.min([ave_reward_1, ave_reward_2])


ymax = np.max([ave_reward_1, ave_reward_2])

ymin
ymax

plt.figure("Figure 2.3(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), ave_reward_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()

#################

# Figure 2.3(b)

ymin = np.min([percent_optimal_1, percent_optimal_2])


ymax = np.max([percent_optimal_1, percent_optimal_2])

ymin
ymax

plt.figure("Figure 2.3(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), percent_optimal_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()

#################

## Section 2.7: Upper-Confidence-Bound Action Selection

# eps-greedy action selection versus upper-confidence-bound action selection

# eps-greedy action: eps = 0.1

eps = 0.1

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_eps = np.zeros((run_num, step_num))
count_optimal_history_eps = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1, size = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit, size = 1)
            else:
                index = np.argmax(Q)
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_eps[i, j] = reward
        if index == index_optimal:
            count_optimal_history_eps[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_eps = np.mean(reward_history_eps, axis = 0)


percent_optimal_eps = np.mean(count_optimal_history_eps, axis = 0)

#################

# UCB action selection

c = 2
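
# The selection rule implemented in the loop below is the UCB rule of
# Section 2.7: pull any arm that has not been tried yet, otherwise choose
# argmax_a [ Q(a) + c * sqrt( ln(t) / N(a) ) ]. A standalone sketch of that
# rule for reference (the loop below inlines the same logic):
def ucb_select(Q, K, t, c):
    untried = which(K == 0)
    if len(untried) > 0:
        return np.random.choice(untried)               # try each arm once first
    return np.argmax(Q + c * np.sqrt(np.log(t) / K))   # upper confidence bound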

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_ucb = np.zeros((run_num, step_num))
count_optimal_history_ucb = np.zeros((run_num, step_num))

seed = 1
np.random.seed(seed)

for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star) # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
        else:
            index_tmp = which(K_accum == 0)
            if len(index_tmp) > 0:
                index = np.random.choice(index_tmp, size = 1) # try the untried arms first
            else:
                index = np.argmax(Q + c * np.sqrt(np.log(j) / K_accum))
        reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
        reward_history_ucb[i, j] = reward
        if index == index_optimal:
            count_optimal_history_ucb[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]

ave_reward_ucb = np.mean(reward_history_ucb, axis = 0)

percent_optimal_ucb = np.mean(count_optimal_history_ucb, axis = 0)

#################

# Figure 2.4(a)

ymin = np.min([ave_reward_eps, ave_reward_ucb])


ymax = np.max([ave_reward_eps, ave_reward_ucb])

ymin
ymax

plt.figure("Figure 2.4(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), ave_reward_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()

#################

# Figure 2.4(b)

ymin = np.min([percent_optimal_eps, percent_optimal_ucb])


ymax = np.max([percent_optimal_eps, percent_optimal_ucb])

ymin
ymax

plt.figure("Figure 2.4(a)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), percent_optimal_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()

#################

## Section 2.8: Gradient Bandit Algorithms

# gradient bandit algorithms

k_bandit = 10
run_num = 2000
step_num = 1000
reward_history = np.zeros((4, run_num, step_num))
count_optimal_history = np.zeros((4, run_num, step_num))

seed = 1
np.random.seed(seed)

for setting in range(4):
    print("setting = ", setting)
    if setting == 0:
        alpha = 0.1; R_bar = True
    if setting == 1:
        alpha = 0.4; R_bar = True
    if setting == 2:
        alpha = 0.1; R_bar = False
    if setting == 3:
        alpha = 0.4; R_bar = False
    for i in range(run_num):
        q_star = np.random.normal(loc = 4, scale = 1.0, size = k_bandit)
        index_optimal = np.argmax(q_star) # optimal action
        reward_accum = 0
        H = np.zeros(k_bandit)
        for j in range(step_num):
            sum_all = np.sum(np.exp(H))
            prob = np.exp(H) / sum_all # softmax action probabilities
            if j == 0:
                index = np.random.randint(low = 0, high = k_bandit, size = 1) # index of the initial action
            else:
                #index = np.argmax(prob) # first version: greedy action
                index = np.random.choice(k_bandit, size = 1, p = prob) # second version: sample the action from the softmax probabilities
            reward = np.random.normal(loc = q_star[index], scale = 1.0, size = 1)
            reward_history[setting, i, j] = reward
            if index == index_optimal:
                count_optimal_history[setting, i, j] = 1
            reward_accum = reward_accum + reward
            reward_ave = reward_accum / (j+1) # running average reward, used as the baseline
            for k in range(k_bandit):
                if k == index:
                    if R_bar == True:
                        H[index] = H[index] + alpha * (reward - reward_ave) * (1 - prob[index])
                    else:
                        H[index] = H[index] + alpha * (reward) * (1 - prob[index])
                else:
                    if R_bar == True:
                        H[k] = H[k] - alpha * (reward - reward_ave) * prob[k]
                    else:
                        H[k] = H[k] - alpha * (reward) * prob[k]

ave_reward = np.mean(reward_history, axis = 1)

percent_optimal = np.mean(count_optimal_history, axis = 1)
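
# For reference: the per-arm preference update in the inner k-loop above can be
# written as a single vectorized step. A minimal sketch, equivalent to the loop
# (here `index` is the chosen arm and `baseline` is 0 when R_bar is False):
def gradient_update(H, prob, index, reward, baseline, alpha):
    one_hot = np.zeros(len(H))
    one_hot[index] = 1.0
    # H_a <- H_a + alpha * (R - baseline) * (1{a = A} - pi(a)) for every arm a
    return H + alpha * (reward - baseline) * (one_hot - prob)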

#################

# Figure 2.5(a)

ymin = np.min(ave_reward)
ymax = np.max(ave_reward)

ymin
ymax

plt.figure("Figure 2.5(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), ave_reward[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()
#################

# Figure 2.5(b)

ymin = np.min(percent_optimal)
ymax = np.max(percent_optimal)

ymin
ymax

plt.figure("Figure 2.5(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), percent_optimal[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()

#################

# Figure 2.6: not reproduced here; generating it is too time-consuming

#################
