Py Code Chapter 02
#################
import numpy as np
import matplotlib.pyplot as plt
# greedy action
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    K_accum = np.zeros(k_bandit)   # pull counts per arm
    R_accum = np.zeros(k_bandit)   # accumulated reward per arm
    Q = np.zeros(k_bandit)         # sample-average value estimates
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            index = np.argmax(Q)  # greedy action
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]  # incremental sample average
#################
eps = 0.01
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit)  # explore
            else:
                index = np.argmax(Q)  # exploit
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]
#################
eps = 0.1
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_3 = np.zeros((run_num, step_num))
count_optimal_history_3 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit)  # explore
            else:
                index = np.argmax(Q)  # exploit
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_3[i, j] = reward
        if index == index_optimal:
            count_optimal_history_3[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]
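#################
# Per-step averages over the 2000 runs, used by Figure 2.2 below. These
# assignments are assumed: the listing references ave_reward_* and
# percent_optimal_* without defining them.
ave_reward_1 = np.mean(reward_history_1, axis = 0)
ave_reward_2 = np.mean(reward_history_2, axis = 0)
ave_reward_3 = np.mean(reward_history_3, axis = 0)
percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0) * 100
percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0) * 100
percent_optimal_3 = np.mean(count_optimal_history_3, axis = 0) * 100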
#################
# Figure 2.2(a)
ymin = np.min([ave_reward_1, ave_reward_2, ave_reward_3])  # y-axis range from the data
ymax = np.max([ave_reward_1, ave_reward_2, ave_reward_3])
plt.figure("Figure 2.2(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), ave_reward_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), ave_reward_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()
#################
# Figure 2.2(b)
ymin = np.min([percent_optimal_1, percent_optimal_2, percent_optimal_3])  # y-axis range from the data
ymax = np.max([percent_optimal_1, percent_optimal_2, percent_optimal_3])
plt.figure("Figure 2.2(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0 (greedy)")
plt.plot(range(step_num), percent_optimal_2, 'g--', linewidth = 1, label = "eps = 0.01")
plt.plot(range(step_num), percent_optimal_3, 'b-.', linewidth = 1, label = "eps = 0.1")
plt.legend()
plt.show()
#################
Q1 = 5       # optimistic initial value (from the Figure 2.3 plot labels)
alpha = 0.1  # constant step size; assumed, as it is not defined in this listing
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_1 = np.zeros((run_num, step_num))
count_optimal_history_1 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    Q = np.full(k_bandit, Q1, dtype = float)  # optimistic initial estimates
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            index = np.argmax(Q)  # greedy action
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_1[i, j] = reward
        if index == index_optimal:
            count_optimal_history_1[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])  # constant step-size update
#################
Q1 = 0       # realistic initial value (from the Figure 2.3 plot labels)
eps = 0.1    # exploration rate (from the Figure 2.3 plot labels)
alpha = 0.1  # constant step size; assumed, as it is not defined in this listing
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_2 = np.zeros((run_num, step_num))
count_optimal_history_2 = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    Q = np.full(k_bandit, Q1, dtype = float)  # initial action-value estimates
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit)  # explore
            else:
                index = np.argmax(Q)  # exploit
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_2[i, j] = reward
        if index == index_optimal:
            count_optimal_history_2[i, j] = 1
        Q[index] = Q[index] + alpha * (reward - Q[index])  # constant step-size update
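#################
# Per-step averages over runs for Figure 2.3; assumed, as the averaging
# step is missing from this listing.
ave_reward_1 = np.mean(reward_history_1, axis = 0)
ave_reward_2 = np.mean(reward_history_2, axis = 0)
percent_optimal_1 = np.mean(count_optimal_history_1, axis = 0) * 100
percent_optimal_2 = np.mean(count_optimal_history_2, axis = 0) * 100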
#################
# Figure 2.3(a)
ymin = np.min([ave_reward_1, ave_reward_2])  # y-axis range from the data
ymax = np.max([ave_reward_1, ave_reward_2])
plt.figure("Figure 2.3(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), ave_reward_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()
#################
# Figure 2.3(b)
ymin = np.min([percent_optimal_1, percent_optimal_2])  # y-axis range from the data
ymax = np.max([percent_optimal_1, percent_optimal_2])
plt.figure("Figure 2.3(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_1, 'r-', linewidth = 1, label = "eps = 0, Q1 = 5")
plt.plot(range(step_num), percent_optimal_2, 'b--', linewidth = 1, label = "eps = 0.1, Q1 = 0")
plt.legend()
plt.show()
#################
eps = 0.1
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_eps = np.zeros((run_num, step_num))
count_optimal_history_eps = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            rand_num = np.random.uniform(low = 0, high = 1)
            if rand_num < eps:
                index = np.random.randint(low = 0, high = k_bandit)  # explore
            else:
                index = np.argmax(Q)  # exploit
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_eps[i, j] = reward
        if index == index_optimal:
            count_optimal_history_eps[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]
#################
c = 2
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history_ucb = np.zeros((run_num, step_num))
count_optimal_history_ucb = np.zeros((run_num, step_num))
seed = 1
np.random.seed(seed)
for i in range(run_num):
    q_star = np.random.normal(loc = 0, scale = 1.0, size = k_bandit)
    index_optimal = np.argmax(q_star)  # optimal action
    K_accum = np.zeros(k_bandit)
    R_accum = np.zeros(k_bandit)
    Q = np.zeros(k_bandit)
    for j in range(step_num):
        if j == 0:
            index = np.random.randint(low = 0, high = k_bandit)  # index of the initial action
        else:
            index_tmp = np.where(K_accum == 0)[0]  # arms not yet tried
            if len(index_tmp) > 0:
                index = np.random.choice(index_tmp)
            else:
                index = np.argmax(Q + c * np.sqrt(np.log(j) / K_accum))  # UCB rule
        reward = np.random.normal(loc = q_star[index], scale = 1.0)
        reward_history_ucb[i, j] = reward
        if index == index_optimal:
            count_optimal_history_ucb[i, j] = 1
        K_accum[index] = K_accum[index] + 1
        R_accum[index] = R_accum[index] + reward
        Q[index] = R_accum[index] / K_accum[index]
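#################
# Per-step averages over runs for Figure 2.4; assumed, as the averaging
# step is missing from this listing.
ave_reward_eps = np.mean(reward_history_eps, axis = 0)
ave_reward_ucb = np.mean(reward_history_ucb, axis = 0)
percent_optimal_eps = np.mean(count_optimal_history_eps, axis = 0) * 100
percent_optimal_ucb = np.mean(count_optimal_history_ucb, axis = 0) * 100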
#################
# Figure 2.4(a)
ymin = np.min([ave_reward_eps, ave_reward_ucb])  # y-axis range from the data
ymax = np.max([ave_reward_eps, ave_reward_ucb])
plt.figure("Figure 2.4(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), ave_reward_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()
#################
# Figure 2.4(b)
ymin = np.min([percent_optimal_eps, percent_optimal_ucb])  # y-axis range from the data
ymax = np.max([percent_optimal_eps, percent_optimal_ucb])
plt.figure("Figure 2.4(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal_eps, 'r-', linewidth = 1, label = "eps = 0.1")
plt.plot(range(step_num), percent_optimal_ucb, 'b--', linewidth = 1, label = "ucb: c = 2")
plt.legend()
plt.show()
#################
k_bandit = 10
run_num = 2000
step_num = 1000
reward_history = np.zeros((4, run_num, step_num))
count_optimal_history = np.zeros((4, run_num, step_num))
seed = 1
np.random.seed(seed)
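# The simulation loop for Figure 2.5 is missing from this listing. Below is a
# minimal gradient-bandit sketch (softmax action preferences with an average-
# reward baseline), assuming q_star ~ N(4, 1) and the four (alpha, baseline)
# settings named in the Figure 2.5 plot labels. The names alpha_list,
# use_baseline, H, pi, and reward_mean are introduced here for illustration.
alpha_list = [0.1, 0.4, 0.1, 0.4]
use_baseline = [True, True, False, False]
for m in range(4):
    alpha = alpha_list[m]
    for i in range(run_num):
        q_star = np.random.normal(loc = 4.0, scale = 1.0, size = k_bandit)
        index_optimal = np.argmax(q_star)  # optimal action
        H = np.zeros(k_bandit)   # action preferences
        reward_mean = 0.0        # running average reward (the baseline)
        for j in range(step_num):
            pi = np.exp(H) / np.sum(np.exp(H))  # softmax action probabilities
            index = np.random.choice(k_bandit, p = pi)
            reward = np.random.normal(loc = q_star[index], scale = 1.0)
            reward_history[m, i, j] = reward
            if index == index_optimal:
                count_optimal_history[m, i, j] = 1
            baseline = reward_mean if use_baseline[m] else 0.0
            # preference update: chosen action moves up, the others move down
            H = H - alpha * (reward - baseline) * pi
            H[index] = H[index] + alpha * (reward - baseline)
            reward_mean = reward_mean + (reward - reward_mean) / (j + 1)
ave_reward = np.mean(reward_history, axis = 1)                   # shape (4, step_num)
percent_optimal = np.mean(count_optimal_history, axis = 1) * 100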
#################
# Figure 2.5(a)
ymin = np.min(ave_reward)
ymax = np.max(ave_reward)
plt.figure("Figure 2.5(a)")
plt.xlabel('step')
plt.ylabel('average reward')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), ave_reward[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), ave_reward[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), ave_reward[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()
#################
# Figure 2.5(b)
ymin = np.min(percent_optimal)
ymax = np.max(percent_optimal)
plt.figure("Figure 2.5(b)")
plt.xlabel('step')
plt.ylabel('% optimal action')
plt.ylim(ymin, ymax)
plt.plot(range(step_num), percent_optimal[0,:], 'r-', linewidth = 1, label = "with baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[1,:], 'g--', linewidth = 1, label = "with baseline, alpha = 0.4")
plt.plot(range(step_num), percent_optimal[2,:], 'b:', linewidth = 1, label = "without baseline, alpha = 0.1")
plt.plot(range(step_num), percent_optimal[3,:], 'm-.', linewidth = 1, label = "without baseline, alpha = 0.4")
plt.legend()
plt.show()
#################