Reinforcement Learning - Project 3

Project 3

Course code: CSCE 5210

Section: 002

Pair: 14

Team members

Name: Naga Sai Sivani Tutika, Ketha Tirumuru
Email: NagaSaiSivaniTutika@my.unt.edu, [email protected]
ID: 11703058, 11597873
import numpy as np

Task T1
# Functions to build a policy with value iteration using Bellman's equation

# Function to create the environment (reward grid) for the agent
def initialize_environment(R1, R2, DESTINATION, HAZARD):
    environment = np.zeros(GRID_SIZE)
    environment[DESTINATION] = R1   # exit reward at the destination
    environment[HAZARD] = R2        # negative reward at the hazard
    return environment

# Check whether a state is inside the grid and not an obstacle
def is_valid_state(state, GRID_SIZE):
    row, col = state
    return ((0 <= row < GRID_SIZE[0]) and (0 <= col < GRID_SIZE[1])
            and (state not in OBSTACLE))

# Reward for a state
def get_reward(state, environment):
    return environment[state[0], state[1]]

# Probability of moving with action a when intended_action was chosen
def transProb(next_state, a, intended_action, OBSTACLE, prob=0.8):
    # intended action (default = 0.8)
    if a == intended_action:
        return prob
    # obstacle -> no movement
    elif next_state in OBSTACLE:
        return 0
    # 180-degree turn -> prohibited
    # (note: this condition is never true as written, so the opposite
    # direction also receives the unintended probability below)
    elif (a[0] + intended_action[0] == 0) and (a[0] + intended_action[0] == 1):
        return 0
    # unintended direction (default = (1 - 0.8) / 2 = 0.1)
    else:
        return (1 - prob) / 2.0

def value_iteration(environment, r):

    # values for checking convergence
    epsilon = 1e-6          # convergence threshold
    delta = float('inf')

    # initialize the value function U with the reward grid
    value_function = np.copy(environment)

    # policy grid to start with (stores the index of the best action per cell)
    policy = environment.copy()

    max_iterations = 500
    k = 0

    # run until converged or until max_iterations is reached
    while (delta > epsilon) and (k < max_iterations):
        k += 1
        delta = 0

        # store new values without disturbing the previous sweep
        temp_value_function = value_function.copy()

        # iterate through each state in the grid/environment
        for row in range(GRID_SIZE[0]):
            for col in range(GRID_SIZE[1]):
                state = (row, col)

                # OBSTACLE or out of bounds
                if not is_valid_state(state, GRID_SIZE):
                    continue

                # states with constant values
                if state == DESTINATION or state == HAZARD:
                    temp_value_function[row, col] = value_function[state]
                    continue

                max_value = float('-inf')
                best_action = 0
                old_value = value_function[row, col]   # for the convergence check

                # evaluate each possible intended action
                for intended_action_index, intended_action in enumerate(ACTIONS):

                    # expected value over intended and unintended moves
                    next_value = 0
                    for action_index, action in enumerate(ACTIONS):
                        next_state = (row + action[0], col + action[1])
                        if is_valid_state(next_state, GRID_SIZE):
                            next_value += transProb(next_state, action, intended_action, OBSTACLE) \
                                * value_function[next_state[0], next_state[1]]

                    # keep the best action so far
                    if next_value > max_value:
                        max_value = next_value
                        best_action = intended_action_index

                # Bellman backup: discounted best expected next value
                # (the live-in reward r is accepted as an argument but not added here)
                temp_value_function[row, col] = DISCOUNT_FACTOR * max_value
                policy[(row, col)] = best_action   # update policy with the best action index
                delta = max(delta, abs(old_value - temp_value_function[row, col]))

        value_function = temp_value_function

    return (value_function, policy)
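For reference, the update implemented by the loop above is the Bellman optimality backup

    V_{k+1}(s) = DISCOUNT_FACTOR * max_a sum_{s'} transProb(s' | s, a) * V_k(s')

with DESTINATION and HAZARD held at their fixed rewards; the live-in reward r is passed in but is not added inside the update.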

# Visualize the policy
def visualize_policy(policy):
    arrows = ['←', '→', '↑', '↓']   # same order as ACTIONS
    for row in range(GRID_SIZE[0]):
        for col in range(GRID_SIZE[1]):
            if (row, col) == DESTINATION:
                print('D', end='\t')
            elif (row, col) == HAZARD:
                print('H', end='\t')
            # elif (row, col) == START_POSITION:
            #     print('S', end='\t')
            elif not is_valid_state((row, col), GRID_SIZE):
                print('X', end='\t')   # obstacle
            else:
                action = int(policy[row, col])
                print(arrows[action], end='\t')
        print()

# test

# Create the environment
GRID_SIZE = (5, 5)
NUM_ACTIONS = 4

# Left, Right, Up, Down in (row, col) coordinates
ACTIONS = [(0, -1), (0, 1), (-1, 0), (1, 0)]

R1 = 10     # exit reward, range [10, 50, 100]
R2 = -5     # negative reward for the hazard, range [-5, -50, -500]
r = -5      # live-in reward, -5 or -1
DISCOUNT_FACTOR = 0.9

HAZARD = (2, 2)
DESTINATION = (0, 3)
START_POSITION = (3, 1)
OBSTACLE = [(0, 0), (0, 4), (4, 0), (4, 4)]

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy:

X → → D X
→ → → ↑ ←
→ ↑ H ↑ ←
→ ↑ → ↑ ←
X ↑ ↑ ↑ X

Value function:

[[ 0.          6.74561812  8.4879587  10.          0.        ]
 [ 5.51121011  7.04764314  7.56503407  9.19004158  7.19540404]
 [ 4.85452436  5.53004519 -5.          7.35113498  6.42860151]
 [ 4.18758786  5.20927943  5.28346748  6.73034413  5.42442169]
 [ 0.          4.16946553  4.65316541  5.26463239  0.        ]]
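As a quick sanity check of the table above, the converged value of cell (0, 2) can be recomputed by hand. Under the transition model as coded, the intended move gets probability 0.8 and every other in-bounds move gets 0.1 (the 180-degree branch in transProb never fires as written):

# Bellman backup at state (0, 2) with intended action Right:
# reachable neighbours are (0, 1), (0, 3) = DESTINATION and (1, 2); Up is out of bounds
v = 0.9 * (0.8 * 10.0 + 0.1 * 6.74561812 + 0.1 * 7.56503407)
print(v)   # ~8.4879587, matching the entry above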

Implementing Tasks
# environment settings
# same environment is used for all Tasks

GRID_SIZE = (5,5)
NUM_ACTIONS = 4

# Left, Right, Up, Down in (row, col) coordinates
ACTIONS = [(0, -1), (0, 1), (-1, 0), (1, 0)]

HAZARD = (2, 2)
DESTINATION = (0, 3)
START_POSITION = (3, 1)
OBSTACLE = [(0, 0), (0, 4), (4, 0), (4, 4)]

Task - T1
R1 = 10
R2 = -5
r = -5

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy (P1): \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy (P1):

X → → D X
→ → → ↑ ←
→ ↑ H ↑ ←
→ ↑ → ↑ ←
X ↑ ↑ ↑ X

Value function:

[[ 0.          6.74561812  8.4879587  10.          0.        ]
 [ 5.51121011  7.04764314  7.56503407  9.19004158  7.19540404]
 [ 4.85452436  5.53004519 -5.          7.35113498  6.42860151]
 [ 4.18758786  5.20927943  5.28346748  6.73034413  5.42442169]
 [ 0.          4.16946553  4.65316541  5.26463239  0.        ]]
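START_POSITION is defined above but is not used by any of the printed results. As an optional illustration (a small sketch, not part of the original tasks), the greedy policy P1 can be rolled out from the start cell to confirm that it leads to the destination while avoiding the hazard and the obstacles:

def follow_policy(policy, start, max_steps=25):
    # Repeatedly take the action stored in the policy grid until a terminal
    # cell is reached or the step limit runs out.
    path = [start]
    state = start
    for _ in range(max_steps):
        if state == DESTINATION or state == HAZARD:
            break
        action = ACTIONS[int(policy[state])]
        next_state = (state[0] + action[0], state[1] + action[1])
        if not is_valid_state(next_state, GRID_SIZE):
            break   # the greedy move would leave the grid or hit an obstacle
        state = next_state
        path.append(state)
    return path

print(follow_policy(policy, START_POSITION))

With policy P1 above this should trace (3, 1) → (2, 1) → (1, 1) → (1, 2) → (1, 3) → (0, 3), i.e. straight to the destination.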

Task - T3: part 1


R1 = 50
R2 = -50
r = -5

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: (P2) \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy: (P2)

X → → D X
→ → → ↑ ←
↑ ↑ H ↑ ↑
↑ ↑ → ↑ ↑
X ↑ → ↑ X

Value function:

[[  0.          33.25206812  42.13072607  50.           0.        ]
 [ 25.31103127  32.42161528  34.86711045  45.35749967  35.42768859]
 [ 21.86076058  22.74018676 -50.          33.67741954  30.7809873 ]
 [ 17.66890273  21.4350619   21.28535829  30.55256633  24.91204174]
 [  0.          17.29159918  20.6483922   23.85620279   0.        ]]

Task - T3: part 2


R1 = 100
R2 = -500
r = -5

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: (P3) \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy: (P3)

X → → D X
→ ↑ → ↑ ←
↑ ↑ H ↑ ↑
↑ ← → → ↑
X → → ↑ X

Value function:

[[   0.           62.02522952   79.89598344  100.            0.        ]
 [  38.6474947    50.00134953   25.70791979   81.96827756   63.63604508]
 [  29.40581223   -4.98670044 -500.           21.4146747    51.32094723]
 [  22.53798954   15.17561011  -19.92293294   30.87366273   39.72971159]
 [   0.           13.22360647   16.46916947   23.7112622     0.        ]]

Task - T3: part 3

(Varying the live-in reward r. Because the update in value_iteration does not add r, each run below reproduces the corresponding result obtained with r = -5.)

R1 = 10
R2 = -5
r = -1

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: \n")
visualize_policy(policy)
print("\nValue function: \n")
print(optimal_value_function)

Policy:

X → → D X
→ → → ↑ ←
→ ↑ H ↑ ←
→ ↑ → ↑ ←
X ↑ ↑ ↑ X

Value function:

[[ 0.          6.74561812  8.4879587  10.          0.        ]
 [ 5.51121011  7.04764314  7.56503407  9.19004158  7.19540404]
 [ 4.85452436  5.53004519 -5.          7.35113498  6.42860151]
 [ 4.18758786  5.20927943  5.28346748  6.73034413  5.42442169]
 [ 0.          4.16946553  4.65316541  5.26463239  0.        ]]

R1 = 50
R2 = -50
r = -1

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy:

X → → D X
→ → → ↑ ←
↑ ↑ H ↑ ↑
↑ ↑ → ↑ ↑
X ↑ → ↑ X

Value function:

[[  0.          33.25206812  42.13072607  50.           0.        ]
 [ 25.31103127  32.42161528  34.86711045  45.35749967  35.42768859]
 [ 21.86076058  22.74018676 -50.          33.67741954  30.7809873 ]
 [ 17.66890273  21.4350619   21.28535829  30.55256633  24.91204174]
 [  0.          17.29159918  20.6483922   23.85620279   0.        ]]

R1 = 100
R2 = -500
r = -1

# Create environment
environment = initialize_environment(R1, R2, DESTINATION, HAZARD)

# Apply value iteration
optimal_value_function, policy = value_iteration(environment, r)
print("\nPolicy: \n")
visualize_policy(policy)

print("\nValue function: \n")
print(optimal_value_function)

Policy:

X → → D X
→ ↑ → ↑ ←
↑ ↑ H ↑ ↑
↑ ← → → ↑
X → → ↑ X

Value function:

[[   0.           62.02522952   79.89598344  100.            0.        ]
 [  38.6474947    50.00134953   25.70791979   81.96827756   63.63604508]
 [  29.40581223   -4.98670044 -500.           21.4146747    51.32094723]
 [  22.53798954   15.17561011  -19.92293294   30.87366273   39.72971159]
 [   0.           13.22360647   16.46916947   23.7112622     0.        ]]
