Class ActorCritic

import sys

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# GAMMA, hidden_size, learning_rate, num_steps and max_episodes are used below as
# module-level hyperparameters; see the usage sketch after this listing.


# network with two sub-networks inside: one for the actor and one for the critic
# they can be built separately or share parameters (a shared-parameter sketch
# follows this listing)
class ActorCritic(nn.Module):

    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCritic, self).__init__()

        self.num_actions = num_actions
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)

        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)

        policy_dist = F.relu(self.actor_linear1(state))
        policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)

        return value, policy_dist


def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n


def a2c(env):
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.n

    # define the actor and the critic as a single joint model
    actor_critic = ActorCritic(num_inputs, num_outputs, hidden_size)
    # define the optimizer
    ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)

    all_lengths = []
    average_lengths = []
    all_rewards = []
    entropy_term = 0

    for episode in range(max_episodes):
        log_probs = []
        values = []
        rewards = []

        state, info = env.reset()  # reset the environment and the simulation
        # iterate over the state until the pole falls or the step limit is reached
        for steps in range(num_steps):
            # call the actor-critic, which gives us the value and the policy
            value, policy_dist = actor_critic(state)
            # detach from the gradient and convert to NumPy arrays
            value = value.detach().numpy()[0, 0]
            dist = policy_dist.detach().numpy()

            # take an action based on the network's distribution, according to the
            # number of outputs (2): move left or move right
            action = np.random.choice(num_outputs, p=np.squeeze(dist))
            log_prob = torch.log(policy_dist.squeeze(0)[action])
            entropy = -np.sum(dist * np.log(dist))
            new_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy
            state = new_state

            if done or steps == num_steps - 1:
                Qval, _ = actor_critic.forward(new_state)
                Qval = Qval.detach().numpy()[0, 0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, "
                                     "average length: {} \n".format(
                                         episode, np.sum(rewards), steps,
                                         average_lengths[-1]))
                break

        # compute Q values
        Qvals = np.zeros_like(values)
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + GAMMA * Qval
            Qvals[t] = Qval

        # update actor critic
        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)

        advantage = Qvals - values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = 0.5 * advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

        ac_optimizer.zero_grad()
        ac_loss.backward()
        ac_optimizer.step()

    # Plot results
    all_rewards = np.asarray(all_rewards)
    smoothed_rewards = moving_average(all_rewards, 30)
    plt.plot(all_rewards, label="reward per episode")
    plt.plot(np.arange(len(smoothed_rewards)) + 29, smoothed_rewards,
             label="30-episode moving average")
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.legend()
    plt.show()

    return actor_critic
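The listing references GAMMA, hidden_size, learning_rate, num_steps and max_episodes without defining them, and it uses the Gymnasium reset/step API. A minimal way to run it, appended to the same script, assuming a CartPole-v1 environment and typical hyperparameter values (none of these values appear in the original listing):

import gymnasium as gym

# Assumed hyperparameter values; the original listing does not specify them.
GAMMA = 0.99          # discount factor for the Q-value backup
hidden_size = 256     # width of the hidden layer in each sub-network
learning_rate = 3e-4  # Adam step size
num_steps = 300       # maximum steps per episode
max_episodes = 3000   # number of training episodes

env = gym.make("CartPole-v1")   # any discrete-action env with a 1-D observation works
trained_actor_critic = a2c(env)
env.close()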

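As the comment above says, the actor and the critic can either be built separately or share parameters. A minimal sketch of the shared-parameter variant, with an illustrative class name (SharedActorCritic) and a single hidden trunk that is not part of the original listing:

class SharedActorCritic(nn.Module):
    # one hidden trunk feeds both heads, so the actor and the critic share parameters
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(SharedActorCritic, self).__init__()
        self.shared_linear = nn.Linear(num_inputs, hidden_size)
        self.critic_head = nn.Linear(hidden_size, 1)            # state value
        self.actor_head = nn.Linear(hidden_size, num_actions)   # action logits

    def forward(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        features = F.relu(self.shared_linear(state))
        value = self.critic_head(features)
        policy_dist = F.softmax(self.actor_head(features), dim=1)
        return value, policy_dist

Because it returns the same (value, policy_dist) pair and accepts the same constructor arguments, it could be dropped into a2c in place of ActorCritic.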