Source code for torchrl.models.base_pg_model

from abc import abstractproperty
import torch
from torchrl.distributions import Categorical, Normal
import torchrl.utils as U
from torchrl.models import BaseModel
from torchrl.nn import ActionLinear


class BasePGModel(BaseModel):
    """
    Base class for all Policy Gradient Models.
    """

    def __init__(self, model, batcher, *, entropy_coef=0, **kwargs):
        super().__init__(model=model, batcher=batcher, **kwargs)
        self.entropy_coef_fn = U.make_callable(entropy_coef)

    @abstractproperty
    def entropy(self):
        pass

    @property
    def entropy_coef(self):
        return self.entropy_coef_fn(self.num_steps)
    def entropy_loss(self, batch):
        """
        Adds an entropy cost to the loss function, with the intent of
        encouraging exploration.

        Parameters
        ----------
        batch: Batch
            The batch should contain all the information necessary
            to compute the gradients.
        """
        loss = -self.entropy * self.entropy_coef
        return loss
    def create_dist(self, parameters):
        """
        Specify how the policy distributions should be created.
        The type of the distribution depends on the environment.

        Parameters
        ----------
        parameters: np.array
            The parameters are used to create a distribution
            (continuous or discrete depending on the type of the environment).
        """
        if self.batcher.get_action_info().space == "discrete":
            logits = parameters
            return Categorical(logits=logits)

        elif self.batcher.get_action_info().space == "continuous":
            means = parameters[..., 0]
            std_devs = parameters[..., 1].exp()
            return Normal(loc=means, scale=std_devs)

        else:
            raise ValueError(
                "No distribution is defined for {} actions".format(
                    self.batcher.get_action_info().space
                )
            )
    def write_logs(self, batch):
        super().write_logs(batch)

        self.add_log("Entropy", self.entropy)
        self.add_log("Policy/log_prob", batch.log_prob)
    @staticmethod
    def output_layer(input_shape, action_info):
        return ActionLinear(in_features=input_shape, action_info=action_info)
    @staticmethod
    def select_action(model, state, step):
        """
        Define how the actions are selected; in this case the actions
        are sampled from a distribution whose parameters are given by a NN.

        Parameters
        ----------
        state: np.array
            The state of the environment (can be a batch of states).
        """
        parameters = model.forward(state)
        dist = model.create_dist(parameters)
        action = dist.sample()
        return U.to_np(action)
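
A short, self-contained usage sketch follows (not part of the module above). It mirrors the create_dist, select_action, and entropy_loss logic using torch.distributions directly, under the assumption that torchrl's Categorical and Normal are compatible wrappers around them. The parameter tensors and the entropy_coef value below are made up for illustration.

# Standalone sketch of the policy-distribution logic above, using
# torch.distributions directly (torchrl's Categorical/Normal are assumed
# to be compatible wrappers). All tensors are made-up network outputs.
import torch
from torch.distributions import Categorical, Normal

# Discrete case: the network output is interpreted as unnormalized logits.
logits = torch.tensor([[0.2, 1.5, -0.3]])         # 1 state, 3 possible actions
discrete_dist = Categorical(logits=logits)
discrete_action = discrete_dist.sample()          # e.g. tensor([1])

# Continuous case: the last dimension packs (mean, log_std) per action
# dimension, matching parameters[..., 0] and parameters[..., 1].exp()
# in create_dist.
parameters = torch.tensor([[[0.0, -1.0],          # action dim 0: mean, log_std
                            [0.5, -0.5]]])        # action dim 1: mean, log_std
means = parameters[..., 0]
std_devs = parameters[..., 1].exp()
continuous_dist = Normal(loc=means, scale=std_devs)
continuous_action = continuous_dist.sample()      # shape (1, 2)

# Entropy bonus as used by entropy_loss: a negative term, so maximizing
# entropy (more exploration) lowers the total loss. The coefficient is an
# arbitrary example value; in the model it comes from entropy_coef_fn,
# evaluated at the current step count.
entropy_coef = 0.01
entropy_loss = -continuous_dist.entropy().sum(-1).mean() * entropy_coef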