Skip to content

A2C

A2C

Advantage Actor-Critic agents.

A2CConfig dataclass

A2CConfig(lr: float = 0.001, lr_final: float = 0.0, decay_steps: int = 600000, grad_clip: float | None = 0.5, device: str = 'cuda', entropy_coeff: float = 0.01, value_coeff: float = 0.5)

Bases: ModelConfig

Hyperparameters for advantage actor-critic agents (A2C / A3C / UNREAL).

Attributes:

Name Type Description
entropy_coeff float

Coefficient on the policy entropy bonus.

value_coeff float

Weight on the value function loss term.

A2CModel

A2CModel(action_size: int, config: A2CConfig)

Bases: Agent

A2C-family base class: defines the actor-critic + entropy loss.

Concrete subclasses (feed-forward, recurrent, ...) only need to implement `forward`, `evaluate` and `backprop`; they all share the same loss function via `loss`.

Source code in rlib/A2C/model.py
50
51
52
53
54
def __init__(self, action_size: int, config: A2CConfig) -> None:
    """Store the action-space size and the loss coefficients from ``config``.

    Args:
        action_size: Size of the action space; kept as ``self.action_size``.
        config: Hyperparameters. Forwarded to the base-class constructor;
            ``entropy_coeff`` and ``value_coeff`` are also copied onto the
            instance.
    """
    super().__init__(config=config)
    self.action_size = action_size
    # Copied onto the instance; ``loss`` reads these two attributes directly.
    self.entropy_coeff = config.entropy_coeff
    self.value_coeff = config.value_coeff

loss

loss(policy: Tensor, R: Tensor, V: Tensor, actions_onehot: Tensor) -> torch.Tensor

Standard A2C/A3C actor–critic loss with entropy bonus.

Combines:

  • a half-MSE value loss on R - V,
  • the negative log-likelihood policy gradient using a detached advantage, and
  • an entropy bonus on the action distribution.

`policy` is expected to be a normalised distribution (i.e. the output of a softmax); it is numerically clipped before taking the `log` so that the loss stays finite for near-deterministic policies.

Source code in rlib/A2C/model.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def loss(
    self,
    policy: torch.Tensor,
    R: torch.Tensor,
    V: torch.Tensor,
    actions_onehot: torch.Tensor,
) -> torch.Tensor:
    """Return the combined actor-critic objective with an entropy bonus.

    The result is the sum of three terms:

    * the policy-gradient term: the mean of ``-log pi(a) * (R - V)``, where
      the advantage ``R - V`` is *detached* from the graph,
    * ``self.value_coeff`` times whatever ``self.value_loss(R, V)`` returns,
    * minus ``self.entropy_coeff`` times the mean entropy of ``policy``.

    Args:
        policy: Action probabilities (expected to be softmax output). Values
            are clipped to ``[1e-6, 0.999999]`` before the logarithm so the
            result stays finite when a probability is exactly 0 or 1.
        R: Return targets.
        V: Value estimates.
        actions_onehot: One-hot mask selecting the taken action along dim 1.

    Returns:
        The total loss tensor.
    """
    # Detached so the policy term contributes no gradient through R or V.
    detached_adv = (R - V).detach()
    critic_term = self.value_loss(R, V)

    safe_log_probs = torch.clip(policy, 1e-6, 0.999999).log()
    chosen_log_probs = (safe_log_probs * actions_onehot).sum(dim=1)
    actor_term = (-chosen_log_probs * detached_adv).mean()

    # Entropy uses the unclipped probabilities with the clipped log-probs.
    per_sample_entropy = (policy * -safe_log_probs).sum(dim=1)
    entropy_bonus = per_sample_entropy.mean()

    return actor_term + self.value_coeff * critic_term - self.entropy_coeff * entropy_bonus

ActorCritic

ActorCritic(model, input_size, action_size, config: A2CConfig, *, build_optimiser: bool = True, optim: type[Optimizer] = torch.optim.RMSprop, optim_args: dict | None = None, **model_args)

Bases: A2CModel

Feed-forward A2C actor-critic.

Source code in rlib/A2C/model.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def __init__(
    self,
    model,
    input_size,
    action_size: int,
    config: A2CConfig,
    *,
    build_optimiser: bool = True,
    optim: type[torch.optim.Optimizer] = torch.optim.RMSprop,
    optim_args: dict | None = None,
    **model_args,
) -> None:
    """Build the feature body plus linear actor and critic heads.

    Args:
        model: Callable invoked as ``model(input_size, **model_args)``; the
            object it returns must expose ``dense_size`` and ``.to(device)``.
        input_size: First positional argument handed to ``model``.
        action_size: Output width of the policy head.
        config: Hyperparameters forwarded to the parent constructor.
        build_optimiser: When true, ``self._build_optimiser`` is called once
            all sub-modules exist.
        optim: Optimiser class passed to ``_build_optimiser``.
        optim_args: Optional optimiser arguments passed to
            ``_build_optimiser``.
        **model_args: Extra keyword arguments passed through to ``model``.
    """
    super().__init__(action_size=action_size, config=config)

    # NOTE(review): ``self.device`` is not assigned in this block; presumably
    # the base class sets it from the config — confirm.
    self.model = model(input_size, **model_args).to(self.device)
    # Both heads consume the body's output, whose width is ``dense_size``.
    self.dense_size = self.model.dense_size
    self.policy_distrib = torch.nn.Linear(self.dense_size, action_size).to(self.device)  # Actor
    self.V = torch.nn.Linear(self.dense_size, 1).to(self.device)  # Critic

    # Called last, after the body and both heads have been created.
    if build_optimiser:
        self._build_optimiser(optim=optim, optim_args=optim_args)

ActorCritic_LSTM

ActorCritic_LSTM(model, input_size, action_size, cell_size, config: A2CConfig, *, build_optimiser: bool = True, optim: type[Optimizer] = torch.optim.RMSprop, optim_args: dict | None = None, **model_args)

Bases: A2CModel

Recurrent A2C actor-critic (masked LSTM body).

Source code in rlib/A2C/model.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def __init__(
    self,
    model,
    input_size,
    action_size: int,
    cell_size: int,
    config: A2CConfig,
    *,
    build_optimiser: bool = True,
    optim: type[torch.optim.Optimizer] = torch.optim.RMSprop,
    optim_args: dict | None = None,
    **model_args,
) -> None:
    """Build the feature body, the masked LSTM block and the two linear heads.

    Args:
        model: Callable invoked as ``model(input_size, **model_args)``; the
            object it returns must expose ``dense_size`` and ``.to(device)``.
        input_size: First positional argument handed to ``model``; also kept
            as ``self.input_size``.
        action_size: Output width of the policy head.
        cell_size: Second argument to ``MaskedLSTMBlock`` and input width of
            both heads; also kept as ``self.cell_size``.
        config: Hyperparameters forwarded to the parent constructor.
        build_optimiser: When true, ``self._build_optimiser`` is called once
            all sub-modules exist.
        optim: Optimiser class passed to ``_build_optimiser``.
        optim_args: Optional optimiser arguments passed to
            ``_build_optimiser``.
        **model_args: Extra keyword arguments passed through to ``model``.
    """
    super().__init__(action_size=action_size, config=config)
    self.input_size = input_size
    self.cell_size = cell_size

    # NOTE(review): ``self.device`` is not assigned in this block; presumably
    # the base class sets it from the config — confirm.
    self.model = model(input_size, **model_args).to(self.device)
    self.dense_size = self.model.dense_size
    # self.lstm = MaskedRNN(MaskedLSTMCell(cell_size, self.dense_size), time_major=True)
    self.lstm = MaskedLSTMBlock(self.dense_size, cell_size, time_major=True).to(self.device)

    # Heads are sized by ``cell_size`` here, not by ``dense_size``.
    self.policy_distrib = torch.nn.Linear(cell_size, action_size, device=self.device)  # Actor
    self.V = torch.nn.Linear(cell_size, 1, device=self.device)  # Critic

    # Called last, after the body, LSTM block and both heads have been created.
    if build_optimiser:
        self._build_optimiser(optim=optim, optim_args=optim_args)

A2CLSTMTrainer

A2CLSTMTrainer(envs, agent: ActorCritic_LSTM, val_envs, config: TrainerConfig)

Bases: SyncMultiEnvTrainer

Recurrent A2C trainer (LSTM hidden state propagated across rollouts).

Source code in rlib/A2C/trainer.py
85
86
87
88
89
90
91
92
93
def __init__(
    self,
    envs,
    agent: ActorCritic_LSTM,
    val_envs,
    config: TrainerConfig,
) -> None:
    """Initialise the base trainer, then fetch the agent's initial hidden state.

    Args:
        envs: Passed positionally to the base-class constructor.
        agent: Recurrent actor-critic agent; must provide
            ``get_initial_hidden``.
        val_envs: Passed positionally to the base-class constructor.
        config: Trainer configuration, forwarded as ``config=``.
    """
    super().__init__(envs, agent, val_envs, config=config)
    # ``self.agent`` and ``self.num_envs`` are not assigned in this block;
    # presumably the base constructor sets them — confirm.
    self.prev_hidden = self.agent.get_initial_hidden(self.num_envs)

A2CTrainer

A2CTrainer(envs, agent: ActorCritic, val_envs, config: TrainerConfig)

Bases: SyncMultiEnvTrainer

Synchronous Advantage Actor-Critic trainer (feed-forward).

Source code in rlib/A2C/trainer.py
17
18
19
20
21
22
23
24
def __init__(
    self,
    envs,
    agent: ActorCritic,
    val_envs,
    config: TrainerConfig,
) -> None:
    """Initialise the trainer by delegating entirely to the base class.

    Args:
        envs: Passed positionally to the base-class constructor.
        agent: Feed-forward actor-critic agent.
        val_envs: Passed positionally to the base-class constructor.
        config: Trainer configuration, forwarded as ``config=``.
    """
    # No trainer-specific state is set here; everything goes to the base class.
    super().__init__(envs, agent, val_envs, config=config)