Skip to content

RND

RND

Random Network Distillation.

PPOIntrinsic

PPOIntrinsic(model, input_size, action_size, config: PPOConfig, *, extr_coeff: float = 2.0, intr_coeff: float = 1.0, build_optimiser: bool = True, optim: type[Optimizer] = torch.optim.Adam, optim_args: dict | None = None, **model_args)

Bases: PPOModel

Twin-critic PPO with extrinsic + intrinsic value heads.

Source code in rlib/RND/model.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    model,
    input_size,
    action_size,
    config: PPOConfig,
    *,
    extr_coeff: float = 2.0,
    intr_coeff: float = 1.0,
    build_optimiser: bool = True,
    optim: type[torch.optim.Optimizer] = torch.optim.Adam,
    optim_args: dict | None = None,
    **model_args,
):
    """Assemble the shared network, the policy head and both value heads.

    Args:
        model: Callable invoked as ``model(input_size, **model_args)``; the
            object it returns must expose a ``dense_size`` attribute, which
            sets the input width of every head built here.
        input_size: Forwarded to ``model`` and stored on the instance.
        action_size: Output width of the policy head; also forwarded to the
            parent constructor.
        config: Forwarded to the parent constructor.
        extr_coeff: Stored as ``self.extr_coeff``.
        intr_coeff: Stored as ``self.intr_coeff``.
        build_optimiser: When true, ``self._build_optimiser`` is called as
            the last construction step.
        optim: Optimiser class handed to ``self._build_optimiser``.
        optim_args: Optional arguments handed to ``self._build_optimiser``.
        **model_args: Extra keyword arguments forwarded to ``model``.
    """
    super().__init__(action_size=action_size, config=config)
    self.input_size = input_size
    self.extr_coeff, self.intr_coeff = extr_coeff, intr_coeff

    # Shared network. ``self.device`` is not assigned in this method --
    # presumably the parent constructor provides it (TODO confirm).
    backbone = model(input_size, **model_args)
    self.model = backbone.to(self.device)
    feature_width = self.model.dense_size
    self.dense_size = feature_width

    # Actor: linear layer followed by a softmax over the last dimension.
    actor_layers = (
        torch.nn.Linear(feature_width, action_size),
        torch.nn.Softmax(dim=-1),
    )
    self.policy = torch.nn.Sequential(*actor_layers).to(self.device)

    # Critics: one scalar head for the extrinsic value and one for the
    # intrinsic value. Heads are created in the same order as before so
    # that seeded parameter initialisation is unaffected.
    extrinsic_head = torch.nn.Linear(feature_width, 1)
    self.Ve = extrinsic_head.to(self.device)
    intrinsic_head = torch.nn.Linear(feature_width, 1)
    self.Vi = intrinsic_head.to(self.device)

    if not build_optimiser:
        return
    self._build_optimiser(optim=optim, optim_args=optim_args)

RNDTrainer

RNDTrainer(envs, agent: RND, val_envs, config: RNDTrainerConfig)

Bases: SyncMultiEnvTrainer

Trainer for the Random Network Distillation agent.

Source code in rlib/RND/trainer.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def __init__(
    self,
    envs,
    agent: RND,
    val_envs,
    config: RNDTrainerConfig,
):
    """Set up the trainer from ``config`` and create its running statistics.

    Args:
        envs: Forwarded unchanged to the parent constructor.
        agent: Forwarded unchanged to the parent constructor.
        val_envs: Forwarded unchanged to the parent constructor.
        config: Supplies ``gamma_intr``, ``num_epochs``, ``num_minibatches``
            and ``init_obs_steps``; also forwarded to the parent constructor.
    """
    super().__init__(envs, agent, val_envs, config=config)

    # Hyperparameters copied from the config onto the trainer.
    intr_discount = config.gamma_intr
    self.gamma_intr = intr_discount
    self.num_epochs = config.num_epochs
    self.num_minibatches = config.num_minibatches
    self.init_obs_steps = config.init_obs_steps

    # Reciprocal of (num_envs / 32): exactly 1.0 with 32 environments and
    # smaller as the environment count grows. ``self.num_envs`` is not
    # assigned in this method -- presumably set by the parent constructor
    # (TODO confirm).
    envs_per_32 = self.num_envs / 32.0
    self.pred_prob = 1 / envs_per_32

    # Running statistics / filter helpers; the reward filter is given the
    # intrinsic discount.
    self.state_obs = RunningMeanStd()
    self.forward_filter = RewardForwardFilter(intr_discount)
    self.intr_rolling = RunningMeanStd()

RNDTrainerConfig dataclass

RNDTrainerConfig(train_mode: TrainMode = TrainMode.NSTEP, returns: Returns = Returns.NSTEP, total_steps: int = 50000000, nsteps: int = 5, gamma: float = 0.99, lambda_: float = 0.95, validate_freq: int = 1000000, num_val_episodes: int = 50, max_val_steps: int = 10000, log_dir: str = 'logs/', model_dir: str = 'models/', save_freq: int = 0, log_scalars: bool = True, update_target_freq: int = 0, render_freq: int = 0, gamma_intr: float = 0.99, init_obs_steps: int = 600, num_epochs: int = 4, num_minibatches: int = 4)

Bases: TrainerConfig

Hyperparameters for `RNDTrainer`.

`gamma` is reused as the extrinsic discount; the intrinsic discount is the new `gamma_intr` field.