module emote.algorithms.sac

Functions

def soft_update_from_to(source, target, tau) -> None
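
This is the Polyak (exponential moving-average) update used to keep target networks tracking their online counterparts. A minimal sketch of an equivalent implementation in PyTorch; the parameter order follows the signature above, and the body is an assumption based on the standard SAC recipe rather than the exact library code:

import torch.nn as nn

def soft_update_from_to(source: nn.Module, target: nn.Module, tau: float) -> None:
    # Polyak update: target <- (1 - tau) * target + tau * source, parameter by parameter.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + source_param.data * tau
        )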

Classes

class QLoss(LossCallback):

An MSE loss between the action-value net and the target Q-values. The target Q-values are not calculated here; they must be added to the state before this module's loss runs.

Methods

def __init__(
    self,
    *,
    name,
    q,
    opt,
    lr_schedule,
    max_grad_norm,
    data_group,
    log_per_param_weights,
    log_per_param_grads
) -> None

Arguments:

  • name(str): The name of the module. Used e.g. while logging.
  • q(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • opt(optim.Optimizer): An optimizer for q.
  • lr_schedule(Optional[optim.lr_scheduler._LRScheduler]): Learning rate schedule for the optimizer of q.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • log_per_param_weights(bool): If true, log each individual policy parameter that is optimized (norm and value histogram).
  • log_per_param_grads(bool): If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation, actions, q_target) -> None
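
A minimal construction sketch for QLoss. The QNet class, its layer sizes, and the learning rate are illustrative assumptions, and the remaining keyword arguments are assumed to have defaults; the import path follows the module shown on this page.

import torch
from torch import nn, optim

from emote.algorithms.sac import QLoss


class QNet(nn.Module):
    # Hypothetical critic: concatenates observation and action, outputs a scalar Q-value.
    def __init__(self, obs_dim, act_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs, action):
        return self.net(torch.cat([obs, action], dim=-1))


q1 = QNet(obs_dim=17, act_dim=6)
q1_loss = QLoss(name="q1", q=q1, opt=optim.Adam(q1.parameters(), lr=3e-4))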

class QTarget(LoggingMixin, Callback):

Creates rolling averages of the Q nets, and predicts q values using these.

The module is responsible both for keeping the averages correct in the target q networks and supplying q-value predictions using the target q networks.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    q1,
    q2,
    q1t,
    q2t,
    gamma,
    reward_scale,
    target_q_tau,
    data_group,
    roll_length,
    use_terminal_masking
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • q1(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • q2(nn.Module): A second deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • q1t(Optional[nn.Module]): Target Q network for q1. (default: None)
  • q2t(Optional[nn.Module]): Target Q network for q2. (default: None)
  • gamma(float): Discount factor for the rewards in time. (default: 0.99)
  • reward_scale(float): Scale factor for the rewards. (default: 1.0)
  • target_q_tau(float): The weight given to the latest network in the exponential moving average, so NewTargetQ = OldTargetQ * (1 - tau) + Q * tau. (default: 0.005)
  • data_group(str): The name of the data group from which this Loss takes its data. (default: "default")
  • roll_length(int): Rollout length. (default: 1)
  • use_terminal_masking(bool): Whether to use terminal masking for the next values. (default: False)
def begin_batch(self, next_observation, rewards, masks) -> None
def end_batch(self) -> None
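
A construction sketch that wires the online critics to their target copies. Here policy, ln_alpha, and a second critic q2 are assumed to be defined elsewhere (q1 as in the QLoss sketch above); the hyperparameter values are the documented defaults.

import copy

from emote.algorithms.sac import QTarget

# Target networks start as deep copies of the online critics; QTarget then keeps
# them as exponential moving averages and uses them to predict target Q-values.
q1t = copy.deepcopy(q1)
q2t = copy.deepcopy(q2)

q_target = QTarget(
    pi=policy,
    ln_alpha=ln_alpha,
    q1=q1,
    q2=q2,
    q1t=q1t,
    q2t=q2t,
    gamma=0.99,
    reward_scale=1.0,
    target_q_tau=0.005,
)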

class PolicyLoss(LossCallback):

Maximizes the soft Q-value for the policy. This loss updates the policy so that it selects the actions with the highest soft Q-value.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    q,
    opt,
    lr_schedule,
    q2,
    max_grad_norm,
    name,
    data_group,
    log_per_param_weights,
    log_per_param_grads
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • q(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • opt(optim.Optimizer): An optimizer for pi.
  • lr_schedule(Optional[optim.lr_scheduler._LRScheduler]): Learning rate schedule for the optimizer of policy.
  • q2(Optional[nn.Module]): A second deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action. This is not strictly necessary, since the policy does not have to be pessimistic, but it can be nice for symmetry with the Q-loss.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • name(str): The name of the module. Used e.g. while logging.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • log_per_param_weights(bool): If true, log each individual policy parameter that is optimized (norm and value histogram).
  • log_per_param_grads(bool): If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation) -> None
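
A minimal construction sketch, reusing policy, ln_alpha, and q1 from the earlier sketches; the optimizer and learning rate are illustrative assumptions.

from torch import optim

from emote.algorithms.sac import PolicyLoss

# policy, ln_alpha, and q1 are assumed to be defined as in the earlier sketches.
policy_loss = PolicyLoss(
    pi=policy,
    ln_alpha=ln_alpha,
    q=q1,
    opt=optim.Adam(policy.parameters(), lr=3e-4),
)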

class AlphaLoss(LossCallback):

Tweaks the entropy weight alpha so that a specific target entropy is maintained. The target entropy is scaled with the number of actions and a provided entropy scaling factor.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    opt,
    lr_schedule,
    n_actions,
    max_grad_norm,
    max_alpha,
    name,
    data_group,
    t_entropy
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • opt(optim.Optimizer): An optimizer for ln_alpha.
  • lr_schedule(optim.lr_scheduler._LRScheduler | None): Learning rate schedule for the optimizer of alpha.
  • n_actions(int): The dimension of the action space. Scales the target entropy.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • max_alpha(float)
  • name(str): The name of the module. Used e.g. while logging.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • t_entropy(float | Schedule | None): Value or schedule for the target entropy.
def loss(self, observation) -> None
def end_batch(self) -> None
def state_dict(self) -> None
def load_state_dict(
    self,
    state_dict,
    load_weights,
    load_optimizer,
    load_hparams
) -> None
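
A minimal construction sketch for AlphaLoss, reusing policy from the earlier sketches; the initial ln_alpha value, learning rate, and action dimension are illustrative assumptions.

import torch
from torch import optim

from emote.algorithms.sac import AlphaLoss

# ln_alpha must be a leaf tensor with requires_grad=True so its optimizer can tune it.
ln_alpha = torch.tensor(0.0, requires_grad=True)

alpha_loss = AlphaLoss(
    pi=policy,
    ln_alpha=ln_alpha,
    opt=optim.Adam([ln_alpha], lr=3e-4),
    n_actions=6,
)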

class AgentProxyWrapper:

Methods

def __init__(self, *, inner) -> None
def __call__(self) -> None
def input_names(self) -> None
def output_names(self) -> None
def policy(self) -> None

class FeatureAgentProxy(GenericAgentProxy):

An agent proxy for basic MLPs. This AgentProxy assumes that the observations will contain a single flat array of features.

Methods

def __init__(self, policy, device, input_key) -> None

Create a new proxy.

Arguments:

  • policy(nn.Module): The policy to execute for actions.
  • device(torch.device): The device to run on.
  • input_key(str): The name of the features. (default: "obs")
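
A short usage sketch, assuming policy is a trained policy network defined elsewhere and following the module path shown on this page.

import torch

from emote.algorithms.sac import FeatureAgentProxy

# Runs the policy on CPU and reads the flat feature array from the default "obs" key.
agent_proxy = FeatureAgentProxy(policy=policy, device=torch.device("cpu"))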

class VisionAgentProxy(FeatureAgentProxy):

This AgentProxy assumes that the observations will contain image observations under the key 'obs'.

Methods

def __init__(self, policy, device) -> None

Arguments:

  • policy(nn.Module)
  • device(torch.device)

class MultiKeyAgentProxy(GenericAgentProxy):

Handles multiple input keys. Observations are dicts that contain multiple input keys (e.g. both "features" and "images").

Methods

def __init__(self, policy, device, input_keys, spaces) -> None

Create a new proxy.

Arguments:

  • policy(nn.Module): The policy to execute for actions.
  • device(torch.device): The device to run on.
  • input_keys(tuple): The names of the input keys in the observation dict.
  • spaces(MDPSpace)
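
A short usage sketch. Here policy and spaces (an MDPSpace describing the observation layout) are assumed to be defined elsewhere, and the input keys are illustrative; they must match the keys of the observation dicts.

import torch

from emote.algorithms.sac import MultiKeyAgentProxy

agent_proxy = MultiKeyAgentProxy(
    policy=policy,
    device=torch.device("cpu"),
    input_keys=("features", "images"),
    spaces=spaces,
)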