module emote.algorithms.sac

Functions

def soft_update_from_to(source, target, tau) -> None
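
This is the Polyak (exponential moving-average) update used to keep target networks tracking their online counterparts. A minimal sketch of an equivalent implementation in PyTorch; the parameter order follows the signature above, and the body is an assumption based on the standard SAC recipe rather than the exact library code:

import torch.nn as nn

def soft_update_from_to(source: nn.Module, target: nn.Module, tau: float) -> None:
    # Polyak update: target <- (1 - tau) * target + tau * source, parameter by parameter.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + source_param.data * tau
        )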

Classes

class QLoss(LossCallback):

An MSE loss between the action-value net and the target Q-values. The target Q-values are not calculated here; they must be added to the state before this module's loss runs.

Methods

def __init__(
    self,
    *,
    name,
    q,
    opt,
    lr_schedule,
    max_grad_norm,
    data_group,
    log_per_param_weights,
    log_per_param_grads
) -> None

Arguments:

  • name(str): The name of the module. Used e.g. while logging.
  • q(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • opt(optim.Optimizer): An optimizer for q.
  • lr_schedule(Optional[optim.lr_scheduler._LRScheduler]): Learning rate schedule for the optimizer of q.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • log_per_param_weights(bool): If true, log each individual policy parameter that is optimized (norm and value histogram).
  • log_per_param_grads(bool): If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation, actions, q_target) -> None
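
A minimal construction sketch for QLoss. The QNet class, its layer sizes, and the learning rate are illustrative assumptions, and the remaining keyword arguments are assumed to have defaults; the import path follows the module shown on this page.

import torch
from torch import nn, optim

from emote.algorithms.sac import QLoss


class QNet(nn.Module):
    # Hypothetical critic: concatenates observation and action, outputs a scalar Q-value.
    def __init__(self, obs_dim, act_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs, action):
        return self.net(torch.cat([obs, action], dim=-1))


q1 = QNet(obs_dim=17, act_dim=6)
q1_loss = QLoss(name="q1", q=q1, opt=optim.Adam(q1.parameters(), lr=3e-4))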

class QTarget(LoggingMixin, Callback):

Creates rolling averages of the Q nets, and predicts q values using these.

The module is responsible both for keeping the averages correct in the target q networks and supplying q-value predictions using the target q networks.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    q1,
    q2,
    q1t,
    q2t,
    gamma,
    reward_scale,
    target_q_tau,
    data_group,
    roll_length,
    use_terminal_masking
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • q1(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • q2(nn.Module): A second deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • q1t(Optional[nn.Module]): Target Q network for q1. (default: None)
  • q2t(Optional[nn.Module]): Target Q network for q2. (default: None)
  • gamma(float): Discount factor for the rewards in time. (default: 0.99)
  • reward_scale(float): Scale factor for the rewards. (default: 1.0)
  • target_q_tau(float): The weight given to the latest network in the exponential moving average, so NewTargetQ = OldTargetQ * (1 - tau) + Q * tau. (default: 0.005)
  • data_group(str): The name of the data group from which this Loss takes its data. (default: "default")
  • roll_length(int): Rollout length. (default: 1)
  • use_terminal_masking(bool): Whether to use terminal masking for the next values. (default: False)
def begin_batch(self, next_observation, rewards, masks) -> None
def end_batch(self) -> None
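
A construction sketch that wires the online critics to their target copies. Here policy, ln_alpha, and a second critic q2 are assumed to be defined elsewhere (q1 as in the QLoss sketch above); the hyperparameter values are the documented defaults.

import copy

from emote.algorithms.sac import QTarget

# Target networks start as deep copies of the online critics; QTarget then keeps
# them as exponential moving averages and uses them to predict target Q-values.
q1t = copy.deepcopy(q1)
q2t = copy.deepcopy(q2)

q_target = QTarget(
    pi=policy,
    ln_alpha=ln_alpha,
    q1=q1,
    q2=q2,
    q1t=q1t,
    q2t=q2t,
    gamma=0.99,
    reward_scale=1.0,
    target_q_tau=0.005,
)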

class PolicyLoss(LossCallback):

Maximizes the soft Q-value for the policy. This loss updates the policy so that it selects the actions with the highest soft Q-value.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    q,
    opt,
    lr_schedule,
    q2,
    max_grad_norm,
    name,
    data_group,
    log_per_param_weights,
    log_per_param_grads
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • q(nn.Module): A deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action.
  • opt(optim.Optimizer): An optimizer for pi.
  • lr_schedule(Optional[optim.lr_scheduler._LRScheduler]): Learning rate schedule for the optimizer of policy.
  • q2(Optional[nn.Module]): A second deep neural net that outputs the Q-value (estimated discounted return) given the current observations and a given action. This is not strictly necessary, since the policy does not have to be pessimistic, but it can be nice for symmetry with the Q-loss.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • name(str): The name of the module. Used e.g. while logging.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • log_per_param_weights(bool): If true, log each individual policy parameter that is optimized (norm and value histogram).
  • log_per_param_grads(bool): If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation) -> None
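
A minimal construction sketch, reusing policy, ln_alpha, and q1 from the earlier sketches; the optimizer and learning rate are illustrative assumptions.

from torch import optim

from emote.algorithms.sac import PolicyLoss

# policy, ln_alpha, and q1 are assumed to be defined as in the earlier sketches.
policy_loss = PolicyLoss(
    pi=policy,
    ln_alpha=ln_alpha,
    q=q1,
    opt=optim.Adam(policy.parameters(), lr=3e-4),
)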

class AlphaLoss(LossCallback):

Tweaks the entropy weight alpha so that a specific target entropy is maintained. The target entropy is scaled with the number of actions and a provided entropy scaling factor.

Methods

def __init__(
    self,
    *,
    pi,
    ln_alpha,
    opt,
    lr_schedule,
    n_actions,
    max_grad_norm,
    max_alpha,
    name,
    data_group,
    t_entropy
) -> None

Arguments:

  • pi(nn.Module): A deep neural net that outputs actions and their log probability given a state.
  • ln_alpha(torch.tensor): The current weight for the entropy part of the soft Q.
  • opt(optim.Optimizer): An optimizer for ln_alpha.
  • lr_schedule(optim.lr_scheduler._LRScheduler | None): Learning rate schedule for the optimizer of alpha.
  • n_actions(int): The dimension of the action space. Scales the target entropy.
  • max_grad_norm(float): Clip the norm of the gradient during backprop using this value.
  • max_alpha(float)
  • name(str): The name of the module. Used e.g. while logging.
  • data_group(str): The name of the data group from which this Loss takes its data.
  • t_entropy(float | Schedule | None): Value or schedule for the target entropy.
def loss(self, observation) -> None
def end_batch(self) -> None
def state_dict(self) -> None
def load_state_dict(
    self,
    state_dict,
    load_weights,
    load_optimizer,
    load_hparams
) -> None
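
A minimal construction sketch for AlphaLoss, reusing policy from the earlier sketches; the initial ln_alpha value, learning rate, and action dimension are illustrative assumptions.

import torch
from torch import optim

from emote.algorithms.sac import AlphaLoss

# ln_alpha must be a leaf tensor with requires_grad=True so its optimizer can tune it.
ln_alpha = torch.tensor(0.0, requires_grad=True)

alpha_loss = AlphaLoss(
    pi=policy,
    ln_alpha=ln_alpha,
    opt=optim.Adam([ln_alpha], lr=3e-4),
    n_actions=6,
)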

class AgentProxyWrapper:

Methods

def __init__(self, *, inner) -> None
def __call__(self) -> None
def input_names(self) -> None
def output_names(self) -> None
def policy(self) -> None

class FeatureAgentProxy(GenericAgentProxy):

An agent proxy for basic MLPs. This AgentProxy assumes that the observations will contain a single flat array of features.

Methods

def __init__(self, policy, device, input_key) -> None

Create a new proxy.

Arguments:

  • policy(nn.Module): The policy to execute for actions.
  • device(torch.device): The device to run on.
  • input_key(str): The name of the features. (default: "obs")
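
A short usage sketch, assuming policy is a trained policy network defined elsewhere and following the module path shown on this page.

import torch

from emote.algorithms.sac import FeatureAgentProxy

# Runs the policy on CPU and reads the flat feature array from the default "obs" key.
agent_proxy = FeatureAgentProxy(policy=policy, device=torch.device("cpu"))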

class VisionAgentProxy(FeatureAgentProxy):

This AgentProxy assumes that the observations will contain image observations under the key 'obs'.

Methods

def __init__(self, policy, device) -> None

Arguments:

  • policy(nn.Module)
  • device(torch.device)

class MultiKeyAgentProxy(GenericAgentProxy):

Handles multiple input keys. Observations are dicts that contain multiple input keys (e.g. both "features" and "images").

Methods

def __init__(self, policy, device, input_keys, spaces) -> None

Create a new proxy.

Arguments:

  • policy(nn.Module): The policy to execute for actions.
  • device(torch.device): The device to run on.
  • input_keys(tuple): The names of the input keys in the observation dict.
  • spaces(MDPSpace)
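
A short usage sketch. Here policy and spaces (an MDPSpace describing the observation layout) are assumed to be defined elsewhere, and the input keys are illustrative; they must match the keys of the observation dicts.

import torch

from emote.algorithms.sac import MultiKeyAgentProxy

agent_proxy = MultiKeyAgentProxy(
    policy=policy,
    device=torch.device("cpu"),
    input_keys=("features", "images"),
    spaces=spaces,
)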