module emote.algorithms.sac
Functions
def soft_update_from_to(source, target, tau) -> None
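A minimal usage sketch follows; the toy networks and the tau value are illustrative assumptions. The function nudges the target network towards the source network by an exponential moving average.

```python
# Minimal sketch of soft_update_from_to; the toy networks and tau are assumptions.
import torch.nn as nn

from emote.algorithms.sac import soft_update_from_to

q = nn.Linear(8, 1)         # online Q network (toy example)
q_target = nn.Linear(8, 1)  # target Q network with the same architecture
q_target.load_state_dict(q.state_dict())

# After each update step, nudge the target towards the online network:
# new_target = old_target * (1 - tau) + online * tau
soft_update_from_to(q, q_target, tau=0.005)
```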
Classes
class QLoss(LossCallback):
An MSE loss between the action-value net and the target Q values. The target Q values are not calculated here and need to be added to the state before the loss of this module runs.
Methods
def __init__(
self,
*,
name,
q,
opt,
lr_schedule,
max_grad_norm,
data_group,
log_per_param_weights,
log_per_param_grads
) -> None
Arguments:
name(str)
: The name of the module. Used e.g. while logging.
q(nn.Module)
: A deep neural net that outputs the discounted loss given the current observations and a given action.
opt(optim.Optimizer)
: An optimizer for q.
lr_schedule(Optional[optim.lr_scheduler._LRScheduler])
: Learning rate schedule for the optimizer of q.
max_grad_norm(float)
: Clip the norm of the gradient during backprop using this value.
data_group(str)
: The name of the data group from which this Loss takes its data.
log_per_param_weights(bool)
: If true, log each individual policy parameter that is optimized (norm and value histogram).
log_per_param_grads(bool)
: If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation, actions, q_target) -> None
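A minimal construction sketch follows; the Q-network shape, learning rate, and gradient clip are assumptions for the example, not API defaults.

```python
# Sketch: constructing a QLoss; network shape and hyperparameters are assumptions.
from torch import nn, optim

from emote.algorithms.sac import QLoss

obs_dim, act_dim = 8, 2

# Toy Q network taking concatenated (observation, action) features.
q1 = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))

q1_loss = QLoss(
    name="q1",
    q=q1,
    opt=optim.Adam(q1.parameters(), lr=3e-4),
    max_grad_norm=1.0,
)
# Note: the "q_target" values consumed by QLoss.loss must be added to the batch
# by another callback (e.g. QTarget below) before this loss runs.
```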
class QTarget(LoggingMixin, Callback):
Creates rolling averages of the Q nets, and predicts q values using these.
The module is responsible both for keeping the averages correct in the target q networks and supplying q-value predictions using the target q networks.
Methods
def __init__(
self,
*,
pi,
ln_alpha,
q1,
q2,
q1t,
q2t,
gamma,
reward_scale,
target_q_tau,
data_group,
roll_length,
use_terminal_masking
) -> None
Arguments:
pi(nn.Module)
: A deep neural net that outputs actions and their log probability given a state.
ln_alpha(torch.tensor)
: The current weight for the entropy part of the soft Q.
q1(nn.Module)
: A deep neural net that outputs the discounted loss given the current observations and a given action.
q2(nn.Module)
: A deep neural net that outputs the discounted loss given the current observations and a given action.
q1t(Optional[nn.Module])
: Target Q network. (default: None)
q2t(Optional[nn.Module])
: Target Q network. (default: None)
gamma(float)
: Discount factor for the rewards in time. (default: 0.99)
reward_scale(float)
: Scale factor for the rewards. (default: 1.0)
target_q_tau(float)
: The weight given to the latest network in the exponential moving average, so NewTargetQ = OldTargetQ * (1 - tau) + Q * tau. (default: 0.005)
data_group(str)
: The name of the data group from which this Loss takes its data. (default: "default")
roll_length(int)
: Rollout length. (default: 1)
use_terminal_masking(bool)
: Whether to use terminal masking for the next values. (default: False)
def begin_batch(self, next_observation, rewards, masks) -> None
def end_batch(self) -> None
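A construction sketch follows; the toy policy and Q networks and the hyperparameter values are assumptions made for the example.

```python
# Sketch: QTarget keeps EMA copies of q1/q2 and writes "q_target" into the batch.
# The toy policy/Q networks and hyperparameters are assumptions for this example.
import torch
from torch import nn

from emote.algorithms.sac import QTarget

obs_dim, act_dim = 8, 2

class ToyPolicy(nn.Module):
    """Returns (action, log_prob), as documented for the pi argument."""

    def __init__(self):
        super().__init__()
        self.net = nn.Linear(obs_dim, act_dim)

    def forward(self, obs):
        action = torch.tanh(self.net(obs))
        log_prob = torch.zeros(obs.shape[0], 1)  # placeholder log-probability
        return action, log_prob

def make_q():
    return nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1))

pi, q1, q2 = ToyPolicy(), make_q(), make_q()
ln_alpha = torch.tensor(0.0, requires_grad=True)  # log of the entropy weight

q_target = QTarget(
    pi=pi,
    ln_alpha=ln_alpha,
    q1=q1,
    q2=q2,
    gamma=0.99,
    reward_scale=1.0,
    target_q_tau=0.005,  # EMA weight: new_target = old_target * (1 - tau) + q * tau
)
```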
class PolicyLoss(LossCallback):
Maximize the soft Q-value for the policy. This loss modifies the policy to select the action that gives the highest soft q-value.
Methods
def __init__(
self,
*,
pi,
ln_alpha,
q,
opt,
lr_schedule,
q2,
max_grad_norm,
name,
data_group,
log_per_param_weights,
log_per_param_grads
) -> None
Arguments:
pi(nn.Module)
: A deep neural net that outputs actions and their log probability given a state.
ln_alpha(torch.tensor)
: The current weight for the entropy part of the soft Q.
q(nn.Module)
: A deep neural net that outputs the discounted loss given the current observations and a given action.
opt(optim.Optimizer)
: An optimizer for pi.
lr_schedule(Optional[optim.lr_scheduler._LRScheduler])
: Learning rate schedule for the optimizer of the policy.
q2(Optional[nn.Module])
: A second deep neural net that outputs the discounted loss given the current observations and a given action. This is not necessary since it is fine if the policy isn't pessimistic, but can be nice for symmetry with the Q-loss.
max_grad_norm(float)
: Clip the norm of the gradient during backprop using this value.
name(str)
: The name of the module. Used e.g. while logging.
data_group(str)
: The name of the data group from which this Loss takes its data.
log_per_param_weights(bool)
: If true, log each individual policy parameter that is optimized (norm and value histogram).
log_per_param_grads(bool)
: If true, log the gradients of each individual policy parameter that is optimized (norm and histogram).
def loss(self, observation) -> None
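As a sketch, continuing the toy setup from the QTarget example above (pi, ln_alpha, q1); the learning rate is an assumption.

```python
# Sketch, reusing pi, ln_alpha and q1 from the QTarget example above.
from torch import optim

from emote.algorithms.sac import PolicyLoss

policy_loss = PolicyLoss(
    pi=pi,
    ln_alpha=ln_alpha,
    q=q1,
    opt=optim.Adam(pi.parameters(), lr=3e-4),
)
```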
class AlphaLoss(LossCallback):
Tweaks the alpha so that a specific target entropy is kept. The target entropy is scaled with the number of actions and a provided entropy scaling factor.
Methods
def __init__(
self,
*,
pi,
ln_alpha,
opt,
lr_schedule,
n_actions,
max_grad_norm,
max_alpha,
name,
data_group,
t_entropy
) -> None
Arguments:
pi(nn.Module)
: A deep neural net that outputs actions and their log probability given a state.
ln_alpha(torch.tensor)
: The current weight for the entropy part of the soft Q.
opt(optim.Optimizer)
: An optimizer for ln_alpha.
lr_schedule(optim.lr_scheduler._LRScheduler | None)
: Learning rate schedule for the optimizer of alpha.
n_actions(int)
: The dimension of the action space. Scales the target entropy.
max_grad_norm(float)
: Clip the norm of the gradient during backprop using this value.
max_alpha(float)
name(str)
: The name of the module. Used e.g. while logging.
data_group(str)
: The name of the data group from which this Loss takes its data.
t_entropy(float | Schedule | None)
: Value or schedule for the target entropy.
def loss(self, observation) -> None
def end_batch(self) -> None
def state_dict(self) -> None
def load_state_dict(
self,
state_dict,
load_weights,
load_optimizer,
load_hparams
) -> None
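A sketch continuing the same toy setup: AlphaLoss tunes ln_alpha towards a target entropy scaled by the action dimension; the learning rate is an assumption.

```python
# Sketch, reusing pi, ln_alpha and act_dim from the QTarget example above.
from torch import optim

from emote.algorithms.sac import AlphaLoss

alpha_loss = AlphaLoss(
    pi=pi,
    ln_alpha=ln_alpha,
    opt=optim.Adam([ln_alpha], lr=3e-4),
    n_actions=act_dim,  # scales the target entropy
)
```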
class AgentProxyWrapper:
Methods
def __init__(self, *, inner) -> None
def __call__(self) -> None
def input_names(self) -> None
def output_names(self) -> None
def policy(self) -> None
class FeatureAgentProxy(GenericAgentProxy):
An agent proxy for basic MLPs. This AgentProxy assumes that the observations will contain a single flat array of features.
Methods
def __init__(self, policy, device, input_key) -> None
Create a new proxy.
Arguments:
policy(nn.Module)
: The policy to execute for actions.
device(torch.device)
: The device to run on.
input_key(str)
: The name of the features. (default: "obs")
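A minimal sketch of wrapping a policy for inference on flat feature observations; the policy reused here is the toy one from the QTarget example above.

```python
# Sketch: a proxy for a policy that reads a single flat feature array per agent.
import torch

from emote.algorithms.sac import FeatureAgentProxy

agent_proxy = FeatureAgentProxy(
    policy=pi,                   # toy policy from the QTarget sketch above
    device=torch.device("cpu"),
    input_key="obs",             # name of the feature array (the documented default)
)
```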
class VisionAgentProxy(FeatureAgentProxy):
This AgentProxy assumes that the observations will contain image observations under the key 'obs'.
Methods
def __init__(self, policy, device) -> None
Arguments:
policy(nn.Module)
device(torch.device)
class MultiKeyAgentProxy(GenericAgentProxy):
Handles multiple input keys. Observations are dicts that contain multiple input keys (e.g. both "features" and "images").
Methods
def __init__(self, policy, device, input_keys, spaces) -> None
Create a new proxy.
Arguments:
policy(nn.Module)
: The policy to execute for actions.
device(torch.device)
: The device to run on.
input_keys(tuple)
: The names of the input.
spaces(MDPSpace)
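A hedged sketch of a multi-key proxy; the multi-input policy and the MDPSpace instance are placeholders assumed to exist elsewhere, not part of this module.

```python
# Sketch: a proxy whose observations carry several keys, e.g. "features" and "images".
# `multi_input_policy` and `mdp_space` are placeholders, not part of this module.
import torch

from emote.algorithms.sac import MultiKeyAgentProxy

agent_proxy = MultiKeyAgentProxy(
    policy=multi_input_policy,            # a policy consuming both input keys
    device=torch.device("cpu"),
    input_keys=("features", "images"),
    spaces=mdp_space,                     # MDPSpace describing observations and actions
)
```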