# Source code for gpytorch.mlls.predictive_log_likelihood

#!/usr/bin/env python3

from ._approximate_mll import _ApproximateMarginalLogLikelihood

class PredictiveLogLikelihood(_ApproximateMarginalLogLikelihood):
    r"""
    An alternative objective function for approximate GPs, proposed in `Jankowiak et al., 2020`_.
    It typically produces better predictive variances than the :obj:`gpytorch.mlls.VariationalELBO` objective.

    .. math::

       \begin{align*}
          \mathcal{L}_\text{PLL} &=
          \mathbb{E}_{p_\text{data}( y, \mathbf x )} \left[
             \log p( y \! \mid \! \mathbf x)
          \right] - \beta \: \text{KL} \left[ q( \mathbf u) \Vert p( \mathbf u) \right]
          \\
          &\approx \sum_{i=1}^N \log \mathbb{E}_{q(\mathbf u)} \left[
             \int p( y_i \! \mid \! f_i) p(f_i \! \mid \! \mathbf u, \mathbf x_i) \: d f_i
          \right] - \beta \: \text{KL} \left[ q( \mathbf u) \Vert p( \mathbf u) \right]
       \end{align*}

    where :math:`N` is the total number of datapoints, :math:`q(\mathbf u)` is the variational distribution for
    the inducing function values, and :math:`p(\mathbf u)` is the prior distribution for the inducing function
    values.

    :math:`\beta` is a scaling constant that reduces the regularization effect of the KL
    divergence. Setting :math:`\beta=1` (default) results in an objective that can be motivated by a connection
    to Stochastic Expectation Propagation (see `Jankowiak et al., 2020`_ for details).

    .. note::
        This objective is very similar to the variational ELBO.
        The only difference is that the :math:`\log` occurs *outside* the expectation :math:`\mathbb{E}_{q(\mathbf u)}`.
        This difference results in very different predictive performance (see `Jankowiak et al., 2020`_).
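
    For intuition, the two objectives can be contrasted with Monte Carlo samples.
    In the illustrative sketch below (not from the original module), ``probs`` is a
    hypothetical ``(num_samples, N)`` tensor of likelihood values
    :math:`p(y_i \mid f_i^{(s)})` under samples drawn through :math:`q(\mathbf u)`:

    >>> pll_term = probs.mean(dim=0).log().sum()   # log of the mean: this objective
    >>> elbo_term = probs.log().mean(dim=0).sum()  # mean of the log: the ELBO
    >>> # By Jensen's inequality, pll_term >= elbo_term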

    :param ~gpytorch.likelihoods.Likelihood likelihood: The likelihood for the model
    :param ~gpytorch.models.ApproximateGP model: The approximate GP model
    :param int num_data: The total number of training data points (necessary for SGD)
    :param float beta: (optional, default=1.) A multiplicative factor for the KL divergence term.
        Setting it to anything less than 1 reduces the regularization effect of the model
        (similar to what was proposed in the beta-VAE paper).
    :param bool combine_terms: (default=True) Whether or not to sum the
        expected NLL with the KL divergence term into a single objective.

    Example:
        >>> # model is a gpytorch.models.ApproximateGP
        >>> # likelihood is a gpytorch.likelihoods.Likelihood
        >>> mll = gpytorch.mlls.PredictiveLogLikelihood(likelihood, model, num_data=100, beta=0.5)
        >>>
        >>> output = model(train_x)
        >>> loss = -mll(output, train_y)
        >>> loss.backward()

    .. _Jankowiak et al., 2020:
        https://arxiv.org/abs/1910.07123
    """

    def _log_likelihood_term(self, approximate_dist_f, target, **kwargs):
        # log E_{q(f)}[p(y | f)] for each data point, summed over the data dimension.
        # Contrast with the ELBO, whose likelihood term uses expected_log_prob,
        # i.e. E_{q(f)}[log p(y | f)].
        return self.likelihood.log_marginal(target, approximate_dist_f, **kwargs).sum(-1)

    def forward(self, approximate_dist_f, target, **kwargs):
r"""
Computes the predictive cross entropy given :math:q(\mathbf f) and :math:\mathbf y.
Calling this function will call the likelihood's
:meth:~gpytorch.likelihoods.Likelihood.forward function.

:param ~gpytorch.distributions.MultivariateNormal variational_dist_f: :math:q(\mathbf f)
the outputs of the latent function (the :obj:gpytorch.models.ApproximateGP)
:param torch.Tensor target: :math:\mathbf y The target values
:param kwargs: Additional arguments passed to the
likelihood's :meth:~gpytorch.likelihoods.Likelihood.forward function.
:rtype: torch.Tensor
:return: Predictive log likelihood. Output shape corresponds to batch shape of the model/input data.
"""
        return super().forward(approximate_dist_f, target, **kwargs)
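

# ---------------------------------------------------------------------------
# Illustrative, self-contained training sketch (an editorial addition for
# exposition, not part of the original module). It fits a small variational GP
# on toy data with the PredictiveLogLikelihood objective, following the usage
# pattern from the class docstring; all names below are local to this sketch.
if __name__ == "__main__":
    import torch
    import gpytorch
    from torch.utils.data import DataLoader, TensorDataset

    class ToyGPModel(gpytorch.models.ApproximateGP):
        def __init__(self, inducing_points):
            variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
                inducing_points.size(0)
            )
            variational_strategy = gpytorch.variational.VariationalStrategy(
                self, inducing_points, variational_distribution, learn_inducing_locations=True
            )
            super().__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        def forward(self, x):
            return gpytorch.distributions.MultivariateNormal(
                self.mean_module(x), self.covar_module(x)
            )

    train_x = torch.linspace(0, 1, 100)
    train_y = torch.sin(train_x * 6.28) + 0.1 * torch.randn(100)

    model = ToyGPModel(inducing_points=train_x[:10].clone())
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    mll = PredictiveLogLikelihood(likelihood, model, num_data=train_y.size(0))

    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(
        [{"params": model.parameters()}, {"params": likelihood.parameters()}], lr=0.05
    )
    loader = DataLoader(TensorDataset(train_x, train_y), batch_size=25, shuffle=True)

    for _ in range(20):  # epochs
        for x_batch, y_batch in loader:
            optimizer.zero_grad()
            output = model(x_batch)        # q(f) at the minibatch inputs
            loss = -mll(output, y_batch)   # negative predictive log likelihood
            loss.backward()
            optimizer.step()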