---------------------------------------------------
lines 5-420 of file: example/user/hes_fixed_math.py
---------------------------------------------------

# {xrst_begin user_hes_fixed_math.py}
# {xrst_spell
#     cc
# }
# {xrst_comment_ch #}
#
# Check the Hessian of the Fixed Effects Objective
# ################################################
#
# Purpose
# *******
# This is a detailed mathematical explanation of computing Hessian of the
# fixed effects objective used by the
# :ref:`sample_command@asymptotic` sampling method.
#
# Reference
# *********
# See the
# `theory <https://bradbell.github.io/cppad_mixed/doc/theory.htm>`_
# section of the ``cppad_mixed`` documentation.
#
# Notation
# ********
#
# .. csv-table::
#     :widths: auto
#
#     :math:`\theta`,model incidence for the parent region
#     :math:`u_0`,incidence random effect for first child
#     :math:`u_1`,incidence random effect for second child
#     :math:`y_0`,measured incidence for the first child
#     :math:`y_1`,measured incidence for the second child
#     :math:`s`,standard deviation for data and random effects
#
# The only fixed effect in this model is :math:`\theta`
# (sometimes written *theta* ) the incidence level for the world.
# The random effects are :math:`u_0` and :math:`u_1`.
#
# Problem Parameters
# ******************
# {xrst_spell_off}
# {xrst_code py}
import math
import time
theta_true     = 0.5
u_true         = [ 0.5, -0.5 ]
y_0            = theta_true * math.exp( u_true[0] )
y_1            = theta_true * math.exp( u_true[1] )
standard_dev   = theta_true # the standard deviation s
random_seed    = int( time.time() )
number_sample  = 1000
eta_in_prior   = 1e-6 # if None, the fixed effects are not scaled
# {xrst_code}
# {xrst_spell_on}
#
# Random Likelihood
# *****************
# The negative log-likelihood for this example, ignoring the constant
# of integration, is
#
# .. math::
#
#  f( \theta, u )
#  = \frac{1}{2 s^2} \left(
#     [ y_0 - \theta \exp( u_0 ) ]^2 +
#     [ y_1 - \theta \exp( u_1 ) ]^2 +
#     u_0^2 +
#     u_1^2
#  \right)
#
# Gradient w.r.t. Fixed Effects
# *****************************
#
# .. math::
#
#  f_\theta ( \theta, u )
#  =
#  \frac{1}{s^2} [
#     \theta \exp( 2 u_0 ) - y_0 \exp( u_0 ) +
#     \theta \exp( 2 u_1 ) - y_1 \exp( u_1 )
#  ]
#
# Gradient w.r.t. Random Effects
# ******************************
#
# .. math::
#
#  f_u ( \theta, u )
#  =
#  \frac{1}{s^2} \left(
#  \begin{array}{c}
#     \theta^2 \exp( 2 u_0 ) - y_0 \theta \exp( u_0 )  + u_0
#     \\
#     \theta^2 \exp( 2 u_1 ) - y_1 \theta \exp( u_1 )  + u_1
#  \end{array}
#  \right)
#
# Hessian w.r.t. Fixed Effects
# ****************************
#
# .. math::
#
#  f_{\theta,\theta} ( \theta, u )
#  =
#  \frac{1}{s^2} [ \exp( 2 u_0 ) + \exp( 2 u_1 ) ]
#
# Hessian w.r.t. Random Effects
# *****************************
#
# .. math::
#
#  f_{u,u} ( \theta, u )
#  =
#  \frac{1}{s^2} \left(
#  \begin{array}{cc}
#     2 \theta^2 \exp( 2 u_0 ) - y_0 \theta \exp( u_0 ) + 1   & 0
#     \\
#     0   & 2 \theta^2 \exp( 2 u_1 ) - y_1 \theta \exp( u_1 ) + 1
#  \end{array}
#  \right)
#
# Hessian Cross Term
# ******************
#
# .. math::
#
#  f_{u,\theta} ( \theta, u )
#  =
#  \frac{1}{s^2} \left(
#  \begin{array}{c}
#     2 \theta \exp( 2 u_0 ) - y_0 \exp( u_0 )
#     \\
#     2 \theta \exp( 2 u_1 ) - y_1 \exp( u_1 )
#  \end{array}
#  \right)
#
# Optimal Random Effects
# **********************
#
# Implicit Function Definition
# ============================
# The optimal random effects :math:`\hat{u} ( \theta )`
# solve the equation
#
# .. math::
#
#  f_u [ \theta , \hat{u} ( \theta ) ] = 0
#
# Derivatives of Optimal Random Effects
# =====================================
# Using the implicit function theorem we have
#
# .. math::
#
#  \hat{u}^{(1)} ( \theta )= -
#     f_{u,u} [ \theta , \hat{u} ( \theta) ]^{-1}
#        f_{u,\theta} [ \theta , \hat{u} ( \theta) ]
#
# Substituting in the formulas above for the Hessian terms on the
# right hand side we obtain:
#
# .. math::
#
#  \hat{u}_i^{(1)} ( \theta ) = - \frac{
#     2 \theta \exp[ 2 \hat{u}_i ( \theta) ] -
#     y_i \exp[ \hat{u}_i ( \theta) ]
#  }{
#     2 \theta^2 \exp[ 2 \hat{u}_i ( \theta) ] -
#     y_i \theta \exp[ \hat{u}_i ( \theta) ] + 1
#  }
#
# We can compute :math:`\hat{u}_i ( \theta )` by optimizing the
# random effects corresponding to the fixed effects being :math:`\theta`.
# We define :math:`g_i ( \theta )` by the equation
#
# .. math::
#
#  g_i ( \theta ) = 2 \theta \exp[ 2 \hat{u}_i ( \theta) ]
#              - y_i \exp[ \hat{u}_i ( \theta ) ]
#
# Give :math:`\hat{u}_i ( \theta )` we can compute :math:`g_i ( \theta )`.
# Given :math:`g_i ( \theta )`, we can compute
# the derivative :math:`\hat{u}_i^{(1)} ( \theta )` using
#
# .. math::
#
#  \hat{u}_i^{(1)} ( \theta ) = -
#     \frac{ g_i ( \theta ) }{ \theta g_i ( \theta ) + 1}
#
# Given :math:`\hat{u}^{(1)} ( \theta )`, we can compute
# the derivative :math:`g_i^{(1)} ( \theta )` using
#
# .. math::
#
#  g_i^{(1)} ( \theta ) =
#  2 \exp[ 2 \hat{u}_i ( \theta) ]  +
#  \left(
#     4 \theta \exp [ 2 \hat{u}_i ( \theta ) ] -
#     y_i \exp [ \hat{u}_i ( \theta ) ]
#  \right) \hat{u}_i^{(1)} ( \theta )
#
# Given :math:`g_i^{(1)} ( \theta )`, we can compute
# the second derivative :math:`\hat{u}_i^{(2)} ( \theta )` using
#
# .. math::
#
#  \hat{u}_i^{(2)} ( \theta ) =
#  \frac{ g_i ( \theta ) [ g_i ( \theta ) + \theta g_i ^{(1)} ( \theta ) ] }
#  { [ \theta g_i ( \theta ) + 1 ]^2 }
#  -
#  \frac{ g_i ^{(1)}( \theta ) }{ \theta g_i ( \theta ) + 1}
#
# .. math::
#
#  \hat{u}_i^{(2)} ( \theta ) =
#  \frac{ g_i ( \theta )^2 - g_i ^{(1)}( \theta )}
#  { [ \theta g_i ( \theta ) + 1 ]^2 }
#
# Given :math:`\hat{u}^{(2)} ( \theta )`, we can compute
# the second derivative :math:`g_i^{(2)} ( \theta )` using
#
# .. math::
#  :nowrap:
#
#  \begin{eqnarray}
#  g_i^{(2)} ( \theta ) & = &
#  8 \exp[ 2 \hat{u}_i ( \theta) ] \hat{u}_i^{(1)} ( \theta )
#  \\ & + &
#  \left(
#     8 \theta \exp [ 2 \hat{u}_i ( \theta ) ]  -
#     y_i \exp [ \hat{u}_i ( \theta ) ]
#  \right) \hat{u}_i^{(1)} ( \theta )^2
#  \\ & + &
#  \left(
#     4 \theta \exp [ 2 \hat{u}_i ( \theta ) ] -
#     y_i \exp [ \hat{u}_i ( \theta ) ]
#  \right) \hat{u}_i^{(2)} ( \theta )
#  \end{eqnarray}
#
# Laplace Approximation
# *********************
# Up to a constant, the Laplace approximation (fixed effects objective),
# as a function of the fixed effects, is:
#
# .. math::
#
#  L ( \theta ) = F( \theta ) + G( \theta )
#
# where :math:`F( \theta )` and :math:`G( \theta )` are defined by
#
# .. math::
#
#  F( \theta ) = f[ \theta , \hat{u} ( \theta ) ]
#
# .. math::
#
#  G( \theta ) =\frac{1}{2} \log \det f_{u,u} [ \theta , \hat{u} ( \theta ) ]
#
# The derivative of :math:`F( \theta )` is given by
#
# .. math::
#
#  F^{(1)} ( \theta ) =
#  f_\theta [ \theta , \hat{u} ( \theta ) ] +
#  f_u [ \theta , \hat{u} ( \theta ) ]  \hat{u}^{(1)} ( \theta )
#
# It follows from the definition of :math:`\hat{u} ( \theta )` that
# :math:`f_u [ \theta , \hat{u} ( \theta ) ] = 0` and
#
# .. math::
#
#  F^{(1)} ( \theta ) = f_\theta [ \theta , \hat{u} ( \theta ) ]
#
# Taking the derivative of the equation above we obtain
#
# .. math::
#
#  F^{(2)} ( \theta ) =
#  f_{\theta,\theta} [ \theta , \hat{u} ( \theta ) ] +
#  f_{\theta,u} [ \theta , \hat{u} ( \theta ) ] \hat{u}^{(1)} ( \theta )
#
# Combining the definition of :math:`G( \theta )`, :math:`g_i ( \theta )`
# and the formula for :math:`f_{u,u} ( \theta , u )` we have
#
# .. math::
#
#  G( \theta )
#  =
#  \frac{1}{2} \log \det \left(
#  \begin{array}{cc}
#     [ \theta g_0 ( \theta ) + 1 ] / s^2 & 0
#     \\
#     0   & [ \theta g_1 ( \theta ) + 1 ] / s^2
#  \end{array}
#  \right)
#
# Defining :math:`h_i ( \theta )` by
#
# .. math::
#
#  h_i ( \theta ) = \theta g_i ( \theta ) + 1
#
# we obtain the representation
#
# .. math::
#
#  G ( \theta ) =
#  + \frac{1}{2} \left(
#  \log [ h_0 ( \theta ) ] + \log [ h_1 ( \theta ) ] - \log ( s^4 )
#  \right)
#
# The first and second derivative of :math:`h_i ( \theta )`
# and :math:`G( \theta )` are given by
#
# .. math::
#
#  h_i^{(1)} ( \theta ) = g_i ( \theta ) + \theta g_i^{(1)} ( \theta )
#
# .. math::
#
#  G^{(1)} ( \theta ) =
#   \frac{1}{2} \left(
#      \frac{ h_0^{(1)} ( \theta ) }{ h_0 ( \theta ) }
#      +
#      \frac{ h_1^{(1)} ( \theta ) }{ h_1 ( \theta ) }
#   \right)
#
# .. math::
#
#  h_i^{(2)} ( \theta ) = 2 g_i^{(1)} ( \theta ) + \theta g_i^{(2)} ( \theta )
#
# .. math::
#
#  G^{(2)} ( \theta ) =
#   \frac{1}{2} \left(
#      \frac{ h_0 ( \theta ) h_0^{(2)} ( \theta ) - h_0^{(1)} ( \theta )^2 }
#      { h_0 ( \theta )^2 }
#      +
#      \frac{ h_1 ( \theta ) h_1^{(2)} ( \theta ) - h_1^{(1)} ( \theta )^2 }
#      { h_1 ( \theta )^2 }
#   \right)
#
# Asymptotic Statistics
# *********************
# The asymptotic posterior distribution for the optimal estimate of
# :math:`\theta` give the data :math:`y`
# is a normal with variance equal to the inverse of
#
# .. math::
#
#  L^{(2)} ( \theta ) = F^{(2)} ( \theta ) + G^{(2)} ( \theta )
#
# Scaling Fixed Effects
# *********************
# If :ref:`prior_table@eta` is not null,
# the Hessian is with respect to :math:`\alpha = \log( \theta + \eta )`.
# Inverting this relation we define
# :math:`\theta ( \alpha ) = \exp( \alpha ) - \eta`.
# The fixed effects objective as a function of :math:`\alpha` is
#
# .. math::
#
#  H( \alpha ) =
#  L[ \theta ( \alpha ) ] =
#  F[ \theta( \alpha ) ] + G[ \theta( \alpha ) ]
#
# Taking the first and second derivatives, we have
#
# .. math::
#
#  H^{(1)}( \alpha ) =  \left(
#     F^{(1)}[ \theta( \alpha ) ] + G^{(1)}[ \theta( \alpha ) ]
#  \right) \exp( \alpha )
#
# .. math::
#  :nowrap:
#
#  \begin{eqnarray}
#  H^{(2)}( \alpha ) & = & \left(
#     F^{(1)}[ \theta( \alpha ) ] + G^{(1)}[ \theta( \alpha ) ]
#  \right) \exp( \alpha )
#  \\ & + &
#  \left(
#     F^{(2)}[ \theta( \alpha ) ] + G^{(2)}[ \theta( \alpha ) ]
#  \right) \exp( 2 \alpha )
#  \end{eqnarray}
#
# Optimal Fixed Effects
# =====================
# The first order necessary conditions for
# :math:`\hat{\alpha}`
# to be a minimizer of the fixed effects object is
# :math:`H^{(1)} ( \hat{\alpha} ) = 0`.
# In this case we can simplify the Hessian scaling as follows:
#
# .. math::
#  :nowrap:
#
#  \begin{eqnarray}
#  H^{(2)}( \hat{\alpha} ) & = & \left(
#     F^{(2 }( \hat{\theta} ) + G^{(2)}( \hat{\theta} )
#  \right) \exp( 2 \hat{\alpha} )
#  \\ & = &
#  L^{(2)} ( \hat{\theta} ) \exp( 2 \hat{\alpha} )
#  \end{eqnarray}
#
# where :math:`\hat{\theta} = \theta( \hat{\alpha} )`.
#
# Source Code
# ***********
# {xrst_literal
#     BEGIN PYTHON
#     END PYTHON
# }
#
# {xrst_end user_hes_fixed_math.py}