Source code for gemseo.post.core.hessians

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                        documentation
#        :author: Francois Gallard
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Hessian matrix approximations from gradient pairs."""
from __future__ import division, unicode_literals

import logging

from numpy import array, atleast_2d, concatenate, cumsum
from numpy import diag as np_diag
from numpy import dot, eye, inf, sqrt, trace, zeros
from numpy.linalg import LinAlgError, cholesky, inv, multi_dot, norm
from numpy.matlib import repmat
from scipy.optimize import leastsq

LOGGER = logging.getLogger(__name__)


[docs]class HessianApproximation(object):
    """Abstract class for hessian approximations from optimization history."""

    def __init__(self, history):
        """Constructor.

        :param history: the optimization history
        """
        self.history = history
        self.x_ref = None
        self.fgrad_ref = None
        self.f_ref = None
        self.b_mat_history = []
        self.h_mat_history = []

[docs]    def get_x_grad_history(
        self,
        funcname,
        first_iter=0,
        last_iter=0,
        at_most_niter=-1,
        func_index=None,
        normalize_design_space=False,
        design_space=None,
    ):
        """Get gradient history and design variables history of gradient evaluations.

        :param funcname: function name
        :param first_iter: first iteration after which the history is
            extracted (Default value = 0)
        :param last_iter: last iteration before which the history is
            extracted (Default value = 0)
        :param at_most_niter: maximum number of iterations to take
            (Default value = -1)
        :param func_index: Default value = None)
        :param normalize_design_space: if True, scale the values
            to work in a normalized design space (x between 0 and 1)
        :param design_space: the design space used to scale all values
            mandatory if normalize_design_space==True
        """
        x_grad_hist, x_hist = self.history.get_func_grad_history(funcname, x_hist=True)
        if normalize_design_space:
            (
                x_hist,
                x_grad_hist,
            ) = self._normalize_x_g(x_hist, x_grad_hist, design_space)
        assert len(x_grad_hist) == len(x_hist)
        n_pairs = len(x_grad_hist)
        if n_pairs < 2:
            raise ValueError(
                "Cannot build approximation for function : {}"
                " because its gradient history "
                "is too small : {}.".format(funcname, n_pairs)
            )
        #         LOGGER.info("Building Hessian approximation with " +
        #                     str(n_pairs) + " pairs of x and gradients")
        x_grad_hist = array(x_grad_hist)
        x_hist = array(x_hist)
        n_iter, nparam = x_hist.shape
        grad_shape = x_grad_hist.shape
        n_iter_g = grad_shape[0]
        nparam_g = grad_shape[-1]
        # Function is a vector, jacobian is a 2D matrix
        if len(grad_shape) == 3:
            if func_index is None:
                raise ValueError(
                    "Function {} has a vector output "
                    "then function index of output "
                    "must be specified.".format(funcname)
                )
            if func_index < 0 or func_index >= grad_shape[1]:
                raise ValueError(
                    "Function {} has a vector output of size {},"
                    "function index {} is out of range"
                    ".".format(funcname, grad_shape[1], func_index)
                )
            x_grad_hist = x_grad_hist[:, func_index, :]

        if (n_iter != n_iter_g) or (nparam_g != nparam):
            raise ValueError(
                "Inconsistent gradient and design" " variables optimization history"
            )
        if last_iter == 0:
            x_hist = x_hist[first_iter:, :]
            x_grad_hist = x_grad_hist[first_iter:, :]
        else:
            x_hist = x_hist[first_iter:last_iter, :]
            x_grad_hist = x_grad_hist[first_iter:last_iter, :]

        n_iter = x_hist.shape[0]
        if 0 < at_most_niter < n_iter:
            x_hist = x_hist[n_iter - at_most_niter :, :]
            x_grad_hist = x_grad_hist[n_iter - at_most_niter :, :]
        n_iter, nparam = x_hist.shape
        if n_iter < 2 or nparam == 0:
            raise ValueError(
                "Insufficient optimization history size, "
                "niter={} nparam = {}.".format(n_iter, nparam)
            )
        self.x_ref = x_hist[-1]
        self.fgrad_ref = x_grad_hist[-1]
        if last_iter == 0:
            self.f_ref = array(self.history.get_func_history(funcname))[-1]
        else:
            self.f_ref = array(self.history.get_func_history(funcname))[:last_iter][-1]
        return x_hist, x_grad_hist, n_iter, nparam

    @staticmethod
    def _normalize_x_g(x_hist, x_grad_hist, design_space):
        """scale the values to work in a normalized design space (x between 0 and 1)

        :param design_space: the design space used to scale all values
        """
        if design_space is None:
            raise ValueError(
                "Design space must be provided when using "
                "a normalize_design_space option!"
            )
        unnormalize_vect = design_space.unnormalize_vect

        def normalize_gradient(x_vect):
            """Unnormalize a gradient."""
            return unnormalize_vect(x_vect, minus_lb=False, no_check=True)

        x_scaled, grad_scaled = [], []
        for x_v, g_v in zip(x_hist, x_grad_hist):
            x_scaled.append(design_space.normalize_vect(x_v))
            grad_scaled.append(normalize_gradient(g_v))
        return x_scaled, grad_scaled

[docs]    @staticmethod
    def get_s_k_y_k(x_hist, x_grad_hist, iteration):
        """Generate the s_k and y_k terms, respectively design variable difference and
        gradients difference between iterates.

        :param x_hist: design variables history array
        :param x_grad_hist: gradients history array
        :param iteration: iteration number for which the pair must be generated
        """
        n_iter = x_grad_hist.shape[0]
        if iteration >= n_iter:
            raise ValueError(
                "Iteration {} is higher than number of gradients "
                "in the database : {}.".format(iteration, n_iter)
            )
        s_k = atleast_2d(x_hist[iteration + 1] - x_hist[iteration]).T
        y_k = atleast_2d(x_grad_hist[iteration + 1] - x_grad_hist[iteration]).T
        return s_k, y_k

[docs]    @staticmethod
    def iterate_s_k_y_k(x_hist, x_grad_hist):
        """Generate the s_k and y_k terms, respectively design variable difference and
        gradients difference between iterates.

        :param x_hist: design variables history array
        :param x_grad_hist: gradients history array
        """
        n_iter = x_hist.shape[0]
        for k in range(n_iter - 1):
            s_k, y_k = HessianApproximation.get_s_k_y_k(x_hist, x_grad_hist, k)
            yield s_k, y_k

[docs]    def build_approximation(
        self,
        funcname,
        save_diag=False,
        first_iter=0,
        last_iter=-1,
        b_mat0=None,
        at_most_niter=-1,
        return_x_grad=False,
        func_index=None,
        save_matrix=False,
        scaling=False,
        normalize_design_space=False,
        design_space=None,
    ):  # pylint: disable=W0221
        """Builds the hessian approximation B.

        :param funcname: function name
        :param save_diag: if True, returns the list of diagonal approximations
            (Default value = False)
        :param first_iter: first iteration after which the history is extracted
            (Default value = 0)
        :param last_iter: last iteration before which the history is extracted
            (Default value = -1)
        :param b_mat0: initial approximation matrix (Default value = None)
        :param at_most_niter: maximum number of iterations to take
            (Default value = -1)
        :param return_x_grad: if True, also returns the last gradient and x
            (Default value = False)
        :param func_index: Default value = None)
            (Default value = False)
        :param scaling: do scaling step
        :param normalize_design_space: if True, scale the values
            to work in a normalized design space (x between 0 and 1)
        :param design_space: the design space used to scale all values
            mandatory if normalize_design_space==True
        :returns: the B matrix, its diagonal,
            and eventually the x and grad history pairs
            used to build B, if return_x_grad=True,
            otherwise, None and None are returned for args consistency
        """
        normalize_ds = normalize_design_space
        x_hist, x_grad_hist, _, _ = self.get_x_grad_history(
            funcname,
            first_iter,
            last_iter,
            at_most_niter,
            func_index,
            normalize_ds,
            design_space,
        )
        if b_mat0 is None:
            last_n_grad = x_grad_hist.shape[0] - 2
            s_k, y_k = self.get_s_k_y_k(x_hist, x_grad_hist, last_n_grad)
            alpha = dot(y_k.T, s_k) / dot(y_k.T, y_k)
            b_mat = (1.0 / alpha) * eye(len(x_grad_hist[0]))
        elif b_mat0.size == 0:
            n_x = len(x_hist[0])
            b_mat = zeros((n_x, n_x))
        else:
            b_mat = b_mat0
        diag = []
        for s_k, y_k in self.iterate_s_k_y_k(x_hist, x_grad_hist):
            self.iterate_approximation(b_mat, s_k, y_k, scaling=scaling)
            if save_diag:
                diag.append(np_diag(b_mat).copy())
            if save_matrix:
                self.b_mat_history.append(b_mat.copy())
        if return_x_grad:
            return b_mat, diag, x_hist[-1, :], x_grad_hist[-1, :]
        return b_mat, diag, None, None

[docs]    @staticmethod
    def compute_scaling(hessk, hessk_dsk, dskt_hessk_dsk, dyk, dyt_dsk):
        """Compute scaling.

        :param hessk: previous approximation
        :param hessk_dsk: product between hessk and dsk
        :param dskt_hessk_dsk: product between dsk^t, hessk and dsk
        :param dyk: gradients difference between iterates
        :param dyt_dsk: product between dyk^t and dsk^t
        """
        coeff1 = (len(hessk_dsk) - 1) / (
            trace(hessk) - norm(hessk_dsk) ** 2 / dskt_hessk_dsk
        )
        coeff2 = dyt_dsk / norm(dyk) ** 2
        return coeff1, coeff2

[docs]    @staticmethod
    def iterate_approximation(hessk, dsk, dyk, scaling=False):
        """BFGS iteration from step k to step k+1.

        :param hessk: previous approximation
        :param dsk: design variable difference between iterates
        :param dyk: gradients difference between iterates
        :param scaling: do scaling step
        :returns: updated approximation
        """
        dyt_dsk = dot(dyk.T, dsk)
        hessk_dsk = dot(hessk, dsk)
        dskt_hessk_dsk = multi_dot((dsk.T, hessk, dsk))
        # Build the next approximation:
        b_first_term = hessk - multi_dot((hessk, dsk, dsk.T, hessk)) / dskt_hessk_dsk
        b_second_term = dot(dyk, dyk.T) / dyt_dsk
        if not scaling:
            hessk[:, :] = b_first_term + b_second_term
        else:
            c_1, c_2 = HessianApproximation.compute_scaling(
                hessk, hessk_dsk, dskt_hessk_dsk, dyk, dyt_dsk
            )
            hessk[:, :] = c_1 * b_first_term + c_2 * b_second_term

[docs]    def build_inverse_approximation(
        self,
        funcname,
        save_diag=False,
        first_iter=0,
        last_iter=-1,
        h_mat0=None,
        at_most_niter=-1,
        return_x_grad=False,
        func_index=None,
        save_matrix=False,
        factorize=False,
        scaling=False,
        angle_tol=1e-5,
        step_tol=1e10,
        normalize_design_space=False,
        design_space=None,
    ):
        """Builds the inversed hessian approximation H.

        :param funcname: function name
        :param save_diag: if True, returns the list of diagonal approximations
            (Default value = False)
        :param first_iter: first iteration after which the history is extracted
            (Default value = 0)
        :param last_iter: last iteration before which the history is extracted
            (Default value = -1)
        :param h_mat0: initial inverse approximation matrix
            (Default value = None)
        :param at_most_niter: maximum number of iterations to take
            (Default value = -1)
        :param return_x_grad: if True, also returns the last gradient and x
            (Default value = False)
        :param func_index: Default value = None)
        :param normalize_design_space: if True, scale the values
            to work in a normalized design space (x between 0 and 1)
        :param design_space: the design space used to scale all values
            mandatory if normalize_design_space==True
        :returns: the H matrix, its diagonal,
            and eventually the x and grad history pairs
            used to build H, if return_x_grad=True,
            otherwise, None and None are returned for args consistency
        """
        normalize_ds = normalize_design_space
        x_hist, x_grad_hist, _, _ = self.get_x_grad_history(
            funcname,
            first_iter,
            last_iter,
            at_most_niter,
            func_index,
            normalize_ds,
            design_space,
        )
        h_factor = None  # to become a matrix G such that H = G*G', optionally
        b_factor = None  # to become the inverse of the matrix G
        if h_mat0 is None:
            last_n_grad = x_grad_hist.shape[0] - 2
            s_k, y_k = self.get_s_k_y_k(x_hist, x_grad_hist, last_n_grad)
            alpha = dot(y_k.T, s_k) / dot(y_k.T, y_k)
            n_x = len(x_grad_hist[0])
            h_mat = alpha * eye(n_x)
            b_mat = 1.0 / alpha * eye(n_x)
            if factorize:
                h_factor = sqrt(alpha) * eye(n_x)
                b_factor = eye(n_x) / sqrt(alpha)
        elif len(h_mat0) == 0:
            n_x = len(x_hist[0])
            h_mat = zeros((n_x, n_x))
            b_mat = zeros((n_x, n_x))
            if factorize:
                h_factor = zeros((n_x, n_x))
                b_factor = zeros((n_x, n_x))
        else:
            h_mat = h_mat0
            try:
                b_mat = inv(h_mat)
            except LinAlgError:
                raise LinAlgError("The inversion of h_mat failed")
            if factorize or scaling:
                try:
                    h_factor = cholesky(h_mat)
                    b_factor = cholesky(b_mat).T
                except LinAlgError:
                    raise LinAlgError(
                        "The Cholesky decomposition of h_factor" " or b_factor failed"
                    )
        diag = []
        count = 0
        k = 0
        for s_k, y_k in self.iterate_s_k_y_k(x_hist, x_grad_hist):
            k = k + 1
            if dot(s_k.T, y_k) > angle_tol and norm(y_k, inf) < step_tol:
                count = count + 1
                self.iterate_inverse_approximation(
                    h_mat,
                    s_k,
                    y_k,
                    h_factor,
                    b_mat,
                    b_factor,
                    factorize=factorize,
                    scaling=scaling,
                )
            if save_diag:
                diag.append(np_diag(h_mat).copy())
            if save_matrix:
                self.h_mat_history.append(h_mat.copy())
        if return_x_grad:
            return h_mat, diag, x_hist[-1, :], x_grad_hist[-1, :], None, None, None
        return h_mat, diag, None, None, h_factor, b_mat, b_factor

[docs]    @staticmethod
    def compute_corrections(x_hist, x_grad_hist):
        """Computes the corrections from the history."""
        n_iter = x_hist.shape[0]
        x_corr = x_hist[1:n_iter].T - x_hist[: n_iter - 1].T
        grad_corr = x_grad_hist[1:n_iter].T - x_grad_hist[: n_iter - 1].T
        return x_corr, grad_corr

[docs]    @staticmethod
    def rebuild_history(x_corr, x_0, grad_corr, g_0):
        """Computes the history from the corrections."""
        # Rebuild the argument history:
        x_hist = repmat(x_0, x_corr.shape[1], 1) + cumsum(x_corr.T, axis=0)
        x_hist = concatenate((atleast_2d(x_0), x_hist), axis=0)
        # Rebuild the gradient history
        x_grad_hist = repmat(g_0, grad_corr.shape[1], 1) + cumsum(grad_corr.T, axis=0)
        x_grad_hist = concatenate((atleast_2d(g_0), x_grad_hist), axis=0)
        return x_hist, x_grad_hist

[docs]    @staticmethod
    def iterate_inverse_approximation(
        h_mat,
        s_k,
        y_k,
        h_factor=None,
        b_mat=None,
        b_factor=None,
        factorize=False,
        scaling=False,
    ):
        """Inverse BFGS iteration.

        :param h_mat: previous approximation
        :param s_k: design variable difference between iterates
        :param y_k: gradients difference between iterates
        :returns: updated inverse approximation
        """
        # Compute the two terms of the non-scaled updated matrix:
        yts = dot(y_k.T, s_k)
        proj = eye(len(s_k)) - dot(s_k, y_k.T) / yts
        h_first_term = multi_dot((proj, h_mat, proj.T))
        h_second_term = dot(s_k, s_k.T) / yts
        b_s = dot(b_mat, s_k)
        st_b_s = dot(s_k.T, b_s)
        # Compute the scaling coefficients:
        if scaling:
            coeff1, coeff2 = HessianApproximation.compute_scaling(
                b_mat, b_s, st_b_s, y_k, yts
            )
        else:
            coeff1, coeff2 = 1.0, 1.0
        # Update the inverse approximation H and, optionally, the factor G:
        h_mat[:, :] = h_first_term / coeff1 + h_second_term / coeff2
        if factorize:
            sst_b = dot(s_k, b_s.T)
            left = proj / sqrt(coeff1) + sst_b / sqrt(coeff2 * yts * st_b_s)
            h_factor[:, :] = dot(left, h_factor)
            # b_factor[:, :] = dot(eye(len(s_k)) - sstB.T / stBs / sqrt(coeff1)
            #                      + dot(y_k, s_k.T)
            #                      / sqrt(coeff2 * stBs * yts),
            #                      b_factor)
            right = sqrt(coeff1) * (eye(len(s_k)) - sst_b / st_b_s)
            right += sqrt(coeff2) * dot(s_k, y_k.T) / sqrt(st_b_s * yts)
            b_factor[:, :] = dot(b_factor, right)

        # Update the Hessian approximation:
        b_first_term = b_mat - multi_dot((b_s, b_s.T)) / st_b_s
        b_second_term = dot(y_k, y_k.T) / yts
        b_mat[:, :] = coeff1 * b_first_term + coeff2 * b_second_term


#             b_mat[:, :] = multi_dot((proj.T, b_mat, proj)) \
#                 + dot(y_k, y_k.T) / yts


[docs]class BFGSApprox(HessianApproximation):

    """Builds a BFGS approximation from optimization history."""

[docs]    @staticmethod
    def iterate_s_k_y_k(x_hist, x_grad_hist):
        """Generate the s_k and y_k terms, respectively design variable difference and
        gradients difference between iterates.

        :param x_hist: design variables history array
        :param x_grad_hist: gradients history array
        """
        n_iter = x_hist.shape[0]
        for k in range(n_iter - 1):
            s_k, y_k = BFGSApprox.get_s_k_y_k(x_hist, x_grad_hist, k)
            # All pairs curvatures shall be positive
            # if dot(s_k.T, y_k) > 0.:
            if dot(s_k.T, y_k) > 1e-16 * dot(y_k.T, y_k):
                yield s_k, y_k


[docs]class SR1Approx(HessianApproximation):

    """Builds a Symmetric Rank One approximation from optimization history."""

    EPSILON = 1e-8

[docs]    @staticmethod
    def iterate_approximation(b_mat, s_k, y_k, scaling=False):
        """SR1 iteration.

        :param b_mat: previous approximation
        :param s_k: design variable difference between iterates
        :param y_k: gradients difference between iterates
        :param scaling: do scaling step
        :returns: updated approximation
        """
        d_mat = y_k - multi_dot((b_mat, s_k))
        den = multi_dot((d_mat.T, s_k))
        if abs(den) > SR1Approx.EPSILON * norm(s_k) * norm(d_mat):
            b_mat[:, :] = b_mat + multi_dot((d_mat, d_mat.T)) / den
        else:
            LOGGER.debug(
                "Denominator of SR1 update is too small, " "update s_kipped %s.", den
            )


[docs]class LSTSQApprox(HessianApproximation):

    """Builds a Least squares approximation from optimization history."""

[docs]    def build_approximation(
        self,
        funcname,
        save_diag=False,
        first_iter=0,
        last_iter=-1,
        b_mat0=None,
        at_most_niter=-1,
        return_x_grad=False,
        scaling=False,
        func_index=-1,
        normalize_design_space=False,
        design_space=None,
    ):
        """Builds the hessian approximation.

        :param funcname: function name
        :param save_diag: if True, returns the list of diagonal approximations
            (Default value = False)
        :param first_iter: first iteration after which the history is extracted
            (Default value = 0)
        :param last_iter: last iteration before which the history is extracted
            (Default value = -1)
        :param b_mat0: initial approximation matrix (Default value = None)
        :param at_most_niter: Default value = -1)
        :param return_x_grad: Default value = False)
        :param func_index: Default value = -1)
        :param normalize_design_space: if True, scale the values
            to work in a normalized design space (x between 0 and 1)
        :param design_space: the design space used to scale all values
            mandatory if normalize_design_space==True
        :returns: the B matrix, its diagonal,
            and eventually the x and grad history pairs
            used to build B, if return_x_grad=True,
            otherwise, None and None are returned for args consistency
        """
        x_hist, x_grad_hist, n_iter, nparam = self.get_x_grad_history(
            funcname,
            first_iter,
            last_iter,
            at_most_niter,
            func_index=func_index,
            normalize_design_space=normalize_design_space,
            design_space=design_space,
        )
        sec_dim = max(nparam, n_iter)
        diag = []
        assert len(x_grad_hist) == len(x_hist)

        def y_to_b(y_vars):
            """Reshapes the approximation from vector to matrix.

            :param y_vars: the vector approximation
            :param y_vars: returns: the matrix shaped approximation
            :returns: the matrix shaped approximation
            """
            y_mat = y_vars.reshape((nparam, nparam))
            return y_mat + y_mat.T

        def func(y_vars):
            """Create the least square function.

            :param y_vars: the current approximation vector
            :param y_vars: returns: the estimated error vector
            :returns: the estimated error vector
            """
            b_mat_current = y_to_b(y_vars)
            err = zeros((nparam, sec_dim))
            for i in range(n_iter):
                x_l = x_hist[i] - self.x_ref
                err[:, i] = dot(b_mat_current, x_l) - x_grad_hist[i]
            err = err.reshape(nparam * sec_dim)
            if n_iter < nparam:  #
                err += y_vars
            return err

        y_0 = zeros(nparam * nparam)
        LOGGER.debug("Start least squares problem..")
        y_opt, ier = leastsq(func, x0=y_0)  # , cov_x, infodict, mesg, ier
        LOGGER.debug("End least squares, msg=%s", str(ier))
        b_mat = y_to_b(y_opt)
        if return_x_grad:
            return b_mat, diag, x_hist[-1, :], x_grad_hist[-1, :]
        return b_mat, diag, None, None