Source code for fitAlgs.fitAlg

# -*- coding: utf-8 -*-
"""
:Author: Dominic Hunt
"""
import logging
import math
import collections
import itertools

import numpy as np
import scipy as sp
#import numdifftools as nd

from fitAlgs.fitSims import FitSim
from fitAlgs import qualityFunc
from fitAlgs import boundFunc

import utils


[docs]class FitAlg(object):
    """
    The abstract class for fitting data

    Parameters
    ----------
    fit_sim : fitAlgs.fitSims.FitSim instance, optional
        An instance of one of the fitting simulation methods. Default ``fitAlgs.fitSims.FitSim``
    fit_measure : string, optional
        The name of the function used to calculate the quality of the fit.
        The value it returns provides the fitter with its fitting guide. Default ``-loge``
    fit_measure_args : dict, optional
        The parameters used to initialise fit_measure and extra_fit_measures. Default ``None``
    extra_fit_measures : list of strings, optional
        List of fit measures not used to fit the model, but to provide more information. Any arguments needed for these
        measures should be placed in fit_measure_args. Default ``None``
    bounds : dictionary of tuples of length two with floats, optional
        The boundaries for methods that use bounds. If unbounded methods are
        specified then the bounds will be ignored. Default is ``None``, which
        translates to boundaries of (0, np.inf) for each parameter.
    boundary_excess_cost : str or callable returning a function, optional
        The function is used to calculate the penalty for exceeding the boundaries.
        Default is ``boundFunc.scalarBound()``
    boundary_excess_cost_properties : dict, optional
        The parameters for the boundary_excess_cost function. Default {}
    calculate_covariance : bool, optional
        Is the covariance calculated. Default ``False``


    Attributes
    ----------
    Name : string
        The name of the fitting method

    See Also
    --------
    fitAlgs.fitSims.fitSim : The general fitting class

    """

    def __init__(self,
                 fit_sim=None, fit_measure='-loge', fit_measure_args=None, extra_fit_measures=None,
                 bounds=None, boundary_excess_cost=None, boundary_excess_cost_properties=None, bound_ratio=0.000001,
                 calculate_covariance=False, **kwargs):

        if fit_sim is None:
            self.fit_sim = FitSim()
        elif isinstance(fit_sim, FitSim):
            self.fit_sim = fit_sim
        else:
            raise NameError("fitSim type is incorrect: {}".format(type(fit_sim)))

        if bounds is None:
            raise NameError("Please specify bounds for your parameters")
        else:
            self.boundaries = bounds

        if fit_measure_args is None:
            fit_measure_args = {}

        if extra_fit_measures is None:
            extra_fit_measures = []

        self.Name = self.find_name()

        if callable(boundary_excess_cost):
            if boundary_excess_cost_properties is not None:
                boundary_excess_cost_kwargs = {k: v for k, v in kwargs.items()
                                                    if k in boundary_excess_cost_properties}
            else:
                boundary_excess_cost_kwargs = kwargs.copy()
            self.boundary_excess_cost = boundary_excess_cost(**boundary_excess_cost_kwargs)
        elif isinstance(boundary_excess_cost, str):
            boundary_excess_cost_function = utils.find_function(boundary_excess_cost, 'fitAlgs', excluded_files=['fit'])
            boundary_excess_cost_kwargs = {k: v for k, v in kwargs.items()
                                           if k in utils.get_function_args(boundary_excess_cost_function)}
            self.boundary_excess_cost = boundary_excess_cost_function(**boundary_excess_cost_kwargs)
        else:
            self.boundary_excess_cost = boundFunc.scalarBound()

        self.fit_quality_function = qualityFunc.qualFuncIdent(fit_measure, **fit_measure_args.copy())
        self.calculate_covariance = calculate_covariance
        if self.calculate_covariance:
            self.hessInc = {k: bound_ratio * (u - l) for k, (l, u) in self.boundaries.items()}

        self.measures = {m: qualityFunc.qualFuncIdent(m, **fit_measure_args.copy()) for m in extra_fit_measures}

        self.fit_info = {'Name': self.Name,
                         'fit_measure_function': fit_measure,
                         'fit_measure_arguments': fit_measure_args,
                         'boundary_cost_function': utils.callableDetailsString(boundary_excess_cost),
                         'bounds': self.boundaries,
                         'extra_fit_measures': extra_fit_measures,
                         'calculate_covariance': calculate_covariance,
                         'bound_ratio': bound_ratio,
                         'FitSim': self.fit_sim.info()}

        self.boundary_values = None
        self.boundary_names = None

        self.tested_parameters = []
        self.tested_parameter_qualities = []

        self.logger = logging.getLogger(self.Name)

    def __repr__(self):

        return repr(self.info())

[docs]    def find_name(self):
        """
        Returns the name of the class
        """

        return self.__class__.__name__

[docs]    def participant(self, model, model_parameters, model_properties, participant_data):
        """
        Fit participant data to a model for a given task

        Parameters
        ----------
        model : model.modelTemplate.Model inherited class
            The model you wish to try and fit values to
        model_parameters : dict
            The model initial parameters
        model_properties : dict
            The model static properties
        participant_data : dict
            The participant data

        Returns
        -------
        model : model.modelTemplate.Model inherited class instance
            The model with the best fit parameters
        fit_quality : float
            Specifies the fit quality for this participant to the model
        fitting_data : tuple of OrderedDict and list
            They are an ordered dictionary containing the parameter values tested, in the order they were tested, and the
            fit qualities of these parameters.
        """

        sim = self.fit_sim.prepare_sim(model, model_parameters, model_properties, participant_data)

        model_initial_parameters = list(model_parameters.values())
        model_parameter_names = list(model_parameters.keys())

        best_fit_parameters, fit_quality, fit_info = self.fit(sim, model_parameter_names, model_initial_parameters[:])

        model_run = self.fit_sim.fitted_model(*best_fit_parameters)

        fit_measures = self.extra_measures(*best_fit_parameters)

        testedParamDict = collections.OrderedDict([(key, val[0]) for key, val in zip(model_parameter_names, np.array(fit_info[0]).T)])

        fitting_data = {"tested_parameters": testedParamDict,
                        "fit_qualities": fit_info[1],
                        "fit_quality": fit_quality,
                        "final_parameters": collections.OrderedDict([(key, val) for key, val in zip(model_parameter_names, best_fit_parameters)])}

        fitting_data.update({"fit_quality_" + k: v for k, v in fit_measures.items()})

        if self.calculate_covariance:
            covariance = self.covariance(model_parameter_names, best_fit_parameters, fit_info[2])
            covdict = ({"fit_quality_cov_{}_{}".format(p1, p2): c for p1, cr in zip(model_parameter_names, covariance)
                                                                 for p2, c in zip(model_parameter_names, cr)})
            fitting_data.update(covdict)

        try:
            fitting_data.update(fit_info[2])
        finally:
            return model_run, fit_quality, fitting_data

[docs]    def fit(self, simulator, model_parameter_names, model_initial_parameters):
        """
        Runs the model through the fitting algorithms and starting parameters
        and returns the best one. This is the abstract version that always
        returns ``(0,0)``

        Parameters
        ----------
        simulator : function
            The function used by a fitting algorithm to generate a fit for
            given model parameters. One example is ``fitAlgs.fitAlg.fitness``
        model_parameter_names : list of strings
            The list of initial parameter names
        model_initial_parameters : list of floats
            The list of the initial parameters

        Returns
        -------
        best_fit_parameters : list of floats
            The best fitting parameters
        fit_quality : float
            The quality of the fit as defined by the quality function chosen.
        tested_parameters : tuple of two lists
            The two lists are a list containing the parameter values tested, in the order they were tested, and the
            fit qualities of these parameters.

        See Also
        --------
        fitAlgs.fitAlg.fitness

        """
        # TODO : consider passing model_parameter_names and model_initial_parameters as one parameter

        self.simulator = simulator
        self.tested_parameters = []
        self.tested_parameter_qualities = []
        best_fit_parameters = 0
        fit_quality = np.inf

        return best_fit_parameters, fit_quality, (self.tested_parameters, self.tested_parameter_qualities)

[docs]    def fitness(self, *params):
        """
        Generates a fit quality value used by the fitting function. This is the function passed to the fitting function.

        Parameters
        ----------
        *params : array of floats
            The parameters proposed by the fitting algorithm

        Returns
        -------
        fit_quality : float
            The fit quality value calculated using the fitQualFunc function

        See Also
        --------
        fitAlgs.qualityFunc : the module of fitQualFunc functions
        fitAlg.invalidParams : Checks if the parameters are valid and if not returns ``inf``
        fitAlgs.fitSims.fitSim.fitness : Runs the model simulation and returns the values used to calculate the fit quality

        """
        # This is because the fitting functions return an array and we want a list
        pms = list(*params)

        # Start by checking that the parameters are valid
        if self.invalid_parameters(*pms):
            pseudo_fit_quality = self.boundary_excess_cost(pms, self.boundary_values, self.fit_quality_function)
            return pseudo_fit_quality

        # Run the simulation with these parameters
        modVals = self.simulator(*pms)

        fit_quality = self.fit_quality_function(modVals)

        self.tested_parameters.append(pms)
        self.tested_parameter_qualities.append(fit_quality)

        return fit_quality

[docs]    def extra_measures(self, *model_parameter_values):
        """

        Parameters
        ----------
        *model_parameter_values : array of floats
            The parameters proposed by the fitting algorithm

        Returns
        -------
        fit_quality : dict of float
            The fit quality value calculated using the fit quality functions described in extraMeasures

        """

        modVals = self.simulator(*model_parameter_values)

        measureVals = {}
        for m, f in self.measures.items():

            fit_quality = f(modVals)
            measureVals[m] = fit_quality

        return measureVals

[docs]    def covariance(self, model_parameter_names, paramvals, fitinfo):
        """
        The covariance at a point

        Parameters
        ----------
        paramvals : array or list
            The parameters at which the
        fitinfo : dict
            The

        Returns
        -------
        covariance : float
            The covariance at the point paramvals

        """

        if 'hess_inv' in fitinfo:
            cov = fitinfo['hess_inv']
        elif 'hess' in fitinfo:
            hess = fitinfo['hess']
            cov = np.linalg.inv(hess)
        #elif 'jac' in fitinfo:
        #    jac = fitinfo['jac']
        #    cov = covariance(jac)
        else:
            inc = [self.hessInc[p] for p in model_parameter_names]
            # TODO : Check if this is correct or replace it with other methods
            jac = np.expand_dims(sp.optimise.approx_fprime(paramvals, self.fitness, inc), axis=0)
            #cov = covariance(jac)
            cov = np.linalg.inv(np.dot(jac.T, jac))

        return cov

[docs]    def info(self):
        """
        The information relating to the fitting method used

        Includes information on the fitting algorithm used

        Returns
        -------
        info : (dict,dict)
            The fitSims info and the fitAlgs.fitAlg info

        See Also
        --------
        fitAlg.fitSims.fitSim.info
        """

        return self.fit_info

[docs]    def set_bounds(self, model_parameter_names):
        """
        Checks if the bounds have changed

        Parameters
        ----------
        model_parameter_names : list of strings
            An ordered list of the names of the parameters to be fitted

        Examples
        --------
        >>> a = FitAlg(bounds={1: (0, 5), 2: (0, 2), 3: (-1, 1)})
        >>> a.boundaries
        {1: (0, 5), 2: (0, 2), 3: (-1, 1)}
        >>> a.set_bounds([])
        >>> a.boundaries
        {1: (0, 5), 2: (0, 2), 3: (-1, 1)}
        >>> a.boundary_names
        []
        >>> a.set_bounds([3,1])
        >>> a.boundary_values
        [(-1, 1), (0, 5)]
        >>> a.set_bounds([2,1])
        >>> a.boundary_values
        [(0, 2), (0, 5)]
        """

        bounds = self.boundaries
        boundNames = self.boundary_names
        boundVals = self.boundary_values

        # Check if the bounds have changed or should be added
        if boundNames:
            changed = False
            for m, b in zip(model_parameter_names, boundNames):
                if m != b:
                    changed = True
                    break
        else:
            changed = True

        # If they have not, then we can leave
        if not changed:
            if len(model_parameter_names) == len(boundNames):
                return

        # If no bounds were defined
        if not bounds:
            boundVals = [(0, float('Inf')) for i in model_parameter_names]
            self.boundaries = {k: v for k, v in zip(model_parameter_names, boundVals)}
            self.boundary_names = model_parameter_names
            self.boundary_values = boundVals
        else:
            self.boundary_values = [bounds[k] for k in model_parameter_names]
            self.boundary_names = model_parameter_names

        return

[docs]    @classmethod
    def startParams(cls, initial_parameters, bounds=None, number_starting_points=3):
        """
        Defines a list of different starting parameters to run the minimization
        over

        Parameters
        ----------
        initial_parameters : list of floats
            The initial starting values proposed
        bounds : list of tuples of length two with floats, optional
            The boundaries for methods that use bounds. If unbounded methods are
            specified then the bounds will be ignored. Default is ``None``, which
            translates to boundaries of (0,float('Inf')) for each parameter.
        number_starting_points : int
            The number of starting parameter values to be calculated around
            each initial point

        Returns
        -------
        startParamSet : list of list of floats
            The generated starting parameter combinations

        See Also
        --------
        FitAlg.start_parameter_values : Used in this function

        Examples
        --------
        >>> FitAlg.startParams([0.5,0.5], number_starting_points=2)
        array([[0.33333333, 0.33333333],
               [0.66666667, 0.33333333],
               [0.33333333, 0.66666667],
               [0.66666667, 0.66666667]])
        """

        if bounds is None:
            # We only have the values passed in as the starting parameters
            startLists = (cls.start_parameter_values(i, number_starting_points=number_starting_points) for i in initial_parameters)

        else:
            if len(bounds) != len(initial_parameters):
                raise ValueError('Bounds do not fit the number of initial parameters', str(len(bounds)), str(len(initial_parameters)))

            startLists = (cls.start_parameter_values(i, boundary_min=bMin, boundary_max=bMax, number_starting_points=number_starting_points)
                          for i, (bMin, bMax) in zip(initial_parameters, bounds))

        startSets = utils.listMergeNP(*startLists)

        return startSets

[docs]    @staticmethod
    def start_parameter_values(initial, boundary_min=float('-Inf'), boundary_max=float('Inf'), number_starting_points=3):
        """
        Provides a set of starting points

        Parameters
        ----------
        initial : float
            The initial starting value proposed
        boundary_min : float, optional
            The minimum value of the parameter. Default is ``float('-Inf')``
        boundary_max : float, optional
            The maximum value of the parameter. Default is ``float('Inf')``
        number_starting_points : int
            The number of starting parameter values to be calculated around the inital
            point

        Returns
        -------
        startParams : list of floats
            The generated starting parameters

        Notes
        -----
        For each starting parameter provided a set of numStartPoints
        starting points will be chosen, surrounding the starting point provided. If
        the starting point provided is less than one but greater than zero it
        will be assumed that the values cannot leave those bounds, otherwise,
        unless otherwise told, it will be assumed that they can take any
        positive value and will be chosen to be eavenly spaced around the
        provided value.

        Examples
        --------
        >>> FitAlg.start_parameter_values(0.5)
        array([0.25, 0.5 , 0.75])
        >>> FitAlg.start_parameter_values(5)
        array([2.5, 5. , 7.5])
        >>> FitAlg.start_parameter_values(-5)
        array([2.5, 5. , 7.5])
        >>> FitAlg.start_parameter_values(5, boundary_min = 0, boundary_max = 7)
        array([4., 5., 6.])
        >>> FitAlg.start_parameter_values(5, boundary_min = -3, boundary_max = 30)
        array([1., 5., 9.])
        >>> FitAlg.start_parameter_values(5, boundary_min = 0, boundary_max = 30)
        array([2.5, 5. , 7.5])
        >>> FitAlg.start_parameter_values(5, boundary_min = 3, boundary_max = 30, number_starting_points = 7)
        array([3.5, 4. , 4.5, 5. , 5.5, 6. , 6.5])
        """

#        initialAbs = abs(initial)

        # The number of initial points per parameter
        divVal = (number_starting_points + 1) / 2

        if boundary_max is None or math.isinf(boundary_max):
            # We can also assume any number smaller than one should stay
            # smaller than one.
            if initial < 1 and initial > 0:
                valMax = 1
            else:
                valMax = float('inf')
        else:
            valMax = boundary_max

        if boundary_min is None or math.isinf(boundary_min):
            # We can also assume any number larger than one should stay
            # bigger than zero.
            if initial > 0:
                valMin = 0

                initialAbs = initial
                valAbsMax = valMax

            else:
                # this should never happen, but regardless
                valMin = 0

                initialAbs = abs(initial)
                valAbsMax = abs(valMax) + initialAbs

        else:
            valMin = boundary_min

            initialAbs = initial - valMin
            valAbsMax = valMax - valMin

        # Now that the bounds have been set we have shifted the space to
        # calculate the points in the space and then shift them back.

        # We can assume that any initial parameter proposed has the
        # correct order of magnitude.
        vMin = initialAbs / divVal

        if number_starting_points*vMin > valAbsMax:
            inc = (valAbsMax - initialAbs) / divVal
            vMin = valAbsMax - number_starting_points * inc
            vMax = valAbsMax - inc
        else:
            vMax = vMin * number_starting_points

        points = np.linspace(vMin, vMax, number_starting_points) + valMin

        return points

[docs]    def invalid_parameters(self, *params):
        """
        Identifies if the parameters passed are within the bounds provided

        If they are not returns ``inf``

        Parameters
        ----------
        params : list of floats
            Parameters to be passed to the sim

        Returns
        -------
        validity : Bool
            If the parameters are valid or not

        Notes
        -----
        No note

        Examples
        --------
        >>> a = FitAlg(bounds={1:(0,5), 2:(0,2), 3:(-1,1)})
        >>> a.set_bounds([3, 1])
        >>> a.invalid_parameters(0, 0)
        False
        >>> a.invalid_parameters(2, 0)
        True
        >>> a.invalid_parameters(0, -1)
        True
        >>> a.invalid_parameters(6, 6)
        True
        """

        for p, (mi, ma) in zip(params, self.boundary_values):

            if p < mi or p > ma:
                return True

        return False


[docs]def covariance(jac):
    """ Calculates the covariance based on the estimated jacobian

    Inspired by how this is calculated in scipy.optimise.curve_fit, as found at
    https://github.com/scipy/scipy/blob/2526df72e5d4ca8bad6e2f4b3cbdfbc33e805865/scipy/optimize/minpack.py#L739

    """

    # Do Moore-Penrose inverse discarding zero singular values.
    U, s, VT = np.linalg.svd(jac, full_matrices=False)
    threshold = np.finfo(float).eps * max(jac.shape) * s[0]
    s = s[s > threshold]
    VT = VT[:s.size]
    cov = np.dot(VT.T / s ** 2, VT)

    # Alternative method found, but assumes the residuals are small
    #cov = np.linalg.inv(np.dot(jac.T, jac))

    return cov