# Source code for dataFitting

# -*- coding: utf-8 -*-
"""
:Author: Dominic Hunt
"""
import logging
import collections
import copy
import fire

import pandas as pd

from typing import Any

import outputting
import utils
import data

from fitAlgs.fitSims import FitSim
from fitAlgs.fitAlg import FitAlg
from modelGenerator import ModelGen


class LengthError(Exception):
    """Raised when the bounds given for a model parameter are not exactly two values."""


class OrderError(Exception):
    """Raised when the bounds given for a model parameter do not have the lower bound first."""


def run(data_folder='./',
        data_format='csv',
        data_file_filter=None,
        data_file_terminal_ID=True,
        data_read_options=None,
        data_split_by=None,
        data_group_by=None,
        data_extra_processing=None,
        model_name='QLearn',
        model_changing_properties=None,
        model_constant_properties=None,
        participantID="Name",
        participant_choices='Actions',
        participant_rewards='Rewards',
        model_fit_value='ActionProb',
        fit_subset=None,
        task_stimuli=None,
        participant_action_options=None,
        fit_method='Evolutionary',
        fit_method_args=None,
        fit_measure='-loge',
        fit_measure_args=None,
        fit_extra_measures=None,
        participant_varying_model_parameters=None,
        label=None,
        save_fitting_progress=False,
        config_file=None,
        output_path=None,
        pickle=False,
        boundary_excess_cost_function=None,
        min_log_level='INFO',
        numpy_error_level="log",
        fit_float_error_response_value=1 / 1e100,
        calculate_covariance=False
        ):
    """
    A framework for fitting models to data for tasks, along with
    recording the data associated with the fits.

    Parameters
    ----------
    data_folder : string or list of strings, optional
        The folder where the data can be found. Default is the current folder.
    data_format : string, optional
        The file type of the data, from ``mat``, ``csv``, ``xlsx`` and ``pkl``. Default is ``csv``
    data_file_filter : callable, string, list of strings or None, optional
        A function to process the file names or a list of possible prefixes as strings or a single string.
        Default ``None``, no file names removed
    data_file_terminal_ID : bool, optional
        Is there an ID number at the end of the filename? If not then a more general search will be performed.
        Default ``True``
    data_read_options : dict, optional
        The keyword arguments for the data importing method chosen
    data_split_by : string or list of strings, optional
        If multiple participant datasets are in one file sheet, this specifies the column or columns that can
        distinguish and identify the rows for each participant. Default ``None``
    data_group_by : list of strings, optional
        A list of parts of filenames that are repeated across participants, identifying all the files that should
        be grouped together to form one participant's data. The rest of the filename is assumed to identify the
        participant. Default is ``None``
    data_extra_processing : callable, optional
        A function that modifies the dictionary of data read for each participant such that it is appropriate
        for fitting. Default is ``None``
    model_name : string, optional
        The name of the file where a model.modelTemplate.Model class can be found. Default ``QLearn``
    model_changing_properties : dictionary with values of tuple of two floats, optional
        Parameters are the options that you allow to vary across model fits. Each model parameter is specified as a
        dict key. The value is a tuple containing the lower and upper search bounds, in that order, e.g. ``alpha``
        has the bounds (0, 1). The midpoint of the bounds is used as the parameter value handed to the model
        generator. Default ``None``
    model_constant_properties : dictionary of float, string or binary valued elements, optional
        These contain all the model options that define the version
        of the model being studied. Default ``None``
    participantID : str, optional
        The key (label) used to identify each participant. Default ``Name``
    participant_choices : string, optional
        The participant data key of their action choices. Default ``'Actions'``
    participant_rewards : string, optional
        The participant data key of the participant reward data. Default ``'Rewards'``
    model_fit_value : string, optional
        The key to be compared in the model data. Default ``'ActionProb'``
    fit_subset : ``float('Nan')``, ``None``, ``"rewarded"``, ``"unrewarded"``, ``"all"`` or list of int, optional
        Describes which, if any, subset of trials will be used to evaluate the performance of the model.
        This can either be described as a list of trial numbers or, by passing
        - ``"all"`` for fitting all trials
        - ``float('Nan')`` or ``"unrewarded"`` for all those trials whose feedback was ``float('Nan')``
        - ``"rewarded"`` for those who had feedback that was not ``float('Nan')``
        Default ``None``, which means all trials will be used.
    task_stimuli : list of strings or None, optional
        The keys containing the observational parameters seen by the
        participant before taking a decision on an action. Default ``None``
    participant_action_options : string or list of strings or None or one element list with a list, optional
        If a string or list of strings these are treated as dict keys where the valid actions for each trial can
        be found. If None then all trials will use all available actions. If the list contains one list then it will
        be treated as a list of valid actions for each trialstep. Default ``None``
    fit_method : string, optional
        The fitting method to be used. The names accepted are those of the modules in the folder fitAlgs containing a
        FitAlg class. Default ``'Evolutionary'``
    fit_method_args : dict, optional
        A dictionary of arguments specific to the fitting method. Default ``None``
    fit_measure : string, optional
        The name of the function used to calculate the quality of the fit.
        The value it returns provides the fitter with its fitting guide. Default ``-loge``
    fit_measure_args : dict, optional
        The parameters used to initialise ``fit_measure`` and ``fit_extra_measures``. Default ``None``
    fit_extra_measures : list of strings, optional
        List of fit measures not used to fit the model, but to provide more information. Any arguments needed for these
        measures should be placed in ``fit_measure_args``. Default ``None``
    participant_varying_model_parameters : dict of string, optional
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data file,
        as a string, and the value is the associated label in the model, also as a string. Default ``None``,
        which is treated as an empty dictionary
    label : string, optional
        The label for the data fitting. Default ``None`` will mean no data is saved to files.
    save_fitting_progress : bool, optional
        Specifies if the results from each iteration of the fitting process should be returned. Default ``False``
    config_file : string, optional
        The file name and path of a ``.yaml`` configuration file. Overrides all other parameters if found.
        Default ``None``
    output_path : string, optional
        The path that will be used for the run output. Default ``None``
    pickle : bool, optional
        If true the data for each model, and participant is recorded.
        Default is ``False``
    boundary_excess_cost_function : str or callable returning a function, optional
        The function is used to calculate the penalty for exceeding the boundaries.
        Default ``None``, which is passed on unchanged to the fitting class (previously documented as
        resulting in ``boundFunc.scalarBound()`` -- confirm against ``FitAlg``)
    min_log_level : str, optional
        Defines the level of the log from (``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``, ``CRITICAL``). Default ``INFO``
    numpy_error_level : {'log', 'raise'}
        Defines the response to numpy errors. Default ``log``. See numpy.seterr
    fit_float_error_response_value : float, optional
        If a floating point error occurs when running a fit the fitter function
        will return this value. Default is ``1/1e100``
    calculate_covariance : bool, optional
        Is the covariance calculated. Default ``False``

    Raises
    ------
    LengthError
        If the bounds of a parameter in ``model_changing_properties`` do not contain exactly two values
    OrderError
        If the bounds of a parameter in ``model_changing_properties`` have the upper bound first

    Notes
    -----
    ``label``, ``config_file``, ``output_path``, ``min_log_level`` and ``numpy_error_level`` are not used directly
    in this function. They are only passed on, with every other argument, inside the ``config`` snapshot given to
    ``outputting.Saving``.

    See Also
    --------
    modelGenerator : The model factory
    outputting : The outputting functions
    fitAlgs.fitAlg.FitAlg : General class for a method of fitting data
    fitAlgs.fitSims.FitSim : General class for a method of simulating the fitting of data
    data.Data : Data import class
    """
    # Must stay the first statement: at this point locals() holds exactly the call arguments
    config = copy.deepcopy(locals())

    if participant_varying_model_parameters is None:
        model_changing_variables = {}
    else:
        model_changing_variables = participant_varying_model_parameters

    # TODO : Validate model_changing_properties with the data and the model
    participants = data.Data.load_data(file_type=data_format,
                                       folders=data_folder,
                                       file_name_filter=data_file_filter,
                                       terminal_ID=data_file_terminal_ID,
                                       split_by=data_split_by,
                                       participantID=participantID,
                                       choices=participant_choices,
                                       feedbacks=participant_rewards,
                                       stimuli=task_stimuli,
                                       action_options=participant_action_options,
                                       group_by=data_group_by,
                                       extra_processing=data_extra_processing,
                                       data_read_options=data_read_options)

    model_parameters = _initial_model_parameters(model_changing_properties)

    models = ModelGen(model_name=model_name,
                      parameters=model_parameters,
                      other_options=model_constant_properties)

    model_simulator = FitSim(participant_choice_property=participants.choices,
                             participant_reward_property=participants.feedbacks,
                             model_fitting_variable=model_fit_value,
                             fit_subset=fit_subset,
                             task_stimuli_property=participants.stimuli,
                             action_options_property=participants.action_options,
                             float_error_response_value=fit_float_error_response_value
                             )

    fitting_method = utils.find_class(fit_method,
                                      class_folder='fitAlgs',
                                      inherited_class=FitAlg,
                                      excluded_files=['boundFunc', 'qualityFunc', 'fitSims'])

    if fit_method_args is None:
        fit_method_args = {}

    fitter = fitting_method(fit_sim=model_simulator,
                            fit_measure=fit_measure,
                            extra_fit_measures=fit_extra_measures,
                            fit_measure_args=fit_measure_args,
                            bounds=model_changing_properties,
                            boundary_excess_cost=boundary_excess_cost_function,
                            calculate_covariance=calculate_covariance,
                            **fit_method_args)

    with outputting.Saving(config=config) as file_name_generator:

        # The logger is fetched inside the context, as in the original code
        # (presumably ``Saving`` configures logging on entry -- confirm)
        logger = logging.getLogger('Fitting')

        log_fitting_parameters(fitter.info())

        logger.info('Beginning the data fitting')

        # Initialise the stores of information
        participant_fits = collections.defaultdict(list)  # type: collections.defaultdict[Any, list]

        # model_ID is the position of the model in the sequence produced by the model generator
        for model_ID, (model, model_parameter_variables, model_static_args) in enumerate(models.iter_details()):

            # Placeholder values, so that the log shows which settings are participant specific
            for model_label in model_changing_variables.values():
                model_static_args[model_label] = "<Varies for each participant>"

            log_model_fitting_parameters(model, model_parameter_variables, model_static_args)

            participant_id_key = participants.participantID
            for participant in participants:

                participant_name = participant[participant_id_key]
                if isinstance(participant_name, (list, tuple)):
                    participant_name = participant_name[0]

                # Replace the placeholders with this participant's own values
                for data_label, model_label in model_changing_variables.items():
                    model_static_args[model_label] = participant[data_label]

                # Find the best model values from those proposed
                logger.info("Beginning participant fit for participant {}".format(participant_name))

                model_fitted, fit_quality, fitting_data = fitter.participant(model,
                                                                             model_parameter_variables,
                                                                             model_static_args,
                                                                             participant)

                logger.debug("Participant fitted")

                log_model_fitted_parameters(model_parameter_variables, model_fitted.params(), fit_quality,
                                            participant_name)

                participant_fits = record_participant_fit(participant, participant_name,
                                                          model_fitted.returnTaskState(),
                                                          str(model_ID), fitting_data, model_changing_variables,
                                                          participant_fits, fileNameGen=file_name_generator,
                                                          pickleData=pickle,
                                                          saveFittingProgress=save_fitting_progress)

        fit_record(participant_fits, file_name_generator)


def _initial_model_parameters(model_changing_properties):
    """
    Validate the parameter search bounds and return the midpoint of each pair of bounds.

    Parameters
    ----------
    model_changing_properties : dict of two-element sequences, or None
        The ``(lower, upper)`` search bounds of each fitted model parameter

    Returns
    -------
    model_parameters : dict or None
        The midpoint of the bounds for each parameter. A falsy input (``None`` or an empty dictionary) is
        returned unchanged.

    Raises
    ------
    LengthError
        If the bounds of a parameter do not contain exactly two values
    OrderError
        If the upper bound of a parameter is smaller than its lower bound
    """
    if not model_changing_properties:
        return model_changing_properties

    model_parameters = {}
    for key, value in model_changing_properties.items():
        if len(value) != 2:
            raise LengthError("The parameter values for the ``model_changing_properties`` must be presented as a list of the minimum and maximum values. Review those of ``{}``".format(key))
        lower, upper = value
        if upper < lower:
            raise OrderError('The bounds specified for model parameter ``{}`` must have the lower bound first'.format(key))
        model_parameters[key] = (lower + upper) / 2
    return model_parameters


# %% Data record functions  
def record_participant_fit(participant, part_name, model_data, model_name, fitting_data, partModelVars, participantFits,
                           fileNameGen=None, pickleData=False, saveFittingProgress=False, expData=None):
    """
    Record the data relevant to the participant fitting

    The ``participant`` and ``fitting_data`` dictionaries are given a default ``"Name"`` (and, for the participant,
    ``"assigned_name"``) entry if they do not already have one. Nothing is stored or pickled, and
    ``participantFits`` is returned unchanged, when ``fileNameGen`` is ``None``.

    Parameters
    ----------
    participant : dict
        The participant data
    part_name : int or string
        The identifier for each participant
    model_data : dict
        The data from the model
    model_name : str
        The label given to the model
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter values
        tested, in the order they were tested, and a list of the fit qualities of these parameters
    partModelVars : dict of string
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data file,
        as a string, and the value is the associated label in the model, also as a string.
    participantFits : defaultdict of lists
        A dictionary to be filled with the summary of the participant fits
    fileNameGen : function or None
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string. Default ``None``
    pickleData : bool, optional
        If true the data for each model, task and participant is recorded.
        Default is ``False``
    saveFittingProgress : bool, optional
        Specifies if the results from each iteration of the fitting process should be returned. Default ``False``
    expData : dict, optional
        The data from the task. Default ``None``

    Returns
    -------
    participantFits : defaultdict of lists
        A dictionary to be filled with the summary of the previous and current participant fits

    See Also
    --------
    outputting.pickleLog : records the pickled data
    """
    logger = logging.getLogger('Logging')
    part_name_str = str(part_name)

    logger.info("Recording participant " + part_name_str + " model fit")

    # Suffix shared by every file written for this model / participant combination
    label = "_Model-{}_Part-{}".format(model_name, part_name_str)

    participant_label = "Participant " + part_name_str

    # Only fill in the names when the data did not provide them
    participant.setdefault("Name", participant_label)
    participant.setdefault("assigned_name", participant_label)
    fitting_data.setdefault("Name", participant_label)

    if not fileNameGen:
        return participantFits

    logger.info("Store data for " + participant_label)

    participantFits = record_fitting(fitting_data, label, participant, partModelVars, participantFits, fileNameGen,
                                     save_fitting_progress=saveFittingProgress)

    if pickleData:
        if expData is not None:
            outputting.pickleLog(expData, fileNameGen, "_expData" + label)
        outputting.pickleLog(model_data, fileNameGen, "_modelData" + label)
        outputting.pickleLog(participant, fileNameGen, "_partData" + label)
        outputting.pickleLog(fitting_data, fileNameGen, "_fitData" + label)

    return participantFits


# %% Recording
def record_fitting(fitting_data, label, participant, participant_model_variables, participant_fits, file_name_generator,
                   save_fitting_progress=False):
    """
    Records formatted versions of the fitting data

    One entry is appended to every summary column per call, so that the columns stay aligned row by row across
    participants.

    Parameters
    ----------
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter values
        tested, in the order they were tested, and a list of the fit qualities of these parameters.
        Must contain the key ``"final_parameters"``
    label : str
        The label used to identify the fit in the file names
    participant : dict
        The participant data. Must contain the keys ``"Name"`` and ``"assigned_name"``
    participant_model_variables : dict of string
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data file,
        as a string, and the value is the associated label in the model, also as a string.
    participant_fits : defaultdict of lists
        A dictionary to be filled with the summary of the participant fits
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string. Only used when ``save_fitting_progress``
        is true
    save_fitting_progress : bool, optional
        Specifies if the results from each iteration of the fitting process should be saved to an xlsx file.
        Default ``False``

    Returns
    -------
    participant_fits : defaultdict of lists
        A dictionary to be filled with the summary of the previous and current participant fits

    """
    participant_fits["Name"].append(participant["Name"])
    participant_fits["assigned_name"].append(participant["assigned_name"])

    # Every fit quality measure is stored under a key containing 'fit_quality'
    for key, value in fitting_data.items():
        if 'fit_quality' in key:
            participant_fits[key].append(value)

    for parameter, value in fitting_data["final_parameters"].items():
        participant_fits[parameter].append(value)

    # Appended, not assigned: assigning replaced the column with the latest participant's value
    for data_label, model_label in participant_model_variables.items():
        participant_fits[model_label].append(participant[data_label])

    if save_fitting_progress:
        # A shallow copy is handed over because the xlsx writer removes entries from it
        xlsx_fitting_data(fitting_data.copy(), "ParameterFits" + label, participant, file_name_generator)

    return participant_fits


#%% logging
def log_model_fitting_parameters(model, model_fit_variables, model_other_args):
    """
    Logs the model and task parameters that are used as initial fitting conditions

    The model is instantiated once, with both sets of arguments, to read its name from the ``'Name'`` entry of
    ``params()``. One message is sent to the ``Fitting`` logger at ``INFO`` level.

    Parameters
    ----------
    model : callable
        The model class (or factory). It is called with the combined keyword arguments and the result must
        provide a ``params()`` method returning a dictionary with a ``'Name'`` entry
    model_fit_variables : dict
        The model parameters that will be fitted over and varied.
    model_other_args : dict
        The other parameters used in the model whose attributes have been
        modified by the user
    """
    # model_other_args takes precedence where a key appears in both dictionaries
    model_args = copy.copy(model_fit_variables)
    model_args.update(model_other_args)
    model_name = model(**model_args).params()['Name']

    fitted_descriptions = [k + ' around ' + str(v) for k, v in model_fit_variables.items()]

    # Fitted parameters are left out here as they are already listed above
    other_descriptions = [k + ' = ' + str(v).replace('\n', '').strip('[](){}<>')
                          for k, v in model_other_args.items()
                          if k not in model_fit_variables]

    message = "The fit will use the model ``{}``".format(model_name)
    message += " fitted with the parameters " + ", ".join(fitted_descriptions)
    message += " and using the other user specified parameters " + ", ".join(other_descriptions)

    logging.getLogger('Fitting').info(message)


def log_model_fitted_parameters(model_fit_variables, model_parameters, fit_quality, participant_name):
    """
    Logs the fitted parameter values and the fit quality found for a participant

    One message is sent to the ``Fitting`` logger at ``INFO`` level.

    Parameters
    ----------
    model_fit_variables : dict
        The model parameters that have been fitted over and varied. Only its keys are used, to select
        which entries of ``model_parameters`` are reported
    model_parameters : dict
        The model parameters for the fitted model
    fit_quality : float
        The value of goodness of fit
    participant_name : int or string
        The identifier for each participant
    """
    # Surrounding brackets are stripped so that single-element sequences read as plain values
    fitted_values = [k + ' = ' + str(v).strip('[]()')
                     for k, v in model_parameters.items()
                     if k in model_fit_variables]

    message = "The fitted values for participant " + str(participant_name) + " are " + ", ".join(fitted_values)
    message += " with a fit quality of " + str(fit_quality) + "."

    logging.getLogger('Fitting').info(message)


def log_fitting_parameters(fit_info):
    """
    Records and outputs to the log the parameters associated with the fitting algorithms

    The messages are sent to the ``Fitting`` logger at ``INFO`` level: a header, the name of the fitting
    method and then one line per remaining entry. ``fit_info`` is not modified.

    Parameters
    ----------
    fit_info : dict
        The details of the fitting. Must contain the key ``'Name'``

    Raises
    ------
    KeyError
        If ``fit_info`` has no ``'Name'`` entry
    """
    log = logging.getLogger('Fitting')

    log.info("Fitting information:")

    # Read rather than pop, so the dictionary handed in by the caller is left intact
    name = fit_info['Name']
    log.info("For {}:".format(name))

    for key, value in fit_info.items():
        if key == 'Name':
            continue
        log.info(key + ": " + repr(value))


#%% CSV
def fit_record(participant_fits, file_name_generator):
    """
    Writes the participant fits summary to a csv file

    The file name is obtained by calling ``file_name_generator("participantFits", 'csv')``. Nothing is returned.

    Parameters
    ----------
    participant_fits : dict
        A summary of the recovered parameters, with one column per key
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string

    """
    summary = pd.DataFrame.from_dict(participant_fits)
    output_file = file_name_generator("participantFits", 'csv')
    summary.to_csv(output_file)


#%% Excel
def xlsx_fitting_data(fitting_data, label, participant, file_name_generator):
    """
    Saves the fitting data to an XLSX file

    The participant data and the fitting data are combined into one table, written to the sheet
    ``ParameterFits`` of the file named by ``file_name_generator("data/" + label, 'xlsx')``.

    Parameters
    ----------
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter values
        tested, in the order they were tested, and a list of the fit qualities of these parameters.
        Must contain the keys ``"tested_parameters"``, ``"Name"`` and ``"final_parameters"``. It is not modified
    label : str
        The label used to identify the fit in the file names
    participant : dict
        The participant data
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string

    """
    # Work on a shallow copy so that the entries removed below stay in the caller's dictionary
    remaining = dict(fitting_data)

    # Named ``combined`` rather than ``data`` to avoid shadowing the imported ``data`` module
    combined = {}
    combined.update(outputting.newListDict(participant, 'part'))

    parameter_fitting = copy.copy(remaining["tested_parameters"])
    parameter_fitting['participant_fitting_name'] = remaining.pop("Name")
    for parameter, value in remaining.pop("final_parameters").items():
        parameter_fitting[parameter + "final"] = value
    # Everything else, including the fit qualities, is carried over under its original key
    parameter_fitting.update(remaining)
    combined.update(parameter_fitting)

    record = pd.DataFrame(outputting.newListDict(combined, ""))

    output_file = file_name_generator("data/" + label, 'xlsx')
    # The context manager saves and closes the workbook; ``ExcelWriter.save`` no longer exists in pandas 2
    with pd.ExcelWriter(output_file) as writer:
        record.to_excel(writer, sheet_name='ParameterFits')


# When executed as a script, hand ``run`` to python-fire (``fire.Fire``) as the entry point
if __name__ == '__main__':
    fire.Fire(run)