Source code for dataFitting

# -*- coding: utf-8 -*-
"""
:Author: Dominic Hunt
"""
import logging
import collections
import copy
import fire

import pandas as pd

from typing import Any

import outputting
import utils
import data

from fitAlgs.fitSims import FitSim
from fitAlgs.fitAlg import FitAlg
from modelGenerator import ModelGen


class LengthError(Exception):
    """Raised when the search bounds of a model parameter do not have exactly two values."""
class OrderError(Exception):
    """Raised when the search bounds of a model parameter are not given lower bound first."""
def run(data_folder='./',
        data_format='csv',
        data_file_filter=None,
        data_file_terminal_ID=True,
        data_read_options=None,
        data_split_by=None,
        data_group_by=None,
        data_extra_processing=None,
        model_name='QLearn',
        model_changing_properties=None,
        model_constant_properties=None,
        participantID="Name",
        participant_choices='Actions',
        participant_rewards='Rewards',
        model_fit_value='ActionProb',
        fit_subset=None,
        task_stimuli=None,
        participant_action_options=None,
        fit_method='Evolutionary',
        fit_method_args=None,
        fit_measure='-loge',
        fit_measure_args=None,
        fit_extra_measures=None,
        participant_varying_model_parameters=None,
        label=None,
        save_fitting_progress=False,
        config_file=None,
        output_path=None,
        pickle=False,
        boundary_excess_cost_function=None,
        min_log_level='INFO',
        numpy_error_level="log",
        fit_float_error_response_value=1 / 1e100,
        calculate_covariance=False
        ):
    """
    A framework for fitting models to participant data for tasks, along with recording the data associated with
    the fits.

    Every participant loaded from the data files is fitted with every model produced by the model generator. The
    summary of all the fits is written out once all the fits have finished.

    Parameters
    ----------
    data_folder : string or list of strings, optional
        The folder where the data can be found. Default is the current folder.
    data_format : string, optional
        The file type of the data, from ``mat``, ``csv``, ``xlsx`` and ``pkl``. Default is ``csv``
    data_file_filter : callable, string, list of strings or None, optional
        A function to process the file names or a list of possible prefixes as strings or a single string.
        Default ``None``, no file names removed
    data_file_terminal_ID : bool, optional
        Is there an ID number at the end of the filename? If not then a more general search will be performed.
        Default ``True``
    data_read_options : dict, optional
        The keyword arguments for the data importing method chosen. Default ``None``
    data_split_by : string or list of strings, optional
        If multiple participant datasets are in one file sheet, this specifies the column or columns that can
        distinguish and identify the rows for each participant. Default ``None``
    data_group_by : list of strings, optional
        A list of parts of filenames that are repeated across participants, identifying all the files that should
        be grouped together to form one participant's data. The rest of the filename is assumed to identify the
        participant. Default is ``None``
    data_extra_processing : callable, optional
        A function that modifies the dictionary of data read for each participant such that it is appropriate
        for fitting. Default is ``None``
    model_name : string, optional
        The name of the file where a model.modelTemplate.Model class can be found. Default ``QLearn``
    model_changing_properties : dictionary with values of tuple of two floats, optional
        The model parameters that are allowed to vary across model fits. Each model parameter is specified as a
        dict key. The value holds the two search bounds with the lower bound first, e.g. ``alpha`` has the bounds
        (0, 1). The midpoint of the bounds is the value passed to the model generator. Default ``None``
    model_constant_properties : dictionary of float, string or binary valued elements, optional
        These contain all the model options that define the version of the model being studied. Default ``None``
    participantID : str, optional
        The key (label) used to identify each participant. Default ``Name``
    participant_choices : string, optional
        The participant data key of their action choices. Default ``'Actions'``
    participant_rewards : string, optional
        The participant data key of the participant reward data. Default ``'Rewards'``
    model_fit_value : string, optional
        The key to be compared in the model data. Default ``'ActionProb'``
    fit_subset : ``float('Nan')``, ``None``, ``"rewarded"``, ``"unrewarded"``, ``"all"`` or list of int, optional
        Describes which, if any, subset of trials will be used to evaluate the performance of the model.
        This can either be described as a list of trial numbers or, by passing

        - ``"all"`` for fitting all trials
        - ``float('Nan')`` or ``"unrewarded"`` for all those trials whose feedback was ``float('Nan')``
        - ``"rewarded"`` for those who had feedback that was not ``float('Nan')``

        Default ``None``, which means all trials will be used.
    task_stimuli : list of strings or None, optional
        The keys containing the observational parameters seen by the participant before taking a decision on an
        action. Default ``None``
    participant_action_options : string or list of strings or None or one element list with a list, optional
        If a string or list of strings these are treated as dict keys where the valid actions for each trial can
        be found. If None then all trials will use all available actions. If the list contains one list then it
        will be treated as a list of valid actions for each trialstep. Default ``None``
    fit_method : string, optional
        The fitting method to be used. The names accepted are those of the modules in the folder fitAlgs
        containing a FitAlg class. Default ``'Evolutionary'``
    fit_method_args : dict, optional
        A dictionary of arguments specific to the fitting method. Default ``None``
    fit_measure : string, optional
        The name of the function used to calculate the quality of the fit. The value it returns provides the
        fitter with its fitting guide. Default ``-loge``
    fit_measure_args : dict, optional
        The parameters used to initialise ``fit_measure`` and ``fit_extra_measures``. Default ``None``
    fit_extra_measures : list of strings, optional
        List of fit measures not used to fit the model, but to provide more information. Any arguments needed
        for these measures should be placed in ``fit_measure_args``. Default ``None``
    participant_varying_model_parameters : dict of string, optional
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data
        file, as a string, and the value is the associated label in the model, also as a string.
        Default ``None``, which is treated as an empty dictionary
    label : string, optional
        The label for the data fitting. Default ``None`` will mean no data is saved to files.
    save_fitting_progress : bool, optional
        Specifies if the results from each iteration of the fitting process should be returned. Default ``False``
    config_file : string, optional
        The file name and path of a ``.yaml`` configuration file. Overrides all other parameters if found.
        Default ``None``
    output_path : string, optional
        The path that will be used for the run output. Default ``None``
    pickle : bool, optional
        If true the data for each model, and participant is recorded. Default is ``False``
    boundary_excess_cost_function : str or callable returning a function, optional
        The function is used to calculate the penalty for exceeding the boundaries.
        Default is ``boundFunc.scalarBound()``
    min_log_level : str, optional
        Defines the level of the log from (``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``, ``CRITICAL``).
        Default ``INFO``
    numpy_error_level : {'log', 'raise'}
        Defines the response to numpy errors. Default ``log``. See numpy.seterr
    fit_float_error_response_value : float, optional
        If a floating point error occurs when running a fit the fitter function will return a value for each
        element of fpRespVal. Default is ``1/1e100``
    calculate_covariance : bool, optional
        Is the covariance calculated. Default ``False``

    Raises
    ------
    LengthError
        If the bounds of an entry of ``model_changing_properties`` do not contain exactly two values.
    OrderError
        If the second bound of an entry of ``model_changing_properties`` is smaller than the first.

    Notes
    -----
    ``label``, ``config_file``, ``output_path``, ``min_log_level`` and ``numpy_error_level`` are not read directly
    in this function. They reach ``outputting.Saving`` as part of the snapshot of the call arguments.

    See Also
    --------
    modelGenerator : The model factory
    outputting : The outputting functions
    fitAlgs.fitAlg.FitAlg : General class for a method of fitting data
    fitAlgs.fitSims.fitSim : General class for a method of simulating the fitting of data
    data.Data : Data import class
    """
    # Has to remain the first statement so that the snapshot only contains the call arguments
    config = copy.deepcopy(locals())

    participant_model_map = {} if participant_varying_model_parameters is None else participant_varying_model_parameters

    # TODO : Validate model_changing_properties with the data and the model
    participants = data.Data.load_data(file_type=data_format,
                                       folders=data_folder,
                                       file_name_filter=data_file_filter,
                                       terminal_ID=data_file_terminal_ID,
                                       split_by=data_split_by,
                                       participantID=participantID,
                                       choices=participant_choices,
                                       feedbacks=participant_rewards,
                                       stimuli=task_stimuli,
                                       action_options=participant_action_options,
                                       group_by=data_group_by,
                                       extra_processing=data_extra_processing,
                                       data_read_options=data_read_options)

    if model_changing_properties:
        model_parameters = {}
        for parameter_name, bounds in model_changing_properties.items():
            if len(bounds) != 2:
                raise LengthError("The parameter values for the ``model_changing_properties`` must be presented as a list of the maximum and minimum values. Review those of ``{}``".format(parameter_name))
            lower, upper = bounds
            if upper < lower:
                raise OrderError('The bounds specified for model parameter ``{}`` must have the lower bound first'.format(parameter_name))
            # The model generator receives the midpoint of the search bounds
            model_parameters[parameter_name] = (lower + upper) / 2
    else:
        # Nothing to validate: pass the empty or missing value straight through
        model_parameters = model_changing_properties

    models = ModelGen(model_name=model_name,
                      parameters=model_parameters,
                      other_options=model_constant_properties)

    model_simulator = FitSim(participant_choice_property=participants.choices,
                             participant_reward_property=participants.feedbacks,
                             model_fitting_variable=model_fit_value,
                             fit_subset=fit_subset,
                             task_stimuli_property=participants.stimuli,
                             action_options_property=participants.action_options,
                             float_error_response_value=fit_float_error_response_value
                             )

    fitting_method = utils.find_class(fit_method,
                                      class_folder='fitAlgs',
                                      inherited_class=FitAlg,
                                      excluded_files=['boundFunc', 'qualityFunc', 'fitSims'])

    extra_fitter_args = {} if fit_method_args is None else fit_method_args
    fitter = fitting_method(fit_sim=model_simulator,
                            fit_measure=fit_measure,
                            extra_fit_measures=fit_extra_measures,
                            fit_measure_args=fit_measure_args,
                            bounds=model_changing_properties,
                            boundary_excess_cost=boundary_excess_cost_function,
                            calculate_covariance=calculate_covariance,
                            **extra_fitter_args)

    with outputting.Saving(config=config) as file_name_generator:

        logger = logging.getLogger('Fitting')

        log_fitting_parameters(fitter.info())

        logger.info('Beginning the data fitting')

        # Initialise the stores of information
        participant_fits = collections.defaultdict(list)  # type: collections.defaultdict[Any, list]

        # The position of the model in the generator doubles as its identifier in the records
        for model_ID, (model, fit_variables, static_args) in enumerate(models.iter_details()):

            # Placeholder so that the log shows which settings are taken from the participant data
            for model_key in participant_model_map.values():
                static_args[model_key] = "<Varies for each participant>"

            log_model_fitting_parameters(model, fit_variables, static_args)

            id_key = participants.participantID
            for participant in participants:

                participant_name = participant[id_key]
                if isinstance(participant_name, (list, tuple)):
                    participant_name = participant_name[0]

                # Replace the placeholders with this participant's own values
                for data_key, model_key in participant_model_map.items():
                    static_args[model_key] = participant[data_key]

                # Find the best model values from those proposed
                logger.info("Beginning participant fit for participant {}".format(participant_name))

                model_fitted, fit_quality, fitting_data = fitter.participant(model,
                                                                             fit_variables,
                                                                             static_args,
                                                                             participant)

                logger.debug("Participant fitted")

                log_model_fitted_parameters(fit_variables, model_fitted.params(), fit_quality, participant_name)

                participant_fits = record_participant_fit(participant,
                                                          participant_name,
                                                          model_fitted.returnTaskState(),
                                                          str(model_ID),
                                                          fitting_data,
                                                          participant_model_map,
                                                          participant_fits,
                                                          fileNameGen=file_name_generator,
                                                          pickleData=pickle,
                                                          saveFittingProgress=save_fitting_progress)

        fit_record(participant_fits, file_name_generator)
# %% Data record functions
def record_participant_fit(participant, part_name, model_data, model_name, fitting_data, partModelVars,
                           participantFits, fileNameGen=None, pickleData=False, saveFittingProgress=False,
                           expData=None):
    """
    Record the data relevant to the participant fitting

    The participant and fitting dictionaries are given a ``"Participant <part_name>"`` name under any of the
    keys ``Name`` and ``assigned_name`` they do not already have. Nothing else is recorded when no
    ``fileNameGen`` is provided.

    Parameters
    ----------
    participant : dict
        The participant data. Modified in place with the default names
    part_name : int or string
        The identifier for each participant
    model_data : dict
        The data from the model
    model_name : str
        The label given to the model
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter
        values tested, in the order they were tested, and a list of the fit qualities of these parameters.
        Modified in place with the default name
    partModelVars : dict of string
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data
        file, as a string, and the value is the associated label in the model, also as a string.
    participantFits : defaultdict of lists
        A dictionary to be filled with the summary of the participant fits
    fileNameGen : function or None
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string. Default ``None``
    pickleData : bool, optional
        If true the data for each model, task and participant is recorded. Default is ``False``
    saveFittingProgress : bool, optional
        Specifies if the results from each iteration of the fitting process should be returned. Default ``False``
    expData : dict, optional
        The data from the task. Default ``None``

    Returns
    -------
    participantFits : defaultdict of lists
        A dictionary to be filled with the summary of the previous and current participant fits

    See Also
    --------
    outputting.pickleLog : records the pickled data
    """
    logger = logging.getLogger('Logging')

    name_text = str(part_name)
    logger.info("Recording participant " + name_text + " model fit")

    label = "_Model-" + model_name + "_Part-" + name_text
    display_name = "Participant " + name_text

    # Only fills in the names that the data did not already provide
    for name_key in ("Name", "assigned_name"):
        participant.setdefault(name_key, display_name)
    fitting_data.setdefault("Name", display_name)

    if not fileNameGen:
        return participantFits

    logger.info("Store data for " + display_name)

    participantFits = record_fitting(fitting_data,
                                     label,
                                     participant,
                                     partModelVars,
                                     participantFits,
                                     fileNameGen,
                                     save_fitting_progress=saveFittingProgress)

    if pickleData:
        pickle_jobs = []
        if expData is not None:
            pickle_jobs.append((expData, "_expData"))
        pickle_jobs.append((model_data, "_modelData"))
        pickle_jobs.append((participant, "_partData"))
        pickle_jobs.append((fitting_data, "_fitData"))

        for payload, prefix in pickle_jobs:
            outputting.pickleLog(payload, fileNameGen, prefix + label)

    return participantFits
# %% Recording
def record_fitting(fitting_data, label, participant, participant_model_variables, participant_fits,
                   file_name_generator, save_fitting_progress=False):
    """
    Records formatted versions of the fitting data

    One value is appended to each column of ``participant_fits`` touched by this participant, so that all the
    columns keep one entry per recorded fit.

    Parameters
    ----------
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter
        values tested, in the order they were tested, and a list of the fit qualities of these parameters.
        Must contain the key ``final_parameters``. Every key containing ``fit_quality`` is also recorded.
    label : str
        The label used to identify the fit in the file names
    participant : dict
        The participant data. Must contain the keys ``Name`` and ``assigned_name``
    participant_model_variables : dict of string
        A dictionary of model settings whose values should vary from participant to participant based on the
        values found in the imported participant data files. The key is the label given in the participant data
        file, as a string, and the value is the associated label in the model, also as a string.
    participant_fits : defaultdict of lists
        A dictionary to be filled with the summary of the participant fits
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string. Only used if ``save_fitting_progress``
        is true
    save_fitting_progress : bool, optional
        Specifies if the results from each iteration of the fitting process should be saved to an xlsx file.
        Default ``False``

    Returns
    -------
    participant_fits : defaultdict of lists
        A dictionary to be filled with the summary of the previous and current participant fits
    """
    extended_label = "ParameterFits" + label

    participant_fits["Name"].append(participant["Name"])
    participant_fits["assigned_name"].append(participant["assigned_name"])

    for key, value in fitting_data.items():
        if 'fit_quality' in key:
            participant_fits[key].append(value)

    for key, value in fitting_data["final_parameters"].items():
        participant_fits[key].append(value)

    # Appended, not assigned: assigning replaced the whole column with the latest participant's value
    for data_key, model_key in participant_model_variables.items():
        participant_fits[model_key].append(participant[data_key])

    if save_fitting_progress:
        # A copy is passed on as the xlsx writer consumes some of the entries
        xlsx_fitting_data(fitting_data.copy(), extended_label, participant, file_name_generator)

    return participant_fits
#%% logging
def log_model_fitting_parameters(model, model_fit_variables, model_other_args):
    """
    Logs the model and the parameters that are used as the initial fitting conditions

    The model is instantiated once with all the provided arguments to find the name it reports.

    Parameters
    ----------
    model : callable
        The model class. It is called with the combined arguments as keywords and the returned instance must
        provide a ``params`` method returning a dictionary with the key ``Name``
    model_fit_variables : dict
        The model parameters that will be fitted over and varied.
    model_other_args : dict
        The other parameters used in the model whose attributes have been modified by the user
    """
    # Where a key is in both dictionaries the value in model_other_args is the one used
    init_args = copy.copy(model_fit_variables)
    init_args.update(copy.copy(model_other_args))
    reported_name = model(**init_args).params()['Name']

    fitted_descriptions = []
    for name, start_value in model_fit_variables.items():
        fitted_descriptions.append(name + ' around ' + str(start_value))

    other_descriptions = []
    for name, value in model_other_args.items():
        if name in model_fit_variables.keys():
            continue
        # Strips the line breaks and the brackets wrapping the text form of the value
        value_text = str(value).replace('\n', '').strip('[](){}<>')
        other_descriptions.append(name + ' = ' + value_text)

    message = "".join(["The fit will use the model ``{}``".format(reported_name),
                       " fitted with the parameters ",
                       ", ".join(fitted_descriptions),
                       " and using the other user specified parameters ",
                       ", ".join(other_descriptions)])

    logging.getLogger('Fitting').info(message)
def log_model_fitted_parameters(model_fit_variables, model_parameters, fit_quality, participant_name):
    """
    Logs the fitted parameter values and the fit quality found for a participant

    Parameters
    ----------
    model_fit_variables : dict
        The model parameters that have been fitted over and varied. Only its keys are used
    model_parameters : dict
        The model parameters for the fitted model. Only those also in ``model_fit_variables`` are logged
    fit_quality : float
        The value of goodness of fit
    participant_name : int or string
        The identifier for each participant
    """
    fitted_names = list(model_fit_variables.keys())

    fitted_descriptions = []
    for name, value in model_parameters.items():
        if name in fitted_names:
            # Strips the brackets wrapping the text form of lists and tuples
            fitted_descriptions.append(name + ' = ' + str(value).strip('[]()'))

    message = "".join(["The fitted values for participant ",
                       str(participant_name),
                       " are ",
                       ", ".join(fitted_descriptions),
                       " with a fit quality of ",
                       str(fit_quality),
                       "."])

    logging.getLogger('Fitting').info(message)
def log_fitting_parameters(fit_info):
    """
    Records and outputs to the log the parameters associated with the fitting algorithms

    The provided dictionary is only read, so it can be used again by the caller afterwards.

    Parameters
    ----------
    fit_info : dict
        The details of the fitting. Must contain the key ``Name`` with a string value

    Raises
    ------
    KeyError
        If ``fit_info`` has no ``Name`` entry
    """
    log = logging.getLogger('Fitting')

    log.info("Fitting information:")
    log.info("For " + fit_info['Name'] + ":")

    for key, value in fit_info.items():
        # The name was already reported in the heading. It is skipped here and not popped, so that the
        # caller's dictionary is left intact
        if key == 'Name':
            continue
        log.info(key + ": " + repr(value))
#%% CSV
def fit_record(participant_fits, file_name_generator):
    """
    Writes the participant fits summary to a csv file

    Parameters
    ----------
    participant_fits : dict
        A summary of the recovered parameters. Each key becomes a column of the file
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string. Called here with the handle
        ``participantFits`` and the extension ``csv``
    """
    # The table is built before the file name is requested
    pd.DataFrame.from_dict(participant_fits).to_csv(file_name_generator("participantFits", 'csv'))
#%% Excel
def xlsx_fitting_data(fitting_data, label, participant, file_name_generator):
    """
    Saves the fitting data to an XLSX file

    The participant data and the fitting data are flattened with ``outputting.newListDict`` and written to the
    sheet ``ParameterFits`` of the file ``data/<label>.xlsx`` requested from ``file_name_generator``.

    Parameters
    ----------
    fitting_data : dict
        Dictionary of details of the different fits, including an ordered dictionary containing the parameter
        values tested, in the order they were tested, and a list of the fit qualities of these parameters.
        Must contain the keys ``tested_parameters``, ``Name`` and ``final_parameters``. It is not modified
    label : str
        The label used to identify the fit in the file names
    participant : dict
        The participant data
    file_name_generator : function
        Creates a new file with the name <handle> and the extension <extension>. It takes two string parameters:
        (``handle``, ``extension``) and returns one ``fileName`` string
    """
    # Shallow copy so that the pops below do not remove entries from the caller's dictionary
    remaining_fit_data = copy.copy(fitting_data)

    record_source = {}
    record_source.update(outputting.newListDict(participant, 'part'))

    parameter_fitting_dict = copy.copy(remaining_fit_data["tested_parameters"])
    parameter_fitting_dict['participant_fitting_name'] = remaining_fit_data.pop("Name")
    # The suffix keeps the final values apart from the tested values of the same parameter
    for key, value in remaining_fit_data.pop("final_parameters").items():
        parameter_fitting_dict[key + "final"] = value
    parameter_fitting_dict.update(remaining_fit_data)
    record_source.update(parameter_fitting_dict)

    record = pd.DataFrame(outputting.newListDict(record_source, ""))

    output_file = file_name_generator("data/" + label, 'xlsx')
    # The context manager saves and closes the file, including when the writing fails.
    # ``ExcelWriter.save`` is no longer available in pandas 2
    with pd.ExcelWriter(output_file) as xlsx_writer:
        record.to_excel(xlsx_writer, sheet_name='ParameterFits')
if __name__ == '__main__':
    # Expose the arguments of ``run`` as a command line interface
    fire.Fire(run)