function [data,studyinfo,dataExist] = dsImport(src,varargin)
%DSIMPORT - load data into DynaSim formatted data structure.
%
% Usage:
% [data,studyinfo] = dsImport(data_file)
% [data,studyinfo] = dsImport(studyinfo)
% [data,studyinfo] = dsImport(study_dir) % containing studyinfo.mat
% data = dsImport(data_file)
%
% Inputs:
% - First input/argument:
% - data_file: data file path in accepted format (csv, mat, ...)
% - cell array of data files
% - study_dir
% - studyinfo structure
% - studyinfo file
% - options:
% 'verbose_flag': {0,1} (default: 1)
% 'process_id' : process identifier for loading studyinfo if necessary
% 'time_limits' : [beg,end] ms (see NOTE 2)
% 'variables' : cell array of matrix names (see NOTE 2)
% 'simIDs' : numeric array of simIDs to import (default: [])
% 'as_cell' : guarantee output as cell array and leave mising data as
% empty cells as long as 'simIDs' not given {0,1} (default: 0)
% Note: is not implemented for data_file input.
%
% Outputs:
% - DynaSim data structure:
% data.labels : list of state variables and monitors recorded
% data.(state_variables): state variable data matrix [time x cells]
% data.(monitors) : monitor data matrix [time x cells]
% data.time : time vector [time x 1]
% data.simulator_options: simulator options used to generate simulated data
% data.model : model used to generate simulated data
% [data.varied] : list of varied model components
% [data.results] : list of derived data sets created by post-processing
% - studyinfo: DynaSim studyinfo structure (see CheckStudyinfo)
% Note: if data is missing, studyinfo.simulations will only show found data
%
% Notes:
% - NOTE 1: CSV file structure assumes CSV file contains data organized
% according to output from dsWriteDynaSimSolver: time points along rows; state
% variables and monitors are columns; first column is time vector; next
% columns are state variables; final columns are monitors. first row has
% headers for each column. if a population has more than one cell, different
% cells are sequential columns with same header repeated for each cell.
%
% - NOTE 2: DynaSim data exported to MAT-files are HDF-compatible. To obtain
% partial data sets without having to load the entire file, use dsImport
% with options 'time_limits' and/or 'variables'. Alternatively, the entire
% data set can be loaded using dsImport with default options, then subsets
% extracted using dsSelect with appropriate options.
%
% - NOTE 3: if 'as_cell'==0, missing data will not be left empty in the results
% data output. if you need to know which are missing and have them left empty,
% use 'as_cell'==1 and check for empty cells.
%
% Examples:
% - Example 1: full data set
% data=dsImport('data.mat'); % load single data set
% data=dsImport(studyinfo); % load all data sets in studyinfo.study_dir
% - Example 2: partial data set with HDF-style loading
% data=dsImport('data.mat','variables','pop1_v','time_limits',[1000 4000])
%
% TODO:
% - specify subsets to return in terms of varied parameters, time_limits, ROIs,
% etc possible format for specifying range_varied: {'E','gNa',[.1 .3];
% 'I->E','tauI',[15 25]; 'I','mechanism_list','+iM'}
% - achieve by calling function dsSelect() at end of this function.
% See also: dsSimulate, dsExportData, dsCheckData, dsSelect
%
% Author: Jason Sherfey, PhD <jssherfey@gmail.com>
% Copyright (C) 2016 Jason Sherfey, Boston University, USA
% Check inputs
options=dsCheckOptions(varargin,{...
'verbose_flag',1,{0,1},...
'process_id',[],[],... % process identifier for loading studyinfo if necessary
'time_limits',[],[],...
'variables',[],[],...
'simIDs',[],[],...
'as_cell',0,{0,1},... % guarantee output as cell array and leave mising data as empty cells
'auto_gen_test_data_flag',0,{0,1},...
},false);
%% auto_gen_test_data_flag argin
if options.auto_gen_test_data_flag
varargs = varargin;
varargs{find(strcmp(varargs, 'auto_gen_test_data_flag'))+1} = 0;
varargs(end+1:end+2) = {'unit_test_flag',1};
argin = [{src}, varargs]; % specific to this function
end
if ischar(options.variables)
options.variables = {options.variables};
end
if ~nargin || isempty(src)
src = pwd;
end
% check if input is a DynaSim study_dir or path to studyinfo
if ischar(src)
if isdir(src) % assume char dir is study directory
study_dir = src;
srcS.study_dir = study_dir;
elseif strfind(src, 'studyinfo') %#ok<STRIFCND>
filePath = fileparts2(src);
if isempty(filePath)
filePath = pwd;
end
study_dir = filePath;
srcS.study_dir = study_dir;
else
srcS = []; % assume src is path to data file
end
elseif isstruct(src)
srcS = src; % src is studyinfo struct
else
srcS = []; % src is cellstr list of data files
end
% srcS will be populated with study_dir field if src is a string path to a dir or a studyinfo file
% else it will be empty
%% load study_dir if possible
if isstruct(srcS) && isfield(srcS,'study_dir')
% "file" is a studyinfo structure.
% retrieve most up-to-date studyinfo structure from studyinfo.mat file
try
studyinfo = dsCheckStudyinfo(srcS.study_dir,'process_id',options.process_id, varargin{:});
catch
error('Input source is not a recognized type.')
end
% compare simIDs to sim_id
if ~isempty(options.simIDs)
[~,~,simsInds] = intersect(options.simIDs, [studyinfo.simulations.sim_id]);
else
simsInds = true(length(studyinfo.simulations),1); % vector of all logical true
end
% filter studyinfo.simulations for simIDs
studyinfo.simulations = studyinfo.simulations(simsInds);
clear simsInds
% get list of data_files from studyinfo
data_files = {studyinfo.simulations.data_file}; % filter for only simIDs
dataExist = cellfun(@exist,data_files)==2;
if ~all(dataExist)
% convert original absolute paths to paths relative to study_dir for missing files
for iFile = find(~dataExist) % only look for missing files
[~,fname,fext] = fileparts2(data_files{iFile});
data_files{iFile} = fullfile(srcS.study_dir,'data',[fname fext]);
end
dataExist = cellfun(@exist,data_files)==2;
end
if ~all(dataExist)
% try regexp matching of sim# for files in 'data' folder
filesInDataDir = lscell(fullfile(srcS.study_dir, 'data'));
if ~isempty(filesInDataDir)
for iFile = find(~dataExist) % only look for missing files
[~,fname,~] = fileparts2(data_files{iFile});
simIDstr = regexpi(fname, 'sim(\d+)', 'tokens');
simIDstr = simIDstr{:};
simIDstr = simIDstr{:};
try
thisDataFile = filesInDataDir{contains(filesInDataDir, ['sim' simIDstr])};
data_files{iFile} = thisDataFile;
end
end
dataExist = cellfun(@exist,data_files)==2;
end
end
if ~all(dataExist)
warnStr = strjoin(data_files(~dataExist), '\t\t\t');
warning('These data files not found... \n \t\t\t%s \n', warnStr);
end
num_files = length(data_files);
if options.as_cell
% get outputCellInd
if isempty(options.simIDs)
outputCellInd = [studyinfo.simulations.sim_id]; % leave empty cells in mising data
else
outputCellInd = 1:num_files;
end
end
% filter studyinfo for files that exist
studyinfo.simulations = studyinfo.simulations(dataExist); % remove missing data
% load each data set recursively
keyvals = dsOptions2Keyval(options);
iLoadedFile = 0;
for iFile = 1:num_files
thisDataExists = dataExist(iFile);
if thisDataExists
dsVprintf(options, ' loading file (%g/%g): %s\n',iFile,num_files,data_files{iFile});
else
dsVprintf(options, ' skipping missing file (%g/%g): %s\n',iFile,num_files,data_files{iFile});
continue
end
iLoadedFile = iLoadedFile + 1;
tmp_data = dsImport(data_files{iFile},keyvals{:});
num_sets_per_file = length(tmp_data);
modifications = studyinfo.simulations(iLoadedFile).modifications;
if ~isfield(tmp_data,'varied') && ~isempty(modifications)
% add varied info
% this is necessary here when loading .csv data lacking metadata
tmp_data.varied={};
modifications(:,1:2) = cellfun( @(x) strrep(x,'->','_'),modifications(:,1:2),'UniformOutput',0);
for j = 1:size(modifications,1)
varied=[modifications{j,1} '_' modifications{j,2}];
for k = 1:num_sets_per_file
tmp_data(k).varied{end+1} = varied;
tmp_data(k).(varied) = modifications{j,3};
end
end
end
if options.as_cell
thisCellInd = outputCellInd(iFile);
end
% store this data
if iLoadedFile == 1
total_num_sets = num_sets_per_file * num_files;
set_indices=0:num_sets_per_file:total_num_sets-1;
% preallocate full data matrix based on first data file
if ~options.as_cell
data(1:total_num_sets) = tmp_data(1);
else
data = cell(num_files,1);
data(thisCellInd) = {tmp_data};
end
end
% replace i-th set of data sets by these data sets
if ~options.as_cell
data(set_indices(iFile)+(1:num_sets_per_file)) = tmp_data;
else
if iscell(tmp_data) && (length(tmp_data) == 1)
data(thisCellInd) = tmp_data;
else
data(thisCellInd) = {tmp_data};
end
end
end % iFile = 1:num_files
if ~any(dataExist)
error('No Data Found to analyze.')
end
% remove missing entries
% TODO: fix if missing entries and num_sets_per_file > 1
if ~all(dataExist)
if num_sets_per_file == 1
data(~dataExist) = [];
else
warning('Missing entries not removed, first found data copied into them. This behavior will be changed in a future DynaSim release.')
end
end
if ~exist('data', 'var')
if options.as_cell
data = {};
else
data = [];
end
end
return;
else % no studyinfo
studyinfo = [];
end
%% cell of file paths input
% check if input is a list of data files (TODO: eliminate duplicate code by
% combining with the above recursive loading for studyinfo data_files)
if iscellstr(src)
data_files=src;
dataExist=cellfun(@exist,data_files)==2;
data_files=data_files(dataExist);
keyvals=dsOptions2Keyval(options);
% load each data set recursively
for iFile = 1:length(data_files)
tmp_data = dsImport(data_files{iFile},keyvals{:});
% make data var
if (iFile == 1)
% preallocate full data matrix based on first data file
if ~options.as_cell
data(1:length(data_files)) = tmp_data;
else
data = cell(length(data_files), 1);
end
end
% store this data: replace i-th data element by this data set
if ~options.as_cell
data(iFile) = tmp_data;
else
if iscell(tmp_data) && (length(tmp_data) == 1)
data(iFile) = tmp_data;
else
data(iFile) = {tmp_data};
end
end
end
return;
end
%% char file path input
if ischar(src) % char path to single data file
[~,~,ext] = fileparts2(src);
switch lower(ext)
case '.mat'
% MAT-file contains data fields as separate variables (-v7.3 for HDF)
if isempty(options.time_limits) && isempty(options.variables)
% load full data set
data = load(src);
% if file only contains a structure called 'data' then return that
if isfield(data,'data') && length(fieldnames(data))==1
data=data.data;
end
else % load partial data set
% use matfile() to load HDF subsets given varargin options...
obj = matfile(src); % MAT-file object
varlist = who(obj); % variables stored in mat-file
labels = obj.labels; % list of state variables and monitors
if iscellstr(options.variables)
try
var2load = false(size(labels));
for iVar = 1:numel(options.variables)
% pass var string through RE to use wildcards
thisSearch = regexp(labels, regexptranslate('wildcard',options.variables{iVar})); % returns cells
thisSearch = ~cellfun(@isempty, thisSearch); % convert to logical
var2load = var2load | thisSearch;
end
labels = labels(var2load); % restrict variables to load
catch
% revert to default mode
labels = labels(ismember(labels, options.variables));
end
else
warning('Not using options.variables');
end
simulator_options = obj.simulator_options;
time = (simulator_options.tspan(1):simulator_options.dt:simulator_options.tspan(2))';
time = time(1:simulator_options.downsample_factor:length(time));
if ~isempty(options.time_limits)
% determine time indices to load
time_indices=nearest(time,options.time_limits(1)):nearest(time,options.time_limits(2));
else
% load all time points
time_indices=1:length(time);
end
% create DynaSim data structure:
data=[];
data.labels=labels;
% load state variables and monitors
for iLabel=1:length(labels)
data.(labels{iLabel})=obj.(labels{iLabel})(time_indices,:);
end
data.time=time(time_indices);
data.simulator_options=simulator_options;
if ismember('model',varlist)
data.model=obj.model;
end
if ismember('varied',varlist)
varied=obj.varied;
data.varied=varied;
for iVaried=1:length(varied)
data.(varied{iVaried})=obj.(varied{iVaried});
end
end
if ismember('results',varlist)
results=obj.results;
if iscellstr(options.variables)
results=results(ismember(results,options.variables));
end
data.results=results;
% load results
for iResult=1:length(results)
data.(results{iResult})=obj.(results{iResult})(time_indices,:);
end
end
end
case '.csv'
% assumes CSV file contains data organized according to output from dsWriteDynaSimSolver:
data = dsImportCSV(src);
if ~(isempty(options.time_limits) && isempty(options.variables))
% limit to select subsets
data = dsSelect(data,varargin{:}); % todo: create dsSelect()
end
otherwise
error('file type not recognized. dsImport currently supports DynaSim data structure in MAT file, data values in CSV file.');
end
end
%% auto_gen_test_data_flag argout
if options.auto_gen_test_data_flag
argout = {data, studyinfo}; % specific to this function
dsUnitSaveAutoGenTestData(argin, argout);
end
end % main fn