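% Layout of the struct arguments (field name, value pairs; pseudo-code):
% MBParameters=struct(alpha_MB,Valpha_MB,...
% StoppingPathThresh,VStoppingPathThresh,...
% StoppingPathLength,VStoppingPathLength,...
% StoppingPathThreshReward,VStoppingPathThreshReward,...
% stoppingThresh,VstoppingThresh,... MaxItr,VMaxItr,...
% StopSim,VStopSim,... SelectActionSim,VSelectActionSim,...
% stopOnUncertaintyVal,VstopOnUncertaintyVal);
% QTablePerm=struct(QTablePermMean,VQTablePermMean,...
% QTableVar,VQTableVar)
% NOTE(review): the function below reads MBParameters.useMFToDriveMB,
% .MaxItrMB, .MaxTotalSimSteps, .StoppingPathLengthMB and .pStopPath, and
% QTablePerm.mean -- none of which appear in this listing, so it looks out
% of date. Confirm against the code that builds these structs.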
function [Qtable_Integrated,N_itr,stateActionVisitCountsOut]=runInternalSimulationInResetAndStatistics...
    (QTablePerm,currentState,Model,MBParameters,stateActionVisitCounts)
% Simulate repeated paths through Model starting at currentState, and build
% a fresh value table plus visit statistics from the simulated transitions.
%
% Inputs
%   QTablePerm             struct with a field .mean indexed as (state, action).
%   currentState           state every simulated path starts from.
%   Model                  passed unchanged to selectActionSim and
%                          doActionInModel; not inspected here.
%   MBParameters           struct; fields read here: useMFToDriveMB,
%                          MaxItrMB, MaxTotalSimSteps, StoppingPathLengthMB,
%                          pStopPath. Also forwarded to the helper functions.
%   stateActionVisitCounts only its size is used (to size the output counts).
%
% Outputs
%   Qtable_Integrated          copy of QTablePerm whose .mean is reset to
%                              zeros and then overwritten, entry by entry,
%                              with the values returned by updateQTablePermBase.
%   N_itr                      starts at 1 and is incremented after every
%                              path, i.e. (number of paths run) + 1 on return.
%   stateActionVisitCountsOut  number of times each (state, action) pair was
%                              selected during this call.
%
% NOTE(review): MaxTotalSimSteps is only checked between paths, so the total
% number of simulated steps can exceed it by the remainder of the last path.

    % Output table: same struct as the input, with the .mean entries cleared.
    Qtable_Integrated = QTablePerm;
    Qtable_Integrated.mean = zeros(size(QTablePerm.mean));

    % Table handed to the action selector: the zeroed table by default, the
    % caller's table when useMFToDriveMB is set. Either way it is a local
    % copy that receives the same per-step updates as Qtable_Integrated.
    selectionTable = Qtable_Integrated;
    if MBParameters.useMFToDriveMB
        selectionTable = QTablePerm;
    end

    stateActionVisitCountsOut = zeros(size(stateActionVisitCounts));
    N_itr = 1;
    stepsSoFar = 0;

    %% simulation loop: one pass per simulated path
    while (N_itr <= MBParameters.MaxItrMB) && (stepsSoFar < MBParameters.MaxTotalSimSteps)
        simState = currentState;
        stepsInPath = 0;

        % path loop: one pass per simulated step
        while true
            chosenAction = selectActionSim(simState, Model, MBParameters, ...
                selectionTable, stateActionVisitCountsOut);

            % The selection is counted before the model is queried, so a
            % selection that turns out to be invalid is still counted.
            stateActionVisitCountsOut(simState, chosenAction) = ...
                stateActionVisitCountsOut(simState, chosenAction) + 1;

            [simNextState, simReward, isValid] = ...
                doActionInModel(chosenAction, Model, simState, MBParameters);
            if ~isValid
                % The model cannot simulate this step: abandon the path.
                break
            end

            % Only the first output is used; the placeholders keep the
            % number of requested outputs at three.
            [updatedValue, ~, ~] = updateQTablePermBase(Qtable_Integrated.mean, ...
                simReward, simNextState, chosenAction, simState, ...
                stateActionVisitCountsOut, MBParameters);
            selectionTable.mean(simState, chosenAction) = updatedValue;
            Qtable_Integrated.mean(simState, chosenAction) = updatedValue;

            simState = simNextState;
            stepsInPath = stepsInPath + 1;
            stepsSoFar = stepsSoFar + 1;

            % Stop at exactly StoppingPathLengthMB steps; otherwise stop with
            % probability pStopPath (rand is drawn only when the length test
            % fails). NOTE(review): the equality test never fires for a
            % non-integer or < 1 StoppingPathLengthMB.
            if (stepsInPath == MBParameters.StoppingPathLengthMB) || (rand() < MBParameters.pStopPath)
                break
            end
        end

        N_itr = N_itr + 1;
    end
end