% MBParameters=struct(alpha_MB,Valpha_MB,…
% StoppingPathThresh,VStoppingPathThresh,…
% StoppingPathLength,VStoppingPathLength,…
% StoppingPathThreshReward=VStoppingPathThreshReward,…
% stoppingThresh,VstoppingThresh,…
% MaxItr,VMaxItr,…
% StopSim,VStopSim,…
% SelectActionSim,VSelectActionSim,…
% stopOnUncertaintyVal,VstopOnUncertaintyVal);
% QTablePerm=struct(QTablePermMean,VQTablePermMean,…
% QTableVar,VQTableVar)
function [Qtable_Integrated,N_itr,stateActionVisitCountsOut]=runInternalEleTraceSimulationInResetAndStatistics...
(QTablePerm,currentState,Model,MBParameters,stateActionVisitCounts)
N_itr=1;
Qtable_Integrated= QTablePerm;
if MBParameters.useMFToDriveMB
QTablePermLocal=QTablePerm;
else
QTablePermLocal=Qtable_Integrated;
end
Qtable_Integrated.mean=zeros(size(Qtable_Integrated.mean));
stateActionVisitCountsOut=stateActionVisitCounts;
%% begin_simulation_iteration_loop
while(N_itr<=MBParameters.MaxItrMB)
%display('runInternalSimulation')
% while(N_itr<=10)
currentStateSim= currentState;
path_end=0;
path_step=0;
traceState=zeros(1,MBParameters.StoppingPathLengthMB);
traceAction=zeros(1,MBParameters.StoppingPathLengthMB);
traceW=zeros(1,MBParameters.StoppingPathLengthMB);
%begin_path_simulation_iteration_loop
while(path_end==0)
%simulate one step
actionSim=selectActionSim(currentStateSim,Model,MBParameters, QTablePermLocal,stateActionVisitCountsOut);
traceW=MBParameters.lambda*MBParameters.gamma*traceW;
traceW(path_step+1)=1;
traceState(path_step+1)=currentStateSim;
traceAction(path_step+1)=actionSim;
[nextStateSim,rewardSim,valid]=doActionInModel(actionSim,Model,currentStateSim,MBParameters);
if ~valid
%display('new state cannot simulate');
%displayModel(Model);
%path_end=1;
%N_itr
break ;
end;
stateActionVisitCountsOut(currentStateSim,actionSim)=stateActionVisitCountsOut(currentStateSim,actionSim)+1;
%path_end = testNewStateToEndPathSimulation(actionSim, currentStateSim, nextStateSim, rewardSim, path_step, Qtable_Integrated,Model, MBParameters);
%path_end=true;
%% update
%Qtable_Integrated=updateQTablePermNoReset(Qtable_Integrated, rewardSim, nextStateSim,actionSim , currentStateSim,stateActionVisitCountsOut, MBParameters);
[~,~,dreward]=...
updateQTablePermBase(Qtable_Integrated.mean,rewardSim, nextStateSim,actionSim , currentStateSim,stateActionVisitCountsOut, MBParameters);
currentStateSim= nextStateSim;
path_step=path_step+1;
path_end=(path_step==MBParameters.StoppingPathLengthMB);
if path_end
dreward=dreward+max(QTablePermLocal.mean(currentStateSim,:))*MBParameters.gamma;
end
for itrace=1:MBParameters.StoppingPathLengthMB
if(traceW(itrace)==0)
break;
else
if(MBParameters.UseMFToBoothMBFinalVAl)
dreward=traceW(itrace)*MBParameters.alpha*dreward;
end
QTablePermLocal.mean(traceState(itrace),traceAction(itrace))=QTablePermLocal.mean(traceState(itrace),traceAction(itrace))+dreward;
Qtable_Integrated.mean(traceState(itrace),traceAction(itrace))=Qtable_Integrated.mean(traceState(itrace),traceAction(itrace))+dreward;
end
end
%% logging
% Paths_Sampled(path_num).action(path_step)= actionSim;
% Paths_Sampled(path_num).state(path_step)= currentStateSim;
% Paths_Sampled(path_num).reward(path_step)= rewardSim;
% Paths_Sampled(path_num).path_end(path_step)=path_end;
% Paths_Sampled(path_num).steps=path_step;
% Paths_Sampled(path_num).last_state=nextStateSim;
%updateQtable_IntegratedWithSimulatedPathTDLambda(nextStateSim, rewardSim, currentStateSim,actionSim,Model,MBParameters,Model,stateActionVisitCounts);
%end_path_simulation_iteration_loop
end
N_itr=N_itr+1;
%display(['simulation done, steps:' num2str(path_step)])
end
%updateQtable_IntegratedWithSimulatedPathBacktracking(path_num, Paths_Sampled,Qtable_Integrated,Model,MBParameters);
%path_num= path_num+1;
%end_simulation_iteration_loop
end