function [QTablePerm, steps] = DPBounded(MBParameters, Environment)
% DPBounded  Bounded dynamic programming with prioritized sweeping.
% Repeatedly backs up the state with the highest priority (or a state
% sampled by softmax over the priorities) against the model stored in
% Environment, until all priorities vanish or the step budget runs out.
%
% Inputs:
%   MBParameters.gamma            : discount factor
%   MBParameters.softMax_t        : softmax temperature for state selection
%   MBParameters.greedy           : if true, always back up the highest-priority state
%   MBParameters.MaxTotalSimSteps : maximum number of backups
%   Environment.Num_States, Environment.Num_Actions
%   Environment.reward{s,a}, Environment.ps{s,a}, Environment.nextState{s,a}
%                                 : rewards, transition probabilities and
%                                   successor states of the pair (s,a)
%   Environment.PreviousStates{s}, Environment.InversePs{s}
%                                 : predecessors of s and their transition
%                                   probabilities into s
%
% Outputs:
%   QTablePerm : the computed Q-table (Num_States x Num_Actions)
%   steps      : number of backups performed
%
% based on the code of:
% Jose Antonio Martin H. <jamartinh@fdi.ucm.es>
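%
% A minimal usage sketch (the field values below are hypothetical; the
% MBParameters and Environment structs are assumed to be built by the
% caller elsewhere):
%
%   MBParameters.gamma            = 0.95;
%   MBParameters.softMax_t        = 0.1;
%   MBParameters.greedy           = true;
%   MBParameters.MaxTotalSimSteps = 10000;
%   [Q, nsteps] = DPBounded(MBParameters, Environment);
%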
QTablePerm = zeros(Environment.Num_States, Environment.Num_Actions);
H = ones(1, Environment.Num_States);   % backup priority of each state
V = zeros(1, Environment.Num_States);  % state values, V(s) = max_a Q(s,a)
T = MBParameters.softMax_t;            % softmax temperature
steps = 0;
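
% Main sweep: pick a state (greedily or by softmax over the priorities H),
% perform a full Bellman backup of that state, then propagate the size of
% the value change to the state's predecessors as new priorities.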
while steps <= MBParameters.MaxTotalSimSteps
    steps = steps + 1;

    % Select the state to back up.
    if MBParameters.greedy
        [hmax, selectedstate] = max(H);
        if hmax == 0
            fprintf('converged in %d steps\n', steps);
            break;
        end
    else
        if all(H <= 0)
            break;
        end
        % Sample a state with probability proportional to exp(H/T).
        He = exp(H/T);
        selectedstate = randsample(Environment.Num_States, 1, true, He);
    end
    % Full Bellman backup of the selected state:
    %   Q(s,a) = sum_{s'} P(s'|s,a) * ( R(s,a,s') + gamma * V(s') )
    for action = 1:Environment.Num_Actions
        reward_s  = Environment.reward{selectedstate, action};
        Ps        = Environment.ps{selectedstate, action};
        nextState = Environment.nextState{selectedstate, action};
        ip        = V(nextState);            % values of the successor states
        gammaip   = MBParameters.gamma * ip; % discounted successor values
        if ~isempty(Ps)
            QTablePerm(selectedstate, action) = sum(Ps .* (reward_s + gammaip));
        end
    end
    % Update V(s) and measure the Bellman error of this backup.
    m = max(QTablePerm(selectedstate, :));
    d = abs(V(selectedstate) - m);
    V(selectedstate) = m;
    % Raise the priority of every predecessor of the selected state to the
    % Bellman error d, weighted by its transition probability into that state.
    for st_H = 1:Environment.Num_States
        idxst = (Environment.PreviousStates{selectedstate} == st_H);
        h = d * max(idxst .* Environment.InversePs{selectedstate});
        if st_H == selectedstate
            H(st_H) = h;               % the backed-up state gets a fresh priority
        else
            H(st_H) = max(h, H(st_H));
        end
    end
end
end