function [QTablePerm, steps] = DPBounded(MBParameters, Environment)
% DPBounded  Bounded dynamic programming with prioritized sweeping.
% Repeatedly backs up the state with the highest priority (or a state
% sampled by softmax over the priorities) against the model stored in
% Environment, until all priorities vanish or the step budget runs out.
%
% Inputs:
%   MBParameters.gamma            : discount factor
%   MBParameters.softMax_t        : softmax temperature for state selection
%   MBParameters.greedy           : if true, always back up the highest-priority state
%   MBParameters.MaxTotalSimSteps : maximum number of backups
%   Environment.Num_States, Environment.Num_Actions
%   Environment.reward{s,a}, Environment.ps{s,a}, Environment.nextState{s,a}
%                                 : rewards, transition probabilities and
%                                   successor states of the pair (s,a)
%   Environment.PreviousStates{s}, Environment.InversePs{s}
%                                 : predecessors of s and their transition
%                                   probabilities into s
%
% Outputs:
%   QTablePerm : the computed Q-table (Num_States x Num_Actions)
%   steps      : number of backups performed
%
% based on the code of:
% Jose Antonio Martin H. <jamartinh@fdi.ucm.es>
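%
% A minimal usage sketch (the field values below are hypothetical; the
% MBParameters and Environment structs are assumed to be built by the
% caller elsewhere):
%
%   MBParameters.gamma            = 0.95;
%   MBParameters.softMax_t        = 0.1;
%   MBParameters.greedy           = true;
%   MBParameters.MaxTotalSimSteps = 10000;
%   [Q, nsteps] = DPBounded(MBParameters, Environment);
%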
QTablePerm = zeros(Environment.Num_States, Environment.Num_Actions);
H = ones(1, Environment.Num_States);   % backup priority of each state
V = zeros(1, Environment.Num_States);  % state values, V(s) = max_a Q(s,a)
T = MBParameters.softMax_t;            % softmax temperature
steps = 0;
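
% Main sweep: pick a state (greedily or by softmax over the priorities H),
% perform a full Bellman backup of that state, then propagate the size of
% the value change to the state's predecessors as new priorities.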
while steps <= MBParameters.MaxTotalSimSteps
    steps = steps + 1;

    % Select the state to back up.
    if MBParameters.greedy
        [hmax, selectedstate] = max(H);
        if hmax == 0
            fprintf('converged in %d steps\n', steps);
            break;
        end
    else
        if all(H <= 0)
            break;
        end
        % Sample a state with probability proportional to exp(H/T).
        He = exp(H/T);
        selectedstate = randsample(Environment.Num_States, 1, true, He);
    end
    % Full Bellman backup of the selected state:
    %   Q(s,a) = sum_{s'} P(s'|s,a) * ( R(s,a,s') + gamma * V(s') )
    for action = 1:Environment.Num_Actions
        reward_s  = Environment.reward{selectedstate, action};
        Ps        = Environment.ps{selectedstate, action};
        nextState = Environment.nextState{selectedstate, action};
        ip        = V(nextState);            % values of the successor states
        gammaip   = MBParameters.gamma * ip; % discounted successor values
        if ~isempty(Ps)
            QTablePerm(selectedstate, action) = sum(Ps .* (reward_s + gammaip));
        end
    end
    % Update V(s) and measure the Bellman error of this backup.
    m = max(QTablePerm(selectedstate, :));
    d = abs(V(selectedstate) - m);
    V(selectedstate) = m;
    % Raise the priority of every predecessor of the selected state to the
    % Bellman error d, weighted by its transition probability into that state.
    for st_H = 1:Environment.Num_States
        idxst = (Environment.PreviousStates{selectedstate} == st_H);
        h = d * max(idxst .* Environment.InversePs{selectedstate});
        if st_H == selectedstate
            H(st_H) = h;               % the backed-up state gets a fresh priority
        else
            H(st_H) = max(h, H(st_H));
        end
    end
end
end