function Results =...
    Episode_WithReset_And_Statistics( inputVals)
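%EPISODE_WITHRESET_AND_STATISTICS  Run one learning episode through the
%   initial, drug, therapy, and post-drug phases, returning per-step traces
%   plus model/policy snapshots taken at each phase boundary.
%
%   Example (a hedged sketch: the field names below are the ones read in
%   this file, but the concrete values are illustrative only, and
%   myEnvironment is a placeholder for however the environment struct is
%   built in this repository):
%
%       inputVals.Environment        = myEnvironment; % struct with Num_States, Num_Actions, ...
%       inputVals.maxsteps           = 10000;
%       inputVals.start              = 1;     % initial state
%       inputVals.initDrugStartSteps = 2000;  % step at which the drug phase begins
%       inputVals.therapyStartSteps  = 5000;  % step at which therapy begins
%       inputVals.therapyEndSteps    = 8000;  % step at which therapy ends
%       inputVals.simulatedTherapy   = true;
%       inputVals.resetPolicyFactor  = 0.5;   % policy blend weight used at therapy start
%       inputVals.resetModelFactor   = 0.5;   % model blend weight (negative => punishDrugModel)
%       % ...plus the parametersMF, parametersMBFW, and parametersMBBW structs
%       Results = Episode_WithReset_And_Statistics(inputVals);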

% Q-table prior: zero means, a zeroed .time field, and an isotropic
% covariance (prior variance 1.5) over all state-action pairs.
QTablePerm.mean=zeros(inputVals.Environment.Num_States,inputVals.Environment.Num_Actions);
QTablePerm.time=zeros(inputVals.Environment.Num_States,inputVals.Environment.Num_Actions);
QTablePerm.var=1.5*eye(inputVals.Environment.Num_States*inputVals.Environment.Num_Actions);

% Per-step traces, preallocated for maxsteps entries.
last_actions=zeros(1,inputVals.maxsteps);
last_states=zeros(1,inputVals.maxsteps);
last_reward=zeros(1,inputVals.maxsteps);
lastMaxK=zeros(1,inputVals.maxsteps);
lastMaxVar=lastMaxK;
lastDreward=lastMaxK;
lastMA_noise_n=lastMaxK;
last_maxD=lastMaxK;
last_meanD=lastMaxK;


last_Q=zeros(inputVals.maxsteps,inputVals.Environment.Num_States,inputVals.Environment.Num_Actions);
priorCounts=4;   % prior pseudo-counts handed to CreateModel
copyTransitionsFromEnvironment=inputVals.parametersMBFW.knownTransitions;

Model=CreateModel(inputVals.Environment,priorCounts,copyTransitionsFromEnvironment);

% Phase-boundary snapshots of the model and Q-table; initialized here and
% overwritten when the corresponding phase is entered.
HealthyModel=Model;
AddictedModel=Model;
HealthyQ=QTablePerm;
AddictedQ=QTablePerm;
HealedModel=Model;
HealedQ=QTablePerm;
currentState=inputVals.start;
total_reward=0;
reset=1;   % reset flag handed to step; step returns it updated each call


% Action selection (epsilon-greedy) happens inside step.

% (state,action) visit counters maintained by step: real environment steps,
% internally simulated (replay) steps, and a second counter kept alongside.
stateActionVisitCounts=zeros(Model.Num_States,Model.Num_Actions);
stateActionVisitCountsSimul=stateActionVisitCounts;
stateActionVisitCounts2=stateActionVisitCounts;
j=0;                          % countdown for the periodic DP recompute below
stepsToComputeStatistics=0;   % countdown for the periodic statistics probe
for nStep=1:inputVals.maxsteps
    
    %% change environment phase (initial,drug,therapy,postDrug)
    if nStep==inputVals.initDrugStartSteps
        % Drug phase begins: snapshot the pre-drug ("healthy") model and policy.
        inputVals.Environment=changeToBaseReward(inputVals.Environment);
        HealthyQ=QTablePerm;
        HealthyModel=Model;
    elseif nStep==inputVals.therapyStartSteps
        % Therapy begins: snapshot the addicted model and policy, then
        % either blend back toward the healthy snapshots (simulated
        % therapy) or temporarily rescale the learning parameters.
        inputVals.Environment=changeToTherapyReward(inputVals.Environment);
        AddictedModel=Model;
        AddictedQ=QTablePerm;
        if inputVals.simulatedTherapy
            QTablePerm=combinePolicies(QTablePerm,HealthyQ,inputVals.resetPolicyFactor,inputVals.parametersMF);
            if inputVals.resetModelFactor>=0 && inputVals.resetModelFactor<=1
                Model=combineModels(AddictedModel,HealthyModel,inputVals.resetModelFactor);
            elseif inputVals.resetModelFactor<0
                % A negative reset factor selects a punishing variant of
                % the healthy model instead of a blend.
                Model=punishDrugModel(HealthyModel,inputVals.Environment,inputVals.resetModelFactor);
            end
        else
            % Save the original learning parameters so they can be
            % restored when therapy ends, then rescale them for therapy.
            originalMBlF=inputVals.parametersMBFW.modelLearningFactor;
            originalMFlF=inputVals.parametersMF.alpha;
            originalModelDecay=inputVals.parametersMBFW.modelDecay;
            inputVals.parametersMBFW.modelLearningFactor=inputVals.parametersMBFW.modelLearningFactor*inputVals.therapyModelLF;
            inputVals.parametersMBFW.modelDecay=2*inputVals.parametersMBFW.modelDecay;
            inputVals.parametersMF.alpha=inputVals.parametersMF.alpha*inputVals.therapyMFLFF;
        end
    end
    if nStep==inputVals.therapyEndSteps
        % Therapy ends: snapshot the post-therapy ("healed") model and
        % policy, and restore the original learning parameters if they
        % were rescaled above.
        inputVals.Environment=changeToBaseReward(inputVals.Environment);
        HealedModel=Model;
        HealedQ=QTablePerm;
        if ~inputVals.simulatedTherapy
            inputVals.parametersMBFW.modelLearningFactor=originalMBlF;
            inputVals.parametersMF.alpha=originalMFlF;
            inputVals.parametersMBFW.modelDecay=originalModelDecay;
        end
    end
    
    
    %% execute agent
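    % One real interaction with the environment: step selects an action in
    % currentState, applies it, and returns the observed reward and
    % new_state together with the updated Q-table, learned model, visit
    % counters, and reset flag.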
    
    
    [reward,action, new_state,~,~,~,~,~,~,QTablePerm,stateActionVisitCounts,reset,stateActionVisitCountsSimul,Model,stateActionVisitCounts2] =...
        step(inputVals.parametersMBBW.internalReplay,...
        inputVals.parametersMBFW.runInternalSimulation,...
        inputVals.parametersMBFW.updateModel,...
        inputVals.parametersMF.updateQTablePerm,...
        inputVals.parametersMBBW.internalReplay,...
        currentState,...
        Model,...
        inputVals.Environment,...
        QTablePerm,...
        stateActionVisitCountsSimul,...
        reset,...
        inputVals.parametersMF,...
        inputVals.parametersMBFW,...
        stateActionVisitCounts,...
        stateActionVisitCounts2);
    
    
    
    %% periodically recompute the policy with dynamic programming
    if inputVals.parametersMBFW.computePolicyWithDP
        if j==0
            % Every 100 steps: solve the current environment by DP and
            % save a snapshot of the environment to disk.
            QTablePerm.mean=DP(inputVals.parametersMBFW,inputVals.Environment);
            Environment=inputVals.Environment;
            save(strcat('Environment',num2str(nStep,'%06i'),'.mat'),'Environment')
            j=100;
        else
            j=j-1;
        end
    end
    
    % Record the per-step traces and accumulate the episode return.
    last_actions(nStep)=action;
    last_states(nStep)=currentState;
    last_reward(nStep)=reward;
    total_reward=total_reward+reward;
    
    % step's extra diagnostic outputs (maxK, maxVar, dreward, MA_noise_n,
    % maxdiffQp, meanQp) are discarded above, so lastMaxK, lastMaxVar,
    % lastDreward, lastMA_noise_n, last_maxD, and last_meanD remain the
    % preallocated zero vectors.
    
    last_Q(nStep,:,:)=QTablePerm.mean;
    currentState=new_state;
    
    
    
    Q=QTablePerm.mean;   % latest Q means; the final value is returned in Results.Q
    
    
    %% periodically probe every state to gather behaviour statistics
    if inputVals.parametersMBFW.computeStatistics
        if stepsToComputeStatistics<=0
            stepsToComputeStatistics=inputVals.parametersMBFW.periodToComputeStatistics;
            % Take statisticsStepsPerState probe steps from every state.
            for stateForStatistics=1:inputVals.Environment.Num_States
                for iStatisticsSteps=1:inputVals.parametersMBFW.statisticsStepsPerState
                    [reward1(iStatisticsSteps,stateForStatistics),...
                        action1(iStatisticsSteps,stateForStatistics),...
                        new_state1(iStatisticsSteps,stateForStatistics),~,~,~,~,~,~] = ...
                        step(inputVals.parametersMBBW.internalReplay,...
                        inputVals.parametersMBFW.runInternalSimulation,...
                        inputVals.parametersMBFW.updateModel,...
                        inputVals.parametersMF.updateQTablePerm,...
                        stateForStatistics,...
                        Model,...
                        inputVals.Environment,...
                        QTablePerm,...
                        stateActionVisitCountsSimul,...
                        reset,...
                        inputVals.parametersMF,...
                        inputVals.parametersMBFW,...
                        stateActionVisitCounts,...
                        stateActionVisitCounts2);
                    
                end
            end
        else
            stepsToComputeStatistics=stepsToComputeStatistics-1;
        end
    end
end


% Pack the traces, snapshots, and final structures into one Results struct.
Results.total_reward=total_reward;
Results.i=nStep;   % index of the last executed step
Results.Q=Q;
Results.Model=Model;
Results.last_actions=last_actions;
Results.last_states=last_states;
Results.last_reward=last_reward;
Results.last_Q=last_Q;
Results.lastMaxK=lastMaxK;
Results.lastMaxVar=lastMaxVar;
Results.lastDreward=lastDreward;
Results.lastMA_noise_n=lastMA_noise_n;
Results.last_maxD=last_maxD;
Results.last_meanD=last_meanD;
Results.HealthyModel=HealthyModel;
Results.HealthyQ=HealthyQ;
Results.AddictedModel=AddictedModel;
Results.AddictedQ=AddictedQ;
Results.HealedModel=HealedModel;
Results.HealedQ=HealedQ;
end
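
% -------------------------------------------------------------------------
% Illustrative sketches (NOT called above). combinePolicies, combineModels,
% and punishDrugModel are defined elsewhere in this repository; the local
% functions below only sketch one plausible convex-combination semantics,
% under different names so they cannot shadow the real implementations.
% Q.mean and Q.var follow this file's usage; Model.counts is a hypothetical
% field name.

function Q=sketchCombinePolicies(Qaddicted,Qhealthy,f,~)
% Blend two Q-tables: f=0 keeps the addicted policy, f=1 restores the
% healthy one (assumed meaning of inputVals.resetPolicyFactor). The fourth
% argument mirrors the parametersMF argument of the real helper and is
% ignored here.
Q=Qaddicted;
Q.mean=(1-f)*Qaddicted.mean+f*Qhealthy.mean;
Q.var =(1-f)*Qaddicted.var +f*Qhealthy.var;
end

function Model=sketchCombineModels(AddictedModel,HealthyModel,f)
% Blend the learned transition statistics with the same weight, assuming a
% transition-count array Model.counts (hypothetical field name).
Model=AddictedModel;
Model.counts=(1-f)*AddictedModel.counts+f*HealthyModel.counts;
end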