/*
  Copyright (c) 2011 Paul Richmond, University of Sheffield, UK; 
  all rights reserved unless otherwise stated.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  In addition to the regulations of the GNU General Public License,
  publications and communications based in part on this program or on
  parts of this program are required to cite the article 
  "Democratic population decisions result in robust policy-gradient 
  learning: a parametric study with GPU simulations" by Paul Richmond, 
  Lars Buesing, Michele Giugliano and Eleni Vasilaki, PLoS ONE Neuroscience, 
  under review. 

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
  MA 02111-1307 USA
*/


// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <time.h>
#include <float.h>
#include <cutil_inline.h>

// includes, project
#include "reduction.cuh"
#include "model.h"
#include "model.cuh"
#include "output.cuh"

////////////////////////////////////////////////////////////////////////////////
//Macro definitions
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#define MAX_NEURONS MAX(M, N)
#define MAX_RAND MAX_NEURONS //MAX(MAX_NEURONS, SM_CHUNK_SIZE*SM_CHUNK_SIZE)	//factor of T required for no dyn, N*M for weight update noise



bool printWeightPlot;

////////////////////////////////////////////////////////////////////////////////
// Thread/Grid blocks
dim3 m_threads;	
dim3 m_grid;

dim3 n_threads;	
dim3 n_grid;

dim3 nN_threads;
dim3 nN_grid;

dim3 mN_threads;
dim3 mN_grid;

dim3 NT_matrix_threads;	
dim3 NT_matrix_grid;	
int NT_grid_width;

dim3 NT_threads;
dim3 NT_grid;

dim3 MN_matrix_threads;
dim3 MN_matrix_grid;
int MN_grid_width;

dim3 MN_threads;
dim3 MN_grid;

//////////////////////////////////////////////////////
//Persistent global variables
float* d_Wr;
float* d_W;
float* d_W_out;
float* d_Wt;
//Step variables
float* h_In_Deg;	//host data
float* d_In_Deg;
float* d_rew;
int* d_decision_offset;
int* d_target_offset;
//Trial variables
float* d_u;
float* d_u_out;
float* d_Y;
float* d_Yt;
float* d_Y_sum;
float* d_YProbt;
float* d_input_pot;
float* d_lateral_pot;
float* d_YProb;
float* d_X;	
float* d_Input;	
float* d_epsp;
float* d_Grad;
float* d_out_x;
float* d_out_x_sum;
float* d_out_y;
float* d_out_y_sum;
rand48seeds* d_randSeeds;
magic_numbers mn;

//data used for producing gradient graphs
float* h_W;
float* d_sumGrad;
float* d_sumDeltaW;
float* d_sumGrad_out;
float* d_sumDeltaW_out;
float* h_sumGrad;
float* h_sumDeltaW;



/**	Reward function: Gaussian or box reward over the circular error x */
float reward(float x, int config)
{
	float reward = 0;

	//Gaussian reward
	if (reward_func == GAUSSIAN)
	{
		if (x<PI)
			reward = exp(-pow(x, 2.0f)/(2.0f*pow(sigma_R[config], 2.0f)));
		else
			reward = exp(-pow(2.0f*PI-x, 2.0f)/(2.0f*pow(sigma_R[config], 2.0f))); 

	}
	//box reward
	else if (reward_func == BOX)
	{
		if (x<PI)
			reward = (float)(x<sigma_R_box[config])*(x>-sigma_R_box[config]);
		else
			reward = (float)((2.0f*PI-x)<sigma_R_box[config]) * ((2.0f*PI-x)>-sigma_R_box[config]);
	}

	return reward;
}
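
/* A minimal worked illustration of the Gaussian branch above (hypothetical
   parameter value, assuming sigma_R[0] = 0.5f):

     reward(0.0f, 0);            // exp(0) = 1.0f: perfect aim
     reward(0.1f, 0);            // exp(-0.01f/0.5f) ~= 0.98f: small error
     reward(2.0f*PI - 0.1f, 0);  // same value: errors above PI are wrapped
                                 // back onto the circle as 2.0f*PI - x

   The box branch behaves analogously: 1 inside the wrapped interval of
   half-width sigma_R_box[config], 0 outside it. */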


/** Circular error: distances above PI wrap around to 2*PI - x */
float errorFunc(float x)
{

	float err;
	
	if (x<PI)
		err = x;
	else
		err = (2.0f*PI-x);

	return err;
}

/* Mod function using the sign of the divisor (as in Python's % operator) */
float py_modf(float n, float a)
{
	float r = fmodf(n, a);
	float sign = (a > 0) ? 1.0f : -1.0f;
	if (r * sign < 0) r += a;
	return r;
}
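
/* Example of the Python-style semantics above (angles in radians):

     fmodf(-1.0f, 2.0f*PI);    // == -1.0f        (C: sign follows the dividend)
     py_modf(-1.0f, 2.0f*PI);  // == 2*PI - 1.0f  (Python: sign follows the divisor)

   This keeps decoded angles in [0, 2*PI), matching Python's % operator. */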


void copyParametersToDevice()
{
	cutilSafeCall( cudaMemcpyToSymbol(beta, &h_beta, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(xsc, &h_xsc, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(u0, &h_u0, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(w0, &h_w0, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(eta, &h_eta, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(baseline, &h_baseline, sizeof(float)*ind_configs));

	cutilSafeCall( cudaMemcpyToSymbol(tau, &h_tau, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(lambda, &h_lambda, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(urest, &h_urest, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(threshold, &h_threshold, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(du, &h_du, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(rho0, &h_rho0, sizeof(float)*ind_configs));
	cutilSafeCall( cudaMemcpyToSymbol(ref, &h_ref, sizeof(float)*ind_configs));
}
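
/* A minimal sketch of the host/device parameter pairing assumed above (the
   actual declarations live in model.h / model.cuh; the convention here is
   that h_xxx on the host mirrors the device symbol xxx, and MAX_CONFIGS is
   a hypothetical stand-in for the real array bound):

     __constant__ float beta[MAX_CONFIGS];  // device-side constant symbol
     extern float h_beta[MAX_CONFIGS];      // host-side mirror

   cudaMemcpyToSymbol then copies one value per independent parameter
   configuration into the device symbol. */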


/** initLearnStepData
 * Initialises all device data used during the learn_step function.
 * Initialises the weight matrices W (+W_out) and Wr on the device. Wr holds static lateral connection weights; W is set to zero. 
 * A separate set of data is created for each independent trial, excluding Wr which is shared across all trials.
 * A set of host data is allocated (h_In_Deg) which is used in the learn step to create random goals.
 */
void initLearnStepData()
{
	//allocate W matrix on host to save weights later on
	if (printWeightPlot)
	{
		h_W = (float*) malloc(N*M*sizeof(float)*ind_trials);
	}

	//allocate weight data
	cudaMalloc( (void**) &d_Wr, N*N*sizeof(float)*ind_configs );
	cudaMalloc( (void**) &d_W, M*N*sizeof(float)*ind_trials);
	cudaMalloc( (void**) &d_W_out, M*N*sizeof(float) *ind_trials);
	cudaMalloc( (void**) &d_Wt, M*N*sizeof(float) *ind_trials);

	//init lateral connection weights and copy to device
	float* h_Wr = (float*) malloc(N*N*sizeof(float)*ind_configs);
	for(int i=0; i<N; i++)
	{
		for (int j=0; j<N; j++)
		{
			float dist = fabsf((float)(j-i));	//linear distance between cells
			dist = dist*(dist<(N/2)) + (N-dist)*(dist>=(N/2));	//wrap onto the ring (circular distance)

			//Wr varies for each set of configuration parameters
			for(int k=0; k<ind_configs; k++)
			{
				h_Wr[j+(i*N) + (k*N*N)] =	(expf(-powf(dist,2.0f)/(2.0f*powf(sig_p[k],2.0f)))*w_E[k]  - 0.9f)*rsc[k];		//j+(i*N) possibly should be i+(j*N)
			}
		}
	}
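
	/* Example of the circular distance above, assuming N = 100 (hypothetical):
	   neurons i = 0 and j = 99 are 99 apart linearly but only 1 apart on the
	   ring, so dist = N - 99 = 1. The Gaussian term then gives strong local
	   excitation (width sig_p[k], gain w_E[k]) against the uniform -0.9
	   inhibition, i.e. a centre-surround lateral weight profile. */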
	cutilSafeCall( cudaMemcpy(d_Wr, h_Wr, N*N*sizeof(float)*ind_configs, cudaMemcpyHostToDevice) );
	free(h_Wr);

	//Set action cell weights to zero
	cutilSafeCall( cudaMemset(d_W, 0,M*N*sizeof(float)*ind_trials));
	cutilSafeCall( cudaMemset(d_W_out, 0,M*N*sizeof(float)*ind_trials));
	cutilSafeCall( cudaMemset(d_Wt, 0,M*N*sizeof(float)*ind_trials));

	//allocate global learnstep variables
	cudaMalloc( (void**) &d_In_Deg, Tr_MAX*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_rew, sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_decision_offset, sizeof(int)*ind_trials );
	cudaMalloc( (void**) &d_target_offset, sizeof(int)*ind_trials );
	

	//allocate trial specific data
	cudaMalloc( (void**) &d_u, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_u_out, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_Y, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_Yt, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_Y_sum, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_YProb, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_YProbt, T*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_input_pot, T*N*sizeof(float)*ind_trials );	//factor of T only required for no dyn system
	cudaMalloc( (void**) &d_lateral_pot, N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_X, T*M*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_Input, T*M*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_epsp, T*M*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_Grad, M*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_out_x, N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_out_x_sum, N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_out_y, N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_out_y_sum, N*sizeof(float)*ind_trials );

	//init host data used to create a set of random goals 
	h_In_Deg = (float*) malloc(Tr_MAX*sizeof(float)*ind_trials);

	//init random number stream data using the rand48 algorithm
	rand48seeds* h_randSeeds;
	h_randSeeds = (rand48seeds*) malloc(MAX_RAND*sizeof(rand48seeds)*ind_trials);
	cudaMalloc( (void**) &d_randSeeds, MAX_RAND*sizeof(rand48seeds)*ind_trials);
	initCUDARand48(MAX_RAND*ind_trials, h_randSeeds, d_randSeeds, mn);
	free(h_randSeeds);
}

void initGraphAnalysis()
{
	//malloc data on host
	h_sumGrad = (float*)malloc(M*N*sizeof(float)*ind_trials);
	h_sumDeltaW = (float*)malloc(M*N*sizeof(float)*ind_trials);
	
	//malloc data on device
	cudaMalloc( (void**) &d_sumGrad, M*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_sumDeltaW, M*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_sumGrad_out, M*N*sizeof(float)*ind_trials );
	cudaMalloc( (void**) &d_sumDeltaW_out, M*N*sizeof(float)*ind_trials );
}

void resetGraphAnalysis()
{
	//memset device data to 0
	cutilSafeCall( cudaMemset(d_sumGrad, 0,M*N*sizeof(float)*ind_trials));
	cutilSafeCall( cudaMemset(d_sumDeltaW, 0,M*N*sizeof(float)*ind_trials));
	cutilSafeCall( cudaMemset(d_sumGrad_out, 0,M*N*sizeof(float)*ind_trials));
	cutilSafeCall( cudaMemset(d_sumDeltaW_out, 0,M*N*sizeof(float)*ind_trials));
}

void cleanupGraphAnalysis()
{
	//free host memory
	free(h_sumGrad);
	free(h_sumDeltaW);

	//free device memory
	cudaFree(d_sumGrad);
	cudaFree(d_sumDeltaW);
	cudaFree(d_sumGrad_out);
	cudaFree(d_sumDeltaW_out);
}


/** cleanupLearnStep
 *	Frees all data on host and device which is used in the learn step.
 */
void cleanupLearnStep()
{
	//dealloc learn step data
	free(h_In_Deg); 
	cudaFree(d_In_Deg);
	cudaFree(d_rew);
	cudaFree(d_decision_offset);
	cudaFree(d_target_offset);
	

	//dealloc trial data
	cudaFree (d_randSeeds);
	cudaFree(d_u);
	cudaFree(d_u_out);
	cudaFree(d_Y);
	cudaFree(d_Yt);
	cudaFree(d_Y_sum);
	cudaFree(d_YProb);
	cudaFree(d_YProbt);
	cudaFree(d_input_pot);
	cudaFree(d_lateral_pot);
	cudaFree(d_X);
	cudaFree(d_Input);
	cudaFree(d_epsp);
	cudaFree(d_Grad);
	cudaFree(d_out_x);
	cudaFree(d_out_x_sum);
	cudaFree(d_out_y);
	cudaFree(d_out_y_sum);

	//dealloc weight data
	if (printWeightPlot)
		free(h_W);
	cudaFree( d_W);
	cudaFree( d_W_out);
	cudaFree( d_Wr);
	cudaFree( d_Wt);

}

/** calculateGridBlockSizes
 * All grid block sizes are pre-calculated before the learn step
 */
void calculateGridBlockSizes()
{
	//M total threads with a max block size of M
	m_threads = dim3(M_BLOCK_SIZE, 1, 1);	
	m_grid = dim3(M/M_BLOCK_SIZE, ind_trials, 1);

	//N total threads with a max block size of N
	n_threads = dim3(N_BLOCK_SIZE, 1, 1);	
	n_grid = dim3(N/N_BLOCK_SIZE, ind_trials, 1);

	//n block size by N total threads
	nN_threads = dim3(N_BLOCK_SIZE, 1 , 1);
	nN_grid = dim3(N, ind_trials, 1);

	//m block size by N total threads
	mN_threads = dim3(M_BLOCK_SIZE,1 , 1);
	mN_grid = dim3(N, ind_trials, 1);

	//NxT total threads with SM_CHUNK_SIZE^2 threads per block
	NT_matrix_threads = dim3(SM_CHUNK_SIZE, SM_CHUNK_SIZE, 1);	
	NT_matrix_grid = dim3(N*T/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
	NT_grid_width = N/SM_CHUNK_SIZE;

	//N*T total threads
	NT_threads = dim3(SM_CHUNK_SIZE *SM_CHUNK_SIZE, 1, 1);	
	NT_grid = dim3(N*T/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);

	//NxM total threads with SM_CHUNK_SIZE^2 threads per 2D block
	//both x and y of the logical 2D grid are held within the x dimension of the grid (requires mod and divide in kernel)
	MN_matrix_threads = dim3(SM_CHUNK_SIZE, SM_CHUNK_SIZE, 1);
	MN_matrix_grid = dim3(M*N/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
	MN_grid_width = M/SM_CHUNK_SIZE;

	//NxM total threads with SM_CHUNK_SIZE^2 threads per 1D block
	MN_threads = dim3(SM_CHUNK_SIZE*SM_CHUNK_SIZE, 1, 1);
	MN_grid = dim3(M*N/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
}
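
/* Worked example of the 2D tiling above (hypothetical sizes, assuming
   SM_CHUNK_SIZE = 16 and N = 256): NT_matrix_threads is a 16x16 block,
   NT_grid_width = 256/16 = 16, and block b of NT_matrix_grid presumably
   covers the 16x16 tile at column (b % NT_grid_width) and row
   (b / NT_grid_width) of the N x T matrix. Note that all of these launch
   shapes assume the problem sizes divide the block sizes exactly. */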

/** learn_step
 *	Runs Tr trials, accumulating per-trial reward and error into the
 *	total_reward and total_error arrays (one entry per independent trial).
 */
template <LEARNING learning, DYNAMICS dynamics, PROFILING profiling>
void learn_step(int Tr, float* total_reward, float* total_error)
{
	

	//global learn step variables
	for (int i=0; i<ind_trials; i++)
	{
		total_reward[i] = 0;
		total_error[i] = 0;
	}

	srand ( (unsigned int)time(NULL) );
	for (int i=0;i<Tr*ind_trials;i++)
	{
		if (moving_target)
			h_In_Deg[i] = ((float)rand()/RAND_MAX)*2.0f*PI;
		else
			h_In_Deg[i] = static_target;
	}
	cutilSafeCall( cudaMemcpy(d_In_Deg, h_In_Deg, Tr*sizeof(float)*ind_trials, cudaMemcpyHostToDevice) );


	//allocate data for theta
	float* theta = (float*)malloc(ind_trials*sizeof(float));
	float* rew = (float*)malloc(ind_trials*sizeof(float));
	int* decision_offset = (int*)malloc(ind_trials*sizeof(int));
	int* target_offset = (int*)malloc(ind_trials*sizeof(int));

	for (int n=0; n<Tr; n++)
	{


		//set initial trial specific data
		cutilSafeCall( cudaMemset(d_Y, 0, T*N*sizeof(float)*ind_trials) );
		resetMembranePotential<<<NT_grid, NT_threads>>>(d_u, d_u_out);
		
		//calculate place cell distance and activations and output
		poissonNeuronSimulation<<<m_grid, m_threads>>>(n, Tr, d_X, d_Input, d_epsp, d_In_Deg, d_randSeeds, mn);
		cutilCheckMsg("Error in kernel\n");

		//for T time steps (must be performed serially as the lateral activations use data from t-1)
		for (int t=1; t<T; t++)
		{
			//calculate the place cell activations
			placeCellSpikePropagation<<<mN_grid, mN_threads>>>(d_W, d_Input+(t*M), d_input_pot);
			cutilCheckMsg("Error in kernel\n");

			//check if we are using dynamic simulation for lateral connections (if so calculate them)
			if (dynamics == DYN_SYS){
				//calculate the action cell lateral interactions
				actionCellLateralSpikePropagation<<<nN_grid, nN_threads>>>(d_Wr, d_Y+((t-1)*N), d_lateral_pot);
				cutilCheckMsg("Error in kernel\n");
			}

			//calculate the action cell spikes
			integrateAndFireNeuronSimulation<dynamics><<<n_grid, n_threads>>>(t, d_u, d_u_out, d_input_pot, d_lateral_pot, d_YProb, d_Y, d_randSeeds, mn);
			cutilCheckMsg("Error in kernel\n");

		
			//swap output for input (double buffering: the next time step reads the potentials just written)
			float* d_u_temp;
			d_u_temp = d_u;
			d_u = d_u_out;
			d_u_out = d_u_temp;

		}
		

		//transpose Y
		transpose<N, T><<<NT_matrix_grid, NT_matrix_threads>>>(d_Y, d_Yt, NT_grid_width);
		cutilCheckMsg("Error in kernel\n");

		//sum the spikes of Y over time for each of the N neurons: N*ind_trials parallel reductions in total
		reduceMultipleArrays<float>(T, d_Yt, d_Y_sum, N*ind_trials);
		cutilCheckMsg("Error in kernel\n");

		//calculate output angle components
		calculatePopulationVector<<<n_grid, n_threads>>>(d_Y_sum, d_out_x, d_out_y);
		cutilCheckMsg("Error in kernel\n");

		//sum output components
		reduceMultipleArrays<float>(N, d_out_x, d_out_x_sum, ind_trials);
		cutilCheckMsg("Error in kernel\n");
		reduceMultipleArrays<float>(N, d_out_y, d_out_y_sum, ind_trials);
		cutilCheckMsg("Error in kernel\n");


		//calculate angle by reading back sum totals to the CPU for each ind trial
		for (int i=0; i<ind_trials; i++)
		{
			int configuration_offset = pow2mod(i, ind_configs);
			float h_out_x_sum = 0;
			float h_out_y_sum = 0;
			int d_out_sum_offset = N*i;
			int d_In_Deg_offset = Tr*i;
			cutilSafeCall( cudaMemcpy( &h_out_x_sum, d_out_x_sum+d_out_sum_offset, sizeof(float), cudaMemcpyDeviceToHost) );
			cutilSafeCall( cudaMemcpy( &h_out_y_sum, d_out_y_sum+d_out_sum_offset, sizeof(float), cudaMemcpyDeviceToHost) );

			//calculate theta
			theta[i] = py_modf(atan2(h_out_x_sum, h_out_y_sum), 2.0f*PI);
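
			/* Population vector decoding (sketch): each action cell is assumed to
			   vote with its summed spike count along its preferred direction, so
			   the component sums read back above have the form
			     x = sum_n Y_sum[n] * f_x(phi_n),   y = sum_n Y_sum[n] * f_y(phi_n)
			   (computed in calculatePopulationVector), and atan2 recovers the
			   population's decoded angle theta on [0, 2*PI). */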

			//calculate reward
			rew[i] = reward(fabsf(theta[i]-h_In_Deg[n+d_In_Deg_offset]), configuration_offset);

			//update total reward
			total_reward[i] += rew[i];
			total_error[i] += (fabs(h_In_Deg[n+d_In_Deg_offset] - theta[i]));

			//calculate the decision and target offsets
			if (profiling == GRAPHING){
				decision_offset[i] = (int)floorf(((theta[i]+PI)*N)/(2.0f*PI));
				target_offset[i] = (int)floorf(((h_In_Deg[n+d_In_Deg_offset]+PI)*N)/(2.0f*PI));
			}
		}

		//copy rewards to device
		CUDA_SAFE_CALL( cudaMemcpy( d_rew, rew, ind_trials*sizeof(float), cudaMemcpyHostToDevice));
		
		//copy decision and target offsets to the device
		if (profiling == GRAPHING){
			CUDA_SAFE_CALL( cudaMemcpy( d_decision_offset, decision_offset, ind_trials*sizeof(int), cudaMemcpyHostToDevice));
			CUDA_SAFE_CALL( cudaMemcpy( d_target_offset, target_offset, ind_trials*sizeof(int), cudaMemcpyHostToDevice));
		}
		
		/* only perform the following if learning or calculating the gradient graph */
		if ((learning == UPDATE_LEARNING_WEIGHTS)||(profiling == GRAPHING))
		{
			//transpose rho
			transpose<N,T><<<NT_matrix_grid, NT_matrix_threads>>>(d_YProb, d_YProbt, NT_grid_width);
			cutilCheckMsg("Error in kernel\n");

			//calculate gradient
			calculateGradiant<<<MN_matrix_grid, MN_matrix_threads>>>(MN_grid_width, d_Yt, d_YProbt, d_epsp, d_Grad);
			cutilCheckMsg("Error in kernel\n");
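
			/* The quantity computed by calculateGradiant is assumed to be the
			   spike-based likelihood gradient accumulated over the trial,
			     Grad[n][m] ~ sum_t (Y[t][n] - YProb[t][n]) * epsp[t][m],
			   i.e. (actual spike minus firing probability) weighted by the
			   presynaptic EPSP trace. */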

			if (profiling == GRAPHING)
			{
				//update the gradient and deltaW sum totals
				updateGradiantAnalysis<<<MN_grid, MN_threads>>>(d_decision_offset, d_target_offset, d_Grad, d_rew, d_sumGrad, d_sumGrad_out, d_sumDeltaW, d_sumDeltaW_out); 
				cutilCheckMsg("Error in kernel\n");
				//swap sumGrad input and output
				float* d_sumGrad_temp;
				d_sumGrad_temp = d_sumGrad;
				d_sumGrad = d_sumGrad_out;
				d_sumGrad_out = d_sumGrad_temp;
				//swap deltaW input and output
				float* d_sumDeltaW_temp;
				d_sumDeltaW_temp = d_sumDeltaW;
				d_sumDeltaW = d_sumDeltaW_out;
				d_sumDeltaW_out = d_sumDeltaW_temp;

			}
			//no learning if we are calculating the gradient graph
			else if (learning == UPDATE_LEARNING_WEIGHTS)
			{
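				/* The update applied by updateLearningWeights is assumed to take the
				   reward-modulated policy-gradient form
				     W_out = W + eta * (rew - baseline) * Grad,
				   with the per-configuration constants eta and baseline copied to the
				   device in copyParametersToDevice(). */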
				//update learning weights
				updateLearningWeights<<<MN_grid, MN_threads>>>(d_W, d_rew, d_Grad, d_W_out);
				cutilCheckMsg("Error in kernel\n");

				if (APPLY_NOISE)
				{
					//apply noise (this will also swap input and output)
					applyNoise<<<m_grid, m_threads>>>(d_W_out, d_W, d_randSeeds, mn);
					cutilCheckMsg("Error in kernel\n");
				}
				else{
					//swap the input and output pointer
					float* d_W_temp;	//used to swap input and output
					d_W_temp = d_W;
					d_W = d_W_out;
					d_W_out = d_W_temp;
				}
			}
		}
	}

	//cleanup host-side per-trial buffers
	free(theta);
	free(rew);
	free(decision_offset);
	free(target_offset);
}
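
/* Usage sketch (hypothetical call, mirroring those in learn_curve below):
   run 100 analysis-only trials of the dynamic system without weight updates,
   accumulating reward and error per independent trial:

     float* r = (float*)malloc(ind_trials*sizeof(float));
     float* e = (float*)malloc(ind_trials*sizeof(float));
     learn_step<ANAYLSYS_ONLY, DYN_SYS, SIMULATION>(100, r, e);
     free(r); free(e);
*/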



void graphAnalysisDataToHost()
{
	printf("Copying graph analysis data from device to host\n");
	cutilSafeCall( cudaMemcpy(h_sumGrad, d_sumGrad, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
	cutilSafeCall( cudaMemcpy(h_sumDeltaW, d_sumDeltaW, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
}

void weightsToHost(){
	printf("Copying weight data from device to host\n");
	cutilSafeCall( cudaMemcpy(h_W, d_W, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
}


/** learn_curve
 * Runs the simulation
 */
template<PROFILING profiling, DYNAMICS dynamics> 
void learn_curve() 
{

	//init model
	copyParametersToDevice();
	initLearnStepData();
	calculateGridBlockSizes();
	if ((profiling == SIMULATION_EXTENDED_ANALYSIS)||(profiling == GRAPHING)){
		initGraphAnalysis();
		printf("Analysis mode....\n");
	}

	//start a timer
    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

	//allocate arrays for reward data
	float* total_analysis_rewards = (float*) malloc(ind_trials*no_intval*sizeof(float));
	float* total_analysis_errors = (float*) malloc(ind_trials*no_intval*sizeof(float));
	float* total_reward = (float*) malloc(ind_trials*sizeof(float));
	float* total_error = (float*) malloc(ind_trials*sizeof(float));

	//calculate the number of learn trials (the number of trials is scaled for the no-dynamics case)
	int learn_trials = 0;
	if (dynamics == DYN_SYS)
		learn_trials = learn_trials_dyn;
	else //NO_DYN_SYS
		learn_trials = learn_trials_no_dyn;

	int simulation_analysis_trials;
	if (profiling == SIMULATION)
		simulation_analysis_trials = analysis_trials;
	else //SIMULATION_EXTENDED_ANALYSIS
		simulation_analysis_trials = gradiant_analsyis_trials;

	
	//PROFILE
	if(profiling == PROFILE_ONLY)
	{
		if (dynamics==DYN_SYS)
			printf("Running dyn sys profiling:\n 1 step\n single trial\n %i ind configs\n %i ind trials per config\n", ind_configs, trials_per_config);
		if (dynamics==NO_DYN_SYS)
			printf("Running no dyn sys profiling:\n 1 step\n single trial\n %i ind configs\n %i ind trials per config\n", ind_configs, trials_per_config);

		learn_step<UPDATE_LEARNING_WEIGHTS, dynamics, PROFILE_ONLY>(1, total_reward, total_error);
	}
	//CREATE GRADIENT GRAPH: no learning
	else if(profiling == GRAPHING)
	{
		if (dynamics==DYN_SYS)
			printf("Running dyn sys graph analysis:\n 1 step\n %i trials\n %i ind configs\n %i ind trials per config\n", gradiant_analsyis_trials, ind_configs, trials_per_config);
		if (dynamics==NO_DYN_SYS)
			printf("Running no dyn sys graph analysis:\n 1 step\n %i trials\n %i ind configs\n %i ind trials per config\n", gradiant_analsyis_trials, ind_configs, trials_per_config);

		//reset values
		resetGraphAnalysis();

		//perform simulation
		learn_step<ANAYLSYS_ONLY, dynamics, GRAPHING>(gradiant_analsyis_trials, total_reward, total_error);

		//copy graph analysis data to host
		graphAnalysisDataToHost();

		//output graphs
		printf("Saving graph analysis data...\n");
		saveGraphAnalysisData<dynamics>(h_sumGrad, h_sumDeltaW);
	}
	//SIMULATION
	else	
	{
		if (dynamics==DYN_SYS)
			printf("Running dyn sys simulation:\n %i steps\n %i analysis trials\n %i learning trials\n %i ind configs\n %i ind trials per config\n", no_intval, simulation_analysis_trials, learn_trials, ind_configs, trials_per_config);
		if (dynamics==NO_DYN_SYS)
			printf("Running no dyn sys simulation:\n %i steps\n %i analysis trials\n %i learning trials\n %i ind configs\n %i ind trials per config\n", no_intval, simulation_analysis_trials, learn_trials, ind_configs, trials_per_config);
		
		for (int m=0; m<no_intval; m++)
		{
			printf("Starting Step %i of %i\n", (m+1), no_intval);
			float* no_intval_reward = &total_analysis_rewards[m*ind_trials];
			float* no_intval_error = &total_analysis_errors[m*ind_trials];

			//Perform analysis
			if (profiling == SIMULATION)
			{
				printf("Stage 1: Performing analysis...\n");
				//perform simulation analysis
				learn_step<ANAYLSYS_ONLY, dynamics, SIMULATION>(analysis_trials, no_intval_reward, no_intval_error);
			}
			else //SIMULATION_EXTENDED_ANALYSIS
			{
				printf("Performing extended analysis...\n");
				
				//reset values
				resetGraphAnalysis();
				
				//perform extended simulation analysis
				learn_step<ANAYLSYS_ONLY, dynamics, GRAPHING>(gradiant_analsyis_trials, no_intval_reward, no_intval_error);

				//copy graph analysis data to host and produce the graph data for the current step
				graphAnalysisDataToHost();
				printf("Saving graph analysis data for step %i...\n", (m+1));
				saveGraphAnalysisData<dynamics>(h_sumGrad, h_sumDeltaW, (m+1));
			}

			//Print analysis
			for(int j=0; j<ind_configs; j++)
			{
				for(int i=0; i<trials_per_config; i++)
				{
					printf("Step %i, Config %i, Ind trial no: %i: Av Reward is %f, Av Error %f\n", (m+1), (j+1), (i+1), no_intval_reward[j+(i*ind_configs)]/(float)simulation_analysis_trials ,no_intval_error[j+(i*ind_configs)]/(float)simulation_analysis_trials);
				}
			}

			//perform learning
			printf("Performing learn step %i of %i...\n", (m+1), no_intval);
			learn_step<UPDATE_LEARNING_WEIGHTS, dynamics, SIMULATION>(learn_trials, total_reward, total_error);

			//for extended analysis print weights after each step
			if((profiling==SIMULATION_EXTENDED_ANALYSIS)&&(printWeightPlot))
			{
				weightsToHost();
				printf("Saving weight data...\n");
				saveWeightData<dynamics>(h_W, (m+1));
			}
		}
	}
	
	//stop the timer
    cutilCheckError( cutStopTimer( timer));
	printf("Simulation complete\n");
    printf("Processing time: %f (seconds)\n\n", cutGetTimerValue( timer)/1000.0f);
    cutilCheckError( cutDeleteTimer( timer));

	//output data to files if not profiling
	if ((profiling == SIMULATION)||(profiling == SIMULATION_EXTENDED_ANALYSIS)){
		printf("Saving learn curve data...\n");
		saveLearnCurveData<dynamics>(total_analysis_rewards);
		saveErrorCurveData<dynamics>(total_analysis_errors);
	}

	//for simulation mode print weights once at end of simulation step
	if((profiling==SIMULATION)&&(printWeightPlot))
	{
		weightsToHost();
		printf("Saving weight data...\n");
		saveWeightData<dynamics>(h_W);
	}



	//free arrays for reward and error data
	free(total_reward);
	free(total_error);
	free(total_analysis_rewards);
	free(total_analysis_errors);

	//cleanup
	cleanupLearnStep();
	if ((profiling == SIMULATION_EXTENDED_ANALYSIS)||(profiling == GRAPHING)){
		cleanupGraphAnalysis();
	}
}


////////////////////////////////////////////////////////////////////////////////
// Program main

int
main( int argc, char** argv) 
{
	PROFILING profile;
	bool dyn_sys;
	bool no_dyn_sys;
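
	/* Example invocations (a hedged sketch; flags are parsed with the CUDA SDK
	   cutil helpers below, and "app" stands in for the real executable name):

	     app --device=0                              (run on a specific CUDA device)
	     app --profile                               (single-step timing run)
	     app --graph_analysis                        (gradient graph, no learning)
	     app --extended_analysis --print_weight_plot
	     app --dyn_sys                               (dynamic system only; default is both) */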

	//init the device using command-line specified CUDA device, otherwise use device with highest Gflops/s
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
		cutilDeviceInit(argc, argv);
	else
		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	//profile
	profile = SIMULATION;
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "profile") )
		profile = PROFILE_ONLY;
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "graph_analysis") )
		profile = GRAPHING;
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "extended_analysis") )
		profile = SIMULATION_EXTENDED_ANALYSIS;
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "print_weight_plot") )
		printWeightPlot = true;

	//check for invalid use of print_weight_plot
	if ((profile == GRAPHING)&&(printWeightPlot)){
		printf("Cannot use print_weight_plot argument with graph_analysis.\n");
		cudaThreadExit();
		cutilExit(argc, argv);
		exit(0);
	}

	
	//dyn sys (by default perform both dynamics and no dynamics sequentially)
	dyn_sys = true;
	no_dyn_sys = true;
	//dynamic system only
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "dyn_sys") )
		no_dyn_sys = false;
	else if( cutCheckCmdLineFlag(argc, (const char**)argv, "no_dyn_sys") )
		dyn_sys = false;

	//perform dynamic system simulation
	if (dyn_sys)
	{
		if (profile == PROFILE_ONLY)
			learn_curve<PROFILE_ONLY, DYN_SYS>();
		else if (profile == GRAPHING)
			learn_curve<GRAPHING, DYN_SYS>();
		else if (profile == SIMULATION_EXTENDED_ANALYSIS)
			learn_curve<SIMULATION_EXTENDED_ANALYSIS, DYN_SYS>();
		//SIMULATION
		else	
			learn_curve<SIMULATION, DYN_SYS>();
	}

	//perform non-dynamic system simulation
	if(no_dyn_sys)
	{
		if (profile == PROFILE_ONLY)
			learn_curve<PROFILE_ONLY, NO_DYN_SYS>();
		else if (profile == GRAPHING)
			learn_curve<GRAPHING, NO_DYN_SYS>();
		else if (profile == SIMULATION_EXTENDED_ANALYSIS)
			learn_curve<SIMULATION_EXTENDED_ANALYSIS, NO_DYN_SYS>();
		//SIMULATION
		else
			learn_curve<SIMULATION, NO_DYN_SYS>();
	}

	//if simulating
	if ((profile == SIMULATION)||(profile ==SIMULATION_EXTENDED_ANALYSIS))
	{
		printf("Creating learn curve graphs...\n");
		createLearnCurveGraph(dyn_sys, no_dyn_sys);
		createErrorCurveGraph(dyn_sys, no_dyn_sys);

		if (profile == SIMULATION_EXTENDED_ANALYSIS)
		{
			for (int m=0; m<no_intval; m++)
			{
				printf("Creating analysis graphs for step %i...\n", (m+1));
				createAnalysisGraphs(dyn_sys, no_dyn_sys, (m+1));

				//create a weight graph for each step
				if (printWeightPlot)
				{
					printf("Creating 3d weights plot for step %i...\n", (m+1));
					createWeightGraphs(dyn_sys, no_dyn_sys, (m+1));
				}

			}
		}
		else //SIMULATION
		{
			//create a single weight plot graph
			if (printWeightPlot)
			{
				printf("Creating 3d weights plot...\n");
				createWeightGraphs(dyn_sys, no_dyn_sys);
			}
		}
	}
	//if performing graph analysis
	else if (profile == GRAPHING)
	{
		printf("Creating analysis graphs...\n");
		createAnalysisGraphs(dyn_sys, no_dyn_sys);
	}


	

	cudaThreadExit();

	//this will pause the window if we are not profiling
	if(profile != PROFILE_ONLY)
		cutilExit(argc, argv);

	return 0;
}