/*
Copyright (c) 2011 Paul Richmond, University of Sheffield, UK;
all rights reserved unless otherwise stated.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
In addition to the regulations of the GNU General Public License,
publications and communications based in parts on this program or on
parts of this program are required to cite the article
"Democratic population decisions result in robust policy-gradient
learning: a parametric study with GPU simulations" by Paul Richmond,
Lars Buesing, Michele Giugliano and Eleni Vasilaki, PLoS ONE Neuroscience,
Under Review.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <windows.h>
#include <time.h>
#include <float.h>
#include <cutil_inline.h>
// includes, project
#include "reduction.cuh"
#include "model.h"
#include "model.cuh"
#include "output.cuh"
////////////////////////////////////////////////////////////////////////////////
//Macro definitions
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#define MAX_NEURONS MAX(M, N)
#define MAX_RAND MAX_NEURONS //MAX(MAX_NEURONS, SM_CHUNK_SIZE*SM_CHUNK_SIZE) //factor of T required for no dyn, N*M for weight update noise
bool printWeightPlot;
////////////////////////////////////////////////////////////////////////////////
// Thread/Grid blocks
dim3 m_threads;
dim3 m_grid;
dim3 n_threads;
dim3 n_grid;
dim3 nN_threads;
dim3 nN_grid;
dim3 mN_threads;
dim3 mN_grid;
dim3 NT_matrix_threads;
dim3 NT_matrix_grid;
int NT_grid_width;
dim3 NT_threads;
dim3 NT_grid;
dim3 MN_matrix_threads;
dim3 MN_matrix_grid;
int MN_grid_width;
dim3 MN_threads;
dim3 MN_grid;
//////////////////////////////////////////////////////
//Persistent global variables
float* d_Wr;
float* d_W;
float* d_W_out;
float* d_Wt;
//Step variables
float* h_In_Deg; //host data
float* d_In_Deg;
float* d_rew;
int* d_decision_offset;
int* d_target_offset;
//Trial variables
float* d_u;
float* d_u_out;
float* d_Y;
float* d_Yt;
float* d_Y_sum;
float* d_YProbt;
float* d_input_pot;
float* d_lateral_pot;
float* d_YProb;
float* d_X;
float* d_Input;
float* d_epsp;
float* d_Grad;
float* d_out_x;
float* d_out_x_sum;
float* d_out_y;
float* d_out_y_sum;
rand48seeds* d_randSeeds;
magic_numbers mn;
//data used for producing gradient graphs
float* h_W;
float* d_sumGrad;
float* d_sumDeltaW;
float* d_sumGrad_out;
float* d_sumDeltaW_out;
float* h_sumGrad;
float* h_sumDeltaW;
/** Reward function: maps the absolute angular error x (in [0, 2*PI)) to a reward
 * using either a Gaussian or a box profile for the given configuration. */
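/* Illustrative example (the value sigma_R[config] = 0.5f below is an assumption,
   not taken from the model parameters): reward(0.0f, config) == 1.0f and
   reward(0.5f, config) == expf(-0.5f) ~= 0.61f; for x >= PI the angle is
   mirrored (2*PI - x), so the reward depends only on the circular distance. */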
float reward(float x, int config)
{
float reward = 0;
//Gaussian reward
if (reward_func == GAUSSIAN)
{
if (x<PI)
reward = exp(-pow(x, 2.0f)/(2.0f*pow(sigma_R[config], 2.0f)));
else
reward = exp(-pow(2.0f*PI-x, 2.0f)/(2.0f*pow(sigma_R[config], 2.0f)));
}
//box reward
else if (reward_func == BOX)
{
if (x<PI)
reward = (float)(x<sigma_R_box[config])*(x>-sigma_R_box[config]);
else
reward = (float)((2.0f*PI-x)<sigma_R_box[config]) * ((2.0f*PI-x)>-sigma_R_box[config]);
}
return reward;
}
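/** errorFunc
 * Maps an absolute angular difference x in [0, 2*PI) onto the circular error in [0, PI].
 */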
float errorFunc(float x)
{
float err;
if (x<PI)
err = x;
else
err = (2.0f*PI-x);
return err;
}
/* Mod function using the sign of the divisor (as in Python) */
float py_modf(float n, float a) {
float r = fmodf(n, a);
float sign = (a > 0) ? 1.0f : -1.0f;
if (r * sign < 0) r += a;
return r;
}
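/* Example: py_modf(-1.0f, 2.0f*PI) ~= 5.283f, whereas fmodf(-1.0f, 2.0f*PI) == -1.0f;
   for a positive divisor a the result always lies in [0, a). */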
void copyParametersToDevice()
{
cutilSafeCall( cudaMemcpyToSymbol(beta, &h_beta, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(xsc, &h_xsc, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(u0, &h_u0, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(w0, &h_w0, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(eta, &h_eta, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(baseline, &h_baseline, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(tau, &h_tau, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(lambda, &h_lambda, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(urest, &h_urest, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(threshold, &h_threshold, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(du, &h_du, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(rho0, &h_rho0, sizeof(float)*ind_configs));
cutilSafeCall( cudaMemcpyToSymbol(ref, &h_ref, sizeof(float)*ind_configs));
}
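/* Note: the symbols written above (beta, xsc, u0, ...) are assumed to be
   __constant__ (or __device__) float arrays of length ind_configs declared in
   model.cuh; each cudaMemcpyToSymbol copies one value per independent configuration. */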
/** initLearnStepData
* Initialises all device data used during the learn_step function.
* Initialises weight matrices W (+W_out) and Wr on the device. Wr holds the static lateral connection weights; W is set to zero.
* A separate set of data is created for each independent trial, excluding Wr, which is shared across trials (one copy per configuration).
* A set of host data is allocated (h_In_Deg) which is used in the learn step to create random goals.
*/
void initLearnStepData()
{
//allocate W matrix on host to save weights later on
if (printWeightPlot)
{
h_W = (float*) malloc(N*M*sizeof(float)*ind_trials);
}
//allocate weight data
cudaMalloc( (void**) &d_Wr, N*N*sizeof(float)*ind_configs );
cudaMalloc( (void**) &d_W, M*N*sizeof(float)*ind_trials);
cudaMalloc( (void**) &d_W_out, M*N*sizeof(float) *ind_trials);
cudaMalloc( (void**) &d_Wt, M*N*sizeof(float) *ind_trials);
//init lateral connection weights and copy to device
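//each lateral weight is an excitatory Gaussian bump of height w_E and width sig_p
//over the wrap-around (ring) distance between action cells, minus a constant
//inhibition of 0.9, with the whole profile scaled by rsc for each configuration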
float* h_Wr = (float*) malloc(N*N*sizeof(float)*ind_configs);
for(int i=0; i<N; i++)
{
for (int j=0; j<N; j++)
{
float dist = fabsf((float)(j - i));
dist = dist*(dist<(N/2)) + (N-dist)*(dist>=(N/2));
//Wr varies for each set of configuration parameters
for(int k=0; k<ind_configs; k++)
{
h_Wr[j+(i*N) + (k*N*N)] = (expf(-powf(dist,2.0f)/(2.0f*powf(sig_p[k],2.0f)))*w_E[k] - 0.9f)*rsc[k]; //j+(i*N) possibly should be i+(j*N)
}
}
}
cutilSafeCall( cudaMemcpy(d_Wr, h_Wr, N*N*sizeof(float)*ind_configs, cudaMemcpyHostToDevice) );
free(h_Wr);
//Set action cell weights to zero
cutilSafeCall( cudaMemset(d_W, 0,M*N*sizeof(float)*ind_trials));
cutilSafeCall( cudaMemset(d_W_out, 0,M*N*sizeof(float)*ind_trials));
cutilSafeCall( cudaMemset(d_Wt, 0,M*N*sizeof(float)*ind_trials));
//allocate global learnstep variables
cudaMalloc( (void**) &d_In_Deg, Tr_MAX*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_rew, sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_decision_offset, sizeof(int)*ind_trials );
cudaMalloc( (void**) &d_target_offset, sizeof(int)*ind_trials );
//allocate trial specific data
cudaMalloc( (void**) &d_u, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_u_out, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_Y, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_Yt, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_Y_sum, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_YProb, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_YProbt, T*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_input_pot, T*N*sizeof(float)*ind_trials ); //factor of T only required for no dyn system
cudaMalloc( (void**) &d_lateral_pot, N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_X, T*M*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_Input, T*M*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_epsp, T*M*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_Grad, M*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_out_x, N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_out_x_sum, N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_out_y, N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_out_y_sum, N*sizeof(float)*ind_trials );
//init host data used to create a set of random goals
h_In_Deg = (float*) malloc(Tr_MAX*sizeof(float)*ind_trials);
//init random number stream data using the rand48 algorithm
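//MAX_RAND (= MAX(M, N), see macro above) seed states are allocated per independent
//trial, presumably so that every thread which draws a random number owns its own
//rand48 stream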
rand48seeds* h_randSeeds;
h_randSeeds = (rand48seeds*) malloc(MAX_RAND*sizeof(rand48seeds)*ind_trials);
cudaMalloc( (void**) &d_randSeeds, MAX_RAND*sizeof(rand48seeds)*ind_trials);
initCUDARand48(MAX_RAND*ind_trials, h_randSeeds, d_randSeeds, mn);
free(h_randSeeds);
}
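/** initGraphAnalysis
 * Allocates the host and device buffers used to accumulate the gradient and
 * weight change (deltaW) sums across analysis trials for the gradient graphs.
 */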
void initGraphAnalysis()
{
//malloc data on host
h_sumGrad = (float*)malloc(M*N*sizeof(float)*ind_trials);
h_sumDeltaW = (float*)malloc(M*N*sizeof(float)*ind_trials);
//malloc data on device
cudaMalloc( (void**) &d_sumGrad, M*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_sumDeltaW, M*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_sumGrad_out, M*N*sizeof(float)*ind_trials );
cudaMalloc( (void**) &d_sumDeltaW_out, M*N*sizeof(float)*ind_trials );
}
void resetGraphAnalysis()
{
//block set device data to 0
cutilSafeCall( cudaMemset(d_sumGrad, 0,M*N*sizeof(float)*ind_trials));
cutilSafeCall( cudaMemset(d_sumDeltaW, 0,M*N*sizeof(float)*ind_trials));
cutilSafeCall( cudaMemset(d_sumGrad_out, 0,M*N*sizeof(float)*ind_trials));
cutilSafeCall( cudaMemset(d_sumDeltaW_out, 0,M*N*sizeof(float)*ind_trials));
}
void cleanupGraphAnalysis()
{
//free host memory
free(h_sumGrad);
free(h_sumDeltaW);
//free device memory
cudaFree(d_sumGrad);
cudaFree(d_sumDeltaW);
cudaFree(d_sumGrad_out);
cudaFree(d_sumDeltaW_out);
}
/** cleanupLearnStep
* Frees all data on host and device which is used in the learn step.
*/
void cleanupLearnStep()
{
//dealloc learn step data
free(h_In_Deg);
cudaFree(d_In_Deg);
cudaFree(d_rew);
cudaFree(d_decision_offset);
cudaFree(d_target_offset);
//dealloc trial data
cudaFree (d_randSeeds);
cudaFree(d_u);
cudaFree(d_u_out);
cudaFree(d_Y);
cudaFree(d_Yt);
cudaFree(d_Y_sum);
cudaFree(d_YProb);
cudaFree(d_YProbt);
cudaFree(d_input_pot);
cudaFree(d_lateral_pot);
cudaFree(d_X);
cudaFree(d_Input);
cudaFree(d_epsp);
cudaFree(d_Grad);
cudaFree(d_out_x);
cudaFree(d_out_x_sum);
cudaFree(d_out_y);
cudaFree(d_out_y_sum);
//dealloc weight data
if (printWeightPlot)
free(h_W);
cudaFree( d_W);
cudaFree( d_W_out);
cudaFree( d_Wr);
cudaFree( d_Wt);
}
/** calculateGridBlockSizes
* All grid block sizes are pre-calculated before the learn step
*/
void calculateGridBlockSizes()
{
//M total threads per ind trial with a block size of M_BLOCK_SIZE
m_threads = dim3(M_BLOCK_SIZE, 1, 1);
m_grid = dim3(M/M_BLOCK_SIZE, ind_trials, 1);
//N total threads per ind trial with a block size of N_BLOCK_SIZE
n_threads = dim3(N_BLOCK_SIZE, 1, 1);
n_grid = dim3(N/N_BLOCK_SIZE, ind_trials, 1);
//N blocks of N_BLOCK_SIZE threads per ind trial
nN_threads = dim3(N_BLOCK_SIZE, 1 , 1);
nN_grid = dim3(N, ind_trials, 1);
//N blocks of M_BLOCK_SIZE threads per ind trial
mN_threads = dim3(M_BLOCK_SIZE,1 , 1);
mN_grid = dim3(N, ind_trials, 1);
//NxT total threads with SM_CHUNK_SIZE^2 threads per block
NT_matrix_threads = dim3(SM_CHUNK_SIZE, SM_CHUNK_SIZE, 1);
NT_matrix_grid = dim3(N*T/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
NT_grid_width = N/SM_CHUNK_SIZE;
//N*T total threads
NT_threads = dim3(SM_CHUNK_SIZE *SM_CHUNK_SIZE, 1, 1);
NT_grid = dim3(N*T/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
//NxM total threads with SM_CHUNK_SIZE^2 threads per 2D block
//both x and y of the grid are held within the x dimension of the grid (requires mod and divide in the kernel)
MN_matrix_threads = dim3(SM_CHUNK_SIZE, SM_CHUNK_SIZE, 1);
MN_matrix_grid = dim3(M*N/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
MN_grid_width = M/SM_CHUNK_SIZE;
//NxM total threads with SM_CHUNK_SIZE^2 threads per 1D block
MN_threads = dim3(SM_CHUNK_SIZE*SM_CHUNK_SIZE, 1, 1);
MN_grid = dim3(M*N/(SM_CHUNK_SIZE*SM_CHUNK_SIZE), ind_trials, 1);
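//Note: the grid sizes above use integer division, so M and N are assumed to be
//exact multiples of M_BLOCK_SIZE/N_BLOCK_SIZE, and N*T and M*N exact multiples of
//SM_CHUNK_SIZE*SM_CHUNK_SIZE; there is no remainder handling in these launch configurations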
}
/** learn_step
 * Runs Tr trials of the network. For each trial the place cell input is
 * simulated, the action cell spikes are integrated over T timesteps, the
 * output angle is decoded as a population vector on the host and the reward
 * is calculated. Depending on the template parameters, the learning weights
 * are then updated and/or gradient statistics are accumulated for graphing.
 * Accumulated reward and error per independent trial are returned via
 * total_reward and total_error.
 */
template <LEARNING learning, DYNAMICS dynamics, PROFILING profiling>
void learn_step(int Tr, float* total_reward, float* total_error)
{
//global learn step variables
for (int i=0; i<ind_trials; i++)
{
total_reward[i] = 0;
total_error[i] = 0;
}
srand ( (unsigned int)time(NULL) );
for (int i=0;i<Tr*ind_trials;i++)
{
if (moving_target)
h_In_Deg[i] = ((float)rand()/RAND_MAX)*2.0f*PI;
else
h_In_Deg[i] = static_target;
}
cutilSafeCall( cudaMemcpy(d_In_Deg, h_In_Deg, Tr*sizeof(float)*ind_trials, cudaMemcpyHostToDevice) );
//allocate data for theta
float* theta = (float*)malloc(ind_trials*sizeof(float));
float* rew = (float*)malloc(ind_trials*sizeof(float));
int* decision_offset = (int*)malloc(ind_trials*sizeof(int));
int* target_offset = (int*)malloc(ind_trials*sizeof(int));
for (int n=0; n<Tr; n++)
{
//set initial trial specific data
cutilSafeCall( cudaMemset(d_Y, 0, T*N*sizeof(float)*ind_trials) );
resetMembranePotential<<<NT_grid, NT_threads>>>(d_u, d_u_out);
//calculate place cell distance and activations and output
poissonNeuronSimulation<<<m_grid, m_threads>>>(n, Tr, d_X, d_Input, d_epsp, d_In_Deg, d_randSeeds, mn);
cutilCheckMsg("Error in kernel\n");
//for T timesteps (must be performed serially as the lateral activations use data from t-1)
for (int t=1; t<T; t++)
{
//calculate the place cell activations
placeCellSpikePropagation<<<mN_grid, mN_threads>>>(d_W, d_Input+(t*M), d_input_pot);
cutilCheckMsg("Error in kernel\n");
//check if we are using dynamic simulation for lateral connections (if so calculate them)
if (dynamics == DYN_SYS){
//calculate the action cell lateral interactions
actionCellLateralSpikePropagation<<<nN_grid, nN_threads>>>(d_Wr, d_Y+((t-1)*N), d_lateral_pot);
cutilCheckMsg("Error in kernel\n");
}
//calculate the action cell spikes
integrateAndFireNeuronSimulation<dynamics><<<n_grid, n_threads>>>(t, d_u, d_u_out, d_input_pot, d_lateral_pot, d_YProb, d_Y, d_randSeeds, mn);
cutilCheckMsg("Error in kernel\n");
//swap output for input
float* d_u_temp;
d_u_temp = d_u;
d_u = d_u_out;
d_u_out = d_u_temp;
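//(double buffering: the pointer swap avoids a device-to-device copy; d_u now
//points at the most recently written membrane potentials)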
}
//transpose Y
transpose<N, T><<<NT_matrix_grid, NT_matrix_threads>>>(d_Y, d_Yt, NT_grid_width);
cutilCheckMsg("Error in kernel\n");
//sum Y over time for each of the N action cells; N*ind_trials parallel reductions in total
reduceMultipleArrays<float>(T, d_Yt, d_Y_sum, N*ind_trials);
cutilCheckMsg("Error in kernel\n");
//calculate output angle components
calculatePopulationVector<<<n_grid, n_threads>>>(d_Y_sum, d_out_x, d_out_y);
cutilCheckMsg("Error in kernel\n");
//sum output components
reduceMultipleArrays<float>(N, d_out_x, d_out_x_sum, ind_trials);
cutilCheckMsg("Error in kernel\n");
reduceMultipleArrays<float>(N, d_out_y, d_out_y_sum, ind_trials);
cutilCheckMsg("Error in kernel\n");
//calculate angle by reading back sum totals to the CPU for each ind trial
for (int i=0; i<ind_trials; i++)
{
int configuration_offset = pow2mod(i, ind_configs);
float h_out_x_sum = 0;
float h_out_y_sum = 0;
int d_out_sum_offset = N*i;
int d_In_Deg_offset = Tr*i;
cutilSafeCall( cudaMemcpy( &h_out_x_sum, d_out_x_sum+d_out_sum_offset, sizeof(float), cudaMemcpyDeviceToHost) );
cutilSafeCall( cudaMemcpy( &h_out_y_sum, d_out_y_sum+d_out_sum_offset, sizeof(float), cudaMemcpyDeviceToHost) );
//calculate theta
theta[i] = py_modf(atan2(h_out_x_sum, h_out_y_sum), 2.0f*PI);
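//theta is the decoded population vector direction: the atan2 of the summed x
//and y components of the action cell output, wrapped into [0, 2*PI) by py_modf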
//calculate reward
rew[i] = reward(fabsf(theta[i]-h_In_Deg[n+d_In_Deg_offset]), configuration_offset);
//update total reward
total_reward[i] += rew[i];
total_error[i] += (fabs(h_In_Deg[n+d_In_Deg_offset] - theta[i]));
//calculate the decision and target offsets
if (profiling == GRAPHING){
decision_offset[i] = (int)floorf(((theta[i]+PI)*N)/(2.0f*PI));
target_offset[i] = (int)floorf(((h_In_Deg[n+d_In_Deg_offset]+PI)*N)/(2.0f*PI));
}
}
//copy rewards to device
CUDA_SAFE_CALL( cudaMemcpy( d_rew, rew, ind_trials*sizeof(float), cudaMemcpyHostToDevice));
//copy decision and target offsets to the device
if (profiling == GRAPHING){
CUDA_SAFE_CALL( cudaMemcpy( d_decision_offset, decision_offset, ind_trials*sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL( cudaMemcpy( d_target_offset, target_offset, ind_trials*sizeof(int), cudaMemcpyHostToDevice));
}
/* only perform the following if learning or calculating the gradient graph */
if ((learning == UPDATE_LEARNING_WEIGHTS)||(profiling == GRAPHING))
{
//transpose rho
transpose<N,T><<<NT_matrix_grid, NT_matrix_threads>>>(d_YProb, d_YProbt, NT_grid_width);
cutilCheckMsg("Error in kernel\n");
//calculate gradient
calculateGradiant<<<MN_matrix_grid, MN_matrix_threads>>>(MN_grid_width, d_Yt, d_YProbt, d_epsp, d_Grad);
cutilCheckMsg("Error in kernel\n");
if (profiling == GRAPHING)
{
//update the gradient and deltaW sum totals
updateGradiantAnalysis<<<MN_grid, MN_threads>>>(d_decision_offset, d_target_offset, d_Grad, d_rew, d_sumGrad, d_sumGrad_out, d_sumDeltaW, d_sumDeltaW_out);
cutilCheckMsg("Error in kernel\n");
//swap sumGrad input and output
float* d_sumGrad_temp;
d_sumGrad_temp = d_sumGrad;
d_sumGrad = d_sumGrad_out;
d_sumGrad_out = d_sumGrad_temp;
//swap deltaW input and output
float* d_sumDeltaW_temp;
d_sumDeltaW_temp = d_sumDeltaW;
d_sumDeltaW = d_sumDeltaW_out;
d_sumDeltaW_out = d_sumDeltaW_temp;
}
//no learning if we are only calculating the gradient graph
else if (learning == UPDATE_LEARNING_WEIGHTS)
{
//update learning weights
updateLearningWeights<<<MN_grid, MN_threads>>>(d_W, d_rew, d_Grad, d_W_out);
cutilCheckMsg("Error in kernel\n");
if (APPLY_NOISE)
{
//apply noise (this will also swap input and output)
applyNoise<<<m_grid, m_threads>>>(d_W_out, d_W, d_randSeeds, mn);
cutilCheckMsg("Error in kernel\n");
}
else{
//swap the input and output pointer
float* d_W_temp; //used to swap input and output
d_W_temp = d_W;
d_W = d_W_out;
d_W_out = d_W_temp;
}
}
}
}
//cleanup host-side step data
free(theta);
free(rew);
free(decision_offset);
free(target_offset);
}
void graphAnalysisDataToHost()
{
printf("Copying graph analysis data from device to host\n");
cutilSafeCall( cudaMemcpy(h_sumGrad, d_sumGrad, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
cutilSafeCall( cudaMemcpy(h_sumDeltaW, d_sumDeltaW, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
}
void weightsToHost(){
printf("Copying weight data from device to host\n");
cutilSafeCall( cudaMemcpy(h_W, d_W, N*M*sizeof(float)*ind_trials, cudaMemcpyDeviceToHost) );
}
/** learn_curve
* Runs the simulation
*/
template<PROFILING profiling, DYNAMICS dynamics>
void learn_curve()
{
//init model
copyParametersToDevice();
initLearnStepData();
calculateGridBlockSizes();
if ((profiling == SIMULATION_EXTENDED_ANALYSIS)||(profiling == GRAPHING)){
initGraphAnalysis();
printf("Analysis mode....\n");
}
//start a timer
unsigned int timer = 0;
cutilCheckError( cutCreateTimer( &timer));
cutilCheckError( cutStartTimer( timer));
//allocate arrays for reward data
float* total_analysis_rewards = (float*) malloc(ind_trials*no_intval*sizeof(float));
float* total_analysis_errors = (float*) malloc(ind_trials*no_intval*sizeof(float));
float* total_reward = (float*) malloc(ind_trials*sizeof(float));
float* total_error = (float*) malloc(ind_trials*sizeof(float));
//calculate the number of learn trials (scale no trials for no dyn)
int learn_trials = 0;
if (dynamics == DYN_SYS)
learn_trials = learn_trials_dyn;
else //NO_DYN_SYS
learn_trials = learn_trials_no_dyn;
int simulation_analysis_trials;
if (profiling == SIMULATION)
simulation_analysis_trials = analysis_trials;
else //SIMULATION_EXTENDED_ANALYSIS
simulation_analysis_trials = gradiant_analsyis_trials;
//PROFILE
if(profiling == PROFILE_ONLY)
{
if (dynamics==DYN_SYS)
printf("Running dyn sys profiling:\n 1 step\n single trial\n %i ind configs\n %i ind trials per config\n", ind_configs, trials_per_config);
if (dynamics==NO_DYN_SYS)
printf("Running no dyn sys profiling:\n 1 step\n single trial\n %i ind configs\n %i ind trials per config\n", ind_configs, trials_per_config);
learn_step<UPDATE_LEARNING_WEIGHTS, dynamics, PROFILE_ONLY>(1, total_reward, total_error);
}
//CREATE GRADIENT GRAPH: no learning
else if(profiling == GRAPHING)
{
if (dynamics==DYN_SYS)
printf("Running dyn sys graph analysis:\n 1 step\n %i trials\n %i ind configs\n %i ind trials per config\n", gradiant_analsyis_trials, ind_configs, trials_per_config);
if (dynamics==NO_DYN_SYS)
printf("Running no dyn sys graph analysis:\n 1 step\n %i trials\n %i ind configs\n %i ind trials per config\n", gradiant_analsyis_trials, ind_configs, trials_per_config);
//reset values
resetGraphAnalysis();
//perform simulation
learn_step<ANAYLSYS_ONLY, dynamics, GRAPHING>(gradiant_analsyis_trials, total_reward, total_error);
//copy graph analysis data to host
graphAnalysisDataToHost();
//output graphs
printf("Saving graph analysis data...\n");
saveGraphAnalysisData<dynamics>(h_sumGrad, h_sumDeltaW);
}
//SIMULATION
else
{
if (dynamics==DYN_SYS)
printf("Running dyn sys simulation:\n %i steps\n %i analysis trials\n %i learning trials\n %i ind configs\n %i ind trials per config\n", no_intval, simulation_analysis_trials, learn_trials, ind_configs, trials_per_config);
if (dynamics==NO_DYN_SYS)
printf("Running no dyn sys simulation:\n %i steps\n %i analysis trials\n %i learning trials\n %i ind configs\n %i ind trials per config\n", no_intval, simulation_analysis_trials, learn_trials, ind_configs, trials_per_config);
for (int m=0; m<no_intval; m++)
{
printf("Starting Step %i of %i\n", (m+1), no_intval);
float* no_intval_reward = &total_analysis_rewards[m*ind_trials];
float* no_intval_error = &total_analysis_errors[m*ind_trials];
//Perform analysis
if (profiling == SIMULATION)
{
printf("Stage 1: Performing analysis...\n");
//perform simulation analysis
learn_step<ANAYLSYS_ONLY, dynamics, SIMULATION>(analysis_trials, no_intval_reward, no_intval_error);
}
else //SIMULATION_EXTENDED_ANALYSIS
{
printf("Performing extended analysis...\n");
//reset values
resetGraphAnalysis();
//perform extended simulation analysis
learn_step<ANAYLSYS_ONLY, dynamics, GRAPHING>(gradiant_analsyis_trials, no_intval_reward, no_intval_error);
//copy graph analysis data to host and produce the graph data for the current step
graphAnalysisDataToHost();
printf("Saving graph analysis data for step %i...\n", (m+1));
saveGraphAnalysisData<dynamics>(h_sumGrad, h_sumDeltaW, (m+1));
}
//Print analysis
for(int j=0; j<ind_configs; j++)
{
for(int i=0; i<trials_per_config; i++)
{
printf("Step %i, Config %i, Ind trial no: %i: Av Reward is %f, Av Error %f\n", (m+1), (j+1), (i+1), no_intval_reward[j+(i*ind_configs)]/(float)simulation_analysis_trials ,no_intval_error[j+(i*ind_configs)]/(float)simulation_analysis_trials);
}
}
//perform learning
printf("Performing learn step...\n", (m+1), no_intval);
learn_step<UPDATE_LEARNING_WEIGHTS, dynamics, SIMULATION>(learn_trials, total_reward, total_error);
//for extended analysis print weights after each step
if((profiling==SIMULATION_EXTENDED_ANALYSIS)&&(printWeightPlot))
{
weightsToHost();
printf("Saving weight data...\n");
saveWeightData<dynamics>(h_W, (m+1));
}
}
}
//stop the timer
cutilCheckError( cutStopTimer( timer));
printf("Simulation complete\n");
printf("Processing time: %f (seconds)\n\n", cutGetTimerValue( timer)/1000.0f);
cutilCheckError( cutDeleteTimer( timer));
//output data to files if not profiling
if ((profiling == SIMULATION)||(profiling == SIMULATION_EXTENDED_ANALYSIS)){
printf("Saving learn curve data...\n");
saveLearnCurveData<dynamics>(total_analysis_rewards);
saveErrorCurveData<dynamics>(total_analysis_errors);
}
//for simulation mode print weights once at end of simulation step
if((profiling==SIMULATION)&&(printWeightPlot))
{
weightsToHost();
printf("Saving weight data...\n");
saveWeightData<dynamics>(h_W);
}
//free arrays for reward and error data
free(total_reward);
free(total_error);
free(total_analysis_rewards);
free(total_analysis_errors);
//cleanup
cleanupLearnStep();
if ((profiling == SIMULATION_EXTENDED_ANALYSIS)||(profiling == GRAPHING)){
cleanupGraphAnalysis();
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
int
main( int argc, char** argv)
{
PROFILING profile;
bool dyn_sys;
bool no_dyn_sys;
//init the device using command-line specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
//profile
profile = SIMULATION;
if( cutCheckCmdLineFlag(argc, (const char**)argv, "profile") )
profile = PROFILE_ONLY;
if( cutCheckCmdLineFlag(argc, (const char**)argv, "graph_analysis") )
profile = GRAPHING;
if( cutCheckCmdLineFlag(argc, (const char**)argv, "extended_analysis") )
profile = SIMULATION_EXTENDED_ANALYSIS;
if( cutCheckCmdLineFlag(argc, (const char**)argv, "print_weight_plot") )
printWeightPlot = true;
//check for invalid use of print weight plot
if ((profile == GRAPHING)&&(printWeightPlot)){
printf("Cannot use print_weight_plot argument with graph_analysis.\n");
cudaThreadExit();
cutilExit(argc, argv);
exit(0);
}
//dyn sys (default: perform both dynamics and no dynamics sequentially)
dyn_sys = true;
no_dyn_sys = true;
//dynamic system only
if( cutCheckCmdLineFlag(argc, (const char**)argv, "dyn_sys") )
no_dyn_sys = false;
else if( cutCheckCmdLineFlag(argc, (const char**)argv, "no_dyn_sys") )
dyn_sys = false;
//perform dynamic system simulation
if (dyn_sys)
{
if (profile == PROFILE_ONLY)
learn_curve<PROFILE_ONLY, DYN_SYS>();
else if (profile == GRAPHING)
learn_curve<GRAPHING, DYN_SYS>();
else if (profile == SIMULATION_EXTENDED_ANALYSIS)
learn_curve<SIMULATION_EXTENDED_ANALYSIS, DYN_SYS>();
//SIMULATION
else
learn_curve<SIMULATION, DYN_SYS>();
}
//perform non-dynamic system simulation
if(no_dyn_sys)
{
if (profile == PROFILE_ONLY)
learn_curve<PROFILE_ONLY, NO_DYN_SYS>();
else if (profile == GRAPHING)
learn_curve<GRAPHING, NO_DYN_SYS>();
else if (profile == SIMULATION_EXTENDED_ANALYSIS)
learn_curve<SIMULATION_EXTENDED_ANALYSIS, NO_DYN_SYS>();
//SIMULATION
else
learn_curve<SIMULATION, NO_DYN_SYS>();
}
//if simulating
if ((profile == SIMULATION)||(profile ==SIMULATION_EXTENDED_ANALYSIS))
{
printf("Creating learn curve graphs...\n");
createLearnCurveGraph(dyn_sys, no_dyn_sys);
createErrorCurveGraph(dyn_sys, no_dyn_sys);
if (profile == SIMULATION_EXTENDED_ANALYSIS)
{
for (int m=0; m<no_intval; m++)
{
printf("Creating analysis graphs for step %i...\n", (m+1));
createAnalysisGraphs(dyn_sys, no_dyn_sys, (m+1));
//create a weight graph for each step
if (printWeightPlot)
{
printf("Creating 3d weights plot for step %i...\n", (m+1));
createWeightGraphs(dyn_sys, no_dyn_sys, (m+1));
}
}
}
else //SIMULATION
{
//create a single weight plot graph
if (printWeightPlot)
{
printf("Creating 3d weights plot...\n");
createWeightGraphs(dyn_sys, no_dyn_sys);
}
}
}
//if performing graph analysis
else if (profile == GRAPHING)
{
printf("Creating analysis graphs...\n");
createAnalysisGraphs(dyn_sys, no_dyn_sys);
}
cudaThreadExit();
//this will pause the window if we are not profiling
if(profile != PROFILE_ONLY)
cutilExit(argc, argv);
}