/* Copyright (c) 2011 Paul Richmond, University of Sheffield, UK; all rights reserved unless otherwise stated.

   This program is free software; you can redistribute it and/or modify it under the terms of the
   GNU General Public License as published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   In addition to the regulations of the GNU General Public License, publications and communications
   based in parts on this program or on parts of this program are required to cite the article
   "Democratic population decisions result in robust policy-gradient learning: a parametric study
   with GPU simulations" by Paul Richmond, Lars Buesing, Michele Giugliano and Eleni Vasilaki,
   PLoS ONE Neuroscience, Under Review.

   This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
   even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License along with this program; if not,
   write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

/* "This software contains source code provided by NVIDIA Corporation."

   This CUDA header file defines the prototype functions used for parallel reduction and is based on
   the parallel reduction example provided with the CUDA Computing SDK. It has been modified to allow
   a large number of simultaneous (and independent) parallel reductions. The reduction kernel is used
   for both the spikeTrainReduction and outputComponentReduction steps of the simulation.
*/

#ifndef __REDUCTION_H__
#define __REDUCTION_H__

#include <cutil_inline.h>
#include <cutil_math.h>

// maximum sizes per individual reduction (not in total)
#define MAX_REDUCTION_THREADS 256
#define MAX_REDUCTION_BLOCKS 64

#define MIN(x,y) ((x < y) ? x : y)

#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

// host function prototypes
template <class T>
T reduceArray(int n, T* d_idata, T* d_odata);

template <class T>
void reduceMultipleArrays(int n, T* d_idata, T* d_odata, int multiple);

bool isPow2(unsigned int x);
unsigned int nextPow2(unsigned int x);
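// Launch-configuration sketch in the style of the CUDA SDK reduction sample.
// This helper is illustrative only and is not declared in this header: the name
// getNumBlocksAndThreads and the exact clamping policy are assumptions. It shows
// how the MAX_REDUCTION_* limits above are typically applied when sizing a launch.
static inline void getNumBlocksAndThreads(int n, int &blocks, int &threads)
{
    // each thread sums at least two elements, so n/2 threads suffice; small
    // sizes are rounded up to the next power of two for the tree reduction
    threads = (n < MAX_REDUCTION_THREADS * 2) ? nextPow2((n + 1) / 2) : MAX_REDUCTION_THREADS;
    blocks  = (n + (threads * 2 - 1)) / (threads * 2);
    blocks  = MIN(MAX_REDUCTION_BLOCKS, blocks);
}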
// Utility struct to obtain a correctly typed pointer to dynamically
// allocated shared memory from within a templated kernel
template<class T>
struct SharedMemory
{
    __device__ inline operator T*()
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }

    __device__ inline operator const T*() const
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
};

// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double>
{
    __device__ inline operator double*()
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }

    __device__ inline operator const double*() const
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }
};

template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n)
{
    // now that we are using warp-synchronous programming (below)
    // we need to declare our shared memory volatile so that the compiler
    // doesn't reorder stores to it and induce incorrect behavior.
    volatile T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;

    T mySum = 0;

    // we reduce multiple elements per thread. The number is determined by the
    // number of active thread blocks (via gridDim). More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n)
    {
        mySum += g_idata[i];
        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            mySum += g_idata[i+blockSize];
        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
    if (tid < 32)
#endif
    {
        if (blockSize >=  64) { sdata[tid] = mySum = mySum + sdata[tid + 32]; EMUSYNC; }
        if (blockSize >=  32) { sdata[tid] = mySum = mySum + sdata[tid + 16]; EMUSYNC; }
        if (blockSize >=  16) { sdata[tid] = mySum = mySum + sdata[tid +  8]; EMUSYNC; }
        if (blockSize >=   8) { sdata[tid] = mySum = mySum + sdata[tid +  4]; EMUSYNC; }
        if (blockSize >=   4) { sdata[tid] = mySum = mySum + sdata[tid +  2]; EMUSYNC; }
        if (blockSize >=   2) { sdata[tid] = mySum = mySum + sdata[tid +  1]; EMUSYNC; }
    }

    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
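// Host-side dispatch sketch (illustrative; reduceArray is implemented elsewhere,
// so the wrapper name launchReduce6 and the truncated switch are assumptions).
// blockSize must be a compile-time constant, so the runtime thread count is
// mapped onto template instantiations, and isPow2(n) selects the nIsPow2 variant
// so the bounds check in the while loop above can be compiled away.
template <class T>
static void launchReduce6(int n, int threads, int blocks, T *d_idata, T *d_odata)
{
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    // with a single warp per block, allocate two warps' worth of shared memory
    // so that the tid + 32 accesses in the kernel stay in bounds
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);

    if (isPow2(n))
    {
        switch (threads)
        {
            case 512: reduce6<T, 512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            case 256: reduce6<T, 256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            case 128: reduce6<T, 128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            // cases 64 down to 1 follow the same pattern
        }
    }
    else
    {
        switch (threads)
        {
            case 512: reduce6<T, 512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            case 256: reduce6<T, 256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            case 128: reduce6<T, 128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
            // cases 64 down to 1 follow the same pattern
        }
    }
}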
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6_multiple(T *g_idata, T *g_odata, unsigned int n, unsigned int total_size)
{
    // now that we are using warp-synchronous programming (below)
    // we need to declare our shared memory volatile so that the compiler
    // doesn't reorder stores to it and induce incorrect behavior.
    volatile T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;

    T mySum = 0;

    // we reduce multiple elements per thread. The number is determined by the
    // number of active thread blocks (via gridDim). More blocks will result
    // in a larger gridSize and therefore fewer elements per thread.
    // blockIdx.y selects which of the independent arrays this block reduces:
    // the y-th array starts total_size*y elements into g_idata.
    while (i < n)
    {
        mySum += g_idata[i+(total_size*blockIdx.y)];
        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            mySum += g_idata[i+blockSize+(total_size*blockIdx.y)];
        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
    if (tid < 32)
#endif
    {
        if (blockSize >=  64) { sdata[tid] = mySum = mySum + sdata[tid + 32]; EMUSYNC; }
        if (blockSize >=  32) { sdata[tid] = mySum = mySum + sdata[tid + 16]; EMUSYNC; }
        if (blockSize >=  16) { sdata[tid] = mySum = mySum + sdata[tid +  8]; EMUSYNC; }
        if (blockSize >=   8) { sdata[tid] = mySum = mySum + sdata[tid +  4]; EMUSYNC; }
        if (blockSize >=   4) { sdata[tid] = mySum = mySum + sdata[tid +  2]; EMUSYNC; }
        if (blockSize >=   2) { sdata[tid] = mySum = mySum + sdata[tid +  1]; EMUSYNC; }
    }

    // write result for this block to global mem (partial sums for the y-th
    // array are stored at the same total_size stride as its input)
    if (tid == 0)
        g_odata[blockIdx.x+(total_size*blockIdx.y)] = sdata[0];
}

#endif // __REDUCTION_H__
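/* Usage sketch for the multiple-array variant (illustrative only; d_in, d_partial
   and the launch values are assumptions, not part of this header). reduce6_multiple
   reduces 'multiple' independent arrays in one launch: blockIdx.y selects the array,
   and both the input and the partial-sum output for array y start at element
   total_size*y.

       int threads, blocks;
       getNumBlocksAndThreads(n, blocks, threads);
       dim3 dimGrid(blocks, multiple, 1);   // one grid row per independent array
       dim3 dimBlock(threads, 1, 1);
       int  smemSize = (threads <= 32) ? 2*threads*sizeof(float) : threads*sizeof(float);
       // assuming here that n is a power of two and threads == 256
       reduce6_multiple<float, 256, true><<<dimGrid, dimBlock, smemSize>>>(d_in, d_partial, n, total_size);

   Each array is then left with 'blocks' partial sums at stride total_size, which a
   second pass (e.g. via reduceMultipleArrays) sums down to one value per array. */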