/*
Copyright (c) 2011 Paul Richmond, University of Sheffield, UK;
all rights reserved unless otherwise stated.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
In addition to the regulations of the GNU General Public License,
publications and communications based in parts on this program or on
parts of this program are required to cite the article
"Democratic population decisions result in robust policy-gradient
learning: a parametric study with GPU simulations" by Paul Richmond,
Lars Buesing, Michele Giugliano and Eleni Vasilaki, PLoS ONE Neuroscience,
Under Review.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
/*
"This software contains source code provided by NVIDIA Corporation."
This CUDA source file defines the host functions used for parallel
reduction. It is based on the parallel reduction example provided with
the NVIDIA CUDA Computing SDK and has been modified to allow a large
number of simultaneous (and independent) parallel reductions, i.e.
multiple reductions in a single batched kernel launch.
The reduction kernel is used for both the spikeTrainReduction and
outputComponentReduction steps of the simulation.
*/
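/*
Usage sketch (illustrative only; d_in and d_out are hypothetical device
buffers that the caller must allocate, and d_out must be large enough to
hold the per-block partial sums, i.e. at least MAX_REDUCTION_BLOCKS
elements):

    float *d_in, *d_out;
    cudaMalloc((void**)&d_in,  n * sizeof(float));
    cudaMalloc((void**)&d_out, MAX_REDUCTION_BLOCKS * sizeof(float));
    // ... copy or generate the input data in d_in ...
    float total = reduceArray<float>(n, d_in, d_out);
*/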
#ifndef _REDUCTION_H_
#define _REDUCTION_H_
#include "reduction.cuh"
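//returns true if x is a power of two; note that x == 0 also satisfies
//this test, so callers are expected to pass a non-zero size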
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
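//rounds x up to the next power of two (x is returned unchanged if it is
//already a power of two) by smearing the highest set bit of x-1 into all
//lower bit positions and then adding one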
unsigned int nextPow2( unsigned int x ) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
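//computes the launch configuration for a single reduction pass: each
//thread sums two elements on its first load, so a block of 'threads'
//threads covers 2*threads input elements; the block count is capped at
//MAX_REDUCTION_BLOCKS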
void getReductionBlocksAndThreads(int n, int &blocks, int &threads)
{
threads = (n < MAX_REDUCTION_THREADS*2) ? nextPow2((n + 1)/ 2) : MAX_REDUCTION_THREADS;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
blocks = MIN(MAX_REDUCTION_BLOCKS, blocks);
}
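//host-side dispatcher for the reduction kernels: the thread count and
//the power-of-two property of 'size' are resolved to compile-time
//template parameters so that the kernels can fully unroll their inner
//loops, which is why the launch is expressed as a switch over all
//supported block sizes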
template <class T>
void
reduce(int size, int threads, int blocks, T *d_idata, T *d_odata, int multiple = 1, int total_size = 0)
{
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, multiple, 1);
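//when there is only one warp per block (threads <= 32) allocate two
//warps' worth of shared memory so that the warp-synchronous phase does
//not index shared memory out of bounds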
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
//if multiple is greater than one then use the multiple (batched) reductions kernel
if (multiple > 1)
{
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6_multiple<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 256:
reduce6_multiple<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 128:
reduce6_multiple<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 64:
reduce6_multiple<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 32:
reduce6_multiple<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 16:
reduce6_multiple<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 8:
reduce6_multiple<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 4:
reduce6_multiple<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 2:
reduce6_multiple<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 1:
reduce6_multiple<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
}
}
else
{
switch (threads)
{
case 512:
reduce6_multiple<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 256:
reduce6_multiple<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 128:
reduce6_multiple<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 64:
reduce6_multiple<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 32:
reduce6_multiple<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 16:
reduce6_multiple<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 8:
reduce6_multiple<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 4:
reduce6_multiple<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 2:
reduce6_multiple<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
case 1:
reduce6_multiple<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size, total_size); break;
}
}
}
//otherwise only a single reduction is required
else
{
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce6<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce6<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce6<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce6<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce6<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce6<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce6<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce6<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce6<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
}
else
{
switch (threads)
{
case 512:
reduce6<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce6<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce6<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce6<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce6<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce6<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce6<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce6<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce6<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce6<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
}
}
}
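//reduces the n-element device array d_idata to a single value and
//returns it on the host; d_odata is device scratch space holding the
//per-block partial sums between passes, so it must be at least as large
//as the first pass's block count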
template <class T>
T reduceArray(int n, T* d_idata, T* d_odata)
{
T gpu_result;
int nThreads = 0;
int nBlocks = 0;
getReductionBlocksAndThreads(n, nBlocks, nThreads);
//single reduction
reduce<T>(n, nThreads, nBlocks, d_idata, d_odata);
// check if kernel execution generated an error
cutilCheckMsg("Kernel execution failed");
// sum partial block sums on GPU
int s=nBlocks;
while(s > 1)
{
int threads = 0, blocks = 0;
getReductionBlocksAndThreads(s, blocks, threads);
//single reduction
reduce<T>(s, threads, blocks, d_odata, d_odata);
s = (s + (threads*2-1)) / (threads*2);
}
cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
return gpu_result;
}
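//performs 'multiple' independent reductions, each over n elements of
//d_idata, using one batched launch per pass; unlike reduceArray the
//final results are left in device memory (d_odata) and no copy back to
//the host is performed, so the caller must read them from the device if
//required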
template <class T>
void reduceMultipleArrays(int n, T* d_idata, T* d_odata, int multiple)
{
int nThreads = 0;
int nBlocks = 0;
getReductionBlocksAndThreads(n, nBlocks, nThreads);
//multiple parallel reductions
reduce<T>(n, nThreads, nBlocks, d_idata, d_odata, multiple, n);
// check if kernel execution generated an error
cutilCheckMsg("Kernel execution failed");
// sum partial block sums on GPU
int s=nBlocks;
while(s > 1)
{
int threads = 0, blocks = 0;
getReductionBlocksAndThreads(s, blocks, threads);
//multiple reductions
reduce<T>(s, threads, blocks, d_odata, d_odata, multiple, n);
s = (s + (threads*2-1)) / (threads*2);
}
}
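//explicit instantiations for the element types used by the simulation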
template int
reduceArray<int>(int n, int* d_idata, int* d_odata);
template float
reduceArray<float>(int n, float* d_idata, float* d_odata);
template double
reduceArray<double>(int n, double* d_idata, double* d_odata);
template void
reduceMultipleArrays<int>(int n, int* d_idata, int* d_odata, int multiple);
template void
reduceMultipleArrays<float>(int n, float* d_idata, float* d_odata, int multiple);
template void
reduceMultipleArrays<double>(int n, double* d_idata, double* d_odata, int multiple);
#endif //_REDUCTION_H_