//
//
// File author(s):  <Gabriel Rucabado and Antonio Garcia Dopico>, (C) 2014
//
// Copyright: this software is licenced under the terms stipulated in the license.txt file located in its root directory
//
//

//!\file solverIm-gpu.cpp
//  \brief The implicit gpu solver of Neurite.
#include <math.h>
#include "solverIm-gpu.h"
#include "discretization.h"
#include "cudaException.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <fstream>
using namespace std;


// Includes from cuda
#include "cuda_runtime.h"
#include "cublas_v2.h"

#define TYPE double


#define NORMAL 1
#define BRANCH 2
#define TERM 4
#define NEXT_TERM 8
#define FIRST_NORMAL 16
#define FIRST_BRANCH 32
#define OTHER 64

//#######################################
//#### HEADERDS of CUDA KERNELS #########
//#######################################
__global__ void Update_HH (int numElem, TYPE *m, TYPE *h, TYPE *n, TYPE dT,
			   TYPE *oldPot, TYPE *restPot, int *indVec, 
			   TYPE *G_L, TYPE *E_Na, TYPE *E_K, TYPE *E_L, 
			   TYPE *g_Na, TYPE *g_K, TYPE *G_Na, TYPE *G_K, 
			   TYPE *leftShiftNa, TYPE *leftShiftK,
			   TYPE *W, TYPE *K);
__global__ void Update_Mat (int numElem, short *type, TYPE *csrVal, TYPE dT,
			    TYPE *r_a, TYPE *c_m, TYPE *dx, TYPE *W,
			    int *indVec_DL, int *indVec, int *indMat);
__global__ void Update_B (int numElem, TYPE *K, TYPE *input, TYPE dT,
			  TYPE *c_m, TYPE *dx, TYPE *oldPot, TYPE *b);
__global__ void div (TYPE *op1, TYPE *op2, TYPE *res);
__global__ void divMul (TYPE *num1, TYPE *denom1, TYPE *num2, TYPE *denom2, TYPE *res);



//#######################################
//#### BODY of CUDA KERNELS #############
//#######################################
__global__ void Update_HH (int numElem, TYPE *m, TYPE *h, TYPE *n, TYPE dT,
			   TYPE *pot, TYPE *restPot, int *indVec, 
			   TYPE *G_L, TYPE *E_Na, TYPE *E_K, TYPE *E_L, 
			   TYPE *g_Na, TYPE *g_K, TYPE *G_Na, TYPE *G_K, 
			   TYPE *leftShiftNa, TYPE *leftShiftK,
			   TYPE *W, TYPE *K){
  TYPE dt	= dT * 1000;
  int id	= blockIdx.x * blockDim.x + threadIdx.x;
  if (id < numElem) {
    TYPE temp_G_Na, temp_G_K;
    TYPE mloc, nloc, hloc;
    TYPE Am, Ah, An, Bm, Bh, Bn; 
    int indexVec  = indVec[id];
    TYPE V_prevNa = ((pot[indexVec] + leftShiftNa[id]) - restPot[id]) / SCALE; 
    TYPE V_prevK  = ((pot[indexVec] + leftShiftK[id]) - restPot[id]) / SCALE; 

    Am 	= (2.5 - 0.1 * V_prevNa) / (exp(2.5 - 0.1 * V_prevNa) - 1);
    Bm 	= 4 * exp(-V_prevNa / 18);
    Ah 	= 0.07 * exp(-V_prevNa / 20);
    Bh	= 1 / (exp(3 - 0.1 * V_prevNa) + 1);
    An	= (0.1 - 0.01 * V_prevK) / (exp(1 - 0.1 * V_prevK)-1);
    Bn	= 0.125 * exp(-V_prevK / 80);	

    mloc	= m[id];
    hloc	= h[id];
    nloc	= n[id];

    m[id] = mloc = mloc + dt * (Am * (1 - mloc) - Bm * mloc);	    
    h[id] = hloc = hloc + dt * (Ah * (1 - hloc) - Bh * hloc);	  
    n[id] = nloc = nloc + dt * (An * (1 - nloc) - Bn * nloc);

    G_Na[id]  = temp_G_Na = g_Na[id] * mloc * mloc * mloc * hloc;
    G_K[id]   = temp_G_K  = g_K[id] * nloc * nloc * nloc * nloc;
    
    W[id] 	= -(temp_G_Na + temp_G_K + G_L[id]);
    K[indexVec] = temp_G_Na * E_Na[id] + temp_G_K * E_K[id] + G_L[id] * E_L[id];
  }
}



//! \brief Update values related to HH nodes of implicit Matrix 
__global__ void Update_Mat (int numElem, short *type, TYPE *csrVal, TYPE dT,
			    TYPE *r_a, TYPE *c_m, TYPE *dx, TYPE *W,
			    int *indVec_DL, int *indVec, int *indMat){
  double alpha, betaR, betaL, betaMe, beta, delta, gamma;
  // Figure out id and index of thread 
  int id	= blockIdx.x * blockDim.x + threadIdx.x;

  // Update Matrix (HH elements)
  if (id < numElem) {
    int indexVec = indVec[id];
    int indexMat = indMat[id]; 

    // Update Matrix
    alpha = -1/(r_a[indexVec] * dx[indexVec]);
    betaR = 1/(r_a[indexVec+1] * dx[indexVec+1]);
    betaMe = -(W[id] * dx[indexVec]) + 1/(r_a[indexVec] * dx[indexVec]);
    beta = (c_m[indexVec] * dx[indexVec])/dT + betaMe + betaR;
    delta = -betaR;

    switch (type[id]){
    case (NEXT_TERM):
      csrVal[indexMat]   = alpha;
      csrVal[indexMat+1] = beta + delta;
      break;
    case (BRANCH) :
      betaL = 1/(r_a[indVec_DL[id]] * dx[indVec_DL[id]]);
      gamma = -betaL;
    
      csrVal[indexMat]   = alpha;
      csrVal[indexMat+1] = beta + betaL;
      csrVal[indexMat+2] = delta;
      csrVal[indexMat+3] = gamma;
      break;
    case (NORMAL) :
      csrVal[indexMat]   = alpha;
      csrVal[indexMat+1] = beta;
      csrVal[indexMat+2] = delta;
      break;
    case (FIRST_NORMAL) :
      csrVal[indexMat]   = alpha + beta;
      csrVal[indexMat+1] = delta;
      break;
    case (FIRST_BRANCH) :
      betaL = 1/(r_a[indVec_DL[id]] * dx[indVec_DL[id]]);
      gamma = -betaL;
      csrVal[indexMat]   = alpha + beta + betaL;
      csrVal[indexMat+1] = delta;
      csrVal[indexMat+2] = gamma;
      break;
    default:
      break;
    }
  }
}




//! \brief Update values from B vector
__global__ void Update_B (int numElem, TYPE *K, TYPE *input, TYPE dT,
			  TYPE *c_m, TYPE *dx, TYPE *pot, TYPE *b){
  //figure out id of thread
  int id	= blockIdx.x * blockDim.x + threadIdx.x;

  //figure out B
  if (id < numElem) {
    b[id] = c_m[id] * dx[id] * pot[id]/dT + K[id] * dx[id] + input[id];
  }
}


//! \brief Divide kernel
__global__ void div (TYPE *num, TYPE *denom, TYPE *res){
  register double a = *num, b = *denom;
  if (threadIdx.x == 0){
    res[0] = +(a/b);
  } else if (threadIdx.x == 1){
    res[1] = -(a/b);
  }else{
    _INFO_CUDA ("It should not be here!!!\n");
  }
}


//! \brief Divide and multiple kernel
__global__ void divMul (TYPE *num1, TYPE *denom1, TYPE *num2, TYPE *denom2, TYPE *res){
  if (threadIdx.x == 0){
    res[0] = (num1[0] / denom1[0]) * (num2[0] / denom2[0]);
  }else{
    _INFO_CUDA ("It should not be here!!!\n");
  }  
}

//#################################################
//#### BODY of PRIVATE METHODS ####################
//#################################################
void SolverImGPU::prevBoundaries (discretization **elements,
				    int *CPU_indMat_HH, int *CPU_indVec_DL_HH, short *CPU_type_HH,
				    TYPE *CPU_csr_Val, int *CPU_csr_Col, int * CPU_csr_Row){
  discretization *elem;
  double alpha, betaR, betaL, betaMe, beta, delta, gamma;

  // 0 row
  if (elements[0]->get_kind_HH()){
    CPU_indMat_HH[this->num_HH] = 0;
    CPU_indVec_DL_HH[this->num_HH] = -1;
    CPU_type_HH[this->num_HH++] = OTHER;
  }
  CPU_csr_Row[0] = 0;
  CPU_csr_Val[this->nnz] = 1;
  CPU_csr_Col[this->nnz++] = 0;

  //1 row
  elem = elements[1];

  alpha = -1/(elem->get_r_a() * elem->get_dX());
  betaR = 1/(elements[elem->get_DR()]->get_r_a()*elements[elem->get_DR()]->get_dX());
  betaL = elem->get_branching()/
    (elements[elem->get_DL()]->get_r_a()*elements[elem->get_DL()]->get_dX());
  betaMe = -elem->get_W() * elem->get_dX();
  beta = (elem->get_c_m() * elem->get_dX())/this->dT + betaMe - alpha + betaL + betaR;
  delta = -betaR;

  CPU_csr_Row[1] = 1;
  CPU_csr_Val[this->nnz] = alpha + beta;
  CPU_csr_Col[this->nnz++] = 1;
  
  CPU_csr_Val[this->nnz] = delta;
  CPU_csr_Col[this->nnz++] = 2;

  if (elements[1]->get_DL() != 0) {
    gamma = -betaL;
    CPU_csr_Val[this->nnz] = gamma;
    CPU_csr_Col[this->nnz++] = elements[1]->get_DL();
      
    if (elements[1]->get_kind_HH()){
      CPU_indMat_HH[this->num_HH] = 1;
      CPU_indVec_DL_HH[this->num_HH] = 1;
      CPU_type_HH[this->num_HH++] = FIRST_BRANCH;
    }
  } else{
    if (elements[1]->get_kind_HH()){
      CPU_indMat_HH[this->num_HH] = 1;
      CPU_indVec_DL_HH[this->num_HH] = -1;
      CPU_type_HH[this->num_HH++] = FIRST_NORMAL;
    }
  }
}



void SolverImGPU::coreMatrix (discretization **elements,
				int *CPU_indMat_HH, int *CPU_indVec_DL_HH, short *CPU_type_HH,
				TYPE *CPU_csr_Val, int *CPU_csr_Col, int * CPU_csr_Row){
  // 2<-->(n-2) Rows
  int i;
  discretization *elem, *nextElem;
  double alpha, betaR, betaL, betaMe, beta, delta, gamma;

  for (i=2; i<this->num_elem-2; i++){
    elem = elements[i];
    nextElem = elements[i+1];
    
    if (elem->get_kind_HH()){
      CPU_indMat_HH[this->num_HH] = this->nnz;
    }

    CPU_csr_Row[i] = this->nnz;

    if(elem->get_DR() == i){ 
      CPU_csr_Val[this->nnz] 	= 1;
      CPU_csr_Col[this->nnz++] = i;
      if (elem->get_kind_HH()){
	CPU_indVec_DL_HH[this->num_HH] = -1;
	CPU_type_HH[this->num_HH++] = TERM;
      }

    } else{
      alpha = -1/(elem->get_r_a() * elem->get_dX());
      betaR = 1/(elements[elem->get_DR()]->get_r_a()*elements[elem->get_DR()]->get_dX());
      betaL = elem->get_branching()/
	(elements[elem->get_DL()]->get_r_a()*elements[elem->get_DL()]->get_dX());
      betaMe = -elem->get_W() * elem->get_dX();
      beta = (elem->get_c_m() * elem->get_dX())/this->dT + betaMe - alpha + betaL + betaR;
      delta = -betaR;

      CPU_csr_Val[this->nnz] = alpha;
      CPU_csr_Col[this->nnz++] = elem->get_mother();

      if(nextElem->get_DR() == i+1){
	CPU_csr_Val[this->nnz] = beta + delta;
	CPU_csr_Col[this->nnz++] = i;

	if (elem->get_kind_HH()){
	  CPU_indVec_DL_HH[this->num_HH] = -1;
	  CPU_type_HH[this->num_HH++] = NEXT_TERM;
	}
      }else{
	CPU_csr_Val[this->nnz] = beta;
	CPU_csr_Col[this->nnz++] = i;

	CPU_csr_Val[this->nnz] = delta;
	CPU_csr_Col[this->nnz++] = elem->get_DR();

	if (elem->get_DL() != 0) {
	  gamma = -betaL;
	  CPU_csr_Val[this->nnz] = gamma;
	  CPU_csr_Col[this->nnz++] = elem->get_DL();

	  if (elem->get_kind_HH()){
	    CPU_indVec_DL_HH[this->num_HH] = elem->get_DL();
	    CPU_type_HH[this->num_HH++] = BRANCH;
	  }
	} else {
	  if (elem->get_kind_HH()){
	    CPU_indVec_DL_HH[this->num_HH] = -1;
	    CPU_type_HH[this->num_HH++] = NORMAL;
	  }
	}
      }
    }
  }
}




void SolverImGPU::postBoundaries (discretization **elements,
				  int *CPU_indMat_HH, int *CPU_indVec_DL_HH, short *CPU_type_HH,
				  TYPE *CPU_csr_Val, int *CPU_csr_Col, int * CPU_csr_Row){
  discretization *elem;
  double alpha, betaR, betaMe, beta, delta;

  // n-2 row
  elem = elements[this->num_elem-2];

  alpha = -1/(elem->get_r_a() * elem->get_dX());
  betaR = 1/(elements[elem->get_DR()]->get_r_a()*elements[elem->get_DR()]->get_dX());
  betaMe = -elem->get_W() * elem->get_dX();
  beta = (elem->get_c_m() * elem->get_dX())/this->dT + betaMe -alpha + betaR;
  delta = -betaR;


  if (elem->get_kind_HH()){
    CPU_indMat_HH[this->num_HH] = this->nnz;
    CPU_indVec_DL_HH[this->num_HH] = -1;
    CPU_type_HH[this->num_HH++] = NEXT_TERM;
  }
  CPU_csr_Row[this->num_elem-2] = this->nnz;
  CPU_csr_Val[this->nnz] = alpha;
  CPU_csr_Col[this->nnz++] = elem->get_mother();

  CPU_csr_Val[this->nnz] = beta + delta;
  CPU_csr_Col[this->nnz++] = this->num_elem-2;

  //n-1 row
  if (elements[this->num_elem-1]->get_kind_HH()){
    CPU_indMat_HH[this->num_HH] = this->nnz;
    CPU_indVec_DL_HH[this->num_HH] = -1;
    CPU_type_HH[this->num_HH++] = OTHER;
  }

  CPU_csr_Row[this->num_elem-1] = this->nnz;
  CPU_csr_Val[this->nnz] = 1;
  CPU_csr_Col[this->nnz++] = this->num_elem - 1;
  CPU_csr_Row[this->num_elem] = this->nnz;
}








//#################################################
//#### BODY of PUBLIC METHODS #####################
//#################################################
SolverImGPU::SolverImGPU(double dT, double **potential,
			   int num_term, int *ind_terminals, double input_val,
			   int num_HH_elements, Hodgkin_Huxley **HH_elements,
			   int num_elem, discretization **elements) {
  int i=0;
  cudaError_t error;
  TYPE onePos=+1, oneNeg=-1, cero=0;
  char *env = getenv("idGPU");
  cudaDeviceProp deviceProp;

  try {
    this->idGPU =(env != NULL)? atoi(env):CUDA_ID_DEF;
    if ((error = cudaSetDevice (this->idGPU)) != OK_CUDA){
      std::stringstream params;
      params << "idGPU=" << this->idGPU;
      throw cudaException(__FILE__, __FUNCTION__, __LINE__,	       
			  "Identifying the GPU", error,  params.str());     
    }

    cudaGetDeviceProperties(&deviceProp, this->idGPU);
    _INFO("GPU used ==> Device %d: \"%s\"", this->idGPU, deviceProp.name);

    // COMMON
    //////////////////////////////////////////////////
    _DEBUG("STARTING TO CREATE STRUCTURES OF SOLVER");
    DIM3(dimBlock_ALL, TH_ALL,1,1);
    DIM3(dimBlock_HH, TH_HH,1,1);
    DIM3(dimBlock_DIV, 2,1,1);
    DIM3(dimBlock_DIVMUL, 1,1,1);

    DIM3(dimGrid_DIV, 1,1,1);
    DIM3(dimGrid_DIVMUL, 1,1,1);
    this->dT 	= dT;

    // Create Events
    for (i=0; i<NUM_EVENT; i++){
      if ((error = cudaEventCreateWithFlags(&this->event[i],
					    cudaEventDisableTiming)) != OK_CUDA){
	std::stringstream params, msg;
	params << "event=" << &event[i];
	msg    << "creating event: " + i;
	throw cudaException(__FILE__, __FUNCTION__, __LINE__,
			    msg.str(), error,  params.str());
      }
    }
    // Create Streams
    for (i=0; i<NUM_STREAM; i++){
      cudaError_t error;
      if ((error = cudaStreamCreate(&stream[i]))!= OK_CUDA){
	std::stringstream params,msg;
	params << "stream=" << &stream[i];
	msg << "Creating the stream: " << i;
	throw cudaException(__FILE__, __FUNCTION__, __LINE__,	       
			    msg.str(), error,  params.str());				
      }
    }

    // Create CUBLAS context 
    cublasStatus_t cublasStatus;
    if ((cublasStatus = cublasCreate(&cublasHandle)) != OK_CUBLAS){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "creating context cublas", cublasStatus, "");
    }
    
    if ((cublasStatus = cublasSetPointerMode(this->cublasHandle, 
					     CUBLAS_POINTER_MODE_DEVICE)) != OK_CUBLAS){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "configurating pointers for cublas", cublasStatus, "");
    }
    
    // Create CUSPARSE context
    cusparseStatus_t cusparseStatus;
    if ((cusparseStatus = cusparseCreate(&cusparseHandle)) != OK_CUSPARSE){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "Creating cusparse context", cusparseStatus, "");
    }
    if ((cusparseStatus = cusparseSetPointerMode(this->cusparseHandle, 
						 CUSPARSE_POINTER_MODE_DEVICE)) != OK_CUSPARSE){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "Creating pointers for cusparse", cusparseStatus, "");
    }
    if ((cusparseStatus = cusparseCreateMatDescr(&this->descA)) != OK_CUSPARSE){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "Creating A matrix for cusparse", cusparseStatus, "");
    }
    if ((cusparseStatus = cusparseSetMatIndexBase(this->descA, CUSPARSE_INDEX_BASE_ZERO)) != OK_CUSPARSE){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "Creating A Matrix for cusparse", cusparseStatus, "");
    }
    if ((cusparseStatus = cusparseSetMatType(this->descA, CUSPARSE_MATRIX_TYPE_GENERAL)) != OK_CUSPARSE){
      throw cudaException(__BASE_FILE__, __FUNCTION__, __LINE__,
			  "Creating A Matrix for cusparse", cusparseStatus, "");
    }

   
    CUDA_ALLOC(&(this->AP), num_elem * sizeof(TYPE));
    CUDA_ALLOC(&(this->AS), num_elem * sizeof(TYPE));
    CUDA_ALLOC(&(this->P), num_elem * sizeof(TYPE));
    CUDA_ALLOC(&(this->R), num_elem * sizeof(TYPE));
    CUDA_ALLOC(&(this->R0), num_elem * sizeof(TYPE));
    CUDA_SET (this->AP, 0, num_elem * sizeof(TYPE), stream[0]);
    CUDA_SET (this->AS, 0, num_elem * sizeof(TYPE), stream[0]);
    CUDA_SET (this->P, 0, num_elem * sizeof(TYPE), stream[0]);
    CUDA_SET (this->R, 0, num_elem * sizeof(TYPE), stream[0]);
    CUDA_SET (this->R0, 0, num_elem * sizeof(TYPE), stream[0]);


    CUDA_ALLOC(&(this->p0), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->p1), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->apr), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->alpha), 2 * sizeof(TYPE));
    CUDA_ALLOC(&(this->w1), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->w2), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->w), 2 * sizeof(TYPE));
    CUDA_ALLOC(&(this->beta), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->onePos), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->oneNeg), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->zero), 1 * sizeof(TYPE));
    CUDA_ALLOC(&(this->res), 1 * sizeof(TYPE));

    CUDA_CPY (this->onePos, &onePos, 1*sizeof(TYPE), stream[0]);
    CUDA_CPY (this->oneNeg, &oneNeg, 1*sizeof(TYPE), stream[0]);
    CUDA_CPY (this->zero,   &cero,   1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->p0,   0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->p1,   0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->apr,  0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->w1,   0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->w2,   0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->beta, 0, 1*sizeof(TYPE), stream[0]);
    CUDA_SET (this->alpha,0, 2*sizeof(TYPE), stream[0]);
    CUDA_SET (this->w,    0, 2*sizeof(TYPE), stream[0]);


    // TERMINALS
    //////////////////////////////////////////////////
    this->num_term 	= num_term;


    // ALL
    //////////////////////////////////////////////////
    this->num_elem	= num_elem;
    //   POTENTIAL
    CUDA_ALLOC(&this->pot, num_elem * sizeof(TYPE));
    CUDA_CPY(this->pot, potential[0], sizeof(TYPE)*num_elem, stream[0]);

    //   ATTRIBUTES
    TYPE *CPU_dx	= new TYPE[num_elem];
    TYPE *CPU_c_m	= new TYPE[num_elem];
    TYPE *CPU_r_a	= new TYPE[num_elem];
    TYPE *CPU_K		= new TYPE[num_elem];
    TYPE *CPU_input_on	= new TYPE[num_elem];
    TYPE *CPU_input_off	= new TYPE[num_elem];
    for (i=0; i<num_elem; i++){
      CPU_dx[i]		= elements[i]->get_dX();
      CPU_c_m[i]	= elements[i]->get_c_m();
      CPU_r_a[i]	= elements[i]->get_r_a();
      CPU_K[i]		= elements[i]->get_K();
      CPU_input_off[i]	= 0;
      CPU_input_on[i]	= elements[i]->get_input_current();
    }

    DIM3(dimGrid_ALL, (this->num_elem + TH_ALL - 1)/TH_ALL,1,1);
    int num_all_cuda = dimGrid_ALL.x * dimBlock_ALL.x;

    CUDA_ALLOC(&(this->dx), num_all_cuda * sizeof(TYPE));
    CUDA_ALLOC(&(this->c_m), num_all_cuda * sizeof(TYPE));
    CUDA_ALLOC(&(this->r_a), num_all_cuda * sizeof(TYPE));
    CUDA_ALLOC(&(this->K), num_all_cuda * sizeof(TYPE));
    CUDA_ALLOC(&(this->input_on), num_all_cuda * sizeof(TYPE));
    CUDA_ALLOC(&(this->input_off), num_all_cuda * sizeof(TYPE));
    CUDA_CPY(this->dx, CPU_dx, this->num_elem * sizeof(TYPE), stream[0]);
    CUDA_CPY(this->c_m, CPU_c_m, this->num_elem * sizeof(TYPE), stream[0]);
    CUDA_CPY(this->r_a, CPU_r_a, this->num_elem * sizeof(TYPE), stream[0]);
    CUDA_CPY(this->K, CPU_K, this->num_elem * sizeof(TYPE), stream[0]);
    CUDA_CPY(this->input_on, CPU_input_on, this->num_elem * sizeof(TYPE), stream[0]);
    CUDA_CPY(this->input_off, CPU_input_off, this->num_elem * sizeof(TYPE), stream[0]);

    // HH
    //////////////////////////////////////////////////
    this->num_HH	= num_HH_elements;
    if (this->num_HH > 0){
      int *CPU_indVec_HH	= new int [num_HH_elements];
      TYPE *CPU_W		= new TYPE [num_HH_elements];
      TYPE *CPU_Shift_Na	= new TYPE [num_HH_elements];
      TYPE *CPU_Shift_K	= new TYPE [num_HH_elements];
      TYPE *CPU_n		= new TYPE [num_HH_elements];
      TYPE *CPU_m		= new TYPE [num_HH_elements];
      TYPE *CPU_h		= new TYPE [num_HH_elements];
      TYPE *CPU_E_Na	= new TYPE [num_HH_elements];
      TYPE *CPU_G_Na	= new TYPE [num_HH_elements];
      TYPE *CPU_g_Na	= new TYPE [num_HH_elements];
      TYPE *CPU_E_L	= new TYPE [num_HH_elements];
      TYPE *CPU_G_L	= new TYPE [num_HH_elements];
      TYPE *CPU_E_K	= new TYPE [num_HH_elements];
      TYPE *CPU_G_K	= new TYPE [num_HH_elements];
      TYPE *CPU_g_K	= new TYPE [num_HH_elements];
      TYPE *CPU_rest_pot	= new TYPE [num_HH_elements];
      for (i=0; i<num_HH_elements; i++){
	CPU_indVec_HH[i]	= HH_elements[i]->get_element();
	CPU_W[i]		= HH_elements[i]->get_W();
	CPU_Shift_Na[i]	= HH_elements[i]->get_Left_Shift_Na();
	CPU_Shift_K[i]	= HH_elements[i]->get_Left_Shift_K();
	CPU_n[i]		= HH_elements[i]->get_n();
	CPU_m[i]		= HH_elements[i]->get_m();
	CPU_h[i]		= HH_elements[i]->get_h();
	CPU_E_Na[i]	= HH_elements[i]->get_E_Na();
	CPU_G_Na[i]	= HH_elements[i]->get_G_Na();
	CPU_g_Na[i]	= HH_elements[i]->get_g_Na();
	CPU_E_L[i]	= HH_elements[i]->get_E_L();
	CPU_G_L[i]	= HH_elements[i]->get_G_L();
	CPU_E_K[i]	= HH_elements[i]->get_E_K();
	CPU_G_K[i]	= HH_elements[i]->get_G_K();
	CPU_g_K[i]	= HH_elements[i]->get_g_K();
	CPU_rest_pot[i]	= HH_elements[i]->get_rest_pot();
      }

      DIM3(dimGrid_HH, (this->num_HH + TH_HH - 1)/TH_HH,1,1);
      int num_HH_cuda = dimGrid_HH.x * dimBlock_HH.x;

      CUDA_ALLOC(&(this->indVec_HH), num_HH_cuda * sizeof(int));
      CUDA_ALLOC(&(this->W), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->leftShiftNa), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->leftShiftK), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->n), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->m), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->h), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->E_Na), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->G_Na), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->g_Na), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->E_L), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->G_L), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->E_K), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->G_K), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->g_K), num_HH_cuda * sizeof(TYPE));
      CUDA_ALLOC(&(this->rest_pot), num_HH_cuda * sizeof(TYPE));

      CUDA_CPY(this->indVec_HH, CPU_indVec_HH, this->num_HH * sizeof(int), stream[0]);
      CUDA_CPY(this->W, CPU_W, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->leftShiftNa, CPU_Shift_Na, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->leftShiftK, CPU_Shift_K, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->n, CPU_n, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->m, CPU_m, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->h, CPU_h, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->E_Na, CPU_E_Na, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->G_Na, CPU_G_Na, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->g_Na, CPU_g_Na, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->E_L, CPU_E_L, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->G_L, CPU_G_L, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->E_K, CPU_E_K, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->G_K, CPU_G_K, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->g_K, CPU_g_K, this->num_HH * sizeof(TYPE), stream[0]);
      CUDA_CPY(this->rest_pot, CPU_rest_pot, this->num_HH * sizeof(TYPE), stream[0]);
    }
  }catch (cudaException &e){
    e.print();
    throw e;
  }

  // Creation temporal structures that related to the matrix
  //////////////////////////////////////////////////
  TYPE *CPU_csr_Val 	= new TYPE [this->num_elem*4];
  int *CPU_csr_Col 	= new int [this->num_elem*4];
  int *CPU_csr_Row 	= new int [this->num_elem+1];
  int *CPU_indMat	= new int [this->num_HH];
  int *CPU_indVec_DL	= new int [this->num_HH];
  short *CPU_type	= new short [this->num_HH];

  this->nnz=0;
  this->num_HH=0;
  this->prevBoundaries(elements, CPU_indMat, CPU_indVec_DL,
		       CPU_type, CPU_csr_Val, CPU_csr_Col, CPU_csr_Row);
  _DEBUG("prev->coreMatrix"); 
  this->coreMatrix(elements, CPU_indMat, CPU_indVec_DL,
		   CPU_type, CPU_csr_Val, CPU_csr_Col, CPU_csr_Row);
  _DEBUG("coreMatrix->post"); 
  this->postBoundaries(elements, CPU_indMat, CPU_indVec_DL,
		       CPU_type, CPU_csr_Val, CPU_csr_Col, CPU_csr_Row);
  
  // Transferring the matrix from CPU to GPU
  CUDA_ALLOC (&(this->csr_Val), sizeof(TYPE)*this->nnz);
  CUDA_CPY(this->csr_Val, CPU_csr_Val, sizeof(TYPE)*this->nnz, stream[0]);
  CUDA_ALLOC (&(this->csr_Col_Ind), sizeof(int)*this->nnz);
  CUDA_CPY(this->csr_Col_Ind, CPU_csr_Col, sizeof(int)*this->nnz, stream[0]);
  CUDA_ALLOC (&(this->csr_Row_Ptr), sizeof(int)*(this->num_elem+1));
  CUDA_CPY(this->csr_Row_Ptr, CPU_csr_Row, sizeof(int)*(this->num_elem+1), stream[0]);

  CUDA_ALLOC (&(this->type_HH), sizeof(short)*this->num_HH);
  CUDA_CPY(this->type_HH, CPU_type, sizeof(short)*this->num_HH, stream[0]);
  CUDA_ALLOC (&(this->indMat_HH), sizeof(int)*this->num_HH);
  CUDA_CPY(this->indMat_HH, CPU_indMat, sizeof(int)*this->num_HH, stream[0]);
  CUDA_ALLOC (&(this->indVec_DL_HH), sizeof(int)*this->num_HH);
  CUDA_CPY(this->indVec_DL_HH, CPU_indVec_DL, sizeof(int)*this->num_HH, stream[0]);

  // Free allocated temporal matrix (CPU)
  free(CPU_csr_Val);
  free(CPU_csr_Col);
  free(CPU_csr_Row);
  free(CPU_indMat);
  free(CPU_indVec_DL);

  cudaDeviceSynchronize();
}


  //!\brief This function updates the electrical variables based on the previous time step information
  //  @param input_active Flag used to indicate if it has to use  the input current to update the values
void SolverImGPU::update (bool input_active) throw(){
  
  try{
    // Set GPU card to use
    cudaSetDevice (this->idGPU);
    
    if (input_active){
      input	= input_on;
    }else{
      input	= input_off;
    }

    if (num_HH > 0){  
      KERNEL(Update_HH, dimGrid_HH, dimBlock_HH, stream[0], 
     	     "%d, %p, %p, %p, %f, %p, %p, %p, %p,"
	     "%p, %p, %p, %p, %p, %p, %p, %p, %p, %p, %p",
     	     num_HH, m, h, n, dT, pot, rest_pot, indVec_HH, 
     	     G_L, E_Na, E_K, E_L, g_Na, g_K, G_Na, G_K,
	     leftShiftNa, leftShiftK, W, K);

      KERNEL(Update_Mat, dimGrid_HH, dimBlock_HH, stream[0], 
     	     "%d, %p, %p, %f, %p, %p, %p, %p, %p, %p, %p",
     	     num_HH, type_HH, csr_Val, dT, r_a, c_m, dx, W,
	     indVec_DL_HH, indVec_HH, indMat_HH);
    }

    KERNEL(Update_B, dimGrid_ALL, dimBlock_ALL, stream[0], 
	   "%d, %p, %p, %f, %p, %p, %p, %p",
	   num_elem, K, input, dT, c_m, dx, this->pot, this->pot);
  } catch (cudaException &e){
    e.print();
    throw e;
  }
}



//! \brief Calculate one iteration of explicit finite difference discretization solver
void SolverImGPU::calculate() throw(){
  /*
R0==B
csrmv (A,x)->AP;
daxpy (R0 -AP)->R0;
dot (R0, R0)->p0;
copy (R0,P)
copy (R0,R)

while (true){
=================================
|  	STREAM 0 		|
=================================
| csrmv(A,P)->AP 		|
| dot(AP,R0)->apr 		|
| alpha=p0/apr			|
| daxpy(R-alpha*AP)->R		|
| csrmv(A, R)->As  		|
| dot(As,R)->w1			|
| dot (As, As)->w2	  	|
| w=w1/w2			|
| daxpy (Xi+ alpha*P)->Xi+1	|
| daxpy (Xi+1 + w*R)->Xi+1	|
| daxpy (R-w*As)->R		| 
| dot (R, R0)->p1		|
| beta=alpha/w * p1/p0		|
| copy(P->AS)			|
| daxpy(AS-w*AP)->AS		|
| copy(R->P)			|
| daxpy(P + beta*AS)->P		|
=================================
p0=p1 (flip)
}
  */
  int i,j;
  TYPE result, *flip, limit = RANGE_ERROR*RANGE_ERROR;

  try{
    // Set GPU card to use
    cudaSetDevice (this->idGPU);

    cublasSetStream(this->cublasHandle, stream[0]);
    cusparseSetStream(this->cusparseHandle, stream[0]);

    CUDA_CPY_SYNC(this->R0, this->pot, sizeof(TYPE)*this->num_elem);
    cusparseXcsrmv(this->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
    		   this->num_elem, this->num_elem, this->nnz, this->onePos,
    		   this->descA, this->csr_Val, this->csr_Row_Ptr, this->csr_Col_Ind,
    		   this->pot, this->zero, this->AP);
    cublasXaxpy(this->cublasHandle, this->num_elem, this->oneNeg, this->AP, 1, this->R0, 1);
    CUDA_CPY_SYNC(this->R, this->R0, sizeof(TYPE)*this->num_elem);
    CUDA_CPY_SYNC(this->P, this->R0, sizeof(TYPE)*this->num_elem);
    cublasXot(this->cublasHandle, this->num_elem, this->R0, 1, this->R0, 1, this->p0);

    for (i=0; i<MAX_ITER; i++){
      for (j=i; j<i+ITER_CHECK; j++){ 
	cusparseXcsrmv(this->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
		       this->num_elem, this->num_elem, this->nnz, this->onePos,
		       this->descA, this->csr_Val, this->csr_Row_Ptr, this->csr_Col_Ind,
		       this->P, this->zero, this->AP);
	cublasXot(this->cublasHandle, this->num_elem, this->AP, 1, this->R0, 1, this->apr);
	KERNEL(div, dimGrid_DIV, dimBlock_DIV, stream[0], 
	       "%p, %p, %p", this->p0, this->apr, this->alpha);
	cublasXaxpy(this->cublasHandle, this->num_elem, &this->alpha[1], this->AP, 1, this->R, 1);
	cusparseXcsrmv(this->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
		       this->num_elem, this->num_elem, this->nnz, this->onePos,
		       this->descA, this->csr_Val, this->csr_Row_Ptr, this->csr_Col_Ind,
		       this->R, this->zero, this->AS);
	cublasXot(this->cublasHandle, this->num_elem, this->AS, 1, this->R, 1, this->w1);
	cublasXot(this->cublasHandle, this->num_elem, this->AS, 1, this->AS, 1, this->w2);

	KERNEL(div, dimGrid_DIV, dimBlock_DIV, stream[0], 
	       "%p, %p, %p", this->w1, this->w2, this->w);

	cublasXaxpy(this->cublasHandle, this->num_elem, &(this->alpha[0]), this->P, 1, this->pot, 1);
	cublasXaxpy(this->cublasHandle, this->num_elem, &(this->w[0]), this->R, 1, this->pot, 1);

	cublasXaxpy(this->cublasHandle, this->num_elem, &this->w[1], this->AS, 1, this->R, 1);

	cublasXot(this->cublasHandle, this->num_elem, this->R, 1, this->R0, 1, this->p1);

	KERNEL(divMul, dimGrid_DIVMUL, dimBlock_DIVMUL, stream[0], 
	       "%p, %p, %p, %p, %p",
	       &(this->alpha[0]), &(this->w[0]), this->p1, this->p0, this->beta);
	CUDA_CPY_SYNC (this->AS, this->P, sizeof(TYPE)*this->num_elem);
	cublasXaxpy(this->cublasHandle, this->num_elem, &this->w[1], this->AP, 1, this->AS, 1);

	CUDA_CPY_SYNC (this->P, this->R, sizeof(TYPE)*this->num_elem);
	cublasXaxpy(this->cublasHandle, this->num_elem, this->beta, this->AS, 1, this->P, 1);

	flip 	= this->p0;
	this->p0= this->p1;
	this->p1= flip;
      }

      i=j-1;
      // SYNC HOST-DEVICE to check error
      cublasXot(this->cublasHandle, this->num_elem, this->R, 1, this->R,1,this->res);
      //COPY SYNC
      CUDA_CPY_SYNC(&result, this->res, sizeof(TYPE));
      if (result <= limit){
	break;
      }
    }


    if (result <= limit){
      _DEBUG("Finalize by precision at iteration %d [%.16e <= %.16e]",
	     i-1, result, RANGE_ERROR);
    } else {
      _DEBUG("Finalize by iterations [%.16e > %.16e]", result, RANGE_ERROR);
    }
    checkError(__PRETTY_FUNCTION__, __BASE_FILE__, __LINE__);

  }catch (cudaException &e){
    e.print();
    throw e;
  }
}

   //! \brief Copy the current potential to the pointer of the parameter
  //  @param potential Pointer on which the potential will be copied 
void SolverImGPU::snapshot(double *potential) throw(){
  try{
    CUDA_CPY(potential, this->pot, sizeof(TYPE) * this->num_elem, stream[0]);
  }catch (cudaException &e){
    e.print();
    throw e;
  }
}


//! \brief Free all memory used by solver
SolverImGPU::~SolverImGPU(){
  CUDA_FREE(this->csr_Val);
  CUDA_FREE(this->csr_Col_Ind);
  CUDA_FREE(this->csr_Row_Ptr);

  CUDA_FREE(this->pot);
  CUDA_FREE(this->dx);
  CUDA_FREE(this->c_m);
  CUDA_FREE(this->r_a);
  CUDA_FREE(this->K);
  CUDA_FREE(this->input_on);
  CUDA_FREE(this->input_off);

  CUDA_FREE(this->indVec_DL_HH);
  CUDA_FREE(this->indVec_HH);
  CUDA_FREE(this->indMat_HH);
  CUDA_FREE(this->type_HH);
  CUDA_FREE(this->n);
  CUDA_FREE(this->m);
  CUDA_FREE(this->h);
  CUDA_FREE(this->E_Na);
  CUDA_FREE(this->G_Na);
  CUDA_FREE(this->g_Na);
  CUDA_FREE(this->E_L);
  CUDA_FREE(this->G_L);
  CUDA_FREE(this->E_K);
  CUDA_FREE(this->G_K);
  CUDA_FREE(this->g_K);
  CUDA_FREE(this->rest_pot);
  CUDA_FREE(this->W);
}