/*
 * Copyright (C) 2004 Evan Thomas
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

/***********************************************************************
  Calculate the Jacobian for the state equations associated with a cell
************************************************************************/

#define P3_MODULE
#include "ndl.h"

#define COPY(v1, v2) {memcpy(v1, v2, n*sizeof(double));}

#define SQRT_DBL_EPSILON   1.5e-08
#define BR 1.43e-7  /* SQRT_DBL_EPSILON^(7/8) */
#define BU 0.011    /* SQRT_DBL_EPSILON^(1/4) */
#define BL 1.36e-6  /* SQRT_DBL_EPSILON^(3/4) */
#define UI 0.11     /* SQRT_DBL_EPSILON^(1/8) */

static double *a[4], *d[4];
static double *deltaDYDT, *deltaDYDT2;
/* Working storage */              
static int *indx;
static double *vv, *col;

/* linbcg storage */
static double *p,*pp,*r,*rr,*z,*zz, *bout;

/* Utility/debug routines */
void prtb(char*, double*, int);
void prtspA(char*, double*, int*);
void prtA(char*, double**, int);
void sprsin(double**, int, double**, int**, unsigned int*, Cell*);
void sprsTin(double**, int, double**, int**, unsigned int*, Cell*);

void matrix_init(const GD *gd, int n) {
    int i;
	static int nMax;

	if( n<nMax ) return;

	if( a[0] ) {
		for(i=0; i<4; i++) {
			PyMem_Free(a[i]);
			PyMem_Free(d[i]);
		}
		PyMem_Free(deltaDYDT);
		PyMem_Free(deltaDYDT2);
		PyMem_Free(vv);
		PyMem_Free(col);
		PyMem_Free(indx);
		PyMem_Free(p);
		PyMem_Free(pp);
		PyMem_Free(r);
		PyMem_Free(rr);
		PyMem_Free(z);
		PyMem_Free(zz);
		PyMem_Free(bout);
	}

    for(i=0; i<4; i++) {
        a[i] = getmain(n*sizeof(*a[i]));
        d[i] = getmain(n*sizeof(*d[i]));
    }
    deltaDYDT  = getmain(n*sizeof(double));
    deltaDYDT2 = getmain(n*sizeof(double));
    vv   = getmain(n*sizeof(*vv));
    col  = getmain(n*sizeof(*col));
    indx = getmain(n*sizeof(*indx));

    /* linbcg storage */
    p = getmain(3*n*sizeof(double));
    pp = getmain(3*n*sizeof(double));
    r = getmain(3*n*sizeof(double));
    rr = getmain(3*n*sizeof(double));
    z = getmain(3*n*sizeof(double));
    zz = getmain(3*n*sizeof(double));
    bout = getmain(3*n*sizeof(double));

	nMax = n;
}

static void diffcol(double t, double *X, double **J, int col, int n, double *abserr) {
    /* Calculate ith column of the Jacobian function at (X,t)

    Taken from the GNU Scientific Library.
    
    */
    
    int i, k, l;
    double h = SQRT_DBL_EPSILON;
    double a3;

    /* Algorithm based on description on pg. 204 of Conte and de Boor
    (CdB) - coefficients of Newton form of polynomial of degree 3. */

    for (i = 0; i < 4; i++){
        COPY(a[i], X);
        a[i][col] = X[col] + (i - 2.0) * h;
        derivs(t, a[i], d[i]);
    }

    for(k=1; k<5; k++) {
        for(i=0; i<4-k; i++) {
            d[i][col] = (d[i+1][col]-d[i][col]) / (a[i+k][col]-a[i][col]);
        }
    }

    /* Adapt procedure described on pg. 282 of CdB to find best
    value of step size. */

    a3 = fabs (d[0][col] + d[1][col] + d[2][col] + d[3][col]);

    if (a3 < 100.0 * SQRT_DBL_EPSILON)
        a3 = 100.0 * SQRT_DBL_EPSILON;

    h = pow (SQRT_DBL_EPSILON / (2.0 * a3), 1.0 / 3.0);

    if (h > 100.0 * SQRT_DBL_EPSILON)
        h = 100.0 * SQRT_DBL_EPSILON;

    COPY(a[0], X); a[0][col] = X[col] + h;
    derivs(t, a[0], d[0]);
    COPY(a[1], X); a[1][col] = X[col] - h;
    derivs(t, a[1], d[1]);
    for(l=0; l<n; l++)
        J[l][col] = (d[0][l] - d[1][l]) / (2.0 * h);

    *abserr = fabs (100.0 * a3 * h * h);

    return;
}

void quality_jacobian(double t, double *X,
                      double *DYDT, double *fac,
                      int n, double **J,
                      int *functionCnt, Cell *o) {
    int i;
    double abserr;

    
    for(i=0; i<n; i++) {
        *functionCnt += 3;
        diffcol(t, X, J, i, n, &abserr);
    }
}

static void addJel(int row, int col, double x,
                   int n, double **J,
                   double **spJ, int **spI, int *spCnt,
                   bool UseSparse, Cell *o) {
  /* add an element to the Jacabian, which may be sparse */

  static int k;

  if( !UseSparse ) {
    J[row][col] = x;
    return;
  }

  /* Build the TRANSPOSE sparse matrix - (see sprsin) */
  if( row==0 && col==0 ) {
      k=n;
      (*spI)[0] = n + 1;
  }

  if( row==col ) {
      (*spJ)[row] = x;
  }

  if( x!=0 && row!=col ) {
  
      if( ++k >= *spCnt ) {
    *spCnt = (int)((float)*spCnt*1.1+1);
    *spJ = PyMem_Realloc(*spJ, *spCnt*sizeof(double));
    *spI = PyMem_Realloc(*spI, *spCnt*sizeof(int));
    if( *spI==0 || *spJ==0 ) 
      ABEND("No memory in addJel\n", o);
  }

      (*spJ)[k] = x; /* Store off-diagonal elements and their columns. */
      (*spI)[k] = row;
    }

  if( row==n-1 ) 
      (*spI)[col+1] = k + 1;
 
}

void jacobian(double t, double *X,
              double *DYDT, double *fac,
              int n, double **J,
              int *functionCnt, Cell *o) {
  /* Implementation of the Salane jacobian scheme.
     Adaptive Routines for Forming Jacobians Numerically.
     Sandia technical report 86-1319
     (available electronically from Sandia labs web site)
  */
  
  int imax, col, i;
  double diff, scale, yscale;
  double xhold, delta;
  
  for(col=0; col<n; col++) {
    /* If first time through, just make a guess at fac */
    if( fac[col]==0 ) fac[col] = SQRT_DBL_EPSILON;
    
    yscale = max(fabs(X[col]), SQRT_DBL_EPSILON);
    delta = fac[col] * fabs(yscale);
    delta = min(delta, 0.1);
    delta = max(SQRT_DBL_EPSILON * fabs(yscale), delta);
    
    /* f(x+delta, y) */
    xhold = X[col];
    X[col] += delta;
    derivs(t, X, deltaDYDT);
    X[col] = xhold;
    (*functionCnt)++;
    
    diff = 0;
    imax = 0;
    for(i=0; i<n; i++) {
      double f = fabs(DYDT[i]-deltaDYDT[i]);
      /* scale and diff for calculating next fac */
      if( diff<f ) diff = f, imax = i;
      /* Jacobian column */
      J[i][col] = (deltaDYDT[i] - DYDT[i]) / delta;
    }
    scale = fabs(DYDT[imax]) > fabs(deltaDYDT[imax]) ? fabs(DYDT[imax]) : fabs(deltaDYDT[imax]);
    
    /* Adjust fac[col] for next time around */
    if( DYDT[imax]==0 || deltaDYDT[imax]==0 ) continue;
    if( diff > BU*scale ) fac[col] *= UI;
    if( BR*scale <= diff && diff <= BL*scale ) fac[col] /= UI;
    
    if( diff < BR*scale ) {
      /* Delta might be too small and we might be
	 thrashing roundoff error. So try a larger value. */            
      double sqrtfac = pow(fac[col], 0.5);
      double n1=0, n2=0;
      
      yscale = max(fabs(X[col]), SQRT_DBL_EPSILON);
      delta = sqrtfac * fabs(yscale);
      delta = min(delta, 0.1);
      delta = max(SQRT_DBL_EPSILON * fabs(yscale), delta);
      
      /* f(x+delta, y)_2 */
      xhold = X[col];
      X[col] += delta;
      derivs(t, X, deltaDYDT2);
      X[col] = xhold;
      (*functionCnt)++;
      
      for(i=0; i<n; i++) {
	if( n1<fabs(deltaDYDT[i]) )  n1 = fabs(deltaDYDT[i]);
	if( n2<fabs(deltaDYDT2[i]) ) n2 = fabs(deltaDYDT2[i]);
      }
      /* Extrapolation test 2.5 of Salane */
      if( sqrtfac*n2 > n1 ) {
	diff = 0; imax = 0;
	for(i=0; i<n; i++) {
	  double f = fabs(DYDT[i]-deltaDYDT2[i]);
	  /* scale and diff for calculating next fac */
	  if( diff<f ) diff = f, imax = i;
	  /* Jacobian column */
	  J[i][col] = (deltaDYDT2[i] - DYDT[i]) / delta;
	}
	scale = fabs(DYDT[imax]) > fabs(deltaDYDT[imax]) ? fabs(DYDT[imax]) : fabs(deltaDYDT[imax]);
	/* Adjust fac[col] for next time around */
	fac[col] = sqrtfac;
	if( DYDT[imax]==0 || deltaDYDT[imax]==0 ) continue;
	if( diff > BU*scale ) fac[col] *= UI;
	if( BR*scale <= diff && diff <= BL*scale ) fac[col] /= UI;
      }
    }
  }
}


void sparse_jacobian(double t, double *X,
		     double *DYDT, double *fac,
		     int n, double **spJ, int **spI, int *spCnt,
		     int *functionCnt, Cell *o) {
  /* Implementation of the Salane jacobian scheme.
     Adaptive Routines for Forming Jacobians Numerically.
     Sandia technical report 86-1319
     (available electronically from Sandia labs web site)
  */
  
  int imax, col, i;
  double diff, scale, yscale;
  double xhold, delta, f;
  
  for(col=0; col<n; col++) {
    /* If first time through, just make a guess at fac */
    if( fac[col]==0 ) fac[col] = SQRT_DBL_EPSILON;
    
    yscale = max(fabs(X[col]), SQRT_DBL_EPSILON);
    delta = fac[col] * fabs(yscale);
    delta = min(delta, 0.1);
    delta = max(SQRT_DBL_EPSILON * fabs(yscale), delta);
    
    /* f(x+delta, y) */
    xhold = X[col];
    X[col] += delta;
    derivs(t, X, deltaDYDT);
    X[col] = xhold;
    (*functionCnt)++;
    
    diff = 0;
    imax = 0;
    //#define GOOD
    //#ifdef GOOD
    for(i=0; i<n; i++) {
      f = fabs(DYDT[i]-deltaDYDT[i]);
      /* scale and diff for calculating next fac */
      if( diff<f ) diff = f, imax = i;
      /* Jacobian column */
      addJel(i, col, (deltaDYDT[i] - DYDT[i]) / delta,
	     n, 0, spJ, spI, spCnt, 1, o);
    }   
    //#else
    //f = fabs(DYDT[col]-deltaDYDT[col]);
    //if( diff<f ) diff = f, imax = col;
    //spJ[col] = (deltaDYDT[col] - DYDT[col]) / delta;
    //for(i=spI[col]; i<spI[col+1]; i++) {
    //  f = fabs(DYDT[spI[i]]-deltaDYDT[spI[i]]);
    //  /* scale and diff for calculating next fac */
    //  if( diff<f ) diff = f, imax = i;
    //  /* Jacobian column */
    //  spJ[i] = (deltaDYDT[spI[i]] - DYDT[spI[i]]) / delta;
    // }
    //#endif
    scale = fabs(DYDT[imax]) > fabs(deltaDYDT[imax]) ? fabs(DYDT[imax]) : fabs(deltaDYDT[imax]);
    
    /* Adjust fac[col] for next time around */
    if( DYDT[imax]==0 || deltaDYDT[imax]==0 ) continue;
    if( diff > BU*scale ) fac[col] *= UI;
    if( BR*scale <= diff && diff <= BL*scale ) fac[col] /= UI;
    
    if( diff < BR*scale ) {
      /* Delta might be too small and we might be
	 thrashing roundoff error. So try a larger value. */            
      double sqrtfac = pow(fac[col], 0.5);
      double n1=0, n2=0;
      
      yscale = max(fabs(X[col]), SQRT_DBL_EPSILON);
      delta = sqrtfac * fabs(yscale);
      delta = min(delta, 0.1);
      delta = max(SQRT_DBL_EPSILON * fabs(yscale), delta);
      
      /* f(x+delta, y)_2 */
      xhold = X[col];
      X[col] += delta;
      derivs(t, X, deltaDYDT2);
      X[col] = xhold;
      (*functionCnt)++;
      
      for(i=0; i<n; i++) {
	if( n1<fabs(deltaDYDT[i]) )  n1 = fabs(deltaDYDT[i]);
	if( n2<fabs(deltaDYDT2[i]) ) n2 = fabs(deltaDYDT2[i]);
      }
      /* Extrapolation test 2.5 of Salane */
      if( sqrtfac*n2 > n1 ) {
	diff = 0; imax = 0;
	for(i=0; i<n; i++) {
	  f = fabs(DYDT[i]-deltaDYDT2[i]);
	  /* scale and diff for calculating next fac */
	  if( diff<f ) diff = f, imax = i;
	  /* Jacobian column */
      addJel(i, col, (deltaDYDT2[i] - DYDT[i]) / delta,
	     n, 0, spJ, spI, spCnt, 1, o);
	}
	
	scale = fabs(DYDT[imax]) > fabs(deltaDYDT[imax]) ? fabs(DYDT[imax]) : fabs(deltaDYDT[imax]);
	/* Adjust fac[col] for next time around */
	fac[col] = sqrtfac;
	if( DYDT[imax]==0 || deltaDYDT[imax]==0 ) continue;
	if( diff > BU*scale ) fac[col] *= UI;
	if( BR*scale <= diff && diff <= BL*scale ) fac[col] /= UI;
      }
    }
  }

//  {
//#ifdef GOOD
//      prtspA("spJgood.dat", spJ, spI);
//#else
//      prtspA("spJbad.dat", spJ, spI);
//#endif
//    exit(0);
//  }
}

/* Solve linear equations using LU decomposition

   From Numerical Recipies in C  */

#define TINY 1.0e-20

static void ludcmp(double **a, int n, double *d, Cell *o) {
    int i, imax=0, j, k;
    double big, dum, sum, temp;

    *d = 1.0;
    for(i=0; i<n; i++) {
        big = 0.0;
        for(j=0; j<n; j++)
            if( (temp=fabs(a[i][j])) > big ) big = temp;

        if( big == 0.0 ) {
            ABEND("Singular matrix in ludcmp().\n", o);
        }

        vv[i] = 1.0/big;
    }

    for(j=0; j<n; j++) {
        for(i=0; i<j; i++) {
            sum = a[i][j];
            for(k=0; k<i; k++) sum -= a[i][k] * a[k][j];
            a[i][j] = sum;
        }

        big = 0.0;
        for(i=j; i<n; i++) {
            sum = a[i][j];
            for(k=0; k<j; k++)
                sum -= a[i][k] * a[k][j];
            a[i][j] = sum;

            /* Search for lowest pivot element */
            if( (dum = vv[i] * fabs(sum)) >= big ) {
                big = dum;
                imax = i;
            }
        }

        if( j != imax ) { /* Intechange rows? */
            for(k=0; k<n; k++) {
                dum = a[imax][k];
                a[imax][k] = a[j][k];
                a[j][k] = dum;
            }

            *d = -*d;
            vv[imax] = vv[j];
        }

        indx[j] = imax;
        if( a[j][j] == 0.0 ) a[j][j] = TINY;
        if( j != n-1 ) {
            dum = 1.0 / a[j][j];
            for(i=j+1; i<n; i++) a[i][j] *= dum;
        }
    }
}

static void lubksb(double **a, int n, double *b) {
  int i, ii=0, ip, j;
  double sum;

  for(i=0; i<n; i++) {
    ip = indx[i];
    sum = b[ip];
    b[ip] = b[i];
    if( ii )
      for(j=ii; j<=i-1; j++) sum -= a[i][j] * b[j];
    else if( sum ) ii = i;
    b[i] = sum;
  }

  for(i=n-1; i>=0; i--) {
    sum = b[i];
    for(j=i+1; j<n; j++) sum -= a[i][j] * b[j];
    b[i] = sum / a[i][i];
  }
}

void freevec(double *y) {
    PyMem_Free(y);
}

void freemat(double **y, int n) {
    int i;
    for(i=0; i<n; i++) PyMem_Free(y[i]);
    PyMem_Free(y);
}

double *vector(int n) { return getmain(n*sizeof(double)); }

double **matrix(int n) {
    int i;
    double **x;
    x = getmain(n*sizeof(double*));
    for(i=0; i<n; i++)
        x[i] = vector(n);
    return x;
}


void inv(double **in, double **out, int n, Cell *o) {
    /* Invert a matrix by LU decomposition */
    double d;
    int i, j;

    ludcmp(in, n, &d, o); /* Decompose the matrix just once. */
    for(j=0; j<n; j++) { /* Find inverse by columns. */
        for(i=0; i<n; i++) col[i]=0.0;
        col[j]=1.0;
        lubksb(in, n, col);
        for(i=0; i<n; i++) out[i][j]=col[i];
    }
}

#define EPS 1.0e-14

static void asolve(unsigned long n, double *spA, int *spI, double b[], double x[], int itrnsp);
static void atimes(unsigned long n, double *spA, int *spI, double x[], double r[], int itrnsp);

static void linbcg(unsigned long n, double *spA, int *spI, double b[],
		   double x[], double tol, int itmax, int *iter, double *err)
  /*
Solves A  x = b for x[1..n], given b[1..n], by the iterative biconjugate gradient method.
On input x[1..n] should be set to an initial guess of the solution (or all zeros); itol is 1,2,3,
or 4, specifying which convergence test is applied (see text); itmax is the maximum number
of allowed iterations; and tol is the desired convergence tolerance. On output, x[1..n] is
reset to the improved solution, iter is the number of iterations actually taken, and err is the
estimated error. The matrix A is referenced only through the user-supplied routines atimes,
which computes the product of either A or its transpose on a vector; and asolve, which solves
Ax = b or AT
x = b for some preconditioner matrix A (possibly the trivial diagonal part of A).
  */
{
  double snrm(unsigned long n, double sx[], int itol);
  unsigned long j;
  double ak,akden,bk,bkden=0,bknum,bnrm=0,dxnrm,xnrm,zm1nrm,znrm=0;
  static int itol = 1;

  /* Calculate initial residual. */
  *iter=0;
  atimes(n,spA,spI,x,r,0); /* Input to atimes is x[1..n], output is r[1..n];
                        the final 0 indicates that the matrix (not its
                        transpose) is to be used. */
    for (j=0;j<n;j++) {
      r[j]=b[j]-r[j];
      rr[j]=r[j];
    }
  
  /* atimes(n,A,r,rr,0); */ /*Uncomment this line to get the minimum residual
                            variant of the algorithm. */
  if (itol == 1) {
    bnrm=snrm(n,b,itol);
    asolve(n,spA,spI,r,z,0); /* Input to asolve is r[1..n], output is z[1..n]; 
                        the final 0 indicates that the matrix A (not
                        its transpose) is to be used. */
  }
  else if (itol == 2) {
    asolve(n,spA,spI,b,z,0);
    bnrm=snrm(n,z,itol);
    asolve(n,spA,spI,r,z,0);
  }
  else if (itol == 3 || itol == 4) {
    asolve(n,spA,spI,b,z,0);
    bnrm=snrm(n,z,itol);
    asolve(n,spA,spI,r,z,0);
    znrm=snrm(n,z,itol);
  }
  
  if( bnrm==0 ) { /* Does happen a lot! */
      for(j=0; j<n; j++) x[j] = b[j];
      *iter = 0;
      *err = 0;
      return;
  }

  while (*iter <= itmax) { /* Main loop. */
    ++(*iter);
    asolve(n,spA,spI,rr,zz,1); /* Final 1 indicates use of transpose matrix AT */

    for (bknum=0.0,j=0;j<n;j++) bknum += z[j]*rr[j];
    /* Calculate coefficient bk and direction vectors p and pp. */
    if (*iter == 1) {
      for (j=0;j<n;j++) {
        p[j]=z[j];
        pp[j]=zz[j];
      }
    }
    else {
      bk=bknum/bkden;
      for (j=0;j<n;j++) {
        p[j]=bk*p[j]+z[j];
        pp[j]=bk*pp[j]+zz[j];
      }
    }
    bkden=bknum; /* Calculate coefficient ak, new iterate x, and new 
                    residuals r and rr. */
    atimes(n,spA,spI,p,z,0);
    for (akden=0.0,j=0;j<n;j++) akden += z[j]*pp[j];
    ak=bknum/akden;
    atimes(n,spA,spI,pp,zz,1);
    for (j=0;j<n;j++) {
      x[j] += ak*p[j];
      r[j] -= ak*z[j];
      rr[j] -= ak*zz[j];
    }
    asolve(n,spA,spI,r,z,0); /* Solve A  z = r and check stopping criterion. */
    if (itol == 1)
      *err=snrm(n,r,itol)/bnrm;
    else if (itol == 2)
      *err=snrm(n,z,itol)/bnrm;
    else if (itol == 3 || itol == 4) {
      zm1nrm=znrm;
      znrm=snrm(n,z,itol);
      if (fabs(zm1nrm-znrm) > EPS*znrm) {
        dxnrm=fabs(ak)*snrm(n,p,itol);
        *err=znrm/fabs(zm1nrm-znrm)*dxnrm;
      } else {
        *err=znrm/bnrm; /* Error may not be accurate, so loop again. */
        continue;
      }
      xnrm=snrm(n,x,itol);
      if (*err <= 0.5*xnrm) *err /= xnrm;
      else {
        *err=znrm/bnrm; /* Error may not be accurate, so loop again. */
        continue;
      }
    }
    if (*err <= tol) break;
  }

}

double snrm(unsigned long n, double sx[], int itol)
  /* Compute one of two norms for a vector sx[1..n], as signaled by itol. Used by linbcg. */
{
  unsigned long i,isamax;
  double ans;
  if (itol <= 3) {
    ans = 0.0;
    for (i=0;i<n;i++) ans += sx[i]*sx[i]; /* Vector magnitude norm. */
    return sqrt(ans);
  } else {
    isamax=1;
    for (i=0;i<n;i++) { /* Largest component norm. */
      if (fabs(sx[i]) > fabs(sx[isamax])) isamax=i;
    }
    return fabs(sx[isamax]);
  }
}

static void sprsax(double *spA, int *spI,
		   double *x, double *b, int n)
/* Multiply a matrix in row-index sparse storage arrays sa and ija by a vector x[1..n], giving
a vector b[1..n]. */
{
    int i,k;
    
    for (i=0;i<n;i++) {
        b[i]=spA[i]*x[i]; /* Start with diagonal term. */
        for (k=spI[i];k<spI[i+1];k++) /* Loop over off-diagonal terms. */
            b[i] += spA[k]*x[spI[k]];
    }
}

static void sprstx(double *spA, int *spI,
		   double *x, double *b, int n)
/* Multiply the transpose of a matrix in row-index sparse storage arrays sa and ija by a vector
x[1..n], giving a vector b[1..n]. */
{
    int i,j,k;

    for (i=0;i<n;i++) b[i]=spA[i]*x[i]; /* Start with diagonal terms. */
    for (i=0;i<n;i++) { /* Loop over off-diagonal terms. */
        for (k=spI[i];k<spI[i+1];k++) {
            j=spI[k];
            b[j] += spA[k]*x[i];
        }
    }
}

static void atimes(unsigned long n, double *spA, int *spI, double x[],
		   double b[], int itrnsp)
{

  if (itrnsp) sprsax(spA,spI,x,b,n);
  else sprstx(spA,spI,x,b,n);

}

static void asolve(unsigned long n, double *spA, int *spI, double b[],
		   double x[], int itrnsp)
{
    unsigned long i;
    for(i=0;i<n;i++) x[i]=(spA[i] != 0.0 ? b[i]/spA[i] : b[i]);
}

void sprsin(double **A, int n, double **spA, int **spI,
		   unsigned int *spCnt, Cell *o)
/*
Converts a square matrix a[1..n][1..n] into row-indexed sparse storage mode. Only elements
of a with magnitude =thresh are retained. Output is in two linear arrays with dimension
nmax (an input parameter): sa[1..] contains array values, indexed by ija[1..]. The
number of elements filled of sa and ija on output are both ija[ija[1]-1]-1 (see text).
*/
{
    int i,j;
    unsigned int k;
    for (j=0;j<n;j++) (*spA)[j]=A[j][j]; /* Store diagonal elements. */
    *spI[0]=n+1; /* Index to 1st rowoff- diagonal element, if any. */
    k=n;
    for (i=0;i<n;i++) { /* Loop over rows. */
        for (j=0;j<n;j++) { /* Loop over columns. */
            if ( A[i][j] != 0 && i != j) {
                if (++k >= *spCnt) {
                    *spCnt = (int)((float)*spCnt*1.1+1);
                    *spA = PyMem_Realloc(*spA, *spCnt*sizeof(double));
                    *spI = PyMem_Realloc(*spA, *spCnt*sizeof(int));
                    if( spI==0 || spA==0 ) 
                        ABEND("No memory in sprsin\n", o);
                }
                (*spA)[k]=A[i][j]; /* Store off-diagonal elements and their columns. */
                (*spI)[k]=j;
            }
        }
        (*spI)[i+1]=k+1; /* As each row is completed, store index to next. */
    }
}

void sprsTin(double **A, int n, double **spA, int **spI,
		   unsigned int *spCnt, Cell *o)
/*
Converts a square matrix a[1..n][1..n] into row-indexed sparse storage mode. Only elements
of a with magnitude =thresh are retained. Output is in two linear arrays with dimension
nmax (an input parameter): sa[1..] contains array values, indexed by ija[1..]. The
number of elements filled of sa and ija on output are both ija[ija[1]-1]-1 (see text).
*/
{
    int i,j;
    unsigned int k;
    for (j=0;j<n;j++) (*spA)[j]=A[j][j]; /* Store diagonal elements. */
    *spI[0]=n+1; /* Index to 1st rowoff- diagonal element, if any. */
    k=n;
    for (i=0;i<n;i++) { /* Loop over rows. */
        for (j=0;j<n;j++) { /* Loop over columns. */
            if ( A[j][i] != 0 && i != j) {
                if (++k >= *spCnt) {
                    *spCnt = (int)((float)*spCnt*1.1+1);
                    *spA = PyMem_Realloc(*spA, *spCnt*sizeof(double));
                    *spI = PyMem_Realloc(*spA, *spCnt*sizeof(int));
                    if( spI==0 || spA==0 ) 
                        ABEND("No memory in sprsin\n", o);
                }
                (*spA)[k]=A[j][i]; /* Store off-diagonal elements and their columns. */
                (*spI)[k]=j;
            }
        }
        (*spI)[i+1]=k+1; /* As each row is completed, store index to next. */
    }
}

void prtspA(char *fn, double *spA, int *spI) {
    FILE *f = fopen(fn, "w");
    int i, j, k;
    int n = spI[0]-1;
    message(warn, "Jacobian is transposed.\n");
    k = n + 1;
    for(i=0; i<n; i++) {
        for(j=0; j<n; j++) {
            if( i==j ) {
                fprintf(f, "%g ", spA[i]);
                continue;
            }
            if( j==spI[k] && k<spI[i+1] )
                fprintf(f, "%g ", spA[k++]);
            else
                fprintf(f, "0 ");
        }
        fprintf(f, "\n");
    }
    fclose(f);
}

void prtA(char *fn, double **A, int n) {
    FILE *f = fopen(fn, "w");
    int i, j;
    for(i=0; i<n; i++) {
        for(j=0; j<n; j++) fprintf(f, "%g ", A[i][j]);
        fprintf(f, "\n");
    }
    fclose(f);
}

void prtb(char *fn, double *x, int n) {
    FILE *f = fopen(fn, "w");
    int i;
    for(i=0; i<n; i++) fprintf(f, "%g\n", x[i]);
    fclose(f);
}


void linsolve(double **A, double *spA, int *spI, double *b, int n,
	      double tol, Cell *o, int aIsDecomposed, int UseSparse) {
    /* Solve Ax=b, return x in b */
    double d, err=0;
    int iter, i;

    if( UseSparse ) {
        for(i=0; i<n; i++) bout[i] = 0;
        linbcg(n, spA, spI, b, bout, tol, n, &iter, &err);
        if( iter>n )
            message(warn, "linbcg() failed to achieve %g tolerance"
            " in %d interations, %g achieved\n", tol, n, err);

        for(i=0; i<n; i++) b[i] = bout[i];
    } else {
        if( !aIsDecomposed ) {
                ludcmp(A, n, &d, o);
        }
        lubksb(A, n, b);
    }
}

/*
  LU Decomposition (Iinject=0.01, 1 tree model, 40ms):
[000] info: made it in 6377s                                                    
[000] info: Integration stats:                                                  
[000] info: cellid=0 method=rw23 stepTotal=5077 stepAccepts=3441 (67.7762%) 
            functionCnt=1796033 jacobianCnt=3441 newtonCnt=15231

Biconjugate gradient (Iinject=0.01, 1 tree model, 40ms):
[000] info: made it in 10s                                                      
[000] info: Integration stats:                                                  
[000] info: cellid=0 method=rw23 stepTotal=18 stepAccepts=18 (100%) 
            functionCnt=9378 jacobianCnt=18 newtonCnt=54                                                
*/
