/* [+MEQ MatlabEQuilibrium Toolbox+] Swiss Plasma Center EPFL Lausanne 2022. All rights reserved. */
# include "meq.h"
/* Linear index of element (i, j) in an array whose first dimension has length n
 * (the i index varies fastest). */
# define IND(i, j, n) ((i)+(n)*(j))
/* Pointer to element (i, j) of v, using the variable `nz` that must be in scope
 * at the point of use as the first-dimension length. */
# define GIND(v, i, j) ((v) + IND(i, j, nz))
/* Pointer to element (i, l) of v, using the variable `nr2` that must be in scope
 * at the point of use as the first-dimension length. */
# define CIND(v, i, l) ((v) + IND(i, l, nr2))

/* GSP: in-place two-sweep recurrence on the nr2 values stored just below pi,
 * using column l of the coefficient arrays cr, cq and cs (column length nr2).
 *
 * Relies on these names from the enclosing scope: pi, cr, cq, cs, nr2, l, j.
 * On entry pi must point one past the last of the nr2 values (call them
 * p[0..nr2-1]); on exit pi again points one past the last value. j is clobbered.
 *
 * Writing x(k,l) for x[k + nr2*l], the macro performs, in this order:
 *   backward sweep: p[k] += cr(k,l) * p[k+1]          for k = nr2-2 down to 0
 *   first element : p[0]  = cq(0,l) * p[0]
 *   forward sweep : p[k]  = cq(k,l) * p[k] + cs(k,l) * p[k-1]   for k = 1 .. nr2-1
 * cr(nr2-1,l) and cs(0,l) are never read.
 *
 * NOTE(review): this looks like the application of a pre-factorised tridiagonal
 * solve along r -- confirm against the code that builds cq/cr/cs.
 * NOTE(review): the `for (j = nr2; --j; )` loops assume nr2 >= 1. */
# define GSP \
{ \
  FLT pp = *--pi, *crl = CIND(cr, nr2-1, l), *cql = CIND(cq, 0, l), *csl = CIND(cs, 1, l); \
  for (j = nr2; --j; ) pp = *--pi += *--crl * pp; \
  *pi = pp = *cql++ * *pi; pi++; \
  for (j = nr2; --j; pi++) pp = *pi = *cql++ * *pi + *csl++ * pp; \
}
/* GSU: add the nr2 values stored just below pi to row i of Fx.
 *
 * Relies on these names from the enclosing scope: pi, f, Fx, i, j, nz, nr2.
 * On entry pi must point one past the last of the nr2 values; the loop walks
 * backwards, adding p[nr2-1] to Fx(i, nr2), ..., p[0] to Fx(i, 1), so pi ends
 * at the first value. f and j are clobbered. */
# define GSU \
{ \
  for (j = nr2, f = GIND(Fx, i, nr2); j--; f -= nz ) *f += *--pi; \
}

/*
 * gszr: compute ntot consecutive nz-by-nr arrays Fx from interior sources Jy and
 * boundary values Fb, optionally followed by a first-order correction in z.
 *
 * Array layouts as used by the code (i is the fast index, 0 <= i < nz; 0 <= j < nr):
 *   Fx : output, ntot blocks of nr*nz values, element (i,j) at Fx[i + nz*j].
 *   Jy : input, ntot blocks of (nr-2)*(nz-2) values holding the interior points
 *        (i = 1..nz-2 fastest, j = 1..nr-2).
 *   Fb : input, ntot blocks of 2*(nz + nr-2) values:
 *          nz values for column j = 0,
 *          then for each j = 1..nr-2 the pair (i = 0, i = nz-1),
 *          then nz values for column j = nr-1.
 *   cx : nr-2 factors; interior column j of Jy is multiplied by cx[j-1].
 *   cq, cr, cs : coefficient arrays read by the GSP macro, addressed as
 *        x[k + (nr-2)*l] with k the position along r and l a column selector.
 *   ci, co : factors applied to the j = 0 and j = nr-1 boundary columns when they
 *        are added into the neighbouring columns j = 1 and j = nr-2.
 *   p  : scratch buffer of at least nr-2 values, reused for every row.
 *   dz : if non-zero, Fx += dz * dFx/dz is applied at the end (finite differences).
 *
 * NOTE(review): the stride/level bookkeeping (shifts of nz-1) looks like a cyclic
 * reduction over z that presumably needs nz-1 to be a power of two -- confirm.
 * NOTE(review): the number of columns required in cq/cr/cs is not visible here;
 * l takes values below nz-1.
 * NOTE(review): loops of the form `for (x = n; --x; )` assume n >= 1, so nr >= 3
 * and nz >= 3 appear to be required.
 */
void FLT_NAME(gszr)(FLT *Fx, FLT *Fb, FLT *Jy, FLT *cx,
                    FLT *cq, FLT *cr, FLT *cs, FLT ci, FLT co,
                    FLT *p, int nz, int nr, int ntot, FLT dz)
{
  /* nz1/nr1: last index in each direction; nz2/nr2: number of interior points */
  int    nz1 = nz-1, nz2 = nz1-1;
  int    nr1 = nr-1, nr2 = nr1-1;
  int    i, j, k, l;
  FLT *f, *pi;
  /* One pass per problem; Jy, Fb and Fx are advanced at the end of each pass */
  for (k = 0; k < ntot; k++) {
    /* Initialize Fx with Jy ... */
    {
      FLT c, *Jp = Jy, *cxp = cx;
      f = GIND(Fx, 1, 1);
      for (j = nr1; --j;) {
        /* one factor per interior column */
        c = *cxp++;
        for (i = nz1; --i; ) *f++ = *Jp++ * c;
        /* skip rows nz-1 of this column and 0 of the next one */
        f += 2;
      }
    }
    /* ... and Fb */
    {
      /* Columns j = 0 and j = nr-1 are copied from Fb; the same values scaled by
       * ci / co are added to columns j = 1 and j = nr-2. Rows 0 and nz-1 of those
       * two columns are accumulated into here but overwritten by the next block. */
      FLT *f1 = Fx, *fb1 = Fb, *fnr = GIND(Fx, 0, nr1), *fbnr = Fb + nz + 2*nr2,	*f2 = GIND(Fx, 0, 1), *fnr1 = GIND(Fx, 0, nr2);
      for (i = nz; i--;) {
        *f1++    = *fb1        ;
        *fnr++   = *fbnr       ;
        *f2++   += *fb1++  * ci;
        *fnr1++ += *fbnr++ * co;
      }
    }
    {
      /* Rows i = 0 and i = nz-1 of columns j = 1..nr-2, stored as interleaved pairs */
      FLT *f1 = GIND(Fx, 0, 1), *fnz = GIND(Fx, nz1, 1), *fb = Fb + nz;
      for (j = nr2; j--;) {
        *f1  = *fb++; f1  += nz;
        *fnz = *fb++; fnz += nz;
      }
    }

    /* The four following loops on i can be parallelised, but that requires independent p storage,
     * size nz-2 x nr-2, and pi = p + (i-1)*nr2 */
    {
      /* Even rows: p = 2*F(i,:), F(i,:) = F(i+1,:) + F(i-1,:), then one GSP with
       * coefficient column l and accumulation of p into F(i,:) */
      l = (nz1>>1) - 1;
      for (i = 2; i < nz1; i += 2) {
        pi = p;
        f = GIND(Fx, i, 1);
        for (j = nr2; j--; f += nz) {
          *pi++ = FLTC(2.0) * *f;
          *f = *(f+1) + *(f-1);
        }
        GSP;
        GSU;
      }
    }

    /* li: step between coefficient columns, lo: li/2 (first column is lo-1),
     * id/ih: row offsets used in the stencils, io: row stride of the current level */
    int li = nz1>>1, lo = li>>1, id = 2, ih = 1, io = 4;
    {
      /* Levels with doubling row stride, while lo > 1. Each processed row combines
       * itself with rows at offsets ih, id and id+ih, applies GSP once per
       * coefficient column l = lo-1, lo-1+li, ... (< nz-1), then accumulates p. */
      int ii;
      while (lo > 1) {
        ii = id+ih;
        for (i = io; i < nz1; i += io) {
          pi = p;
          f = GIND(Fx, i, 1);
          for (j = nr2; j--; f += nz) {
            *pi = *f - *(f+ii) - *(f-ii);
            *f += *(f+id) + *(f-id) - *(f+ih) - *(f-ih);
            /* uses the value of *f just updated above */
            *pi++ += *f;
          }
          for (l = lo-1; l < nz1; l += li) GSP;
          GSU;
        }
        id <<= 1; ih <<= 1; io <<= 1; lo >>= 1; li >>= 1;
      }
    }

    {
      /* Levels with halving row stride, until ih reaches 0: rows start at io and
       * advance by ii; lo and li double after each level. */
      int ii = io;
      io = id;
      while (ih >= 1) {
        for (i = io; i < nz1; i += ii) {
          pi = p;
          f = GIND(Fx, i, 1);
          for (j = nr2; j--; f += nz) {
            *pi++ = FLTC(2.0) * *f + *(f+id) + *(f-id);
            *f -= *(f+ih) + *(f-ih);
          }
          for (l = lo-1; l < nz1; l += li) GSP;
          GSU;
        }
        id >>= 1; ih >>= 1; io >>= 1; ii >>= 1; lo <<= 1; li <<= 1;
      }
    }

    {
      /* Odd rows: p = 2*F(i,:) + F(i+1,:) + F(i-1,:), F(i,:) is zeroed so that the
       * accumulation in GSU stores the result of the single GSP (column l = lo-1) */
      l = lo-1;
      for (i = 1; i < nz1; i += 2) {
        pi = p;
        f = GIND(Fx, i, 1);
        for (j = nr2; j--; f += nz) {
          *pi++ = FLTC(2.0) * *f + *(f+1) + *(f-1);
          *f = FLTC(0.0);
        }
        GSP;
        GSU;
      }
    }

    /* Add dFx/dz * dz */
    if (dz) {
      /* Per column: one-sided difference times dz at i = 0 and i = nz-1, centred
       * difference times dz/2 elsewhere. Each increment is computed from values
       * not yet updated: d0 holds the pending increment for *f while d1 is
       * evaluated for the next row. */
      FLT hdz = FLTC(0.5) * dz, d0, d1, *f1 = Fx;
      for (j = nr; j--; ) {
        f = f1;
        d0 = (*(++f1) - *f) * dz;
        for (i = nz1; --i; ) {
          d1 = (*(++f1) - *f) * hdz;
          *f++ += d0;
          d0 = d1;
        }
        d1 = (*f1 - *f) * dz;
        *f += d0;
        /* also moves f1 to the first row of the next column */
        *f1++ += d1;
      }
    } /* if (dz) */

    /* Prepare inputs and outputs for next solution */
    Jy += nr2*nz2;
    Fb += 2*(nz+nr2);
    Fx += nr*nz;
  }
}