affbe5f5创建于 2025年3月25日历史提交
/*******************************************************************
lower level fft stuff including routines called in fftext.c and fft2d.c
*******************************************************************/
#include "fftlib.h"
#include <math.h>
#include <stdlib.h>
#define	MCACHE	(11-(int)(sizeof(double)/8))	// fft's with M bigger than this bust primary cache

// some math constants to 40 decimal places
#define MYPI		3.141592653589793238462643383279502884197	// pi
#define MYROOT2 	1.414213562373095048801688724209698078569	// sqrt(2)
#define MYCOSPID8	0.9238795325112867561281831893967882868224	// cos(pi/8)
#define MYSINPID8	0.3826834323650897717284599840303988667613	// sin(pi/8)

#ifdef _MSC_VER
#define inline __inline
#endif


/*************************************************
routines to initialize tables used by fft routines
**************************************************/

void fftCosInit(int M, double *Utbl)
{
    /* Compute Utbl, the cosine table for ffts	*/
    /* of size (pow(2,M)/4 +1)	*/
    /* INPUTS */
    /* M = log2 of fft size	*/
    /* OUTPUTS */
    /* *Utbl = cosine table	*/
    int fftN = POW2(M);
    int i1;
    Utbl[0] = 1.0;
    for (i1 = 1; i1 < fftN/4; i1++)
        Utbl[i1] = cos( (2.0 * MYPI * i1) / fftN );
    Utbl[fftN/4] = 0.0;
}

void fftBRInit(int M, short *BRLow)
{
    /* Compute BRLow, the bit reversed table for ffts	*/
    /* of size pow(2,M/2 -1)	*/
    /* INPUTS */
    /* M = log2 of fft size	*/
    /* OUTPUTS */
    /* *BRLow = bit reversed counter table	*/
    int	Mroot_1 = M / 2 - 1;
    int	Nroot_1 = POW2(Mroot_1);
    int i1;
    int bitsum;
    int bitmask;
    int bit;
    for (i1 = 0; i1 < Nroot_1; i1++) {
        bitsum = 0;
        bitmask = 1;
        for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++)
            if (i1 & bitmask)
                bitsum = bitsum + (Nroot_1 >> bit);
        BRLow[i1] = (short) bitsum;
    }
}

/************************************************
parts of ffts1
*************************************************/

static inline void bitrevR2(double *ioptr, int M, short *BRLow);
static inline void bitrevR2(double *ioptr, int M, short *BRLow)
{
    /*** bit reverse and first radix 2 stage of forward or inverse fft ***/
    double	f0r;
    double	f0i;
    double	f1r;
    double	f1i;
    double	f2r;
    double	f2i;
    double	f3r;
    double	f3i;
    double	f4r;
    double	f4i;
    double	f5r;
    double	f5i;
    double	f6r;
    double	f6i;
    double	f7r;
    double	f7i;
    double	t0r;
    double	t0i;
    double	t1r;
    double	t1i;
    double	*p0r;
    double	*p1r;
    double	*IOP;
    double 	*iolimit;
    int 	Colstart;
    int 	iCol;
    int 	posA;
    int 	posAi;
    int 	posB;
    int 	posBi;

    const int Nrems2 = POW2((M+3)/2);
    const int Nroot_1_ColInc = POW2(M)-Nrems2;
    const int Nroot_1 = POW2(M/2-1)-1;
    const int ColstartShift = (M+1)/2 +1;
    posA = POW2(M);		// 1/2 of POW2(M) complexes
    posAi = posA + 1;
    posB = posA + 2;
    posBi = posB + 1;

    iolimit = ioptr + Nrems2;
    const size_t ioptr_inc = (size_t) POW2((M / 2 + 1));
    for (; ioptr < iolimit; ioptr += ioptr_inc) {
        for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
            iCol = Nroot_1;
            p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
            IOP = ioptr + (Colstart << ColstartShift);
            p1r = IOP + BRLow[iCol]*4;
            f0r = *(p0r);
            f0i = *(p0r+1);
            f1r = *(p0r+posA);
            f1i = *(p0r+posAi);
            for (; iCol > Colstart;) {
                f2r = *(p0r+2);
                f2i = *(p0r+(2+1));
                f3r = *(p0r+posB);
                f3i = *(p0r+posBi);
                f4r = *(p1r);
                f4i = *(p1r+1);
                f5r = *(p1r+posA);
                f5i = *(p1r+posAi);
                f6r = *(p1r+2);
                f6i = *(p1r+(2+1));
                f7r = *(p1r+posB);
                f7i = *(p1r+posBi);

                t0r	= f0r + f1r;
                t0i	= f0i + f1i;
                f1r = f0r - f1r;
                f1i = f0i - f1i;
                t1r = f2r + f3r;
                t1i = f2i + f3i;
                f3r = f2r - f3r;
                f3i = f2i - f3i;
                f0r = f4r + f5r;
                f0i = f4i + f5i;
                f5r = f4r - f5r;
                f5i = f4i - f5i;
                f2r = f6r + f7r;
                f2i = f6i + f7i;
                f7r = f6r - f7r;
                f7i = f6i - f7i;

                *(p1r) = t0r;
                *(p1r+1) = t0i;
                *(p1r+2) = f1r;
                *(p1r+(2+1)) = f1i;
                *(p1r+posA) = t1r;
                *(p1r+posAi) = t1i;
                *(p1r+posB) = f3r;
                *(p1r+posBi) = f3i;
                *(p0r) = f0r;
                *(p0r+1) = f0i;
                *(p0r+2) = f5r;
                *(p0r+(2+1)) = f5i;
                *(p0r+posA) = f2r;
                *(p0r+posAi) = f2i;
                *(p0r+posB) = f7r;
                *(p0r+posBi) = f7i;

                p0r -= Nrems2;
                f0r = *(p0r);
                f0i = *(p0r+1);
                f1r = *(p0r+posA);
                f1i = *(p0r+posAi);
                iCol -= 1;
                p1r = IOP + BRLow[iCol]*4;
            }
            f2r = *(p0r+2);
            f2i = *(p0r+(2+1));
            f3r = *(p0r+posB);
            f3i = *(p0r+posBi);

            t0r   = f0r + f1r;
            t0i   = f0i + f1i;
            f1r = f0r - f1r;
            f1i = f0i - f1i;
            t1r   = f2r + f3r;
            t1i   = f2i + f3i;
            f3r = f2r - f3r;
            f3i = f2i - f3i;

            *(p0r) = t0r;
            *(p0r+1) = t0i;
            *(p0r+2) = f1r;
            *(p0r+(2+1)) = f1i;
            *(p0r+posA) = t1r;
            *(p0r+posAi) = t1i;
            *(p0r+posB) = f3r;
            *(p0r+posBi) = f3i;

        }
    }
}

static inline void fft2pt(double *ioptr);
static inline void fft2pt(double *ioptr)
{
    /***   RADIX 2 fft	***/
    double f0r, f0i, f1r, f1i;
    double t0r, t0i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[2];
    f1i = ioptr[3];

    /* Butterflys		*/
    /*
    f0	-	-	t0
    f1	-  1 -	f1
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    /* store result */
    ioptr[0] = t0r;
    ioptr[1] = t0i;
    ioptr[2] = f1r;
    ioptr[3] = f1i;
}


static inline void fft4pt(double *ioptr);
static inline void fft4pt(double *ioptr)
{
    /***   RADIX 4 fft	***/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double t0r, t0i, t1r, t1i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[4];
    f1i = ioptr[5];
    f2r = ioptr[2];
    f2i = ioptr[3];
    f3r = ioptr[6];
    f3i = ioptr[7];

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0
    f1	-  1 -	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2
    f3	-  1 -	t1	- -i -	f3
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r - t1i;
    f3i = f1i + t1r;
    f1r = f1r + t1i;
    f1i = f1i - t1r;

    /* store result */
    ioptr[0] = f0r;
    ioptr[1] = f0i;
    ioptr[2] = f1r;
    ioptr[3] = f1i;
    ioptr[4] = f2r;
    ioptr[5] = f2i;
    ioptr[6] = f3r;
    ioptr[7] = f3i;
}

static inline void fft8pt(double *ioptr);
static inline void fft8pt(double *ioptr)
{
    /***   RADIX 8 fft	***/
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[8];
    f1i = ioptr[9];
    f2r = ioptr[4];
    f2i = ioptr[5];
    f3r = ioptr[12];
    f3i = ioptr[13];
    f4r = ioptr[2];
    f4i = ioptr[3];
    f5r = ioptr[10];
    f5i = ioptr[11];
    f6r = ioptr[6];
    f6i = ioptr[7];
    f7r = ioptr[14];
    f7i = ioptr[15];
    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0	-	-	f0
    f1	-  1 -	f1	-	-	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2	-	-	f2
    f3	-  1 -	t1	- -i -	f3	-	-	f3
    f4	-	-	t0	-	-	f4	-  1 -	t0
    f5	-  1 -	f5	-	-	f5	- w3 -	f4
    f6	-	-	f6	-  1 -	f6	- -i -	t1
    f7	-  1 -	t1	- -i -	f7	- iw3-	f6
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r - t1i;
    f3i = f1i + t1r;
    f1r = f1r + t1i;
    f1i = f1i - t1r;


    t0r = f4r + f5r;
    t0i = f4i + f5i;
    f5r = f4r - f5r;
    f5i = f4i - f5i;

    t1r = f6r - f7r;
    t1i = f6i - f7i;
    f6r = f6r + f7r;
    f6i = f6i + f7i;

    f4r = t0r + f6r;
    f4i = t0i + f6i;
    f6r = t0r - f6r;
    f6i = t0i - f6i;

    f7r = f5r - t1i;
    f7i = f5i + t1r;
    f5r = f5r + t1i;
    f5i = f5i - t1r;


    t0r = f0r - f4r;
    t0i = f0i - f4i;
    f0r = f0r + f4r;
    f0i = f0i + f4i;

    t1r = f2r - f6i;
    t1i = f2i + f6r;
    f2r = f2r + f6i;
    f2i = f2i - f6r;

    f4r = f1r - f5r * w0r - f5i * w0r;
    f4i = f1i + f5r * w0r - f5i * w0r;
    f1r = f1r * Two - f4r;
    f1i = f1i * Two - f4i;

    f6r = f3r + f7r * w0r - f7i * w0r;
    f6i = f3i + f7r * w0r + f7i * w0r;
    f3r = f3r * Two - f6r;
    f3i = f3i * Two - f6i;

    /* store result */
    ioptr[0] = f0r;
    ioptr[1] = f0i;
    ioptr[2] = f1r;
    ioptr[3] = f1i;
    ioptr[4] = f2r;
    ioptr[5] = f2i;
    ioptr[6] = f3r;
    ioptr[7] = f3i;
    ioptr[8] = t0r;
    ioptr[9] = t0i;
    ioptr[10] = f4r;
    ioptr[11] = f4i;
    ioptr[12] = t1r;
    ioptr[13] = t1i;
    ioptr[14] = f6r;
    ioptr[15] = f6i;
}

static inline void bfR2(double *ioptr, int M, int NDiffU);
static inline void bfR2(double *ioptr, int M, int NDiffU)
{
    /*** 2nd radix 2 stage ***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int 	NSameU;
    int 	SameUCnt;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;

    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    /* const double Two = 2.0; */

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 4;
    pos = 2;
    posi = pos+1;
    NSameU = POW2(M) / 4 /NDiffU;	// 4 Us at a time
    pstrt = ioptr;
    p0r = pstrt;
    p1r = pstrt+pinc;
    p2r = p1r+pinc;
    p3r = p2r+pinc;

    /* Butterflys		*/
    /*
    f0	-	-	f4
    f1	-  1 -	f5
    f2	-	-	f6
    f3	-  1 -	f7
    */
    /* Butterflys		*/
    /*
    f0	-	-	f4
    f1	-  1 -	f5
    f2	-	-	f6
    f3	-  1 -	f7
    */

    for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--) {

        f0r = *p0r;
        f1r = *p1r;
        f0i = *(p0r + 1);
        f1i = *(p1r + 1);
        f2r = *p2r;
        f3r = *p3r;
        f2i = *(p2r + 1);
        f3i = *(p3r + 1);

        f4r = f0r + f1r;
        f4i = f0i + f1i;
        f5r = f0r - f1r;
        f5i = f0i - f1i;

        f6r = f2r + f3r;
        f6i = f2i + f3i;
        f7r = f2r - f3r;
        f7i = f2i - f3i;

        *p0r = f4r;
        *(p0r + 1) = f4i;
        *p1r = f5r;
        *(p1r + 1) = f5i;
        *p2r = f6r;
        *(p2r + 1) = f6i;
        *p3r = f7r;
        *(p3r + 1) = f7i;

        f0r = *(p0r + pos);
        f1i = *(p1r + posi);
        f0i = *(p0r + posi);
        f1r = *(p1r + pos);
        f2r = *(p2r + pos);
        f3i = *(p3r + posi);
        f2i = *(p2r + posi);
        f3r = *(p3r + pos);

        f4r = f0r + f1i;
        f4i = f0i - f1r;
        f5r = f0r - f1i;
        f5i = f0i + f1r;

        f6r = f2r + f3i;
        f6i = f2i - f3r;
        f7r = f2r - f3i;
        f7i = f2i + f3r;

        *(p0r + pos) = f4r;
        *(p0r + posi) = f4i;
        *(p1r + pos) = f5r;
        *(p1r + posi) = f5i;
        *(p2r + pos) = f6r;
        *(p2r + posi) = f6i;
        *(p3r + pos) = f7r;
        *(p3r + posi) = f7i;

        p0r += pnext;
        p1r += pnext;
        p2r += pnext;
        p3r += pnext;

    }
}

static inline void bfR4(double *ioptr, int M, int NDiffU);
static inline void bfR4(double *ioptr, int M, int NDiffU)
{
    /*** 1 radix 4 stage ***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int	pnexti;
    int 	NSameU;
    int 	SameUCnt;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;

    double w1r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t1r, t1i;
    const double Two = 2.0;

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 4;
    pnexti =  pnext + 1;
    pos = 2;
    posi = pos+1;
    NSameU = POW2(M) / 4 /NDiffU;	// 4 pts per butterfly
    pstrt = ioptr;
    p0r = pstrt;
    p1r = pstrt+pinc;
    p2r = p1r+pinc;
    p3r = p2r+pinc;

    /* Butterflys		*/
    /*
    f0	-	-	f0	-	-	f4
    f1	-  1 -	f5	-	-	f5
    f2	-	-	f6	-  1 -	f6
    f3	-  1 -	f3	- -i -	f7
    */
    /* Butterflys		*/
    /*
    f0	-	-	f4	-	-	f4
    f1	- -i -	t1	-	-	f5
    f2	-	-	f2	- w1 -	f6
    f3	- -i -	f7	- iw1-	f7
    */

    f0r = *p0r;
    f1r = *p1r;
    f2r = *p2r;
    f3r = *p3r;
    f0i = *(p0r + 1);
    f1i = *(p1r + 1);
    f2i = *(p2r + 1);
    f3i = *(p3r + 1);

    f5r = f0r - f1r;
    f5i = f0i - f1i;
    f0r = f0r + f1r;
    f0i = f0i + f1i;

    f6r = f2r + f3r;
    f6i = f2i + f3i;
    f3r = f2r - f3r;
    f3i = f2i - f3i;

    for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--) {

        f7r = f5r - f3i;
        f7i = f5i + f3r;
        f5r = f5r + f3i;
        f5i = f5i - f3r;

        f4r = f0r + f6r;
        f4i = f0i + f6i;
        f6r = f0r - f6r;
        f6i = f0i - f6i;

        f2r = *(p2r + pos);
        f2i = *(p2r + posi);
        f1r = *(p1r + pos);
        f1i = *(p1r + posi);
        f3i = *(p3r + posi);
        f0r = *(p0r + pos);
        f3r = *(p3r + pos);
        f0i = *(p0r + posi);

        *p3r = f7r;
        *p0r = f4r;
        *(p3r + 1) = f7i;
        *(p0r + 1) = f4i;
        *p1r = f5r;
        *p2r = f6r;
        *(p1r + 1) = f5i;
        *(p2r + 1) = f6i;

        f7r = f2r - f3i;
        f7i = f2i + f3r;
        f2r = f2r + f3i;
        f2i = f2i - f3r;

        f4r = f0r + f1i;
        f4i = f0i - f1r;
        t1r = f0r - f1i;
        t1i = f0i + f1r;

        f5r = t1r - f7r * w1r + f7i * w1r;
        f5i = t1i - f7r * w1r - f7i * w1r;
        f7r = t1r * Two - f5r;
        f7i = t1i * Two - f5i;

        f6r = f4r - f2r * w1r - f2i * w1r;
        f6i = f4i + f2r * w1r - f2i * w1r;
        f4r = f4r * Two - f6r;
        f4i = f4i * Two - f6i;

        f3r = *(p3r + pnext);
        f0r = *(p0r + pnext);
        f3i = *(p3r + pnexti);
        f0i = *(p0r + pnexti);
        f2r = *(p2r + pnext);
        f2i = *(p2r + pnexti);
        f1r = *(p1r + pnext);
        f1i = *(p1r + pnexti);

        *(p2r + pos) = f6r;
        *(p1r + pos) = f5r;
        *(p2r + posi) = f6i;
        *(p1r + posi) = f5i;
        *(p3r + pos) = f7r;
        *(p0r + pos) = f4r;
        *(p3r + posi) = f7i;
        *(p0r + posi) = f4i;

        f6r = f2r + f3r;
        f6i = f2i + f3i;
        f3r = f2r - f3r;
        f3i = f2i - f3i;

        f5r = f0r - f1r;
        f5i = f0i - f1i;
        f0r = f0r + f1r;
        f0i = f0i + f1i;

        p3r += pnext;
        p0r += pnext;
        p1r += pnext;
        p2r += pnext;

    }
    f7r = f5r - f3i;
    f7i = f5i + f3r;
    f5r = f5r + f3i;
    f5i = f5i - f3r;

    f4r = f0r + f6r;
    f4i = f0i + f6i;
    f6r = f0r - f6r;
    f6i = f0i - f6i;

    f2r = *(p2r + pos);
    f2i = *(p2r + posi);
    f1r = *(p1r + pos);
    f1i = *(p1r + posi);
    f3i = *(p3r + posi);
    f0r = *(p0r + pos);
    f3r = *(p3r + pos);
    f0i = *(p0r + posi);

    *p3r = f7r;
    *p0r = f4r;
    *(p3r + 1) = f7i;
    *(p0r + 1) = f4i;
    *p1r = f5r;
    *p2r = f6r;
    *(p1r + 1) = f5i;
    *(p2r + 1) = f6i;

    f7r = f2r - f3i;
    f7i = f2i + f3r;
    f2r = f2r + f3i;
    f2i = f2i - f3r;

    f4r = f0r + f1i;
    f4i = f0i - f1r;
    t1r = f0r - f1i;
    t1i = f0i + f1r;

    f5r = t1r - f7r * w1r + f7i * w1r;
    f5i = t1i - f7r * w1r - f7i * w1r;
    f7r = t1r * Two - f5r;
    f7i = t1i * Two - f5i;

    f6r = f4r - f2r * w1r - f2i * w1r;
    f6i = f4i + f2r * w1r - f2i * w1r;
    f4r = f4r * Two - f6r;
    f4i = f4i * Two - f6i;

    *(p2r + pos) = f6r;
    *(p1r + pos) = f5r;
    *(p2r + posi) = f6i;
    *(p1r + posi) = f5i;
    *(p3r + pos) = f7r;
    *(p0r + pos) = f4r;
    *(p3r + posi) = f7i;
    *(p0r + posi) = f4i;

}

static inline void bfstages(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt);
static inline void bfstages(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt)
{
    /***   RADIX 8 Stages	***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int 	NSameU;
    int 	Uinc;
    int 	Uinc2;
    int 	Uinc4;
    int 	DiffUCnt;
    int 	SameUCnt;
    int 	U2toU3;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;
    double 	*u0r, *u0i, *u1r, *u1i, *u2r, *u2i;

    double w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 8;
    pos = pinc * 4;
    posi =  pos + 1;
    NSameU = POW2(M) / 8 /NDiffU;	// 8 pts per butterfly
    Uinc = NSameU * Ustride;
    Uinc2 = Uinc * 2;
    Uinc4 = Uinc * 4;
    U2toU3 = (POW2(M) / 8)*Ustride;
    for (; StageCnt > 0 ; StageCnt--) {

        u0r = &Utbl[0];
        u0i = &Utbl[POW2(M-2)*Ustride];
        u1r = u0r;
        u1i = u0i;
        u2r = u0r;
        u2i = u0i;

        w0r =  *u0r;
        w0i =  *u0i;
        w1r =  *u1r;
        w1i =  *u1i;
        w2r =  *u2r;
        w2i =  *u2i;
        w3r =  *(u2r+U2toU3);
        w3i =  *(u2i-U2toU3);

        pstrt = ioptr;

        p0r = pstrt;
        p1r = pstrt+pinc;
        p2r = p1r+pinc;
        p3r = p2r+pinc;

        /* Butterflys		*/
        /*
        f0	-	-	t0	-	-	f0	-	-	f0
        f1	- w0-	f1	-	-	f1	-	-	f1
        f2	-	-	f2	- w1-	f2	-	-	f4
        f3	- w0-	t1	- iw1-	f3	-	-	f5

        f4	-	-	t0	-	-	f4	- w2-	t0
        f5	- w0-	f5	-	-	f5	- w3-	t1
        f6	-	-	f6	- w1-	f6	- iw2-	f6
        f7	- w0-	t1	- iw1-	f7	- iw3-	f7
        */

        for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--) {
            f0r = *p0r;
            f0i = *(p0r + 1);
            f1r = *p1r;
            f1i = *(p1r + 1);
            for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--) {
                f2r = *p2r;
                f2i = *(p2r + 1);
                f3r = *p3r;
                f3i = *(p3r + 1);

                t0r = f0r + f1r * w0r + f1i * w0i;
                t0i = f0i - f1r * w0i + f1i * w0r;
                f1r = f0r * Two - t0r;
                f1i = f0i * Two - t0i;

                f4r = *(p0r + pos);
                f4i = *(p0r + posi);
                f5r = *(p1r + pos);
                f5i = *(p1r + posi);

                f6r = *(p2r + pos);
                f6i = *(p2r + posi);
                f7r = *(p3r + pos);
                f7i = *(p3r + posi);

                t1r = f2r - f3r * w0r - f3i * w0i;
                t1i = f2i + f3r * w0i - f3i * w0r;
                f2r = f2r * Two - t1r;
                f2i = f2i * Two - t1i;

                f0r = t0r + f2r * w1r + f2i * w1i;
                f0i = t0i - f2r * w1i + f2i * w1r;
                f2r = t0r * Two - f0r;
                f2i = t0i * Two - f0i;

                f3r = f1r + t1r * w1i - t1i * w1r;
                f3i = f1i + t1r * w1r + t1i * w1i;
                f1r = f1r * Two - f3r;
                f1i = f1i * Two - f3i;


                t0r = f4r + f5r * w0r + f5i * w0i;
                t0i = f4i - f5r * w0i + f5i * w0r;
                f5r = f4r * Two - t0r;
                f5i = f4i * Two - t0i;

                t1r = f6r - f7r * w0r - f7i * w0i;
                t1i = f6i + f7r * w0i - f7i * w0r;
                f6r = f6r * Two - t1r;
                f6i = f6i * Two - t1i;

                f4r = t0r + f6r * w1r + f6i * w1i;
                f4i = t0i - f6r * w1i + f6i * w1r;
                f6r = t0r * Two - f4r;
                f6i = t0i * Two - f4i;

                f7r = f5r + t1r * w1i - t1i * w1r;
                f7i = f5i + t1r * w1r + t1i * w1i;
                f5r = f5r * Two - f7r;
                f5i = f5i * Two - f7i;

                t0r = f0r - f4r * w2r - f4i * w2i;
                t0i = f0i + f4r * w2i - f4i * w2r;
                f0r = f0r * Two - t0r;
                f0i = f0i * Two - t0i;

                t1r = f1r - f5r * w3r - f5i * w3i;
                t1i = f1i + f5r * w3i - f5i * w3r;
                f1r = f1r * Two - t1r;
                f1i = f1i * Two - t1i;

                *(p0r + pos) = t0r;
                *(p1r + pos) = t1r;
                *(p0r + posi) = t0i;
                *(p1r + posi) = t1i;
                *p0r = f0r;
                *p1r = f1r;
                *(p0r + 1) = f0i;
                *(p1r + 1) = f1i;

                p0r += pnext;
                f0r = *p0r;
                f0i = *(p0r + 1);

                p1r += pnext;

                f1r = *p1r;
                f1i = *(p1r + 1);

                f4r = f2r - f6r * w2i + f6i * w2r;
                f4i = f2i - f6r * w2r - f6i * w2i;
                f6r = f2r * Two - f4r;
                f6i = f2i * Two - f4i;

                f5r = f3r - f7r * w3i + f7i * w3r;
                f5i = f3i - f7r * w3r - f7i * w3i;
                f7r = f3r * Two - f5r;
                f7i = f3i * Two - f5i;

                *p2r = f4r;
                *p3r = f5r;
                *(p2r + 1) = f4i;
                *(p3r + 1) = f5i;
                *(p2r + pos) = f6r;
                *(p3r + pos) = f7r;
                *(p2r + posi) = f6i;
                *(p3r + posi) = f7i;

                p2r += pnext;
                p3r += pnext;

            }

            f2r = *p2r;
            f2i = *(p2r + 1);
            f3r = *p3r;
            f3i = *(p3r + 1);

            t0r = f0r + f1r * w0r + f1i * w0i;
            t0i = f0i - f1r * w0i + f1i * w0r;
            f1r = f0r * Two - t0r;
            f1i = f0i * Two - t0i;

            f4r = *(p0r + pos);
            f4i = *(p0r + posi);
            f5r = *(p1r + pos);
            f5i = *(p1r + posi);

            f6r = *(p2r + pos);
            f6i = *(p2r + posi);
            f7r = *(p3r + pos);
            f7i = *(p3r + posi);

            t1r = f2r - f3r * w0r - f3i * w0i;
            t1i = f2i + f3r * w0i - f3i * w0r;
            f2r = f2r * Two - t1r;
            f2i = f2i * Two - t1i;

            f0r = t0r + f2r * w1r + f2i * w1i;
            f0i = t0i - f2r * w1i + f2i * w1r;
            f2r = t0r * Two - f0r;
            f2i = t0i * Two - f0i;

            f3r = f1r + t1r * w1i - t1i * w1r;
            f3i = f1i + t1r * w1r + t1i * w1i;
            f1r = f1r * Two - f3r;
            f1i = f1i * Two - f3i;

            if (DiffUCnt == NDiffU/2)
                Uinc4 = -Uinc4;

            u0r += Uinc4;
            u0i -= Uinc4;
            u1r += Uinc2;
            u1i -= Uinc2;
            u2r += Uinc;
            u2i -= Uinc;

            pstrt += 2;

            t0r = f4r + f5r * w0r + f5i * w0i;
            t0i = f4i - f5r * w0i + f5i * w0r;
            f5r = f4r * Two - t0r;
            f5i = f4i * Two - t0i;

            t1r = f6r - f7r * w0r - f7i * w0i;
            t1i = f6i + f7r * w0i - f7i * w0r;
            f6r = f6r * Two - t1r;
            f6i = f6i * Two - t1i;

            f4r = t0r + f6r * w1r + f6i * w1i;
            f4i = t0i - f6r * w1i + f6i * w1r;
            f6r = t0r * Two - f4r;
            f6i = t0i * Two - f4i;

            f7r = f5r + t1r * w1i - t1i * w1r;
            f7i = f5i + t1r * w1r + t1i * w1i;
            f5r = f5r * Two - f7r;
            f5i = f5i * Two - f7i;

            w0r = *u0r;
            w0i =  *u0i;
            w1r =  *u1r;
            w1i =  *u1i;

            if (DiffUCnt <= NDiffU/2)
                w0r = -w0r;

            t0r = f0r - f4r * w2r - f4i * w2i;
            t0i = f0i + f4r * w2i - f4i * w2r;
            f0r = f0r * Two - t0r;
            f0i = f0i * Two - t0i;

            f4r = f2r - f6r * w2i + f6i * w2r;
            f4i = f2i - f6r * w2r - f6i * w2i;
            f6r = f2r * Two - f4r;
            f6i = f2i * Two - f4i;

            *(p0r + pos) = t0r;
            *p2r = f4r;
            *(p0r + posi) = t0i;
            *(p2r + 1) = f4i;
            w2r =  *u2r;
            w2i =  *u2i;
            *p0r = f0r;
            *(p2r + pos) = f6r;
            *(p0r + 1) = f0i;
            *(p2r + posi) = f6i;

            p0r = pstrt;
            p2r = pstrt + pinc + pinc;

            t1r = f1r - f5r * w3r - f5i * w3i;
            t1i = f1i + f5r * w3i - f5i * w3r;
            f1r = f1r * Two - t1r;
            f1i = f1i * Two - t1i;

            f5r = f3r - f7r * w3i + f7i * w3r;
            f5i = f3i - f7r * w3r - f7i * w3i;
            f7r = f3r * Two - f5r;
            f7i = f3i * Two - f5i;

            *(p1r + pos) = t1r;
            *p3r = f5r;
            *(p1r + posi) = t1i;
            *(p3r + 1) = f5i;
            w3r =  *(u2r+U2toU3);
            w3i =  *(u2i-U2toU3);
            *p1r = f1r;
            *(p3r + pos) = f7r;
            *(p1r + 1) = f1i;
            *(p3r + posi) = f7i;

            p1r = pstrt + pinc;
            p3r = p2r + pinc;
        }
        NSameU /= 8;
        Uinc /= 8;
        Uinc2 /= 8;
        Uinc4 = Uinc * 4;
        NDiffU *= 8;
        pinc *= 8;
        pnext *= 8;
        pos *= 8;
        posi =  pos + 1;
    }
}

void fftrecurs(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt);
void fftrecurs(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt)
{
    /* recursive bfstages calls to maximize on chip cache efficiency */
    int i1;
    if (M <= MCACHE)  // fits on chip ?
        bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
    else {
        for (i1=0; i1<8; i1++) {
            fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1);	/*  RADIX 8 Stages	*/
        }
        bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1);	/*  RADIX 8 Stage	*/
    }
}

void ffts1(double *ioptr, int M, int Rows, double *Utbl, short *BRLow)
{
    /* Compute in-place complex fft on the rows of the input array	*/
    /* INPUTS */
    /* *ioptr = input data array	*/
    /* M = log2 of fft size	(ex M=10 for 1024 point fft) */
    /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
    /* *Utbl = cosine table	*/
    /* *BRLow = bit reversed counter table	*/
    /* OUTPUTS */
    /* *ioptr = output data array	*/

    int 	StageCnt;
    int 	NDiffU;

    switch (M) {
    case 0:
        break;
    case 1:
        for (; Rows>0; Rows--) {
            fft2pt(ioptr);				/* a 2 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 2:
        for (; Rows>0; Rows--) {
            fft4pt(ioptr);				/* a 4 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 3:
        for (; Rows>0; Rows--) {
            fft8pt(ioptr);				/* an 8 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    default:
        for (; Rows>0; Rows--) {

            bitrevR2(ioptr, M, BRLow);	/* bit reverse and first radix 2 stage */

            StageCnt = (M-1) / 3;		// number of radix 8 stages
            NDiffU = 2;				// one radix 2 stage already complete

            if ( (M-1-(StageCnt * 3)) == 1 ) {
                bfR2(ioptr, M, NDiffU);				/* 1 radix 2 stage */
                NDiffU *= 2;
            }

            if ( (M-1-(StageCnt * 3)) == 2 ) {
                bfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
                NDiffU *= 4;
            }

            if (M <= MCACHE)
                bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/

            else {
                fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
            }

            ioptr += 2*POW2(M);
        }
    }
}

/************************************************
parts of iffts1
*************************************************/

static inline void scbitrevR2(double *ioptr, int M, short *BRLow, double scale);
static inline void scbitrevR2(double *ioptr, int M, short *BRLow, double scale)
{
    /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
    double	f0r;
    double	f0i;
    double	f1r;
    double	f1i;
    double	f2r;
    double	f2i;
    double	f3r;
    double	f3i;
    double	f4r;
    double	f4i;
    double	f5r;
    double	f5i;
    double	f6r;
    double	f6i;
    double	f7r;
    double	f7i;
    double	t0r;
    double	t0i;
    double	t1r;
    double	t1i;
    double	*p0r;
    double	*p1r;
    double	*IOP;
    double 	*iolimit;
    int 	Colstart;
    int 	iCol;
    int 	posA;
    int 	posAi;
    int 	posB;
    int 	posBi;

    const int Nrems2 = POW2((M+3)/2);
    const int Nroot_1_ColInc = POW2(M)-Nrems2;
    const int Nroot_1 = POW2(M/2-1)-1;
    const int ColstartShift = (M+1)/2 +1;
    posA = POW2(M);		// 1/2 of POW2(M) complexes
    posAi = posA + 1;
    posB = posA + 2;
    posBi = posB + 1;

    iolimit = ioptr + Nrems2;
    const size_t ioptr_inc = (size_t) POW2((M / 2 + 1));
    for (; ioptr < iolimit; ioptr += ioptr_inc) {
        for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
            iCol = Nroot_1;
            p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
            IOP = ioptr + (Colstart << ColstartShift);
            p1r = IOP + BRLow[iCol]*4;
            f0r = *(p0r);
            f0i = *(p0r+1);
            f1r = *(p0r+posA);
            f1i = *(p0r+posAi);
            for (; iCol > Colstart;) {
                f2r = *(p0r+2);
                f2i = *(p0r+(2+1));
                f3r = *(p0r+posB);
                f3i = *(p0r+posBi);
                f4r = *(p1r);
                f4i = *(p1r+1);
                f5r = *(p1r+posA);
                f5i = *(p1r+posAi);
                f6r = *(p1r+2);
                f6i = *(p1r+(2+1));
                f7r = *(p1r+posB);
                f7i = *(p1r+posBi);

                t0r	= f0r + f1r;
                t0i	= f0i + f1i;
                f1r = f0r - f1r;
                f1i = f0i - f1i;
                t1r = f2r + f3r;
                t1i = f2i + f3i;
                f3r = f2r - f3r;
                f3i = f2i - f3i;
                f0r = f4r + f5r;
                f0i = f4i + f5i;
                f5r = f4r - f5r;
                f5i = f4i - f5i;
                f2r = f6r + f7r;
                f2i = f6i + f7i;
                f7r = f6r - f7r;
                f7i = f6i - f7i;

                *(p1r) = scale*t0r;
                *(p1r+1) = scale*t0i;
                *(p1r+2) = scale*f1r;
                *(p1r+(2+1)) = scale*f1i;
                *(p1r+posA) = scale*t1r;
                *(p1r+posAi) = scale*t1i;
                *(p1r+posB) = scale*f3r;
                *(p1r+posBi) = scale*f3i;
                *(p0r) = scale*f0r;
                *(p0r+1) = scale*f0i;
                *(p0r+2) = scale*f5r;
                *(p0r+(2+1)) = scale*f5i;
                *(p0r+posA) = scale*f2r;
                *(p0r+posAi) = scale*f2i;
                *(p0r+posB) = scale*f7r;
                *(p0r+posBi) = scale*f7i;

                p0r -= Nrems2;
                f0r = *(p0r);
                f0i = *(p0r+1);
                f1r = *(p0r+posA);
                f1i = *(p0r+posAi);
                iCol -= 1;
                p1r = IOP + BRLow[iCol]*4;
            }
            f2r = *(p0r+2);
            f2i = *(p0r+(2+1));
            f3r = *(p0r+posB);
            f3i = *(p0r+posBi);

            t0r   = f0r + f1r;
            t0i   = f0i + f1i;
            f1r = f0r - f1r;
            f1i = f0i - f1i;
            t1r   = f2r + f3r;
            t1i   = f2i + f3i;
            f3r = f2r - f3r;
            f3i = f2i - f3i;

            *(p0r) = scale*t0r;
            *(p0r+1) = scale*t0i;
            *(p0r+2) = scale*f1r;
            *(p0r+(2+1)) = scale*f1i;
            *(p0r+posA) = scale*t1r;
            *(p0r+posAi) = scale*t1i;
            *(p0r+posB) = scale*f3r;
            *(p0r+posBi) = scale*f3i;

        }
    }
}

static inline void ifft2pt(double *ioptr, double scale);
static inline void ifft2pt(double *ioptr, double scale)
{
    /***   RADIX 2 ifft	***/
    double f0r, f0i, f1r, f1i;
    double t0r, t0i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[2];
    f1i = ioptr[3];

    /* Butterflys		*/
    /*
    f0	-	-	t0
    f1	-  1 -	f1
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    /* store result */
    ioptr[0] = scale*t0r;
    ioptr[1] = scale*t0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
}

static inline void ifft4pt(double *ioptr, double scale);
static inline void ifft4pt(double *ioptr, double scale)
{
    /***   RADIX 4 ifft	***/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double t0r, t0i, t1r, t1i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[4];
    f1i = ioptr[5];
    f2r = ioptr[2];
    f2i = ioptr[3];
    f3r = ioptr[6];
    f3i = ioptr[7];

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0
    f1	-  1 -	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2
    f3	-  1 -	t1	-  i -	f3
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r + t1i;
    f3i = f1i - t1r;
    f1r = f1r - t1i;
    f1i = f1i + t1r;

    /* store result */
    ioptr[0] = scale*f0r;
    ioptr[1] = scale*f0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[4] = scale*f2r;
    ioptr[5] = scale*f2i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
}

static inline void ifft8pt(double *ioptr, double scale);
static inline void ifft8pt(double *ioptr, double scale)
{
    /***   RADIX 8 ifft	***/
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[8];
    f1i = ioptr[9];
    f2r = ioptr[4];
    f2i = ioptr[5];
    f3r = ioptr[12];
    f3i = ioptr[13];
    f4r = ioptr[2];
    f4i = ioptr[3];
    f5r = ioptr[10];
    f5i = ioptr[11];
    f6r = ioptr[6];
    f6i = ioptr[7];
    f7r = ioptr[14];
    f7i = ioptr[15];

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0	-	-	f0
    f1	-  1 -	f1	-	-	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2	-	-	f2
    f3	-  1 -	t1	-  i -	f3	-	-	f3
    f4	-	-	t0	-	-	f4	-  1 -	t0
    f5	-  1 -	f5	-	-	f5	- w3 -	f4
    f6	-	-	f6	-  1 -	f6	-  i -	t1
    f7	-  1 -	t1	-  i -	f7	- iw3-	f6
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r + t1i;
    f3i = f1i - t1r;
    f1r = f1r - t1i;
    f1i = f1i + t1r;


    t0r = f4r + f5r;
    t0i = f4i + f5i;
    f5r = f4r - f5r;
    f5i = f4i - f5i;

    t1r = f6r - f7r;
    t1i = f6i - f7i;
    f6r = f6r + f7r;
    f6i = f6i + f7i;

    f4r = t0r + f6r;
    f4i = t0i + f6i;
    f6r = t0r - f6r;
    f6i = t0i - f6i;

    f7r = f5r + t1i;
    f7i = f5i - t1r;
    f5r = f5r - t1i;
    f5i = f5i + t1r;


    t0r = f0r - f4r;
    t0i = f0i - f4i;
    f0r = f0r + f4r;
    f0i = f0i + f4i;

    t1r = f2r + f6i;
    t1i = f2i - f6r;
    f2r = f2r - f6i;
    f2i = f2i + f6r;

    f4r = f1r - f5r * w0r + f5i * w0r;
    f4i = f1i - f5r * w0r - f5i * w0r;
    f1r = f1r * Two - f4r;
    f1i = f1i * Two - f4i;

    f6r = f3r + f7r * w0r + f7i * w0r;
    f6i = f3i - f7r * w0r + f7i * w0r;
    f3r = f3r * Two - f6r;
    f3i = f3i * Two - f6i;

    /* store result */
    ioptr[0] = scale*f0r;
    ioptr[1] = scale*f0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[4] = scale*f2r;
    ioptr[5] = scale*f2i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
    ioptr[8] = scale*t0r;
    ioptr[9] = scale*t0i;
    ioptr[10] = scale*f4r;
    ioptr[11] = scale*f4i;
    ioptr[12] = scale*t1r;
    ioptr[13] = scale*t1i;
    ioptr[14] = scale*f6r;
    ioptr[15] = scale*f6i;
}

static inline void ibfR2(double *ioptr, int M, int NDiffU);
static inline void ibfR2(double *ioptr, int M, int NDiffU)
{
    /*** 2nd radix 2 stage ***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int 	NSameU;
    int 	SameUCnt;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;

    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    /* const double Two = 2.0; */

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 4;
    pos = 2;
    posi = pos+1;
    NSameU = POW2(M) / 4 /NDiffU;	// 4 Us at a time
    pstrt = ioptr;
    p0r = pstrt;
    p1r = pstrt+pinc;
    p2r = p1r+pinc;
    p3r = p2r+pinc;

    /* Butterflys		*/
    /*
    f0	-	-	f4
    f1	-  1 -	f5
    f2	-	-	f6
    f3	-  1 -	f7
    */
    /* Butterflys		*/
    /*
    f0	-	-	f4
    f1	-  1 -	f5
    f2	-	-	f6
    f3	-  1 -	f7
    */

    for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--) {

        f0r = *p0r;
        f1r = *p1r;
        f0i = *(p0r + 1);
        f1i = *(p1r + 1);
        f2r = *p2r;
        f3r = *p3r;
        f2i = *(p2r + 1);
        f3i = *(p3r + 1);

        f4r = f0r + f1r;
        f4i = f0i + f1i;
        f5r = f0r - f1r;
        f5i = f0i - f1i;

        f6r = f2r + f3r;
        f6i = f2i + f3i;
        f7r = f2r - f3r;
        f7i = f2i - f3i;

        *p0r = f4r;
        *(p0r + 1) = f4i;
        *p1r = f5r;
        *(p1r + 1) = f5i;
        *p2r = f6r;
        *(p2r + 1) = f6i;
        *p3r = f7r;
        *(p3r + 1) = f7i;

        f0r = *(p0r + pos);
        f1i = *(p1r + posi);
        f0i = *(p0r + posi);
        f1r = *(p1r + pos);
        f2r = *(p2r + pos);
        f3i = *(p3r + posi);
        f2i = *(p2r + posi);
        f3r = *(p3r + pos);

        f4r = f0r - f1i;
        f4i = f0i + f1r;
        f5r = f0r + f1i;
        f5i = f0i - f1r;

        f6r = f2r - f3i;
        f6i = f2i + f3r;
        f7r = f2r + f3i;
        f7i = f2i - f3r;

        *(p0r + pos) = f4r;
        *(p0r + posi) = f4i;
        *(p1r + pos) = f5r;
        *(p1r + posi) = f5i;
        *(p2r + pos) = f6r;
        *(p2r + posi) = f6i;
        *(p3r + pos) = f7r;
        *(p3r + posi) = f7i;

        p0r += pnext;
        p1r += pnext;
        p2r += pnext;
        p3r += pnext;

    }
}

static inline void ibfR4(double *ioptr, int M, int NDiffU);
static inline void ibfR4(double *ioptr, int M, int NDiffU)
{
    /*** 1 radix 4 stage ***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int	pnexti;
    int 	NSameU;
    int 	SameUCnt;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;

    double w1r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t1r, t1i;
    const double Two = 2.0;

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 4;
    pnexti =  pnext + 1;
    pos = 2;
    posi = pos+1;
    NSameU = POW2(M) / 4 /NDiffU;	// 4 pts per butterfly
    pstrt = ioptr;
    p0r = pstrt;
    p1r = pstrt+pinc;
    p2r = p1r+pinc;
    p3r = p2r+pinc;

    /* Butterflys		*/
    /*
    f0	-	-	f0	-	-	f4
    f1	-  1 -	f5	-	-	f5
    f2	-	-	f6	-  1 -	f6
    f3	-  1 -	f3	- -i -	f7
    */
    /* Butterflys		*/
    /*
    f0	-	-	f4	-	-	f4
    f1	- -i -	t1	-	-	f5
    f2	-	-	f2	- w1 -	f6
    f3	- -i -	f7	- iw1-	f7
    */

    f0r = *p0r;
    f1r = *p1r;
    f2r = *p2r;
    f3r = *p3r;
    f0i = *(p0r + 1);
    f1i = *(p1r + 1);
    f2i = *(p2r + 1);
    f3i = *(p3r + 1);

    f5r = f0r - f1r;
    f5i = f0i - f1i;
    f0r = f0r + f1r;
    f0i = f0i + f1i;

    f6r = f2r + f3r;
    f6i = f2i + f3i;
    f3r = f2r - f3r;
    f3i = f2i - f3i;

    for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--) {

        f7r = f5r + f3i;
        f7i = f5i - f3r;
        f5r = f5r - f3i;
        f5i = f5i + f3r;

        f4r = f0r + f6r;
        f4i = f0i + f6i;
        f6r = f0r - f6r;
        f6i = f0i - f6i;

        f2r = *(p2r + pos);
        f2i = *(p2r + posi);
        f1r = *(p1r + pos);
        f1i = *(p1r + posi);
        f3i = *(p3r + posi);
        f0r = *(p0r + pos);
        f3r = *(p3r + pos);
        f0i = *(p0r + posi);

        *p3r = f7r;
        *p0r = f4r;
        *(p3r + 1) = f7i;
        *(p0r + 1) = f4i;
        *p1r = f5r;
        *p2r = f6r;
        *(p1r + 1) = f5i;
        *(p2r + 1) = f6i;

        f7r = f2r + f3i;
        f7i = f2i - f3r;
        f2r = f2r - f3i;
        f2i = f2i + f3r;

        f4r = f0r - f1i;
        f4i = f0i + f1r;
        t1r = f0r + f1i;
        t1i = f0i - f1r;

        f5r = t1r - f7r * w1r - f7i * w1r;
        f5i = t1i + f7r * w1r - f7i * w1r;
        f7r = t1r * Two - f5r;
        f7i = t1i * Two - f5i;

        f6r = f4r - f2r * w1r + f2i * w1r;
        f6i = f4i - f2r * w1r - f2i * w1r;
        f4r = f4r * Two - f6r;
        f4i = f4i * Two - f6i;

        f3r = *(p3r + pnext);
        f0r = *(p0r + pnext);
        f3i = *(p3r + pnexti);
        f0i = *(p0r + pnexti);
        f2r = *(p2r + pnext);
        f2i = *(p2r + pnexti);
        f1r = *(p1r + pnext);
        f1i = *(p1r + pnexti);

        *(p2r + pos) = f6r;
        *(p1r + pos) = f5r;
        *(p2r + posi) = f6i;
        *(p1r + posi) = f5i;
        *(p3r + pos) = f7r;
        *(p0r + pos) = f4r;
        *(p3r + posi) = f7i;
        *(p0r + posi) = f4i;

        f6r = f2r + f3r;
        f6i = f2i + f3i;
        f3r = f2r - f3r;
        f3i = f2i - f3i;

        f5r = f0r - f1r;
        f5i = f0i - f1i;
        f0r = f0r + f1r;
        f0i = f0i + f1i;

        p3r += pnext;
        p0r += pnext;
        p1r += pnext;
        p2r += pnext;

    }
    f7r = f5r + f3i;
    f7i = f5i - f3r;
    f5r = f5r - f3i;
    f5i = f5i + f3r;

    f4r = f0r + f6r;
    f4i = f0i + f6i;
    f6r = f0r - f6r;
    f6i = f0i - f6i;

    f2r = *(p2r + pos);
    f2i = *(p2r + posi);
    f1r = *(p1r + pos);
    f1i = *(p1r + posi);
    f3i = *(p3r + posi);
    f0r = *(p0r + pos);
    f3r = *(p3r + pos);
    f0i = *(p0r + posi);

    *p3r = f7r;
    *p0r = f4r;
    *(p3r + 1) = f7i;
    *(p0r + 1) = f4i;
    *p1r = f5r;
    *p2r = f6r;
    *(p1r + 1) = f5i;
    *(p2r + 1) = f6i;

    f7r = f2r + f3i;
    f7i = f2i - f3r;
    f2r = f2r - f3i;
    f2i = f2i + f3r;

    f4r = f0r - f1i;
    f4i = f0i + f1r;
    t1r = f0r + f1i;
    t1i = f0i - f1r;

    f5r = t1r - f7r * w1r - f7i * w1r;
    f5i = t1i + f7r * w1r - f7i * w1r;
    f7r = t1r * Two - f5r;
    f7i = t1i * Two - f5i;

    f6r = f4r - f2r * w1r + f2i * w1r;
    f6i = f4i - f2r * w1r - f2i * w1r;
    f4r = f4r * Two - f6r;
    f4i = f4i * Two - f6i;

    *(p2r + pos) = f6r;
    *(p1r + pos) = f5r;
    *(p2r + posi) = f6i;
    *(p1r + posi) = f5i;
    *(p3r + pos) = f7r;
    *(p0r + pos) = f4r;
    *(p3r + posi) = f7i;
    *(p0r + posi) = f4i;

}

static inline void ibfstages(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt);
static inline void ibfstages(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt)
{
    /***   RADIX 8 Stages	***/
    int	pos;
    int	posi;
    int	pinc;
    int	pnext;
    int 	NSameU;
    int 	Uinc;
    int 	Uinc2;
    int 	Uinc4;
    int 	DiffUCnt;
    int 	SameUCnt;
    int 	U2toU3;

    double	*pstrt;
    double 	*p0r, *p1r, *p2r, *p3r;
    double 	*u0r, *u0i, *u1r, *u1i, *u2r, *u2i;

    double w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    pinc = NDiffU * 2;		// 2 doubles per complex
    pnext =  pinc * 8;
    pos = pinc * 4;
    posi =  pos + 1;
    NSameU = POW2(M) / 8 /NDiffU;	// 8 pts per butterfly
    Uinc = NSameU * Ustride;
    Uinc2 = Uinc * 2;
    Uinc4 = Uinc * 4;
    U2toU3 = (POW2(M) / 8)*Ustride;
    for (; StageCnt > 0 ; StageCnt--) {

        u0r = &Utbl[0];
        u0i = &Utbl[POW2(M-2)*Ustride];
        u1r = u0r;
        u1i = u0i;
        u2r = u0r;
        u2i = u0i;

        w0r =  *u0r;
        w0i =  *u0i;
        w1r =  *u1r;
        w1i =  *u1i;
        w2r =  *u2r;
        w2i =  *u2i;
        w3r =  *(u2r+U2toU3);
        w3i =  *(u2i-U2toU3);

        pstrt = ioptr;

        p0r = pstrt;
        p1r = pstrt+pinc;
        p2r = p1r+pinc;
        p3r = p2r+pinc;

        /* Butterflys		*/
        /*
        f0	-	-	t0	-	-	f0	-	-	f0
        f1	- w0-	f1	-	-	f1	-	-	f1
        f2	-	-	f2	- w1-	f2	-	-	f4
        f3	- w0-	t1	- iw1-	f3	-	-	f5

        f4	-	-	t0	-	-	f4	- w2-	t0
        f5	- w0-	f5	-	-	f5	- w3-	t1
        f6	-	-	f6	- w1-	f6	- iw2-	f6
        f7	- w0-	t1	- iw1-	f7	- iw3-	f7
        */

        for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--) {
            f0r = *p0r;
            f0i = *(p0r + 1);
            f1r = *p1r;
            f1i = *(p1r + 1);
            for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--) {
                f2r = *p2r;
                f2i = *(p2r + 1);
                f3r = *p3r;
                f3i = *(p3r + 1);

                t0r = f0r + f1r * w0r - f1i * w0i;
                t0i = f0i + f1r * w0i + f1i * w0r;
                f1r = f0r * Two - t0r;
                f1i = f0i * Two - t0i;

                f4r = *(p0r + pos);
                f4i = *(p0r + posi);
                f5r = *(p1r + pos);
                f5i = *(p1r + posi);

                f6r = *(p2r + pos);
                f6i = *(p2r + posi);
                f7r = *(p3r + pos);
                f7i = *(p3r + posi);

                t1r = f2r - f3r * w0r + f3i * w0i;
                t1i = f2i - f3r * w0i - f3i * w0r;
                f2r = f2r * Two - t1r;
                f2i = f2i * Two - t1i;

                f0r = t0r + f2r * w1r - f2i * w1i;
                f0i = t0i + f2r * w1i + f2i * w1r;
                f2r = t0r * Two - f0r;
                f2i = t0i * Two - f0i;

                f3r = f1r + t1r * w1i + t1i * w1r;
                f3i = f1i - t1r * w1r + t1i * w1i;
                f1r = f1r * Two - f3r;
                f1i = f1i * Two - f3i;


                t0r = f4r + f5r * w0r - f5i * w0i;
                t0i = f4i + f5r * w0i + f5i * w0r;
                f5r = f4r * Two - t0r;
                f5i = f4i * Two - t0i;

                t1r = f6r - f7r * w0r + f7i * w0i;
                t1i = f6i - f7r * w0i - f7i * w0r;
                f6r = f6r * Two - t1r;
                f6i = f6i * Two - t1i;

                f4r = t0r + f6r * w1r - f6i * w1i;
                f4i = t0i + f6r * w1i + f6i * w1r;
                f6r = t0r * Two - f4r;
                f6i = t0i * Two - f4i;

                f7r = f5r + t1r * w1i + t1i * w1r;
                f7i = f5i - t1r * w1r + t1i * w1i;
                f5r = f5r * Two - f7r;
                f5i = f5i * Two - f7i;

                t0r = f0r - f4r * w2r + f4i * w2i;
                t0i = f0i - f4r * w2i - f4i * w2r;
                f0r = f0r * Two - t0r;
                f0i = f0i * Two - t0i;

                t1r = f1r - f5r * w3r + f5i * w3i;
                t1i = f1i - f5r * w3i - f5i * w3r;
                f1r = f1r * Two - t1r;
                f1i = f1i * Two - t1i;

                *(p0r + pos) = t0r;
                *(p0r + posi) = t0i;
                *p0r = f0r;
                *(p0r + 1) = f0i;

                p0r += pnext;
                f0r = *p0r;
                f0i = *(p0r + 1);

                *(p1r + pos) = t1r;
                *(p1r + posi) = t1i;
                *p1r = f1r;
                *(p1r + 1) = f1i;

                p1r += pnext;

                f1r = *p1r;
                f1i = *(p1r + 1);

                f4r = f2r - f6r * w2i - f6i * w2r;
                f4i = f2i + f6r * w2r - f6i * w2i;
                f6r = f2r * Two - f4r;
                f6i = f2i * Two - f4i;

                f5r = f3r - f7r * w3i - f7i * w3r;
                f5i = f3i + f7r * w3r - f7i * w3i;
                f7r = f3r * Two - f5r;
                f7i = f3i * Two - f5i;

                *p2r = f4r;
                *(p2r + 1) = f4i;
                *(p2r + pos) = f6r;
                *(p2r + posi) = f6i;

                p2r += pnext;

                *p3r = f5r;
                *(p3r + 1) = f5i;
                *(p3r + pos) = f7r;
                *(p3r + posi) = f7i;

                p3r += pnext;

            }

            f2r = *p2r;
            f2i = *(p2r + 1);
            f3r = *p3r;
            f3i = *(p3r + 1);

            t0r = f0r + f1r * w0r - f1i * w0i;
            t0i = f0i + f1r * w0i + f1i * w0r;
            f1r = f0r * Two - t0r;
            f1i = f0i * Two - t0i;

            f4r = *(p0r + pos);
            f4i = *(p0r + posi);
            f5r = *(p1r + pos);
            f5i = *(p1r + posi);

            f6r = *(p2r + pos);
            f6i = *(p2r + posi);
            f7r = *(p3r + pos);
            f7i = *(p3r + posi);

            t1r = f2r - f3r * w0r + f3i * w0i;
            t1i = f2i - f3r * w0i - f3i * w0r;
            f2r = f2r * Two - t1r;
            f2i = f2i * Two - t1i;

            f0r = t0r + f2r * w1r - f2i * w1i;
            f0i = t0i + f2r * w1i + f2i * w1r;
            f2r = t0r * Two - f0r;
            f2i = t0i * Two - f0i;

            f3r = f1r + t1r * w1i + t1i * w1r;
            f3i = f1i - t1r * w1r + t1i * w1i;
            f1r = f1r * Two - f3r;
            f1i = f1i * Two - f3i;

            if (DiffUCnt == NDiffU/2)
                Uinc4 = -Uinc4;

            u0r += Uinc4;
            u0i -= Uinc4;
            u1r += Uinc2;
            u1i -= Uinc2;
            u2r += Uinc;
            u2i -= Uinc;

            pstrt += 2;

            t0r = f4r + f5r * w0r - f5i * w0i;
            t0i = f4i + f5r * w0i + f5i * w0r;
            f5r = f4r * Two - t0r;
            f5i = f4i * Two - t0i;

            t1r = f6r - f7r * w0r + f7i * w0i;
            t1i = f6i - f7r * w0i - f7i * w0r;
            f6r = f6r * Two - t1r;
            f6i = f6i * Two - t1i;

            f4r = t0r + f6r * w1r - f6i * w1i;
            f4i = t0i + f6r * w1i + f6i * w1r;
            f6r = t0r * Two - f4r;
            f6i = t0i * Two - f4i;

            f7r = f5r + t1r * w1i + t1i * w1r;
            f7i = f5i - t1r * w1r + t1i * w1i;
            f5r = f5r * Two - f7r;
            f5i = f5i * Two - f7i;

            w0r = *u0r;
            w0i =  *u0i;
            w1r =  *u1r;
            w1i =  *u1i;

            if (DiffUCnt <= NDiffU/2)
                w0r = -w0r;

            t0r = f0r - f4r * w2r + f4i * w2i;
            t0i = f0i - f4r * w2i - f4i * w2r;
            f0r = f0r * Two - t0r;
            f0i = f0i * Two - t0i;

            f4r = f2r - f6r * w2i - f6i * w2r;
            f4i = f2i + f6r * w2r - f6i * w2i;
            f6r = f2r * Two - f4r;
            f6i = f2i * Two - f4i;

            *(p0r + pos) = t0r;
            *p2r = f4r;
            *(p0r + posi) = t0i;
            *(p2r + 1) = f4i;
            w2r =  *u2r;
            w2i =  *u2i;
            *p0r = f0r;
            *(p2r + pos) = f6r;
            *(p0r + 1) = f0i;
            *(p2r + posi) = f6i;

            p0r = pstrt;
            p2r = pstrt + pinc + pinc;

            t1r = f1r - f5r * w3r + f5i * w3i;
            t1i = f1i - f5r * w3i - f5i * w3r;
            f1r = f1r * Two - t1r;
            f1i = f1i * Two - t1i;

            f5r = f3r - f7r * w3i - f7i * w3r;
            f5i = f3i + f7r * w3r - f7i * w3i;
            f7r = f3r * Two - f5r;
            f7i = f3i * Two - f5i;

            *(p1r + pos) = t1r;
            *p3r = f5r;
            *(p1r + posi) = t1i;
            *(p3r + 1) = f5i;
            w3r =  *(u2r+U2toU3);
            w3i =  *(u2i-U2toU3);
            *p1r = f1r;
            *(p3r + pos) = f7r;
            *(p1r + 1) = f1i;
            *(p3r + posi) = f7i;

            p1r = pstrt + pinc;
            p3r = p2r + pinc;
        }
        NSameU /= 8;
        Uinc /= 8;
        Uinc2 /= 8;
        Uinc4 = Uinc * 4;
        NDiffU *= 8;
        pinc *= 8;
        pnext *= 8;
        pos *= 8;
        posi =  pos + 1;
    }
}

void ifftrecurs(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt);
void ifftrecurs(double *ioptr, int M, double *Utbl, int Ustride, int NDiffU, int StageCnt)
{
    /* recursive bfstages calls to maximize on chip cache efficiency */
    int i1;
    if (M <= MCACHE)
        ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
    else {
        for (i1=0; i1<8; i1++) {
            ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1);	/*  RADIX 8 Stages	*/
        }
        ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1);	/*  RADIX 8 Stage	*/
    }
}

void iffts1(double *ioptr, int M, int Rows, double *Utbl, short *BRLow)
{
    /* Compute in-place inverse complex fft on the rows of the input array	*/
    /* INPUTS */
    /* *ioptr = input data array	*/
    /* M = log2 of fft size	*/
    /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
    /* *Utbl = cosine table	*/
    /* *BRLow = bit reversed counter table	*/
    /* OUTPUTS */
    /* *ioptr = output data array	*/

    int 	StageCnt;
    int 	NDiffU;
    const double scale = 1.0/POW2(M);

    switch (M) {
    case 0:
        break;
    case 1:
        for (; Rows>0; Rows--) {
            ifft2pt(ioptr, scale);				/* a 2 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 2:
        for (; Rows>0; Rows--) {
            ifft4pt(ioptr, scale);				/* a 4 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 3:
        for (; Rows>0; Rows--) {
            ifft8pt(ioptr, scale);				/* an 8 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    default:
        for (; Rows>0; Rows--) {

            scbitrevR2(ioptr, M, BRLow, scale);	/* bit reverse and first radix 2 stage */

            StageCnt = (M-1) / 3;		// number of radix 8 stages
            NDiffU = 2;				// one radix 2 stage already complete

            if ( (M-1-(StageCnt * 3)) == 1 ) {
                ibfR2(ioptr, M, NDiffU);				/* 1 radix 2 stage */
                NDiffU *= 2;
            }

            if ( (M-1-(StageCnt * 3)) == 2 ) {
                ibfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
                NDiffU *= 4;
            }

            if (M <= MCACHE)
                ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/

            else {
                ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
            }

            ioptr += 2*POW2(M);
        }
    }
}

/************************************************
parts of rffts1
*************************************************/

static inline void rfft1pt(double *ioptr);
static inline void rfft1pt(double *ioptr)
{
    /***   RADIX 2 rfft	***/
    double f0r, f0i;
    double t0r, t0i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];

    /* finish rfft */
    t0r = 	f0r + f0i;
    t0i = 	f0r - f0i;

    /* store result */
    ioptr[0] = t0r;
    ioptr[1] = t0i;
}

static inline void rfft2pt(double *ioptr);
static inline void rfft2pt(double *ioptr)
{
    /***   RADIX 4 rfft	***/
    double f0r, f0i, f1r, f1i;
    double t0r, t0i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[2];
    f1i = ioptr[3];

    /* Butterflys		*/
    /*
    f0	-	-	t0
    f1	-  1 -	f1
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f1i - f0i;
    /* finish rfft */
    f0r = 	t0r + t0i;
    f0i = 	t0r - t0i;

    /* store result */
    ioptr[0] = f0r;
    ioptr[1] = f0i;
    ioptr[2] = f1r;
    ioptr[3] = f1i;
}

static inline void rfft4pt(double *ioptr);
static inline void rfft4pt(double *ioptr)
{
    /***   RADIX 8 rfft	***/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double t0r, t0i, t1r, t1i;
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    const double Two = 2.0;
    const double scale = 0.5;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[4];
    f1i = ioptr[5];
    f2r = ioptr[2];
    f2i = ioptr[3];
    f3r = ioptr[6];
    f3i = ioptr[7];

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0
    f1	-  1 -	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2
    f3	-  1 -	t1	- -i -	f3
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = f2i - t0i;	// neg for rfft

    f3r = f1r - t1i;
    f3i = f1i + t1r;
    f1r = f1r + t1i;
    f1i = f1i - t1r;

    /* finish rfft */
    t0r = 	f0r + f0i;    /* compute Re(x[0]) */
    t0i = 	f0r - f0i;    /* compute Re(x[N/2]) */

    t1r = f1r + f3r;
    t1i = f1i - f3i;
    f0r = f1i + f3i;
    f0i = f3r - f1r;

    f1r = t1r + w0r * f0r + w0r * f0i;
    f1i = t1i - w0r * f0r + w0r * f0i;
    f3r = Two * t1r - f1r;
    f3i = f1i - Two * t1i;

    /* store result */
    ioptr[4] = f2r;
    ioptr[5] = f2i;
    ioptr[0] = t0r;
    ioptr[1] = t0i;

    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
}

static inline void rfft8pt(double *ioptr);
static inline void rfft8pt(double *ioptr)
{
    /***   RADIX 16 rfft	***/
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double w1r = MYCOSPID8; /* cos(pi/8)	*/
    double w1i = MYSINPID8; /* sin(pi/8)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;
    const double scale = 0.5;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];
    f1r = ioptr[8];
    f1i = ioptr[9];
    f2r = ioptr[4];
    f2i = ioptr[5];
    f3r = ioptr[12];
    f3i = ioptr[13];
    f4r = ioptr[2];
    f4i = ioptr[3];
    f5r = ioptr[10];
    f5i = ioptr[11];
    f6r = ioptr[6];
    f6i = ioptr[7];
    f7r = ioptr[14];
    f7i = ioptr[15];
    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0	-	-	f0
    f1	-  1 -	f1	-	-	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2	-	-	f2
    f3	-  1 -	t1	- -i -	f3	-	-	f3
    f4	-	-	t0	-	-	f4	-  1 -	t0
    f5	-  1 -	f5	-	-	f5	- w3 -	f4
    f6	-	-	f6	-  1 -	f6	- -i -	t1
    f7	-  1 -	t1	- -i -	f7	- iw3-	f6
    */

    t0r = f0r + f1r;
    t0i = f0i + f1i;
    f1r = f0r - f1r;
    f1i = f0i - f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r - t1i;
    f3i = f1i + t1r;
    f1r = f1r + t1i;
    f1i = f1i - t1r;


    t0r = f4r + f5r;
    t0i = f4i + f5i;
    f5r = f4r - f5r;
    f5i = f4i - f5i;

    t1r = f6r - f7r;
    t1i = f6i - f7i;
    f6r = f6r + f7r;
    f6i = f6i + f7i;

    f4r = t0r + f6r;
    f4i = t0i + f6i;
    f6r = t0r - f6r;
    f6i = t0i - f6i;

    f7r = f5r - t1i;
    f7i = f5i + t1r;
    f5r = f5r + t1i;
    f5i = f5i - t1r;


    t0r = f0r - f4r;
    t0i = f4i - f0i;	// neg for rfft
    f0r = f0r + f4r;
    f0i = f0i + f4i;

    t1r = f2r - f6i;
    t1i = f2i + f6r;
    f2r = f2r + f6i;
    f2i = f2i - f6r;

    f4r = f1r - f5r * w0r - f5i * w0r;
    f4i = f1i + f5r * w0r - f5i * w0r;
    f1r = f1r * Two - f4r;
    f1i = f1i * Two - f4i;

    f6r = f3r + f7r * w0r - f7i * w0r;
    f6i = f3i + f7r * w0r + f7i * w0r;
    f3r = f3r * Two - f6r;
    f3i = f3i * Two - f6i;

    /* finish rfft */
    f5r = 	f0r + f0i;    /* compute Re(x[0]) */
    f5i = 	f0r - f0i;    /* compute Re(x[N/2]) */

    f0r = f2r + t1r;
    f0i = f2i - t1i;
    f7r = f2i + t1i;
    f7i = t1r - f2r;

    f2r = f0r + w0r * f7r + w0r * f7i;
    f2i = f0i - w0r * f7r + w0r * f7i;
    t1r = Two * f0r - f2r;
    t1i = f2i - Two * f0i;


    f0r = f1r + f6r;
    f0i = f1i - f6i;
    f7r = f1i + f6i;
    f7i = f6r - f1r;

    f1r = f0r + w1r * f7r + w1i * f7i;
    f1i = f0i - w1i * f7r + w1r * f7i;
    f6r = Two * f0r - f1r;
    f6i = f1i - Two * f0i;

    f0r = f3r + f4r;
    f0i = f3i - f4i;
    f7r = f3i + f4i;
    f7i = f4r - f3r;

    f3r = f0r + w1i * f7r + w1r * f7i;
    f3i = f0i - w1r * f7r + w1i * f7i;
    f4r = Two * f0r - f3r;
    f4i = f3i - Two * f0i;

    /* store result */
    ioptr[8] = t0r;
    ioptr[9] = t0i;
    ioptr[0] = f5r;
    ioptr[1] = f5i;

    ioptr[4] = scale*f2r;
    ioptr[5] = scale*f2i;
    ioptr[12] = scale*t1r;
    ioptr[13] = scale*t1i;

    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
    ioptr[10] = scale*f4r;
    ioptr[11] = scale*f4i;
    ioptr[14] = scale*f6r;
    ioptr[15] = scale*f6i;
}

static inline void frstage(double *ioptr, int M, double *Utbl);
static inline void frstage(double *ioptr, int M, double *Utbl)
{
    /*	Finish RFFT		*/

    int 	pos;
    int 	posi;
    int 	diffUcnt;

    double 	*p0r, *p1r;
    double 	*u0r, *u0i;

    double w0r, w0i;
    double f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    pos = POW2(M-1);
    posi = pos + 1;

    p0r = ioptr;
    p1r = ioptr + pos/2;

    u0r = Utbl + (size_t) POW2(M-3);

    w0r =  *u0r,

    f0r = *(p0r);
    f0i = *(p0r + 1);
    f4r = *(p0r + pos);
    f4i = *(p0r + posi);
    f1r = *(p1r);
    f1i = *(p1r + 1);
    f5r = *(p1r + pos);
    f5i = *(p1r + posi);

    t0r = Two * f0r + Two * f0i;    /* compute Re(x[0]) */
    t0i = Two * f0r - Two * f0i;    /* compute Re(x[N/2]) */
    t1r = f4r + f4r;
    t1i = -f4i - f4i;

    f0r = f1r + f5r;
    f0i = f1i - f5i;
    f4r = f1i + f5i;
    f4i = f5r - f1r;

    f1r = f0r + w0r * f4r + w0r * f4i;
    f1i = f0i - w0r * f4r + w0r * f4i;
    f5r = Two * f0r - f1r;
    f5i = f1i - Two * f0i;

    *(p0r) = t0r;
    *(p0r + 1) = t0i;
    *(p0r + pos) = t1r;
    *(p0r + posi) = t1i;
    *(p1r) = f1r;
    *(p1r + 1) = f1i;
    *(p1r + pos) = f5r;
    *(p1r + posi) = f5i;

    u0r = Utbl + 1;
    u0i = Utbl + (POW2(M-2)-1);

    w0r =  *u0r,
    w0i =  *u0i;

    p0r = (ioptr + 2);
    p1r = (ioptr + (POW2(M-2)-1)*2);

    /* Butterflys */
    /*
    f0	-	t0	-	-	f0
    f5	-	t1	- w0	-	f5

    f1	-	t0	-	-	f1
    f4	-	t1	-iw0 -	f4
    */

    for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--) {

        f0r = *(p0r);
        f0i = *(p0r + 1);
        f5r = *(p1r + pos);
        f5i = *(p1r + posi);
        f1r = *(p1r);
        f1i = *(p1r + 1);
        f4r = *(p0r + pos);
        f4i = *(p0r + posi);

        t0r = f0r + f5r;
        t0i = f0i - f5i;
        t1r = f0i + f5i;
        t1i = f5r - f0r;

        f0r = t0r + w0r * t1r + w0i * t1i;
        f0i = t0i - w0i * t1r + w0r * t1i;
        f5r = Two * t0r - f0r;
        f5i = f0i - Two * t0i;

        t0r = f1r + f4r;
        t0i = f1i - f4i;
        t1r = f1i + f4i;
        t1i = f4r - f1r;

        f1r = t0r + w0i * t1r + w0r * t1i;
        f1i = t0i - w0r * t1r + w0i * t1i;
        f4r = Two * t0r - f1r;
        f4i = f1i - Two * t0i;

        *(p0r) = f0r;
        *(p0r + 1) = f0i;
        *(p1r + pos) = f5r;
        *(p1r + posi) = f5i;

        w0r = *++u0r;
        w0i = *--u0i;

        *(p1r) = f1r;
        *(p1r + 1) = f1i;
        *(p0r + pos) = f4r;
        *(p0r + posi) = f4i;

        p0r += 2;
        p1r -= 2;
    }
}

void rffts1(double *ioptr, int M, int Rows, double *Utbl, short *BRLow)
{
    /* Compute in-place real fft on the rows of the input array	*/
    /* The result is the complex spectra of the positive frequencies */
    /* except the location for the first complex number contains the real */
    /* values for DC and Nyquest */
    /* INPUTS */
    /* *ioptr = real input data array	*/
    /* M = log2 of fft size	*/
    /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
    /* *Utbl = cosine table	*/
    /* *BRLow = bit reversed counter table	*/
    /* OUTPUTS */
    /* *ioptr = output data array	in the following order */
    /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */

    double	scale;
    int 	StageCnt;
    int 	NDiffU;

    M=M-1;
    switch (M) {
    case -1:
        break;
    case 0:
        for (; Rows>0; Rows--) {
            rfft1pt(ioptr);				/* a 2 pt fft */
            ioptr += 2*POW2(M);
        }
    case 1:
        for (; Rows>0; Rows--) {
            rfft2pt(ioptr);				/* a 4 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 2:
        for (; Rows>0; Rows--) {
            rfft4pt(ioptr);				/* an 8 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 3:
        for (; Rows>0; Rows--) {
            rfft8pt(ioptr);				/* a 16 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    default:
        scale = 0.5;
        for (; Rows>0; Rows--) {

            scbitrevR2(ioptr, M, BRLow, scale);						/* bit reverse and first radix 2 stage */

            StageCnt = (M-1) / 3;		// number of radix 8 stages
            NDiffU = 2;				// one radix 2 stage already complete

            if ( (M-1-(StageCnt * 3)) == 1 ) {
                bfR2(ioptr, M, NDiffU);			/* 1 radix 2 stage */
                NDiffU *= 2;
            }

            if ( (M-1-(StageCnt * 3)) == 2 ) {
                bfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
                NDiffU *= 4;
            }

            if (M <= MCACHE) {
                bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
                frstage(ioptr, M+1, Utbl);
            }

            else {
                fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
                frstage(ioptr, M+1, Utbl);
            }

            ioptr += 2*POW2(M);
        }
    }
}

/************************************************
parts of riffts1
*************************************************/

static inline void rifft1pt(double *ioptr, double scale);
static inline void rifft1pt(double *ioptr, double scale)
{
    /***   RADIX 2 rifft	***/
    double f0r, f0i;
    double t0r, t0i;

    /* bit reversed load */
    f0r = ioptr[0];
    f0i = ioptr[1];

    /* finish rfft */
    t0r = 	f0r + f0i;
    t0i = 	f0r - f0i;

    /* store result */
    ioptr[0] = scale*t0r;
    ioptr[1] = scale*t0i;
}

static inline void rifft2pt(double *ioptr, double scale);
static inline void rifft2pt(double *ioptr, double scale)
{
    /***   RADIX 4 rifft	***/
    double f0r, f0i, f1r, f1i;
    double t0r, t0i;
    const double Two = 2.0;

    /* bit reversed load */
    t0r = ioptr[0];
    t0i = ioptr[1];
    f1r = Two * ioptr[2];
    f1i = Two * ioptr[3];

    /* start rifft */
    f0r = 	t0r + t0i;
    f0i = 	t0r - t0i;
    /* Butterflys		*/
    /*
    f0	-	-	t0
    f1	-  1 -	f1
    */

    t0r = f0r + f1r;
    t0i = f0i - f1i;
    f1r = f0r - f1r;
    f1i = f0i + f1i;

    /* store result */
    ioptr[0] = scale*t0r;
    ioptr[1] = scale*t0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
}

static inline void rifft4pt(double *ioptr, double scale);
static inline void rifft4pt(double *ioptr, double scale)
{
    /***   RADIX 8 rifft	***/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double t0r, t0i, t1r, t1i;
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    const double Two = 2.0;

    /* bit reversed load */
    t0r = ioptr[0];
    t0i = ioptr[1];
    f2r = ioptr[2];
    f2i = ioptr[3];
    f1r = Two * ioptr[4];
    f1i = Two * ioptr[5];
    f3r = ioptr[6];
    f3i = ioptr[7];

    /* start rfft */
    f0r = 	t0r + t0i;    /* compute Re(x[0]) */
    f0i = 	t0r - t0i;    /* compute Re(x[N/2]) */

    t1r = f2r + f3r;
    t1i = f2i - f3i;
    t0r = f2r - f3r;
    t0i = f2i + f3i;

    f2r = t1r - w0r * t0r - w0r * t0i;
    f2i = t1i + w0r * t0r - w0r * t0i;
    f3r = Two * t1r - f2r;
    f3i = f2i - Two * t1i;

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0
    f1	-  1 -	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2
    f3	-  1 -	t1	-  i -	f3
    */

    t0r = f0r + f1r;
    t0i = f0i - f1i;
    f1r = f0r - f1r;
    f1i = f0i + f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r + t1i;
    f3i = f1i - t1r;
    f1r = f1r - t1i;
    f1i = f1i + t1r;

    /* store result */
    ioptr[0] = scale*f0r;
    ioptr[1] = scale*f0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[4] = scale*f2r;
    ioptr[5] = scale*f2i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
}

static inline void rifft8pt(double *ioptr, double scale);
static inline void rifft8pt(double *ioptr, double scale)
{
    /***   RADIX 16 rifft	***/
    double w0r = 1.0/MYROOT2; /* cos(pi/4)	*/
    double w1r = MYCOSPID8; /* cos(pi/8)	*/
    double w1i = MYSINPID8; /* sin(pi/8)	*/
    double f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
    double f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    /* bit reversed load */
    t0r = ioptr[0];
    t0i = ioptr[1];
    f4r = ioptr[2];
    f4i = ioptr[3];
    f2r = ioptr[4];
    f2i = ioptr[5];
    f6r = ioptr[6];
    f6i = ioptr[7];
    f1r = Two * ioptr[8];
    f1i = Two * ioptr[9];
    f5r = ioptr[10];
    f5i = ioptr[11];
    f3r = ioptr[12];
    f3i = ioptr[13];
    f7r = ioptr[14];
    f7i = ioptr[15];


    /* start rfft */
    f0r = 	t0r + t0i;    /* compute Re(x[0]) */
    f0i = 	t0r - t0i;    /* compute Re(x[N/2]) */

    t0r = f2r + f3r;
    t0i = f2i - f3i;
    t1r = f2r - f3r;
    t1i = f2i + f3i;

    f2r = t0r - w0r * t1r - w0r * t1i;
    f2i = t0i + w0r * t1r - w0r * t1i;
    f3r = Two * t0r - f2r;
    f3i = f2i - Two * t0i;

    t0r = f4r + f7r;
    t0i = f4i - f7i;
    t1r = f4r - f7r;
    t1i = f4i + f7i;

    f4r = t0r - w1i * t1r - w1r * t1i;
    f4i = t0i + w1r * t1r - w1i * t1i;
    f7r = Two * t0r - f4r;
    f7i = f4i - Two * t0i;

    t0r = f6r + f5r;
    t0i = f6i - f5i;
    t1r = f6r - f5r;
    t1i = f6i + f5i;

    f6r = t0r - w1r * t1r - w1i * t1i;
    f6i = t0i + w1i * t1r - w1r * t1i;
    f5r = Two * t0r - f6r;
    f5i = f6i - Two * t0i;

    /* Butterflys		*/
    /*
    f0	-	-	t0	-	-	f0	-	-	f0
    f1*	-  1 -	f1	-	-	f1	-	-	f1
    f2	-	-	f2	-  1 -	f2	-	-	f2
    f3	-  1 -	t1	-  i -	f3	-	-	f3
    f4	-	-	t0	-	-	f4	-  1 -	t0
    f5	-  1 -	f5	-	-	f5	- w3 -	f4
    f6	-	-	f6	-  1 -	f6	-  i -	t1
    f7	-  1 -	t1	-  i -	f7	- iw3-	f6
    */

    t0r = f0r + f1r;
    t0i = f0i - f1i;
    f1r = f0r - f1r;
    f1i = f0i + f1i;

    t1r = f2r - f3r;
    t1i = f2i - f3i;
    f2r = f2r + f3r;
    f2i = f2i + f3i;

    f0r = t0r + f2r;
    f0i = t0i + f2i;
    f2r = t0r - f2r;
    f2i = t0i - f2i;

    f3r = f1r + t1i;
    f3i = f1i - t1r;
    f1r = f1r - t1i;
    f1i = f1i + t1r;


    t0r = f4r + f5r;
    t0i = f4i + f5i;
    f5r = f4r - f5r;
    f5i = f4i - f5i;

    t1r = f6r - f7r;
    t1i = f6i - f7i;
    f6r = f6r + f7r;
    f6i = f6i + f7i;

    f4r = t0r + f6r;
    f4i = t0i + f6i;
    f6r = t0r - f6r;
    f6i = t0i - f6i;

    f7r = f5r + t1i;
    f7i = f5i - t1r;
    f5r = f5r - t1i;
    f5i = f5i + t1r;


    t0r = f0r - f4r;
    t0i = f0i - f4i;
    f0r = f0r + f4r;
    f0i = f0i + f4i;

    t1r = f2r + f6i;
    t1i = f2i - f6r;
    f2r = f2r - f6i;
    f2i = f2i + f6r;

    f4r = f1r - f5r * w0r + f5i * w0r;
    f4i = f1i - f5r * w0r - f5i * w0r;
    f1r = f1r * Two - f4r;
    f1i = f1i * Two - f4i;

    f6r = f3r + f7r * w0r + f7i * w0r;
    f6i = f3i - f7r * w0r + f7i * w0r;
    f3r = f3r * Two - f6r;
    f3i = f3i * Two - f6i;

    /* store result */
    ioptr[0] = scale*f0r;
    ioptr[1] = scale*f0i;
    ioptr[2] = scale*f1r;
    ioptr[3] = scale*f1i;
    ioptr[4] = scale*f2r;
    ioptr[5] = scale*f2i;
    ioptr[6] = scale*f3r;
    ioptr[7] = scale*f3i;
    ioptr[8] = scale*t0r;
    ioptr[9] = scale*t0i;
    ioptr[10] = scale*f4r;
    ioptr[11] = scale*f4i;
    ioptr[12] = scale*t1r;
    ioptr[13] = scale*t1i;
    ioptr[14] = scale*f6r;
    ioptr[15] = scale*f6i;
}

static inline void ifrstage(double *ioptr, int M, double *Utbl);
static inline void ifrstage(double *ioptr, int M, double *Utbl)
{
    /*	Start RIFFT		*/

    int 	pos;
    int 	posi;
    int 	diffUcnt;

    double 	*p0r, *p1r;
    double 	*u0r, *u0i;

    double w0r, w0i;
    double f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
    double t0r, t0i, t1r, t1i;
    const double Two = 2.0;

    pos = POW2(M-1);
    posi = pos + 1;

    p0r = ioptr;
    p1r = ioptr + pos/2;

    u0r = Utbl + (size_t) POW2(M-3);

    w0r =  *u0r,

    f0r = *(p0r);
    f0i = *(p0r + 1);
    f4r = *(p0r + pos);
    f4i = *(p0r + posi);
    f1r = *(p1r);
    f1i = *(p1r + 1);
    f5r = *(p1r + pos);
    f5i = *(p1r + posi);

    t0r = f0r + f0i;
    t0i = f0r - f0i;
    t1r = f4r + f4r;
    t1i = -f4i - f4i;

    f0r = f1r + f5r;
    f0i = f1i - f5i;
    f4r = f1r - f5r;
    f4i = f1i + f5i;

    f1r = f0r - w0r * f4r - w0r * f4i;
    f1i = f0i + w0r * f4r - w0r * f4i;
    f5r = Two * f0r - f1r;
    f5i = f1i - Two * f0i;

    *(p0r) = t0r;
    *(p0r + 1) = t0i;
    *(p0r + pos) = t1r;
    *(p0r + posi) = t1i;
    *(p1r) = f1r;
    *(p1r + 1) = f1i;
    *(p1r + pos) = f5r;
    *(p1r + posi) = f5i;

    u0r = Utbl + 1;
    u0i = Utbl + (POW2(M-2)-1);

    w0r =  *u0r,
    w0i =  *u0i;

    p0r = (ioptr + 2);
    p1r = (ioptr + (POW2(M-2)-1)*2);

    /* Butterflys */
    /*
    f0	-	 t0		-	f0
    f1	-     t1     -w0-   f1

    f2	-	 t0		-	f2
    f3	-     t1	   -iw0-  f3
    */

    for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--) {

        f0r = *(p0r);
        f0i = *(p0r + 1);
        f5r = *(p1r + pos);
        f5i = *(p1r + posi);
        f1r = *(p1r);
        f1i = *(p1r + 1);
        f4r = *(p0r + pos);
        f4i = *(p0r + posi);

        t0r = f0r + f5r;
        t0i = f0i - f5i;
        t1r = f0r - f5r;
        t1i = f0i + f5i;

        f0r = t0r - w0i * t1r - w0r * t1i;
        f0i = t0i + w0r * t1r - w0i * t1i;
        f5r = Two * t0r - f0r;
        f5i = f0i - Two * t0i;

        t0r = f1r + f4r;
        t0i = f1i - f4i;
        t1r = f1r - f4r;
        t1i = f1i + f4i;

        f1r = t0r - w0r * t1r - w0i * t1i;
        f1i = t0i + w0i * t1r - w0r * t1i;
        f4r = Two * t0r - f1r;
        f4i = f1i - Two * t0i;

        *(p0r) = f0r;
        *(p0r + 1) = f0i;
        *(p1r + pos) = f5r;
        *(p1r + posi) = f5i;

        w0r = *++u0r;
        w0i = *--u0i;

        *(p1r) = f1r;
        *(p1r + 1) = f1i;
        *(p0r + pos) = f4r;
        *(p0r + posi) = f4i;

        p0r += 2;
        p1r -= 2;
    }
}

void riffts1(double *ioptr, int M, int Rows, double *Utbl, short *BRLow)
{
    /* Compute in-place real ifft on the rows of the input array	*/
    /* data order as from rffts1 */
    /* INPUTS */
    /* *ioptr = input data array in the following order	*/
    /* M = log2 of fft size	*/
    /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
    /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array)	*/
    /* *Utbl = cosine table	*/
    /* *BRLow = bit reversed counter table	*/
    /* OUTPUTS */
    /* *ioptr = real output data array	*/

    double	scale;
    int 	StageCnt;
    int 	NDiffU;

    scale = 1.0/POW2(M);
    M=M-1;
    switch (M) {
    case -1:
        break;
    case 0:
        for (; Rows>0; Rows--) {
            rifft1pt(ioptr, scale);				/* a 2 pt fft */
            ioptr += 2*POW2(M);
        }
    case 1:
        for (; Rows>0; Rows--) {
            rifft2pt(ioptr, scale);				/* a 4 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 2:
        for (; Rows>0; Rows--) {
            rifft4pt(ioptr, scale);				/* an 8 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    case 3:
        for (; Rows>0; Rows--) {
            rifft8pt(ioptr, scale);				/* a 16 pt fft */
            ioptr += 2*POW2(M);
        }
        break;
    default:
        for (; Rows>0; Rows--) {

            ifrstage(ioptr, M+1, Utbl);

            scbitrevR2(ioptr, M, BRLow, scale);						/* bit reverse and first radix 2 stage */

            StageCnt = (M-1) / 3;		// number of radix 8 stages
            NDiffU = 2;				// one radix 2 stage already complete

            if ( (M-1-(StageCnt * 3)) == 1 ) {
                ibfR2(ioptr, M, NDiffU);			/* 1 radix 2 stage */
                NDiffU *= 2;
            }

            if ( (M-1-(StageCnt * 3)) == 2 ) {
                ibfR4(ioptr, M, NDiffU);			/* 1 radix 4 stage */
                NDiffU *= 4;
            }

            if (M <= MCACHE) {
                ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
            }

            else {
                ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt);		/*  RADIX 8 Stages	*/
            }

            ioptr += 2*POW2(M);
        }
    }
}