
#include <iostream.h>
#include <stdlib.h>
//#include <builtin.h>
#include <assert.h>

#include "fxt.h"
#include "fxtaux.h"
#include "matrix/ldn2rc.h"
#include "permute.h"
#include "graypermute.h"


// Timing helpers built on the x86 time stamp counter (TSC).
//
// CPU_CLOCK_HZ is the tick rate used to convert TSC ticks to seconds.
// It defaults to the value that used to be hard-coded here
// (366*1024*1024) and can be overridden at compile time for another
// machine, e.g.  -DCPU_CLOCK_HZ=2.4e9
#ifndef CPU_CLOCK_HZ
#define CPU_CLOCK_HZ  (366.0*1024*1024)
#endif

// Seconds per TSC tick.
#define  invclock  (1.0/(CPU_CLOCK_HZ))

// Read the time stamp counter: EAX goes to lo, EDX goes to hi.
// The asm has outputs but no inputs, so it must be volatile: otherwise
// the compiler is allowed to merge or move the two reads that bracket a
// measurement, which would make the measured interval meaningless.
#define rdtsc(lo, hi) \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi))


// Store the current TSC reading, scaled by invclock, in z.
// Needs two integer lvalues named tlo and thi in the calling scope.
// Wrapped in do/while(0) so that it acts as one statement.
#define  get_time(z) \
do { rdtsc(tlo, thi);  (z) = invclock*(thi*(4.0*1024*1024*1024)+tlo); } while (0)

// STR(s): turn the macro argument into a string literal.
#define  STR(s)  #s

// TT(CODE): time CODE and print one result line.
//
// Relies on these names being visible at the point of use:
//   f, n        passed to null() before the measurement
//               (null() is defined elsewhere; presumably it clears
//               the buffer -- confirm)
//   m           number of repetitions of CODE
//   t, dt       scratch doubles for the start time and the result
//   dt1         reference time; set from the first measurement while
//               it is still zero
//   tlo, thi    needed by get_time()
//
// Output: the source text of CODE (field width 25), the average time
// per repetition ("dt="), and that time divided by dt1 ("rel=").
#define TT(CODE) \
{ \
  null( f, n ); \
  cout.width(25); \
  cout << STR(CODE); \
  cout.flush(); \
  get_time(t); \
  for (ulong ct=0; ct<m; ++ct)  {CODE}; \
  get_time(dt); \
  dt = dt - t; \
  dt = dt / m; \
  cout << "    dt="; \
  cout.width(8); \
  cout << dt; \
  if ( 0==dt1 )  dt1 = dt; \
  cout << "   rel="; \
  cout.width(8); \
  cout << dt/dt1; \
  cout << endl; \
}


#define  SYMM0  2  // 2, 4 (default is 4)
#define  idx_swap(f, k, r)  { ulong kx=(k), rx=(r);  swap(f[kx], f[rx]); }
template <typename Type>
void
test(Type *f, ulong n)
{
    if ( n<=8 )
    {
        if ( n==8 )
        {
            swap0(f[1], f[4]);
            swap0(f[3], f[6]);
            return;
        }

        if ( n==4 )
        {
            swap0(f[1], f[2]);
            return;
        }

        return;
    }

    const ulong nh = (n>>1);
    static ulong x[BITS_PER_LONG];
    x[0] = nh;
    {  // initialize xor-table:
        ulong i, m = nh;
        for (i=1; m!=0; ++i)
        {
            m >>= 1;
            x[i] = x[i-1] ^ m;
        }
    }

#if  ( SYMM0 >= 2 )
    const ulong n1  = n - 1;    // = 11111111
#if  ( SYMM0 >= 4 )
    const ulong nx1 = nh - 2;   // = 01111110
//    const ulong nx2 = n1 - nx1; // = 10111101
#endif //  ( SYMM0 >= 4 )
#endif //  ( SYMM0 >= 2 )
    ulong k=0, r=0;
    while ( k<n/SYMM0  )  // n>=16, n/2>=8, n/4>=4
    {
        // ----- k%4 == 0:
        if ( r>k )
        {
            swap(f[k], f[r]);  // <nh, <nh 11
#if  ( SYMM0 >= 2 )
//            idx_swap(f, n1^k, n1^r);  // >nh, >nh 00
#if  ( SYMM0 >= 4 )
            idx_swap(f, nx1^k, nx1^r);  // <nh, <nh 11
//            idx_swap(f, nx2^k, nx2^r);  // >nh, >nh 00
#endif //  ( SYMM0 >= 4 )
#endif //  ( SYMM0 >= 2 )
        }

        r ^= nh;
        ++k;

        // ----- k%4 == 1:
        if ( r>k )
        {
            swap0(f[k], f[r]);  // <nh, >nh 10
#if  ( SYMM0 >= 4 )
            swap0(f[r^n1], f[k^n1]);
//            idx_swap(f, n1^k, n1^r);  // >nh, <nh 01
#endif //  ( SYMM0 >= 4 )
        }

        { // scan for lowest unset bit of k:
            ulong m = 2,  i = 1;
            while ( m & k )  { m <<= 1;  ++i; }
            r ^= x[i];
        }
        ++k;

        // ----- k%4 == 2:
        if ( r>k )
        {
            swap(f[k], f[r]);  // <nh, <nh 11
#if  ( SYMM0 >= 2 )
//            idx_swap(f, n1^k, n1^r); // >nh, >nh 00
#endif //  ( SYMM0 >= 2 )
        }

        r ^= nh;
        ++k;

        // ----- k%4 == 3:
        if ( r>k )
        {
            swap0(f[k], f[r]);    // <nh, >nh 10
#if  ( SYMM0 >= 4 )
            swap0(f[k^nx1], f[r^nx1]);    // <nh, >nh 10
//            idx_swap(f, nx1^k, nx1^r);   // <nh, >nh 10
#endif //  ( SYMM0 >= 4 )
        }

        { // scan for lowest unset bit of k:
            ulong m = 4,  i = 2;
            while ( m & k )  { m <<= 1;  ++i; }
            r ^= x[i];
        }
        ++k;
    }
}
// =========================
#undef  idx_swap


int
main(int argc, char **argv)
{
    double t, dt, dt1=0;
    ulong tlo, thi;

    ulong ldn = 20;
    if ( argc>1 )  ldn = atol(argv[1]);
    ulong  n = (1<<ldn);
    ulong ldn2 = ldn + 1,  n2 = 2*n;  // for real ops

    ulong m = 3;
    if ( argc>2 )  m = atol(argv[2]);


    Complex *f;
    double *fr, *fi;
    f = new Complex[n];
    fr = (double *)f;
    fi = fr + n;

    cout << "ldn=" << ldn << "  n=" << n << endl;
    cout << "repetitions: m=" << m << endl;

    int ms = n*sizeof(Complex)/1024;
    cout << "memsize=" << ms << " kiloByte" << endl;
    cout << endl;

    reverse(fr,n*2);  // touch memory


    TT( reverse(fr,n2); );

    TT( revbin_permute(fr,n2); );
    TT( test(fr,n2); );

    TT( revbin_permute(fr,n2); );
    TT( test(fr,n2); );

    TT( revbin_permute0(fr,n2); );
    TT( reverse(fr,n2); );
    TT( gray_permute(fr,n2); );
    TT( reverse(fr,n2); );

//    TT( fht_auto_convolution0(fr,ldn); );
//    TT( matrix_auto_convolution0(fr,ldn); );
//    TT( fht(fr,ldn); );
//
//    TT( dif2_walsh_wak(fr,ldn); );
//    TT( dit2_walsh_wak(fr,ldn); );
//    TT( dif2_walsh_wak_slow(fr,ldn); );
//    TT( dit2_walsh_wak_slow(fr,ldn); );
//
//    TT( null(f,n); );
//    TT( fht(fr,ldn2); );
//    TT( fht(f,ldn); );
//
//    TT( fht_fft(fr,fi,ldn,+1); );
//    TT( matrix_fft(fr,fi,ldn,+1); );
//
//    ulong r, c;  ldn2rc(ldn, r, c);  TT( matrix_transpose2(f,r,c); );
//    TT( revbin_permute0(f,n); );
//    TT( revbin_permute(f,n); );
//
//    TT( matrix_auto_convolution0(f,ldn); );
//    TT( matrix_complex_auto_convolution0(fr,fi,ldn); );
//
//    TT( matrix_auto_convolution0(fr,ldn2); );
//    TT( fht_auto_convolution0(fr,ldn2); );
//    TT( split_radix_fft_auto_convolution0(fr,ldn2); );
//
//    TT( dit2_fft(f,ldn,+1); );
//    TT( dit2_fft_localized(f,ldn,+1); );

    return 0;
}
//===========================================

