#include <stdlib.h>
#include <ctype.h>
#include <fstream.h>
#include <strstream.h>
#include <string.h>
#include <float.h>
#include <windows.h>

#undef INFINITE                                 // Defined in windows.h

#include "ap.h"


// Automatic init stuff

// Set by apinit () and cleared by apdeinit (); each of them checks it first,
// so a repeated call to either one returns without doing anything
bool apfloatinitialized = false;

// Ties library setup and teardown to the lifetime of an object:
// constructing one runs apinit (), destroying it runs apdeinit ()
class apfloatinit
{
public:
    // Initializes the library; apinit () returns true on failure,
    // which trips the assertion in builds where assert is active
    apfloatinit ()
    {
        const bool initfailed = apinit ();

        assert (initfailed == false);
    }

    // Releases whatever apinit () set up
    ~apfloatinit ()
    {
        apdeinit ();
    }
};

// Constructor is called at program init, destructor at program exit,
// so apinit () and apdeinit () run without being called explicitly
apfloatinit autoinit;


// Code size in megabytes (approx.)
// apinit () subtracts this many megabytes from Ramsize to get the default Maxblocksize
const size_t CODESIZE = 2;

// Size of the line and key buffers apinit () uses when reading apfloat.ini
const size_t BUFSIZE = 256;

// NOTE(review): not referenced in this file; presumably used by the FPU
// multiplication code elsewhere - confirm
float chopper64 = 9223372036854775808.0;        // 2^63
// The current modulus as a double, assigned in setmodulus ()
double dmodulus;
// 1 / modulus, stored by the FPU code in setmodulus ()
long double imodulus;

// The actual workspace buffer, not necessarily aligned to a cache line
// Allocated in apinit () and released in apdeinit (); workspace is derived from it
rawtype *unalignedworkspace;

// Rounds x down to a power of two.
// Returns the largest 2^n that is <= x, or 0 when x is 0.
size_t rnd2down (size_t x)
{
    if (x == 0)
        return 0;

    size_t power = 1;

    // One doubling for every bit position above the lowest one that x occupies
    for (size_t rest = x >> 1; rest != 0; rest >>= 1)
        power *= 2;

    return power;
}

// Rounds x down to a number of the form 2^n or 3*2^n.
// Returns the largest such number that is <= x, or 0 when x is 0.
// The candidates are visited in increasing order: 1, 2, 3, 4, 6, 8, 12, 16, ...
size_t rnd23down (size_t x)
{
    if (!x) return 0;

    size_t p = 1;                               // Largest candidate known to be <= x

    for (;;)
    {
        size_t next;

        if (p == 1)
            next = 2;
        else if ((p & (p - 1)) == 0)
            next = p / 2 * 3;                   // 2^n -> 3*2^(n-1)
        else
            next = p / 3 * 4;                   // 3*2^n -> 2^(n+2)

        // next < p means the step wrapped around the range of size_t;
        // without this check the search would never end for very large x
        if (next < p || next > x) break;

        p = next;
    }

    return p;
}

// Returns the largest power of two whose square is <= x, or 0 when x is 0.
size_t sqrt4down (size_t x)
{
    if (x == 0)
        return 0;

    size_t root = 1;

    // Every factor of four that fits in x contributes a factor of two to the result
    for (x >>= 2; x != 0; x >>= 2)
        root += root;

    return root;
}

// Inits global variables
bool apinit (void)
{
    if (apfloatinitialized) return false;
    apfloatinitialized = true;

    size_t v;
    char buf[BUFSIZE], str[BUFSIZE], *p;
    ifstream fs ("apfloat.ini", ios::in | ios::nocreate);

    // RAM size
    // Set this to the size of actual RAM memory you have on your computer.
    // Ramsize = 8 * (1 << 20);
    MEMORYSTATUS ms;
    SYSTEM_INFO systeminfo;

    ms.dwLength = sizeof (MEMORYSTATUS);
    GlobalMemoryStatus (&ms);
    Ramsize = ms.dwTotalPhys;

    GetSystemInfo(&systeminfo);
    NProcessors = systeminfo.dwNumberOfProcessors;

    // L1 cache size
    // 486's and Pentiums have 8KB (data) L1 cache
    CacheL1size = 8 * (1 << 10);

    // L2 cache size
    // set to amount of L2 cache
    CacheL2size = 256 * (1 << 10);

    // cache burst width
    // 16 bytes for 486, 32 for Pentium
    // Cacheburst = 16;
    Cacheburst = 32;

    // Longer numbers than this will be stored by default to disk
    Memorytreshold = 16384;

    // Efficient read/write block size
    Blocksize = 16384;

    if (!fs.fail ())
    {
        while (!fs.eof ())
        {
            fs.getline (buf, BUFSIZE);
            if ((p = strchr (buf, '=')) != 0)
            {
                *p = '\0';
                p++;
                istrstream (buf) >> str;
                for (v = strlen (str); v--;)
                    str[v] = tolower (str[v]);
                istrstream (p) >> v;
                if (!strcmp (str, "ramsize")) Ramsize = v;
                else if (!strcmp (str, "cachel1size")) CacheL1size = v;
                else if (!strcmp (str, "cachel2size")) CacheL2size = v;
                else if (!strcmp (str, "cacheburst")) Cacheburst = v;
                else if (!strcmp (str, "memorytreshold")) Memorytreshold = v;
                else if (!strcmp (str, "blocksize")) Blocksize = v;
                else if (!strcmp (str, "nprocessors")) NProcessors = v;
            }
        }
    }

    if ((p = getenv ("RAMSIZE")) != 0) istrstream (p) >> Ramsize;
    if ((p = getenv ("CACHEL1SIZE")) != 0) istrstream (p) >> CacheL1size;
    if ((p = getenv ("CACHEL2SIZE")) != 0) istrstream (p) >> CacheL2size;
    if ((p = getenv ("CACHEBURST")) != 0) istrstream (p) >> Cacheburst;
    if ((p = getenv ("MEMORYTRESHOLD")) != 0) istrstream (p) >> Memorytreshold;
    if ((p = getenv ("BLOCKSIZE")) != 0) istrstream (p) >> Blocksize;
    if ((p = getenv ("NPROCESSORS")) != 0) istrstream (p) >> NProcessors;

    if ((p = getenv ("MAXBLOCKSIZE")) != 0)
        istrstream (p) >> Maxblocksize;
    else
        Maxblocksize = Ramsize - CODESIZE * (1 << 20);

    // Set to the maximum 2^n or 3*2^n size block of modints that fits in the memory
    Maxblocksize = rnd23down (Maxblocksize / sizeof (modint));

    // Size of matrix that fits in L2 cache
    Cachetreshold = rnd2down (CacheL2size / sizeof (modint));

    // Cache burst in modints
    Cacheburstblocksize = rnd2down (Cacheburst / sizeof (modint));

    // Block size that fits in L1 cache
    Cachemaxblocksize = rnd2down (CacheL1size / sizeof (modint));

    // Transpose block size, fits in processor L1 cache
    Cacheblocksize = sqrt4down (CacheL1size / sizeof (modint));

    if (Blocksize > Memorytreshold)
        Blocksize = Memorytreshold;

    if (Cachemaxblocksize > Memorytreshold)
        Cachemaxblocksize = rnd2down (Memorytreshold);

    if ((unalignedworkspace = new rawtype[Maxblocksize + Cacheburstblocksize - 1]) == 0)
        return true;

    // Memory block aligned at the beginning of a cache line
    workspace = (rawtype *) (((size_t) unalignedworkspace + sizeof (modint) * Cacheburstblocksize - 1) & -(sizeof (modint) * Cacheburstblocksize));

    return false;
}

// Releases what apinit () allocated.
// Does nothing unless the library is currently marked as initialized.
void apdeinit (void)
{
    if (!apfloatinitialized) return;
    apfloatinitialized = false;

    // Clear both pointers so that neither refers to the freed buffer
    delete[] unalignedworkspace;
    unalignedworkspace = 0;
    workspace = 0;

    // Reinitialize the floating-point package, whose control word setmodulus () changes
    _fpreset ();
}

// Set the fpu for doing fpu multiplication.
//
// Stores m in modulus, in modint::modulus and (as a double) in dmodulus,
// changes the FPU control word, and computes 1 / modulus on the FPU,
// saving a copy of it in imodulus.
// Uses compiler-specific inline assembler.
void setmodulus (rawtype m)
{
    dmodulus = modulus = modint::modulus = m;

    // Change only the rounding control and precision control fields; going by
    // the constant names: round by chopping (truncation), 64-bit precision
    _control87 (RC_CHOP | PC_64, MCW_RC | MCW_PC);

    // Mark the top FPU register as free
    asm ffree st
    // Push 1.0
    asm fld1
    // Divide it by modulus, which is read from memory as a 32-bit integer
    asm fidiv dword ptr modulus
    // Duplicate the quotient...
    asm fld st
    // ...and pop the duplicate into imodulus in the 80-bit format;
    // the original 1 / modulus stays on the FPU stack
    asm fstp tbyte ptr imodulus
}

// Clear what setmodulus () did.
// Call at the end of the program.
//
// Marks the top FPU register, where setmodulus () left 1 / modulus, as free.
// NOTE(review): the FPU control word changed by setmodulus () is not restored
// here; apdeinit () calls _fpreset ().
void clearmodulus (void)
{
    asm ffree st
}
