#include "ap.h"

// State shared between carrycrt() and the crtblock assembly kernel.
// The kernel refers to these objects by symbol name, so the names must not change.

// CRT factors: carrycrt() sets tK to the inverse, modulo moduli[K],
// of the product of the other two moduli
modint t0;
modint t1;
modint t2;

// Multi-word products of the moduli, filled in by carrycrt() using bigmul():
// mXY holds moduli[X] * moduli[Y] (two words), mm holds the product of all three (three words)
rawtype m01[2];
rawtype m02[2];
rawtype m12[2];
rawtype mm[3];

// Running carry; cleared by carrycrt() and updated by crtblock for every element
rawtype cc[3];

// Added to the reduced FPU values just before they are stored to dtmp0..dtmp2
float chopper53 = 4503599627370496.0;                   // 2^52

// Scratch doubles written by the FPU code; the integer code reads their low 32 bits
double dtmp0;
double dtmp1;
double dtmp2;

// The three moduli converted to double
double dmodulus0;
double dmodulus1;
double dmodulus2;

// Reciprocals of the three moduli
double imodulus0;
double imodulus1;
double imodulus2;

// crtblock: x86 (32-bit, AT&T syntax) kernel for the Chinese Remainder Theorem
// step of carrycrt().
//
// Arguments (read from the stack; six registers are pushed first, so the
// first argument is at 28(%esp)):
//   l    - number of elements to process; nothing is done when l is zero
//   buf1 - inputs paired with t0 / modulus 0; also receives the results
//   buf2 - inputs paired with t1 / modulus 1 (only read here)
//   buf3 - inputs paired with t2 / modulus 2 (only read here)
//
// Elements are processed from index l - 1 down to index 0. For each index i:
//   1. FPU part: xK = bufK[i] * tK, qK = xK * imodulusK with chopper64 added
//      and then subtracted again, rK = xK - qK * dmodulusK. rK + chopper53 is
//      stored to dtmpK and the integer code then uses the low 32 bits of dtmpK.
//      NOTE(review): chopper64 is defined outside this file; presumably a
//      rounding constant, and the result presumably depends on the FPU
//      rounding/precision mode set elsewhere - confirm.
//   2. Integer part: sum = m12 * dtmp0 + m02 * dtmp1 + m01 * dtmp2, accumulated
//      as a three-word value in %edi:%ebp:%ebx (most significant word first).
//   3. While sum >= mm, mm is subtracted. The comparison goes word by word from
//      the most significant word down; "ja"/"jb" are used on the two upper
//      words so that equal words fall through to the next lower word. (Using
//      "jae" there would make the lower-word comparisons unreachable and would
//      subtract mm from a smaller sum whenever the top words are equal.)
//   4. The carry cc is added, the total is divided by Base with two chained
//      divl instructions, the remainder is stored to buf1[i] and the quotient
//      becomes the new carry in cc[0], cc[1] (cc[2] is set to zero).
//
// All pushed registers are restored before returning; %eax is clobbered.
extern "C" void crtblock (size_t l, modint *buf1, modint *buf2, modint *buf3);

asm ("
    .globl crtblock

.align 32
crtblock:
    pushl %ebx
    pushl %ecx
    pushl %edx
    pushl %esi
    pushl %edi
    pushl %ebp

    movl 28(%esp), %ecx
    testl %ecx, %ecx
    jz crtblockend

    crtloop:

    decl %ecx
    movl 32(%esp), %eax
    movl 36(%esp), %ebx
    movl 40(%esp), %edx

    fildl t0; fildl (%eax, %ecx, 4)
    fildl t1; fxch %st(2)
    fmulp %st, %st(1); fxch %st(1)
    fildl (%ebx, %ecx, 4); fildl t2; fxch %st(2)
    fmulp %st, %st(1); fxch %st(1)
    fildl (%edx, %ecx, 4); fmulp %st, %st(1)
    fld %st(2); fmull imodulus0
    fld %st(2); fmull imodulus1
    fld %st(2); fmull imodulus2; fxch %st(2)
    fadds chopper64; fxch %st(1)
    fadds chopper64; fxch %st(2)
    fadds chopper64; fxch %st(1)
    fsubs chopper64; fxch %st(2)
    fsubs chopper64; fxch %st(1)
    fsubs chopper64; fxch %st(2)
    fmull dmodulus0; fxch %st(1)
    fmull dmodulus1; fxch %st(1)
    fsubrp %st, %st(5); fxch %st(1)
    fmull dmodulus2; fxch %st(1)
    fsubrp %st, %st(3)
    fsubrp %st, %st(1); fxch %st(2)
    fadds chopper53; fxch %st(1)
    fadds chopper53; fxch %st(2)
    fadds chopper53; fxch %st(1)
    fstpl dtmp0; fxch %st(1)
    fstpl dtmp1
    fstpl dtmp2

    movl m12, %eax
    mull dtmp0
    movl %eax, %ebx
    movl %edx, %ebp
    movl m12+4, %eax
    mull dtmp0
    xorl %edi, %edi
    addl %eax, %ebp
    adcl %edx, %edi

    movl m02, %eax
    mull dtmp1
    addl %eax, %ebx
    adcl %edx, %ebp
    adcl $0, %edi
    movl m02+4, %eax
    mull dtmp1
    addl %eax, %ebp
    adcl %edx, %edi

    movl m01, %eax
    mull dtmp2
    addl %eax, %ebx
    adcl %edx, %ebp
    adcl $0, %edi
    movl m01+4, %eax
    mull dtmp2
    addl %eax, %ebp
    adcl %edx, %edi

    movl mm, %esi
    movl mm+4, %eax
    movl mm+8, %edx

    crtsubloop:
    cmpl %edx, %edi
    ja crtsub
    jb crtnosub
    cmpl %eax, %ebp
    ja crtsub
    jb crtnosub
    cmpl %esi, %ebx
    jb crtnosub

    crtsub:
    subl %esi, %ebx
    sbbl %eax, %ebp
    sbbl %edx, %edi
    jmp crtsubloop

    crtnosub:

    movl cc, %esi
    movl cc+4, %eax
    addl %ebx, %esi
    adcl %ebp, %eax
    movl cc+8, %edx
    adcl %edi, %edx

    divl Base
    movl $0, cc+8
    movl %eax, cc+4
    movl %esi, %eax
    movl 32(%esp), %ebx
    divl Base
    movl %eax, cc
    movl %edx, (%ebx, %ecx, 4)

    testl %ecx, %ecx
    jnz crtloop

    crtblockend:

    popl %ebp
    popl %edi
    popl %esi
    popl %edx
    popl %ecx
    popl %ebx

    ret");


// Carry propagation and Chinese Remainder Theorem step for fnt multiplication
// (and squaring).
//
// ds1, s2, s3 - the three operands, one per modulus; the combined result is
//               written back to ds1, while s2 and s3 are only read
// rsize       - number of elements to run through crtblock
//
// Returns 1 if a carry was left over and the contents of ds1 were shifted right
// by one element to make room for it, 0 otherwise.
// Assume that ds1 will be in memory if possible

int carrycrt (apstruct *ds1, apstruct *s2, apstruct *s3, size_t rsize)  // Low to high
{
    size_t remaining, count, offset, i;
    modint *data1, *data2, *data3;
    rawtype reduced0, reduced1, incoming, displaced;

    // Start with an empty carry
    cc[0] = 0;
    cc[1] = 0;
    cc[2] = 0;

    // Floating-point copies of the moduli and their reciprocals for crtblock
    dmodulus0 = (double) moduli[0];
    imodulus0 = 1.0 / (double) moduli[0];

    dmodulus1 = (double) moduli[1];
    imodulus1 = 1.0 / (double) moduli[1];

    dmodulus2 = (double) moduli[2];
    imodulus2 = 1.0 / (double) moduli[2];

    // t0 = 1 / (moduli[1] * moduli[2]) modulo moduli[0]
    setmodulus (moduli[0]);
    t0 = modint (1) / (modint (moduli[1]) * moduli[2]);

    // moduli[0] is larger than moduli[1], so it is reduced before use
    setmodulus (moduli[1]);
    for (reduced0 = moduli[0]; reduced0 >= modint::modulus; reduced0 -= modint::modulus)
        ;
    t1 = modint (1) / (modint (reduced0) * moduli[2]);

    // moduli[0] and moduli[1] are both larger than moduli[2], so both are reduced
    setmodulus (moduli[2]);
    for (reduced0 = moduli[0]; reduced0 >= modint::modulus; reduced0 -= modint::modulus)
        ;
    for (reduced1 = moduli[1]; reduced1 >= modint::modulus; reduced1 -= modint::modulus)
        ;
    t2 = modint (1) / (modint (reduced0) * reduced1);

    // Pairwise products of the moduli, then the product of all three
    m01[0] = moduli[0];
    m01[1] = bigmul (m01, m01, moduli[1], 1);

    m02[0] = moduli[0];
    m02[1] = bigmul (m02, m02, moduli[2], 1);

    m12[0] = moduli[1];
    m12[1] = bigmul (m12, m12, moduli[2], 1);

    mm[2] = bigmul (mm, m01, moduli[2], 2);

    // Run the CRT kernel block by block, from the end of the data to the start
    for (remaining = rsize; remaining != 0; )
    {
        if (remaining > Blocksize)
            count = Blocksize;
        else
            count = remaining;
        remaining -= count;

        data1 = ds1->getdata (remaining, count);
        data2 = s2->getdata (remaining, count);
        data3 = s3->getdata (remaining, count);

        crtblock (count, data1, data2, data3);

        s3->cleardata ();
        s2->cleardata ();
        ds1->putdata ();
    }

    // No carry left over: nothing has to move
    if (cc[0] == 0)
        return 0;

    // Shift every element of ds1 one position towards the end and put the
    // carry in the first position; the last element drops out
    incoming = cc[0];
    offset = 0;

    for (remaining = ds1->size; remaining != 0; remaining -= count)
    {
        if (remaining > Maxblocksize)
            count = Maxblocksize;
        else
            count = remaining;

        data1 = ds1->getdata (offset, count);
        offset += count;

        for (i = 0; i < count; i++)
        {
            displaced = data1[i];
            data1[i] = incoming;
            incoming = displaced;
        }

        ds1->putdata ();
    }

    return 1;
}
