#include "ap.h"

// CRT multipliers, one per modulus, filled in by carrycrt():
//   t0 = modint (1) / (moduli[1] * moduli[2])  evaluated with modulus moduli[0]
//   t1 = modint (1) / (moduli[0] * moduli[2])  evaluated with modulus moduli[1]
//   t2 = modint (1) / (moduli[0] * moduli[1])  evaluated with modulus moduli[2]
// (presumably the modular inverses of those products -- depends on modint's operator/)
modint t0, t1, t2;
// Multiword constants and work areas shared between carrycrt() and crtblock.
// All are stored least significant word first.
//   m01, m02, m12 : two-word products moduli[0]*moduli[1], moduli[0]*moduli[2],
//                   moduli[1]*moduli[2] (built with bigmul in carrycrt)
//   mm            : three-word product of all three moduli
//   cs            : three-word scratch sum used inside crtblock
//   cc            : three-word running carry; zeroed by carrycrt, updated by crtblock
rawtype m01[2], m02[2], m12[2], mm[3], cs[3], cc[3];

// Assembly routine defined below; C linkage so the symbol name is exactly "crtblock".
// Processes l elements of buf1/buf2/buf3 and writes the results back into buf1.
extern "C" void crtblock (size_t l, modint *buf1, modint *buf2, modint *buf3);

// crtblock (l, buf1, buf2, buf3) -- i386 assembly, stack-passed arguments.
//
// Walks the index i from l - 1 down to 0 and for every element:
//   1. y0 = (t0 * buf1[i]) mod moduli[0];  cs  = y0 * m12            (three words)
//   2. y1 = (t1 * buf2[i]) mod moduli[1];  cs += y1 * m02;  if (cs >= mm) cs -= mm
//   3. y2 = (t2 * buf3[i]) mod moduli[2];  sum = cs + y2 * m01;  if (sum >= mm) sum -= mm
//   4. sum += cc (running carry); the three-word sum is divided by Base in two
//      divl steps; the quotient becomes the new cc (cc[2] is set to 0) and the
//      remainder is stored into buf1[i].
//
// Six registers are pushed on entry (24 bytes) on top of the return address,
// so the arguments are found at 28(%esp) = l, 32(%esp) = buf1,
// 36(%esp) = buf2 and 40(%esp) = buf3.
//
// The "sum >= mm" tests compare the words from most to least significant.
// The high and middle words must branch with ja / jb so that equal words fall
// through to the next lower word; only the lowest word treats equality as
// "subtract". Using jae on the upper words would subtract mm whenever the top
// word is equal, even if the sum is smaller than mm, and the 96-bit result
// would wrap around.

asm ("
    .globl crtblock

.align 16
crtblock:
    pushl %ebx
    pushl %ecx
    pushl %edx
    pushl %esi
    pushl %edi
    pushl %ebp

    /* ecx = l, nothing to do when l is zero */
    movl 28(%esp), %ecx
    andl %ecx, %ecx
    jz crtblockend

    crtloop:

    /* step 1: ebx = t0 * buf1[i] mod moduli[0] */
    decl %ecx
    movl 32(%esp), %ebx
    movl t0, %eax
    mull (%ebx, %ecx, 4)
    divl moduli
    movl %edx, %ebx

    /* cs = ebx * m12, three words */
    movl m12, %eax
    mull %ebx
    movl %eax, cs
    movl %edx, %esi
    movl m12+4, %eax
    mull %ebx
    add %esi, %eax
    adc $0, %edx
    movl %eax, cs+4
    movl %edx, cs+8

    /* step 2: ebx = t1 * buf2[i] mod moduli[1] */
    movl 36(%esp), %ebx
    movl t1, %eax
    mull (%ebx, %ecx, 4)
    divl moduli+4
    movl %edx, %ebx

    /* edx:eax:edi = ebx * m02 */
    movl m02, %eax
    mull %ebx
    movl %eax, %edi
    movl %edx, %esi
    movl m02+4, %eax
    mull %ebx
    add %esi, %eax
    adc $0, %edx

    /* edi:ebp:ebx = cs + product, edx:eax:esi = mm */
    movl cs, %ebx
    movl cs+4, %ebp
    addl %edi, %ebx
    adcl %eax, %ebp
    movl cs+8, %edi
    adcl %edx, %edi
    movl mm, %esi
    movl mm+4, %eax
    movl mm+8, %edx

    /* subtract mm if the sum is >= mm, comparing from the high word down */
    /* the movl between ja and jb does not change the flags */
    cmpl %edx, %edi
    ja crtsub1
    movl %ebx, cs
    jb crtnosub1
    cmpl %eax, %ebp
    ja crtsub1
    jb crtnosub1
    cmpl %esi, %ebx
    jb crtnosub1

    crtsub1:
    subl %esi, %ebx
    sbbl %eax, %ebp
    movl %ebx, cs
    sbbl %edx, %edi

    crtnosub1:
    movl %ebp, cs+4
    movl %edi, cs+8

    /* step 3: ebx = t2 * buf3[i] mod moduli[2] */
    movl 40(%esp), %ebx
    movl t2, %eax
    mull (%ebx, %ecx, 4)
    divl moduli+8
    movl %edx, %ebx

    /* edx:eax:edi = ebx * m01 */
    movl m01, %eax
    mull %ebx
    movl %eax, %edi
    movl %edx, %esi
    movl m01+4, %eax
    mull %ebx
    add %esi, %eax
    adc $0, %edx

    /* edi:ebp:ebx = cs + product, edx:eax:esi = mm */
    movl cs, %ebx
    movl cs+4, %ebp
    addl %edi, %ebx
    adcl %eax, %ebp
    movl cs+8, %edi
    adcl %edx, %edi
    movl mm, %esi
    movl mm+4, %eax
    movl mm+8, %edx

    /* subtract mm if the sum is >= mm, comparing from the high word down */
    cmpl %edx, %edi
    ja crtsub2
    jb crtnosub2
    cmpl %eax, %ebp
    ja crtsub2
    jb crtnosub2
    cmpl %esi, %ebx
    jb crtnosub2

    crtsub2:
    subl %esi, %ebx
    sbbl %eax, %ebp
    sbbl %edx, %edi

    crtnosub2:

    /* step 4: edx:eax:esi = sum + cc */
    movl cc, %esi
    movl cc+4, %eax
    addl %ebx, %esi
    adcl %ebp, %eax
    movl cc+8, %edx
    adcl %edi, %edx

    /* two-step division by Base: quotient goes to cc, remainder to buf1[i] */
    divl Base
    movl $0, cc+8
    movl %eax, cc+4
    movl %esi, %eax
    movl 32(%esp), %ebx
    divl Base
    movl %eax, cc
    movl %edx, (%ebx, %ecx, 4)

    /* loop until index 0 has been processed */
    andl %ecx, %ecx
    jnz crtloop

    crtblockend:

    popl %ebp
    popl %edi
    popl %esi
    popl %edx
    popl %ecx
    popl %ebx

    ret");


// Carry propagation and Chinese Remainder Theorem step for
// fnt-multiplication (and squaring).
//
// ds1, s2, s3 hold the three per-modulus results; the first rsize elements
// are combined block by block with crtblock, starting from the highest index
// and working down to index 0. The combined result is written back to ds1.
//
// If a carry is left over after index 0, all ds1->size elements of ds1 are
// shifted one position towards higher indexes, the carry is stored at index 0
// (the element shifted out at the end is dropped), and 1 is returned.
// Otherwise 0 is returned.
//
// Assume that ds1 will be in memory if possible

int carrycrt (apstruct *ds1, apstruct *s2, apstruct *s3, size_t rsize)  // Low to high
{
    size_t remaining, len, pos, k;
    modint *d1, *d2, *d3;
    rawtype incoming, saved, r0, r1;

    // Start with an empty running carry
    cc[0] = 0;
    cc[1] = 0;
    cc[2] = 0;

    // Multiplier for the first modulus
    setmodulus (moduli[0]);
    t0 = modint (1) / (modint (moduli[1]) * moduli[2]);

    // moduli[0] is larger than moduli[1], so bring it into range first
    setmodulus (moduli[1]);
    for (r0 = moduli[0]; r0 >= modint::modulus; )
    {
        r0 -= modint::modulus;
    }
    t1 = modint (1) / (modint (r0) * moduli[2]);

    // moduli[0] and moduli[1] are both larger than moduli[2]
    setmodulus (moduli[2]);
    for (r0 = moduli[0]; r0 >= modint::modulus; )
    {
        r0 -= modint::modulus;
    }
    for (r1 = moduli[1]; r1 >= modint::modulus; )
    {
        r1 -= modint::modulus;
    }
    t2 = modint (1) / (modint (r0) * r1);

    // Two-word pairwise products of the moduli
    m01[0] = moduli[0];
    m01[1] = bigmul (m01, m01, moduli[1], 1);

    m02[0] = moduli[0];
    m02[1] = bigmul (m02, m02, moduli[2], 1);

    m12[0] = moduli[1];
    m12[1] = bigmul (m12, m12, moduli[2], 1);

    // Three-word product of all the moduli
    mm[2] = bigmul (mm, m01, moduli[2], 2);

    // Combine the residues one block at a time, highest block first
    for (remaining = rsize; remaining; )
    {
        if (remaining < Blocksize)
        {
            len = remaining;
        }
        else
        {
            len = Blocksize;
        }
        remaining -= len;

        d1 = ds1->getdata (remaining, len);
        d2 = s2->getdata (remaining, len);
        d3 = s3->getdata (remaining, len);

        crtblock (len, d1, d2, d3);

        s3->cleardata ();
        s2->cleardata ();
        ds1->putdata ();
    }

    incoming = cc[0];

    // No carry out of the top element: nothing to shift
    if (incoming == 0)
    {
        return 0;
    }

    // Shift everything up by one element and put the carry at index 0
    pos = 0;
    for (remaining = ds1->size; remaining; remaining -= len)
    {
        len = (remaining < Maxblocksize ? remaining : Maxblocksize);
        d1 = ds1->getdata (pos, len);
        pos += len;

        for (k = 0; k < len; k++)
        {
            saved = d1[k];
            d1[k] = incoming;
            incoming = saved;
        }

        ds1->putdata ();
    }

    return 1;
}
