F64 F64_QUADRANT_CONSTANT_ARRAY[2] = {0, pi};

#define F64_1                   0x3FF0000000000000
#define F64_1_OVER_2_FACTORIAL  0x3FE0000000000000  // 1/2!
#define F64_1_OVER_4_FACTORIAL  0x3FA5555555555555  // 1/4!
#define F64_1_OVER_6_FACTORIAL  0x3F56C16C16C16C17  // 1/6!
#define F64_1_OVER_8_FACTORIAL  0x3EFA01A01A01A01A  // 1/8!
#define F64_1_OVER_10_FACTORIAL 0x3E927E4FB7789F5C  // 1/10!
#define F64_1_OVER_12_FACTORIAL 0x3E21EED8EFF8D898  // 1/12!
#define F64_1_OVER_14_FACTORIAL 0x3DA93974A8C07C9D  // 1/14!

#define F64_2_OVER_PI   0x3FE45F306DC9C883
#define F64_PI_OVER_2   0x3FF921FB54442D18

asm
{
_F32_COS::
    PUSH        RBP
    MOV         RBP, RSP
    PUSH        RBX

    CVTSS2SD    XMM0, SF_ARG1[RBP]//XMM0: theta

/*
    I64 quadrant = theta * (2.0 / pi);
*/

    MOV         RAX, F64_2_OVER_PI
    MOVQ        XMM1, RAX       //  XMM1: 2.0 / Pi
    MULSD       XMM1, XMM0      //  XMM1: (2.0 / pi) * theta
    CVTSD2SI    RBX, XMM1       //  RBX : quadrant = (I64) ((2.0 / pi) * theta)

/*
    theta = theta - (F32) quadrant * (pi / 2.0);
*/

    CVTSI2SD    XMM1, RBX       //  XMM1: (F32) quadrant
    MOV         RAX, F64_PI_OVER_2
    MOVQ        XMM2, RAX       //  XMM2: pi / 2.0
    MULSD       XMM2, XMM1      //  XMM2: (pi / 2.0) * (F32) quadrant
    SUBSD       XMM0, XMM2      //  XMM0: theta = theta - ((pi / 2.0) * (F32) quadrant)

/*
    quadrant += 1;
    theta = gTrigQuadrantConstants[(quadrant >> 1) & 1] - theta;
*/

    INC         RBX             //  RBX : quadrant += 1
    SHR         RBX, 1          //  RBX : quadrant >> 1
    AND         RBX, 1          //  RBX : (quadrant >> 1) & 1
    SHL         RBX, 3          //  RBX : ((quadrant >> 1) & 1) * sizeof(F64)
    MOV         RAX, &F64_QUADRANT_CONSTANT_ARRAY
    ADD         RBX, RAX        //  RBX : constants + ((quadrant >> 1) & 1)
    MOVQ        XMM1, [RBX]     //  XMM1: constants[(quadrant >> 1) & 1]
    SUBSD       XMM1, XMM0      //  XMM1: theta = constants[(quadrant >> 1) & 1] - theta

/*
    theta2 = -(theta * theta);
*/

    MULSD       XMM1, XMM1      //  XMM1: theta * theta
    XORPS       XMM0, XMM0      //  XMM0: {0.0, 0.0}
    SUBSD       XMM0, XMM1      //  XMM0: theta2 = -(theta * theta)

/*
    F64 r = (1/14!) * theta2;
*/

    MOV         RAX, F64_1_OVER_14_FACTORIAL
    MOVQ        XMM1, RAX
    MULSD       XMM1, XMM0

/*
    r += (1/n!);
    r *= theta2;
*/

    MOV         RAX, F64_1_OVER_12_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/6!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/6!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

    MOV         RAX, F64_1_OVER_10_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/4!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/4!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

    MOV         RAX, F64_1_OVER_8_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/2!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/2!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

    MOV         RAX, F64_1_OVER_6_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/2!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/2!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

    MOV         RAX, F64_1_OVER_4_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/2!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/2!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

    MOV         RAX, F64_1_OVER_2_FACTORIAL
    MOVQ        XMM2, RAX       //  XMM2: (1/2!)
    ADDSD       XMM1, XMM2      //  XMM1: r += (1/2!)
    MULSD       XMM1, XMM0      //  XMM1: r *= theta

/*
    return (F32) (r + 1.0)
*/

    MOV         RAX, F64_1
    MOVQ        XMM2, RAX       //  XMM2: 1.0
    ADDSD       XMM1, XMM2      //  XMM1: r += 1.0
    CVTSD2SS    XMM0, XMM1      //  XMM0: (F32) r
    MOVQ        RAX, XMM0

    POP         RBX
    POP         RBP
    RET1        8
}
/**
    @ingroup Math
    @brief Calculate single precision cosine.

    Note that it isn't correct for all values of theta yet.

    Based on the paper:
    Fast Trigonometric Functions using Intel's SSE2 Instructions. 2003.
    L. Nyland, M. Snyder 

    @param[in] theta    Angle in radians.
    @return             sin(theta).
*/
_extern _F32_COS F32 CosF32(F32 theta);