/**
    @ingroup Math
    @brief Print 4x4 matrix.

    Calls Print four times, once per loop iteration i. Each call receives the
    four F32 elements found at byte offsets 4*i, 4*i+16, 4*i+32 and 4*i+48
    from m, each widened to F64 first.

    @param[in] m    4x4 matrix.
*/
U0 Matrix4DPrint(CMatrix4D *m)
{   
    // Format string pointer lives in R14 so the asm block below can push it.
    U8 reg R14 str = "%n\t%n\t%n\t%n\n\n";
asm
{
        PUSH        R15                 // R15 is used as the loop counter; restored at the end.

        XOR         R15, R15            // = i = 0
//      for (R15 = 0; R15 < 4; R15++)
@@05:
        MOV         RAX, R15            // = i
        SHL         RAX, 2              // = i * 4
        ADD         RAX, SF_ARG1[RBP]   // = m + i * 4 (byte address of element i)

        SUB         RSP, 32             // Reserve four 8-byte slots for the F64 arguments.

        // Convert each F32 to F64 and store it in its argument slot.
        // The highest slot receives the element at the largest offset.
        CVTSS2SD    XMM0, 48[RAX]
        MOVSD_SSE   24[RSP], XMM0

        CVTSS2SD    XMM0, 32[RAX]
        MOVSD_SSE   16[RSP], XMM0

        CVTSS2SD    XMM0, 16[RAX]
        MOVSD_SSE   8[RSP], XMM0

        CVTSS2SD    XMM0, [RAX]
        MOVSD_SSE   [RSP], XMM0

        PUSH        4                   // Number of variadic arguments.
        PUSH        R14                 // Format string.
        CALL        &Print
        ADD         RSP, 48             // Drop 32 bytes of values + 16 bytes of count/format.

        INC         R15
        CMP         R15, 4
        JNE         @@05

        POP         R15
}
}

/**
    @ingroup Math
    @brief Check if two 4x4 matrices are equal.

    Compares the matrices one vec entry at a time with Vector4DIsEqual.

    @param[in] a    Matrix 1
    @param[in] b    Matrix 2
    @return TRUE when Vector4DIsEqual reports a match for all four vec
            entries, otherwise FALSE.
*/
Bool Matrix4DIsEqual(CMatrix4D *a, CMatrix4D *b)
{
    I64 i,
        total = 0;  // Sum of the per-vector comparison results.

    for (i = 0; i < 4; i++)
        total += Vector4DIsEqual(&a->vec[i], &b->vec[i]);

    // Equal only if every one of the four comparisons contributed 1.
    return total == 4;
}

asm
{
// dest = m * v.
// Each 16-byte group of m is scaled by one broadcast element of v and the
// four scaled groups are summed, i.e. the groups are treated as columns.
// m, v and dest are accessed with MOVAPS, so all three must be 16-byte
// aligned. v is fully loaded before dest is written, so dest may be v.
// Uses RAX and XMM0-XMM7 without preserving them.
_MATRIX_4D_MUL_VECTOR_4D::
    PUSH        RBP
    MOV         RBP, RSP

    // Load v and broadcast each of its four elements across its own register.
    MOV         RAX, SF_ARG2[RBP]
    MOVAPS      XMM0, [RAX]
    MOVAPS      XMM1, XMM0
    MOVAPS      XMM2, XMM0
    MOVAPS      XMM3, XMM0
    SHUFPS      XMM0, XMM0, 0x00    // (0, 0, 0, 0)
    SHUFPS      XMM1, XMM1, 0x55    // (1, 1, 1, 1)
    SHUFPS      XMM2, XMM2, 0xAA    // (2, 2, 2, 2)
    SHUFPS      XMM3, XMM3, 0xFF    // (3, 3, 3, 3)

    // Load the four 16-byte groups of m.
    MOV         RAX, SF_ARG1[RBP]
    MOVAPS      XMM4, [RAX]
    MOVAPS      XMM5, 16[RAX]
    MOVAPS      XMM6, 32[RAX]
    MOVAPS      XMM7, 48[RAX]

    // Scale group i by element i of v.
    MULPS       XMM4, XMM0
    MULPS       XMM5, XMM1
    MULPS       XMM6, XMM2
    MULPS       XMM7, XMM3

    // Pairwise sum of the four scaled groups into XMM4.
    ADDPS       XMM4, XMM5
    ADDPS       XMM6, XMM7
    ADDPS       XMM4, XMM6

    MOV         RAX, SF_ARG3[RBP]
    MOVAPS      [RAX], XMM4
    
    POP         RBP
    RET1        24                  // Return and remove the three 8-byte arguments.
}
/**
    @ingroup Math
    @brief Multiply 4x4 matrix by 4D vector.

    All three pointers must be 16-byte aligned (the implementation uses
    MOVAPS). dest may be the same vector as v.

    @param[in] m        4x4 matrix.
    @param[in] v        4D vector.
    @param[in,out] dest Destination 4D vector.
*/
_extern _MATRIX_4D_MUL_VECTOR_4D U0 Matrix4DMulVector4D(CMatrix4D *m, CVector4D *v, CVector4D *dest);

asm
{
// dest = a * b.
// The four 16-byte groups of a are kept in XMM4-XMM7 for the whole loop.
// Each iteration reads one 16-byte group of b, forms the sum of a's groups
// weighted by that group's four elements, and stores the result to the
// matching 16-byte group of dest.
// a, b and dest are accessed with MOVAPS, so all must be 16-byte aligned.
// dest may be a (a is loaded up front) or b (each group of b is read before
// the same group of dest is written).
// Uses RAX and XMM0-XMM7 without preserving them; R14 and R15 are saved.
_MATRIX_4D_MUL::
    PUSH        RBP
    MOV         RBP, RSP
    PUSH        R14
    PUSH        R15

    MOV         RAX, SF_ARG1[RBP]   // = a
    MOV         R14, SF_ARG2[RBP]   // = b, advanced 16 bytes per iteration
    MOV         R15, SF_ARG3[RBP]   // = dest, advanced 16 bytes per iteration

    MOVAPS      XMM4, [RAX]
    MOVAPS      XMM5, 16[RAX]
    MOVAPS      XMM6, 32[RAX]
    MOVAPS      XMM7, 48[RAX]

    MOV         RAX, 4  // = i = 4
@@05:
    // Broadcast each element of the current group of b.
    MOVAPS      XMM0, [R14]
    MOVAPS      XMM1, XMM0
    MOVAPS      XMM2, XMM0
    MOVAPS      XMM3, XMM0
    SHUFPS      XMM0, XMM0, 0x00    // (0, 0, 0, 0)
    SHUFPS      XMM1, XMM1, 0x55    // (1, 1, 1, 1)
    SHUFPS      XMM2, XMM2, 0xAA    // (2, 2, 2, 2)
    SHUFPS      XMM3, XMM3, 0xFF    // (3, 3, 3, 3)

    // Weight group k of a by element k of the current group of b.
    MULPS       XMM0, XMM4
    MULPS       XMM1, XMM5
    MULPS       XMM2, XMM6
    MULPS       XMM3, XMM7

    // Pairwise sum into XMM0.
    ADDPS       XMM0, XMM1
    ADDPS       XMM2, XMM3
    ADDPS       XMM0, XMM2

    MOVAPS      [R15], XMM0

    ADD         R14, 16
    ADD         R15, 16

    DEC         RAX                 // Counts down 4..1; loop ends when it reaches 0.
    JNZ         @@05

    POP         R15
    POP         R14
    POP         RBP
    RET1        24                  // Return and remove the three 8-byte arguments.
}
/**
    @ingroup Math
    @brief Multiply 4x4 matrix by 4x4 matrix.

    All three pointers must be 16-byte aligned (the implementation uses
    MOVAPS). dest may be the same matrix as a or as b.

    @param[in]     a    4x4 matrix.
    @param[in]     b    4x4 matrix.
    @param[in,out] dest Destination 4x4 matrix.
*/
_extern _MATRIX_4D_MUL U0 Matrix4DMul(CMatrix4D *a, CMatrix4D *b, CMatrix4D *dest);

asm
{
// dest = transpose(m).
// Naming the four 16-byte groups of m as A, B, C, D (in memory order), the
// groups written to dest are (A0,B0,C0,D0), (A1,B1,C1,D1), (A2,B2,C2,D2),
// (A3,B3,C3,D3).
// m and dest are accessed with MOVAPS, so both must be 16-byte aligned.
// All of m is loaded before dest is written, so dest may be m.
// Uses RAX and XMM0-XMM5 without preserving them.
_MATRIX_4D_TRANSPOSE::
    PUSH        RBP
    MOV         RBP, RSP

    MOV         RAX, SF_ARG1[RBP]
    MOVAPS      XMM0, [RAX]         // A
    MOVAPS      XMM1, XMM0          // A (copy)
    MOVAPS      XMM2, 16[RAX]       // B

    MOVAPS      XMM3, 32[RAX]       // C
    MOVAPS      XMM4, XMM3          // C (copy)
    MOVAPS      XMM5, 48[RAX]       // D

    // Stage 1: interleave the low and high halves of neighbouring groups.
    SHUFPS      XMM0, XMM2, 0x44    // (0, 1, 0, 1)  TMP 0 = A0 A1 B0 B1
    SHUFPS      XMM1, XMM2, 0xEE    // (2, 3, 2, 3)  TMP 2 = A2 A3 B2 B3
    SHUFPS      XMM3, XMM5, 0x44    // (0, 1, 0, 1)  TMP 1 = C0 C1 D0 D1
    SHUFPS      XMM4, XMM5, 0xEE    // (2, 3, 2, 3)  TMP 3 = C2 C3 D2 D3

    // Keep copies: each TMP 0 / TMP 2 feeds two of the stage-2 shuffles.
    MOVAPS      XMM2, XMM0          // XMM2 = TMP 0
    MOVAPS      XMM5, XMM1          // XMM5 = TMP 2

    // Stage 2: pick even or odd lanes to finish each output group.
    SHUFPS      XMM0, XMM3, 0x88    // (0, 2, 0, 2)  = A0 B0 C0 D0
    SHUFPS      XMM2, XMM3, 0xDD    // (1, 3, 1, 3)  = A1 B1 C1 D1
    SHUFPS      XMM1, XMM4, 0x88    // (0, 2, 0, 2)  = A2 B2 C2 D2
    SHUFPS      XMM5, XMM4, 0xDD    // (1, 3, 1, 3)  = A3 B3 C3 D3

    MOV         RAX, SF_ARG2[RBP]
    MOVAPS      [RAX], XMM0
    MOVAPS      16[RAX], XMM2
    MOVAPS      32[RAX], XMM1
    MOVAPS      48[RAX], XMM5

    POP         RBP
    RET1        16                  // Return and remove the two 8-byte arguments.
}
/**
    @ingroup Math
    @brief Transpose 4x4 matrix.

    Both pointers must be 16-byte aligned (the implementation uses MOVAPS).
    dest may be the same matrix as m.

    @param[in]     m    4x4 matrix.
    @param[in,out] dest Destination 4x4 matrix.
*/
_extern _MATRIX_4D_TRANSPOSE U0 Matrix4DTranspose(CMatrix4D *m, CMatrix4D *dest);

/**
    @ingroup Math
    @brief Clear 4x4 matrix and set it to translation transformation.

    Every one of the 16 elements is overwritten:

        .1  0  0  x.
        .0  1  0  y.
        .0  0  1  z.
        .0  0  0  1.

    @param[in]     x    X translation.
    @param[in]     y    Y translation.
    @param[in]     z    Z translation.
    @param[in,out] dest Destination 4x4 matrix.
*/
U0 Matrix4DTranslationSet(F32 x, F32 y, F32 z, CMatrix4D *dest)
{
    // Row 0
    dest->e[MAT4_00] = F32_ONE;
    dest->e[MAT4_01] = F32_ZERO;
    dest->e[MAT4_02] = F32_ZERO;
    dest->e[MAT4_03] = x;

    // Row 1
    dest->e[MAT4_10] = F32_ZERO;
    dest->e[MAT4_11] = F32_ONE;
    dest->e[MAT4_12] = F32_ZERO;
    dest->e[MAT4_13] = y;

    // Row 2
    dest->e[MAT4_20] = F32_ZERO;
    dest->e[MAT4_21] = F32_ZERO;
    dest->e[MAT4_22] = F32_ONE;
    dest->e[MAT4_23] = z;

    // Row 3
    dest->e[MAT4_30] = F32_ZERO;
    dest->e[MAT4_31] = F32_ZERO;
    dest->e[MAT4_32] = F32_ZERO;
    dest->e[MAT4_33] = F32_ONE;
}

/**
    @ingroup Math
    @brief Clear 4x4 matrix and set it to scale transformation.

    Every one of the 16 elements is overwritten:

        .x  0  0  0.
        .0  y  0  0.
        .0  0  z  0.
        .0  0  0  1.

    @param[in]     x    X scale.
    @param[in]     y    Y scale.
    @param[in]     z    Z scale.
    @param[in,out] dest Destination 4x4 matrix.
*/
U0 Matrix4DScaleSet(F32 x, F32 y, F32 z, CMatrix4D *dest)
{
    // Row 0
    dest->e[MAT4_00] = x;
    dest->e[MAT4_01] = F32_ZERO;
    dest->e[MAT4_02] = F32_ZERO;
    dest->e[MAT4_03] = F32_ZERO;

    // Row 1
    dest->e[MAT4_10] = F32_ZERO;
    dest->e[MAT4_11] = y;
    dest->e[MAT4_12] = F32_ZERO;
    dest->e[MAT4_13] = F32_ZERO;

    // Row 2
    dest->e[MAT4_20] = F32_ZERO;
    dest->e[MAT4_21] = F32_ZERO;
    dest->e[MAT4_22] = z;
    dest->e[MAT4_23] = F32_ZERO;

    // Row 3
    dest->e[MAT4_30] = F32_ZERO;
    dest->e[MAT4_31] = F32_ZERO;
    dest->e[MAT4_32] = F32_ZERO;
    dest->e[MAT4_33] = F32_ONE;
}

/**
    @ingroup Math
    @brief Clear 4x4 matrix and set it to a look-at transform.

             .              .   .            .   .                    .
             .Rx  Ry  Rz  0 .   .1  0  0  -Ex.   .Rx  Ry  Rz  -(R . E).
    LookAt = .Ux  Uy  Uz  0 . * .0  1  0  -Ey. = .Ux  Uy  Uz  -(U . E).
             .Fx  Fy  Fz  0 .   .0  0  1  -Ez.   .Fx  Fy  Fz  -(F . E).
             .0   0   0   1 .   .0  0  0  1  .   .0   0   0   1       .
             .              .   .            .   .                    .
    Where:
    .R is the eye right direction.
    .U is the eye up direction.
    .F is the eye forward direction (away from target towards viewer).
    .E is the position of the eye.

    The gVec4Temp0, gVec4Temp1 and gVec4Temp2 globals are overwritten as
    scratch space.

    @param[in]     eye      Position of eye.
    @param[in]     center   Position to look at.
    @param[in]     up       Up direction vector. Can be general like an axis,
                            does not need to be orthogonal to the viewing
                            direction.
    @param[in,out] dest     Destination 4x4 matrix.
*/
U0 Matrix4DLookAtSet(CVector3D *eye, CVector3D *center, CVector3D *up, CMatrix4D *dest)
{
    // gVec4Temp0: normalized direction from eye toward center.
    Vector3DSub(center, eye, gVec4Temp0);
    Vector3DNormalize(gVec4Temp0, gVec4Temp0);

    // gVec4Temp1: normalized right vector.
    Vector3DCross(gVec4Temp0, up, gVec4Temp1);
    Vector3DNormalize(gVec4Temp1, gVec4Temp1);

    // gVec4Temp2: corrected up vector.
    Vector3DCross(gVec4Temp1, gVec4Temp0, gVec4Temp2);

    // Translation column, computed with dot products instead of a 4x4
    // multiply. The third entry uses the toward-center direction, which is
    // -F, so no sign flip is applied there.
    // NOTE(review): F32_NEGATE_MASK is presumably the F32 sign bit - confirm.
    dest->e[MAT4_03] = Vector3DDot(gVec4Temp1, eye) ^ F32_NEGATE_MASK;
    dest->e[MAT4_13] = Vector3DDot(gVec4Temp2, eye) ^ F32_NEGATE_MASK;
    dest->e[MAT4_23] = Vector3DDot(gVec4Temp0, eye);

    // From here on gVec4Temp0 holds F (pointing away from the target).
    Vector3DNegate(gVec4Temp0, gVec4Temp0);

    // Row 0: right
    dest->e[MAT4_00] = gVec4Temp1->x;
    dest->e[MAT4_01] = gVec4Temp1->y;
    dest->e[MAT4_02] = gVec4Temp1->z;

    // Row 1: up
    dest->e[MAT4_10] = gVec4Temp2->x;
    dest->e[MAT4_11] = gVec4Temp2->y;
    dest->e[MAT4_12] = gVec4Temp2->z;

    // Row 2: forward
    dest->e[MAT4_20] = gVec4Temp0->x;
    dest->e[MAT4_21] = gVec4Temp0->y;
    dest->e[MAT4_22] = gVec4Temp0->z;

    // Row 3
    dest->e[MAT4_30] = F32_ZERO;
    dest->e[MAT4_31] = F32_ZERO;
    dest->e[MAT4_32] = F32_ZERO;
    dest->e[MAT4_33] = F32_ONE;
}