TexConv/CMP_CompressonatorLib/DXTC/dxtc_v11_compress_asm.c
//===============================================================================
// Copyright (c) 2007-2016 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2004-2006 ATI Technologies Inc.
//===============================================================================
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//
#include "dxtc_v11_compress.h"
#if defined(_WIN32)
// Raises priority of G at expense of B - seems slightly better than no munging
#define AXIS_MUNGE
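// As inferred from the assembly below (the pixel sits in the XMM register as B, G, R, A),
// the munge halves B towards G on load and undoes it when the endpoints are produced:
//   encode: b' = (b + g) / 2   (addss with the broadcast G, then mulps by [b_half])
//   decode: b  = 2*b' - g      (mulps by [b_2x], then subss of the broadcast G)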
#define XMMCONST(name, val)\
__declspec(align(16)) const float name[4] = { val, val, val, val };
#define XMMICONST(name, val)\
__declspec(align(16)) const unsigned long name[4] = { val, val, val, val };
#define XMM4CONST(name, v0, v1, v2, v3)\
__declspec(align(16)) const float name[4] = { v0, v1, v2, v3 };
#define XMM4ICONST(name, v0, v1, v2, v3)\
__declspec(align(16)) const unsigned long name[4] = { v0, v1, v2, v3 };
XMMCONST(zero, 0);
XMMCONST(one, 1.0f);
XMMCONST(one_over_16, (1.0f/16.0f));
XMM4CONST(one_over_16_x_255_zeros, (1.0f/(16.0f*255.0f)), 0, 0, 0);
XMMICONST(clearsign, 0x7fffffff);
XMMICONST(signbit, 0x80000000);
XMMCONST(half, 0.5f);
XMMCONST(c255, 255.0f);
XMMCONST(c3, 3.0f);
XMMCONST(lots, 10000.0f);
XMMCONST(minuslots, -10000.0f);
XMMICONST(mask_green, 0xfc);
XMMICONST(mask_blue_red, 0xf8);
XMMICONST(clamp_0, 0);
XMMICONST(clamp_255, 255);
XMMICONST(green_rounding, (0x80>>6));
XMMICONST(blue_red_rounding, (0x80>>5));
XMM4ICONST(rb_sign_bits, 0x80000000, 0, 0x80000000, 0);
XMMCONST(stepsize, 0.95f);
XMMCONST(onethird, 1.0f/3.0f);
XMMCONST(maxerror_init, 10000000.0f);
XMMCONST(maxerror_epsilon, 5.0f);
XMM4CONST(b_half, 0.5f, 1.0f, 1.0f, 1.0f);
XMM4CONST(b_2x, 2.0f, 1.0f, 1.0f, 1.0f);
XMMICONST(invert, 0xffffffff);
XMMCONST(split_point, 2.0f/3.0f);
XMMCONST(deviation_point, 1.0f/3.0f);
XMMCONST(split_point_r300, 5.0f/8.0f);
XMMCONST(deviation_point_r300, 1.0f/4.0f);
XMMICONST(_0f0f0f0f0f0f0f0f, 0x0f0f0f0f);
XMMICONST(_0707070707070707, 0x07070707);
XMMICONST(_000f000f000f000f, 0x000f000f);
XMMICONST(_00f000f000f000f0, 0x00f000f0);
#define SHUFFLE_SELECT(a, b, c, d) (a | (b<<2) | (c<<4) | (d<<6))
#define PARALLEL_ADD_XMM7 \
{ \
__asm movaps xmm6, xmm7 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm7, xmm7, SHUFFLE_SELECT(2, 3, 2, 3) \
__asm addps xmm7, xmm6 \
__asm shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1) \
__asm addps xmm7, xmm5 \
}
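// PARALLEL_ADD_XMM7 leaves the sum of lanes 0, 1 and 2 of xmm7 in lane 0 (lane 3 is
// ignored), i.e. the three-component dot product used below. A minimal scalar sketch
// of the same reduction; illustrative only and excluded from the build:
#if 0
static float parallel_add_sketch(const float v[4])
{
    // shufps(2,3,2,3) + addps folds lane 2 onto lane 0,
    // then shufps(1,1,1,1) + addps folds lane 1 onto lane 0.
    return v[0] + v[1] + v[2];
}
#endif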
#define AVERAGE xmm0
#define AXIS xmm1
BYTE expandtable[32] =
{
0x00, 0x01, 0x04, 0x05,
0x10, 0x11, 0x14, 0x15,
0x40, 0x41, 0x44, 0x45,
0x50, 0x51, 0x54, 0x55,
0x00, 0x02, 0x08, 0x0a,
0x20, 0x22, 0x28, 0x2a,
0x80, 0x82, 0x88, 0x8a,
0xa0, 0xa2, 0xa8, 0xaa,
};
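// expandtable spreads a 4-bit movmskps result across a byte: entries 0..15 place the
// four mask bits at the even bit positions (bit 0 of each 2-bit selector), entries
// 16..31 place them at the odd positions (bit 1). OR-ing one lookup from each half
// therefore packs four 2-bit DXT1 selectors into a single byte, as the clustering
// loops below do. Illustrative sketch with a hypothetical helper name, not built:
#if 0
static BYTE pack_selectors_sketch(int centre_mask, int division_mask)
{
    // centre_mask: ">= centre" bits (each adds 1); division_mask: "< split point" bits (each adds 2)
    return (BYTE)(expandtable[centre_mask] | expandtable[division_mask + 16]);
}
#endif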
#endif //_WIN32
#if !defined(_WIN64) && defined(_WIN32)
#pragma warning( push )
#pragma warning(disable:4100)
void __declspec(naked) __cdecl DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc)
{
// block_32 and block_dxtc are only referenced through inline asm, so the (VS2010) compiler generates warning C4100: unreferenced formal parameter
__asm
{
{
push esi
push edi
push ebx
push ebp
#define SP_BLOCK_32 20
#define SP_BLOCK_DXTC 24
// Set up a 16-byte aligned storage space pointer
mov ebp, esp
and ebp, ~0x0f
#define EBP_UNIQUES (16*16)
#define EBP_POS_ON_AXIS (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT (EBP_VALUES+16)
#define EBP_OLDRIGHT (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE (EBP_OLDRIGHT+16)
#define EBP_MAXERROR (EBP_RIGHTSAVE+4)
#define EBP_ERROR (EBP_MAXERROR+4)
#define EBP_SWAP (EBP_ERROR+4)
// Find the array of unique pixel values and sum them to find their average position
xorps AVERAGE, AVERAGE // average (r, g, b)
lea edi, [ebp-EBP_UNIQUES]
mov esi, [esp+SP_BLOCK_32]
mov ecx, 16
pxor mm2, mm2
{
average_unique_loop:
movd mm0, [esi]
add esi, 4
punpcklbw mm0, mm2
movq mm1, mm0
punpckhwd mm0, mm2
punpcklwd mm1, mm2
cvtpi2ps xmm7, mm0
movlhps xmm7, xmm7
cvtpi2ps xmm7, mm1
// colourspace conversion
// Most of the options in the C compressor are not supported here
#ifdef AXIS_MUNGE
movaps xmm6, xmm7
shufps xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1) // G
addss xmm7, xmm6
mulps xmm7, [b_half]
#endif
movaps [edi], xmm7
add edi, 16
addps AVERAGE, xmm7
sub ecx, 1
jne average_unique_loop
}
// Compute average of the uniques
mulps AVERAGE, [one_over_16]
}
{
// For each component, reflect points about the average so all lie on the same side
// of the average, and compute the new average - this gives a second point that defines the axis
// To compute the sign of the axis, sum the G offsets of the points whose R (or B) offset is positive
// (the G component of the axis is always positive in this implementation)
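// (In other words: if, summed over the block, the points with a positive R (or B) offset
// tend to have a negative G offset, the R (or B) component of the axis is negated below.)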
xorps AXIS, AXIS // v (r, g, b)
xorps xmm2, xmm2 // rg_pos, rb_pos, bg_pos
lea edi, [ebp-EBP_UNIQUES]
mov ecx, 16
{
find_axis_loop:
movaps xmm7, [edi] // R G B value
add edi, 16
subps xmm7, AVERAGE // centred
movaps xmm6, xmm7
movaps xmm5, xmm7
andps xmm7, [clearsign] // fabs (r, g, b)
addps AXIS, xmm7 // direction of axis
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) // B R R 0
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) // G B G 0
cmpnltps xmm6, [zero] // R/B > 0?
andps xmm6, xmm5 // insert the G or B value for those channels which are positive
addps xmm2, xmm6 // bg_pos rb_pos rg_pos
sub ecx, 1
jne find_axis_loop
}
mulps AXIS, [one_over_16]
// Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
movaps xmm5, xmm2 // duplicate the pos across these three
movaps xmm6, xmm2
movaps xmm7, xmm2
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) // RB_pos 0 ->
shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) // RG_pos ->
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) // BG_pos ->
orps xmm6, xmm7
cmpneqps xmm6, [zero] // so check for any non-zero in RG_pos or BG_pos
andps xmm2, xmm6 // Keep the current pos values only if RG_pos or BG_pos is non-zero
xorps xmm6, [invert]
andps xmm5, xmm6
orps xmm2, xmm5 // insert RB pos instead
// Change the sign of the R and B portions of the axis appropriately
cmpltps xmm2, [zero]
andps xmm2, [rb_sign_bits]
xorps AXIS, xmm2 // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
}
// Axis projection and remapping
{
// Normalise the axis for simplicity of future calculation
movaps xmm7, AXIS
mulps xmm7, xmm7
PARALLEL_ADD_XMM7
// low of xmm7 is the DP result
// If this is 0 we haven't actually got an axis, and we can't rsq it,
// so mask the output to 0 in this case. This generates an acceptable result
movaps xmm2, xmm7
cmpneqps xmm2, [zero]
#if 1 // RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
// and consistency improve with it enabled on certain images. It's not a large cost, so leave it in.
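// One iteration refines the rsqrtps estimate y0 as y1 = 0.5 * y0 * (3 - x*y0*y0).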
rsqrtps xmm3, xmm7
andps xmm3, xmm2
movaps xmm2, xmm3
mulps xmm3, xmm7
mulps xmm3, xmm2
mulps xmm2, [half]
movaps xmm7, [c3]
subps xmm7, xmm3
mulps xmm7, xmm2
#else
// No Newton-Raphson method
rsqrtps xmm7, xmm7
andps xmm7, xmm2
#endif
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
// Normalise
mulps AXIS, xmm7
}
#define LEFT xmm2
#define RIGHT xmm3
// Map the axis
{
lea edi, [ebp-EBP_UNIQUES]
lea edx, [ebp-EBP_POS_ON_AXIS]
mov ecx, 16
movaps LEFT, [lots]
movaps RIGHT, [minuslots]
xorps xmm4, xmm4 // axis mapping error
{
map_axis_loop:
movaps xmm7, [edi]
subps xmm7, AVERAGE
mulps xmm7, AXIS
PARALLEL_ADD_XMM7
movss [edx], xmm7
add edx, 4
// xmm7 == pos_on_axis
minss LEFT, xmm7 // calculate left
maxss RIGHT, xmm7 // calculate right
add edi, 16
sub ecx, 1
jne map_axis_loop
}
}
shufps LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
shufps RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)
{
// Now we have a good axis and the basic information about how the points are mapped
// to it
// Our initial guess is to represent the endpoints accurately, by moving the average
// to the centre and recalculating the point positions along the line
// Calculate centre
movaps xmm7, LEFT
addps xmm7, RIGHT
mulps xmm7, [half]
// Offset all the axis positions to the centre
lea edi, [ebp-EBP_POS_ON_AXIS]
movaps xmm5, [edi]
movaps xmm6, [edi+16]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi], xmm5
movaps [edi+16], xmm6
movaps xmm5, [edi+32]
movaps xmm6, [edi+48]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi+32], xmm5
movaps [edi+48], xmm6
// Offset left, right and average to centre
subps LEFT, xmm7
subps RIGHT, xmm7
mulps xmm7, AXIS
addps AVERAGE, xmm7
}
#define PROGRESSIVE_REFINEMENT
#ifdef PROGRESSIVE_REFINEMENT
{
// Attempt a (simple) progressive refinement step to reduce noise in the
// output image by trying to find a better overall match for the endpoints
// than the first-guess solution found so far (which is just to take the ends).
// The method is to move the endpoints inwards until a local minimum is found.
// This provides quite a significant improvement in image quality.
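// Each pass scores the current [left, right] guess against the four reconstruction levels
// (the two endpoints plus the interior points at +/- a third of the half-width, selected via
// [split_point]/[deviation_point]), accumulates the squared error, then pulls both endpoints
// inwards by [stepsize]; it stops once the error no longer improves by more than
// [maxerror_epsilon] and restores the previous endpoints.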
mov eax, [maxerror_init]
mov [ebp-EBP_MAXERROR], eax
movaps [ebp-EBP_OLDLEFT], LEFT
movaps [ebp-EBP_OLDRIGHT], RIGHT
lea edx, [expandtable]
lea edi, [ebp-EBP_VALUES]
{
next_refinement_loop:
movaps [ebp-EBP_RIGHTSAVE], RIGHT
mov eax, 0
mov [ebp-EBP_ERROR], eax
lea ecx, split_point
movss xmm7, LEFT
addss xmm7, RIGHT
mulss xmm7, [half]
movss [edi], LEFT
movss [edi+4], RIGHT
movss xmm5, RIGHT
movss xmm6, xmm7
subss xmm5, xmm7 // right-centre
movss xmm4, xmm7
movss RIGHT, xmm5
mulss xmm5, [ecx+16] //[deviation_point]
subss xmm6, xmm5
addss xmm4, xmm5
movss [edi+8], xmm6
movss [edi+12], xmm4
mulps RIGHT, [ecx] //[split_point]
mov ecx, 16
lea esi, [ebp-EBP_POS_ON_AXIS]
{
next_builderror_loop:
movss xmm4, [esi]
add esi, 4
movaps xmm5, xmm4
movaps xmm6, xmm4
andps xmm4, [clearsign]
cmpltss xmm4, RIGHT // < division means 2
cmpnltss xmm5, xmm7 // >= centre means add 1
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx+eax+16]
movzx ebx, byte ptr[edx+ebx]
or eax, ebx
and eax, 3
subss xmm6, [edi+4*eax]
mulss xmm6, xmm6
addss xmm6, [ebp-EBP_ERROR]
movss [ebp-EBP_ERROR], xmm6
sub ecx, 1
jne next_builderror_loop
}
movaps RIGHT, [ebp-EBP_RIGHTSAVE]
#if 1
mov eax, [ebp-EBP_ERROR]
mov ebx, [ebp-EBP_MAXERROR]
cmp eax, ebx
jge refinement_done
#else
movss xmm5, [ebp-EBP_MAXERROR]
cmpltss xmm5, xmm6
movss [ebp-EBP_ERROR], xmm5
mov eax, [ebp-EBP_ERROR]
test eax, eax
jnz refinement_done
#endif
subss xmm6, [maxerror_epsilon]
movss [ebp-EBP_MAXERROR], xmm6
movaps [ebp-EBP_OLDLEFT], LEFT
movaps [ebp-EBP_OLDRIGHT], RIGHT
mulps LEFT, [stepsize]
mulps RIGHT, [stepsize]
jmp next_refinement_loop
refinement_done:
movaps LEFT, [ebp-EBP_OLDLEFT]
movaps RIGHT, [ebp-EBP_OLDRIGHT]
}
}
#endif
{
// Calculate the high and low output colour values
// Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
// straight rounded average is not correct, as the decompressor 'unrounds' by replicating
// the top bits to the bottom.
// In order to take account of this process, we don't just apply a straight rounding correction,
// but base our rounding on the input value (a straight rounding is actually pretty good in terms of
// error measure, but creates a visual colour and/or brightness shift relative to the original image)
// The method used here is to apply a centre-biased rounding dependent on the input value, which was
// (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
// the image.
// rgb = (average_rgb + (left|right)*v_rgb);
movaps xmm6, LEFT
movaps xmm7, RIGHT
mulps xmm6, AXIS
mulps xmm7, AXIS
addps xmm6, AVERAGE
addps xmm7, AVERAGE
#ifdef AXIS_MUNGE
// Scale the B component, then subtract the green component resultant in each
movaps xmm4, xmm6
movaps xmm5, xmm7
mulps xmm6, [b_2x]
mulps xmm7, [b_2x]
shufps xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
subss xmm6, xmm4
subss xmm7, xmm5
#endif
// Rearrange so B and R are in the same register half (they both use 5-bit rounding)
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) // B R G
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)
// Convert to integer (by truncation, as C code does)
cvttps2pi mm0, xmm6
cvttps2pi mm1, xmm7
movhlps xmm6, xmm6
movhlps xmm7, xmm7
cvttps2pi mm2, xmm6
cvttps2pi mm3, xmm7
// mm0/1 is blue/red, mm2/3 is green
// This isn't quite the same as the C algorithm, but should generate the same result
// if the input range is ensured to be 0-255
// This code could be heavily interleaved, but for P4 it's not worth the hassle - the
// P4 reordering range of 15 instructions will let it do the job for us
pmaxsw mm0, [clamp_0] // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
pmaxsw mm1, [clamp_0]
pminsw mm0, [clamp_255]
pminsw mm1, [clamp_255]
movq mm6, mm0
movq mm7, mm1
paddd mm0, [blue_red_rounding]
paddd mm1, [blue_red_rounding]
psrld mm6, 5
psrld mm7, 5
psubd mm0, mm6
psubd mm1, mm7
// No need to clamp here, with the input in 0-255 range it can never be outside at the end
pand mm0, [mask_blue_red]
pand mm1, [mask_blue_red]
// Separate out R and B as they will need separate shifts later
pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) // extract R (this is an SSE, not MMX, instruction)
pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) // also R
pmaxsw mm2, [clamp_0]
pmaxsw mm3, [clamp_0]
pminsw mm2, [clamp_255]
pminsw mm3, [clamp_255]
movq mm6, mm2
movq mm7, mm3
paddd mm2, [green_rounding]
paddd mm3, [green_rounding]
psrld mm6, 6
psrld mm7, 6
psubd mm2, mm6
psubd mm3, mm7
pand mm2, [mask_green]
pand mm3, [mask_green]
// Convert the 8-bit values to final RGB565 colours in mm0 and mm1
psrld mm0, 3
psrld mm1, 3
pslld mm4, 8
pslld mm5, 8
pslld mm2, 3
pslld mm3, 3
por mm0, mm4
por mm1, mm5
por mm0, mm2
por mm1, mm3
// mm0 and mm1 are c0 and c1
// Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
mov edi, [esp+SP_BLOCK_DXTC]
#if 1
pxor mm5, mm5
punpcklwd mm0, mm5 // unpack c0/c1 to DWORD's as pcmp is a signed comparison
punpcklwd mm1, mm5
movq mm2, mm0
movq mm3, mm0
movq mm4, mm0
pcmpgtd mm2, mm1
pxor mm2, [invert] // Need less than, so flip the result
movd [ebp-EBP_SWAP], mm2 // Set the swap flag (used below) appropriately
// mm2 is the mask to indicate flipping is needed
pcmpeqd mm4, mm1
movd ebx, mm4 // ebx is the equality flag, plenty of time for this slow move to resolve
punpcklwd mm0, mm1 // 'normal' order
punpcklwd mm1, mm3 // reversed order
pand mm1, mm2
pandn mm2, mm0
por mm1, mm2 // one of the two, selected by mm2
movd [edi], mm1 // write the result
#else
// No compare simple version
punpcklwd mm0, mm1
mov [ebp-EBP_SWAP], 0
xor ebx, ebx
movd [edi], mm0
// punpcklwd mm1, mm0
// movq mm0, mm1
// mov dword ptr[ebp-EBP_SWAP], 0xffffffff
#endif
}
// Clear the output bitmasks
add edi, 4
mov dword ptr [edi], 0
// If the values are equal, the bit selector is 0 because the two colours are
// the same (which implies transparent)
// This seems the easiest way to do it, and will only rarely break branch prediction on
// typical images.
test ebx, ebx
jnz all_done
{
// Final clustering, creating the 2-bit values that define the output
lea ecx, split_point
movaps xmm7, RIGHT
mulps RIGHT, [ecx] // split point
addps xmm7, LEFT
mulps xmm7, [half] // centre (probably 0, but what the hell)
lea esi, [ebp-EBP_POS_ON_AXIS]
lea edx, [expandtable]
movss xmm6, [ebp-EBP_SWAP]
shufps xmm6, xmm6, 0
mov ecx, 4
{
next_bit_loop: // Do 4 at once
movaps xmm4, [esi] // Read the four pos_on_axis entries
add esi, 16
movaps xmm5, xmm4
andps xmm4, [clearsign]
cmpltps xmm4, RIGHT // < division means 2
cmpnltps xmm5, xmm7 // >= centre means add 1
xorps xmm5, xmm6 // Swap the order if we had to flip our colours
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx+eax+16]
movzx ebx, byte ptr[edx+ebx]
or eax, ebx
mov byte ptr [edi], al
add edi, 1
sub ecx, 1
jne next_bit_loop
}
}
all_done:
emms
pop ebp
pop ebx
pop edi
pop esi
ret
}
}
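// Reference sketch of the centre-biased rounding and RGB565 packing performed by the MMX
// endpoint code in the routines above and below, assuming inputs already clamped to 0..255.
// Illustrative only and excluded from the build:
#if 0
static DWORD quantise_565_sketch(int r, int g, int b)
{
    // v + rounding - (v >> bits) compensates for the decompressor replicating the top
    // bits into the bottom bits when it expands the 5/6-bit fields back to 8 bits.
    int r5 = (r + (0x80 >> 5) - (r >> 5)) & 0xf8;   // 5-bit field, kept in bits 3..7
    int g6 = (g + (0x80 >> 6) - (g >> 6)) & 0xfc;   // 6-bit field, kept in bits 2..7
    int b5 = (b + (0x80 >> 5) - (b >> 5)) & 0xf8;
    return (DWORD)((r5 << 8) | (g6 << 3) | (b5 >> 3));
}
#endif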
void __declspec(naked) __cdecl DXTCV11CompressBlockSSEMinimal(DWORD *block_32, DWORD *block_dxtc)
{
#define SP_BLOCK_32 20
#define SP_BLOCK_DXTC 24
#define EBP_UNIQUES (16*16)
#define EBP_POS_ON_AXIS (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT (EBP_VALUES+16)
#define EBP_OLDRIGHT (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE (EBP_OLDRIGHT+16)
#define EBP_MAXERROR (EBP_RIGHTSAVE+4)
#define EBP_ERROR (EBP_MAXERROR+4)
#define EBP_SWAP (EBP_ERROR+4)
// block_32 and block_dxtc are only referenced through inline asm, so the (VS2010) compiler generates warning C4100: unreferenced formal parameter
__asm
{
push esi
push edi
push ebx
push ebp
// Set up a 16-byte aligned storage space pointer
mov ebp, esp
and ebp, ~0x0f
// init to 0
xorps AVERAGE, AVERAGE // average (r, g, b)
// -------------------------------------------------------------------------------------
// (3) Find the array of unique pixel values and sum them to find their average position
// -------------------------------------------------------------------------------------
{
lea edi, [ebp - EBP_UNIQUES]
mov esi, [esp + SP_BLOCK_32]
mov ecx, 16
pxor mm2, mm2
average_unique_loop:
{
movd mm0, [esi]
add esi, 4
punpcklbw mm0, mm2
movq mm1, mm0
punpckhwd mm0, mm2
punpcklwd mm1, mm2
cvtpi2ps xmm7, mm0
movlhps xmm7, xmm7
cvtpi2ps xmm7, mm1
// colourspace conversion
movaps [edi], xmm7
add edi, 16
addps AVERAGE, xmm7
sub ecx, 1
jne average_unique_loop
}
// Compute average of the uniques
mulps AVERAGE, [one_over_16]
}
// -------------------------------------------------------------------------------------
// (4) For each component, reflect points about the average so all lie on the same side
// of the average, and compute the new average - this gives a second point that defines the axis
// To compute the sign of the axis, sum the G offsets of the points whose R (or B) offset is positive
// (the G component of the axis is always positive in this implementation)
// -------------------------------------------------------------------------------------
{
xorps AXIS, AXIS // v (r, g, b)
xorps xmm2, xmm2 // rg_pos, rb_pos, bg_pos
lea edi, [ebp - EBP_UNIQUES]
mov ecx, 16
find_axis_loop:
{
movaps xmm7, [edi] // R G B value
add edi, 16
subps xmm7, AVERAGE // centred
movaps xmm6, xmm7
movaps xmm5, xmm7
andps xmm7, [clearsign] // fabs (r, g, b)
addps AXIS, xmm7 // direction of axis
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) // B R R 0
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) // G B G 0
cmpnltps xmm6, [zero] // R/B > 0?
andps xmm6, xmm5 // insert the G or B value for those channels which are positive
addps xmm2, xmm6 // bg_pos rb_pos rg_pos
sub ecx, 1
jne find_axis_loop
}
mulps AXIS, [one_over_16]
// Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
movaps xmm5, xmm2 // duplicate the pos across these three
movaps xmm6, xmm2
movaps xmm7, xmm2
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) // RB_pos 0 ->
shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) // RG_pos ->
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) // BG_pos ->
orps xmm6, xmm7
cmpneqps xmm6, [zero] // so check for any non-zero in RG_pos or BG_pos
andps xmm2, xmm6 // Keep the current pos values only if RG_pos or BG_pos is non-zero
xorps xmm6, [invert]
andps xmm5, xmm6
orps xmm2, xmm5 // insert RB pos instead
// Change the sign of the R and B portions of the axis appropriately
cmpltps xmm2, [zero]
andps xmm2, [rb_sign_bits]
xorps AXIS, xmm2 // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
}
// -------------------------------------------------------------------------------------
// (5) Axis projection and remapping
// -------------------------------------------------------------------------------------
{
// Normalise the axis for simplicity of future calculation
movaps xmm7, AXIS
mulps xmm7, xmm7
PARALLEL_ADD_XMM7
// low of xmm7 is the DP result
// If this is 0 we haven't actually got an axis, and we can't rsq it,
// so mask the output to 0 in this case. This generates an acceptable result
movaps xmm2, xmm7
cmpneqps xmm2, [zero]
rsqrtps xmm7, xmm7
andps xmm7, xmm2
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
// Normalise
mulps AXIS, xmm7
}
#define LEFT xmm2
#define RIGHT xmm3
// -------------------------------------------------------------------------------------
// (6) Map the axis
// -------------------------------------------------------------------------------------
{
lea edi, [ebp - EBP_UNIQUES]
lea edx, [ebp - EBP_POS_ON_AXIS]
mov ecx, 16
movaps LEFT, [lots]
movaps RIGHT, [minuslots]
xorps xmm4, xmm4 // axis mapping error
{
map_axis_loop:
movaps xmm7, [edi]
subps xmm7, AVERAGE
mulps xmm7, AXIS
PARALLEL_ADD_XMM7
movss [edx], xmm7
add edx, 4
// xmm7 == pos_on_axis
minss LEFT, xmm7 // calculate left
maxss RIGHT, xmm7 // calculate right
add edi, 16
sub ecx, 1
jne map_axis_loop
}
}
shufps LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
shufps RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)
// -------------------------------------------------------------------------------------
// (7) Now we have a good axis and the basic information about how the points are mapped
// to it
// Our initial guess is to represent the endpoints accurately, by moving the average
// to the centre and recalculating the point positions along the line
// -------------------------------------------------------------------------------------
{
// Calculate centre
movaps xmm7, LEFT
addps xmm7, RIGHT
mulps xmm7, [half]
// Offset all the axis positions to the centre
lea edi, [ebp - EBP_POS_ON_AXIS]
movaps xmm5, [edi]
movaps xmm6, [edi + 16]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi], xmm5
movaps [edi + 16], xmm6
movaps xmm5, [edi + 32]
movaps xmm6, [edi + 48]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi + 32], xmm5
movaps [edi + 48], xmm6
// Offset left, right and average to centre
subps LEFT, xmm7
subps RIGHT, xmm7
mulps xmm7, AXIS
addps AVERAGE, xmm7
}
// -------------------------------------------------------------------------------------
// (8) Calculate the high and low output colour values
// Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
// straight rounded average is not correct, as the decompressor 'unrounds' by replicating
// the top bits to the bottom.
// In order to take account of this process, we don't just apply a straight rounding correction,
// but base our rounding on the input value (a straight rounding is actually pretty good in terms of
// error measure, but creates a visual colour and/or brightness shift relative to the original image)
// The method used here is to apply a centre-biased rounding dependent on the input value, which was
// (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
// the image.
// rgb = (average_rgb + (left|right)*v_rgb);
// -------------------------------------------------------------------------------------
{
movaps xmm6, LEFT
movaps xmm7, RIGHT
mulps xmm6, AXIS
mulps xmm7, AXIS
addps xmm6, AVERAGE
addps xmm7, AVERAGE
// Rearrange so B and R are in the same register half (they both use 5-bit rounding)
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) // B R G
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)
// Convert to integer (by truncation, as C code does)
cvttps2pi mm0, xmm6
cvttps2pi mm1, xmm7
movhlps xmm6, xmm6
movhlps xmm7, xmm7
cvttps2pi mm2, xmm6
cvttps2pi mm3, xmm7
// mm0/1 is blue/red, mm2/3 is green
// This isn't quite the same as the C algorithm, but should generate the same result
// if the input range is ensured to be 0-255
// This code could be heavily interleaved, but for P4 it's not worth the hassle - the
// P4 reordering range of 15 instructions will let it do the job for us
pmaxsw mm0, [clamp_0] // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
pmaxsw mm1, [clamp_0]
pminsw mm0, [clamp_255]
pminsw mm1, [clamp_255]
movq mm6, mm0
movq mm7, mm1
paddd mm0, [blue_red_rounding]
paddd mm1, [blue_red_rounding]
psrld mm6, 5
psrld mm7, 5
psubd mm0, mm6
psubd mm1, mm7
// No need to clamp here, with the input in 0-255 range it can never be outside at the end
pand mm0, [mask_blue_red]
pand mm1, [mask_blue_red]
// Separate out R and B as they will need separate shifts later
pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) // extract R (this is an SSE, not MMX, instruction)
pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) // also R
pmaxsw mm2, [clamp_0]
pmaxsw mm3, [clamp_0]
pminsw mm2, [clamp_255]
pminsw mm3, [clamp_255]
movq mm6, mm2
movq mm7, mm3
paddd mm2, [green_rounding]
paddd mm3, [green_rounding]
psrld mm6, 6
psrld mm7, 6
psubd mm2, mm6
psubd mm3, mm7
pand mm2, [mask_green]
pand mm3, [mask_green]
// Convert the 8-bit values to final RGB565 colours in mm0 and mm1
psrld mm0, 3
psrld mm1, 3
pslld mm4, 8
pslld mm5, 8
pslld mm2, 3
pslld mm3, 3
por mm0, mm4
por mm1, mm5
por mm0, mm2
por mm1, mm3
// mm0 and mm1 are c0 and c1
// Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
mov edi, [esp + SP_BLOCK_DXTC]
pxor mm5, mm5
punpcklwd mm0, mm5 // unpack c0/c1 to DWORD's as pcmp is a signed comparison
punpcklwd mm1, mm5
movq mm2, mm0
movq mm3, mm0
movq mm4, mm0
pcmpgtd mm2, mm1
pxor mm2, [invert] // Need less than, so flip the result
movd [ebp - EBP_SWAP], mm2 // Set the swap flag (used below) appropriately
// mm2 is the mask to indicate flipping is needed
pcmpeqd mm4, mm1
movd ebx, mm4 // ebx is the equality flag, plenty of time for this slow move to resolve
punpcklwd mm0, mm1 // 'normal' order
punpcklwd mm1, mm3 // reversed order
pand mm1, mm2
pandn mm2, mm0
por mm1, mm2 // one of the two, selected by mm2
movd [edi], mm1 // write the result
}
// Clear the output bitmasks
add edi, 4
mov dword ptr[edi], 0
// If the values are equal, the bit selector is 0 because the two colours are
// the same (which implies transparent)
// This seems the easiest way to do it, and will only rarely break branch prediction on
// typical images.
test ebx, ebx
jnz all_done
// -------------------------------------------------------------------------------------
// (9) Final clustering, creating the 2-bit values that define the output
// -------------------------------------------------------------------------------------
{
lea ecx, split_point
movaps xmm7, RIGHT
mulps RIGHT, [ecx] // split point
addps xmm7, LEFT
mulps xmm7, [half] // centre (probably 0, but what the hell)
lea esi, [ebp - EBP_POS_ON_AXIS]
lea edx, [expandtable]
movss xmm6, [ebp - EBP_SWAP]
shufps xmm6, xmm6, 0
mov ecx, 4
{
next_bit_loop: // Do 4 at once
movaps xmm4, [esi] // Read the four pos_on_axis entries
add esi, 16
movaps xmm5, xmm4
andps xmm4, [clearsign]
cmpltps xmm4, RIGHT // < division means 2
cmpnltps xmm5, xmm7 // >= centre means add 1
xorps xmm5, xmm6 // Swap the order if we had to flip our colours
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx + eax + 16]
movzx ebx, byte ptr[edx + ebx]
or eax, ebx
mov byte ptr[edi], al
add edi, 1
sub ecx, 1
jne next_bit_loop
}
}
all_done:
emms
pop ebp
pop ebx
pop edi
pop esi
ret
} // _asm
}
#pragma warning( pop )
#endif // !_WIN64
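// Hypothetical usage sketch (not part of this file): each entry point consumes one 4x4 block
// of 16 packed 32-bit pixels and emits one 8-byte DXT1 block (two RGB565 endpoints followed
// by sixteen 2-bit selectors). Assumes the caller has already gathered each tile into 16
// consecutive DWORDs. Illustrative only and excluded from the build:
#if 0
static void compress_blocks_sketch(DWORD *block_pixels, DWORD *out_dxt1, int block_count)
{
    int i;
    for (i = 0; i < block_count; ++i)
    {
        // block_32: 16 DWORD pixels in; block_dxtc: 2 DWORDs (8 bytes) out
        DXTCV11CompressBlockSSE(block_pixels + 16 * i, out_dxt1 + 2 * i);
    }
}
#endif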