; TexConv/CMP_CompressonatorLib/DXTC/dxtc_v11_compress_64.asm

;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================


; Build-time switches (tested with IF in the code section below).
;   AXIS_MUNGE             - when non-zero, lane 0 of every pixel is replaced by
;                            (lane0 + lane1) * b_half before analysis, and the
;                            inverse is applied to the computed endpoints.
;   PROGRESSIVE_REFINEMENT - when non-zero, the endpoint refinement loop is assembled.
AXIS_MUNGE                equ 1
PROGRESSIVE_REFINEMENT    equ 1

.DATA

ALIGN 16


; All constants are defined in another module; this file only imports them.
; Several of them are not referenced by the code visible in this file.

; 128-bit float constants used as SSE memory operands
EXTERN zero                : XMMWORD
EXTERN one                : XMMWORD
EXTERN one_over_16        : XMMWORD
EXTERN one_over_16_x_255_zeros    : XMMWORD
EXTERN clearsign        : XMMWORD
EXTERN signbit            : XMMWORD
EXTERN half                : XMMWORD
EXTERN c255                : XMMWORD
EXTERN c3                : XMMWORD
EXTERN lots                : XMMWORD
EXTERN minuslots        : XMMWORD

; 64-bit constants used as MMX memory operands while packing the endpoint colours
EXTERN mask_green        : MMWORD
EXTERN mask_blue_red    : MMWORD
EXTERN clamp_0            : MMWORD
EXTERN clamp_255        : MMWORD
EXTERN green_rounding    : MMWORD
EXTERN blue_red_rounding: MMWORD

; Mask ANDed with the slope-sign test result before it is XORed into the axis
EXTERN rb_sign_bits        : XMMWORD

; stepsize: factor both endpoints are multiplied by on each refinement step
EXTERN stepsize            : XMMWORD
EXTERN onethird            : XMMWORD

; Scalar values used by the refinement loop's error threshold
EXTERN maxerror_init    : DWORD
EXTERN maxerror_epsilon    : DWORD

; Scale factors for the AXIS_MUNGE transform and its inverse
EXTERN b_half            : XMMWORD
EXTERN b_2x                : XMMWORD

; XORed with comparison masks to complement them
EXTERN invert            : XMMWORD


; Multipliers applied to (right - centre) to place the cluster boundary and
; the two interior cluster positions
EXTERN split_point        : XMMWORD
EXTERN deviation_point    : XMMWORD


; Byte lookup table indexed by MOVMSKPS results when building the selector bytes
EXTERN expandtable        : DWORD


; NOTE: expandtable is declared a second time here (same type as above)
EXTERN expandtable        : DWORD


; Not referenced by the code visible in this file
EXTERN _0f0f0f0f0f0f0f0f : MMWORD
EXTERN _0707070707070707 : MMWORD
EXTERN _000f000f000f000f : MMWORD
EXTERN _00f000f000f000f0 : MMWORD


.CODE


PARALLEL_ADD_3D        MACRO reg, tmp1, tmp2
    ;; Horizontal add of the three low lanes of an XMM register.
    ;;   reg  - in:  (x, y, z, w)
    ;;          out: lane 0 = y + (x + z); lanes 1-3 = y
    ;;   tmp1 - scratch, fully overwritten
    ;;   tmp2 - scratch, low 64 bits overwritten
    ;; The three register arguments must all be different registers.
    movaps  tmp1, reg               ;; tmp1 lane 0 = x
    movhlps tmp2, reg               ;; tmp2 lane 0 = z
    addss   tmp1, tmp2              ;; tmp1 lane 0 = x + z
    shufps  reg, reg, 055h          ;; broadcast y to every lane of reg
    addss   reg, tmp1               ;; reg lane 0  = y + (x + z)

    ENDM

SHUFFLE_SELECT        MACRO a,b,c,d
    ;; Macro function: builds the 8-bit immediate for SHUFPS / PSHUFW.
    ;;   a, b, c, d - source lane (0..3) for destination lanes 0, 1, 2, 3
    ;; Returns the text of ((a) | (b << 2) | (c << 4) | (d << 6)).
    ;; Any selector outside 0..3 raises an assembly error and yields 0.
    ;; Every argument is parenthesised so that expression arguments
    ;; (e.g. 1+1) are evaluated before SHL is applied to them.
    LOCAL Value
    IF ((a) lt 0) or ((b) lt 0) or ((c) lt 0) or ((d) lt 0) or ((a) gt 3) or ((b) gt 3) or ((c) gt 3) or ((d) gt 3)
        .ERR
        EXITM <0>
    ENDIF
    Value = ((a) OR ((b) SHL 2) OR ((c) SHL 4) OR ((d) SHL 6))
    EXITM %Value
    ENDM


;-------------------------------------------------------------------------------
; void DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
;
; ABI:   Microsoft x64 (register calling convention)
;          - the first four integer parameters are in rcx, rdx, r8, r9
;            (floats would be in xmm0-3)
;          - rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above;
;            all others must be preserved
; In:    rcx = block_32   sixteen 32-bit pixels (64 bytes are read)
;        rdx = block_dxtc output block (8 bytes are written): one DWORD holding
;                         the two 16-bit endpoint colours, then four selector bytes
; Out:   nothing in registers
; Saves: rsi, rdi, rbx, rbp, xmm6, xmm7, xmm8 (saved and restored here)
; Clobb: rax, rcx, rdx, r8, r11, xmm0-xmm5, mm0-mm7, flags
;
; NOTE(review): MMX registers are used and no EMMS is executed before returning;
;               presumably the caller takes care of that - confirm.
; NOTE(review): no unwind data (PROC FRAME) is emitted although non-volatile
;               registers and rsp are modified, so an exception raised inside
;               this routine cannot be unwound.
;-------------------------------------------------------------------------------

DXTCV11CompressBlockSSE PROC

; Stack frame layout. Every TMP_* value is a distance BELOW the 16-byte aligned
; frame base held in rbp, i.e. the item lives at [rbp-TMP_xxx].

SAVED_REGS            equ 3
TMP_REGSAVE            equ (SAVED_REGS*16)      ; xmm save area: [rbp-TMP_REGSAVE, rbp)

; Other locals
TMP_RGB             equ (TMP_REGSAVE+(16*16))   ; 16 pixels x 4 floats
TMP_POS_ON_AXIS        equ (TMP_RGB+(16*4))     ; 16 floats

TMP_CLUSTERPOS        equ (TMP_POS_ON_AXIS+16)  ; 4 floats
TMP_OLDLEFT            equ (TMP_CLUSTERPOS+16)
TMP_OLDRIGHT        equ (TMP_OLDLEFT+16)
TMP_RIGHTSAVE        equ (TMP_OLDRIGHT+16)
TMP_AXISSAVE        equ (TMP_RIGHTSAVE+16)
TMP_AVGSAVE         equ (TMP_AXISSAVE+16)

TMP_CLUSTERS        equ (TMP_AVGSAVE+(16*4))

TMP_MAXERROR        equ (TMP_CLUSTERS+16)
TMP_SWAP            equ (TMP_MAXERROR+4)

; Total size to reserve, rounded up to a multiple of 16
TMP_FRAME_SIZE      equ ((TMP_SWAP+15) AND (NOT 15))

            push rsi
            push rdi
            push rbx
            push rbp

            ; Set up a 16-byte aligned storage space pointer.
            ; There is no red zone in the Microsoft x64 ABI, so the locals are
            ; reserved by moving rsp below them. r11 (volatile, otherwise
            ; unused in this routine) remembers the unaligned rsp for the epilogue.
            mov r11, rsp
            mov rbp, rsp
            and rbp, NOT 0fh
            lea rsp, [rbp-TMP_FRAME_SIZE]

            ; Any xmm regs over 5 need to be saved here as well.
            ; The slots occupy [rbp-TMP_REGSAVE, rbp), directly above TMP_RGB,
            ; so that the RGB array cannot overwrite them.
            movaps [rbp-TMP_REGSAVE+( 0*16)], xmm6
            movaps [rbp-TMP_REGSAVE+( 1*16)], xmm7
            movaps [rbp-TMP_REGSAVE+( 2*16)], xmm8


        ; Convert the pixel values to float and find their average position

            xorps xmm0, xmm0        ; average (r, g, b)

            ; The input data is in rcx
            lea rdi, [rbp-TMP_RGB]
            mov eax, 16
            xorps xmm2,xmm2

            average_loop:
                movd xmm7, dword ptr [rcx]
                punpcklbw xmm7,xmm2     ; bytes -> words
                punpcklwd xmm7,xmm2     ; words -> dwords
                cvtdq2ps xmm7,xmm7      ; dwords -> floats
                add rcx, 4

IF AXIS_MUNGE
                ; colourspace conversion to increase weight of G at expense of B
                movaps xmm6, xmm7
                shufps xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1)    ; G
                addss xmm7, xmm6
                mulss xmm7, dword ptr [b_half]
ENDIF

                movaps [rdi], xmm7        ; save the value off in the RGB float array
                add rdi, 16               ; full 64-bit add: rdi is a stack pointer
                addps xmm0, xmm7        ; accumulate average

                sub eax, 1
                jne average_loop

            ; Compute average of the values
            mulps xmm0, [one_over_16]


        ; For each component, reflect points about the average so all lie on the same side
        ; of the average, and compute the new average - this gives a second point that defines the axis
        ; To compute the sign of the axis sum the positive differences of G for each of R and B (the
        ; G axis is always positive in this implementation)

            xorps xmm1, xmm1        ; axis (r, g, b)
            xorps xmm2, xmm2        ; rg_pos, rb_pos, bg_pos
            lea rdi, [rbp-TMP_RGB]

            mov ecx, 16
            find_axis_loop:
                movaps xmm7, [rdi]        ; R G B value
                add rdi, 16
                subps xmm7, xmm0        ; subtract average - centred
                movaps xmm6, xmm7
                movaps xmm5, xmm7

                andps xmm7, [clearsign]    ; fabs (r, g, b)
                addps xmm1, xmm7        ; accumulate direction of axis

                shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3)    ; B R R 0
                shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3)    ; G B G 0

                cmpnltps xmm6, [zero]    ; R/B >= 0?
                andps xmm6, xmm5        ; insert the G or B value for those channels which are positive
                addps xmm2, xmm6        ; bg_pos rb_pos rg_pos

                sub ecx, 1
                jne find_axis_loop


            ; Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
            movaps xmm5, xmm2    ; duplicate the pos across these three
            movaps xmm6, xmm2
            movaps xmm7, xmm2
            shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3)    ; RB_pos 0 ->
            shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2)    ; RG_pos ->
            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)    ; BG_pos ->
            orps xmm6, xmm7
            cmpneqps xmm6, [zero]    ; so check for any non-zero in RG_pos or BG_pos
            andps xmm2, xmm6        ; keep the current values only where that test passed
            xorps xmm6, [invert]
            andps xmm5, xmm6
            orps xmm2, xmm5            ; insert RB pos instead


            ; Change the sign of the R and B portions of the axis appropriately
            cmpltps xmm2, [zero]
            andps xmm2, [rb_sign_bits]
            xorps xmm1, xmm2            ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope


        ; Axis projection and remapping

            ; Normalise the axis for simplicity of future calculation
            movaps xmm7, xmm1

            mulps xmm7, xmm7
            PARALLEL_ADD_3D xmm7    ,xmm6,xmm5

            ; low of xmm7 is the DP result
            ; If this is 0 we haven't actually got an axis, and we can't rsq it,
            ; so mask the output to 0 in this case. This generates an acceptable result
            movss xmm2, xmm7
            cmpneqss xmm2, dword ptr [zero]

            ; RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
            ; and consistency improve with it on certain images. It's not a large cost so leave it in.
            rsqrtss xmm3, xmm7
            andps xmm3, xmm2

            movss    xmm2, xmm3
            mulss    xmm3, xmm7
            mulss    xmm3, xmm2
            mulss    xmm2, dword ptr [half]
            movss    xmm7, dword ptr [c3]
            subss    xmm7, xmm3
            mulss    xmm7, xmm2

            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

            ; Normalise
            mulps xmm1, xmm7


        ; Map the axis

            lea rsi, [rbp-TMP_RGB]
            lea rdi, [rbp-TMP_POS_ON_AXIS]
            mov ecx, 16
            movaps xmm2, [lots]         ; left
            movaps xmm3, [minuslots]    ; right

            map_axis_loop:
                movaps xmm7, [rsi]
                subps xmm7, xmm0    ; subtract average

                ; dot product with axis
                mulps xmm7, xmm1
                PARALLEL_ADD_3D xmm7    ,xmm6,xmm5

                ; xmm7 == pos_on_axis

                movss dword ptr [rdi], xmm7
                add rdi, 4

                minss xmm2, xmm7        ; calculate left
                maxss xmm3, xmm7        ; calculate right

                add rsi, 16

                sub ecx, 1
                jne map_axis_loop


        ; We have a good axis and the basic information about how the points are mapped to it
        ; We need to calculate the endpoints - the initial guess is to use the extremities.

            ; Left and right are used across all the simds
            shufps xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0)   ; left
            shufps xmm3, xmm3, SHUFFLE_SELECT(0, 0, 0, 0)   ; right


        ; To simplify further calculations, we offset everything such that the axis centre is at 0

            ; Calculate centre
            movaps xmm7, xmm2   ; left
            addps xmm7, xmm3    ; right
            mulps xmm7, [half]

            ; Offset all the axis positions to the centre
            lea rdi, [rbp-TMP_POS_ON_AXIS]
            movaps xmm5, [rdi]
            movaps xmm6, [rdi+16]
            subps xmm5, xmm7
            subps xmm6, xmm7
            movaps [rdi], xmm5
            movaps [rdi+16], xmm6
            movaps xmm5, [rdi+32]
            movaps xmm6, [rdi+48]
            subps xmm5, xmm7
            subps xmm6, xmm7
            movaps [rdi+32], xmm5
            movaps [rdi+48], xmm6

            ; Offset left, right and average to centre
            subps xmm2, xmm7    ; left
            subps xmm3, xmm7    ; right

            mulps xmm7, xmm1    ; convert to rgb by multiplying by axis
            addps xmm0, xmm7    ; average


IF PROGRESSIVE_REFINEMENT

        ; Attempt a (simple) progressive refinement step to reduce noise in the
        ; output image by trying to find a better overall match for the endpoints
        ; than the first-guess solution found so far (which is just to take the ends).

            ; The method is to move the endpoints inwards until a local minimum is found.
            ; This provides quite a significant improvement in image quality.

            mov eax, [maxerror_init]
            mov [rbp-TMP_MAXERROR], eax

            movaps [rbp-TMP_OLDLEFT], xmm2            ; XXX register space expansion
            movaps [rbp-TMP_OLDRIGHT], xmm3            ; XXX register space expansion

            lea rdi, [rbp-TMP_CLUSTERPOS]

            next_refinement_loop:

                movaps [rbp-TMP_RIGHTSAVE], xmm3    ; XXX register space expansion

                xorps xmm8,xmm8                        ; Error - clear the error

                ; Update the array of cluster positions based on the new values of left and right
                movss xmm7, xmm2        ; left
                addss xmm7, xmm3        ; right
                mulss xmm7, dword ptr [half]
                movss dword ptr [rdi], xmm2       ; left
                movss dword ptr [rdi+4], xmm3     ; right
                movss xmm5, xmm3        ; right
                movss xmm6, xmm7
                subss xmm5, xmm7        ; right-centre
                movss xmm4, xmm7
                movss xmm3, xmm5
                mulss xmm5, dword ptr [deviation_point]
                subss xmm6, xmm5
                addss xmm4, xmm5
                movss dword ptr [rdi+8], xmm6
                movss dword ptr [rdi+12], xmm4

                mulss xmm3, dword ptr [split_point]


                ; Calculate the current error
                mov ecx, 16
                lea rsi, [rbp-TMP_POS_ON_AXIS]
                next_builderror_loop:
                    movss xmm4, dword ptr [rsi]
                    add rsi, 4
                    movaps xmm5, xmm4
                    movaps xmm6, xmm4
                    andps xmm4, [clearsign]
                    cmpltss xmm4, xmm3        ; < division means 2
                    cmpnltss xmm5, xmm7        ; >= centre means add 1

                    movmskps eax, xmm4
                    movmskps ebx, xmm5
                    lea eax, [rbx+2*rax]        ; rax == which cluster (top 32 bits are cleared by this)

                    subss xmm6, dword ptr [rdi+4*rax]     ; rdi = array of positions along axis of cluster
                    mulss xmm6, xmm6            ; square to use MSE and (conveniently eliminate sign)
                    addss xmm8, xmm6            ; Accumulate to the MSE

                    sub ecx, 1
                    jne next_builderror_loop


                movaps xmm3, [rbp-TMP_RIGHTSAVE]    ; restore the right XXX register space expansion

                ; Test and update the maximum error
                movss xmm5, dword ptr [rbp-TMP_MAXERROR]
                cmpltss xmm5, xmm8
                movmskps eax, xmm5
                test eax,1
                jnz refinement_done         ; error exceeds the stored threshold: keep the previous endpoints


                ; The error accumulator is xmm8, so the epsilon must be applied to it
                ; before it becomes the new threshold (xmm6 is only a scratch register
                ; here and is overwritten at the top of the loop). Without this a run of
                ; equal errors - e.g. a block whose pixels are all identical - never
                ; satisfies the exit test above.
                ; NOTE(review): assumes maxerror_epsilon is a small positive value - confirm.
                subss xmm8, dword ptr [maxerror_epsilon]
                movss dword ptr [rbp-TMP_MAXERROR], xmm8

                movaps [rbp-TMP_OLDLEFT], xmm2      ; left    XXX register space expansion
                movaps [rbp-TMP_OLDRIGHT], xmm3     ; right XXX register space expansion

                ; step left and right in a bit
                mulps xmm2, [stepsize]    ; XXX promote to register to improve code density?
                mulps xmm3, [stepsize]

                jmp next_refinement_loop

            refinement_done:
                movaps xmm2, [rbp-TMP_OLDLEFT]      ; left
                movaps xmm3, [rbp-TMP_OLDRIGHT]     ; right
ENDIF

        ; Endpoints and axis are now valid so we have all the information we need to compress the block


        ; Calculate the high and low output colour values

            ; Involved in this is a complex rounding procedure.
            ; A straight rounded average is not correct, as the decompressor unrounds by replicating
            ; the top bits to the bottom.

            ; In order to take account of this process, we don't just apply a straight rounding correction,
            ; but base our rounding on the input value (a straight rounding is actually pretty good in terms of
            ; error measure, but creates a visual colour and/or brightness shift relative to the original image)
            ; The method used here is to apply a centre-biased rounding dependent on the input value, which was
            ; (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
            ; the image.
            ; rgb = (average_rgb + (left|right)*v_rgb);
            movaps xmm6, xmm2       ; left
            movaps xmm7, xmm3       ; right
            mulps xmm6, xmm1        ; axis
            mulps xmm7, xmm1
            addps xmm6, xmm0        ; average
            addps xmm7, xmm0

IF AXIS_MUNGE
            ; Scale the B component, then subtract the green component resultant in each
            movaps xmm4, xmm6
            movaps xmm5, xmm7
            mulps xmm6, [b_2x]        ; XXX promote to register to improve code density?
            mulps xmm7, [b_2x]
            shufps xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
            shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
            subss xmm6, xmm4
            subss xmm7, xmm5
ENDIF

            ; Rearrange so B and R are in the same register half (they both use 5-bit rounding)
            shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3)    ; B R G
            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

            ; Convert to integer (by truncation, as C code does)
            cvttps2pi mm0, xmm6
            cvttps2pi mm1, xmm7
            movhlps xmm6, xmm6
            movhlps xmm7, xmm7
            cvttps2pi mm2, xmm6
            cvttps2pi mm3, xmm7

            ; mm0/1 is blue/red, mm2/3 is green

            ; This isn't quite the same as the C algorithm, but should generate the same result
            ; if the input range is ensured to be 0-255

            ; This code could be heavily interleaved, but for P4 it's not worth the hassle - the
            ; P4 reordering range of 15 instructions will let it do the job for us

            ; XXX promote all these consts to registers to improve code density?
            pmaxsw mm0, [clamp_0]        ; Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
            pmaxsw mm1, [clamp_0]
            pminsw mm0, [clamp_255]
            pminsw mm1, [clamp_255]
            movq mm6, mm0
            movq mm7, mm1
            paddd mm0, [blue_red_rounding]
            paddd mm1, [blue_red_rounding]
            psrld mm6, 5
            psrld mm7, 5
            psubd mm0, mm6
            psubd mm1, mm7
            ; No need to clamp here, with the input in 0-255 range it can never be outside at the end
            pand mm0, [mask_blue_red]
            pand mm1, [mask_blue_red]

            ; Separate out R and B as they will need separate shifts later
            pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3)    ; extract R    (this is an SSE, not MMX, instruction)
            pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3)    ; also R

            pmaxsw mm2, [clamp_0]
            pmaxsw mm3, [clamp_0]
            pminsw mm2, [clamp_255]
            pminsw mm3, [clamp_255]
            movq mm6, mm2
            movq mm7, mm3
            paddd mm2, [green_rounding]
            paddd mm3, [green_rounding]
            psrld mm6, 6
            psrld mm7, 6
            psubd mm2, mm6
            psubd mm3, mm7
            pand mm2, [mask_green]
            pand mm3, [mask_green]


            ; Convert the 8-bit values to final RGB565 colours in mm0 and mm1
            psrld mm0, 3
            psrld mm1, 3
            pslld mm4, 8
            pslld mm5, 8
            pslld mm2, 3
            pslld mm3, 3
            por mm0, mm4
            por mm1, mm5
            por mm0, mm2
            por mm1, mm3

            ; mm0 and mm1 are c0 and c1


            ; Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....

            ; rdx contains the destination DXTC block (dx == DXTC)

            pxor mm5, mm5
            punpcklwd mm0, mm5    ; unpack c0/c1 to DWORD's as pcmp is a signed comparison
            punpcklwd mm1, mm5
            movq mm2, mm0
            movq mm3, mm0
            movq mm4, mm0
            pcmpgtd mm2, mm1
            pxor mm2, qword ptr [invert]    ; Need less than, so flip the result
            movd dword ptr [rbp-TMP_SWAP], mm2    ; Set the swap flag (used below) appropriately
            ; mm2 is the mask to indicate flipping is needed

            pcmpeqd mm4, mm1
            movd ebx, mm4        ; ebx is the equality flag, plenty of time for this slow move to resolve

            punpcklwd mm0, mm1    ; 'normal' order
            punpcklwd mm1, mm3    ; reversed order
            pand mm1, mm2
            pandn mm2, mm0
            por mm1, mm2        ; one of the two, selected by mm2
            movd dword ptr [rdx], mm1        ; write the result


        ; Colour writes complete - do some housekeeping

            ; Clear the output bitmasks
            add rdx,4
            mov dword ptr [rdx], 0

            ; If the values are equal, the bit selector is 0 because the two colours are
            ; the same (which implies transparent)
            ; This seems the easiest way to do it, and will only rarely break branch prediction on
            ; typical images.
            test ebx, ebx
            jnz all_done

IF 0
            ; Sanity check
            movzx eax, word ptr [rdi-4]
            movzx ebx, word ptr [rdi-2]
            cmp eax, ebx
            jge fine
            int 3
            fine:
ENDIF


        ; Final clustering, creating the 2-bit values that define the output

            movaps xmm7, xmm3           ; right
            mulps xmm3, [split_point]
            addps xmm7, xmm2            ; left
            mulps xmm7, [half]            ; centre (probably 0, but what the hell)

            lea rsi, [rbp-TMP_POS_ON_AXIS]
            lea rdi, [expandtable]

            mov r8d,[rbp-TMP_SWAP]
            and r8d, 0fh

            ; Main cluster loop
            movaps xmm0,[clearsign]        ; Promote this outside the loop to improve code density
            mov ecx, 4
            next_bit_loop:    ; Do 4 at once
                movaps xmm4, [rsi]        ; Read the four pos_on_axis entries
                add rsi, 16               ; full 64-bit add: rsi is a stack pointer
                movaps xmm5, xmm4
                andps xmm4, xmm0        ; Clear the sign bits in this copy
                cmpltps xmm4, xmm3        ; < division means 2
                cmpnltps xmm5, xmm7        ; >= centre means add 1

                movmskps eax, xmm4
                movmskps ebx, xmm5
                xor ebx,r8d             ; Swap the order if we exchanged the colours
                mov al, byte ptr [rdi+rax+16]
                or al, byte ptr [rdi+rbx]
                mov byte ptr [rdx], al    ; rdx walks the four selector bytes at block_dxtc+4 .. +7

                add rdx, 1

                sub ecx, 1
                jne next_bit_loop

        ; Complete

        all_done:

            ; Restore changed regs and exit

            movaps xmm6, [rbp-TMP_REGSAVE+( 0*16)]
            movaps xmm7, [rbp-TMP_REGSAVE+( 1*16)]
            movaps xmm8, [rbp-TMP_REGSAVE+( 2*16)]

            mov rsp, r11            ; release the locals: back to the rsp seen after the pushes
            pop rbp
            pop rbx
            pop rdi
            pop rsi
            ret
            

DXTCV11CompressBlockSSE     ENDP

END
add AMD CompressionLib 2020-07-31 11:31:32 +08:00			`;===============================================================================`
			`; Copyright (c) 2004-2006 ATI Technologies Inc.`
			`;===============================================================================`


			`AXIS_MUNGE equ 1`
			`PROGRESSIVE_REFINEMENT equ 1`

			`.DATA`

			`ALIGN 16`


			`EXTERN zero : XMMWORD`
			`EXTERN one : XMMWORD`
			`EXTERN one_over_16 : XMMWORD`
			`EXTERN one_over_16_x_255_zeros : XMMWORD`
			`EXTERN clearsign : XMMWORD`
			`EXTERN signbit : XMMWORD`
			`EXTERN half : XMMWORD`
			`EXTERN c255 : XMMWORD`
			`EXTERN c3 : XMMWORD`
			`EXTERN lots : XMMWORD`
			`EXTERN minuslots : XMMWORD`

			`EXTERN mask_green : MMWORD`
			`EXTERN mask_blue_red : MMWORD`
			`EXTERN clamp_0 : MMWORD`
			`EXTERN clamp_255 : MMWORD`
			`EXTERN green_rounding : MMWORD`
			`EXTERN blue_red_rounding: MMWORD`

			`EXTERN rb_sign_bits : XMMWORD`

			`EXTERN stepsize : XMMWORD`
			`EXTERN onethird : XMMWORD`

			`EXTERN maxerror_init : DWORD`
			`EXTERN maxerror_epsilon : DWORD`

			`EXTERN b_half : XMMWORD`
			`EXTERN b_2x : XMMWORD`

			`EXTERN invert : XMMWORD`


			`EXTERN split_point : XMMWORD`
			`EXTERN deviation_point : XMMWORD`


			`EXTERN expandtable : DWORD`



			`EXTERN expandtable : DWORD`


			`EXTERN _0f0f0f0f0f0f0f0f : MMWORD`
			`EXTERN _0707070707070707 : MMWORD`
			`EXTERN _000f000f000f000f : MMWORD`
			`EXTERN _00f000f000f000f0 : MMWORD`


			`.CODE`


			`PARALLEL_ADD_3D MACRO reg, tmp1, tmp2`
			`movaps tmp1,reg`
			`movhlps tmp2,reg`
			`shufps reg,reg, 055h`
			`addss tmp1,tmp2`
			`addss reg,tmp1`

			`ENDM`

			`SHUFFLE_SELECT MACRO a,b,c,d`
			`LOCAL Value`
			`IF (a gt 3) or (b gt 3) or (c gt 3) or (d gt 3)`
			`.ERR`
			`EXITM <0>`
			`ENDIF`
			`Value = ((a) OR (b SHL 2) OR (c SHL 4) OR (d SHL 6))`
			`EXITM %Value`
			`ENDM`



			`; x64 uses register calling conventions`
			`; The first four parameters are put in rcx, rdx, r8, r9 (floats would be in xmm0-3)`
			`; rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above - all others must be saved`

			`; void __cdecl DXTCCompressBlockSSE(DWORD block_32, DWORD block_dxtc);`
			`; block_dxtc == rdx, how convenient!`



			`DXTCV11CompressBlockSSE PROC`

			`push rsi`
			`push rdi`
			`push rbx`
			`push rbp`

			`; Set up a 16-byte aligned storage space pointer`
			`mov rbp, rsp`
			`and rbp, NOT 0fh`

			`SAVED_REGS equ 3`
			`TMP_REGSAVE equ (SAVED_REGS*16)`

			`; Any xmm regs over 5 need to be saved here as well`
			`movaps [rbp-TMP_REGSAVE-( 0*16)], xmm6`
			`movaps [rbp-TMP_REGSAVE-( 1*16)], xmm7`
			`movaps [rbp-TMP_REGSAVE-( 2*16)], xmm8`


			`; Other locals`
			`TMP_RGB equ (TMP_REGSAVE+(16*16))`
			`TMP_POS_ON_AXIS equ (TMP_RGB+(16*4))`

			`TMP_CLUSTERPOS equ (TMP_POS_ON_AXIS+16)`
			`TMP_OLDLEFT equ (TMP_CLUSTERPOS+16)`
			`TMP_OLDRIGHT equ (TMP_OLDLEFT+16)`
			`TMP_RIGHTSAVE equ (TMP_OLDRIGHT+16)`
			`TMP_AXISSAVE equ (TMP_RIGHTSAVE+16)`
			`TMP_AVGSAVE equ (TMP_AXISSAVE+16)`

			`TMP_CLUSTERS equ (TMP_AVGSAVE+(16*4))`

			`TMP_MAXERROR equ (TMP_CLUSTERS+16)`
			`TMP_SWAP equ (TMP_MAXERROR+4)`


			`; Convert the pixel values to float and find their average position`

			`xorps xmm0, xmm0 ; average (r, g, b)`

			`; The input data is in rcx`
			`lea rdi, [rbp-TMP_RGB]`
			`mov eax, 16`
			`xorps xmm2,xmm2`

			`average_loop:`
			`movd xmm7, dword ptr [rcx]`
			`punpcklbw xmm7,xmm2`
			`punpcklwd xmm7,xmm2`
			`cvtdq2ps xmm7,xmm7`
			`add rcx, 4`

			`IF AXIS_MUNGE`
			`; colourspace conversion to increase weight of G at expense of B`
			`movaps xmm6, xmm7`
			`shufps xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1) ; G`
			`addss xmm7, xmm6`
			`mulss xmm7, dword ptr [b_half]`
			`ENDIF`

			`movaps [rdi], xmm7 ; save the value off in the RGB float array`
			`add edi, 16`
			`addps xmm0, xmm7 ; accumulate average`

			`sub eax, 1`
			`jne average_loop`

			`; Compute average of the values`
			`mulps xmm0, [one_over_16]`



			`; For each component, reflect points about the average so all lie on the same side`
			`; of the average, and compute the new average - this gives a second point that defines the axis`
			`; To compute the sign of the axis sum the positive differences of G for each of R and B (the`
			`; G axis is always positive in this implementation`

			`xorps xmm1, xmm1 ; axis (r, g, b)`
			`xorps xmm2, xmm2 ; rg_pos, rb_pos, bg_pos`
			`lea rdi, [rbp-TMP_RGB]`

			`mov ecx, 16`
			`find_axis_loop:`
			`movaps xmm7, [rdi] ; R G B value`
			`add rdi, 16`
			`subps xmm7, xmm0 ; subtract average - centred`
			`movaps xmm6, xmm7`
			`movaps xmm5, xmm7`

			`andps xmm7, [clearsign] ; fabs (r, g, b)`
			`addps xmm1, xmm7 ; accumulate direction of axis`

			`shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) ; B R R 0`
			`shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) ; G B G 0`

			`cmpnltps xmm6, [zero] ; R/B > 0?`
			`andps xmm6, xmm5 ; insert the G or B value for those channels which are positive`
			`addps xmm2, xmm6 ; bg_pos rb_pos rg_pos`

			`sub ecx, 1`
			`jne find_axis_loop`


			`; Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.`
			`movaps xmm5, xmm2 ; duplicate the pos across these three`
			`movaps xmm6, xmm2`
			`movaps xmm7, xmm2`
			`shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) ; RB_pos 0 ->`
			`shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) ; RG_pos ->`
			`shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) ; BG_pos ->`
			`orps xmm6, xmm7`
			`cmpneqps xmm6, [zero] ; so check for any non-zero in RG_pos or BG_pos`
			`andps xmm2, xmm6 ; Mask out RG_pos in current if we need to the current`
			`xorps xmm6, [invert]`
			`andps xmm5, xmm6`
			`orps xmm2, xmm5 ; insert RB pos instead`


			`; Change the sign of the R and B portions of the axis appropriately`
			`cmpltps xmm2, [zero]`
			`andps xmm2, [rb_sign_bits]`
			`xorps xmm1, xmm2 ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope`



			`; Axis projection and remapping`

			`; Normalise the axis for simplicity of future calculation`
			`movaps xmm7, xmm1`

			`mulps xmm7, xmm7`
			`PARALLEL_ADD_3D xmm7 ,xmm6,xmm5`

			`; low of xmm7 is the DP result`
			`; If this is 0 we haven't actually got an axis, and we can't rsq it,`
			`; so mask the output to 0 in this case. This generates an acceptable result`
			`movss xmm2, xmm7`
			`cmpneqss xmm2, dword ptr [zero]`

			`; RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality`
			`; and consistency improves with it in on certain images. It's not a large cost so leave it in.`
			`rsqrtss xmm3, xmm7`
			`andps xmm3, xmm2`

			`movss xmm2, xmm3`
			`mulss xmm3, xmm7`
			`mulss xmm3, xmm2`
			`mulss xmm2, dword ptr [half]`
			`movss xmm7, dword ptr [c3]`
			`subss xmm7, xmm3`
			`mulss xmm7, xmm2`

			`shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)`

			`; Normalise`
			`mulps xmm1, xmm7`


			`; Map the axis`

			`lea rsi, [rbp-TMP_RGB]`
			`lea rdi, [rbp-TMP_POS_ON_AXIS]`
			`mov ecx, 16`
			`movaps xmm2, [lots] ; left`
			`movaps xmm3, [minuslots] ; right`

			`map_axis_loop:`
			`movaps xmm7, [rsi]`
			`subps xmm7, xmm0 ; subtract average`

			`; dot product with axis`
			`mulps xmm7, xmm1`
			`PARALLEL_ADD_3D xmm7 ,xmm6,xmm5`

			`; xmm7 == pos_on_axis`

			`movss dword ptr [rdi], xmm7`
			`add rdi, 4`

			`minss xmm2, xmm7 ; calculate left`
			`maxss xmm3, xmm7 ; calculate right`

			`add rsi, 16`

			`sub ecx, 1`
			`jne map_axis_loop`


			`; We have a good axis and the basic information about how the points are mapped to it`
			`; We need to calculate the endpoints - the initial guess is to use the extremities.`

			`; Left and right are used across all the simds`
			`shufps xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0) ; left`
			`shufps xmm3, xmm3, SHUFFLE_SELECT(0, 0, 0, 0) ; right`


			`; To simplify further calculations, we offset everything such that the axis centre is at 0`

			`; Calculate centre`
			`movaps xmm7, xmm2 ; left`
			`addps xmm7, xmm3 ; right`
			`mulps xmm7, [half]`

			`; Offset all the axis positions to the centre`
			`lea rdi, [rbp-TMP_POS_ON_AXIS]`
			`movaps xmm5, [rdi]`
			`movaps xmm6, [rdi+16]`
			`subps xmm5, xmm7`
			`subps xmm6, xmm7`
			`movaps [rdi], xmm5`
			`movaps [rdi+16], xmm6`
			`movaps xmm5, [rdi+32]`
			`movaps xmm6, [rdi+48]`
			`subps xmm5, xmm7`
			`subps xmm6, xmm7`
			`movaps [rdi+32], xmm5`
			`movaps [rdi+48], xmm6`

			`; Offset left, right and average to centre`
			`subps xmm2, xmm7 ; left`
			`subps xmm3, xmm7 ; right`

			`mulps xmm7, xmm1 ; convert to rgb by multiplying by axis`
			`addps xmm0, xmm7 ; average`



			`IF PROGRESSIVE_REFINEMENT`

			`; Progressive refinement of the endpoints.`
			`;`
			`; The first-guess solution found so far simply takes the extreme ends. To reduce`
			`; noise in the output image, both endpoints are repeatedly moved inwards (scaled`
			`; by stepsize) for as long as the summed squared error keeps improving, i.e. until`
			`; a local minimum is found. This provides quite a significant improvement in`
			`; image quality.`
			`;`
			`; Register roles while refining:`
			`;   xmm2 = left endpoint, xmm3 = right endpoint`
			`;          (xmm3 doubles as the split threshold during the error pass and is`
			`;           reloaded from TMP_RIGHTSAVE afterwards)`
			`;   xmm7 = centre of left/right`
			`;   xmm8 = squared error accumulated for the current step`
			`;   rdi  = TMP_CLUSTERPOS, four floats indexed by cluster number`

			`mov eax, [maxerror_init]`
			`mov [rbp-TMP_MAXERROR], eax ; best error so far starts at the initial bound`

			`movaps [rbp-TMP_OLDLEFT], xmm2 ; last accepted endpoints (XXX register space expansion)`
			`movaps [rbp-TMP_OLDRIGHT], xmm3`

			`lea rdi, [rbp-TMP_CLUSTERPOS]`

			`next_refinement_loop:`

			`movaps [rbp-TMP_RIGHTSAVE], xmm3 ; xmm3 is reused below (XXX register space expansion)`

			`xorps xmm8,xmm8 ; clear the error accumulator for this step`

			`; Update the array of cluster positions based on the new values of left and right:`
			`;   [rdi+0]  = left               [rdi+4]  = right`
			`;   [rdi+8]  = centre - deviation [rdi+12] = centre + deviation`
			`; where deviation = (right - centre) * deviation_point`
			`movss xmm7, xmm2 ; left`
			`addss xmm7, xmm3 ; + right`
			`mulss xmm7, dword ptr [half] ; xmm7 = centre`
			`movss dword ptr [rdi], xmm2 ; cluster 0 = left`
			`movss dword ptr [rdi+4], xmm3 ; cluster 1 = right`
			`movss xmm5, xmm3 ; right`
			`movss xmm6, xmm7`
			`subss xmm5, xmm7 ; right-centre`
			`movss xmm4, xmm7`
			`movss xmm3, xmm5 ; keep right-centre for the split threshold`
			`mulss xmm5, dword ptr [deviation_point]`
			`subss xmm6, xmm5`
			`addss xmm4, xmm5`
			`movss dword ptr [rdi+8], xmm6 ; cluster 2 = centre - deviation`
			`movss dword ptr [rdi+12], xmm4 ; cluster 3 = centre + deviation`

			`mulss xmm3, dword ptr [split_point] ; xmm3 = split threshold between inner and outer clusters`


			`; Calculate the error of the current endpoints over the 16 pos_on_axis entries`
			`mov ecx, 16`
			`lea rsi, [rbp-TMP_POS_ON_AXIS]`
			`next_builderror_loop:`
			`movss xmm4, dword ptr [rsi]`
			`add rsi, 4`
			`movaps xmm5, xmm4`
			`movaps xmm6, xmm4`
			`andps xmm4, [clearsign] ; magnitude of the position`
			`cmpltss xmm4, xmm3 ; < division means 2`
			`cmpnltss xmm5, xmm7 ; >= centre means add 1`

			`movmskps eax, xmm4`
			`movmskps ebx, xmm5`
			`lea eax, [rbx+2*rax] ; rax == which cluster (writing eax clears the top 32 bits)`

			`subss xmm6, dword ptr [rdi+4*rax] ; rdi = array of positions along axis of cluster`
			`mulss xmm6, xmm6 ; square to use MSE and (conveniently) eliminate sign`
			`addss xmm8, xmm6 ; Accumulate to the MSE`

			`sub ecx, 1`
			`jne next_builderror_loop`


			`movaps xmm3, [rbp-TMP_RIGHTSAVE] ; restore the right XXX register space expansion`

			`; Stop as soon as this step's error exceeds the stored bound`
			`movss xmm5, dword ptr [rbp-TMP_MAXERROR]`
			`cmpltss xmm5, xmm8`
			`movmskps eax, xmm5`
			`test eax,1`
			`jnz refinement_done`


			`; Accept this step. The stored bound is this step's error minus maxerror_epsilon,`
			`; so the next step has to improve on it by that margin to be accepted (assumes`
			`; maxerror_epsilon is positive - confirm against its definition). The epsilon`
			`; must be taken off xmm8, the register that is stored; xmm6 is only scratch here.`
			`subss xmm8, dword ptr [maxerror_epsilon]`
			`movss dword ptr [rbp-TMP_MAXERROR], xmm8`

			`movaps [rbp-TMP_OLDLEFT], xmm2 ; left XXX register space expansion`
			`movaps [rbp-TMP_OLDRIGHT], xmm3 ; right XXX register space expansion`

			`; step left and right in a bit`
			`mulps xmm2, [stepsize] ; XXX promote to register to improve code density?`
			`mulps xmm3, [stepsize]`

			`jmp next_refinement_loop`

			`refinement_done:`
			`; The step that was just measured was rejected: go back to the last accepted endpoints`
			`movaps xmm2, [rbp-TMP_OLDLEFT] ; left`
			`movaps xmm3, [rbp-TMP_OLDRIGHT] ; right`
			`ENDIF`

			`; The endpoints (xmm2 = left, xmm3 = right) and the axis (xmm1) are now final, so we have all the information we need to compress the block`


			`; Calculate the high and low output colour values (built as RGB565 in mm0 and mm1 below)`

			`; Involved in this is a complex rounding procedure.`
			`; A straight rounded average is not correct, as the decompressor expands each channel by replicating`
			`; its top bits into the bottom bits.`

			`; In order to take account of this process, we don't just apply a straight rounding correction,`
			`; but base our rounding on the input value (a straight rounding is actually pretty good in terms of`
			`; error measure, but creates a visual colour and/or brightness shift relative to the original image).`
			`; The method used here is to apply a centre-biased rounding dependent on the input value, which was`
			`; (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of`
			`; the image.`
			`; rgb = average_rgb + (left or right) * v_rgb, where v_rgb is the axis`
			`movaps xmm6, xmm2 ; left`
			`movaps xmm7, xmm3 ; right`
			`mulps xmm6, xmm1 ; axis`
			`mulps xmm7, xmm1`
			`addps xmm6, xmm0 ; average`
			`addps xmm7, xmm0`

			`IF AXIS_MUNGE`
			`; Scale each endpoint by b_2x, then subtract its green component (lane 1) from lane 0 (B)`
			`movaps xmm4, xmm6`
			`movaps xmm5, xmm7`
			`mulps xmm6, [b_2x] ; XXX promote to register to improve code density?`
			`mulps xmm7, [b_2x]`
			`shufps xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)`
			`shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)`
			`subss xmm6, xmm4`
			`subss xmm7, xmm5`
			`ENDIF`

			`; Rearrange so B and R are in the same register half (they both use 5-bit rounding)`
			`shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) ; lanes become B R G (lanes 1 and 2 exchanged)`
			`shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)`

			`; Convert to integer (by truncation, as C code does); each cvttps2pi converts the two low lanes`
			`cvttps2pi mm0, xmm6`
			`cvttps2pi mm1, xmm7`
			`movhlps xmm6, xmm6`
			`movhlps xmm7, xmm7`
			`cvttps2pi mm2, xmm6`
			`cvttps2pi mm3, xmm7`

			`; mm0/mm1 hold blue/red for left/right, mm2/mm3 hold green for left/right`

			`; This isn't quite the same as the C algorithm, but should generate the same result`
			`; if the input range is ensured to be 0-255`

			`; This code could be heavily interleaved, but for P4 it's not worth the hassle - the`
			`; P4 reordering range of 15 instructions will let it do the job for us`

			`; XXX promote all these consts to registers to improve code density?`
			`pmaxsw mm0, [clamp_0] ; Note: faster to do these max/min in MMX than float XMM - better reordering opportunities`
			`pmaxsw mm1, [clamp_0]`
			`pminsw mm0, [clamp_255]`
			`pminsw mm1, [clamp_255]`
			`movq mm6, mm0`
			`movq mm7, mm1`
			`paddd mm0, [blue_red_rounding]`
			`paddd mm1, [blue_red_rounding]`
			`psrld mm6, 5`
			`psrld mm7, 5`
			`psubd mm0, mm6`
			`psubd mm1, mm7`
			`; No need to clamp here: with the input in the 0-255 range the result can never be outside it`
			`pand mm0, [mask_blue_red]`
			`pand mm1, [mask_blue_red]`

			`; Separate out R and B as they will need separate shifts later`
			`pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) ; extract R into the low dword (pshufw is an SSE, not MMX, instruction)`
			`pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) ; also R, for the other endpoint`

			`pmaxsw mm2, [clamp_0]`
			`pmaxsw mm3, [clamp_0]`
			`pminsw mm2, [clamp_255]`
			`pminsw mm3, [clamp_255]`
			`movq mm6, mm2`
			`movq mm7, mm3`
			`paddd mm2, [green_rounding]`
			`paddd mm3, [green_rounding]`
			`psrld mm6, 6`
			`psrld mm7, 6`
			`psubd mm2, mm6`
			`psubd mm3, mm7`
			`pand mm2, [mask_green]`
			`pand mm3, [mask_green]`


			`; Convert the 8-bit values to final RGB565 colours in mm0 and mm1 (B >> 3, R << 8 and G << 3, ORed together)`
			`psrld mm0, 3`
			`psrld mm1, 3`
			`pslld mm4, 8`
			`pslld mm5, 8`
			`pslld mm2, 3`
			`pslld mm3, 3`
			`por mm0, mm4`
			`por mm1, mm5`
			`por mm0, mm2`
			`por mm1, mm3`

			`; mm0 and mm1 are c0 and c1 (from the left and right endpoints respectively)`


			`; Need to compare c0/c1, exchange them and set the swap flag if required - and handle colour equality as well`

			`; rdx contains the destination DXTC block (dx == DXTC)`

			`pxor mm5, mm5`
			`punpcklwd mm0, mm5 ; unpack c0/c1 to DWORDs as pcmp is a signed comparison`
			`punpcklwd mm1, mm5`
			`movq mm2, mm0`
			`movq mm3, mm0`
			`movq mm4, mm0`
			`pcmpgtd mm2, mm1`
			`pxor mm2, qword ptr [invert] ; Need less than, so flip the result`
			`movd dword ptr [rbp-TMP_SWAP], mm2 ; Set the swap flag (read back before the final clustering loop) appropriately`
			`; mm2 is the mask to indicate flipping is needed`

			`pcmpeqd mm4, mm1`
			`movd ebx, mm4 ; ebx is the equality flag (non-zero when c0 == c1), plenty of time for this slow move to resolve`

			`punpcklwd mm0, mm1 ; 'normal' order: c0 in the low word, c1 in the high word`
			`punpcklwd mm1, mm3 ; reversed order: c1 in the low word, c0 in the high word`
			`pand mm1, mm2`
			`pandn mm2, mm0`
			`por mm1, mm2 ; one of the two, selected by mm2 (reversed order where the mask is set)`
			`movd dword ptr [rdx], mm1 ; write the two 16-bit colours as one dword`


			`; Both colours are stored; prepare the selector dword that follows them`

			`; Zero the output bitmask, then advance rdx so it points at it`
			`mov dword ptr [rdx+4], 0`
			`add rdx, 4`

			`; When c0 == c1 (ebx non-zero) every selector stays 0, since both colours are`
			`; identical anyway (which implies transparent), so the clustering pass is skipped.`
			`; A branch is the simplest way to do this and should rarely be mispredicted on`
			`; typical images.`
			`test ebx, ebx`
			`jne all_done`

			`IF 0`
			`; Sanity check (disabled): trap if the first stored word is less than the second`
			`movzx eax, word ptr [rdi-4]`
			`movzx ebx, word ptr [rdi-2]`
			`cmp eax, ebx`
			`jge fine`
			`int 3`
			`fine:`
			`ENDIF`



			`; Final clustering, creating the 2-bit values that define the output.`
			`;`
			`; Each pass classifies four pos_on_axis entries at once and writes one selector`
			`; byte to [rdx]; four passes cover all 16 entries.`
			`;   xmm3 = split threshold, xmm7 = centre, xmm0 = clearsign mask`
			`;   rsi  = next four pos_on_axis entries, rdi = expandtable, rdx = next output byte`
			`;   r8d  = low 4 bits of the swap flag, one bit per lane`

			`movaps xmm7, xmm3 ; right`
			`mulps xmm3, [split_point] ; xmm3 = right * split_point, the inner/outer threshold`
			`addps xmm7, xmm2 ; left`
			`mulps xmm7, [half] ; centre (probably 0, but what the hell)`

			`lea rsi, [rbp-TMP_POS_ON_AXIS]`
			`lea rdi, [expandtable]`

			`mov r8d,[rbp-TMP_SWAP]`
			`and r8d, 0fh ; movmskps yields 4 bits, so keep 4 swap bits`

			`; Main cluster loop`
			`movaps xmm0,[clearsign] ; Promote this outside the loop to improve code density`
			`mov ecx, 4`
			`next_bit_loop: ; Do 4 at once`
			`movaps xmm4, [rsi] ; Read the four pos_on_axis entries`
			`add rsi, 16 ; must be the full 64-bit register: writing esi would zero the upper half of the pointer`
			`movaps xmm5, xmm4`
			`andps xmm4, xmm0 ; Clear the sign bits in this copy`
			`cmpltps xmm4, xmm3 ; < division means 2`
			`cmpnltps xmm5, xmm7 ; >= centre means add 1`

			`movmskps eax, xmm4`
			`movmskps ebx, xmm5`
			`xor ebx,r8d ; Swap the order if we exchanged the colours`
			`mov al, byte ptr [rdi+rax+16] ; expandtable bytes 16..31 are indexed by the "< division" mask`
			`or al, byte ptr [rdi+rbx] ; expandtable bytes 0..15 are indexed by the ">= centre" mask`
			`mov byte ptr [rdx], al ; store the selector byte for these four entries`

			`add rdx, 1`

			`sub ecx, 1`
			`jne next_bit_loop`

			`; Complete - reached after the clustering loop, or directly when both colours are equal`

			`all_done:`

			`; Restore changed regs and exit: xmm6-xmm8 are reloaded from the TMP_REGSAVE area and rbp/rbx/rdi/rsi are popped (presumably mirroring pushes on entry, which are outside this part of the file - confirm). NOTE(review): mm0-mm7 are used above and no emms is visible before this ret - confirm the MMX state is cleared before any x87 code runs`

			`movaps xmm6, [rbp-TMP_REGSAVE-( 0*16)]`
			`movaps xmm7, [rbp-TMP_REGSAVE-( 1*16)]`
			`movaps xmm8, [rbp-TMP_REGSAVE-( 2*16)]`

			`pop rbp`
			`pop rbx`
			`pop rdi`
			`pop rsi`
			`ret`


			`DXTCV11CompressBlockSSE ENDP`

			`END`