;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================


; Assembly-time feature switches, tested with IF inside DXTCV11CompressBlockSSE.
; Non-zero enables the corresponding code, zero assembles it out.
AXIS_MUNGE                equ 1    ; fold G into element 0 (B) of each pixel before the axis fit, undo it on the endpoints
PROGRESSIVE_REFINEMENT    equ 1    ; iteratively pull the endpoints inwards while the error keeps dropping

.DATA

ALIGN 16

; Constants defined in another module.  The size given after the colon only
; controls how the assembler types operands that omit an explicit "... ptr".

; --- Packed-float constants (128-bit) ------------------------------------------
EXTERN zero                : XMMWORD
EXTERN one                : XMMWORD        ; not referenced by the code visible in this file
EXTERN one_over_16        : XMMWORD
EXTERN one_over_16_x_255_zeros    : XMMWORD    ; not referenced by the code visible in this file
EXTERN clearsign        : XMMWORD        ; ANDed with floats to take their absolute value
EXTERN signbit            : XMMWORD        ; not referenced by the code visible in this file
EXTERN half                : XMMWORD
EXTERN c255                : XMMWORD        ; not referenced by the code visible in this file
EXTERN c3                : XMMWORD
EXTERN lots                : XMMWORD        ; initial value for the running minimum ("left")
EXTERN minuslots        : XMMWORD        ; initial value for the running maximum ("right")

; --- MMX constants used when packing the endpoints to RGB565 (64-bit) ----------
EXTERN mask_green        : MMWORD
EXTERN mask_blue_red    : MMWORD
EXTERN clamp_0            : MMWORD
EXTERN clamp_255        : MMWORD
EXTERN green_rounding    : MMWORD
EXTERN blue_red_rounding: MMWORD

; --- Axis sign selection --------------------------------------------------------
EXTERN rb_sign_bits        : XMMWORD

; --- Progressive refinement -----------------------------------------------------
EXTERN stepsize            : XMMWORD
EXTERN onethird            : XMMWORD        ; not referenced by the code visible in this file

EXTERN maxerror_init    : DWORD
EXTERN maxerror_epsilon    : DWORD

; --- AXIS_MUNGE scale factors ---------------------------------------------------
EXTERN b_half            : XMMWORD
EXTERN b_2x                : XMMWORD

EXTERN invert            : XMMWORD        ; XORed with compare masks to complement them

; --- Cluster boundaries ---------------------------------------------------------
EXTERN split_point        : XMMWORD
EXTERN deviation_point    : XMMWORD

; --- Selector lookup table (read one byte at a time) ---------------------------
; Declared once; the original listing repeated this EXTERN a second time.
EXTERN expandtable        : DWORD

; --- Bit masks not referenced by the code visible in this file -----------------
EXTERN _0f0f0f0f0f0f0f0f : MMWORD
EXTERN _0707070707070707 : MMWORD
EXTERN _000f000f000f000f : MMWORD
EXTERN _00f000f000f000f0 : MMWORD


.CODE


; PARALLEL_ADD_3D vec, lo, hi
;   Horizontal add of the first three floats of an XMM register.
;   On exit:  vec[0]    = vec[0] + vec[1] + vec[2]   (summed as vec[1] + (vec[0] + vec[2]))
;             vec[1..3] = original vec[1]
;             lo        = copy of the original vec with lo[0] = vec[0] + vec[2]
;             hi[0..1]  = original vec[2..3], hi[2..3] unchanged
PARALLEL_ADD_3D        MACRO vec, lo, hi
    movaps  lo, vec             ; lo[0] = element 0
    movhlps hi, vec             ; hi[0] = element 2
    addss   lo, hi              ; lo[0] = element 0 + element 2
    shufps  vec, vec, 055h      ; broadcast element 1 to every lane
    addss   vec, lo             ; vec[0] = element 1 + (element 0 + element 2)

    ENDM

; SHUFFLE_SELECT(a, b, c, d)
;   Macro function that builds the 8-bit immediate for SHUFPS / PSHUFW:
;   destination element 0 takes source element a, element 1 takes b, element 2
;   takes c and element 3 takes d.  Each selector must be in the range 0..3;
;   anything else raises an assembly error and yields 0.
;
;   Every argument is parenthesised before use.  SHL binds tighter than + and -
;   in MASM expressions, so without the parentheses an argument written as an
;   expression (e.g. 1+1) would be encoded incorrectly.
SHUFFLE_SELECT        MACRO a,b,c,d
    LOCAL Value
    IF ((a) lt 0) or ((b) lt 0) or ((c) lt 0) or ((d) lt 0) or ((a) gt 3) or ((b) gt 3) or ((c) gt 3) or ((d) gt 3)
        .ERR
        EXITM <0>
    ENDIF
    Value = ((a) OR ((b) SHL 2) OR ((c) SHL 4) OR ((d) SHL 6))
    EXITM %Value
    ENDM


; x64 uses register calling conventions
; The first four parameters are put in rcx, rdx, r8, r9 (floats would be in xmm0-3)
; rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above - all others must be saved

; void __cdecl DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
; block_dxtc == rdx, how convenient!


;-------------------------------------------------------------------------------
; DXTCV11CompressBlockSSE
;
; Compresses 16 four-byte pixels into an 8-byte DXTC colour block: two RGB565
; endpoint colours (4 bytes) followed by sixteen 2-bit selectors (4 bytes).
;
; In:    rcx = source pixels, 16 DWORDs are read
;        rdx = destination block, 8 bytes are written
; Out:   no register result
; Saved: rsi, rdi, rbx, rbp, xmm6, xmm7, xmm8 (restored before returning)
; Clobb: rax, rcx, rdx, r8, r11, xmm0-xmm5, mm0-mm7, flags
; Stack: a 16-byte aligned frame of TMP_FRAME_SIZE bytes is reserved below the
;        four pushed registers.  The procedure makes no calls, so r11 is used to
;        carry the pre-alignment rsp to the epilogue.
;
; NOTE(review): MMX registers are used and no EMMS is executed before RET, so
; the x87/MMX state is left in MMX mode.  Confirm the caller issues EMMS before
; any x87 code runs.
; NOTE(review): no unwind information is emitted for this procedure.
;-------------------------------------------------------------------------------
DXTCV11CompressBlockSSE PROC 

; Frame layout.  Every TMP_* value is a positive distance below the aligned
; frame base in rbp: [rbp-TMP_x] is the LOWEST address of item x, and the item
; extends upwards from there.

SAVED_REGS            equ 3
TMP_REGSAVE            equ (SAVED_REGS*16)        ; xmm6..xmm8 save area: rbp-48 .. rbp-1

; Other locals
TMP_RGB             equ (TMP_REGSAVE+(16*16))    ; 16 pixels x 4 floats
TMP_POS_ON_AXIS        equ (TMP_RGB+(16*4))        ; 16 floats

TMP_CLUSTERPOS        equ (TMP_POS_ON_AXIS+16)    ; 4 floats
TMP_OLDLEFT            equ (TMP_CLUSTERPOS+16)
TMP_OLDRIGHT        equ (TMP_OLDLEFT+16)
TMP_RIGHTSAVE        equ (TMP_OLDRIGHT+16)
TMP_AXISSAVE        equ (TMP_RIGHTSAVE+16)        ; not referenced in this procedure
TMP_AVGSAVE         equ (TMP_AXISSAVE+16)        ; not referenced in this procedure

TMP_CLUSTERS        equ (TMP_AVGSAVE+(16*4))    ; not referenced in this procedure

TMP_MAXERROR        equ (TMP_CLUSTERS+16)
TMP_SWAP            equ (TMP_MAXERROR+4)

; Total bytes to reserve below rbp, rounded up so rsp stays 16-byte aligned
TMP_FRAME_SIZE        equ ((TMP_SWAP+15) AND (NOT 15))

            push rsi    
            push rdi
            push rbx
            push rbp

            ; Set up a 16-byte aligned storage space pointer and reserve the frame.
            ; There is no red zone in this ABI, so the locals must sit at or above rsp.
            mov r11, rsp                    ; r11 = rsp to restore in the epilogue (nothing below writes r11)
            mov rbp, rsp
            and rbp, NOT 0fh
            lea rsp, [rbp-TMP_FRAME_SIZE]

            ; Any xmm regs over 5 need to be saved here as well.
            ; The slots run UPWARDS from rbp-TMP_REGSAVE; running them downwards would
            ; place xmm7/xmm8 inside the last two entries of the TMP_RGB array.
            movaps [rbp-TMP_REGSAVE+( 0*16)], xmm6
            movaps [rbp-TMP_REGSAVE+( 1*16)], xmm7
            movaps [rbp-TMP_REGSAVE+( 2*16)], xmm8


        ; Convert the pixel values to float and find their average position

            xorps xmm0, xmm0        ; average (r, g, b)

            ; The input data is in rcx
            lea rdi, [rbp-TMP_RGB]
            mov eax, 16
            xorps xmm2,xmm2

            average_loop:
                movd xmm7, dword ptr [rcx]
                punpcklbw xmm7,xmm2        ; bytes -> words
                punpcklwd xmm7,xmm2        ; words -> dwords
                cvtdq2ps xmm7,xmm7
                add rcx, 4

IF AXIS_MUNGE
                ; colourspace conversion to increase weight of G at expense of B
                movaps xmm6, xmm7
                shufps xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1)    ; G
                addss xmm7, xmm6
                mulss xmm7, dword ptr [b_half]
ENDIF

                movaps [rdi], xmm7        ; save the value off in the RGB float array
                add rdi, 16                ; full 64-bit add: a 32-bit add would zero the top half of the pointer
                addps xmm0, xmm7        ; accumulate average

                sub eax, 1
                jne average_loop

            ; Compute average of the values
            mulps xmm0, [one_over_16]


        ; For each component, reflect points about the average so all lie on the same side
        ; of the average, and compute the new average - this gives a second point that defines the axis.
        ; To compute the sign of the axis sum the positive differences of G for each of R and B (the
        ; G axis is always positive in this implementation).

            xorps xmm1, xmm1        ; axis (r, g, b)
            xorps xmm2, xmm2        ; rg_pos, rb_pos, bg_pos
            lea rdi, [rbp-TMP_RGB]

            mov ecx, 16
            find_axis_loop:
                movaps xmm7, [rdi]        ; R G B value
                add rdi, 16
                subps xmm7, xmm0        ; subtract average - centred
                movaps xmm6, xmm7
                movaps xmm5, xmm7

                andps xmm7, [clearsign]    ; fabs (r, g, b)
                addps xmm1, xmm7        ; accumulate direction of axis

                shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3)    ; B R R (element 3 unchanged)
                shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3)    ; G B G (element 3 unchanged)

                cmpnltps xmm6, [zero]    ; R/B >= 0?
                andps xmm6, xmm5        ; insert the G or B value for those channels which are non-negative
                addps xmm2, xmm6        ; bg_pos rb_pos rg_pos

                sub ecx, 1
                jne find_axis_loop


            ; Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
            movaps xmm5, xmm2    ; duplicate the pos across these three
            movaps xmm6, xmm2
            movaps xmm7, xmm2
            shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3)    ; RB_pos 0 ->
            shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2)    ; RG_pos ->
            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)    ; BG_pos ->
            orps xmm6, xmm7
            cmpneqps xmm6, [zero]    ; so check for any non-zero in RG_pos or BG_pos
            andps xmm2, xmm6        ; keep the current values only where that test passed
            xorps xmm6, [invert]
            andps xmm5, xmm6
            orps xmm2, xmm5            ; insert RB pos instead


            ; Change the sign of the R and B portions of the axis appropriately
            cmpltps xmm2, [zero]
            andps xmm2, [rb_sign_bits]
            xorps xmm1, xmm2            ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope


        ; Axis projection and remapping

            ; Normalise the axis for simplicity of future calculation
            movaps xmm7, xmm1

            mulps xmm7, xmm7
            PARALLEL_ADD_3D xmm7    ,xmm6,xmm5

            ; low of xmm7 is the DP result
            ; If this is 0 we haven't actually got an axis, and we can't rsq it,
            ; so mask the output to 0 in this case. This generates an acceptable result
            movss xmm2, xmm7
            cmpneqss xmm2, dword ptr [zero]

            ; RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
            ; and consistency improves with it in on certain images. It's not a large cost so leave it in.
            rsqrtss xmm3, xmm7
            andps xmm3, xmm2

            movss    xmm2, xmm3
            mulss    xmm3, xmm7
            mulss    xmm3, xmm2
            mulss    xmm2, dword ptr [half]
            movss    xmm7, dword ptr [c3]
            subss    xmm7, xmm3
            mulss    xmm7, xmm2

            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

            ; Normalise
            mulps xmm1, xmm7


        ; Map the axis

            lea rsi, [rbp-TMP_RGB]
            lea rdi, [rbp-TMP_POS_ON_AXIS]
            mov ecx, 16
            movaps xmm2, [lots]         ; left
            movaps xmm3, [minuslots]    ; right

            map_axis_loop:
                movaps xmm7, [rsi]
                subps xmm7, xmm0    ; subtract average

                ; dot product with axis
                mulps xmm7, xmm1
                PARALLEL_ADD_3D xmm7    ,xmm6,xmm5

                ; xmm7 == pos_on_axis

                movss dword ptr [rdi], xmm7
                add rdi, 4

                minss xmm2, xmm7        ; calculate left
                maxss xmm3, xmm7        ; calculate right

                add rsi, 16

                sub ecx, 1
                jne map_axis_loop


        ; We have a good axis and the basic information about how the points are mapped to it
        ; We need to calculate the endpoints - the initial guess is to use the extremities.

            ; Left and right are used across all the simds
            shufps xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0)   ; left
            shufps xmm3, xmm3, SHUFFLE_SELECT(0, 0, 0, 0)   ; right


        ; To simplify further calculations, we offset everything such that the axis centre is at 0

            ; Calculate centre
            movaps xmm7, xmm2   ; left
            addps xmm7, xmm3    ; right
            mulps xmm7, [half]

            ; Offset all the axis positions to the centre
            lea rdi, [rbp-TMP_POS_ON_AXIS]
            movaps xmm5, [rdi]
            movaps xmm6, [rdi+16]
            subps xmm5, xmm7
            subps xmm6, xmm7
            movaps [rdi], xmm5
            movaps [rdi+16], xmm6
            movaps xmm5, [rdi+32]
            movaps xmm6, [rdi+48]
            subps xmm5, xmm7
            subps xmm6, xmm7
            movaps [rdi+32], xmm5
            movaps [rdi+48], xmm6

            ; Offset left, right and average to centre
            subps xmm2, xmm7    ; left
            subps xmm3, xmm7    ; right

            mulps xmm7, xmm1    ; convert to rgb by multiplying by axis
            addps xmm0, xmm7    ; average


IF PROGRESSIVE_REFINEMENT

        ; Attempt a (simple) progressive refinement step to reduce noise in the
        ; output image by trying to find a better overall match for the endpoints
        ; than the first-guess solution found so far (which is just to take the ends).

            ; The method is to move the endpoints inwards until a local minimum is found.
            ; This provides quite a significant improvement in image quality.

            mov eax, [maxerror_init]
            mov [rbp-TMP_MAXERROR], eax

            movaps [rbp-TMP_OLDLEFT], xmm2            ; XXX register space expansion
            movaps [rbp-TMP_OLDRIGHT], xmm3            ; XXX register space expansion

            lea rdi, [rbp-TMP_CLUSTERPOS]

            next_refinement_loop:

                movaps [rbp-TMP_RIGHTSAVE], xmm3    ; XXX register space expansion

                xorps xmm8,xmm8                        ; Error - clear the error

                ; Update the array of cluster positions based on the new values of left and right
                ; Layout: [0]=left  [1]=right  [2]=centre-deviation  [3]=centre+deviation
                movss xmm7, xmm2        ; left
                addss xmm7, xmm3        ; right
                mulss xmm7, dword ptr [half]
                movss dword ptr [rdi], xmm2       ; left
                movss dword ptr [rdi+4], xmm3     ; right
                movss xmm5, xmm3        ; right
                movss xmm6, xmm7
                subss xmm5, xmm7        ; right-centre
                movss xmm4, xmm7
                movss xmm3, xmm5
                mulss xmm5, dword ptr [deviation_point]
                subss xmm6, xmm5
                addss xmm4, xmm5
                movss dword ptr [rdi+8], xmm6
                movss dword ptr [rdi+12], xmm4

                mulss xmm3, dword ptr [split_point]


                ; Calculate the current error
                mov ecx, 16
                lea rsi, [rbp-TMP_POS_ON_AXIS]
                next_builderror_loop:
                    movss xmm4, dword ptr [rsi]    ; load also zeroes the upper three lanes
                    add rsi, 4
                    movaps xmm5, xmm4
                    movaps xmm6, xmm4
                    andps xmm4, [clearsign]
                    cmpltss xmm4, xmm3        ; < division means 2
                    cmpnltss xmm5, xmm7        ; >= centre means add 1

                    movmskps eax, xmm4
                    movmskps ebx, xmm5
                    lea eax, [rbx+2*rax]        ; rax == which cluster (0..3); both masks are already zero-extended

                    subss xmm6, dword ptr [rdi+4*rax]     ; rdi = array of positions along axis of cluster
                    mulss xmm6, xmm6            ; square to use MSE and (conveniently) eliminate sign
                    addss xmm8, xmm6            ; Accumulate to the MSE

                    sub ecx, 1
                    jne next_builderror_loop


                movaps xmm3, [rbp-TMP_RIGHTSAVE]    ; restore the right XXX register space expansion

                ; Test and update the maximum error
                movss xmm5, dword ptr [rbp-TMP_MAXERROR]
                cmpltss xmm5, xmm8
                movmskps eax, xmm5
                test eax,1
                jnz refinement_done            ; previous threshold < new error: keep the previous endpoints


                ; The next pass must beat this error by at least maxerror_epsilon.
                ; The epsilon has to be applied to the accumulated error in xmm8 (the value
                ; that is stored); otherwise a block whose error never changes - e.g. one
                ; with identical pixels, where it stays 0 - would never leave this loop.
                ; NOTE(review): this relies on maxerror_epsilon being positive - confirm.
                subss xmm8, dword ptr [maxerror_epsilon]
                movss dword ptr [rbp-TMP_MAXERROR], xmm8

                movaps [rbp-TMP_OLDLEFT], xmm2      ; left    XXX register space expansion
                movaps [rbp-TMP_OLDRIGHT], xmm3     ; right XXX register space expansion

                ; step left and right in a bit
                mulps xmm2, [stepsize]    ; XXX promote to register to improve code density?
                mulps xmm3, [stepsize]

                jmp next_refinement_loop

            refinement_done:
                movaps xmm2, [rbp-TMP_OLDLEFT]      ; left
                movaps xmm3, [rbp-TMP_OLDRIGHT]     ; right
ENDIF

        ; Endpoints and axis are now valid so we have all the information we need to compress the block


        ; Calculate the high and low output colour values

            ; Involved in this is a complex rounding procedure.
            ; A straight rounded average is not correct, as the decompressor unrounds by replicating
            ; the top bits to the bottom.

            ; In order to take account of this process, we don't just apply a straight rounding correction,
            ; but base our rounding on the input value (a straight rounding is actually pretty good in terms of
            ; error measure, but creates a visual colour and/or brightness shift relative to the original image)
            ; The method used here is to apply a centre-biased rounding dependent on the input value, which was
            ; (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
            ; the image.
            ; rgb = (average_rgb + (left|right)*v_rgb);
            movaps xmm6, xmm2       ; left
            movaps xmm7, xmm3       ; right
            mulps xmm6, xmm1        ; axis
            mulps xmm7, xmm1
            addps xmm6, xmm0        ; average
            addps xmm7, xmm0

IF AXIS_MUNGE
            ; Scale the B component, then subtract the green component resultant in each
            movaps xmm4, xmm6
            movaps xmm5, xmm7
            mulps xmm6, [b_2x]        ; XXX promote to register to improve code density?
            mulps xmm7, [b_2x]
            shufps xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
            shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
            subss xmm6, xmm4
            subss xmm7, xmm5
ENDIF

            ; Rearrange so B and R are in the same register half (they both use 5-bit rounding)
            shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3)    ; B R G
            shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

            ; Convert to integer (by truncation, as C code does)
            cvttps2pi mm0, xmm6
            cvttps2pi mm1, xmm7
            movhlps xmm6, xmm6
            movhlps xmm7, xmm7
            cvttps2pi mm2, xmm6
            cvttps2pi mm3, xmm7

            ; mm0/1 is blue/red, mm2/3 is green

            ; This isn't quite the same as the C algorithm, but should generate the same result
            ; if the input range is ensured to be 0-255

            ; This code could be heavily interleaved, but for P4 it's not worth the hassle - the
            ; P4 reordering range of 15 instructions will let it do the job for us

            ; XXX promote all these consts to registers to improve code density?
            pmaxsw mm0, [clamp_0]        ; Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
            pmaxsw mm1, [clamp_0]
            pminsw mm0, [clamp_255]
            pminsw mm1, [clamp_255]
            movq mm6, mm0
            movq mm7, mm1
            paddd mm0, [blue_red_rounding]
            paddd mm1, [blue_red_rounding]
            psrld mm6, 5
            psrld mm7, 5
            psubd mm0, mm6
            psubd mm1, mm7
            ; No need to clamp here, with the input in 0-255 range it can never be outside at the end
            pand mm0, [mask_blue_red]
            pand mm1, [mask_blue_red]

            ; Separate out R and B as they will need separate shifts later
            pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3)    ; extract R    (this is an SSE, not MMX, instruction)
            pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3)    ; also R

            pmaxsw mm2, [clamp_0]
            pmaxsw mm3, [clamp_0]
            pminsw mm2, [clamp_255]
            pminsw mm3, [clamp_255]
            movq mm6, mm2
            movq mm7, mm3
            paddd mm2, [green_rounding]
            paddd mm3, [green_rounding]
            psrld mm6, 6
            psrld mm7, 6
            psubd mm2, mm6
            psubd mm3, mm7
            pand mm2, [mask_green]
            pand mm3, [mask_green]


            ; Convert the 8-bit values to final RGB565 colours in mm0 and mm1
            psrld mm0, 3
            psrld mm1, 3
            pslld mm4, 8
            pslld mm5, 8
            pslld mm2, 3
            pslld mm3, 3
            por mm0, mm4
            por mm1, mm5
            por mm0, mm2
            por mm1, mm3

            ; mm0 and mm1 are c0 and c1


            ; Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....

            ; rdx contains the destination DXTC block (dx == DXTC)

            pxor mm5, mm5
            punpcklwd mm0, mm5    ; unpack c0/c1 to DWORD's as pcmp is a signed comparison
            punpcklwd mm1, mm5
            movq mm2, mm0
            movq mm3, mm0
            movq mm4, mm0
            pcmpgtd mm2, mm1
            pxor mm2, qword ptr [invert]    ; Need less than, so flip the result
            movd dword ptr [rbp-TMP_SWAP], mm2    ; Set the swap flag (used below) appropriately
            ; mm2 is the mask to indicate flipping is needed

            pcmpeqd mm4, mm1
            movd ebx, mm4        ; ebx is the equality flag, plenty of time for this slow move to resolve

            punpcklwd mm0, mm1    ; 'normal' order
            punpcklwd mm1, mm3    ; reversed order
            pand mm1, mm2
            pandn mm2, mm0
            por mm1, mm2        ; one of the two, selected by mm2
            movd dword ptr [rdx], mm1        ; write the result


        ; Colour writes complete - do some housekeeping

            ; Clear the output bitmasks
            add rdx,4
            mov dword ptr [rdx], 0

            ; If the values are equal, the bit selector is 0 because the two colours are
            ; the same (which implies transparent)
            ; This seems the easiest way to do it, and will only rarely break branch prediction on
            ; typical images.
            test ebx, ebx
            jnz all_done

IF 0
            ; Sanity check
            movzx eax, word ptr [rdi-4]
            movzx ebx, word ptr [rdi-2]
            cmp eax, ebx
            jge fine
            int 3
            fine:
ENDIF


        ; Final clustering, creating the 2-bit values that define the output

            movaps xmm7, xmm3           ; right
            mulps xmm3, [split_point]
            addps xmm7, xmm2            ; left
            mulps xmm7, [half]            ; centre (probably 0, but what the hell)

            lea rsi, [rbp-TMP_POS_ON_AXIS]
            lea rdi, [expandtable]

            mov r8d,[rbp-TMP_SWAP]
            and r8d, 0fh                ; one swap bit for each of the four positions handled per pass

            ; Main cluster loop
            movaps xmm0,[clearsign]        ; Promote this outside the loop to improve code density
            mov ecx, 4
            next_bit_loop:    ; Do 4 at once
                movaps xmm4, [rsi]        ; Read the four pos_on_axis entries
                add rsi, 16                ; full 64-bit add: a 32-bit add would zero the top half of the pointer
                movaps xmm5, xmm4
                andps xmm4, xmm0        ; Clear the sign bits in this copy
                cmpltps xmm4, xmm3        ; < division means 2
                cmpnltps xmm5, xmm7        ; >= centre means add 1

                movmskps eax, xmm4
                movmskps ebx, xmm5
                xor ebx,r8d             ; Swap the order if we exchanged the colours
                mov al, byte ptr [rdi+rax+16]    ; table bytes 16..31 are indexed by the "< division" mask
                or al, byte ptr [rdi+rbx]        ; table bytes 0..15 are indexed by the ">= centre" mask
                mov byte ptr [rdx], al    ; rdx walks the four selector bytes at block_dxtc+4 .. +7

                add rdx, 1

                sub ecx, 1
                jne next_bit_loop

        ; Complete

        all_done:

            ; Restore changed regs and exit

            movaps xmm6, [rbp-TMP_REGSAVE+( 0*16)]
            movaps xmm7, [rbp-TMP_REGSAVE+( 1*16)]
            movaps xmm8, [rbp-TMP_REGSAVE+( 2*16)]

            mov rsp, r11                ; release the frame: back to the rsp left by the four pushes

            pop rbp
            pop rbx
            pop rdi
            pop rsi
            ret
            

DXTCV11CompressBlockSSE     ENDP

END