;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================
; Assemble with:
; ml /DX64=0 /W3 /Fo $(OutDir)\$(InputName).obj /c /coff /Cp /Zi $(InputPath)
; ml64 /DX64=1 /W3 /Fo $(OutDir)\$(InputName).obj /c /Cp /Zi $(InputPath)


; Performance / test / tuning options

WEIGHTING equ 1 ; Allows (limited) weighting of RGB components (currently 2 : 4 : 1)
UNROUNDING equ 1 ; Costs ~1%; improves MSE slightly
PROG_THRESHOLD equ 1 ; Enable to skip prog on small-signal blocks; pointless overhead if NO_PROG is set

NO_PROG equ 0 ; Note that weighting is not relevant if NO_PROG is enabled
USE_34 equ 0 ; Enable 3-colour blocks. Costs ~10% on Phenom with X64 (more like 20% on older chips with win32); helps most images little, but fixes some challenging blocks

COUNT_PROG_STEPS equ 0 ; Generates a false-colour image; black is 2 prog steps, with higher counts rising through the Spectrum palette
SHOW_BLOCK_TYPES equ 0 ; USE_34 must be enabled

; This is the dial controlling the point at which prog turns on. The tipping point is somewhere between 8 and
; 12; increases above 12 start to add noise and hit MSE noticeably, while below 8 there are few gains.
; For a typical image 8 provides about a 2.5% performance gain and 12 about 7.5%; 16 only
; adds another 2%. In the end 12 seems acceptable; the image is very similar and the shape of the noise is not
; significantly changed, while the diminishing returns above this point make going further seem unwise.
prog_threshold equ 12*32 ; X is in 9.5 format
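
; For reference, 9.5 fixed point stores a value v as v*32, so the threshold of 12
; above becomes 12*32 = 384. A trivial C sketch (illustrative only, not part of
; the build):
;
;   int to_9_5(int v)    { return v * 32; }   /* 12 -> 384 */
;   int from_9_5(int fx) { return fx / 32; }  /* truncating */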

; Increasing stepsize suffers from decreasing performance returns (due to a minimum of 2 prog steps being
; required), but decreasing it suffers from increasing inaccuracy once a certain point is crossed. Selecting
; the optimum step size is therefore an important performance/quality tradeoff; excellent results are
; possible with high performance if this is correctly tuned.
; If weighting is off, larger step sizes appear to make less difference in quality.
step equ 3*32 ; X is in 9.5 format; we want to step somewhere between 1.0 and 4.0
;step equ 2*32
;step equ 4*32
;step equ 8*32

; We could make step programmable to trade off performance and image quality


IF X64 EQ 0
.686
.MMX
.XMM
.MODEL FLAT
ENDIF


.DATA

white DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh

whiteblack DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0000000h, 0000000h, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redblack DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0000000h, 0000000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redsblack DWORD 0ff0000h, 0aa0000h, 0aa0000h, 0ff0000h
DWORD 0ff0000h, 0550000h, 0550000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redsblack_prog DWORD 0ff0000h, 0990000h, 0990000h, 0990000h
DWORD 0990000h, 0660000h, 0660000h, 0990000h
DWORD 0990000h, 0990000h, 0660000h, 0000000h
DWORD 0660000h, 0660000h, 0660000h, 0660000h

redsblack2 DWORD 0e00000h, 0e00000h, 0e00000h, 0e00000h
DWORD 0c00000h, 0c00000h, 0c00000h, 0c00000h
DWORD 0a00000h, 0a00000h, 0a00000h, 0a00000h
DWORD 0800000h, 0800000h, 0800000h, 0800000h

greenred DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 000ff00h, 000ff00h, 000ff00h, 000ff00h
DWORD 000ff00h, 000ff00h, 000ff00h, 000ff00h

bluered DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 00000ffh, 00000ffh, 00000ffh, 00000ffh
DWORD 00000ffh, 00000ffh, 00000ffh, 00000ffh

fade DWORD 0ffffffh, 0ffffffh, 0b0b0b0h, 0707070h
DWORD 0ffffffh, 0ffffffh, 0b0b0b0h, 0707070h
DWORD 0ffffffh, 0ffffffh, 0d0d0d0h, 0909090h
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0e0e0e0h

col_table DWORD 000000000h, 0001f001fh, 0f800f800h, 0f81ff81fh
DWORD 007e007e0h, 007ff07ffh, 0ffe0ffe0h, 0ffffffffh
DWORD 084108410h

ALIGN 16

testval equ 0400h
test_val WORD testval, testval, testval, testval, testval, testval, testval, testval

one_third equ 65536/3


AVG_FRAC_BITS equ 4 ; 0-4

mask_rgb QWORD 000ffffff00ffffffh, 000ffffff00ffffffh
mask_rg QWORD 000ffff0000ffff00h, 000ffff0000ffff00h
mask_rb QWORD 000ff00ff00ff00ffh, 000ff00ff00ff00ffh
mask_gb QWORD 00000ffff0000ffffh, 00000ffff0000ffffh
mask_r QWORD 000ff000000ff0000h, 000ff000000ff0000h
mask_b QWORD 0000000ff000000ffh, 0000000ff000000ffh
mask_low_qword QWORD 0ffffffffffffffffh, 0
mask_low_dword QWORD 000000000ffffffffh, 0
mask_low_word QWORD 0000000000000ffffh, 0
mask_third_word QWORD 00000ffff00000000h, 0
dword_word_mask QWORD 00000ffff0000ffffh, 00000ffff0000ffffh
quad_word_mask QWORD 0000000000000ffffh, 0000000000000ffffh
quad_dword_mask QWORD 000000000ffffffffh, 000000000ffffffffh
quad_upper_dword_mask QWORD 0ffffffff00000000h, 0ffffffff00000000h
max_sint32 DWORD 07fffffffh, 07fffffffh, 07fffffffh, 07fffffffh

scale_one_third WORD one_third,one_third,one_third,one_third,one_third,one_third,one_third,one_third
stepsize WORD step,step,step,step,step,step,step,step
prog_threshold_size WORD prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold

IF WEIGHTING
WEIGHTING_BITS equ 4 ; If this equals AVG_FRAC_BITS a few extra optimisations kick in
weighting WORD 4,16,8,0, 4,16,8,0
unweighting WORD 4,1,2,0
round_565_weighted WORD 0010h, 0020h, 0020h, 0
ELSE
WEIGHTING_BITS equ 0
ENDIF

IF UNROUNDING
round_565 WORD 0040h,0020h,0040h,0 ; This is applied to the average, which is in 8.4 format, so it's 0.5 in 5.7, 6.6, 5.7 format
scale_to_round WORD 2048,1024,2048,0 ; This is a 5 6 5 right shift in 8.4 format
round_mask WORD 0fff0h, 0fff0h, 0fff0h, 0
ENDIF


scale_8_4_to_565 WORD 32*16, 64*16, 32*16, 0
clamp_565 WORD 31,63,31,0

_0000000055555555 QWORD 00000000055555555h

_0707070707070707 QWORD 00707070707070707h
_0f0f0f0f0f0f0f0f QWORD 00f0f0f0f0f0f0f0fh
_000f000f000f000f QWORD 0000f000f000f000fh
_00f000f000f000f0 QWORD 000f000f000f000f0h

ALIGN 16

; The axis ordering table. This is a table of XMMWORD 4-float values that set the axis signs
; using the index made up of the RB,GB,GR negative correlation and zero signal bits
; In addition, the scaling factor to convert from -1,1 range to 1.15 fixed point is baked in
scale_1_15 equ 32768.0

AXIS_ORDER_ENTRY MACRO rb_neg, gb_zero, gb_neg, gr_zero, gr_neg
IF (gr_zero NE 0) AND (gb_zero NE 0)
; use rb on B
IF (rb_neg NE 0)
REAL4 -scale_1_15, scale_1_15, scale_1_15, 0.0
ELSE
REAL4 scale_1_15, scale_1_15, scale_1_15, 0.0
ENDIF
ELSE
; use gb_pos on B and gr_pos on R
IF (gb_neg NE 0)
REAL4 -scale_1_15, scale_1_15
ELSE
REAL4 scale_1_15, scale_1_15
ENDIF
IF (gr_neg NE 0)
REAL4 -scale_1_15, 0.0
ELSE
REAL4 scale_1_15, 0.0
ENDIF
ENDIF
ENDM


axis_order_table LABEL XMMWORD
axis_order_count = 0
WHILE (axis_order_count LT 32)
AXIS_ORDER_ENTRY (axis_order_count AND 1), (axis_order_count AND 2), (axis_order_count AND 4), (axis_order_count AND 8), (axis_order_count AND 010h)
axis_order_count = axis_order_count + 1
ENDM
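
; For reference, a C sketch (illustrative only, not part of the build) of how the
; 5-bit index into this 32-entry table is formed and used; the bit assignments
; match the AXIS_ORDER_ENTRY arguments above and the comment in AXIS_3COMPONENT:
;
;   /* each entry holds four +/-32768.0 multipliers in B G R 0 order */
;   extern float axis_order[32][4];
;
;   int index = (rb_neg  << 0) | (gb_zero << 1) | (gb_neg << 2)
;             | (gr_zero << 3) | (gr_neg  << 4);
;   for (int c = 0; c < 4; c++)
;       axis_1_15[c] = axis_f[c] * axis_order[index][c]; /* sign + 1.15 scale */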


; A few helper macros to make shift code cleaner
AVG_SHIFT_BITS equ WEIGHTING_BITS + (4-AVG_FRAC_BITS)
IF AVG_SHIFT_BITS
avg_round_val equ 1 SHL (AVG_SHIFT_BITS-1) ; 0.5 correction
avg_round WORD avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val
ENDIF

INPUT_SHIFT_BITS equ WEIGHTING_BITS - AVG_FRAC_BITS ; input has been left shifted by weight, needs to be left shifted by frac

INPUT_SHIFT MACRO reg
IF INPUT_SHIFT_BITS GT 0
psrlw reg,INPUT_SHIFT_BITS
ELSEIF INPUT_SHIFT_BITS LT 0
psllw reg,-INPUT_SHIFT_BITS
ENDIF
ENDM


; Macro to generate the correct value for use in the various shuffle ops
SHUFFLE_SELECT MACRO a,b,c,d
LOCAL Value
IF (a gt 3) or (b gt 3) or (c gt 3) or (d gt 3)
.ERR
EXITM <0>
ENDIF
Value = ((a) OR (b SHL 2) OR (c SHL 4) OR (d SHL 6))
EXITM %Value
ENDM
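
; The C equivalent, for reference (a sketch, not part of the build); the value is
; the 8-bit immediate consumed by pshufd/pshuflw/pshufhw/shufps:
;
;   #define SHUFFLE_SELECT(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
;
;   /* e.g. SHUFFLE_SELECT(2,3,0,1) == 4Eh, which swaps the two qword halves */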


.CODE

; The DXTC compressor is a six-step process:
; 1. AVERAGE: find the average value of the block
; 2. AXIS: compute the compression axis
; 3. POS: calculate the position on the axis of each data value in the block
; 4. PROG: progressively refine the selected endpoints to find an error minimum
; 5. COLOUR: generate output colours from the endpoints, axis and average
; 6. CLUSTER: cluster the data values into the appropriate clusters

; Each of these is implemented by macros to allow the composition of different components
; to form a particular block compressor; this also eases portability to x64. For example,
; changing AVERAGE would allow different structured input, and changing AVERAGE, AXIS and POS
; allows e.g. 2D vs. 3D block types.

; Macros make the code harder to debug, because MASM generates the error messages with line
; numbers corresponding to the invocation of the macro rather than inside the macro. Enabling
; /W3 helps; it then shows the line offset inside the macro. Short macros are therefore a
; lot easier to debug than long ones.

; CLUSTER has been subsumed into pos, prog and colour in this implementation
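
; For orientation, a rough scalar sketch of the flow (assumed C, illustrative
; only; the names are hypothetical and this is not part of the build):
;
;   void compress_block(const unsigned char src[16][4], unsigned int out[2])
;   {
;       short avg[4], axis[4], pos[16];
;       average(src, avg);                /* 1. AVERAGE                    */
;       compute_axis(src, avg, axis);     /* 2. AXIS                       */
;       project(src, avg, axis, pos);     /* 3. POS                        */
;       refine_endpoints(pos);            /* 4. PROG                       */
;       emit_colours(avg, axis, out);     /* 5. COLOUR                     */
;       /* 6. CLUSTER is folded into POS, PROG and COLOUR as noted above   */
;   }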

; Temporaries
TMP_TMP equ (8*16) ; Basic tmp area - 8 XMMWORD each contains two WORD RGB0 source values
TMP_AVG equ TMP_TMP + 16
TMP_AXIS equ TMP_AVG + 16
TMP_POS equ TMP_AXIS + (2*16)
TMP_CLUSTER equ TMP_POS + 16 ; 2 dwords; centre and split
TMP_BEST equ TMP_CLUSTER + 16
TMP_CENTRE equ TMP_BEST + 16
TMP_CURRENT equ TMP_CENTRE + 16


; Prologue / epilog / register management
IF X64
; We can use volatile registers for some of the scratch regs we need so we don't need to save them
rfx equ r8
efx equ r8d
tmp_base_reg equ rbp ; Tried r9 but faster with rbp - REX prefix overhead perhaps?
source_reg equ rcx
dest_reg equ rdx
stride_reg equ rax
egx equ r10d
ehx equ r11d

main_proc_name equ DXTCV11CompressBlockSSE2

SAVED_REGS equ 8
TMP_REGSAVE equ TMP_CENTRE + (SAVED_REGS*16)


SAVE_REG MACRO reg, n
IF n LE SAVED_REGS
movdqa [tmp_base_reg-TMP_REGSAVE+(n*16)], reg
ENDIF
ENDM

RESTORE_REG MACRO reg, n
IF n LE SAVED_REGS
movdqa reg, [tmp_base_reg-TMP_REGSAVE+(n*16)]
ENDIF
ENDM

SAVE_REGS MACRO
; push rsi
; push rdi
; push rbx
push rbp

; Set up a 16-byte aligned storage space pointer
mov tmp_base_reg, rsp
and tmp_base_reg, NOT 0fh

; Any xmm regs over 5 need to be saved here as well
SAVE_REG xmm6,1
SAVE_REG xmm7,2
SAVE_REG xmm8,3
SAVE_REG xmm9,4
SAVE_REG xmm10,5
SAVE_REG xmm11,6
SAVE_REG xmm12,7
SAVE_REG xmm13,8
SAVE_REG xmm14,9
SAVE_REG xmm15,10
ENDM

RESTORE_REGS MACRO
RESTORE_REG xmm6,1
RESTORE_REG xmm7,2
RESTORE_REG xmm8,3
RESTORE_REG xmm9,4
RESTORE_REG xmm10,5
RESTORE_REG xmm11,6
RESTORE_REG xmm12,7
RESTORE_REG xmm13,8
RESTORE_REG xmm14,9
RESTORE_REG xmm15,10

pop rbp
; pop rbx
; pop rdi
; pop rsi
ENDM

ELSE ; X64

; We don't have spare volatile regs
efx equ ebx
tmp_base_reg equ ebp
source_reg equ ecx
dest_reg equ edx
stride_reg equ eax
IF USE_34
egx equ esi
ehx equ edi
ENDIF

main_proc_name equ _DXTCV11CompressBlockSSE2

SAVE_REGS MACRO
IF USE_34
push esi
push edi
ENDIF

push ebx
push ebp

; Set up the tmp pointer aligned on a 16-byte boundary
mov ebp, esp
and ebp, NOT 0fh
ENDM

RESTORE_REGS MACRO
pop ebp
pop ebx
IF USE_34
pop edi
pop esi
ENDIF

emms
ENDM

ENDIF ; X64


COLOURSPACE_TRANSFORM_SETUP MACRO
IF WEIGHTING
movdqa xmm5, xmmword ptr [weighting]
ELSE
movdqa xmm5, xmmword ptr [mask_rgb]
ENDIF
ENDM

COLOURSPACE_TRANSFORM MACRO reg1, reg2
IF WEIGHTING
; Weighting doesn't need masking - the weighting mul clears out the A component
ELSE
pand reg1,xmm5
pand reg2,xmm5
ENDIF
ENDM


AVERAGE_RGB MACRO src_reg, stride_reg

COLOURSPACE_TRANSFORM_SETUP

; Read source data - use DQU since we can't be certain it's aligned; the
; overhead is not important given the size of the rest of the calculation
movdqu xmm0,[src_reg]
movdqu xmm2,[src_reg+stride_reg]
pxor xmm7,xmm7

COLOURSPACE_TRANSFORM xmm0,xmm2

movdqa xmm1,xmm0
movdqa xmm3,xmm2
punpcklbw xmm0,xmm7 ; expand to two 0RGB values in each XMM, 8 XMMs total
punpckhbw xmm1,xmm7
punpcklbw xmm2,xmm7
punpckhbw xmm3,xmm7

IF WEIGHTING
pmullw xmm0,xmm5
pmullw xmm1,xmm5
pmullw xmm2,xmm5
pmullw xmm3,xmm5
ENDIF

; Write unpacked values (8 bits) to the scratch buffer: they're needed
; again in the axis calculation and we don't have the registers to keep
; them around (we probably can on x64)
movdqa [tmp_base_reg-TMP_TMP],xmm0
movdqa [tmp_base_reg-TMP_TMP+16],xmm1
paddw xmm0,xmm1 ; Start accumulating the result
movdqa [tmp_base_reg-TMP_TMP+32],xmm2
movdqa [tmp_base_reg-TMP_TMP+48],xmm3
paddw xmm2,xmm3

lea src_reg,[src_reg+2*stride_reg]
paddw xmm0,xmm2

movdqu xmm2,[src_reg]
movdqu xmm4,[src_reg+stride_reg]

COLOURSPACE_TRANSFORM xmm2,xmm4

IF X64
movdqa xmm6,xmm2
movdqa xmm9,xmm4
punpcklbw xmm2,xmm7
punpckhbw xmm6,xmm7
punpcklbw xmm4,xmm7
punpckhbw xmm9,xmm7
IF WEIGHTING
pmullw xmm2,xmm5
pmullw xmm6,xmm5
pmullw xmm4,xmm5
pmullw xmm9,xmm5
ENDIF

; 7, 6, 8 and 9 are the inputs to the axis comp (so no need to write to the scratchpad)
movdqa xmm7,xmm2
movdqa xmm8,xmm4
paddw xmm2,xmm6
paddw xmm4,xmm9
ELSE ; X64
movdqa xmm1,xmm2
movdqa xmm6,xmm4
punpcklbw xmm2,xmm7
punpckhbw xmm1,xmm7
punpcklbw xmm4,xmm7
punpckhbw xmm6,xmm7
IF WEIGHTING
pmullw xmm2,xmm5
pmullw xmm1,xmm5
pmullw xmm4,xmm5
pmullw xmm6,xmm5
ENDIF

; 7 and 6 are the inputs to the axis comp (so no need to write to the scratchpad)
movdqa [tmp_base_reg-TMP_TMP+64],xmm2
movdqa [tmp_base_reg-TMP_TMP+80],xmm1
paddw xmm2,xmm1
movdqa xmm7,xmm4
paddw xmm4,xmm6
ENDIF ; X64

paddw xmm0,xmm2

paddw xmm0,xmm4 ; xmm0 has 8 value sums in each qword

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; swap so result is in both qwords
paddw xmm0,xmm1

; Convert the average to have the correct fractional bit count
; Some combination of real and virtual shifts amounts to a divide by 16 in total to generate the average, so
; xmm0 now has the average in low 4 words 0,R,G,B in 8.0 to 8.4 fixed point format
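;
; In scalar terms (a C sketch, illustrative only): with WEIGHTING_BITS == w and
; AVG_FRAC_BITS == f, each summed channel now holds avg << (4 + w), so the
; rounded shift below is
;
;   avg_8_f = (sum + (1 << (w + 3 - f))) >> (w + 4 - f);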

IF AVG_SHIFT_BITS
paddw xmm0,[avg_round]
psrlw xmm0,AVG_SHIFT_BITS
ENDIF
movdqa [tmp_base_reg-TMP_AVG],xmm0

ENDM ; AVERAGE_RGB


IF X64

; For X64, the ability to 4-way interleave is slightly faster even taking into account the (significant)
; overhead of saving the registers (each register saved is more than 1 MB/s off the rate)
AXIS_3C_I_64 MACRO offset, first, last
IF first
pxor xmm2,xmm2
ELSE
movdqa xmm7,[tmp_base_reg-offset] ; fetch two RGBs
movdqa xmm6,[tmp_base_reg-offset+16]
movdqa xmm8,[tmp_base_reg-offset+32]
movdqa xmm9,[tmp_base_reg-offset+48]
ENDIF
INPUT_SHIFT xmm7 ; Convert to 8.x fixed point. Note we don't need bit replication (avg of 16 255s is 255.0)
INPUT_SHIFT xmm6 ; This is a nop if the weight and avg fractional bits match
INPUT_SHIFT xmm8
INPUT_SHIFT xmm9
pxor xmm3,xmm3
movdqa xmm1,xmm3 ; move is cheaper than repeated xors
movdqa xmm10,xmm3
movdqa xmm11,xmm3
psubw xmm7,xmm0 ; subtract avg
psubw xmm6,xmm0
psubw xmm8,xmm0
psubw xmm9,xmm0
; IF last EQ 0 ; Oddly this appears to be slower on X64?
movdqa [tmp_base_reg-offset],xmm7 ; write RGB-avg back as it will be reused later
movdqa [tmp_base_reg-offset+16],xmm6
; ENDIF
movdqa [tmp_base_reg-offset+32],xmm8
movdqa [tmp_base_reg-offset+48],xmm9
pshuflw xmm5,xmm7,SHUFFLE_SELECT(2,0,1,3) ; R B G 0; lines up with B G R 0 to produce RB / BG / GR axis ordering info
pshuflw xmm4,xmm6,SHUFFLE_SELECT(2,0,1,3)
pshuflw xmm12,xmm8,SHUFFLE_SELECT(2,0,1,3)
pshuflw xmm13,xmm9,SHUFFLE_SELECT(2,0,1,3)
psubw xmm3,xmm7 ; -axis
psubw xmm1,xmm6
psubw xmm10,xmm8
psubw xmm11,xmm9
pshufhw xmm5,xmm5,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm4,xmm4,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm12,xmm12,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm13,xmm13,SHUFFLE_SELECT(2,0,1,3)
pmaxsw xmm3,xmm7 ; abs(axis)
pmaxsw xmm1,xmm6
pmaxsw xmm10,xmm8
pmaxsw xmm11,xmm9
psraw xmm5,16 ; state of sign bit
psraw xmm4,16
psraw xmm12,16
psraw xmm13,16
paddw xmm1,xmm3 ; accumulate axis
paddw xmm10,xmm11
paddw xmm1,xmm10
IF first EQ 0
paddw xmm1,[tmp_base_reg-TMP_AXIS]
ENDIF
pandn xmm5,xmm7
pandn xmm4,xmm6
pandn xmm12,xmm8
pandn xmm13,xmm9
IF last EQ 0
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDIF
paddw xmm5,xmm4
paddw xmm12,xmm13

paddw xmm2,xmm5 ; accumulate order
paddw xmm2,xmm12
ENDM ; AXIS_3C_I_64

ELSE ; X64

AXIS_3C_I MACRO offset, first, last
IF first
pxor xmm2,xmm2
ELSE
movdqa xmm7,[tmp_base_reg-offset] ; fetch two RGBs
movdqa xmm6,[tmp_base_reg-offset+16]
ENDIF
INPUT_SHIFT xmm7 ; Convert to 8.x fixed point. Note we don't need bit replication (avg of 16 255s is 255.0)
INPUT_SHIFT xmm6 ; This is a nop if the weight and avg fractional bits match
pxor xmm3,xmm3
pxor xmm1,xmm1
psubw xmm7,xmm0 ; subtract avg
psubw xmm6,xmm0
IF last EQ 0
movdqa [tmp_base_reg-offset],xmm7 ; write RGB-avg back as it will be reused later
movdqa [tmp_base_reg-offset+16],xmm6
ENDIF
pshuflw xmm5,xmm7,SHUFFLE_SELECT(2,0,1,3) ; R B G 0; lines up with B G R 0 to produce RB / BG / GR axis ordering info
pshuflw xmm4,xmm6,SHUFFLE_SELECT(2,0,1,3)
psubw xmm3,xmm7 ; -axis
psubw xmm1,xmm6
pshufhw xmm5,xmm5,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm4,xmm4,SHUFFLE_SELECT(2,0,1,3)
pmaxsw xmm3,xmm7 ; abs(axis)
pmaxsw xmm1,xmm6
psraw xmm5,16 ; state of sign bit
psraw xmm4,16
paddw xmm1,xmm3 ; accumulate axis
IF first EQ 0
paddw xmm1,[tmp_base_reg-TMP_AXIS]
ENDIF
pandn xmm5,xmm7
pandn xmm4,xmm6
IF last EQ 0
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDIF
paddw xmm2,xmm5 ; accumulate order
paddw xmm2,xmm4
ENDM ; AXIS_3C_I

ENDIF


; Calculate the axis vector
AXIS_3COMPONENT MACRO no_axis
; Expects: TMP_TMP to have expanded RGB values
; xmm0 is the average in the form R G B 0 R G B 0
; Is expected to: set TMP_TMP to RGB-avg
; set xmm1 to the axis (also stored in TMP_AXIS)

; G is the priority axis and the axis ordering info is set accordingly. The method used
; is far from perfect, but suffices for most cases.
IF X64
AXIS_3C_I_64 TMP_TMP+64, 1, 0
AXIS_3C_I_64 TMP_TMP , 0, 1
ELSE
AXIS_3C_I TMP_TMP+96, 1, 0
AXIS_3C_I TMP_TMP+32, 0, 0
AXIS_3C_I TMP_TMP+64, 0, 0
AXIS_3C_I TMP_TMP , 0, 1
ENDIF

; parallel add xmm1 and xmm2 to get final absolute axis info
pshufd xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1)
pshufd xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1)
pxor xmm5,xmm5
paddw xmm1,xmm3 ; Final summed absolute axis
paddw xmm2,xmm4 ; Final summed pos
movdqa xmm0,xmm5

; axis is in 8.8 fixed point - it needs normalisation and ordering (the signs set correctly)
; By default, G is the major axis, and BG and RG pos define the orders of the B and R axes
; If G isn't the major axis (i.e. it is 0) R becomes the major axis and RB pos defines the
; order of the B axis. If RB_pos is also 0, B is the major axis and is positive.

; If the axis is 0 we have a constant-colour block - we must catch this here (division by
; zero in normalisation otherwise)

pcmpeqw xmm5,xmm1
pmovmskb eax,xmm2 ; eax is the sign-flip bitvector (axis_neg)
pcmpeqw xmm2,xmm0

pmovmskb ecx,xmm5 ; ecx is the axis equals 0 bitvector
pmovmskb efx,xmm2 ; efx is the order equals 0 bitvector (axis_order_zero)

; Finish up the no-axis check.
and ecx,02ah
cmp ecx,02ah
je no_axis

; We need to normalise the axis below, and negation is easier in float, so do it there
punpcklwd xmm1,xmm0 ; The axis is always positive so we can unpack with 0

; Axis ordering: we have put two bitvectors into eax and efx.
; The bits are 1 RB, 3 GB, 5 GR - 0,2,4,6+ are junk, so we mask them off
; 5 values in here affect the axis ordering:
; If GR and GB == 0, then we need to apply RB's sign to B
; Otherwise, we apply GR to R and GB to B
; See the AXIS_ORDER_ENTRY macro for how we create the 512-byte table of negation bits
and eax,02ah
and efx,028h ; We don't need rb of axis_zero


; Promote the axis to float
cvtdq2ps xmm2,xmm1
xorps xmm5,xmm5

; We can do the normalisation and set the axis signs in parallel
movaps xmm1,xmm2
mulps xmm2,xmm2

shl eax,3 ; x8, with empty lower bit, is a 16-byte aligned pointer to the 32 entry table
shl efx,2 ; One shift less to slot into the gaps

; 3D parallel add
movaps xmm4,xmm2
movhlps xmm0,xmm2 ; note that xmm0 is still 0 from the pxor above which cleans things up a little
shufps xmm2,xmm2,SHUFFLE_SELECT(1,1,1,1)
addss xmm4,xmm0
or eax,efx ; bits low to high: 0,0,0,0, rb_neg, gb_zero, gb_neg, gr_zero, gr_neg
addss xmm2,xmm4
IF X64
lea rfx,axis_order_table
ENDIF

; low of xmm2 is the DP result
; We know that this cannot be 0 in the int implementation
; - the axis was known to be nonzero on at least one component
; - this is known to be representable exactly in float as it's less than 24 bits in magnitude
; - the square cannot be small, exponent is positive and it also gives positive results in each component
; - they cannot therefore sum to 0
rsqrtss xmm2, xmm2 ; No need for Newton-Raphson, ~15 bits of precision is fine

; Apply the axis ordering; we also need the result in the correct 1.15 format when we
; send it back, so this scaling factor is baked into the axis order table
IF X64
mulps xmm1, [rfx + rax]
ELSE
mulps xmm1, [axis_order_table + eax]
ENDIF

shufps xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0)

; Normalise, apply axis order, and scale to 1.15
mulps xmm2, xmm1

; Get it back into int
cvtps2dq xmm1,xmm2
packssdw xmm1,xmm1 ; This duplicates as well, which is what we want
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDM ; AXIS_3COMPONENT



POS_3C_8_VALUES MACRO offset, first
movdqa xmm0,xmm1 ; Copy across the axis. It's faster to copy then multiply than to load the axis each time; one reason
movdqa xmm2,xmm1 ; is likely that the maddwd issues at 1/clock at best while movdqa can
movdqa xmm3,xmm1 ; parallelise up to 3 per clock, so there's more breathing room for the loads

IF first AND (X64 EQ 0) ; This optimisation appears to be oddly slower on X64
pmaddwd xmm0,xmm7
pmaddwd xmm1,xmm6
ELSE
pmaddwd xmm0,[tmp_base_reg-offset]
pmaddwd xmm1,[tmp_base_reg-offset+16]
ENDIF
pmaddwd xmm2,[tmp_base_reg-offset+32]
pmaddwd xmm3,[tmp_base_reg-offset+48]

movdqa xmm4,xmm0 ; We could do this with shift or pshufd. Shift has lower code density
movdqa xmm5,xmm1 ; but is fast on more CPUs, and pshufd shows little if any gain
movdqa xmm6,xmm2
movdqa xmm7,xmm3
psllq xmm0,32
psllq xmm1,32
psllq xmm2,32
psllq xmm3,32

paddd xmm0,xmm4 ; We now have 2 results in dwords 1 and 3 of the XMM. This is important below...
paddd xmm1,xmm5 ; The result is in 9.(15+AVG_FRAC_BITS) format
paddd xmm2,xmm6
paddd xmm3,xmm7
psrad xmm0,10+AVG_FRAC_BITS ; Scale to the desired 9.5 format
psrad xmm1,10+AVG_FRAC_BITS ; (9.5 is convenient because it becomes 8.4 when multiplied by axis)
psrad xmm2,10+AVG_FRAC_BITS
psrad xmm3,10+AVG_FRAC_BITS
packssdw xmm0,xmm1 ; Pack so we now have 4 word results in words 1, 3, 5 and 7
packssdw xmm2,xmm3

IF first
movdqa xmm1, [tmp_base_reg-TMP_AXIS] ; Reload the axis we corrupted above
ENDIF

psrad xmm0,16 ; Because we cunningly arranged the results to be in the high word of
psrad xmm2,16 ; each dword, a sign-preserving shift puts it right for the final pack

packssdw xmm0,xmm2 ; 8 results in xmm0
ENDM ; POS_3C_8_VALUES
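
; Per texel, the above computes (a scalar C sketch, illustrative only):
;
;   /* d = P - avg in 8.AVG_FRAC_BITS, axis in 1.15; pmaddwd forms R+G and B+0
;      partial products, and the qword shift/add merges them into the full dot */
;   int   dot = d.r * axis.r + d.g * axis.g + d.b * axis.b; /* 9.(15+AVG_FRAC_BITS) */
;   short pos = (short)(dot >> (10 + AVG_FRAC_BITS));       /* 9.5 result           */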



; These should be reusable whatever the axis and average format are
POS_MINMAX MACRO
; using the xmm0 and xmm2 results we calculate a final minmax
movdqa xmm3,xmm2
pmaxsw xmm2,xmm0
pminsw xmm3,xmm0
pshufd xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1) ; dword halves
pshufd xmm5,xmm3,SHUFFLE_SELECT(2,3,0,1)
pmaxsw xmm2,xmm4
pminsw xmm3,xmm5
pshuflw xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1) ; word halves
pshuflw xmm5,xmm3,SHUFFLE_SELECT(2,3,0,1)
pmaxsw xmm2,xmm4 ; reduce max
pminsw xmm3,xmm5 ; reduce min
pshuflw xmm4,xmm2,SHUFFLE_SELECT(1,0,1,0) ; word quarters
pshuflw xmm5,xmm3,SHUFFLE_SELECT(1,0,1,0)
pmaxsw xmm2,xmm4 ; Final max
pminsw xmm3,xmm5 ; Final min

; if min == max == 0 then we have a single-colour block. This shouldn't be able to
; happen, as these should be caught when there is no axis instead.
ENDM
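
; The shuffles above are just a log2 reduction; in scalar terms (a C sketch,
; illustrative only):
;
;   short mx = pos[0], mn = pos[0];
;   for (int i = 1; i < 16; i++) {
;       if (pos[i] > mx) mx = pos[i];
;       if (pos[i] < mn) mn = pos[i];
;   }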

POS_CENTRE MACRO
; Calculate centre
; Centre = (A+B)/2 (and then min = -max)

; Replicate low and high halves; note shuffles above leave halves already the same
pshufd xmm7,xmm2,SHUFFLE_SELECT(0,0,0,0) ; max
pshufd xmm2,xmm3,SHUFFLE_SELECT(0,0,0,0)

paddw xmm2,xmm7
IF PROG_THRESHOLD
movdqa xmm4, xmmword ptr [prog_threshold_size]
ENDIF
psraw xmm2,1 ; xmm2 = offset; arithmetic shift to preserve sign

psubw xmm7,xmm2 ; offset max to new centre

; save offset centre (9.5 format) for later correction of refined X
movdqa [tmp_base_reg-TMP_CENTRE], xmm2
ENDM

POS_OFFSET MACRO reread
; Offset points to centre, forming array of abs(P-centre) and centre-side cluster bits
IF reread
movdqa xmm0,[tmp_base_reg-TMP_POS]
movdqa xmm1,[tmp_base_reg-TMP_POS+16]
ENDIF
IF PROG_THRESHOLD
pcmpgtw xmm4,xmm7
ENDIF
psubw xmm0,xmm2
psubw xmm1,xmm2
pxor xmm2,xmm2
pxor xmm3,xmm3
pmovmskb eax,xmm0 ; Save clustering
pmovmskb efx,xmm1
psubw xmm2,xmm0
psubw xmm3,xmm1
shl efx,15
shr eax,1
or eax,efx
and eax,055555555h
mov [tmp_base_reg-TMP_CLUSTER],eax

pmaxsw xmm0,xmm2 ; abs(P-centre)
pmaxsw xmm1,xmm3
IF NO_PROG EQ 0
movdqa [tmp_base_reg-TMP_POS],xmm0
movdqa [tmp_base_reg-TMP_POS+16],xmm1
ENDIF
IF PROG_THRESHOLD
pmovmskb ecx,xmm4
ENDIF

ENDM



POS_3COMPONENT MACRO no_prog
; Expects: TMP_TMP to have expanded RGB-average values
; xmm1 == axis in form R G B 0 R G B 0
; Is expected to: put abs(P-centre) in TMP_POS
; put 'centre side' info in TMP_CLUSTER_CENTRE

; We need to generate 16 values, but we don't have enough XMMs, so we do two halves
POS_3C_8_VALUES TMP_TMP, 1
movdqa [tmp_base_reg-TMP_POS],xmm0
POS_3C_8_VALUES (TMP_TMP-64), 0
movdqa [tmp_base_reg-TMP_POS+16],xmm0

movdqa xmm2,[tmp_base_reg-TMP_POS]
POS_MINMAX
POS_CENTRE
POS_OFFSET 1

ENDM ; POS_3COMPONENT


NOPROG_CLUSTER MACRO
movdqa xmm6,xmm7
paddw xmm6,xmm6
pmulhw xmm6, xmmword ptr [scale_one_third]

psubw xmm1,xmm6
psubw xmm0,xmm6 ; abs(P-centre) - 2/3x

pmovmskb efx,xmm1
pmovmskb eax,xmm0 ; 4-block clustering

; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDM


; Attempt a (simple) progressive refinement step to reduce noise in the
; output image by trying to find a better overall match for the endpoints
; than the first-guess solution (the extremities of the input signal)

; The method is to move the endpoints inwards until a local MSE minimum is found.
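
; In scalar terms the loop below is roughly (a C sketch, illustrative only;
; the SIMD code also extracts the clustering bits from the same subtractions):
;
;   int best_err = INT_MAX, x = max_pos, best_x = x;   /* all 9.5 fixed point */
;   for (;;) {
;       int err = 0;
;       for (int i = 0; i < 16; i++) {
;           /* squared distance from p[i] = abs(P - centre) to the nearer of
;              the two palette points x/3 and x on this side of the centre   */
;           int e = abs(p[i] - 2*x/3) - x/3;
;           err += e * e;
;       }
;       if (err >= best_err) break;                    /* local minimum found */
;       best_err = err; best_x = x;
;       x -= step;
;   }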

PROG MACRO
LOCAL next_refinement_loop
LOCAL refinement_done
LOCAL no_prog
LOCAL doprog
LOCAL go_around

; Expects: xmm7 (all words) is the initial max value
; TMP_POS has been set up with the array of 16 words

IF COUNT_PROG_STEPS
xor eax,eax
ENDIF

IF PROG_THRESHOLD
; If we're below the prog threshold, we can use the no-refinement clustering
; (which doesn't have to calculate MSE)
test ecx,2
je doprog
NOPROG_CLUSTER
jmp no_prog
doprog:
ENDIF

movdqa [tmp_base_reg-TMP_BEST],xmm7
movq mm1,mmword ptr [max_sint32] ; Initialise max error; scalar, leverage the MMX unit

next_refinement_loop:
movdqa xmm6,xmm7 ; Save the current X (since we corrupt it)

pmulhw xmm7, xmmword ptr [scale_one_third]

; Calculate E4 (4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)
pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

; jmp refinement_done

psubw xmm2,xmm0
psubw xmm3,xmm1
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)

psubw xmm0,xmm7 ; abs(abs(P-centre)-2/3x) - 1/3x
psubw xmm1,xmm7

pmaddwd xmm0,xmm0 ; 4 mean-square-error values, doing part of the parallel add
pmaddwd xmm1,xmm1

paddd xmm0,xmm1

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; This is a big cost
paddd xmm0,xmm1

; Move to MMX for this last bit; faster on all but the most recent CPUs
movdq2q mm0,xmm0
movdq2q mm2,xmm0
psrlq mm0,32
paddd mm0,mm2

; Compare E4 with current minimum error and choose result
pcmpgtd mm1,mm0

pmovmskb ecx,mm1

; Pause here while that result becomes available, so read these up for go-around
movdqa xmm0,[tmp_base_reg-TMP_POS] ; Read up the source values of abs(P-centre)
movdqa xmm1,[tmp_base_reg-TMP_POS+16]

test ecx,8
jz refinement_done

movq mm1,mm0 ; Save the better max error

; Go around
go_around:
movdqa xmm7,xmm6
movdqa [tmp_base_reg-TMP_BEST],xmm6

IF COUNT_PROG_STEPS
add eax,1
ELSE
; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDIF

; Fixed step size, tunable
psubw xmm7,[stepsize]

; Used to check for X going negative here, but stopped that because I think it's
; impossible for the error still to be improving by that point, and the
; movmsk/test combination is expensive.
jmp next_refinement_loop


refinement_done:
movdqa xmm7,[tmp_base_reg-TMP_BEST]
no_prog:

ENDM ; PROG


PROG_34 MACRO
LOCAL next_refinement_loop
LOCAL refinement_done
LOCAL no_prog
LOCAL doprog
LOCAL go_around
LOCAL e3_best
LOCAL e4_good

; Expects: xmm7 (all words) is the initial max value
; TMP_POS has been set up with the array of 16 words

IF COUNT_PROG_STEPS
xor eax,eax
ENDIF

IF PROG_THRESHOLD
; If we're below the prog threshold, we can use the no-refinement clustering
; XXX - this means that all threshold blocks get 4-colour clustering
test ecx,2
je doprog
NOPROG_CLUSTER
xor ecx,ecx ; 4-colour block
jmp no_prog
doprog:
ENDIF

movdqa [tmp_base_reg-TMP_BEST],xmm7
movq mm1,mmword ptr [max_sint32] ; Initialise max error; scalar, leverage the MMX unit

next_refinement_loop:
movdqa [tmp_base_reg-TMP_CURRENT],xmm7
movdqa xmm6,xmm7
pmulhw xmm7, xmmword ptr [scale_one_third]
psrlw xmm6,1 ; 1/2 x


; Calculate E3 and E4 (3 and 4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)

movdqa xmm4,xmm0
movdqa xmm5,xmm1

IF X64
; This X64 path is a surprisingly marginal gain, probably because we can get decent
; loop-to-loop parallelism here
pxor xmm2,xmm2
movdqa xmm3,xmm2
movdqa xmm8,xmm2
movdqa xmm9,xmm2
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm4,xmm6 ; abs(P-centre) - 1/2x
psubw xmm5,xmm6
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb egx,xmm4 ; 3-block clustering
pmovmskb ehx,xmm5
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

psubw xmm2,xmm0
psubw xmm3,xmm1
psubw xmm8,xmm4
psubw xmm9,xmm5
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)
pmaxsw xmm4,xmm8
pmaxsw xmm5,xmm9 ; abs(abs(P-centre)-1/2x)
ELSE
; Calculate E4 (4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)
pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

psubw xmm4,xmm6 ; abs(P-centre) - 1/2x
psubw xmm5,xmm6
pmovmskb egx,xmm4 ; 3-block clustering
pmovmskb ehx,xmm5

psubw xmm2,xmm0
psubw xmm3,xmm1
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)

pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm2,xmm4
psubw xmm3,xmm5
pmaxsw xmm4,xmm2
pmaxsw xmm5,xmm3 ; abs(abs(P-centre)-1/2x)
ENDIF

; jmp refinement_done

psubw xmm0,xmm7 ; abs(abs(P-centre)-2/3x) - 1/3x
psubw xmm1,xmm7
psubw xmm4,xmm6 ; abs(abs(P-centre)-1/2x) - 1/2x
psubw xmm5,xmm6

pmaddwd xmm0,xmm0 ; 4 mean-square-error values, doing part of the parallel add
pmaddwd xmm1,xmm1
pmaddwd xmm4,xmm4
pmaddwd xmm5,xmm5

paddd xmm0,xmm1
paddd xmm4,xmm5

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; This is a big cost
pshufd xmm5,xmm4,SHUFFLE_SELECT(2,3,0,1)
paddd xmm0,xmm1
paddd xmm4,xmm5

; Move to MMX for this last bit; faster on all but the most recent CPUs
movdq2q mm0,xmm0
movdq2q mm4,xmm4
movdq2q mm2,xmm0
movdq2q mm6,xmm4
psrlq mm0,32
psrlq mm4,32
paddd mm0,mm2
paddd mm4,mm6

; Compare E4 (and E3 if present) with current minimum error and choose result
movq mm5,mm1
pcmpgtd mm1,mm0

pmovmskb ecx,mm1

; Pause here while that result becomes available, so read these up for go-around
movdqa xmm0,[tmp_base_reg-TMP_POS] ; Read up the source values of abs(P-centre)
movdqa xmm1,[tmp_base_reg-TMP_POS+16]

test ecx,8
jnz e4_good
; e4 is not better, is e3?
pcmpgtd mm5,mm4
pmovmskb ecx,mm5
test ecx,8
jz refinement_done
; e3 is the new best
e3_best:
movq mm1,mm4
mov ecx,0ffffffffh ; 3-colour block
mov eax,egx
mov efx,ehx
jmp go_around

e4_good:
; is e3 better than e4?
movq mm5,mm0
pcmpgtd mm5,mm4
pmovmskb ecx,mm5
test ecx,8
jnz e3_best
xor ecx,ecx ; 4-colour block

movq mm1,mm0 ; Save the better max error

; Go around
go_around:
mov [tmp_base_reg-TMP_CLUSTER+8],ecx
movdqa xmm7,[tmp_base_reg-TMP_CURRENT]
movdqa [tmp_base_reg-TMP_BEST],xmm7

IF COUNT_PROG_STEPS
add eax,1
ELSE
; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDIF

; Fixed step size
psubw xmm7,[stepsize]

jmp next_refinement_loop


refinement_done:
movdqa xmm7,[tmp_base_reg-TMP_BEST]
no_prog:

ENDM ; PROG_34




COLOUR_AVERAGE MACRO output_reg, use_3component

; Output the average if there is no axis

; input is:
; average (8.4)

movq mm0,[tmp_base_reg-TMP_AVG]
IF AVG_FRAC_BITS LT 4
psllw mm0,4-AVG_FRAC_BITS ; Convert average to 8.4 format
ENDIF

IF WEIGHTING AND use_3component
pmullw mm0,[unweighting]
ENDIF
IF UNROUNDING
paddw mm0, mmword ptr [round_565]
movq mm3, mmword ptr [scale_to_round]
pmulhw mm3,mm0
pand mm3,mmword ptr [round_mask]
psubw mm0,mm3
ENDIF

pmulhw mm0,mmword ptr [scale_8_4_to_565] ; 0 R G B in 565 format
pminsw mm0,mmword ptr [clamp_565]
movq mm2,mm0
IF use_3component
movq mm4,mm0
pand mm0, [mask_third_word]
psrlq mm0,(32-11)
psrlq mm2,(16-5)
por mm2,mm4
ELSE
psrlq mm2,(16-5)
ENDIF
por mm0,mm2

; Duplicate
punpcklwd mm0,mm0
pand mm0,[mask_low_dword]
movq [output_reg],mm0
ENDM ; COLOUR_AVERAGE


COLOUR MACRO output_reg, use_3component
LOCAL noclamp
; input is:
; x (9.5, always > 0) in xmm7 (all words)
; centre (9.5)
; axis (1.15)
; average (8.AVG_FRAC_BITS)

IF COUNT_PROG_STEPS
sub eax,1
test eax,0ffffff8h
jz noclamp
mov eax,8
noclamp:
mov eax,[col_table+eax*4]
mov [edx],eax
mov dword ptr [edx+4],0
ELSEIF SHOW_BLOCK_TYPES
mov ecx,[col_table+4]
mov efx,[col_table+8]
mov eax,[tmp_base_reg-TMP_CLUSTER+8]
cmp eax,3
cmove ecx,efx
mov [edx],ecx
mov dword ptr [edx+4],0
ELSE


; pxor xmm7,xmm7 ; to force the refined X to 0

; This is mostly done in MMX code, which is faster on some machines and (slightly)
; slower only on very recent chips as there are no independent chains to
; execute in the gaps

; The two colours are avg +- ((x+-centre)*axis)
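; i.e., with x and centre in 9.5 and axis in 1.15 (a scalar sketch, illustrative only):
;
;   c0 = avg + (x + centre) * axis;   /* both products land in 8.4 */
;   c1 = avg - (x - centre) * axis;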
movq mm0,[tmp_base_reg-TMP_AVG]
IF AVG_FRAC_BITS LT 4
psllw mm0,4-AVG_FRAC_BITS ; Convert average to 8.4 format
ENDIF
movdq2q mm1,xmm7 ; 9.5 format x
movdq2q mm2,xmm7
movq mm7,[tmp_base_reg-TMP_CENTRE] ; 9.5 format centre offset
movq mm5,[tmp_base_reg-TMP_AXIS]
paddw mm1,mm7 ; 9.5 format x+-centre
psubw mm2,mm7
IF UNROUNDING
IF WEIGHTING AND use_3component
paddw mm0, mmword ptr [round_565_weighted]
ELSE
paddw mm0, mmword ptr [round_565] ; 8.4 format average
ENDIF
ENDIF
pmulhw mm1,mm5 ; 8.4 format (x+-centre)*axis
pmulhw mm2,mm5
pxor mm4,mm4
paddw mm1,mm0 ; avg + axis offset
psubw mm0,mm2 ; avg - axis offset
pmaxsw mm0,mm4 ; Clamp to positive range (can't use addusw, axis is signed)
pmaxsw mm1,mm4

IF WEIGHTING AND use_3component
pmullw mm0,mmword ptr [unweighting]
pmullw mm1,mmword ptr [unweighting]
ENDIF

IF UNROUNDING
; The (canonical) DXTC decompressor uses (should use) bit replication to generate
; 888 colour values from the 565 input. We therefore need to tweak our colours here
; to take account of this. The procedure is to add 0.5 and then subtract between
; 0 and 1 depending on the value of the input. We aren't in the 565 colourspace yet,
; still 8.4 fixed, but we can scale accordingly as long as we mask out the bits we don't want
; to contribute (consider 1F->FF, 1E->F7, 1D->EF, 1C->E7 but 1B->DE, we must only
; apply the top three bits in the unrounding process for R/B and the top two for G).
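;
; The decoder-side replication being compensated for, in C (a sketch, not part
; of the build):
;
;   r8 = (r5 << 3) | (r5 >> 2);   /* 1Fh -> FFh, 1Bh -> DEh */
;   g8 = (g6 << 2) | (g6 >> 4);
;   b8 = (b5 << 3) | (b5 >> 2);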
movq mm3, mmword ptr [scale_to_round] ; This is 5 6 5 right shifts
movq mm4,mm0
movq mm5,mm1
movq mm6, mmword ptr [round_mask]
pmulhw mm4,mm3
pmulhw mm5,mm3
pand mm4,mm6 ; Truncate off bits below the threshold point at which they influence the result
pand mm5,mm6
psubw mm0,mm4
psubw mm1,mm5
ENDIF

movq mm3, mmword ptr [scale_8_4_to_565] ; Encodes the appropriate shifts
pmulhw mm0,mm3 ; 0 R G B in 565 format
pmulhw mm1,mm3

movq mm5,mmword ptr [clamp_565]
pminsw mm0,mm5
pminsw mm1,mm5

IF use_3component
pshufw mm4,mm0,SHUFFLE_SELECT(1,3,3,3)
pshufw mm5,mm1,SHUFFLE_SELECT(1,3,3,3)
pshufw mm2,mm0,SHUFFLE_SELECT(2,3,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(2,3,3,3)
psllw mm4,5
psllw mm5,5
psllw mm2,11
psllw mm3,11
por mm0,mm4
por mm1,mm5
ELSE
movq mm4,mmword ptr [mask_low_dword]
pand mm0,mm4
pand mm1,mm4
pshufw mm2,mm0,SHUFFLE_SELECT(1,3,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(1,3,3,3)
psllw mm2,5
psllw mm3,5
ENDIF
por mm0,mm2
por mm1,mm3

; mm0 and mm1 are c0 and c1

; Read up the cluster information we'll need
movd mm7,dword ptr [tmp_base_reg-TMP_CLUSTER] ; low/high (low bit)
movd mm6,dword ptr [tmp_base_reg-TMP_CLUSTER+4] ; endpoint/splitpoint (high bit)
IF USE_34
movd mm5,dword ptr [tmp_base_reg - TMP_CLUSTER+8] ; 3-colour flag
ELSE
movq mm5,[_0000000055555555]
ENDIF

; Compare and write in correct order

; rdx contains the destination DXTC block (dx == DXTC)

; We need to use dword compares - compares are signed and we want 32 result bits anyway
pshufw mm2,mm0,SHUFFLE_SELECT(0,3,3,3) ; Word 3 is zero, saves a read up cf. an AND
pshufw mm1,mm1,SHUFFLE_SELECT(0,3,3,3)
movq mm4,mm2

; Create the mask to say which way round the colours are
pcmpgtd mm2, mm1 ; mm2 is the swap mask

IF USE_34
pxor mm2,mm5 ; swap the swap flag
pand mm5,mm6 ; bits are set if 3-colour and endpoint/splitpoint bit is set
psrld mm5,1
pxor mm5, [_0000000055555555] ; Form mask
pand mm7,mm5 ; mask low/high
ENDIF

movq mm3,mm0
pand mm5,mm2 ; mm5 is the cluster swap bit pattern

pcmpeqd mm4, mm1 ; Set the equality flag for clustering info
pxor mm7,mm5 ; Apply the cluster swap or not

punpcklwd mm0, mm1 ; the two 565 colours in normal order
punpcklwd mm1, mm3 ; the two 565 colours in reversed order
por mm7,mm6 ; merge endpoint / splitpoint cluster with low/high cluster

pand mm0, mm2
pandn mm2, mm1

pandn mm4,mm7 ; apply zero mask flag to cluster bits

por mm0, mm2 ; one of the two colour sets as selected by the swap mask

punpckldq mm0,mm4 ; merge the colour and the cluster bits
movq [output_reg],mm0 ; and write

; 0 is the low word endpoint
; 1 is the high word endpoint
; 2 is the split point near 0
; 3 is the split point near 1

; For transparent blocks
; 0 is the low word endpoint
; 1 is the high word endpoint
; 2 is the interpolated point

ENDIF ; COUNT_PROG_STEPS

ENDM ; COLOUR



IF 0

AVERAGE_RGB_RB MACRO src_reg, stride_reg
movdqa xmm0, xmmword ptr [mask_rb]

; Read up all 16 values
; Use DQU since we can't be certain they're aligned
movdqu xmm4,[src_reg]
movdqu xmm5,[src_reg+stride_reg]
lea src_reg,[src_reg+2*stride_reg]
movdqu xmm6,[src_reg]
movdqu xmm7,[src_reg+stride_reg]

; RB is hugely convenient - we just need the AND and we have 4 (B R) pairs in each register
pand xmm4,xmm0
pand xmm5,xmm0
pand xmm6,xmm0
pand xmm7,xmm0

; We can keep the unpacked values in the scratch regs for now
movdqa xmm0,xmm4
movdqa xmm1,xmm5
paddw xmm0,xmm6
paddw xmm1,xmm7
paddw xmm0,xmm1

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; dword halves
paddw xmm0,xmm1
pshuflw xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; word halves; do both sides, so we get the whole reg populated cheaper than a post shufd
pshufhw xmm1,xmm1,SHUFFLE_SELECT(2,3,0,1)
paddw xmm0,xmm1

; We now do a 'virtual divide' by 16 to generate the average, so
; xmm0 now has the average in all word pairs R,B in 8.4 fixed point format
movdqa [tmp_base_reg-TMP_AVG],xmm0
ENDM ; AVERAGE_RGB_RB

ACCUMULATE_AXIS_2C MACRO reg, lastreg, destination
psllw reg,4
pxor xmm4,xmm4
psubw reg,xmm0
movdqa destination,reg
pshuflw xmm2,reg,SHUFFLE_SELECT(1,0,3,2) ; reverse R and B for axis ordering calc
psubw xmm4,reg ; -axis
pshufhw xmm2,xmm2,SHUFFLE_SELECT(1,0,3,2)
pmaxsw xmm4,reg ; abs(axis)
psraw xmm2,16 ; state of sign bit
paddw xmm1,xmm4
pandn reg,xmm2
paddw reg,lastreg
ENDM

; We need to specify the reg to be used for indexing on the macro call (for easy portability to x64)
AXIS_2COMPONENT MACRO no_axis

; Expects: xmm4-7 have expanded RB values
; xmm0 is the average in the form R B R B R B R B
; Is expected to: set xmm1 to the absolute axis
; set xmm2 to the axis ordering info (for each axis pair sum of one value when the other value is positive or zero)

psllw xmm4,4
pxor xmm1,xmm1
psubw xmm4,xmm0
movdqa [tmp_base_reg-TMP_TMP],xmm4
pshuflw xmm3,xmm4,SHUFFLE_SELECT(1,0,3,2) ; reverse R and B for axis ordering calc
psubw xmm1,xmm4 ; -axis
pshufhw xmm3,xmm3,SHUFFLE_SELECT(1,0,3,2)
pmaxsw xmm1,xmm4 ; abs(axis)
psraw xmm3,16 ; state of sign bit
pandn xmm3,xmm4 ; pos

ACCUMULATE_AXIS_2C xmm5, xmm3, [tmp_base_reg-TMP_TMP+16]
ACCUMULATE_AXIS_2C xmm6, xmm5, [tmp_base_reg-TMP_TMP+32]
ACCUMULATE_AXIS_2C xmm7, xmm6, [tmp_base_reg-TMP_TMP+48]

; xmm7 is now the axis ordering info

; parallel add the 4 results in xmm1 and xmm7->xmm2 to get final absolute axis info
pshufd xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1) ; dword halves
pshufd xmm2,xmm7,SHUFFLE_SELECT(2,3,0,1)
paddw xmm1,xmm3
paddw xmm2,xmm7
pshuflw xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1) ; word halves
pshuflw xmm7,xmm2,SHUFFLE_SELECT(2,3,0,1)
paddw xmm1,xmm3 ; Final summed absolute axis
paddw xmm2,xmm7 ; Final summed pos

; If the axis is 0 we have a constant-colour block - we must catch this here (division by zero otherwise)
pxor xmm3,xmm3
pcmpeqw xmm3,xmm1
pmovmskb ecx,xmm3
and ecx,0ah
cmp ecx,0ah
je no_axis
ENDM ; AXIS_2COMPONENT

AXIS_NORM_2COMPONENT MACRO
; Expects: xmm1 to be the axis and xmm2 to be the ordering info
; Is expected to: set TMP_TMP to RB-avg
; set xmm7 to the axis

; axis is in 8.8 fixed point - it needs normalisation and ordering (the signs set correctly)

; We need to normalise the axis below, and negation is easier in float, so promote to float
pand xmm2, xmmword ptr [dword_word_mask] ; We only need the B part of the sign to be meaningful (the R-B axis)
pxor xmm4,xmm4
punpcklwd xmm1,xmm4 ; The axis is always positive so we can unpack with 0
punpcklwd xmm4,xmm2 ; Only the sign matters

; Promote both to float
cvtdq2ps xmm7,xmm1
cvtdq2ps xmm5,xmm4

; We can start the normalisation while the axis signs are being set
movaps xmm1, xmm7
xorps xmm2,xmm2

; magnitude is a 2D dot product
mulps xmm7, xmm7

cmpltps xmm5,xmm2 ; Signs of the axis ordering

movaps xmm4,xmm7
shufps xmm7,xmm7,SHUFFLE_SELECT(1,0,1,0)
addss xmm7,xmm4

andps xmm5,[b_sign_bit]

; low of xmm7 is the DP result
; If this is 0 we haven't actually got an axis, and we can't rsq it,
; so mask the output to 0 in this case. This generates an acceptable result
; It may also be important that the top bits of the register used for the and stay as 0.
; Otherwise, it ands together two floating-point-messes...
cmpneqss xmm2,xmm7 ; xmm2 was still zero to this point

xorps xmm1,xmm5 ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope

; This skips the Newton-Raphson. It's half to 1% faster, and it seems to make very little difference to the
; int compressor (which is what I'd expect - we're only working at around 15 bits of precision anyway)
rsqrtss xmm7, xmm7
andps xmm7, xmm2 ; zero mask

; We also need the result in the correct 1.15 format when we send it back, so
; multiply that through here while we're waiting for the rcp to finish
mulps xmm1, xmmword ptr [scale_1_15]

shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
shufps xmm1,xmm1, SHUFFLE_SELECT(0,1,0,1)

; Normalise
mulps xmm7, xmm1

; Get it back into int
cvtps2dq xmm7,xmm7
movdqa xmm1,xmm7
packssdw xmm7,xmm7 ; This duplicates as well, which is what we want
movdqa [tmp_base_reg-TMP_AXIS], xmm7

ENDM ; AXIS_NORM_2COMPONENT



POS_2COMPONENT MACRO

; Expects: TMP_TMP to have expanded RGB-average values
; xmm7 == axis in form R G R G R G R G
; Is expected to: put abs(P-centre) in TMP_POS
; put 'centre side' info in TMP_CLUSTER_CENTRE

movdqa xmm0,[tmp_base_reg-TMP_TMP] ; Four (RB-avg) values in 9.4 (-255.0 to +255.0)
movdqa xmm1,[tmp_base_reg-TMP_TMP+16]
movdqa xmm2,[tmp_base_reg-TMP_TMP+32]
movdqa xmm3,[tmp_base_reg-TMP_TMP+48]
pmaddwd xmm0,xmm7 ; Multiply by axis and do 2D dot product: four 32-bit results in 9.19 format
pmaddwd xmm1,xmm7
pmaddwd xmm2,xmm7
pmaddwd xmm3,xmm7

psrad xmm0,10+AXIS_SHIFT_BITS ; Convert to desired 9.5 format (preserving sign)
psrad xmm1,10+AXIS_SHIFT_BITS
psrad xmm2,10+AXIS_SHIFT_BITS
psrad xmm3,10+AXIS_SHIFT_BITS

packssdw xmm0,xmm1 ; 8 results in xmm0
packssdw xmm2,xmm3

movdqa [tmp_base_reg-TMP_POS],xmm0
movdqa [tmp_base_reg-TMP_POS+16],xmm2

POS_MINMAX
POS_CENTRE
POS_OFFSET 1

ENDM ; POS_2COMPONENT

ENDIF


IF 0

DXTCV11CompressAlphaBlockSSE2 PROC

; The alpha compressor is somewhat simpler. There is no need to find an
; axis, and the min and max values serve to determine pos_minmax from
; which centre is generated.

; Progressive refinement is also less important (as there are more interpolated
; values available and it is more important that the endpoints be represented
; correctly)

movd mm4, dword ptr [ecx]
movd mm5, dword ptr [ecx+eax]
lea ecx,[ecx+2*eax]
movq mm0,mm4
movq mm1,mm4
movd mm6, dword ptr [ecx]
movd mm7, dword ptr [ecx+eax]
pmaxub mm0,mm5
pminub mm1,mm5
pmaxub mm0,mm6
pminub mm1,mm6
pmaxub mm0,mm7
pminub mm1,mm7

; Parallel minmax to finish up
pshufw mm2,mm0,SHUFFLE_SELECT(1,0,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(1,0,3,3)
pmaxub mm0,mm2
pminub mm1,mm3
movq mm2,mm0
movq mm3,mm1
psrlw mm0,8
psrlw mm1,8
pmaxub mm0,mm2
pminub mm1,mm3

; Promote to SSE2 and convert to words
punpckldq mm4,mm5
punpckldq mm6,mm7
pxor xmm5,xmm5
movq2dq xmm0,mm4
movq2dq xmm1,mm6
punpcklbw xmm0,xmm5
punpcklbw xmm1,xmm5

; Calculate centre
; Could use pavgb for this, but let's keep the precision
movq2dq xmm2,mm0
movq2dq xmm3,mm1
punpcklbw xmm2,xmm5
punpcklbw xmm3,xmm5
paddw xmm2,xmm3 ; 8.1 format centre
pshuflw xmm2,xmm2,SHUFFLE_SELECT(0,0,0,0)
pshufd xmm2,xmm2,SHUFFLE_SELECT(0,0,0,0)
movdqa [tmp_base_reg-TMP_CENTRE], xmm2 ; offset centre (9.5 format) for later correction of refined X

psllw xmm0,1 ; 8.1 format values
psllw xmm1,1

POS_OFFSET 0


; Select an endpoint which is exactly representable
; Any step can then be of 1.0


; Round all interpolated positions to integer values before calculating E8



; Given n, find the output value
; 0 2 3 4 5 6 7 1 is the output for given values of n

movdqa xmm1,[bytes_7]
pxor xmm2,xmm2
pcmpeqb xmm3,xmm3 ; all 1s

pcmpeqb xmm1,xmm0 ; mask set if 7
pcmpeqb xmm2,xmm0 ; mask set if 0
pxor xmm3,xmm1
pxor xmm3,xmm2 ; mask set if 1-6

pand xmm2,[bytes_1]
psubb xmm0,[bytes_1]
pand xmm3,xmm0
por xmm3,xmm2 ; final result in unpacked form

movdqa xmm0,xmm3
pand xmm3,[_07000700s] ; alternate mids
pand xmm0,[_00070007s] ; alternate lows
psrlw xmm3,5
por xmm0,xmm3 ; 8 sets of 6-bit pairs of results
movdqa xmm3,xmm0
pand xmm0,[_003f0000003f0000s] ; alternate mids
pand xmm3,[_0000003f0000003fs] ; alternate lows
psrld xmm0,12
por xmm0,xmm3 ; 4 sets of 12-bit 4 results

movdqa xmm3,xmm0
pand xmm0,[_00000fff00000000s] ; alternate mids
pand xmm3,[_0000000000000fffs] ; alternate lows
psrlq xmm0,20
por xmm0,xmm3 ; 2 sets of 24-bit 8 results

movdqa xmm3,xmm0
psrldq xmm0,10 ; 10-byte shift
por xmm3,xmm0 ; Result

; or-in endpoint values


DXTCV11CompressAlphaBlockSSE2 ENDP

ENDIF


;void __fastcall DXTCV11CompressExplicitAlphaBlockMMX(BYTE block_8[16], DWORD block_dxtc[2]);
IF X64
DXTCV11CompressExplicitAlphaBlockMMX PROC
movq mm0,[rcx]
movq mm1,[rcx+8]
ELSE
@DXTCV11CompressExplicitAlphaBlockMMX@8 PROC
movq mm0,[ecx]
movq mm1,[ecx+8]
ENDIF

; We have to adjust the values because of the derounding operation in the decode
; We need to add (7 - top nybble) to each byte
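;
; In scalar terms, per byte a (a C sketch, illustrative only):
;
;   int t = a >> 4;            /* current top nybble                          */
;   int n = (a + 7 - t) >> 4;  /* nybble whose replicated decode (n*17) is    */
;                              /* nearest to a; equivalent to (a + 8) / 17    */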
movq mm4,[_0707070707070707]
movq mm5,mm4
movq mm2,mm0
movq mm3,mm1
psrlq mm2,4
psrlq mm3,4
movq mm6,[_0f0f0f0f0f0f0f0f]
pand mm2,mm6
pand mm3,mm6
psubb mm4,mm2 ; This is a signed value
psubb mm5,mm3
paddusb mm0,mm4 ; ... which is added or subtracted using unsigned saturation on the result to clamp to 0/255
paddusb mm1,mm5

; We need to pack into a single 64-bit word, discarding the lower bits

movq mm2,mm0
movq mm3,mm1
psrlq mm0,4 ; mm0 has x7x6x5x4x3x2x1x0
psrlq mm1,4
psrlq mm2,8 ; mm2 has 0x7x6x5x4x3x2x1x
psrlq mm3,8

movq mm4, [_000f000f000f000f]
movq mm5, [_00f000f000f000f0]
pand mm0,mm4 ; ...6...4...2...0
pand mm1,mm4
pand mm2,mm5 ; ..7...5...3...1.
pand mm3,mm5
por mm0,mm2 ; ..76..54..32..10
por mm1,mm3

packuswb mm0,mm1 ; fedcba98 76543210

IF X64
movq [rdx],mm0
ELSE
movq [edx],mm0
ENDIF

emms
ret

IF X64
DXTCV11CompressExplicitAlphaBlockMMX ENDP
ELSE
@DXTCV11CompressExplicitAlphaBlockMMX@8 ENDP
ENDIF



main_proc_name PROC
; Fetch stride and source according to calling conventions
mov stride_reg,16 ; Packed data input at the moment
IF X64
; rcx and rdx are already the correct source and destination
ELSE
; Fetch stride and source according to calling conventions
mov source_reg,[esp+4]
mov dest_reg,[esp+8]
ENDIF
;lea source_reg, white
;lea source_reg, whiteblack
;lea source_reg, redblack
;lea source_reg, redsblack_prog
;lea source_reg, redsblack2
;lea source_reg, greenred

SAVE_REGS

AVERAGE_RGB source_reg, stride_reg
; jmp no_axis ; Insert this to test averaging

AXIS_3COMPONENT no_axis
POS_3COMPONENT
IF NO_PROG
NOPROG_CLUSTER
ELSEIF USE_34
PROG_34
ELSE
PROG
ENDIF
colour:
COLOUR dest_reg, 1

exit:
RESTORE_REGS
ret

no_axis:
COLOUR_AVERAGE dest_reg, 1
jmp exit

main_proc_name ENDP




IF 0
mov eax,16 ; Packed data input at the moment
mov ecx,[esp+4]
mov edx,[esp+8]
;lea ecx, bluered
;lea ecx,fade

SAVE_REGS

AVERAGE_RGB_RB ecx, eax
; jmp no_axis ; Insert this to test averaging

AXIS_2COMPONENT no_axis
AXIS_NORM_2COMPONENT
POS_2COMPONENT
PROG
COLOUR edx, 0

exit:
RESTORE_REGS
ret

no_axis:
COLOUR_AVERAGE edx, 0
jmp exit


ELSE
ENDIF



IF X64 EQ 0

; Prototype for fastcall with stride...
;void __fastcall DXTCV11CompressBlockSSE2Strided(DWORD *block_32, DWORD *block_dxtc, DWORD input_stride);
@DXTCV11CompressBlockSSE2Strided@12 PROC
; Fetch stride and source according to calling conventions
mov eax,[esp+4]
SAVE_REGS

AVERAGE_RGB ecx, eax
; jmp no_axis ; Insert this to test averaging

AXIS_3COMPONENT no_axis
POS_3COMPONENT
PROG
COLOUR edx, 1

exit:
RESTORE_REGS
ret 4 ; __fastcall: callee pops the single stack argument

no_axis:
COLOUR_AVERAGE edx, 1
jmp exit

@DXTCV11CompressBlockSSE2Strided@12 ENDP

ENDIF


END


; x64 uses register calling conventions
; The first four parameters are put in rcx, rdx, r8, r9 (floats would be in xmm0-3)
; rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above - all others must be saved

; void __cdecl DXTCCompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
; block_dxtc == rdx, how convenient!