;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================
; Assemble with:
; ml /DX64=0 /W3 /Fo $(OutDir)\$(InputName).obj /c /coff /Cp /Zi $(InputPath)
; ml64 /DX64=1 /W3 /Fo $(OutDir)\$(InputName).obj /c /Cp /Zi $(InputPath)


; Performance / test / tuning options

WEIGHTING equ 1 ; Allows (limited) weighting of RGB components (currently 2 : 4 : 1)
UNROUNDING equ 1 ; Costs ~1%; improves MSE slightly
PROG_THRESHOLD equ 1 ; Enable to skip prog on small-signal blocks; pointless overhead if NO_PROG is set

NO_PROG equ 0 ; Note that weighting is not relevant if NO_PROG is enabled
USE_34 equ 0 ; Enable 3-colour blocks. Costs ~10% on Phenom with X64 (more like 20% on older chips with win32); helps most images little, but fixes some challenging blocks

COUNT_PROG_STEPS equ 0 ; Generates a false-colour image; black is 2 prog steps, with higher counts rising through the Spectrum palette
SHOW_BLOCK_TYPES equ 0 ; USE_34 must be enabled

; This is the dial controlling the point at which prog turns on. The tipping point is somewhere between 8 and
; 12; increases above 12 start to add noise and hit MSE noticeably, while below 8 there are few gains.
; For a typical image 8 provides about a 2.5% performance gain and 12 about 7.5%; 16 only
; adds another 2%. In the end 12 seems acceptable; the image is very similar and the shape of the noise is not
; significantly changed, while the diminishing returns above this point make going further seem unwise.
prog_threshold equ 12*32 ; X is in 9.5 format
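
; For reference, 9.5 fixed point stores a value v as v*32, so the threshold of 12
; above becomes 12*32 = 384. A trivial C sketch (illustrative only, not part of
; the build):
;
;   int to_9_5(int v)    { return v * 32; }   /* 12 -> 384 */
;   int from_9_5(int fx) { return fx / 32; }  /* truncating */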

; Increasing stepsize suffers from decreasing performance returns (due to a minimum of 2 prog steps being
; required), but decreasing it suffers from increasing inaccuracy once a certain point is crossed. Selecting
; the optimum step size is therefore an important performance/quality tradeoff; excellent results are
; possible with high performance if this is correctly tuned.
; If weighting is off, larger step sizes appear to make less difference in quality.
step equ 3*32 ; X is in 9.5 format; we want to step somewhere between 1.0 and 4.0
;step equ 2*32
;step equ 4*32
;step equ 8*32

; We could make step programmable to trade off performance and image quality


IF X64 EQ 0
.686
.MMX
.XMM
.MODEL FLAT
ENDIF


.DATA

white DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh

whiteblack DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0ffffffh
DWORD 0ffffffh, 0000000h, 0000000h, 0ffffffh
DWORD 0ffffffh, 0ffffffh, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redblack DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0000000h, 0000000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redsblack DWORD 0ff0000h, 0aa0000h, 0aa0000h, 0ff0000h
DWORD 0ff0000h, 0550000h, 0550000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0000000h, 0000000h
DWORD 0000000h, 0000000h, 0000000h, 0000000h

redsblack_prog DWORD 0ff0000h, 0990000h, 0990000h, 0990000h
DWORD 0990000h, 0660000h, 0660000h, 0990000h
DWORD 0990000h, 0990000h, 0660000h, 0000000h
DWORD 0660000h, 0660000h, 0660000h, 0660000h

redsblack2 DWORD 0e00000h, 0e00000h, 0e00000h, 0e00000h
DWORD 0c00000h, 0c00000h, 0c00000h, 0c00000h
DWORD 0a00000h, 0a00000h, 0a00000h, 0a00000h
DWORD 0800000h, 0800000h, 0800000h, 0800000h

greenred DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 000ff00h, 000ff00h, 000ff00h, 000ff00h
DWORD 000ff00h, 000ff00h, 000ff00h, 000ff00h

bluered DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 0ff0000h, 0ff0000h, 0ff0000h, 0ff0000h
DWORD 00000ffh, 00000ffh, 00000ffh, 00000ffh
DWORD 00000ffh, 00000ffh, 00000ffh, 00000ffh

fade DWORD 0ffffffh, 0ffffffh, 0b0b0b0h, 0707070h
DWORD 0ffffffh, 0ffffffh, 0b0b0b0h, 0707070h
DWORD 0ffffffh, 0ffffffh, 0d0d0d0h, 0909090h
DWORD 0ffffffh, 0ffffffh, 0ffffffh, 0e0e0e0h

col_table DWORD 000000000h, 0001f001fh, 0f800f800h, 0f81ff81fh
DWORD 007e007e0h, 007ff07ffh, 0ffe0ffe0h, 0ffffffffh
DWORD 084108410h

ALIGN 16

testval equ 0400h
test_val WORD testval, testval, testval, testval, testval, testval, testval, testval

one_third equ 65536/3


AVG_FRAC_BITS equ 4 ; 0-4

mask_rgb QWORD 000ffffff00ffffffh, 000ffffff00ffffffh
mask_rg QWORD 000ffff0000ffff00h, 000ffff0000ffff00h
mask_rb QWORD 000ff00ff00ff00ffh, 000ff00ff00ff00ffh
mask_gb QWORD 00000ffff0000ffffh, 00000ffff0000ffffh
mask_r QWORD 000ff000000ff0000h, 000ff000000ff0000h
mask_b QWORD 0000000ff000000ffh, 0000000ff000000ffh
mask_low_qword QWORD 0ffffffffffffffffh, 0
mask_low_dword QWORD 000000000ffffffffh, 0
mask_low_word QWORD 0000000000000ffffh, 0
mask_third_word QWORD 00000ffff00000000h, 0
dword_word_mask QWORD 00000ffff0000ffffh, 00000ffff0000ffffh
quad_word_mask QWORD 0000000000000ffffh, 0000000000000ffffh
quad_dword_mask QWORD 000000000ffffffffh, 000000000ffffffffh
quad_upper_dword_mask QWORD 0ffffffff00000000h, 0ffffffff00000000h
max_sint32 DWORD 07fffffffh, 07fffffffh, 07fffffffh, 07fffffffh

scale_one_third WORD one_third,one_third,one_third,one_third,one_third,one_third,one_third,one_third
stepsize WORD step,step,step,step,step,step,step,step
prog_threshold_size WORD prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold,prog_threshold

IF WEIGHTING
WEIGHTING_BITS equ 4 ; If this equals AVG_FRAC_BITS a few extra optimisations kick in
weighting WORD 4,16,8,0, 4,16,8,0
unweighting WORD 4,1,2,0
round_565_weighted WORD 0010h, 0020h, 0020h, 0
ELSE
WEIGHTING_BITS equ 0
ENDIF

IF UNROUNDING
round_565 WORD 0040h,0020h,0040h,0 ; This is applied to the average, which is in 8.4 format, so it's 0.5 in 5.7, 6.6, 5.7 format
scale_to_round WORD 2048,1024,2048,0 ; This is a 5 6 5 right shift in 8.4 format
round_mask WORD 0fff0h, 0fff0h, 0fff0h, 0
ENDIF


scale_8_4_to_565 WORD 32*16, 64*16, 32*16, 0
clamp_565 WORD 31,63,31,0

_0000000055555555 QWORD 00000000055555555h

_0707070707070707 QWORD 00707070707070707h
_0f0f0f0f0f0f0f0f QWORD 00f0f0f0f0f0f0f0fh
_000f000f000f000f QWORD 0000f000f000f000fh
_00f000f000f000f0 QWORD 000f000f000f000f0h

ALIGN 16

; The axis ordering table. This is a table of XMMWORD 4-float values that set the axis signs
; using the index made up of the RB,GB,GR negative correlation and zero signal bits
; In addition, the scaling factor to convert from -1,1 range to 1.15 fixed point is baked in
scale_1_15 equ 32768.0

AXIS_ORDER_ENTRY MACRO rb_neg, gb_zero, gb_neg, gr_zero, gr_neg
IF (gr_zero NE 0) AND (gb_zero NE 0)
; use rb on B
IF (rb_neg NE 0)
REAL4 -scale_1_15, scale_1_15, scale_1_15, 0.0
ELSE
REAL4 scale_1_15, scale_1_15, scale_1_15, 0.0
ENDIF
ELSE
; use gb_pos on B and gr_pos on R
IF (gb_neg NE 0)
REAL4 -scale_1_15, scale_1_15
ELSE
REAL4 scale_1_15, scale_1_15
ENDIF
IF (gr_neg NE 0)
REAL4 -scale_1_15, 0.0
ELSE
REAL4 scale_1_15, 0.0
ENDIF
ENDIF
ENDM


axis_order_table LABEL XMMWORD
axis_order_count = 0
WHILE (axis_order_count LT 32)
AXIS_ORDER_ENTRY (axis_order_count AND 1), (axis_order_count AND 2), (axis_order_count AND 4), (axis_order_count AND 8), (axis_order_count AND 010h)
axis_order_count = axis_order_count + 1
ENDM
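
; For reference, a C sketch (illustrative only, not part of the build) of how the
; 5-bit index into this 32-entry table is formed and used; the bit assignments
; match the AXIS_ORDER_ENTRY arguments above and the comment in AXIS_3COMPONENT:
;
;   /* each entry holds four +/-32768.0 multipliers in B G R 0 order */
;   extern float axis_order[32][4];
;
;   int index = (rb_neg  << 0) | (gb_zero << 1) | (gb_neg << 2)
;             | (gr_zero << 3) | (gr_neg  << 4);
;   for (int c = 0; c < 4; c++)
;       axis_1_15[c] = axis_f[c] * axis_order[index][c]; /* sign + 1.15 scale */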


; A few helper macros to make shift code cleaner
AVG_SHIFT_BITS equ WEIGHTING_BITS + (4-AVG_FRAC_BITS)
IF AVG_SHIFT_BITS
avg_round_val equ 1 SHL (AVG_SHIFT_BITS-1) ; 0.5 correction
avg_round WORD avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val,avg_round_val
ENDIF

INPUT_SHIFT_BITS equ WEIGHTING_BITS - AVG_FRAC_BITS ; input has been left shifted by weight, needs to be left shifted by frac

INPUT_SHIFT MACRO reg
IF INPUT_SHIFT_BITS GT 0
psrlw reg,INPUT_SHIFT_BITS
ELSEIF INPUT_SHIFT_BITS LT 0
psllw reg,-INPUT_SHIFT_BITS
ENDIF
ENDM


; Macro to generate the correct value for use in the various shuffle ops
SHUFFLE_SELECT MACRO a,b,c,d
LOCAL Value
IF (a gt 3) or (b gt 3) or (c gt 3) or (d gt 3)
.ERR
EXITM <0>
ENDIF
Value = ((a) OR (b SHL 2) OR (c SHL 4) OR (d SHL 6))
EXITM %Value
ENDM
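
; The C equivalent, for reference (a sketch, not part of the build); the value is
; the 8-bit immediate consumed by pshufd/pshuflw/pshufhw/shufps:
;
;   #define SHUFFLE_SELECT(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
;
;   /* e.g. SHUFFLE_SELECT(2,3,0,1) == 4Eh, which swaps the two qword halves */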


.CODE

; The DXTC compressor is a six-step process:
; 1. AVERAGE: find the average value of the block
; 2. AXIS: compute the compression axis
; 3. POS: calculate the position on the axis of each data value in the block
; 4. PROG: progressively refine the selected endpoints to find an error minimum
; 5. COLOUR: generate output colours from the endpoints, axis and average
; 6. CLUSTER: cluster the data values into the appropriate clusters

; Each of these is implemented by macros to allow the composition of different components
; to form a particular block compressor; this also eases portability to x64. For example,
; changing AVERAGE would allow different structured input, and changing AVERAGE, AXIS and POS
; allows e.g. 2D vs. 3D block types.

; Macros make the code harder to debug, because MASM generates the error messages with line
; numbers corresponding to the invocation of the macro rather than inside the macro. Enabling
; /W3 helps; it then shows the line offset inside the macro. Short macros are therefore a
; lot easier to debug than long ones.

; CLUSTER has been subsumed into pos, prog and colour in this implementation
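
; For orientation, a rough scalar sketch of the flow (assumed C, illustrative
; only; the names are hypothetical and this is not part of the build):
;
;   void compress_block(const unsigned char src[16][4], unsigned int out[2])
;   {
;       short avg[4], axis[4], pos[16];
;       average(src, avg);                /* 1. AVERAGE                    */
;       compute_axis(src, avg, axis);     /* 2. AXIS                       */
;       project(src, avg, axis, pos);     /* 3. POS                        */
;       refine_endpoints(pos);            /* 4. PROG                       */
;       emit_colours(avg, axis, out);     /* 5. COLOUR                     */
;       /* 6. CLUSTER is folded into POS, PROG and COLOUR as noted above   */
;   }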

; Temporaries
TMP_TMP equ (8*16) ; Basic tmp area - 8 XMMWORD each contains two WORD RGB0 source values
TMP_AVG equ TMP_TMP + 16
TMP_AXIS equ TMP_AVG + 16
TMP_POS equ TMP_AXIS + (2*16)
TMP_CLUSTER equ TMP_POS + 16 ; 2 dwords; centre and split
TMP_BEST equ TMP_CLUSTER + 16
TMP_CENTRE equ TMP_BEST + 16
TMP_CURRENT equ TMP_CENTRE + 16


; Prologue / epilog / register management
IF X64
; We can use volatile registers for some of the scratch regs we need so we don't need to save them
rfx equ r8
efx equ r8d
tmp_base_reg equ rbp ; Tried r9 but faster with rbp - REX prefix overhead perhaps?
source_reg equ rcx
dest_reg equ rdx
stride_reg equ rax
egx equ r10d
ehx equ r11d

main_proc_name equ DXTCV11CompressBlockSSE2

SAVED_REGS equ 8
TMP_REGSAVE equ TMP_CENTRE + (SAVED_REGS*16)


SAVE_REG MACRO reg, n
IF n LE SAVED_REGS
movdqa [tmp_base_reg-TMP_REGSAVE+(n*16)], reg
ENDIF
ENDM

RESTORE_REG MACRO reg, n
IF n LE SAVED_REGS
movdqa reg, [tmp_base_reg-TMP_REGSAVE+(n*16)]
ENDIF
ENDM

SAVE_REGS MACRO
; push rsi
; push rdi
; push rbx
push rbp

; Set up a 16-byte aligned storage space pointer
mov tmp_base_reg, rsp
and tmp_base_reg, NOT 0fh

; Any xmm regs over 5 need to be saved here as well
SAVE_REG xmm6,1
SAVE_REG xmm7,2
SAVE_REG xmm8,3
SAVE_REG xmm9,4
SAVE_REG xmm10,5
SAVE_REG xmm11,6
SAVE_REG xmm12,7
SAVE_REG xmm13,8
SAVE_REG xmm14,9
SAVE_REG xmm15,10
ENDM

RESTORE_REGS MACRO
RESTORE_REG xmm6,1
RESTORE_REG xmm7,2
RESTORE_REG xmm8,3
RESTORE_REG xmm9,4
RESTORE_REG xmm10,5
RESTORE_REG xmm11,6
RESTORE_REG xmm12,7
RESTORE_REG xmm13,8
RESTORE_REG xmm14,9
RESTORE_REG xmm15,10

pop rbp
; pop rbx
; pop rdi
; pop rsi
ENDM

ELSE ; X64

; We don't have spare volatile regs
efx equ ebx
tmp_base_reg equ ebp
source_reg equ ecx
dest_reg equ edx
stride_reg equ eax
IF USE_34
egx equ esi
ehx equ edi
ENDIF

main_proc_name equ _DXTCV11CompressBlockSSE2

SAVE_REGS MACRO
IF USE_34
push esi
push edi
ENDIF

push ebx
push ebp

; Set up the tmp pointer aligned on a 16-byte boundary
mov ebp, esp
and ebp, NOT 0fh
ENDM

RESTORE_REGS MACRO
pop ebp
pop ebx
IF USE_34
pop edi
pop esi
ENDIF

emms
ENDM

ENDIF ; X64


COLOURSPACE_TRANSFORM_SETUP MACRO
IF WEIGHTING
movdqa xmm5, xmmword ptr [weighting]
ELSE
movdqa xmm5, xmmword ptr [mask_rgb]
ENDIF
ENDM

COLOURSPACE_TRANSFORM MACRO reg1, reg2
IF WEIGHTING
; Weighting doesn't need masking - the weighting mul clears out the A component
ELSE
pand reg1,xmm5
pand reg2,xmm5
ENDIF
ENDM


AVERAGE_RGB MACRO src_reg, stride_reg

COLOURSPACE_TRANSFORM_SETUP

; Read source data - use DQU since we can't be certain it's aligned; the
; overhead is not important given the size of the rest of the calculation
movdqu xmm0,[src_reg]
movdqu xmm2,[src_reg+stride_reg]
pxor xmm7,xmm7

COLOURSPACE_TRANSFORM xmm0,xmm2

movdqa xmm1,xmm0
movdqa xmm3,xmm2
punpcklbw xmm0,xmm7 ; expand to two 0RGB values in each XMM, 8 XMMs total
punpckhbw xmm1,xmm7
punpcklbw xmm2,xmm7
punpckhbw xmm3,xmm7

IF WEIGHTING
pmullw xmm0,xmm5
pmullw xmm1,xmm5
pmullw xmm2,xmm5
pmullw xmm3,xmm5
ENDIF

; Write unpacked values (8 bits) to the scratch buffer: they're needed
; again in the axis calculation and we don't have the registers to keep
; them around (we probably can on x64)
movdqa [tmp_base_reg-TMP_TMP],xmm0
movdqa [tmp_base_reg-TMP_TMP+16],xmm1
paddw xmm0,xmm1 ; Start accumulating the result
movdqa [tmp_base_reg-TMP_TMP+32],xmm2
movdqa [tmp_base_reg-TMP_TMP+48],xmm3
paddw xmm2,xmm3

lea src_reg,[src_reg+2*stride_reg]
paddw xmm0,xmm2

movdqu xmm2,[src_reg]
movdqu xmm4,[src_reg+stride_reg]

COLOURSPACE_TRANSFORM xmm2,xmm4

IF X64
movdqa xmm6,xmm2
movdqa xmm9,xmm4
punpcklbw xmm2,xmm7
punpckhbw xmm6,xmm7
punpcklbw xmm4,xmm7
punpckhbw xmm9,xmm7
IF WEIGHTING
pmullw xmm2,xmm5
pmullw xmm6,xmm5
pmullw xmm4,xmm5
pmullw xmm9,xmm5
ENDIF

; 7, 6, 8 and 9 are the inputs to the axis comp (so no need to write to the scratchpad)
movdqa xmm7,xmm2
movdqa xmm8,xmm4
paddw xmm2,xmm6
paddw xmm4,xmm9
ELSE ; X64
movdqa xmm1,xmm2
movdqa xmm6,xmm4
punpcklbw xmm2,xmm7
punpckhbw xmm1,xmm7
punpcklbw xmm4,xmm7
punpckhbw xmm6,xmm7
IF WEIGHTING
pmullw xmm2,xmm5
pmullw xmm1,xmm5
pmullw xmm4,xmm5
pmullw xmm6,xmm5
ENDIF

; 7 and 6 are the inputs to the axis comp (so no need to write to the scratchpad)
movdqa [tmp_base_reg-TMP_TMP+64],xmm2
movdqa [tmp_base_reg-TMP_TMP+80],xmm1
paddw xmm2,xmm1
movdqa xmm7,xmm4
paddw xmm4,xmm6
ENDIF ; X64

paddw xmm0,xmm2

paddw xmm0,xmm4 ; xmm0 has 8 value sums in each qword

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; swap so result is in both qwords
paddw xmm0,xmm1

; Convert the average to have the correct fractional bit count
; Some combination of real and virtual shifts amounts to a divide by 16 in total to generate the average, so
; xmm0 now has the average in low 4 words 0,R,G,B in 8.0 to 8.4 fixed point format
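;
; In scalar terms (a C sketch, illustrative only): with WEIGHTING_BITS == w and
; AVG_FRAC_BITS == f, each summed channel now holds avg << (4 + w), so the
; rounded shift below is
;
;   avg_8_f = (sum + (1 << (w + 3 - f))) >> (w + 4 - f);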

IF AVG_SHIFT_BITS
paddw xmm0,[avg_round]
psrlw xmm0,AVG_SHIFT_BITS
ENDIF
movdqa [tmp_base_reg-TMP_AVG],xmm0

ENDM ; AVERAGE_RGB


IF X64

; For X64, the ability to 4-way interleave is slightly faster even taking into account the (significant)
; overhead of saving the registers (each register saved is more than 1 MB/s off the rate)
AXIS_3C_I_64 MACRO offset, first, last
IF first
pxor xmm2,xmm2
ELSE
movdqa xmm7,[tmp_base_reg-offset] ; fetch two RGBs
movdqa xmm6,[tmp_base_reg-offset+16]
movdqa xmm8,[tmp_base_reg-offset+32]
movdqa xmm9,[tmp_base_reg-offset+48]
ENDIF
INPUT_SHIFT xmm7 ; Convert to 8.x fixed point. Note we don't need bit replication (avg of 16 255s is 255.0)
INPUT_SHIFT xmm6 ; This is a nop if the weight and avg fractional bits match
INPUT_SHIFT xmm8
INPUT_SHIFT xmm9
pxor xmm3,xmm3
movdqa xmm1,xmm3 ; move is cheaper than repeated xors
movdqa xmm10,xmm3
movdqa xmm11,xmm3
psubw xmm7,xmm0 ; subtract avg
psubw xmm6,xmm0
psubw xmm8,xmm0
psubw xmm9,xmm0
; IF last EQ 0 ; Oddly this appears to be slower on X64?
movdqa [tmp_base_reg-offset],xmm7 ; write RGB-avg back as it will be reused later
movdqa [tmp_base_reg-offset+16],xmm6
; ENDIF
movdqa [tmp_base_reg-offset+32],xmm8
movdqa [tmp_base_reg-offset+48],xmm9
pshuflw xmm5,xmm7,SHUFFLE_SELECT(2,0,1,3) ; R B G 0; lines up with B G R 0 to produce RB / BG / GR axis ordering info
pshuflw xmm4,xmm6,SHUFFLE_SELECT(2,0,1,3)
pshuflw xmm12,xmm8,SHUFFLE_SELECT(2,0,1,3)
pshuflw xmm13,xmm9,SHUFFLE_SELECT(2,0,1,3)
psubw xmm3,xmm7 ; -axis
psubw xmm1,xmm6
psubw xmm10,xmm8
psubw xmm11,xmm9
pshufhw xmm5,xmm5,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm4,xmm4,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm12,xmm12,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm13,xmm13,SHUFFLE_SELECT(2,0,1,3)
pmaxsw xmm3,xmm7 ; abs(axis)
pmaxsw xmm1,xmm6
pmaxsw xmm10,xmm8
pmaxsw xmm11,xmm9
psraw xmm5,16 ; state of sign bit
psraw xmm4,16
psraw xmm12,16
psraw xmm13,16
paddw xmm1,xmm3 ; accumulate axis
paddw xmm10,xmm11
paddw xmm1,xmm10
IF first EQ 0
paddw xmm1,[tmp_base_reg-TMP_AXIS]
ENDIF
pandn xmm5,xmm7
pandn xmm4,xmm6
pandn xmm12,xmm8
pandn xmm13,xmm9
IF last EQ 0
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDIF
paddw xmm5,xmm4
paddw xmm12,xmm13

paddw xmm2,xmm5 ; accumulate order
paddw xmm2,xmm12
ENDM ; AXIS_3C_I_64

ELSE ; X64

AXIS_3C_I MACRO offset, first, last
IF first
pxor xmm2,xmm2
ELSE
movdqa xmm7,[tmp_base_reg-offset] ; fetch two RGBs
movdqa xmm6,[tmp_base_reg-offset+16]
ENDIF
INPUT_SHIFT xmm7 ; Convert to 8.x fixed point. Note we don't need bit replication (avg of 16 255s is 255.0)
INPUT_SHIFT xmm6 ; This is a nop if the weight and avg fractional bits match
pxor xmm3,xmm3
pxor xmm1,xmm1
psubw xmm7,xmm0 ; subtract avg
psubw xmm6,xmm0
IF last EQ 0
movdqa [tmp_base_reg-offset],xmm7 ; write RGB-avg back as it will be reused later
movdqa [tmp_base_reg-offset+16],xmm6
ENDIF
pshuflw xmm5,xmm7,SHUFFLE_SELECT(2,0,1,3) ; R B G 0; lines up with B G R 0 to produce RB / BG / GR axis ordering info
pshuflw xmm4,xmm6,SHUFFLE_SELECT(2,0,1,3)
psubw xmm3,xmm7 ; -axis
psubw xmm1,xmm6
pshufhw xmm5,xmm5,SHUFFLE_SELECT(2,0,1,3)
pshufhw xmm4,xmm4,SHUFFLE_SELECT(2,0,1,3)
pmaxsw xmm3,xmm7 ; abs(axis)
pmaxsw xmm1,xmm6
psraw xmm5,16 ; state of sign bit
psraw xmm4,16
paddw xmm1,xmm3 ; accumulate axis
IF first EQ 0
paddw xmm1,[tmp_base_reg-TMP_AXIS]
ENDIF
pandn xmm5,xmm7
pandn xmm4,xmm6
IF last EQ 0
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDIF
paddw xmm2,xmm5 ; accumulate order
paddw xmm2,xmm4
ENDM ; AXIS_3C_I

ENDIF


; Calculate the axis vector
AXIS_3COMPONENT MACRO no_axis
; Expects: TMP_TMP to have expanded RGB values
; xmm0 is the average in the form R G B 0 R G B 0
; Is expected to: set TMP_TMP to RGB-avg
; set xmm1 to the axis (also stored in TMP_AXIS)

; G is the priority axis and the axis ordering info is set accordingly. The method used
; is far from perfect, but suffices for most cases.
IF X64
AXIS_3C_I_64 TMP_TMP+64, 1, 0
AXIS_3C_I_64 TMP_TMP , 0, 1
ELSE
AXIS_3C_I TMP_TMP+96, 1, 0
AXIS_3C_I TMP_TMP+32, 0, 0
AXIS_3C_I TMP_TMP+64, 0, 0
AXIS_3C_I TMP_TMP , 0, 1
ENDIF

; parallel add xmm1 and xmm2 to get final absolute axis info
pshufd xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1)
pshufd xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1)
pxor xmm5,xmm5
paddw xmm1,xmm3 ; Final summed absolute axis
paddw xmm2,xmm4 ; Final summed pos
movdqa xmm0,xmm5

; axis is in 8.8 fixed point - it needs normalisation and ordering (the signs set correctly)
; By default, G is the major axis, and BG and RG pos define the orders of the B and R axes
; If G isn't the major axis (i.e. it is 0) R becomes the major axis and RB pos defines the
; order of the B axis. If RB_pos is also 0, B is the major axis and is positive.

; If the axis is 0 we have a constant-colour block - we must catch this here (division by
; zero in normalisation otherwise)

pcmpeqw xmm5,xmm1
pmovmskb eax,xmm2 ; eax is the sign-flip bitvector (axis_neg)
pcmpeqw xmm2,xmm0

pmovmskb ecx,xmm5 ; ecx is the axis equals 0 bitvector
pmovmskb efx,xmm2 ; efx is the order equals 0 bitvector (axis_order_zero)

; Finish up the no-axis check.
and ecx,02ah
cmp ecx,02ah
je no_axis

; We need to normalise the axis below, and negation is easier in float, so do it there
punpcklwd xmm1,xmm0 ; The axis is always positive so we can unpack with 0

; Axis ordering: we have put two bitvectors into eax and efx.
; The bits are 1 RB, 3 GB, 5 GR - 0,2,4,6+ are junk, so we mask them off
; 5 values in here affect the axis ordering:
; If GR and GB == 0, then we need to apply RB's sign to B
; Otherwise, we apply GR to R and GB to B
; See the AXIS_ORDER_ENTRY macro for how we create the 512-byte table of negation bits
and eax,02ah
and efx,028h ; We don't need rb of axis_zero


; Promote the axis to float
cvtdq2ps xmm2,xmm1
xorps xmm5,xmm5

; We can do the normalisation and set the axis signs in parallel
movaps xmm1,xmm2
mulps xmm2,xmm2

shl eax,3 ; x8, with empty lower bit, is a 16-byte aligned pointer to the 32 entry table
shl efx,2 ; One shift less to slot into the gaps

; 3D parallel add
movaps xmm4,xmm2
movhlps xmm0,xmm2 ; note that xmm0 is still 0 from the pxor above which cleans things up a little
shufps xmm2,xmm2,SHUFFLE_SELECT(1,1,1,1)
addss xmm4,xmm0
or eax,efx ; bits low to high: 0,0,0,0, rb_neg, gb_zero, gb_neg, gr_zero, gr_neg
addss xmm2,xmm4
IF X64
lea rfx,axis_order_table
ENDIF

; low of xmm2 is the DP result
; We know that this cannot be 0 in the int implementation
; - the axis was known to be nonzero on at least one component
; - this is known to be representable exactly in float as it's less than 24 bits in magnitude
; - the square cannot be small, exponent is positive and it also gives positive results in each component
; - they cannot therefore sum to 0
rsqrtss xmm2, xmm2 ; No need for Newton-Raphson, ~15 bits of precision is fine

; Apply the axis ordering; we also need the result in the correct 1.15 format when we
; send it back, so this scaling factor is baked into the axis order table
IF X64
mulps xmm1, [rfx + rax]
ELSE
mulps xmm1, [axis_order_table + eax]
ENDIF

shufps xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0)

; Normalise, apply axis order, and scale to 1.15
mulps xmm2, xmm1

; Get it back into int
cvtps2dq xmm1,xmm2
packssdw xmm1,xmm1 ; This duplicates as well, which is what we want
movdqa [tmp_base_reg-TMP_AXIS], xmm1
ENDM ; AXIS_3COMPONENT



POS_3C_8_VALUES MACRO offset, first
movdqa xmm0,xmm1 ; Copy across the axis. It's faster to copy then multiply than to load the axis each time; one reason
movdqa xmm2,xmm1 ; is likely that the maddwd issues at 1/clock at best while movdqa can
movdqa xmm3,xmm1 ; parallelise up to 3 per clock, so there's more breathing room for the loads

IF first AND (X64 EQ 0) ; This optimisation appears to be oddly slower on X64
pmaddwd xmm0,xmm7
pmaddwd xmm1,xmm6
ELSE
pmaddwd xmm0,[tmp_base_reg-offset]
pmaddwd xmm1,[tmp_base_reg-offset+16]
ENDIF
pmaddwd xmm2,[tmp_base_reg-offset+32]
pmaddwd xmm3,[tmp_base_reg-offset+48]

movdqa xmm4,xmm0 ; We could do this with shift or pshufd. Shift has lower code density
movdqa xmm5,xmm1 ; but is fast on more CPUs, and pshufd shows little if any gain
movdqa xmm6,xmm2
movdqa xmm7,xmm3
psllq xmm0,32
psllq xmm1,32
psllq xmm2,32
psllq xmm3,32

paddd xmm0,xmm4 ; We now have 2 results in dwords 1 and 3 of the XMM. This is important below...
paddd xmm1,xmm5 ; The result is in 9.(15+AVG_FRAC_BITS) format
paddd xmm2,xmm6
paddd xmm3,xmm7
psrad xmm0,10+AVG_FRAC_BITS ; Scale to the desired 9.5 format
psrad xmm1,10+AVG_FRAC_BITS ; (9.5 is convenient because it becomes 8.4 when multiplied by axis)
psrad xmm2,10+AVG_FRAC_BITS
psrad xmm3,10+AVG_FRAC_BITS
packssdw xmm0,xmm1 ; Pack so we now have 4 word results in words 1, 3, 5 and 7
packssdw xmm2,xmm3

IF first
movdqa xmm1, [tmp_base_reg-TMP_AXIS] ; Reload the axis we corrupted above
ENDIF

psrad xmm0,16 ; Because we cunningly arranged the results to be in the high word of
psrad xmm2,16 ; each dword, a sign-preserving shift puts it right for the final pack

packssdw xmm0,xmm2 ; 8 results in xmm0
ENDM ; POS_3C_8_VALUES
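
; Per texel, the above computes (a scalar C sketch, illustrative only):
;
;   /* d = P - avg in 8.AVG_FRAC_BITS, axis in 1.15; pmaddwd forms R+G and B+0
;      partial products, and the qword shift/add merges them into the full dot */
;   int   dot = d.r * axis.r + d.g * axis.g + d.b * axis.b; /* 9.(15+AVG_FRAC_BITS) */
;   short pos = (short)(dot >> (10 + AVG_FRAC_BITS));       /* 9.5 result           */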



; These should be reusable whatever the axis and average format are
POS_MINMAX MACRO
; using the xmm0 and xmm2 results we calculate a final minmax
movdqa xmm3,xmm2
pmaxsw xmm2,xmm0
pminsw xmm3,xmm0
pshufd xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1) ; dword halves
pshufd xmm5,xmm3,SHUFFLE_SELECT(2,3,0,1)
pmaxsw xmm2,xmm4
pminsw xmm3,xmm5
pshuflw xmm4,xmm2,SHUFFLE_SELECT(2,3,0,1) ; word halves
pshuflw xmm5,xmm3,SHUFFLE_SELECT(2,3,0,1)
pmaxsw xmm2,xmm4 ; reduce max
pminsw xmm3,xmm5 ; reduce min
pshuflw xmm4,xmm2,SHUFFLE_SELECT(1,0,1,0) ; word quarters
pshuflw xmm5,xmm3,SHUFFLE_SELECT(1,0,1,0)
pmaxsw xmm2,xmm4 ; Final max
pminsw xmm3,xmm5 ; Final min

; if min == max == 0 then we have a single-colour block. This shouldn't be able to
; happen, as these should be caught when there is no axis instead.
ENDM
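
; The shuffles above are just a log2 reduction; in scalar terms (a C sketch,
; illustrative only):
;
;   short mx = pos[0], mn = pos[0];
;   for (int i = 1; i < 16; i++) {
;       if (pos[i] > mx) mx = pos[i];
;       if (pos[i] < mn) mn = pos[i];
;   }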

POS_CENTRE MACRO
; Calculate centre
; Centre = (A+B)/2 (and then min = -max)

; Replicate low and high halves; note shuffles above leave halves already the same
pshufd xmm7,xmm2,SHUFFLE_SELECT(0,0,0,0) ; max
pshufd xmm2,xmm3,SHUFFLE_SELECT(0,0,0,0)

paddw xmm2,xmm7
IF PROG_THRESHOLD
movdqa xmm4, xmmword ptr [prog_threshold_size]
ENDIF
psraw xmm2,1 ; xmm2 = offset; arithmetic shift to preserve sign

psubw xmm7,xmm2 ; offset max to new centre

; save offset centre (9.5 format) for later correction of refined X
movdqa [tmp_base_reg-TMP_CENTRE], xmm2
ENDM

POS_OFFSET MACRO reread
; Offset points to centre, forming array of abs(P-centre) and centre-side cluster bits
IF reread
movdqa xmm0,[tmp_base_reg-TMP_POS]
movdqa xmm1,[tmp_base_reg-TMP_POS+16]
ENDIF
IF PROG_THRESHOLD
pcmpgtw xmm4,xmm7
ENDIF
psubw xmm0,xmm2
psubw xmm1,xmm2
pxor xmm2,xmm2
pxor xmm3,xmm3
pmovmskb eax,xmm0 ; Save clustering
pmovmskb efx,xmm1
psubw xmm2,xmm0
psubw xmm3,xmm1
shl efx,15
shr eax,1
or eax,efx
and eax,055555555h
mov [tmp_base_reg-TMP_CLUSTER],eax

pmaxsw xmm0,xmm2 ; abs(P-centre)
pmaxsw xmm1,xmm3
IF NO_PROG EQ 0
movdqa [tmp_base_reg-TMP_POS],xmm0
movdqa [tmp_base_reg-TMP_POS+16],xmm1
ENDIF
IF PROG_THRESHOLD
pmovmskb ecx,xmm4
ENDIF

ENDM



POS_3COMPONENT MACRO no_prog
; Expects: TMP_TMP to have expanded RGB-average values
; xmm1 == axis in form R G B 0 R G B 0
; Is expected to: put abs(P-centre) in TMP_POS
; put 'centre side' info in TMP_CLUSTER_CENTRE

; We need to generate 16 values, but we don't have enough XMMs, so we do two halves
POS_3C_8_VALUES TMP_TMP, 1
movdqa [tmp_base_reg-TMP_POS],xmm0
POS_3C_8_VALUES (TMP_TMP-64), 0
movdqa [tmp_base_reg-TMP_POS+16],xmm0

movdqa xmm2,[tmp_base_reg-TMP_POS]
POS_MINMAX
POS_CENTRE
POS_OFFSET 1

ENDM ; POS_3COMPONENT


NOPROG_CLUSTER MACRO
movdqa xmm6,xmm7
paddw xmm6,xmm6
pmulhw xmm6, xmmword ptr [scale_one_third]

psubw xmm1,xmm6
psubw xmm0,xmm6 ; abs(P-centre) - 2/3x

pmovmskb efx,xmm1
pmovmskb eax,xmm0 ; 4-block clustering

; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDM


; Attempt a (simple) progressive refinement step to reduce noise in the
; output image by trying to find a better overall match for the endpoints
; than the first-guess solution (the extremities of the input signal)

; The method is to move the endpoints inwards until a local MSE minimum is found.
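
; In scalar terms the loop below is roughly (a C sketch, illustrative only;
; the SIMD code also extracts the clustering bits from the same subtractions):
;
;   int best_err = INT_MAX, x = max_pos, best_x = x;   /* all 9.5 fixed point */
;   for (;;) {
;       int err = 0;
;       for (int i = 0; i < 16; i++) {
;           /* squared distance from p[i] = abs(P - centre) to the nearer of
;              the two palette points x/3 and x on this side of the centre   */
;           int e = abs(p[i] - 2*x/3) - x/3;
;           err += e * e;
;       }
;       if (err >= best_err) break;                    /* local minimum found */
;       best_err = err; best_x = x;
;       x -= step;
;   }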

PROG MACRO
LOCAL next_refinement_loop
LOCAL refinement_done
LOCAL no_prog
LOCAL doprog
LOCAL go_around

; Expects: xmm7 (all words) is the initial max value
; TMP_POS has been set up with the array of 16 words

IF COUNT_PROG_STEPS
xor eax,eax
ENDIF

IF PROG_THRESHOLD
; If we're below the prog threshold, we can use the no-refinement clustering
; (which doesn't have to calculate MSE)
test ecx,2
je doprog
NOPROG_CLUSTER
jmp no_prog
doprog:
ENDIF

movdqa [tmp_base_reg-TMP_BEST],xmm7
movq mm1,mmword ptr [max_sint32] ; Initialise max error; scalar, leverage the MMX unit

next_refinement_loop:
movdqa xmm6,xmm7 ; Save the current X (since we corrupt it)

pmulhw xmm7, xmmword ptr [scale_one_third]

; Calculate E4 (4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)
pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

; jmp refinement_done

psubw xmm2,xmm0
psubw xmm3,xmm1
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)

psubw xmm0,xmm7 ; abs(abs(P-centre)-2/3x) - 1/3x
psubw xmm1,xmm7

pmaddwd xmm0,xmm0 ; 4 mean-square-error values, doing part of the parallel add
pmaddwd xmm1,xmm1

paddd xmm0,xmm1

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; This is a big cost
paddd xmm0,xmm1

; Move to MMX for this last bit; faster on all but the most recent CPUs
movdq2q mm0,xmm0
movdq2q mm2,xmm0
psrlq mm0,32
paddd mm0,mm2

; Compare E4 with current minimum error and choose result
pcmpgtd mm1,mm0

pmovmskb ecx,mm1

; Pause here while that result becomes available, so read these up for go-around
movdqa xmm0,[tmp_base_reg-TMP_POS] ; Read up the source values of abs(P-centre)
movdqa xmm1,[tmp_base_reg-TMP_POS+16]

test ecx,8
jz refinement_done

movq mm1,mm0 ; Save the better max error

; Go around
go_around:
movdqa xmm7,xmm6
movdqa [tmp_base_reg-TMP_BEST],xmm6

IF COUNT_PROG_STEPS
add eax,1
ELSE
; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDIF

; Fixed step size, tunable
psubw xmm7,[stepsize]

; Used to check for X going negative here, but stopped that because I think it's
; impossible for the error still to be improving by that point, and the
; movmsk/test combination is expensive.
jmp next_refinement_loop


refinement_done:
movdqa xmm7,[tmp_base_reg-TMP_BEST]
no_prog:

ENDM ; PROG


PROG_34 MACRO
LOCAL next_refinement_loop
LOCAL refinement_done
LOCAL no_prog
LOCAL doprog
LOCAL go_around
LOCAL e3_best
LOCAL e4_good

; Expects: xmm7 (all words) is the initial max value
; TMP_POS has been set up with the array of 16 words

IF COUNT_PROG_STEPS
xor eax,eax
ENDIF

IF PROG_THRESHOLD
; If we're below the prog threshold, we can use the no-refinement clustering
; XXX - this means that all threshold blocks get 4-colour clustering
test ecx,2
je doprog
NOPROG_CLUSTER
xor ecx,ecx ; 4-colour block
jmp no_prog
doprog:
ENDIF

movdqa [tmp_base_reg-TMP_BEST],xmm7
movq mm1,mmword ptr [max_sint32] ; Initialise max error; scalar, leverage the MMX unit

next_refinement_loop:
movdqa [tmp_base_reg-TMP_CURRENT],xmm7
movdqa xmm6,xmm7
pmulhw xmm7, xmmword ptr [scale_one_third]
psrlw xmm6,1 ; 1/2 x


; Calculate E3 and E4 (3 and 4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)

movdqa xmm4,xmm0
movdqa xmm5,xmm1

IF X64
; This X64 path is a surprisingly marginal gain, probably because we can get decent
; loop-to-loop parallelism here
pxor xmm2,xmm2
movdqa xmm3,xmm2
movdqa xmm8,xmm2
movdqa xmm9,xmm2
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm4,xmm6 ; abs(P-centre) - 1/2x
psubw xmm5,xmm6
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb egx,xmm4 ; 3-block clustering
pmovmskb ehx,xmm5
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

psubw xmm2,xmm0
psubw xmm3,xmm1
psubw xmm8,xmm4
psubw xmm9,xmm5
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)
pmaxsw xmm4,xmm8
pmaxsw xmm5,xmm9 ; abs(abs(P-centre)-1/2x)
ELSE
; Calculate E4 (4-colour block MSE)
; xmm0 and 1 are already abs(P-centre)
pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm0,xmm7 ; Since it all parallelises nicely, it's faster to subtract twice (and uses a register less)
psubw xmm1,xmm7
psubw xmm0,xmm7 ; abs(P-centre) - 2/3x
psubw xmm1,xmm7
IF COUNT_PROG_STEPS EQ 0
pmovmskb eax,xmm0 ; 4-block clustering
pmovmskb efx,xmm1
ENDIF

psubw xmm4,xmm6 ; abs(P-centre) - 1/2x
psubw xmm5,xmm6
pmovmskb egx,xmm4 ; 3-block clustering
pmovmskb ehx,xmm5

psubw xmm2,xmm0
psubw xmm3,xmm1
pmaxsw xmm0,xmm2
pmaxsw xmm1,xmm3 ; abs(abs(P-centre)-2/3x)

pxor xmm2,xmm2
pxor xmm3,xmm3
psubw xmm2,xmm4
psubw xmm3,xmm5
pmaxsw xmm4,xmm2
pmaxsw xmm5,xmm3 ; abs(abs(P-centre)-1/2x)
ENDIF

; jmp refinement_done

psubw xmm0,xmm7 ; abs(abs(P-centre)-2/3x) - 1/3x
psubw xmm1,xmm7
psubw xmm4,xmm6 ; abs(abs(P-centre)-1/2x) - 1/2x
psubw xmm5,xmm6

pmaddwd xmm0,xmm0 ; 4 mean-square-error values, doing part of the parallel add
pmaddwd xmm1,xmm1
pmaddwd xmm4,xmm4
pmaddwd xmm5,xmm5

paddd xmm0,xmm1
paddd xmm4,xmm5

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; This is a big cost
pshufd xmm5,xmm4,SHUFFLE_SELECT(2,3,0,1)
paddd xmm0,xmm1
paddd xmm4,xmm5

; Move to MMX for this last bit; faster on all but the most recent CPUs
movdq2q mm0,xmm0
movdq2q mm4,xmm4
movdq2q mm2,xmm0
movdq2q mm6,xmm4
psrlq mm0,32
psrlq mm4,32
paddd mm0,mm2
paddd mm4,mm6

; Compare E4 (and E3 if present) with current minimum error and choose result
movq mm5,mm1
pcmpgtd mm1,mm0

pmovmskb ecx,mm1

; Pause here while that result becomes available, so read these up for go-around
movdqa xmm0,[tmp_base_reg-TMP_POS] ; Read up the source values of abs(P-centre)
movdqa xmm1,[tmp_base_reg-TMP_POS+16]

test ecx,8
jnz e4_good
; e4 is not better, is e3?
pcmpgtd mm5,mm4
pmovmskb ecx,mm5
test ecx,8
jz refinement_done
; e3 is the new best
e3_best:
movq mm1,mm4
mov ecx,0ffffffffh ; 3-colour block
mov eax,egx
mov efx,ehx
jmp go_around

e4_good:
; is e3 better than e4?
movq mm5,mm0
pcmpgtd mm5,mm4
pmovmskb ecx,mm5
test ecx,8
jnz e3_best
xor ecx,ecx ; 4-colour block

movq mm1,mm0 ; Save the better max error

; Go around
go_around:
mov [tmp_base_reg-TMP_CLUSTER+8],ecx
movdqa xmm7,[tmp_base_reg-TMP_CURRENT]
movdqa [tmp_base_reg-TMP_BEST],xmm7

IF COUNT_PROG_STEPS
add eax,1
ELSE
; Save the split point clustering data
shl efx,16
or eax,efx
and eax,0aaaaaaaah
mov [tmp_base_reg-TMP_CLUSTER+4],eax
ENDIF

; Fixed step size
psubw xmm7,[stepsize]

jmp next_refinement_loop


refinement_done:
movdqa xmm7,[tmp_base_reg-TMP_BEST]
no_prog:

ENDM ; PROG_34




COLOUR_AVERAGE MACRO output_reg, use_3component

; Output the average if there is no axis

; input is:
; average (8.4)

movq mm0,[tmp_base_reg-TMP_AVG]
IF AVG_FRAC_BITS LT 4
psllw mm0,4-AVG_FRAC_BITS ; Convert average to 8.4 format
ENDIF

IF WEIGHTING AND use_3component
pmullw mm0,[unweighting]
ENDIF
IF UNROUNDING
paddw mm0, mmword ptr [round_565]
movq mm3, mmword ptr [scale_to_round]
pmulhw mm3,mm0
pand mm3,mmword ptr [round_mask]
psubw mm0,mm3
ENDIF

pmulhw mm0,mmword ptr [scale_8_4_to_565] ; 0 R G B in 565 format
pminsw mm0,mmword ptr [clamp_565]
movq mm2,mm0
IF use_3component
movq mm4,mm0
pand mm0, [mask_third_word]
psrlq mm0,(32-11)
psrlq mm2,(16-5)
por mm2,mm4
ELSE
psrlq mm2,(16-5)
ENDIF
por mm0,mm2

; Duplicate
punpcklwd mm0,mm0
pand mm0,[mask_low_dword]
movq [output_reg],mm0
ENDM ; COLOUR_AVERAGE


COLOUR MACRO output_reg, use_3component
LOCAL noclamp
; input is:
; x (9.5, always > 0) in xmm7 (all words)
; centre (9.5)
; axis (1.15)
; average (8.AVG_FRAC_BITS)

IF COUNT_PROG_STEPS
sub eax,1
test eax,0ffffff8h
jz noclamp
mov eax,8
noclamp:
mov eax,[col_table+eax*4]
mov [edx],eax
mov dword ptr [edx+4],0
ELSEIF SHOW_BLOCK_TYPES
mov ecx,[col_table+4]
mov efx,[col_table+8]
mov eax,[tmp_base_reg-TMP_CLUSTER+8]
cmp eax,3
cmove ecx,efx
mov [edx],ecx
mov dword ptr [edx+4],0
ELSE


; pxor xmm7,xmm7 ; to force the refined X to 0

; This is mostly done in MMX code, which is faster on some machines and (slightly)
; slower only on very recent chips as there are no independent chains to
; execute in the gaps

; The two colours are avg +- ((x+-centre)*axis)
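; i.e., with x and centre in 9.5 and axis in 1.15 (a scalar sketch, illustrative only):
;
;   c0 = avg + (x + centre) * axis;   /* both products land in 8.4 */
;   c1 = avg - (x - centre) * axis;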
movq mm0,[tmp_base_reg-TMP_AVG]
IF AVG_FRAC_BITS LT 4
psllw mm0,4-AVG_FRAC_BITS ; Convert average to 8.4 format
ENDIF
movdq2q mm1,xmm7 ; 9.5 format x
movdq2q mm2,xmm7
movq mm7,[tmp_base_reg-TMP_CENTRE] ; 9.5 format centre offset
movq mm5,[tmp_base_reg-TMP_AXIS]
paddw mm1,mm7 ; 9.5 format x+-centre
psubw mm2,mm7
IF UNROUNDING
IF WEIGHTING AND use_3component
paddw mm0, mmword ptr [round_565_weighted]
ELSE
paddw mm0, mmword ptr [round_565] ; 8.4 format average
ENDIF
ENDIF
pmulhw mm1,mm5 ; 8.4 format (x+-centre)*axis
pmulhw mm2,mm5
pxor mm4,mm4
paddw mm1,mm0 ; avg + axis offset
psubw mm0,mm2 ; avg - axis offset
pmaxsw mm0,mm4 ; Clamp to positive range (can't use addusw, axis is signed)
pmaxsw mm1,mm4

IF WEIGHTING AND use_3component
pmullw mm0,mmword ptr [unweighting]
pmullw mm1,mmword ptr [unweighting]
ENDIF

IF UNROUNDING
; The (canonical) DXTC decompressor uses (should use) bit replication to generate
; 888 colour values from the 565 input. We therefore need to tweak our colours here
; to take account of this. The procedure is to add 0.5 and then subtract between
; 0 and 1 depending on the value of the input. We aren't in the 565 colourspace yet,
; still 8.4 fixed, but we can scale accordingly as long as we mask out the bits we don't want
; to contribute (consider 1F->FF, 1E->F7, 1D->EF, 1C->E7 but 1B->DE, we must only
; apply the top three bits in the unrounding process for R/B and the top two for G).
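;
; The decoder-side replication being compensated for, in C (a sketch, not part
; of the build):
;
;   r8 = (r5 << 3) | (r5 >> 2);   /* 1Fh -> FFh, 1Bh -> DEh */
;   g8 = (g6 << 2) | (g6 >> 4);
;   b8 = (b5 << 3) | (b5 >> 2);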
movq mm3, mmword ptr [scale_to_round] ; This is 5 6 5 right shifts
movq mm4,mm0
movq mm5,mm1
movq mm6, mmword ptr [round_mask]
pmulhw mm4,mm3
pmulhw mm5,mm3
pand mm4,mm6 ; Truncate off bits below the threshold point at which they influence the result
pand mm5,mm6
psubw mm0,mm4
psubw mm1,mm5
ENDIF

movq mm3, mmword ptr [scale_8_4_to_565] ; Encodes the appropriate shifts
pmulhw mm0,mm3 ; 0 R G B in 565 format
pmulhw mm1,mm3

movq mm5,mmword ptr [clamp_565]
pminsw mm0,mm5
pminsw mm1,mm5

IF use_3component
pshufw mm4,mm0,SHUFFLE_SELECT(1,3,3,3)
pshufw mm5,mm1,SHUFFLE_SELECT(1,3,3,3)
pshufw mm2,mm0,SHUFFLE_SELECT(2,3,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(2,3,3,3)
psllw mm4,5
psllw mm5,5
psllw mm2,11
psllw mm3,11
por mm0,mm4
por mm1,mm5
ELSE
movq mm4,mmword ptr [mask_low_dword]
pand mm0,mm4
pand mm1,mm4
pshufw mm2,mm0,SHUFFLE_SELECT(1,3,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(1,3,3,3)
psllw mm2,5
psllw mm3,5
ENDIF
por mm0,mm2
por mm1,mm3

; mm0 and mm1 are c0 and c1

; Read up the cluster information we'll need
movd mm7,dword ptr [tmp_base_reg-TMP_CLUSTER] ; low/high (low bit)
movd mm6,dword ptr [tmp_base_reg-TMP_CLUSTER+4] ; endpoint/splitpoint (high bit)
IF USE_34
movd mm5,dword ptr [tmp_base_reg - TMP_CLUSTER+8] ; 3-colour flag
ELSE
movq mm5,[_0000000055555555]
ENDIF

; Compare and write in correct order

; rdx contains the destination DXTC block (dx == DXTC)

; We need to use dword compares - compares are signed and we want 32 result bits anyway
pshufw mm2,mm0,SHUFFLE_SELECT(0,3,3,3) ; Word 3 is zero, saves a read up cf. an AND
pshufw mm1,mm1,SHUFFLE_SELECT(0,3,3,3)
movq mm4,mm2

; Create the mask to say which way round the colours are
pcmpgtd mm2, mm1 ; mm2 is the swap mask

IF USE_34
pxor mm2,mm5 ; swap the swap flag
pand mm5,mm6 ; bits are set if 3-colour and endpoint/splitpoint bit is set
psrld mm5,1
pxor mm5, [_0000000055555555] ; Form mask
pand mm7,mm5 ; mask low/high
ENDIF

movq mm3,mm0
pand mm5,mm2 ; mm5 is the cluster swap bit pattern

pcmpeqd mm4, mm1 ; Set the equality flag for clustering info
pxor mm7,mm5 ; Apply the cluster swap or not

punpcklwd mm0, mm1 ; the two 565 colours in normal order
punpcklwd mm1, mm3 ; the two 565 colours in reversed order
por mm7,mm6 ; merge endpoint / splitpoint cluster with low/high cluster

pand mm0, mm2
pandn mm2, mm1

pandn mm4,mm7 ; apply zero mask flag to cluster bits

por mm0, mm2 ; one of the two colour sets as selected by the swap mask

punpckldq mm0,mm4 ; merge the colour and the cluster bits
movq [output_reg],mm0 ; and write

; 0 is the low word endpoint
; 1 is the high word endpoint
; 2 is the split point near 0
; 3 is the split point near 1

; For transparent blocks
; 0 is the low word endpoint
; 1 is the high word endpoint
; 2 is the interpolated point

ENDIF ; COUNT_PROG_STEPS

ENDM ; COLOUR



IF 0

AVERAGE_RGB_RB MACRO src_reg, stride_reg
movdqa xmm0, xmmword ptr [mask_rb]

; Read up all 16 values
; Use DQU since we can't be certain they're aligned
movdqu xmm4,[src_reg]
movdqu xmm5,[src_reg+stride_reg]
lea src_reg,[src_reg+2*stride_reg]
movdqu xmm6,[src_reg]
movdqu xmm7,[src_reg+stride_reg]

; RB is hugely convenient - we just need the AND and we have 4 (B R) pairs in each register
pand xmm4,xmm0
pand xmm5,xmm0
pand xmm6,xmm0
pand xmm7,xmm0

; We can keep the unpacked values in the scratch regs for now
movdqa xmm0,xmm4
movdqa xmm1,xmm5
paddw xmm0,xmm6
paddw xmm1,xmm7
paddw xmm0,xmm1

pshufd xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; dword halves
paddw xmm0,xmm1
pshuflw xmm1,xmm0,SHUFFLE_SELECT(2,3,0,1) ; word halves; do both sides, so we get the whole reg populated cheaper than a post shufd
pshufhw xmm1,xmm1,SHUFFLE_SELECT(2,3,0,1)
paddw xmm0,xmm1

; We now do a 'virtual divide' by 16 to generate the average, so
; xmm0 now has the average in all word pairs R,B in 8.4 fixed point format
movdqa [tmp_base_reg-TMP_AVG],xmm0
ENDM ; AVERAGE_RGB_RB

ACCUMULATE_AXIS_2C MACRO reg, lastreg, destination
psllw reg,4
pxor xmm4,xmm4
psubw reg,xmm0
movdqa destination,reg
pshuflw xmm2,reg,SHUFFLE_SELECT(1,0,3,2) ; reverse R and B for axis ordering calc
psubw xmm4,reg ; -axis
pshufhw xmm2,xmm2,SHUFFLE_SELECT(1,0,3,2)
pmaxsw xmm4,reg ; abs(axis)
psraw xmm2,16 ; state of sign bit
paddw xmm1,xmm4
pandn reg,xmm2
paddw reg,lastreg
ENDM

; We need to specify the reg to be used for indexing on the macro call (for easy portability to x64)
AXIS_2COMPONENT MACRO no_axis

; Expects: xmm4-7 have expanded RB values
; xmm0 is the average in the form R B R B R B R B
; Is expected to: set xmm1 to the absolute axis
; set xmm2 to the axis ordering info (for each axis pair sum of one value when the other value is positive or zero)

psllw xmm4,4
pxor xmm1,xmm1
psubw xmm4,xmm0
movdqa [tmp_base_reg-TMP_TMP],xmm4
pshuflw xmm3,xmm4,SHUFFLE_SELECT(1,0,3,2) ; reverse R and B for axis ordering calc
psubw xmm1,xmm4 ; -axis
pshufhw xmm3,xmm3,SHUFFLE_SELECT(1,0,3,2)
pmaxsw xmm1,xmm4 ; abs(axis)
psraw xmm3,16 ; state of sign bit
pandn xmm3,xmm4 ; pos

ACCUMULATE_AXIS_2C xmm5, xmm3, [tmp_base_reg-TMP_TMP+16]
ACCUMULATE_AXIS_2C xmm6, xmm5, [tmp_base_reg-TMP_TMP+32]
ACCUMULATE_AXIS_2C xmm7, xmm6, [tmp_base_reg-TMP_TMP+48]

; xmm7 is now the axis ordering info

; parallel add the 4 results in xmm1 and xmm7->xmm2 to get final absolute axis info
pshufd xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1) ; dword halves
pshufd xmm2,xmm7,SHUFFLE_SELECT(2,3,0,1)
paddw xmm1,xmm3
paddw xmm2,xmm7
pshuflw xmm3,xmm1,SHUFFLE_SELECT(2,3,0,1) ; word halves
pshuflw xmm7,xmm2,SHUFFLE_SELECT(2,3,0,1)
paddw xmm1,xmm3 ; Final summed absolute axis
paddw xmm2,xmm7 ; Final summed pos

; If the axis is 0 we have a constant-colour block - we must catch this here (division by zero otherwise)
pxor xmm3,xmm3
pcmpeqw xmm3,xmm1
pmovmskb ecx,xmm3
and ecx,0ah
cmp ecx,0ah
je no_axis
ENDM ; AXIS_2COMPONENT

AXIS_NORM_2COMPONENT MACRO
; Expects: xmm1 to be the axis and xmm2 to be the ordering info
; Is expected to: set TMP_TMP to RB-avg
; set xmm7 to the axis

; axis is in 8.8 fixed point - it needs normalisation and ordering (the signs set correctly)

; We need to normalise the axis below, and negation is easier in float, so promote to float
pand xmm2, xmmword ptr [dword_word_mask] ; We only need the B part of the sign to be meaningful (the R-B axis)
pxor xmm4,xmm4
punpcklwd xmm1,xmm4 ; The axis is always positive so we can unpack with 0
punpcklwd xmm4,xmm2 ; Only the sign matters

; Promote both to float
cvtdq2ps xmm7,xmm1
cvtdq2ps xmm5,xmm4

; We can start the normalisation while the axis signs are being set
movaps xmm1, xmm7
xorps xmm2,xmm2

; magnitude is a 2D dot product
mulps xmm7, xmm7

cmpltps xmm5,xmm2 ; Signs of the axis ordering

movaps xmm4,xmm7
shufps xmm7,xmm7,SHUFFLE_SELECT(1,0,1,0)
addss xmm7,xmm4

andps xmm5,[b_sign_bit]

; low of xmm7 is the DP result
; If this is 0 we haven't actually got an axis, and we can't rsq it,
; so mask the output to 0 in this case. This generates an acceptable result
; It may also be important that the top bits of the register used for the and stay as 0.
; Otherwise, it ands together two floating-point-messes...
cmpneqss xmm2,xmm7 ; xmm2 was still zero to this point

xorps xmm1,xmm5 ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope

; This skips the Newton-Raphson. It's half to 1% faster, and it seems to make very little difference to the
; int compressor (which is what I'd expect - we're only working at around 15 bits of precision anyway)
rsqrtss xmm7, xmm7
andps xmm7, xmm2 ; zero mask

; We also need the result in the correct 1.15 format when we send it back, so
; multiply that through here while we're waiting for the rcp to finish
mulps xmm1, xmmword ptr [scale_1_15]

shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
shufps xmm1,xmm1, SHUFFLE_SELECT(0,1,0,1)

; Normalise
mulps xmm7, xmm1

; Get it back into int
cvtps2dq xmm7,xmm7
movdqa xmm1,xmm7
packssdw xmm7,xmm7 ; This duplicates as well, which is what we want
movdqa [tmp_base_reg-TMP_AXIS], xmm7

ENDM ; AXIS_NORM_2COMPONENT



POS_2COMPONENT MACRO

; Expects: TMP_TMP to have expanded RGB-average values
; xmm7 == axis in form R G R G R G R G
; Is expected to: put abs(P-centre) in TMP_POS
; put 'centre side' info in TMP_CLUSTER_CENTRE

movdqa xmm0,[tmp_base_reg-TMP_TMP] ; Four (RB-avg) values in 9.4 (-255.0 to +255.0)
movdqa xmm1,[tmp_base_reg-TMP_TMP+16]
movdqa xmm2,[tmp_base_reg-TMP_TMP+32]
movdqa xmm3,[tmp_base_reg-TMP_TMP+48]
pmaddwd xmm0,xmm7 ; Multiply by axis and do 2D dot product: four 32-bit results in 9.19 format
pmaddwd xmm1,xmm7
pmaddwd xmm2,xmm7
pmaddwd xmm3,xmm7

psrad xmm0,10+AXIS_SHIFT_BITS ; Convert to desired 9.5 format (preserving sign)
psrad xmm1,10+AXIS_SHIFT_BITS
psrad xmm2,10+AXIS_SHIFT_BITS
psrad xmm3,10+AXIS_SHIFT_BITS

packssdw xmm0,xmm1 ; 8 results in xmm0
packssdw xmm2,xmm3

movdqa [tmp_base_reg-TMP_POS],xmm0
movdqa [tmp_base_reg-TMP_POS+16],xmm2

POS_MINMAX
POS_CENTRE
POS_OFFSET 1

ENDM ; POS_2COMPONENT

ENDIF


IF 0

DXTCV11CompressAlphaBlockSSE2 PROC

; The alpha compressor is somewhat simpler. There is no need to find an
; axis, and the min and max values serve to determine pos_minmax from
; which centre is generated.

; Progressive refinement is also less important (as there are more interpolated
; values available and it is more important that the endpoints be represented
; correctly)

movd mm4, dword ptr [ecx]
movd mm5, dword ptr [ecx+eax]
lea ecx,[ecx+2*eax]
movq mm0,mm4
movq mm1,mm4
movd mm6, dword ptr [ecx]
movd mm7, dword ptr [ecx+eax]
pmaxub mm0,mm5
pminub mm1,mm5
pmaxub mm0,mm6
pminub mm1,mm6
pmaxub mm0,mm7
pminub mm1,mm7

; Parallel minmax to finish up
pshufw mm2,mm0,SHUFFLE_SELECT(1,0,3,3)
pshufw mm3,mm1,SHUFFLE_SELECT(1,0,3,3)
pmaxub mm0,mm2
pminub mm1,mm3
movq mm2,mm0
movq mm3,mm1
psrlw mm0,8
psrlw mm1,8
pmaxub mm0,mm2
pminub mm1,mm3

; Promote to SSE2 and convert to words
punpckldq mm4,mm5
punpckldq mm6,mm7
pxor xmm5,xmm5
movq2dq xmm0,mm4
movq2dq xmm1,mm6
punpcklbw xmm0,xmm5
punpcklbw xmm1,xmm5

; Calculate centre
; Could use pavgb for this, but let's keep the precision
movq2dq xmm2,mm0
movq2dq xmm3,mm1
punpcklbw xmm2,xmm5
punpcklbw xmm3,xmm5
paddw xmm2,xmm3 ; 8.1 format centre
pshuflw xmm2,xmm2,SHUFFLE_SELECT(0,0,0,0)
pshufd xmm2,xmm2,SHUFFLE_SELECT(0,0,0,0)
movdqa [tmp_base_reg-TMP_CENTRE], xmm2 ; offset centre (9.5 format) for later correction of refined X

psllw xmm0,1 ; 8.1 format values
psllw xmm1,1

POS_OFFSET 0


; Select an endpoint which is exactly representable
; Any step can then be of 1.0


; Round all interpolated positions to integer values before calculating E8



; Given n, find the output value
; 0 2 3 4 5 6 7 1 is the output for given values of n

movdqa xmm1,[bytes_7]
pxor xmm2,xmm2
pcmpeqb xmm3,xmm3 ; all 1s

pcmpeqb xmm1,xmm0 ; mask set if 7
pcmpeqb xmm2,xmm0 ; mask set if 0
pxor xmm3,xmm1
pxor xmm3,xmm2 ; mask set if 1-6

pand xmm2,[bytes_1]
psubb xmm0,[bytes_1]
pand xmm3,xmm0
por xmm3,xmm2 ; final result in unpacked form

movdqa xmm0,xmm3
pand xmm3,[_07000700s] ; alternate mids
pand xmm0,[_00070007s] ; alternate lows
psrlw xmm3,5
por xmm0,xmm3 ; 8 sets of 6-bit pairs of results
movdqa xmm3,xmm0
pand xmm0,[_003f0000003f0000s] ; alternate mids
pand xmm3,[_0000003f0000003fs] ; alternate lows
psrld xmm0,12
por xmm0,xmm3 ; 4 sets of 12-bit 4 results

movdqa xmm3,xmm0
pand xmm0,[_00000fff00000000s] ; alternate mids
pand xmm3,[_0000000000000fffs] ; alternate lows
psrlq xmm0,20
por xmm0,xmm3 ; 2 sets of 24-bit 8 results

movdqa xmm3,xmm0
psrldq xmm0,10 ; 10-byte shift
por xmm3,xmm0 ; Result

; or-in endpoint values


DXTCV11CompressAlphaBlockSSE2 ENDP

ENDIF


;void __fastcall DXTCV11CompressExplicitAlphaBlockMMX(BYTE block_8[16], DWORD block_dxtc[2]);
IF X64
DXTCV11CompressExplicitAlphaBlockMMX PROC
movq mm0,[rcx]
movq mm1,[rcx+8]
ELSE
@DXTCV11CompressExplicitAlphaBlockMMX@8 PROC
movq mm0,[ecx]
movq mm1,[ecx+8]
ENDIF

; We have to adjust the values because of the derounding operation in the decode
; We need to add (7 - top nybble) to each byte
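;
; In scalar terms, per byte a (a C sketch, illustrative only):
;
;   int t = a >> 4;            /* current top nybble                          */
;   int n = (a + 7 - t) >> 4;  /* nybble whose replicated decode (n*17) is    */
;                              /* nearest to a; equivalent to (a + 8) / 17    */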
movq mm4,[_0707070707070707]
movq mm5,mm4
movq mm2,mm0
movq mm3,mm1
psrlq mm2,4
psrlq mm3,4
movq mm6,[_0f0f0f0f0f0f0f0f]
pand mm2,mm6
pand mm3,mm6
psubb mm4,mm2 ; This is a signed value
psubb mm5,mm3
paddusb mm0,mm4 ; ... which is added or subtracted using unsigned saturation on the result to clamp to 0/255
paddusb mm1,mm5

; We need to pack into a single 64-bit word, discarding the lower bits

movq mm2,mm0
movq mm3,mm1
psrlq mm0,4 ; mm0 has x7x6x5x4x3x2x1x0
psrlq mm1,4
psrlq mm2,8 ; mm2 has 0x7x6x5x4x3x2x1x
psrlq mm3,8

movq mm4, [_000f000f000f000f]
movq mm5, [_00f000f000f000f0]
pand mm0,mm4 ; ...6...4...2...0
pand mm1,mm4
pand mm2,mm5 ; ..7...5...3...1.
pand mm3,mm5
por mm0,mm2 ; ..76..54..32..10
por mm1,mm3

packuswb mm0,mm1 ; fedcba98 76543210

IF X64
movq [rdx],mm0
ELSE
movq [edx],mm0
ENDIF

emms
ret

IF X64
DXTCV11CompressExplicitAlphaBlockMMX ENDP
ELSE
@DXTCV11CompressExplicitAlphaBlockMMX@8 ENDP
ENDIF



main_proc_name PROC
; Fetch stride and source according to calling conventions
mov stride_reg,16 ; Packed data input at the moment
IF X64
; rcx and rdx are already the correct source and destination
ELSE
; Fetch stride and source according to calling conventions
mov source_reg,[esp+4]
mov dest_reg,[esp+8]
ENDIF
;lea source_reg, white
;lea source_reg, whiteblack
;lea source_reg, redblack
;lea source_reg, redsblack_prog
;lea source_reg, redsblack2
;lea source_reg, greenred

SAVE_REGS

AVERAGE_RGB source_reg, stride_reg
; jmp no_axis ; Insert this to test averaging

AXIS_3COMPONENT no_axis
POS_3COMPONENT
IF NO_PROG
NOPROG_CLUSTER
ELSEIF USE_34
PROG_34
ELSE
PROG
ENDIF
colour:
COLOUR dest_reg, 1

exit:
RESTORE_REGS
ret

no_axis:
COLOUR_AVERAGE dest_reg, 1
jmp exit

main_proc_name ENDP




IF 0
mov eax,16 ; Packed data input at the moment
mov ecx,[esp+4]
mov edx,[esp+8]
;lea ecx, bluered
;lea ecx,fade

SAVE_REGS

AVERAGE_RGB_RB ecx, eax
; jmp no_axis ; Insert this to test averaging

AXIS_2COMPONENT no_axis
AXIS_NORM_2COMPONENT
POS_2COMPONENT
PROG
COLOUR edx, 0

exit:
RESTORE_REGS
ret

no_axis:
COLOUR_AVERAGE edx, 0
jmp exit


ELSE
ENDIF



IF X64 EQ 0

; Prototype for fastcall with stride...
;void __fastcall DXTCV11CompressBlockSSE2Strided(DWORD *block_32, DWORD *block_dxtc, DWORD input_stride);
@DXTCV11CompressBlockSSE2Strided@12 PROC
; Fetch stride and source according to calling conventions
mov eax,[esp+4]
SAVE_REGS

AVERAGE_RGB ecx, eax
; jmp no_axis ; Insert this to test averaging

AXIS_3COMPONENT no_axis
POS_3COMPONENT
PROG
COLOUR edx, 1

exit:
RESTORE_REGS
ret 4 ; __fastcall: callee pops the single stack argument

no_axis:
COLOUR_AVERAGE edx, 1
jmp exit

@DXTCV11CompressBlockSSE2Strided@12 ENDP

ENDIF


END


; x64 uses register calling conventions
; The first four parameters are put in rcx, rdx, r8, r9 (floats would be in xmm0-3)
; rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above - all others must be saved

; void __cdecl DXTCCompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
; block_dxtc == rdx, how convenient!