TexConv/CMP_CompressonatorLib/DXTC/dxtc_v11_compress_64.asm

632 lines
21 KiB
NASM
Raw Normal View History

2020-07-31 11:31:32 +08:00
;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================
; Assembly-time feature switches, tested with IF inside DXTCV11CompressBlockSSE.
; A non-zero value assembles the corresponding section.
AXIS_MUNGE equ 1 ; gates the colourspace conversion on load and its inverse before output
PROGRESSIVE_REFINEMENT equ 1 ; gates the endpoint refinement loop (next_refinement_loop)
.DATA
ALIGN 16
; Constants and lookup tables referenced by this file; all are defined in another module.

; 128-bit float constants and masks
EXTERN zero : XMMWORD
EXTERN one : XMMWORD
EXTERN one_over_16 : XMMWORD
EXTERN one_over_16_x_255_zeros : XMMWORD
EXTERN clearsign : XMMWORD
EXTERN signbit : XMMWORD
EXTERN half : XMMWORD
EXTERN c255 : XMMWORD
EXTERN c3 : XMMWORD
EXTERN lots : XMMWORD
EXTERN minuslots : XMMWORD

; 64-bit (MMX) masks, clamps and rounding constants used when packing the colours
EXTERN mask_green : MMWORD
EXTERN mask_blue_red : MMWORD
EXTERN clamp_0 : MMWORD
EXTERN clamp_255 : MMWORD
EXTERN green_rounding : MMWORD
EXTERN blue_red_rounding: MMWORD

; Axis sign handling, refinement and clustering parameters
EXTERN rb_sign_bits : XMMWORD
EXTERN stepsize : XMMWORD
EXTERN onethird : XMMWORD
EXTERN maxerror_init : DWORD
EXTERN maxerror_epsilon : DWORD
EXTERN b_half : XMMWORD
EXTERN b_2x : XMMWORD
EXTERN invert : XMMWORD
EXTERN split_point : XMMWORD
EXTERN deviation_point : XMMWORD

; Byte lookup table indexed by the 4-bit compare masks in the final cluster loop
; (previously declared twice; one declaration is sufficient)
EXTERN expandtable : DWORD

EXTERN _0f0f0f0f0f0f0f0f : MMWORD
EXTERN _0707070707070707 : MMWORD
EXTERN _000f000f000f000f : MMWORD
EXTERN _00f000f000f000f0 : MMWORD
.CODE
;-------------------------------------------------------------------------------
; PARALLEL_ADD_3D reg, tmp1, tmp2
; Sums the three low float lanes of reg.
; Out:   lane 0 of reg = reg[1] + (reg[0] + reg[2])
; Clobb: tmp1, tmp2; lanes 1..3 of reg are left holding the original reg[1]
;-------------------------------------------------------------------------------
PARALLEL_ADD_3D MACRO reg, tmp1, tmp2
    movhlps tmp2, reg               ; tmp2[0] = reg[2]
    movaps  tmp1, reg               ; tmp1[0] = reg[0]
    shufps  reg, reg, 55h           ; broadcast reg[1] to every lane
    addss   tmp1, tmp2              ; tmp1[0] = reg[0] + reg[2]
    addss   reg, tmp1               ; reg[0]  = reg[1] + tmp1[0]
ENDM
;-------------------------------------------------------------------------------
; SHUFFLE_SELECT(a, b, c, d)
; Macro function that packs four 2-bit lane selectors into one immediate:
; a in bits 1:0, b in bits 3:2, c in bits 5:4, d in bits 7:6.
; If any selector is greater than 3 it raises .ERR and yields 0.
;-------------------------------------------------------------------------------
SHUFFLE_SELECT MACRO a,b,c,d
    LOCAL Imm8
    IF (a GT 3) OR (b GT 3) OR (c GT 3) OR (d GT 3)
        .ERR
        EXITM <0>
    ENDIF
    Imm8 = ((d SHL 6) OR (c SHL 4) OR (b SHL 2) OR (a))
    EXITM %Imm8
ENDM
;-------------------------------------------------------------------------------
; void DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
;
; ABI:   x64 register calling convention - the first four parameters are put in
;        rcx, rdx, r8, r9 (floats would be in xmm0-3). rax, r10, r11, xmm4 and
;        xmm5 are volatile in addition to the above - all others must be saved.
; In:    rcx = block_32   : 16 source pixels, read as 16 consecutive DWORDs
;        rdx = block_dxtc : output block; 8 bytes are written
;                           (one DWORD holding the two 16-bit colours, then
;                           4 bytes of selector bits)
; Out:   nothing in registers
; Saved: rsi, rdi, rbx, rbp (pushed); xmm6, xmm7, xmm8 (spilled to the scratch area)
; Clobb: rax, rcx, rdx, r8, xmm0-xmm5, mm0-mm7, flags
;
; Scratch storage is addressed downwards from rbp (rsp rounded down to 16 bytes).
; NOTE(review): rsp itself is never lowered, so every local lives below rsp. The
;   Microsoft x64 ABI has no red zone; this relies on nothing else touching that
;   region while the routine runs - confirm this is acceptable for all callers.
; NOTE(review): MMX registers are used and no EMMS is issued in this routine -
;   presumably the caller takes care of it; confirm.
;-------------------------------------------------------------------------------
DXTCV11CompressBlockSSE PROC
    push    rsi
    push    rdi
    push    rbx
    push    rbp

    ; Set up a 16-byte aligned storage space pointer
    mov     rbp, rsp
    and     rbp, NOT 0fh

SAVED_REGS      equ 3
TMP_REGSAVE     equ (SAVED_REGS*16)
    ; Any xmm regs over 5 need to be saved here as well.
    ; The save area is [rbp-TMP_REGSAVE, rbp). The slots are addressed upwards
    ; from rbp-TMP_REGSAVE so they stay clear of TMP_RGB, which occupies
    ; [rbp-TMP_RGB, rbp-TMP_REGSAVE). (Addressing them downwards placed the
    ; xmm7/xmm8 slots on top of the last two RGB entries, corrupting the
    ; restored values.)
    movaps  [rbp-TMP_REGSAVE+( 0*16)], xmm6
    movaps  [rbp-TMP_REGSAVE+( 1*16)], xmm7
    movaps  [rbp-TMP_REGSAVE+( 2*16)], xmm8

    ; Other locals (each name is the distance from rbp down to the start of the slot)
TMP_RGB         equ (TMP_REGSAVE+(16*16))       ; 16 x 4 floats
TMP_POS_ON_AXIS equ (TMP_RGB+(16*4))            ; 16 floats
TMP_CLUSTERPOS  equ (TMP_POS_ON_AXIS+16)        ; 4 floats
TMP_OLDLEFT     equ (TMP_CLUSTERPOS+16)
TMP_OLDRIGHT    equ (TMP_OLDLEFT+16)
TMP_RIGHTSAVE   equ (TMP_OLDRIGHT+16)
TMP_AXISSAVE    equ (TMP_RIGHTSAVE+16)
TMP_AVGSAVE     equ (TMP_AXISSAVE+16)
TMP_CLUSTERS    equ (TMP_AVGSAVE+(16*4))
TMP_MAXERROR    equ (TMP_CLUSTERS+16)
TMP_SWAP        equ (TMP_MAXERROR+4)

    ; Convert the pixel values to float and find their average position
    xorps   xmm0, xmm0                          ; average (r, g, b)

    ; The input data is in rcx
    lea     rdi, [rbp-TMP_RGB]
    mov     eax, 16
    xorps   xmm2, xmm2                          ; zero, used to widen bytes -> dwords
average_loop:
    movd    xmm7, dword ptr [rcx]
    punpcklbw xmm7, xmm2
    punpcklwd xmm7, xmm2
    cvtdq2ps xmm7, xmm7
    add     rcx, 4

IF AXIS_MUNGE
    ; colourspace conversion to increase weight of G at expense of B
    movaps  xmm6, xmm7
    shufps  xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1) ; G
    addss   xmm7, xmm6
    mulss   xmm7, dword ptr [b_half]
ENDIF

    movaps  [rdi], xmm7                         ; save the value off in the RGB float array
    add     rdi, 16                             ; full 64-bit add: a 32-bit add would truncate the pointer
    addps   xmm0, xmm7                          ; accumulate average
    sub     eax, 1
    jne     average_loop

    ; Compute average of the values
    mulps   xmm0, [one_over_16]

    ; For each component, reflect points about the average so all lie on the same side
    ; of the average, and compute the new average - this gives a second point that defines the axis
    ; To compute the sign of the axis sum the positive differences of G for each of R and B (the
    ; G axis is always positive in this implementation)
    xorps   xmm1, xmm1                          ; axis (r, g, b)
    xorps   xmm2, xmm2                          ; rg_pos, rb_pos, bg_pos
    lea     rdi, [rbp-TMP_RGB]
    mov     ecx, 16
find_axis_loop:
    movaps  xmm7, [rdi]                         ; R G B value
    add     rdi, 16
    subps   xmm7, xmm0                          ; subtract average - centred
    movaps  xmm6, xmm7
    movaps  xmm5, xmm7
    andps   xmm7, [clearsign]                   ; fabs (r, g, b)
    addps   xmm1, xmm7                          ; accumulate direction of axis
    shufps  xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) ; B R R 0
    shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) ; G B G 0
    cmpnltps xmm6, [zero]                       ; R/B > 0?
    andps   xmm6, xmm5                          ; insert the G or B value for those channels which are positive
    addps   xmm2, xmm6                          ; bg_pos rb_pos rg_pos
    sub     ecx, 1
    jne     find_axis_loop

    ; Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
    movaps  xmm5, xmm2                          ; duplicate the pos across these three
    movaps  xmm6, xmm2
    movaps  xmm7, xmm2
    shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) ; RB_pos 0 ->
    shufps  xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) ; RG_pos ->
    shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) ; BG_pos ->
    orps    xmm6, xmm7
    cmpneqps xmm6, [zero]                       ; so check for any non-zero in RG_pos or BG_pos
    andps   xmm2, xmm6                          ; keep the current values where RG_pos/BG_pos are usable
    xorps   xmm6, [invert]
    andps   xmm5, xmm6
    orps    xmm2, xmm5                          ; insert RB pos instead

    ; Change the sign of the R and B portions of the axis appropriately
    cmpltps xmm2, [zero]
    andps   xmm2, [rb_sign_bits]
    xorps   xmm1, xmm2                          ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope

    ; Axis projection and remapping
    ; Normalise the axis for simplicity of future calculation
    movaps  xmm7, xmm1
    mulps   xmm7, xmm7
    PARALLEL_ADD_3D xmm7, xmm6, xmm5
    ; low of xmm7 is the DP result
    ; If this is 0 we haven't actually got an axis, and we can't rsq it,
    ; so mask the output to 0 in this case. This generates an acceptable result
    movss   xmm2, xmm7
    cmpneqss xmm2, dword ptr [zero]

    ; RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
    ; and consistency improves with it in on certain images. It's not a large cost so leave it in.
    rsqrtss xmm3, xmm7
    andps   xmm3, xmm2
    movss   xmm2, xmm3
    mulss   xmm3, xmm7
    mulss   xmm3, xmm2
    mulss   xmm2, dword ptr [half]
    movss   xmm7, dword ptr [c3]
    subss   xmm7, xmm3
    mulss   xmm7, xmm2
    shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

    ; Normalise
    mulps   xmm1, xmm7

    ; Map the axis
    lea     rsi, [rbp-TMP_RGB]
    lea     rdi, [rbp-TMP_POS_ON_AXIS]
    mov     ecx, 16
    movaps  xmm2, [lots]                        ; left
    movaps  xmm3, [minuslots]                   ; right
map_axis_loop:
    movaps  xmm7, [rsi]
    subps   xmm7, xmm0                          ; subtract average
    ; dot product with axis
    mulps   xmm7, xmm1
    PARALLEL_ADD_3D xmm7, xmm6, xmm5
    ; xmm7 == pos_on_axis
    movss   dword ptr [rdi], xmm7
    add     rdi, 4
    minss   xmm2, xmm7                          ; calculate left
    maxss   xmm3, xmm7                          ; calculate right
    add     rsi, 16
    sub     ecx, 1
    jne     map_axis_loop

    ; We have a good axis and the basic information about how the points are mapped to it
    ; We need to calculate the endpoints - the initial guess is to use the extremities.
    ; Left and right are used across all the simds
    shufps  xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0) ; left
    shufps  xmm3, xmm3, SHUFFLE_SELECT(0, 0, 0, 0) ; right

    ; To simplify further calculations, we offset everything such that the axis centre is at 0
    ; Calculate centre
    movaps  xmm7, xmm2                          ; left
    addps   xmm7, xmm3                          ; right
    mulps   xmm7, [half]

    ; Offset all the axis positions to the centre
    lea     rdi, [rbp-TMP_POS_ON_AXIS]
    movaps  xmm5, [rdi]
    movaps  xmm6, [rdi+16]
    subps   xmm5, xmm7
    subps   xmm6, xmm7
    movaps  [rdi], xmm5
    movaps  [rdi+16], xmm6
    movaps  xmm5, [rdi+32]
    movaps  xmm6, [rdi+48]
    subps   xmm5, xmm7
    subps   xmm6, xmm7
    movaps  [rdi+32], xmm5
    movaps  [rdi+48], xmm6

    ; Offset left, right and average to centre
    subps   xmm2, xmm7                          ; left
    subps   xmm3, xmm7                          ; right
    mulps   xmm7, xmm1                          ; convert to rgb by multiplying by axis
    addps   xmm0, xmm7                          ; average

IF PROGRESSIVE_REFINEMENT
    ; Attempt a (simple) progressive refinement step to reduce noise in the
    ; output image by trying to find a better overall match for the endpoints
    ; than the first-guess solution found so far (which is just to take the ends).
    ; The method is to move the endpoints inwards until a local minimum is found.
    ; This provides quite a significant improvement in image quality.
    mov     eax, [maxerror_init]
    mov     [rbp-TMP_MAXERROR], eax
    movaps  [rbp-TMP_OLDLEFT], xmm2             ; XXX register space expansion
    movaps  [rbp-TMP_OLDRIGHT], xmm3            ; XXX register space expansion
    lea     rdi, [rbp-TMP_CLUSTERPOS]
next_refinement_loop:
    movaps  [rbp-TMP_RIGHTSAVE], xmm3           ; XXX register space expansion
    xorps   xmm8, xmm8                          ; Error - clear the error

    ; Update the array of cluster positions based on the new values of left and right
    movss   xmm7, xmm2                          ; left
    addss   xmm7, xmm3                          ; right
    mulss   xmm7, dword ptr [half]
    movss   dword ptr [rdi], xmm2               ; left
    movss   dword ptr [rdi+4], xmm3             ; right
    movss   xmm5, xmm3                          ; right
    movss   xmm6, xmm7
    subss   xmm5, xmm7                          ; right-centre
    movss   xmm4, xmm7
    movss   xmm3, xmm5
    mulss   xmm5, dword ptr [deviation_point]
    subss   xmm6, xmm5
    addss   xmm4, xmm5
    movss   dword ptr [rdi+8], xmm6
    movss   dword ptr [rdi+12], xmm4
    mulss   xmm3, dword ptr [split_point]

    ; Calculate the current error
    mov     ecx, 16
    lea     rsi, [rbp-TMP_POS_ON_AXIS]
next_builderror_loop:
    movss   xmm4, dword ptr [rsi]
    add     rsi, 4
    movaps  xmm5, xmm4
    movaps  xmm6, xmm4
    andps   xmm4, [clearsign]
    cmpltss xmm4, xmm3                          ; < division means 2
    cmpnltss xmm5, xmm7                         ; >= centre means add 1
    movmskps eax, xmm4
    movmskps ebx, xmm5
    lea     eax, [ebx+2*eax]                    ; rax == which cluster (top 32 bits are cleared by this)
    subss   xmm6, dword ptr [rdi+4*rax]         ; rdi = array of positions along axis of cluster
    mulss   xmm6, xmm6                          ; square to use MSE and (conveniently eliminate sign)
    addss   xmm8, xmm6                          ; Accumulate to the MSE
    sub     ecx, 1
    jne     next_builderror_loop

    movaps  xmm3, [rbp-TMP_RIGHTSAVE]           ; restore the right XXX register space expansion

    ; Test and update the maximum error
    movss   xmm5, dword ptr [rbp-TMP_MAXERROR]
    cmpltss xmm5, xmm8                          ; maxerror < error -> this step made things worse
    movmskps eax, xmm5
    test    eax, 1
    jnz     refinement_done
    ; Apply the epsilon to the accumulated error (xmm8) before it becomes the new
    ; threshold. It used to be subtracted from xmm6, a scratch register that is
    ; never read again, so the threshold was stored without the epsilon and the
    ; exit test above could never fire once the error stopped changing
    ; (e.g. when it is exactly zero).
    subss   xmm8, dword ptr [maxerror_epsilon]
    movss   dword ptr [rbp-TMP_MAXERROR], xmm8
    movaps  [rbp-TMP_OLDLEFT], xmm2             ; left XXX register space expansion
    movaps  [rbp-TMP_OLDRIGHT], xmm3            ; right XXX register space expansion

    ; step left and right in a bit
    mulps   xmm2, [stepsize]                    ; XXX promote to register to improve code density?
    mulps   xmm3, [stepsize]
    jmp     next_refinement_loop

refinement_done:
    movaps  xmm2, [rbp-TMP_OLDLEFT]             ; left
    movaps  xmm3, [rbp-TMP_OLDRIGHT]            ; right
ENDIF

    ; Endpoints and axis are now valid so we have all the information we need to compress the block
    ; Calculate the high and low output colour values
    ; Involved in this is a complex rounding procedure.
    ; A straight rounded average is not correct, as the decompressor unrounds by replicating
    ; the top bits to the bottom.
    ; In order to take account of this process, we don't just apply a straight rounding correction,
    ; but base our rounding on the input value (a straight rounding is actually pretty good in terms of
    ; error measure, but creates a visual colour and/or brightness shift relative to the original image)
    ; The method used here is to apply a centre-biased rounding dependent on the input value, which was
    ; (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
    ; the image.
    ; rgb = (average_rgb + (left|right)*v_rgb);
    movaps  xmm6, xmm2                          ; left
    movaps  xmm7, xmm3                          ; right
    mulps   xmm6, xmm1                          ; axis
    mulps   xmm7, xmm1
    addps   xmm6, xmm0                          ; average
    addps   xmm7, xmm0

IF AXIS_MUNGE
    ; Scale the B component, then subtract the green component resultant in each
    movaps  xmm4, xmm6
    movaps  xmm5, xmm7
    mulps   xmm6, [b_2x]                        ; XXX promote to register to improve code density?
    mulps   xmm7, [b_2x]
    shufps  xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
    shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
    subss   xmm6, xmm4
    subss   xmm7, xmm5
ENDIF

    ; Rearrange so B and R are in the same register half (they both use 5-bit rounding)
    shufps  xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) ; B R G
    shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

    ; Convert to integer (by truncation, as C code does)
    cvttps2pi mm0, xmm6
    cvttps2pi mm1, xmm7
    movhlps xmm6, xmm6
    movhlps xmm7, xmm7
    cvttps2pi mm2, xmm6
    cvttps2pi mm3, xmm7

    ; mm0/1 is blue/red, mm2/3 is green
    ; This isn't quite the same as the C algorithm, but should generate the same result
    ; if the input range is ensured to be 0-255
    ; This code could be heavily interleaved, but for P4 it's not worth the hassle - the
    ; P4 reordering range of 15 instructions will let it do the job for us
    ; XXX promote all these consts to registers to improve code density?
    pmaxsw  mm0, [clamp_0]                      ; Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
    pmaxsw  mm1, [clamp_0]
    pminsw  mm0, [clamp_255]
    pminsw  mm1, [clamp_255]
    movq    mm6, mm0
    movq    mm7, mm1
    paddd   mm0, [blue_red_rounding]
    paddd   mm1, [blue_red_rounding]
    psrld   mm6, 5
    psrld   mm7, 5
    psubd   mm0, mm6
    psubd   mm1, mm7
    ; No need to clamp here, with the input in 0-255 range it can never be outside at the end
    pand    mm0, [mask_blue_red]
    pand    mm1, [mask_blue_red]

    ; Separate out R and B as they will need separate shifts later
    pshufw  mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) ; extract R (this is an SSE, not MMX, instruction)
    pshufw  mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) ; also R

    pmaxsw  mm2, [clamp_0]
    pmaxsw  mm3, [clamp_0]
    pminsw  mm2, [clamp_255]
    pminsw  mm3, [clamp_255]
    movq    mm6, mm2
    movq    mm7, mm3
    paddd   mm2, [green_rounding]
    paddd   mm3, [green_rounding]
    psrld   mm6, 6
    psrld   mm7, 6
    psubd   mm2, mm6
    psubd   mm3, mm7
    pand    mm2, [mask_green]
    pand    mm3, [mask_green]

    ; Convert the 8-bit values to final RGB565 colours in mm0 and mm1
    psrld   mm0, 3
    psrld   mm1, 3
    pslld   mm4, 8
    pslld   mm5, 8
    pslld   mm2, 3
    pslld   mm3, 3
    por     mm0, mm4
    por     mm1, mm5
    por     mm0, mm2
    por     mm1, mm3

    ; mm0 and mm1 are c0 and c1
    ; Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
    ; rdx contains the destination DXTC block (dx == DXTC)
    pxor    mm5, mm5
    punpcklwd mm0, mm5                          ; unpack c0/c1 to DWORD's as pcmp is a signed comparison
    punpcklwd mm1, mm5
    movq    mm2, mm0
    movq    mm3, mm0
    movq    mm4, mm0
    pcmpgtd mm2, mm1
    pxor    mm2, qword ptr [invert]             ; Need less than, so flip the result
    movd    dword ptr [rbp-TMP_SWAP], mm2       ; Set the swap flag (used below) appropriately
    ; mm2 is the mask to indicate flipping is needed
    pcmpeqd mm4, mm1
    movd    ebx, mm4                            ; ebx is the equality flag, plenty of time for this slow move to resolve
    punpcklwd mm0, mm1                          ; 'normal' order
    punpcklwd mm1, mm3                          ; reversed order
    pand    mm1, mm2
    pandn   mm2, mm0
    por     mm1, mm2                            ; one of the two, selected by mm2
    movd    dword ptr [rdx], mm1                ; write the result

    ; Colour writes complete - do some housekeeping
    ; Clear the output bitmasks
    add     rdx, 4
    mov     dword ptr [rdx], 0

    ; If the values are equal, the bit selector is 0 because the two colours are
    ; the same (which implies transparent)
    ; This seems the easiest way to do it, and will only rarely break branch prediction on
    ; typical images.
    test    ebx, ebx
    jnz     all_done

IF 0
    ; Sanity check
    movzx   eax, word ptr [rdi-4]
    movzx   ebx, word ptr [rdi-2]
    cmp     eax, ebx
    jge     fine
    int     3
fine:
ENDIF

    ; Final clustering, creating the 2-bit values that define the output
    movaps  xmm7, xmm3                          ; right
    mulps   xmm3, [split_point]
    addps   xmm7, xmm2                          ; left
    mulps   xmm7, [half]                        ; centre (probably 0, but what the hell)
    lea     rsi, [rbp-TMP_POS_ON_AXIS]
    lea     rdi, [expandtable]
    mov     r8d, [rbp-TMP_SWAP]
    and     r8d, 0fh

    ; Main cluster loop
    movaps  xmm0, [clearsign]                   ; Promote this outside the loop to improve code density
    mov     ecx, 4
next_bit_loop:                                  ; Do 4 at once
    movaps  xmm4, [rsi]                         ; Read the four pos_on_axis entries
    add     rsi, 16                             ; full 64-bit add: a 32-bit add would truncate the pointer
    movaps  xmm5, xmm4
    andps   xmm4, xmm0                          ; Clear the sign bits in this copy
    cmpltps xmm4, xmm3                          ; < division means 2
    cmpnltps xmm5, xmm7                         ; >= centre means add 1
    movmskps eax, xmm4
    movmskps ebx, xmm5
    xor     ebx, r8d                            ; Swap the order if we exchanged the colours
    mov     al, byte ptr [rdi+rax+16]
    or      al, byte ptr [rdi+rbx]
    mov     byte ptr [rdx], al                  ; rdx == block_dxtc+1
    add     rdx, 1
    sub     ecx, 1
    jne     next_bit_loop

    ; Complete
all_done:
    ; Restore changed regs and exit (offsets must match the saves at the top)
    movaps  xmm6, [rbp-TMP_REGSAVE+( 0*16)]
    movaps  xmm7, [rbp-TMP_REGSAVE+( 1*16)]
    movaps  xmm8, [rbp-TMP_REGSAVE+( 2*16)]
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    ret
DXTCV11CompressBlockSSE ENDP
END