;===============================================================================
; Copyright (c) 2004-2006 ATI Technologies Inc.
;===============================================================================
;
; Assembler syntax: MASM x64 (PROC/ENDP, MACRO/EXITM, XMMWORD/MMWORD operands).

; Build-time switches. Each one gates an IF ... ENDIF region in the code below.
AXIS_MUNGE              equ 1       ; enable the G-weighted remap of lane 0 (B) before axis fitting
PROGRESSIVE_REFINEMENT  equ 1       ; enable the iterative endpoint refinement loop

.DATA

ALIGN 16

; All constants are defined in another module; only their names and operand
; sizes are declared here.

; 128-bit constants used by the SSE code
EXTERN zero                     : XMMWORD
EXTERN one                      : XMMWORD
EXTERN one_over_16              : XMMWORD
EXTERN one_over_16_x_255_zeros  : XMMWORD
EXTERN clearsign                : XMMWORD
EXTERN signbit                  : XMMWORD
EXTERN half                     : XMMWORD
EXTERN c255                     : XMMWORD
EXTERN c3                       : XMMWORD
EXTERN lots                     : XMMWORD
EXTERN minuslots                : XMMWORD

; 64-bit constants used by the MMX colour packing code
EXTERN mask_green               : MMWORD
EXTERN mask_blue_red            : MMWORD
EXTERN clamp_0                  : MMWORD
EXTERN clamp_255                : MMWORD
EXTERN green_rounding           : MMWORD
EXTERN blue_red_rounding        : MMWORD

EXTERN rb_sign_bits             : XMMWORD

EXTERN stepsize                 : XMMWORD
EXTERN onethird                 : XMMWORD

EXTERN maxerror_init            : DWORD
EXTERN maxerror_epsilon         : DWORD

EXTERN b_half                   : XMMWORD
EXTERN b_2x                     : XMMWORD

EXTERN invert                   : XMMWORD

EXTERN split_point              : XMMWORD
EXTERN deviation_point          : XMMWORD

; Byte lookup table indexed with MOVMSKPS results in the final clustering loop.
; (The original listing declared this symbol twice; one declaration is enough.)
EXTERN expandtable              : DWORD

EXTERN _0f0f0f0f0f0f0f0f        : MMWORD
EXTERN _0707070707070707        : MMWORD
EXTERN _000f000f000f000f        : MMWORD
EXTERN _00f000f000f000f0        : MMWORD

.CODE

;-------------------------------------------------------------------------------
; PARALLEL_ADD_3D reg, tmp1, tmp2
;
; Sums lanes 0, 1 and 2 of the packed-single register 'reg' and leaves the sum
; in lane 0 of 'reg'.
;
; In:    reg  = (x, y, z, w)
; Out:   reg  = (x+y+z, y, y, y)
; Clobb: tmp1 = (x+z, y, z, w)
;        tmp2 low two lanes = (z, w); its upper two lanes are left untouched
;
; reg, tmp1 and tmp2 must be three different XMM registers.
;-------------------------------------------------------------------------------
PARALLEL_ADD_3D MACRO reg, tmp1, tmp2
    movaps      tmp1, reg           ; tmp1 lane 0 = x
    movhlps     tmp2, reg           ; tmp2 lane 0 = z
    shufps      reg, reg, 055h      ; broadcast y to every lane of reg
    addss       tmp1, tmp2          ; tmp1 lane 0 = x + z
    addss       reg, tmp1           ; reg  lane 0 = y + (x + z)
ENDM

;-------------------------------------------------------------------------------
; SHUFFLE_SELECT(a, b, c, d)
;
; Macro function that builds the 8-bit immediate for SHUFPS / PSHUFW from four
; 2-bit lane selectors: a -> bits 1:0, b -> bits 3:2, c -> bits 5:4,
; d -> bits 7:6.
;
; Raises an assembly error (.ERR) and yields 0 if any selector is greater
; than 3. Arguments are parenthesised so that expression arguments cannot
; mis-bind with the gt / SHL operators.
;-------------------------------------------------------------------------------
SHUFFLE_SELECT MACRO a,b,c,d
    LOCAL Value
    IF ((a) gt 3) or ((b) gt 3) or ((c) gt 3) or ((d) gt 3)
        .ERR
        EXITM <0>
    ENDIF
    Value = ((a) OR ((b) SHL 2) OR ((c) SHL 4) OR ((d) SHL 6))
    EXITM %Value
ENDM

;-------------------------------------------------------------------------------
; DXTCV11CompressBlockSSE
;
; x64 uses register calling conventions.
; The first four parameters are put in rcx, rdx, r8, r9 (floats would be in xmm0-3).
; rax, r10, r11, xmm4 and xmm5 are volatile in addition to the above - all others
; must be saved.
;
; Prototype as given by the original source comment (it names the symbol
; DXTCCompressBlockSSE; the parameter list is assumed to apply to this PROC):
;   void __cdecl DXTCCompressBlockSSE(DWORD *block_32, DWORD *block_dxtc);
;
; In:    rcx = source, read as 16 consecutive dwords (one 4-byte pixel each)
;        rdx = destination, 8 bytes are written:
;                +0  dword  : the two packed 16-bit endpoint colours
;                +4  4 bytes: per-pixel selector bits (left as 0 when both
;                             endpoint colours are equal)
; Out:   nothing in registers
; Saved: rsi, rdi, rbx, rbp (push/pop), xmm6-xmm8 (spilled to the local frame)
; Clobb: rax, rcx, rdx, r8, xmm0-xmm5, mm0-mm7, flags
;
; Stack: all locals live BELOW rsp, addressed downward from rbp, which is rsp
;        rounded down to 16 bytes; rsp itself is never adjusted and the routine
;        makes no calls.
;        NOTE(review): the Microsoft x64 ABI has no red zone, so this relies on
;        nothing else writing below rsp while the routine runs - confirm that
;        this is acceptable for the callers.
;        NOTE(review): MMX registers are used and no EMMS is executed before
;        returning; presumably the caller issues EMMS if it needs x87 - verify.
;
; Lane naming used in the comments (taken from the original comments):
;        lane 0 = B, lane 1 = G, lane 2 = R of each converted pixel.
;-------------------------------------------------------------------------------
DXTCV11CompressBlockSSE PROC

    push        rsi
    push        rdi
    push        rbx
    push        rbp

    ; Set up a 16-byte aligned storage space pointer
    mov         rbp, rsp
    and         rbp, NOT 0fh

    ; Local frame layout. Every TMP_* value is the distance from rbp down to the
    ; LOWEST byte of that area; the area extends upward from there.
SAVED_REGS      equ 3
TMP_REGSAVE     equ (SAVED_REGS*16)             ; [rbp-48, rbp): 3 XMM save slots

    ; Any xmm regs over 5 need to be saved here as well.
    ; Slot n sits at rbp-TMP_REGSAVE+n*16, i.e. inside [rbp-48, rbp). The offsets
    ; were previously subtracted, which placed the xmm7/xmm8 slots inside the RGB
    ; array below, so the saved values were overwritten by pixels 14 and 15.
    movaps      [rbp-TMP_REGSAVE+( 0*16)], xmm6
    movaps      [rbp-TMP_REGSAVE+( 1*16)], xmm7
    movaps      [rbp-TMP_REGSAVE+( 2*16)], xmm8

    ; Other locals
TMP_RGB         equ (TMP_REGSAVE+(16*16))       ; 16 pixels x 4 floats
TMP_POS_ON_AXIS equ (TMP_RGB+(16*4))            ; 16 floats, one per pixel

TMP_CLUSTERPOS  equ (TMP_POS_ON_AXIS+16)        ; 4 floats: cluster positions
TMP_OLDLEFT     equ (TMP_CLUSTERPOS+16)
TMP_OLDRIGHT    equ (TMP_OLDLEFT+16)
TMP_RIGHTSAVE   equ (TMP_OLDRIGHT+16)
TMP_AXISSAVE    equ (TMP_RIGHTSAVE+16)          ; not referenced in this routine
TMP_AVGSAVE     equ (TMP_AXISSAVE+16)           ; not referenced in this routine

TMP_CLUSTERS    equ (TMP_AVGSAVE+(16*4))        ; not referenced in this routine

TMP_MAXERROR    equ (TMP_CLUSTERS+16)           ; dword (float)
TMP_SWAP        equ (TMP_MAXERROR+4)            ; dword (mask)


    ;---------------------------------------------------------------------------
    ; Convert the pixel values to float and find their average position
    ;---------------------------------------------------------------------------

    xorps       xmm0, xmm0                      ; average (r, g, b)

    ; The input data is in rcx
    lea         rdi, [rbp-TMP_RGB]
    mov         eax, 16                         ; pixel counter
    xorps       xmm2, xmm2                      ; zero, used to widen bytes

average_loop:
    movd        xmm7, dword ptr [rcx]           ; one 4-byte pixel
    punpcklbw   xmm7, xmm2                      ; bytes  -> words
    punpcklwd   xmm7, xmm2                      ; words  -> dwords
    cvtdq2ps    xmm7, xmm7                      ; dwords -> floats
    add         rcx, 4

IF AXIS_MUNGE
    ; colourspace conversion to increase weight of G at expense of B
    movaps      xmm6, xmm7
    shufps      xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1)      ; G
    addss       xmm7, xmm6                      ; lane 0 = B + G
    mulss       xmm7, dword ptr [b_half]        ; lane 0 scaled by b_half
ENDIF

    movaps      [rdi], xmm7                     ; save the value off in the RGB float array
    add         rdi, 16                         ; full 64-bit add: a 32-bit add would
                                                ; zero the upper half of the pointer
    addps       xmm0, xmm7                      ; accumulate average

    sub         eax, 1
    jne         average_loop

    ; Compute average of the values
    mulps       xmm0, [one_over_16]


    ;---------------------------------------------------------------------------
    ; For each component, reflect points about the average so all lie on the
    ; same side of the average, and compute the new average - this gives a
    ; second point that defines the axis.
    ; To compute the sign of the axis sum the positive differences of G for each
    ; of R and B (the G axis is always positive in this implementation).
    ;---------------------------------------------------------------------------

    xorps       xmm1, xmm1                      ; axis (r, g, b)
    xorps       xmm2, xmm2                      ; rg_pos, rb_pos, bg_pos
    lea         rdi, [rbp-TMP_RGB]

    mov         ecx, 16
find_axis_loop:
    movaps      xmm7, [rdi]                     ; R G B value
    add         rdi, 16
    subps       xmm7, xmm0                      ; subtract average - centred
    movaps      xmm6, xmm7
    movaps      xmm5, xmm7

    andps       xmm7, [clearsign]               ; fabs (r, g, b)
    addps       xmm1, xmm7                      ; accumulate direction of axis

    shufps      xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3)      ; B R R 0
    shufps      xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3)      ; G B G 0

    cmpnltps    xmm6, [zero]                    ; R/B >= 0?
    andps       xmm6, xmm5                      ; insert the G or B value for those channels which are positive
    addps       xmm2, xmm6                      ; bg_pos rb_pos rg_pos

    sub         ecx, 1
    jne         find_axis_loop


    ; Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
    movaps      xmm5, xmm2                      ; duplicate the pos across these three
    movaps      xmm6, xmm2
    movaps      xmm7, xmm2
    shufps      xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3)      ; RB_pos 0 ->
    shufps      xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2)      ; RG_pos ->
    shufps      xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)      ; BG_pos ->
    orps        xmm6, xmm7
    cmpneqps    xmm6, [zero]                    ; so check for any non-zero in RG_pos or BG_pos
    andps       xmm2, xmm6                      ; keep the current values only if that check passed
    xorps       xmm6, [invert]                  ; complement the mask
    andps       xmm5, xmm6
    orps        xmm2, xmm5                      ; insert RB pos instead


    ; Change the sign of the R and B portions of the axis appropriately
    cmpltps     xmm2, [zero]
    andps       xmm2, [rb_sign_bits]
    xorps       xmm1, xmm2                      ; Flip the sign of the axis if the r/g or b/g tests indicate a negative slope


    ;---------------------------------------------------------------------------
    ; Axis projection and remapping
    ;---------------------------------------------------------------------------

    ; Normalise the axis for simplicity of future calculation
    movaps      xmm7, xmm1

    mulps       xmm7, xmm7
    PARALLEL_ADD_3D xmm7, xmm6, xmm5

    ; low of xmm7 is the DP result
    ; If this is 0 we haven't actually got an axis, and we can't rsq it,
    ; so mask the output to 0 in this case. This generates an acceptable result
    movss       xmm2, xmm7
    cmpneqss    xmm2, dword ptr [zero]          ; lane 0 = all ones when DP != 0

    ; RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
    ; and consistency improves with it in on certain images. It's not a large cost so leave it in.
    rsqrtss     xmm3, xmm7                      ; y = approximate 1/sqrt(DP)
    andps       xmm3, xmm2                      ; force y to 0 when DP == 0

    movss       xmm2, xmm3
    mulss       xmm3, xmm7
    mulss       xmm3, xmm2                      ; DP * y * y
    mulss       xmm2, dword ptr [half]          ; y * [half]
    movss       xmm7, dword ptr [c3]
    subss       xmm7, xmm3                      ; [c3] - DP*y*y
    mulss       xmm7, xmm2                      ; refined reciprocal length

    shufps      xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

    ; Normalise
    mulps       xmm1, xmm7


    ; Map the axis

    lea         rsi, [rbp-TMP_RGB]
    lea         rdi, [rbp-TMP_POS_ON_AXIS]
    mov         ecx, 16
    movaps      xmm2, [lots]                    ; left
    movaps      xmm3, [minuslots]               ; right

map_axis_loop:
    movaps      xmm7, [rsi]
    subps       xmm7, xmm0                      ; subtract average

    ; dot product with axis
    mulps       xmm7, xmm1
    PARALLEL_ADD_3D xmm7, xmm6, xmm5

    ; xmm7 == pos_on_axis

    movss       dword ptr [rdi], xmm7
    add         rdi, 4

    minss       xmm2, xmm7                      ; calculate left
    maxss       xmm3, xmm7                      ; calculate right

    add         rsi, 16

    sub         ecx, 1
    jne         map_axis_loop


    ; We have a good axis and the basic information about how the points are mapped to it
    ; We need to calculate the endpoints - the initial guess is to use the extremities.

    ; Left and right are used across all the simds
    shufps      xmm2, xmm2, SHUFFLE_SELECT(0, 0, 0, 0)      ; left
    shufps      xmm3, xmm3, SHUFFLE_SELECT(0, 0, 0, 0)      ; right


    ; To simplify further calculations, we offset everything such that the axis centre is at 0

    ; Calculate centre
    movaps      xmm7, xmm2                      ; left
    addps       xmm7, xmm3                      ; right
    mulps       xmm7, [half]

    ; Offset all the axis positions to the centre
    lea         rdi, [rbp-TMP_POS_ON_AXIS]
    movaps      xmm5, [rdi]
    movaps      xmm6, [rdi+16]
    subps       xmm5, xmm7
    subps       xmm6, xmm7
    movaps      [rdi], xmm5
    movaps      [rdi+16], xmm6
    movaps      xmm5, [rdi+32]
    movaps      xmm6, [rdi+48]
    subps       xmm5, xmm7
    subps       xmm6, xmm7
    movaps      [rdi+32], xmm5
    movaps      [rdi+48], xmm6

    ; Offset left, right and average to centre
    subps       xmm2, xmm7                      ; left
    subps       xmm3, xmm7                      ; right

    mulps       xmm7, xmm1                      ; convert to rgb by multiplying by axis
    addps       xmm0, xmm7                      ; average


IF PROGRESSIVE_REFINEMENT

    ;---------------------------------------------------------------------------
    ; Attempt a (simple) progressive refinement step to reduce noise in the
    ; output image by trying to find a better overall match for the endpoints
    ; than the first-guess solution found so far (which is just to take the
    ; ends).
    ;
    ; The method is to move the endpoints inwards until a local minimum is found.
    ; This provides quite a significant improvement in image quality.
    ;---------------------------------------------------------------------------

    mov         eax, [maxerror_init]
    mov         [rbp-TMP_MAXERROR], eax

    movaps      [rbp-TMP_OLDLEFT], xmm2         ; XXX register space expansion
    movaps      [rbp-TMP_OLDRIGHT], xmm3        ; XXX register space expansion

    lea         rdi, [rbp-TMP_CLUSTERPOS]

next_refinement_loop:

    movaps      [rbp-TMP_RIGHTSAVE], xmm3       ; XXX register space expansion

    xorps       xmm8, xmm8                      ; Error - clear the error

    ; Update the array of cluster positions based on the new values of left and right
    movss       xmm7, xmm2                      ; left
    addss       xmm7, xmm3                      ; right
    mulss       xmm7, dword ptr [half]          ; xmm7 = centre
    movss       dword ptr [rdi], xmm2           ; cluster 0 = left
    movss       dword ptr [rdi+4], xmm3         ; cluster 1 = right
    movss       xmm5, xmm3                      ; right
    movss       xmm6, xmm7
    subss       xmm5, xmm7                      ; right-centre
    movss       xmm4, xmm7
    movss       xmm3, xmm5
    mulss       xmm5, dword ptr [deviation_point]
    subss       xmm6, xmm5
    addss       xmm4, xmm5
    movss       dword ptr [rdi+8], xmm6         ; cluster 2 = centre - deviation
    movss       dword ptr [rdi+12], xmm4        ; cluster 3 = centre + deviation

    mulss       xmm3, dword ptr [split_point]   ; xmm3 = division between inner and outer clusters


    ; Calculate the current error
    mov         ecx, 16
    lea         rsi, [rbp-TMP_POS_ON_AXIS]
next_builderror_loop:
    movss       xmm4, dword ptr [rsi]
    add         rsi, 4
    movaps      xmm5, xmm4
    movaps      xmm6, xmm4
    andps       xmm4, [clearsign]
    cmpltss     xmm4, xmm3                      ; < division means 2
    cmpnltss    xmm5, xmm7                      ; >= centre means add 1

    movmskps    eax, xmm4
    movmskps    ebx, xmm5
    lea         eax, [rbx+2*rax]                ; rax == which cluster (top 32 bits are cleared by this);
                                                ; both inputs were zero-extended by movmskps

    subss       xmm6, dword ptr [rdi+4*rax]     ; rdi = array of positions along axis of cluster
    mulss       xmm6, xmm6                      ; square to use MSE and (conveniently) eliminate sign
    addss       xmm8, xmm6                      ; Accumulate to the MSE

    sub         ecx, 1
    jne         next_builderror_loop


    movaps      xmm3, [rbp-TMP_RIGHTSAVE]       ; restore the right XXX register space expansion

    ; Test and update the maximum error
    movss       xmm5, dword ptr [rbp-TMP_MAXERROR]
    cmpltss     xmm5, xmm8                      ; stored threshold < new error?
    movmskps    eax, xmm5
    test        eax, 1
    jnz         refinement_done                 ; yes: keep the previous endpoints

    ; Store (error - epsilon) as the threshold the next step has to beat.
    ; The subtraction used to target xmm6, a scratch register that is overwritten
    ; before it is read again, so the epsilon was never applied and a step that
    ; left the error unchanged (e.g. error == 0) could never end the loop.
    ; NOTE(review): assumes maxerror_epsilon is a small positive value - confirm.
    subss       xmm8, dword ptr [maxerror_epsilon]
    movss       dword ptr [rbp-TMP_MAXERROR], xmm8

    movaps      [rbp-TMP_OLDLEFT], xmm2         ; left  XXX register space expansion
    movaps      [rbp-TMP_OLDRIGHT], xmm3        ; right XXX register space expansion

    ; step left and right in a bit
    mulps       xmm2, [stepsize]                ; XXX promote to register to improve code density?
    mulps       xmm3, [stepsize]

    jmp         next_refinement_loop

refinement_done:
    movaps      xmm2, [rbp-TMP_OLDLEFT]         ; left
    movaps      xmm3, [rbp-TMP_OLDRIGHT]        ; right
ENDIF

    ; Endpoints and axis are now valid so we have all the information we need to compress the block


    ;---------------------------------------------------------------------------
    ; Calculate the high and low output colour values
    ;
    ; Involved in this is a complex rounding procedure.
    ; A straight rounded average is not correct, as the decompressor unrounds by replicating
    ; the top bits to the bottom.
    ;
    ; In order to take account of this process, we don't just apply a straight rounding correction,
    ; but base our rounding on the input value (a straight rounding is actually pretty good in terms of
    ; error measure, but creates a visual colour and/or brightness shift relative to the original image)
    ; The method used here is to apply a centre-biased rounding dependent on the input value, which was
    ; (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
    ; the image.
    ; rgb = (average_rgb + (left|right)*v_rgb);
    ;---------------------------------------------------------------------------
    movaps      xmm6, xmm2                      ; left
    movaps      xmm7, xmm3                      ; right
    mulps       xmm6, xmm1                      ; axis
    mulps       xmm7, xmm1
    addps       xmm6, xmm0                      ; average
    addps       xmm7, xmm0

IF AXIS_MUNGE
    ; Scale the B component, then subtract the green component resultant in each
    movaps      xmm4, xmm6
    movaps      xmm5, xmm7
    mulps       xmm6, [b_2x]                    ; XXX promote to register to improve code density?
    mulps       xmm7, [b_2x]
    shufps      xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
    shufps      xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
    subss       xmm6, xmm4
    subss       xmm7, xmm5
ENDIF

    ; Rearrange so B and R are in the same register half (they both use 5-bit rounding)
    shufps      xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3)      ; B R G
    shufps      xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

    ; Convert to integer (by truncation, as C code does)
    cvttps2pi   mm0, xmm6
    cvttps2pi   mm1, xmm7
    movhlps     xmm6, xmm6
    movhlps     xmm7, xmm7
    cvttps2pi   mm2, xmm6
    cvttps2pi   mm3, xmm7

    ; mm0/1 is blue/red, mm2/3 is green

    ; This isn't quite the same as the C algorithm, but should generate the same result
    ; if the input range is ensured to be 0-255

    ; This code could be heavily interleaved, but for P4 it's not worth the hassle - the
    ; P4 reordering range of 15 instructions will let it do the job for us

    ; XXX promote all these consts to registers to improve code density?
    pmaxsw      mm0, [clamp_0]                  ; Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
    pmaxsw      mm1, [clamp_0]
    pminsw      mm0, [clamp_255]
    pminsw      mm1, [clamp_255]
    movq        mm6, mm0
    movq        mm7, mm1
    paddd       mm0, [blue_red_rounding]
    paddd       mm1, [blue_red_rounding]
    psrld       mm6, 5
    psrld       mm7, 5
    psubd       mm0, mm6                        ; value + rounding - (value >> 5)
    psubd       mm1, mm7
    ; No need to clamp here, with the input in 0-255 range it can never be outside at the end
    pand        mm0, [mask_blue_red]
    pand        mm1, [mask_blue_red]

    ; Separate out R and B as they will need separate shifts later
    pshufw      mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3)        ; extract R (this is an SSE, not MMX, instruction)
    pshufw      mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3)        ; also R

    pmaxsw      mm2, [clamp_0]
    pmaxsw      mm3, [clamp_0]
    pminsw      mm2, [clamp_255]
    pminsw      mm3, [clamp_255]
    movq        mm6, mm2
    movq        mm7, mm3
    paddd       mm2, [green_rounding]
    paddd       mm3, [green_rounding]
    psrld       mm6, 6
    psrld       mm7, 6
    psubd       mm2, mm6                        ; value + rounding - (value >> 6)
    psubd       mm3, mm7
    pand        mm2, [mask_green]
    pand        mm3, [mask_green]


    ; Convert the 8-bit values to final RGB565 colours in mm0 and mm1
    psrld       mm0, 3                          ; B >> 3
    psrld       mm1, 3
    pslld       mm4, 8                          ; R << 8
    pslld       mm5, 8
    pslld       mm2, 3                          ; G << 3
    pslld       mm3, 3
    por         mm0, mm4
    por         mm1, mm5
    por         mm0, mm2
    por         mm1, mm3

    ; mm0 and mm1 are c0 and c1


    ; Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....

    ; rdx contains the destination DXTC block (dx == DXTC)

    pxor        mm5, mm5
    punpcklwd   mm0, mm5                        ; unpack c0/c1 to DWORD's as pcmp is a signed comparison
    punpcklwd   mm1, mm5
    movq        mm2, mm0
    movq        mm3, mm0
    movq        mm4, mm0
    pcmpgtd     mm2, mm1                        ; c0 > c1
    pxor        mm2, qword ptr [invert]         ; Need less than, so flip the result
    movd        dword ptr [rbp-TMP_SWAP], mm2   ; Set the swap flag (used below) appropriately
    ; mm2 is the mask to indicate flipping is needed

    pcmpeqd     mm4, mm1
    movd        ebx, mm4                        ; ebx is the equality flag, plenty of time for this slow move to resolve

    punpcklwd   mm0, mm1                        ; 'normal' order
    punpcklwd   mm1, mm3                        ; reversed order
    pand        mm1, mm2
    pandn       mm2, mm0
    por         mm1, mm2                        ; one of the two, selected by mm2
    movd        dword ptr [rdx], mm1            ; write the result


    ; Colour writes complete - do some housekeeping

    ; Clear the output bitmasks
    add         rdx, 4
    mov         dword ptr [rdx], 0

    ; If the values are equal, the bit selector is 0 because the two colours are
    ; the same (which implies transparent)
    ; This seems the easiest way to do it, and will only rarely break branch prediction on
    ; typical images.
    test        ebx, ebx
    jnz         all_done

IF 0
    ; Sanity check (disabled)
    movzx       eax, word ptr [rdi-4]
    movzx       ebx, word ptr [rdi-2]
    cmp         eax, ebx
    jge         fine
    int         3
fine:
ENDIF


    ;---------------------------------------------------------------------------
    ; Final clustering, creating the 2-bit values that define the output
    ;---------------------------------------------------------------------------

    movaps      xmm7, xmm3                      ; right
    mulps       xmm3, [split_point]
    addps       xmm7, xmm2                      ; left
    mulps       xmm7, [half]                    ; centre (probably 0, but what the hell)

    lea         rsi, [rbp-TMP_POS_ON_AXIS]
    lea         rdi, [expandtable]

    mov         r8d, [rbp-TMP_SWAP]
    and         r8d, 0fh                        ; 4-bit XOR mask: 0fh when the colours were exchanged, else 0

    ; Main cluster loop
    movaps      xmm0, [clearsign]               ; Promote this outside the loop to improve code density
    mov         ecx, 4
next_bit_loop:                                  ; Do 4 at once
    movaps      xmm4, [rsi]                     ; Read the four pos_on_axis entries
    add         rsi, 16                         ; full 64-bit add: a 32-bit add would
                                                ; zero the upper half of the pointer
    movaps      xmm5, xmm4
    andps       xmm4, xmm0                      ; Clear the sign bits in this copy
    cmpltps     xmm4, xmm3                      ; < division means 2
    cmpnltps    xmm5, xmm7                      ; >= centre means add 1

    movmskps    eax, xmm4
    movmskps    ebx, xmm5
    xor         ebx, r8d                        ; Swap the order if we exchanged the colours
    mov         al, byte ptr [rdi+rax+16]       ; table entry for the "< division" mask
    or          al, byte ptr [rdi+rbx]          ; merged with the entry for the ">= centre" mask
    mov         byte ptr [rdx], al              ; rdx == block_dxtc+1

    add         rdx, 1

    sub         ecx, 1
    jne         next_bit_loop

    ; Complete

all_done:

    ; Restore changed regs and exit

    movaps      xmm6, [rbp-TMP_REGSAVE+( 0*16)]
    movaps      xmm7, [rbp-TMP_REGSAVE+( 1*16)]
    movaps      xmm8, [rbp-TMP_REGSAVE+( 2*16)]

    pop         rbp
    pop         rbx
    pop         rdi
    pop         rsi
    ret

DXTCV11CompressBlockSSE ENDP

; End of assembly source
END