TexConv/CMP_CompressonatorLib/DXTC/dxtc_v11_compress_asm.c
//===============================================================================
// Copyright (c) 2007-2016 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2004-2006 ATI Technologies Inc.
//===============================================================================
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//
#include "dxtc_v11_compress.h"
#if defined(_WIN32)
// Raises priority of G at expense of B - seems slightly better than no munging
#define AXIS_MUNGE
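// As inferred from the assembly below (the pixel sits in the XMM register as B, G, R, A),
// the munge halves B towards G on load and undoes it when the endpoints are produced:
//   encode: b' = (b + g) / 2   (addss with the broadcast G, then mulps by [b_half])
//   decode: b  = 2*b' - g      (mulps by [b_2x], then subss of the broadcast G)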
#define XMMCONST(name, val)\
__declspec(align(16)) const float name[4] = { val, val, val, val };
#define XMMICONST(name, val)\
__declspec(align(16)) const unsigned long name[4] = { val, val, val, val };
#define XMM4CONST(name, v0, v1, v2, v3)\
__declspec(align(16)) const float name[4] = { v0, v1, v2, v3 };
#define XMM4ICONST(name, v0, v1, v2, v3)\
__declspec(align(16)) const unsigned long name[4] = { v0, v1, v2, v3 };
XMMCONST(zero, 0);
XMMCONST(one, 1.0f);
XMMCONST(one_over_16, (1.0f/16.0f));
XMM4CONST(one_over_16_x_255_zeros, (1.0f/(16.0f*255.0f)), 0, 0, 0);
XMMICONST(clearsign, 0x7fffffff);
XMMICONST(signbit, 0x80000000);
XMMCONST(half, 0.5f);
XMMCONST(c255, 255.0f);
XMMCONST(c3, 3.0f);
XMMCONST(lots, 10000.0f);
XMMCONST(minuslots, -10000.0f);
XMMICONST(mask_green, 0xfc);
XMMICONST(mask_blue_red, 0xf8);
XMMICONST(clamp_0, 0);
XMMICONST(clamp_255, 255);
XMMICONST(green_rounding, (0x80>>6));
XMMICONST(blue_red_rounding, (0x80>>5));
XMM4ICONST(rb_sign_bits, 0x80000000, 0, 0x80000000, 0);
XMMCONST(stepsize, 0.95f);
XMMCONST(onethird, 1.0f/3.0f);
XMMCONST(maxerror_init, 10000000.0f);
XMMCONST(maxerror_epsilon, 5.0f);
XMM4CONST(b_half, 0.5f, 1.0f, 1.0f, 1.0f);
XMM4CONST(b_2x, 2.0f, 1.0f, 1.0f, 1.0f);
XMMICONST(invert, 0xffffffff);
XMMCONST(split_point, 2.0f/3.0f);
XMMCONST(deviation_point, 1.0f/3.0f);
XMMCONST(split_point_r300, 5.0f/8.0f);
XMMCONST(deviation_point_r300, 1.0f/4.0f);
XMMICONST(_0f0f0f0f0f0f0f0f, 0x0f0f0f0f);
XMMICONST(_0707070707070707, 0x07070707);
XMMICONST(_000f000f000f000f, 0x000f000f);
XMMICONST(_00f000f000f000f0, 0x00f000f0);
#define SHUFFLE_SELECT(a, b, c, d) (a | (b<<2) | (c<<4) | (d<<6))
#define PARALLEL_ADD_XMM7 \
{ \
__asm movaps xmm6, xmm7 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm7, xmm7, SHUFFLE_SELECT(2, 3, 2, 3) \
__asm addps xmm7, xmm6 \
__asm shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1) \
__asm addps xmm7, xmm5 \
}
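// PARALLEL_ADD_XMM7 leaves the sum of lanes 0, 1 and 2 of xmm7 in lane 0 (lane 3 is
// ignored), i.e. the three-component dot product used below. A minimal scalar sketch
// of the same reduction; illustrative only and excluded from the build:
#if 0
static float parallel_add_sketch(const float v[4])
{
    // shufps(2,3,2,3) + addps folds lane 2 onto lane 0,
    // then shufps(1,1,1,1) + addps folds lane 1 onto lane 0.
    return v[0] + v[1] + v[2];
}
#endif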
#define AVERAGE xmm0
#define AXIS xmm1
BYTE expandtable[32] =
{
0x00, 0x01, 0x04, 0x05,
0x10, 0x11, 0x14, 0x15,
0x40, 0x41, 0x44, 0x45,
0x50, 0x51, 0x54, 0x55,
0x00, 0x02, 0x08, 0x0a,
0x20, 0x22, 0x28, 0x2a,
0x80, 0x82, 0x88, 0x8a,
0xa0, 0xa2, 0xa8, 0xaa,
};
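// expandtable spreads a 4-bit movmskps result across a byte: entries 0..15 place the
// four mask bits at the even bit positions (bit 0 of each 2-bit selector), entries
// 16..31 place them at the odd positions (bit 1). OR-ing one lookup from each half
// therefore packs four 2-bit DXT1 selectors into a single byte, as the clustering
// loops below do. Illustrative sketch with a hypothetical helper name, not built:
#if 0
static BYTE pack_selectors_sketch(int centre_mask, int division_mask)
{
    // centre_mask: ">= centre" bits (each adds 1); division_mask: "< split point" bits (each adds 2)
    return (BYTE)(expandtable[centre_mask] | expandtable[division_mask + 16]);
}
#endif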
#endif //_WIN32
#if !defined(_WIN64) && defined(_WIN32)
#pragma warning( push )
#pragma warning(disable:4100)
void __declspec(naked) __cdecl DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc)
{
// block_32 and block_dxtc are only referenced through inline asm, so the (VS2010) compiler generates warning C4100: unreferenced formal parameter
__asm
{
{
push esi
push edi
push ebx
push ebp
#define SP_BLOCK_32 20
#define SP_BLOCK_DXTC 24
// Set up a 16-byte aligned storage space pointer
mov ebp, esp
and ebp, ~0x0f
#define EBP_UNIQUES (16*16)
#define EBP_POS_ON_AXIS (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT (EBP_VALUES+16)
#define EBP_OLDRIGHT (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE (EBP_OLDRIGHT+16)
#define EBP_MAXERROR (EBP_RIGHTSAVE+4)
#define EBP_ERROR (EBP_MAXERROR+4)
#define EBP_SWAP (EBP_ERROR+4)
// Find the array of unique pixel values and sum them to find their average position
xorps AVERAGE, AVERAGE // average (r, g, b)
lea edi, [ebp-EBP_UNIQUES]
mov esi, [esp+SP_BLOCK_32]
mov ecx, 16
pxor mm2, mm2
{
average_unique_loop:
movd mm0, [esi]
add esi, 4
punpcklbw mm0, mm2
movq mm1, mm0
punpckhwd mm0, mm2
punpcklwd mm1, mm2
cvtpi2ps xmm7, mm0
movlhps xmm7, xmm7
cvtpi2ps xmm7, mm1
// colourspace conversion
// Most of the options in the C compressor are not supported here
#ifdef AXIS_MUNGE
movaps xmm6, xmm7
shufps xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1) // G
addss xmm7, xmm6
mulps xmm7, [b_half]
#endif
movaps [edi], xmm7
add edi, 16
addps AVERAGE, xmm7
sub ecx, 1
jne average_unique_loop
}
// Compute average of the uniques
mulps AVERAGE, [one_over_16]
}
{
// For each component, reflect points about the average so all lie on the same side
// of the average, and compute the new average - this gives a second point that defines the axis
// To compute the sign of the axis, sum the G offsets of the points whose R (or B) offset is positive
// (the G component of the axis is always positive in this implementation)
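// (In other words: if, summed over the block, the points with a positive R (or B) offset
// tend to have a negative G offset, the R (or B) component of the axis is negated below.)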
xorps AXIS, AXIS // v (r, g, b)
xorps xmm2, xmm2 // rg_pos, rb_pos, bg_pos
lea edi, [ebp-EBP_UNIQUES]
mov ecx, 16
{
find_axis_loop:
movaps xmm7, [edi] // R G B value
add edi, 16
subps xmm7, AVERAGE // centred
movaps xmm6, xmm7
movaps xmm5, xmm7
andps xmm7, [clearsign] // fabs (r, g, b)
addps AXIS, xmm7 // direction of axis
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) // B R R 0
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) // G B G 0
cmpnltps xmm6, [zero] // R/B > 0?
andps xmm6, xmm5 // insert the G or B value for those channels which are positive
addps xmm2, xmm6 // bg_pos rb_pos rg_pos
sub ecx, 1
jne find_axis_loop
}
mulps AXIS, [one_over_16]
// Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
movaps xmm5, xmm2 // duplicate the pos across these three
movaps xmm6, xmm2
movaps xmm7, xmm2
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) // RB_pos 0 ->
shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) // RG_pos ->
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) // BG_pos ->
orps xmm6, xmm7
cmpneqps xmm6, [zero] // so check for any non-zero in RG_pos or BG_pos
andps xmm2, xmm6 // Keep the current pos values only if RG_pos or BG_pos is non-zero
xorps xmm6, [invert]
andps xmm5, xmm6
orps xmm2, xmm5 // insert RB pos instead
// Change the sign of the R and B portions of the axis appropriately
cmpltps xmm2, [zero]
andps xmm2, [rb_sign_bits]
xorps AXIS, xmm2 // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
}
// Axis projection and remapping
{
// Normalise the axis for simplicity of future calculation
movaps xmm7, AXIS
mulps xmm7, xmm7
PARALLEL_ADD_XMM7
// low of xmm7 is the DP result
// If this is 0 we haven't actually got an axis, and we can't rsq it,
// so mask the output to 0 in this case. This generates an acceptable result
movaps xmm2, xmm7
cmpneqps xmm2, [zero]
#if 1 // RSQRT with Newton-Raphson. This can be omitted for even faster encoding performance, but quality
// and consistency improve with it enabled on certain images. It's not a large cost, so leave it in.
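// One iteration refines the rsqrtps estimate y0 as y1 = 0.5 * y0 * (3 - x*y0*y0).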
rsqrtps xmm3, xmm7
andps xmm3, xmm2
movaps xmm2, xmm3
mulps xmm3, xmm7
mulps xmm3, xmm2
mulps xmm2, [half]
movaps xmm7, [c3]
subps xmm7, xmm3
mulps xmm7, xmm2
#else
// No Newton-Raphson method
rsqrtps xmm7, xmm7
andps xmm7, xmm2
#endif
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
// Normalise
mulps AXIS, xmm7
}
#define LEFT xmm2
#define RIGHT xmm3
// Map the axis
{
lea edi, [ebp-EBP_UNIQUES]
lea edx, [ebp-EBP_POS_ON_AXIS]
mov ecx, 16
movaps LEFT, [lots]
movaps RIGHT, [minuslots]
xorps xmm4, xmm4 // axis mapping error
{
map_axis_loop:
movaps xmm7, [edi]
subps xmm7, AVERAGE
mulps xmm7, AXIS
PARALLEL_ADD_XMM7
movss [edx], xmm7
add edx, 4
// xmm7 == pos_on_axis
minss LEFT, xmm7 // calculate left
maxss RIGHT, xmm7 // calculate right
add edi, 16
sub ecx, 1
jne map_axis_loop
}
}
shufps LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
shufps RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)
{
// Now we have a good axis and the basic information about how the points are mapped
// to it
// Our initial guess is to represent the endpoints accurately, by moving the average
// to the centre and recalculating the point positions along the line
// Calculate centre
movaps xmm7, LEFT
addps xmm7, RIGHT
mulps xmm7, [half]
// Offset all the axis positions to the centre
lea edi, [ebp-EBP_POS_ON_AXIS]
movaps xmm5, [edi]
movaps xmm6, [edi+16]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi], xmm5
movaps [edi+16], xmm6
movaps xmm5, [edi+32]
movaps xmm6, [edi+48]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi+32], xmm5
movaps [edi+48], xmm6
// Offset left, right and average to centre
subps LEFT, xmm7
subps RIGHT, xmm7
mulps xmm7, AXIS
addps AVERAGE, xmm7
}
#define PROGRESSIVE_REFINEMENT
#ifdef PROGRESSIVE_REFINEMENT
{
// Attempt a (simple) progressive refinement step to reduce noise in the
// output image by trying to find a better overall match for the endpoints
// than the first-guess solution found so far (which is just to take the ends).
// The method is to move the endpoints inwards until a local minimum is found.
// This provides quite a significant improvement in image quality.
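// Each pass scores the current [left, right] guess against the four reconstruction levels
// (the two endpoints plus the interior points at +/- a third of the half-width, selected via
// [split_point]/[deviation_point]), accumulates the squared error, then pulls both endpoints
// inwards by [stepsize]; it stops once the error no longer improves by more than
// [maxerror_epsilon] and restores the previous endpoints.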
mov eax, [maxerror_init]
mov [ebp-EBP_MAXERROR], eax
movaps [ebp-EBP_OLDLEFT], LEFT
movaps [ebp-EBP_OLDRIGHT], RIGHT
lea edx, [expandtable]
lea edi, [ebp-EBP_VALUES]
{
next_refinement_loop:
movaps [ebp-EBP_RIGHTSAVE], RIGHT
mov eax, 0
mov [ebp-EBP_ERROR], eax
lea ecx, split_point
movss xmm7, LEFT
addss xmm7, RIGHT
mulss xmm7, [half]
movss [edi], LEFT
movss [edi+4], RIGHT
movss xmm5, RIGHT
movss xmm6, xmm7
subss xmm5, xmm7 // right-centre
movss xmm4, xmm7
movss RIGHT, xmm5
mulss xmm5, [ecx+16] //[deviation_point]
subss xmm6, xmm5
addss xmm4, xmm5
movss [edi+8], xmm6
movss [edi+12], xmm4
mulps RIGHT, [ecx] //[split_point]
mov ecx, 16
lea esi, [ebp-EBP_POS_ON_AXIS]
{
next_builderror_loop:
movss xmm4, [esi]
add esi, 4
movaps xmm5, xmm4
movaps xmm6, xmm4
andps xmm4, [clearsign]
cmpltss xmm4, RIGHT // < division means 2
cmpnltss xmm5, xmm7 // >= centre means add 1
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx+eax+16]
movzx ebx, byte ptr[edx+ebx]
or eax, ebx
and eax, 3
subss xmm6, [edi+4*eax]
mulss xmm6, xmm6
addss xmm6, [ebp-EBP_ERROR]
movss [ebp-EBP_ERROR], xmm6
sub ecx, 1
jne next_builderror_loop
}
movaps RIGHT, [ebp-EBP_RIGHTSAVE]
#if 1
mov eax, [ebp-EBP_ERROR]
mov ebx, [ebp-EBP_MAXERROR]
cmp eax, ebx
jge refinement_done
#else
movss xmm5, [ebp-EBP_MAXERROR]
cmpltss xmm5, xmm6
movss [ebp-EBP_ERROR], xmm5
mov eax, [ebp-EBP_ERROR]
test eax, eax
jnz refinement_done
#endif
subss xmm6, [maxerror_epsilon]
movss [ebp-EBP_MAXERROR], xmm6
movaps [ebp-EBP_OLDLEFT], LEFT
movaps [ebp-EBP_OLDRIGHT], RIGHT
mulps LEFT, [stepsize]
mulps RIGHT, [stepsize]
jmp next_refinement_loop
refinement_done:
movaps LEFT, [ebp-EBP_OLDLEFT]
movaps RIGHT, [ebp-EBP_OLDRIGHT]
}
}
#endif
{
// Calculate the high and low output colour values
// Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
// straight rounded average is not correct, as the decompressor 'unrounds' by replicating
// the top bits to the bottom.
// In order to take account of this process, we don't just apply a straight rounding correction,
// but base our rounding on the input value (a straight rounding is actually pretty good in terms of
// error measure, but creates a visual colour and/or brightness shift relative to the original image)
// The method used here is to apply a centre-biased rounding dependent on the input value, which was
// (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
// the image.
// rgb = (average_rgb + (left|right)*v_rgb);
movaps xmm6, LEFT
movaps xmm7, RIGHT
mulps xmm6, AXIS
mulps xmm7, AXIS
addps xmm6, AVERAGE
addps xmm7, AVERAGE
#ifdef AXIS_MUNGE
// Scale the B component, then subtract the green component resultant in each
movaps xmm4, xmm6
movaps xmm5, xmm7
mulps xmm6, [b_2x]
mulps xmm7, [b_2x]
shufps xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
subss xmm6, xmm4
subss xmm7, xmm5
#endif
// Rearrange so B and R are in the same register half (they both use 5-bit rounding)
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) // B R G
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)
// Convert to integer (by truncation, as C code does)
cvttps2pi mm0, xmm6
cvttps2pi mm1, xmm7
movhlps xmm6, xmm6
movhlps xmm7, xmm7
cvttps2pi mm2, xmm6
cvttps2pi mm3, xmm7
// mm0/1 is blue/red, mm2/3 is green
// This isn't quite the same as the C algorithm, but should generate the same result
// if the input range is ensured to be 0-255
// This code could be heavily interleaved, but for P4 it's not worth the hassle - the
// P4 reordering range of 15 instructions will let it do the job for us
pmaxsw mm0, [clamp_0] // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
pmaxsw mm1, [clamp_0]
pminsw mm0, [clamp_255]
pminsw mm1, [clamp_255]
movq mm6, mm0
movq mm7, mm1
paddd mm0, [blue_red_rounding]
paddd mm1, [blue_red_rounding]
psrld mm6, 5
psrld mm7, 5
psubd mm0, mm6
psubd mm1, mm7
// No need to clamp here, with the input in 0-255 range it can never be outside at the end
pand mm0, [mask_blue_red]
pand mm1, [mask_blue_red]
// Separate out R and B as they will need separate shifts later
pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) // extract R (this is an SSE, not MMX, instruction)
pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) // also R
pmaxsw mm2, [clamp_0]
pmaxsw mm3, [clamp_0]
pminsw mm2, [clamp_255]
pminsw mm3, [clamp_255]
movq mm6, mm2
movq mm7, mm3
paddd mm2, [green_rounding]
paddd mm3, [green_rounding]
psrld mm6, 6
psrld mm7, 6
psubd mm2, mm6
psubd mm3, mm7
pand mm2, [mask_green]
pand mm3, [mask_green]
// Convert the 8-bit values to final RGB565 colours in mm0 and mm1
psrld mm0, 3
psrld mm1, 3
pslld mm4, 8
pslld mm5, 8
pslld mm2, 3
pslld mm3, 3
por mm0, mm4
por mm1, mm5
por mm0, mm2
por mm1, mm3
// mm0 and mm1 are c0 and c1
// Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
mov edi, [esp+SP_BLOCK_DXTC]
#if 1
pxor mm5, mm5
punpcklwd mm0, mm5 // unpack c0/c1 to DWORD's as pcmp is a signed comparison
punpcklwd mm1, mm5
movq mm2, mm0
movq mm3, mm0
movq mm4, mm0
pcmpgtd mm2, mm1
pxor mm2, [invert] // Need less than, so flip the result
movd [ebp-EBP_SWAP], mm2 // Set the swap flag (used below) appropriately
// mm2 is the mask to indicate flipping is needed
pcmpeqd mm4, mm1
movd ebx, mm4 // ebx is the equality flag, plenty of time for this slow move to resolve
punpcklwd mm0, mm1 // 'normal' order
punpcklwd mm1, mm3 // reversed order
pand mm1, mm2
pandn mm2, mm0
por mm1, mm2 // one of the two, selected by mm2
movd [edi], mm1 // write the result
#else
// No compare simple version
punpcklwd mm0, mm1
mov [ebp-EBP_SWAP], 0
xor ebx, ebx
movd [edi], mm0
// punpcklwd mm1, mm0
// movq mm0, mm1
// mov dword ptr[ebp-EBP_SWAP], 0xffffffff
#endif
}
// Clear the output bitmasks
add edi, 4
mov dword ptr [edi], 0
// If the values are equal, the bit selector is 0 because the two colours are
// the same (which implies transparent)
// This seems the easiest way to do it, and will only rarely break branch prediction on
// typical images.
test ebx, ebx
jnz all_done
{
// Final clustering, creating the 2-bit values that define the output
lea ecx, split_point
movaps xmm7, RIGHT
mulps RIGHT, [ecx] // split point
addps xmm7, LEFT
mulps xmm7, [half] // centre (probably 0, but what the hell)
lea esi, [ebp-EBP_POS_ON_AXIS]
lea edx, [expandtable]
movss xmm6, [ebp-EBP_SWAP]
shufps xmm6, xmm6, 0
mov ecx, 4
{
next_bit_loop: // Do 4 at once
movaps xmm4, [esi] // Read the four pos_on_axis entries
add esi, 16
movaps xmm5, xmm4
andps xmm4, [clearsign]
cmpltps xmm4, RIGHT // < division means 2
cmpnltps xmm5, xmm7 // >= centre means add 1
xorps xmm5, xmm6 // Swap the order if we had to flip our colours
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx+eax+16]
movzx ebx, byte ptr[edx+ebx]
or eax, ebx
mov byte ptr [edi], al
add edi, 1
sub ecx, 1
jne next_bit_loop
}
}
all_done:
emms
pop ebp
pop ebx
pop edi
pop esi
ret
}
}
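// Reference sketch of the centre-biased rounding and RGB565 packing performed by the MMX
// endpoint code in the routines above and below, assuming inputs already clamped to 0..255.
// Illustrative only and excluded from the build:
#if 0
static DWORD quantise_565_sketch(int r, int g, int b)
{
    // v + rounding - (v >> bits) compensates for the decompressor replicating the top
    // bits into the bottom bits when it expands the 5/6-bit fields back to 8 bits.
    int r5 = (r + (0x80 >> 5) - (r >> 5)) & 0xf8;   // 5-bit field, kept in bits 3..7
    int g6 = (g + (0x80 >> 6) - (g >> 6)) & 0xfc;   // 6-bit field, kept in bits 2..7
    int b5 = (b + (0x80 >> 5) - (b >> 5)) & 0xf8;
    return (DWORD)((r5 << 8) | (g6 << 3) | (b5 >> 3));
}
#endif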
void __declspec(naked) __cdecl DXTCV11CompressBlockSSEMinimal(DWORD *block_32, DWORD *block_dxtc)
{
#define SP_BLOCK_32 20
#define SP_BLOCK_DXTC 24
#define EBP_UNIQUES (16*16)
#define EBP_POS_ON_AXIS (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT (EBP_VALUES+16)
#define EBP_OLDRIGHT (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE (EBP_OLDRIGHT+16)
#define EBP_MAXERROR (EBP_RIGHTSAVE+4)
#define EBP_ERROR (EBP_MAXERROR+4)
#define EBP_SWAP (EBP_ERROR+4)
// block_32 and block_dxtc are only referenced through inline asm, so the (VS2010) compiler generates warning C4100: unreferenced formal parameter
__asm
{
push esi
push edi
push ebx
push ebp
// Set up a 16-byte aligned storage space pointer
mov ebp, esp
and ebp, ~0x0f
// init to 0
xorps AVERAGE, AVERAGE // average (r, g, b)
// -------------------------------------------------------------------------------------
// (3) Find the array of unique pixel values and sum them to find their average position
// -------------------------------------------------------------------------------------
{
lea edi, [ebp - EBP_UNIQUES]
mov esi, [esp + SP_BLOCK_32]
mov ecx, 16
pxor mm2, mm2
average_unique_loop:
{
movd mm0, [esi]
add esi, 4
punpcklbw mm0, mm2
movq mm1, mm0
punpckhwd mm0, mm2
punpcklwd mm1, mm2
cvtpi2ps xmm7, mm0
movlhps xmm7, xmm7
cvtpi2ps xmm7, mm1
// colourspace conversion
movaps [edi], xmm7
add edi, 16
addps AVERAGE, xmm7
sub ecx, 1
jne average_unique_loop
}
// Compute average of the uniques
mulps AVERAGE, [one_over_16]
}
// -------------------------------------------------------------------------------------
// (4) For each component, reflect points about the average so all lie on the same side
// of the average, and compute the new average - this gives a second point that defines the axis
// To compute the sign of the axis, sum the G offsets of the points whose R (or B) offset is positive
// (the G component of the axis is always positive in this implementation)
// -------------------------------------------------------------------------------------
{
xorps AXIS, AXIS // v (r, g, b)
xorps xmm2, xmm2 // rg_pos, rb_pos, bg_pos
lea edi, [ebp - EBP_UNIQUES]
mov ecx, 16
find_axis_loop:
{
movaps xmm7, [edi] // R G B value
add edi, 16
subps xmm7, AVERAGE // centred
movaps xmm6, xmm7
movaps xmm5, xmm7
andps xmm7, [clearsign] // fabs (r, g, b)
addps AXIS, xmm7 // direction of axis
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3) // B R R 0
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3) // G B G 0
cmpnltps xmm6, [zero] // R/B > 0?
andps xmm6, xmm5 // insert the G or B value for those channels which are positive
addps xmm2, xmm6 // bg_pos rb_pos rg_pos
sub ecx, 1
jne find_axis_loop
}
mulps AXIS, [one_over_16]
// Handle RB_pos - RB pos is used if RG_pos and BG_pos are both zero.
movaps xmm5, xmm2 // duplicate the pos across these three
movaps xmm6, xmm2
movaps xmm7, xmm2
shufps xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3) // RB_pos 0 ->
shufps xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2) // RG_pos ->
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0) // BG_pos ->
orps xmm6, xmm7
cmpneqps xmm6, [zero] // so check for any non-zero in RG_pos or BG_pos
andps xmm2, xmm6 // Keep the current pos values only if RG_pos or BG_pos is non-zero
xorps xmm6, [invert]
andps xmm5, xmm6
orps xmm2, xmm5 // insert RB pos instead
// Change the sign of the R and B portions of the axis appropriately
cmpltps xmm2, [zero]
andps xmm2, [rb_sign_bits]
xorps AXIS, xmm2 // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
}
// -------------------------------------------------------------------------------------
// (5) Axis projection and remapping
// -------------------------------------------------------------------------------------
{
// Normalise the axis for simplicity of future calculation
movaps xmm7, AXIS
mulps xmm7, xmm7
PARALLEL_ADD_XMM7
// low of xmm7 is the DP result
// If this is 0 we haven't actually got an axis, and we can't rsq it,
// so mask the output to 0 in this case. This generates an acceptable result
movaps xmm2, xmm7
cmpneqps xmm2, [zero]
rsqrtps xmm7, xmm7
andps xmm7, xmm2
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)
// Normalise
mulps AXIS, xmm7
}
#define LEFT xmm2
#define RIGHT xmm3
// -------------------------------------------------------------------------------------
// (6) Map the axis
// -------------------------------------------------------------------------------------
{
lea edi, [ebp - EBP_UNIQUES]
lea edx, [ebp - EBP_POS_ON_AXIS]
mov ecx, 16
movaps LEFT, [lots]
movaps RIGHT, [minuslots]
xorps xmm4, xmm4 // axis mapping error
{
map_axis_loop:
movaps xmm7, [edi]
subps xmm7, AVERAGE
mulps xmm7, AXIS
PARALLEL_ADD_XMM7
movss [edx], xmm7
add edx, 4
// xmm7 == pos_on_axis
minss LEFT, xmm7 // calculate left
maxss RIGHT, xmm7 // calculate right
add edi, 16
sub ecx, 1
jne map_axis_loop
}
}
shufps LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
shufps RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)
// -------------------------------------------------------------------------------------
// (7) Now we have a good axis and the basic information about how the points are mapped
// to it
// Our initial guess is to represent the endpoints accurately, by moving the average
// to the centre and recalculating the point positions along the line
// -------------------------------------------------------------------------------------
{
// Calculate centre
movaps xmm7, LEFT
addps xmm7, RIGHT
mulps xmm7, [half]
// Offset all the axis positions to the centre
lea edi, [ebp - EBP_POS_ON_AXIS]
movaps xmm5, [edi]
movaps xmm6, [edi + 16]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi], xmm5
movaps [edi + 16], xmm6
movaps xmm5, [edi + 32]
movaps xmm6, [edi + 48]
subps xmm5, xmm7
subps xmm6, xmm7
movaps [edi + 32], xmm5
movaps [edi + 48], xmm6
// Offset left, right and average to centre
subps LEFT, xmm7
subps RIGHT, xmm7
mulps xmm7, AXIS
addps AVERAGE, xmm7
}
// -------------------------------------------------------------------------------------
// (8) Calculate the high and low output colour values
// Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
// straight rounded average is not correct, as the decompressor 'unrounds' by replicating
// the top bits to the bottom.
// In order to take account of this process, we don't just apply a straight rounding correction,
// but base our rounding on the input value (a straight rounding is actually pretty good in terms of
// error measure, but creates a visual colour and/or brightness shift relative to the original image)
// The method used here is to apply a centre-biased rounding dependent on the input value, which was
// (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
// the image.
// rgb = (average_rgb + (left|right)*v_rgb);
// -------------------------------------------------------------------------------------
{
movaps xmm6, LEFT
movaps xmm7, RIGHT
mulps xmm6, AXIS
mulps xmm7, AXIS
addps xmm6, AVERAGE
addps xmm7, AVERAGE
// Rearrange so B and R are in the same register half (they both use 5-bit rounding)
shufps xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3) // B R G
shufps xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)
// Convert to integer (by truncation, as C code does)
cvttps2pi mm0, xmm6
cvttps2pi mm1, xmm7
movhlps xmm6, xmm6
movhlps xmm7, xmm7
cvttps2pi mm2, xmm6
cvttps2pi mm3, xmm7
// mm0/1 is blue/red, mm2/3 is green
// This isn't quite the same as the C algorithm, but should generate the same result
// if the input range is ensured to be 0-255
// This code could be heavily interleaved, but for P4 it's not worth the hassle - the
// P4 reordering range of 15 instructions will let it do the job for us
pmaxsw mm0, [clamp_0] // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
pmaxsw mm1, [clamp_0]
pminsw mm0, [clamp_255]
pminsw mm1, [clamp_255]
movq mm6, mm0
movq mm7, mm1
paddd mm0, [blue_red_rounding]
paddd mm1, [blue_red_rounding]
psrld mm6, 5
psrld mm7, 5
psubd mm0, mm6
psubd mm1, mm7
// No need to clamp here, with the input in 0-255 range it can never be outside at the end
pand mm0, [mask_blue_red]
pand mm1, [mask_blue_red]
// Separate out R and B as they will need separate shifts later
pshufw mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3) // extract R (this is an SSE, not MMX, instruction)
pshufw mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3) // also R
pmaxsw mm2, [clamp_0]
pmaxsw mm3, [clamp_0]
pminsw mm2, [clamp_255]
pminsw mm3, [clamp_255]
movq mm6, mm2
movq mm7, mm3
paddd mm2, [green_rounding]
paddd mm3, [green_rounding]
psrld mm6, 6
psrld mm7, 6
psubd mm2, mm6
psubd mm3, mm7
pand mm2, [mask_green]
pand mm3, [mask_green]
// Convert the 8-bit values to final RGB565 colours in mm0 and mm1
psrld mm0, 3
psrld mm1, 3
pslld mm4, 8
pslld mm5, 8
pslld mm2, 3
pslld mm3, 3
por mm0, mm4
por mm1, mm5
por mm0, mm2
por mm1, mm3
// mm0 and mm1 are c0 and c1
// Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
mov edi, [esp + SP_BLOCK_DXTC]
pxor mm5, mm5
punpcklwd mm0, mm5 // unpack c0/c1 to DWORD's as pcmp is a signed comparison
punpcklwd mm1, mm5
movq mm2, mm0
movq mm3, mm0
movq mm4, mm0
pcmpgtd mm2, mm1
pxor mm2, [invert] // Need less than, so flip the result
movd [ebp - EBP_SWAP], mm2 // Set the swap flag (used below) appropriately
// mm2 is the mask to indicate flipping is needed
pcmpeqd mm4, mm1
movd ebx, mm4 // ebx is the equality flag, plenty of time for this slow move to resolve
punpcklwd mm0, mm1 // 'normal' order
punpcklwd mm1, mm3 // reversed order
pand mm1, mm2
pandn mm2, mm0
por mm1, mm2 // one of the two, selected by mm2
movd [edi], mm1 // write the result
}
// Clear the output bitmasks
add edi, 4
mov dword ptr[edi], 0
// If the values are equal, the bit selector is 0 because the two colours are
// the same (which implies transparent)
// This seems the easiest way to do it, and will only rarely break branch prediction on
// typical images.
test ebx, ebx
jnz all_done
// -------------------------------------------------------------------------------------
// (9) Final clustering, creating the 2-bit values that define the output
// -------------------------------------------------------------------------------------
{
lea ecx, split_point
movaps xmm7, RIGHT
mulps RIGHT, [ecx] // split point
addps xmm7, LEFT
mulps xmm7, [half] // centre (probably 0, but what the hell)
lea esi, [ebp - EBP_POS_ON_AXIS]
lea edx, [expandtable]
movss xmm6, [ebp - EBP_SWAP]
shufps xmm6, xmm6, 0
mov ecx, 4
{
next_bit_loop: // Do 4 at once
movaps xmm4, [esi] // Read the four pos_on_axis entries
add esi, 16
movaps xmm5, xmm4
andps xmm4, [clearsign]
cmpltps xmm4, RIGHT // < division means 2
cmpnltps xmm5, xmm7 // >= centre means add 1
xorps xmm5, xmm6 // Swap the order if we had to flip our colours
movmskps eax, xmm4
movmskps ebx, xmm5
movzx eax, byte ptr[edx + eax + 16]
movzx ebx, byte ptr[edx + ebx]
or eax, ebx
mov byte ptr[edi], al
add edi, 1
sub ecx, 1
jne next_bit_loop
}
}
all_done:
emms
pop ebp
pop ebx
pop edi
pop esi
ret
} // _asm
}
#pragma warning( pop )
#endif // !_WIN64
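// Hypothetical usage sketch (not part of this file): each entry point consumes one 4x4 block
// of 16 packed 32-bit pixels and emits one 8-byte DXT1 block (two RGB565 endpoints followed
// by sixteen 2-bit selectors). Assumes the caller has already gathered each tile into 16
// consecutive DWORDs. Illustrative only and excluded from the build:
#if 0
static void compress_blocks_sketch(DWORD *block_pixels, DWORD *out_dxt1, int block_count)
{
    int i;
    for (i = 0; i < block_count; ++i)
    {
        // block_32: 16 DWORD pixels in; block_dxtc: 2 DWORDs (8 bytes) out
        DXTCV11CompressBlockSSE(block_pixels + 16 * i, out_dxt1 + 2 * i);
    }
}
#endif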