//===============================================================================
// Copyright (c) 2007-2016 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2004-2006 ATI Technologies Inc.
//===============================================================================
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//

#include "dxtc_v11_compress.h"

#if defined(_WIN32)

// Raises priority of G at expense of B - seems slightly better than no munging
#define AXIS_MUNGE

#define XMMCONST(name, val) \
    __declspec(align(16)) const float name[4] = { val, val, val, val };
#define XMMICONST(name, val) \
    __declspec(align(16)) const unsigned long name[4] = { val, val, val, val };
#define XMM4CONST(name, v0, v1, v2, v3) \
    __declspec(align(16)) const float name[4] = { v0, v1, v2, v3 };
#define XMM4ICONST(name, v0, v1, v2, v3) \
    __declspec(align(16)) const unsigned long name[4] = { v0, v1, v2, v3 };

XMMCONST(zero, 0);
XMMCONST(one, 1.0f);
XMMCONST(one_over_16, (1.0f/16.0f));
XMM4CONST(one_over_16_x_255_zeros, (1.0f/(16.0f*255.0f)), 0, 0, 0);
XMMICONST(clearsign, 0x7fffffff);
XMMICONST(signbit, 0x80000000);
XMMCONST(half, 0.5f);
XMMCONST(c255, 255.0f);
XMMCONST(c3, 3.0f);
XMMCONST(lots, 10000.0f);
XMMCONST(minuslots, -10000.0f);
XMMICONST(mask_green, 0xfc);
XMMICONST(mask_blue_red, 0xf8);
XMMICONST(clamp_0, 0);
XMMICONST(clamp_255, 255);
XMMICONST(green_rounding, (0x80>>6));
XMMICONST(blue_red_rounding, (0x80>>5));
XMM4ICONST(rb_sign_bits, 0x80000000, 0, 0x80000000, 0);
XMMCONST(stepsize, 0.95f);
XMMCONST(onethird, 1.0f/3.0f);
XMMCONST(maxerror_init, 10000000.0f);
XMMCONST(maxerror_epsilon, 5.0f);
XMM4CONST(b_half, 0.5f, 1.0f, 1.0f, 1.0f);
XMM4CONST(b_2x, 2.0f, 1.0f, 1.0f, 1.0f);
XMMICONST(invert, 0xffffffff);
XMMCONST(split_point, 2.0f/3.0f);
XMMCONST(deviation_point, 1.0f/3.0f);
XMMCONST(split_point_r300, 5.0f/8.0f);
XMMCONST(deviation_point_r300, 1.0f/4.0f);
XMMICONST(_0f0f0f0f0f0f0f0f, 0x0f0f0f0f);
XMMICONST(_0707070707070707, 0x07070707);
XMMICONST(_000f000f000f000f, 0x000f000f);
XMMICONST(_00f000f000f000f0, 0x00f000f0);

#define SHUFFLE_SELECT(a, b, c, d) (a | (b<<2) | (c<<4) | (d<<6))

#define PARALLEL_ADD_XMM7                                       \
{                                                               \
    __asm movaps xmm6, xmm7                                     \
    __asm movaps xmm5, xmm7                                     \
    __asm shufps xmm7, xmm7, SHUFFLE_SELECT(2, 3, 2, 3)         \
    __asm addps  xmm7, xmm6                                     \
    __asm shufps xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)         \
    __asm addps  xmm7, xmm5                                     \
}

#define AVERAGE xmm0
#define AXIS    xmm1

BYTE expandtable[32] = {
    0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15,
    0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55,
    0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a,
    0x80, 0x82, 0x88, 0x8a, 0xa0, 0xa2, 0xa8, 0xaa,
};
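
// Illustrative sketch (not part of the build): a scalar restatement of how the final
// clustering pass below uses expandtable to build the DXT1 selector bits. The function
// name and parameters are hypothetical; split is right*split_point (2/3) and swap is
// the endpoint-swap flag produced when c0/c1 are ordered.
#if 0
static unsigned char PackFourSelectors(const float pos_on_axis[4],
                                       float split, float centre, int swap)
{
    int m1 = 0, m2 = 0;
    for (int i = 0; i < 4; i++)
    {
        float a = (pos_on_axis[i] < 0.0f) ? -pos_on_axis[i] : pos_on_axis[i];
        if (a < split)                                      // inside the split point: an interpolated colour (+2)
            m2 |= 1 << i;
        if (((pos_on_axis[i] >= centre) ? 1 : 0) ^ swap)    // right-hand half of the axis (+1)
            m1 |= 1 << i;
    }
    // expandtable[m1] spreads bit i of m1 to bit 2i, expandtable[m2+16] spreads bit i
    // of m2 to bit 2i+1, giving the four packed 2-bit selectors for this row.
    return (unsigned char)(expandtable[m1] | expandtable[m2 + 16]);
}
#endif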

#endif //_WIN32

#if !defined(_WIN64) && defined(_WIN32)

#pragma warning( push )
#pragma warning(disable:4100)

void __declspec(naked) __cdecl DXTCV11CompressBlockSSE(DWORD *block_32, DWORD *block_dxtc)
{
    // block_32 and block_dxtc are accessed via esp offsets inside the __asm block below,
    // so the (VS2010) compiler generates warning C4100: unreferenced formal parameter

    __asm
    {
        {
            push    esi
            push    edi
            push    ebx
            push    ebp

#define SP_BLOCK_32     20
#define SP_BLOCK_DXTC   24

            // Set up a 16-byte aligned storage space pointer
            mov     ebp, esp
            and     ebp, ~0x0f

#define EBP_UNIQUES         (16*16)
#define EBP_POS_ON_AXIS     (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS  (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES          (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT         (EBP_VALUES+16)
#define EBP_OLDRIGHT        (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE       (EBP_OLDRIGHT+16)
#define EBP_MAXERROR        (EBP_RIGHTSAVE+4)
#define EBP_ERROR           (EBP_MAXERROR+4)
#define EBP_SWAP            (EBP_ERROR+4)

            // Find the array of unique pixel values and sum them to find their average position
            xorps   AVERAGE, AVERAGE            // average (r, g, b)
            lea     edi, [ebp-EBP_UNIQUES]
            mov     esi, [esp+SP_BLOCK_32]
            mov     ecx, 16
            pxor    mm2, mm2

            {
            average_unique_loop:
                movd        mm0, [esi]
                add         esi, 4
                punpcklbw   mm0, mm2
                movq        mm1, mm0
                punpckhwd   mm0, mm2
                punpcklwd   mm1, mm2
                cvtpi2ps    xmm7, mm0
                movlhps     xmm7, xmm7
                cvtpi2ps    xmm7, mm1

                // colourspace conversion
                // Most of the options in the C compressor are not supported here
#ifdef AXIS_MUNGE
                movaps      xmm6, xmm7
                shufps      xmm6, xmm6, SHUFFLE_SELECT(1, 1, 1, 1)  // G
                addss       xmm7, xmm6
                mulps       xmm7, [b_half]
#endif

                movaps      [edi], xmm7
                add         edi, 16
                addps       AVERAGE, xmm7

                sub         ecx, 1
                jne         average_unique_loop
            }

            // Compute average of the uniques
            mulps   AVERAGE, [one_over_16]
        }

        {
            // For each component, reflect points about the average so all lie on the same side
            // of the average, and compute the new average - this gives a second point that defines the axis
            // To compute the sign of the axis sum the positive differences of G for each of R and B (the
            // G axis is always positive in this implementation)
            xorps   AXIS, AXIS                  // v (r, g, b)
            xorps   xmm2, xmm2                  // rg_pos, rb_pos, bg_pos
            lea     edi, [ebp-EBP_UNIQUES]
            mov     ecx, 16

            {
            find_axis_loop:
                movaps      xmm7, [edi]         // R G B value
                add         edi, 16
                subps       xmm7, AVERAGE       // centred
                movaps      xmm6, xmm7
                movaps      xmm5, xmm7
                andps       xmm7, [clearsign]   // fabs (r, g, b)
                addps       AXIS, xmm7          // direction of axis
                shufps      xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3)  // B R R 0
                shufps      xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3)  // G B G 0
                cmpnltps    xmm6, [zero]        // R/B > 0?
                andps       xmm6, xmm5          // insert the G or B value for those channels which are positive
                addps       xmm2, xmm6          // bg_pos rb_pos rg_pos

                sub         ecx, 1
                jne         find_axis_loop
            }

            mulps   AXIS, [one_over_16]
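
            // Illustrative sketch (not compiled, and only an approximation of the code
            // below): bg_pos/rg_pos accumulate the centred G value over texels whose
            // centred B/R value is non-negative, and rb_pos accumulates B against R.
            // In scalar terms the sign fix-up is roughly:
            //
            //     if (rg_pos != 0 || bg_pos != 0) {
            //         if (rg_pos < 0) axis.r = -axis.r;   // R runs against G
            //         if (bg_pos < 0) axis.b = -axis.b;   // B runs against G
            //     } else if (rb_pos < 0) {
            //         axis.b = -axis.b;                   // only the R/B correlation is left
            //     }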

            // Handle RB_pos - RB_pos is used if RG_pos and BG_pos are both zero.
            movaps  xmm5, xmm2                  // duplicate the pos across these three
            movaps  xmm6, xmm2
            movaps  xmm7, xmm2
            shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3)  // RB_pos 0 ->
            shufps  xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2)  // RG_pos ->
            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)  // BG_pos ->
            orps    xmm6, xmm7
            cmpneqps xmm6, [zero]               // so check for any non-zero in RG_pos or BG_pos
            andps   xmm2, xmm6                  // keep the current pos values if RG_pos or BG_pos is non-zero
            xorps   xmm6, [invert]
            andps   xmm5, xmm6
            orps    xmm2, xmm5                  // insert RB_pos instead

            // Change the sign of the R and B portions of the axis appropriately
            cmpltps xmm2, [zero]
            andps   xmm2, [rb_sign_bits]
            xorps   AXIS, xmm2                  // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
        }

        // Axis projection and remapping
        {
            // Normalise the axis for simplicity of future calculation
            movaps  xmm7, AXIS
            mulps   xmm7, xmm7
            PARALLEL_ADD_XMM7                   // low of xmm7 is the DP result

            // If this is 0 we haven't actually got an axis, and we can't rsq it,
            // so mask the output to 0 in this case. This generates an acceptable result
            movaps  xmm2, xmm7
            cmpneqps xmm2, [zero]

#if 1
            // RSQRT with Newton-Raphson. This can be omitted for even faster encoding
            // performance, but quality and consistency improve with it on certain images.
            // It's not a large cost so leave it in.
            rsqrtps xmm3, xmm7
            andps   xmm3, xmm2

            movaps  xmm2, xmm3
            mulps   xmm3, xmm7
            mulps   xmm3, xmm2
            mulps   xmm2, [half]
            movaps  xmm7, [c3]
            subps   xmm7, xmm3
            mulps   xmm7, xmm2
#else
            // No Newton-Raphson method
            rsqrtps xmm7, xmm7
            andps   xmm7, xmm2
#endif

            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

            // Normalise
            mulps   AXIS, xmm7
        }

#define LEFT    xmm2
#define RIGHT   xmm3

        // Map the axis
        {
            lea     edi, [ebp-EBP_UNIQUES]
            lea     edx, [ebp-EBP_POS_ON_AXIS]
            mov     ecx, 16
            movaps  LEFT, [lots]
            movaps  RIGHT, [minuslots]
            xorps   xmm4, xmm4                  // axis mapping error

            {
            map_axis_loop:
                movaps      xmm7, [edi]
                subps       xmm7, AVERAGE
                mulps       xmm7, AXIS
                PARALLEL_ADD_XMM7
                movss       [edx], xmm7
                add         edx, 4

                // xmm7 == pos_on_axis
                minss       LEFT, xmm7          // calculate left
                maxss       RIGHT, xmm7         // calculate right

                add         edi, 16
                sub         ecx, 1
                jne         map_axis_loop
            }
        }

        shufps  LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
        shufps  RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)

        {
            // Now we have a good axis and the basic information about how the points are mapped
            // to it
            // Our initial guess is to represent the endpoints accurately, by moving the average
            // to the centre and recalculating the point positions along the line

            // Calculate centre
            movaps  xmm7, LEFT
            addps   xmm7, RIGHT
            mulps   xmm7, [half]

            // Offset all the axis positions to the centre
            lea     edi, [ebp-EBP_POS_ON_AXIS]
            movaps  xmm5, [edi]
            movaps  xmm6, [edi+16]
            subps   xmm5, xmm7
            subps   xmm6, xmm7
            movaps  [edi], xmm5
            movaps  [edi+16], xmm6
            movaps  xmm5, [edi+32]
            movaps  xmm6, [edi+48]
            subps   xmm5, xmm7
            subps   xmm6, xmm7
            movaps  [edi+32], xmm5
            movaps  [edi+48], xmm6

            // Offset left, right and average to centre
            subps   LEFT, xmm7
            subps   RIGHT, xmm7
            mulps   xmm7, AXIS
            addps   AVERAGE, xmm7
        }

#define PROGRESSIVE_REFINEMENT
#ifdef PROGRESSIVE_REFINEMENT
        {
            // Attempt a (simple) progressive refinement step to reduce noise in the
            // output image by trying to find a better overall match for the endpoints
            // than the first-guess solution found so far (which is just to take the ends).
            // The method is to move the endpoints inwards until a local minimum is found.
            // This provides quite a significant improvement in image quality.
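
            // Illustrative sketch (not compiled) of the refinement loop below, assuming
            // block_error(l, r) is the summed squared distance of every pos_on_axis value
            // from the palette position it would be assigned by the clustering rule:
            //
            //     float max_error = maxerror_init;
            //     float old_l = left, old_r = right;
            //     for (;;) {
            //         float err = block_error(left, right);
            //         if (err >= max_error) break;            // stopped improving
            //         max_error = err - maxerror_epsilon;      // demand a real improvement
            //         old_l = left;  old_r = right;
            //         left *= stepsize;  right *= stepsize;    // step both endpoints inwards
            //     }                                            // (they are centred about 0)
            //     left = old_l;  right = old_r;                // keep the best pair found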

            mov     eax, [maxerror_init]
            mov     [ebp-EBP_MAXERROR], eax
            movaps  [ebp-EBP_OLDLEFT], LEFT
            movaps  [ebp-EBP_OLDRIGHT], RIGHT

            lea     edx, [expandtable]
            lea     edi, [ebp-EBP_VALUES]

            {
            next_refinement_loop:
                movaps      [ebp-EBP_RIGHTSAVE], RIGHT

                mov         eax, 0
                mov         [ebp-EBP_ERROR], eax

                lea         ecx, split_point

                movss       xmm7, LEFT
                addss       xmm7, RIGHT
                mulss       xmm7, [half]

                movss       [edi], LEFT
                movss       [edi+4], RIGHT

                movss       xmm5, RIGHT
                movss       xmm6, xmm7
                subss       xmm5, xmm7          // right-centre
                movss       xmm4, xmm7
                movss       RIGHT, xmm5
                mulss       xmm5, [ecx+16]      //[deviation_point]
                subss       xmm6, xmm5
                addss       xmm4, xmm5
                movss       [edi+8], xmm6
                movss       [edi+12], xmm4

                mulps       RIGHT, [ecx]        //[split_point]

                mov         ecx, 16
                lea         esi, [ebp-EBP_POS_ON_AXIS]

                {
                next_builderror_loop:
                    movss       xmm4, [esi]
                    add         esi, 4
                    movaps      xmm5, xmm4
                    movaps      xmm6, xmm4

                    andps       xmm4, [clearsign]
                    cmpltss     xmm4, RIGHT     // < division means 2
                    cmpnltss    xmm5, xmm7      // >= centre means add 1

                    movmskps    eax, xmm4
                    movmskps    ebx, xmm5

                    movzx       eax, byte ptr [edx+eax+16]
                    movzx       ebx, byte ptr [edx+ebx]
                    or          eax, ebx
                    and         eax, 3

                    subss       xmm6, [edi+4*eax]
                    mulss       xmm6, xmm6
                    addss       xmm6, [ebp-EBP_ERROR]
                    movss       [ebp-EBP_ERROR], xmm6

                    sub         ecx, 1
                    jne         next_builderror_loop
                }

                movaps      RIGHT, [ebp-EBP_RIGHTSAVE]

#if 1
                mov         eax, [ebp-EBP_ERROR]
                mov         ebx, [ebp-EBP_MAXERROR]
                cmp         eax, ebx
                jge         refinement_done
#else
                movss       xmm5, [ebp-EBP_MAXERROR]
                cmpltss     xmm5, xmm6
                movss       [ebp-EBP_ERROR], xmm5
                mov         eax, [ebp-EBP_ERROR]
                test        eax, eax
                jnz         refinement_done
#endif

                subss       xmm6, [maxerror_epsilon]
                movss       [ebp-EBP_MAXERROR], xmm6

                movaps      [ebp-EBP_OLDLEFT], LEFT
                movaps      [ebp-EBP_OLDRIGHT], RIGHT

                mulps       LEFT, [stepsize]
                mulps       RIGHT, [stepsize]

                jmp         next_refinement_loop

            refinement_done:
                movaps      LEFT, [ebp-EBP_OLDLEFT]
                movaps      RIGHT, [ebp-EBP_OLDRIGHT]
            }
        }
#endif

        {
            // Calculate the high and low output colour values
            // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
            // straight rounded average is not correct, as the decompressor 'unrounds' by replicating
            // the top bits to the bottom.
            // In order to take account of this process, we don't just apply a straight rounding correction,
            // but base our rounding on the input value (a straight rounding is actually pretty good in terms of
            // error measure, but creates a visual colour and/or brightness shift relative to the original image).
            // The method used here is to apply a centre-biased rounding dependent on the input value, which was
            // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
            // the image.
            //     rgb = (average_rgb + (left|right)*v_rgb);
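
            // Illustrative sketch (not compiled): in scalar terms, for channel values
            // r, g, b already clamped to 0-255, the quantisation below is
            //
            //     r5 = (r + 4 - (r >> 5)) & 0xf8;     // 5-bit channels (R and B)
            //     b5 = (b + 4 - (b >> 5)) & 0xf8;
            //     g6 = (g + 2 - (g >> 6)) & 0xfc;     // 6-bit channel (G)
            //     c565 = (r5 << 8) | (g6 << 3) | (b5 >> 3);
            //
            // i.e. the bias term mirrors the decompressor's replication of the top bits
            // into the bottom bits.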

            movaps  xmm6, LEFT
            movaps  xmm7, RIGHT
            mulps   xmm6, AXIS
            mulps   xmm7, AXIS
            addps   xmm6, AVERAGE
            addps   xmm7, AVERAGE

#ifdef AXIS_MUNGE
            // Undo the colourspace munge: scale the munged B component back up, then
            // subtract the green component from it in each endpoint
            movaps  xmm4, xmm6
            movaps  xmm5, xmm7
            mulps   xmm6, [b_2x]
            mulps   xmm7, [b_2x]
            shufps  xmm4, xmm4, SHUFFLE_SELECT(1, 1, 1, 1)
            shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 1, 1, 1)
            subss   xmm6, xmm4
            subss   xmm7, xmm5
#endif

            // Rearrange so B and R are in the same register half (they both use 5-bit rounding)
            shufps  xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3)  // B R G
            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

            // Convert to integer (by truncation, as C code does)
            cvttps2pi mm0, xmm6
            cvttps2pi mm1, xmm7
            movhlps xmm6, xmm6
            movhlps xmm7, xmm7
            cvttps2pi mm2, xmm6
            cvttps2pi mm3, xmm7

            // mm0/1 is blue/red, mm2/3 is green

            // This isn't quite the same as the C algorithm, but should generate the same result
            // if the input range is ensured to be 0-255
            // This code could be heavily interleaved, but for P4 it's not worth the hassle - the
            // P4 reordering range of 15 instructions will let it do the job for us
            pmaxsw  mm0, [clamp_0]              // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
            pmaxsw  mm1, [clamp_0]
            pminsw  mm0, [clamp_255]
            pminsw  mm1, [clamp_255]

            movq    mm6, mm0
            movq    mm7, mm1
            paddd   mm0, [blue_red_rounding]
            paddd   mm1, [blue_red_rounding]
            psrld   mm6, 5
            psrld   mm7, 5
            psubd   mm0, mm6
            psubd   mm1, mm7

            // No need to clamp here, with the input in 0-255 range it can never be outside at the end
            pand    mm0, [mask_blue_red]
            pand    mm1, [mask_blue_red]

            // Separate out R and B as they will need separate shifts later
            pshufw  mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3)    // extract R (this is an SSE, not MMX, instruction)
            pshufw  mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3)    // also R

            pmaxsw  mm2, [clamp_0]
            pmaxsw  mm3, [clamp_0]
            pminsw  mm2, [clamp_255]
            pminsw  mm3, [clamp_255]

            movq    mm6, mm2
            movq    mm7, mm3
            paddd   mm2, [green_rounding]
            paddd   mm3, [green_rounding]
            psrld   mm6, 6
            psrld   mm7, 6
            psubd   mm2, mm6
            psubd   mm3, mm7
            pand    mm2, [mask_green]
            pand    mm3, [mask_green]

            // Convert the 8-bit values to final RGB565 colours in mm0 and mm1
            psrld   mm0, 3
            psrld   mm1, 3
            pslld   mm4, 8
            pslld   mm5, 8
            pslld   mm2, 3
            pslld   mm3, 3

            por     mm0, mm4
            por     mm1, mm5
            por     mm0, mm2
            por     mm1, mm3

            // mm0 and mm1 are c0 and c1
            // Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
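
            // Illustrative sketch (not compiled) of the ordering rule enforced below: in a
            // DXT1 block c0 > c1 selects the four-colour mode, so the endpoint pair is
            // written in that order and a swap mask is kept to flip the selectors later:
            //
            //     swap  = (c0 <= c1);                 // stored as an all-ones/all-zeros mask
            //     equal = (c0 == c1);                 // selectors are left at 0 in this case
            //     write16(swap ? c1 : c0);
            //     write16(swap ? c0 : c1);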
            mov     edi, [esp+SP_BLOCK_DXTC]

#if 1
            pxor    mm5, mm5
            punpcklwd mm0, mm5                  // unpack c0/c1 to DWORDs as pcmp is a signed comparison
            punpcklwd mm1, mm5

            movq    mm2, mm0
            movq    mm3, mm0
            movq    mm4, mm0

            pcmpgtd mm2, mm1
            pxor    mm2, [invert]               // Need less than, so flip the result
            movd    [ebp-EBP_SWAP], mm2         // Set the swap flag (used below) appropriately
            // mm2 is the mask to indicate flipping is needed

            pcmpeqd mm4, mm1
            movd    ebx, mm4                    // ebx is the equality flag, plenty of time for this slow move to resolve

            punpcklwd mm0, mm1                  // 'normal' order
            punpcklwd mm1, mm3                  // reversed order

            pand    mm1, mm2
            pandn   mm2, mm0
            por     mm1, mm2                    // one of the two, selected by mm2

            movd    [edi], mm1                  // write the result
#else
            // No compare simple version
            punpcklwd mm0, mm1
            mov     dword ptr [ebp-EBP_SWAP], 0
            xor     ebx, ebx
            movd    [edi], mm0
//          punpcklwd mm1, mm0
//          movq    mm0, mm1
//          mov     dword ptr [ebp-EBP_SWAP], 0xffffffff
#endif
        }

        // Clear the output bitmasks
        add     edi, 4
        mov     dword ptr [edi], 0

        // If the values are equal, the bit selector is 0 because the two colours are
        // the same (which implies transparent)
        // This seems the easiest way to do it, and will only rarely break branch prediction on
        // typical images.
        test    ebx, ebx
        jnz     all_done

        {
            // Final clustering, creating the 2-bit values that define the output
            lea     ecx, split_point
            movaps  xmm7, RIGHT
            mulps   RIGHT, [ecx]                // split point
            addps   xmm7, LEFT
            mulps   xmm7, [half]                // centre (probably 0, but what the hell)

            lea     esi, [ebp-EBP_POS_ON_AXIS]
            lea     edx, [expandtable]

            movss   xmm6, [ebp-EBP_SWAP]
            shufps  xmm6, xmm6, 0

            mov     ecx, 4

            {
            next_bit_loop:
                // Do 4 at once
                movaps      xmm4, [esi]         // Read the four pos_on_axis entries
                add         esi, 16
                movaps      xmm5, xmm4

                andps       xmm4, [clearsign]
                cmpltps     xmm4, RIGHT         // < division means 2
                cmpnltps    xmm5, xmm7          // >= centre means add 1
                xorps       xmm5, xmm6          // Swap the order if we had to flip our colours

                movmskps    eax, xmm4
                movmskps    ebx, xmm5

                movzx       eax, byte ptr [edx+eax+16]
                movzx       ebx, byte ptr [edx+ebx]
                or          eax, ebx

                mov         byte ptr [edi], al
                add         edi, 1

                sub         ecx, 1
                jne         next_bit_loop
            }
        }

    all_done:
        emms

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret
    }
}


void __declspec(naked) __cdecl DXTCV11CompressBlockSSEMinimal(DWORD *block_32, DWORD *block_dxtc)
{
#define SP_BLOCK_32     20
#define SP_BLOCK_DXTC   24

#define EBP_UNIQUES         (16*16)
#define EBP_POS_ON_AXIS     (EBP_UNIQUES+(16*4))
#define EBP_DIST_FROM_AXIS  (EBP_POS_ON_AXIS+(16*4))
#define EBP_VALUES          (EBP_DIST_FROM_AXIS+16)
#define EBP_OLDLEFT         (EBP_VALUES+16)
#define EBP_OLDRIGHT        (EBP_OLDLEFT+16)
#define EBP_RIGHTSAVE       (EBP_OLDRIGHT+16)
#define EBP_MAXERROR        (EBP_RIGHTSAVE+4)
#define EBP_ERROR           (EBP_MAXERROR+4)
#define EBP_SWAP            (EBP_ERROR+4)

    // block_32 and block_dxtc are accessed via esp offsets inside the __asm block below,
    // so the (VS2010) compiler generates warning C4100: unreferenced formal parameter

    __asm
    {
        push    esi
        push    edi
        push    ebx
        push    ebp

        // Set up a 16-byte aligned storage space pointer
        mov     ebp, esp
        and     ebp, ~0x0f

        // init to 0
        xorps   AVERAGE, AVERAGE                // average (r, g, b)

        // -------------------------------------------------------------------------------------
        // (3) Find the array of unique pixel values and sum them to find their average position
        // -------------------------------------------------------------------------------------
        {
            lea     edi, [ebp-EBP_UNIQUES]
            mov     esi, [esp+SP_BLOCK_32]
            mov     ecx, 16
            pxor    mm2, mm2

        average_unique_loop:
            {
                movd        mm0, [esi]
                add         esi, 4
                punpcklbw   mm0, mm2
                movq        mm1, mm0
                punpckhwd   mm0, mm2
                punpcklwd   mm1, mm2
                cvtpi2ps    xmm7, mm0
                movlhps     xmm7, xmm7
                cvtpi2ps    xmm7, mm1

                // colourspace conversion
                movaps      [edi], xmm7
                add         edi, 16
                addps       AVERAGE, xmm7

                sub         ecx, 1
                jne         average_unique_loop
            }

            // Compute average of the uniques
            mulps   AVERAGE, [one_over_16]
        }

        // -------------------------------------------------------------------------------------
        // (4) For each component, reflect points about the average so all lie on the same side
        //     of the average, and compute the new average - this gives a second point that defines the axis
        //     To compute the sign of the axis sum the positive differences of G for each of R and B (the
        //     G axis is always positive in this implementation)
        // -------------------------------------------------------------------------------------
        {
            xorps   AXIS, AXIS                  // v (r, g, b)
            xorps   xmm2, xmm2                  // rg_pos, rb_pos, bg_pos
            lea     edi, [ebp-EBP_UNIQUES]
            mov     ecx, 16

        find_axis_loop:
            {
                movaps      xmm7, [edi]         // R G B value
                add         edi, 16
                subps       xmm7, AVERAGE       // centred
                movaps      xmm6, xmm7
                movaps      xmm5, xmm7
                andps       xmm7, [clearsign]   // fabs (r, g, b)
                addps       AXIS, xmm7          // direction of axis
                shufps      xmm6, xmm6, SHUFFLE_SELECT(0, 2, 2, 3)  // B R R 0
                shufps      xmm5, xmm5, SHUFFLE_SELECT(1, 0, 1, 3)  // G B G 0
                cmpnltps    xmm6, [zero]        // R/B > 0?
                andps       xmm6, xmm5          // insert the G or B value for those channels which are positive
                addps       xmm2, xmm6          // bg_pos rb_pos rg_pos

                sub         ecx, 1
                jne         find_axis_loop
            }

            mulps   AXIS, [one_over_16]

            // Handle RB_pos - RB_pos is used if RG_pos and BG_pos are both zero.
            movaps  xmm5, xmm2                  // duplicate the pos across these three
            movaps  xmm6, xmm2
            movaps  xmm7, xmm2
            shufps  xmm5, xmm5, SHUFFLE_SELECT(1, 3, 3, 3)  // RB_pos 0 ->
            shufps  xmm6, xmm6, SHUFFLE_SELECT(2, 2, 2, 2)  // RG_pos ->
            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)  // BG_pos ->
            orps    xmm6, xmm7
            cmpneqps xmm6, [zero]               // so check for any non-zero in RG_pos or BG_pos
            andps   xmm2, xmm6                  // keep the current pos values if RG_pos or BG_pos is non-zero
            xorps   xmm6, [invert]
            andps   xmm5, xmm6
            orps    xmm2, xmm5                  // insert RB_pos instead

            // Change the sign of the R and B portions of the axis appropriately
            cmpltps xmm2, [zero]
            andps   xmm2, [rb_sign_bits]
            xorps   AXIS, xmm2                  // Flip the sign of the axis if the r/g or b/g tests indicate a negative slope
        }

        // -------------------------------------------------------------------------------------
        // (5) Axis projection and remapping
        // -------------------------------------------------------------------------------------
        {
            // Normalise the axis for simplicity of future calculation
            movaps  xmm7, AXIS
            mulps   xmm7, xmm7
            PARALLEL_ADD_XMM7                   // low of xmm7 is the DP result

            // If this is 0 we haven't actually got an axis, and we can't rsq it,
            // so mask the output to 0 in this case. This generates an acceptable result
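
            // Note: unlike DXTCV11CompressBlockSSE above, this minimal variant uses the raw
            // rsqrtps approximation (no Newton-Raphson step) and also omits the AXIS_MUNGE
            // weighting and the progressive refinement pass, trading some quality for speed.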
            movaps  xmm2, xmm7
            cmpneqps xmm2, [zero]

            rsqrtps xmm7, xmm7
            andps   xmm7, xmm2

            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 0, 0, 0)

            // Normalise
            mulps   AXIS, xmm7
        }

#define LEFT    xmm2
#define RIGHT   xmm3

        // -------------------------------------------------------------------------------------
        // (6) Map the axis
        // -------------------------------------------------------------------------------------
        {
            lea     edi, [ebp-EBP_UNIQUES]
            lea     edx, [ebp-EBP_POS_ON_AXIS]
            mov     ecx, 16
            movaps  LEFT, [lots]
            movaps  RIGHT, [minuslots]
            xorps   xmm4, xmm4                  // axis mapping error

            {
            map_axis_loop:
                movaps      xmm7, [edi]
                subps       xmm7, AVERAGE
                mulps       xmm7, AXIS
                PARALLEL_ADD_XMM7
                movss       [edx], xmm7
                add         edx, 4

                // xmm7 == pos_on_axis
                minss       LEFT, xmm7          // calculate left
                maxss       RIGHT, xmm7         // calculate right

                add         edi, 16
                sub         ecx, 1
                jne         map_axis_loop
            }
        }

        shufps  LEFT, LEFT, SHUFFLE_SELECT(0, 0, 0, 0)
        shufps  RIGHT, RIGHT, SHUFFLE_SELECT(0, 0, 0, 0)

        // -------------------------------------------------------------------------------------
        // (7) Now we have a good axis and the basic information about how the points are mapped
        //     to it
        //     Our initial guess is to represent the endpoints accurately, by moving the average
        //     to the centre and recalculating the point positions along the line
        // -------------------------------------------------------------------------------------
        {
            // Calculate centre
            movaps  xmm7, LEFT
            addps   xmm7, RIGHT
            mulps   xmm7, [half]

            // Offset all the axis positions to the centre
            lea     edi, [ebp-EBP_POS_ON_AXIS]
            movaps  xmm5, [edi]
            movaps  xmm6, [edi+16]
            subps   xmm5, xmm7
            subps   xmm6, xmm7
            movaps  [edi], xmm5
            movaps  [edi+16], xmm6
            movaps  xmm5, [edi+32]
            movaps  xmm6, [edi+48]
            subps   xmm5, xmm7
            subps   xmm6, xmm7
            movaps  [edi+32], xmm5
            movaps  [edi+48], xmm6

            // Offset left, right and average to centre
            subps   LEFT, xmm7
            subps   RIGHT, xmm7
            mulps   xmm7, AXIS
            addps   AVERAGE, xmm7
        }

        // -------------------------------------------------------------------------------------
        // (8) Calculate the high and low output colour values
        //     Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
        //     straight rounded average is not correct, as the decompressor 'unrounds' by replicating
        //     the top bits to the bottom.
        //     In order to take account of this process, we don't just apply a straight rounding correction,
        //     but base our rounding on the input value (a straight rounding is actually pretty good in terms of
        //     error measure, but creates a visual colour and/or brightness shift relative to the original image).
        //     The method used here is to apply a centre-biased rounding dependent on the input value, which was
        //     (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
        //     the image.
        //     rgb = (average_rgb + (left|right)*v_rgb);
        // -------------------------------------------------------------------------------------
        {
            movaps  xmm6, LEFT
            movaps  xmm7, RIGHT
            mulps   xmm6, AXIS
            mulps   xmm7, AXIS
            addps   xmm6, AVERAGE
            addps   xmm7, AVERAGE

            // Rearrange so B and R are in the same register half (they both use 5-bit rounding)
            shufps  xmm6, xmm6, SHUFFLE_SELECT(0, 2, 1, 3)  // B R G
            shufps  xmm7, xmm7, SHUFFLE_SELECT(0, 2, 1, 3)

            // Convert to integer (by truncation, as C code does)
            cvttps2pi mm0, xmm6
            cvttps2pi mm1, xmm7
            movhlps xmm6, xmm6
            movhlps xmm7, xmm7
            cvttps2pi mm2, xmm6
            cvttps2pi mm3, xmm7

            // mm0/1 is blue/red, mm2/3 is green

            // This isn't quite the same as the C algorithm, but should generate the same result
            // if the input range is ensured to be 0-255
            // This code could be heavily interleaved, but for P4 it's not worth the hassle - the
            // P4 reordering range of 15 instructions will let it do the job for us
            pmaxsw  mm0, [clamp_0]              // Note: faster to do these max/min in MMX than float XMM - better reordering opportunities
            pmaxsw  mm1, [clamp_0]
            pminsw  mm0, [clamp_255]
            pminsw  mm1, [clamp_255]

            movq    mm6, mm0
            movq    mm7, mm1
            paddd   mm0, [blue_red_rounding]
            paddd   mm1, [blue_red_rounding]
            psrld   mm6, 5
            psrld   mm7, 5
            psubd   mm0, mm6
            psubd   mm1, mm7

            // No need to clamp here, with the input in 0-255 range it can never be outside at the end
            pand    mm0, [mask_blue_red]
            pand    mm1, [mask_blue_red]

            // Separate out R and B as they will need separate shifts later
            pshufw  mm4, mm0, SHUFFLE_SELECT(2, 3, 2, 3)    // extract R (this is an SSE, not MMX, instruction)
            pshufw  mm5, mm1, SHUFFLE_SELECT(2, 3, 2, 3)    // also R

            pmaxsw  mm2, [clamp_0]
            pmaxsw  mm3, [clamp_0]
            pminsw  mm2, [clamp_255]
            pminsw  mm3, [clamp_255]

            movq    mm6, mm2
            movq    mm7, mm3
            paddd   mm2, [green_rounding]
            paddd   mm3, [green_rounding]
            psrld   mm6, 6
            psrld   mm7, 6
            psubd   mm2, mm6
            psubd   mm3, mm7
            pand    mm2, [mask_green]
            pand    mm3, [mask_green]

            // Convert the 8-bit values to final RGB565 colours in mm0 and mm1
            psrld   mm0, 3
            psrld   mm1, 3
            pslld   mm4, 8
            pslld   mm5, 8
            pslld   mm2, 3
            pslld   mm3, 3

            por     mm0, mm4
            por     mm1, mm5
            por     mm0, mm2
            por     mm1, mm3

            // mm0 and mm1 are c0 and c1
            // Need to compare c0/c1 for sign and flip and set swap if required - and handle colour equality as well....
            mov     edi, [esp+SP_BLOCK_DXTC]

            pxor    mm5, mm5
            punpcklwd mm0, mm5                  // unpack c0/c1 to DWORDs as pcmp is a signed comparison
            punpcklwd mm1, mm5

            movq    mm2, mm0
            movq    mm3, mm0
            movq    mm4, mm0

            pcmpgtd mm2, mm1
            pxor    mm2, [invert]               // Need less than, so flip the result
            movd    [ebp-EBP_SWAP], mm2         // Set the swap flag (used below) appropriately
            // mm2 is the mask to indicate flipping is needed

            pcmpeqd mm4, mm1
            movd    ebx, mm4                    // ebx is the equality flag, plenty of time for this slow move to resolve

            punpcklwd mm0, mm1                  // 'normal' order
            punpcklwd mm1, mm3                  // reversed order

            pand    mm1, mm2
            pandn   mm2, mm0
            por     mm1, mm2                    // one of the two, selected by mm2

            movd    [edi], mm1                  // write the result
        }

        // Clear the output bitmasks
        add     edi, 4
        mov     dword ptr [edi], 0

        // If the values are equal, the bit selector is 0 because the two colours are
        // the same (which implies transparent)
        // This seems the easiest way to do it, and will only rarely break branch prediction on
        // typical images.
        test    ebx, ebx
        jnz     all_done

        // -------------------------------------------------------------------------------------
        // (9) Final clustering, creating the 2-bit values that define the output
        // -------------------------------------------------------------------------------------
        {
            lea     ecx, split_point
            movaps  xmm7, RIGHT
            mulps   RIGHT, [ecx]                // split point
            addps   xmm7, LEFT
            mulps   xmm7, [half]                // centre (probably 0, but what the hell)

            lea     esi, [ebp-EBP_POS_ON_AXIS]
            lea     edx, [expandtable]

            movss   xmm6, [ebp-EBP_SWAP]
            shufps  xmm6, xmm6, 0

            mov     ecx, 4

            {
            next_bit_loop:
                // Do 4 at once
                movaps      xmm4, [esi]         // Read the four pos_on_axis entries
                add         esi, 16
                movaps      xmm5, xmm4

                andps       xmm4, [clearsign]
                cmpltps     xmm4, RIGHT         // < division means 2
                cmpnltps    xmm5, xmm7          // >= centre means add 1
                xorps       xmm5, xmm6          // Swap the order if we had to flip our colours

                movmskps    eax, xmm4
                movmskps    ebx, xmm5

                movzx       eax, byte ptr [edx+eax+16]
                movzx       ebx, byte ptr [edx+ebx]
                or          eax, ebx

                mov         byte ptr [edi], al
                add         edi, 1

                sub         ecx, 1
                jne         next_bit_loop
            }
        }

    all_done:
        emms

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret
    }   // _asm
}

#pragma warning( pop )

#endif // !defined(_WIN64) && defined(_WIN32)