//===================================================================== // Copyright 2020 (c), Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // //===================================================================== #include "cmp_math_common.h" #include "cmp_math_vec4.h" #ifndef ASPM_GPU //--------------------------------------------- // CPU: Computes square root of a float value //--------------------------------------------- float cpu_sqrtf(float * pIn) { //printf("native : "); return sqrtf(*pIn); } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ //--------------------------------------------- // SSE: Computes square root of a float value //--------------------------------------------- float sse_sqrtf( float *pIn ) { //printf("sse : "); __m128 val = _mm_load1_ps(pIn); val = _mm_sqrt_ss(val); return val.m128_f32[0]; } #endif #endif //------------------------------------------------- // CPU: Computes 1 / (square root of a float value) //------------------------------------------------- float cpu_rsqf(float *f) { float sf = cmp_sqrtf2(f); if (sf != 0) return 1 / cmp_sqrtf2(f); else return 0.0f; } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ //------------------------------------------------- // SSE: Computes 1 / (square root of a float value) //------------------------------------------------- #ifdef CMP_USE_RSQ_RSQR float sse_rsqf(float* v) { __m128 val = _mm_load1_ps(v); val = _mm_rsqrt_ss(val); float frsq = val.m128_f32[0]; return (0.5f * frsq) * (3.0f - (*v * frsq) * frsq); }; #else float sse_rsqf(float *v) { __m128 val = _mm_set_ss(*v); // Copy float and zero the upper 3 elements __m128 val1 = _mm_set_ss(1.0f); val = _mm_sqrt_ss(val); val = _mm_div_ss(val1, val); return ( val.m128_f32[0] ); }; #endif #endif #endif //--------------------------------------------- // CPU: Computes min of two float values //--------------------------------------------- float cpu_minf(float l1, float r1) { return (l1 < r1 ? l1 : r1); } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ float sse_minf( float a, float b ) { // Branchless SSE min. _mm_store_ss( &a, _mm_min_ss(_mm_set_ss(a),_mm_set_ss(b)) ); return a; } #endif #endif //--------------------------------------------- // CPU: Computes max of two float values //--------------------------------------------- float cpu_maxf(float l1, float r1) { return (l1 > r1 ? l1 : r1); } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ float sse_maxf( float a, float b ) { // Branchless SSE max. _mm_store_ss( &a, _mm_max_ss(_mm_set_ss(a),_mm_set_ss(b)) ); return a; } #endif #endif //================================================ // Clamp the value in the range [minval .. maxval] //================================================ float cpu_clampf(float value, float minval, float maxval) { if (value < minval) { return(minval); } else if (value > maxval) { return(maxval); } return value; } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ float sse_clampf( float val, float minval, float maxval ) { _mm_store_ss( &val, _mm_min_ss( _mm_max_ss(_mm_set_ss(val),_mm_set_ss(minval)), _mm_set_ss(maxval) ) ); return val; } #endif #endif void cpu_averageRGB(unsigned char *src_rgba_block) { float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f; for (CGU_UINT32 k = 0; k<16; k++) { CGU_UINT32 R = (src_rgba_block[k] & 0xff0000) >> 16; CGU_UINT32 G = (src_rgba_block[k] & 0xff00) >> 8; CGU_UINT32 B = src_rgba_block[k] & 0xff; medianR += R; medianG += G; medianB += B; } medianR /= 16; medianG /= 16; medianB /= 16; // Now skew the colour weightings based on the gravity center of the block float largest = cmp_maxf2(cmp_maxf2(medianR,medianG), medianB); if (largest > 0) { medianR /= largest; medianG /= largest; medianB /= largest; } else medianR = medianG = medianB = 1.0f; } float cpu_lerp2(CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) { // Initial Setup CMP_Vec4uc P[4]; int diff1; int diff2; int min1 = 0x1FF; int min2 = 0x1FF; float gradA[4] = { 0.0f, 0.3f, 0.7f, 1.0f }; float D[3]; D[0] = (float)C2.x - C1.x; D[1] = (float)C2.y - C1.y; D[2] = (float)C2.z - C1.z; for (int i = 0; i < 4; i++) { P[i].x = (CMP_MATH_BYTE)(C1.x + (gradA[i] * D[0])); P[i].y = (CMP_MATH_BYTE)(C1.y + (gradA[i] * D[1])); P[i].z = (CMP_MATH_BYTE)(C1.z + (gradA[i] * D[2])); // Now check if its closer to C1 diff1 = abs(CA.x - P[i].x) + abs(CA.y - P[i].y) + abs(CA.z - P[i].z); if (diff1 < min1) { *encode1 = (CMP_MATH_BYTE)i; min1 = diff1; } // Now check if its closer to C2 diff2 = abs(CB.x - P[i].x) + abs(CB.y - P[i].y) + abs(CB.z - P[i].z); if (diff2 < min2) { *encode2 = (CMP_MATH_BYTE)i; min2 = diff2; } } return float(min1+min2); } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ float sse_lerp2(CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) { // Initial Setup __m128 iC1, iC2, iCA, iCB; //Load auchars into _m128 __m128 iP[4]; __m128 idiff1, idiff2; float min = 511.0f; __m128 imin1 = _mm_set_ps1(min); __m128 imin2 = _mm_set_ps1(min); __m128 absMask = _mm_set_ps1(-0.0); __m128 zero = _mm_set_ps1(0.0); //Aplha channels have been disabled iC1 = _mm_set_ps(0, C1.z, C1.y, C1.x); iC2 = _mm_set_ps(0, C2.z, C2.y, C2.x); iCA = _mm_set_ps(0, CA.z, CA.y, CA.x); iCB = _mm_set_ps(0, CB.z, CB.y, CB.x); //Direction Vector __m128 iD; iD = _mm_sub_ps(iC2, iC1); float gradA[4] = { 0.0f, 0.3f, 0.7f, 1.0f }; for (int i = 0; i < 4; i++) { __m128 currentGradA = _mm_set_ps1(gradA[i]); iP[i] = _mm_add_ps(_mm_mul_ps(currentGradA, iD), iC1); //Calculate SAD (Sum of Absolute Difference) __m128 iSub1 = _mm_sub_ps(iCA, iP[i]); __m128 iSub2 = _mm_sub_ps(iCB, iP[i]); __m128 iAbs1 = _mm_andnot_ps(absMask, iSub1); // Computes the absolute value __m128 iAbs2 = _mm_andnot_ps(absMask, iSub2); idiff1 = _mm_hadd_ps(iAbs1, zero); idiff1 = _mm_hadd_ps(idiff1, zero); idiff2 = _mm_hadd_ps(iAbs2, zero); idiff2 = _mm_hadd_ps(idiff2, zero); if (_mm_comilt_ss(idiff1, imin1)) { *encode1 = (CMP_MATH_BYTE)i; imin1 = idiff1; } if (_mm_comilt_ss(idiff2, imin2)) { *encode2 = (CMP_MATH_BYTE)i; imin2 = idiff2; } } // Convert __m128 back to floats float result1[4]; float result2[4]; _mm_storeu_ps (result1, imin1); _mm_storeu_ps (result2, imin2); return (result1[0] + result2[0]); } float fma_lerp2(CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) { // Initial Setup __m128 iC1, iC2, iCA, iCB; //Load auchars into _m128 __m128 iP[4]; __m128 idiff1, idiff2; float min = 511.0f; __m128 imin1 = _mm_set_ps1(min); __m128 imin2 = _mm_set_ps1(min); __m128 absMask = _mm_set_ps1(-0.0); __m128 zero = _mm_set_ps1(0.0); //Aplha channels have been disabled iC1 = _mm_set_ps(0, C1.z, C1.y, C1.x); iC2 = _mm_set_ps(0, C2.z, C2.y, C2.x); iCA = _mm_set_ps(0, CA.z, CA.y, CA.x); iCB = _mm_set_ps(0, CB.z, CB.y, CB.x); //Direction Vector __m128 iD; iD = _mm_sub_ps(iC2, iC1); float gradA[4] = { 0.0f, 0.3f, 0.7f, 1.0f }; for (int i = 0; i < 4; i++) { __m128 currentGradA = _mm_set_ps1(gradA[i]); iP[i] = _mm_fmadd_ps(currentGradA, iD, iC1); //Calculate SAD (Sum of Absolute Difference) __m128 iSub1 = _mm_sub_ps(iCA, iP[i]); __m128 iSub2 = _mm_sub_ps(iCB, iP[i]); __m128 iAbs1 = _mm_andnot_ps(absMask, iSub1); // Computes the absolute value __m128 iAbs2 = _mm_andnot_ps(absMask, iSub2); idiff1 = _mm_hadd_ps(iAbs1, zero); idiff1 = _mm_hadd_ps(idiff1, zero); idiff2 = _mm_hadd_ps(iAbs2, zero); idiff2 = _mm_hadd_ps(idiff2, zero); if (_mm_comilt_ss(idiff1, imin1)) { *encode1 = (CMP_MATH_BYTE)i; imin1 = idiff1; } if (_mm_comilt_ss(idiff2, imin2)) { *encode2 = (CMP_MATH_BYTE)i; imin2 = idiff2; } } // Convert __m128 back to floats float result1[4]; float result2[4]; _mm_storeu_ps (result1, imin1); _mm_storeu_ps (result2, imin2); return (result1[0] + result2[0]); } void cmp_set_fma3_features() { //printf("FMA3 has been enabled."); cmp_lerp2 = fma_lerp2; } #endif #endif void cmp_set_cpu_features() { // features list in Alphabetical order cmp_clampf2 = cpu_clampf; cmp_lerp2 = cpu_lerp2; cmp_maxf2 = cpu_maxf; cmp_minf2 = cpu_minf; cmp_rsqf2 = cpu_rsqf; cmp_sqrtf2 = cpu_sqrtf; } #ifdef CMP_USE_XMMINTRIN #ifndef __linux__ void cmp_set_sse2_features() { cmp_clampf2 = sse_clampf; cmp_lerp2 = sse_lerp2; cmp_maxf2 = sse_maxf; cmp_minf2 = sse_minf; cmp_rsqf2 = sse_rsqf; cmp_sqrtf2 = sse_sqrtf; } #endif #endif //--------------------------------- // User Interface to the CMP_MATH //--------------------------------- float(*cmp_sqrtf2 ) (float *) = cpu_sqrtf; float(*cmp_rsqf2 ) (float *) = cpu_rsqf; float(*cmp_minf2 ) (float l1, float r1) = cpu_minf; float(*cmp_maxf2 ) (float l1, float r1) = cpu_maxf; float(*cmp_clampf2 ) (float value, float minval, float maxval) = cpu_clampf; float (*cmp_lerp2) (CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) = cpu_lerp2; void (*cmp_averageRGB)(unsigned char *src_rgba_block) = cpu_averageRGB; //} #else // OpenCL interfaces float cmp_sqrtf2 (float *) {}; float cmp_rsqf2 (float *) {}; float cmp_minf2 (float l1, float r1) {}; float cmp_maxf2 (float l1, float r1) {}; float cmp_clampf2(float value, float minval, float maxval) {}; void cmp_lerp2(CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) {}; void cmp_averageRGB(unsigned char *src_rgba_block) {}; #endif // not def OPENCL