//===================================================================== // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // File: bc1_cmp.h //-------------------------------------------------------------------------------------- // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. //-------------------------------------------------------------------------------------- #define USE_CMP #include "common_def.h" #include "bcn_common_kernel.h" #include "bcn_common_api.h" #ifndef ASPM_GPU #include "cpu_extensions.h" #include "core_simd.h" #endif //----------------------------------------------------------------------- // When build is for CPU, we have some missing API calls common to GPU // Use CPU CMP_Core replacements //----------------------------------------------------------------------- #if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL) #define ALIGN_16 #define ALIGN_32 #define ALIGN_64 #else #include INC_cmp_math_func #if defined(_WIN32) || defined(_WIN64) #define ALIGN_16 __declspec(align(16)) #define ALIGN_32 __declspec(align(32)) #define ALIGN_64 __declspec(align(64)) #else // !WIN32 && !_WIN64 #define ALIGN_16 __attribute__((aligned(16))) #define ALIGN_32 __attribute__((aligned(32))) #define ALIGN_64 __attribute__((aligned(64))) #endif // !WIN32 && !_WIN64 #endif #define USE_REFINE3D #define USE_REFINE #ifndef MAX_ERROR #define MAX_ERROR 128000.f #endif #define NUM_CHANNELS 4 #define NUM_ENDPOINTS 2 #ifndef CMP_QUALITY0 #define CMP_QUALITY0 0.25f #endif #ifndef CMP_QUALITY1 #define CMP_QUALITY1 0.50f #endif #ifndef CMP_QUALITY2 #define CMP_QUALITY2 0.75f #endif #define EPS (2.f / 255.f) * (2.f / 255.f) #define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f) // Disable SIMD code during GPU builds #if !defined(ASPM_GPU) CMP_STATIC CGU_BOOL g_bc1FunctionPointersSet = false; // declarations for SIMD function variations CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int); // function pointers CMP_STATIC CGU_FLOAT (*cpu_bc1ComputeBestEndpoints)(CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, CGU_FLOAT*, int, int) = 0; // Toggle which SIMD instruction set extensions to use. Setting this to EXTENSION_COUNT will enable auto-detection of supported extensions. // NOTE: The requested extension will only be enabled if it is supported by the current CPU. CMP_STATIC bool bc1ToggleSIMD(CGU_INT newExtension) { CGU_BOOL useAVX512 = true; CGU_BOOL useAVX2 = true; CGU_BOOL useSSE42 = true; CPUExtensions extensions = GetCPUExtensions(); if (newExtension < EXTENSION_COUNT) // user requested a specific instruction set extension { useAVX512 = newExtension == EXTENSION_AVX512_F; useAVX2 = newExtension == EXTENSION_AVX2; useSSE42 = newExtension == EXTENSION_SSE42; } if (useAVX512 && IsAvailableAVX512(extensions)) { cpu_bc1ComputeBestEndpoints = avx512_bc1ComputeBestEndpoints; } else if (useAVX2 && IsAvailableAVX2(extensions)) { cpu_bc1ComputeBestEndpoints = avx_bc1ComputeBestEndpoints; } else if (useSSE42 && IsAvailableSSE4(extensions)) { cpu_bc1ComputeBestEndpoints = sse_bc1ComputeBestEndpoints; } else { cpu_bc1ComputeBestEndpoints = _cpu_bc1ComputeBestEndpoints; } g_bc1FunctionPointersSet = true; bool result = true; if (newExtension != EXTENSION_COUNT && (useAVX512 && !IsAvailableAVX512(extensions)) || (useAVX2 && !IsAvailableAVX2(extensions)) || (useSSE42 && !IsAvailableSSE4(extensions))) result = false; return result; } #endif static CGU_FLOAT cgu_getRampErr(CGU_FLOAT Prj[BLOCK_SIZE_4X4], CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], CGU_FLOAT StepErr, CGU_FLOAT lowPosStep, CGU_FLOAT highPosStep, CGU_UINT32 dwUniqueColors) { CGU_FLOAT error = 0; CGU_FLOAT step = (highPosStep - lowPosStep) / 3; // using (dwNumChannels=4 - 1); CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) { CGU_FLOAT v; // Work out which value in the block this select CGU_FLOAT del; if ((del = Prj[i] - lowPosStep) <= 0) v = lowPosStep; else if (Prj[i] - highPosStep >= 0) v = highPosStep; else v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; // And accumulate the error CGU_FLOAT d = (Prj[i] - v); d *= d; CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; error += err; if (StepErr < error) { error = StepErr; break; } } return error; } CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX( CMP_IN CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4], CMP_IN CGU_UINT32 dwUniqueColors, CMP_IN CGU_Vec3f channelWeightsBGR, CMP_IN CGU_BOOL b3DRefinement ) { CMP_UNUSED(channelWeightsBGR); CMP_UNUSED(b3DRefinement); CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4]; CGU_Vec3f LineDirG; CGU_Vec3f LineDir; CGU_FLOAT LineDir0[NUM_CHANNELS]; CGU_Vec3f BlkUV[BLOCK_SIZE_4X4]; CGU_Vec3f BlkSh[BLOCK_SIZE_4X4]; CGU_Vec3f Mdl; CGU_Vec3f rsltC0; CGU_Vec3f rsltC1; CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f}; CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f}; CGU_UINT32 i; for (i = 0; i < dwUniqueColors; i++) { BlkUV[i] = BlkInBGRf_UV[i]; } // if not more then 2 different colors, we've done if (dwUniqueColors <= 2) { rsltC0 = BlkInBGRf_UV[0] * 255.0f; rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; } else { // This is our first attempt to find an axis we will go along. // The cumulation is done to find a line minimizing the MSE from the // input 3D points. // While trying to find the axis we found that the diameter of the input // set is quite small. Do not bother. // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors); { CGU_UINT32 ii; CGU_UINT32 jj; CGU_UINT32 kk; // These vars cannot be Vec3 as index to them are varying CGU_FLOAT Crrl[NUM_CHANNELS]; CGU_FLOAT RGB2[NUM_CHANNELS]; LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f; // sum position of all points CGU_FLOAT fNumPoints = 0.0f; for (ii = 0; ii < dwUniqueColors; ii++) { Mdl.x += BlkUV[ii].x * Rpt[ii]; Mdl.y += BlkUV[ii].y * Rpt[ii]; Mdl.z += BlkUV[ii].z * Rpt[ii]; fNumPoints += Rpt[ii]; } // and then average to calculate center coordinate of block Mdl /= fNumPoints; for (ii = 0; ii < dwUniqueColors; ii++) { // calculate output block as offsets around block center BlkSh[ii] = BlkUV[ii] - Mdl; // compute correlation matrix // RGB2 = sum of ((distance from point from center) squared) RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii]; RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii]; RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii]; Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii]; Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii]; Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii]; } // if set's diameter is small CGU_UINT32 i0 = 0, i1 = 1; CGU_FLOAT mxRGB2 = 0.0f; CGU_FLOAT fEPS = fNumPoints * EPS; for (kk = 0, jj = 0; jj < 3; jj++) { if (RGB2[jj] >= fEPS) kk++; else RGB2[jj] = 0.0f; if (mxRGB2 < RGB2[jj]) { mxRGB2 = RGB2[jj]; i0 = jj; } } CGU_FLOAT fEPS2 = fNumPoints * EPS2; CGU_BOOL AxisIsSmall; AxisIsSmall = (RGB2[0] < fEPS2); AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2); AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2); // all are very small to avoid division on the small determinant if (AxisIsSmall) { rsltC0 = BlkInBGRf_UV[0] * 255.0f; rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f; } else { // !AxisIsSmall if (kk == 1) // really only 1 dimension LineDir0[i0] = 1.; else if (kk == 2) { // really only 2 dimensions i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3]; LineDir0[i1] = Crl / RGB2[i0]; LineDir0[i0] = 1.; } else { CGU_FLOAT maxDet = 100000.f; CGU_FLOAT Cs[3]; // select max det for precision for (jj = 0; jj < 3; jj++) { // 3 = nDimensions CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj]; Cs[jj] = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3])); if (maxDet < Det) { maxDet = Det; i0 = jj; } } // inverse correl matrix // -- -- -- -- // | A B | | C -B | // | B C | => | -B A | // -- -- -- -- CGU_FLOAT mtrx1[2][2]; CGU_FLOAT vc1[2]; CGU_FLOAT vc[2]; vc1[0] = Crrl[(i0 + 2) % 3]; vc1[1] = Crrl[(i0 + 1) % 3]; // C mtrx1[0][0] = RGB2[(i0 + 1) % 3]; // A mtrx1[1][1] = RGB2[i0]; // -B mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; // find a solution vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; // normalize vc[0] /= maxDet; vc[1] /= maxDet; // find a line direction vector LineDir0[i0] = 1.; LineDir0[(i0 + 1) % 3] = 1.; LineDir0[(i0 + 2) % 3] = vc[0] + vc[1]; } // normalize direction vector CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; Len = sqrt(Len); LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f; LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f; LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f; } } // FindAxisIsSmall // GCC is being an awful being when it comes to goto-jumps. // So please bear with this. CGU_FLOAT ErrG = 10000000.f; CGU_FLOAT PrjBnd0; CGU_FLOAT PrjBnd1; CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; LineDir.x = LineDir0[0]; LineDir.y = LineDir0[1]; LineDir.z = LineDir0[2]; // Here is the main loop. // 1. Project input set on the axis in consideration. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. // 4. Present our color channels as 3 16DIM vectors. // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. // 6. Plug the projections as a new directional vector for the axis. // 7. Goto 1. // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized). // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min , // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector. // Solution is // Ai = (D . Ci) / (D . D); . - is a dot product. // in 3 dim space Ai(s) represent a line direction, along which // we again try to find (sub)optimal quantizer. // That's what our for(;;) loop is about. for (;;) { // 1. Project input set on the axis in consideration. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a // point (R) is // P + ((R-P).v) / (v.v))v // The distance along v is therefore (R-P).v / (v.v) // (v.v) is 1 if v is a unit vector. // PrjBnd0 = 1000.0f; PrjBnd1 = -1000.0f; for (i = 0; i < BLOCK_SIZE_4X4; i++) Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; for (i = 0; i < dwUniqueColors; i++) { Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir); PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]); PrjBnd0 = min(PrjBnd0, Prj[i]); PrjBnd1 = max(PrjBnd1, Prj[i]); } // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal // pair of end points. // min and max of the search interval CGU_FLOAT Scl0; CGU_FLOAT Scl1; Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f; Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f; // compute scaling factor to scale down the search interval to [0.,1] const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0); const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0); for (i = 0; i < dwUniqueColors; i++) { // scale them Prj[i] = (Prj[i] - Scl0) * overScl; // premultiply the scale square to plug into error computation later PreMRep[i] = Rpt[i] * Scl2; } // scale first approximation of end points PrjBnd0 = (PrjBnd0 - Scl0) * overScl; PrjBnd1 = (PrjBnd1 - Scl0) * overScl; CGU_FLOAT StepErr = MAX_ERROR; // search step CGU_FLOAT searchStep = 0.025f; // low Start/End; high Start/End const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f; const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? PrjBnd1 + 2.f * searchStep : 1.f; // find the best endpoints CGU_FLOAT Pos0 = 0; CGU_FLOAT Pos1 = 0; CGU_FLOAT lowPosStep, highPosStep; CGU_FLOAT err; int l, h; for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep) { for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep) { // compute an error for the current pair of end points. err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors); if (err < StepErr) { // save better result StepErr = err; Pos0 = lowPosStep; Pos1 = highPosStep; } } } // inverse the scaling Pos0 = Pos0 * (Scl1 - Scl0) + Scl0; Pos1 = Pos1 * (Scl1 - Scl0) + Scl0; // did we find somthing better from the previous run? if (StepErr + 0.001 < ErrG) { // yes, remember it ErrG = StepErr; LineDirG = LineDir; PosG0.x = Pos0; PosG0.y = Pos0; PosG0.z = Pos0; PosG1.x = Pos1; PosG1.y = Pos1; PosG1.z = Pos1; // 3. Compute the vector of indexes (or clusters) for the current // approximate ramp. // indexes const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1); const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1); // here the index vector is computed, // shifted and normalized CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1); for (i = 0; i < dwUniqueColors; i++) { CGU_FLOAT del; // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep); if ((del = Prj0[i] - Pos0) <= 0) RmpIndxs[i] = 0.f; else if (Prj0[i] - Pos1 >= 0) RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1); else RmpIndxs[i] = cmp_floor((del + step_h) * rstep); // shift and normalization RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; } // 4. Present our color channels as 3 16 DIM vectors. // 5. Find closest aproximation of each of 16DIM color vector with the // pojection of the 16DIM index vector. CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f}; CGU_FLOAT Len = 0.0f; for (i = 0; i < dwUniqueColors; i++) { const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; Len += RmpIndxs[i] * PreMlt; Crs.x += BlkSh[i].x * PreMlt; Crs.y += BlkSh[i].y * PreMlt; Crs.z += BlkSh[i].z * PreMlt; } LineDir.x = LineDir.y = LineDir.z = 0.0f; if (Len > 0.0f) { CGU_FLOAT Len2; LineDir = Crs / Len; // 6. Plug the projections as a new directional vector for the axis. // 7. Goto 1. Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z; Len2 = sqrt(Len2); LineDir /= Len2; } } else // We was not able to find anything better. Drop out. break; } // inverse transform to find end-points of 3-color ramp rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f; rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f; } // !isDone // We've dealt with (almost) unrestricted full precision realm. // Now back digital world. // round the end points to make them look like compressed ones CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x CGU_FLOAT _Min = 0.0f; CGU_FLOAT _Max = 255.0f; { // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max); inpRmpEndPts0 = cmp_floorVec3f(rsltC0); if (inpRmpEndPts0.x <= _Min) inpRmpEndPts0.x = _Min; else { inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max); } if (inpRmpEndPts0.y <= _Min) inpRmpEndPts0.y = _Min; else { inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max); } if (inpRmpEndPts0.z <= _Min) inpRmpEndPts0.z = _Min; else { inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max); } inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; inpRmpEndPts1 = cmp_floorVec3f(rsltC1); if (inpRmpEndPts1.x <= _Min) inpRmpEndPts1.x = _Min; else { inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max); } if (inpRmpEndPts1.y <= _Min) inpRmpEndPts1.y = _Min; else { inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y); inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max); } if (inpRmpEndPts1.z <= _Min) inpRmpEndPts1.z = _Min; else { inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z); inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max); } inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0; } // MkRmpOnGrid CMP_EndPoints EndPoints; EndPoints.Color0 = inpRmpEndPts0; EndPoints.Color1 = inpRmpEndPts1; return EndPoints; } CMP_STATIC CMP_EndPoints cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0, CMP_IN CGU_Vec3f rsltC1, CMP_IN CGU_UINT32 nRedBits, CMP_IN CGU_UINT32 nGreenBits, CMP_IN CGU_UINT32 nBlueBits) { CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f}; CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f}; CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; CGU_FLOAT _Min = 0.0f; CGU_FLOAT _Max = 255.0f; // user override 565 default setting if ((nRedBits!=5)||(nGreenBits!=6)||(nBlueBits!=5)) { Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits)); Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits)); Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits)); } inpRmpEndPts0 = cmp_floorVec3f(rsltC0); if (inpRmpEndPts0.x <= _Min) inpRmpEndPts0.x = _Min; else { inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x); inpRmpEndPts0.x = cmp_minf(inpRmpEndPts0.x, _Max); } if (inpRmpEndPts0.y <= _Min) inpRmpEndPts0.y = _Min; else { inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y); inpRmpEndPts0.y = cmp_minf(inpRmpEndPts0.y, _Max); } if (inpRmpEndPts0.z <= _Min) inpRmpEndPts0.z = _Min; else { inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z); inpRmpEndPts0.z = cmp_minf(inpRmpEndPts0.z, _Max); } inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0; inpRmpEndPts1 = cmp_floorVec3f(rsltC1); if (inpRmpEndPts1.x <= _Min) inpRmpEndPts1.x = _Min; else { inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x); inpRmpEndPts1.x = cmp_minf(inpRmpEndPts1.x, _Max); } if (inpRmpEndPts1.y <= _Min) inpRmpEndPts1.y = _Min; else { inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y); inpRmpEndPts1.y = cmp_minf(inpRmpEndPts1.y, _Max); } if (inpRmpEndPts1.z <= _Min) inpRmpEndPts1.z = _Min; else { inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z); inpRmpEndPts1.z = cmp_minf(inpRmpEndPts1.z, _Max); } inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0; CMP_EndPoints EndPoints; EndPoints.Color0 = inpRmpEndPts0; EndPoints.Color1 = inpRmpEndPts1; return EndPoints; } // MkRmpOnGrid //=================================================================== // Replaces CompressBlockBC1_RGBA_Internal() // if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block //=================================================================== CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options) { //CGU_FLOAT errLQ = 1e6f; CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps; CGU_UINT32 dwAlphaThreshold = BC15Options.m_nAlphaThreshold; CGU_Vec3f channelWeights = {BC15Options.m_fChannelWeights[0],BC15Options.m_fChannelWeights[1],BC15Options.m_fChannelWeights[2]}; CGU_BOOL isSRGB = BC15Options.m_bIsSRGB; CGU_Vec3f rgbBlock_normal[BLOCK_SIZE_4X4]; CGU_UINT32 nCmpIndices = 0; CGU_UINT32 c0, c1; // High Quality CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}}; CGU_UINT32 i; CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4]; CGU_UINT32 pcIndices = 0; m_nRefinementSteps = 0; CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4]; // Normalized Block Input (0..1) in BGR channel format // Default inidices & endpoints for Transparent Block CGU_Vec3ui nEndpoints0 = {0, 0, 0}; // Endpoints are stored BGR as x,y,z CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF}; // Endpoints are stored BGR as x,y,z for (i = 0; i < BLOCK_SIZE_4X4; i++) { Rpt[i] = 0.0f; } //=============================================================== // Check if we have more then 2 colors and process Alpha block CGU_UINT32 dwColors = 0; CGU_UINT32 dwBlk[BLOCK_SIZE_4X4]; CGU_UINT32 R, G, B, A; for (i = 0; i < BLOCK_SIZE_4X4; i++) { // Do any color conversion prior to processing the block rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb; R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f); G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f); B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f); //if (dwAlphaThreshold > 0) // A = (CGU_UINT32)src_imageNorm[i].w * 255.0f; //else A = 255; // Punch Through Alpha in BC1 Codec (1 bit alpha) //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold)) //{ // copy to local RGB data and have alpha set to 0xFF dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B; //} } if (!dwColors) { // All are colors transparent EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f; EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f; nCmpIndices = 0xFFFFFFFF; } else { // We have colors to process nCmpIndices = 0; // Punch Through Alpha Support ToDo // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4); // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4); // if (bHasAlpha) { // CGU_Vec2ui compBlock = {0xf800f800,0}; // return compBlock; // } // Here we are computing an unique number of sorted colors. // For each unique value we compute the number of it appearences. // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); { CGU_UINT32 j; CMP_di what[BLOCK_SIZE_4X4]; for (i = 0; i < dwColors; i++) { what[i].index = i; what[i].data = dwBlk[i]; } CGU_UINT32 tmp_index; CGU_UINT32 tmp_data; for (i = 1; i < dwColors; i++) { for (j = i; j > 0; j--) { if (what[j - 1].data > what[j].data) { tmp_index = what[j].index; tmp_data = what[j].data; what[j].index = what[j - 1].index; what[j].data = what[j - 1].data; what[j - 1].index = tmp_index; what[j - 1].data = tmp_data; } } } for (i = 0; i < dwColors; i++) dwBlk[i] = what[i].data; } CGU_UINT32 new_p; CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; CGU_UINT32 dwUniqueColors = 0; new_p = dwBlkU[0] = dwBlk[0]; Rpt[dwUniqueColors] = 1.f; for (i = 1; i < dwColors; i++) { if (new_p != dwBlk[i]) { dwUniqueColors++; new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; Rpt[dwUniqueColors] = 1.f; } else Rpt[dwUniqueColors] += 1.f; } dwUniqueColors++; // Simple case of only 2 colors to process // no need for futher processing as lowest quality methods work best for this case if (dwUniqueColors <= 2) { CGU_Vec3f rsltC0; CGU_Vec3f rsltC1; rsltC0.r = rgbBlock_normal[0].b * 255.0f; rsltC0.g = rgbBlock_normal[0].g * 255.0f; rsltC0.b = rgbBlock_normal[0].r * 255.0f; rsltC1.r = rgbBlock_normal[dwUniqueColors - 1].b * 255.0f; rsltC1.g = rgbBlock_normal[dwUniqueColors - 1].g * 255.0f; rsltC1.b = rgbBlock_normal[dwUniqueColors - 1].r * 255.0f; EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1,5, 6, 5); } else { // switch from int range back to UV floats for (i = 0; i < dwUniqueColors; i++) { R = (dwBlkU[i] >> 16) & 0xff; G = (dwBlkU[i] >> 8) & 0xff; B = (dwBlkU[i] >> 0) & 0xff; BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f; BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f; BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f; } CGU_Vec3f channelWeightsBGR; channelWeightsBGR.x = channelWeights.z; channelWeightsBGR.y = channelWeights.y; channelWeightsBGR.z = channelWeights.x; EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps); } } // colors //=================================================================== // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices //=================================================================== if (nCmpIndices == 0) { R = (CGU_UINT32)(EndPoints.Color0.z); G = (CGU_UINT32)(EndPoints.Color0.y); B = (CGU_UINT32)(EndPoints.Color0.x); CGU_INT32 cluster0 = cmp_constructColor(R, G, B); R = (CGU_UINT32)(EndPoints.Color1.z); G = (CGU_UINT32)(EndPoints.Color1.y); B = (CGU_UINT32)(EndPoints.Color1.x); CGU_INT32 cluster1 = cmp_constructColor(R, G, B); CGU_Vec3f InpRmp[NUM_ENDPOINTS]; if ((cluster0 <= cluster1) // valid for 4 channels // || (cluster0 > cluster1) // valid for 3 channels ) { // inverse endpoints InpRmp[0] = EndPoints.Color1; InpRmp[1] = EndPoints.Color0; } else { InpRmp[0] = EndPoints.Color0; InpRmp[1] = EndPoints.Color1; } CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4]; CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; // Swizzle the source RGB to BGR for processing for (i = 0; i < BLOCK_SIZE_4X4; i++) { srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f; srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f; srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f; srcblockA[i] = 255.0f; if (dwAlphaThreshold > 0) { CGU_UINT32 alpha = (CGU_UINT32)src_imageNorm[i].w*255.0f; if (alpha >= dwAlphaThreshold) srcblockA[i] = alpha; } } // input ramp is on the coarse grid // make ramp endpoints the way they'll going to be decompressed CGU_Vec3f InpRmpL[NUM_ENDPOINTS]; CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG { // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs); InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs); InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); } // MkWkRmpPts // build ramp CGU_Vec3f LerpRmp[4]; CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; { //BldRmp(Rmp, InpRmpL, dwNumChannels); // linear interpolate end points to get the ramp LerpRmp[0] = InpRmpL[0]; LerpRmp[3] = InpRmpL[1]; LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); } // BldRmp //========================================================================= // Clusterize, Compute error and find DXTC indexes for the current cluster //========================================================================= { // Clusterize CGU_UINT32 alpha; // For each colour in the original block assign it // to the closest cluster and compute the cumulative error for (i = 0; i < BLOCK_SIZE_4X4; i++) { alpha = (CGU_UINT32)srcblockA[i]; if ((dwAlphaThreshold > 0) && alpha == 0) { //*((CGU_DWORD *)&_Blk[i][AC]) == 0) pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) } else { CGU_FLOAT shortest = 99999999999.f; CGU_UINT8 shortestIndex = 0; CGU_Vec3f channelWeightsBGR; channelWeightsBGR.x = channelWeights.z; channelWeightsBGR.y = channelWeights.y; channelWeightsBGR.z = channelWeights.x; for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) { // r is either 1 or 4 // calculate the distance for each component CGU_FLOAT distance = dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR)); if (distance < shortest) { shortest = distance; shortestIndex = rampindex; } } // The total is a sum of (error += shortest) // We have the index of the best cluster, so assign this in the block // Reorder indices to match correct DXTC ordering if (shortestIndex == 3) // dwNumChannels - 1 shortestIndex = 1; else if (shortestIndex) shortestIndex++; pcIndices |= cmp_set2Bit32(shortestIndex, i); } } // BLOCK_SIZE_4X4 } // Clusterize } // Process Cluster //============================================================== // Generate Compressed Result from nEndpoints & pcIndices //============================================================== c0 = cmp_constructColorBGR(EndPoints.Color0); c1 = cmp_constructColorBGR(EndPoints.Color1); // Get Processed indices if not set if (nCmpIndices == 0) nCmpIndices = pcIndices; CGU_Vec2ui cmpBlock; if (c0 <= c1) { cmpBlock.x = c1 | (c0 << 16); } else cmpBlock.x = c0 | (c1 << 16); cmpBlock.y = nCmpIndices; return cmpBlock; } CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin, CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax, CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0, CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1, CMP_IN CGU_INT setopt, CMP_IN CGU_BOOL isSRGB) { // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31}; // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45, // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63}; CGU_INT32 x, y, z; CGU_Vec3f scale = {31.0f, 63.0f, 31.0f}; CGU_Vec3f MinColorScaled; CGU_Vec3f MaxColorScaled; // Clamp or Transform is needed, the transforms have built in clamps if (isSRGB) { MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin); MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax); } else { MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f); MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f); } switch (setopt) { case 0: // Use Min Max processing MinColorScaled = cmp_floorVec3f(MinColorScaled * scale); MaxColorScaled = cmp_ceilVec3f(MaxColorScaled * scale); CMP_PTRINOUT colorMin = MinColorScaled / scale; CMP_PTRINOUT colorMax = MaxColorScaled / scale; break; default: // Use round processing MinColorScaled = round(MinColorScaled * scale); MaxColorScaled = round(MaxColorScaled * scale); break; } x = (CGU_UINT32)(MinColorScaled.x); y = (CGU_UINT32)(MinColorScaled.y); z = (CGU_UINT32)(MinColorScaled.z); //if (isSRGB) { // // scale RB // x = srbMap[x]; // &0x1F]; // y = sgMap [y]; // &0x3F]; // z = srbMap[z]; // &0x1F]; // // scale G //} CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z; x = (CGU_UINT32)(MaxColorScaled.x); y = (CGU_UINT32)(MaxColorScaled.y); z = (CGU_UINT32)(MaxColorScaled.z); CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z; } CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex, CMP_IN const CGU_Vec3f block[16], CMP_IN CGU_Vec3f minColor, CMP_IN CGU_Vec3f maxColor, CMP_IN CGU_BOOL getErr) { CGU_UINT32 PackedIndices = 0; CGU_FLOAT err = 0.0f; CGU_Vec3f cn[4]; CGU_FLOAT minDistance; if (getErr) { // remap to BC1 spec for decoding offsets, // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 cn[0] = maxColor; cn[1] = minColor; cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f; cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f; } CGU_FLOAT Scale = 3.f / cmp_dotVec3f(minColor - maxColor, minColor - maxColor); CGU_Vec3f ScaledRange = (minColor - maxColor) * Scale; CGU_FLOAT Bias = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale; CGU_INT indexMap[4] = {0, 2, 3, 1}; // mapping based on BC1 Spec for color0 > color1 CGU_UINT32 index; CGU_FLOAT diff; for (CGU_UINT32 i = 0; i < 16; i++) { // Get offset from base scale diff = cmp_dotVec3f(block[i], ScaledRange) + Bias; index = ((CGU_UINT32)round(diff)) & 0x3; // remap linear offset to spec offset index = indexMap[index]; // use err calc for use in higher quality code if (getErr) { minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]); err += minDistance; } // Map the 2 bit index into compress 32 bit block if (index) PackedIndices |= (index << (2 * i)); } if (getErr) err = err * 0.0208333f; CMP_PTRINOUT cmpindex = PackedIndices; return err; } //-------------------------------------------------------------------------------------------------------- // Decompress is RGB (0.0f..255.0f) //-------------------------------------------------------------------------------------------------------- CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) { CGU_UINT32 n0 = compressedBlock.x & 0xffff; CGU_UINT32 n1 = compressedBlock.x >> 16; CGU_UINT32 index; //------------------------------------------------------- // Decode the compressed block 0..255 color range //------------------------------------------------------- CGU_Vec3f c0 = cmp_565ToLinear(n0); // max color CGU_Vec3f c1 = cmp_565ToLinear(n1); // min color CGU_Vec3f c2; CGU_Vec3f c3; if (n0 > n1) { c2 = (c0 * 2.0f + c1) / 3.0f; c3 = (c1 * 2.0f + c0) / 3.0f; for (CGU_UINT32 i = 0; i < 16; i++) { index = (compressedBlock.y >> (2 * i)) & 3; switch (index) { case 0: rgbBlock[i] = c0; break; case 1: rgbBlock[i] = c1; break; case 2: rgbBlock[i] = c2; break; case 3: rgbBlock[i] = c3; break; } } } else { // Transparent decode c2 = (c0 + c1) / 2.0f; for (CGU_UINT32 i = 0; i < 16; i++) { index = (compressedBlock.y >> (2 * i)) & 3; switch (index) { case 0: rgbBlock[i] = c0; break; case 1: rgbBlock[i] = c1; break; case 2: rgbBlock[i] = c2; break; case 3: rgbBlock[i] = 0.0f; break; } } } } // The source is 0..255 CMP_STATIC float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock) { CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; // Decompressed block channels are 0..255 cgu_decompressRGBBlock(rgbBlock, compressedBlock); //------------------------------------------------------------------ // Calculate MSE of the block // Note : pow is used as Float type for the code to be usable on CPU //------------------------------------------------------------------ CGU_Vec3f serr; serr = 0.0f; float sR, sG, sB, R, G, B; for (int j = 0; j < 16; j++) { sR = src_rgbBlock[j].x; sG = src_rgbBlock[j].y; sB = src_rgbBlock[j].z; R = rgbBlock[j].x; G = rgbBlock[j].y; B = rgbBlock[j].z; // Norm colors serr.x += pow(sR - R, 2.0f); serr.y += pow(sG - G, 2.0f); serr.z += pow(sB - B, 2.0f); } // MSE for 16 texels return (serr.x + serr.y + serr.z) / 48.0f; } // The source is 0..1, decompressed data using cmp_decompressRGBBlock2 is 0..255 which is converted down to 0..1 CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB) { CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4]; // Decompressed block channels are 0..255 cgu_decompressRGBBlock(rgbBlock, compressedBlock); //------------------------------------------------------------------ // Calculate MSE of the block // Note : pow is used as Float type for the code to be usable on CPU //------------------------------------------------------------------ CGU_Vec3f serr; serr = 0.0f; float sR, sG, sB, R, G, B; for (int j = 0; j < 16; j++) { if (isSRGB) { sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f); sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f); sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f); } else { sR = round(src_rgbBlock[j].x * 255.0f); sG = round(src_rgbBlock[j].y * 255.0f); sB = round(src_rgbBlock[j].z * 255.0f); } R = rgbBlock[j].x; G = rgbBlock[j].y; B = rgbBlock[j].z; // Norm colors serr.x += pow(sR - R, 2.0f); serr.y += pow(sG - G, 2.0f); serr.z += pow(sB - B, 2.0f); } // MSE for 16 texels return (serr.x + serr.y + serr.z) / 48.0f; } CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f src_imageRGB[16], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSRGB, CMP_INOUT CGU_Vec3f srcRGB[16], // The list of source colors with blue channel altered CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb, // The centrepoint of the axis CMP_INOUT CGU_FLOAT CMP_REFINOUT errout ) { CGU_Vec2ui Q1CompData = {0,0}; CGU_Vec3f rgb = {0,0,0}; // ------------------------------------------------------------------------------------- // (1) Find the array of unique pixel values and sum them to find their average position // ------------------------------------------------------------------------------------- CGU_FLOAT errLQ = 0.0f; CGU_BOOL fastProcess = (fquality <= CMP_QUALITY0); // Min Max only CGU_Vec3f srcMin = 1.0f; // Min source color CGU_Vec3f srcMax = 0.0f; // Max source color CGU_Vec2ui Q1compressedBlock = {0, 0}; CGU_UINT32 c0 = 0; CGU_UINT32 c1 = 0; average_rgb = 0.0f; // Get average and modifed src // find average position and save list of pixels as 0F..255F range for processing // Note: z (blue) is average of blue+green channels for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) { srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]); srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]); if (!fastProcess) { rgb = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]); rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2 srcRGB[i] = rgb; average_rgb = average_rgb + rgb; } } // Process two colors for saving in 565 format as C0 and C1 cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB); // Save simple min-max encoding if (c0 < c1) { Q1CompData.x = (c0 << 16) | c1; CGU_UINT32 index = 0; errLQ = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false); Q1CompData.y = index; errout = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); } else { // Most simple case all colors are equal or 0.0f Q1compressedBlock.x = (c1 << 16) | c0; Q1compressedBlock.y = 0; errout = 0.0f; return Q1compressedBlock; } // 0.0625F is (1/BLOCK_SIZE_4X4) average_rgb = average_rgb * 0.0625F; return Q1CompData; } CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f src_imageRGB[16], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSRGB, CMP_IN CGU_Vec3f srcRGB[16], CMP_IN CGU_Vec3f CMP_REFINOUT average_rgb, CMP_INOUT CGU_FLOAT CMP_REFINOUT errout) { CMP_UNUSED(fquality); CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping CGU_UINT32 c0 = 0; CGU_UINT32 c1 = 0; CGU_Vec2ui compressedBlock = {0, 0}; CGU_FLOAT Q1CompErr; CGU_Vec2ui Q1CompData = {0,0}; CGU_Vec3f rgb = {0,0,0}; // ------------------------------------------------------------------------------------- // (4) For each component, reflect points about the average so all lie on the same side // of the average, and compute the new average - this gives a second point that defines the axis // To compute the sign of the axis sum the positive differences of G for each of R and B (the // G axis is always positive in this implementation // ------------------------------------------------------------------------------------- // An interesting situation occurs if the G axis contains no information, in which case the RB // axis is also compared. I am not entirely sure if this is the correct implementation - should // the priority axis be determined by magnitude? { CGU_FLOAT rg_pos = 0.0f; CGU_FLOAT bg_pos = 0.0f; CGU_FLOAT rb_pos = 0.0f; for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) { rgb = srcRGB[i] - average_rgb; axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb); if (rgb.x > 0) { rg_pos += rgb.y; rb_pos += rgb.z; } if (rgb.z > 0) bg_pos += rgb.y; } // Average over BLOCK_SIZE_4X4 axisVectorRGB = axisVectorRGB * 0.0625F; // New average position if (rg_pos < 0) axisVectorRGB.x = -axisVectorRGB.x; if (bg_pos < 0) axisVectorRGB.z = -axisVectorRGB.z; if ((rg_pos == bg_pos) && (rg_pos == 0)) { if (rb_pos < 0) axisVectorRGB.z = -axisVectorRGB.z; } } // ------------------------------------------------------------------------------------- // (5) Axis projection and remapping // ------------------------------------------------------------------------------------- { CGU_FLOAT v2_recip; // Normalize the axis for simplicity of future calculation v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB); if (v2_recip > 0) v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip); else v2_recip = 1.0f; axisVectorRGB = axisVectorRGB * v2_recip; } // ------------------------------------------------------------------------------------- // (6) Map the axis // ------------------------------------------------------------------------------------- // the line joining (and extended on either side of) average and axis // defines the axis onto which the points will be projected // Project all the points onto the axis, calculate the distance along // the axis from the centre of the axis (average) // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is // P + ((R-P).v) / (v.v))v // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector. // // Calculate the extremities at the same time - these need to be reasonably accurately // represented in all cases { axisleft = CMP_FLOAT_MAX; axisright = -CMP_FLOAT_MAX; for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) { // Compute the distance along the axis of the point of closest approach CGU_Vec3f temp = (srcRGB[i] - average_rgb); pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB); // Work out the extremities if (pos_on_axis[i] < axisleft) axisleft = pos_on_axis[i]; if (pos_on_axis[i] > axisright) axisright = pos_on_axis[i]; } } // --------------------------------------------------------------------------------------------- // (7) Now we have a good axis and the basic information about how the points are mapped to it // Our initial guess is to represent the endpoints accurately, by moving the average // to the centre and recalculating the point positions along the line // --------------------------------------------------------------------------------------------- { axiscentre = (axisleft + axisright) * 0.5F; average_rgb = average_rgb + (axisVectorRGB * axiscentre); for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) pos_on_axis[i] -= axiscentre; axisright -= axiscentre; axisleft -= axiscentre; } // ------------------------------------------------------------------------------------- // (8) Calculate the high and low output colour values // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A // straight rounded average is not correct, as the decompressor 'unrounds' by replicating // the top bits to the bottom. // In order to take account of this process, we don't just apply a straight rounding correction, // but base our rounding on the input value (a straight rounding is actually pretty good in terms of // error measure, but creates a visual colour and/or brightness shift relative to the original image) // The method used here is to apply a centre-biased rounding dependent on the input value, which was // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of // the image. // rgb = (average_rgb + (left|right)*axisVectorRGB); // ------------------------------------------------------------------------------------- { CGU_Vec3f MinColor, MaxColor; MinColor = average_rgb + (axisVectorRGB * axisleft); MaxColor = average_rgb + (axisVectorRGB * axisright); MinColor.z = (MinColor.z * 2) - MinColor.y; MaxColor.z = (MaxColor.z * 2) - MaxColor.y; cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false); // Force to be a 4-colour opaque block - in which case, c0 is greater than c1 swap = 0; if (c0 < c1) { CGU_UINT32 t; t = c0; c0 = c1; c1 = t; swap = 1; } else if (c0 == c1) { // This block will always be encoded in 3-colour mode // Need to ensure that only one of the two points gets used, // avoiding accidentally setting some transparent pixels into the block for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) pos_on_axis[i] = axisleft; } compressedBlock.x = c0 | (c1 << 16); // ------------------------------------------------------------------------------------- // (9) Final clustering, creating the 2-bit values that define the output // ------------------------------------------------------------------------------------- CGU_UINT32 index; CGU_FLOAT division; { compressedBlock.y = 0; division = axisright * 2.0f / 3.0f; axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if centre is 0 or approximately so CGU_FLOAT CompMinErr; // This feature is work in progress // remap to BC1 spec for decoding offsets, // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1 // CGU_Vec3f cn[4]; // cn[0] = MaxColor; // cn[1] = MinColor; // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f; // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f; for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++) { // Endpoints (indicated by block > average) are 0 and 1, while // interpolants are 2 and 3 if (cmp_fabs(pos_on_axis[i]) >= division) index = 0; else index = 2; // Positive is in the latter half of the block if (pos_on_axis[i] >= axiscentre) index += 1; index = index ^ swap; // Set the output, taking swapping into account compressedBlock.y |= (index << (2 * i)); // use err calc for use in higher quality code //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]); } //CompMinErr = CompMinErr * 0.0208333f; CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB); Q1CompErr = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB); if (CompMinErr > Q1CompErr) { compressedBlock = Q1CompData; errout = Q1CompErr; } else errout = CompMinErr; } } // done return compressedBlock; } CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = { { 0, 0},{ 0, 0},{ 1, 0},{ 1, 0},{ 0, 1},{ 0, 1},{ 0, 1},{ 1, 1},{ 1, 1},{ 1, 1},{ 0, 2},{ 4, 0},{ 1, 2},{ 1, 2},{ 1, 2},{ 2, 2}, { 2, 2},{ 2, 2},{ 1, 3},{ 5, 1},{ 2, 3},{ 2, 3},{ 0, 4},{ 3, 3},{ 3, 3},{ 3, 3},{ 2, 4},{ 2, 4},{ 2, 4},{ 5, 3},{ 1, 5},{ 1, 5}, { 2, 5},{ 4, 4},{ 4, 4},{ 3, 5},{ 3, 5},{ 2, 6},{ 2, 6},{ 2, 6},{ 3, 6},{ 5, 5},{ 5, 5},{ 4, 6},{ 8, 4},{ 3, 7},{ 3, 7},{ 3, 7}, { 6, 6},{ 6, 6},{ 6, 6},{ 5, 7},{ 9, 5},{ 6, 7},{ 6, 7},{ 4, 8},{ 7, 7},{ 7, 7},{ 7, 7},{ 6, 8},{ 6, 8},{ 6, 8},{ 9, 7},{ 5, 9}, { 5, 9},{ 6, 9},{ 8, 8},{ 8, 8},{ 7, 9},{ 7, 9},{ 6,10},{ 6,10},{ 6,10},{ 7,10},{ 9, 9},{ 9, 9},{ 8,10},{12, 8},{ 7,11},{ 7,11}, { 7,11},{10,10},{10,10},{10,10},{ 9,11},{13, 9},{10,11},{10,11},{ 8,12},{11,11},{11,11},{11,11},{10,12},{10,12},{10,12},{13,11}, { 9,13},{ 9,13},{10,13},{12,12},{12,12},{11,13},{11,13},{10,14},{10,14},{10,14},{11,14},{13,13},{13,13},{12,14},{16,12},{11,15}, {11,15},{11,15},{14,14},{14,14},{14,14},{13,15},{17,13},{14,15},{14,15},{12,16},{15,15},{15,15},{15,15},{14,16},{14,16},{14,16}, {17,15},{13,17},{13,17},{14,17},{16,16},{16,16},{15,17},{15,17},{14,18},{14,18},{14,18},{15,18},{17,17},{17,17},{16,18},{20,16}, {15,19},{15,19},{15,19},{18,18},{18,18},{18,18},{17,19},{21,17},{18,19},{18,19},{16,20},{19,19},{19,19},{19,19},{18,20},{18,20}, {18,20},{21,19},{17,21},{17,21},{18,21},{20,20},{20,20},{19,21},{19,21},{18,22},{18,22},{18,22},{19,22},{21,21},{21,21},{20,22}, {24,20},{19,23},{19,23},{19,23},{22,22},{22,22},{22,22},{21,23},{25,21},{22,23},{22,23},{20,24},{23,23},{23,23},{23,23},{22,24}, {22,24},{22,24},{25,23},{21,25},{21,25},{22,25},{24,24},{24,24},{23,25},{23,25},{22,26},{22,26},{22,26},{23,26},{25,25},{25,25}, {24,26},{28,24},{23,27},{23,27},{23,27},{26,26},{26,26},{26,26},{25,27},{29,25},{26,27},{26,27},{24,28},{27,27},{27,27},{27,27}, {26,28},{26,28},{26,28},{29,27},{25,29},{25,29},{26,29},{28,28},{28,28},{27,29},{27,29},{26,30},{26,30},{26,30},{27,30},{29,29}, {29,29},{28,30},{28,30},{27,31},{27,31},{27,31},{30,30},{30,30},{30,30},{29,31},{29,31},{30,31},{30,31},{30,31},{31,31},{31,31}}; CMP_STATIC CGU_UINT8 g_Match6Bit[256][2] = { { 0, 0},{ 1, 0},{ 0, 1},{ 1, 1},{ 1, 1},{ 0, 2},{ 1, 2},{ 2, 2},{ 2, 2},{ 1, 3},{ 0, 4},{ 3, 3},{ 3, 3},{ 0, 5},{ 1, 5},{ 4, 4}, { 4, 4},{ 1, 6},{ 0, 7},{ 5, 5},{ 5, 5},{ 0, 8},{ 1, 8},{ 6, 6},{ 6, 6},{ 1, 9},{ 2, 9},{ 7, 7},{ 7, 7},{ 2,10},{ 3,10},{ 8, 8}, { 8, 8},{ 3,11},{ 4,11},{ 9, 9},{ 9, 9},{ 4,12},{ 5,12},{10,10},{10,10},{ 5,13},{ 6,13},{16, 8},{11,11},{ 6,14},{ 7,14},{17, 9}, {12,12},{ 7,15},{ 8,15},{16,11},{13,13},{10,15},{ 8,16},{ 9,16},{14,14},{13,15},{ 9,17},{10,17},{15,15},{16,15},{10,18},{11,18}, {12,18},{16,16},{11,19},{12,19},{13,19},{17,17},{12,20},{13,20},{14,20},{18,18},{13,21},{14,21},{15,21},{19,19},{14,22},{15,22}, {20,20},{20,20},{15,23},{16,23},{21,21},{21,21},{16,24},{17,24},{22,22},{22,22},{17,25},{18,25},{23,23},{23,23},{18,26},{19,26}, {24,24},{24,24},{19,27},{20,27},{25,25},{25,25},{20,28},{21,28},{26,26},{26,26},{21,29},{22,29},{32,24},{27,27},{22,30},{23,30}, {33,25},{28,28},{23,31},{24,31},{32,27},{29,29},{26,31},{24,32},{25,32},{30,30},{29,31},{25,33},{26,33},{31,31},{32,31},{26,34}, {27,34},{28,34},{32,32},{27,35},{28,35},{29,35},{33,33},{28,36},{29,36},{30,36},{34,34},{29,37},{30,37},{31,37},{35,35},{30,38}, {31,38},{36,36},{36,36},{31,39},{32,39},{37,37},{37,37},{32,40},{33,40},{38,38},{38,38},{33,41},{34,41},{39,39},{39,39},{34,42}, {35,42},{40,40},{40,40},{35,43},{36,43},{41,41},{41,41},{36,44},{37,44},{42,42},{42,42},{37,45},{38,45},{48,40},{43,43},{38,46}, {39,46},{49,41},{44,44},{39,47},{40,47},{48,43},{45,45},{42,47},{40,48},{41,48},{46,46},{45,47},{41,49},{42,49},{47,47},{48,47}, {42,50},{43,50},{44,50},{48,48},{43,51},{44,51},{45,51},{49,49},{44,52},{45,52},{46,52},{50,50},{45,53},{46,53},{47,53},{51,51}, {46,54},{47,54},{52,52},{52,52},{47,55},{48,55},{53,53},{53,53},{48,56},{49,56},{54,54},{54,54},{49,57},{50,57},{55,55},{55,55}, {50,58},{51,58},{56,56},{56,56},{51,59},{52,59},{57,57},{57,57},{52,60},{53,60},{58,58},{58,58},{53,61},{54,61},{59,59},{59,59}, {54,62},{55,62},{60,60},{60,60},{55,63},{56,63},{61,61},{61,61},{58,63},{59,63},{62,62},{62,62},{61,63},{62,63},{63,63},{63,63}}; CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue) { CGU_UINT32 maxEndp16; CGU_UINT32 minEndp16; CGU_UINT32 mask = 0xAAAAAAAAu; minEndp16 = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0]; maxEndp16 = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1]; // write the color block if( maxEndp16 < minEndp16 ) { CGU_UINT32 tmpValue = minEndp16; minEndp16 = maxEndp16; maxEndp16 = tmpValue; mask ^= 0x55555555u; } CGU_Vec2ui outputBytes; outputBytes.x = CGU_UINT32(maxEndp16) | (CGU_UINT32(minEndp16) << 16u); outputBytes.y = mask; return outputBytes; } CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16]) { CMP_CONSTANT CGU_UINT32 fr = src_image[0].r, fg = src_image[0].g, fb = src_image[0].b; edata.all_colors_equal = false; edata.total.r = fr; edata.total.g = fg; edata.total.b = fb; edata.max.r = fr; edata.max.g = fg; edata.max.b = fb; edata.min.r = fr; edata.min.g = fg; edata.min.b = fb; edata.grayscale_flag = (fr == fg) && (fr == fb); edata.any_black_pixels = (fr | fg | fb) < 4; for (CGU_UINT32 i = 1; i < 16; i++) { CMP_CONSTANT CGU_INT r = src_image[i].r, g = src_image[i].g, b = src_image[i].b; edata.grayscale_flag &= ((r == g) && (r == b)); edata.any_black_pixels |= ((r | g | b) < 4); edata.max.r = CMP_MAX(edata.max.r, r); edata.max.g = CMP_MAX(edata.max.g, g); edata.max.b = CMP_MAX(edata.max.b, b); edata.min.r = CMP_MIN(edata.min.r, r); edata.min.g = CMP_MIN(edata.min.g, g); edata.min.b = CMP_MIN(edata.min.b, b); edata.total.r += r; edata.total.g += g; edata.total.b += b; } edata.avg.r = (edata.total.r + 8) >> 4; edata.avg.g = (edata.total.g + 8) >> 4; edata.avg.b = (edata.total.b + 8) >> 4; } #ifndef ASPM_GPU /*------------------------------------------------------------------------------------------------ 1 DIM ramp ------------------------------------------------------------------------------------------------*/ CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints) { CGU_UINT32 dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3}; // linear interpolate end points to get the ramp _Rmp[0] = _InpRmp[0]; _Rmp[dwNumPoints - 1] = _InpRmp[1]; if(dwNumPoints % 2) _Rmp[dwNumPoints] = 1000000.f; // for 3 point ramp; not to select the 4th point as min for(CGU_UINT32 e = 1; e < dwNumPoints - 1; e++) _Rmp[e] = cmp_floor((_Rmp[0] * (dwNumPoints - 1 - e) + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints])/ (CGU_FLOAT)(dwNumPoints - 1)); } /*------------------------------------------------------------------------------------------------ // build 3D ramp ------------------------------------------------------------------------------------------------*/ CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS],CGU_UINT32 dwNumPoints) { for(CGU_UINT32 j = 0; j < 3; j++) cpu_BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints); } /*------------------------------------------------------------------------------------------------ // this is how the end points is going to be look like when decompressed ------------------------------------------------------------------------------------------------*/ CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq, CGU_FLOAT _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { CGU_FLOAT Fctrs[3]; Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits); Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits); Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits); CGU_BOOL bEq = true; // find whether input ramp is flat for(CGU_UINT32 j = 0; j < 3; j++) bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]); _bEq = bEq?1:0; // end points on the integer grid for(CGU_UINT32 j = 0; j <3; j++) { for(CGU_UINT32 k = 0; k <2; k++) { // Apply the lower bit replication to give full dynamic range _OutRmpPts[j][k] = _InpRmpPts[j][k] + cmp_floor(_InpRmpPts[j][k] / Fctrs[j]); _OutRmpPts[j][k] = cmp_max(_OutRmpPts[j][k], 0.f); _OutRmpPts[j][k] = cmp_min(_OutRmpPts[j][k], 255.f); } } } // Compute error and find DXTC indexes for the current cluster CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], int dwBlockSize, CGU_UINT8 dwNumPoints, bool _ConstRamp, CGU_FLOAT _pfWeights[3], bool _bUseAlpha) { CGU_FLOAT Err = 0.f; CGU_UINT8 rmp_l = (_ConstRamp) ? 1 : dwNumPoints; // For each colour in the original block assign it // to the closest cluster and compute the cumulative error for(int i=0; i< dwBlockSize; i++) { if(_bUseAlpha && *((CGU_UINT32*) &_Blk[i][AC]) == 0) pcIndices[i] = dwNumPoints; else { CGU_FLOAT shortest = 99999999999.f; CGU_UINT8 shortestIndex = 0; CGU_UINT8 r; if ((_pfWeights[0] != 1.0f)||(_pfWeights[1] != 1.0f)||(_pfWeights[2] != 1.0f)) for(r=0; r < rmp_l; r++) { // calculate the distance for each component CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2]; if(distance < shortest) { shortest = distance; shortestIndex = r; } } else for(r=0; r < rmp_l; r++) { // calculate the distance for each component CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); if(distance < shortest) { shortest = distance; shortestIndex = r; } } Err += shortest; // We have the index of the best cluster, so assign this in the block // Reorder indices to match correct DXTC ordering if(shortestIndex == dwNumPoints - 1) shortestIndex = 1; else if(shortestIndex) shortestIndex++; pcIndices[i] = shortestIndex; } } return Err; } /*------------------------------------------------------------------------------------------------ // input ramp is on the coarse grid ------------------------------------------------------------------------------------------------*/ CMP_STATIC CGU_FLOAT cpu_ClstrBas( CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], int dwBlockSize, CGU_UINT8 dwNumPoints, CGU_FLOAT _pfWeights[3], bool _bUseAlpha, CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { // make ramp endpoints the way they'll going to be decompressed CGU_UINT8 Eq = 1; CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; cpu_MkWkRmpPts(Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits); // build ramp as it would be built by decompressor CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; cpu_BldRmp(Rmp, InpRmp, dwNumPoints); // clusterize and find a cumulative error return cpu_ClstrIntnl(_Blk, pcIndices, Rmp, dwBlockSize, dwNumPoints, Eq, _pfWeights, _bUseAlpha); } CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00,0x80,0xc0,0xe0,0xf0,0xf8,0xfc,0xfe,0xff}; CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits) { return ( ((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) | ((G & nByteBitsMask2[nGreenBits])<< (nBlueBits - (PIX_GRID - nGreenBits))) | ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits)))); } CMP_STATIC CGU_FLOAT cpu_Clstr( CGU_UINT32 block_32[BLOCK_SIZE_4X4], CGU_UINT32 dwBlockSize, CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], CGU_UINT8 dwNumPoints, CGU_FLOAT _pfWeights[3], bool _bUseAlpha, CGU_UINT8 _nAlphaThreshold, CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { CGU_UINT32 c0 = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits); CGU_UINT32 c1 = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits); CGU_UINT32 nEndpointIndex0 = 0; CGU_UINT32 nEndpointIndex1 = 1; if((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1)) { nEndpointIndex0 = 1; nEndpointIndex1 = 0; } CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; InpRmp[RC][0] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex0]; InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1]; InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0]; InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1]; InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0]; InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1]; CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; for(CGU_UINT32 i = 0; i < dwBlockSize; i++) { Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16); Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8); Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff); if(_bUseAlpha) Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f; } return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits); } /*------------------------------------------------------------------------------------------------ Compute cumulative error for the current cluster ------------------------------------------------------------------------------------------------*/ CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_UINT32 _NmbClrs, CGU_UINT32 _blcktp, bool _ConstRamp, CGU_Vec3f channelWeights) { CGU_FLOAT fError = 0.f; CGU_UINT32 rmp_l = (_ConstRamp) ? 1 : _blcktp; CGU_BOOL useWeights = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f)); // For each colour in the original block, find the closest cluster // and compute the comulative error for(CGU_UINT32 i=0; i<_NmbClrs; i++) { CGU_FLOAT fShortest = 99999999999.f; if(useWeights) for(CGU_UINT32 r=0; r < rmp_l; r++) { // calculate the distance for each component CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * channelWeights[0] + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * channelWeights[1] + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * channelWeights[2]; if(fDistance < fShortest) fShortest = fDistance; } else for(CGU_UINT32 r=0; r < rmp_l; r++) { // calculate the distance for each component CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); if(fDistance < fShortest) fShortest = fDistance; } // accumulate the error fError += fShortest * _Rpt[i]; } return fError; } #if defined(USE_REFINE3D) CMP_STATIC CGU_FLOAT cmp_Refine3D( CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], CGU_UINT32 _NmrClrs, CGU_UINT32 dwNumPoints, CGU_Vec3f channelWeights, CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits, CGU_UINT32 nRefineSteps) { CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; for(CGU_UINT32 i = 0; i < _NmrClrs; i++) for(CGU_UINT32 j = 0; j < 3; j++) Blk[i][j] = _Blk[i][j]; CGU_FLOAT fWeightRed = channelWeights.r; CGU_FLOAT fWeightGreen = channelWeights.g; CGU_FLOAT fWeightBlue = channelWeights.b; // here is our grid CGU_FLOAT Fctrs[3]; Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits)); Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits)); Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits)); CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; for(CGU_UINT32 k = 0; k < 2; k++) for(CGU_UINT32 j = 0; j < 3; j++) InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; // make ramp endpoints the way they'll going to be decompressed // plus check whether the ramp is flat CGU_UINT8 Eq; CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); // build ramp for all 3 colors cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); // clusterize for the current ramp CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); if(bestE == 0.f) // if exact, we've done return bestE; // Jitter endpoints in each direction CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); for(CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++) { InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f); for(CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++) { InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f); cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4]; for(CGU_UINT32 i=0; i < _NmrClrs; i++) { for(CGU_UINT32 r = 0; r < dwNumPoints; r++) { CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); RmpErrG[r][i] = DistG * DistG * fWeightGreen; } } for(CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++) { InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + nJitterB0 * Fctrs[BC], 0.f), 255.f); for(CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++) { InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f); cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; for(CGU_UINT32 i=0; i < _NmrClrs; i++) { for(CGU_UINT32 r = 0; r < dwNumPoints; r++) { CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); RmpErr[r][i] = RmpErrG[r][i] + DistB * DistB * fWeightBlue; } } for(CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++) { InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f); for(CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++) { InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f); cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); // compute cumulative error CGU_FLOAT mse = 0.f; CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints; for(CGU_UINT32 k = 0; k < _NmrClrs; k++) { CGU_FLOAT MinErr = 10000000.f; for(CGU_INT r = 0; r < rmp_l; r++) { CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; MinErr = cmp_min(MinErr, Err); } mse += MinErr * _Rpt[k]; } // save if we achieve better result if(mse < bestE) { bestE = mse; for(CGU_UINT32 k = 0; k < 2; k++) for(CGU_UINT32 j = 0; j < 3; j++) _OutRmpPnts[j][k] = InpRmp[j][k]; } } } } } } } return bestE; } #endif #if defined(USE_REFINE) CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS], CGU_FLOAT _Rpt[BLOCK_SIZE_4X4], CGU_INT _NmrClrs, CGU_UINT8 dwNumPoints, CGU_Vec3f channelWeights, CGU_UINT32 nRedBits, CGU_UINT32 nGreenBits, CGU_UINT32 nBlueBits, CGU_UINT32 nRefineSteps ) { CGU_FLOAT ALIGN_16 Rmp[NUM_CHANNELS][MAX_POINTS]; if (nRefineSteps == 0) nRefineSteps = 1; CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS]; for(CGU_INT i = 0; i < _NmrClrs; i++) for(CGU_INT j = 0; j < 3; j++) Blk[i][j] = _Blk[i][j]; CGU_FLOAT fWeightRed = channelWeights.r; CGU_FLOAT fWeightGreen = channelWeights.g; CGU_FLOAT fWeightBlue = channelWeights.b; // here is our grid CGU_FLOAT Fctrs[3]; Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits)); Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits)); Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits)); CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; for(CGU_INT k = 0; k < 2; k++) for(CGU_INT j = 0; j < 3; j++) InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; // make ramp endpoints the way they'll going to be decompressed // plus check whether the ramp is flat CGU_UINT8 Eq; CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); // build ramp for all 3 colors cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); // clusterize for the current ramp CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights); if(bestE == 0.f) // || !nRefineSteps) // if exact, we've done return bestE; // Tweak each component in isolation and get the best values // precompute ramp errors for Green and Blue CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4]; for(CGU_INT i=0; i < _NmrClrs; i++) { for(CGU_INT r = 0; r < dwNumPoints; r++) { CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); RmpErr[r][i] = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue; } } // First Red CGU_FLOAT bstC0 = InpRmp0[RC][0]; CGU_FLOAT bstC1 = InpRmp0[RC][1]; CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8)); CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8); for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) { for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) { // make a move; both sides of interval. InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f); InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f); // make ramp endpoints the way they'll going to be decompressed // plus check whether the ramp is flat cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); // build ramp only for red cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); // compute cumulative error CGU_FLOAT mse = 0.f; CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints; for(CGU_INT k = 0; k < _NmrClrs; k++) { CGU_FLOAT MinErr = 10000000.f; for(CGU_INT r = 0; r < rmp_l; r++) { CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; MinErr = cmp_minf(MinErr, Err); } mse += MinErr * _Rpt[k]; } // save if we achieve better result if(mse < bestE) { bstC0 = InpRmp[RC][0]; bstC1 = InpRmp[RC][1]; bestE = mse; } } } // our best REDs InpRmp[RC][0] = bstC0; InpRmp[RC][1] = bstC1; // make ramp endpoints the way they'll going to be decompressed // plus check whether the ramp is flat cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); // build ramp only for green cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); // precompute ramp errors for Red and Blue for(CGU_INT i=0; i < _NmrClrs; i++) { for(CGU_INT r = 0; r < dwNumPoints; r++) { CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); RmpErr[r][i] = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue; } } // Now green bstC0 = InpRmp0[GC][0]; bstC1 = InpRmp0[GC][1]; for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) { for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) { InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f); InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f); cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); CGU_FLOAT mse = 0.f; CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints; for(CGU_INT k = 0; k < _NmrClrs; k++) { CGU_FLOAT MinErr = 10000000.f; for(CGU_INT r = 0; r < rmp_l; r++) { CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]); CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightGreen; MinErr = cmp_minf(MinErr, Err); } mse += MinErr * _Rpt[k]; } if(mse < bestE) { bstC0 = InpRmp[GC][0]; bstC1 = InpRmp[GC][1]; bestE = mse; } } } // our best GREENs InpRmp[GC][0] = bstC0; InpRmp[GC][1] = bstC1; cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints); // ramp err for Red and Green for(CGU_INT i=0; i < _NmrClrs; i++) { for(CGU_INT r = 0; r < dwNumPoints; r++) { CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); RmpErr[r][i] = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen; } } bstC0 = InpRmp0[BC][0]; bstC1 = InpRmp0[BC][1]; // Now blue for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) { for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) { InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f); InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f); cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); CGU_FLOAT mse = 0.f; CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints; for(CGU_INT k = 0; k < _NmrClrs; k++) { CGU_FLOAT MinErr = 10000000.f; for(CGU_INT r = 0; r < rmp_l; r++) { CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]); CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightBlue; MinErr = min(MinErr, Err); } mse += MinErr * _Rpt[k]; } if(mse < bestE) { bstC0 = InpRmp[BC][0]; bstC1 = InpRmp[BC][1]; bestE = mse; } } } // our best BLUEs InpRmp[BC][0] = bstC0; InpRmp[BC][1] = bstC1; // return our best choice for(CGU_INT j = 0; j < 3; j++) for(CGU_INT k = 0; k < 2; k++) _OutRmpPnts[j][k] = InpRmp[j][k]; return bestE; } #endif //====================================================================================== // Codec from CompressonatorLib //====================================================================================== #define BLOCK_SIZE_4X4 16 #define RG 5 #define GG 6 #define BG 5 /*------------------------------------------------------------------------------------------------ // this is how the end points is going to be rounded in compressed format ------------------------------------------------------------------------------------------------*/ CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS], CGU_FLOAT _Min, CGU_FLOAT _Max, CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { CGU_FLOAT Fctrs0[3]; CGU_FLOAT Fctrs1[3]; Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits)); Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits)); Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits)); for(int j = 0; j < 3; j++) { for(int k = 0; k < 2; k++) { _RmpF[j][k] = cmp_floor(_MnMx[j][k]); if(_RmpF[j][k] <= _Min) _RmpF[j][k] = _Min; else { _RmpF[j][k] += cmp_floor(128.f / Fctrs1[j]) - cmp_floor(_RmpF[j][k] / Fctrs1[j]); _RmpF[j][k] = cmp_minf(_RmpF[j][k], _Max); } _RmpF[j][k] = cmp_floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j]; } } } // Find the first approximation of the line // Assume there is a linear relation // Z = a * X_In // Z = b * Y_In // Find a,b to minimize MSE between Z and Z_In CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS], CMP_IN CGU_FLOAT LineDir0[NUM_CHANNELS], CMP_IN CGU_FLOAT fBlockCenter[NUM_CHANNELS], CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall, CMP_IN CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS], CMP_IN CGU_FLOAT _inpRpt[BLOCK_SIZE_4X4], CMP_IN int nDimensions, CMP_IN int dwUniqueColors) { CGU_FLOAT Crrl[NUM_CHANNELS]; CGU_FLOAT RGB2[NUM_CHANNELS]; CGU_INT i; LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] = 0.f; // sum position of all points CGU_FLOAT fNumPoints = 0.f; for(i=0; i < dwUniqueColors; i++) { fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i]; fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i]; fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i]; fNumPoints += _inpRpt[i]; } // and then average to calculate center coordinate of block fBlockCenter[0] /= fNumPoints; fBlockCenter[1] /= fNumPoints; fBlockCenter[2] /= fNumPoints; for(i = 0; i < dwUniqueColors; i++) { // calculate output block as offsets around block center BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0]; BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1]; BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2]; // compute correlation matrix // RGB2 = sum of ((distance from point from center) squared) // Crrl = ???????. Seems to be be some calculation based on distance from point center in two dimensions for(int j = 0; j < nDimensions; j++) { RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i]; Crrl[j] += BlkSh[i][j] * BlkSh[i][(j+1)%3] * _inpRpt[i]; } } // if set's diameter is small int i0 = 0, i1 = 1; CGU_FLOAT mxRGB2 = 0.f; int k = 0, j = 0; CGU_FLOAT fEPS = fNumPoints * EPS; for(k = 0, j = 0; j < 3; j++) { if(RGB2[j] >= fEPS) k++; else RGB2[j] = 0.f; if(mxRGB2 < RGB2[j]) { mxRGB2 = RGB2[j]; i0 = j; } } CGU_FLOAT fEPS2 = fNumPoints * EPS2; AxisIsSmall = 1; for(j = 0; j < 3; j++) { AxisIsSmall &= (RGB2[j] < fEPS2); } if(AxisIsSmall) // all are very small to avoid division on the small determinant return; if(k == 1) // really only 1 dimension LineDir0[i0]= 1.; else if(k == 2) { // really only 2 dimensions i1 = (RGB2[(i0+1)%3] > 0.f) ? (i0+1)%3 : (i0+2)%3; CGU_FLOAT Crl = (i1 == (i0+1)%3) ? Crrl[i0] : Crrl[(i0+2)%3]; LineDir0[i1] = Crl/ RGB2[i0]; LineDir0[i0]= 1.; } else { CGU_FLOAT maxDet = 100000.f; CGU_FLOAT Cs[3]; // select max det for precision for(j = 0; j < nDimensions; j++) { CGU_FLOAT Det = RGB2[j] * RGB2[(j+1)%3] - Crrl[j] * Crrl[j]; Cs[j] = abs(Crrl[j]/sqrt(RGB2[j] * RGB2[(j+1)%3])); if(maxDet < Det) { maxDet = Det; i0 = j; } } // inverse correl matrix // -- -- -- -- // | A B | | C -B | // | B C | => | -B A | // -- -- -- -- CGU_FLOAT mtrx1[2][2]; CGU_FLOAT vc1[2]; CGU_FLOAT vc[2]; vc1[0] = Crrl[(i0 + 2) %3]; vc1[1] = Crrl[(i0 + 1) %3]; // C mtrx1[0][0] = RGB2[(i0+1)%3]; // A mtrx1[1][1] = RGB2[i0]; // -B mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; // find a solution vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; // normalize vc[0] /= maxDet; vc[1] /= maxDet; // find a line direction vector LineDir0[i0] = 1.; LineDir0[(i0 + 1) %3] = 1.; LineDir0[(i0 + 2) %3] = vc[0] + vc[1]; } // normalize direction vector CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2]; Len = sqrt(Len); for(j = 0; j < 3; j++) LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f; } CMP_STATIC CGU_FLOAT cpu_RampSrchW(CGU_FLOAT Prj[BLOCK_SIZE_4X4], CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], CGU_FLOAT StepErr, CGU_FLOAT lowPosStep, CGU_FLOAT highPosStep, int dwUniqueColors, int dwNumPoints) { CGU_FLOAT error = 0.0f; CGU_FLOAT step = (highPosStep - lowPosStep)/(dwNumPoints - 1); CGU_FLOAT step_h = step * 0.5f; CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; CGU_INT i; for(i=0; i < dwUniqueColors; i++) { // Work out which value in the block this select CGU_FLOAT del = Prj[i] - lowPosStep; CGU_FLOAT v; if(del <= 0) v = lowPosStep; else if(Prj[i] - highPosStep >= 0) v = highPosStep; else v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; // And accumulate the error CGU_FLOAT d = (Prj[i] - v); d *= d; CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; error += err; if(StepErr < error) { error = StepErr; break; } } return error; } CMP_STATIC CGU_FLOAT _cpu_bc1ComputeBestEndpoints(CGU_FLOAT endpointsOut[NUM_ENDPOINTS], CGU_FLOAT endpointsIn[NUM_ENDPOINTS], CGU_FLOAT prj[BLOCK_SIZE_4X4], CGU_FLOAT prjError[BLOCK_SIZE_4X4], CGU_FLOAT preMRep[BLOCK_SIZE_4X4], int numColours, int numPoints) { CGU_FLOAT minError = MAX_ERROR; static const CGU_FLOAT searchStep = 0.025f; const CGU_FLOAT lowStart = (endpointsIn[0] - 2.0f*searchStep > 0.0f) ? endpointsIn[0] - 2.0f*searchStep : 0.0f; const CGU_FLOAT highStart = (endpointsIn[1] + 2.0f*searchStep < 1.0f) ? endpointsIn[1] + 2.0f*searchStep : 1.0f; CGU_FLOAT lowStep = lowStart; CGU_FLOAT highStep = highStart; for(int low = 0; low < 8; ++low) { for(int high = 0; high < 8; ++high) { // compute an error for the current pair of end points. CGU_FLOAT error = cpu_RampSrchW(prj, prjError, preMRep, minError, lowStep, highStep, numColours, numPoints); if(error < minError) { // save better result minError = error; endpointsOut[0] = lowStep; endpointsOut[1] = highStep; } highStep -= searchStep; } lowStep += searchStep; } return minError; } // This is a float point-based compression // it assumes that the number of unique colors is already known; input is in [0., 255.] range. // This is C version. CMP_STATIC bool cpu_CompressRGBBlockX( CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], CMP_IN CGU_FLOAT src_image[BLOCK_SIZE_4X4][NUM_CHANNELS], CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4], CMP_IN int dwUniqueColors, CMP_IN CGU_UINT8 dwNumPoints, CMP_IN bool b3DRefinement, CMP_IN CGU_UINT8 nRefinementSteps, CMP_IN CGU_FLOAT pfWeights[3], CMP_IN CGU_UINT8 nRedBits, CMP_IN CGU_UINT8 nGreenBits, CMP_IN CGU_UINT8 nBlueBits, CMP_IN CGU_FLOAT fquality ) { #if !defined(ASPM_GPU) if (!g_bc1FunctionPointersSet) { bc1ToggleSIMD(EXTENSION_COUNT); } #endif CGU_FLOAT ALIGN_16 Prj0[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 Prj[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 PrjErr[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 LineDir[NUM_CHANNELS]; CGU_FLOAT ALIGN_16 RmpIndxs[BLOCK_SIZE_4X4]; CMP_UNUSED(fquality); CMP_UNUSED(b3DRefinement) CGU_FLOAT LineDirG[NUM_CHANNELS]; CGU_FLOAT PosG[NUM_ENDPOINTS]; CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS]; CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS]; CGU_FLOAT LineDir0[NUM_CHANNELS]; CGU_FLOAT Mdl[NUM_CHANNELS]; CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; int i, j, k; // down to [0., 1.] for(i = 0; i < dwUniqueColors; i++) for(j = 0; j < 3; j++) BlkUV[i][j] = src_image[i][j] / 255.f; bool isDONE = false; // as usual if not more then 2 different colors, we've done if(dwUniqueColors <= 2) { for(j = 0; j < 3; j++) { rsltC[j][0] = src_image[0][j]; rsltC[j][1] = src_image[dwUniqueColors - 1][j]; } isDONE = true; } if ( !isDONE ) { // This is our first attempt to find an axis we will go along. // The cumulation is done to find a line minimizing the MSE from the input 3D points. CGU_UINT8 bSmall; cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors); // While trying to find the axis we found that the diameter of the input set is quite small. // Do not bother. if(bSmall) { for(j = 0; j < 3; j++) { rsltC[j][0] = src_image[0][j]; rsltC[j][1] = src_image[dwUniqueColors - 1][j]; } isDONE = true; } } // GCC is being an awful being when it comes to goto-jumps. // So please bear with this. if ( !isDONE ) { CGU_FLOAT ErrG = 10000000.f; CGU_FLOAT PrjBnd[NUM_ENDPOINTS]; CGU_FLOAT ALIGN_16 PreMRep[BLOCK_SIZE_4X4]; for(j =0; j < 3; j++) LineDir[j] = LineDir0[j]; // Here is the main loop. // 1. Project input set on the axis in consideration. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. // 4. Present our color channels as 3 16DIM vectors. // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector. // 6. Plug the projections as a new directional vector for the axis. // 7. Goto 1. // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized). // Ci - is a 16 dim vector of color i. // for each Ci find a scalar Ai such that // (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min. // You can think of D as a unit interval(vector) "clusterizer", // and Ai is a scale you need to apply to the clusterizer to // approximate the Ci vector instead of the unit vector. // Solution is // Ai = (D . Ci) / (D . D); . - is a dot product. // in 3 dim space Ai(s) represent a line direction, along which // we again try to find (sub)optimal quantizer. // That's what our for(;;) loop is about. for(;;) { // 1. Project input set on the axis in consideration. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is // P + ((R-P).v) / (v.v))v // The distance along v is therefore (R-P).v / (v.v) // (v.v) is 1 if v is a unit vector. // PrjBnd[0] = 1000.; PrjBnd[1] = -1000.; for(i = 0; i < BLOCK_SIZE_4X4; i++) Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; for(i = 0; i < dwUniqueColors; i++) { Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2]; PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i]) + (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i]) + (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]); PrjBnd[0] = min(PrjBnd[0], Prj[i]); PrjBnd[1] = max(PrjBnd[1], Prj[i]); } // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points. // min and max of the search interval CGU_FLOAT stepf = 0.125f; CGU_FLOAT Scl[NUM_ENDPOINTS]; Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf; Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf; // No range found exit if (Scl[0] == Scl[1]) { return false; } // compute scaling factor to scale down the search interval to [0.,1] const CGU_FLOAT Scl2 = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]); const CGU_FLOAT overScl = 1.f/(Scl[1] - Scl[0]); for(i = 0; i < dwUniqueColors; i++) { // scale them Prj[i] = (Prj[i] - Scl[0]) * overScl; // premultiply the scale squire to plug into error computation later PreMRep[i] = Rpt[i] * Scl2; } // scale first approximation of end points PrjBnd[0] = (PrjBnd[0] - Scl[0]) * overScl; PrjBnd[1] = (PrjBnd[1] - Scl[0]) * overScl; // find the best endpoints CGU_FLOAT Pos[NUM_ENDPOINTS]; #if defined(ASPM_GPU) CGU_FLOAT StepErr = _cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); #else CGU_FLOAT StepErr = cpu_bc1ComputeBestEndpoints(Pos, PrjBnd, Prj, PrjErr, PreMRep, dwUniqueColors, dwNumPoints); #endif // inverse the scaling Pos[0] = Pos[0] * (Scl[1] - Scl[0])+ Scl[0]; Pos[1] = Pos[1] * (Scl[1] - Scl[0])+ Scl[0]; // did we find somthing better from the previous run? if(StepErr + 0.001 < ErrG) { // yes, remember it ErrG = StepErr; LineDirG[0] = LineDir[0]; LineDirG[1] = LineDir[1]; LineDirG[2] = LineDir[2]; PosG[0] = Pos[0]; PosG[1] = Pos[1]; // 3. Compute the vector of indexes (or clusters) for the current approximate ramp. // indexes const CGU_FLOAT step = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1); const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; const CGU_FLOAT overBlkTp = 1.f/ (CGU_FLOAT)(dwNumPoints - 1) ; // here the index vector is computed, // shifted and normalized CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f; for(i=0; i < dwUniqueColors; i++) { CGU_FLOAT del; //int n = (int)((b - _min_ex + (step*0.5f)) * rstep); if((del = Prj0[i] - Pos[0]) <= 0) RmpIndxs[i] = 0.f; else if(Prj0[i] - Pos[1] >= 0) RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1); else RmpIndxs[i] = cmp_floor((del + step_h) * rstep); // shift and normalization RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; } // 4. Present our color channels as 3 16DIM vectors. // 5. Find closest aproximation of each of 16DIM color vector with the pojection of the 16DIM index vector. CGU_FLOAT Crs[3], Len, Len2; for(i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++) { const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i]; Len += RmpIndxs[i] * PreMlt; for(j = 0; j < 3; j++) Crs[j] += BlkSh[i][j] * PreMlt; } LineDir[0] = LineDir[1] = LineDir[2] = 0.f; if(Len > 0.f) { LineDir[0] = Crs[0]/ Len; LineDir[1] = Crs[1]/ Len; LineDir[2] = Crs[2]/ Len; // 6. Plug the projections as a new directional vector for the axis. // 7. Goto 1. Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2]; Len2 = sqrt(Len2); LineDir[0] /= Len2; LineDir[1] /= Len2; LineDir[2] /= Len2; } } else // We was not able to find anything better. Drop dead. break; } // inverse transform to find end-points of 3-color ramp for(k = 0; k < 2; k++) for(j = 0; j < 3; j++) rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f; } // We've dealt with (almost) unrestricted full precision realm. // Now back to the dirty digital world. // round the end points to make them look like compressed ones CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS]; cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits); // Try using this on 3 channels // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned); // This not a small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while other 2 are fixed. // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp. #if defined(USE_REFINE) || defined(USE_REFINE3D) switch(nRefinementSteps) { case 1: cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits,3); break; case 2: if (dwUniqueColors > 2) cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); else cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits,3); break; default: cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1); break; } #endif return true; } // CPU: CompRGBBlock() CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16], CGU_UINT32 compressedBlock[2], CGU_UINT32 dwBlockSize, CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits, CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], CGU_UINT8 pcIndices[BLOCK_SIZE_4X4], CGU_UINT8 dwNumPoints, bool b3DRefinement, CGU_UINT8 m_nRefinementSteps, CGU_FLOAT _pfChannelWeights[3], bool _bUseAlpha, CGU_UINT8 _nAlphaThreshold) { CGU_FLOAT ALIGN_16 Rpt[BLOCK_SIZE_4X4]; CGU_FLOAT ALIGN_16 BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS]; CGU_UINT32 mx; for (mx=0; mx < BLOCK_SIZE_4X4; mx++) { Rpt[mx] = 0; BlkIn[mx][0] = 0; BlkIn[mx][1] = 0; BlkIn[mx][2] = 0; BlkIn[mx][3] = 0; } compressedBlock[0] = 0; CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; CGU_UINT32 dwColors = 0; CGU_UINT32 dwBlk[BLOCK_SIZE]; for(CGU_UINT32 i = 0; i < dwBlockSize; i++) if(!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold) dwBlk[dwColors++] = block_32[i] | 0xff000000; // Do we have any colors ? static int id=0; if(dwColors) { bool bHasAlpha = (dwColors != dwBlockSize); if(bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1)) return CMP_FLT_MAX; // Here we are computing an unique number of colors. // For each unique value we compute the number of it appearences. //qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); #ifndef ASPM_GPU // this is here for reminder when code moves to GPU std::sort(dwBlk, dwBlk + 15); #else { CGU_UINT32 j; CMP_di what[BLOCK_SIZE_4X4]; for (i = 0; i < dwColors; i++) { what[i].index = i; what[i].data = dwBlk[i]; } CGU_UINT32 tmp_index; CGU_UINT32 tmp_data; for (i = 1; i < dwColors; i++) { for (j = i; j > 0; j--) { if (what[j - 1].data > what[j].data) { tmp_index = what[j].index; tmp_data = what[j].data; what[j].index = what[j - 1].index; what[j].data = what[j - 1].data; what[j - 1].index = tmp_index; what[j - 1].data = tmp_data; } } } for (i = 0; i < dwColors; i++) dwBlk[i] = what[i].data; } #endif CGU_UINT32 new_p; CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4]; CGU_UINT32 dwUniqueColors = 0; new_p = dwBlkU[0] = dwBlk[0]; Rpt[dwUniqueColors] = 1.f; CGU_UINT32 i; for( i = 1; i < dwColors; i++) { if(new_p != dwBlk[i]) { dwUniqueColors++; new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; Rpt[dwUniqueColors] = 1.f; } else Rpt[dwUniqueColors] += 1.f; } dwUniqueColors++; // switch to float for( i=0; i> 16) & 0xff); // R BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff); // G BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff); // B BlkIn[i][AC] = 255.0f; } CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; if (cpu_CompressRGBBlockX(rsltC, // CMP_EndPoints = CompressRGBBlock_Slow2 ( BlkIn, // CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4] Rpt, // CGU_FLOAT Rpt[BLOCK_SIZE_4X4], dwUniqueColors, // CGU_UINT32 dwUniqueColors, dwNumPoints, // CGU_UINT32 dwNumPoints, b3DRefinement, // m_nRefinementSteps, // CGU_UINT32 m_nRefinementSteps, _pfChannelWeights, // CGU_Vec3f channelWeightsBGR, nRedBits, // ); nGreenBits, nBlueBits, 1.0f) ) { // return to integer realm for(int ch = 0; ch < 3; ch++) for(int j = 0; j < 2; j++) nEndpoints[ch][j] = (CGU_UINT8 )rsltC[ch][j]; //printf("Endpoints {%3d,%3d,%3d} {%3d,%3d,%3d} ", nEndpoints[0][0],nEndpoints[1][0],nEndpoints[2][0], // nEndpoints[0][1],nEndpoints[1][1],nEndpoints[2][1]); // Now get the indices using the new end points return cpu_Clstr(block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha,_nAlphaThreshold, nRedBits, nGreenBits, nBlueBits); } else { CGU_FLOAT CompErr = CMP_FLT_MAX; if (dwNumPoints < 4) { CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4]; for (CGU_UINT32 px = 0; px < 16; px++) { src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff)/ 255.0f; src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff)/ 255.0f; src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff)/ 255.0f; } // Do a quick compression test CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered CGU_Vec3f average_rgb; // The centrepoint of the axis CGU_FLOAT errLQ = CMP_FLT_MAX; cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false,srcRGB, average_rgb, errLQ); CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false,srcRGB, average_rgb, CompErr); compressedBlock[0] = cmp.x; compressedBlock[1] = cmp.y; } return CompErr; } } else { // All colors transparent nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0; nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff; for (CGU_UINT32 ms=0; ms 0) { //return cmpBlockBlue; cmpBlock.x = compressedBlock[0]; cmpBlock.y = compressedBlock[1]; err = fError3; } else { CGU_FLOAT fError4 = CMP_FLT_MAX; fError4 = (fError3 == 0.0) ? CMP_FLT_MAX :cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock, compressedBlock, BLOCK_SIZE_4X4, RG, GG, BG, nEndpoints[1], nIndices[1], 4, BC15Options.m_b3DRefinement, BC15Options.m_nRefinementSteps, pfChannelWeights, BC15Options.m_bUseAlpha, BC15Options.m_nAlphaThreshold); CGU_UINT32 nMethod; if (fError3 <= fError4) { err = fError3; nMethod = 0; } else { err = fError4; nMethod = 1; } CGU_UINT32 c0 = BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8-RG)), (nEndpoints[nMethod][GC][0] >> (8-GG)), (nEndpoints[nMethod][BC][0] >> (8-BG))); CGU_UINT32 c1 = BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8-RG)), (nEndpoints[nMethod][GC][1] >> (8-GG)), (nEndpoints[nMethod][BC][1] >> (8-BG))); if(nMethod == 1 && c0 <= c1 || nMethod == 0 && c0 > c1) compressedBlock[0] = c1 | (c0<<16); else compressedBlock[0] = c0 | (c1<<16); compressedBlock[1] = 0; for(CGU_UINT32 i=0; i<16; i++) compressedBlock[1] |= (nIndices[nMethod][i] << (2*i)); cmpBlock.x = compressedBlock[0]; cmpBlock.y = compressedBlock[1]; } return cmpBlock; } #endif #ifdef ENABLE_NEW_CODE //---------------------------------------- Common Utility Code ------------------------------------------------------- // 1 - Dim error CMP_STATIC CGU_FLOAT cgu_RampSrchW( CGU_FLOAT Prj[BLOCK_SIZE_4X4], CGU_FLOAT PrjErr[BLOCK_SIZE_4X4], CGU_FLOAT PreMRep[BLOCK_SIZE_4X4], CGU_FLOAT StepErr, CGU_FLOAT lowPosStep, CGU_FLOAT highPosStep, CGU_UINT32 dwUniqueColors, CGU_UINT32 dwNumPoints) { CGU_FLOAT error = 0; CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1); CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) { CGU_FLOAT v; // Work out which value in the block this select CGU_FLOAT del; if ((del = Prj[i] - lowPosStep) <= 0) v = lowPosStep; else if (Prj[i] - highPosStep >= 0) v = highPosStep; else v = cmp_floor((del + step_h) * rstep) * step + lowPosStep; // And accumulate the error CGU_FLOAT d = (Prj[i] - v); d *= d; CGU_FLOAT err = PreMRep[i] * d + PrjErr[i]; error += err; if (StepErr < error) { error = StepErr; break; } } return error; } CMP_STATIC CGU_UINT32 cgu_processCluster( CMP_IN CMP_EndPoints EndPoints, CMP_IN CGU_Vec4f rgbBlock_normal[BLOCK_SIZE_4X4], CMP_IN CGU_UINT32 dwAlphaThreshold, CMP_IN CGU_Vec3f channelWeights, CMP_IN CGU_UINT8 indices[BLOCK_SIZE_4X4], CMP_OUT CGU_FLOAT CMP_REFINOUT Err ) { Err = 0.f; CGU_UINT32 pcIndices = 0; CGU_UINT32 R, G, B; R = (CGU_UINT32)(EndPoints.Color0.z); G = (CGU_UINT32)(EndPoints.Color0.y); B = (CGU_UINT32)(EndPoints.Color0.x); CGU_INT32 cluster0 = cmp_constructColor(R, G, B); R = (CGU_UINT32)(EndPoints.Color1.z); G = (CGU_UINT32)(EndPoints.Color1.y); B = (CGU_UINT32)(EndPoints.Color1.x); CGU_INT32 cluster1 = cmp_constructColor(R, G, B); CGU_Vec3f InpRmp[NUM_ENDPOINTS]; if ((cluster0 <= cluster1) // valid for 4 channels // || (cluster0 > cluster1) // valid for 3 channels ) { // inverse endpoints InpRmp[0] = EndPoints.Color1; InpRmp[1] = EndPoints.Color0; } else { InpRmp[0] = EndPoints.Color0; InpRmp[1] = EndPoints.Color1; } CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4]; CGU_FLOAT srcblockA[BLOCK_SIZE_4X4]; // Swizzle the source RGB to BGR for processing for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) { srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f; srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f; srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f; srcblockA[i] = 0.0f; //if (dwAlphaThreshold > 0) //{ // CGU_UINT32 alpha = (CGU_UINT32)BlockA[i]; // if (alpha >= dwAlphaThreshold) // srcblockA[i] = BlockA[i]; //} } // cmp_ClstrBas2() // input ramp is on the coarse grid // make ramp endpoints the way they'll going to be decompressed CGU_Vec3f InpRmpL[NUM_ENDPOINTS]; CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG { // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp); InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs); InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f); InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs); InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f); } // MkWkRmpPts // build ramp CGU_Vec3f LerpRmp[4]; CGU_Vec3f offset = {1.0f, 1.0f, 1.0f}; { //BldRmp(Rmp, InpRmpL, dwNumChannels); // linear interpolate end points to get the ramp LerpRmp[0] = InpRmpL[0]; LerpRmp[3] = InpRmpL[1]; LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f); LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f); } // BldRmp //========================================================================= // Clusterize, Compute error and find DXTC indexes for the current cluster //========================================================================= { // Clusterize CGU_UINT32 alpha; // For each colour in the original block assign it // to the closest cluster and compute the cumulative error for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++) { alpha = (CGU_UINT32)srcblockA[i]; if ((dwAlphaThreshold > 0) && alpha == 0) { //*((CGU_UINT32 *)&_Blk[i][AC]) == 0) pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4) indices[i] = 4; } else { CGU_FLOAT shortest = 99999999999.f; CGU_UINT8 shortestIndex = 0; CGU_Vec3f channelWeightsBGR; channelWeightsBGR.x = channelWeights.z; channelWeightsBGR.y = channelWeights.y; channelWeightsBGR.z = channelWeights.x; for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++) { // r is either 1 or 4 // calculate the distance for each component CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR)); if (distance < shortest) { shortest = distance; shortestIndex = rampindex; } } Err += shortest; // The total is a sum of (error += shortest) // We have the index of the best cluster, so assign this in the block // Reorder indices to match correct DXTC ordering if (shortestIndex == 3) // dwNumChannels - 1 shortestIndex = 1; else if (shortestIndex) shortestIndex++; pcIndices |= cmp_set2Bit32(shortestIndex, i); indices[i] = shortestIndex; } } // BLOCK_SIZE_4X4 } // Clusterize return pcIndices; } #endif // Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4], CMP_IN CMP_BC15Options BC15Options) { bool usingMaxQualityOnly = false; #ifndef ASPM_GPU if (BC15Options.m_fquality > 0.75) usingMaxQualityOnly = true; #endif CGU_FLOAT CompErr = CMP_FLT_MAX; CGU_Vec2ui cmpBlock = {0U,0U}; CGU_Vec2ui cmpBlockTemp = {0U,0U}; CGU_FLOAT CompErrTemp; // Transfer to RGB Norm from RGBA Norm CGU_Vec3f src_imageRGBNorm[16]; CGU_Vec4uc pixels[16]; CGU_Vec4uc pixelsBGRA[16]; for (CGU_UINT32 sr = 0; sr < 16; sr++) { src_imageRGBNorm[sr] = src_imageNorm[sr].rgb; pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f; pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f; pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f; pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f; } // check for a punch through transparent alpha setting if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha)) { CGU_Vec2ui cmpBlockAlpha = {0xffff0000,0xffffffffU}; for (CGU_UINT32 sr = 0; sr < 16; sr++) if (pixels[sr].a < BC15Options.m_nAlphaThreshold) { return cmpBlockAlpha; } } //================ // extern codec //================ // For debugging // CGU_Vec2ui cmpBlockRed = {0xF800F800,0x00000000}; // CGU_Vec2ui cmpBlockGreen = {0x7E007E00,0x00000000}; // CGU_Vec2ui cmpBlockBlue = {0x1F001F00,0x00000000}; if (!BC15Options.m_bUseAlpha ) { //========================================== // Gain +0.3 dB for images with soild blocks //========================================== bool bAllColoursEqual = true; // Load the whole 4x4 block for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i) { for (CGU_INT c = 0; c < 3; c++) bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]); } if (bAllColoursEqual) { cmpBlock = cgu_solidColorBlock(pixels[0].x,pixels[0].y,pixels[0].z); CompErr = cgu_RGBABlockErrorLinear(pixels, cmpBlock); if (BC15Options.m_nRefinementSteps < 1) return cmpBlock; } } if (!usingMaxQualityOnly) { //==================================== // Get src image data, min,max... //===================================== //CMP_EncodeData edata; //cmp_get_encode_data(edata,pixels); if (!BC15Options.m_bUseAlpha) { //==================================== // Fast Compression, low quality //===================================== CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered CGU_Vec3f average_rgb; // The centrepoint of the axis CGU_FLOAT errLQ = CMP_FLT_MAX; cmpBlockTemp = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB,srcRGB, average_rgb, errLQ); if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f)) return cmpBlockTemp; if (CompErr > errLQ) { CompErr = errLQ; cmpBlock = cmpBlockTemp; } cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB,srcRGB, average_rgb, errLQ); if (CompErr > errLQ) { CompErr = errLQ; cmpBlock = cmpBlockTemp; } if (BC15Options.m_fquality < CMP_QUALITY1) return cmpBlock; } //======================================== // use GPU codec lower quality then CPU //======================================== cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm,BC15Options); CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); if (CompErr > CompErrTemp) { CompErr = CompErrTemp; cmpBlock = cmpBlockTemp; } if (BC15Options.m_fquality < CMP_QUALITY2) return cmpBlock; }// if useCGUCodecs //==================================== // High Quality Codec CPU only //===================================== #ifndef ASPM_GPU cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA,BC15Options,CompErrTemp); CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp); if (CompErr > CompErrTemp) { CompErr = CompErrTemp; cmpBlock = cmpBlockTemp; } #endif return cmpBlock; }