//===================================================================================
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==================================================================================
//----------------------------------------------------------------------------------
// File: BC7Encode.hlsl
//
// The Compute Shader for the BC7 Encoder
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//----------------------------------------------------------------------------------

#ifdef ASPM_GPU
#pragma warning(disable : 3078)  // "loop control variable conflicts with a previous declaration in the outer scope"
#else  // using CPU
#include "common_def.h"
#include "bcn_common_api.h"
#include
#endif

// TryMode456CS
#define ENABLE_MODE4
#define ENABLE_MODE5
#define ENABLE_MODE6

// TryMode02CS
#define ENABLE_MODE0
#define ENABLE_MODE2

// TryMode137CS
#define ENABLE_MODE1
#define ENABLE_MODE3
#define ENABLE_MODE7

//#define ENABLE_CMP_MODE0
//#define ENABLE_CMP_MODE1
//#define ENABLE_CMP_MODE2
//#define ENABLE_CMP_MODE3
//#define ENABLE_CMP_MODE4
//#define ENABLE_CMP_MODE5
#define ENABLE_CMP_MODE6
//#define ENABLE_CMP_MODE7

#define ENABLE_CMP_API
#define USE_NEW_SP_ERR_IDX
#define ENABLE_CMP_REFINE_MODE6_API  // API to improve mode 6 quality

#define MAX_TRY_SHAKER 1  // used in cmp_ep_shaker

//====================================================================================
// HLSL Host Simulation
//====================================================================================
// Simulating the HLSL compute code on a CPU host must run single threaded.
// On the CPU, the code simulates a single compute unit as used by the CMP DXC host.
// Enable SIMULATE_GPU to run the simulation on the CPU using HPC in the CMP GUI or CMP CLI.
// Note: some bcn_encode_kernel.cpp files have specific code to simulate with; enable
// the define USE_NEW_SINGLE_HEADER_INTERFACES and pick the external or local codec
// to run with.
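//------------------------------------------------------------------------------------
// Encoding pass overview (documentation sketch; the pass order below mirrors the
// HLSLHost simulation further down, and a full GPU host is assumed to dispatch
// TryMode137CS and TryMode02CS once per enabled mode):
//   1. TryMode456CS  - candidates for modes 4/5/6 (single subset)
//   2. TryMode137CS  - candidates for modes 1/3/7 (g_mode_id selects the mode)
//   3. TryMode02CS   - candidates for modes 0/2   (g_mode_id selects the mode)
//   4. EncodeBlocks  - packs the lowest-error candidate into the 128-bit BC7 block
// Each pass reads the previous best candidate from g_InBuff and writes its result
// to g_OutBuff1, so skipping a pass only reduces quality, not correctness.
//------------------------------------------------------------------------------------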
//===========================================================================
// Prototype to debug a simple simulation of the shader using shared global
// data, run as a single thread on the CPU
// #define SIMULATE_GPU
//===========================================================================

#if !defined(ASPM_GPU)

#define THREAD_GROUP_SIZE 64
#define BLOCK_SIZE_X 4
#define BLOCK_SIZE_Y 4
#define MAX_UINT 0xFFFFFFFF
#define MIN_UINT 0x00000000

// Source Texture to process
// Texture2D g_Input;  // Normalized 0..1
struct Texture2D
{
    CGU_Vec4f Texture[16];

    CGU_Vec4f Load(CGU_Vec3ui index)
    {
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        return Texture[offset];
    };

    // Ignoring z in Texture2D load
    CGU_Vec4f Load(CGU_Vec3ui index, CGU_UINT32 z)
    {
        CMP_UNUSED(z);
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        return Texture[offset];
    };

    CGU_Vec4ui Load(CGU_Vec4ui index)
    {
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        // implicit conversion of float to uint
        CGU_Vec4ui res;
        res.x = Texture[offset].x;
        res.y = Texture[offset].y;
        res.z = Texture[offset].z;
        res.w = Texture[offset].w;
        return res;
    };
};

// matches GPU struct in HLSL
struct BufferShared
{
    CGU_Vec4ui pixel;
    CGU_UINT32 error;
    CGU_UINT32 mode;
    CGU_UINT32 partition;
    CGU_UINT32 index_selector;
    CGU_UINT32 rotation;
    CGU_UINT32 pbit;
    CGU_Vec4ui endPoint_low;
    CGU_Vec4ui endPoint_high;
    CGU_Vec4ui endPoint_low_quantized;
    CGU_Vec4ui endPoint_high_quantized;
    CGU_UINT32 colorindex;
    CGU_UINT32 alphaindex;
};

struct SharedIOData
{
    CGU_UINT32 error;
    CGU_UINT32 mode;
    CGU_UINT32 index_selector;
    CGU_UINT32 rotation;
    CGU_UINT32 partition;
    CGU_Vec4ui data2;
};

CMP_STATIC BufferShared shared_temp[THREAD_GROUP_SIZE];
CMP_STATIC Texture2D    g_Input;

// cbuffer input: on the CPU we use 1 block
CMP_STATIC CGU_UINT32 g_tex_width;          // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_num_block_x = 1;
CMP_STATIC CGU_UINT32 g_format;             // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_mode_id = 1;
CMP_STATIC CGU_UINT32 g_start_block_id = 0;
CMP_STATIC CGU_UINT32 g_num_total_blocks;
CMP_STATIC CGU_FLOAT  g_alpha_weight = 1.0f;
CMP_STATIC CGU_FLOAT  g_quality      = 1.0f;

CMP_STATIC SharedIOData g_InBuff[THREAD_GROUP_SIZE];
CMP_STATIC CGU_Vec4ui   g_OutBuff[THREAD_GROUP_SIZE];   // Used by EncodeBlocks & TryMode...
CMP_STATIC SharedIOData g_OutBuff1[THREAD_GROUP_SIZE];  // Used by TryMode...
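// Illustrative read-back of the simulated texture (an example only; not called by
// the encoder). The host-side Texture2D stores a single 4x4 block, and Load()
// wraps x + y*4 into the 16-texel array, so texel (x,y) is fetched as follows:
CMP_STATIC CGU_Vec4f cmp_example_load_texel(CGU_UINT32 x, CGU_UINT32 y)
{
    CGU_Vec3ui coord;
    coord.x = x;
    coord.y = y;
    coord.z = 0;  // mip level; the simulation ignores it
    return g_Input.Load(coord);
}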
// Forward definitions
void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);

CMP_STATIC void HLSLHost(CGU_Vec4f image_src[16])
{
    //====================================
    // Simulate a single block CS
    //====================================

    // Load image_src
    CGU_Vec4ui imageBlock[16];
    for (CGU_INT i = 0; i < 16; i++)
    {
        g_Input.Texture[i].x = image_src[i].x / 255.0f;
        g_Input.Texture[i].y = image_src[i].y / 255.0f;
        g_Input.Texture[i].z = image_src[i].z / 255.0f;
        g_Input.Texture[i].w = image_src[i].w / 255.0f;
    }

    // Init global buffers for first time use
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memset(&shared_temp[i], 0, sizeof(BufferShared));
        memset(&g_InBuff[i], 0, sizeof(SharedIOData));
        memset(&g_OutBuff1[i], 0, sizeof(SharedIOData));
    }

    // First shader call
    CGU_Vec3ui SV_GroupID = {0, 0, 0};  // = Dispatch(1..(n-1),1,1) where n = number of (4x4) blocks in the image
    CGU_Vec3ui SV_GroupThreadID = {0, 0, 0};
    g_start_block_id = 0;

    // // Global Group Memory Sync for Pixel
    // for (CGU_INT i = 0; i < 16; i++)
    // {
    //     CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(i % 4, i / 4, 0));
    //     px = cmp_clampVec4f(px * 255.0f, 0.0f, 255.0f);
    //     //printf("in px[%2d] %3.0f %3.0f %3.0f\n",i, px.x, px.y, px.z);
    //     shared_temp[i].pixel.r = (CGU_UINT32)px.r;
    //     shared_temp[i].pixel.g = (CGU_UINT32)px.g;
    //     shared_temp[i].pixel.b = (CGU_UINT32)px.b;
    //     shared_temp[i].pixel.a = (CGU_UINT32)px.a;
    // }

    g_mode_id = 6;
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode456CS(SV_GroupIndex, SV_GroupID);
    }

    // Return Outbuff back to inbuff for next CS use
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }

    // Global Group Memory Sync for Pixel
    //for (CGU_INT i = 0; i < 16; i++)
    //{
    //    CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(i % 4, i / 4, 0));
    //    px = cmp_clampVec4f(px * 255.0f, 0.0f, 255.0f);
    //    shared_temp[i].pixel.r = (CGU_UINT32)px.r;
    //    shared_temp[i].pixel.g = (CGU_UINT32)px.g;
    //    shared_temp[i].pixel.b = (CGU_UINT32)px.b;
    //    shared_temp[i].pixel.a = (CGU_UINT32)px.a;
    //}

    // Next shader call
    g_mode_id = 1;
    for (CGU_INT SV_GroupIndex = 63; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode137CS(SV_GroupIndex, SV_GroupID);
    }

    // Return Outbuff back to inbuff for next shader call
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }

    // Final shader call
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        EncodeBlocks(SV_GroupIndex, SV_GroupID);
    }
}
#endif

#ifdef ENABLE_CMP_API
// Change this to CGU_Vec4ui par_vectors42_nd[4][2];
CMP_STATIC CMP_CONSTANT CGU_UINT32 par_vectors42_nd[4][2][4] = {
    // type = 2
    {{0, 0, 0, 0}, {0, 0, 0, 0}},  // 0 {0,0}
    {{0, 0, 0, 0}, {1, 1, 1, 1}},  // 1 {0,1}
    {{1, 1, 1, 1}, {0, 0, 0, 0}},  // 2 {1,0}
    {{1, 1, 1, 1}, {1, 1, 1, 1}}   // 3 {1,1}
};

#define COMP_RED 0
#define COMP_GREEN 1
#define COMP_BLUE 2
#define COMP_ALPHA 3

typedef struct
{
    CGU_UINT32 numPartitionModes;
    CGU_UINT32 maxSubSets;
    CGU_UINT32 channels3or4;
    CGU_UINT32 bits;
    CGU_UINT32 clusters;
    CGU_UINT32 componentBits;
    CGU_UINT32 partitionBits;
    CGU_UINT32 indexBits;
} MODESETTINGS;
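// Example reading of the table below (documentation only): mode 1 is listed as
// {64, 2, 3, 37, 8, 6, 6, 3}, i.e. 64 partition patterns, 2 subsets, 3 color
// channels (RGB), 37 bits of endpoint data, 8 clusters, 6-bit components,
// 6 partition bits and 3-bit indices.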
// numPartitionModes, maxSubSets, channels3or4, bits, clusters, componentBits, partitionBits, indexBits
CMP_STATIC CMP_CONSTANT MODESETTINGS g_modesettings[8] = {
    {16, 3, 3, 26,  8, 4, 4, 3},  // Mode 0
    {64, 2, 3, 37,  8, 6, 6, 3},  // Mode 1
    {64, 3, 3, 30,  4, 5, 6, 2},  // Mode 2
    {64, 2, 3, 44,  4, 7, 6, 2},  // Mode 3
    { 0, 0, 0,  0,  0, 0, 0, 2},  // Mode 4
    { 0, 0, 0,  0,  0, 0, 0, 2},  // Mode 5
    { 0, 0, 4, 58, 16, 7, 0, 4},  // Mode 6
    {64, 2, 4, 42,  4, 5, 6, 2}   // Mode 7
};

#ifndef ASPM_HLSL
//=======================================================
CMP_STATIC CMP_CONSTANT CGU_UINT32 subset_mask_table2[128] = {
    // 2 subset region patterns
    0x0000CCCCu,  // 0   1100 1100 1100 1100  (MSB..LSB)
    0x00008888u,  // 1   1000 1000 1000 1000
    0x0000EEEEu,  // 2   1110 1110 1110 1110
    0x0000ECC8u,  // 3   1110 1100 1100 1000
    0x0000C880u,  // 4   1100 1000 1000 0000
    0x0000FEECu,  // 5   1111 1110 1110 1100
    0x0000FEC8u,  // 6   1111 1110 1100 1000
    0x0000EC80u,  // 7   1110 1100 1000 0000
    0x0000C800u,  // 8   1100 1000 0000 0000
    0x0000FFECu,  // 9   1111 1111 1110 1100
    0x0000FE80u,  // 10  1111 1110 1000 0000
    0x0000E800u,  // 11  1110 1000 0000 0000
    0x0000FFE8u,  // 12  1111 1111 1110 1000
    0x0000FF00u,  // 13  1111 1111 0000 0000
    0x0000FFF0u,  // 14  1111 1111 1111 0000
    0x0000F000u,  // 15  1111 0000 0000 0000
    0x0000F710u,  // 16  1111 0111 0001 0000
    0x0000008Eu,  // 17  0000 0000 1000 1110
    0x00007100u,  // 18  0111 0001 0000 0000
    0x000008CEu,  // 19  0000 1000 1100 1110
    0x0000008Cu,  // 20  0000 0000 1000 1100
    0x00007310u,  // 21  0111 0011 0001 0000
    0x00003100u,  // 22  0011 0001 0000 0000
    0x00008CCEu,  // 23  1000 1100 1100 1110
    0x0000088Cu,  // 24  0000 1000 1000 1100
    0x00003110u,  // 25  0011 0001 0001 0000
    0x00006666u,  // 26  0110 0110 0110 0110
    0x0000366Cu,  // 27  0011 0110 0110 1100
    0x000017E8u,  // 28  0001 0111 1110 1000
    0x00000FF0u,  // 29  0000 1111 1111 0000
    0x0000718Eu,  // 30  0111 0001 1000 1110
    0x0000399Cu,  // 31  0011 1001 1001 1100
    0x0000AAAAu,  // 32  1010 1010 1010 1010
    0x0000F0F0u,  // 33  1111 0000 1111 0000
    0x00005A5Au,  // 34  0101 1010 0101 1010
    0x000033CCu,  // 35  0011 0011 1100 1100
    0x00003C3Cu,  // 36  0011 1100 0011 1100
    0x000055AAu,  // 37  0101 0101 1010 1010
    0x00009696u,  // 38  1001 0110 1001 0110
    0x0000A55Au,  // 39  1010 0101 0101 1010
    0x000073CEu,  // 40  0111 0011 1100 1110
    0x000013C8u,  // 41  0001 0011 1100 1000
    0x0000324Cu,  // 42  0011 0010 0100 1100
    0x00003BDCu,  // 43  0011 1011 1101 1100
    0x00006996u,  // 44  0110 1001 1001 0110
    0x0000C33Cu,  // 45  1100 0011 0011 1100
    0x00009966u,  // 46  1001 1001 0110 0110
    0x00000660u,  // 47  0000 0110 0110 0000
    0x00000272u,  // 48  0000 0010 0111 0010
    0x000004E4u,  // 49  0000 0100 1110 0100
    0x00004E40u,  // 50  0100 1110 0100 0000
    0x00002720u,  // 51  0010 0111 0010 0000
    0x0000C936u,  // 52  1100 1001 0011 0110
    0x0000936Cu,  // 53  1001 0011 0110 1100
    0x000039C6u,  // 54  0011 1001 1100 0110
    0x0000639Cu,  // 55  0110 0011 1001 1100
    0x00009336u,  // 56  1001 0011 0011 0110
    0x00009CC6u,  // 57  1001 1100 1100 0110
    0x0000817Eu,  // 58  1000 0001 0111 1110
    0x0000E718u,  // 59  1110 0111 0001 1000
    0x0000CCF0u,  // 60  1100 1100 1111 0000
    0x00000FCCu,  // 61  0000 1111 1100 1100
    0x00007744u,  // 62  0111 0111 0100 0100
    0x0000EE22u,  // 63  1110 1110 0010 0010
    // 3 Subset region patterns
    0xF60008CCu,  // 0   1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
    0x73008CC8u,  // 1   0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
    0x3310CC80u,  // 2   0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
    0x00CEEC00u,  // 3   0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
    0xCC003300u,  // 4   1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
    0xCC0000CCu,  // 5   1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
    0x00CCFF00u,  // 6   0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
    0x3300CCCCu,  // 7   0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
    0xF0000F00u,  // 8   1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
    0xF0000FF0u,  // 9   1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
    0xFF0000F0u,  // 10  1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
    0x88884444u,  // 11  1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
    0x88886666u,  // 12  1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
    0xCCCC2222u,  // 13  1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
    0xEC80136Cu,  // 14  1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
    0x7310008Cu,  // 15  0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
    0xC80036C8u,  // 16  1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
    0x310008CEu,  // 17  0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
    0xCCC03330u,  // 18  1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
    0x0CCCF000u,  // 19  0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
    0xEE0000EEu,  // 20  1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
    0x77008888u,  // 21  0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
    0xCC0022C0u,  // 22  1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
    0x33004430u,  // 23  0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
    0x00CC0C22u,  // 24  0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
    0xFC880344u,  // 25  1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
    0x06606996u,  // 26  0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
    0x66009960u,  // 27  0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
    0xC88C0330u,  // 28  1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
    0xF9000066u,  // 29  1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
    0x0CC0C22Cu,  // 30  0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
    0x73108C00u,  // 31  0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
    0xEC801300u,  // 32  1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
    0x08CEC400u,  // 33  0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
    0xEC80004Cu,  // 34  1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
    0x44442222u,  // 35  0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
    0x0F0000F0u,  // 36  0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
    0x49242492u,  // 37  0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
    0x42942942u,  // 38  0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
    0x0C30C30Cu,  // 39  0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
    0x03C0C03Cu,  // 40  0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
    0xFF0000AAu,  // 41  1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
    0x5500AA00u,  // 42  0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
    0xCCCC3030u,  // 43  1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
    0x0C0CC0C0u,  // 44  0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
    0x66669090u,  // 45  0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
    0x0FF0A00Au,  // 46  0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
    0x5550AAA0u,  // 47  0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
    0xF0000AAAu,  // 48  1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
    0x0E0EE0E0u,  // 49  0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
    0x88887070u,  // 50  1000 1000 1000 1000 : 0111 0000 0111 0000 =
2111200021112000 0x99906660u, // 51 1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000 0xE00E0EE0u, // 52 1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220 0x88880770u, // 53 1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000 0xF0000666u, // 54 1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110 0x99006600u, // 55 1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000 0xFF000066u, // 56 1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110 0xC00C0CC0u, // 57 1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200 0xCCCC0330u, // 58 1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200 0x90006000u, // 59 1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000 0x08088080u, // 60 0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000 0xEEEE1010u, // 61 1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220 0xFFF0000Au, // 62 1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010 0x731008CEu, // 63 0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110 }; CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_npv_nd[2][8] = { {1, 2, 4, 8, 16, 32, 0, 0}, // 3 {1, 2, 4, 0, 0, 0, 0, 0} // 4 }; CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_par_vectors_nd[2][8][64][2][4] = { { // 3D {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 
0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 
0, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}}, {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, 
{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}}, {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 0}}, {{0, 1, 0, 0}, {1, 1, 1, 0}}, {{0, 0, 1, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{1, 0, 0, 0}, {0, 0, 1, 0}}, {{0, 1, 0, 0}, {0, 0, 1, 0}}, {{0, 0, 1, 0}, {0, 0, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 1, 0}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{0, 1, 0, 0}, {1, 0, 0, 0}}, {{0, 0, 1, 0}, {1, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {0, 1, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}}, {{0, 0, 1, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {0, 1, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 
0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, }, { // 4D {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 
0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 
0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 1, 1}}, {{0, 0, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 0, 1}, {0, 1, 0, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 0, 1, 1}}, {{1, 0, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 0, 1}, {1, 1, 0, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, 
{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 
0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}}, {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 
0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
};

CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_rampI[3][16] = {
    {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},          // 2 bit index
    {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},       // 3 bit index
    {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64} // 4 bit index
};

// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4) + BC7_FIXUPINDEX2
CMP_STATIC CMP_CONSTANT CGU_UINT32 CMPFIXUPINDEX[128] = {
    // 2 subset partitions 0..63
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
    0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u,
    0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
    0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u,
    0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
    0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u,
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,
    // 3 subset partitions 64..127
    0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u,
    0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
    0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u,
    0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
    0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau,
    0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
    0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu,
    0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u
};

INLINE void cmp_get_fixuptable(CMP_INOUT CGU_UINT32 fixup[3], CGU_INT part_id)
{
    CGU_UINT32 skip_packed = CMPFIXUPINDEX[part_id];  // gather_int2(FIXUPINDEX, part_id);
    fixup[0] = 0;
    fixup[1] = skip_packed >> 4;
    fixup[2] = skip_packed & 15;
}

INLINE CGU_UINT8 shift_right_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}

INLINE CGU_UINT8 expand_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 vv = v << (8 - bits);
    return vv + shift_right_epocode2(vv, bits);
}

INLINE CGV_FLOAT cmp_GetRamp(CMP_IN CGU_INT index_bits,  // ramp bits   Valid range 2..4
                             CMP_IN CGU_INT bits,        // Component   Valid range 5..8
                             CMP_IN CGU_INT p1,          // 0..255
                             CMP_IN CGU_INT p2,          // 0..255
                             CMP_IN CGU_UINT8 index)
{
    CGU_INT e1 = expand_epocode2(p1, bits);
    CGU_INT e2 = expand_epocode2(p2, bits);
    CGV_FLOAT ramp  = cmp_rampI[index_bits - 2][index] / 64.0F;
    CGV_FLOAT rampf = floor(e1 + ramp * (e2 - e1) + 0.5F);
    return rampf;
}
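// Illustrative use of cmp_GetRamp (an example only; not called by the encoder):
// for mode-6 style parameters (4-bit indices, 7-bit components), endpoints 0 and
// 127 expand to 0 and 255, and index 15 carries weight 64/64, so this returns 255.0f.
INLINE CGV_FLOAT cmp_example_mode6_ramp()
{
    return cmp_GetRamp(4, 7, 0, 127, 15);
}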
#if defined(USE_NEW_SP_ERR_IDX)
#ifndef ASPM_GPU
struct BC7_EncodeRamps2
{
    CGU_INT   ep_d[4][256];
    CGU_UINT8 sp_err[3 * 4 * 256 * 2 * 2 * 16];
    CGU_INT   sp_idx[3 * 4 * 256 * 2 * 2 * 16 * 2];
    CGU_BOOL  ramp_init;
};

BC7_EncodeRamps2 BC7EncodeRamps2;

#define LOG_CL_RANGE2 5
#define LOG_CL_BASE2 2
#define BIT_BASE2 5
#define BIT_RANGE2 9
#define BTT2(bits) (bits - BIT_BASE2)
#define CLT2(cl) (cl - LOG_CL_BASE2)
#define SOURCE_BLOCK_SIZE 16

CMP_CONSTANT CGU_FLOAT rampWeights2[5][SOURCE_BLOCK_SIZE] = {
    {0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},  // 0 bit index
    {0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},  // 1 bit index
    {0.000000f, 0.328125f, 0.671875f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},  // 2 bit index
    {0.000000f, 0.140625f, 0.281250f, 0.421875f, 0.578125f, 0.718750f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},  // 3 bit index
    {0.000000f, 0.062500f, 0.140625f, 0.203125f, 0.265625f, 0.328125f, 0.406250f, 0.468750f, 0.531250f, 0.593750f, 0.671875f, 0.734375f, 0.796875f, 0.859375f, 0.937500f, 1.000000f}   // 4 bit index
};

CGU_INT old_expandbits(CGU_INT bits, CGU_INT v)
{
    return (v << (8 - bits) | v >> (2 * bits - 8));
}

void old_init_BC7ramps()
{
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized        = TRUE;
    BC7EncodeRamps2.ramp_init = TRUE;
    //bc7_isa();
    ASPM_PRINT((" INIT Ramps\n"));

    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;

    for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps2.ep_d[BTT2(bits)][p1] = old_expandbits(bits, p1);
        }  //p1
    }      //bits

    = 0 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (tf * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
    {
        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] =
            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1];
        break;
    }
    else if ((tc < 256 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (tc * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
    {
        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (tc * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
        break;
    }
    }
    BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] = (CGU_UINT8)k;
    }  //sp_idx < 0
    }  //i<(1 << clogBC7)
    }  //o2
    }  //o1
    }  //j
    }  //bits
0.0F ? a : -a; }

INLINE CGV_FLOAT old_get_sperr(CGU_INT clogBC7,  // ramp bits   Valid range 2..4
                               CGU_INT bits,     // Component   Valid range 5..8
                               CGV_INT p1,       // 0..255
                               CGU_INT t1,
                               CGU_INT t2,
                               CGV_UINT8 index)
{
    if (BC7EncodeRamps2.ramp_init)
        return BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (p1 * 2 * 2 * 16) + (t1 * 2 * 16) + (t2 * 16) + index];
    else
        return 0.0f;
}
#endif
#endif
#endif  // Not ASPM_HLSL
#endif  // ENABLE_CMP_API

#define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low_quantized
#define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high_quantized
#define get_color_index(index) shared_temp[threadBase + index].error
#define get_alpha_index(index) shared_temp[threadBase + index].mode

//4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
CMP_STATIC CMP_CONSTANT CGU_UINT32 aStep[3][64] = {
    {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7,
     7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15},
    //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7},
    //2 bit index: 0, 21, 43, 64
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};

CMP_STATIC CMP_CONSTANT CGU_UINT32 aWeight[3][16] = {{0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64},
                                                     {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},
                                                     {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};

//Associated to partition 0-63
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions[64] = {
    0xCCCC, 0x8888, 0xEEEE, 0xECC8, 0xC880, 0xFEEC, 0xFEC8, 0xEC80,
    0xC800, 0xFFEC, 0xFE80, 0xE800, 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
    0xF710, 0x008E, 0x7100, 0x08CE, 0x008C, 0x7310, 0x3100, 0x8CCE,
    0x088C, 0x3110, 0x6666, 0x366C, 0x17E8, 0x0FF0, 0x718E, 0x399C,
    0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a,
    0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x660,
    0x272,  0x4e4,  0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c,
    0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0xfcc,  0x7744, 0xee22,
};

//Associated to partition 64-127
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions2[64] = {
    0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
    0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090, 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
    0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
    0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
    0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424, 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
    0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0, 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
    0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
    0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
};
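// Illustrative decode of the partition tables above (an example only; not called
// by the encoder): partition ids 0..63 use blockPartitions with 1 bit per pixel
// (LSB = pixel 0), while ids 64..127 use blockPartitions2 with 2 bits per pixel.
// Returns the subset index of a pixel 0..15 within a 4x4 block.
CGU_UINT32 cmp_example_pixel_subset(CGU_UINT32 partitionId, CGU_UINT32 pixel)
{
    if (partitionId < 64)
        return (blockPartitions[partitionId] >> pixel) & 0x1;
    return (blockPartitions2[partitionId - 64] >> (pixel * 2)) & 0x3;
}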
0},{ 8, 0},{ 2, 0},{ 2, 0}, { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0}, {15, 0},{15, 0},{ 6, 0},{ 8, 0}, { 2, 0},{ 8, 0},{15, 0},{15, 0}, { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, { 2, 0},{15, 0},{15, 0},{ 6, 0}, { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0}, {15, 0},{15, 0},{ 2, 0},{ 2, 0}, {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{ 2, 0},{ 2, 0},{15, 0}, //candidateFixUpIndex1D[i][1], i < 64 should not be used { 3,15},{ 3, 8},{15, 8},{15, 3}, { 8,15},{ 3,15},{15, 3},{15, 8}, { 8,15},{ 8,15},{ 6,15},{ 6,15}, { 6,15},{ 5,15},{ 3,15},{ 3, 8}, { 3,15},{ 3, 8},{ 8,15},{15, 3}, { 3,15},{ 3, 8},{ 6,15},{10, 8}, { 5, 3},{ 8,15},{ 8, 6},{ 6,10}, { 8,15},{ 5,15},{15,10},{15, 8}, { 8,15},{15, 3},{ 3,15},{ 5,10}, { 6,10},{10, 8},{ 8, 9},{15,10}, {15, 6},{ 3,15},{15, 8},{ 5,15}, {15, 3},{15, 6},{15, 6},{15, 8}, //The Spec doesn't mark the first fixed up index in this row, so I apply 15 for them, and seems correct { 3,15},{15, 3},{ 5,15},{ 5,15}, { 5,15},{ 8,15},{ 5,15},{10,15}, { 5,15},{10,15},{ 8,15},{13,15}, {15, 3},{12,15},{ 3,15},{ 3, 8}, }; CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1DOrdered[128] = { {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{ 2, 0},{ 8, 0},{ 2, 0}, { 2, 0},{ 8, 0},{ 8, 0},{15, 0}, { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0}, {15, 0},{15, 0},{ 6, 0},{ 8, 0}, { 2, 0},{ 8, 0},{15, 0},{15, 0}, { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, { 2, 0},{15, 0},{15, 0},{ 6, 0}, { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0}, {15, 0},{15, 0},{ 2, 0},{ 2, 0}, {15, 0},{15, 0},{15, 0},{15, 0}, {15, 0},{ 2, 0},{ 2, 0},{15, 0}, //candidateFixUpIndex1DOrdered[i][1], i < 64 should not be used { 3,15},{ 3, 8},{ 8,15},{ 3,15}, { 8,15},{ 3,15},{ 3,15},{ 8,15}, { 8,15},{ 8,15},{ 6,15},{ 6,15}, { 6,15},{ 5,15},{ 3,15},{ 3, 8}, { 3,15},{ 3, 8},{ 8,15},{ 3,15}, { 3,15},{ 3, 8},{ 6,15},{ 8,10}, { 3, 5},{ 8,15},{ 6, 8},{ 6,10}, { 8,15},{ 5,15},{10,15},{ 8,15}, { 8,15},{ 3,15},{ 3,15},{ 5,10}, { 6,10},{ 8,10},{ 8, 9},{10,15}, { 6,15},{ 3,15},{ 8,15},{ 5,15}, { 3,15},{ 6,15},{ 6,15},{ 8,15}, //The Spec doesn't mark the first fixed up index in this row, so I apply 15 for them, and seems correct { 3,15},{ 3,15},{ 5,15},{ 5,15}, { 5,15},{ 8,15},{ 5,15},{10,15}, { 5,15},{10,15},{ 8,15},{13,15}, { 3,15},{12,15},{ 3,15},{ 3, 8} }; CGU_Vec4ui quantize(CGU_Vec4ui color, CGU_UINT32 uPrec) { return (((color << 8) + color) * ((1 << uPrec) - 1) + 32768U) >> 16; } CGU_Vec4ui unquantize(CGU_Vec4ui color, CGU_UINT32 uPrec) { #ifdef ASPM_GPU color = color << (8 - uPrec); return color | (color >> uPrec); #else CGU_Vec4ui res; color.x = color.x << (8 - uPrec); color.y = color.y << (8 - uPrec); color.z = color.z << (8 - uPrec); color.w = color.w << (8 - uPrec); res.x = color.x | (color.x >> uPrec); res.y = color.y | (color.y >> uPrec); res.z = color.z | (color.z >> uPrec); res.w = color.w | (color.w >> uPrec); return res; #endif } void swap(CMP_INOUT CGU_Vec4ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec4ui CMP_REFINOUT rhs) { CGU_Vec4ui tmp = lhs; lhs = rhs; rhs = tmp; } void swap(CMP_INOUT CGU_Vec3ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3ui CMP_REFINOUT rhs) { CGU_Vec3ui tmp = lhs; lhs = rhs; rhs = tmp; } void swap(CMP_INOUT CGU_UINT32 CMP_REFINOUT lhs, CMP_INOUT CGU_UINT32 CMP_REFINOUT rhs) { CGU_UINT32 tmp = lhs; lhs = rhs; rhs = tmp; } CGU_UINT32 ComputeError(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b) { return dot(a.rgb, b.rgb) + (g_alpha_weight * a.a * b.a); } void Ensure_A_Is_Larger(CMP_INOUT CGU_Vec4ui CMP_REFINOUT a, CMP_INOUT CGU_Vec4ui CMP_REFINOUT b) { if 
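// (Illustrative note; the values below are assumed, not taken from a real block.)
// ComputeError() squares per-channel differences via dot(), and every channel is an
// unsigned integer. The comparisons that follow order each component of 'a' to be the
// larger one, so a later "a -= b" style subtraction cannot wrap below zero:
//     a.x = 10, b.x = 250  ->  10 - 250 = 0xFFFFFF10 as CGU_UINT32 (garbage error)
//     after the swap: 250 - 10 = 240, and 240 * 240 is the intended squared error.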
(a.x < b.x) swap(a.x, b.x); if (a.y < b.y) swap(a.y, b.y); if (a.z < b.z) swap(a.z, b.z); if (a.w < b.w) swap(a.w, b.w); } void compress_endpoints0(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P) { #ifdef ASPM_GPU CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb & 0xFFFFFFFE; quantized[j].rgb |= P[j]; quantized[j].a = 0xFF; endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb; endPoint[j].a = 0xFF; quantized[j] <<= 3; } #else CGU_Vec4ui rgbb; CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { rgbb.r = endPoint[j].r; rgbb.g = endPoint[j].g; rgbb.b = endPoint[j].b; rgbb.a = endPoint[j].b; quantized[j].rgb = quantize(rgbb, 5).rgb; quantized[j].r &= 0xFFFFFFFE; quantized[j].g &= 0xFFFFFFFE; quantized[j].b &= 0xFFFFFFFE; quantized[j].r |= P[j]; quantized[j].g |= P[j]; quantized[j].b |= P[j]; quantized[j].a = 0xFF; rgbb.r = quantized[j].r; rgbb.g = quantized[j].g; rgbb.b = quantized[j].b; rgbb.a = quantized[j].b; endPoint[j].rgb = unquantize(rgbb, 5).rgb; endPoint[j].a = 0xFF; quantized[j].r <<= 3; quantized[j].g <<= 3; quantized[j].b <<= 3; quantized[j].a <<= 3; } #endif } void compress_endpoints1(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P) { #ifdef ASPM_GPU CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb & 0xFFFFFFFE; quantized[j].rgb |= P[j]; quantized[j].a = 0xFF; endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb; endPoint[j].a = 0xFF; quantized[j] <<= 1; } #else CGU_Vec4ui rgbb; CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { rgbb.r = endPoint[j].r; rgbb.g = endPoint[j].g; rgbb.b = endPoint[j].b; rgbb.a = endPoint[j].b; quantized[j].rgb = quantize(rgbb, 7).rgb; quantized[j].r &= 0xFFFFFFFE; quantized[j].g &= 0xFFFFFFFE; quantized[j].b &= 0xFFFFFFFE; quantized[j].r |= P[j]; quantized[j].g |= P[j]; quantized[j].b |= P[j]; quantized[j].a = 0xFF; rgbb.r = quantized[j].r; rgbb.g = quantized[j].g; rgbb.b = quantized[j].b; rgbb.a = quantized[j].b; endPoint[j].rgb = unquantize(rgbb, 7).rgb; endPoint[j].a = 0xFF; quantized[j].r = quantized[j].r << 1; quantized[j].g = quantized[j].g << 1; quantized[j].b = quantized[j].b << 1; quantized[j].a = quantized[j].a << 1; } #endif } void compress_endpoints2(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2]) { #ifdef ASPM_GPU CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb; quantized[j].a = 0xFF; endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb; endPoint[j].a = 0xFF; quantized[j] <<= 3; } #else CGU_Vec4ui rgbb; CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { rgbb.r = endPoint[j].r; rgbb.g = endPoint[j].g; rgbb.b = endPoint[j].b; rgbb.a = endPoint[j].b; quantized[j].rgb = quantize(rgbb, 5).rgb; quantized[j].a = 0xFF; rgbb.r = quantized[j].r; rgbb.g = quantized[j].g; rgbb.b = quantized[j].b; rgbb.a = quantized[j].b; endPoint[j].rgb = unquantize(rgbb, 5).rgb; endPoint[j].a = 0xFF; quantized[j].r <<= 3; quantized[j].g <<= 3; quantized[j].b <<= 3; quantized[j].a <<= 3; } #endif } void compress_endpoints3(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P) { for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].r = endPoint[j].x & 0xFFFFFFFE; quantized[j].g = endPoint[j].y & 0xFFFFFFFE; quantized[j].b = endPoint[j].z & 0xFFFFFFFE; quantized[j].a = 0xFF; quantized[j].r |= P[j]; quantized[j].g |= P[j]; quantized[j].b |= P[j]; endPoint[j].r = quantized[j].r; endPoint[j].g = quantized[j].g; 
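// Worked example for the quantize()/unquantize() pair defined earlier (illustrative
// only): with uPrec = 5 and the full-range input 255,
//     quantize:   (((255 << 8) + 255) * 31 + 32768) >> 16 = (65535 * 31 + 32768) >> 16 = 31
//     unquantize: 31 << 3 = 248, then 248 | (248 >> 5) = 248 | 7 = 255
// so endpoint extremes survive the round trip exactly. The "& 0xFFFFFFFE ... | P[j]"
// masking in this function reserves the endpoint LSB for the P (parity) bit: mode 3
// stores 7 explicit bits per channel plus one P bit per endpoint.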
endPoint[j].b = quantized[j].b; endPoint[j].a = 0xFF; } } void compress_endpoints4(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2]) { #ifdef ASPM_HLSL [unroll] for ( uint j = 0; j < 2; j ++ ) { quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb; quantized[j].a = quantize(endPoint[j].a, 6).r; endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb; endPoint[j].a = unquantize(quantized[j].a, 6).r; quantized[j].rgb <<= 3; quantized[j].a <<= 2; } #else CGU_Vec4ui rgbb; CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { rgbb.r = endPoint[j].r; rgbb.g = endPoint[j].g; rgbb.b = endPoint[j].b; rgbb.a = endPoint[j].b; quantized[j].rgb = quantize(rgbb, 5).rgb; quantized[j].a = quantize(endPoint[j].a, 6).r; rgbb.r = quantized[j].r; rgbb.g = quantized[j].g; rgbb.b = quantized[j].b; rgbb.a = quantized[j].b; endPoint[j].rgb = unquantize(rgbb, 5).rgb; endPoint[j].a = unquantize(quantized[j].a, 6).r; quantized[j].r <<= 3; quantized[j].g <<= 3; quantized[j].b <<= 3; quantized[j].a <<= 2; } #endif } void compress_endpoints5(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2]) { #ifdef ASPM_HLSL CMP_UNROLL for ( uint j = 0; j < 2; j ++ ) { quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb; quantized[j].a = endPoint[j].a; endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb; // endPoint[j].a Alpha is full precision quantized[j].rgb <<= 1; } #else CGU_Vec4ui rgbb; CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { rgbb.r = endPoint[j].r; rgbb.g = endPoint[j].g; rgbb.b = endPoint[j].b; rgbb.a = endPoint[j].b; quantized[j].rgb = quantize(rgbb, 7).rgb; quantized[j].a = endPoint[j].a; rgbb.r = quantized[j].r; rgbb.g = quantized[j].g; rgbb.b = quantized[j].b; rgbb.a = quantized[j].b; endPoint[j].rgb = unquantize(rgbb, 7).rgb; quantized[j].r <<= 1; quantized[j].g <<= 1; quantized[j].b <<= 1; } #endif } void compress_endpoints6(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P) { for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].x = endPoint[j].x & 0xFFFFFFFE; quantized[j].y = endPoint[j].y & 0xFFFFFFFE; quantized[j].z = endPoint[j].z & 0xFFFFFFFE; quantized[j].w = endPoint[j].w & 0xFFFFFFFE; quantized[j].x = quantized[j].x | P[j]; quantized[j].y = quantized[j].y | P[j]; quantized[j].z = quantized[j].z | P[j]; quantized[j].w = quantized[j].w | P[j]; endPoint[j] = quantized[j]; } } void compress_endpoints7(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P) { CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j] = quantize(endPoint[j], 6); quantized[j].x = (quantized[j].x & 0xFFFFFFFE) | P[j]; quantized[j].y = (quantized[j].y & 0xFFFFFFFE) | P[j]; quantized[j].z = (quantized[j].z & 0xFFFFFFFE) | P[j]; quantized[j].w = (quantized[j].w & 0xFFFFFFFE) | P[j]; endPoint[j] = unquantize(quantized[j], 6); } CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++) { quantized[j].x = quantized[j].x << 2; quantized[j].y = quantized[j].y << 2; quantized[j].z = quantized[j].z << 2; quantized[j].w = quantized[j].w << 2; } } void block_package0(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase) { block.x = 0x01 | ((partition - 64) << 1) | ((get_end_point_l(0).r & 0xF0) << 1) | ((get_end_point_h(0).r & 0xF0) << 5) | ((get_end_point_l(1).r & 0xF0) << 9) | ((get_end_point_h(1).r & 0xF0) << 13) | ((get_end_point_l(2).r & 0xF0) << 17) | ((get_end_point_h(2).r & 0xF0) << 21) | ((get_end_point_l(0).g & 0xF0) << 25); block.y = ((get_end_point_l(0).g & 0xF0) >> 7) | ((get_end_point_h(0).g & 0xF0) >> 3) | 
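// Mode 0 bit budget, as a check on this packing (illustrative): 1 mode bit +
// 4 partition bits + 3 subsets * 2 endpoints * 3 channels * 4 bits = 72 endpoint bits +
// 6 P bits + 16 * 3 - 3 = 45 index bits (each of the three anchor indices drops its
// implied-zero MSB), giving 1 + 4 + 72 + 6 + 45 = 128 bits.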
((get_end_point_l(1).g & 0xF0) << 1) | ((get_end_point_h(1).g & 0xF0) << 5) | ((get_end_point_l(2).g & 0xF0) << 9) | ((get_end_point_h(2).g & 0xF0) << 13) | ((get_end_point_l(0).b & 0xF0) << 17) | ((get_end_point_h(0).b & 0xF0) << 21) | ((get_end_point_l(1).b & 0xF0) << 25); block.z = ((get_end_point_l(1).b & 0xF0) >> 7) | ((get_end_point_h(1).b & 0xF0) >> 3) | ((get_end_point_l(2).b & 0xF0) << 1) | ((get_end_point_h(2).b & 0xF0) << 5) | ((get_end_point_l(0).r & 0x08) << 10) | ((get_end_point_h(0).r & 0x08) << 11) | ((get_end_point_l(1).r & 0x08) << 12) | ((get_end_point_h(1).r & 0x08) << 13) | ((get_end_point_l(2).r & 0x08) << 14) | ((get_end_point_h(2).r & 0x08) << 15) | (get_color_index(0) << 19); block.w = 0; CGU_UINT32 i = 1; for (; i <= cmp_min(candidateFixUpIndex1DOrdered[partition][0], 4); i++) { block.z |= get_color_index(i) << (i * 3 + 18); } if (candidateFixUpIndex1DOrdered[partition][0] < 4) //i = 4 { block.z |= get_color_index(4) << 29; i += 1; } else //i = 5 { block.w |= (get_color_index(4) & 0x04) >> 2; for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++) block.w |= get_color_index(i) << (i * 3 - 14); } for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++) { block.w |= get_color_index(i) << (i * 3 - 15); } for (; i < 16; i++) { block.w |= get_color_index(i) << (i * 3 - 16); } } void block_package1(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase) { block.x = 0x02 | (partition << 2) | ((get_end_point_l(0).r & 0xFC) << 6) | ((get_end_point_h(0).r & 0xFC) << 12) | ((get_end_point_l(1).r & 0xFC) << 18) | ((get_end_point_h(1).r & 0xFC) << 24); block.y = ((get_end_point_l(0).g & 0xFC) >> 2) | ((get_end_point_h(0).g & 0xFC) << 4) | ((get_end_point_l(1).g & 0xFC) << 10) | ((get_end_point_h(1).g & 0xFC) << 16) | ((get_end_point_l(0).b & 0xFC) << 22) | ((get_end_point_h(0).b & 0xFC) << 28); block.z = ((get_end_point_h(0).b & 0xFC) >> 4) | ((get_end_point_l(1).b & 0xFC) << 2) | ((get_end_point_h(1).b & 0xFC) << 8) | ((get_end_point_l(0).r & 0x02) << 15) | ((get_end_point_l(1).r & 0x02) << 16) | (get_color_index(0) << 18); if (candidateFixUpIndex1DOrdered[partition][0] == 15) { block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15) | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5); block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18); } else if (candidateFixUpIndex1DOrdered[partition][0] == 2) { block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1); block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18); } else if (candidateFixUpIndex1DOrdered[partition][0] == 8) { block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 
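// (Illustrative note on why block_package1 branches on the fix-up position.) The
// anchor (fix-up) index of each subset is written with one bit less, its MSB being an
// implied zero under the BC7 endpoint-ordering convention, so every index after an
// anchor lands one bit lower in the stream. The branch cases 15, 2, 8 and 6 are
// exactly the second-subset anchor positions listed in candidateFixUpIndex1DOrdered.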
6) | (get_color_index(6) << 3) | get_color_index(5); block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18); } else //candidateFixUpIndex1DOrdered[partition] == 6 { block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5); block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18); } } void block_package2(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase) { block.x = 0x04 | ((partition - 64) << 3) | ((get_end_point_l(0).r & 0xF8) << 6) | ((get_end_point_h(0).r & 0xF8) << 11) | ((get_end_point_l(1).r & 0xF8) << 16) | ((get_end_point_h(1).r & 0xF8) << 21) | ((get_end_point_l(2).r & 0xF8) << 26); block.y = ((get_end_point_l(2).r & 0xF8) >> 6) | ((get_end_point_h(2).r & 0xF8) >> 1) | ((get_end_point_l(0).g & 0xF8) << 4) | ((get_end_point_h(0).g & 0xF8) << 9) | ((get_end_point_l(1).g & 0xF8) << 14) | ((get_end_point_h(1).g & 0xF8) << 19) | ((get_end_point_l(2).g & 0xF8) << 24); block.z = ((get_end_point_h(2).g & 0xF8) >> 3) | ((get_end_point_l(0).b & 0xF8) << 2) | ((get_end_point_h(0).b & 0xF8) << 7) | ((get_end_point_l(1).b & 0xF8) << 12) | ((get_end_point_h(1).b & 0xF8) << 17) | ((get_end_point_l(2).b & 0xF8) << 22) | ((get_end_point_h(2).b & 0xF8) << 27); block.w = ((get_end_point_h(2).b & 0xF8) >> 5) | (get_color_index(0) << 3); CGU_UINT32 i = 1; for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++) { block.w |= get_color_index(i) << (i * 2 + 2); } for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++) { block.w |= get_color_index(i) << (i * 2 + 1); } for (; i < 16; i++) { block.w |= get_color_index(i) << (i * 2); } } void block_package3(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase) { block.x = 0x08 | (partition << 4) | ((get_end_point_l(0).r & 0xFE) << 9) | ((get_end_point_h(0).r & 0xFE) << 16) | ((get_end_point_l(1).r & 0xFE) << 23) | ((get_end_point_h(1).r & 0xFE) << 30); block.y = ((get_end_point_h(1).r & 0xFE) >> 2) | ((get_end_point_l(0).g & 0xFE) << 5) | ((get_end_point_h(0).g & 0xFE) << 12) | ((get_end_point_l(1).g & 0xFE) << 19) | ((get_end_point_h(1).g & 0xFE) << 26); block.z = ((get_end_point_h(1).g & 0xFE) >> 6) | ((get_end_point_l(0).b & 0xFE) << 1) | ((get_end_point_h(0).b & 0xFE) << 8) | ((get_end_point_l(1).b & 0xFE) << 15) | ((get_end_point_h(1).b & 0xFE) << 22) | ((get_end_point_l(0).r & 0x01) << 30) | ((get_end_point_h(0).r & 0x01) << 31); block.w = ((get_end_point_l(1).r & 0x01) << 0) | ((get_end_point_h(1).r & 0x01) << 1) | (get_color_index(0) << 2); CGU_UINT32 i = 1; for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++) { block.w |= get_color_index(i) << (i * 2 + 1); } for (; i < 16; i++) { block.w |= get_color_index(i) << (i * 2); } } void block_package4(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 index_selector, CGU_UINT32 threadBase) { block.x = 0x10 | ((rotation & 3) << 5) | ((index_selector & 1) << 7) | ((get_end_point_l(0).r & 0xF8) << 5) | ((get_end_point_h(0).r & 0xF8) << 10) | ((get_end_point_l(0).g & 0xF8) << 15) | ((get_end_point_h(0).g & 0xF8) << 20) | ((get_end_point_l(0).b & 0xF8) << 25); 
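// Field layout of the mode 4 header written above (illustrative): bit 4 is the mode
// bit (0x10), bits 5..6 hold 'rotation' (which channel was traded with alpha, see
// set_pixel_rotation), bit 7 is 'index_selector' (whether the 2-bit or the 3-bit
// index set drives color), and the 5-bit RGB endpoint fields start at bit 8.
// A decoder recovers the header fields as:
//     rotation       = (block.x >> 5) & 0x3;
//     index_selector = (block.x >> 7) & 0x1;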
block.y = ((get_end_point_l(0).b & 0xF8) >> 7) | ((get_end_point_h(0).b & 0xF8) >> 2) | ((get_end_point_l(0).a & 0xFC) << 4) | ((get_end_point_h(0).a & 0xFC) << 10) | ((get_color_index(0) & 1) << 18) | (get_color_index(1) << 19) | (get_color_index(2) << 21) | (get_color_index(3) << 23) | (get_color_index(4) << 25) | (get_color_index(5) << 27) | (get_color_index(6) << 29) | (get_color_index(7) << 31); block.z = (get_color_index(7) >> 1) | (get_color_index(8) << 1) | (get_color_index(9) << 3) | (get_color_index(10) << 5) | (get_color_index(11) << 7) | (get_color_index(12) << 9) | (get_color_index(13) << 11) | (get_color_index(14) << 13) | (get_color_index(15) << 15) | ((get_alpha_index(0) & 3) << 17) | (get_alpha_index(1) << 19) | (get_alpha_index(2) << 22) | (get_alpha_index(3) << 25) | (get_alpha_index(4) << 28) | (get_alpha_index(5) << 31); block.w = (get_alpha_index(5) >> 1) | (get_alpha_index(6) << 2) | (get_alpha_index(7) << 5) | (get_alpha_index(8) << 8) | (get_alpha_index(9) << 11) | (get_alpha_index(10) << 14) | (get_alpha_index(11) << 17) | (get_alpha_index(12) << 20) | (get_alpha_index(13) << 23) | (get_alpha_index(14) << 26) | (get_alpha_index(15) << 29); } void block_package5(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 threadBase) { block.x = 0x20 | (rotation << 6) | ((get_end_point_l(0).r & 0xFE) << 7) | ((get_end_point_h(0).r & 0xFE) << 14) | ((get_end_point_l(0).g & 0xFE) << 21) | ((get_end_point_h(0).g & 0xFE) << 28); block.y = ((get_end_point_h(0).g & 0xFE) >> 4) | ((get_end_point_l(0).b & 0xFE) << 3) | ((get_end_point_h(0).b & 0xFE) << 10) | (get_end_point_l(0).a << 18) | (get_end_point_h(0).a << 26); block.z = (get_end_point_h(0).a >> 6) | (get_color_index(0) << 2) | (get_color_index(1) << 3) | (get_color_index(2) << 5) | (get_color_index(3) << 7) | (get_color_index(4) << 9) | (get_color_index(5) << 11) | (get_color_index(6) << 13) | (get_color_index(7) << 15) | (get_color_index(8) << 17) | (get_color_index(9) << 19) | (get_color_index(10) << 21) | (get_color_index(11) << 23) | (get_color_index(12) << 25) | (get_color_index(13) << 27) | (get_color_index(14) << 29) | (get_color_index(15) << 31); block.w = (get_color_index(15) >> 1) | (get_alpha_index(0) << 1) | (get_alpha_index(1) << 2) | (get_alpha_index(2) << 4) | (get_alpha_index(3) << 6) | (get_alpha_index(4) << 8) | (get_alpha_index(5) << 10) | (get_alpha_index(6) << 12) | (get_alpha_index(7) << 14) | (get_alpha_index(8) << 16) | (get_alpha_index(9) << 18) | (get_alpha_index(10) << 20) | (get_alpha_index(11) << 22) | (get_alpha_index(12) << 24) | (get_alpha_index(13) << 26) | (get_alpha_index(14) << 28) | (get_alpha_index(15) << 30); } void block_package6(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 threadBase) { block.x = 0x40 | ((get_end_point_l(0).r & 0xFE) << 6) | ((get_end_point_h(0).r & 0xFE) << 13) | ((get_end_point_l(0).g & 0xFE) << 20) | ((get_end_point_h(0).g & 0xFE) << 27); block.y = ((get_end_point_h(0).g & 0xFE) >> 5) | ((get_end_point_l(0).b & 0xFE) << 2) | ((get_end_point_h(0).b & 0xFE) << 9) | ((get_end_point_l(0).a & 0xFE) << 16) | ((get_end_point_h(0).a & 0xFE) << 23) | (get_end_point_l(0).r & 0x01) << 31; block.z = (get_end_point_h(0).r & 0x01) | (get_color_index(0) << 1) | (get_color_index(1) << 4) | (get_color_index(2) << 8) | (get_color_index(3) << 12) | (get_color_index(4) << 16) | (get_color_index(5) << 20) | (get_color_index(6) << 24) | (get_color_index(7) << 28); block.w = (get_color_index(8) << 0) | (get_color_index(9) << 4) | (get_color_index(10) << 8) | 
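// Mode 6 bit budget, as a check on the index packing in progress here (illustrative):
// 7 mode bits + 2 endpoints * 4 channels * 7 bits = 56 endpoint bits + 2 P bits +
// 16 * 4 - 1 = 63 index bits (the anchor index at texel 0 drops its implied-zero
// MSB), giving 7 + 56 + 2 + 63 = 128 bits. That is why get_color_index(0) above is
// shifted by only 1 while all later indices advance in 4-bit steps.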
(get_color_index(11) << 12) | (get_color_index(12) << 16) | (get_color_index(13) << 20) | (get_color_index(14) << 24) | (get_color_index(15) << 28); } void block_package7(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase) { block.x = 0x80 | (partition << 8) | ((get_end_point_l(0).r & 0xF8) << 11) | ((get_end_point_h(0).r & 0xF8) << 16) | ((get_end_point_l(1).r & 0xF8) << 21) | ((get_end_point_h(1).r & 0xF8) << 26); block.y = ((get_end_point_h(1).r & 0xF8) >> 6) | ((get_end_point_l(0).g & 0xF8) >> 1) | ((get_end_point_h(0).g & 0xF8) << 4) | ((get_end_point_l(1).g & 0xF8) << 9) | ((get_end_point_h(1).g & 0xF8) << 14) | ((get_end_point_l(0).b & 0xF8) << 19) | ((get_end_point_h(0).b & 0xF8) << 24); block.z = ((get_end_point_l(1).b & 0xF8) >> 3) | ((get_end_point_h(1).b & 0xF8) << 2) | ((get_end_point_l(0).a & 0xF8) << 7) | ((get_end_point_h(0).a & 0xF8) << 12) | ((get_end_point_l(1).a & 0xF8) << 17) | ((get_end_point_h(1).a & 0xF8) << 22) | ((get_end_point_l(0).r & 0x04) << 28) | ((get_end_point_h(0).r & 0x04) << 29); block.w = ((get_end_point_l(1).r & 0x04) >> 2) | ((get_end_point_h(1).r & 0x04) >> 1) | (get_color_index(0) << 2); CGU_UINT32 i = 1; for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++) { block.w |= get_color_index(i) << (i * 2 + 1); } for (; i < 16; i++) { block.w |= get_color_index(i) << (i * 2); } } void GroupSync() { #ifdef ASPM_GPU GroupMemoryBarrierWithGroupSync(); #endif } void set_pixel_rotation(CMP_INOUT CGU_Vec4ui CMP_REFINOUT pixel, CGU_UINT32 rotation) { #ifdef ASPM_GPU if (1 == rotation) { pixel.ra = pixel.ar; } else if (2 == rotation) { pixel.ga = pixel.ag; } else if (3 == rotation) { pixel.ba = pixel.ab; } #else CGU_UINT32 r, g, b, a; r = pixel.r; g = pixel.g; b = pixel.b; a = pixel.a; if (1 == rotation) { pixel.r = a; pixel.a = r; } else if (2 == rotation) { pixel.g = a; pixel.a = g; } else if (3 == rotation) { pixel.b = a; pixel.a = b; } #endif } CGU_BOOL cmp_ImageHasAlpha(CGU_UINT32 threadBase) { #if defined(ENABLED_MODE6) || defined(ENABLE_CMP_MODE6) CGU_UINT32 alpha; for (CGU_INT ii = 0; ii < 16; ii++) { alpha = shared_temp[threadBase + ii].pixel.a; if ((alpha < 255)) return true; } #endif return false; } #ifdef ENABLE_CMP_API CGU_UINT32 GetRamp2(CGU_UINT32 e0, CGU_UINT32 e1, CGU_UINT32 index, CGU_UINT32 indexprecision) { if (indexprecision == 2) return (CGU_UINT32)(((64 - aWeight[2][index]) * e0 + aWeight[2][index] * e1 + 32) >> 6); else if (indexprecision == 3) return (CGU_UINT32)(((64 - aWeight[1][index]) * e0 + aWeight[1][index] * e1 + 32) >> 6); else // indexprecision == 4 return (CGU_UINT32)(((64 - aWeight[0][index]) * e0 + aWeight[0][index] * e1 + 32) >> 6); } //====================================== MODE 6 ========================================== void cmp_encode_apply_swap(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 block_index[2], CMP_IN CGU_INT bits) { CGU_UINT32 levels = 1 << bits; if ((block_index[0] & 15) >= levels / 2) { // swap end points CGU_Vec4ui t = epo_code_out[0]; epo_code_out[0] = epo_code_out[1]; epo_code_out[1] = t; block_index[0] = (CGU_UINT32)(0x11111111 * (levels - 1)) - block_index[0]; block_index[1] = (CGU_UINT32)(0x11111111 * (levels - 1)) - block_index[1]; } } CGU_INT cmp_Write32Bit(CMP_INOUT CGU_UINT32 base[4], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT32 bitVal) { base[offset / 32] |= ((CGU_UINT32)bitVal) << (offset % 32); if (offset % 32 + bits > 32) { if ((offset / 32 + 1) < 4) base[(offset / 32) + 1] |= cmp_shift_right_uint32(bitVal, 32 - offset % 
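// Straddle case of cmp_Write32Bit, worked through (illustrative values): writing
// bits = 4, bitVal = 0b1011 at offset = 30 ORs (bitVal << 30) into base[0] above
// (the low two bits), and the expression being completed here ORs
// bitVal >> (32 - 30) = bitVal >> 2 into base[1] (the high two bits), keeping the
// value contiguous across the 32-bit word boundary.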
32); } offset += bits; return offset; } void cmp_encode_index2(CMP_INOUT CGU_UINT32 data[4], CMP_IN CGU_INT pPos, CMP_INOUT CGU_UINT32 color_index[2], CMP_IN CGU_INT bits, CMP_IN CGU_INT flips) { CGU_INT levels = 1 << bits; CGU_INT flips_shifted = flips; for (CGU_INT k1 = 0; k1 < 2; k1++) { CGU_UINT32 qbits_shifted = color_index[k1]; for (CGU_INT k2 = 0; k2 < 8; k2++) { CGU_UINT32 q = qbits_shifted & 15; if ((flips_shifted & 1) > 0) q = (levels - 1) - q; if (k1 == 0 && k2 == 0) pPos = cmp_Write32Bit(data, pPos, bits - 1, q); else pPos = cmp_Write32Bit(data, pPos, bits, q); qbits_shifted >>= 4; flips_shifted >>= 1; } } } void cmp_eigen_vector(CMP_INOUT CGV_Vec4f CMP_REFINOUT eigen_vector, CMP_INOUT CGU_Vec4f CMP_REFINOUT image_mean, CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries) { CGU_INT k; image_mean = 0.0f; eigen_vector = 0.0f; CGV_FLOAT vector_covOut[10]; CGV_FLOAT covar[10] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; CGV_Vec4f rgbasum = {0.0f, 0.0f, 0.0f, 0.0f}; for (k = 0; k < numEntries; k++) { CGV_Vec4f rgba; rgba.x = image_src[k].x; rgba.y = image_src[k].y; rgba.z = image_src[k].z; rgba.w = image_src[k].w; rgbasum.x += rgba.x; rgbasum.y += rgba.y; rgbasum.z += rgba.z; rgbasum.w += rgba.w; covar[0] += rgba.x * rgba.x; //covar[0].x => covar[0] covar[1] += rgba.x * rgba.y; //covar[0].y => covar[1] covar[2] += rgba.x * rgba.z; //covar[0].z => covar[2] covar[3] += rgba.x * rgba.w; //covar[0].w => covar[3] covar[4] += rgba.y * rgba.y; //covar[1].y => covar[4] covar[5] += rgba.y * rgba.z; //covar[1].z => covar[5] covar[6] += rgba.y * rgba.w; //covar[1].w => covar[6] covar[7] += rgba.z * rgba.z; //covar[2].z => covar[7] covar[8] += rgba.z * rgba.w; //covar[2].w => covar[8] covar[9] += rgba.w * rgba.w; //covar[3].w => covar[9] } image_mean = rgbasum / (CGV_FLOAT)numEntries; vector_covOut[0] = covar[0] - (rgbasum.x * rgbasum.x / numEntries); vector_covOut[1] = covar[1] - (rgbasum.x * rgbasum.y / numEntries); vector_covOut[2] = covar[2] - (rgbasum.x * rgbasum.z / numEntries); vector_covOut[3] = covar[3] - (rgbasum.x * rgbasum.w / numEntries); vector_covOut[4] = covar[4] - (rgbasum.y * rgbasum.y / numEntries); vector_covOut[5] = covar[5] - (rgbasum.y * rgbasum.z / numEntries); vector_covOut[6] = covar[6] - (rgbasum.y * rgbasum.w / numEntries); vector_covOut[7] = covar[7] - (rgbasum.z * rgbasum.z / numEntries); vector_covOut[8] = covar[8] - (rgbasum.z * rgbasum.w / numEntries); vector_covOut[9] = covar[9] - (rgbasum.w * rgbasum.w / numEntries); CGV_FLOAT inv_var = 1.0 / (256 * 256); // GPU multiply is faster 1.5258789062500000e-05 for (k = 0; k < 10; k++) { vector_covOut[k] = vector_covOut[k] * inv_var; } // Compute eigen_vector CGV_Vec4f vec = {1.0f, 1.0f, 1.0f, 1.0f}; CGU_INT powerIterations = 6; // 4 not enough for HQ : can use quality to set ranges from 2..n for (k = 0; k < powerIterations; k++) { eigen_vector.x = vector_covOut[0] * vec.x + vector_covOut[1] * vec.y + vector_covOut[2] * vec.z + vector_covOut[3] * vec.w; eigen_vector.y = vector_covOut[1] * vec.x + vector_covOut[4] * vec.y + vector_covOut[5] * vec.z + vector_covOut[6] * vec.w; eigen_vector.z = vector_covOut[2] * vec.x + vector_covOut[5] * vec.y + vector_covOut[7] * vec.z + vector_covOut[8] * vec.w; eigen_vector.w = vector_covOut[3] * vec.x + vector_covOut[6] * vec.y + vector_covOut[8] * vec.z + vector_covOut[9] * vec.w; // renormalize every other iteration if (k % 2 == 1) { CGV_FLOAT norm_sq = cmp_dot4f(eigen_vector, eigen_vector); CGV_FLOAT rnorm = cmp_Image_rsqrt(norm_sq); vec = eigen_vector 
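// (Illustrative note on the multiply being completed here.) cmp_eigen_vector is a
// short power iteration: repeating v <- C * v converges on the eigenvector of the
// covariance matrix's largest eigenvalue, and scaling by rnorm = 1/sqrt(dot(v, v))
// every other pass keeps the components inside float range without changing the
// direction. The fixed six iterations trade speed against accuracy of the principal
// axis, as the comment above notes.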
* rnorm; } else vec = eigen_vector; } eigen_vector = vec; //printf("eigen_vector [%1.8f,%1.3f,%1.8f,%1.8f]\n", eigen_vector.x, eigen_vector.y, eigen_vector.z, eigen_vector.w); } void cmp_endpoints2(CMP_INOUT CGU_Vec4ui end_points_out[2], CMP_IN CGV_Vec4f ext[2], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean) { CGV_FLOAT levelHigh = 255; // Mode 6 levels = 1 << bits = 128 then use (level * 2) - 1 CGV_FLOAT levelLow = 254; // Mode 6 levels = 1 << bits = 128 then use (level * 2) - 2 CGV_Vec4f qep_b[2]; CGV_FLOAT err0 = 0.0f; CGV_FLOAT err1 = 0.0f; CGV_Vec4f block_endpoints[2]; block_endpoints[0] = ext[0] * eigen_vector + image_mean; block_endpoints[1] = ext[1] * eigen_vector + image_mean; for (CGU_INT subset = 0; subset < 2; subset++) { // this code effects quality qep_b[0].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow); qep_b[0].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow); qep_b[0].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow); qep_b[0].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow); qep_b[1].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh); qep_b[1].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh); qep_b[1].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh); qep_b[1].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh); err0 = cmp_dot4f(block_endpoints[subset] - qep_b[0], block_endpoints[subset] - qep_b[0]); err1 = cmp_dot4f(block_endpoints[subset] - qep_b[1], block_endpoints[subset] - qep_b[1]); if (subset == 0) { end_points_out[1].x = (err0 < err1) ? qep_b[0].x : qep_b[1].x; end_points_out[1].y = (err0 < err1) ? qep_b[0].y : qep_b[1].y; end_points_out[1].z = (err0 < err1) ? qep_b[0].z : qep_b[1].z; end_points_out[1].w = (err0 < err1) ? qep_b[0].w : qep_b[1].w; } else { end_points_out[0].x = ((err0 < err1) ? qep_b[0].x : qep_b[1].x); end_points_out[0].y = ((err0 < err1) ? qep_b[0].y : qep_b[1].y); end_points_out[0].z = ((err0 < err1) ? qep_b[0].z : qep_b[1].z); end_points_out[0].w = ((err0 < err1) ? 
qep_b[0].w : qep_b[1].w); } } } void cmp_block_endpoints(CMP_INOUT CGU_Vec4ui end_points_out[2], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean, CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, //IN: range 0..15 (MAX_SUBSET_SIZE) CMP_IN CGU_INT partition_mask // 0xFFFF:FFFF ) { CGV_Vec4f ext[2] = {{255.0f, 255.0f, 255.0f, 255.0f}, {0.0f, 0.0f, 0.0f, 0.0f}}; // find min/max CGV_INT mask_shifted = partition_mask << 1; for (CGU_INT k3 = 0; k3 <= numEntries; k3++) { mask_shifted >>= 1; if ((mask_shifted & 1) == 0) continue; CGV_FLOAT dot = 0; CGV_Vec4f diff; diff.x = image_src[k3].x - image_mean.x; diff.y = image_src[k3].y - image_mean.y; diff.z = image_src[k3].z - image_mean.z; diff.w = image_src[k3].w - image_mean.w; dot += cmp_dot4f(eigen_vector, diff); ext[0].x = cmp_minf(ext[0].x, dot); ext[0].y = cmp_minf(ext[0].y, dot); ext[0].z = cmp_minf(ext[0].z, dot); ext[0].w = cmp_minf(ext[0].w, dot); ext[1].x = cmp_maxf(ext[1].x, dot); ext[1].y = cmp_maxf(ext[1].y, dot); ext[1].z = cmp_maxf(ext[1].z, dot); ext[1].w = cmp_maxf(ext[1].w, dot); } // create some distance if the endpoints collapse if (ext[1].x - ext[0].x < 1.0f) { ext[0] -= 0.5f; ext[1] += 0.5f; } cmp_endpoints2(end_points_out, ext, eigen_vector, image_mean); } CGV_UINT8 clampIndex2(CGV_UINT8 v, CGV_UINT8 a, CGV_UINT8 b) { if (v < a) return a; else if (v > b) return b; return v; } void cmp_block_index(CMP_INOUT CGU_UINT32 index_out[16], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean, CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_UINT32 numEntries // Range 0..15 (MAX_SUBSET_SIZE) ) { //===================== // Get Projected Index //===================== CGV_FLOAT image_projected[16]; CGV_FLOAT image_v[16]; CGV_FLOAT image_z[16]; CGV_FLOAT projected_high; // Values are +ve about centered image projection CGV_FLOAT projected_low; // Values are -ve about centered image projection CGV_FLOAT image_s; //==================================================================== // Center the image to new coordinate axis centered at the mean value //==================================================================== CGV_Vec4f image_centered[16]; CGV_Vec4f diff; for (CGU_UINT32 k1 = 0; k1 <= numEntries; k1++) { diff.x = image_src[k1].x - image_mean.x; diff.y = image_src[k1].y - image_mean.y; diff.z = image_src[k1].z - image_mean.z; diff.w = image_src[k1].w - image_mean.w; image_centered[k1] = diff * eigen_vector; image_projected[k1] = image_centered[k1].x + image_centered[k1].y + image_centered[k1].z + image_centered[k1].w; } projected_high = image_projected[0]; projected_low = image_projected[0]; for (CGU_UINT32 i1 = 1; i1 <= numEntries; i1++) { if (projected_high < image_projected[i1]) projected_high = image_projected[i1]; if (projected_low > image_projected[i1]) projected_low = image_projected[i1]; } CGV_FLOAT img_diff = projected_low - projected_high; if (img_diff == 0.0f) return; image_s = numEntries / img_diff; // Get initial index projection for (CGU_UINT32 idx = 0; idx <= numEntries; idx++) { image_v[idx] = image_projected[idx] * image_s; image_z[idx] = floor(image_v[idx] + 0.5F - projected_high * image_s); index_out[idx] = (CGV_UINT32)image_z[idx]; } // get minimum index CGU_UINT32 index_min = index_out[0]; for (CGU_UINT32 i3 = 1; i3 <= numEntries; i3++) { if (index_out[i3] < index_min) index_min = index_out[i3]; } // Reposition all index by min index (using min index as 0) //printf("index : "); for (CGU_UINT32 i4 = 0; i4 <= numEntries; i4++) { index_out[i4] = clampIndex2(index_out[i4] - index_min, 0, 15); 
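// Worked example for this repositioning loop (illustrative): if the raw projected
// indices were {3, 5, 9} with index_min = 3, the loop stores {0, 2, 6}. Shifting by
// the minimum reuses the low end of the 0..15 index range no matter where the
// projections landed along the principal axis; clampIndex2 guards the top end.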
//printf("%02x,", index_out[i4]); } //printf("\n"); } CGU_UINT32 cmp_calcblockerr(CGU_Vec4ui endPoint_in[2], CGU_Vec4ui image_src[16]) { CGU_UINT32 error = 0; CGU_Vec4ui pixel = image_src[0]; CGU_Vec4ui endPoint[2]; CGU_Vec4i pixelDiff; endPoint[0] = endPoint_in[0]; endPoint[1] = endPoint_in[1]; pixelDiff.x = pixel.x - endPoint[0].x; pixelDiff.y = pixel.y - endPoint[0].y; pixelDiff.z = pixel.z - endPoint[0].z; pixelDiff.w = pixel.w - endPoint[0].w; CGU_Vec4i span; CGU_Vec2i span_norm_sqr; CGU_Vec2i dotProduct; span.x = endPoint[1].x - endPoint[0].x; span.y = endPoint[1].y - endPoint[0].y; span.z = endPoint[1].z - endPoint[0].z; span.w = endPoint[1].w - endPoint[0].w; span_norm_sqr = cmp_dotVec4i(span, span); dotProduct = cmp_dotVec4i(span, pixelDiff); if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x)) { span.x = -span.x; span.y = -span.y; span.z = -span.z; span.w = -span.w; swap(endPoint[0], endPoint[1]); } CGU_UINT32 color_index; CGU_Vec4ui pixel_r; for (CGU_UINT32 i = 0; i < 16; i++) { pixel = image_src[i]; pixelDiff.x = pixel.x - endPoint[0].x; pixelDiff.y = pixel.y - endPoint[0].y; pixelDiff.z = pixel.z - endPoint[0].z; pixelDiff.w = pixel.w - endPoint[0].w; dotProduct.x = cmp_dotVec4i(span, pixelDiff); color_index = (span_norm_sqr.x <= 0 || dotProduct.x <= 0) ? 0 : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]); pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32u) >> 6; Ensure_A_Is_Larger(pixel_r, pixel); pixel_r -= pixel; error += ComputeError(pixel_r, pixel_r); } return error; } CGU_FLOAT cmp_GetIndexedEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 index_out[16], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_INT partition_mask) { CGV_Vec4f image_mean = {0.0f, 0.0f, 0.0f, 0.0f}; CGV_Vec4f eigen_vector; for (CGU_INT i0 = 0; i0 < 16; i0++) index_out[i0] = 0; cmp_eigen_vector(eigen_vector, image_mean, image_src, numEntries); cmp_block_endpoints(epo_code_out, eigen_vector, image_mean, image_src, numEntries, partition_mask); cmp_block_index(index_out, eigen_vector, image_mean, image_src, numEntries); CGU_UINT32 besterr = cmp_calcblockerr(epo_code_out, image_src); return besterr; } void cmp_encode_mode6(CMP_INOUT CGU_UINT32 cmp_out[4], CMP_IN CGU_Vec4ui epo_code_out[2], CMP_IN CGU_UINT32 packed_color_index[2]) { cmp_encode_apply_swap(epo_code_out, packed_color_index, 4); CGU_INT k; for (k = 0; k < 4; k++) cmp_out[k] = 0; CGU_INT pos = 0; // mode 6 pos = cmp_Write32Bit(cmp_out, pos, 7, 64); // endpoints pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].x >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].x >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].y >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].y >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].z >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].z >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].w >> 1); pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].w >> 1); // p bits pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[0].x & 1); pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[1].x & 1); // quantized values cmp_encode_index2(cmp_out, pos, packed_color_index, 4, 0); } //====================================== MODES 01237 ========================================== CGU_UINT32 index_collapse2(CMP_INOUT CGU_UINT32 index[16], 
CGU_UINT32 numEntries) { CGU_UINT32 minIndex = index[0]; CGU_UINT32 MaxIndex = index[0]; for (CGU_UINT32 km = 1; km < numEntries; km++) { if (index[km] < minIndex) minIndex = index[km]; if (index[km] > MaxIndex) MaxIndex = index[km]; } if (MaxIndex == 0) return 0; CGU_UINT32 D = 1; for (CGU_UINT32 d = 2; d <= MaxIndex - minIndex; d++) { for (CGU_UINT32 ent = 0U; ent < numEntries; ent++) { CGU_UINT8 imod = (index[ent] - minIndex); if (fmod(imod, d) > 0.0f) { if (ent >= numEntries) D = d; break; } } } CGU_FLOAT invD = 1.0f / D; for (CGU_UINT32 ki = 0; ki < numEntries; ki++) { index[ki] = (CGU_UINT32)((index[ki] - minIndex) * invD); } for (CGU_UINT32 k = 1; k < numEntries; k++) { if (index[k] > MaxIndex) MaxIndex = index[k]; } return (MaxIndex); } INLINE void GetClusterMean2(CMP_INOUT CGV_Vec4f image_cluster_mean[16], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_UINT32 index_cluster[16], CMP_IN CGU_UINT32 numEntries, // < 16 CMP_IN CGU_UINT32 channels3or4) { // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) // unused index values are underfined CGU_UINT32 i_cnt[16]; CGU_UINT32 i_comp[16]; CGU_UINT32 idx; for (CGU_UINT32 i0 = 0; i0 < numEntries; i0++) { idx = index_cluster[i0] & 0x0F; i_cnt[idx] = 0; image_cluster_mean[idx] = 0.0f; } CGU_UINT32 ic = 0; for (CGU_UINT32 i1 = 0; i1 < numEntries; i1++) { idx = index_cluster[i1] & 0x0F; if (i_cnt[idx] == 0) i_comp[ic++] = idx; i_cnt[idx]++; image_cluster_mean[idx].x += image_src[i1].x; image_cluster_mean[idx].y += image_src[i1].y; image_cluster_mean[idx].z += image_src[i1].z; image_cluster_mean[idx].w += image_src[i1].w; } for (CGU_UINT32 i = 0; i < ic; i++) { CGU_UINT32 icmp = i_comp[i]; if (i_cnt[icmp] != 0) { image_cluster_mean[icmp].x = (CGV_FLOAT)floor((image_cluster_mean[icmp].x / (CGV_FLOAT)i_cnt[icmp]) + 0.5F); image_cluster_mean[icmp].y = (CGV_FLOAT)floor((image_cluster_mean[icmp].y / (CGV_FLOAT)i_cnt[icmp]) + 0.5F); image_cluster_mean[icmp].z = (CGV_FLOAT)floor((image_cluster_mean[icmp].z / (CGV_FLOAT)i_cnt[icmp]) + 0.5F); if (channels3or4 == 4) image_cluster_mean[icmp].w = (CGV_FLOAT)floor((image_cluster_mean[icmp].w / (CGV_FLOAT)i_cnt[icmp]) + 0.5F); else image_cluster_mean[icmp].w = 0.0f; } } } #ifndef ASPM_HLSL // CPU Version #define USE_OLDCODE INLINE CGU_UINT8 cmp_get_partition_subset2(CMP_IN CGU_INT part_id, CMP_IN CGU_INT maxSubsets, CMP_IN CGU_INT index) { if (maxSubsets == 2) { CGU_UINT32 mask_packed = subset_mask_table2[part_id]; return ((mask_packed & (0x01 << index)) ? 1 : 0); // This can be moved to caller, just return mask!! } // 3 region subsets part_id += 64; CGU_UINT32 mask0 = subset_mask_table2[part_id] & 0xFFFF; CGU_UINT32 mask1 = subset_mask_table2[part_id] >> 16; CGU_UINT32 mask = 0x01 << index; return ((mask1 & mask) ? 2 : 0 + (mask0 & mask) ? 1 : 0); // This can be moved to caller, just return mask!! 
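// subset_mask_table2 usage, worked through (illustrative): for a two-subset shape the
// packed 16-bit mask holds one bit per texel, e.g. mask_packed = 0xCCCC (partition 0
// in blockPartitions) puts texels 2,3,6,7,10,11,14,15 in subset 1 and the rest in
// subset 0. For three-subset shapes the entry packs two 16-bit masks: a texel set in
// mask1 maps to subset 2, one set in mask0 to subset 1, and otherwise to subset 0.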
} void cmp_GetPartitionSubSet2_mode01237(CMP_INOUT CGV_Vec4ui image_subsets[3][16], // OUT: Subset pattern mapped with image src colors CMP_INOUT CGU_INT entryCount_out[3], // OUT: Number of entries per subset CMP_IN CGU_UINT8 partition, // Partition Shape 0..63 CMP_IN CGV_Vec4ui image_src[16], // Image colors CMP_IN CGU_INT blockMode, // [0,1,2,3 or 7] CMP_IN CGU_UINT8 channels3or4) { // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) CGU_UINT8 maxSubsets = 2; if (blockMode == 0 || blockMode == 2) maxSubsets = 3; entryCount_out[0] = 0; entryCount_out[1] = 0; entryCount_out[2] = 0; for (CGU_INT i = 0; i < 16; i++) { CGU_UINT8 subset = cmp_get_partition_subset2(partition, maxSubsets, i); image_subsets[subset][entryCount_out[subset]].x = image_src[i].x; image_subsets[subset][entryCount_out[subset]].y = image_src[i].y; image_subsets[subset][entryCount_out[subset]].z = image_src[i].z; // if we have only 3 channels then set the alpha subset to 0 if (channels3or4 == 3) image_subsets[subset][entryCount_out[subset]].w = 0.0F; else image_subsets[subset][entryCount_out[subset]].w = image_src[i].w; entryCount_out[subset]++; } } void cmp_GetImageCentered(CMP_INOUT CGV_Vec4f image_centered[16], CMP_INOUT CGV_Vec4f CMP_REFINOUT mean_out, CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4) { (channels3or4); mean_out = 0.0f; CGU_INT k; for (k = 0; k < numEntries; k++) { mean_out.x = mean_out.x + image_src[k].x; mean_out.y = mean_out.y + image_src[k].y; mean_out.z = mean_out.z + image_src[k].z; if (channels3or4 == 4) mean_out.w = mean_out.w + image_src[k].w; } mean_out /= (CGV_FLOAT)numEntries; for (k = 0; k < numEntries; k++) { image_centered[k].x = image_src[k].x - mean_out.x; image_centered[k].y = image_src[k].y - mean_out.y; image_centered[k].z = image_src[k].z - mean_out.z; if (channels3or4 == 4) image_centered[k].w = image_src[k].w - mean_out.w; } } void cmp_GetCovarianceVector(CMP_INOUT CGV_FLOAT covariance_out[16], CMP_IN CGV_Vec4f image_centered[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4) { CGU_UINT8 ch1; CGU_UINT8 ch2; CGU_INT k; for (ch1 = 0; ch1 < channels3or4; ch1++) for (ch2 = 0; ch2 <= ch1; ch2++) { covariance_out[ch1 + ch2 * 4] = 0; for (k = 0; k < numEntries; k++) covariance_out[ch1 + ch2 * 4] += image_centered[k][ch1] * image_centered[k][ch2]; } for (ch1 = 0; ch1 < channels3or4; ch1++) for (ch2 = ch1 + 1; ch2 < channels3or4; ch2++) covariance_out[ch1 + ch2 * 4] = covariance_out[ch2 + ch1 * 4]; } void cmp_GetEigenVector(CMP_INOUT CGV_Vec4f CMP_REFINOUT EigenVector_out, // Normalized Eigen Vector output CMP_IN CGV_FLOAT CovarianceVector[16], // Covariance Vector CMP_IN CGU_UINT8 channels3or4) { CGV_FLOAT vector_covIn[16]; CGV_FLOAT vector_covOut[16]; CGV_FLOAT vector_maxCovariance; CGU_UINT8 ch1; CGU_UINT8 ch2; CGU_UINT8 ch3; for (ch1 = 0; ch1 < channels3or4; ch1++) for (ch2 = 0; ch2 < channels3or4; ch2++) { vector_covIn[ch1 + ch2 * 4] = CovarianceVector[ch1 + ch2 * 4]; } vector_maxCovariance = 0; for (ch1 = 0; ch1 < channels3or4; ch1++) { if (vector_covIn[ch1 + ch1 * 4] > vector_maxCovariance) vector_maxCovariance = vector_covIn[ch1 + ch1 * 4]; } // Normalize Input Covariance Vector for (ch1 = 0; ch1 < channels3or4; ch1++) for (ch2 = 0; ch2 < channels3or4; ch2++) { if (vector_maxCovariance > 0) vector_covIn[ch1 + ch2 * 4] = vector_covIn[ch1 + ch2 * 4] / vector_maxCovariance; } for (ch1 = 0; ch1 < channels3or4; ch1++) { for (ch2 = 0; ch2 < channels3or4; ch2++) { CGV_FLOAT vector_temp_cov = 0; for (ch3 = 0; ch3 < channels3or4; ch3++) { 
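// (Illustrative note.) This triple loop computes vector_covOut = C * C, the square of
// the normalized covariance matrix. Squaring maps each eigenvalue lambda to lambda^2,
// widening the gap between the dominant eigenvalue and the rest, so the row selected
// below through maxCovariance_channel approximates the principal axis noticeably
// better than a row of C itself -- in effect one squaring step of the power method
// before the final normalization.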
vector_temp_cov = vector_temp_cov + vector_covIn[ch1 + ch3 * 4] * vector_covIn[ch3 + ch2 * 4]; } vector_covOut[ch1 + ch2 * 4] = vector_temp_cov; } } vector_maxCovariance = 0; CGU_INT maxCovariance_channel = 0; for (ch1 = 0; ch1 < channels3or4; ch1++) { if (vector_covOut[ch1 + ch1 * 4] > vector_maxCovariance) { maxCovariance_channel = ch1; vector_maxCovariance = vector_covOut[ch1 + ch1 * 4]; } } CGV_FLOAT vector_t = 0; for (ch1 = 0; ch1 < channels3or4; ch1++) { vector_t = vector_t + vector_covOut[maxCovariance_channel + ch1 * 4] * vector_covOut[maxCovariance_channel + ch1 * 4]; EigenVector_out[ch1] = vector_covOut[maxCovariance_channel + ch1 * 4]; } // Normalize the Eigen Vector vector_t = sqrt(vector_t); for (ch1 = 0; ch1 < channels3or4; ch1++) { if (vector_t > 0) EigenVector_out[ch1] = EigenVector_out[ch1] / vector_t; } } void cmp_GetProjecedImage(CMP_INOUT CGV_FLOAT projection_out[16], CMP_IN CGV_Vec4f image_centered[16], CMP_IN CGU_INT numEntries, CMP_IN CGV_Vec4f EigenVector, CMP_IN CGU_UINT8 channels3or4) { // EigenVector must be normalized for (CGU_INT k = 0; k < numEntries; k++) { projection_out[k] = 0.0F; projection_out[k] = projection_out[k] + (image_centered[k].x * EigenVector.x); projection_out[k] = projection_out[k] + (image_centered[k].y * EigenVector.y); projection_out[k] = projection_out[k] + (image_centered[k].z * EigenVector.z); if (channels3or4 == 4) projection_out[k] = projection_out[k] + (image_centered[k].w * EigenVector.w); } } typedef struct { CGV_FLOAT image; CGU_UINT8 index; } CMP_di2; void cmp_GetProjectedIndex(CMP_INOUT CGU_UINT8 projected_index_out[16], //output: index, uncentered, in the range 0..clusters-1 CMP_IN CGV_FLOAT image_projected[16], // image_block points, might be uncentered CMP_IN CGU_INT clusters, // clusters: number of points in the ramp (max 16) CMP_IN CGU_INT numEntries) { CMP_di2 what[16]; CGV_FLOAT image_v[16]; CGV_FLOAT image_z[16]; CGV_FLOAT image_l; CGV_FLOAT image_mm; CGV_FLOAT image_r = 0.0F; CGV_FLOAT image_dm = 0.0F; CGV_FLOAT image_min; CGV_FLOAT image_max; CGV_FLOAT image_s; CGU_INT i; CGU_INT j; for (i = 0; i < 16; i++) projected_index_out[i] = 0; image_min = image_projected[0]; image_max = image_projected[0]; for (i = 1; i < numEntries; i++) { if (image_min < image_projected[i]) image_min = image_projected[i]; if (image_max > image_projected[i]) image_max = image_projected[i]; } CGV_FLOAT img_diff = image_max - image_min; if (img_diff == 0.0f) return; if (cmp_isnan(img_diff)) return; image_s = (clusters - 1) / img_diff; for (i = 0; i < numEntries; i++) { image_v[i] = image_projected[i] * image_s; image_z[i] = floor(image_v[i] + 0.5F - image_min * image_s); projected_index_out[i] = (CGU_UINT8)image_z[i]; what[i].image = image_v[i] - image_z[i] - image_min * image_s; what[i].index = i; image_dm += what[i].image; image_r += what[i].image * what[i].image; } if (numEntries * image_r - image_dm * image_dm >= (CGV_FLOAT)(numEntries - 1) / 8) { image_dm /= numEntries; for (i = 0; i < numEntries; i++) what[i].image -= image_dm; CGU_UINT8 tmp_index; CGV_FLOAT tmp_image; for (i = 1; i < numEntries; i++) { for (j = i; j > 0; j--) { if (what[j - 1].image > what[j].image) { tmp_index = what[j].index; tmp_image = what[j].image; what[j].index = what[j - 1].index; what[j].image = what[j - 1].image; what[j - 1].index = tmp_index; what[j - 1].image = tmp_image; } } } // got into fundamental simplex // move coordinate system origin to its center // i=0 < numEntries avoids varying int division by 0 for (i = 0; i < numEntries; i++) { what[i].image = 
what[i].image - (CGV_FLOAT)(((2.0f * i + 1) - numEntries) / (2.0f * numEntries)); } image_mm = 0.0F; image_l = 0.0F; j = -1; for (i = 0; i < numEntries; i++) { image_l += what[i].image; if (image_l < image_mm) { image_mm = image_l; j = i; } } j = j + 1; // avoid j = j%numEntries use this while (j > numEntries) j = j - numEntries; for (i = j; i < numEntries; i++) { CGU_UINT8 idx = what[i].index; CGU_UINT8 pidx = projected_index_out[idx] + 1; //gather_index(projected_index_out,idx)+1; projected_index_out[idx] = pidx; // scatter_index(projected_index_out,idx,pidx); } } // get minimum index CGU_UINT8 index_min = projected_index_out[0]; for (i = 1; i < numEntries; i++) { if (projected_index_out[i] < index_min) index_min = projected_index_out[i]; } // reposition all index by min index (using min index as 0) for (i = 0; i < numEntries; i++) { projected_index_out[i] = cmp_clampi(projected_index_out[i] - index_min, 0, 15); } } CGV_FLOAT cmp_err_Total(CMP_IN CGV_Vec4ui image_src1[16], CMP_IN CGV_Vec4f image_src2[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4) { CGV_FLOAT err_t = 0.0F; for (CGU_INT k = 0; k < numEntries; k++) { err_t = err_t + cmp_squaref(image_src1[k].x - image_src2[k].x); err_t = err_t + cmp_squaref(image_src1[k].y - image_src2[k].y); err_t = err_t + cmp_squaref(image_src1[k].z - image_src2[k].z); if (channels3or4 == 4) err_t = err_t + cmp_squaref(image_src1[k].w - image_src2[k].w); } return err_t; }; CGV_FLOAT cmp_GetQuantizeIndex_old(CMP_INOUT CGU_UINT8 index_out[16], CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_INT numClusters, CMP_IN CGU_UINT8 channels3or4) { CGV_FLOAT covariance_vector[16]; CGV_Vec4f image_centered[16]; CGV_FLOAT image_projected[16]; CGV_Vec4f image_mean = 0.0f; CGV_Vec4f eigen_vector = 0.0f; // Init vars for (CGU_INT ik = 0; ik < 16; ik++) { covariance_vector[ik] = 0.0f; image_centered[ik] = 0.0f; image_projected[ik] = 0.0f; } cmp_GetImageCentered(image_centered, image_mean, image_src, numEntries, channels3or4); cmp_GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4); //----------------------------------------------------- // check if all covariances are the same // if so then set all index to same value 0 and return // use EPSILON to set the limit for all same limit //----------------------------------------------------- CGV_FLOAT image_covt = 0.0F; image_covt = covariance_vector[0]; image_covt = image_covt + covariance_vector[5]; image_covt = image_covt + covariance_vector[10]; if (channels3or4 == 4) image_covt = image_covt + covariance_vector[15]; if (image_covt < 0.00390625f) { for (CGU_INT i = 0; i < 16; i++) index_out[i] = 0; return 0.0f; } cmp_GetEigenVector(eigen_vector, covariance_vector, channels3or4); cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4); cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries); //========================================== // Refine //========================================== CGV_FLOAT image_q = 0.0F; eigen_vector = 0.0f; for (CGU_INT k = 0; k < numEntries; k++) { eigen_vector.x = eigen_vector.x + image_centered[k].x * index_out[k]; eigen_vector.y = eigen_vector.y + image_centered[k].y * index_out[k]; eigen_vector.z = eigen_vector.z + image_centered[k].z * index_out[k]; if (channels3or4 == 4) eigen_vector.w = eigen_vector.w + image_centered[k].w * index_out[k]; } image_q = image_q + eigen_vector.x * eigen_vector.x; image_q = image_q + eigen_vector.y * eigen_vector.y; image_q = image_q + 
eigen_vector.z * eigen_vector.z; if (channels3or4 == 4) image_q = image_q + eigen_vector.w * eigen_vector.w; image_q = sqrt(image_q); // direction needs to be normalized if (image_q != 0.0F) eigen_vector = eigen_vector / image_q; // Get new projected data cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4); cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries); // Calc Error CGV_FLOAT image_t = 0.0F; CGV_FLOAT index_average = 0.0F; for (CGU_INT ik = 0; ik < numEntries; ik++) { index_average = index_average + index_out[ik]; image_t = image_t + index_out[ik] * index_out[ik]; } index_average = index_average / (CGV_FLOAT)numEntries; image_t = image_t - index_average * index_average * (CGV_FLOAT)numEntries; if (image_t != 0.0F) image_t = 1.0F / image_t; eigen_vector = 0.0f; for (CGU_INT nk = 0; nk < numEntries; nk++) { eigen_vector.x = eigen_vector.x + image_centered[nk].x * index_out[nk]; eigen_vector.y = eigen_vector.y + image_centered[nk].y * index_out[nk]; eigen_vector.z = eigen_vector.z + image_centered[nk].z * index_out[nk]; if (channels3or4 == 4) eigen_vector.w = eigen_vector.w + image_centered[nk].w * index_out[nk]; } CGV_Vec4f image_decomp[SOURCE_BLOCK_SIZE]; for (CGU_UINT32 ii = 0; ii < SOURCE_BLOCK_SIZE; ii++) image_decomp[ii] = 0.0f; for (CGU_INT i = 0; i < numEntries; i++) { image_decomp[i].x = image_mean.x + eigen_vector.x * image_t * (index_out[i] - index_average); image_decomp[i].y = image_mean.y + eigen_vector.y * image_t * (index_out[i] - index_average); image_decomp[i].z = image_mean.z + eigen_vector.z * image_t * (index_out[i] - index_average); if (channels3or4 == 4) image_decomp[i].w = image_mean.w + eigen_vector.w * image_t * (index_out[i] - index_average); } CGV_FLOAT err_1 = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4); return err_1; } typedef struct { CGV_FLOAT image; CGU_UINT8 index; } CMP_du2; void cmp_sortPartitionProjection(CMP_IN CGV_FLOAT projection[64], CMP_INOUT CGU_UINT8 order[64], CMP_IN CGU_UINT8 numPartitions) // max 64 { CMP_du2 what[64]; CGU_UINT8 Parti; CGU_UINT8 Partj; for (Parti = 0; Parti < numPartitions; Parti++) { what[Parti].index = Parti; what[Parti].image = projection[Parti]; } CGU_UINT8 index; CGV_FLOAT data; for (Parti = 1; Parti < numPartitions; Parti++) { for (Partj = Parti; Partj > 0; Partj--) { if (what[Partj - 1].image > what[Partj].image) { index = what[Partj].index; data = what[Partj].image; what[Partj].index = what[Partj - 1].index; what[Partj].image = what[Partj - 1].image; what[Partj - 1].index = index; what[Partj - 1].image = data; } } } for (Parti = 0; Parti < numPartitions; Parti++) order[Parti] = what[Parti].index; }; CGU_BOOL cmp_get_ideal_cluster(CMP_INOUT CGV_Vec4f image_cluster[2], CMP_IN CGU_UINT32 index_cluster[16], CMP_IN CGU_INT Mi_, CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4) { // get ideal cluster centers CGV_Vec4f image_cluster_mean[16]; for (CGU_INT ii = 0; ii < numEntries; ii++) { image_cluster_mean[ii] = 0.0f; } GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4); // unrounded CGV_FLOAT image_matrix0[2] = {0, 0}; // matrix /inverse matrix CGV_FLOAT image_matrix1[2] = {0, 0}; // matrix /inverse matrix CGV_Vec4f image_rp[2]; // right part for RMS fit problem image_rp[0] = 0.0f; image_rp[1] = 0.0f; // weight with cnt if runnning on compacted index for (CGU_INT k = 0; k < numEntries; k++) { image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - 
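// (Illustrative sketch of the normal equations this loop accumulates.) A ramp point
// with index i_k reconstructs as (e0 * (Mi_ - i_k) + e1 * i_k) / Mi_, so the
// least-squares system for the endpoint pair, using the cluster means as targets, is
//     | sum (Mi_-i_k)^2    sum i_k*(Mi_-i_k) | |e0/Mi_|   | sum mean_k*(Mi_-i_k) |
//     | sum i_k*(Mi_-i_k)  sum i_k^2         | |e1/Mi_| = | sum mean_k*i_k       |
// image_matrix0/1 hold the symmetric matrix, image_rp the right-hand side, matrix_dd
// is the determinant used for the closed-form 2x2 inverse, and the final scale by
// Mif returns from the e/Mi_ variables to the endpoints themselves.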
index_cluster[k]); image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]); // im is symmetric image_matrix1[1] += index_cluster[k] * index_cluster[k]; image_rp[0] += image_cluster_mean[index_cluster[k]] * (Mi_ - index_cluster[k]); image_rp[1] += image_cluster_mean[index_cluster[k]] * index_cluster[k]; } CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1]; // assert(matrix_dd !=0); // matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) collinear which implies only one active index; // taken care of separately if (matrix_dd == 0) { image_cluster[0] = 0.0f; image_cluster[1] = 0.0f; return FALSE; } image_matrix1[0] = image_matrix0[0]; image_matrix0[0] = image_matrix1[1] / matrix_dd; image_matrix1[1] = image_matrix1[0] / matrix_dd; image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd; CGV_FLOAT Mif = (CGV_FLOAT)Mi_; // values can exceed 255 here, clamp made no diff in quality! image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif); image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif); return TRUE; } CGV_FLOAT cmp_quant_solid_color(CMP_INOUT CGU_UINT32 index_out[16], CMP_INOUT CGV_Vec4ui epo_code_out[2], CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 Mi_, CMP_IN CGU_UINT8 bits[4], CMP_IN CGU_INT type, CMP_IN CGU_UINT8 channels3or4, CMP_IN CGU_INT blockMode) { #ifndef ASPM_GPU #if defined(USE_NEW_SP_ERR_IDX) CGU_INT clogBC7 = 0; CGU_INT iv = Mi_ + 1; while (iv >>= 1) clogBC7++; old_init_BC7ramps(); // first time call inits global #endif #endif CGU_INT index_bits = g_modesettings[blockMode].indexBits; CGV_Vec4ui epo_0[2]; epo_0[0] = 0u; epo_0[1] = 0u; CGU_UINT8 image_log = 0; CGU_UINT8 image_idx = 0; CGU_BOOL use_par = FALSE; if (type != 0) use_par = TRUE; CGV_FLOAT error_1 = CMP_FLOAT_MAX; //CGU_UINT8 ch; CGU_UINT8 ch1; //CGU_INT k; CGU_INT i; for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type] && (error_1 != 0.0F); pn++) { CGU_Vec4ui o1[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}}; CGU_Vec4ui o2[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}}; if (use_par == TRUE) { if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][0]) o1[0][0] = 1; else o1[1][0] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][0]) o2[0][0] = 1; else o2[1][0] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][1]) o1[0][1] = 1; else o1[1][1] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][1]) o2[0][1] = 1; else o2[1][1] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][2]) o1[0][2] = 1; else o1[1][2] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][2]) o2[0][2] = 1; else o2[1][2] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][3]) o1[0][3] = 1; else o1[1][3] = 1; if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][3]) o2[0][3] = 1; else o2[1][3] = 1; } CGU_INT image_tcr[MAX_CHANNELS]; CGU_INT epo_dr_0[MAX_CHANNELS]; CGV_FLOAT error_0 = CMP_FLOAT_MAX; for (CGU_UINT8 iclogBC7 = 0; iclogBC7 < (1 << index_bits) && (error_0 != 0); iclogBC7++) { CGV_FLOAT error_t = 0; CGU_INT t1o[MAX_CHANNELS], t2o[MAX_CHANNELS]; for (ch1 = 0; ch1 < channels3or4; ch1++) { // D CGV_FLOAT error_ta = CMP_FLOAT_MAX; for (CGU_UINT8 t1 = o1[0][ch1]; t1 < o1[1][ch1]; t1++) { // C // This is needed for non-integer mean points of "collapsed" sets for (CGU_UINT8 t2 = o2[0][ch1]; t2 < o2[1][ch1]; t2++) { // B CGU_INT image_tf; CGU_INT image_tc; image_tf = (CGU_INT)floor(image_src[0][ch1]); image_tc = 
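// (Illustrative note.) The representative value image_src[0][ch1] need not quantize
// cleanly, so both rounding candidates are tried: image_tf = floor(...) above and
// image_tc = ceil(...) on the next line. old_get_sperr() then looks up the
// precomputed single-point error of each candidate for this ramp/bit/p-bit
// combination, and the candidate with the smaller table error wins (with plain
// round-to-nearest on a tie).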
                        image_tc = (CGU_INT)ceil(image_src[0][ch1]);
#ifndef ASPM_GPU
#ifdef USE_NEW_SP_ERR_IDX
                        CGV_FLOAT err_tf = old_get_sperr(clogBC7, bits[ch1], image_tf, t1, t2, iclogBC7);
                        CGV_FLOAT err_tc = old_get_sperr(clogBC7, bits[ch1], image_tc, t1, t2, iclogBC7);
                        if (err_tf > err_tc)
                            image_tcr[ch1] = image_tc;
                        else if (err_tf < err_tc)
                            image_tcr[ch1] = image_tf;
                        else
                            image_tcr[ch1] = (CGV_INT)floor(image_src[ch1][COMP_RED] + 0.5F);
                        //===============================
                        // Refine this for better quality!
                        //===============================
                        CGV_FLOAT error_tr;
                        error_tr = old_get_sperr(clogBC7, bits[ch1], image_tcr[ch1], t1, t2, iclogBC7);
                        error_tr = (error_tr * error_tr) + 2 * error_tr * old_img_absf(image_tcr[ch1] - image_src[ch1][COMP_RED]) +
                                   (image_tcr[ch1] - image_src[ch1][COMP_RED]) * (image_tcr[ch1] - image_src[ch1][COMP_RED]);
                        if (error_tr < error_ta)
                        {
                            error_ta      = error_tr;
                            t1o[ch1]      = t1;
                            t2o[ch1]      = t2;
                            epo_dr_0[ch1] = cmp_clampi(image_tcr[ch1], 0, 255);
                        }
#endif
#else
                        image_tcr[ch1] = (CGU_INT)floor(image_src[0][ch1] + 0.5F);
                        error_ta       = 0;
                        t1o[ch1]       = t1;
                        t2o[ch1]       = t2;
                        epo_dr_0[ch1]  = cmp_clampi(image_tcr[ch1], 0, 255);
#endif
                    }  // B
                }  // C
                error_t += error_ta;
            }  // D
            if (error_t <= error_0)
            {
                // We have a solid color: Use image src if on GPU
                image_log = iclogBC7;
                image_idx = image_log;
#ifndef ASPM_GPU
#ifdef USE_BC7_SP_ERR_IDX
                if (BC7EncodeRamps2.ramp_init)
                {
                    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                    {
                        CGV_INT index = (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits[ch]) * 256 * 2 * 2 * 16 * 2) +
                                        (epo_dr_0[ch] * 2 * 2 * 16 * 2) + (t1o[ch] * 2 * 16 * 2) + (t2o[ch] * 16 * 2) + (iclogBC7 * 2);
                        epo_0[0][ch] = BC7EncodeRamps2.sp_idx[index + 0] & 0xFF;
                        epo_0[1][ch] = BC7EncodeRamps2.sp_idx[index + 1] & 0xFF;
                    }
                }
#endif
#else
                CGU_UINT8 ch;
                CGU_UINT8 k;
                // This needs improving
                CGV_FLOAT MinC[4] = {255, 255, 255, 255};
                CGV_FLOAT MaxC[4] = {0, 0, 0, 0};
                // get min max colors
                for (ch = 0; ch < channels3or4; ch++)
                    for (k = 0; k < numEntries; k++)
                    {
                        if (image_src[k][ch] < MinC[ch])
                            MinC[ch] = image_src[k][ch];
                        if (image_src[k][ch] > MaxC[ch])
                            MaxC[ch] = image_src[k][ch];
                    }
                epo_0[0][0] = (CGU_UINT8)MinC[0];
                epo_0[1][0] = (CGU_UINT8)MaxC[0];
                epo_0[0][1] = (CGU_UINT8)MinC[1];
                epo_0[1][1] = (CGU_UINT8)MaxC[1];
                epo_0[0][2] = (CGU_UINT8)MinC[2];
                epo_0[1][2] = (CGU_UINT8)MaxC[2];
                epo_0[0][3] = (CGU_UINT8)MinC[3];
                epo_0[1][3] = (CGU_UINT8)MaxC[3];
#endif
                error_0 = error_t;
            }
        }  // E
        if (error_0 < error_1)
        {
            image_idx       = image_log;
            epo_code_out[0] = epo_0[0];
            epo_code_out[1] = epo_0[1];
            error_1         = error_0;
        }
    }  // 1

    // Get Image error
    CGV_Vec4f image_decomp[16];
    for (i = 0; i < numEntries; i++)
    {
        index_out[i] = image_idx;
        {
            image_decomp[i][0] = cmp_GetRamp(index_bits, bits[0], epo_code_out[0].x, epo_code_out[1].x, i);
            image_decomp[i][1] = cmp_GetRamp(index_bits, bits[1], epo_code_out[0].y, epo_code_out[1].y, i);
            image_decomp[i][2] = cmp_GetRamp(index_bits, bits[2], epo_code_out[0].z, epo_code_out[1].z, i);
            if (channels3or4 == 4)
                image_decomp[i][3] = cmp_GetRamp(index_bits, bits[3], epo_code_out[0].w, epo_code_out[1].w, i);
        }
    }

    // Do we need to do this rather than err_1 * numEntries?
    CGV_FLOAT error_quant;
    error_quant = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);
    return error_quant;
}

INLINE CGV_FLOAT old_sq_image(CGV_FLOAT v)
{
    return v * v;
}
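//====================================================================================
// Note on cmp_shake3 below (explanatory sketch, not part of the encoder): the
// "shake" is a small exhaustive search around the ideal (unquantized) endpoints.
// Per channel, each endpoint is first quantized with cmp_ep_find_floor2, then every
// endpoint pair in a window of roughly -1..+2 quantized steps is scored by its
// squared ramp error against the source pixels; with parity (p-bit) types the step
// is 2, so only codes of the requested parity are visited. Errors are kept per
// parity combination in err_ed[(ppA * 8) + (ppB * 4) + ch] so that the loop which
// follows can pick the best parity vector allowed by cmp_par_vectors_nd for this
// mode type.
//====================================================================================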
CGV_FLOAT cmp_shake3(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                     CMP_IN CGV_Vec4f image_cluster[2],
                     CMP_IN CGU_UINT32 index_cidx[16],
                     CMP_IN CGV_Vec4ui image_src[16],
                     CMP_IN CGU_INT index_bits,
                     CMP_IN CGU_INT type,
                     CMP_IN CGU_UINT8 max_bits[4],
                     CMP_IN CGU_UINT8 use_par,
                     CMP_IN CGU_INT numEntries,  // max 16
                     CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT best_err   = CMP_FLOAT_MAX;
    CGV_FLOAT err_ed[16] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    CGU_INT epo_code_par[2][2][2][MAX_CHANNELS];

    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT8 ppA = 0;
        CGU_UINT8 ppB = 0;
        CGU_UINT8 rr  = (use_par ? 2 : 1);
        CGU_INT epo_code_epi[2][2];  // first/second, coord, begin range / end range
        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max = 2
            for (ppB = 0; ppB < rr; ppB++)
            {  // loop max = 2
                // set default ranges
                epo_code_epi[0][0] = epo_code_epi[0][1] = cmp_ep_find_floor2(image_cluster[0][ch], max_bits[ch], use_par, ppA);
                epo_code_epi[1][0] = epo_code_epi[1][1] = cmp_ep_find_floor2(image_cluster[1][ch], max_bits[ch], use_par, ppB);

                // set begin range
                epo_code_epi[0][0] -= ((epo_code_epi[0][0] < 1 ? epo_code_epi[0][0] : 1)) & (~use_par);
                epo_code_epi[1][0] -= ((epo_code_epi[1][0] < 1 ? epo_code_epi[1][0] : 1)) & (~use_par);

                // set end range
                epo_code_epi[0][1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi[0][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[0][1] : 2) & (~use_par);
                epo_code_epi[1][1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi[1][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[1][1] : 2) & (~use_par);

                CGU_INT step = (1 << use_par);
                err_ed[(ppA * 8) + (ppB * 4) + ch] = CMP_FLOAT_MAX;
                for (CGU_INT epo_p1 = epo_code_epi[0][0]; epo_p1 <= epo_code_epi[0][1]; epo_p1 += step)
                {
                    for (CGU_INT epo_p2 = epo_code_epi[1][0]; epo_p2 <= epo_code_epi[1][1]; epo_p2 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGU_INT   _mc               = numEntries;
                        CGV_FLOAT image_ramp;
                        while (_mc > 0)
                        {
                            image_ramp = cmp_GetRamp(index_bits, max_bits[ch], epo_p1, epo_p2, index_cidx[_mc - 1]);
                            image_square_diff += cmp_squaref(image_ramp - image_src[(_mc - 1)][ch]);
                            _mc--;
                        }
                        if (image_square_diff < err_ed[(ppA * 8) + (ppB * 4) + ch])
                        {
                            err_ed[(ppA * 8) + (ppB * 4) + ch] = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch]      = epo_p1;
                            epo_code_par[ppA][ppB][1][ch]      = epo_p2;
                        }
                    }
                }
            }  // pp1
        }  // pp0
    }  // j

    //---------------------------------------------------------
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type]; pn++)
    {
        CGV_FLOAT err_2 = 0.0F;
        CGU_INT d1;
        CGU_INT d2;
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
            err_2 += err_ed[(d1 * 8) + (d2 * 4) + ch];
        }
        if (err_2 < best_err)
        {
            best_err = err_2;
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][0];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][0];
            epo_code_shake[0][0] = epo_code_par[d1][d2][0][0];
            epo_code_shake[1][0] = epo_code_par[d1][d2][1][0];
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][1];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][1];
            epo_code_shake[0][1] = epo_code_par[d1][d2][0][1];
            epo_code_shake[1][1] = epo_code_par[d1][d2][1][1];
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][2];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][2];
            epo_code_shake[0][2] = epo_code_par[d1][d2][0][2];
            epo_code_shake[1][2] = epo_code_par[d1][d2][1][2];
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][3];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][3];
            epo_code_shake[0][3] = epo_code_par[d1][d2][0][3];
            epo_code_shake[1][3] = epo_code_par[d1][d2][1][3];
        }
    }
    return best_err;
}

CGV_FLOAT cmp_requantized_index(CMP_INOUT CGU_UINT8 index_out[16],
                                CMP_INOUT CGU_Vec4ui epo_code_best[2],
                                CMP_IN CGU_INT index_bits,
                                CMP_IN
CGU_UINT8 max_bits[4], CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4) { //CGV_Vec4f image_requantize[16]; //CGV_FLOAT err_r = 0.0F; CGU_UINT8 k; CGU_UINT8 ch; // for (k = 0; k < 16; k++) // { // image_requantize[k][0] = cmp_GetRamp(index_bits, max_bits[0], epo_code_best[0][0], epo_code_best[1][0], k); // image_requantize[k][1] = cmp_GetRamp(index_bits, max_bits[1], epo_code_best[0][1], epo_code_best[1][1], k); // image_requantize[k][2] = cmp_GetRamp(index_bits, max_bits[2], epo_code_best[0][2], epo_code_best[1][2], k); // if (channels3or4 == 4) // image_requantize[k][3] = cmp_GetRamp(index_bits, max_bits[3], epo_code_best[0][3], epo_code_best[1][3], k); // else // image_requantize[k][3] = 0.0f; // } //========================================= // requantized image based on new epo_code //========================================= CGV_FLOAT image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS]; CGV_FLOAT err_r = 0.0F; for (ch = 0; ch < channels3or4; ch++) { for (k = 0; k < SOURCE_BLOCK_SIZE; k++) { image_requantize[k][ch] = cmp_GetRamp(index_bits, max_bits[ch], epo_code_best[0][ch], epo_code_best[1][ch], k); } } //========================================= // Calc the error for the requantized image //========================================= CGV_Vec4f imageDiff; //CGU_UINT8 block_entries = (1 << index_bits); // // for (k = 0; k < numEntries; k++) // { // CGV_FLOAT err_cmin = 262145.0f; // (256 * 256 * 4) + 1; CMP_FLOAT_MAX; // CGU_UINT8 hold_index = 0; // CGV_FLOAT image_err; // // for (CGU_UINT8 k1 = 0; k1 < block_entries; k1++) // { // imageDiff.x = image_requantize[k1].x - image_src[k].x; // imageDiff.y = image_requantize[k1].y - image_src[k].y; // imageDiff.z = image_requantize[k1].z - image_src[k].z; // imageDiff.w = image_requantize[k1].w - image_src[k].w; // image_err = cmp_dot4f(imageDiff, imageDiff); // // if (image_err < err_cmin) // { // err_cmin = image_err; // hold_index = k1; // } // } // // index_out[k] = hold_index; // err_r += err_cmin; // } //========================================= // Calc the error for the requantized image //========================================= for (k = 0; k < numEntries; k++) { CGV_FLOAT err_cmin = CMP_FLOAT_MAX; CGV_INT hold_index_j = 0; for (CGV_INT iclogBC7 = 0; iclogBC7 < (1 << index_bits); iclogBC7++) { CGV_FLOAT image_err = 0.0F; for (ch = 0; ch < channels3or4; ch++) { image_err += old_sq_image(image_requantize[iclogBC7][ch] - image_src[k][ch]); } if (image_err < err_cmin) { err_cmin = image_err; hold_index_j = iclogBC7; } } index_out[k] = (CGV_UINT8)hold_index_j; err_r += err_cmin; } return err_r; } CGV_FLOAT cmp_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 index_io[16], CMP_INOUT CGU_UINT32 index_packed_out[2], CMP_IN CGV_Vec4ui image_src[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 Mi_, CMP_IN CGU_UINT8 bits, CMP_IN CGU_UINT8 channels3or4, CMP_IN CGU_FLOAT errorThreshold, CMP_IN CGU_INT blockMode) { CGV_FLOAT err_best = CMP_FLOAT_MAX; CGU_INT type; CGU_UINT8 channels2 = 2 * channels3or4; type = bits % channels2; CGU_UINT8 use_par = (type != 0); CGU_UINT8 max_bits[4] = {0, 0, 0, 0}; CGU_UINT8 ch; CGU_INT k; for (ch = 0; ch < channels3or4; ch++) max_bits[ch] = (bits + channels2 - 1) / channels2; CGU_INT index_bits = g_modesettings[blockMode].indexBits; CGU_INT clt_clogBC7 = index_bits - 2; if (clt_clogBC7 > 3) return CMP_FLOAT_MAX; Mi_ = Mi_ - 1; CGU_UINT32 index_tmp[16]; CGU_INT maxTry = MAX_TRY_SHAKER; for (k = 0; k < numEntries; k++) index_tmp[k] = 
cmp_clampui8(index_io[k], 0, 15); epo_code_out[0] = 0u; epo_code_out[1] = 0u; CGV_FLOAT err_requant = 0.0F; CGU_UINT8 MaxIndex; MaxIndex = index_collapse2(index_tmp, numEntries); //=============================== // we have a solid color 4x4 block //=============================== if (MaxIndex == 0) { return cmp_quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits, type, channels3or4, blockMode); } for (CGU_INT ii = 0; ii < maxTry; ii++) { //=============================== // We have ramp colors to process //=============================== CGV_FLOAT err_cluster = CMP_FLOAT_MAX; CGV_FLOAT err_shake; CGU_UINT32 index_cluster[16]; CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; for (CGU_UINT8 ii2 = 0; ii2 < numEntries; ii2++) index_cluster[ii2] = 0; CGU_UINT8 mi = Mi_; for (CGU_UINT8 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++) { CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}}; for (CGU_UINT8 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++) { //------------------------------------- // set a new index data to try //------------------------------------- for (k = 0; k < numEntries; k++) index_cluster[k] = index_tmp[k] * index_slope + index_offset; if (cmp_get_ideal_cluster(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4)) { CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; err_shake = cmp_shake3( epo_code_shake, image_cluster, index_cluster, image_src, index_bits, type, max_bits, use_par, numEntries, channels3or4); if (err_shake < err_cluster) { err_cluster = err_shake; epo_code_best[0] = epo_code_shake[0]; epo_code_best[1] = epo_code_shake[1]; } } } } if ((err_cluster != CMP_FLOAT_MAX)) { //========================= // test results for quality //========================= CGU_UINT8 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; err_requant = cmp_requantized_index(index_best, epo_code_best, index_bits, max_bits, image_src, numEntries, channels3or4); if (err_requant < err_best) { //better = 1; for (k = 0; k < numEntries; k++) index_io[k] = index_tmp[k] = index_best[k]; cmp_pack4bitindex32(index_packed_out, index_io); epo_code_out[0] = epo_code_best[0]; epo_code_out[1] = epo_code_best[1]; err_best = err_requant; } } // Early out if we have our target err if (err_best <= errorThreshold) break; MaxIndex = index_collapse2(index_tmp, numEntries); if (MaxIndex == 0) break; } return err_best; } CGU_UINT8 cmp_Write8Bit2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal) { base[offset / 8] |= bitVal << (offset % 8); if (offset % 8 + bits > 8) { base[offset / 8 + 1] |= shift_right_uint82(bitVal, 8 - offset % 8); } return (offset += bits); } INLINE CGU_UINT8 shift_right_uint8V2(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits) { return v >> bits; // (perf warning expected) } void cmp_Write8BitV2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal) { base[offset / 8] |= bitVal << (offset % 8); if (offset % 8 + bits > 8) { base[offset / 8 + 1] |= shift_right_uint8V2(bitVal, 8 - offset % 8); } } void cmp_Encode_mode01237(CMP_IN CGU_INT blockMode, CMP_IN CGU_UINT8 bestPartition, CMP_IN CGU_UINT32 packedEndpoints[6], CMP_IN CGU_UINT8 index16[16], CMP_INOUT CGU_UINT8 cmp_out[16]) { CGU_UINT8 blockindex[SOURCE_BLOCK_SIZE]; CGU_UINT32 indexBitsV = g_modesettings[blockMode].indexBits; CGU_UINT32 k; CGU_UINT32 ch; for (k = 0; k 
< COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0; // mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ... CGU_INT bitPosition = blockMode; bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, 1); // Write partition bits bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].partitionBits, bestPartition); // Sort out the index set and tag whether we need to flip the // endpoints to get the correct state in the implicit index bits // The implicitly encoded MSB of the fixup index must be 0 CGU_UINT32 fixup[3] = {0, 0, 0}; cmp_get_fixuptable(fixup, (g_modesettings[blockMode].maxSubSets == 2 ? bestPartition : bestPartition + 64)); // Extract indices and mark subsets that need to have their colours flipped to get the // right state for the implicit MSB of the fixup index CGU_INT flipColours[3] = {0, 0, 0}; for (k = 0; k < SOURCE_BLOCK_SIZE; k++) { blockindex[k] = index16[k]; for (CGU_UINT8 j = 0; j < g_modesettings[blockMode].maxSubSets; j++) { if (k == fixup[j]) { if (blockindex[k] & (1 << (indexBitsV - 1))) { flipColours[j] = 1; } } } } // Now we must flip the endpoints where necessary so that the implicitly encoded // index bits have the correct state for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++) { if (flipColours[k] == 1) { CGU_UINT32 temp = packedEndpoints[k * 2 + 0]; packedEndpoints[k * 2 + 0] = packedEndpoints[k * 2 + 1]; packedEndpoints[k * 2 + 1] = temp; } } // ...next flip the indices where necessary for (k = 0; k < SOURCE_BLOCK_SIZE; k++) { CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k); if (flipColours[partsub] == 1) { blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k]; } } // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP) // i.e. 
// components are packed together
    CGU_Vec4ui unpackedColours[MAX_SUBSETS * 2];
    CGU_UINT8  parityBits[MAX_SUBSETS][2];

    // Init
    for (k = 0; k < MAX_SUBSETS * 2; k++)
        unpackedColours[k] = 0;

    // Unpack the colour values for the subsets
    for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
    {
        CGU_UINT32 packedColours[2] = {packedEndpoints[k * 2 + 0], packedEndpoints[k * 2 + 1]};
        if (blockMode == 0 || blockMode == 3 || blockMode == 7)
        {  // TWO_PBIT
            parityBits[k][0] = packedColours[0] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 1)
        {  // ONE_PBIT
            parityBits[k][0] = packedColours[1] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 2)
        {
            parityBits[k][0] = 0;
            parityBits[k][1] = 0;
        }
        for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
        {
            unpackedColours[k * 2][ch]     = packedColours[0] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            unpackedColours[k * 2 + 1][ch] = packedColours[1] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            packedColours[0] >>= g_modesettings[blockMode].componentBits;
            packedColours[1] >>= g_modesettings[blockMode].componentBits;
        }
    }

    // Loop over component
    for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
    {
        // loop over subsets
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2][ch] & 0xFF);
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2 + 1][ch] & 0xFF);
        }
    }

    // write parity bits
    if (blockMode != 2)
    {
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            if (blockMode == 1)
            {  // ONE_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
            }
            else
            {  // TWO_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][1] & 0x01);
            }
        }
    }

    // Encode the index bits
    CGU_INT bitPositionV = bitPosition;
    for (k = 0; k < 16; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k);
        // If this is a fixup index then drop the MSB which is implicitly 0
        if (k == fixup[partsub])
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits - 1, blockindex[k] & 0x07F);
            bitPositionV += g_modesettings[blockMode].indexBits - 1;
        }
        else
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits, blockindex[k]);
            bitPositionV += g_modesettings[blockMode].indexBits;
        }
    }
}

CGV_FLOAT cmp_process_mode(CMP_INOUT CGU_UINT32 best_cmp_out[5], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT block_mode)
{
#ifdef USE_OLDCODE
    CGV_FLOAT  best_err = 1e30f;
    CGU_Vec4ui epo_code[6];
    CGU_Vec4ui bestEndpoints[6];
    CGU_UINT8  bestindex[3][16];
    CGU_INT    bestEntryCount[3];
    CGU_UINT8  bestindex16[16];
    CGU_UINT32 packedEndpoints[6] = {0, 0, 0, 0, 0, 0};
    CGU_UINT32 k;
    CGU_UINT32 ch;
    CGU_UINT32 subset;

    // Check for a solid color for a fast encode
    CGV_Vec4ui mean_out = 0.0f;
    for (k = 0; k < 16; k++)
    {
        mean_out       = mean_out + image_src[k];
        bestindex16[k] = 0;
    }
    mean_out = mean_out / 16;

    // Image has alpha
    if (mean_out.w < 255)
    {
    }

    CGU_UINT8  storedBestindex[64][3][16];
    CGV_FLOAT  storedError[64];
    CGU_UINT8  sortedPartition[64];
    CGV_FLOAT  quality    = 1.0f;
    CGV_FLOAT  opaque_err = 0.0f;
    CGV_Vec4ui image_subsets[3][16];
    CGU_INT    subset_entryCount[MAX_SUBSETS] = {0, 0, 0};
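    //====================================================================================
    // Partition search strategy (summary of the code that follows, for reference):
    // a fast quantizer error is computed for all 64 partition shapes, the shapes are
    // sorted by that error, and only the best numShakeAttempts shapes receive the
    // expensive index/endpoint optimization. Worked example: with the fixed
    // quality = 1.0f used here, numShakeAttempts = clamp(floor(8 * 1.0 + 0.5), 1, 64)
    // = 8 of the 64 shapes get the full shake pass.
    //====================================================================================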
    CGU_UINT8 bestPartition = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, mode_blockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);
        CGV_Vec4ui subset_image_src[16];
        CGU_UINT8  index_out1[16];
        CGV_FLOAT  err_quant = 0.0F;
        // Store the quantize error for this partition to be sorted and processed later
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT numEntries = subset_entryCount[subset];
            for (CGU_UINT8 ii = 0; ii < 16; ii++)
                subset_image_src[ii] = image_subsets[subset][ii];
            err_quant += cmp_GetQuantizeIndex_old(
                index_out1, subset_image_src, numEntries, g_modesettings[block_mode].clusters, g_modesettings[block_mode].channels3or4);
            for (CGU_UINT8 idx = 0; idx < numEntries; idx++)
                storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
        }
        storedError[mode_blockPartition] = err_quant;
    }

    // Sort the results
    cmp_sortPartitionProjection(storedError, sortedPartition, 64);  // 64 partitions

    CGU_UINT8 numShakeAttempts = cmp_max8(1, cmp_min8((CGU_UINT8)floor(8 * quality + 0.5), 64));  // 64 partitions
    CGV_FLOAT err_best         = CMP_FLOAT_MAX;

    // Now do the endpoint shaking
    for (CGU_UINT8 nSA = 0; nSA < numShakeAttempts; nSA++)
    {
        CGV_FLOAT err_optimized = 0.0F;
        CGU_UINT8 sortedBlockPartition;
        sortedBlockPartition = sortedPartition[nSA];

        //********************************************
        // Get the partition shape for the given mode
        //********************************************
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, sortedBlockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);

        //*****************************
        // Process the partition shape
        //*****************************
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT    numEntries = subset_entryCount[subset];
            CGU_UINT32 index_io[16];
            CGV_Vec4ui src_image_block[16];
            CGU_Vec4ui tmp_epo_code[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            for (k = 0; k < 16; k++)
                src_image_block[k] = image_subsets[subset][k];
            for (k = 0; k < 16; k++)
                index_io[k] = storedBestindex[sortedBlockPartition][subset][k];
            CGU_UINT32 index_packed_out[2] = {0, 0};
            err_optimized += cmp_optimize_IndexAndEndPoints(tmp_epo_code,
                                                            index_io,
                                                            index_packed_out,
                                                            src_image_block,
                                                            numEntries,
                                                            g_modesettings[block_mode].clusters,
                                                            g_modesettings[block_mode].bits,
                                                            g_modesettings[block_mode].channels3or4,
                                                            0.01f,
                                                            1);
            for (k = 0; k < 16; k++)
                storedBestindex[sortedBlockPartition][subset][k] = index_io[k];
            epo_code[subset * 2]     = tmp_epo_code[0];
            epo_code[subset * 2 + 1] = tmp_epo_code[1];
            shared_temp[subset * 2].endPoint_low      = tmp_epo_code[0];
            shared_temp[subset * 2 + 1].endPoint_high = tmp_epo_code[1];
        }

        //****************************************
        // Check if result is better than the last
        //****************************************
        if (err_optimized < err_best)
        {
            bestPartition          = sortedBlockPartition;
            CGU_INT bestIndexCount = 0;
            for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
            {
                CGU_UINT32 numEntries  = subset_entryCount[subset];
                bestEntryCount[subset] = numEntries;
                if (numEntries)
                {
                    bestEndpoints[subset * 2]     = epo_code[subset * 2];
                    bestEndpoints[subset * 2 + 1] = epo_code[subset * 2 + 1];
                    shared_temp[subset * 2].endPoint_low      = bestEndpoints[subset * 2];
                    shared_temp[subset * 2 + 1].endPoint_high = bestEndpoints[subset * 2 + 1];
                    for (k = 0; k < numEntries; k++)
                    {
                        bestindex[subset][k]          = storedBestindex[sortedBlockPartition][subset][k];
                        bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
                        shared_temp[k].colorindex     = storedBestindex[sortedBlockPartition][subset][k];
                    }
                }
            }
            err_best = err_optimized;
            // Early out if we found we can compress with error below the quality threshold
            if (err_best <= 0.01f)  // threshold err
            {
                break;
            }
        }
    }

    if (block_mode != 7)
        err_best += opaque_err;

    if (err_best > best_err)
        return best_err;

    //**************************
    // Save the encoded block
    //**************************
    best_err = err_best;

    // Now we have all the data needed to encode the block
    // We need to pack the endpoints prior to encoding
    for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
    {
        packedEndpoints[subset * 2]     = 0;
        packedEndpoints[subset * 2 + 1] = 0;
        if (bestEntryCount[subset])
        {
            CGU_UINT32 rightAlignment = 0;
            // Sort out parity bits
            if (block_mode != 2)
            {
                // Sort out BCC parity bits
                packedEndpoints[subset * 2]     = bestEndpoints[subset * 2][0] & 1;
                packedEndpoints[subset * 2 + 1] = bestEndpoints[subset * 2 + 1][0] & 1;
                for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
                {
                    bestEndpoints[subset * 2][ch] >>= 1;
                    bestEndpoints[subset * 2 + 1][ch] >>= 1;
                }
                rightAlignment++;
            }
            // Fixup endpoints
            for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
            {
                packedEndpoints[subset * 2] |= bestEndpoints[subset * 2][ch] << rightAlignment;
                packedEndpoints[subset * 2 + 1] |= bestEndpoints[subset * 2 + 1][ch] << rightAlignment;
                rightAlignment += g_modesettings[block_mode].componentBits;
            }
        }
    }

    CGU_UINT8 idxCount[3] = {0, 0, 0};
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[block_mode].maxSubSets, k);
        CGU_UINT8 idxC    = idxCount[partsub];
        bestindex16[k]    = bestindex[partsub][idxC];
        idxCount[partsub] = idxC + 1;
        shared_temp[k].colorindex = bestindex16[k];
    }

    CGU_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE];
    cmp_Encode_mode01237(block_mode, bestPartition, packedEndpoints, bestindex16, cmp_out);
    best_cmp_out[0] = (CGU_UINT32)cmp_out[0] + (CGU_UINT32)(cmp_out[1] << 8) + (CGU_UINT32)(cmp_out[2] << 16) + (CGU_UINT32)(cmp_out[3] << 24);
    best_cmp_out[1] = (CGU_UINT32)cmp_out[4] + (CGU_UINT32)(cmp_out[5] << 8) + (CGU_UINT32)(cmp_out[6] << 16) + (CGU_UINT32)(cmp_out[7] << 24);
    best_cmp_out[2] = (CGU_UINT32)cmp_out[8] + (CGU_UINT32)(cmp_out[9] << 8) + (CGU_UINT32)(cmp_out[10] << 16) + (CGU_UINT32)(cmp_out[11] << 24);
    best_cmp_out[3] = (CGU_UINT32)cmp_out[12] + (CGU_UINT32)(cmp_out[13] << 8) + (CGU_UINT32)(cmp_out[14] << 16) + (CGU_UINT32)(cmp_out[15] << 24);

    //CGU_Vec4ui block = {0, 0, 0, 0};
    //block_package1(block, bestPartition, 0);
    //best_cmp_out[0] = block[0];
    //best_cmp_out[1] = block[1];
    //best_cmp_out[2] = block[2];
    //best_cmp_out[3] = block[3];
    //
    //printSharedTemp();

    return best_err;
#else
    CGU_UINT8 bestPartition = 0;

    // Find the best partition
    CGU_UINT32 pbit = 0;
    CGU_UINT32 error;
    CGU_UINT32 bestErr  = MAX_UINT;
    CGU_UINT32 bestpbit = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        error = cmp_GetPartitionError(pbit, mode_blockPartition, image_src);
        if (error < bestErr)
        {
            bestErr       = error;
            bestpbit      = pbit;
            bestPartition = mode_blockPartition;
        }
    }

    // Get the index for the partition
    for (CGU_INT threadInBlock = 15; threadInBlock >= 0; threadInBlock--)
    {
        ProcessBlock(1, bestPartition, 0, bestpbit, 0, threadInBlock, threadInBlock, 0);
    }

    // print results for debug
    printSharedTemp();

    //=======================
    // Encode final block
//======================== { // CGU_Vec4ui blockGreen = {0xffe00040, 0xfffe0007, 0x00000001, 0x00000000}; // CGU_Vec4ui blockBlue = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000}; // CGU_Vec4ui block00 = {0xf0617fc0, 0xfffe0c3f, 0xff00fe11, 0xff01ef00}; CGU_Vec4ui blockRed = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000}; CGU_Vec4ui block = {0, 0, 0, 0}; CGU_UINT32 input_mode = 1; switch (input_mode) { case 1: block_package1(block, bestPartition, 0); break; case 3: block_package3(block, bestPartition, 0); break; case 7: block_package7(block, bestPartition, 0); break; default: // error unsupported mode used! block = blockRed; break; } best_cmp_out[0] = block[0]; best_cmp_out[1] = block[1]; best_cmp_out[2] = block[2]; best_cmp_out[3] = block[3]; } return 0.0f; #endif } #endif // Not ASPM_HLSL //======================================= MODES 45 ============================================= #ifndef ASPM_HLSL #if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5) // Compression Results struct cmp_mode_parameters2 { CGV_INT color_qendpoint[8]; CGV_INT alpha_qendpoint[8]; CGV_UINT8 color_index[16]; CGV_UINT8 alpha_index[16]; CGV_UINT32 idxMode; CGV_UINT32 rotated_channel; }; CMP_STATIC CMP_CONSTANT CGU_UINT8 componentRotations2[4][4] = { { COMP_ALPHA, COMP_RED, COMP_GREEN, COMP_BLUE }, { COMP_RED, COMP_ALPHA, COMP_GREEN, COMP_BLUE }, { COMP_GREEN, COMP_RED, COMP_ALPHA, COMP_BLUE }, { COMP_BLUE, COMP_RED, COMP_GREEN, COMP_ALPHA } }; INLINE CGV_UINT8 old_shift_right_uint(CGV_UINT8 v, CGU_UINT8 bits) { return v >> bits; // (perf warning expected) } void old_Write8Bit(CGV_UINT8 base[], CGU_INT* uniform offset, CGU_INT bits, CGV_UINT8 bitVal) { base[*offset / 8] |= bitVal << (*offset % 8); if (*offset % 8 + bits > 8) { base[*offset / 8 + 1] |= old_shift_right_uint(bitVal, 8 - *offset % 8); } *offset += bits; } INLINE void old_swap_index(CGV_UINT8 u[], CGV_UINT8 v[], CGU_INT n) { for (CGU_INT i = 0; i < n; i++) { CGV_UINT8 t = u[i]; u[i] = v[i]; v[i] = t; } } INLINE void old_swap_epo(CGV_INT u[], CGV_INT v[], CGV_INT n) { for (CGU_INT i = 0; i < n; i++) { CGV_INT t = u[i]; u[i] = v[i]; v[i] = t; } } INLINE void old_encode_swap(CGV_INT endpoint[], CGU_INT channels, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits) { CGU_INT levels = 1 << bits; if (block_index[0] >= levels / 2) { old_swap_epo(&endpoint[0], &endpoint[channels], channels); for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++) #ifdef ASPM_GPU block_index[k] = (levels - 1) - block_index[k]; #else block_index[k] = CGV_UINT8(levels - 1) - block_index[k]; #endif } } void old_encode_index(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits) { old_Write8Bit(data, pPos, bits - 1, block_index[0]); for (CGU_INT j = 1; j < SOURCE_BLOCK_SIZE; j++) { CGV_UINT8 qbits = block_index[j] & 0xFF; old_Write8Bit(data, pPos, bits, qbits); } } void cmp_Encode_mode4(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params) { CGU_INT bitPosition = 4; // Position the pointer at the LSB for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0; // mode 4 (5 bits) 00001 old_Write8Bit(cmp_out, &bitPosition, 1, 1); // rotation 2 bits old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel)); // idxMode 1 bit old_Write8Bit(cmp_out, &bitPosition, 1, CMP_STATIC_CAST(CGV_UINT8, params.idxMode)); CGU_INT idxBits[2] = {2, 3}; if (params.idxMode) { idxBits[0] = 3; idxBits[1] = 2; // Indicate if we need to fixup the index old_swap_index(params.color_index, 
params.alpha_index, 16);
        old_encode_swap(params.alpha_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.color_qendpoint, 4, params.alpha_index, 3);
    }
    else
    {
        old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 3);
    }

    // color endpoints 5 bits each
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }

    // alpha endpoints (6 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // index 2 bits each (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    // index 3 bits each (47 bits total)
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 3);
}

void cmp_Encode_mode5(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;

    // mode 5 bits = 000001
    CGU_INT bitPosition = 5;  // Position the pointer at the LSB
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);

    // Write 2 bit rotation
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));

    old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
    old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 2);

    // color endpoints (7 bits each)
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }

    // alpha endpoints (8 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // color index 2 bits each (31 bits total)
    // alpha index 2 bits each (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 2);
}

void Compress_mode45(CMP_INOUT CGU_UINT32 cmp_out[4], CGU_INT blockMode, CGU_Vec4ui image_src[SOURCE_BLOCK_SIZE])
{
    cmp_mode_parameters2 best_candidate;
    CGU_UINT32 channels3or4 = 4;
    CGU_UINT8  numClusters0[2];
    CGU_UINT8  numClusters1[2];
    CGU_INT    modeBits[2];
    CGU_INT    max_idxMode;
    if (blockMode == 4)
    {
        max_idxMode     = 2;
        modeBits[0]     = 30;  // bits = 2 * (Red 5 + Grn 5 + Blu 5)
        modeBits[1]     = 36;  // bits = 2 * (Alpha 6+6+6)
        numClusters0[0] = 4;
        numClusters0[1] = 8;
        numClusters1[0] = 8;
        numClusters1[1] = 4;
    }
    else
    {
        max_idxMode     = 1;
        modeBits[0]     = 42;  // bits = 2 * (Red 7 + Grn 7 + Blu 7)
        modeBits[1]     = 48;  // bits = 2 * (Alpha 8+8+8) = 48
        numClusters0[0] = 4;
        numClusters0[1] = 4;
        numClusters1[0] = 4;
        numClusters1[1] = 4;
    }

    CGU_Vec4ui src_color_Block[SOURCE_BLOCK_SIZE];
    CGU_Vec4ui src_alpha_Block[SOURCE_BLOCK_SIZE];
    CGV_FLOAT  best_err = CMP_FLOAT_MAX;

    // Go through each possible rotation and selection of index bits
    for (CGU_UINT8 rotated_channel = 0; rotated_channel < channels3or4; rotated_channel++)
    {  // A
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            for (CGU_INT p = 0; p < 3; p++)
            {
                src_color_Block[k][p] =
image_src[k][componentRotations2[rotated_channel][p+1]]; src_alpha_Block[k][p] = image_src[k][componentRotations2[rotated_channel][0]]; } src_color_Block[k][3] = image_src[k][3]; src_alpha_Block[k][3] = image_src[k][componentRotations2[3][3]]; } CGV_FLOAT err_quantizer; CGV_FLOAT err_bestQuantizer = CMP_FLOAT_MAX; for (CGU_INT idxMode = 0; idxMode < max_idxMode; idxMode++) { err_quantizer = cmp_GetQuantizeIndex_old(best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, numClusters0[idxMode], 3); err_quantizer += cmp_GetQuantizeIndex_old(best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, numClusters1[idxMode], 3) / 3.0F; // If quality is high then run the full shaking for this config and // store the result if it beats the best overall error // Otherwise only run the shaking if the error is better than the best // quantizer error if (err_quantizer <= err_bestQuantizer) { err_bestQuantizer = err_quantizer; // Shake size gives the size of the shake cube CGV_FLOAT err_overallError; CGU_Vec4ui color_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; CGV_Vec4ui src_image_block[16]; CGU_Vec4ui alpha_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}}; CGU_UINT32 alpha_index[16]; CGU_UINT32 color_index[16]; for (int k = 0; k < 16; k++) { alpha_index[k] = best_candidate.alpha_index[k]; color_index[k] = best_candidate.color_index[k]; } CGU_UINT32 color_index_packed_out[2] = {0, 0}; CGU_UINT32 alpha_index_packed_out[2] = {0, 0}; err_overallError = cmp_optimize_IndexAndEndPoints(color_qendpoint2, color_index, color_index_packed_out, src_color_Block, 16, numClusters0[idxMode], modeBits[0], 3, 0.01f, blockMode); // Alpha scalar block err_overallError += cmp_optimize_IndexAndEndPoints(alpha_qendpoint2, alpha_index, alpha_index_packed_out, src_alpha_Block, 16, numClusters1[idxMode], modeBits[1], 3, 0.01f, blockMode) / 3; // If we beat the previous best then encode the block if (err_overallError < best_err) { best_err = err_overallError; best_candidate.idxMode = idxMode; best_candidate.rotated_channel = rotated_channel; best_candidate.alpha_qendpoint[0] = alpha_qendpoint2[0].x; best_candidate.alpha_qendpoint[1] = alpha_qendpoint2[0].y; best_candidate.alpha_qendpoint[2] = alpha_qendpoint2[0].z; best_candidate.alpha_qendpoint[3] = alpha_qendpoint2[0].w; best_candidate.alpha_qendpoint[4] = alpha_qendpoint2[1].x; best_candidate.alpha_qendpoint[5] = alpha_qendpoint2[1].y; best_candidate.alpha_qendpoint[6] = alpha_qendpoint2[1].z; best_candidate.alpha_qendpoint[7] = alpha_qendpoint2[1].w; best_candidate.color_qendpoint[0] = color_qendpoint2[0].x; best_candidate.color_qendpoint[1] = color_qendpoint2[0].y; best_candidate.color_qendpoint[2] = color_qendpoint2[0].z; best_candidate.color_qendpoint[3] = color_qendpoint2[0].w; best_candidate.color_qendpoint[4] = color_qendpoint2[1].x; best_candidate.color_qendpoint[5] = color_qendpoint2[1].y; best_candidate.color_qendpoint[6] = color_qendpoint2[1].z; best_candidate.color_qendpoint[7] = color_qendpoint2[1].w; for (int k = 0; k < 16; k++) { best_candidate.color_index[k] = color_index[k]; best_candidate.alpha_index[k] = alpha_index[k]; } CGV_UINT8 cmp_out16[COMPRESSED_BLOCK_SIZE]; if (blockMode == 4) cmp_Encode_mode4(cmp_out16, best_candidate); else cmp_Encode_mode5(cmp_out16, best_candidate); cmp_out[0] = (CGU_UINT32)cmp_out16[0] + (CGU_UINT32)(cmp_out16[1] << 8) + (CGU_UINT32)(cmp_out16[2] << 16) + (CGU_UINT32)(cmp_out16[3] << 24); cmp_out[1] = (CGU_UINT32)cmp_out16[4] + (CGU_UINT32)(cmp_out16[5] << 8) + (CGU_UINT32)(cmp_out16[6] << 16) + 
                                 (CGU_UINT32)(cmp_out16[7] << 24);
                    cmp_out[2] = (CGU_UINT32)cmp_out16[8] + (CGU_UINT32)(cmp_out16[9] << 8) + (CGU_UINT32)(cmp_out16[10] << 16) +
                                 (CGU_UINT32)(cmp_out16[11] << 24);
                    cmp_out[3] = (CGU_UINT32)cmp_out16[12] + (CGU_UINT32)(cmp_out16[13] << 8) + (CGU_UINT32)(cmp_out16[14] << 16) +
                                 (CGU_UINT32)(cmp_out16[15] << 24);
                }
            }
        }  // B
    }  // A
}
#endif
#endif

#ifdef ENABLE_CMP_REFINE_MODE6_API
CGU_BOOL get_ideal_cluster2(CMP_INOUT CGV_Vec4f image_cluster[2],
                            CMP_IN CGU_UINT32 index_cluster[16],
                            CMP_IN CGU_INT Mi_,
                            CMP_IN CGU_Vec4ui image_src[16],
                            CMP_IN CGU_UINT32 numEntries,
                            CMP_IN CGU_UINT32 channels3or4)
{
    // get ideal cluster centers
    CGV_Vec4f image_cluster_mean[16];
    for (CGU_UINT32 ii = 0; ii < 16; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }
    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded

    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix / inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix / inverse matrix
    CGV_Vec4f image_rp[2];                // right part for RMS fit problem
    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;

    // weight with cnt if running on compacted index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];
        image_rp[0] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)(Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)index_cluster[k];
    }

    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];
    // assert(matrix_dd !=0);
    // matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) are collinear, which implies only one active index;
    // taken care of separately
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }

    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;
    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;

    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;
    // values can exceed 255 here, clamp made no diff in quality!
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);
    return TRUE;
}
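//====================================================================================
// Note on shake2 below (explanatory sketch, not part of the encoder): this is the
// mode 6 specialization of the endpoint shaker above. Mode 6 carries one p-bit per
// endpoint, so its caller passes use_par = 1: candidate endpoint codes step by
// (1 << use_par) = 2, and the four parity combinations (even/even, even/odd,
// odd/even, odd/odd) are scored per channel in err_ed[ppA][ppB][ch], then combined
// through the par_vectors42_nd table to select the best legal p-bit assignment for
// all channels at once.
//====================================================================================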
CGV_FLOAT shake2(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                 CMP_IN CGV_Vec4f image_cluster[2],
                 CMP_IN CGU_UINT32 index_cluster[16],
                 CMP_IN CGU_Vec4ui image_src[16],
                 CMP_IN CGU_UINT32 index_bits,
                 CMP_IN CGU_UINT32 mtype,
                 CMP_IN CGU_UINT32 max_bits[4],
                 CMP_IN CGU_UINT32 use_par,
                 CMP_IN CGU_UINT32 numEntries,  // max 16
                 CMP_IN CGU_UINT32 channels3or4)
{
    CMP_UNUSED(mtype);
    CGV_FLOAT best_err = CMP_FLOAT_MAX;

#define SHAKESIZE1 1
#define SHAKESIZE2 2
    // shake single                                   - cartesian
    // shake odd/odd and even/even                    - same parity
    // shake odd/odd, odd/even, even/odd and even/even - bcc
    CGV_FLOAT  err_ed[2][2][4];
    CGU_UINT32 epo_code_par[2][2][2][4];

    for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT32 ppA = 0;
        CGU_UINT32 ppB = 0;
        CGU_UINT32 rr  = (use_par ? 2 : 1);
        CGU_UINT32 epo_code_epi0[2];  // first/second, coord, begin range / end range
        CGU_UINT32 epo_code_epi1[2];  // first/second, coord, begin range / end range
        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max = 2
            for (ppB = 0; ppB < rr; ppB++)
            {  // loop max = 2
                // set default ranges
                switch (ch)
                {
                case 0:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].x, max_bits[0], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].x, max_bits[0], use_par, ppB);
                    break;
                case 1:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].y, max_bits[1], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].y, max_bits[1], use_par, ppB);
                    break;
                case 2:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].z, max_bits[2], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].z, max_bits[2], use_par, ppB);
                    break;
                case 3:
                    if (channels3or4 == 4)
                    {
                        epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].w, max_bits[3], use_par, ppA);
                        epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].w, max_bits[3], use_par, ppB);
                    }
                    break;
                }

                // set begin range
                epo_code_epi0[0] -= ((epo_code_epi0[0] < SHAKESIZE1 ? epo_code_epi0[0] : SHAKESIZE1)) & (~use_par);
                epo_code_epi1[0] -= ((epo_code_epi1[0] < SHAKESIZE1 ? epo_code_epi1[0] : SHAKESIZE1)) & (~use_par);

                // set end range
                epo_code_epi0[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi0[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi0[1] : SHAKESIZE2) & (~use_par);
                epo_code_epi1[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi1[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi1[1] : SHAKESIZE2) & (~use_par);

                CGU_UINT32 step = (1 << use_par);
                err_ed[ppA][ppB][ch] = CMP_FLOAT_MAX;
                for (CGU_UINT32 epo_p0 = epo_code_epi0[0]; epo_p0 <= epo_code_epi0[1]; epo_p0 += step)
                {
                    for (CGU_UINT32 epo_p1 = epo_code_epi1[0]; epo_p1 <= epo_code_epi1[1]; epo_p1 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGV_FLOAT image_ramp;
                        for (CGU_UINT32 _mc = 1; _mc < numEntries; _mc++)
                        {
                            image_ramp = GetRamp2(epo_p0, epo_p1, index_cluster[_mc], index_bits);
                            switch (ch)
                            {
                            case 0:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].x);
                                break;
                            case 1:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].y);
                                break;
                            case 2:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].z);
                                break;
                            case 3:
                                if (channels3or4 == 4)
                                    image_square_diff += cmp_squaref(image_ramp - image_src[_mc].w);
                                break;
                            }
                        }
                        if (image_square_diff < err_ed[ppA][ppB][ch])
                        {
                            err_ed[ppA][ppB][ch]          = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch] = epo_p0;
                            epo_code_par[ppA][ppB][1][ch] = epo_p1;
                        }
                    }
                }
            }  // pp1
        }  // pp0
    }  // j

    //---------------------------------------------------------
    // CMP_CONSTANT CGU_UINT8 npv_nd[2][8] = {
    //     {1, 2, 4, 8, 16, 32, 0, 0},  // 3 channel
    //     {1, 2, 4, 0, 0, 0, 0, 0}     // 4 channel type index 0..7
    // };
    // for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type]; pn++)
    CGU_UINT32 bits = 4;  // for mode 6 its 4
    for (CGU_UINT32 pn = 0; pn < bits; pn++)
    {
        CGV_FLOAT  err_2 = 0.0F;
        CGU_UINT32 d1    = 0;
        CGU_UINT32 d2    = 0;
        for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
        {
            d1 = par_vectors42_nd[pn][0][ch];
            d2 = par_vectors42_nd[pn][1][ch];
            err_2 += err_ed[d1][d2][ch];
        }
        if (err_2 < best_err)
        {
            best_err = err_2;
            d1 = par_vectors42_nd[pn][0][0];
            d2 = par_vectors42_nd[pn][1][0];
            epo_code_shake[0].x = epo_code_par[d1][d2][0][0];
            epo_code_shake[1].x = epo_code_par[d1][d2][1][0];
            d1 =
par_vectors42_nd[pn][0][1]; d2 = par_vectors42_nd[pn][1][1]; epo_code_shake[0].y = epo_code_par[d1][d2][0][1]; epo_code_shake[1].y = epo_code_par[d1][d2][1][1]; d1 = par_vectors42_nd[pn][0][2]; d2 = par_vectors42_nd[pn][1][2]; epo_code_shake[0].z = epo_code_par[d1][d2][0][2]; epo_code_shake[1].z = epo_code_par[d1][d2][1][2]; if (channels3or4 == 4) { d1 = par_vectors42_nd[pn][0][3]; d2 = par_vectors42_nd[pn][1][3]; epo_code_shake[0].w = epo_code_par[d1][d2][0][3]; epo_code_shake[1].w = epo_code_par[d1][d2][1][3]; } } } return best_err; } CGV_FLOAT requantized_image_err2(CMP_INOUT CGU_UINT32 index_best[16], CMP_IN CGU_Vec4ui epo_code_best[2], CMP_IN CGU_UINT32 index_bits, CMP_IN CGU_UINT32 max_bits[4], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_UINT32 numEntries, // max 16 CMP_IN CGU_UINT32 channels3or4) { // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) CMP_UNUSED(channels3or4); CMP_UNUSED(max_bits); //========================================= // requantized image based on new epo_code //========================================= CGV_Vec4f image_requantize[16]; CGV_FLOAT err_requant = 0.0F; for (CGU_UINT32 k = 0; k < numEntries; k++) { image_requantize[k].x = GetRamp2(epo_code_best[0].x, epo_code_best[1].x, k, index_bits); image_requantize[k].y = GetRamp2(epo_code_best[0].y, epo_code_best[1].y, k, index_bits); image_requantize[k].z = GetRamp2(epo_code_best[0].z, epo_code_best[1].z, k, index_bits); image_requantize[k].w = GetRamp2(epo_code_best[0].w, epo_code_best[1].w, k, index_bits); } //========================================= // Calc the error for the requantized image //========================================= CGV_FLOAT err_cmin; CGU_UINT32 best_indx; CGV_FLOAT image_err; CGV_Vec4f imageDiff; for (CGU_UINT32 k1 = 0; k1 < numEntries; k1++) { // start with error as sum of 4 channels with Max pixel // value 256 squared plus 1 for err min check = (256 * 256 * 4) + 1; err_cmin = 262145.0f; best_indx = 0; for (CGU_UINT8 k2 = 0; k2 < numEntries; k2++) { image_err = 0.0F; imageDiff.x = image_requantize[k2].x - image_src[k1].x; imageDiff.y = image_requantize[k2].y - image_src[k1].y; imageDiff.z = image_requantize[k2].z - image_src[k1].z; imageDiff.w = image_requantize[k2].w - image_src[k1].w; image_err = cmp_dot4f(imageDiff, imageDiff); if (image_err < err_cmin) { err_cmin = image_err; best_indx = k2; } } index_best[k1] = best_indx; err_requant += err_cmin; } return err_requant; } CGV_FLOAT cmp_mode6_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2], // CMP_INOUT CGU_UINT32 index_io[16], // Make sure input index is 0..15 range CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_UINT32 numEntries, // max 16 CMP_IN CGU_UINT32 Mi_, // last cluster , This should be no larger than 16 CMP_IN CGU_UINT32 bits, // total for all components CMP_IN CGU_UINT32 channels3or4, // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) CMP_IN CGU_FLOAT errorThreshold) { CMP_UNUSED(bits); CGV_FLOAT err_best = CMP_FLOAT_MAX; CGU_UINT32 type = 2; // = bits % (2 * channels3or4) for Mode 6 with 58 bits and 4 channels type is 2 CGU_UINT32 use_par = 1; // as type == 2 use par is 1 = (type != 0); CGU_UINT32 max_bits[4] = {8, 8, 8, 8}; // Mode 6 max bits is 8 = (bits + channels2 - 1) / channels2; CGU_UINT32 index_bits = 4; // channel bits !! 
// = 4
    // CGU_INT iv;
    // iv = Mi_;
    // while (iv >>= 1)
    //     index_bits++;

    Mi_ = Mi_ - 1;
    CGU_UINT32 index_tmp[16];
    CGU_UINT32 maxTry      = MAX_TRY_SHAKER;  // should be set by quality
    CGV_FLOAT  err_requant = 0.0F;

    // Init best index to input index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
        index_tmp[k] = index_io[k];

    CGU_UINT32 MaxIndex;
    MaxIndex = index_collapse2(index_tmp, numEntries);

    // we have a solid color 4x4 block, no need for optimization!
    if (MaxIndex == 0)
        return 0.0f;

    for (CGU_UINT32 ii = 0; ii < maxTry; ii++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  err_cluster = CMP_FLOAT_MAX;
        CGV_FLOAT  err_shake;
        CGU_UINT32 index_cluster[16];
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        for (CGU_UINT32 ii2 = 0; ii2 < numEntries; ii2++)
            index_cluster[ii2] = 0;

        CGU_UINT32 mi = Mi_;
        for (CGU_UINT32 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};
            for (CGU_UINT32 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++)
            {
                //-------------------------------------
                // set a new index data to try
                //-------------------------------------
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_cluster[k] = index_tmp[k] * index_slope + index_offset;
                if (get_ideal_cluster2(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                    err_shake = shake2(epo_code_shake,  // return new epo
                                       image_cluster,
                                       index_cluster,
                                       image_src,
                                       index_bits,
                                       type,
                                       max_bits,
                                       use_par,
                                       numEntries,  // max 16
                                       channels3or4);
                    if (err_shake < err_cluster)
                    {
                        err_cluster      = err_shake;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }

        if ((err_cluster != CMP_FLOAT_MAX))
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT32 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            err_requant = requantized_image_err2(index_best,     // new index results
                                                 epo_code_best,  // prior result input
                                                 index_bits,
                                                 max_bits,
                                                 image_src,
                                                 numEntries,
                                                 channels3or4);
            if (err_requant < err_best)
            {
                //better = 1;
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_io[k] = index_tmp[k] = index_best[k];
                //cmp_pack4bitindex(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                err_best        = err_requant;
            }
        }

        // Early out if we have our target err
        if (err_best <= errorThreshold)
            break;

        MaxIndex = index_collapse2(index_tmp, numEntries);
        if (MaxIndex == 0)
            break;
    }

    // Did not find anything better over max tries
    return err_best;
}
#endif
#endif  // ENABLE_CMP_API : CPU & GPU Code block

//=================================================================================
// GPU API Interfaces
// mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0
//=================================================================================
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1)
void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32 BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup   = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID        = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase     = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock  = GI - threadBase;
    CGU_UINT32 block_y        = blockID /
g_num_block_x; CGU_UINT32 block_x = blockID - block_y * g_num_block_x; CGU_UINT32 base_x = block_x * BLOCK_SIZE_X; CGU_UINT32 base_y = block_y * BLOCK_SIZE_Y; #if (defined(ENABLE_MODE4) || defined(ENABLE_MODE5) || defined(ENABLE_MODE6)|| defined(ENABLE_CMP_MODE6)) if (threadInBlock < 16) { CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f; px = clamp(px, 0.0f, 255.0f); shared_temp[GI].pixel.r = (CGU_UINT32)px.r; shared_temp[GI].pixel.g = (CGU_UINT32)px.g; shared_temp[GI].pixel.b = (CGU_UINT32)px.b; shared_temp[GI].pixel.a = (CGU_UINT32)px.a; shared_temp[GI].endPoint_low = shared_temp[GI].pixel; shared_temp[GI].endPoint_high = shared_temp[GI].pixel; } GroupSync(); if (threadInBlock < 8) { shared_temp[GI].endPoint_low = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low); shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high); } GroupSync(); if (threadInBlock < 4) { shared_temp[GI].endPoint_low = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low); shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high); } GroupSync(); if (threadInBlock < 2) { shared_temp[GI].endPoint_low = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low); shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high); } GroupSync(); if (threadInBlock < 1) { shared_temp[GI].endPoint_low = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low); shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high); } GroupSync(); CGU_Vec4ui endPoint[2]; endPoint[0] = shared_temp[threadBase].endPoint_low; endPoint[1] = shared_temp[threadBase].endPoint_high; CGU_UINT32 error = 0xFFFFFFFF; CGU_UINT32 mode = 0; CGU_UINT32 index_selector = 0; CGU_UINT32 rotation = 0; CGU_Vec2ui indexPrec; if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit { if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6 { //2 represents 2bit index precision; 1 represents 3bit index precision index_selector = 0; indexPrec = CGU_Vec2ui( 2, 1 ); } else // thread 1, 3, 5, 7 { //2 represents 2bit index precision; 1 represents 3bit index precision index_selector = 1; indexPrec = CGU_Vec2ui( 1, 2 ); } } else { //2 represents 2bit index precision indexPrec = CGU_Vec2ui( 2, 2 ); } CGU_Vec4ui pixel_r; CGU_UINT32 color_index; CGU_UINT32 alpha_index; CGU_Vec4i span; CGU_Vec2i span_norm_sqr; CGU_Vec2i dotProduct; #if defined(ENABLE_MODE4) || defined(ENABLE_MODE5) if (threadInBlock < 12) // Try mode 4 5 in threads 0..11 { CGU_Vec4ui ep_quantized[2]; // mode 4 5 have component rotation if ((threadInBlock < 2) || (8 == threadInBlock)) // rotation = 0 in thread 0, 1 { rotation = 0; } else if ((threadInBlock < 4) || (9 == threadInBlock)) // rotation = 1 in thread 2, 3 { rotation = 1; set_pixel_rotation(endPoint[0],rotation); set_pixel_rotation(endPoint[1],rotation); } else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in thread 4, 5 { rotation = 2; set_pixel_rotation(endPoint[0],rotation); set_pixel_rotation(endPoint[1],rotation); } else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in thread 6, 7 { rotation = 3; set_pixel_rotation(endPoint[0],rotation); set_pixel_rotation(endPoint[1],rotation); } if (threadInBlock < 8) // try mode 4 in 
        // threads 0..7
        {
            // mode 4 thread distribution
            // Thread          0   1   2   3   4   5   6   7
            // Rotation        0   0   1   1   2   2   3   3
            // Index selector  0   1   0   1   0   1   0   1
            mode = 4;
            compress_endpoints4(endPoint, ep_quantized);
        }
        else  // try mode 5 in threads 8..11
        {
            // mode 5 thread distribution
            // Thread    8   9   10  11
            // Rotation  0   1   2   3
            mode = 5;
            compress_endpoints5(endPoint, ep_quantized);
        }

        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;
        set_pixel_rotation(pixel, rotation);

        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);

        // should be the same as above
        CGU_Vec3ui diff0 = pixel.rgb - endPoint[0].rgb;
        CGU_Vec3ui diff1 = pixel.rgb - endPoint[1].rgb;
        dotProduct = CGU_Vec2i(dot(diff0, diff0), dot(diff1, diff1));
        if (dotProduct.x > dotProduct.y)
        {
            span.rgb.x = -span.rgb.x;
            span.rgb.y = -span.rgb.y;
            span.rgb.z = -span.rgb.z;
            swap(endPoint[0].rgb, endPoint[1].rgb);
        }

        CGU_UINT32 diffa0 = pixel.a - endPoint[0].a;
        CGU_UINT32 diffa1 = pixel.a - endPoint[1].a;
        dotProduct = CGU_Vec2i(dot(diffa0, diffa0), dot(diffa1, diffa1));
        if (dotProduct.x > dotProduct.y)
        {
            span.a = -span.a;
            swap(endPoint[0].a, endPoint[1].a);
        }

        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;
            set_pixel_rotation(pixel, rotation);

            diff0        = pixel.rgb - endPoint[0].rgb;
            dotProduct.x = dot(span.rgb, diff0);
            color_index  = (span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)]
                                                                   : aStep[indexPrec.x][63]);

            diffa0       = pixel.a - endPoint[0].a;
            dotProduct.y = dot(span.a, diffa0);
            alpha_index  = (span_norm_sqr.y <= 0 || dotProduct.y <= 0)
                               ? 0
                               : ((dotProduct.y < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct.y * 63.49999 / span_norm_sqr.y)]
                                                                   : aStep[indexPrec.y][63]);

            pixel_r.rgb = (endPoint[0].rgb * (64 - aWeight[indexPrec.x][color_index]) + endPoint[1].rgb * aWeight[indexPrec.x][color_index] + 32U);
            pixel_r.rgb.x = pixel_r.rgb.x >> 6;
            pixel_r.rgb.y = pixel_r.rgb.y >> 6;
            pixel_r.rgb.z = pixel_r.rgb.z >> 6;
            pixel_r.a = (endPoint[0].a * (64 - aWeight[indexPrec.y][alpha_index]) + endPoint[1].a * aWeight[indexPrec.y][alpha_index] + 32) >> 6;

            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            set_pixel_rotation(pixel_r, rotation);
            error += ComputeError(pixel_r, pixel_r);
        }
    }
    else
#endif
#ifdef ENABLE_MODE6
    if (threadInBlock < 16)  // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
    {
        CGU_UINT32 p = threadInBlock - 12;
        CGU_Vec4ui ep_quantized[2];
        compress_endpoints6(endPoint, ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));

        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;

        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = dot(span, span);

        CGU_Vec4ui diff4 = pixel - endPoint[0];
        dotProduct       = dot(span, diff4);
        if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
        {
            span = -span;
            swap(endPoint[0], endPoint[1]);
        }

        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;
            diff4 = pixel - endPoint[0];
            dotProduct.x = dot(span, diff4);
            color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ?
        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;

            diff4        = pixel - endPoint[0];
            dotProduct.x = dot(span, diff4);
            color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);

            pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32U) >> 6;

            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            error += ComputeError(pixel_r, pixel_r);
        }

        mode     = 6;
        rotation = p;  // Borrow rotation for p
    }
#endif

    shared_temp[GI].error          = error;
    shared_temp[GI].mode           = mode;
    shared_temp[GI].index_selector = index_selector;
    shared_temp[GI].rotation       = rotation;
    GroupSync();

    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 8].error;
            shared_temp[GI].mode           = shared_temp[GI + 8].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 4].error;
            shared_temp[GI].mode           = shared_temp[GI + 4].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 2].error;
            shared_temp[GI].mode           = shared_temp[GI + 2].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 1].error;
            shared_temp[GI].mode           = shared_temp[GI + 1].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 1].rotation;
        }

        // Save the fast mode settings for modes 4 & 5 (also used for mode 6 when q == 0)
        g_OutBuff1[blockID].error          = shared_temp[GI].error;
        g_OutBuff1[blockID].mode           = shared_temp[GI].mode & 0x07;
        g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
        g_OutBuff1[blockID].index_selector = shared_temp[GI].index_selector;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;

        // Enable cmp test
#ifdef ENABLE_CMP_MODE6
        if ((g_quality > 0.05f)
#ifdef ENABLE_MODE6
            && (shared_temp[GI].mode == 6)
#endif
        )
        {
            CGU_Vec4ui image_src[16];
            for (int i = 0; i < 16; i++)
            {
                image_src[i].x = shared_temp[threadBase + i].pixel.x;
                image_src[i].y = shared_temp[threadBase + i].pixel.y;
                image_src[i].z = shared_temp[threadBase + i].pixel.z;
                image_src[i].w = shared_temp[threadBase + i].pixel.w;
            }

            CGU_Vec4ui epo_code_out[2]     = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            CGU_UINT32 index_packed_out[2] = {0, 0};
            CGU_UINT32 cmp_out6[4]         = {0, 0, 0, 0};
            CGU_UINT32 best_index_out[16];

            CGU_UINT32 besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                                         best_index_out,
                                                         image_src,
                                                         15,  // numEntries 0..15 (Note this function is changed from using 16)
                                                         0xffffffff);

            // Error calculation needs updating to be the same all over
            //if (besterr > shared_temp[GI].error)
            {
                cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
                if (g_quality > 0.5f)
                {
                    // Refine for better quality using the prior best_index_out as the initial input
                    besterr = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                                                   best_index_out,
                                                                   image_src,
                                                                   16,  // numEntries
                                                                   g_modesettings[6].clusters,      // 16,
                                                                   g_modesettings[6].bits,          // 58,
                                                                   g_modesettings[6].channels3or4,  // 4,
                                                                   0.1f);
                    cmp_pack4bitindex32(index_packed_out, best_index_out);
                }
#endif
                cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);

                // Add in the CMP results
                g_OutBuff1[blockID].error   = besterr;
                g_OutBuff1[blockID].mode    = 6 | 0x10;
                g_OutBuff1[blockID].data2.x = cmp_out6[0];
                g_OutBuff1[blockID].data2.y = cmp_out6[1];
                g_OutBuff1[blockID].data2.z = cmp_out6[2];
                g_OutBuff1[blockID].data2.w = cmp_out6[3];
            }  // if better than fast mode
        }
#endif
    }
#else
    // Init
    if (threadInBlock < 1)
    {
        g_OutBuff1[blockID].error          = MAX_UINT;
        g_OutBuff1[blockID].mode           = 0;
        g_OutBuff1[blockID].rotation       = 0;
        g_OutBuff1[blockID].index_selector = 0;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;
    }
    GroupSync();
#endif
}

CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1)
void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 1 3 7 all have 2 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;
    CGU_UINT32 BLOCK_IN_GROUP        = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup          = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID               = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase            = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock         = GI - threadBase;
    CGU_UINT32 block_y               = blockID / g_num_block_x;
    CGU_UINT32 block_x               = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x                = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y                = block_y * BLOCK_SIZE_Y;

    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px           = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();

    shared_temp[GI].error = 0xFFFFFFFF;

    // Use this to test only one of modes 1, 3, or 7
    // if (g_mode_id != 7) {
    //     if (threadInBlock == 0)
    //         g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
    //         g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
    //         g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
    //         g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
    //         g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
    //         g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
    //     return;
    // }

#if defined(ENABLE_MODE1) || defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[2][2];  // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[2][2];
    CGU_UINT32 color_index;

    if (threadInBlock < 64)  // one thread per candidate partition
    {
        CGU_UINT32 partition = threadInBlock;
        CGU_UINT32 i;

        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;

        CGU_UINT32 bits = blockPartitions[partition];
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
            if (((bits >> i) & 0x01) == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }

        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];

        CGU_UINT32 max_p = 2;  // mode 1
#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
        if (g_mode_id != 1)
        {
            // in mode 3 7, there are two p bits per subset, one for each end point
            max_p = 4;
        }
#endif

        CGU_UINT32 final_p[2] = {0, 0};
        CGU_UINT32 error[2]   = {MAX_UINT, MAX_UINT};
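        // Note: p enumerates the candidate p-bit (quantized-endpoint LSB)
        // combinations. Mode 1 shares one p bit between the two endpoints of a
        // subset, giving 2 candidates, while modes 3 and 7 carry one p bit per
        // endpoint, giving 4; that is what max_p above encodes.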
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];

            for (i = 0; i < 2; i++)  // loop through 2 subsets
            {
#if defined(ENABLE_MODE1)
                if (g_mode_id == 1)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints1(endPoint[i], quantized, p);
                }
#endif
#if defined(ENABLE_MODE3)
                if (g_mode_id == 3)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints3(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
#if defined(ENABLE_MODE7)
                if (g_mode_id == 7)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints7(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
            }

            CGU_Vec4i span[2];
            span[0].x = endPoint[0][1].x - endPoint[0][0].x;
            span[0].y = endPoint[0][1].y - endPoint[0][0].y;
            span[0].z = endPoint[0][1].z - endPoint[0][0].z;
            span[0].w = endPoint[0][1].w - endPoint[0][0].w;
            span[1].x = endPoint[1][1].x - endPoint[1][0].x;
            span[1].y = endPoint[1][1].y - endPoint[1][0].y;
            span[1].z = endPoint[1][1].z - endPoint[1][0].z;
            span[1].w = endPoint[1][1].w - endPoint[1][0].w;

#if defined(ENABLE_MODE3)
            if (g_mode_id != 7)
            {
                span[0].w = span[1].w = 0;
            }
#endif

            CGU_INT span_norm_sqr[2];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);

            CGU_Vec4i diff;
            diff.x = shared_temp[threadBase + 0].pixel.x - endPoint[0][0].x;
            diff.y = shared_temp[threadBase + 0].pixel.y - endPoint[0][0].y;
            diff.z = shared_temp[threadBase + 0].pixel.z - endPoint[0][0].z;
            diff.w = shared_temp[threadBase + 0].pixel.w - endPoint[0][0].w;

            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_INT dotProduct = dot(span[0], diff);
            if (span_norm_sqr[0] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[0]))
            {
                span[0].x = -span[0].x;
                span[0].y = -span[0].y;
                span[0].z = -span[0].z;
                span[0].w = -span[0].w;
                swap(endPoint[0][0], endPoint[0][1]);
            }

            diff.x = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.x - endPoint[1][0].x;
            diff.y = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.y - endPoint[1][0].y;
            diff.z = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.z - endPoint[1][0].z;
            diff.w = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.w - endPoint[1][0].w;
            dotProduct = dot(span[1], diff);
            if (span_norm_sqr[1] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[1]))
            {
                span[1].x = -span[1].x;
                span[1].y = -span[1].y;
                span[1].z = -span[1].z;
                span[1].w = -span[1].w;
                swap(endPoint[1][0], endPoint[1][1]);
            }
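            // Note: these swaps orient each subset so its anchor pixel (pixel 0
            // for subset 0, candidateFixUpIndex1D[partition].x for subset 1)
            // projects into the lower half of the index range. BC7 stores each
            // anchor index with its MSB implied to be 0, which is why the test
            // checks whether the projected fraction would exceed 1/2 (32/64).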
            CGU_UINT32 step_selector = 1;  // mode 1 has 3 bit index
#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
            if (g_mode_id != 1)
            {
                step_selector = 2;  // mode 3 7 have 2 bit index
            }
#endif

            CGU_UINT32 p_error[2] = {0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits >> i) & 0x01;
                if (subset_index == 1)
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[1][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[1][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[1][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[1][0].w;
                    dotProduct  = dot(span[1], diff);
                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                         : aStep[step_selector][63]);
                }
                else
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[0][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[0][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[0][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[0][0].w;
                    dotProduct  = dot(span[0], diff);
                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                         : aStep[step_selector][63]);
                }

                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index] + 32U) >> 6;
                if (g_mode_id != 7)
                {
                    pixel_r.a = 255;
                }

                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            for (i = 0; i < 2; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;
                }
            }
        }

        shared_temp[GI].error     = error[0] + error[1];
        shared_temp[GI].mode      = g_mode_id;
        shared_temp[GI].partition = partition;

        // mode 1 3 7 don't have rotation, we use rotation for p bits
        if (g_mode_id == 1)
            shared_temp[GI].rotation = (final_p[1] << 1) | final_p[0];
        else
            shared_temp[GI].rotation = (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].mode      = shared_temp[GI + 32].mode;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].mode      = shared_temp[GI + 16].mode;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].mode      = shared_temp[GI + 8].mode;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].mode      = shared_temp[GI + 4].mode;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].mode      = shared_temp[GI + 2].mode;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].mode      = shared_temp[GI + 1].mode;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }
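        // Note: the TryMode* passes are chained; g_InBuff carries the best
        // result of the previous pass and the winner of this comparison (or
        // the carried-over result) is written to g_OutBuff1, so each pass only
        // has to beat the best error seen so far.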
        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error          = shared_temp[GI].error;
            g_OutBuff1[blockID].mode           = shared_temp[GI].mode;
            g_OutBuff1[blockID].partition      = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
            g_OutBuff1[blockID].index_selector = 0;
            g_OutBuff1[blockID].data2          = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#else
    GroupSync();
    if (threadInBlock < 1)
    {
        // carry over prior results
        g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
        g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
        g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
        g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
        g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
        g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
    }
#endif
}

CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1)
void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 0 2 have 3 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;
    CGU_UINT32 BLOCK_IN_GROUP        = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup          = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID               = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase            = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock         = GI - threadBase;
    CGU_UINT32 block_y               = blockID / g_num_block_x;
    CGU_UINT32 block_x               = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x                = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y                = block_y * BLOCK_SIZE_Y;

#if defined(ENABLE_MODE0) || defined(ENABLE_MODE2)
    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px           = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();

    shared_temp[GI].error = 0xFFFFFFFF;

    CGU_UINT32 num_partitions;
    if (0 == g_mode_id)
    {
        num_partitions = 16;
    }
    else
    {
        num_partitions = 64;
    }

    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[3][2];  // endPoint[0..2 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[3][2];
    CGU_UINT32 color_index[16];

    if (threadInBlock < num_partitions)
    {
        CGU_UINT32 partition = threadInBlock + 64;

        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        endPoint[2][0] = MAX_UINT;
        endPoint[2][1] = MIN_UINT;

        CGU_UINT32 bits2 = blockPartitions2[partition - 64];
        CGU_UINT32 i;
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel        = shared_temp[threadBase + i].pixel;
            CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
            if (subset_index == 2)
            {
                endPoint[2][0] = cmp_min(endPoint[2][0], pixel);
                endPoint[2][1] = cmp_max(endPoint[2][1], pixel);
            }
            else if (subset_index == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }

        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];
        endPointBackup[2][0] = endPoint[2][0];
        endPointBackup[2][1] = endPoint[2][1];

        CGU_UINT32 max_p;
        if (0 == g_mode_id)
        {
            max_p = 4;
        }
        else
        {
            max_p = 1;
        }
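        // Note: modes 0 and 2 are the 3-subset modes. Mode 0 uses only the
        // first 16 of the 3-subset partition shapes and has per-endpoint p bits
        // (max_p = 4); mode 2 uses all 64 shapes and has no p bits (max_p = 1).
        // The +64 offset above selects the 3-subset half of the shared tables:
        // blockPartitions2[] is indexed with partition - 64, while
        // candidateFixUpIndex1D[] is indexed with the offset partition directly.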
        CGU_UINT32 final_p[3] = {0, 0, 0};
        CGU_UINT32 error[3]   = {MAX_UINT, MAX_UINT, MAX_UINT};
        CGU_Vec4ui ep_quantized[2];

        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];
            endPoint[2][0] = endPointBackup[2][0];
            endPoint[2][1] = endPointBackup[2][1];

            for (i = 0; i < 3; i++)
            {
                if (0 == g_mode_id)
                {
                    compress_endpoints0(endPoint[i], ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
                else
                {
                    compress_endpoints2(endPoint[i], ep_quantized);
                }
            }

            CGU_UINT32 step_selector = 1 + (2 == g_mode_id);

            CGU_Vec4i span[3];
            span[0]   = cmp_castimp(endPoint[0][1] - endPoint[0][0]);
            span[1]   = cmp_castimp(endPoint[1][1] - endPoint[1][0]);
            span[2]   = cmp_castimp(endPoint[2][1] - endPoint[2][0]);
            span[0].w = span[1].w = span[2].w = 0;

            CGU_INT span_norm_sqr[3];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);
            span_norm_sqr[2] = dot(span[2], span[2]);

            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_UINT32 ci[3] = {0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y};
            CGU_Vec4ui diff;
            for (i = 0; i < 3; i++)
            {
                diff               = shared_temp[threadBase + ci[i]].pixel - endPoint[i][0];
                CGU_INT dotProduct = dot(span[i], diff);
                if (span_norm_sqr[i] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[i]))
                {
                    span[i] = -span[i];
                    swap(endPoint[i][0], endPoint[i][1]);
                }
            }

            CGU_UINT32 p_error[3] = {0, 0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
                if (subset_index == 2)
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[2][0];
                    CGU_INT dotProduct = dot(span[2], diff);
                    color_index[i]     = (span_norm_sqr[2] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[2]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[2])]
                                                                                : aStep[step_selector][63]);
                }
                else if (subset_index == 1)
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[1][0];
                    CGU_INT dotProduct = dot(span[1], diff);
                    color_index[i]     = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                                : aStep[step_selector][63]);
                }
                else
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[0][0];
                    CGU_INT dotProduct = dot(span[0], diff);
                    color_index[i]     = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                                : aStep[step_selector][63]);
                }
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index[i]]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index[i]] + 32U) >> 6;
                pixel_r.a = 255;

                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 2)
                    p_error[2] += pixel_error;
                else if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            for (i = 0; i < 3; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;  // Borrow rotation for p
                }
            }
        }

        shared_temp[GI].error     = error[0] + error[1] + error[2];
        shared_temp[GI].partition = partition;
        shared_temp[GI].rotation  = (final_p[2] << 4) | (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }

        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error     = shared_temp[GI].error;
            g_OutBuff1[blockID].mode      = g_mode_id;
            g_OutBuff1[blockID].partition = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation  = shared_temp[GI].rotation;
            g_OutBuff1[blockID].data2     = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#endif
}
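// Note: EncodeBlocks is the final pass. It reloads the block, re-derives the
// endpoints for the winning mode/partition/rotation delivered in g_InBuff,
// recomputes the per-pixel indices at that mode's precision, and packs the
// 128-bit BC7 block into g_OutBuff (or, for the CMP mode 6 path, copies the
// pre-packed result from data2).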
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1)
void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32 BLOCK_IN_GROUP               = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup                 = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID                      = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase                   = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock                = GI - threadBase;
    CGU_UINT32 block_y                      = blockID / g_num_block_x;
    CGU_UINT32 block_x                      = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x                       = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y                       = block_y * BLOCK_SIZE_Y;

    CGU_UINT32 use_cmp             = g_InBuff[blockID].mode & 0x10;
    CGU_UINT32 best_mode           = g_InBuff[blockID].mode & 0x07;
    CGU_UINT32 best_partition      = g_InBuff[blockID].partition;
    CGU_UINT32 best_index_selector = g_InBuff[blockID].index_selector;
    CGU_UINT32 best_rotation       = g_InBuff[blockID].rotation;

    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px           = clamp(px, 0.0f, 255.0f);
        CGU_Vec4ui pixel;
        pixel.r = (CGU_UINT32)px.r;
        pixel.g = (CGU_UINT32)px.g;
        pixel.b = (CGU_UINT32)px.b;
        pixel.a = (CGU_UINT32)px.a;
        if ((4 == best_mode) || (5 == best_mode))
            set_pixel_rotation(pixel, best_rotation);
        shared_temp[GI].pixel = pixel;
    }
    GroupSync();

    CGU_UINT32 bits  = blockPartitions[best_partition];
    CGU_UINT32 bits2 = blockPartitions2[best_partition - 64];

    CGU_Vec4ui ep[2];
    ep[0] = MAX_UINT;
    ep[1] = MIN_UINT;
    CGU_Vec4ui ep_quantized[2];
    CGU_Vec3ui diff3;
    CGU_Vec4ui diff4;

    CMP_UNROLL for (CGU_INT ii = 2; ii >= 0; --ii)
    {
        if (threadInBlock < 16)
        {
            CGU_Vec4ui epTemp[2];
            epTemp[0] = MAX_UINT;
            epTemp[1] = MIN_UINT;

            CGU_Vec4ui pixel = shared_temp[GI].pixel;

            CGU_UINT32 subset_index  = (bits >> threadInBlock) & 0x01;
            CGU_UINT32 subset_index2 = (bits2 >> (threadInBlock * 2)) & 0x03;
            if (0 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (0 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (0 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((4 == best_mode) || (5 == best_mode) || (6 == best_mode))
                {
                    epTemp[0] = epTemp[1] = pixel;
                }
            }
            else if (1 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (1 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (1 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }
            else
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (2 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }

            shared_temp[GI].endPoint_low  = epTemp[0];
            shared_temp[GI].endPoint_high = epTemp[1];
        }
        GroupSync();

        if (threadInBlock < 8)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 4)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 2)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 1)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
        }
        GroupSync();

        if (ii == (int)threadInBlock)
        {
            ep[0] = shared_temp[threadBase].endPoint_low;
            ep[1] = shared_temp[threadBase].endPoint_high;
        }
    }
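    // Note: the ii loop above runs the min/max reduction once per possible
    // subset (2, 1, 0). A thread contributes its pixel only when that pixel
    // belongs to subset ii under the winning mode, and thread ii latches the
    // result, so after the loop threads 0..2 hold the endpoints of subsets
    // 0..2.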
    if (threadInBlock < 3)
    {
        CGU_Vec2ui P;
        if (1 == best_mode)
        {
            P = (best_rotation >> threadInBlock) & 1;
        }
        else
        {
            P = CGU_Vec2ui((best_rotation >> (threadInBlock * 2 + 0)) & 1, (best_rotation >> (threadInBlock * 2 + 1)) & 1);
        }

        if (0 == best_mode)
        {
            compress_endpoints0(ep, ep_quantized, P);
        }
        else if (1 == best_mode)
        {
            compress_endpoints1(ep, ep_quantized, P);
        }
        else if (2 == best_mode)
        {
            compress_endpoints2(ep, ep_quantized);
        }
        else if (3 == best_mode)
        {
            compress_endpoints3(ep, ep_quantized, P);
        }
        else if (4 == best_mode)
        {
            compress_endpoints4(ep, ep_quantized);
        }
        else if (5 == best_mode)
        {
            compress_endpoints5(ep, ep_quantized);
        }
        else if (6 == best_mode)
        {
            compress_endpoints6(ep, ep_quantized, P);
        }
        else  //if (7 == mode)
        {
            compress_endpoints7(ep, ep_quantized, P);
        }

        CGU_Vec4i span = cmp_castimp(ep[1] - ep[0]);
        if (best_mode < 4)
            span.w = 0;

        if ((4 == best_mode) || (5 == best_mode))
        {
            if (0 == threadInBlock)
            {
                CGU_Vec2i span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);

                diff3                = shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb;
                CGU_Vec2i dotProduct = CGU_Vec2i(dot(span.rgb, diff3), span.a * (shared_temp[threadBase + 0].pixel.a - ep[0].a));
                if (span_norm_sqr.x > 0 && dotProduct.x > 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
                {
                    swap(ep[0].rgb, ep[1].rgb);
                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
                }
                if (span_norm_sqr.y > 0 && dotProduct.y > 0 && CGU_UINT32(dotProduct.y * 63.49999) > CGU_UINT32(32 * span_norm_sqr.y))
                {
                    swap(ep[0].a, ep[1].a);
                    swap(ep_quantized[0].a, ep_quantized[1].a);
                }
            }
        }
        else  //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
        {
            CGU_INT p;
            if (0 == threadInBlock)
            {
                p = 0;
            }
            else if (1 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].x;
            }
            else  //if (2 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].y;
            }

            CGU_INT span_norm_sqr = dot(span, span);

            diff4              = shared_temp[threadBase + p].pixel - ep[0];
            CGU_INT dotProduct = dot(span, diff4);
            if (span_norm_sqr > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr))
            {
                swap(ep[0], ep[1]);
                swap(ep_quantized[0], ep_quantized[1]);
            }
        }

        shared_temp[GI].endPoint_low            = ep[0];
        shared_temp[GI].endPoint_high           = ep[1];
        shared_temp[GI].endPoint_low_quantized  = ep_quantized[0];
        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
    }
    GroupSync();
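    // Note: with the final endpoints fixed, all 16 threads now recompute their
    // own pixel's color (and, for modes 4/5, alpha) index at the winning
    // mode's precision. The indices are parked in shared_temp[].error and
    // shared_temp[].mode, which the block_package* routines below read back.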
    if (threadInBlock < 16)
    {
        CGU_UINT32 color_index = 0;
        CGU_UINT32 alpha_index = 0;

        CGU_Vec4ui epTemp[2];
        CGU_Vec2ui indexPrec;
        if ((0 == best_mode) || (1 == best_mode))
        {
            indexPrec = 1;
        }
        else if (6 == best_mode)
        {
            indexPrec = 0;
        }
        else if (4 == best_mode)
        {
            if (0 == best_index_selector)
            {
                indexPrec = CGU_Vec2ui(2, 1);
            }
            else
            {
                indexPrec = CGU_Vec2ui(1, 2);
            }
        }
        else
        {
            indexPrec = 2;
        }

        CGU_INT subset_index;
        if ((0 == best_mode) || (2 == best_mode))
        {
            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
        }
        else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
        {
            subset_index = (bits >> threadInBlock) & 0x01;
        }
        else
        {
            subset_index = 0;
        }

        epTemp[0] = shared_temp[threadBase + subset_index].endPoint_low;
        epTemp[1] = shared_temp[threadBase + subset_index].endPoint_high;

        CGU_Vec4i span = cmp_castimp(epTemp[1] - epTemp[0]);
        if (best_mode < 4)
        {
            span.w = 0;
        }

        if ((4 == best_mode) || (5 == best_mode))
        {
            CGU_Vec2i span_norm_sqr;
            span_norm_sqr.x = dot(span.rgb, span.rgb);
            span_norm_sqr.y = span.a * span.a;

            diff3              = shared_temp[threadBase + threadInBlock].pixel.rgb - epTemp[0].rgb;
            CGU_INT dotProduct = dot(span.rgb, diff3);
            color_index        = (span_norm_sqr.x <= 0 || dotProduct <= 0)
                                     ? 0
                                     : ((dotProduct < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.x)]
                                                                       : aStep[indexPrec.x][63]);

            CGU_UINT32 diffa = shared_temp[threadBase + threadInBlock].pixel.a - epTemp[0].a;
            dotProduct       = dot(span.a, diffa);
            alpha_index      = (span_norm_sqr.y <= 0 || dotProduct <= 0)
                                   ? 0
                                   : ((dotProduct < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.y)]
                                                                     : aStep[indexPrec.y][63]);

            if (best_index_selector)
            {
                swap(color_index, alpha_index);
            }
        }
        else
        {
            CGU_INT span_norm_sqr = dot(span, span);

            diff4              = shared_temp[threadBase + threadInBlock].pixel - epTemp[0];
            CGU_INT dotProduct = dot(span, diff4);
            color_index        = (span_norm_sqr <= 0 || dotProduct <= 0)
                                     ? 0
                                     : ((dotProduct < span_norm_sqr) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr)]
                                                                     : aStep[indexPrec.x][63]);
        }

        shared_temp[GI].error = color_index;
        shared_temp[GI].mode  = alpha_index;
    }
    GroupSync();

    if (0 == threadInBlock)
    {
        CGU_Vec4ui blockRed  = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui blockBlue = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};
        CGU_Vec4ui block     = {0, 0, 0, 0};

        switch (best_mode)
        {
        case 0:
            block_package0(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 1:
            block_package1(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 2:
            block_package2(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 3:
            block_package3(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 4:
            block_package4(block, best_rotation, best_index_selector, threadBase);
            //block = blockRed;
            break;
        case 5:
            block_package5(block, best_rotation, threadBase);
            //block = blockRed;
            break;
        case 6:
            if (use_cmp)
            {
                block = g_InBuff[blockID].data2;
                //block = blockBlue;
            }
            else
            {
                block_package6(block, threadBase);
                //block = blockRed;
            }
            break;
        case 7:
            block_package7(block, best_partition, threadBase);
            //block = blockRed;
            break;
        default:  // error!
            block = blockRed;
            break;
        }

        g_OutBuff[blockID] = block;
    }
}
//=================================================
// This is a prototype API interface to run on CPU
// move to GPU when completed
//=================================================
CMP_STATIC CGU_Vec4ui CompressBlockBC7_CMPMSC(CMP_IN CGU_Vec4f image_src[16], CMP_IN CGU_FLOAT fquality)
{
    CMP_UNUSED(fquality);
    CGU_Vec4ui cmp = {0, 0, 0, 0};

#ifndef ASPM_HLSL
#ifdef SIMULATE_GPU
    HLSLHost(image_src);
    cmp = g_OutBuff[0];
#else
    CGU_Vec4ui image_srcui[16];

    // Transfer local pixel data over to shared global
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_srcui[ii].x = image_src[ii].x;
        image_srcui[ii].y = image_src[ii].y;
        image_srcui[ii].z = image_src[ii].z;
        image_srcui[ii].w = image_src[ii].w;
    }

#if defined(ENABLE_CMP_MODE6)
    CGU_Vec4ui epo_code_out[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
    CGU_UINT32 best_index_out[16];
    CGU_FLOAT  besterr;
    CGU_FLOAT  err;

    // Fast encode of the block
    besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                      best_index_out,
                                      image_srcui,
                                      15,  // numEntries 0..15 (Note this function is changed from using 16)
                                      0xffffffff);

    CGU_UINT32 index_packed_out[2] = {0, 0};
    cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
    // Refine for better quality
    err = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                               best_index_out,
                                               image_srcui,  // using shared_temp[].pixel with 0 thread offset
                                               16,           // numEntries
                                               g_modesettings[6].clusters,      // 16,
                                               g_modesettings[6].bits,          // 58,
                                               g_modesettings[6].channels3or4,  // 4,
                                               0.1f);
    cmp_pack4bitindex32(index_packed_out, best_index_out);
#endif

    // encode results
    CGU_UINT32 cmp_out6[4] = {0, 0, 0, 0};
    cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);
    cmp.x = cmp_out6[0];
    cmp.y = cmp_out6[1];
    cmp.z = cmp_out6[2];
    cmp.w = cmp_out6[3];
#endif

#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)
    {
        CGU_UINT32 cmp_out[4] = {0, 0, 0, 0};
        Compress_mode45(cmp_out, 4, image_srcui);
        cmp.x = cmp_out[0];
        cmp.y = cmp_out[1];
        cmp.z = cmp_out[2];
        cmp.w = cmp_out[3];
    }
#endif

#if defined(ENABLE_CMP_MODE1)
    {
        CGU_UINT32 cmp_out1[5] = {0, 0, 0, 0, 0};
        cmp_process_mode(cmp_out1, image_srcui, 1);
        cmp.x = cmp_out1[0];
        cmp.y = cmp_out1[1];
        cmp.z = cmp_out1[2];
        cmp.w = cmp_out1[3];
    }
#endif

#endif  // SIMULATE_GPU
#endif  // Not HLSL

    return cmp;
}
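// Note: a minimal usage sketch for the prototype API above (the caller and
// buffer names here are illustrative only, not part of this file). The
// conversion loop in CompressBlockBC7_CMPMSC assigns the float inputs straight
// to unsigned ints, so the 4x4 source block is assumed to already hold 0..255
// RGBA values:
//
//     CGU_Vec4f  block_in[16];   // one 4x4 tile, filled from the source image
//     CGU_Vec4ui cmp_block;      // receives the packed 128-bit BC7 block
//     cmp_block = CompressBlockBC7_CMPMSC(block_in, 1.0f);
//     // store cmp_block.x..w to the compressed surface for this tile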