//===================================================================== // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // //===================================================================== #include #include #include "compressonator.h" #include "common.h" #include "tc_pluginapi.h" #include "tc_plugininternal.h" #include "copencl.h" CMIPS *GPU_CLMips = nullptr; #define BUILD_AS_PLUGIN_DLL #ifdef BUILD_AS_PLUGIN_DLL DECLARE_PLUGIN(Plugin_COpenCL) SET_PLUGIN_TYPE("PIPELINE") SET_PLUGIN_NAME("GPU_OCL") #else void *make_Plugin_Compute_OpenCL() { return new Plugin_COpenCL; } #endif #pragma comment(lib,"advapi32.lib") // for RegCloseKey and other Reg calls ... Plugin_COpenCL::Plugin_COpenCL() { m_pComputeBase = NULL; } Plugin_COpenCL::~Plugin_COpenCL() { if (m_pComputeBase) delete m_pComputeBase; } int Plugin_COpenCL::TC_PluginSetSharedIO(void* Shared) { if (!Shared) return 1; GPU_CLMips = reinterpret_cast (Shared); GPU_CLMips->m_infolevel = 0x01; // Turn on print Info return 0; } int Plugin_COpenCL::TC_PluginGetVersion(TC_PluginVersion* pPluginVersion) { pPluginVersion->guid = g_GUID_GPU; pPluginVersion->dwAPIVersionMajor = TC_API_VERSION_MAJOR; pPluginVersion->dwAPIVersionMinor = TC_API_VERSION_MINOR; pPluginVersion->dwPluginVersionMajor = TC_PLUGIN_VERSION_MAJOR; pPluginVersion->dwPluginVersionMinor = TC_PLUGIN_VERSION_MINOR; return 0; } int Plugin_COpenCL::TC_Init(void *kernel_options) { m_pComputeBase = (ComputeBase *) new COpenCL(kernel_options); if (m_pComputeBase == NULL) return -1; return 0; } #ifdef ENABLE_MAKE_COMPATIBLE_API bool Plugin_COpenCL::IsFloatFormat(CMP_FORMAT InFormat) { switch (InFormat) { case CMP_FORMAT_ARGB_16F: case CMP_FORMAT_ABGR_16F: case CMP_FORMAT_RGBA_16F: case CMP_FORMAT_BGRA_16F: case CMP_FORMAT_RG_16F: case CMP_FORMAT_R_16F: case CMP_FORMAT_ARGB_32F: case CMP_FORMAT_ABGR_32F: case CMP_FORMAT_RGBA_32F: case CMP_FORMAT_BGRA_32F: case CMP_FORMAT_RGB_32F: case CMP_FORMAT_BGR_32F: case CMP_FORMAT_RG_32F: case CMP_FORMAT_R_32F: case CMP_FORMAT_BC6H: case CMP_FORMAT_BC6H_SF: case CMP_FORMAT_RGBE_32F: { return true; } break; default: break; } return false; } inline float clamp(float a, float l, float h) { return (a < l) ? l : ((a > h) ? h : a); } inline float knee(double x, double f) { return float(log(x * f + 1.f) / f); } float Plugin_COpenCL::findKneeValueHPC(float x, float y) { float f0 = 0; float f1 = 1.f; while (knee(x, f1) > y) { f0 = f1; f1 = f1 * 2.f; } for (int i = 0; i < 30; ++i) { const float f2 = (f0 + f1) / 2.f; const float y2 = knee(x, f2); if (y2 < y) { f1 = f2; } else { f0 = f2; } } return (f0 + f1) / 2.f; } CMP_ERROR Plugin_COpenCL::CF_16BitTo8Bit(CMP_WORD* sBlock, CMP_BYTE* cBlock, CMP_DWORD dwBlockSize) { assert(sBlock); assert(cBlock); assert(dwBlockSize); if (sBlock && cBlock && dwBlockSize) { for (CMP_DWORD i = 0; i < dwBlockSize; i++) { cBlock[i] = (CMP_BYTE)(sBlock[i] / 257); } } return CMP_OK; } CMP_ERROR Plugin_COpenCL::Byte2HalfShort(CMP_HALFSHORT* hfsBlock, CMP_BYTE* cBlock, CMP_DWORD dwBlockSize) { assert(hfsBlock); assert(cBlock); assert(dwBlockSize); if (hfsBlock && cBlock && dwBlockSize) { for (CMP_DWORD i = 0; i < dwBlockSize; i++) { hfsBlock[i] = CMP_HALF(float(cBlock[i] / 255.0f)).bits(); } } return CMP_OK; } CMP_ERROR Plugin_COpenCL::Float2Byte(CMP_BYTE cBlock[], CMP_FLOAT* fBlock, MipSet &srcTexture, const CMP_CompressOptions* pOptions) { assert(cBlock); assert(fBlock); assert(&srcTexture); if (cBlock && fBlock) { CMP_HALF* hfData = (CMP_HALF*)fBlock; float r = 0, g = 0, b = 0, a = 0; float kl = powf(2.f, pOptions->fInputKneeLow); float f = findKneeValueHPC(powf(2.f, pOptions->fInputKneeHigh) - kl, powf(2.f, 3.5f) - kl); float luminance3f = powf(2, -3.5); // always assume max intensity is 1 and 3.5f darker for scale later float invGamma = 1 / pOptions->fInputGamma; //for gamma correction float scale = (float)255.0 * powf(luminance3f, invGamma); int i = 0; for (unsigned int y = 0; y < srcTexture.dwHeight; y++) { for (unsigned int x = 0; x < srcTexture.dwWidth; x++) { if (srcTexture.m_ChannelFormat == CF_Float16) { r = (float)(*hfData); hfData++; g = (float)(*hfData); hfData++; b = (float)(*hfData); hfData++; a = (float)(*hfData); hfData++; } else if (srcTexture.m_ChannelFormat == CF_Float32) { r = (float)(*fBlock); fBlock++; g = (float)(*fBlock); fBlock++; b = (float)(*fBlock); fBlock++; a = (float)(*fBlock); fBlock++; } CMP_BYTE r_b, g_b, b_b, a_b; // 1) Compensate for fogging by subtracting defog // from the raw pixel values. // We assume a defog of 0 if (pOptions->fInputDefog > 0.0) { r = r - pOptions->fInputDefog; g = g - pOptions->fInputDefog; b = b - pOptions->fInputDefog; a = a - pOptions->fInputDefog; } // 2) Multiply the defogged pixel values by // 2^(exposure + 2.47393). const float exposeScale = powf(2, pOptions->fInputExposure + 2.47393f); r = r * exposeScale; g = g * exposeScale; b = b * exposeScale; a = a * exposeScale; // 3) Values that are now 1.0 are called "middle gray". // If defog and exposure are both set to 0.0, then // middle gray corresponds to a raw pixel value of 0.18. // In step 6, middle gray values will be mapped to an // intensity 3.5 f-stops below the display's maximum // intensity. // 4) Apply a knee function. The knee function has two // parameters, kneeLow and kneeHigh. Pixel values // below 2^kneeLow are not changed by the knee // function. Pixel values above kneeLow are lowered // according to a logarithmic curve, such that the // value 2^kneeHigh is mapped to 2^3.5. (In step 6, // this value will be mapped to the the display's // maximum intensity.) if (r > kl) { r = kl + knee(r - kl, f); } if (g > kl) { g = kl + knee(g - kl, f); } if (b > kl) { b = kl + knee(b - kl, f); } if (a > kl) { a = kl + knee(a - kl, f); } // 5) Gamma-correct the pixel values, according to the // screen's gamma. (We assume that the gamma curve // is a simple power function.) r = powf(r, invGamma); g = powf(g, invGamma); b = powf(b, invGamma); a = powf(a, pOptions->fInputGamma); // 6) Scale the values such that middle gray pixels are // mapped to a frame buffer value that is 3.5 f-stops // below the display's maximum intensity. r *= scale; g *= scale; b *= scale; a *= scale; r_b = (CMP_BYTE)clamp(r, 0.f, 255.f); g_b = (CMP_BYTE)clamp(g, 0.f, 255.f); b_b = (CMP_BYTE)clamp(b, 0.f, 255.f); a_b = (CMP_BYTE)clamp(a, 0.f, 255.f); cBlock[i] = r_b; i++; cBlock[i] = g_b; i++; cBlock[i] = b_b; i++; cBlock[i] = a_b; i++; } } } return CMP_OK; } #endif CMP_ERROR Plugin_COpenCL::TC_Compress(void *Options, MipSet &SrcTexture, MipSet &destTexture, CMP_Feedback_Proc pFeedback) { CMP_ERROR result = CMP_OK; #ifdef ENABLE_MAKE_COMPATIBLE_API bool srcFloat; bool destFloat; srcFloat = (SrcTexture.m_ChannelFormat == CF_Float16 || SrcTexture.m_ChannelFormat == CF_Float32) ? true : false; if (destTexture.m_format != CMP_FORMAT_Unknown) destFloat= IsFloatFormat(destTexture.m_format); else destFloat = (destTexture.m_ChannelFormat == CF_Float16 || destTexture.m_ChannelFormat == CF_Float32) ? true : false; bool newBuffer = false; // store original source data while ACF data is been processed CMP_BYTE *hold_pData = NULL; CMP_DWORD hold_dwDataSize = 0; CMP_FORMAT hold_format = CMP_FORMAT_Unknown; if (srcFloat && !destFloat) { hold_pData = SrcTexture.pData; hold_format = SrcTexture.m_format; hold_dwDataSize = SrcTexture.dwDataSize; CMP_DWORD size = SrcTexture.dwWidth * SrcTexture.dwHeight; CMP_FLOAT*pfData = new CMP_FLOAT[SrcTexture.dwDataSize]; memcpy(pfData, SrcTexture.pData, SrcTexture.dwDataSize); CMP_BYTE *byteData = new CMP_BYTE[size * 4]; CMP_CompressOptions fDataOptions; fDataOptions.fInputDefog = AMD_CODEC_DEFOG_DEFAULT; fDataOptions.fInputExposure = AMD_CODEC_EXPOSURE_DEFAULT; fDataOptions.fInputKneeLow = AMD_CODEC_KNEELOW_DEFAULT; fDataOptions.fInputKneeHigh = AMD_CODEC_KNEEHIGH_DEFAULT; fDataOptions.fInputGamma = AMD_CODEC_GAMMA_DEFAULT; Float2Byte(byteData, pfData, SrcTexture, &fDataOptions); delete[] pfData; SrcTexture.pData = byteData; SrcTexture.m_format = CMP_FORMAT_ARGB_8888; SrcTexture.dwDataSize = size * 4; newBuffer = true; } else if (!srcFloat && destFloat) { // Process the current mip level data hold_pData = SrcTexture.pData; hold_format = SrcTexture.m_format; hold_dwDataSize = SrcTexture.dwDataSize; CMP_DWORD size = SrcTexture.dwWidth * SrcTexture.dwHeight; CMP_BYTE *pbData = SrcTexture.pData; CMP_HALFSHORT *hfloatData = new CMP_HALFSHORT[size * 4]; Byte2HalfShort(hfloatData, pbData, size * 4); SrcTexture.pData = (CMP_BYTE*)hfloatData; SrcTexture.m_format = CMP_FORMAT_ARGB_16F; SrcTexture.dwDataSize = size * 4 * 2; newBuffer = true; } else { // both src & dest are of type int // check if src format is 8 bit and dest is 8 bit if not convert src to match dest if ((SrcTexture.m_ChannelFormat == CF_16bit) && (destTexture.m_ChannelFormat == CF_Compressed)) { hold_pData = SrcTexture.pData; hold_format = SrcTexture.m_format; hold_dwDataSize = SrcTexture.dwDataSize; CMP_DWORD size = hold_dwDataSize/2; CMP_WORD *pbData = (CMP_WORD *)SrcTexture.pData; CMP_BYTE *sData = new CMP_BYTE[size]; CF_16BitTo8Bit(pbData,sData, size); SrcTexture.pData = (CMP_BYTE*)sData; SrcTexture.m_format = CMP_FORMAT_ARGB_8888; SrcTexture.dwDataSize = size; newBuffer = true; } } #endif if (m_pComputeBase) result = m_pComputeBase->Compress((KernelOptions *)Options, SrcTexture,destTexture,pFeedback); #ifdef ENABLE_MAKE_COMPATIBLE_API if (newBuffer) { // remove the new data free(SrcTexture.pData); // restore original data SrcTexture.pData = hold_pData; SrcTexture.m_format = hold_format; SrcTexture.dwDataSize = hold_dwDataSize; } #endif return result; } CMP_ERROR Plugin_COpenCL::TC_GetPerformanceStats(void* pPerfStats) { CMP_ERROR result = CMP_ERR_NOPERFSTATS; if (m_pComputeBase) { KernelPerformanceStats *PerfStats = reinterpret_cast(pPerfStats); PerfStats->m_num_blocks = m_pComputeBase->GetBlockSize(); PerfStats->m_computeShaderElapsedMS = m_pComputeBase->GetProcessElapsedTimeMS(); PerfStats->m_CmpMTxPerSec = m_pComputeBase->GetMTxPerSec(); result = CMP_OK; } return result; } CMP_ERROR Plugin_COpenCL::TC_GetDeviceInfo(void* pDeviceInfo) { CMP_ERROR result = CMP_ERR_NOPERFSTATS; if (m_pComputeBase) { KernelDeviceInfo *DeviceInfo = reinterpret_cast(pDeviceInfo); snprintf(DeviceInfo->m_deviceName,sizeof(DeviceInfo->m_deviceName),"%s",m_pComputeBase->GetDeviceName()); snprintf(DeviceInfo->m_version,sizeof(DeviceInfo->m_version),"%s",m_pComputeBase->GetVersion()); DeviceInfo->m_maxUCores = m_pComputeBase->GetMaxUCores(); result = CMP_OK; } return result; } void Plugin_COpenCL::TC_SetComputeOptions(void *options) { if (m_pComputeBase) m_pComputeBase->SetComputeOptions((ComputeOptions *)options); } char *Plugin_COpenCL::TC_ComputeSourceFile() { return NULL; } int Plugin_COpenCL::TC_Close() { if (m_pComputeBase) { delete m_pComputeBase; m_pComputeBase = NULL; } return 0; }