// TexConv/Applications/_Plugins/CMP_GPU/OpenCL/COpenCL.cpp

//=====================================================================
// Copyright (c) 2020    Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================

#include <stdio.h>
#include <stdlib.h>
#include "compressonator.h"
#include "common.h"
#include "tc_pluginapi.h"
#include "tc_plugininternal.h"
#include "copencl.h"


// Shared CMIPS object supplied by the host through TC_PluginSetSharedIO().
// Stays null until that call succeeds.
CMIPS *GPU_CLMips = nullptr;

// BUILD_AS_PLUGIN_DLL is defined unconditionally right here, so in this
// translation unit the #ifdef branch below is always the one compiled and the
// #else factory function is compiled out.
#define BUILD_AS_PLUGIN_DLL

#ifdef BUILD_AS_PLUGIN_DLL
// Plugin declaration macros (defined outside this file, presumably by the
// plugin API headers included above - confirm). They are given the class
// Plugin_COpenCL, the type string "PIPELINE" and the name string "GPU_OCL".
DECLARE_PLUGIN(Plugin_COpenCL)
SET_PLUGIN_TYPE("PIPELINE")
SET_PLUGIN_NAME("GPU_OCL")
#else
// Alternative entry point used when not building as a plugin DLL: returns a
// heap-allocated Plugin_COpenCL as an untyped pointer. The caller receives
// ownership of the raw pointer.
void *make_Plugin_Compute_OpenCL() {
    return new Plugin_COpenCL;
}
#endif

// MSVC linker directive: link against advapi32.lib.
#pragma comment(lib,"advapi32.lib")        // for RegCloseKey and other Reg calls ...

// Creates the plugin with no compute backend attached; TC_Init() creates it.
Plugin_COpenCL::Plugin_COpenCL() {
    m_pComputeBase = nullptr;
}

// Releases the compute backend, if one is still attached.
Plugin_COpenCL::~Plugin_COpenCL() {
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete m_pComputeBase;
}

// Stores the host-provided shared CMIPS object in GPU_CLMips and switches its
// info level to 0x01.
//
// Shared - pointer to a CMIPS instance, passed untyped.
// Returns 0 on success, 1 when Shared is null (nothing is stored in that case).
int Plugin_COpenCL::TC_PluginSetSharedIO(void* Shared) {
    if (Shared == nullptr) {
        return 1;
    }

    GPU_CLMips = static_cast<CMIPS*>(Shared);
    GPU_CLMips->m_infolevel = 0x01;  // Turn on print Info
    return 0;
}

// Fills *pPluginVersion with this plugin's GUID and the API / plugin version
// numbers it was built against.
//
// pPluginVersion - destination structure; must not be null.
// Returns 0 on success, 1 when pPluginVersion is null (same convention as
// TC_PluginSetSharedIO for a null argument).
int Plugin_COpenCL::TC_PluginGetVersion(TC_PluginVersion* pPluginVersion) {
    // Guard against a null destination instead of dereferencing it.
    if (!pPluginVersion) return 1;

    pPluginVersion->guid                    = g_GUID_GPU;
    pPluginVersion->dwAPIVersionMajor       = TC_API_VERSION_MAJOR;
    pPluginVersion->dwAPIVersionMinor       = TC_API_VERSION_MINOR;
    pPluginVersion->dwPluginVersionMajor    = TC_PLUGIN_VERSION_MAJOR;
    pPluginVersion->dwPluginVersionMinor    = TC_PLUGIN_VERSION_MINOR;
    return 0;
}

int Plugin_COpenCL::TC_Init(void  *kernel_options) {
    m_pComputeBase = (ComputeBase *) new COpenCL(kernel_options);
    if (m_pComputeBase == NULL)
        return -1;
    return 0;
}

#ifdef ENABLE_MAKE_COMPATIBLE_API
// Returns true when InFormat is one of the floating-point formats enumerated
// in the switch below, false for every other value.
bool Plugin_COpenCL::IsFloatFormat(CMP_FORMAT InFormat) {
    bool isFloat = false;

    switch (InFormat) {
    // 16-bit float layouts
    case CMP_FORMAT_ARGB_16F:
    case CMP_FORMAT_ABGR_16F:
    case CMP_FORMAT_RGBA_16F:
    case CMP_FORMAT_BGRA_16F:
    case CMP_FORMAT_RG_16F:
    case CMP_FORMAT_R_16F:
    // 32-bit float layouts
    case CMP_FORMAT_ARGB_32F:
    case CMP_FORMAT_ABGR_32F:
    case CMP_FORMAT_RGBA_32F:
    case CMP_FORMAT_BGRA_32F:
    case CMP_FORMAT_RGB_32F:
    case CMP_FORMAT_BGR_32F:
    case CMP_FORMAT_RG_32F:
    case CMP_FORMAT_R_32F:
    // remaining formats treated as float
    case CMP_FORMAT_BC6H:
    case CMP_FORMAT_BC6H_SF:
    case CMP_FORMAT_RGBE_32F:
        isFloat = true;
        break;
    default:
        break;
    }

    return isFloat;
}

// Limits a to the closed interval [l, h]: returns l when a < l, h when a > h,
// and a itself otherwise.
inline float clamp(float a, float l, float h) {
    if (a < l) {
        return l;
    }
    if (a > h) {
        return h;
    }
    return a;
}

// Logarithmic knee curve: log(x * f + 1) / f, evaluated in double precision
// and narrowed to float on return.
inline float knee(double x, double f) {
    const double compressed = log(x * f + 1.f) / f;
    return float(compressed);
}

// Searches for the factor f at which knee(x, f) is approximately y.
//
// First an interval is grown by doubling its upper end until knee(x, upper)
// no longer exceeds y; then the interval is halved 30 times. The midpoint of
// the final interval is returned.
float Plugin_COpenCL::findKneeValueHPC(float x, float y) {
    float lower = 0;
    float upper = 1.f;

    // Expansion phase: each step moves the lower end to the old upper end.
    for (; knee(x, upper) > y; upper = upper * 2.f) {
        lower = upper;
    }

    // Bisection phase: a fixed number of halvings.
    int stepsLeft = 30;
    while (stepsLeft > 0) {
        const float mid    = (lower + upper) / 2.f;
        const float atMid  = knee(x, mid);

        if (atMid < y) {
            upper = mid;
        } else {
            lower = mid;
        }
        --stepsLeft;
    }

    return (lower + upper) / 2.f;
}

// Narrows dwBlockSize 16-bit samples to 8 bits by dividing each by 257
// (so 65535 becomes 255).
//
// sBlock      - source samples.
// cBlock      - destination; receives dwBlockSize bytes.
// dwBlockSize - number of samples to convert.
// Always returns CMP_OK; nothing is written when any argument is null/zero.
CMP_ERROR Plugin_COpenCL::CF_16BitTo8Bit(CMP_WORD* sBlock, CMP_BYTE* cBlock, CMP_DWORD dwBlockSize) {
    assert(sBlock);
    assert(cBlock);
    assert(dwBlockSize);

    if (!sBlock || !cBlock || !dwBlockSize) {
        return CMP_OK;
    }

    const CMP_WORD* src = sBlock;
    CMP_BYTE*       dst = cBlock;
    for (CMP_DWORD remaining = dwBlockSize; remaining != 0; --remaining) {
        *dst = (CMP_BYTE)(*src / 257);
        ++dst;
        ++src;
    }

    return CMP_OK;
}

// Converts dwBlockSize bytes to half-float bit patterns: each byte is divided
// by 255.0f, wrapped in a CMP_HALF, and the value of its bits() is stored.
//
// hfsBlock    - destination; receives dwBlockSize entries.
// cBlock      - source bytes.
// dwBlockSize - number of entries to convert.
// Always returns CMP_OK; nothing is written when any argument is null/zero.
CMP_ERROR Plugin_COpenCL::Byte2HalfShort(CMP_HALFSHORT* hfsBlock, CMP_BYTE* cBlock, CMP_DWORD dwBlockSize) {
    assert(hfsBlock);
    assert(cBlock);
    assert(dwBlockSize);

    if (!hfsBlock || !cBlock || !dwBlockSize) {
        return CMP_OK;
    }

    for (CMP_DWORD idx = 0; idx != dwBlockSize; ++idx) {
        const float normalized = float(cBlock[idx] / 255.0f);
        hfsBlock[idx] = CMP_HALF(normalized).bits();
    }

    return CMP_OK;
}

// Tone-maps a four-channel floating-point image down to 8 bits per channel.
//
// cBlock     - destination; receives dwWidth * dwHeight * 4 bytes, written as
//              R, G, B, A for each pixel in row order.
// fBlock     - source pixels, four channels per pixel. Read as CMP_HALF when
//              srcTexture.m_ChannelFormat is CF_Float16 and as CMP_FLOAT when
//              it is CF_Float32. For any other channel format no source data
//              is read and the channel values carry over from the previous
//              iteration (initially 0).
// srcTexture - supplies dwWidth, dwHeight and m_ChannelFormat.
// pOptions   - supplies fInputDefog, fInputExposure, fInputKneeLow,
//              fInputKneeHigh and fInputGamma.
//
// Always returns CMP_OK. Nothing is written when cBlock, fBlock or pOptions is
// null.
CMP_ERROR Plugin_COpenCL::Float2Byte(CMP_BYTE cBlock[], CMP_FLOAT* fBlock, MipSet  &srcTexture, const CMP_CompressOptions* pOptions) {
    assert(cBlock);
    assert(fBlock);
    assert(pOptions);

    if (cBlock && fBlock && pOptions) {
        CMP_HALF* hfData = (CMP_HALF*)fBlock;
        float r = 0, g = 0, b = 0, a = 0;

        const float kl = powf(2.f, pOptions->fInputKneeLow);
        const float f = findKneeValueHPC(powf(2.f, pOptions->fInputKneeHigh) - kl, powf(2.f, 3.5f) - kl);
        const float luminance3f = powf(2, -3.5);         // always assume max intensity is 1 and 3.5f darker for scale later
        const float invGamma = 1 / pOptions->fInputGamma; // for gamma correction
        const float scale = (float)255.0 * powf(luminance3f, invGamma);

        // Per-image constants, computed once instead of once per pixel.
        const bool  applyDefog  = pOptions->fInputDefog > 0.0;
        const float defog       = pOptions->fInputDefog;
        const float exposeScale = powf(2, pOptions->fInputExposure + 2.47393f);

        // Float -> byte with saturation. The "v > 0" test is false for NaN, so
        // NaN (e.g. from powf() on a negative base) and all non-positive values
        // map to 0 instead of reaching an undefined float-to-integer cast.
        const auto toByte = [](float v) -> CMP_BYTE {
            if (v > 0.f) {
                return (v < 255.f) ? (CMP_BYTE)v : (CMP_BYTE)255;
            }
            return 0;
        };

        // size_t rather than int: width * height * 4 can exceed INT_MAX.
        size_t i = 0;

        for (unsigned int y = 0; y < srcTexture.dwHeight; y++) {
            for (unsigned int x = 0; x < srcTexture.dwWidth; x++) {
                if (srcTexture.m_ChannelFormat == CF_Float16) {
                    r = (float)(*hfData++);
                    g = (float)(*hfData++);
                    b = (float)(*hfData++);
                    a = (float)(*hfData++);
                } else if (srcTexture.m_ChannelFormat == CF_Float32) {
                    r = (float)(*fBlock++);
                    g = (float)(*fBlock++);
                    b = (float)(*fBlock++);
                    a = (float)(*fBlock++);
                }

                //  1) Compensate for fogging by subtracting defog
                //     from the raw pixel values.
                //     Skipped unless defog is positive.
                if (applyDefog) {
                    r = r - defog;
                    g = g - defog;
                    b = b - defog;
                    a = a - defog;
                }

                //  2) Multiply the defogged pixel values by
                //     2^(exposure + 2.47393).
                r = r * exposeScale;
                g = g * exposeScale;
                b = b * exposeScale;
                a = a * exposeScale;

                //  3) Values that are now 1.0 are called "middle gray".
                //     If defog and exposure are both set to 0.0, then
                //     middle gray corresponds to a raw pixel value of 0.18.
                //     In step 6, middle gray values will be mapped to an
                //     intensity 3.5 f-stops below the display's maximum
                //     intensity.

                //  4) Apply a knee function.  The knee function has two
                //     parameters, kneeLow and kneeHigh.  Pixel values
                //     below 2^kneeLow are not changed by the knee
                //     function.  Pixel values above kneeLow are lowered
                //     according to a logarithmic curve, such that the
                //     value 2^kneeHigh is mapped to 2^3.5.  (In step 6,
                //     this value will be mapped to the display's
                //     maximum intensity.)
                if (r > kl) {
                    r = kl + knee(r - kl, f);
                }
                if (g > kl) {
                    g = kl + knee(g - kl, f);
                }
                if (b > kl) {
                    b = kl + knee(b - kl, f);
                }
                if (a > kl) {
                    a = kl + knee(a - kl, f);
                }

                //  5) Gamma-correct the pixel values, according to the
                //     screen's gamma.  (We assume that the gamma curve
                //     is a simple power function.)
                r = powf(r, invGamma);
                g = powf(g, invGamma);
                b = powf(b, invGamma);
                // NOTE(review): alpha is raised to fInputGamma, not invGamma
                // like the colour channels. Kept as in the existing code;
                // confirm whether this asymmetry is intended.
                a = powf(a, pOptions->fInputGamma);

                //  6) Scale the values such that middle gray pixels are
                //     mapped to a frame buffer value that is 3.5 f-stops
                //     below the display's maximum intensity.
                r *= scale;
                g *= scale;
                b *= scale;
                a *= scale;

                cBlock[i++] = toByte(r);
                cBlock[i++] = toByte(g);
                cBlock[i++] = toByte(b);
                cBlock[i++] = toByte(a);
            }
        }
    }

    return CMP_OK;
}
#endif

// Compresses SrcTexture into destTexture through the attached compute backend.
//
// When ENABLE_MAKE_COMPATIBLE_API is defined, the source pixel data is first
// converted into a temporary buffer if its channel type does not match the
// destination:
//   - float source, non-float destination : tone-mapped to 8-bit (Float2Byte)
//   - non-float source, float destination : bytes expanded to half floats
//   - 16-bit source, compressed destination: narrowed to 8-bit
// SrcTexture.pData / m_format / dwDataSize point at the temporary buffer only
// for the duration of the backend call and are restored before returning.
//
// Options   - forwarded to the backend as KernelOptions*.
// pFeedback - forwarded to the backend unchanged.
// Returns the backend's result, or CMP_OK when no backend is attached.
CMP_ERROR Plugin_COpenCL::TC_Compress(void *Options, MipSet  &SrcTexture, MipSet  &destTexture, CMP_Feedback_Proc pFeedback) {
    CMP_ERROR result = CMP_OK;

#ifdef ENABLE_MAKE_COMPATIBLE_API

    const bool srcFloat = (SrcTexture.m_ChannelFormat == CF_Float16) || (SrcTexture.m_ChannelFormat == CF_Float32);

    bool destFloat;
    if (destTexture.m_format != CMP_FORMAT_Unknown)
        destFloat = IsFloatFormat(destTexture.m_format);
    else
        destFloat = (destTexture.m_ChannelFormat == CF_Float16) || (destTexture.m_ChannelFormat == CF_Float32);

    // Temporary converted source data. Every path below allocates it with
    // new CMP_BYTE[...] so that the single delete[] after compression matches
    // the allocation (the previous code released new[] memory with free()).
    CMP_BYTE    *tempData = nullptr;

    // Original source data, kept while the converted data is being processed.
    CMP_BYTE    *hold_pData = NULL;
    CMP_DWORD   hold_dwDataSize = 0;
    CMP_FORMAT  hold_format = CMP_FORMAT_Unknown;

    if (srcFloat && !destFloat) {
        hold_pData      = SrcTexture.pData;
        hold_format     = SrcTexture.m_format;
        hold_dwDataSize = SrcTexture.dwDataSize;

        const CMP_DWORD size         = SrcTexture.dwWidth * SrcTexture.dwHeight;
        const CMP_DWORD channelCount = size * 4;

        // Staging copy of the source as CMP_FLOAT storage. dwDataSize is a
        // byte count, so only dwDataSize / sizeof(CMP_FLOAT) elements
        // (rounded up) are needed to hold it. The buffer is additionally
        // never smaller than the four channels per pixel that Float2Byte()
        // reads, and is zero-initialised, so a short source cannot cause an
        // out-of-bounds or uninitialised read.
        size_t floatCount = (SrcTexture.dwDataSize + sizeof(CMP_FLOAT) - 1) / sizeof(CMP_FLOAT);
        if (floatCount < channelCount)
            floatCount = channelCount;
        CMP_FLOAT *pfData = new CMP_FLOAT[floatCount]();
        memcpy(pfData, SrcTexture.pData, SrcTexture.dwDataSize);

        tempData = new CMP_BYTE[channelCount];

        CMP_CompressOptions fDataOptions;
        fDataOptions.fInputDefog    = AMD_CODEC_DEFOG_DEFAULT;
        fDataOptions.fInputExposure = AMD_CODEC_EXPOSURE_DEFAULT;
        fDataOptions.fInputKneeLow  = AMD_CODEC_KNEELOW_DEFAULT;
        fDataOptions.fInputKneeHigh = AMD_CODEC_KNEEHIGH_DEFAULT;
        fDataOptions.fInputGamma    = AMD_CODEC_GAMMA_DEFAULT;
        Float2Byte(tempData, pfData, SrcTexture, &fDataOptions);

        delete[] pfData;
        SrcTexture.pData        = tempData;
        SrcTexture.m_format     = CMP_FORMAT_ARGB_8888;
        SrcTexture.dwDataSize   = channelCount;
    }

    else if (!srcFloat && destFloat) {
        // Process the current mip level data
        hold_pData      = SrcTexture.pData;
        hold_format     = SrcTexture.m_format;
        hold_dwDataSize = SrcTexture.dwDataSize;

        const CMP_DWORD size         = SrcTexture.dwWidth * SrcTexture.dwHeight;
        const CMP_DWORD channelCount = size * 4;

        // Allocated as bytes (see tempData above) and filled as half floats.
        tempData = new CMP_BYTE[channelCount * sizeof(CMP_HALFSHORT)];
        Byte2HalfShort(reinterpret_cast<CMP_HALFSHORT *>(tempData), SrcTexture.pData, channelCount);

        SrcTexture.pData        = tempData;
        SrcTexture.m_format     = CMP_FORMAT_ARGB_16F;
        SrcTexture.dwDataSize   = size * 4 * 2;
    } else { // both src & dest are of type int
        // check if src format is 8 bit and dest is 8 bit if not convert src to match dest
        if ((SrcTexture.m_ChannelFormat == CF_16bit) && (destTexture.m_ChannelFormat == CF_Compressed)) {
            hold_pData      = SrcTexture.pData;
            hold_format     = SrcTexture.m_format;
            hold_dwDataSize = SrcTexture.dwDataSize;

            // One output byte per 16-bit input sample.
            const CMP_DWORD size = hold_dwDataSize / 2;
            tempData = new CMP_BYTE[size];
            CF_16BitTo8Bit((CMP_WORD *)SrcTexture.pData, tempData, size);

            SrcTexture.pData        = tempData;
            SrcTexture.m_format     = CMP_FORMAT_ARGB_8888;
            SrcTexture.dwDataSize   = size;
        }

    }
#endif

    if (m_pComputeBase)
        result = m_pComputeBase->Compress((KernelOptions *)Options, SrcTexture,destTexture,pFeedback);

#ifdef ENABLE_MAKE_COMPATIBLE_API
    if (tempData) {
        // remove the temporary data (allocated with new[] above)
        delete[] tempData;
        // restore original data
        SrcTexture.pData        = hold_pData;
        SrcTexture.m_format     = hold_format;
        SrcTexture.dwDataSize   = hold_dwDataSize;
    }
#endif

    return result;
}

// Copies the backend's performance counters into a KernelPerformanceStats.
//
// pPerfStats - pointer to a KernelPerformanceStats, passed untyped.
// Returns CMP_OK when the structure was filled, CMP_ERR_NOPERFSTATS when no
// backend is attached or pPerfStats is null.
CMP_ERROR Plugin_COpenCL::TC_GetPerformanceStats(void* pPerfStats) {
    CMP_ERROR result = CMP_ERR_NOPERFSTATS;
    // Both the backend and the destination must exist; a null destination is
    // reported as "no stats" instead of being dereferenced.
    if (m_pComputeBase && pPerfStats) {
        KernelPerformanceStats *PerfStats =  reinterpret_cast<KernelPerformanceStats *>(pPerfStats);
        PerfStats->m_num_blocks  = m_pComputeBase->GetBlockSize();
        PerfStats->m_computeShaderElapsedMS = m_pComputeBase->GetProcessElapsedTimeMS();
        PerfStats->m_CmpMTxPerSec  = m_pComputeBase->GetMTxPerSec();
        result = CMP_OK;
    }
    return result;
}

// Copies the backend's device name, version string and compute-unit count
// into a KernelDeviceInfo.
//
// pDeviceInfo - pointer to a KernelDeviceInfo, passed untyped.
// Returns CMP_OK when the structure was filled, CMP_ERR_NOPERFSTATS when no
// backend is attached or pDeviceInfo is null.
CMP_ERROR Plugin_COpenCL::TC_GetDeviceInfo(void* pDeviceInfo) {
    CMP_ERROR result = CMP_ERR_NOPERFSTATS;
    // A null destination is reported as an error instead of being dereferenced.
    if (m_pComputeBase && pDeviceInfo) {
        KernelDeviceInfo *DeviceInfo =  reinterpret_cast<KernelDeviceInfo *>(pDeviceInfo);
        // snprintf bounds each copy by the size of the destination member.
        snprintf(DeviceInfo->m_deviceName,sizeof(DeviceInfo->m_deviceName),"%s",m_pComputeBase->GetDeviceName());
        snprintf(DeviceInfo->m_version,sizeof(DeviceInfo->m_version),"%s",m_pComputeBase->GetVersion());
        DeviceInfo->m_maxUCores      = m_pComputeBase->GetMaxUCores();
        result = CMP_OK;
    }
    return result;
}

void Plugin_COpenCL::TC_SetComputeOptions(void *options) {
    if (m_pComputeBase)
        m_pComputeBase->SetComputeOptions((ComputeOptions *)options);
}

// This plugin reports no compute source file: always returns a null pointer.
char *Plugin_COpenCL::TC_ComputeSourceFile() {
    return nullptr;
}

// Destroys the compute backend (if any) and clears the pointer so a later
// TC_Close() or the destructor does not delete it again. Always returns 0.
int Plugin_COpenCL::TC_Close() {
    // delete on a null pointer is a no-op, so the unconditional form is safe.
    delete m_pComputeBase;
    m_pComputeBase = NULL;
    return 0;
}