//=============================================================================
// Copyright (c) 2020    Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==============================================================================

#include "copencl.h"

#ifdef USE_CPU_PERFORMANCE_COUNTERS
#include "query_timer.h"  // can use CPU timing but pref is to use GPU counters
#endif

#ifdef  _M_X64
//#define ENABLE_SVM //- disable for now, causing build issue in 64bit release
#endif

#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define fp64_supported      1
#elif defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define fp64_supported      1
#else
// "Double precision floating point not supported by OpenCL implementation."
#define fp64_supported      0
#endif

#ifdef ENABLE_SVM
extern bool SVMInitCodec(KernelOptions *options);
#endif

extern CMIPS *GPU_CLMips;

#define LOG_BUFFER_SIZE          102400
#define KERNEL_ARG_SOURCE        0
#define KERNEL_ARG_DESTINATION   1
#define KERNEL_ARG_SOURCEINFO    2
#define KERNEL_ARG_ENCODE        3

static bool is64Bit() {
    return (sizeof(int*) == 8);
}

void PrintCL(const char* Format, ... ) {
    // define a pointer to save argument list
    va_list args;
    char buff[1024];
    // process the arguments into our debug buffer
    va_start(args, Format);
    vsprintf_s(buff, Format, args);
    va_end(args);

    if (GPU_CLMips) {
        GPU_CLMips->Print(buff);
    } else {
        printf(buff);
    }
}

void PrintOCLError(cl_int error) {
    switch (error) {
    case CL_DEVICE_NOT_FOUND:
        PrintCL("Error: CL_DEVICE_NOT_FOUND\n");
        break;
    case CL_DEVICE_NOT_AVAILABLE:
        PrintCL("Error: CL_DEVICE_NOT_AVAILABLE\n");
        break;
    case CL_COMPILER_NOT_AVAILABLE:
        PrintCL("Error: CL_COMPILER_NOT_AVAILABLE\n");
        break;
    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
        PrintCL("Error: CL_MEM_OBJECT_ALLOCATION_FAILURE\n");
        break;
    case CL_OUT_OF_RESOURCES:
        PrintCL("Error: CL_OUT_OF_RESOURCES\n");
        break;
    case CL_OUT_OF_HOST_MEMORY:
        PrintCL("Error: CL_OUT_OF_HOST_MEMORY\n");
        break;
    case CL_PROFILING_INFO_NOT_AVAILABLE:
        PrintCL("Error: CL_PROFILING_INFO_NOT_AVAILABLE\n");
        break;
    case CL_MEM_COPY_OVERLAP:
        PrintCL("Error: CL_MEM_COPY_OVERLAP\n");
        break;
    case CL_IMAGE_FORMAT_MISMATCH:
        PrintCL("Error: CL_IMAGE_FORMAT_MISMATCH\n");
        break;
    case CL_IMAGE_FORMAT_NOT_SUPPORTED:
        PrintCL("Error: CL_IMAGE_FORMAT_NOT_SUPPORTED\n");
        break;
    case CL_BUILD_PROGRAM_FAILURE:
        PrintCL("Error: CL_BUILD_PROGRAM_FAILURE\n");
        break;
    case CL_MAP_FAILURE:
        PrintCL("Error: CL_MAP_FAILURE\n");
        break;
    case CL_MISALIGNED_SUB_BUFFER_OFFSET:
        PrintCL("Error: CL_MISALIGNED_SUB_BUFFER_OFFSET\n");
        break;
    case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
        PrintCL("Error: CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n");
        break;
    case CL_COMPILE_PROGRAM_FAILURE:
        PrintCL("Error: CL_COMPILE_PROGRAM_FAILURE\n");
        break;
    case CL_LINKER_NOT_AVAILABLE:
        PrintCL("Error: CL_LINKER_NOT_AVAILABLE\n");
        break;
    case CL_LINK_PROGRAM_FAILURE:
        PrintCL("Error: CL_LINK_PROGRAM_FAILURE\n");
        break;
    case CL_DEVICE_PARTITION_FAILED:
        PrintCL("Error: CL_DEVICE_PARTITION_FAILED\n");
        break;
    case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
        PrintCL("Error: CL_KERNEL_ARG_INFO_NOT_AVAILABLE\n");
        break;
    case CL_INVALID_VALUE:
        PrintCL("Error: CL_INVALID_VALUE\n");
        break;
    case CL_INVALID_DEVICE_TYPE:
        PrintCL("Error: CL_INVALID_DEVICE_TYPE\n");
        break;
    case CL_INVALID_PLATFORM:
        PrintCL("Error: CL_INVALID_PLATFORM\n");
        break;
    case CL_INVALID_DEVICE:
        PrintCL("Error: CL_INVALID_DEVICE\n");
        break;
    case CL_INVALID_CONTEXT:
        PrintCL("Error: CL_INVALID_CONTEXT\n");
        break;
    case CL_INVALID_QUEUE_PROPERTIES:
        PrintCL("Error: CL_INVALID_QUEUE_PROPERTIES\n");
        break;
    case CL_INVALID_COMMAND_QUEUE:
        PrintCL("Error: CL_INVALID_COMMAND_QUEUE\n");
        break;
    case CL_INVALID_HOST_PTR:
        PrintCL("Error: CL_INVALID_HOST_PTR\n");
        break;
    case CL_INVALID_MEM_OBJECT:
        PrintCL("Error: CL_INVALID_MEM_OBJECT\n");
        break;
    case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
        PrintCL("Error: CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n");
        break;
    case CL_INVALID_IMAGE_SIZE:
        PrintCL("Error: CL_INVALID_IMAGE_SIZE\n");
        break;
    case CL_INVALID_SAMPLER:
        PrintCL("Error: CL_INVALID_SAMPLER\n");
        break;
    case CL_INVALID_BINARY:
        PrintCL("Error: CL_INVALID_BINARY\n");
        break;
    case CL_INVALID_BUILD_OPTIONS:
        PrintCL("Error: CL_INVALID_BUILD_OPTIONS\n");
        break;
    case CL_INVALID_PROGRAM:
        PrintCL("Error: CL_INVALID_PROGRAM\n");
        break;
    case CL_INVALID_PROGRAM_EXECUTABLE:
        PrintCL("Error: CL_INVALID_PROGRAM_EXECUTABLE\n");
        break;
    case CL_INVALID_KERNEL_NAME:
        PrintCL("Error: CL_INVALID_KERNEL_NAME\n");
        break;
    case CL_INVALID_KERNEL_DEFINITION:
        PrintCL("Error: CL_INVALID_KERNEL_DEFINITION\n");
        break;
    case CL_INVALID_KERNEL:
        PrintCL("Error: CL_INVALID_KERNEL\n");
        break;
    case CL_INVALID_ARG_INDEX:
        PrintCL("Error: CL_INVALID_ARG_INDEX\n");
        break;
    case CL_INVALID_ARG_VALUE:
        PrintCL("Error: CL_INVALID_ARG_VALUE\n");
        break;
    case CL_INVALID_ARG_SIZE:
        PrintCL("Error: CL_INVALID_ARG_SIZE\n");
        break;
    case CL_INVALID_KERNEL_ARGS:
        PrintCL("Error: CL_INVALID_KERNEL_ARGS\n");
        break;
    case CL_INVALID_WORK_DIMENSION:
        PrintCL("Error: CL_INVALID_WORK_DIMENSION\n");
        break;
    case CL_INVALID_WORK_GROUP_SIZE:
        PrintCL("Error: CL_INVALID_WORK_GROUP_SIZE\n");
        break;
    case CL_INVALID_WORK_ITEM_SIZE:
        PrintCL("Error: CL_INVALID_WORK_ITEM_SIZE\n");
        break;
    case CL_INVALID_GLOBAL_OFFSET:
        PrintCL("Error: CL_INVALID_GLOBAL_OFFSET\n");
        break;
    case CL_INVALID_EVENT_WAIT_LIST:
        PrintCL("Error: CL_INVALID_EVENT_WAIT_LIST\n");
        break;
    case CL_INVALID_EVENT:
        PrintCL("Error: CL_INVALID_EVENT\n");
        break;
    case CL_INVALID_OPERATION:
        PrintCL("Error: CL_INVALID_OPERATION\n");
        break;
    case CL_INVALID_GL_OBJECT:
        PrintCL("Error: CL_INVALID_GL_OBJECT\n");
        break;
    case CL_INVALID_BUFFER_SIZE:
        PrintCL("Error: CL_INVALID_BUFFER_SIZE\n");
        break;
    case CL_INVALID_MIP_LEVEL:
        PrintCL("Error: CL_INVALID_MIP_LEVEL\n");
        break;
    case CL_INVALID_GLOBAL_WORK_SIZE:
        PrintCL("Error: CL_INVALID_GLOBAL_WORK_SIZE\n");
        break;
    case CL_INVALID_PROPERTY:
        PrintCL("Error: CL_INVALID_PROPERTY\n");
        break;
    case CL_INVALID_IMAGE_DESCRIPTOR:
        PrintCL("Error: CL_INVALID_IMAGE_DESCRIPTOR\n");
        break;
    case CL_INVALID_COMPILER_OPTIONS:
        PrintCL("Error: CL_INVALID_COMPILER_OPTIONS\n");
        break;
    case CL_INVALID_LINKER_OPTIONS:
        PrintCL("Error: CL_INVALID_LINKER_OPTIONS\n");
        break;
    case CL_INVALID_DEVICE_PARTITION_COUNT:
        PrintCL("Error: CL_INVALID_DEVICE_PARTITION_COUNT\n");
        break;
    case CL_INVALID_PIPE_SIZE:
        PrintCL("Error: CL_INVALID_PIPE_SIZE\n");
        break;
    case CL_INVALID_DEVICE_QUEUE:
        PrintCL("Error: CL_INVALID_DEVICE_QUEUE\n");
        break;
    default:
        PrintCL("Error: UKNOWN 0x%X\n", error);
        break;
    }

}

//====================================== Framework Common Interfaces : OpenCL Compute  ==========================================

void COpenCL::Init() {
    query_timer::initialize();

    m_initDeviceOk              = false;
    m_programRun                = false;
    m_codecFormat               = CMP_FORMAT_Unknown;
    m_num_blocks                = 0;
    m_CmpMTxPerSec              = 0;
    m_computeShaderElapsedMS    = 0.0f;
    m_platform_id               = NULL;
    m_device_id                 = NULL;
    m_num_platforms             = 0;
    m_command_queue             = NULL;
    m_kernel                    = NULL;
    m_device_destination_buffer = NULL;
    m_device_source_buffer      = NULL;
    m_Source_Info_buffer        = NULL;
    m_Encoder_buffer            = NULL;
    m_program_encoder           = NULL;
    m_context                   = NULL;
    m_svmSupport                = false;
    m_svmData                   = NULL;
    p_program.buffer            = NULL;
    ocl_time_device             = 0;
    m_deviceName                = "";
    m_version                   = "";
    m_maxUCores                 = 12;

    //-------------------------
    // OpenCL compiler options
    //-------------------------
    long cmp_opt_size = sizeof(m_compile_options);
    m_compile_options[0] = 0;

    // Make all warnings into errors, use -w to Inhitit all warning messages
    // strncat_s(m_compile_options, cmp_opt_size, "-Werror ", _TRUNCATE);

    // single and double precision denormalized numbers may be flushed to zero
    strncat_s(m_compile_options, cmp_opt_size, "-cl-denorms-are-zero ", _TRUNCATE);

    // Looks for addtional include file in this sub-folder
    strncat_s(m_compile_options, cmp_opt_size, "-I ./Plugins/Compute/ ", _TRUNCATE);

    // Use this to debug with CodeXL or other debuggers
    // This option disables all optimizations
    // strncat_s(m_compile_options, cmp_opt_size, "-g -cl-opt-disable", _TRUNCATE);

    // User override options set after this init call
    m_force_rebuild = false;
}

COpenCL::COpenCL(ComputeOptions CLOptions) {
    Init();
    m_force_rebuild = CLOptions.force_rebuild;
}

COpenCL::~COpenCL() {
#ifdef ENABLE_SVM
    if (m_context && m_svmData)
        clSVMFree(m_context, m_svmData);
#endif

    if (m_context)
        clReleaseContext(m_context);

    if (m_programRun) {
        CleanUpKernelAndIOBuffers();
        CleanUpProgramEncoder();
    }


}

void COpenCL::SetComputeOptions(ComputeOptions *CLOptions) {
    m_force_rebuild = CLOptions->force_rebuild;
}

COpenCL::COpenCL(void *kerneloptions) {
    m_kernel_options = (KernelOptions *)kerneloptions;
    Init();
}

void COpenCL::CleanUpProgramEncoder() {
    // Encoder Program & Buffer
    if (p_program.buffer)
        delete[] p_program.buffer;
    if (m_program_encoder)
        clReleaseProgram(m_program_encoder);
}

void COpenCL::CleanUpKernelAndIOBuffers() {
    // Command Queues and Kernel functions
    if (m_command_queue)
        clReleaseCommandQueue(m_command_queue);
    if (m_kernel)
        clReleaseKernel(m_kernel);

    // IO Buffers
    if (m_Encoder_buffer)
        clReleaseMemObject(m_Encoder_buffer);
    if (m_Source_Info_buffer)
        clReleaseMemObject(m_Source_Info_buffer);
    if (m_device_destination_buffer)
        clReleaseMemObject(m_device_destination_buffer);
    if (m_device_source_buffer)
        clReleaseMemObject(m_device_source_buffer);
}

bool COpenCL::GetPlatformID() {
    //QUERY_PERFORMANCE("Get Platform ID ");

    m_result = clGetPlatformIDs(MAX_PLATFORMS, m_platform_ids, &m_num_platforms);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to get the GPU platforms!\n");
        PrintOCLError(m_result);
        return false;
    }
    return true;
}

bool COpenCL::SearchForGPU() {
    for (uint32_t i = 0; i < m_num_platforms; i++) {

        // Get the device ids.
        m_result = clGetDeviceIDs(m_platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &m_device_id, NULL);
        if (m_result == CL_SUCCESS) {

            m_platform_id = m_platform_ids[i];
            break;
        }

    } // end for

    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to get a GPU device!\n");
        PrintOCLError(m_result);
        return false;
    }

    return true;
}

bool COpenCL::GetDeviceInfo() {
    // Show the device info.
    char device_name[256] = { 0 };
    if (clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL) == CL_SUCCESS)
        m_deviceName = device_name;

    char openclVersion[1024];
    if (clGetDeviceInfo(m_device_id, CL_DEVICE_OPENCL_C_VERSION, sizeof(openclVersion), openclVersion, NULL) == CL_SUCCESS)
        m_version = openclVersion;

    if (clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(m_maxUCores), &m_maxUCores, NULL) != CL_SUCCESS) m_maxUCores = 12;

    // long long GlobalMem;
    // clGetDeviceInfo(m_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(long long), &GlobalMem, NULL);
    // PrintCL("Device Global Mem: %I64d Bytes\n", GlobalMem);
    //
    // long long LocalMem;
    // clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long long), &LocalMem, NULL);
    // PrintCL("Device Local  Mem: %I64d Bytes\n", LocalMem);
    //
    // size_t MaxWorkGroupSize;
    // clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(int), &MaxWorkGroupSize, NULL);
    // PrintCL("Max work Groups  :%ld\n", MaxWorkGroupSize);
    //
    // cl_ulong MaxConstDataSize;
    // clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &MaxConstDataSize, NULL);
    // PrintCL("Max Const Data   : %ld Bytes\n", MaxConstDataSize);

#ifdef  ENABLE_SVM
    // SVM Support
    cl_device_svm_capabilities caps;
    clGetDeviceInfo(m_device_id, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
    if (caps > 0) {
        m_svmSupport = true;
        PrintCL("SVM Course Grain Buffer Support: %s\n", (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? "Yes" : "No");
        PrintCL("SVM Fine Grain Buffer Support  : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? "Yes" : "No");
        PrintCL("SVM Fine Grain System Support  : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? "Yes" : "No");
        PrintCL("SVM Atomics                    : %s\n", (caps & CL_DEVICE_SVM_ATOMICS) ? "Yes" : "No");
    }
#endif

    return true;
}

bool COpenCL::CreateContext() {
    // Create a context.
    m_context = clCreateContext(NULL, 1, &m_device_id, NULL, NULL, &m_result);
    if (m_result != CL_SUCCESS) {

        PrintCL("Failed to create a context!\n");
        return false;
    }
    return true;
}

long COpenCL::file_size(FILE* p_file) {
    // Get the size of the program.
    if (fseek(p_file, 0, SEEK_END) != 0) return 0;
    long program_size = ftell(p_file);
    fseek(p_file, 0, SEEK_SET);

    return program_size;
}

bool COpenCL::load_file() {

#ifdef _DEBUG
    PrintCL("Loading [%s]\n",m_source_file.c_str());
#endif

    errno_t fopen_result;

    m_isBinary = false;

    if (!m_force_rebuild) {
        //===========================
        // Try loading the Binary file
        //===========================
        FILE* p_file_bin = NULL;
        std::string tmp = m_source_file;
        bool        rebuild    = false;

#ifdef _WIN32
        // Check build configuration of the shader, has it been modified since last use
        rebuild = cmp_recompile_shader(m_source_file);
#endif

        if (!rebuild) {
#ifdef _WIN32
            fopen_result = fopen_s(&p_file_bin, tmp.append(".cmp").c_str(), "rb");
            if (fopen_result != 0)
                rebuild = true;
#else
            p_file_bin = fopen(tmp.append(".cmp").c_str(), "rb");
            if (p_file_bin)
                rebuild = true;
#endif
        }

        // Found a .cmp file use it
        if (!rebuild) {
            OpenCLBinary_Header BinFile_Header;
            if (fread(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file_bin) != 1) {
                fclose(p_file_bin);
                PrintCL("Failed to read \"%s.cmp\" file header!\n", m_source_file.c_str());
                return false;
            }

            if (BinFile_Header.version != 1) {
                fclose(p_file_bin);
                PrintCL("File \"%s.cmp\" is not compatible with current application!\n", m_source_file.c_str());
                return false;
            }

            // entry to this code is reserved for CRC checks
            {

                m_isBinary = true;
                m_program_size = file_size(p_file_bin) - sizeof(OpenCLBinary_Header);

                if (m_program_size == 0) {
                    fclose(p_file_bin);
                    PrintCL("Failed to read \"%s.cmp\" file size!\n", m_source_file.c_str());
                    return false;
                }

                // Allocate memory for the program.
                p_program.ubuffer = new unsigned char[m_program_size];

                // Reposition to bin data that is after header
                fseek(p_file_bin, sizeof(OpenCLBinary_Header), SEEK_SET);

                // Read the program in to memory.
                if (fread(p_program.buffer, m_program_size, 1, p_file_bin) != 1) {
                    fclose(p_file_bin);
                    PrintCL("Failed to read \"%s.cmp\" in to memory!\n", m_source_file.c_str());
                    return false;
                }

                fclose(p_file_bin);
                return true;
            } // reserved for CRC type checks
        }
    } // !m_force_rebuild

    //===========================
    // Try loading the source file
    //===========================
    FILE* p_file_src = NULL;
    fopen_result = fopen_s(&p_file_src, m_source_file.c_str(), "rb");
    if (fopen_result == 0) {
        m_program_size = file_size(p_file_src);

        if (m_program_size == 0) {
            fclose(p_file_src);
            PrintCL("Failed to read \"%s\" file size!\n", m_source_file.c_str());
            return false;
        }

        // Allocate memory for the program.
        p_program.buffer = new  char[m_program_size];


        // Read the program in to memory.
        size_t read_size = fread(p_program.buffer, m_program_size, 1, p_file_src);
        if (read_size != 1) {
            fclose(p_file_src);
            PrintCL("Failed to read \"%s\" in to memory!\n", m_source_file.c_str());
            return false;
        }

        fclose(p_file_src);
        return true;
    }

    //===========================
    // Failed to load the file
    //===========================

    return false;
}

bool COpenCL::Create_Program_File() {
    //------------------------------
    // Load the Source or Binary file
    //------------------------------
    load_file();

    cl_int      result;

    if (!m_isBinary) {
        // Create the program.
        m_program_encoder = clCreateProgramWithSource(m_context, 1, const_cast< char const** >(&p_program.buffer), &m_program_size, &result);
        if (result != CL_SUCCESS) {

            PrintCL("Failed to create the program!\n");
            PrintOCLError(result);
            return false;
        }

        delete[] p_program.buffer;
        p_program.buffer = NULL;

        // Build the program.
        result = clBuildProgram(m_program_encoder, 1, &m_device_id, m_compile_options, NULL, NULL);
        if (result != CL_SUCCESS) {
            char message[LOG_BUFFER_SIZE];
            result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, LOG_BUFFER_SIZE, message, NULL);
            if (result != CL_SUCCESS) message[0] = char(0);
            //PrintCL("Failed to build the program!\n%s",message);
            printf("Failed to build the program!\n%s",message);
            return false;
        }

        size_t compiled_size = 0;
        result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARY_SIZES, sizeof(compiled_size), &compiled_size, NULL);

        uint8_t* p_binary = new uint8_t[compiled_size];
        result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARIES, sizeof(p_binary), &p_binary, NULL);

        // Save the compiled code
        FILE* p_file = NULL;
        errno_t fopen_result = fopen_s(&p_file, m_source_file.append(".cmp").c_str(), "wb");
        if (fopen_result == 0) {
            OpenCLBinary_Header BinFile_Header;
            BinFile_Header.version = 1;
            BinFile_Header.crc32 = 0;
            fwrite(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file);
            fwrite(p_binary, compiled_size, 1, p_file);
        }
        fclose(p_file);

    } // not precompiled code
    else {
        // Create the program.
        m_program_encoder = clCreateProgramWithBinary(m_context, 1, &m_device_id, &m_program_size, (const unsigned char **)&p_program.ubuffer, NULL, &result);

        if (result != CL_SUCCESS) {
            PrintCL("Failed to load the binary program!\n");
            PrintOCLError(result);
            return false;
        }

        delete[] p_program.buffer;
        p_program.buffer = NULL;

        // Build the program.
        result = clBuildProgram(m_program_encoder, 1, &m_device_id, NULL, NULL, NULL);
        if (result != CL_SUCCESS) {

            PrintCL("Failed to build the program!\n");
            PrintOCLError(result);

            char message[128 * 1024];
            result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, sizeof(message), message, NULL);
            if (result == CL_SUCCESS) {

                PrintCL(message);
            }

            return false;
        }
    }
    return true;
}

bool COpenCL::CreateProgramEncoder() {
//    QUERY_PERFORMANCE("Create Program  ");
    // Create the program.
    if (!Create_Program_File()) {
        return false;
    }
    return true;
}

bool COpenCL::CreateIOBuffers() {
#ifdef  ENABLE_SVM
    // AllocateBuffers()
    // SVM
    if (m_svmSupport && is64Bit() && (m_kernel_options->size > 0)) {
        // initialize any device/SVM memory here.
        m_svmData = clSVMAlloc(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL);
        if (m_svmData == NULL) {
            PrintCL("Failed to allocate the Encode buffer on the device!\n");
            return false;
        }
        m_kernel_options->dataSVM = m_svmData;
    }
#endif

    // allocate device 32bit buffer for the image source
    // idealy we want to not copy the buffer form host
    // Look into the following options clmap  and CL_MEM_AMD_PERSISTANT
    // Allocate the 32-bit source buffer in device memory.

    m_device_source_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, m_source_buffer_size, (void*)m_psource, &m_result);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to allocate the source buffer on the device!\n");
        return false;
    }

    m_Source_Info_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Source_Info), (void *)&m_SourceInfo, &m_result);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to allocate the source info buffer on the device!\n");
        PrintOCLError(m_result);
        return false;
    }

    // Allocate the destination buffer in device memory.
    m_device_destination_buffer = clCreateBuffer(m_context, CL_MEM_WRITE_ONLY, m_destination_size, NULL, &m_result);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to allocate the destination buffer on the device!\n");
        PrintOCLError(m_result);
        return false;
    }
    return true;
}

//#include "bcn_common_kernel.h" use this for debugging data to kernel when using BC15

bool  COpenCL::RunKernel() {
//    QUERY_PERFORMANCE("Run Kernel      ");

    // Get a handle to the kernel.
    m_kernel = clCreateKernel(m_program_encoder, "CMP_GPUEncoder", &m_result);
    if (m_result != CL_SUCCESS) {

        PrintCL("Failed to create the kernel!\n");
        PrintOCLError(m_result);
        return false;
    }

    // Create the command queue. with profiling enabled
    const cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    m_command_queue = clCreateCommandQueueWithProperties(m_context, m_device_id, properties, &m_result);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to create the command queue!\n");
        return false;
    }

    //====================================================================================

    // ImageSource
    m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCE, sizeof(m_device_source_buffer), &m_device_source_buffer);
    if (m_result != CL_SUCCESS) {

        PrintCL("Failed to set the source kernel argument!\n");
        PrintOCLError(m_result);
        return false;
    }

    //ImageDestination
    m_result = clSetKernelArg(m_kernel, KERNEL_ARG_DESTINATION, sizeof(m_device_destination_buffer), &m_device_destination_buffer);
    if (m_result != CL_SUCCESS) {

        PrintCL("Failed to set the destination kernel argument!\n");
        PrintOCLError(m_result);
        return false;
    }

    //SourceInfo
    m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCEINFO, sizeof(m_Source_Info_buffer), (void *)&m_Source_Info_buffer);
    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to set the source info argument!\n");
        PrintOCLError(m_result);
        return false;
    }

#ifdef ENABLE_SVM   // Don not enable unless CMP_GPUEncoder parameters are updated 
    if (m_svmSupport) {
        if (m_svmData) {
            /* reserve svm space for CPU update */
            m_result = clEnqueueSVMMap(m_command_queue,
                                       CL_TRUE, //blocking call
                                       CL_MAP_WRITE_INVALIDATE_REGION,
                                       m_svmData,
                                       m_kernel_options->size,
                                       0,
                                       NULL,
                                       NULL);

            m_kernel_options->dataSVM = m_svmData;

            if (!SVMInitCodec(m_kernel_options)) {

                PrintCL("Failed to initialize SVM Encode kernel data!\n");
                PrintOCLError(m_result);
                return false;
            }

            m_result = clEnqueueSVMUnmap(m_command_queue,
                                         m_svmData,
                                         0,
                                         NULL,
                                         NULL);

            // Set appropriate arguments to the kernel
            m_result = clSetKernelArgSVMPointer(m_kernel, KERNEL_ARG_ENCODE, (void *)(m_svmData));
            if (m_result != CL_SUCCESS) {

                PrintCL("Failed to set the SVM Encode kernel argument!\n");
                PrintOCLError(m_result);
                return false;
            }
        } // Encode ptr
        else {
            PrintCL("Failed to set the SVM Encode kernel argument, invalid pointer!\n");
            PrintOCLError(m_result);
            return false;
        }
    } else
#endif
    {
        m_Encoder_buffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL, &m_result);

        // Set argument for the compress()
        m_result = clSetKernelArg(m_kernel, KERNEL_ARG_ENCODE, sizeof(m_Encoder_buffer), (void *)&m_Encoder_buffer);
        if (m_result != CL_SUCCESS) {
            PrintCL("Failed to set the Encode block argument!\n");
            PrintOCLError(m_result);
            return false;
        }

        // CMP_BC15Options *temp = reinterpret_cast<CMP_BC15Options *>(m_kernel_options->data);
        // int blocksize = sizeof(CMP_BC15Options);

        m_result = clEnqueueWriteBuffer(m_command_queue, m_Encoder_buffer, CL_TRUE, 0, m_kernel_options->size, (void*)m_kernel_options->data, 0, NULL, NULL);
        if (m_result != CL_SUCCESS) {
            PrintCL("Failed to set the Encode block buffer!\n");
            PrintOCLError(m_result);
            return false;
        }
    }

    // Run the kernel.
    //----------------------------------
    //# todo: max numbers of blocks to launch on low end GPUs to avoid kernel timeout
    //int maxblockslaunch = MIN(m_width_in_blocks*m_height_in_blocks, 768 * (int)m_maxComputeUnits);

    size_t local_work_size[]  = { 8, 8 };
    size_t global_work_size[] = { 8, 8 };

    // Check for smal images < 64 width x 64 height in pixels
    if (m_width_in_blocks >= 8) {
        global_work_size[0] = ((m_width_in_blocks + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
    } else {
        local_work_size[0] = 1;
        global_work_size[0] = m_width_in_blocks;
    }

    if (m_height_in_blocks >= 8) {
        global_work_size[1] = ((m_height_in_blocks + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1];
    } else {
        local_work_size[1] = 1;
        global_work_size[1] = m_height_in_blocks;
    }

#ifdef USE_CPU_PERFORMANCE_COUNTERS
    cmp_cputimer cputimer;
    cputimer.initialize();
    cputimer.Start(0);
#else
    cl_event cl_perf_event = NULL;
#endif

    m_result = clEnqueueNDRangeKernel(m_command_queue,
                                      m_kernel,
                                      2,
                                      NULL,
                                      global_work_size,
                                      local_work_size,
                                      0, NULL,
#ifdef USE_CPU_PERFORMANCE_COUNTERS
                                      NULL
#else
                                      m_getPerfStats? &cl_perf_event:NULL
#endif
                                     );

    if (m_result != CL_SUCCESS) {

        PrintCL("Failed to launch the kernel!\n");
        PrintOCLError(m_result);
        return false;
    }

    // Wait until all queued commands have been processed and completed.
    if (m_getPerfStats)
        m_result = clFinish(m_command_queue);

#ifdef USE_CPU_PERFORMANCE_COUNTERS
    cputimer.Stop(0);
    m_computeShaderElapsedMS = cputimer.GetMS(0);
#endif

    // Check if performance event has been set and valid
    if (
        m_getPerfStats &&
#ifdef USE_CPU_PERFORMANCE_COUNTERS
        (m_computeShaderElapsedMS > 0) &&
#else
        cl_perf_event &&
#endif
        (m_result == CL_SUCCESS)) {
#ifdef USE_CPU_PERFORMANCE_COUNTERS
#else
        // Get the event data
        cl_ulong start = 0;
        cl_ulong end   = 0;
        cl_ulong nspercount = 1;

        m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

        if (m_result == CL_SUCCESS)
            m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

        // Get the number of nanoseconds per count
        cl_ulong resolution;
        if (m_result == CL_SUCCESS)
            m_result = clGetDeviceInfo(m_device_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(cl_ulong), &resolution, NULL);

        if (m_result == CL_SUCCESS)
            m_result = clReleaseEvent(cl_perf_event);

        if (m_result != CL_SUCCESS) {
            PrintCL("Failed clReleaseEvent!\n");
            PrintOCLError(m_result);
            return false;
        }

        // counters are in nano second incriments 1e-9f Convert
        m_num_blocks    = m_height_in_blocks*m_width_in_blocks;
        if (m_num_blocks == 0) m_num_blocks = 1;
        float nanoSeconds = (float)(end-start);
        // Convert nanosec to ms divide by 1e6f
        m_computeShaderElapsedMS = nanoSeconds/1e6f;
        // time to process a single block (4x4) which is 16 texels
        m_computeShaderElapsedMS = m_computeShaderElapsedMS/(float)m_num_blocks;
#endif
        if (m_computeShaderElapsedMS > 0) {
            float ElapsedSeconds      = m_computeShaderElapsedMS/1E3f;
            float ElapsedSecondsPerTx = ElapsedSeconds/16;
            float TxPerSec            = 1/ElapsedSecondsPerTx;
            // time to process a 1M texels in a second
            m_CmpMTxPerSec            = TxPerSec/1E6f;
        } else
            m_CmpMTxPerSec = 0;
    }

    return true;
}

bool  COpenCL::GetResults() {
//    QUERY_PERFORMANCE("Get Results     ");

    // Copy the results from device to host memory.
    m_result = clEnqueueReadBuffer(m_command_queue, m_device_destination_buffer, true, 0, m_destination_size, p_destination, 0, NULL, NULL);

    if (m_result != CL_SUCCESS) {
        PrintCL("Failed to copy the results from the device!\n");
        PrintOCLError(m_result);
        return false;
    }

    return true;
}

float COpenCL::GetProcessElapsedTimeMS() {
    return m_computeShaderElapsedMS;
}

float COpenCL::GetMTxPerSec() {
    return m_CmpMTxPerSec;
}

int   COpenCL::GetBlockSize() {
    return m_num_blocks;
}

int   COpenCL::GetMaxUCores() {
    return m_maxUCores;
}

const char* COpenCL::GetDeviceName() {
    return m_deviceName.c_str();
}

const char* COpenCL::GetVersion() {
    return m_version.c_str();
}

CMP_ERROR COpenCL::Compress( KernelOptions *KernelOptions, MipSet  &srcTexture, MipSet  &destTexture,CMP_Feedback_Proc pFeedback = NULL) {
    bool newFormat = false;
    if (m_codecFormat != destTexture.m_format) {
        m_codecFormat = destTexture.m_format;
        newFormat = true;
    }

    if (m_codecFormat == CMP_FORMAT_Unknown) {
        // dont know how to progress this destination
        return (CMP_ERR_GENERIC);
    }

    m_source_file = KernelOptions->srcfile;
    if (m_source_file.length() == 0) return CMP_ERR_NOSHADER_CODE_DEFINED;


    m_getPerfStats              = KernelOptions->getPerfStats && (destTexture.m_nIterations < 1);

    m_kernel_options->data      = KernelOptions->data;
    m_kernel_options->size      = KernelOptions->size;
    m_kernel_options->format    = KernelOptions->format;
    m_kernel_options->dataSVM   = KernelOptions->dataSVM;;

    m_source_buffer_size        = srcTexture.dwDataSize;

    p_destination               = destTexture.pData;
    m_destination_size          = destTexture.dwDataSize;

    if (destTexture.m_nBlockWidth > 0)
        m_width_in_blocks = (cl_uint)srcTexture.dwWidth / destTexture.m_nBlockWidth;
    else {
        return CMP_ERR_GENERIC;
    }

    if (destTexture.m_nBlockHeight > 0)
        m_height_in_blocks = (cl_uint)srcTexture.dwHeight / destTexture.m_nBlockHeight;
    else {
        return CMP_ERR_GENERIC;
    }

    m_psource = (CMP_Vec4uc *)srcTexture.pData;
    m_SourceInfo.m_src_height       = srcTexture.dwHeight;
    m_SourceInfo.m_src_width        = srcTexture.dwWidth;
    m_SourceInfo.m_width_in_blocks  = m_width_in_blocks;
    m_SourceInfo.m_height_in_blocks = m_height_in_blocks;
    m_SourceInfo.m_fquality         = KernelOptions->fquality;

    // Using OpenCL - Setit up and call the kernel function
    bool ok = true;

    // check for first time use on host device
    if (!m_initDeviceOk) {
        if (        GetPlatformID() == false)    ok = false;
        if (ok &&   (SearchForGPU() == false))   ok = false;
        if (ok && (GetDeviceInfo()  == false))   ok = false;
        if (ok &&  (CreateContext() == false))   ok = false;
        m_initDeviceOk = ok;
    }
    if (m_programRun) {
        CleanUpKernelAndIOBuffers();
        if (newFormat)
            CleanUpProgramEncoder();
        m_programRun = false;
    }
    if (newFormat) {
        if (ok && (CreateProgramEncoder()   == false)) ok = false;
    }
    if (ok && (CreateIOBuffers() == false)) ok = false;
    if (ok && (RunKernel()       == false)) ok = false;
    if (ok && (GetResults()      == false)) ok = false;

    if (ok) {
        m_programRun = true;
        return CMP_OK;
    }

    return(CMP_ERR_GENERIC);

}