1081 lines
36 KiB
C++

//=============================================================================
// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==============================================================================
#include "copencl.h"
#ifdef USE_CPU_PERFORMANCE_COUNTERS
#include "query_timer.h" // can use CPU timing but pref is to use GPU counters
#endif
#ifdef _M_X64
//#define ENABLE_SVM //- disable for now, causing build issue in 64bit release
#endif
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define fp64_supported 1
#elif defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define fp64_supported 1
#else
// "Double precision floating point not supported by OpenCL implementation."
#define fp64_supported 0
#endif
#ifdef ENABLE_SVM
extern bool SVMInitCodec(KernelOptions *options);
#endif
extern CMIPS *GPU_CLMips;
#define LOG_BUFFER_SIZE 102400
#define KERNEL_ARG_SOURCE 0
#define KERNEL_ARG_DESTINATION 1
#define KERNEL_ARG_SOURCEINFO 2
#define KERNEL_ARG_ENCODE 3
static bool is64Bit() {
return (sizeof(int*) == 8);
}
void PrintCL(const char* Format, ... ) {
// define a pointer to save argument list
va_list args;
char buff[1024];
// process the arguments into our debug buffer
va_start(args, Format);
vsprintf_s(buff, Format, args);
va_end(args);
if (GPU_CLMips) {
GPU_CLMips->Print(buff);
} else {
printf(buff);
}
}
void PrintOCLError(cl_int error) {
switch (error) {
case CL_DEVICE_NOT_FOUND:
PrintCL("Error: CL_DEVICE_NOT_FOUND\n");
break;
case CL_DEVICE_NOT_AVAILABLE:
PrintCL("Error: CL_DEVICE_NOT_AVAILABLE\n");
break;
case CL_COMPILER_NOT_AVAILABLE:
PrintCL("Error: CL_COMPILER_NOT_AVAILABLE\n");
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
PrintCL("Error: CL_MEM_OBJECT_ALLOCATION_FAILURE\n");
break;
case CL_OUT_OF_RESOURCES:
PrintCL("Error: CL_OUT_OF_RESOURCES\n");
break;
case CL_OUT_OF_HOST_MEMORY:
PrintCL("Error: CL_OUT_OF_HOST_MEMORY\n");
break;
case CL_PROFILING_INFO_NOT_AVAILABLE:
PrintCL("Error: CL_PROFILING_INFO_NOT_AVAILABLE\n");
break;
case CL_MEM_COPY_OVERLAP:
PrintCL("Error: CL_MEM_COPY_OVERLAP\n");
break;
case CL_IMAGE_FORMAT_MISMATCH:
PrintCL("Error: CL_IMAGE_FORMAT_MISMATCH\n");
break;
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
PrintCL("Error: CL_IMAGE_FORMAT_NOT_SUPPORTED\n");
break;
case CL_BUILD_PROGRAM_FAILURE:
PrintCL("Error: CL_BUILD_PROGRAM_FAILURE\n");
break;
case CL_MAP_FAILURE:
PrintCL("Error: CL_MAP_FAILURE\n");
break;
case CL_MISALIGNED_SUB_BUFFER_OFFSET:
PrintCL("Error: CL_MISALIGNED_SUB_BUFFER_OFFSET\n");
break;
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
PrintCL("Error: CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n");
break;
case CL_COMPILE_PROGRAM_FAILURE:
PrintCL("Error: CL_COMPILE_PROGRAM_FAILURE\n");
break;
case CL_LINKER_NOT_AVAILABLE:
PrintCL("Error: CL_LINKER_NOT_AVAILABLE\n");
break;
case CL_LINK_PROGRAM_FAILURE:
PrintCL("Error: CL_LINK_PROGRAM_FAILURE\n");
break;
case CL_DEVICE_PARTITION_FAILED:
PrintCL("Error: CL_DEVICE_PARTITION_FAILED\n");
break;
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
PrintCL("Error: CL_KERNEL_ARG_INFO_NOT_AVAILABLE\n");
break;
case CL_INVALID_VALUE:
PrintCL("Error: CL_INVALID_VALUE\n");
break;
case CL_INVALID_DEVICE_TYPE:
PrintCL("Error: CL_INVALID_DEVICE_TYPE\n");
break;
case CL_INVALID_PLATFORM:
PrintCL("Error: CL_INVALID_PLATFORM\n");
break;
case CL_INVALID_DEVICE:
PrintCL("Error: CL_INVALID_DEVICE\n");
break;
case CL_INVALID_CONTEXT:
PrintCL("Error: CL_INVALID_CONTEXT\n");
break;
case CL_INVALID_QUEUE_PROPERTIES:
PrintCL("Error: CL_INVALID_QUEUE_PROPERTIES\n");
break;
case CL_INVALID_COMMAND_QUEUE:
PrintCL("Error: CL_INVALID_COMMAND_QUEUE\n");
break;
case CL_INVALID_HOST_PTR:
PrintCL("Error: CL_INVALID_HOST_PTR\n");
break;
case CL_INVALID_MEM_OBJECT:
PrintCL("Error: CL_INVALID_MEM_OBJECT\n");
break;
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
PrintCL("Error: CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n");
break;
case CL_INVALID_IMAGE_SIZE:
PrintCL("Error: CL_INVALID_IMAGE_SIZE\n");
break;
case CL_INVALID_SAMPLER:
PrintCL("Error: CL_INVALID_SAMPLER\n");
break;
case CL_INVALID_BINARY:
PrintCL("Error: CL_INVALID_BINARY\n");
break;
case CL_INVALID_BUILD_OPTIONS:
PrintCL("Error: CL_INVALID_BUILD_OPTIONS\n");
break;
case CL_INVALID_PROGRAM:
PrintCL("Error: CL_INVALID_PROGRAM\n");
break;
case CL_INVALID_PROGRAM_EXECUTABLE:
PrintCL("Error: CL_INVALID_PROGRAM_EXECUTABLE\n");
break;
case CL_INVALID_KERNEL_NAME:
PrintCL("Error: CL_INVALID_KERNEL_NAME\n");
break;
case CL_INVALID_KERNEL_DEFINITION:
PrintCL("Error: CL_INVALID_KERNEL_DEFINITION\n");
break;
case CL_INVALID_KERNEL:
PrintCL("Error: CL_INVALID_KERNEL\n");
break;
case CL_INVALID_ARG_INDEX:
PrintCL("Error: CL_INVALID_ARG_INDEX\n");
break;
case CL_INVALID_ARG_VALUE:
PrintCL("Error: CL_INVALID_ARG_VALUE\n");
break;
case CL_INVALID_ARG_SIZE:
PrintCL("Error: CL_INVALID_ARG_SIZE\n");
break;
case CL_INVALID_KERNEL_ARGS:
PrintCL("Error: CL_INVALID_KERNEL_ARGS\n");
break;
case CL_INVALID_WORK_DIMENSION:
PrintCL("Error: CL_INVALID_WORK_DIMENSION\n");
break;
case CL_INVALID_WORK_GROUP_SIZE:
PrintCL("Error: CL_INVALID_WORK_GROUP_SIZE\n");
break;
case CL_INVALID_WORK_ITEM_SIZE:
PrintCL("Error: CL_INVALID_WORK_ITEM_SIZE\n");
break;
case CL_INVALID_GLOBAL_OFFSET:
PrintCL("Error: CL_INVALID_GLOBAL_OFFSET\n");
break;
case CL_INVALID_EVENT_WAIT_LIST:
PrintCL("Error: CL_INVALID_EVENT_WAIT_LIST\n");
break;
case CL_INVALID_EVENT:
PrintCL("Error: CL_INVALID_EVENT\n");
break;
case CL_INVALID_OPERATION:
PrintCL("Error: CL_INVALID_OPERATION\n");
break;
case CL_INVALID_GL_OBJECT:
PrintCL("Error: CL_INVALID_GL_OBJECT\n");
break;
case CL_INVALID_BUFFER_SIZE:
PrintCL("Error: CL_INVALID_BUFFER_SIZE\n");
break;
case CL_INVALID_MIP_LEVEL:
PrintCL("Error: CL_INVALID_MIP_LEVEL\n");
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
PrintCL("Error: CL_INVALID_GLOBAL_WORK_SIZE\n");
break;
case CL_INVALID_PROPERTY:
PrintCL("Error: CL_INVALID_PROPERTY\n");
break;
case CL_INVALID_IMAGE_DESCRIPTOR:
PrintCL("Error: CL_INVALID_IMAGE_DESCRIPTOR\n");
break;
case CL_INVALID_COMPILER_OPTIONS:
PrintCL("Error: CL_INVALID_COMPILER_OPTIONS\n");
break;
case CL_INVALID_LINKER_OPTIONS:
PrintCL("Error: CL_INVALID_LINKER_OPTIONS\n");
break;
case CL_INVALID_DEVICE_PARTITION_COUNT:
PrintCL("Error: CL_INVALID_DEVICE_PARTITION_COUNT\n");
break;
case CL_INVALID_PIPE_SIZE:
PrintCL("Error: CL_INVALID_PIPE_SIZE\n");
break;
case CL_INVALID_DEVICE_QUEUE:
PrintCL("Error: CL_INVALID_DEVICE_QUEUE\n");
break;
default:
PrintCL("Error: UKNOWN 0x%X\n", error);
break;
}
}
//====================================== Framework Common Interfaces : OpenCL Compute ==========================================
void COpenCL::Init() {
query_timer::initialize();
m_initDeviceOk = false;
m_programRun = false;
m_codecFormat = CMP_FORMAT_Unknown;
m_num_blocks = 0;
m_CmpMTxPerSec = 0;
m_computeShaderElapsedMS = 0.0f;
m_platform_id = NULL;
m_device_id = NULL;
m_num_platforms = 0;
m_command_queue = NULL;
m_kernel = NULL;
m_device_destination_buffer = NULL;
m_device_source_buffer = NULL;
m_Source_Info_buffer = NULL;
m_Encoder_buffer = NULL;
m_program_encoder = NULL;
m_context = NULL;
m_svmSupport = false;
m_svmData = NULL;
p_program.buffer = NULL;
ocl_time_device = 0;
m_deviceName = "";
m_version = "";
m_maxUCores = 12;
//-------------------------
// OpenCL compiler options
//-------------------------
long cmp_opt_size = sizeof(m_compile_options);
m_compile_options[0] = 0;
// Make all warnings into errors, use -w to Inhitit all warning messages
// strncat_s(m_compile_options, cmp_opt_size, "-Werror ", _TRUNCATE);
// single and double precision denormalized numbers may be flushed to zero
strncat_s(m_compile_options, cmp_opt_size, "-cl-denorms-are-zero ", _TRUNCATE);
// Looks for addtional include file in this sub-folder
strncat_s(m_compile_options, cmp_opt_size, "-I ./Plugins/Compute/ ", _TRUNCATE);
// Use this to debug with CodeXL or other debuggers
// This option disables all optimizations
// strncat_s(m_compile_options, cmp_opt_size, "-g -cl-opt-disable", _TRUNCATE);
// User override options set after this init call
m_force_rebuild = false;
}
COpenCL::COpenCL(ComputeOptions CLOptions) {
Init();
m_force_rebuild = CLOptions.force_rebuild;
}
COpenCL::~COpenCL() {
#ifdef ENABLE_SVM
if (m_context && m_svmData)
clSVMFree(m_context, m_svmData);
#endif
if (m_context)
clReleaseContext(m_context);
if (m_programRun) {
CleanUpKernelAndIOBuffers();
CleanUpProgramEncoder();
}
}
void COpenCL::SetComputeOptions(ComputeOptions *CLOptions) {
m_force_rebuild = CLOptions->force_rebuild;
}
COpenCL::COpenCL(void *kerneloptions) {
m_kernel_options = (KernelOptions *)kerneloptions;
Init();
}
void COpenCL::CleanUpProgramEncoder() {
// Encoder Program & Buffer
if (p_program.buffer)
delete[] p_program.buffer;
if (m_program_encoder)
clReleaseProgram(m_program_encoder);
}
void COpenCL::CleanUpKernelAndIOBuffers() {
// Command Queues and Kernel functions
if (m_command_queue)
clReleaseCommandQueue(m_command_queue);
if (m_kernel)
clReleaseKernel(m_kernel);
// IO Buffers
if (m_Encoder_buffer)
clReleaseMemObject(m_Encoder_buffer);
if (m_Source_Info_buffer)
clReleaseMemObject(m_Source_Info_buffer);
if (m_device_destination_buffer)
clReleaseMemObject(m_device_destination_buffer);
if (m_device_source_buffer)
clReleaseMemObject(m_device_source_buffer);
}
bool COpenCL::GetPlatformID() {
//QUERY_PERFORMANCE("Get Platform ID ");
m_result = clGetPlatformIDs(MAX_PLATFORMS, m_platform_ids, &m_num_platforms);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to get the GPU platforms!\n");
PrintOCLError(m_result);
return false;
}
return true;
}
bool COpenCL::SearchForGPU() {
for (uint32_t i = 0; i < m_num_platforms; i++) {
// Get the device ids.
m_result = clGetDeviceIDs(m_platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &m_device_id, NULL);
if (m_result == CL_SUCCESS) {
m_platform_id = m_platform_ids[i];
break;
}
} // end for
if (m_result != CL_SUCCESS) {
PrintCL("Failed to get a GPU device!\n");
PrintOCLError(m_result);
return false;
}
return true;
}
bool COpenCL::GetDeviceInfo() {
// Show the device info.
char device_name[256] = { 0 };
if (clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL) == CL_SUCCESS)
m_deviceName = device_name;
char openclVersion[1024];
if (clGetDeviceInfo(m_device_id, CL_DEVICE_OPENCL_C_VERSION, sizeof(openclVersion), openclVersion, NULL) == CL_SUCCESS)
m_version = openclVersion;
if (clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(m_maxUCores), &m_maxUCores, NULL) != CL_SUCCESS) m_maxUCores = 12;
// long long GlobalMem;
// clGetDeviceInfo(m_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(long long), &GlobalMem, NULL);
// PrintCL("Device Global Mem: %I64d Bytes\n", GlobalMem);
//
// long long LocalMem;
// clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long long), &LocalMem, NULL);
// PrintCL("Device Local Mem: %I64d Bytes\n", LocalMem);
//
// size_t MaxWorkGroupSize;
// clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(int), &MaxWorkGroupSize, NULL);
// PrintCL("Max work Groups :%ld\n", MaxWorkGroupSize);
//
// cl_ulong MaxConstDataSize;
// clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &MaxConstDataSize, NULL);
// PrintCL("Max Const Data : %ld Bytes\n", MaxConstDataSize);
#ifdef ENABLE_SVM
// SVM Support
cl_device_svm_capabilities caps;
clGetDeviceInfo(m_device_id, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
if (caps > 0) {
m_svmSupport = true;
PrintCL("SVM Course Grain Buffer Support: %s\n", (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? "Yes" : "No");
PrintCL("SVM Fine Grain Buffer Support : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? "Yes" : "No");
PrintCL("SVM Fine Grain System Support : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? "Yes" : "No");
PrintCL("SVM Atomics : %s\n", (caps & CL_DEVICE_SVM_ATOMICS) ? "Yes" : "No");
}
#endif
return true;
}
bool COpenCL::CreateContext() {
// Create a context.
m_context = clCreateContext(NULL, 1, &m_device_id, NULL, NULL, &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to create a context!\n");
return false;
}
return true;
}
long COpenCL::file_size(FILE* p_file) {
// Get the size of the program.
if (fseek(p_file, 0, SEEK_END) != 0) return 0;
long program_size = ftell(p_file);
fseek(p_file, 0, SEEK_SET);
return program_size;
}
bool COpenCL::load_file() {
#ifdef _DEBUG
PrintCL("Loading [%s]\n",m_source_file.c_str());
#endif
errno_t fopen_result;
m_isBinary = false;
if (!m_force_rebuild) {
//===========================
// Try loading the Binary file
//===========================
FILE* p_file_bin = NULL;
std::string tmp = m_source_file;
bool rebuild = false;
#ifdef _WIN32
// Check build configuration of the shader, has it been modified since last use
rebuild = cmp_recompile_shader(m_source_file);
#endif
if (!rebuild) {
#ifdef _WIN32
fopen_result = fopen_s(&p_file_bin, tmp.append(".cmp").c_str(), "rb");
if (fopen_result != 0)
rebuild = true;
#else
p_file_bin = fopen(tmp.append(".cmp").c_str(), "rb");
if (p_file_bin)
rebuild = true;
#endif
}
// Found a .cmp file use it
if (!rebuild) {
OpenCLBinary_Header BinFile_Header;
if (fread(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file_bin) != 1) {
fclose(p_file_bin);
PrintCL("Failed to read \"%s.cmp\" file header!\n", m_source_file.c_str());
return false;
}
if (BinFile_Header.version != 1) {
fclose(p_file_bin);
PrintCL("File \"%s.cmp\" is not compatible with current application!\n", m_source_file.c_str());
return false;
}
// entry to this code is reserved for CRC checks
{
m_isBinary = true;
m_program_size = file_size(p_file_bin) - sizeof(OpenCLBinary_Header);
if (m_program_size == 0) {
fclose(p_file_bin);
PrintCL("Failed to read \"%s.cmp\" file size!\n", m_source_file.c_str());
return false;
}
// Allocate memory for the program.
p_program.ubuffer = new unsigned char[m_program_size];
// Reposition to bin data that is after header
fseek(p_file_bin, sizeof(OpenCLBinary_Header), SEEK_SET);
// Read the program in to memory.
if (fread(p_program.buffer, m_program_size, 1, p_file_bin) != 1) {
fclose(p_file_bin);
PrintCL("Failed to read \"%s.cmp\" in to memory!\n", m_source_file.c_str());
return false;
}
fclose(p_file_bin);
return true;
} // reserved for CRC type checks
}
} // !m_force_rebuild
//===========================
// Try loading the source file
//===========================
FILE* p_file_src = NULL;
fopen_result = fopen_s(&p_file_src, m_source_file.c_str(), "rb");
if (fopen_result == 0) {
m_program_size = file_size(p_file_src);
if (m_program_size == 0) {
fclose(p_file_src);
PrintCL("Failed to read \"%s\" file size!\n", m_source_file.c_str());
return false;
}
// Allocate memory for the program.
p_program.buffer = new char[m_program_size];
// Read the program in to memory.
size_t read_size = fread(p_program.buffer, m_program_size, 1, p_file_src);
if (read_size != 1) {
fclose(p_file_src);
PrintCL("Failed to read \"%s\" in to memory!\n", m_source_file.c_str());
return false;
}
fclose(p_file_src);
return true;
}
//===========================
// Failed to load the file
//===========================
return false;
}
bool COpenCL::Create_Program_File() {
//------------------------------
// Load the Source or Binary file
//------------------------------
load_file();
cl_int result;
if (!m_isBinary) {
// Create the program.
m_program_encoder = clCreateProgramWithSource(m_context, 1, const_cast< char const** >(&p_program.buffer), &m_program_size, &result);
if (result != CL_SUCCESS) {
PrintCL("Failed to create the program!\n");
PrintOCLError(result);
return false;
}
delete[] p_program.buffer;
p_program.buffer = NULL;
// Build the program.
result = clBuildProgram(m_program_encoder, 1, &m_device_id, m_compile_options, NULL, NULL);
if (result != CL_SUCCESS) {
char message[LOG_BUFFER_SIZE];
result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, LOG_BUFFER_SIZE, message, NULL);
if (result != CL_SUCCESS) message[0] = char(0);
//PrintCL("Failed to build the program!\n%s",message);
printf("Failed to build the program!\n%s",message);
return false;
}
size_t compiled_size = 0;
result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARY_SIZES, sizeof(compiled_size), &compiled_size, NULL);
uint8_t* p_binary = new uint8_t[compiled_size];
result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARIES, sizeof(p_binary), &p_binary, NULL);
// Save the compiled code
FILE* p_file = NULL;
errno_t fopen_result = fopen_s(&p_file, m_source_file.append(".cmp").c_str(), "wb");
if (fopen_result == 0) {
OpenCLBinary_Header BinFile_Header;
BinFile_Header.version = 1;
BinFile_Header.crc32 = 0;
fwrite(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file);
fwrite(p_binary, compiled_size, 1, p_file);
}
fclose(p_file);
} // not precompiled code
else {
// Create the program.
m_program_encoder = clCreateProgramWithBinary(m_context, 1, &m_device_id, &m_program_size, (const unsigned char **)&p_program.ubuffer, NULL, &result);
if (result != CL_SUCCESS) {
PrintCL("Failed to load the binary program!\n");
PrintOCLError(result);
return false;
}
delete[] p_program.buffer;
p_program.buffer = NULL;
// Build the program.
result = clBuildProgram(m_program_encoder, 1, &m_device_id, NULL, NULL, NULL);
if (result != CL_SUCCESS) {
PrintCL("Failed to build the program!\n");
PrintOCLError(result);
char message[128 * 1024];
result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, sizeof(message), message, NULL);
if (result == CL_SUCCESS) {
PrintCL(message);
}
return false;
}
}
return true;
}
bool COpenCL::CreateProgramEncoder() {
// QUERY_PERFORMANCE("Create Program ");
// Create the program.
if (!Create_Program_File()) {
return false;
}
return true;
}
bool COpenCL::CreateIOBuffers() {
#ifdef ENABLE_SVM
// AllocateBuffers()
// SVM
if (m_svmSupport && is64Bit() && (m_kernel_options->size > 0)) {
// initialize any device/SVM memory here.
m_svmData = clSVMAlloc(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL);
if (m_svmData == NULL) {
PrintCL("Failed to allocate the Encode buffer on the device!\n");
return false;
}
m_kernel_options->dataSVM = m_svmData;
}
#endif
// allocate device 32bit buffer for the image source
// idealy we want to not copy the buffer form host
// Look into the following options clmap and CL_MEM_AMD_PERSISTANT
// Allocate the 32-bit source buffer in device memory.
m_device_source_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, m_source_buffer_size, (void*)m_psource, &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to allocate the source buffer on the device!\n");
return false;
}
m_Source_Info_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Source_Info), (void *)&m_SourceInfo, &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to allocate the source info buffer on the device!\n");
PrintOCLError(m_result);
return false;
}
// Allocate the destination buffer in device memory.
m_device_destination_buffer = clCreateBuffer(m_context, CL_MEM_WRITE_ONLY, m_destination_size, NULL, &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to allocate the destination buffer on the device!\n");
PrintOCLError(m_result);
return false;
}
return true;
}
//#include "bcn_common_kernel.h" use this for debugging data to kernel when using BC15
bool COpenCL::RunKernel() {
// QUERY_PERFORMANCE("Run Kernel ");
// Get a handle to the kernel.
m_kernel = clCreateKernel(m_program_encoder, "CMP_GPUEncoder", &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to create the kernel!\n");
PrintOCLError(m_result);
return false;
}
// Create the command queue. with profiling enabled
const cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
m_command_queue = clCreateCommandQueueWithProperties(m_context, m_device_id, properties, &m_result);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to create the command queue!\n");
return false;
}
//====================================================================================
// ImageSource
m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCE, sizeof(m_device_source_buffer), &m_device_source_buffer);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the source kernel argument!\n");
PrintOCLError(m_result);
return false;
}
//ImageDestination
m_result = clSetKernelArg(m_kernel, KERNEL_ARG_DESTINATION, sizeof(m_device_destination_buffer), &m_device_destination_buffer);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the destination kernel argument!\n");
PrintOCLError(m_result);
return false;
}
//SourceInfo
m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCEINFO, sizeof(m_Source_Info_buffer), (void *)&m_Source_Info_buffer);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the source info argument!\n");
PrintOCLError(m_result);
return false;
}
#ifdef ENABLE_SVM // Don not enable unless CMP_GPUEncoder parameters are updated
if (m_svmSupport) {
if (m_svmData) {
/* reserve svm space for CPU update */
m_result = clEnqueueSVMMap(m_command_queue,
CL_TRUE, //blocking call
CL_MAP_WRITE_INVALIDATE_REGION,
m_svmData,
m_kernel_options->size,
0,
NULL,
NULL);
m_kernel_options->dataSVM = m_svmData;
if (!SVMInitCodec(m_kernel_options)) {
PrintCL("Failed to initialize SVM Encode kernel data!\n");
PrintOCLError(m_result);
return false;
}
m_result = clEnqueueSVMUnmap(m_command_queue,
m_svmData,
0,
NULL,
NULL);
// Set appropriate arguments to the kernel
m_result = clSetKernelArgSVMPointer(m_kernel, KERNEL_ARG_ENCODE, (void *)(m_svmData));
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the SVM Encode kernel argument!\n");
PrintOCLError(m_result);
return false;
}
} // Encode ptr
else {
PrintCL("Failed to set the SVM Encode kernel argument, invalid pointer!\n");
PrintOCLError(m_result);
return false;
}
} else
#endif
{
m_Encoder_buffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL, &m_result);
// Set argument for the compress()
m_result = clSetKernelArg(m_kernel, KERNEL_ARG_ENCODE, sizeof(m_Encoder_buffer), (void *)&m_Encoder_buffer);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the Encode block argument!\n");
PrintOCLError(m_result);
return false;
}
// CMP_BC15Options *temp = reinterpret_cast<CMP_BC15Options *>(m_kernel_options->data);
// int blocksize = sizeof(CMP_BC15Options);
m_result = clEnqueueWriteBuffer(m_command_queue, m_Encoder_buffer, CL_TRUE, 0, m_kernel_options->size, (void*)m_kernel_options->data, 0, NULL, NULL);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to set the Encode block buffer!\n");
PrintOCLError(m_result);
return false;
}
}
// Run the kernel.
//----------------------------------
//# todo: max numbers of blocks to launch on low end GPUs to avoid kernel timeout
//int maxblockslaunch = MIN(m_width_in_blocks*m_height_in_blocks, 768 * (int)m_maxComputeUnits);
size_t local_work_size[] = { 8, 8 };
size_t global_work_size[] = { 8, 8 };
// Check for smal images < 64 width x 64 height in pixels
if (m_width_in_blocks >= 8) {
global_work_size[0] = ((m_width_in_blocks + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
} else {
local_work_size[0] = 1;
global_work_size[0] = m_width_in_blocks;
}
if (m_height_in_blocks >= 8) {
global_work_size[1] = ((m_height_in_blocks + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1];
} else {
local_work_size[1] = 1;
global_work_size[1] = m_height_in_blocks;
}
#ifdef USE_CPU_PERFORMANCE_COUNTERS
cmp_cputimer cputimer;
cputimer.initialize();
cputimer.Start(0);
#else
cl_event cl_perf_event = NULL;
#endif
m_result = clEnqueueNDRangeKernel(m_command_queue,
m_kernel,
2,
NULL,
global_work_size,
local_work_size,
0, NULL,
#ifdef USE_CPU_PERFORMANCE_COUNTERS
NULL
#else
m_getPerfStats? &cl_perf_event:NULL
#endif
);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to launch the kernel!\n");
PrintOCLError(m_result);
return false;
}
// Wait until all queued commands have been processed and completed.
if (m_getPerfStats)
m_result = clFinish(m_command_queue);
#ifdef USE_CPU_PERFORMANCE_COUNTERS
cputimer.Stop(0);
m_computeShaderElapsedMS = cputimer.GetMS(0);
#endif
// Check if performance event has been set and valid
if (
m_getPerfStats &&
#ifdef USE_CPU_PERFORMANCE_COUNTERS
(m_computeShaderElapsedMS > 0) &&
#else
cl_perf_event &&
#endif
(m_result == CL_SUCCESS)) {
#ifdef USE_CPU_PERFORMANCE_COUNTERS
#else
// Get the event data
cl_ulong start = 0;
cl_ulong end = 0;
cl_ulong nspercount = 1;
m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
if (m_result == CL_SUCCESS)
m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
// Get the number of nanoseconds per count
cl_ulong resolution;
if (m_result == CL_SUCCESS)
m_result = clGetDeviceInfo(m_device_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(cl_ulong), &resolution, NULL);
if (m_result == CL_SUCCESS)
m_result = clReleaseEvent(cl_perf_event);
if (m_result != CL_SUCCESS) {
PrintCL("Failed clReleaseEvent!\n");
PrintOCLError(m_result);
return false;
}
// counters are in nano second incriments 1e-9f Convert
m_num_blocks = m_height_in_blocks*m_width_in_blocks;
if (m_num_blocks == 0) m_num_blocks = 1;
float nanoSeconds = (float)(end-start);
// Convert nanosec to ms divide by 1e6f
m_computeShaderElapsedMS = nanoSeconds/1e6f;
// time to process a single block (4x4) which is 16 texels
m_computeShaderElapsedMS = m_computeShaderElapsedMS/(float)m_num_blocks;
#endif
if (m_computeShaderElapsedMS > 0) {
float ElapsedSeconds = m_computeShaderElapsedMS/1E3f;
float ElapsedSecondsPerTx = ElapsedSeconds/16;
float TxPerSec = 1/ElapsedSecondsPerTx;
// time to process a 1M texels in a second
m_CmpMTxPerSec = TxPerSec/1E6f;
} else
m_CmpMTxPerSec = 0;
}
return true;
}
bool COpenCL::GetResults() {
// QUERY_PERFORMANCE("Get Results ");
// Copy the results from device to host memory.
m_result = clEnqueueReadBuffer(m_command_queue, m_device_destination_buffer, true, 0, m_destination_size, p_destination, 0, NULL, NULL);
if (m_result != CL_SUCCESS) {
PrintCL("Failed to copy the results from the device!\n");
PrintOCLError(m_result);
return false;
}
return true;
}
float COpenCL::GetProcessElapsedTimeMS() {
return m_computeShaderElapsedMS;
}
float COpenCL::GetMTxPerSec() {
return m_CmpMTxPerSec;
}
int COpenCL::GetBlockSize() {
return m_num_blocks;
}
int COpenCL::GetMaxUCores() {
return m_maxUCores;
}
const char* COpenCL::GetDeviceName() {
return m_deviceName.c_str();
}
const char* COpenCL::GetVersion() {
return m_version.c_str();
}
CMP_ERROR COpenCL::Compress( KernelOptions *KernelOptions, MipSet &srcTexture, MipSet &destTexture,CMP_Feedback_Proc pFeedback = NULL) {
bool newFormat = false;
if (m_codecFormat != destTexture.m_format) {
m_codecFormat = destTexture.m_format;
newFormat = true;
}
if (m_codecFormat == CMP_FORMAT_Unknown) {
// dont know how to progress this destination
return (CMP_ERR_GENERIC);
}
m_source_file = KernelOptions->srcfile;
if (m_source_file.length() == 0) return CMP_ERR_NOSHADER_CODE_DEFINED;
m_getPerfStats = KernelOptions->getPerfStats && (destTexture.m_nIterations < 1);
m_kernel_options->data = KernelOptions->data;
m_kernel_options->size = KernelOptions->size;
m_kernel_options->format = KernelOptions->format;
m_kernel_options->dataSVM = KernelOptions->dataSVM;;
m_source_buffer_size = srcTexture.dwDataSize;
p_destination = destTexture.pData;
m_destination_size = destTexture.dwDataSize;
if (destTexture.m_nBlockWidth > 0)
m_width_in_blocks = (cl_uint)srcTexture.dwWidth / destTexture.m_nBlockWidth;
else {
return CMP_ERR_GENERIC;
}
if (destTexture.m_nBlockHeight > 0)
m_height_in_blocks = (cl_uint)srcTexture.dwHeight / destTexture.m_nBlockHeight;
else {
return CMP_ERR_GENERIC;
}
m_psource = (CMP_Vec4uc *)srcTexture.pData;
m_SourceInfo.m_src_height = srcTexture.dwHeight;
m_SourceInfo.m_src_width = srcTexture.dwWidth;
m_SourceInfo.m_width_in_blocks = m_width_in_blocks;
m_SourceInfo.m_height_in_blocks = m_height_in_blocks;
m_SourceInfo.m_fquality = KernelOptions->fquality;
// Using OpenCL - Setit up and call the kernel function
bool ok = true;
// check for first time use on host device
if (!m_initDeviceOk) {
if ( GetPlatformID() == false) ok = false;
if (ok && (SearchForGPU() == false)) ok = false;
if (ok && (GetDeviceInfo() == false)) ok = false;
if (ok && (CreateContext() == false)) ok = false;
m_initDeviceOk = ok;
}
if (m_programRun) {
CleanUpKernelAndIOBuffers();
if (newFormat)
CleanUpProgramEncoder();
m_programRun = false;
}
if (newFormat) {
if (ok && (CreateProgramEncoder() == false)) ok = false;
}
if (ok && (CreateIOBuffers() == false)) ok = false;
if (ok && (RunKernel() == false)) ok = false;
if (ok && (GetResults() == false)) ok = false;
if (ok) {
m_programRun = true;
return CMP_OK;
}
return(CMP_ERR_GENERIC);
}