//============================================================================= // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // //============================================================================== #include "copencl.h" #ifdef USE_CPU_PERFORMANCE_COUNTERS #include "query_timer.h" // can use CPU timing but pref is to use GPU counters #endif #ifdef _M_X64 //#define ENABLE_SVM //- disable for now, causing build issue in 64bit release #endif #ifdef cl_amd_fp64 #pragma OPENCL EXTENSION cl_amd_fp64 : enable #define fp64_supported 1 #elif defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #define fp64_supported 1 #else // "Double precision floating point not supported by OpenCL implementation." #define fp64_supported 0 #endif #ifdef ENABLE_SVM extern bool SVMInitCodec(KernelOptions *options); #endif extern CMIPS *GPU_CLMips; #define LOG_BUFFER_SIZE 102400 #define KERNEL_ARG_SOURCE 0 #define KERNEL_ARG_DESTINATION 1 #define KERNEL_ARG_SOURCEINFO 2 #define KERNEL_ARG_ENCODE 3 static bool is64Bit() { return (sizeof(int*) == 8); } void PrintCL(const char* Format, ... ) { // define a pointer to save argument list va_list args; char buff[1024]; // process the arguments into our debug buffer va_start(args, Format); vsprintf_s(buff, Format, args); va_end(args); if (GPU_CLMips) { GPU_CLMips->Print(buff); } else { printf(buff); } } void PrintOCLError(cl_int error) { switch (error) { case CL_DEVICE_NOT_FOUND: PrintCL("Error: CL_DEVICE_NOT_FOUND\n"); break; case CL_DEVICE_NOT_AVAILABLE: PrintCL("Error: CL_DEVICE_NOT_AVAILABLE\n"); break; case CL_COMPILER_NOT_AVAILABLE: PrintCL("Error: CL_COMPILER_NOT_AVAILABLE\n"); break; case CL_MEM_OBJECT_ALLOCATION_FAILURE: PrintCL("Error: CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break; case CL_OUT_OF_RESOURCES: PrintCL("Error: CL_OUT_OF_RESOURCES\n"); break; case CL_OUT_OF_HOST_MEMORY: PrintCL("Error: CL_OUT_OF_HOST_MEMORY\n"); break; case CL_PROFILING_INFO_NOT_AVAILABLE: PrintCL("Error: CL_PROFILING_INFO_NOT_AVAILABLE\n"); break; case CL_MEM_COPY_OVERLAP: PrintCL("Error: CL_MEM_COPY_OVERLAP\n"); break; case CL_IMAGE_FORMAT_MISMATCH: PrintCL("Error: CL_IMAGE_FORMAT_MISMATCH\n"); break; case CL_IMAGE_FORMAT_NOT_SUPPORTED: PrintCL("Error: CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break; case CL_BUILD_PROGRAM_FAILURE: PrintCL("Error: CL_BUILD_PROGRAM_FAILURE\n"); break; case CL_MAP_FAILURE: PrintCL("Error: CL_MAP_FAILURE\n"); break; case CL_MISALIGNED_SUB_BUFFER_OFFSET: PrintCL("Error: CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break; case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: PrintCL("Error: CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break; case CL_COMPILE_PROGRAM_FAILURE: PrintCL("Error: CL_COMPILE_PROGRAM_FAILURE\n"); break; case CL_LINKER_NOT_AVAILABLE: PrintCL("Error: CL_LINKER_NOT_AVAILABLE\n"); break; case CL_LINK_PROGRAM_FAILURE: PrintCL("Error: CL_LINK_PROGRAM_FAILURE\n"); break; case CL_DEVICE_PARTITION_FAILED: PrintCL("Error: CL_DEVICE_PARTITION_FAILED\n"); break; case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: PrintCL("Error: CL_KERNEL_ARG_INFO_NOT_AVAILABLE\n"); break; case CL_INVALID_VALUE: PrintCL("Error: CL_INVALID_VALUE\n"); break; case CL_INVALID_DEVICE_TYPE: PrintCL("Error: CL_INVALID_DEVICE_TYPE\n"); break; case CL_INVALID_PLATFORM: PrintCL("Error: CL_INVALID_PLATFORM\n"); break; case CL_INVALID_DEVICE: PrintCL("Error: CL_INVALID_DEVICE\n"); break; case CL_INVALID_CONTEXT: PrintCL("Error: CL_INVALID_CONTEXT\n"); break; case CL_INVALID_QUEUE_PROPERTIES: PrintCL("Error: CL_INVALID_QUEUE_PROPERTIES\n"); break; case CL_INVALID_COMMAND_QUEUE: PrintCL("Error: CL_INVALID_COMMAND_QUEUE\n"); break; case CL_INVALID_HOST_PTR: PrintCL("Error: CL_INVALID_HOST_PTR\n"); break; case CL_INVALID_MEM_OBJECT: PrintCL("Error: CL_INVALID_MEM_OBJECT\n"); break; case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: PrintCL("Error: CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break; case CL_INVALID_IMAGE_SIZE: PrintCL("Error: CL_INVALID_IMAGE_SIZE\n"); break; case CL_INVALID_SAMPLER: PrintCL("Error: CL_INVALID_SAMPLER\n"); break; case CL_INVALID_BINARY: PrintCL("Error: CL_INVALID_BINARY\n"); break; case CL_INVALID_BUILD_OPTIONS: PrintCL("Error: CL_INVALID_BUILD_OPTIONS\n"); break; case CL_INVALID_PROGRAM: PrintCL("Error: CL_INVALID_PROGRAM\n"); break; case CL_INVALID_PROGRAM_EXECUTABLE: PrintCL("Error: CL_INVALID_PROGRAM_EXECUTABLE\n"); break; case CL_INVALID_KERNEL_NAME: PrintCL("Error: CL_INVALID_KERNEL_NAME\n"); break; case CL_INVALID_KERNEL_DEFINITION: PrintCL("Error: CL_INVALID_KERNEL_DEFINITION\n"); break; case CL_INVALID_KERNEL: PrintCL("Error: CL_INVALID_KERNEL\n"); break; case CL_INVALID_ARG_INDEX: PrintCL("Error: CL_INVALID_ARG_INDEX\n"); break; case CL_INVALID_ARG_VALUE: PrintCL("Error: CL_INVALID_ARG_VALUE\n"); break; case CL_INVALID_ARG_SIZE: PrintCL("Error: CL_INVALID_ARG_SIZE\n"); break; case CL_INVALID_KERNEL_ARGS: PrintCL("Error: CL_INVALID_KERNEL_ARGS\n"); break; case CL_INVALID_WORK_DIMENSION: PrintCL("Error: CL_INVALID_WORK_DIMENSION\n"); break; case CL_INVALID_WORK_GROUP_SIZE: PrintCL("Error: CL_INVALID_WORK_GROUP_SIZE\n"); break; case CL_INVALID_WORK_ITEM_SIZE: PrintCL("Error: CL_INVALID_WORK_ITEM_SIZE\n"); break; case CL_INVALID_GLOBAL_OFFSET: PrintCL("Error: CL_INVALID_GLOBAL_OFFSET\n"); break; case CL_INVALID_EVENT_WAIT_LIST: PrintCL("Error: CL_INVALID_EVENT_WAIT_LIST\n"); break; case CL_INVALID_EVENT: PrintCL("Error: CL_INVALID_EVENT\n"); break; case CL_INVALID_OPERATION: PrintCL("Error: CL_INVALID_OPERATION\n"); break; case CL_INVALID_GL_OBJECT: PrintCL("Error: CL_INVALID_GL_OBJECT\n"); break; case CL_INVALID_BUFFER_SIZE: PrintCL("Error: CL_INVALID_BUFFER_SIZE\n"); break; case CL_INVALID_MIP_LEVEL: PrintCL("Error: CL_INVALID_MIP_LEVEL\n"); break; case CL_INVALID_GLOBAL_WORK_SIZE: PrintCL("Error: CL_INVALID_GLOBAL_WORK_SIZE\n"); break; case CL_INVALID_PROPERTY: PrintCL("Error: CL_INVALID_PROPERTY\n"); break; case CL_INVALID_IMAGE_DESCRIPTOR: PrintCL("Error: CL_INVALID_IMAGE_DESCRIPTOR\n"); break; case CL_INVALID_COMPILER_OPTIONS: PrintCL("Error: CL_INVALID_COMPILER_OPTIONS\n"); break; case CL_INVALID_LINKER_OPTIONS: PrintCL("Error: CL_INVALID_LINKER_OPTIONS\n"); break; case CL_INVALID_DEVICE_PARTITION_COUNT: PrintCL("Error: CL_INVALID_DEVICE_PARTITION_COUNT\n"); break; case CL_INVALID_PIPE_SIZE: PrintCL("Error: CL_INVALID_PIPE_SIZE\n"); break; case CL_INVALID_DEVICE_QUEUE: PrintCL("Error: CL_INVALID_DEVICE_QUEUE\n"); break; default: PrintCL("Error: UKNOWN 0x%X\n", error); break; } } //====================================== Framework Common Interfaces : OpenCL Compute ========================================== void COpenCL::Init() { query_timer::initialize(); m_initDeviceOk = false; m_programRun = false; m_codecFormat = CMP_FORMAT_Unknown; m_num_blocks = 0; m_CmpMTxPerSec = 0; m_computeShaderElapsedMS = 0.0f; m_platform_id = NULL; m_device_id = NULL; m_num_platforms = 0; m_command_queue = NULL; m_kernel = NULL; m_device_destination_buffer = NULL; m_device_source_buffer = NULL; m_Source_Info_buffer = NULL; m_Encoder_buffer = NULL; m_program_encoder = NULL; m_context = NULL; m_svmSupport = false; m_svmData = NULL; p_program.buffer = NULL; ocl_time_device = 0; m_deviceName = ""; m_version = ""; m_maxUCores = 12; //------------------------- // OpenCL compiler options //------------------------- long cmp_opt_size = sizeof(m_compile_options); m_compile_options[0] = 0; // Make all warnings into errors, use -w to Inhitit all warning messages // strncat_s(m_compile_options, cmp_opt_size, "-Werror ", _TRUNCATE); // single and double precision denormalized numbers may be flushed to zero strncat_s(m_compile_options, cmp_opt_size, "-cl-denorms-are-zero ", _TRUNCATE); // Looks for addtional include file in this sub-folder strncat_s(m_compile_options, cmp_opt_size, "-I ./Plugins/Compute/ ", _TRUNCATE); // Use this to debug with CodeXL or other debuggers // This option disables all optimizations // strncat_s(m_compile_options, cmp_opt_size, "-g -cl-opt-disable", _TRUNCATE); // User override options set after this init call m_force_rebuild = false; } COpenCL::COpenCL(ComputeOptions CLOptions) { Init(); m_force_rebuild = CLOptions.force_rebuild; } COpenCL::~COpenCL() { #ifdef ENABLE_SVM if (m_context && m_svmData) clSVMFree(m_context, m_svmData); #endif if (m_context) clReleaseContext(m_context); if (m_programRun) { CleanUpKernelAndIOBuffers(); CleanUpProgramEncoder(); } } void COpenCL::SetComputeOptions(ComputeOptions *CLOptions) { m_force_rebuild = CLOptions->force_rebuild; } COpenCL::COpenCL(void *kerneloptions) { m_kernel_options = (KernelOptions *)kerneloptions; Init(); } void COpenCL::CleanUpProgramEncoder() { // Encoder Program & Buffer if (p_program.buffer) delete[] p_program.buffer; if (m_program_encoder) clReleaseProgram(m_program_encoder); } void COpenCL::CleanUpKernelAndIOBuffers() { // Command Queues and Kernel functions if (m_command_queue) clReleaseCommandQueue(m_command_queue); if (m_kernel) clReleaseKernel(m_kernel); // IO Buffers if (m_Encoder_buffer) clReleaseMemObject(m_Encoder_buffer); if (m_Source_Info_buffer) clReleaseMemObject(m_Source_Info_buffer); if (m_device_destination_buffer) clReleaseMemObject(m_device_destination_buffer); if (m_device_source_buffer) clReleaseMemObject(m_device_source_buffer); } bool COpenCL::GetPlatformID() { //QUERY_PERFORMANCE("Get Platform ID "); m_result = clGetPlatformIDs(MAX_PLATFORMS, m_platform_ids, &m_num_platforms); if (m_result != CL_SUCCESS) { PrintCL("Failed to get the GPU platforms!\n"); PrintOCLError(m_result); return false; } return true; } bool COpenCL::SearchForGPU() { for (uint32_t i = 0; i < m_num_platforms; i++) { // Get the device ids. m_result = clGetDeviceIDs(m_platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &m_device_id, NULL); if (m_result == CL_SUCCESS) { m_platform_id = m_platform_ids[i]; break; } } // end for if (m_result != CL_SUCCESS) { PrintCL("Failed to get a GPU device!\n"); PrintOCLError(m_result); return false; } return true; } bool COpenCL::GetDeviceInfo() { // Show the device info. char device_name[256] = { 0 }; if (clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL) == CL_SUCCESS) m_deviceName = device_name; char openclVersion[1024]; if (clGetDeviceInfo(m_device_id, CL_DEVICE_OPENCL_C_VERSION, sizeof(openclVersion), openclVersion, NULL) == CL_SUCCESS) m_version = openclVersion; if (clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(m_maxUCores), &m_maxUCores, NULL) != CL_SUCCESS) m_maxUCores = 12; // long long GlobalMem; // clGetDeviceInfo(m_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(long long), &GlobalMem, NULL); // PrintCL("Device Global Mem: %I64d Bytes\n", GlobalMem); // // long long LocalMem; // clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long long), &LocalMem, NULL); // PrintCL("Device Local Mem: %I64d Bytes\n", LocalMem); // // size_t MaxWorkGroupSize; // clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(int), &MaxWorkGroupSize, NULL); // PrintCL("Max work Groups :%ld\n", MaxWorkGroupSize); // // cl_ulong MaxConstDataSize; // clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &MaxConstDataSize, NULL); // PrintCL("Max Const Data : %ld Bytes\n", MaxConstDataSize); #ifdef ENABLE_SVM // SVM Support cl_device_svm_capabilities caps; clGetDeviceInfo(m_device_id, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL); if (caps > 0) { m_svmSupport = true; PrintCL("SVM Course Grain Buffer Support: %s\n", (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? "Yes" : "No"); PrintCL("SVM Fine Grain Buffer Support : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? "Yes" : "No"); PrintCL("SVM Fine Grain System Support : %s\n", (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? "Yes" : "No"); PrintCL("SVM Atomics : %s\n", (caps & CL_DEVICE_SVM_ATOMICS) ? "Yes" : "No"); } #endif return true; } bool COpenCL::CreateContext() { // Create a context. m_context = clCreateContext(NULL, 1, &m_device_id, NULL, NULL, &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to create a context!\n"); return false; } return true; } long COpenCL::file_size(FILE* p_file) { // Get the size of the program. if (fseek(p_file, 0, SEEK_END) != 0) return 0; long program_size = ftell(p_file); fseek(p_file, 0, SEEK_SET); return program_size; } bool COpenCL::load_file() { #ifdef _DEBUG PrintCL("Loading [%s]\n",m_source_file.c_str()); #endif errno_t fopen_result; m_isBinary = false; if (!m_force_rebuild) { //=========================== // Try loading the Binary file //=========================== FILE* p_file_bin = NULL; std::string tmp = m_source_file; bool rebuild = false; #ifdef _WIN32 // Check build configuration of the shader, has it been modified since last use rebuild = cmp_recompile_shader(m_source_file); #endif if (!rebuild) { #ifdef _WIN32 fopen_result = fopen_s(&p_file_bin, tmp.append(".cmp").c_str(), "rb"); if (fopen_result != 0) rebuild = true; #else p_file_bin = fopen(tmp.append(".cmp").c_str(), "rb"); if (p_file_bin) rebuild = true; #endif } // Found a .cmp file use it if (!rebuild) { OpenCLBinary_Header BinFile_Header; if (fread(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file_bin) != 1) { fclose(p_file_bin); PrintCL("Failed to read \"%s.cmp\" file header!\n", m_source_file.c_str()); return false; } if (BinFile_Header.version != 1) { fclose(p_file_bin); PrintCL("File \"%s.cmp\" is not compatible with current application!\n", m_source_file.c_str()); return false; } // entry to this code is reserved for CRC checks { m_isBinary = true; m_program_size = file_size(p_file_bin) - sizeof(OpenCLBinary_Header); if (m_program_size == 0) { fclose(p_file_bin); PrintCL("Failed to read \"%s.cmp\" file size!\n", m_source_file.c_str()); return false; } // Allocate memory for the program. p_program.ubuffer = new unsigned char[m_program_size]; // Reposition to bin data that is after header fseek(p_file_bin, sizeof(OpenCLBinary_Header), SEEK_SET); // Read the program in to memory. if (fread(p_program.buffer, m_program_size, 1, p_file_bin) != 1) { fclose(p_file_bin); PrintCL("Failed to read \"%s.cmp\" in to memory!\n", m_source_file.c_str()); return false; } fclose(p_file_bin); return true; } // reserved for CRC type checks } } // !m_force_rebuild //=========================== // Try loading the source file //=========================== FILE* p_file_src = NULL; fopen_result = fopen_s(&p_file_src, m_source_file.c_str(), "rb"); if (fopen_result == 0) { m_program_size = file_size(p_file_src); if (m_program_size == 0) { fclose(p_file_src); PrintCL("Failed to read \"%s\" file size!\n", m_source_file.c_str()); return false; } // Allocate memory for the program. p_program.buffer = new char[m_program_size]; // Read the program in to memory. size_t read_size = fread(p_program.buffer, m_program_size, 1, p_file_src); if (read_size != 1) { fclose(p_file_src); PrintCL("Failed to read \"%s\" in to memory!\n", m_source_file.c_str()); return false; } fclose(p_file_src); return true; } //=========================== // Failed to load the file //=========================== return false; } bool COpenCL::Create_Program_File() { //------------------------------ // Load the Source or Binary file //------------------------------ load_file(); cl_int result; if (!m_isBinary) { // Create the program. m_program_encoder = clCreateProgramWithSource(m_context, 1, const_cast< char const** >(&p_program.buffer), &m_program_size, &result); if (result != CL_SUCCESS) { PrintCL("Failed to create the program!\n"); PrintOCLError(result); return false; } delete[] p_program.buffer; p_program.buffer = NULL; // Build the program. result = clBuildProgram(m_program_encoder, 1, &m_device_id, m_compile_options, NULL, NULL); if (result != CL_SUCCESS) { char message[LOG_BUFFER_SIZE]; result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, LOG_BUFFER_SIZE, message, NULL); if (result != CL_SUCCESS) message[0] = char(0); //PrintCL("Failed to build the program!\n%s",message); printf("Failed to build the program!\n%s",message); return false; } size_t compiled_size = 0; result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARY_SIZES, sizeof(compiled_size), &compiled_size, NULL); uint8_t* p_binary = new uint8_t[compiled_size]; result = clGetProgramInfo(m_program_encoder, CL_PROGRAM_BINARIES, sizeof(p_binary), &p_binary, NULL); // Save the compiled code FILE* p_file = NULL; errno_t fopen_result = fopen_s(&p_file, m_source_file.append(".cmp").c_str(), "wb"); if (fopen_result == 0) { OpenCLBinary_Header BinFile_Header; BinFile_Header.version = 1; BinFile_Header.crc32 = 0; fwrite(&BinFile_Header, sizeof(OpenCLBinary_Header), 1, p_file); fwrite(p_binary, compiled_size, 1, p_file); } fclose(p_file); } // not precompiled code else { // Create the program. m_program_encoder = clCreateProgramWithBinary(m_context, 1, &m_device_id, &m_program_size, (const unsigned char **)&p_program.ubuffer, NULL, &result); if (result != CL_SUCCESS) { PrintCL("Failed to load the binary program!\n"); PrintOCLError(result); return false; } delete[] p_program.buffer; p_program.buffer = NULL; // Build the program. result = clBuildProgram(m_program_encoder, 1, &m_device_id, NULL, NULL, NULL); if (result != CL_SUCCESS) { PrintCL("Failed to build the program!\n"); PrintOCLError(result); char message[128 * 1024]; result = clGetProgramBuildInfo(m_program_encoder, m_device_id, CL_PROGRAM_BUILD_LOG, sizeof(message), message, NULL); if (result == CL_SUCCESS) { PrintCL(message); } return false; } } return true; } bool COpenCL::CreateProgramEncoder() { // QUERY_PERFORMANCE("Create Program "); // Create the program. if (!Create_Program_File()) { return false; } return true; } bool COpenCL::CreateIOBuffers() { #ifdef ENABLE_SVM // AllocateBuffers() // SVM if (m_svmSupport && is64Bit() && (m_kernel_options->size > 0)) { // initialize any device/SVM memory here. m_svmData = clSVMAlloc(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL); if (m_svmData == NULL) { PrintCL("Failed to allocate the Encode buffer on the device!\n"); return false; } m_kernel_options->dataSVM = m_svmData; } #endif // allocate device 32bit buffer for the image source // idealy we want to not copy the buffer form host // Look into the following options clmap and CL_MEM_AMD_PERSISTANT // Allocate the 32-bit source buffer in device memory. m_device_source_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, m_source_buffer_size, (void*)m_psource, &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to allocate the source buffer on the device!\n"); return false; } m_Source_Info_buffer = clCreateBuffer(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Source_Info), (void *)&m_SourceInfo, &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to allocate the source info buffer on the device!\n"); PrintOCLError(m_result); return false; } // Allocate the destination buffer in device memory. m_device_destination_buffer = clCreateBuffer(m_context, CL_MEM_WRITE_ONLY, m_destination_size, NULL, &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to allocate the destination buffer on the device!\n"); PrintOCLError(m_result); return false; } return true; } //#include "bcn_common_kernel.h" use this for debugging data to kernel when using BC15 bool COpenCL::RunKernel() { // QUERY_PERFORMANCE("Run Kernel "); // Get a handle to the kernel. m_kernel = clCreateKernel(m_program_encoder, "CMP_GPUEncoder", &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to create the kernel!\n"); PrintOCLError(m_result); return false; } // Create the command queue. with profiling enabled const cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; m_command_queue = clCreateCommandQueueWithProperties(m_context, m_device_id, properties, &m_result); if (m_result != CL_SUCCESS) { PrintCL("Failed to create the command queue!\n"); return false; } //==================================================================================== // ImageSource m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCE, sizeof(m_device_source_buffer), &m_device_source_buffer); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the source kernel argument!\n"); PrintOCLError(m_result); return false; } //ImageDestination m_result = clSetKernelArg(m_kernel, KERNEL_ARG_DESTINATION, sizeof(m_device_destination_buffer), &m_device_destination_buffer); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the destination kernel argument!\n"); PrintOCLError(m_result); return false; } //SourceInfo m_result = clSetKernelArg(m_kernel, KERNEL_ARG_SOURCEINFO, sizeof(m_Source_Info_buffer), (void *)&m_Source_Info_buffer); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the source info argument!\n"); PrintOCLError(m_result); return false; } #ifdef ENABLE_SVM // Don not enable unless CMP_GPUEncoder parameters are updated if (m_svmSupport) { if (m_svmData) { /* reserve svm space for CPU update */ m_result = clEnqueueSVMMap(m_command_queue, CL_TRUE, //blocking call CL_MAP_WRITE_INVALIDATE_REGION, m_svmData, m_kernel_options->size, 0, NULL, NULL); m_kernel_options->dataSVM = m_svmData; if (!SVMInitCodec(m_kernel_options)) { PrintCL("Failed to initialize SVM Encode kernel data!\n"); PrintOCLError(m_result); return false; } m_result = clEnqueueSVMUnmap(m_command_queue, m_svmData, 0, NULL, NULL); // Set appropriate arguments to the kernel m_result = clSetKernelArgSVMPointer(m_kernel, KERNEL_ARG_ENCODE, (void *)(m_svmData)); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the SVM Encode kernel argument!\n"); PrintOCLError(m_result); return false; } } // Encode ptr else { PrintCL("Failed to set the SVM Encode kernel argument, invalid pointer!\n"); PrintOCLError(m_result); return false; } } else #endif { m_Encoder_buffer = clCreateBuffer(m_context, CL_MEM_READ_WRITE, m_kernel_options->size, NULL, &m_result); // Set argument for the compress() m_result = clSetKernelArg(m_kernel, KERNEL_ARG_ENCODE, sizeof(m_Encoder_buffer), (void *)&m_Encoder_buffer); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the Encode block argument!\n"); PrintOCLError(m_result); return false; } // CMP_BC15Options *temp = reinterpret_cast(m_kernel_options->data); // int blocksize = sizeof(CMP_BC15Options); m_result = clEnqueueWriteBuffer(m_command_queue, m_Encoder_buffer, CL_TRUE, 0, m_kernel_options->size, (void*)m_kernel_options->data, 0, NULL, NULL); if (m_result != CL_SUCCESS) { PrintCL("Failed to set the Encode block buffer!\n"); PrintOCLError(m_result); return false; } } // Run the kernel. //---------------------------------- //# todo: max numbers of blocks to launch on low end GPUs to avoid kernel timeout //int maxblockslaunch = MIN(m_width_in_blocks*m_height_in_blocks, 768 * (int)m_maxComputeUnits); size_t local_work_size[] = { 8, 8 }; size_t global_work_size[] = { 8, 8 }; // Check for smal images < 64 width x 64 height in pixels if (m_width_in_blocks >= 8) { global_work_size[0] = ((m_width_in_blocks + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0]; } else { local_work_size[0] = 1; global_work_size[0] = m_width_in_blocks; } if (m_height_in_blocks >= 8) { global_work_size[1] = ((m_height_in_blocks + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]; } else { local_work_size[1] = 1; global_work_size[1] = m_height_in_blocks; } #ifdef USE_CPU_PERFORMANCE_COUNTERS cmp_cputimer cputimer; cputimer.initialize(); cputimer.Start(0); #else cl_event cl_perf_event = NULL; #endif m_result = clEnqueueNDRangeKernel(m_command_queue, m_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, #ifdef USE_CPU_PERFORMANCE_COUNTERS NULL #else m_getPerfStats? &cl_perf_event:NULL #endif ); if (m_result != CL_SUCCESS) { PrintCL("Failed to launch the kernel!\n"); PrintOCLError(m_result); return false; } // Wait until all queued commands have been processed and completed. if (m_getPerfStats) m_result = clFinish(m_command_queue); #ifdef USE_CPU_PERFORMANCE_COUNTERS cputimer.Stop(0); m_computeShaderElapsedMS = cputimer.GetMS(0); #endif // Check if performance event has been set and valid if ( m_getPerfStats && #ifdef USE_CPU_PERFORMANCE_COUNTERS (m_computeShaderElapsedMS > 0) && #else cl_perf_event && #endif (m_result == CL_SUCCESS)) { #ifdef USE_CPU_PERFORMANCE_COUNTERS #else // Get the event data cl_ulong start = 0; cl_ulong end = 0; cl_ulong nspercount = 1; m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); if (m_result == CL_SUCCESS) m_result = clGetEventProfilingInfo(cl_perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); // Get the number of nanoseconds per count cl_ulong resolution; if (m_result == CL_SUCCESS) m_result = clGetDeviceInfo(m_device_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(cl_ulong), &resolution, NULL); if (m_result == CL_SUCCESS) m_result = clReleaseEvent(cl_perf_event); if (m_result != CL_SUCCESS) { PrintCL("Failed clReleaseEvent!\n"); PrintOCLError(m_result); return false; } // counters are in nano second incriments 1e-9f Convert m_num_blocks = m_height_in_blocks*m_width_in_blocks; if (m_num_blocks == 0) m_num_blocks = 1; float nanoSeconds = (float)(end-start); // Convert nanosec to ms divide by 1e6f m_computeShaderElapsedMS = nanoSeconds/1e6f; // time to process a single block (4x4) which is 16 texels m_computeShaderElapsedMS = m_computeShaderElapsedMS/(float)m_num_blocks; #endif if (m_computeShaderElapsedMS > 0) { float ElapsedSeconds = m_computeShaderElapsedMS/1E3f; float ElapsedSecondsPerTx = ElapsedSeconds/16; float TxPerSec = 1/ElapsedSecondsPerTx; // time to process a 1M texels in a second m_CmpMTxPerSec = TxPerSec/1E6f; } else m_CmpMTxPerSec = 0; } return true; } bool COpenCL::GetResults() { // QUERY_PERFORMANCE("Get Results "); // Copy the results from device to host memory. m_result = clEnqueueReadBuffer(m_command_queue, m_device_destination_buffer, true, 0, m_destination_size, p_destination, 0, NULL, NULL); if (m_result != CL_SUCCESS) { PrintCL("Failed to copy the results from the device!\n"); PrintOCLError(m_result); return false; } return true; } float COpenCL::GetProcessElapsedTimeMS() { return m_computeShaderElapsedMS; } float COpenCL::GetMTxPerSec() { return m_CmpMTxPerSec; } int COpenCL::GetBlockSize() { return m_num_blocks; } int COpenCL::GetMaxUCores() { return m_maxUCores; } const char* COpenCL::GetDeviceName() { return m_deviceName.c_str(); } const char* COpenCL::GetVersion() { return m_version.c_str(); } CMP_ERROR COpenCL::Compress( KernelOptions *KernelOptions, MipSet &srcTexture, MipSet &destTexture,CMP_Feedback_Proc pFeedback = NULL) { bool newFormat = false; if (m_codecFormat != destTexture.m_format) { m_codecFormat = destTexture.m_format; newFormat = true; } if (m_codecFormat == CMP_FORMAT_Unknown) { // dont know how to progress this destination return (CMP_ERR_GENERIC); } m_source_file = KernelOptions->srcfile; if (m_source_file.length() == 0) return CMP_ERR_NOSHADER_CODE_DEFINED; m_getPerfStats = KernelOptions->getPerfStats && (destTexture.m_nIterations < 1); m_kernel_options->data = KernelOptions->data; m_kernel_options->size = KernelOptions->size; m_kernel_options->format = KernelOptions->format; m_kernel_options->dataSVM = KernelOptions->dataSVM;; m_source_buffer_size = srcTexture.dwDataSize; p_destination = destTexture.pData; m_destination_size = destTexture.dwDataSize; if (destTexture.m_nBlockWidth > 0) m_width_in_blocks = (cl_uint)srcTexture.dwWidth / destTexture.m_nBlockWidth; else { return CMP_ERR_GENERIC; } if (destTexture.m_nBlockHeight > 0) m_height_in_blocks = (cl_uint)srcTexture.dwHeight / destTexture.m_nBlockHeight; else { return CMP_ERR_GENERIC; } m_psource = (CMP_Vec4uc *)srcTexture.pData; m_SourceInfo.m_src_height = srcTexture.dwHeight; m_SourceInfo.m_src_width = srcTexture.dwWidth; m_SourceInfo.m_width_in_blocks = m_width_in_blocks; m_SourceInfo.m_height_in_blocks = m_height_in_blocks; m_SourceInfo.m_fquality = KernelOptions->fquality; // Using OpenCL - Setit up and call the kernel function bool ok = true; // check for first time use on host device if (!m_initDeviceOk) { if ( GetPlatformID() == false) ok = false; if (ok && (SearchForGPU() == false)) ok = false; if (ok && (GetDeviceInfo() == false)) ok = false; if (ok && (CreateContext() == false)) ok = false; m_initDeviceOk = ok; } if (m_programRun) { CleanUpKernelAndIOBuffers(); if (newFormat) CleanUpProgramEncoder(); m_programRun = false; } if (newFormat) { if (ok && (CreateProgramEncoder() == false)) ok = false; } if (ok && (CreateIOBuffers() == false)) ok = false; if (ok && (RunKernel() == false)) ok = false; if (ok && (GetResults() == false)) ok = false; if (ok) { m_programRun = true; return CMP_OK; } return(CMP_ERR_GENERIC); }