public void Run() { Console.WriteLine("================== " + GetType().Name + "================== "); string source = @" __kernel void myOpenClFunc ( __global float* cl_input, __global float* cl_output ) { size_t i = get_global_id(0); cl_output[i] = cl_input[i] + cl_input[i]; }; "; /************ Initialize OpenClWithGCN ***************************************/ OpenClWithGCN gprog = new OpenClWithGCN(); OpenClEnvironment env = gprog.env; bool success = gprog.GcnCompile(source); Console.Write(env.lastMessage); if (!success) { Assert.Fail(); return; } /************ Create some random data *******************************************/ // create some random data for testing var random = new Random(); const int count = 1024 * 1024; const int dataSz = count * sizeof(float); float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray(); /************ Build and run the kernel *******************************************/ Kernel kernel = env.program.CreateKernel("myOpenClFunc"); Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz); Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz); env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data); kernel.Arguments[0].SetValue(cl_input); kernel.Arguments[1].SetValue(cl_output); env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256); env.cmdQueue.Finish(); /************ Read back and Validate the results ***********************************/ float[] results = new float[count]; env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz); int correct = Enumerable.Range(0, count).Where(i => results[i] == data[i] * 2).Count(); Console.WriteLine("{0} - Computed {1}/{2} correct values!", correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString()); Assert.AreEqual(correct, count); }
public void Run() { Console.WriteLine("================== " + GetType().Name + "================== "); string source = @" __asm4GCN myAsmFunc ( float*, float*) { // ======== Pre-Loaded Registers ========= // s[2:3] - PTR_UAV_TABLE // s[2:3] +0x60 - base_resource_const1(#T) // s[2:3] +0x68 - base_resource_const2(#T) // s[4:7] - IMM_CONST_BUFFER0 // s[4:7] +0x00 - Grid Size // s[4:7] +0x04 - Local Size // s[4:7] +0x18 - baseGlobalId // s[8:11] -IMM_CONST_BUFFER1 // s[8:11]+0x00 - param1 offset // s[8:11]+0x04 - param2 offset // s[8:11]+0x08 - param3 offset // s1 - threadgroupId // s12 - groupId // v0 - laneId // Use a #define to shorten tbuffer_load instruction #define _F32_ 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] // Variable decelerations with specific registers (pre-loaded regs) s8u resConsts s[2:3] // base resource constants s16u uavBuff s[4:7], paramsOffset s[8:11], groupId s12 v4b laneId v0 // v0 is pre-loaded with lane id // Setup Resource Constant s_load_dwordx4 s16u resConst1, resConsts, 0x68 // establish GlobalID s_buffer_load_dword s4u baseGlobalId,uavBuff, 0x18 s_buffer_load_dword s4u localSize, uavBuff, 0x04 s_waitcnt lgkmcnt(0) v_mov_b32 v4u vLocalSize, localSize v_mul_i32_i24 vLocalSize, groupId, vLocalSize v_add_i32 v4u localSizeIdx, vcc, laneId, vLocalSize v_add_i32 v4u vGlobalID, vcc, baseGlobalId, localSizeIdx v_lshlrev_b32 v4u vGlobalOffset, 2, vGlobalID // Fetch the value that we want to multiply by 2. s_buffer_load_dword s4u sPara1Ptr, paramsOffset, 0x00 s_load_dwordx4 s16u resConst0, resConsts, 0x60 s_waitcnt lgkmcnt(0) v_add_i32 v4b baseOffset, vcc, sPara1Ptr, vGlobalOffset tbuffer_load_format_x v4b val, baseOffset, resConst0, _F32_ s_waitcnt vmcnt(0) // Perform the 'sum = val * 2' v_mul_f32 v4f sum, 2.0, val // Write the results back to memory. s_buffer_load_dword s4u param2Offset, paramsOffset, 0x04 s_waitcnt lgkmcnt(0) s_waitcnt vmcnt(0) v_add_i32 v4u dstOffset, vcc, param2Offset, vGlobalOffset free param2Offset // var is freed and register returned to pool. tbuffer_store_format_x sum, dstOffset, resConst1, _F32_ // Exit the kernel s_endpgm }"; /************ Initialize OpenClWithGCN ***************************************/ OpenClWithGCN gprog = new OpenClWithGCN(); OpenClEnvironment env = gprog.env; bool success = gprog.GcnCompile(source); Console.Write(env.lastMessage); if (!success) { Assert.Fail(); return; } /************ Create some random data *******************************************/ // create some random data for testing var random = new Random(); const int count = 1024 * 1024; const int dataSz = count * sizeof(float); float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray(); /************ Build and run the kernel *******************************************/ Kernel kernel = env.program.CreateKernel("myAsmFunc"); Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz); Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz); env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data); kernel.Arguments[0].SetValue(cl_input); kernel.Arguments[1].SetValue(cl_output); env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256); env.cmdQueue.Finish(); /************ Read back and Validate the results ***********************************/ float[] results = new float[count]; env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz); int correct = Enumerable.Range(0,count).Where(i=>results[i]==data[i]*2).Count(); Console.WriteLine("{0} - Computed {1}/{2} correct values!", correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString()); Assert.AreEqual(correct, count); }
public void Run() { // The example below is an OpenCL example by Derek Gerstmann(UWA). It's been modified for NOpenCL use. Console.WriteLine("================== " + GetType().Name + "================== "); /************ Create and build a program from our OpenCL-C source code ***************/ // const string source = GCN_NS.Code.DevCode; string source = @" __asm4GCN myAsmFunc ( float*, float*) { #define _ZeroInHex_ 0x00 #define _32Float_ 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] // Loc| Binary |BinaryExt s_buffer_load_dword s0, s[4:7], 0x04 // 000|C2000504| s_buffer_load_dword s1, s[4:7], 0x18 // 004|C2008518| s_waitcnt lgkmcnt(0) // 008|BF8C007F| s_min_u32 s0, s0, 0x0000ffff // 00C|8380FF00|0000FFFF s_buffer_load_dword s4, s[8:11], _ZeroInHex_ // 014|C2020900| v_mov_b32 v1, s0 // 018|7E020200| v_mul_i32_i24 v1, s12, v1 // 01C|1202020C| v_add_i32 v0, vcc, v0, v1 // 020|4A000300| v_add_i32 v0, vcc, s1, v0 // 024|4A000001| v_lshlrev_b32 v0, 2, v0 // 028|34000082| s_load_dwordx4 s[12:15], s[2:3], 0x60 // 02C|C0860360| s_waitcnt lgkmcnt(0) // 030|BF8C007F| v_add_i32 v1, vcc, s4, v0 // 034|4A020004| tbuffer_load_format_x v1, v1, s[12:15], _32Float_ // 038|EBA01000|80030101 s_buffer_load_dword s0, s[8:11], 0x04 // 040|C2000904| s_load_dwordx4 s[4:7], s[2:3], 0x68 // 044|C0820368| s_waitcnt lgkmcnt(0) // 048|BF8C007F| v_add_i32 v0, vcc, s0, v0 // 04C|4A000000| s_waitcnt vmcnt(0) // 050|BF8C0F70| v_add_f32 v1, v1, v1 // 054|10020301| tbuffer_store_format_x v1, v0, s[4:7], _32Float_ // 058|EBA41000|80010100 s_endpgm // 060|BF810000| }; // This is not used here but is what generates above when de-assembled. // __kernel and __asm4GCN blocks can be used in the same clProgram __kernel void myOpenClFunc ( __global float* cl_input, __global float* cl_output ) { size_t i = get_global_id(0); cl_output[i] = cl_input[i] + cl_input[i]; }; "; /************ Initialize OpenClWithGCN ***************************************/ // OpenClEnvironment env = SetupOpenClEnvironment(); // for manual setup OpenClWithGCN gprog = new OpenClWithGCN(); OpenClEnvironment env = gprog.env; // Let just use the default environment bool success = gprog.GcnCompile(source); Console.Write(env.lastMessage); if (!success) { Assert.Fail(); return; } /************ Create some random data *******************************************/ // create some random data for testing var random = new Random(); const int count = 1024 * 1024; const int dataSz = count * sizeof(float); float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray(); /************ Create a kernel from our modProgram *******************************/ Kernel kernel = env.program.CreateKernel("myAsmFunc"); /************ Allocate cl_input, and fill with data ********************************/ // OpenCL: cl_mem cl_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSz, host_ptr, NULL); Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz); /************ Create an cl_output memory buffer for our results *****************/ // OpenCL: cl_mem cl_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSz, host_ptr, NULL); Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz); /************ Copy our host buffer of random values to cl_input device buffer ******/ // OpenCL: clEnqueueWriteBuffer(cmdQueue, cl_input, CL_TRUE, 0, dataSz, data, 0, NULL, NULL); env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data); /************ Set the arguments to our kernel, and enqueue it for execution ********/ // OpenCL: clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_input); kernel.Arguments[0].SetValue(cl_input); kernel.Arguments[1].SetValue(cl_output); /************ Enqueue and run the kernel *******************************************/ // OpenCL: clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &globalMem &localMem, 0, NULL,NULL); env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256); /************ Force command queue to get processed, wait until all commands finish **/ env.cmdQueue.Finish(); /************ Read back the results ************************************************/ // OpenCL: clEnqueueReadBuffer(cmdQueue, cl_output, CL_TRUE, 0, dataSz, results, 0, NULL, NULL); float[] results = new float[count]; env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz); /************ Validate our results *************************************************/ int correct = 0; for (int i = 0; i < count; i++) correct += (results[i] == data[i] + data[i]) ? 1 : 0; // int correct = Enumerable.Range(0,count).Where(i=>results[i]==data[i]*2).Count(); /************ Print a brief summary detailing the results **************************/ Console.WriteLine("{0} - Computed {1}/{2} correct values!", correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString()); Assert.AreEqual(correct, count); }
public void Run() { Console.WriteLine("================== " + GetType().Name + "================== "); string source = @"__asm4GCN myAsmFunc (float*,float*,float*) { // ======== Pre-Loaded Registers ========= // s[2:3] - PTR_UAV_TABLE // s[2:3] +0x60 - base_resource_const1(#T) // s[2:3] +0x68 - base_resource_const2(#T) // s[4:7] - IMM_CONST_BUFFER0 // s[4:7] +0x00 - Grid Size // s[4:7] +0x04 - Local Size // s[4:7] +0x18 - baseGlobalId // s[8:11] -IMM_CONST_BUFFER1 // s[8:11]+0x00 - param1 offset // s[8:11]+0x04 - param2 offset // s[8:11]+0x08 - param3 offset // s1 - threadgroupId // s12 - groupId // v0 - laneId // Use a #define to shorten tbuffer_load instruction #define _F32_ 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] // Variable decelerations with specific registers (pre-loaded regs) s8u resConsts s[2:3] // base resource constants s16u uavBuff s[4:7], paramsOffset s[8:11], groupId s12 v4b laneId v0 // v0 is pre-loaded with lane id // Setup Resource Constant s_load_dwordx4 s16u resConst1, resConsts, 0x68 // establish GlobalID s_buffer_load_dword s4u baseGlobalId,uavBuff, 0x18 s_buffer_load_dword s4u localSize, uavBuff, 0x04 s_waitcnt lgkmcnt(0) v_mov_b32 v4u vLocalSize, localSize v_mul_i32_i24 vLocalSize, groupId, vLocalSize v_add_i32 v4u localSizeIdx, vcc, laneId, vLocalSize v_add_i32 v4u vGlobalID, vcc, baseGlobalId, localSizeIdx v_lshlrev_b32 v4u vGlobalOffset, 2, vGlobalID // create a tool called 'ShowVar' to easily output a variable #define _ShowVar_(VarToPrint) \ v_mov_b32 v4u debugToPrint, VarToPrint;\ tbuffer_store_format_x debugToPrint, debugOffset,resConst1,_F32_ s_buffer_load_dword s4u param3Offset, paramsOffset, 0x08 s_waitcnt lgkmcnt(0) v_add_i32 v4u debugOffset, vcc, param3Offset, vGlobalOffset // Fetch the value that we want to multiply by 2. s_buffer_load_dword s4u sPara1Ptr, paramsOffset, 0x00 s_load_dwordx4 s16u resConst0, resConsts, 0x60 s_waitcnt lgkmcnt(0) v_add_i32 v4b baseOffset, vcc, sPara1Ptr, vGlobalOffset tbuffer_load_format_x v4b val, baseOffset, resConst0, _F32_ s_waitcnt vmcnt(0) // Fast 64-lane wavefront SUM reduction // Product, Avg, Min, Max can also be easily achieved with minor edits. v4f tmp; [[ for (int i = 2; i <7; i++) {]] ds_swizzle_b32 tmp, val, tmp, tmp offset1:[[= 1<<i ]] offset0:0b00011111 s_waitcnt lgkmcnt(0) v_add_f32 val, tmp, val // can also use v_min_f32, v_max_f32, or v_mul_f32 [[ } ]] v_readfirstlane_b32 s4u sum, val v_add_f32 val, sum, val v_readlane_b32 sum, val, 32 // Write the results back to memory. v_mov_b32 val, sum s_buffer_load_dword s4u param2Offset, paramsOffset, 0x04 s_waitcnt lgkmcnt(0) s_waitcnt vmcnt(0) v_add_i32 v4u dstOffset, vcc, param2Offset,vGlobalOffset free param2Offset // var is freed and register returned to pool. tbuffer_store_format_x val, dstOffset,resConst1,_F32_ _ShowVar_(val) // Exit the kernel s_endpgm };"; // https://visualstudiogallery.msdn.microsoft.com/46c0c49e-f825-454b-9f6a-48b216797eb5/view/Reviews/0?showReviewForm=True // Initialize OpenClWithGCN OpenClWithGCN gprog = new OpenClWithGCN(); OpenClEnvironment env = gprog.env; // use the default environment bool success = gprog.GcnCompile(source); Console.Write(env.lastMessage); if (!success) return; // Create some random data var random = new Random(3); const int count = 640000; const int dataSz = count * sizeof(float); float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray(); // Create a kernel from our modProgram Kernel kernel = env.program.CreateKernel("myAsmFunc"); // Allocate an input and output memory buffers Mem input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz); Mem output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz); Mem debug = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz); // Copy our host buffer of random values to input device buffer env.cmdQueue.EnqueueWriteBuffer(input, true, 0, dataSz, data); // Set the arguments to our kernel, and enqueue it for execution kernel.Arguments[0].SetValue(input); kernel.Arguments[1].SetValue(output); kernel.Arguments[2].SetValue(debug); // Enqueue and run the kernel. env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256); // Wait until all commands finish. env.cmdQueue.Finish(); // Read back the results. float[] results = new float[count]; env.cmdQueue.EnqueueReadBufferAndWait(output, results); float[] debugOutput = new float[count]; env.cmdQueue.EnqueueReadBufferAndWait(debug, debugOutput); // Print Debug information if (printDetails) for (int i = 0; i < 256; i++) Console.WriteLine("Debug: {0}: {1} -> {2}", i, data[i], debugOutput[i]); // Validate and print a brief summary detailing the results int correct = 0; for (int i = 0; i < (count / 64); i++) { float cpuSum = 0; for (int j = 0; j < 64; j++) cpuSum += data[i * 64 + j]; float gpuSum = results[i * 64]; if (cpuSum > gpuSum * 0.999999 && cpuSum < gpuSum * 1.000001) correct++; } Console.WriteLine("Computed {0}/{1} correct values!", correct, count / 64); Assert.AreEqual(correct, count / 64); }