public void Run()
        {
            Console.WriteLine("================== " + GetType().Name + "================== ");

            string source = @"
                __kernel void myOpenClFunc ( __global float* cl_input, __global float* cl_output ) 
                { 
                    size_t i = get_global_id(0);
                    cl_output[i] = cl_input[i] + cl_input[i];
                }; ";


            /************ Initialize OpenClWithGCN    ***************************************/
            OpenClWithGCN gprog = new OpenClWithGCN();
            OpenClEnvironment env = gprog.env;
            bool success = gprog.GcnCompile(source);
            Console.Write(env.lastMessage);
            if (!success)
            {
                Assert.Fail();
                return;
            }

            /************ Create some random data   *******************************************/
            // create some random data for testing
            var random = new Random();
            const int count = 1024 * 1024;
            const int dataSz = count * sizeof(float);
            float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray();

            /************ Build and run the kernel  *******************************************/
            Kernel kernel = env.program.CreateKernel("myOpenClFunc");
            Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz);
            Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz);
            env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data);
            kernel.Arguments[0].SetValue(cl_input);
            kernel.Arguments[1].SetValue(cl_output);
            env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256);
            env.cmdQueue.Finish();

            /************ Read back and Validate the results ***********************************/
            float[] results = new float[count];
            env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz);
            int correct = Enumerable.Range(0, count).Where(i => results[i] == data[i] * 2).Count();
            Console.WriteLine("{0} - Computed {1}/{2} correct values!",
                correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString());

            Assert.AreEqual(correct, count);
        }
Esempio n. 2
0
        public void Run()
        {
            Console.WriteLine("================== " + GetType().Name + "================== ");

            string source = @"
    __asm4GCN myAsmFunc ( float*, float*)
    {
    // ======== Pre-Loaded Registers =========
    // s[2:3] - PTR_UAV_TABLE
    // s[2:3] +0x60 - base_resource_const1(#T)
    // s[2:3] +0x68 - base_resource_const2(#T)
    // s[4:7] - IMM_CONST_BUFFER0
    // s[4:7] +0x00 - Grid Size
    // s[4:7] +0x04 - Local Size
    // s[4:7] +0x18 - baseGlobalId
    // s[8:11] -IMM_CONST_BUFFER1
    // s[8:11]+0x00 - param1 offset
    // s[8:11]+0x04 - param2 offset 
    // s[8:11]+0x08 - param3 offset
    // s1     - threadgroupId
    // s12    - groupId
    // v0     - laneId
  
    // Use a #define to shorten tbuffer_load instruction
    #define _F32_ 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT]
  
    // Variable decelerations with specific registers (pre-loaded regs)
    s8u  resConsts s[2:3]   // base resource constants
    s16u uavBuff   s[4:7], paramsOffset s[8:11], groupId s12
    v4b  laneId    v0       // v0 is pre-loaded with lane id
  
    // Setup Resource Constant
    s_load_dwordx4 s16u resConst1, resConsts, 0x68             
  
    // establish GlobalID
    s_buffer_load_dword s4u baseGlobalId,uavBuff, 0x18 
    s_buffer_load_dword s4u localSize, uavBuff, 0x04  
    s_waitcnt     lgkmcnt(0)                      
    v_mov_b32     v4u vLocalSize, localSize                  
    v_mul_i32_i24     vLocalSize, groupId, vLocalSize           
    v_add_i32     v4u localSizeIdx, vcc, laneId, vLocalSize 
    v_add_i32     v4u vGlobalID, vcc, baseGlobalId, localSizeIdx
    v_lshlrev_b32 v4u vGlobalOffset, 2, vGlobalID
 
    // Fetch the value that we want to multiply by 2.
    s_buffer_load_dword s4u sPara1Ptr, paramsOffset, 0x00         
    s_load_dwordx4 s16u resConst0, resConsts, 0x60
    s_waitcnt     lgkmcnt(0)
    v_add_i32     v4b baseOffset, vcc, sPara1Ptr, vGlobalOffset
    tbuffer_load_format_x v4b val, baseOffset, resConst0, _F32_
    s_waitcnt     vmcnt(0)                              

    // Perform the 'sum = val * 2'
    v_mul_f32 v4f sum, 2.0, val

    // Write the results back to memory.
    s_buffer_load_dword s4u param2Offset, paramsOffset, 0x04
    s_waitcnt lgkmcnt(0)
    s_waitcnt     vmcnt(0)
    v_add_i32     v4u dstOffset, vcc, param2Offset, vGlobalOffset
    free param2Offset  // var is freed and register returned to pool.
    tbuffer_store_format_x sum, dstOffset, resConst1, _F32_

    // Exit the kernel
    s_endpgm
    }";


            /************ Initialize OpenClWithGCN    ***************************************/
            OpenClWithGCN gprog = new OpenClWithGCN();
            OpenClEnvironment env = gprog.env;
            bool success = gprog.GcnCompile(source);
            Console.Write(env.lastMessage);
            if (!success)
            {
                Assert.Fail();
                return;
            }

            /************ Create some random data   *******************************************/
            // create some random data for testing
            var random = new Random();
            const int count = 1024 * 1024;
            const int dataSz = count * sizeof(float);
            float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray();

            /************ Build and run the kernel  *******************************************/
            Kernel kernel = env.program.CreateKernel("myAsmFunc");
            Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz);
            Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz);
            env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data);
            kernel.Arguments[0].SetValue(cl_input);
            kernel.Arguments[1].SetValue(cl_output);
            env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256);
            env.cmdQueue.Finish();

            /************ Read back and Validate the results ***********************************/
            float[] results = new float[count];
            env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz);
            int correct = Enumerable.Range(0,count).Where(i=>results[i]==data[i]*2).Count();
            Console.WriteLine("{0} - Computed {1}/{2} correct values!",
                correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString());

            Assert.AreEqual(correct, count);
        }
Esempio n. 3
0
        public void Run()
        {
            // The example below is an OpenCL example by Derek Gerstmann(UWA). It's been modified for NOpenCL use.
            Console.WriteLine("================== " + GetType().Name + "================== ");

            /************ Create and build a program from our OpenCL-C source code ***************/
            // const string source = GCN_NS.Code.DevCode;

            string source = @"
                __asm4GCN myAsmFunc ( float*, float*)
                {
                  #define _ZeroInHex_ 0x00
                  #define _32Float_   0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT]
                                                                      // Loc| Binary |BinaryExt
                  s_buffer_load_dword  s0, s[4:7], 0x04               // 000|C2000504|
                  s_buffer_load_dword  s1, s[4:7], 0x18               // 004|C2008518|
                  s_waitcnt     lgkmcnt(0)                            // 008|BF8C007F|
                  s_min_u32     s0, s0, 0x0000ffff                    // 00C|8380FF00|0000FFFF
                  s_buffer_load_dword  s4, s[8:11], _ZeroInHex_       // 014|C2020900|
                  v_mov_b32     v1, s0                                // 018|7E020200|
                  v_mul_i32_i24 v1, s12, v1                           // 01C|1202020C|
                  v_add_i32     v0, vcc, v0, v1                       // 020|4A000300|
                  v_add_i32     v0, vcc, s1, v0                       // 024|4A000001|
                  v_lshlrev_b32 v0, 2, v0                             // 028|34000082|
                  s_load_dwordx4 s[12:15], s[2:3], 0x60               // 02C|C0860360|
                  s_waitcnt     lgkmcnt(0)                            // 030|BF8C007F|
                  v_add_i32     v1, vcc, s4, v0                       // 034|4A020004|
                  tbuffer_load_format_x  v1, v1, s[12:15], _32Float_  // 038|EBA01000|80030101
                  s_buffer_load_dword  s0, s[8:11], 0x04              // 040|C2000904|
                  s_load_dwordx4 s[4:7], s[2:3], 0x68                 // 044|C0820368|
                  s_waitcnt     lgkmcnt(0)                            // 048|BF8C007F|
                  v_add_i32     v0, vcc, s0, v0                       // 04C|4A000000|
                  s_waitcnt     vmcnt(0)                              // 050|BF8C0F70|
                  v_add_f32     v1, v1, v1                            // 054|10020301|
                  tbuffer_store_format_x  v1, v0, s[4:7], _32Float_   // 058|EBA41000|80010100
                  s_endpgm                                            // 060|BF810000|
                };

                // This is not used here but is what generates above when de-assembled.
                // __kernel and __asm4GCN blocks can be used in the same clProgram
                __kernel void myOpenClFunc ( __global float* cl_input, __global float* cl_output ) 
                { 
                    size_t i = get_global_id(0);
                    cl_output[i] = cl_input[i] + cl_input[i];
                }; ";

            /************ Initialize OpenClWithGCN    ***************************************/
            // OpenClEnvironment env = SetupOpenClEnvironment(); // for manual setup
            OpenClWithGCN gprog = new OpenClWithGCN();
            OpenClEnvironment env = gprog.env;  // Let just use the default environment
            bool success = gprog.GcnCompile(source);
            Console.Write(env.lastMessage);
            if (!success)
            {
                Assert.Fail();
                return;
            }

            /************ Create some random data   *******************************************/
            // create some random data for testing
            var random = new Random();
            const int count = 1024 * 1024;
            const int dataSz = count * sizeof(float);
            float[] data = (from i in Enumerable.Range(0, count) select (float)random.NextDouble()).ToArray();

            /************ Create a kernel from our modProgram    *******************************/
            Kernel kernel = env.program.CreateKernel("myAsmFunc");

            /************ Allocate cl_input, and fill with data ********************************/
            // OpenCL: cl_mem cl_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSz, host_ptr, NULL);
            Mem cl_input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz);

            /************ Create an cl_output memory buffer for our results    *****************/
            // OpenCL: cl_mem cl_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSz, host_ptr, NULL);
            Mem cl_output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz);

            /************ Copy our host buffer of random values to cl_input device buffer ******/
            // OpenCL: clEnqueueWriteBuffer(cmdQueue, cl_input, CL_TRUE, 0, dataSz, data, 0, NULL, NULL);
            env.cmdQueue.EnqueueWriteBuffer(cl_input, true, 0, dataSz, data);

            /************ Set the arguments to our kernel, and enqueue it for execution ********/
            // OpenCL: clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_input);
            kernel.Arguments[0].SetValue(cl_input);
            kernel.Arguments[1].SetValue(cl_output);

            /************ Enqueue and run the kernel *******************************************/
            // OpenCL: clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &globalMem  &localMem, 0, NULL,NULL);
            env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256);

            /************ Force command queue to get processed, wait until all commands finish **/
            env.cmdQueue.Finish();

            /************ Read back the results ************************************************/
            // OpenCL: clEnqueueReadBuffer(cmdQueue, cl_output, CL_TRUE, 0, dataSz, results, 0, NULL, NULL); 
            float[] results = new float[count];
            env.cmdQueue.EnqueueReadBufferAndWait(cl_output, results, dataSz);

            /************ Validate our results *************************************************/
            int correct = 0;
            for (int i = 0; i < count; i++)
                correct += (results[i] == data[i] + data[i]) ? 1 : 0;
            // int correct = Enumerable.Range(0,count).Where(i=>results[i]==data[i]*2).Count();

            /************ Print a brief summary detailing the results **************************/
            Console.WriteLine("{0} - Computed {1}/{2} correct values!",
                correct == count ? "PASS" : "FAIL", correct.ToString(), count.ToString());

            Assert.AreEqual(correct, count);

        }
        public void Run()
        {
            Console.WriteLine("================== " + GetType().Name + "================== ");

            string source = @"__asm4GCN myAsmFunc (float*,float*,float*)
{
  // ======== Pre-Loaded Registers =========
  // s[2:3] - PTR_UAV_TABLE
  // s[2:3] +0x60 - base_resource_const1(#T)
  // s[2:3] +0x68 - base_resource_const2(#T)
  // s[4:7] - IMM_CONST_BUFFER0
  // s[4:7] +0x00 - Grid Size
  // s[4:7] +0x04 - Local Size
  // s[4:7] +0x18 - baseGlobalId
  // s[8:11] -IMM_CONST_BUFFER1
  // s[8:11]+0x00 - param1 offset
  // s[8:11]+0x04 - param2 offset 
  // s[8:11]+0x08 - param3 offset
  // s1     - threadgroupId
  // s12    - groupId
  // v0     - laneId
  
  // Use a #define to shorten tbuffer_load instruction
  #define _F32_ 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT]
  
  // Variable decelerations with specific registers (pre-loaded regs)
  s8u  resConsts s[2:3]   // base resource constants
  s16u uavBuff   s[4:7], paramsOffset s[8:11], groupId s12
  v4b  laneId    v0       // v0 is pre-loaded with lane id
  
  // Setup Resource Constant
  s_load_dwordx4 s16u resConst1, resConsts, 0x68             
  
  // establish GlobalID
  s_buffer_load_dword s4u baseGlobalId,uavBuff, 0x18 
  s_buffer_load_dword s4u localSize, uavBuff, 0x04  
  s_waitcnt     lgkmcnt(0)                      
  v_mov_b32     v4u vLocalSize, localSize                  
  v_mul_i32_i24     vLocalSize, groupId, vLocalSize           
  v_add_i32     v4u localSizeIdx, vcc, laneId, vLocalSize 
  v_add_i32     v4u vGlobalID, vcc, baseGlobalId, localSizeIdx
  v_lshlrev_b32 v4u vGlobalOffset, 2, vGlobalID

  // create a tool called 'ShowVar' to easily output a variable
  #define _ShowVar_(VarToPrint) \
    v_mov_b32 v4u debugToPrint, VarToPrint;\
    tbuffer_store_format_x debugToPrint, debugOffset,resConst1,_F32_ 
  s_buffer_load_dword s4u param3Offset, paramsOffset, 0x08
  s_waitcnt     lgkmcnt(0)                      
  v_add_i32 v4u debugOffset, vcc, param3Offset, vGlobalOffset
  
  // Fetch the value that we want to multiply by 2.
  s_buffer_load_dword s4u sPara1Ptr, paramsOffset, 0x00         
  s_load_dwordx4 s16u resConst0, resConsts, 0x60
  s_waitcnt     lgkmcnt(0) 
  v_add_i32     v4b baseOffset, vcc, sPara1Ptr, vGlobalOffset
  tbuffer_load_format_x v4b val, baseOffset, resConst0, _F32_
  s_waitcnt     vmcnt(0)                              

  // Fast 64-lane wavefront SUM reduction
  // Product, Avg, Min, Max can also be easily achieved with minor edits.
  v4f tmp;
  [[ for (int i = 2; i <7; i++) {]]
    ds_swizzle_b32 tmp, val, tmp, tmp offset1:[[= 1<<i ]] offset0:0b00011111
    s_waitcnt     lgkmcnt(0)                      
    v_add_f32 val, tmp, val  // can also use v_min_f32, v_max_f32, or v_mul_f32
  [[ } ]]
  v_readfirstlane_b32 s4u sum, val  
  v_add_f32 val, sum, val
  v_readlane_b32 sum, val, 32

  
  // Write the results back to memory.
  v_mov_b32 val, sum
  s_buffer_load_dword s4u param2Offset, paramsOffset, 0x04
  s_waitcnt     lgkmcnt(0)                            
  s_waitcnt     vmcnt(0)                              
  v_add_i32     v4u dstOffset, vcc, param2Offset,vGlobalOffset
  free param2Offset  // var is freed and register returned to pool.
  tbuffer_store_format_x val, dstOffset,resConst1,_F32_ 
  
  _ShowVar_(val)
  
  // Exit the kernel
  s_endpgm                                            
};";


            // https://visualstudiogallery.msdn.microsoft.com/46c0c49e-f825-454b-9f6a-48b216797eb5/view/Reviews/0?showReviewForm=True

            // Initialize OpenClWithGCN 
            OpenClWithGCN gprog = new OpenClWithGCN();
            OpenClEnvironment env = gprog.env;  // use the default environment
            bool success = gprog.GcnCompile(source);
            Console.Write(env.lastMessage);
            if (!success) return;

            // Create some random data  
            var random = new Random(3);
            const int count = 640000;
            const int dataSz = count * sizeof(float);
            float[] data = (from i in Enumerable.Range(0, count)
                            select (float)random.NextDouble()).ToArray();

            // Create a kernel from our modProgram   
            Kernel kernel = env.program.CreateKernel("myAsmFunc");

            // Allocate an input and output memory buffers
            Mem input = env.context.CreateBuffer(MemoryFlags.ReadOnly, dataSz);
            Mem output = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz);
            Mem debug = env.context.CreateBuffer(MemoryFlags.WriteOnly, dataSz);

            // Copy our host buffer of random values to input device buffer 
            env.cmdQueue.EnqueueWriteBuffer(input, true, 0, dataSz, data);

            // Set the arguments to our kernel, and enqueue it for execution 
            kernel.Arguments[0].SetValue(input);
            kernel.Arguments[1].SetValue(output);
            kernel.Arguments[2].SetValue(debug);

            // Enqueue and run the kernel.
            env.cmdQueue.EnqueueNDRangeKernel(kernel, count, 256);

            // Wait until all commands finish.
            env.cmdQueue.Finish();

            // Read back the results.
            float[] results = new float[count];
            env.cmdQueue.EnqueueReadBufferAndWait(output, results);
            float[] debugOutput = new float[count];
            env.cmdQueue.EnqueueReadBufferAndWait(debug, debugOutput);

            // Print Debug information
            if (printDetails)
                for (int i = 0; i < 256; i++)
                    Console.WriteLine("Debug: {0}: {1} -> {2}", i, data[i], debugOutput[i]);

            // Validate and print a brief summary detailing the results
            int correct = 0;
            for (int i = 0; i < (count / 64); i++)
            {
                float cpuSum = 0;
                for (int j = 0; j < 64; j++)
                    cpuSum += data[i * 64 + j];

                float gpuSum = results[i * 64];

                if (cpuSum > gpuSum * 0.999999
                    && cpuSum < gpuSum * 1.000001)
                    correct++;
            }
            Console.WriteLine("Computed {0}/{1} correct values!", correct, count / 64);
            Assert.AreEqual(correct, count / 64);
        }