/// <summary>
/// Binds the arguments of the <c>ckSetupAndCount</c> kernel, enqueues it on
/// <c>cqCommandQueue</c> and blocks until the queue has finished.
/// </summary>
/// <param name="input">Buffer bound to kernel argument 0.</param>
/// <param name="bitOffset">Value bound to kernel argument 3.</param>
private void SetupAndCount(CLMemoryHandle input, int bitOffset)
{
    ComputeErrorCode error;
    var ptrSize = (IntPtr)Marshal.SizeOf(typeof(Mem));

    // One-dimensional launch: numBlocks work-groups of numThreadsPerBlock work-items each.
    int localWorkSize = gpuConstants.numThreadsPerBlock;
    int globalWorkSize = localWorkSize * gpuConstants.numBlocks;
    IntPtr[] workGroupSizePtr = new IntPtr[] { (IntPtr)globalWorkSize };
    IntPtr[] localWorkGroupSizePtr = new IntPtr[] { (IntPtr)localWorkSize };
    ComputeEvent clevent;

    error = CL10.SetKernelArg(ckSetupAndCount, 0, ptrSize, input);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckSetupAndCount, 1, ptrSize, mCounters);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckSetupAndCount, 2, (IntPtr)Marshal.SizeOf(typeof(GPUConstants)), gpuConstants);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckSetupAndCount, 3, (IntPtr)sizeof(int), bitOffset);
    CheckErr(error, "CL10.SetKernelArg");

    error = CL10.EnqueueNDRangeKernel(cqCommandQueue, ckSetupAndCount, 1, null, workGroupSizePtr, localWorkGroupSizePtr, 0, null, out clevent);
    CheckErr(error, "CL10.EnqueueNDRangeKernel");
    error = CL10.Finish(cqCommandQueue);
    CheckErr(error, "CL10.Finish");

    if (!DEBUG)
    {
        return;
    }

    // Debug dump. The status of each read is now captured; previously the
    // return value was dropped and CheckErr re-tested the CL10.Finish result.
    ComputeEvent eve;
    error = CL10.EnqueueReadBuffer(cqCommandQueue, input, Bool.True, IntPtr.Zero, (IntPtr)(gpuConstants.numTotalElements * sizeof(int)), debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Setup and Count -> Input -> bitoffset = " + bitOffset);

    error = CL10.EnqueueReadBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * sizeof(int)), debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintCounterBuffer(debugRead, "Setup and Count -> bitoffset = " + bitOffset);

    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine("Setup and Count -> bitoffset = " + bitOffset);
        Console.WriteLine();
    }
}
/// <summary>
/// Waits until every command already enqueued on this queue has been issued to the
/// <see cref="ComputeCommandQueue.Device"/> and has finished executing.
/// </summary>
public void Finish()
{
    // Forward the native status straight to the exception helper.
    ComputeException.ThrowOnError(CL10.Finish(Handle));
}
/// <summary>
/// Waits until every command already enqueued on this queue has been issued to the
/// <see cref="OpenCLCommandQueue.Device"/> and has finished executing.
/// </summary>
public void Finish()
{
    // Forward the native status straight to the exception helper.
    OpenCLException.ThrowOnError(CL10.Finish(Handle));
}
/// <summary>
/// Binds the arguments of the <c>ckReorderingKeyValue</c> kernel, enqueues it on
/// <c>cqCommandQueue</c> and blocks until the queue has finished.
/// </summary>
/// <param name="inputKey">Buffer bound to kernel argument 0.</param>
/// <param name="outputKey">Buffer bound to kernel argument 1.</param>
/// <param name="inputValue">Buffer bound to kernel argument 2.</param>
/// <param name="outputValue">Buffer bound to kernel argument 3.</param>
/// <param name="bitOffset">Value bound to kernel argument 9.</param>
private void ReorderingKeyValue(CLMemoryHandle inputKey, CLMemoryHandle outputKey, CLMemoryHandle inputValue, CLMemoryHandle outputValue, int bitOffset)
{
    ComputeErrorCode error;
    var ptrSize = (IntPtr)Marshal.SizeOf(typeof(Mem));

    // One-dimensional launch: numBlocks work-groups of numThreadsPerBlock work-items each.
    int localWorkSize = gpuConstants.numThreadsPerBlock;
    int globalWorkSize = localWorkSize * gpuConstants.numBlocks;
    IntPtr[] workGroupSizePtr = new IntPtr[] { (IntPtr)globalWorkSize };
    IntPtr[] localWorkGroupSizePtr = new IntPtr[] { (IntPtr)localWorkSize };
    ComputeEvent clevent;

    error = CL10.SetKernelArg(ckReorderingKeyValue, 0, ptrSize, inputKey);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 1, ptrSize, outputKey);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 2, ptrSize, inputValue);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 3, ptrSize, outputValue);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 4, ptrSize, mCounters);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 5, ptrSize, mRadixPrefixes);
    CheckErr(error, "CL10.SetKernelArg");

    // Arguments 6 and 7 pass a byte size with a null value (no host data).
    error = CL10.SetKernelArg(ckReorderingKeyValue, 6, (IntPtr)(gpuConstants.numGroupsPerBlock * gpuConstants.numBlocks * gpuConstants.numRadicesPerBlock * 4), null);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 7, (IntPtr)(gpuConstants.numRadices * 4), null);
    CheckErr(error, "CL10.SetKernelArg");

    error = CL10.SetKernelArg(ckReorderingKeyValue, 8, (IntPtr)Marshal.SizeOf(typeof(GPUConstants)), gpuConstants);
    CheckErr(error, "CL10.SetKernelArg");
    error = CL10.SetKernelArg(ckReorderingKeyValue, 9, (IntPtr)sizeof(int), bitOffset);
    CheckErr(error, "CL10.SetKernelArg");

    error = CL10.EnqueueNDRangeKernel(cqCommandQueue, ckReorderingKeyValue, 1, null, workGroupSizePtr, localWorkGroupSizePtr, 0, null, out clevent);
    CheckErr(error, "CL10.EnqueueNDRangeKernel");
    error = CL10.Finish(cqCommandQueue);
    CheckErr(error, "CL10.Finish");

    if (!DEBUG)
    {
        return;
    }

    // Debug dump. The status of each read is now captured; previously the
    // return value was dropped and CheckErr re-tested the CL10.Finish result.
    ComputeEvent eve;
    IntPtr elementBytes = (IntPtr)(gpuConstants.numTotalElements * 4);

    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine("-------------------------------Reordering-------------------------------------------------");
        Console.WriteLine(" Input ");
    }
    error = CL10.EnqueueReadBuffer(cqCommandQueue, inputKey, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> Input -> bitoffset = " + bitOffset);

    error = CL10.EnqueueReadBuffer(cqCommandQueue, inputValue, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> InputValues -> bitoffset = " + bitOffset);

    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine(" Counters ");
    }
    error = CL10.EnqueueReadBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * sizeof(int)), debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintCounterBuffer(debugRead, "Reordering -> bitoffset = " + bitOffset);

    // NOTE(review): this header also says "Counters" although the buffer read
    // next is mRadixPrefixes; the emitted text is kept unchanged.
    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine(" Counters ");
    }
    error = CL10.EnqueueReadBuffer(cqCommandQueue, mRadixPrefixes, Bool.True, IntPtr.Zero, (IntPtr)(gpuConstants.numRadices * sizeof(int)), debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numRadices, "Reordering -> RadixPrefixe -> bitoffset = " + bitOffset);

    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine(" Output ");
    }
    error = CL10.EnqueueReadBuffer(cqCommandQueue, outputKey, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> Output -> bitoffset = " + bitOffset);

    error = CL10.EnqueueReadBuffer(cqCommandQueue, outputValue, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0, null, out eve);
    CheckErr(error, "CL10.EnqueueReadBuffer");
    PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> OutputValue -> bitoffset = " + bitOffset);

    if (DEBUG_CONSOLE_OUTPUT)
    {
        Console.WriteLine("Reordering -> bitoffset = " + bitOffset);
        Console.WriteLine();
    }
}
/// <summary>
/// Runs eight passes (bit offsets 0, 4, ..., 28) of <c>SetupAndCount</c>, <c>SumIt</c> and
/// <c>ReorderingKeyValue</c> over <paramref name="key"/> and <paramref name="value"/>,
/// alternating between the caller's buffers and two temporary buffers, and leaves the
/// result of the last pass in <paramref name="key"/> / <paramref name="value"/>.
/// </summary>
/// <param name="key">Device buffer holding the keys; read and overwritten.</param>
/// <param name="value">Device buffer holding the values; read and overwritten.</param>
/// <param name="numElements">Number of elements in each buffer.</param>
public void sortKeysValue(CLMemoryHandle key, CLMemoryHandle value, int numElements)
{
    const int passCount = 8;
    const int bitsPerPass = 4;

    debugRead = new int[Math.Max(numElements, numCounters)];
    ComputeErrorCode error;
    ComputeEvent eve;

    mCounters = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, gpuConstants.numGroupsPerBlock * gpuConstants.numRadices * gpuConstants.numBlocks * sizeof(int), out error);
    CheckErr(error, "CL10.CreateBuffer");
    mRadixPrefixes = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, gpuConstants.numRadices * sizeof(int), out error);
    CheckErr(error, "CL10.CreateBuffer");

    // Temporary ping-pong targets for the reorder kernel.
    // NOTE(review): values are allocated and copied at 8 bytes per element, keys at 4 —
    // confirm against the kernel's value type.
    CLMemoryHandle outputValue = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, (IntPtr)(8 * numElements), out error);
    CheckErr(error, "CL10.CreateBuffer");
    CLMemoryHandle outputKey = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, (IntPtr)(4 * numElements), out error);
    CheckErr(error, "CL10.CreateBuffer");

    gpuConstants.numElementsPerGroup = (numElements / (gpuConstants.numBlocks * gpuConstants.numGroupsPerBlock)) + 1;
    gpuConstants.numTotalElements = numElements;

    // Tracks which buffer pair the most recent pass wrote to.
    bool resultInTemporaries = false;

    for (int pass = 0; pass < passCount; pass++)
    {
        int bitOffset = bitsPerPass * pass;

        // Re-upload the host-side counters array before every pass.
        error = CL10.EnqueueWriteBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * 4), counters, 0, null, out eve);
        CheckErr(error, "CL10.EnqueueWriteBuffer Counter initialize");

        if (pass % 2 == 0)
        {
            SetupAndCount(key, bitOffset);
            SumIt(key, bitOffset);
            ReorderingKeyValue(key, outputKey, value, outputValue, bitOffset);
            resultInTemporaries = true;
        }
        else
        {
            SetupAndCount(outputKey, bitOffset);
            SumIt(outputKey, bitOffset);
            ReorderingKeyValue(outputKey, key, outputValue, value, bitOffset);
            resultInTemporaries = false;
        }
    }

    // Copy back only when the last pass wrote into the temporaries. The previous
    // test used the post-loop counter (8), which is always even, so the final pass's
    // output in key/value was overwritten with the preceding pass's output.
    if (resultInTemporaries)
    {
        error = CL10.EnqueueCopyBuffer(cqCommandQueue, outputKey, key, IntPtr.Zero, IntPtr.Zero, (IntPtr)(numElements * 4), 0, null, out eve);
        CheckErr(error, "CL10.EnqueueCopyBuffer");
        error = CL10.Finish(cqCommandQueue);
        CheckErr(error, "CL10.Finish Copybuffer");

        error = CL10.EnqueueCopyBuffer(cqCommandQueue, outputValue, value, IntPtr.Zero, IntPtr.Zero, (IntPtr)(numElements * 8), 0, null, out eve);
        CheckErr(error, "CL10.EnqueueCopyBuffer");
        error = CL10.Finish(cqCommandQueue);
        CheckErr(error, "CL10.Finish Copybuffer");
    }

    error = CL10.ReleaseMemObject(outputKey);
    CheckErr(error, "CL10.ReleaseMemObj");
    error = CL10.ReleaseMemObject(outputValue);
    CheckErr(error, "CL10.ReleaseMemObj");
    error = CL10.ReleaseMemObject(mRadixPrefixes);
    CheckErr(error, "CL10.ReleaseMemObj");
    error = CL10.ReleaseMemObject(mCounters);
    CheckErr(error, "CL10.ReleaseMemObj");

    Log_Idx++;
}
/// <summary>
/// Runs eight passes (bit offsets 0, 4, ..., 28) of <c>SetupAndCount</c>, <c>SumIt</c> and
/// <c>ReorderingKeysOnly</c>, alternating between <paramref name="input"/> and
/// <paramref name="output"/>, and leaves the result of the last pass in
/// <paramref name="output"/>.
/// </summary>
/// <param name="input">Device buffer holding the keys; also used as a ping-pong target, so it is overwritten.</param>
/// <param name="output">Device buffer that receives the result of the last pass.</param>
/// <param name="numElements">Number of elements in each buffer.</param>
public void sortKeysOnly(CLMemoryHandle input, CLMemoryHandle output, int numElements)
{
    const int passCount = 8;
    const int bitsPerPass = 4;

    debugRead = new int[Math.Max(numElements, numCounters)];
    ComputeErrorCode error;
    ComputeEvent eve;

    mCounters = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, gpuConstants.numGroupsPerBlock * gpuConstants.numRadices * gpuConstants.numBlocks * sizeof(int), out error);
    CheckErr(error, "CL10.CreateBuffer");
    mRadixPrefixes = CL10.CreateBuffer(cxGPUContext, ComputeMemoryFlags.ReadWrite, gpuConstants.numRadices * sizeof(int), out error);
    CheckErr(error, "CL10.CreateBuffer");

    gpuConstants.numElementsPerGroup = (numElements / (gpuConstants.numBlocks * gpuConstants.numGroupsPerBlock)) + 1;
    gpuConstants.numTotalElements = numElements;

    if (DEBUG)
    {
        // Capture the read status; previously CheckErr re-tested the CreateBuffer result.
        error = CL10.EnqueueReadBuffer(cqCommandQueue, input, Bool.True, IntPtr.Zero, (IntPtr)(gpuConstants.numTotalElements * 4), debugRead, 0, null, out eve);
        CheckErr(error, "CL10.EnqueueReadBuffer");
        PrintAsArray(debugRead, gpuConstants.numTotalElements);
    }

    // Tracks which buffer the most recent pass wrote to.
    bool resultInInput = true;

    for (int pass = 0; pass < passCount; pass++)
    {
        int bitOffset = bitsPerPass * pass;

        // Re-upload the host-side counters array before every pass.
        error = CL10.EnqueueWriteBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * 4), counters, 0, null, out eve);
        CheckErr(error, "CL10.EnqueueWriteBuffer Counter initialize");

        if (pass % 2 == 0)
        {
            // Even passes report per-stage wall time when console debugging is on.
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            SetupAndCount(input, bitOffset);
            if (DEBUG_CONSOLE_OUTPUT)
            {
                Console.WriteLine("Setup and Count =" + timer.Elapsed.TotalMilliseconds);
            }

            timer = System.Diagnostics.Stopwatch.StartNew();
            SumIt(input, bitOffset);
            if (DEBUG_CONSOLE_OUTPUT)
            {
                Console.WriteLine("SumIt =" + timer.Elapsed.TotalMilliseconds);
            }

            timer = System.Diagnostics.Stopwatch.StartNew();
            ReorderingKeysOnly(input, output, bitOffset);
            if (DEBUG_CONSOLE_OUTPUT)
            {
                Console.WriteLine("Reorder =" + timer.Elapsed.TotalMilliseconds);
            }

            resultInInput = false;
        }
        else
        {
            SetupAndCount(output, bitOffset);
            SumIt(output, bitOffset);
            ReorderingKeysOnly(output, input, bitOffset);
            resultInInput = true;
        }
    }

    // Copy only when the last pass wrote into input. The previous test used the
    // post-loop counter (8), so the copy never ran and output kept the result of the
    // pass before the last one.
    if (resultInInput)
    {
        error = CL10.EnqueueCopyBuffer(cqCommandQueue, input, output, IntPtr.Zero, IntPtr.Zero, (IntPtr)(numElements * 4), 0, null, out eve);
        CheckErr(error, "CL10.EnqueueCopyBuffer");
        error = CL10.Finish(cqCommandQueue);
        CheckErr(error, "CL10.Finish Copybuffer");
    }

    error = CL10.ReleaseMemObject(mRadixPrefixes);
    CheckErr(error, "CL10.ReleaseMemObj");
    error = CL10.ReleaseMemObject(mCounters);
    CheckErr(error, "CL10.ReleaseMemObj");

    Log_Idx++;
}