Ejemplo n.º 1
0
        /// <summary>
        /// Binds the four arguments of the <c>ckSetupAndCount</c> kernel, enqueues it as a
        /// one-dimensional range on <c>cqCommandQueue</c> and waits for the queue to finish.
        /// When <c>DEBUG</c> is set, the input and counter buffers are read back and printed.
        /// </summary>
        /// <param name="input"> Memory object bound to kernel argument 0. </param>
        /// <param name="bitOffset"> Value bound to kernel argument 3. </param>
        private void SetupAndCount(CLMemoryHandle input, int bitOffset)
        {
            ComputeErrorCode error;
            var ptrSize = (IntPtr)Marshal.SizeOf(typeof(Mem));

            int globalWorkSize = gpuConstants.numThreadsPerBlock * gpuConstants.numBlocks;
            int localWorkSize  = gpuConstants.numThreadsPerBlock;

            IntPtr[]     workGroupSizePtr      = new IntPtr[] { (IntPtr)globalWorkSize };
            IntPtr[]     localWorkGroupSizePtr = new IntPtr[] { (IntPtr)localWorkSize };
            ComputeEvent clevent;

            error = CL10.SetKernelArg(ckSetupAndCount, 0, ptrSize, input);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckSetupAndCount, 1, ptrSize, mCounters);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckSetupAndCount, 2, (IntPtr)(Marshal.SizeOf(typeof(GPUConstants))), gpuConstants);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckSetupAndCount, 3, (IntPtr)sizeof(int), bitOffset);
            CheckErr(error, "CL10.SetKernelArg");

            error = CL10.EnqueueNDRangeKernel(cqCommandQueue, ckSetupAndCount, 1, null, workGroupSizePtr, localWorkGroupSizePtr, 0, null, out clevent);
            CheckErr(error, "CL10.EnqueueNDRangeKernel");

            error = CL10.Finish(cqCommandQueue);
            CheckErr(error, "CL10.Finish");

            if (DEBUG)
            {
                ComputeEvent eve;

                // Capture each read's own status; previously the return value was dropped and the
                // stale result of CL10.Finish was checked instead, hiding read-back failures.
                error = CL10.EnqueueReadBuffer(cqCommandQueue, input, Bool.True, IntPtr.Zero, (IntPtr)(gpuConstants.numTotalElements * 4), debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Setup and Count -> Input  -> bitoffset = " + bitOffset);

                error = CL10.EnqueueReadBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * sizeof(int)), debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintCounterBuffer(debugRead, "Setup and Count -> bitoffset = " + bitOffset);

                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("Setup and Count -> bitoffset = " + bitOffset);
                    Console.WriteLine();
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Enqueues a command that runs a <see cref="ComputeKernel"/> in parallel over a range of work-items.
        /// </summary>
        /// <param name="kernel"> The <see cref="ComputeKernel"/> to run. </param>
        /// <param name="globalWorkOffset"> Per-dimension offsets used to calculate the global ID of a work-item, so that global IDs need not start at (0, 0,... 0). </param>
        /// <param name="globalWorkSize"> Per-dimension number of global work-items that will execute the kernel function. Its length is used as the number of work dimensions; the total number of global work-items is global_work_size[0] *...* global_work_size[work_dim - 1]. </param>
        /// <param name="localWorkSize"> Per-dimension number of work-items that make up a work-group executing the <paramref name="kernel"/>. The total number of work-items in a work-group is local_work_size[0] *... * local_work_size[work_dim - 1]. </param>
        /// <param name="events"> Events that must complete before this command can be executed. When <paramref name="events"/> is neither <c>null</c> nor read-only, a new <see cref="ComputeEvent"/> identifying this command is created and added to the collection. </param>
        public void Execute(ComputeKernel kernel, long[] globalWorkOffset, long[] globalWorkSize, long[] localWorkSize, ICollection <ComputeEventBase> events)
        {
            int waitCount;
            CLEventHandle[] waitHandles = ComputeTools.ExtractHandles(events, out waitCount);

            // A completion event is only requested when the caller's collection can accept it.
            CLEventHandle[] completionHandle = null;
            if (events != null && !events.IsReadOnly)
            {
                completionHandle = new CLEventHandle[1];
            }

            ComputeErrorCode status = CL10.EnqueueNDRangeKernel(
                Handle,
                kernel.Handle,
                globalWorkSize.Length,
                ComputeTools.ConvertArray(globalWorkOffset),
                ComputeTools.ConvertArray(globalWorkSize),
                ComputeTools.ConvertArray(localWorkSize),
                waitCount,
                waitHandles,
                completionHandle);
            ComputeException.ThrowOnError(status);

            if (completionHandle == null)
            {
                return;
            }

            events.Add(new ComputeEvent(completionHandle[0], this));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Enqueues a command that runs an <see cref="OpenCLKernel"/> in parallel over a range of work-items.
        /// </summary>
        /// <param name="kernel"> The <see cref="OpenCLKernel"/> to run. </param>
        /// <param name="globalWorkOffset"> Per-dimension offsets used to calculate the global ID of a work-item, so that global IDs need not start at (0, 0,... 0). </param>
        /// <param name="globalWorkSize"> Per-dimension number of global work-items that will execute the kernel function. Its length is used as the number of work dimensions; the total number of global work-items is global_work_size[0] *...* global_work_size[work_dim - 1]. </param>
        /// <param name="localWorkSize"> Per-dimension number of work-items that make up a work-group executing the <paramref name="kernel"/>. The total number of work-items in a work-group is local_work_size[0] *... * local_work_size[work_dim - 1]. </param>
        /// <param name="events"> Events that must complete before this command can be executed. This list is only read. </param>
        /// <param name="newEvents"> Optional output list. When it is not <c>null</c>, a new <see cref="OpenCLEvent"/> identifying this command is created and added to it while holding a lock on the list instance. </param>
        public void Execute(OpenCLKernel kernel, long[] globalWorkOffset, long[] globalWorkSize, long[] localWorkSize, IReadOnlyList <OpenCLEventBase> events = null, IList <OpenCLEventBase> newEvents = null)
        {
            int waitCount;
            CLEventHandle[] waitHandles = OpenCLTools.ExtractHandles(events, out waitCount);

            // A completion event is only requested when the caller supplied somewhere to store it.
            CLEventHandle[] completionHandle = null;
            if (newEvents != null)
            {
                completionHandle = new CLEventHandle[1];
            }

            OpenCLErrorCode status = CL10.EnqueueNDRangeKernel(
                Handle,
                kernel.Handle,
                globalWorkSize.Length,
                OpenCLTools.ConvertArray(globalWorkOffset),
                OpenCLTools.ConvertArray(globalWorkSize),
                OpenCLTools.ConvertArray(localWorkSize),
                waitCount,
                waitHandles,
                completionHandle);
            OpenCLException.ThrowOnError(status);

            if (newEvents == null)
            {
                return;
            }

            lock (newEvents)
            {
                newEvents.Add(new OpenCLEvent(completionHandle[0], this));
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Binds the ten arguments of the <c>ckReorderingKeyValue</c> kernel, enqueues it as a
        /// one-dimensional range on <c>cqCommandQueue</c> and waits for the queue to finish.
        /// When <c>DEBUG</c> is set, the input, counter, radix-prefix and output buffers are
        /// read back and printed.
        /// </summary>
        /// <param name="inputKey"> Memory object bound to kernel argument 0. </param>
        /// <param name="outputKey"> Memory object bound to kernel argument 1. </param>
        /// <param name="inputValue"> Memory object bound to kernel argument 2. </param>
        /// <param name="outputValue"> Memory object bound to kernel argument 3. </param>
        /// <param name="bitOffset"> Value bound to kernel argument 9. </param>
        private void ReorderingKeyValue(CLMemoryHandle inputKey, CLMemoryHandle outputKey, CLMemoryHandle inputValue, CLMemoryHandle outputValue, int bitOffset)
        {
            ComputeErrorCode error;
            var ptrSize = (IntPtr)Marshal.SizeOf(typeof(Mem));

            int globalWorkSize = gpuConstants.numThreadsPerBlock * gpuConstants.numBlocks;
            int localWorkSize  = gpuConstants.numThreadsPerBlock;

            IntPtr[]     workGroupSizePtr      = new IntPtr[] { (IntPtr)globalWorkSize };
            IntPtr[]     localWorkGroupSizePtr = new IntPtr[] { (IntPtr)localWorkSize };
            ComputeEvent clevent;

            error = CL10.SetKernelArg(ckReorderingKeyValue, 0, ptrSize, inputKey);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 1, ptrSize, outputKey);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 2, ptrSize, inputValue);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 3, ptrSize, outputValue);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 4, ptrSize, mCounters);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 5, ptrSize, mRadixPrefixes);
            CheckErr(error, "CL10.SetKernelArg");

            // Arguments 6 and 7 pass a byte size with a null value; presumably this follows the
            // clSetKernelArg convention for sizing local-memory arguments - confirm against the kernel.
            error = CL10.SetKernelArg(ckReorderingKeyValue, 6, (IntPtr)(gpuConstants.numGroupsPerBlock * gpuConstants.numBlocks * gpuConstants.numRadicesPerBlock * 4), null);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 7, (IntPtr)(gpuConstants.numRadices * 4), null);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 8, (IntPtr)(Marshal.SizeOf(typeof(GPUConstants))), gpuConstants);
            CheckErr(error, "CL10.SetKernelArg");
            error = CL10.SetKernelArg(ckReorderingKeyValue, 9, (IntPtr)sizeof(int), bitOffset);
            CheckErr(error, "CL10.SetKernelArg");

            error = CL10.EnqueueNDRangeKernel(cqCommandQueue, ckReorderingKeyValue, 1, null, workGroupSizePtr, localWorkGroupSizePtr, 0, null, out clevent);
            CheckErr(error, "CL10.EnqueueNDRangeKernel");

            error = CL10.Finish(cqCommandQueue);
            CheckErr(error, "CL10.Finish");

            if (DEBUG)
            {
                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("-------------------------------Reordering-------------------------------------------------");
                }

                ComputeEvent eve;
                IntPtr       elementBytes = (IntPtr)(gpuConstants.numTotalElements * 4);

                // Every read below stores its own status in `error`; previously the return values
                // were dropped and the stale result of CL10.Finish was checked instead, so a failed
                // read-back was never reported.
                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("              Input                ");
                }
                error = CL10.EnqueueReadBuffer(cqCommandQueue, inputKey, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> Input -> bitoffset = " + bitOffset);

                error = CL10.EnqueueReadBuffer(cqCommandQueue, inputValue, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> InputValues -> bitoffset = " + bitOffset);

                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("              Counters                ");
                }
                error = CL10.EnqueueReadBuffer(cqCommandQueue, mCounters, Bool.True, IntPtr.Zero, (IntPtr)(numCounters * sizeof(int)), debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintCounterBuffer(debugRead, "Reordering -> bitoffset = " + bitOffset);

                // NOTE(review): this banner repeats "Counters" although the buffer dumped next is
                // mRadixPrefixes; it looks like a copy-paste slip. Text left unchanged - confirm.
                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("              Counters                ");
                }
                error = CL10.EnqueueReadBuffer(cqCommandQueue, mRadixPrefixes, Bool.True, IntPtr.Zero, (IntPtr)(gpuConstants.numRadices * sizeof(int)), debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numRadices, "Reordering -> RadixPrefixe -> bitoffset = " + bitOffset);

                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("              Output                ");
                }
                error = CL10.EnqueueReadBuffer(cqCommandQueue, outputKey, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> Output -> bitoffset = " + bitOffset);

                error = CL10.EnqueueReadBuffer(cqCommandQueue, outputValue, Bool.True, IntPtr.Zero, elementBytes, debugRead, 0,
                                               null, out eve);
                CheckErr(error, "CL10.EnqueueReadBuffer");
                PrintElementBuffer(debugRead, gpuConstants.numTotalElements, "Reordering -> OutputValue -> bitoffset = " + bitOffset);

                if (DEBUG_CONSOLE_OUTPUT)
                {
                    Console.WriteLine("Reordering -> bitoffset = " + bitOffset);
                    Console.WriteLine();
                }
            }
        }