/// <summary>
/// Runs the "ExternalLoopBody" kernel of <paramref name="program"/> with a single work item
/// over a six-element int buffer and asserts that the buffer reads back as
/// { 1, 4, 3, 6, 5, 8 }.
/// </summary>
/// <param name="program">Program from which the "ExternalLoopBody" kernel is created.</param>
public void ExternalLoopBody(Cl.Program program)
{
    // Host data: initial device contents and destination of the read-back.
    int[] hres = { 0, 1, 2, 3, 4, 5 };
    IntPtr bufferBytes = (IntPtr)(sizeof(int) * hres.Length);
    IntPtr[] globalWorkSize = { (IntPtr)1 };

    // Kernel and command queue.
    Cl.Kernel kernel = Cl.CreateKernel(program, "ExternalLoopBody", out error);
    clSafeCall(error);

    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // Device buffer initialised from the host array.
    Cl.Mem dres = Cl.CreateBuffer(context, Cl.MemFlags.ReadWrite | Cl.MemFlags.CopyHostPtr, bufferBytes, hres, out error);
    clSafeCall(error);

    // Arguments: the buffer and its element count.
    clSafeCall(Cl.SetKernelArg(kernel, 0, dres));
    clSafeCall(Cl.SetKernelArg(kernel, 1, hres.Length));

    // Launch, then blocking read of the results into the host array.
    clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, globalWorkSize, null, 0, null, out clevent));
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres, Cl.Bool.True, IntPtr.Zero, bufferBytes, hres, 0, null, out clevent));
    clSafeCall(Cl.Finish(cmdQueue));

    Assert.AreEqual(new[] { 1, 4, 3, 6, 5, 8 }, hres);
}
/// <summary>
/// Sets up the kernel arguments, enqueues the kernel as a 2D NDRange, waits for the
/// command queue to finish and then runs <c>OnPostExecute</c>.
/// </summary>
/// <param name="error">
/// <c>ErrorCode.InvalidKernel</c> when the object is not initialized; otherwise the code of
/// the first step that failed, or the code reported by <c>OnPostExecute</c>.
/// </param>
/// <param name="args">Arguments forwarded to <c>Setup</c> and <c>OnPostExecute</c>.</param>
public void ExecuteKernel(out ErrorCode error, params object[] args)
{
    // Check initialized
    if (!initialized)
    {
        error = ErrorCode.InvalidKernel;
        return;
    }

    // Setup arguments
    Setup(out error, args);
    if (error != ErrorCode.Success)
    {
        return;
    }

    // Run kernel. Report the enqueue failure as-is: OR-ing two error codes together
    // produces a value that is not a meaningful ErrorCode.
    error = Cl.EnqueueNDRangeKernel(commandQueue, kernel, 2, null, globalWorkGroupSize, null, 0, null, out Event clEvent);
    if (error != ErrorCode.Success)
    {
        return;
    }

    error = Cl.Finish(commandQueue);

    // The event is not used after this point; release it so it does not leak.
    ErrorCode releaseError = Cl.ReleaseEvent(clEvent);
    if (error == ErrorCode.Success)
    {
        error = releaseError;
    }

    if (error != ErrorCode.Success)
    {
        return;
    }

    // Collect result
    OnPostExecute(out error, args);
}
/// <summary>
/// Enqueues a 1D kernel on the handle's command queue.
/// </summary>
/// <param name="kernel">Name of the kernel to execute (key into the kernel table)</param>
/// <param name="globalWorkSize">Total number of work items</param>
/// <param name="localWorkSize">Number of work items per local group</param>
/// <remarks>
/// The call only enqueues the kernel; it does not wait for completion. Failures are
/// reported through <c>CLException.CheckException</c>.
/// </remarks>
public void ExecuteKernel1D(string kernel, int globalWorkSize, int localWorkSize)
{
    Event e;
    _error = Cl.EnqueueNDRangeKernel(
        _handle.Queue,
        _kernels[kernel],
        1,
        null,
        new[] { new IntPtr(globalWorkSize) },
        new[] { new IntPtr(localWorkSize) },
        0,
        null,
        out e);
    CLException.CheckException(_error);

    // The event is never handed to the caller, so release our reference to avoid a leak.
    _error = Cl.ReleaseEvent(e);
    CLException.CheckException(_error);
}
/// <summary>
/// Forward pass: binds the six arguments of <c>OpenCLSpace.AveragePoolingForward</c>,
/// enqueues it as a 2D NDRange, releases the resulting event and waits for the queue to finish.
/// </summary>
public override void FeedForward()
{
#if TIMING_LAYERS
    // TODO: add timer
#endif

    var kernel = OpenCLSpace.AveragePoolingForward;
    IntPtr intSize = (IntPtr)sizeof(int);

    // Kernel arguments: output/input activations followed by four int parameters.
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, outputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, inputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, intSize, nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, intSize, inputArea);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 4, intSize, inputDepth);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 5, intSize, inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.SetKernelArg");

    // Launch.
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        2,
        null,
        fwdGlobalWorkSizePtr,
        fwdLocalWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.EnqueueNDRangeKernel");

    // The event is not needed; drop it, then block until the queue is drained.
    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");

#if TIMING_LAYERS
    // TODO: add timer
#endif
}
/// <summary>
/// Enqueues <paramref name="k"/> as a 2D NDRange on the first command queue and installs a
/// deferred handler that waits for the kernel's event, releases it and resets the kernel.
/// </summary>
/// <param name="k">Kernel wrapper to launch; its <c>PendingExecution</c> is set to the new event.</param>
/// <param name="global_sz">Global work size; elements 0 and 1 are used.</param>
/// <param name="local_sz">Local work size (elements 0 and 1), or <c>null</c> to pass no local size.</param>
/// <exception cref="InvalidOperationException">The kernel could not be enqueued.</exception>
public void Dispatch(Kernel k, uint[] global_sz, uint[] local_sz)
{
#if BENCHMARK
    if (perfLogFile == null)
    {
        perfLogFile = File.Create("log.txt");
        perfLogWriter = new StreamWriter(perfLogFile);
    }
    System.Diagnostics.Stopwatch stopwatch = new System.Diagnostics.Stopwatch();
    stopwatch.Start();
#endif
    // Run the handler left behind by the previous dispatch before enqueueing a new kernel.
    HandleEvent();

    var globalSize = new IntPtr[] { (IntPtr)global_sz[0], (IntPtr)global_sz[1] };
    var localSize = local_sz == null ? null : new IntPtr[] { (IntPtr)local_sz[0], (IntPtr)local_sz[1] };

    ErrorCode status = Cl.EnqueueNDRangeKernel(env.CommandQueues[0], k.kern, 2, null, globalSize, localSize, 0, null, out var eve);
    if (status != ErrorCode.Success)
    {
        // Without this check a failed launch would store an invalid event and the deferred
        // handler would later wait on it.
        throw new InvalidOperationException($"Cl.EnqueueNDRangeKernel failed: {status}");
    }

    k.PendingExecution = eve;
    lastEventHandler = () =>
    {
        Cl.WaitForEvents((uint)1, new Event[] { eve });
        eve.Release();
        k.Reset();
    };
#if BENCHMARK
    HandleEvent();
    stopwatch.Stop();
    perfLogWriter.WriteLine($"Kernel Name: {k.Name} Execution Time: {stopwatch.ElapsedTicks / (double)System.Diagnostics.Stopwatch.Frequency * 1000}ms");
    perfLogWriter.Flush();
#endif
}
/// <summary>
/// Forward pass of the fully connected layer.
/// With OPENCL_ENABLED it binds the ten arguments of <c>OpenCLSpace.FCForward</c>, enqueues it as
/// a 2D NDRange and waits for the queue; otherwise it computes weights * input + biases on the
/// host for every mini-batch item.
/// </summary>
public override void FeedForward()
{
#if TIMING_LAYERS
    Utils.FCForwardTimer.Start();
#endif

#if OPENCL_ENABLED
    var kernel = OpenCLSpace.FCForward;
    IntPtr intSize = (IntPtr)sizeof(int);

    // Set kernel arguments
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, outputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, inputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, weightsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, biasesGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 4, intSize, nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 5, intSize, nOutputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 6, intSize, inputNeurons.MiniBatchSize);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 7, (IntPtr)sizeof(float), (float)dropoutParameter);
    // A fresh GUID hash is passed as the seed argument on every call.
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 8, (IntPtr)sizeof(ulong), (ulong)Guid.NewGuid().GetHashCode());
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 9, dropoutMaskGPU);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "FullyConnected.FeedForward(): Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        2,
        null,
        forwardGlobalWorkSizePtr,
        forwardLocalWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "FullyConnected.FeedForward(): Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#else
    // TODO: add dropout CPU
    // The mask is generated here but not applied to the outputs below.
    if (dropoutParameter < 1)
    {
        for (int unit = 0; unit < nOutputUnits * inputNeurons.MiniBatchSize; ++unit)
        {
            dropoutMask[unit] = Global.RandomDouble() < dropoutParameter;
        }
    }

    for (int item = 0; item < inputNeurons.MiniBatchSize; item++)
    {
        double[] unbiasedOutput = Utils.MultiplyMatrixByVector(weights, inputNeurons.GetHost()[item]);
        double[] biasedOutput = unbiasedOutput.Zip(biases, (x, y) => x + y).ToArray();
        this.outputNeurons.SetHost(item, biasedOutput);
    }
#endif

#if TIMING_LAYERS
    Utils.FCForwardTimer.Stop();
#endif
}
/// <summary>
/// Backward pass of the skip connection: runs <c>OpenCLSpace.SkipBackward</c> as a 1D NDRange
/// on the input and output delta buffers and waits for the queue to finish.
/// </summary>
public override void BackPropagate()
{
    // Errors have already been backpropagated to input of first convolutional layer (see method UpdateSpeeds).
    // Here only the gradients coming from the skip connection are accumulated.
    var kernel = OpenCLSpace.SkipBackward;
    IntPtr intSize = (IntPtr)sizeof(int);

    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, intSize, nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, intSize, inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        1,
        null,
        globalWorkSizePtr,
        localWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
}
/// <summary>
/// Enqueues this command's kernel on the queue passed as <paramref name="sender"/>, waiting on
/// every event named in <c>WaitList</c> that can be found.
/// </summary>
/// <param name="sender">Expected to be the <c>CommandQueue</c> to enqueue on.</param>
/// <exception cref="CLException">The enqueue call did not return <c>ErrorCode.Success</c>.</exception>
/// <remarks>
/// The resulting event is disposed when this command has an empty name; otherwise it is
/// registered under that name via <c>CommandQueue.AddEvent</c>.
/// </remarks>
public override void Execute(object sender)
{
    base.Execute(sender);

    var queue = sender as CommandQueue;
    var kernel = Kernel as ICLKernel;
    var range = Range as INDRangeDimension;

    // Resolve the named dependencies, skipping names with no registered event.
    var waitList = WaitList
        .Select(name => CommandQueue.FindEvent(name))
        .Where(ev => ev != null)
        .Select(ev => ev.Value)
        .ToArray();
    var pendingEvents = waitList.Length == 0 ? null : waitList;

    Event eventID;
    ErrorCode error = Cl.EnqueueNDRangeKernel(
        queue.Queue,
        kernel.ClKernel,
        (uint)kernel.WorkDim,
        null,
        range.GlobalWorkSize,
        range.LocalWorkSize,
        (uint)waitList.Length,
        pendingEvents,
        out eventID);

    if (error != ErrorCode.Success)
    {
        throw new CLException(error);
    }

    if (Name != string.Empty)
    {
        CommandQueue.AddEvent(Name, eventID);
        return;
    }

    // Anonymous command: nobody can wait on the event, so drop it right away.
    eventID.Dispose();
}
/// <summary>
/// Backward pass of the tanh layer: binds the six arguments of <c>OpenCLSpace.TanhBackward</c>,
/// enqueues it as a 1D NDRange and waits for the queue to finish.
/// </summary>
public override void BackPropagate()
{
    var kernel = OpenCLSpace.TanhBackward;
    IntPtr intSize = (IntPtr)sizeof(int);

    // Set kernel arguments
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, outputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, (IntPtr)sizeof(float), beta);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 4, intSize, nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 5, intSize, inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Tanh.BackPropagate(): Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        1,
        null,
        globalWorkSizePtr,
        localWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Tanh.BackPropagate(): Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
}
// images cannot be read_write... so let's continue using plain buffers
// should implement this in a way that allows imgfAccu to be loaded only once
// should test for image size consistency
/// <summary>
/// Runs the "accumulate" kernel over <paramref name="imgfAccu"/> and <paramref name="imgfSrc"/>
/// and reads the accumulator buffer back into <c>imgfAccu._buf</c>.
/// </summary>
/// <param name="imgfAccu">Accumulator map; uploaded, used as kernel argument 0 and overwritten with the result.</param>
/// <param name="imgfSrc">Source map; uploaded and used as kernel argument 1.</param>
/// <param name="k">Float passed to the kernel as argument 4.</param>
/// <remarks>
/// The kernel is launched over a 2D range of <c>imgfAccu.W</c> x <c>imgfAccu.H</c> work items.
/// Both device buffers are released even when a step fails (assuming the <c>assert</c> helper throws).
/// </remarks>
public void Accumulate(FloatMap imgfAccu, FloatMap imgfSrc, float k)
{
    var kernel = _kernels["accumulate"];

    IMem<float> accuMapBuffer = null;
    IMem<float> srcMapBuffer = null;
    try
    {
        // Creation of on-device memory objects
        // why MemFlags.CopyHostPtr doesn't work here (and forces me to manual copy) ???
        accuMapBuffer = Cl.CreateBuffer<float>(_context, MemFlags.ReadWrite, imgfAccu.Size, out err);
        assert(err, "accu buf creation");

        // NOTE(review): the source buffer is created WriteOnly although the host writes it and the
        // kernel presumably reads it; ReadOnly looks like the intended flag — confirm against the kernel.
        srcMapBuffer = Cl.CreateBuffer<float>(_context, MemFlags.WriteOnly, imgfSrc.Size, out err);
        assert(err, "src buf creation");

        // Set memory objects as parameters to kernel
        err = Cl.SetKernelArg(kernel, 0, intPtrSize, accuMapBuffer);
        assert(err, "accu map setKernelArg");

        err = Cl.SetKernelArg(kernel, 1, intPtrSize, srcMapBuffer);
        assert(err, "src map setKernelArg");

        err = Cl.SetKernelArg(kernel, 2, intSize, imgfAccu.Stride);
        assert(err, "in stride setKernelArg");

        err = Cl.SetKernelArg(kernel, 3, intSize, imgfSrc.Stride);
        assert(err, "out stride setKernelArg");

        err = Cl.SetKernelArg(kernel, 4, floatSize, k);
        assert(err, "k setKernelArg");

        // Write actual data into the memory objects. Each event is disposed only after the
        // error check, because a failed call does not produce a valid event.
        Event clevent;
        err = Cl.EnqueueWriteBuffer<float>(_commandsQueue, accuMapBuffer, Bool.True, 0, imgfAccu.Size, imgfAccu._buf, 0, null, out clevent);
        assert(err, "write accu buffer");
        clevent.Dispose();

        err = Cl.EnqueueWriteBuffer<float>(_commandsQueue, srcMapBuffer, Bool.True, 0, imgfSrc.Size, imgfSrc._buf, 0, null, out clevent);
        assert(err, "write src buffer");
        clevent.Dispose();

        // execute
        err = Cl.EnqueueNDRangeKernel(
            _commandsQueue,
            kernel,
            2,
            new[] { (IntPtr)0, (IntPtr)0, (IntPtr)0 },                              // offset
            new[] { new IntPtr(imgfAccu.W), new IntPtr(imgfAccu.H), (IntPtr)1 },    // range
            null,
            0,
            null,
            out clevent);
        assert(err, "Cl.EnqueueNDRangeKernel");
        clevent.Dispose();

        // sync
        err = Cl.Finish(_commandsQueue);
        assert(err, "Cl.Finish");

        // read from output memory object into actual buffer
        err = Cl.EnqueueReadBuffer<float>(_commandsQueue, accuMapBuffer, Bool.True, imgfAccu._buf, 0, null, out clevent);
        assert(err, "read output buffer");
        clevent.Dispose();
    }
    finally
    {
        if (srcMapBuffer != null)
        {
            Cl.ReleaseMemObject(srcMapBuffer);
        }

        if (accuMapBuffer != null)
        {
            // maybe i could return this without disposing; would affect non-OpenCl implementation
            Cl.ReleaseMemObject(accuMapBuffer);
        }
    }
}
/// <summary>
/// Runs the crack-high kernel for a 16-element <paramref name="sequence"/> and returns the
/// single long value the kernel leaves in the seed buffer.
/// </summary>
/// <param name="sequence">Input sequence; must contain exactly 16 elements.</param>
/// <param name="low">Value passed to the kernel as argument 2.</param>
/// <returns>
/// The first element of the seed buffer after the kernel ran, or -1 when the program/kernel is
/// not available, the class is not initialized, or <paramref name="sequence"/> is null or not
/// 16 elements long.
/// </returns>
public static long CrackHigh(int[] sequence, long low)
{
    if (!crackHighProgram.HasValue || !crackHighKernel.HasValue || !initialized
        || sequence == null || sequence.Length != 16)
    {
        return -1;
    }

    ErrorCode error;

    // Device buffers: the input sequence and a one-element output slot.
    IMem<int> sequence_dev = Cl.CreateBuffer<int>(context, MemFlags.CopyHostPtr | MemFlags.ReadOnly, sequence, out error);
    ErrorCheck(error, "CrackHigh(): Cl.CreateBuffer");

    long[] seeds = new long[1];
    IMem<long> seed_dev = Cl.CreateBuffer<long>(context, MemFlags.CopyHostPtr | MemFlags.WriteOnly, seeds, out error);
    ErrorCheck(error, "CrackHigh(): Cl.CreateBuffer");

    error = Cl.SetKernelArg(crackHighKernel.Value, 0, sequence_dev);
    ErrorCheck(error, "Cl.SetKernelArg");

    error = Cl.SetKernelArg(crackHighKernel.Value, 1, seed_dev);
    ErrorCheck(error, "Cl.SetKernelArg");

    error = Cl.SetKernelArg(crackHighKernel.Value, 2, (IntPtr)sizeof(long), low);
    ErrorCheck(error, "Cl.SetKernelArg");

    CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, 0, out error);
    ErrorCheck(error, "Cl.CreateCommandQueue");

    int maxGroupWorkSize = Cl.GetKernelWorkGroupInfo(crackHighKernel.Value, device, KernelWorkGroupInfo.WorkGroupSize, out error).CastTo<int>();
    ErrorCheck(error, "Cl.GetKernelWorkGroupInfo");

    // Global size is twelve times the kernel's work-group size.
    int threads = maxGroupWorkSize * 12;
    IntPtr[] workSize = new IntPtr[] { (IntPtr)threads };

    Event e;
    error = Cl.EnqueueNDRangeKernel(cmdQueue, crackHighKernel.Value, 1, null, workSize, null, 0, null, out e);
    ErrorCheck(error, "Cl.EnqueueNDRangeKernel");

    // The kernel event is not waited on (Finish is used instead); release it so it is not
    // leaked when the variable is reused for the read below.
    error = Cl.ReleaseEvent(e);
    ErrorCheck(error, "Cl.ReleaseEvent");

    error = Cl.Finish(cmdQueue);
    ErrorCheck(error, "Cl.Finish");

    long[] seed_host = new long[1];
    error = Cl.EnqueueReadBuffer(cmdQueue, seed_dev, Bool.True, (IntPtr)0, (IntPtr)(sizeof(long) * 1), seed_host, 0, null, out e);
    ErrorCheck(error, "CL.EnqueueReadBuffer");

    error = Cl.ReleaseEvent(e);
    ErrorCheck(error, "Cl.ReleaseEvent");

    // Release the queue and the device buffers.
    error = Cl.ReleaseCommandQueue(cmdQueue);
    ErrorCheck(error, "CL.ReleaseCommandQueue");

    error = Cl.ReleaseMemObject(sequence_dev);
    ErrorCheck(error, "CL.ReleaseMemObject");

    error = Cl.ReleaseMemObject(seed_dev);
    ErrorCheck(error, "CL.ReleaseMemObject");

    return seed_host[0];
}
/// <summary>
/// Runs <paramref name="kernel"/> once with <paramref name="inMap"/> as input image (argument 0)
/// and a new output image (argument 1), then reads the output image into <c>outMap._buf</c>.
/// </summary>
/// <param name="kernel">Kernel to run; launched over a 2D range of <c>outMap.W</c> x <c>outMap.H</c>.</param>
/// <param name="inMap">Source map, copied into a read-only image at creation time.</param>
/// <param name="outMap">Destination map whose buffer receives the output image.</param>
/// <remarks>
/// Both images are released even when a step fails (assuming the <c>assert</c> helper throws).
/// </remarks>
public void singlePass(Kernel kernel, FloatMap inMap, FloatMap outMap)
{
    var clInImageFormat = new OpenCL.Net.ImageFormat(ChannelOrder.Luminance, ChannelType.Float);

    IMem inputMapBuffer = null;
    IMem outputMapBuffer = null;
    try
    {
        // The input image is filled from inMap at creation time (CopyHostPtr).
        inputMapBuffer = Cl.CreateImage2D(_context, MemFlags.CopyHostPtr | MemFlags.ReadOnly, clInImageFormat,
                                          (IntPtr)inMap.W, (IntPtr)inMap.H,
                                          new IntPtr(inMap.Stride * sizeof(float)),
                                          inMap._buf, out err);
        assert(err, "input img creation");

        // NOTE(review): a host pointer is passed here without CopyHostPtr/UseHostPtr; the OpenCL
        // specification lists that combination as CL_INVALID_HOST_PTR — confirm on the target drivers.
        outputMapBuffer = Cl.CreateImage2D(_context, MemFlags.WriteOnly, clInImageFormat,
                                           (IntPtr)outMap.W, (IntPtr)outMap.H,
                                           new IntPtr(outMap.Stride * sizeof(float)),
                                           outMap._buf, out err);
        assert(err, "output img creation");

        // Set memory objects as parameters to kernel
        err = Cl.SetKernelArg(kernel, 0, intPtrSize, inputMapBuffer);
        assert(err, "input map setKernelArg");

        err = Cl.SetKernelArg(kernel, 1, intPtrSize, outputMapBuffer);
        assert(err, "output map setKernelArg");

        IntPtr[] originPtr = new IntPtr[] { (IntPtr)0, (IntPtr)0, (IntPtr)0 };                        // x, y, z
        IntPtr[] outRegionPtr = new IntPtr[] { (IntPtr)outMap.W, (IntPtr)outMap.H, (IntPtr)1 };       // x, y, z
        IntPtr[] workGroupSizePtr = new IntPtr[] { (IntPtr)outMap.W, (IntPtr)outMap.H, (IntPtr)1 };

        // execute; the event is disposed only after the error check, because a failed call
        // does not produce a valid event
        Event clevent;
        err = Cl.EnqueueNDRangeKernel(_commandsQueue, kernel, 2, originPtr, workGroupSizePtr, null, 0, null, out clevent);
        assert(err, "Cl.EnqueueNDRangeKernel");
        clevent.Dispose();

        // sync
        err = Cl.Finish(_commandsQueue);
        assert(err, "Cl.Finish");

        // read from output memory object into actual buffer
        err = Cl.EnqueueReadImage(_commandsQueue, outputMapBuffer, Bool.True, originPtr, outRegionPtr,
                                  new IntPtr(outMap.Stride * sizeof(float)), (IntPtr)0,
                                  outMap._buf, 0, null, out clevent);
        assert(err, "read output buffer");
        clevent.Dispose();
    }
    finally
    {
        if (inputMapBuffer != null)
        {
            Cl.ReleaseMemObject(inputMapBuffer);
        }

        if (outputMapBuffer != null)
        {
            Cl.ReleaseMemObject(outputMapBuffer);
        }
    }
}
/// <summary>
/// Enqueues <c>Kernel</c> on the command queue and waits for the resulting event.
/// </summary>
/// <param name="workGroupSizePtr">Global work size passed to the enqueue call; no local size is given.</param>
/// <param name="workingDim">Number of work dimensions (defaults to 1).</param>
/// <exception cref="Exception">The enqueue call failed; the message is the error code.</exception>
public void Execute(IntPtr[] workGroupSizePtr, uint workingDim = 1)
{
    ErrorCode err = Cl.EnqueueNDRangeKernel(
        _commandQueue,
        Kernel,
        workingDim,
        null,
        workGroupSizePtr,
        null,
        0,
        null,
        out Event @event);

    if (err == ErrorCode.Success)
    {
        @event.WaitForComplete();
        return;
    }

    throw new Exception($"{err}");
}
/// <summary>
/// Compiles the "annualizedReturns" and "aggregateReturnsKernel" kernels, runs the pair
/// <paramref name="iterations"/> times and reads the output buffer back to the host.
/// </summary>
/// <param name="totalMs">
/// Time in milliseconds between the start of a marker enqueued before the loop and the end of a
/// marker enqueued after it, taken from the OpenCL event profiling counters.
/// </param>
/// <param name="cf">Not used by this implementation.</param>
/// <param name="iterations">Number of times the kernel pair is enqueued.</param>
/// <returns>The sum of the first element and element <c>xSize - 1</c> of the host output buffer.</returns>
/// <exception cref="InvalidOperationException">
/// Setting a kernel argument, enqueueing a kernel or reading the result failed.
/// </exception>
public float CalculateVar(out double totalMs, float cf = .9f, int iterations = 10)
{
    // Fails loudly when an OpenCL call needed for a correct result reports an error.
    void ThrowIfFailed(ErrorCode code, string operation)
    {
        if (code != ErrorCode.Success)
        {
            throw new InvalidOperationException($"{operation} failed: {code}");
        }
    }

    using (var annualizedReturnsKernel = _env.Context.CompileKernelFromSource(_programSource, "annualizedReturns"))
    using (var aggregateReturnsKernel = _env.Context.CompileKernelFromSource(_programSource, "aggregateReturnsKernel"))
    {
        Event clStart, clEnd;
        var queue = _env.CommandQueues[0];

        var xSize = (_stockData.QuotesPerStock - _holding);
        var ySize = _stockData.StocksCount;

        ThrowIfFailed(Cl.SetKernelArg(annualizedReturnsKernel, 0, _d_output), "Cl.SetKernelArg(annualizedReturns, 0)");
        ThrowIfFailed(Cl.SetKernelArg(annualizedReturnsKernel, 1, _d_stocksAndPrices), "Cl.SetKernelArg(annualizedReturns, 1)");
        ThrowIfFailed(Cl.SetKernelArg(annualizedReturnsKernel, 2, _d_portfolioStockMv), "Cl.SetKernelArg(annualizedReturns, 2)");
        ThrowIfFailed(Cl.SetKernelArg(annualizedReturnsKernel, 3, _ann), "Cl.SetKernelArg(annualizedReturns, 3)");
        ThrowIfFailed(Cl.SetKernelArg(annualizedReturnsKernel, 4, _holding), "Cl.SetKernelArg(annualizedReturns, 4)");

        ThrowIfFailed(Cl.SetKernelArg(aggregateReturnsKernel, 0, _d_output), "Cl.SetKernelArg(aggregateReturnsKernel, 0)");
        ThrowIfFailed(Cl.SetKernelArg(aggregateReturnsKernel, 1, ySize), "Cl.SetKernelArg(aggregateReturnsKernel, 1)");

        // Timing is best effort: marker and profiling return codes are deliberately left
        // unchecked so that a timing failure does not prevent computing the result.
        Cl.EnqueueMarker(queue, out clStart);
        Cl.Finish(queue);

        for (int i = 0; i < iterations; i++)
        {
            ThrowIfFailed(
                Cl.EnqueueNDRangeKernel(queue, annualizedReturnsKernel, 2, null,
                    new[] { (IntPtr)xSize, (IntPtr)ySize },
                    new[] { (IntPtr)256, (IntPtr)1 },
                    0, null, out Event annualizedDone),
                "Cl.EnqueueNDRangeKernel(annualizedReturns)");
            annualizedDone.Dispose(); // not waited on; release instead of leaking one per iteration

            ThrowIfFailed(
                Cl.EnqueueNDRangeKernel(queue, aggregateReturnsKernel, 2, null,
                    new[] { (IntPtr)xSize, (IntPtr)1 },
                    new[] { (IntPtr)256, (IntPtr)1 },
                    0, null, out Event aggregateDone),
                "Cl.EnqueueNDRangeKernel(aggregateReturnsKernel)");
            aggregateDone.Dispose();
        }

        Cl.EnqueueMarker(queue, out clEnd);
        Cl.Finish(queue);

        var startInfo = Cl.GetEventProfilingInfo(clStart, ProfilingInfo.Start, out ErrorCode startError);
        var endInfo = Cl.GetEventProfilingInfo(clEnd, ProfilingInfo.End, out ErrorCode endError);

        ThrowIfFailed(
            Cl.EnqueueReadBuffer<float>(queue, _d_output, Bool.True, _h_output, 0, null, out Event readDone),
            "Cl.EnqueueReadBuffer");
        readDone.Dispose();

        // Profiling counters are in nanoseconds; 1 ns = 1e-6 ms.
        totalMs = (endInfo.CastTo<ulong>() - startInfo.CastTo<ulong>()) * 1e-6;

        clStart.Dispose();
        clEnd.Dispose();

        // Both kernels are disposed by the enclosing using statements.
        return _h_output[0] + _h_output[xSize - 1];
    }
}
/// <summary>
/// Forward pass of the ReLU layer.
/// With OPENCL_ENABLED it runs <c>OpenCLSpace.ReLUForward</c> as a 1D NDRange and waits for the
/// queue; otherwise it copies every positive input to the output and writes 0.0 for the rest.
/// </summary>
public override void FeedForward()
{
#if TIMING_LAYERS
    Utils.NonlinearityForwardTimer.Start();
#endif

#if OPENCL_ENABLED
    var kernel = OpenCLSpace.ReLUForward;

    // Set kernel arguments
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, OutputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, InputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, (IntPtr)sizeof(int), OutputNeurons.NumberOfUnits * inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "ReLU.FeedForward(): Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        1,
        null,
        globalWorkSizePtr,
        localWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "ReLU.FeedForward(): Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#else
    for (int item = 0; item < inputNeurons.MiniBatchSize; item++)
    {
        double[] rectified = new double[this.nOutputUnits];
        for (int unit = 0; unit < this.nOutputUnits; unit++)
        {
            // Keep strictly positive inputs, clamp everything else to zero.
            rectified[unit] = this.inputNeurons.GetHost()[item][unit] > 0
                ? this.inputNeurons.GetHost()[item][unit]
                : 0.0;
        }
        this.outputNeurons.SetHost(item, rectified);
    }
#endif

#if TIMING_LAYERS
    Utils.NonlinearityForwardTimer.Stop();
#endif
}
/// <summary>
/// Runs the "ArrayCompare" kernel twice with a single work item — first with two distinct
/// device buffers as arguments 0 and 1, then with <c>dummy</c> for both — and asserts the
/// four booleans read back from the third buffer after each run.
/// </summary>
/// <param name="program">Program from which the "ArrayCompare" kernel is created.</param>
public void ArrayCompare(Cl.Program program)
{
    // Host result array and the sizes derived from it.
    bool[] res = { true, false, true, false };
    IntPtr intBytes = (IntPtr)(sizeof(int));
    IntPtr resBytes = (IntPtr)(sizeof(bool) * res.Length);
    IntPtr[] singleWorkItem = { (IntPtr)1 };

    // Kernel and command queue.
    Cl.Kernel kernel = Cl.CreateKernel(program, "ArrayCompare", out error);
    clSafeCall(error);

    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // Device buffers: two one-int buffers and the result buffer.
    Cl.Mem dp1 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, intBytes, IntPtr.Zero, out error);
    clSafeCall(error);
    Cl.Mem dp2 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, intBytes, IntPtr.Zero, out error);
    clSafeCall(error);
    Cl.Mem dp3 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, resBytes, IntPtr.Zero, out error);
    clSafeCall(error);

    // First run: distinct buffers.
    clSafeCall(Cl.SetKernelArg(kernel, 0, dp1));
    clSafeCall(Cl.SetKernelArg(kernel, 1, dp2));
    clSafeCall(Cl.SetKernelArg(kernel, 2, dp3));

    clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, singleWorkItem, null, 0, null, out clevent));
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp3, Cl.Bool.True, IntPtr.Zero, resBytes, res, 0, null, out clevent));
    clSafeCall(Cl.Finish(cmdQueue));

    Assert.AreEqual(new[] { false, true, false, true }, res);

    // Second run: the same dummy value for both compared arguments.
    clSafeCall(Cl.SetKernelArg(kernel, 0, dummy));
    clSafeCall(Cl.SetKernelArg(kernel, 1, dummy));

    clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, singleWorkItem, null, 0, null, out clevent));
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp3, Cl.Bool.True, IntPtr.Zero, resBytes, res, 0, null, out clevent));
    clSafeCall(Cl.Finish(cmdQueue));

    Assert.AreEqual(new[] { true, false, true, false }, res);
}
/// <summary>
/// Builds a small OpenCL program that combines several single-precision math functions
/// (cos, sin, floor, sqrt, exp, powr, fabs, log) of each input element, runs it with one work
/// item per element and returns the results.
/// </summary>
/// <param name="input">Values fed to the kernel; one output is produced per element.</param>
/// <returns>An array with one float per input element; empty when <paramref name="input"/> is empty.</returns>
/// <exception cref="InvalidOperationException">An OpenCL call reported an error.</exception>
public float[] MathFunctionsSingleTest(int[] input)
{
    if (input.Length == 0)
    {
        return new float[0];
    }

    // Kernel source. The #pragma has to sit on a line of its own for the OpenCL compiler.
    var source = @"#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void kernelCode(__global int* ___input___, __global float* ___result___)
{
    int n0;
    float ___final___10;
    int ___flag___11;
    int ___id___ = get_global_id(0);
    n0 = ___input___[___id___];

    float pi = 3.14159274f;
    float c = cos(((float) n0));
    float s = sin(((float) n0));
    float f = floor(pi);
    float sq = sqrt(((float) (n0 * n0)));
    float ex = exp(pi);
    float p = powr(pi, 2.0f);
    float a = fabs(c);
    float l = log(((float) n0));
    ___final___10 = ((((((((f * pi) * c) * s) * sq) * ex) * p) * a) * l);
    ___result___[___id___] = ___final___10;
}
";

    var output = new float[input.Length];
    ErrorCode error;

    // CopyHostPtr instead of UseHostPtr: nothing in this method keeps the managed arrays pinned
    // after CreateBuffer returns, so the device must not keep using their addresses. The result
    // is copied back explicitly below.
    var a = Cl.CreateBuffer(env.Context, MemFlags.ReadOnly | MemFlags.CopyHostPtr, (IntPtr)(input.Length * sizeof(int)), input, out error);
    EnsureClSuccess(error, "Cl.CreateBuffer (input)");
    try
    {
        var b = Cl.CreateBuffer(env.Context, MemFlags.WriteOnly | MemFlags.CopyHostPtr, (IntPtr)(input.Length * sizeof(float)), output, out error);
        EnsureClSuccess(error, "Cl.CreateBuffer (output)");
        try
        {
            OpenCL.Net.Program program = Cl.CreateProgramWithSource(env.Context, 1u, new string[] { source }, null, out error);
            EnsureClSuccess(error, "Cl.CreateProgramWithSource");
            try
            {
                error = Cl.BuildProgram(program, (uint)env.Devices.Length, env.Devices, " -cl-fast-relaxed-math  -cl-mad-enable ", null, IntPtr.Zero);
                EnsureClSuccess(error, "Cl.BuildProgram");

                OpenCL.Net.Kernel kernel = Cl.CreateKernel(program, "kernelCode", out error);
                EnsureClSuccess(error, "Cl.CreateKernel");
                try
                {
                    EnsureClSuccess(Cl.SetKernelArg(kernel, 0, a), "Cl.SetKernelArg(0)");
                    EnsureClSuccess(Cl.SetKernelArg(kernel, 1, b), "Cl.SetKernelArg(1)");

                    // One work item per input element, work-group size 1.
                    Event eventID;
                    error = Cl.EnqueueNDRangeKernel(env.CommandQueues[0], kernel, (uint)1, null,
                        new IntPtr[] { (IntPtr)input.Length }, new IntPtr[] { (IntPtr)1 },
                        (uint)0, null, out eventID);
                    EnsureClSuccess(error, "Cl.EnqueueNDRangeKernel");
                    Cl.ReleaseEvent(eventID);

                    env.CommandQueues[0].ReadFromBuffer(b, output);
                }
                finally
                {
                    Cl.ReleaseKernel(kernel);
                }
            }
            finally
            {
                Cl.ReleaseProgram(program);
            }
        }
        finally
        {
            b.Dispose();
        }
    }
    finally
    {
        a.Dispose();
    }

    return output;
}

/// <summary>Throws when <paramref name="code"/> is not <c>ErrorCode.Success</c>.</summary>
private static void EnsureClSuccess(ErrorCode code, string operation)
{
    if (code != ErrorCode.Success)
    {
        throw new InvalidOperationException(operation + " failed: " + code);
    }
}
/// <summary>
/// Enqueues <paramref name="kernel"/> as a 1D NDRange on <paramref name="commandQueue"/>.
/// </summary>
/// <param name="commandQueue">Queue to enqueue on.</param>
/// <param name="kernel">Kernel to launch.</param>
/// <param name="globalWorkSize">Total number of work items.</param>
/// <param name="localWorkSize">Work-group size; 0 passes no local size to the enqueue call.</param>
/// <param name="waitFor">Events the launch must wait for; may be empty or <c>null</c>.</param>
/// <returns>The event associated with the enqueued kernel.</returns>
/// <remarks>The enqueue result is validated with the <c>Check()</c> extension.</remarks>
public static Event EnqueueKernel(this CommandQueue commandQueue, Kernel kernel, uint globalWorkSize, uint localWorkSize = 0, params Event[] waitFor)
{
    // A caller may pass an explicit null for the params array; treat it like "no events".
    bool hasWaitList = waitFor != null && waitFor.Length > 0;

    Event e;
    Cl.EnqueueNDRangeKernel(
        commandQueue,
        kernel,
        1,
        null,
        new[] { (IntPtr)globalWorkSize },
        localWorkSize == 0 ? null : new[] { (IntPtr)localWorkSize },
        hasWaitList ? (uint)waitFor.Length : 0u,
        hasWaitList ? waitFor : null,
        out e).Check();

    return e;
}
/// <summary>
/// Runs the wipe kernel matching <paramref name="type"/> over <paramref name="buffer"/> and
/// waits for the queue to finish.
/// </summary>
/// <param name="buffer">Device buffer passed to the kernel as argument 0.</param>
/// <param name="nElementsInBuffer">Element count passed to the kernel as argument 1.</param>
/// <param name="type">Element type of the buffer: <c>float</c>, <c>int</c> or <c>bool</c>.</param>
/// <exception cref="ArgumentException"><paramref name="type"/> is not one of the supported types.</exception>
public static void WipeBuffer(Mem buffer, int nElementsInBuffer, Type type)
{
    Kernel WipeKernel;
    if (type == typeof(float))
    {
        WipeKernel = WipeBufferFloatKernel;
    }
    else if (type == typeof(int))
    {
        WipeKernel = WipeBufferIntKernel;
    }
    else if (type == typeof(bool))
    {
        WipeKernel = WipeBufferBoolKernel;
    }
    else
    {
        throw new ArgumentException("Type not supported. Use either float, int, or bool.");
    }

    // Set kernel arguments
    OpenCLSpace.ClError = Cl.SetKernelArg(WipeKernel, 0, buffer);
    OpenCLSpace.ClError |= Cl.SetKernelArg(WipeKernel, 1, (IntPtr)sizeof(int), nElementsInBuffer);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.SetKernelArg WipeBufferKernel");

    // Work sizes: the global size is the element count rounded up to a whole number of groups.
    IntPtr[] localWorkSizePtr = { (IntPtr)OPTIMAL_GROUP_SIZE };
    IntPtr[] globalWorkSizePtr = { (IntPtr)(OPTIMAL_GROUP_SIZE * Math.Ceiling((double)(nElementsInBuffer) / (double)OPTIMAL_GROUP_SIZE)) };

    // Run kernel
    ClError = Cl.EnqueueNDRangeKernel(queue, WipeKernel, 1, null, globalWorkSizePtr, localWorkSizePtr, 0, null, out ClEvent);
    CheckErr(ClError, "Cl.EnqueueNDRangeKernel WipeBufferKernel");

    ClError = Cl.ReleaseEvent(ClEvent);
    CheckErr(ClError, "Cl.ReleaseEvent");

    ClError = Cl.Finish(queue);
    CheckErr(ClError, "Cl.Finish");

    // The kernel comes from a class-level member and is not released here.
}
/// <summary>
/// Binds the SVM arguments, locks them for the GPU, enqueues <c>Kernel</c>, waits for it to
/// complete and unlocks the SVM arguments again.
/// </summary>
/// <param name="workGroupSizePtr">Global work size passed to the enqueue call; no local size is given.</param>
/// <param name="argsIndex">Kernel argument indices, forwarded to <c>SetSvmArgs</c>.</param>
/// <param name="args">SVM pointers to bind, lock and unlock.</param>
/// <param name="workingDim">Number of work dimensions (defaults to 1).</param>
/// <exception cref="Exception">The enqueue call failed; the message is the error code.</exception>
public void Execute(IntPtr[] workGroupSizePtr, uint[] argsIndex, SvmPointer[] args, uint workingDim = 1)
{
    SetSvmArgs(argsIndex, args);
    LockSvmForGPU(args);
    try
    {
        ErrorCode err = Cl.EnqueueNDRangeKernel(CommandQueue, Kernel, workingDim, null, workGroupSizePtr, null, 0, null, out Event @event);
        if (err != ErrorCode.Success)
        {
            throw new Exception($"{err}");
        }

        @event.WaitForComplete();
    }
    finally
    {
        // Always hand the SVM arguments back, also when the launch or the wait fails;
        // otherwise they would stay locked for the GPU after an error.
        UnlockSvmGPU(args);
    }
}
/// <summary>
/// Backward pass of the ELU layer.
/// With OPENCL_ENABLED it runs <c>OpenCLSpace.ELUBackward</c> as a 1D NDRange and waits for the
/// queue; without it the method throws because no CPU implementation exists.
/// </summary>
/// <exception cref="NotImplementedException">Built without OPENCL_ENABLED.</exception>
public override void BackPropagate()
{
#if TIMING_LAYERS
    Utils.NonlinearityBackpropTimer.Start();
#endif

#if OPENCL_ENABLED
    // Set kernel arguments
    OpenCLSpace.ClError = Cl.SetKernelArg(OpenCLSpace.ELUBackward, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.ELUBackward, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.ELUBackward, 2, inputNeurons.ActivationsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.ELUBackward, 3, (IntPtr)sizeof(float), alpha);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.ELUBackward, 4, (IntPtr)sizeof(int), nInputUnits * inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "ELU.BackPropagate(): Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(OpenCLSpace.Queue,
                                                  OpenCLSpace.ELUBackward,
                                                  1,
                                                  null,
                                                  globalWorkSizePtr,
                                                  localWorkSizePtr,
                                                  0,
                                                  null,
                                                  out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "ELU.BackPropagate(): Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#else
    // No CPU implementation exists. The loop skeleton that used to follow this throw was
    // unreachable and had no statement body, so it did not compile in this configuration.
    throw new NotImplementedException("CPU code for ELUs not implemented yet.");
#endif

#if TIMING_LAYERS
    Utils.NonlinearityBackpropTimer.Stop();
#endif
}
/// <summary>
/// Backward pass of the fully connected layer.
/// With OPENCL_ENABLED it binds the seven arguments of <c>OpenCLSpace.FCBackward</c>, enqueues it
/// as a 2D NDRange and waits for the queue; otherwise it sets each input delta to the
/// transposed weight matrix times the corresponding output delta on the host.
/// </summary>
public override void BackPropagate()
{
#if TIMING_LAYERS
    Utils.FCBackpropTimer.Start();
#endif

#if OPENCL_ENABLED
    // Set kernel arguments.
    // The first call assigns (rather than OR-s into) ClError so that a stale error code left
    // by an earlier operation is not reported as a failure of this method.
    OpenCLSpace.ClError = Cl.SetKernelArg(OpenCLSpace.FCBackward, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 2, weightsGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 3, dropoutMaskGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 4, (IntPtr)sizeof(int), nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 5, (IntPtr)sizeof(int), nOutputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.FCBackward, 6, (IntPtr)sizeof(int), inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "FullyConnected.BackPropagate(): Cl.SetKernelArg");

    // Run kernel
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(OpenCLSpace.Queue,
                                                  OpenCLSpace.FCBackward,
                                                  2,
                                                  null,
                                                  backwardGlobalWorkSizePtr,
                                                  backwardLocalWorkSizePtr,
                                                  0,
                                                  null,
                                                  out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "FullyConnected.BackPropagate(): Cl.EnqueueNDRangeKernel");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#else
    for (int m = 0; m < inputNeurons.MiniBatchSize; m++)
    {
        inputNeurons.DeltaHost[m] = Utils.MultiplyMatrixTranspByVector(weights, outputNeurons.DeltaHost[m]);
    }
#endif

#if TIMING_LAYERS
    Utils.FCBackpropTimer.Stop();
#endif
}
/// <summary>
/// Backward pass of the max-pooling layer: with OPENCL_ENABLED it binds the nine arguments of
/// <c>OpenCLSpace.MaxPoolingBackward</c>, enqueues it as a 1D NDRange and waits for the queue.
/// There is no CPU implementation.
/// </summary>
public override void BackPropagate()
{
#if TIMING_LAYERS
    Utils.PoolingBackpropTimer.Start();
#endif

#if OPENCL_ENABLED
    var kernel = OpenCLSpace.MaxPoolingBackward;
    IntPtr intSize = (IntPtr)sizeof(int);

    // Buffers first, then the int parameters.
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, switchesGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, poolingTableGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 4, intSize, nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 5, intSize, inputWidth * inputWidth);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 6, intSize, nOutputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 7, intSize, outputWidth * outputWidth);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 8, intSize, inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.SetKernelArg PoolingBackward");

    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        1,
        null,
        globalWorkSizePtr,
        localWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.EnqueueNDRangeKernel PoolingBackward");

    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#else
    //TODO: CPU code
#endif

#if TIMING_LAYERS
    Utils.PoolingBackpropTimer.Stop();
#endif
}
/// <summary>
/// Enqueues <paramref name="kernel"/> on <c>CommandQueue</c>, making it wait for the previously
/// stored event (if any), and stores the new event as the one to await next.
/// </summary>
/// <param name="kernel">Kernel to launch.</param>
/// <param name="workDim">Number of work dimensions.</param>
/// <param name="globalWorkOffset">Global work offset, forwarded unchanged.</param>
/// <param name="globalWorkSize">Global work size, forwarded unchanged.</param>
/// <param name="localWorkSize">Local work size, forwarded unchanged.</param>
/// <exception cref="NerotiqException">The enqueue call did not return <c>ErrorCode.Success</c>.</exception>
public void EnqueueNDRangeKernel(Kernel kernel, uint workDim, IntPtr[] globalWorkOffset, IntPtr[] globalWorkSize, IntPtr[] localWorkSize)
{
    // Chain onto the previous command when there is one.
    Event[] waitList = null;
    uint waitCount = 0;
    if (_awaitEvent != null)
    {
        waitList = new Event[] { _awaitEvent.Value };
        waitCount = 1;
    }

    var errorCode = Cl.EnqueueNDRangeKernel(
        CommandQueue,
        kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        waitCount,
        waitList,
        out var awaitEvent);

    if (errorCode != ErrorCode.Success)
    {
        throw new NerotiqException($"Error enqueueing ND range kernel: {errorCode}");
    }

    _awaitEvent = awaitEvent;
}
/// <summary>
/// Runs the "ArrayRefOut" kernel with a single work item on two one-element int buffers
/// (plus <c>dummy</c> as argument 2) and asserts that they read back as 5 and 4.
/// </summary>
/// <param name="program">Program from which the "ArrayRefOut" kernel is created.</param>
public void ArrayRefOut(Cl.Program program)
{
    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "ArrayRefOut", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host vectors
    int[] hp1 = { 1 };
    int[] hp2 = { 2 };
    IntPtr hp1Bytes = (IntPtr)(sizeof(int) * hp1.Length);
    IntPtr hp2Bytes = (IntPtr)(sizeof(int) * hp2.Length);

    // allocate device vectors, initialised from the host arrays
    Cl.Mem dp1 = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite, hp1Bytes, hp1, out error);
    clSafeCall(error);
    Cl.Mem dp2 = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite, hp2Bytes, hp2, out error);
    clSafeCall(error);

    // setup kernel arguments
    clSafeCall(Cl.SetKernelArg(kernel, 0, dp1));
    clSafeCall(Cl.SetKernelArg(kernel, 1, dp2));
    clSafeCall(Cl.SetKernelArg(kernel, 2, dummy));

    // execute kernel
    clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

    // copy results from device back to host; each read is sized from its own destination
    // array (the second read used to be sized with hp1.Length)
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp1, Cl.Bool.True, IntPtr.Zero, hp1Bytes, hp1, 0, null, out clevent));
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp2, Cl.Bool.True, IntPtr.Zero, hp2Bytes, hp2, 0, null, out clevent));

    clSafeCall(Cl.Finish(cmdQueue));

    Assert.AreEqual(5, hp1[0]);
    Assert.AreEqual(4, hp2[0]);
}
/// <summary>
/// Runs the <c>OpenCLSpace.BNConvBackPropagate</c> kernel to back-propagate
/// through this layer, then waits for the queue to finish.
/// </summary>
public override void BackPropagate()
{
#if TIMING_LAYERS
    Utils.BNConvBackpropTimer.Start();
#endif

    var kernel = OpenCLSpace.BNConvBackPropagate;

    // Buffer arguments (0-6). Error codes are OR-ed together and checked once.
    OpenCLSpace.ClError = Cl.SetKernelArg(kernel, 0, inputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 1, outputNeurons.DeltaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 2, normalizedInputGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 3, gammaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 4, varianceGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 5, deltaGammaGPU);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 6, deltaBetaGPU);

    // Scalar arguments (7-9).
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 7, (IntPtr)sizeof(int), inputArea);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 8, (IntPtr)sizeof(int), nInputUnits);
    OpenCLSpace.ClError |= Cl.SetKernelArg(kernel, 9, (IntPtr)sizeof(int), inputNeurons.MiniBatchSize);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.SetKernelArg");

    // Launch the kernel as a 1D range.
    OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(
        OpenCLSpace.Queue,
        kernel,
        1,
        null,
        nActivationsGlobalWorkSizePtr,
        optimalLocalWorkSizePtr,
        0,
        null,
        out OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.EnqueueNDRangeKernel");

    // The completion event is not used; release it immediately.
    OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");

    // Block until the queue has drained.
    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");

#if TIMING_LAYERS
    Utils.BNConvBackpropTimer.Stop();
#endif
}
/// <summary>
/// Demo entry point: lists the OpenCL platforms, picks the first device of the
/// first platform, builds a small program, runs <c>double_everything</c> on A
/// followed by <c>add_array</c> on (A, B, C), prints C, and finally lists the
/// manifest resources of the assembly containing <c>SourceLoader</c>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("Hello World!");

    uint platformCount;
    ErrorCode result = Cl.GetPlatformIDs(0, null, out platformCount);
    Console.WriteLine("{0} platforms found", platformCount);
    if (platformCount == 0)
    {
        // Nothing to run on; indexing platformIds[0] below would throw.
        Console.Error.WriteLine("No OpenCL platform available.");
        return;
    }

    var platformIds = new Platform[platformCount];
    result = Cl.GetPlatformIDs(platformCount, platformIds, out platformCount);

    var platformCounter = 0;
    foreach (var platformId in platformIds)
    {
        // First call obtains the size of the name, second call fetches it.
        IntPtr paramSize;
        result = Cl.GetPlatformInfo(platformId, PlatformInfo.Name, IntPtr.Zero, InfoBuffer.Empty, out paramSize);
        using (var buffer = new InfoBuffer(paramSize))
        {
            // Query the platform being enumerated (this used to always query
            // platformIds[0], printing the first platform's name every time).
            result = Cl.GetPlatformInfo(platformId, PlatformInfo.Name, paramSize, buffer, out paramSize);
            Console.WriteLine($"Platform {platformCounter}: {buffer}");
        }
        platformCounter++;
    }

    Console.WriteLine($"Using first platform...");

    uint deviceCount;
    result = Cl.GetDeviceIDs(platformIds[0], DeviceType.All, 0, null, out deviceCount);
    Console.WriteLine("{0} devices found", deviceCount);
    if (deviceCount == 0)
    {
        Console.Error.WriteLine("No OpenCL device available on the first platform.");
        return;
    }

    var deviceIds = new Device[deviceCount];
    result = Cl.GetDeviceIDs(platformIds[0], DeviceType.All, deviceCount, deviceIds, out var numberDevices);
    var selectedDevice = deviceIds[0];

    var context = Cl.CreateContext(null, 1, new[] { selectedDevice }, null, IntPtr.Zero, out var error);

    const string kernelSrc = @"
// Simple test; c[i] = a[i] + b[i]
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
    int xid = get_global_id(0);
    c[xid] = a[xid] + b[xid] - 1500;
}

__kernel void sub_array(__global float *a, __global float *b, __global float *c)
{
    int xid = get_global_id(0);
    c[xid] = a[xid] - b[xid] - 2000;
}

__kernel void double_everything(__global float *a)
{
    int xid = get_global_id(0);
    a[xid] = a[xid] * 2;
}
";
    var src = kernelSrc;
    Console.WriteLine("=== src ===");
    Console.WriteLine(src);
    Console.WriteLine("============");

    var program = Cl.CreateProgramWithSource(context, 1, new[] { src }, null, out var error2);
    error2 = Cl.BuildProgram(program, 1, new[] { selectedDevice }, string.Empty, null, IntPtr.Zero);
    if (error2 == ErrorCode.BuildProgramFailure)
    {
        Console.Error.WriteLine(Cl.GetProgramBuildInfo(program, selectedDevice, ProgramBuildInfo.Log, out error));
    }
    Console.WriteLine(error2);
    if (error2 != ErrorCode.Success)
    {
        // Without a built program there are no kernels to index below.
        program.Dispose();
        return;
    }

    // Get the kernels.
    // NOTE(review): this relies on CreateKernelsInProgram returning the kernels
    // in source order (add_array, sub_array, double_everything) - confirm.
    var kernels = Cl.CreateKernelsInProgram(program, out error);
    Console.WriteLine($"Program contains {kernels.Length} kernels.");
    var kernelAdd = kernels[0];
    var kernelDouble = kernels[2];

    float[] A = new float[1000];
    float[] B = new float[1000];
    float[] C = new float[1000];
    for (var i = 0; i < 1000; i++)
    {
        A[i] = i;
        B[i] = i;
    }

    // A is written by double_everything and C is written by add_array, so both
    // must be writable by kernels; only B is purely an input.
    IMem<float> hDeviceMemA = Cl.CreateBuffer(context, MemFlags.CopyHostPtr | MemFlags.ReadWrite, A, out error);
    IMem<float> hDeviceMemB = Cl.CreateBuffer(context, MemFlags.CopyHostPtr | MemFlags.ReadOnly, B, out error);
    IMem<float> hDeviceMemC = Cl.CreateBuffer(context, MemFlags.CopyHostPtr | MemFlags.ReadWrite, C, out error);

    // Create a command queue.
    var cmdQueue = Cl.CreateCommandQueue(context, selectedDevice, CommandQueueProperties.None, out error);

    int intPtrSize = Marshal.SizeOf(typeof(IntPtr));
    error = Cl.SetKernelArg(kernelDouble, 0, new IntPtr(intPtrSize), hDeviceMemA);
    error = Cl.SetKernelArg(kernelAdd, 0, new IntPtr(intPtrSize), hDeviceMemA);
    error = Cl.SetKernelArg(kernelAdd, 1, new IntPtr(intPtrSize), hDeviceMemB);
    error = Cl.SetKernelArg(kernelAdd, 2, new IntPtr(intPtrSize), hDeviceMemC);

    // write data from host to device; each command waits on the previous one's event
    Event clevent;
    error = Cl.EnqueueWriteBuffer(cmdQueue, hDeviceMemA, Bool.True, IntPtr.Zero, new IntPtr(1000 * sizeof(float)), A, 0, null, out clevent);
    error = Cl.EnqueueWriteBuffer(cmdQueue, hDeviceMemB, Bool.True, IntPtr.Zero, new IntPtr(1000 * sizeof(float)), B, 1, new[] { clevent }, out clevent);

    // execute kernels: double A first, then C = A + B - 1500
    error = Cl.EnqueueNDRangeKernel(cmdQueue, kernelDouble, 1, null, new IntPtr[] { new IntPtr(1000) }, null, 1, new[] { clevent }, out clevent);
    error = Cl.EnqueueNDRangeKernel(cmdQueue, kernelAdd, 1, null, new IntPtr[] { new IntPtr(1000) }, null, 1, new[] { clevent }, out clevent);
    Console.WriteLine($"Run result: {error}");

    // Non-blocking read of C, then wait on its event before touching the array.
    error = Cl.EnqueueReadBuffer(cmdQueue, hDeviceMemC, Bool.False, 0, C.Length, C, 1, new[] { clevent }, out clevent);
    Cl.WaitForEvents(1, new[] { clevent });

    for (var i = 0; i < 1000; i++)
    {
        Console.WriteLine($"[{i}]: {C[i]}");
    }

    program.Dispose();

    foreach (var res in typeof(SourceLoader).Assembly.GetManifestResourceNames())
    {
        Console.WriteLine(res);
    }
}
/// <summary>
/// Renders the plot by running the "mandelbrot" OpenCL kernel over a
/// <c>Width</c> x <c>Height</c> RGBA8 image and copying the result into a new
/// <see cref="Bitmap"/>.
/// </summary>
/// <returns>A new bitmap of size <c>Width</c> x <c>Height</c> holding the kernel output.</returns>
public override Bitmap Plot()
{
    ErrorCode error;

    // The using statement owns the kernel handle. The explicit
    // Cl.ReleaseKernel call that used to sit next to it was removed because it
    // released the same handle a second time.
    // NOTE(review): assumes Kernel.Dispose() releases the handle - confirm
    // against the OpenCL.Net version in use.
    using (Kernel kernel = CompileKernel("mandelbrot"))
    {
        int intPtrSize = Marshal.SizeOf(typeof(IntPtr));
        int uint4size = Marshal.SizeOf(typeof(uint4));

        // OpenCL image format used to hold the image data
        OpenCL.Net.ImageFormat clImageFormat = new OpenCL.Net.ImageFormat(ChannelOrder.RGBA, ChannelType.Unsigned_Int8);

        // Size of one pixel in memory, in bytes
        int depth = Bitmap.GetPixelFormatSize(PixelFormat.Format32bppArgb) / 8;
        // Row size rounded up to a multiple of 4 bytes
        int stride = 4 * ((Width * depth + 3) / 4);
        // Host-side buffer the image is processed in
        byte[] buffer = new byte[Height * stride];

        // Create the OpenCL image object
        Mem image2dbuffer = (Mem)Cl.CreateImage2D(context, MemFlags.CopyHostPtr | MemFlags.WriteOnly, clImageFormat,
            (IntPtr)Width, (IntPtr)Height, (IntPtr)0, buffer, out error);
        CheckErr(error, "Cl.CreateImage2D");

        try
        {
            // Pass the parameters to the kernel
            error = Cl.SetKernelArg(kernel, 0, (IntPtr)intPtrSize, image2dbuffer);
            CheckErr(error, "Cl.SetKernelArg imageBuffer");

            // Colors are passed with B and R swapped relative to the RGBA channel order.
            uint4 startColorUi = new uint4(Colors.StartColor.B, Colors.StartColor.G, Colors.StartColor.R, Colors.StartColor.A);
            error = Cl.SetKernelArg(kernel, 1, (IntPtr)uint4size, startColorUi);
            CheckErr(error, "Cl.SetKernelArg startColor");

            uint4 endColorUi = new uint4(Colors.EndColor.B, Colors.EndColor.G, Colors.EndColor.R, Colors.EndColor.A);
            error = Cl.SetKernelArg(kernel, 2, (IntPtr)uint4size, endColorUi);
            CheckErr(error, "Cl.SetKernelArg endColor");

            error = Cl.SetKernelArg(kernel, 3, (IntPtr)sizeof(int), Iterations);
            CheckErr(error, "Cl.SetKernelArg iterations");

            // Create the command queue the commands below are submitted to
            CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, 0, out error);
            CheckErr(error, "Cl.CreateCommandQueue");

            try
            {
                // Copy the image to the device
                Event clevent;
                IntPtr[] imgOriginPtr = new IntPtr[] { (IntPtr)0, (IntPtr)0, (IntPtr)0 };          // x, y, z
                IntPtr[] imgRegionPtr = new IntPtr[] { (IntPtr)Width, (IntPtr)Height, (IntPtr)1 }; // x, y, z
                error = Cl.EnqueueWriteImage(cmdQueue, image2dbuffer, Bool.True, imgOriginPtr, imgRegionPtr,
                    (IntPtr)0, (IntPtr)0, buffer, 0, null, out clevent);
                CheckErr(error, "Cl.EnqueueWriteImage");

                // Run the kernel over a 2D range, one work item per pixel
                IntPtr[] workGroupSizePtr = new IntPtr[] { (IntPtr)Width, (IntPtr)Height, (IntPtr)1 }; // x, y, z
                error = Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, workGroupSizePtr, null, 0, null, out clevent);
                CheckErr(error, "Cl.EnqueueNDRangeKernel");

                // Wait for execution to finish
                error = Cl.Finish(cmdQueue);
                CheckErr(error, "Cl.Finish");

                // Read the processed image back into the host buffer
                error = Cl.EnqueueReadImage(cmdQueue, image2dbuffer, Bool.True, imgOriginPtr, imgRegionPtr,
                    (IntPtr)0, (IntPtr)0, buffer, 0, null, out clevent);
                CheckErr(error, "Cl.clEnqueueReadImage");
            }
            finally
            {
                // Release the queue even when one of the checks above throws.
                Cl.ReleaseCommandQueue(cmdQueue);
            }
        }
        finally
        {
            Cl.ReleaseMemObject(image2dbuffer);
        }

        // Create the bitmap only after the device work succeeded, so a failure
        // above does not leave an undisposed Bitmap behind.
        Bitmap plotImg = new Bitmap(Width, Height);
        BitmapData data = plotImg.LockBits(new Rectangle(0, 0, plotImg.Width, plotImg.Height),
            ImageLockMode.WriteOnly, plotImg.PixelFormat);
        // Copy the buffer contents back into the image
        Marshal.Copy(buffer, 0, data.Scan0, buffer.Length);
        plotImg.UnlockBits(data);

        return plotImg;
    }
}
/// <summary>
/// Handles an HTTP request for the image-processing service.
/// GET returns an HTML page containing the upload form. POST downloads the
/// image named by the "imageUploadUrl" request value, runs the configured
/// OpenCL kernel from Kernel.cl over it and returns the result as a JPEG.
/// </summary>
/// <param name="request">Incoming request.</param>
/// <returns>
/// 200 with the form (GET) or the processed JPEG (POST); 400 when the URL is
/// missing or the download fails; 500 when the downloaded data cannot be
/// decoded as an image; 404 when Kernel.cl is missing or does not build;
/// 501 for any other HTTP method.
/// </returns>
public HTTPResponse GetResponse(HTTPRequest request)
{
    HTTPResponse response = new HTTPResponse(200);
    ErrorCode error;

    if (!_isInit)
    {
        init();
        _isInit = true;
    }

    if (request.Method == HTTPRequest.METHOD_GET)
    {
        // Input form, this can be place by any HTML page
        StringBuilder sb = new StringBuilder();
        sb.Append("<html><body>");
        sb.Append(GenUploadForm());
        sb.Append("</body></html>");
        response.Body = Encoding.UTF8.GetBytes(sb.ToString());
        return response;
    }

    if (request.Method != HTTPRequest.METHOD_POST)
    {
        return new HTTPResponse(501);
    }

    // Get remote image from URL
    // NOTE(review): the URL comes straight from the request, so the server
    // fetches whatever address the client names (SSRF risk) - consider
    // restricting the allowed schemes/hosts.
    string rawUrl = request.GetRequestByKey("imageUploadUrl");
    if (string.IsNullOrEmpty(rawUrl))
    {
        return new HTTPResponse(400);
    }

    byte[] data;
    try
    {
        data = DownloadImageFromUrl(Uri.UnescapeDataString(rawUrl));
    }
    catch (Exception)
    {
        return new HTTPResponse(400);
    }

    // https://www.codeproject.com/Articles/502829/GPGPU-image-processing-basics-using-OpenCL-NET
    // Convert image to a 32bpp ARGB byte array. The GDI+ objects are only
    // needed for this conversion and are disposed as soon as it is done.
    int imagewidth;
    int imageHeight;
    int imageStride;
    byte[] inputByteArray;
    try
    {
        using (MemoryStream inputStream = new MemoryStream(data))
        using (Image inputImage = Image.FromStream(inputStream))
        using (Bitmap bmpImage = new Bitmap(inputImage))
        {
            BitmapData bitmapData = bmpImage.LockBits(new Rectangle(0, 0, bmpImage.Width, bmpImage.Height),
                ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            try
            {
                imagewidth = bitmapData.Width;
                imageHeight = bitmapData.Height;
                imageStride = bitmapData.Stride;
                inputByteArray = new byte[imageStride * imageHeight];
                Marshal.Copy(bitmapData.Scan0, inputByteArray, 0, inputByteArray.Length);
            }
            finally
            {
                bmpImage.UnlockBits(bitmapData);
            }
        }
    }
    catch (ArgumentException)
    {
        // Image.FromStream throws (rather than returning null) when the data
        // is not a valid image.
        return new HTTPResponse(500);
    }

    // Load kernel source code
    string programPath = System.Environment.CurrentDirectory + "/Kernel.cl";
    if (!System.IO.File.Exists(programPath))
    {
        return new HTTPResponse(404);
    }
    string programSource = System.IO.File.ReadAllText(programPath);

    using (OpenCL.Net.Program program = Cl.CreateProgramWithSource(_context, 1, new[] { programSource }, null, out error))
    {
        // Create kernel
        LogError(error, "Cl.CreateProgramWithSource");
        error = Cl.BuildProgram(program, 1, new[] { _device }, string.Empty, null, IntPtr.Zero);
        LogError(error, "Cl.BuildProgram");

        using (var buildStatus = Cl.GetProgramBuildInfo(program, _device, ProgramBuildInfo.Status, out error))
        {
            if (buildStatus.CastTo<OpenCL.Net.BuildStatus>() != BuildStatus.Success)
            {
                LogError(error, "Cl.GetProgramBuildInfo");
                return new HTTPResponse(404);
            }
        }

        Kernel kernel = Cl.CreateKernel(program, _parameters["KernelFunction"], out error);
        LogError(error, "Cl.CreateKernel");

        // Create image memory objects
        OpenCL.Net.ImageFormat clImageFormat = new OpenCL.Net.ImageFormat(ChannelOrder.RGBA, ChannelType.Unsigned_Int8);
        IMem inputImage2DBuffer = Cl.CreateImage2D(_context, MemFlags.CopyHostPtr | MemFlags.ReadOnly, clImageFormat,
            (IntPtr)imagewidth, (IntPtr)imageHeight, (IntPtr)0, inputByteArray, out error);
        LogError(error, "CreateImage2D input");

        byte[] outputByteArray = new byte[inputByteArray.Length];
        IMem outputImage2DBuffer = Cl.CreateImage2D(_context, MemFlags.CopyHostPtr | MemFlags.WriteOnly, clImageFormat,
            (IntPtr)imagewidth, (IntPtr)imageHeight, (IntPtr)0, outputByteArray, out error);
        LogError(error, "CreateImage2D output");

        // Set arguments
        int IntPtrSize = Marshal.SizeOf(typeof(IntPtr));
        error = Cl.SetKernelArg(kernel, 0, (IntPtr)IntPtrSize, inputImage2DBuffer);
        error |= Cl.SetKernelArg(kernel, 1, (IntPtr)IntPtrSize, outputImage2DBuffer);
        LogError(error, "Cl.SetKernelArg");

        // Create command queue
        CommandQueue cmdQueue = Cl.CreateCommandQueue(_context, _device, (CommandQueueProperties)0, out error);
        LogError(error, "Cl.CreateCommandQueue");

        try
        {
            Event clevent;

            // Copy input image from the host to the GPU
            IntPtr[] originPtr = new IntPtr[] { (IntPtr)0, (IntPtr)0, (IntPtr)0 };
            IntPtr[] regionPtr = new IntPtr[] { (IntPtr)imagewidth, (IntPtr)imageHeight, (IntPtr)1 };
            IntPtr[] workGroupSizePtr = new IntPtr[] { (IntPtr)imagewidth, (IntPtr)imageHeight, (IntPtr)1 };
            error = Cl.EnqueueWriteImage(cmdQueue, inputImage2DBuffer, Bool.True, originPtr, regionPtr,
                (IntPtr)0, (IntPtr)0, inputByteArray, 0, null, out clevent);
            LogError(error, "Cl.EnqueueWriteImage");

            // Run the kernel, one work item per pixel
            error = Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, workGroupSizePtr, null, 0, null, out clevent);
            LogError(error, "Cl.EnqueueNDRangeKernel");

            // Wait for finish event
            error = Cl.Finish(cmdQueue);
            LogError(error, "Cl.Finish");

            // Read the output image back from GPU
            error = Cl.EnqueueReadImage(cmdQueue, outputImage2DBuffer, Bool.True, originPtr, regionPtr,
                (IntPtr)0, (IntPtr)0, outputByteArray, 0, null, out clevent);
            LogError(error, "Cl.EnqueueReadImage");
            error = Cl.Finish(cmdQueue);
            LogError(error, "Cl.Finih");
        }
        finally
        {
            // Release memory, also when something above throws
            Cl.ReleaseKernel(kernel);
            Cl.ReleaseCommandQueue(cmdQueue);
            Cl.ReleaseMemObject(inputImage2DBuffer);
            Cl.ReleaseMemObject(outputImage2DBuffer);
        }

        // Convert binary bitmap to JPEG image and return as response.
        // The array stays pinned only while the Bitmap wrapping it is alive.
        GCHandle pinnedOutputArray = GCHandle.Alloc(outputByteArray, GCHandleType.Pinned);
        try
        {
            IntPtr outputBmpPointer = pinnedOutputArray.AddrOfPinnedObject();
            using (Bitmap outputBitmap = new Bitmap(imagewidth, imageHeight, imageStride, PixelFormat.Format32bppArgb, outputBmpPointer))
            using (MemoryStream msOutput = new MemoryStream())
            {
                outputBitmap.Save(msOutput, System.Drawing.Imaging.ImageFormat.Jpeg);
                response.Body = msOutput.ToArray();
            }
        }
        finally
        {
            pinnedOutputArray.Free();
        }

        response.Type = "image/jpeg";
        return response;
    }
}
/// <summary>
/// Loads one mini-batch into this layer's output neurons.
/// With <c>OPENCL_ENABLED</c>, each selected example is copied into its slot of
/// <c>outputNeurons.ActivationsGPU</c>, and the <c>InputDropout</c> kernel is
/// then run once when <c>dropoutParameter</c> is below 1.0. Without it, each
/// example is passed to <c>outputNeurons.SetHost</c> and no dropout is applied.
/// </summary>
/// <param name="dataSet">Data set the examples are taken from.</param>
/// <param name="iExamples">
/// Indices of the examples to load; entry <c>m</c> fills slot <c>m</c> of the
/// mini-batch, so at least <c>outputNeurons.MiniBatchSize</c> entries are read.
/// </param>
public void FeedData(DataSet dataSet, int[] iExamples)
{
#if TIMING_LAYERS
    Utils.InputFeedTimer.Start();
#endif

    int dataPointSize = dataSet.DataDimension;

    for (int m = 0; m < outputNeurons.MiniBatchSize; m++)
    {
#if OPENCL_ENABLED
        int iDataPoint = iExamples[m];

        OpenCLSpace.ClError = Cl.EnqueueCopyBuffer(OpenCLSpace.Queue,
                                                    dataSet.DataContainer[iDataPoint].Data, // source
                                                    outputNeurons.ActivationsGPU, // destination
                                                    (IntPtr)0, // source offset (in bytes)
                                                    (IntPtr)(sizeof(float) * m * dataPointSize), // destination offset (in bytes)
                                                    (IntPtr)(sizeof(float) * dataPointSize),  // size of buffer to copy
                                                    0,
                                                    null,
                                                    out OpenCLSpace.ClEvent);
        OpenCLSpace.CheckErr(OpenCLSpace.ClError, "InputLayer.FeedData Cl.EnqueueCopyBuffer inputData");

        OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
        OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");
#else
        outputNeurons.SetHost(m, dataSet.Data[iExamples[m]]);
#endif
    }

#if OPENCL_ENABLED
    // Dropout!
    // The kernel is given nOutputUnits * MiniBatchSize as its element count,
    // i.e. it covers the whole mini-batch, so it is launched once after all
    // examples have been copied. It used to run inside the loop above, which
    // re-applied dropout to examples copied in earlier iterations.
    if (dropoutParameter < 1.0)
    {
        // Set kernel arguments
        OpenCLSpace.ClError = Cl.SetKernelArg(OpenCLSpace.InputDropout, 0, outputNeurons.ActivationsGPU);
        OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.InputDropout, 1, (IntPtr)sizeof(int), nOutputUnits * outputNeurons.MiniBatchSize);
        OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.InputDropout, 2, (IntPtr)sizeof(float), (float)dropoutParameter);
        // Fresh seed for every call
        OpenCLSpace.ClError |= Cl.SetKernelArg(OpenCLSpace.InputDropout, 3, (IntPtr)sizeof(ulong), (ulong)Guid.NewGuid().GetHashCode());
        OpenCLSpace.CheckErr(OpenCLSpace.ClError, "InputDropout: Cl.SetKernelArg");

        // Run kernel
        OpenCLSpace.ClError = Cl.EnqueueNDRangeKernel(OpenCLSpace.Queue,
                                                        OpenCLSpace.InputDropout,
                                                        1,
                                                        null,
                                                        dropoutGlobalWorkSizePtr,
                                                        dropoutLocalWorkSizePtr,
                                                        0,
                                                        null,
                                                        out OpenCLSpace.ClEvent);
        OpenCLSpace.CheckErr(OpenCLSpace.ClError, "InputDropout: Cl.EnqueueNDRangeKernel");

        OpenCLSpace.ClError = Cl.ReleaseEvent(OpenCLSpace.ClEvent);
        OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.ReleaseEvent");
    }

    // Wait for the copies (and the dropout kernel, if it ran) to complete.
    OpenCLSpace.ClError = Cl.Finish(OpenCLSpace.Queue);
    OpenCLSpace.CheckErr(OpenCLSpace.ClError, "Cl.Finish");
#endif

#if TIMING_LAYERS
    Utils.InputFeedTimer.Stop();
#endif
}