/// <summary>
/// Times repeated device-to-host copies into an ordinary managed byte array
/// and converts the elapsed time into a bandwidth figure.
/// </summary>
/// <param name="context">Context that allocates the device buffer.</param>
/// <param name="commandQueue">Queue that receives every transfer command.</param>
/// <param name="memSize">Size in bytes of the host array, the device buffer and each copy.</param>
/// <param name="accessMode">
/// <c>AccessMode.Direct</c> copies with EnqueueReadBuffer; any other value maps the
/// device buffer once and copies out of the mapped pointer with CopyMemory.
/// </param>
/// <returns>2.0 * memSize * MemoryCopyIterations / (elapsed seconds * 2^20).</returns>
private unsafe double TestDeviceToHostTransferPaged(Context context, CommandQueue commandQueue, int memSize, AccessMode accessMode)
{
    // standard host allocation, filled with a repeating 0..255 byte pattern
    byte[] data = new byte[memSize];
    for (int i = 0; i < data.Length; i++)
    {
        data[i] = (byte)i;
    }

    // the array must stay pinned while its address is handed to the queue
    fixed (byte* pdata = data)
    {
        // allocate device memory
        using (Buffer deviceData = context.CreateBuffer(MemoryFlags.ReadWrite, memSize))
        {
            // initialize device memory; the returned event is not needed, so release it right away
            using (commandQueue.EnqueueWriteBuffer(deviceData, false, 0, memSize, (IntPtr)pdata))
            {
            }

            // sync queue to host so the upload is not part of the measurement
            commandQueue.Finish();

            var timer = Stopwatch.StartNew();
            if (accessMode == AccessMode.Direct)
            {
                // DIRECT: API access to device buffer
                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    using (commandQueue.EnqueueReadBuffer(deviceData, false, 0, memSize, (IntPtr)pdata))
                    {
                    }
                }

                // the reads were enqueued non-blocking; wait for all of them
                commandQueue.Finish();
            }
            else
            {
                // MAPPED: mapped pointers to device buffer for conventional pointer access
                IntPtr dm_idata;
                using (commandQueue.EnqueueMapBuffer(deviceData, true, MapFlags.Read, 0, memSize, out dm_idata))
                {
                }

                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    CopyMemory((IntPtr)pdata, dm_idata, (UIntPtr)memSize);
                }

                using (commandQueue.EnqueueUnmapMemObject(deviceData, dm_idata))
                {
                }
            }

            // get the elapsed time in seconds
            double elapsedTimeInSeconds = timer.Elapsed.TotalSeconds;

            // Calculate bandwidth in MB/s
            //      This is for kernels that read and write GMEM simultaneously
            //      Obtained Throughput for unidirectional block copies will be 1/2 of this #
            // NOTE(review): this method only copies in one direction, so the 2.0 factor
            // looks carried over from a device-to-device measurement - confirm with callers
            // before changing the reported value.
            double bandwidthInMBs = 2.0 * ((double)memSize * (double)MemoryCopyIterations) / (elapsedTimeInSeconds * (double)(1 << 20));

            return bandwidthInMBs;
        }
    }
}
/// <summary>
/// Releases the wrapped buffer if one is still held and clears the cached pointer.
/// </summary>
/// <param name="disposing">Not consulted by this override.</param>
protected override void Dispose(bool disposing)
{
    // Nothing to release when the buffer is already gone.
    if (Buffer == null)
    {
        return;
    }

    // Synchronize() runs before the buffer is disposed, but only when HasOwnership is set.
    if (HasOwnership)
    {
        Synchronize();
    }

    Buffer.Dispose();
    Buffer = null;
    Ptr = IntPtr.Zero;
}
/// <summary>
/// Checks that a buffer's Destroyed event has not fired while the buffer is still in use
/// and has fired once both the buffer and its context have been disposed.
/// </summary>
public void TestBufferDestroyedEvent()
{
    bool eventRaised = false;
    Platform firstPlatform = Platform.GetPlatforms()[0];

    using (Context ctx = Context.Create(firstPlatform.GetDevices()))
    using (Buffer target = ctx.CreateBuffer(MemoryFlags.AllocateHostPointer, 1024))
    {
        target.Destroyed += (sender, e) => { eventRaised = true; };

        // Subscribing alone must not trigger the handler.
        Assert.IsFalse(eventRaised);
    }

    Assert.IsTrue(eventRaised);
}
/// <summary>
/// Runs <paramref name="kernel"/> once over buffers created from <paramref name="input"/> and
/// <paramref name="output"/>, then brings the output back to the host, reporting how long each
/// phase took.
/// </summary>
/// <param name="context">Context used to create the two buffers.</param>
/// <param name="device">Device queried for the kernel's maximum work-group size.</param>
/// <param name="commandQueue">Queue that receives the kernel launch and the read-back.</param>
/// <param name="kernel">Kernel whose arguments 0 and 1 are set to the input and output buffers.</param>
/// <param name="input">Host data backing the input buffer; its length sizes both buffers.</param>
/// <param name="output">Host array backing (or receiving) the output buffer.</param>
/// <param name="globalWorkSize">Global work size passed to the launch.</param>
/// <param name="localWorkSize">Local work size passed to the launch unless <paramref name="autoGroupSize"/> is set.</param>
/// <param name="warming">When true, the kernel is launched once, untimed, before the measured run.</param>
/// <param name="useHostPointer">Selects UseHostPointer (true) or CopyHostPointer (false) for both buffers.</param>
/// <param name="autoGroupSize">When true, null is passed as the local work size.</param>
/// <param name="enableProfiling">When true, the event's start/end times are read to fill <paramref name="profiledTime"/>.</param>
/// <param name="stopwatchTime">Wall-clock time from launch until the kernel event completed.</param>
/// <param name="profiledTime">Event end minus start converted to a TimeSpan, or zero without profiling.</param>
/// <param name="readTime">Wall-clock time of the map/unmap or read-back step, including the final Finish.</param>
private unsafe void ExecuteKernel(
    Context context,
    Device device,
    CommandQueue commandQueue,
    Kernel kernel,
    float[] input,
    float[] output,
    int globalWorkSize,
    int localWorkSize,
    bool warming,
    bool useHostPointer,
    bool autoGroupSize,
    bool enableProfiling,
    out TimeSpan stopwatchTime,
    out TimeSpan profiledTime,
    out TimeSpan readTime)
{
    // Both buffers use the same host-pointer strategy and differ only in access rights.
    MemoryFlags hostPointerFlag = useHostPointer ? MemoryFlags.UseHostPointer : MemoryFlags.CopyHostPointer;
    MemoryFlags inFlags = hostPointerFlag | MemoryFlags.ReadOnly;
    MemoryFlags outFlags = hostPointerFlag | MemoryFlags.ReadWrite;

    int elementCount = input.Length;
    int byteCount = sizeof(float) * elementCount;

    // allocate buffers over the pinned host arrays
    fixed (float* inputPtr = input, outputPtr = output)
    {
        using (Buffer inputBuffer = context.CreateBuffer(inFlags, byteCount, (IntPtr)inputPtr))
        using (Buffer outputBuffer = context.CreateBuffer(outFlags, byteCount, (IntPtr)outputPtr))
        {
            kernel.Arguments[0].SetValue(inputBuffer);
            kernel.Arguments[1].SetValue(outputBuffer);

            Console.WriteLine("Original global work size {0}", globalWorkSize);
            Console.WriteLine("Original local work size {0}", localWorkSize);
            if (autoGroupSize)
            {
                Console.WriteLine("Run-time determines optimal workgroup size");
            }

            IntPtr workGroupSizeMaximum = kernel.GetWorkGroupSize(device);
            Console.WriteLine("Maximum workgroup size for this kernel {0}", workGroupSizeMaximum.ToInt64());

            if (warming)
            {
                Console.Write("Warming up OpenCL execution...");
                IntPtr[] warmupGlobal = new[] { (IntPtr)globalWorkSize };
                IntPtr[] warmupLocal = autoGroupSize ? null : new[] { (IntPtr)localWorkSize };
                using (commandQueue.EnqueueNDRangeKernel(kernel, warmupGlobal, warmupLocal))
                {
                }

                commandQueue.Finish();
                Console.WriteLine("Done");
            }

            Console.Write("Executing OpenCL kernel...");
            Stopwatch timer = Stopwatch.StartNew();

            // execute kernel; a null local size is passed when autoGroupSize is set
            IntPtr[] launchGlobal = new[] { (IntPtr)globalWorkSize };
            IntPtr[] launchLocal = autoGroupSize ? null : new[] { (IntPtr)localWorkSize };
            using (Event perfEvent = commandQueue.EnqueueNDRangeKernel(kernel, launchGlobal, launchLocal))
            {
                Event.WaitAll(perfEvent);
                stopwatchTime = timer.Elapsed;
                Console.WriteLine("Done");

                profiledTime = TimeSpan.Zero;
                if (enableProfiling)
                {
                    ulong start = perfEvent.CommandStartTime;
                    ulong end = perfEvent.CommandEndTime;

                    // a tick is 100ns
                    long elapsed = (long)(end - start);
                    profiledTime = TimeSpan.FromTicks(elapsed / 100);
                }
            }

            timer.Restart();
            if (!useHostPointer)
            {
                // blocking copy of the results into the host array
                using (commandQueue.EnqueueReadBuffer(outputBuffer, true, 0, byteCount, (IntPtr)outputPtr))
                {
                }
            }
            else
            {
                // map the buffer, check that the mapping is the original host array, then unmap
                IntPtr mappedPtr;
                using (commandQueue.EnqueueMapBuffer(outputBuffer, true, MapFlags.Read, 0, byteCount, out mappedPtr))
                {
                }

                Assert.AreEqual((IntPtr)outputPtr, mappedPtr, "EnqueueMapBuffer failed to return original pointer");
                using (commandQueue.EnqueueUnmapMemObject(outputBuffer, mappedPtr))
                {
                }
            }

            commandQueue.Finish();
            readTime = timer.Elapsed;
        }
    }
}
/// <summary>
/// Times repeated host-to-device copies whose source is a buffer created with
/// AllocateHostPointer, and converts the elapsed time into a bandwidth figure.
/// </summary>
/// <param name="context">Context that allocates the host-side and device-side buffers.</param>
/// <param name="commandQueue">Queue that receives every map, unmap and transfer command.</param>
/// <param name="memSize">Size in bytes of both buffers and of each copy.</param>
/// <param name="accessMode">
/// <c>AccessMode.Direct</c> copies with EnqueueWriteBuffer; any other value maps the
/// device buffer once and copies into the mapped pointer with CopyMemory.
/// </param>
/// <returns>2.0 * memSize * MemoryCopyIterations / (elapsed seconds * 2^20).</returns>
private double TestHostToDeviceTransferPinned(Context context, CommandQueue commandQueue, int memSize, AccessMode accessMode)
{
    // Create a host buffer
    using (Buffer pinnedData = context.CreateBuffer(MemoryFlags.ReadWrite | MemoryFlags.AllocateHostPointer, memSize))
    {
        // get a mapped pointer; the returned event is not needed, so release it right away
        IntPtr h_data;
        using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Write, 0, memSize, out h_data))
        {
        }

        // initialize with a repeating 0..255 byte pattern
        for (int i = 0; i < memSize; i++)
        {
            Marshal.WriteByte(h_data, i, (byte)i);
        }

        // unmap and make data in the host buffer valid
        using (commandQueue.EnqueueUnmapMemObject(pinnedData, h_data))
        {
        }

        // allocate device memory
        using (Buffer deviceData = context.CreateBuffer(MemoryFlags.ReadWrite, memSize))
        {
            // sync queue to host so the setup is not part of the measurement
            commandQueue.Finish();

            var timer = Stopwatch.StartNew();
            if (accessMode == AccessMode.Direct)
            {
                using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Read, 0, memSize, out h_data))
                {
                }

                // DIRECT: API access to device buffer
                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    using (commandQueue.EnqueueWriteBuffer(deviceData, false, 0, memSize, h_data))
                    {
                    }
                }

                // the writes were enqueued non-blocking; wait for all of them
                commandQueue.Finish();
            }
            else
            {
                // MAPPED: mapped pointers to device buffer for conventional pointer access
                IntPtr dm_idata;
                using (commandQueue.EnqueueMapBuffer(deviceData, true, MapFlags.Write, 0, memSize, out dm_idata))
                {
                }

                using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Read, 0, memSize, out h_data))
                {
                }

                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    CopyMemory(dm_idata, h_data, (UIntPtr)memSize);
                }

                using (commandQueue.EnqueueUnmapMemObject(deviceData, dm_idata))
                {
                }
            }

            // get the elapsed time in seconds
            double elapsedTimeInSeconds = timer.Elapsed.TotalSeconds;

            // Both branches mapped pinnedData for reading; release that mapping before the
            // buffer is disposed. This happens after the timer was sampled, so the
            // measurement itself is unaffected.
            using (commandQueue.EnqueueUnmapMemObject(pinnedData, h_data))
            {
            }

            commandQueue.Finish();

            // Calculate bandwidth in MB/s
            //      This is for kernels that read and write GMEM simultaneously
            //      Obtained Throughput for unidirectional block copies will be 1/2 of this #
            // NOTE(review): this method only copies in one direction, so the 2.0 factor
            // looks carried over from a device-to-device measurement - confirm with callers
            // before changing the reported value.
            double bandwidthInMBs = 2.0 * ((double)memSize * (double)MemoryCopyIterations) / (elapsedTimeInSeconds * (double)(1 << 20));

            return bandwidthInMBs;
        }
    }
}
/// <summary>
/// Adds two float arrays element by element with an OpenCL kernel and checks the result
/// against VectorAddHost. GPU devices are preferred; CPU devices are used when the
/// platform reports no GPU.
/// </summary>
public unsafe void TestVectorAdd()
{
    // Kernel source, assembled with explicit line breaks: the source contains '//' line
    // comments, so every statement has to sit on its own line or it would be commented out.
    string source = string.Join("\n", new[]
    {
        "// OpenCL Kernel Function for element by element vector addition",
        "__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements)",
        "{",
        "    // get index into global data array",
        "    int iGID = get_global_id(0);",
        "",
        "    // bound check (equivalent to the limit on a 'for' loop for standard/serial C code",
        "    if (iGID >= iNumElements)",
        "    {",
        "        return;",
        "    }",
        "",
        "    // add the vector elements",
        "    c[iGID] = a[iGID] + b[iGID];",
        "}",
        ""
    });

    Console.WriteLine("TestVectorAdd Starting...");
    Console.WriteLine();
    Console.WriteLine("# of float elements per array \t= {0}", NumElements);

    // set global and local work size dimensions
    int localWorkSize = 256;
    int globalWorkSize = RoundUp(localWorkSize, NumElements);
    Console.WriteLine("Global work size \t\t= {0}", globalWorkSize);
    Console.WriteLine("Local work size \t\t= {0}", localWorkSize);
    Console.WriteLine("Number of work groups \t\t= {0}", (globalWorkSize % localWorkSize) + (globalWorkSize / localWorkSize));
    Console.WriteLine();

    // allocate and initialize host arrays; the device-facing arrays are sized to the
    // global work size, the reference result only to NumElements
    Console.WriteLine("Allocate and initialize host memory...");
    float[] srcA = new float[globalWorkSize];
    float[] srcB = new float[globalWorkSize];
    float[] dst = new float[globalWorkSize];
    float[] golden = new float[NumElements];
    FillArray(srcA, NumElements);
    FillArray(srcB, NumElements);

    // get an OpenCL platform
    Console.WriteLine("Get platform...");
    Platform platform = OclUtils.GetPlatform();

    // get the devices
    Console.WriteLine("Get GPU devices...");
    Device[] devices = platform.GetDevices(DeviceType.Gpu);
    if (devices.Length == 0)
    {
        Console.WriteLine("No GPU devices found. Falling back to CPU for test...");
        devices = platform.GetDevices(DeviceType.Cpu);
        Assert.AreNotEqual(0, devices.Length, "There are no devices supporting OpenCL");
    }

    // create the context
    Console.WriteLine("Get context...");
    using (var context = Context.Create(devices))
    {
        // create a command queue on the first device
        Console.WriteLine("Get command queue...");
        using (CommandQueue commandQueue = context.CreateCommandQueue(devices[0], CommandQueueProperties.None))
        {
            Console.WriteLine("Create buffers...");
            using (Buffer deviceSrcA = context.CreateBuffer(MemoryFlags.ReadOnly, globalWorkSize * sizeof(float)),
                          deviceSrcB = context.CreateBuffer(MemoryFlags.ReadOnly, globalWorkSize * sizeof(float)),
                          deviceDst = context.CreateBuffer(MemoryFlags.WriteOnly, globalWorkSize * sizeof(float)))
            {
                // create the program
                Console.WriteLine("Create program with source...");
                using (Program program = context.CreateProgramWithSource(source))
                {
                    // build the program
                    string options;
#if false
                    options = "-cl-fast-relaxed-math -DMAC";
#else
                    options = "-cl-fast-relaxed-math";
#endif
                    program.Build(options);

                    // create the kernel
                    using (Kernel kernel = program.CreateKernel("VectorAdd"))
                    {
                        kernel.Arguments[0].SetValue(deviceSrcA);
                        kernel.Arguments[1].SetValue(deviceSrcB);
                        kernel.Arguments[2].SetValue(deviceDst);
                        kernel.Arguments[3].SetValue(NumElements);

                        // Start core sequence... copy input data to GPU, compute, copy results back
                        fixed (float* psrcA = srcA, psrcB = srcB, pdst = dst)
                        {
                            // asynchronous write of data to GPU device
                            using (commandQueue.EnqueueWriteBuffer(deviceSrcA, false, 0, sizeof(float) * globalWorkSize, (IntPtr)psrcA))
                            using (commandQueue.EnqueueWriteBuffer(deviceSrcB, false, 0, sizeof(float) * globalWorkSize, (IntPtr)psrcB))
                            {
                            }

                            // launch kernel
                            using (commandQueue.EnqueueNDRangeKernel(kernel, (IntPtr)globalWorkSize, (IntPtr)localWorkSize))
                            {
                            }

                            // synchronous/blocking read of results, and check accumulated errors
                            using (commandQueue.EnqueueReadBuffer(deviceDst, true, 0, sizeof(float) * globalWorkSize, (IntPtr)pdst))
                            {
                            }
                        }
                    }
                }
            }
        }
    }

    // compute and compare results for golden-host and report errors and pass/fail
    Console.WriteLine("Comparing against host computation...");
    Console.WriteLine();
    VectorAddHost(srcA, srcB, golden, NumElements);
    bool match = Comparefet(golden, dst, NumElements, 0.0f, 0);
    Assert.IsTrue(match);
}