/// <summary>
/// Runs the quick-mode bandwidth measurement for host-to-device, device-to-host and
/// device-to-device copies, using device index 0 of the selected platform.
/// </summary>
/// <remarks>
/// GPU devices are preferred. When the platform reports none, the CPU devices are used
/// instead, and the test fails if that list is empty as well.
/// </remarks>
/// <exception cref="NotImplementedException">
/// Thrown when the test mode is <c>TestMode.Range</c>; this method always selects
/// <c>TestMode.Quick</c>, so the branch is a placeholder.
/// </exception>
public void TestBandwidth()
{
    // Fixed configuration: one transfer size (start == end), readable output,
    // pageable host memory and direct access.
    int firstSize = DefaultSize;
    int lastSize = DefaultSize;
    int sizeStep = DefaultIncrement;
    PrintMode output = PrintMode.UserReadable;
    MemoryMode hostMemory = MemoryMode.Pageable;
    AccessMode access = AccessMode.Direct;

    // Platform selection is delegated to OclUtils (per the original note: NVIDIA if
    // available, otherwise the default platform).
    Platform selectedPlatform = OclUtils.GetPlatform();

    // Prefer GPUs; use CPUs only when the platform exposes no GPU at all.
    Device[] devices = selectedPlatform.GetDevices(DeviceType.Gpu);
    if (devices.Length == 0)
    {
        Console.WriteLine("No GPU devices found. Falling back to CPU for test...");
        devices = selectedPlatform.GetDevices(DeviceType.Cpu);
        Assert.AreNotEqual(0, devices.Length, "There are no devices supporting OpenCL");
    }

    // Inclusive range of device indices to exercise; only the first device is used.
    int firstDevice = 0;
    int lastDevice = 0;

    // Log the devices the test is about to run on.
    Console.WriteLine("Running on...");
    Console.WriteLine();
    for (int deviceIndex = firstDevice; deviceIndex <= lastDevice; deviceIndex++)
    {
        Console.WriteLine(devices[deviceIndex].Name);
    }
    Console.WriteLine();

    // Log the mode.
    Console.WriteLine("Quick Mode");
    Console.WriteLine();
    TestMode mode = TestMode.Quick;

    // Range mode is not supported by this test yet.
    if (mode == TestMode.Range)
    {
        throw new NotImplementedException();
    }

    // Copy directions in the order they are measured, each with its on/off switch.
    MemoryCopyKind[] copyKinds =
    {
        MemoryCopyKind.HostToDevice,
        MemoryCopyKind.DeviceToHost,
        MemoryCopyKind.DeviceToDevice
    };
    bool[] copyKindEnabled = { true, true, true };

    using (var context = Context.Create(devices))
    {
        for (int k = 0; k < copyKinds.Length; k++)
        {
            if (!copyKindEnabled[k])
            {
                continue;
            }

            TestBandwidth(
                context,
                devices,
                firstSize,
                lastSize,
                sizeStep,
                mode,
                copyKinds[k],
                output,
                access,
                hostMemory,
                firstDevice,
                lastDevice);
        }
    }
}
/// <summary>
/// Adds two float vectors on an OpenCL device with the <c>VectorAdd</c> kernel and
/// asserts that the device result matches the host computation.
/// </summary>
/// <remarks>
/// GPU devices are preferred. When the platform reports none, the CPU devices are used
/// instead (the same policy as <c>TestBandwidth</c>), and the test fails with a clear
/// message if no OpenCL device exists at all. The first device in the list runs the kernel.
/// </remarks>
public unsafe void TestVectorAdd()
{
    Console.WriteLine("TestVectorAdd Starting...");
    Console.WriteLine();
    Console.WriteLine("# of float elements per array \t= {0}", NumElements);

    // Set global and local work size dimensions.
    // NOTE(review): RoundUp presumably rounds NumElements up to a multiple of
    // localWorkSize - confirm against its definition.
    int localWorkSize = 256;
    int globalWorkSize = RoundUp(localWorkSize, NumElements);
    Console.WriteLine("Global work size \t\t= {0}", globalWorkSize);
    Console.WriteLine("Local work size \t\t= {0}", localWorkSize);
    Console.WriteLine("Number of work groups \t\t= {0}", globalWorkSize % localWorkSize + globalWorkSize / localWorkSize);
    Console.WriteLine();

    // Allocate and initialize host arrays. The device-facing arrays are sized to the
    // global work size; only NumElements entries are passed to FillArray and compared.
    Console.WriteLine("Allocate and initialize host memory...");
    float[] srcA = new float[globalWorkSize];
    float[] srcB = new float[globalWorkSize];
    float[] dst = new float[globalWorkSize];
    float[] golden = new float[NumElements];
    FillArray(srcA, NumElements);
    FillArray(srcB, NumElements);

    // Size in bytes of each device buffer and of each host<->device transfer.
    int bufferBytes = globalWorkSize * sizeof(float);

    // Get an OpenCL platform.
    Console.WriteLine("Get platform...");
    Platform platform = OclUtils.GetPlatform();

    // Get the devices: prefer GPUs, fall back to CPUs so that devices[0] below is
    // always valid instead of failing with an index error on GPU-less machines.
    Console.WriteLine("Get GPU devices...");
    Device[] devices = platform.GetDevices(DeviceType.Gpu);
    if (devices.Length == 0)
    {
        Console.WriteLine("No GPU devices found. Falling back to CPU for test...");
        devices = platform.GetDevices(DeviceType.Cpu);
        Assert.AreNotEqual(0, devices.Length, "There are no devices supporting OpenCL");
    }

    // Kernel source. The line breaks matter: each '//' comment must end at its own
    // line, otherwise it swallows the code that follows it.
    string source = @"// OpenCL Kernel Function for element by element vector addition
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements)
{
    // get index into global data array
    int iGID = get_global_id(0);

    // bound check (equivalent to the limit on a 'for' loop for standard/serial C code
    if (iGID >= iNumElements)
    {
        return;
    }

    // add the vector elements
    c[iGID] = a[iGID] + b[iGID];
}
";

    // Create the context.
    Console.WriteLine("Get context...");
    using (Context context = Context.Create(devices))
    {
        // Create a command queue on the first device.
        Console.WriteLine("Get command queue...");
        using (CommandQueue commandQueue = context.CreateCommandQueue(devices[0], CommandQueueProperties.None))
        {
            Console.WriteLine("Create buffers...");
            using (Mem deviceSrcA = context.CreateBuffer(MemoryFlags.ReadOnly, bufferBytes),
                       deviceSrcB = context.CreateBuffer(MemoryFlags.ReadOnly, bufferBytes),
                       deviceDst = context.CreateBuffer(MemoryFlags.WriteOnly, bufferBytes))
            {
                // Create the program.
                Console.WriteLine("Create program with source...");
                using (Program program = context.CreateProgramWithSource(source))
                {
                    // Build the program.
                    string options = "-cl-fast-relaxed-math";
                    program.Build(options);

                    // Create the kernel and bind its arguments in signature order:
                    // a, b, c, iNumElements.
                    using (Kernel kernel = program.CreateKernel("VectorAdd"))
                    {
                        kernel.Arguments[0].SetValue(deviceSrcA);
                        kernel.Arguments[1].SetValue(deviceSrcB);
                        kernel.Arguments[2].SetValue(deviceDst);
                        kernel.Arguments[3].SetValue(NumElements);

                        // Core sequence: copy input data to the device, compute, copy
                        // results back. The arrays stay pinned for the whole sequence.
                        // NOTE(review): the writes are non-blocking; this relies on the
                        // queue executing in order so that the blocking read at the end
                        // completes only after the writes and the kernel - confirm.
                        fixed (float* psrcA = srcA, psrcB = srcB, pdst = dst)
                        {
                            // Asynchronous write of data to the device.
                            commandQueue.EnqueueWriteBuffer(deviceSrcA, false, 0, bufferBytes, (IntPtr)psrcA);
                            commandQueue.EnqueueWriteBuffer(deviceSrcB, false, 0, bufferBytes, (IntPtr)psrcB);

                            // Launch the kernel.
                            commandQueue.EnqueueNDRangeKernel(kernel, (IntPtr)globalWorkSize, (IntPtr)localWorkSize);

                            // Synchronous/blocking read of the results.
                            commandQueue.EnqueueReadBufferAndWait(deviceDst, (IntPtr)pdst, bufferBytes);
                        }
                    }
                }
            }
        }
    }

    // Compute the reference result on the host, compare, and report pass/fail.
    Console.WriteLine("Comparing against host computation...");
    Console.WriteLine();
    VectorAddHost(srcA, srcB, golden, NumElements);
    bool match = Comparefet(golden, dst, NumElements, 0.0f, 0);
    Assert.IsTrue(match);
}