/// <summary>
/// Measures the elapsed time (ms, via the GPU timer) for 100 asynchronous copies
/// between pinned host memory and the device, alternating between two pinned buffers.
/// </summary>
/// <param name="size">Number of ints per buffer.</param>
/// <param name="up">true = host-to-device copies; false = device-to-host copies.</param>
/// <returns>Elapsed time in milliseconds reported by the GPU timer.</returns>
private float cuda_host_alloc_copy_test(int size, bool up)
{
    // Page-locked (pinned) host buffers: async copies require pinned memory.
    IntPtr a = _gpu.HostAllocate<int>(size);
    IntPtr b = _gpu.HostAllocate<int>(size);
    int[] dev_a = _gpu.Allocate<int>(size);
    int[] host_a = new int[size];

    _gpu.StartTimer();
    for (int i = 0; i < 50; i++) // 50 iterations * two copies per loop = 100 copies
    {
        if (up)
        {
            a.Write(host_a);
            _gpu.CopyToDeviceAsync(a, 0, dev_a, 0, size);
            b.Write(host_a);
            _gpu.CopyToDeviceAsync(b, 0, dev_a, 0, size);
        }
        else
        {
            _gpu.CopyFromDeviceAsync(dev_a, 0, a, 0, size);
            // FIX: was b.Read(host_a) — read back the buffer just targeted ('a'),
            // mirroring the write/copy pairing of the 'up' branch.
            a.Read(host_a);
            _gpu.CopyFromDeviceAsync(dev_a, 0, b, 0, size);
            b.Read(host_a);
        }
    }
    _gpu.SynchronizeStream(); // wait for all queued async copies before stopping the timer
    float elapsedTime = _gpu.StopTimer();

    _gpu.FreeAll();
    _gpu.HostFree(a);
    _gpu.HostFree(b);
    GC.Collect(); // benchmark harness: reclaim host-side state before the next measurement
    return (elapsedTime);
}
/// <summary>
/// Benchmarks synchronous copy/launch/copy round-trips against Cudafy "smart copy"
/// (asynchronous, stream-per-batch, pinned staging buffers), then verifies that the
/// kernel ("DoubleAllValues") doubled every element. Skipped on OpenCL devices,
/// which do not support smart copy.
/// </summary>
public void Test_smartCopyToDevice()
{
    if (_gpu is OpenCLDevice)
    {
        Console.WriteLine("Device not supporting smart copy, so skip.");
        return;
    }
    var mod = CudafyModule.TryDeserialize();
    if (mod == null || !mod.TryVerifyChecksums())
    {
        mod = CudafyTranslator.Cudafy(CudafyModes.Architecture);
        mod.Serialize();
    }
    _gpu.LoadModule(mod);
    _gpuuintBufferIn = _gpu.Allocate<uint>(N);
    _gpuuintBufferOut = _gpu.Allocate<uint>(N);
    int batchSize = 8;
    int loops = 6;

    // Baseline: fully synchronous copy -> launch -> copy for every batch item.
    Stopwatch sw = Stopwatch.StartNew();
    for (int x = 0; x < loops; x++)
    {
        for (int i = 0; i < batchSize; i++)
        {
            _gpu.CopyToDevice(_uintBufferIn, 0, _gpuuintBufferIn, 0, N);
            _gpu.Launch(N / 512, 512, "DoubleAllValues", _gpuuintBufferIn, _gpuuintBufferOut);
            _gpu.CopyFromDevice(_gpuuintBufferOut, 0, _uintBufferOut, 0, N);
        }
    }
    long time = sw.ElapsedMilliseconds;
    Console.WriteLine(time);

    // One pair of pinned staging buffers per stream — async copies require pinned memory.
    IntPtr[] stagingPostIn = new IntPtr[batchSize];
    IntPtr[] stagingPostOut = new IntPtr[batchSize];
    for (int i = 0; i < batchSize; i++)
    {
        stagingPostIn[i] = _gpu.HostAllocate<uint>(N);
        stagingPostOut[i] = _gpu.HostAllocate<uint>(N);
    }
    _gpu.EnableSmartCopy();
    sw.Restart();
    for (int x = 0; x < loops; x++)
    {
        // Queue all uploads, then all launches, then all downloads; each batch item
        // uses its own stream id (i + 1) so work can overlap across streams.
        for (int i = 0; i < batchSize; i++)
        {
            _gpu.CopyToDeviceAsync(_uintBufferIn, 0, _gpuuintBufferIn, 0, N, i + 1, stagingPostIn[i]);
        }
        for (int i = 0; i < batchSize; i++)
        {
            _gpu.LaunchAsync(N / 256, 256, i + 1, "DoubleAllValues", _gpuuintBufferIn, _gpuuintBufferOut);
        }
        for (int i = 0; i < batchSize; i++)
        {
            _gpu.CopyFromDeviceAsync(_gpuuintBufferOut, 0, _uintBufferOut, 0, N, i + 1, stagingPostOut[i]);
        }
        // FIX: the original synchronized every stream twice per loop (two identical
        // loops separated by commented-out code); a single synchronize per stream
        // is sufficient. Dead commented-out code removed.
        for (int i = 0; i < batchSize; i++)
        {
            _gpu.SynchronizeStream(i + 1);
        }
    }
    time = sw.ElapsedMilliseconds;
    Console.WriteLine(time);
    _gpu.DisableSmartCopy();

    // The kernel doubles each value; double the input on the host and compare.
    for (int i = 0; i < N; i++)
    {
        _uintBufferIn[i] *= 2;
    }
    Assert.IsTrue(Compare(_uintBufferIn, _uintBufferOut));
    ClearOutputsAndGPU();
}
/// <summary>
/// Stream-overlap example (after CUDA by Example, ch. 10): processes FULL_DATA_SIZE
/// ints in chunks of N on two CUDA streams, interleaving uploads, kernel launches and
/// downloads so copies on one stream overlap compute on the other, then verifies the
/// result against a CPU reference (control) and reports the elapsed time.
/// </summary>
public static void Execute()
{
    CudafyModule km = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);

    int[] dev_a0, dev_b0, dev_c0; // stream 1 device buffers
    int[] dev_a1, dev_b1, dev_c1; // stream 2 device buffers

    // allocate the memory on the GPU
    dev_a0 = gpu.Allocate<int>(N);
    dev_b0 = gpu.Allocate<int>(N);
    dev_c0 = gpu.Allocate<int>(N);
    dev_a1 = gpu.Allocate<int>(N);
    dev_b1 = gpu.Allocate<int>(N);
    dev_c1 = gpu.Allocate<int>(N);

    // allocate host locked (pinned) memory, required for async streaming
    IntPtr host_aPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);
    IntPtr host_bPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);
    IntPtr host_cPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);

    Random rand = new Random();
    for (int i = 0; i < FULL_DATA_SIZE; i++)
    {
        host_aPtr.Set(i, rand.Next(1024 * 1024)); // There will be differences between the .NET code and the GPU
        host_bPtr.Set(i, rand.Next(1024 * 1024)); // So let's keep these to a minimum by having a max random values.
    }

    // start timer
    gpu.StartTimer();
    // now loop over full data, in bite-sized chunks: two chunks per iteration,
    // one on each stream, so copy and compute can overlap
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2)
    {
        gpu.CopyToDeviceAsync(host_aPtr, i, dev_a0, 0, N, 1);
        gpu.CopyToDeviceAsync(host_bPtr, i, dev_b0, 0, N, 2);
        gpu.CopyToDeviceAsync(host_aPtr, i + N, dev_a1, 0, N, 1);
        gpu.CopyToDeviceAsync(host_bPtr, i + N, dev_b1, 0, N, 2);
        gpu.LaunchAsync(N / 256, 256, 1, "thekernel", dev_a0, dev_b0, dev_c0);
        gpu.LaunchAsync(N / 256, 256, 2, "thekernel", dev_a1, dev_b1, dev_c1);
        gpu.CopyFromDeviceAsync(dev_c0, 0, host_cPtr, i, N, 1);
        gpu.CopyFromDeviceAsync(dev_c1, 0, host_cPtr, i + N, N, 2);
    }
    gpu.SynchronizeStream(1);
    gpu.SynchronizeStream(2);
    float elapsed = gpu.StopTimer();

    // verify
    int[] host_a = new int[FULL_DATA_SIZE];
    int[] host_b = new int[FULL_DATA_SIZE];
    int[] host_c = new int[FULL_DATA_SIZE];
    GPGPU.CopyOnHost(host_aPtr, 0, host_a, 0, FULL_DATA_SIZE);
    GPGPU.CopyOnHost(host_bPtr, 0, host_b, 0, FULL_DATA_SIZE);
    GPGPU.CopyOnHost(host_cPtr, 0, host_c, 0, FULL_DATA_SIZE);
    Console.WriteLine("Elapsed: {0} ms", elapsed);

    int[] host_d = new int[FULL_DATA_SIZE];
    int errors = 0;
    int id = 0;
    // FIX: the original only checked the first N elements (a stray bare { } block
    // suggests a lost outer loop). Verify every chunk of N across FULL_DATA_SIZE,
    // resetting j per chunk to match how the kernel processed the data.
    for (int chunk = 0; chunk < FULL_DATA_SIZE && errors <= 8; chunk += N)
    {
        for (int j = 0; j < N; j++, id++)
        {
            control(id, j, host_a, host_b, host_d);
            if (host_c[id] > host_d[id] + 1) // There will be differences between the .NET code and the GPU
            {
                Console.WriteLine("Mismatch at {0}: {1} != {2}", id, host_c[id], host_d[id]);
                errors++;
                if (errors > 8)
                {
                    break;
                }
            }
        }
    }

    // FIX: free the device buffers too — the original leaked dev_a0..dev_c1.
    gpu.FreeAll();
    gpu.HostFree(host_aPtr);
    gpu.HostFree(host_bPtr);
    gpu.HostFree(host_cPtr);
    gpu.DestroyStream(1);
    gpu.DestroyStream(2);
}
/// <summary>
/// Benchmarks a dot-product computed several ways: synchronous GPU copy/launch/copy,
/// asynchronous GPU with smart copy and pinned staging buffers (one stream per batch),
/// plain CPU loop, CPU Linq, and CPU multi-threaded via async delegates. Prints the
/// elapsed time and the computed value for each variant against the analytic answer
/// 2 * sum_squares(N - 1).
/// </summary>
public static void Execute()
{
    CudafyModule km = CudafyModule.TryDeserialize();
    if (km == null || !km.TryVerifyChecksums())
    {
        km = CudafyTranslator.Cudafy();
        km.Serialize();
    }
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target);
    gpu.LoadModule(km);

    float c = 0;
    int loops = 20;
    int batches = 12;

    // allocate memory on the cpu side
    float[] a = new float[N];
    float[] b = new float[N];
    float[] partial_c = new float[blocksPerGrid];

    // allocate the memory on the GPU
    float[] dev_a = gpu.Allocate<float>(N);
    float[] dev_b = gpu.Allocate<float>(N);
    float[] dev_partial_c = gpu.Allocate<float>(blocksPerGrid);
    // FIX: removed dev_test (blocksPerGrid * blocksPerGrid floats) — allocated but never used.

    // fill in the host memory with data
    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i * 2;
    }

    // Synchronous Implementation
    Stopwatch sw = Stopwatch.StartNew();
    for (int l = 0; l < loops; l++)
    {
        for (int bat = 0; bat < batches; bat++)
        {
            // copy the arrays 'a' and 'b' to the GPU
            gpu.CopyToDevice(a, dev_a);
            gpu.CopyToDevice(b, dev_b);
            gpu.Launch(blocksPerGrid, threadsPerBlock, "Dot", dev_a, dev_b, dev_partial_c);
            // copy the array 'c' back from the GPU to the CPU
            gpu.CopyFromDevice(dev_partial_c, partial_c);
            // finish up on the CPU side: reduce the per-block partial sums
            c = 0;
            for (int i = 0; i < blocksPerGrid; i++)
            {
                c += partial_c[i];
            }
        }
    }
    long syncTime = sw.ElapsedMilliseconds;
    Console.WriteLine("Synchronous Time: {0}", syncTime);
    Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

    // Asynchronous Pinned Memory Implementation: one pinned staging buffer
    // per batch/stream for each of a, b and the partial result.
    IntPtr[] host_stages_a = new IntPtr[batches];
    IntPtr[] host_stages_b = new IntPtr[batches];
    IntPtr[] host_stages_c = new IntPtr[batches];
    for (int bat = 0; bat < batches; bat++)
    {
        host_stages_a[bat] = gpu.HostAllocate<float>(N);
        host_stages_b[bat] = gpu.HostAllocate<float>(N);
        host_stages_c[bat] = gpu.HostAllocate<float>(blocksPerGrid);
    }
    // Set GPU memory to zero
    gpu.Set(dev_a);
    gpu.Set(dev_b);
    gpu.Set(dev_partial_c);
    gpu.EnableSmartCopy();
    sw.Restart();
    for (int l = 0; l < loops; l++)
    {
        // Queue all the copying operations of the batch
        for (int bat = 0; bat < batches; bat++)
        {
            // Finish processing the previous loop on CPU
            if (l > 0)
            {
                gpu.SynchronizeStream(bat + 1);
                c = 0;
                for (int i = 0; i < blocksPerGrid; i++)
                {
                    c += partial_c[i];
                }
            }
            gpu.CopyToDeviceAsync(a, 0, dev_a, 0, N, bat + 1, host_stages_a[bat]);
        }
        // All copies to the GPU are put into a queue, so the different stream id's are abstract only
        // We are guaranteed that all previous copies with same stream id will be completed first.
        for (int bat = 0; bat < batches; bat++)
        {
            gpu.CopyToDeviceAsync(b, 0, dev_b, 0, N, bat + 1, host_stages_b[bat]);
        }
        // Launch the kernels. These have same stream id as the copies and will take place as soon as
        // the copy to the GPU with same stream id is complete. Hence kernels may be running in parallel
        // with copies of higher stream id that are still running.
        for (int bat = 0; bat < batches; bat++)
        {
            gpu.LaunchAsync(blocksPerGrid, threadsPerBlock, bat + 1, "Dot", dev_a, dev_b, dev_partial_c);
        }
        // Here we add to the copying from GPU queue. Copying will begin once the kernel with same stream
        // id is completed. If the GPU supports concurrent copying to and from at same time then the first
        // copy from operations may be completed before all the copy to operations have.
        for (int bat = 0; bat < batches; bat++)
        {
            gpu.CopyFromDeviceAsync(dev_partial_c, 0, partial_c, 0, blocksPerGrid, bat + 1, host_stages_c[bat]);
        }
    }
    // Finish processing the last loop on CPU
    for (int bat = 0; bat < batches; bat++)
    {
        gpu.SynchronizeStream(bat + 1);
        c = 0;
        for (int i = 0; i < blocksPerGrid; i++)
        {
            c += partial_c[i];
        }
    }
    long asyncTime = sw.ElapsedMilliseconds;
    Console.WriteLine("Asynchronous Time: {0}", asyncTime);
    Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
    gpu.DisableSmartCopy();

    // free memory on the gpu side
    gpu.FreeAll();
    // free memory on the cpu side
    gpu.HostFreeAll();

    // let's try and do this on the CPU in an straight forward fashion
    sw.Restart();
    c = 0;
    for (int l = 0; l < loops; l++)
    {
        for (int bat = 0; bat < batches; bat++)
        {
            c = DotProduct(a, b);
        }
    }
    long cpuTime = sw.ElapsedMilliseconds;
    Console.WriteLine("CPU Time: {0}", cpuTime);
    Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

    // let's try and do this on the CPU with Linq
    sw.Restart();
    c = 0;
    for (int l = 0; l < loops; l++)
    {
        for (int bat = 0; bat < batches; bat++)
        {
            c = DotProductLinq(a, b);
        }
    }
    long cpuLinqTime = sw.ElapsedMilliseconds;
    Console.WriteLine("CPU Linq Time: {0}", cpuLinqTime);
    Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

    // let's try and do this on the CPU with multiple threads (async delegate invocation)
    DotProductDelegate dlgt = new DotProductDelegate(DotProduct);
    IAsyncResult[] res = new IAsyncResult[batches];
    for (int bat = 0; bat < batches; bat++)
    {
        res[bat] = null;
    }
    sw.Restart();
    c = 0;
    for (int l = 0; l < loops; l++)
    {
        for (int bat = 0; bat < batches; bat++)
        {
            // harvest the previous result for this slot before issuing the next call
            if (res[bat] != null)
            {
                c = dlgt.EndInvoke(res[bat]);
            }
            res[bat] = dlgt.BeginInvoke(a, b, null, null);
        }
    }
    // drain any still-outstanding invocations
    for (int bat = 0; bat < batches; bat++)
    {
        if (res[bat] != null)
        {
            c = dlgt.EndInvoke(res[bat]);
        }
    }
    long cpuMultiTime = sw.ElapsedMilliseconds;
    Console.WriteLine("CPU Multi Time: {0}", cpuMultiTime);
    Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
}