/// <summary>
/// Launches the <c>add</c> kernel over two N-element integer arrays and checks on the
/// host that every result element equals <c>a[i] + b[i]</c>.
/// </summary>
public static void Execute()
{
    // Translate this class to CUDA C, compile it, and load it onto the selected device.
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    // Host-side operands and result buffer.
    int[] a = new int[N];
    int[] b = new int[N];
    int[] c = new int[N];

    // Device buffer for the result, sized from 'c'.
    int[] dev_c = gpu.Allocate<int>(c);

    // Fill the operands on the CPU: a[i] = i, b[i] = 2i.
    for (int idx = 0; idx < N; idx++)
    {
        a[idx] = idx;
        b[idx] = 2 * idx;
    }

    // Upload the operands.
    int[] dev_a = gpu.CopyToDevice(a);
    int[] dev_b = gpu.CopyToDevice(b);

    // 128 blocks of 128 threads each.
    gpu.Launch(128, 128).add(dev_a, dev_b, dev_c);

    // Download the result.
    gpu.CopyFromDevice(dev_c, c);

    // Verify on the host; stop at (and report) the first mismatch.
    bool allMatch = true;
    int check = 0;
    while (allMatch && check < N)
    {
        if ((a[check] + b[check]) == c[check])
        {
            check++;
        }
        else
        {
            Console.WriteLine("{0} + {1} != {2}", a[check], b[check], c[check]);
            allMatch = false;
        }
    }

    if (allMatch)
    {
        Console.WriteLine("We did it!");
    }

    // Release everything allocated on the device.
    gpu.FreeAll();
}
/// <summary>
/// Generates SPHERES random spheres, runs <c>thekernel</c> over them on the GPU, copies the
/// produced image into <paramref name="bitmap"/> and prints the time measured by the GPU timer.
/// </summary>
/// <param name="bitmap">Host buffer that receives the image; also used to size the device buffer.</param>
public static void Execute(byte[] bitmap)
{
    // Reuse a previously serialized module when one exists and its checksums verify;
    // otherwise translate again and try to cache the result.
    CudafyModule module = CudafyModule.TryDeserialize();
    bool cacheUsable = module != null && module.TryVerifyChecksums();
    if (!cacheUsable)
    {
        module = CudafyTranslator.Cudafy(typeof(Sphere), typeof(ray_noconst));
        module.TrySerialize();
    }

    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    // Timing covers allocation, upload, kernel and download.
    gpu.StartTimer();

    // Device buffers: the output bitmap (same size as the host one) and the sphere set.
    byte[] dev_bitmap = gpu.Allocate(bitmap);
    Sphere[] s = gpu.Allocate<Sphere>(SPHERES);

    // Build the spheres on the host. The order of the rnd() calls is kept as-is.
    Sphere[] hostSpheres = new Sphere[SPHERES];
    for (int n = 0; n < SPHERES; n++)
    {
        hostSpheres[n].r = rnd(1.0f);
        hostSpheres[n].g = rnd(1.0f);
        hostSpheres[n].b = rnd(1.0f);
        hostSpheres[n].x = rnd(1000.0f) - 500;
        hostSpheres[n].y = rnd(1000.0f) - 500;
        hostSpheres[n].z = rnd(1000.0f) - 500;
        hostSpheres[n].radius = rnd(100.0f) + 20;
    }

    // Upload the spheres into the device buffer allocated above.
    gpu.CopyToDevice(hostSpheres, s);

    // One 16x16 thread block per 16x16 pixel tile.
    dim3 threads = new dim3(16, 16);
    dim3 grids = new dim3(ray_gui.DIM / 16, ray_gui.DIM / 16);

    // Strongly typed launch (the dynamic form would be gpu.Launch(grids, threads).kernel(s, dev_bitmap)).
    Action<GThread, Sphere[], byte[]> kernel = thekernel;
    gpu.Launch(grids, threads, kernel, s, dev_bitmap);

    // Bring the image back for display.
    gpu.CopyFromDevice(dev_bitmap, bitmap);

    // Stop the timer and report.
    float elapsedTime = gpu.StopTimer();
    Console.WriteLine("Time to generate: {0} ms", elapsedTime);

    gpu.FreeAll();
}
/// <summary>
/// Prints the device name and driver mode, then runs the cudaMalloc and cudaHostAlloc copy
/// tests in both directions and prints elapsed time and throughput for each.
/// </summary>
public void Execute()
{
    // Data volume in MB for 100 * SIZE ints.
    // NOTE(review): the factor 100 presumably matches the number of copies each test performs - confirm.
    float megabytes = (float)100 * SIZE * sizeof(int) / 1024 / 1024;

    _gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

    var deviceProps = _gpu.GetDeviceProperties();
    Console.WriteLine(deviceProps.Name);
    Console.WriteLine("Using {0}optimized driver.", deviceProps.HighPerformanceDriver ? "" : "non-");

    // cudaMalloc: second argument true is reported as "copy up", false as "copy down".
    float mallocUpMs = cuda_malloc_test(SIZE, true);
    Console.WriteLine("Time using cudaMalloc: {0} ms", mallocUpMs);
    Console.WriteLine("\tMB/s during copy up: {0}", megabytes / (mallocUpMs / 1000));

    float mallocDownMs = cuda_malloc_test(SIZE, false);
    Console.WriteLine("Time using cudaMalloc: {0} ms", mallocDownMs);
    Console.WriteLine("\tMB/s during copy down: {0}", megabytes / (mallocDownMs / 1000));

    // cudaHostAlloc: same pair of measurements.
    float hostAllocUpMs = cuda_host_alloc_test(SIZE, true);
    Console.WriteLine("Time using cudaHostAlloc: {0} ms", hostAllocUpMs);
    Console.WriteLine("\tMB/s during copy up: {0}", megabytes / (hostAllocUpMs / 1000));

    float hostAllocDownMs = cuda_host_alloc_test(SIZE, false);
    Console.WriteLine("Time using cudaHostAlloc: {0} ms", hostAllocDownMs);
    Console.WriteLine("\tMB/s during copy down: {0}", megabytes / (hostAllocDownMs / 1000));

    #region 15-06-2011 Not working on laptop, works fine on workstation
    //// now try it with cudaHostAlloc copy
    //elapsedTime = cuda_host_alloc_copy_test(SIZE, true);
    //Console.WriteLine("Time using cudaHostAlloc + async copy: {0} ms",
    //    elapsedTime);
    //Console.WriteLine("\tMB/s during copy up: {0}",
    //    MB / (elapsedTime / 1000));
    //elapsedTime = cuda_host_alloc_copy_test(SIZE, false);
    //Console.WriteLine("Time using cudaHostAlloc + async copy: {0} ms",
    //    elapsedTime);
    //Console.WriteLine("\tMB/s during copy down: {0}",
    //    MB / (elapsedTime / 1000));
    #endregion
}
/// <summary>
/// Gets a CUDA device, translates for sm_35, loads the module and issues an argument-less launch.
/// </summary>
/// <param name="inputArray">Not read by the current implementation.</param>
/// <returns>A newly allocated one-element array.</returns>
/// <remarks>
/// NOTE(review): the input is ignored and the result is a placeholder, so this looks like an
/// unfinished stub - confirm before relying on it.
/// </remarks>
public double[] transpose(double[] inputArray)
{
    GPGPU device = CudafyHost.GetDevice(eGPUType.Cuda);

    CudafyModule module = CudafyTranslator.Cudafy(eArchitecture.sm_35);
    device.LoadModule(module);

    // Constructed but not passed to the launch below.
    dim3 gridDim = new dim3(1000);

    device.Launch();

    return new double[1];
}
/// <summary>
/// Translates this class, loads it onto a CUDA device, launches <c>thekernel</c> and prints a
/// confirmation line.
/// </summary>
public static void RunTest()
{
    CudafyModule module = CudafyTranslator.Cudafy();

    GPGPU device = CudafyHost.GetDevice(eGPUType.Cuda);
    device.LoadModule(module);

    // Dynamic launch with no explicit grid/block dimensions.
    device.Launch().thekernel();

    Console.WriteLine("Sample kernel started successfully!");
}
/// <summary>
/// Test fixture set-up: translates this test class for x64 with dynamic parallelism enabled,
/// prints the compiler output, serializes the module and loads it onto the device.
/// </summary>
public void SetUp()
{
    //var x = CompilerHelper.Create(ePlatform.x64, eArchitecture.OpenCL, eCudafyCompileMode.Default);
    var compileProps = CompilerHelper.Create(
        ePlatform.x64,
        CudafyModes.Architecture,
        eCudafyCompileMode.DynamicParallelism);

    _cm = CudafyTranslator.Cudafy(new CompileProperties[] { compileProps }, this.GetType());
    Console.WriteLine(_cm.CompilerOutput);
    _cm.Serialize();

    // The device is selected by the same architecture the module was compiled for.
    _gpu = CudafyHost.GetDevice(compileProps.Architecture, CudafyModes.DeviceId);
    _gpu.LoadModule(_cm);
}
/// <summary>
/// Asserts that requesting an emulator device yields an <c>EmulatedGPU</c>.
/// Skipped (with a console message) unless the configured target is the emulator.
/// </summary>
public void Test_CreateEmulatedGPU()
{
    bool targetIsEmulator = CudafyModes.Target == eGPUType.Emulator;
    if (!targetIsEmulator)
    {
        Console.WriteLine("Only tests Emulator devices, so skip.");
        return;
    }

    Assert.IsTrue(CudafyHost.GetDevice(eGPUType.Emulator) is EmulatedGPU);
}
/// <summary>
/// Test fixture set-up: enables debug generation, obtains the device, reuses a serialized
/// module when it verifies (otherwise translates and caches a new one) and loads it.
/// </summary>
public void SetUp()
{
    CudafyTranslator.GenerateDebug = true;

    _cm = CudafyModule.TryDeserialize();
    _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId);

    bool cacheUsable = _cm != null && _cm.TryVerifyChecksums();
    if (!cacheUsable)
    {
        // StringConstClass is only included when the device is not an OpenCL device.
        Type constClass = (_gpu is OpenCLDevice) ? null : typeof(StringConstClass);

        _cm = CudafyTranslator.Cudafy(_gpu.GetArchitecture(), this.GetType(), constClass);
        _cm.TrySerialize();
    }

    _gpu.LoadModule(_cm);
}
/// <summary>
/// Translates and loads this class's module, allocates the device bitmap and sets up the
/// launch dimensions (16x16 threads per block, DIM/16 blocks per side).
/// </summary>
/// <param name="bytes">Number of bytes to allocate for the device bitmap.</param>
public void Initialize(int bytes)
{
    CudafyModule module = CudafyTranslator.Cudafy();

    _gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    _gpu.LoadModule(module);

    _dev_bitmap = _gpu.Allocate<byte>(bytes);

    var blocksPerSide = DIM / 16;
    _blocks = new dim3(blocksPerSide, blocksPerSide);
    _threads = new dim3(16, 16);
}
/// <summary>
/// Switches the global CUDAfy settings to OpenCL, translates this class using the given
/// working directory (temporary files are kept), loads the module onto the requested
/// OpenCL device and marks this instance as initialized.
/// </summary>
/// <param name="DeviceId">Index of the OpenCL device to use.</param>
/// <param name="Directory">Working directory handed to the translator.</param>
public void Initialize(int DeviceId, String Directory)
{
    // Global target/language selection.
    CudafyModes.Target = eGPUType.OpenCL;
    CudafyTranslator.Language = eLanguage.OpenCL;

    // Translator output location; keep intermediate files.
    CudafyTranslator.WorkingDirectory = Directory;
    CudafyTranslator.DeleteTempFiles = false;

    CudafyModule translated = CudafyTranslator.Cudafy();

    Gpu = CudafyHost.GetDevice(eGPUType.OpenCL, DeviceId);
    Gpu.LoadModule(translated);

    Initialized = true;
}
/// <summary>
/// Computes a dot product of two N-element vectors: the <c>Dot</c> kernel writes one partial
/// sum per block, the host adds the partial sums and prints the total next to the value of
/// <c>2 * sum_squares(N - 1)</c> for comparison.
/// </summary>
/// <remarks>
/// The former <c>dev_test</c> device buffer (blocksPerGrid * blocksPerGrid floats) was never
/// passed to the kernel or read back, so it is no longer allocated.
/// </remarks>
public static void Execute()
{
    CudafyModule km = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);

    // Host buffers: inputs and one partial result per block.
    float[] a = new float[N];
    float[] b = new float[N];
    float[] partial_c = new float[blocksPerGrid];

    // Matching device buffers.
    float[] dev_a = gpu.Allocate<float>(N);
    float[] dev_b = gpu.Allocate<float>(N);
    float[] dev_partial_c = gpu.Allocate<float>(blocksPerGrid);

    // Fill the inputs: a[i] = i, b[i] = 2i.
    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i * 2;
    }

    // Upload the inputs and run the kernel.
    gpu.CopyToDevice(a, dev_a);
    gpu.CopyToDevice(b, dev_b);
    gpu.Launch(blocksPerGrid, threadsPerBlock).Dot(dev_a, dev_b, dev_partial_c);

    // Download the per-block partial sums.
    gpu.CopyFromDevice(dev_partial_c, partial_c);

    // Finish the reduction on the CPU.
    float c = 0;
    for (int i = 0; i < blocksPerGrid; i++)
    {
        c += partial_c[i];
    }

    Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

    // Release device memory; host arrays are left to the garbage collector.
    gpu.FreeAll();
}
/// <summary>
/// Test-runner entry point: configures the global CUDAfy modes from CUDA device 0, runs each
/// test suite in turn, prints "Done" and waits for Enter. Any exception is printed and the
/// program likewise waits for Enter.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Architecture is read from CUDA device 0; it could instead be hard-coded
        // (e.g. eArchitecture.sm_35) to match the target board.
        CudafyModes.DeviceId = 0;
        CudafyModes.Architecture = CudafyHost.GetDevice(eGPUType.Cuda, CudafyModes.DeviceId).GetArchitecture();
        CudafyModes.Target = CompilerHelper.GetGPUType(CudafyModes.Architecture);

        // The CURAND tests are skipped when the target is OpenCL.
        bool targetIsOpenCL = CudafyModes.Target == eGPUType.OpenCL;
        if (!targetIsOpenCL)
        {
            CURANDTests.Basics();
        }

        // Each suite is constructed immediately before it runs.
        CudafyUnitTest.PerformAllTests(new StringTests());
        CudafyUnitTest.PerformAllTests(new BasicFunctionTests());
        CudafyUnitTest.PerformAllTests(new GMathUnitTests());
        CudafyUnitTest.PerformAllTests(new MultithreadedTests());
        CudafyUnitTest.PerformAllTests(new CopyTests1D());
        CudafyUnitTest.PerformAllTests(new GPGPUTests());

        // Multi-GPU tests need at least two devices of the target type.
        if (CudafyHost.GetDeviceCount(CudafyModes.Target) > 1)
        {
            CudafyUnitTest.PerformAllTests(new MultiGPUTests());
        }

        // Compute 3.5 feature tests only run on sm_35.
        if (CudafyModes.Architecture == eArchitecture.sm_35)
        {
            CudafyUnitTest.PerformAllTests(new Compute35Features());
        }

        Console.WriteLine("Done");
        Console.ReadLine();
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.ToString());
        Console.ReadLine();
    }
}
/// <summary>
/// Runs <c>Generations</c> steps of the <c>Simulation</c> kernel over <c>Matrix</c> on a CUDA
/// device, saving the matrix and preparing its visualisation before the first step and after
/// every step.
/// </summary>
/// <remarks>
/// The device is obtained with <c>CudafyModes.DeviceId</c> so that <c>ChosenDeviceId</c> is
/// actually honoured; previously the id was stored but not passed to <c>GetDevice</c>.
/// </remarks>
public static void Execute()
{
    CudafyModes.Target = eGPUType.Cuda;
    CudafyModes.DeviceId = ChosenDeviceId; // If not set, the value is 0 - so default, good one
    CudafyTranslator.Language = CudafyModes.Target == eGPUType.OpenCL ? eLanguage.OpenCL : eLanguage.Cuda;

    // Pass the device id explicitly; omitting it falls back to the parameter's default.
    var gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    var arch = gpu.GetArchitecture();
    var km = CudafyTranslator.Cudafy(arch);
    gpu.LoadModule(km);

    MaximumDimensionSize = GetMaxThreadsPerBlock();

    // Save vanilla state of matrix (generation 0).
    DataHandler.SaveMatrix(Matrix, string.Format(AppConfigHelper.GetValueFromAppSettings(@"CellMatrixOutputLocation"), 0));
    DataHandler.PrepareVisualisation(string.Format(AppConfigHelper.GetValueFromAppSettings(@"CellMatrixOutputLocation"), 0));

    var a = GetGridSize();
    var b = GetBlockSize();
    Console.WriteLine("Grid size - {0},{1},{2} - Block size {3},{4},{5}", a.x, a.y, a.z, b.x, b.y, b.z);

    for (var i = 0; i < Generations; i++)
    {
        // Rule parameters handed to the kernel alongside the matrix.
        var rulesArray = new[] { LonelinessDeathNumber, OvercrowingDeathNumber, RevivalNumber, MaximumDimensionSize };
        var rules = gpu.CopyToDevice(rulesArray);

        // Copy the current matrix to the GPU.
        var deviceMatrix = gpu.Allocate<bool>(Matrix);
        gpu.CopyToDevice(Matrix, deviceMatrix);

        gpu.Launch(GetGridSize(), GetBlockSize(), @"Simulation", deviceMatrix, rules);

        // Copy the matrix back from the GPU into the host matrix.
        gpu.CopyFromDevice(deviceMatrix, Matrix);

        // Free this generation's device allocations.
        gpu.Free(deviceMatrix);
        gpu.Free(rules);

        // Save this generation on disk (generations are numbered from 1).
        DataHandler.SaveMatrix(Matrix, string.Format(AppConfigHelper.GetValueFromAppSettings(@"CellMatrixOutputLocation"), i + 1));
        DataHandler.PrepareVisualisation(string.Format(AppConfigHelper.GetValueFromAppSettings(@"CellMatrixOutputLocation"), i + 1));
    }
}
/// <summary>
/// Temporarily enables <c>CudafyTranslator.AllowClasses</c>, translates the example classes,
/// loads them onto device 0 of the configured target and runs <c>Example1</c> and <c>Example2</c>.
/// </summary>
/// <remarks>
/// <c>AllowClasses</c> is a static translator setting, so its previous value is restored in a
/// <c>finally</c> block; this way an exception during translation or in an example does not
/// leave the flag switched on.
/// </remarks>
public static void Execute()
{
    bool previousValue = CudafyTranslator.AllowClasses;
    CudafyTranslator.AllowClasses = true;
    try
    {
        CudafyModule km = CudafyTranslator.Cudafy(new Type[]
        {
            typeof(BaseClass),
            typeof(MemberClass),
            typeof(DerivedClass),
            typeof(ArrayView),
            typeof(CudafyClassExamples)
        });

        GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
        gpu.LoadModule(km);

        Example1(gpu);
        Example2(gpu);
    }
    finally
    {
        // Restore the static flag whether or not the examples succeeded.
        CudafyTranslator.AllowClasses = previousValue;
    }
}
/// <summary>
/// Loads the module for this class onto the configured device. The module is deserialized
/// from a file named after the class; if that fails for any reason it is translated for sm_50.
/// </summary>
/// <remarks>
/// NOTE(review): the catch-all hides the reason deserialization failed, and the freshly
/// translated module is not serialized here, so the fallback runs again on the next
/// construction unless the file is produced elsewhere - confirm this is intended.
/// </remarks>
private LevenshteinGPU()
{
    CudafyModule module;
    try
    {
        module = CudafyModule.Deserialize(typeof(LevenshteinGPU).Name);
    }
    catch
    {
        // Any failure to load the serialized module falls back to translating.
        module = CudafyTranslator.Cudafy(eArchitecture.sm_50);
    }

    _gpu = CudafyHost.GetDevice(CudafyModes.Target);
    _gpu.LoadModule(module);
}
/// <summary>
/// Test fixture set-up: reuses a serialized module when it verifies; otherwise translates
/// for the configured architecture, prints the compiler output and caches the module.
/// The module is then loaded onto the configured device.
/// </summary>
public void SetUp()
{
    _cm = CudafyModule.TryDeserialize();

    bool cacheUsable = _cm != null && _cm.TryVerifyChecksums();
    if (!cacheUsable)
    {
        _cm = CudafyTranslator.Cudafy(CudafyModes.Architecture); //typeof(PrimitiveStruct), typeof(BasicFunctionTests));
        Console.WriteLine(_cm.CompilerOutput);
        _cm.TrySerialize();
    }

    _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId);
    _gpu.LoadModule(_cm);
    //_gpu.CopyToConstantMemory(new int[constant_data.Length], constant_data);
}
/// <summary>
/// Runs <c>GpuFindPathDistance</c> on the GPU over all permutations and returns the shortest
/// distance found across the per-block results, with load and run times in milliseconds.
/// </summary>
/// <returns>
/// An <c>Answer</c> holding the smallest distance, its permutation number, the time spent
/// obtaining the device and loading the module, and the time spent on the run itself.
/// </returns>
internal override Answer GetAnswer()
{
    var loadTimer = Stopwatch.StartNew();

    using (var gpu = CudafyHost.GetDevice())
    {
        // Translate for the architecture the device reports, then load.
        var arch = gpu.GetDeviceProperties().Capability.GetArchitecture();
        gpu.LoadModule(CudafyTranslator.Cudafy(ePlatform.x64, arch));
        LoadTime = loadTimer.ElapsedMilliseconds;

        var runTimer = Stopwatch.StartNew();

        var gpuLatLong = gpu.CopyToDevice(_latLong.ToArray());

        // divisors[k] = _permutations / (_cities * (_cities - 1) * ... * (k + 1)),
        // built by dividing down from the last city to the first.
        var divisors = new long[_cities];
        long runningDivisor = _permutations;
        for (int city = _cities; city > 0; city--)
        {
            runningDivisor /= city;
            divisors[city - 1] = runningDivisor;
        }
        gpu.CopyToConstantMemory(divisors, gpuDivisors);

        // One result slot per block.
        var answer = new AnswerStruct[_blocksPerGrid];
        var gpuAnswer = gpu.Allocate(answer);

        gpu.SafeLaunch(_blocksPerGrid, _threadsPerBlock, GpuFindPathDistance, _permutations, gpuLatLong, gpuAnswer);
        gpu.Synchronize();

        gpu.CopyFromDevice(gpuAnswer, answer);
        gpu.FreeAll();

        // Pick the best block result; the first one wins on ties.
        var bestDistance = float.MaxValue;
        var bestPermutation = 0L;
        foreach (var blockResult in answer)
        {
            if (blockResult.distance < bestDistance)
            {
                bestDistance = blockResult.distance;
                bestPermutation = blockResult.pathNo;
            }
        }

        return new Answer
        {
            Distance = bestDistance,
            Permutation = bestPermutation,
            msLoadTime = LoadTime,
            msRunTime = runTimer.ElapsedMilliseconds
        };
    }
}
/// <summary>
/// Feeds 4 warps' worth (128) of random 0/1 values to <c>BallotKernel</c> and compares each
/// warp's output with a host-computed value in which bit <c>k</c> is the input of lane <c>k</c>.
/// Prints PASSED or FAILED per warp.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy(Program.testArchitecture);
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    gpu.LoadModule(module);

    const int warps = 4;
    const int count = warps * 32;

    var random = new Random();
    var input = new int[count];
    var output = new int[warps];

    // Freshly allocated arrays are already zeroed, so the expected values start at 0.
    var expectedOutput = new int[warps];

    // Draw each input bit and fold it into the expected value of its warp.
    for (var lane = 0; lane < count; lane++)
    {
        input[lane] = random.Next(2);
    }
    for (var lane = 0; lane < count; lane++)
    {
        expectedOutput[lane / 32] += input[lane] << (lane % 32);
    }

    var devInput = gpu.Allocate<int>(count);
    var devOutput = gpu.Allocate<int>(warps);
    gpu.CopyToDevice(input, devInput);

    // A single block holding all 'count' threads.
    gpu.Launch(1, count, "BallotKernel", devInput, devOutput);

    // Download the per-warp results and release the device buffers.
    gpu.CopyFromDevice(devOutput, output);
    gpu.Free(devInput);
    gpu.Free(devOutput);

    for (var warp = 0; warp < warps; warp++)
    {
        string verdict = expectedOutput[warp] == output[warp] ? "PASSED" : "FAILED";
        Console.WriteLine("Warp {0} Ballot: {1}", warp, output[warp]);
        Console.WriteLine("Expected: {0} \t{1}", expectedOutput[warp], verdict);
    }
}
/// <summary>
/// Asserts that CUDA device 0 is a <c>CudaGPU</c>. Skipped (with a console message) unless
/// the configured target is CUDA; does nothing when no CUDA device is present.
/// </summary>
public void Test_CreateCudaGPU()
{
    bool targetIsCuda = CudafyModes.Target == eGPUType.Cuda;
    if (!targetIsCuda)
    {
        Console.WriteLine("Only tests CUDA devices, so skip.");
        return;
    }

    int deviceCount = CudafyHost.GetDeviceCount(eGPUType.Cuda);
    if (deviceCount <= 0)
    {
        return;
    }

    Assert.IsTrue(CudafyHost.GetDevice(eGPUType.Cuda, 0) is CudaGPU);
}
/// <summary>
/// Runs <c>thekernel</c> on a DIM x DIM grid of single-thread blocks and copies the resulting
/// device bitmap into <paramref name="ptr"/>.
/// </summary>
/// <param name="ptr">Host buffer that receives the bitmap; its length sizes the device buffer.</param>
public static void Execute(byte[] ptr)
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    byte[] dev_bitmap = gpu.Allocate<byte>(ptr.Length);

    // One block per (x, y) position, one thread per block.
    dim3 grid = new dim3(DIM, DIM);
    gpu.Launch(grid, 1).thekernel(dev_bitmap);

    gpu.CopyFromDevice(dev_bitmap, ptr);
    gpu.FreeAll();
}
/// <summary>
/// Asserts that OpenCL device 0 is an <c>OpenCLDevice</c>. Skipped (with a console message)
/// unless the configured target is OpenCL; does nothing when no OpenCL device is present.
/// </summary>
public void Test_CreateOpenCLDevice()
{
    bool targetIsOpenCL = CudafyModes.Target == eGPUType.OpenCL;
    if (!targetIsOpenCL)
    {
        Console.WriteLine("Only tests OpenCL devices, so skip.");
        return;
    }

    int deviceCount = CudafyHost.GetDeviceCount(eGPUType.OpenCL);
    if (deviceCount <= 0)
    {
        return;
    }

    Assert.IsTrue(CudafyHost.GetDevice(eGPUType.OpenCL, 0) is OpenCLDevice);
}
/// <summary>
/// Reduces a matrix to "canonical" form by the Gauss-Jordan method, i.e. to the matrix
/// obtained through equivalent row transformations for which the following holds: if i is
/// the index of the first non-zero value in a row, then every other row of the matrix
/// contains only zero at index i.
/// Obviously, if a row has no index of a first non-zero value (-1), the whole row is zero.
/// Reduction to canonical form is used when solving systems of linear equations and when
/// finding the fundamental system of solutions of a system of linear equations.
/// This implementation works on a matrix over the field GF(2), i.e. a boolean matrix.
/// </summary>
/// <remarks>
/// Operates on the static matrix <c>_a</c> and writes the result back into it.
/// </remarks>
public static void ExecuteGaussJordan()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice();
    gpu.LoadModule(module);

    // Device buffers sized from the corresponding host arrays; only _a is uploaded.
    int[,] devA = gpu.Allocate(_a);
    int[,] devB = gpu.Allocate(_b);
    int[] devC = gpu.Allocate(_c);
    int[] devD = gpu.Allocate(_d);
    int[] devE = gpu.Allocate(E);
    gpu.CopyToDevice(_a, devA);

    int rows = _a.GetLength(0);
    int columns = _a.GetLength(1);
    int pivotLimit = Math.Min(rows, columns);

    // Grid and block extents are both the truncated cube root of the element count, capped at 15.
    int launchExtent = Math.Min(15, (int)Math.Pow(rows * columns, 0.33333333333));
    dim3 gridSize = launchExtent;
    dim3 blockSize = launchExtent;

    gpu.Launch(gridSize, blockSize, "RepeatZero", devA, devB, devC, devD, devE);

    int i = 0;
    while (i < pivotLimit)
    {
        // Refresh _c from the device; presumably _c[r] is the index of the first
        // non-zero value in row r, or -1 for an all-zero row (see summary) - TODO confirm.
        gpu.Launch(gridSize, blockSize, "IndexOfNonZero", devA, devB, devC, devD, devE);
        gpu.CopyFromDevice(devC, _c);

        // Skip entries marked -1.
        while (i < pivotLimit && _c[i] == -1)
        {
            i++;
        }
        if (i >= pivotLimit)
        {
            break;
        }

        int j = _c[i];
        gpu.Launch(gridSize, blockSize, "BooleanGaussJordan", devA, devB, i, j);

        // Swap the two device buffers so devA is the input of the next step.
        int[,] swap = devA;
        devA = devB;
        devB = swap;

        i++;
    }

    gpu.CopyFromDevice(devA, _a);

    // Free the memory allocated on the GPU.
    gpu.FreeAll();
}
/// <summary>
/// Test fixture set-up: obtains the device, translates this test class together with
/// <c>MathSingleTest</c> (and <c>MathDoubleTest</c> when the device reports double-precision
/// support), writes the generated source to the debug output and loads the module.
/// </summary>
public virtual void SetUp()
{
    _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId);

    var typesToTranslate = new List<Type>
    {
        this.GetType(),
        typeof(MathSingleTest)
    };

    // Double-precision tests are only translated for devices that support them.
    SupportsDouble = _gpu.GetDeviceProperties().SupportsDoublePrecision;
    if (SupportsDouble)
    {
        typesToTranslate.Add(typeof(MathDoubleTest));
    }

    _cm = CudafyTranslator.Cudafy(CudafyModes.Architecture, typesToTranslate.ToArray());
    Debug.WriteLine(_cm.SourceCode);
    _gpu.LoadModule(_cm);
}
/// <summary>
/// Launches <c>GenerateRipples</c> on an 8x8 grid of 64x16-thread blocks and checks that
/// every element of the result equals its own index.
/// </summary>
/// <remarks>
/// The device buffer is released in a <c>finally</c> block so it is freed on both the
/// success path and the failing-check path; previously it was never freed.
/// </remarks>
/// <exception cref="Exception">Thrown when any result element differs from its index.</exception>
public void ExeTestKernel()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    eArchitecture arch = gpu.GetArchitecture();
    CudafyModule km = CudafyTranslator.Cudafy(arch);
    gpu.LoadModule(km);

    int[] host_results = new int[N];

    // Either assign a new block of memory to hold results on the device...
    var dev_results = gpu.Allocate<int>(N);
    try
    {
        gpu.Set<int>(dev_results);

        // ...or fill the host array with values first and then copy it to the device
        // (the copy is disabled here, so these values are overwritten by the download below).
        for (int i = 0; i < N; i++)
        {
            host_results[i] = i * 3;
        }
        //var dev_filled_results = gpu.CopyToDevice(host_results);

        // 64*16 = 1024 threads per block (which is max for sm_30).
        dim3 threadsPerBlock = new dim3(64, 16);

        // 8*8 = 64 blocks per grid, 1024 threads per block = kernel launched 65536 times.
        dim3 blocksPerGrid = new dim3(8, 8);

        //var threadsPerBlock = 1024; // this will only give you blockDim.x = 1024, .y = 0, .z = 0
        //var blocksPerGrid = 1; // just for show

        gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results);
        gpu.CopyFromDevice(dev_results, host_results);

        // Test our results: element k must equal k.
        for (int index = 0; index < N; index++)
        {
            if (host_results[index] != index)
            {
                throw new Exception("Check your indexing math, genius!!!");
            }
        }
    }
    finally
    {
        // Free only the buffer this method allocated.
        gpu.Free(dev_results);
    }
}
/// <summary>
/// Enables debug generation, translates <c>Lab</c>, loads it onto the device selected by the
/// current <c>CudafyModes.Target</c> / <c>CudafyModes.DeviceId</c> and prints the device name.
/// </summary>
/// <remarks>
/// This method does not set the target, device id or translator language itself, and it does
/// not check that a matching device exists; it uses whatever <c>CudafyModes</c> currently holds.
/// </remarks>
public static void InitGPU()
{
    CudafyTranslator.GenerateDebug = true;

    CudafyModule module = CudafyTranslator.Cudafy(typeof(Lab));

    _gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    _gpu.LoadModule(module);

    Console.WriteLine("Running examples using {0}", _gpu.GetDeviceProperties(false).Name);
}
/// <summary>
/// Runs the test four times: as configured, twice more after halving <c>ThreadsPerBlock</c>
/// each time, and once more after halving <c>BlocksPerGrid</c>. Then waits for a key and
/// frees device and host allocations on the default device.
/// </summary>
public static void Execute()
{
    Console.WriteLine("Compiling ...");

    // Run 1: initial configuration.
    var threadInfo = GetThreadInfo();
    var answer = GetAnswer();
    RunTest(threadInfo, answer);

    // Run 2: half the threads per block.
    ThreadsPerBlock /= 2;
    threadInfo = GetThreadInfo();
    answer = GetAnswer();
    RunTest(threadInfo, answer);

    // Run 3: a quarter of the original threads per block.
    ThreadsPerBlock /= 2;
    threadInfo = GetThreadInfo();
    answer = GetAnswer();
    RunTest(threadInfo, answer);

    // Run 4: additionally half the blocks per grid.
    BlocksPerGrid /= 2;
    threadInfo = GetThreadInfo();
    answer = GetAnswer();
    RunTest(threadInfo, answer);

    Console.WriteLine("Done ... Press Enter to shutdown.");
    try
    {
        Console.Read();
    }
    catch (InvalidOperationException)
    {
        // Deliberately ignored: a failed console read must not prevent the cleanup below.
    }

    CudafyHost.GetDevice().FreeAll();
    CudafyHost.GetDevice().HostFreeAll();
}
/// <summary>
/// Launches the <c>adder</c> kernel on N single-thread blocks over two N-element arrays
/// (a[i] = -i, b[i] = i*i) and prints every operand pair with its result.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    // Host arrays.
    int[] a = new int[N];
    int[] b = new int[N];
    int[] c = new int[N];

    // Device arrays sized from the host ones.
    int[] dev_a = gpu.Allocate<int>(a);
    int[] dev_b = gpu.Allocate<int>(b);
    int[] dev_c = gpu.Allocate<int>(c);

    // Fill the operands on the CPU.
    for (int k = 0; k < N; k++)
    {
        a[k] = -k;
        b[k] = k * k;
    }

    // Upload the operands.
    gpu.CopyToDevice(a, dev_a);
    gpu.CopyToDevice(b, dev_b);

    // N blocks of one thread each.
    gpu.Launch(N, 1).adder(dev_a, dev_b, dev_c);

    // Download the result.
    gpu.CopyFromDevice(dev_c, c);

    // Show every sum.
    for (int k = 0; k < N; k++)
    {
        Console.WriteLine("{0} + {1} = {2}", a[k], b[k], c[k]);
    }

    // Release the three device arrays individually.
    gpu.Free(dev_a);
    gpu.Free(dev_b);
    gpu.Free(dev_c);
}
/// <summary>
/// Loads the same translated module (cloned for the second device) onto devices 0 and 1,
/// then runs the two worker methods on separate threads up to ten times. Each thread must
/// finish within 10 seconds; the test asserts that the last pair of joins both succeeded.
/// </summary>
public void Test_TwoThreadTwoGPUVer2()
{
    eArchitecture targetArch = CudafyModes.Target == eGPUType.OpenCL
        ? eArchitecture.OpenCL
        : eArchitecture.sm_11;

    // Device 0: load the module and allocate its input buffer.
    _gpu0 = CudafyHost.GetDevice(CudafyModes.Target, 0);
    var module0 = CudafyTranslator.Cudafy(targetArch, typeof(MultiGPUTests));
    _gpu0.SetCurrentContext();
    _gpu0.LoadModule(module0);
    _gpuuintBufferIn0 = _gpu0.Allocate(_uintBufferIn0);

    // Device 1: the same module cannot be loaded onto two devices, so load a clone.
    _gpu1 = CudafyHost.GetDevice(CudafyModes.Target, 1);
    var module1 = module0.Clone();
    _gpu1.SetCurrentContext();
    _gpu1.LoadModule(module1);
    _gpuuintBufferIn1 = _gpu1.Allocate(_uintBufferIn1);

    _gpu0.EnableMultithreading();
    _gpu1.EnableMultithreading();

    bool firstJoined = false;
    bool secondJoined = false;
    for (int round = 0; round < 10; round++)
    {
        Console.WriteLine(round);

        Thread worker0 = new Thread(Test_TwoThreadTwoGPU_Thread0V2);
        Thread worker1 = new Thread(Test_TwoThreadTwoGPU_Thread1V2);
        worker0.Start();
        worker1.Start();

        // Wait up to 10 s for each worker; stop iterating as soon as one times out.
        firstJoined = worker0.Join(10000);
        secondJoined = worker1.Join(10000);
        if (!(firstJoined && secondJoined))
        {
            break;
        }
    }

    // Clean up both devices before asserting.
    _gpu0.DisableMultithreading();
    _gpu0.FreeAll();
    _gpu1.DisableMultithreading();
    _gpu1.FreeAll();

    Assert.IsTrue(firstJoined);
    Assert.IsTrue(secondJoined);
}
/// <summary>
/// Runs <c>mykernel</c> on a <paramref name="dimX"/> x <paramref name="dimY"/> grid of
/// single-thread blocks with two device buffers - an empty one and a copy of
/// <paramref name="ptr"/> - then copies the first buffer back into <paramref name="ptr"/>.
/// </summary>
/// <param name="ptr">Host bitmap: uploaded as the kernel's second argument and overwritten with the kernel's first buffer.</param>
/// <param name="dimX">Grid size in x.</param>
/// <param name="dimY">Grid size in y.</param>
public static void MyExecute(byte[] ptr, int dimX, int dimY)
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    // First kernel argument: freshly allocated buffer. Second: device copy of the host data.
    byte[] allocated_dev_bitmap = gpu.Allocate<byte>(ptr.Length);
    byte[] copied_dev_bitmap = gpu.CopyToDevice(ptr);

    dim3 grid = new dim3(dimX, dimY);
    gpu.Launch(grid, 1).mykernel(allocated_dev_bitmap, copied_dev_bitmap);

    gpu.CopyFromDevice(allocated_dev_bitmap, ptr);
    gpu.FreeAll();
}
/// <summary>
/// Runs <c>SIMDFunctionTest</c> three times over freshly filled 1024x1024 uint arrays,
/// printing the GPU-timed duration of each launch plus download. On the first pass the
/// result is also compared element by element against <c>GThread.vadd2</c> on the host.
/// </summary>
public static void Execute()
{
    _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
    CudafyModule module = CudafyTranslator.Cudafy(ePlatform.Auto, Program.testArchitecture, typeof(SIMDFunctions));
    //CudafyModule km = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_12, typeof(SIMDFunctions));
    _gpu.LoadModule(module);

    int w = 1024;
    int h = 1024;
    int elementCount = w * h;

    for (int pass = 0; pass < 3; pass++)
    {
        // Operands are filled and uploaded one after the other (a first, then b).
        uint[] a = new uint[elementCount];
        Fill(a);
        uint[] dev_a = _gpu.CopyToDevice(a);

        uint[] b = new uint[elementCount];
        Fill(b);
        uint[] dev_b = _gpu.CopyToDevice(b);

        uint[] c = new uint[elementCount];
        uint[] dev_c = _gpu.Allocate(c);

        // Timed section: kernel launch plus result download.
        _gpu.StartTimer();
        _gpu.Launch(h, w, "SIMDFunctionTest", dev_a, dev_b, dev_c);
        _gpu.CopyFromDevice(dev_c, c);
        float time = _gpu.StopTimer();
        Console.WriteLine("Time: {0}", time);

        // Only the first pass is verified; every element is checked (no early exit).
        if (pass == 0)
        {
            GThread hostThread = new GThread(1, 1, null);
            bool passed = true;
            for (int k = 0; k < elementCount; k++)
            {
                uint expected = hostThread.vadd2(a[k], b[k]);
                if (expected != c[k])
                {
                    passed = false;
                }
            }
            Console.WriteLine("Test {0}", passed ? "passed. " : "failed!");
        }

        _gpu.FreeAll();
    }
}