/// <summary>
/// Entry point: creates a context and a Cuda accelerator on it, then runs the
/// three 2D demos against that accelerator.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
static void Main(string[] args)
{
    using (var context = new Context())
    using (var accelerator = new CudaAccelerator(context))
    {
        Demo2DDenseX(accelerator);
        Demo2DDenseY(accelerator);
        Demo2DTile(accelerator);
    }
}
/// <summary>
/// Runs the CuBlas Nrm2 sample on every available Cuda accelerator, once with
/// manual pointer mode handling and once with automatic pointer mode handling.
/// </summary>
static void Main()
{
    const int DataSize = 1024;
    const CuBlasAPIVersion CuBlasVersion = CuBlasAPIVersion.V10;

    using (var context = new Context())
    {
        // Enable algorithms library
        context.EnableAlgorithms();

        // Check for Cuda support
        foreach (var acceleratorId in CudaAccelerator.CudaAccelerators)
        {
            using (var accelerator = new CudaAccelerator(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");

                // Both buffers are released explicitly at the end of each iteration
                // instead of being left alive until the accelerator goes away.
                using (var buf = accelerator.Allocate<float>(DataSize))
                using (var buf2 = accelerator.Allocate<float>(DataSize))
                {
                    accelerator.Initialize(accelerator.DefaultStream, buf, 1.0f);
                    accelerator.Initialize(accelerator.DefaultStream, buf2.View, 1.0f);

                    // Initialize the CuBlas library using manual pointer mode handling
                    // (default behavior)
                    using (var blas = new CuBlas(accelerator, CuBlasVersion))
                    {
                        // Set pointer mode to Host to enable data transfer to CPU memory
                        blas.PointerMode = CuBlasPointerMode.Host;
                        float output = blas.Nrm2(buf);

                        // Set pointer mode to Device to enable data transfer to GPU memory
                        blas.PointerMode = CuBlasPointerMode.Device;
                        blas.Nrm2(buf, buf2);

                        // Use pointer mode scopes to recover the previous pointer mode
                        using (var scope = blas.BeginPointerScope(CuBlasPointerMode.Host))
                        {
                            float output2 = blas.Nrm2(buf);
                        }
                    }

                    // Initialize the CuBlas<T> library using custom pointer mode handlers
                    using (var blas = new CuBlas<CuBlasPointerModeHandlers.AutomaticMode>(accelerator, CuBlasVersion))
                    {
                        // Automatic transfer to host
                        float output = blas.Nrm2(buf);

                        // Automatic transfer to device
                        blas.Nrm2(buf, buf2);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Sorts a copy of <paramref name="a"/> on the GPU by repeatedly launching the
/// OddEvenSort kernel with alternating phases and returns the sorted data.
/// </summary>
/// <param name="a">The values to sort; the array itself is not modified here.</param>
/// <param name="sw">
/// Stopwatch that is restarted after the upload and stopped after the last kernel
/// launch, so it measures the kernel iterations only.
/// </param>
/// <returns>The device buffer contents after the last iteration.</returns>
public static float[] RunOddEvenSort(float[] a, ref Stopwatch sw)
{
    int N = a.Length;
    bool evenArr = (N % 2) == 0;
    bool iterationEven = true;

    // Create context and accelerator; both are disposed when the method returns.
    using (var context = new Context())
    using (var gpu = new CudaAccelerator(context))
    {
        // Create typed launcher
        var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
            Index1, ArrayView<float>, VariableView<byte>, bool, int, bool>(OddEvenSort);

        // Allocate memory
        using (MemoryBuffer<float> d_a = gpu.Allocate<float>(N))
        using (MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1))
        {
            d_a.CopyFrom(a, 0, Index1.Zero, N);

            sw.Restart();

            // Run kernel
            byte[] zero_val = new byte[1];

            // A single phase without changes does not prove the data is sorted:
            // the even phase of [1, 3, 2, 4] swaps nothing although the array is
            // unsorted. Stop only after two consecutive phases left the flag at 0.
            // NOTE(review): assumes OddEvenSort handles just the phase selected by
            // iterationEven and raises the flag when it changes data - confirm.
            int quietPhases = 0;
            while (quietPhases < 2)
            {
                // Reset the device-side flag before every launch.
                d_stopFlag.CopyFrom(zero_val, 0, 0, 1);

                oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(), iterationEven, N, evenArr);
                gpu.Synchronize();

                quietPhases = d_stopFlag.GetAsArray()[0] > 0 ? 0 : quietPhases + 1;
                iterationEven = !iterationEven;
            }

            sw.Stop();

            // Read the result back before the buffers are disposed.
            return d_a.GetAsArray();
        }
    }
}
/// <summary>
/// Constructs a new buffer and allocates its backing memory through the Cuda
/// API's host memory allocation call.
/// </summary>
/// <param name="accelerator">The accelerator passed to the base buffer.</param>
/// <param name="length">The length passed to the base buffer.</param>
/// <param name="elementSize">The element size passed to the base buffer.</param>
public CudaProgressMemoryBuffer(
    CudaAccelerator accelerator,
    long length,
    int elementSize)
    : base(accelerator, length, elementSize)
{
    var api = CudaAPI.CurrentAPI;
    var status = api.AllocateHostMemory(
        out IntPtr hostMemory,
        new IntPtr(LengthInBytes));

    // Fail before publishing the pointer if the allocation was rejected.
    CudaException.ThrowIfFailed(status);
    NativePtr = hostMemory;
}
/// <summary>
/// Constructs a new Cuda buffer and registers the underlying DX buffer as a
/// Cuda graphics resource.
/// </summary>
/// <param name="accelerator">The target accelerator.</param>
/// <param name="d3dDevice">The target DX device.</param>
/// <param name="buffer">The target DX buffer.</param>
/// <param name="bufferFlags">The buffer flags.</param>
/// <param name="viewFlags">The view flags, also used for the registration.</param>
internal CudaDirectXBuffer(
    CudaAccelerator accelerator,
    Device d3dDevice,
    Buffer buffer,
    DirectXBufferFlags bufferFlags,
    DirectXViewFlags viewFlags)
    : base(accelerator, d3dDevice, buffer, bufferFlags, viewFlags)
{
    // Registration uses the Buffer member rather than the raw constructor argument.
    CudaDirectXAccelerator.RegisterResource(Buffer, viewFlags, out cudaGraphicsResource);
}
/// <summary>
/// Constructs a new Cuda texture 2D array and registers the DX texture as a
/// Cuda graphics resource.
/// </summary>
/// <param name="accelerator">The target accelerator.</param>
/// <param name="d3dDevice">The target DX device.</param>
/// <param name="texture">The target DX texture.</param>
/// <param name="bufferFlags">The used buffer flags.</param>
/// <param name="viewFlags">The used view flags, also used for the registration.</param>
internal CudaDirectXTexture2DArray(
    CudaAccelerator accelerator,
    Device d3dDevice,
    Texture2D texture,
    DirectXBufferFlags bufferFlags,
    DirectXViewFlags viewFlags)
    : base(accelerator, d3dDevice, texture, bufferFlags, viewFlags)
{
    CudaDirectXAccelerator.RegisterResource(texture, viewFlags, out cudaGraphicsResource);
}
/// <summary>
/// Demonstrates using EmitRef: runs SubtractEmitRefKernel over a buffer of
/// 32 longs and prints every resulting element.
/// </summary>
/// <param name="accelerator">The accelerator to allocate and launch on.</param>
static void SubtractUsingEmitRef(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<long>(32);

    var launch = accelerator
        .LoadAutoGroupedStreamKernel<Index1D, ArrayView<long>>(SubtractEmitRefKernel);
    launch((int)buffer.Length, buffer.View);

    // Copy the data back to the host and dump it with its index.
    var index = 0;
    foreach (var value in buffer.GetAsArray1D())
    {
        Console.WriteLine($"[{index}] = {value}");
        index++;
    }
}
/// <summary>
/// Demonstrates a block statement, with local register declaration: runs
/// MultipleInstructionKernel over a buffer of 1024 doubles and prints every
/// resulting element.
/// </summary>
/// <param name="accelerator">The accelerator to allocate and launch on.</param>
static void AddUsingTempRegister(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<double>(1024);

    var launch = accelerator
        .LoadAutoGroupedStreamKernel<Index1D, ArrayView<double>>(MultipleInstructionKernel);
    launch((int)buffer.Length, buffer.View);

    // Copy the data back to the host and dump it with its index.
    var index = 0;
    foreach (var value in buffer.GetAsArray1D())
    {
        Console.WriteLine($"[{index}] = {value}");
        index++;
    }
}
/// <summary>
/// Runs the squared distance computation through the shared optimised
/// implementation using the IlGpuKernelConstants kernel.
/// </summary>
/// <param name="gpu">The accelerator to run on.</param>
/// <param name="mSquaredDistances">Forwarded to the shared implementation.</param>
/// <param name="mCoordinates">Forwarded to the shared implementation.</param>
/// <param name="c">Forwarded to the shared implementation.</param>
/// <param name="n">Forwarded to the shared implementation; must be even.</param>
/// <exception cref="ArgumentException">Thrown when n is odd.</exception>
public static void IlGpuConstants(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    var isOdd = n % 2 != 0;
    if (isOdd)
    {
        throw new ArgumentException("n must be a multiple of 2");
    }

    IlGpuOptimisedImpl(
        gpu,
        mSquaredDistances,
        mCoordinates,
        c,
        n,
        "SquaredDistance.IlGpuConstants",
        IlGpuKernelConstants,
        i => new SpecializedValue<int>(i));
}
/// <summary>
/// Runs the squared distance computation through the shared optimised
/// implementation using the IlGpuKernelLocalMemory kernel.
/// </summary>
/// <param name="gpu">The accelerator to run on.</param>
/// <param name="mSquaredDistances">Forwarded to the shared implementation.</param>
/// <param name="mCoordinates">Forwarded to the shared implementation.</param>
/// <param name="c">Forwarded to the shared implementation.</param>
/// <param name="n">Forwarded to the shared implementation; must be even.</param>
/// <exception cref="ArgumentException">Thrown when n is odd.</exception>
public static void IlGpuLocalMemory(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    var isOdd = n % 2 != 0;
    if (isOdd)
    {
        throw new ArgumentException("n must be a multiple of 2");
    }

    IlGpuOptimisedImpl(
        gpu,
        mSquaredDistances,
        mCoordinates,
        c,
        n,
        "SquaredDistance.IlGpuLocalMemory",
        IlGpuKernelLocalMemory);
}
/// <summary>
/// Uses the low-level CuFFT API to perform an inverse transform of
/// <paramref name="input"/>, scales the result by its length and prints the
/// real parts.
/// </summary>
/// <param name="accelerator">The accelerator used for the stream and buffers.</param>
/// <param name="api">The CuFFT API used to create, run and destroy the plan.</param>
/// <param name="input">The values to transform.</param>
/// <param name="output">Receives the scaled inverse transform.</param>
static void DoInversePlan(
    CudaAccelerator accelerator,
    CuFFTAPI api,
    Complex[] input,
    out Complex[] output)
{
    // Keep the created stream under 'using' and cast explicitly: an unexpected
    // stream type now fails right here with an InvalidCastException instead of
    // passing null on to SetStream/GetAsArray1D.
    using var createdStream = accelerator.CreateStream();
    var stream = (CudaStream)createdStream;

    using var inputBuffer = accelerator.Allocate1D(input);
    using var outputBuffer = accelerator.Allocate1D<Complex>(input.Length);

    CuFFTException.ThrowIfFailed(
        api.Plan1D(
            out var plan,
            input.Length,
            CuFFTType.CUFFT_Z2Z,
            batch: 1));
    try
    {
        CuFFTException.ThrowIfFailed(
            api.SetStream(plan, stream));
        CuFFTException.ThrowIfFailed(
            api.ExecZ2Z(
                plan,
                inputBuffer.View.BaseView,
                outputBuffer.View.BaseView,
                CuFFTDirection.INVERSE));

        output = outputBuffer.GetAsArray1D(stream);
    }
    finally
    {
        // The plan is destroyed whether or not the transform succeeded.
        CuFFTException.ThrowIfFailed(
            api.Destroy(plan));
    }
    WorkaroundKnownIssue(accelerator, api);

    // Scale the output to obtain the inverse.
    for (var i = 0; i < output.Length; i++)
    {
        output[i] /= output.Length;
    }

    Console.WriteLine("Inverse Values:");
    for (var i = 0; i < output.Length; i++)
    {
        Console.WriteLine($"   [{i}] = {output[i].Real}");
    }
}
/// <summary>
/// Constructs a page lock scope for the accelerator by registering the given
/// host memory with the Cuda API.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="hostPtr">The host buffer pointer to page lock.</param>
/// <param name="numElements">The number of elements in the buffer.</param>
/// <exception cref="NotSupportedException">
/// Thrown when the device does not support mapping host memory.
/// </exception>
internal CudaPageLockScope(
    CudaAccelerator accelerator,
    IntPtr hostPtr,
    long numElements)
    : base(accelerator, numElements)
{
    var device = accelerator.Device;
    if (!device.SupportsMappingHostMemory)
    {
        throw new NotSupportedException(
            RuntimeErrorMessages.NotSupportedPageLock);
    }
    HostPtr = hostPtr;

    bool reuseHostPointer = device.SupportsUsingHostPointerForRegisteredMemory;

    // Request an explicit device mapping only when the host pointer cannot be
    // used directly.
    var registerFlags = reuseHostPointer
        ? MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
        : MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE |
          MemHostRegisterFlags.CU_MEMHOSTREGISTER_DEVICEMAP;

    // Perform the memory registration.
    CudaException.ThrowIfFailed(
        CurrentAPI.MemHostRegister(
            hostPtr,
            new IntPtr(LengthInBytes),
            registerFlags));

    // Either reuse the host pointer for all operations or look up the device
    // pointer that belongs to the registered range.
    IntPtr lockedAddress = hostPtr;
    if (!reuseHostPointer)
    {
        CudaException.ThrowIfFailed(
            CurrentAPI.MemHostGetDevicePointer(
                out lockedAddress,
                hostPtr,
                0));
    }
    AddrOfLockedObject = lockedAddress;
}
/// <summary>
/// Renders the cat mesh with the per-pixel ray intersection kernel, converts the
/// canvas to 24bpp bitmap data, saves it as out.bmp and opens the file.
/// </summary>
public static void Main()
{
    using Context context = new Context();
    context.EnableAlgorithms();
    using Accelerator device = new CudaAccelerator(context);

    int width = 1920;
    int height = 1080;

    byte[] h_bitmapData = new byte[width * height * 3];

    using MemoryBuffer2D<Vec3> canvasData = device.Allocate<Vec3>(width, height);
    using MemoryBuffer<byte> d_bitmapData = device.Allocate<byte>(width * height * 3);

    CanvasData c = new CanvasData(canvasData, d_bitmapData, width, height);

    Camera camera = new Camera(
        new Vec3(0, 50, -100),  // pos
        new Vec3(0, 0, 0),      // look at
        new Vec3(0, -1, 0),     // up
        width,
        height,
        40f);

    WorldData world = new WorldData(device);
    //world.loadMesh(new Vec3(10, 0, 0), "./Assets/defaultcube.obj");
    world.loadMesh(new Vec3(0, 0, 0), "./Assets/cat.obj");

    var frameBufferToBitmap = device.LoadAutoGroupedStreamKernel<Index2, CanvasData>(CanvasData.CanvasToBitmap);
    var RTMethod = device.LoadAutoGroupedStreamKernel<Index2, CanvasData, dWorldBuffer, Camera>(PerPixelRayIntersectionMethod);

    // The timer covers both kernels and the copy back to host memory.
    Stopwatch timer = new Stopwatch();
    timer.Start();

    RTMethod(new Index2(width, height), c, world.getDeviceWorldBuffer(), camera);
    frameBufferToBitmap(canvasData.Extent, c);
    device.Synchronize();

    d_bitmapData.CopyTo(h_bitmapData, 0, 0, d_bitmapData.Extent);

    timer.Stop();
    Console.WriteLine("Rendered in: " + timer.Elapsed);

    // The Bitmap wraps the managed byte array directly, using a stride of
    // width * 3 bytes (no row padding). The array has to stay pinned for as long
    // as the Bitmap exists; otherwise the GC may move it while GDI+ still reads
    // from the old address.
    GCHandle pinnedBitmapData = GCHandle.Alloc(h_bitmapData, GCHandleType.Pinned);
    try
    {
        using (Bitmap b = new Bitmap(
            width,
            height,
            width * 3,
            PixelFormat.Format24bppRgb,
            pinnedBitmapData.AddrOfPinnedObject()))
        {
            b.Save("out.bmp");
        }
    }
    finally
    {
        pinnedBitmapData.Free();
    }

    Process.Start("cmd.exe", "/c out.bmp");
}
/// <summary>
/// Demonstrates using the mul.hi.u64 and mul.lo.u64 inline PTX instructions to
/// multiply two UInt64 values to produce a UInt128 value. Launches
/// MultiplyUInt128Kernel over 1024 elements with ulong.MaxValue as specialized
/// value and prints every resulting element.
/// </summary>
/// <param name="accelerator">The accelerator to allocate and launch on.</param>
static void MultiplyUInt128(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<UInt128>(1024);

    var launch = accelerator.LoadAutoGroupedStreamKernel<
        Index1D,
        ArrayView<UInt128>,
        SpecializedValue<ulong>>(MultiplyUInt128Kernel);
    launch(
        (int)buffer.Length,
        buffer.View,
        SpecializedValue.New(ulong.MaxValue));

    // Copy the data back to the host and dump it with its index.
    var index = 0;
    foreach (var value in buffer.GetAsArray1D())
    {
        Console.WriteLine($"[{index}] = {value}");
        index++;
    }
}
/// <summary>
/// Benchmarks ManyMatrixMultiplication: the managed implementation writes one
/// result array, the Alea implementation the other, and both are compared
/// through AssertAreEqual.
/// </summary>
/// <param name="aleaGpu">The Alea GPU handed to the Alea implementation.</param>
/// <param name="ilGpu">Not used by this benchmark.</param>
private static void RunManyMatrixMultiplication(Gpu aleaGpu, CudaAccelerator ilGpu)
{
    const int m = 100;
    const int n = 250 - 1;

    // Inputs shared by both implementations.
    var left = new Real[m * n * n];
    var right = new Real[n * n];

    // One result array per implementation.
    var managedResult = new Real[m * n * n];
    var cudaResult = new Real[m * n * n];

    Benchmark.Run(
        Loops,
        () => ManyMatrixMultiplication.Initialise(left, right, m, n),
        () => ManyMatrixMultiplication.Initialise(left, right, m, n),
        () => AssertAreEqual(managedResult, cudaResult, m * n, n),
        () => ManyMatrixMultiplication.Managed(managedResult, left, right, m, n),
        () => ManyMatrixMultiplication.Alea(aleaGpu, cudaResult, left, right, m, n));
}
/// <summary>
/// Computes the hull of the given points: finds the points with the smallest
/// and largest X coordinate on the host, uploads both coordinate arrays to a
/// Cuda accelerator and calls FindHull once for each side (1 and -1).
/// </summary>
/// <param name="points">The points as separate X and Y coordinate arrays.</param>
/// <returns>The set filled by the two FindHull calls.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the coordinate arrays differ in length or hold fewer than 3 points.
/// </exception>
public static HashSet<Point> QuickHull(Points points)
{
    if (points.Xs.Length != points.Ys.Length)
    {
        throw new ArgumentException($"Invalid {nameof(Points)} structure");
    }
    if (points.Xs.Length <= 2)
    {
        throw new ArgumentException($"Too little points: {points.Xs.Length}, expected 3 or more");
    }

    var hull = new HashSet<Point>();

    // Track the extreme points as separate coordinates, starting at point 0.
    var leftX = points.Xs[0];
    var leftY = points.Ys[0];
    var rightX = points.Xs[0];
    var rightY = points.Ys[0];
    for (var i = 0; i < points.Xs.Length; i++)
    {
        var x = points.Xs[i];
        if (x < leftX)
        {
            leftX = x;
            leftY = points.Ys[i];
        }
        if (x > rightX)
        {
            rightX = x;
            rightY = points.Ys[i];
        }
    }

    using (var context = new Context())
    {
        context.EnableAlgorithms();
        using (var accelerator = new CudaAccelerator(context))
        using (var xsBuffer = accelerator.Allocate<float>(points.Xs.Length))
        using (var ysBuffer = accelerator.Allocate<float>(points.Ys.Length))
        {
            xsBuffer.CopyFrom(points.Xs, 0, 0, points.Xs.Length);
            ysBuffer.CopyFrom(points.Ys, 0, 0, points.Ys.Length);

            FindHull(in points, leftX, leftY, rightX, rightY, 1, hull, accelerator, xsBuffer.View, ysBuffer.View);
            FindHull(in points, leftX, leftY, rightX, rightY, -1, hull, accelerator, xsBuffer.View, ysBuffer.View);

            return hull;
        }
    }
}
/// <summary>
/// Benchmarks AddVector: the managed implementation works on one matrix, the
/// GPU implementations on the other, and both are compared through
/// AssertAreEqual. The Alea variant is only included when USE_ALEA is defined.
/// </summary>
/// <param name="aleaGpu">The Alea GPU (only used when USE_ALEA is defined).</param>
/// <param name="ilGpu">The accelerator handed to AddVector.IlGpu.</param>
private static void RunAddVector(Gpu aleaGpu, CudaAccelerator ilGpu)
{
    const int m = 2 * 24 * 12;
    const int n = 2 * 25600 - 1;

    // One matrix per side of the comparison, plus the shared vector.
    var managedMatrix = new Real[m * n];
    var cudaMatrix = new Real[m * n];
    var vector = new Real[n];

    Benchmark.Run(
        Loops,
        () => AddVector.Initialise(managedMatrix, vector, m, n),
        () => AddVector.Initialise(cudaMatrix, vector, m, n),
        () => AssertAreEqual(managedMatrix, cudaMatrix, m, n),
        () => AddVector.Managed(managedMatrix, vector, m, n),
#if USE_ALEA
        () => AddVector.Alea(aleaGpu, cudaMatrix, vector, m, n),
#endif
        () => AddVector.IlGpu(ilGpu, cudaMatrix, vector, m, n));
}
/// <summary>
/// Uploads the matrix and the vector, launches IlGpuKernel with 32 x 8 groups
/// covering n columns and m rows, prints the timing and copies the matrix back
/// into <paramref name="matrix"/>.
/// </summary>
/// <param name="gpu">The accelerator to run on.</param>
/// <param name="matrix">The matrix data; overwritten with the device result.</param>
/// <param name="vector">The vector data.</param>
/// <param name="m">Passed to the kernel; determines the grid size in Y.</param>
/// <param name="n">Passed to the kernel; determines the grid size in X.</param>
public static void IlGpu(CudaAccelerator gpu, Real[] matrix, Real[] vector, int m, int n)
{
    const int groupWidth = 32;
    const int groupHeight = 8;

    using (var deviceMatrix = gpu.Allocate(matrix))
    using (var deviceVector = gpu.Allocate(vector))
    {
        // The timer starts after the upload and is reported before the download.
        var stopwatch = Stopwatch.StartNew();

        var groupsX = Util.DivUp(n, groupWidth);
        var groupsY = Util.DivUp(m, groupHeight);
        var launchParameters = ((groupsX, groupsY, 1), (groupWidth, groupHeight));

        gpu.Launch(
            IlGpuKernel,
            gpu.DefaultStream,
            launchParameters,
            deviceMatrix.View,
            deviceVector.View,
            m,
            n);
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, "AddVector.IlGpu", 3, m, n);

        deviceMatrix.CopyTo(matrix, 0, 0, matrix.Length);
    }
}
/// <summary>
/// Multiplies two N x N matrices on the GPU with the MatrixMulShared kernel and
/// returns the flattened result.
/// </summary>
/// <param name="a">The left matrix; flattened with FlatternArr before upload.</param>
/// <param name="b">The right matrix; flattened with FlatternArr before upload.</param>
/// <param name="N">The matrix dimension; N * N elements are allocated per buffer.</param>
/// <param name="sw">
/// Stopwatch that is restarted right before the kernel launch and stopped after
/// the accelerator has been synchronized.
/// </param>
/// <returns>The contents of the result buffer.</returns>
public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
{
    // Create context and accelerator; both are disposed when the method returns.
    using (var context = new Context())
    using (var gpu = new CudaAccelerator(context))
    {
        // Create typed launcher
        var matrixMulKernelShared = gpu.LoadStreamKernel<
            ArrayView<float>,
            ArrayView<float>,
            ArrayView<float>,
            int>(MatrixMulShared);

        // Allocate memory
        var buffSize = N * N;
        using (MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize))
        {
            d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
            d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

            // Groups per grid dimension
            int GrPerDim = (int)Math.Ceiling((float)N / groupSize);

            KernelConfig dimension = (
                new Index2(GrPerDim, GrPerDim),     // Number of groups
                new Index2(groupSize, groupSize));  // Group size (thread count in group)

            sw.Restart();

            matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

            // Wait for the kernel to finish...
            gpu.Synchronize();

            sw.Stop();

            // Read the result back before the buffers are disposed.
            return d_c.GetAsArray();
        }
    }
}
/// <summary>
/// Constructs a page lock scope for the accelerator by registering the given
/// host memory with the Cuda API.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="hostPtr">The host buffer pointer to page lock.</param>
/// <param name="numElements">The number of elements in the buffer.</param>
/// <exception cref="NotSupportedException">
/// Thrown when the device does not support mapping host memory.
/// </exception>
internal CudaPageLockScope(
    CudaAccelerator accelerator,
    IntPtr hostPtr,
    long numElements)
    : base(accelerator)
{
    var device = accelerator.Device;
    if (!device.SupportsMappingHostMemory)
    {
        throw new NotSupportedException(
            RuntimeErrorMessages.NotSupportedPageLock);
    }

    HostPtr = hostPtr;
    Length = numElements;

    bool reuseHostPointer = device.SupportsUsingHostPointerForRegisteredMemory;

    // Request an explicit device mapping only when the host pointer cannot be
    // used directly.
    var registerFlags = reuseHostPointer
        ? MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
        : MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE |
          MemHostRegisterFlags.CU_MEMHOSTREGISTER_DEVICEMAP;

    CudaException.ThrowIfFailed(
        CurrentAPI.MemHostRegister(
            hostPtr,
            new IntPtr(LengthInBytes),
            registerFlags));

    // Either reuse the host pointer or look up the device pointer that belongs
    // to the registered range.
    IntPtr lockedAddress = hostPtr;
    if (!reuseHostPointer)
    {
        CudaException.ThrowIfFailed(
            CurrentAPI.MemHostGetDevicePointer(
                out lockedAddress,
                hostPtr,
                0));
    }
    AddrOfLockedObject = lockedAddress;
}
/// <summary>
/// Uploads both arrays, launches IlGpuKernel with groups of 128 threads covering
/// n * n work items, prints the timing and copies the squared distances back
/// into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">The accelerator to run on.</param>
/// <param name="mSquaredDistances">Uploaded, then overwritten with the device result.</param>
/// <param name="mCoordinates">The coordinate data to upload.</param>
/// <param name="c">Passed to the kernel.</param>
/// <param name="n">Passed to the kernel; n * n determines the grid size.</param>
public static void IlGpu(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    const int blockSize = 128;

    using (var deviceDistances = gpu.Allocate(mSquaredDistances))
    using (var deviceCoordinates = gpu.Allocate(mCoordinates))
    {
        // The timer starts after the upload and is reported before the download.
        var stopwatch = Stopwatch.StartNew();

        var gridSize = Util.DivUp(n * n, blockSize);
        var launchParameters = (gridSize, blockSize);

        gpu.Launch(
            IlGpuKernel,
            gpu.DefaultStream,
            launchParameters,
            deviceDistances.View,
            deviceCoordinates.View,
            c,
            n);
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, "SquaredDistance.IlGpu", n, c, n);

        deviceDistances.CopyTo(mSquaredDistances, 0, 0, mSquaredDistances.Length);
    }
}
/// <summary>
/// Computes the hull of the given points: finds the points with the smallest
/// and largest X coordinate on the host, uploads the points to a Cuda
/// accelerator and calls FindHull once for each side (1 and -1).
/// </summary>
/// <param name="points">The input points; at least 3 are required.</param>
/// <returns>The set filled by the two FindHull calls.</returns>
/// <exception cref="ArgumentException">Thrown for fewer than 3 points.</exception>
public static HashSet<Point> QuickHull(Point[] points)
{
    if (points.Length <= 2)
    {
        throw new ArgumentException($"Too little points: {points.Length}, expected 3 or more");
    }

    var hull = new HashSet<Point>();

    // Point 0 seeds both extremes; the scan covers the remaining points.
    var leftMost = points[0];
    var rightMost = points[0];
    var index = 1;
    while (index < points.Length)
    {
        if (points[index].X < leftMost.X)
        {
            leftMost = points[index];
        }
        if (points[index].X > rightMost.X)
        {
            rightMost = points[index];
        }
        index++;
    }

    using (var context = new Context())
    {
        context.EnableAlgorithms();
        using (var accelerator = new CudaAccelerator(context))
        using (var buffer = accelerator.Allocate<Point>(points.Length))
        {
            buffer.CopyFrom(points, 0, 0, points.Length);

            FindHull(points, leftMost, rightMost, 1, hull, accelerator, buffer.View);
            FindHull(points, leftMost, rightMost, -1, hull, accelerator, buffer.View);

            return hull;
        }
    }
}
/// <summary>
/// Shared driver for the optimised squared distance kernels: allocates an
/// n x n result buffer, uploads the coordinates, launches
/// <paramref name="kernelFunc"/> with groups of 128 threads, prints the timing
/// under <paramref name="name"/> and copies the result into
/// <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">The accelerator to run on.</param>
/// <param name="mSquaredDistances">Receives the n x n device result.</param>
/// <param name="mCoordinates">The coordinate data to upload.</param>
/// <param name="c">Passed to the kernel as a specialized value.</param>
/// <param name="n">Passed to the kernel; determines buffer and grid size.</param>
/// <param name="name">The label handed to Util.PrintPerformance.</param>
/// <param name="kernelFunc">The kernel to launch.</param>
private static void IlGpuOptimisedImpl(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
{
    const int blockSize = 128;

    using (var deviceDistances = gpu.Allocate<Real>(n, n))
    using (var deviceCoordinates = gpu.Allocate(mCoordinates))
    {
        // The timer starts after the upload and is reported before the download.
        var stopwatch = Stopwatch.StartNew();

        var gridSize = Util.DivUp(n, blockSize);
        var launchParameters = ((gridSize, gridSize, 1), (blockSize, 1, 1));

        gpu.Launch(
            kernelFunc,
            gpu.DefaultStream,
            launchParameters,
            deviceDistances.View,
            deviceCoordinates.View,
            SpecializedValue.New(blockSize),
            SpecializedValue.New(c),
            n);
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, name, n, c, n);

        deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
    }
}
/// <summary>
/// Measures how long it takes to run MathKernel 100000 times from a
/// Parallel.For loop, allocating and disposing a fresh 2D buffer per iteration,
/// and prints the elapsed time for each configured width/height pair.
/// </summary>
private static void Performance()
{
    using (var context = new Context())
    using (var accelerator = new CudaAccelerator(context))
    using (var backend = accelerator.CreateBackend())
    using (var compileUnit = accelerator.Context.CreateCompileUnit(backend))
    {
        // The explicitly compiled kernel is not launched below; the launcher
        // loaded from the accelerator is used instead.
        var method = typeof(Program).GetMethod("MathKernel", BindingFlags.Static | BindingFlags.Public);
        var compiled = backend.Compile(compileUnit, method);

        var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);

        int size = 100000;
        var W = new[] { 50 };
        var H = new[] { 50 };

        foreach (int x in W)
        {
            foreach (int y in H)
            {
                Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                // Note: sequential CPU, Parallel.For CPU and sequential GPU
                // variants of this measurement are not part of the active code.

                var kn = Enumerable.Repeat(
                    accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel),
                    size).ToList();

                var watch = Stopwatch.StartNew();
                Parallel.For(0, size, k =>
                {
                    var extent = new Index2(x, y);
                    var buffer = accelerator.Allocate<float>(extent);

                    kernel(extent, buffer.View);

                    accelerator.Synchronize();
                    buffer.Dispose();
                });
                watch.Stop();

                Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");
                GC.Collect();
            }
        }
    }
}
//Note: This program runs a *lot* faster in Release mode than Debug mode (because bounds checking is disabled in ILGPU).
/// <summary>
/// Searches the range [min, max) in chunks of <c>allocatedMemory</c> numbers for
/// values whose chain has the requested length, prints every hit and finally the
/// smallest one found. Runs on a Cuda accelerator when one can be created and on
/// the CPU accelerator otherwise.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
static void Main(string[] args)
{
    //As a note, it takes around 2 hours on CPU (Core i7 4790K) to search ~120B numbers
    long min = 0;
    long max = 113373373373;

    //Length of an array of longs.
    //Since longs are int64, multiply by ~8 for actual memory use in bytes
    const long allocatedMemory = 200000000;

    //Cache to store results that work in. As this grows, so does the time to search it for good results.
    const int resultCacheSize = 100;

    //Stores the minimum value for the specific depth
    long minForMax = long.MaxValue;

    //The chain length to search for:
    //E.G. 100 (OneHundred) -> 10 (Ten) -> 3 (Three) -> 5 (Five) -> 4 [end] is of length 4.
    const int chainLength = 8;

    //Include punctuation in the count or not.
    //E.G., With punctuation 137 -> "One Hundred and Thirty-Seven" (28 characters)
    //& Without punctuation, 137 -> "OneHundredandThirtySeven" (24 characters)
    const bool includePunctuation = false;

    //Stop when one number is found with specified chain length (obviously invalidates the percentage count)
    const bool stopAtOneFound = false;

    //min is increased as the program runs, so we need a copy of its original value for calculating the % done.
    long originalMin = min;

    using (var context = new Context())
    {
        Accelerator acc;
        try
        {
            acc = new CudaAccelerator(context);
        }
        catch (Exception)
        {
            //no cuda
            acc = new CPUAccelerator(context);
        }

        // Dispose the accelerator explicitly, whichever one was created.
        using (acc)
        {
            var a = acc;
            Console.WriteLine("Performing ops on " + a.Name + ". " + a.NumMultiprocessors.ToString() + " processors.");

            //Set up two kernels to get the data
            var searchKernel = a.LoadAutoGroupedStreamKernel<Index, ArrayView<UInt64>, long, long, bool>(SearchForChain);
            var resultKernel = a.LoadAutoGroupedStreamKernel<Index, ArrayView<UInt64>, ArrayView<UInt64>>(FindNonZero);

            using (var buffa = a.Allocate<UInt64>((int)allocatedMemory))
            using (var buffb = a.Allocate<UInt64>(resultCacheSize))
            {
                //Loop while we haven't gone over the maximum value in search range
                while (min < max)
                {
                    //Search for numbers first (Kernel)
                    searchKernel((int)allocatedMemory, buffa.View, min, chainLength, !includePunctuation);
                    a.Synchronize();

                    //Read back array to find nonzero entries (Kernel)
                    resultKernel(resultCacheSize, buffb, buffa);
                    a.Synchronize();

                    var arr = buffb.GetAsArray();
                    bool found = false;

                    //Read back the results array for nonzero entries (Normal .net)
                    for (int i = 0; i < buffb.Length; i++)
                    {
                        if (arr[i] != 0)
                        {
                            found = true;
                            Console.WriteLine(arr[i]);
                            if (arr[i] < (ulong)minForMax)
                            {
                                minForMax = (long)arr[i];
                            }
                        }
                    }

                    //break if we have found a number and had to stop at one found
                    if (found && stopAtOneFound)
                    {
                        break;
                    }

                    min += allocatedMemory;

                    //For displaying the percentage complete. The last chunk can
                    //step past max, so clamp to keep the value at 100% or below.
                    long total = max - originalMin;
                    long diff = Math.Min(min, max) - originalMin;
                    Console.WriteLine((((decimal)diff / (decimal)total) * 100).ToString() + "% complete");
                }
            }
        }
    }

    Console.WriteLine(NumberToString(minForMax) + " <=> (" + minForMax.ToString() + ") gave chain length: " + chainLength.ToString());
}
/// <summary>
/// Constructs a new CuBlas instance to access the Nvidia cublas library.
/// All work is delegated to the base constructor.
/// </summary>
/// <param name="accelerator">The associated cuda accelerator.</param>
public CuBlas(CudaAccelerator accelerator)
    : base(accelerator)
{
}
/// <summary>
/// Constructs a new CuBlas instance to access the Nvidia cublas library using
/// the given API version. All work is delegated to the base constructor.
/// </summary>
/// <param name="accelerator">The associated cuda accelerator.</param>
/// <param name="apiVersion">The cuBlas API version.</param>
public CuBlas(CudaAccelerator accelerator, CuBlasAPIVersion apiVersion)
    : base(accelerator, apiVersion)
{
}
/// <summary>
/// Constructs a new Cuda DX-interop accelerator. All work is delegated to the
/// base constructor.
/// </summary>
/// <param name="accelerator">The target Cuda accelerator.</param>
/// <param name="d3dDevice">The target DX device.</param>
internal CudaDirectXAccelerator(CudaAccelerator accelerator, Device d3dDevice)
    : base(accelerator, d3dDevice)
{
}
/// <summary>
/// Creates the Cuda-specific scan provider implementation for the given
/// accelerator.
/// </summary>
/// <param name="accelerator">The Cuda accelerator to create the implementation for.</param>
/// <returns>A new Cuda.CudaScanProviderImplementation instance.</returns>
public ScanProviderImplementation CreateCudaExtension(CudaAccelerator accelerator)
{
    var implementation = new Cuda.CudaScanProviderImplementation(accelerator);
    return implementation;
}
/// <summary>
/// Creates the context and the Cuda accelerator used by this implementation.
/// </summary>
public ImplCuda()
{
    var context = new Context();
    try
    {
        _gpu = new CudaAccelerator(context);
    }
    catch
    {
        // Nobody else holds the context yet, so release it here before the
        // failure is passed on to the caller.
        context.Dispose();
        throw;
    }

    _context = context;
}