/// <summary>
/// Runs the <c>matrixSumCude</c> kernel over a stack of 2-D matrices and returns the
/// Y*X result buffer copied back from the device.
/// </summary>
/// <param name="matrix">
/// Stack of matrices; the dimensions of <c>matrix[0]</c> are used for every layer, so at
/// least one matrix must be present (presumably all layers share that shape; verify
/// against the caller).
/// </param>
/// <returns>A flat array of length Y*X holding the values written by the kernel.</returns>
static double[] SumMatrixManagedCuda(double[][,] matrix)
{
    int Z = matrix.Length;
    int Y = matrix[0].GetLength(0);
    int X = matrix[0].GetLength(1);

    var result = new double[Y * X];
    var lm = ToLinearArray(matrix);
    int N = lm.Length;

    matrixSumCude.SetComputeSize((uint)X, (uint)Y);

    // Both device buffers are released in finally blocks so that neither a failed
    // copy nor a failed kernel launch leaks device memory.
    var da = cntxt.AllocateMemory(N * sizeof(double));
    try
    {
        var db = cntxt.AllocateMemory(result.Length * sizeof(double));
        try
        {
            cntxt.CopyToDevice(da, lm);
            // The output buffer is seeded with the zero-initialised host array.
            cntxt.CopyToDevice(db, result);

            // Invoke kernel
            matrixSumCude.Run(db, da, X, Y, Z);

            cntxt.CopyToHost<double>(result, db);
        }
        finally
        {
            cntxt.FreeMemory(db);
        }
    }
    finally
    {
        cntxt.FreeMemory(da);
    }

    return result;
}
/// <summary>
/// Returns a device buffer for the requested byte count, padded to <c>MemoryAlignment</c>.
/// A buffer queued in the pool for that padded size is reused when one is available;
/// otherwise a new buffer is allocated from the context.
/// </summary>
/// <param name="byteCount">Requested size in bytes, before padding.</param>
/// <returns>A pooled or newly allocated <see cref="IDeviceMemory"/>.</returns>
public IDeviceMemory Allocate(long byteCount)
{
    long paddedSize = PadToAlignment(byteCount, MemoryAlignment);

    lock (locker)
    {
        if (!pools.TryGetValue(paddedSize, out Queue<IDeviceMemory> bucket))
        {
            // First request for this size: register an empty pool for it.
            bucket = new Queue<IDeviceMemory>();
            pools.Add(paddedSize, bucket);
        }
        else if (bucket.Count > 0)
        {
            IDeviceMemory recycled = bucket.Dequeue();

            // HACK bizarrely, Queue.Dequeue appears to sometimes return null, even when there are many elements in the queue,
            // and when the queue is only ever accessed from one thread.
            if (recycled != null)
            {
                return recycled;
            }
        }

        // Reaching here means no reusable buffer was handed out; allocate a fresh one.
        // Up to three attempts: plain, after FreeMemory(false), and after FreeMemory(true).
        CUdeviceptr rawBuffer;
        try
        {
            try
            {
                context.SetCurrent();
                rawBuffer = context.AllocateMemory(paddedSize);
            }
            catch (ManagedCuda.CudaException)
            {
                FreeMemory(false);
                rawBuffer = context.AllocateMemory(paddedSize);
            }
        }
        catch (ManagedCuda.CudaException)
        {
            FreeMemory(true);
            rawBuffer = context.AllocateMemory(paddedSize);
        }

        // The callback puts the wrapper back into its size bucket under the same lock.
        BasicDeviceMemory wrapper = null;
        wrapper = new BasicDeviceMemory(rawBuffer, () =>
        {
            lock (locker)
            {
                bucket.Enqueue(wrapper);
            }
        });

        return wrapper;
    }
}
/// <summary>
/// Launches the <c>_multiEuclidean</c> kernel for <paramref name="vector"/> against every
/// device pointer in <paramref name="compareTo"/>.
/// </summary>
/// <param name="vector">Device memory holding the reference vector.</param>
/// <param name="compareTo">Device pointers of the vectors to compare against.</param>
/// <param name="size">Per-vector size passed to the kernel (presumably the element count; confirm).</param>
/// <returns>Device memory of <c>size * compareTo.Length</c> entries written by the kernel.</returns>
internal IDeviceMemoryPtr MultiEuclideanDistance(IDeviceMemoryPtr vector, CUdeviceptr[] compareTo, int size)
{
    int vectorCount = compareTo.Length;

    // Temporary device-side table of the compareTo pointers (8 bytes reserved per entry).
    var pointerTable = _cuda.AllocateMemory(8 * vectorCount);
    try
    {
        _cuda.CopyToDevice(pointerTable, compareTo);

        IDeviceMemoryPtr distances = Allocate(size * vectorCount);
        _Use(
            _multiEuclidean,
            size,
            vectorCount,
            kernel => kernel.Run(0, vector.DevicePointer, pointerTable, distances.DevicePointer, size, vectorCount));

        return distances;
    }
    finally
    {
        // The pointer table is released whether or not the kernel call succeeded.
        _cuda.FreeMemory(pointerTable);
    }
}
/// <summary>
/// Launches the <c>_multiEuclidean</c> kernel for <paramref name="vector"/> against every
/// device pointer in <paramref name="compareTo"/>.
/// </summary>
/// <param name="vector">Device variable holding the reference vector.</param>
/// <param name="compareTo">Device pointers of the vectors to compare against.</param>
/// <param name="size">Per-vector size passed to the kernel (presumably the element count; confirm).</param>
/// <returns>
/// A new device variable of <c>size * compareTo.Length</c> floats written by the kernel.
/// The caller owns it and is responsible for disposing it.
/// </returns>
internal CudaDeviceVariable<float> MultiEuclideanDistance(CudaDeviceVariable<float> vector, CUdeviceptr[] compareTo, int size)
{
    CudaDeviceVariable<float> ret = null;

    // Temporary device-side table of the compareTo pointers (8 bytes reserved per entry).
    var buffer = _cuda.AllocateMemory(8 * compareTo.Length);
    try
    {
        _cuda.CopyToDevice(buffer, compareTo);
        ret = new CudaDeviceVariable<float>(size * compareTo.Length);
        _Use(_multiEuclidean, size, compareTo.Length, k => k.Run(0, vector.DevicePointer, buffer, ret.DevicePointer, size, compareTo.Length));
    }
    catch
    {
        // The result never reaches the caller on failure, so release it here
        // instead of leaking the device allocation.
        if (ret != null)
        {
            ret.Dispose();
        }

        throw;
    }
    finally
    {
        _cuda.FreeMemory(buffer);
    }

    return ret;
}
/// <summary>
/// Returns a device buffer for the requested byte count, padded to <c>MemoryAlignment</c>.
/// A buffer queued in the pool for that padded size is reused when available; otherwise a
/// new one is allocated from the context.
/// </summary>
/// <param name="byteCount">Requested size in bytes, before padding.</param>
/// <returns>A pooled or newly allocated <see cref="IDeviceMemory"/>.</returns>
public IDeviceMemory Allocate(long byteCount)
{
    var paddedSize = PadToAlignment(byteCount, MemoryAlignment);

    Queue<IDeviceMemory> bucket;
    if (!pools.TryGetValue(paddedSize, out bucket))
    {
        // First request for this size: register an empty pool for it.
        bucket = new Queue<IDeviceMemory>();
        pools.Add(paddedSize, bucket);
    }
    else if (bucket.Count > 0)
    {
        var recycled = bucket.Dequeue();

        // HACK bizarrely, Queue.Dequeue appears to sometimes return null, even when there are many elements in the queue,
        // and when the queue is only ever accessed from one thread.
        if (recycled != null)
        {
            return recycled;
        }
    }

    // No reusable buffer was handed out, so allocate a fresh one. The callback given to
    // the wrapper puts it back into this size's bucket.
    var rawBuffer = context.AllocateMemory(paddedSize);

    BasicDeviceMemory wrapper = null;
    wrapper = new BasicDeviceMemory(rawBuffer, () =>
    {
        bucket.Enqueue(wrapper);
    });

    return wrapper;
}
/// <summary>
/// Allocates a scratch buffer sized as <c>ScratchSpacePerSMStream</c> multiplied by the
/// device's multiprocessor count.
/// </summary>
/// <param name="context">Context the buffer is allocated from.</param>
/// <param name="deviceProps">Device properties supplying the multiprocessor count.</param>
/// <returns>A <see cref="ScratchSpace"/> carrying the buffer and its size.</returns>
private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps)
{
    var totalBytes = ScratchSpacePerSMStream * deviceProps.MultiProcessorCount;

    return new ScratchSpace
    {
        size = totalBytes,
        buffer = context.AllocateMemory(totalBytes),
    };
}
/// <summary>
/// Allocates a scratch buffer of <c>ScratchSpacePerSMStream</c> bytes per multiprocessor
/// of the device and wraps it, together with its size, in a <see cref="ScratchSpace"/>.
/// </summary>
/// <param name="context">Context the buffer is allocated from.</param>
/// <param name="deviceProps">Device properties supplying the multiprocessor count.</param>
/// <returns>The populated <see cref="ScratchSpace"/>.</returns>
private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps)
{
    int smCount = deviceProps.MultiProcessorCount;
    int scratchBytes = ScratchSpacePerSMStream * smCount;

    var devicePtr = context.AllocateMemory(scratchBytes);

    return new ScratchSpace { size = scratchBytes, buffer = devicePtr };
}
/// <summary>
/// Creates an allocator that reserves one large block of device memory up front.
/// The block size is the context's currently free device memory multiplied by
/// <paramref name="memoryUsageRatio"/>.
/// </summary>
/// <param name="context">CUDA context to allocate from; it is made current first.</param>
/// <param name="memoryUsageRatio">Fraction of the free device memory to reserve.</param>
public PoolingDeviceAllocator(CudaContext context, float memoryUsageRatio = 0.9f)
{
    m_context = context;
    m_usedAddr2Size = new SortedDictionary<ulong, ulong>();

    context.SetCurrent();

    // Size the pool from the memory that is free right now.
    ulong freeBytes = (ulong)context.GetFreeDeviceMemorySize();
    m_ulAvailMemByteInTotal = (ulong)(freeBytes * memoryUsageRatio);

    // Reserve the block and record its address range.
    m_memPoolPtr = context.AllocateMemory(m_ulAvailMemByteInTotal);
    m_startMemAddr = m_memPoolPtr.Pointer;
    m_endMemAddr = m_startMemAddr + m_ulAvailMemByteInTotal;

    Logger.WriteLine($"Allocated Cuda memory: {m_ulAvailMemByteInTotal}, address from '{m_startMemAddr}' to '{m_endMemAddr}'");
}
/// <summary>
/// Loads the Buddhabrot PTX module, launches the <c>RunBuddha</c> kernel repeatedly until
/// <c>IsStopping</c> is set or <c>maxSamples</c> is reached, and returns the accumulated
/// per-pixel array copied back from the device.
/// </summary>
/// <returns>An array of <c>settings.Width * settings.Height</c> values.</returns>
public uint[] Run()
{
    var ptx = @"C:\Src\_Tree\SmallPrograms\Buddhabrot\Buddhabrot.Cuda70\x64\Release\Buddhabrot.ptx";

    // The context is disposed when the method exits (normally or through an exception),
    // which tears it down together with the device allocations made below instead of
    // leaking them on every call.
    using (var context = new CudaContext())
    {
        var module = new CudaModuleHelper(context, ptx);
        var init = module.GetKernel("Init");
        var setSettings = module.GetKernel("SetSettings");
        var runBuddha = module.GetKernel("RunBuddha");

        var nBlocks = 4196;
        var nThreads = 256;

        var dSettings = context.AllocateMemoryFor(settings);
        context.CopyToDevice(dSettings, settings);

        var array = new uint[settings.Width * settings.Height];
        // One random-generator state per launched thread.
        var dState = context.AllocateMemory(nThreads * nBlocks * SizeOfCurandState);
        var dArray = context.AllocateMemoryFor(array);
        context.CopyToDevice(dArray, array);

        init.Launch(nBlocks, nThreads, dState);
        setSettings.Launch(1, 1, dSettings);

        Console.WriteLine("Starting...");
        var sw = Stopwatch.StartNew();
        long i = 0;
        while (!IsStopping)
        {
            runBuddha.Launch(nBlocks, nThreads, dArray, dState);

            // Number of threads launched so far, used as the sample count.
            double count = (++i * nBlocks * nThreads);
            if (i % 5 == 0)
            {
                Console.WriteLine("Generated {0:0.0} Million samples in {1:0.000} sec", count / 1000000.0, sw.ElapsedMilliseconds / 1000.0);
            }

            if (maxSamples.HasValue && count >= maxSamples)
            {
                break;
            }
        }

        // Copy the result to the host before the context goes away.
        context.CopyToHost(array, dArray);
        return array;
    }
}
/// <summary>
/// Smoke test: loads the entry point named "kernel" from the given PTX image, runs it over
/// a 16-element int buffer initialised to <c>i - 2</c>, and prints the buffer contents
/// after the launch.
/// </summary>
/// <param name="ptxFile">PTX module image containing a kernel named "kernel".</param>
static void Test(byte[] ptxFile)
{
    const int size = 16;

    // Dispose the context and free the device buffer even if the launch or a copy
    // throws; previously both were leaked.
    using (var context = new CudaContext())
    {
        var kernel = context.LoadKernelPTX(ptxFile, "kernel");
        var memory = context.AllocateMemory(sizeof(int) * size);
        try
        {
            // Typed view over the raw allocation, used for the host/device copies.
            var gpuMemory = new CudaDeviceVariable<int>(memory);

            var cpuMemory = new int[size];
            for (var i = 0; i < size; i++)
            {
                cpuMemory[i] = i - 2;
            }

            gpuMemory.CopyToDevice(cpuMemory);

            // 4 blocks of 4 threads: one thread per element.
            kernel.BlockDimensions = 4;
            kernel.GridDimensions = 4;
            kernel.Run(memory);

            gpuMemory.CopyToHost(cpuMemory);
            for (var i = 0; i < size; i++)
            {
                Console.WriteLine("{0} = {1}", i, cpuMemory[i]);
            }
        }
        finally
        {
            context.FreeMemory(memory);
        }
    }
}
/// <summary>
/// Allocates <paramref name="byteCount"/> bytes from the context and wraps the pointer in
/// a <see cref="BasicDeviceMemory"/> whose callback frees that same pointer.
/// </summary>
/// <param name="byteCount">Number of bytes to allocate.</param>
/// <returns>The wrapped device allocation.</returns>
public IDeviceMemory Allocate(long byteCount)
{
    var devicePtr = context.AllocateMemory(byteCount);

    var wrapped = new BasicDeviceMemory(devicePtr, () =>
    {
        context.FreeMemory(devicePtr);
    });

    return wrapped;
}