/// <summary>
/// Smoke test: resets the device, creates a cuBLAS handle through
/// <c>MakeCublasHandle</c> and destroys it again.
/// </summary>
/// <returns>
/// "TestCublasHandle pass" when every call returned an empty status string;
/// otherwise a string starting with "TestCublasHandle fail".
/// </returns>
static string TestCublasHandle()
{
    string testName = "TestCublasHandle";
    var cuby = new CublasClr.Cublas();
    IntPtr devHandle = new IntPtr();
    var aa = new CudaArray();
    try
    {
        var res = aa.ResetDevice();
        res = res + cuby.MakeCublasHandle(ref devHandle);
        res = res + cuby.DestroyCublasHandle(devHandle);
        if (res != String.Empty)
        {
            return(testName + " fail: " + res);
        }
        return(testName + " pass");
    }
    catch (Exception ex)
    {
        // Report why the test failed instead of silently swallowing the exception.
        return(testName + " fail: " + ex.Message);
    }
    finally
    {
        // Always leave the device in a clean state for the next test.
        aa.ResetDevice();
    }
}
/// <summary>
/// Uploads the matrices from <c>MatrixUtils.AA()</c> and <c>MatrixUtils.BB()</c>,
/// multiplies them through <c>GpuMatrixOps.Multiply</c> and copies the product
/// back to the host.
/// </summary>
/// <returns>
/// The concatenated status strings of every device call. Previously this method
/// always returned an empty string, hiding any failure.
/// </returns>
static string TestcublasSgemm2()
{
    uint aw = 2;
    uint bh = aw;
    uint ah = 3;
    uint bw = 3;
    uint ch = ah;
    uint cw = bw;

    GpuMatrix gpuA;
    GpuMatrix gpuB;
    GpuMatrix gpuC;

    var dataA = MatrixUtils.AA();
    var dataB = MatrixUtils.BB();
    var cRes = new float[ch * cw];

    var cuby = new CublasClr.Cublas();
    var aa = new CudaArray();
    var res = aa.ResetDevice();

    // NOTE(review): A is declared with C's dimensions (ch x cw) rather than
    // (ah x aw). Left unchanged because the length of MatrixUtils.AA() is not
    // visible here - confirm which shape is intended.
    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuA,
        new Matrix <float>(_rows: ch, _cols: cw,
                           host_data: ImmutableArray.Create(dataA),
                           matrixFormat: MatrixFormat.Column_Major));

    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuB,
        new Matrix <float>(_rows: bh, _cols: bw,
                           host_data: ImmutableArray.Create(dataB),
                           matrixFormat: MatrixFormat.Column_Major));

    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuC,
        new Matrix <float>(_rows: ch, _cols: cw,
                           host_data: ImmutableArray.Create(cRes),
                           matrixFormat: MatrixFormat.Column_Major));

    IntPtr cublasHandle = new IntPtr();
    res = res + cuby.MakeCublasHandle(ref cublasHandle);

    GpuMatrix gpuProd;
    res = res + GpuMatrixOps.Multiply(
        gmOut: out gpuProd,
        cublasHandle: cublasHandle,
        gmA: gpuA, gmB: gpuB, gmC: gpuC);

    GpuMatrix gpuSynched;
    res = res + GpuMatrixOps.CopyToHost(out gpuSynched, gpuProd);

    // Release the cuBLAS handle; it was previously leaked.
    if (cublasHandle != IntPtr.Zero)
    {
        res = res + cuby.DestroyCublasHandle(cublasHandle);
    }

    // Propagate the accumulated status instead of discarding it.
    return(res);
}
/// <summary>
/// Uploads the host data of <paramref name="gmIn"/> to its existing device pointer.
/// </summary>
/// <param name="gmOut">
/// On success, a new <c>GpuMatrix</c> with the same matrix and device pointer,
/// marked <c>DevHostState.Synched</c>; <c>null</c> on failure.
/// </param>
/// <param name="gmIn">Matrix whose host data is copied to the device.</param>
/// <returns>An empty string on success, otherwise the error text.</returns>
public static string CopyToDevice(out GpuMatrix gmOut, GpuMatrix gmIn)
{
    if (gmIn.DevHostState == DevHostState.DeviceNotAllocated)
    {
        gmOut = null;
        return "Device data pointer not allocated";
    }

    var device = new CudaArray();
    var hostValues = gmIn.Matrix.Data.ToArray();
    string status = device.CopyFloatsToDevice(
        hostValues,
        gmIn.DevPtr,
        (uint)gmIn.Matrix.Data.Length);

    if (String.IsNullOrEmpty(status))
    {
        gmOut = new GpuMatrix(
            matrix: gmIn.Matrix,
            devPtr: gmIn.DevPtr,
            devHostState: DevHostState.Synched);
        return String.Empty;
    }

    // The copy reported a problem: hand the message back to the caller.
    gmOut = null;
    return status;
}
/// <summary>
/// Records the grid geometry, creates the helper objects and allocates and
/// fills the device buffers used by this simulation.
/// </summary>
/// <param name="inputs">Host values copied into <c>d_grid</c> (<c>span * span</c> elements).</param>
/// <param name="span">Side length of the square grid.</param>
/// <param name="blockSize">Divides <paramref name="span"/> to give the blocks per span.</param>
/// <returns>The concatenated status strings of every device call.</returns>
public static string Init(int[] inputs, uint span, uint blockSize)
{
    // Geometry.
    _span = span;
    _block_size = blockSize;
    _area = _span * _span;
    _blocks_per_span = span / blockSize;
    _blockCount = _blocks_per_span * _blocks_per_span;

    // Device pointers start out as zero.
    d_indexRands = IntPtr.Zero;
    d_grid = IntPtr.Zero;

    // Helper objects.
    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();
    _randoProcs = new RandoProcs();

    // Device setup; the call order is kept exactly as it was.
    string status = _cudaArray.ResetDevice();
    status += _randoProcs.MakeGenerator32(SEED);
    status += _cudaArray.MallocIntsOnDevice(ref d_grid, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_energy, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_energyBlocks, _area / 1024);
    status += _cudaArray.CopyIntsToDevice(inputs, d_grid, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_indexRands, _blockCount);
    status += _cudaArray.MallocFloatsOnDevice(ref d_tempRands, _blockCount);
    status += _gridProcs.Runk_Energy4(d_energy, d_grid, _span);
    status += _cudaArray.MallocFloatsOnDevice(ref d_betas, 5);
    return status;
}
/// <summary>
/// Downloads the device data of <paramref name="gmIn"/> into a fresh host array.
/// </summary>
/// <param name="gmOut">
/// On success, a new <c>GpuMatrix</c> wrapping the downloaded values (same rows,
/// columns and device pointer, column-major, marked <c>DevHostState.Synched</c>);
/// <c>null</c> on failure.
/// </param>
/// <param name="gmIn">Matrix whose device buffer is read.</param>
/// <returns>An empty string on success, otherwise the error text.</returns>
public static string CopyToHost(out GpuMatrix gmOut, GpuMatrix gmIn)
{
    if (gmIn.DevHostState == DevHostState.DeviceNotAllocated)
    {
        gmOut = null;
        return "Device data pointer not allocated";
    }

    var downloaded = new float[gmIn.Matrix.Data.Length];
    var device = new CudaArray();
    string status = device.CopyFloatsFromDevice(
        downloaded,
        gmIn.DevPtr,
        (uint)gmIn.Matrix.Data.Length);

    if (String.IsNullOrEmpty(status))
    {
        var hostMatrix = new Matrix <float>(
            _rows: gmIn.Matrix.Rows,
            _cols: gmIn.Matrix.Cols,
            host_data: ImmutableArray.Create(downloaded),
            matrixFormat: MatrixFormat.Column_Major);

        gmOut = new GpuMatrix(
            matrix: hostMatrix,
            devPtr: gmIn.DevPtr,
            devHostState: DevHostState.Synched);
        return String.Empty;
    }

    gmOut = null;
    return status;
}
/// <summary>
/// Creates the helper objects, derives grid, launch and memory sizes from
/// <paramref name="span"/>, then allocates and fills the device buffers.
/// </summary>
/// <param name="inputs">Host values copied into <c>d_grid</c>.</param>
/// <param name="span">Side length of the square grid.</param>
/// <returns>The concatenated status strings of every device call.</returns>
public static string Init(int[] inputs, uint span)
{
    // Helper libraries.
    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();
    _randoProcs = new RandoProcs();

    // Grid dimensions.
    _span = span;
    _area = _span * _span;

    // Launch configuration: the block size is capped at MAXTHREADS.
    _blockSize = (_span >= MAXTHREADS) ? MAXTHREADS : _span;
    _gridSize = _area / _blockSize;

    // Byte sizes of the buffers.
    _mem_N = sizeof(int) * (_area);
    _mem_rand = sizeof(double) * (3 * _area);
    _mem_1 = sizeof(int) * (1);
    _mem_measured_quantity = sizeof(int) * (_gridSize);
    _mem_measured_magnet = sizeof(int) * (_gridSize);

    // Device pointers start out as zero.
    d_rands = IntPtr.Zero;
    d_grid = IntPtr.Zero;

    // Device setup; the call order is kept exactly as it was.
    string status = _cudaArray.ResetDevice();
    status += _randoProcs.MakeGenerator64(SEED);
    status += _cudaArray.MallocIntsOnDevice(ref d_grid, _area);
    status += _cudaArray.CopyIntsToDevice(inputs, d_grid, _area);
    status += _cudaArray.MallocFloatsOnDevice(ref d_rands, _area);
    return status;
}
/// <summary>
/// Records the grid geometry, creates the helper objects and allocates the
/// two grids, the energy buffers, the random buffer and the beta buffer on
/// the device. Both grids receive a copy of <paramref name="inputs"/>.
/// </summary>
/// <param name="inputs">Host values copied into <c>d_gridA</c> and <c>d_gridB</c>.</param>
/// <param name="span">Side length of the square grid.</param>
/// <returns>The concatenated status strings of every device call.</returns>
public static string Init(int[] inputs, uint span)
{
    _span = span;
    _area = _span * _span;

    // Device pointers start out as zero.
    d_rands = IntPtr.Zero;
    d_gridA = IntPtr.Zero;
    d_gridB = IntPtr.Zero;
    d_energy = IntPtr.Zero;
    d_energyBlocks = IntPtr.Zero;

    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();
    _randoProcs = new RandoProcs();

    // Device setup; the call order is kept exactly as it was.
    string status = _cudaArray.ResetDevice();
    status += _randoProcs.MakeGenerator64(SEED);
    status += _cudaArray.MallocIntsOnDevice(ref d_gridA, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_energy, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_energyBlocks, _area / 1024);
    status += _cudaArray.MallocIntsOnDevice(ref d_gridB, _area);
    status += _cudaArray.CopyIntsToDevice(inputs, d_gridA, _area);
    status += _cudaArray.CopyIntsToDevice(inputs, d_gridB, _area);
    status += _cudaArray.MallocFloatsOnDevice(ref d_rands, _area);
    status += _cudaArray.MallocFloatsOnDevice(ref d_betas, 10);
    return status;
}
/// <summary>
/// Validates the axis counts, allocates the GPU buffers, configures the kernel
/// manager and runs the <c>dlInitContext</c> kernel.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The number of attribute axes or categorical axes exceeds the matching limit
/// in <c>GPUConstants</c>.
/// </exception>
public void Initialize()
{
    if (NumAttributeAxes > GPUConstants.MaxAttributeAxes ||
        NumCategoricalAxes > GPUConstants.MaxCategoricalAxes)
    {
        // The old message ("Too attribute axes") was garbled and ignored the
        // categorical case, which this guard also covers.
        throw new InvalidOperationException("Too many attribute or categorical axes");
    }

    ContextBuffer = new CudaArray <GPUDecisionLearnerContext>(1);
    DataPointBuffer = DataPointsToGpu(DataPoints);
    DataPointIds = new CudaArray <int>(DataPoints.Count);
    Nodes = new CudaArray <GPUNode>(GPUConstants.MaxTotalNodes);
    OpenNodeIds = new CudaArray <int>(GPUConstants.MaxNodesAtSingleLevel);
    NextOpenNodeIds = new CudaArray <int>(GPUConstants.MaxNodesAtSingleLevel);

    // ContextBuffer is registered as a prefix argument of the kernel manager.
    Kernels = new KernelManager(CudaManager)
    {
        BlockSize = GPUConstants.NumThreadsPerBlock,
        PrefixArguments = new[] { ContextBuffer },
    };

    Kernels["dlInitContext"].Arguments(
        DataPointBuffer,
        DataPoints.Count,
        TotalWeight,
        NumAttributeAxes,
        NumCategoricalAxes,
        DataPointIds,
        Nodes,
        OpenNodeIds,
        NextOpenNodeIds,
        (int)OpenNodeIds.Size).ExecuteTask();
}
/// <summary>
/// Runs the <c>spcApplyOptimalSplit</c> kernel over the per-axis data points
/// using the supplied best splits.
/// </summary>
/// <param name="bestSplits">Device array of best splits passed to the kernel.</param>
public void ApplyOptimalSplit(CudaArray <GPUSplit> bestSplits)
{
    var threadCount = GPUConstants.NumThreadsPerBlock * NumSmallBlocks;
    var applyKernel = Kernels["spcApplyOptimalSplit"];

    applyKernel
        .Arguments(_dataPointsPerAxis, bestSplits, _sortKeys)
        .Execute(threadCount);

#if DEBUGCUDA
    // Debug builds verify the resulting partition on the host.
    SanityCheckSplit(bestSplits);
#endif
}
/// <summary>
/// Runs the two categorical-split kernels: first <c>spcCopyDataPointsPerAxis</c>,
/// then <c>spcBestCategoricalSplitPerAxis</c>.
/// </summary>
/// <param name="bestSplits">Device array passed to the second kernel.</param>
/// <param name="writeSplitsStart">Start offset passed to the second kernel.</param>
/// <returns><c>Context.NumCategoricalAxes</c>.</returns>
public int FindOptimalSplit(CudaArray <GPUSplit> bestSplits, int writeSplitsStart)
{
    var threadsPerBlock = GPUConstants.NumThreadsPerBlock;
    var totalThreads = NumSmallBlocks * threadsPerBlock;

    var copyKernel = Kernels["spcCopyDataPointsPerAxis"];
    copyKernel
        .Arguments(_dataPointsPerAxis)
        .Execute(totalThreads, threadsPerBlock);

    var bestSplitKernel = Kernels["spcBestCategoricalSplitPerAxis"];
    bestSplitKernel
        .Arguments(_dataPointsPerAxis, bestSplits, writeSplitsStart)
        .Execute(totalThreads, threadsPerBlock);

    return Context.NumCategoricalAxes;
}
/// <summary>
/// Entry point: invokes the two error-path helpers on <c>CudaArray</c>, then
/// runs the copy tests in order and prints each result string.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var cuda = new CudaArray();

    // The return values of these two calls are not used.
    cuda.TestRuntimeErr();
    cuda.TestCudaStatusErr();

    Func<string>[] copyTests =
    {
        TestCopyIntsToDevice,
        TestCopyFloatsToDevice,
        TestCopyIntsDeviceToDevice,
        TestCopyFloatsDeviceToDevice,
    };

    foreach (Func<string> runTest in copyTests)
    {
        Console.WriteLine(runTest());
    }
}
/// <summary>
/// Replaces the palette image with the pixels of <paramref name="palette"/>
/// and rebinds it to the palette texture of the iterate kernel.
/// </summary>
/// <param name="palette">Source palette; width and height must both be positive.</param>
/// <exception cref="ArgumentException">The palette has a non-positive width or height.</exception>
public override void ApplyPalette(Palette palette)
{
    // Wait for outstanding work before the current palette image is freed.
    context.Synchronize();
    mainStream.Synchronize();

    if (palette.Width <= 0 || palette.Height <= 0)
    {
        throw new ArgumentException("palette may not be empty.");
    }

    paletteImage.Free();
    paletteImage = CudaArray.Allocate(palette.Width, palette.Height, CudaArrayFormat.Float, 4);

    HostBuffer2D <Vec4> hostPaletteBuffer = HostBuffer2D <Vec4> .Alloc(palette.Width, palette.Height);
    try
    {
        // Convert each pixel to a float RGBA vector with alpha fixed at 1.
        for (int y = 0; y < palette.Height; y++)
        {
            for (int x = 0; x < palette.Width; x++)
            {
                Color col = palette.GetPixel(x, y);
                hostPaletteBuffer[y, x] = new Vec4(
                    (float)col.R / 255.0f,
                    (float)col.G / 255.0f,
                    (float)col.B / 255.0f,
                    1.0f);
            }
        }

        CudaMem.Copy(hostPaletteBuffer, paletteImage);
    }
    finally
    {
        // Free the staging buffer even when the fill or the copy throws;
        // previously it leaked on any exception.
        hostPaletteBuffer.Free();
    }

    paletteTex.Array = paletteImage;
    paletteTex.SetFormat(CudaArrayFormat.Float, 4);
    paletteTex.AddressModeX = TexAddressMode.Clamp;
    paletteTex.AddressModeY = TexAddressMode.Clamp;
    paletteTex.FilterMode = TexFilterMode.Linear;
    paletteTex.Flags = TexFlags.NormalizedCoordinates;
    iterateKernel.SetTexRef(paletteTex);

    context.Synchronize();
}
/// <summary>
/// Allocates a float buffer on the device sized to the host data of
/// <paramref name="gmIn"/>.
/// </summary>
/// <param name="gmOut">
/// On success, a new <c>GpuMatrix</c> with the same matrix and the new device
/// pointer, marked <c>DevHostState.HostIsNewer</c>; <c>null</c> on failure.
/// </param>
/// <param name="gmIn">Matrix whose data length determines the allocation size.</param>
/// <returns>An empty string on success, otherwise the error text.</returns>
public static string AllocateOnDevice(out GpuMatrix gmOut, GpuMatrix gmIn)
{
    IntPtr devicePointer = IntPtr.Zero;
    var allocator = new CudaArray();
    var elementCount = (uint)gmIn.Matrix.Data.Length;

    string status = allocator.MallocFloatsOnDevice(ref devicePointer, elementCount);

    if (String.IsNullOrEmpty(status))
    {
        gmOut = new GpuMatrix(
            matrix: gmIn.Matrix,
            devPtr: devicePointer,
            devHostState: DevHostState.HostIsNewer);
        return String.Empty;
    }

    gmOut = null;
    return status;
}
/// <summary>
/// Round-trip test: uploads 1000 ints to buffer A, copies A to B on the
/// device, downloads B and compares it with the original values.
/// </summary>
/// <returns>
/// "TestCopyIntsDeviceToDevice pass" on success; otherwise a string describing
/// the failure or the exception message.
/// </returns>
static string TestCopyIntsDeviceToDevice()
{
    string testName = "TestCopyIntsDeviceToDevice";
    uint arrayLen = 1000;
    var alist = Enumerable.Range(4, (int)arrayLen).ToArray();
    var aa = new CudaArray();
    IntPtr devDataA = IntPtr.Zero;
    IntPtr devDataB = IntPtr.Zero;
    var retlist = new int[(int)arrayLen];
    try
    {
        var res = aa.ResetDevice();
        res = res + aa.MallocIntsOnDevice(ref devDataA, arrayLen);
        res = res + aa.MallocIntsOnDevice(ref devDataB, arrayLen);
        res = res + aa.CopyIntsToDevice(alist, devDataA, arrayLen);
        res = res + aa.CopyIntsDeviceToDevice(devDataB, devDataA, arrayLen);
        res = res + aa.CopyIntsFromDevice(retlist, devDataB, arrayLen);

        // Clear each pointer once released so the finally block does not
        // release it a second time.
        res = res + aa.ReleaseDevicePtr(devDataA);
        devDataA = IntPtr.Zero;
        res = res + aa.ReleaseDevicePtr(devDataB);
        devDataB = IntPtr.Zero;

        // Report device errors first: a failed call normally also causes a
        // mismatch, and the mismatch message would hide the real cause.
        if (res != String.Empty)
        {
            return(testName + " fail: " + res);
        }
        if (!alist.SequenceEqual(retlist))
        {
            return(testName + " fail: sequences do not match");
        }
        return(testName + " pass");
    }
    catch (Exception ex)
    {
        return(testName + " exception " + ex.Message);
    }
    finally
    {
        // Only buffers still allocated (an exception occurred) need releasing.
        if (devDataA != IntPtr.Zero)
        {
            aa.ReleaseDevicePtr(devDataA);
        }
        if (devDataB != IntPtr.Zero)
        {
            aa.ReleaseDevicePtr(devDataB);
        }
        aa.ResetDevice();
    }
}
/// <summary>
/// Records the grid geometry, creates the helper objects and allocates the
/// input and output buffers on the device; <paramref name="inputs"/> is
/// copied into the input buffer.
/// </summary>
/// <param name="inputs">Host values copied into <c>d_In</c>.</param>
/// <param name="span">Side length of the square grid.</param>
/// <returns>The concatenated status strings of the device calls.</returns>
public static string Init(int[] inputs, uint span)
{
    _span = span;
    _area = _span * _span;
    _backwards = false;

    // Device pointers start out as zero.
    d_In = IntPtr.Zero;
    d_Out = IntPtr.Zero;

    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();

    string status = _cudaArray.MallocIntsOnDevice(ref d_In, _area);
    status += _cudaArray.CopyIntsToDevice(inputs, d_In, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_Out, _area);
    return status;
}
/// <summary>
/// Host-side consistency check of the categorical splits: for every open node
/// whose best split is categorical, verifies that the children partition the
/// parent's range and that each data point lies in the child range matching
/// the side it was sent to.
/// </summary>
/// <param name="bestSplits">Device array of best splits, read back to the host.</param>
private void SanityCheckSplit(CudaArray <GPUSplit> bestSplits)
{
    GPUSplit[] bestSplitsArray = bestSplits.Read();
    GPUDecisionLearnerContext context = Context.ContextBuffer.Read()[0];
    int[] dataPointIds = Context.DataPointIds.Read();

    for (int openNodeIndex = 0; openNodeIndex < context.NumOpenNodes; ++openNodeIndex)
    {
        int nodeId = Context.OpenNodeIds[openNodeIndex];
        if (nodeId == -1)
        {
            continue;
        }

        GPUSplit bestSplit = bestSplitsArray[openNodeIndex];
        if (bestSplit.SplitType != GPUConstants.SplitType_Categorical)
        {
            continue;
        }

        GPUNode parentNode = Context.Nodes[nodeId];
        GPUNode leftNode = Context.Nodes[parentNode.LeftChild];
        GPUNode rightNode = Context.Nodes[parentNode.RightChild];

        // The children must tile the parent's range: left first, then right.
        Assert.AreEqual(parentNode.RangeStart, leftNode.RangeStart);
        Assert.AreEqual(leftNode.RangeStart + leftNode.RangeLength, rightNode.RangeStart);
        Assert.AreEqual(parentNode.RangeLength, leftNode.RangeLength + rightNode.RangeLength);

        for (int i = 0; i < parentNode.RangeLength; ++i)
        {
            int index = parentNode.RangeStart + i;
            IDataPoint dataPoint = Context.DataPoints[dataPointIds[index]];
            bool goRight = (dataPoint.Categories[bestSplit.Axis] & bestSplit.SplitCategories) != 0;

            // Ranges are half-open [start, start + length). The upper bound
            // must be exclusive, otherwise a left point sitting on the first
            // slot of the right range would wrongly pass.
            if (goRight)
            {
                Assert.IsTrue(rightNode.RangeStart <= index &&
                              index < rightNode.RangeStart + rightNode.RangeLength);
            }
            else
            {
                Assert.IsTrue(leftNode.RangeStart <= index &&
                              index < leftNode.RangeStart + leftNode.RangeLength);
            }
        }
    }
}
/// <summary>
/// One-time setup: initializes the context, allocates the split buffers,
/// initializes both splitters and synchronizes the CUDA context. Calls after
/// the first return immediately.
/// </summary>
public void Initialize()
{
    if (_initialized)
    {
        return;
    }
    _initialized = true;

    Context.Initialize();

    // Both split buffers are sized from the open-node capacity, which is
    // available once the context has been initialized.
    var openNodeCapacity = Context.OpenNodeIds.Size;
    _allSplits = new CudaArray <GPUSplit>(GPUConstants.MaxSplits * openNodeCapacity);
    _bestSplits = new CudaArray <GPUSplit>(openNodeCapacity);

    _attributeSplitter.Initialize();
    _categoricalSplitter.Initialize();

    Context.CudaManager.Context.Synchronize();
}
/// <summary>
/// Checks <c>sumToOutputKernel</c> for every power-of-two combination of block
/// count and threads per block from 1 to 1024: the output is zeroed, the sum
/// is computed on the device and compared with the host-side sum.
/// </summary>
public void TestSum()
{
    int[] values = Enumerable.Range(999, 10000).ToArray();
    int expectedTotal = values.Sum();

    using (CudaManager cudaManager = Provider.CudaManagerPool.GetCudaManagerForThread(Provider.Logger))
    using (CudaArray <int> valuesBuffer = values)
    using (CudaArray <int> resultBuffer = new[] { 0 })
    {
        KernelManager kernels = new KernelManager(cudaManager);

        for (int blockCount = 1; blockCount <= 1024; blockCount <<= 1)
        {
            for (int blockThreads = 1; blockThreads <= 1024; blockThreads <<= 1)
            {
                // Zero the accumulator and confirm it before summing.
                kernels["setIntKernel"].Arguments(resultBuffer, 0).ExecuteTask();
                Assert.AreEqual(0, resultBuffer.Read()[0]);

                kernels["sumToOutputKernel"]
                    .Arguments(valuesBuffer, values.Length, resultBuffer, SharedBuffer.Ints(blockThreads))
                    .Execute(blockCount * blockThreads, blockThreads);
                Assert.AreEqual(expectedTotal, resultBuffer.Read()[0]);
            }
        }
    }
}
/// <summary>
/// Records the grid geometry, creates the helper objects, allocates the device
/// buffers and uploads the flip and temperature inputs.
/// </summary>
/// <param name="temp_inputs">Host floats copied into <c>d_tempData</c>.</param>
/// <param name="flip_inputs">Host ints copied into <c>d_flipData</c>.</param>
/// <param name="span">Side length of the square grid.</param>
/// <param name="blockSize">Divides <paramref name="span"/> to give the blocks per span.</param>
/// <param name="seed">Seed passed to <c>MakeGenerator32</c>.</param>
/// <returns>The concatenated status strings of every device call.</returns>
public static string Init(float[] temp_inputs, int[] flip_inputs, uint span, uint blockSize, int seed)
{
    // Geometry.
    _span = span;
    _block_size = blockSize;
    _area = _span * _span;
    _blocks_per_span = span / blockSize;
    _blockCount = _blocks_per_span * _blocks_per_span;

    // Device pointers start out as zero.
    d_flipData = IntPtr.Zero;
    d_tempData = IntPtr.Zero;
    d_flipRands = IntPtr.Zero;
    d_indexRands = IntPtr.Zero;
    d_threshes = IntPtr.Zero;

    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();
    _randoProcs = new RandoProcs();

    // Device setup; the call order is kept exactly as it was.
    string status = _cudaArray.ResetDevice();
    status += _randoProcs.MakeGenerator32(seed);
    status += _cudaArray.MallocFloatsOnDevice(ref d_tempData, _area);
    status += _cudaArray.MallocFloatsOnDevice(ref d_heatBlocks, _area / 1024);
    status += _cudaArray.MallocIntsOnDevice(ref d_flipData, _area);
    status += _cudaArray.MallocIntsOnDevice(ref d_indexRands, _blockCount);
    status += _cudaArray.MallocIntsOnDevice(ref d_flipRands, _blockCount);
    status += _cudaArray.MallocFloatsOnDevice(ref d_threshes, _allTempSteps);
    status += _cudaArray.CopyIntsToDevice(flip_inputs, d_flipData, _area);
    status += _cudaArray.CopyFloatsToDevice(temp_inputs, d_tempData, _area);

    // Read the flip data back into a scratch array; the values are discarded,
    // only the status string of the copy is kept.
    var readBack = new int[_area];
    status += _cudaArray.CopyIntsFromDevice(readBack, d_flipData, _area);
    return status;
}
/// <summary>
/// Records the grid geometry, creates the helper objects and allocates two
/// float grids on the device, both filled from <paramref name="inputs"/>.
/// </summary>
/// <param name="inputs">Host floats copied into <c>d_gridA</c> and <c>d_gridB</c>.</param>
/// <param name="span">Side length of the square grid.</param>
/// <returns>The concatenated status strings of every device call.</returns>
public static string Init(float[] inputs, uint span)
{
    _span = span;
    _area = _span * _span;

    // Device pointers start out as zero.
    d_gridA = IntPtr.Zero;
    d_gridB = IntPtr.Zero;

    _cudaArray = new CudaArray();
    _gridProcs = new GridProcs();
    _randoProcs = new RandoProcs();

    string status = _cudaArray.ResetDevice();
    status += _cudaArray.MallocFloatsOnDevice(ref d_gridA, _area);
    status += _cudaArray.MallocFloatsOnDevice(ref d_gridB, _area);
    status += _cudaArray.CopyFloatsToDevice(inputs, d_gridA, _area);
    status += _cudaArray.CopyFloatsToDevice(inputs, d_gridB, _area);
    return status;
}
/// <summary>
/// Creates a generator, fills a 1000-element device buffer through
/// <c>MakeNormalRands</c> (arguments 0.0 and 1.0), downloads it and releases
/// the resources.
/// </summary>
/// <returns>
/// "TestMakeNormalRands pass" when every call returned an empty status string;
/// otherwise a string starting with "TestMakeNormalRands fail".
/// </returns>
static string TestMakeNormalRands()
{
    string testName = "TestMakeNormalRands";
    var rdo = new RandoClr.RandoProcs();
    var aa = new CudaArray();
    uint arrayLen = 1000;
    int seed = 1234;
    IntPtr devRando = IntPtr.Zero;
    IntPtr devData = IntPtr.Zero;
    var retlist = new float[(int)arrayLen];
    try
    {
        var res = aa.ResetDevice();
        res = res + rdo.MakeGenerator64(ref devRando, seed);
        res = res + aa.MallocFloatsOnDevice(ref devData, arrayLen);
        res = res + rdo.MakeNormalRands(devData, devRando, arrayLen, 0.0f, 1.0f);
        res = res + aa.CopyFloatsFromDevice(retlist, devData, arrayLen);

        // Clear the pointer once released so the finally block does not
        // release it a second time.
        res = res + aa.ReleaseDevicePtr(devData);
        devData = IntPtr.Zero;

        res = res + rdo.DestroyGenerator(devRando);
        if (res != String.Empty)
        {
            return(testName + " fail: " + res);
        }
        return(testName + " pass");
    }
    catch (Exception ex)
    {
        // Report why the test failed instead of silently swallowing the exception.
        return(testName + " fail: " + ex.Message);
    }
    finally
    {
        // Only release the buffer if an exception skipped the release above.
        if (devData != IntPtr.Zero)
        {
            aa.ReleaseDevicePtr(devData);
        }
        aa.ResetDevice();
    }
}
/// <summary>
/// Runs the attribute-split kernels in order: copy, sort, accumulate
/// frequencies and best split per axis.
/// </summary>
/// <param name="splits">Device array passed to the best-split kernel.</param>
/// <param name="writeSplitsStart">Start offset passed to the best-split kernel.</param>
/// <returns><c>Context.NumAttributeAxes</c>.</returns>
public int FindOptimalSplit(CudaArray <GPUSplit> splits, int writeSplitsStart)
{
    var threadsPerBlock = GPUConstants.NumThreadsPerBlock;
    var totalThreads = threadsPerBlock * NumSmallBlocks;

    Kernels["spaCopyDataPointsPerAxis"]
        .Arguments(_sortedDataPointsPerAxis, _sortKeysPerAxis)
        .Execute(totalThreads, threadsPerBlock);

    Kernels["spaSortDataPointsPerAxis"]
        .Arguments(_sortedDataPointsPerAxis, _sortKeysPerAxis)
        .Execute(totalThreads, threadsPerBlock);

    Kernels["spaAccumulateFrequenciesPerAxis"]
        .Arguments(_sortedDataPointsPerAxis, _cumulativeFrequenciesPerAxis)
        .Execute(totalThreads, threadsPerBlock);

#if DEBUGCUDA
    // Debug builds validate the accumulated frequencies on the host.
    SanityCheckCumulativeFrequencies();
#endif

    Kernels["spaBestSplitPerAxis"]
        .Arguments(_sortedDataPointsPerAxis, _cumulativeFrequenciesPerAxis, splits, writeSplitsStart)
        .Execute(totalThreads, threadsPerBlock);

    return Context.NumAttributeAxes;
}
/// <summary>
/// Releases the device buffer held by <paramref name="gmIn"/>.
/// </summary>
/// <param name="gmOut">
/// On success, a new <c>GpuMatrix</c> with the same matrix, a zero device
/// pointer and state <c>DevHostState.DeviceNotAllocated</c>; <c>null</c> on failure.
/// </param>
/// <param name="gmIn">Matrix whose device pointer is released.</param>
/// <returns>An empty string on success, otherwise the error text.</returns>
public static string ClearOnDevice(out GpuMatrix gmOut, GpuMatrix gmIn)
{
    if (gmIn.DevHostState == DevHostState.DeviceNotAllocated)
    {
        gmOut = null;
        return "Device data pointer already cleared";
    }

    var device = new CudaArray();
    string status = device.ReleaseDevicePtr(gmIn.DevPtr);

    if (String.IsNullOrEmpty(status))
    {
        gmOut = new GpuMatrix(
            matrix: gmIn.Matrix,
            devPtr: IntPtr.Zero,
            devHostState: DevHostState.DeviceNotAllocated);
        return String.Empty;
    }

    gmOut = null;
    return status;
}
/// <summary>
/// Allocates the device buffers of the categorical splitter: the per-axis
/// data points, the class frequencies per category and the sort keys.
/// </summary>
public void Initialize()
{
    var pointCount = Context.DataPoints.Count;

    _dataPointsPerAxis =
        new CudaArray <GPUCategoricalDataPoint>(pointCount * GPUConstants.MaxCategoricalAxes);
    _classFrequenciesPerCategory =
        new CudaArray <float>(GPUConstants.MaxClasses * GPUConstants.MaxCategoricalAxes);
    _sortKeys = new CudaArray <byte>(pointCount);
}
/// <summary>
/// Creates an engine on the given device: creates a context, loads the PTX
/// module, looks up the kernels, allocates the iterator and statistics
/// buffers, uploads random seeds, creates the timing events, configures the
/// launch shapes and runs the iterator-initialization kernel.
/// </summary>
/// <param name="device">Device to run on; must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="device"/> is <c>null</c>. This type derives from
/// <see cref="ArgumentException"/>, so existing handlers still catch it.
/// </exception>
internal CudaFractalEngine(Device device)
{
    if (device == null)
    {
        throw new ArgumentNullException("device", "Invalid device passed to CudaFractalEngine.");
    }

    this.device = device;
    context = device.CreateContext();
    iterBlockCount = Util.Clamp(device.MultiprocessorCount * 2, 2, 64);

    // Load the embedded PTX and look up the kernels.
    System.IO.MemoryStream ptxStream = new System.IO.MemoryStream(Kernels.KernelResources.kernels_ptx);
    module = context.LoadModule(ptxStream);
    initIteratorsKernel = module.GetKernel("init_iterators_kernel");
    resetIteratorsKernel = module.GetKernel("reset_iterators_kernel");
    iterateKernel = module.GetKernel("iterate_kernel");
    updateStatsKernel = module.GetKernel("update_stats_kernel");
    resetOutputKernel = module.GetKernel("reset_output_kernel");
    updateOutputKernel = module.GetKernel("update_output_kernel");

    glOutputBufferID = 0;
    mainStream = new Cuda.Stream();

    // Per-iterator state, statistics and entropy buffers.
    iterPosStateBuffer = DeviceBuffer.Alloc(8, IteratorCount);
    iterColorStateBuffer = DeviceBuffer.Alloc(8, IteratorCount);
    iterStatBuffer = DeviceBuffer.Alloc(Marshal.SizeOf(typeof(NativeIterStatEntry)), IteratorCount);
    globalStatBuffer = DeviceBuffer.Alloc(Marshal.SizeOf(typeof(NativeGlobalStatEntry)), 1);
    entropyXBuffer = DeviceBuffer.Alloc(16, IteratorCount);
    entropyCBuffer = DeviceBuffer.Alloc(4, IteratorCount);
    entropySeedBuffer = DeviceBuffer.Alloc(4, IteratorCount);

    // One random seed below 65536 per iterator.
    uint[] seeds = new uint[IteratorCount];
    for (int i = 0; i < IteratorCount; i++)
    {
        seeds[i] = (uint)rand.Next(65536);
    }
    CudaMem.Copy(seeds, entropySeedBuffer);

    paletteImage = CudaArray.Null;
    paletteTex = module.GetTexRef("paletteTex");

    resetBeginEvt = new Event();
    resetEndEvt = new Event();
    cycleIterEvt = new Event();
    cycleStatEvt = new Event();
    cycleEndEvt = new Event();
    toneBeginEvt = new Event();
    toneEndEvt = new Event();

    // Launch configuration: the iterator kernels use the iterator grid,
    // the statistics kernel runs as a single thread.
    initIteratorsKernel.SetBlockShape(IterBlockSize, 1, 1);
    initIteratorsKernel.SetGridDim(IterBlockCount, 1);
    initIteratorsKernel.SetSharedSize(0);

    resetIteratorsKernel.SetBlockShape(IterBlockSize, 1, 1);
    resetIteratorsKernel.SetGridDim(IterBlockCount, 1);
    resetIteratorsKernel.SetSharedSize(0);

    iterateKernel.SetBlockShape(IterBlockSize, 1, 1);
    iterateKernel.SetGridDim(IterBlockCount, 1);
    iterateKernel.SetSharedSize(0);

    updateStatsKernel.SetBlockShape(1, 1, 1);
    updateStatsKernel.SetGridDim(1, 1);
    updateStatsKernel.SetSharedSize(0);

    initIteratorsKernel.Launch(
        entropyXBuffer.Ptr.RawPtr,
        entropyCBuffer.Ptr.RawPtr,
        entropySeedBuffer.Ptr.RawPtr);
    context.Synchronize();
}
/// <summary>
/// Multiplies a 5x5 identity matrix with the matrix from
/// <c>MatrixUtils.MakeIdentiPoke</c> through <c>GpuMatrixOps.Multiply</c> and
/// copies the product back to the host.
/// </summary>
/// <returns>The concatenated status strings of every device call.</returns>
static string TestcublasSgemm1()
{
    uint aw = 5;
    uint bh = aw;
    uint ah = 5;
    uint bw = 5;
    uint ch = ah;
    uint cw = bw;

    var cuby = new CublasClr.Cublas();
    var aa = new CudaArray();
    var res = aa.ResetDevice();

    var dataA = MatrixUtils.MakeIdentity(rows: ah, cols: aw);
    GpuMatrix gpuA;
    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuA,
        new Matrix <float>(_rows: ah, _cols: aw,
                           host_data: ImmutableArray.Create(dataA),
                           matrixFormat: MatrixFormat.Column_Major));

    var dataB = MatrixUtils.MakeIdentiPoke(rows: bh, cols: bw);
    GpuMatrix gpuB;
    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuB,
        new Matrix <float>(_rows: bh, _cols: bw,
                           host_data: ImmutableArray.Create(dataB),
                           matrixFormat: MatrixFormat.Column_Major));

    // Size the zero buffer with C's own dimensions so it matches the matrix
    // declared below (it used B's dimensions, which only agree while all
    // sizes are equal).
    var dataC = MatrixUtils.MakeZeroes(rows: ch, cols: cw);
    GpuMatrix gpuC;
    res = res + GpuMatrixOps.SetupGpuMatrix(
        out gpuC,
        new Matrix <float>(_rows: ch, _cols: cw,
                           host_data: ImmutableArray.Create(dataC),
                           matrixFormat: MatrixFormat.Column_Major));

    IntPtr cublasHandle = new IntPtr();
    res = res + cuby.MakeCublasHandle(ref cublasHandle);

    GpuMatrix gpuProd;
    res = res + GpuMatrixOps.Multiply(
        gmOut: out gpuProd,
        cublasHandle: cublasHandle,
        gmA: gpuA, gmB: gpuB, gmC: gpuC);

    GpuMatrix gpuSynched;
    res = res + GpuMatrixOps.CopyToHost(out gpuSynched, gpuProd);

    // Release the cuBLAS handle; it was previously leaked.
    if (cublasHandle != IntPtr.Zero)
    {
        res = res + cuby.DestroyCublasHandle(cublasHandle);
    }

    // NOTE(review): these host-side products are computed but never compared
    // with gpuSynched, so the test only reports device status strings.
    var cpuRes = new float[ah * bw];
    MatrixUtils.RowMajorMatrixMult(C: cpuRes, A: dataA, B: dataB, wA: aw, hA: ah, wB: bw);

    var cpuRes2 = new float[bh * aw];
    MatrixUtils.RowMajorMatrixMult(C: cpuRes2, A: dataB, B: dataA, wA: bw, hA: bh, wB: aw);

    return(res);
}
/// <summary>
/// Allocates the device buffers of the attribute splitter. Each holds one
/// entry per data point per attribute axis; the cumulative-frequency buffer
/// additionally multiplies that by <c>GPUConstants.MaxClasses</c>.
/// </summary>
public void Initialize()
{
    var entriesPerBuffer = Context.DataPoints.Count * Context.NumAttributeAxes;

    _sortedDataPointsPerAxis = new CudaArray <GPUAttributeDataPoint>(entriesPerBuffer);
    _sortKeysPerAxis = new CudaArray <float>(entriesPerBuffer);
    _cumulativeFrequenciesPerAxis =
        new CudaArray <float>(entriesPerBuffer * GPUConstants.MaxClasses);
}
/// <summary>
/// Creates an engine on the first device in <c>Device.Devices</c>: creates a
/// context, loads the PTX module, looks up the kernels, allocates the device
/// buffers and publishes each to the module through <c>WriteConstant</c>,
/// uploads random seeds, creates the timing events, configures the launch
/// shapes and runs the iterator-initialization kernel.
/// </summary>
public CudaFractalEngine()
{
    device = Device.Devices[0];
    context = device.CreateContext();
    iterBlockCount = Util.Clamp(device.MultiprocessorCount * 2, 2, 64);

    // Load the PTX from the embedded resource bytes and look up the kernels.
    System.IO.MemoryStream ptxBytes = new System.IO.MemoryStream(CudaResources.kernels_ptx);
    module = context.LoadModule(ptxBytes);
    initIteratorsKernel = module.GetKernel("init_iterators_kernel");
    resetIteratorsKernel = module.GetKernel("reset_iterators_kernel");
    iterateKernel = module.GetKernel("iterate_kernel");
    updateStatsKernel = module.GetKernel("update_stats_kernel");
    resetOutputKernel = module.GetKernel("reset_output_kernel");
    updateOutputKernel = module.GetKernel("update_output_kernel");

    glOutputBufferID = 0;
    mainStream = new Cuda.Stream();

    // Iterator state buffers.
    iterPosStateBuffer = DeviceBuffer2D.Alloc(8, IterBlockSize, IterBlockCount);
    module.WriteConstant("iterPosStateBuffer", iterPosStateBuffer);
    iterColorStateBuffer = DeviceBuffer2D.Alloc(16, IterBlockSize, IterBlockCount);
    module.WriteConstant("iterColorStateBuffer", iterColorStateBuffer);

    // Entropy buffers.
    entropyXBuffer = DeviceBuffer2D.Alloc(16, IterBlockSize, IterBlockCount);
    module.WriteConstant("entropyXBuffer", entropyXBuffer);
    entropyCBuffer = DeviceBuffer2D.Alloc(4, IterBlockSize, IterBlockCount);
    module.WriteConstant("entropyCBuffer", entropyCBuffer);
    entropySeedBuffer = DeviceBuffer2D.Alloc(4, IterBlockSize, IterBlockCount);
    module.WriteConstant("entropySeedBuffer", entropySeedBuffer);

    // One random seed below 65536 per cell, staged on the host and uploaded.
    HostBuffer2D <uint> seedStaging = HostBuffer2D <uint> .Alloc(IterBlockSize, IterBlockCount);
    for (int row = 0; row < IterBlockCount; row++)
    {
        for (int column = 0; column < IterBlockSize; column++)
        {
            seedStaging[row, column] = (uint)rand.Next(65536);
        }
    }
    CudaMem.Copy(seedStaging, entropySeedBuffer);
    seedStaging.Free();

    // Statistics buffers and raw device allocations.
    dotCountBuffer = DeviceBuffer2D.Alloc(8, IterBlockSize, IterBlockCount);
    module.WriteConstant("dotCountBuffer", dotCountBuffer);
    peakDensityBuffer = DeviceBuffer2D.Alloc(4, IterBlockSize, IterBlockCount);
    module.WriteConstant("peakDensityBuffer", peakDensityBuffer);

    totalIterCountMem = DevicePtr.AllocRaw(8);
    module.WriteConstant("totalIterCountMem", totalIterCountMem);
    totalDotCountMem = DevicePtr.AllocRaw(8);
    module.WriteConstant("totalDotCountMem", totalDotCountMem);
    densityMem = DevicePtr.AllocRaw(4);
    module.WriteConstant("densityMem", densityMem);
    peakDensityMem = DevicePtr.AllocRaw(4);
    module.WriteConstant("peakDensityMem", peakDensityMem);
    scaleConstantMem = DevicePtr.AllocRaw(4);
    module.WriteConstant("scaleConstantMem", scaleConstantMem);

    paletteImage = CudaArray.Null;
    paletteTex = module.GetTexRef("paletteTex");

    resetBeginEvt = new Event();
    resetEndEvt = new Event();
    cycleIterEvt = new Event();
    cycleStatEvt = new Event();
    cycleEndEvt = new Event();
    toneBeginEvt = new Event();
    toneEndEvt = new Event();

    // Launch configuration: the iterator kernels use the iterator grid,
    // the statistics kernel runs as a single thread.
    initIteratorsKernel.SetBlockShape(IterBlockSize, 1, 1);
    initIteratorsKernel.SetGridDim(IterBlockCount, 1);
    initIteratorsKernel.SetSharedSize(0);

    resetIteratorsKernel.SetBlockShape(IterBlockSize, 1, 1);
    resetIteratorsKernel.SetGridDim(IterBlockCount, 1);
    resetIteratorsKernel.SetSharedSize(0);

    iterateKernel.SetBlockShape(IterBlockSize, 1, 1);
    iterateKernel.SetGridDim(IterBlockCount, 1);
    iterateKernel.SetSharedSize(0);

    updateStatsKernel.SetBlockShape(1, 1, 1);
    updateStatsKernel.SetGridDim(1, 1);
    updateStatsKernel.SetSharedSize(0);

    initIteratorsKernel.Launch();
    context.Synchronize();
}