public static void For(int number_of_threads, SimpleKernel simpleKernel)
{
    if (Campy.Utils.Options.IsOn("import-only"))
    {
        JustImport(simpleKernel);
        return;
    }

    GCHandle handle1 = default(GCHandle);
    GCHandle handle2 = default(GCHandle);

    try
    {
        unsafe
        {
            System.Reflection.MethodInfo method_info = simpleKernel.Method;
            String kernel_assembly_file_name = method_info.DeclaringType.Assembly.Location;
            Mono.Cecil.ModuleDefinition md = Campy.Meta.StickyReadMod.StickyReadModule(
                kernel_assembly_file_name, new ReaderParameters { ReadSymbols = true });
            MethodReference method_reference = md.ImportReference(method_info);

            CUfunction ptr_to_kernel = default(CUfunction);
            CUmodule module = default(CUmodule);

            Campy.Utils.TimePhase.Time("compile ", () =>
            {
                IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target);
                module = Singleton._compiler.SetModule(method_reference, image);
                Singleton._compiler.StoreJits(module);
                ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module);
            });

            RUNTIME.BclCheckHeap();

            BUFFERS buffer = Singleton.Buffer;
            IntPtr kernel_target_object = IntPtr.Zero;

            Campy.Utils.TimePhase.Time("deep copy ", () =>
            {
                int count = simpleKernel.Method.GetParameters().Length;
                var bb = Singleton._compiler.GetBasicBlock(method_reference);
                if (bb.HasThis)
                {
                    count++;
                }
                if (!(count == 1 || count == 2))
                {
                    throw new Exception("Expecting one or two parameters for kernel.");
                }
                if (bb.HasThis)
                {
                    kernel_target_object = buffer.AddDataStructure(simpleKernel.Target);
                }
            });

            Campy.Utils.TimePhase.Time("kernel cctor set up", () =>
            {
                // For each cctor, run on GPU.
                // Construct dependency graph of methods.
                List<MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder();

                // Finally, call cctors.
                foreach (var bb in order_list)
                {
                    if (Campy.Utils.Options.IsOn("trace-cctors"))
                    {
                        System.Console.WriteLine("Executing cctor " + bb.FullName);
                    }
                    var cctor = Singleton._compiler.GetCudaFunction(bb, module);
                    var res = CUresult.CUDA_SUCCESS;
                    Campy.Utils.CudaHelpers.MakeLinearTiling(1,
                        out Campy.Utils.CudaHelpers.dim3 tile_size,
                        out Campy.Utils.CudaHelpers.dim3 tiles);
                    res = Cuda.cuLaunchKernel(
                        cctor,
                        tiles.x, tiles.y, tiles.z,             // grid has one block.
                        tile_size.x, tile_size.y, tile_size.z, // n threads.
                        0,                                     // no shared memory.
                        default(CUstream),
                        (IntPtr)IntPtr.Zero,
                        (IntPtr)IntPtr.Zero);
                    CudaHelpers.CheckCudaError(res);
                    res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                    CudaHelpers.CheckCudaError(res);
                }
            });

            if (Campy.Utils.Options.IsOn("trace-cctors"))
            {
                System.Console.WriteLine("Done with cctors");
            }

            Campy.Utils.TimePhase.Time("kernel call ", () =>
            {
                IntPtr[] parm1 = new IntPtr[1];
                IntPtr[] parm2 = new IntPtr[1];
                parm1[0] = kernel_target_object;
                parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int)));

                IntPtr[] x1 = parm1;
                handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned);
                IntPtr pointer1 = handle1.AddrOfPinnedObject();

                IntPtr[] x2 = parm2;
                handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned);
                IntPtr pointer2 = handle2.AddrOfPinnedObject();

                IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
                var res = CUresult.CUDA_SUCCESS;
                fixed (IntPtr* kernelParams = kp)
                {
                    Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads,
                        out Campy.Utils.CudaHelpers.dim3 tile_size,
                        out Campy.Utils.CudaHelpers.dim3 tiles);
                    //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);
                    res = Cuda.cuLaunchKernel(
                        ptr_to_kernel,
                        tiles.x, tiles.y, tiles.z,             // grid dimensions.
                        tile_size.x, tile_size.y, tile_size.z, // threads per block.
                        0,                                     // no shared memory.
                        default(CUstream),
                        (IntPtr)kernelParams,
                        (IntPtr)IntPtr.Zero);
                }
                CudaHelpers.CheckCudaError(res);
                res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                CudaHelpers.CheckCudaError(res);
            });

            // Copy modified data structures back to the host.
            Campy.Utils.TimePhase.Time("deep copy back ", () =>
            {
                buffer.SynchDataStructures();
            });
        }
    }
    finally
    {
        if (default(GCHandle) != handle1)
        {
            handle1.Free();
        }
        if (default(GCHandle) != handle2)
        {
            handle2.Free();
        }
    }
}
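// --- Usage sketch (editor's addition, not part of the original source) ---
// Campy compiles the kernel delegate to GPU code at the call site. Assuming
// SimpleKernel is the delegate `void SimpleKernel(int index)` and For is
// exposed as Campy.Parallel.For, a caller might look like this:
public static void SquaresExample()
{
    int n = 1024;
    int[] squares = new int[n];
    // Each of the n GPU threads writes one element; the closure over
    // `squares` is the "this" object that gets deep-copied to the device.
    Campy.Parallel.For(n, i => squares[i] = i * i);
}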
private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
{
    if (a.Strides[0] != 1)
    {
        throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
    }
    if (b.Strides[0] != 1)
    {
        throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
    }
    if (c.Strides[0] != 1)
    {
        throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
    }

    using (var blas = context.BlasForTensor(c))
    {
        bool nta = transA == BlasOp.NonTranspose;
        bool ntb = transB == BlasOp.NonTranspose;
        Operation transa = GetCudaBlasOp(transA);
        Operation transb = GetCudaBlasOp(transB);
        int m = (int)a.Sizes[nta ? 0 : 1];
        int k = (int)b.Sizes[ntb ? 0 : 1];
        int n = (int)b.Sizes[ntb ? 1 : 0];
        int lda = (int)a.Strides[1];
        int ldb = (int)b.Strides[1];
        int ldc = (int)c.Strides[1];

        if (c.ElementType == DType.Float32)
        {
            var aPtrSingle = CudaHelpers.GetBufferStart(a);
            var bPtrSingle = CudaHelpers.GetBufferStart(b);
            var cPtrSingle = CudaHelpers.GetBufferStart(c);

            var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle,
                transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb,
                ref beta, cPtrSingle, ldc);
            if (_statusF32 != CublasStatus.Success)
            {
                throw new CudaBlasException(_statusF32);
            }
        }
        else if (c.ElementType == DType.Float64)
        {
            var aPtrDouble = CudaHelpers.GetBufferStart(a);
            var bPtrDouble = CudaHelpers.GetBufferStart(b);
            var cPtrDouble = CudaHelpers.GetBufferStart(c);
            var alphaDouble = (double)alpha;
            var betaDouble = (double)beta;
            var _statusF64 = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle,
                transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb,
                ref betaDouble, cPtrDouble, ldc);
            if (_statusF64 != CublasStatus.Success)
            {
                throw new CudaBlasException(_statusF64);
            }
        }
        else
        {
            throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
        }
    }
}
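// --- Illustrative sketch (editor's addition; the values are assumptions) ---
// GemmOp expects column-major (Fortran-order) tensors: Strides[0] == 1, and
// Strides[1] is the leading dimension handed to cuBLAS as lda/ldb/ldc.
public static void ColumnMajorArgsExample()
{
    long[] sizes = { 3, 5 };   // a 3x5 matrix: m = 3 rows, k = 5 columns
    long[] strides = { 1, 3 }; // column-major: consecutive rows are adjacent in memory
    int lda = (int)strides[1]; // leading dimension = number of rows = 3
    System.Console.WriteLine($"m={sizes[0]}, k={sizes[1]}, lda={lda}");
}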
public static Tensor Mul_M_V(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
{
    if (lhs.ElementType != rhs.ElementType || (result != null && result.ElementType != lhs.ElementType))
    {
        throw new InvalidOperationException("All tensors must have the same element type");
    }
    CudaHelpers.ThrowIfDifferentDevices(result, lhs, rhs);

    if (result != null && !(result.Storage is CudaStorage))
    {
        throw new ArgumentException("result must be a CUDA tensor", "result");
    }
    if (!(lhs.Storage is CudaStorage))
    {
        throw new ArgumentException("lhs must be a CUDA tensor", "lhs");
    }
    if (!(rhs.Storage is CudaStorage))
    {
        throw new ArgumentException("rhs must be a CUDA tensor", "rhs");
    }
    if (lhs.DimensionCount != 2)
    {
        throw new ArgumentException("lhs must have 2 dimensions", "lhs");
    }
    if (rhs.DimensionCount != 1)
    {
        throw new ArgumentException("rhs must have 1 dimension (i.e. be a vector)", "rhs");
    }

    Tensor lhsClone;
    if (lhs.Strides[1] == 1) // If lhs is already row-major, do nothing.
    {
        lhsClone = lhs.CopyRef();
    }
    else if (lhs.Strides[0] == 1) // If lhs is column-major, transpose it.
    {
        lhsClone = lhs.Transpose();
    }
    else // If lhs is not contiguous in either dimension, make a temporary contiguous copy.
    {
        lhsClone = Ops.NewContiguous(lhs);
    }

    var writeTarget = TensorResultBuilder.GetWriteTarget(result, rhs, false, lhs.Sizes[0]);
    try
    {
        if (writeTarget.ElementType == DType.Float32)
        {
            Run_M_V_float(context, writeTarget, lhsClone, rhs);
        }
        else if (writeTarget.ElementType == DType.Float64)
        {
            Run_M_V_double(context, writeTarget, lhsClone, rhs);
        }
        else
        {
            throw new NotSupportedException("CUDA Matrix-Vector multiplication with element type " + writeTarget.ElementType + " not supported");
        }
    }
    finally
    {
        lhsClone.Dispose();
    }

    return(writeTarget);
}
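// --- Illustrative sketch (editor's addition) --- The three stride cases
// above, restated as a standalone classifier for a 2-D lhs of size
// [rows, cols]; the helper name is hypothetical.
public static string ClassifyLhsLayout(long[] strides)
{
    if (strides[1] == 1) return "row-major: use directly";           // e.g. {cols, 1}
    if (strides[0] == 1) return "column-major: use transposed view";  // e.g. {1, rows}
    return "neither contiguous: make a temporary contiguous copy";    // e.g. a sliced view
}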
public override void Init()
{
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();

    base.Init();

    float[] vecVals;
    int[] vecColIdx;
    int[] vecLenght;
    int align = preFetch;
    CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements, align);
    //CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cuda initialization

    InitCudaModule();

    // Copy data to the device and set CUDA function parameters.
    valsPtr = cuda.CopyHostToDevice(vecVals);
    idxPtr = cuda.CopyHostToDevice(vecColIdx);
    vecLengthPtr = cuda.CopyHostToDevice(vecLenght);
    labelsPtr = cuda.CopyHostToDevice(Y);
    selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

    uint memSize = (uint)(2 * problemElements.Length * sizeof(float));

    // Allocate mapped (zero-copy) memory for the results.
    //CUDARuntime.cudaSetDeviceFlags(CUDARuntime.cudaDeviceMapHost);
    //var e = CUDADriver.cuMemHostAlloc(ref outputIntPtr, memSize, 8);
    //CUDARuntime.cudaHostAlloc(ref outputIntPtr, memSize, CUDARuntime.cudaHostAllocMapped);
    //var errMsg = CUDARuntime.cudaGetErrorString(e);
    //cuda.HostRegister(outputIntPtr, memSize, Cuda)
    outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

    // Normal (non-mapped) memory allocation:
    //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));

    #endregion

    SetCudaFunctionParameters();

    // Allocate memory for the main vectors. Their size equals the problem
    // dimension, so many entries will be zero, but the CUDA computation is
    // faster on dense vectors.
    VectorI = new float[problemElements[0].Dim + 1];
    VectorJ = new float[problemElements[0].Dim + 1];

    CudaHelpers.FillDenseVector(problemElements[0], VectorI);
    CudaHelpers.FillDenseVector(problemElements[1], VectorJ);

    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuVecI_TexRef, cuVecITexRefName, VectorI, ref VecIPtr);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuVecJ_TexRef, cuVecJTexRefName, VectorJ, ref VecJPtr);
}
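// --- Worked example (editor's addition; the exact padding/alignment used by
// TransformToEllpackRFormat is an assumption) --- ELLPACK-R stores values and
// column indices padded to the longest row and laid out slot-major, plus a
// per-row length array, so GPU threads (one per row) read coalesced columns.
public static void EllpackRExample()
{
    // 3x3 matrix: row0 = {10@0}, row1 = {20@0, 25@1}, row2 = {30@2}.
    // Slot s of row r lives at index s * numRows + r.
    float[] vecVals   = { 10f, 20f, 30f, 0f, 25f, 0f };
    int[]   vecColIdx = {  0,   0,   2,  0,   1,  0  };
    int[]   vecLenght = {  1,   2,   1 }; // nonzeros per row
}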
/// <summary>
/// Copies the contents of one GPU tensor to another, possibly across devices.
/// </summary>
/// <param name="result">The result.</param>
/// <param name="src">The source.</param>
/// <param name="totalElements">The total number of elements.</param>
/// <exception cref="CudaException"></exception>
public void CopyGpu(Tensor result, Tensor src, long totalElements)
{
    // We assume here that we are using the default stream for both devices.
    var context = CudaHelpers.TSContextForTensor(src);

    var resultStorage = (CudaStorage)result.Storage;
    var resultContext = context.CudaContextForTensor(result);
    var resultPtr = resultStorage.DevicePtrAtElement(result.StorageOffset);

    var srcStorage = (CudaStorage)src.Storage;
    var srcContext = context.CudaContextForTensor(src);
    var srcPtr = srcStorage.DevicePtrAtElement(src.StorageOffset);

    if (CudaHelpers.GetDeviceId(result) != CudaHelpers.GetDeviceId(src))
    {
        // Cross-device copy. Perform two-way barrier between both devices' default streams.
        resultContext.SetCurrent();
        var dstReady = new CudaEvent(CUEventFlags.DisableTiming);
        dstReady.Record();

        srcContext.SetCurrent();
        var res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(CUstream.NullStream, dstReady.Event, 0);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }
        dstReady.Dispose();
    }
    else
    {
        srcContext.SetCurrent();
    }

    var canMemcpy = CanMemcpy(result, src, totalElements);
    if (canMemcpy)
    {
        var res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
            resultPtr, srcPtr, totalElements * src.ElementType.Size(), CUstream.NullStream);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }
    }
    else
    {
        if (result.ElementType != src.ElementType)
        {
            CopyGpuConvertTypes(result, src, totalElements);
        }
        else if (context.CanAccessPeer(CudaHelpers.GetDeviceId(src), CudaHelpers.GetDeviceId(result)))
        {
            CopyGpuDirect(result, src, srcContext);
        }
        else
        {
            CopyGpuIndirect(result, src, totalElements);
        }
    }
}
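// --- Minimal sketch of the cross-device barrier above (editor's addition;
// dstContext/srcContext are placeholder names) --- The destination records an
// event on its default stream and the source's default stream waits on it, so
// the subsequent copy cannot overtake work still pending on the destination.
private static void WaitForDestination(CudaContext dstContext, CudaContext srcContext)
{
    dstContext.SetCurrent();
    var dstReady = new CudaEvent(CUEventFlags.DisableTiming);
    dstReady.Record(); // recorded on the destination's default stream
    srcContext.SetCurrent();
    var res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(CUstream.NullStream, dstReady.Event, 0);
    if (res != CUResult.Success)
    {
        throw new CudaException(res);
    }
    dstReady.Dispose(); // safe: event destruction is deferred until the event completes
}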
public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
{
    int deviceId = CudaHelpers.GetDeviceId(src);
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForDevice(deviceId);

    if (src.DimensionCount > TSCudaContext.MaxDims)
    {
        throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
    }

    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);
    if (src.DimensionCount == 0)
    {
        return(result);
    }

    long totalElements = src.ElementCount();
    ApplySpecialization config = new ApplySpecialization(src);
    object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    dim3 grid;
    dim3 block;

    byte[] ptx = reduceAllKernels.GetPtx(context.Compiler);
    string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);
    ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

    if (isTwoPassReductionSize(totalElements))
    {
        getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
        }

        uint numPass1Blocks = grid.x;
        getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        smemSize = block.x * sizeof(float);

        InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
    }
    else
    {
        getSinglePassReduceBlockGrid(totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
        }
    }

    return(writeTarget);
}
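// --- Worked sketch (editor's addition; the block size of 256 is an
// assumption) --- The two-pass path above: pass 1 writes one partial result
// per block into scratch space; pass 2 launches a single block that folds
// those partials into the final value.
public static void TwoPassShapeExample()
{
    long totalElements = 1 << 20;
    uint blockSize = 256;
    uint pass1Blocks = (uint)((totalElements + blockSize - 1) / blockSize); // partials written to scratch
    uint smemSize = blockSize * sizeof(float); // per-block shared-memory staging
    System.Console.WriteLine($"pass1 blocks={pass1Blocks}, smem={smemSize} bytes");
}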
public Tensor IndexSelect(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    long[] requiredOutputSize = (long[])src.Sizes.Clone();
    requiredOutputSize[dim] = indices.ElementCount();
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, requiredOutputSize);

    // The `src` is partitioned into two parts:
    // - the size of each slice we are indexing, which is the
    //   total size of the tensor ignoring dimension `dim`;
    // - the number of indices we are choosing, which is the total size
    //   of the tensor `indices`.
    long numIndices = indices.ElementCount();
    long dstTotalSize = writeTarget.ElementCount();
    long srcSelectDimSize = src.Sizes[dim];
    long sliceSize = dstTotalSize / numIndices;

    int mpc = context.DeviceInfoForContext(cudaContext).MultiProcessorCount;
    dim3 smallIndexGrid = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(sliceSize, 128), (mpc * 8)));
    dim3 smallIndexBlock = new dim3((uint)Math.Min(sliceSize, 128));
    dim3 largeIndexGrid = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(dstTotalSize, 128), (mpc * 8)));
    dim3 largeIndexBlock = new dim3((uint)Math.Min(dstTotalSize, 128));

    long[] newResultSize = (long[])writeTarget.Sizes.Clone();
    newResultSize[dim] = 1;
    Tensor resultFlat = new Tensor(newResultSize, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset);

    long[] newSrcSize = (long[])src.Sizes.Clone();
    newSrcSize[dim] = 1;
    Tensor srcFlat = new Tensor(newSrcSize, src.Strides, src.Storage, src.StorageOffset);

    if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
        ApplyUtils.CanUse32BitIndexMath(src) &&
        ApplyUtils.CanUse32BitIndexMath(indices))
    {
        // Threshold for the small-index kernel.
        bool smallKernel = numIndices <= 16;
        string kernelName = "";
        bool indContig = indices.IsContiguous();

        if (writeTarget.DimensionCount == src.DimensionCount &&
            writeTarget.DimensionCount <= 3 &&
            indContig)
        {
            kernelName = MakeKernelName(smallKernel, true, writeTarget.DimensionCount, src.DimensionCount, -2);
        }
        else
        {
            kernelName = MakeKernelName(smallKernel, true, -1, -1, -1);
        }

        dim3 grid = smallKernel ? smallIndexGrid : largeIndexGrid;
        dim3 block = smallKernel ? smallIndexBlock : largeIndexBlock;

        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
            writeTarget, src, indices, dim, dim, sliceSize, srcSelectDimSize);
    }
    else
    {
        string kernelName = MakeKernelName(false, false, -1, -1, -1);

        Invoke(context, cudaContext, kernelName, largeIndexGrid, largeIndexBlock, 0, CUstream.NullStream, false,
            writeTarget, src, indices, dim, dim, dstTotalSize, sliceSize, srcSelectDimSize);
    }

    return(writeTarget);
}
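// --- Worked example (editor's addition) --- The launch sizing above, with
// concrete numbers: sliceSize = 1000 on a 20-SM device gives
// grid = min(ceil(1000/128), 20*8) = 8 blocks of min(1000, 128) = 128 threads.
public static void IndexSelectSizingExample()
{
    long sliceSize = 1000;
    int mpc = 20; // multiprocessor count (assumed)
    long grid = System.Math.Min((sliceSize + 127) / 128, mpc * 8); // CeilDiv -> 8
    long block = System.Math.Min(sliceSize, 128);                  // 128
    System.Console.WriteLine($"grid={grid}, block={block}");
}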
/// <summary>
/// Performs a direct GPU-to-GPU copy using a copy kernel. Used by CopyGpu
/// when peer access between the two devices is available.
/// </summary>
/// <param name="result">The result.</param>
/// <param name="src">The source.</param>
/// <param name="srcContext">The source context.</param>
private void CopyGpuDirect(Tensor result, Tensor src, CudaContext srcContext)
{
    var context = CudaHelpers.TSContextForTensor(src);
    CopyOp.Invoke(fillCopyKernels, context, srcContext, result, src);
}
public static void For(int number_of_threads, SimpleKernel simpleKernel)
{
    GCHandle handle1 = default(GCHandle);
    GCHandle handle2 = default(GCHandle);

    try
    {
        unsafe
        {
            //////// COMPILE KERNEL INTO GPU CODE ///////
            /////////////////////////////////////////////
            var stopwatch_cuda_compile = new Stopwatch();
            stopwatch_cuda_compile.Start();
            IntPtr image = Singleton()._converter.Compile(simpleKernel.Method, simpleKernel.Target);
            CUfunction ptr_to_kernel = Singleton()._converter.GetCudaFunction(simpleKernel.Method, image);
            var elapse_cuda_compile = stopwatch_cuda_compile.Elapsed;

            RUNTIME.CheckHeap();

            //////// COPY DATA INTO GPU /////////////////
            /////////////////////////////////////////////
            var stopwatch_deep_copy_to = new Stopwatch();
            stopwatch_deep_copy_to.Reset();
            stopwatch_deep_copy_to.Start();
            BUFFERS buffer = Singleton().Buffer;

            // Set up parameters.
            int count = simpleKernel.Method.GetParameters().Length;
            var bb = Singleton()._converter.GetBasicBlock(simpleKernel.Method);
            if (bb.HasThis)
            {
                count++;
            }
            if (!(count == 1 || count == 2))
            {
                throw new Exception("Expecting one or two parameters for kernel.");
            }

            IntPtr[] parm1 = new IntPtr[1];
            IntPtr[] parm2 = new IntPtr[1];
            IntPtr ptr = IntPtr.Zero;

            // The method really should have a "this" because it's a closure object.
            if (bb.HasThis)
            {
                RUNTIME.CheckHeap();
                ptr = buffer.AddDataStructure(simpleKernel.Target);
                parm1[0] = ptr;
            }

            {
                Type btype = typeof(int);
                var s = BUFFERS.SizeOf(btype);
                var ptr2 = buffer.New(s);
                // buffer.DeepCopyToImplementation(index, ptr2);
                parm2[0] = ptr2;
            }

            stopwatch_deep_copy_to.Stop();
            var elapse_deep_copy_to = stopwatch_deep_copy_to.Elapsed;

            var stopwatch_call_kernel = new Stopwatch();
            stopwatch_call_kernel.Reset();
            stopwatch_call_kernel.Start();

            IntPtr[] x1 = parm1;
            handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned);
            IntPtr pointer1 = handle1.AddrOfPinnedObject();

            IntPtr[] x2 = parm2;
            handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned);
            IntPtr pointer2 = handle2.AddrOfPinnedObject();

            RUNTIME.CheckHeap();

            IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
            var res = CUresult.CUDA_SUCCESS;
            fixed (IntPtr* kernelParams = kp)
            {
                Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads,
                    out Campy.Utils.CudaHelpers.dim3 tile_size,
                    out Campy.Utils.CudaHelpers.dim3 tiles);
                //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);
                res = Cuda.cuLaunchKernel(
                    ptr_to_kernel,
                    tiles.x, tiles.y, tiles.z,             // grid dimensions.
                    tile_size.x, tile_size.y, tile_size.z, // threads per block.
                    0,                                     // no shared memory.
                    default(CUstream),
                    (IntPtr)kernelParams,
                    (IntPtr)IntPtr.Zero);
            }
            CudaHelpers.CheckCudaError(res);
            res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
            CudaHelpers.CheckCudaError(res);

            stopwatch_call_kernel.Stop();
            var elapse_call_kernel = stopwatch_call_kernel.Elapsed;

            if (Campy.Utils.Options.IsOn("jit_trace"))
            {
                System.Console.WriteLine("cuda compile  " + elapse_cuda_compile);
                System.Console.WriteLine("deep copy in  " + elapse_deep_copy_to);
                System.Console.WriteLine("cuda kernel   " + elapse_call_kernel);
            }

            {
                var stopwatch_deep_copy_back = new Stopwatch();
                stopwatch_deep_copy_back.Reset();
                RUNTIME.CheckHeap();
                stopwatch_deep_copy_back.Start();
                buffer.SynchDataStructures();
                stopwatch_deep_copy_back.Stop();
                RUNTIME.CheckHeap();
                var elapse_deep_copy_back = stopwatch_deep_copy_back.Elapsed;
                if (Campy.Utils.Options.IsOn("jit_trace"))
                {
                    System.Console.WriteLine("deep copy out " + elapse_deep_copy_back);
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        throw;
    }
    finally
    {
        if (default(GCHandle) != handle1)
        {
            handle1.Free();
        }
        if (default(GCHandle) != handle2)
        {
            handle2.Free();
        }
    }
}
private void SetCudaData(Problem<SparseVec> sub_prob)
{
    int vecDim = sub_prob.FeaturesCount; //.Elements[0].Dim;

    /*
     * Copy vectors to the CUDA device.
     */
    #region copy training examples to GPU

    float[] vecVals;
    int[] vecIdx;
    int[] vecLenght;
    CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
    valsCSRPtr = cuda.CopyHostToDevice(vecVals);
    idxCSRPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);

    CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
    valsCSCPtr = cuda.CopyHostToDevice(vecVals);
    idxCSCPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);

    #endregion

    /*
     * Allocate memory for the gradient.
     */
    alphaMemSize = (uint)(sub_prob.ElementsCount * sizeof(float));
    gradPtr = cuda.Allocate(alphaMemSize);
    gradOldPtr = cuda.Allocate(alphaMemSize);
    alphaPtr = cuda.Allocate(alphaMemSize);
    alphaOldPtr = cuda.Allocate(alphaMemSize);
    alphaTmpPtr = cuda.Allocate(alphaMemSize);

    /*
     * Reduction blocks for computing Obj.
     */
    GetNumThreadsAndBlocks(vecDim, 64, threadsPerBlock, ref threadsForReduceObjW, ref bpgReduceW);
    reduceObjW = new float[bpgReduceW];
    uint reduceWBytes = (uint)bpgReduceW * sizeof(float);
    reduceObjWPtr = cuda.Allocate(reduceWBytes);

    /*
     * Reduction size for kernels which operate on alpha.
     */
    int reductionSize = problem.ElementsCount;
    threadsForReduceObjAlpha = 0;
    GetNumThreadsAndBlocks(problem.ElementsCount, 64, threadsPerBlock, ref threadsForReduceObjAlpha, ref bpgReduceAlpha);
    uint alphaReductionBytes = (uint)bpgReduceAlpha * sizeof(float);

    /*
     * Reduction array for computing the objective function value.
     */
    reduceObjAlpha = new float[bpgReduceAlpha];
    reduceObjAlphaPtr = cuda.Allocate(alphaReductionBytes);

    /*
     * Reduction arrays for computing the BB step.
     */
    alphaPartReduce = new float[bpgReduceAlpha];
    gradPartReduce = new float[bpgReduceAlpha];
    alphaGradPartReduce = new float[bpgReduceAlpha];
    reduceBBAlphaGradPtr = cuda.Allocate(alphaReductionBytes);
    reduceBBAlphaPtr = cuda.Allocate(alphaReductionBytes);
    reduceBBGradPtr = cuda.Allocate(alphaReductionBytes);

    /*
     * Reduction arrays for computing the linear part.
     */
    reduceLinPart = new float[bpgReduceAlpha];
    reduceLinPartPtr = cuda.Allocate(alphaReductionBytes);

    //float[] wVec = new float[vecDim];
    wVecMemSize = (uint)vecDim * sizeof(float);
    wTempVecPtr = cuda.Allocate(wVecMemSize);

    // Move the W vector.
    SetTextureMemory(ref cuWVecTexRef, cudaWVecTexRefName, ref wVecPtr, wVecMemSize);

    // Set texture memory for labels.
    SetTextureMemory(ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);

    SetTextureMemory(ref cuDeltasTexRef, "deltasTexRef", ref deltasPtr, alphaMemSize);

    diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
    stepBBPtr = cuda.GetModuleGlobal(cuModule, "stepBB");
    float[] stepData = new float[] { 0.1f };
    cuda.CopyHostToDevice(stepBBPtr, stepData);

    SetCudaParameters(sub_prob);
}
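// --- Worked example (editor's addition; whether vecLenght above holds row
// lengths or row-pointer extents is an assumption) --- CSR keeps one
// value/column-index pair per nonzero plus per-row bookkeeping; CSC is the
// same layout taken per column, which is why both are uploaded above.
public static void CsrExample()
{
    // 3x3 matrix: row0 = {10@0}, row1 = {20@0, 25@1}, row2 = {30@2}.
    float[] vals   = { 10f, 20f, 25f, 30f };
    int[]   colIdx = {  0,   0,   1,   2  };
    int[]   rowLen = {  1,   2,   1 }; // per-row nonzero counts
}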
private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
{
    if (a.Strides[1] != 1)
    {
        throw new ArgumentException($"a must be contiguous in the first non-batch dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
    }
    if (b.Strides[1] != 1)
    {
        throw new ArgumentException("b must be contiguous in the first non-batch dimension (column major / fortran order)");
    }
    if (c.Strides[1] != 1)
    {
        throw new ArgumentException($"c must be contiguous in the first non-batch dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
    }

    using (Util.PooledObject<CudaBlas> blas = context.BlasForTensor(c))
    {
        bool nta = transA == BlasOp.NonTranspose;
        bool ntb = transB == BlasOp.NonTranspose;
        Operation transa = GetCudaBlasOp(transA);
        Operation transb = GetCudaBlasOp(transB);
        int m = (int)a.Sizes[nta ? 1 : 2];
        int k = (int)b.Sizes[ntb ? 1 : 2];
        int n = (int)b.Sizes[ntb ? 2 : 1];
        int lda = (int)a.Strides[2];
        int ldb = (int)b.Strides[2];
        int ldc = (int)c.Strides[2];
        int stra = (int)a.Strides[0];
        int strb = (int)b.Strides[0];
        int strc = (int)c.Strides[0];
        int batchSize = (int)c.Sizes[0];

        // Set the math mode to allow cuBLAS to use Tensor Cores:
        //cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
        CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
        if (status != CublasStatus.Success)
        {
            throw new CudaBlasException("Failed to set math mode to tensor ops.");
        }

        if (c.ElementType == DType.Float32)
        {
            CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

            CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle,
                transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb,
                ref beta, cPtrSingle, ldc, strc, batchSize);
            if (_statusF32 != CublasStatus.Success)
            {
                throw new CudaBlasException(_statusF32);
            }
        }
        else if (c.ElementType == DType.Float64)
        {
            CUdeviceptr aPtrDouble = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bPtrDouble = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cPtrDouble = CudaHelpers.GetBufferStart(c);
            double alphaDouble = alpha;
            double betaDouble = beta;
            CublasStatus _statusF64 = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle,
                transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb,
                ref betaDouble, cPtrDouble, ldc, strc, batchSize);
            if (_statusF64 != CublasStatus.Success)
            {
                throw new CudaBlasException(_statusF64);
            }
        }
        else
        {
            throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
        }
    }
}
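// --- Illustrative sketch (editor's addition) --- For a batched tensor of
// size [batch, m, k] that is column-major within each matrix, the strides
// consumed above are {m*k, 1, m}: Strides[0] is the per-matrix hop passed as
// the batch stride, and Strides[2] is the leading dimension lda.
public static void BatchedStridesExample()
{
    long batch = 8, m = 3, k = 5;
    long[] strides = { m * k, 1, m }; // {15, 1, 3}
    System.Console.WriteLine($"batch stride={strides[0]}, lda={strides[2]}");
}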
public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
{
    if (src.DimensionCount == 0)
    {
        return(result);
    }

    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForTensor(src);

    var requiredOutputSize = (long[])src.Sizes.Clone();
    requiredOutputSize[dim] = 1;
    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);
    ThrowIfAnyTensorInvalid(writeTarget, src);

    var inElements = src.ElementCount();
    var reductionSize = src.Sizes[dim];
    var reductionStride = src.Strides[dim];
    var outElements = inElements / reductionSize;
    var contigReduction = reductionStride == 1;

    // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1.
    // This includes for the purposes of determining which tensor specializations to use (changing
    // the dimension size to 1 may make the tensor non-contiguous).
    var newSizes = (long[])src.Sizes.Clone();
    newSizes[dim] = 1;
    var srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

    var config = new ApplySpecialization(writeTarget, srcSlim);
    object totalSlices = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
    object reductionSizeTyped = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
    object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
    var initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    var ptx = reduceKernels.GetPtx(context.Compiler);

    if (contigReduction)
    {
        var block = GetContigReduceBlock(cudaContext, outElements, reductionSize);
        var grid = GetContigReduceGrid(outElements);
        var smemSize = (uint)src.ElementType.Size() * block.x;

        var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }
    else
    {
        var deviceProps = context.DeviceInfoForContext(cudaContext);
        var block = GetNonContigReduceBlock(deviceProps);
        var grid = GetNoncontigReduceGrid(deviceProps, outElements);
        uint smemSize = 0;

        var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }

    return(writeTarget);
}
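// --- Worked example (editor's addition) --- Which reduction kernel the code
// above picks for a contiguous row-major [4, 256] tensor (Strides = {256, 1}):
public static void ReductionPathExample()
{
    long[] strides = { 256, 1 };
    // Reducing dim 1: stride 1   -> contiguous -> "contig_" kernel.
    // Reducing dim 0: stride 256 -> strided    -> "noncontig_" kernel.
    foreach (int dim in new[] { 1, 0 })
    {
        System.Console.WriteLine($"dim {dim}: " + (strides[dim] == 1 ? "contig_" : "noncontig_"));
    }
}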
private void SetCudaData(Problem<SparseVec> sub_prob)
{
    int vecDim = sub_prob.Elements[0].Dim;

    /*
     * Copy vectors to the CUDA device.
     */
    float[] vecVals;
    int[] vecIdx;
    int[] vecLenght;
    CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
    valsCSRPtr = cuda.CopyHostToDevice(vecVals);
    idxCSRPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);

    CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
    valsCSCPtr = cuda.CopyHostToDevice(vecVals);
    idxCSCPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);

    /*
     * Allocate memory for the gradient.
     */
    uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float));

    // Allocate mapped memory for the results (dot products between vector W and all elements).
    gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    gradPtr = cuda.GetHostDevicePointer(gradIntPtr, 0);

    // Allocate memory for the main vector. Its size equals the problem dimension,
    // so many entries will be zero, but the CUDA computation is faster on a dense vector.
    mainVector = new float[vecDim];

    // Move the W vector.
    //CudaHelpers.FillDenseVector(problemElements[0], mainVector);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

    // Set texture memory for labels.
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);

    /*
     * Data for the CUDA solver.
     */
    // Normally for the L2 solver QD_i = x_i * x_i + Diag_i,
    // where Diag_i = 0.5/Cp if y_i = 1
    //       Diag_i = 0.5/Cn if y_i = -1,
    // but we will add this on the GPU.
    QD = new float[sub_prob.ElementsCount];
    alpha = new float[sub_prob.ElementsCount];
    deltas = new float[sub_prob.ElementsCount];
    float[] diag = new float[3];

    for (int i = 0; i < sub_prob.ElementsCount; i++)
    {
        QD[i] = sub_prob.Elements[i].DotProduct();
        alpha[i] = 0f;
        deltas[i] = 0;
    }

    qdPtr = cuda.CopyHostToDevice(QD);
    alphaPtr = cuda.Allocate(alpha);

    //deltasPtr = cuda.Allocate(deltas);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

    diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
    // Set this in the fill function.
    //cuda.CopyHostToDevice(diagPtr, diag);

    //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim");
    //todo: check if it's ok
    //cuda.Memset(dimPtr, (uint)vecDim, 1);
    //int[] dimArr = new int[] { vecDim };
    //cuda.CopyHostToDevice(dimPtr, dimArr);
    //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);

    CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

    // Two ways of computing the scaling parameter; they should give the same
    // result, but it depends on rounding.
    //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount));
    stepScaling = 0.0002f; // (float)(1.0 / sub_prob.ElementsCount);

    // Set the scaling constant.
    float[] scArr = new float[] { stepScaling };
    cuda.CopyHostToDevice(deltaScalingPtr, scArr);
    //cuda.Memset(deltaScalingPtr, (uint)scaling, sizeof(float));
    //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim);

    SetCudaParameters(sub_prob);
}