private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec) { // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices) if (mat.Strides[1] != 1) { throw new ArgumentException("lhs must be contiguous in the last dimension"); } using (Util.PooledObject <CudaBlas> blas = context.BlasForTensor(mat)) { ManagedCuda.BasicTypes.CUdeviceptr yPtr = CudaHelpers.GetBufferStart(result); ManagedCuda.BasicTypes.CUdeviceptr aPtr = CudaHelpers.GetBufferStart(mat); ManagedCuda.BasicTypes.CUdeviceptr xPtr = CudaHelpers.GetBufferStart(vec); Operation trans = Operation.Transpose; int m = (int)mat.Sizes[1]; int n = (int)mat.Sizes[0]; int incx = (int)vec.Strides[0]; int lda = (int)mat.Strides[0]; int incy = (int)result.Strides[0]; float alpha = 1; float beta = 0; CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy); } }
private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps) { int size = ScratchSpacePerSMStream * deviceProps.MultiProcessorCount; ManagedCuda.BasicTypes.CUdeviceptr buffer = context.AllocateMemory(size); return(new ScratchSpace() { size = size, buffer = buffer }); }
public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null) { int deviceId = CudaHelpers.GetDeviceId(src); TSCudaContext context = CudaHelpers.TSContextForTensor(src); CudaContext cudaContext = context.CudaContextForDevice(deviceId); if (src.DimensionCount > TSCudaContext.MaxDims) { throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported"); } Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1); if (src.DimensionCount == 0) { return(result); } long totalElements = src.ElementCount(); ApplySpecialization config = new ApplySpecialization(src); object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements; object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType); dim3 grid; dim3 block; byte[] ptx = reduceAllKernels.GetPtx(context.Compiler); string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config); ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget); if (isTwoPassReductionSize(totalElements)) { getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block); uint smemSize = block.x * sizeof(float); ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer; if (extraArg == null) { InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace); } else { InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg); } uint numPass1Blocks = grid.x; getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block); smemSize = block.x * sizeof(float); InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr); } else { getSinglePassReduceBlockGrid(totalElements, out grid, out block); uint smemSize = block.x * sizeof(float); if (extraArg == null) { InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr); } else { InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg); } } return(writeTarget); }
public void EpsilonPointer(out ManagedCuda.BasicTypes.CUdeviceptr epsilonDevicePointer) { epsilonDevicePointer = cudaEpsilon.DevicePointer; }
public void SigmaPointer(out ManagedCuda.BasicTypes.CUdeviceptr sigmaDevicePointer) { sigmaDevicePointer = cudaSigma.DevicePointer; }