private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int ndim = src.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    long cols = src.Sizes[ndim - 1];

    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
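// For reference, "gLNormalization" performs a row-wise layer normalization.
// A minimal CPU sketch of the same math, assuming the standard
// y = alpha * (x - mean) / sqrt(var + eps) + beta formulation (the kernel
// source itself is not shown here; this helper name is hypothetical):
private static void LayerNormRowCpu(float[] row, float[] alpha, float[] beta, float[] outRow, float eps)
{
    int cols = row.Length;

    float mean = 0f;
    for (int i = 0; i < cols; i++) { mean += row[i]; }
    mean /= cols;

    float variance = 0f;
    for (int i = 0; i < cols; i++) { float d = row[i] - mean; variance += d * d; }
    variance /= cols;

    float invStd = (float)(1.0 / Math.Sqrt(variance + eps));
    for (int i = 0; i < cols; i++)
    {
        outRow[i] = alpha[i] * (row[i] - mean) * invStd + beta[i];
    }
}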
/// <summary>
/// Computes a double-precision matrix-vector product (y = mat * vec) via cuBLAS.
/// </summary>
/// <param name="context">The CUDA context.</param>
/// <param name="result">The result vector.</param>
/// <param name="mat">The matrix.</param>
/// <param name="vec">The vector.</param>
/// <exception cref="ArgumentException">lhs must be contiguous in the last dimension</exception>
private static void Run_M_V_double(TSCudaContext context, NDArray result, NDArray mat, NDArray vec)
{
    // Require lhs to be row-major. This means we must tell BLAS to transpose it
    // (BLAS expects column-major matrices).
    if (mat.Strides[1] != 1)
    {
        throw new ArgumentException("lhs must be contiguous in the last dimension");
    }

    using (var blas = context.BlasForTensor(mat))
    {
        var yPtr = CudaHelpers.GetBufferStart(result);
        var aPtr = CudaHelpers.GetBufferStart(mat);
        var xPtr = CudaHelpers.GetBufferStart(vec);

        Operation trans = Operation.Transpose;
        int m = (int)mat.Shape[1];
        int n = (int)mat.Shape[0];
        int incx = (int)vec.Strides[0];
        int lda = (int)mat.Strides[0];
        int incy = (int)result.Strides[0];
        double alpha = 1;
        double beta = 0;

        CudaBlasNativeMethods.cublasDgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
    }
}
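// Why the transpose: a row-major (rows x cols) matrix has the same memory
// layout as a column-major (cols x rows) matrix, so requesting op(A) = A^T on
// the reinterpreted matrix makes cuBLAS compute the original row-major
// product. A minimal CPU sketch of the equivalent computation (illustrative
// only; this helper name is hypothetical):
private static void GemvRowMajorCpu(double[] a, int rows, int cols, double[] x, double[] y)
{
    // y = A * x, with A stored row-major: a[r * cols + c]
    for (int r = 0; r < rows; r++)
    {
        double sum = 0;
        for (int c = 0; c < cols; c++)
        {
            sum += a[r * cols + c] * x[c];
        }
        y[r] = sum;
    }
}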
private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int ndim = weight.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    long cols = weight.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr idsPtr = CudaHelpers.GetBufferStart(ids);
    CUdeviceptr costsPtr = CudaHelpers.GetBufferStart(costs);

    Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
}
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int ndim = inGrad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    long cols = inGrad.Sizes[ndim - 1];

    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple<float, float> init, string baseKernelName)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);

    int ndim = src.DimensionCount;

    long num_orows = 1;
    for (int dim = 0; dim < dimension; dim++)
    {
        num_orows *= src.Sizes[dim];
    }

    long row_size = src.Sizes[dimension];

    long num_irows = 1;
    for (int dim = dimension + 1; dim < ndim; dim++)
    {
        num_irows *= src.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_irows));
    int maxGridDim = 1024;
    dim3 grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
    CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);
    string kernelName = "outer_index_" + baseKernelName;

    Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
}
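// The shape decomposition above, worked on a small example: reducing a
// [2, 3, 4] tensor over dimension 1 gives num_orows = 2 (product of the dims
// before the reduced one), row_size = 3, and num_irows = 4 (product of the
// dims after it). A minimal CPU sketch of an outer-dim index reduction in
// this layout, using argmax as the concrete reduction (the real kernel is
// chosen via baseKernelName; this helper name is hypothetical):
private static void ArgMaxOuterDimCpu(float[] src, long numOrows, long numIrows, long rowSize, float[] outValues, float[] outIndices)
{
    for (long orow = 0; orow < numOrows; orow++)
    {
        for (long irow = 0; irow < numIrows; irow++)
        {
            long baseIdx = orow * rowSize * numIrows + irow;
            float bestVal = src[baseIdx];
            long bestIdx = 0;
            for (long i = 1; i < rowSize; i++)
            {
                float v = src[baseIdx + i * numIrows];
                if (v > bestVal) { bestVal = v; bestIdx = i; }
            }
            outValues[orow * numIrows + irow] = bestVal;
            outIndices[orow * numIrows + irow] = bestIdx;
        }
    }
}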
private void BuildTriMask(TSCudaContext context, Tensor result, float value, float maskedValue)
{
    CudaContext cudaContext = context.CudaContextForTensor(result);
    cudaContext.SetCurrent();

    int ndim = result.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
    long cols = result.Sizes[ndim - 1];

    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);

    Invoke(context, cudaContext, "BuildTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, rows, cols, value, maskedValue);
}
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var rows = weight.Sizes[0];
    var cols = weight.Sizes[1];

    var ndim = weight.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= weight.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
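// For reference, a CPU sketch of the per-element update the "Adam" kernel is
// expected to perform. This is the standard Adam rule with gradient clipping;
// exactly how the kernel applies batchSize, regc, and bias correction is an
// assumption here (iter is assumed to count from 1), not a transcription of
// the kernel source:
private static void AdamStepCpu(float[] w, float[] g, float[] v, float[] m, int batchSize, float stepSize, float clipval, float decayRateV, float decayRateM, int iter, float eps)
{
    for (int i = 0; i < w.Length; i++)
    {
        // Average over the batch, then clip the gradient.
        float grad = Math.Max(-clipval, Math.Min(clipval, g[i] / batchSize));

        // Exponential moving averages of the first and second moments.
        m[i] = decayRateM * m[i] + (1 - decayRateM) * grad;
        v[i] = decayRateV * v[i] + (1 - decayRateV) * grad * grad;

        // Bias-corrected moment estimates.
        double mHat = m[i] / (1 - Math.Pow(decayRateM, iter));
        double vHat = v[i] / (1 - Math.Pow(decayRateV, iter));

        w[i] -= (float)(stepSize * mHat / (Math.Sqrt(vHat) + eps));
    }
}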
//__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var rows = weight.Sizes[0];
    var cols = weight.Sizes[1];

    var ndim = weight.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= weight.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);
    var lrwPtr = CudaHelpers.GetBufferStart(lrw);

    Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var rows = inGrad.Sizes[0];
    var cols = inGrad.Sizes[1];

    var ndim = inGrad.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= inGrad.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    var out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var x1Ptr = CudaHelpers.GetBufferStart(x1);
    var x2Ptr = CudaHelpers.GetBufferStart(x2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    var rows = src1.Sizes[0];
    var cols = src1.Sizes[1];

    var ndim = src1.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= src1.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var src1Ptr = CudaHelpers.GetBufferStart(src1);
    var src2Ptr = CudaHelpers.GetBufferStart(src2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(gradOutput);
    CudaContext cudaContext = context.CudaContextForTensor(gradOutput);

    int dimw = 3;
    int dimh = 2;
    int dimc = 1;

    long nbatch = input.Sizes[0];
    long nslices = input.Sizes[dimc];
    long iheight = input.Sizes[dimh];
    long iwidth = input.Sizes[dimw];
    long owidth = gradOutput.Sizes[dimw];
    long oheight = gradOutput.Sizes[dimh];

    using (Tensor gradOutputContig = Ops.AsContiguous(gradOutput))
    {
        CUdeviceptr gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
        CUdeviceptr indicesPtr = CudaHelpers.GetBufferStart(indices);
        CUdeviceptr gradInputPtr = CudaHelpers.GetBufferStart(gradInput);

        int count = (int)input.ElementCount();

        Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
            count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
    }
}
public static Tensor Invoke(CudaCode kernels, string funcName, Tensor result, Tensor src)
{
    try
    {
        TSCudaContext context = CudaHelpers.TSContextForTensor(src);
        CudaContext cudaContext = context.CudaContextForTensor(src);
        cudaContext.SetCurrent();

        Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
        long elementCount = writeTarget.ElementCount();
        byte[] ptx = kernels.GetPtx(context.Compiler);

        if (result == src)
        {
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t1_" + funcName, writeTarget, elementCount);
        }
        else
        {
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t2_" + funcName, writeTarget, src, elementCount);
        }

        return writeTarget;
    }
    catch (Exception e)
    {
        Logger.WriteLine($"Error = '{e.Message}', Call stack = '{e.StackTrace}'");
        throw;
    }
}
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    try
    {
        cudaContext.SetCurrent();

        CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

        IEnumerable<Tensor> allTensors = args.OfType<Tensor>();
        Tensor firstTensor = allTensors.First();
        long elementCount = firstTensor.ElementCount();
        ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());
        ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

        ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
        ManagedCuda.VectorTypes.dim3 grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

        string fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
        CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

        kernel.GridDimensions = grid;
        kernel.BlockDimensions = block;
        kernel.RunAsync(CUstream.NullStream, args);
    }
    catch (Exception ex)
    {
        Logger.WriteLine($"Exception message = {ex.Message}, Call stack = {ex.StackTrace}");
        throw;
    }
}
private void VarOuterDim(TSCudaContext context, Tensor result, Tensor src, int dimension, bool normByN, bool applySqrt)
{
    var cudaContext = context.CudaContextForTensor(src);

    var ndim = src.DimensionCount;

    long num_orows = 1;
    for (var dim = 0; dim < dimension; dim++)
    {
        num_orows *= src.Sizes[dim];
    }

    var row_size = src.Sizes[dimension];

    // Treat all inner dimensions (i.e. dim > dimension) as one.
    long num_irows = 1;
    for (var dim = dimension + 1; dim < ndim; dim++)
    {
        num_irows *= src.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_irows));
    var maxGridDim = 1024;
    var grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);

    this.Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultPtr, srcPtr, num_orows, num_irows, row_size);
}
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int ndim = weight.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    long cols = weight.Sizes[ndim - 1];

    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
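// For reference, a CPU sketch of the per-element update the "RMSProp" kernel
// is expected to perform (standard RMSProp with gradient clipping; the exact
// handling of batchSize and regc inside the kernel is an assumption, and this
// helper name is hypothetical):
private static void RMSPropStepCpu(float[] w, float[] g, float[] cache, int batchSize, float stepSize, float clipval, float decayRate, float eps)
{
    for (int i = 0; i < w.Length; i++)
    {
        // Average over the batch, then clip the gradient.
        float grad = Math.Max(-clipval, Math.Min(clipval, g[i] / batchSize));

        // Exponential moving average of the squared gradient.
        cache[i] = decayRate * cache[i] + (1 - decayRate) * grad * grad;

        w[i] -= (float)(stepSize * grad / (Math.Sqrt(cache[i]) + eps));
    }
}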
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var rows = src.Sizes[0];
    var cols = src.Sizes[1];

    var ndim = src.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= src.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    Invoke(context, cudaContext, "gSoftmax", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
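// For reference, "gSoftmax" computes a row-wise softmax. A minimal CPU sketch
// of the same math, with max-subtraction as the usual numerical-stability
// trick (assumed, since the kernel source is not shown; this helper name is
// hypothetical):
private static void SoftmaxRowCpu(float[] row, float[] outRow)
{
    float max = row[0];
    for (int i = 1; i < row.Length; i++) { max = Math.Max(max, row[i]); }

    float sum = 0f;
    for (int i = 0; i < row.Length; i++)
    {
        outRow[i] = (float)Math.Exp(row[i] - max);
        sum += outRow[i];
    }

    for (int i = 0; i < row.Length; i++) { outRow[i] /= sum; }
}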
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    int ndim = grad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    long cols = grad.Sizes[ndim - 1];

    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;
    int iAddGrad = addGrad ? 1 : 0;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
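// For reference, the softmax gradient per row is
// grad_i (+)= val_i * (adj_i - sum_j adj_j * val_j), where val is the softmax
// output and adj the incoming gradient. A CPU sketch of one row (the
// accumulate-vs-assign switch mirrors the iAddGrad flag above; this helper
// name is hypothetical):
private static void SoftmaxGradRowCpu(float[] grad, float[] adj, float[] val, bool addGrad)
{
    float dot = 0f;
    for (int j = 0; j < val.Length; j++) { dot += adj[j] * val[j]; }

    for (int i = 0; i < val.Length; i++)
    {
        float gi = val[i] * (adj[i] - dot);
        grad[i] = addGrad ? grad[i] + gi : gi;
    }
}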
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    var rows = grad.Sizes[0];
    var cols = grad.Sizes[1];

    var ndim = grad.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= grad.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
}
//BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)
private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
{
    CudaContext cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    int ndim = mask.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= mask.Sizes[dim];
    }

    long cols = mask.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
    CUdeviceptr originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
    CUdeviceptr originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

    Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
}
public static Tensor Mul_M_M(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
{
    if (lhs.ElementType != rhs.ElementType || (result != null && result.ElementType != lhs.ElementType))
    {
        throw new InvalidOperationException("All tensors must have the same element type");
    }

    CudaHelpers.ThrowIfDifferentDevices(result, lhs, rhs);

    if (result != null && !(result.Storage is CudaStorage))
    {
        throw new ArgumentException("result must be a CUDA tensor", "result");
    }

    if (!(lhs.Storage is CudaStorage))
    {
        throw new ArgumentException("lhs must be a CUDA tensor", "lhs");
    }

    if (!(rhs.Storage is CudaStorage))
    {
        throw new ArgumentException("rhs must be a CUDA tensor", "rhs");
    }

    var writeTarget = TensorResultBuilder.GetWriteTarget(result, lhs, false, lhs.Sizes[0], rhs.Sizes[1]);
    Gemm(context, 1, lhs, rhs, 0, writeTarget);

    return writeTarget;
}
/// <summary>
/// Invokes the copy kernel from the specified kernel collection.
/// </summary>
/// <param name="kernels">The kernels.</param>
/// <param name="context">The context.</param>
/// <param name="cudaContext">The cuda context.</param>
/// <param name="result">The result.</param>
/// <param name="src">The source.</param>
public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, Tensor result, Tensor src)
{
    var ptx = kernels.GetPtx(context.Compiler);
    var elementCount = result.ElementCount();
    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount);
}
public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext srcContext = context.CudaContextForTensor(src);

    using (Tensor srcContig = Ops.AsContiguous(src))
    using (Tensor resultContig = AsTypeCpu(result, src.ElementType, true))
    {
        IntPtr resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
        CUdeviceptr srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);
        long totalBytes = totalElements * srcContig.ElementType.Size();

        // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
        // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
        CUResult res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }

        if (result.Storage != resultContig.Storage)
        {
            Ops.Copy(result, resultContig); // copy on CPU
        }
    }
}
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    if (result == null)
    {
        throw new ArgumentNullException("result");
    }

    if (result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    if (dim < 0 || dim >= result.DimensionCount)
    {
        throw new ArgumentOutOfRangeException("dim");
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (!src.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("src and indices must be the same size");
    }

    if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    Tensor writeTarget = result;

    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices))
    {
        int dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
        string kernelName = MakeKernelName(ScatterBaseName, true, dims);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, (int)nElement);
    }
    else
    {
        string kernelName = MakeKernelName(ScatterBaseName, false, -1);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);
    }

    return writeTarget;
}
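// Scatter semantics, for reference: with dim = 0 on 2D tensors,
// result[indices[i, j], j] = src[i, j]. A minimal CPU sketch of that 2D case
// (illustrative only; the CUDA kernels above generalize this to arbitrary
// dimensions and strides, and this helper name is hypothetical):
private static void Scatter2dDim0Cpu(float[,] result, float[,] src, int[,] indices)
{
    for (int i = 0; i < src.GetLength(0); i++)
    {
        for (int j = 0; j < src.GetLength(1); j++)
        {
            result[indices[i, j], j] = src[i, j];
        }
    }
}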
public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(grad);
    SoftmaxGrad(context, grad, adj, val, addGrad);
    return grad;
}
public Tensor RMSProp(Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(weight);
    RMSProp(context, weight, gradient, cache, batchSize, step_size, clipval, regc, decay_rate, eps);
    return weight;
}
public Tensor Adam(Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(weight);
    Adam(context, weight, gradient, v, m, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
    return weight;
}
public void AddLayerNormGrad(Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(inGrad);
    Tensor writeTarget1 = TensorResultBuilder.GetWriteTarget(out1Grad, inGrad, false, inGrad.Sizes);
    Tensor writeTarget2 = TensorResultBuilder.GetWriteTarget(out2Grad, inGrad, false, inGrad.Sizes);
    AddLayerNormGrad(context, writeTarget1, writeTarget2, alphaGrad, betaGrad, inGrad, y, x1, x2, alpha, beta, eps);
}
public Tensor BuildTriMask(Tensor result, float value, float maskedValue)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(result);
    BuildTriMask(context, result, value, maskedValue);
    return result;
}
public Tensor Softmax(Tensor result, Tensor src)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);
    Softmax(context, writeTarget, src);
    return writeTarget;
}
public Tensor AddLayerNorm(Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src1);
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src1, false, src1.Sizes);
    AddLayerNorm(context, writeTarget, src1, src2, alpha, beta, eps);
    return writeTarget;
}
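// A hedged usage sketch of the public wrappers above, written as a method on
// the same class. The tensor shapes and how the input tensors are constructed
// are assumptions for illustration; passing null as the result lets
// TensorResultBuilder allocate the output.
public Tensor AddAndNormalize(Tensor src1, Tensor src2, Tensor alpha, Tensor beta)
{
    // normed = LayerNorm(src1 + src2), scaled by alpha and shifted by beta.
    Tensor normed = AddLayerNorm(null, src1, src2, alpha, beta);

    // Row-wise softmax over the last dimension of the normalized output.
    return Softmax(null, normed);
}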