/// <summary>
/// Reconstructs an image tensor from its column (im2col) representation by launching
/// the "col2im_kernel" CUDA kernel on the null stream.
/// </summary>
/// <param name="col">Source tensor containing the column representation.</param>
/// <param name="im">Destination image tensor.</param>
/// <param name="channels">Number of image channels.</param>
/// <param name="height">Image height.</param>
/// <param name="width">Image width.</param>
/// <param name="patch_h">Patch (kernel) height.</param>
/// <param name="patch_w">Patch (kernel) width.</param>
/// <param name="pad_h">Vertical padding.</param>
/// <param name="pad_w">Horizontal padding.</param>
/// <param name="stride_h">Vertical stride.</param>
/// <param name="stride_w">Horizontal stride.</param>
/// <param name="dilation_h">Vertical dilation.</param>
/// <param name="dilation_w">Horizontal dilation.</param>
public void Col2Im(Tensor col, Tensor im, int channels, int height, int width, int patch_h, int patch_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w)
{
    var tsContext = CudaHelpers.TSContextForTensor(im);
    var deviceContext = tsContext.CudaContextForTensor(im);

    // Effective kernel extent with dilation is (dilation * (patch - 1) + 1).
    int outputHeight = (height + 2 * pad_h - (dilation_h * (patch_h - 1) + 1)) / stride_h + 1;
    int outputWidth = (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1;

    // From Torch source:
    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
    int totalKernels = channels * height * width;

    var imBuffer = CudaHelpers.GetBufferStart(im);
    var colBuffer = CudaHelpers.GetBufferStart(col);

    Invoke(tsContext, deviceContext, "col2im_kernel",
        new dim3(NNThreads.NumBlocks(totalKernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
        totalKernels, colBuffer, height, width, channels,
        patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
        outputHeight, outputWidth, imBuffer);
}
/// <summary>
/// Unfolds an image tensor into its column (im2col) representation by launching
/// the "im2col_kernel" CUDA kernel on the null stream.
/// </summary>
/// <param name="im">Source image tensor.</param>
/// <param name="col">Destination tensor receiving the column representation.</param>
/// <param name="channels">Number of image channels.</param>
/// <param name="height">Image height.</param>
/// <param name="width">Image width.</param>
/// <param name="ksize_h">Kernel height.</param>
/// <param name="ksize_w">Kernel width.</param>
/// <param name="pad_h">Vertical padding.</param>
/// <param name="pad_w">Horizontal padding.</param>
/// <param name="stride_h">Vertical stride.</param>
/// <param name="stride_w">Horizontal stride.</param>
/// <param name="dilation_h">Vertical dilation.</param>
/// <param name="dilation_w">Horizontal dilation.</param>
public void Im2Col(Tensor im, Tensor col, int channels, int height, int width, int ksize_h, int ksize_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w)
{
    var tsContext = CudaHelpers.TSContextForTensor(im);
    var deviceContext = tsContext.CudaContextForTensor(im);

    // From Torch source:
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int outputHeight = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
    int outputWidth = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
    int totalKernels = channels * outputHeight * outputWidth;

    var imBuffer = CudaHelpers.GetBufferStart(im);
    var colBuffer = CudaHelpers.GetBufferStart(col);

    Invoke(tsContext, deviceContext, "im2col_kernel",
        new dim3(NNThreads.NumBlocks(totalKernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
        totalKernels, imBuffer, height, width,
        ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
        outputHeight, outputWidth, colBuffer);
}
/// <summary>
/// Copies <paramref name="totalElements"/> elements of <paramref name="src"/> (a GPU tensor)
/// into <paramref name="result"/> (a CPU tensor), making both sides contiguous first and
/// performing a single synchronous device-to-host memcpy.
/// </summary>
/// <param name="result">Destination CPU tensor.</param>
/// <param name="src">Source GPU tensor.</param>
/// <param name="totalElements">Number of elements to copy; multiplied by the element size to get the byte count.</param>
/// <exception cref="CudaException">Thrown when the driver-level memcpy does not return CUResult.Success.</exception>
public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
{
    var context = CudaHelpers.TSContextForTensor(src);
    // NOTE(review): srcContext is never used below — presumably retained for its lookup
    // side effects or left over from a refactor; confirm before removing.
    var srcContext = context.CudaContextForTensor(src);

    // Both sides must be contiguous so one flat memcpy covers the whole range.
    using (var srcContig = Ops.AsContiguous(src))
    using (var resultContig = AsTypeCpu(result, src.ElementType, true))
    {
        var resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
        var srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);
        var totalBytes = totalElements * srcContig.ElementType.Size();

        // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
        // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
        var res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }

        // AsTypeCpu may have handed back a temporary (different storage); if so, copy
        // the staged data into the caller's tensor on the CPU side.
        if (result.Storage != resultContig.Storage)
        {
            Ops.Copy(result, resultContig); // copy on CPU
        }
    }
}
/// <summary>
/// Backward pass of 2D spatial max pooling: routes each gradOutput element back to the
/// input position recorded in <paramref name="indices"/> via the "MaxPoolBackward" kernel.
/// Expects NCHW layout (batch, channels, height, width).
/// </summary>
/// <param name="input">Forward-pass input tensor (used only for its sizes and element count).</param>
/// <param name="gradOutput">Gradient w.r.t. the pooling output.</param>
/// <param name="gradInput">Gradient w.r.t. the pooling input (written by the kernel).</param>
/// <param name="indices">Argmax indices recorded during the forward pass.</param>
/// <param name="cd">Pooling descriptor (kernel size, stride, padding).</param>
/// <param name="ceilMode">Unused here; kept for signature symmetry with the forward pass.</param>
public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var tsContext = CudaHelpers.TSContextForTensor(gradOutput);
    var deviceContext = tsContext.CudaContextForTensor(gradOutput);

    // NCHW dimension indices.
    const int dimc = 1;
    const int dimh = 2;
    const int dimw = 3;

    var batch = input.Sizes[0];
    var planes = input.Sizes[dimc];
    var inHeight = input.Sizes[dimh];
    var inWidth = input.Sizes[dimw];
    var outHeight = gradOutput.Sizes[dimh];
    var outWidth = gradOutput.Sizes[dimw];

    using var gradOutContig = Ops.AsContiguous(gradOutput);
    var gradOutPtr = CudaHelpers.GetBufferStart(gradOutContig);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);
    var gradInPtr = CudaHelpers.GetBufferStart(gradInput);

    // One thread per input element.
    var count = (int)input.ElementCount();

    this.Invoke(tsContext, deviceContext, "MaxPoolBackward",
        new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
        count, gradOutPtr, indicesPtr, batch, planes, inHeight, inWidth, outHeight, outWidth,
        cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInPtr);
}
/// <summary>
/// Compiles and runs an element-wise kernel over <paramref name="src"/>, choosing the
/// in-place ("t1_") variant when result and src are the same tensor, otherwise the
/// two-tensor ("t2_") variant. Logs and rethrows any failure.
/// </summary>
/// <param name="kernels">Kernel collection providing the PTX.</param>
/// <param name="funcName">Base kernel name (prefixed with "t1_" or "t2_").</param>
/// <param name="result">Optional result tensor; may alias <paramref name="src"/>.</param>
/// <param name="src">Source tensor.</param>
/// <returns>The tensor the kernel wrote into.</returns>
public static Tensor Invoke(CudaCode kernels, string funcName, Tensor result, Tensor src)
{
    try
    {
        var context = CudaHelpers.TSContextForTensor(src);
        var cudaContext = context.CudaContextForTensor(src);
        cudaContext.SetCurrent();

        var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
        var elementCount = writeTarget.ElementCount();
        var ptx = kernels.GetPtx(context.Compiler);

        if (result == src)
        {
            // In-place: single-tensor kernel variant.
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t1_" + funcName, writeTarget, elementCount);
        }
        else
        {
            // Out-of-place: read src, write writeTarget.
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t2_" + funcName, writeTarget, src, elementCount);
        }

        return writeTarget;
    }
    catch (Exception ex)
    {
        Logger.WriteLine($"Error = '{ex.Message}', Call stack = '{ex.StackTrace}'");
        throw;
    }
}
/// <summary>
/// Scatters the elements of <paramref name="src"/> into <paramref name="result"/> along
/// dimension <paramref name="dim"/>, with <paramref name="indices"/> giving each element's
/// target position in that dimension. Dispatches a specialized kernel when all tensors
/// fit 32-bit index math.
/// </summary>
/// <param name="result">Destination tensor; must be non-null and match src's rank.</param>
/// <param name="src">Source tensor; must be the same size as <paramref name="indices"/>.</param>
/// <param name="dim">Dimension along which to scatter.</param>
/// <param name="indices">Index tensor, same size/rank as <paramref name="src"/>.</param>
/// <returns>The <paramref name="result"/> tensor.</returns>
/// <exception cref="ArgumentNullException">result is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">dim is outside result's dimensions.</exception>
/// <exception cref="InvalidOperationException">Tensor ranks or sizes are incompatible.</exception>
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    if (result == null)
    {
        throw new ArgumentNullException(nameof(result));
    }

    if (result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    // BUG FIX: was `dim < 0 && dim >= result.DimensionCount`, a condition that can never
    // be true, so out-of-range dims slipped through unchecked.
    if (dim < 0 || dim >= result.DimensionCount)
    {
        throw new ArgumentOutOfRangeException(nameof(dim));
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (!src.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("src and indices must be the same size");
    }

    if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    Tensor writeTarget = result;

    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices))
    {
        // Specialized kernels exist for ranks 1-3; -1 selects the generic variant.
        int dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
        string kernelName = MakeKernelName(ScatterBaseName, true, dims);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, (int)nElement);
    }
    else
    {
        string kernelName = MakeKernelName(ScatterBaseName, false, -1);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);
    }

    return writeTarget;
}
/// <summary>
/// Fills <paramref name="result"/> with a triangular mask of <paramref name="value"/> and
/// <paramref name="maskedValue"/> by delegating to the context-taking overload.
/// </summary>
/// <param name="result">Tensor to fill in place.</param>
/// <param name="value">Value for unmasked positions.</param>
/// <param name="maskedValue">Value for masked positions.</param>
/// <returns>The same <paramref name="result"/> tensor.</returns>
public Tensor BuildTriMask(Tensor result, float value, float maskedValue)
{
    var ctx = CudaHelpers.TSContextForTensor(result);
    BuildTriMask(ctx, result, value, maskedValue);
    return result;
}
/// <summary>
/// Accumulates the softmax gradient into <paramref name="grad"/> by delegating to the
/// context-taking overload.
/// </summary>
/// <param name="grad">Gradient tensor, updated in place.</param>
/// <param name="adj">Adjoint (incoming gradient) tensor.</param>
/// <param name="val">Softmax output values from the forward pass.</param>
/// <returns>The same <paramref name="grad"/> tensor.</returns>
public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val)
{
    var ctx = CudaHelpers.TSContextForTensor(grad);
    SoftmaxGrad(ctx, grad, adj, val);
    return grad;
}
/// <summary>
/// Applies one SGD update step to <paramref name="weight"/> in place, delegating to the
/// context-taking overload.
/// </summary>
/// <param name="weight">Weight tensor, updated in place.</param>
/// <param name="gradient">Gradient tensor.</param>
/// <param name="cache">Optimizer cache tensor.</param>
/// <param name="lrw">Per-weight learning-rate tensor.</param>
/// <param name="batchSize">Batch size used to scale the gradient.</param>
/// <param name="step_size">Learning rate.</param>
/// <param name="clipval">Gradient clipping threshold.</param>
/// <param name="regc">Regularization coefficient.</param>
/// <param name="decay_rate">Cache decay rate.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The same <paramref name="weight"/> tensor.</returns>
public Tensor SGD(Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var ctx = CudaHelpers.TSContextForTensor(weight);
    SGD(ctx, weight, gradient, cache, lrw, batchSize, step_size, clipval, regc, decay_rate, eps);
    return weight;
}
/// <summary>
/// Computes the softmax gradient into <paramref name="grad"/>, optionally accumulating
/// onto the existing values, by delegating to the context-taking overload.
/// </summary>
/// <param name="grad">Gradient tensor, updated in place.</param>
/// <param name="adj">Adjoint (incoming gradient) tensor.</param>
/// <param name="val">Softmax output values from the forward pass.</param>
/// <param name="addGrad">When true, adds to the existing gradient instead of overwriting.</param>
/// <returns>The same <paramref name="grad"/> tensor.</returns>
public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    var ctx = CudaHelpers.TSContextForTensor(grad);
    SoftmaxGrad(ctx, grad, adj, val, addGrad);
    return grad;
}
/// <summary>
/// Applies one Adam update step to <paramref name="weight"/> in place, delegating to the
/// context-taking overload.
/// </summary>
/// <param name="weight">Weight tensor, updated in place.</param>
/// <param name="gradient">Gradient tensor.</param>
/// <param name="v">Second-moment (velocity) tensor.</param>
/// <param name="m">First-moment (momentum) tensor.</param>
/// <param name="batchSize">Batch size used to scale the gradient.</param>
/// <param name="step_size">Learning rate.</param>
/// <param name="clipval">Gradient clipping threshold.</param>
/// <param name="regc">Regularization coefficient.</param>
/// <param name="decay_rate_v">Decay rate for the second moment.</param>
/// <param name="decay_rate_m">Decay rate for the first moment.</param>
/// <param name="iter">Iteration number (for bias correction).</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The same <paramref name="weight"/> tensor.</returns>
public Tensor Adam(Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var ctx = CudaHelpers.TSContextForTensor(weight);
    Adam(ctx, weight, gradient, v, m, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
    return weight;
}
/// <summary>
/// Backward pass of the fused add + layer-norm: resolves write targets shaped like
/// <paramref name="inGrad"/> for both input branches, then delegates to the
/// context-taking overload.
/// </summary>
/// <param name="out1Grad">Gradient for the first input branch (may be null; allocated if so).</param>
/// <param name="out2Grad">Gradient for the second input branch (may be null; allocated if so).</param>
/// <param name="alphaGrad">Gradient for the scale parameter.</param>
/// <param name="betaGrad">Gradient for the bias parameter.</param>
/// <param name="inGrad">Incoming gradient.</param>
/// <param name="y">Layer-norm output from the forward pass.</param>
/// <param name="x1">First forward input.</param>
/// <param name="x2">Second forward input.</param>
/// <param name="alpha">Scale parameter.</param>
/// <param name="beta">Bias parameter.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
public void AddLayerNormGrad(Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var ctx = CudaHelpers.TSContextForTensor(inGrad);
    var target1 = TensorResultBuilder.GetWriteTarget(out1Grad, inGrad, false, inGrad.Sizes);
    var target2 = TensorResultBuilder.GetWriteTarget(out2Grad, inGrad, false, inGrad.Sizes);
    AddLayerNormGrad(ctx, target1, target2, alphaGrad, betaGrad, inGrad, y, x1, x2, alpha, beta, eps);
}
/// <summary>
/// Applies one RMSProp update step to <paramref name="weight"/> in place, delegating to
/// the context-taking overload.
/// </summary>
/// <param name="weight">Weight tensor, updated in place.</param>
/// <param name="gradient">Gradient tensor.</param>
/// <param name="cache">Running average of squared gradients.</param>
/// <param name="batchSize">Batch size used to scale the gradient.</param>
/// <param name="step_size">Learning rate.</param>
/// <param name="clipval">Gradient clipping threshold.</param>
/// <param name="regc">Regularization coefficient.</param>
/// <param name="decay_rate">Cache decay rate.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The same <paramref name="weight"/> tensor.</returns>
public Tensor RMSProp(Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var ctx = CudaHelpers.TSContextForTensor(weight);
    RMSProp(ctx, weight, gradient, cache, batchSize, step_size, clipval, regc, decay_rate, eps);
    return weight;
}
/// <summary>
/// Allocates a new Float32 tensor of shape [batchSize, paddedLength, paddedLength] and
/// fills it with a padded self-attention triangular mask via the context-taking overload.
/// </summary>
/// <param name="originalLengths">Per-sequence original lengths (pre-padding).</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="paddedLength">Padded sequence length.</param>
/// <returns>The freshly allocated mask tensor.</returns>
public Tensor BuildPadSelfTriMask(Tensor originalLengths, int batchSize, int paddedLength)
{
    var ctx = CudaHelpers.TSContextForTensor(originalLengths);
    var maskShape = new long[] { batchSize, paddedLength, paddedLength };
    var mask = TensorResultBuilder.GetWriteTarget(null, originalLengths.Allocator, DType.Float32, true, maskShape);
    BuildPadSelfTriMask(ctx, mask, originalLengths, batchSize);
    return mask;
}
/// <summary>
/// Applies layer normalization to <paramref name="src"/>, writing into
/// <paramref name="result"/> (or a newly allocated tensor of the same shape when null),
/// via the context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="src">Input tensor.</param>
/// <param name="alpha">Scale parameter.</param>
/// <param name="beta">Bias parameter.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The tensor holding the normalized result.</returns>
public Tensor LayerNorm(Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var ctx = CudaHelpers.TSContextForTensor(src);
    var target = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
    this.LayerNorm(ctx, target, src, alpha, beta, eps);
    return target;
}
/// <summary>
/// Applies a masked softmax over <paramref name="src"/>, writing into
/// <paramref name="result"/> (or a newly allocated tensor of the same shape when null),
/// via the context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="src">Input tensor.</param>
/// <param name="mask">Mask tensor applied before the softmax.</param>
/// <returns>The tensor holding the masked softmax result.</returns>
public Tensor SoftmaxMask(Tensor result, Tensor src, Tensor mask)
{
    var ctx = CudaHelpers.TSContextForTensor(src);
    var target = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);
    this.SoftmaxMask(ctx, target, src, mask);
    return target;
}
/// <summary>
/// Builds a self-attention triangular mask of shape
/// [originalLengths.Sizes[0], paddedSeqLen, paddedSeqLen], writing into
/// <paramref name="result"/> (or a newly allocated tensor when null), via the
/// context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="originalLengths">Per-sequence original lengths (pre-padding).</param>
/// <param name="paddedSeqLen">Padded sequence length.</param>
/// <param name="value">Value for unmasked positions.</param>
/// <param name="maskedValue">Value for masked positions.</param>
/// <returns>The tensor holding the mask.</returns>
public Tensor BuildSelfTriMask(Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
{
    var ctx = CudaHelpers.TSContextForTensor(originalLengths);
    var maskShape = new long[] { originalLengths.Sizes[0], paddedSeqLen, paddedSeqLen };
    var target = TensorResultBuilder.GetWriteTarget(result, originalLengths, true, maskShape);
    BuildSelfTriMask(ctx, target, originalLengths, paddedSeqLen, value, maskedValue);
    return target;
}
/// <summary>
/// Runs the "fill" kernel over every element of <paramref name="result"/>, setting each
/// to <paramref name="value"/>.
/// </summary>
/// <param name="kernels">Kernel collection providing the PTX.</param>
/// <param name="result">Array to fill in place.</param>
/// <param name="value">Fill value.</param>
public static void Invoke(FillCopyKernels kernels, NDArray result, float value)
{
    // NOTE(review): unlike the Tensor overload of this method, this one does not call
    // cudaContext.SetCurrent() before dispatch — confirm whether that is intentional.
    TSCudaContext context = CudaHelpers.TSContextForTensor(result);
    CudaContext cudaContext = context.CudaContextForTensor(result);
    long elementCount = result.ElementCount();
    byte[] ptx = kernels.GetPtx(context.Compiler);
    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
}
/// <summary>
/// Allocates a new Float32 tensor of shape [batchSize, paddedTgtLength, paddedSrcLength]
/// and fills it with a source/target attention mask via the context-taking overload.
/// </summary>
/// <param name="originalSrcLengths">Per-sequence source lengths (pre-padding).</param>
/// <param name="originalTgtLengths">Per-sequence target lengths (pre-padding).</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="paddedSrcLength">Padded source sequence length.</param>
/// <param name="paddedTgtLength">Padded target sequence length.</param>
/// <returns>The freshly allocated mask tensor.</returns>
public Tensor BuildSrcTgtMask(Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize, int paddedSrcLength, int paddedTgtLength)
{
    var ctx = CudaHelpers.TSContextForTensor(originalSrcLengths);
    var maskShape = new long[] { batchSize, paddedTgtLength, paddedSrcLength };
    var mask = TensorResultBuilder.GetWriteTarget(null, originalSrcLengths.Allocator, DType.Float32, true, maskShape);
    this.BuildSrcTgtMask(ctx, mask, originalSrcLengths, originalTgtLengths, batchSize);
    return mask;
}
/// <summary>
/// Fused add + layer normalization: normalizes (src1 + src2), writing into
/// <paramref name="result"/> (or a newly allocated tensor shaped like src1 when null),
/// via the context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="src1">First input tensor.</param>
/// <param name="src2">Second input tensor.</param>
/// <param name="alpha">Scale parameter.</param>
/// <param name="beta">Bias parameter.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The tensor holding the normalized sum.</returns>
public Tensor AddLayerNorm(Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var ctx = CudaHelpers.TSContextForTensor(src1);
    var target = TensorResultBuilder.GetWriteTarget(result, src1, false, src1.Sizes);
    AddLayerNorm(ctx, target, src1, src2, alpha, beta, eps);
    return target;
}
/// <summary>
/// Applies softmax to <paramref name="src"/>, writing into <paramref name="result"/>
/// (or a newly allocated tensor of the same shape when null), via the context-taking
/// overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="src">Input tensor.</param>
/// <returns>The tensor holding the softmax result.</returns>
public Tensor Softmax(Tensor result, Tensor src)
{
    var ctx = CudaHelpers.TSContextForTensor(src);
    var target = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);
    Softmax(ctx, target, src);
    return target;
}
/// <summary>
/// Backward pass of layer normalization: computes the input gradient into
/// <paramref name="outGrad"/> (or a newly allocated tensor shaped like inGrad when null)
/// via the context-taking overload.
/// </summary>
/// <param name="outGrad">Optional destination tensor for the input gradient.</param>
/// <param name="alphaGrad">Gradient for the scale parameter.</param>
/// <param name="betaGrad">Gradient for the bias parameter.</param>
/// <param name="inGrad">Incoming gradient.</param>
/// <param name="y">Layer-norm output from the forward pass.</param>
/// <param name="x">Forward input.</param>
/// <param name="alpha">Scale parameter.</param>
/// <param name="beta">Bias parameter.</param>
/// <param name="eps">Numerical-stability epsilon.</param>
/// <returns>The tensor holding the input gradient.</returns>
public Tensor LayerNormGrad(Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var ctx = CudaHelpers.TSContextForTensor(inGrad);
    var target = TensorResultBuilder.GetWriteTarget(outGrad, inGrad, false, inGrad.Sizes);
    LayerNormGrad(ctx, target, alphaGrad, betaGrad, inGrad, y, x, alpha, beta, eps);
    return target;
}
/// <summary>
/// Builds a source/target attention mask of shape
/// [srcOriginalLengths.Sizes[0], tgtPaddedSeqLen, srcPaddedSeqLen], writing into
/// <paramref name="result"/> (or a newly allocated tensor when null), via the
/// context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="srcOriginalLengths">Per-sequence source lengths (pre-padding).</param>
/// <param name="tgtOriginalLengths">Per-sequence target lengths (pre-padding).</param>
/// <param name="srcPaddedSeqLen">Padded source sequence length.</param>
/// <param name="tgtPaddedSeqLen">Padded target sequence length.</param>
/// <param name="value">Value for unmasked positions.</param>
/// <param name="maskedValue">Value for masked positions.</param>
/// <returns>The tensor holding the mask.</returns>
public Tensor BuildSrcTgtMask(Tensor result, Tensor srcOriginalLengths, Tensor tgtOriginalLengths, int srcPaddedSeqLen, int tgtPaddedSeqLen, float value, float maskedValue)
{
    var ctx = CudaHelpers.TSContextForTensor(srcOriginalLengths);
    var maskShape = new long[] { srcOriginalLengths.Sizes[0], tgtPaddedSeqLen, srcPaddedSeqLen };
    var target = TensorResultBuilder.GetWriteTarget(result, srcOriginalLengths, true, maskShape);
    BuildSrcTgtMask(ctx, target, srcOriginalLengths, tgtOriginalLengths, tgtPaddedSeqLen, value, maskedValue);
    return target;
}
/// <summary>
/// Selects rows of <paramref name="src"/> given by <paramref name="indice"/>, writing a
/// tensor of shape [indice.Sizes[0], src.Sizes[1]] into <paramref name="result"/>
/// (or a newly allocated tensor when null), via the context-taking overload.
/// </summary>
/// <param name="result">Optional destination tensor.</param>
/// <param name="src">Source tensor (rows are selected along dimension 0).</param>
/// <param name="indice">Row index tensor.</param>
/// <returns>The tensor holding the selected rows.</returns>
public Tensor IndexSelect(Tensor result, Tensor src, Tensor indice)
{
    var ctx = CudaHelpers.TSContextForTensor(src);
    var outShape = new long[] { indice.Sizes[0], src.Sizes[1] };
    var target = TensorResultBuilder.GetWriteTarget(result, src, true, outShape);
    IndexSelect(ctx, target, src, indice);
    return target;
}
/// <summary>
/// Gathers elements from <paramref name="src"/> along dimension <paramref name="dim"/>
/// at the positions given by <paramref name="indices"/>; the output has the shape of
/// <paramref name="indices"/>. Dispatches a specialized kernel when all tensors fit
/// 32-bit index math.
/// </summary>
/// <param name="result">Optional destination tensor; when non-null it must match indices' size and src's rank.</param>
/// <param name="src">Source tensor.</param>
/// <param name="dim">Dimension along which to gather.</param>
/// <param name="indices">Index tensor, same rank as <paramref name="src"/>.</param>
/// <returns>The tensor holding the gathered values.</returns>
/// <exception cref="ArgumentOutOfRangeException">dim is outside result's dimensions.</exception>
/// <exception cref="InvalidOperationException">Tensor ranks or sizes are incompatible.</exception>
public Tensor Gather(Tensor result, Tensor src, int dim, Tensor indices)
{
    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForTensor(src);

    if (result != null && result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    // BUG FIX: was `result != null && dim < 0 && dim >= result.DimensionCount`, a
    // condition that can never be true, so out-of-range dims slipped through unchecked.
    if (result != null && (dim < 0 || dim >= result.DimensionCount))
    {
        throw new ArgumentOutOfRangeException(nameof(dim));
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (result != null && !result.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("result and indices must be the same size");
    }

    if (result != null && !TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    var writeTarget = TensorResultBuilder.GetWriteTarget(result, indices.Allocator, src.ElementType, false, indices.Sizes);

    var nElement = indices.ElementCount();
    var block = ApplyUtils.GetApplyBlock();
    var grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices))
    {
        // Specialized kernels exist for ranks 1-3; -1 selects the generic variant.
        var dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
        var kernelName = MakeKernelName(GatherBaseName, true, dims);
        this.Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, (int)nElement);
    }
    else
    {
        var kernelName = MakeKernelName(GatherBaseName, false, -1);
        this.Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);
    }

    return writeTarget;
}
/// <summary>
/// Resolves a write target shaped like <paramref name="ids"/>, zero-fills it, then
/// accumulates costs via the context-taking overload.
/// </summary>
/// <param name="costs">Optional destination tensor.</param>
/// <param name="weight">Weight tensor consulted by the kernel.</param>
/// <param name="ids">Id tensor determining the output shape.</param>
/// <returns>The tensor holding the updated costs.</returns>
public Tensor UpdateCost(Tensor costs, Tensor weight, Tensor ids)
{
    var ctx = CudaHelpers.TSContextForTensor(weight);
    var target = TensorResultBuilder.GetWriteTarget(costs, weight, true, ids.Sizes);
    // Start from zero so the kernel accumulates into a clean buffer.
    Ops.Fill(target, 0.0f);
    this.UpdateCost(ctx, weight, ids, target);
    return target;
}
/// <summary>
/// Scatters the elements of <paramref name="src"/> into <paramref name="result"/> along
/// dimension <paramref name="dim"/>, with <paramref name="indices"/> giving each element's
/// target position in that dimension. Always uses the generic 64-bit-index
/// "scatter_kernel"; logs and rethrows any failure.
/// </summary>
/// <param name="result">Destination tensor; must be non-null and match src's rank.</param>
/// <param name="src">Source tensor; must be the same size as <paramref name="indices"/>.</param>
/// <param name="dim">Dimension along which to scatter.</param>
/// <param name="indices">Index tensor, same size/rank as <paramref name="src"/>.</param>
/// <returns>The <paramref name="result"/> tensor.</returns>
/// <exception cref="ArgumentNullException">result is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">dim is outside result's dimensions.</exception>
/// <exception cref="InvalidOperationException">Tensor ranks or sizes are incompatible.</exception>
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    try
    {
        TSCudaContext context = CudaHelpers.TSContextForTensor(src);
        CudaContext cudaContext = context.CudaContextForTensor(src);

        if (result == null)
        {
            throw new ArgumentNullException(nameof(result));
        }

        if (result.DimensionCount != src.DimensionCount)
        {
            throw new InvalidOperationException($"result and src must have same number of dimensions. result dim count = '{result.DimensionCount}', source dim count = '{src.DimensionCount}'");
        }

        // BUG FIX: was `dim < 0 && dim >= result.DimensionCount`, a condition that can
        // never be true, so out-of-range dims slipped through unchecked.
        if (dim < 0 || dim >= result.DimensionCount)
        {
            throw new ArgumentOutOfRangeException(nameof(dim));
        }

        if (indices.DimensionCount != src.DimensionCount)
        {
            throw new InvalidOperationException("src and indices must have same number of dimensions");
        }

        if (!src.IsSameSizeAs(indices))
        {
            throw new InvalidOperationException("src and indices must be the same size");
        }

        if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
        {
            throw new InvalidOperationException("result and src must be the same size except in dimension dim");
        }

        Tensor writeTarget = result;

        long nElement = indices.ElementCount();
        dim3 block = ApplyUtils.GetApplyBlock();
        dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

        Invoke(context, cudaContext, "scatter_kernel", grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);

        return writeTarget;
    }
    catch (Exception err)
    {
        Logger.WriteLine($"Error = '{err.Message}', Call stack = '{err.StackTrace}'");
        throw;
    }
}
/// <summary>
/// Forward pass of 2D spatial max pooling over an NCHW <paramref name="input"/> tensor:
/// computes the output spatial size (ceil or floor mode), then launches the
/// "MaxPoolForward" kernel, writing pooled values to <paramref name="output"/> and the
/// argmax positions to <paramref name="indices"/> for use by the backward pass.
/// </summary>
/// <param name="input">Input tensor in NCHW layout.</param>
/// <param name="output">Destination tensor for pooled values; assumed pre-sized to the computed output shape — TODO confirm caller contract.</param>
/// <param name="indices">Destination tensor for argmax indices.</param>
/// <param name="cd">Pooling descriptor (kernel size kH/kW, stride dH/dW, padding padH/padW).</param>
/// <param name="ceilMode">When true, uses ceiling division for the output size (may add a partial window); otherwise floor.</param>
public void SpatialMaxPoolingForward(Tensor input, Tensor output, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var context = CudaHelpers.TSContextForTensor(input);
    var cudaContext = context.CudaContextForTensor(input);

    // NCHW: [batch, planes, height, width].
    var iwidth = input.Sizes[3];
    var iheight = input.Sizes[2];
    var nInputPlane = input.Sizes[1];
    var batchSize = input.Sizes[0];

    // Output spatial size: (in - k + 2*pad) / stride + 1, rounded up or down per ceilMode.
    long owidth;
    long oheight;
    if (ceilMode)
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }
    else
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }

    if (cd.padW != 0 || cd.padH != 0)
    {
        // ensure that the last pooling starts inside the image
        if ((oheight - 1) * cd.dH >= iheight + cd.padH)
        {
            --oheight;
        }

        if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
        {
            --owidth;
        }
    }

    // Kernel assumes a contiguous input layout.
    using var inputContig = Ops.AsContiguous(input);
    var inputPtr = CudaHelpers.GetBufferStart(inputContig);
    var outputPtr = CudaHelpers.GetBufferStart(output);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);

    // One thread per output element.
    var count = (int)output.ElementCount();

    this.Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
        count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
}
/// <summary>
/// Runs the "fill" kernel over every element of <paramref name="result"/>, setting each
/// to <paramref name="value"/>. Makes the tensor's CUDA context current before dispatch.
/// </summary>
/// <param name="kernels">Kernel collection providing the PTX.</param>
/// <param name="result">Tensor to fill in place.</param>
/// <param name="value">Fill value.</param>
public static void Invoke(FillCopyKernels kernels, Tensor result, float value)
{
    var context = CudaHelpers.TSContextForTensor(result);
    var cudaContext = context.CudaContextForTensor(result);
    cudaContext.SetCurrent();

    var elementCount = result.ElementCount();
    var ptx = kernels.GetPtx(context.Compiler);
    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
}
/// <summary>
/// Backward pass of IndexSelect: accumulates the adjoint <paramref name="adj"/> into
/// <paramref name="grad"/> at the rows given by <paramref name="indice"/>, via the
/// context-taking overload.
/// </summary>
/// <param name="grad">Gradient tensor, updated in place; must not be null.</param>
/// <param name="adj">Adjoint (incoming gradient) tensor.</param>
/// <param name="indice">Row index tensor used during the forward pass.</param>
/// <returns>The same <paramref name="grad"/> tensor.</returns>
/// <exception cref="ArgumentNullException"><paramref name="grad"/> is null.</exception>
public Tensor IndexSelectGrad(Tensor grad, Tensor adj, Tensor indice)
{
    if (grad == null)
    {
        // BUG FIX: the message was previously passed as the single-string paramName
        // argument (and needlessly used string interpolation); use the
        // (paramName, message) overload so both fields are populated correctly.
        throw new ArgumentNullException(nameof(grad), "Tensor grad should not be null.");
    }

    TSCudaContext context = CudaHelpers.TSContextForTensor(adj);
    IndexSelectGrad(context, grad, adj, indice);
    return grad;
}