/// <summary>
/// Copies a tensor from GPU memory to CPU memory.
/// </summary>
/// <param name="result">The destination CPU tensor.</param>
/// <param name="src">The source GPU tensor.</param>
/// <param name="totalElements">The total number of elements to copy.</param>
/// <exception cref="CudaException">Thrown when the underlying CUDA memcpy fails.</exception>
public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
{
    var context = CudaHelpers.TSContextForTensor(src);
    var srcContext = context.CudaContextForTensor(src);

    using (var srcContig = Ops.AsContiguous(src))
    using (var resultContig = AsTypeCpu(result, src.ElementType, true))
    {
        var resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
        var srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);
        var totalBytes = totalElements * srcContig.ElementType.Size();

        // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
        // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
        var res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }

        if (result.Storage != resultContig.Storage)
        {
            Ops.Copy(result, resultContig); // copy on CPU
        }
    }
}
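A stand-alone sketch (plain C# arithmetic, not library code) of why the raw driver call matters here: once a tensor exceeds about 4 GiB, the byte count no longer fits in a uint, so a uint-based CopyToHost overload could not express the copy at all.

using System;

class UIntOverflowDemo
{
    static void Main()
    {
        long totalElements = 2L * 1024 * 1024 * 1024; // 2^31 float elements
        long elementSize = sizeof(float);             // 4 bytes each
        long totalBytes = totalElements * elementSize;

        // 8 GiB exceeds uint.MaxValue (~4 GiB), so an overload taking a uint
        // byte count cannot express this copy; the long-based driver call can.
        Console.WriteLine(totalBytes > uint.MaxValue); // True
    }
}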
public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var context = CudaHelpers.TSContextForTensor(gradOutput);
    var cudaContext = context.CudaContextForTensor(gradOutput);

    var dimw = 3;
    var dimh = 2;
    var dimc = 1;

    var nbatch = input.Sizes[0];
    var nslices = input.Sizes[dimc];
    var iheight = input.Sizes[dimh];
    var iwidth = input.Sizes[dimw];
    var owidth = gradOutput.Sizes[dimw];
    var oheight = gradOutput.Sizes[dimh];

    using var gradOutputContig = Ops.AsContiguous(gradOutput);
    var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);
    var gradInputPtr = CudaHelpers.GetBufferStart(gradInput);

    var count = (int)input.ElementCount();

    this.Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
                cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
}
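The launch configuration presumably covers all `count` elements with a ceil-divided grid. The following stand-alone sketch shows that convention; the `NumBlocks` body and the block size of 256 are assumptions for illustration, not the actual `NNThreads` implementation.

using System;

class LaunchConfigDemo
{
    // Assumed block size for illustration; the real NNThreads value may differ.
    const int NumThreads = 256;

    // Assumed ceil-division convention, so that blocks * threads >= count.
    static int NumBlocks(int count) => (count + NumThreads - 1) / NumThreads;

    static void Main()
    {
        int count = 1000;
        Console.WriteLine(NumBlocks(count));              // 4
        Console.WriteLine(NumBlocks(count) * NumThreads); // 1024, covering all 1000 elements
    }
}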
public IWeightTensor Permute(IWeightTensor w, params int[] dims)
{
    var m = w as WeightTensor;
    WeightTensor res = m_weightTensorFactory.CreateWeightTensor(m.Sizes, m_deviceId, name: $"{GetHashString(w.Name)}.Permute");
    VisualizeNodes(w, res);

    using (var tWPermute = m.TWeight.Permute(dims))
    {
        res.TWeight = Ops.AsContiguous(tWPermute);
    }

    if (m_needsBackprop)
    {
        Action backward = () =>
        {
            using (var gT = m.TGradient.Permute(dims))
            {
                Ops.Add(gT, gT, res.TGradient);
            }
            res.Dispose();
        };
        this.m_backprop.Add(backward);
    }

    return res;
}
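The backward action works because adding the output gradient into a dims-permuted view of the source gradient accumulates each element back at the position it came from. A minimal stand-alone sketch for the 2D transpose case (where the permutation is its own inverse), using only plain arrays:

using System;

class PermuteGradDemo
{
    static void Main()
    {
        // Forward: y = transpose(x), so each x[i,j] feeds exactly one y[j,i].
        // Backward therefore accumulates dx[i,j] += dy[j,i], which is what
        // adding dy into a permuted *view* of dx achieves without a copy.
        var dy = new double[,] { { 1, 2 }, { 3, 4 }, { 5, 6 } }; // gradient w.r.t. y (3x2)
        var dx = new double[2, 3];                               // gradient w.r.t. x (2x3)

        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 3; j++)
            {
                dx[i, j] += dy[j, i];
            }
        }

        Console.WriteLine(dx[0, 2]); // 5
    }
}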
public static void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    int dimw = 3;
    int dimh = 2;
    int dimc = 1;

    long nbatch = input.Sizes[0];
    long nslices = input.Sizes[dimc];
    long iheight = input.Sizes[dimh];
    long iwidth = input.Sizes[dimw];
    long owidth = gradOutput.Sizes[dimw];
    long oheight = gradOutput.Sizes[dimh];

    Ops.Fill(gradInput, 0);

    using Tensor gradOutputContig = Ops.AsContiguous(gradOutput);
    for (int i = 0; i < nbatch; ++i)
    {
        using Tensor gradInput_i = gradInput.Select(0, i);
        using Tensor gradOutput_i = gradOutputContig.Select(0, i);
        using Tensor indices_i = indices.Select(0, i);

        using (NativeWrapper.BuildTensorRefPtr(gradInput_i, out IntPtr gradInput_iPtr))
        using (NativeWrapper.BuildTensorRefPtr(gradOutput_i, out IntPtr gradOutput_iPtr))
        using (NativeWrapper.BuildTensorRefPtr(indices_i, out IntPtr indices_iPtr))
        {
            CpuOpsNative.TS_SpatialMaxPooling_updateGradInput_frame(gradInput_iPtr, gradOutput_iPtr, indices_iPtr,
                                                                    nslices, iwidth, iheight, owidth, oheight,
                                                                    cd.dW, cd.dH);
        }
    }
}
public IWeightMatrix PermuteBatch(IWeightMatrix m, int batchSize)
{
    WeightTensor t = m as WeightTensor;
    var res = weightTensorFactory.CreateWeightTensor(m.Rows, m.Columns, deviceId);
    int sizeEveryBatch = m.Rows / batchSize;

    res.TWeight = Ops.AsContiguous(t.TWeight.View(sizeEveryBatch, batchSize, m.Columns).Permute(1, 0, 2)).View(m.Rows, m.Columns);

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            var g = t.TGradient.View(sizeEveryBatch, batchSize, m.Columns);
            var t2 = res.TGradient.View(batchSize, sizeEveryBatch, m.Columns).Permute(1, 0, 2);
            Ops.Add(g, g, t2);

            g.Dispose();
            t2.Dispose();
            res.Dispose();
        };
        this.backprop.Add(backward);
    }

    return res;
}
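What the View/Permute/View chain does to the row order, shown as a stand-alone sketch with plain arrays (the row labels like s0b0 are illustrative): rows stored step-major with interleaved batches come out grouped by batch.

using System;

class BatchPermuteDemo
{
    static void Main()
    {
        int sizeEveryBatch = 3, batchSize = 2;
        int rows = sizeEveryBatch * batchSize;

        // Input rows are interleaved by step: row = step * batchSize + batch.
        string[] input = { "s0b0", "s0b1", "s1b0", "s1b1", "s2b0", "s2b1" };
        var output = new string[rows];

        // View(sizeEveryBatch, batchSize, cols) + Permute(1, 0, 2) + View(rows, cols)
        // amounts to this row shuffle: rows become grouped by batch.
        for (int step = 0; step < sizeEveryBatch; step++)
        {
            for (int batch = 0; batch < batchSize; batch++)
            {
                output[batch * sizeEveryBatch + step] = input[step * batchSize + batch];
            }
        }

        Console.WriteLine(string.Join(" ", output)); // s0b0 s1b0 s2b0 s0b1 s1b1 s2b1
    }
}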
public void SpatialMaxPoolingForward(Tensor input, Tensor output, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var context = CudaHelpers.TSContextForTensor(input);
    var cudaContext = context.CudaContextForTensor(input);

    var iwidth = input.Sizes[3];
    var iheight = input.Sizes[2];
    var nInputPlane = input.Sizes[1];
    var batchSize = input.Sizes[0];

    long owidth;
    long oheight;

    if (ceilMode)
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }
    else
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }

    if (cd.padW != 0 || cd.padH != 0)
    {
        // ensure that the last pooling starts inside the image
        if ((oheight - 1) * cd.dH >= iheight + cd.padH)
        {
            --oheight;
        }

        if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
        {
            --owidth;
        }
    }

    using var inputContig = Ops.AsContiguous(input);
    var inputPtr = CudaHelpers.GetBufferStart(inputContig);
    var outputPtr = CudaHelpers.GetBufferStart(output);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);

    var count = (int)output.ElementCount();

    this.Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth,
                cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
}
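The output-size arithmetic above, extracted into a stand-alone helper (a sketch mirroring the formulas in SpatialMaxPoolingForward, not library code) to show how ceilMode changes the result:

using System;

class PoolOutputSizeDemo
{
    static long PooledSize(long inputSize, int kernel, int pad, int stride, bool ceilMode)
    {
        double span = (double)(inputSize - kernel + 2 * pad) / stride;
        long outSize = (long)(ceilMode ? Math.Ceiling(span) : Math.Floor(span)) + 1;

        // With padding, ensure the last window starts inside the padded image,
        // mirroring the clamp in the method above.
        if (pad != 0 && (outSize - 1) * stride >= inputSize + pad)
        {
            --outSize;
        }

        return outSize;
    }

    static void Main()
    {
        // 6-wide input, 3-wide kernel, stride 2, no padding:
        Console.WriteLine(PooledSize(6, 3, 0, 2, ceilMode: false)); // 2 (floor drops the partial window)
        Console.WriteLine(PooledSize(6, 3, 0, 2, ceilMode: true));  // 3 (ceil keeps it)
    }
}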
public IWeightTensor TransposeBatch(IWeightTensor m, int batchSize)
{
    WeightTensor t = m as WeightTensor;
    WeightTensor res = m_weightTensorFactory.CreateWeightTensor(t.Sizes, m_deviceId, name: $"{GetHashString(m.Name)}.TransposeBatch", graphToBind: this);
    VisualizeNodes(m, res);

    int sizeEveryBatch = m.Rows / batchSize;

    using (Tensor tWView = t.TWeight.View(sizeEveryBatch, batchSize, m.Columns))
    {
        using (Tensor tWViewPermute = tWView.Permute(1, 0, 2))
        {
            using (Tensor tW2 = Ops.AsContiguous(tWViewPermute))
            {
                res.TWeight = tW2.View(m.Rows, m.Columns);
                res.Sizes = res.TWeight.Sizes;
            }
        }
    }

    if (m_needsBackprop)
    {
        Action backward = () =>
        {
            res.ReleaseWeight();

            using (Tensor g = t.TGradient.View(sizeEveryBatch, batchSize, m.Columns))
            {
                using (Tensor t2 = res.TGradient.View(batchSize, sizeEveryBatch, m.Columns))
                {
                    using (Tensor t2Permute = t2.Permute(1, 0, 2))
                    {
                        Ops.Add(g, g, t2Permute);
                    }
                }
            }

            res.Dispose();
        };
        m_backprop.Add(backward);
    }

    return res;
}
/// <summary>
/// Copies a tensor from CPU memory to GPU memory.
/// </summary>
/// <param name="result">The destination GPU tensor.</param>
/// <param name="src">The source CPU tensor.</param>
/// <param name="totalElements">The total number of elements to copy.</param>
public void CopyCpuToGpu(Tensor result, Tensor src, long totalElements)
{
    var context = CudaHelpers.TSContextForTensor(result);
    var resultContext = context.CudaContextForTensor(result);

    // If types of src and result are different, convert on the CPU first.
    using (var srcContig = AsTypeCpu(src, result.ElementType, true))
    using (var resultContig = Ops.AsContiguous(result))
    {
        var resultContigPtr = ((CudaStorage)resultContig.Storage).DevicePtrAtElement(resultContig.StorageOffset);
        var srcContigPtr = ((Cpu.CpuStorage)srcContig.Storage).PtrAtElement(srcContig.StorageOffset);

        resultContext.CopyToDevice(resultContigPtr, srcContigPtr, totalElements * srcContig.ElementType.Size());

        if (result.Storage != resultContig.Storage)
        {
            CopyGpuDirect(result, resultContig, resultContext);
        }
    }
}
public IWeightTensor AsContiguous(IWeightTensor w, bool runGradient = true)
{
    WeightTensor m = w as WeightTensor;
    WeightTensor res = m_weightTensorFactory.CreateWeightTensor(m.Sizes, m_deviceId, name: $"{GetHashString(w.Name)}.AsContiguous");
    VisualizeNodes(w, res);

    res.TWeight = Ops.AsContiguous(m.TWeight);

    if (m_needsBackprop && runGradient)
    {
        Action backward = () =>
        {
            m.CopyOrAddGradient(res);
            res.Dispose();
        };
        m_backprop.Add(backward);
    }

    return res;
}
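Why AsContiguous is needed after operations like Permute: a permuted tensor keeps its original storage and merely swaps sizes and strides, so its strides no longer match the dense row-major layout many kernels assume. A stand-alone sketch of the stride arithmetic (plain C#, not the library's internals):

using System;

class ContiguityDemo
{
    // Row-major contiguous strides: stride[i] = product of the sizes to its right.
    static long[] ContiguousStrides(long[] sizes)
    {
        var strides = new long[sizes.Length];
        long acc = 1;
        for (int i = sizes.Length - 1; i >= 0; i--)
        {
            strides[i] = acc;
            acc *= sizes[i];
        }
        return strides;
    }

    static void Main()
    {
        // A 2x3 tensor has contiguous strides [3, 1].
        Console.WriteLine(string.Join(",", ContiguousStrides(new long[] { 2, 3 }))); // 3,1

        // Permute(1, 0) swaps sizes and strides: sizes [3, 2], strides [1, 3].
        // Contiguous strides for sizes [3, 2] would be [2, 1], so the permuted
        // tensor is non-contiguous and needs an AsContiguous copy before any
        // kernel that assumes a dense row-major buffer.
        Console.WriteLine(string.Join(",", ContiguousStrides(new long[] { 3, 2 }))); // 2,1
    }
}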
/// <summary>
/// Computes the backward (gradient) pass of spatial max pooling.
/// </summary>
/// <param name="input">The input array.</param>
/// <param name="gradOutput">The gradient of the output.</param>
/// <param name="gradInput">The gradient of the input (written by this method).</param>
/// <param name="indices">The indices of the maxima recorded by the forward pass.</param>
/// <param name="cd">The pooling descriptor (kernel size, stride, and padding).</param>
/// <param name="ceilMode">if set to <c>true</c>, output sizes are computed with ceiling instead of floor.</param>
public static void SpatialMaxPoolingBackward(NDArray input, NDArray gradOutput, NDArray gradInput, NDArray indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var dimw = 3;
    var dimh = 2;
    var dimc = 1;

    var nbatch = input.Shape[0];
    var nslices = input.Shape[dimc];
    var iheight = input.Shape[dimh];
    var iwidth = input.Shape[dimw];
    var owidth = gradOutput.Shape[dimw];
    var oheight = gradOutput.Shape[dimh];

    Ops.Fill(gradInput, 0);

    using (var gradOutputContig = Ops.AsContiguous(gradOutput))
    {
        for (int i = 0; i < nbatch; ++i)
        {
            using (var gradInput_i = gradInput.Select(0, i))
            using (var gradOutput_i = gradOutputContig.Select(0, i))
            using (var indices_i = indices.Select(0, i))
            {
                IntPtr gradInput_iPtr, gradOutput_iPtr, indices_iPtr;
                using (NativeWrapper.BuildTensorRefPtr(gradInput_i, out gradInput_iPtr))
                using (NativeWrapper.BuildTensorRefPtr(gradOutput_i, out gradOutput_iPtr))
                using (NativeWrapper.BuildTensorRefPtr(indices_i, out indices_iPtr))
                {
                    CpuOpsNative.TS_SpatialMaxPooling_updateGradInput_frame(gradInput_iPtr, gradOutput_iPtr, indices_iPtr,
                                                                            nslices, iwidth, iheight, owidth, oheight,
                                                                            cd.dW, cd.dH);
                }
            }
        }
    }
}
public static unsafe TensorProto GetProto(this Tensor tensor)
{
    var result = new TensorProto();

    var sizes = tensor.Sizes.Select(l => (int)l);
    result.Shape.Add(sizes);
    result.Count = sizes.Aggregate((a, i) => a * i);
    result.Type = _dtypeToDataType[tensor.ElementType];
    result.Format = TensorFormat.RowMajor;

    tensor = Ops.AsContiguous(tensor);

    var bytes = new byte[tensor.Storage.ByteLength];
    fixed (byte* p = bytes)
    {
        IntPtr ptr = (IntPtr)p;
        tensor.Storage.CopyFromStorage(ptr, 0, bytes.Length);
    }

    result.Data = ByteString.CopyFrom(bytes);
    return result;
}
/// <summary>
/// Copies between two GPU tensors indirectly, via contiguous proxy tensors.
/// </summary>
/// <param name="result">The destination tensor.</param>
/// <param name="src">The source tensor.</param>
/// <param name="totalElements">The total number of elements to copy.</param>
/// <exception cref="CudaException">Thrown when the underlying CUDA memcpy fails.</exception>
private void CopyGpuIndirect(Tensor result, Tensor src, long totalElements)
{
    // This is only called if the tensors have the same type, but memcpy cannot be used on the tensor pair,
    // and we can't get direct access to the other GPU's memory.
    // We will make contiguous proxy tensors as necessary, so we can use cuMemcpy to perform the copy.
    // If result needs to be proxied, we then copy back from the contiguous proxy to result on the same GPU.
    var context = CudaHelpers.TSContextForTensor(src);
    var isResultContig = result.IsContiguous();
    var resultContig = result;

    using (var srcContig = Ops.AsContiguous(src))
    {
        if (!isResultContig)
        {
            resultContig = new Tensor(result.Allocator, result.ElementType, result.Sizes);
        }

        var resultContigPtr = ((CudaStorage)resultContig.Storage).DevicePtrAtElement(resultContig.StorageOffset);
        var srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);

        var res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
            resultContigPtr, srcContigPtr, totalElements * srcContig.ElementType.Size(), CUstream.NullStream);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }

        if (!isResultContig)
        {
            CopyGpuDirect(result, resultContig, context.CudaContextForTensor(result));
            resultContig.Dispose();
        }
    }
}
/// <summary>
/// Computes the forward pass of spatial max pooling.
/// </summary>
/// <param name="input">The input array.</param>
/// <param name="output">The output array (written by this method).</param>
/// <param name="indices">The indices of the maxima (written by this method, for use in the backward pass).</param>
/// <param name="cd">The pooling descriptor (kernel size, stride, and padding).</param>
/// <param name="ceilMode">if set to <c>true</c>, output sizes are computed with ceiling instead of floor.</param>
/// <exception cref="ArgumentException">input must be a 4D tensor</exception>
/// <exception cref="InvalidOperationException">
/// input image is smaller than kernel size
/// or
/// pad should be smaller than half of the kernel size
/// </exception>
public static void SpatialMaxPoolingForward(NDArray input, NDArray output, NDArray indices, ConvolutionDesc2d cd, bool ceilMode)
{
    if (input.DimensionCount != 4)
    {
        throw new ArgumentException("input must be a 4D tensor");
    }

    var dimw = 3;
    var dimh = 2;
    var dimc = 1;

    if (input.Shape[dimw] < cd.kW - cd.padW || input.Shape[dimh] < cd.kH - cd.padH)
    {
        throw new InvalidOperationException("input image is smaller than kernel size");
    }

    if (cd.padW > cd.kW / 2 || cd.padH > cd.kH / 2)
    {
        throw new InvalidOperationException("pad should be smaller than half of the kernel size");
    }

    var nbatch = input.Shape[0];
    var nslices = input.Shape[dimc];
    var iheight = input.Shape[dimh];
    var iwidth = input.Shape[dimw];

    long owidth;
    long oheight;

    if (ceilMode)
    {
        oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }
    else
    {
        oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }

    if (cd.padW != 0 || cd.padH != 0)
    {
        // ensure that the last pooling starts inside the image
        if ((oheight - 1) * cd.dH >= iheight + cd.padH)
        {
            --oheight;
        }

        if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
        {
            --owidth;
        }
    }

    using (var inputContig = Ops.AsContiguous(input))
    {
        for (int i = 0; i < nbatch; ++i)
        {
            using (var input_i = inputContig.Select(0, i))
            using (var output_i = output.Select(0, i))
            using (var indices_i = indices.Select(0, i))
            {
                IntPtr input_iPtr, output_iPtr, indices_iPtr;
                using (NativeWrapper.BuildTensorRefPtr(input_i, out input_iPtr))
                using (NativeWrapper.BuildTensorRefPtr(output_i, out output_iPtr))
                using (NativeWrapper.BuildTensorRefPtr(indices_i, out indices_iPtr))
                {
                    CpuOpsNative.TS_SpatialMaxPooling_updateOutput_frame(input_iPtr, output_iPtr, indices_iPtr,
                                                                         nslices, iwidth, iheight, owidth, oheight,
                                                                         cd.kW, cd.kH, cd.dW, cd.dH, cd.padW, cd.padH);
                }
            }
        }
    }
}
public IWeightTensor View(IWeightTensor w, bool runGradient = true, params long[] dims)
{
    bool hasNegOne = false;
    int negOneIdx = 0;
    long totalGivenSize = 1;

    for (int i = 0; i < dims.Length; i++)
    {
        long dim = dims[i];
        if (dim == -1)
        {
            if (hasNegOne)
            {
                throw new ArgumentException($"View operation only allows a single -1 in dims.");
            }

            hasNegOne = true;
            negOneIdx = i;
        }
        else
        {
            totalGivenSize *= dim;
        }
    }

    if (hasNegOne)
    {
        long totalSrcSize = 1;
        foreach (long size in w.Sizes)
        {
            totalSrcSize *= size;
        }

        dims[negOneIdx] = totalSrcSize / totalGivenSize;
    }

    WeightTensor m = w as WeightTensor;
    WeightTensor res = m_weightTensorFactory.CreateWeightTensor(dims, m_deviceId, name: w.Name, graphToBind: this);
    // VisualizeNodes(w, res);

    Tensor contigW = Ops.AsContiguous(m.TWeight);
    m.ReleaseWeight();
    m.TWeight = contigW;
    res.TWeight = contigW.View(dims);

    if (m_needsBackprop)
    {
        Action backward = () =>
        {
            if (runGradient)
            {
                res.ReleaseWeight();
                using (Tensor resG = res.TGradient.View(m.Sizes))
                {
                    m.CopyOrAddGradient(resG, res.Name);
                }
            }

            res.Dispose();
        };
        m_backprop.Add(backward);
    }

    return res;
}
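The -1 handling above in isolation: a stand-alone sketch (plain C#, mirroring the inference loop in View) showing how the single -1 dimension is computed so the total element count is preserved.

using System;
using System.Linq;

class ViewInferDemo
{
    // Mirrors the -1 handling in View above: the single -1 dimension is
    // inferred so that the total element count is preserved.
    static long[] InferDims(long[] srcSizes, long[] dims)
    {
        int negOneIdx = -1;
        long totalGiven = 1;

        for (int i = 0; i < dims.Length; i++)
        {
            if (dims[i] == -1)
            {
                if (negOneIdx != -1)
                {
                    throw new ArgumentException("Only a single -1 is allowed in dims.");
                }
                negOneIdx = i;
            }
            else
            {
                totalGiven *= dims[i];
            }
        }

        if (negOneIdx != -1)
        {
            long totalSrc = srcSizes.Aggregate(1L, (a, s) => a * s);
            dims[negOneIdx] = totalSrc / totalGiven;
        }

        return dims;
    }

    static void Main()
    {
        // A 4x5x6 tensor viewed as (-1, 6) becomes 20x6: 120 elements either way.
        var dims = InferDims(new long[] { 4, 5, 6 }, new long[] { -1, 6 });
        Console.WriteLine(string.Join("x", dims)); // 20x6
    }
}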