/// <summary>
/// Asynchronously copies a 2D region from device memory into this object's host buffer.
/// </summary>
/// <param name="devicePtr">Device pointer used as the copy source.</param>
/// <param name="pitchDevice">Pitch of the source device memory (assigned to <c>srcPitch</c>).</param>
/// <param name="stream">Stream handed to <c>cuMemcpy2DAsync_v2</c>.</param>
/// <exception cref="ObjectDisposedException">Thrown when this object has already been disposed.</exception>
/// <exception cref="CudaException">Thrown when the driver call does not return <c>CUResult.Success</c>.</exception>
public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
{
    if (disposed)
    {
        throw new ObjectDisposedException(this.ToString());
    }

    // Describe the transfer: device source, host destination, extent taken from this object.
    CUDAMemCpy2D copyDescriptor = new CUDAMemCpy2D
    {
        srcDevice = devicePtr,
        srcMemoryType = CUMemoryType.Device,
        srcPitch = pitchDevice,
        dstHost = _intPtr,
        dstMemoryType = CUMemoryType.Host,
        dstPitch = _pitchInBytes,
        WidthInBytes = _width * _typeSize,
        Height = _height
    };

    res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDescriptor, stream);
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

    if (res == CUResult.Success)
    {
        return;
    }

    throw new CudaException(res);
}
/// <summary>
/// Launches the kernel named <c>"inner_index_" + baseKernelName</c> over <paramref name="src"/>.
/// The last dimension of <paramref name="src"/> is passed as the row size and the product of
/// all preceding dimension sizes as the row count.
/// </summary>
/// <param name="context">Context used to resolve the CUDA context for <paramref name="src"/> and to invoke the kernel.</param>
/// <param name="resultValues">Tensor whose buffer start is passed as the first kernel argument.</param>
/// <param name="resultIndices">Tensor whose buffer start is passed as the second kernel argument.</param>
/// <param name="src">Source tensor.</param>
/// <param name="init">Pair of values forwarded to the kernel as its last two arguments.</param>
/// <param name="baseKernelName">Suffix appended to <c>"inner_index_"</c> to form the kernel name.</param>
private void ReduceIndexInnermostDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, Tuple<float, float> init, string baseKernelName)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);

    // Flatten every dimension except the last one into a single row count.
    int lastDim = src.DimensionCount - 1;
    long rowCount = 1;
    for (int d = 0; d < lastDim; ++d)
    {
        rowCount *= src.Sizes[d];
    }

    long rowLength = src.Sizes[lastDim];

    dim3 blockShape = new dim3(16, 32);
    dim3 gridShape = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, blockShape.y)));

    CUdeviceptr valuesPtr = CudaHelpers.GetBufferStart(resultValues);
    CUdeviceptr indicesPtr = CudaHelpers.GetBufferStart(resultIndices);
    CUdeviceptr sourcePtr = CudaHelpers.GetBufferStart(src);

    string fullKernelName = "inner_index_" + baseKernelName;

    Invoke(context, cudaContext, fullKernelName, gridShape, blockShape, 0, CUstream.NullStream,
        valuesPtr, indicesPtr, sourcePtr, rowCount, rowLength, init.Item1, init.Item2);
}
/// <summary>
/// Fetches the owner's global code-vector variable, derives six device pointers into it
/// from the symbol offsets, and runs the kernel with the argument layout matching the
/// owner's mode (encode or otherwise).
/// </summary>
public override void Execute()
{
    CudaDeviceVariable<float> codeVectors = MyMemoryManager.Instance.GetGlobalVariable(
        Owner.GlobalVariableName, Owner.GPU, Owner.GenerateRandomVectors);

    var basePtr = codeVectors.DevicePointer;

    CUdeviceptr dirX = basePtr + GetSymbolOffset(MyCodeVector.DirX);
    CUdeviceptr dirY = basePtr + GetSymbolOffset(MyCodeVector.DirY);
    CUdeviceptr negDirX = basePtr + GetSymbolOffset(MyCodeVector.NegDirX);
    CUdeviceptr negDirY = basePtr + GetSymbolOffset(MyCodeVector.NegDirY);
    CUdeviceptr originX = basePtr + GetSymbolOffset(MyCodeVector.OriginX);
    CUdeviceptr originY = basePtr + GetSymbolOffset(MyCodeVector.OriginY);

    // The kernel receives the flag as an integer.
    int squaredFlag = UseSquaredTransform ? 1 : 0;

    if (Owner.Mode != MySpatialCoderMode.Encode)
    {
        m_kernel.Run(Owner.Input, Owner.SymbolSize, Owner.Output, Owner.Reliability, Owner.Output.Count,
            squaredFlag, dirX, dirY, negDirX, negDirY, originX, originY);
    }
    else
    {
        m_kernel.Run(Owner.Input, Owner.Input.Count, Owner.Output, Owner.SymbolSize,
            squaredFlag, dirX, dirY, negDirX, negDirY, originX, originY);
    }
}
/// <summary>
/// Runs the base initialization, resolves device pointers for the weight and bias buffers,
/// writes the layer's structures into the network's data-dims memory block, records the
/// device pointers of those entries, and finally generates the initial weights.
/// </summary>
/// <param name="nGPU">Forwarded unchanged to the base implementation.</param>
public override void Initialize(Int32 nGPU)
{
    base.Initialize(nGPU);

    // Resolve device pointers for weights / biases and their change buffers when a backing block exists.
    if (m_weightBlock != null)
    {
        m_weight.Ptr = m_weightBlock.GetDevicePtr(m_network, m_weightOffset);
        m_weightChange.Ptr = m_weightChangeBlock.GetDevicePtr(m_network, m_weightChangeOffset);
    }

    if (m_biasBlock != null)
    {
        m_bias.Ptr = m_biasBlock.GetDevicePtr(m_network, m_biasOffset);
        m_biasChange.Ptr = m_biasChangeBlock.GetDevicePtr(m_network, m_biasChangeOffset);
    }

    var dimsBlock = m_network.DataDimsMemoryBlock;

    // Place the structures into the host side of the data-dims block.
    dimsBlock.Host[m_weightDimGPUPtrOffset] = Weight;
    dimsBlock.Host[m_weightChangeDimGPUPtrOffset] = WeightChange;
    dimsBlock.Host[m_biasDimGPUPtrOffset] = Bias;
    dimsBlock.Host[m_biasChangeDimGPUPtrOffset] = BiasChange;
    dimsBlock.Host[m_lastWeightDeltaDimGPUPtrOffset] = LastWeightDelta;
    dimsBlock.Host[m_storedOutputDimGPUPtrOffset] = StoredOutput;

    // Remember the device pointer of each entry written above.
    WeightDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_weightDimGPUPtrOffset);
    WeightChangeDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_weightChangeDimGPUPtrOffset);
    BiasDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_biasDimGPUPtrOffset);
    BiasChangeDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_biasChangeDimGPUPtrOffset);
    LastWeightDeltaDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_lastWeightDeltaDimGPUPtrOffset);
    StoredOutputDataPtr = dimsBlock.GetDevicePtr(m_network, (int)m_storedOutputDimGPUPtrOffset);

    // Produce the starting weights.
    GenerateWeights();
}
/// <summary>
/// Fills <paramref name="output"/> with the result of applying <paramref name="multKernel"/>
/// to <paramref name="identity"/> and <paramref name="codeVector"/> |<paramref name="power"/>| times.
/// A power of exactly 1 copies <paramref name="codeVector"/> straight into the output.
/// </summary>
/// <param name="identity">Device pointer copied into the output before the kernel is applied.</param>
/// <param name="codeVector">Device pointer passed as the second kernel operand.</param>
/// <param name="output">Device variable that receives the result.</param>
/// <param name="multKernel">Kernel run once per step.</param>
/// <param name="power">Number of applications; a non-positive value selects <c>Inv_Permutation</c> instead of <c>Permutation</c>.</param>
/// <param name="symbolSize">Element count; the copied byte count is <c>sizeof(float) * symbolSize</c>.</param>
public static void MakePower(
    CUdeviceptr identity, CUdeviceptr codeVector, CudaDeviceVariable<float> output,
    MyCudaKernel multKernel, int power, int symbolSize)
{
    int byteCount = sizeof(float) * symbolSize;

    if (power == 1)
    {
        output.CopyToDevice(codeVector, 0, 0, byteCount);
        return;
    }

    // Start from the identity, then apply the kernel repeatedly in place.
    output.CopyToDevice(identity, 0, 0, byteCount);

    int method;
    if (power > 0)
    {
        method = (int)MyJoin.MyJoinOperation.Permutation;
    }
    else
    {
        method = (int)MyJoin.MyJoinOperation.Inv_Permutation;
    }

    int repetitions = Math.Abs(power);
    for (int step = 0; step < repetitions; ++step)
    {
        multKernel.Run(output.DevicePointer, codeVector, output.DevicePointer, method, symbolSize);
    }
}
/// <summary>
/// Copies the single element at <paramref name="index"/> from device memory to the host
/// and returns it converted to <see cref="float"/>.
/// </summary>
/// <param name="index">Element index passed to <c>DevicePtrAtElement</c>.</param>
/// <returns>The element value as a float.</returns>
/// <exception cref="NotSupportedException">
/// Thrown when <c>ElementType</c> is not Float32, Float64, Int32 or UInt8.
/// </exception>
/// <remarks>
/// Any exception is logged and then rethrown with <c>throw;</c> so the original stack trace
/// is preserved for callers.
/// </remarks>
public override float GetElementAsFloat(long index)
{
    CUdeviceptr ptr = DevicePtrAtElement(index);
    try
    {
        if (ElementType == DType.Float32)
        {
            float[] result = new float[1];
            context.CopyToHost(result, ptr);
            return result[0];
        }
        else if (ElementType == DType.Float64)
        {
            double[] result = new double[1];
            context.CopyToHost(result, ptr);
            return (float)result[0];
        }
        else if (ElementType == DType.Int32)
        {
            int[] result = new int[1];
            context.CopyToHost(result, ptr);
            return result[0];
        }
        else if (ElementType == DType.UInt8)
        {
            byte[] result = new byte[1];
            context.CopyToHost(result, ptr);
            return result[0];
        }
        else
        {
            throw new NotSupportedException("Element type " + ElementType + " not supported");
        }
    }
    catch (Exception err)
    {
        Logger.WriteLine($"Failed to get element as float from addr = '{ptr.Pointer}'");
        Logger.WriteLine($"Exception: {err.Message}");
        Logger.WriteLine($"Call stack: {err.StackTrace}");

        // Rethrow without resetting the stack trace ("throw err;" would discard it).
        throw;
    }
}
/// <summary>
/// Transforms <paramref name="firstInput"/> with <c>m_fft</c> into the temp block at
/// <c>m_secondFFTOffset</c>, then for each further input transforms it into the slot at
/// <c>m_firstFFTOffset</c> and runs <c>m_mulkernel</c> on both slots, writing back into the
/// second slot. The last element of <paramref name="otherInputs"/> is handed to
/// <c>FinishBinding</c> as the output.
/// </summary>
/// <param name="firstInput">First operand.</param>
/// <param name="otherInputs">
/// Remaining operands followed by the output pointer. With a single element, that element is
/// used both as an operand and as the output. When null or empty, <paramref name="firstInput"/>
/// is used as that single element.
/// </param>
/// <remarks>
/// Omitting a <c>params</c> argument yields an empty array rather than null. An empty array
/// used to fail with <see cref="System.IndexOutOfRangeException"/> when the output was looked
/// up, so it is now handled the same way as null.
/// </remarks>
public override void Bind(CUdeviceptr firstInput, params CUdeviceptr[] otherInputs)
{
    if (otherInputs == null || otherInputs.Length == 0)
    {
        otherInputs = new CUdeviceptr[] { firstInput };
    }

    m_fft.Exec(firstInput, m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset));

    // With more than one element the last one is output-only and is not multiplied in.
    int count = otherInputs.Length == 1 ? otherInputs.Length : otherInputs.Length - 1;

    for (int i = 0; i < count; ++i)
    {
        CUdeviceptr start = otherInputs[i];
        m_fft.Exec(start, m_tempBlock.GetDevicePtr(m_owner, m_firstFFTOffset));

        m_mulkernel.Run(
            m_tempBlock.GetDevicePtr(m_owner, m_firstFFTOffset),
            m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset),
            m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset),
            m_inputSize + 1);
    }

    CUdeviceptr output = otherInputs[otherInputs.Length - 1];
    FinishBinding(output);
}
/// <summary>
/// Copies <paramref name="src"/> into <paramref name="result"/> via contiguous proxy tensors
/// and <c>cuMemcpyAsync</c> on the null stream.
/// </summary>
/// <param name="result">Destination tensor.</param>
/// <param name="src">Source tensor.</param>
/// <param name="totalElements">Number of elements to copy; multiplied by the element size to get the byte count.</param>
/// <exception cref="CudaException">Thrown when <c>cuMemcpyAsync</c> does not return <c>CUResult.Success</c>.</exception>
/// <remarks>
/// The temporary contiguous result tensor is disposed in a <c>finally</c> block, so it is
/// released even when the memcpy or the copy-back throws.
/// </remarks>
private void CopyGpuIndirect(Tensor result, Tensor src, long totalElements)
{
    // This is only called if the tensors have the same type, but memcpy cannot be used on the tensor pair,
    // and we can't get direct access to the other GPU's memory.
    // Contiguous proxy tensors are made as necessary so cuMemcpy can perform the copy.
    // If result needs to be proxied, the contiguous proxy is then copied back to result on the same GPU.
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    bool isResultContig = result.IsContiguous();

    using (Tensor srcContig = Ops.AsContiguous(src))
    {
        Tensor resultContig = isResultContig
            ? result
            : new Tensor(result.Allocator, result.ElementType, result.Sizes);

        try
        {
            CUdeviceptr resultContigPtr = ((CudaStorage)resultContig.Storage).DevicePtrAtElement(resultContig.StorageOffset);
            CUdeviceptr srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);

            CUResult res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
                resultContigPtr, srcContigPtr, totalElements * srcContig.ElementType.Size(), CUstream.NullStream);
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            if (!isResultContig)
            {
                CopyGpuDirect(result, resultContig, context.CudaContextForTensor(result));
            }
        }
        finally
        {
            // Only the proxy allocated here is owned by this method; never dispose the caller's tensor.
            if (!isResultContig)
            {
                resultContig.Dispose();
            }
        }
    }
}
/// <summary>
/// Initializes the linear kernel and the base class, converts the problem elements with
/// <c>CudaHelpers.TransformToEllpackRFormat</c>, uploads the resulting arrays to the device,
/// allocates device-mapped host memory for the results, and binds the main-vector and label
/// textures.
/// </summary>
public override void Init()
{
    // The linear kernel is initialized first; its diagonal dot cache is read below.
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();
    base.Init();

    // Arrays filled by TransformToEllpackRFormat: values, column indices and per-vector lengths.
    float[] vecVals;
    int[] vecColIdx;
    int[] vecLenght;
    CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cuda initialization
    InitCudaModule();

    // Copy data to the device; the returned pointers are stored in fields.
    valsPtr = cuda.CopyHostToDevice(vecVals);
    idxPtr = cuda.CopyHostToDevice(vecColIdx);
    vecLengthPtr = cuda.CopyHostToDevice(vecLenght);
    selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

    // One float of output per problem element.
    uint memSize = (uint)(problemElements.Length * sizeof(float));

    // Allocate mapped memory for our results.
    // Earlier alternatives, kept for reference:
    //CUDARuntime.cudaSetDeviceFlags(CUDARuntime.cudaDeviceMapHost);
    // var e= CUDADriver.cuMemHostAlloc(ref outputIntPtr, memSize, 8);
    //CUDARuntime.cudaHostAlloc(ref outputIntPtr, memSize, CUDARuntime.cudaHostAllocMapped);
    //var errMsg=CUDARuntime.cudaGetErrorString(e);
    //cuda.HostRegister(outputIntPtr,memSize, Cuda)
    outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    // Device-side pointer for the host allocation made above.
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

    // Normal (non-mapped) memory allocation alternative:
    //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));
    #endregion

    SetCudaFunctionParameters();

    // Allocate memory for the main vector. Its size is the same as the dimension, so many
    // entries will be zero, but the CUDA computation is faster.
    mainVector = new float[problemElements[0].Dim + 1];
    CudaHelpers.FillDenseVector(problemElements[0], mainVector);

    // Bind the main vector and the labels to their texture references.
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

    // Optional builder constructed from the device pointers prepared above.
    if (MakeDenseVectorOnGPU)
    {
        vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
        vecBuilder.Init();
    }
}
/// <summary>
/// Convenience overload that forwards to the four-argument <c>GetVector</c> with both
/// integer arguments set to 1.
/// </summary>
/// <typeparam name="T">Element type of <paramref name="data"/>.</typeparam>
/// <param name="ptr">Device pointer forwarded as the first argument.</param>
/// <param name="data">Host array forwarded as the third argument.</param>
public void GetVector<T>(CUdeviceptr ptr, T[] data)
{
    // Presumably the source/destination element increments — confirm against the full overload.
    const int unit = 1;
    GetVector<T>(ptr, unit, data, unit);
}
/// <summary>
/// Externally implemented <c>cufftExecZ2Z</c> entry point.
/// NOTE(review): presumably bound to the native cuFFT function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="plan">Plan handle passed to the call.</param>
/// <param name="idata">Device pointer passed as input data.</param>
/// <param name="odata">
/// Device pointer passed as output data. NOTE(review): marked <c>[Out]</c> although the
/// pointer is passed by value — verify the intended marshalling.
/// </param>
/// <param name="direction">Transform direction passed to the call.</param>
/// <returns>Result code reported by the call.</returns>
public static extern cufftResult cufftExecZ2Z([In] cufftHandle plan, [In] CUdeviceptr idata, [Out] CUdeviceptr odata, [In] TransformDirection direction);
/// <summary>
/// Creates an instance that stores the given device pointer and free handler.
/// </summary>
/// <param name="pointer">Device pointer to store.</param>
/// <param name="freeHandler">Callback to store; presumably invoked to release the pointer — confirm against the rest of the class.</param>
public BasicDeviceMemory(CUdeviceptr pointer, Action freeHandler)
{
    this.freeHandler = freeHandler;
    this.pointer = pointer;
}
/// <summary>
/// Creates a new NPPImage from an already allocated device pointer by forwarding
/// <c>size.width</c> and <c>size.height</c> to the (devPtr, width, height, pitch) constructor.
/// </summary>
/// <param name="devPtr">Already allocated device pointer.</param>
/// <param name="size">Image size; its <c>width</c> and <c>height</c> members are forwarded.</param>
/// <param name="pitch">Pitch / line step.</param>
public NPPImage_16uC2(CUdeviceptr devPtr, NppiSize size, int pitch)
    : this(devPtr, size.width, size.height, pitch)
{
}
/// <summary>
/// Calls <c>cuMemcpyDtoD</c> and passes its result to <c>TestResult</c>; see the CUDA
/// documentation for the underlying call.
/// </summary>
/// <param name="dstDevice">Destination device pointer.</param>
/// <param name="srcDevice">Source device pointer.</param>
/// <param name="ByteCount">Byte count forwarded to the call.</param>
public static void MemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, uint ByteCount)
{
    var status = my.cuMemcpyDtoD(dstDevice, srcDevice, ByteCount);
    TestResult(status);
}
/// <summary>
/// Externally implemented <c>curandGenerateLogNormal</c> entry point (single-precision
/// <paramref name="mean"/> and <paramref name="stddev"/>).
/// NOTE(review): presumably bound to the native cuRAND function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="generator">Generator handle passed to the call.</param>
/// <param name="outputPtr">Device pointer passed as the output location.</param>
/// <param name="n">Count passed to the call; presumably the number of values to generate — confirm.</param>
/// <param name="mean">Mean parameter passed to the call.</param>
/// <param name="stddev">Standard-deviation parameter passed to the call.</param>
/// <returns>Status code reported by the call.</returns>
public static extern CurandStatus curandGenerateLogNormal(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, float mean, float stddev);
/// <summary>
/// Calls <c>cuMemHostGetDevicePointer</c> and passes its result to <c>TestResult</c>; see the
/// CUDA documentation for the underlying call.
/// </summary>
/// <param name="dptr">Receives the device pointer produced by the call.</param>
/// <param name="p">Host pointer forwarded to the call.</param>
/// <param name="flags">Flags forwarded to the call.</param>
public static void MemHostGetDevicePointer(out CUdeviceptr dptr, IntPtr p, uint flags)
{
    var status = my.cuMemHostGetDevicePointer(out dptr, p, flags);
    TestResult(status);
}
/// <summary>
/// Calls <c>cuMemFree</c> and passes its result to <c>TestResult</c>; see the CUDA
/// documentation for the underlying call.
/// </summary>
/// <param name="dptr">Device pointer forwarded to the call.</param>
public static void MemFree(CUdeviceptr dptr)
{
    var status = my.cuMemFree(dptr);
    TestResult(status);
}
/// <summary>
/// Externally implemented <c>curandGenerateLogNormalDouble</c> entry point (double-precision
/// <paramref name="mean"/> and <paramref name="stddev"/>).
/// NOTE(review): presumably bound to the native cuRAND function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="generator">Generator handle passed to the call.</param>
/// <param name="outputPtr">Device pointer passed as the output location.</param>
/// <param name="n">Count passed to the call; presumably the number of values to generate — confirm.</param>
/// <param name="mean">Mean parameter passed to the call.</param>
/// <param name="stddev">Standard-deviation parameter passed to the call.</param>
/// <returns>Status code reported by the call.</returns>
public static extern CurandStatus curandGenerateLogNormalDouble(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, double mean, double stddev);
/// <summary>
/// Sets a pointer-valued function parameter by casting the pointer's <c>p</c> member to
/// <see cref="long"/> and delegating to <c>ParamSetl</c>; see the CUDA documentation.
/// </summary>
/// <param name="hfunc">Function handle forwarded to <c>ParamSetl</c>.</param>
/// <param name="offset">Parameter offset forwarded to <c>ParamSetl</c>.</param>
/// <param name="ptr">Device pointer whose raw value is passed.</param>
public static void ParamSetp(CUfunction hfunc, int offset, CUdeviceptr ptr)
{
    long rawAddress = (long)ptr.p;
    ParamSetl(hfunc, offset, rawAddress);
}
/// <summary>
/// Externally implemented <c>curandGeneratePoisson</c> entry point.
/// NOTE(review): presumably bound to the native cuRAND function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="generator">Generator handle passed to the call.</param>
/// <param name="outputPtr">Device pointer passed as the output location.</param>
/// <param name="n">Count passed to the call; presumably the number of values to generate — confirm.</param>
/// <param name="lambda">Lambda parameter passed to the call.</param>
/// <returns>Status code reported by the call.</returns>
public static extern CurandStatus curandGeneratePoisson(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, double lambda);
/// <summary>
/// Creates a new NPPImage from an already allocated device pointer. Forwards to the
/// five-argument constructor with <c>false</c> as the last argument; per the original
/// documentation this means the image does not take ownership of <paramref name="devPtr"/>.
/// </summary>
/// <param name="devPtr">Already allocated device pointer.</param>
/// <param name="width">Image width in pixels.</param>
/// <param name="height">Image height in pixels.</param>
/// <param name="pitch">Pitch / line step.</param>
public NPPImage_16uC2(CUdeviceptr devPtr, int width, int height, int pitch)
    : this(devPtr, width, height, pitch, false)
{
}
/// <summary>
/// Creates a new NPPImage from an already allocated device pointer by forwarding
/// <c>size.width</c> and <c>size.height</c> to the (devPtr, width, height, pitch, isOwner)
/// constructor.
/// </summary>
/// <param name="devPtr">Already allocated device pointer.</param>
/// <param name="size">Image size; its <c>width</c> and <c>height</c> members are forwarded.</param>
/// <param name="pitch">Pitch / line step.</param>
/// <param name="isOwner">If TRUE, devPtr is freed when disposing.</param>
public NPPImage_32fcC1(CUdeviceptr devPtr, NppiSize size, int pitch, bool isOwner)
    : this(devPtr, size.width, size.height, pitch, isOwner)
{
}
/// <summary>
/// Externally implemented <c>cufftSetWorkArea</c> entry point.
/// NOTE(review): presumably bound to the native cuFFT function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="plan">Plan handle passed to the call.</param>
/// <param name="workArea">Device pointer passed as the work area.</param>
/// <returns>Result code reported by the call.</returns>
public static extern cufftResult cufftSetWorkArea(cufftHandle plan, CUdeviceptr workArea);
/// <summary>
/// Externally implemented <c>nvgraphSetVertexData</c> entry point.
/// NOTE(review): presumably bound to the native nvGRAPH function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="handle">nvGRAPH context handle passed to the call.</param>
/// <param name="descrG">Graph descriptor passed to the call.</param>
/// <param name="vertexData">Device pointer passed as the vertex data.</param>
/// <param name="setnum">Value passed as <c>setnum</c>; presumably the index of the vertex data set — confirm.</param>
/// <returns>Status code reported by the call.</returns>
public static extern nvgraphStatus nvgraphSetVertexData(nvgraphContext handle, nvgraphGraphDescr descrG, CUdeviceptr vertexData, SizeT setnum);
/// <summary>
/// Externally implemented <c>cufftExecZ2D</c> entry point.
/// NOTE(review): presumably bound to the native cuFFT function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="plan">Plan handle passed to the call.</param>
/// <param name="idata">Device pointer passed as input data.</param>
/// <param name="odata">Device pointer passed as output data.</param>
/// <returns>Result code reported by the call.</returns>
public static extern cufftResult cufftExecZ2D([In] cufftHandle plan, [In] CUdeviceptr idata, [In] CUdeviceptr odata);
/// <summary>
/// Externally implemented <c>nvgraphConvertTopology</c> entry point.
/// NOTE(review): presumably bound to the native nvGRAPH function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="handle">nvGRAPH context handle passed to the call.</param>
/// <param name="srcTType">Topology type of the source.</param>
/// <param name="srcTopology">Source topology object.</param>
/// <param name="srcEdgeData">Device pointer passed as the source edge data.</param>
/// <param name="dataType">Data type descriptor, passed by reference.</param>
/// <param name="dstTType">Topology type of the destination.</param>
/// <param name="dstTopology">Destination topology object.</param>
/// <param name="dstEdgeData">Device pointer passed as the destination edge data.</param>
/// <returns>Status code reported by the call.</returns>
public static extern nvgraphStatus nvgraphConvertTopology(nvgraphContext handle, nvgraphTopologyType srcTType, nvgraphTopologyBase srcTopology, CUdeviceptr srcEdgeData, ref cudaDataType dataType, nvgraphTopologyType dstTType, nvgraphTopologyBase dstTopology, CUdeviceptr dstEdgeData);
/// <summary>
/// Convenience overload that forwards to the four-argument <c>SetVector</c> with both
/// integer arguments set to 1.
/// </summary>
/// <typeparam name="T">Element type of <paramref name="data"/>.</typeparam>
/// <param name="data">Host array forwarded as the first argument.</param>
/// <param name="ptr">Device pointer forwarded as the third argument.</param>
public void SetVector<T>(T[] data, CUdeviceptr ptr)
{
    // Presumably the source/destination element increments — confirm against the full overload.
    const int unit = 1;
    SetVector<T>(data, unit, ptr, unit);
}
/// <summary>
/// Externally implemented <c>nvgraphGetEdgeData</c> entry point.
/// NOTE(review): presumably bound to the native nvGRAPH function of the same name by a
/// DllImport attribute outside this view — confirm.
/// </summary>
/// <param name="handle">nvGRAPH context handle passed to the call.</param>
/// <param name="descrG">Graph descriptor passed to the call.</param>
/// <param name="edgeData">Device pointer passed as the edge data buffer.</param>
/// <param name="setnum">Value passed as <c>setnum</c>; presumably the index of the edge data set — confirm.</param>
/// <returns>Status code reported by the call.</returns>
public static extern nvgraphStatus nvgraphGetEdgeData(nvgraphContext handle, nvgraphGraphDescr descrG, CUdeviceptr edgeData, SizeT setnum);
/// <summary>
/// Hands the <c>_vectorSplit</c> kernel to <c>_Use</c> together with <paramref name="size"/>
/// and a callback that runs the kernel on <paramref name="a"/> and <paramref name="output"/>.
/// </summary>
/// <param name="a">Device memory whose <c>DevicePointer</c> is passed to the kernel.</param>
/// <param name="size">Passed both to <c>_Use</c> and to the kernel.</param>
/// <param name="blockSize">Passed to the kernel as its last argument.</param>
/// <param name="output">Device pointer passed to the kernel after the input pointer.</param>
internal void VectorSplit(IDeviceMemoryPtr a, int size, int blockSize, CUdeviceptr output)
{
    // NOTE(review): the leading 0 passed to Run is presumably a shared-memory size or stream index — confirm.
    _Use(_vectorSplit, size, k => k.Run(0, a.DevicePointer, output, size, blockSize));
}
/// <summary>
/// Calls <c>cuMemAlloc</c> and passes its result to <c>TestResult</c>; see the CUDA
/// documentation for the underlying call.
/// </summary>
/// <param name="dptr">Receives the device pointer produced by the call.</param>
/// <param name="bytesize">Requested size in bytes, forwarded to the call.</param>
public static void MemAlloc(out CUdeviceptr dptr, uint bytesize)
{
    var status = my.cuMemAlloc(out dptr, bytesize);
    TestResult(status);
}