Example #1
        /// <summary>
        /// Asynchronous copy from device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the device memory in bytes</param>
        /// <param name="stream">CUDA stream for the asynchronous copy</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();

            cpyProps.srcDevice     = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch      = pitchDevice;
            cpyProps.dstHost       = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch      = _pitchInBytes;
            cpyProps.WidthInBytes  = _width * _typeSize;
            cpyProps.Height        = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
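A hedged usage sketch for the method above: it assumes a pitched, page-locked host buffer wrapper in the managedCuda style; the CudaPageLockedHostMemory2D, CudaPitchedDeviceVariable, and CudaStream names, sizes, and setup below are assumptions for illustration, not taken from the example itself:

        // Download a pitched 2D device allocation into page-locked host memory
        // without blocking the CPU; names and sizes are illustrative only.
        int width = 256, height = 128;
        var hostBuffer = new CudaPageLockedHostMemory2D<float>(width, height);
        var devBuffer  = new CudaPitchedDeviceVariable<float>(width, height);
        var stream     = new CudaStream();

        hostBuffer.AsyncCopyFromDevice(devBuffer.DevicePointer, devBuffer.Pitch, stream.Stream);
        stream.Synchronize(); // the host data is valid only after the stream syncs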
Example #2
        private void ReduceIndexInnermostDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, Tuple<float, float> init, string baseKernelName)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            int  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }
            long row_size = src.Sizes[ndim - 1];

            dim3 threads = new dim3(16, 32);
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
            CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
            CUdeviceptr srcPtr       = CudaHelpers.GetBufferStart(src);

            string kernelName = "inner_index_" + baseKernelName;

            Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_rows, row_size, init.Item1, init.Item2);
        }
Example #3
            public override void Execute()
            {
                CudaDeviceVariable<float> codeVectors = MyMemoryManager.Instance.GetGlobalVariable(
                    Owner.GlobalVariableName, Owner.GPU, Owner.GenerateRandomVectors);

                CUdeviceptr dirX    = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.DirX);
                CUdeviceptr dirY    = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.DirY);
                CUdeviceptr negDirX = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.NegDirX);
                CUdeviceptr negDirY = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.NegDirY);

                CUdeviceptr originX = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.OriginX);
                CUdeviceptr originY = codeVectors.DevicePointer + GetSymbolOffset(MyCodeVector.OriginY);

                if (Owner.Mode == MySpatialCoderMode.Encode)
                {
                    m_kernel.Run(Owner.Input, Owner.Input.Count, Owner.Output, Owner.SymbolSize, UseSquaredTransform ? 1 : 0,
                                 dirX, dirY, negDirX, negDirY, originX, originY);
                }
                else
                {
                    m_kernel.Run(Owner.Input, Owner.SymbolSize, Owner.Output, Owner.Reliability, Owner.Output.Count, UseSquaredTransform ? 1 : 0,
                                 dirX, dirY, negDirX, negDirY, originX, originY);
                }
            }
Example #4
        public override void Initialize(Int32 nGPU)
        {
            base.Initialize(nGPU);

            // Set WeightChange and BiasChange dimensions according to respective Weight and Bias

            if (m_weightBlock != null)
            {
                m_weight.Ptr       = m_weightBlock.GetDevicePtr(m_network, m_weightOffset);
                m_weightChange.Ptr = m_weightChangeBlock.GetDevicePtr(m_network, m_weightChangeOffset);
            }
            if (m_biasBlock != null)
            {
                m_bias.Ptr       = m_biasBlock.GetDevicePtr(m_network, m_biasOffset);
                m_biasChange.Ptr = m_biasChangeBlock.GetDevicePtr(m_network, m_biasChangeOffset);
            }

            // Send the structures to GPU
            m_network.DataDimsMemoryBlock.Host[m_weightDimGPUPtrOffset]          = Weight;
            m_network.DataDimsMemoryBlock.Host[m_weightChangeDimGPUPtrOffset]    = WeightChange;
            m_network.DataDimsMemoryBlock.Host[m_biasDimGPUPtrOffset]            = Bias;
            m_network.DataDimsMemoryBlock.Host[m_biasChangeDimGPUPtrOffset]      = BiasChange;
            m_network.DataDimsMemoryBlock.Host[m_lastWeightDeltaDimGPUPtrOffset] = LastWeightDelta;
            m_network.DataDimsMemoryBlock.Host[m_storedOutputDimGPUPtrOffset]    = StoredOutput;

            // Store the GPU pointers
            WeightDataPtr          = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_weightDimGPUPtrOffset);
            WeightChangeDataPtr    = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_weightChangeDimGPUPtrOffset);
            BiasDataPtr            = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_biasDimGPUPtrOffset);
            BiasChangeDataPtr      = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_biasChangeDimGPUPtrOffset);
            LastWeightDeltaDataPtr = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_lastWeightDeltaDimGPUPtrOffset);
            StoredOutputDataPtr    = m_network.DataDimsMemoryBlock.GetDevicePtr(m_network, (int)m_storedOutputDimGPUPtrOffset);

            // Generate initial weights
            GenerateWeights();
        }
Example #5
        /// <summary>
        /// Applies the <paramref name="multKernel"/> operation on <paramref name="identity"/> and <paramref name="codeVector"/> <paramref name="power"/>-times.
        /// </summary>
        public static void MakePower(
            CUdeviceptr identity, CUdeviceptr codeVector, CudaDeviceVariable<float> output,
            MyCudaKernel multKernel, int power, int symbolSize)
        {
            if (power == 1)
            {
                output.CopyToDevice(codeVector, 0, 0, sizeof(float) * symbolSize);
                return;
            }

            output.CopyToDevice(identity, 0, 0, sizeof(float) * symbolSize);

            // Multiply identity power-times by codeVector
            var method = power > 0
                ? (int)MyJoin.MyJoinOperation.Permutation
                : (int)MyJoin.MyJoinOperation.Inv_Permutation;

            for (int i = 0; i < Math.Abs(power); i++)
            {
                multKernel.Run(output.DevicePointer, codeVector, output.DevicePointer, method, symbolSize);
            }
        }
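A hedged call sketch for the helper above; identityPtr, codePtr, output, multKernel, and symbolSize are hypothetical placeholders for pointers and a kernel set up elsewhere:

            // Raise codeVector to the 3rd power into output (Permutation branch).
            MakePower(identityPtr, codePtr, output, multKernel, power: 3, symbolSize: symbolSize);

            // A negative power selects the Inv_Permutation branch instead.
            MakePower(identityPtr, codePtr, output, multKernel, power: -2, symbolSize: symbolSize);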
Example #6
        public override float GetElementAsFloat(long index)
        {
            CUdeviceptr ptr = DevicePtrAtElement(index);

            try
            {
                if (ElementType == DType.Float32)
                {
                    float[] result = new float[1]; context.CopyToHost(result, ptr); return result[0];
                }
                else if (ElementType == DType.Float64)
                {
                    double[] result = new double[1]; context.CopyToHost(result, ptr); return (float)result[0];
                }
                else if (ElementType == DType.Int32)
                {
                    int[] result = new int[1]; context.CopyToHost(result, ptr); return result[0];
                }
                else if (ElementType == DType.UInt8)
                {
                    byte[] result = new byte[1]; context.CopyToHost(result, ptr); return result[0];
                }
                else
                {
                    throw new NotSupportedException("Element type " + ElementType + " not supported");
                }
            }
            catch (Exception err)
            {
                Logger.WriteLine($"Failed to get element as float from addr = '{ptr.Pointer}'");
                Logger.WriteLine($"Exception: {err.Message}");
                Logger.WriteLine($"Call stack: {err.StackTrace}");

                throw; // rethrow without resetting the stack trace
            }
        }
Example #7
        public override void Bind(CUdeviceptr firstInput, params CUdeviceptr[] otherInputs)
        {
            if (otherInputs == null)
            {
                otherInputs = new CUdeviceptr[] { firstInput };
            }
            m_fft.Exec(firstInput, m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset));

            // The last element of otherInputs is treated as the output below, so it is
            // excluded from the FFT/multiply loop unless there is only a single element.
            int count = otherInputs.Length == 1 ? otherInputs.Length : otherInputs.Length - 1;

            for (int i = 0; i < count; ++i)
            {
                CUdeviceptr start = otherInputs[i];
                m_fft.Exec(start, m_tempBlock.GetDevicePtr(m_owner, m_firstFFTOffset));
                m_mulkernel.Run(
                    m_tempBlock.GetDevicePtr(m_owner, m_firstFFTOffset),
                    m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset),
                    m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset), m_inputSize + 1);
            }

            CUdeviceptr output = otherInputs[otherInputs.Length - 1];

            FinishBinding(output);
        }
Example #8
        private void CopyGpuIndirect(Tensor result, Tensor src, long totalElements)
        {
            // This is only called if the tensors have the same type, but memcpy cannot be used on the tensor pair,
            // and we can't get direct access to the other GPU's memory.

            // We will make contiguous proxy tensors as necessary, so we can use cuMemcpy to perform the copy.
            // If result needs to be proxied, we then copy back from the contiguous proxy to result on the same GPU

            TSCudaContext context        = CudaHelpers.TSContextForTensor(src);
            bool          isResultContig = result.IsContiguous();
            Tensor        resultContig   = result;

            using (Tensor srcContig = Ops.AsContiguous(src))
            {
                if (!isResultContig)
                {
                    resultContig = new Tensor(result.Allocator, result.ElementType, result.Sizes);
                }

                CUdeviceptr resultContigPtr = ((CudaStorage)resultContig.Storage).DevicePtrAtElement(resultContig.StorageOffset);
                CUdeviceptr srcContigPtr    = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);

                CUResult res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
                    resultContigPtr, srcContigPtr, totalElements * srcContig.ElementType.Size(), CUstream.NullStream);
                if (res != CUResult.Success)
                {
                    throw new CudaException(res);
                }

                if (!isResultContig)
                {
                    CopyGpuDirect(result, resultContig, context.CudaContextForTensor(result));
                    resultContig.Dispose();
                }
            }
        }
Example #9
        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLength;

            CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLength, problemElements);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cuda initialization

            InitCudaModule();

            //copy data to device, set cuda function parameters
            valsPtr      = cuda.CopyHostToDevice(vecVals);
            idxPtr       = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLength);

            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            uint memSize = (uint)(problemElements.Length * sizeof(float));

            // Allocate mapped (zero-copy) host memory for the results and obtain the
            // matching device pointer; a plain device allocation via cuda.Allocate
            // would also work, but mapped memory avoids an explicit copy back to host.
            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);


            #endregion

            SetCudaFunctionParameters();

            //allocate memory for the main vector; its size equals the problem dimension,
            //so many entries will be zero, but the CUDA computation is faster this way
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

            if (MakeDenseVectorOnGPU)
            {
                vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
                vecBuilder.Init();
            }
        }
Example #10
 public void GetVector<T>(CUdeviceptr ptr, T[] data)
 {
     this.GetVector<T>(ptr, 1, data, 1);
 }
Example #11
 public static extern cufftResult cufftExecZ2Z([In] cufftHandle plan, [In] CUdeviceptr idata, [Out] CUdeviceptr odata, [In] TransformDirection direction);
Example #12
 public BasicDeviceMemory(CUdeviceptr pointer, Action freeHandler)
 {
     this.pointer     = pointer;
     this.freeHandler = freeHandler;
 }
Example #13
 /// <summary>
 /// Creates a new NPPImage from allocated device ptr.
 /// </summary>
 /// <param name="devPtr">Already allocated device ptr.</param>
 /// <param name="size">Image size</param>
 /// <param name="pitch">Pitch / Line step</param>
 public NPPImage_16uC2(CUdeviceptr devPtr, NppiSize size, int pitch)
     : this(devPtr, size.width, size.height, pitch)
 {
 }
Example #14
 /// <summary> see CUDA doc; </summary>
 public static void MemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, uint ByteCount)
 {
     TestResult(my.cuMemcpyDtoD(dstDevice, srcDevice, ByteCount));
 }
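A minimal sketch combining the MemAlloc, MemcpyDtoD, and MemFree wrappers shown in Examples #30, #14, and #17; the 1024-byte size is an arbitrary placeholder:

     // Allocate two device buffers, copy one into the other, then free both.
     CUdeviceptr src, dst;
     MemAlloc(out src, 1024);
     MemAlloc(out dst, 1024);
     MemcpyDtoD(dst, src, 1024); // device-to-device copy of 1024 bytes
     MemFree(src);
     MemFree(dst);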
Example #15
 public static extern CurandStatus curandGenerateLogNormal(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, float mean, float stddev);
Example #16
 /// <summary> see CUDA doc; </summary>
 public static void MemHostGetDevicePointer(out CUdeviceptr dptr, IntPtr p, uint flags)
 {
     TestResult(my.cuMemHostGetDevicePointer(out dptr, p, flags));
 }
Example #17
 /// <summary> see CUDA doc; </summary>
 public static void MemFree(CUdeviceptr dptr)
 {
     TestResult(my.cuMemFree(dptr));
 }
Example #18
 public static extern CurandStatus curandGenerateLogNormalDouble(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, double mean, double stddev);
Example #19
 /// <summary> see CUDA doc; </summary>
 public static void ParamSetp(CUfunction hfunc, int offset, CUdeviceptr ptr)
 {
     ParamSetl(hfunc, offset, (long)ptr.p);
 }
Example #20
 public static extern CurandStatus curandGeneratePoisson(CurandGenerator generator, CUdeviceptr outputPtr, SizeT n, double lambda);
Example #21
 /// <summary>
 /// Creates a new NPPImage from allocated device ptr. Does not take ownership of devPtr.
 /// </summary>
 /// <param name="devPtr">Already allocated device ptr.</param>
 /// <param name="width">Image width in pixels</param>
 /// <param name="height">Image height in pixels</param>
 /// <param name="pitch">Pitch / Line step</param>
 public NPPImage_16uC2(CUdeviceptr devPtr, int width, int height, int pitch)
     : this(devPtr, width, height, pitch, false)
 {
 }
Example #22
 /// <summary>
 /// Creates a new NPPImage from allocated device ptr.
 /// </summary>
 /// <param name="devPtr">Already allocated device ptr.</param>
 /// <param name="size">Image size</param>
 /// <param name="pitch">Pitch / Line step</param>
 /// <param name="isOwner">If TRUE, devPtr is freed when disposing</param>
 public NPPImage_32fcC1(CUdeviceptr devPtr, NppiSize size, int pitch, bool isOwner)
     : this(devPtr, size.width, size.height, pitch, isOwner)
 {
 }
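A hedged construction sketch for the constructor above: assuming a managedCuda-style pitched allocation (the CudaPitchedDeviceVariable, float2, and NppiSize names below are assumptions), an existing buffer can be wrapped without transferring ownership by passing isOwner = false:

     // Wrap an existing pitched device allocation; the image does not free it
     // on dispose because isOwner is false. Sizes are illustrative only.
     var buffer = new CudaPitchedDeviceVariable<float2>(640, 480);
     var image  = new NPPImage_32fcC1(buffer.DevicePointer, new NppiSize(640, 480), (int)buffer.Pitch, false);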
Example #23
 public static extern cufftResult cufftSetWorkArea(cufftHandle plan, CUdeviceptr workArea);
Example #24
 public static extern nvgraphStatus nvgraphSetVertexData(nvgraphContext handle, nvgraphGraphDescr descrG, CUdeviceptr vertexData, SizeT setnum);
Example #25
 public static extern cufftResult cufftExecZ2D([In] cufftHandle plan, [In] CUdeviceptr idata, [In] CUdeviceptr odata);
Example #26
 public static extern nvgraphStatus nvgraphConvertTopology(nvgraphContext handle,
                                                           nvgraphTopologyType srcTType, nvgraphTopologyBase srcTopology, CUdeviceptr srcEdgeData, ref cudaDataType dataType,
                                                           nvgraphTopologyType dstTType, nvgraphTopologyBase dstTopology, CUdeviceptr dstEdgeData);
Example #27
 public void SetVector<T>(T[] data, CUdeviceptr ptr)
 {
     this.SetVector<T>(data, 1, ptr, 1);
 }
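A hedged round-trip sketch using the SetVector and GetVector wrappers from Examples #27 and #10; blas (the instance exposing these methods) and devPtr (a device allocation holding at least four floats) are assumptions:

     // Upload four floats, then read them back; devPtr must already be allocated.
     float[] upload   = { 1f, 2f, 3f, 4f };
     float[] download = new float[4];
     blas.SetVector<float>(upload, devPtr);   // host -> device
     blas.GetVector<float>(devPtr, download); // device -> host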
Example #28
 public static extern nvgraphStatus nvgraphGetEdgeData(nvgraphContext handle, nvgraphGraphDescr descrG, CUdeviceptr edgeData, SizeT setnum);
Example #29
 internal void VectorSplit(IDeviceMemoryPtr a, int size, int blockSize, CUdeviceptr output)
 {
     _Use(_vectorSplit, size, k => k.Run(0, a.DevicePointer, output, size, blockSize));
 }
Example #30
 /// <summary> see CUDA doc; </summary>
 public static void MemAlloc(out CUdeviceptr dptr, uint bytesize)
 {
     TestResult(my.cuMemAlloc(out dptr, bytesize));
 }