예제 #1
0
        public static void For(int number_of_threads, SimpleKernel simpleKernel)
        {
            if (Campy.Utils.Options.IsOn("import-only"))
            {
                JustImport(simpleKernel);
                return;
            }

            GCHandle handle1 = default(GCHandle);
            GCHandle handle2 = default(GCHandle);

            try
            {
                unsafe
                {
                    System.Reflection.MethodInfo method_info = simpleKernel.Method;
                    String kernel_assembly_file_name         = method_info.DeclaringType.Assembly.Location;
                    Mono.Cecil.ModuleDefinition md           = Campy.Meta.StickyReadMod.StickyReadModule(
                        kernel_assembly_file_name, new ReaderParameters {
                        ReadSymbols = true
                    });
                    MethodReference method_reference = md.ImportReference(method_info);

                    CUfunction ptr_to_kernel = default(CUfunction);
                    CUmodule   module        = default(CUmodule);

                    Campy.Utils.TimePhase.Time("compile     ", () =>
                    {
                        IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target);
                        module       = Singleton._compiler.SetModule(method_reference, image);
                        Singleton._compiler.StoreJits(module);
                        ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module);
                    });

                    RUNTIME.BclCheckHeap();

                    BUFFERS buffer = Singleton.Buffer;
                    IntPtr  kernel_target_object = IntPtr.Zero;

                    Campy.Utils.TimePhase.Time("deep copy ", () =>
                    {
                        int count = simpleKernel.Method.GetParameters().Length;
                        var bb    = Singleton._compiler.GetBasicBlock(method_reference);
                        if (bb.HasThis)
                        {
                            count++;
                        }
                        if (!(count == 1 || count == 2))
                        {
                            throw new Exception("Expecting at least one parameter for kernel.");
                        }

                        if (bb.HasThis)
                        {
                            kernel_target_object = buffer.AddDataStructure(simpleKernel.Target);
                        }
                    });

                    Campy.Utils.TimePhase.Time("kernel cctor set up", () =>
                    {
                        // For each cctor, run on GPU.
                        // Construct dependency graph of methods.
                        List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder();

                        // Finally, call cctors.
                        foreach (var bb in order_list)
                        {
                            if (Campy.Utils.Options.IsOn("trace-cctors"))
                            {
                                System.Console.WriteLine("Executing cctor "
                                                         + bb.FullName);
                            }
                            var cctor = Singleton._compiler.GetCudaFunction(bb, module);

                            var res = CUresult.CUDA_SUCCESS;
                            Campy.Utils.CudaHelpers.MakeLinearTiling(1,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                cctor,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)IntPtr.Zero,
                                (IntPtr)IntPtr.Zero
                                );

                            CudaHelpers.CheckCudaError(res);
                            res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                            CudaHelpers.CheckCudaError(res);
                        }
                    });

                    if (Campy.Utils.Options.IsOn("trace-cctors"))
                    {
                        System.Console.WriteLine("Done with cctors");
                    }

                    Campy.Utils.TimePhase.Time("kernel call ", () =>
                    {
                        IntPtr[] parm1 = new IntPtr[1];
                        IntPtr[] parm2 = new IntPtr[1];

                        parm1[0] = kernel_target_object;
                        parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int)));

                        IntPtr[] x1     = parm1;
                        handle1         = GCHandle.Alloc(x1, GCHandleType.Pinned);
                        IntPtr pointer1 = handle1.AddrOfPinnedObject();

                        IntPtr[] x2     = parm2;
                        handle2         = GCHandle.Alloc(x2, GCHandleType.Pinned);
                        IntPtr pointer2 = handle2.AddrOfPinnedObject();

                        IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
                        var res     = CUresult.CUDA_SUCCESS;
                        fixed(IntPtr * kernelParams = kp)
                        {
                            Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                ptr_to_kernel,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)kernelParams,
                                (IntPtr)IntPtr.Zero
                                );
                        }
예제 #2
0
        private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }
            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }
            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            using (var blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                int       m      = (int)a.Sizes[nta ? 0 : 1];
                int       k      = (int)b.Sizes[ntb ? 0 : 1];
                int       n      = (int)b.Sizes[ntb ? 1 : 0];
                int       lda    = (int)a.Strides[1];
                int       ldb    = (int)b.Strides[1];
                int       ldc    = (int)c.Strides[1];



                if (c.ElementType == DType.Float32)
                {
                    var aPtrSingle = CudaHelpers.GetBufferStart(a);
                    var bPtrSingle = CudaHelpers.GetBufferStart(b);
                    var cPtrSingle = CudaHelpers.GetBufferStart(c);

                    var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle,
                                                                          transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb, ref beta, cPtrSingle, ldc);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    var aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    var bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    var cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    var alphaDouble = (double)alpha;
                    var betaDouble  = (double)beta;
                    var _statusF64  = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle,
                                                                           transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb, ref betaDouble, cPtrDouble, ldc);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
예제 #3
0
        public static Tensor Mul_M_V(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
        {
            if (lhs.ElementType != rhs.ElementType || (result != null && result.ElementType != lhs.ElementType))
            {
                throw new InvalidOperationException("All tensors must have the same element type");
            }
            CudaHelpers.ThrowIfDifferentDevices(result, lhs, rhs);
            if (result != null && (result.Storage is CudaStorage))
            {
                throw new ArgumentException("result must be a CUDA tensor", "result");
            }
            if (!(lhs.Storage is CudaStorage))
            {
                throw new ArgumentException("lhs must be a CUDA tensor", "lhs");
            }
            if (!(rhs.Storage is CudaStorage))
            {
                throw new ArgumentException("rhs must be a CUDA tensor", "rhs");
            }

            if (lhs.DimensionCount != 2)
            {
                throw new ArgumentException("lhs must have 2 dimensions", "lhs");
            }
            if (rhs.DimensionCount != 1)
            {
                throw new ArgumentException("rhs must have 1 dimension (ie. be a vector)", "rhs");
            }

            Tensor lhsClone;

            if (lhs.Strides[1] == 1) // If lhs is already row-major, do nothing
            {
                lhsClone = lhs.CopyRef();
            }
            else if (lhs.Strides[0] == 1) // If lhs is column-major, transpose it
            {
                lhsClone = lhs.Transpose();
            }
            else // If lhs is not contiguous in either dimension, make a temporary contiguous copy
            {
                lhsClone = Ops.NewContiguous(lhs);
            }

            var writeTarget = TensorResultBuilder.GetWriteTarget(result, rhs, false, lhs.Sizes[0]);

            try
            {
                if (writeTarget.ElementType == DType.Float32)
                {
                    Run_M_V_float(context, writeTarget, lhsClone, rhs);
                }
                else if (writeTarget.ElementType == DType.Float64)
                {
                    Run_M_V_double(context, writeTarget, lhsClone, rhs);
                }
                else
                {
                    throw new NotSupportedException("CUDA Matrix-Vector multiplication with element type " + result.ElementType + " not supported");
                }
            }
            finally
            {
                lhsClone.Dispose();
            }

            return(writeTarget);
        }
예제 #4
0
        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLenght;

            int align = preFetch;

            CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements, align);
            // CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cuda initialization

            InitCudaModule();


            //copy data to device, set cuda function parameters
            valsPtr = cuda.CopyHostToDevice(vecVals);

            idxPtr       = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLenght);


            labelsPtr = cuda.CopyHostToDevice(Y);

            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            uint memSize = (uint)(2 * problemElements.Length * sizeof(float));
            //allocate mapped memory for our results
            //CUDARuntime.cudaSetDeviceFlags(CUDARuntime.cudaDeviceMapHost);



            // var e= CUDADriver.cuMemHostAlloc(ref outputIntPtr, memSize, 8);
            //CUDARuntime.cudaHostAlloc(ref outputIntPtr, memSize, CUDARuntime.cudaHostAllocMapped);
            //var errMsg=CUDARuntime.cudaGetErrorString(e);
            //cuda.HostRegister(outputIntPtr,memSize, Cuda)
            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);

            //normal memory allocation
            //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));


            #endregion

            SetCudaFunctionParameters();

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            VectorI = new float[problemElements[0].Dim + 1];
            VectorJ = new float[problemElements[0].Dim + 1];

            CudaHelpers.FillDenseVector(problemElements[0], VectorI);
            CudaHelpers.FillDenseVector(problemElements[1], VectorJ);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuVecI_TexRef, cuVecITexRefName, VectorI, ref VecIPtr);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuVecJ_TexRef, cuVecJTexRefName, VectorJ, ref VecJPtr);
        }
예제 #5
0
        /// <summary>
        /// Copies the gpu.
        /// </summary>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="totalElements">The total elements.</param>
        /// <exception cref="CudaException">
        /// </exception>
        public void CopyGpu(Tensor result, Tensor src, long totalElements)
        {
            // We assume here that we are using the default stream for both devices.
            var context = CudaHelpers.TSContextForTensor(src);

            var resultStorage = (CudaStorage)result.Storage;
            var resultContext = context.CudaContextForTensor(result);
            var resultPtr     = resultStorage.DevicePtrAtElement(result.StorageOffset);

            var srcStorage = (CudaStorage)src.Storage;
            var srcContext = context.CudaContextForTensor(src);
            var srcPtr     = srcStorage.DevicePtrAtElement(src.StorageOffset);


            if (CudaHelpers.GetDeviceId(result) != CudaHelpers.GetDeviceId(src))
            {
                // Cross-device copy. Perform two-way barrier between both devices' default streams.
                resultContext.SetCurrent();
                var dstReady = new CudaEvent(CUEventFlags.DisableTiming);
                dstReady.Record();

                srcContext.SetCurrent();
                var res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(CUstream.NullStream, dstReady.Event, 0);
                if (res != CUResult.Success)
                {
                    throw new CudaException(res);
                }
                dstReady.Dispose();
            }
            else
            {
                srcContext.SetCurrent();
            }

            var canMemcpy = CanMemcpy(result, src, totalElements);

            if (canMemcpy)
            {
                var res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
                    resultPtr, srcPtr, totalElements * src.ElementType.Size(), CUstream.NullStream);
                if (res != CUResult.Success)
                {
                    throw new CudaException(res);
                }
            }
            else
            {
                if (result.ElementType != src.ElementType)
                {
                    CopyGpuConvertTypes(result, src, totalElements);
                }
                else if (context.CanAccessPeer(CudaHelpers.GetDeviceId(src), CudaHelpers.GetDeviceId(result)))
                {
                    CopyGpuDirect(result, src, srcContext);
                }
                else
                {
                    CopyGpuIndirect(result, src, totalElements);
                }
            }
        }
예제 #6
0
        public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
        {
            int           deviceId    = CudaHelpers.GetDeviceId(src);
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForDevice(deviceId);

            if (src.DimensionCount > TSCudaContext.MaxDims)
            {
                throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
            }

            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            if (src.DimensionCount == 0)
            {
                return(result);
            }

            long totalElements         = src.ElementCount();
            ApplySpecialization config = new ApplySpecialization(src);
            object totalElementsTyped  = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
            object initValueTyped      = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            dim3 grid;
            dim3 block;

            byte[] ptx            = reduceAllKernels.GetPtx(context.Compiler);
            string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);

            ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

            if (isTwoPassReductionSize(totalElements))
            {
                getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
                }

                uint numPass1Blocks = grid.x;
                getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                smemSize = block.x * sizeof(float);

                InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
            }
            else
            {
                getSinglePassReduceBlockGrid(totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
                }
            }

            return(writeTarget);
        }
예제 #7
0
        public Tensor IndexSelect(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            long[] requiredOutputSize = (long[])src.Sizes.Clone();
            requiredOutputSize[dim] = 1;
            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, requiredOutputSize);


            // The `src` is partitioned into two parts:
            // -the size of each slice we are indexing, which is the
            // total size of the tensor ignoring dimension `dim`;
            // -the number of indices we are choosing, which is the total size
            // of the tensor `indices`.
            long numIndices       = indices.ElementCount();
            long dstTotalSize     = writeTarget.ElementCount();
            long srcSelectDimSize = src.Sizes[dim];
            long sliceSize        = dstTotalSize / numIndices;

            int  mpc             = context.DeviceInfoForContext(cudaContext).MultiProcessorCount;
            dim3 smallIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(sliceSize, 128), (mpc * 8)));
            dim3 smallIndexBlock = new dim3((uint)Math.Min(sliceSize, 128));

            dim3 largeIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(dstTotalSize, 128), (mpc * 8)));
            dim3 largeIndexBlock = new dim3((uint)Math.Min(dstTotalSize, 128));


            long[] newResultSize = (long[])writeTarget.Sizes.Clone();
            newResultSize[dim] = 1;
            Tensor resultFlat = new Tensor(newResultSize, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset);

            long[] newSrcSize = (long[])src.Sizes.Clone();
            newSrcSize[dim] = 1;
            Tensor srcFlat = new Tensor(newSrcSize, src.Strides, src.Storage, src.StorageOffset);


            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                // Threshold for small kernel
                bool   smallKernel = numIndices <= 16;
                string kernelName  = "";
                bool   indContig   = indices.IsContiguous();

                if (writeTarget.DimensionCount == src.DimensionCount &&
                    writeTarget.DimensionCount <= 3 &&
                    indContig)
                {
                    kernelName = MakeKernelName(smallKernel, true, writeTarget.DimensionCount, src.DimensionCount, -2);
                }
                else
                {
                    kernelName = MakeKernelName(smallKernel, true, -1, -1, -1);
                }

                dim3 grid  = smallKernel ? smallIndexGrid : largeIndexGrid;
                dim3 block = smallKernel ? smallIndexBlock : largeIndexBlock;
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, dim, sliceSize, srcSelectDimSize);
            }
            else
            {
                string kernelName = MakeKernelName(false, false, -1, -1, -1);

                Invoke(context, cudaContext, kernelName, largeIndexGrid, largeIndexBlock, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, dim, dstTotalSize, sliceSize, srcSelectDimSize);
            }



            return(writeTarget);
        }
예제 #8
0
        /// <summary>
        /// Copies the gpu direct.
        /// </summary>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="srcContext">The source context.</param>
        private void CopyGpuDirect(Tensor result, Tensor src, CudaContext srcContext)
        {
            var context = CudaHelpers.TSContextForTensor(src);

            CopyOp.Invoke(fillCopyKernels, context, srcContext, result, src);
        }
예제 #9
0
        public static void For(int number_of_threads, SimpleKernel simpleKernel)
        {
            GCHandle handle1 = default(GCHandle);
            GCHandle handle2 = default(GCHandle);

            try
            {
                unsafe
                {
                    //////// COMPILE KERNEL INTO GPU CODE ///////
                    /////////////////////////////////////////////
                    var stopwatch_cuda_compile = new Stopwatch();
                    stopwatch_cuda_compile.Start();
                    IntPtr     image               = Singleton()._converter.Compile(simpleKernel.Method, simpleKernel.Target);
                    CUfunction ptr_to_kernel       = Singleton()._converter.GetCudaFunction(simpleKernel.Method, image);
                    var        elapse_cuda_compile = stopwatch_cuda_compile.Elapsed;

                    RUNTIME.CheckHeap();

                    //////// COPY DATA INTO GPU /////////////////
                    /////////////////////////////////////////////
                    var stopwatch_deep_copy_to = new Stopwatch();
                    stopwatch_deep_copy_to.Reset();
                    stopwatch_deep_copy_to.Start();
                    BUFFERS buffer = Singleton().Buffer;

                    // Set up parameters.
                    int count = simpleKernel.Method.GetParameters().Length;
                    var bb    = Singleton()._converter.GetBasicBlock(simpleKernel.Method);
                    if (bb.HasThis)
                    {
                        count++;
                    }
                    if (!(count == 1 || count == 2))
                    {
                        throw new Exception("Expecting at least one parameter for kernel.");
                    }

                    IntPtr[] parm1 = new IntPtr[1];
                    IntPtr[] parm2 = new IntPtr[1];
                    IntPtr   ptr   = IntPtr.Zero;

                    // The method really should have a "this" because it's a closure
                    // object.
                    if (bb.HasThis)
                    {
                        RUNTIME.CheckHeap();
                        ptr      = buffer.AddDataStructure(simpleKernel.Target);
                        parm1[0] = ptr;
                    }

                    {
                        Type btype = typeof(int);
                        var  s     = BUFFERS.SizeOf(btype);
                        var  ptr2  = buffer.New(s);
                        // buffer.DeepCopyToImplementation(index, ptr2);
                        parm2[0] = ptr2;
                    }

                    stopwatch_deep_copy_to.Start();
                    var elapse_deep_copy_to = stopwatch_cuda_compile.Elapsed;

                    var stopwatch_call_kernel = new Stopwatch();
                    stopwatch_call_kernel.Reset();
                    stopwatch_call_kernel.Start();

                    IntPtr[] x1 = parm1;
                    handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned);
                    IntPtr pointer1 = handle1.AddrOfPinnedObject();

                    IntPtr[] x2 = parm2;
                    handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned);
                    IntPtr pointer2 = handle2.AddrOfPinnedObject();

                    RUNTIME.CheckHeap();

                    IntPtr[] kp  = new IntPtr[] { pointer1, pointer2 };
                    var      res = CUresult.CUDA_SUCCESS;
                    fixed(IntPtr *kernelParams = kp)
                    {
                        Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                        //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);

                        res = Cuda.cuLaunchKernel(
                            ptr_to_kernel,
                            tiles.x, tiles.y, tiles.z,             // grid has one block.
                            tile_size.x, tile_size.y, tile_size.z, // n threads.
                            0,                                     // no shared memory
                            default(CUstream),
                            (IntPtr)kernelParams,
                            (IntPtr)IntPtr.Zero
                            );
                    }
                    CudaHelpers.CheckCudaError(res);
                    res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                    CudaHelpers.CheckCudaError(res);

                    stopwatch_call_kernel.Stop();
                    var elapse_call_kernel = stopwatch_call_kernel.Elapsed;

                    if (Campy.Utils.Options.IsOn("jit_trace"))
                    {
                        System.Console.WriteLine("cuda compile  " + elapse_cuda_compile);
                        System.Console.WriteLine("deep copy in  " + elapse_deep_copy_to);
                        System.Console.WriteLine("cuda kernel   " + elapse_call_kernel);
                    }

                    {
                        var stopwatch_deep_copy_back = new Stopwatch();
                        stopwatch_deep_copy_back.Reset();

                        RUNTIME.CheckHeap();

                        stopwatch_deep_copy_back.Start();

                        buffer.SynchDataStructures();

                        stopwatch_deep_copy_back.Stop();

                        RUNTIME.CheckHeap();

                        var elapse_deep_copy_back = stopwatch_deep_copy_back.Elapsed;
                        if (Campy.Utils.Options.IsOn("jit_trace"))
                        {
                            System.Console.WriteLine("deep copy out " + elapse_deep_copy_back);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                throw e;
            }
            finally
            {
                if (default(GCHandle) != handle1)
                {
                    handle1.Free();
                }
                if (default(GCHandle) != handle2)
                {
                    handle2.Free();
                }
            }
        }
예제 #10
0
        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.FeaturesCount;//.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */

            #region copy trainning examples to GPU

            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);

            #endregion

            /*
             * allocate memory for gradient
             */
            alphaMemSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            gradPtr    = cuda.Allocate(alphaMemSize);
            gradOldPtr = cuda.Allocate(alphaMemSize);

            alphaPtr    = cuda.Allocate(alphaMemSize);
            alphaOldPtr = cuda.Allocate(alphaMemSize);
            alphaTmpPtr = cuda.Allocate(alphaMemSize);


            /*
             * reduction blocks for computing Obj
             */



            GetNumThreadsAndBlocks(vecDim, 64, threadsPerBlock, ref threadsForReduceObjW, ref bpgReduceW);

            reduceObjW = new float[bpgReduceW];
            uint reduceWBytes = (uint)bpgReduceW * sizeof(float);
            reduceObjWPtr = cuda.Allocate(reduceWBytes);

            /*
             * reduction size for kernels which operate on alpha
             */
            int reductionSize = problem.ElementsCount;
            threadsForReduceObjAlpha = 0;

            GetNumThreadsAndBlocks(problem.ElementsCount, 64, threadsPerBlock, ref threadsForReduceObjAlpha, ref bpgReduceAlpha);

            uint alphaReductionBytes = (uint)bpgReduceAlpha * sizeof(float);

            /*
             * reduction array for computing objective function value
             */

            reduceObjAlpha    = new float[bpgReduceAlpha];
            reduceObjAlphaPtr = cuda.Allocate(alphaReductionBytes);


            /*
             * reduction arrays for computing BB step
             */
            alphaPartReduce     = new float[bpgReduceAlpha];
            gradPartReduce      = new float[bpgReduceAlpha];
            alphaGradPartReduce = new float[bpgReduceAlpha];

            reduceBBAlphaGradPtr = cuda.Allocate(alphaReductionBytes);
            reduceBBAlphaPtr     = cuda.Allocate(alphaReductionBytes);
            reduceBBGradPtr      = cuda.Allocate(alphaReductionBytes);

            /*
             * reduction arrays for comuting lin part
             */
            reduceLinPart    = new float[bpgReduceAlpha];
            reduceLinPartPtr = cuda.Allocate(alphaReductionBytes);



            //float[] wVec = new float[vecDim];
            wVecMemSize = (uint)vecDim * sizeof(float);
            wTempVecPtr = cuda.Allocate(wVecMemSize);
            //move W wector
            SetTextureMemory(ref cuWVecTexRef, cudaWVecTexRefName, ref wVecPtr, wVecMemSize);

            //set texture memory for labels
            SetTextureMemory(ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            SetTextureMemory(ref cuDeltasTexRef, "deltasTexRef", ref deltasPtr, alphaMemSize);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");


            stepBBPtr = cuda.GetModuleGlobal(cuModule, "stepBB");
            float[] stepData = new float[] { 0.1f };
            cuda.CopyHostToDevice(stepBBPtr, stepData);

            SetCudaParameters(sub_prob);
        }
예제 #11
0
        private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[1] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }

            if (b.Strides[1] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[1] != 1)
            {
                throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
            }

            using (Util.PooledObject <CudaBlas> blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                int       m      = (int)a.Sizes[nta ? 1 : 2];
                int       k      = (int)b.Sizes[ntb ? 1 : 2];
                int       n      = (int)b.Sizes[ntb ? 2 : 1];
                int       lda    = (int)a.Strides[2];
                int       ldb    = (int)b.Strides[2];
                int       ldc    = (int)c.Strides[2];

                int stra      = (int)a.Strides[0];
                int strb      = (int)b.Strides[0];
                int strc      = (int)c.Strides[0];
                int batchSize = (int)c.Sizes[0];


                //// Set the math mode to allow cuBLAS to use Tensor Cores:
                //cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

                CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException($"Failed to set math mode to tensor ops.");
                }


                if (c.ElementType == DType.Float32)
                {
                    CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

                    CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle,
                                                                                              transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb, ref beta, cPtrSingle, ldc, strc, batchSize);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    CUdeviceptr  aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr  bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr  cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    double       alphaDouble = alpha;
                    double       betaDouble  = beta;
                    CublasStatus _statusF64  = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle,
                                                                                               transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb, ref betaDouble, cPtrDouble, ldc, strc, batchSize);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
예제 #12
0
        public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return(result);
            }

            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            var requiredOutputSize = (long[])src.Sizes.Clone();

            requiredOutputSize[dim] = 1;
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            var inElements      = src.ElementCount();
            var reductionSize   = src.Sizes[dim];
            var reductionStride = src.Strides[dim];
            var outElements     = inElements / reductionSize;
            var contigReduction = reductionStride == 1;


            // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1
            // This includes for the purposes of determining which tensor specializations to use (changing
            // the dimension size to 1 may make the tensor non-contiguous
            var newSizes = (long[])src.Sizes.Clone();

            newSizes[dim] = 1;
            var srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

            var    config               = new ApplySpecialization(writeTarget, srcSlim);
            object totalSlices          = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
            var    initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            var ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                var block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                var grid     = GetContigReduceGrid(outElements);
                var smemSize = (uint)src.ElementType.Size() * block.x;

                var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                var  deviceProps = context.DeviceInfoForContext(cudaContext);
                var  block       = GetNonContigReduceBlock(deviceProps);
                var  grid        = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize    = 0;

                var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return(writeTarget);
        }
예제 #13
0
        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */
            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);



            /*
             * allocate memory for gradient
             */
            uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            //allocate mapped memory for our results (dot product beetween vector W and all elements)
            gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            gradPtr    = cuda.GetHostDevicePointer(gradIntPtr, 0);

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[vecDim];


            //move W wector
            //CudaHelpers.FillDenseVector(problemElements[0], mainVector);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);


            //set texture memory for labels
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            /*
             * data for cuda solver
             */

            //normaly for L2 solver QDii= xii*xii+Diag_i
            //where Diag_i = 0.5/Cp if yi=1
            //      Diag_i = 0.5/Cn if yi=-1
            //but we will add this on GPU
            QD     = new float[sub_prob.ElementsCount];
            alpha  = new float[sub_prob.ElementsCount];
            deltas = new float[sub_prob.ElementsCount];
            float[] diag = new float[3];
            for (int i = 0; i < sub_prob.ElementsCount; i++)
            {
                QD[i]     = sub_prob.Elements[i].DotProduct();
                alpha[i]  = 0f;
                deltas[i] = 0;
            }

            qdPtr = cuda.CopyHostToDevice(QD);

            alphaPtr = cuda.Allocate(alpha);


            //deltasPtr = cuda.Allocate(deltas);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
            //set this in fill function
            //cuda.CopyHostToDevice(diagPtr, diag);

            //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim");
            ////todo: check if it ok
            ////cuda.Memset(dimPtr,(uint) vecDim, 1);
            //int[] dimArr = new int[] { vecDim };
            //cuda.CopyHostToDevice(dimPtr,dimArr);

            //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
            //CUDARuntime.cudaMemcpyToSymbol("Dim", ,1,0, cudaMemcpyKind.cudaMemcpyHostToDevice);

            CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

            //two ways of computing scaling param, should be the same, but it depends on rounding.
            //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount));

            stepScaling = 0.0002f;// (float)(1.0 / sub_prob.ElementsCount);

            //set scaling constant
            float[] scArr = new float[] { stepScaling };
            cuda.CopyHostToDevice(deltaScalingPtr, scArr);
            //cuda.Memset(deltaScalingPtr, (uint) scaling,sizeof(float));

            //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim);

            SetCudaParameters(sub_prob);
        }