public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            IEnumerable <Tensor> allTensors = args.OfType <Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
예제 #2
0
 public CUDATaskParams()
     : base()
 {
     BlockSize    = new Size(1, 1);
     GridSize     = new Size(1, 1);
     Architecture = ArchitectureType.x64;
     CudaDevice   = CUDADevices.All.FirstOrDefault();
 }
예제 #3
0
        private static CudaContext ContextWithDevice(CudaDeviceProperties device)
        {
            int deviceIndex = Enumerable
                              .Range(0, CudaContext.GetDeviceCount())
                              .Where(i => CudaContext.GetDeviceInfo(i).Equals(device))
                              .FirstOrDefault();

            return(new CudaContext(deviceIndex));
        }
예제 #4
0
        public GPUInfo(int deviceID)
        {
            CudaDeviceProperties props = CudaContext.GetDeviceInfo(deviceID);

            DeviceID         = deviceID;
            DeviceProperties = props;
            TotalVRam        = props.TotalGlobalMemory;
            FreeVRam         = props.TotalConstantMemory;
        }
예제 #5
0
        public static void GetDeviceProperties(ref CudaDeviceProperties props, int dev)
        {
            int err = _GetDeviceProperties(ref props, dev);

            if (err != 0)
            {
                Exceptions.CudaKernelExceptionFactory.ThrowException("_GetDeviceProperties", err);
            }
        }
예제 #6
0
        private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps)
        {
            int size = ScratchSpacePerSMStream * deviceProps.MultiProcessorCount;

            ManagedCuda.BasicTypes.CUdeviceptr buffer = context.AllocateMemory(size);
            return(new ScratchSpace()
            {
                size = size, buffer = buffer
            });
        }
예제 #7
0
        /// <summary>
        /// Allocs the scratch space.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="deviceProps">The device props.</param>
        /// <returns>ScratchSpace.</returns>
        private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps)
        {
            var size   = ScratchSpacePerSMStream * deviceProps.MultiProcessorCount;
            var buffer = context.AllocateMemory(size);

            return(new ScratchSpace()
            {
                size = size, buffer = buffer
            });
        }
        //static float3[] h_A;
        //static float3[] h_C;
        //static CudaDeviceVariable<float3> d_A;
        //static CudaDeviceVariable<float3> d_C;
        public CalculateHeatmap()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            dev = ctx.GetDeviceInfo();
            Console.WriteLine("Using CUDA Device {0} compute level {1} timeout {2}", dev.DeviceName, dev.ComputeCapability, dev.KernelExecTimeoutEnabled ? "enabled" : "disabled");
            string resName;

            resName = @"C:\WEDEV\GpuImplementations\GpuInterpolation\RasterInterpolation_x64.ptx";
            Console.WriteLine("Loading Interpolation Kernel");
            InterpolateKernel = ctx.LoadKernelPTX(resName, "RasterInterpolate");
        }
예제 #9
0
        public CudaFloat32Handler(int deviceId = 0) : base(new CudaFloat32BackendHandle(deviceId, backendTag: -1))
        {
            _cudaBackendHandle = (CudaFloat32BackendHandle)DiffsharpBackendHandle;
            DeviceId           = _cudaBackendHandle.CudaContext.DeviceId;

            CudaDeviceProperties deviceProperties = CudaContext.GetDeviceInfo(deviceId);

            _logger.Info($"Using CUDA device {deviceProperties.DeviceName} with device id {deviceId} (compute capability {deviceProperties.ComputeCapability}, " +
                         $"memory {deviceProperties.TotalGlobalMemory / (1024 * 1024)}MB).");

            RegisterContext(_cudaBackendHandle.CudaContext, _cudaBackendHandle.CudaStream);
        }
예제 #10
0
        /// <summary>
        /// Gets the apply grid.
        /// </summary>
        /// <param name="deviceInfo">The device information.</param>
        /// <param name="totalElements">The total elements.</param>
        /// <returns>dim3.</returns>
        public static dim3 GetApplyGrid(CudaDeviceProperties deviceInfo, long totalElements)
        {
            var smCount = deviceInfo.MultiProcessorCount;

            // Rationale for grid size - from cuTorch source code:
            // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
            // which seems to be a good sweetspot for latency hiding
            var maxSize    = 4 * smCount;
            var targetSize = CeilDiv(totalElements, ApplyThreadsPerBlock);

            return(new dim3((uint)Math.Min(targetSize, maxSize)));
        }
예제 #11
0
        public DeviceState(int deviceId)
        {
            CudaContext = new CudaContext(deviceId);
            DeviceInfo  = CudaContext.GetDeviceInfo();

            BlasHandles = new ObjectPool <CudaBlas>(1, () =>
            {
                CudaContext.SetCurrent();
                return(new CudaBlas());
            },
                                                    blas => blas.Dispose());

            MemoryAllocator = new PoolingDeviceAllocator(CudaContext);
            ScratchSpace    = AllocScratchSpace(CudaContext, DeviceInfo);
        }
예제 #12
0
        public DeviceState(int deviceId, float memoryUsageRatio = 0.9f)
        {
            this.CudaContext = new CudaContext(deviceId);
            this.DeviceInfo  = this.CudaContext.GetDeviceInfo();

            this.BlasHandles = new ObjectPool <CudaBlas>(1, () =>
            {
                this.CudaContext.SetCurrent();
                return(new CudaBlas());
            },
                                                         blas => blas.Dispose());

            this.MemoryAllocator = new PoolingDeviceAllocator(this.CudaContext, memoryUsageRatio);
            this.ScratchSpace    = AllocScratchSpace(this.CudaContext, this.DeviceInfo);
        }
예제 #13
0
        private void SetDevice(int i, string name, CudaDeviceProperties info, SizeT mem)
        {
            CheckBox cb  = checkBox1;
            Label    lbl = g0;

            switch (i)
            {
            case 0:
                cb  = checkBox1;
                lbl = g0;
                break;

            case 1:
                cb  = checkBox2;
                lbl = g1;
                break;

            case 2:
                cb  = checkBox3;
                lbl = g2;
                break;

            case 3:
                cb  = checkBox4;
                lbl = g3;
                break;

            case 4:
                cb  = checkBox5;
                lbl = g4;
                break;

            case 5:
                cb  = checkBox6;
                lbl = g5;
                break;

            case 6:
                cb  = checkBox7;
                lbl = g6;
                break;
            }

            cb.Enabled = true;
            lbl.Text   = name;
            lbl.Tag    = mem;
        }
예제 #14
0
        /// <summary>
        /// Initializes a new instance of the <see cref="DeviceState"/> class.
        /// </summary>
        /// <param name="deviceId">The device identifier.</param>
        public DeviceState(int deviceId)
        {
            this.CudaContext = new CudaContext(deviceId);
            this.DeviceInfo  = this.CudaContext.GetDeviceInfo();

            this.BlasHandles = new ObjectPool <CudaBlas>(2, () =>
            {
                CudaContext.SetCurrent();
                return(new CudaBlas());
            },
                                                         blas => blas.Dispose());

            this.DnnHandles = new ObjectPool <ManagedCuda.CudaDNN.CudaDNNContext>(2, () =>
            {
                CudaContext.SetCurrent();
                return(new ManagedCuda.CudaDNN.CudaDNNContext());
            },
                                                                                  dnn => dnn.Dispose());

            this.MemoryAllocator = new PoolingDeviceAllocator(CudaContext);
            this.ScratchSpace    = AllocScratchSpace(CudaContext, DeviceInfo);
        }
예제 #15
0
        // 1.Простой тест
        // 2.Тест с передачей и приемом больших и разных структур
        public string GetSummary()
        {
            string s = "";

            //int deviceID = 0;
            //CudaContext ctx = new CudaContext(deviceID, CUCtxFlags.MapHost | CUCtxFlags.BlockingSync);

            //for default setting with device 0:
            //CudaContext ctx = new CudaContext();

            int deviceCount = CudaContext.GetDeviceCount();

            s += $"deviceCount = {deviceCount}\n";
            int devID = CudaContext.GetMaxGflopsDeviceId();

            s += $"GetMaxGflopsDeviceId = {devID}\n";

            //return s;

            for (int deviceID = 0; deviceID < deviceCount; deviceID++)
            {
                s += $"----- DeviceID = {deviceID} -----\n";
                CudaDeviceProperties props = CudaContext.GetDeviceInfo(deviceID);
                s += $"DeviceName = {props.DeviceName}\n";
                s += $"DriverVersion = {props.DriverVersion.ToString()}\n";
                s += $"CUDA ComputeCapability = {props.ComputeCapability.ToString()}\n";

                s += $"Integrated = {props.Integrated.ToString()}\n";
                s += $"MultiProcessorCount = {props.MultiProcessorCount.ToString()}\n";
                s += $"ClockRate = {(props.ClockRate/1000).ToString()} MHz\n";

                s += $"TotalGlobalMemory = {(props.TotalGlobalMemory / 1000000).ToString()} Mb\n";
                s += $"MemoryClockRate = {(props.MemoryClockRate / 1000).ToString()} MHz\n";
                s += $"GlobalMemoryBusWidth = {props.GlobalMemoryBusWidth.ToString()} bit\n";

                s += $"maxGridSize[3] (MaxGridDim) = ({props.MaxGridDim.x}; {props.MaxGridDim.y}; {props.MaxGridDim.z}) \n";

                s += $"maxThreadsPerBlock = {props.MaxThreadsPerBlock} \n";
                s += $"maxThreadsDim[3] (MaxBlockDim) = ({props.MaxBlockDim.x}; {props.MaxBlockDim.y}; {props.MaxBlockDim.z}) \n";
                s += $"MaxThreadsPerMultiProcessor = {props.MaxThreadsPerMultiProcessor} \n";
            }

            return(s);

            /*
             + name[256] = is an ASCII string identifying the device;
             + uuid is a 16-byte unique identifier.
             + totalGlobalMem = is the total amount of global memory available on the device in bytes;
             + sharedMemPerBlock is the maximum amount of shared memory available to a thread block in bytes;
             + regsPerBlock is the maximum number of 32-bit registers available to a thread block;
             + warpSize is the warp size in threads;
             + memPitch is the maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through cudaMallocPitch();
             + maxThreadsPerBlock = is the maximum number of threads per block;
             + maxThreadsDim[3] = MaxBlockDim = contains the maximum size of each dimension of a block;
             + maxGridSize[3] = MaxGridDim = contains the maximum size of each dimension of a grid;
             + clockRate = is the clock frequency in kilohertz;
             + totalConstMem is the total amount of constant memory available on the device in bytes;
             + major, minor are the major and minor revision numbers defining the device's compute capability;
             + textureAlignment is the alignment requirement; texture base addresses that are aligned to textureAlignment bytes do not need an offset applied to texture fetches;
             + texturePitchAlignment is the pitch alignment requirement for 2D texture references that are bound to pitched memory;
             + deviceOverlap is 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not. Deprecated, use instead asyncEngineCount.
             + multiProcessorCount = is the number of multiprocessors on the device;
             + kernelExecTimeoutEnabled is 1 if there is a run time limit for kernels executed on the device, or 0 if not.
             + integrated = is 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component.
             + canMapHostMemory is 1 if the device can map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not;
             + computeMode is the compute mode that the device is currently in. Available modes are as follows:
             +  cudaComputeModeDefault: Default mode - Device is not restricted and multiple threads can use cudaSetDevice() with this device.
             +  cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will be able to use cudaSetDevice() with this device.
             +  cudaComputeModeProhibited: Compute-prohibited mode - No threads can use cudaSetDevice() with this device.
             +  cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many threads in one process will be able to use cudaSetDevice() with this device.
             +  If cudaSetDevice() is called on an already occupied device with computeMode cudaComputeModeExclusive, cudaErrorDeviceAlreadyInUse will be immediately returned indicating the device cannot be used. When an occupied exclusive mode device is chosen with cudaSetDevice, all subsequent non-device management runtime functions will return cudaErrorDevicesUnavailable.
             + maxTexture1D is the maximum 1D texture size.
             + maxTexture1DMipmap is the maximum 1D mipmapped texture texture size.
             + maxTexture1DLinear is the maximum 1D texture size for textures bound to linear memory.
             + maxTexture2D[2] contains the maximum 2D texture dimensions.
             + maxTexture2DMipmap[2] contains the maximum 2D mipmapped texture dimensions.
             + maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
             + maxTexture2DGather[2] contains the maximum 2D texture dimensions if texture gather operations have to be performed.
             + maxTexture3D[3] contains the maximum 3D texture dimensions.
             + maxTexture3DAlt[3] contains the maximum alternate 3D texture dimensions.
             + maxTextureCubemap is the maximum cubemap texture width or height.
             + maxTexture1DLayered[2] contains the maximum 1D layered texture dimensions.
             + maxTexture2DLayered[3] contains the maximum 2D layered texture dimensions.
             + maxTextureCubemapLayered[2] contains the maximum cubemap layered texture dimensions.
             + maxSurface1D is the maximum 1D surface size.
             + maxSurface2D[2] contains the maximum 2D surface dimensions.
             + maxSurface3D[3] contains the maximum 3D surface dimensions.
             + maxSurface1DLayered[2] contains the maximum 1D layered surface dimensions.
             + maxSurface2DLayered[3] contains the maximum 2D layered surface dimensions.
             + maxSurfaceCubemap is the maximum cubemap surface width or height.
             + maxSurfaceCubemapLayered[2] contains the maximum cubemap layered surface dimensions.
             + surfaceAlignment specifies the alignment requirements for surfaces.
             + concurrentKernels is 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not. It is not guaranteed that multiple kernels will be resident on the device concurrently so this feature should not be relied upon for correctness;
             + ECCEnabled is 1 if the device has ECC support turned on, or 0 if not.
             + pciBusID is the PCI bus identifier of the device.
             + pciDeviceID is the PCI device (sometimes called slot) identifier of the device.
             + pciDomainID is the PCI domain identifier of the device.
             + tccDriver is 1 if the device is using a TCC driver or 0 if not.
             + asyncEngineCount is 1 when the device can concurrently copy memory between host and device while executing a kernel. It is 2 when the device can concurrently copy memory between host and device in both directions and execute a kernel at the same time. It is 0 if neither of these is supported.
             + unifiedAddressing is 1 if the device shares a unified address space with the host and 0 otherwise.
             + memoryClockRate is the peak memory clock frequency in kilohertz.
             + memoryBusWidth is the memory bus width in bits.
             + l2CacheSize is L2 cache size in bytes.
             + maxThreadsPerMultiProcessor is the number of maximum resident threads per multiprocessor.
             + streamPrioritiesSupported is 1 if the device supports stream priorities, or 0 if it is not supported.
             + globalL1CacheSupported is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
             + localL1CacheSupported is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
             + sharedMemPerMultiprocessor is the maximum amount of shared memory available to a multiprocessor in bytes; this amount is shared by all thread blocks simultaneously resident on a multiprocessor;
             + regsPerMultiprocessor is the maximum number of 32-bit registers available to a multiprocessor; this number is shared by all thread blocks simultaneously resident on a multiprocessor;
             + managedMemory is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
             + isMultiGpuBoard is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not;
             + multiGpuBoardGroupID is a unique identifier for a group of devices associated with the same board. Devices on the same multi-GPU board will share the same identifier;
             + singleToDoublePrecisionPerfRatio is the ratio of single precision performance (in floating-point operations per second) to double precision performance.
             + pageableMemoryAccess is 1 if the device supports coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
             + concurrentManagedAccess is 1 if the device can coherently access managed memory concurrently with the CPU, and 0 otherwise.
             + computePreemptionSupported is 1 if the device supports Compute Preemption, and 0 otherwise.
             + canUseHostPointerForRegisteredMem is 1 if the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
             + cooperativeLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernel, and 0 otherwise.
             + cooperativeMultiDeviceLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
             + pageableMemoryAccessUsesHostPageTables is 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise.
             + directManagedMemAccessFromHost is 1 if the host can directly access managed memory on the device without migration, and 0 otherwise.
             */

            /*
             * deviceCount = 1
             * GetMaxGflopsDeviceId = 0
             * ----- DeviceID = 0 -----
             * DeviceName = GeForce GTX 1050 Ti
             * DriverVersion = 10.20
             * CUDA ComputeCapability = 6.1
             * Integrated = False
             * MultiProcessorCount = 6
             * ClockRate = 1392 MHz
             * TotalGlobalMemory = 4294 Mb
             * MemoryClockRate = 3504 MHz
             * GlobalMemoryBusWidth = 128 bit
             * maxGridSize[3] (MaxGridDim) = (2147483647; 65535; 65535)
             * maxThreadsPerBlock = 1024
             * maxThreadsDim[3] (MaxBlockDim) = (1024; 1024; 64)
             * MaxThreadsPerMultiProcessor = 2048
             */
        }
예제 #16
0
 private static double DevicePerformanceValue(CudaDeviceProperties device)
 {
     return(device.ClockRate * device.MultiProcessorCount * device.MaxThreadsPerMultiProcessor);
 }
예제 #17
0
        /* PageRank
         *  Find PageRank for a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor.
         *  This is equivalent to an eigenvalue problem where we want the eigenvector corresponding to the maximum eigenvalue.
         *  By construction, the maximum eigenvalue is 1.
         *  The eigenvalue problem is solved with the power method.
         *
         * Initially :
         * V = 6
         * E = 10
         *
         * Edges       W
         * 0 -> 1    0.50
         * 0 -> 2    0.50
         * 2 -> 0    0.33
         * 2 -> 1    0.33
         * 2 -> 4    0.33
         * 3 -> 4    0.50
         * 3 -> 5    0.50
         * 4 -> 3    0.50
         * 4 -> 5    0.50
         * 5 -> 3    1.00
         *
         * bookmark (0.0, 1.0, 0.0, 0.0, 0.0, 0.0)^T note: 1.0 if i is a dangling node, 0.0 otherwise
         *
         * Source oriented representation (CSC):
         * destination_offsets {0, 1, 3, 4, 6, 8, 10}
         * source_indices {2, 0, 2, 0, 4, 5, 2, 3, 3, 4}
         * W0 = {0.33, 0.50, 0.33, 0.50, 0.50, 1.00, 0.33, 0.50, 0.50, 1.00}
         *
         * ----------------------------------
         *
         * Operation : Pagerank with various damping factor
         * ----------------------------------
         *
         * Expected output for alpha= 0.9 (result stored in pr_2) : (0.037210, 0.053960, 0.041510, 0.37510, 0.206000, 0.28620)^T
         * From "Google's PageRank and Beyond: The Science of Search Engine Rankings" Amy N. Langville & Carl D. Meyer
         */

        static void Main(string[] args)
        {
            SizeT n = 6, nnz = 10, vertex_numsets = 3, edge_numsets = 1;

            float[] alpha1 = new float[] { 0.85f }, alpha2 = new float[] { 0.90f };

            int i;

            int[]   destination_offsets_h, source_indices_h;
            float[] weights_h, bookmark_h, pr_1, pr_2;
            //void** vertex_dim;

            // nvgraph variables
            GraphContext          handle;
            GraphDescriptor       graph;
            nvgraphCSCTopology32I CSC_input;

            cudaDataType[] edge_dimT = new cudaDataType[] { cudaDataType.CUDA_R_32F };
            cudaDataType[] vertex_dimT;

            // use command-line specified CUDA device, otherwise use device with highest Gflops/s
            int cuda_device = 0;

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);

            Console.WriteLine("> Detected Compute SM {0}.{1} hardware with {2} multi-processors",
                              deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor, deviceProp.MultiProcessorCount);

            if (deviceProp.ComputeCapability.Major < 3)
            {
                Console.WriteLine("> nvGraph requires device SM 3.0+");
                Console.WriteLine("> Waiving.");
                return;
            }


            // Allocate host data
            destination_offsets_h = new int[n + 1];
            source_indices_h      = new int[nnz];
            weights_h             = new float[nnz];
            bookmark_h            = new float[n];
            pr_1 = new float[n];
            pr_2 = new float[n];
            //vertex_dim = (void**)malloc(vertex_numsets * sizeof(void*));
            vertex_dimT = new cudaDataType[vertex_numsets];
            CSC_input   = new nvgraphCSCTopology32I();

            // Initialize host data
            //vertex_dim[0] = (void*)bookmark_h; vertex_dim[1]= (void*)pr_1, vertex_dim[2]= (void*)pr_2;
            vertex_dimT[0] = cudaDataType.CUDA_R_32F; vertex_dimT[1] = cudaDataType.CUDA_R_32F; vertex_dimT[2] = cudaDataType.CUDA_R_32F;

            weights_h[0] = 0.333333f;
            weights_h[1] = 0.500000f;
            weights_h[2] = 0.333333f;
            weights_h[3] = 0.500000f;
            weights_h[4] = 0.500000f;
            weights_h[5] = 1.000000f;
            weights_h[6] = 0.333333f;
            weights_h[7] = 0.500000f;
            weights_h[8] = 0.500000f;
            weights_h[9] = 0.500000f;

            destination_offsets_h[0] = 0;
            destination_offsets_h[1] = 1;
            destination_offsets_h[2] = 3;
            destination_offsets_h[3] = 4;
            destination_offsets_h[4] = 6;
            destination_offsets_h[5] = 8;
            destination_offsets_h[6] = 10;

            source_indices_h[0] = 2;
            source_indices_h[1] = 0;
            source_indices_h[2] = 2;
            source_indices_h[3] = 0;
            source_indices_h[4] = 4;
            source_indices_h[5] = 5;
            source_indices_h[6] = 2;
            source_indices_h[7] = 3;
            source_indices_h[8] = 3;
            source_indices_h[9] = 4;

            bookmark_h[0] = 0.0f;
            bookmark_h[1] = 1.0f;
            bookmark_h[2] = 0.0f;
            bookmark_h[3] = 0.0f;
            bookmark_h[4] = 0.0f;
            bookmark_h[5] = 0.0f;

            // Starting nvgraph
            handle = new GraphContext();
            graph  = handle.CreateGraphDecriptor();

            GCHandle destination_offsets_handle = GCHandle.Alloc(destination_offsets_h, GCHandleType.Pinned);
            GCHandle source_indices_handle      = GCHandle.Alloc(source_indices_h, GCHandleType.Pinned);

            CSC_input.nvertices           = n;
            CSC_input.nedges              = nnz;
            CSC_input.destination_offsets = destination_offsets_handle.AddrOfPinnedObject();
            CSC_input.source_indices      = source_indices_handle.AddrOfPinnedObject();

            // Set graph connectivity and properties (tranfers)
            graph.SetGraphStructure(CSC_input);
            graph.AllocateVertexData(vertex_dimT);
            graph.AllocateEdgeData(edge_dimT);

            graph.SetVertexData(bookmark_h, 0);
            graph.SetVertexData(pr_1, 1);
            graph.SetVertexData(pr_2, 2);

            graph.SetEdgeData(weights_h, 0);

            // First run with default values
            graph.Pagerank(0, alpha1, 0, 0, 1, 0.0f, 0);

            // Get and print result
            graph.GetVertexData(pr_1, 1);
            Console.WriteLine("pr_1, alpha = 0.85");
            for (i = 0; i < n; i++)
            {
                Console.WriteLine(pr_1[i]);
            }
            Console.WriteLine();

            // Second run with different damping factor and an initial guess
            for (i = 0; i < n; i++)
            {
                pr_2[i] = pr_1[i];
            }

            graph.SetVertexData(pr_2, 2);
            graph.Pagerank(0, alpha2, 0, 1, 2, 0.0f, 0);

            // Get and print result
            graph.GetVertexData(pr_2, 2);
            Console.WriteLine("pr_2, alpha = 0.90");
            for (i = 0; i < n; i++)
            {
                Console.WriteLine(pr_2[i]);
            }
            Console.WriteLine();

            //Clean
            graph.Dispose();
            handle.Dispose();


            Console.WriteLine("\nDone!");
        }
예제 #18
0
        static void Main(string[] args)
        {
            int   cuda_device = 0;
            int   nstreams = 4;                           // number of streams for CUDA calls
            int   nreps = 10;                             // number of times each experiment is repeated
            int   n = 16 * 1024 * 1024;                   // number of ints in the data set
            int   nbytes = n * sizeof(int);               // number of data bytes
            dim3  threads, blocks;                        // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel; // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool       bPinGenericMemory  = false;                   // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;                                         // number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int  temp  = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                        case 0:
                            device_sync_method = CUCtxFlags.SchedAuto;
                            break;

                        case 1:
                            device_sync_method = CUCtxFlags.SchedSpin;
                            break;

                        case 2:
                            device_sync_method = CUCtxFlags.SchedYield;
                            break;

                        case 4:
                            device_sync_method = CUCtxFlags.BlockingSync;
                            break;

                        default:
                            error = true;
                            break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();

            if (0 == num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);

            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1;                     // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n            = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                          deviceProp.MultiProcessorCount,
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;

            if (bPinGenericMemory)
            {
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            }
            else
            {
                ctx = new CudaContext(cuda_device, device_sync_method);
            }

            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "simpleStreams_x64.ptx";
            }
            else
            {
                resName = "simpleStreams.ptx";
            }

            string resNamespace = "simpleStreams";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");


            // allocate host memory
            int c = 5;                                                          // value to which the array will be initialized

            int[] h_a = null;                                                   // pointer to the array data in host memory
            CudaPageLockedHostMemory <int> hAligned_a = null;                   // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable <int> d_c = c;            //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent    start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event  = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();                 // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();               // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions  = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);


            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes];             // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0);             // set device memory to all 0s, for testing correctness

            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for (int i = 0; i < nstreams; i++)
                {
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
                }

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                {
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
                }
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c * nreps * niterations);

            // release resources
            for (int i = 0; i < nstreams; i++)
            {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            Console.ReadKey();
            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
예제 #19
0
        public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return(result);
            }

            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            long[] requiredOutputSize = (long[])src.Sizes.Clone();
            requiredOutputSize[dim] = 1;
            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            long inElements      = src.ElementCount();
            long reductionSize   = src.Sizes[dim];
            long reductionStride = src.Strides[dim];
            long outElements     = inElements / reductionSize;
            bool contigReduction = reductionStride == 1;


            // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1
            // This includes for the purposes of determining which tensor specializations to use (changing
            // the dimension size to 1 may make the tensor non-contiguous
            long[] newSizes = (long[])src.Sizes.Clone();
            newSizes[dim] = 1;
            Tensor srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

            ApplySpecialization config  = new ApplySpecialization(writeTarget, srcSlim);
            object totalSlices          = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
            object initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            byte[] ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                dim3 block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                dim3 grid     = GetContigReduceGrid(outElements);
                uint smemSize = (uint)src.ElementType.Size() * block.x;

                string fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                CudaDeviceProperties deviceProps = context.DeviceInfoForContext(cudaContext);
                dim3 block    = GetNonContigReduceBlock(deviceProps);
                dim3 grid     = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize = 0;

                string fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return(writeTarget);
        }
예제 #20
0
 private static int GetNonContigReduceBlockSize(CudaDeviceProperties deviceProps)
 {
     return(Math.Min(DeviceCode.Headers.Reduce.NonContigReduceBlockSize, (int)deviceProps.MaxBlockDim.x));
 }
예제 #21
0
 private static extern unsafe int _GetDeviceProperties(ref CudaDeviceProperties props, int dev);
예제 #22
0
 private static dim3 GetNoncontigReduceGrid(CudaDeviceProperties deviceProps, long elements)
 {
     // One output point per thread
     return(GridFromTiles(ApplyUtils.CeilDiv(elements, GetNonContigReduceBlockSize(deviceProps))));
 }
예제 #23
0
 private static dim3 GetNonContigReduceBlock(CudaDeviceProperties deviceProps)
 {
     return(new(GetNonContigReduceBlockSize(deviceProps)));
 }