public MyFourierBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
            : base(owner, inputSize, tempBlock)
        {
            m_stream = new CudaStream();

            m_fft = new CudaFFTPlan1D(inputSize, cufftType.R2C, 1);
            m_fft.SetStream(m_stream.Stream);
            m_ifft = new CudaFFTPlan1D(inputSize, cufftType.C2R, 1);
            m_ifft.SetStream(m_stream.Stream);

            m_mulkernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "MulComplexElementWise");
            m_mulkernel.SetupExecution(inputSize + 1);

            m_involutionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "InvolveVector");
            m_involutionKernel.SetupExecution(inputSize - 1);

            m_inversionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\InvertValuesKernel", "InvertLengthComplexKernel");
            m_inversionKernel.SetupExecution(inputSize);

            m_dotKernel = MyKernelFactory.Instance.KernelProduct<float>(owner, owner.GPU, ProductMode.f_DotProduct_f);

            m_normalKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\TransformKernels", "PolynomialFunctionKernel");
            m_normalKernel.SetupExecution(inputSize);

            m_firstFFTOffset = 0;
            m_secondFFTOffset = (inputSize + 1) * 2;
            m_tempOffset = (inputSize + 1) * 4;

            Denominator = inputSize;
        }
        public MyPermutationBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
            : base(owner, inputSize, tempBlock)
        {
            m_stream = new CudaStream();

            m_binaryPermKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "CombineTwoVectorsKernel");
            m_binaryPermKernel.SetupExecution(inputSize);
        }
Beispiel #3
0
 /// <summary>
 /// Launches an executable graph in a stream.<para/>
 /// Only one instance of GraphExec may be executing
 /// at a time. Each launch is ordered behind both any previous work in Stream
 /// and any previous launches of GraphExec.To execute a graph concurrently, it must be
 /// instantiated multiple times into multiple executable graphs.
 /// </summary>
 /// <param name="stream"></param>
 public void Launch(CudaStream stream)
 {
     res = DriverAPINativeMethods.GraphManagment.cuGraphLaunch(_graph, stream.Stream);
     Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuGraphLaunch", res));
     if (res != CUResult.Success)
     {
         throw new CudaException(res);
     }
 }
        private CudnnContext( CudnnHandle handle, CudaStream stream )
        {
            if (handle.Pointer == IntPtr.Zero)
                throw new ArgumentException("handle");

            Contract.EndContractBlock();

            this.handle = handle;
            this.stream = stream;
        }
Beispiel #5
0
        /// <summary>
        /// Copies attributes from source stream to destination stream<para/>
        /// Copies attributes from source stream \p src to destination stream \p dst.<para/>
        /// Both streams must have the same context.
        /// </summary>
        /// <param name="dst">Destination stream</param>
        public void CopyAttributes(CudaStream dst)
        {
            CUResult res = DriverAPINativeMethods.Streams.cuStreamCopyAttributes(dst.Stream, _stream);

            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuStreamCopyAttributes", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        public static CudnnContext Create( CudaStream stream = null )
        {
            CudnnHandle handle = default(CudnnHandle);
            Invoke(() => CudnnNativeMethods.cudnnCreate(out handle));
            Contract.Assume(handle.Pointer != IntPtr.Zero);

            if (stream != null)
            {
                Invoke(() => CudnnNativeMethods.cudnnSetStream(handle, stream.Stream));
            }
            return new CudnnContext(handle, stream);
        }
Beispiel #7
0
        /// <summary>
        /// Allocates memory from a specified pool with stream ordered semantics.<para/>
        /// Inserts an allocation operation into \p hStream.<para/>
        /// A pointer to the allocated memory is returned immediately in *dptr.<para/>
        /// The allocation must not be accessed until the the allocation operation completes.<para/>
        /// The allocation comes from the specified memory pool.<para/>
        /// note<para/>
        /// -  The specified memory pool may be from a device different than that of the specified \p hStream.<para/>
        /// -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
        /// Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
        /// operation completes before work submitted in a separate stream runs.
        /// </summary>
        /// <param name="bytesize">Number of bytes to allocate</param>
        /// <param name="hStream">The stream establishing the stream ordering semantic</param>
        public CudaDeviceVariable <T> MemAllocFromPoolAsync <T>(SizeT bytesize, CudaStream hStream) where T : struct
        {
            CUdeviceptr devPtr = new CUdeviceptr();

            res = DriverAPINativeMethods.MemoryManagement.cuMemAllocFromPoolAsync(ref devPtr, bytesize, _memoryPool, hStream.Stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemAllocFromPoolAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            return(new CudaDeviceVariable <T>(devPtr, false, bytesize));
        }
        public void ContextWithStream()
        {
            using ( var cuda = new CudaContext() )
            using ( var stream = new CudaStream(CUStreamFlags.Default) )
            {
                using (var context = CudnnContext.Create(stream))
                {
                    Assert.True(context.IsInitialized);

                    var streamId = default (CUstream);
                    CudnnContext.Invoke(() => CudnnNativeMethods.cudnnGetStream(context.Handle, out streamId));

                    Assert.Equal(stream.Stream, streamId);
                }
            }
        }
Beispiel #9
0
        static void Main(string[] args)
        {
            int cuda_device = 0;
            int nstreams = 4;               // number of streams for CUDA calls
            int nreps = 10;                 // number of times each experiment is repeated
            int n = 16 * 1024 * 1024;       // number of ints in the data set
            int nbytes = n * sizeof(int);   // number of data bytes
            dim3 threads, blocks;           // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel;   // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool bPinGenericMemory = false; // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;	// number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int temp = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                            case 0:
                                device_sync_method = CUCtxFlags.SchedAuto;
                                break;
                            case 1:
                                device_sync_method = CUCtxFlags.SchedSpin;
                                break;
                            case 2:
                                device_sync_method = CUCtxFlags.SchedYield;
                                break;
                            case 4:
                                device_sync_method = CUCtxFlags.BlockingSync;
                                break;
                            default:
                                error = true;
                                break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();
            if(0==num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1; // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                    deviceProp.MultiProcessorCount,
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;
            if (bPinGenericMemory)
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            else
                ctx = new CudaContext(cuda_device, device_sync_method);

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "simpleStreams_x64.ptx";
            else
                resName = "simpleStreams.ptx";

            string resNamespace = "simpleStreams";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

            // allocate host memory
            int c = 5;											// value to which the array will be initialized
            int[] h_a = null;									// pointer to the array data in host memory
            CudaPageLockedHostMemory<int> hAligned_a = null;	// pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();     // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();   // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0); // set device memory to all 0s, for testing correctness

            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for(int i = 0; i < nstreams; i++)
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c*nreps*niterations);

            // release resources
            for(int i = 0; i < nstreams; i++) {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }