示例#1
0
        /// <summary>
        /// 
        /// </summary>
        public CudaStopWatch(CUEventFlags flags)
        {
			_start = new CudaEvent(flags);
			_stop = new CudaEvent(flags);
            _stream = new CUstream();

		}
示例#2
0
		/// <summary>
		/// 
		/// </summary>
		public CudaStopWatch(CUstream stream)
		{
			_start = new CudaEvent();
			_stop = new CudaEvent();
			_stream = stream;

		}
示例#3
0
        /// <summary>
        /// 
        /// </summary>
        public CudaStopWatch()
        {
            _start = new CudaEvent();
			_stop = new CudaEvent();
            _stream = new CUstream();

        }
示例#4
0
 /// <summary>
 /// Get elapsed time in milliseconds, no sync on stop event
 /// </summary>
 /// <returns>Elapsed time in ms</returns>
 public float GetElapsedTimeNoSync()
 {
     if (disposed)
     {
         throw new ObjectDisposedException(this.ToString());
     }
     return(CudaEvent.ElapsedTime(_start, _stop));
 }
示例#5
0
 /// <summary>
 /// Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds). If
 /// either event has not been recorded yet, this function throws <see cref="CUResult.ErrorNotReady"/>. If either event has been
 /// recorded with a non-zero stream, the result is undefined.
 /// </summary>
 /// <param name="eventStart"></param>
 /// <param name="eventEnd"></param>
 /// <returns></returns>
 public static float ElapsedTime(CudaEvent eventStart, CudaEvent eventEnd)
 {
     float time = 0;
     CUResult res = DriverAPINativeMethods.Events.cuEventElapsedTime(ref time, eventStart.Event, eventEnd.Event);
     Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuEventElapsedTime", res));
     if (res != CUResult.Success) throw new CudaException(res);
     return time;
 }
示例#6
0
 /// <summary>
 /// Sets the event for an event record node in the given graphExec
 /// Sets the event of an event record node in an executable graph \p hGraphExec.
 /// The node is identified by the corresponding node \p hNode in the
 /// non-executable graph, from which the executable graph was instantiated.
 /// The modifications only affect future launches of \p hGraphExec. Already
 /// enqueued or running launches of \p hGraphExec are not affected by this call.
 /// \p hNode is also not modified by this call.
 /// </summary>
 public void SetWaitEvent(CUgraphNode hNode, CudaEvent event_)
 {
     res = DriverAPINativeMethods.GraphManagment.cuGraphExecEventWaitNodeSetEvent(_graph, hNode, event_.Event);
     Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuGraphExecEventWaitNodeSetEvent", res));
     if (res != CUResult.Success)
     {
         throw new CudaException(res);
     }
 }
示例#7
0
        /// <summary>
        /// Creates an event wait node and adds it to a graph
        /// Creates a new event wait node and adds it to \p hGraph with \p numDependencies
        /// dependencies specified via \p dependencies and arguments specified in \p params.
        /// It is possible for \p numDependencies to be 0, in which case the node will be placed
        /// at the root of the graph. \p dependencies may not have any duplicate entries.
        /// A handle to the new node will be returned in \p phGraphNode.
        /// The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
        /// for details on what is captured by an event. \p event may be from a different context
        /// or device than the launch stream.
        /// </summary>
        /// <param name="dependencies">Dependencies of the node</param>
        /// <param name="event_">Event for the node</param>
        /// <returns>Returns newly created node</returns>
        public CUgraphNode AddEventWaitNode(CUgraphNode[] dependencies, CudaEvent event_)
        {
            CUgraphNode node            = new CUgraphNode();
            SizeT       numDependencies = 0;

            if (dependencies != null)
            {
                numDependencies = dependencies.Length;
            }

            res = DriverAPINativeMethods.GraphManagment.cuGraphAddEventWaitNode(ref node, _graph, dependencies, numDependencies, event_.Event);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuGraphAddEventWaitNode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return(node);
        }
示例#8
0
		/// <summary>
		/// 
		/// </summary>
		public CudaStopWatch(CUEventFlags flags, CUstream stream)
		{
			_start = new CudaEvent(flags);
			_stop = new CudaEvent(flags);
			_stream = stream;
		}       
示例#9
0
 /// <summary>
 ///
 /// </summary>
 public CudaStopWatch(CUEventFlags flags, CUstream stream)
 {
     _start  = new CudaEvent(flags);
     _stop   = new CudaEvent(flags);
     _stream = stream;
 }
示例#10
0
 /// <summary>
 ///
 /// </summary>
 public CudaStopWatch(CUstream stream)
 {
     _start  = new CudaEvent();
     _stop   = new CudaEvent();
     _stream = stream;
 }
示例#11
0
 /// <summary>
 ///
 /// </summary>
 public CudaStopWatch(CUEventFlags flags)
 {
     _start  = new CudaEvent(flags);
     _stop   = new CudaEvent(flags);
     _stream = new CUstream();
 }
示例#12
0
 /// <summary>
 ///
 /// </summary>
 public CudaStopWatch()
 {
     _start  = new CudaEvent();
     _stop   = new CudaEvent();
     _stream = new CUstream();
 }
示例#13
0
 /// <summary>
 /// Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds). If
 /// either event has not been recorded yet, this function throws <see cref="CUResult.ErrorNotReady"/>. If either event has been
 /// recorded with a non-zero stream, the result is undefined.
 /// </summary>
 /// <param name="eventStart"></param>
 /// <param name="eventEnd"></param>
 /// <returns></returns>
 public static float ElapsedTime(CudaEvent eventStart, CudaEvent eventEnd)
 {
     float time = 0;
     CUResult res = DriverAPINativeMethods.Events.cuEventElapsedTime(ref time, eventStart.Event, eventEnd.Event);
     Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuEventElapsedTime", res));
     if (res != CUResult.Success) throw new CudaException(res);
     return time;
 }
示例#14
0
        static void Main(string[] args)
        {
            int cuda_device = 0;
            int nstreams = 4;               // number of streams for CUDA calls
            int nreps = 10;                 // number of times each experiment is repeated
            int n = 16 * 1024 * 1024;       // number of ints in the data set
            int nbytes = n * sizeof(int);   // number of data bytes
            dim3 threads, blocks;           // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel;   // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool bPinGenericMemory = false; // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;	// number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int temp = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                            case 0:
                                device_sync_method = CUCtxFlags.SchedAuto;
                                break;
                            case 1:
                                device_sync_method = CUCtxFlags.SchedSpin;
                                break;
                            case 2:
                                device_sync_method = CUCtxFlags.SchedYield;
                                break;
                            case 4:
                                device_sync_method = CUCtxFlags.BlockingSync;
                                break;
                            default:
                                error = true;
                                break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();
            if(0==num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1; // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                    deviceProp.MultiProcessorCount,
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;
            if (bPinGenericMemory)
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            else
                ctx = new CudaContext(cuda_device, device_sync_method);

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "simpleStreams_x64.ptx";
            else
                resName = "simpleStreams.ptx";

            string resNamespace = "simpleStreams";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

            // allocate host memory
            int c = 5;											// value to which the array will be initialized
            int[] h_a = null;									// pointer to the array data in host memory
            CudaPageLockedHostMemory<int> hAligned_a = null;	// pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();     // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();   // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0); // set device memory to all 0s, for testing correctness

            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for(int i = 0; i < nstreams; i++)
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c*nreps*niterations);

            // release resources
            for(int i = 0; i < nstreams; i++) {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }