예제 #1
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
        public float ElapsedTime(CUevent start, CUevent end)
        {
            float pMilliseconds = 0f;

            this.LastError = CUDADriver.cuEventElapsedTime(ref pMilliseconds, start, end);
            return(pMilliseconds);
        }
예제 #2
0
 public static void cuEventRecord(CUevent hEvent, CUstream hStream)
 {
     Wrap(() =>
     {
         try
         {
             var error = nativeEventRecord(hEvent, hStream);
             if (error != CUresult.CUDA_SUCCESS)
             {
                 throw new CudaException(error);
             }
         }
         catch (CudaException)
         {
             throw;
         }
         catch (DllNotFoundException dnfe)
         {
             throw new CudaException(CudaError.NoDriver, dnfe);
         }
         catch (Exception e)
         {
             throw new CudaException(CudaError.Unknown, e);
         }
     });
 }
예제 #3
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
        public CUevent CreateEvent(CUEventFlags flags)
        {
            CUevent phEvent = new CUevent();

            this.LastError = CUDADriver.cuEventCreate(ref phEvent, (uint)flags);
            return(phEvent);
        }
예제 #4
0
        public static ElapsedTime cuEventElapsedTime(CUevent hStart, CUevent hEnd)
        {
            return(Wrap(() =>
            {
                try
                {
                    float milliseconds;
                    var error = nativeEventElapsedTime(out milliseconds, hStart, hEnd);
                    if (error != CUresult.CUDA_SUCCESS)
                    {
                        throw new CudaException(error);
                    }

                    // note. cannot use TimeSpan here because it ain't work with fractions of milliseconds
                    return new ElapsedTime(milliseconds);
                }
                catch (CudaException)
                {
                    throw;
                }
                catch (DllNotFoundException dnfe)
                {
                    throw new CudaException(CudaError.NoDriver, dnfe);
                }
                catch (Exception e)
                {
                    throw new CudaException(CudaError.Unknown, e);
                }
            }));
        }
예제 #5
0
        /// <summary>
        /// Creates a new Event
        /// </summary>
        /// <param name="flags">Parameters for event creation</param>
        public CudaEvent(CUEventFlags flags)
        {
            _event = new CUevent();

            res = DriverAPINativeMethods.Events.cuEventCreate(ref _event, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuEventCreate", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
예제 #6
0
 public CudaTest(CUDA cuda)
 {
     _cuda = cuda;
     //+ load frame module
     LoadFrameModule(Cuda.CudaModle.BuiltinModules.Test);
     //+ create cuda event handles
     _cudaStartEvent = _cuda.CreateEvent();
     _cudaStopEvent  = _cuda.CreateEvent();
 }
예제 #7
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
 public void DestroyEvent(CUevent e)
 {
     if (_version >= 4000)
     {
         this.LastError = CUDADriver.cuEventDestroy_v2(e);
     }
     else
     {
         this.LastError = CUDADriver.cuEventDestroy(e);
     }
 }
예제 #8
0
 /// <summary>
 /// Make a compute stream wait on an event<para/>
 /// Makes all future work submitted to the Stream wait until <c>hEvent</c>
 /// reports completion before beginning execution. This synchronization
 /// will be performed efficiently on the device.
 /// <para/>
 /// The stream will wait only for the completion of the most recent
 /// host call to <see cref="CudaEvent.Record()"/> on <c>hEvent</c>. Once this call has returned,
 /// any functions (including <see cref="CudaEvent.Record()"/> and <see cref="Dispose()"/> may be
 /// called on <c>hEvent</c> again, and the subsequent calls will not have any
 /// effect on this stream.
 /// <para/>
 /// If <c>hStream</c> is 0 (the NULL stream) any future work submitted in any stream
 /// will wait for <c>hEvent</c> to complete before beginning execution. This
 /// effectively creates a barrier for all future work submitted to the context.
 /// <para/>
 /// If <see cref="CudaEvent.Record()"/> has not been called on <c>hEvent</c>, this call acts as if
 /// the record has already completed, and so is a functional no-op.
 /// </summary>
 /// <returns></returns>
 public void WaitEvent(CUevent cuevent)
 {
     if (disposed)
     {
         throw new ObjectDisposedException(this.ToString());
     }
     res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(_stream, cuevent, 0);
     Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuStreamWaitEvent", res));
     if (res != CUResult.Success)
     {
         throw new CudaException(res);
     }
 }
예제 #9
0
        public CUDAExecution(GASS.CUDA.CUDA cuda, string module, string function)
        {
            this.parameters   = new List <Parameter>();
            this.textures     = new List <CUtexref>();
            this.CUDAInstance = cuda;
            if (!module.EndsWith("cubin"))
            {
                module = module + ".cubin";
            }
            FileInfo info = new FileInfo(module);

            this.Module       = info.FullName;
            this.Function     = function;
            this.CUDAModule   = cuda.LoadModule(this.module);
            this.CUDAFunction = cuda.GetModuleFunction(this.function);
            this.start        = cuda.CreateEvent();
            this.stop         = cuda.CreateEvent();
        }
예제 #10
0
 // http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g7895332c94680b174ef41373af09d9ce.html
 private static extern CUresult nativeEventElapsedTime(out float pMilliseconds, CUevent hStart, CUevent hEnd);
예제 #11
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
 public void RecordEvent(CUevent e)
 {
     this.RecordEvent(e, new CUstream());
 }
예제 #12
0
파일: Program.cs 프로젝트: rblenis/cudafy
        static void Main(string[] args)
        {
            // Init and select 1st device.
            CUDA cuda = new CUDA(0, true);

            // load module
            //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin"));
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx"));
            CUfunction transpose       = cuda.GetModuleFunction("transpose");
            CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

            const int size_x   = 4096;
            const int size_y   = 4096;
            const int mem_size = sizeof(float) * size_x * size_y;

            float[] h_idata = new float[size_x * size_y];
            for (int i = 0; i < h_idata.Length; i++)
            {
                h_idata[i] = (float)i;
            }

            // allocate device memory
            // copy host memory to device
            CUdeviceptr d_idata = cuda.CopyHostToDevice <float>(h_idata);
            CUdeviceptr d_odata = cuda.Allocate <float>(h_idata);

            // setup execution parameters
            cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8));

            cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1);
            cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer);
            cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x);
            cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y);
            cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8));

            // warmup so we don't time CUDA startup
            cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            //System.Threading.Thread.Sleep(10);
            int numIterations = 100;

            Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);
            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Naive transpose average time:     {0} ms\n", naiveTime / numIterations);

            cuda.RecordEvent(start);
            for (int i = 0; i < numIterations; i++)
            {
                cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
            }
            cuda.SynchronizeContext();
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();
            float optimizedTime = cuda.ElapsedTime(start, end);


            Console.WriteLine("Optimized transpose average time:     {0} ms\n", optimizedTime / numIterations);

            float[] h_odata = new float[size_x * size_y];
            cuda.CopyDeviceToHost <float>(d_odata, h_odata);

            float[] reference = new float[size_x * size_y];
            computeGold(reference, h_idata, size_x, size_y);

            bool res = CompareF(reference, h_odata, size_x * size_y);

            Console.WriteLine("Test {0}", res == true? "PASSED":"FAILED");

            cuda.Free(d_idata);
            cuda.Free(d_odata);

            Console.ReadKey();
        }
예제 #13
0
        /// <summary>
        /// implementation of sparese matrix product
        /// </summary>
        /// <param name="repetition">how many times kernel should be launch</param>
        /// <param name="moduleFunction">cuda kenrel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicate that grid dimensions should be
        /// computed alternativly, if false than gridDimY- connected with rows
        /// else gridDim.Y conected with cols</param>
        /// <returns></returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction,
                                          int blockSizeX, int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;
            int   maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            CUdeviceptr BValsPtr   = cuda.CopyHostToDevice(BVals);
            CUdeviceptr BIdxPtr    = cuda.CopyHostToDevice(BIdx);
            CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);


            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters


            int Aelements = AVals.Length;
            int Belements = BVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            cuda.SetParameter(cuFunc, offset, (uint)Aelements);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Belements);
            offset += sizeof(int);


            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            //cuda.SetTextureFlags(cuTexRef, 0);

            int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX));
            int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY);
            if (transposeGrid)
            {
                gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
            }

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);


            for (int k = 0; k < repetition; k++)
            {
                cuda.Launch(cuFunc, gridDimX, gridDimY);

                cuda.SynchronizeContext();
                //  cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, outputSize);
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);

            cuda.Free(BValsPtr);
            cuda.Free(BIdxPtr);
            cuda.Free(BLenghtPtr);

            cuda.Free(dOutput);

            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return(output);
        }
예제 #14
0
 /// <summary>
 /// Creates a new Event using <see cref="CUEventFlags.Default"/> 
 /// </summary>
 internal CudaEvent(CUevent event_)
 {
     _event = event_;
 }
예제 #15
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
 public void RecordEvent(CUevent e, CUstream stream)
 {
     this.LastError = CUDADriver.cuEventRecord(e, stream);
 }
예제 #16
0
        public static float[] CRSSparseMMwithDenseVector(int repetition,
                                                         string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module

            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;

            int maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];

            //allocate memory for output
            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);

            //create dense vector for each column in B matrix
            float[] mainVec = new float[maxIndex + 1];

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 = cuda.CreateStream();


            IntPtr[] mainVecIntPtrs = new IntPtr[2];

            //write combined memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //
            //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //mapped memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

            //get texture reference
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");

            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();


            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
            int gridDim  = (Rows + blockSizeX - 1) / blockSizeX;



            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {
                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);

                    //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                    ////make asynchronius copy and kernel lauch
                    //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                    //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.Free(dOutput);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);


            return(output);
        }
예제 #17
0
 // http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g349006734f6e7378ea36cb57c239d4c7.html
 private static extern CUresult nativeEventDestroy(CUevent hEvent);
예제 #18
0
 //http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g93468fbdae4190b79926381a90a94301.html
 private static extern CUresult nativeEventRecord(CUevent hEvent, CUstream hStream);
예제 #19
0
        static void Main(string[] args)
        {
            // Create a new instance of CUDA class, select 1st device.
            CUDA cuda = new CUDA(0, true);

            // Prepare parameters.
            int  n      = 16 * 1024 * 1024;
            uint nbytes = (uint)(n * sizeof(int));
            int  value  = 26;

            // allocate host memory
            int[] a = new int[n];

            // allocate device memory
            CUdeviceptr d_a = cuda.Allocate <int>(a);

            CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
            CUfunction func = cuda.GetModuleFunction("increment_kernel");

            // set kernel launch configuration
            cuda.SetFunctionBlockShape(func, 512, 1, 1);

            // create cuda event handles
            CUevent start = cuda.CreateEvent();
            CUevent stop  = cuda.CreateEvent();

            // asynchronously issue work to the GPU (all to stream 0)
            CUstream stream = new CUstream();

            cuda.RecordEvent(start);
            cuda.CopyHostToDeviceAsync <int>(d_a, a, stream);

            // set parameters for kernel function
            cuda.SetParameter(func, 0, (uint)d_a.Pointer);
            cuda.SetParameter(func, IntPtr.Size, (uint)value);

            cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

            // actually launch kernel
            cuda.LaunchAsync(func, n / 512, 1, stream);

            // wait for every thing to finish, then start copy back data
            cuda.CopyDeviceToHostAsync <int>(d_a, a, stream);

            cuda.RecordEvent(stop);

            // print the cpu and gpu times
            Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

            // check the output for correctness
            if (CorrectOutput(a, value))
            {
                Console.WriteLine("Test PASSED");
            }
            else
            {
                Console.WriteLine("Test FAILED");
            }

            // release resources
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(stop);
            cuda.Free(d_a);
        }
예제 #20
0
파일: CUDA.cs 프로젝트: rblenis/cudafy
 public void SynchronizeEvent(CUevent e)
 {
     this.LastError = CUDADriver.cuEventSynchronize(e);
 }
예제 #21
0
 // http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g433317083f929b9298f8a88d57aa5017.html
 private static extern CUresult nativeEventCreate(out CUevent phEvent, CUevent_flags Flags);
예제 #22
0
 // http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_ge3ed6a308c602d139373895cb99cb7ab.html
 private static extern CUresult nativeEventSynchronize(CUevent hEvent);
예제 #23
0
 public static void cuEventRecord(CUevent hEvent)
 {
     cuEventRecord(hEvent, CUstream.Null);
 }