Exemplo n.º 1
0
    public void GenerateRandomNumbers()
    {
        CleanupResources();

        //Init Cuda context
        ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

        // Allocate input vectors h_A and h_B in host memory
        h_A = new float[Count];
        h_B = new float[Count];

        // Initialize input vectors
        RandomInit(h_A, Count);
        RandomInit(h_B, Count);

        // Allocate vectors in device memory and copy vectors from host memory to device memory
        // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
        d_A = h_A;
        d_B = h_B;
        //d_C = new CudaDeviceVariable<float>(Count);

        // Allocate Shared Memory. The GPU will write here
        // A = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
        // B = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
        C = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
    }
Exemplo n.º 2
0
        // Testing getting device information via managedCuda
        private static void GetInformationAboutDevice()
        {
            // Number of devices
            var deviceCount = CudaContext.GetDeviceCount();

            Console.WriteLine(deviceCount + " Devices");

            if (deviceCount <= 0)
            {
                throw new Exception("No cuda device detected");
            }

            // Pick device based on performance.
            var deviceByFlops = CudaContext.GetMaxGflopsDeviceId();

            Console.WriteLine("Unit {0} has the most Gflops", deviceByFlops);

            var deviceProperties = CudaContext.GetDeviceInfo(deviceByFlops);

            Console.WriteLine("And has the following properties: ");
            Console.WriteLine(deviceProperties.DeviceName);
            Console.WriteLine("Can execute concurrent kernels: " + deviceProperties.ConcurrentKernels);
            Console.WriteLine("Multi processor count: " + deviceProperties.MultiProcessorCount);
            Console.WriteLine("Clockrate (mhz): " + (int)deviceProperties.ClockRate / 1000.0);
            Console.WriteLine("Total global memory (MB): " + deviceProperties.TotalGlobalMemory / 1000000);
            Console.WriteLine("Is integrated: " + deviceProperties.Integrated);
            Console.WriteLine("Max block dimension: " + deviceProperties.MaxGridDim);
            Console.WriteLine("Max block dimension: " + deviceProperties.MaxBlockDim);
            Console.WriteLine("Max threads per block: " + deviceProperties.MaxThreadsPerBlock);
            Console.WriteLine("Max threads per multiprocessor: " + deviceProperties.MaxThreadsPerMultiProcessor);
            Console.WriteLine("Max shared mem block can use (b): " + deviceProperties.SharedMemoryPerBlock);
            Console.WriteLine("If device can do mem copy and kernel execution: " + deviceProperties.GpuOverlap);
            Console.WriteLine("can map memory adress space on host and device: " + deviceProperties.CanMapHostMemory);
        }
Exemplo n.º 3
0
        public GrabCutGMM()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);


            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "GrabCutGMM_x64.ptx";
            }
            else
            {
                resName = "GrabCutGMM.ptx";
            }

            string resNamespace = "GrabCutNPP";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }
            byte[] kernel = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;

            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernel, (int)stream.Position, bytesToRead);
            }

            CUmodule module = ctx.LoadModulePTX(kernel);

            GMMReductionKernelCreateGmmFlags   = new CudaKernel("_Z18GMMReductionKernelILi4ELb1EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMReductionKernelNoCreateGmmFlags = new CudaKernel("_Z18GMMReductionKernelILi4ELb0EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMFinalizeKernelInvertSigma       = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb1EEvPfS0_ii", module, ctx);
            GMMFinalizeKernelNoInvertSigma     = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb0EEvPfS0_ii", module, ctx);
            GMMcommonTerm   = new CudaKernel("_Z13GMMcommonTermiPfi", module, ctx);
            DataTermKernel  = new CudaKernel("_Z14DataTermKernelPiiiPKfiPK6uchar4iPKhiii", module, ctx);
            GMMAssignKernel = new CudaKernel("_Z15GMMAssignKerneliPKfiPK6uchar4iPhiii", module, ctx);
            GMMFindSplit    = new CudaKernel("_Z12GMMFindSplitP10GMMSplit_tiPfi", module, ctx);
            GMMDoSplit      = new CudaKernel("_Z10GMMDoSplitPK10GMMSplit_tiPfiPK6uchar4iPhiii", module, ctx);
            MeanEdgeStrengthReductionKernel = new CudaKernel("_Z31MeanEdgeStrengthReductionKerneliiPf", module, ctx);
            MeanEdgeStrengthFinalKernel     = new CudaKernel("_Z27MeanEdgeStrengthFinalKernelPfi", module, ctx);
            EdgeCuesKernel            = new CudaKernel("_Z14EdgeCuesKernelfPKfPiS1_S1_S1_S1_S1_S1_S1_iiii", module, ctx);
            SegmentationChangedKernel = new CudaKernel("_Z25SegmentationChangedKernelPiPhS0_iii", module, ctx);
            downscaleKernel1          = new CudaKernel("_Z18downscaleKernelBoxI6uchar4EvPT_iiiPKS1_iii", module, ctx);
            downscaleKernel2          = new CudaKernel("_Z18downscaleKernelMaxIhEvPT_iiiPKS0_iii", module, ctx);
            upsampleAlphaKernel       = new CudaKernel("_Z19upsampleAlphaKernelPhS_iiii", module, ctx);

            GMMFinalizeKernelInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelInvertSigma.SetConstantVariable("inv_indices", inv_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("inv_indices", inv_indices);
        }
        //static float3[] h_A;
        //static float3[] h_C;
        //static CudaDeviceVariable<float3> d_A;
        //static CudaDeviceVariable<float3> d_C;
        public CalculateHeatmap()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            dev = ctx.GetDeviceInfo();
            Console.WriteLine("Using CUDA Device {0} compute level {1} timeout {2}", dev.DeviceName, dev.ComputeCapability, dev.KernelExecTimeoutEnabled ? "enabled" : "disabled");
            string resName;

            resName = @"C:\WEDEV\GpuImplementations\GpuInterpolation\RasterInterpolation_x64.ptx";
            Console.WriteLine("Loading Interpolation Kernel");
            InterpolateKernel = ctx.LoadKernelPTX(resName, "RasterInterpolate");
        }
Exemplo n.º 5
0
        // Initialization code to find the best CUDA Device
        static int findCudaDevice(string[] args)
        {
            int devID = 0;
            // If the command-line has a device number specified, use it
            bool found = false;

            foreach (var item in args)
            {
                if (item.Contains("device="))
                {
                    found = true;
                    if (!int.TryParse(item, out devID))
                    {
                        Console.WriteLine("Invalid command line parameters");
                        Environment.Exit(-1);
                    }
                    if (devID < 0)
                    {
                        Console.WriteLine("Invalid command line parameters\n");
                        Environment.Exit(-1);
                    }
                    else
                    {
                        devID = gpuDeviceInit(devID);
                        if (devID < 0)
                        {
                            Console.WriteLine("exiting...\n");
                            ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                            Environment.Exit(-1);
                        }
                    }
                }
            }

            if (!found)
            {
                // Otherwise pick the device with highest Gflops/s
                devID = CudaContext.GetMaxGflopsDeviceId();
                ctx   = new CudaContext(devID, CUCtxFlags.SchedAuto);
                Console.Write("> Using CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName());
            }
            return(devID);
        }
Exemplo n.º 6
0
        public GrabCutUtils()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);


            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "GrabCutUtils_x64.ptx";
            }
            else
            {
                resName = "GrabCutUtils.ptx";
            }

            string resNamespace = "GrabCutNPP";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }
            byte[] kernel = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;

            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernel, (int)stream.Position, bytesToRead);
            }

            TrimapFromRectKernel   = ctx.LoadKernelPTX(kernel, "_Z20TrimapFromRectKernelPhi8NppiRectii");
            ApplyMatteKernelMode0  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi0EEvP6uchar4iPKS0_iPKhiii");
            ApplyMatteKernelMode1  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi1EEvP6uchar4iPKS0_iPKhiii");
            ApplyMatteKernelMode2  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi2EEvP6uchar4iPKS0_iPKhiii");
            convertRGBToRGBAKernel = ctx.LoadKernelPTX(kernel, "_Z22convertRGBToRGBAKernelP6uchar4iP6uchar3iii");
        }
Exemplo n.º 7
0
        public BarycentricCuda(int width, int height)
        {
            Width  = width;
            Height = height;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName = "bary.ptx";

            string resNamespace = "BarycentricCudaLib";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            baryKernel = ctx.LoadKernelPTX(stream, "baryKernel");
        }
Exemplo n.º 8
0
        static void Main(string[] args)
        {
            int   cuda_device = 0;
            int   nstreams = 4;                           // number of streams for CUDA calls
            int   nreps = 10;                             // number of times each experiment is repeated
            int   n = 16 * 1024 * 1024;                   // number of ints in the data set
            int   nbytes = n * sizeof(int);               // number of data bytes
            dim3  threads, blocks;                        // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel; // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool       bPinGenericMemory  = false;                   // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;                                         // number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int  temp  = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                        case 0:
                            device_sync_method = CUCtxFlags.SchedAuto;
                            break;

                        case 1:
                            device_sync_method = CUCtxFlags.SchedSpin;
                            break;

                        case 2:
                            device_sync_method = CUCtxFlags.SchedYield;
                            break;

                        case 4:
                            device_sync_method = CUCtxFlags.BlockingSync;
                            break;

                        default:
                            error = true;
                            break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();

            if (0 == num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);

            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1;                     // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n            = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                          deviceProp.MultiProcessorCount,
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;

            if (bPinGenericMemory)
            {
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            }
            else
            {
                ctx = new CudaContext(cuda_device, device_sync_method);
            }

            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "simpleStreams_x64.ptx";
            }
            else
            {
                resName = "simpleStreams.ptx";
            }

            string resNamespace = "simpleStreams";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");


            // allocate host memory
            int c = 5;                                                          // value to which the array will be initialized

            int[] h_a = null;                                                   // pointer to the array data in host memory
            CudaPageLockedHostMemory <int> hAligned_a = null;                   // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable <int> d_c = c;            //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent    start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event  = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();                 // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();               // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions  = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);


            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes];             // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0);             // set device memory to all 0s, for testing correctness

            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for (int i = 0; i < nstreams; i++)
                {
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
                }

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                {
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
                }
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c * nreps * niterations);

            // release resources
            for (int i = 0; i < nstreams; i++)
            {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            Console.ReadKey();
            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Exemplo n.º 9
0
        static void Main(string[] args)
        {
            ShrQATest.shrQAStart(args);

            Console.WriteLine("Vector Addition");
            int N = 50000;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "vectorAdd_x64.ptx";
            }
            else
            {
                resName = "vectorAdd.ptx";
            }

            string resNamespace = "vectorAdd";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

            // Allocate input vectors h_A and h_B in host memory
            h_A = new float[N];
            h_B = new float[N];


            // Initialize input vectors
            RandomInit(h_A, N);
            RandomInit(h_B, N);

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            d_A = h_A;
            d_B = h_B;
            d_C = new CudaDeviceVariable <float>(N);

            // Invoke kernel
            int threadsPerBlock = 256;

            vectorAddKernel.BlockDimensions = threadsPerBlock;
            vectorAddKernel.GridDimensions  = (N + threadsPerBlock - 1) / threadsPerBlock;

            vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_C = d_C;

            // Verify result
            int i;

            for (i = 0; i < N; ++i)
            {
                float sum = h_A[i] + h_B[i];
                if (Math.Abs(h_C[i] - sum) > 1e-5)
                {
                    break;
                }
            }

            CleanupResources();

            ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Exemplo n.º 10
0
        public void TestPTX()
        {
            LLVM.InitializeAllTargets();
            LLVM.InitializeAllTargetMCs();
            LLVM.InitializeAllTargetInfos();
            LLVM.InitializeAllAsmPrinters();
            ModuleRef mod = LLVM.ModuleCreateWithName("llvmptx");
            var       pt  = LLVM.PointerType(LLVM.Int64Type(), 1);

            TypeRef[]     param_types = { pt };
            TypeRef       ret_type    = LLVM.FunctionType(LLVM.VoidType(), param_types, false);
            ValueRef      sum         = LLVM.AddFunction(mod, "sum", ret_type);
            BasicBlockRef entry       = LLVM.AppendBasicBlock(sum, "entry");
            BuilderRef    builder     = LLVM.CreateBuilder();

            LLVM.PositionBuilderAtEnd(builder, entry);
            var      v   = LLVM.BuildLoad(builder, LLVM.GetParam(sum, 0), "");
            ValueRef tmp = LLVM.BuildAdd(builder, v, LLVM.ConstInt(LLVM.Int64Type(), 1, false), "tmp");

            LLVM.BuildStore(builder, tmp, LLVM.GetParam(sum, 0));
            LLVM.BuildRetVoid(builder);
            MyString the_error = new MyString();

            LLVM.VerifyModule(mod, VerifierFailureAction.PrintMessageAction, the_error);

            string    triple = "nvptx64-nvidia-cuda";
            TargetRef t2;
            var       b = LLVM.GetTargetFromTriple(triple, out t2, the_error);

            string cpu      = "";
            string features = "";

            TargetMachineRef tmr = LLVM.CreateTargetMachine(t2, triple, cpu, features,
                                                            CodeGenOptLevel.CodeGenLevelDefault,
                                                            RelocMode.RelocDefault,
                                                            CodeModel.CodeModelKernel);
            ContextRef context_ref = LLVM.ContextCreate();
            ValueRef   kernelMd    = LLVM.MDNodeInContext(context_ref, new ValueRef[3]
            {
                sum,
                LLVM.MDStringInContext(context_ref, "kernel", 6),
                LLVM.ConstInt(LLVM.Int32TypeInContext(context_ref), 1, false)
            });

            LLVM.AddNamedMetadataOperand(mod, "nvvm.annotations", kernelMd);
            var y1 = LLVM.TargetMachineEmitToMemoryBuffer(
                tmr,
                mod,
                Swigged.LLVM.CodeGenFileType.AssemblyFile,
                the_error,
                out MemoryBufferRef buffer);
            string ptx = null;

            try
            {
                ptx = LLVM.GetBufferStart(buffer);
                uint length = LLVM.GetBufferSize(buffer);
                // Output the PTX assembly code. We can run this using the CUDA Driver API
                System.Console.WriteLine(ptx);
            }
            finally
            {
                LLVM.DisposeMemoryBuffer(buffer);
            }


            // RUN THE MF.

            Int64[]     h_C             = new Int64[100];
            CudaContext ctx             = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            CudaKernel  kernel          = ctx.LoadKernelPTX(Encoding.ASCII.GetBytes(ptx), "sum");
            var         d_C             = new CudaDeviceVariable <Int64>(100);
            int         N               = 1;
            int         threadsPerBlock = 256;

            kernel.BlockDimensions = threadsPerBlock;
            kernel.GridDimensions  = (N + threadsPerBlock - 1) / threadsPerBlock;
            kernel.Run(d_C.DevicePointer);
            h_C = d_C;
            System.Console.WriteLine("Result " + h_C[0]);
            if (h_C[0] != 1)
            {
                throw new Exception("Failed.");
            }
            LLVM.DumpModule(mod);
            LLVM.DisposeBuilder(builder);
        }
Exemplo n.º 11
0
        public Form1()
        {
            InitializeComponent();

            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
        }
Exemplo n.º 12
0
        // 1.Простой тест
        // 2.Тест с передачей и приемом больших и разных структур
        public string GetSummary()
        {
            string s = "";

            //int deviceID = 0;
            //CudaContext ctx = new CudaContext(deviceID, CUCtxFlags.MapHost | CUCtxFlags.BlockingSync);

            //for default setting with device 0:
            //CudaContext ctx = new CudaContext();

            int deviceCount = CudaContext.GetDeviceCount();

            s += $"deviceCount = {deviceCount}\n";
            int devID = CudaContext.GetMaxGflopsDeviceId();

            s += $"GetMaxGflopsDeviceId = {devID}\n";

            //return s;

            for (int deviceID = 0; deviceID < deviceCount; deviceID++)
            {
                s += $"----- DeviceID = {deviceID} -----\n";
                CudaDeviceProperties props = CudaContext.GetDeviceInfo(deviceID);
                s += $"DeviceName = {props.DeviceName}\n";
                s += $"DriverVersion = {props.DriverVersion.ToString()}\n";
                s += $"CUDA ComputeCapability = {props.ComputeCapability.ToString()}\n";

                s += $"Integrated = {props.Integrated.ToString()}\n";
                s += $"MultiProcessorCount = {props.MultiProcessorCount.ToString()}\n";
                s += $"ClockRate = {(props.ClockRate/1000).ToString()} MHz\n";

                s += $"TotalGlobalMemory = {(props.TotalGlobalMemory / 1000000).ToString()} Mb\n";
                s += $"MemoryClockRate = {(props.MemoryClockRate / 1000).ToString()} MHz\n";
                s += $"GlobalMemoryBusWidth = {props.GlobalMemoryBusWidth.ToString()} bit\n";

                s += $"maxGridSize[3] (MaxGridDim) = ({props.MaxGridDim.x}; {props.MaxGridDim.y}; {props.MaxGridDim.z}) \n";

                s += $"maxThreadsPerBlock = {props.MaxThreadsPerBlock} \n";
                s += $"maxThreadsDim[3] (MaxBlockDim) = ({props.MaxBlockDim.x}; {props.MaxBlockDim.y}; {props.MaxBlockDim.z}) \n";
                s += $"MaxThreadsPerMultiProcessor = {props.MaxThreadsPerMultiProcessor} \n";
            }

            return(s);

            /*
             + name[256] = is an ASCII string identifying the device;
             + uuid is a 16-byte unique identifier.
             + totalGlobalMem = is the total amount of global memory available on the device in bytes;
             + sharedMemPerBlock is the maximum amount of shared memory available to a thread block in bytes;
             + regsPerBlock is the maximum number of 32-bit registers available to a thread block;
             + warpSize is the warp size in threads;
             + memPitch is the maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through cudaMallocPitch();
             + maxThreadsPerBlock = is the maximum number of threads per block;
             + maxThreadsDim[3] = MaxBlockDim = contains the maximum size of each dimension of a block;
             + maxGridSize[3] = MaxGridDim = contains the maximum size of each dimension of a grid;
             + clockRate = is the clock frequency in kilohertz;
             + totalConstMem is the total amount of constant memory available on the device in bytes;
             + major, minor are the major and minor revision numbers defining the device's compute capability;
             + textureAlignment is the alignment requirement; texture base addresses that are aligned to textureAlignment bytes do not need an offset applied to texture fetches;
             + texturePitchAlignment is the pitch alignment requirement for 2D texture references that are bound to pitched memory;
             + deviceOverlap is 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not. Deprecated, use instead asyncEngineCount.
             + multiProcessorCount = is the number of multiprocessors on the device;
             + kernelExecTimeoutEnabled is 1 if there is a run time limit for kernels executed on the device, or 0 if not.
             + integrated = is 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component.
             + canMapHostMemory is 1 if the device can map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not;
             + computeMode is the compute mode that the device is currently in. Available modes are as follows:
             +  cudaComputeModeDefault: Default mode - Device is not restricted and multiple threads can use cudaSetDevice() with this device.
             +  cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will be able to use cudaSetDevice() with this device.
             +  cudaComputeModeProhibited: Compute-prohibited mode - No threads can use cudaSetDevice() with this device.
             +  cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many threads in one process will be able to use cudaSetDevice() with this device.
             +  If cudaSetDevice() is called on an already occupied device with computeMode cudaComputeModeExclusive, cudaErrorDeviceAlreadyInUse will be immediately returned indicating the device cannot be used. When an occupied exclusive mode device is chosen with cudaSetDevice, all subsequent non-device management runtime functions will return cudaErrorDevicesUnavailable.
             + maxTexture1D is the maximum 1D texture size.
             + maxTexture1DMipmap is the maximum 1D mipmapped texture texture size.
             + maxTexture1DLinear is the maximum 1D texture size for textures bound to linear memory.
             + maxTexture2D[2] contains the maximum 2D texture dimensions.
             + maxTexture2DMipmap[2] contains the maximum 2D mipmapped texture dimensions.
             + maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
             + maxTexture2DGather[2] contains the maximum 2D texture dimensions if texture gather operations have to be performed.
             + maxTexture3D[3] contains the maximum 3D texture dimensions.
             + maxTexture3DAlt[3] contains the maximum alternate 3D texture dimensions.
             + maxTextureCubemap is the maximum cubemap texture width or height.
             + maxTexture1DLayered[2] contains the maximum 1D layered texture dimensions.
             + maxTexture2DLayered[3] contains the maximum 2D layered texture dimensions.
             + maxTextureCubemapLayered[2] contains the maximum cubemap layered texture dimensions.
             + maxSurface1D is the maximum 1D surface size.
             + maxSurface2D[2] contains the maximum 2D surface dimensions.
             + maxSurface3D[3] contains the maximum 3D surface dimensions.
             + maxSurface1DLayered[2] contains the maximum 1D layered surface dimensions.
             + maxSurface2DLayered[3] contains the maximum 2D layered surface dimensions.
             + maxSurfaceCubemap is the maximum cubemap surface width or height.
             + maxSurfaceCubemapLayered[2] contains the maximum cubemap layered surface dimensions.
             + surfaceAlignment specifies the alignment requirements for surfaces.
             + concurrentKernels is 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not. It is not guaranteed that multiple kernels will be resident on the device concurrently so this feature should not be relied upon for correctness;
             + ECCEnabled is 1 if the device has ECC support turned on, or 0 if not.
             + pciBusID is the PCI bus identifier of the device.
             + pciDeviceID is the PCI device (sometimes called slot) identifier of the device.
             + pciDomainID is the PCI domain identifier of the device.
             + tccDriver is 1 if the device is using a TCC driver or 0 if not.
             + asyncEngineCount is 1 when the device can concurrently copy memory between host and device while executing a kernel. It is 2 when the device can concurrently copy memory between host and device in both directions and execute a kernel at the same time. It is 0 if neither of these is supported.
             + unifiedAddressing is 1 if the device shares a unified address space with the host and 0 otherwise.
             + memoryClockRate is the peak memory clock frequency in kilohertz.
             + memoryBusWidth is the memory bus width in bits.
             + l2CacheSize is L2 cache size in bytes.
             + maxThreadsPerMultiProcessor is the number of maximum resident threads per multiprocessor.
             + streamPrioritiesSupported is 1 if the device supports stream priorities, or 0 if it is not supported.
             + globalL1CacheSupported is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
             + localL1CacheSupported is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
             + sharedMemPerMultiprocessor is the maximum amount of shared memory available to a multiprocessor in bytes; this amount is shared by all thread blocks simultaneously resident on a multiprocessor;
             + regsPerMultiprocessor is the maximum number of 32-bit registers available to a multiprocessor; this number is shared by all thread blocks simultaneously resident on a multiprocessor;
             + managedMemory is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
             + isMultiGpuBoard is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not;
             + multiGpuBoardGroupID is a unique identifier for a group of devices associated with the same board. Devices on the same multi-GPU board will share the same identifier;
             + singleToDoublePrecisionPerfRatio is the ratio of single precision performance (in floating-point operations per second) to double precision performance.
             + pageableMemoryAccess is 1 if the device supports coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
             + concurrentManagedAccess is 1 if the device can coherently access managed memory concurrently with the CPU, and 0 otherwise.
             + computePreemptionSupported is 1 if the device supports Compute Preemption, and 0 otherwise.
             + canUseHostPointerForRegisteredMem is 1 if the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
             + cooperativeLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernel, and 0 otherwise.
             + cooperativeMultiDeviceLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
             + pageableMemoryAccessUsesHostPageTables is 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise.
             + directManagedMemAccessFromHost is 1 if the host can directly access managed memory on the device without migration, and 0 otherwise.
             */

            /*
             * deviceCount = 1
             * GetMaxGflopsDeviceId = 0
             * ----- DeviceID = 0 -----
             * DeviceName = GeForce GTX 1050 Ti
             * DriverVersion = 10.20
             * CUDA ComputeCapability = 6.1
             * Integrated = False
             * MultiProcessorCount = 6
             * ClockRate = 1392 MHz
             * TotalGlobalMemory = 4294 Mb
             * MemoryClockRate = 3504 MHz
             * GlobalMemoryBusWidth = 128 bit
             * maxGridSize[3] (MaxGridDim) = (2147483647; 65535; 65535)
             * maxThreadsPerBlock = 1024
             * maxThreadsDim[3] (MaxBlockDim) = (1024; 1024; 64)
             * MaxThreadsPerMultiProcessor = 2048
             */
        }
 /// <summary>
 ///     Create a new <see cref="BaseOperator" /> with a specified <see cref="IComputationHandler" />.
 ///     The <see cref="IComputationHandler" /> will <c>not</c> be modified by the <see cref="ITrainer" />.
 /// </summary>
 public CudaSinglethreadedOperator() : this(CudaContext.GetMaxGflopsDeviceId())
 {
 }
Exemplo n.º 14
0
        private void initGLAndCuda()
        {
            //Create render target control
            m_renderControl              = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
            m_renderControl.Dock         = DockStyle.Fill;
            m_renderControl.BackColor    = Color.White;
            m_renderControl.BorderStyle  = BorderStyle.FixedSingle;
            m_renderControl.KeyDown     += new KeyEventHandler(m_renderControl_KeyDown);
            m_renderControl.MouseMove   += new MouseEventHandler(m_renderControl_MouseMove);
            m_renderControl.MouseDown   += new MouseEventHandler(m_renderControl_MouseDown);
            m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged);

            panel1.Controls.Add(m_renderControl);
            Console.WriteLine("   OpenGL device is Available");

            int deviceID = CudaContext.GetMaxGflopsDeviceId();

            ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync);
            string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount);

            Console.WriteLine(console);

            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k       = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k  = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k  = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k  = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx);

            hvfield = new cData[DS];
            dvfield = new CudaPitchedDeviceVariable <cData>(DIM, DIM);
            tPitch  = dvfield.Pitch;

            dvfield.CopyToDevice(hvfield);

            vxfield = new CudaDeviceVariable <cData>(DS);
            vyfield = new CudaDeviceVariable <cData>(DS);

            // Create particle array
            particles = new cData[DS];
            initParticles(particles, DIM, DIM);

            // TODO: update kernels to use the new unpadded memory layout for perf
            // rather than the old FFTW-compatible layout
            planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
            planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

            GL.GenBuffers(1, out vbo);
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.BufferData <cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw);
            int bsize;

            GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize);

            if (bsize != DS * cData.SizeOf)
            {
                throw new Exception("Sizes don't match.");
            }

            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
            cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

            texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two);

            stopwatch = new CudaStopWatch(CUEventFlags.Default);

            reshape();
            isInit = true;
            display();
        }
Exemplo n.º 15
0
        public static void Execute()
        {
            Console.WriteLine("Barycentric stuff");

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName = "baryTest.ptx";

            string resNamespace = "TestManagedCuda";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel baryKernel = ctx.LoadKernelPTX(stream, "baryKernel");


            framebufferSize = new int2(5, 5);

            // Allocate input vectors h_A and h_B in host memory
            h_v0     = new float3(0, 1, 0);
            h_v1     = new float3(1, -1, 0);
            h_v2     = new float3(-1, -1, 0);
            h_da     = 3;
            h_db     = 2;
            h_dc     = 1;
            h_dOut   = new float[framebufferSize.x * framebufferSize.y];
            h_width  = framebufferSize.x;
            h_height = framebufferSize.y;

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            dev_v0 = h_v0;
            dev_v1 = h_v1;
            dev_v2 = h_v2;
            dev_da = h_da;
            dev_db = h_db;
            dev_dc = h_dc;

            dev_dOut = new CudaDeviceVariable <float>(framebufferSize.x * framebufferSize.y);

            dev_width  = h_width;
            dev_height = h_height;

            // Invoke kernel
            //int threadsPerBlock = 256;
            //vectorAddKernel.BlockDimensions = threadsPerBlock;
            //vectorAddKernel.GridDimensions = (framebufferSize.x + threadsPerBlock - 1) / threadsPerBlock;


            dim3 windowSize = new dim3(framebufferSize.x, framebufferSize.y);
            dim3 blockSize  = new dim3(16, 16, 1);
            dim3 gridSize   = new dim3(windowSize.x / blockSize.x + 1, windowSize.y / blockSize.y + 1);

            baryKernel.BlockDimensions = blockSize;
            baryKernel.GridDimensions  = gridSize;

            baryKernel.Run(dev_v0.DevicePointer, dev_v1.DevicePointer, dev_v2.DevicePointer, dev_da.DevicePointer, dev_db.DevicePointer, dev_dc.DevicePointer, dev_dOut.DevicePointer, dev_width.DevicePointer, dev_height.DevicePointer);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_dOut = dev_dOut;


            CleanupResources();

            Console.Write("{\n");
            for (int y = 0; y < framebufferSize.y; y++)
            {
                Console.Write("  {");
                for (int x = 0; x < framebufferSize.x; x++)
                {
                    Console.Write(h_dOut[x + y * framebufferSize.y] + "|");
                }
                Console.Write("}\n");
            }
            Console.Write("}\n");

            Console.ReadKey();
        }