Example #1
0
    public void CUDA_AddFloatArrays()
    {
        //Load Kernel image from resources
        Stream stream = new StreamReader(resName).BaseStream;

        if (stream == null)
        {
            throw new ArgumentException("Kernel not found in resources.");
        }

        vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

        var threadsPerBlock = 1024;

        vectorAddKernel.BlockDimensions = threadsPerBlock;
        vectorAddKernel.GridDimensions  = (Count + threadsPerBlock - 1) / threadsPerBlock;

        CudaStopWatch w = new CudaStopWatch();

        w.Start();
        vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
        w.Stop();

        Debug.Log(w.GetElapsedTime() / 1000.0f);
        Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
        Debug.Log($"{h_A[Count-1]} + {h_B[Count-1]} = {C[Count-1]}");

        // Copy result from device memory to host memory
        // h_C contains the result in host memory
        // h_C = d_C;
    }
Example #2
0
        private T[] RunKernel <T>(Action <T[]> method, T[] parameters) where T : struct
        {
            var methodInfo = method.Method;

            string[] kernels;
            string   llvmIr, ptxIr;
            var      ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo);

            Console.WriteLine(llvmIr);
            Console.WriteLine(ptxIr);
            var kernel     = _context.LoadKernelPTX(ptx, kernels[0]);
            var maxThreads = kernel.MaxThreadsPerBlock;

            if (parameters.Length <= maxThreads)
            {
                kernel.BlockDimensions = parameters.Length;
                kernel.GridDimensions  = 1;
            }
            else
            {
                kernel.BlockDimensions = maxThreads;
                kernel.GridDimensions  = parameters.Length / maxThreads;
                if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length)
                {
                    throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0}", maxThreads));
                }
            }
            var gpuMem = new CudaDeviceVariable <T>(parameters.Length);

            gpuMem.CopyToDevice(parameters);
            kernel.Run(gpuMem.DevicePointer);
            gpuMem.CopyToHost(parameters);
            gpuMem.Dispose();
            return(parameters);
        }
Example #3
0
        public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName)
        {
            lock (locker)
            {
                try
                {
                    if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out CudaKernel value))
                    {
                        return(value);
                    }
                    else
                    {
                        value = context.LoadKernelPTX(ptx, kernelName);
                        activeKernels.Add(Tuple.Create(context, ptx, kernelName), value);
                        return(value);
                    }
                }
                catch (Exception err)
                {
                    Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: '{err.Message}'");
                    Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Call stack: '{err.StackTrace}'");

                    throw err;
                }
            }
        }
        static CudaKernel BuildKernelFromFunction(string functionName, ref CudaContext context)
        {
            //CudaContext newContext = new CudaContext();
            CudaKernel kernel = context.LoadKernelPTX(PTX_NAME, functionName);

            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions  = BLOCKS_PER_GRID;
            return(kernel);
        }
        //static float3[] h_A;
        //static float3[] h_C;
        //static CudaDeviceVariable<float3> d_A;
        //static CudaDeviceVariable<float3> d_C;
        public CalculateHeatmap()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            dev = ctx.GetDeviceInfo();
            Console.WriteLine("Using CUDA Device {0} compute level {1} timeout {2}", dev.DeviceName, dev.ComputeCapability, dev.KernelExecTimeoutEnabled ? "enabled" : "disabled");
            string resName;

            resName = @"C:\WEDEV\GpuImplementations\GpuInterpolation\RasterInterpolation_x64.ptx";
            Console.WriteLine("Loading Interpolation Kernel");
            InterpolateKernel = ctx.LoadKernelPTX(resName, "RasterInterpolate");
        }
Example #6
0
        public GrabCutUtils()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);


            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "GrabCutUtils_x64.ptx";
            }
            else
            {
                resName = "GrabCutUtils.ptx";
            }

            string resNamespace = "GrabCutNPP";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }
            byte[] kernel = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;

            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernel, (int)stream.Position, bytesToRead);
            }

            TrimapFromRectKernel   = ctx.LoadKernelPTX(kernel, "_Z20TrimapFromRectKernelPhi8NppiRectii");
            ApplyMatteKernelMode0  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi0EEvP6uchar4iPKS0_iPKhiii");
            ApplyMatteKernelMode1  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi1EEvP6uchar4iPKS0_iPKhiii");
            ApplyMatteKernelMode2  = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi2EEvP6uchar4iPKS0_iPKhiii");
            convertRGBToRGBAKernel = ctx.LoadKernelPTX(kernel, "_Z22convertRGBToRGBAKernelP6uchar4iP6uchar3iii");
        }
Example #7
0
        /// <summary>
        /// Gets the specified context.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="ptx">The PTX.</param>
        /// <param name="kernelName">Name of the kernel.</param>
        /// <returns>CudaKernel.</returns>
        public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName)
        {
            CudaKernel value;

            if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out value))
            {
                return(value);
            }
            else
            {
                value = context.LoadKernelPTX(ptx, kernelName);
                activeKernels.Add(Tuple.Create(context, ptx, kernelName), value);
                return(value);
            }
        }
Example #8
0
        unsafe static public Int64 GPUSobel(Bitmap image, byte[] grayData)
        {
            int width = image.Width;
            int height = image.Height;

            var context = new CudaContext();
            dim3 blockDim = new dim3(16, 16);
            uint gridX = (uint)(width + blockDim.x - 1) / blockDim.x;
            uint gridY = (uint)(height + blockDim.y - 1) / blockDim.y;
            dim3 gridDim = new dim3(gridX, gridY);
            CudaKernel kernel = context.LoadKernelPTX("Kernel.ptx", "Sobel");
            kernel.BlockDimensions = blockDim;
            kernel.GridDimensions = gridDim;

            BitmapData imageData = image.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, image.PixelFormat);
            uint* ptr = (uint*)imageData.Scan0.ToPointer(); // An unsigned int pointer. This points to the image data in memory, each uint is one pixel ARGB
            int stride = imageData.Stride / 4; // Stride is the width of one pixel row, including any padding. In bytes, /4 converts to 4 byte pixels
            CudaDeviceVariable<byte> deviceGrayData = grayData;
            CudaDeviceVariable<uint> output = new CudaDeviceVariable<uint>(width * height);
            Stopwatch sw = Stopwatch.StartNew();
            kernel.Run(deviceGrayData.DevicePointer, output.DevicePointer, width, height);
            sw.Stop();
            Int64 ticks = sw.ElapsedTicks;

            uint[] filteredImage = output;
            int index = 0;
            for (int i = 1; i < height; ++i)
            {
                for (int j = 1; j < width; ++j)
                {
                    *(ptr + i * stride + j) = filteredImage[index++];
                }
            }
            for (int x = 0; x < width; ++x)
            {
                *(ptr + (height - 1) * stride + x) = 0;
                *(ptr + x) = 0;
            }
            for (int y = 0; y < height; ++y)
            {
                *(ptr + y * stride) = 0;
                *(ptr + y * stride + width - 1) = 0;
            }
            // Finish with image and save
            image.UnlockBits(imageData);

            return ticks;
        }
Example #9
0
 static void Test(byte[] ptxFile)
 {
     const int size = 16;
     var context = new CudaContext();
     var kernel = context.LoadKernelPTX(ptxFile, "kernel");
     var memory = context.AllocateMemory(4 * size);
     var gpuMemory = new CudaDeviceVariable<int>(memory);
     var cpuMemory = new int[size];
     for (var i = 0; i < size; i++)
         cpuMemory[i] = i - 2;
     gpuMemory.CopyToDevice(cpuMemory);
     kernel.BlockDimensions = 4;
     kernel.GridDimensions = 4;
     kernel.Run(memory);
     gpuMemory.CopyToHost(cpuMemory);
     for (var i = 0; i < size; i++)
         Console.WriteLine("{0} = {1}", i, cpuMemory[i]);
 }
Example #10
0
    protected void InitializeCUDA()
    {
        string[] filetext = new string[cudafiles.Length];
        cudaKernel = new CudaKernel[cudafiles.Length];
        ctx        = new CudaContext(0);

        for (int i = 0; i < cudafiles.Length; ++i)
        {
            filetext[i] = File.ReadAllText(Application.dataPath + @"\Scripts\CUDA\" + cudafiles[i] + ".cu");
            Debug.Log(filetext[i]);

            CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(filetext[i], cudafiles[i]);
            rtc.Compile(CompileOption);
            Debug.Log(rtc.GetLogAsString());

            byte[] ptx = rtc.GetPTX();
            rtc.Dispose();

            cudaKernel[i] = ctx.LoadKernelPTX(ptx, cudafiles[i]);
        }
    }
        public void CompileKernel()
        {
            //generate as output language obviously from strict code
            var code =
                @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height)
{
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid > width && tid < width*height-width) {
    output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5;
  }
}";

            using var rtc = new CudaRuntimeCompiler(code, "blur");
            try
            {
                // Use max capabilities on actual hardware we have at runtime
                var computeVersion     = CudaContext.GetDeviceComputeCapability(0);
                var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor;
                Console.WriteLine("ShaderModelVersion=" + shaderModelVersion);
                // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
                //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
                //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx
                //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
                rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion });
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                var deviceID = 0;
                var ctx      = new CudaContext(deviceID);
                kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur");
                kernel.GridDimensions  = (Size + 511) / 512;
                kernel.BlockDimensions = 512;
                //unused: float[] copyInput = new float[Size];
                input  = image;
                output = new CudaDeviceVariable <byte>(Size);
            }
            catch (NVRTCException ex)
            {
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                throw new Exception(ex.NVRTCError + " " + ex);
            }
        }
Example #12
0
        protected T[] InternalExecuteCuda <T>(
            byte[] kernelBinary,
            String function,
            int bufferSize,
            ParallelTaskParams loaderParams,
            params Object[] kernelParams) where T : struct
        {
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart);

            CudaContext context = ContextWithDevice(loaderParams.CudaDevice);

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit);
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild);

            CudaDeviceVariable <T> resultBufferVar = new CudaDeviceVariable <T>(bufferSize);

            resultBufferVar.Memset(0);

            List <Tuple <Object, IDisposable> > vars = new List <Tuple <Object, IDisposable> >();

            vars.Add(new Tuple <Object, IDisposable>(resultBufferVar.DevicePointer, resultBufferVar));
            vars.AddRange(WrapDeviceVariables(kernelParams, true));
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite);

            CudaKernel kernel = context.LoadKernelPTX(kernelBinary, function);

            kernel.BlockDimensions = new dim3(loaderParams.BlockSize.Width, loaderParams.BlockSize.Height);
            kernel.GridDimensions  = new dim3(loaderParams.GridSize.Width, loaderParams.GridSize.Height);
            kernel.Run(vars.Select(tuple => tuple.Item1).ToArray());
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute);

            T[] resultBuffer = resultBufferVar;
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead);

            vars.Where(tuple => tuple.Item2 != null).ToList().ForEach(tuple => tuple.Item2.Dispose());
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit);

            return(resultBuffer);
        }
Example #13
0
        public BarycentricCuda(int width, int height)
        {
            Width  = width;
            Height = height;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName = "bary.ptx";

            string resNamespace = "BarycentricCudaLib";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            baryKernel = ctx.LoadKernelPTX(stream, "baryKernel");
        }
Example #14
0
        static void Main(string[] args)
        {
            var assembly     = Assembly.GetExecutingAssembly();
            var resourceName = "simpleOccupancy.simpleOccupancy.ptx";

            ctx = new CudaContext(0);
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                kernel = ctx.LoadKernelPTX(stream, "square");
            }


            Console.WriteLine("starting Simple Occupancy");
            Console.WriteLine();

            Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize);

            int status = test(false);

            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();

            Console.WriteLine("[ Automatic, occupancy-based configuration ]");
            status = test(true);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();
            Console.WriteLine("Test PASSED");
        }
Example #15
0
        static void Main(string[] args)
        {
            int N        = 50000;
            int deviceID = 0;

            ManagedCuda.CudaContext ctx = new CudaContext(deviceID);

            string ptx = @"//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21112126
// Cuda compilation tools, release 8.0, V8.0.43
// Based on LLVM 3.4svn
//

                .version 5.0
                .target sm_20, debug
                .address_size 64

    // .globl   VecAdd

                .visible .entry VecAdd(
            .param .u64 VecAdd_param_0,
            .param .u64 VecAdd_param_1,
            .param .u64 VecAdd_param_2,
            .param .u32 VecAdd_param_3
            )
        {
            .reg .pred  %p<3>;
            .reg .f32   %f<4>;
            .reg .b32   %r<7>;
            .reg .b64   %rd<13>;


            .loc 1 27 1
func_begin0:
                .loc    1 0 0

                        .loc 1 27 1

                        ld.param.u64    %rd1, [VecAdd_param_0];
                ld.param.u64    %rd2, [VecAdd_param_1];
                ld.param.u64    %rd3, [VecAdd_param_2];
                ld.param.u32    %r2, [VecAdd_param_3];
func_exec_begin0:
                .loc    1 29 1
tmp0:
                mov.u32     %r3, %ntid.x;
                mov.u32     %r4, %ctaid.x;
                mul.lo.s32  %r5, %r3, %r4;
                mov.u32     %r6, %tid.x;
                add.s32     %r1, %r5, %r6;
tmp1:
                .loc    1 30 1
                        setp.lt.s32 %p1, %r1, %r2;
                not.pred    %p2, %p1;
                @%p2 bra    BB0_2;
                bra.uni     BB0_1;

BB0_1:
                .loc    1 31 1
tmp2:
                    cvt.s64.s32 %rd4, %r1;
                    shl.b64     %rd5, %rd4, 2;
                    add.s64     %rd6, %rd1, %rd5;
                    ld.f32  %f1, [%rd6];
                    cvt.s64.s32 %rd7, %r1;
                    shl.b64     %rd8, %rd7, 2;
                    add.s64     %rd9, %rd2, %rd8;
                    ld.f32  %f2, [%rd9];
                    add.f32     %f3, %f1, %f2;
                    cvt.s64.s32 %rd10, %r1;
                    shl.b64     %rd11, %rd10, 2;
                    add.s64     %rd12, %rd3, %rd11;
                    st.f32  [%rd12], %f3;
tmp3:

BB0_2:
                    .loc    1 32 2
                            ret;
tmp4:
func_end0:
        }

        .file   1 ""I:/ManagedCuda/managedCuda/Samples/ManagedCudaSamples/vectorAddKernel/vectorAdd.cu"", 1477220395, 691

                .section .debug_info {
            .b32 464
                    .b8 2
                    .b8 0
                    .b32 .debug_abbrev
                    .b8 8
                    .b8 1

                    .b8 108
                    .b8 103
                    .b8 101
                    .b8 110
                    .b8 102
                    .b8 101
                    .b8 58
                    .b8 32
                    .b8 69
                    .b8 68
                    .b8 71
                    .b8 32
                    .b8 52
                    .b8 46
                    .b8 49
                    .b8 48

                    .b8 0
                    .b8 4
                    .b8 73
                    .b8 58
                    .b8 47
                    .b8 77
                    .b8 97
                    .b8 110
                    .b8 97
                    .b8 103
                    .b8 101
                    .b8 100
                    .b8 67
                    .b8 117
                    .b8 100
                    .b8 97
                    .b8 47
                    .b8 109
                    .b8 97
                    .b8 110
                    .b8 97
                    .b8 103
                    .b8 101
                    .b8 100
                    .b8 67
                    .b8 117
                    .b8 100
                    .b8 97
                    .b8 47
                    .b8 83
                    .b8 97
                    .b8 109
                    .b8 112
                    .b8 108
                    .b8 101
                    .b8 115
                    .b8 47
                    .b8 77
                    .b8 97
                    .b8 110
                    .b8 97
                    .b8 103
                    .b8 101
                    .b8 100
                    .b8 67
                    .b8 117
                    .b8 100
                    .b8 97
                    .b8 83
                    .b8 97
                    .b8 109
                    .b8 112
                    .b8 108
                    .b8 101
                    .b8 115
                    .b8 47
                    .b8 118
                    .b8 101
                    .b8 99
                    .b8 116
                    .b8 111
                    .b8 114
                    .b8 65
                    .b8 100
                    .b8 100
                    .b8 75
                    .b8 101
                    .b8 114
                    .b8 110
                    .b8 101
                    .b8 108
                    .b8 47
                    .b8 118
                    .b8 101
                    .b8 99
                    .b8 116
                    .b8 111
                    .b8 114
                    .b8 65
                    .b8 100
                            .b8 100
                            .b8 46
                            .b8 99
                            .b8 117

                            .b8 0
                            .b64 0
                            .b32 .debug_line
                            .b8 73
                            .b8 58
                            .b8 92
                            .b8 77
                            .b8 97
                            .b8 110
                            .b8 97
                            .b8 103
                            .b8 101
                            .b8 100
                            .b8 67
                            .b8 117
                            .b8 100
                            .b8 97
                            .b8 92
                            .b8 109
                            .b8 97
                            .b8 110
                            .b8 97
                            .b8 103
                            .b8 101
                            .b8 100
                            .b8 67
                            .b8 117
                            .b8 100
                            .b8 97
                            .b8 92
                            .b8 83
                            .b8 97
                            .b8 109
                            .b8 112
                            .b8 108
                            .b8 101
                            .b8 115
                            .b8 92
                            .b8 77
                            .b8 97
                            .b8 110
                            .b8 97
                            .b8 103
                            .b8 101
                            .b8 100
                            .b8 67
                            .b8 117
                            .b8 100
                            .b8 97
                            .b8 83
                            .b8 97
                            .b8 109
                            .b8 112
                            .b8 108
                            .b8 101
                            .b8 115
                            .b8 92
                            .b8 118
                            .b8 101
                            .b8 99
                            .b8 116
                            .b8 111
                            .b8 114
                            .b8 65
                            .b8 100
                            .b8 100
                            .b8 75
                            .b8 101
                            .b8 114
                            .b8 110
                            .b8 101
                            .b8 108

                            .b8 0
                            .b8 2

                            .b8 86
                            .b8 101
                            .b8 99
                            .b8 65
                            .b8 100
                            .b8 100

                            .b8 0
                            .b8 86
                            .b8 101
                            .b8 99
                            .b8 65
                            .b8 100
                            .b8 100

                            .b8 0
                            .b32 1
                            .b32 27
                            .b32 422
                            .b8 1
                            .b64 func_begin0
                            .b64 func_end0
                            .b8 1
                            .b8 156
                            .b8 3

                                    .b8 65

                                    .b8 0
                                    .b32 1
                                    .b32 27
                                    .b32 428
                                    .b8 9
                                    .b8 3
                                    .b64 VecAdd_param_0
                                    .b8 7
                                    .b8 3

                                    .b8 66

                                    .b8 0
                                    .b32 1
                                    .b32 27
                                    .b32 428
                                    .b8 9
                                    .b8 3
                                    .b64 VecAdd_param_1
                                    .b8 7
                                    .b8 3

                                    .b8 67

                                    .b8 0
                                    .b32 1
                                    .b32 27
                                    .b32 451
                                    .b8 9
                                    .b8 3
                                    .b64 VecAdd_param_2
                                    .b8 7
                                    .b8 3

                                    .b8 78

                                    .b8 0
                                    .b32 1
                                    .b32 27
                                    .b32 457
                                    .b8 9
                                    .b8 3
                                    .b64 VecAdd_param_3
                                    .b8 7
                                    .b8 4

                                    .b64 tmp0
                                    .b64 tmp4
                                    .b8 4

                                    .b64 tmp0
                                    .b64 tmp3
                                    .b8 4

                                    .b64 tmp0
                                    .b64 tmp3
                                    .b8 5

                                    .b8 105

                                    .b8 0
                                    .b32 1
                                    .b32 29
                                    .b32 457
                                    .b8 5
                                    .b8 144
                                    .b8 177
                                    .b8 228
                                    .b8 149
                                    .b8 1
                                    .b8 2
                                    .b8 0
                                    .b8 0
                                    .b8 0
                                    .b8 0
                                    .b8 6

                                    .b8 118
                                    .b8 111
                                    .b8 105
                                    .b8 100

                                    .b8 0
                                    .b8 7

                                    .b32 434
                                    .b8 12
                                    .b8 8

                                    .b32 439
                                    .b8 9

                                    .b8 102
                                    .b8 108
                                    .b8 111
                                    .b8 97
                                    .b8 116

                                    .b8 0
                                    .b8 4
                                    .b32 4
                                    .b8 7

                                    .b32 439
                                    .b8 12
                                    .b8 9

                                    .b8 105
                                    .b8 110
                                    .b8 116

                                    .b8 0
                                    .b8 5
                                    .b32 4
                                    .b8 0
        }
        .section .debug_abbrev {
            .b8 1

                    .b8 17

                    .b8 1

                    .b8 37

                    .b8 8

                    .b8 19

                    .b8 11

                    .b8 3

                    .b8 8

                    .b8 17

                    .b8 1

                    .b8 16

                    .b8 6

                    .b8 27

                    .b8 8

                    .b8 0

                    .b8 0

                    .b8 2

                    .b8 46

                    .b8 1

                    .b8 135
                    .b8 64

                    .b8 8

                    .b8 3

                    .b8 8

                    .b8 58

                    .b8 6

                    .b8 59

                    .b8 6

                    .b8 73

                    .b8 19

                    .b8 63

                    .b8 12

                    .b8 17

                    .b8 1

                    .b8 18

                    .b8 1

                    .b8 64

                    .b8 10

                    .b8 0

                    .b8 0

                    .b8 3

                    .b8 5

                    .b8 0

                    .b8 3

                    .b8 8

                    .b8 58

                    .b8 6

                    .b8 59

                    .b8 6

                    .b8 73

                    .b8 19

                    .b8 2

                    .b8 10

                    .b8 51

                    .b8 11

                    .b8 0

                    .b8 0

                    .b8 4

                    .b8 11

                    .b8 1

                    .b8 17

                    .b8 1

                    .b8 18

                    .b8 1

                    .b8 0

                    .b8 0

                    .b8 5

                    .b8 52

                    .b8 0

                    .b8 3

                    .b8 8

                    .b8 58

                    .b8 6

                    .b8 59

                    .b8 6

                    .b8 73

                    .b8 19

                    .b8 2

                    .b8 10

                    .b8 51

                    .b8 11

                    .b8 0

                    .b8 0

                    .b8 6

                    .b8 59

                    .b8 0

                    .b8 3

                    .b8 8

                    .b8 0

                    .b8 0

                    .b8 7

                    .b8 15

                    .b8 0

                    .b8 73

                    .b8 19

                    .b8 51

                    .b8 11

                    .b8 0

                    .b8 0

                    .b8 8

                    .b8 38

                            .b8 0

                            .b8 73

                            .b8 19

                            .b8 0

                            .b8 0

                            .b8 9

                            .b8 36

                            .b8 0

                            .b8 3

                            .b8 8

                            .b8 62

                            .b8 11

                            .b8 11

                            .b8 6

                            .b8 0

                            .b8 0

                            .b8 0

        }
        .section .debug_ranges {
        }
        .section .debug_pubnames {
            .b32 25
                    .b8 2
                    .b8 0
                    .b32 .debug_info
                    .b32 464
                    .b32 195
                    .b8 86
                    .b8 101
                    .b8 99
                    .b8 65
                    .b8 100
                    .b8 100
                    .b8 0

                    .b32 0
        }
            ";

            System.IO.Stream moduleImage = new MemoryStream(Encoding.UTF8.GetBytes(ptx));
            CudaKernel       kernel      = ctx.LoadKernelPTX(moduleImage, "VecAdd");

            kernel.GridDimensions  = (N + 255) / 256;
            kernel.BlockDimensions = 256;

            // Allocate input vectors h_A and h_B in host memory
            float[] h_A = new float[N];
            float[] h_B = new float[N];

            // TODO: Initialize input vectors h_A, h_B
            System.Random random = new System.Random();
            for (int i = 0; i < N; ++i)
            {
                h_A[i] = (float)random.NextDouble();
            }
            for (int i = 0; i < N; ++i)
            {
                h_B[i] = (float)random.NextDouble();
            }

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            CudaDeviceVariable <float> d_A = h_A;
            CudaDeviceVariable <float> d_B = h_B;
            CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N);

            // Invoke kernel
            kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            float[] h_C = d_C;

            for (int i = 0; i < 4; ++i)
            {
                System.Console.WriteLine(h_C[i]);
            }
        }
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // with verbaim string @, we only have to double up double quotes: no other escaping
                string source = @"
                extern ""C"" __global__
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                {
                    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
                    if (tid < n)
                    {
                        out[tid] = a * x[tid] + y[tid];
                    }
                }
                ";

                source += Environment.NewLine;

                var name = "Test";
                var headers = new string[0];
                var includeNames = new string[0];

                var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames);

                //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames);
                // --ptxas-options=-v -keep
                compiler.Compile(new string[] { "-G" });

                //var ptxString = compiler.GetPTXAsString(); // for debugging

                var ptx = compiler.GetPTX();

                //compiler2.Compile(new string[] { });

                var kernel = ctx.LoadKernelPTX(ptx, "kernelName");

                //One kernel per cu file:
                //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                kernel.GridDimensions = new dim3(1, 1, 1);
                kernel.BlockDimensions = new dim3(16, 16);

                //kernel.Run()

                var a = new CudaDeviceVariable<double>(100);
                //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                //Multiple kernels per cu file:
                CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };
                CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };

            }
        }
Example #17
0
        static void Main(string[] args)
        {
            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleOccupancy.simpleOccupancy.ptx";

            ctx = new CudaContext(0);
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                kernel = ctx.LoadKernelPTX(stream, "square");
            }

            Console.WriteLine("starting Simple Occupancy");
            Console.WriteLine();

            Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize);

            int status = test(false);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();

            Console.WriteLine("[ Automatic, occupancy-based configuration ]");
            status = test(true);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();
            Console.WriteLine("Test PASSED");
        }
Example #18
0
        static void Main(string[] args)
        {
            ShrQATest.shrQAStart(args);

            Console.WriteLine("Vector Addition");
            int N = 50000;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "vectorAdd_x64.ptx";
            }
            else
            {
                resName = "vectorAdd.ptx";
            }

            string resNamespace = "vectorAdd";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

            // Allocate input vectors h_A and h_B in host memory
            h_A = new float[N];
            h_B = new float[N];


            // Initialize input vectors
            RandomInit(h_A, N);
            RandomInit(h_B, N);

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            d_A = h_A;
            d_B = h_B;
            d_C = new CudaDeviceVariable <float>(N);

            // Invoke kernel
            int threadsPerBlock = 256;

            vectorAddKernel.BlockDimensions = threadsPerBlock;
            vectorAddKernel.GridDimensions  = (N + threadsPerBlock - 1) / threadsPerBlock;

            vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_C = d_C;

            // Verify result
            int i;

            for (i = 0; i < N; ++i)
            {
                float sum = h_A[i] + h_B[i];
                if (Math.Abs(h_C[i] - sum) > 1e-5)
                {
                    break;
                }
            }

            CleanupResources();

            ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // with verbaim string @, we only have to double up double quotes: no other escaping
                string source = @"
                extern ""C"" __global__ 
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                { 
	                size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 
	                if (tid < n) 
	                { 
		                out[tid] = a * x[tid] + y[tid]; 
	                } 
                }
                ";

                source += Environment.NewLine;

                var name         = "Test";
                var headers      = new string[0];
                var includeNames = new string[0];

                var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames);

                //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames);
                // --ptxas-options=-v -keep
                compiler.Compile(new string[] { "-G" });

                //var ptxString = compiler.GetPTXAsString(); // for debugging

                var ptx = compiler.GetPTX();

                //compiler2.Compile(new string[] { });

                var kernel = ctx.LoadKernelPTX(ptx, "kernelName");

                //One kernel per cu file:
                //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                kernel.GridDimensions  = new dim3(1, 1, 1);
                kernel.BlockDimensions = new dim3(16, 16);

                //kernel.Run()

                var a = new CudaDeviceVariable <double>(100);
                //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                //Multiple kernels per cu file:
                CUmodule   cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                CudaKernel kernel1  = new CudaKernel("kernel1", cumodule, ctx)
                {
                    GridDimensions  = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };
                CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                {
                    GridDimensions  = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };
            }
        }
Example #20
0
        static void Main(string[] args)
        {
            int SIGNAL_SIZE        = 50;
            int FILTER_KERNEL_SIZE = 11;

            Console.WriteLine("[simpleCUFFT] is starting...");

            var assembly     = Assembly.GetExecutingAssembly();
            var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

            CudaContext ctx = new CudaContext(0);
            CudaKernel  ComplexPointwiseMulAndScale;

            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
            }

            // Allocate host memory for the signal
            cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code...

            Random rand = new Random(0);

            // Initialize the memory for the signal
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].real = (float)rand.NextDouble();
                h_signal[i].imag = 0;
            }

            // Allocate host memory for the filter
            cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

            // Initialize the memory for the filter
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].real = (float)rand.NextDouble();
                h_filter_kernel[i].imag = 0;
            }

            // Pad signal and filter kernel
            cuFloatComplex[] h_padded_signal        = null;
            cuFloatComplex[] h_padded_filter_kernel = null;
            int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);
            int mem_size = (int)cuFloatComplex.SizeOf * new_size;


            // Allocate device memory for signal
            CudaDeviceVariable <cuFloatComplex> d_signal = new CudaDeviceVariable <cuFloatComplex>(new_size);

            // Copy host memory to device
            d_signal.CopyToDevice(h_padded_signal);

            // Allocate device memory for filter kernel
            CudaDeviceVariable <cuFloatComplex> d_filter_kernel = new CudaDeviceVariable <cuFloatComplex>(new_size);

            // Copy host memory to device
            d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

            // CUFFT plan simple API
            CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1);

            // Transform signal and kernel
            Console.WriteLine("Transforming signal cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
            plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

            // Multiply the coefficients together and normalize the result
            Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
            ComplexPointwiseMulAndScale.BlockDimensions = 256;
            ComplexPointwiseMulAndScale.GridDimensions  = 32;
            ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

            // Transform signal back
            Console.WriteLine("Transforming signal back cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

            // Copy device memory to host
            cuFloatComplex[] h_convolved_signal = d_signal;

            // Allocate host memory for the convolution result
            cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);

            //Destroy CUFFT context
            plan.Dispose();

            // cleanup memory
            d_filter_kernel.Dispose();
            d_signal.Dispose();
            ctx.Dispose();

            if (bTestResult)
            {
                Console.WriteLine("Test Passed");
            }
            else
            {
                Console.WriteLine("Test Failed");
            }
        }
Example #21
0
        public KernelTests()
        {
            ctx = new CudaContext(); // we must call this first
            int Len     = 100000;
            var A       = new SyncVariable <float3>(GenRandomVectors(Len));
            var B       = new SyncVariable <float3>(GenRandomVectors(Len));
            var C       = new SyncVariable <float3>(Len);
            var D       = new SyncVariable <float>(Len);
            var Length  = new SyncVariable <int>(new int[] { Len }); // instead of an int use an int array of length 1
            var TrisCpu = new FEA.Mesher.TriangleSTL[Len];
            var rng     = new Random();

            for (int i = 0; i < Len; i++)
            {
                TrisCpu[i].Vertex1.x = (float)rng.NextDouble();
                TrisCpu[i].Vertex1.y = (float)rng.NextDouble();
                TrisCpu[i].Vertex1.z = (float)rng.NextDouble();

                TrisCpu[i].Vertex2.x = (float)rng.NextDouble();
                TrisCpu[i].Vertex2.y = (float)rng.NextDouble();
                TrisCpu[i].Vertex2.z = (float)rng.NextDouble();

                TrisCpu[i].Vertex3.x = (float)rng.NextDouble();
                TrisCpu[i].Vertex3.y = (float)rng.NextDouble();
                TrisCpu[i].Vertex3.z = (float)rng.NextDouble();
            }
            var Tris = new SyncVariable <FEA.Mesher.TriangleSTL>(TrisCpu);


            var PtxFile            = "KernelUnitTests.ptx";
            var CrossProdKernel    = ctx.LoadKernelPTX(PtxFile, "TestCrossProduct");
            var AddKernel          = ctx.LoadKernelPTX(PtxFile, "TestAdd");
            var SubKernel          = ctx.LoadKernelPTX(PtxFile, "TestSubtract");
            var DotKernel          = ctx.LoadKernelPTX(PtxFile, "TestDotProduct");
            var AreaKernel         = ctx.LoadKernelPTX(PtxFile, "TestTriangleArea");
            var IntersectionKernel = ctx.LoadKernelPTX(PtxFile, "TestPlaneIntersection");
            var BlockDims          = new dim3(512);
            var GridDims           = new dim3(Len / 512 + 1);

            CrossProdKernel.BlockDimensions = BlockDims;
            CrossProdKernel.GridDimensions  = GridDims;
            AddKernel.BlockDimensions       = BlockDims;
            AddKernel.GridDimensions        = GridDims;
            SubKernel.BlockDimensions       = BlockDims;
            SubKernel.GridDimensions        = GridDims;
            DotKernel.BlockDimensions       = BlockDims;
            DotKernel.GridDimensions        = GridDims;
            AreaKernel.BlockDimensions      = BlockDims;
            AreaKernel.GridDimensions       = GridDims;

            CrossProdKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr());
            A.Sync();
            B.Sync();
            C.Sync();
            float eps = 1e-7f;

            for (int i = 0; i < Len; i++)
            {
                var Ans = A.cpuArray[i].Cross(B.cpuArray[i]) - C.cpuArray[i];

                if (Ans.Length >= eps)
                {
                    throw new Exception("Test Failed");
                }
            }
            AddKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr());
            A.Sync();
            B.Sync();
            C.Sync();
            for (int i = 0; i < Len; i++)
            {
                var Ans = A.cpuArray[i] + B.cpuArray[i] - C.cpuArray[i];

                if (Ans.Length >= eps)
                {
                    throw new Exception("Test Failed");
                }
            }
            SubKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr());
            A.Sync();
            B.Sync();
            C.Sync();
            for (int i = 0; i < Len; i++)
            {
                var Ans = A.cpuArray[i] - B.cpuArray[i] - C.cpuArray[i];

                if (Ans.Length >= eps)
                {
                    throw new Exception("Test Failed");
                }
            }
            DotKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), D.GPUPtr());
            A.Sync();
            B.Sync();
            D.Sync();
            for (int i = 0; i < Len; i++)
            {
                float Ans = A.cpuArray[i].Dot(B.cpuArray[i]) - D.cpuArray[i];

                if (Ans >= 3 * eps)
                {
                    throw new Exception("Test Failed");
                }
            }
            AreaKernel.Run(Len, Tris.GPUPtr(), D.GPUPtr());
            Tris.Sync();
            D.Sync();
            for (int i = 0; i < Len; i++)
            {
                float ans = D.cpuArray[i];
            }
        }
Example #22
0
        static void Main(string[] args)
        {
            const int nx = 2048;
            const int ny = 2048;

            // shifts applied to x and y data
            const int x_shift = 5;
            const int y_shift = 7;

            ShrQATest.shrQAStart(args);

            if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0))
            {
                Console.Write("nx and ny must be multiples of TILE_DIM\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED);
            }

            // execution configuration parameters
            dim3 grid    = new dim3(nx / TILE_DIM, ny / TILE_DIM, 1);
            dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1);

            // This will pick the best possible CUDA capable device
            int devID = findCudaDevice(args);


            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "simplePitchLinearTexture_x64.ptx";
            }
            else
            {
                resName = "simplePitchLinearTexture.ptx";
            }

            string resNamespace = "simplePitchLinearTexture";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }
            byte[] kernels = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;

            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead);
            }

            CudaKernel PLKernel    = ctx.LoadKernelPTX(kernels, "shiftPitchLinear");
            CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray");

            CudaStopWatch stopwatch = new CudaStopWatch();

            // ----------------------------------
            // Host allocation and initialization
            // ----------------------------------

            float[] h_idata = new float[nx * ny];
            float[] h_odata = new float[nx * ny];
            float[] gold    = new float[nx * ny];

            for (int i = 0; i < nx * ny; ++i)
            {
                h_idata[i] = (float)i;
            }

            // ------------------------
            // Device memory allocation
            // ------------------------

            // Pitch linear input data
            CudaPitchedDeviceVariable <float> d_idataPL = new CudaPitchedDeviceVariable <float>(nx, ny);

            // Array input data
            CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One);

            // Pitch linear output data
            CudaPitchedDeviceVariable <float> d_odata = new CudaPitchedDeviceVariable <float>(nx, ny);

            // ------------------------
            // copy host data to device
            // ------------------------

            // Pitch linear
            d_idataPL.CopyToDevice(h_idata);

            // Array
            d_idataArray.CopyFromHostToThis <float>(h_idata);

            // ----------------------
            // Bind texture to memory
            // ----------------------

            // Pitch linear
            CudaTextureLinearPitched2D <float> texRefPL = new CudaTextureLinearPitched2D <float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL);
            CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray);

            // ---------------------
            // reference calculation
            // ---------------------

            for (int j = 0; j < ny; j++)
            {
                int jshift = (j + y_shift) % ny;
                for (int i = 0; i < nx; i++)
                {
                    int ishift = (i + x_shift) % nx;
                    gold[j * nx + i] = h_idata[jshift * nx + ishift];
                }
            }

            // ----------------
            // shiftPitchLinear
            // ----------------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            PLKernel.BlockDimensions = threads;
            PLKernel.GridDimensions  = grid;
            stopwatch.Start();
            for (int i = 0; i < NUM_REPS; i++)
            {
                PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
            }
            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timePL = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            bool res = cutComparef(gold, h_odata);

            bool success = true;

            if (res == false)
            {
                Console.Write("*** shiftPitchLinear failed ***\n");
                success = false;
            }

            // ----------
            // shiftArray
            // ----------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            ArrayKernel.BlockDimensions = threads;
            ArrayKernel.GridDimensions  = grid;
            stopwatch.Start();
            for (int i = 0; i < NUM_REPS; i++)
            {
                ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
            }

            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timeArray = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            res = cutComparef(gold, h_odata);

            if (res == false)
            {
                Console.Write("*** shiftArray failed ***\n");
                success = false;
            }

            float bandwidthPL    = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timePL / NUM_REPS);
            float bandwidthArray = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timeArray / NUM_REPS);

            Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n",
                          bandwidthPL, bandwidthArray);

            float fetchRatePL    = nx * ny / 1e+6f / (timePL / (1000.0f * NUM_REPS));
            float fetchRateArray = nx * ny / 1e+6f / (timeArray / (1000.0f * NUM_REPS));

            Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n",
                          fetchRatePL, fetchRateArray);


            // cleanup
            texRefPL.Dispose();
            texRefArray.Dispose();
            d_idataPL.Dispose();
            d_idataArray.Dispose();
            d_odata.Dispose();
            stopwatch.Dispose();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #23
0
        public float3[] GetPointsGPU(int NumPoints)
        {
            int BlockSize = 512;

            if (NumPoints % BlockSize != 0)
            {
                throw new Exception("NumPoints must be divisible by " + BlockSize.ToString());
            }
            int[] TriangleCounts = new int[GridCount + 1];
            var   Maxima         = new float3[GridCount];
            var   Minima         = new float3[GridCount];

            TriangleCounts[0] = 0;
            for (int i = 0; i < GridCount; i++)
            {
                int LocalCount = TriangleCounts[i] + (int)Domains[i].TriangleCount;
                if (Domains[i].TriangleCount > BlockSize)
                {
                    throw new Exception("STL File must have no more than " + BlockSize.ToString() + " Triangles");
                }
                TriangleCounts[i + 1] = LocalCount;
                Minima[i]             = STLReader.ToFloat3(Domains[i].Extrema.Min);
                Maxima[i]             = STLReader.ToFloat3(Domains[i].Extrema.Max);
            }
            var Triangles = new TriangleSTL[TriangleCounts[GridCount]];
            int id        = 0;

            for (int i = 0; i < GridCount; i++)
            {
                for (int j = 0; j < TriangleCounts[i]; j++)
                {
                    var LocalTri = Domains[i].Triangles[j];
                    Triangles[id] = new TriangleSTL(LocalTri);
                    id++;
                }
            }

            var ctx              = new CudaContext(1);
            var DeviceInfo       = ctx.GetDeviceInfo();
            var d_Triangles      = new CudaDeviceVariable <TriangleSTL>(Triangles.Length);
            var d_TriangleCounts = new CudaDeviceVariable <int>(GridCount);
            var d_Minima         = new CudaDeviceVariable <float3>(GridCount);
            var d_Maxima         = new CudaDeviceVariable <float3>(GridCount);
            var d_Points         = new CudaDeviceVariable <float3>(GridCount * NumPoints);
            var h_Points         = new float3[GridCount * NumPoints];
            var rng              = new Random(0); // use a sequence that is repeatable over and over again

            for (int i = 0; i < GridCount * NumPoints; i++)
            {
                h_Points[i].x = (float)rng.NextDouble();
                h_Points[i].y = (float)rng.NextDouble();
                h_Points[i].z = (float)rng.NextDouble();
            }
            int ctr = 0;

            for (int i = 0; i < GridCount; i++)
            {
                for (int j = 0; j < NumPoints; j++)
                {
                    h_Points[ctr].x = Minima[i].x + h_Points[ctr].x * (Maxima[i].x - Minima[i].x);
                    h_Points[ctr].y = Minima[i].y + h_Points[ctr].y * (Maxima[i].y - Minima[i].y);
                    h_Points[ctr].z = Minima[i].z + h_Points[ctr].z * (Maxima[i].z - Minima[i].z);
                    ctr++;
                }
            }
            d_Points         = h_Points;
            d_Triangles      = Triangles;
            d_TriangleCounts = TriangleCounts;
            d_Minima         = Minima;
            d_Maxima         = Maxima;
            // copy over to host
            // TODO generate grid on GPU instead of CPU


            var PointInPolygonKernel = ctx.LoadKernelPTX("PointInPolygon.ptx", "PointInPolygon");
            var BlockDim             = new dim3(BlockSize, 1, 1);
            var GridDim = new dim3(GridCount, 1, 1);

            PointInPolygonKernel.BlockDimensions = BlockDim;
            PointInPolygonKernel.GridDimensions  = GridDim;

            PointInPolygonKernel.Run(GridCount,
                                     NumPoints,
                                     d_TriangleCounts.DevicePointer,
                                     d_Triangles.DevicePointer,
                                     d_Maxima.DevicePointer,
                                     d_Minima.DevicePointer,
                                     d_Points.DevicePointer);
            h_Points = d_Points;

            return(h_Points); // TODO Fix this to remove bad points
        }
Example #24
0
        static void Main(string[] args)
        {
            string filename      = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);


            CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel");

            rtc.Compile(args);

            string log = rtc.GetLogAsString();

            Console.WriteLine(log);

            byte[] ptx = rtc.GetPTX();

            rtc.Dispose();

            CudaContext ctx = new CudaContext(0);

            CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");


            // Print the vector length to be used, and compute its size
            int   numElements = 50000;
            SizeT size        = numElements * sizeof(float);

            Console.WriteLine("[Vector addition of {0} elements]", numElements);

            // Allocate the host input vector A
            float[] h_A = new float[numElements];
            // Allocate the host input vector B
            float[] h_B = new float[numElements];
            // Allocate the host output vector C
            float[] h_C = new float[numElements];

            Random rand = new Random(0);

            // Initialize the host input vectors
            for (int i = 0; i < numElements; ++i)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");
            // Allocate the device input vector A and copy to device
            CudaDeviceVariable <float> d_A = h_A;

            // Allocate the device input vector B and copy to device
            CudaDeviceVariable <float> d_B = h_B;

            // Allocate the device output vector C
            CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(numElements);

            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;

            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
            vectorAdd.GridDimensions  = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);


            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");

            // Free device global memory
            d_A.Dispose();
            d_B.Dispose();
            d_C.Dispose();

            ctx.Dispose();
            Console.WriteLine("Done\n");
        }
Example #25
0
        static void Main(string[] args)
        {
            try
            {
                if (args.Length == 1 && args[0].ToLower().Contains("fidelity"))
                {
                    string[] fseg = args[0].Split(':');
                    deviceID = int.Parse(fseg[1]);
                    nonce    = Int64.Parse(fseg[2]) - 1;
                    range    = int.Parse(fseg[3]);
                    QTEST    = true;
                }
                else
                {
                    if (args.Length > 0)
                    {
                        deviceID = int.Parse(args[0]);
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message);
            }

            try
            {
                if (args.Length > 0)
                {
                    deviceID = int.Parse(args[0]);
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error");
            }

            try
            {
                if (args.Length > 1)
                {
                    port = int.Parse(args[1]);
                    Comms.ConnectToMaster(port);
                }
                else
                {
                    TEST = true;
                    Logger.CopyToConsole = true;
                    CGraph.ShowCycles    = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Master connection error");
            }

            try
            {
                if (args.Length > 3)
                {
                    gpuCount = int.Parse(args[3]);
                    fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
                    if (fastCuda)
                    {
                        Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
                    }
                }
            }
            catch
            {
            }

            if (TEST)
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };
            }
            else
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };

                if (!Comms.IsConnected())
                {
                    Console.WriteLine("Master connection failed, aborting");
                    Logger.Log(LogLevel.Error, "No master connection, exitting!");
                    return;
                }

                if (deviceID < 0)
                {
                    int devCnt             = CudaContext.GetDeviceCount();
                    GpuDevicesMessage gpum = new GpuDevicesMessage()
                    {
                        devices = new List <GpuDevice>(devCnt)
                    };
                    for (int i = 0; i < devCnt; i++)
                    {
                        string name = CudaContext.GetDeviceName(i);
                        var    info = CudaContext.GetDeviceInfo(i);
                        gpum.devices.Add(new GpuDevice()
                        {
                            deviceID = i, name = name, memory = info.TotalGlobalMemory
                        });
                    }
                    //Console.WriteLine(devCnt);
                    Comms.gpuMsg = gpum;
                    Comms.SetEvent();
                    //Console.WriteLine("event fired");
                    Task.Delay(1000).Wait();
                    //Console.WriteLine("closing");
                    Comms.Close();
                    return;
                }
            }

            try
            {
                var assembly       = Assembly.GetEntryAssembly();
                var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
                ctx = new CudaContext(deviceID, /*!fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost);
                string pow = new StreamReader(resourceStream).ReadToEnd();

                //pow = File.ReadAllText(@"kernel_x64.ptx");

                Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536;

                using (var s = GenerateStreamFromString(pow))
                {
                    if (!Turing)
                    {
                        meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 });
                        meanSeedA.BlockDimensions = 512;
                        meanSeedA.GridDimensions  = 1024;
                        meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 });
                        meanRound.BlockDimensions = 512;
                        meanRound.GridDimensions  = 4096;
                        meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRound_4.BlockDimensions = 1024;
                        meanRound_4.GridDimensions  = 1024;
                        meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRoundJoin.BlockDimensions = 1024;
                        meanRoundJoin.GridDimensions  = 4096;
                        meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanTail = ctx.LoadKernelPTX(s, "FluffyTail");
                        meanTail.BlockDimensions = 1024;
                        meanTail.GridDimensions  = 4096;
                        meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                        meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery");
                        meanRecover.BlockDimensions = 256;
                        meanRecover.GridDimensions  = 2048;
                        meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
                    }
                    else
                    {
                        meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 });
                        meanSeedA.BlockDimensions = 512;
                        meanSeedA.GridDimensions  = 1024;
                        meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRound.BlockDimensions = 1024;
                        meanRound.GridDimensions  = 4096;
                        meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 });
                        meanRound_4.BlockDimensions = 1024;
                        meanRound_4.GridDimensions  = 1024;
                        meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRoundJoin.BlockDimensions = 1024;
                        meanRoundJoin.GridDimensions  = 4096;
                        meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanTail = ctx.LoadKernelPTX(s, "FluffyTail");
                        meanTail.BlockDimensions = 1024;
                        meanTail.GridDimensions  = 4096;
                        meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                        meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery");
                        meanRecover.BlockDimensions = 256;
                        meanRecover.GridDimensions  = 2048;
                        meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                d_buffer    = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 8 : 1));
                d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2));
                d_bufferB   = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));

                d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE);
                d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE);
                d_aux      = new CudaDeviceVariable <uint>(512);

                Array.Clear(h_indexesA, 0, h_indexesA.Length);
                Array.Clear(h_indexesB, 0, h_indexesA.Length);

                d_indexesA = h_indexesA;
                d_indexesB = h_indexesB;

                streamPrimary = new CudaStream(CUStreamFlags.NonBlocking);
            }
            catch (Exception ex)
            {
                Task.Delay(200).Wait();
                Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            int loopCnt = 0;

            while (!Comms.IsTerminated)
            {
                try
                {
                    if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow))
                    {
                        Logger.Log(LogLevel.Info, string.Format("Waiting for job...."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                    {
                        currentJob           = Comms.nextJob;
                        currentJob.timestamp = DateTime.Now;
                    }

                    if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
                    {
                        Logger.Log(LogLevel.Info, string.Format("Job too old..."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    // test runs only once
                    if (TEST && ++loopCnt >= range)
                    {
                        Comms.IsTerminated = true;
                    }

                    Solution s;
                    while (graphSolutions.TryDequeue(out s))
                    {
                        meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges());
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                        streamPrimary.Synchronize();
                        s.nonces = new uint[32];
                        d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4);
                        s.nonces = s.nonces.OrderBy(n => n).ToArray();
                        //fidelity = (32-cycles_found / graphs_searched) * 32
                        solutions++;
                        s.fidelity = ((double)solutions / (double)trims) * 32.0;
                        //Console.WriteLine(s.fidelity.ToString("0.000"));
                        if (Comms.IsConnected())
                        {
                            Comms.graphSolutionsOut.Enqueue(s);
                            Comms.SetEvent();
                        }
                        if (QTEST)
                        {
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}");
                            Console.ResetColor();
                        }
                    }

                    if (QTEST)
                    {
                        currentJob = currentJob.NextSequential(ref nonce);
                        Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}");
                    }
                    else
                    {
                        currentJob = currentJob.Next();
                    }

                    Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                    timer.Restart();

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    d_aux.MemsetAsync(0, streamPrimary.Stream);

                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3);

                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072);


                    //streamPrimary.Synchronize();
                    //h_indexesA = d_indexesA;
                    //h_indexesB = d_indexesB;
                    //var sumA = h_indexesA.Sum(e => e);
                    //var sumB = h_indexesB.Sum(e => e);
                    //streamPrimary.Synchronize();

                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRoundJoin.RunAsync(streamPrimary.Stream,
                                           d_buffer.DevicePointer,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3,
                                           d_bufferB.DevicePointer,
                                           d_indexesA.DevicePointer,
                                           d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2);

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer);
                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer);

                    for (int i = 0; i < (TEST ? 80 : trimRounds); i++)
                    //for (int i = 0; i < 85; i++)
                    {
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer);
                    }

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                    Task.Delay((int)lastTrimMs).Wait();

                    streamPrimary.Synchronize();

                    uint[] count = new uint[2];
                    d_indexesA.CopyToHost(count, 0, 0, 8);

                    if (count[0] > 131071)
                    {
                        // trouble
                        count[0] = 131071;
                        // log
                    }

                    hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                    streamPrimary.Synchronize();
                    System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                    trims++;
                    timer.Stop();
                    lastTrimMs          = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500);
                    currentJob.solvedAt = DateTime.Now;
                    currentJob.trimTime = timer.ElapsedMilliseconds;

                    //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                    Logger.Log(LogLevel.Info, string.Format("GPU NV{2}:     Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID));


                    FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer);

                    if (trims % 50 == 0 && TEST)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                        Console.ResetColor();
                    }

                    /*
                     * if (TEST)
                     * {
                     *  //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                     *
                     *  CGraph cg = FinderBag.GetFinder();
                     *  cg.SetEdges(h_a, (int)count[0]);
                     *  cg.SetHeader(currentJob);
                     *
                     *  //currentJob = currentJob.Next();
                     *
                     *  Task.Factory.StartNew(() =>
                     *     {
                     *         Stopwatch sw = new Stopwatch();
                     *         sw.Start();
                     *
                     *         if (count[0] < 131071)
                     *         {
                     *             try
                     *             {
                     *                 if (findersInFlight++ < 3)
                     *                 {
                     *                     Stopwatch cycleTime = new Stopwatch();
                     *                     cycleTime.Start();
                     *                     cg.FindSolutions(graphSolutions);
                     *                     cycleTime.Stop();
                     *                     AdjustTrims(cycleTime.ElapsedMilliseconds);
                     *                     //if (graphSolutions.Count > 0) solutions++;
                     *                 }
                     *                 else
                     *                     Logger.Log(LogLevel.Warning, "CPU overloaded!");
                     *             }
                     *             catch (Exception ex)
                     *             {
                     *                 Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message);
                     *             }
                     *             finally
                     *             {
                     *                 FinderBag.ReturnFinder(cg);
                     *                 findersInFlight--;
                     *             }
                     *         }
                     *
                     *         sw.Stop();
                     *
                     *         if (trims % 50 == 0)
                     *         {
                     *             Console.ForegroundColor = ConsoleColor.Green;
                     *             Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions );
                     *             Console.ResetColor();
                     *         }
                     *         //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count);
                     *         //Console.WriteLine("Duped edges: {0}", cg.dupes);
                     *         if (!QTEST)
                     *          Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                     *     });
                     *
                     *  //h_indexesA = d_indexesA;
                     *  //h_indexesB = d_indexesB;
                     *
                     *  //var sumA = h_indexesA.Sum(e => e);
                     *  //var sumB = h_indexesB.Sum(e => e);
                     *
                     *  ;
                     * }
                     * else
                     * {
                     *  CGraph cg = FinderBag.GetFinder();
                     *  cg.SetEdges(h_a, (int)count[0]);
                     *  cg.SetHeader(currentJob);
                     *
                     *  Task.Factory.StartNew(() =>
                     *  {
                     *      if (count[0] < 131071)
                     *      {
                     *          try
                     *          {
                     *              if (findersInFlight++ < 3)
                     *              {
                     *                  Stopwatch cycleTime = new Stopwatch();
                     *                  cycleTime.Start();
                     *                  cg.FindSolutions(graphSolutions);
                     *                  cycleTime.Stop();
                     *                  AdjustTrims(cycleTime.ElapsedMilliseconds);
                     *              }
                     *              else
                     *                  Logger.Log(LogLevel.Warning, "CPU overloaded!");
                     *          }
                     *          catch (Exception ex)
                     *          {
                     *              Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message);
                     *          }
                     *          finally
                     *          {
                     *              FinderBag.ReturnFinder(cg);
                     *              findersInFlight--;
                     *          }
                     *      }
                     *  });
                     * }
                     *
                     */
                }
                catch (Exception ex)
                {
                    Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
                    Task.Delay(500).Wait();
                    break;
                }
            }

            // clean up
            try
            {
                Task.Delay(500).Wait();

                Comms.Close();

                d_buffer.Dispose();
                d_indexesA.Dispose();
                d_indexesB.Dispose();
                d_aux.Dispose();

                streamPrimary.Dispose();
                streamSecondary.Dispose();

                hAligned_a.Dispose();

                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }
            catch { }
        }
        static void Main(string[] args)
        {
            try
            {
                if (args.Length > 0)
                {
                    deviceID = int.Parse(args[0]);
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error");
            }

            try
            {
                if (args.Length > 1)
                {
                    port = int.Parse(args[1]);
                    Comms.ConnectToMaster(port);
                }
                else
                {
                    TEST = true;
                    Logger.CopyToConsole = true;
                    CGraph.ShowCycles    = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Master connection error");
            }

            try
            {
                if (args.Length > 3)
                {
                    gpuCount = int.Parse(args[3]);
                    fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
                    if (fastCuda)
                    {
                        Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
                    }
                }
            }
            catch
            {
            }

            if (TEST)
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };
            }
            else
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };

                if (!Comms.IsConnected())
                {
                    Console.WriteLine("Master connection failed, aborting");
                    Logger.Log(LogLevel.Error, "No master connection, exitting!");
                    return;
                }

                if (deviceID < 0)
                {
                    int devCnt             = CudaContext.GetDeviceCount();
                    GpuDevicesMessage gpum = new GpuDevicesMessage()
                    {
                        devices = new List <GpuDevice>(devCnt)
                    };
                    for (int i = 0; i < devCnt; i++)
                    {
                        string name = CudaContext.GetDeviceName(i);
                        var    info = CudaContext.GetDeviceInfo(i);
                        gpum.devices.Add(new GpuDevice()
                        {
                            deviceID = i, name = name, memory = info.TotalGlobalMemory
                        });
                    }
                    //Console.WriteLine(devCnt);
                    Comms.gpuMsg = gpum;
                    Comms.SetEvent();
                    //Console.WriteLine("event fired");
                    Task.Delay(1000).Wait();
                    //Console.WriteLine("closing");
                    Comms.Close();
                    return;
                }
            }


            try
            {
                var assembly       = Assembly.GetEntryAssembly();
                var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
                ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost);

                meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A");
                meanSeedA.BlockDimensions = 128;
                meanSeedA.GridDimensions  = 2048;
                meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB.BlockDimensions = 128;
                meanSeedB.GridDimensions  = 2048;
                meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB_4.BlockDimensions = 128;
                meanSeedB_4.GridDimensions  = 1024;
                meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound.BlockDimensions = 512;
                meanRound.GridDimensions  = 4096;
                meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound_2.BlockDimensions = 512;
                meanRound_2.GridDimensions  = 2048;
                meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J");
                meanRoundJoin.BlockDimensions = 512;
                meanRoundJoin.GridDimensions  = 4096;
                meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail");
                meanTail.BlockDimensions = 1024;
                meanTail.GridDimensions  = 4096;
                meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery");
                meanRecover.BlockDimensions = 256;
                meanRecover.GridDimensions  = 2048;
                meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                d_buffer    = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32);
                d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));
                d_bufferB   = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8));

                d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);
                d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);

                Array.Clear(h_indexesA, 0, h_indexesA.Length);
                Array.Clear(h_indexesB, 0, h_indexesA.Length);

                d_indexesA = h_indexesA;
                d_indexesB = h_indexesB;

                streamPrimary   = new CudaStream(CUStreamFlags.NonBlocking);
                streamSecondary = new CudaStream(CUStreamFlags.NonBlocking);
            }
            catch (Exception ex)
            {
                Task.Delay(200).Wait();
                Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            int loopCnt = 0;

            while (!Comms.IsTerminated)
            {
                try
                {
                    if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow))
                    {
                        Logger.Log(LogLevel.Info, string.Format("Waiting for job...."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                    {
                        currentJob           = Comms.nextJob;
                        currentJob.timestamp = DateTime.Now;
                    }

                    if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
                    {
                        Logger.Log(LogLevel.Info, string.Format("Job too old..."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    // test runs only once
                    if (TEST && loopCnt++ > 100)
                    {
                        Comms.IsTerminated = true;
                    }

                    Solution s;
                    while (graphSolutions.TryDequeue(out s))
                    {
                        meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges());
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                        streamPrimary.Synchronize();
                        s.nonces = new uint[40];
                        d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4);
                        s.nonces = s.nonces.OrderBy(n => n).ToArray();
                        lock (Comms.graphSolutionsOut)
                        {
                            Comms.graphSolutionsOut.Enqueue(s);
                        }
                        Comms.SetEvent();
                    }
                    uint[] count;
                    do
                    {
                        if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                        {
                            currentJob           = Comms.nextJob;
                            currentJob.timestamp = DateTime.Now;
                        }
                        currentJob = currentJob.Next();

                        Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                        timer.Restart();

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);

                        meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48);

                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);

                        //d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4);

                        for (int i = 0; i < trimRounds; i++)
                        {
                            d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                            d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                        }

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                        ctx.Synchronize();
                        streamPrimary.Synchronize();

                        count = new uint[2];
                        d_indexesA.CopyToHost(count, 0, 0, 8);

                        if (count[0] > 4194304)
                        {
                            // trouble
                            count[0] = 4194304;
                            // log
                        }

                        hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                        streamPrimary.Synchronize();
                        System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                        timer.Stop();
                        currentJob.solvedAt = DateTime.Now;
                        currentJob.trimTime = timer.ElapsedMilliseconds;

                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                        Logger.Log(LogLevel.Info, string.Format("GPU NV{2}:     Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height));
                    }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST));

                    if (TEST)
                    {
                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);

                        CGraph cg = FinderBag.GetFinder();
                        if (cg == null)
                        {
                            continue;
                        }

                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        //currentJob = currentJob.Next();

                        Task.Factory.StartNew(() =>
                        {
                            Stopwatch sw = new Stopwatch();
                            sw.Start();

                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }

                            sw.Stop();

                            if (++trims % 50 == 0)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                                Console.ResetColor();
                            }
                            //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count);
                            //Console.WriteLine("Duped edges: {0}", cg.dupes);
                            Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                        });

                        //h_indexesA = d_indexesA;
                        //h_indexesB = d_indexesB;

                        //var sumA = h_indexesA.Sum(e => e);
                        //var sumB = h_indexesB.Sum(e => e);

                        ;
                    }
                    else
                    {
                        CGraph cg = FinderBag.GetFinder();
                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        Task.Factory.StartNew(() =>
                        {
                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }
                        });
                    }
                }
                catch (Exception ex)
                {
                    Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
                    Task.Delay(5000).Wait();
                }
            }

            // clean up
            try
            {
                Task.Delay(500).Wait();

                Comms.Close();

                d_buffer.Dispose();
                d_indexesA.Dispose();
                d_indexesB.Dispose();

                streamPrimary.Dispose();
                streamSecondary.Dispose();

                hAligned_a.Dispose();

                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }
            catch { }
        }
Example #27
0
        public static void Execute()
        {
            Console.WriteLine("Barycentric stuff");

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName = "baryTest.ptx";

            string resNamespace = "TestManagedCuda";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel baryKernel = ctx.LoadKernelPTX(stream, "baryKernel");


            framebufferSize = new int2(5, 5);

            // Allocate input vectors h_A and h_B in host memory
            h_v0     = new float3(0, 1, 0);
            h_v1     = new float3(1, -1, 0);
            h_v2     = new float3(-1, -1, 0);
            h_da     = 3;
            h_db     = 2;
            h_dc     = 1;
            h_dOut   = new float[framebufferSize.x * framebufferSize.y];
            h_width  = framebufferSize.x;
            h_height = framebufferSize.y;

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            dev_v0 = h_v0;
            dev_v1 = h_v1;
            dev_v2 = h_v2;
            dev_da = h_da;
            dev_db = h_db;
            dev_dc = h_dc;

            dev_dOut = new CudaDeviceVariable <float>(framebufferSize.x * framebufferSize.y);

            dev_width  = h_width;
            dev_height = h_height;

            // Invoke kernel
            //int threadsPerBlock = 256;
            //vectorAddKernel.BlockDimensions = threadsPerBlock;
            //vectorAddKernel.GridDimensions = (framebufferSize.x + threadsPerBlock - 1) / threadsPerBlock;


            dim3 windowSize = new dim3(framebufferSize.x, framebufferSize.y);
            dim3 blockSize  = new dim3(16, 16, 1);
            dim3 gridSize   = new dim3(windowSize.x / blockSize.x + 1, windowSize.y / blockSize.y + 1);

            baryKernel.BlockDimensions = blockSize;
            baryKernel.GridDimensions  = gridSize;

            baryKernel.Run(dev_v0.DevicePointer, dev_v1.DevicePointer, dev_v2.DevicePointer, dev_da.DevicePointer, dev_db.DevicePointer, dev_dc.DevicePointer, dev_dOut.DevicePointer, dev_width.DevicePointer, dev_height.DevicePointer);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_dOut = dev_dOut;


            CleanupResources();

            Console.Write("{\n");
            for (int y = 0; y < framebufferSize.y; y++)
            {
                Console.Write("  {");
                for (int x = 0; x < framebufferSize.x; x++)
                {
                    Console.Write(h_dOut[x + y * framebufferSize.y] + "|");
                }
                Console.Write("}\n");
            }
            Console.Write("}\n");

            Console.ReadKey();
        }
Example #28
0
        static void Main(string[] args)
        {
            int SIGNAL_SIZE = 50;
            int FILTER_KERNEL_SIZE = 11;

            Console.WriteLine("[simpleCUFFT] is starting...");

            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

            CudaContext ctx = new CudaContext(0);
            CudaKernel ComplexPointwiseMulAndScale;
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
            }

            // Allocate host memory for the signal
            cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code...

            Random rand = new Random(0);
            // Initialize the memory for the signal
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].real = (float)rand.NextDouble();
                h_signal[i].imag = 0;
            }

            // Allocate host memory for the filter
            cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

            // Initialize the memory for the filter
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].real = (float)rand.NextDouble();
                h_filter_kernel[i].imag = 0;
            }

            // Pad signal and filter kernel
            cuFloatComplex[] h_padded_signal = null;
            cuFloatComplex[] h_padded_filter_kernel = null;
            int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);
            int mem_size = (int)cuFloatComplex.SizeOf * new_size;

            // Allocate device memory for signal
            CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size);
            // Copy host memory to device
            d_signal.CopyToDevice(h_padded_signal);

            // Allocate device memory for filter kernel
            CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size);

            // Copy host memory to device
            d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

            // CUFFT plan simple API
            CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1);

            // Transform signal and kernel
            Console.WriteLine("Transforming signal cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
            plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

            // Multiply the coefficients together and normalize the result
            Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
            ComplexPointwiseMulAndScale.BlockDimensions = 256;
            ComplexPointwiseMulAndScale.GridDimensions = 32;
            ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

            // Transform signal back
            Console.WriteLine("Transforming signal back cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

            // Copy device memory to host
            cuFloatComplex[] h_convolved_signal = d_signal;

            // Allocate host memory for the convolution result
            cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);

            //Destroy CUFFT context
            plan.Dispose();

            // cleanup memory
            d_filter_kernel.Dispose();
            d_signal.Dispose();
            ctx.Dispose();

            if (bTestResult)
            {
                Console.WriteLine("Test Passed");
            }
            else
            {
                Console.WriteLine("Test Failed");
            }
        }
Example #29
0
        static void Main(string[] args)
        {
            ShrQATest.shrQAStart(args);

            Console.WriteLine("Vector Addition");
            int N = 50000;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "vectorAdd_x64.ptx";
            else
                resName = "vectorAdd.ptx";

            string resNamespace = "vectorAdd";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

            // Allocate input vectors h_A and h_B in host memory
            h_A = new float[N];
            h_B = new float[N];

            // Initialize input vectors
            RandomInit(h_A, N);
            RandomInit(h_B, N);

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            d_A = h_A;
            d_B = h_B;
            d_C = new CudaDeviceVariable<float>(N);

            // Invoke kernel
            int threadsPerBlock = 256;
            vectorAddKernel.BlockDimensions = threadsPerBlock;
            vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock;

            vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_C = d_C;

            // Verify result
            int i;
            for (i = 0; i < N; ++i)
            {
                float sum = h_A[i] + h_B[i];
                if (Math.Abs(h_C[i] - sum) > 1e-5)
                    break;
            }

            CleanupResources();

            ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #30
0
        static void Main(string[] args)
        {
            string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);

            CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel");

            rtc.Compile(args);

            string log = rtc.GetLogAsString();

            Console.WriteLine(log);

            byte[] ptx = rtc.GetPTX();

            rtc.Dispose();

            CudaContext ctx = new CudaContext(0);

            CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

            // Print the vector length to be used, and compute its size
            int numElements = 50000;
            SizeT size = numElements * sizeof(float);
            Console.WriteLine("[Vector addition of {0} elements]", numElements);

            // Allocate the host input vector A
            float[] h_A = new float[numElements];
            // Allocate the host input vector B
            float[] h_B = new float[numElements];
            // Allocate the host output vector C
            float[] h_C = new float[numElements];

            Random rand = new Random(0);

            // Initialize the host input vectors
            for (int i = 0; i < numElements; ++i)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");
            // Allocate the device input vector A and copy to device
            CudaDeviceVariable<float> d_A = h_A;

            // Allocate the device input vector B and copy to device
            CudaDeviceVariable<float> d_B = h_B;

            // Allocate the device output vector C
            CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements);

            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1);
            vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);

            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");

            // Free device global memory
            d_A.Dispose();
            d_B.Dispose();
            d_C.Dispose();

            ctx.Dispose();
            Console.WriteLine("Done\n");
        }
Example #31
0
        public void TestPTX()
        {
            LLVM.InitializeAllTargets();
            LLVM.InitializeAllTargetMCs();
            LLVM.InitializeAllTargetInfos();
            LLVM.InitializeAllAsmPrinters();
            ModuleRef mod = LLVM.ModuleCreateWithName("llvmptx");
            var       pt  = LLVM.PointerType(LLVM.Int64Type(), 1);

            TypeRef[]     param_types = { pt };
            TypeRef       ret_type    = LLVM.FunctionType(LLVM.VoidType(), param_types, false);
            ValueRef      sum         = LLVM.AddFunction(mod, "sum", ret_type);
            BasicBlockRef entry       = LLVM.AppendBasicBlock(sum, "entry");
            BuilderRef    builder     = LLVM.CreateBuilder();

            LLVM.PositionBuilderAtEnd(builder, entry);
            var      v   = LLVM.BuildLoad(builder, LLVM.GetParam(sum, 0), "");
            ValueRef tmp = LLVM.BuildAdd(builder, v, LLVM.ConstInt(LLVM.Int64Type(), 1, false), "tmp");

            LLVM.BuildStore(builder, tmp, LLVM.GetParam(sum, 0));
            LLVM.BuildRetVoid(builder);
            MyString the_error = new MyString();

            LLVM.VerifyModule(mod, VerifierFailureAction.PrintMessageAction, the_error);

            string    triple = "nvptx64-nvidia-cuda";
            TargetRef t2;
            var       b = LLVM.GetTargetFromTriple(triple, out t2, the_error);

            string cpu      = "";
            string features = "";

            TargetMachineRef tmr = LLVM.CreateTargetMachine(t2, triple, cpu, features,
                                                            CodeGenOptLevel.CodeGenLevelDefault,
                                                            RelocMode.RelocDefault,
                                                            CodeModel.CodeModelKernel);
            ContextRef context_ref = LLVM.ContextCreate();
            ValueRef   kernelMd    = LLVM.MDNodeInContext(context_ref, new ValueRef[3]
            {
                sum,
                LLVM.MDStringInContext(context_ref, "kernel", 6),
                LLVM.ConstInt(LLVM.Int32TypeInContext(context_ref), 1, false)
            });

            LLVM.AddNamedMetadataOperand(mod, "nvvm.annotations", kernelMd);
            var y1 = LLVM.TargetMachineEmitToMemoryBuffer(
                tmr,
                mod,
                Swigged.LLVM.CodeGenFileType.AssemblyFile,
                the_error,
                out MemoryBufferRef buffer);
            string ptx = null;

            try
            {
                ptx = LLVM.GetBufferStart(buffer);
                uint length = LLVM.GetBufferSize(buffer);
                // Output the PTX assembly code. We can run this using the CUDA Driver API
                System.Console.WriteLine(ptx);
            }
            finally
            {
                LLVM.DisposeMemoryBuffer(buffer);
            }


            // RUN THE MF.

            Int64[]     h_C             = new Int64[100];
            CudaContext ctx             = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            CudaKernel  kernel          = ctx.LoadKernelPTX(Encoding.ASCII.GetBytes(ptx), "sum");
            var         d_C             = new CudaDeviceVariable <Int64>(100);
            int         N               = 1;
            int         threadsPerBlock = 256;

            kernel.BlockDimensions = threadsPerBlock;
            kernel.GridDimensions  = (N + threadsPerBlock - 1) / threadsPerBlock;
            kernel.Run(d_C.DevicePointer);
            h_C = d_C;
            System.Console.WriteLine("Result " + h_C[0]);
            if (h_C[0] != 1)
            {
                throw new Exception("Failed.");
            }
            LLVM.DumpModule(mod);
            LLVM.DisposeBuilder(builder);
        }
Example #32
0
        public static CudaKernel load_kernel(String kernelName)
        {
            byte[] ptx = prepare_kernel(kernelName);

            return(ctx.LoadKernelPTX(ptx, kernelName));
        }