/// <summary>
/// Exercise skeleton for a multi-stream vector add.
/// Host arrays are filled, pinned and uploaded to the device; stream creation,
/// the per-stream kernel launch, the device-to-host copy and stream teardown are
/// still left as TODO markers. Until those are filled in, nothing modifies
/// <c>a</c>, so the verification loop reports a mismatch at index 0 and exits
/// with code 6.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
        {
            int nStreams = 8;

            cudaStream_t[] streams = new cudaStream_t[nStreams];

            //create streams

            int N = 1024 * 1024 * 32;

            float[] a = new float[N];
            float[] b = new float[N];

            for (int k = 0; k < N; ++k)
            {
                a[k] = (float)k;
                b[k] = 1.0F;
            }

            IntPtr d_a, d_b; // device pointers

            // Check every runtime call: an ignored failure here would only show up
            // later as a confusing verification error.
            cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
            cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

            // Pin the managed arrays so the GC cannot move them while their
            // addresses are handed to the CUDA runtime.
            GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
            GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
            IntPtr   h_a      = handle_a.AddrOfPinnedObject();
            IntPtr   h_b      = handle_b.AddrOfPinnedObject();

            cuda.ERROR_CHECK(cuda.DeviceSynchronize());

            cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
            cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

            int slice = N / nStreams; // size of the array compute by each stream

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program());
            int     start;
            int     stop;

            // call kernel with each stream

            // copy data device to host

            // synchronize and destroy streams

            for (int i = 0; i < N; ++i)
            {
                // Expected result once the exercise is completed: b (1.0) added 100 times.
                float expected = (float)i + (1.0F * 100.0F);
                if (a[i] != expected)
                {
                    // Report the value that was actually compared against
                    // (the message used to print i + 1).
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], expected);
                    Environment.Exit(6); // abort
                }
            }

            handle_a.Free();
            handle_b.Free();

            // Release the device buffers allocated above.
            cuda.ERROR_CHECK(cuda.Free(d_a));
            cuda.ERROR_CHECK(cuda.Free(d_b));
        }
        /// <summary>
        /// Loads a single-channel image from a .pgm or .png file, widens each
        /// 8-bit sample to <c>ushort</c>, and uploads the result to a pitched
        /// device allocation using <paramref name="stream"/>.
        /// </summary>
        /// <param name="path">Path of the image file; the extension selects the decoder (matched case-insensitively).</param>
        /// <param name="stream">Stream passed to the 2D host-to-device copy.</param>
        /// <returns>
        /// An <c>NPPImage</c> with <c>width</c>, <c>height</c>, <c>pitch</c>,
        /// <c>hostData</c> and <c>deviceData</c> filled in.
        /// </returns>
        /// <exception cref="NotSupportedException">The extension is neither pgm nor png, or the png could not be read as a bitmap.</exception>
        /// <exception cref="InvalidDataException">The file holds fewer samples than width * height.</exception>
        public static NPPImage Load(string path, cudaStream_t stream)
        {
            NPPImage result = new NPPImage();

            // Normalise once so "IMAGE.PGM" and "photo.PNG" are accepted as well.
            string extension = Path.GetExtension(path).ToLowerInvariant();

            byte[] rawData;
            if (extension.Contains("pgm"))
            {
                using (FileStream fs = new FileStream(path, FileMode.Open))
                {
                    using (TextReader tReader = new StreamReader(fs))
                        using (BinaryReader bReader = new BinaryReader(fs))
                        {
                            string   formatLine = tReader.ReadLine(); // skip
                            string   sizeLine   = tReader.ReadLine();
                            string[] splitted   = sizeLine.Split(' ');
                            result.width  = int.Parse(splitted[0]);
                            result.height = int.Parse(splitted[1]);

                            string maxValueLine = tReader.ReadLine(); // skip

                            // The text reader buffers ahead, so the stream position is
                            // recomputed from the header: three lines plus one newline
                            // byte each. This assumes single-byte line endings and no
                            // comment lines in the header.
                            int pos = formatLine.Length + sizeLine.Length + maxValueLine.Length + 3;
                            fs.Seek(pos, SeekOrigin.Begin);

                            // TODO: optimize that part
                            rawData = bReader.ReadBytes((int)(fs.Length - pos));
                        }
                }
            }
            else if (extension.Contains("png"))
            {
                // Dispose the decoded image: it holds GDI resources and keeps the
                // file locked until released.
                using (Image loaded = Image.FromFile(path))
                {
                    Bitmap image = loaded as Bitmap;
                    if (image == null)
                    {
                        throw new NotSupportedException("unsupported file format");
                    }

                    result.width  = image.Width;
                    result.height = image.Height;
                    rawData       = new byte[result.width * result.height];
                    int index = 0;
                    for (int j = 0; j < result.height; ++j)
                    {
                        for (int i = 0; i < result.width; ++i, ++index)
                        {
                            // Only the red channel is kept.
                            rawData[index] = image.GetPixel(i, j).R;
                        }
                    }
                }
            }
            else
            {
                throw new NotSupportedException("unsupported file format");
            }

            int pixelCount = result.width * result.height;
            if (rawData.Length < pixelCount)
            {
                // Fail with a clear message before any device memory is allocated,
                // instead of an index error in the widening loop below.
                throw new InvalidDataException(
                    string.Format("image data is truncated: expected {0} samples, found {1}", pixelCount, rawData.Length));
            }

            int rowBytes = result.width * sizeof(ushort);

            IntPtr deviceData;
            size_t p;

            cuda.ERROR_CHECK(cuda.MallocPitch(out deviceData, out p, rowBytes, result.height));
            result.pitch = (int)p;

            // Widen every 8-bit sample to 16 bits; host rows are tightly packed.
            result.hostData = new ushort[pixelCount];
            for (int k = 0; k < pixelCount; ++k)
            {
                result.hostData[k] = rawData[k];
            }

            var handle = GCHandle.Alloc(result.hostData, GCHandleType.Pinned);
            try
            {
                // Source pitch is rowBytes (packed host rows); destination pitch is
                // the value returned by MallocPitch.
                // NOTE(review): the handle is released right after an *async* copy
                // is enqueued. This relies on the runtime having finished reading
                // the (GC-pinned but not CUDA page-locked) host buffer before the
                // call returns - confirm against the CUDA runtime documentation.
                cuda.ERROR_CHECK(cuda.Memcpy2DAsync(deviceData, p, handle.AddrOfPinnedObject(), rowBytes, rowBytes, result.height, cudaMemcpyKind.cudaMemcpyHostToDevice, stream));
            }
            finally
            {
                handle.Free();
            }
            result.deviceData = deviceData;

            return(result);
        }
Example #3
0
 // External entry point taking an explicit launch configuration (grid and block
 // dimensions, a `shared` value, and a stream) followed by the kernel arguments.
 // NOTE(review): the name looks like a generated wrapper for
 // NPP_ImageSegmentation.Program.ColorizeLabels ("x46" presumably encodes '.');
 // the DllImport attribute and the native implementation are not visible here,
 // so confirm the target library and the meaning of the int return value.
 // NOTE(review): segments/colors/colormap are presumably device pointers and
 // `shared` presumably a dynamic shared-memory size in bytes - verify.
 public static extern int NPP_ImageSegmentationx46Programx46ColorizeLabels_ExternCWrapperStream_CUDA(
     int gridDimX, int gridDimY, int blockDimX, int blockDimY, int blockDimZ, int shared, cudaStream_t stream,
     IntPtr segments, IntPtr colors, IntPtr colormap, int maxLabel, int count, int width, int pitch);
        /// <summary>
        /// Adds <c>b</c> to <c>a</c> on the device using several streams.
        /// The arrays are uploaded once, each stream runs <c>Add</c> on its own
        /// contiguous slice and copies that slice back, and the host then checks
        /// that every element equals its index plus one. The first mismatch is
        /// reported and the process exits with code 6.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        unsafe static void Main(string[] args)
        {
            int nStreams = 8;

            cudaStream_t[] streams = new cudaStream_t[nStreams];
            for (int k = 0; k < nStreams; ++k)
            {
                cuda.ERROR_CHECK(cuda.StreamCreate(out streams[k]));
            }

            int N = 1024 * 1024 * 32;

            float[] a = new float[N];
            float[] b = new float[N];

            for (int k = 0; k < N; ++k)
            {
                a[k] = (float)k;
                b[k] = 1.0F;
            }

            IntPtr d_a, d_b; // device pointers

            // Check every runtime call: an ignored failure here would only show up
            // later as a confusing verification error.
            cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
            cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

            // Pin the managed arrays so the GC cannot move them while their
            // addresses are in use by the copies below.
            GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
            GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
            IntPtr   h_a      = handle_a.AddrOfPinnedObject();
            IntPtr   h_b      = handle_b.AddrOfPinnedObject();

            cuda.ERROR_CHECK(cuda.DeviceSynchronize());

            cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
            cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

            // Number of elements handled by each stream (N is a multiple of nStreams).
            int slice = N / nStreams;

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

            // Launch the kernel on each stream over its own [start, stop) range.
            for (int k = 0; k < nStreams; ++k)
            {
                int start = k * slice;
                int stop  = start + slice;
                wrapped.SetStream(streams[k]).Add(d_a, d_b, start, stop);
            }

            // Queue each slice's copy-back on the stream that computed it, so the
            // copy is ordered after that stream's kernel.
            for (int k = 0; k < nStreams; ++k)
            {
                int byteOffset = k * slice * sizeof(float);
                cuda.ERROR_CHECK(cuda.MemcpyAsync(h_a + byteOffset, d_a + byteOffset, slice * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost, streams[k]));
            }

            // Wait for all queued work before reading the results on the host.
            for (int k = 0; k < nStreams; ++k)
            {
                cuda.ERROR_CHECK(cuda.StreamSynchronize(streams[k]));
                cuda.ERROR_CHECK(cuda.StreamDestroy(streams[k]));
            }

            for (int i = 0; i < N; ++i)
            {
                if (a[i] != (float)i + 1.0F)
                {
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], i + 1);
                    Environment.Exit(6); // abort
                }
            }

            handle_a.Free();
            handle_b.Free();

            // Release the device buffers allocated above.
            cuda.ERROR_CHECK(cuda.Free(d_a));
            cuda.ERROR_CHECK(cuda.Free(d_b));
        }
Example #5
0
        /// <summary>
        /// Multi-stream vector add using one <c>HybRunner</c> per stream.
        /// Each runner is bound to its stream at construction and sized from the
        /// multiprocessor count of device 0. The arrays are uploaded once, every
        /// runner calls <c>Add</c> on its own contiguous slice, the slices are
        /// copied back on the matching streams, and the first ten results are
        /// printed.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        unsafe static void Main(string[] args)
        {
            int nStreams = 8;

            cudaStream_t[] streams = new cudaStream_t[nStreams];
            HybRunner[]    runners = new HybRunner[nStreams];
            dynamic[]      wrapped = new dynamic[nStreams];
            cudaDeviceProp prop;

            // Check every runtime call so a failure is reported where it happens.
            cuda.ERROR_CHECK(cuda.GetDeviceProperties(out prop, 0));
            for (int k = 0; k < nStreams; ++k)
            {
                cuda.ERROR_CHECK(cuda.StreamCreate(out streams[k]));

                // One runner per stream; SetDistrib is given 16 * multiprocessor count and 128.
                runners[k] = HybRunner.Cuda("Streams_CUDA.dll", streams[k], CudaMarshaler.Instance).SetDistrib(16 * prop.multiProcessorCount, 128);
                wrapped[k] = runners[k].Wrap(new Program());
            }

            int    N = 1024 * 1024 * 32;
            IntPtr d_a, d_b; // device pointers

            float[] a = new float[N];
            float[] b = new float[N];

            cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
            cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

            for (int k = 0; k < N; ++k)
            {
                a[k] = (float)k;
                b[k] = 1.0F;
            }

            // Pin the managed arrays so the GC cannot move them while their
            // addresses are in use by the copies below.
            GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
            GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
            IntPtr   h_a      = handle_a.AddrOfPinnedObject();
            IntPtr   h_b      = handle_b.AddrOfPinnedObject();

            // Number of elements handled by each stream (N is a multiple of nStreams).
            int slice = N / nStreams;

            cuda.ERROR_CHECK(cuda.DeviceSynchronize());

            cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
            cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

            for (int k = 0; k < nStreams; ++k)
            {
                int start = k * slice;
                int stop  = start + slice;
                // NOTE(review): the trailing 100 is forwarded to Add; presumably an
                // iteration count - confirm against the kernel definition.
                wrapped[k].Add(d_a, d_b, start, stop, 100);
            }

            // Queue each slice's copy-back on the stream whose runner processed it.
            for (int k = 0; k < nStreams; ++k)
            {
                int byteOffset = k * slice * sizeof(float);
                cuda.ERROR_CHECK(cuda.MemcpyAsync(h_a + byteOffset, d_a + byteOffset, slice * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost, streams[k]));
            }

            // Wait for all queued work before reading the results on the host.
            for (int k = 0; k < nStreams; ++k)
            {
                cuda.ERROR_CHECK(cuda.StreamSynchronize(streams[k]));
                cuda.ERROR_CHECK(cuda.StreamDestroy(streams[k]));
            }

            for (int k = 0; k < 10; ++k)
            {
                Console.WriteLine(a[k]);
            }

            handle_a.Free();
            handle_b.Free();

            // Release the device buffers allocated above.
            cuda.ERROR_CHECK(cuda.Free(d_a));
            cuda.ERROR_CHECK(cuda.Free(d_b));
        }
Example #6
0
 /// <summary>
 /// Forwards <paramref name="stream"/> to <c>cuRAND.SetStream</c> for this
 /// instance's <c>generator</c>.
 /// </summary>
 /// <param name="stream">Stream to associate with the generator.</param>
 public void SetStream(cudaStream_t stream)
 {
     // NOTE(review): any status returned by cuRAND.SetStream is discarded here - confirm that is intended.
     cuRAND.SetStream(generator, stream);
 }