/// <summary>
/// Exercise skeleton for the multi-stream sample: fills two host arrays, uploads them
/// to the device, and verifies the content of <c>a</c> afterwards. The stream creation,
/// kernel launches, device-to-host copies and stream teardown are left as TODO steps,
/// so the verification below fails until they are implemented.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
{
    int nStreams = 8;
    cudaStream_t[] streams = new cudaStream_t[nStreams];
    // TODO: create streams

    int N = 1024 * 1024 * 32;
    float[] a = new float[N];
    float[] b = new float[N];
    for (int k = 0; k < N; ++k)
    {
        a[k] = (float)k;
        b[k] = 1.0F;
    }

    IntPtr d_a, d_b; // device pointers
    cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
    cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

    // Pin the managed arrays so their addresses can be handed to the CUDA copy routines.
    GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
    GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
    try
    {
        IntPtr h_a = handle_a.AddrOfPinnedObject();
        IntPtr h_b = handle_b.AddrOfPinnedObject();

        cuda.ERROR_CHECK(cuda.DeviceSynchronize());
        cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
        cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

        int slice = N / nStreams; // size of the array compute by each stream
        dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

        // Reserved for the TODO steps below (per-stream slice bounds).
        int start;
        int stop;

        // TODO: call kernel with each stream

        // TODO: copy data device to host

        // TODO: synchronize and destroy streams

        for (int i = 0; i < N; ++i)
        {
            // Report the value that is actually compared, not a different constant.
            float expected = (float)i + (1.0F * 100.0F);
            if (a[i] != expected)
            {
                Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], expected);
                Environment.Exit(6); // abort
            }
        }

        cuda.ERROR_CHECK(cuda.Free(d_a));
        cuda.ERROR_CHECK(cuda.Free(d_b));
    }
    finally
    {
        handle_a.Free();
        handle_b.Free();
    }
}
/// <summary>
/// Loads a single-channel image from a ".pgm" or ".png" file, widens every pixel to
/// 16 bits into <c>hostData</c>, allocates a pitched device buffer and enqueues the
/// host-to-device copy on <paramref name="stream"/>.
/// </summary>
/// <param name="path">Path of the image file; the extension selects the decoder.</param>
/// <param name="stream">CUDA stream on which the 2D upload is enqueued.</param>
/// <returns>An <c>NPPImage</c> with width, height, pitch, hostData and deviceData set.</returns>
/// <exception cref="NotSupportedException">The extension is neither pgm nor png, or the png could not be decoded as a bitmap.</exception>
/// <exception cref="InvalidDataException">The pgm header is truncated or the file holds fewer pixels than width * height.</exception>
public static NPPImage Load(string path, cudaStream_t stream)
{
    NPPImage result = new NPPImage();
    byte[] rawData;
    string extension = Path.GetExtension(path);

    if (extension.IndexOf("pgm", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        // Read-only access: the file is never written, and this also allows read-only files.
        using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
        using (TextReader tReader = new StreamReader(fs))
        using (BinaryReader bReader = new BinaryReader(fs))
        {
            string formatLine = tReader.ReadLine();   // skip
            string sizeLine = tReader.ReadLine();
            string maxValueLine = tReader.ReadLine(); // skip
            if (formatLine == null || sizeLine == null || maxValueLine == null)
            {
                throw new InvalidDataException("truncated pgm header");
            }

            string[] splitted = sizeLine.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitted.Length < 2)
            {
                throw new InvalidDataException("invalid pgm size line");
            }
            result.width = int.Parse(splitted[0]);
            result.height = int.Parse(splitted[1]);

            // Pixel data starts after the three header lines; the "+ 3" accounts for one
            // line-terminator character per line (comment lines are not handled).
            int pos = formatLine.Length + sizeLine.Length + maxValueLine.Length + 3;
            fs.Seek(pos, SeekOrigin.Begin);
            // TODO: optimize that part
            rawData = bReader.ReadBytes((int)(fs.Length - pos));
        }
    }
    else if (extension.IndexOf("png", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        // Dispose the bitmap so the GDI+ resources and the file lock are released.
        using (Bitmap image = Bitmap.FromFile(path) as Bitmap)
        {
            if (image == null)
            {
                throw new NotSupportedException("unsupported file format");
            }

            result.width = image.Width;
            result.height = image.Height;
            rawData = new byte[result.width * result.height];
            int index = 0;
            for (int j = 0; j < result.height; ++j)
            {
                for (int i = 0; i < result.width; ++i, ++index)
                {
                    // Only the red channel is kept.
                    rawData[index] = image.GetPixel(i, j).R;
                }
            }
        }
    }
    else
    {
        throw new NotSupportedException("unsupported file format");
    }

    // Validate before touching the device so a bad file cannot leak a device allocation.
    int pixelCount = result.width * result.height;
    if (rawData.Length < pixelCount)
    {
        throw new InvalidDataException("image file contains fewer pixels than width * height");
    }

    // Widen the 8-bit samples to the 16-bit host buffer (row-major, tightly packed).
    result.hostData = new ushort[pixelCount];
    for (int k = 0; k < pixelCount; ++k)
    {
        result.hostData[k] = rawData[k];
    }

    IntPtr deviceData;
    size_t p;
    cuda.ERROR_CHECK(cuda.MallocPitch(out deviceData, out p, result.width * sizeof(ushort), result.height));
    result.pitch = (int)p;

    var handle = GCHandle.Alloc(result.hostData, GCHandleType.Pinned);
    try
    {
        // Source rows are tightly packed (width * sizeof(ushort)); destination rows use the device pitch.
        cuda.ERROR_CHECK(cuda.Memcpy2DAsync(deviceData, p, handle.AddrOfPinnedObject(), result.width * sizeof(ushort), result.width * sizeof(ushort), result.height, cudaMemcpyKind.cudaMemcpyHostToDevice, stream));
    }
    finally
    {
        // NOTE(review): the array is unpinned as soon as the async copy has been enqueued;
        // this relies on the runtime having consumed the host buffer by the time the call
        // returns -- confirm, or synchronize `stream` before unpinning.
        handle.Free();
    }

    result.deviceData = deviceData;
    return result;
}
// Extern declaration (no managed body) of a native entry point; presumably bound through a
// DllImport attribute that is not visible in this chunk -- TODO confirm.
// NOTE(review): the name looks machine-generated ("x46" presumably encodes '.', i.e.
// NPP_ImageSegmentation.Program.ColorizeLabels, stream-taking CUDA wrapper) -- confirm.
// Parameters, in order:
//   gridDimX, gridDimY            - grid dimensions of the launch.
//   blockDimX, blockDimY, blockDimZ - block dimensions of the launch.
//   shared                        - presumably the dynamic shared-memory size in bytes; verify.
//   stream                        - CUDA stream passed to the native wrapper.
//   segments, colors, colormap    - raw pointers forwarded to the native code
//                                   (presumably device buffers; verify against callers).
//   maxLabel, count, width, pitch - integer arguments forwarded unchanged; their units
//                                   (elements vs bytes) are not visible here.
// Returns an int; NOTE(review): presumably a status code -- confirm against the native side.
public static extern int NPP_ImageSegmentationx46Programx46ColorizeLabels_ExternCWrapperStream_CUDA( int gridDimX, int gridDimY, int blockDimX, int blockDimY, int blockDimZ, int shared, cudaStream_t stream, IntPtr segments, IntPtr colors, IntPtr colormap, int maxLabel, int count, int width, int pitch);
/// <summary>
/// Multi-stream sample: uploads two arrays, launches the wrapped <c>Add</c> kernel once
/// per stream on a distinct slice of the data, copies each slice back on the same
/// stream, then verifies that every element of <c>a</c> was incremented by one.
/// Exits with code 6 on the first mismatch.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
{
    int nStreams = 8;
    cudaStream_t[] streams = new cudaStream_t[nStreams];
    for (int k = 0; k < nStreams; ++k)
    {
        cuda.ERROR_CHECK(cuda.StreamCreate(out streams[k]));
    }

    int N = 1024 * 1024 * 32;
    float[] a = new float[N];
    float[] b = new float[N];
    for (int k = 0; k < N; ++k)
    {
        a[k] = (float)k;
        b[k] = 1.0F;
    }

    IntPtr d_a, d_b; // device pointers
    cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
    cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

    // Pin the managed arrays so their addresses stay valid for the CUDA copy routines.
    GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
    GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
    try
    {
        IntPtr h_a = handle_a.AddrOfPinnedObject();
        IntPtr h_b = handle_b.AddrOfPinnedObject();

        cuda.ERROR_CHECK(cuda.DeviceSynchronize());
        cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
        cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

        // Number of elements handled by each stream (N is a multiple of nStreams here).
        int slice = N / nStreams;
        dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

        // One kernel launch per stream, each over its own [start, stop) range.
        for (int k = 0; k < nStreams; ++k)
        {
            int start = k * slice;
            int stop = start + slice;
            wrapped.SetStream(streams[k]).Add(d_a, d_b, start, stop);
        }

        // Copy each slice back on the stream that computed it, so the copy is ordered
        // after that stream's kernel.
        for (int k = 0; k < nStreams; ++k)
        {
            int byteOffset = k * slice * sizeof(float);
            cuda.ERROR_CHECK(cuda.MemcpyAsync(h_a + byteOffset, d_a + byteOffset, slice * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost, streams[k]));
        }

        // Wait for every stream before reading the host array, then release the stream.
        for (int k = 0; k < nStreams; ++k)
        {
            cuda.ERROR_CHECK(cuda.StreamSynchronize(streams[k]));
            cuda.ERROR_CHECK(cuda.StreamDestroy(streams[k]));
        }

        for (int i = 0; i < N; ++i)
        {
            // Report the float value that is actually compared.
            float expected = (float)i + 1.0F;
            if (a[i] != expected)
            {
                Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], expected);
                Environment.Exit(6); // abort
            }
        }

        cuda.ERROR_CHECK(cuda.Free(d_a));
        cuda.ERROR_CHECK(cuda.Free(d_b));
    }
    finally
    {
        handle_a.Free();
        handle_b.Free();
    }
}
/// <summary>
/// Multi-stream sample using one <c>HybRunner</c> per stream: each runner is bound to
/// its own CUDA stream and launches <c>Add</c> on a distinct slice of the data; the
/// slices are copied back on the matching streams and the first ten results are printed.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
{
    int nStreams = 8;
    cudaStream_t[] streams = new cudaStream_t[nStreams];
    HybRunner[] runners = new HybRunner[nStreams];
    dynamic[] wrapped = new dynamic[nStreams];

    cudaDeviceProp prop;
    cuda.ERROR_CHECK(cuda.GetDeviceProperties(out prop, 0));

    for (int k = 0; k < nStreams; ++k)
    {
        cuda.ERROR_CHECK(cuda.StreamCreate(out streams[k]));
        // Launch configuration is derived from the multiprocessor count of device 0.
        runners[k] = HybRunner.Cuda("Streams_CUDA.dll", streams[k], CudaMarshaler.Instance).SetDistrib(16 * prop.multiProcessorCount, 128);
        wrapped[k] = runners[k].Wrap(new Program());
    }

    int N = 1024 * 1024 * 32;
    IntPtr d_a, d_b; // device pointers
    float[] a = new float[N];
    float[] b = new float[N];

    cuda.ERROR_CHECK(cuda.Malloc(out d_a, N * sizeof(float)));
    cuda.ERROR_CHECK(cuda.Malloc(out d_b, N * sizeof(float)));

    for (int k = 0; k < N; ++k)
    {
        a[k] = (float)k;
        b[k] = 1.0F;
    }

    // Pin the managed arrays so their addresses stay valid for the CUDA copy routines.
    GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
    GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
    try
    {
        IntPtr h_a = handle_a.AddrOfPinnedObject();
        IntPtr h_b = handle_b.AddrOfPinnedObject();

        // Number of elements handled by each stream (N is a multiple of nStreams here).
        int slice = N / nStreams;

        cuda.ERROR_CHECK(cuda.DeviceSynchronize());
        cuda.ERROR_CHECK(cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));
        cuda.ERROR_CHECK(cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice));

        // One launch per stream-bound runner, each over its own [start, stop) range.
        // NOTE(review): the trailing 100 is forwarded to Add; its meaning is defined by the
        // kernel, which is not visible here.
        for (int k = 0; k < nStreams; ++k)
        {
            int start = k * slice;
            int stop = start + slice;
            wrapped[k].Add(d_a, d_b, start, stop, 100);
        }

        // Copy each slice back on the stream that computed it.
        for (int k = 0; k < nStreams; ++k)
        {
            int byteOffset = k * slice * sizeof(float);
            cuda.ERROR_CHECK(cuda.MemcpyAsync(h_a + byteOffset, d_a + byteOffset, slice * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost, streams[k]));
        }

        // Wait for every stream before reading the host array, then release the stream.
        for (int k = 0; k < nStreams; ++k)
        {
            cuda.ERROR_CHECK(cuda.StreamSynchronize(streams[k]));
            cuda.ERROR_CHECK(cuda.StreamDestroy(streams[k]));
        }

        for (int k = 0; k < 10; ++k)
        {
            Console.WriteLine(a[k]);
        }

        cuda.ERROR_CHECK(cuda.Free(d_a));
        cuda.ERROR_CHECK(cuda.Free(d_b));
    }
    finally
    {
        handle_a.Free();
        handle_b.Free();
    }
}
/// <summary>
/// Forwards <paramref name="stream"/>, together with <c>generator</c>, to
/// <c>cuRAND.SetStream</c>.
/// </summary>
/// <param name="stream">CUDA stream handed to cuRAND for the generator.</param>
public void SetStream(cudaStream_t stream)
{
    // Any value returned by cuRAND.SetStream is not inspected here.
    cuRAND.SetStream(generator, stream);
}