// GPU schedule for the pipeline: Metal on OS X, OpenCL everywhere else
// (CUDA can be swapped in by hand, see the note near the end).
public void ScheduleForGpu() {
    // Whether a Func runs on the CPU or the GPU is decided Func by Func.
    // When a CPU-computed Func feeds a GPU-computed one, Halide performs
    // the copy to the GPU behind the scenes. Nothing in this pipeline
    // benefits from staying on the CPU, so every stage goes to the GPU.
    // The input image is copied to the GPU on the first run and is left
    // there so later runs can reuse it.

    // Same as in the CPU schedule: the look-up table is computed once,
    // at the start of the pipeline.
    Lut.ComputeRoot();

    // The LUT is evaluated on the GPU in one-dimensional thread blocks
    // that are 16 wide. Step one is splitting its index into an outer
    // "block" part and an inner "thread" part of size 16.
    var lutBlock = new HSVar("block");
    var lutThread = new HSVar("thread");
    Lut.Split(I, lutBlock, lutThread, 16);

    // Step two is declaring that those two Vars map onto CUDA blocks and
    // threads (thread groups and threads in OpenCL terminology).
    Lut.GpuBlocks(lutBlock)
       .GpuThreads(lutThread);

    // Split-then-map is so common on the GPU that Halide's C++ API has a
    // shorthand for it:
    //
    //   lut.gpu_tile(i, block, thread, 16);
    //
    // Func::gpu_tile acts like Func::tile, and additionally marks the
    // tile coordinates as GPU blocks and the coordinates inside each
    // tile as GPU threads.

    // Make the color channel the innermost loop, promise that there are
    // exactly three channels, and unroll over them.
    Curved.Reorder(C, X, Y)
          .Bound(C, 0, 3)
          .Unroll(C);

    // Evaluate curved on the GPU in two-dimensional 8x8 tiles. In C++
    // terms this one call stands for:
    //
    //   curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //         .gpu_blocks(xo, yo)
    //         .gpu_threads(xi, yi);
    Curved.GpuTile(X, Y, XO, YO, XI, YI, 8, 8);

    // Sharpen gets no schedule of its own, so it stays inlined into
    // curved.

    // The padded input is produced on demand for each GPU block, with
    // the intermediate result held in shared memory. XO is the Var that
    // the tiling above mapped to GPU blocks.
    Padded.ComputeAt(Curved, XO);

    // Spread the x and y coordinates of the padded input across the GPU
    // threads.
    Padded.GpuThreads(X, Y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL and Metal are
    // all disabled by default: a target has to be built, one of them has
    // to be switched on, and that target has to be handed to the JIT
    // compiler. Without this, the CPU very slowly pretends to be a GPU
    // and uses one thread per output pixel.

    // Begin with a target that describes the machine we are running on.
    var gpuTarget = HS.GetHostTarget();

    // Pick the GPU API by platform. OS X does not update its OpenCL
    // drivers, so they tend to be broken and Metal is used there;
    // everything else gets OpenCL. CUDA would be an equally good choice
    // on machines with NVidia GPUs.
    var gpuApi = gpuTarget.OS == HSOperatingSystem.OSX
        ? HSFeature.Metal
        : HSFeature.OpenCL;
    gpuTarget.SetFeature(gpuApi);

    // To try CUDA instead, remove the feature selection above and use:
    //
    //   gpuTarget.SetFeature(HSFeature.CUDA);

    // Enabling the Debug target feature as well (Target::Debug in
    // Halide's C++ API) makes the pipeline report every OpenCL, Metal or
    // CUDA API call it performs. That helps with finding slow stages and
    // spotting when CPU -> GPU copies occur, but it costs performance,
    // so it is left off here.

    Curved.CompileJit(gpuTarget);
}