Example #1
0
        // Now a schedule that runs on the GPU (via OpenCL, Metal, or CUDA).
        /// <summary>
        /// Applies a GPU schedule to the pipeline stages (<c>Lut</c>,
        /// <c>Curved</c>, <c>Padded</c>) and JIT-compiles <c>Curved</c> for a
        /// host target with a GPU feature (Metal on OS X, OpenCL elsewhere)
        /// switched on.
        /// </summary>
        public void ScheduleForGpu()
        {
            // GPU placement is chosen per Func. When a CPU stage feeds a
            // GPU stage, Halide inserts the host-to-device copy itself.
            // Nothing in this pipeline benefits from staying on the CPU,
            // so every stage goes to the GPU; the input image is uploaded
            // on the first run and stays resident for later runs.

            // Same as in the CPU schedule: the look-up table is computed
            // once, up front.
            Lut.ComputeRoot();

            // The LUT is evaluated in one-dimensional thread blocks that
            // are 16 threads wide. Step one is to break its index into
            // chunks of 16.
            var blockVar  = new HSVar("block");
            var threadVar = new HSVar("thread");

            Lut.Split(I, blockVar, threadVar, 16);

            // Step two maps the outer Var onto GPU blocks and the inner
            // Var onto GPU threads (CUDA terminology; OpenCL calls them
            // thread groups and threads).
            Lut.GpuBlocks(blockVar)
               .GpuThreads(threadVar);

            // Split-then-map is common enough on the GPU that Halide has
            // a shorthand for it. In the C++ API that is:
            //
            //     lut.gpu_tile(i, block, thread, 16);
            //
            // (Whether this binding exposes a matching one-dimensional
            // GpuTile overload is not verified here.)
            //
            // gpu_tile works like tile, with the addition that the tile
            // coordinates become GPU blocks and the coordinates inside
            // each tile become GPU threads.

            // Make the color channel the innermost loop, promise that it
            // has exactly three values, and unroll over it.
            Curved.Reorder(C, X, Y)
                  .Bound(C, 0, 3)
                  .Unroll(C);

            // Evaluate curved on the GPU in two-dimensional 8x8 tiles.
            // In C++ terms this expands to:
            //
            //     curved.tile(x, y, xo, yo, xi, yi, 8, 8)
            //           .gpu_blocks(xo, yo)
            //           .gpu_threads(xi, yi);
            Curved.GpuTile(X, Y, XO, YO, XI, YI, 8, 8);

            // sharpen gets no schedule of its own; it stays inlined into
            // curved.

            // The padded input is produced on demand for each GPU block
            // (XO is a block dimension in the schedule above), with the
            // intermediate result held in shared memory.
            Padded.ComputeAt(Curved, XO);

            // Spread the padded input's x and y coordinates across the
            // GPU threads.
            Padded.GpuThreads(X, Y);

            // JIT compilation does not enable CUDA, OpenCL, or Metal on
            // its own. A target with one of them switched on has to be
            // built and handed to the JIT; without that, the CPU ends up
            // emulating the GPU, slowly, with one thread per output
            // pixel.

            // Begin from a target describing the machine we are running
            // on.
            var gpuTarget = HS.GetHostTarget();

            // Pick the GPU API by platform: Metal on OS X (its OpenCL
            // drivers are not kept up to date and tend to be broken),
            // OpenCL everywhere else. CUDA is an equally good option on
            // machines with NVidia GPUs.
            var gpuFeature = gpuTarget.OS == HSOperatingSystem.OSX
                ? HSFeature.Metal
                : HSFeature.OpenCL;
            gpuTarget.SetFeature(gpuFeature);

            // To try CUDA instead, replace the feature selection above
            // with the following line.
            //gpuTarget.SetFeature(HSFeature.CUDA);

            // Turning on the Debug feature as well logs every OpenCL,
            // Metal, or CUDA API call the pipeline makes, which helps
            // when hunting for slow stages or for CPU -> GPU copies. It
            // costs performance, so it is left off here. In the C++ API:
            //
            //     target.set_feature(Target::Debug);
            //
            // (The corresponding enum member in this binding is not
            // verified here.)

            Curved.CompileJit(gpuTarget);
        }