Example #1
        public static int Main(string[] args)
        {
            // We'll start by defining the simple single-stage imaging
            // pipeline from lesson 1.

            // This lesson will be about debugging, but unfortunately in C#,
            // objects don't know their own names, which makes it hard for us
            // to understand the generated code. To get around this, you can
            // pass a string to the HSFunc and HSVar constructors to give them
            // a name for debugging purposes.
            var gradient = new HSFunc("gradient");
            var x        = new HSVar("x");
            var y        = new HSVar("y");

            gradient[x, y] = x + y;

            // Realize the function to produce an output image. We'll keep it
            // very small for this lesson.
            var output = gradient.Realize <int>(8, 8);

            // That line compiled and ran the pipeline. Try running this
            // lesson with the environment variable HL_DEBUG_CODEGEN set to
            // 1. It will print out the various stages of compilation, and a
            // pseudocode representation of the final pipeline.

            // If you set HL_DEBUG_CODEGEN to a higher number, you can see
            // more and more details of how Halide compiles your pipeline.
            // Setting HL_DEBUG_CODEGEN=2 shows the Halide code at each stage
            // of compilation, and also the llvm bitcode we generate at the
            // end.
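
            // For example, from a bash-like shell you might launch this
            // lesson as (hypothetical command; substitute however you
            // normally run this program):
            //
            //   HL_DEBUG_CODEGEN=1 dotnet run
            //
            // or export the variable once before running.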

            // Halide will also output an HTML version of this output, which
            // supports syntax highlighting and code-folding, so it can be
            // nicer to read for large pipelines. Open gradient.html with your
            // browser after running this tutorial.
            gradient.CompileToLoweredStmt("gradient.html", HSOutputFormat.HS_HTML);

            // You can usually figure out what code Halide is generating using
            // this pseudocode. In the next lesson we'll see how to snoop on
            // Halide at runtime.
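
            // As a rough sketch (the exact formatting varies between Halide
            // versions), the lowered pseudocode for this pipeline looks
            // something like:
            //
            //   produce gradient:
            //     for y:
            //       for x:
            //         gradient(x, y) = x + y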

            Console.WriteLine("Success!");
            return(0);
        }
Example #2
        public static void Main()
        {
            // Now make a small program that will trigger an error
            var buffer = new HSBuffer <int>(2, 2);

            for (var j = 0; j < 2; j++)
            {
                for (var i = 0; i < 2; i++)
                {
                    buffer[j, i] = i * j;
                }
            }
            var errilicious = new HSFunc("EISFORERROR");
            var x           = new HSVar("x");
            var y           = new HSVar("y");

            errilicious[x, y] = buffer[x, y] * buffer[x, y];

            // Now realize over a domain that is larger than the buffer, which should trigger
            // an error
            var result = errilicious.Realize <int>(100, 100);
        }
Example #3
        public static int Main(string[] args)
        {
            // First we'll declare some Vars to use below.
            var x = new HSVar("x");
            var y = new HSVar("y");
            var c = new HSVar("c");

            // Now we'll express a multi-stage pipeline that blurs an image
            // first horizontally, and then vertically.
            {
                // Take a color 8-bit input
                var input = HSBuffer <byte> .LoadImage("rgb.png");

                // Upgrade it to 16-bit, so we can do math without it overflowing.
                var input_16 = new HSFunc("input_16");
                input_16[x, y, c] = HS.Cast <ushort>(input[x, y, c]);

                // Blur it horizontally:
                var blur_x = new HSFunc("blur_x");
                blur_x[x, y, c] = (input_16[x - 1, y, c] +
                                   2 * input_16[x, y, c] +
                                   input_16[x + 1, y, c]) / 4;

                // Blur it vertically:
                var blur_y = new HSFunc("blur_y");
                blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                                   2 * blur_x[x, y, c] +
                                   blur_x[x, y + 1, c]) / 4;

                // Convert back to 8-bit.
                var output = new HSFunc("output");
                output[x, y, c] = HS.Cast <byte>(blur_y[x, y, c]);

                // Each Func in this pipeline calls a previous one using
                // familiar indexer syntax (we've overloaded the indexer on
                // Func objects). A Func may call any other Func that has
                // been given a definition. This restriction prevents
                // pipelines with loops in them. Halide pipelines are always
                // feed-forward graphs of Funcs.

                // Now let's realize it...

                // var result = output.Realize <byte>(input.Width, input.Height, 3);

                // Except that the line above is not going to work. Uncomment
                // it to see what happens.

                // Realizing this pipeline over the same domain as the input
                // image requires reading pixels out of bounds in the input,
                // because the blur_x stage reaches outwards horizontally, and
                // the blur_y stage reaches outwards vertically. Halide
                // detects this by injecting a piece of code at the top of the
                // pipeline that computes the region over which the input will
                // be read. When it starts to run the pipeline it first runs
                // this code, determines that the input will be read out of
                // bounds, and refuses to continue. No actual bounds checks
                // occur in the inner loop; that would be slow.
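                //
                // Concretely: blur_x reads input at x-1..x+1 and blur_y reads
                // blur_x at y-1..y+1, so realizing the output over the full
                // input.Width x input.Height domain would need input pixels
                // one beyond the image on every side.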
                //
                // So what do we do? There are a few options. If we realize
                // over a domain shifted inwards by one pixel, we won't be
                // asking the Halide routine to read out of bounds. We saw how
                // to do this in the previous lesson:
                var result = new HSBuffer <byte>(input.Width - 2, input.Height - 2, 3);
                result.SetMin(1, 1);
                output.Realize(result);

                // Save the result. It should look like a slightly blurry
                // parrot, and it should be two pixels narrower and two pixels
                // shorter than the input image.
                result.SaveImage("blurry_parrot_1.png");

                // This is usually the fastest way to deal with boundaries:
                // don't write code that reads out of bounds :) The more
                // general solution is our next example.
            }

            // The same pipeline, with a boundary condition on the input.
            {
                // Take a color 8-bit input
                var input = HSBuffer <byte> .LoadImage("rgb.png");

                // This time, we'll wrap the input in a Func that prevents
                // reading out of bounds:
                var clamped = new HSFunc("clamped");

                // Define an expression that clamps x to lie within the
                // range [0, input.width()-1].
                var clamped_x = HS.Clamp(x, 0, input.Width - 1);
                // clamp(x, a, b) is equivalent to max(min(x, b), a).

                // Similarly clamp y.
                var clamped_y = HS.Clamp(y, 0, input.Height - 1);
                // Load from input at the clamped coordinates. This means that
                // no matter how we evaluate the Func 'clamped', we'll never
                // read out of bounds on the input. This is a clamp-to-edge
                // style boundary condition, and is the simplest boundary
                // condition to express in Halide.
                clamped[x, y, c] = input[clamped_x, clamped_y, c];

                // Defining 'clamped' in that way can be done more concisely
                // using a helper function from Halide's BoundaryConditions
                // namespace, which in the C++ API looks like:
                //
                //   clamped = BoundaryConditions::repeat_edge(input);
                //
                // These are important to use for other boundary conditions,
                // because they are expressed in the way that Halide can best
                // understand and optimize. When used correctly they are as
                // cheap as having no boundary condition at all.

                // Upgrade it to 16-bit, so we can do math without it
                // overflowing. This time we'll refer to our new Func
                // 'clamped', instead of referring to the input image
                // directly.
                var input_16 = new HSFunc("input_16");
                input_16[x, y, c] = HS.Cast <ushort>(clamped[x, y, c]);

                // The rest of the pipeline will be the same...

                // Blur it horizontally:
                var blur_x = new HSFunc("blur_x");
                blur_x[x, y, c] = (input_16[x - 1, y, c] +
                                   2 * input_16[x, y, c] +
                                   input_16[x + 1, y, c]) / 4;

                // Blur it vertically:
                var blur_y = new HSFunc("blur_y");
                blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                                   2 * blur_x[x, y, c] +
                                   blur_x[x, y + 1, c]) / 4;

                // Convert back to 8-bit.
                var output = new HSFunc("output");
                output[x, y, c] = HS.Cast <byte>(blur_y[x, y, c]);

                // This time it's safe to evaluate the output over the same
                // domain as the input, because we have a boundary condition.
                var result = output.Realize <byte>(input.Width, input.Height, 3);

                // Save the result. It should look like a slightly blurry
                // parrot, but this time it will be the same size as the
                // input.
                result.SaveImage("blurry_parrot_2.png");
            }

            Console.WriteLine("Success!");
            return(0);
        }
Example #4
        public static int Main(string[] args)
        {
            // First we'll declare some Vars to use below.
            var x  = new HSVar("x");
            var y  = new HSVar("y");
            var xo = new HSVar("xo");
            var yo = new HSVar("yo");
            var xi = new HSVar("xi");
            var yi = new HSVar("yi");

            // This lesson will be about "wrapping" a Func or an ImageParam using the
            // Func::in and ImageParam::in directives
            {
                {
                    // Consider a simple two-stage pipeline:
                    var f = new HSFunc("f_local");
                    var g = new HSFunc("g_local");
                    f[x, y] = x + y;
                    g[x, y] = 2 * f[x, y] + 3;

                    f.ComputeRoot();

                    // This produces the following loop nests:
                    // for y:
                    //   for x:
                    //     f(x, y) = x + y
                    // for y:
                    //   for x:
                    //     g(x, y) = 2 * f(x, y) + 3

                    // Using Func::in, we can interpose a new Func in between f
                    // and g using the schedule alone:
                    HSFunc f_in_g = f.In(g);
                    f_in_g.ComputeRoot();

                    // Equivalently, we could also chain the schedules like so:
                    // f.In(g).ComputeRoot();

                    // This produces the following three loop nests:
                    // for y:
                    //   for x:
                    //     f(x, y) = x + y
                    // for y:
                    //   for x:
                    //     f_in_g(x, y) = f(x, y)
                    // for y:
                    //   for x:
                    //     g(x, y) = 2 * f_in_g(x, y) + 3

                    g.Realize <int>(5, 5);

                    // See figures/lesson_19_wrapper_local.mp4 for a visualization.
                }

                // The schedule directive f.In(g) replaces all calls to 'f'
                // inside 'g' with a wrapper Func and then returns that
                // wrapper. Essentially, it rewrites the original pipeline
                // above into the following:
                {
                    var f_in_g = new HSFunc("f_in_g");
                    var f      = new HSFunc("f");
                    var g      = new HSFunc("g");
                    f[x, y]      = x + y;
                    f_in_g[x, y] = f[x, y];
                    g[x, y]      = 2 * f_in_g[x, y] + 3;

                    f.ComputeRoot();
                    f_in_g.ComputeRoot();
                    g.ComputeRoot();
                }

                // In isolation, such a transformation seems pointless, but it
                // can be used for a variety of scheduling tricks.
            }

            {
                // In the schedule above, only the calls to 'f' made by 'g'
                // are replaced. Other calls made to f would still call 'f'
                // directly. If we wish to globally replace all calls to 'f'
                // with a single wrapper, we simply say f.In().

                // Consider a three stage pipeline, with two consumers of f:
                var f = new HSFunc("f_global");
                var g = new HSFunc("g_global");
                var h = new HSFunc("h_global");
                f[x, y] = x + y;
                g[x, y] = 2 * f[x, y];
                h[x, y] = 3 + g[x, y] - f[x, y];
                f.ComputeRoot();
                g.ComputeRoot();
                h.ComputeRoot();

                // We will replace all calls to 'f' inside both 'g' and 'h'
                // with calls to a single wrapper:
                f.In().ComputeRoot();

                // The equivalent loop nests are:
                // for y:
                //   for x:
                //     f(x, y) = x + y
                // for y:
                //   for x:
                //     f_in(x, y) = f(x, y)
                // for y:
                //   for x:
                //     g(x, y) = 2 * f_in(x, y)
                // for y:
                //   for x:
                //     h(x, y) = 3 + g(x, y) - f_in(x, y)

                h.Realize <int>(5, 5);

                // See figures/lesson_19_wrapper_global.mp4 for a
                // visualization of what this did.
            }

            {
                // We could also give g and h their own unique wrappers of
                // f. This time we'll schedule them each inside the loop nests
                // of the consumer, which is not something we could do with a
                // single global wrapper.

                var f = new HSFunc("f_unique");
                var g = new HSFunc("g_unique");
                var h = new HSFunc("h_unique");
                f[x, y] = x + y;
                g[x, y] = 2 * f[x, y];
                h[x, y] = 3 + g[x, y] - f[x, y];

                f.ComputeRoot();
                g.ComputeRoot();
                h.ComputeRoot();

                f.In(g).ComputeAt(g, y);
                f.In(h).ComputeAt(h, y);

                // This creates the loop nests:
                // for y:
                //   for x:
                //     f(x, y) = x + y
                // for y:
                //   for x:
                //     f_in_g(x, y) = f(x, y)
                //   for x:
                //     g(x, y) = 2 * f_in_g(x, y)
                // for y:
                //   for x:
                //     f_in_h(x, y) = f(x, y)
                //   for x:
                //     h(x, y) = 3 + g(x, y) - f_in_h(x, y)

                h.Realize <int>(5, 5);
                // See figures/lesson_19_wrapper_unique.mp4 for a visualization.
            }

            {
                // So far this may seem like a lot of pointless copying of
                // memory. Func::in can be combined with other scheduling
                // directives for a variety of purposes. The first we will
                // examine is creating distinct realizations of a Func for
                // several consumers and scheduling each differently.

                // We'll start with nearly the same pipeline.
                var f = new HSFunc("f_sched");
                var g = new HSFunc("g_sched");
                var h = new HSFunc("h_sched");
                f[x, y] = x + y;
                g[x, y] = 2 * f[x, y];
                // h will use a far-away region of f
                h[x, y] = 3 + g[x, y] - f[x + 93, y - 87];

                // This time we'll inline f.
                // f.ComputeRoot();
                g.ComputeRoot();
                h.ComputeRoot();

                f.In(g).ComputeAt(g, y);
                f.In(h).ComputeAt(h, y);

                // g and h now call f via distinct wrappers. The wrappers are
                // scheduled, but f is not, which means that f is inlined into
                // its two wrappers. They will each independently compute the
                // region of f required by their consumer. If we had scheduled
                // f ComputeRoot, we'd be computing the bounding box of the
                // region required by g and the region required by h, which
                // would mostly be unused data.
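                //
                // As a rough illustration (for the 8x8 realization below):
                // g needs f over roughly an 8x8 region near the origin, while
                // h needs f over an 8x8 region offset by (93, -87). A single
                // ComputeRoot'd f would have to cover the bounding box of
                // both, roughly 101 x 95 points, of which only about 128 are
                // actually used.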

                // We can also schedule each of these wrappers
                // differently. For scheduling purposes, wrappers inherit the
                // pure vars of the Func they wrap, so we use the same x and y
                // that we used when defining f:
                f.In(g).Vectorize(x, 4);
                f.In(h).Split(x, xo, xi, 2).Reorder(xo, xi);

                // Note that calling f.In(g) a second time returns the wrapper
                // already created by the first call; it doesn't make a new one.
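                //
                // That means the two wrapper schedules above could equivalently
                // be written as single chains (assuming the scheduling calls
                // chain as they do elsewhere in these examples), e.g.:
                //
                //   f.In(g).ComputeAt(g, y).Vectorize(x, 4);
                //   f.In(h).ComputeAt(h, y).Split(x, xo, xi, 2).Reorder(xo, xi);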

                h.Realize <int>(8, 8);
                // See figures/lesson_19_wrapper_vary_schedule.mp4 for a
                // visualization.

                // Note that because f is inlined into its two wrappers, it is
                // the wrappers that do the work of computing f, rather than
                // just loading from an existing computed realization.
            }

            {
                // Func::in is useful to stage loads from a Func via some
                // smaller intermediate buffer, perhaps on the stack or in
                // shared GPU memory.

                // Consider a pipeline that transposes some ComputeRoot'd Func:

                var f = new HSFunc("f_transpose");
                var g = new HSFunc("g_transpose");
                f[x, y] = HSMath.Sin(((x + y) * HSMath.Sqrt(y)) / 10);
                f.ComputeRoot();

                g[x, y] = f[y, x];

                // The execution strategy we want is to load a 4x4 tile of f
                // into registers, transpose it in-register, and then write it
                // out as a 4x4 tile of g. We will use Func::in to express this:

                HSFunc f_tile = f.In(g);

                // We now have a three stage pipeline:
                // f -> f_tile -> g

                // f_tile will load vectors of f, and store them transposed
                // into registers. g will then write this data back to main
                // memory.
                g.Tile(x, y, xo, yo, xi, yi, 4, 4)
                .Vectorize(xi)
                .Unroll(yi);

                // We will compute f_tile at tiles of g, and use
                // Func::reorder_storage to state that f_tile should be
                // stored column-major, so that the loads to it done by g can
                // be dense vector loads.
                f_tile.ComputeAt(g, xo)
                .ReorderStorage(y, x)
                .Vectorize(x)
                .Unroll(y);

                // We take care to make sure f_tile is only ever accessed
                // at constant indices. The full unrolling/vectorization of
                // all loops that exist inside its compute_at level has this
                // effect. Allocations that are only ever accessed at constant
                // indices can be promoted into registers.

                g.Realize <float>(16, 16);
                // See figures/lesson_19_transpose.mp4 for a visualization
            }

            {
                // ImageParam::in behaves the same way as Func::in, and you
                // can use it to stage loads in similar ways. Instead of
                // transposing again, we'll use ImageParam::in to stage tiles
                // of an input image into GPU shared memory, effectively using
                // shared/local memory as an explicitly-managed cache.

                var img = new HSImageParam <int>(2);

                // We will compute a small blur of the input.
                var blur = new HSFunc("blur");
                blur[x, y] = (img[x - 1, y - 1] + img[x, y - 1] + img[x + 1, y - 1] +
                              img[x - 1, y] + img[x, y] + img[x + 1, y] +
                              img[x - 1, y + 1] + img[x, y + 1] + img[x + 1, y + 1]);

                blur.ComputeRoot().GpuTile(x, y, xo, yo, xi, yi, 8, 8);

                // The wrapper Func created by ImageParam::in has pure vars
                // named _0, _1, etc. Schedule it per tile of "blur", and map
                // _0 and _1 to gpu threads.
                img.In(blur).ComputeAt(blur, xo).GpuThreads(HS._0, HS._1);

                // Without Func::in, computing an 8x8 tile of blur would do
                // 8*8*9 loads to global memory. With Func::in, the wrapper
                // does 10*10 loads to global memory up front, and then blur
                // does 8*8*9 loads to shared/local memory.
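                //
                // Working the numbers: 8*8*9 = 576 global loads per tile
                // without staging, versus 10*10 = 100 global loads with
                // staging (an 8x8 tile plus the one-pixel halo the 3x3
                // stencil needs on each side); the remaining reads are then
                // served from the much faster shared/local memory.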

                // Select an appropriate GPU API, as we did in lesson 12
                var target = HS.GetHostTarget();
                if (target.OS == HSOperatingSystem.OSX)
                {
                    target.SetFeature(HSFeature.Metal);
                }
                else
                {
                    target.SetFeature(HSFeature.OpenCL);
                }

                // Create an interesting input image to use.
                var input = new HSBuffer <int>(258, 258);
                input.SetMin(-1, -1);
                for (int yy = input.Top; yy <= input.Bottom; yy++)
                {
                    for (int xx = input.Left; xx <= input.Right; xx++)
                    {
                        input[xx, yy] = xx * 17 + yy % 4;
                    }
                }

                img.Set(input);
                blur.CompileJit(target);
                var output = blur.Realize <int>(256, 256);

                // Check the output is what we expected
                for (int yy = output.Top; yy <= output.Bottom; yy++)
                {
                    for (int xx = output.Left; xx <= output.Right; xx++)
                    {
                        int val      = output[xx, yy];
                        int expected = (input[xx - 1, yy - 1] + input[xx, yy - 1] + input[xx + 1, yy - 1] +
                                        input[xx - 1, yy] + input[xx, yy] + input[xx + 1, yy] +
                                        input[xx - 1, yy + 1] + input[xx, yy + 1] + input[xx + 1, yy + 1]);
                        if (val != expected)
                        {
                            Console.WriteLine($"output({xx}, {yy}) = {val} instead of {expected}\n",
                                              xx, yy, val, expected);
                            return(-1);
                        }
                    }
                }
            }

            {
                // Func::in can also be used to group multiple stages of a
                // Func into the same loop nest. Consider the following
                // pipeline, which computes a value per pixel, then sweeps
                // from left to right and back across each scanline.
                var f = new HSFunc("f_group");
                var g = new HSFunc("g_group");
                var h = new HSFunc("h_group");

                // Initialize f
                f[x, y] = HSMath.Sin(x - y);
                var r = new HSRDom(1, 7);

                // Sweep from left to right
                f[r, y] = (f[r, y] + f[r - 1, y]) / 2;

                // Sweep from right to left
                f[7 - r, y] = (f[7 - r, y] + f[8 - r, y]) / 2;

                // Then we do something with a complicated access pattern: A
                // 45 degree rotation with wrap-around
                g[x, y] = f[(x + y) % 8, (x - y) % 8];

                // f should be scheduled ComputeRoot, because its consumer
                // accesses it in a complicated way. But that means all stages
                // of f are computed in separate loop nests:

                // for y:
                //   for x:
                //     f(x, y) = sin(x - y)
                // for y:
                //   for r:
                //     f(r, y) = (f(r, y) + f(r - 1, y)) / 2
                // for y:
                //   for r:
                //     f(7 - r, y) = (f(7 - r, y) + f(8 - r, y)) / 2
                // for y:
                //   for x:
                //     g(x, y) = f((x + y) % 8, (x - y) % 8);

                // We can get better locality if we schedule the work done by
                // f to share a common loop over y. We can do this by
                // computing f at scanlines of a wrapper like so:

                f.In(g).ComputeRoot();
                f.ComputeAt(f.In(g), y);

                // f has the default schedule for a Func with update stages,
                // which is to be computed at the innermost loop of its
                // consumer, which is now the wrapper f.In(g). This therefore
                // generates the following loop nest, which has better
                // locality:

                // for y:
                //   for x:
                //     f(x, y) = sin(x - y)
                //   for r:
                //     f(r, y) = (f(r, y) + f(r - 1, y)) / 2
                //   for r:
                //     f(7 - r, y) = (f(7 - r, y) + f(8 - r, y)) / 2
                //   for x:
                //     f_in_g(x, y) = f(x, y)
                // for y:
                //   for x:
                //     g(x, y) = f_in_g((x + y) % 8, (x - y) % 8);

                // We'll additionally vectorize the initialization of f, and
                // the transfer of pixel values from f into its wrapper:
                f.Vectorize(x, 4);
                f.In(g).Vectorize(x, 4);

                g.Realize <float>(8, 8);
                // See figures/lesson_19_group_updates.mp4 for a visualization.
            }

            Console.WriteLine("Success!");

            return(0);
        }
Example #5
        public static int Main()
        {
            // This program defines a single-stage imaging pipeline that
            // outputs a grayscale diagonal gradient.

            // A 'Func' object represents a pipeline stage. It's a pure
            // function that defines what value each pixel should have. You
            // can think of it as a computed image.
            var gradient = new HSFunc("gradient");

            // Var objects are names to use as variables in the definition of
            // a Func. They have no meaning by themselves.
            var x = new HSVar("x");
            var y = new HSVar("y");

            // We typically use Vars named 'x' and 'y' to correspond to the x
            // and y axes of an image, and we write them in that order. If
            // you're used to thinking of images as having rows and columns,
            // then x is the column index, and y is the row index.

            // Funcs are defined at any integer coordinate of their variables
            // as an Expr in terms of those variables and other functions.
            // Here, we'll define an Expr which has the value x + y. Vars have
            // appropriate operator overloading so that expressions like
            // 'x + y' become 'Expr' objects.
            var e = x + y;

            // Now we'll add a definition for the Func object. At pixel x, y,
            // the image will have the value of the Expr e. On the left hand
            // side we have the Func we're defining and some Vars. On the right
            // hand side we have some Expr object that uses those same Vars.
            gradient[x, y] = e;

            // This is the same as writing:
            //
            //   gradient(x, y) = x + y;
            //
            // which is the more common form, but we are showing the
            // intermediate Expr here for completeness.

            // That line of code defined the Func, but it didn't actually
            // compute the output image yet. At this stage it's just Funcs,
            // Exprs, and Vars in memory, representing the structure of our
            // imaging pipeline. We're meta-programming. This C# program is
            // constructing a Halide program in memory. Actually computing
            // pixel data comes next.

            // Now we 'realize' the Func, which JIT compiles some code that
            // implements the pipeline we've defined, and then runs it.  We
            // also need to tell Halide the domain over which to evaluate the
            // Func, which determines the range of x and y above, and the
            // resolution of the output image. The library also provides a
            // basic generic buffer type we can use. We'll make an 800 x 600
            // image.
            var output = gradient.Realize <int>(800, 600);

            // Halide does type inference for you. Var objects represent
            // 32-bit integers, so the Expr object 'x + y' also represents a
            // 32-bit integer, and so 'gradient' defines a 32-bit image, and
            // so we get a 32-bit signed integer image out when we call
            // 'Realize'. Halide types and type-casting rules are equivalent
            // to C.
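
            // If we wanted an 8-bit image instead, we could cast the
            // expression explicitly before realizing (a sketch, reusing the
            // HS.Cast helper seen in the other examples; note the values
            // wrap around once x + y exceeds 255):
            //
            //   var gradient_u8 = new HSFunc("gradient_u8");
            //   gradient_u8[x, y] = HS.Cast <byte>(x + y);
            //   var output_u8 = gradient_u8.Realize <byte>(800, 600);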

            // Let's check everything worked, and we got the output we were
            // expecting:
            for (int j = 0; j < output.Height; j++)
            {
                for (int i = 0; i < output.Width; i++)
                {
                    // We can access a pixel of a Buffer object using similar
                    // syntax to defining and using functions.
                    if (output[i, j] != i + j)
                    {
                        Console.WriteLine("Something went wrong!");
                        Console.WriteLine($"Pixel {i}, {j} was supposed to be {i + j}, but instead it's {output[i, j]}");
                        return(-1);
                    }
                }
            }

            // Everything worked! We defined a Func, then called 'realize' on
            // it to generate and run machine code that produced a Buffer.
            Console.WriteLine("Success!");

            return(0);
        }
Example #6
        public static int Main(string[] args)
        {
            // First we'll declare some Vars to use below.
            var x = new HSVar("x");
            var y = new HSVar("y");

            // Let's examine various scheduling options for a simple two stage
            // pipeline. We'll start with the default schedule:
            {
                var producer = new HSFunc("producer_default");
                var consumer = new HSFunc("consumer_default");

                // The first stage will be some simple pointwise math similar
                // to our familiar gradient function. The value at position x,
                // y is the sine of the product of x and y.
                producer[x, y] = HSMath.Sin(x * y);

                // Now we'll add a second stage which averages together multiple
                // points in the first stage.
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;

                // We'll turn on tracing for both functions.
                consumer.TraceStores();
                producer.TraceStores();

                // And evaluate it over a 4x4 box.
                Console.WriteLine("\nEvaluating producer-consumer pipeline with default schedule");
                consumer.Realize <float>(4, 4);

                // There were no messages about computing values of the
                // producer. This is because the default schedule fully
                // inlines 'producer' into 'consumer'. It is as if we had
                // written the following code instead:

                // consumer(x, y) = (sin(x * y) +
                //                   sin(x * (y + 1)) +
                //                   sin((x + 1) * y) +
                //                   sin((x + 1) * (y + 1))) / 4;

                // All calls to 'producer' have been replaced with the body of
                // 'producer', with the arguments substituted in for the
                // variables.

                // The equivalent C# code is:
                var result = new float[4, 4];
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xx = 0; xx < 4; xx++)
                    {
                        result[yy, xx] = (float)((Math.Sin(xx * yy) +
                                                  Math.Sin(xx * (yy + 1)) +
                                                  Math.Sin((xx + 1) * yy) +
                                                  Math.Sin((xx + 1) * (yy + 1))) / 4);
                    }
                }
                Console.WriteLine();

                // If we look at the loop nest, the producer doesn't appear
                // at all. It has been inlined into the consumer.
                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();
            }

            // Next we'll examine the next simplest option - computing all
            // values required in the producer before computing any of the
            // consumer. We call this schedule "root".
            {
                // Start with the same function definitions:
                var producer = new HSFunc("producer_root");
                var consumer = new HSFunc("consumer_root");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;

                // Tell Halide to evaluate all of producer before any of consumer.
                producer.ComputeRoot();

                // Turn on tracing.
                consumer.TraceStores();
                producer.TraceStores();

                // Compile and run.
                Console.WriteLine("\nEvaluating producer.compute_root()");
                consumer.Realize <float>(4, 4);

                // Reading the output we can see that:
                // A) There were stores to producer.
                // B) They all happened before any stores to consumer.

                // See figures/lesson_08_compute_root.gif for a visualization.
                // The producer is on the left and the consumer is on the
                // right. Stores are marked in orange and loads are marked in
                // blue.

                // Equivalent C#:

                var result = new float[4, 4];

                // Allocate some temporary storage for the producer.
                var producer_storage = new float[5, 5];

                // Compute the producer.
                for (int yy = 0; yy < 5; yy++)
                {
                    for (int xx = 0; xx < 5; xx++)
                    {
                        producer_storage[yy, xx] = (float)Math.Sin(xx * yy);
                    }
                }

                // Compute the consumer. Skip the prints this time.
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xx = 0; xx < 4; xx++)
                    {
                        result[yy, xx] = (producer_storage[yy, xx] +
                                          producer_storage[yy + 1, xx] +
                                          producer_storage[yy, xx + 1] +
                                          producer_storage[yy + 1, xx + 1]) / 4;
                    }
                }

                // Note that consumer was evaluated over a 4x4 box, so Halide
                // automatically inferred that producer was needed over a 5x5
                // box. This is the same 'bounds inference' logic we saw in
                // the previous lesson, where it was used to detect and avoid
                // out-of-bounds reads from an input image.

                // If we print the loop nest, we'll see something very
                // similar to the C# above.
                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();
            }

            // Let's compare the two approaches above from a performance
            // perspective.

            // Full inlining (the default schedule):
            // - Temporary memory allocated: 0
            // - Loads: 0
            // - Stores: 16
            // - Calls to sin: 64

            // producer.compute_root():
            // - Temporary memory allocated: 25 floats
            // - Loads: 64
            // - Stores: 41
            // - Calls to sin: 25
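
            // Where those numbers come from: the consumer is a 4x4 image and
            // each output pixel reads 4 producer values, so inlining calls
            // sin 4*16 = 64 times and does 16 stores. With compute_root the
            // producer is computed once over the 5x5 region the consumer
            // needs (25 sin calls, 25 stores), and the consumer then does
            // 16*4 = 64 loads and 16 more stores (41 stores in total).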

            // There's a trade-off here. Full inlining used minimal temporary
            // memory and memory bandwidth, but did a whole bunch of redundant
            // expensive math (calling sin). It evaluated most points in
            // 'producer' four times. The second schedule,
            // producer.compute_root(), did the minimum number of calls to
            // sin, but used more temporary memory and more memory bandwidth.

            // In any given situation the correct choice can be difficult to
            // make. If you're memory-bandwidth limited, or don't have much
            // memory (e.g. because you're running on an old cell-phone), then
            // it can make sense to do redundant math. On the other hand, sin
            // is expensive, so if you're compute-limited then fewer calls to
            // sin will make your program faster. Adding vectorization or
            // multi-core parallelism tilts the scales in favor of doing
            // redundant work, because firing up multiple cpu cores increases
            // the amount of math you can do per second, but doesn't increase
            // your system memory bandwidth or capacity.

            // We can make choices in between full inlining and
            // compute_root. Next we'll alternate between computing the
            // producer and consumer on a per-scanline basis:
            {
                // Start with the same function definitions:
                var producer = new HSFunc("producer_y");
                var consumer = new HSFunc("consumer_y");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;

                // Tell Halide to evaluate producer as needed per y coordinate
                // of the consumer:
                producer.ComputeAt(consumer, y);

                // This places the code that computes the producer just
                // *inside* the consumer's for loop over y, as in the
                // equivalent C# below.

                // Turn on tracing.
                producer.TraceStores();
                consumer.TraceStores();

                // Compile and run.
                Console.WriteLine("\nEvaluating producer.ComputeAt(consumer, y)");
                consumer.Realize <float>(4, 4);

                // See figures/lesson_08_compute_y.gif for a visualization.

                // Reading the log or looking at the figure you should see
                // that producer and consumer alternate on a per-scanline
                // basis. Let's look at the equivalent C#:

                var result = new float[4, 4];

                // There's an outer loop over scanlines of consumer:
                for (int yy = 0; yy < 4; yy++)
                {
                    // Allocate space and compute enough of the producer to
                    // satisfy this single scanline of the consumer. This
                    // means a 5x2 box of the producer.
                    var producer_storage = new float[2, 5];
                    for (int py = yy; py < yy + 2; py++)
                    {
                        for (int px = 0; px < 5; px++)
                        {
                            producer_storage[py - yy, px] = (float)Math.Sin(px * py);
                        }
                    }

                    // Compute a scanline of the consumer.
                    for (int xx = 0; xx < 4; xx++)
                    {
                        result[yy, xx] = (producer_storage[0, xx] +
                                          producer_storage[1, xx] +
                                          producer_storage[0, xx + 1] +
                                          producer_storage[1, xx + 1]) / 4;
                    }
                }

                // Again, if we print the loop nest, we'll see something very
                // similar to the C# above.
                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();

                // The performance characteristics of this strategy are in
                // between inlining and compute root. We still allocate some
                // temporary memory, but less than compute_root, and with
                // better locality (we load from it soon after writing to it,
                // so for larger images, values should still be in cache). We
                // still do some redundant work, but less than full inlining:

                // producer.ComputeAt(consumer, y):
                // - Temporary memory allocated: 10 floats
                // - Loads: 64
                // - Stores: 56
                // - Calls to sin: 40
            }

            // We could also say producer.ComputeAt(consumer, x), but this
            // would be very similar to full inlining (the default
            // schedule). Instead let's distinguish between the loop level at
            // which we allocate storage for producer, and the loop level at
            // which we actually compute it. This unlocks a few optimizations.
            {
                var producer = new HSFunc("producer_root_y");
                var consumer = new HSFunc("consumer_root_y");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;


                // Tell Halide to make a buffer to store all of producer at
                // the outermost level:
                producer.StoreRoot();
                // ... but compute it as needed per y coordinate of the
                // consumer.
                producer.ComputeAt(consumer, y);

                producer.TraceStores();
                consumer.TraceStores();

                Console.WriteLine("\nEvaluating producer.store_root().ComputeAt(consumer, y)");
                consumer.Realize <float>(4, 4);

                // See figures/lesson_08_store_root_compute_y.gif for a
                // visualization.

                // Reading the log or looking at the figure you should see
                // that producer and consumer again alternate on a
                // per-scanline basis. It computes a 5x2 box of the producer
                // to satisfy the first scanline of the consumer, but after
                // that it only computes a 5x1 box of the producer for each new
                // scanline of the consumer!
                //
                // Halide has detected that for all scanlines except for the
                // first, it can reuse the values already sitting in the
                // buffer we've allocated for producer. Let's look at the
                // equivalent C#:

                var result = new float[4, 4];

                {
                    // producer.store_root() implies that storage goes here:
                    var producer_storage = new float[5, 5];

                    // There's an outer loop over scanlines of consumer:
                    for (int yy = 0; yy < 4; yy++)
                    {
                        // Compute enough of the producer to satisfy this scanline
                        // of the consumer.
                        for (int py = yy; py < yy + 2; py++)
                        {
                            // Skip over rows of producer that we've already
                            // computed in a previous iteration.
                            if (yy > 0 && py == yy)
                            {
                                continue;
                            }

                            for (int px = 0; px < 5; px++)
                            {
                                producer_storage[py, px] = (float)Math.Sin(px * py);
                            }
                        }

                        // Compute a scanline of the consumer.
                        for (int xx = 0; xx < 4; xx++)
                        {
                            result[yy, xx] = (producer_storage[yy, xx] +
                                              producer_storage[yy + 1, xx] +
                                              producer_storage[yy, xx + 1] +
                                              producer_storage[yy + 1, xx + 1]) / 4;
                        }
                    }
                }

                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();

                // The performance characteristics of this strategy are pretty
                // good! The numbers are similar to compute_root, except locality
                // is better. We're doing the minimum number of sin calls,
                // and we load values soon after they are stored, so we're
                // probably making good use of the cache:

                // producer.StoreRoot().ComputeAt(consumer, y):
                // - Temporary memory allocated: 10 floats
                // - Loads: 64
                // - Stores: 39
                // - Calls to sin: 25

                // Note that my claimed amount of memory allocated doesn't
                // match the reference C# code. Halide is performing one more
                // optimization under the hood. It folds the storage for the
                // producer down into a circular buffer of two
                // scanlines. Equivalent C# would actually look like this:

                {
                    // Actually store 2 scanlines instead of 5
                    var producer_storage = new float[2, 5];
                    for (int yy = 0; yy < 4; yy++)
                    {
                        for (int py = yy; py < yy + 2; py++)
                        {
                            if (yy > 0 && py == yy)
                            {
                                continue;
                            }
                            for (int px = 0; px < 5; px++)
                            {
                                // Stores to producer_storage have their y coordinate bit-masked.
                                producer_storage[py & 1, px] = (float)Math.Sin(px * py);
                            }
                        }

                        // Compute a scanline of the consumer.
                        for (int xx = 0; xx < 4; xx++)
                        {
                            // Loads from producer_storage have their y coordinate bit-masked.
                            result[yy, xx] = (producer_storage[yy & 1, xx] +
                                              producer_storage[(yy + 1) & 1, xx] +
                                              producer_storage[yy & 1, xx + 1] +
                                              producer_storage[(yy + 1) & 1, xx + 1]) / 4;
                        }
                    }
                }
            }

            // We can do even better, by leaving the storage outermost, but
            // moving the computation into the innermost loop:
            {
                var producer = new HSFunc("producer_root_x");
                var consumer = new HSFunc("consumer_root_x");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;


                // Store outermost, compute innermost.
                producer.StoreRoot().ComputeAt(consumer, x);

                producer.TraceStores();
                consumer.TraceStores();

                Console.WriteLine("\nEvaluating producer.store_root().ComputeAt(consumer, x)");
                consumer.Realize <float>(4, 4);

                // See figures/lesson_08_store_root_compute_x.gif for a
                // visualization.

                // You should see that producer and consumer now alternate on
                // a per-pixel basis. Here's the equivalent C#:

                var result = new float[4, 4];

                // producer.store_root() implies that storage goes here, but
                // we can fold it down into a circular buffer of two
                // scanlines:
                var producer_storage = new float[2, 5];

                // For every pixel of the consumer:
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xx = 0; xx < 4; xx++)
                    {
                        // Compute enough of the producer to satisfy this
                        // pixel of the consumer, but skip values that we've
                        // already computed:
                        if (yy == 0 && xx == 0)
                        {
                            producer_storage[yy & 1, xx] = (float)Math.Sin(xx * yy);
                        }
                        if (yy == 0)
                        {
                            producer_storage[yy & 1, xx + 1] = (float)Math.Sin((xx + 1) * yy);
                        }
                        if (xx == 0)
                        {
                            producer_storage[(yy + 1) & 1, xx] = (float)Math.Sin(xx * (yy + 1));
                        }
                        producer_storage[(yy + 1) & 1, xx + 1] = (float)Math.Sin((xx + 1) * (yy + 1));

                        result[yy, xx] = (producer_storage[yy & 1, xx] +
                                          producer_storage[(yy + 1) & 1, xx] +
                                          producer_storage[yy & 1, xx + 1] +
                                          producer_storage[(yy + 1) & 1, xx + 1]) / 4;
                    }
                }

                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();

                // The performance characteristics of this strategy are the
                // best so far. One of the four values of the producer we need
                // is probably still sitting in a register, so I won't count
                // it as a load:
                // producer.StoreRoot().ComputeAt(consumer, x):
                // - Temporary memory allocated: 10 floats
                // - Loads: 48
                // - Stores: 56
                // - Calls to sin: 40
            }

            // So what's the catch? Why not always do
            // producer.StoreRoot().ComputeAt(consumer, x) for this type of
            // code?
            //
            // The answer is parallelism. In both of the previous two
            // strategies we've assumed that values computed on previous
            // iterations are lying around for us to reuse. This assumes that
            // previous values of x or y happened earlier in time and have
            // finished. This is not true if you parallelize or vectorize
            // either loop. Darn. If you parallelize, Halide won't inject the
            // optimizations that skip work already done if there's a parallel
            // loop in between the StoreAt level and the ComputeAt level,
            // and won't fold the storage down into a circular buffer either,
            // which makes our store_root pointless.
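
            // For example (a sketch of the failure mode, not compiled here):
            // with producer.StoreRoot().ComputeAt(consumer, x), adding
            //
            //   consumer.Parallel(y);
            //
            // puts a parallel loop (y) between the storage level (root) and
            // the compute level (x). Scanlines then run on different threads
            // in no particular order, so Halide can no longer assume that
            // values from "earlier" iterations are already sitting in the
            // buffer, and it won't fold the storage into a two-scanline
            // circular buffer either.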

            // We're running out of options. We can make new ones by
            // splitting. We can StoreAt or ComputeAt at the natural
            // variables of the consumer (x and y), or we can split x or y
            // into new inner and outer sub-variables and then schedule with
            // respect to those. We'll use this to express fusion in tiles:
            {
                var producer = new HSFunc("producer_tile");
                var consumer = new HSFunc("consumer_tile");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;

                // We'll compute 8x8 of the consumer, in 4x4 tiles.
                var x_outer = new HSVar("x_outer");
                var y_outer = new HSVar("y_outer");
                var x_inner = new HSVar("x_inner");
                var y_inner = new HSVar("y_inner");
                consumer.Tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);

                // Compute the producer per tile of the consumer
                producer.ComputeAt(consumer, x_outer);

                // Notice that I wrote my schedule starting from the end of
                // the pipeline (the consumer). This is because the schedule
                // for the producer refers to x_outer, which we introduced
                // when we tiled the consumer. You can write it in the other
                // order, but it tends to be harder to read.

                // Turn on tracing.
                producer.TraceStores();
                consumer.TraceStores();

                Console.WriteLine("\nEvaluating:");
                Console.WriteLine("consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);");
                Console.WriteLine("producer.ComputeAt(consumer, x_outer);");
                consumer.Realize <float>(8, 8);

                // See figures/lesson_08_tile.gif for a visualization.

                // The producer and consumer now alternate on a per-tile
                // basis. Here's the equivalent C#:

                var result = new float[8, 8];

                // For every tile of the consumer:
                for (int yy_outer = 0; yy_outer < 2; yy_outer++)
                {
                    for (int xx_outer = 0; xx_outer < 2; xx_outer++)
                    {
                        // Compute the x and y coords of the start of this tile.
                        int x_base = xx_outer * 4;
                        int y_base = yy_outer * 4;

                        // Compute enough of producer to satisfy this tile. A
                        // 4x4 tile of the consumer requires a 5x5 tile of the
                        // producer.
                        var producer_storage = new float[5, 5];
                        for (int py = y_base; py < y_base + 5; py++)
                        {
                            for (int px = x_base; px < x_base + 5; px++)
                            {
                                producer_storage[py - y_base, px - x_base] = (float)Math.Sin(px * py);
                            }
                        }

                        // Compute this tile of the consumer
                        for (int yy_inner = 0; yy_inner < 4; yy_inner++)
                        {
                            for (int xx_inner = 0; xx_inner < 4; xx_inner++)
                            {
                                int xx = x_base + xx_inner;
                                int yy = y_base + yy_inner;
                                result[yy, xx] =
                                    (producer_storage[yy - y_base, xx - x_base] +
                                     producer_storage[yy - y_base + 1, xx - x_base] +
                                     producer_storage[yy - y_base, xx - x_base + 1] +
                                     producer_storage[yy - y_base + 1, xx - x_base + 1]) / 4;
                            }
                        }
                    }
                }

                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();

                // Tiling can make sense for problems like this one with
                // stencils that reach outwards in x and y. Each tile can be
                // computed independently in parallel, and the redundant work
                // done by each tile isn't so bad once the tiles get large
                // enough.
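
                // To actually get that parallelism, a hypothetical next step
                // (not executed here) would be to fuse the tile indices and
                // parallelize the fused loop, just as lesson 05 does for
                // gradient_fused_tiles:
                //
                // var tile_index = new HSVar("tile_index");
                // consumer.Fuse(x_outer, y_outer, tile_index)
                //         .Parallel(tile_index);
                //
                // with the producer then computed per tile via
                // producer.ComputeAt(consumer, tile_index).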
            }

            // Let's try a mixed strategy that combines what we have done with
            // splitting, parallelizing, and vectorizing. This is one that
            // often works well in practice for large images. If you
            // understand this schedule, then you understand 95% of scheduling
            // in Halide.
            {
                var producer = new HSFunc("producer_mixed");
                var consumer = new HSFunc("consumer_mixed");
                producer[x, y] = HSMath.Sin(x * y);
                consumer[x, y] = (producer[x, y] +
                                  producer[x, y + 1] +
                                  producer[x + 1, y] +
                                  producer[x + 1, y + 1]) / 4;

                // Split the y coordinate of the consumer into strips of 16 scanlines:
                var yo = new HSVar("yo");
                var yi = new HSVar("yi");
                consumer.Split(y, yo, yi, 16);
                // Compute the strips using a thread pool and a task queue.
                consumer.Parallel(yo);
                // Vectorize across x by a factor of four.
                consumer.Vectorize(x, 4);

                // Now store the producer per-strip. This will be 17 scanlines
                // of the producer (16+1), but hopefully it will fold down
                // into a circular buffer of two scanlines:
                producer.StoreAt(consumer, yo);
                // Within each strip, compute the producer per scanline of the
                // consumer, skipping work done on previous scanlines.
                producer.ComputeAt(consumer, yi);
                // Also vectorize the producer (because sin is vectorizable on x86 using SSE).
                producer.Vectorize(x, 4);

                // Let's leave tracing off this time, because we're going to
                // evaluate over a larger image.
                // consumer.TraceStores();
                // producer.TraceStores();

                var halide_result = consumer.Realize <float>(160, 160);

                // See figures/lesson_08_mixed.mp4 for a visualization.

                // Here's the equivalent (serial) C:

                var c_result = new float[160, 160];

                // For every strip of 16 scanlines (this loop is parallel in
                // the Halide version)
                for (int yyo = 0; yyo < 160 / 16; yyo++)
                {
                    // 16 divides 160 exactly, so every strip starts at a
                    // multiple of 16. (If it didn't, Halide would push the
                    // last strip upwards to fit within the output, as in
                    // lesson 05.)
                    int y_base = yyo * 16;

                    // Allocate a two-scanline circular buffer for the producer
                    var producer_storage = new float[2, 161];

                    // For every scanline in the strip of 16:
                    for (int yyi = 0; yyi < 16; yyi++)
                    {
                        int yy = y_base + yyi;

                        for (int py = yy; py < yy + 2; py++)
                        {
                            // Skip scanlines already computed *within this task*
                            if (yyi > 0 && py == yy)
                            {
                                continue;
                            }

                            // Compute this scanline of the producer in 4-wide vectors
                            for (int x_vec = 0; x_vec < 160 / 4 + 1; x_vec++)
                            {
                                int x_base = x_vec * 4;
                                // 4 doesn't divide 161, so push the last vector left
                                // (see lesson 05).
                                if (x_base > 161 - 4)
                                {
                                    x_base = 161 - 4;
                                }
                                // If you're on x86, Halide generates SSE code for this part:
                                int[]   xx  = { x_base, x_base + 1, x_base + 2, x_base + 3 };
                                float[] vec = { (float)Math.Sin(xx[0] * py), (float)Math.Sin(xx[1] * py),
                                                (float)Math.Sin(xx[2] * py), (float)Math.Sin(xx[3] * py) };
                                producer_storage[py & 1, xx[0]] = vec[0];
                                producer_storage[py & 1, xx[1]] = vec[1];
                                producer_storage[py & 1, xx[2]] = vec[2];
                                producer_storage[py & 1, xx[3]] = vec[3];
                            }
                        }

                        // Now compute consumer for this scanline:
                        for (int x_vec = 0; x_vec < 160 / 4; x_vec++)
                        {
                            int x_base = x_vec * 4;
                            // Again, Halide's equivalent here uses SSE.
                            int[]   xx  = { x_base, x_base + 1, x_base + 2, x_base + 3 };
                            float[] vec =
                            {
                                (producer_storage[yy & 1, xx[0]] +
                                 producer_storage[(yy + 1) & 1, xx[0]] +
                                 producer_storage[yy & 1, xx[0] + 1] +
                                 producer_storage[(yy + 1) & 1, xx[0] + 1]) / 4,
                                (producer_storage[yy & 1, xx[1]] +
                                 producer_storage[(yy + 1) & 1, xx[1]] +
                                 producer_storage[yy & 1, xx[1] + 1] +
                                 producer_storage[(yy + 1) & 1, xx[1] + 1]) / 4,
                                (producer_storage[yy & 1, xx[2]] +
                                 producer_storage[(yy + 1) & 1, xx[2]] +
                                 producer_storage[yy & 1, xx[2] + 1] +
                                 producer_storage[(yy + 1) & 1, xx[2] + 1]) / 4,
                                (producer_storage[yy & 1, xx[3]] +
                                 producer_storage[(yy + 1) & 1, xx[3]] +
                                 producer_storage[yy & 1, xx[3] + 1] +
                                 producer_storage[(yy + 1) & 1, xx[3] + 1]) / 4
                            };

                            c_result[yy, xx[0]] = vec[0];
                            c_result[yy, xx[1]] = vec[1];
                            c_result[yy, xx[2]] = vec[2];
                            c_result[yy, xx[3]] = vec[3];
                        }
                    }
                }
                Console.WriteLine("Pseudo-code for the schedule:");
                consumer.PrintLoopNest();
                Console.WriteLine();

                // Look on my code, ye mighty, and despair!

                // Let's check the C result against the Halide result. Doing
                // this I found several bugs in my C implementation, which
                // should tell you something.
                for (int yy = 0; yy < 160; yy++)
                {
                    for (int xx = 0; xx < 160; xx++)
                    {
                        float error = halide_result[xx, yy] - c_result[yy, xx];
                        // It's floating-point math, so we'll allow some slop:
                        if (error < -0.001f || error > 0.001f)
                        {
                            Console.WriteLine("halide_result(%d, %d) = %f instead of %f",
                                              xx, yy, halide_result[xx, yy], c_result[yy, xx]);
                            return(-1);
                        }
                    }
                }
            }

            // This stuff is hard. We ended up in a three-way trade-off
            // between memory bandwidth, redundant work, and
            // parallelism. Halide can't make the correct choice for you
            // automatically (sorry). Instead it tries to make it easier for
            // you to explore various options, without messing up your
            // program. In fact, Halide promises that scheduling calls like
            // compute_root won't change the meaning of your algorithm -- you
            // should get the same bits back no matter how you schedule
            // things.
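
            // As a quick sketch of that promise (hypothetical Func names,
            // using only scheduling calls we've already seen): define the
            // same algorithm twice, schedule one of them, and check that the
            // two outputs agree exactly.
            {
                var check_plain     = new HSFunc("check_plain");
                var check_scheduled = new HSFunc("check_scheduled");
                check_plain[x, y]     = x + y;
                check_scheduled[x, y] = x + y;
                check_scheduled.Vectorize(x, 4).Parallel(y);

                var plain_result     = check_plain.Realize <int>(64, 64);
                var scheduled_result = check_scheduled.Realize <int>(64, 64);
                for (int yy = 0; yy < 64; yy++)
                {
                    for (int xx = 0; xx < 64; xx++)
                    {
                        if (plain_result[xx, yy] != scheduled_result[xx, yy])
                        {
                            Console.WriteLine("The two schedules disagreed!");
                            return(-1);
                        }
                    }
                }
            }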

            // So be empirical! Experiment with various schedules and keep a
            // log of performance. Form hypotheses and then try to prove
            // yourself wrong. Don't assume that you just need to vectorize
            // your code by a factor of four and run it on eight cores and
            // you'll get 32x faster. This almost never works. Modern systems
            // are complex enough that you can't predict performance reliably
            // without running your code.
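
            // For example, a minimal timing harness might look like this
            // (hypothetical Func and sizes; System.Diagnostics.Stopwatch is
            // ordinary .NET, not part of Halide). The first Realize includes
            // compilation time, so do one warm-up run before timing:
            {
                var timed = new HSFunc("timed");
                timed[x, y] = x + y;
                timed.Vectorize(x, 8).Parallel(y);

                timed.Realize <int>(2048, 2048); // warm-up / compile

                var sw = System.Diagnostics.Stopwatch.StartNew();
                timed.Realize <int>(2048, 2048);
                sw.Stop();
                Console.WriteLine($"Realize took {sw.Elapsed.TotalMilliseconds} ms");
            }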

            // We suggest you start by scheduling all of your non-trivial
            // stages compute_root, and then work from the end of the pipeline
            // upwards, inlining, parallelizing, and vectorizing each stage in
            // turn until you reach the top.
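
            // As a sketch of that starting point (hypothetical stage names,
            // and assuming the wrapper exposes ComputeRoot for the C++
            // compute_root call):
            //
            // stage1.ComputeRoot();
            // stage2.ComputeRoot();
            // output_stage.Vectorize(x, 8).Parallel(y);
            //
            // Then revisit stage2, then stage1, trying ComputeAt, StoreAt,
            // or inlining for each, and keep whichever variant is fastest.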

            // Halide is not just about vectorizing and parallelizing your
            // code. That's not enough to get you very far. Halide is about
            // giving you tools that help you quickly explore different
            // trade-offs between locality, redundant work, and parallelism,
            // without messing up the actual result you're trying to compute.

            Console.WriteLine("Success!");
            return(0);
        }
Exemple #7
0
        public static int Main(string[] args)
        {
            // The last lesson was quite involved, and scheduling complex
            // multi-stage pipelines is ahead of us. As an interlude, let's
            // consider something easy: evaluating funcs over rectangular
            // domains that do not start at the origin.

            // We define our familiar gradient function.
            var gradient = new HSFunc("gradient");
            var x        = new HSVar("x");
            var y        = new HSVar("y");

            gradient[x, y] = x + y;

            // And turn on tracing so we can see how it is being evaluated.
            gradient.TraceStores();

            // Previously we've realized gradient like so:
            //
            // gradient.realize(8, 8);
            //
            // This does four things internally:
            // 1) Generates code that can evaluate gradient over an arbitrary
            // rectangle.
            // 2) Allocates a new 8 x 8 image.
            // 3) Runs the generated code to evaluate gradient for all x, y
            // from (0, 0) to (7, 7) and puts the result into the image.
            // 4) Returns the new image as the result of the realize call.

            // What if we're managing memory carefully and don't want Halide
            // to allocate a new image for us? We can call realize another
            // way. We can pass it an image we would like it to fill in. The
            // following evaluates our Func into an existing image:
            Console.WriteLine("Evaluating gradient from (0, 0) to (7, 7)");
            var result = new HSBuffer <int>(8, 8);

            gradient.Realize(result);

            // Let's check it did what we expect:
            for (int yy = 0; yy < 8; yy++)
            {
                for (int xx = 0; xx < 8; xx++)
                {
                    if (result[xx, yy] != xx + yy)
                    {
                        Console.WriteLine("Something went wrong!\n");
                        return(-1);
                    }
                }
            }

            // Now let's evaluate gradient over a 5 x 7 rectangle that starts
            // somewhere else -- at position (100, 50). So x and y will run
            // from (100, 50) to (104, 56) inclusive.

            // We start by creating an image that represents that rectangle:
            var shifted = new HSBuffer <int>(5, 7); // In the constructor we tell it the size.

            shifted.SetMin(100, 50);                // Then we tell it the top-left corner.

            Console.WriteLine("Evaluating gradient from (100, 50) to (104, 56)");

            // Note that this won't need to compile any new code, because when
            // we realized it the first time, we generated code capable of
            // evaluating gradient over an arbitrary rectangle.
            gradient.Realize(shifted);

            // From C#, we access the image object using coordinates
            // that start at (100, 50).
            for (int yy = 50; yy < 57; yy++)
            {
                for (int xx = 100; xx < 105; xx++)
                {
                    if (shifted[xx, yy] != xx + yy)
                    {
                        Console.WriteLine("Something went wrong!");
                        return(-1);
                    }
                }
            }
            // The image 'shifted' stores the value of our Func over a domain
            // that starts at (100, 50), so asking for shifted(0, 0) would in
            // fact read out-of-bounds and probably crash.

            // What if we want to evaluate our Func over some region that
            // isn't rectangular? Too bad. Halide only does rectangles :)

            Console.WriteLine("Success!");
            return(0);
        }
Exemple #8
0
        public static int Main(string[] args)
        {
            // This program defines a single-stage imaging pipeline that
            // brightens an image.

            // First we'll load the input image we wish to brighten.
            var input = HSBuffer <byte> .LoadImage("rgb.png");

            // See figures/lesson_02_input.jpg for a smaller version.

            // Next we define our Func object that represents our one pipeline
            // stage.
            var brighter = new HSFunc("brighter");

            // Our Func will have three arguments, representing the position
            // in the image and the color channel. Halide treats color
            // channels as an extra dimension of the image.
            var x = new HSVar("x");
            var y = new HSVar("y");
            var c = new HSVar("c");

            // Normally we'd probably write the whole function definition on
            // one line. Here we'll break it apart so we can explain what
            // we're doing at every step.

            // For each pixel of the input image.
            var value = input[x, y, c];

            // Cast it to a floating point value.
            value = HS.Cast <float>(value);

            // Multiply it by 1.5 to brighten it. Halide represents real
            // numbers as floats, not doubles, so we stick an 'f' on the end
            // of our constant.
            value = value * 1.5f;

            // Clamp it to be at most 255, so we don't get overflow when we
            // cast it back to an 8-bit unsigned int.
            value = HSMath.Min(value, 255.0f);

            // Cast it back to an 8-bit unsigned integer.
            value = HS.Cast <byte>(value);

            // Define the function.
            brighter[x, y, c] = value;

            // The equivalent one-liner to all of the above is:
            //
            // brighter[x, y, c] = HS.Cast <byte>(HSMath.Min(input[x, y, c] * 1.5f, 255.0f));
            //
            // In the shorter version:
            // - I skipped the explicit cast to float, because multiplying by
            //   1.5f does that automatically.
            // - In the C++ version of this tutorial an integer constant and
            //   an unqualified call to min also work, thanks to implicit
            //   conversion and Koenig lookup; neither shortcut applies here.

            // Remember, all we've done so far is build a representation of a
            // Halide program in memory. We haven't actually processed any
            // pixels yet. We haven't even compiled that Halide program yet.

            // So now we'll realize the Func. The size of the output image
            // should match the size of the input image. If we just wanted to
            // brighten a portion of the input image we could request a
            // smaller size. If we request a larger size Halide will throw an
            // error at runtime telling us we're trying to read out of bounds
            // on the input image.
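            // For example (not executed here), asking for more pixels than
            // the input provides, e.g.
            //
            // brighter.Realize <byte>(input.Width + 10, input.Height, input.Channels);
            //
            // would trigger that out-of-bounds error.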
            var output =
                brighter.Realize <byte>(input.Width, input.Height, input.Channels);

            // Save the output for inspection. It should look like a bright parrot.
            output.SaveImage("brighter.png");

            // See figures/lesson_02_output.jpg for a small version of the output.

            Console.WriteLine("Success!");
            return(0);
        }
Exemple #9
0
        public static int Main(string[] argv)
        {
            // We'll define the simple one-stage pipeline that we used in lesson 10.
            var brighter = new HSFunc("brighter");
            var x        = new HSVar("x");
            var y        = new HSVar("y");

            // Declare the arguments.
            var offset = new HSParam <byte>();
            var input  = new HSImageParam <byte>(2);
            var args   = new List <HSArgument>();

            args.Add(input);
            args.Add(offset);

            // Define the Func.
            brighter[x, y] = input[x, y] + offset;

            // Schedule it.
            brighter.Vectorize(x, 16).Parallel(y);

            // The following line is what we did in lesson 10. It compiles an
            // object file suitable for the system that you're running this
            // program on.  For example, if you compile and run this file on
            // 64-bit linux on an x86 cpu with sse4.1, then the generated code
            // will be suitable for 64-bit linux on x86 with sse4.1.
            brighter.CompileToFile("lesson_11_host", args, "brighter");

            // We can also compile object files suitable for other cpus and
            // operating systems. You do this by passing an optional final
            // Target argument to CompileToFile, which specifies the target
            // to compile for.

            // Let's use this to compile a 32-bit arm android version of this code:
            var target = new HSTarget();

            target.OS   = HSOperatingSystem.Android;   // The operating system
            target.Arch = HSArchitecture.ARM;          // The CPU architecture
            target.Bits = 32;                          // The bit-width of the architecture
            var arm_features = new List <HSFeature>(); // A list of features to set

            target.SetFeatures(arm_features);
            // We then pass the target as the last argument to CompileToFile.
            brighter.CompileToFile("lesson_11_arm_32_android", args, "brighter", target);

            // And now a Windows object file for 64-bit x86 with AVX and SSE 4.1:
            target.OS   = HSOperatingSystem.Windows;
            target.Arch = HSArchitecture.X86;
            target.Bits = 64;
            var x86_features = new List <HSFeature>();

            x86_features.Add(HSFeature.AVX);
            x86_features.Add(HSFeature.SSE41);
            target.SetFeatures(x86_features);
            brighter.CompileToFile("lesson_11_x86_64_windows", args, "brighter", target);

            // And finally an iOS mach-o object file for one of Apple's 32-bit
            // ARM processors - the A6. It's used in the iPhone 5. The A6 uses
            // a slightly modified ARM architecture called ARMv7s. We specify
            // this using the target features field.  Support for Apple's
            // 64-bit ARM processors is very new in llvm, and still somewhat
            // flaky.
            target.OS   = HSOperatingSystem.IOS;
            target.Arch = HSArchitecture.ARM;
            target.Bits = 32;
            var armv7s_features = new List <HSFeature>();

            armv7s_features.Add(HSFeature.ARMv7s);
            target.SetFeatures(armv7s_features);
            brighter.CompileToFile("lesson_11_arm_32_ios", args, "brighter", target);


            // Now let's check these files are what they claim, by examining
            // their first few bytes.

            {
                // 32-bit ARM Android object files start with these magic bytes:
                byte[] arm_32_android_magic = { 0x7f, (byte)'E', (byte)'L', (byte)'F', // ELF format
                                                1,                                     // 32-bit
                                                1,                                     // 2's complement little-endian
                                                1 };                                   // Current version of elf

                var androidObjectFile = "lesson_11_arm_32_android.o";
                if (!File.Exists(androidObjectFile))
                {
                    Console.WriteLine("Object file not generated");
                    return(-1);
                }

                var androidObjectData = File.ReadAllBytes(androidObjectFile);
                var header            = new byte[arm_32_android_magic.Length];
                Buffer.BlockCopy(androidObjectData, 0, header, 0, arm_32_android_magic.Length);

                if (!header.SequenceEqual(arm_32_android_magic))
                {
                    Console.WriteLine("Unexpected header bytes in 32-bit arm object file.");
                    return(-1);
                }
            }

            {
                // 64-bit Windows (COFF) object files start with the 16-bit
                // machine-type field 0x8664 (IMAGE_FILE_MACHINE_AMD64),
                // stored little-endian:
                byte[] win_64_magic = { 0x64, 0x86 };

                var winObjectFile = "lesson_11_x86_64_windows.obj";
                if (!File.Exists(winObjectFile))
                {
                    Console.WriteLine("Object file not generated");
                    return(-1);
                }
                var windowsObjectData = File.ReadAllBytes(winObjectFile);
                var header            = new byte[win_64_magic.Length];
                Buffer.BlockCopy(windowsObjectData, 0, header, 0, win_64_magic.Length);

                if (!header.SequenceEqual(win_64_magic))
                {
                    Console.WriteLine("Unexpected header bytes in 64-bit windows object file.");
                    return(-1);
                }
            }

            {
                // 32-bit arm iOS mach-o files start with the following magic bytes:
                uint[] arm_32_ios_magic = { 0xfeedface, // Mach-o magic bytes
                                            12,         // CPU type is ARM
                                            11,         // CPU subtype is ARMv7s
                                            1 };        // It's a relocatable object file.
                var    magicBytes = new byte[arm_32_ios_magic.Length * 4];
                Buffer.BlockCopy(arm_32_ios_magic, 0, magicBytes, 0, arm_32_ios_magic.Length * 4);
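                // Note: Buffer.BlockCopy copies these uints in host byte
                // order, which matches the on-disk header here because
                // 32-bit ARM Mach-O headers are little-endian, as are the
                // x86/ARM hosts this tutorial assumes.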

                var iosObjectFile = "lesson_11_arm_32_ios.o";
                if (!File.Exists(iosObjectFile))
                {
                    Console.WriteLine("Object file not generated");
                    return(-1);
                }

                var iosObjectData = File.ReadAllBytes(iosObjectFile);
                var header        = new byte[magicBytes.Length];
                Buffer.BlockCopy(iosObjectData, 0, header, 0, magicBytes.Length);
                if (!header.SequenceEqual(magicBytes))
                {
                    Console.WriteLine("Unexpected header bytes in 32-bit arm ios object file.");
                    return(-1);
                }
            }

            // It looks like the object files we produced are plausible for
            // those targets. We'll count that as a success for the purposes
            // of this tutorial. For a real application you'd then need to
            // figure out how to integrate Halide into your cross-compilation
            // toolchain. There are several small examples of this in the
            // Halide repository under the apps folder. See HelloAndroid and
            // HelloiOS here:
            // https://github.com/halide/Halide/tree/master/apps/
            Console.WriteLine("Success!");
            return(0);
        }
Exemple #10
0
        public static int Main(string[] args)
        {
            var x = new HSVar("x");
            var y = new HSVar("y");

            // Printing out the value of Funcs as they are computed.
            {
                // We'll define our gradient function as before.
                var gradient = new HSFunc("gradient");
                gradient[x, y] = x + y;

                // And tell Halide that we'd like to be notified of all
                // evaluations.
                gradient.TraceStores();

                // Realize the function over an 8x8 region.
                Console.WriteLine("Evaluating gradient");
                var output = gradient.Realize <int>(8, 8);
                // Click to show output ...

                // This will print out all the times gradient(x, y) gets
                // evaluated.

                // Now that we can snoop on what Halide is doing, let's try our
                // first scheduling primitive. We'll make a new version of
                // gradient that processes each scanline in parallel.
                var parallel_gradient = new HSFunc("parallel_gradient");
                parallel_gradient[x, y] = x + y;

                // We'll also trace this function.
                parallel_gradient.TraceStores();

                // Things are the same so far. We've defined the algorithm, but
                // haven't said anything about how to schedule it. In general,
                // exploring different scheduling decisions doesn't change the code
                // that describes the algorithm.

                // Now we tell Halide to use a parallel for loop over the y
                // coordinate. On Linux we run this using a thread pool and a task
                // queue. On OS X we call into grand central dispatch, which does
                // the same thing for us.
                parallel_gradient.Parallel(y);

                // This time the trace output should come out of order, because each
                // scanline is potentially being processed in a different
                // thread. The number of threads should adapt to your system, but
                // on linux you can control it manually using the environment
                // variable HL_NUM_THREADS.
                Console.WriteLine("\nEvaluating parallel_gradient");
                parallel_gradient.Realize <int>(8, 8);
                // Click to show output ...
            }

            // Printing individual Exprs.
            {
                // trace_stores() can only print the value of a
                // Func. Sometimes you want to inspect the value of
                // sub-expressions rather than the entire Func. The built-in
                // function 'print' can be wrapped around any Expr to print
                // the value of that Expr every time it is evaluated.

                // For example, say we have some Func that is the sum of two terms:
                var f = new HSFunc("f");
                f[x, y] = HSMath.Sin(x) + HSMath.Cos(y);

                // If we want to inspect just one of the terms, we can wrap
                // 'print' around it like so:
                var g = new HSFunc("g");
                g[x, y] = HSMath.Sin(x) + HS.Print(HSMath.Cos(y));

                Console.WriteLine("\nEvaluating sin(x) + cos(y), and just printing cos(y)");
                g.Realize <float>(4, 4);
                // Click to show output ...
            }

            // Printing additional context.
            {
                // print can take multiple arguments. It prints all of them
                // and evaluates to the first one. The arguments can be Exprs
                // or constant strings. This can be used to print additional
                // context alongside the value:
                var f = new HSFunc("f");
                f[x, y] = HSMath.Sin(x) + HS.Print(HSMath.Cos(y), "<- this is cos(", y, ") when x =", x);

                Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing cos(y) with more context");
                f.Realize <float>(4, 4);
                // Click to show output ...

                // It can be useful to split expressions like the one above
                // across multiple lines to make it easier to turn on and off
                // printing certain values while debugging.
                HSExpr e = HSMath.Cos(y);
                // Uncomment the following line to print the value of cos(y)
                // e = print(e, "<- this is cos(", y, ") when x =", x);
                var g = new HSFunc("g");
                g[x, y] = HSMath.Sin(x) + e;
                g.Realize <float>(4, 4);
            }

            // Conditional printing
            {
                // Both print and trace_stores can produce a lot of output. If
                // you're looking for a rare event, or just want to see what
                // happens at a single pixel, this amount of output can be
                // difficult to dig through. Instead, the function print_when
                // can be used to conditionally print an Expr. The first
                // argument to print_when is a boolean Expr. If the Expr
                // evaluates to true, it returns the second argument and
                // prints all of the arguments. If the Expr evaluates to false
                // it just returns the second argument and does not print.

                var    f = new HSFunc("f");
                HSExpr e = HSMath.Cos(y);
                e       = HS.PrintWhen(x == 37 && y == 42, e, "<- this is cos(y) at x, y == (37, 42)");
                f[x, y] = HSMath.Sin(x) + e;
                Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing cos(y) at a single pixel");
                f.Realize <float>(640, 480);
                // Click to show output ...

                // print_when can also be used to check for values you're not expecting:
                var g = new HSFunc("g");
                e       = HSMath.Cos(y);
                e       = HS.PrintWhen(e < 0, e, "cos(y) < 0 at y ==", y);
                g[x, y] = HSMath.Sin(x) + e;
                Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing whenever cos(y) < 0");
                g.Realize <float>(4, 4);
                // Click to show output ...
            }

            // Printing expressions at compile-time.
            {
                // The code above builds up a Halide Expr across several lines
                // of code. If you're programmatically constructing a complex
                // expression, and you want to check the Expr you've created
                // is what you think it is, you can also print out the
                // expression itself, e.g. with string interpolation and
                // Console.WriteLine:
                var fizz = new HSVar("fizz");
                var buzz = new HSVar("buzz");
                var e    = new HSExpr(1);
                for (int i = 2; i < 100; i++)
                {
                    if (i % 3 == 0 && i % 5 == 0)
                    {
                        e += fizz * buzz;
                    }
                    else if (i % 3 == 0)
                    {
                        e += fizz;
                    }
                    else if (i % 5 == 0)
                    {
                        e += buzz;
                    }
                    else
                    {
                        e += i;
                    }
                }

                Console.WriteLine($"Printing a complex Expr: {e}");
                // Click to show output ...
            }

            Console.WriteLine("Success!");
            return(0);
        }
Exemple #11
0
        public static int Main(string[] args)
        {
            // We're going to define and schedule our gradient function in
            // several different ways, and see what order pixels are computed
            // in.

            var x = new HSVar("x");
            var y = new HSVar("y");

            // First we observe the default ordering.
            {
                var gradient = new HSFunc("gradient");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // By default we walk along the rows and then down the
                // columns. This means x varies quickly, and y varies
                // slowly. x is the column and y is the row, so this is a
                // row-major traversal.
                Console.WriteLine("Evaluating gradient row-major");
                var output = gradient.Realize <int>(4, 4);

                // See figures/lesson_05_row_major.gif for a visualization of
                // what this did.

                // The equivalent C is:
                Console.WriteLine("Equivalent C:");
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xx = 0; xx < 4; xx++)
                    {
                        Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                    }
                }
                Console.WriteLine("\n");

                // Tracing is one useful way to understand what a schedule is
                // doing. You can also ask Halide to print out pseudocode
                // showing what loops Halide is generating:
                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();

                // Because we're using the default ordering, it should print:
                // compute gradient:
                //   for y:
                //     for x:
                //       gradient(...) = ...
            }

            // Reorder variables.
            {
                var gradient = new HSFunc("gradient_col_major");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // If we reorder x and y, we can walk down the columns
                // instead. The reorder call takes the arguments of the func,
                // and sets a new nesting order for the for loops that are
                // generated. The arguments are specified from the innermost
                // loop out, so the following call puts y in the inner loop:
                gradient.Reorder(y, x);

                // This means y (the row) will vary quickly, and x (the
                // column) will vary slowly, so this is a column-major
                // traversal.

                Console.WriteLine("Evaluating gradient column-major");
                var output = gradient.Realize <int>(4, 4);

                // See figures/lesson_05_col_major.gif for a visualization of
                // what this did.

                Console.WriteLine("Equivalent C:");
                for (int xx = 0; xx < 4; xx++)
                {
                    for (int yy = 0; yy < 4; yy++)
                    {
                        Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                    }
                }
                Console.WriteLine();

                // If we print pseudo-code for this schedule, we'll see that
                // the loop over y is now inside the loop over x.
                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Split a variable into two.
            {
                var gradient = new HSFunc("gradient_split");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // The most powerful primitive scheduling operation you can do
                // to a var is to split it into inner and outer sub-variables:
                var x_outer = new HSVar("x_outer");
                var x_inner = new HSVar("x_inner");
                gradient.Split(x, x_outer, x_inner, 2);

                // This breaks the loop over x into two nested loops: an outer
                // one over x_outer, and an inner one over x_inner. The last
                // argument to split was the "split factor". The inner loop
                // runs from zero to the split factor. The outer loop runs
                // from zero to the extent required of x (4 in this case)
                // divided by the split factor. Within the loops, the old
                // variable is defined to be outer * factor + inner. If the
                // old loop started at a value other than zero, then that is
                // also added within the loops.

                Console.WriteLine("Evaluating gradient with x split into x_outer and x_inner ");
                var output = gradient.Realize <int>(4, 4);

                Console.WriteLine("Equivalent C:");
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xOuter = 0; xOuter < 2; xOuter++)
                    {
                        for (int xInner = 0; xInner < 2; xInner++)
                        {
                            int xx = xOuter * 2 + xInner;
                            Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                        }
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();

                // Note that the order of evaluation of pixels didn't actually
                // change! Splitting by itself does nothing, but it does open
                // up all of the scheduling possibilities that we will explore
                // below.
            }

            // Fuse two variables into one.
            {
                var gradient = new HSFunc("gradient_fused");
                gradient[x, y] = x + y;

                // The opposite of splitting is 'fusing'. Fusing two variables
                // merges the two loops into a single for loop over the
                // product of the extents. Fusing is less important than
                // splitting, but it also sees use (as we'll see later in this
                // lesson). Like splitting, fusing by itself doesn't change
                // the order of evaluation.
                var fused = new HSVar("fused");
                gradient.Fuse(x, y, fused);

                Console.WriteLine("Evaluating gradient with x and y fused");
                var output = gradient.Realize <int>(4, 4);

                Console.WriteLine("Equivalent C:");
                for (int f = 0; f < 4 * 4; f++)
                {
                    int yy = f / 4;
                    int xx = f % 4;
                    Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Evaluating in tiles.
            {
                var gradient = new HSFunc("gradient_tiled");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // Now that we can both split and reorder, we can do tiled
                // evaluation. Let's split both x and y by a factor of four,
                // and then reorder the vars to express a tiled traversal.
                //
                // A tiled traversal splits the domain into small rectangular
                // tiles, and outermost iterates over the tiles, and within
                // that iterates over the points within each tile. It can be
                // good for performance if neighboring pixels use overlapping
                // input data, for example in a blur. We can express a tiled
                // traversal like so:
                var x_outer = new HSVar("x_outer");
                var x_inner = new HSVar("x_inner");
                var y_outer = new HSVar("y_outer");
                var y_inner = new HSVar("y_inner");
                gradient.Split(x, x_outer, x_inner, 4);
                gradient.Split(y, y_outer, y_inner, 4);
                gradient.Reorder(x_inner, y_inner, x_outer, y_outer);

                // This pattern is common enough that there's a shorthand for it:
                // gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);

                Console.WriteLine("Evaluating gradient in 4x4 tiles");
                var output = gradient.Realize <int>(8, 8);

                // See figures/lesson_05_tiled.gif for a visualization of this
                // schedule.

                Console.WriteLine("Equivalent C:");
                for (int yOuter = 0; yOuter < 2; yOuter++)
                {
                    for (int xOuter = 0; xOuter < 2; xOuter++)
                    {
                        for (int yInner = 0; yInner < 4; yInner++)
                        {
                            for (int xInner = 0; xInner < 4; xInner++)
                            {
                                int xx = xOuter * 4 + xInner;
                                int yy = yOuter * 4 + yInner;
                                Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                            }
                        }
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Evaluating in vectors.
            {
                var gradient = new HSFunc("gradient_in_vectors");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // The nice thing about splitting is that it guarantees the
                // inner variable runs from zero to the split factor. Most of
                // the time the split-factor will be a compile-time constant,
                // so we can replace the loop over the inner variable with a
                // single vectorized computation. This time we'll split by a
                // factor of four, because on X86 we can use SSE to compute in
                // 4-wide vectors.
                var x_outer = new HSVar("x_outer");
                var x_inner = new HSVar("x_inner");
                gradient.Split(x, x_outer, x_inner, 4);
                gradient.Vectorize(x_inner);

                // Splitting and then vectorizing the inner variable is common
                // enough that there's a short-hand for it. We could have also
                // said:
                //
                // gradient.vectorize(x, 4);
                //
                // which is equivalent to:
                //
                // gradient.split(x, x, x_inner, 4);
                // gradient.vectorize(x_inner);
                //
                // Note that in this case we reused the name 'x' as the new
                // outer variable. Later scheduling calls that refer to x
                // will refer to this new outer variable named x.

                // This time we'll evaluate over an 8x4 box, so that we have
                // more than one vector of work per scanline.
                Console.WriteLine("Evaluating gradient with x_inner vectorized ");
                var output = gradient.Realize <int>(8, 4);

                // See figures/lesson_05_vectors.gif for a visualization.

                Console.WriteLine("Equivalent C:");
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xOuter = 0; xOuter < 2; xOuter++)
                    {
                        // The loop over x_inner has gone away, and has been
                        // replaced by a vectorized version of the
                        // expression. On x86 processors, Halide generates SSE
                        // for all of this.
                        int[] x_vec = { xOuter * 4 + 0,
                              xOuter * 4 + 1,
                              xOuter * 4 + 2,
                              xOuter * 4 + 3 };
                        int[] val = { x_vec[0] + yy,
                                      x_vec[1] + yy,
                                      x_vec[2] + yy,
                                      x_vec[3] + yy };
                        Console.WriteLine($"Evaluating at " +
                                          $"<{x_vec[0]}, {x_vec[1]}, {x_vec[2]}, {x_vec[3]}>, " +
                                          $"<{yy}, {yy}, {yy}, {yy}>: " +
                                          $"<{val[0]}, {val[1]}, {val[2]}, {val[3]}>");
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Unrolling a loop.
            {
                var gradient = new HSFunc("gradient_unroll");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // If multiple pixels share overlapping data, it can make
                // sense to unroll a computation so that shared values are
                // only computed or loaded once. We do this similarly to how
                // we expressed vectorizing. We split a dimension and then
                // fully unroll the loop of the inner variable. Unrolling
                // doesn't change the order in which things are evaluated.
                var x_outer = new HSVar("x_outer");
                var x_inner = new HSVar("x_inner");
                gradient.Split(x, x_outer, x_inner, 2);
                gradient.Unroll(x_inner);

                // The shorthand for this is:
                // gradient.unroll(x, 2);

                Console.WriteLine("Evaluating gradient unrolled by a factor of two");
                var result = gradient.Realize <int>(4, 4);

                Console.WriteLine("Equivalent C:");
                for (int yy = 0; yy < 4; yy++)
                {
                    for (int xOuter = 0; xOuter < 2; xOuter++)
                    {
                        // Instead of a for loop over x_inner, we get two
                        // copies of the innermost statement.
                        {
                            int xInner = 0;
                            int xx     = xOuter * 2 + xInner;
                            Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                        }
                        {
                            int xInner = 1;
                            int xx     = xOuter * 2 + xInner;
                            Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                        }
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Splitting by factors that don't divide the extent.
            {
                var gradient = new HSFunc("gradient_split_7x2");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // Splitting guarantees that the inner loop runs from zero to
                // the split factor, which is important for the uses we saw
                // above. So what happens when the total extent we wish to
                // evaluate x over isn't a multiple of the split factor? We'll
                // split by a factor of three, and we'll evaluate gradient over a
                // 7x2 box instead of the 4x4 box we've been using.
                var x_outer = new HSVar("x_outer");
                var x_inner = new HSVar("x_inner");
                gradient.Split(x, x_outer, x_inner, 3);

                Console.WriteLine("Evaluating gradient over a 7x2 box with x split by three ");
                var output = gradient.Realize <int>(7, 2);

                // See figures/lesson_05_split_7_by_3.gif for a visualization
                // of what happened. Note that some points get evaluated more
                // than once!

                Console.WriteLine("Equivalent C:");
                for (int yy = 0; yy < 2; yy++)
                {
                    for (int xOuter = 0; xOuter < 3; xOuter++)   // Now runs from 0 to 2
                    {
                        for (int xInner = 0; xInner < 3; xInner++)
                        {
                            int xx = xOuter * 3;
                            // Before we add x_inner, make sure we don't
                            // evaluate points outside of the 7x2 box. We'll
                            // clamp x to be at most 4 (7 minus the split
                            // factor).
                            if (xx > 4)
                            {
                                xx = 4;
                            }
                            xx += xInner;
                            Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}");
                        }
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();

                // If you read the output, you'll see that some coordinates
                // were evaluated more than once. That's generally OK, because
                // pure Halide functions have no side-effects, so it's safe to
                // evaluate the same point multiple times. If you're calling
                // out to C functions like we are, it's your responsibility to
                // make sure you can handle the same point being evaluated
                // multiple times.

                // The general rule is: If we require x from x_min to x_min + x_extent, and
                // we split by a factor 'factor', then:
                //
                // x_outer runs from 0 to (x_extent + factor - 1)/factor
                // x_inner runs from 0 to factor
                // x = min(x_outer * factor, x_extent - factor) + x_inner + x_min
                //
                // In our example, x_min was 0, x_extent was 7, and factor was 3.

                // However, if you write a Halide function with an update
                // definition (see lesson 9), then it is not safe to evaluate
                // the same point multiple times, so we won't apply this
                // trick. Instead the range of values computed will be rounded
                // up to the next multiple of the split factor.
            }

            // Fusing, tiling, and parallelizing.
            {
                // We saw in the previous lesson that we can parallelize
                // across a variable. Here we combine it with fusing and
                // tiling to express a useful pattern - processing tiles in
                // parallel.

                // This is where fusing shines. Fusing helps when you want to
                // parallelize across multiple dimensions without introducing
                // nested parallelism. Nested parallelism (parallel for loops
                // within parallel for loops) is supported by Halide, but
                // often gives poor performance compared to fusing the
                // parallel variables into a single parallel for loop.

                var gradient = new HSFunc("gradient_fused_tiles");
                gradient[x, y] = x + y;
                gradient.TraceStores();

                // First we'll tile, then we'll fuse the tile indices and
                // parallelize across the combination.
                var x_outer    = new HSVar("x_outer");
                var y_outer    = new HSVar("y_outer");
                var x_inner    = new HSVar("x_inner");
                var y_inner    = new HSVar("y_inner");
                var tile_index = new HSVar("tile_index");
                gradient.Tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
                gradient.Fuse(x_outer, y_outer, tile_index);
                gradient.Parallel(tile_index);

                // The scheduling calls all return a reference to the Func, so
                // you can also chain them together into a single statement to
                // make things slightly clearer:
                //
                // gradient
                //     .tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)
                //     .fuse(x_outer, y_outer, tile_index)
                //     .parallel(tile_index);


                Console.WriteLine("Evaluating gradient tiles in parallel");
                var output = gradient.Realize <int>(8, 8);

                // The tiles should occur in arbitrary order, but within each
                // tile the pixels will be traversed in row-major order. See
                // figures/lesson_05_parallel_tiles.gif for a visualization.

                Console.WriteLine("Equivalent (serial) C:\n");
                // This outermost loop should be a parallel for loop, but that's hard in C.
                for (int ti = 0; ti < 4; ti++)
                {
                    int yOuter = ti / 2;
                    int xOuter = ti % 2;
                    for (int j_inner = 0; j_inner < 4; j_inner++)
                    {
                        for (int i_inner = 0; i_inner < 4; i_inner++)
                        {
                            int j = yOuter * 4 + j_inner;
                            int i = xOuter * 4 + i_inner;
                            Console.WriteLine($"Evaluating at x = {i}, y = {j}: {i + j}");
                        }
                    }
                }

                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient.PrintLoopNest();
                Console.WriteLine();
            }

            // Putting it all together.
            {
                // Are you ready? We're going to use all of the features above now.
                var gradient_fast = new HSFunc("gradient_fast");
                gradient_fast[x, y] = x + y;

                // We'll process 64x64 tiles in parallel.
                var x_outer    = new HSVar("x_outer");
                var y_outer    = new HSVar("y_outer");
                var x_inner    = new HSVar("x_inner");
                var y_inner    = new HSVar("y_inner");
                var tile_index = new HSVar("tile_index");
                gradient_fast
                .Tile(x, y, x_outer, y_outer, x_inner, y_inner, 64, 64)
                .Fuse(x_outer, y_outer, tile_index)
                .Parallel(tile_index);

                // We'll compute two scanlines at once while we walk across
                // each tile. We'll also vectorize in x. The easiest way to
                // express this is to recursively tile again within each tile
                // into 4x2 subtiles, then vectorize the subtiles across x and
                // unroll them across y:
                var x_inner_outer = new HSVar("x_inner_outer");
                var y_inner_outer = new HSVar("y_inner_outer");
                var x_vectors     = new HSVar("x_vectors");
                var y_pairs       = new HSVar("y_pairs");
                gradient_fast
                .Tile(x_inner, y_inner, x_inner_outer, y_inner_outer, x_vectors, y_pairs, 4, 2)
                .Vectorize(x_vectors)
                .Unroll(y_pairs);
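
                // With this schedule the generated loop nest should look
                // roughly like: a parallel loop over tile_index, then
                // y_inner_outer and x_inner_outer walking each 64x64 tile in
                // 4x2 sub-tiles, with y_pairs unrolled and x_vectors lowered
                // to 4-wide vector operations. The PrintLoopNest call at the
                // end of this block shows the exact structure.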

                // Note that we didn't do any explicit splitting or
                // reordering. Those are the most important primitive
                // operations, but mostly they are buried underneath tiling,
                // vectorizing, or unrolling calls.
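
                // (A Tile call is really just shorthand for two splits
                // followed by a reorder. Assuming the wrapper mirrors
                // Halide's split and reorder as Split and Reorder, the first
                // Tile above could have been spelled out as:
                //
                // gradient_fast
                //     .Split(x, x_outer, x_inner, 64)
                //     .Split(y, y_outer, y_inner, 64)
                //     .Reorder(x_inner, y_inner, x_outer, y_outer);
                //
                // which is why explicit splits and reorders rarely show up.)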

                // Now let's evaluate this over a range which is not a
                // multiple of the tile size.

                // If you like you can turn on tracing, but it's going to
                // produce a lot of printfs. Instead we'll compute the answer
                // both in C and Halide and see if the answers match.
                var result = gradient_fast.Realize <int>(350, 250);

                // See figures/lesson_05_fast.mp4 for a visualization.
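
                // 350 and 250 are not multiples of 64, so Halide shifts the
                // last tile in each dimension inward so that it still lies
                // within the output (the edge tiles overlap their neighbours
                // and a few pixels get computed twice, which is harmless for
                // a pure function like this). The Math.Min calls below mirror
                // that behaviour.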

                Console.WriteLine("Checking Halide result against equivalent C...");
                for (int tileIndex = 0; tileIndex < 6 * 4; tileIndex++)
                {
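                    // A 350x250 output splits into 6 tiles across x and 4
                    // down y, so decode the fused tile index accordingly.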
                    int yOuter = tileIndex / 6;
                    int xOuter = tileIndex % 6;
                    for (int yInnerOuter = 0; yInnerOuter < 64 / 2; yInnerOuter++)
                    {
                        for (int xInnerOuter = 0; xInnerOuter < 64 / 4; xInnerOuter++)
                        {
                            // We're vectorized across x
                            int   xx   = Math.Min(xOuter * 64, 350 - 64) + xInnerOuter * 4;
                            int[] xVec = { xx + 0,
                                           xx + 1,
                                           xx + 2,
                                           xx + 3 };

                            // And we unrolled across y
                            int yBase = Math.Min(yOuter * 64, 250 - 64) + yInnerOuter * 2;
                            {
                                // y_pairs = 0
                                int   yy   = yBase + 0;
                                int[] yVec = { yy, yy, yy, yy };
                                int[] val  = { xVec[0] + yVec[0],
                                               xVec[1] + yVec[1],
                                               xVec[2] + yVec[2],
                                               xVec[3] + yVec[3] };

                                // Check the result.
                                for (int i = 0; i < 4; i++)
                                {
                                    if (result[xVec[i], yVec[i]] != val[i])
                                    {
                                        Console.WriteLine($"There was an error at {xVec[i]} {yVec[i]}!");
                                        return(-1);
                                    }
                                }
                            }
                            {
                                // y_pairs = 1
                                int   yy   = yBase + 1;
                                int[] yVec = { yy, yy, yy, yy };
                                int[] val  = { xVec[0] + yVec[0],
                                               xVec[1] + yVec[1],
                                               xVec[2] + yVec[2],
                                               xVec[3] + yVec[3] };

                                // Check the result.
                                for (int i = 0; i < 4; i++)
                                {
                                    if (result[xVec[i], yVec[i]] != val[i])
                                    {
                                        Console.WriteLine($"There was an error at {xVec[i]} {yVec[i]}!");
                                        return(-1);
                                    }
                                }
                            }
                        }
                    }
                }
                Console.WriteLine();

                Console.WriteLine("Pseudo-code for the schedule:");
                gradient_fast.PrintLoopNest();
                Console.WriteLine();

                // Note that in the Halide version, the algorithm is specified
                // once at the top, separately from the optimizations, and there
                // aren't that many lines of code total. Compare this to the C
                // version. There's more code (and it isn't even parallelized or
                // vectorized properly). More annoyingly, the statement of the
                // algorithm (the result is x plus y) is buried in multiple places
                // within the mess. This C code is hard to write, hard to read,
                // hard to debug, and hard to optimize further. This is why Halide
                // exists.
            }


            Console.WriteLine("Success!");
            return(0);
        }