public void TestPerformance() {
    // Time the scheduled MyPipeline by realizing it repeatedly.
    var output = new HSBuffer<byte>(Input.Width, Input.Height, Input.Channels);

    // A throwaway run warms up any GPU runtime state before timing.
    Curved.Realize(output);

    // Keep the fastest of three timed batches.
    double bestTime = 0.0;
    for (int run = 0; run < 3; run++) {
        var stopwatch = Stopwatch.StartNew();

        // Each batch realizes the filter 100 times.
        for (int rep = 0; rep < 100; rep++) {
            Curved.Realize(output);
        }

        // Copying the buffer back to the CPU forces any pending GPU work
        // to finish before we stop the clock.
        output.CopyToHost();
        stopwatch.Stop();

        var elapsed = stopwatch.ElapsedMilliseconds;
        if (run == 0 || elapsed < bestTime) {
            bestTime = elapsed;
        }
    }

    Console.WriteLine($"{bestTime} milliseconds");
}
public static int Main(string[] args) {
    // Lesson 1's single-stage pipeline again, but this lesson is about
    // debugging: objects don't know their own identifiers, so we pass a
    // string to the Func and Var constructors to name them for debugging.
    var gradient = new HSFunc("gradient");
    var x = new HSVar("x");
    var y = new HSVar("y");
    gradient[x, y] = x + y;

    // Realize over a tiny domain; this lesson is about inspecting the
    // generated code, not the image itself.
    var output = gradient.Realize<int>(8, 8);

    // That line compiled and ran the pipeline. Run with the environment
    // variable HL_DEBUG_CODEGEN=1 to see each compilation stage plus a
    // pseudocode dump of the final pipeline; higher values show more
    // detail (HL_DEBUG_CODEGEN=2 includes the generated llvm bitcode).

    // Halide can also emit an HTML version of this output, with syntax
    // highlighting and code folding, which is nicer to read for large
    // pipelines - open gradient.html after running this tutorial.
    gradient.CompileToLoweredStmt("gradient.html", HSOutputFormat.HS_HTML);

    // The pseudocode is usually enough to understand what Halide
    // generates; the next lesson shows how to snoop on Halide at runtime.
    Console.WriteLine("Success!");
    return 0;
}
public static void Main() {
    // Fill a 2x2 buffer, then realize a Func over a much larger domain to
    // deliberately trigger an out-of-bounds error from Halide.
    var buffer = new HSBuffer<int>(2, 2);
    for (var row = 0; row < 2; row++) {
        for (var col = 0; col < 2; col++) {
            buffer[row, col] = col * row;
        }
    }

    var errilicious = new HSFunc("EISFORERROR");
    var x = new HSVar("x");
    var y = new HSVar("y");
    errilicious[x, y] = buffer[x, y] * buffer[x, y];

    // The 100x100 realization domain is far larger than the 2x2 buffer,
    // so bounds inference should reject this with an error.
    var result = errilicious.Realize<int>(100, 100);
}
public static int Main(string[] args) {
    // Declare the Vars used throughout: x/y are pixel coordinates, c is
    // the color channel.
    var x = new HSVar("x");
    var y = new HSVar("y");
    var c = new HSVar("c");

    // A multi-stage pipeline that blurs an image horizontally, then
    // vertically.
    {
        // Take a color 8-bit input.
        var input = HSBuffer<byte>.LoadImage("rgb.png");

        // Upgrade it to 16-bit, so we can do math without it overflowing.
        var input_16 = new HSFunc("input_16");
        input_16[x, y, c] = HS.Cast<ushort>(input[x, y, c]);

        // Blur it horizontally with a 1-2-1 kernel:
        var blur_x = new HSFunc("blur_x");
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4;

        // Blur it vertically:
        var blur_y = new HSFunc("blur_y");
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4;

        // Convert back to 8-bit.
        var output = new HSFunc("output");
        output[x, y, c] = HS.Cast<byte>(blur_y[x, y, c]);

        // Each Func in this pipeline calls a previously-defined Func.
        // A Func may only call Funcs that already have a definition, which
        // prevents loops: Halide pipelines are always feed-forward graphs.

        // Now let's realize it...
        //   Buffer<byte> result = output.realize(input.width(), input.height(), 3);
        // Except that the line above would fail. Realizing over the same
        // domain as the input requires reading out of bounds: blur_x
        // reaches outwards horizontally, blur_y vertically. Halide injects
        // code at the top of the pipeline that computes the region of the
        // input that will be read, detects the out-of-bounds access before
        // running, and refuses to continue. (No per-pixel bounds checks
        // happen in the inner loop; that would be slow.)

        // Option 1: realize over a domain shifted inwards by one pixel, so
        // the pipeline never reads out of bounds (as in the previous
        // lesson):
        var result = new HSBuffer<byte>(input.Width - 2, input.Height - 2, 3);
        result.SetMin(1, 1);
        output.Realize(result);

        // Save the result: a slightly blurry parrot, two pixels narrower
        // and two pixels shorter than the input image.
        result.SaveImage("blurry_parrot_1.png");

        // Not reading out of bounds is usually the fastest way to deal
        // with boundaries; the more general solution is the next example.
    }

    // The same pipeline, with a boundary condition on the input.
    {
        // Take a color 8-bit input.
        var input = HSBuffer<byte>.LoadImage("rgb.png");

        // This time, wrap the input in a Func that prevents reading out of
        // bounds:
        var clamped = new HSFunc("clamped");

        // Clamp x to lie within [0, input.Width - 1].
        // clamp(x, a, b) is equivalent to max(min(x, b), a).
        var clamped_x = HS.Clamp(x, 0, input.Width - 1);

        // Similarly clamp y.
        var clamped_y = HS.Clamp(y, 0, input.Height - 1);

        // Loading at the clamped coordinates means 'clamped' never reads
        // outside the input, however it is evaluated. This is a
        // clamp-to-edge boundary condition - the simplest one to express
        // in Halide.
        clamped[x, y, c] = input[clamped_x, clamped_y, c];

        // The same thing can be written more concisely with a helper from
        // the BoundaryConditions namespace:
        //
        //   clamped = BoundaryConditions::repeat_edge(input);
        //
        // Those helpers matter for other boundary conditions, because they
        // express the condition in the form Halide can best understand and
        // optimize; used correctly they are as cheap as having no boundary
        // condition at all.

        // Upgrade to 16-bit as before, but refer to our new Func 'clamped'
        // instead of the input image directly.
        var input_16 = new HSFunc("input_16");
        input_16[x, y, c] = HS.Cast<ushort>(clamped[x, y, c]);

        // The rest of the pipeline is the same...

        // Blur it horizontally:
        var blur_x = new HSFunc("blur_x");
        blur_x[x, y, c] = (input_16[x - 1, y, c] +
                           2 * input_16[x, y, c] +
                           input_16[x + 1, y, c]) / 4;

        // Blur it vertically:
        var blur_y = new HSFunc("blur_y");
        blur_y[x, y, c] = (blur_x[x, y - 1, c] +
                           2 * blur_x[x, y, c] +
                           blur_x[x, y + 1, c]) / 4;

        // Convert back to 8-bit.
        var output = new HSFunc("output");
        output[x, y, c] = HS.Cast<byte>(blur_y[x, y, c]);

        // This time it's safe to evaluate the output over the same domain
        // as the input, because we have a boundary condition.
        var result = output.Realize<byte>(input.Width, input.Height, 3);

        // Save the result: still a slightly blurry parrot, but this time
        // the same size as the input.
        result.SaveImage("blurry_parrot_2.png");
    }

    Console.WriteLine("Success!");
    return (0);
}
public static int Main(string[] args) {
    // Declare the Vars used below; xo/yo/xi/yi name the outer/inner loops
    // produced by split and tile directives.
    var x = new HSVar("x");
    var y = new HSVar("y");
    var xo = new HSVar("xo");
    var yo = new HSVar("yo");
    var xi = new HSVar("xi");
    var yi = new HSVar("yi");

    // This lesson is about "wrapping" a Func or an ImageParam using the
    // Func::in and ImageParam::in directives.
    {
        {
            // Consider a simple two-stage pipeline:
            var f = new HSFunc("f_local");
            var g = new HSFunc("g_local");
            f[x, y] = x + y;
            g[x, y] = 2 * f[x, y] + 3;

            f.ComputeRoot();
            // This produces the following loop nests:
            //   for y: for x: f(x, y) = x + y
            //   for y: for x: g(x, y) = 2 * f(x, y) + 3

            // Using Func::in, we can interpose a new Func between f and g
            // using the schedule alone:
            HSFunc f_in_g = f.In(g);
            f_in_g.ComputeRoot();
            // (Equivalently, chain the schedules: f.in(g).ComputeRoot();)
            // This produces three loop nests:
            //   for y: for x: f(x, y) = x + y
            //   for y: for x: f_in_g(x, y) = f(x, y)
            //   for y: for x: g(x, y) = 2 * f_in_g(x, y) + 3

            g.Realize<int>(5, 5);
            // See figures/lesson_19_wrapper_local.mp4 for a visualization.
        }

        // f.in(g) replaces all calls to 'f' inside 'g' with a wrapper Func
        // and returns that wrapper. Essentially, it rewrites the original
        // pipeline above into the following:
        {
            var f_in_g = new HSFunc("f_in_g");
            var f = new HSFunc("f");
            var g = new HSFunc("g");
            f[x, y] = x + y;
            f_in_g[x, y] = f[x, y];
            g[x, y] = 2 * f_in_g[x, y] + 3;
            f.ComputeRoot();
            f_in_g.ComputeRoot();
            g.ComputeRoot();
        }
        // In isolation such a transformation seems pointless, but it
        // enables a variety of scheduling tricks, shown below.
    }

    {
        // f.in(g) only redirects the calls made by 'g'; other callers
        // still reach 'f' directly. To globally replace all calls to 'f'
        // with a single wrapper, use f.in() with no argument.
        // A three-stage pipeline with two consumers of f:
        var f = new HSFunc("f_global");
        var g = new HSFunc("g_global");
        var h = new HSFunc("h_global");
        f[x, y] = x + y;
        g[x, y] = 2 * f[x, y];
        h[x, y] = 3 + g[x, y] - f[x, y];
        f.ComputeRoot();
        g.ComputeRoot();
        h.ComputeRoot();

        // Replace all calls to 'f' inside both 'g' and 'h' with calls to
        // one shared wrapper:
        f.In().ComputeRoot();
        // Equivalent loop nests:
        //   for y: for x: f(x, y) = x + y
        //   for y: for x: f_in(x, y) = f(x, y)
        //   for y: for x: g(x, y) = 2 * f_in(x, y)
        //   for y: for x: h(x, y) = 3 + g(x, y) - f_in(x, y)

        h.Realize<int>(5, 5);
        // See figures/lesson_19_wrapper_global.mp4 for a visualization.
    }

    {
        // We could also give g and h their own unique wrappers of f, each
        // scheduled inside its consumer's loop nest - something a single
        // global wrapper cannot do.
        var f = new HSFunc("f_unique");
        var g = new HSFunc("g_unique");
        var h = new HSFunc("h_unique");
        f[x, y] = x + y;
        g[x, y] = 2 * f[x, y];
        h[x, y] = 3 + g[x, y] - f[x, y];
        f.ComputeRoot();
        g.ComputeRoot();
        h.ComputeRoot();
        f.In(g).ComputeAt(g, y);
        f.In(h).ComputeAt(h, y);
        // This creates the loop nests:
        //   for y: for x: f(x, y) = x + y
        //   for y:
        //     for x: f_in_g(x, y) = f(x, y)
        //     for x: g(x, y) = 2 * f_in_g(x, y)
        //   for y:
        //     for x: f_in_h(x, y) = f(x, y)
        //     for x: h(x, y) = 3 + g(x, y) - f_in_h(x, y)

        h.Realize<int>(5, 5);
        // See figures/lesson_19_wrapper_unique.mp4 for a visualization.
    }

    {
        // So far this may look like pointless copying. Combined with other
        // directives, wrappers create distinct realizations of a Func for
        // several consumers, each scheduled differently. Nearly the same
        // pipeline:
        var f = new HSFunc("f_sched");
        var g = new HSFunc("g_sched");
        var h = new HSFunc("h_sched");
        f[x, y] = x + y;
        g[x, y] = 2 * f[x, y];
        // h will use a far-away region of f.
        h[x, y] = 3 + g[x, y] - f[x + 93, y - 87];

        // This time we'll inline f (deliberately NOT scheduled root):
        // f.ComputeRoot();
        g.ComputeRoot();
        h.ComputeRoot();

        // g and h now call f via distinct wrappers. The wrappers are
        // scheduled but f is not, so f is inlined into both wrappers; each
        // independently computes just the region its consumer needs. Had
        // we scheduled f ComputeRoot, we'd compute the bounding box of
        // both regions - mostly unused data.
        f.In(g).ComputeAt(g, y);
        f.In(h).ComputeAt(h, y);

        // For scheduling purposes, wrappers inherit the pure vars of the
        // Func they wrap, so each wrapper can be scheduled differently
        // using the same x and y we used to define f:
        f.In(g).Vectorize(x, 4);
        f.In(h).Split(x, xo, xi, 2).Reorder(xo, xi);
        // Calling f.in(g) a second time returns the wrapper created by the
        // first call; it doesn't make a new one.

        h.Realize<int>(8, 8);
        // See figures/lesson_19_wrapper_vary_schedule.mp4. Because f is
        // inlined into its two wrappers, the wrappers do the work of
        // computing f rather than just loading a computed realization.
    }

    {
        // Func::in is useful to stage loads from a Func via some smaller
        // intermediate buffer, perhaps on the stack or in shared GPU
        // memory. Consider transposing a ComputeRoot'd Func:
        var f = new HSFunc("f_transpose");
        var g = new HSFunc("g_transpose");
        f[x, y] = HSMath.Sin(((x + y) * HSMath.Sqrt(y)) / 10);
        f.ComputeRoot();
        g[x, y] = f[y, x];

        // Strategy: load a 4x4 tile of f into registers, transpose it
        // in-register, then write it out as a 4x4 tile of g, giving a
        // three-stage pipeline f -> f_tile -> g.
        HSFunc f_tile = f.In(g);

        // f_tile loads vectors of f and stores them transposed into
        // registers; g then writes that data back to main memory.
        g.Tile(x, y, xo, yo, xi, yi, 4, 4)
         .Vectorize(xi)
         .Unroll(yi);

        // Compute f_tile per tile of g, and use ReorderStorage to store it
        // column-major so g's loads from it are dense vector loads. The
        // full unrolling/vectorization of all loops inside its compute_at
        // level means f_tile is only ever accessed at constant indices, so
        // its allocation can be promoted into registers.
        f_tile.ComputeAt(g, xo)
              .ReorderStorage(y, x)
              .Vectorize(x)
              .Unroll(y);

        g.Realize<float>(16, 16);
        // See figures/lesson_19_transpose.mp4 for a visualization.
    }

    {
        // ImageParam::in behaves the same way as Func::in. Here we stage
        // tiles of an input image into GPU shared memory, using
        // shared/local memory as an explicitly-managed cache.
        var img = new HSImageParam<int>(2);

        // Compute a small 3x3 box sum of the input.
        var blur = new HSFunc("blur");
        blur[x, y] = (img[x - 1, y - 1] + img[x, y - 1] + img[x + 1, y - 1] +
                      img[x - 1, y] + img[x, y] + img[x + 1, y] +
                      img[x - 1, y + 1] + img[x, y + 1] + img[x + 1, y + 1]);
        blur.ComputeRoot().GpuTile(x, y, xo, yo, xi, yi, 8, 8);

        // The wrapper Func created by ImageParam::in has pure vars named
        // _0, _1, etc. Schedule it per tile of "blur" and map _0/_1 to gpu
        // threads. Without the wrapper, an 8x8 tile of blur does 8*8*9
        // global loads; with it, the wrapper does 10*10 global loads up
        // front and blur then reads shared/local memory.
        img.In(blur).ComputeAt(blur, xo).GpuThreads(HS._0, HS._1);

        // Select an appropriate GPU API, as we did in lesson 12.
        var target = HS.GetHostTarget();
        if (target.OS == HSOperatingSystem.OSX) {
            target.SetFeature(HSFeature.Metal);
        } else {
            target.SetFeature(HSFeature.OpenCL);
        }

        // Create an interesting input image. SetMin(-1, -1) gives it a
        // one-pixel apron so the blur can read just beyond the 256x256
        // output domain.
        var input = new HSBuffer<int>(258, 258);
        input.SetMin(-1, -1);
        for (int yy = input.Top; yy <= input.Bottom; yy++) {
            for (int xx = input.Left; xx <= input.Right; xx++) {
                input[xx, yy] = xx * 17 + yy % 4;
            }
        }

        img.Set(input);
        blur.CompileJit(target);
        var output = blur.Realize<int>(256, 256);

        // Check the output against a direct evaluation of the same 3x3
        // sum.
        for (int yy = output.Top; yy <= output.Bottom; yy++) {
            for (int xx = output.Left; xx <= output.Right; xx++) {
                int val = output[xx, yy];
                int expected = (input[xx - 1, yy - 1] + input[xx, yy - 1] + input[xx + 1, yy - 1] +
                                input[xx - 1, yy] + input[xx, yy] + input[xx + 1, yy] +
                                input[xx - 1, yy + 1] + input[xx, yy + 1] + input[xx + 1, yy + 1]);
                if (val != expected) {
                    // NOTE(review): the trailing xx/yy/val/expected args are
                    // redundant - the interpolated string has already
                    // substituted them; harmless but worth cleaning up.
                    Console.WriteLine($"output({xx}, {yy}) = {val} instead of {expected}\n", xx, yy, val, expected);
                    return (-1);
                }
            }
        }
    }

    {
        // Func::in can also group multiple stages of a Func into the same
        // loop nest. This pipeline computes a value per pixel, then sweeps
        // left-to-right and back across each scanline.
        var f = new HSFunc("f_group");
        var g = new HSFunc("g_group");
        var h = new HSFunc("h_group"); // NOTE(review): declared but never used in this block.

        // Initialize f.
        f[x, y] = HSMath.Sin(x - y);
        var r = new HSRDom(1, 7);

        // Sweep from left to right.
        f[r, y] = (f[r, y] + f[r - 1, y]) / 2;

        // Sweep from right to left.
        f[7 - r, y] = (f[7 - r, y] + f[8 - r, y]) / 2;

        // Then something with a complicated access pattern: a 45 degree
        // rotation with wrap-around.
        g[x, y] = f[(x + y) % 8, (x - y) % 8];

        // f should be ComputeRoot because its consumer accesses it in a
        // complicated way - but that would put every stage of f in its own
        // loop nest over y. Computing f at scanlines of a wrapper instead
        // lets all stages of f share one common loop over y (f has the
        // default schedule for a Func with update stages: computed at the
        // innermost loop of its consumer, now the wrapper f.in(g)). The
        // resulting nest has much better locality:
        //   for y:
        //     for x: f(x, y) = sin(x - y)
        //     for r: f(r, y) = (f(r, y) + f(r - 1, y)) / 2
        //     for r: f(7 - r, y) = (f(7 - r, y) + f(8 - r, y)) / 2
        //     for x: f_in_g(x, y) = f(x, y)
        //   for y:
        //     for x: g(x, y) = f_in_g((x + y) % 8, (x - y) % 8)
        f.In(g).ComputeRoot();
        f.ComputeAt(f.In(g), y);

        // Additionally vectorize f's initialization and the transfer of
        // pixel values from f into its wrapper.
        f.Vectorize(x, 4);
        f.In(g).Vectorize(x, 4);

        g.Realize<float>(8, 8);
        // See figures/lesson_19_group_updates.mp4 for a visualization.
    }

    Console.WriteLine("Success!");
    return (0);
}
public static int Main() {
    // A single-stage pipeline that outputs a grayscale diagonal gradient.
    // An HSFunc is a pipeline stage: a pure function defining the value of
    // every pixel - think of it as a computed image. HSVars are just names
    // used as variables in its definition; by convention 'x' is the column
    // index and 'y' is the row index.
    var gradient = new HSFunc("gradient");
    var x = new HSVar("x");
    var y = new HSVar("y");

    // Vars carry operator overloads, so 'x + y' builds an Expr object
    // rather than computing a number. Assigning it defines the Func at
    // every integer coordinate - nothing is computed yet; we are
    // meta-programming, constructing a Halide pipeline in memory.
    gradient[x, y] = x + y;

    // 'Realize' JIT-compiles the pipeline and runs it over an 800x600
    // domain, which fixes the range of x and y and the output resolution.
    // Vars represent 32-bit integers, so 'x + y' is a 32-bit integer Expr
    // and the realized image is 32-bit signed integers - type inference
    // follows C's rules.
    var output = gradient.Realize<int>(800, 600);

    // Verify every pixel equals the sum of its coordinates.
    for (int row = 0; row < output.Height; row++) {
        for (int col = 0; col < output.Width; col++) {
            // Buffer pixels are accessed with the same indexing syntax
            // used to define Funcs.
            if (output[col, row] != col + row) {
                Console.WriteLine("Something went wrong!");
                Console.WriteLine($"Pixel {col}, {row} was supposed to be {col + row}, but instead it's {output[col, row]}");
                return -1;
            }
        }
    }

    // We defined a Func, then realized it to generate and run machine code
    // producing a Buffer.
    Console.WriteLine("Success!");
    return 0;
}
public static int Main(string[] args) { // First we'll declare some Vars to use below. var x = new HSVar("x"); var y = new HSVar("y"); // Let's examine various scheduling options for a simple two stage // pipeline. We'll start with the default schedule: { var producer = new HSFunc("producer_default"); var consumer = new HSFunc("consumer_default"); // The first stage will be some simple pointwise math similar // to our familiar gradient function. The value at position x, // y is the sin of product of x and y. producer[x, y] = HSMath.Sin(x * y); // Now we'll add a second stage which averages together multiple // points in the first stage. consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // We'll turn on tracing for both functions. consumer.TraceStores(); producer.TraceStores(); // And evaluate it over a 4x4 box. Console.WriteLine("\nEvaluating producer-consumer pipeline with default schedule"); consumer.Realize <float>(4, 4); // There were no messages about computing values of the // producer. This is because the default schedule fully // inlines 'producer' into 'consumer'. It is as if we had // written the following code instead: // consumer(x, y) = (sin(x * y) + // sin(x * (y + 1)) + // sin((x + 1) * y) + // sin((x + 1) * (y + 1))/4); // All calls to 'producer' have been replaced with the body of // 'producer', with the arguments substituted in for the // variables. // The equivalent C code is: var result = new float[4, 4]; for (int yy = 0; yy < 4; yy++) { for (int xx = 0; xx < 4; xx++) { result[yy, xx] = (float)((Math.Sin(xx * yy) + Math.Sin(xx * (yy + 1)) + Math.Sin((xx + 1) * yy) + Math.Sin((xx + 1) * (yy + 1))) / 4); } } Console.WriteLine(); // If we look at the loop nest, the producer doesn't appear // at all. It has been inlined into the consumer. 
Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); } // Next we'll examine the next simplest option - computing all // values required in the producer before computing any of the // consumer. We call this schedule "root". { // Start with the same function definitions: var producer = new HSFunc("producer_root"); var consumer = new HSFunc("consumer_root"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // Tell Halide to evaluate all of producer before any of consumer. producer.ComputeRoot(); // Turn on tracing. consumer.TraceStores(); producer.TraceStores(); // Compile and run. Console.WriteLine("\nEvaluating producer.compute_root()"); consumer.Realize <float>(4, 4); // Reading the output we can see that: // A) There were stores to producer. // B) They all happened before any stores to consumer. // See figures/lesson_08_compute_root.gif for a visualization. // The producer is on the left and the consumer is on the // right. Stores are marked in orange and loads are marked in // blue. // Equivalent C: var result = new float[4, 4]; // Allocate some temporary storage for the producer. var producer_storage = new float[5, 5]; // Compute the producer. for (int yy = 0; yy < 5; yy++) { for (int xx = 0; xx < 5; xx++) { producer_storage[yy, xx] = (float)Math.Sin(xx * yy); } } // Compute the consumer. Skip the prints this time. for (int yy = 0; yy < 4; yy++) { for (int xx = 0; xx < 4; xx++) { result[yy, xx] = (producer_storage[yy, xx] + producer_storage[yy + 1, xx] + producer_storage[yy, xx + 1] + producer_storage[yy + 1, xx + 1]) / 4; } } // Note that consumer was evaluated over a 4x4 box, so Halide // automatically inferred that producer was needed over a 5x5 // box. This is the same 'bounds inference' logic we saw in // the previous lesson, where it was used to detect and avoid // out-of-bounds reads from an input image. 
// If we print the loop nest, we'll see something very // similar to the C above. Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); } // Let's compare the two approaches above from a performance // perspective. // Full inlining (the default schedule): // - Temporary memory allocated: 0 // - Loads: 0 // - Stores: 16 // - Calls to sin: 64 // producer.compute_root(): // - Temporary memory allocated: 25 floats // - Loads: 64 // - Stores: 41 // - Calls to sin: 25 // There's a trade-off here. Full inlining used minimal temporary // memory and memory bandwidth, but did a whole bunch of redundant // expensive math (calling sin). It evaluated most points in // 'producer' four times. The second schedule, // producer.compute_root(), did the mimimum number of calls to // sin, but used more temporary memory and more memory bandwidth. // In any given situation the correct choice can be difficult to // make. If you're memory-bandwidth limited, or don't have much // memory (e.g. because you're running on an old cell-phone), then // it can make sense to do redundant math. On the other hand, sin // is expensive, so if you're compute-limited then fewer calls to // sin will make your program faster. Adding vectorization or // multi-core parallelism tilts the scales in favor of doing // redundant work, because firing up multiple cpu cores increases // the amount of math you can do per second, but doesn't increase // your system memory bandwidth or capacity. // We can make choices in between full inlining and // compute_root. 
Next we'll alternate between computing the // producer and consumer on a per-scanline basis: { // Start with the same function definitions: var producer = new HSFunc("producer_y"); var consumer = new HSFunc("consumer_y"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // Tell Halide to evaluate producer as needed per y coordinate // of the consumer: producer.ComputeAt(consumer, y); // This places the code that computes the producer just // *inside* the consumer's for loop over y, as in the // equivalent C below. // Turn on tracing. producer.TraceStores(); consumer.TraceStores(); // Compile and run. Console.WriteLine("\nEvaluating producer.ComputeAt(consumer, y)"); consumer.Realize <float>(4, 4); // See figures/lesson_08_compute_y.gif for a visualization. // Reading the log or looking at the figure you should see // that producer and consumer alternate on a per-scanline // basis. Let's look at the equivalent C: var result = new float[4, 4]; // There's an outer loop over scanlines of consumer: for (int yy = 0; yy < 4; yy++) { // Allocate space and compute enough of the producer to // satisfy this single scanline of the consumer. This // means a 5x2 box of the producer. var producer_storage = new float[2, 5]; for (int py = yy; py < yy + 2; py++) { for (int px = 0; px < 5; px++) { producer_storage[py - yy, px] = (float)Math.Sin(px * py); } } // Compute a scanline of the consumer. for (int xx = 0; xx < 4; xx++) { result[yy, xx] = (producer_storage[0, xx] + producer_storage[1, xx] + producer_storage[0, xx + 1] + producer_storage[1, xx + 1]) / 4; } } // Again, if we print the loop nest, we'll see something very // similar to the C above. Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); // The performance characteristics of this strategy are in // between inlining and compute root. 
We still allocate some // temporary memory, but less that compute_root, and with // better locality (we load from it soon after writing to it, // so for larger images, values should still be in cache). We // still do some redundant work, but less than full inlining: // producer.ComputeAt(consumer, y): // - Temporary memory allocated: 10 floats // - Loads: 64 // - Stores: 56 // - Calls to sin: 40 } // We could also say producer.ComputeAt(consumer, x), but this // would be very similar to full inlining (the default // schedule). Instead let's distinguish between the loop level at // which we allocate storage for producer, and the loop level at // which we actually compute it. This unlocks a few optimizations. { var producer = new HSFunc("producer_root_y"); var consumer = new HSFunc("consumer_root_y"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // Tell Halide to make a buffer to store all of producer at // the outermost level: producer.StoreRoot(); // ... but compute it as needed per y coordinate of the // consumer. producer.ComputeAt(consumer, y); producer.TraceStores(); consumer.TraceStores(); Console.WriteLine("\nEvaluating producer.store_root().ComputeAt(consumer, y)"); consumer.Realize <float>(4, 4); // See figures/lesson_08_store_root_compute_y.gif for a // visualization. // Reading the log or looking at the figure you should see // that producer and consumer again alternate on a // per-scanline basis. It computes a 5x2 box of the producer // to satisfy the first scanline of the consumer, but after // that it only computes a 5x1 box of the output for each new // scanline of the consumer! // // Halide has detected that for all scanlines except for the // first, it can reuse the values already sitting in the // buffer we've allocated for producer. 
Let's look at the // equivalent C: var result = new float[4, 4]; { // producer.store_root() implies that storage goes here: var producer_storage = new float[5, 5]; // There's an outer loop over scanlines of consumer: for (int yy = 0; yy < 4; yy++) { // Compute enough of the producer to satisfy this scanline // of the consumer. for (int py = yy; py < yy + 2; py++) { // Skip over rows of producer that we've already // computed in a previous iteration. if (yy > 0 && py == yy) { continue; } for (int px = 0; px < 5; px++) { producer_storage[py, px] = (float)Math.Sin(px * py); } } // Compute a scanline of the consumer. for (int xx = 0; xx < 4; xx++) { result[yy, xx] = (producer_storage[yy, xx] + producer_storage[yy + 1, xx] + producer_storage[yy, xx + 1] + producer_storage[yy + 1, xx + 1]) / 4; } } } Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); // The performance characteristics of this strategy are pretty // good! The numbers are similar compute_root, except locality // is better. We're doing the minimum number of sin calls, // and we load values soon after they are stored, so we're // probably making good use of the cache: // producer.store_root().ComputeAt(consumer, y): // - Temporary memory allocated: 10 floats // - Loads: 64 // - Stores: 39 // - Calls to sin: 25 // Note that my claimed amount of memory allocated doesn't // match the reference C code. Halide is performing one more // optimization under the hood. It folds the storage for the // producer down into a circular buffer of two // scanlines. Equivalent C would actually look like this: { // Actually store 2 scanlines instead of 5 var producer_storage = new float[2, 5]; for (int yy = 0; yy < 4; yy++) { for (int py = yy; py < yy + 2; py++) { if (yy > 0 && py == yy) { continue; } for (int px = 0; px < 5; px++) { // Stores to producer_storage have their y coordinate bit-masked. 
producer_storage[py & 1, px] = (float)Math.Sin(px * py); } } // Compute a scanline of the consumer. for (int xx = 0; xx < 4; xx++) { // Loads from producer_storage have their y coordinate bit-masked. result[yy, xx] = (producer_storage[yy & 1, xx] + producer_storage[(yy + 1) & 1, xx] + producer_storage[yy & 1, xx + 1] + producer_storage[(yy + 1) & 1, xx + 1]) / 4; } } } } // We can do even better, by leaving the storage outermost, but // moving the computation into the innermost loop: { var producer = new HSFunc("producer_root_x"); var consumer = new HSFunc("consumer_root_x"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // Store outermost, compute innermost. producer.StoreRoot().ComputeAt(consumer, x); producer.TraceStores(); consumer.TraceStores(); Console.WriteLine("\nEvaluating producer.store_root().ComputeAt(consumer, x)"); consumer.Realize <float>(4, 4); // See figures/lesson_08_store_root_compute_x.gif for a // visualization. // You should see that producer and consumer now alternate on // a per-pixel basis. 
Here's the equivalent C: var result = new float[4, 4]; // producer.store_root() implies that storage goes here, but // we can fold it down into a circular buffer of two // scanlines: var producer_storage = new float[2, 5]; // For every pixel of the consumer: for (int yy = 0; yy < 4; yy++) { for (int xx = 0; xx < 4; xx++) { // Compute enough of the producer to satisfy this // pixel of the consumer, but skip values that we've // already computed: if (yy == 0 && xx == 0) { producer_storage[yy & 1, xx] = (float)Math.Sin(xx * yy); } if (yy == 0) { producer_storage[yy & 1, xx + 1] = (float)Math.Sin((xx + 1) * yy); } if (xx == 0) { producer_storage[(yy + 1) & 1, xx] = (float)Math.Sin(xx * (yy + 1)); } producer_storage[(yy + 1) & 1, xx + 1] = (float)Math.Sin((xx + 1) * (yy + 1)); result[yy, xx] = (producer_storage[yy & 1, xx] + producer_storage[(yy + 1) & 1, xx] + producer_storage[yy & 1, xx + 1] + producer_storage[(yy + 1) & 1, xx + 1]) / 4; } } Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); // The performance characteristics of this strategy are the // best so far. One of the four values of the producer we need // is probably still sitting in a register, so I won't count // it as a load: // producer.store_root().ComputeAt(consumer, x): // - Temporary memory allocated: 10 floats // - Loads: 48 // - Stores: 56 // - Calls to sin: 40 } // So what's the catch? Why not always do // producer.store_root().ComputeAt(consumer, x) for this type of // code? // // The answer is parallelism. In both of the previous two // strategies we've assumed that values computed on previous // iterations are lying around for us to reuse. This assumes that // previous values of x or y happened earlier in time and have // finished. This is not true if you parallelize or vectorize // either loop. Darn. 
If you parallelize, Halide won't inject the // optimizations that skip work already done if there's a parallel // loop in between the store_at level and the ComputeAt level, // and won't fold the storage down into a circular buffer either, // which makes our store_root pointless. // We're running out of options. We can make new ones by // splitting. We can store_at or ComputeAt at the natural // variables of the consumer (x and y), or we can split x or y // into new inner and outer sub-variables and then schedule with // respect to those. We'll use this to express fusion in tiles: { var producer = new HSFunc("producer_tile"); var consumer = new HSFunc("consumer_tile"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // We'll compute 8x8 of the consumer, in 4x4 tiles. var x_outer = new HSVar("x_outer"); var y_outer = new HSVar("y_outer"); var x_inner = new HSVar("x_inner"); var y_inner = new HSVar("y_inner"); consumer.Tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4); // Compute the producer per tile of the consumer producer.ComputeAt(consumer, x_outer); // Notice that I wrote my schedule starting from the end of // the pipeline (the consumer). This is because the schedule // for the producer refers to x_outer, which we introduced // when we tiled the consumer. You can write it in the other // order, but it tends to be harder to read. // Turn on tracing. producer.TraceStores(); consumer.TraceStores(); Console.WriteLine("\nEvaluating:"); Console.WriteLine("consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);"); Console.WriteLine("producer.ComputeAt(consumer, x_outer);"); consumer.Realize <float>(8, 8); // See figures/lesson_08_tile.gif for a visualization. // The producer and consumer now alternate on a per-tile // basis. 
Here's the equivalent C: var result = new float[8, 8]; // For every tile of the consumer: for (int yy_outer = 0; yy_outer < 2; yy_outer++) { for (int xx_outer = 0; xx_outer < 2; xx_outer++) { // Compute the x and y coords of the start of this tile. int x_base = xx_outer * 4; int y_base = yy_outer * 4; // Compute enough of producer to satisfy this tile. A // 4x4 tile of the consumer requires a 5x5 tile of the // producer. var producer_storage = new float[5, 5]; for (int py = y_base; py < y_base + 5; py++) { for (int px = x_base; px < x_base + 5; px++) { producer_storage[py - y_base, px - x_base] = (float)Math.Sin(px * py); } } // Compute this tile of the consumer for (int yy_inner = 0; yy_inner < 4; yy_inner++) { for (int xx_inner = 0; xx_inner < 4; xx_inner++) { int xx = x_base + xx_inner; int yy = y_base + yy_inner; result[yy, xx] = (producer_storage[yy - y_base, xx - x_base] + producer_storage[yy - y_base + 1, xx - x_base] + producer_storage[yy - y_base, xx - x_base + 1] + producer_storage[yy - y_base + 1, xx - x_base + 1]) / 4; } } } } Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); // Tiling can make sense for problems like this one with // stencils that reach outwards in x and y. Each tile can be // computed independently in parallel, and the redundant work // done by each tile isn't so bad once the tiles get large // enough. } // Let's try a mixed strategy that combines what we have done with // splitting, parallelizing, and vectorizing. This is one that // often works well in practice for large images. If you // understand this schedule, then you understand 95% of scheduling // in Halide. 
{ var producer = new HSFunc("producer_mixed"); var consumer = new HSFunc("consumer_mixed"); producer[x, y] = HSMath.Sin(x * y); consumer[x, y] = (producer[x, y] + producer[x, y + 1] + producer[x + 1, y] + producer[x + 1, y + 1]) / 4; // Split the y coordinate of the consumer into strips of 16 scanlines: var yo = new HSVar("yo"); var yi = new HSVar("yi"); consumer.Split(y, yo, yi, 16); // Compute the strips using a thread pool and a task queue. consumer.Parallel(yo); // Vectorize across x by a factor of four. consumer.Vectorize(x, 4); // Now store the producer per-strip. This will be 17 scanlines // of the producer (16+1), but hopefully it will fold down // into a circular buffer of two scanlines: producer.StoreAt(consumer, yo); // Within each strip, compute the producer per scanline of the // consumer, skipping work done on previous scanlines. producer.ComputeAt(consumer, yi); // Also vectorize the producer (because sin is vectorizable on x86 using SSE). producer.Vectorize(x, 4); // Let's leave tracing off this time, because we're going to // evaluate over a larger image. // consumer.TraceStores(); // producer.TraceStores(); var halide_result = consumer.Realize <float>(160, 160); // See figures/lesson_08_mixed.mp4 for a visualization. // Here's the equivalent (serial) C: var c_result = new float[160, 160]; // For every strip of 16 scanlines (this loop is parallel in // the Halide version) for (int yyo = 0; yyo < 160 / 16 + 1; yyo++) { // 16 doesn't divide 160, so push the last slice upwards // to fit within [0, 159] (see lesson 05). 
int y_base = yyo * 16; if (y_base > 160 - 16) { y_base = 160 - 16; } // Allocate a two-scanline circular buffer for the producer var producer_storage = new float[2, 161]; // For every scanline in the strip of 16: for (int yyi = 0; yyi < 16; yyi++) { int yy = y_base + yyi; for (int py = yy; py < yy + 2; py++) { // Skip scanlines already computed *within this task* if (yyi > 0 && py == yy) { continue; } // Compute this scanline of the producer in 4-wide vectors for (int x_vec = 0; x_vec < 160 / 4 + 1; x_vec++) { int x_base = x_vec * 4; // 4 doesn't divide 161, so push the last vector left // (see lesson 05). if (x_base > 161 - 4) { x_base = 161 - 4; } // If you're on x86, Halide generates SSE code for this part: int[] xx = { x_base, x_base + 1, x_base + 2, x_base + 3 }; float[] vec = { (float)Math.Sin(xx[0] * py), (float)Math.Sin(xx[1] * py), (float)Math.Sin(xx[2] * py), (float)Math.Sin(xx[3] * py) }; producer_storage[py & 1, xx[0]] = vec[0]; producer_storage[py & 1, xx[1]] = vec[1]; producer_storage[py & 1, xx[2]] = vec[2]; producer_storage[py & 1, xx[3]] = vec[3]; } } // Now compute consumer for this scanline: for (int x_vec = 0; x_vec < 160 / 4; x_vec++) { int x_base = x_vec * 4; // Again, Halide's equivalent here uses SSE. 
int[] xx = { x_base, x_base + 1, x_base + 2, x_base + 3 }; float[] vec = { (producer_storage[yy & 1, xx[0]] + producer_storage[(yy + 1) & 1, xx[0]] + producer_storage[yy & 1, xx[0] + 1] + producer_storage[(yy + 1) & 1, xx[0] + 1]) / 4, (producer_storage[yy & 1, xx[1]] + producer_storage[(yy + 1) & 1, xx[1]] + producer_storage[yy & 1, xx[1] + 1] + producer_storage[(yy + 1) & 1, xx[1] + 1]) / 4, (producer_storage[yy & 1, xx[2]] + producer_storage[(yy + 1) & 1, xx[2]] + producer_storage[yy & 1, xx[2] + 1] + producer_storage[(yy + 1) & 1, xx[2] + 1]) / 4, (producer_storage[yy & 1, xx[3]] + producer_storage[(yy + 1) & 1, xx[3]] + producer_storage[yy & 1, xx[3] + 1] + producer_storage[(yy + 1) & 1, xx[3] + 1]) / 4 }; c_result[yy, xx[0]] = vec[0]; c_result[yy, xx[1]] = vec[1]; c_result[yy, xx[2]] = vec[2]; c_result[yy, xx[3]] = vec[3]; } } } Console.WriteLine("Pseudo-code for the schedule:"); consumer.PrintLoopNest(); Console.WriteLine(); // Look on my code, ye mighty, and despair! // Let's check the C result against the Halide result. Doing // this I found several bugs in my C implementation, which // should tell you something. for (int yy = 0; yy < 160; yy++) { for (int xx = 0; xx < 160; xx++) { float error = halide_result[xx, yy] - c_result[yy, xx]; // It's floating-point math, so we'll allow some slop: if (error < -0.001f || error > 0.001f) { Console.WriteLine("halide_result(%d, %d) = %f instead of %f", xx, yy, halide_result[xx, yy], c_result[yy, xx]); return(-1); } } } } // This stuff is hard. We ended up in a three-way trade-off // between memory bandwidth, redundant work, and // parallelism. Halide can't make the correct choice for you // automatically (sorry). Instead it tries to make it easier for // you to explore various options, without messing up your // program. In fact, Halide promises that scheduling calls like // compute_root won't change the meaning of your algorithm -- you // should get the same bits back no matter how you schedule // things. 
// So be empirical! Experiment with various schedules and keep a // log of performance. Form hypotheses and then try to prove // yourself wrong. Don't assume that you just need to vectorize // your code by a factor of four and run it on eight cores and // you'll get 32x faster. This almost never works. Modern systems // are complex enough that you can't predict performance reliably // without running your code. // We suggest you start by scheduling all of your non-trivial // stages compute_root, and then work from the end of the pipeline // upwards, inlining, parallelizing, and vectorizing each stage in // turn until you reach the top. // Halide is not just about vectorizing and parallelizing your // code. That's not enough to get you very far. Halide is about // giving you tools that help you quickly explore different // trade-offs between locality, redundant work, and parallelism, // without messing up the actual result you're trying to compute. Console.WriteLine("Success!"); return(0); }
public static int Main(string[] args)
{
    // Evaluating Funcs over rectangular domains that do not start at
    // the origin, by realizing into caller-provided buffers.

    // Define the familiar gradient function.
    var gradient = new HSFunc("gradient");
    var x = new HSVar("x");
    var y = new HSVar("y");
    gradient[x, y] = x + y;

    // Turn on tracing so we can see how it is being evaluated.
    gradient.TraceStores();

    // Previously we've realized gradient like so:
    //
    //     gradient.Realize(8, 8);
    //
    // This does four things internally:
    // 1) Generates code that can evaluate gradient over an arbitrary
    //    rectangle.
    // 2) Allocates a new 8 x 8 image.
    // 3) Runs the generated code to evaluate gradient for all x, y
    //    from (0, 0) to (7, 7) and puts the result into the image.
    // 4) Returns the new image as the result of the Realize call.

    // If we're managing memory carefully and don't want Halide to
    // allocate a new image for us, we can call Realize another way:
    // pass it an existing image we would like it to fill in.
    Console.WriteLine("Evaluating gradient from (0, 0) to (7, 7)");
    var result = new HSBuffer <int>(8, 8);
    gradient.Realize(result);

    // Check it did what we expect.
    for (int yy = 0; yy < 8; yy++)
    {
        for (int xx = 0; xx < 8; xx++)
        {
            if (result[xx, yy] != xx + yy)
            {
                Console.WriteLine("Something went wrong!\n");
                return(-1);
            }
        }
    }

    // Now evaluate gradient over a 5 x 7 rectangle whose top-left
    // corner is at (100, 50): x runs over [100, 104] and y over
    // [50, 56], both inclusive.

    // Create an image representing that rectangle:
    var shifted = new HSBuffer <int>(5, 7); // the constructor takes the size
    shifted.SetMin(100, 50);                // then we set the top-left corner

    Console.WriteLine("Evaluating gradient from (100, 50) to (104, 56)");

    // This won't need to compile any new code: the first Realize call
    // already generated code capable of evaluating gradient over an
    // arbitrary rectangle.
    gradient.Realize(shifted);

    // From C#, we also access the image object using coordinates that
    // start at (100, 50).
    for (int yy = 50; yy < 57; yy++)
    {
        for (int xx = 100; xx < 105; xx++)
        {
            if (shifted[xx, yy] != xx + yy)
            {
                Console.WriteLine("Something went wrong!");
                return(-1);
            }
        }
    }

    // The image 'shifted' stores the value of our Func over a domain
    // that starts at (100, 50), so asking for shifted[0, 0] would in
    // fact read out-of-bounds and probably crash.

    // What if we want to evaluate our Func over some region that
    // isn't rectangular? Too bad. Halide only does rectangles :)

    Console.WriteLine("Success!");
    return(0);
}
public static int Main(string[] args)
{
    // A single-stage imaging pipeline that brightens an image.

    // Load the input image we wish to brighten.
    // See figures/lesson_02_input.jpg for a smaller version.
    var source = HSBuffer <byte> .LoadImage("rgb.png");

    // The pipeline is a function of position (x, y) plus a third
    // dimension c: Halide treats color channels as an extra dimension
    // of the image.
    var px = new HSVar("x");
    var py = new HSVar("y");
    var ch = new HSVar("c");

    // For each pixel of the input: widen to float (Halide represents
    // real numbers as floats, hence the 'f' suffix on constants),
    // scale by 1.5 to brighten, clamp at 255 so the cast back to an
    // 8-bit unsigned integer cannot overflow, then narrow to byte.
    var brightened = HS.Cast <byte>(
        HSMath.Min(HS.Cast <float>(source[px, py, ch]) * 1.5f, 255.0f));

    // Define the one pipeline stage. So far this only builds a
    // representation of the program in memory; no pixels have been
    // processed and nothing has been compiled yet.
    var brightenFunc = new HSFunc("brighter");
    brightenFunc[px, py, ch] = brightened;

    // Realize the Func over the full extent of the input; this
    // compiles and runs the pipeline. A smaller size would brighten
    // just a portion of the input; a larger size would cause a
    // runtime error for reading out of bounds on the input.
    var result = brightenFunc.Realize <byte>(source.Width, source.Height, source.Channels);

    // Save the output for inspection. It should look like a bright
    // parrot (see figures/lesson_02_output.jpg for a small version).
    result.SaveImage("brighter.png");

    Console.WriteLine("Success!");
    return(0);
}
public static int Main(string[] args)
{
    // Debugging tools: tracing stores, printing Exprs, conditional
    // printing, and printing an Expr at compile time.
    var x = new HSVar("x");
    var y = new HSVar("y");

    // Printing out the value of Funcs as they are computed.
    {
        // We'll define our gradient function as before.
        var gradient = new HSFunc("gradient");
        gradient[x, y] = x + y;

        // Tell Halide that we'd like to be notified of all
        // evaluations (stores) of gradient.
        gradient.TraceStores();

        // Realize the function over an 8x8 region. This prints out
        // every time gradient(x, y) gets evaluated.
        Console.WriteLine("Evaluating gradient");
        var output = gradient.Realize <int>(8, 8);

        // Now that we can snoop on what Halide is doing, let's try
        // our first scheduling primitive: a version of gradient that
        // processes each scanline in parallel.
        var parallel_gradient = new HSFunc("parallel_gradient");
        parallel_gradient[x, y] = x + y;

        // We'll also trace this function.
        parallel_gradient.TraceStores();

        // Use a parallel for loop over the y coordinate (a thread
        // pool and task queue on Linux; grand central dispatch on
        // OS X). The prints should now come out of order, because
        // each scanline may run on a different thread. The thread
        // count adapts to the system; on Linux it can be controlled
        // with the HL_NUM_THREADS environment variable.
        parallel_gradient.Parallel(y);

        Console.WriteLine("\nEvaluating parallel_gradient");
        parallel_gradient.Realize <int>(8, 8);
    }

    // Printing individual Exprs.
    {
        // TraceStores can only print the value of a whole Func. To
        // inspect a sub-expression instead, wrap HS.Print around any
        // Expr: its value is printed every time it is evaluated.

        // Say we have some Func that is the sum of two terms:
        var f = new HSFunc("f");
        f[x, y] = HSMath.Sin(x) + HSMath.Cos(y);

        // To inspect just one of the terms, wrap Print around it:
        var g = new HSFunc("g");
        g[x, y] = HSMath.Sin(x) + HS.Print(HSMath.Cos(y));
        Console.WriteLine("\nEvaluating sin(x) + cos(y), and just printing cos(y)");
        g.Realize <float>(4, 4);
    }

    // Printing additional context.
    {
        // Print takes multiple arguments (Exprs or constant strings).
        // It prints all of them and evaluates to the first one, which
        // lets us print extra context alongside the value:
        var f = new HSFunc("f");
        f[x, y] = HSMath.Sin(x) + HS.Print(HSMath.Cos(y), "<- this is cos(", y, ") when x =", x);
        Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing cos(y) with more context");
        f.Realize <float>(4, 4);

        // Splitting expressions like the one above across multiple
        // lines makes it easy to toggle printing of individual values
        // while debugging.
        HSExpr e = HSMath.Cos(y);
        // Uncomment the following line to print the value of cos(y):
        // e = HS.Print(e, "<- this is cos(", y, ") when x =", x);
        var g = new HSFunc("g");
        g[x, y] = HSMath.Sin(x) + e;
        g.Realize <float>(4, 4);
    }

    // Conditional printing.
    {
        // Both Print and TraceStores can produce a lot of output. To
        // catch a rare event, or look at just one pixel, use
        // HS.PrintWhen: its first argument is a boolean Expr; when it
        // evaluates to true all the arguments are printed, and either
        // way the call evaluates to the second argument.
        var f = new HSFunc("f");
        HSExpr e = HSMath.Cos(y);
        e = HS.PrintWhen(x == 37 && y == 42, e, "<- this is cos(y) at x, y == (37, 42)");
        f[x, y] = HSMath.Sin(x) + e;
        Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing cos(y) at a single pixel");
        f.Realize <float>(640, 480);

        // PrintWhen can also be used to check for values you're not
        // expecting:
        var g = new HSFunc("g");
        e = HSMath.Cos(y);
        e = HS.PrintWhen(e < 0, e, "cos(y) < 0 at y ==", y);
        g[x, y] = HSMath.Sin(x) + e;
        Console.WriteLine("\nEvaluating sin(x) + cos(y), and printing whenever cos(y) < 0");
        g.Realize <float>(4, 4);
    }

    // Printing expressions at compile-time.
    {
        // When programmatically constructing a complex expression, you
        // can check the Expr you've built is what you think it is by
        // printing the expression itself (here via string
        // interpolation):
        var fizz = new HSVar("fizz");
        var buzz = new HSVar("buzz");
        var e = new HSExpr(1);
        for (int i = 2; i < 100; i++)
        {
            if (i % 3 == 0 && i % 5 == 0)
            {
                e += fizz * buzz;
            }
            else if (i % 3 == 0)
            {
                e += fizz;
            }
            else if (i % 5 == 0)
            {
                e += buzz;
            }
            else
            {
                e += i;
            }
        }
        Console.WriteLine($"Printing a complex Expr: {e}");
    }

    Console.WriteLine("Success!");
    return(0);
}
public static int Main(string[] args) { // We're going to define and schedule our gradient function in // several different ways, and see what order pixels are computed // in. var x = new HSVar("x"); var y = new HSVar("y"); // First we observe the default ordering. { var gradient = new HSFunc("gradient"); gradient[x, y] = x + y; gradient.TraceStores(); // By default we walk along the rows and then down the // columns. This means x varies quickly, and y varies // slowly. x is the column and y is the row, so this is a // row-major traversal. Console.WriteLine("Evaluating gradient row-major"); var output = gradient.Realize <int>(4, 4); // See figures/lesson_05_row_major.gif for a visualization of // what this did. // The equivalent C is: Console.WriteLine("Equivalent C:"); for (int yy = 0; yy < 4; yy++) { for (int xx = 0; xx < 4; xx++) { Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}"); } } Console.WriteLine("\n"); // Tracing is one useful way to understand what a schedule is // doing. You can also ask Halide to print out pseudocode // showing what loops Halide is generating: Console.WriteLine("Pseudo-code for the schedule:"); gradient.PrintLoopNest(); Console.WriteLine(); // Because we're using the default ordering, it should print: // compute gradient: // for y: // for x: // gradient(...) = ... } // Reorder variables. { var gradient = new HSFunc("gradient_col_major"); gradient[x, y] = x + y; gradient.TraceStores(); // If we reorder x and y, we can walk down the columns // instead. The reorder call takes the arguments of the func, // and sets a new nesting order for the for loops that are // generated. The arguments are specified from the innermost // loop out, so the following call puts y in the inner loop: gradient.Reorder(y, x); // This means y (the row) will vary quickly, and x (the // column) will vary slowly, so this is a column-major // traversal. 
Console.WriteLine("Evaluating gradient column-major"); var output = gradient.Realize <int>(4, 4); // See figures/lesson_05_col_major.gif for a visualization of // what this did. Console.WriteLine("Equivalent C:"); for (int xx = 0; xx < 4; xx++) { for (int yy = 0; yy < 4; yy++) { Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}"); } } Console.WriteLine(); // If we print pseudo-code for this schedule, we'll see that // the loop over y is now inside the loop over x. Console.WriteLine("Pseudo-code for the schedule:"); gradient.PrintLoopNest(); Console.WriteLine(); } // Split a variable into two. { var gradient = new HSFunc("gradient_split"); gradient[x, y] = x + y; gradient.TraceStores(); // The most powerful primitive scheduling operation you can do // to a var is to split it into inner and outer sub-variables: var x_outer = new HSVar("x_outer"); var x_inner = new HSVar("x_inner"); gradient.Split(x, x_outer, x_inner, 2); // This breaks the loop over x into two nested loops: an outer // one over x_outer, and an inner one over x_inner. The last // argument to split was the "split factor". The inner loop // runs from zero to the split factor. The outer loop runs // from zero to the extent required of x (4 in this case) // divided by the split factor. Within the loops, the old // variable is defined to be outer * factor + inner. If the // old loop started at a value other than zero, then that is // also added within the loops. 
Console.WriteLine("Evaluating gradient with x split into x_outer and x_inner "); var output = gradient.Realize <int>(4, 4); Console.WriteLine("Equivalent C:"); for (int yy = 0; yy < 4; yy++) { for (int xOuter = 0; xOuter < 2; xOuter++) { for (int xInner = 0; xInner < 2; xInner++) { int xx = xOuter * 2 + xInner; Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}"); } } } Console.WriteLine(); Console.WriteLine("Pseudo-code for the schedule:"); gradient.PrintLoopNest(); Console.WriteLine(); // Note that the order of evaluation of pixels didn't actually // change! Splitting by itself does nothing, but it does open // up all of the scheduling possibilities that we will explore // below. } // Fuse two variables into one. { var gradient = new HSFunc("gradient_fused"); gradient[x, y] = x + y; // The opposite of splitting is 'fusing'. Fusing two variables // merges the two loops into a single for loop over the // product of the extents. Fusing is less important than // splitting, but it also sees use (as we'll see later in this // lesson). Like splitting, fusing by itself doesn't change // the order of evaluation. var fused = new HSVar("fused"); gradient.Fuse(x, y, fused); Console.WriteLine("Evaluating gradient with x and y fused"); var output = gradient.Realize <int>(4, 4); Console.WriteLine("Equivalent C:"); for (int f = 0; f < 4 * 4; f++) { int yy = f / 4; int xx = f % 4; Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}"); } Console.WriteLine(); Console.WriteLine("Pseudo-code for the schedule:"); gradient.PrintLoopNest(); Console.WriteLine(); } // Evaluating in tiles. { var gradient = new HSFunc("gradient_tiled"); gradient[x, y] = x + y; gradient.TraceStores(); // Now that we can both split and reorder, we can do tiled // evaluation. Let's split both x and y by a factor of four, // and then reorder the vars to express a tiled traversal. 
// // A tiled traversal splits the domain into small rectangular // tiles, and outermost iterates over the tiles, and within // that iterates over the points within each tile. It can be // good for performance if neighboring pixels use overlapping // input data, for example in a blur. We can express a tiled // traversal like so: var x_outer = new HSVar("x_outer"); var x_inner = new HSVar("x_inner"); var y_outer = new HSVar("y_outer"); var y_inner = new HSVar("y_inner"); gradient.Split(x, x_outer, x_inner, 4); gradient.Split(y, y_outer, y_inner, 4); gradient.Reorder(x_inner, y_inner, x_outer, y_outer); // This pattern is common enough that there's a shorthand for it: // gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4); Console.WriteLine("Evaluating gradient in 4x4 tiles"); var output = gradient.Realize <int>(8, 8); // See figures/lesson_05_tiled.gif for a visualization of this // schedule. Console.WriteLine("Equivalent C:"); for (int yOuter = 0; yOuter < 2; yOuter++) { for (int xOuter = 0; xOuter < 2; xOuter++) { for (int yInner = 0; yInner < 4; yInner++) { for (int xInner = 0; xInner < 4; xInner++) { int xx = xOuter * 4 + xInner; int yy = yOuter * 4 + yInner; Console.WriteLine($"Evaluating at x = {xx}, y = {yy}: {xx + yy}"); } } } } Console.WriteLine(); Console.WriteLine("Pseudo-code for the schedule:"); gradient.PrintLoopNest(); Console.WriteLine(); } // Evaluating in vectors. { var gradient = new HSFunc("gradient_in_vectors"); gradient[x, y] = x + y; gradient.TraceStores(); // The nice thing about splitting is that it guarantees the // inner variable runs from zero to the split factor. Most of // the time the split-factor will be a compile-time constant, // so we can replace the loop over the inner variable with a // single vectorized computation. This time we'll split by a // factor of four, because on X86 we can use SSE to compute in // 4-wide vectors. 
var xo = new HSVar("x_outer");
var xi = new HSVar("x_inner");
gradient.Split(x, xo, xi, 4);
gradient.Vectorize(xi);

// Split-then-vectorize is common enough to have a shorthand. We
// could also have said:
//
// gradient.vectorize(x, 4);
//
// which is equivalent to:
//
// gradient.split(x, x, x_inner, 4);
// gradient.vectorize(x_inner);
//
// Note the shorthand reuses the name 'x' for the new outer variable,
// so later scheduling calls that mention x refer to that outer
// variable.

// Evaluate over an 8x4 box so each scanline contains more than one
// vector of work.
Console.WriteLine("Evaluating gradient with x_inner vectorized ");
var vecResult = gradient.Realize<int>(8, 4);

// See figures/lesson_05_vectors.gif for a visualization.
Console.WriteLine("Equivalent C:");
for (int row = 0; row < 4; row++)
{
    for (int outer = 0; outer < 2; outer++)
    {
        // The x_inner loop is gone, replaced by a vectorized version
        // of the expression (SSE on x86 processors).
        int lane0 = outer * 4;
        int[] xLanes = { lane0, lane0 + 1, lane0 + 2, lane0 + 3 };
        int[] sums = { xLanes[0] + row, xLanes[1] + row, xLanes[2] + row, xLanes[3] + row };
        Console.WriteLine($"Evaluating at " +
                          $"<{xLanes[0]}, {xLanes[1]}, {xLanes[2]}, {xLanes[3]}>, " +
                          $"<{row}, {row}, {row}, {row}>: " +
                          $"<{sums[0]}, {sums[1]}, {sums[2]}, {sums[3]}>");
    }
}
Console.WriteLine();

Console.WriteLine("Pseudo-code for the schedule:");
gradient.PrintLoopNest();
Console.WriteLine();
}

// Unrolling a loop.
{
    var gradient = new HSFunc("gradient_unroll");
    gradient[x, y] = x + y;
    gradient.TraceStores();

    // When multiple pixels share overlapping data it can pay to
    // unroll so that shared values are computed or loaded only once.
    // As with vectorizing: split a dimension, then fully unroll the
    // inner variable's loop.
// Unrolling does not change the order in which things are evaluated.
var xo = new HSVar("x_outer");
var xi = new HSVar("x_inner");
gradient.Split(x, xo, xi, 2);
gradient.Unroll(xi);

// The shorthand for this is:
// gradient.unroll(x, 2);

Console.WriteLine("Evaluating gradient unrolled by a factor of two");
var unrolled = gradient.Realize<int>(4, 4);

Console.WriteLine("Equivalent C:");
for (int row = 0; row < 4; row++)
{
    for (int outer = 0; outer < 2; outer++)
    {
        // The x_inner loop is replaced by two copies of its body.
        {
            int inner = 0;
            int col = outer * 2 + inner;
            Console.WriteLine($"Evaluating at x = {col}, y = {row}: {col + row}");
        }
        {
            int inner = 1;
            int col = outer * 2 + inner;
            Console.WriteLine($"Evaluating at x = {col}, y = {row}: {col + row}");
        }
    }
}
Console.WriteLine();

Console.WriteLine("Pseudo-code for the schedule:");
gradient.PrintLoopNest();
Console.WriteLine();
}

// Splitting by factors that don't divide the extent.
{
    var gradient = new HSFunc("gradient_split_7x2");
    gradient[x, y] = x + y;
    gradient.TraceStores();

    // Splitting promises that the inner loop runs from zero to the
    // split factor, which the tricks above rely on. So what happens
    // when the extent is not a multiple of the factor? We split by
    // three and evaluate over a 7x2 box instead of the usual 4x4.
    var xo = new HSVar("x_outer");
    var xi = new HSVar("x_inner");
    gradient.Split(x, xo, xi, 3);

    Console.WriteLine("Evaluating gradient over a 7x2 box with x split by three ");
    var boxResult = gradient.Realize<int>(7, 2);

    // See figures/lesson_05_split_7_by_3.gif for a visualization of
    // what happened. Note that some points get evaluated more than
    // once!
Console.WriteLine("Equivalent C:");
for (int row = 0; row < 2; row++)
{
    for (int outer = 0; outer < 3; outer++) // ceil(7 / 3) = 3 outer iterations
    {
        for (int inner = 0; inner < 3; inner++)
        {
            // Clamp the tile base so base + inner never leaves the
            // 7-wide box: the base is at most 4 (7 minus the split
            // factor).
            int col = outer * 3;
            if (col > 4)
            {
                col = 4;
            }
            col += inner;
            Console.WriteLine($"Evaluating at x = {col}, y = {row}: {col + row}");
        }
    }
}
Console.WriteLine();

Console.WriteLine("Pseudo-code for the schedule:");
gradient.PrintLoopNest();
Console.WriteLine();

// Some coordinates above were evaluated more than once. That is
// generally OK: pure Halide functions have no side-effects, so
// re-evaluating a point is safe. If you're calling out to C
// functions like we are, it is your responsibility to cope with the
// same point being evaluated multiple times.

// The general rule: if we require x from x_min to x_min + x_extent,
// and we split by 'factor', then:
//
// x_outer runs from 0 to (x_extent + factor - 1)/factor
// x_inner runs from 0 to factor
// x = min(x_outer * factor, x_extent - factor) + x_inner + x_min
//
// In our example, x_min was 0, x_extent was 7, and factor was 3.

// However, a Halide function with an update definition (see lesson 9)
// cannot safely evaluate the same point twice, so this trick is not
// applied there. Instead the range of values computed is rounded up
// to the next multiple of the split factor.
}

// Fusing, tiling, and parallelizing.
{
    // The previous lesson parallelized across a single variable.
    // Combining that with fusing and tiling gives a very useful
    // pattern: processing tiles in parallel.

    // This is where fusing shines: it lets us parallelize across
    // multiple dimensions without introducing nested parallelism.
// Nested parallelism (parallel for loops inside parallel for loops)
// is supported by Halide, but often performs worse than fusing the
// parallel variables into a single parallel for loop.
var gradient = new HSFunc("gradient_fused_tiles");
gradient[x, y] = x + y;
gradient.TraceStores();

// Tile first, then fuse the two tile indices and parallelize across
// the fused index.
var xo = new HSVar("x_outer");
var yo = new HSVar("y_outer");
var xi = new HSVar("x_inner");
var yi = new HSVar("y_inner");
var tileIdx = new HSVar("tile_index");
gradient.Tile(x, y, xo, yo, xi, yi, 4, 4);
gradient.Fuse(xo, yo, tileIdx);
gradient.Parallel(tileIdx);

// Scheduling calls all return a reference to the Func, so they can
// be chained into one slightly clearer statement:
//
// gradient
//     .tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)
//     .fuse(x_outer, y_outer, tile_index)
//     .parallel(tile_index);

Console.WriteLine("Evaluating gradient tiles in parallel");
var parResult = gradient.Realize<int>(8, 8);

// Tiles run in arbitrary order, but pixels within a tile are visited
// in row-major order. See figures/lesson_05_parallel_tiles.gif for a
// visualization.
Console.WriteLine("Equivalent (serial) C:\n");
// The outermost loop below should be a parallel for loop, but that's
// hard in C.
for (int t = 0; t < 4; t++)
{
    int tileRow = t / 2;
    int tileCol = t % 2;
    for (int inRow = 0; inRow < 4; inRow++)
    {
        for (int inCol = 0; inCol < 4; inCol++)
        {
            int row = tileRow * 4 + inRow;
            int col = tileCol * 4 + inCol;
            Console.WriteLine($"Evaluating at x = {col}, y = {row}: {col + row}");
        }
    }
}
Console.WriteLine();

Console.WriteLine("Pseudo-code for the schedule:");
gradient.PrintLoopNest();
Console.WriteLine();
}

// Putting it all together.
{
    // Are you ready? We're going to use all of the features above now.
var gradient_fast = new HSFunc("gradient_fast");
gradient_fast[x, y] = x + y;

// Process 64x64 tiles in parallel.
var xo = new HSVar("x_outer");
var yo = new HSVar("y_outer");
var xi = new HSVar("x_inner");
var yi = new HSVar("y_inner");
var tileIdx = new HSVar("tile_index");
gradient_fast
    .Tile(x, y, xo, yo, xi, yi, 64, 64)
    .Fuse(xo, yo, tileIdx)
    .Parallel(tileIdx);

// While walking across each tile, compute two scanlines at once and
// vectorize in x. The easiest way to express that is to tile again
// within each tile into 4x2 subtiles, then vectorize the subtiles
// across x and unroll them across y.
var xio = new HSVar("x_inner_outer");
var yio = new HSVar("y_inner_outer");
var xVecs = new HSVar("x_vectors");
var yPairs = new HSVar("y_pairs");
gradient_fast
    .Tile(xi, yi, xio, yio, xVecs, yPairs, 4, 2)
    .Vectorize(xVecs)
    .Unroll(yPairs);

// No explicit split or reorder calls here: those primitive
// operations are mostly buried underneath tiling, vectorizing, and
// unrolling calls.

// Now evaluate over a range that is NOT a multiple of the tile size.
// Tracing would produce a flood of printfs at this size, so instead
// we compute the answer both in C and Halide and compare.
var result = gradient_fast.Realize<int>(350, 250);

// See figures/lesson_05_fast.mp4 for a visualization.
Console.WriteLine("Checking Halide result against equivalent C...");
// 350 wide with 64-wide tiles gives ceil(350/64) = 6 tile columns;
// 250 high gives ceil(250/64) = 4 tile rows (the last tile in each
// dimension is shifted inwards to stay in bounds). The Fuse call put
// x_outer innermost, so the fused index decomposes as:
//   x_outer = tile_index % 6, y_outer = tile_index / 6.
// BUG FIX: this previously used % 4 and / 4, so x_outer never
// exceeded 3 and pixels with x in [256, 350) were never checked.
for (int tileIndex = 0; tileIndex < 6 * 4; tileIndex++)
{
    int yOuter = tileIndex / 6;
    int xOuter = tileIndex % 6;
    for (int yInnerOuter = 0; yInnerOuter < 64 / 2; yInnerOuter++)
    {
        for (int xInnerOuter = 0; xInnerOuter < 64 / 4; xInnerOuter++)
        {
            // We're vectorized 4-wide across x; the min() clamps the
            // tile base the same way the shifted-inwards split does.
            int xx = Math.Min(xOuter * 64, 350 - 64) + xInnerOuter * 4;
            int[] xVec = { xx + 0, xx + 1, xx + 2, xx + 3 };
            // And we unrolled by a factor of two across y.
            int yBase = Math.Min(yOuter * 64, 250 - 64) + yInnerOuter * 2;
            for (int pair = 0; pair < 2; pair++) // y_pairs = 0, 1
            {
                int yy = yBase + pair;
                // Check the result against the algorithm definition.
                for (int i = 0; i < 4; i++)
                {
                    int val = xVec[i] + yy;
                    if (result[xVec[i], yy] != val)
                    {
                        Console.WriteLine($"There was an error at {xVec[i]} {yy}!");
                        return(-1);
                    }
                }
            }
        }
    }
}
Console.WriteLine();

Console.WriteLine("Pseudo-code for the schedule:");
gradient_fast.PrintLoopNest();
Console.WriteLine();

// Note that in the Halide version, the algorithm is specified once at
// the top, separately from the optimizations, and there aren't that
// many lines of code total. Compare this to the C version: more code
// (and it isn't even parallelized or vectorized properly), and the
// statement of the algorithm (the result is x plus y) is buried in
// multiple places within the mess. That C code is hard to write, hard
// to read, hard to debug, and hard to optimize further. This is why
// Halide exists.
}

Console.WriteLine("Success!");
return(0);
}