/// <summary>
/// Rebuilds the image as a 3x3 grid of tiles, placing at every destination slot the source
/// tile selected by the permutation stored in <paramref name="evaluation"/>, and writes the
/// result through <c>MyImageWriter.Write</c>.
/// </summary>
/// <param name="fileName">Name handed to <c>MyImageWriter.Write</c> for the output image.</param>
/// <param name="image">Source image; <c>Pixels</c> is read as a row-major array of <c>Width</c> pixels per row.</param>
/// <param name="evaluation">Evaluation whose <c>Permutation</c> is expanded into a tile order.</param>
static void ReconstructAndSave(string fileName, MyImageReader image, Evaluation evaluation)
        {
            const int tilesPerSide = 3;
            const int tileCount = tilesPerSide * tilesPerSide;

            // TileOrderFromPermutation fills row 0 of this one-row table; entry d is the
            // source tile that ends up at destination slot d.
            var decodedOrder = new byte[1, tileCount];
            TileOrderFromPermutation(Permutations, evaluation.Permutation, tileCount, decodedOrder, 0);

            var tileWidth = image.Width / tilesPerSide;
            var tileHeight = image.Height / tilesPerSide;

            // Pixels not covered by a whole tile (when a dimension is not a multiple of 3) stay zero.
            var output = new uint[image.Width, image.Height];

            for (var slot = 0; slot < tileCount; slot++)
            {
                int source = decodedOrder[0, slot];

                // Top-left corner of the destination slot and of the source tile, in pixels.
                var destLeft = (slot % tilesPerSide) * tileWidth;
                var destTop = (slot / tilesPerSide) * tileHeight;
                var sourceLeft = (source % tilesPerSide) * tileWidth;
                var sourceTop = (source / tilesPerSide) * tileHeight;

                for (var row = 0; row < tileHeight; row++)
                {
                    // Index of the first pixel of this tile row inside the flat source array.
                    var sourceRowStart = (sourceTop + row) * image.Width + sourceLeft;

                    for (var column = 0; column < tileWidth; column++)
                    {
                        output[destLeft + column, destTop + row] = image.Pixels[sourceRowStart + column];
                    }
                }
            }

            MyImageWriter.Write(fileName, output);
        }
 /// <summary>
 /// Runs the CPU version of the enhanced image scaling over every pixel of the scaled image
 /// in parallel and stores the elapsed wall-clock time, in milliseconds, in
 /// <c>CpuScaleImageTimeMs</c>. The scaled output itself is discarded; only the timing is kept.
 /// </summary>
 /// <param name="cpuImage">Source image; its pixels are cloned so the timed loop works on a private copy.</param>
 static void TimeCpuEnhancedScaleImage(MyImageReader cpuImage)
 {
     // Side length of the scaled image (64 * 3 = 192) - presumably 3 tiles of 64 pixels each.
     const int scaledSize = 64 * 3;

     var scaledImage = new byte[scaledSize, scaledSize];

     // Direct cast instead of 'as': a failed conversion surfaces immediately as an
     // InvalidCastException rather than as a null reference inside the parallel loop.
     var sourceImage = (uint[])cpuImage.Pixels.Clone();
     var sourceWidth = cpuImage.Width;
     var sourceHeight = cpuImage.Height;

     // Only the parallel loop is timed; allocation and the pixel copy above are excluded.
     var stopWatch = Stopwatch.StartNew();
     Parallel.For(0, scaledSize * scaledSize, p =>
     {
         // Unflatten the linear index into (x, y) coordinates of the scaled image.
         var x = p % scaledSize;
         var y = p / scaledSize;
         EnhancedScaleImagePixel(sourceImage, sourceWidth, sourceHeight, scaledImage, x, y);
     });
     stopWatch.Stop();

     CpuScaleImageTimeMs = (float)stopWatch.Elapsed.TotalMilliseconds;
 }
        /// <summary>
        /// Loads the image in <paramref name="sourceFileName"/>, runs the GPU pipeline
        /// (scale, extract edges, compute left/right and top/bottom fits, explore permutations),
        /// picks the evaluation with the lowest <c>Metric</c> and writes the reconstructed image
        /// to <paramref name="destinationFileName"/>. Each stage's duration is recorded in the
        /// corresponding <c>*TimeMs</c> member. Also writes "Scaled.bmp" and times the CPU
        /// scaling path for comparison.
        /// </summary>
        /// <param name="sourceFileName">File name passed to <c>MyImageReader</c> to load the input image.</param>
        /// <param name="destinationFileName">File name passed to <c>ReconstructAndSave</c> for the output image.</param>
        /// <remarks>
        /// NOTE(review): the <c>Gpu.Free</c> calls at the end are not inside a <c>finally</c>, so an
        /// exception thrown anywhere earlier in this method skips them.
        /// </remarks>
        public static void Solve(string sourceFileName, string destinationFileName)
        {
            // load image from disk
            var cpuImage = new MyImageReader(sourceFileName);

            // allocate image memory on the GPU
            var gpuImage = Gpu.Allocate<uint>(cpuImage.Pixels.Length);

            // copy the image pixels to the GPU
            Gpu.StartTimer();
            Gpu.CopyToDevice(cpuImage.Pixels, gpuImage);
            CopyImageTimeMs = Gpu.StopTimer();

            // allocate scaled image memory on the GPU (192 x 192 bytes)
            var gpuScaledImage = Gpu.Allocate<byte>(64 * 3, 64 * 3);

            // rescale the image using the GPU
            // 12 x 12 blocks of 16 x 16 threads = 192 x 192 threads, matching the scaled image
            // size (presumably one thread per scaled pixel - confirm against the kernel)
            Gpu.StartTimer();
            Gpu.Launch(new dim3(12, 12), new dim3(16, 16), EnhancedScaleImageKernel, gpuImage, cpuImage.Width, cpuImage.Height, gpuScaledImage);
            Gpu.Synchronize();
            ScaleImageTimeMs = Gpu.StopTimer();

            // save a copy of the scaled image
            SaveScaledImage("Scaled.bmp", gpuScaledImage);

            // compare with CPU scale image time
            TimeCpuEnhancedScaleImage(cpuImage);

            // allocate edges memory on the GPU
            // (9 x 4 x 64 bytes - presumably 9 tiles x 4 edges x 64 samples; confirm against the kernel)
            var gpuEdges = Gpu.Allocate<byte>(9, 4, 64);

            // extract edge information using the GPU
            Gpu.StartTimer();
            Gpu.Launch(new dim3(9, 4), 64, ExtractEdgeKernel, gpuScaledImage, gpuEdges);
            Gpu.Synchronize();
            ExtractEdgeTimeMs = Gpu.StopTimer();

            // copy the edges back into the host-side Edges array, then into GPU constant memory
            Gpu.StartTimer();
            Gpu.CopyFromDevice(gpuEdges, Edges);
            Gpu.CopyToConstantMemory(Edges, Edges);
            CopyEdgesToConstantMemoryTimeMs = Gpu.StopTimer();

            // allocate fit memory on the GPU (9 x 9 floats; reused for both fit passes below)
            var gpuFit = Gpu.Allocate<float>(9, 9);

            // compare edge fitting using the GPU - first pass, results go to LeftRightFit
            // (the arguments 2 and 0 presumably select which pair of edges is compared - confirm against the kernel)
            Gpu.StartTimer();
            Gpu.Launch(new dim3(9, 9), 64, ComputeFitsKernel, 2, 0, gpuFit);
            Gpu.Synchronize();
            ComputeFitsTimeMs = Gpu.StopTimer();

            // copy the left/right fits back into the host-side array, then into GPU constant memory
            Gpu.StartTimer();
            Gpu.CopyFromDevice(gpuFit, LeftRightFit);
            Gpu.CopyToConstantMemory(LeftRightFit, LeftRightFit);
            CopyFitsToConstantMemoryTimeMs = Gpu.StopTimer();

            // compare edge fitting using the GPU - second pass, results go to TopBottomFit
            // (the time is added to the first pass so ComputeFitsTimeMs covers both)
            Gpu.StartTimer();
            Gpu.Launch(new dim3(9, 9), 64, ComputeFitsKernel, 3, 1, gpuFit);
            Gpu.Synchronize();
            ComputeFitsTimeMs += Gpu.StopTimer();

            // copy the top/bottom fits back into the host-side array, then into GPU constant memory
            Gpu.StartTimer();
            Gpu.CopyFromDevice(gpuFit, TopBottomFit);
            Gpu.CopyToConstantMemory(TopBottomFit, TopBottomFit);
            CopyFitsToConstantMemoryTimeMs += Gpu.StopTimer();

            // evaluate all permutations
            // one Evaluation slot per block; the "+ 1" rounds the block count up so that
            // blocks * threads covers at least Permutations threads
            // NOTE(review): unlike the other timers, ExplorePermutationsTimeMs and
            // CopyPermutationsTimeMs are only ever "+=" in this method, so they accumulate
            // across calls unless reset elsewhere - confirm this is intended
            const int threads = 256;
            const int blocks = Permutations / threads + 1;
            var cpuEvaluations = new Evaluation[blocks];
            var gpuEvaluations = Gpu.Allocate(cpuEvaluations);
            Gpu.StartTimer();
            Gpu.Launch(blocks, threads, ExplorePermutationsKernel, gpuEvaluations);
            Gpu.Synchronize();
            ExplorePermutationsTimeMs += Gpu.StopTimer();
            Gpu.StartTimer();
            Gpu.CopyFromDevice(gpuEvaluations, cpuEvaluations);
            CopyPermutationsTimeMs += Gpu.StopTimer();

            // get the best permutation (lowest Metric; the first one wins on ties)
            var bestEvaluation = cpuEvaluations[0];
            foreach (var evaluation in cpuEvaluations)
            {
                if (evaluation.Metric < bestEvaluation.Metric)
                {
                    bestEvaluation = evaluation;
                }
            }

            // reconstruct and save the new image
            ReconstructAndSave(destinationFileName, cpuImage, bestEvaluation);

            // free memory on the GPU
            Gpu.Free(gpuImage);
            Gpu.Free(gpuScaledImage);
            Gpu.Free(gpuFit);
            Gpu.Free(gpuEdges);
            Gpu.Free(gpuEvaluations);
        }