//detect blend direction
        //Decides which pixels of the centre 2x2 block (F, G / J, K) of the 4x4 kernel
        //take part in blending and stores the outcome in _blendResult.
        //  ker: 4x4 neighbourhood; only B, C, E..L, N and O are read here.
        //The result is reported through _blendResult, nothing is returned.
        private void PreProcessCorners(Kernel4x4 ker)
        {
            //begin with a reset result; fields not assigned below keep whatever Reset() left
            _blendResult.Reset();

            //no decision to make when the left pair (F, J) equals the right pair (G, K),
            //or when the upper pair (F, G) equals the lower pair (J, K)
            var leftEqualsRight = ker.F == ker.G && ker.J == ker.K;
            if (leftEqualsRight || (ker.F == ker.J && ker.G == ker.K))
            {
                return;
            }

            var metric = _colorDistance;

            //the pair in the middle of each diagonal counts four times
            const int centerWeight = 4;

            //accumulated colour distance over pixel pairs running in the J-G direction
            var diagJG = metric.DistYCbCr(ker.I, ker.F)
                       + metric.DistYCbCr(ker.F, ker.C)
                       + metric.DistYCbCr(ker.N, ker.K)
                       + metric.DistYCbCr(ker.K, ker.H)
                       + centerWeight * metric.DistYCbCr(ker.J, ker.G);

            //accumulated colour distance over pixel pairs running in the F-K direction
            var diagFK = metric.DistYCbCr(ker.E, ker.J)
                       + metric.DistYCbCr(ker.J, ker.O)
                       + metric.DistYCbCr(ker.B, ker.G)
                       + metric.DistYCbCr(ker.G, ker.L)
                       + centerWeight * metric.DistYCbCr(ker.F, ker.K);

            if (diagJG < diagFK)
            {
                //J-G distance is the smaller one: F and K are the blend candidates;
                //"dominant" when the gap exceeds the configured threshold factor
                var blend = (char)((_cfg.DominantDirectionThreshold * diagJG < diagFK) ? BlendType.Dominant : BlendType.Normal);

                if (!(ker.F == ker.G || ker.F == ker.J))
                {
                    _blendResult.F = blend;
                }

                if (!(ker.K == ker.J || ker.K == ker.G))
                {
                    _blendResult.K = blend;
                }
            }
            else if (diagFK < diagJG)
            {
                //F-K distance is the smaller one: J and G are the blend candidates
                var blend = (char)((_cfg.DominantDirectionThreshold * diagFK < diagJG) ? BlendType.Dominant : BlendType.Normal);

                if (!(ker.J == ker.F || ker.J == ker.K))
                {
                    _blendResult.J = blend;
                }

                if (!(ker.G == ker.F || ker.G == ker.K))
                {
                    _blendResult.G = blend;
                }
            }
            //equal distances: leave the reset result untouched
        }
        //scaler policy: see "Scaler2x" reference implementation
        //Scales the source rows [yFirst, yLast) of src into trg.
        //  src:       source pixels, row-major, srcWidth pixels per row
        //  trg:       target pixels, row-major, srcWidth * _scaler.Scale pixels per row
        //  srcWidth:  source width in pixels; nothing is done when it is <= 0
        //  srcHeight: source height in pixels
        //  yFirst:    first source row to process (clamped to 0)
        //  yLast:     one past the last source row to process (clamped to srcHeight)
        //Side effects: writes trg, replaces _outputMatrix and overwrites _blendResult
        //(through PreProcessCorners).
        private void ScaleImageImpl(int[] src, int[] trg, int srcWidth, int srcHeight, int yFirst, int yLast)
        {
            //restrict the requested stripe to rows that exist
            yFirst = Math.Max(yFirst, 0);
            yLast  = Math.Min(yLast, srcHeight);

            if (srcWidth <= 0 || yFirst >= yLast)
            {
                return;
            }

            //highest valid column / row index, used to clamp neighbour lookups at the border
            var lastCol = srcWidth - 1;
            var lastRow = srcHeight - 1;

            var outWidth = srcWidth * _scaler.Scale;

            //per-column scratch values for "on the fly preprocessing":
            //corner results found on one row are carried over to the next one
            var cornerFlags = new char[srcWidth];

            var window4 = new Kernel4x4();

            //Seed the scratch buffer from the row just above the stripe so the first
            //processed row already knows its upper-left and upper-right corner results.
            //This cannot be optimized for adjacent processing stripes;
            //we must not allow for a memory race condition!
            if (yFirst > 0)
            {
                var row = yFirst - 1;

                //start offsets of the four source rows covered by the 4x4 window
                var above  = srcWidth * Math.Max(row - 1, 0);
                var center = srcWidth * row;
                var below  = srcWidth * Math.Min(row + 1, lastRow);
                var below2 = srcWidth * Math.Min(row + 2, lastRow);

                for (var col = 0; col < srcWidth; ++col)
                {
                    var left   = Math.Max(col - 1, 0);
                    var right  = Math.Min(col + 1, lastCol);
                    var right2 = Math.Min(col + 2, lastCol);

                    //load the window row by row to keep memory reads sequential
                    window4.A = src[above + left];
                    window4.B = src[above + col];
                    window4.C = src[above + right];
                    window4.D = src[above + right2];

                    window4.E = src[center + left];
                    window4.F = src[center + col];
                    window4.G = src[center + right];
                    window4.H = src[center + right2];

                    window4.I = src[below + left];
                    window4.J = src[below + col];
                    window4.K = src[below + right];
                    window4.L = src[below + right2];

                    window4.M = src[below2 + left];
                    window4.N = src[below2 + col];
                    window4.O = src[below2 + right];
                    window4.P = src[below2 + right2];

                    //leaves its answer in _blendResult
                    PreProcessCorners(window4);

                    //preprocessing blend result:
                    //  ---------
                    //  | F | G |   evaluate corner between F, G, J, K
                    //  |---|---|   input pixel is at position F
                    //  | J | K |
                    //  ---------
                    //J and K belong to row yFirst, i.e. the first row of the stripe
                    cornerFlags[col] = cornerFlags[col].SetTopR(_blendResult.J);

                    if (col < lastCol)
                    {
                        cornerFlags[col + 1] = cornerFlags[col + 1].SetTopL(_blendResult.K);
                    }
                }
            }

            _outputMatrix = new OutputMatrix(_scaler.Scale, trg, outWidth);

            var window3 = new Kernel3x3();

            for (var row = yFirst; row < yLast; ++row)
            {
                //index of the first target pixel of this source row (consider MT "striped" access)
                var outIndex = _scaler.Scale * row * outWidth;

                //start offsets of the four source rows covered by the 4x4 window
                var above  = srcWidth * Math.Max(row - 1, 0);
                var center = srcWidth * row;
                var below  = srcWidth * Math.Min(row + 1, lastRow);
                var below2 = srcWidth * Math.Min(row + 2, lastRow);

                //corner results collected so far for position (col, row + 1)
                var nextRowFlags = '\0';

                for (var col = 0; col < srcWidth; ++col, outIndex += _scaler.Scale)
                {
                    var left   = Math.Max(col - 1, 0);
                    var right  = Math.Min(col + 1, lastCol);
                    var right2 = Math.Min(col + 2, lastCol);

                    //evaluate the corner on the bottom-right of the current pixel;
                    //load the window row by row to keep memory reads sequential
                    window4.A = src[above + left];
                    window4.B = src[above + col];
                    window4.C = src[above + right];
                    window4.D = src[above + right2];

                    window4.E = src[center + left];
                    window4.F = src[center + col];
                    window4.G = src[center + right];
                    window4.H = src[center + right2];

                    window4.I = src[below + left];
                    window4.J = src[below + col];
                    window4.K = src[below + right];
                    window4.L = src[below + right2];

                    window4.M = src[below2 + left];
                    window4.N = src[below2 + col];
                    window4.O = src[below2 + right];
                    window4.P = src[below2 + right2];

                    //leaves its answer in _blendResult
                    PreProcessCorners(window4);

                    //preprocessing blend result:
                    //  ---------
                    //  | F | G |   evaluate corner between F, G, J, K
                    //  |---|---|   current input pixel is at position F
                    //  | J | K |
                    //  ---------

                    //with this last corner all four corners of (col, row) are known,
                    //thanks to the processing sequence
                    var pixelFlags = cornerFlags[col].SetBottomR(_blendResult.F);

                    //2nd known corner for (col, row + 1); park it in the buffer slot of
                    //this column, where the next row will pick it up
                    nextRowFlags = nextRowFlags.SetTopR(_blendResult.J);
                    cornerFlags[col] = nextRowFlags;

                    //1st known corner for (col + 1, row + 1); kept for the next column
                    nextRowFlags = '\0'.SetTopL(_blendResult.K);

                    if (col < lastCol)
                    {
                        //3rd known corner for (col + 1, row)
                        cornerFlags[col + 1] = cornerFlags[col + 1].SetBottomL(_blendResult.G);
                    }

                    //fill the scale * scale target block with the source colour;
                    //placed *after* the preprocessing step, to not overwrite the
                    //results while processing the last pixel
                    FillBlock(trg, outIndex, outWidth, src[center + col], _scaler.Scale);

                    //blend the four corners of the current pixel, if any corner asked for it
                    if (pixelFlags != 0)
                    {
                        //3x3 neighbourhood around the current pixel, row-major (slot 4 = centre);
                        //read sequentially from memory as far as possible
                        window3._[0] = src[above + left];
                        window3._[1] = src[above + col];
                        window3._[2] = src[above + right];

                        window3._[3] = src[center + left];
                        window3._[4] = src[center + col];
                        window3._[5] = src[center + right];

                        window3._[6] = src[below + left];
                        window3._[7] = src[below + col];
                        window3._[8] = src[below + right];

                        //one pass per rotation
                        ScalePixel(_scaler, (int)RotationDegree.R0, window3, outIndex, pixelFlags);
                        ScalePixel(_scaler, (int)RotationDegree.R90, window3, outIndex, pixelFlags);
                        ScalePixel(_scaler, (int)RotationDegree.R180, window3, outIndex, pixelFlags);
                        ScalePixel(_scaler, (int)RotationDegree.R270, window3, outIndex, pixelFlags);
                    }
                }
            }
        }