/// <summary>
        /// Builds one 64-byte output block from the working state and the supplied counter.
        /// Sixteen 32-bit words are mixed with add/xor/rotate rounds (the 16/12/8/7 rotation
        /// pattern matches the ChaCha quarter round), the initial words are added back in,
        /// and each word is written to Output through IntUtils.Le32ToBytes.
        /// </summary>
        /// <param name="Output">Destination array; 64 bytes are written starting at OutOffset</param>
        /// <param name="OutOffset">Index in Output of the first byte written</param>
        /// <param name="Counter">Counter words; elements 0 and 1 fill state words 12 and 13</param>
        private void Transform(byte[] Output, int OutOffset, uint[] Counter)
        {
            // load the sixteen state words; words 12 and 13 come from the caller's counter
            uint s0  = m_wrkState[0];
            uint s1  = m_wrkState[1];
            uint s2  = m_wrkState[2];
            uint s3  = m_wrkState[3];
            uint s4  = m_wrkState[4];
            uint s5  = m_wrkState[5];
            uint s6  = m_wrkState[6];
            uint s7  = m_wrkState[7];
            uint s8  = m_wrkState[8];
            uint s9  = m_wrkState[9];
            uint s10 = m_wrkState[10];
            uint s11 = m_wrkState[11];
            uint s12 = Counter[0];
            uint s13 = Counter[1];
            uint s14 = m_wrkState[12];
            uint s15 = m_wrkState[13];

            // each pass performs a column round followed by a diagonal round, so it counts as two rounds
            for (int rnd = Rounds; rnd != 0; rnd -= 2)
            {
                // column: words 0, 4, 8, 12
                s0 += s4;
                s12 = IntUtils.RotateLeft(s12 ^ s0, 16);
                s8 += s12;
                s4 = IntUtils.RotateLeft(s4 ^ s8, 12);
                s0 += s4;
                s12 = IntUtils.RotateLeft(s12 ^ s0, 8);
                s8 += s12;
                s4 = IntUtils.RotateLeft(s4 ^ s8, 7);

                // column: words 1, 5, 9, 13
                s1 += s5;
                s13 = IntUtils.RotateLeft(s13 ^ s1, 16);
                s9 += s13;
                s5 = IntUtils.RotateLeft(s5 ^ s9, 12);
                s1 += s5;
                s13 = IntUtils.RotateLeft(s13 ^ s1, 8);
                s9 += s13;
                s5 = IntUtils.RotateLeft(s5 ^ s9, 7);

                // column: words 2, 6, 10, 14
                s2 += s6;
                s14 = IntUtils.RotateLeft(s14 ^ s2, 16);
                s10 += s14;
                s6 = IntUtils.RotateLeft(s6 ^ s10, 12);
                s2 += s6;
                s14 = IntUtils.RotateLeft(s14 ^ s2, 8);
                s10 += s14;
                s6 = IntUtils.RotateLeft(s6 ^ s10, 7);

                // column: words 3, 7, 11, 15
                s3 += s7;
                s15 = IntUtils.RotateLeft(s15 ^ s3, 16);
                s11 += s15;
                s7 = IntUtils.RotateLeft(s7 ^ s11, 12);
                s3 += s7;
                s15 = IntUtils.RotateLeft(s15 ^ s3, 8);
                s11 += s15;
                s7 = IntUtils.RotateLeft(s7 ^ s11, 7);

                // diagonal: words 0, 5, 10, 15
                s0 += s5;
                s15 = IntUtils.RotateLeft(s15 ^ s0, 16);
                s10 += s15;
                s5 = IntUtils.RotateLeft(s5 ^ s10, 12);
                s0 += s5;
                s15 = IntUtils.RotateLeft(s15 ^ s0, 8);
                s10 += s15;
                s5 = IntUtils.RotateLeft(s5 ^ s10, 7);

                // diagonal: words 1, 6, 11, 12
                s1 += s6;
                s12 = IntUtils.RotateLeft(s12 ^ s1, 16);
                s11 += s12;
                s6 = IntUtils.RotateLeft(s6 ^ s11, 12);
                s1 += s6;
                s12 = IntUtils.RotateLeft(s12 ^ s1, 8);
                s11 += s12;
                s6 = IntUtils.RotateLeft(s6 ^ s11, 7);

                // diagonal: words 2, 7, 8, 13
                s2 += s7;
                s13 = IntUtils.RotateLeft(s13 ^ s2, 16);
                s8 += s13;
                s7 = IntUtils.RotateLeft(s7 ^ s8, 12);
                s2 += s7;
                s13 = IntUtils.RotateLeft(s13 ^ s2, 8);
                s8 += s13;
                s7 = IntUtils.RotateLeft(s7 ^ s8, 7);

                // diagonal: words 3, 4, 9, 14
                s3 += s4;
                s14 = IntUtils.RotateLeft(s14 ^ s3, 16);
                s9 += s14;
                s4 = IntUtils.RotateLeft(s4 ^ s9, 12);
                s3 += s4;
                s14 = IntUtils.RotateLeft(s14 ^ s3, 8);
                s9 += s14;
                s4 = IntUtils.RotateLeft(s4 ^ s9, 7);
            }

            // feed-forward: add the starting words back in and serialize, four bytes per word
            IntUtils.Le32ToBytes(s0 + m_wrkState[0], Output, OutOffset);
            IntUtils.Le32ToBytes(s1 + m_wrkState[1], Output, OutOffset + 4);
            IntUtils.Le32ToBytes(s2 + m_wrkState[2], Output, OutOffset + 8);
            IntUtils.Le32ToBytes(s3 + m_wrkState[3], Output, OutOffset + 12);
            IntUtils.Le32ToBytes(s4 + m_wrkState[4], Output, OutOffset + 16);
            IntUtils.Le32ToBytes(s5 + m_wrkState[5], Output, OutOffset + 20);
            IntUtils.Le32ToBytes(s6 + m_wrkState[6], Output, OutOffset + 24);
            IntUtils.Le32ToBytes(s7 + m_wrkState[7], Output, OutOffset + 28);
            IntUtils.Le32ToBytes(s8 + m_wrkState[8], Output, OutOffset + 32);
            IntUtils.Le32ToBytes(s9 + m_wrkState[9], Output, OutOffset + 36);
            IntUtils.Le32ToBytes(s10 + m_wrkState[10], Output, OutOffset + 40);
            IntUtils.Le32ToBytes(s11 + m_wrkState[11], Output, OutOffset + 44);
            IntUtils.Le32ToBytes(s12 + Counter[0], Output, OutOffset + 48);
            IntUtils.Le32ToBytes(s13 + Counter[1], Output, OutOffset + 52);
            IntUtils.Le32ToBytes(s14 + m_wrkState[12], Output, OutOffset + 56);
            IntUtils.Le32ToBytes(s15 + m_wrkState[13], Output, OutOffset + 60);
        }
        /// <summary>
        /// Fills Output with generated keystream and XORs it with Input. When parallel
        /// processing is enabled and the request is at least m_parallelBlockSize bytes,
        /// the block-aligned portion is split into equal chunks processed concurrently,
        /// and any remaining tail is processed afterwards on the calling thread.
        /// </summary>
        /// <param name="Input">Source bytes</param>
        /// <param name="InOffset">Index of the first byte read from Input</param>
        /// <param name="Output">Destination bytes</param>
        /// <param name="OutOffset">Index of the first byte written to Output</param>
        /// <param name="Length">Requested byte count; clamped to what both buffers can hold past their offsets</param>
        private void Process(byte[] Input, int InOffset, byte[] Output, int OutOffset, int Length)
        {
            // clamp to the smallest of: the request, the readable input, the writable output;
            // a request exceeding only one of the buffers must not overrun the shorter one
            int prcSze = IntUtils.Min(Length, IntUtils.Min(Input.Length - InOffset, Output.Length - OutOffset));

            if (!m_isParallel || prcSze < m_parallelBlockSize)
            {
                // generate random
                Generate(prcSze, m_ctrVector, Output, OutOffset);
                // output is input xor with random
                int sze = prcSze - (prcSze % BLOCK_SIZE);

                if (sze != 0)
                {
                    IntUtils.XORBLK(Input, InOffset, Output, OutOffset, sze);
                }

                // xor the bytes past the last whole block one at a time
                for (int i = sze; i < prcSze; ++i)
                {
                    Output[i + OutOffset] ^= Input[i + InOffset];
                }
            }
            else
            {
                // parallel CTR processing //
                // NOTE(review): chunk sizing reads ProcessorCount while the loop bound reads
                // m_processorCount; presumably the property wraps the field — confirm
                int cnkSize = (prcSze / BLOCK_SIZE / ProcessorCount) * BLOCK_SIZE;
                int rndSize = cnkSize * ProcessorCount;
                int subSize = (cnkSize / BLOCK_SIZE);
                // receives the counter of the last chunk once its thread has finished
                uint[] tmpCtr = new uint[m_ctrVector.Length];

                // create random, and xor to output in parallel
                System.Threading.Tasks.Parallel.For(0, m_processorCount, i =>
                {
                    // thread level counter, offset by the number of blocks in the preceding chunks
                    uint[] thdCtr = Increase(m_ctrVector, subSize * i);
                    // create random at offset position
                    this.Generate(cnkSize, thdCtr, Output, OutOffset + (i * cnkSize));
                    // xor with input at offset
                    IntUtils.XORBLK(Input, InOffset + (i * cnkSize), Output, OutOffset + (i * cnkSize), cnkSize);
                    // store last counter; presumably Generate advances the counter it is given
                    // (the sequential path passes m_ctrVector directly) — confirm
                    if (i == m_processorCount - 1)
                    {
                        Array.Copy(thdCtr, 0, tmpCtr, 0, thdCtr.Length);
                    }
                });

                // last block processing
                if (rndSize < prcSze)
                {
                    // subtraction equals the former modulo whenever rndSize > 0 and cannot divide by zero
                    int fnlSize = prcSze - rndSize;
                    // the keystream must land at the same position the xor below reads from
                    Generate(fnlSize, tmpCtr, Output, OutOffset + rndSize);

                    for (int i = 0; i < fnlSize; ++i)
                    {
                        Output[i + OutOffset + rndSize] ^= Input[i + InOffset + rndSize];
                    }
                }

                // copy the last counter position to class variable
                Array.Copy(tmpCtr, 0, m_ctrVector, 0, m_ctrVector.Length);
            }
        }