// Brought Histogram outside of the main loop, to separate into two phases: Histogram/counting and permutation
        /// <summary>
        /// Sort an array of unsigned integers using a least-significant-digit Radix Sort with 8-bit digits,
        /// split into a counting phase followed by a permutation phase.
        /// </summary>
        /// <remarks>
        /// Phase 1 scans the array once and builds one 256-bin histogram per digit. These per-digit totals do
        /// not depend on the order of the elements, so they remain valid after every permutation pass.
        /// (Histograms split per work quanta do not have this property: elements move between quantas on
        /// every pass, so such counts would have to be recomputed before each pass.)
        /// Phase 2 runs one stable permutation pass per digit, alternating between the caller's array and a
        /// scratch buffer. All passes in this variant run on the calling thread.
        /// </remarks>
        /// <param name="inputArray">array to sort; a non-empty array is overwritten with the sorted values</param>
        /// <returns>
        /// the sorted array: the same instance as <paramref name="inputArray"/> when it is non-empty,
        /// otherwise a new empty array
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="inputArray"/> is null</exception>
        public static uint[] SortRadixPar2(this uint[] inputArray)
        {
            if (inputArray == null)
            {
                throw new ArgumentNullException("inputArray");
            }

            const int  numberOfBins          = 256;
            const int  Log2ofPowerOfTwoRadix = 8;
            const int  numDigits             = 4;       // 32-bit values, 8 bits per digit
            const uint digitMask             = 0xFF;

            uint[] outputArray = new uint[inputArray.Length];

            if (inputArray.Length == 0)
            {
                return(outputArray);
            }

            // Phase 1: count how often each digit value occurs, for all digits in a single scan
            uint[][] count = new uint[numDigits][];
            for (int d = 0; d < numDigits; d++)
            {
                count[d] = new uint[numberOfBins];
            }
            foreach (uint value in inputArray)
            {
                for (int d = 0; d < numDigits; d++)
                {
                    count[d][(value >> (d * Log2ofPowerOfTwoRadix)) & digitMask]++;
                }
            }

            // Phase 2: one stable permutation pass per digit, least significant digit first
            uint[] startOfBin  = new uint[numberOfBins];
            uint[] source      = inputArray;
            uint[] destination = outputArray;

            for (int d = 0; d < numDigits; d++)
            {
                int    shiftRightAmount = d * Log2ofPowerOfTwoRadix;
                uint[] countOfDigit     = count[d];

                // exclusive prefix sum: every bin starts where the previous bin ends
                startOfBin[0] = 0;
                for (int b = 1; b < numberOfBins; b++)
                {
                    startOfBin[b] = startOfBin[b - 1] + countOfDigit[b - 1];
                }

                // scanning in index order keeps equal digits in their previous relative order
                for (int current = 0; current < source.Length; current++)
                {
                    uint value = source[current];
                    destination[startOfBin[(value >> shiftRightAmount) & digitMask]++] = value;
                }

                uint[] tmp = source;       // swap source and destination buffers
                source      = destination;
                destination = tmp;
            }

            // After the final swap "source" holds the sorted values. With an even number of passes that is
            // already the caller's array; the copy only matters if numDigits is ever changed to an odd value.
            if (source != inputArray)
            {
                Array.Copy(source, inputArray, inputArray.Length);
            }

            return(inputArray);
        }
        ///// <summary>
        ///// Number of tasks that will run in parallel within the Parallel Radix Sort algorithm
        ///// </summary>
        //public static Int32 SortRadixParallelAmountOfParallelism { get; set; } = Environment.ProcessorCount;
        /// <summary>
        /// Sort an array of unsigned integers using a parallel least-significant-digit Radix Sort with 8-bit digits.
        /// </summary>
        /// <remarks>
        /// The array is treated as consecutive work items of <c>SortRadixParallelWorkQuanta</c> elements.
        /// On every digit pass the digits are counted per work item, the counts are turned into one write
        /// offset per (work item, bin) pair, each full work item is permuted by its own task, and the
        /// remaining tail is permuted on the calling thread once those tasks have finished.
        /// </remarks>
        /// <param name="inputArray">array of unsigned integers to sort; it is overwritten by the sort</param>
        /// <returns>array of unsigned integers</returns>
        public static uint[] SortRadixPar1(this uint[] inputArray)
        {
            const uint binCount     = 256;
            const int  bitsPerDigit = 8;

            uint[] source            = inputArray;
            uint[] target            = new uint[source.Length];
            bool   oddNumberOfPasses = false;

            // One more work item than full quantas: the last one holds the (possibly empty) tail
            uint workItemCount = (uint)source.Length / SortRadixParallelWorkQuanta + 1;
            uint fullWorkItems = workItemCount - 1;

            uint[][] digitCount = new uint[workItemCount][];    // per work item: occurrences of each digit value
            uint[][] writeIndex = new uint[workItemCount][];    // per work item: next output index for each digit value
            for (uint w = 0; w < workItemCount; w++)
            {
                digitCount[w] = new uint[binCount];
                writeIndex[w] = new uint[binCount];
            }

            // Use TPL ideas from https://docs.microsoft.com/en-us/dotnet/standard/parallel-programming/task-based-asynchronous-programming

            int shiftRightAmount = 0;
            for (uint mask = 255; mask != 0; mask <<= bitsPerDigit)    // the mask becomes zero once its bits are shifted out
            {
                // Count the digit values of this pass, separately for every work item
                for (uint w = 0; w < workItemCount; w++)
                {
                    Array.Clear(digitCount[w], 0, digitCount[w].Length);
                }
                for (uint i = 0; i < source.Length; i++)
                {
                    uint[] countsOfItem = digitCount[i / SortRadixParallelWorkQuanta];
                    countsOfItem[ExtractDigit(source[i], mask, shiftRightAmount)]++;
                }

                // Running offset: all of bin 0 comes before all of bin 1 and so on, and inside a bin
                // the pieces belonging to the work items follow each other in work item order
                uint nextFree = 0;
                for (uint b = 0; b < binCount; b++)
                {
                    for (uint w = 0; w < workItemCount; w++)
                    {
                        writeIndex[w][b] = nextFree;
                        nextFree        += digitCount[w][b];
                    }
                }

                // Every full work item is permuted by its own task
                Task[] workers = new Task[fullWorkItems];
                for (uint w = 0; w < fullWorkItems; w++)
                {
                    uint       firstIndex = w * SortRadixParallelWorkQuanta;
                    CustomData workItem   = new CustomData()
                    {
                        current = firstIndex, q = w, bitMask = mask, shiftRightAmount = shiftRightAmount
                    };
                    workers[w] = Task.Factory.StartNew((Object state) => {
                        CustomData data = state as CustomData;
                        if (data != null)
                        {
                            uint   index         = data.current;
                            uint   workItemIndex = data.q;
                            uint[] offsets       = writeIndex[workItemIndex];
                            for (uint n = 0; n < SortRadixParallelWorkQuanta; n++, index++)
                            {
                                uint value = source[index];
                                target[offsets[(value & data.bitMask) >> data.shiftRightAmount]++] = value;
                            }
                        }
                    },
                                                       workItem);
                }
                Task.WaitAll(workers);

                // The tail work item, which may hold fewer elements than a full quanta
                uint   tailIndex   = fullWorkItems * SortRadixParallelWorkQuanta;
                uint   tailLength  = (uint)source.Length % SortRadixParallelWorkQuanta;
                uint[] tailOffsets = writeIndex[fullWorkItems];
                for (uint n = 0; n < tailLength; n++, tailIndex++)
                {
                    uint value = source[tailIndex];
                    target[tailOffsets[ExtractDigit(value, mask, shiftRightAmount)]++] = value;
                }

                shiftRightAmount += bitsPerDigit;
                oddNumberOfPasses = !oddNumberOfPasses;

                uint[] swap = source;       // the buffer just written becomes the input of the next pass
                source = target;
                target = swap;
            }

            // A 32-bit mask shifted by 8 bits per pass gives four passes, so the flag ends up false
            // and the copy below is skipped.
            // NOTE(review): if the pass count ever became odd, this would copy from the buffer that was
            // read (not written) by the last pass - confirm the intended direction before relying on it.
            if (oddNumberOfPasses)
            {
                for (uint i = 0; i < source.Length; i++)
                {
                    source[i] = target[i];
                }
            }

            return(source);
        }