// Histogram/counting and permutation are two separate phases, both executed once per digit over independent work quantas
        /// <summary>
        /// Sorts an array of unsigned 32-bit integers in place with a least-significant-digit radix sort
        /// (8-bit digits, 256 bins, four passes). The array is split into work quantas of
        /// <c>SortRadixParallelWorkQuanta</c> elements; each quanta gets its own set of bin start
        /// offsets, so quantas never write to the same output slot and can be processed independently.
        /// </summary>
        /// <remarks>
        /// A scratch array of the same length as <paramref name="inputArray"/> is allocated for the duration of the call.
        /// The resulting order is ascending provided <c>ExtractDigit(value, bitMask, shiftRightAmount)</c>
        /// returns the masked bits shifted down to the range 0..255 (its definition is outside this method).
        /// </remarks>
        /// <param name="inputArray">Array to sort. Its contents are overwritten with the sorted result.</param>
        /// <returns>The same <paramref name="inputArray"/> instance, holding the sorted elements.</returns>
        public static uint[] SortRadixPar2(this uint[] inputArray)
        {
            const int numberOfBins = 256;
            const int log2OfRadix  = 8;

            if (inputArray.Length == 0)
            {
                return inputArray;      // nothing to sort; return the caller's array just like the non-empty path does
            }

            uint workQuanta      = SortRadixParallelWorkQuanta;
            uint length          = (uint)inputArray.Length;
            uint numberOfQuantas = length / workQuanta + (length % workQuanta == 0 ? 0u : 1u);     // last quanta may be partial

            uint[] outputArray = new uint[inputArray.Length];

            // One row of bins per work quanta. Within a pass each row first holds that quanta's digit counts
            // and is then converted in place into that quanta's starting output index for every bin.
            uint[][] startOfBin = new uint[numberOfQuantas][];
            for (uint q = 0; q < numberOfQuantas; q++)
            {
                startOfBin[q] = new uint[numberOfBins];
            }

            uint bitMask          = 255;
            int  shiftRightAmount = 0;

            while (bitMask != 0)    // end processing digits when all the mask bits have been shifted out of the 32-bit mask
            {
                // Phase 1: histogram of the current digit, per work quanta.
                // This must be recomputed for every digit from the array as it is ordered *now*: the previous
                // pass moved elements between quantas, so counts taken from the original ordering no longer
                // describe what each quanta contains.
                // Each quanta touches only its own row, so this loop is a candidate for parallel execution.
                for (uint q = 0; q < numberOfQuantas; q++)
                {
                    uint[] countLoc = startOfBin[q];
                    Array.Clear(countLoc, 0, countLoc.Length);

                    uint current  = q * workQuanta;
                    uint numItems = Math.Min(workQuanta, length - current);    // the last quanta may not be full
                    for (uint i = 0; i < numItems; i++)
                    {
                        countLoc[ExtractDigit(inputArray[current], bitMask, shiftRightAmount)]++;
                        current++;
                    }
                }

                // Phase 2: turn counts into start offsets (exclusive prefix sum).
                // All of bin[0] comes before all of bin[1] and so on, and within a bin the pieces are laid out
                // in quanta order, which keeps the pass stable.
                uint nextStart = 0;
                for (int b = 0; b < numberOfBins; b++)
                {
                    for (uint q = 0; q < numberOfQuantas; q++)
                    {
                        uint binCount    = startOfBin[q][b];
                        startOfBin[q][b] = nextStart;
                        nextStart       += binCount;
                    }
                }

                // Phase 3: permutation. Every quanta writes into its own disjoint output ranges,
                // so this loop is also a candidate for parallel execution.
                for (uint q = 0; q < numberOfQuantas; q++)
                {
                    uint[] startOfBinLoc = startOfBin[q];

                    uint current  = q * workQuanta;
                    uint numItems = Math.Min(workQuanta, length - current);    // the last quanta may not be full
                    for (uint i = 0; i < numItems; i++)
                    {
                        uint value = inputArray[current];
                        outputArray[startOfBinLoc[ExtractDigit(value, bitMask, shiftRightAmount)]++] = value;
                        current++;
                    }
                }

                bitMask          <<= log2OfRadix;
                shiftRightAmount  += log2OfRadix;

                uint[] tmp  = inputArray;       // swap input and output arrays
                inputArray  = outputArray;
                outputArray = tmp;
            }

            // The number of passes (four) is even, so after the final swap inputArray refers to the
            // caller's original array and it holds the fully sorted result.
            return inputArray;
        }