/// <summary>
/// Sorts an array of unsigned 32-bit integers with a least-significant-digit radix sort
/// (radix 256, four 8-bit digits), processing the array in fixed-size work quantas of
/// <c>SortRadixParallelWorkQuanta</c> elements.
/// </summary>
/// <remarks>
/// Each pass has three phases: (1) count the current digit separately for every quanta,
/// (2) turn those counts into per-quanta start offsets, (3) permute every quanta into the
/// destination buffer. The per-quanta counts are recomputed on every pass because each
/// permutation changes which elements belong to which quanta; a histogram taken once from
/// the original ordering is only valid for the first digit.
/// The resulting order relies on <c>ExtractDigit</c> returning the byte selected by the
/// mask and shift (ascending order in that case).
/// </remarks>
/// <param name="inputArray">Array to sort. It is sorted in place.</param>
/// <returns>The same <paramref name="inputArray"/> instance, holding the sorted values.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputArray"/> is null.</exception>
public static uint[] SortRadixPar2(this uint[] inputArray)
{
    if (inputArray == null)
    {
        throw new ArgumentNullException(nameof(inputArray));
    }

    const int numberOfBins = 256;
    const int log2OfPowerOfTwoRadix = 8;
    const int numDigits = 4;

    if (inputArray.Length == 0)
    {
        return inputArray;      // nothing to sort; avoid allocating the scratch buffer
    }

    uint length = (uint)inputArray.Length;
    uint quantaSize = (uint)SortRadixParallelWorkQuanta;

    // The last quanta may be partial when the length is not a multiple of the quanta size.
    uint numberOfFullQuantas = length / quantaSize;
    uint numberOfQuantas = (length % quantaSize == 0) ? numberOfFullQuantas : numberOfFullQuantas + 1;

    // One row of bins per quanta. Each row first holds the digit counts of its quanta and is
    // then converted in place into the destination start index of every bin for that quanta.
    uint[][] startOfBin = new uint[numberOfQuantas][];
    for (uint q = 0; q < numberOfQuantas; q++)
    {
        startOfBin[q] = new uint[numberOfBins];
    }

    uint[] source = inputArray;
    uint[] destination = new uint[inputArray.Length];

    uint bitMask = 255;
    int shiftRightAmount = 0;

    for (int digit = 0; digit < numDigits; digit++)
    {
        // Phase 1: histogram of the current digit, separately for each quanta of the
        // current source ordering.
        for (uint q = 0; q < numberOfQuantas; q++)
        {
            uint[] counts = startOfBin[q];
            Array.Clear(counts, 0, counts.Length);

            uint begin = q * quantaSize;
            uint end = (length - begin < quantaSize) ? length : begin + quantaSize;
            for (uint i = begin; i < end; i++)
            {
                counts[ExtractDigit(source[i], bitMask, shiftRightAmount)]++;
            }
        }

        // Phase 2: exclusive prefix sum in bin-major, quanta-minor order. All of bin 0 comes
        // before all of bin 1, and so on; within a bin, the piece belonging to quanta 0 comes
        // before the piece belonging to quanta 1, which keeps the pass stable.
        uint runningStart = 0;
        for (int b = 0; b < numberOfBins; b++)
        {
            for (uint q = 0; q < numberOfQuantas; q++)
            {
                uint countInBin = startOfBin[q][b];
                startOfBin[q][b] = runningStart;
                runningStart += countInBin;
            }
        }

        // Phase 3: permutation. Every quanta writes only into the destination slots reserved
        // for it in phase 2 and advances only its own row of start indexes.
        for (uint q = 0; q < numberOfQuantas; q++)
        {
            uint[] starts = startOfBin[q];

            uint begin = q * quantaSize;
            uint end = (length - begin < quantaSize) ? length : begin + quantaSize;
            for (uint i = begin; i < end; i++)
            {
                uint value = source[i];
                destination[starts[ExtractDigit(value, bitMask, shiftRightAmount)]++] = value;
            }
        }

        // The destination of this pass is the source of the next one.
        uint[] tmp = source;
        source = destination;
        destination = tmp;

        bitMask <<= log2OfPowerOfTwoRadix;
        shiftRightAmount += log2OfPowerOfTwoRadix;
    }

    // After the final swap 'source' holds the sorted data. With an even number of passes that
    // is already the caller's array; otherwise copy the result back so the sort stays in place.
    if (!ReferenceEquals(source, inputArray))
    {
        Array.Copy(source, inputArray, inputArray.Length);
    }

    return inputArray;
}