/// <summary>
/// Sorts everything currently readable from <paramref name="recordsReader"/> by the
/// given field and writes the result to <paramref name="output"/>. A single block is
/// written directly; multiple blocks are spilled to temp storage and k-way merged.
/// </summary>
private void Sort(RecordsReader recordsReader, int field, StreamWriter output, int workersCount, long bufferSizeInBytesPerWorker)
{
    Stopwatch sortTimer = Stopwatch.StartNew();

    RecordsBuffer firstBlock;
    int sortedBlockCount = workersCount > 1
        ? SortBlocksParallel(recordsReader, field, workersCount, out firstBlock)
        : SortBlocksSequential(recordsReader, field, out firstBlock);

    sortTimer.Stop();

    if (sortedBlockCount == 1)
    {
        // Everything fit into one in-memory block — no temp files were created,
        // so write it straight out and skip the merge phase entirely.
        RecordsWriter directWriter = new RecordsWriter(output);
        directWriter.WriteRecords(firstBlock);
        return;
    }

    Console.WriteLine($"{sortedBlockCount} blocks sorted in {sortTimer.Elapsed}");
    Console.WriteLine("Merging...");

    Stopwatch mergeTimer = Stopwatch.StartNew();
    MergeBlocks(sortedBlockCount, field, output, bufferSizeInBytesPerWorker);
    mergeTimer.Stop();

    Console.WriteLine($"{sortedBlockCount} blocks merged in {mergeTimer.Elapsed}");

    // The merged result is on `output`; the per-block temp data is no longer needed.
    _tempStreams.ClearBlocks();
}
/// <summary>
/// K-way merges the previously spilled temp blocks into <paramref name="output"/>.
/// The available buffer is divided into (blockCount + 1) equal shares: one read
/// buffer per input block plus one share for the buffered writer.
/// </summary>
private void MergeBlocks(int blockCount, int field, StreamWriter output, long bufferSizeInBytes)
{
    long shareSize = bufferSizeInBytes / (blockCount + 1);

    List<Stream> openedStreams = new List<Stream>();
    try
    {
        List<IEnumerator<Record>> blockEnumerators = new List<IEnumerator<Record>>();
        for (int blockIndex = 0; blockIndex < blockCount; blockIndex++)
        {
            Stream stream = _tempStreams.OpenBlockStream(blockIndex);
            openedStreams.Add(stream);

            RecordsReader reader = new RecordsReader(new StreamReader(stream), shareSize);
            blockEnumerators.Add(reader.ReadRecords().GetEnumerator());
        }

        using (BufferedRecordsWriter writer = new BufferedRecordsWriter(output, shareSize))
        {
            MergeBlocks(blockEnumerators, writer, field);
        }
    }
    finally
    {
        // Disposing each stream also releases the StreamReader built on top of it.
        foreach (Stream stream in openedStreams)
        {
            stream.Dispose();
        }
    }
}
/// <summary>
/// Sorts blocks one at a time on the calling thread, spilling each sorted block
/// to temp storage.
/// </summary>
/// <param name="firstBlock">
/// Receives the sorted block when the entire input fit into a single block
/// (nothing is spilled in that case); otherwise stays null.
/// </param>
/// <returns>The number of blocks sorted (0 for empty input).</returns>
private int SortBlocksSequential(RecordsReader recordsReader, int field, out RecordsBuffer firstBlock)
{
    firstBlock = null;
    int sortedCount = 0;

    foreach (RecordsBuffer block in recordsReader.ReadBlocks())
    {
        block.Sort(field);

        // Whole input fit in one block: hand it back without touching disk.
        if (sortedCount == 0 && recordsReader.IsLastBlock)
        {
            firstBlock = block;
            return 1;
        }

        WriteBlock(block, sortedCount);
        Console.WriteLine($"Block {sortedCount} sorted");
        sortedCount++;
    }

    return sortedCount;
}
/// <summary>
/// Sorts the blocks read from <paramref name="recordsReader"/> with a bounded
/// producer/consumer pipeline: one task reads blocks while up to
/// <paramref name="workersCount"/> blocks are in flight at once.
/// </summary>
/// <param name="recordsReader">Source of record blocks.</param>
/// <param name="field">Index of the field to sort by.</param>
/// <param name="workersCount">Capacity of the hand-off collections, bounding in-flight blocks.</param>
/// <param name="firstBlock">
/// Set to the sorted single block when the whole input fits into one block;
/// otherwise remains null.
/// </param>
/// <returns>The number of blocks sorted (0 if the input was empty).</returns>
private int SortBlocksParallel(RecordsReader recordsReader, int field, int workersCount, out RecordsBuffer firstBlock)
{
    firstBlock = null;
    IEnumerator<RecordsBuffer> blocks = recordsReader.ReadBlocks().GetEnumerator();
    if (!blocks.MoveNext())
    {
        // Empty input: nothing to sort, no tasks started.
        return(0);
    }
    if (recordsReader.IsLastBlock)
    {
        // Entire input fit into a single block: sort in memory, skip the pipeline.
        firstBlock = blocks.Current;
        firstBlock.Sort(field);
        return(1);
    }
    int blockIndex = 0;
    // blockCollection passes (block, index) pairs from the reader to the sorting
    // dispatcher; its bounded capacity throttles the reader when sorting lags.
    // sortCompletionCollection is filled here once per block — presumably drained
    // by StartBlockSortingTask on completion to cap concurrent sorts; TODO confirm
    // against StartBlockSortingTask (not visible in this file chunk).
    using (var blockCollection = new BlockingCollection<Tuple<RecordsBuffer, int>>(workersCount))
    using (var sortCompletionCollection = new BlockingCollection<bool>(workersCount))
    {
        // Producer: enqueues the block already read by MoveNext() above, then
        // keeps reading. blockIndex is only written on this task.
        Task blocksReadingTask = Task.Factory.StartNew(() =>
        {
            do
            {
                blockCollection.Add(new Tuple<RecordsBuffer, int>(blocks.Current, blockIndex++));
                sortCompletionCollection.Add(true);
            } while (blocks.MoveNext());
            blockCollection.CompleteAdding();
        });
        // Dispatcher: takes each block and spawns a sort task for it, then waits
        // for every spawned task before finishing.
        Task blocksSortingTask = Task.Factory.StartNew(() =>
        {
            List<Task> sortTasks = new List<Task>();
            try
            {
                while (true)
                {
                    Tuple<RecordsBuffer, int> blockAndIndex = blockCollection.Take();
                    Task t = StartBlockSortingTask(blockAndIndex.Item1, blockAndIndex.Item2, field, sortCompletionCollection);
                    sortTasks.Add(t);
                }
            }
            catch (InvalidOperationException)
            {
                // An InvalidOperationException means that Take() was called on a completed collection
            }
            Task.WaitAll(sortTasks.ToArray());
        });
        // WaitAll also establishes the memory barrier that makes the producer's
        // final blockIndex value safely visible to the return below.
        Task.WaitAll(blocksReadingTask, blocksSortingTask);
    }
    return(blockIndex);
}
/// <summary>
/// Sorts big tabled data from stream by specified fields.
/// </summary>
/// <param name="input">Source data stream.</param>
/// <param name="fields">Field indexes by which sorting will happen one after another.</param>
/// <param name="output">Dest data stream.</param>
/// <exception cref="ArgumentNullException"><paramref name="input"/>, <paramref name="fields"/>, or <paramref name="output"/> is null.</exception>
public void Sort(Stream input, int[] fields, Stream output)
{
    if (input == null) throw new ArgumentNullException(nameof(input));
    if (fields == null) throw new ArgumentNullException(nameof(fields));
    if (output == null) throw new ArgumentNullException(nameof(output));

    int prevField = -1;
    Stream originalOutput = output;

    // Ping-pong between the temp stream and the real output. tempOutputFirst
    // selects which stream receives pass 0 so that the final pass
    // (i == fields.Length - 1) always lands on originalOutput.
    int tempOutputFirst = fields.Length % 2;
    Stream tempOutput = fields.Length > 1 ? _tempStreams.CreateTempOutputStream() : null;

    int workersCount = _maxWorkersCount;
    long bufferSizeInBytesPerWorker = _maxBufferSizeInBytes / workersCount;

    try
    {
        for (int i = 0; i < fields.Length; i++)
        {
            Console.WriteLine($"Sorting by field {fields[i]}...");
            Stopwatch stopwatch = Stopwatch.StartNew();

            output = (i % 2) == tempOutputFirst ? tempOutput : originalOutput;
            input.Position = 0;
            output.Position = 0;

            StreamReader sr = new StreamReader(input);
            StreamWriter sw = new StreamWriter(output);

            // prevField is handed to the reader — presumably so block boundaries
            // respect the ordering established by the previous pass; TODO confirm
            // against RecordsReader (not visible in this file chunk).
            RecordsReader recordsReader = new RecordsReader(sr, bufferSizeInBytesPerWorker, prevField);
            while (!recordsReader.IsEnd)
            {
                Sort(recordsReader, fields[i], sw, workersCount, bufferSizeInBytesPerWorker);
            }

            // Flush rather than Dispose: disposing the writer would close the
            // underlying stream, which the caller owns.
            sw.Flush();

            prevField = fields[i];
            input = output;    // next pass reads what this pass just wrote

            stopwatch.Stop();
            Console.WriteLine($"Sorting by field {fields[i]} done in {stopwatch.Elapsed}");
        }
    }
    finally
    {
        if (tempOutput != null)
        {
            tempOutput.Dispose();
        }
    }
}