/// <summary>
/// Starts a background task that orders the given records and writes them to a
/// freshly created temporary file.
/// </summary>
/// <param name="records">the records to sort; the list is reordered in place</param>
/// <returns>a task whose result is the path of the temporary file holding the sorted records</returns>
private Task<string> SortAndSave(List<T> records)
{
    return Task.Run(() =>
    {
        if (!StableSort)
        {
            // In-place sort; List<T>.Sort does not guarantee stability.
            records.Sort(Comparer);
        }
        else
        {
            // OrderBy is stable; copy the result back so 'records' itself ends up sorted.
            var ordered = records.OrderBy(r => r, Comparer).ToList();
            records.Clear();
            records.AddRange(ordered);
        }

        // Dump the sorted batch into a temp file
        var tempPath = Path.GetTempFileName();
        _logger.Debug("Saving temp file: {0}", tempPath);

        var stream = new FileStream(tempPath, FileMode.Create);
        using (var writer = RecordAccessorFactory.CreateWriter(stream))
        {
            foreach (var item in records)
            {
                writer.Write(item);
            }
        }

        return tempPath;
    });
}
/// <summary>
/// Streams the records of every input file to the output without sorting.
/// Used when no comparer is configured; records are still filtered through
/// <c>Select</c> and written through the sum writers.
/// </summary>
private void Copy()
{
    _logger.Info("No comparer: copying records to output using filter and formatter");

    var sumWriters = GetSumWriters();
    try
    {
        foreach (var inputFile in InputFiles)
        {
            using (var reader = RecordAccessorFactory.CreateReader(inputFile.OpenRead()))
            {
                // The header is read, and WriteHeader invoked, once per input file.
                _header = reader.ReadHeader(HeaderSize);
                WriteHeader(sumWriters);

                for (var current = ReadRecord(reader); current != null; current = ReadRecord(reader))
                {
                    if (!Select(current))
                    {
                        continue;
                    }

                    WriteRecord(sumWriters, current);
                }
            }
        }
    }
    finally
    {
        // Release every sum writer, whether or not the copy succeeded
        foreach (var sumWriter in sumWriters)
        {
            sumWriter.Dispose();
        }
    }
}
/// <summary>
/// Merges the sorted temporary files into the final output file, then deletes
/// the temporary files.
/// </summary>
/// <param name="tmpFiles">paths of the sorted temporary files to merge</param>
/// <param name="outputFile">the file that receives the merged records</param>
private void Merge(ICollection<string> tmpFiles, FileInfo outputFile)
{
    _logger.Info("Merging temporary files");

    // One buffer per temporary file; the merge orders them by their current record.
    // The buffers are opened inside the try block so that the ones already opened
    // are released if a later file fails to open or the output cannot be created.
    var buffers = new List<RecordReaderBuffer<T>>(tmpFiles.Count);
    try
    {
        foreach (var tmpFile in tmpFiles)
        {
            buffers.Add(new RecordReaderBuffer<T>(
                RecordAccessorFactory.CreateReader(new FileStream(tmpFile, FileMode.Open)),
                Comparer));
        }

        using (var writer = GetSumWriter(outputFile))
        {
            writer.WriteHeader(_header);
            Merge(buffers, writer);
        }
    }
    finally
    {
        // Close the buffers
        foreach (var buffer in buffers)
        {
            buffer.Dispose();
        }
    }

    // Delete temp files (only reached when the merge succeeded)
    _logger.Info("Deleting temporary files");
    foreach (var file in tmpFiles)
    {
        File.Delete(file);
    }
}
/// <summary>
/// Loads the selected records of all input files into memory, sorts them and
/// writes them out through the sum writers.
/// </summary>
private void InMemorySort()
{
    _logger.Info("Sorting in memory");

    // Phase 1: load. _header is overwritten for each input file.
    var loaded = new List<T>();
    foreach (var inputFile in InputFiles)
    {
        using (var reader = RecordAccessorFactory.CreateReader(inputFile.OpenRead()))
        {
            _header = reader.ReadHeader(HeaderSize);

            for (var current = ReadRecord(reader); current != null; current = ReadRecord(reader))
            {
                if (Select(current))
                {
                    loaded.Add(current);
                }
            }
        }
    }

    // Phase 2: sort.
    if (!StableSort)
    {
        loaded.Sort(Comparer);
    }
    else
    {
        // OrderBy is stable; copy the ordered result back into the list.
        var ordered = loaded.OrderBy(r => r, Comparer).ToList();
        loaded.Clear();
        loaded.AddRange(ordered);
    }

    // Phase 3: write.
    var sumWriters = GetSumWriters();
    try
    {
        WriteHeader(sumWriters);
        foreach (var item in loaded)
        {
            WriteRecord(sumWriters, item);
        }
    }
    finally
    {
        // Release every sum writer, whether or not writing succeeded
        foreach (var sumWriter in sumWriters)
        {
            sumWriter.Dispose();
        }
    }
}
/// <summary>
/// Computes the maximum number of records to keep in memory.
/// The serialized size of the first record is used as the average record size.
/// </summary>
/// <param name="record">the first record</param>
/// <returns>
/// the maximum number of records to store in memory; always at least 1, even
/// when a single record is larger than the configured in-memory budget
/// </returns>
private long ComputeMaxInMemoryRecords(T record)
{
    // We write the record in a memory stream just to check its size,
    // then compare with the max in-memory size.
    // This is a very rough approximation, but it is fast to compute.
    long recordSize;
    using (var stream = new MemoryStream())
    {
        using (var writer = RecordAccessorFactory.CreateWriter(stream))
        {
            writer.Write(record);
        }

        // Measure after the writer is disposed, in case it buffers its output.
        // ToArray remains usable even if disposing the writer closed the stream.
        recordSize = stream.ToArray().Length;
    }

    if (recordSize <= 0)
    {
        // Nothing was serialized: avoid dividing by zero by assuming one byte.
        recordSize = 1;
    }

    var maxRecords = _maxInMemorySize / recordSize;

    // Never report fewer than one record, so callers always batch at least one.
    return maxRecords < 1 ? 1 : maxRecords;
}
/// <summary>
/// Merges the sorted temporary files to the final output through the sum
/// writers, then deletes the temporary files.
/// </summary>
/// <param name="tmpFiles">paths of the sorted temporary files, in creation order</param>
private void Merge(ICollection<string> tmpFiles)
{
    _logger.Info("Merging temporary files");

    // One buffer per temporary file; the merge orders them by their current record.
    // Each buffer receives the index of its file to enable stable sort.
    // The buffers are opened inside the try block so that the ones already opened
    // are released if a later file fails to open or the writers cannot be created.
    var buffers = new List<RecordReaderBuffer<T>>(tmpFiles.Count);
    try
    {
        var index = 0;
        foreach (var tmpFile in tmpFiles)
        {
            buffers.Add(new RecordReaderBuffer<T>(
                RecordAccessorFactory.CreateReader(new FileStream(tmpFile, FileMode.Open)),
                Comparer,
                StableSort,
                index));
            index++;
        }

        var sumWriters = GetSumWriters();
        try
        {
            WriteHeader(sumWriters);
            Merge(buffers, sumWriters);
        }
        finally
        {
            // Dispose all the sum writers
            foreach (var sumWriter in sumWriters)
            {
                sumWriter.Dispose();
            }
        }
    }
    finally
    {
        // Close the buffers
        foreach (var buffer in buffers)
        {
            buffer.Dispose();
        }
    }

    // Delete temp files (only reached when the merge succeeded)
    _logger.Info("Deleting temporary files");
    foreach (var file in tmpFiles)
    {
        File.Delete(file);
    }
}
/// <summary>
/// Sorts the input with an external merge sort: records are read in batches,
/// each batch is sorted and saved to a temporary file by a background task,
/// and the temporary files are finally merged.
/// </summary>
private void ExternalSort()
{
    _logger.Info("Input files too big for memory sort, performing an external merge sort");

    const long notComputed = -1;

    var pendingSaves = new List<Task<string>>();
    var batch = new List<T>();
    var batchLimit = notComputed;

    foreach (var inputFile in InputFiles)
    {
        using (var reader = RecordAccessorFactory.CreateReader(inputFile.OpenRead()))
        {
            _header = reader.ReadHeader(HeaderSize);

            for (var current = ReadRecord(reader); current != null; current = ReadRecord(reader))
            {
                if (batchLimit == notComputed)
                {
                    // The very first record read is used to size the batches
                    batchLimit = ComputeMaxInMemoryRecords(current);
                }

                if (batch.Count >= batchLimit)
                {
                    // Batch is full: hand it to a background sort-and-save task
                    // and start a fresh list for the following records
                    pendingSaves.Add(SortAndSave(batch));
                    batch = new List<T>();
                }

                if (Select(current))
                {
                    batch.Add(current);
                }
            }
        }
    }

    // Whatever is left forms the last batch
    pendingSaves.Add(SortAndSave(batch));

    // Block on each task in order, collecting the temporary file paths
    var tmpFiles = new List<string>();
    foreach (var pending in pendingSaves)
    {
        tmpFiles.Add(pending.Result);
    }

    Merge(tmpFiles);
}
/// <summary>
/// Starts a background task that sorts the given records in place and writes
/// them to a freshly created temporary file.
/// </summary>
/// <param name="records">the records to sort; the list is reordered in place</param>
/// <returns>a task whose result is the path of the temporary file holding the sorted records</returns>
private Task<string> SortAndSave(List<T> records)
{
    return Task.Run(() =>
    {
        records.Sort(Comparer);

        // Dump the sorted batch into a temp file
        var tempPath = Path.GetTempFileName();
        _logger.Debug("Saving temp file: {0}", tempPath);

        var stream = new FileStream(tempPath, FileMode.Create);
        using (var writer = RecordAccessorFactory.CreateWriter(stream))
        {
            foreach (var item in records)
            {
                writer.Write(item);
            }
        }

        return tempPath;
    });
}
/// <summary>
/// Loads the selected records of the given files into memory, sorts them and
/// writes them to the output file.
/// </summary>
/// <param name="inputFiles">the files to sort</param>
/// <param name="outputFile">the output file</param>
private void InMemorySort(IEnumerable<FileInfo> inputFiles, FileInfo outputFile)
{
    _logger.Info("Sorting in memory");

    // Phase 1: load. _header is overwritten for each input file.
    var loaded = new List<T>();
    foreach (var inputFile in inputFiles)
    {
        using (var reader = RecordAccessorFactory.CreateReader(inputFile.OpenRead()))
        {
            _header = reader.ReadHeader(HeaderSize);

            for (var current = ReadRecord(reader); current != null; current = ReadRecord(reader))
            {
                if (Select(current))
                {
                    loaded.Add(current);
                }
            }
        }
    }

    // Phase 2: sort in place.
    loaded.Sort(Comparer);

    // Phase 3: write the most recently read header, then the sorted records.
    using (var writer = GetSumWriter(outputFile))
    {
        writer.WriteHeader(_header);
        foreach (var item in loaded)
        {
            writer.Write(item);
        }
    }
}
/// <summary>
/// Streams the records of the given files to the output file without sorting.
/// Used when no comparer is configured; records are still filtered through
/// <c>Select</c> and written through the sum writer.
/// </summary>
/// <param name="inputFiles">the files to copy</param>
/// <param name="outputFile">the output file</param>
private void Copy(IEnumerable<FileInfo> inputFiles, FileInfo outputFile)
{
    _logger.Info("No comparer: copying records to output using filter and formatter");

    using (var writer = GetSumWriter(outputFile))
    {
        foreach (var inputFile in inputFiles)
        {
            using (var reader = RecordAccessorFactory.CreateReader(inputFile.OpenRead()))
            {
                // The header of each input file is forwarded to the writer.
                _logger.Debug("Writing header");
                var header = reader.ReadHeader(HeaderSize);
                writer.WriteHeader(header);

                for (var current = ReadRecord(reader); current != null; current = ReadRecord(reader))
                {
                    if (!Select(current))
                    {
                        continue;
                    }

                    writer.Write(current);
                }
            }
        }
    }
}
/// <summary>
/// Builds a record writer for the given file that uses <see cref="Sum"/> to sum similar records.
/// The parent directory of the file is created before the file itself.
/// </summary>
/// <param name="file">the file to write to</param>
/// <returns>the sum writer</returns>
private SumWriter<T> GetSumWriter(FileInfo file)
{
    // Make sure the target directory exists before creating the file
    var directory = file.Directory;
    directory.Create();

    var stream = file.Create();
    var recordWriter = RecordAccessorFactory.CreateWriter(stream);
    return new SumWriter<T>(recordWriter, Sum, Comparer, OutputFormatter);
}