/// <summary>
/// Adds the chunk to <c>_listOfCurrentItemsToProcess</c> unless the list already contains it.
/// </summary>
/// <param name="chunkInfoItem">Chunk to register.</param>
private void AddCurrentItemIfNotExist(ChunkInfoItem chunkInfoItem)
{
    // Already registered: nothing to do.
    if (_listOfCurrentItemsToProcess.Contains(chunkInfoItem))
    {
        return;
    }

    _listOfCurrentItemsToProcess.Add(chunkInfoItem);
}
/// <summary>
/// Reads lines from the reader until the end of the stream or until
/// <c>AllowedCountOfFileLinesToSortInMemory</c> lines are buffered, sorts them with
/// <c>_comparer</c> and writes the result out as a new chunk.
/// </summary>
/// <param name="streamReader">Source of the input lines.</param>
/// <returns>The chunk describing the sorted lines that were written.</returns>
/// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
private ChunkInfoItem ReadAndSortInitialChunk(StreamReader streamReader)
{
    // we don't pad number lines because length can be changed later
    if (streamReader == null)
    {
        throw new ArgumentNullException(nameof(streamReader));
    }

    var chunk = new ChunkInfoItem();

    for (string rawLine = streamReader.ReadLine(); rawLine != null; rawLine = streamReader.ReadLine())
    {
        _initialCountOfLines++;
        chunk.Buffer.Add(LinePreProcessingFunc(rawLine));

        // Stop as soon as the in-memory limit is reached; the remaining lines stay in the reader.
        if (chunk.Buffer.Count >= AllowedCountOfFileLinesToSortInMemory)
        {
            break;
        }
    }

    chunk.Buffer.Sort(_comparer);

    // Record the chunk boundaries and size before the buffer is flushed to disk.
    chunk.FirstPart = chunk.Buffer.FirstOrDefault();
    chunk.LastPart = chunk.Buffer.LastOrDefault();
    chunk.CountOfLinesInFile = chunk.Buffer.Count;

    WriteBufferToFile(chunk);
    return chunk;
}
/// <summary>
/// Handles a chunk that has no file yet: sorts its buffer, records its first/last parts
/// and line count, then writes the buffer to disk.
/// </summary>
/// <param name="treeItemToProcess">Chunk whose in-memory buffer should be sorted and written.</param>
private void ProcessNewChunk(ChunkInfoItem treeItemToProcess)
{
    var pendingLines = treeItemToProcess.Buffer;

    // new chunk - sort - write
    pendingLines.Sort(_comparer);

    treeItemToProcess.FirstPart = pendingLines.FirstOrDefault();
    treeItemToProcess.LastPart = pendingLines.LastOrDefault();
    treeItemToProcess.CountOfLinesInFile = pendingLines.Count;

    WriteBufferToFile(treeItemToProcess);
}
/// <summary>
/// Writes the chunk's buffer to two freshly named files in <c>_tempFolderForChunksPath</c>
/// (one line per entry: string parts in one file, number parts in the other), stores the
/// file paths and resulting file lengths on the chunk, and clears the buffer.
/// </summary>
/// <param name="item">Chunk whose buffer is written; its path and length properties are overwritten.</param>
private void WriteBufferToFile(ChunkInfoItem item)
{
    // Each file gets its own GUID-based name inside the temp folder.
    item.StringFilePath = Path.Combine(_tempFolderForChunksPath, Guid.NewGuid().ToString());
    item.NumberFilePath = Path.Combine(_tempFolderForChunksPath, Guid.NewGuid().ToString());

    using (var stringOutput = new StreamWriter(item.StringFilePath))
    using (var numberOutput = new StreamWriter(item.NumberFilePath))
    {
        // Entry i of the buffer becomes line i of both files, keeping them aligned.
        foreach (var entry in item.Buffer)
        {
            stringOutput.WriteLine(entry.StringPart);
            numberOutput.WriteLine(entry.NumberPart);
        }
    }

    // Lengths are read only after both writers are disposed, so the data is flushed.
    var stringFileInfo = new FileInfo(item.StringFilePath);
    item.StringFileLength = stringFileInfo.Length;

    var numberFileInfo = new FileInfo(item.NumberFilePath);
    item.NumberFileLength = numberFileInfo.Length;

    item.Buffer.Clear();
}
/// <summary>
/// Re-sorts two chunks together: loads every line of both chunks from disk, sorts the
/// combined list with <c>_comparer</c>, splits it in half, writes each half as a new chunk,
/// replaces the two old nodes in <c>_linkedlist</c> with the new ones and deletes the old
/// chunk files.
/// </summary>
/// <param name="first">Node of the chunk that holds the lower range.</param>
/// <param name="second">Node of the chunk that holds the upper range.</param>
/// <returns>The newly inserted node that holds the lower half of the merged lines.</returns>
/// <remarks>
/// NOTE(review): the new nodes are inserted before <paramref name="first"/> and after
/// <paramref name="second"/>, so this presumably expects <paramref name="second"/> to
/// directly follow <paramref name="first"/> in the list - confirm against callers.
/// </remarks>
private LinkedListNode<ChunkInfoItem> SortChunks(LinkedListNode<ChunkInfoItem> first, LinkedListNode<ChunkInfoItem> second)
{
    var oldFirstChunk = first.Value;
    var oldSecondChunk = second.Value;
    var parts = new List<StringNumberPart>();

    // Read the first chunk, then the second. Line i of the string file pairs with
    // line i of the number file.
    foreach (var chunk in new[] { oldFirstChunk, oldSecondChunk })
    {
        using (var stringReader = new StreamReader(chunk.StringFilePath))
        using (var numberReader = new StreamReader(chunk.NumberFilePath))
        {
            string stringLine;
            while ((stringLine = stringReader.ReadLine()) != null)
            {
                var numberLine = numberReader.ReadLine();
                parts.Add(new StringNumberPart(stringLine, numberLine));
            }
        }
    }

    parts.Sort(_comparer);

    // Split into two halves; when the count is odd the second half gets the extra line.
    var firstPartOfList = parts.Count / 2;
    var lastPartOfList = parts.Count - firstPartOfList;
    var firstList = parts.GetRange(0, firstPartOfList);
    var lastList = parts.GetRange(firstPartOfList, lastPartOfList);

    var newFirstItem = new ChunkInfoItem
    {
        FirstPart = firstList.FirstOrDefault(),
        LastPart = firstList.LastOrDefault(),
        CountOfLinesInFile = firstList.Count,
        Buffer = firstList,
    };
    WriteBufferToFile(newFirstItem);

    var newLastItem = new ChunkInfoItem
    {
        FirstPart = lastList.FirstOrDefault(),
        LastPart = lastList.LastOrDefault(),
        CountOfLinesInFile = lastList.Count,
        Buffer = lastList,
    };
    WriteBufferToFile(newLastItem);

    // Swap the old nodes for the new ones.
    var result = _linkedlist.AddBefore(first, newFirstItem);
    _linkedlist.AddAfter(second, newLastItem);
    _linkedlist.Remove(first);
    _linkedlist.Remove(second);

    // Both files of both replaced chunks are obsolete now; delete all four so no
    // orphaned temp files are left behind.
    File.Delete(oldFirstChunk.StringFilePath);
    File.Delete(oldFirstChunk.NumberFilePath);
    File.Delete(oldSecondChunk.StringFilePath);
    File.Delete(oldSecondChunk.NumberFilePath);

    Console.WriteLine($"{DateTime.Now}. Sorting existing chunks to ensure correct order.");
    return result;
}
/// <summary>
/// Handles a chunk that already has files on disk and has new lines waiting in its buffer:
/// reads the stored lines back, joins them with the buffered ones, sorts everything with
/// <c>_comparer</c>, splits the result into two new chunks, writes both, and replaces the
/// old node in <c>_linkedlist</c> with the two new ones.
/// </summary>
/// <param name="treeItemToProcess">Existing chunk whose buffer must be merged into its stored lines.</param>
/// <exception cref="InvalidOperationException">The chunk is not present in <c>_linkedlist</c>.</exception>
private void ProcessExistingChunkWithBuffer(ChunkInfoItem treeItemToProcess)
{
    // additional chunk - find node - read old - join lines - sort - split to 2 chunks, write and update linked list
    var existingNode = _linkedlist.Find(treeItemToProcess);
    if (existingNode == null)
    {
        throw new InvalidOperationException("Cannot find mandatory node");
    }

    // Pre-size the list for the stored lines plus the buffered ones.
    var merged = new List<StringNumberPart>((int)treeItemToProcess.CountOfLinesInFile + treeItemToProcess.Buffer.Count);

    using (var storedStrings = new StreamReader(treeItemToProcess.StringFilePath))
    using (var storedNumbers = new StreamReader(treeItemToProcess.NumberFilePath))
    {
        // Line i of the string file pairs with line i of the number file.
        for (string text = storedStrings.ReadLine(); text != null; text = storedStrings.ReadLine())
        {
            string number = storedNumbers.ReadLine();
            merged.Add(new StringNumberPart(text, number));
        }
    }

    merged.AddRange(treeItemToProcess.Buffer);
    treeItemToProcess.Buffer.Clear();
    merged.Sort(_comparer);

    // Split in half; with an odd count the upper half receives the extra line.
    var lowerCount = merged.Count / 2;
    var upperCount = merged.Count - lowerCount;
    var lowerHalf = merged.GetRange(0, lowerCount);
    var upperHalf = merged.GetRange(lowerCount, upperCount);

    var lowerChunk = new ChunkInfoItem();
    lowerChunk.FirstPart = lowerHalf.FirstOrDefault();
    lowerChunk.LastPart = lowerHalf.LastOrDefault();
    lowerChunk.CountOfLinesInFile = lowerHalf.Count;
    lowerChunk.Buffer = lowerHalf;
    WriteBufferToFile(lowerChunk);

    var upperChunk = new ChunkInfoItem();
    upperChunk.FirstPart = upperHalf.FirstOrDefault();
    upperChunk.LastPart = upperHalf.LastOrDefault();
    upperChunk.CountOfLinesInFile = upperHalf.Count;
    upperChunk.Buffer = upperHalf;
    WriteBufferToFile(upperChunk);

    // Put the two new chunks around the old node, then drop the old node and its files.
    _linkedlist.AddBefore(existingNode, lowerChunk);
    _linkedlist.AddAfter(existingNode, upperChunk);
    _linkedlist.Remove(existingNode);

    File.Delete(existingNode.Value.StringFilePath);
    File.Delete(existingNode.Value.NumberFilePath);

    Console.WriteLine($"{DateTime.Now}. Processed buffer and split chunk to two new. Chunks count: {_linkedlist.Count}");
}
/// <summary>
/// Routes a line to the chunk it belongs to. Walks <c>_linkedlist</c> from the first node
/// and adds the line to the buffer of the first chunk whose <c>LastPart</c> is not smaller
/// than the line (per <c>_comparer</c>); if the line is greater than every chunk, a new
/// chunk is appended at the end of the list. The chunk that received the line is registered
/// through <see cref="AddCurrentItemIfNotExist"/>.
/// </summary>
/// <param name="linePart">Pre-processed line to place.</param>
private void ProcessInitialLinePart(StringNumberPart linePart)
{
    var node = _linkedlist.First;

    if (node == null)
    {
        // No chunks exist yet: start the list with a chunk holding just this line,
        // instead of silently dropping the line.
        var firstChunkInfo = new ChunkInfoItem();
        firstChunkInfo.Buffer.Add(linePart);
        firstChunkInfo.FirstPart = linePart;
        firstChunkInfo.LastPart = linePart;
        _linkedlist.AddLast(firstChunkInfo);
        AddCurrentItemIfNotExist(firstChunkInfo);
        return;
    }

    while (node != null)
    {
        var chunkInfoItem = node.Value;
        var compareToFirst = _comparer.Compare(linePart, chunkInfoItem.FirstPart);
        var compareToLast = _comparer.Compare(linePart, chunkInfoItem.LastPart);

        if (compareToLast > 0)
        {
            // The line is beyond this chunk's range.
            if (node.Next != null)
            {
                // check next node
                node = node.Next;
                continue;
            }

            // end of list - last node: the line is greater than everything seen so far
            var newAfterChunkInfo = new ChunkInfoItem();
            newAfterChunkInfo.Buffer.Add(linePart);
            newAfterChunkInfo.FirstPart = linePart;
            newAfterChunkInfo.LastPart = linePart;
            _linkedlist.AddAfter(node, newAfterChunkInfo);
            AddCurrentItemIfNotExist(newAfterChunkInfo);
            return;
        }

        // The line is not greater than LastPart, so it goes into this chunk. A line that
        // sorts before FirstPart is also added here rather than into a new chunk in front:
        // that avoids creating a lot of small chunks with unsorted data (more time to sort
        // and merge) - which is better depends on the input data.
        // This also covers a line equal to both FirstPart and LastPart (a duplicate of a
        // chunk whose bounds are equal), which must not fall through and be lost.
        chunkInfoItem.Buffer.Add(linePart);

        if (compareToFirst <= 0)
        {
            // The line is the new (or an equal) lower bound of the chunk.
            chunkInfoItem.FirstPart = linePart;
        }
        else if (compareToLast == 0)
        {
            // The line equals the upper bound of the chunk.
            chunkInfoItem.LastPart = linePart;
        }

        AddCurrentItemIfNotExist(chunkInfoItem);
        return;
    }
}