private void logCompletionInfo()
{
    logger.Info("Mapper finished the job in {0}!", mapperWatch.Elapsed);
    logger.Info("Mapper mapped {0} records summing to {1} chars.",
        StringFormatter.DigitGrouped(mapperInfo.ProcessedRecords),
        StringFormatter.HumanReadablePostfixs(mapperInfo.ProcessedChars));
    logger.Info("Mapper emitted {0} pairs.", StringFormatter.DigitGrouped(mapperInfo.MapEmits));
    logger.Info("Mapper spilled {0} records summing to {1} bytes.",
        StringFormatter.DigitGrouped(mapperInfo.SpilledRecords),
        StringFormatter.HumanReadablePostfixs(mapperInfo.SpilledBytes));
}
public long WriteRecords(IEnumerable<KeyValuePair<InterKey, List<InterValue>>> sorted_pairs)
{
    Stopwatch watch = new Stopwatch();
    watch.Start();

    long written_bytes = 0;
    foreach (var pair in sorted_pairs)
    {
        var record_bytes = IntermediateRecord<InterKey, InterValue>.GetIntermediateRecordBytes(pair.Key, pair.Value);
        foreach (var bytes in record_bytes)
        {
            fileStream.Write(bytes, 0, bytes.Length);
            written_bytes += bytes.Length;
        }
    }

    watch.Stop();
    logger.Debug("Spilled {0} records summing to {1} bytes to disk in {2}.",
        StringFormatter.DigitGrouped(sorted_pairs.Count()),
        StringFormatter.HumanReadablePostfixs(written_bytes),
        watch.Elapsed);

    //if (written_bytes > int.MaxValue)
    //    throw new InvalidCastException("The intermediate file is very huge!");

    return written_bytes;
}
private void consumeInput(int thread_num = 0)
{
    // GetConsumingEnumerable avoids the race between checking IsCompleted and calling Take(),
    // which can throw if the producer completes the queue in between.
    foreach (var chunk in inputQ.GetConsumingEnumerable())
    {
        var dics = doMap(chunk, thread_num);
        foreach (var dic in dics)
        {
            dicsQ.Add(dic);
        }
    }

    dicsQ.CompleteAdding();
    logger.Info("Mapper processed {0} records summing to {1} chars.",
        StringFormatter.DigitGrouped(mapperInfo.ProcessedRecords),
        StringFormatter.HumanReadablePostfixs(mapperInfo.ProcessedChars));
}
/// <summary>
/// Spills the in-memory store to disk if needed.
/// The spill condition depends on the combine store's content volume and the maximum number of intermediate pairs to spill.
/// </summary>
/// <param name="final_spill">whether this is the final spill; if so, the data is spilled even when only a little is buffered</param>
/// <param name="thread_num">number of threads to use for sorting. The default is to use all cores.</param>
/// <returns>the path of the resulting file, or null if nothing was spilled.</returns>
public string doSpillIfNeeded(bool final_spill = false, int thread_num = -1)
{
    if (combinedDictionary.Count > 0 &&
        (intermediatePairCount + combinedDictionary.Count > maxIntermediatePairsToSpill || final_spill))
    {
        Stopwatch watch = new Stopwatch();
        watch.Restart();

        // Sort the combined pairs by key, optionally limiting the degree of parallelism.
        KeyValuePair<InterKey, List<InterValue>>[] sorted_pairs;
        if (thread_num <= 0)
        {
            sorted_pairs = combinedDictionary.AsParallel().OrderBy(t => t.Key).ToArray();
        }
        else
        {
            sorted_pairs = combinedDictionary.AsParallel().WithDegreeOfParallelism(thread_num).OrderBy(t => t.Key).ToArray();
        }
        combinedDictionary.Clear();
        intermediatePairCount = 0;
        mapperInfo.SpilledRecords += sorted_pairs.Length;

        watch.Stop();
        logger.Debug("Sorted {0} records in {1}.", StringFormatter.DigitGrouped(sorted_pairs.Length), watch.Elapsed);

        // Write the sorted run to a new intermediate file.
        IntermediateFile<InterKey, InterValue> inter_file = new IntermediateFile<InterKey, InterValue>(tempDirectory, mapperID);
        long written_bytes = inter_file.WriteRecords(sorted_pairs);
        mapperInfo.SpilledBytes += written_bytes;
        inter_file.Close();

        // Adapt the spill threshold so that intermediate files stay close to maxIntermediateFileSize.
        if (!final_spill && written_bytes > 0)
        {
            if (written_bytes < maxIntermediateFileSize)
            {
                maxIntermediatePairsToSpill = (int)(maxIntermediatePairsToSpill * (double)maxIntermediateFileSize / written_bytes);
                logger.Debug("maxIntermediatePairsToSpill was set to {0} records.", StringFormatter.DigitGrouped(maxIntermediatePairsToSpill));
            }
            if (written_bytes > 1.5 * maxIntermediateFileSize)
            {
                maxIntermediatePairsToSpill /= 2;
                logger.Debug("maxIntermediatePairsToSpill was set to {0} records.", StringFormatter.DigitGrouped(maxIntermediatePairsToSpill));
            }
        }

        return inter_file.Path;
    }

    return null;
}
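// Illustrative numbers only (not from the source): how the adaptive threshold in doSpillIfNeeded behaves.
// Suppose maxIntermediateFileSize = 64 MB and maxIntermediatePairsToSpill = 1,000,000.
//   * A spill of 1,000,000 pairs that writes only 16 MB scales the threshold by 64/16,
//     so the next spill waits for roughly 4,000,000 pairs.
//   * A spill that writes 128 MB (> 1.5 * 64 MB) halves the threshold to 500,000 pairs.
// Either way, later spills converge toward intermediate files of about maxIntermediateFileSize bytes.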
private IEnumerable<Dictionary<InterKey, List<InterValue>>> doMap(InputTextCunk chunk, int thread_num = 0)
{
    Stopwatch watch = new Stopwatch();
    var input_records = chunk.Records;
    var char_count = chunk.CharCount;
    watch.Restart();

    // Each partition of the chunk gets its own dictionary, so no locking is needed inside the map calls.
    var dics = new ConcurrentBag<Dictionary<InterKey, List<InterValue>>>();
    ParallelOptions option = new ParallelOptions();
    if (thread_num != 0)
    {
        option.MaxDegreeOfParallelism = thread_num;
    }

    Parallel.ForEach(Partitioner.Create(0, input_records.Count), option, (range) =>
    {
        var dic = new Dictionary<InterKey, List<InterValue>>();
        var context = new MapContext<InterKey, InterValue>(dic);
        for (int i = range.Item1; i < range.Item2; i++)
        {
            mapFunc.Invoke(input_records[i], context);
        }
        dics.Add(dic);
        Interlocked.Add(ref mapperInfo.MapEmits, context.EmitCount);
    });

    watch.Stop();
    mapperInfo.ProcessedRecords += input_records.Count;
    mapperInfo.ProcessedChars += char_count;

    // Adapt the chunk size so that mapping a chunk stays between minWorkPeriod and maxWorkPeriod.
    if (watch.Elapsed > maxWorkPeriod)
    {
        maxChunkSize = Math.Min(maxChunkSize / 2, maxCharsToMap);
    }
    if (watch.Elapsed < minWorkPeriod)
    {
        maxChunkSize = Math.Min(maxChunkSize * 2, maxCharsToMap);
    }

    logger.Debug("Mapped a chunk with {0} chars in {1}.", StringFormatter.DigitGrouped(char_count), watch.Elapsed);
    return dics;
}
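// Hypothetical sketch (not from the source): what a user-supplied map function invoked by
// doMap above might look like for a word-count job. It assumes MapContext<InterKey, InterValue>
// exposes an Emit(key, value) method (its EmitCount counter suggests one exists) and that
// mapFunc.Invoke(record, context) dispatches to a method with this shape. The class and
// method names below are illustrative, not the framework's actual API.
public class WordCountMap
{
    public void Map(string record, MapContext<string, int> context)
    {
        foreach (var word in record.Split(' ', '\t'))
        {
            if (word.Length > 0)
            {
                context.Emit(word, 1);   // one intermediate pair per word occurrence
            }
        }
    }
}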
/// <summary>
/// Performs the reduce phase.
/// </summary>
public void Reduce()
{
    if (reduceFunc == null)
    {
        throw new InvalidOperationException("Reduce function is not defined!");
    }

    isRunning = true;
    logger.Info("Reducing final file: {0}", inputPath);

    var reader = new ReduceInputReader<InterKey, InterVal>(inputPath, bufferSize);
    var reduce_context = new ReduceContext(outputStream);
    while (!reader.IsFinished)
    {
        var reduce_object = reader.GetNextReduceObject();
        reduceFunc.Invoke(reduce_object, reduce_context);
    }

    logger.Info("Reducer emitted {0} records.", StringFormatter.DigitGrouped(reduce_context.EmitCount));
    logger.Info("Reducer output: {0}.", outputFileName);
    outputStream.Close();
    isRunning = false;
}
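// Hypothetical sketch (not from the source): a user-supplied reduce function matching the
// reduceFunc.Invoke(reduce_object, reduce_context) call in Reduce() above, again for a
// word-count job. It assumes the reduce object exposes the key and its grouped values and
// that ReduceContext has an Emit method (its EmitCount counter suggests one); the type and
// member names below are illustrative, not the framework's actual API.
public class WordCountReduce
{
    public void Reduce(ReduceObject<string, int> input, ReduceContext context)
    {
        int sum = 0;
        foreach (var count in input.Values)   // all intermediate values grouped under one key
        {
            sum += count;
        }
        context.Emit(input.Key, sum);         // one output record per distinct word
    }
}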
/// <summary>
/// Runs the mapper in a sequential flow, i.e. read a chunk, map it, and then add the result to the combine store.
/// The steps run in order, but each step may still use parallelism internally.
/// </summary>
/// <param name="thread_num">number of threads to be used</param>
public void SequentialRun(int thread_num = 0)
{
    isRunning = true;
    init();
    mapperWatch.Start();

    while (true)
    {
        InputTextCunk input_chunk;
        int char_count = reader.ReadChunk(out input_chunk, maxChunkSize);
        if (char_count == 0)
        {
            break;
        }

        logger.Info("File percentage consumed: {0}%. Read a chunk: {1} records and {2} chars. InputQ count is {3}.",
            (100 * reader.Position) / reader.Length,
            StringFormatter.DigitGrouped(input_chunk.Records.Count),
            StringFormatter.HumanReadablePostfixs(char_count),
            inputQ.Count);

        var dics = doMap(input_chunk, thread_num);
        foreach (var dic in dics)
        {
            combineStore.Add(dic);
            combineStore.doSpillIfNeeded(false, thread_num);
        }
    }

    // Flush whatever remains in the combine store, even if it is below the spill threshold.
    combineStore.doSpillIfNeeded(true, thread_num);
    mapperWatch.Stop();
    logCompletionInfo();
    isRunning = false;
}
private void readInput()
{
    while (true)
    {
        InputTextCunk input_chunk;
        int char_count;
        lock (diskLock)
        {
            char_count = reader.ReadChunk(out input_chunk, maxChunkSize);
        }
        if (char_count == 0)
        {
            break;
        }

        inputQ.Add(input_chunk);
        logger.Debug("Read a chunk: {0} records and {1} chars. InputQ count is {2}.",
            StringFormatter.DigitGrouped(input_chunk.Records.Count),
            StringFormatter.HumanReadablePostfixs(char_count),
            inputQ.Count);
    }

    inputQ.CompleteAdding();
}
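// Hypothetical sketch (not from the source): how a pipelined run might wire readInput and
// consumeInput together as a producer/consumer pair over inputQ, in contrast to SequentialRun
// above. The method that would host these lines is assumed, not shown in the excerpt; only
// readInput, consumeInput and inputQ come from the code above.
//
//     var producer = Task.Run(() => readInput());                 // fills inputQ, then calls CompleteAdding()
//     var consumer = Task.Run(() => consumeInput(thread_num));    // drains inputQ until it is completed
//     Task.WaitAll(producer, consumer);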
public string Merge(bool keep_files = false)
{
    int memory_per_file = maxMemory / (concurrentFilesCount + 2);
    var fileQ = new Queue<string>(files);
    Stopwatch watch = new Stopwatch();

    // Repeatedly merge up to concurrentFilesCount files into one until a single file remains.
    while (fileQ.Count > 1)
    {
        watch.Restart();
        long total_records = 0;

        var destination_file = new IntermediateFile<InterKey, InterVal>(directory, ID, 2 * memory_per_file);
        var dest = destination_file.FileStream;

        var current_streams = new List<FileStream>();
        for (int i = 0; i < concurrentFilesCount && fileQ.Count > 0; i++)
        {
            current_streams.Add(new FileStream(fileQ.Dequeue(), FileMode.Open, FileAccess.Read, FileShare.Read, memory_per_file));
        }

        // Seed the priority queue with the first key of each input stream.
        PriorityQueue<InterKey, Stream> priorityQ = new PriorityQueue<InterKey, Stream>();
        var stream_len = new Dictionary<Stream, long>();
        foreach (var stream in current_streams)
        {
            stream_len[stream] = stream.Length;
            if (stream_len[stream] < sizeof(int))
            {
                throw new IOException("Malformed intermediate file: the file is too small!");
            }
            var key = IntermediateRecord<InterKey, InterVal>.ReadKey(stream);
            priorityQ.Enqueue(key, stream);
        }

        logger.Debug("Merging {0} files summing to {1} bytes.",
            current_streams.Count, StringFormatter.HumanReadablePostfixs(stream_len.Values.Sum()));

        var last_key = priorityQ.Peek().Key;
        bool first_time = true;
        while (priorityQ.Count > 0)
        {
            total_records++;
            var best = priorityQ.Dequeue();

            // A flag byte before each record (except the first) marks whether its key equals the previous key.
            if (!first_time)
            {
                if (last_key.Equals(best.Key))
                {
                    dest.WriteByte(1);
                }
                else
                {
                    dest.WriteByte(0);
                }
            }
            last_key = best.Key;
            first_time = false;

            destination_file.WriteKey(best.Key);
            var current_stream = best.Value;
            var len = IntermediateRecord<InterKey, InterVal>.ReadValueListLength(current_stream);
            dest.Write(BitConverter.GetBytes(len), 0, sizeof(int));
            StreamUtils.Copy(current_stream, dest, len - sizeof(byte));
            current_stream.ReadByte();   // consume the flag byte from the source stream

            if (best.Value.Position >= stream_len[current_stream])
            {
                continue;   // this stream is exhausted
            }
            var new_key = IntermediateRecord<InterKey, InterVal>.ReadKey(current_stream);
            priorityQ.Enqueue(new_key, current_stream);
        }

        dest.WriteByte(0);
        dest.Close();
        fileQ.Enqueue(destination_file.Path);

        foreach (var stream in current_streams)
        {
            stream.Close();
            if (!keep_files)
            {
                File.Delete(stream.Name);
            }
        }

        watch.Stop();
        logger.Debug("Merged {0} records to {1} in {2}.",
            StringFormatter.DigitGrouped(total_records), destination_file.Path, watch.Elapsed);
    }

    return fileQ.First();
}
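// Hypothetical end-to-end sketch (not from the source): how the pieces in this excerpt might be
// chained together. Only SequentialRun, Merge and Reduce come from the code above; the variable
// names, constructors and the Merger/Reducer type names are assumptions about the framework.
//
//     mapper.SequentialRun();                   // map + combine + spill sorted runs to temp files
//     var merger = new Merger<string, int>(tempDirectory, spillFilePaths);   // assumed ctor
//     string merged_path = merger.Merge();      // k-way merge of the sorted spill files
//     var reducer = new Reducer<string, int>(merged_path, outputPath);       // assumed ctor
//     reducer.Reduce();                         // group values per key and write the final output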