Example #1
 private void logCompletionInfo()
 {
     logger.Info("Mapper finished the job in {0}!", mapperWatch.Elapsed);
     logger.Info("Mapper mapped {0} records that sums to {1} chars.", StringFormatter.DigitGrouped(mapperInfo.ProcessedRecords), StringFormatter.HumanReadablePostfixs(mapperInfo.ProcessedChars));
     logger.Info("Mapper emmited {0} pairs.", StringFormatter.DigitGrouped(mapperInfo.MapEmits));
     logger.Info("Mapper spilled {0} records that sums to {1} bytes.", StringFormatter.DigitGrouped(mapperInfo.SpilledRecords), StringFormatter.HumanReadablePostfixs(mapperInfo.SpilledBytes));
 }
Example #2
        public long WriteRecords(IEnumerable <KeyValuePair <InterKey, List <InterValue> > > sorted_pairs)
        {
            Stopwatch watch = new Stopwatch();

            watch.Start();
            long written_bytes = 0;


            foreach (var pair in sorted_pairs)
            {
                var record_bytes = IntermediateRecord <InterKey, InterValue> .GetIntermediateRecordBytes(pair.Key, pair.Value);

                foreach (var bytes in record_bytes)
                {
                    fileStream.Write(bytes, 0, bytes.Length);
                    written_bytes += bytes.Length;
                }
            }

            watch.Stop();
            logger.Debug("Spilled {0} records summing to {2} bytes to disk in {1}.", StringFormatter.DigitGrouped(sorted_pairs.Count()), watch.Elapsed, StringFormatter.HumanReadablePostfixs(written_bytes));

            //if (written_bytes > int.MaxValue)
            //    throw new InvalidCastException("The intermediate file is very huge!");
            return(written_bytes);
        }
Example #3
        private void consumeInput(int thread_num = 0)
        {
            while (!inputQ.IsCompleted)
            {
                var chunk = inputQ.Take();
                var dics  = doMap(chunk, thread_num);
                foreach (var dic in dics)
                {
                    dicsQ.Add(dic);
                }
            }

            dicsQ.CompleteAdding();
            logger.Info("Mapper processed {0} records that sums to {1} chars.", StringFormatter.DigitGrouped(mapperInfo.ProcessedRecords), StringFormatter.HumanReadablePostfixs(mapperInfo.ProcessedChars));
        }
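
The loop above drains a BlockingCollection<T> by checking IsCompleted before each Take. Below is a minimal, self-contained sketch of the same producer/consumer hand-off using the standard GetConsumingEnumerable idiom, which also sidesteps the narrow window where the queue is completed between the IsCompleted check and the Take call (Take would then throw). The int payload is a stand-in for the project's InputTextCunk chunks, not project code.

    using System;
    using System.Collections.Concurrent;
    using System.Threading.Tasks;

    class ProducerConsumerSketch
    {
        static void Main()
        {
            var queue = new BlockingCollection<int>(boundedCapacity: 8);

            // Producer: add items, then signal that no more will arrive.
            var producer = Task.Run(() =>
            {
                for (int i = 0; i < 100; i++)
                    queue.Add(i);
                queue.CompleteAdding();
            });

            // Consumer: GetConsumingEnumerable blocks until an item is available
            // and ends cleanly once the collection is completed and drained.
            var consumer = Task.Run(() =>
            {
                foreach (var item in queue.GetConsumingEnumerable())
                    Console.WriteLine("consumed {0}", item);
            });

            Task.WaitAll(producer, consumer);
        }
    }
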
Example #4
        /// <summary>
        /// Spills the in-memory store to the disk if needed.
        /// The spill condition depends on the combine store's content volume and the maximum number of intermediate pairs to spill.
        /// </summary>
        /// <param name="final_spill">is this the final spill? If so, data must be spilled even if there is only a little data.</param>
        /// <param name="thread_num">number of threads to use for sorting. The default is to use all cores.</param>
        /// <returns>a string containing the path of the resulting file.</returns>
        public string doSpillIfNeeded(bool final_spill = false, int thread_num = -1)
        {
            if (combinedDictionary.Count > 0 && (intermediatePairCount + combinedDictionary.Count > maxIntermediatePairsToSpill || final_spill))
            {
                Stopwatch watch = new Stopwatch();
                watch.Restart();
                KeyValuePair <InterKey, List <InterValue> >[] sorted_pairs;
                if (thread_num <= 0)
                {
                    sorted_pairs = combinedDictionary.AsParallel().OrderBy(t => t.Key).ToArray();
                }
                else
                {
                    sorted_pairs = combinedDictionary.AsParallel().WithDegreeOfParallelism(thread_num).OrderBy(t => t.Key).ToArray();
                }
                combinedDictionary.Clear();
                intermediatePairCount = 0;

                mapperInfo.SpilledRecords += sorted_pairs.Count();
                watch.Stop();
                logger.Debug("Sorted {0} records in {1}.", StringFormatter.DigitGrouped(sorted_pairs.Count()), watch.Elapsed);
                IntermediateFile <InterKey, InterValue> inter_file = new IntermediateFile <InterKey, InterValue>(tempDirectory, mapperID);
                long written_bytes = 0;
                written_bytes            = inter_file.WriteRecords(sorted_pairs);
                mapperInfo.SpilledBytes += written_bytes;
                inter_file.Close();

                if (!final_spill && written_bytes > 0)
                {
                    if (written_bytes < maxIntermediateFileSize)
                    {
                        maxIntermediatePairsToSpill = (int)(maxIntermediatePairsToSpill * (double)(maxIntermediateFileSize) / written_bytes);
                        logger.Debug("maxIntermediatePairsToSpill was set to {0} records.", StringFormatter.DigitGrouped(maxIntermediatePairsToSpill));
                    }
                    if (written_bytes > 1.5 * maxIntermediateFileSize)
                    {
                        maxIntermediatePairsToSpill /= 2;
                        logger.Debug("maxIntermediatePairsToSpill was set to {0} records.", StringFormatter.DigitGrouped(maxIntermediatePairsToSpill));
                    }
                }

                return(inter_file.Path);
            }
            return(null);
        }
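
The tail of doSpillIfNeeded tunes maxIntermediatePairsToSpill so the next spill file lands near maxIntermediateFileSize. The standalone helper below is only an illustration of that feedback rule; the names mirror the fields above but it is not project code.

    // Illustration of the spill-threshold feedback used above:
    // scale the pair budget by the target/actual size ratio when the file
    // came out too small, and halve it when the file overshot by 50%.
    static int AdjustSpillThreshold(int currentThreshold, long writtenBytes, long targetFileSize)
    {
        if (writtenBytes <= 0)
            return currentThreshold;

        if (writtenBytes < targetFileSize)
            currentThreshold = (int)(currentThreshold * (double)targetFileSize / writtenBytes);

        if (writtenBytes > 1.5 * targetFileSize)
            currentThreshold /= 2;

        return currentThreshold;
    }

    // Example: a 64 MB target and a 16 MB spill quadruple the threshold:
    // AdjustSpillThreshold(1000000, 16L << 20, 64L << 20) == 4000000
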
Example #5
        private IEnumerable <Dictionary <InterKey, List <InterValue> > > doMap(InputTextCunk chunk, int thread_num = 0)
        {
            Stopwatch watch         = new Stopwatch();
            var       input_records = chunk.Records;
            var       char_count    = chunk.CharCount;

            watch.Restart();
            var             dics   = new ConcurrentBag <Dictionary <InterKey, List <InterValue> > >();
            ParallelOptions option = new ParallelOptions();

            if (thread_num != 0)
            {
                option.MaxDegreeOfParallelism = thread_num;
            }

            Parallel.ForEach(Partitioner.Create(0, input_records.Count), option, (range) =>
            {
                var dic     = new Dictionary <InterKey, List <InterValue> >();
                var context = new MapContext <InterKey, InterValue>(dic);
                for (int i = range.Item1; i < range.Item2; i++)
                {
                    mapFunc.Invoke(input_records[i], context);
                }
                dics.Add(dic);
                Interlocked.Add(ref mapperInfo.MapEmits, context.EmitCount);
            });
            watch.Stop();

            mapperInfo.ProcessedRecords += input_records.Count;
            mapperInfo.ProcessedChars   += char_count;

            if (watch.Elapsed > maxWorkPeriod)
            {
                maxChunkSize = Math.Min(maxChunkSize / 2, maxCharsToMap);
            }
            if (watch.Elapsed < minWorkPeriod)
            {
                maxChunkSize = Math.Min(maxChunkSize * 2, maxCharsToMap);
            }

            logger.Debug("Mapped a chunk with {0} chars in {1}", StringFormatter.DigitGrouped(char_count), watch.Elapsed);
            return(dics);
        }
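
doMap partitions the record index range, gives each partition its own Dictionary so the inner map loop needs no locking, and only collects the partial dictionaries afterwards. Here is a minimal runnable sketch of that pattern with standard .NET types; the word-count logic is a stand-in for the project's mapFunc.

    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Threading;
    using System.Threading.Tasks;

    class RangePartitionMapSketch
    {
        static void Main()
        {
            var records = new List<string> { "a b", "b c", "a a", "c" };
            var partials = new ConcurrentBag<Dictionary<string, int>>();
            long emitCount = 0;

            // One dictionary per range partition: threads never share a dictionary,
            // so the inner loop runs without synchronization.
            Parallel.ForEach(Partitioner.Create(0, records.Count), range =>
            {
                var local = new Dictionary<string, int>();
                for (int i = range.Item1; i < range.Item2; i++)
                {
                    foreach (var word in records[i].Split(' '))
                    {
                        local.TryGetValue(word, out int n);
                        local[word] = n + 1;
                        Interlocked.Increment(ref emitCount);
                    }
                }
                partials.Add(local);
            });

            Console.WriteLine("partitions: {0}, emits: {1}", partials.Count, emitCount);
        }
    }
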
Example #6
        /// <summary>
        /// Performs the reduce phase.
        /// </summary>
        public void Reduce()
        {
            if (reduceFunc == null)
            {
                throw new InvalidOperationException("Reduce function is not defined!");
            }

            isRunning = true;
            logger.Info("Reducing final file: {0}", inputPath);

            var reader         = new ReduceInputReader <InterKey, InterVal>(inputPath, bufferSize);
            var reduce_context = new ReduceContext(outputStream);

            while (!reader.IsFinished)
            {
                var reduce_object = reader.GetNextReduceObject();
                reduceFunc.Invoke(reduce_object, reduce_context);
            }
            logger.Info("Reducer emitted {0} records.", StringFormatter.DigitGrouped(reduce_context.EmitCount));
            logger.Info("Reducer output: {0}.", outputFileName);
            outputStream.Close();
            isRunning = false;
        }
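
ReduceInputReader is not shown here, but the loop above assumes the merged input presents each key's values as one reduce object in key order. The sketch below shows that general pattern with standard types only: group consecutive equal keys of an already-sorted stream and invoke a reduce callback once per key. The delegate and return shapes are illustrative assumptions, not the project's API.

    using System;
    using System.Collections.Generic;

    static class SortedReduceSketch
    {
        // Groups consecutive equal keys of a sorted (key, value) stream and
        // applies the reducer once per key, mirroring the shape of the Reduce loop.
        public static IEnumerable<TOut> ReduceSorted<TKey, TValue, TOut>(
            IEnumerable<KeyValuePair<TKey, TValue>> sortedPairs,
            Func<TKey, List<TValue>, TOut> reducer) where TKey : IEquatable<TKey>
        {
            TKey currentKey = default(TKey);
            List<TValue> bucket = null;

            foreach (var pair in sortedPairs)
            {
                if (bucket != null && !currentKey.Equals(pair.Key))
                {
                    yield return reducer(currentKey, bucket);   // key changed: emit the finished group
                    bucket = null;
                }
                if (bucket == null)
                    bucket = new List<TValue>();
                currentKey = pair.Key;
                bucket.Add(pair.Value);
            }
            if (bucket != null)
                yield return reducer(currentKey, bucket);       // flush the last group
        }
    }

    // Usage: sum the values per word over a sorted stream.
    // SortedReduceSketch.ReduceSorted(pairs, (k, vs) => new KeyValuePair<string, int>(k, vs.Sum()))
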
Example #7
        /// <summary>
        /// Runs the mapper in a sequential flow: first read a chunk, map it, and then add it to the combine store.
        /// The whole process is done step by step; each step internally uses parallelism, but the steps themselves run in order.
        /// </summary>
        /// <param name="thread_num">number of threads to be used</param>
        public void SequentialRun(int thread_num = 0)
        {
            isRunning = true;
            init();
            mapperWatch.Start();
            while (true)
            {
                InputTextCunk input_chunk;
                int           char_count = 0;

                char_count = reader.ReadChunk(out input_chunk, maxChunkSize);
                if (char_count == 0)
                {
                    break;
                }

                logger.Info("File percentage consumed: {3}%.  Read a chunk: {0} records and {1} chars. InputQ count is {2}", StringFormatter.DigitGrouped(input_chunk.CharCount), StringFormatter.HumanReadablePostfixs(char_count), inputQ.Count, (100 * reader.Position) / reader.Length);
                var dics = doMap(input_chunk, thread_num);
                foreach (var dic in dics)
                {
                    combineStore.Add(dic);
                    combineStore.doSpillIfNeeded(false, thread_num);
                }
            }
            combineStore.doSpillIfNeeded(true, thread_num);
            mapperWatch.Stop();
            logCompletionInfo();
            isRunning = false;
        }
Example #8
 private void readInput()
 {
     while (true)
     {
         InputTextCunk input_chunk;
         int           char_count = 0;
         lock (diskLock)
         {
             char_count = reader.ReadChunk(out input_chunk, maxChunkSize);
         }
         if (char_count == 0)
         {
             break;
         }
         inputQ.Add(input_chunk);
         logger.Debug("Read a chunk: {0} records and {1} chars. InputQ count is {2}", StringFormatter.DigitGrouped(input_chunk.CharCount), StringFormatter.HumanReadablePostfixs(char_count), inputQ.Count);
     }
     inputQ.CompleteAdding();
 }
        public string Merge(bool keep_files = false)
        {
            int       memory_per_file = maxMemory / (concurrentFilesCount + 2);
            var       fileQ           = new Queue <string>(files);
            Stopwatch watch           = new Stopwatch();
            long      total_records   = 0;

            while (fileQ.Count > 1)
            {
                watch.Restart();
                var destination_file = new IntermediateFile <InterKey, InterVal>(directory, ID, 2 * memory_per_file);
                var dest             = destination_file.FileStream;

                var current_streams = new List <FileStream>();
                for (int i = 0; i < concurrentFilesCount && fileQ.Count > 0; i++)
                {
                    current_streams.Add(new FileStream(fileQ.Dequeue(), FileMode.Open, FileAccess.Read, FileShare.Read, memory_per_file));
                }

                PriorityQueue <InterKey, Stream> priorityQ = new PriorityQueue <InterKey, Stream>();

                var stream_len = new Dictionary <Stream, long>();

                foreach (var stream in current_streams)
                {
                    stream_len[stream] = stream.Length;
                    if (stream_len[stream] < sizeof(int))
                    {
                        throw new IOException("Malformed intermediate file: The file is too small!");
                    }
                    var key = IntermediateRecord <InterKey, InterVal> .ReadKey(stream);

                    priorityQ.Enqueue(key, stream);
                }

                logger.Debug("Merging {0} files summing to {1} bytes", current_streams.Count, StringFormatter.HumanReadablePostfixs(stream_len.Values.Sum()));

                var  last_key   = priorityQ.Peek().Key;
                bool first_time = true;
                while (priorityQ.Count > 0)
                {
                    total_records++;
                    var best = priorityQ.Dequeue();

                    if (!first_time)
                    {
                        if (last_key.Equals(best.Key))
                        {
                            dest.WriteByte(1);
                        }
                        else
                        {
                            dest.WriteByte(0);
                        }
                    }
                    last_key   = best.Key;
                    first_time = false;

                    destination_file.WriteKey(best.Key);
                    var current_stream = best.Value;
                    var len            = IntermediateRecord <InterKey, InterVal> .ReadValueListLength(current_stream);

                    dest.Write(BitConverter.GetBytes(len), 0, sizeof(int));
                    StreamUtils.Copy(current_stream, dest, len - sizeof(byte));
                    current_stream.ReadByte();

                    if (best.Value.Position >= stream_len[current_stream])
                    {
                        continue;
                    }
                    var new_key = IntermediateRecord <InterKey, InterVal> .ReadKey(current_stream);

                    priorityQ.Enqueue(new_key, current_stream);
                }
                dest.WriteByte(0);
                dest.Close();
                fileQ.Enqueue(destination_file.Path);
                foreach (var stream in current_streams)
                {
                    stream.Close();
                    File.Delete(stream.Name);
                }
                watch.Stop();
                logger.Debug("Merged {0} records to {1} in {2}.", StringFormatter.DigitGrouped(total_records), destination_file.Path, watch.Elapsed);
            }

            return(fileQ.First());
        }
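
Merge performs a k-way merge: each open stream contributes its smallest unread key to a priority queue, the winning record is copied to the destination, and that stream re-enters the queue with its next key. The runnable sketch below shows the same technique over plain sorted lists using .NET 6's System.Collections.Generic.PriorityQueue; the project's PriorityQueue<InterKey, Stream> is its own type, so this only illustrates the merge loop.

    using System;
    using System.Collections.Generic;

    class KWayMergeSketch
    {
        // Merges k individually sorted lists into one sorted sequence.
        static IEnumerable<int> Merge(List<List<int>> sortedLists)
        {
            // Priority = the next unread value; element = (list index, position).
            var pq = new PriorityQueue<(int list, int pos), int>();
            for (int i = 0; i < sortedLists.Count; i++)
                if (sortedLists[i].Count > 0)
                    pq.Enqueue((i, 0), sortedLists[i][0]);

            while (pq.TryDequeue(out var cursor, out int value))
            {
                yield return value;                             // emit the smallest head
                int next = cursor.pos + 1;
                if (next < sortedLists[cursor.list].Count)      // re-enqueue with the list's next value
                    pq.Enqueue((cursor.list, next), sortedLists[cursor.list][next]);
            }
        }

        static void Main()
        {
            var lists = new List<List<int>>
            {
                new List<int> { 1, 4, 9 },
                new List<int> { 2, 3, 10 },
                new List<int> { 5, 6, 7 },
            };
            Console.WriteLine(string.Join(", ", Merge(lists))); // 1, 2, 3, 4, 5, 6, 7, 9, 10
        }
    }
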