Пример #1
0
        /// <summary>
        /// Creates the test input, fills in <paramref name="conf"/> for a job with one map
        /// task and one reduce task, runs the job and hands the result to
        /// <c>ValidateOutput</c>.
        /// </summary>
        /// <param name="conf">Job configuration that is populated and then submitted.</param>
        /// <param name="mapperBadRecords">Bad records for the map side; their count is added to the map attempt budget.</param>
        /// <param name="redBadRecords">Bad records for the reduce side; their count is added to the reduce attempt budget.</param>
        /// <exception cref="System.Exception"/>
        private void RunMapReduce(JobConf conf, IList<string> mapperBadRecords, IList<string> redBadRecords)
        {
            CreateInput();

            // Job shape: a single mapper and a single reducer with a 30 * 1000 task timeout.
            conf.SetJobName("mr");
            conf.SetNumMapTasks(1);
            conf.SetNumReduceTasks(1);
            conf.SetInt(JobContext.TaskTimeout, 30 * 1000);

            // Skip limits are set to the maximum and skipping starts after 0 attempts.
            SkipBadRecords.SetMapperMaxSkipRecords(conf, long.MaxValue);
            SkipBadRecords.SetReducerMaxSkipGroups(conf, long.MaxValue);
            SkipBadRecords.SetAttemptsToStartSkipping(conf, 0);

            // The number of attempts needed to complete a task successfully
            // depends on the number of bad records it has to get past.
            int maxMapAttempts = SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 + mapperBadRecords.Count;
            conf.SetMaxMapAttempts(maxMapAttempts);
            int maxReduceAttempts = SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 + redBadRecords.Count;
            conf.SetMaxReduceAttempts(maxReduceAttempts);

            // Input and output locations.
            FileInputFormat.SetInputPaths(conf, GetInputDir());
            FileOutputFormat.SetOutputPath(conf, GetOutputDir());

            // Formats and key/value types for the map output and the final output.
            conf.SetInputFormat(typeof(TextInputFormat));
            conf.SetMapOutputKeyClass(typeof(LongWritable));
            conf.SetMapOutputValueClass(typeof(Text));
            conf.SetOutputFormat(typeof(TextOutputFormat));
            conf.SetOutputKeyClass(typeof(LongWritable));
            conf.SetOutputValueClass(typeof(Text));

            RunningJob finishedJob = JobClient.RunJob(conf);
            ValidateOutput(conf, finishedJob, mapperBadRecords, redBadRecords);
        }
Пример #2
0
 /// <summary>
 /// Instantiates the mapper class named by the job and decides whether the
 /// processed-record counter should be incremented.
 /// </summary>
 /// <param name="job">Job configuration supplying the mapper class and the skip settings.</param>
 public virtual void Configure(JobConf job)
 {
     mapper = ReflectionUtils.NewInstance(job.GetMapperClass(), job);

     // The processed counter is only incremented when the skipping feature is
     // enabled (max skip records > 0) and auto-increment is turned on.
     bool skippingEnabled = SkipBadRecords.GetMapperMaxSkipRecords(job) > 0;
     incrProcCount = skippingEnabled && SkipBadRecords.GetAutoIncrMapperProcCount(job);
 }
Пример #3
0
        /// <summary>
        /// Asserts the values returned by the <c>SkipBadRecords</c> getters on a fresh
        /// configuration, then applies each setter and asserts that the matching getter
        /// returns the value that was set.
        /// </summary>
        public virtual void TestSkipBadRecords()
        {
            Configuration conf = new Configuration();

            // --- default values ---
            int attemptsToStart = SkipBadRecords.GetAttemptsToStartSkipping(conf);
            NUnit.Framework.Assert.AreEqual(2, attemptsToStart);

            bool mapperAutoIncr = SkipBadRecords.GetAutoIncrMapperProcCount(conf);
            NUnit.Framework.Assert.IsTrue(mapperAutoIncr);

            bool reducerAutoIncr = SkipBadRecords.GetAutoIncrReducerProcCount(conf);
            NUnit.Framework.Assert.IsTrue(reducerAutoIncr);

            long mapperMaxSkip = SkipBadRecords.GetMapperMaxSkipRecords(conf);
            NUnit.Framework.Assert.AreEqual(0, mapperMaxSkip);

            long reducerMaxSkip = SkipBadRecords.GetReducerMaxSkipGroups(conf);
            NUnit.Framework.Assert.AreEqual(0, reducerMaxSkip, 0);

            Path skipOutput = SkipBadRecords.GetSkipOutputPath(conf);
            NUnit.Framework.Assert.IsNull(skipOutput);

            // --- setters ---
            SkipBadRecords.SetAttemptsToStartSkipping(conf, 5);
            SkipBadRecords.SetAutoIncrMapperProcCount(conf, false);
            SkipBadRecords.SetAutoIncrReducerProcCount(conf, false);
            SkipBadRecords.SetMapperMaxSkipRecords(conf, 6L);
            SkipBadRecords.SetReducerMaxSkipGroups(conf, 7L);

            // The skip output path is set on, and read back from, a separate JobConf.
            JobConf jc = new JobConf();
            SkipBadRecords.SetSkipOutputPath(jc, new Path("test"));

            // --- getters after the setters ran ---
            attemptsToStart = SkipBadRecords.GetAttemptsToStartSkipping(conf);
            NUnit.Framework.Assert.AreEqual(5, attemptsToStart);

            mapperAutoIncr = SkipBadRecords.GetAutoIncrMapperProcCount(conf);
            NUnit.Framework.Assert.IsFalse(mapperAutoIncr);

            reducerAutoIncr = SkipBadRecords.GetAutoIncrReducerProcCount(conf);
            NUnit.Framework.Assert.IsFalse(reducerAutoIncr);

            mapperMaxSkip = SkipBadRecords.GetMapperMaxSkipRecords(conf);
            NUnit.Framework.Assert.AreEqual(6L, mapperMaxSkip);

            reducerMaxSkip = SkipBadRecords.GetReducerMaxSkipGroups(conf);
            NUnit.Framework.Assert.AreEqual(7L, reducerMaxSkip, 0);

            skipOutput = SkipBadRecords.GetSkipOutputPath(jc);
            NUnit.Framework.Assert.AreEqual("test", skipOutput.ToString());
        }
Пример #4
0
 /// <summary>
 /// Appends a key/value pair to the skip writer, creating the writer on first use.
 /// </summary>
 /// <param name="key">Key of the skipped record.</param>
 /// <param name="value">Value of the skipped record.</param>
 /// <exception cref="System.IO.IOException"/>
 private void WriteSkippedRec(KEY key, VALUE value)
 {
     if (skipWriter == null)
     {
         // The skip file is named after the task ID and placed under the
         // skip output path taken from the enclosing task's configuration.
         Path outputDir    = SkipBadRecords.GetSkipOutputPath(_enclosing.conf);
         Path taskSkipFile = new Path(outputDir, _enclosing.GetTaskID().ToString());

         skipWriter = SequenceFile.CreateWriter(
             taskSkipFile.GetFileSystem(_enclosing.conf),
             _enclosing.conf,
             taskSkipFile,
             keyClass,
             valClass,
             SequenceFile.CompressionType.Block,
             reporter);
     }

     skipWriter.Append(key, value);
 }
Пример #5
0
        /// <summary>
        /// Runs the reduce phase: instantiates the job's reducer, wraps the task output in a
        /// tracking record writer, and calls <c>Reduce</c> once per key produced by the
        /// values iterator.
        /// </summary>
        /// <typeparam name="INKEY">Reducer input key type.</typeparam>
        /// <typeparam name="INVALUE">Reducer input value type.</typeparam>
        /// <typeparam name="OUTKEY">Reducer output key type.</typeparam>
        /// <typeparam name="OUTVALUE">Reducer output value type.</typeparam>
        /// <param name="job">Job configuration supplying the reducer class, grouping comparator and skip settings.</param>
        /// <param name="umbilical">Passed to the skipping iterator when the task is in skipping mode.</param>
        /// <param name="reporter">Task reporter used for counters and handed to the writer and iterator.</param>
        /// <param name="rIter">Raw key/value iterator over the reduce input.</param>
        /// <param name="comparator">Comparator handed to the skipping iterator; the non-skipping
        /// iterator uses the job's output value grouping comparator instead.</param>
        /// <exception cref="System.IO.IOException"/>
        private void RunOldReducer<INKEY, INVALUE, OUTKEY, OUTVALUE>(JobConf job, TaskUmbilicalProtocol umbilical,
                                                                      Task.TaskReporter reporter, RawKeyValueIterator rIter,
                                                                      RawComparator<INKEY> comparator)
        {
            // NOTE: the type parameter names must match the identifiers used in the body
            // exactly; C# identifiers are case-sensitive.
            System.Type keyClass   = typeof(INKEY);
            System.Type valueClass = typeof(INVALUE);
            Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer = ReflectionUtils.NewInstance(job.GetReducerClass(), job);

            // make output collector
            string finalName = GetOutputName(GetPartition());
            RecordWriter<OUTKEY, OUTVALUE> @out =
                new ReduceTask.OldTrackingRecordWriter<OUTKEY, OUTVALUE>(this, job, reporter, finalName);
            // The collector keeps its own reference to the writer; @out is set to null
            // below once it has been closed successfully.
            RecordWriter<OUTKEY, OUTVALUE>    finalOut  = @out;
            OutputCollector<OUTKEY, OUTVALUE> collector = new _OutputCollector_419(finalOut, reporter);

            // indicate that progress update needs to be sent
            // apply reduce function
            try
            {
                //increment processed counter only if skipping feature is enabled
                bool incrProcCount = SkipBadRecords.GetReducerMaxSkipGroups(job) > 0
                                     && SkipBadRecords.GetAutoIncrReducerProcCount(job);

                ReduceTask.ReduceValuesIterator<INKEY, INVALUE> values;
                if (IsSkipping())
                {
                    values = new ReduceTask.SkippingReduceValuesIterator<INKEY, INVALUE>(
                        this, rIter, comparator, keyClass, valueClass, job, reporter, umbilical);
                }
                else
                {
                    values = new ReduceTask.ReduceValuesIterator<INKEY, INVALUE>(
                        this, rIter, job.GetOutputValueGroupingComparator(), keyClass, valueClass, job, reporter);
                }

                values.InformReduceProgress();
                while (values.More())
                {
                    reduceInputKeyCounter.Increment(1);
                    reducer.Reduce(values.GetKey(), values, collector, reporter);
                    if (incrProcCount)
                    {
                        reporter.IncrCounter(SkipBadRecords.CounterGroup, SkipBadRecords.CounterReduceProcessedGroups, 1);
                    }
                    values.NextKey();
                    values.InformReduceProgress();
                }

                // Normal-path close. The references are cleared afterwards so the finally
                // block is handed null for anything already closed (presumably both
                // cleanup helpers ignore null - confirm against their implementations).
                reducer.Close();
                reducer = null;
                @out.Close(reporter);
                @out = null;
            }
            finally
            {
                IOUtils.Cleanup(Log, reducer);
                CloseQuietly(@out, reporter);
            }
        }
Пример #6
0
 /// <summary>
 /// Creates a reduce values iterator that consults the enclosing task's skip ranges
 /// and records skipped groups and records in task counters.
 /// </summary>
 /// <param name="_enclosing">The reduce task that owns this iterator.</param>
 /// <param name="in">Raw key/value iterator over the reduce input; forwarded to the base iterator.</param>
 /// <param name="comparator">Key comparator; forwarded to the base iterator.</param>
 /// <param name="keyClass">Runtime type of the keys.</param>
 /// <param name="valClass">Runtime type of the values.</param>
 /// <param name="conf">Configuration used to look up the skip output path.</param>
 /// <param name="reporter">Task reporter providing the skipped-group and skipped-record counters.</param>
 /// <param name="umbilical">Umbilical protocol handle stored for later use.</param>
 /// <exception cref="System.IO.IOException"/>
 public SkippingReduceValuesIterator(ReduceTask _enclosing, RawKeyValueIterator @in,
                                     RawComparator<KEY> comparator, Type keyClass, Type valClass, Configuration conf,
                                     Task.TaskReporter reporter, TaskUmbilicalProtocol umbilical)
     // Forward the iterator inputs to the base class. Passing only _enclosing left
     // @in and comparator unused, so the base iterator had nothing to iterate over.
     // NOTE(review): assumes the base constructor has the same
     // (enclosing, in, comparator, keyClass, valClass, conf, reporter) shape used
     // where the non-skipping iterator is constructed - confirm.
     : base(_enclosing, @in, comparator, keyClass, valClass, conf, reporter)
 {
     this._enclosing       = _enclosing;
     this.umbilical        = umbilical;
     this.skipGroupCounter = (Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedGroups);
     this.skipRecCounter   = (Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedRecords);

     // Skipped records are only written out when the task asks for it and a skip
     // output path is configured.
     this.toWriteSkipRecs = this._enclosing.ToWriteSkipRecs()
                            && SkipBadRecords.GetSkipOutputPath(conf) != null;
     this.keyClass = keyClass;
     this.valClass = valClass;
     this.reporter = reporter;
     this.skipIt   = this._enclosing.GetSkipRanges().SkipRangeIterator();
     this.MayBeSkip();
 }
Пример #7
0
        /// <summary>
        /// Validates a finished job: checks that it succeeded, compares the skip-related
        /// and record counters with the expected values, reads the skipped records back
        /// from the skip output directory, and checks the first output file line by line.
        /// </summary>
        /// <param name="conf">Job configuration; supplies the skip output path and is used to open the skip files.</param>
        /// <param name="runningJob">The job whose success flag and counters are checked.</param>
        /// <param name="mapperBadRecords">Records expected to have been skipped on the map side.</param>
        /// <param name="redBadRecords">Records expected to have been skipped on the reduce side.</param>
        /// <exception cref="System.Exception"/>
        private void ValidateOutput(JobConf conf, RunningJob runningJob, IList<string> mapperBadRecords,
                                    IList<string> redBadRecords)
        {
            Log.Info(runningJob.GetCounters().ToString());
            NUnit.Framework.Assert.IsTrue(runningJob.IsSuccessful());

            //validate counters
            // Assert.AreEqual takes (expected, actual); the expected value goes first so
            // that a failure message reports the two values the right way round.
            Counters counters = runningJob.GetCounters();

            NUnit.Framework.Assert.AreEqual(mapperBadRecords.Count,
                                            counters.FindCounter(TaskCounter.MapSkippedRecords).GetCounter());
            int mapRecs = input.Count - mapperBadRecords.Count;

            NUnit.Framework.Assert.AreEqual(mapRecs,
                                            counters.FindCounter(TaskCounter.MapInputRecords).GetCounter());
            NUnit.Framework.Assert.AreEqual(mapRecs,
                                            counters.FindCounter(TaskCounter.MapOutputRecords).GetCounter());
            int redRecs = mapRecs - redBadRecords.Count;

            NUnit.Framework.Assert.AreEqual(redBadRecords.Count,
                                            counters.FindCounter(TaskCounter.ReduceSkippedRecords).GetCounter());
            NUnit.Framework.Assert.AreEqual(redBadRecords.Count,
                                            counters.FindCounter(TaskCounter.ReduceSkippedGroups).GetCounter());
            NUnit.Framework.Assert.AreEqual(redRecs,
                                            counters.FindCounter(TaskCounter.ReduceInputGroups).GetCounter());
            NUnit.Framework.Assert.AreEqual(redRecs,
                                            counters.FindCounter(TaskCounter.ReduceInputRecords).GetCounter());
            NUnit.Framework.Assert.AreEqual(redRecs,
                                            counters.FindCounter(TaskCounter.ReduceOutputRecords).GetCounter());

            //validate skipped records
            Path skipDir = SkipBadRecords.GetSkipOutputPath(conf);

            NUnit.Framework.Assert.IsNotNull(skipDir);
            Path[]        skips      = FileUtil.Stat2Paths(GetFileSystem().ListStatus(skipDir));
            IList<string> mapSkipped = new AList<string>();
            IList<string> redSkipped = new AList<string>();

            foreach (Path skipPath in skips)
            {
                Log.Info("skipPath: " + skipPath);

                // Files whose name contains "_r_" are collected as reduce-side skips;
                // everything else is collected as map-side skips.
                IList<string> target = skipPath.GetName().Contains("_r_") ? redSkipped : mapSkipped;

                SequenceFile.Reader reader = new SequenceFile.Reader(GetFileSystem(), skipPath, conf);
                try
                {
                    object key   = ReflectionUtils.NewInstance(reader.GetKeyClass(), conf);
                    object value = ReflectionUtils.NewInstance(reader.GetValueClass(), conf);
                    key = reader.Next(key);
                    while (key != null)
                    {
                        value = reader.GetCurrentValue(value);
                        Log.Debug("key:" + key + " value:" + value.ToString());
                        target.AddItem(value.ToString());
                        key = reader.Next(key);
                    }
                }
                finally
                {
                    // Close the reader even when reading throws.
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.IsTrue(mapSkipped.ContainsAll(mapperBadRecords));
            NUnit.Framework.Assert.IsTrue(redSkipped.ContainsAll(redBadRecords));

            Path[] outputFiles = FileUtil.Stat2Paths(
                GetFileSystem().ListStatus(GetOutputDir(), new Utils.OutputFileUtils.OutputFilesFilter()));
            IList<string> mapperOutput = GetProcessed(input, mapperBadRecords);

            Log.Debug("mapperOutput " + mapperOutput.Count);
            IList<string> reducerOutput = GetProcessed(mapperOutput, redBadRecords);

            Log.Debug("reducerOutput " + reducerOutput.Count);
            if (outputFiles.Length > 0)
            {
                // Only the first output file is inspected.
                InputStream    @is     = GetFileSystem().Open(outputFiles[0]);
                BufferedReader reader  = new BufferedReader(new InputStreamReader(@is));
                int            counter = 0;
                try
                {
                    string line = reader.ReadLine();
                    while (line != null)
                    {
                        counter++;
                        // Each output line is split on tab into key and value.
                        StringTokenizer tokeniz = new StringTokenizer(line, "\t");
                        string          key     = tokeniz.NextToken();
                        string          value   = tokeniz.NextToken();
                        Log.Debug("Output: key:" + key + "  value:" + value);
                        NUnit.Framework.Assert.IsTrue(value.Contains("hello"));
                        NUnit.Framework.Assert.IsTrue(reducerOutput.Contains(value));
                        line = reader.ReadLine();
                    }
                }
                finally
                {
                    // Close the reader even when an assertion above fails.
                    reader.Close();
                }
                NUnit.Framework.Assert.AreEqual(reducerOutput.Count, counter);
            }
        }