/// <summary>
/// Configures a job named "mr" with one map task and one reduce task, enables
/// record skipping from the very first attempt, runs the job and then hands the
/// result to <c>ValidateOutput</c>.
/// </summary>
/// <param name="conf">Job configuration that is populated and submitted.</param>
/// <param name="mapperBadRecords">Records listed as bad for the map side; their count raises the map attempt limit.</param>
/// <param name="redBadRecords">Records listed as bad for the reduce side; their count raises the reduce attempt limit.</param>
/// <exception cref="System.Exception"/>
private void RunMapReduce(JobConf conf, IList<string> mapperBadRecords, IList<string> redBadRecords)
{
    CreateInput();

    // Basic job shape.
    conf.SetJobName("mr");
    conf.SetNumMapTasks(1);
    conf.SetNumReduceTasks(1);
    conf.SetInt(JobContext.TaskTimeout, 30 * 1000);

    // Skipping configuration: no upper bound on what may be skipped, and
    // skipping starts without any preliminary failed attempts.
    SkipBadRecords.SetMapperMaxSkipRecords(conf, long.MaxValue);
    SkipBadRecords.SetReducerMaxSkipGroups(conf, long.MaxValue);
    SkipBadRecords.SetAttemptsToStartSkipping(conf, 0);

    // The number of attempts needed to complete a task successfully grows
    // with the number of bad records, so size the limits accordingly.
    int maxMapAttempts = SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 + mapperBadRecords.Count;
    conf.SetMaxMapAttempts(maxMapAttempts);
    int maxReduceAttempts = SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 + redBadRecords.Count;
    conf.SetMaxReduceAttempts(maxReduceAttempts);

    // Input / output locations and formats.
    FileInputFormat.SetInputPaths(conf, GetInputDir());
    FileOutputFormat.SetOutputPath(conf, GetOutputDir());
    conf.SetInputFormat(typeof(TextInputFormat));
    conf.SetMapOutputKeyClass(typeof(LongWritable));
    conf.SetMapOutputValueClass(typeof(Text));
    conf.SetOutputFormat(typeof(TextOutputFormat));
    conf.SetOutputKeyClass(typeof(LongWritable));
    conf.SetOutputValueClass(typeof(Text));

    RunningJob completedJob = JobClient.RunJob(conf);
    ValidateOutput(conf, completedJob, mapperBadRecords, redBadRecords);
}
/// <summary>
/// Instantiates the job's mapper class and decides whether the processed-record
/// counter should be incremented by this runner.
/// </summary>
/// <param name="job">Job configuration supplying the mapper class and the skip settings.</param>
public virtual void Configure(JobConf job)
{
    this.mapper = ReflectionUtils.NewInstance(job.GetMapperClass(), job);

    // The processed counter is only maintained when the skipping feature is
    // enabled (max skip records > 0); the auto-increment flag is consulted
    // only in that case.
    bool skippingEnabled = SkipBadRecords.GetMapperMaxSkipRecords(job) > 0;
    this.incrProcCount = skippingEnabled && SkipBadRecords.GetAutoIncrMapperProcCount(job);
}
/// <summary>
/// Checks the default values reported by <c>SkipBadRecords</c> on a fresh
/// configuration, then sets each option and checks that the getters return
/// the values that were set.
/// </summary>
public virtual void TestSkipBadRecords()
{
    // --- defaults on an untouched configuration ---
    Configuration settings = new Configuration();
    NUnit.Framework.Assert.AreEqual(2, SkipBadRecords.GetAttemptsToStartSkipping(settings));
    NUnit.Framework.Assert.IsTrue(SkipBadRecords.GetAutoIncrMapperProcCount(settings));
    NUnit.Framework.Assert.IsTrue(SkipBadRecords.GetAutoIncrReducerProcCount(settings));
    NUnit.Framework.Assert.AreEqual(0, SkipBadRecords.GetMapperMaxSkipRecords(settings));
    NUnit.Framework.Assert.AreEqual(0, SkipBadRecords.GetReducerMaxSkipGroups(settings), 0);
    NUnit.Framework.Assert.IsNull(SkipBadRecords.GetSkipOutputPath(settings));

    // --- apply new values ---
    SkipBadRecords.SetAttemptsToStartSkipping(settings, 5);
    SkipBadRecords.SetAutoIncrMapperProcCount(settings, false);
    SkipBadRecords.SetAutoIncrReducerProcCount(settings, false);
    SkipBadRecords.SetMapperMaxSkipRecords(settings, 6L);
    SkipBadRecords.SetReducerMaxSkipGroups(settings, 7L);

    // The skip output path is set on, and read back from, a separate JobConf.
    JobConf jobSettings = new JobConf();
    SkipBadRecords.SetSkipOutputPath(jobSettings, new Path("test"));

    // --- read the new values back ---
    NUnit.Framework.Assert.AreEqual(5, SkipBadRecords.GetAttemptsToStartSkipping(settings));
    NUnit.Framework.Assert.IsFalse(SkipBadRecords.GetAutoIncrMapperProcCount(settings));
    NUnit.Framework.Assert.IsFalse(SkipBadRecords.GetAutoIncrReducerProcCount(settings));
    NUnit.Framework.Assert.AreEqual(6L, SkipBadRecords.GetMapperMaxSkipRecords(settings));
    NUnit.Framework.Assert.AreEqual(7L, SkipBadRecords.GetReducerMaxSkipGroups(settings), 0);
    NUnit.Framework.Assert.AreEqual("test", SkipBadRecords.GetSkipOutputPath(jobSettings).ToString());
}
/// <summary>
/// Appends one skipped key/value pair to the skip file, creating the
/// sequence-file writer on first use.
/// </summary>
/// <param name="key">Key of the skipped record.</param>
/// <param name="value">Value of the skipped record.</param>
/// <exception cref="System.IO.IOException"/>
private void WriteSkippedRec(KEY key, VALUE value)
{
    bool writerMissing = this.skipWriter == null;
    if (writerMissing)
    {
        // The skip file lives under the configured skip output path and is
        // named after the enclosing task's id.
        Path outputRoot = SkipBadRecords.GetSkipOutputPath(this._enclosing.conf);
        string taskFileName = this._enclosing.GetTaskID().ToString();
        Path taskSkipFile = new Path(outputRoot, taskFileName);

        this.skipWriter = SequenceFile.CreateWriter(
            taskSkipFile.GetFileSystem(this._enclosing.conf),
            this._enclosing.conf,
            taskSkipFile,
            this.keyClass,
            this.valClass,
            SequenceFile.CompressionType.Block,
            this.reporter);
    }

    this.skipWriter.Append(key, value);
}
/// <summary>
/// Runs the job's reducer over every key group produced by
/// <paramref name="rIter"/>, writing results through a tracking record writer.
/// </summary>
/// <remarks>
/// The type parameters are declared as <c>INKEY</c>, <c>INVALUE</c>,
/// <c>OUTKEY</c> and <c>OUTVALUE</c> so that they match the names used in the
/// parameter list and body; C# identifiers are case-sensitive, so a
/// declaration spelled <c>Inkey</c>/<c>Invalue</c>/... does not bind them.
/// </remarks>
/// <typeparam name="INKEY">Reducer input key type.</typeparam>
/// <typeparam name="INVALUE">Reducer input value type.</typeparam>
/// <typeparam name="OUTKEY">Reducer output key type.</typeparam>
/// <typeparam name="OUTVALUE">Reducer output value type.</typeparam>
/// <param name="job">Job configuration supplying the reducer class and skip settings.</param>
/// <param name="umbilical">Passed to the skipping iterator when the task is in skipping mode.</param>
/// <param name="reporter">Used for counters and handed to the writer, collector and iterators.</param>
/// <param name="rIter">Raw key/value input for the reduce.</param>
/// <param name="comparator">Comparator used by the skipping iterator.</param>
/// <exception cref="System.IO.IOException"/>
private void RunOldReducer<INKEY, INVALUE, OUTKEY, OUTVALUE>(JobConf job, TaskUmbilicalProtocol umbilical, Task.TaskReporter reporter, RawKeyValueIterator rIter, RawComparator<INKEY> comparator)
{
    System.Type keyClass = typeof(INKEY);
    System.Type valueClass = typeof(INVALUE);
    Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer = ReflectionUtils.NewInstance(job.GetReducerClass(), job);

    // make output collector
    string finalName = GetOutputName(GetPartition());
    RecordWriter<OUTKEY, OUTVALUE> @out = new ReduceTask.OldTrackingRecordWriter<OUTKEY, OUTVALUE>(this, job, reporter, finalName);
    // The collector keeps its own reference because @out is nulled below
    // once it has been closed.
    RecordWriter<OUTKEY, OUTVALUE> finalOut = @out;
    OutputCollector<OUTKEY, OUTVALUE> collector = new _OutputCollector_419(finalOut, reporter);

    // apply reduce function
    try
    {
        // increment processed counter only if skipping feature is enabled
        bool incrProcCount = SkipBadRecords.GetReducerMaxSkipGroups(job) > 0
            && SkipBadRecords.GetAutoIncrReducerProcCount(job);

        // In skipping mode the caller-supplied comparator is used; otherwise
        // the job's output value grouping comparator is used.
        ReduceTask.ReduceValuesIterator<INKEY, INVALUE> values = IsSkipping()
            ? new ReduceTask.SkippingReduceValuesIterator<INKEY, INVALUE>(this, rIter, comparator, keyClass, valueClass, job, reporter, umbilical)
            : new ReduceTask.ReduceValuesIterator<INKEY, INVALUE>(this, rIter, job.GetOutputValueGroupingComparator(), keyClass, valueClass, job, reporter);

        values.InformReduceProgress();
        while (values.More())
        {
            reduceInputKeyCounter.Increment(1);
            reducer.Reduce(values.GetKey(), values, collector, reporter);
            if (incrProcCount)
            {
                reporter.IncrCounter(SkipBadRecords.CounterGroup, SkipBadRecords.CounterReduceProcessedGroups, 1);
            }
            values.NextKey();
            values.InformReduceProgress();
        }

        // Normal-path shutdown. Each reference is cleared after a successful
        // close; presumably the cleanup helpers in the finally block ignore
        // null, so nothing is closed twice - confirm against those helpers.
        reducer.Close();
        reducer = null;
        @out.Close(reporter);
        @out = null;
    }
    finally
    {
        // Best-effort cleanup of whatever was not closed above.
        IOUtils.Cleanup(Log, reducer);
        CloseQuietly(@out, reporter);
    }
}
/// <summary>
/// Creates a values iterator that tracks skipped groups and records through
/// task counters and positions itself via <c>MayBeSkip</c> before first use.
/// </summary>
/// <param name="_enclosing">Owning reduce task; supplies the skip ranges and the write-skipped-records flag.</param>
/// <param name="in">Raw key/value input.</param>
/// <param name="comparator">Key comparator.</param>
/// <param name="keyClass">Key type, stored for later use.</param>
/// <param name="valClass">Value type, stored for later use.</param>
/// <param name="conf">Configuration consulted for the skip output path.</param>
/// <param name="reporter">Source of the skipped-group and skipped-record counters.</param>
/// <param name="umbilical">Stored for later use.</param>
/// <remarks>
/// NOTE(review): only <c>_enclosing</c> is forwarded to the base constructor;
/// <c>@in</c> and <c>comparator</c> are not used in this constructor body -
/// confirm how the base iterator receives them.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public SkippingReduceValuesIterator(ReduceTask _enclosing, RawKeyValueIterator @in, RawComparator<KEY> comparator, Type keyClass, Type valClass, Configuration conf, Task.TaskReporter reporter, TaskUmbilicalProtocol umbilical)
    : base(_enclosing)
{
    // Collaborators handed in by the caller.
    this._enclosing = _enclosing;
    this.umbilical = umbilical;

    // Counters for what gets skipped on the reduce side.
    this.skipGroupCounter = (Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedGroups);
    this.skipRecCounter = (Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedRecords);

    // Skipped records are written out only when the task asks for it and a
    // skip output path is configured.
    bool taskWantsSkipFile = this._enclosing.ToWriteSkipRecs();
    this.toWriteSkipRecs = taskWantsSkipFile && SkipBadRecords.GetSkipOutputPath(conf) != null;

    this.keyClass = keyClass;
    this.valClass = valClass;
    this.reporter = reporter;
    this.skipIt = this._enclosing.GetSkipRanges().SkipRangeIterator();

    // Must run last: every field above is assigned before it is called.
    this.MayBeSkip();
}
/// <summary>
/// Verifies a finished skip-bad-records job: the job succeeded, the task
/// counters match the expected record counts, the skip files contain every
/// bad record, and the first output file contains exactly the records that
/// should have survived both map and reduce.
/// </summary>
/// <param name="conf">Job configuration; supplies the skip output path and is used to open the skip files.</param>
/// <param name="runningJob">The completed job whose success flag and counters are checked.</param>
/// <param name="mapperBadRecords">Records expected to be skipped on the map side.</param>
/// <param name="redBadRecords">Records expected to be skipped on the reduce side.</param>
/// <exception cref="System.Exception"/>
private void ValidateOutput(JobConf conf, RunningJob runningJob, IList<string> mapperBadRecords, IList<string> redBadRecords)
{
    Log.Info(runningJob.GetCounters().ToString());
    NUnit.Framework.Assert.IsTrue(runningJob.IsSuccessful());

    // validate counters
    // (expected value first, actual second, so failure messages read correctly)
    Counters counters = runningJob.GetCounters();
    NUnit.Framework.Assert.AreEqual(mapperBadRecords.Count, counters.FindCounter(TaskCounter.MapSkippedRecords).GetCounter());
    int mapRecs = input.Count - mapperBadRecords.Count;
    NUnit.Framework.Assert.AreEqual(mapRecs, counters.FindCounter(TaskCounter.MapInputRecords).GetCounter());
    NUnit.Framework.Assert.AreEqual(mapRecs, counters.FindCounter(TaskCounter.MapOutputRecords).GetCounter());
    int redRecs = mapRecs - redBadRecords.Count;
    NUnit.Framework.Assert.AreEqual(redBadRecords.Count, counters.FindCounter(TaskCounter.ReduceSkippedRecords).GetCounter());
    NUnit.Framework.Assert.AreEqual(redBadRecords.Count, counters.FindCounter(TaskCounter.ReduceSkippedGroups).GetCounter());
    NUnit.Framework.Assert.AreEqual(redRecs, counters.FindCounter(TaskCounter.ReduceInputGroups).GetCounter());
    NUnit.Framework.Assert.AreEqual(redRecs, counters.FindCounter(TaskCounter.ReduceInputRecords).GetCounter());
    NUnit.Framework.Assert.AreEqual(redRecs, counters.FindCounter(TaskCounter.ReduceOutputRecords).GetCounter());

    // validate skipped records
    Path skipDir = SkipBadRecords.GetSkipOutputPath(conf);
    NUnit.Framework.Assert.IsNotNull(skipDir);
    Path[] skips = FileUtil.Stat2Paths(GetFileSystem().ListStatus(skipDir));
    IList<string> mapSkipped = new AList<string>();
    IList<string> redSkipped = new AList<string>();
    foreach (Path skipPath in skips)
    {
        Log.Info("skipPath: " + skipPath);

        // Files whose name contains "_r_" are treated as reduce-side skip
        // files; everything else is counted as map-side. The name does not
        // change while reading, so decide once per file.
        IList<string> skippedSink = skipPath.GetName().Contains("_r_") ? redSkipped : mapSkipped;

        SequenceFile.Reader skipReader = new SequenceFile.Reader(GetFileSystem(), skipPath, conf);
        try
        {
            object skipKey = ReflectionUtils.NewInstance(skipReader.GetKeyClass(), conf);
            object skipValue = ReflectionUtils.NewInstance(skipReader.GetValueClass(), conf);
            skipKey = skipReader.Next(skipKey);
            while (skipKey != null)
            {
                skipValue = skipReader.GetCurrentValue(skipValue);
                Log.Debug("key:" + skipKey + " value:" + skipValue.ToString());
                skippedSink.AddItem(skipValue.ToString());
                skipKey = skipReader.Next(skipKey);
            }
        }
        finally
        {
            // Close even when reading throws, so the file handle is not leaked.
            skipReader.Close();
        }
    }
    NUnit.Framework.Assert.IsTrue(mapSkipped.ContainsAll(mapperBadRecords));
    NUnit.Framework.Assert.IsTrue(redSkipped.ContainsAll(redBadRecords));

    // validate the job output against the records expected to survive
    Path[] outputFiles = FileUtil.Stat2Paths(GetFileSystem().ListStatus(GetOutputDir(), new Utils.OutputFileUtils.OutputFilesFilter()));
    IList<string> mapperOutput = GetProcessed(input, mapperBadRecords);
    Log.Debug("mapperOutput " + mapperOutput.Count);
    IList<string> reducerOutput = GetProcessed(mapperOutput, redBadRecords);
    Log.Debug("reducerOutput " + reducerOutput.Count);
    if (outputFiles.Length > 0)
    {
        // Only the first output file is inspected.
        int counter = 0;
        InputStream @is = GetFileSystem().Open(outputFiles[0]);
        BufferedReader outputReader = new BufferedReader(new InputStreamReader(@is));
        try
        {
            string line = outputReader.ReadLine();
            while (line != null)
            {
                counter++;
                // Each output line is "<key>\t<value>".
                StringTokenizer tokeniz = new StringTokenizer(line, "\t");
                string outKey = tokeniz.NextToken();
                string outValue = tokeniz.NextToken();
                Log.Debug("Output: key:" + outKey + " value:" + outValue);
                NUnit.Framework.Assert.IsTrue(outValue.Contains("hello"));
                NUnit.Framework.Assert.IsTrue(reducerOutput.Contains(outValue));
                line = outputReader.ReadLine();
            }
        }
        finally
        {
            // A failed assertion above must not leave the stream open.
            outputReader.Close();
        }
        NUnit.Framework.Assert.AreEqual(reducerOutput.Count, counter);
    }
}