/// <summary>
/// Times how long it takes to read every record of the benchmark file through
/// the input format configured on <paramref name="conf"/>.
/// </summary>
/// <remarks>
/// The file name is read from the <c>test.filebench.name</c> property and
/// resolved against the first configured input path. The file is read as one
/// split covering offset 0 up to the file length.
/// </remarks>
/// <param name="conf">job configuration supplying the input format, input path and file name</param>
/// <returns>elapsed time of the read loop, in milliseconds</returns>
/// <exception cref="System.IO.IOException"/>
internal static long ReadBench(JobConf conf)
{
    // InputFormat instantiation
    InputFormat inf = conf.GetInputFormat();
    string fn = conf.Get("test.filebench.name", string.Empty);
    Path pin = new Path(FileInputFormat.GetInputPaths(conf)[0], fn);
    FileStatus @in = pin.GetFileSystem(conf).GetFileStatus(pin);
    RecordReader rr = inf.GetRecordReader(
        new FileSplit(pin, 0, @in.GetLen(), (string[])null), conf, Reporter.Null);
    try
    {
        object key = rr.CreateKey();
        object val = rr.CreateValue();
        // A default-constructed DateTime is always DateTime.MinValue, so two of
        // them can never measure an interval. A Stopwatch measures the loop itself.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        while (rr.Next(key, val))
        {
            // Records are only read, never inspected.
        }
        timer.Stop();
        return timer.ElapsedMilliseconds;
    }
    finally
    {
        rr.Close();
    }
}
/// <summary>
/// Reads every split produced by a <c>SequenceFileInputFilter</c> for the
/// current job and counts the records that the readers return.
/// </summary>
/// <param name="numSplits">requested number of splits; 0 means a random count is chosen</param>
/// <returns>the total number of records read across all splits</returns>
/// <exception cref="System.IO.IOException"/>
private int CountRecords(int numSplits)
{
    InputFormat<Text, BytesWritable> filter = new SequenceFileInputFilter<Text, BytesWritable>();
    Text recordKey = new Text();
    BytesWritable recordValue = new BytesWritable();

    if (numSplits == 0)
    {
        numSplits = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
    }

    InputSplit[] splits = filter.GetSplits(job, numSplits);
    int accepted = 0;
    Log.Info("Generated " + splits.Length + " splits.");

    // Walk each split in turn, closing its reader even if reading fails.
    foreach (InputSplit split in splits)
    {
        RecordReader<Text, BytesWritable> splitReader = filter.GetRecordReader(split, job, reporter);
        try
        {
            while (splitReader.Next(recordKey, recordValue))
            {
                Log.Info("Accept record " + recordKey.ToString());
                accepted++;
            }
        }
        finally
        {
            splitReader.Close();
        }
    }

    return accepted;
}
/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <remarks>
/// Splits are visited at a fixed stride so that the sampled splits are spread
/// over the whole split array. The record counter is cumulative across
/// splits: split <c>i</c> stops contributing once the running total reaches
/// <c>(i + 1) * samplesPerSplit</c>.
/// </remarks>
/// <param name="inf">input format used to compute splits and open a reader on each one</param>
/// <param name="job">job configuration; also supplies the number of map tasks used as the split hint</param>
/// <returns>the sampled keys as an array</returns>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    InputSplit[] splits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Length);

    // With nothing to sample the divisions below would divide by zero, so
    // skip straight to returning the (empty) sample list.
    if (splitsToSample > 0)
    {
        int splitStep = splits.Length / splitsToSample;
        int samplesPerSplit = numSamples / splitsToSample;
        long records = 0;
        for (int i = 0; i < splitsToSample; ++i)
        {
            RecordReader<K, V> reader = inf.GetRecordReader(splits[i * splitStep], job, Reporter.Null);
            try
            {
                K key = reader.CreateKey();
                V value = reader.CreateValue();
                while (reader.Next(key, value))
                {
                    samples.AddItem(key);
                    // The stored key must not be overwritten by the next read.
                    key = reader.CreateKey();
                    ++records;
                    if ((i + 1) * samplesPerSplit <= records)
                    {
                        break;
                    }
                }
            }
            finally
            {
                // Release the reader even when reading a record throws.
                reader.Close();
            }
        }
    }

    // ArrayList::toArray doesn't preserve type
    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>
/// Reads one split through <paramref name="format"/> and returns the value of
/// every record, in the order the reader produced them.
/// </summary>
/// <param name="format">input format used to open the record reader</param>
/// <param name="split">the split to read</param>
/// <param name="job">job configuration passed to the record reader</param>
/// <returns>the values read from the split</returns>
/// <exception cref="System.IO.IOException"/>
private static IList<Text> ReadSplit(KeyValueTextInputFormat format, InputSplit split, JobConf job)
{
    IList<Text> values = new AList<Text>();
    RecordReader<Text, Text> splitReader = null;
    try
    {
        splitReader = format.GetRecordReader(split, job, voidReporter);
        Text scratchKey = splitReader.CreateKey();
        // A new value object is requested after each record so that the one
        // just stored in the list is not handed back to the reader.
        for (Text current = splitReader.CreateValue();
             splitReader.Next(scratchKey, current);
             current = (Text)splitReader.CreateValue())
        {
            values.AddItem(current);
        }
    }
    finally
    {
        // The reader is still null if opening it failed.
        if (splitReader != null)
        {
            splitReader.Close();
        }
    }
    return values;
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <remarks>
/// The selection frequency field <c>freq</c> is reduced every time an
/// existing sample is replaced, so this method changes the sampler's state.
/// </remarks>
/// <param name="inf">input format used to compute splits and open a reader on each one</param>
/// <param name="job">job configuration; also supplies the number of map tasks used as the split hint</param>
/// <returns>the sampled keys as an array</returns>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    InputSplit[] splits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
    Random r = new Random();
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);
    // shuffle splits
    for (int i = 0; i < splits.Length; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }
    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int i_1 = 0;
         i_1 < splitsToSample || (i_1 < splits.Length && samples.Count < numSamples);
         ++i_1)
    {
        RecordReader<K, V> reader = inf.GetRecordReader(splits[i_1], job, Reporter.Null);
        try
        {
            K key = reader.CreateKey();
            V value = reader.CreateValue();
            while (reader.Next(key, value))
            {
                if (r.NextDouble() <= freq)
                {
                    if (samples.Count < numSamples)
                    {
                        samples.AddItem(key);
                    }
                    else
                    {
                        // When exceeding the maximum number of samples, replace a
                        // random element with this one, then adjust the frequency
                        // to reflect the possibility of existing elements being
                        // pushed out
                        int ind = r.Next(numSamples);
                        if (ind != numSamples)
                        {
                            samples.Set(ind, key);
                        }
                        freq *= (numSamples - 1) / (double)numSamples;
                    }
                    // The selected key now belongs to the sample list; read
                    // the following records into a new key object.
                    key = reader.CreateKey();
                }
            }
        }
        finally
        {
            // Release the reader even when reading a record throws.
            reader.Close();
        }
    }
    // ArrayList::toArray doesn't preserve type
    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>Stores the latest progress value and signals progress on the reporter.</summary>
/// <remarks>
/// When a record reader is attached, the value is also written into the
/// progress key and passed through that reader's <c>Next</c> call.
/// </remarks>
/// <param name="progress">the new progress value</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Progress(float progress)
{
    progressValue = progress;
    reporter.Progress();

    if (recordReader == null)
    {
        // No reader attached: nothing further to forward.
        return;
    }

    progressKey.Set(progress);
    recordReader.Next(progressKey, nullValue);
}
/// <summary>
/// Checks that <c>CombineSequenceFileInputFormat</c> returns a single
/// <c>CombineFileSplit</c> for the generated input files and that reading it
/// yields every expected key exactly once.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(conf);
    Reporter nullReporter = Reporter.Null;

    Random random = new Random();
    long seed = random.NextLong();
    Log.Info("seed = " + seed);
    random.SetSeed(seed);

    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);

    int keyCount = 10000;
    int fileCount = 10;
    // Populate the work directory with the input files.
    CreateFiles(keyCount, fileCount, random);

    // Build the combining input format and the reusable key/value holders.
    InputFormat<IntWritable, BytesWritable> combiner =
        new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable currentKey = new IntWritable();
    BytesWritable currentValue = new BytesWritable();

    for (int round = 0; round < 3; round++)
    {
        int requested = random.Next(keyCount / (SequenceFile.SyncInterval / 20)) + 1;
        Log.Info("splitting: requesting = " + requested);
        InputSplit[] produced = combiner.GetSplits(job, requested);
        Log.Info("splitting: got = " + produced.Length);

        // The total length is comfortably below the block size, so exactly
        // one split is expected regardless of the requested count.
        NUnit.Framework.Assert.AreEqual("We got more than one splits!", 1, produced.Length);
        InputSplit only = produced[0];
        NUnit.Framework.Assert.AreEqual("It should be CombineFileSplit", typeof(CombineFileSplit), only.GetType());

        // Track which keys have been seen so duplicates and gaps are detected.
        BitSet seen = new BitSet(keyCount);
        RecordReader<IntWritable, BytesWritable> splitReader = combiner.GetRecordReader(only, job, nullReporter);
        try
        {
            while (splitReader.Next(currentKey, currentValue))
            {
                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", seen.Get(currentKey.Get()));
                seen.Set(currentKey.Get());
            }
        }
        finally
        {
            splitReader.Close();
        }
        NUnit.Framework.Assert.AreEqual("Some keys in no partition.", keyCount, seen.Cardinality());
    }
}
/// <summary>Run the map task.</summary>
/// <remarks>
/// Creates the <c>Application</c>, tells its downlink to run the map, and,
/// when a Java record reader is configured, feeds every input record to the
/// downlink before waiting for the application to finish. Any exception
/// raised while feeding or waiting is passed to <c>application.Abort</c>;
/// <c>application.Cleanup</c> is always called afterwards.
/// </remarks>
/// <param name="input">the set of inputs</param>
/// <param name="output">the object to collect the outputs of the map</param>
/// <param name="reporter">the object to update with status</param>
/// <exception cref="System.IO.IOException"/>
public override void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output , Reporter reporter)
{
    Application <K1, V1, K2, V2> application = null;
    try
    {
        // The input is handed to the application (cast to a
        // FloatWritable/NullWritable reader) only when neither a Java record
        // reader nor a Java mapper is configured; otherwise null is passed.
        RecordReader <FloatWritable, NullWritable> fakeInput = (!Submitter.GetIsJavaRecordReader (job) && !Submitter.GetIsJavaMapper(job)) ? (RecordReader <FloatWritable, NullWritable >)input : null;
        application = new Application <K1, V1, K2, V2>(job, fakeInput, output, reporter, ( Type)job.GetOutputKeyClass(), (Type)job.GetOutputValueClass());
    }
    catch (Exception ie)
    {
        // Any failure while starting the application is rethrown unchecked,
        // with the original exception attached as the cause.
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol <K1, V1> downlink = application.GetDownlink();
    bool isJavaInput = Submitter.GetIsJavaRecordReader(job);
    downlink.RunMap(reporter.GetInputSplit(), job.GetNumReduceTasks(), isJavaInput);
    bool skipping = job.GetBoolean(MRJobConfig.SkipRecords, false);
    try
    {
        if (isJavaInput)
        {
            // allocate key & value instances that are re-used for all entries
            K1 key = input.CreateKey();
            V1 value = input.CreateValue();
            // Tell the downlink the runtime type names of the key and value.
            downlink.SetInputTypes(key.GetType().FullName, value.GetType().FullName);
            while (input.Next(key, value))
            {
                // map pair to output
                downlink.MapItem(key, value);
                if (skipping)
                {
                    //flush the streams on every record input if running in skip mode
                    //so that we don't buffer other records surrounding a bad record.
                    downlink.Flush();
                }
            }
            downlink.EndOfInput();
        }
        // Reached whether or not records were fed from this side.
        application.WaitForFinish();
    }
    catch (Exception t)
    {
        application.Abort(t);
    }
    finally
    {
        application.Cleanup();
    }
}
/// <summary>
/// Reads the next record from the current reader, moving on to the next
/// reader whenever the current one is missing or has no more records.
/// </summary>
/// <param name="key">key object to fill</param>
/// <param name="value">value object to fill</param>
/// <returns>
/// true when a record was read; false once <c>InitNextRecordReader()</c>
/// reports that no further reader is available
/// </returns>
/// <exception cref="System.IO.IOException"/>
public virtual bool Next(K key, V value)
{
    for (;;)
    {
        if (curReader != null && curReader.Next(key, value))
        {
            return true;
        }
        // Current reader absent or drained: try to move to the next one.
        if (!InitNextRecordReader())
        {
            return false;
        }
    }
}
/// <summary>
/// Reads one split through <paramref name="format"/> and returns the value of
/// every record, in the order the reader produced them.
/// </summary>
/// <param name="format">input format used to open the record reader</param>
/// <param name="split">the split to read</param>
/// <param name="jobConf">job configuration passed to the record reader</param>
/// <returns>the values read from the split</returns>
/// <exception cref="System.IO.IOException"/>
private static IList<Text> ReadSplit(TextInputFormat format, InputSplit split, JobConf jobConf)
{
    IList<Text> result = new AList<Text>();
    RecordReader<LongWritable, Text> reader = format.GetRecordReader(split, jobConf, voidReporter);
    try
    {
        LongWritable key = reader.CreateKey();
        Text value = reader.CreateValue();
        while (reader.Next(key, value))
        {
            result.AddItem(value);
            // Ask for another value object so the stored one is not reused
            // for the next record.
            value = reader.CreateValue();
        }
    }
    finally
    {
        // Close the reader even when reading a record throws.
        reader.Close();
    }
    return result;
}
// A reporter that does nothing
/// <summary>
/// Verifies the splits produced by <c>NLineInputFormat</c>: every split except
/// the last must report no locations, be read by a <c>LineRecordReader</c>
/// with <c>LongWritable</c> keys and <c>Text</c> values, and contain exactly
/// <paramref name="expectedN"/> lines.
/// </summary>
/// <param name="job">job configuration used to configure the format and compute splits</param>
/// <param name="expectedN">number of lines expected in each checked split</param>
/// <exception cref="System.IO.IOException"/>
internal virtual void CheckFormat(JobConf job, int expectedN)
{
    NLineInputFormat lineFormat = new NLineInputFormat();
    lineFormat.Configure(job);

    int ignoredNumSplits = 1;
    InputSplit[] splits = lineFormat.GetSplits(job, ignoredNumSplits);

    // The final split is skipped by the loop bound.
    int lastIndex = splits.Length - 1;
    int lineCount = 0;
    for (int idx = 0; idx < lastIndex; idx++)
    {
        InputSplit current = splits[idx];
        NUnit.Framework.Assert.AreEqual("There are no split locations", 0, current.GetLocations().Length);

        RecordReader<LongWritable, Text> lineReader = lineFormat.GetRecordReader(current, job, voidReporter);
        Type actualReaderType = lineReader.GetType();
        NUnit.Framework.Assert.AreEqual("reader class is LineRecordReader.", typeof(LineRecordReader), actualReaderType);

        LongWritable offsetKey = lineReader.CreateKey();
        Type actualKeyType = offsetKey.GetType();
        NUnit.Framework.Assert.AreEqual("Key class is LongWritable.", typeof(LongWritable), actualKeyType);

        Text lineValue = lineReader.CreateValue();
        Type actualValueType = lineValue.GetType();
        NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), actualValueType);

        try
        {
            // Restart the tally for this split.
            lineCount = 0;
            while (lineReader.Next(offsetKey, lineValue))
            {
                lineCount++;
            }
        }
        finally
        {
            lineReader.Close();
        }
        NUnit.Framework.Assert.AreEqual("number of lines in split is " + expectedN, expectedN, lineCount);
    }
}
/// <summary>
/// Reads one split through <paramref name="format"/> and returns each record's
/// bytes converted to a string.
/// </summary>
/// <param name="format">input format used to open the record reader</param>
/// <param name="split">the split to read</param>
/// <param name="job">job configuration passed to the record reader</param>
/// <returns>one string per record, in read order</returns>
/// <exception cref="System.IO.IOException"/>
private static IList<string> ReadSplit(FixedLengthInputFormat format, InputSplit split, JobConf job)
{
    IList<string> records = new AList<string>();
    RecordReader<LongWritable, BytesWritable> splitReader = format.GetRecordReader(split, job, voidReporter);
    LongWritable offsetKey = splitReader.CreateKey();
    BytesWritable payload = splitReader.CreateValue();
    try
    {
        while (splitReader.Next(offsetKey, payload))
        {
            // Only the first GetLength() bytes of the backing array are
            // converted.
            byte[] raw = payload.GetBytes();
            int usedLength = payload.GetLength();
            string text = Sharpen.Runtime.GetStringForBytes(raw, 0, usedLength);
            records.AddItem(text);
        }
    }
    finally
    {
        splitReader.Close();
    }
    return records;
}
/// <summary>
/// Feeds every record from <paramref name="input"/> to the mapper, then closes
/// the mapper whether or not mapping succeeded.
/// </summary>
/// <param name="input">source of key/value records</param>
/// <param name="output">collector handed to each map call</param>
/// <param name="reporter">reporter handed to each map call and used for the processed-record counter</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter)
{
    try
    {
        // One key and one value object are reused for every record.
        K1 currentKey = input.CreateKey();
        V1 currentValue = input.CreateValue();
        while (true)
        {
            if (!input.Next(currentKey, currentValue))
            {
                break;
            }
            // Hand the pair to the mapper.
            mapper.Map(currentKey, currentValue, output, reporter);
            if (incrProcCount)
            {
                reporter.IncrCounter(
                    SkipBadRecords.CounterGroup,
                    SkipBadRecords.CounterMapProcessedRecords,
                    1);
            }
        }
    }
    finally
    {
        mapper.Close();
    }
}
/// <summary>
/// Attempts to read the next key/value pair from the wrapped reader into the
/// head of this object, then reports the result of <c>HasNext()</c>.
/// </summary>
/// <returns>the value of <c>HasNext()</c> after the read attempt</returns>
/// <exception cref="System.IO.IOException"/>
protected internal virtual bool Next()
{
    bool readOne = rr.Next(khead, vhead);
    // The head is marked empty when the wrapped reader returned nothing.
    empty = !readOne;
    return HasNext();
}
// A random task attempt id for testing.
/// <summary>
/// Writes random int/double records through
/// <c>SequenceFileAsBinaryOutputFormat</c> as raw bytes, reads them back with
/// <c>SequenceFileInputFormat</c>, and checks each record against the same
/// random sequence replayed from the saved seed.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestBinary()
{
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.GetLocal(job);

    // Output goes to <test.build.data>/<temp dir name>/_<attempt>/testbinary.seq
    Path dataRoot = new Path(Runtime.GetProperty("test.build.data", "."));
    Path tempRoot = new Path(dataRoot, FileOutputCommitter.TempDirName);
    Path dir = new Path(tempRoot, "_" + attempt);
    Path file = new Path(dir, "testbinary.seq");

    Random rng = new Random();
    long seed = rng.NextLong();
    rng.SetSeed(seed);

    fs.Delete(dir, true);
    if (!fs.Mkdirs(dir))
    {
        Fail("Failed to create output directory");
    }

    job.Set(JobContext.TaskAttemptId, attempt);
    FileOutputFormat.SetOutputPath(job, dir.GetParent().GetParent());
    FileOutputFormat.SetWorkOutputPath(job, dir);
    SequenceFileAsBinaryOutputFormat.SetSequenceFileOutputKeyClass(job, typeof(IntWritable));
    SequenceFileAsBinaryOutputFormat.SetSequenceFileOutputValueClass(job, typeof(DoubleWritable));
    SequenceFileAsBinaryOutputFormat.SetCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.SetOutputCompressionType(job, SequenceFile.CompressionType.Block);

    BytesWritable rawKey = new BytesWritable();
    BytesWritable rawValue = new BytesWritable();
    RecordWriter<BytesWritable, BytesWritable> writer =
        new SequenceFileAsBinaryOutputFormat().GetRecordWriter(fs, job, file.ToString(), Reporter.Null);
    IntWritable intHolder = new IntWritable();
    DoubleWritable doubleHolder = new DoubleWritable();
    DataOutputBuffer scratch = new DataOutputBuffer();

    Log.Info("Creating data by SequenceFileAsBinaryOutputFormat");
    try
    {
        for (int written = 0; written < Records; ++written)
        {
            // Serialize a random int into the raw key bytes.
            intHolder = new IntWritable(rng.Next());
            intHolder.Write(scratch);
            rawKey.Set(scratch.GetData(), 0, scratch.GetLength());
            scratch.Reset();

            // Serialize a random double into the raw value bytes.
            doubleHolder = new DoubleWritable(rng.NextDouble());
            doubleHolder.Write(scratch);
            rawValue.Set(scratch.GetData(), 0, scratch.GetLength());
            scratch.Reset();

            writer.Write(rawKey, rawValue);
        }
    }
    finally
    {
        writer.Close(Reporter.Null);
    }

    InputFormat<IntWritable, DoubleWritable> typedFormat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int matched = 0;
    // Rewind the generator so the expected values can be regenerated in order.
    rng.SetSeed(seed);
    DataInputBuffer buf = new DataInputBuffer();
    int NumSplits = 3;
    SequenceFileInputFormat.AddInputPath(job, file);

    Log.Info("Reading data by SequenceFileInputFormat");
    InputSplit[] readSplits = typedFormat.GetSplits(job, NumSplits);
    foreach (InputSplit split in readSplits)
    {
        RecordReader<IntWritable, DoubleWritable> typedReader =
            typedFormat.GetRecordReader(split, job, Reporter.Null);
        try
        {
            while (typedReader.Next(intHolder, doubleHolder))
            {
                int expectedInt = rng.Next();
                double expectedDouble = rng.NextDouble();
                NUnit.Framework.Assert.AreEqual(
                    "Keys don't match: " + "*" + intHolder.Get() + ":" + expectedInt + "*",
                    expectedInt, intHolder.Get());
                NUnit.Framework.Assert.IsTrue(
                    "Vals don't match: " + "*" + doubleHolder.Get() + ":" + expectedDouble + "*",
                    double.Compare(doubleHolder.Get(), expectedDouble) == 0);
                ++matched;
            }
        }
        finally
        {
            typedReader.Close();
        }
    }
    NUnit.Framework.Assert.AreEqual("Some records not found", Records, matched);
}
/// <summary>
/// Runs a fixed number of randomized round-trips through
/// <c>FixedLengthInputFormat</c>: each round creates a file of fixed-length
/// records, splits it into a varying number of splits, reads every split and
/// compares keys, record lengths and record contents against what was written.
/// </summary>
/// <remarks>
/// Round 8 forces an empty file and round 10 forces a record length of 1. The
/// seed is logged so a failing run can be reproduced.
/// </remarks>
/// <param name="codec">codec used to create the test file, or null for no codec</param>
/// <exception cref="System.IO.IOException"/>
private void RunRandomTests(CompressionCodec codec)
{
    StringBuilder fileName = new StringBuilder("testFormat.txt");
    if (codec != null)
    {
        fileName.Append(".gz");
    }
    localFs.Delete(workDir, true);
    Path file = new Path(workDir, fileName.ToString());
    int seed = new Random().Next();
    Log.Info("Seed = " + seed);
    Random random = new Random(seed);
    int MaxTests = 20;
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < MaxTests; i++)
    {
        Log.Info("----------------------------------------------------------");
        // Maximum total records of 999
        int totalRecords = random.Next(999) + 1;
        // Test an empty file
        if (i == 8)
        {
            totalRecords = 0;
        }
        // Maximum bytes in a record of 100K
        int recordLength = random.Next(1024 * 100) + 1;
        // For the 11th test, force a record length of 1
        if (i == 10)
        {
            recordLength = 1;
        }
        // The total bytes in the test file
        int fileSize = (totalRecords * recordLength);
        Log.Info("totalRecords=" + totalRecords + " recordLength=" + recordLength);
        // Create the job
        JobConf job = new JobConf(defaultConf);
        if (codec != null)
        {
            ReflectionUtils.SetConf(codec, job);
        }
        // Create the test file
        AList<string> recordList = CreateFile(file, codec, recordLength, totalRecords);
        NUnit.Framework.Assert.IsTrue(localFs.Exists(file));
        // set the fixed length record length config property for the job
        FixedLengthInputFormat.SetRecordLength(job, recordLength);
        int numSplits = 1;
        // Arbitrarily set number of splits.
        if (i > 0)
        {
            if (i == (MaxTests - 1))
            {
                // Test a split size that is less than record len.
                // Integer arithmetic replaces Math.Floor on an int argument,
                // and the divisor is clamped to 1 because half of a
                // one-byte record length is 0.
                numSplits = fileSize / Math.Max(1, recordLength / 2);
            }
            else
            {
                if (MaxTests % i == 0)
                {
                    // Let us create a split size that is forced to be
                    // smaller than the end file itself, (ensures 1+ splits)
                    numSplits = fileSize / (fileSize - random.Next(fileSize));
                }
                else
                {
                    // Just pick a random split size with no upper bound.
                    // The random divisor is clamped to 1 so a drawn 0 cannot
                    // cause a division by zero.
                    numSplits = Math.Max(1, fileSize / Math.Max(1, random.Next(int.MaxValue)));
                }
            }
            Log.Info("Number of splits set to: " + numSplits);
        }
        // Setup the input path
        FileInputFormat.SetInputPaths(job, workDir);
        // Try splitting the file in a variety of sizes
        FixedLengthInputFormat format = new FixedLengthInputFormat();
        format.Configure(job);
        InputSplit[] splits = format.GetSplits(job, numSplits);
        Log.Info("Actual number of splits = " + splits.Length);
        // Test combined split lengths = total file size
        long recordOffset = 0;
        int recordNumber = 0;
        foreach (InputSplit split in splits)
        {
            RecordReader<LongWritable, BytesWritable> reader = format.GetRecordReader(split, job, voidReporter);
            try
            {
                Type clazz = reader.GetType();
                NUnit.Framework.Assert.AreEqual("RecordReader class should be FixedLengthRecordReader:",
                    typeof(FixedLengthRecordReader), clazz);
                // Plow through the records in this split
                while (reader.Next(key, value))
                {
                    NUnit.Framework.Assert.AreEqual("Checking key", (long)(recordNumber * recordLength), key.Get());
                    string valueString = Sharpen.Runtime.GetStringForBytes(value.GetBytes(), 0, value.GetLength());
                    NUnit.Framework.Assert.AreEqual("Checking record length:", recordLength, value.GetLength());
                    NUnit.Framework.Assert.IsTrue("Checking for more records than expected:",
                        recordNumber < totalRecords);
                    string origRecord = recordList[recordNumber];
                    NUnit.Framework.Assert.AreEqual("Checking record content:", origRecord, valueString);
                    recordNumber++;
                }
            }
            finally
            {
                // Close the reader even when an assertion above fails.
                reader.Close();
            }
        }
        NUnit.Framework.Assert.AreEqual("Total original records should be total read records:",
            recordList.Count, recordNumber);
    }
}
/// <summary>
/// For files of increasing length, writes tab-separated "2*i, i" lines,
/// splits the file several ways with <c>KeyValueTextInputFormat</c>, and checks
/// that every value appears in exactly one split.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf();
    Path file = new Path(workDir, "test.txt");
    // Reporter that ignores everything reported to it.
    Reporter nullReporter = Reporter.Null;

    int seed = new Random().Next();
    Log.Info("seed = " + seed);
    Random random = new Random(seed);

    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);

    // Step through a range of file lengths with random increments.
    int length = 0;
    while (length < MaxLength)
    {
        Log.Debug("creating; entries = " + length);

        // Write `length` lines of the form "<2*i>\t<i>\n".
        TextWriter output = new OutputStreamWriter(localFs.Create(file));
        try
        {
            for (int line = 0; line < length; line++)
            {
                output.Write(Sharpen.Extensions.ToString(line * 2));
                output.Write("\t");
                output.Write(Sharpen.Extensions.ToString(line));
                output.Write("\n");
            }
        }
        finally
        {
            output.Close();
        }

        // Split the same file three times with different requested counts.
        KeyValueTextInputFormat kvFormat = new KeyValueTextInputFormat();
        kvFormat.Configure(job);
        for (int attemptNo = 0; attemptNo < 3; attemptNo++)
        {
            int requested = random.Next(MaxLength / 20) + 1;
            Log.Debug("splitting: requesting = " + requested);
            InputSplit[] splits = kvFormat.GetSplits(job, requested);
            Log.Debug("splitting: got = " + splits.Length);

            // One bit per expected value; set when the value is read.
            BitSet seen = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<Text, Text> reader = kvFormat.GetRecordReader(splits[j], job, nullReporter);
                Type actualReaderType = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is KeyValueLineRecordReader.",
                    typeof(KeyValueLineRecordReader), actualReaderType);

                Text key = reader.CreateKey();
                Type actualKeyType = key.GetType();
                Text value = reader.CreateValue();
                Type actualValueType = value.GetType();
                NUnit.Framework.Assert.AreEqual("Key class is Text.", typeof(Text), actualKeyType);
                NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), actualValueType);

                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        if (bitsAlreadySet(seen, v))
                        {
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", seen.Get(v));
                        seen.Set(v);
                        count++;
                    }
                    Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                }
                finally
                {
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, seen.Cardinality());
        }

        length += random.Next(MaxLength / 10) + 1;
    }

    // Local helper: whether value v has already been recorded.
    bool bitsAlreadySet(BitSet bits, int v)
    {
        return bits.Get(v);
    }
}
/// <summary>
/// Writes random base-36 text records to a sequence file, reads them back as
/// raw bytes through <c>SequenceFileAsBinaryInputFormat</c>, deserializes the
/// bytes into <c>Text</c>, and compares them with the same random sequence
/// replayed from the saved seed.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestBinary()
{
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.GetLocal(job);
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "testbinary.seq");

    Random rng = new Random();
    long seed = rng.NextLong();
    rng.SetSeed(seed);

    fs.Delete(dir, true);
    FileInputFormat.SetInputPaths(job, dir);

    Text expectedKey = new Text();
    Text expectedValue = new Text();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, file, typeof(Text), typeof(Text));
    try
    {
        for (int written = 0; written < Records; ++written)
        {
            expectedKey.Set(Sharpen.Extensions.ToString(rng.Next(), 36));
            expectedValue.Set(System.Convert.ToString(rng.NextLong(), 36));
            writer.Append(expectedKey, expectedValue);
        }
    }
    finally
    {
        writer.Close();
    }

    InputFormat<BytesWritable, BytesWritable> binaryFormat = new SequenceFileAsBinaryInputFormat();
    int matched = 0;
    // Rewind the generator so the expected records can be regenerated in order.
    rng.SetSeed(seed);
    BytesWritable rawKey = new BytesWritable();
    BytesWritable rawValue = new BytesWritable();
    Text decodedKey = new Text();
    Text decodedValue = new Text();
    DataInputBuffer decoder = new DataInputBuffer();
    int NumSplits = 3;
    FileInputFormat.SetInputPaths(job, file);

    InputSplit[] readSplits = binaryFormat.GetSplits(job, NumSplits);
    foreach (InputSplit split in readSplits)
    {
        RecordReader<BytesWritable, BytesWritable> binaryReader =
            binaryFormat.GetRecordReader(split, job, Reporter.Null);
        try
        {
            while (binaryReader.Next(rawKey, rawValue))
            {
                // Regenerate what was written for this record.
                expectedKey.Set(Sharpen.Extensions.ToString(rng.Next(), 36));
                expectedValue.Set(System.Convert.ToString(rng.NextLong(), 36));

                // Decode the raw bytes back into Text.
                decoder.Reset(rawKey.GetBytes(), rawKey.GetLength());
                decodedKey.ReadFields(decoder);
                decoder.Reset(rawValue.GetBytes(), rawValue.GetLength());
                decodedValue.ReadFields(decoder);

                NUnit.Framework.Assert.IsTrue(
                    "Keys don't match: " + "*" + decodedKey.ToString() + ":" + expectedKey.ToString() + "*",
                    decodedKey.ToString().Equals(expectedKey.ToString()));
                NUnit.Framework.Assert.IsTrue(
                    "Vals don't match: " + "*" + decodedValue.ToString() + ":" + expectedValue.ToString() + "*",
                    decodedValue.ToString().Equals(expectedValue.ToString()));
                ++matched;
            }
        }
        finally
        {
            binaryReader.Close();
        }
    }
    NUnit.Framework.Assert.AreEqual("Some records not found", Records, matched);
}
/// <summary>
/// Writes numbered lines through the BZip2 codec, splits the compressed file
/// several ways with <c>TextInputFormat</c>, and checks that every line value
/// is read from exactly one split.
/// </summary>
/// <exception cref="System.IO.IOException">
/// when the codec class cannot be loaded, or on file system errors
/// </exception>
public virtual void TestSplitableCodecs()
{
    JobConf conf = new JobConf(defaultConf);
    int seed = new Random().Next();
    // Create the codec
    CompressionCodec codec = null;
    try
    {
        codec = (CompressionCodec)ReflectionUtils.NewInstance(
            conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    }
    catch (TypeLoadException e)
    {
        // Keep the load failure as the cause so the reason is not lost.
        throw new IOException("Illegal codec!", e);
    }
    Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    FileSystem localFs = FileSystem.GetLocal(conf);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(conf, workDir);
    int MaxLength = 500000;
    // for a variety of lengths
    for (int length = MaxLength / 2; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
    {
        Log.Info("creating; entries = " + length);
        // create a file with length entries
        TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create(file)));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        TextInputFormat format = new TextInputFormat();
        format.Configure(conf);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            int numSplits = random.Next(MaxLength / 2000) + 1;
            Log.Info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(conf, numSplits);
            Log.Info("splitting: got = " + splits.Length);
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], conf, reporter);
                try
                {
                    int counter = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        if (bits.Get(v))
                        {
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        counter++;
                    }
                    // Splits that produced records are logged at info level,
                    // empty ones only at debug level.
                    if (counter > 0)
                    {
                        Log.Info("splits[" + j + "]=" + splits[j] + " count=" + counter);
                    }
                    else
                    {
                        Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Dispatches one map call per input record to the executor service, waits
/// for all dispatched calls to finish, and finally closes the mapper.
/// </summary>
/// <remarks>
/// <c>CheckForExceptionsFromProcessingThreads()</c> is called after every
/// dispatch, on every wait iteration, and once more after termination. An
/// <c>IOException</c> raised while waiting forces an immediate executor
/// shutdown and is rethrown; any other exception raised while waiting is
/// wrapped in a <c>RuntimeException</c>.
/// </remarks>
/// <param name="input">source of key/value records</param>
/// <param name="output">collector handed to each map invocation</param>
/// <param name="reporter">reporter handed to each map invocation</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output , Reporter reporter)
{
    try
    {
        // allocate key & value instances these objects will not be reused
        // because execution of Mapper.map is not serialized.
        K1 key = input.CreateKey();
        V1 value = input.CreateValue();
        while (input.Next(key, value))
        {
            executorService.Execute(new MultithreadedMapRunner.MapperInvokeRunable(this, key, value, output, reporter));
            CheckForExceptionsFromProcessingThreads();
            // Allocate new key & value instances as mapper is running in parallel
            key = input.CreateKey();
            value = input.CreateValue();
        }
        if (Log.IsDebugEnabled())
        {
            Log.Debug("Finished dispatching all Mappper.map calls, job " + job.GetJobName());
        }
        // Graceful shutdown of the thread pool; it lets all scheduled
        // Runnables run to completion.
        executorService.Shutdown();
        try
        {
            // Now waiting for all Runnables to end, polling every 100 ms.
            while (!executorService.AwaitTermination(100, TimeUnit.Milliseconds))
            {
                if (Log.IsDebugEnabled())
                {
                    Log.Debug("Awaiting all running Mappper.map calls to finish, job " + job.GetJobName ());
                }
                // NOTE: while Mapper.map dispatching has concluded there are still
                // map calls in progress and exceptions would be thrown.
                CheckForExceptionsFromProcessingThreads();
            }
            // NOTE: it could be that a map call has had an exception after the
            // call for awaitTermination() returning true. An edge case but it
            // could happen.
            CheckForExceptionsFromProcessingThreads();
        }
        catch (IOException ioEx)
        {
            // Forcing a shutdown of all threads of the thread pool and rethrowing
            // the IOException
            executorService.ShutdownNow();
            throw;
        }
        catch (Exception iEx)
        {
            // Anything else raised while waiting is rethrown unchecked with
            // the original exception as its cause.
            throw new RuntimeException(iEx);
        }
    }
    finally
    {
        mapper.Close();
    }
}
// A reporter that does nothing
/// <summary>
/// Checks that <c>CombineTextInputFormat</c> returns a single
/// <c>CombineFileSplit</c> for the generated input files and that reading it
/// yields every expected value exactly once.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(defaultConf);

    Random random = new Random();
    long seed = random.NextLong();
    Log.Info("seed = " + seed);
    random.SetSeed(seed);

    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);

    int valueCount = 10000;
    int fileCount = 10;
    // Populate the work directory with the input files.
    CreateFiles(valueCount, fileCount, random);

    // Build the combining format and the reusable key/value holders.
    CombineTextInputFormat combiner = new CombineTextInputFormat();
    LongWritable offsetKey = new LongWritable();
    Text lineValue = new Text();

    for (int round = 0; round < 3; round++)
    {
        int requested = random.Next(valueCount / 20) + 1;
        Log.Info("splitting: requesting = " + requested);
        InputSplit[] produced = combiner.GetSplits(job, requested);
        Log.Info("splitting: got = " + produced.Length);

        // The total length is comfortably below the block size, so exactly
        // one split is expected regardless of the requested count.
        NUnit.Framework.Assert.AreEqual("We got more than one splits!", 1, produced.Length);
        InputSplit only = produced[0];
        NUnit.Framework.Assert.AreEqual("It should be CombineFileSplit", typeof(CombineFileSplit), only.GetType());

        // One bit per expected value; set when the value is read.
        BitSet seen = new BitSet(valueCount);
        Log.Debug("split= " + only);
        RecordReader<LongWritable, Text> lineReader = combiner.GetRecordReader(only, job, voidReporter);
        try
        {
            int readCount = 0;
            while (lineReader.Next(offsetKey, lineValue))
            {
                int parsed = System.Convert.ToInt32(lineValue.ToString());
                Log.Debug("read " + parsed);
                if (seen.Get(parsed))
                {
                    Log.Warn("conflict with " + parsed + " at position " + lineReader.GetPos());
                }
                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", seen.Get(parsed));
                seen.Set(parsed);
                readCount++;
            }
            Log.Info("splits=" + only + " count=" + readCount);
        }
        finally
        {
            lineReader.Close();
        }
        NUnit.Framework.Assert.AreEqual("Some keys in no partition.", valueCount, seen.Cardinality());
    }
}
/// <summary>Forwards the read to the wrapped reader.</summary>
/// <param name="key">key object passed through to the wrapped reader</param>
/// <param name="value">value object passed through to the wrapped reader</param>
/// <returns>whatever the wrapped reader's <c>Next</c> returns</returns>
/// <exception cref="System.IO.IOException"/>
public virtual bool Next(K key, V value)
{
    bool advanced = delegate_.Next(key, value);
    return advanced;
}
/// <summary>
/// For sequence files of increasing length, writes (i, 10*i) records, splits
/// the file several ways with <c>SequenceFileAsTextInputFormat</c>, and checks
/// that every key is read from exactly one split.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.GetLocal(conf);
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    Reporter nullReporter = Reporter.Null;

    // The seed is not logged by this test.
    int seed = new Random().Next();
    Random random = new Random(seed);

    fs.Delete(dir, true);
    FileInputFormat.SetInputPaths(job, dir);

    // Step through a range of file lengths with random increments.
    int length = 0;
    while (length < MaxLength)
    {
        // Write `length` records: key i, value 10 * i.
        SequenceFile.Writer writer = SequenceFile.CreateWriter(
            fs, conf, file, typeof(IntWritable), typeof(LongWritable));
        try
        {
            for (int n = 0; n < length; n++)
            {
                IntWritable outKey = new IntWritable(n);
                LongWritable outValue = new LongWritable(10 * n);
                writer.Append(outKey, outValue);
            }
        }
        finally
        {
            writer.Close();
        }

        // Split the same file three times with different requested counts.
        InputFormat<Text, Text> textFormat = new SequenceFileAsTextInputFormat();
        for (int attemptNo = 0; attemptNo < 3; attemptNo++)
        {
            int requested = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
            InputSplit[] splits = textFormat.GetSplits(job, requested);

            // One bit per expected key; set when the key is read.
            BitSet seen = new BitSet(length);
            foreach (InputSplit split in splits)
            {
                RecordReader<Text, Text> reader = textFormat.GetRecordReader(split, job, nullReporter);
                Type actualReaderType = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is SequenceFileAsTextRecordReader.",
                    typeof(SequenceFileAsTextRecordReader), actualReaderType);

                Text value = reader.CreateValue();
                Text key = reader.CreateKey();
                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        int keyInt = System.Convert.ToInt32(key.ToString());
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", seen.Get(keyInt));
                        seen.Set(keyInt);
                        count++;
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, seen.Cardinality());
        }

        length += random.Next(MaxLength / 10) + 1;
    }
}