/// <summary>
/// Drives NLineInputFormat (new API) over the given job and verifies each split:
/// no host locations, a LineRecordReader is produced, and the line count per
/// split equals expectedN for every split except the last, which must hold lastN.
/// </summary>
/// <param name="job">configured job whose input paths NLineInputFormat will split</param>
/// <param name="expectedN">lines expected in every non-final split</param>
/// <param name="lastN">lines expected in the final split</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
internal virtual void CheckFormat(Job job, int expectedN, int lastN)
{
    NLineInputFormat format = new NLineInputFormat();
    IList<InputSplit> splits = format.GetSplits(job);
    int lineCount = 0;
    for (int splitIndex = 0; splitIndex < splits.Count; splitIndex++)
    {
        // Local-filesystem splits are expected to carry no host locations.
        NUnit.Framework.Assert.AreEqual("There are no split locations", 0, splits[splitIndex].GetLocations().Length);
        TaskAttemptContext attemptContext = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
        RecordReader<LongWritable, Text> lineReader = format.CreateRecordReader(splits[splitIndex], attemptContext);
        Type readerType = lineReader.GetType();
        NUnit.Framework.Assert.AreEqual("reader class is LineRecordReader.", typeof(LineRecordReader), readerType);
        MapContext<LongWritable, Text, LongWritable, Text> mapContext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.GetConfiguration(), attemptContext.GetTaskAttemptID(), lineReader, null, null, MapReduceTestUtil.CreateDummyReporter(), splits[splitIndex]);
        lineReader.Initialize(splits[splitIndex], mapContext);
        try
        {
            // Count how many key/value records this split yields.
            lineCount = 0;
            while (lineReader.NextKeyValue())
            {
                lineCount++;
            }
        }
        finally
        {
            lineReader.Close();
        }
        // The trailing split may legitimately hold a different number of lines.
        int wanted = (splitIndex == splits.Count - 1) ? lastN : expectedN;
        NUnit.Framework.Assert.AreEqual("number of lines in split(" + splitIndex + ") is wrong", wanted, lineCount);
    }
}
/// <summary>
/// Drives NLineInputFormat (old mapred API) over the given job and verifies,
/// for every split except the last, the split locations, the reader/key/value
/// classes, and that exactly expectedN lines are read.
/// </summary>
/// <param name="job">configured JobConf whose input paths will be split</param>
/// <param name="expectedN">lines expected in each checked split</param>
/// <exception cref="System.IO.IOException"/>
internal virtual void CheckFormat(JobConf job, int expectedN)
{
    NLineInputFormat format = new NLineInputFormat();
    format.Configure(job);
    // The old API requires a split-count hint; NLineInputFormat ignores it.
    int ignoredNumSplits = 1;
    InputSplit[] splits = format.GetSplits(job, ignoredNumSplits);
    // check all splits except last one
    int lineCount = 0;
    for (int splitIdx = 0; splitIdx < splits.Length - 1; splitIdx++)
    {
        NUnit.Framework.Assert.AreEqual("There are no split locations", 0, splits[splitIdx].GetLocations().Length);
        RecordReader<LongWritable, Text> lineReader = format.GetRecordReader(splits[splitIdx], job, voidReporter);
        NUnit.Framework.Assert.AreEqual("reader class is LineRecordReader.", typeof(LineRecordReader), lineReader.GetType());
        LongWritable key = lineReader.CreateKey();
        NUnit.Framework.Assert.AreEqual("Key class is LongWritable.", typeof(LongWritable), key.GetType());
        Text value = lineReader.CreateValue();
        NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), value.GetType());
        try
        {
            // Count the records the reader yields for this split.
            lineCount = 0;
            while (lineReader.Next(key, value))
            {
                lineCount++;
            }
        }
        finally
        {
            lineReader.Close();
        }
        NUnit.Framework.Assert.AreEqual("number of lines in split is " + expectedN, expectedN, lineCount);
    }
}
/// <summary>
/// Verifies CombineTextInputFormat (new API): several small files should be
/// combined into a single CombineFileSplit, and reading that split must yield
/// every key exactly once (tracked via a BitSet over all written values).
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    Job job = Job.GetInstance(new Configuration(defaultConf));
    Random random = new Random();
    long seed = random.NextLong();
    // Log the seed so a failing run can be reproduced.
    Log.Info("seed = " + seed);
    random.SetSeed(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    int length = 10000;
    int numFiles = 10;
    // create files with various lengths
    CreateFiles(length, numFiles, random);
    // create a combined split for the files
    CombineTextInputFormat format = new CombineTextInputFormat();
    for (int i = 0; i < 3; i++)
    {
        // NOTE(review): numSplits is only logged; the new-API GetSplits(job)
        // takes no split-count hint, so the requested value has no effect.
        int numSplits = random.Next(length / 20) + 1;
        Log.Info("splitting: requesting = " + numSplits);
        IList<InputSplit> splits = format.GetSplits(job);
        Log.Info("splitting: got = " + splits.Count);
        // we should have a single split as the length is comfortably smaller than
        // the block size
        NUnit.Framework.Assert.AreEqual("We got more than one splits!", 1, splits.Count);
        InputSplit split = splits[0];
        NUnit.Framework.Assert.AreEqual("It should be CombineFileSplit", typeof(CombineFileSplit), split.GetType());
        // check the split: one bit per expected integer value
        BitSet bits = new BitSet(length);
        Log.Debug("split= " + split);
        TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
        RecordReader<LongWritable, Text> reader = format.CreateRecordReader(split, context);
        NUnit.Framework.Assert.AreEqual("reader class is CombineFileRecordReader.", typeof(CombineFileRecordReader), reader.GetType());
        MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null, MapReduceTestUtil.CreateDummyReporter(), split);
        reader.Initialize(split, mcontext);
        try
        {
            int count = 0;
            while (reader.NextKeyValue())
            {
                LongWritable key = reader.GetCurrentKey();
                NUnit.Framework.Assert.IsNotNull("Key should not be null.", key);
                Text value = reader.GetCurrentValue();
                // Each line's value is a distinct integer; mark it as seen.
                int v = System.Convert.ToInt32(value.ToString());
                Log.Debug("read " + v);
                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                bits.Set(v);
                count++;
            }
            Log.Debug("split=" + split + " count=" + count);
        }
        finally
        {
            reader.Close();
        }
        // Every one of the 'length' values must have been read exactly once.
        NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
    }
}
/// <summary>
/// Verifies KeyValueTextInputFormat (old mapred API): writes files of "2i\ti"
/// lines at a variety of lengths, splits each file several ways, and checks
/// that the splits together cover every record exactly once.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf();
    Path file = new Path(workDir, "test.txt");
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    int seed = new Random().Next();
    // Log the seed so a failing run can be reproduced.
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        Log.Debug("creating; entries = " + length);
        // create a file with length entries: key = 2*i, tab, value = i
        TextWriter writer = new OutputStreamWriter(localFs.Create(file));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i * 2));
                writer.Write("\t");
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        format.Configure(job);
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            int numSplits = random.Next(MaxLength / 20) + 1;
            Log.Debug("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(job, numSplits);
            Log.Debug("splitting: got = " + splits.Length);
            // check each split; one bit per expected value
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<Text, Text> reader = format.GetRecordReader(splits[j], job, reporter);
                Type readerClass = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is KeyValueLineRecordReader.", typeof(KeyValueLineRecordReader), readerClass);
                Text key = reader.CreateKey();
                Type keyClass = key.GetType();
                Text value = reader.CreateValue();
                Type valueClass = value.GetType();
                NUnit.Framework.Assert.AreEqual("Key class is Text.", typeof(Text), keyClass);
                NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), valueClass);
                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        // Warn with position info before the assert fires, to aid debugging.
                        if (bits.Get(v))
                        {
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        count++;
                    }
                    Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                }
                finally
                {
                    reader.Close();
                }
            }
            // All values must be covered by exactly one split.
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Runs a batch of randomized FixedLengthInputFormat round-trips: writes a file
/// of fixed-length records (optionally compressed), splits it with varying
/// split-count hints, and verifies keys (byte offsets), record lengths, and
/// record contents against the generated data.
/// </summary>
/// <param name="codec">compression codec to apply, or null for plain text</param>
/// <exception cref="System.IO.IOException"/>
private void RunRandomTests(CompressionCodec codec)
{
    StringBuilder fileName = new StringBuilder("testFormat.txt");
    if (codec != null)
    {
        fileName.Append(".gz");
    }
    localFs.Delete(workDir, true);
    Path file = new Path(workDir, fileName.ToString());
    // Log the seed so a failing run can be reproduced.
    int seed = new Random().Next();
    Log.Info("Seed = " + seed);
    Random random = new Random(seed);
    int MaxTests = 20;
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < MaxTests; i++)
    {
        Log.Info("----------------------------------------------------------");
        // Maximum total records of 999
        int totalRecords = random.Next(999) + 1;
        // Test an empty file
        if (i == 8)
        {
            totalRecords = 0;
        }
        // Maximum bytes in a record of 100K
        int recordLength = random.Next(1024 * 100) + 1;
        // For the 11th test, force a record length of 1
        if (i == 10)
        {
            recordLength = 1;
        }
        // The total bytes in the test file
        int fileSize = (totalRecords * recordLength);
        Log.Info("totalRecords=" + totalRecords + " recordLength=" + recordLength);
        // Create the job
        JobConf job = new JobConf(defaultConf);
        if (codec != null)
        {
            ReflectionUtils.SetConf(codec, job);
        }
        // Create the test file
        AList<string> recordList = CreateFile(file, codec, recordLength, totalRecords);
        NUnit.Framework.Assert.IsTrue(localFs.Exists(file));
        //set the fixed length record length config property for the job
        FixedLengthInputFormat.SetRecordLength(job, recordLength);
        int numSplits = 1;
        // Arbitrarily set number of splits.
        if (i > 0)
        {
            if (i == (MaxTests - 1))
            {
                // Test a split size that is less than record len
                // NOTE(review): recordLength / 2 is integer division; Math.Floor is a
                // no-op here, and if recordLength were 1 this would divide by zero —
                // presumably relies on the random range making that unlikely; verify.
                numSplits = (int)(fileSize / Math.Floor(recordLength / 2));
            }
            else
            {
                if (MaxTests % i == 0)
                {
                    // Let us create a split size that is forced to be
                    // smaller than the end file itself, (ensures 1+ splits)
                    numSplits = fileSize / (fileSize - random.Next(fileSize));
                }
                else
                {
                    // Just pick a random split size with no upper bound
                    // NOTE(review): random.Next(int.MaxValue) can return 0, which would
                    // divide by zero — astronomically unlikely but possible; verify.
                    numSplits = Math.Max(1, fileSize / random.Next(int.MaxValue));
                }
            }
            Log.Info("Number of splits set to: " + numSplits);
        }
        // Setup the input path
        FileInputFormat.SetInputPaths(job, workDir);
        // Try splitting the file in a variety of sizes
        FixedLengthInputFormat format = new FixedLengthInputFormat();
        format.Configure(job);
        InputSplit[] splits = format.GetSplits(job, numSplits);
        Log.Info("Actual number of splits = " + splits.Length);
        // Test combined split lengths = total file size
        // NOTE(review): recordOffset is never used below; recordNumber alone
        // tracks progress across splits.
        long recordOffset = 0;
        int recordNumber = 0;
        foreach (InputSplit split in splits)
        {
            RecordReader<LongWritable, BytesWritable> reader = format.GetRecordReader(split, job, voidReporter);
            Type clazz = reader.GetType();
            NUnit.Framework.Assert.AreEqual("RecordReader class should be FixedLengthRecordReader:", typeof(FixedLengthRecordReader), clazz);
            // Plow through the records in this split
            while (reader.Next(key, value))
            {
                // Key must be the byte offset of the record within the file.
                NUnit.Framework.Assert.AreEqual("Checking key", (long)(recordNumber * recordLength), key.Get());
                string valueString = Sharpen.Runtime.GetStringForBytes(value.GetBytes(), 0, value.GetLength());
                NUnit.Framework.Assert.AreEqual("Checking record length:", recordLength, value.GetLength());
                NUnit.Framework.Assert.IsTrue("Checking for more records than expected:", recordNumber < totalRecords);
                // Compare the payload against the record originally generated.
                string origRecord = recordList[recordNumber];
                NUnit.Framework.Assert.AreEqual("Checking record content:", origRecord, valueString);
                recordNumber++;
            }
            reader.Close();
        }
        NUnit.Framework.Assert.AreEqual("Total original records should be total read records:", recordList.Count, recordNumber);
    }
}
/// <summary>
/// Verifies KeyValueTextInputFormat (new API): writes files of "2i\ti" lines at
/// a variety of lengths, splits each several ways, and checks key/value classes,
/// the key/value relationship (key even, value == key/2), and that all records
/// are covered exactly once across splits.
/// </summary>
public virtual void TestFormat()
{
    Job job = Job.GetInstance(new Configuration(defaultConf));
    Path file = new Path(workDir, "test.txt");
    // Log the seed so a failing run can be reproduced.
    int seed = new Random().Next();
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    int MaxLength = 10000;
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        Log.Debug("creating; entries = " + length);
        // create a file with length entries: key = 2*i, tab, value = i
        TextWriter writer = new OutputStreamWriter(localFs.Create(file));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i * 2));
                writer.Write("\t");
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            // NOTE(review): numSplits is only logged; the new-API GetSplits(job)
            // takes no split-count hint, so the requested value has no effect.
            int numSplits = random.Next(MaxLength / 20) + 1;
            Log.Debug("splitting: requesting = " + numSplits);
            IList<InputSplit> splits = format.GetSplits(job);
            Log.Debug("splitting: got = " + splits.Count);
            // check each split; one bit per expected value
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Count; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
                RecordReader<Text, Text> reader = format.CreateRecordReader(splits[j], context);
                Type clazz = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is KeyValueLineRecordReader.", typeof(KeyValueLineRecordReader), clazz);
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null, MapReduceTestUtil.CreateDummyReporter(), splits[j]);
                reader.Initialize(splits[j], mcontext);
                Text key = null;
                Text value = null;
                try
                {
                    int count = 0;
                    while (reader.NextKeyValue())
                    {
                        key = reader.GetCurrentKey();
                        clazz = key.GetType();
                        NUnit.Framework.Assert.AreEqual("Key class is Text.", typeof(Text), clazz);
                        value = reader.GetCurrentValue();
                        clazz = value.GetType();
                        NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), clazz);
                        // Keys were written as 2*i and values as i; verify the invariant.
                        int k = System.Convert.ToInt32(key.ToString());
                        int v = System.Convert.ToInt32(value.ToString());
                        NUnit.Framework.Assert.AreEqual("Bad key", 0, k % 2);
                        NUnit.Framework.Assert.AreEqual("Mismatched key/value", k / 2, v);
                        Log.Debug("read " + v);
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        count++;
                    }
                    Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                }
                finally
                {
                    reader.Close();
                }
            }
            // All values must be covered by exactly one split.
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Verifies that KeyValueTextInputFormat treats BZip2-compressed input as
/// splittable: writes bzip2 files of "2i\ti" lines at several lengths, splits
/// them under a small max-split-size, and checks that splits cover every record
/// exactly once with the expected key/value relationship.
/// </summary>
public virtual void TestSplitableCodecs()
{
    Job job = Job.GetInstance(defaultConf);
    Configuration conf = job.GetConfiguration();
    // Create the codec
    CompressionCodec codec = null;
    try
    {
        codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    }
    catch (TypeLoadException)
    {
        throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
    // Log the seed so a failing run can be reproduced.
    int seed = new Random().Next();
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    int MaxLength = 500000;
    // Force multiple splits even for modest files.
    FileInputFormat.SetMaxInputSplitSize(job, MaxLength / 20);
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
    {
        Log.Info("creating; entries = " + length);
        // create a compressed file with length entries: key = 2*i, tab, value = i
        TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create(file)));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i * 2));
                writer.Write("\t");
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        NUnit.Framework.Assert.IsTrue("KVTIF claims not splittable", format.IsSplitable(job, file));
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            // NOTE(review): numSplits is only logged; the new-API GetSplits(job)
            // takes no split-count hint, so the requested value has no effect.
            int numSplits = random.Next(MaxLength / 2000) + 1;
            Log.Info("splitting: requesting = " + numSplits);
            IList<InputSplit> splits = format.GetSplits(job);
            Log.Info("splitting: got = " + splits.Count);
            // check each split; one bit per expected value
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Count; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
                RecordReader<Text, Text> reader = format.CreateRecordReader(splits[j], context);
                // NOTE(review): clazz is assigned but never checked or used below.
                Type clazz = reader.GetType();
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null, MapReduceTestUtil.CreateDummyReporter(), splits[j]);
                reader.Initialize(splits[j], mcontext);
                Text key = null;
                Text value = null;
                try
                {
                    int count = 0;
                    while (reader.NextKeyValue())
                    {
                        key = reader.GetCurrentKey();
                        value = reader.GetCurrentValue();
                        // Keys were written as 2*i and values as i; verify the invariant.
                        int k = System.Convert.ToInt32(key.ToString());
                        int v = System.Convert.ToInt32(value.ToString());
                        NUnit.Framework.Assert.AreEqual("Bad key", 0, k % 2);
                        NUnit.Framework.Assert.AreEqual("Mismatched key/value", k / 2, v);
                        Log.Debug("read " + k + "," + v);
                        NUnit.Framework.Assert.IsFalse(k + "," + v + " in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        count++;
                    }
                    // Log non-empty splits at INFO so coverage is visible in normal runs.
                    if (count > 0)
                    {
                        Log.Info("splits[" + j + "]=" + splits[j] + " count=" + count);
                    }
                    else
                    {
                        Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            // All values must be covered by exactly one split.
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Verifies SequenceFileAsTextInputFormat (old mapred API): writes sequence
/// files of (IntWritable i, LongWritable 10*i) pairs at several lengths, splits
/// them in varying sizes, and checks that the text-converted keys cover every
/// record exactly once across splits.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.GetLocal(conf);
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    // A reporter that does nothing.
    Reporter reporter = Reporter.Null;
    int seed = new Random().Next();
    //LOG.info("seed = "+seed);
    Random random = new Random(seed);
    fs.Delete(dir, true);
    FileInputFormat.SetInputPaths(job, dir);
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        //LOG.info("creating; entries = " + length);
        // create a file with length entries: key = i, value = 10*i
        SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, file, typeof(IntWritable), typeof(LongWritable));
        try
        {
            for (int i = 0; i < length; i++)
            {
                IntWritable key = new IntWritable(i);
                LongWritable value = new LongWritable(10 * i);
                writer.Append(key, value);
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            int numSplits = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
            //LOG.info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(job, numSplits);
            //LOG.info("splitting: got = " + splits.length);
            // check each split; one bit per expected key
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                RecordReader<Text, Text> reader = format.GetRecordReader(splits[j], job, reporter);
                Type readerClass = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is SequenceFileAsTextRecordReader.", typeof(SequenceFileAsTextRecordReader), readerClass);
                Text value = reader.CreateValue();
                Text key = reader.CreateKey();
                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        // if (bits.get(key.get())) {
                        // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
                        // LOG.info("@"+reader.getPos());
                        // }
                        // Keys arrive as the text form of the original int.
                        int keyInt = System.Convert.ToInt32(key.ToString());
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(keyInt));
                        bits.Set(keyInt);
                        count++;
                    }
                }
                finally
                {
                    //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
                    reader.Close();
                }
            }
            // All keys must be covered by exactly one split.
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Verifies SequenceFileAsTextInputFormat (new API): writes sequence files of
/// (IntWritable i, LongWritable 10*i) pairs at several lengths, caps the split
/// size to force multiple splits, and checks that the text-converted keys cover
/// every record exactly once across splits.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    Job job = Job.GetInstance(conf);
    FileSystem fs = FileSystem.GetLocal(conf);
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    int seed = new Random().Next();
    Random random = new Random(seed);
    fs.Delete(dir, true);
    FileInputFormat.SetInputPaths(job, dir);
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        // create a file with length entries: key = i, value = 10*i
        SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, file, typeof(IntWritable), typeof(LongWritable));
        try
        {
            for (int i = 0; i < length; i++)
            {
                IntWritable key = new IntWritable(i);
                LongWritable value = new LongWritable(10 * i);
                writer.Append(key, value);
            }
        }
        finally
        {
            writer.Close();
        }
        TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
        // try splitting the file in a variety of sizes
        InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            // check each split; one bit per expected key
            BitSet bits = new BitSet(length);
            // Drive the split count by shrinking the max split size accordingly.
            int numSplits = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
            FileInputFormat.SetMaxInputSplitSize(job, fs.GetFileStatus(file).GetLen() / numSplits);
            foreach (InputSplit split in format.GetSplits(job))
            {
                RecordReader<Text, Text> reader = format.CreateRecordReader(split, context);
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null, MapReduceTestUtil.CreateDummyReporter(), split);
                reader.Initialize(split, mcontext);
                Type readerClass = reader.GetType();
                NUnit.Framework.Assert.AreEqual("reader class is SequenceFileAsTextRecordReader.", typeof(SequenceFileAsTextRecordReader), readerClass);
                Text key;
                try
                {
                    int count = 0;
                    while (reader.NextKeyValue())
                    {
                        key = reader.GetCurrentKey();
                        // Keys arrive as the text form of the original int.
                        int keyInt = System.Convert.ToInt32(key.ToString());
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(keyInt));
                        bits.Set(keyInt);
                        count++;
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            // All keys must be covered by exactly one split.
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}