/// <summary>
/// Reads every record of <paramref name="split"/> through a record reader obtained
/// from <paramref name="format"/> and returns the record values in read order.
/// </summary>
/// <param name="format">Input format used to open a record reader for the split.</param>
/// <param name="split">The split whose records are read.</param>
/// <param name="jobConf">Job configuration handed to the record reader.</param>
/// <returns>One <c>Text</c> per record returned by the reader.</returns>
/// <exception cref="System.IO.IOException"/>
private static IList<Text> ReadSplit(TextInputFormat format, InputSplit split, JobConf jobConf)
{
    IList<Text> result = new AList<Text>();
    RecordReader<LongWritable, Text> reader = format.GetRecordReader(split, jobConf, voidReporter);
    try
    {
        LongWritable key = reader.CreateKey();
        Text value = reader.CreateValue();
        while (reader.Next(key, value))
        {
            result.AddItem(value);
            // Ask for a fresh value object so the instance just stored in the
            // result list is not handed to the next Next() call.
            value = reader.CreateValue();
        }
    }
    finally
    {
        // Close the reader even when reading fails part-way, matching how the
        // test methods in this file release their readers.
        reader.Close();
    }
    return result;
}
/// <summary>
/// Writes text files holding a growing number of numbered lines, asks the
/// <c>TextInputFormat</c> for a random number of splits three times per file, and
/// checks that every line number is read from exactly one split.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "test.txt");
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;

    // The seed is logged so that a failing run can be reproduced.
    int seed = new Random().Next();
    Log.Info("seed = " + seed);
    Random random = new Random(seed);

    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);

    // Sweep over a variety of file lengths, growing by a random step each round.
    int length = 0;
    while (length < MaxLength)
    {
        Log.Debug("creating; entries = " + length);

        // Create a file with one numbered line per entry.
        TextWriter output = new OutputStreamWriter(localFs.Create(file));
        try
        {
            for (int line = 0; line < length; line++)
            {
                output.Write(Sharpen.Extensions.ToString(line) + "\n");
            }
        }
        finally
        {
            output.Close();
        }

        // Try splitting the file in a variety of sizes.
        TextInputFormat format = new TextInputFormat();
        format.Configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int attempt = 0; attempt < 3; attempt++)
        {
            int requested = random.Next(MaxLength / 20) + 1;
            Log.Debug("splitting: requesting = " + requested);
            InputSplit[] splits = format.GetSplits(job, requested);
            Log.Debug("splitting: got = " + splits.Length);

            if (length == 0)
            {
                NUnit.Framework.Assert.AreEqual("Files of length 0 are not returned from FileInputFormat.getSplits().", 1, splits.Length);
                NUnit.Framework.Assert.AreEqual("Empty file length == 0", 0, splits[0].GetLength());
            }

            // Record every line number seen across all splits of this attempt.
            BitSet seen = new BitSet(length);
            for (int index = 0; index < splits.Length; index++)
            {
                InputSplit current = splits[index];
                Log.Debug("split[" + index + "]= " + current);
                RecordReader<LongWritable, Text> reader = format.GetRecordReader(current, job, reporter);
                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        int number = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + number);
                        bool duplicate = seen.Get(number);
                        if (duplicate)
                        {
                            Log.Warn("conflict with " + number + " in split " + index + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", duplicate);
                        seen.Set(number);
                        count++;
                    }
                    Log.Debug("splits[" + index + "]=" + current + " count=" + count);
                }
                finally
                {
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, seen.Cardinality());
        }

        length += random.Next(MaxLength / 10) + 1;
    }
}
/// <summary>
/// Writes files of numbered lines through the codec loaded from
/// <c>org.apache.hadoop.io.compress.BZip2Codec</c>, asks the <c>TextInputFormat</c>
/// for a random number of splits three times per file, and checks that every line
/// number is read from exactly one split.
/// </summary>
/// <exception cref="System.IO.IOException">
/// Thrown when the codec class cannot be loaded; the original
/// <see cref="System.TypeLoadException"/> is attached as the inner exception.
/// </exception>
public virtual void TestSplitableCodecs()
{
    JobConf conf = new JobConf(defaultConf);

    // Create the codec
    CompressionCodec codec;
    try
    {
        codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    }
    catch (TypeLoadException e)
    {
        // Keep the root cause so a missing codec class can be diagnosed.
        throw new IOException("Illegal codec!", e);
    }
    Path file = new Path(workDir, "test" + codec.GetDefaultExtension());

    // A reporter that does nothing
    Reporter reporter = Reporter.Null;

    // The seed is logged so that a failing run can be reproduced.
    int seed = new Random().Next();
    Log.Info("seed = " + seed);
    Random random = new Random(seed);

    FileSystem fs = FileSystem.GetLocal(conf);
    fs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(conf, workDir);

    // Upper bound on the number of lines written for this test.
    int maxLength = 500000;

    // Sweep over a variety of file lengths, growing by a random step each round.
    for (int length = maxLength / 2; length < maxLength; length += random.Next(maxLength / 4) + 1)
    {
        Log.Info("creating; entries = " + length);

        // Create a file with one numbered line per entry, written through the codec.
        TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(fs.Create(file)));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }

        // Try splitting the file in a variety of sizes.
        TextInputFormat format = new TextInputFormat();
        format.Configure(conf);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int attempt = 0; attempt < 3; attempt++)
        {
            int numSplits = random.Next(maxLength / 2000) + 1;
            Log.Info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(conf, numSplits);
            Log.Info("splitting: got = " + splits.Length);

            // Record every line number seen across all splits of this attempt.
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], conf, reporter);
                try
                {
                    int counter = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        if (bits.Get(v))
                        {
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        counter++;
                    }

                    // Splits that produced records are reported at Info level,
                    // empty ones only at Debug level.
                    string summary = "splits[" + j + "]=" + splits[j] + " count=" + counter;
                    if (counter > 0)
                    {
                        Log.Info(summary);
                    }
                    else
                    {
                        Log.Debug(summary);
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}