/// <summary>
/// Checks that <c>CombineTextInputFormat</c> can read gzip-compressed input.
/// Two gzip files are written under <c>workDir</c>; the test expects a single
/// split that yields all eight lines, with the lines of either file coming first.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzip()
{
    // Expected lines, per file, in the order they were written.
    string[] part1Lines = new string[]
    {
        "the quick", "brown", "fox jumped", "over", " the lazy", " dog"
    };
    string[] part2Lines = new string[] { "this is a test", "of gzip" };

    JobConf conf = new JobConf(defaultConf);
    CompressionCodec codec = new GzipCodec();
    ReflectionUtils.SetConf(codec, conf);

    // Clear the working directory, then write the two compressed inputs.
    localFs.Delete(workDir, true);
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), codec,
        "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), codec,
        "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(conf, workDir);

    CombineTextInputFormat inputFormat = new CombineTextInputFormat();
    InputSplit[] combined = inputFormat.GetSplits(conf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 1", 1, combined.Length);

    IList<Text> lines = ReadSplit(inputFormat, combined[0], conf);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 8, lines.Count);

    // The two files may appear in either order inside the split; the first
    // line read decides which order to verify against.
    string head = lines[0].ToString();
    if (head.Equals(part1Lines[0]))
    {
        TestResults(lines, part1Lines, part2Lines);
        return;
    }
    if (head.Equals(part2Lines[0]))
    {
        TestResults(lines, part2Lines, part1Lines);
        return;
    }
    NUnit.Framework.Assert.Fail("unexpected first token!");
}
/// <summary>
/// Exercises <c>CombineTextInputFormat</c> against files produced by
/// <c>CreateFiles</c>. Over three rounds it requests a random number of splits,
/// expects exactly one <c>CombineFileSplit</c> back, reads every record from it
/// and asserts that each integer value is seen exactly once and that
/// <c>length</c> distinct values are seen in total.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf conf = new JobConf(defaultConf);

    // The seed is logged and then re-applied, presumably so that a failing
    // run can be replayed with the same random sequence.
    Random rng = new Random();
    long seed = rng.NextLong();
    Log.Info("seed = " + seed);
    rng.SetSeed(seed);

    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(conf, workDir);

    int length = 10000;
    int numFiles = 10;
    CreateFiles(length, numFiles, rng);

    // create a combined split for the files
    CombineTextInputFormat inputFormat = new CombineTextInputFormat();
    LongWritable key = new LongWritable();
    Text value = new Text();

    for (int round = 0; round < 3; round++)
    {
        int requested = rng.Next(length / 20) + 1;
        Log.Info("splitting: requesting = " + requested);
        InputSplit[] produced = inputFormat.GetSplits(conf, requested);
        Log.Info("splitting: got = " + produced.Length);

        // we should have a single split as the length is comfortably smaller
        // than the block size
        NUnit.Framework.Assert.AreEqual("We got more than one splits!", 1, produced.Length);
        InputSplit only = produced[0];
        NUnit.Framework.Assert.AreEqual("It should be CombineFileSplit",
            typeof(CombineFileSplit), only.GetType());

        // check the split: one bit per value that has been read so far
        BitSet seen = new BitSet(length);
        Log.Debug("split= " + only);

        // voidReporter is declared outside this method; the comment that
        // preceded this test describes it as a reporter that does nothing.
        RecordReader<LongWritable, Text> records =
            inputFormat.GetRecordReader(only, conf, voidReporter);
        try
        {
            int recordCount = 0;
            while (records.Next(key, value))
            {
                int number = System.Convert.ToInt32(value.ToString());
                Log.Debug("read " + number);

                bool duplicate = seen.Get(number);
                if (duplicate)
                {
                    Log.Warn("conflict with " + number + " at position " + records.GetPos());
                }
                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", duplicate);

                seen.Set(number);
                recordCount++;
            }
            Log.Info("splits=" + only + " count=" + recordCount);
        }
        finally
        {
            records.Close();
        }

        NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, seen.Cardinality());
    }
}