/// <summary>
/// Reads a prebuilt concatenated bzip2 file plus a freshly written bzip2 file through
/// <c>TextInputFormat</c> and checks the number and content of the decoded lines.
/// </summary>
/// <remarks>
/// The work directory is deleted recursively first, so the two files placed there by
/// this test are the only inputs. Exactly two splits are expected; they are read in a
/// fixed order (the concatenated file first, "part2.txt.bz2" second) regardless of the
/// order in which the input format returns them.
/// </remarks>
public virtual void TestBzip2()
{
    JobConf conf = new JobConf(defaultConf);
    CompressionCodec codec = new BZip2Codec();
    ReflectionUtils.SetConf(codec, conf);
    localFs.Delete(workDir, true);

    string banner = ColorBrCyan
        + "testBzip2() using non-native CBZip2InputStream (presumably)"
        + ColorNormal;
    System.Console.Out.WriteLine(banner);

    // Copy the prebuilt (known-correct) concat.bz2 into the work directory.
    string concatName = "concat" + codec.GetDefaultExtension();
    string dataDir = Runtime.GetProperty("test.concat.data", "/tmp");
    Path source = new Path(dataDir, concatName);
    Path target = new Path(workDir, concatName);
    localFs.CopyFromLocalFile(source, target);

    // Second input: a small two-line file compressed with the same codec.
    Path part2 = new Path(workDir, "part2.txt.bz2");
    WriteFile(localFs, part2, codec, "this is a test\nof bzip2\n");

    FileInputFormat.SetInputPaths(conf, workDir);

    // TextInputFormat extends FileInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(conf);

    // Work around the 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)].
    inputFormat.SetMinSplitSize(256);

    InputSplit[] inputSplits = inputFormat.GetSplits(conf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, inputSplits.Length);

    // Decide which slot holds which file instead of swapping the array in place:
    // when "part2.txt.bz2" came back first, the concatenated file is in slot 1.
    FileSplit leading = (FileSplit)inputSplits[0];
    bool part2IsFirst = leading.GetPath().GetName().Equals("part2.txt.bz2");
    int concatIndex = part2IsFirst ? 1 : 0;
    int part2Index = 1 - concatIndex;

    // Concatenated file: six lines, the last being "member #3".
    IList<Text> lines = ReadSplit(inputFormat, inputSplits[concatIndex], conf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());

    // part2.txt.bz2: the two lines written above.
    lines = ReadSplit(inputFormat, inputSplits[part2Index], conf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", lines[1].ToString());
}
/// <summary>
/// Splits the inputs described by <paramref name="jConf"/> with a
/// <c>TextInputFormat</c>, expects exactly two splits, and checks that each one
/// decodes to the same 84 lines of text.
/// </summary>
/// <remarks>
/// This tests both files (testCompressThenConcat, testConcatThenCompress); all
/// should work with the existing Java bzip2 decoder and any future native version.
/// </remarks>
/// <param name="jConf">
/// Job configuration handed to the input format, to <c>GetSplits</c> and to
/// <c>ReadSplit</c>. NOTE(review): presumably the caller has already pointed its
/// input paths at the two bzip2 test files - confirm against the caller.
/// </param>
/// <exception cref="System.IO.IOException"/>
private static void DoSingleBzip2BufferSize(JobConf jConf)
{
    // Both inputs are expected to decode to the same text, so the same
    // reference lines are used for each split.
    const string expectedFirstLine =
        "Call me Ishmael. Some years ago--never mind how long precisely--having";
    const string expectedLine42 =
        "Tell me, does the magnetic virtue of the needles of the compasses of";

    TextInputFormat format = new TextInputFormat();
    format.Configure(jConf);

    // Work around the 256-byte/22-splits issue.
    format.SetMinSplitSize(5500);

    // Here's the Nth pair of DecompressorStreams:
    InputSplit[] splits = format.GetSplits(jConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Put testConcatThenCompress in slot 0 and testCompressThenConcat in slot 1.
    // The previous check compared the name against "testCompressThenConcat.txt.gz";
    // a bzip2 input presumably never carries a ".gz" extension, so the swap could
    // not trigger here. Matching on the uncompressed base name works for any
    // codec extension and still accepts the old ".gz" name.
    FileSplit tmp = (FileSplit)splits[0];
    string firstName = tmp.GetPath().GetName();
    if (firstName.StartsWith("testCompressThenConcat.txt", System.StringComparison.Ordinal))
    {
        System.Console.Out.WriteLine(" (swapping)");
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    // testConcatThenCompress (single)
    IList<Text> results = ReadSplit(format, splits[0], jConf);
    NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][0]", expectedFirstLine, results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[0][42]", expectedLine42, results[42].ToString());

    // testCompressThenConcat (multi)
    results = ReadSplit(format, splits[1], jConf);
    NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", expectedFirstLine, results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][42]", expectedLine42, results[42].ToString());
}