public virtual void TestBzip2()
{
    // Verifies that TextInputFormat correctly reads bzip2 input consisting of
    // a prebuilt concatenated (multi-member) archive plus a freshly written
    // single-member .bz2 file, producing exactly two splits with the expected lines.
    JobConf conf = new JobConf(defaultConf);
    CompressionCodec codec = new BZip2Codec();
    ReflectionUtils.SetConf(codec, conf);
    localFs.Delete(workDir, true);
    System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)" + ColorNormal);
    // Stage the prebuilt (known-correct) concatenated archive in the work dir.
    string concatName = "concat" + codec.GetDefaultExtension();
    Path concatLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), concatName);
    Path concatHdfs = new Path(workDir, concatName);
    localFs.CopyFromLocalFile(concatLocal, concatHdfs);
    // Write a second, small bzip2 file alongside it.
    WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), codec, "this is a test\nof bzip2\n");
    FileInputFormat.SetInputPaths(conf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();
    // TextInputFormat extends FileInputFormat.
    inputFormat.Configure(conf);
    // Work around the 2-byte-splits issue:
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    inputFormat.SetMinSplitSize(256);
    InputSplit[] splits = inputFormat.GetSplits(conf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Split order is not guaranteed; make the concat archive splits[0].
    FileSplit firstSplit = (FileSplit)splits[0];
    if (firstSplit.GetPath().GetName().Equals("part2.txt.bz2"))
    {
        splits[0] = splits[1];
        splits[1] = firstSplit;
    }
    // The concatenated archive decompresses to six lines.
    IList<Text> lines = ReadSplit(inputFormat, splits[0], conf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());
    // The file written above decompresses back to its two source lines.
    lines = ReadSplit(inputFormat, splits[1], conf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", lines[1].ToString());
}
public virtual void TestMoreBzip2()
{
    // Stages one single-member and one multiple-member bzip2 test file, sanity-checks
    // their on-disk sizes, then exercises CBZip2InputStream across many buffer sizes
    // via DoMultipleBzip2BufferSizes.
    JobConf conf = new JobConf(defaultConf);
    CompressionCodec codec = new BZip2Codec();
    ReflectionUtils.SetConf(codec, conf);
    localFs.Delete(workDir, true);
    System.Console.Out.WriteLine(ColorBrMagenta + "testMoreBzip2() using non-native CBZip2InputStream (presumably)" + ColorNormal);
    // Copy single-member test file to HDFS.
    string fn1 = "testConcatThenCompress.txt" + codec.GetDefaultExtension();
    Path fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);
    // Copy multiple-member test file to HDFS.
    string fn2 = "testCompressThenConcat.txt" + codec.GetDefaultExtension();
    Path fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.SetInputPaths(conf, workDir);
    // Sanity-check the raw (compressed) sizes of the two local test files.
    // FIX(review): the original opened these streams and never closed them,
    // leaking two file handles; close them in a finally block.
    FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
    try
    {
        FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
        try
        {
            NUnit.Framework.Assert.AreEqual("concat bytes available", 2567, in1.Available());
            NUnit.Framework.Assert.AreEqual("concat bytes available", 3056, in2.Available());
        }
        finally
        {
            in2.Close();
        }
    }
    finally
    {
        in1.Close();
    }
    /*
     * // FIXME
     * // The while-loop below dies at the beginning of the 2nd concatenated
     * // member (after 17 lines successfully read) with:
     * //
     * // java.io.IOException: bad block header
     * // at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
     * // CBZip2InputStream.java:527)
     * //
     * // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
     * // simply commented out for now (and HADOOP-6852 filed). If and when the
     * // latter issue is resolved--perhaps by fixing an error here--this code
     * // should be reenabled. Note that the doMultipleBzip2BufferSizes() test
     * // below uses the same testCompressThenConcat.txt.bz2 file but works fine.
     *
     * CompressionInputStream cin2 = bzip2.createInputStream(in2);
     * LineReader in = new LineReader(cin2);
     * Text out = new Text();
     *
     * int numBytes, totalBytes=0, lineNum=0;
     * while ((numBytes = in.readLine(out)) > 0) { ++lineNum;
     * totalBytes += numBytes;
     * }
     * in.close();
     * assertEquals("total uncompressed bytes in concatenated test file",
     * 5346, totalBytes);
     * assertEquals("total uncompressed lines in concatenated test file",
     * 84, lineNum);
     */
    // Test CBZip2InputStream with lots of different input-buffer sizes.
    DoMultipleBzip2BufferSizes(conf, false);
}