/// <summary>
/// Checks gzip decompression with native libraries disabled in the job
/// configuration: asserts that the codec reports
/// <c>BuiltInGzipDecompressor</c>, verifies the on-disk sizes of the two
/// prebuilt test files, decompresses the multi-member file line by line and
/// checks its byte and line totals, then runs the buffer-size sweep in both
/// non-native and native modes.
/// </summary>
/// <remarks>
/// The test files are looked up in the directory named by the
/// <c>test.concat.data</c> property (default <c>/tmp</c>).
/// </remarks>
public virtual void TestBuiltInGzipDecompressor()
{
    // NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
    // due to buggy version of zlib (1.2.1.2) included.
    JobConf jobConf = new JobConf(defaultConf);
    // Turn native libraries off; the assertion below confirms that the codec
    // then reports the built-in (Java) decompressor.
    jobConf.SetBoolean("io.native.lib.available", false);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]", typeof(BuiltInGzipDecompressor),
        gzip.GetDecompressorType());
    System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using"
        + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType() + ")"
        + ColorNormal);

    // copy single-member test file to HDFS
    string fn1 = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
    Path fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    string fn2 = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
    Path fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.SetInputPaths(jobConf, workDir);

    int totalBytes = 0;
    int lineNum = 0;

    // in1 is only used for the size check; in2 is also decompressed below.
    // Both are released in finally blocks so that a failing assertion or a
    // read error does not leak file handles.
    FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
    try
    {
        FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
        LineReader reader = null;
        try
        {
            NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
            // w/hdr CRC
            NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available());

            CompressionInputStream cin2 = gzip.CreateInputStream(in2);
            reader = new LineReader(cin2);
            Text line = new Text();
            int numBytes;
            // ReadLine's return value is accumulated as the byte count; a
            // non-positive value ends the loop.
            while ((numBytes = reader.ReadLine(line)) > 0)
            {
                ++lineNum;
                totalBytes += numBytes;
            }
        }
        finally
        {
            if (reader != null)
            {
                // As in the original code, closing the reader is the only
                // cleanup performed for the wrapped streams.
                reader.Close();
            }
            else
            {
                // The reader was never constructed, so close the raw stream directly.
                in2.Close();
            }
        }
    }
    finally
    {
        in1.Close();
    }

    NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file",
        5346, totalBytes);
    NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file",
        84, lineNum);

    // test BuiltInGzipDecompressor with lots of different input-buffer sizes
    DoMultipleGzipBufferSizes(jobConf, false);

    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME? could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    DoMultipleGzipBufferSizes(jobConf, true);
}
/// <summary>
/// Reads a prebuilt three-member concatenated gzip file and a freshly written
/// single-member gzip file through <c>TextInputFormat</c>, and checks that
/// each file yields exactly one split with the expected lines. The test is
/// skipped (with a warning) when the codec reports the built-in Java
/// decompressor, i.e. when the native decompressor is not in use.
/// </summary>
/// <remarks>
/// The prebuilt <c>concat</c> file is looked up in the directory named by the
/// <c>test.concat.data</c> property (default <c>/tmp</c>).
/// </remarks>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);

    // preferred, but not compatible with Apache/trunk instance of Hudson:
    /*
     * assertFalse("[native (C/C++) codec]",
     *   (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
     *    gzip.getDecompressorType()) );
     * System.out.println(COLOR_BR_RED +
     *   "testGzip() using native-zlib Decompressor (" +
     *   gzip.getDecompressorType() + ")" + COLOR_NORMAL);
     */

    // alternative: skip instead of failing when the native codec is absent.
    // The check was previously inverted: it ran the test (and claimed
    // "native-zlib") exactly when the built-in Java decompressor was reported,
    // and skipped otherwise. It now matches the assertFalse above.
    if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
        + gzip.GetDecompressorType() + ")" + ColorNormal);

    /*
     * // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
     * // see https://issues.apache.org/jira/browse/HADOOP-6799
     * Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
     * //OutputStream out = localFs.create(fnHDFS);
     * //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
     * // can just combine those two lines, probably
     * //GzipCodec.GzipOutputStream gzOStm =
     * //   new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
     * // oops, no: this is a protected helper class; need to access
     * // it via createOutputStream() instead:
     * OutputStream out = localFs.create(fnHDFS);
     * Compressor gzCmp = gzip.createCompressor();
     * CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
     * // this SHOULD be going to HDFS: got out from localFs == HDFS
     * // ...yup, works
     * gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("2nd gzip concat member\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("gzip concat\nmember #3\n".getBytes());
     * gzOStm.close();
     * //
     * String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
     * Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
     * localFs.copyToLocalFile(fnHDFS, fnLocal);
     */

    // copy prebuilt (correct!) version of concat.gz to HDFS
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);

    // Second input: a single-member gzip file written through the codec.
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");

    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);

    // 100 splits are requested, but exactly one split per file (two in total)
    // is expected.
    InputSplit[] splits = format.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Split order is not relied upon: normalise so that splits[0] is the
    // concat file and splits[1] is part2.txt.gz.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    IList<Text> results = ReadSplit(format, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());

    results = ReadSplit(format, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}