/// <summary>
/// Prototype test: manually parses the gzip header of a prebuilt concatenated
/// gzip file (concat.gz) and inflates the first member's deflate stream with a
/// raw non-native Inflater, printing the recovered text.
/// </summary>
public virtual void TestPrototypeInflaterGzip()
{
    CompressionCodec gzip = new GzipCodec();  // used only for file extension
    localFs.Delete(workDir, true);            // localFs = FileSystem instance
    System.Console.Out.WriteLine(ColorBrBlue + "testPrototypeInflaterGzip() using "
        + "non-native/Java Inflater and manual gzip header/trailer parsing" + ColorNormal);
    // copy prebuilt (correct!) version of concat.gz to HDFS
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    FileInputStream @in = new FileInputStream(fnLocal.ToString());
    NUnit.Framework.Assert.AreEqual("concat bytes available", 148, @in.Available());
    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
    byte[] compressedBuf = new byte[256];
    int numBytesRead = @in.Read(compressedBuf, 0, 10);
    NUnit.Framework.Assert.AreEqual("header bytes read", 10, numBytesRead);
    // fixed 10-byte gzip header: magic 0x1f 0x8b, then compression method 8 (deflate)
    NUnit.Framework.Assert.AreEqual("1st byte", unchecked((int)(0x1f)),
        compressedBuf[0] & unchecked((int)(0xff)));
    NUnit.Framework.Assert.AreEqual("2nd byte", unchecked((int)(0x8b)),
        compressedBuf[1] & unchecked((int)(0xff)));
    NUnit.Framework.Assert.AreEqual("3rd byte (compression method)", 8,
        compressedBuf[2] & unchecked((int)(0xff)));
    byte flags = unchecked((byte)(compressedBuf[3] & unchecked((int)(0xff))));
    if ((flags & unchecked((int)(0x04))) != 0)
    {
        // FEXTRA: 2-byte little-endian XLEN followed by XLEN bytes of extra field (skipped)
        numBytesRead = @in.Read(compressedBuf, 0, 2);
        NUnit.Framework.Assert.AreEqual("XLEN bytes read", 2, numBytesRead);
        int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & unchecked((int)(0xffff));
        @in.Skip(xlen);
    }
    if ((flags & unchecked((int)(0x08))) != 0)
    {
        // FNAME: zero-terminated original file name
        while ((numBytesRead = @in.Read()) != 0)
        {
            NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading filename",
                numBytesRead == -1);
        }
    }
    if ((flags & unchecked((int)(0x10))) != 0)
    {
        // FCOMMENT: zero-terminated comment
        while ((numBytesRead = @in.Read()) != 0)
        {
            NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading comment",
                numBytesRead == -1);
        }
    }
    if ((flags & unchecked((int)(0xe0))) != 0)
    {
        // reserved flag bits must be zero
        NUnit.Framework.Assert.IsTrue("reserved bits are set??",
            (flags & unchecked((int)(0xe0))) == 0);
    }
    if ((flags & unchecked((int)(0x02))) != 0)
    {
        // FHCRC: 2-byte little-endian header CRC16 (read but not verified here)
        numBytesRead = @in.Read(compressedBuf, 0, 2);
        NUnit.Framework.Assert.AreEqual("CRC16 bytes read", 2, numBytesRead);
        int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & unchecked((int)(0xffff));
    }
    // ready to go!  next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = @in.Read(compressedBuf);
    // Inflater docs refer to a "dummy byte": no clue what that's about;
    // appears to work fine without one
    byte[] uncompressedBuf = new byte[256];
    Inflater inflater = new Inflater(true);  // true => raw deflate, no zlib wrapper
    inflater.SetInput(compressedBuf, 0, numBytesRead);
    try
    {
        int numBytesUncompressed = inflater.Inflate(uncompressedBuf);
        string outString = Sharpen.Runtime.GetStringForBytes(uncompressedBuf, 0,
            numBytesUncompressed, "UTF-8");
        System.Console.Out.WriteLine("uncompressed data of first gzip member = [" + outString + "]");
    }
    catch (SharpZipBaseException ex)
    {
        // FIX: pass ex as the inner exception so the root cause and its stack
        // trace are not discarded when the wrapping IOException propagates.
        throw new IOException(ex.Message, ex);
    }
    @in.Close();
}
/// <summary>
/// Verifies the pure-Java BuiltInGzipDecompressor: with native libs disabled it
/// must be the codec's decompressor type, and it must correctly decompress a
/// multi-member ("compress then concat") gzip test file; then exercises both
/// the Java and native decompressors across many input-buffer sizes.
/// </summary>
public virtual void TestBuiltInGzipDecompressor()
{
    // NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
    // due to buggy version of zlib (1.2.1.2) included.
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.SetBoolean("io.native.lib.available", false);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    // with native libs disabled the codec must report the pure-Java decompressor
    NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]",
        typeof(BuiltInGzipDecompressor), gzip.GetDecompressorType());
    System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using"
        + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType()
        + ")" + ColorNormal);
    // copy single-member test file to HDFS
    string fn1 = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
    Path fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    string fn2 = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
    Path fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.SetInputPaths(jobConf, workDir);
    // here's first pair of DecompressorStreams:
    FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
    FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
    NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
    NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available());  // w/hdr CRC
    // FIX: in1 was only needed for the size assertion above and was never
    // closed; close it now so the file handle is not leaked.
    in1.Close();
    CompressionInputStream cin2 = gzip.CreateInputStream(in2);
    LineReader @in = new LineReader(cin2);
    Text @out = new Text();
    // count lines and uncompressed bytes across ALL concatenated gzip members
    int numBytes;
    int totalBytes = 0;
    int lineNum = 0;
    while ((numBytes = @in.ReadLine(@out)) > 0)
    {
        ++lineNum;
        totalBytes += numBytes;
    }
    @in.Close();
    NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file",
        5346, totalBytes);
    NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file",
        84, lineNum);
    // test BuiltInGzipDecompressor with lots of different input-buffer sizes
    DoMultipleGzipBufferSizes(jobConf, false);
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME? could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    DoMultipleGzipBufferSizes(jobConf, true);
}
/// <summary>
/// End-to-end check of gzip-compressed text input through TextInputFormat:
/// one prebuilt concatenated gzip file plus one freshly written single-member
/// file must yield two splits with the expected line contents.
/// </summary>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec codec = new GzipCodec();
    ReflectionUtils.SetConf(codec, jobConf);
    localFs.Delete(workDir, true);
    // Guard clause: bail out unless the expected decompressor type is in play.
    // (A direct assertFalse on the decompressor type would be preferred, but is
    // not compatible with the Apache/trunk instance of Hudson.)
    // NOTE(review): this branch looks inverted -- BuiltInGzipDecompressor is the
    // pure-Java decompressor, yet the taken branch claims native-zlib; confirm
    // against the upstream Hadoop test before changing behavior.
    if (typeof(BuiltInGzipDecompressor) != codec.GetDecompressorType())
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
        + codec.GetDecompressorType() + ")" + ColorNormal);
    // Building concat.gz on the fly (GzipOutputStream + finish()/resetState()
    // between members) omits the 2nd/3rd gzip headers and corrupts their CRCs
    // -- see https://issues.apache.org/jira/browse/HADOOP-6799 -- so a prebuilt
    // (correct!) copy of concat.gz is staged onto HDFS instead.
    string baseName = "concat" + codec.GetDefaultExtension();
    Path localConcat = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), baseName);
    Path hdfsConcat = new Path(workDir, baseName);
    localFs.CopyFromLocalFile(localConcat, hdfsConcat);
    // second input file: a small single-member gzip written directly to the work dir
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), codec, "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jobConf);
    InputSplit[] inputSplits = inputFormat.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, inputSplits.Length);
    // normalize split order: concat.gz in slot 0, part2.txt.gz in slot 1
    FileSplit firstSplit = (FileSplit)inputSplits[0];
    if (firstSplit.GetPath().GetName().Equals("part2.txt.gz"))
    {
        inputSplits[0] = inputSplits[1];
        inputSplits[1] = firstSplit;
    }
    IList<Text> lines = ReadSplit(inputFormat, inputSplits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());
    lines = ReadSplit(inputFormat, inputSplits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", lines[1].ToString());
}