/// <summary>Test using the gzip codec for reading</summary>
/// <exception cref="System.IO.IOException"/>
public static void TestGzip()
{
    JobConf jobConf = new JobConf();
    CompressionCodec gzipCodec = new GzipCodec();
    ReflectionUtils.SetConf(gzipCodec, jobConf);
    // Start from a clean working directory.
    localFs.Delete(workDir, true);
    // Two compressed inputs: six key\tvalue lines, then two.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzipCodec,
        "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzipCodec,
        "line-1\tthis is a test\nline-1\tof gzip\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    format.Configure(jobConf);
    InputSplit[] splits = format.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split order so part1 is always inspected first.
    FileSplit firstSplit = (FileSplit)splits[0];
    if (firstSplit.GetPath().GetName().Equals("part2.txt.gz"))
    {
        InputSplit other = splits[1];
        splits[1] = splits[0];
        splits[0] = other;
    }
    IList<Text> results = ReadSplit(format, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", results[5].ToString());
    results = ReadSplit(format, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}
/// <summary>Test using the gzip codec with two input files.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec codec = new GzipCodec();
    // Clean slate before writing fixtures.
    localFs.Delete(workDir, true);
    FixedLengthInputFormat inputFormat = new FixedLengthInputFormat();
    JobConf conf = new JobConf(defaultConf);
    // Fixed-length records: every record is exactly 5 bytes.
    FixedLengthInputFormat.SetRecordLength(conf, 5);
    FileInputFormat.SetInputPaths(conf, workDir);
    ReflectionUtils.SetConf(codec, conf);
    inputFormat.Configure(conf);
    // Create files with fixed length records with 5 byte long records.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), codec,
        "one two threefour five six seveneightnine ten ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), codec,
        "ten nine eightsevensix five four threetwo one ");
    InputSplit[] splits = inputFormat.GetSplits(conf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Swap so part1 is always splits[0], regardless of returned order.
    FileSplit leading = (FileSplit)splits[0];
    if (leading.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = leading;
    }
    IList<string> records = ReadSplit(inputFormat, splits[0], conf);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 10, records.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "six ", records[5]);
    records = ReadSplit(inputFormat, splits[1], conf);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 10, records.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten ", records[0]);
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", records[1]);
}
/// <summary>
/// Writes an empty IFile through a gzip codec, reopens it with an
/// IFile.Reader, and checks the bytes remaining after Close() against the
/// checksum length.
/// </summary>
public virtual void TestIFileReaderWithCodec()
{
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.GetLocal(conf);
    // Use the raw local filesystem so no side-car CRC files are created.
    FileSystem rfs = ((LocalFileSystem)localFs).GetRaw();
    Path path = new Path(new Path("build/test.ifile"), "data");
    // GzipCodec assigned to a DefaultCodec reference; only base-class API used.
    DefaultCodec codec = new GzipCodec();
    codec.SetConf(conf);
    FSDataOutputStream @out = rfs.Create(path);
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf, @out, typeof(Text), typeof(Text), codec, null);
    // No records appended: the file contains only IFile framing + checksum.
    writer.Close();
    FSDataInputStream @in = rfs.Open(path);
    IFile.Reader<Text, Text> reader = new IFile.Reader<Text, Text>(conf, @in, rfs.GetFileStatus(path).GetLen(), codec, null);
    reader.Close();
    // test check sum
    // NOTE(review): checksumIn is read AFTER reader.Close(); the assert
    // expects the readable remainder to be exactly the checksum length —
    // presumably Close() leaves the checksum stream positioned at the
    // trailer. Confirm against IFile.Reader's close semantics.
    byte[] ab = new byte[100];
    int readed = reader.checksumIn.ReadWithChecksum(ab, 0, ab.Length);
    NUnit.Framework.Assert.AreEqual(readed, reader.checksumIn.GetChecksum().Length);
}
/// <summary>
/// Creates an IFile.Writer backed by a gzip codec, writes no records, and
/// closes it cleanly.
/// </summary>
public virtual void TestIFileWriterWithCodec()
{
    Configuration configuration = new Configuration();
    FileSystem local = FileSystem.GetLocal(configuration);
    // Raw local FS: avoids side-car checksum files.
    FileSystem rawFs = ((LocalFileSystem)local).GetRaw();
    Path dataPath = new Path(new Path("build/test.ifile"), "data");
    DefaultCodec gzipCodec = new GzipCodec();
    gzipCodec.SetConf(configuration);
    // Even with zero records, this exercises codec setup and stream teardown.
    IFile.Writer<Text, Text> ifileWriter = new IFile.Writer<Text, Text>(
        configuration, rawFs.Create(dataPath), typeof(Text), typeof(Text), gzipCodec, null);
    ifileWriter.Close();
}
/// <summary>Test using the gzip codec for reading (mapreduce API).</summary>
public virtual void TestGzip()
{
    Configuration configuration = new Configuration(defaultConf);
    CompressionCodec codec = new GzipCodec();
    ReflectionUtils.SetConf(codec, configuration);
    // Fresh working directory for this run.
    localFs.Delete(workDir, true);
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), codec,
        "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), codec,
        "line-1\tthis is a test\nline-1\tof gzip\n");
    Job job = Job.GetInstance(configuration);
    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    IList<InputSplit> splits = format.GetSplits(job);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Count);
    // Ensure part1 is splits[0] regardless of the order GetSplits returned.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits.Set(0, splits[1]);
        splits.Set(1, head);
    }
    IList<Text> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
    // Expected values from part1 in order (the asserts below show the
    // tab-separated keys are not part of the returned values).
    string[] expected = new string[] { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    for (int i = 0; i < expected.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual("splits[0][" + i + "]", expected[i], results[i].ToString());
    }
    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}
/// <summary>Test using the gzip codec for reading</summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual void TestGzip()
{
    Configuration configuration = new Configuration(defaultConf);
    CompressionCodec codec = new GzipCodec();
    ReflectionUtils.SetConf(codec, configuration);
    // Start with an empty work directory.
    localFs.Delete(workDir, true);
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), codec,
        "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), codec,
        "this is a test\nof gzip\n");
    Job job = Job.GetInstance(configuration);
    FileInputFormat.SetInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    IList<InputSplit> splits = format.GetSplits(job);
    // Both compressed files should be combined into a single split.
    NUnit.Framework.Assert.AreEqual("compressed splits == 1", 1, splits.Count);
    IList<Text> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 8, results.Count);
    string[] firstList = new string[] { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    string[] secondList = new string[] { "this is a test", "of gzip" };
    // The combined split may present either file first; accept both orders.
    string first = results[0].ToString();
    if (first.Equals(firstList[0]))
    {
        TestResults(results, firstList, secondList);
    }
    else if (first.Equals(secondList[0]))
    {
        TestResults(results, secondList, firstList);
    }
    else
    {
        NUnit.Framework.Assert.Fail("unexpected first token!");
    }
}
/// <summary>Test using the gzip codec and an empty input file</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipEmpty()
{
    JobConf conf = new JobConf(defaultConf);
    CompressionCodec codec = new GzipCodec();
    ReflectionUtils.SetConf(codec, conf);
    localFs.Delete(workDir, true);
    // A valid gzip file whose uncompressed payload is zero bytes.
    WriteFile(localFs, new Path(workDir, "empty.gz"), codec, string.Empty);
    FileInputFormat.SetInputPaths(conf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(conf);
    InputSplit[] splits = inputFormat.GetSplits(conf, 100);
    // The empty compressed file must still yield exactly one split...
    NUnit.Framework.Assert.AreEqual("Compressed files of length 0 are not returned from FileInputFormat.getSplits().", 1, splits.Length);
    // ...and that split must contain no records.
    IList<Text> lines = ReadSplit(inputFormat, splits[0], conf);
    NUnit.Framework.Assert.AreEqual("Compressed empty file length == 0", 0, lines.Count);
}
/// <summary>
/// Verifies gzip decompression via the pure-Java BuiltInGzipDecompressor
/// (native zlib disabled) against pre-built single- and multi-member
/// concatenated gzip test files, then re-runs with many buffer sizes.
/// </summary>
public virtual void TestBuiltInGzipDecompressor()
{
    // NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
    // due to buggy version of zlib (1.2.1.2) included.
    JobConf jobConf = new JobConf(defaultConf);
    // Force the pure-Java codec path by disabling native libraries.
    jobConf.SetBoolean("io.native.lib.available", false);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    // With native libs off, the codec must report the built-in decompressor.
    NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]", typeof(BuiltInGzipDecompressor), gzip.GetDecompressorType());
    System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using" + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType() + ")" + ColorNormal);
    // copy single-member test file to HDFS
    // (fixture location comes from the test.concat.data system property)
    string fn1 = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
    Path fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    string fn2 = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
    Path fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.SetInputPaths(jobConf, workDir);
    // here's first pair of DecompressorStreams:
    FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
    FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
    // Sanity-check the on-disk sizes of both fixtures.
    NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
    // w/hdr CRC
    NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available());
    CompressionInputStream cin2 = gzip.CreateInputStream(in2);
    LineReader @in = new LineReader(cin2);
    Text @out = new Text();
    int numBytes;
    int totalBytes = 0;
    int lineNum = 0;
    // Count lines and bytes across ALL members of the multi-member file;
    // a decompressor that stops at the first member would undercount.
    while ((numBytes = @in.ReadLine(@out)) > 0)
    {
        ++lineNum;
        totalBytes += numBytes;
    }
    @in.Close();
    NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file", 84, lineNum);
    // test BuiltInGzipDecompressor with lots of different input-buffer sizes
    DoMultipleGzipBufferSizes(jobConf, false);
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME? could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    DoMultipleGzipBufferSizes(jobConf, true);
}
/// <summary>
/// Prototype test: manually parses the gzip header of a prebuilt
/// concatenated gzip file (per RFC 1952 field layout) and inflates the
/// first member's deflate payload with a raw Inflater — no codec involved.
/// </summary>
public virtual void TestPrototypeInflaterGzip()
{
    CompressionCodec gzip = new GzipCodec();
    // used only for file extension
    localFs.Delete(workDir, true);
    // localFs = FileSystem instance
    System.Console.Out.WriteLine(ColorBrBlue + "testPrototypeInflaterGzip() using " + "non-native/Java Inflater and manual gzip header/trailer parsing" + ColorNormal);
    // copy prebuilt (correct!) version of concat.gz to HDFS
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    FileInputStream @in = new FileInputStream(fnLocal.ToString());
    NUnit.Framework.Assert.AreEqual("concat bytes available", 148, @in.Available());
    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
    byte[] compressedBuf = new byte[256];
    // Fixed 10-byte gzip header: magic (0x1f 0x8b), CM, FLG, MTIME(4), XFL, OS.
    int numBytesRead = @in.Read(compressedBuf, 0, 10);
    NUnit.Framework.Assert.AreEqual("header bytes read", 10, numBytesRead);
    NUnit.Framework.Assert.AreEqual("1st byte", unchecked((int)(0x1f)), compressedBuf[0] & unchecked((int)(0xff)));
    NUnit.Framework.Assert.AreEqual("2nd byte", unchecked((int)(0x8b)), compressedBuf[1] & unchecked((int)(0xff)));
    // Compression method 8 = deflate.
    NUnit.Framework.Assert.AreEqual("3rd byte (compression method)", 8, compressedBuf[2] & unchecked((int)(0xff)));
    byte flags = unchecked((byte)(compressedBuf[3] & unchecked((int)(0xff))));
    if ((flags & unchecked((int)(0x04))) != 0)
    {
        // FEXTRA: skip the extra field (2-byte little-endian length + data).
        numBytesRead = @in.Read(compressedBuf, 0, 2);
        NUnit.Framework.Assert.AreEqual("XLEN bytes read", 2, numBytesRead);
        int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & unchecked((int)(0xffff));
        @in.Skip(xlen);
    }
    if ((flags & unchecked((int)(0x08))) != 0)
    {
        // FNAME: skip the zero-terminated original filename.
        while ((numBytesRead = @in.Read()) != 0)
        {
            NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading filename", numBytesRead == -1);
        }
    }
    if ((flags & unchecked((int)(0x10))) != 0)
    {
        // FCOMMENT: skip the zero-terminated comment.
        while ((numBytesRead = @in.Read()) != 0)
        {
            NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading comment", numBytesRead == -1);
        }
    }
    if ((flags & unchecked((int)(0xe0))) != 0)
    {
        // reserved flag bits must be zero in a valid gzip header
        NUnit.Framework.Assert.IsTrue("reserved bits are set??", (flags & unchecked((int)(0xe0))) == 0);
    }
    if ((flags & unchecked((int)(0x02))) != 0)
    {
        // FHCRC: read (but do not verify) the 16-bit header CRC.
        numBytesRead = @in.Read(compressedBuf, 0, 2);
        NUnit.Framework.Assert.AreEqual("CRC16 bytes read", 2, numBytesRead);
        int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & unchecked((int)(0xffff));
    }
    // ready to go! next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = @in.Read(compressedBuf);
    // Inflater docs refer to a "dummy byte": no clue what that's about;
    // appears to work fine without one
    byte[] uncompressedBuf = new byte[256];
    // "true" => raw-deflate mode (no zlib wrapper), matching gzip's payload.
    Inflater inflater = new Inflater(true);
    inflater.SetInput(compressedBuf, 0, numBytesRead);
    try
    {
        int numBytesUncompressed = inflater.Inflate(uncompressedBuf);
        string outString = Sharpen.Runtime.GetStringForBytes(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
        System.Console.Out.WriteLine("uncompressed data of first gzip member = [" + outString + "]");
    }
    catch (SharpZipBaseException ex)
    {
        // Re-surface decompression failures as IOException for the caller.
        throw new IOException(ex.Message);
    }
    @in.Close();
}
/// <summary>
/// Reads a prebuilt concatenated gzip file (concat.gz, three members) plus
/// a freshly written plain gzip file through the mapred TextInputFormat.
/// Intended for the native zlib decompressor; returns early otherwise.
/// </summary>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    // preferred, but not compatible with Apache/trunk instance of Hudson:
    /*
     * assertFalse("[native (C/C++) codec]",
     * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
     * gzip.getDecompressorType()) );
     * System.out.println(COLOR_BR_RED +
     * "testGzip() using native-zlib Decompressor (" +
     * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
     */
    // alternative:
    // NOTE(review): this branch looks inverted — when the decompressor IS the
    // built-in (Java) one it prints "using native-zlib", and the else branch
    // (native available) skips the test. Verify against the upstream Java
    // original before changing.
    if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
    {
        System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor (" + gzip.GetDecompressorType() + ")" + ColorNormal);
    }
    else
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    /*
     * // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
     * // see https://issues.apache.org/jira/browse/HADOOP-6799
     * Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
     * //OutputStream out = localFs.create(fnHDFS);
     * //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
     * // can just combine those two lines, probably
     * //GzipCodec.GzipOutputStream gzOStm =
     * //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
     * // oops, no: this is a protected helper class; need to access
     * // it via createOutputStream() instead:
     * OutputStream out = localFs.create(fnHDFS);
     * Compressor gzCmp = gzip.createCompressor();
     * CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
     * // this SHOULD be going to HDFS: got out from localFs == HDFS
     * // ...yup, works
     * gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("2nd gzip concat member\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("gzip concat\nmember #3\n".getBytes());
     * gzOStm.close();
     * //
     * String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
     * Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
     * localFs.copyToLocalFile(fnHDFS, fnLocal);
     */
    // copy prebuilt (correct!) version of concat.gz to HDFS
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);
    InputSplit[] splits = format.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Put the concat file first and the plain gzip file second.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    IList<Text> results = ReadSplit(format, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
    // Last line comes from the third concatenated gzip member — proves all
    // members were decompressed, not just the first.
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());
    results = ReadSplit(format, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}
/// <summary>Test with partial record at the end of a compressed input file.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestPartialRecordCompressedIn()
{
    // Exercise the shared partial-record check with a gzip-compressed input.
    CompressionCodec codec = new GzipCodec();
    RunPartialRecordTest(codec);
}