// this tests both files (testCompressThenConcat, testConcatThenCompress);
// all should work with either native zlib or new Inflater-based decoder
/// <summary>
/// Reads both concatenated-gzip test files under the current buffer-size
/// configuration and verifies each decodes to the expected 84 lines.
/// </summary>
/// <exception cref="System.IO.IOException"/>
private static void DoSingleGzipBufferSize(JobConf jConf)
{
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jConf);
    // here's Nth pair of DecompressorStreams:
    InputSplit[] splits = inputFormat.GetSplits(jConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split ordering so testCompressThenConcat ends up second.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
    {
        System.Console.Out.WriteLine(" (swapping)");
        splits[0] = splits[1];
        splits[1] = head;
    }
    IList<Text> lines = ReadSplit(inputFormat, splits[0], jConf);
    NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[0][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", lines[42].ToString());
    // The second file must decode to exactly the same text.
    lines = ReadSplit(inputFormat, splits[1], jConf);
    NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", lines[42].ToString());
}
/// <summary>Test using the gzip codec for reading</summary>
/// <exception cref="System.IO.IOException"/>
public static void TestGzip()
{
    JobConf job = new JobConf();
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, job);
    localFs.Delete(workDir, true);
    // Two gzipped inputs: six tab-separated key/value lines, then two.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "line-1\tthis is a test\nline-1\tof gzip\n");
    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat inputFormat = new KeyValueTextInputFormat();
    inputFormat.Configure(job);
    InputSplit[] splits = inputFormat.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split ordering so part1 is always first.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = head;
    }
    IList<Text> values = ReadSplit(inputFormat, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, values.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", values[5].ToString());
    values = ReadSplit(inputFormat, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, values.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", values[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", values[1].ToString());
}
/// <summary>Test using the gzip codec with two input files.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec gzip = new GzipCodec();
    localFs.Delete(workDir, true);
    FixedLengthInputFormat inputFormat = new FixedLengthInputFormat();
    JobConf job = new JobConf(defaultConf);
    FixedLengthInputFormat.SetRecordLength(job, 5);
    FileInputFormat.SetInputPaths(job, workDir);
    ReflectionUtils.SetConf(gzip, job);
    inputFormat.Configure(job);
    // Create files with fixed length records with 5 byte long records.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "one two threefour five six seveneightnine ten ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "ten nine eightsevensix five four threetwo one ");
    InputSplit[] splits = inputFormat.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split ordering so part1 is always first.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = head;
    }
    IList<string> records = ReadSplit(inputFormat, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 10, records.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "six ", records[5]);
    records = ReadSplit(inputFormat, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 10, records.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten ", records[0]);
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", records[1]);
}
/// <summary>
/// Old-API fixed-length record reader. All record parsing is delegated to
/// the mapreduce (new-API) FixedLengthRecordReader.
/// </summary>
/// <param name="job">job configuration used to open the underlying file</param>
/// <param name="split">the file region this reader covers</param>
/// <param name="recordLength">fixed byte length of every record</param>
/// <exception cref="System.IO.IOException"/>
public FixedLengthRecordReader(Configuration job, FileSplit split, int recordLength)
{
    this.recordLength = recordLength;
    // Make use of the new API implementation to avoid code duplication.
    var newApiReader = new Org.Apache.Hadoop.Mapreduce.Lib.Input.FixedLengthRecordReader(recordLength);
    newApiReader.Initialize(job, split.GetStart(), split.GetLength(), split.GetPath());
    reader = newApiReader;
}
/// <summary>
/// Creates a line reader over one file split, handling plain, splittable-compressed,
/// and non-splittable-compressed inputs, with an optional custom record delimiter.
/// </summary>
/// <param name="job">configuration; supplies the max line length, codec registry and filesystem</param>
/// <param name="split">the file region this reader is responsible for</param>
/// <param name="recordDelimiter">custom record delimiter bytes; presumably null selects the default newline handling — confirm against SplitLineReader</param>
/// <exception cref="System.IO.IOException"/>
public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter)
{
    this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    start = split.GetStart();
    end = start + split.GetLength();
    Path file = split.GetPath();
    compressionCodecs = new CompressionCodecFactory(job);
    // Codec is chosen from the file name; IsCompressedInput() below keys off it.
    codec = compressionCodecs.GetCodec(file);
    // open the file and seek to the start of the split
    FileSystem fs = file.GetFileSystem(job);
    fileIn = fs.Open(file);
    if (IsCompressedInput())
    {
        decompressor = CodecPool.GetDecompressor(codec);
        if (codec is SplittableCompressionCodec)
        {
            // Splittable codec: the codec adjusts the requested [start, end)
            // to compressed-block boundaries, so both offsets are re-read
            // from the stream after creation.
            SplitCompressionInputStream cIn = ((SplittableCompressionCodec)codec).CreateInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
            @in = new CompressedSplitLineReader(cIn, job, recordDelimiter);
            start = cIn.GetAdjustedStart();
            end = cIn.GetAdjustedEnd();
            filePosition = cIn;
        }
        else
        {
            // take pos from compressed stream
            @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    }
    else
    {
        // Uncompressed input: seek directly to the split's start offset.
        fileIn.Seek(start);
        @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength());
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0)
    {
        start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
    }
    this.pos = start;
}
/// <summary>
/// Opens a SequenceFile reader positioned at the first sync point at or
/// after the split's start offset.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public SequenceFileRecordReader(Configuration conf, FileSplit split)
{
    Path path = split.GetPath();
    FileSystem fs = path.GetFileSystem(conf);
    this.@in = new SequenceFile.Reader(fs, path, conf);
    this.conf = conf;
    long splitStart = split.GetStart();
    this.end = splitStart + split.GetLength();
    // sync to start: skip ahead to the first record boundary inside the split
    if (splitStart > @in.GetPosition())
    {
        @in.Sync(splitStart);
    }
    this.start = @in.GetPosition();
    more = start < end;
}
/// <summary>
/// Opens a raw (binary) SequenceFile reader positioned at the first sync
/// point at or after the split's start offset.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public SequenceFileAsBinaryRecordReader(Configuration conf, FileSplit split)
{
    Path path = split.GetPath();
    FileSystem fs = path.GetFileSystem(conf);
    this.@in = new SequenceFile.Reader(fs, path, conf);
    vbytes = @in.CreateValueBytes();
    long splitStart = split.GetStart();
    this.end = splitStart + split.GetLength();
    // sync to start: skip ahead to the first record boundary inside the split
    if (splitStart > @in.GetPosition())
    {
        @in.Sync(splitStart);
    }
    this.start = @in.GetPosition();
    done = start >= end;
}
/// <summary>
/// Reads a prebuilt concatenated bzip2 file plus a freshly written one
/// through TextInputFormat using the pure-Java CBZip2InputStream.
/// </summary>
public virtual void TestBzip2()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.SetConf(bzip2, jobConf);
    localFs.Delete(workDir, true);
    System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)" + ColorNormal);
    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    string fn = "concat" + bzip2.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();  // extends FileInputFormat
    inputFormat.Configure(jobConf);
    // work around 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    inputFormat.SetMinSplitSize(256);
    InputSplit[] splits = inputFormat.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split ordering so the concatenated file is always first.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("part2.txt.bz2"))
    {
        splits[0] = splits[1];
        splits[1] = head;
    }
    IList<Text> lines = ReadSplit(inputFormat, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());
    lines = ReadSplit(inputFormat, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", lines[1].ToString());
}
/// <summary>
/// Reads a prebuilt concatenated gzip file plus a freshly written one through
/// TextInputFormat. Skipped (with a warning) when the native libs are not loaded.
/// </summary>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    // Preferred check would assertFalse on the decompressor type outright, but
    // that is not compatible with the Apache/trunk instance of Hudson; probe
    // the decompressor type and skip the test instead.
    // NOTE(review): the "native-zlib" message below fires when the decompressor
    // IS BuiltInGzipDecompressor (the Java Inflater-based one), which reads as
    // inverted -- confirm intent against the upstream Java test.
    if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
    {
        System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor (" + gzip.GetDecompressorType() + ")" + ColorNormal);
    }
    else
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    // Generating concat.gz here with createOutputStream()/resetState() would
    // omit the 2nd/3rd gzip headers and corrupt their CRCs (HADOOP-6799), so a
    // prebuilt (correct!) copy of concat.gz is staged into HDFS instead.
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jobConf);
    InputSplit[] splits = inputFormat.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Normalize split ordering so the concatenated file is always first.
    FileSplit head = (FileSplit)splits[0];
    if (head.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = head;
    }
    IList<Text> lines = ReadSplit(inputFormat, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());
    lines = ReadSplit(inputFormat, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", lines[1].ToString());
}