// this tests both files (testCompressThenConcat, testConcatThenCompress);
        // all should work with either native zlib or new Inflater-based decoder
        /// <summary>
        /// Verifies that both concatenated-gzip test files (testCompressThenConcat,
        /// testConcatThenCompress) decode fully under the buffer size configured
        /// in <paramref name="jConf"/>, with either decoder implementation.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        private static void DoSingleGzipBufferSize(JobConf jConf)
        {
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(jConf);

            // Each invocation creates a fresh pair of DecompressorStreams.
            InputSplit[] splits = inputFormat.GetSplits(jConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // Normalize split order so testConcatThenCompress comes first.
            FileSplit first = (FileSplit)splits[0];
            if (first.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
            {
                System.Console.Out.WriteLine("  (swapping)");
                splits[0] = splits[1];
                splits[1] = first;
            }

            // Both files decompress to the same 84 lines of text.
            IList<Text> results = ReadSplit(inputFormat, splits[0], jConf);
            NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[0][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", results[42].ToString());

            results = ReadSplit(inputFormat, splits[1], jConf);
            NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", results[42].ToString());
        }
// Example #2
        /// <summary>Test using the gzip codec for reading</summary>
        /// <exception cref="System.IO.IOException"/>
        public static void TestGzip()
        {
            JobConf job = new JobConf();
            CompressionCodec gzip = new GzipCodec();
            ReflectionUtils.SetConf(gzip, job);

            // Start from a clean work directory, then write two gzipped inputs.
            localFs.Delete(workDir, true);
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "line-1\tthis is a test\nline-1\tof gzip\n");
            FileInputFormat.SetInputPaths(job, workDir);

            KeyValueTextInputFormat format = new KeyValueTextInputFormat();
            format.Configure(job);

            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // Normalize split order so part1 is always splits[0].
            FileSplit first = (FileSplit)splits[0];
            if (first.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }

            IList<Text> results = ReadSplit(format, splits[0], job);
            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", results[5].ToString());

            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }
// Example #3
        /// <summary>Test using the gzip codec with two input files.</summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipWithTwoInputs()
        {
            CompressionCodec gzip = new GzipCodec();
            localFs.Delete(workDir, true);

            FixedLengthInputFormat format = new FixedLengthInputFormat();
            JobConf job = new JobConf(defaultConf);
            FixedLengthInputFormat.SetRecordLength(job, 5);
            FileInputFormat.SetInputPaths(job, workDir);
            ReflectionUtils.SetConf(gzip, job);
            format.Configure(job);

            // Two gzipped files, each holding ten 5-byte fixed-length records.
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "one  two  threefour five six  seveneightnine ten  ");
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "ten  nine eightsevensix  five four threetwo  one  ");

            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // Normalize split order so part1 is always splits[0].
            FileSplit first = (FileSplit)splits[0];
            if (first.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }

            IList<string> results = ReadSplit(format, splits[0], job);
            NUnit.Framework.Assert.AreEqual("splits[0] length", 10, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", results[5]);

            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 10, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", results[0]);
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", results[1]);
        }
 /// <summary>
 /// Old-API record reader for fixed-length records; delegates to the new-API
 /// implementation to avoid duplicating the read logic.
 /// </summary>
 /// <exception cref="System.IO.IOException"/>
 public FixedLengthRecordReader(Configuration job, FileSplit split, int recordLength
                                )
 {
     this.recordLength = recordLength;
     // Wrap the mapreduce (new-API) reader and point it at exactly the
     // region covered by this split.
     reader = new Org.Apache.Hadoop.Mapreduce.Lib.Input.FixedLengthRecordReader(recordLength);
     reader.Initialize(job, split.GetStart(), split.GetLength(), split.GetPath());
 }
// Example #5
        /// <summary>
        /// Constructs a line-oriented reader over one file split, handling three
        /// cases: uncompressed input, non-splittable compressed input, and
        /// splittable compressed input (where the codec adjusts the split bounds).
        /// </summary>
        /// <param name="job">configuration; supplies the max-line-length limit and codec factory settings</param>
        /// <param name="split">the file region this reader covers</param>
        /// <param name="recordDelimiter">delimiter bytes passed through to the line readers; presumably null selects the default newline handling — TODO confirm</param>
        /// <exception cref="System.IO.IOException"/>
        public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter
                                )
        {
            this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            start = split.GetStart();
            end   = start + split.GetLength();
            Path file = split.GetPath();

            // Codec is chosen by file name/extension; null codec means plain text.
            compressionCodecs = new CompressionCodecFactory(job);
            codec             = compressionCodecs.GetCodec(file);
            // open the file and seek to the start of the split
            FileSystem fs = file.GetFileSystem(job);

            fileIn = fs.Open(file);
            if (IsCompressedInput())
            {
                decompressor = CodecPool.GetDecompressor(codec);
                if (codec is SplittableCompressionCodec)
                {
                    // Splittable codec: it repositions the stream itself, and the
                    // adjusted start/end replace the raw split bounds.
                    SplitCompressionInputStream cIn = ((SplittableCompressionCodec)codec).CreateInputStream
                                                          (fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                    @in          = new CompressedSplitLineReader(cIn, job, recordDelimiter);
                    start        = cIn.GetAdjustedStart();
                    end          = cIn.GetAdjustedEnd();
                    filePosition = cIn;
                }
                else
                {
                    // Non-splittable codec: the whole file is one split; no seek.
                    // take pos from compressed stream
                    @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter
                                              );
                    filePosition = fileIn;
                }
            }
            else
            {
                // Uncompressed: seek directly to the split start.
                fileIn.Seek(start);
                @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength
                                                          ());
                filePosition = fileIn;
            }
            // If this is not the first split, we always throw away first record
            // because we always (except the last split) read one extra line in
            // next() method.
            if (start != 0)
            {
                start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
            }
            this.pos = start;
        }
        /// <summary>
        /// Opens a SequenceFile reader for the split and advances it to the first
        /// sync marker at or after the split start, so reading begins on a whole
        /// record.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public SequenceFileRecordReader(Configuration conf, FileSplit split)
        {
            Path file = split.GetPath();
            this.@in = new SequenceFile.Reader(file.GetFileSystem(conf), file, conf);
            this.end = split.GetStart() + split.GetLength();
            this.conf = conf;

            // sync to start: jump to the next record boundary if the reader is
            // positioned before the split start.
            if (split.GetStart() > @in.GetPosition())
            {
                @in.Sync(split.GetStart());
            }
            this.start = @in.GetPosition();
            more = start < end;
        }
// Example #7
            /// <summary>
            /// Opens a SequenceFile reader for the split, syncs to the first
            /// record boundary at or after the split start, and prepares a
            /// value-bytes holder for raw (binary) value access.
            /// </summary>
            /// <exception cref="System.IO.IOException"/>
            public SequenceFileAsBinaryRecordReader(Configuration conf, FileSplit split)
            {
                Path file = split.GetPath();
                this.@in = new SequenceFile.Reader(file.GetFileSystem(conf), file, conf);
                this.end = split.GetStart() + split.GetLength();

                // sync to start: skip forward so reading begins on a whole record.
                if (split.GetStart() > @in.GetPosition())
                {
                    @in.Sync(split.GetStart());
                }
                this.start = @in.GetPosition();
                vbytes = @in.CreateValueBytes();
                done = start >= end;
            }
        /// <summary>
        /// Reads a prebuilt concatenated bzip2 file plus a freshly written one
        /// and verifies that every member of each archive is decoded.
        /// </summary>
        public virtual void TestBzip2()
        {
            JobConf jobConf = new JobConf(defaultConf);
            CompressionCodec bzip2 = new BZip2Codec();
            ReflectionUtils.SetConf(bzip2, jobConf);
            localFs.Delete(workDir, true);
            System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)" + ColorNormal);

            // Stage the prebuilt (correct!) concat.bz2 into the work directory.
            string fn = "concat" + bzip2.GetDefaultExtension();
            Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path fnHDFS = new Path(workDir, fn);
            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");

            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();  // extends FileInputFormat
            format.Configure(jobConf);
            // Work around the 2-byte-splits issue
            // [135 splits for a 208-byte file and a 62-byte file(!)].
            format.SetMinSplitSize(256);

            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // Normalize split order so the concat file is always splits[0].
            FileSplit first = (FileSplit)splits[0];
            if (first.GetPath().GetName().Equals("part2.txt.bz2"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }

            IList<Text> results = ReadSplit(format, splits[0], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());

            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", results[1].ToString());
        }
        /// <summary>
        /// Reads a prebuilt concatenated gzip file plus a freshly written one and
        /// verifies that every gzip member is decoded; returns early (skips) when
        /// the expected decompressor type is not available.
        /// </summary>
        public virtual void TestGzip()
        {
            JobConf jobConf = new JobConf(defaultConf);
            CompressionCodec gzip = new GzipCodec();
            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);

            // The preferred guard (assertFalse on BuiltInGzipDecompressor being the
            // decompressor type) is not compatible with the Apache/trunk instance
            // of Hudson, so use a runtime check and skip instead.
            if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
            {
                System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
                                             + gzip.GetDecompressorType() + ")" + ColorNormal);
            }
            else
            {
                Log.Warn("testGzip() skipped:  native (C/C++) libs not loaded");
                return;
            }

            // Writing the concatenated file here via gzip.createOutputStream() +
            // resetState() is buggy: it omits the 2nd/3rd gzip headers and corrupts
            // their CRCs (see HADOOP-6799). So copy a prebuilt (correct!) concat.gz
            // into the work directory instead.
            string fn = "concat" + gzip.GetDefaultExtension();
            Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path fnHDFS = new Path(workDir, fn);
            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");

            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();
            format.Configure(jobConf);

            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // Normalize split order so the concat file is always splits[0].
            FileSplit first = (FileSplit)splits[0];
            if (first.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }

            IList<Text> results = ReadSplit(format, splits[0], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());

            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }