Exemplo n.º 1
0
        /// <summary>
        /// Writes two gzip-compressed key/value text files and verifies that
        /// <c>KeyValueTextInputFormat</c> produces one split per file with the expected contents.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void TestGzip()
        {
            const string secondFileName = "part2.txt.gz";

            JobConf          conf  = new JobConf();
            CompressionCodec codec = new GzipCodec();

            ReflectionUtils.SetConf(codec, conf);

            // Delete workDir before the fixture files are written into it.
            localFs.Delete(workDir, true);

            Path firstFile  = new Path(workDir, "part1.txt.gz");
            Path secondFile = new Path(workDir, secondFileName);

            WriteFile(localFs, firstFile, codec,
                      "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
            WriteFile(localFs, secondFile, codec,
                      "line-1\tthis is a test\nline-1\tof gzip\n");
            FileInputFormat.SetInputPaths(conf, workDir);

            KeyValueTextInputFormat inputFormat = new KeyValueTextInputFormat();

            inputFormat.Configure(conf);
            InputSplit[] fileSplits = inputFormat.GetSplits(conf, 100);

            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, fileSplits.Length);

            // The two splits may come back in either order; pick them by file name so that
            // the part1 split is always read first.
            FileSplit   leading    = (FileSplit)fileSplits[0];
            bool        reversed   = leading.GetPath().GetName().Equals(secondFileName);
            InputSplit  part1Split = reversed ? fileSplits[1] : fileSplits[0];
            InputSplit  part2Split = reversed ? fileSplits[0] : fileSplits[1];

            // The asserted entries are the text that follows the tab on each input line.
            IList <Text> values = ReadSplit(inputFormat, part1Split, conf);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, values.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", values[5].ToString());

            values = ReadSplit(inputFormat, part2Split, conf);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, values.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", values[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", values[1].ToString());
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads two gzip-compressed files of 5-byte fixed-length records through
        /// <c>FixedLengthInputFormat</c> and checks the records returned for each split.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipWithTwoInputs()
        {
            const string secondFileName = "part2.txt.gz";

            CompressionCodec codec = new GzipCodec();

            localFs.Delete(workDir, true);
            FixedLengthInputFormat inputFormat = new FixedLengthInputFormat();
            JobConf                conf        = new JobConf(defaultConf);

            // Every record is exactly 5 bytes long; the fixture strings below are laid out accordingly.
            FixedLengthInputFormat.SetRecordLength(conf, 5);
            FileInputFormat.SetInputPaths(conf, workDir);
            ReflectionUtils.SetConf(codec, conf);
            inputFormat.Configure(conf);

            // Each file holds ten 5-byte records.
            Path firstFile  = new Path(workDir, "part1.txt.gz");
            Path secondFile = new Path(workDir, secondFileName);

            WriteFile(localFs, firstFile, codec, "one  two  threefour five six  seveneightnine ten  ");
            WriteFile(localFs, secondFile, codec, "ten  nine eightsevensix  five four threetwo  one  ");

            InputSplit[] fileSplits = inputFormat.GetSplits(conf, 100);

            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, fileSplits.Length);

            // The two splits may come back in either order; select them by file name.
            FileSplit  leading    = (FileSplit)fileSplits[0];
            bool       reversed   = leading.GetPath().GetName().Equals(secondFileName);
            InputSplit part1Split = reversed ? fileSplits[1] : fileSplits[0];
            InputSplit part2Split = reversed ? fileSplits[0] : fileSplits[1];

            IList <string> records = ReadSplit(inputFormat, part1Split, conf);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 10, records.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", records[5]);

            records = ReadSplit(inputFormat, part2Split, conf);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 10, records.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", records[0]);
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", records[1]);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates an IFile through a gzip codec without appending records, opens it with
        /// <c>IFile.Reader</c>, closes the reader, and then checks the reader's checksum stream.
        /// </summary>
        public virtual void TestIFileReaderWithCodec()
        {
            Configuration configuration = new Configuration();
            FileSystem    local         = FileSystem.GetLocal(configuration);
            FileSystem    rawFs         = ((LocalFileSystem)local).GetRaw();
            Path          ifileDir      = new Path("build/test.ifile");
            Path          dataPath      = new Path(ifileDir, "data");
            DefaultCodec  gzipCodec     = new GzipCodec();

            gzipCodec.SetConf(configuration);

            // Write phase: open a writer on the raw file system and close it straight away.
            FSDataOutputStream output = rawFs.Create(dataPath);
            IFile.Writer <Text, Text> ifileWriter =
                new IFile.Writer <Text, Text>(configuration, output, typeof(Text), typeof(Text), gzipCodec, null);

            ifileWriter.Close();

            // Read phase: the reader is given the on-disk length of the file just written.
            FSDataInputStream input = rawFs.Open(dataPath);
            IFile.Reader <Text, Text> ifileReader =
                new IFile.Reader <Text, Text>(configuration, input, rawFs.GetFileStatus(dataPath).GetLen(), gzipCodec, null);

            ifileReader.Close();

            // test check sum
            // NOTE(review): the checksum stream is read after the reader was closed; this order
            // is kept exactly as in the original test.
            byte[] buffer    = new byte[100];
            int    bytesRead = ifileReader.checksumIn.ReadWithChecksum(buffer, 0, buffer.Length);

            NUnit.Framework.Assert.AreEqual(bytesRead, ifileReader.checksumIn.GetChecksum().Length);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Opens an <c>IFile.Writer</c> with a gzip codec on the raw local file system and closes it
        /// without appending any records.
        /// </summary>
        public virtual void TestIFileWriterWithCodec()
        {
            Configuration configuration = new Configuration();
            FileSystem    local         = FileSystem.GetLocal(configuration);
            FileSystem    rawFs         = ((LocalFileSystem)local).GetRaw();
            Path          ifileDir      = new Path("build/test.ifile");
            Path          dataPath      = new Path(ifileDir, "data");
            DefaultCodec  gzipCodec     = new GzipCodec();

            gzipCodec.SetConf(configuration);

            // The test passes when construction and Close() complete without throwing.
            IFile.Writer <Text, Text> ifileWriter =
                new IFile.Writer <Text, Text>(configuration, rawFs.Create(dataPath), typeof(Text), typeof(Text), gzipCodec, null);

            ifileWriter.Close();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Writes two gzip-compressed key/value text files and verifies every value that
        /// <c>KeyValueTextInputFormat</c> returns for each of the two resulting splits.
        /// </summary>
        public virtual void TestGzip()
        {
            const string secondFileName = "part2.txt.gz";

            Configuration    configuration = new Configuration(defaultConf);
            CompressionCodec codec         = new GzipCodec();

            ReflectionUtils.SetConf(codec, configuration);
            localFs.Delete(workDir, true);

            Path firstFile  = new Path(workDir, "part1.txt.gz");
            Path secondFile = new Path(workDir, secondFileName);

            WriteFile(localFs, firstFile, codec,
                      "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
            WriteFile(localFs, secondFile, codec,
                      "line-1\tthis is a test\nline-1\tof gzip\n");

            Job job = Job.GetInstance(configuration);

            FileInputFormat.SetInputPaths(job, workDir);
            KeyValueTextInputFormat inputFormat = new KeyValueTextInputFormat();
            IList <InputSplit>      fileSplits  = inputFormat.GetSplits(job);

            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, fileSplits.Count);

            // The two splits may come back in either order; select them by file name so the
            // part1 split is always checked first.
            FileSplit  leading    = (FileSplit)fileSplits[0];
            bool       reversed   = leading.GetPath().GetName().Equals(secondFileName);
            InputSplit part1Split = reversed ? fileSplits[1] : fileSplits[0];
            InputSplit part2Split = reversed ? fileSplits[0] : fileSplits[1];

            // Expected values are the text following the tab on each input line.
            string[] expectedPart1 = new string[] { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
            string[] expectedPart2 = new string[] { "this is a test", "of gzip" };

            IList <Text> values = ReadSplit(inputFormat, part1Split, job);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, values.Count);
            for (int i = 0; i < expectedPart1.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("splits[0][" + i + "]", expectedPart1[i], values[i].ToString());
            }

            values = ReadSplit(inputFormat, part2Split, job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, values.Count);
            for (int i = 0; i < expectedPart2.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("splits[1][" + i + "]", expectedPart2[i], values[i].ToString());
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Writes two gzip-compressed text files and verifies that <c>CombineTextInputFormat</c>
        /// returns a single split containing the lines of both files, in either file order.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        public virtual void TestGzip()
        {
            Configuration    configuration = new Configuration(defaultConf);
            CompressionCodec codec         = new GzipCodec();

            ReflectionUtils.SetConf(codec, configuration);
            localFs.Delete(workDir, true);

            Path firstFile  = new Path(workDir, "part1.txt.gz");
            Path secondFile = new Path(workDir, "part2.txt.gz");

            WriteFile(localFs, firstFile, codec, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
            WriteFile(localFs, secondFile, codec, "this is a test\nof gzip\n");

            Job job = Job.GetInstance(configuration);

            FileInputFormat.SetInputPaths(job, workDir);
            CombineTextInputFormat inputFormat = new CombineTextInputFormat();
            IList <InputSplit>     fileSplits  = inputFormat.GetSplits(job);

            NUnit.Framework.Assert.AreEqual("compressed splits == 1", 1, fileSplits.Count);

            IList <Text> lines = ReadSplit(inputFormat, fileSplits[0], job);

            // 6 lines from part1 plus 2 lines from part2.
            NUnit.Framework.Assert.AreEqual("splits[0] length", 8, lines.Count);

            string[] part1Lines = new string[] { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
            string[] part2Lines = new string[] { "this is a test", "of gzip" };
            string   firstLine  = lines[0].ToString();

            // Either file may appear first inside the combined split; the first line tells
            // us which order to hand to TestResults.
            if (firstLine.Equals(part1Lines[0]))
            {
                TestResults(lines, part1Lines, part2Lines);
            }
            else if (firstLine.Equals(part2Lines[0]))
            {
                TestResults(lines, part2Lines, part1Lines);
            }
            else
            {
                NUnit.Framework.Assert.Fail("unexpected first token!");
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Writes an empty gzip-compressed file and verifies that <c>TextInputFormat</c> still
        /// returns one split for it and that reading that split yields no records.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipEmpty()
        {
            JobConf          conf  = new JobConf(defaultConf);
            CompressionCodec codec = new GzipCodec();

            ReflectionUtils.SetConf(codec, conf);
            localFs.Delete(workDir, true);

            // The only input is a compressed file written from an empty string.
            Path emptyFile = new Path(workDir, "empty.gz");

            WriteFile(localFs, emptyFile, codec, string.Empty);
            FileInputFormat.SetInputPaths(conf, workDir);

            TextInputFormat inputFormat = new TextInputFormat();

            inputFormat.Configure(conf);
            InputSplit[] fileSplits = inputFormat.GetSplits(conf, 100);

            NUnit.Framework.Assert.AreEqual(
                "Compressed files of length 0 are not returned from FileInputFormat.getSplits().",
                1, fileSplits.Length);

            IList <Text> lines = ReadSplit(inputFormat, fileSplits[0], conf);

            NUnit.Framework.Assert.AreEqual("Compressed empty file length == 0", 0, lines.Count);
        }
        /// <summary>
        /// Disables native libraries, checks that the gzip codec then reports
        /// <c>BuiltInGzipDecompressor</c>, and reads a prebuilt concatenated gzip file line by
        /// line, verifying its total byte and line counts.
        /// </summary>
        /// <remarks>
        /// The two fixture files are looked up in the directory named by the
        /// <c>test.concat.data</c> property (falling back to <c>/tmp</c>). The local input
        /// streams are closed even when an assertion or a read fails.
        /// </remarks>
        public virtual void TestBuiltInGzipDecompressor()
        {
            // NOTE:  This fails on RHEL4 with "java.io.IOException: header crc mismatch"
            //        due to buggy version of zlib (1.2.1.2) included.
            JobConf jobConf = new JobConf(defaultConf);

            jobConf.SetBoolean("io.native.lib.available", false);
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]", typeof(BuiltInGzipDecompressor
                                                                                ), gzip.GetDecompressorType());
            System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using"
                                         + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType() + ")"
                                         + ColorNormal);
            // copy single-member test file to HDFS
            string fn1      = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
            Path   fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
            Path   fnHDFS1  = new Path(workDir, fn1);

            localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);
            // copy multiple-member test file to HDFS
            // (actually in "seekable gzip" format, a la JIRA PIG-42)
            string fn2      = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
            Path   fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
            Path   fnHDFS2  = new Path(workDir, fn2);

            localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);
            FileInputFormat.SetInputPaths(jobConf, workDir);

            int totalBytes = 0;
            int lineNum    = 0;

            // here's first pair of DecompressorStreams:
            FileInputStream in1 = new FileInputStream(fnLocal1.ToString());

            try
            {
                FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
                LineReader      @in = null;

                try
                {
                    // in1 is opened only for this size check; it is never read.
                    NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
                    // w/hdr CRC
                    NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available());

                    CompressionInputStream cin2 = gzip.CreateInputStream(in2);

                    @in = new LineReader(cin2);
                    Text @out = new Text();
                    int  numBytes;

                    // Count lines and bytes until ReadLine reports that nothing more was read.
                    while ((numBytes = @in.ReadLine(@out)) > 0)
                    {
                        ++lineNum;
                        totalBytes += numBytes;
                    }
                }
                finally
                {
                    // Close through the reader once it exists (as the original did); if we
                    // failed before wrapping in2, close the raw stream instead.
                    if (@in != null)
                    {
                        @in.Close();
                    }
                    else
                    {
                        in2.Close();
                    }
                }
            }
            finally
            {
                // The original never closed in1, leaking the file handle.
                in1.Close();
            }

            NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file"
                                            , 5346, totalBytes);
            NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file"
                                            , 84, lineNum);
            // test BuiltInGzipDecompressor with lots of different input-buffer sizes
            DoMultipleGzipBufferSizes(jobConf, false);
            // test GzipZlibDecompressor (native), just to be sure
            // (FIXME?  could move this call to testGzip(), but would need filename
            // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
            DoMultipleGzipBufferSizes(jobConf, true);
        }
        /// <summary>
        /// Parses the gzip header of a prebuilt <c>concat</c> fixture file by hand and inflates
        /// the start of its first member with a raw <c>Inflater</c>, printing the result.
        /// </summary>
        /// <remarks>
        /// The fixture is looked up in the directory named by the <c>test.concat.data</c>
        /// property (falling back to <c>/tmp</c>). Header fields are consumed in the order
        /// FEXTRA, FNAME, FCOMMENT, FHCRC; the header CRC16 is skipped, not verified. The
        /// input stream is closed even when an assertion fails.
        /// </remarks>
        /// <exception cref="System.IO.IOException">
        /// Thrown when inflating fails; the underlying exception is kept as the inner exception.
        /// </exception>
        public virtual void TestPrototypeInflaterGzip()
        {
            CompressionCodec gzip = new GzipCodec();

            // used only for file extension
            localFs.Delete(workDir, true);
            // localFs = FileSystem instance
            System.Console.Out.WriteLine(ColorBrBlue + "testPrototypeInflaterGzip() using " +
                                         "non-native/Java Inflater and manual gzip header/trailer parsing" + ColorNormal
                                         );
            // copy prebuilt (correct!) version of concat.gz to HDFS
            string fn      = "concat" + gzip.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            FileInputStream @in = new FileInputStream(fnLocal.ToString());

            try
            {
                NUnit.Framework.Assert.AreEqual("concat bytes available", 148, @in.Available());
                // should wrap all of this header-reading stuff in a running-CRC wrapper
                // (did so in BuiltInGzipDecompressor; see below)
                byte[] compressedBuf = new byte[256];
                int    numBytesRead  = @in.Read(compressedBuf, 0, 10);

                // Fixed 10-byte header: magic bytes 0x1f 0x8b, then the compression method (8).
                NUnit.Framework.Assert.AreEqual("header bytes read", 10, numBytesRead);
                NUnit.Framework.Assert.AreEqual("1st byte", 0x1f, compressedBuf[0] & 0xff);
                NUnit.Framework.Assert.AreEqual("2nd byte", 0x8b, compressedBuf[1] & 0xff);
                NUnit.Framework.Assert.AreEqual("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);
                byte flags = unchecked ((byte)(compressedBuf[3] & 0xff));

                if ((flags & 0x04) != 0)
                {
                    // FEXTRA: 2-byte little-endian length followed by that many bytes to skip
                    numBytesRead = @in.Read(compressedBuf, 0, 2);
                    NUnit.Framework.Assert.AreEqual("XLEN bytes read", 2, numBytesRead);
                    int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;

                    @in.Skip(xlen);
                }
                if ((flags & 0x08) != 0)
                {
                    // FNAME: zero-terminated string
                    while ((numBytesRead = @in.Read()) != 0)
                    {
                        NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading filename", numBytesRead
                                                       == -1);
                    }
                }
                if ((flags & 0x10) != 0)
                {
                    // FCOMMENT: zero-terminated string
                    while ((numBytesRead = @in.Read()) != 0)
                    {
                        NUnit.Framework.Assert.IsFalse("unexpected end-of-file while reading comment", numBytesRead
                                                       == -1);
                    }
                }
                // reserved flag bits must all be clear
                NUnit.Framework.Assert.IsTrue("reserved bits are set??", (flags & 0xe0) == 0);
                if ((flags & 0x02) != 0)
                {
                    // FHCRC: consume the 2-byte header CRC; its value is not checked here
                    numBytesRead = @in.Read(compressedBuf, 0, 2);
                    NUnit.Framework.Assert.AreEqual("CRC16 bytes read", 2, numBytesRead);
                }
                // ready to go!  next bytes should be start of deflated stream, suitable
                // for Inflater
                numBytesRead = @in.Read(compressedBuf);
                // Guard against end-of-file so a negative count never reaches SetInput.
                NUnit.Framework.Assert.IsTrue("no deflated data after gzip header", numBytesRead > 0);
                // Inflater docs refer to a "dummy byte":  no clue what that's about;
                // appears to work fine without one
                byte[]   uncompressedBuf = new byte[256];
                Inflater inflater        = new Inflater(true);

                inflater.SetInput(compressedBuf, 0, numBytesRead);
                try
                {
                    int    numBytesUncompressed = inflater.Inflate(uncompressedBuf);
                    string outString            = Sharpen.Runtime.GetStringForBytes(uncompressedBuf, 0, numBytesUncompressed
                                                                                    , "UTF-8");
                    System.Console.Out.WriteLine("uncompressed data of first gzip member = [" + outString
                                                 + "]");
                }
                catch (SharpZipBaseException ex)
                {
                    // Keep the original exception as InnerException so its stack trace is not lost.
                    throw new IOException(ex.Message, ex);
                }
            }
            finally
            {
                @in.Close();
            }
        }
        /// <summary>
        /// Reads a prebuilt concatenated gzip file plus a freshly written gzip file through
        /// <c>TextInputFormat</c> and checks the lines of both splits. Intended to exercise the
        /// native gzip decompressor.
        /// </summary>
        /// <remarks>
        /// The test logs a warning and returns early when the codec reports
        /// <c>BuiltInGzipDecompressor</c>, i.e. when the native decompressor is not in use.
        /// The original condition was inverted: it printed "using native-zlib Decompressor"
        /// in exactly that case and skipped the test otherwise, contradicting both messages
        /// and the commented-out assertion below. The <c>concat</c> fixture is looked up in the
        /// directory named by the <c>test.concat.data</c> property (falling back to <c>/tmp</c>).
        /// </remarks>
        public virtual void TestGzip()
        {
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec gzip    = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            // preferred, but not compatible with Apache/trunk instance of Hudson:

            /*
             * assertFalse("[native (C/C++) codec]",
             * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
             * gzip.getDecompressorType()) );
             * System.out.println(COLOR_BR_RED +
             * "testGzip() using native-zlib Decompressor (" +
             * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
             */
            // alternative: skip (rather than fail) when the codec fell back to the
            // built-in decompressor, which is what it reports without native libs.
            if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
            {
                Log.Warn("testGzip() skipped:  native (C/C++) libs not loaded");
                return;
            }
            System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
                                         + gzip.GetDecompressorType() + ")" + ColorNormal);

            /*
             *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
             *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
             *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
             *  //OutputStream out = localFs.create(fnHDFS);
             *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
             *      // can just combine those two lines, probably
             *  //GzipCodec.GzipOutputStream gzOStm =
             *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
             *      // oops, no:  this is a protected helper class; need to access
             *      //   it via createOutputStream() instead:
             *  OutputStream out = localFs.create(fnHDFS);
             *  Compressor gzCmp = gzip.createCompressor();
             *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
             *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
             *      //   ...yup, works
             *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("2nd gzip concat member\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
             *  gzOStm.close();
             *      //
             *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
             *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
             *  localFs.copyToLocalFile(fnHDFS, fnLocal);
             */
            // copy prebuilt (correct!) version of concat.gz to HDFS
            string fn      = "concat" + gzip.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"
                      );
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            format.Configure(jobConf);
            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // The two splits may come back in either order; make sure the concat file's
            // split is first and part2.txt.gz is second.
            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
                                                ));
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
                                                ());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }
Exemplo n.º 11
0
        /// <summary>
        /// Runs the shared partial-record test with a gzip codec, i.e. with a compressed input
        /// file whose last record is incomplete.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestPartialRecordCompressedIn()
        {
            // All of the work happens in RunPartialRecordTest; this test only chooses the codec.
            RunPartialRecordTest(new GzipCodec());
        }