        // This is also called from the old FixedLengthRecordReader API implementation
        /// <exception cref="System.IO.IOException"/>
        public virtual void Initialize(Configuration job, long splitStart, long splitLength,
                                       Path file)
        {
            start = splitStart;
            end   = start + splitLength;
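            // If the split starts in the middle of a record, that record belongs
            // to the previous split; compute how many bytes to skip to reach the
            // next record boundary.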
            long partialRecordLength = start % recordLength;
            long numBytesToSkip      = 0;

            if (partialRecordLength != 0)
            {
                numBytesToSkip = recordLength - partialRecordLength;
            }
            // open the file and seek to the start of the split
            FileSystem fs = file.GetFileSystem(job);

            fileIn = fs.Open(file);
            CompressionCodec codec = new CompressionCodecFactory(job).GetCodec(file);

            if (null != codec)
            {
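                // Compressed input is never split, so this reader consumes the
                // whole stream; the record count is unknown until EOF.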
                isCompressedInput = true;
                decompressor      = CodecPool.GetDecompressor(codec);
                CompressionInputStream cIn = codec.CreateInputStream(fileIn, decompressor);
                filePosition = cIn;
                inputStream  = cIn;
                numRecordsRemainingInSplit = long.MaxValue;
                Log.Info("Compressed input; cannot compute number of records in the split");
            }
            else
            {
                fileIn.Seek(start);
                filePosition = fileIn;
                inputStream  = fileIn;
                long splitSize = end - start - numBytesToSkip;
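                // Round up: a record that straddles the split end still belongs
                // to this split and is read past 'end'.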
                numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
                if (numRecordsRemainingInSplit < 0)
                {
                    numRecordsRemainingInSplit = 0;
                }
                Log.Info("Expecting " + numRecordsRemainingInSplit + " records each with a length of "
                         + recordLength + " bytes in the split with an effective size of " + splitSize +
                         " bytes");
            }
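            // Skip the partial record at the head of the split; it is read by
            // the previous split.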
            if (numBytesToSkip != 0)
            {
                start += inputStream.Skip(numBytesToSkip);
            }
            this.pos = start;
        }

        /// <exception cref="System.IO.IOException"/>
        public virtual void TestBuiltInGzipDecompressor()
        {
            // NOTE:  This fails on RHEL4 with "java.io.IOException: header crc mismatch"
            //        due to buggy version of zlib (1.2.1.2) included.
            JobConf jobConf = new JobConf(defaultConf);

            jobConf.SetBoolean("io.native.lib.available", false);
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]",
                                            typeof(BuiltInGzipDecompressor), gzip.GetDecompressorType());
            System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using"
                                         + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType() + ")"
                                         + ColorNormal);
            // copy single-member test file to HDFS
            string fn1      = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
            Path   fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
            Path   fnHDFS1  = new Path(workDir, fn1);

            localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);
            // copy multiple-member test file to HDFS
            // (actually in "seekable gzip" format, a la JIRA PIG-42)
            string fn2      = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
            Path   fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
            Path   fnHDFS2  = new Path(workDir, fn2);

            localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);
            FileInputFormat.SetInputPaths(jobConf, workDir);
            // here's first pair of DecompressorStreams:
            FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
            FileInputStream in2 = new FileInputStream(fnLocal2.ToString());

            NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
            NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available());
            // w/hdr CRC
            CompressionInputStream cin2 = gzip.CreateInputStream(in2);
            LineReader             @in  = new LineReader(cin2);
            Text @out = new Text();
            int  numBytes;
            int  totalBytes = 0;
            int  lineNum    = 0;

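            // Count lines and bytes; a correct decompressor keeps reading past
            // the first gzip member into the concatenated ones.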
            while ((numBytes = @in.ReadLine(@out)) > 0)
            {
                ++lineNum;
                totalBytes += numBytes;
            }
            @in.Close();
            NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file",
                                            5346, totalBytes);
            NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file",
                                            84, lineNum);
            // test BuiltInGzipDecompressor with lots of different input-buffer sizes
            DoMultipleGzipBufferSizes(jobConf, false);
            // test GzipZlibDecompressor (native), just to be sure
            // (FIXME?  could move this call to testGzip(), but would need filename
            // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
            DoMultipleGzipBufferSizes(jobConf, true);
        }

        /// <exception cref="System.IO.IOException"/>
        public virtual void TestCompress()
        {
            JobConf job = new JobConf();

            job.Set(JobContext.TaskAttemptId, attempt);
            job.Set(FileOutputFormat.Compress, "true");
            FileOutputFormat.SetOutputPath(job, workDir.GetParent().GetParent());
            FileOutputFormat.SetWorkOutputPath(job, workDir);
            FileSystem fs = workDir.GetFileSystem(job);

            if (!fs.Mkdirs(workDir))
            {
                NUnit.Framework.Assert.Fail("Failed to create output directory");
            }
            string file = "test_compress.txt";
            // A reporter that does nothing
            Reporter reporter = Reporter.Null;
            TextOutputFormat<object, object> theOutputFormat = new TextOutputFormat<object, object>();
            RecordWriter<object, object> theRecordWriter =
                theOutputFormat.GetRecordWriter(localFs, job, file, reporter);

            Org.Apache.Hadoop.IO.Text key1 = new Org.Apache.Hadoop.IO.Text("key1");
            Org.Apache.Hadoop.IO.Text key2 = new Org.Apache.Hadoop.IO.Text("key2");
            Org.Apache.Hadoop.IO.Text val1 = new Org.Apache.Hadoop.IO.Text("val1");
            Org.Apache.Hadoop.IO.Text val2 = new Org.Apache.Hadoop.IO.Text("val2");
            NullWritable nullWritable      = NullWritable.Get();

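            // LineRecordWriter treats null and NullWritable alike: a null field
            // is omitted from the line, and a record whose key and value are
            // both null produces no output at all.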
            try
            {
                theRecordWriter.Write(key1, val1);
                theRecordWriter.Write(null, nullWritable);
                theRecordWriter.Write(null, val1);
                theRecordWriter.Write(nullWritable, val2);
                theRecordWriter.Write(key2, nullWritable);
                theRecordWriter.Write(key1, null);
                theRecordWriter.Write(null, null);
                theRecordWriter.Write(key2, val2);
            }
            finally
            {
                theRecordWriter.Close(reporter);
            }
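            // Six lines are expected: the two writes where both key and value
            // were null (or NullWritable) produced no output.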
            StringBuilder expectedOutput = new StringBuilder();

            expectedOutput.Append(key1).Append("\t").Append(val1).Append("\n");
            expectedOutput.Append(val1).Append("\n");
            expectedOutput.Append(val2).Append("\n");
            expectedOutput.Append(key2).Append("\n");
            expectedOutput.Append(key1).Append("\n");
            expectedOutput.Append(key2).Append("\t").Append(val2).Append("\n");
            DefaultCodec codec = new DefaultCodec();

            codec.SetConf(job);
            Path                   expectedFile = new Path(workDir, file + codec.GetDefaultExtension());
            FileInputStream        istream      = new FileInputStream(expectedFile.ToString());
            CompressionInputStream cistream     = codec.CreateInputStream(istream);
            LineReader             reader       = new LineReader(cistream);
            string                 output       = string.Empty;

            Org.Apache.Hadoop.IO.Text @out = new Org.Apache.Hadoop.IO.Text();
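            // Read the compressed file back through the codec and rebuild the
            // text line by line for comparison.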
            while (reader.ReadLine(@out) > 0)
            {
                output += @out;
                output += "\n";
            }
            reader.Close();
            NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), output);
        }