Example #1
0
        /// <summary>Test using the gzip codec with two input files.</summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipWithTwoInputs()
        {
            CompressionCodec gzip = new GzipCodec();

            localFs.Delete(workDir, true);
            FixedLengthInputFormat format = new FixedLengthInputFormat();
            JobConf job = new JobConf(defaultConf);

            FixedLengthInputFormat.SetRecordLength(job, 5);
            FileInputFormat.SetInputPaths(job, workDir);
            ReflectionUtils.SetConf(gzip, job);
            format.Configure(job);
            // Create files with fixed length records with 5 byte long records.
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "one  two  threefour five six  seveneightnine ten  "
                      );
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "ten  nine eightsevensix  five four threetwo  one  "
                      );
            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <string> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 10, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", results[5]);
            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 10, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", results[0]);
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", results[1]);
        }
        // this tests both files (testCompressThenConcat, testConcatThenCompress);
        // all should work with either native zlib or new Inflater-based decoder
        /// <exception cref="System.IO.IOException"/>
        private static void DoSingleGzipBufferSize(JobConf jConf)
        {
            TextInputFormat format = new TextInputFormat();

            format.Configure(jConf);
            // here's Nth pair of DecompressorStreams:
            InputSplit[] splits = format.GetSplits(jConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            if (tmp.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
            {
                System.Console.Out.WriteLine("  (swapping)");
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], jConf);

            NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, results.Count
                                            );
            NUnit.Framework.Assert.AreEqual("splits[0][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having"
                                            , results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[0][42]", "Tell me, does the magnetic virtue of the needles of the compasses of"
                                            , results[42].ToString());
            results = ReadSplit(format, splits[1], jConf);
            NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, results.Count
                                            );
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having"
                                            , results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][42]", "Tell me, does the magnetic virtue of the needles of the compasses of"
                                            , results[42].ToString());
        }
        public virtual void TestMultipleClose()
        {
            Uri testFileUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2"
                                                                     );

            NUnit.Framework.Assert.IsNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2"
                                             , testFileUrl);
            FilePath      testFile     = new FilePath(testFileUrl.GetFile());
            Path          testFilePath = new Path(testFile.GetAbsolutePath());
            long          testFileSize = testFile.Length();
            Configuration conf         = new Configuration();

            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            FileSplit        split  = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
            LineRecordReader reader = new LineRecordReader(conf, split);
            LongWritable     key    = new LongWritable();
            Text             value  = new Text();

            //noinspection StatementWithEmptyBody
            while (reader.Next(key, value))
            {
            }
            reader.Close();
            reader.Close();
            BZip2Codec codec = new BZip2Codec();

            codec.SetConf(conf);
            ICollection <Decompressor> decompressors = new HashSet <Decompressor>();

            for (int i = 0; i < 10; ++i)
            {
                decompressors.AddItem(CodecPool.GetDecompressor(codec));
            }
            NUnit.Framework.Assert.AreEqual(10, decompressors.Count);
        }
Example #4
0
 /// <exception cref="System.IO.IOException"/>
 public FilterRecordReader(Configuration conf, FileSplit split)
     : base(conf, split)
 {
     // instantiate filter
     filter = (SequenceFileInputFilter.Filter)ReflectionUtils.NewInstance(conf.GetClass
                                                                              (FilterClass, typeof(SequenceFileInputFilter.PercentFilter)), conf);
 }
        // Use the LineRecordReader to read records from the file
        /// <exception cref="System.IO.IOException"/>
        public virtual AList <string> ReadRecords(Uri testFileUrl, int splitSize)
        {
            // Set up context
            FilePath      testFile     = new FilePath(testFileUrl.GetFile());
            long          testFileSize = testFile.Length();
            Path          testFilePath = new Path(testFile.GetAbsolutePath());
            Configuration conf         = new Configuration();

            conf.SetInt("io.file.buffer.size", 1);
            // Gather the records returned by the record reader
            AList <string> records = new AList <string>();
            long           offset  = 0;
            LongWritable   key     = new LongWritable();
            Text           value   = new Text();

            while (offset < testFileSize)
            {
                FileSplit        split  = new FileSplit(testFilePath, offset, splitSize, (string[])null);
                LineRecordReader reader = new LineRecordReader(conf, split);
                while (reader.Next(key, value))
                {
                    records.AddItem(value.ToString());
                }
                offset += splitSize;
            }
            return(records);
        }
 /// <exception cref="System.IO.IOException"/>
 public SequenceFileAsTextRecordReader(Configuration conf, FileSplit split)
 {
     sequenceFileRecordReader = new SequenceFileRecordReader <WritableComparable, Writable
                                                              >(conf, split);
     innerKey   = sequenceFileRecordReader.CreateKey();
     innerValue = sequenceFileRecordReader.CreateValue();
 }
Example #7
0
        /// <summary>Test using the gzip codec for reading</summary>
        /// <exception cref="System.IO.IOException"/>
        public static void TestGzip()
        {
            JobConf          job  = new JobConf();
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, job);
            localFs.Delete(workDir, true);
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n"
                      );
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "line-1\tthis is a test\nline-1\tof gzip\n"
                      );
            FileInputFormat.SetInputPaths(job, workDir);
            KeyValueTextInputFormat format = new KeyValueTextInputFormat();

            format.Configure(job);
            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", results[5].ToString());
            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
                                                ());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }
 /// <exception cref="System.IO.IOException"/>
 public FixedLengthRecordReader(Configuration job, FileSplit split, int recordLength
                                )
 {
     // Make use of the new API implementation to avoid code duplication.
     this.recordLength = recordLength;
     reader            = new Org.Apache.Hadoop.Mapreduce.Lib.Input.FixedLengthRecordReader(recordLength
                                                                                           );
     reader.Initialize(job, split.GetStart(), split.GetLength(), split.GetPath());
 }
        /// <exception cref="System.IO.IOException"/>
        public KeyValueLineRecordReader(Configuration job, FileSplit split)
        {
            lineRecordReader = new LineRecordReader(job, split);
            dummyKey         = lineRecordReader.CreateKey();
            innerValue       = lineRecordReader.CreateValue();
            string sepStr = job.Get("mapreduce.input.keyvaluelinerecordreader.key.value.separator"
                                    , "\t");

            this.separator = unchecked ((byte)sepStr[0]);
        }
        public virtual void TestUncompressedInputDefaultDelimiterPosValue()
        {
            Configuration conf      = new Configuration();
            string        inputData = "1234567890\r\n12\r\n345";
            Path          inputFile = CreateInputFile(conf, inputData);

            conf.SetInt("io.file.buffer.size", 10);
            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            FileSplit        split  = new FileSplit(inputFile, 0, 15, (string[])null);
            LineRecordReader reader = new LineRecordReader(conf, split, null);
            LongWritable     key    = new LongWritable();
            Text             value  = new Text();

            reader.Next(key, value);
            // Get first record:"1234567890"
            NUnit.Framework.Assert.AreEqual(10, value.GetLength());
            // Position should be 12 right after "1234567890\r\n"
            NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
            reader.Next(key, value);
            // Get second record:"12"
            NUnit.Framework.Assert.AreEqual(2, value.GetLength());
            // Position should be 16 right after "1234567890\r\n12\r\n"
            NUnit.Framework.Assert.AreEqual(16, reader.GetPos());
            NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
            split  = new FileSplit(inputFile, 15, 4, (string[])null);
            reader = new LineRecordReader(conf, split, null);
            // The second split dropped the first record "\n"
            // The position should be 16 right after "1234567890\r\n12\r\n"
            NUnit.Framework.Assert.AreEqual(16, reader.GetPos());
            reader.Next(key, value);
            // Get third record:"345"
            NUnit.Framework.Assert.AreEqual(3, value.GetLength());
            // Position should be 19 right after "1234567890\r\n12\r\n345"
            NUnit.Framework.Assert.AreEqual(19, reader.GetPos());
            NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
            NUnit.Framework.Assert.AreEqual(19, reader.GetPos());
            inputData = "123456789\r\r\n";
            inputFile = CreateInputFile(conf, inputData);
            split     = new FileSplit(inputFile, 0, 12, (string[])null);
            reader    = new LineRecordReader(conf, split, null);
            reader.Next(key, value);
            // Get first record:"123456789"
            NUnit.Framework.Assert.AreEqual(9, value.GetLength());
            // Position should be 10 right after "123456789\r"
            NUnit.Framework.Assert.AreEqual(10, reader.GetPos());
            reader.Next(key, value);
            // Get second record:""
            NUnit.Framework.Assert.AreEqual(0, value.GetLength());
            // Position should be 12 right after "123456789\r\r\n"
            NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
            NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
            NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
        }
Example #11
0
            // Input formats
            /// <exception cref="System.IO.IOException"/>
            public virtual InputSplit[] GetSplits(JobConf job, int numSplits)
            {
                InputSplit[] result = new InputSplit[numSplits];
                Path         outDir = FileOutputFormat.GetOutputPath(job);

                for (int i = 0; i < result.Length; ++i)
                {
                    result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (string[])null
                                              );
                }
                return(result);
            }
        /// <exception cref="System.IO.IOException"/>
        private void TestSplitRecordsForFile(Configuration conf, long firstSplitLength, long
                                             testFileSize, Path testFilePath)
        {
            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            NUnit.Framework.Assert.IsTrue("unexpected test data at " + testFilePath, testFileSize
                                          > firstSplitLength);
            string delimiter = conf.Get("textinputformat.record.delimiter");

            byte[] recordDelimiterBytes = null;
            if (null != delimiter)
            {
                recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8
                                                                         );
            }
            // read the data without splitting to count the records
            FileSplit        split  = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
            LineRecordReader reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            LongWritable     key    = new LongWritable();
            Text             value  = new Text();
            int numRecordsNoSplits  = 0;

            while (reader.Next(key, value))
            {
                ++numRecordsNoSplits;
            }
            reader.Close();
            // count the records in the first split
            split  = new FileSplit(testFilePath, 0, firstSplitLength, (string[])null);
            reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            int numRecordsFirstSplit = 0;

            while (reader.Next(key, value))
            {
                ++numRecordsFirstSplit;
            }
            reader.Close();
            // count the records in the second split
            split = new FileSplit(testFilePath, firstSplitLength, testFileSize - firstSplitLength
                                  , (string[])null);
            reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            int numRecordsRemainingSplits = 0;

            while (reader.Next(key, value))
            {
                ++numRecordsRemainingSplits;
            }
            reader.Close();
            NUnit.Framework.Assert.AreEqual("Unexpected number of records in split", numRecordsNoSplits
                                            , numRecordsFirstSplit + numRecordsRemainingSplits);
        }
Example #13
0
        /// <exception cref="System.IO.IOException"/>
        public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter
                                )
        {
            this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            start = split.GetStart();
            end   = start + split.GetLength();
            Path file = split.GetPath();

            compressionCodecs = new CompressionCodecFactory(job);
            codec             = compressionCodecs.GetCodec(file);
            // open the file and seek to the start of the split
            FileSystem fs = file.GetFileSystem(job);

            fileIn = fs.Open(file);
            if (IsCompressedInput())
            {
                decompressor = CodecPool.GetDecompressor(codec);
                if (codec is SplittableCompressionCodec)
                {
                    SplitCompressionInputStream cIn = ((SplittableCompressionCodec)codec).CreateInputStream
                                                          (fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                    @in          = new CompressedSplitLineReader(cIn, job, recordDelimiter);
                    start        = cIn.GetAdjustedStart();
                    end          = cIn.GetAdjustedEnd();
                    filePosition = cIn;
                }
                else
                {
                    // take pos from compressed stream
                    @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter
                                              );
                    filePosition = fileIn;
                }
            }
            else
            {
                fileIn.Seek(start);
                @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength
                                                          ());
                filePosition = fileIn;
            }
            // If this is not the first split, we always throw away first record
            // because we always (except the last split) read one extra line in
            // next() method.
            if (start != 0)
            {
                start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
            }
            this.pos = start;
        }
        /// <exception cref="System.IO.IOException"/>
        public SequenceFileRecordReader(Configuration conf, FileSplit split)
        {
            Path       path = split.GetPath();
            FileSystem fs   = path.GetFileSystem(conf);

            this.@in  = new SequenceFile.Reader(fs, path, conf);
            this.end  = split.GetStart() + split.GetLength();
            this.conf = conf;
            if (split.GetStart() > @in.GetPosition())
            {
                @in.Sync(split.GetStart());
            }
            // sync to start
            this.start = @in.GetPosition();
            more       = start < end;
        }
Example #15
0
            /// <exception cref="System.IO.IOException"/>
            public SequenceFileAsBinaryRecordReader(Configuration conf, FileSplit split)
            {
                Path       path = split.GetPath();
                FileSystem fs   = path.GetFileSystem(conf);

                this.@in = new SequenceFile.Reader(fs, path, conf);
                this.end = split.GetStart() + split.GetLength();
                if (split.GetStart() > @in.GetPosition())
                {
                    @in.Sync(split.GetStart());
                }
                // sync to start
                this.start = @in.GetPosition();
                vbytes     = @in.CreateValueBytes();
                done       = start >= end;
            }
        public virtual void TestBzip2()
        {
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec bzip2   = new BZip2Codec();

            ReflectionUtils.SetConf(bzip2, jobConf);
            localFs.Delete(workDir, true);
            System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)"
                                         + ColorNormal);
            // copy prebuilt (correct!) version of concat.bz2 to HDFS
            string fn      = "concat" + bzip2.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n"
                      );
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            // extends FileInputFormat
            format.Configure(jobConf);
            format.SetMinSplitSize(256);
            // work around 2-byte splits issue
            // [135 splits for a 208-byte file and a 62-byte file(!)]
            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            if (tmp.GetPath().GetName().Equals("part2.txt.bz2"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
                                                ));
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
                                                ());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", results[1].ToString()
                                            );
        }
        /// <exception cref="System.Exception"/>
        public virtual void TestLocality()
        {
            JobConf job = new JobConf(conf);

            dfs = NewDFSCluster(job);
            FileSystem fs = dfs.GetFileSystem();

            System.Console.Out.WriteLine("FileSystem " + fs.GetUri());
            Path   inputDir = new Path("/foo/");
            string fileName = "part-0000";

            CreateInputs(fs, inputDir, fileName);
            // split it using a file input format
            TextInputFormat.AddInputPath(job, inputDir);
            TextInputFormat inFormat = new TextInputFormat();

            inFormat.Configure(job);
            InputSplit[] splits     = inFormat.GetSplits(job, 1);
            FileStatus   fileStatus = fs.GetFileStatus(new Path(inputDir, fileName));

            BlockLocation[] locations = fs.GetFileBlockLocations(fileStatus, 0, fileStatus.GetLen
                                                                     ());
            System.Console.Out.WriteLine("Made splits");
            // make sure that each split is a block and the locations match
            for (int i = 0; i < splits.Length; ++i)
            {
                FileSplit fileSplit = (FileSplit)splits[i];
                System.Console.Out.WriteLine("File split: " + fileSplit);
                foreach (string h in fileSplit.GetLocations())
                {
                    System.Console.Out.WriteLine("Location: " + h);
                }
                System.Console.Out.WriteLine("Block: " + locations[i]);
                NUnit.Framework.Assert.AreEqual(locations[i].GetOffset(), fileSplit.GetStart());
                NUnit.Framework.Assert.AreEqual(locations[i].GetLength(), fileSplit.GetLength());
                string[] blockLocs = locations[i].GetHosts();
                string[] splitLocs = fileSplit.GetLocations();
                NUnit.Framework.Assert.AreEqual(2, blockLocs.Length);
                NUnit.Framework.Assert.AreEqual(2, splitLocs.Length);
                NUnit.Framework.Assert.IsTrue((blockLocs[0].Equals(splitLocs[0]) && blockLocs[1].
                                               Equals(splitLocs[1])) || (blockLocs[1].Equals(splitLocs[0]) && blockLocs[0].Equals
                                                                             (splitLocs[1])));
            }
            NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles
                                            , 1, job.GetLong(FileInputFormat.NumInputFiles, 0));
        }
        public virtual void TestStripBOM()
        {
            // the test data contains a BOM at the start of the file
            // confirm the BOM is skipped by LineRecordReader
            string Utf8Bom     = "\uFEFF";
            Uri    testFileUrl = GetType().GetClassLoader().GetResource("testBOM.txt");

            NUnit.Framework.Assert.IsNotNull("Cannot find testBOM.txt", testFileUrl);
            FilePath      testFile     = new FilePath(testFileUrl.GetFile());
            Path          testFilePath = new Path(testFile.GetAbsolutePath());
            long          testFileSize = testFile.Length();
            Configuration conf         = new Configuration();

            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            // read the data and check whether BOM is skipped
            FileSplit        split  = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
            LineRecordReader reader = new LineRecordReader(conf, split);
            LongWritable     key    = new LongWritable();
            Text             value  = new Text();
            int  numRecords         = 0;
            bool firstLine          = true;
            bool skipBOM            = true;

            while (reader.Next(key, value))
            {
                if (firstLine)
                {
                    firstLine = false;
                    if (value.ToString().StartsWith(Utf8Bom))
                    {
                        skipBOM = false;
                    }
                }
                ++numRecords;
            }
            reader.Close();
            NUnit.Framework.Assert.IsTrue("BOM is not skipped", skipBOM);
        }
        public virtual void TestUncompressedInputCustomDelimiterPosValue()
        {
            Configuration conf = new Configuration();

            conf.SetInt("io.file.buffer.size", 10);
            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            string inputData = "abcdefghij++kl++mno";
            Path   inputFile = CreateInputFile(conf, inputData);
            string delimiter = "++";

            byte[] recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets
                                                                            .Utf8);
            // the first split must contain two records to make sure that it also pulls
            // in the record from the 2nd split
            int              splitLength = 15;
            FileSplit        split       = new FileSplit(inputFile, 0, splitLength, (string[])null);
            LineRecordReader reader      = new LineRecordReader(conf, split, recordDelimiterBytes);
            LongWritable     key         = new LongWritable();
            Text             value       = new Text();

            // Get first record: "abcdefghij"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 10, value.GetLength
                                                ());
            // Position should be 12 right after "abcdefghij++"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 12, reader.GetPos
                                                ());
            // Get second record: "kl"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 2, value.GetLength
                                                ());
            // Position should be 16 right after "abcdefghij++kl++"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, reader.GetPos
                                                ());
            // Get third record: "mno"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength
                                                ());
            // Position should be 19 right after "abcdefghij++kl++mno"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            reader.Close();
            // No record is in the second split because the second split will drop
            // the first record, which was already reported by the first split.
            split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string
                                                                                           [])null);
            reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            // The position should be 19 right after "abcdefghij++kl++mno" and should
            // not change
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            reader.Close();
            // multi char delimiter with starting part of the delimiter in the data
            inputData   = "abcd+efgh++ijk++mno";
            inputFile   = CreateInputFile(conf, inputData);
            splitLength = 5;
            split       = new FileSplit(inputFile, 0, splitLength, (string[])null);
            reader      = new LineRecordReader(conf, split, recordDelimiterBytes);
            // Get first record: "abcd+efgh"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, reader.GetPos
                                                ());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 9, value.GetLength
                                                ());
            // should have jumped over the delimiter, no record
            NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, reader.GetPos
                                                ());
            reader.Close();
            // next split: check for duplicate or dropped records
            split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string
                                                                                           [])null);
            reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            // Get second record: "ijk" first in this split
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, reader.GetPos
                                                ());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength
                                                ());
            // Get third record: "mno" second in this split
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                     ));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength
                                                ());
            // should be at the end of the input
            NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos
                                                ());
            reader.Close();
            inputData            = "abcd|efgh|+|ij|kl|+|mno|pqr";
            inputFile            = CreateInputFile(conf, inputData);
            delimiter            = "|+|";
            recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8
                                                                     );
            // walking over the buffer and split sizes checks for proper processing
            // of the ambiguous bytes of the delimiter
            for (int bufferSize = 1; bufferSize <= inputData.Length; bufferSize++)
            {
                for (int splitSize = 1; splitSize < inputData.Length; splitSize++)
                {
                    conf.SetInt("io.file.buffer.size", bufferSize);
                    split  = new FileSplit(inputFile, 0, bufferSize, (string[])null);
                    reader = new LineRecordReader(conf, split, recordDelimiterBytes);
                    // Get first record: "abcd|efgh" always possible
                    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value
                                                                                             ));
                    NUnit.Framework.Assert.IsTrue("abcd|efgh".Equals(value.ToString()));
                    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 9, value.GetLength
                                                        ());
                    // Position should be 12 right after "|+|"
                    int recordPos = 12;
                    NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader
                                                    .GetPos());
                    // get the next record: "ij|kl" if the split/buffer allows it
                    if (reader.Next(key, value))
                    {
                        // check the record info: "ij|kl"
                        NUnit.Framework.Assert.IsTrue("ij|kl".Equals(value.ToString()));
                        // Position should be 20 right after "|+|"
                        recordPos = 20;
                        NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader
                                                        .GetPos());
                    }
                    // get the third record: "mno|pqr" if the split/buffer allows it
                    if (reader.Next(key, value))
                    {
                        // check the record info: "mno|pqr"
                        NUnit.Framework.Assert.IsTrue("mno|pqr".Equals(value.ToString()));
                        // Position should be 27 at the end of the string now
                        recordPos = inputData.Length;
                        NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader
                                                        .GetPos());
                    }
                    // no more records can be read we should still be at the last position
                    NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value
                                                                                             ));
                    NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader
                                                    .GetPos());
                    reader.Close();
                }
            }
        }
        public virtual void TestGzip()
        {
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec gzip    = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            // preferred, but not compatible with Apache/trunk instance of Hudson:

            /*
             * assertFalse("[native (C/C++) codec]",
             * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
             * gzip.getDecompressorType()) );
             * System.out.println(COLOR_BR_RED +
             * "testGzip() using native-zlib Decompressor (" +
             * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
             */
            // alternative:
            if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
            {
                System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
                                             + gzip.GetDecompressorType() + ")" + ColorNormal);
            }
            else
            {
                Log.Warn("testGzip() skipped:  native (C/C++) libs not loaded");
                return;
            }

            /*
             *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
             *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
             *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
             *  //OutputStream out = localFs.create(fnHDFS);
             *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
             *      // can just combine those two lines, probably
             *  //GzipCodec.GzipOutputStream gzOStm =
             *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
             *      // oops, no:  this is a protected helper class; need to access
             *      //   it via createOutputStream() instead:
             *  OutputStream out = localFs.create(fnHDFS);
             *  Compressor gzCmp = gzip.createCompressor();
             *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
             *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
             *      //   ...yup, works
             *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("2nd gzip concat member\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
             *  gzOStm.close();
             *      //
             *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
             *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
             *  localFs.copyToLocalFile(fnHDFS, fnLocal);
             */
            // copy prebuilt (correct!) version of concat.gz to HDFS
            string fn      = "concat" + gzip.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"
                      );
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            format.Configure(jobConf);
            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
                                                ));
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
                                                ());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }
Example #21
0
 /// <exception cref="System.IO.IOException"/>
 public LineRecordReader(Configuration job, FileSplit split)
     : this(job, split, null)
 {
 }