/// <summary>
        /// Writes two gzip-compressed files of fixed-length (5 byte) records, asks
        /// <c>FixedLengthInputFormat</c> for the splits and checks that two splits come
        /// back, each of which decodes to ten records.
        /// </summary>
        /// <remarks>
        /// NUnit's <c>Assert.AreEqual</c> expects <c>(expected, actual, message)</c>.
        /// The descriptive label is therefore passed last; passing it first would make
        /// NUnit treat the label as the expected value.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestGzipWithTwoInputs()
        {
            CompressionCodec gzip = new GzipCodec();

            // Start from a clean working directory.
            localFs.Delete(workDir, true);
            Job job = Job.GetInstance(defaultConf);
            FixedLengthInputFormat format = new FixedLengthInputFormat();

            FixedLengthInputFormat.SetRecordLength(job.GetConfiguration(), 5);
            ReflectionUtils.SetConf(gzip, job.GetConfiguration());
            FileInputFormat.SetInputPaths(job, workDir);
            // Create files with fixed length records with 5 byte long records.
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "one  two  threefour five six  seveneightnine ten  "
                      );
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "ten  nine eightsevensix  five four threetwo  one  "
                      );
            IList <InputSplit> splits = format.GetSplits(job);

            NUnit.Framework.Assert.AreEqual(2, splits.Count, "compressed splits == 2");

            // The split order is not relied upon: normalise so that part1 is at
            // index 0 and part2 at index 1 before checking the contents.
            FileSplit first = (FileSplit)splits[0];

            if (first.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }
            IList <string> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual(10, results.Count, "splits[0] length");
            NUnit.Framework.Assert.AreEqual("six  ", results[5], "splits[0][5]");
            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual(10, results.Count, "splits[1] length");
            NUnit.Framework.Assert.AreEqual("ten  ", results[0], "splits[1][0]");
            NUnit.Framework.Assert.AreEqual("nine ", results[1], "splits[1][1]");
        }
Example #2
0
        /// <summary>
        /// Creates a zero-length file under <c>test</c>, computes the input splits for
        /// that directory and checks that exactly one split is produced, with no
        /// locations, start 0 and length 0.
        /// </summary>
        /// <remarks>
        /// The output stream is closed and the <c>test</c> directory is removed in
        /// <c>finally</c> blocks so that a failing write or assertion does not leave
        /// an open stream or stale files behind.
        /// </remarks>
        public virtual void TestForEmptyFile()
        {
            Configuration conf    = new Configuration();
            FileSystem    fileSys = FileSystem.Get(conf);
            Path          file    = new Path("test" + "/file");

            try
            {
                FSDataOutputStream @out = fileSys.Create(file, true, conf.GetInt("io.file.buffer.size"
                                                                                 , 4096), (short)1, (long)1024);

                try
                {
                    // Writing an empty buffer leaves the file at zero length.
                    @out.Write(new byte[0]);
                }
                finally
                {
                    @out.Close();
                }
                // split it using a File input format
                TestMRCJCFileInputFormat.DummyInputFormat inFormat = new TestMRCJCFileInputFormat.DummyInputFormat
                                                                         (this);
                Job job = Job.GetInstance(conf);

                FileInputFormat.SetInputPaths(job, "test");
                IList <InputSplit> splits = inFormat.GetSplits(job);

                NUnit.Framework.Assert.AreEqual(1, splits.Count);
                FileSplit fileSplit = (FileSplit)splits[0];

                NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLocations().Length);
                NUnit.Framework.Assert.AreEqual(file.GetName(), fileSplit.GetPath().GetName());
                NUnit.Framework.Assert.AreEqual(0, fileSplit.GetStart());
                NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLength());
            }
            finally
            {
                // Always remove the directory created for this test.
                fileSys.Delete(file.GetParent(), true);
            }
        }
        /// <summary>
        /// Unpacks the split into its path, start offset and length and forwards them,
        /// together with the job configuration, to the four-argument
        /// <c>Initialize</c> overload.
        /// </summary>
        /// <param name="genericSplit">Split to read; cast to <c>FileSplit</c>.</param>
        /// <param name="context">Task context that supplies the configuration.</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Initialize(InputSplit genericSplit, TaskAttemptContext context
                                        )
        {
            FileSplit     fileSplit = (FileSplit)genericSplit;
            Configuration conf      = context.GetConfiguration();
            Path          path      = fileSplit.GetPath();

            Initialize(conf, fileSplit.GetStart(), fileSplit.GetLength(), path);
        }
Example #4
0
        /// <summary>
        /// Opens the file behind the given split, wraps it in the line reader that
        /// matches its compression (none, splittable codec, or non-splittable codec)
        /// and records the start/end/current positions for this split.
        /// </summary>
        /// <param name="genericSplit">Split to read; cast to <c>FileSplit</c>.</param>
        /// <param name="context">Task context that supplies the configuration.</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Initialize(InputSplit genericSplit, TaskAttemptContext context
                                        )
        {
            FileSplit     fileSplit = (FileSplit)genericSplit;
            Configuration conf      = context.GetConfiguration();

            this.maxLineLength = conf.GetInt(MaxLineLength, int.MaxValue);
            start = fileSplit.GetStart();
            end   = start + fileSplit.GetLength();

            // Open the file; seeking (if any) happens in the branch below.
            Path       path       = fileSplit.GetPath();
            FileSystem fileSystem = path.GetFileSystem(conf);

            fileIn = fileSystem.Open(path);
            CompressionCodec codec = new CompressionCodecFactory(conf).GetCodec(path);

            if (codec == null)
            {
                // Plain file: jump straight to the start of the split.
                fileIn.Seek(start);
                @in = new UncompressedSplitLineReader(fileIn, conf, this.recordDelimiterBytes, fileSplit
                                                      .GetLength());
                filePosition = fileIn;
            }
            else
            {
                isCompressedInput = true;
                decompressor      = CodecPool.GetDecompressor(codec);

                SplittableCompressionCodec splittable = codec as SplittableCompressionCodec;

                if (splittable != null)
                {
                    // The codec adjusts the requested range; adopt its adjusted bounds.
                    SplitCompressionInputStream compressedIn = splittable.CreateInputStream
                                                                   (fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                    @in          = new CompressedSplitLineReader(compressedIn, conf, this.recordDelimiterBytes);
                    start        = compressedIn.GetAdjustedStart();
                    end          = compressedIn.GetAdjustedEnd();
                    filePosition = compressedIn;
                }
                else
                {
                    @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), conf, this
                                              .recordDelimiterBytes);
                    filePosition = fileIn;
                }
            }

            // Any split other than the first drops its leading record: the reader
            // of the preceding split always reads one extra line in next().
            if (start != 0)
            {
                start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
            }
            this.pos = start;
        }
        /// <summary>
        /// Opens a <c>SequenceFile.Reader</c> on the split's file, syncs it forward to
        /// the split's start offset when the reader is still before it, and records
        /// the resulting start position, the end offset and whether any data remains.
        /// </summary>
        /// <param name="split">Split to read; cast to <c>FileSplit</c>.</param>
        /// <param name="context">Task context that supplies the configuration.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        public override void Initialize(InputSplit split, TaskAttemptContext context)
        {
            FileSplit fileSplit = (FileSplit)split;

            conf = context.GetConfiguration();
            Path       sequencePath = fileSplit.GetPath();
            FileSystem fileSystem   = sequencePath.GetFileSystem(conf);

            this.@in = new SequenceFile.Reader(fileSystem, sequencePath, conf);

            long splitStart = fileSplit.GetStart();

            this.end = splitStart + fileSplit.GetLength();
            // Only sync when the reader has not yet reached the split's start.
            if (splitStart > @in.GetPosition())
            {
                @in.Sync(splitStart);
            }
            // The effective start is wherever the reader ended up after syncing.
            this.start = @in.GetPosition();
            more       = start < end;
        }
Example #6
0
        /// <summary>
        /// Writes two gzip-compressed key/value text files, asks
        /// <c>KeyValueTextInputFormat</c> for the splits and checks that two splits
        /// come back whose values match the text written after each tab.
        /// </summary>
        /// <remarks>
        /// NUnit's <c>Assert.AreEqual</c> expects <c>(expected, actual, message)</c>.
        /// The descriptive label is therefore passed last; passing it first would make
        /// NUnit compare the label with the expected value.
        /// </remarks>
        public virtual void TestGzip()
        {
            Configuration    conf = new Configuration(defaultConf);
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, conf);
            // Start from a clean working directory.
            localFs.Delete(workDir, true);
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "line-1\tthe quick\nline-2\tbrown\nline-3\t"
                      + "fox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "line-1\tthis is a test\nline-1\tof gzip\n"
                      );
            Job job = Job.GetInstance(conf);

            FileInputFormat.SetInputPaths(job, workDir);
            KeyValueTextInputFormat format = new KeyValueTextInputFormat();
            IList <InputSplit>      splits = format.GetSplits(job);

            NUnit.Framework.Assert.AreEqual(2, splits.Count, "compressed splits == 2");

            // The split order is not relied upon: normalise so that part1 is at
            // index 0 and part2 at index 1 before checking the contents.
            FileSplit first = (FileSplit)splits[0];

            if (first.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = first;
            }
            IList <Text> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual(6, results.Count, "splits[0] length");
            NUnit.Framework.Assert.AreEqual("the quick", results[0].ToString(), "splits[0][0]");
            NUnit.Framework.Assert.AreEqual("brown", results[1].ToString(), "splits[0][1]");
            NUnit.Framework.Assert.AreEqual("fox jumped", results[2].ToString(), "splits[0][2]");
            NUnit.Framework.Assert.AreEqual("over", results[3].ToString(), "splits[0][3]");
            NUnit.Framework.Assert.AreEqual(" the lazy", results[4].ToString(), "splits[0][4]");
            NUnit.Framework.Assert.AreEqual(" dog", results[5].ToString(), "splits[0][5]");
            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual(2, results.Count, "splits[1] length");
            NUnit.Framework.Assert.AreEqual("this is a test", results[0].ToString(), "splits[1][0]");
            NUnit.Framework.Assert.AreEqual("of gzip", results[1].ToString(), "splits[1][1]");
        }
Example #7
0
        /// <summary>
        /// Checks that the start offset, length and path recorded in the task's
        /// configuration match those of <c>fileSplit</c>.
        /// </summary>
        /// <param name="context">Task context that supplies the configuration.</param>
        /// <returns>
        /// <c>true</c> when all three values agree; <c>false</c> at the first mismatch.
        /// </returns>
        private bool FileSplitIsValid(TaskAttemptContext context)
        {
            Configuration jobConf = context.GetConfiguration();

            long configuredStart = jobConf.GetLong(MRJobConfig.MapInputStart, 0L);

            if (configuredStart != fileSplit.GetStart())
            {
                return(false);
            }

            // NOTE(review): the length is read through the MapInputPath key. In
            // upstream Hadoop that constant is presumably bound to the input
            // *length* property despite its name - confirm before "fixing" it.
            long configuredLength = jobConf.GetLong(MRJobConfig.MapInputPath, 0L);

            if (configuredLength != fileSplit.GetLength())
            {
                return(false);
            }

            string configuredPath = jobConf.Get(MRJobConfig.MapInputFile);

            return(fileSplit.GetPath().ToString().Equals(configuredPath));
        }