/// <summary>Test using the gzip codec with two input files.</summary>
/// <remarks>
/// Writes two gzip-compressed files of fixed-length (5 byte) records, asks
/// <c>FixedLengthInputFormat</c> for the splits, orders the splits so that
/// <c>part1.txt.gz</c> comes first, and checks the records read from each split.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec gzip = new GzipCodec();
    localFs.Delete(workDir, true);
    Job job = Job.GetInstance(defaultConf);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    FixedLengthInputFormat.SetRecordLength(job.GetConfiguration(), 5);
    ReflectionUtils.SetConf(gzip, job.GetConfiguration());
    FileInputFormat.SetInputPaths(job, workDir);

    // Create files with fixed length records with 5 byte long records.
    // Every word is space-padded to exactly 5 bytes so that each file holds
    // ten whole records (50 bytes); a payload that is not a multiple of the
    // record length cannot yield the ten records asserted below.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "one  two  threefour five six  seveneightnine ten  ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "ten  nine eightsevensix  five four threetwo  one  ");

    IList<InputSplit> splits = format.GetSplits(job);
    // NUnit's AreEqual takes (expected, actual, message), in that order.
    NUnit.Framework.Assert.AreEqual(2, splits.Count, "compressed splits == 2");

    // The split order is not relied upon: make sure part1 is at index 0.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    // NOTE(review): the expected values assume ReadSplit returns each 5-byte
    // record verbatim, padding included -- confirm against ReadSplit.
    IList<string> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual(10, results.Count, "splits[0] length");
    NUnit.Framework.Assert.AreEqual("six  ", results[5], "splits[0][5]");

    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual(10, results.Count, "splits[1] length");
    NUnit.Framework.Assert.AreEqual("ten  ", results[0], "splits[1][0]");
    NUnit.Framework.Assert.AreEqual("nine ", results[1], "splits[1][1]");
}
/// <summary>
/// Checks that a zero-length input file still produces exactly one split,
/// and that this split has no locations, the file's name, start 0 and length 0.
/// </summary>
/// <remarks>
/// The file is created under the relative directory <c>test</c>; that directory
/// is removed in a <c>finally</c> block so a failing assertion does not leave
/// it behind for later runs.
/// </remarks>
public virtual void TestForEmptyFile()
{
    Configuration conf = new Configuration();
    FileSystem fileSys = FileSystem.Get(conf);
    Path file = new Path("test" + "/file");
    try
    {
        FSDataOutputStream @out = fileSys.Create(file, true, conf.GetInt("io.file.buffer.size", 4096),
            (short)1, (long)1024);
        try
        {
            @out.Write(new byte[0]);
        }
        finally
        {
            // Close even when the write fails so the stream is not leaked.
            @out.Close();
        }

        // split it using a File input format
        TestMRCJCFileInputFormat.DummyInputFormat inFormat =
            new TestMRCJCFileInputFormat.DummyInputFormat(this);
        Job job = Job.GetInstance(conf);
        FileInputFormat.SetInputPaths(job, "test");
        IList<InputSplit> splits = inFormat.GetSplits(job);
        NUnit.Framework.Assert.AreEqual(1, splits.Count);

        FileSplit fileSplit = (FileSplit)splits[0];
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLocations().Length);
        NUnit.Framework.Assert.AreEqual(file.GetName(), fileSplit.GetPath().GetName());
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetStart());
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLength());
    }
    finally
    {
        // Always remove the scratch directory, whether or not the checks passed.
        fileSys.Delete(file.GetParent(), true);
    }
}
/// <summary>
/// Unpacks the generic split as a <c>FileSplit</c> and forwards its start
/// offset, length and path, together with the task's configuration, to the
/// four-argument <c>Initialize</c> overload.
/// </summary>
/// <param name="genericSplit">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the configuration.</param>
/// <exception cref="System.IO.IOException"/>
public override void Initialize(InputSplit genericSplit, TaskAttemptContext context)
{
    FileSplit fileSplit = (FileSplit)genericSplit;
    Configuration taskConf = context.GetConfiguration();
    Path target = fileSplit.GetPath();

    Initialize(
        taskConf,
        fileSplit.GetStart(),
        fileSplit.GetLength(),
        target);
}
/// <summary>
/// Prepares this reader for the given split: records the split boundaries,
/// opens the file, chooses a line reader depending on whether a compression
/// codec is found for the path, and sets the initial read position.
/// </summary>
/// <param name="genericSplit">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the configuration.</param>
/// <exception cref="System.IO.IOException"/>
public override void Initialize(InputSplit genericSplit, TaskAttemptContext context)
{
    FileSplit fileSplit = (FileSplit)genericSplit;
    Configuration jobConf = context.GetConfiguration();
    this.maxLineLength = jobConf.GetInt(MaxLineLength, int.MaxValue);
    start = fileSplit.GetStart();
    end = start + fileSplit.GetLength();
    Path inputFile = fileSplit.GetPath();

    // Open the file first; the codec lookup below is based on the same path.
    FileSystem fileSystem = inputFile.GetFileSystem(jobConf);
    fileIn = fileSystem.Open(inputFile);
    CompressionCodec codec = new CompressionCodecFactory(jobConf).GetCodec(inputFile);

    if (codec == null)
    {
        // No codec for this path: seek straight to the start of the split.
        fileIn.Seek(start);
        @in = new UncompressedSplitLineReader(fileIn, jobConf, this.recordDelimiterBytes,
            fileSplit.GetLength());
        filePosition = fileIn;
    }
    else
    {
        isCompressedInput = true;
        decompressor = CodecPool.GetDecompressor(codec);
        SplittableCompressionCodec splittableCodec = codec as SplittableCompressionCodec;
        if (splittableCodec == null)
        {
            // Codec is not splittable: wrap the raw stream in a decompressing stream.
            @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), jobConf,
                this.recordDelimiterBytes);
            filePosition = fileIn;
        }
        else
        {
            SplitCompressionInputStream blockStream = splittableCodec.CreateInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
            @in = new CompressedSplitLineReader(blockStream, jobConf, this.recordDelimiterBytes);
            // The stream reports adjusted boundaries; use those from here on.
            start = blockStream.GetAdjustedStart();
            end = blockStream.GetAdjustedEnd();
            filePosition = blockStream;
        }
    }

    // Unless this is the first split, the first record is always discarded,
    // because every split except the last reads one extra line in the
    // next() method.
    if (start != 0)
    {
        start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
    }

    this.pos = start;
}
/// <summary>
/// Opens a <c>SequenceFile.Reader</c> on the split's file, syncs the reader
/// forward when the split starts beyond the reader's current position, and
/// records the resulting start offset, the end offset, and whether any data
/// remains between them.
/// </summary>
/// <param name="split">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the configuration.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override void Initialize(InputSplit split, TaskAttemptContext context)
{
    FileSplit sourceSplit = (FileSplit)split;
    conf = context.GetConfiguration();

    Path sequenceFile = sourceSplit.GetPath();
    this.@in = new SequenceFile.Reader(sequenceFile.GetFileSystem(conf), sequenceFile, conf);

    long splitStart = sourceSplit.GetStart();
    this.end = splitStart + sourceSplit.GetLength();

    // sync to start
    if (splitStart > @in.GetPosition())
    {
        @in.Sync(splitStart);
    }

    this.start = @in.GetPosition();
    more = start < end;
}
/// <summary>
/// Test <c>KeyValueTextInputFormat</c> with two gzip-compressed input files.
/// </summary>
/// <remarks>
/// Writes two gzip files of tab-separated lines, asks the input format for the
/// splits, orders the splits so that <c>part1.txt.gz</c> comes first, and
/// compares the values read from each split with the text after the tab on
/// each line.
/// </remarks>
public virtual void TestGzip()
{
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, conf);
    localFs.Delete(workDir, true);
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "line-1\tthe quick\nline-2\tbrown\nline-3\t"
        + "fox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "line-1\tthis is a test\nline-1\tof gzip\n");

    Job job = Job.GetInstance(conf);
    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    IList<InputSplit> splits = format.GetSplits(job);
    // NUnit's AreEqual takes (expected, actual, message), in that order.
    NUnit.Framework.Assert.AreEqual(2, splits.Count, "compressed splits == 2");

    // The split order is not relied upon: make sure part1 is at index 0.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    IList<Text> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual(6, results.Count, "splits[0] length");
    NUnit.Framework.Assert.AreEqual("the quick", results[0].ToString(), "splits[0][0]");
    NUnit.Framework.Assert.AreEqual("brown", results[1].ToString(), "splits[0][1]");
    NUnit.Framework.Assert.AreEqual("fox jumped", results[2].ToString(), "splits[0][2]");
    NUnit.Framework.Assert.AreEqual("over", results[3].ToString(), "splits[0][3]");
    NUnit.Framework.Assert.AreEqual(" the lazy", results[4].ToString(), "splits[0][4]");
    NUnit.Framework.Assert.AreEqual(" dog", results[5].ToString(), "splits[0][5]");

    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual(2, results.Count, "splits[1] length");
    NUnit.Framework.Assert.AreEqual("this is a test", results[0].ToString(), "splits[1][0]");
    NUnit.Framework.Assert.AreEqual("of gzip", results[1].ToString(), "splits[1][1]");
}
/// <summary>
/// Compares this reader's <c>fileSplit</c> with the start offset, length and
/// file path stored in the task configuration.
/// </summary>
/// <param name="context">Task context that supplies the configuration.</param>
/// <returns>
/// <c>true</c> only when the start, the length and the path string all match
/// the configured values; <c>false</c> as soon as one of them differs.
/// </returns>
private bool FileSplitIsValid(TaskAttemptContext context)
{
    Configuration taskConf = context.GetConfiguration();

    long configuredStart = taskConf.GetLong(MRJobConfig.MapInputStart, 0L);
    if (fileSplit.GetStart() == configuredStart)
    {
        // NOTE(review): the length is read from the MapInputPath key; confirm
        // that this key really holds the split length.
        long configuredLength = taskConf.GetLong(MRJobConfig.MapInputPath, 0L);
        if (fileSplit.GetLength() == configuredLength)
        {
            string configuredPath = taskConf.Get(MRJobConfig.MapInputFile);
            return fileSplit.GetPath().ToString().Equals(configuredPath);
        }
    }

    return false;
}