/// <summary>Test using the gzip codec with two input files.</summary>
/// <remarks>
/// The record length is configured as 5 bytes, so every record in the test
/// data is padded with spaces to exactly 5 bytes (10 records, 50 bytes per
/// file). The expected values carry the same padding.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec gzip = new GzipCodec();
    localFs.Delete(workDir, true);
    Job job = Job.GetInstance(defaultConf);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    FixedLengthInputFormat.SetRecordLength(job.GetConfiguration(), 5);
    ReflectionUtils.SetConf(gzip, job.GetConfiguration());
    FileInputFormat.SetInputPaths(job, workDir);

    // Create files with fixed length records with 5 byte long records.
    // Short words are space-padded so that each file is exactly 10 * 5 bytes.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "one  two  threefour five six  seveneightnine ten  ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "ten  nine eightsevensix  five four threetwo  one  ");

    IList<InputSplit> splits = format.GetSplits(job);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Count);

    // The split order is not fixed; make sure part1 comes first.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits.Set(0, splits[1]);
        splits.Set(1, tmp);
    }

    IList<string> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", results[5]);

    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", results[0]);
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", results[1]);
}
/// <summary>
/// Verifies that an empty input file yields exactly one split that starts
/// at offset 0, has length 0 and reports no locations.
/// </summary>
public virtual void TestForEmptyFile()
{
    Configuration conf = new Configuration();
    FileSystem fileSys = FileSystem.Get(conf);
    Path file = new Path("test" + "/file");
    try
    {
        FSDataOutputStream @out = fileSys.Create(file, true, conf.GetInt("io.file.buffer.size", 4096), (short)1, (long)1024);
        try
        {
            @out.Write(new byte[0]);
        }
        finally
        {
            // Release the stream even when the write fails.
            @out.Close();
        }

        // split it using a File input format
        TestMRCJCFileInputFormat.DummyInputFormat inFormat = new TestMRCJCFileInputFormat.DummyInputFormat(this);
        Job job = Job.GetInstance(conf);
        FileInputFormat.SetInputPaths(job, "test");
        IList<InputSplit> splits = inFormat.GetSplits(job);
        NUnit.Framework.Assert.AreEqual(1, splits.Count);

        FileSplit fileSplit = (FileSplit)splits[0];
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLocations().Length);
        NUnit.Framework.Assert.AreEqual(file.GetName(), fileSplit.GetPath().GetName());
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetStart());
        NUnit.Framework.Assert.AreEqual(0, fileSplit.GetLength());
    }
    finally
    {
        // Remove the scratch directory even when an assertion above fails,
        // so that a failing run does not leave files behind.
        fileSys.Delete(file.GetParent(), true);
    }
}
// Use the LineRecordReader to read records from the file
/// <summary>
/// Reads the file behind <paramref name="testFileUrl"/> as consecutive splits
/// of <paramref name="splitSize"/> bytes and returns every record value, as a
/// string, in the order the readers produced them.
/// </summary>
/// <param name="testFileUrl">Location of the file to read.</param>
/// <param name="splitSize">Length in bytes given to each split.</param>
/// <returns>The record values collected from all splits.</returns>
/// <exception cref="System.IO.IOException"/>
public virtual AList<string> ReadRecords(Uri testFileUrl, int splitSize)
{
    // Set up context
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    long testFileSize = testFile.Length();
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Gather the records returned by the record reader
    AList<string> records = new AList<string>();
    long offset = 0;
    while (offset < testFileSize)
    {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
        LineRecordReader reader = new LineRecordReader();
        reader.Initialize(split, context);
        try
        {
            while (reader.NextKeyValue())
            {
                records.AddItem(reader.GetCurrentValue().ToString());
            }
        }
        finally
        {
            // One reader is opened per split; close each one so that the
            // resources it holds are not leaked.
            reader.Close();
        }
        offset += splitSize;
    }
    return records;
}
/// <summary>
/// Creates a wrapper for chunk <paramref name="idx"/> of a combined split:
/// the chunk is described as a <c>FileSplit</c> and a record reader for it is
/// obtained from <paramref name="inputFormat"/>.
/// </summary>
/// <param name="inputFormat">Format that creates the underlying record reader.</param>
/// <param name="split">Combined split that contains the chunk.</param>
/// <param name="context">Task context handed to the created reader.</param>
/// <param name="idx">Index of the chunk inside <paramref name="split"/>.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
protected internal CombineFileRecordReaderWrapper(
    FileInputFormat<K, V> inputFormat,
    CombineFileSplit split,
    TaskAttemptContext context,
    int idx)
{
    // Path, offset and length are per chunk; the locations are those of the
    // whole combined split.
    this.fileSplit = new FileSplit(
        split.GetPath(idx),
        split.GetOffset(idx),
        split.GetLength(idx),
        split.GetLocations());

    this.delegate_ = inputFormat.CreateRecordReader(this.fileSplit, context);
}
/// <summary>
/// Unpacks the file split and the job configuration and forwards them to the
/// four-argument <c>Initialize</c> overload.
/// </summary>
/// <param name="genericSplit">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the configuration.</param>
/// <exception cref="System.IO.IOException"/>
public override void Initialize(InputSplit genericSplit, TaskAttemptContext context)
{
    FileSplit fileSplit = (FileSplit)genericSplit;
    Configuration conf = context.GetConfiguration();
    Path splitPath = fileSplit.GetPath();
    long splitStart = fileSplit.GetStart();
    long splitLength = fileSplit.GetLength();

    Initialize(conf, splitStart, splitLength, splitPath);
}
/// <summary>
/// Checks that reading a file as two splits yields the same number of records
/// as reading it as one split covering the whole file.
/// </summary>
/// <param name="conf">
/// Configuration for the readers; its maximum line length is overwritten with
/// <c>int.MaxValue</c>, and the optional "textinputformat.record.delimiter"
/// entry selects a custom record delimiter.
/// </param>
/// <param name="firstSplitLength">Length of the first split; must be smaller than the file size.</param>
/// <param name="testFileSize">Size of the test file in bytes.</param>
/// <param name="testFilePath">Path of the test file.</param>
/// <exception cref="System.IO.IOException"/>
private void TestSplitRecordsForFile(Configuration conf, long firstSplitLength, long testFileSize, Path testFilePath)
{
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    NUnit.Framework.Assert.IsTrue("unexpected test data at " + testFilePath, testFileSize > firstSplitLength);

    string delimiter = conf.Get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter)
    {
        recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
    }
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read the data without splitting to count the records
    int numRecordsNoSplits = CountRecordsInSplit(context, recordDelimiterBytes, testFilePath, 0, testFileSize);

    // count the records in the first split
    int numRecordsFirstSplit = CountRecordsInSplit(context, recordDelimiterBytes, testFilePath, 0, firstSplitLength);

    // count the records in the second split
    int numRecordsRemainingSplits = CountRecordsInSplit(context, recordDelimiterBytes, testFilePath,
        firstSplitLength, testFileSize - firstSplitLength);

    NUnit.Framework.Assert.AreEqual("Unexpected number of records in split ", numRecordsNoSplits,
        numRecordsFirstSplit + numRecordsRemainingSplits);
}

/// <summary>
/// Counts the records a <c>LineRecordReader</c> returns for one split of a
/// file, closing the reader even when reading fails.
/// </summary>
/// <param name="context">Task context used to initialize the reader.</param>
/// <param name="recordDelimiterBytes">Delimiter passed to the reader's constructor; may be null.</param>
/// <param name="filePath">File that contains the split.</param>
/// <param name="start">Offset of the first byte of the split.</param>
/// <param name="length">Length of the split in bytes.</param>
/// <returns>Number of times <c>NextKeyValue</c> returned true.</returns>
/// <exception cref="System.IO.IOException"/>
private static int CountRecordsInSplit(TaskAttemptContext context, byte[] recordDelimiterBytes, Path filePath, long start, long length)
{
    FileSplit split = new FileSplit(filePath, start, length, (string[])null);
    LineRecordReader reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);
    try
    {
        int count = 0;
        while (reader.NextKeyValue())
        {
            ++count;
        }
        return count;
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Opens the file behind <paramref name="genericSplit"/>, picks a line reader
/// that matches the file's compression, and moves the reader to the first
/// record that belongs to this split.
/// </summary>
/// <param name="genericSplit">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the job configuration.</param>
/// <exception cref="System.IO.IOException"/>
public override void Initialize(InputSplit genericSplit, TaskAttemptContext context)
{
    FileSplit fileSplit = (FileSplit)genericSplit;
    Configuration conf = context.GetConfiguration();
    this.maxLineLength = conf.GetInt(MaxLineLength, int.MaxValue);
    this.start = fileSplit.GetStart();
    this.end = this.start + fileSplit.GetLength();
    Path path = fileSplit.GetPath();

    // open the file and seek to the start of the split
    this.fileIn = path.GetFileSystem(conf).Open(path);
    CompressionCodec codec = new CompressionCodecFactory(conf).GetCodec(path);
    if (codec == null)
    {
        // Plain file: position the raw stream at the split start.
        this.fileIn.Seek(this.start);
        this.@in = new UncompressedSplitLineReader(this.fileIn, conf, this.recordDelimiterBytes, fileSplit.GetLength());
        this.filePosition = this.fileIn;
    }
    else
    {
        this.isCompressedInput = true;
        this.decompressor = CodecPool.GetDecompressor(codec);
        SplittableCompressionCodec splittable = codec as SplittableCompressionCodec;
        if (splittable != null)
        {
            // The codec's stream reports adjusted split boundaries; adopt
            // them as this reader's start and end.
            SplitCompressionInputStream blockStream = splittable.CreateInputStream(
                this.fileIn, this.decompressor, this.start, this.end,
                SplittableCompressionCodec.READ_MODE.Byblock);
            this.@in = new CompressedSplitLineReader(blockStream, conf, this.recordDelimiterBytes);
            this.start = blockStream.GetAdjustedStart();
            this.end = blockStream.GetAdjustedEnd();
            this.filePosition = blockStream;
        }
        else
        {
            this.@in = new SplitLineReader(
                codec.CreateInputStream(this.fileIn, this.decompressor), conf, this.recordDelimiterBytes);
            this.filePosition = this.fileIn;
        }
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (this.start != 0)
    {
        this.start += this.@in.ReadLine(new Text(), 0, MaxBytesToConsume(this.start));
    }
    this.pos = this.start;
}
/// <summary>
/// Opens a <c>SequenceFile.Reader</c> on the file behind
/// <paramref name="split"/> and records the positions where reading starts
/// and ends.
/// </summary>
/// <param name="split">The split to read; it is cast to <c>FileSplit</c>.</param>
/// <param name="context">Task context that supplies the configuration.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override void Initialize(InputSplit split, TaskAttemptContext context)
{
    FileSplit sequenceSplit = (FileSplit)split;
    this.conf = context.GetConfiguration();
    Path sequencePath = sequenceSplit.GetPath();
    this.@in = new SequenceFile.Reader(sequencePath.GetFileSystem(this.conf), sequencePath, this.conf);

    long splitStart = sequenceSplit.GetStart();
    this.end = splitStart + sequenceSplit.GetLength();

    // sync to start
    bool readerIsBeforeSplit = splitStart > this.@in.GetPosition();
    if (readerIsBeforeSplit)
    {
        this.@in.Sync(splitStart);
    }

    // The effective start is wherever the reader stands now.
    this.start = this.@in.GetPosition();
    this.more = this.start < this.end;
}
/// <summary>
/// Writes two gzip-compressed key/value text files, reads them back through
/// <c>KeyValueTextInputFormat</c> and checks the values of every record.
/// </summary>
public virtual void TestGzip()
{
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, conf);
    localFs.Delete(workDir, true);

    string firstFileContent = "line-1\tthe quick\nline-2\tbrown\nline-3\t"
        + "fox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n";
    string secondFileContent = "line-1\tthis is a test\nline-1\tof gzip\n";
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, firstFileContent);
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, secondFileContent);

    Job job = Job.GetInstance(conf);
    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    IList<InputSplit> splits = format.GetSplits(job);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Count);

    // The split order is not fixed; make sure part1 comes first.
    FileSplit leading = (FileSplit)splits[0];
    if (leading.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits.Set(0, splits[1]);
        splits.Set(1, leading);
    }

    // Values of part1.txt.gz, in file order.
    string[] expectedFirst = new string[] { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    IList<Text> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
    for (int i = 0; i < expectedFirst.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual("splits[0][" + i + "]", expectedFirst[i], results[i].ToString());
    }

    // Values of part2.txt.gz, in file order.
    string[] expectedSecond = new string[] { "this is a test", "of gzip" };
    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
    for (int i = 0; i < expectedSecond.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual("splits[1][" + i + "]", expectedSecond[i], results[i].ToString());
    }
}
/// <summary>
/// Reads the resource "testBOM.txt" with a <c>LineRecordReader</c> and checks
/// that the first record does not start with the byte order mark U+FEFF.
/// </summary>
public virtual void TestStripBOM()
{
    // the test data contains a BOM at the start of the file
    // confirm the BOM is skipped by LineRecordReader
    string utf8Bom = "\uFEFF";
    Uri testFileUrl = GetType().GetClassLoader().GetResource("testBOM.txt");
    NUnit.Framework.Assert.IsNotNull("Cannot find testBOM.txt", testFileUrl);
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    long testFileSize = testFile.Length();
    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read the data and check whether BOM is skipped
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
    LineRecordReader reader = new LineRecordReader();
    reader.Initialize(split, context);
    bool skipBOM = true;
    try
    {
        bool firstLine = true;
        while (reader.NextKeyValue())
        {
            if (firstLine)
            {
                firstLine = false;
                // The comparison must be ordinal: the culture-sensitive
                // StartsWith(string) ignores U+FEFF and would therefore
                // report a match for every line.
                if (reader.GetCurrentValue().ToString().StartsWith(utf8Bom, System.StringComparison.Ordinal))
                {
                    skipBOM = false;
                }
            }
        }
    }
    finally
    {
        reader.Close();
    }
    NUnit.Framework.Assert.IsTrue("BOM is not skipped", skipBOM);
}
/// <summary>
/// Reads a bzip2 resource to the end, closes the reader twice, and then
/// checks that ten decompressors obtained from <c>CodecPool</c> are ten
/// distinct set entries.
/// </summary>
/// <remarks>
/// NOTE(review): presumably this guards against a second Close() handing the
/// same decompressor back to the pool twice - confirm against the reader.
/// </remarks>
public virtual void TestMultipleClose()
{
    Uri resourceUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2");
    NUnit.Framework.Assert.IsNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", resourceUrl);
    FilePath resourceFile = new FilePath(resourceUrl.GetFile());
    Path resourcePath = new Path(resourceFile.GetAbsolutePath());
    long resourceSize = resourceFile.Length();

    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Read the whole file as one split and drain it.
    FileSplit wholeFile = new FileSplit(resourcePath, 0, resourceSize, null);
    LineRecordReader lineReader = new LineRecordReader();
    lineReader.Initialize(wholeFile, context);
    bool hasMore = lineReader.NextKeyValue();
    while (hasMore)
    {
        hasMore = lineReader.NextKeyValue();
    }

    // Closing twice is the point of this test.
    lineReader.Close();
    lineReader.Close();

    BZip2Codec codec = new BZip2Codec();
    codec.SetConf(conf);
    ICollection<Decompressor> borrowed = new HashSet<Decompressor>();
    int remaining = 10;
    while (remaining > 0)
    {
        borrowed.AddItem(CodecPool.GetDecompressor(codec));
        --remaining;
    }
    NUnit.Framework.Assert.AreEqual(10, borrowed.Count);
}
/// <summary>
/// Checks record lengths and key positions reported by a
/// <c>LineRecordReader</c> with the default delimiter, for uncompressed input
/// that contains CR/LF sequences and is read with a 10-byte buffer.
/// </summary>
public virtual void TestUncompressedInputDefaultDelimiterPosValue()
{
    Configuration configuration = new Configuration();
    string content = "1234567890\r\n12\r\n345";
    Path dataFile = CreateInputFile(configuration, content);
    configuration.SetInt("io.file.buffer.size", 10);
    configuration.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(configuration, new TaskAttemptID());

    // ---- first split: bytes [0, 15) ----
    FileSplit fileSplit = new FileSplit(dataFile, 0, 15, (string[])null);
    LineRecordReader lineReader = new LineRecordReader(null);
    lineReader.Initialize(fileSplit, attemptContext);
    lineReader.NextKeyValue();
    LongWritable position = lineReader.GetCurrentKey();
    Text line = lineReader.GetCurrentValue();

    // Get first record:"1234567890"
    NUnit.Framework.Assert.AreEqual(10, line.GetLength());
    NUnit.Framework.Assert.AreEqual(0, position.Get());

    lineReader.NextKeyValue();
    // Get second record:"12"
    NUnit.Framework.Assert.AreEqual(2, line.GetLength());
    // Key should be 12 right after "1234567890\r\n"
    NUnit.Framework.Assert.AreEqual(12, position.Get());

    NUnit.Framework.Assert.IsFalse(lineReader.NextKeyValue());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, position.Get());

    // ---- second split: bytes [15, 19) ----
    fileSplit = new FileSplit(dataFile, 15, 4, (string[])null);
    lineReader = new LineRecordReader(null);
    lineReader.Initialize(fileSplit, attemptContext);
    // The second split dropped the first record "\n"
    lineReader.NextKeyValue();
    position = lineReader.GetCurrentKey();
    line = lineReader.GetCurrentValue();

    // Get third record:"345"
    NUnit.Framework.Assert.AreEqual(3, line.GetLength());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, position.Get());

    NUnit.Framework.Assert.IsFalse(lineReader.NextKeyValue());
    // Key should be 19 right after "1234567890\r\n12\r\n345"
    NUnit.Framework.Assert.AreEqual(19, position.Get());

    // ---- a lone CR followed by CR LF, read as a single split ----
    content = "123456789\r\r\n";
    dataFile = CreateInputFile(configuration, content);
    fileSplit = new FileSplit(dataFile, 0, 12, (string[])null);
    lineReader = new LineRecordReader(null);
    lineReader.Initialize(fileSplit, attemptContext);
    lineReader.NextKeyValue();
    position = lineReader.GetCurrentKey();
    line = lineReader.GetCurrentValue();

    // Get first record:"123456789"
    NUnit.Framework.Assert.AreEqual(9, line.GetLength());
    NUnit.Framework.Assert.AreEqual(0, position.Get());

    lineReader.NextKeyValue();
    // Get second record:""
    NUnit.Framework.Assert.AreEqual(0, line.GetLength());
    // Key should be 10 right after "123456789\r"
    NUnit.Framework.Assert.AreEqual(10, position.Get());

    NUnit.Framework.Assert.IsFalse(lineReader.NextKeyValue());
    // Key should be 12 right after "123456789\r\r\n"
    NUnit.Framework.Assert.AreEqual(12, position.Get());
}
/// <summary>
/// Checks record values and key positions reported by a
/// <c>LineRecordReader</c> that uses a custom multi-byte record delimiter on
/// uncompressed input, including splits that end inside or next to a
/// delimiter.
/// </summary>
/// <remarks>
/// The final section walks over buffer sizes and split sizes. Each split is
/// built from the inner loop's split size, so every buffer/split combination
/// is a distinct scenario.
/// </remarks>
public virtual void TestUncompressedInputCustomDelimiterPosValue()
{
    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    string inputData = "abcdefghij++kl++mno";
    Path inputFile = CreateInputFile(conf, inputData);
    string delimiter = "++";
    byte[] recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
    int splitLength = 15;
    FileSplit split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);

    // Get first record: "abcdefghij"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    LongWritable key = reader.GetCurrentKey();
    Text value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 10, value.GetLength());
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 0, key.Get());

    // Get second record: "kl"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 2, value.GetLength());
    // Key should be 12 right after "abcdefghij++"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 12, key.Get());

    // Get third record: "mno"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
    // Key should be 16 right after "abcdefghij++kl++"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, key.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Key should be 19 right after "abcdefghij++kl++mno"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, key.Get());
    // after refresh should be empty
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
    reader.Close();

    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);
    // No record is in the second split because the second split dropped
    // the first record, which was already reported by the first split.
    NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.NextKeyValue());
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
    reader.Close();

    // multi char delimiter with starting part of the delimiter in the data
    inputData = "abcd+efgh++ijk++mno";
    inputFile = CreateInputFile(conf, inputData);
    splitLength = 5;
    split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);

    // Get first record: "abcd+efgh"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 0, key.Get());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 9, value.GetLength());

    // should have jumped over the delimiter, no record
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, key.Get());
    // after refresh should be empty
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
    reader.Close();

    // next split: check for duplicate or dropped records
    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();

    // Get second record: "ijk" first in this split
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, key.Get());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());

    // Get third record: "mno" second in this split
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, key.Get());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());

    // should be at the end of the input
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, key.Get());
    reader.Close();

    inputData = "abcd|efgh|+|ij|kl|+|mno|pqr";
    inputFile = CreateInputFile(conf, inputData);
    delimiter = "|+|";
    recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);

    // walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter
    for (int bufferSize = 1; bufferSize <= inputData.Length; bufferSize++)
    {
        // NOTE(review): context was created before this call; confirm that the
        // reader actually observes the updated buffer size through it.
        conf.SetInt("io.file.buffer.size", bufferSize);
        for (int splitSize = 1; splitSize < inputData.Length; splitSize++)
        {
            // track where we are in the inputdata
            int keyPosition = 0;
            // The split length comes from splitSize, so that each inner
            // iteration exercises a different split boundary.
            split = new FileSplit(inputFile, 0, splitSize, (string[])null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.Initialize(split, context);

            // Get the first record: "abcd|efgh" always possible
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            key = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            NUnit.Framework.Assert.IsTrue("abcd|efgh".Equals(value.ToString()));
            // Position should be 0 right at the start
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
            // Position should be 12 right after the first "|+|"
            keyPosition = 12;

            // get the next record: "ij|kl" if the split/buffer allows it
            if (reader.NextKeyValue())
            {
                // check the record info: "ij|kl"
                NUnit.Framework.Assert.IsTrue("ij|kl".Equals(value.ToString()));
                NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                // Position should be 20 after the second "|+|"
                keyPosition = 20;
            }

            // get the third record: "mno|pqr" if the split/buffer allows it
            if (reader.NextKeyValue())
            {
                // check the record info: "mno|pqr"
                NUnit.Framework.Assert.IsTrue("mno|pqr".Equals(value.ToString()));
                NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                // Position should be the end of the input
                keyPosition = inputData.Length;
            }

            NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.NextKeyValue());
            // no more records can be read we should be at the last position
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
            // after refresh should be empty
            key = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
            reader.Close();
        }
    }
}