/// <summary>
/// Checks the key (position) and value length reported by a
/// <c>LineRecordReader</c> constructed with a <c>null</c> record delimiter
/// over uncompressed input, including a split boundary that falls inside
/// a "\r\n" sequence and an input ending in "\r\r\n".
/// </summary>
public virtual void TestUncompressedInputDefaultDelimiterPosValue()
{
    Configuration conf = new Configuration();
    string inputData = "1234567890\r\n12\r\n345";
    Path inputFile = CreateInputFile(conf, inputData);
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // ---- First split: offset 0, length 15 ----
    FileSplit split = new FileSplit(inputFile, 0, 15, (string[])null);
    LineRecordReader reader = new LineRecordReader(null);
    reader.Initialize(split, context);

    // Get first record: "1234567890"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    // The assertions below keep using these same key/value references after
    // further NextKeyValue() calls instead of fetching them again.
    LongWritable key = reader.GetCurrentKey();
    Text value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(10, value.GetLength());
    NUnit.Framework.Assert.AreEqual(0, key.Get());

    // Get second record: "12"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    NUnit.Framework.Assert.AreEqual(2, value.GetLength());
    // Key should be 12 right after "1234567890\r\n"
    NUnit.Framework.Assert.AreEqual(12, key.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, key.Get());
    // Release the reader before replacing it (matches the sibling test).
    reader.Close();

    // ---- Second split: offset 15, length 4 ----
    split = new FileSplit(inputFile, 15, 4, (string[])null);
    reader = new LineRecordReader(null);
    reader.Initialize(split, context);

    // The second split dropped the first record "\n"
    // Get third record: "345"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(3, value.GetLength());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, key.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Key should be 19 right after "1234567890\r\n12\r\n345"
    NUnit.Framework.Assert.AreEqual(19, key.Get());
    reader.Close();

    // ---- New input ending in "\r\r\n", read as a single split ----
    inputData = "123456789\r\r\n";
    inputFile = CreateInputFile(conf, inputData);
    split = new FileSplit(inputFile, 0, 12, (string[])null);
    reader = new LineRecordReader(null);
    reader.Initialize(split, context);

    // Get first record: "123456789"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(9, value.GetLength());
    NUnit.Framework.Assert.AreEqual(0, key.Get());

    // Get second record: ""
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    NUnit.Framework.Assert.AreEqual(0, value.GetLength());
    // Key should be 10 right after "123456789\r"
    NUnit.Framework.Assert.AreEqual(10, key.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Key should be 12 right after "123456789\r\r\n"
    NUnit.Framework.Assert.AreEqual(12, key.Get());
    reader.Close();
}
/// <summary>
/// Checks the key (position) and value reported by a
/// <c>LineRecordReader</c> constructed with a custom multi-byte record
/// delimiter over uncompressed input: a split boundary inside the
/// delimiter, data containing a prefix of the delimiter, and a sweep over
/// buffer sizes and split sizes for the delimiter "|+|".
/// </summary>
public virtual void TestUncompressedInputCustomDelimiterPosValue()
{
    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    string inputData = "abcdefghij++kl++mno";
    Path inputFile = CreateInputFile(conf, inputData);
    string delimiter = "++";
    byte[] recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
    int splitLength = 15;
    FileSplit split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);

    // NOTE: NUnit assertions take the failure message as the LAST argument.

    // Get first record: "abcdefghij"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    // The assertions below keep using these same key/value references after
    // further NextKeyValue() calls instead of fetching them again.
    LongWritable key = reader.GetCurrentKey();
    Text value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(10, value.GetLength(), "Wrong length for record value");
    NUnit.Framework.Assert.AreEqual(0, key.Get(), "Wrong position after record read");

    // Get second record: "kl"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    NUnit.Framework.Assert.AreEqual(2, value.GetLength(), "Wrong length for record value");
    // Key should be 12 right after "abcdefghij++"
    NUnit.Framework.Assert.AreEqual(12, key.Get(), "Wrong position after record read");

    // Get third record: "mno"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    NUnit.Framework.Assert.AreEqual(3, value.GetLength(), "Wrong length for record value");
    // Key should be 16 right after "abcdefghij++kl++"
    NUnit.Framework.Assert.AreEqual(16, key.Get(), "Wrong position after record read");

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Key should be 19 right after "abcdefghij++kl++mno"
    NUnit.Framework.Assert.AreEqual(19, key.Get(), "Wrong position after record read");
    // after refresh should be empty
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(key, "Unexpected key returned");
    reader.Close();

    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);
    // No record is in the second split because the second split dropped
    // the first record, which was already reported by the first split.
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue(), "Unexpected record returned");
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(key, "Unexpected key returned");
    reader.Close();

    // multi char delimiter with starting part of the delimiter in the data
    inputData = "abcd+efgh++ijk++mno";
    inputFile = CreateInputFile(conf, inputData);
    splitLength = 5;
    split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);

    // Get first record: "abcd+efgh"
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(0, key.Get(), "Wrong position after record read");
    NUnit.Framework.Assert.AreEqual(9, value.GetLength(), "Wrong length for record value");

    // should have jumped over the delimiter, no record
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(11, key.Get(), "Wrong position after record read");
    // after refresh should be empty
    key = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(key, "Unexpected key returned");
    reader.Close();

    // next split: check for duplicate or dropped records
    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.Initialize(split, context);

    // Get second record: "ijk" first in this split
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    key = reader.GetCurrentKey();
    value = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(11, key.Get(), "Wrong position after record read");
    NUnit.Framework.Assert.AreEqual(3, value.GetLength(), "Wrong length for record value");

    // Get third record: "mno" second in this split
    NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
    NUnit.Framework.Assert.AreEqual(16, key.Get(), "Wrong position after record read");
    NUnit.Framework.Assert.AreEqual(3, value.GetLength(), "Wrong length for record value");

    // should be at the end of the input
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(19, key.Get(), "Wrong position after record read");
    reader.Close();

    inputData = "abcd|efgh|+|ij|kl|+|mno|pqr";
    inputFile = CreateInputFile(conf, inputData);
    delimiter = "|+|";
    recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);

    // walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter
    for (int bufferSize = 1; bufferSize <= inputData.Length; bufferSize++)
    {
        // The buffer size only changes with the outer loop.
        conf.SetInt("io.file.buffer.size", bufferSize);
        for (int splitSize = 1; splitSize < inputData.Length; splitSize++)
        {
            // track where we are in the inputdata
            int keyPosition = 0;
            // The split starts at offset 0 and is splitSize bytes long, so that
            // buffer size and split size really vary independently.
            split = new FileSplit(inputFile, 0, splitSize, (string[])null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.Initialize(split, context);

            // Get the first record: "abcd|efgh" always possible
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue(), "Expected record got nothing");
            key = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            NUnit.Framework.Assert.AreEqual("abcd|efgh", value.ToString());
            // Position should be 0 right at the start
            NUnit.Framework.Assert.AreEqual(keyPosition, key.Get(), "Wrong position after record read");
            // Position should be 12 right after the first "|+|"
            keyPosition = 12;

            // get the next record: "ij|kl" if the split/buffer allows it
            if (reader.NextKeyValue())
            {
                // check the record info: "ij|kl"
                NUnit.Framework.Assert.AreEqual("ij|kl", value.ToString());
                NUnit.Framework.Assert.AreEqual(keyPosition, key.Get(), "Wrong position after record read");
                // Position should be 20 after the second "|+|"
                keyPosition = 20;
            }

            // get the third record: "mno|pqr" if the split/buffer allows it
            if (reader.NextKeyValue())
            {
                // check the record info: "mno|pqr"
                NUnit.Framework.Assert.AreEqual("mno|pqr", value.ToString());
                NUnit.Framework.Assert.AreEqual(keyPosition, key.Get(), "Wrong position after record read");
                // Position should be the end of the input
                keyPosition = inputData.Length;
            }

            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue(), "Unexpected record returned");
            // no more records can be read we should be at the last position
            NUnit.Framework.Assert.AreEqual(keyPosition, key.Get(), "Wrong position after record read");
            // after refresh should be empty
            key = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull(key, "Unexpected key returned");
            reader.Close();
        }
    }
}