Exemple #1
0
        /// <summary>
        /// Checks the record lengths and key positions reported by a
        /// <c>LineRecordReader</c> constructed with a <c>null</c> delimiter over
        /// uncompressed input containing "\r\n" and "\r" line endings, both when the
        /// file is read as two consecutive splits and when it is read as one split.
        /// </summary>
        /// <remarks>
        /// Every <c>NextKeyValue</c> call that is expected to produce a record is
        /// asserted, and each reader is closed once its checks are done. The key and
        /// value references are fetched once per reader; the later assertions rely
        /// on the reader updating those same instances on subsequent reads.
        /// </remarks>
        public virtual void TestUncompressedInputDefaultDelimiterPosValue()
        {
            Configuration conf      = new Configuration();
            string        inputData = "1234567890\r\n12\r\n345";
            Path          inputFile = CreateInputFile(conf, inputData);

            conf.SetInt("io.file.buffer.size", 10);
            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            FileSplit          split   = new FileSplit(inputFile, 0, 15, (string[])null);
            TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
            LineRecordReader   reader  = new LineRecordReader(null);

            reader.Initialize(split, context);
            // Get first record:"1234567890"
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue());
            LongWritable key   = reader.GetCurrentKey();
            Text         value = reader.GetCurrentValue();
            NUnit.Framework.Assert.AreEqual(10, value.GetLength());
            NUnit.Framework.Assert.AreEqual(0, key.Get());
            // Get second record:"12"
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual(2, value.GetLength());
            // Key should be 12 right after "1234567890\r\n"
            NUnit.Framework.Assert.AreEqual(12, key.Get());
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            // Key should be 16 right after "1234567890\r\n12\r\n"
            NUnit.Framework.Assert.AreEqual(16, key.Get());
            reader.Close();

            // Second split of the same file, starting at offset 15.
            split  = new FileSplit(inputFile, 15, 4, (string[])null);
            reader = new LineRecordReader(null);
            reader.Initialize(split, context);
            // The second split dropped the first record "\n"
            // Get third record:"345"
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue());
            key   = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            NUnit.Framework.Assert.AreEqual(3, value.GetLength());
            // Key should be 16 right after "1234567890\r\n12\r\n"
            NUnit.Framework.Assert.AreEqual(16, key.Get());
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            // Key should be 19 right after "1234567890\r\n12\r\n345"
            NUnit.Framework.Assert.AreEqual(19, key.Get());
            reader.Close();

            // New input: a lone "\r" immediately followed by "\r\n".
            inputData = "123456789\r\r\n";
            inputFile = CreateInputFile(conf, inputData);
            split     = new FileSplit(inputFile, 0, 12, (string[])null);
            reader    = new LineRecordReader(null);
            reader.Initialize(split, context);
            // Get first record:"123456789"
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue());
            key   = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            NUnit.Framework.Assert.AreEqual(9, value.GetLength());
            NUnit.Framework.Assert.AreEqual(0, key.Get());
            // Get second record:""
            NUnit.Framework.Assert.IsTrue(reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual(0, value.GetLength());
            // Key should be 10 right after "123456789\r"
            NUnit.Framework.Assert.AreEqual(10, key.Get());
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            // Key should be 12 right after "123456789\r\r\n"
            NUnit.Framework.Assert.AreEqual(12, key.Get());
            reader.Close();
        }
Exemple #2
0
        /// <summary>
        /// Checks the record contents and key positions reported by a
        /// <c>LineRecordReader</c> constructed with a custom multi-byte record
        /// delimiter over uncompressed input, including splits that end before the
        /// end of the file and data that contains a partial delimiter.
        /// </summary>
        /// <remarks>
        /// The final section iterates over every buffer size / split size pair.
        /// The split length is taken from <c>splitSize</c> and the read buffer size
        /// from <c>bufferSize</c> so that the two vary independently.
        /// NOTE(review): the buffer/split combinations where the two sizes differ
        /// have not been run as part of this change; confirm the expectations hold
        /// against the reader implementation.
        /// </remarks>
        public virtual void TestUncompressedInputCustomDelimiterPosValue()
        {
            Configuration conf = new Configuration();

            conf.SetInt("io.file.buffer.size", 10);
            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            string inputData = "abcdefghij++kl++mno";
            Path   inputFile = CreateInputFile(conf, inputData);
            string delimiter = "++";

            byte[]             recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
            int                splitLength          = 15;
            FileSplit          split                = new FileSplit(inputFile, 0, splitLength, (string[])null);
            TaskAttemptContext context              = new TaskAttemptContextImpl(conf, new TaskAttemptID());
            LineRecordReader   reader               = new LineRecordReader(recordDelimiterBytes);

            reader.Initialize(split, context);
            // Get first record: "abcdefghij"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            LongWritable key   = reader.GetCurrentKey();
            Text         value = reader.GetCurrentValue();

            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 10, value.GetLength());
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 0, key.Get());
            // Get second record: "kl"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 2, value.GetLength());
            // Key should be 12 right after "abcdefghij++"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 12, key.Get());
            // Get third record: "mno"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
            // Key should be 16 right after "abcdefghij++kl++"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, key.Get());
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            // Key should be 19 right after "abcdefghij++kl++mno"
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, key.Get());
            // after refresh should be empty
            key = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
            reader.Close();

            split  = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.Initialize(split, context);
            // No record is in the second split because the second split dropped
            // the first record, which was already reported by the first split.
            NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.NextKeyValue());
            key = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
            reader.Close();

            // multi char delimiter with starting part of the delimiter in the data
            inputData   = "abcd+efgh++ijk++mno";
            inputFile   = CreateInputFile(conf, inputData);
            splitLength = 5;
            split       = new FileSplit(inputFile, 0, splitLength, (string[])null);
            reader      = new LineRecordReader(recordDelimiterBytes);
            reader.Initialize(split, context);
            // Get first record: "abcd+efgh"
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            key   = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 0, key.Get());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 9, value.GetLength());
            // should have jumped over the delimiter, no record
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, key.Get());
            // after refresh should be empty
            key = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
            reader.Close();

            // next split: check for duplicate or dropped records
            split  = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.Initialize(split, context);
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            key   = reader.GetCurrentKey();
            value = reader.GetCurrentValue();
            // Get second record: "ijk" first in this split
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, key.Get());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
            // Get third record: "mno" second in this split
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, key.Get());
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
            // should be at the end of the input
            NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, key.Get());
            reader.Close();

            inputData            = "abcd|efgh|+|ij|kl|+|mno|pqr";
            inputFile            = CreateInputFile(conf, inputData);
            delimiter            = "|+|";
            recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
            // walking over the buffer and split sizes checks for proper processing
            // of the ambiguous bytes of the delimiter
            for (int bufferSize = 1; bufferSize <= inputData.Length; bufferSize++)
            {
                for (int splitSize = 1; splitSize < inputData.Length; splitSize++)
                {
                    // track where we are in the inputdata
                    int keyPosition = 0;
                    conf.SetInt("io.file.buffer.size", bufferSize);
                    // The split length must come from splitSize; building it from
                    // bufferSize would make every inner iteration identical.
                    split  = new FileSplit(inputFile, 0, splitSize, (string[])null);
                    reader = new LineRecordReader(recordDelimiterBytes);
                    reader.Initialize(split, context);
                    // Get the first record: "abcd|efgh" always possible
                    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.NextKeyValue());
                    key   = reader.GetCurrentKey();
                    value = reader.GetCurrentValue();
                    NUnit.Framework.Assert.AreEqual("abcd|efgh", value.ToString());
                    // Position should be 0 right at the start
                    NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                    // Position should be 12 right after the first "|+|"
                    keyPosition = 12;
                    // get the next record: "ij|kl" if the split/buffer allows it
                    if (reader.NextKeyValue())
                    {
                        // check the record info: "ij|kl"
                        NUnit.Framework.Assert.AreEqual("ij|kl", value.ToString());
                        NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                        // Position should be 20 after the second "|+|"
                        keyPosition = 20;
                    }
                    // get the third record: "mno|pqr" if the split/buffer allows it
                    if (reader.NextKeyValue())
                    {
                        // check the record info: "mno|pqr"
                        NUnit.Framework.Assert.AreEqual("mno|pqr", value.ToString());
                        NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                        // Position should be the end of the input
                        keyPosition = inputData.Length;
                    }
                    NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.NextKeyValue());
                    // no more records can be read we should be at the last position
                    NUnit.Framework.Assert.AreEqual("Wrong position after record read", keyPosition, key.Get());
                    // after refresh should be empty
                    key = reader.GetCurrentKey();
                    NUnit.Framework.Assert.IsNull("Unexpected key returned", key);
                    reader.Close();
                }
            }
        }