/// <summary>
/// Reads two gzip-compressed inputs through <c>FixedLengthInputFormat</c> and
/// checks that each file produces one split holding ten 5-byte records.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec gzip = new GzipCodec();
    localFs.Delete(workDir, true);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    JobConf job = new JobConf(defaultConf);
    FixedLengthInputFormat.SetRecordLength(job, 5);
    FileInputFormat.SetInputPaths(job, workDir);
    ReflectionUtils.SetConf(gzip, job);
    format.Configure(job);

    // Every record must be exactly 5 bytes, matching the record length set
    // above, so short words are space-padded. Each payload is 10 * 5 = 50 bytes.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "one  two  threefour five six  seveneightnine ten  ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "ten  nine eightsevensix  five four threetwo  one  ");

    InputSplit[] splits = format.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Normalise the order so that splits[0] is part1 and splits[1] is part2.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    IList<string> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", results[5]);

    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", results[0]);
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", results[1]);
}
/// <summary>
/// Reads both concatenated-gzip inputs (testCompressThenConcat and
/// testConcatThenCompress) through <c>TextInputFormat</c> and checks the line
/// count plus two sample lines of each.
/// </summary>
/// <remarks>
/// All checks should work with either native zlib or the Inflater-based decoder.
/// </remarks>
/// <param name="jConf">job configuration used to compute and read the splits</param>
/// <exception cref="System.IO.IOException"/>
private static void DoSingleGzipBufferSize(JobConf jConf)
{
    const string ExpectedFirstLine =
        "Call me Ishmael. Some years ago--never mind how long precisely--having";
    const string ExpectedLine42 =
        "Tell me, does the magnetic virtue of the needles of the compasses of";

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jConf);

    // here's Nth pair of DecompressorStreams:
    InputSplit[] splits = inputFormat.GetSplits(jConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Keep testCompressThenConcat in slot 1 regardless of listing order.
    FileSplit leading = (FileSplit)splits[0];
    bool wrongOrder = leading.GetPath().GetName().Equals("testCompressThenConcat.txt.gz");
    if (wrongOrder)
    {
        System.Console.Out.WriteLine(" (swapping)");
        splits[0] = splits[1];
        splits[1] = leading;
    }

    IList<Text> lines = ReadSplit(inputFormat, splits[0], jConf);
    NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][0]", ExpectedFirstLine, lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[0][42]", ExpectedLine42, lines[42].ToString());

    lines = ReadSplit(inputFormat, splits[1], jConf);
    NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", ExpectedFirstLine, lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][42]", ExpectedLine42, lines[42].ToString());
}
/// <summary>
/// Drains a <c>LineRecordReader</c> over a bzip2 resource, closes it twice, and
/// then checks that ten decompressors obtained from <c>CodecPool</c> end up as
/// ten separate entries in a set.
/// </summary>
public virtual void TestMultipleClose()
{
    const int PoolRequests = 10;

    Uri resourceUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2");
    NUnit.Framework.Assert.IsNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", resourceUrl);

    FilePath localFile = new FilePath(resourceUrl.GetFile());
    Path localPath = new Path(localFile.GetAbsolutePath());
    long fileLength = localFile.Length();

    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    // One split spanning the whole file.
    FileSplit wholeFile = new FileSplit(localPath, 0, fileLength, (string[])null);
    LineRecordReader reader = new LineRecordReader(conf, wholeFile);
    LongWritable key = new LongWritable();
    Text value = new Text();

    // Consume every record; the contents are not inspected.
    bool hasMore = reader.Next(key, value);
    while (hasMore)
    {
        hasMore = reader.Next(key, value);
    }

    // Close twice on purpose: this is the scenario under test.
    reader.Close();
    reader.Close();

    BZip2Codec codec = new BZip2Codec();
    codec.SetConf(conf);
    ICollection<Decompressor> decompressors = new HashSet<Decompressor>();
    int taken = 0;
    while (taken < PoolRequests)
    {
        decompressors.AddItem(CodecPool.GetDecompressor(codec));
        ++taken;
    }

    NUnit.Framework.Assert.AreEqual(PoolRequests, decompressors.Count);
}
/// <summary>
/// Creates the reader on top of the base reader and instantiates the filter
/// class configured under <c>FilterClass</c>, defaulting to
/// <c>SequenceFileInputFilter.PercentFilter</c>.
/// </summary>
/// <param name="conf">configuration that names the filter class and is passed to the new instance</param>
/// <param name="split">split forwarded to the base constructor</param>
/// <exception cref="System.IO.IOException"/>
public FilterRecordReader(Configuration conf, FileSplit split)
    : base(conf, split)
{
    // Resolve the configured filter type first, then build it reflectively.
    var filterType = conf.GetClass(FilterClass, typeof(SequenceFileInputFilter.PercentFilter));
    object instance = ReflectionUtils.NewInstance(filterType, conf);
    filter = (SequenceFileInputFilter.Filter)instance;
}
/// <summary>
/// Reads a file with <c>LineRecordReader</c>, one split of <paramref name="splitSize"/>
/// bytes at a time, and returns every record in the order it was read.
/// </summary>
/// <param name="testFileUrl">URL of the local file to read</param>
/// <param name="splitSize">length in bytes of each consecutive split; must be positive</param>
/// <returns>the text of all records returned across all splits</returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// if <paramref name="splitSize"/> is not positive (the offset would never advance)
/// </exception>
/// <exception cref="System.IO.IOException"/>
public virtual AList<string> ReadRecords(Uri testFileUrl, int splitSize)
{
    if (splitSize <= 0)
    {
        throw new System.ArgumentOutOfRangeException("splitSize", "splitSize must be positive");
    }

    // Set up context
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    long testFileSize = testFile.Length();
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    Configuration conf = new Configuration();
    // A 1-byte buffer forces a refill on every byte read.
    conf.SetInt("io.file.buffer.size", 1);

    // Gather the records returned by the record reader
    AList<string> records = new AList<string>();
    long offset = 0;
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (offset < testFileSize)
    {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, (string[])null);
        LineRecordReader reader = new LineRecordReader(conf, split);
        try
        {
            while (reader.Next(key, value))
            {
                records.AddItem(value.ToString());
            }
        }
        finally
        {
            // Release the reader for this split even if reading fails.
            reader.Close();
        }
        offset += splitSize;
    }
    return records;
}
/// <summary>
/// Wraps a <c>SequenceFileRecordReader</c> over the given split and pre-creates
/// the key and value objects used when reading from it.
/// </summary>
/// <param name="conf">configuration passed to the wrapped reader</param>
/// <param name="split">split passed to the wrapped reader</param>
/// <exception cref="System.IO.IOException"/>
public SequenceFileAsTextRecordReader(Configuration conf, FileSplit split)
{
    SequenceFileRecordReader<WritableComparable, Writable> wrapped =
        new SequenceFileRecordReader<WritableComparable, Writable>(conf, split);

    this.sequenceFileRecordReader = wrapped;
    this.innerKey = wrapped.CreateKey();
    this.innerValue = wrapped.CreateValue();
}
/// <summary>
/// Writes two gzip-compressed key/value text files and checks that
/// <c>KeyValueTextInputFormat</c> produces one split per file with the expected values.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void TestGzip()
{
    JobConf job = new JobConf();
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, job);
    localFs.Delete(workDir, true);

    Path firstInput = new Path(workDir, "part1.txt.gz");
    WriteFile(localFs, firstInput, gzip,
        "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    Path secondInput = new Path(workDir, "part2.txt.gz");
    WriteFile(localFs, secondInput, gzip,
        "line-1\tthis is a test\nline-1\tof gzip\n");

    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat inputFormat = new KeyValueTextInputFormat();
    inputFormat.Configure(job);

    InputSplit[] splits = inputFormat.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Make sure part1 is in slot 0 whatever order the splits came back in.
    FileSplit head = (FileSplit)splits[0];
    bool part2First = head.GetPath().GetName().Equals("part2.txt.gz");
    if (part2First)
    {
        splits[0] = splits[1];
        splits[1] = head;
    }

    IList<Text> values = ReadSplit(inputFormat, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, values.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", values[5].ToString());

    values = ReadSplit(inputFormat, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, values.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", values[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", values[1].ToString());
}
/// <summary>
/// Creates a fixed-length record reader that delegates to the
/// <c>Org.Apache.Hadoop.Mapreduce.Lib.Input.FixedLengthRecordReader</c>
/// implementation, initialised with the split's start, length and path.
/// </summary>
/// <param name="job">configuration passed to the delegate's <c>Initialize</c></param>
/// <param name="split">split whose start, length and path are read</param>
/// <param name="recordLength">record length stored on this reader and passed to the delegate</param>
/// <exception cref="System.IO.IOException"/>
public FixedLengthRecordReader(Configuration job, FileSplit split, int recordLength)
{
    this.recordLength = recordLength;

    // Make use of the new API implementation to avoid code duplication.
    reader = new Org.Apache.Hadoop.Mapreduce.Lib.Input.FixedLengthRecordReader(recordLength);

    long splitStart = split.GetStart();
    long splitLength = split.GetLength();
    Path splitPath = split.GetPath();
    reader.Initialize(job, splitStart, splitLength, splitPath);
}
/// <summary>
/// Creates a key/value reader on top of a <c>LineRecordReader</c> and reads the
/// key/value separator from the configuration (default: tab).
/// </summary>
/// <param name="job">configuration holding the separator setting</param>
/// <param name="split">split passed to the underlying line reader</param>
/// <exception cref="System.IO.IOException"/>
public KeyValueLineRecordReader(Configuration job, FileSplit split)
{
    const string SeparatorKey = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";

    lineRecordReader = new LineRecordReader(job, split);
    dummyKey = lineRecordReader.CreateKey();
    innerValue = lineRecordReader.CreateValue();

    string configuredSeparator = job.Get(SeparatorKey, "\t");
    // Only the first character is used, truncated to a single byte.
    char firstChar = configuredSeparator[0];
    this.separator = unchecked((byte)firstChar);
}
/// <summary>
/// Checks the record lengths and the positions reported by
/// <c>LineRecordReader.GetPos()</c> on uncompressed input with the default
/// delimiter, including a split boundary inside a "\r\n" pair and a "\r\r\n" sequence.
/// </summary>
public virtual void TestUncompressedInputDefaultDelimiterPosValue()
{
    Configuration conf = new Configuration();
    string inputData = "1234567890\r\n12\r\n345";
    Path inputFile = CreateInputFile(conf, inputData);
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    // First split: bytes [0, 15) - ends between the "\r" and "\n" after "12".
    FileSplit split = new FileSplit(inputFile, 0, 15, (string[])null);
    LineRecordReader reader = new LineRecordReader(conf, split, null);
    LongWritable key = new LongWritable();
    Text value = new Text();
    // Get first record: "1234567890"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(10, value.GetLength());
    // Position should be 12 right after "1234567890\r\n"
    NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
    // Get second record: "12"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(2, value.GetLength());
    // Position should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, reader.GetPos());
    NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
    reader.Close();

    // Second split: bytes [15, 19).
    split = new FileSplit(inputFile, 15, 4, (string[])null);
    reader = new LineRecordReader(conf, split, null);
    // The second split dropped the first record "\n"
    // The position should be 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, reader.GetPos());
    // Get third record: "345"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(3, value.GetLength());
    // Position should be 19 right after "1234567890\r\n12\r\n345"
    NUnit.Framework.Assert.AreEqual(19, reader.GetPos());
    NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(19, reader.GetPos());
    reader.Close();

    // "\r" immediately followed by "\r\n": expect "123456789" and then an empty record.
    inputData = "123456789\r\r\n";
    inputFile = CreateInputFile(conf, inputData);
    split = new FileSplit(inputFile, 0, 12, (string[])null);
    reader = new LineRecordReader(conf, split, null);
    // Get first record: "123456789"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(9, value.GetLength());
    // Position should be 10 right after "123456789\r"
    NUnit.Framework.Assert.AreEqual(10, reader.GetPos());
    // Get second record: ""
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(0, value.GetLength());
    // Position should be 12 right after "123456789\r\r\n"
    NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
    NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual(12, reader.GetPos());
    reader.Close();
}
// Input formats
/// <summary>
/// Builds <paramref name="numSplits"/> placeholder splits named
/// "dummy-split-0", "dummy-split-1", ... under the job's output path.
/// </summary>
/// <param name="job">job whose output path is the parent directory of the split paths</param>
/// <param name="numSplits">number of splits to create</param>
/// <returns>an array of <c>FileSplit</c>s, each with start 0 and length 1</returns>
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf job, int numSplits)
{
    InputSplit[] dummySplits = new InputSplit[numSplits];
    Path parentDir = FileOutputFormat.GetOutputPath(job);

    int index = 0;
    while (index < dummySplits.Length)
    {
        Path splitPath = new Path(parentDir, "dummy-split-" + index);
        dummySplits[index] = new FileSplit(splitPath, 0, 1, (string[])null);
        ++index;
    }

    return dummySplits;
}
/// <summary>
/// Checks that reading a file as two splits returns the same number of records
/// as reading it as a single split.
/// </summary>
/// <param name="conf">configuration; may carry "textinputformat.record.delimiter"</param>
/// <param name="firstSplitLength">length of the first split; must be smaller than <paramref name="testFileSize"/></param>
/// <param name="testFileSize">total size of the file in bytes</param>
/// <param name="testFilePath">path of the file to read</param>
/// <exception cref="System.IO.IOException"/>
private void TestSplitRecordsForFile(Configuration conf, long firstSplitLength, long testFileSize, Path testFilePath)
{
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    NUnit.Framework.Assert.IsTrue("unexpected test data at " + testFilePath,
        testFileSize > firstSplitLength);

    // A null delimiter selects the reader's default line handling.
    string delimiter = conf.Get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter)
    {
        recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
    }

    // read the data without splitting to count the records
    int numRecordsNoSplits = CountRecordsInSplit(conf,
        new FileSplit(testFilePath, 0, testFileSize, (string[])null), recordDelimiterBytes);

    // count the records in the first split
    int numRecordsFirstSplit = CountRecordsInSplit(conf,
        new FileSplit(testFilePath, 0, firstSplitLength, (string[])null), recordDelimiterBytes);

    // count the records in the second split
    int numRecordsRemainingSplits = CountRecordsInSplit(conf,
        new FileSplit(testFilePath, firstSplitLength, testFileSize - firstSplitLength, (string[])null),
        recordDelimiterBytes);

    NUnit.Framework.Assert.AreEqual("Unexpected number of records in split",
        numRecordsNoSplits, numRecordsFirstSplit + numRecordsRemainingSplits);
}

/// <summary>
/// Counts the records a <c>LineRecordReader</c> returns for one split, closing the reader afterwards.
/// </summary>
private static int CountRecordsInSplit(Configuration conf, FileSplit split, byte[] recordDelimiterBytes)
{
    LineRecordReader reader = new LineRecordReader(conf, split, recordDelimiterBytes);
    try
    {
        LongWritable key = new LongWritable();
        Text value = new Text();
        int count = 0;
        while (reader.Next(key, value))
        {
            ++count;
        }
        return count;
    }
    finally
    {
        // Close even when Next throws so the underlying stream is not leaked.
        reader.Close();
    }
}
/// <summary>
/// Opens the file behind <paramref name="split"/> and prepares a line reader
/// for it, choosing the reader type by whether the input is compressed and
/// whether its codec is a <c>SplittableCompressionCodec</c>.
/// </summary>
/// <param name="job">configuration (max line length, codecs, file system)</param>
/// <param name="split">the portion of the file to read</param>
/// <param name="recordDelimiter">delimiter bytes handed to the line reader; other code in this file passes null here</param>
/// <exception cref="System.IO.IOException"/>
public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter)
{
    this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    start = split.GetStart();
    end = start + split.GetLength();
    Path splitPath = split.GetPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.GetCodec(splitPath);

    // open the file and seek to the start of the split
    FileSystem fileSystem = splitPath.GetFileSystem(job);
    fileIn = fileSystem.Open(splitPath);

    if (!IsCompressedInput())
    {
        // Plain file: seek straight to the split start.
        fileIn.Seek(start);
        @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength());
        filePosition = fileIn;
    }
    else
    {
        decompressor = CodecPool.GetDecompressor(codec);
        SplittableCompressionCodec splittable = codec as SplittableCompressionCodec;
        if (splittable != null)
        {
            SplitCompressionInputStream splitStream = splittable.CreateInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
            @in = new CompressedSplitLineReader(splitStream, job, recordDelimiter);
            // The stream reports adjusted boundaries; use them from here on.
            start = splitStream.GetAdjustedStart();
            end = splitStream.GetAdjustedEnd();
            // take pos from compressed stream
            filePosition = splitStream;
        }
        else
        {
            @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    bool firstSplit = start == 0;
    if (!firstSplit)
    {
        Text discarded = new Text();
        start += @in.ReadLine(discarded, 0, MaxBytesToConsume(start));
    }
    this.pos = start;
}
/// <summary>
/// Opens a <c>SequenceFile.Reader</c> on the split's file, syncs forward to the
/// split start when the reader is positioned before it, and records the
/// resulting start and end positions.
/// </summary>
/// <param name="conf">configuration used to resolve the file system and open the reader</param>
/// <param name="split">the portion of the sequence file to read</param>
/// <exception cref="System.IO.IOException"/>
public SequenceFileRecordReader(Configuration conf, FileSplit split)
{
    Path filePath = split.GetPath();
    this.@in = new SequenceFile.Reader(filePath.GetFileSystem(conf), filePath, conf);

    long splitStart = split.GetStart();
    this.end = splitStart + split.GetLength();
    this.conf = conf;

    // sync to start
    bool beforeSplit = splitStart > @in.GetPosition();
    if (beforeSplit)
    {
        @in.Sync(splitStart);
    }

    this.start = @in.GetPosition();
    // There is something to read only if the start position is before the end.
    more = start < end;
}
/// <summary>
/// Opens a <c>SequenceFile.Reader</c> on the split's file, syncs forward to the
/// split start when needed, allocates the value-bytes holder, and marks the
/// reader done when the start position is at or past the split end.
/// </summary>
/// <param name="conf">configuration used to resolve the file system and open the reader</param>
/// <param name="split">the portion of the sequence file to read</param>
/// <exception cref="System.IO.IOException"/>
public SequenceFileAsBinaryRecordReader(Configuration conf, FileSplit split)
{
    Path filePath = split.GetPath();
    this.@in = new SequenceFile.Reader(filePath.GetFileSystem(conf), filePath, conf);

    long splitStart = split.GetStart();
    this.end = splitStart + split.GetLength();

    // sync to start
    bool beforeSplit = splitStart > @in.GetPosition();
    if (beforeSplit)
    {
        @in.Sync(splitStart);
    }

    this.start = @in.GetPosition();
    vbytes = @in.CreateValueBytes();
    done = start >= end;
}
/// <summary>
/// Reads a prebuilt concatenated bzip2 file and a freshly written bzip2 file
/// through <c>TextInputFormat</c> and checks the line counts and sample lines.
/// </summary>
public virtual void TestBzip2()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.SetConf(bzip2, jobConf);
    localFs.Delete(workDir, true);

    System.Console.Out.WriteLine(ColorBrCyan
        + "testBzip2() using non-native CBZip2InputStream (presumably)"
        + ColorNormal);

    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    string concatName = "concat" + bzip2.GetDefaultExtension();
    Path prebuiltSource = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), concatName);
    Path concatTarget = new Path(workDir, concatName);
    localFs.CopyFromLocalFile(prebuiltSource, concatTarget);

    Path secondInput = new Path(workDir, "part2.txt.bz2");
    WriteFile(localFs, secondInput, bzip2, "this is a test\nof bzip2\n");

    FileInputFormat.SetInputPaths(jobConf, workDir);
    // TextInputFormat extends FileInputFormat
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jobConf);
    // work around 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    inputFormat.SetMinSplitSize(256);

    InputSplit[] splits = inputFormat.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Put the concatenated file in slot 0 and part2 in slot 1.
    FileSplit head = (FileSplit)splits[0];
    bool part2First = head.GetPath().GetName().Equals("part2.txt.bz2");
    if (part2First)
    {
        splits[0] = splits[1];
        splits[1] = head;
    }

    IList<Text> lines = ReadSplit(inputFormat, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());

    lines = ReadSplit(inputFormat, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", lines[1].ToString());
}
/// <summary>
/// Splits a file stored on a DFS cluster with <c>TextInputFormat</c> and checks
/// that each split matches the block at the same index in offset, length and
/// its two hosts, and that the input-file counter in the job is 1.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestLocality()
{
    JobConf job = new JobConf(conf);
    dfs = NewDFSCluster(job);
    FileSystem fs = dfs.GetFileSystem();
    System.Console.Out.WriteLine("FileSystem " + fs.GetUri());

    Path inputDir = new Path("/foo/");
    string fileName = "part-0000";
    CreateInputs(fs, inputDir, fileName);

    // split it using a file input format
    TextInputFormat.AddInputPath(job, inputDir);
    TextInputFormat inFormat = new TextInputFormat();
    inFormat.Configure(job);
    InputSplit[] splits = inFormat.GetSplits(job, 1);

    FileStatus fileStatus = fs.GetFileStatus(new Path(inputDir, fileName));
    BlockLocation[] locations = fs.GetFileBlockLocations(fileStatus, 0, fileStatus.GetLen());
    System.Console.Out.WriteLine("Made splits");

    // make sure that each split is a block and the locations match
    int index = 0;
    foreach (InputSplit rawSplit in splits)
    {
        FileSplit fileSplit = (FileSplit)rawSplit;
        System.Console.Out.WriteLine("File split: " + fileSplit);
        foreach (string host in fileSplit.GetLocations())
        {
            System.Console.Out.WriteLine("Location: " + host);
        }

        BlockLocation block = locations[index];
        System.Console.Out.WriteLine("Block: " + block);
        NUnit.Framework.Assert.AreEqual(block.GetOffset(), fileSplit.GetStart());
        NUnit.Framework.Assert.AreEqual(block.GetLength(), fileSplit.GetLength());

        string[] blockLocs = block.GetHosts();
        string[] splitLocs = fileSplit.GetLocations();
        NUnit.Framework.Assert.AreEqual(2, blockLocs.Length);
        NUnit.Framework.Assert.AreEqual(2, splitLocs.Length);

        // The two hosts must be the same pair, in either order.
        bool sameOrder = blockLocs[0].Equals(splitLocs[0]) && blockLocs[1].Equals(splitLocs[1]);
        bool hostsMatch = sameOrder
            || (blockLocs[1].Equals(splitLocs[0]) && blockLocs[0].Equals(splitLocs[1]));
        NUnit.Framework.Assert.IsTrue(hostsMatch);

        ++index;
    }

    NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles,
        1, job.GetLong(FileInputFormat.NumInputFiles, 0));
}
/// <summary>
/// Confirms that <c>LineRecordReader</c> skips the byte order mark at the start
/// of the test resource: the first record returned must not begin with U+FEFF.
/// </summary>
public virtual void TestStripBOM()
{
    // the test data contains a BOM at the start of the file
    // confirm the BOM is skipped by LineRecordReader
    string Utf8Bom = "\uFEFF";
    Uri testFileUrl = GetType().GetClassLoader().GetResource("testBOM.txt");
    NUnit.Framework.Assert.IsNotNull("Cannot find testBOM.txt", testFileUrl);
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    long testFileSize = testFile.Length();
    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    // read the data and check whether BOM is skipped
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = new LongWritable();
    Text value = new Text();
    bool firstLine = true;
    bool skipBOM = true;
    try
    {
        while (reader.Next(key, value))
        {
            if (firstLine)
            {
                firstLine = false;
                // Ordinal comparison is required: a culture-sensitive StartsWith
                // treats U+FEFF as ignorable and can report a match for any
                // string, which would make this check fail spuriously.
                if (value.ToString().StartsWith(Utf8Bom, System.StringComparison.Ordinal))
                {
                    skipBOM = false;
                }
            }
        }
    }
    finally
    {
        reader.Close();
    }
    NUnit.Framework.Assert.IsTrue("BOM is not skipped", skipBOM);
}
/// <summary>
/// Checks record contents and the positions reported by
/// <c>LineRecordReader.GetPos()</c> on uncompressed input with multi-character
/// custom delimiters, including delimiters that straddle split and buffer
/// boundaries and data that contains a prefix of the delimiter.
/// </summary>
public virtual void TestUncompressedInputCustomDelimiterPosValue()
{
    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    string inputData = "abcdefghij++kl++mno";
    Path inputFile = CreateInputFile(conf, inputData);
    string delimiter = "++";
    byte[] recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);

    // the first split must contain two records to make sure that it also pulls
    // in the record from the 2nd split
    int splitLength = 15;
    FileSplit split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    LineRecordReader reader = new LineRecordReader(conf, split, recordDelimiterBytes);
    LongWritable key = new LongWritable();
    Text value = new Text();
    // Get first record: "abcdefghij"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 10, value.GetLength());
    // Position should be 12 right after "abcdefghij++"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 12, reader.GetPos());
    // Get second record: "kl"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 2, value.GetLength());
    // Position should be 16 right after "abcdefghij++kl++"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, reader.GetPos());
    // Get third record: "mno"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
    // Position should be 19 right after "abcdefghij++kl++mno"
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    reader.Close();

    // No record is in the second split because the second split will drop
    // the first record, which was already reported by the first split.
    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(conf, split, recordDelimiterBytes);
    // The position should be 19 right after "abcdefghij++kl++mno" and should
    // not change
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    reader.Close();

    // multi char delimiter with starting part of the delimiter in the data
    inputData = "abcd+efgh++ijk++mno";
    inputFile = CreateInputFile(conf, inputData);
    splitLength = 5;
    split = new FileSplit(inputFile, 0, splitLength, (string[])null);
    reader = new LineRecordReader(conf, split, recordDelimiterBytes);
    // Get first record: "abcd+efgh"
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, reader.GetPos());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 9, value.GetLength());
    // should have jumped over the delimiter, no record
    NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 11, reader.GetPos());
    reader.Close();

    // next split: check for duplicate or dropped records
    split = new FileSplit(inputFile, splitLength, inputData.Length - splitLength, (string[])null);
    reader = new LineRecordReader(conf, split, recordDelimiterBytes);
    // Get second record: "ijk" first in this split
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 16, reader.GetPos());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
    // Get third record: "mno" second in this split
    NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    NUnit.Framework.Assert.AreEqual("Wrong length for record value", 3, value.GetLength());
    // should be at the end of the input
    NUnit.Framework.Assert.IsFalse(reader.Next(key, value));
    NUnit.Framework.Assert.AreEqual("Wrong position after record read", 19, reader.GetPos());
    reader.Close();

    inputData = "abcd|efgh|+|ij|kl|+|mno|pqr";
    inputFile = CreateInputFile(conf, inputData);
    delimiter = "|+|";
    recordDelimiterBytes = Sharpen.Runtime.GetBytesForString(delimiter, Charsets.Utf8);
    // walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter
    for (int bufferSize = 1; bufferSize <= inputData.Length; bufferSize++)
    {
        for (int splitSize = 1; splitSize < inputData.Length; splitSize++)
        {
            conf.SetInt("io.file.buffer.size", bufferSize);
            // The split length follows splitSize so that buffer and split sizes
            // vary independently. It previously reused bufferSize, which left
            // the inner loop repeating the same case.
            split = new FileSplit(inputFile, 0, splitSize, (string[])null);
            reader = new LineRecordReader(conf, split, recordDelimiterBytes);
            // Get first record: "abcd|efgh" always possible
            NUnit.Framework.Assert.IsTrue("Expected record got nothing", reader.Next(key, value));
            NUnit.Framework.Assert.IsTrue("abcd|efgh".Equals(value.ToString()));
            NUnit.Framework.Assert.AreEqual("Wrong length for record value", 9, value.GetLength());
            // Position should be 12 right after "|+|"
            int recordPos = 12;
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader.GetPos());
            // get the next record: "ij|kl" if the split/buffer allows it
            if (reader.Next(key, value))
            {
                // check the record info: "ij|kl"
                NUnit.Framework.Assert.IsTrue("ij|kl".Equals(value.ToString()));
                // Position should be 20 right after "|+|"
                recordPos = 20;
                NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader.GetPos());
            }
            // get the third record: "mno|pqr" if the split/buffer allows it
            if (reader.Next(key, value))
            {
                // check the record info: "mno|pqr"
                NUnit.Framework.Assert.IsTrue("mno|pqr".Equals(value.ToString()));
                // Position should be 27 at the end of the string now
                recordPos = inputData.Length;
                NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader.GetPos());
            }
            // no more records can be read we should still be at the last position
            NUnit.Framework.Assert.IsFalse("Unexpected record returned", reader.Next(key, value));
            NUnit.Framework.Assert.AreEqual("Wrong position after record read", recordPos, reader.GetPos());
            reader.Close();
        }
    }
}
/// <summary>
/// Reads a prebuilt concatenated gzip file and a freshly written gzip file
/// through <c>TextInputFormat</c> and checks the line counts and sample lines.
/// Returns early with a warning when the codec's decompressor type is not
/// <c>BuiltInGzipDecompressor</c>.
/// </summary>
/// <remarks>
/// Building the concatenated file inside the test (finish / resetState on one
/// compression stream) was abandoned: it omitted the 2nd and 3rd gzip headers
/// and corrupted their CRCs (see HADOOP-6799). A prebuilt, known-correct
/// concat file is copied in instead.
/// </remarks>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);

    // Asserting on the decompressor type would be preferred, but that is not
    // compatible with the Apache/trunk instance of Hudson, so skip instead.
    if (typeof(BuiltInGzipDecompressor) != gzip.GetDecompressorType())
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    System.Console.Out.WriteLine(ColorBrRed
        + "testGzip() using native-zlib Decompressor ("
        + gzip.GetDecompressorType() + ")" + ColorNormal);

    // copy prebuilt (correct!) version of concat.gz to HDFS
    string concatName = "concat" + gzip.GetDefaultExtension();
    Path prebuiltSource = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), concatName);
    Path concatTarget = new Path(workDir, concatName);
    localFs.CopyFromLocalFile(prebuiltSource, concatTarget);

    Path secondInput = new Path(workDir, "part2.txt.gz");
    WriteFile(localFs, secondInput, gzip, "this is a test\nof gzip\n");

    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.Configure(jobConf);

    InputSplit[] splits = inputFormat.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Put the concatenated file in slot 0 and part2 in slot 1.
    FileSplit head = (FileSplit)splits[0];
    bool part2First = head.GetPath().GetName().Equals("part2.txt.gz");
    if (part2First)
    {
        splits[0] = splits[1];
        splits[1] = head;
    }

    IList<Text> lines = ReadSplit(inputFormat, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());

    lines = ReadSplit(inputFormat, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", lines[1].ToString());
}
/// <summary>
/// Creates a line reader for <paramref name="split"/> by delegating to the
/// three-argument constructor with a null record delimiter.
/// </summary>
/// <param name="job">configuration forwarded unchanged</param>
/// <param name="split">split forwarded unchanged</param>
/// <exception cref="System.IO.IOException"/>
public LineRecordReader(Configuration job, FileSplit split)
    : this(job, split, null)
{
}