/// <summary>
/// Borrows a decompressor from <c>CodecPool</c> for the codec returned by
/// <c>GetCodec()</c> and resets it before handing it out.
/// </summary>
/// <returns>
/// The reset decompressor, or <c>null</c> when there is no codec or the pool
/// returned no decompressor.
/// </returns>
/// <exception cref="System.IO.IOException"/>
public Decompressor GetDecompressor()
{
    CompressionCodec codec = GetCodec();
    if (codec == null)
    {
        return null;
    }

    Decompressor pooled = CodecPool.GetDecompressor(codec);
    if (pooled == null)
    {
        return null;
    }

    if (pooled.Finished())
    {
        // A finished instance coming out of the pool suggests somebody returned
        // it to CodecPool while still using it. The message text is kept as-is.
        Log.Warn("Deompressor obtained from CodecPool already finished()");
    }
    else if (Log.IsDebugEnabled())
    {
        Log.Debug("Got a decompressor: " + pooled.GetHashCode());
    }

    // Reset in both cases so the caller always starts from a clean state.
    pooled.Reset();
    return pooled;
}
/// <summary>
/// Reads a bzip2 test resource to the end with the (Configuration, FileSplit)
/// <c>LineRecordReader</c>, closes the reader twice, and then checks that ten
/// consecutive <c>CodecPool.GetDecompressor</c> calls yield ten distinct
/// instances.
/// </summary>
/// <remarks>
/// NOTE(review): presumably this guards against a double <c>Close()</c> putting
/// the same decompressor into the pool more than once - confirm against the
/// reader's <c>Close()</c> implementation.
/// </remarks>
public virtual void TestMultipleClose()
{
    Uri testFileUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2");
    // NUnit expects the value under test first and the message second. With the
    // arguments the other way round the string literal is what gets checked, so
    // a missing resource would never be reported here.
    NUnit.Framework.Assert.IsNotNull(testFileUrl, "Cannot find recordSpanningMultipleSplits.txt.bz2");

    FilePath testFile = new FilePath(testFileUrl.GetFile());
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    long testFileSize = testFile.Length();

    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    // One split covering the whole file.
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = new LongWritable();
    Text value = new Text();

    // Drain every record; the contents are irrelevant to this test.
    //noinspection StatementWithEmptyBody
    while (reader.Next(key, value))
    {
    }

    // The second Close() is the behaviour under test.
    reader.Close();
    reader.Close();

    BZip2Codec codec = new BZip2Codec();
    codec.SetConf(conf);

    // A set keeps only distinct instances, so a count of 10 means the pool
    // never handed out the same decompressor twice.
    ICollection<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i)
    {
        decompressors.AddItem(CodecPool.GetDecompressor(codec));
    }
    NUnit.Framework.Assert.AreEqual(10, decompressors.Count);
}
/// <summary>Creates an IFile reader on top of an already opened stream.</summary>
/// <param name="conf">
/// Configuration; when non-null, <c>io.file.buffer.size</c> is read from it
/// to set the buffer size.
/// </param>
/// <param name="in">The input stream</param>
/// <param name="length">
/// Length of the data in the stream, including the checksum bytes.
/// </param>
/// <param name="codec">Compression codec, or <c>null</c> to read the data as-is</param>
/// <param name="readsCounter">Counter for records read from disk</param>
/// <exception cref="System.IO.IOException"/>
public Reader(Configuration conf, FSDataInputStream @in, long length, CompressionCodec codec, Counters.Counter readsCounter)
{
    readRecordsCounter = readsCounter;

    // Every read goes through the checksum stream, compressed or not.
    checksumIn = new IFileInputStream(@in, length, conf);

    if (codec == null)
    {
        this.@in = checksumIn;
    }
    else
    {
        decompressor = CodecPool.GetDecompressor(codec);
        if (decompressor == null)
        {
            // No decompressor available: warn and read the checksum stream directly.
            Log.Warn("Could not obtain decompressor from CodecPool");
            this.@in = checksumIn;
        }
        else
        {
            this.@in = codec.CreateInputStream(checksumIn, decompressor);
        }
    }

    dataIn = new DataInputStream(this.@in);
    fileLength = length;

    if (conf != null)
    {
        bufferSize = conf.GetInt("io.file.buffer.size", DefaultBufferSize);
    }
}
/// <summary>
/// Opens the file behind the given split and prepares the line reader,
/// choosing a compressed or uncompressed reader based on the codec resolved
/// for the file's path.
/// </summary>
/// <param name="genericSplit">The split to read; cast to <c>FileSplit</c></param>
/// <param name="context">Task context that supplies the configuration</param>
/// <exception cref="System.IO.IOException"/>
public override void Initialize(InputSplit genericSplit, TaskAttemptContext context)
{
    FileSplit fileSplit = (FileSplit)genericSplit;
    Configuration jobConf = context.GetConfiguration();
    maxLineLength = jobConf.GetInt(MaxLineLength, int.MaxValue);
    start = fileSplit.GetStart();
    end = start + fileSplit.GetLength();
    Path inputPath = fileSplit.GetPath();

    // Open the file; the seek to the split start happens only for plain input.
    FileSystem fileSystem = inputPath.GetFileSystem(jobConf);
    fileIn = fileSystem.Open(inputPath);

    CompressionCodec codec = new CompressionCodecFactory(jobConf).GetCodec(inputPath);
    if (codec == null)
    {
        fileIn.Seek(start);
        @in = new UncompressedSplitLineReader(fileIn, jobConf, recordDelimiterBytes, fileSplit.GetLength());
        filePosition = fileIn;
    }
    else
    {
        isCompressedInput = true;
        decompressor = CodecPool.GetDecompressor(codec);

        SplittableCompressionCodec splittable = codec as SplittableCompressionCodec;
        if (splittable != null)
        {
            SplitCompressionInputStream blockStream = splittable.CreateInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
            @in = new CompressedSplitLineReader(blockStream, jobConf, recordDelimiterBytes);
            // The codec may move the split boundaries; adopt its adjusted range.
            start = blockStream.GetAdjustedStart();
            end = blockStream.GetAdjustedEnd();
            filePosition = blockStream;
        }
        else
        {
            @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), jobConf, recordDelimiterBytes);
            filePosition = fileIn;
        }
    }

    // If this is not the first split, the first record is always thrown away,
    // because every split except the last reads one extra line in the next()
    // method.
    if (start != 0)
    {
        start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
    }
    pos = start;
}
/// <summary>
/// Creates a line record reader over the given file split, using the supplied
/// record delimiter and picking a compressed or uncompressed line reader
/// depending on whether a codec is resolved for the file.
/// </summary>
/// <param name="job">Configuration used for the line-length limit, file system and codecs</param>
/// <param name="split">The file split to read</param>
/// <param name="recordDelimiter">Delimiter bytes passed through to the line reader</param>
/// <exception cref="System.IO.IOException"/>
public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter)
{
    maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    start = split.GetStart();
    end = start + split.GetLength();
    Path inputPath = split.GetPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.GetCodec(inputPath);

    // Open the file; the seek to the split start happens only for plain input.
    FileSystem fileSystem = inputPath.GetFileSystem(job);
    fileIn = fileSystem.Open(inputPath);

    if (!IsCompressedInput())
    {
        fileIn.Seek(start);
        @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength());
        filePosition = fileIn;
    }
    else
    {
        decompressor = CodecPool.GetDecompressor(codec);

        SplittableCompressionCodec splittable = codec as SplittableCompressionCodec;
        if (splittable != null)
        {
            SplitCompressionInputStream blockStream = splittable.CreateInputStream(
                fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
            @in = new CompressedSplitLineReader(blockStream, job, recordDelimiter);
            // The codec may move the split boundaries; adopt its adjusted range.
            start = blockStream.GetAdjustedStart();
            end = blockStream.GetAdjustedEnd();
            // Position is taken from the compressed stream.
            filePosition = blockStream;
        }
        else
        {
            @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    }

    // If this is not the first split, the first record is always thrown away,
    // because every split except the last reads one extra line in the next()
    // method.
    if (start != 0)
    {
        start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
    }
    pos = start;
}
// This is also called from the old FixedLengthRecordReader API implementation
/// <summary>
/// Opens the file and positions the reader on the first whole record at or
/// after <paramref name="splitStart"/>.
/// </summary>
/// <param name="job">Configuration used for the file system and codec lookup</param>
/// <param name="splitStart">Byte offset where the split begins</param>
/// <param name="splitLength">Length of the split in bytes</param>
/// <param name="file">Path of the file to read</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Initialize(Configuration job, long splitStart, long splitLength, Path file)
{
    start = splitStart;
    end = start + splitLength;

    // A split that begins part-way through a record skips the rest of that record.
    long bytesIntoRecord = start % recordLength;
    long numBytesToSkip = (bytesIntoRecord != 0) ? recordLength - bytesIntoRecord : 0;

    // Open the file; the seek to the split start happens only for plain input.
    FileSystem fileSystem = file.GetFileSystem(job);
    fileIn = fileSystem.Open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).GetCodec(file);
    if (codec == null)
    {
        fileIn.Seek(start);
        filePosition = fileIn;
        inputStream = fileIn;

        long splitSize = end - start - numBytesToSkip;
        // Round up so a trailing partial record is still counted; never go below zero.
        long recordCount = (splitSize + recordLength - 1) / recordLength;
        if (recordCount < 0)
        {
            recordCount = 0;
        }
        numRecordsRemainingInSplit = recordCount;

        string message = "Expecting " + numRecordsRemainingInSplit + " records each with a length of "
            + recordLength + " bytes in the split with an effective size of " + splitSize + " bytes";
        Log.Info(message);
    }
    else
    {
        isCompressedInput = true;
        decompressor = CodecPool.GetDecompressor(codec);
        CompressionInputStream compressedIn = codec.CreateInputStream(fileIn, decompressor);
        filePosition = compressedIn;
        inputStream = compressedIn;
        // The record count is unknown for compressed input, so use the largest value.
        numRecordsRemainingInSplit = long.MaxValue;
        Log.Info("Compressed input; cannot compute number of records in the split");
    }

    if (numBytesToSkip != 0)
    {
        // Advance by whatever Skip reports, not by the requested amount.
        start += inputStream.Skip(numBytesToSkip);
    }
    pos = start;
}
/// <summary>
/// Creates a map output backed by a bounded in-memory byte buffer of
/// <paramref name="size"/> bytes.
/// </summary>
/// <param name="conf">Configuration stored on this instance</param>
/// <param name="mapId">Map task attempt id, passed to the base constructor</param>
/// <param name="merger">Merge manager stored on this instance</param>
/// <param name="size">Capacity of the in-memory buffer; also passed to the base constructor as a long</param>
/// <param name="codec">Compression codec of the map output, or <c>null</c> if none</param>
/// <param name="primaryMapOutput">Flag passed to the base constructor</param>
public InMemoryMapOutput(Configuration conf, TaskAttemptID mapId, MergeManagerImpl<K, V> merger, int size, CompressionCodec codec, bool primaryMapOutput)
    : base(mapId, (long)size, primaryMapOutput)
{
    this.conf = conf;
    this.merger = merger;
    this.codec = codec;

    byteStream = new BoundedByteArrayOutputStream(size);
    memory = byteStream.GetBuffer();

    // A decompressor is borrowed from the pool only when a codec was supplied.
    decompressor = (codec == null) ? null : CodecPool.GetDecompressor(codec);
}
/// <summary>
/// Reads a bzip2 test resource to the end with a <c>LineRecordReader</c>
/// initialised from a split and task context, closes the reader twice, and
/// then checks that ten consecutive <c>CodecPool.GetDecompressor</c> calls
/// yield ten distinct instances.
/// </summary>
/// <remarks>
/// NOTE(review): presumably this guards against a double <c>Close()</c> putting
/// the same decompressor into the pool more than once - confirm against the
/// reader's <c>Close()</c> implementation.
/// </remarks>
public virtual void TestMultipleClose()
{
    Uri testFileUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2");
    // NUnit expects the value under test first and the message second. With the
    // arguments the other way round the string literal is what gets checked, so
    // a missing resource would never be reported here.
    NUnit.Framework.Assert.IsNotNull(testFileUrl, "Cannot find recordSpanningMultipleSplits.txt.bz2");

    FilePath testFile = new FilePath(testFileUrl.GetFile());
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    long testFileSize = testFile.Length();

    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // One split covering the whole file.
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.Initialize(split, context);

    // Drain every record; the contents are irrelevant to this test.
    //noinspection StatementWithEmptyBody
    while (reader.NextKeyValue())
    {
    }

    // The second Close() is the behaviour under test.
    reader.Close();
    reader.Close();

    BZip2Codec codec = new BZip2Codec();
    codec.SetConf(conf);

    // A set keeps only distinct instances, so a count of 10 means the pool
    // never handed out the same decompressor twice.
    ICollection<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i)
    {
        decompressors.AddItem(CodecPool.GetDecompressor(codec));
    }
    NUnit.Framework.Assert.AreEqual(10, decompressors.Count);
}