Пример #1
0
            /// <summary>
            /// Borrows a decompressor from <c>CodecPool</c> for the codec returned by
            /// <c>GetCodec()</c> and resets it before handing it to the caller.
            /// </summary>
            /// <returns>
            /// The reset decompressor, or <c>null</c> when there is no codec or the pool
            /// returns <c>null</c> for it.
            /// </returns>
            /// <exception cref="System.IO.IOException"/>
            public Decompressor GetDecompressor()
            {
                CompressionCodec codec = GetCodec();

                if (codec == null)
                {
                    return null;
                }
                Decompressor decompressor = CodecPool.GetDecompressor(codec);
                if (decompressor == null)
                {
                    return null;
                }
                if (decompressor.Finished())
                {
                    // A pooled decompressor that already reports finished() suggests that
                    // somebody returned it to CodecPool while still using it.
                    Log.Warn("Decompressor obtained from CodecPool already finished()");
                }
                else if (Log.IsDebugEnabled())
                {
                    Log.Debug("Got a decompressor: " + decompressor.GetHashCode());
                }
                // Reset on both paths, whether or not the instance was reported as finished.
                decompressor.Reset();
                return decompressor;
            }
Пример #2
0
        /// <summary>
        /// Reads a bzip2-compressed file to the end, closes the reader twice, and then
        /// borrows ten decompressors from <c>CodecPool</c>, asserting that the set of
        /// borrowed instances has ten members.
        /// </summary>
        public virtual void TestMultipleClose()
        {
            const string resourceName = "recordSpanningMultipleSplits.txt.bz2";
            Uri testFileUrl = GetType().GetClassLoader().GetResource(resourceName);

            // NUnit takes the object under test first and the failure message second;
            // with the arguments reversed the assertion would inspect the message string.
            NUnit.Framework.Assert.IsNotNull(testFileUrl, "Cannot find " + resourceName);
            FilePath      testFile     = new FilePath(testFileUrl.GetFile());
            Path          testFilePath = new Path(testFile.GetAbsolutePath());
            long          testFileSize = testFile.Length();
            Configuration conf         = new Configuration();

            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            FileSplit        split  = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
            LineRecordReader reader = new LineRecordReader(conf, split);
            LongWritable     key    = new LongWritable();
            Text             value  = new Text();

            // Drain the reader; the records themselves are not inspected.
            //noinspection StatementWithEmptyBody
            while (reader.Next(key, value))
            {
            }
            // Closing twice is the scenario under test.
            reader.Close();
            reader.Close();
            BZip2Codec codec = new BZip2Codec();

            codec.SetConf(conf);
            ICollection <Decompressor> decompressors = new HashSet <Decompressor>();

            for (int i = 0; i < 10; ++i)
            {
                decompressors.AddItem(CodecPool.GetDecompressor(codec));
            }
            // The set collapses duplicates, so a count below ten means the pool handed
            // out the same instance more than once.
            NUnit.Framework.Assert.AreEqual(10, decompressors.Count);
        }
Пример #3
0
 /// <summary>Creates an IFile reader over the supplied input stream.</summary>
 /// <param name="conf">
 /// Configuration; when not <c>null</c> it supplies <c>io.file.buffer.size</c>.
 /// </param>
 /// <param name="in">Stream to read from; it is wrapped in an <c>IFileInputStream</c>.</param>
 /// <param name="length">
 /// Length of the data in the stream, including the checksum
 /// bytes.
 /// </param>
 /// <param name="codec">
 /// Compression codec, or <c>null</c> to read the checksummed stream directly.
 /// </param>
 /// <param name="readsCounter">Counter for records read from disk</param>
 /// <exception cref="System.IO.IOException"/>
 public Reader(Configuration conf, FSDataInputStream @in, long length, CompressionCodec
               codec, Counters.Counter readsCounter)
 {
     readRecordsCounter = readsCounter;
     this.fileLength    = length;
     checksumIn         = new IFileInputStream(@in, length, conf);

     if (codec == null)
     {
         // No compression: read straight from the checksummed stream.
         this.@in = checksumIn;
     }
     else
     {
         decompressor = CodecPool.GetDecompressor(codec);
         if (decompressor == null)
         {
             // Without a decompressor the data is read from the checksummed stream as-is.
             Log.Warn("Could not obtain decompressor from CodecPool");
             this.@in = checksumIn;
         }
         else
         {
             this.@in = codec.CreateInputStream(checksumIn, decompressor);
         }
     }
     this.dataIn = new DataInputStream(this.@in);

     if (conf != null)
     {
         bufferSize = conf.GetInt("io.file.buffer.size", DefaultBufferSize);
     }
 }
Пример #4
0
        /// <summary>
        /// Opens the file described by <paramref name="genericSplit"/>, selects a line
        /// reader depending on whether a compression codec matches the file, and
        /// positions the reader at the first record of the split.
        /// </summary>
        /// <param name="genericSplit">Split to read; it is cast to <c>FileSplit</c>.</param>
        /// <param name="context">Task context that provides the configuration.</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Initialize(InputSplit genericSplit, TaskAttemptContext context
                                        )
        {
            FileSplit     fileSplit     = (FileSplit)genericSplit;
            Configuration configuration = context.GetConfiguration();

            this.maxLineLength = configuration.GetInt(MaxLineLength, int.MaxValue);
            start = fileSplit.GetStart();
            end   = start + fileSplit.GetLength();
            Path path = fileSplit.GetPath();
            // Open the file; seeking happens below for uncompressed input only.
            FileSystem fileSystem = path.GetFileSystem(configuration);

            fileIn = fileSystem.Open(path);
            CompressionCodec codec = new CompressionCodecFactory(configuration).GetCodec(path);

            if (codec == null)
            {
                // Plain file: jump directly to the split start.
                fileIn.Seek(start);
                @in = new UncompressedSplitLineReader(fileIn, configuration, this.recordDelimiterBytes,
                                                      fileSplit.GetLength());
                filePosition = fileIn;
            }
            else
            {
                isCompressedInput = true;
                decompressor      = CodecPool.GetDecompressor(codec);
                SplittableCompressionCodec splittableCodec = codec as SplittableCompressionCodec;

                if (splittableCodec != null)
                {
                    SplitCompressionInputStream splitStream = splittableCodec.CreateInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                    @in = new CompressedSplitLineReader(splitStream, configuration, this.recordDelimiterBytes);
                    // The compressed stream reports the split boundaries it actually uses.
                    start        = splitStream.GetAdjustedStart();
                    end          = splitStream.GetAdjustedEnd();
                    filePosition = splitStream;
                }
                else
                {
                    @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor),
                                              configuration, this.recordDelimiterBytes);
                    filePosition = fileIn;
                }
            }
            // Any split other than the first discards its first record, because every
            // split (except the last) reads one extra line in the next() method.
            if (start != 0)
            {
                start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
            }
            this.pos = start;
        }
Пример #5
0
        /// <summary>
        /// Creates a line reader for <paramref name="split"/>: opens the file, selects a
        /// line reader depending on whether the input is compressed, and positions it at
        /// the first record of the split.
        /// </summary>
        /// <param name="job">Configuration used for the line length limit, codec lookup and file system.</param>
        /// <param name="split">File split to read.</param>
        /// <param name="recordDelimiter">Delimiter bytes handed to the line reader.</param>
        /// <exception cref="System.IO.IOException"/>
        public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter
                                )
        {
            this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            start = split.GetStart();
            end   = start + split.GetLength();
            Path path = split.GetPath();

            compressionCodecs = new CompressionCodecFactory(job);
            codec             = compressionCodecs.GetCodec(path);
            // Open the file; seeking happens below for uncompressed input only.
            FileSystem fileSystem = path.GetFileSystem(job);

            fileIn = fileSystem.Open(path);
            if (!IsCompressedInput())
            {
                // Plain file: jump directly to the split start.
                fileIn.Seek(start);
                @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter,
                                                      split.GetLength());
                filePosition = fileIn;
            }
            else
            {
                decompressor = CodecPool.GetDecompressor(codec);
                SplittableCompressionCodec splittableCodec = codec as SplittableCompressionCodec;

                if (splittableCodec != null)
                {
                    SplitCompressionInputStream splitStream = splittableCodec.CreateInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                    @in = new CompressedSplitLineReader(splitStream, job, recordDelimiter);
                    // The compressed stream reports the split boundaries it actually uses.
                    start        = splitStream.GetAdjustedStart();
                    end          = splitStream.GetAdjustedEnd();
                    filePosition = splitStream;
                }
                else
                {
                    // take pos from compressed stream
                    @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor),
                                              job, recordDelimiter);
                    filePosition = fileIn;
                }
            }
            // Any split other than the first discards its first record, because every
            // split (except the last) reads one extra line in the next() method.
            if (start != 0)
            {
                start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
            }
            this.pos = start;
        }
        /// <summary>
        /// Opens <paramref name="file"/> and prepares to read fixed-length records from
        /// the given split, skipping the tail of a record that begins before the split.
        /// </summary>
        /// <remarks>
        /// This is also called from the old FixedLengthRecordReader API implementation.
        /// </remarks>
        /// <param name="job">Configuration used for the file system and codec lookup.</param>
        /// <param name="splitStart">Byte offset at which the split begins.</param>
        /// <param name="splitLength">Length of the split in bytes.</param>
        /// <param name="file">File to read.</param>
        /// <exception cref="System.IO.IOException"/>
        public virtual void Initialize(Configuration job, long splitStart, long splitLength
                                       , Path file)
        {
            start = splitStart;
            end   = start + splitLength;
            // Bytes of a record that started before this split and spills into it.
            long leadingPartial = start % recordLength;
            long numBytesToSkip = leadingPartial != 0 ? recordLength - leadingPartial : 0;

            // open the file and seek to the start of the split
            FileSystem fileSystem = file.GetFileSystem(job);

            fileIn = fileSystem.Open(file);
            CompressionCodec codec = new CompressionCodecFactory(job).GetCodec(file);

            if (codec == null)
            {
                fileIn.Seek(start);
                filePosition = fileIn;
                inputStream  = fileIn;
                long splitSize = end - start - numBytesToSkip;
                // Round up so that a trailing partial record is counted as well.
                numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
                if (numRecordsRemainingInSplit < 0)
                {
                    numRecordsRemainingInSplit = 0;
                }
                Log.Info("Expecting " + numRecordsRemainingInSplit + " records each with a length of "
                         + recordLength + " bytes in the split with an effective size of " + splitSize +
                         " bytes");
            }
            else
            {
                isCompressedInput = true;
                decompressor      = CodecPool.GetDecompressor(codec);
                CompressionInputStream compressedIn = codec.CreateInputStream(fileIn, decompressor);
                filePosition = compressedIn;
                inputStream  = compressedIn;
                // The record count is unknown for compressed input, so no limit is imposed.
                numRecordsRemainingInSplit = long.MaxValue;
                Log.Info("Compressed input; cannot compute number of records in the split");
            }
            if (numBytesToSkip != 0)
            {
                // Advance by the number of bytes Skip reports, not the number requested.
                start += inputStream.Skip(numBytesToSkip);
            }
            this.pos = start;
        }
Пример #7
0
 /// <summary>
 /// Creates a map output that is held in a bounded in-memory buffer of
 /// <paramref name="size"/> bytes.
 /// </summary>
 /// <param name="conf">Configuration stored on the instance.</param>
 /// <param name="mapId">Map task attempt id, passed to the base constructor.</param>
 /// <param name="merger">Merge manager stored on the instance.</param>
 /// <param name="size">Capacity of the in-memory buffer; also passed to the base constructor.</param>
 /// <param name="codec">Compression codec of the map output, or <c>null</c> for none.</param>
 /// <param name="primaryMapOutput">Passed through to the base constructor.</param>
 public InMemoryMapOutput(Configuration conf, TaskAttemptID mapId, MergeManagerImpl
                          <K, V> merger, int size, CompressionCodec codec, bool primaryMapOutput)
     : base(mapId, (long)size, primaryMapOutput)
 {
     this.conf   = conf;
     this.merger = merger;
     this.codec  = codec;

     byteStream = new BoundedByteArrayOutputStream(size);
     memory     = byteStream.GetBuffer();

     // A decompressor is borrowed only when the map output has a codec.
     decompressor = codec != null ? CodecPool.GetDecompressor(codec) : null;
 }
Пример #8
0
        /// <summary>
        /// Reads a bzip2-compressed file to the end through an initialized
        /// <c>LineRecordReader</c>, closes the reader twice, and then borrows ten
        /// decompressors from <c>CodecPool</c>, asserting that the set of borrowed
        /// instances has ten members.
        /// </summary>
        public virtual void TestMultipleClose()
        {
            const string resourceName = "recordSpanningMultipleSplits.txt.bz2";
            Uri testFileUrl = GetType().GetClassLoader().GetResource(resourceName);

            // NUnit takes the object under test first and the failure message second;
            // with the arguments reversed the assertion would inspect the message string.
            NUnit.Framework.Assert.IsNotNull(testFileUrl, "Cannot find " + resourceName);
            FilePath      testFile     = new FilePath(testFileUrl.GetFile());
            Path          testFilePath = new Path(testFile.GetAbsolutePath());
            long          testFileSize = testFile.Length();
            Configuration conf         = new Configuration();

            conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
            TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()
                                                                    );
            // A single split covering the whole file.
            FileSplit        split  = new FileSplit(testFilePath, 0, testFileSize, null);
            LineRecordReader reader = new LineRecordReader();

            reader.Initialize(split, context);
            // Drain the reader; the records themselves are not inspected.
            //noinspection StatementWithEmptyBody
            while (reader.NextKeyValue())
            {
            }
            // Closing twice is the scenario under test.
            reader.Close();
            reader.Close();
            BZip2Codec codec = new BZip2Codec();

            codec.SetConf(conf);
            ICollection <Decompressor> decompressors = new HashSet <Decompressor>();

            for (int i = 0; i < 10; ++i)
            {
                decompressors.AddItem(CodecPool.GetDecompressor(codec));
            }
            // The set collapses duplicates, so a count below ten means the pool handed
            // out the same instance more than once.
            NUnit.Framework.Assert.AreEqual(10, decompressors.Count);
        }