/// <summary>
/// Decides whether <paramref name="file"/> may be divided into several splits.
/// </summary>
/// <param name="fs">File system holding the file; not consulted by this check.</param>
/// <param name="file">Path used to look up a compression codec.</param>
/// <returns>
/// <c>true</c> when the codec factory yields no codec for the path, or when the
/// codec it yields is a <c>SplittableCompressionCodec</c>; <c>false</c> otherwise.
/// </returns>
protected internal override bool IsSplitable(FileSystem fs, Path file)
{
    CompressionCodec fileCodec = compressionCodecs.GetCodec(file);
    // No codec is treated as splittable; otherwise the codec itself must support splitting.
    return null == fileCodec || fileCodec is SplittableCompressionCodec;
}
/// <summary>
/// Opens the file behind <paramref name="split"/> and positions a line reader at the
/// first record belonging to that split.
/// </summary>
/// <param name="job">
/// Configuration used to read the maximum line length, build the codec factory and
/// resolve the file system.
/// </param>
/// <param name="split">Supplies the file path, start offset and length to read.</param>
/// <param name="recordDelimiter">Delimiter bytes handed unchanged to the line reader.</param>
/// <remarks>
/// If anything fails after the file has been opened, the opened stream is closed and
/// any decompressor obtained from the pool is returned before the exception is
/// rethrown. The caller never receives the half-built instance, so it could not
/// release them itself.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public LineRecordReader(Configuration job, FileSplit split, byte[] recordDelimiter)
{
    this.maxLineLength = job.GetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    start = split.GetStart();
    end = start + split.GetLength();
    Path file = split.GetPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.GetCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.GetFileSystem(job);
    fileIn = fs.Open(file);
    try
    {
        if (IsCompressedInput())
        {
            decompressor = CodecPool.GetDecompressor(codec);
            if (codec is SplittableCompressionCodec)
            {
                SplitCompressionInputStream cIn = ((SplittableCompressionCodec)codec).CreateInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.Byblock);
                @in = new CompressedSplitLineReader(cIn, job, recordDelimiter);
                // The codec may move the split boundaries; adopt its adjusted values.
                start = cIn.GetAdjustedStart();
                end = cIn.GetAdjustedEnd();
                // take pos from compressed stream
                filePosition = cIn;
            }
            else
            {
                @in = new SplitLineReader(codec.CreateInputStream(fileIn, decompressor), job, recordDelimiter);
                filePosition = fileIn;
            }
        }
        else
        {
            fileIn.Seek(start);
            @in = new UncompressedSplitLineReader(fileIn, job, recordDelimiter, split.GetLength());
            filePosition = fileIn;
        }

        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // next() method.
        if (start != 0)
        {
            start += @in.ReadLine(new Text(), 0, MaxBytesToConsume(start));
        }
    }
    catch
    {
        // Construction failed: release what was acquired above, then rethrow.
        try
        {
            fileIn.Close();
        }
        catch (System.IO.IOException)
        {
            // Deliberately ignored so the original failure is the one that propagates.
        }
        if (decompressor != null)
        {
            CodecPool.ReturnDecompressor(decompressor);
            decompressor = null;
        }
        throw;
    }
    this.pos = start;
}
/// <summary>
/// Opens <paramref name="item"/> and, based on its leading bytes or its path, wraps the
/// stream in a reader suited to the detected format.
/// </summary>
/// <param name="item">Path data whose stream is opened and inspected.</param>
/// <returns>
/// A <c>GZIPInputStream</c> for a gzip header, a <c>Display.TextRecordInputStream</c>
/// for an "SEQ" header, a <c>Display.AvroFileInputStream</c> for an "Obj" header, a
/// codec-created stream when the codec factory recognizes the path, or otherwise the
/// raw stream rewound to offset 0.
/// </returns>
/// <remarks>
/// Files too short to carry a complete magic header (0, 1 or 2 bytes) are returned as
/// the raw stream, or as a codec stream when the path matches a codec, instead of
/// failing with an end-of-file error.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
protected internal override InputStream GetInputStream(PathData item)
{
    FSDataInputStream i = (FSDataInputStream)base.GetInputStream(item);

    // Handle 0 and 1-byte files
    short leadBytes;
    try
    {
        leadBytes = i.ReadShort();
    }
    catch (EOFException)
    {
        i.Seek(0);
        return(i);
    }

    // Check type of stream first
    switch (leadBytes)
    {
        case unchecked ((int)(0x1f8b)):
        {
            // RFC 1952
            // Must be gzip
            i.Seek(0);
            return(new GZIPInputStream(i));
        }

        case unchecked ((int)(0x5345)):
        {
            // 'S' 'E'
            // Might be a SequenceFile
            if (NextByteEquals(i, 'Q'))
            {
                i.Close();
                return(new Display.TextRecordInputStream(this, item.stat));
            }
            // Not a SequenceFile header after all: try codec detection by path.
            goto default;
        }

        default:
        {
            // Check the type of compression instead, depending on Codec class's
            // own detection methods, based on the provided path.
            CompressionCodecFactory cf = new CompressionCodecFactory(GetConf());
            CompressionCodec codec = cf.GetCodec(item.path);
            if (codec != null)
            {
                i.Seek(0);
                return(codec.CreateInputStream(i));
            }
            break;
        }

        case unchecked ((int)(0x4f62)):
        {
            // 'O' 'b'
            if (NextByteEquals(i, 'j'))
            {
                i.Close();
                return(new Display.AvroFileInputStream(item.stat));
            }
            break;
        }
    }

    // File is non-compressed, or not a file container we know.
    i.Seek(0);
    return(i);
}

/// <summary>
/// Reads one byte from <paramref name="stream"/> and reports whether it equals
/// <paramref name="expected"/>; reaching end of stream counts as "no match".
/// </summary>
private static bool NextByteEquals(FSDataInputStream stream, char expected)
{
    try
    {
        return(stream.ReadByte() == expected);
    }
    catch (EOFException)
    {
        // A 2-byte file has no third magic byte; treat it as a non-matching header.
        return(false);
    }
}