/// <exception cref="System.IO.IOException"/>
public Compressor GetCompressor() {
    CompressionCodec codec = GetCodec();
    if (codec != null) {
        Compressor compressor = CodecPool.GetCompressor(codec);
        if (compressor != null) {
            if (compressor.Finished()) {
                // Somebody returns the compressor to CodecPool but is still using it.
                Log.Warn("Compressor obtained from CodecPool already finished()");
            } else {
                if (Log.IsDebugEnabled()) {
                    Log.Debug("Got a compressor: " + compressor.GetHashCode());
                }
            }
            compressor.Reset();
        }
        return compressor;
    }
    return null;
}
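// Hedged usage sketch (not part of the source above): the borrow/use/return cycle that
// typically surrounds GetCompressor. CodecPool.ReturnCompressor, CompressionOutputStream.Finish()
// and the two-argument codec.CreateOutputStream(out, compressor) overload are assumptions here,
// mirroring the GetCompressor/GetDecompressor and CreateInputStream(in, decompressor) calls shown
// elsewhere in this section; WriteCompressed is a hypothetical helper name.
public void WriteCompressed(CompressionCodec codec, OutputStream rawOut, byte[] payload) {
    Compressor compressor = CodecPool.GetCompressor(codec);
    try {
        // assumed overload pairing the pooled compressor with the codec's output stream
        CompressionOutputStream compressedOut = codec.CreateOutputStream(rawOut, compressor);
        compressedOut.Write(payload, 0, payload.Length);
        compressedOut.Finish(); // assumed: flushes any pending compressed data
    } finally {
        CodecPool.ReturnCompressor(compressor); // assumed counterpart of GetCompressor
    }
}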
/// <exception cref="System.IO.IOException"/>
public override RecordWriter<K, V> GetRecordWriter(FileSystem ignored, JobConf job, string name,
    Progressable progress) {
    bool isCompressed = GetCompressOutput(job);
    string keyValueSeparator = job.Get("mapreduce.output.textoutputformat.separator", "\t");
    if (!isCompressed) {
        Path file = FileOutputFormat.GetTaskOutputPath(job, name);
        FileSystem fs = file.GetFileSystem(job);
        FSDataOutputStream fileOut = fs.Create(file, progress);
        return new TextOutputFormat.LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Type codecClass = GetOutputCompressorClass(job, typeof(GzipCodec));
        // create the named codec
        CompressionCodec codec = ReflectionUtils.NewInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.GetTaskOutputPath(job, name + codec.GetDefaultExtension());
        FileSystem fs = file.GetFileSystem(job);
        FSDataOutputStream fileOut = fs.Create(file, progress);
        return new TextOutputFormat.LineRecordWriter<K, V>(
            new DataOutputStream(codec.CreateOutputStream(fileOut)), keyValueSeparator);
    }
}
public void TestDeserializeMessageSet(CompressionCodec codec) {
    using (var serialized = Pool.Reserve()) {
        var set = new PartitionData {
            Partition = 42,
            CompressionCodec = codec,
            Messages = new[] {
                new Message { Key = Key, Value = Value },
                new Message { Key = Key, Value = Value }
            }
        };
        set.Serialize(serialized, SerializationConfig.ByteArraySerializers);
        serialized.Position = 4;

        var deserialized = FetchPartitionResponse.DeserializeMessageSet(serialized,
            SerializationConfig.ByteArrayDeserializers);
        Assert.AreEqual(2, deserialized.Count);
        foreach (var msg in deserialized) {
            Assert.AreEqual(0, msg.Offset);
            CollectionAssert.AreEqual(Key, msg.Message.Key as byte[]);
            CollectionAssert.AreEqual(Value, msg.Message.Value as byte[]);
        }
    }
}
public MergeQueue(Configuration conf, FileSystem fs, IList<Merger.Segment<K, V>> segments,
    RawComparator<K> comparator, Progressable reporter, bool sortSegments, CompressionCodec codec,
    TaskType taskType)
    : this(conf, fs, segments, comparator, reporter, sortSegments, taskType) {
    this.codec = codec;
}
/// <summary>Construct an IFile Reader.</summary>
/// <param name="conf">Configuration File</param>
/// <param name="in">The input stream</param>
/// <param name="length">
/// Length of the data in the stream, including the checksum bytes.
/// </param>
/// <param name="codec">codec</param>
/// <param name="readsCounter">Counter for records read from disk</param>
/// <exception cref="System.IO.IOException"/>
public Reader(Configuration conf, FSDataInputStream @in, long length, CompressionCodec codec,
    Counters.Counter readsCounter) {
    // Count records read from disk
    // Possibly decompressed stream that we read
    readRecordsCounter = readsCounter;
    checksumIn = new IFileInputStream(@in, length, conf);
    if (codec != null) {
        decompressor = CodecPool.GetDecompressor(codec);
        if (decompressor != null) {
            this.@in = codec.CreateInputStream(checksumIn, decompressor);
        } else {
            Log.Warn("Could not obtain decompressor from CodecPool");
            this.@in = checksumIn;
        }
    } else {
        this.@in = checksumIn;
    }
    this.dataIn = new DataInputStream(this.@in);
    this.fileLength = length;
    if (conf != null) {
        bufferSize = conf.GetInt("io.file.buffer.size", DefaultBufferSize);
    }
}
public Context(TaskAttemptID reduceId, JobConf jobConf, FileSystem localFS,
    TaskUmbilicalProtocol umbilical, LocalDirAllocator localDirAllocator, Reporter reporter,
    CompressionCodec codec, Type combinerClass, Task.CombineOutputCollector<K, V> combineCollector,
    Counters.Counter spilledRecordsCounter, Counters.Counter reduceCombineInputCounter,
    Counters.Counter shuffledMapsCounter, Counters.Counter reduceShuffleBytes,
    Counters.Counter failedShuffleCounter, Counters.Counter mergedMapOutputsCounter,
    TaskStatus status, Progress copyPhase, Progress mergePhase, Task reduceTask,
    MapOutputFile mapOutputFile, IDictionary<TaskAttemptID, MapOutputFile> localMapFiles) {
    this.reduceId = reduceId;
    this.jobConf = jobConf;
    this.localFS = localFS;
    this.umbilical = umbilical;
    this.localDirAllocator = localDirAllocator;
    this.reporter = reporter;
    this.codec = codec;
    this.combinerClass = combinerClass;
    this.combineCollector = combineCollector;
    this.spilledRecordsCounter = spilledRecordsCounter;
    this.reduceCombineInputCounter = reduceCombineInputCounter;
    this.shuffledMapsCounter = shuffledMapsCounter;
    this.reduceShuffleBytes = reduceShuffleBytes;
    this.failedShuffleCounter = failedShuffleCounter;
    this.mergedMapOutputsCounter = mergedMapOutputsCounter;
    this.status = status;
    this.copyPhase = copyPhase;
    this.mergePhase = mergePhase;
    this.reduceTask = reduceTask;
    this.mapOutputFile = mapOutputFile;
    this.localMapFiles = localMapFiles;
}
// Create a file containing fixed length records with random data
/// <exception cref="System.IO.IOException"/>
private AList<string> CreateFile(Path targetFile, CompressionCodec codec, int recordLen, int numRecords) {
    AList<string> recordList = new AList<string>(numRecords);
    OutputStream ostream = localFs.Create(targetFile);
    if (codec != null) {
        ostream = codec.CreateOutputStream(ostream);
    }
    TextWriter writer = new OutputStreamWriter(ostream);
    try {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < numRecords; i++) {
            for (int j = 0; j < recordLen; j++) {
                sb.Append(chars[charRand.Next(chars.Length)]);
            }
            string recordData = sb.ToString();
            recordList.AddItem(recordData);
            writer.Write(recordData);
            sb.Length = 0;
        }
    } finally {
        writer.Close();
    }
    return recordList;
}
public DelimitedTextDataset(
    string folderName,
    string fileName,
    string folderPath = default,
    ColumnDelimiter columnDelimiter = default,
    RowDelimiter rowDelimiter = default,
    EncodingName encodingName = default,
    CompressionCodec compressionCodec = default,
    CompressionLevel compressionLevel = default,
    QuoteChar quoteChar = default,
    EscapeChar escapeChar = default,
    bool firstRowAsHeader = false,
    string nullValue = default,
    IList<SchemaColumn> schema = default) {
    FolderName = folderName;
    FileName = fileName;
    FolderPath = folderPath;
    ColumnDelimiter = columnDelimiter;
    RowDelimiter = rowDelimiter;
    EncodingName = encodingName;
    CompressionCodec = compressionCodec;
    CompressionLevel = compressionLevel;
    QuoteChar = quoteChar;
    EscapeChar = escapeChar;
    FirstRowAsHeader = firstRowAsHeader;
    NullValue = nullValue;
    Schema = schema;
}
public Writer(Configuration conf, FileSystem fs, string dirName, Type keyClass, Type valClass,
    SequenceFile.CompressionType compress, CompressionCodec codec, Progressable progress)
    : this(conf, new Path(dirName), KeyClass(keyClass), ValueClass(valClass),
        Compression(compress, codec), Progressable(progress)) {
}
public void Serialize(ReusableMemoryStream stream, CompressionCodec compressionCodec,
    Tuple<ISerializer, ISerializer> serializers) {
    var crcPos = stream.Position;
    stream.Write(Basics.MinusOne32, 0, 4); // crc placeholder
    var bodyPos = stream.Position;

    stream.WriteByte(0); // magic byte
    stream.WriteByte((byte)compressionCodec); // attributes
    if (SerializedKeyValue != null) {
        stream.Write(SerializedKeyValue.GetBuffer(), 0, (int)SerializedKeyValue.Length);
    } else {
        DoSerializeKeyValue(stream, serializers);
    }

    // update crc
    var crc = Crc32.Compute(stream, bodyPos, stream.Position - bodyPos);
    var curPos = stream.Position;
    stream.Position = crcPos;
    BigEndianConverter.Write(stream, (int)crc);
    stream.Position = curPos;
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override RecordWriter<K, V> GetRecordWriter(TaskAttemptContext job) {
    Configuration conf = job.GetConfiguration();
    bool isCompressed = GetCompressOutput(job);
    string keyValueSeparator = conf.Get(Seperator, "\t");
    CompressionCodec codec = null;
    string extension = string.Empty;
    if (isCompressed) {
        Type codecClass = GetOutputCompressorClass(job, typeof(GzipCodec));
        codec = (CompressionCodec)ReflectionUtils.NewInstance(codecClass, conf);
        extension = codec.GetDefaultExtension();
    }
    Path file = GetDefaultWorkFile(job, extension);
    FileSystem fs = file.GetFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.Create(file, false);
        return new TextOutputFormat.LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.Create(file, false);
        return new TextOutputFormat.LineRecordWriter<K, V>(
            new DataOutputStream(codec.CreateOutputStream(fileOut)), keyValueSeparator);
    }
}
/// <exception cref="System.IO.IOException"/>
public Segment(Configuration conf, FileSystem fs, Path file, CompressionCodec codec, bool preserve,
    Counters.Counter mergedMapOutputsCounter, long rawDataLength)
    : this(conf, fs, file, 0, fs.GetFileStatus(file).GetLen(), codec, preserve, mergedMapOutputsCounter) {
    this.rawDataLength = rawDataLength;
}
public void runSeekTest(CompressionCodec codec) {
    TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
    const int COUNT = 16384;
    BitFieldWriter @out = new BitFieldWriter(new OutStream("test", 500, codec, collect), 1);
    TestInStream.PositionCollector[] positions = new TestInStream.PositionCollector[COUNT];
    for (int i = 0; i < COUNT; ++i) {
        positions[i] = new TestInStream.PositionCollector();
        @out.getPosition(positions[i]);
        // test runs, non-runs
        if (i < COUNT / 2) {
            @out.write(i & 1);
        } else {
            @out.write((i / 3) & 1);
        }
    }
    @out.flush();
    ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
    collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
    inBuf.flip();
#pragma warning disable 612
    BitFieldReader @in = new BitFieldReader(
        InStream.create(null, "test", new ByteBuffer[] { inBuf }, new long[] { 0 },
            inBuf.remaining(), codec, 500), 1);
#pragma warning restore 612
    for (int i = 0; i < COUNT; ++i) {
        int x = @in.next();
        if (i < COUNT / 2) {
            Assert.Equal(i & 1, x);
        } else {
            Assert.Equal((i / 3) & 1, x);
        }
    }
    for (int i = COUNT - 1; i >= 0; --i) {
        @in.seek(positions[i]);
        int x = @in.next();
        if (i < COUNT / 2) {
            Assert.Equal(i & 1, x);
        } else {
            Assert.Equal((i / 3) & 1, x);
        }
    }
}
public MetadataReaderImpl(Func<Stream> streamCreator, CompressionCodec codec, int bufferSize, int typeCount) {
    this.file = streamCreator();
    this.codec = codec;
    this.bufferSize = bufferSize;
    this.typeCount = typeCount;
}
public DefaultDataReader(Func<Stream> streamCreator, string path, bool useZeroCopy, CompressionCodec codec) {
    this.streamCreator = streamCreator;
    this.path = path;
    this.useZeroCopy = useZeroCopy;
    this.codec = codec;
}
public virtual void TestSucceedAndFailedCopyMap<K, V>() {
    JobConf job = new JobConf();
    job.SetNumMapTasks(2);
    // mock creation
    TaskUmbilicalProtocol mockUmbilical = Org.Mockito.Mockito.Mock<TaskUmbilicalProtocol>();
    Reporter mockReporter = Org.Mockito.Mockito.Mock<Reporter>();
    FileSystem mockFileSystem = Org.Mockito.Mockito.Mock<FileSystem>();
    Type combinerClass = job.GetCombinerClass();
    Task.CombineOutputCollector<K, V> mockCombineOutputCollector =
        (Task.CombineOutputCollector<K, V>)Org.Mockito.Mockito.Mock<Task.CombineOutputCollector>(); // needed for mock with generic
    TaskAttemptID mockTaskAttemptID = Org.Mockito.Mockito.Mock<TaskAttemptID>();
    LocalDirAllocator mockLocalDirAllocator = Org.Mockito.Mockito.Mock<LocalDirAllocator>();
    CompressionCodec mockCompressionCodec = Org.Mockito.Mockito.Mock<CompressionCodec>();
    Counters.Counter mockCounter = Org.Mockito.Mockito.Mock<Counters.Counter>();
    TaskStatus mockTaskStatus = Org.Mockito.Mockito.Mock<TaskStatus>();
    Progress mockProgress = Org.Mockito.Mockito.Mock<Progress>();
    MapOutputFile mockMapOutputFile = Org.Mockito.Mockito.Mock<MapOutputFile>();
    Org.Apache.Hadoop.Mapred.Task mockTask = Org.Mockito.Mockito.Mock<Org.Apache.Hadoop.Mapred.Task>();
    MapOutput<K, V> output = Org.Mockito.Mockito.Mock<MapOutput>();

    ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(
        mockTaskAttemptID, job, mockFileSystem, mockUmbilical, mockLocalDirAllocator,
        mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector,
        mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter,
        mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
    TaskStatus status = new _TaskStatus_251();
    Progress progress = new Progress();
    ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status, null, null,
        progress, context.GetShuffledMapsCounter(), context.GetReduceShuffleBytes(),
        context.GetFailedShuffleCounter());
    MapHost host1 = new MapHost("host1", null);
    TaskAttemptID failedAttemptID = new TaskAttemptID(
        new TaskID(new JobID("test", 0), TaskType.Map, 0), 0);
    TaskAttemptID succeedAttemptID = new TaskAttemptID(
        new TaskID(new JobID("test", 0), TaskType.Map, 1), 1);
    // handle output fetch failure for failedAttemptID, part I
    scheduler.HostFailed(host1.GetHostName());
    // handle output fetch succeed for succeedAttemptID
    long bytes = (long)500 * 1024 * 1024;
    scheduler.CopySucceeded(succeedAttemptID, host1, bytes, 0, 500000, output);
    // handle output fetch failure for failedAttemptID, part II
    // for MAPREDUCE-6361: verify no NPE exception get thrown out
    scheduler.CopyFailed(failedAttemptID, host1, true, false);
}
public CompressedStream(long? fileId, string name, List<DiskRange> input, long length,
    CompressionCodec codec, int bufferSize)
    : base(fileId, name, length) {
    this.bytes = input;
    this.codec = codec;
    this.bufferSize = bufferSize;
    currentOffset = 0;
    currentRange = 0;
}
private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos, int footerSize,
    CompressionCodec codec, int bufferSize) {
    bb.position(footerAbsPos);
    bb.limit(footerAbsPos + footerSize);
    return OrcProto.Footer.ParseFrom(InStream.createCodedInputStream(null, "footer",
        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) }, footerSize, codec, bufferSize));
}
protected internal override bool IsSplitable(FileSystem fs, Path file) {
    CompressionCodec codec = compressionCodecs.GetCodec(file);
    if (null == codec) {
        return true;
    }
    return codec is SplittableCompressionCodec;
}
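// Hedged sketch (not part of the source above): the compressionCodecs field consulted by
// IsSplitable is typically a CompressionCodecFactory built from the job configuration, as in the
// FSImageCompression snippet later in this section. The Configure wiring shown here is an
// assumption about where that initialization happens in the input format.
private CompressionCodecFactory compressionCodecs;

public void Configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
}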
private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos, int metadataSize,
    CompressionCodec codec, int bufferSize) {
    bb.position(metadataAbsPos);
    bb.limit(metadataAbsPos + metadataSize);
    return OrcProto.Metadata.ParseFrom(InStream.createCodedInputStream(null, "metadata",
        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) }, metadataSize, codec, bufferSize));
}
// Local directories
/// <exception cref="System.IO.IOException"/>
public static RawKeyValueIterator Merge<K, V>(Configuration conf, FileSystem fs, CompressionCodec codec,
    Path[] inputs, bool deleteInputs, int mergeFactor, Path tmpDir, RawComparator<K> comparator,
    Progressable reporter, Counters.Counter readsCounter, Counters.Counter writesCounter,
    Progress mergePhase) {
    System.Type keyClass = typeof(K);
    System.Type valueClass = typeof(V);
    return new Merger.MergeQueue<K, V>(conf, fs, inputs, deleteInputs, codec, comparator, reporter,
        null, TaskType.Reduce).Merge(keyClass, valueClass, mergeFactor, tmpDir, readsCounter,
        writesCounter, mergePhase);
}
public virtual void TestConsumerApi() {
    JobConf jobConf = new JobConf();
    ShuffleConsumerPlugin<K, V> shuffleConsumerPlugin = new TestShufflePlugin.TestShuffleConsumerPlugin<K, V>();
    // mock creation
    ReduceTask mockReduceTask = Org.Mockito.Mockito.Mock<ReduceTask>();
    TaskUmbilicalProtocol mockUmbilical = Org.Mockito.Mockito.Mock<TaskUmbilicalProtocol>();
    Reporter mockReporter = Org.Mockito.Mockito.Mock<Reporter>();
    FileSystem mockFileSystem = Org.Mockito.Mockito.Mock<FileSystem>();
    Type combinerClass = jobConf.GetCombinerClass();
    Task.CombineOutputCollector<K, V> mockCombineOutputCollector =
        (Task.CombineOutputCollector<K, V>)Org.Mockito.Mockito.Mock<Task.CombineOutputCollector>(); // needed for mock with generic
    TaskAttemptID mockTaskAttemptID = Org.Mockito.Mockito.Mock<TaskAttemptID>();
    LocalDirAllocator mockLocalDirAllocator = Org.Mockito.Mockito.Mock<LocalDirAllocator>();
    CompressionCodec mockCompressionCodec = Org.Mockito.Mockito.Mock<CompressionCodec>();
    Counters.Counter mockCounter = Org.Mockito.Mockito.Mock<Counters.Counter>();
    TaskStatus mockTaskStatus = Org.Mockito.Mockito.Mock<TaskStatus>();
    Progress mockProgress = Org.Mockito.Mockito.Mock<Progress>();
    MapOutputFile mockMapOutputFile = Org.Mockito.Mockito.Mock<MapOutputFile>();
    Org.Apache.Hadoop.Mapred.Task mockTask = Org.Mockito.Mockito.Mock<Org.Apache.Hadoop.Mapred.Task>();
    try {
        string[] dirs = jobConf.GetLocalDirs();
        // verify that these APIs are available through super class handler
        ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(
            mockTaskAttemptID, jobConf, mockFileSystem, mockUmbilical, mockLocalDirAllocator,
            mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector,
            mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter,
            mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
        shuffleConsumerPlugin.Init(context);
        shuffleConsumerPlugin.Run();
        shuffleConsumerPlugin.Close();
    } catch (Exception e) {
        NUnit.Framework.Assert.IsTrue("Threw exception:" + e, false);
    }
    // verify that these APIs are available for 3rd party plugins
    mockReduceTask.GetTaskID();
    mockReduceTask.GetJobID();
    mockReduceTask.GetNumMaps();
    mockReduceTask.GetPartition();
    mockReporter.Progress();
}
/// <exception cref="System.IO.IOException"/>
public static RawKeyValueIterator Merge<K, V>(Configuration conf, FileSystem fs, CompressionCodec codec,
    IList<Merger.Segment<K, V>> segments, int mergeFactor, Path tmpDir, RawComparator<K> comparator,
    Progressable reporter, bool sortSegments, Counters.Counter readsCounter,
    Counters.Counter writesCounter, Progress mergePhase, TaskType taskType) {
    System.Type keyClass = typeof(K);
    System.Type valueClass = typeof(V);
    return new Merger.MergeQueue<K, V>(conf, fs, segments, comparator, reporter, sortSegments,
        codec, taskType).Merge(keyClass, valueClass, mergeFactor, tmpDir, readsCounter,
        writesCounter, mergePhase);
}
/// <summary>
/// Create a compression instance using the codec specified by
/// <code>codecClassName</code>
/// </summary>
/// <exception cref="System.IO.IOException"/>
internal static Org.Apache.Hadoop.Hdfs.Server.Namenode.FSImageCompression CreateCompression(
    Configuration conf, string codecClassName) {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.GetCodecByClassName(codecClassName);
    if (codec == null) {
        throw new IOException("Not a supported codec: " + codecClassName);
    }
    return new Org.Apache.Hadoop.Hdfs.Server.Namenode.FSImageCompression(codec);
}
public OutStream(string name, int bufferSize, CompressionCodec codec, OutputReceiver receiver) {
    this.name = name;
    this._bufferSize = bufferSize;
    this.codec = codec;
    this.receiver = receiver;
    this._suppress = false;
}
// Remark: it might be the case that brokers are authorized to send us a partial record batch.
// So if the protocol exceptions in this method are triggered, you might want to investigate and
// remove them altogether.
public void Deserialize(ReusableMemoryStream input, Deserializers deserializers, long endOfAllBatches) {
    if (input.Position + BytesNecessaryToGetLength > endOfAllBatches) {
        throw new ProtocolException(
            $"Trying to read a batch record at {input.Position} and the end of all batches is {endOfAllBatches}."
            + " There are not enough bytes remaining to even read the first fields...");
    }
    BaseOffset = BigEndianConverter.ReadInt64(input);
    var batchLength = BigEndianConverter.ReadInt32(input);
    var endOfBatch = input.Position + batchLength;
    if (endOfAllBatches < endOfBatch) {
        throw new ProtocolException(
            $"The record batch says its length stops at {endOfBatch} but the list of all batches stops at {endOfAllBatches}.");
    }
    PartitionLeaderEpoch = BigEndianConverter.ReadInt32(input);
    var magic = input.ReadByte();
    // Current magic value is 2
    if ((uint)magic != 2) {
        throw new UnsupportedMagicByteVersion((byte)magic, "2");
    }
    var crc = (uint)BigEndianConverter.ReadInt32(input);
    var afterCrcPosition = input.Position; // the crc is calculated starting from this position
    Crc32.CheckCrcCastagnoli((int)crc, input, afterCrcPosition, endOfBatch - afterCrcPosition);
    var attributes = BigEndianConverter.ReadInt16(input);
    CompressionCodec = (CompressionCodec)(attributes & CompressionCodecMask);
    IsTransactional = (attributes & TransactionalFlagMask) != 0;
    IsControl = (attributes & ControlFlagMask) != 0;
    TimestampType = (attributes & TimestampTypeMask) > 0
        ? TimestampType.LogAppendTime
        : TimestampType.CreateTime;
    var lastOffsetDelta = BigEndianConverter.ReadInt32(input);
    var firstTimestamp = BigEndianConverter.ReadInt64(input);
    var maxTimestamp = BigEndianConverter.ReadInt64(input);
    ProducerId = BigEndianConverter.ReadInt64(input);
    ProducerEpoch = BigEndianConverter.ReadInt16(input);
    BaseSequence = BigEndianConverter.ReadInt32(input);
    var numberOfRecords = BigEndianConverter.ReadInt32(input);
    Records = DeserializeRecords(input, numberOfRecords, endOfBatch, firstTimestamp, deserializers);
}
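// Hedged sketch (assumed constants, not shown in the source): the masks used by Deserialize above
// follow the Kafka record-batch "attributes" layout, in which bits 0-2 carry the compression codec,
// bit 3 the timestamp type, bit 4 the transactional flag and bit 5 the control flag. The exact
// declarations in the client library may differ.
private const short CompressionCodecMask = 0x07;
private const short TimestampTypeMask = 0x08;
private const short TransactionalFlagMask = 0x10;
private const short ControlFlagMask = 0x20;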
/**
 * Creates coded input stream (used for protobuf message parsing) with higher message size limit.
 *
 * @param name the name of the stream
 * @param input the list of ranges of bytes for the stream; from disk or cache
 * @param length the length in bytes of the stream
 * @param codec the compression codec
 * @param bufferSize the compression buffer size
 * @return coded input stream
 */
public static CodedInputStream createCodedInputStream(long? fileId, string name, List<DiskRange> input,
    long length, CompressionCodec codec, int bufferSize) {
    InStream inStream = create(fileId, name, input, length, codec, bufferSize);
    CodedInputStream codedInputStream = CodedInputStream.CreateInstance(inStream);
    codedInputStream.SetSizeLimit(PROTOBUF_MESSAGE_MAX_LIMIT);
    return codedInputStream;
}
/// <exception cref="System.IO.IOException"/>
public Segment(Configuration conf, FileSystem fs, Path file, long segmentOffset, long segmentLength,
    CompressionCodec codec, bool preserve, Counters.Counter mergedMapOutputsCounter) {
    this.conf = conf;
    this.fs = fs;
    this.file = file;
    this.codec = codec;
    this.preserve = preserve;
    this.segmentOffset = segmentOffset;
    this.segmentLength = segmentLength;
    this.mapOutputsCounter = mergedMapOutputsCounter;
}
/// <exception cref="System.IO.IOException"/>
public static InputStream WrapInputStreamForCompression(Configuration conf, string codec, InputStream @in) {
    if (codec.IsEmpty()) {
        return @in;
    }
    FSImageCompression compression = FSImageCompression.CreateCompression(conf, codec);
    CompressionCodec imageCodec = compression.GetImageCodec();
    return imageCodec.CreateInputStream(@in);
}
/// <exception cref="System.IO.IOException"/>
private void SaveInternal(FileOutputStream fout, FSImageCompression compression, string filePath) {
    StartupProgress prog = NameNode.GetStartupProgress();
    MessageDigest digester = MD5Hash.GetDigester();
    underlyingOutputStream = new DigestOutputStream(new BufferedOutputStream(fout), digester);
    underlyingOutputStream.Write(FSImageUtil.MagicHeader);
    fileChannel = fout.GetChannel();
    FsImageProto.FileSummary.Builder b = FsImageProto.FileSummary.NewBuilder()
        .SetOndiskVersion(FSImageUtil.FileVersion)
        .SetLayoutVersion(NameNodeLayoutVersion.CurrentLayoutVersion);
    codec = compression.GetImageCodec();
    if (codec != null) {
        b.SetCodec(codec.GetType().GetCanonicalName());
        sectionOutputStream = codec.CreateOutputStream(underlyingOutputStream);
    } else {
        sectionOutputStream = underlyingOutputStream;
    }
    SaveNameSystemSection(b);
    // Check for cancellation right after serializing the name system section.
    // Some unit tests, such as TestSaveNamespace#testCancelSaveNameSpace,
    // depend on this behavior.
    context.CheckCancelled();
    Step step = new Step(StepType.Inodes, filePath);
    prog.BeginStep(Phase.SavingCheckpoint, step);
    SaveInodes(b);
    SaveSnapshots(b);
    prog.EndStep(Phase.SavingCheckpoint, step);
    step = new Step(StepType.DelegationTokens, filePath);
    prog.BeginStep(Phase.SavingCheckpoint, step);
    SaveSecretManagerSection(b);
    prog.EndStep(Phase.SavingCheckpoint, step);
    step = new Step(StepType.CachePools, filePath);
    prog.BeginStep(Phase.SavingCheckpoint, step);
    SaveCacheManagerSection(b);
    prog.EndStep(Phase.SavingCheckpoint, step);
    SaveStringTableSection(b);
    // We use the underlyingOutputStream to write the header. Therefore flush
    // the buffered stream (which is potentially compressed) first.
    FlushSectionOutputStream();
    FsImageProto.FileSummary summary = (FsImageProto.FileSummary)b.Build();
    SaveFileSummary(underlyingOutputStream, summary);
    underlyingOutputStream.Close();
    savedDigest = new MD5Hash(digester.Digest());
}
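// Hedged sketch (assumed, not shown in the source): FlushSectionOutputStream, called above before
// SaveFileSummary, presumably finishes the (possibly compressed) section stream so that the file
// summary written directly to underlyingOutputStream lands after all compressed section bytes.
// The Finish()/Flush() calls are assumed from the Hadoop codec/stream API as ported here.
private void FlushSectionOutputStream() {
    if (codec != null) {
        ((CompressionOutputStream)sectionOutputStream).Finish();
    }
    sectionOutputStream.Flush();
}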
/**
 * Create an input stream from a list of disk ranges with data.
 * @param name the name of the stream
 * @param input the list of ranges of bytes for the stream; from disk or cache
 * @param length the length in bytes of the stream
 * @param codec the compression codec
 * @param bufferSize the compression buffer size
 * @param cache Low-level cache to use to put data, if any. Only works with compressed streams.
 * @return an input stream
 */
public static InStream create(long? fileId, string name, List<DiskRange> input, long length,
    CompressionCodec codec, int bufferSize) {
    if (codec == null) {
        return new UncompressedStream(fileId, name, input, length);
    } else {
        return new CompressedStream(fileId, name, input, length, codec, bufferSize);
    }
}
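// Hedged usage sketch (not part of the source above): wrapping a single in-memory buffer as a
// one-element DiskRange list and letting create() pick the compressed or uncompressed variant
// depending on whether a codec is present, mirroring the extractFooter/extractMetadata snippets
// earlier in this section. OpenStripeSection is a hypothetical helper name.
internal static InStream OpenStripeSection(ByteBuffer bb, long length, CompressionCodec codec, int bufferSize) {
    List<DiskRange> ranges = new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) };
    return InStream.create(null, "stripe section", ranges, length, codec, bufferSize);
}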
public WriterImpl(
    Stream stream,
    string path,
    OrcFile.WriterOptions options,
    ObjectInspector inspector,
    TypeDescription schema,
    long stripeSize,
    CompressionKind compress,
    int bufferSize,
    int rowIndexStride,
    MemoryManager memoryManager,
    bool addBlockPadding,
    OrcFile.Version version,
    OrcFile.WriterCallback callback,
    OrcFile.EncodingStrategy encodingStrategy,
    OrcFile.CompressionStrategy compressionStrategy,
    double paddingTolerance,
    long blockSizeValue,
    string bloomFilterColumnNames,
    double bloomFilterFpp) {
    this.baseStream = stream;
    this.streamFactory = new StreamFactory(this);
    this.path = path;
    this.options = options;
    this.callback = callback;
    this.schema = schema;
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    int numColumns = schema.getMaximumId() + 1;
    this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns = new bool[schema.getMaximumId() + 1];
    } else {
        this.bloomFilterColumns = OrcUtils.includeColumns(bloomFilterColumnNames, schema);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, schema, streamFactory, false);
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new ArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE);
    }
    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
public static DataReader createDefaultDataReader(Func<Stream> streamCreator, string path,
    bool useZeroCopy, CompressionCodec codec) {
    return new DefaultDataReader(streamCreator, path, useZeroCopy, codec);
}
public void runSeekTest(CompressionCodec codec) {
    TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
    RunLengthIntegerWriter @out = new RunLengthIntegerWriter(
        new OutStream("test", 1000, codec, collect), true);
    TestInStream.PositionCollector[] positions = new TestInStream.PositionCollector[4096];
    Random random = new Random(99);
    int[] junk = new int[2048];
    for (int i = 0; i < junk.Length; ++i) {
        junk[i] = random.Next();
    }
    for (int i = 0; i < 4096; ++i) {
        positions[i] = new TestInStream.PositionCollector();
        @out.getPosition(positions[i]);
        // test runs, incrementing runs, non-runs
        if (i < 1024) {
            @out.write(i / 4);
        } else if (i < 2048) {
            @out.write(2 * i);
        } else {
            @out.write(junk[i - 2048]);
        }
    }
    @out.flush();
    ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
    collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
    inBuf.flip();
#pragma warning disable 612
    RunLengthIntegerReader @in = new RunLengthIntegerReader(
        InStream.create(null, "test", new ByteBuffer[] { inBuf }, new long[] { 0 },
            inBuf.remaining(), codec, 1000), true);
#pragma warning restore 612
    for (int i = 0; i < 2048; ++i) {
        int x = (int)@in.next();
        if (i < 1024) {
            Assert.Equal(i / 4, x);
        } else if (i < 2048) {
            Assert.Equal(2 * i, x);
        } else {
            Assert.Equal(junk[i - 2048], x);
        }
    }
    for (int i = 2047; i >= 0; --i) {
        @in.seek(positions[i]);
        int x = (int)@in.next();
        if (i < 1024) {
            Assert.Equal(i / 4, x);
        } else if (i < 2048) {
            Assert.Equal(2 * i, x);
        } else {
            Assert.Equal(junk[i - 2048], x);
        }
    }
}
public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
    this.compressionCodec = compressionCodec;
    return this;
}
public BufferedStream(string name, int bufferSize, CompressionCodec codec) {
    outStream = new OutStream(name, bufferSize, codec, this);
}
public static TreeReader[] createEncodedTreeReader(int numCols, List<OrcProto.Type> types,
    List<OrcProto.ColumnEncoding> encodings, EncodedColumnBatch<OrcBatchKey> batch,
    CompressionCodec codec, bool skipCorrupt) {
    long file = batch.getBatchKey().file;
    TreeReader[] treeReaders = new TreeReader[numCols];
    for (int i = 0; i < numCols; i++) {
        int columnIndex = batch.getColumnIxs()[i];
        ColumnStreamData[] streamBuffers = batch.getColumnData()[i];
        OrcProto.Type columnType = types[columnIndex];
        // EncodedColumnBatch is already decompressed, we don't really need to pass codec.
        // But we need to know if the original data is compressed or not. This is used to skip
        // positions in row index properly. If the file is originally compressed,
        // then 1st position (compressed offset) in row index should be skipped to get
        // uncompressed offset, else 1st position should not be skipped.
        // TODO: there should be a better way to do this, code just needs to be modified
        OrcProto.ColumnEncoding columnEncoding = encodings[columnIndex];
        // stream buffers are arranged in enum order of stream kind
        ColumnStreamData present = streamBuffers[Kind.PRESENT_VALUE],
            data = streamBuffers[Kind.DATA_VALUE],
            dictionary = streamBuffers[Kind.DICTIONARY_DATA_VALUE],
            lengths = streamBuffers[Kind.LENGTH_VALUE],
            secondary = streamBuffers[Kind.SECONDARY_VALUE];
        switch (columnType.Kind) {
            case OrcProto.Type.Types.Kind.BINARY:
                treeReaders[i] = BinaryStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data).setLengthStream(lengths)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.BOOLEAN:
                treeReaders[i] = BooleanStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.BYTE:
                treeReaders[i] = ByteStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.SHORT:
                treeReaders[i] = ShortStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.INT:
                treeReaders[i] = IntStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.LONG:
                treeReaders[i] = LongStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .skipCorrupt(skipCorrupt)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.FLOAT:
                treeReaders[i] = FloatStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.DOUBLE:
                treeReaders[i] = DoubleStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.CHAR:
                treeReaders[i] = CharStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setMaxLength((int)columnType.MaximumLength)
                    .setPresentStream(present).setDataStream(data).setLengthStream(lengths)
                    .setDictionaryStream(dictionary)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.VARCHAR:
                treeReaders[i] = VarcharStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setMaxLength((int)columnType.MaximumLength)
                    .setPresentStream(present).setDataStream(data).setLengthStream(lengths)
                    .setDictionaryStream(dictionary)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.STRING:
                treeReaders[i] = StringStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data).setLengthStream(lengths)
                    .setDictionaryStream(dictionary)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.DECIMAL:
                treeReaders[i] = DecimalStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPrecision((int)columnType.Precision).setScale((int)columnType.Scale)
                    .setPresentStream(present).setValueStream(data).setScaleStream(secondary)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.TIMESTAMP:
                treeReaders[i] = TimestampStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setSecondsStream(data).setNanosStream(secondary)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .skipCorrupt(skipCorrupt)
                    .build();
                break;
            case OrcProto.Type.Types.Kind.DATE:
                treeReaders[i] = DateStreamReader.builder()
                    .setFileId(file).setColumnIndex(columnIndex)
                    .setPresentStream(present).setDataStream(data)
                    .setCompressionCodec(codec).setColumnEncoding(columnEncoding)
                    .build();
                break;
            default:
                throw new NotSupportedException("Data type not supported yet! " + columnType);
        }
    }
    return treeReaders;
}
internal static InStream create(long? fileId, string streamName, ByteBuffer[] buffers, long[] offsets,
    long length, CompressionCodec codec, int bufferSize) {
    List<DiskRange> input = new List<DiskRange>(buffers.Length);
    for (int i = 0; i < buffers.Length; ++i) {
        input.Add(new RecordReaderImpl.BufferChunk(buffers[i], offsets[i]));
    }
    return create(fileId, streamName, input, length, codec, bufferSize);
}