// Emits one JSON object describing a stripe: its offset, the lengths of its
// index, data and footer sections, and its row count.
//
// writer - JSON writer that receives the object.
// stripe - stripe whose layout values are written.
private static void writeStripeInformation(JsonWriter writer, StripeInformation stripe)
{
    writer.newObject();

    // Keys are written in the same order the stripe sections are laid out.
    writer.key("offset").value(stripe.getOffset());
    writer.key("indexLength").value(stripe.getIndexLength());
    writer.key("dataLength").value(stripe.getDataLength());
    writer.key("footerLength").value(stripe.getFooterLength());
    writer.key("rowCount").value(stripe.getNumberOfRows());

    writer.endObject();
}
// Reads and parses the footer of the given stripe.
//
// stripe  - stripe whose footer should be loaded.
// returns - the parsed OrcProto.StripeFooter message.
public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe)
{
    // The footer position is computed as stripe offset + index length + data length.
    long footerStart = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
    int footerSize = (int)stripe.getFooterLength();

    // read the footer
    ByteBuffer footerBytes = ByteBuffer.allocate(footerSize);
    file.readFully(footerStart, footerBytes.array(), footerBytes.arrayOffset(), footerSize);

    // Wrap the bytes in a single chunk and hand them to the stream factory,
    // passing along the reader's codec and buffer size.
    List<DiskRange> chunks = new List<DiskRange> { new RecordReaderImpl.BufferChunk(footerBytes, 0) };
    return OrcProto.StripeFooter.ParseFrom(
        InStream.createCodedInputStream(null, "footer", chunks, footerSize, codec, bufferSize));
}
// True when the previous nextStripe call already advanced iter onto a stripe
// (to find out whether the returned stripe was the last one) and that stripe
// has not been examined yet.
private bool nextStripePrefetched;

// Fills keyWrapper/valueWrapper with the next stripe whose offset lies inside
// [start, end).
//
// keyWrapper   - receives the input path and file-level settings
//                (compression, buffer size, version, row index stride, types).
// valueWrapper - receives the stripe information and its statistics; flagged
//                as last stripe (with user metadata) when iter is exhausted.
// returns      - true when the wrappers were populated (or the file was
//                flagged as incompatible), false when no stripe is left.
//
// NOTE(review): assumes iter follows the standard IEnumerator contract, i.e.
// MoveNext() advances and Current stays on the element MoveNext() selected.
protected bool nextStripe(OrcFileKeyWrapper keyWrapper, OrcFileValueWrapper valueWrapper)
{
    // Missing stripe stats (old format). If the row count is 0 the file is empty
    // and no statistics are present, so we have to differentiate "no stats"
    // (empty file) from "missing stats" (old format).
    if ((stripeStatistics == null || stripeStatistics.Count == 0) && reader.getNumberOfRows() > 0)
    {
        keyWrapper.setInputPath(path);
        keyWrapper.setIsIncompatFile(true);
        skipFile = true;
        return true;
    }

    // Reuse the stripe a previous call peeked at instead of advancing again;
    // advancing twice would silently drop that stripe.
    bool active = nextStripePrefetched || iter.MoveNext();
    nextStripePrefetched = false;

    while (active)
    {
        StripeInformation si = iter.Current;

        // if stripe offset is outside the split boundary then ignore the current
        // stripe as it will be handled by some other mapper.
        if (si.getOffset() >= start && si.getOffset() < end)
        {
            valueWrapper.setStripeStatistics(stripeStatistics[stripeIdx++]);
            valueWrapper.setStripeInformation(si);

            // Peek ahead to learn whether this was the last stripe in the file.
            active = iter.MoveNext();
            if (active)
            {
                // Remember the look-ahead so the next call starts from it.
                nextStripePrefetched = true;
            }
            else
            {
                valueWrapper.setLastStripeInFile(true);
                valueWrapper.setUserMetadata(((ReaderImpl)reader).getOrcProtoUserMetadata());
            }

            keyWrapper.setInputPath(path);
            keyWrapper.setCompression(reader.getCompression());
            keyWrapper.setCompressBufferSize(reader.getCompressionSize());
            keyWrapper.setVersion(reader.getFileVersion());
            keyWrapper.setRowIndexStride(reader.getRowIndexStride());
            keyWrapper.setTypes(reader.getTypes());
            return true;
        }

        // Skip this stripe but keep the statistics index aligned with it, and
        // advance the iterator so the loop makes progress.
        stripeIdx++;
        active = iter.MoveNext();
    }

    return false;
}
// Replaces the stripe information held by this object.
//
// stripeInformation - the new value; stored as-is without validation.
public void setStripeInformation(StripeInformation stripeInformation)
{
    this.stripeInformation = stripeInformation;
}
// Appends an already-encoded stripe to the output without re-encoding it and
// records its statistics and directory entry.
//
// stripe           - buffer holding the raw stripe bytes.
// offset           - index in stripe of the first byte to write.
// length           - number of bytes to write, starting at offset.
// stripeInfo       - supplies the index/data/footer lengths for the new
//                    directory entry.
// stripeStatistics - statistics of the stripe; the value count of its first
//                    column entry is used as the stripe's row count.
public void appendStripe(byte[] stripe, int offset, int length, StripeInformation stripeInfo, OrcProto.StripeStatistics stripeStatistics)
{
    checkArgument(stripe != null, "Stripe must not be null");
    checkArgument(length <= stripe.Length, "Specified length must not be greater specified array length");
    checkArgument(stripeInfo != null, "Stripe information must not be null");
    checkArgument(stripeStatistics != null, "Stripe statistics must not be null");

    getStream();
    long start = rawWriter.Position;
    long availBlockSpace = blockSize - (start % blockSize);

    // see if stripe can fit in the current hdfs block, else pad the remaining
    // space in the block
    if (length < blockSize && length > availBlockSpace && addBlockPadding)
    {
        byte[] pad = new byte[(int)Math.Min(HDFS_BUFFER_SIZE, availBlockSpace)];
        LOG.info(String.Format("Padding ORC by {0} bytes while merging..", availBlockSpace));
        // The stripe will begin after the padding.
        start += availBlockSpace;
        while (availBlockSpace > 0)
        {
            int writeLen = (int)Math.Min(availBlockSpace, pad.Length);
            rawWriter.Write(pad, 0, writeLen);
            availBlockSpace -= writeLen;
        }
    }

    // Write exactly the caller-specified slice. Writing the whole array would
    // ignore offset/length and disagree with the length used for the padding
    // decision above.
    rawWriter.Write(stripe, offset, length);
    rowsInStripe = (long)stripeStatistics.ColStatsList[0].NumberOfValues;
    rowCount += rowsInStripe;

    // since we have already written the stripe, just update stripe statistics
    treeWriter.stripeStatsBuilders.Add(stripeStatistics.ToBuilder());

    // update file level statistics
    updateFileStatistics(stripeStatistics);

    // update stripe information
    OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation.CreateBuilder();
    dirEntry.Offset = (ulong)start;
    dirEntry.NumberOfRows = (ulong)rowsInStripe;
    dirEntry.IndexLength = (ulong)stripeInfo.getIndexLength();
    dirEntry.DataLength = (ulong)stripeInfo.getDataLength();
    dirEntry.FooterLength = (ulong)stripeInfo.getFooterLength();
    stripes.Add(dirEntry.Build());

    // reset it after writing the stripe
    rowsInStripe = 0;
}
// Loads the row indexes (and, for sarg columns, the bloom filter indexes) of a
// stripe.
//
// stripe             - stripe to read from; its offset is where the first
//                      stream is assumed to start.
// footer             - stripe footer listing the streams; read via
//                      readStripeFooter when null.
// included           - per-column flags; null means every column is included.
// indexes            - row index array to fill; allocated with typeCount
//                      entries when null. Entries that are already non-null
//                      are left untouched.
// sargColumns        - per-column flags selecting columns whose bloom filter
//                      should be read together with the row index; may be null.
// bloomFilterIndices - bloom filter array to fill; allocated with typeCount
//                      entries when null.
// returns            - an Index wrapping the row index and bloom filter arrays.
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe, OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes, bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }

    // Running file position of the stream currently being visited.
    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        // Stays null when the current stream is the last one in the footer.
        OrcProto.Stream nextStream = null;
        if (i < streams.Count - 1)
        {
            nextStream = streams[i + 1];
        }
        int col = (int)stream.Column;
        int len = (int)stream.Length;

        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
        // filter and combine the io to read row index and bloom filters for that column together
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            // nextStream is null when the row index is the final stream, so it
            // must be checked before its Kind is inspected.
            if (sargColumns != null && sargColumns[col] && nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
            {
                len += (int)nextStream.Length;
                // The bloom filter stream is consumed here; skip it in the loop.
                i += 1;
                readBloomFilter = true;
            }

            if ((included == null || included[col]) && indexes[col] == null)
            {
                // One read covers the row index and, if requested, the bloom filter.
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // The bloom filter bytes start right after the row index bytes.
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        offset += len;
    }

    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}
// Loads the footer of a stripe from the underlying file and decodes it.
//
// stripe  - stripe description providing the offset and section lengths.
// returns - the decoded OrcProto.StripeFooter.
public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe)
{
    // Position used for the read: stripe offset plus index and data lengths.
    long tailOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
    int tailSize = (int)stripe.getFooterLength();

    // read the footer
    ByteBuffer tail = ByteBuffer.allocate(tailSize);
    file.readFully(tailOffset, tail.array(), tail.arrayOffset(), tailSize);

    // Expose the buffer as a single disk range for the stream factory.
    List<DiskRange> ranges = new List<DiskRange>
    {
        new RecordReaderImpl.BufferChunk(tail, 0)
    };
    return OrcProto.StripeFooter.ParseFrom(
        InStream.createCodedInputStream(null, "footer", ranges, tailSize, codec, bufferSize));
}
// Finishes the stripe currently being built. In order, it:
//   1. flushes every column writer and registers its encoding in the footer,
//   2. writes one row-index stream per column and gathers stripe/file statistics,
//   3. writes the data streams that must be included,
//   4. serializes the stripe footer,
//   5. records the stripe's layout and resets the per-stripe state.
private void CompleteStripe()
{
    var footer = new StripeFooter();
    var statsForStripe = new StripeStatistics();

    // Columns
    foreach (var column in _columnWriters)
    {
        var columnWriter = column.ColumnWriter;
        columnWriter.FlushBuffers();
        // DictionaryLength is only used by StringWriter
        var dictionaryLength = (columnWriter as StringWriter)?.DictionaryLength ?? 0;
        footer.AddColumn(columnWriter.ColumnEncoding, dictionaryLength);
    }

    // The stripe starts at the current output position.
    var layout = new StripeInformation
    {
        Offset = (ulong)_outputStream.Position,
        NumberOfRows = (ulong)_rowsInStripe
    };

    // Indexes
    foreach (var column in _columnWriters)
    {
        var columnWriter = column.ColumnWriter;

        // Write the index buffer
        var rowIndexBuffer = _bufferFactory.CreateBuffer(StreamKind.RowIndex);
        columnWriter.Statistics.WriteToBuffer(rowIndexBuffer);
        rowIndexBuffer.CopyTo(_outputStream);

        // Add the index to the footer
        footer.AddDataStream(columnWriter.ColumnId, rowIndexBuffer);

        // Collect summary statistics: each entry feeds both the per-stripe
        // summary and the file-level statistics of the column.
        var summary = new ColumnStatistics();
        foreach (var entry in columnWriter.Statistics)
        {
            entry.FillColumnStatistics(summary);
            entry.FillColumnStatistics(column.FileStatistics);
        }
        statsForStripe.ColStats.Add(summary);
    }
    _stripeStats.Add(statsForStripe);

    // Everything written since the stripe offset so far is index data.
    layout.IndexLength = (ulong)_outputStream.Position - layout.Offset;

    // Data streams
    foreach (var column in _columnWriters)
    {
        var columnWriter = column.ColumnWriter;
        foreach (var dataBuffer in columnWriter.Buffers)
        {
            if (dataBuffer.MustBeIncluded)
            {
                dataBuffer.CopyTo(_outputStream);
                footer.AddDataStream(columnWriter.ColumnId, dataBuffer);
            }
        }
    }

    // Bytes written after the index section belong to the data section.
    layout.DataLength = (ulong)_outputStream.Position - layout.IndexLength - layout.Offset;

    // Footer
    long footerLength;
    _bufferFactory.SerializeAndCompressTo(_outputStream, footer, out footerLength);
    layout.FooterLength = (ulong)footerLength;

    _stripeInformations.Add(layout);

    // Roll the stripe's rows into the file total and start a fresh stripe.
    _rowsInFile += _rowsInStripe;
    _rowsInStripe = 0;
    foreach (var column in _columnWriters)
    {
        column.ColumnWriter.Reset();
    }
}
// Reads the row index streams of a stripe and, for columns flagged in
// sargColumns, the bloom filter stream that directly follows the row index.
//
// stripe             - stripe to read; stream positions are accumulated
//                      starting from its offset.
// footer             - footer describing the stripe's streams; loaded with
//                      readStripeFooter when null.
// included           - per-column inclusion flags, or null to include all.
// indexes            - destination for row indexes; created with typeCount
//                      slots when null. Slots that are already filled are
//                      not read again.
// sargColumns        - per-column flags requesting bloom filters; may be null.
// bloomFilterIndices - destination for bloom filters; created with typeCount
//                      slots when null.
// returns            - an Index built from the two arrays.
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe, OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes, bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }

    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        // Only set when another stream follows the current one.
        OrcProto.Stream nextStream = null;
        if (i < streams.Count - 1)
        {
            nextStream = streams[i + 1];
        }
        int col = (int)stream.Column;
        int len = (int)stream.Length;

        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
        // filter and combine the io to read row index and bloom filters for that column together
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            // Guard against a row index that is the very last stream: there is
            // no following stream whose Kind could be inspected.
            bool followedByBloomFilter = nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER;
            if (sargColumns != null && sargColumns[col] && followedByBloomFilter)
            {
                len += (int)nextStream.Length;
                // Both streams are handled in this iteration.
                i += 1;
                readBloomFilter = true;
            }

            if ((included == null || included[col]) && indexes[col] == null)
            {
                // Fetch the row index (plus bloom filter, if combined) in one read.
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // Move past the row index bytes to the bloom filter bytes.
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        offset += len;
    }

    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}