Ejemplo n.º 1
0
 /// <summary>
 /// Writes one stripe's layout as an object on the given writer, with the keys
 /// "offset", "indexLength", "dataLength", "footerLength" and "rowCount".
 /// </summary>
 /// <param name="writer">Writer that receives the object.</param>
 /// <param name="stripe">Stripe whose offset, section lengths and row count are emitted.</param>
 private static void writeStripeInformation(JsonWriter writer, StripeInformation stripe)
 {
     writer.newObject();

     // Each key is opened before its value is read from the stripe.
     var offsetField = writer.key("offset");
     offsetField.value(stripe.getOffset());

     var indexLengthField = writer.key("indexLength");
     indexLengthField.value(stripe.getIndexLength());

     var dataLengthField = writer.key("dataLength");
     dataLengthField.value(stripe.getDataLength());

     var footerLengthField = writer.key("footerLength");
     footerLengthField.value(stripe.getFooterLength());

     var rowCountField = writer.key("rowCount");
     rowCountField.value(stripe.getNumberOfRows());

     writer.endObject();
 }
        /// <summary>
        /// Reads and parses the footer of the given stripe.
        /// </summary>
        /// <param name="stripe">Supplies the stripe offset and the index, data and footer lengths.</param>
        /// <returns>The stripe footer parsed from the bytes read at the computed position.</returns>
        public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe)
        {
            // Footer position = stripe offset + index length + data length.
            long footerStart = stripe.getOffset();
            footerStart += stripe.getIndexLength();
            footerStart += stripe.getDataLength();
            int footerSize = (int)stripe.getFooterLength();

            // Read the raw footer bytes.
            ByteBuffer footerBytes = ByteBuffer.allocate(footerSize);
            file.readFully(footerStart, footerBytes.array(), footerBytes.arrayOffset(), footerSize);

            // Wrap the bytes as a single disk range for the stream decoder.
            List<DiskRange> ranges = new List<DiskRange>();
            ranges.Add(new RecordReaderImpl.BufferChunk(footerBytes, 0));

            return OrcProto.StripeFooter.ParseFrom(
                InStream.createCodedInputStream(null, "footer", ranges, footerSize, codec, bufferSize));
        }
        /// <summary>
        /// Advances to the next stripe whose offset lies in [start, end) and fills the key and
        /// value wrappers with that stripe's information, statistics and file-level settings.
        /// </summary>
        /// <param name="keyWrapper">Receives the input path and file-level settings (compression,
        /// buffer size, version, row index stride, types), or the incompatible-file flag.</param>
        /// <param name="valueWrapper">Receives the stripe information and statistics, plus the
        /// last-stripe flag and user metadata when no further stripe follows.</param>
        /// <returns>
        /// true when a stripe was produced or the file was flagged as incompatible;
        /// false when the iterator is exhausted.
        /// </returns>
        protected bool nextStripe(OrcFileKeyWrapper keyWrapper, OrcFileValueWrapper valueWrapper)
        {
            // missing stripe stats (old format). If numRows is 0 then its an empty file and no statistics
            // is present. We have to differentiate no stats (empty file) vs missing stats (old format).
            if ((stripeStatistics == null || stripeStatistics.Count == 0) && reader.getNumberOfRows() > 0)
            {
                keyWrapper.setInputPath(path);
                keyWrapper.setIsIncompatFile(true);
                skipFile = true;
                return(true);
            }

            // NOTE(review): if iter is a plain IEnumerator, the look-ahead MoveNext() below already
            // positions it on the following stripe, and this MoveNext() on the next call would then
            // step past that stripe. Fixing it needs state kept across calls - confirm against the
            // iterator's declaration.
            bool active = iter.MoveNext();

            while (active)
            {
                StripeInformation si = iter.Current;

                // if stripe offset is outside the split boundary then ignore the current
                // stripe as it will be handled by some other mapper.
                if (si.getOffset() < start || si.getOffset() >= end)
                {
                    stripeIdx++;
                    // Advance the iterator; without this the loop would re-test the same stripe forever.
                    active = iter.MoveNext();
                    continue;
                }

                valueWrapper.setStripeStatistics(stripeStatistics[stripeIdx++]);
                valueWrapper.setStripeInformation(si);

                // Look ahead to learn whether this was the last stripe in the file.
                active = iter.MoveNext();
                if (!active)
                {
                    valueWrapper.setLastStripeInFile(true);
                    valueWrapper.setUserMetadata(((ReaderImpl)reader).getOrcProtoUserMetadata());
                }

                keyWrapper.setInputPath(path);
                keyWrapper.setCompression(reader.getCompression());
                keyWrapper.setCompressBufferSize(reader.getCompressionSize());
                keyWrapper.setVersion(reader.getFileVersion());
                keyWrapper.setRowIndexStride(reader.getRowIndexStride());
                keyWrapper.setTypes(reader.getTypes());
                return(true);
            }

            return(false);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Stores the given stripe information on this instance.
 /// </summary>
 /// <param name="stripeInformation">Value assigned to the stripeInformation field.</param>
 public void setStripeInformation(StripeInformation stripeInformation) =>
     this.stripeInformation = stripeInformation;
Ejemplo n.º 5
0
        /// <summary>
        /// Appends an already-encoded stripe to the output and records its directory entry and
        /// statistics.
        /// </summary>
        /// <param name="stripe">Array holding the raw stripe bytes; must not be null.</param>
        /// <param name="offset">Index in <paramref name="stripe"/> of the first byte to write.</param>
        /// <param name="length">Number of bytes to write, starting at <paramref name="offset"/>.</param>
        /// <param name="stripeInfo">Supplies the index, data and footer lengths for the directory
        /// entry; must not be null.</param>
        /// <param name="stripeStatistics">Statistics of the stripe; the value count of its first
        /// column statistics entry is used as the stripe's row count. Must not be null.</param>
        public void appendStripe(byte[] stripe, int offset, int length,
            StripeInformation stripeInfo,
            OrcProto.StripeStatistics stripeStatistics)
        {
            checkArgument(stripe != null, "Stripe must not be null");
            checkArgument(length <= stripe.Length,
                "Specified length must not be greater specified array length");
            // offset <= stripe.Length - length avoids overflow in offset + length.
            checkArgument(offset >= 0 && length >= 0 && offset <= stripe.Length - length,
                "Specified offset and length must lie within the stripe array");
            checkArgument(stripeInfo != null, "Stripe information must not be null");
            checkArgument(stripeStatistics != null,
                "Stripe statistics must not be null");

            getStream();
            long start = rawWriter.Position;
            long availBlockSpace = blockSize - (start % blockSize);

            // see if stripe can fit in the current hdfs block, else pad the remaining
            // space in the block
            if (length < blockSize && length > availBlockSpace &&
                addBlockPadding)
            {
                byte[] pad = new byte[(int)Math.Min(HDFS_BUFFER_SIZE, availBlockSpace)];
                LOG.info(String.Format("Padding ORC by {0} bytes while merging..",
                    availBlockSpace));
                // The stripe begins after the padding.
                start += availBlockSpace;
                while (availBlockSpace > 0)
                {
                    int writeLen = (int)Math.Min(availBlockSpace, pad.Length);
                    rawWriter.Write(pad, 0, writeLen);
                    availBlockSpace -= writeLen;
                }
            }

            // Write only the requested slice; previously the whole array was written and
            // offset/length were ignored.
            rawWriter.Write(stripe, offset, length);
            rowsInStripe = (long)stripeStatistics.ColStatsList[0].NumberOfValues;
            rowCount += rowsInStripe;

            // since we have already written the stripe, just update stripe statistics
            treeWriter.stripeStatsBuilders.Add(stripeStatistics.ToBuilder());

            // update file level statistics
            updateFileStatistics(stripeStatistics);

            // update stripe information
            OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation.CreateBuilder();
            dirEntry.Offset = (ulong)start;
            dirEntry.NumberOfRows = (ulong)rowsInStripe;
            dirEntry.IndexLength = (ulong)stripeInfo.getIndexLength();
            dirEntry.DataLength = (ulong)stripeInfo.getDataLength();
            dirEntry.FooterLength = (ulong)stripeInfo.getFooterLength();
            stripes.Add(dirEntry.Build());

            // reset it after writing the stripe
            rowsInStripe = 0;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads the row index streams listed in a stripe footer and, for search-argument columns,
        /// the bloom filter stream that immediately follows a row index stream.
        /// </summary>
        /// <param name="stripe">Stripe whose offset is the position of the first stream.</param>
        /// <param name="footer">Stripe footer; read via readStripeFooter when null.</param>
        /// <param name="included">Per-column flags; null means every column is read.</param>
        /// <param name="indexes">Per-column destination for row indexes; allocated with typeCount
        /// entries when null. Slots that are already non-null are not read again.</param>
        /// <param name="sargColumns">Per-column flags selecting the columns whose bloom filter is
        /// read together with the row index; may be null.</param>
        /// <param name="bloomFilterIndices">Per-column destination for bloom filters; allocated
        /// with typeCount entries when null.</param>
        /// <returns>An index object wrapping the row index and bloom filter arrays.</returns>
        public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
            OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
            bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
        {
            if (footer == null)
            {
                footer = readStripeFooter(stripe);
            }
            if (indexes == null)
            {
                indexes = new OrcProto.RowIndex[typeCount];
            }
            if (bloomFilterIndices == null)
            {
                bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
            }
            long offset = stripe.getOffset();
            IList<OrcProto.Stream> streams = footer.StreamsList;
            for (int i = 0; i < streams.Count; i++)
            {
                OrcProto.Stream stream = streams[i];
                // Stays null when the current stream is the last one in the footer.
                OrcProto.Stream nextStream = null;
                if (i < streams.Count - 1)
                {
                    nextStream = streams[i + 1];
                }
                int col = (int)stream.Column;
                int len = (int)stream.Length;
                // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
                // filter and combine the io to read row index and bloom filters for that column together
                if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
                {
                    bool readBloomFilter = false;
                    // Guard against a row index that is the final stream: there is no next stream to inspect.
                    if (sargColumns != null && sargColumns[col] &&
                        nextStream != null &&
                        nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
                    {
                        len += (int)nextStream.Length;
                        // The bloom filter stream is consumed here, so skip it in the loop.
                        i += 1;
                        readBloomFilter = true;
                    }
                    if ((included == null || included[col]) && indexes[col] == null)
                    {
                        byte[] buffer = new byte[len];
                        file.readFully(offset, buffer, 0, buffer.Length);
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                            new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                            (long)stream.Length, codec, bufferSize));
                        if (readBloomFilter)
                        {
                            // The bloom filter bytes start right after the row index bytes in the buffer.
                            bb.position((int)stream.Length);
                            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                                null, "bloom_filter", new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                                (long)nextStream.Length, codec, bufferSize));
                        }
                    }
                }
                // Advance past this stream (and the bloom filter, when it was combined above).
                offset += len;
            }

            RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
            return index;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Reads and parses the footer of the given stripe.
        /// </summary>
        /// <param name="stripe">Supplies the stripe offset and the index, data and footer lengths.</param>
        /// <returns>The stripe footer parsed from the bytes read at the computed position.</returns>
        public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe)
        {
            // Footer position = stripe offset + index length + data length.
            long footerStart = stripe.getOffset();
            footerStart += stripe.getIndexLength();
            footerStart += stripe.getDataLength();
            int footerSize = (int)stripe.getFooterLength();

            // Read the raw footer bytes.
            ByteBuffer footerBytes = ByteBuffer.allocate(footerSize);
            file.readFully(footerStart, footerBytes.array(), footerBytes.arrayOffset(), footerSize);

            // Wrap the bytes as a single disk range for the stream decoder.
            List<DiskRange> ranges = new List<DiskRange>();
            ranges.Add(new RecordReaderImpl.BufferChunk(footerBytes, 0));

            return OrcProto.StripeFooter.ParseFrom(
                InStream.createCodedInputStream(null, "footer", ranges, footerSize, codec, bufferSize));
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Finishes the current stripe: writes each column's row index, then the data streams,
        /// then the serialized stripe footer to the output stream; records the stripe's layout
        /// and statistics; and resets the column writers and the per-stripe row counter.
        /// </summary>
        private void CompleteStripe()
        {
            var footer = new StripeFooter();
            var statsForStripe = new StripeStatistics();

            // Flush each column writer's buffers and register its encoding in the footer.
            foreach (var column in _columnWriters)
            {
                column.ColumnWriter.FlushBuffers();

                // DictionaryLength is only used by StringWriter
                var dictionaryLength = (column.ColumnWriter as StringWriter)?.DictionaryLength ?? 0;
                footer.AddColumn(column.ColumnWriter.ColumnEncoding, dictionaryLength);
            }

            // The stripe starts at the current output position.
            var info = new StripeInformation();
            info.Offset = (ulong)_outputStream.Position;
            info.NumberOfRows = (ulong)_rowsInStripe;

            // Index section: one row-index buffer per column.
            foreach (var column in _columnWriters)
            {
                var rowIndexBuffer = _bufferFactory.CreateBuffer(StreamKind.RowIndex);
                column.ColumnWriter.Statistics.WriteToBuffer(rowIndexBuffer);
                rowIndexBuffer.CopyTo(_outputStream);

                // Register the index stream in the footer.
                footer.AddDataStream(column.ColumnWriter.ColumnId, rowIndexBuffer);

                // Fold every statistics entry into the stripe-level and file-level summaries.
                var stripeColumnStats = new ColumnStatistics();
                foreach (var entry in column.ColumnWriter.Statistics)
                {
                    entry.FillColumnStatistics(stripeColumnStats);
                    entry.FillColumnStatistics(column.FileStatistics);
                }
                statsForStripe.ColStats.Add(stripeColumnStats);
            }
            _stripeStats.Add(statsForStripe);

            // Everything written since the stripe start is index data.
            info.IndexLength = (ulong)_outputStream.Position - info.Offset;

            // Data section: copy only the buffers flagged as required.
            foreach (var column in _columnWriters)
            {
                foreach (var dataBuffer in column.ColumnWriter.Buffers)
                {
                    if (dataBuffer.MustBeIncluded)
                    {
                        dataBuffer.CopyTo(_outputStream);
                        footer.AddDataStream(column.ColumnWriter.ColumnId, dataBuffer);
                    }
                }
            }

            // Bytes written after the index section are data.
            info.DataLength = (ulong)_outputStream.Position - info.IndexLength - info.Offset;

            // Footer section: its length is reported by the buffer factory.
            long footerLength;
            _bufferFactory.SerializeAndCompressTo(_outputStream, footer, out footerLength);
            info.FooterLength = (ulong)footerLength;

            _stripeInformations.Add(info);

            // Roll the stripe's rows into the file total and start a fresh stripe.
            _rowsInFile += _rowsInStripe;
            _rowsInStripe = 0;
            foreach (var column in _columnWriters)
            {
                column.ColumnWriter.Reset();
            }
        }
        /// <summary>
        /// Reads the row index streams listed in a stripe footer and, for search-argument columns,
        /// the bloom filter stream that immediately follows a row index stream.
        /// </summary>
        /// <param name="stripe">Stripe whose offset is the position of the first stream.</param>
        /// <param name="footer">Stripe footer; read via readStripeFooter when null.</param>
        /// <param name="included">Per-column flags; null means every column is read.</param>
        /// <param name="indexes">Per-column destination for row indexes; allocated with typeCount
        /// entries when null. Slots that are already non-null are not read again.</param>
        /// <param name="sargColumns">Per-column flags selecting the columns whose bloom filter is
        /// read together with the row index; may be null.</param>
        /// <param name="bloomFilterIndices">Per-column destination for bloom filters; allocated
        /// with typeCount entries when null.</param>
        /// <returns>An index object wrapping the row index and bloom filter arrays.</returns>
        public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
                                                   OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
                                                   bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
        {
            if (footer == null)
            {
                footer = readStripeFooter(stripe);
            }
            if (indexes == null)
            {
                indexes = new OrcProto.RowIndex[typeCount];
            }
            if (bloomFilterIndices == null)
            {
                bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
            }
            long offset = stripe.getOffset();
            IList <OrcProto.Stream> streams = footer.StreamsList;

            for (int i = 0; i < streams.Count; i++)
            {
                OrcProto.Stream stream     = streams[i];
                // Stays null when the current stream is the last one in the footer.
                OrcProto.Stream nextStream = null;
                if (i < streams.Count - 1)
                {
                    nextStream = streams[i + 1];
                }
                int col = (int)stream.Column;
                int len = (int)stream.Length;
                // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
                // filter and combine the io to read row index and bloom filters for that column together
                if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
                {
                    bool readBloomFilter = false;
                    // Guard against a row index that is the final stream: there is no next stream to inspect.
                    if (sargColumns != null && sargColumns[col] &&
                        nextStream != null &&
                        nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
                    {
                        len            += (int)nextStream.Length;
                        // The bloom filter stream is consumed here, so skip it in the loop.
                        i              += 1;
                        readBloomFilter = true;
                    }
                    if ((included == null || included[col]) && indexes[col] == null)
                    {
                        byte[] buffer = new byte[len];
                        file.readFully(offset, buffer, 0, buffer.Length);
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                                                                                   new List <DiskRange> {
                            new RecordReaderImpl.BufferChunk(bb, 0)
                        },
                                                                                   (long)stream.Length, codec, bufferSize));
                        if (readBloomFilter)
                        {
                            // The bloom filter bytes start right after the row index bytes in the buffer.
                            bb.position((int)stream.Length);
                            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                                                                                              null, "bloom_filter", new List <DiskRange> {
                                new RecordReaderImpl.BufferChunk(bb, 0)
                            },
                                                                                              (long)nextStream.Length, codec, bufferSize));
                        }
                    }
                }
                // Advance past this stream (and the bloom filter, when it was combined above).
                offset += len;
            }

            RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
            return(index);
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Writes one stripe's layout as an object on the given writer, with the keys
 /// "offset", "indexLength", "dataLength", "footerLength" and "rowCount".
 /// </summary>
 /// <param name="writer">Writer that receives the object.</param>
 /// <param name="stripe">Stripe whose offset, section lengths and row count are emitted.</param>
 private static void writeStripeInformation(JsonWriter writer, StripeInformation stripe)
 {
     writer.newObject();

     // Each key is opened before its value is read from the stripe.
     var offsetField = writer.key("offset");
     offsetField.value(stripe.getOffset());

     var indexLengthField = writer.key("indexLength");
     indexLengthField.value(stripe.getIndexLength());

     var dataLengthField = writer.key("dataLength");
     dataLengthField.value(stripe.getDataLength());

     var footerLengthField = writer.key("footerLength");
     footerLengthField.value(stripe.getFooterLength());

     var rowCountField = writer.key("rowCount");
     rowCountField.value(stripe.getNumberOfRows());

     writer.endObject();
 }