/// <summary>
        /// Reads the row index streams of one stripe and, for columns flagged in
        /// <paramref name="sargColumns"/>, the bloom filter stream that directly follows
        /// the row index.
        /// </summary>
        /// <param name="stripe">Stripe to read; its offset is the file position of the first stream.</param>
        /// <param name="footer">Stripe footer listing the streams; loaded with <c>readStripeFooter</c> when null.</param>
        /// <param name="included">Per-column inclusion flags; null means every column is read.</param>
        /// <param name="indexes">Destination array indexed by column; allocated with <c>typeCount</c> slots when null. Slots that are already non-null are not re-read.</param>
        /// <param name="sargColumns">Per-column flags selecting the columns whose bloom filter is read; may be null.</param>
        /// <param name="bloomFilterIndices">Destination array for bloom filters indexed by column; allocated with <c>typeCount</c> slots when null.</param>
        /// <returns>A <c>RecordReaderImpl.Index</c> wrapping the two (possibly caller-supplied) arrays.</returns>
        public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
            OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
            bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
        {
            if (footer == null)
            {
                footer = readStripeFooter(stripe);
            }
            if (indexes == null)
            {
                indexes = new OrcProto.RowIndex[typeCount];
            }
            if (bloomFilterIndices == null)
            {
                bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
            }

            // Running file position of the stream currently being visited.
            long offset = stripe.getOffset();
            IList<OrcProto.Stream> streams = footer.StreamsList;
            for (int i = 0; i < streams.Count; i++)
            {
                OrcProto.Stream stream = streams[i];
                // Stays null when the current stream is the last one in the footer.
                OrcProto.Stream nextStream = null;
                if (i < streams.Count - 1)
                {
                    nextStream = streams[i + 1];
                }
                int col = (int)stream.Column;
                int len = (int)stream.Length;
                // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
                // filter and combine the io to read row index and bloom filters for that column together
                if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
                {
                    bool readBloomFilter = false;
                    // Guard against a row index that is the final stream: there is no
                    // following stream to inspect, so no bloom filter can be attached.
                    if (sargColumns != null && sargColumns[col] &&
                        nextStream != null &&
                        nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
                    {
                        // Extend the read over the bloom filter and skip it in the loop.
                        len += (int)nextStream.Length;
                        i += 1;
                        readBloomFilter = true;
                    }
                    if ((included == null || included[col]) && indexes[col] == null)
                    {
                        byte[] buffer = new byte[len];
                        file.readFully(offset, buffer, 0, buffer.Length);
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                            new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                            (long)stream.Length, codec, bufferSize));
                        if (readBloomFilter)
                        {
                            // Move past the row index bytes so the bloom filter is parsed
                            // from the remainder of the same buffer.
                            bb.position((int)stream.Length);
                            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                                null, "bloom_filter", new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                                (long)nextStream.Length, codec, bufferSize));
                        }
                    }
                }
                // Advance past everything consumed this iteration (including a merged bloom filter).
                offset += len;
            }

            RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
            return index;
        }
        /// <summary>
        /// Checks the disk ranges produced by <c>RecordReaderImpl.planReadPartialDataStreams</c>
        /// for a struct{x: int, y: int} stripe under several column and row-group
        /// selections. Each scenario is run with the final boolean argument set to
        /// false and then true (presumably "merge buffers" - confirm against the callee).
        /// </summary>
        public void testPartialPlan()
        {
            // Streams: column 1 = PRESENT(1000) + DATA(99000), column 2 = PRESENT(2000) + DATA(98000).
            var streamList = new List<OrcProto.Stream>
            {
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
                    .SetColumn(1).SetLength(1000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.DATA)
                    .SetColumn(1).SetLength(99000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
                    .SetColumn(2).SetLength(2000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.DATA)
                    .SetColumn(2).SetLength(98000).Build(),
            };

            bool[] selectedColumns = { true, true, false };
            bool[] selectedGroups = { true, true, false, false, true, false };

            // Row index for column 1: six entries whose positions are
            // (g * 100, MaxValue, MaxValue, g * 10000) for g = 0..5.
            var rowIndexes = new OrcProto.RowIndex[selectedColumns.Length];
            var indexBuilder = OrcProto.RowIndex.CreateBuilder();
            for (int g = 0; g < 6; g++)
            {
                indexBuilder.AddEntry(OrcProto.RowIndexEntry.CreateBuilder()
                    .AddPositions((ulong)(g * 100))
                    .AddPositions(UInt64.MaxValue)
                    .AddPositions(UInt64.MaxValue)
                    .AddPositions((ulong)(g * 10000))
                    .Build());
            }
            rowIndexes[1] = indexBuilder.Build();

            // Every column (root struct included) uses DIRECT encoding.
            var columnEncodings = new List<OrcProto.ColumnEncoding>();
            for (int c = 0; c < 3; c++)
            {
                columnEncodings.Add(OrcProto.ColumnEncoding.CreateBuilder()
                    .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build());
            }

            // Schema: struct{x: int, y: int}
            var schemaTypes = new List<OrcProto.Type>
            {
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT)
                    .AddSubtypes(1).AddSubtypes(2)
                    .AddFieldNames("x").AddFieldNames("y").Build(),
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build(),
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build(),
            };

            // Scenario 1: column 1 only, row groups 0, 1 and 4.
            DiskRangeList plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Equal(
                diskRanges(
                    0, 1000,
                    100, 1000,
                    400, 1000,
                    1000, 11000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
                    11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
                    41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP),
                plan);

            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, true);
            Assert.Equal(
                diskRanges(
                    0, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
                    41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP),
                plan);

            // Scenario 2: no row group selected, so nothing is planned.
            selectedGroups = new[] { false, false, false, false, false, false };
            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Null(plan);

            // Scenario 3: no row-group filter (null), columns 0 and 2 only.
            selectedColumns = new[] { true, false, true };
            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, null,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Equal(diskRanges(100000, 102000, 102000, 200000), plan);

            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, null,
                false, columnEncodings, schemaTypes, 32768, true);
            Assert.Equal(diskRanges(100000, 200000), plan);

            // Scenario 4: move the index from column 1 to column 2 and select row group 1 only.
            selectedGroups = new[] { false, true, false, false, false, false };
            rowIndexes[2] = rowIndexes[1];
            rowIndexes[1] = null;
            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Equal(
                diskRanges(
                    100100, 102000,
                    112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP),
                plan);

            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, true);
            Assert.Equal(
                diskRanges(
                    100100, 102000,
                    112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP),
                plan);

            // Scenario 5: both columns share the index; all columns, last row group only.
            selectedGroups = new[] { false, false, false, false, false, true };
            rowIndexes[1] = rowIndexes[2];
            selectedColumns = new[] { true, true, true };
            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Equal(
                diskRanges(500, 1000, 51000, 100000, 100500, 102000, 152000, 200000),
                plan);

            plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, true);
            Assert.Equal(
                diskRanges(500, 1000, 51000, 100000, 100500, 102000, 152000, 200000),
                plan);
        }
        /// <summary>
        /// Loads the row index of every requested column in a stripe, together with the
        /// bloom filter stream that immediately follows it when the column is flagged in
        /// <paramref name="sargColumns"/>.
        /// </summary>
        /// <param name="stripe">Stripe being read; <c>stripe.getOffset()</c> is where its first stream starts.</param>
        /// <param name="footer">Stripe footer that lists the streams; fetched with <c>readStripeFooter</c> when null.</param>
        /// <param name="included">Per-column inclusion flags, or null to read all columns.</param>
        /// <param name="indexes">Output array indexed by column; created with <c>typeCount</c> slots when null. Non-null slots are kept as they are.</param>
        /// <param name="sargColumns">Per-column flags marking columns whose bloom filter should be read; may be null.</param>
        /// <param name="bloomFilterIndices">Output array for bloom filters indexed by column; created with <c>typeCount</c> slots when null.</param>
        /// <returns>A <c>RecordReaderImpl.Index</c> built from the two arrays.</returns>
        public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
                                                   OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
                                                   bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
        {
            if (footer == null)
            {
                footer = readStripeFooter(stripe);
            }
            if (indexes == null)
            {
                indexes = new OrcProto.RowIndex[typeCount];
            }
            if (bloomFilterIndices == null)
            {
                bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
            }

            // File position of the stream visited in the current iteration.
            long offset = stripe.getOffset();
            IList<OrcProto.Stream> streams = footer.StreamsList;

            for (int i = 0; i < streams.Count; i++)
            {
                OrcProto.Stream stream = streams[i];
                // Remains null for the last stream of the footer.
                OrcProto.Stream nextStream = null;
                if (i < streams.Count - 1)
                {
                    nextStream = streams[i + 1];
                }
                int col = (int)stream.Column;
                int len = (int)stream.Length;
                // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
                // filter and combine the io to read row index and bloom filters for that column together
                if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
                {
                    bool readBloomFilter = false;
                    // A trailing row index has no successor; without the null check the
                    // Kind lookup below would throw NullReferenceException.
                    if (sargColumns != null && sargColumns[col] &&
                        nextStream != null &&
                        nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
                    {
                        // Read both streams in one request and skip the bloom filter in the loop.
                        len += (int)nextStream.Length;
                        i += 1;
                        readBloomFilter = true;
                    }
                    if ((included == null || included[col]) && indexes[col] == null)
                    {
                        byte[] buffer = new byte[len];
                        file.readFully(offset, buffer, 0, buffer.Length);
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(
                            null, "index",
                            new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                            (long)stream.Length, codec, bufferSize));
                        if (readBloomFilter)
                        {
                            // Skip the row index bytes; the bloom filter occupies the rest of the buffer.
                            bb.position((int)stream.Length);
                            bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                                null, "bloom_filter",
                                new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                                (long)nextStream.Length, codec, bufferSize));
                        }
                    }
                }
                // Step over everything consumed this iteration, merged bloom filter included.
                offset += len;
            }

            RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
            return index;
        }
        /// <summary>
        /// Checks the disk ranges produced by <c>RecordReaderImpl.planReadPartialDataStreams</c>
        /// for a struct{x: string, y: int} stripe whose string column is
        /// DICTIONARY-encoded, selecting column 1 and row groups 1, 4 and 5.
        /// </summary>
        public void testPartialPlanString()
        {
            // Streams: column 1 = PRESENT(1000) + DATA(94000) + LENGTH(2000) + DICTIONARY_DATA(3000),
            //          column 2 = PRESENT(2000) + DATA(98000).
            var streamList = new List<OrcProto.Stream>
            {
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
                    .SetColumn(1).SetLength(1000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.DATA)
                    .SetColumn(1).SetLength(94000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.LENGTH)
                    .SetColumn(1).SetLength(2000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.DICTIONARY_DATA)
                    .SetColumn(1).SetLength(3000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
                    .SetColumn(2).SetLength(2000).Build(),
                OrcProto.Stream.CreateBuilder()
                    .SetKind(OrcProto.Stream.Types.Kind.DATA)
                    .SetColumn(2).SetLength(98000).Build(),
            };

            bool[] selectedColumns = { true, true, false };
            bool[] selectedGroups = { false, true, false, false, true, true };

            // Row index for column 1: six entries whose positions are
            // (g * 100, MaxValue, MaxValue, g * 10000) for g = 0..5.
            var rowIndexes = new OrcProto.RowIndex[selectedColumns.Length];
            var indexBuilder = OrcProto.RowIndex.CreateBuilder();
            for (int g = 0; g < 6; g++)
            {
                indexBuilder.AddEntry(OrcProto.RowIndexEntry.CreateBuilder()
                    .AddPositions((ulong)(g * 100))
                    .AddPositions(UInt64.MaxValue)
                    .AddPositions(UInt64.MaxValue)
                    .AddPositions((ulong)(g * 10000))
                    .Build());
            }
            rowIndexes[1] = indexBuilder.Build();

            // Encodings: root DIRECT, column 1 DICTIONARY, column 2 DIRECT.
            var columnEncodings = new List<OrcProto.ColumnEncoding>
            {
                OrcProto.ColumnEncoding.CreateBuilder()
                    .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build(),
                OrcProto.ColumnEncoding.CreateBuilder()
                    .SetKind(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY).Build(),
                OrcProto.ColumnEncoding.CreateBuilder()
                    .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build(),
            };

            // Schema: struct{x: string, y: int}
            var schemaTypes = new List<OrcProto.Type>
            {
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT)
                    .AddSubtypes(1).AddSubtypes(2)
                    .AddFieldNames("x").AddFieldNames("y").Build(),
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRING).Build(),
                OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build(),
            };

            // Filter by both columns and row groups.
            DiskRangeList plan = RecordReaderImpl.planReadPartialDataStreams(
                streamList, rowIndexes, selectedColumns, selectedGroups,
                false, columnEncodings, schemaTypes, 32768, false);
            Assert.Equal(
                diskRanges(
                    100, 1000,
                    400, 1000,
                    500, 1000,
                    11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
                    41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
                    51000, 95000,
                    95000, 97000,
                    97000, 100000),
                plan);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Adds to <paramref name="list"/> one disk range for each selected row group of
        /// a single stream.
        /// </summary>
        /// <param name="stream">Stream being planned; its kind selects the index position to use.</param>
        /// <param name="includedRowGroups">One flag per row group; groups whose flag is false are skipped.</param>
        /// <param name="isCompressed">Forwarded to <c>getIndexPosition</c> and <c>estimateRgEndOffset</c>.</param>
        /// <param name="index">Row index of the stream's column; supplies each group's start position.</param>
        /// <param name="encoding">Column encoding, forwarded to <c>getIndexPosition</c>.</param>
        /// <param name="type">Column type, forwarded to <c>getIndexPosition</c>.</param>
        /// <param name="compressionSize">Forwarded to <c>estimateRgEndOffset</c>.</param>
        /// <param name="hasNull">Forwarded to <c>getIndexPosition</c>.</param>
        /// <param name="offset">Base offset added to the index positions to form the range bounds.</param>
        /// <param name="length">Length of the stream; used as the next-group offset for the last row group.</param>
        /// <param name="list">Helper that receives the computed ranges through <c>addOrMerge</c>.</param>
        /// <param name="doMergeBuffers">Forwarded to <c>addOrMerge</c>.</param>
        public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
                                                       bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
                                                       OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, bool hasNull,
                                                       long offset, long length, DiskRangeList.CreateHelper list, bool doMergeBuffers)
        {
            for (int group = 0; group < includedRowGroups.Length; ++group)
            {
                if (!includedRowGroups[group])
                {
                    continue;
                }
                int posn = getIndexPosition(
                    encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
                long start = (long)index.EntryList[group].PositionsList[posn];
                bool isLast = group == (includedRowGroups.Length - 1);
                // Widen to long, matching `start`: an (int) cast here would truncate
                // (or overflow in a checked context) positions at or beyond 2 GiB.
                long nextGroupOffset = isLast
                    ? length
                    : (long)index.EntryList[group + 1].PositionsList[posn];

                start += offset;
                long end = offset + estimateRgEndOffset(
                    isCompressed, isLast, nextGroupOffset, length, compressionSize);
                list.addOrMerge(start, end, doMergeBuffers, true);
            }
        }