// Reads the row-index streams of one stripe and, for columns flagged in sargColumns,
// the bloom-filter stream that directly follows the row index.
//
// stripe             - stripe to read; stripe.getOffset() is the file offset of its first stream.
// footer             - stripe footer listing the streams; fetched with readStripeFooter when null.
// included           - per-column flags; null means every column is read.
// indexes            - destination array; allocated with typeCount slots when null. Slots that
//                      are already non-null are not re-read.
// sargColumns        - per-column flags selecting columns whose bloom filter is wanted; may be null.
// bloomFilterIndices - destination array for bloom filters; allocated with typeCount slots when null.
//
// Returns a RecordReaderImpl.Index wrapping the (possibly caller-supplied) arrays.
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
    OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
    bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }

    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        // Stays null when the current stream is the last one in the footer.
        OrcProto.Stream nextStream = null;
        if (i < streams.Count - 1)
        {
            nextStream = streams[i + 1];
        }
        int col = (int)stream.Column;
        int len = (int)stream.Length;

        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
        // filter and combine the io to read row index and bloom filters for that column together
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            // The null guard matters when a ROW_INDEX stream is the final stream of the
            // stripe: without it, dereferencing nextStream throws.
            if (sargColumns != null && sargColumns[col] &&
                nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
            {
                // Widen the read to cover the bloom filter and skip it in the outer loop.
                len += (int)nextStream.Length;
                i += 1;
                readBloomFilter = true;
            }

            if ((included == null || included[col]) && indexes[col] == null)
            {
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // The bloom filter bytes start right after the row index bytes.
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        // Every stream advances the offset, whether or not it was read.
        offset += len;
    }

    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}
// Exercises RecordReaderImpl.planReadPartialDataStreams on a struct{x: int, y: int}
// stripe for several column / row-group selections, each with the final boolean
// argument both false and true, and compares the planned ranges with fixed expectations.
public void testPartialPlan()
{
    var slop = RecordReaderUtils.WORST_UNCOMPRESSED_SLOP;

    // Stream layout: PRESENT + DATA for column 1, then PRESENT + DATA for column 2.
    List<OrcProto.Stream> streams = new List<OrcProto.Stream>
    {
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
            .SetColumn(1).SetLength(1000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.DATA)
            .SetColumn(1).SetLength(99000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
            .SetColumn(2).SetLength(2000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.DATA)
            .SetColumn(2).SetLength(98000).Build()
    };

    bool[] columns = { true, true, false };
    bool[] rowGroups = { true, true, false, false, true, false };

    // Row index for column 1: six entries whose first position steps by 100 and whose
    // fourth position steps by 10000; the two middle positions are UInt64.MaxValue.
    OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.Length];
    var indexBuilder = OrcProto.RowIndex.CreateBuilder();
    for (int group = 0; group < 6; group++)
    {
        indexBuilder = indexBuilder.AddEntry(OrcProto.RowIndexEntry.CreateBuilder()
            .AddPositions((ulong)(group * 100))
            .AddPositions(UInt64.MaxValue)
            .AddPositions(UInt64.MaxValue)
            .AddPositions((ulong)(group * 10000))
            .Build());
    }
    indexes[1] = indexBuilder.Build();

    // Every column uses DIRECT encoding.
    List<OrcProto.ColumnEncoding> encodings = new List<OrcProto.ColumnEncoding>();
    for (int c = 0; c < 3; c++)
    {
        encodings.Add(OrcProto.ColumnEncoding.CreateBuilder()
            .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build());
    }

    // Types: struct{x: int, y: int}
    List<OrcProto.Type> types = new List<OrcProto.Type>
    {
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT)
            .AddSubtypes(1).AddSubtypes(2)
            .AddFieldNames("x").AddFieldNames("y").Build(),
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build(),
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build()
    };

    // Filter by columns and row groups.
    DiskRangeList plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Equal(
        diskRanges(0, 1000, 100, 1000, 400, 1000,
            1000, 11000 + slop,
            11000, 21000 + slop,
            41000, 51000 + slop),
        plan);

    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, true);
    Assert.Equal(
        diskRanges(0, 21000 + slop,
            41000, 51000 + slop),
        plan);

    // Selecting no row groups must plan no reads at all.
    rowGroups = new bool[] { false, false, false, false, false, false };
    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Null(plan);

    // All rows, but only columns 0 and 2.
    rowGroups = null;
    columns = new bool[] { true, false, true };
    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Equal(diskRanges(100000, 102000, 102000, 200000), plan);

    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, true);
    Assert.Equal(diskRanges(100000, 200000), plan);

    // Only row group 1 of column 2; the index moves from slot 1 to slot 2.
    rowGroups = new bool[] { false, true, false, false, false, false };
    indexes[2] = indexes[1];
    indexes[1] = null;
    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Equal(
        diskRanges(100100, 102000,
            112000, 122000 + slop),
        plan);

    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, true);
    Assert.Equal(
        diskRanges(100100, 102000,
            112000, 122000 + slop),
        plan);

    // Only the last row group, for every column; both columns share the same index.
    rowGroups = new bool[] { false, false, false, false, false, true };
    indexes[1] = indexes[2];
    columns = new bool[] { true, true, true };
    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Equal(
        diskRanges(500, 1000, 51000, 100000, 100500, 102000, 152000, 200000),
        plan);

    plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, true);
    Assert.Equal(
        diskRanges(500, 1000, 51000, 100000, 100500, 102000, 152000, 200000),
        plan);
}
// Reads the row-index streams of one stripe and, for columns flagged in sargColumns,
// the bloom-filter stream that directly follows the row index.
//
// stripe             - stripe to read; stripe.getOffset() is the file offset of its first stream.
// footer             - stripe footer listing the streams; fetched with readStripeFooter when null.
// included           - per-column flags; null means every column is read.
// indexes            - destination array; allocated with typeCount slots when null. Slots that
//                      are already non-null are not re-read.
// sargColumns        - per-column flags selecting columns whose bloom filter is wanted; may be null.
// bloomFilterIndices - destination array for bloom filters; allocated with typeCount slots when null.
//
// Returns a RecordReaderImpl.Index wrapping the (possibly caller-supplied) arrays.
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
    OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
    bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }

    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        // Stays null when the current stream is the last one in the footer.
        OrcProto.Stream nextStream = null;
        if (i < streams.Count - 1)
        {
            nextStream = streams[i + 1];
        }
        int col = (int)stream.Column;
        int len = (int)stream.Length;

        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
        // filter and combine the io to read row index and bloom filters for that column together
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            // The null guard matters when a ROW_INDEX stream is the final stream of the
            // stripe: without it, dereferencing nextStream throws.
            if (sargColumns != null && sargColumns[col] &&
                nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
            {
                // Widen the read to cover the bloom filter and skip it in the outer loop.
                len += (int)nextStream.Length;
                i += 1;
                readBloomFilter = true;
            }

            if ((included == null || included[col]) && indexes[col] == null)
            {
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // The bloom filter bytes start right after the row index bytes.
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        // Every stream advances the offset, whether or not it was read.
        offset += len;
    }

    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}
// Exercises RecordReaderImpl.planReadPartialDataStreams on a struct{x: string, y: int}
// stripe whose string column is DICTIONARY encoded, and compares the planned ranges
// with a fixed expectation.
public void testPartialPlanString()
{
    var slop = RecordReaderUtils.WORST_UNCOMPRESSED_SLOP;

    // Stream layout: PRESENT, DATA, LENGTH and DICTIONARY_DATA for column 1,
    // then PRESENT and DATA for column 2.
    List<OrcProto.Stream> streams = new List<OrcProto.Stream>
    {
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
            .SetColumn(1).SetLength(1000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.DATA)
            .SetColumn(1).SetLength(94000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.LENGTH)
            .SetColumn(1).SetLength(2000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.DICTIONARY_DATA)
            .SetColumn(1).SetLength(3000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.PRESENT)
            .SetColumn(2).SetLength(2000).Build(),
        OrcProto.Stream.CreateBuilder()
            .SetKind(OrcProto.Stream.Types.Kind.DATA)
            .SetColumn(2).SetLength(98000).Build()
    };

    bool[] columns = { true, true, false };
    bool[] rowGroups = { false, true, false, false, true, true };

    // Row index for column 1: six entries whose first position steps by 100 and whose
    // fourth position steps by 10000; the two middle positions are UInt64.MaxValue.
    OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.Length];
    var indexBuilder = OrcProto.RowIndex.CreateBuilder();
    for (int group = 0; group < 6; group++)
    {
        indexBuilder = indexBuilder.AddEntry(OrcProto.RowIndexEntry.CreateBuilder()
            .AddPositions((ulong)(group * 100))
            .AddPositions(UInt64.MaxValue)
            .AddPositions(UInt64.MaxValue)
            .AddPositions((ulong)(group * 10000))
            .Build());
    }
    indexes[1] = indexBuilder.Build();

    // Encodings: DIRECT for the struct, DICTIONARY for the string, DIRECT for the int.
    List<OrcProto.ColumnEncoding> encodings = new List<OrcProto.ColumnEncoding>
    {
        OrcProto.ColumnEncoding.CreateBuilder()
            .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build(),
        OrcProto.ColumnEncoding.CreateBuilder()
            .SetKind(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY).Build(),
        OrcProto.ColumnEncoding.CreateBuilder()
            .SetKind(OrcProto.ColumnEncoding.Types.Kind.DIRECT).Build()
    };

    // Types: struct{x: string, y: int}
    List<OrcProto.Type> types = new List<OrcProto.Type>
    {
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT)
            .AddSubtypes(1).AddSubtypes(2)
            .AddFieldNames("x").AddFieldNames("y").Build(),
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRING).Build(),
        OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).Build()
    };

    // Filter by columns and row groups.
    DiskRangeList plan = RecordReaderImpl.planReadPartialDataStreams(
        streams, indexes, columns, rowGroups, false, encodings, types, 32768, false);
    Assert.Equal(
        diskRanges(100, 1000, 400, 1000, 500, 1000,
            11000, 21000 + slop,
            41000, 51000 + slop,
            51000, 95000, 95000, 97000, 97000, 100000),
        plan);
}
// Adds to `list` one disk range per selected row group of a single stream.
//
// stream            - stream being planned; its Kind selects the index position to use.
// includedRowGroups - per-row-group flags; groups flagged false are skipped.
// isCompressed      - passed to getIndexPosition and estimateRgEndOffset.
// index             - row index of the stream's column; entry [group] supplies the
//                     stream-relative start position of that group.
// encoding, type    - column encoding and type; their Kind values select the index position.
// compressionSize   - passed to estimateRgEndOffset.
// hasNull           - passed to getIndexPosition.
// offset            - base offset added to every stream-relative position.
// length            - stream length; used as the next-group offset for the last row group.
// list              - receiver of the ranges, via addOrMerge.
// doMergeBuffers    - forwarded unchanged to list.addOrMerge.
public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
    bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
    OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, bool hasNull,
    long offset, long length, DiskRangeList.CreateHelper list, bool doMergeBuffers)
{
    for (int group = 0; group < includedRowGroups.Length; ++group)
    {
        if (!includedRowGroups[group])
        {
            continue;
        }

        int posn = getIndexPosition(
            encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
        long start = (long)index.EntryList[group].PositionsList[posn];

        bool isLast = group == (includedRowGroups.Length - 1);
        // Cast to long, not int: positions are 64-bit, and an int cast would truncate
        // (or overflow in a checked context) for offsets above Int32.MaxValue.
        long nextGroupOffset = isLast
            ? length
            : (long)index.EntryList[group + 1].PositionsList[posn];

        start += offset;
        long end = offset + estimateRgEndOffset(
            isCompressed, isLast, nextGroupOffset, length, compressionSize);
        list.addOrMerge(start, end, doMergeBuffers, true);
    }
}