private TimestampStreamReader(int columnId, SettableUncompressedStream present,
    SettableUncompressedStream data, SettableUncompressedStream nanos,
    bool isFileCompressed, OrcProto.ColumnEncoding encoding, bool skipCorrupt)
    : base(columnId, present, data, nanos, encoding, skipCorrupt)
{
    this.isFileCompressed = isFileCompressed;
    this._presentStream = present;
    this._secondsStream = data;
    this._nanosStream = nanos;
}
protected ColumnStatisticsImpl(OrcProto.ColumnStatistics stats)
{
    if (stats.HasNumberOfValues)
    {
        count = stats.NumberOfValues;
    }
    if (stats.HasHasNull)
    {
        _hasNull = stats.HasNull;
    }
    else
    {
        _hasNull = true;
    }
}
public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
    bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
    OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize,
    bool hasNull, long offset, long length, DiskRangeList.CreateHelper list,
    bool doMergeBuffers)
{
    for (int group = 0; group < includedRowGroups.Length; ++group)
    {
        if (!includedRowGroups[group]) continue;
        int posn = getIndexPosition(
            encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
        long start = (long)index.EntryList[group].PositionsList[posn];
        long nextGroupOffset;
        bool isLast = group == (includedRowGroups.Length - 1);
        // keep the next group's offset as a long; truncating it to int could corrupt
        // the range for large stripes
        nextGroupOffset = isLast ? length : (long)index.EntryList[group + 1].PositionsList[posn];
        start += offset;
        long end = offset + estimateRgEndOffset(
            isCompressed, isLast, nextGroupOffset, length, compressionSize);
        list.addOrMerge(start, end, doMergeBuffers, true);
    }
}
public RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder)
{
    this.builder = builder;
}
public BooleanStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    trueCount = stats.BucketStatistics.CountList[0];
}
public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding)
{
    this.columnEncoding = encoding;
    return this;
}
private void writeFileStatistics(OrcProto.Footer.Builder builder, TreeWriter writer)
{
    builder.AddStatistics(writer.fileStatistics.serialize());
    foreach (TreeWriter child in writer.getChildrenWriters())
    {
        writeFileStatistics(builder, child);
    }
}
private static void writeTypes(OrcProto.Footer.Builder builder, TypeDescription schema)
{
    OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
    IList<TypeDescription> children = schema.getChildren();
    switch (schema.getCategory())
    {
        case Category.BOOLEAN:
            type.Kind = OrcProto.Type.Types.Kind.BOOLEAN;
            break;
        case Category.BYTE:
            type.Kind = OrcProto.Type.Types.Kind.BYTE;
            break;
        case Category.SHORT:
            type.Kind = OrcProto.Type.Types.Kind.SHORT;
            break;
        case Category.INT:
            type.Kind = OrcProto.Type.Types.Kind.INT;
            break;
        case Category.LONG:
            type.Kind = OrcProto.Type.Types.Kind.LONG;
            break;
        case Category.FLOAT:
            type.Kind = OrcProto.Type.Types.Kind.FLOAT;
            break;
        case Category.DOUBLE:
            type.Kind = OrcProto.Type.Types.Kind.DOUBLE;
            break;
        case Category.STRING:
            type.Kind = OrcProto.Type.Types.Kind.STRING;
            break;
        case Category.CHAR:
            type.Kind = OrcProto.Type.Types.Kind.CHAR;
            type.MaximumLength = (uint)schema.getMaxLength();
            break;
        case Category.VARCHAR:
            type.Kind = OrcProto.Type.Types.Kind.VARCHAR;
            type.MaximumLength = (uint)schema.getMaxLength();
            break;
        case Category.BINARY:
            type.Kind = OrcProto.Type.Types.Kind.BINARY;
            break;
        case Category.TIMESTAMP:
            type.Kind = OrcProto.Type.Types.Kind.TIMESTAMP;
            break;
        case Category.DATE:
            type.Kind = OrcProto.Type.Types.Kind.DATE;
            break;
        case Category.DECIMAL:
            type.Kind = OrcProto.Type.Types.Kind.DECIMAL;
            type.Precision = (uint)schema.getPrecision();
            type.Scale = (uint)schema.getScale();
            break;
        case Category.LIST:
            type.Kind = OrcProto.Type.Types.Kind.LIST;
            type.AddSubtypes((uint)children[0].getId());
            break;
        case Category.MAP:
            type.Kind = OrcProto.Type.Types.Kind.MAP;
            foreach (TypeDescription t in children)
            {
                type.AddSubtypes((uint)t.getId());
            }
            break;
        case Category.STRUCT:
            type.Kind = OrcProto.Type.Types.Kind.STRUCT;
            foreach (TypeDescription t in children)
            {
                type.AddSubtypes((uint)t.getId());
            }
            foreach (string field in schema.getFieldNames())
            {
                type.AddFieldNames(field);
            }
            break;
        case Category.UNION:
            type.Kind = OrcProto.Type.Types.Kind.UNION;
            foreach (TypeDescription t in children)
            {
                type.AddSubtypes((uint)t.getId());
            }
            break;
        default:
            throw new ArgumentException("Unknown category: " + schema.getCategory());
    }
    builder.AddTypes(type);
    if (children != null)
    {
        // recursively append the children's types as well
        foreach (TypeDescription child in children)
        {
            writeTypes(builder, child);
        }
    }
}
/**
 * Get the offset within the column's index positions at which the given
 * stream's positions start.
 * @param columnEncoding the encoding of the column
 * @param columnType the type of the column
 * @param streamType the kind of the stream
 * @param isCompressed is the file compressed
 * @param hasNulls does the column have a PRESENT stream?
 * @return the offset of the stream's first position within the row index entry
 */
public static int getIndexPosition(OrcProto.ColumnEncoding.Types.Kind columnEncoding,
    OrcProto.Type.Types.Kind columnType, OrcProto.Stream.Types.Kind streamType,
    bool isCompressed, bool hasNulls)
{
    if (streamType == OrcProto.Stream.Types.Kind.PRESENT)
    {
        return 0;
    }
    int compressionValue = isCompressed ? 1 : 0;
    int @base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
    switch (columnType)
    {
        case OrcProto.Type.Types.Kind.BOOLEAN:
        case OrcProto.Type.Types.Kind.BYTE:
        case OrcProto.Type.Types.Kind.SHORT:
        case OrcProto.Type.Types.Kind.INT:
        case OrcProto.Type.Types.Kind.LONG:
        case OrcProto.Type.Types.Kind.FLOAT:
        case OrcProto.Type.Types.Kind.DOUBLE:
        case OrcProto.Type.Types.Kind.DATE:
        case OrcProto.Type.Types.Kind.STRUCT:
        case OrcProto.Type.Types.Kind.MAP:
        case OrcProto.Type.Types.Kind.LIST:
        case OrcProto.Type.Types.Kind.UNION:
            return @base;
        case OrcProto.Type.Types.Kind.CHAR:
        case OrcProto.Type.Types.Kind.VARCHAR:
        case OrcProto.Type.Types.Kind.STRING:
            if (columnEncoding == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
                columnEncoding == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2)
            {
                return @base;
            }
            else
            {
                if (streamType == OrcProto.Stream.Types.Kind.DATA)
                {
                    return @base;
                }
                else
                {
                    return @base + BYTE_STREAM_POSITIONS + compressionValue;
                }
            }
        case OrcProto.Type.Types.Kind.BINARY:
            if (streamType == OrcProto.Stream.Types.Kind.DATA)
            {
                return @base;
            }
            return @base + BYTE_STREAM_POSITIONS + compressionValue;
        case OrcProto.Type.Types.Kind.DECIMAL:
            if (streamType == OrcProto.Stream.Types.Kind.DATA)
            {
                return @base;
            }
            return @base + BYTE_STREAM_POSITIONS + compressionValue;
        case OrcProto.Type.Types.Kind.TIMESTAMP:
            if (streamType == OrcProto.Stream.Types.Kind.DATA)
            {
                return @base;
            }
            return @base + RUN_LENGTH_INT_POSITIONS + compressionValue;
        default:
            throw new ArgumentException("Unknown type " + columnType);
    }
}
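// Hedged usage sketch (not part of the original reader): the offset returned by
// getIndexPosition is where the given stream's seek positions begin inside a row
// group's position list. The helper name and signature below are illustrative
// assumptions; EntryList and PositionsList follow the repeated-field accessors used
// in addRgFilteredStreamToRanges above.
private static long firstPositionForStream(OrcProto.RowIndex index, int rowGroup,
    OrcProto.ColumnEncoding encoding, OrcProto.Type type, OrcProto.Stream stream,
    bool isCompressed, bool hasNull)
{
    int posn = getIndexPosition(encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
    // the first recorded position is where the stream data for this row group starts
    return (long)index.EntryList[rowGroup].PositionsList[posn];
}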
public override void writeStripe(OrcProto.StripeFooter.Builder builder,
    int requiredIndexEntries)
{
    // if the number of rows in the stripe is less than dictionaryCheckAfterRows,
    // the dictionary check would not have happened yet, so do it again here.
    checkDictionaryEncoding();

    if (useDictionaryEncoding)
    {
        flushDictionary();
    }
    else
    {
        // flush out any left-over entries from the dictionary
        if (rows.size() > 0)
        {
            flushDictionary();
        }
        // suppress the stream for every stripe if the dictionary is disabled
        stringOutput.suppress();
    }

    // we need to build the row index before calling base, since base writes it out.
    base.writeStripe(builder, requiredIndexEntries);
    stringOutput.Flush();
    lengthOutput.flush();
    rowOutput.flush();
    directStreamOutput.Flush();
    directLengthOutput.flush();

    // reset all of the fields to be ready for the next stripe.
    dictionary.clear();
    savedRowIndex.Clear();
    rowIndexValueCount.Clear();
    recordPosition(rowIndexPosition);
    rowIndexValueCount.Add(0L);

    if (!useDictionaryEncoding)
    {
        // record the start positions of the first index stride of the next stripe,
        // i.e. the beginning of the direct streams when the dictionary is disabled
        recordDirectStreamPosition();
    }
}
public StringStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.StringStatistics str = stats.StringStatistics;
    if (str.HasMaximum)
    {
        maximum = str.Maximum;
    }
    if (str.HasMinimum)
    {
        minimum = str.Minimum;
    }
    if (str.HasSum)
    {
        sum = str.Sum;
    }
}
public BinaryStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.BinaryStatistics binStats = stats.BinaryStatistics;
    if (binStats.HasSum)
    {
        sum = binStats.Sum;
    }
}
public IntegerStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.IntegerStatistics intStat = stats.IntStatistics;
    if (intStat.HasMinimum)
    {
        hasMinimum = true;
        minimum = intStat.Minimum;
    }
    if (intStat.HasMaximum)
    {
        maximum = intStat.Maximum;
    }
    if (intStat.HasSum)
    {
        sum = intStat.Sum;
    }
    else
    {
        // a missing sum means the writer's running sum overflowed
        overflow = true;
    }
}
public DoubleStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.DoubleStatistics dbl = stats.DoubleStatistics;
    if (dbl.HasMinimum)
    {
        hasMinimum = true;
        minimum = dbl.Minimum;
    }
    if (dbl.HasMaximum)
    {
        maximum = dbl.Maximum;
    }
    if (dbl.HasSum)
    {
        sum = dbl.Sum;
    }
}
public RecordReaderImpl.Index readRowIndex(StripeInformation stripe,
    OrcProto.StripeFooter footer, bool[] included, OrcProto.RowIndex[] indexes,
    bool[] sargColumns, OrcProto.BloomFilterIndex[] bloomFilterIndices)
{
    if (footer == null)
    {
        footer = readStripeFooter(stripe);
    }
    if (indexes == null)
    {
        indexes = new OrcProto.RowIndex[typeCount];
    }
    if (bloomFilterIndices == null)
    {
        bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
    }
    long offset = stripe.getOffset();
    IList<OrcProto.Stream> streams = footer.StreamsList;
    for (int i = 0; i < streams.Count; i++)
    {
        OrcProto.Stream stream = streams[i];
        OrcProto.Stream nextStream = null;
        if (i < streams.Count - 1)
        {
            nextStream = streams[i + 1];
        }
        int col = (int)stream.Column;
        int len = (int)stream.Length;
        // The row index stream and bloom filter stream are interleaved. If the sarg
        // column has a bloom filter, combine the IO to read the row index and bloom
        // filter for that column together.
        if (stream.HasKind && (stream.Kind == OrcProto.Stream.Types.Kind.ROW_INDEX))
        {
            bool readBloomFilter = false;
            if (sargColumns != null && sargColumns[col] && nextStream != null &&
                nextStream.Kind == OrcProto.Stream.Types.Kind.BLOOM_FILTER)
            {
                len += (int)nextStream.Length;
                i += 1;
                readBloomFilter = true;
            }
            if ((included == null || included[col]) && indexes[col] == null)
            {
                byte[] buffer = new byte[len];
                file.readFully(offset, buffer, 0, buffer.Length);
                ByteBuffer bb = ByteBuffer.wrap(buffer);
                indexes[col] = OrcProto.RowIndex.ParseFrom(InStream.create(null, "index",
                    new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                    (long)stream.Length, codec, bufferSize));
                if (readBloomFilter)
                {
                    // the bloom filter bytes follow the row index bytes in the buffer
                    bb.position((int)stream.Length);
                    bloomFilterIndices[col] = OrcProto.BloomFilterIndex.ParseFrom(InStream.create(
                        null, "bloom_filter",
                        new List<DiskRange> { new RecordReaderImpl.BufferChunk(bb, 0) },
                        (long)nextStream.Length, codec, bufferSize));
                }
            }
        }
        offset += len;
    }
    RecordReaderImpl.Index index = new RecordReaderImpl.Index(indexes, bloomFilterIndices);
    return index;
}
/**
 * Is this stream part of a dictionary?
 * @return is this part of a dictionary?
 */
public static bool isDictionary(OrcProto.Stream.Types.Kind kind,
    OrcProto.ColumnEncoding encoding)
{
    Debug.Assert(kind != OrcProto.Stream.Types.Kind.DICTIONARY_COUNT);
    OrcProto.ColumnEncoding.Types.Kind encodingKind = encoding.Kind;
    return kind == OrcProto.Stream.Types.Kind.DICTIONARY_DATA ||
        (kind == OrcProto.Stream.Types.Kind.LENGTH &&
            (encodingKind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY ||
             encodingKind == OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2));
}
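// Hedged usage sketch (not in the original source): a caller deciding which streams
// hold row data versus dictionary data could filter a stripe footer's streams with
// isDictionary. The footer.ColumnsList encoding lookup mirrors the *List accessor
// convention used elsewhere in this file but is an assumption, as is the helper name.
private static int countDictionaryStreams(OrcProto.StripeFooter footer)
{
    int dictionaryStreams = 0;
    foreach (OrcProto.Stream stream in footer.StreamsList)
    {
        OrcProto.ColumnEncoding encoding = footer.ColumnsList[(int)stream.Column];
        if (isDictionary(stream.Kind, encoding))
        {
            dictionaryStreams++;
        }
    }
    return dictionaryStreams;
}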
public void appendStripe(byte[] stripe, int offset, int length,
    StripeInformation stripeInfo, OrcProto.StripeStatistics stripeStatistics)
{
    checkArgument(stripe != null, "Stripe must not be null");
    checkArgument(length <= stripe.Length,
        "Specified length must not be greater than the stripe array length");
    checkArgument(stripeInfo != null, "Stripe information must not be null");
    checkArgument(stripeStatistics != null, "Stripe statistics must not be null");

    getStream();
    long start = rawWriter.Position;
    long availBlockSpace = blockSize - (start % blockSize);

    // see if the stripe can fit in the current hdfs block; otherwise pad the
    // remaining space in the block
    if (length < blockSize && length > availBlockSpace && addBlockPadding)
    {
        byte[] pad = new byte[(int)Math.Min(HDFS_BUFFER_SIZE, availBlockSpace)];
        LOG.info(String.Format("Padding ORC by {0} bytes while merging..", availBlockSpace));
        start += availBlockSpace;
        while (availBlockSpace > 0)
        {
            int writeLen = (int)Math.Min(availBlockSpace, pad.Length);
            rawWriter.Write(pad, 0, writeLen);
            availBlockSpace -= writeLen;
        }
    }

    // write only the requested slice of the source buffer, not the whole array
    rawWriter.Write(stripe, offset, length);
    rowsInStripe = (long)stripeStatistics.ColStatsList[0].NumberOfValues;
    rowCount += rowsInStripe;

    // since we have already written the stripe, just update the stripe statistics
    treeWriter.stripeStatsBuilders.Add(stripeStatistics.ToBuilder());

    // update file level statistics
    updateFileStatistics(stripeStatistics);

    // update stripe information
    OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation.CreateBuilder();
    dirEntry.Offset = (ulong)start;
    dirEntry.NumberOfRows = (ulong)rowsInStripe;
    dirEntry.IndexLength = (ulong)stripeInfo.getIndexLength();
    dirEntry.DataLength = (ulong)stripeInfo.getDataLength();
    dirEntry.FooterLength = (ulong)stripeInfo.getFooterLength();
    stripes.Add(dirEntry.Build());

    // reset it after writing the stripe
    rowsInStripe = 0;
}
/**
 * Write the stripe out to the file.
 * @param builder the stripe footer that contains the information about the
 *                layout of the stripe. The TreeWriter is required to update
 *                the footer with its information.
 * @param requiredIndexEntries the number of index entries that are
 *                             required. This is used to check that the
 *                             row index is well formed.
 */
public virtual void writeStripe(OrcProto.StripeFooter.Builder builder,
    int requiredIndexEntries)
{
    if (isPresent != null)
    {
        isPresent.flush();

        // if no nulls are found in a stream, then suppress the stream
        if (!foundNulls)
        {
            isPresentOutStream.suppress();
            // since the isPresent bitstream is suppressed, update the index to
            // remove the positions of the isPresent stream
            if (rowIndexStream != null)
            {
                removeIsPresentPositions();
            }
        }
    }

    // merge stripe-level column statistics into the file statistics and write them
    // to the stripe statistics
    OrcProto.StripeStatistics.Builder stripeStatsBuilder = OrcProto.StripeStatistics.CreateBuilder();
    writeStripeStatistics(stripeStatsBuilder, this);
    stripeStatsBuilders.Add(stripeStatsBuilder);

    // reset the flag for the next stripe
    foundNulls = false;

    builder.AddColumns(getEncoding());
    builder.WriterTimezone = streamFactory.Timezone;
    if (rowIndexStream != null)
    {
        if (rowIndex.EntryCount != requiredIndexEntries)
        {
            throw new ArgumentException("Column has wrong number of " +
                "index entries found: " + rowIndex.EntryCount +
                " expected: " + requiredIndexEntries);
        }
        rowIndex.Build().WriteTo(rowIndexStream);
        rowIndexStream.Flush();
    }
    rowIndex.Clear();
    rowIndexEntry.Clear();

    // write the bloom filter to the out stream
    if (bloomFilterStream != null)
    {
        bloomFilterIndex.Build().WriteTo(bloomFilterStream);
        bloomFilterStream.Flush();
        bloomFilterIndex.Clear();
        bloomFilterEntry.Clear();
    }
}
/**
 * Create a stream to store part of a column.
 * @param column the column id for the stream
 * @param kind the kind of stream
 * @return the output stream that the section needs to be written to.
 */
public OutStream createStream(int column, OrcProto.Stream.Types.Kind kind)
{
    StreamName name = new StreamName(column, kind);
    CompressionModifier[] modifiers;

    switch (kind)
    {
        case OrcProto.Stream.Types.Kind.BLOOM_FILTER:
        case OrcProto.Stream.Types.Kind.DATA:
        case OrcProto.Stream.Types.Kind.DICTIONARY_DATA:
            if (getCompressionStrategy() == OrcFile.CompressionStrategy.SPEED)
            {
                modifiers = new[] { CompressionModifier.FAST, CompressionModifier.TEXT };
            }
            else
            {
                modifiers = new[] { CompressionModifier.DEFAULT, CompressionModifier.TEXT };
            }
            break;
        case OrcProto.Stream.Types.Kind.LENGTH:
        case OrcProto.Stream.Types.Kind.DICTIONARY_COUNT:
        case OrcProto.Stream.Types.Kind.PRESENT:
        case OrcProto.Stream.Types.Kind.ROW_INDEX:
        case OrcProto.Stream.Types.Kind.SECONDARY:
            // easily compressed using the fastest modes
            modifiers = new[] { CompressionModifier.FASTEST, CompressionModifier.BINARY };
            break;
        default:
            LOG.warn("Missing ORC compression modifiers for " + kind);
            modifiers = null;
            break;
    }

    BufferedStream result = writer.streams.get(name);
    if (result == null)
    {
        result = new BufferedStream(name.ToString(), writer.bufferSize,
            writer.codec == null ? writer.codec : writer.codec.modify(modifiers));
        writer.streams.Add(name, result);
    }
    return result.outStream;
}
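// Hedged usage sketch (not part of the original factory): a tree writer would
// typically ask for its per-column streams once, up front; for example a string
// writer needs DATA, LENGTH and DICTIONARY_DATA streams. The helper name and the
// returned array layout are illustrative assumptions.
private OutStream[] createStringStreams(int columnId)
{
    return new[]
    {
        createStream(columnId, OrcProto.Stream.Types.Kind.DATA),
        createStream(columnId, OrcProto.Stream.Types.Kind.LENGTH),
        createStream(columnId, OrcProto.Stream.Types.Kind.DICTIONARY_DATA),
    };
}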
public override void writeStripe(OrcProto.StripeFooter.Builder builder,
    int requiredIndexEntries)
{
    base.writeStripe(builder, requiredIndexEntries);
    tags.flush();
    foreach (TreeWriter child in childrenWriters)
    {
        child.writeStripe(builder, requiredIndexEntries);
    }
    recordPosition(rowIndexPosition);
}
public override void writeStripe(OrcProto.StripeFooter.Builder builder,
    int requiredIndexEntries)
{
    base.writeStripe(builder, requiredIndexEntries);
    seconds.flush();
    nanos.flush();
    recordPosition(rowIndexPosition);
}
private DecimalStreamReader(int columnId, int precision, int scale,
    SettableUncompressedStream presentStream, SettableUncompressedStream valueStream,
    SettableUncompressedStream scaleStream, bool isFileCompressed,
    OrcProto.ColumnEncoding encoding)
    : base(columnId, precision, scale, presentStream, valueStream, scaleStream, encoding)
{
    this._isFileCompressed = isFileCompressed;
    this._presentStream = presentStream;
    this._valueStream = valueStream;
    this._scaleStream = scaleStream;
}
private void writeStripeStatistics(OrcProto.StripeStatistics.Builder builder,
    TreeWriter treeWriter)
{
    treeWriter.fileStatistics.merge(treeWriter.stripeColStatistics);
    builder.AddColStats(treeWriter.stripeColStatistics.serialize().Build());
    treeWriter.stripeColStatistics.reset();
    foreach (TreeWriter child in treeWriter.getChildrenWriters())
    {
        writeStripeStatistics(builder, child);
    }
}
public DateStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.DateStatistics dateStats = stats.DateStatistics;
    // min/max values are serialized/deserialized as int (days since epoch)
    if (dateStats.HasMaximum)
    {
        maximum = new Date(dateStats.Maximum);
    }
    if (dateStats.HasMinimum)
    {
        minimum = new Date(dateStats.Minimum);
    }
}
static OrcProto.ColumnEncoding MakeEncoding(OrcProto.ColumnEncoding.Types.Kind kind,
    int? dictionarySize = null)
{
    OrcProto.ColumnEncoding.Builder builder = OrcProto.ColumnEncoding.CreateBuilder();
    builder.Kind = kind;
    if (dictionarySize != null)
    {
        builder.DictionarySize = (uint)dictionarySize.Value;
    }
    return builder.Build();
}
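// Hedged usage sketch (illustrative only, not in the original source): the helper
// name and the dictionary size of 1000 are arbitrary examples.
static OrcProto.ColumnEncoding[] MakeEncodingExamples()
{
    return new[]
    {
        // a DIRECT encoding needs no dictionary size
        MakeEncoding(OrcProto.ColumnEncoding.Types.Kind.DIRECT),
        // a dictionary encoding records how many distinct entries the dictionary holds
        MakeEncoding(OrcProto.ColumnEncoding.Types.Kind.DICTIONARY_V2, 1000),
    };
}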
public DecimalStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.DecimalStatistics dec = stats.DecimalStatistics;
    if (dec.HasMaximum)
    {
        maximum = HiveDecimal.Parse(dec.Maximum);
    }
    if (dec.HasMinimum)
    {
        minimum = HiveDecimal.Parse(dec.Minimum);
    }
    if (dec.HasSum)
    {
        sum = HiveDecimal.Parse(dec.Sum);
    }
    else
    {
        sum = null;
    }
}
private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics)
{
    IList<OrcProto.ColumnStatistics> cs = stripeStatistics.ColStatsList;
    List<TreeWriter> allWriters = getAllColumnTreeWriters(treeWriter);
    for (int i = 0; i < allWriters.Count; i++)
    {
        allWriters[i].fileStatistics.merge(ColumnStatisticsImpl.deserialize(cs[i]));
    }
}
public TimestampStatisticsImpl(OrcProto.ColumnStatistics stats)
    : base(stats)
{
    OrcProto.TimestampStatistics timestampStats = stats.TimestampStatistics;
    // min/max values are serialized/deserialized as long (milliseconds since epoch)
    if (timestampStats.HasMaximum)
    {
        maximum = new Timestamp(timestampStats.Maximum);
    }
    if (timestampStats.HasMinimum)
    {
        minimum = new Timestamp(timestampStats.Minimum);
    }
}
/**
 * Initializes the BloomFilter from the given ORC BloomFilter message.
 */
public static BloomFilter Create(OrcProto.BloomFilter bloomFilter)
{
    return new BloomFilter(bloomFilter.BitsetList, (int)bloomFilter.NumHashFunctions);
}
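// Hedged usage sketch (not part of the original API): a BloomFilterIndex read by
// readRowIndex holds one serialized bloom filter per row group, so per-row-group
// filters could be materialized like this. The BloomFilterList accessor follows the
// *List naming used for repeated fields elsewhere in this file but is an assumption,
// as is the helper name.
public static BloomFilter CreateForRowGroup(OrcProto.BloomFilterIndex index, int rowGroup)
{
    return Create(index.BloomFilterList[rowGroup]);
}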
public static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats)
{
    if (stats.HasBucketStatistics)
    {
        return new BooleanStatisticsImpl(stats);
    }
    else if (stats.HasIntStatistics)
    {
        return new IntegerStatisticsImpl(stats);
    }
    else if (stats.HasDoubleStatistics)
    {
        return new DoubleStatisticsImpl(stats);
    }
    else if (stats.HasStringStatistics)
    {
        return new StringStatisticsImpl(stats);
    }
    else if (stats.HasDecimalStatistics)
    {
        return new DecimalStatisticsImpl(stats);
    }
    else if (stats.HasDateStatistics)
    {
        return new DateStatisticsImpl(stats);
    }
    else if (stats.HasTimestampStatistics)
    {
        return new TimestampStatisticsImpl(stats);
    }
    else if (stats.HasBinaryStatistics)
    {
        return new BinaryStatisticsImpl(stats);
    }
    else
    {
        return new ColumnStatisticsImpl(stats);
    }
}
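// Hedged usage sketch (not in the original source): stripe-level statistics arrive as
// one OrcProto.ColumnStatistics per column (see updateFileStatistics above), and each
// can be turned into its typed in-memory form with deserialize. The helper name is
// illustrative only.
public static ColumnStatisticsImpl[] deserializeAll(OrcProto.StripeStatistics stripeStats)
{
    IList<OrcProto.ColumnStatistics> cols = stripeStats.ColStatsList;
    ColumnStatisticsImpl[] result = new ColumnStatisticsImpl[cols.Count];
    for (int i = 0; i < cols.Count; i++)
    {
        // the concrete subclass depends on which *Statistics field is present
        result[i] = deserialize(cols[i]);
    }
    return result;
}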