public void testStruct()
{
    OrcStruct st1 = new OrcStruct(4);
    OrcStruct st2 = new OrcStruct(4);
    OrcStruct st3 = new OrcStruct(3);
    st1.setFieldValue(0, "hop");
    st1.setFieldValue(1, "on");
    st1.setFieldValue(2, "pop");
    st1.setFieldValue(3, 42);
    Assert.Equal(false, st1.Equals(null));
    st2.setFieldValue(0, "hop");
    st2.setFieldValue(1, "on");
    st2.setFieldValue(2, "pop");
    st2.setFieldValue(3, 42);
    Assert.Equal(st1, st2);
    st3.setFieldValue(0, "hop");
    st3.setFieldValue(1, "on");
    st3.setFieldValue(2, "pop");
    Assert.Equal(false, st1.Equals(st3));
#if PREDICTABLE_STRING_HASH
    Assert.Equal(11241, st1.GetHashCode());
#endif
    Assert.Equal(st1.GetHashCode(), st2.GetHashCode());
#if PREDICTABLE_STRING_HASH
    Assert.Equal(11204, st3.GetHashCode());
#endif
    Assert.Equal("{hop, on, pop, 42}", st1.ToString());
    st1.setFieldValue(3, null);
    Assert.Equal(false, st1.Equals(st2));
    Assert.Equal(false, st2.Equals(st1));
    st2.setFieldValue(3, null);
    Assert.Equal(st1, st2);
}
public override bool Equals(object other)
{
    OrcStruct oth = other as OrcStruct;
    // Check the cast result rather than `other`: the `as` cast yields null both
    // for null and for non-OrcStruct arguments, which would otherwise throw a
    // NullReferenceException when oth.fields is dereferenced below.
    if (oth == null)
    {
        return false;
    }
    if (fields.Length != oth.fields.Length)
    {
        return false;
    }
    for (int i = 0; i < fields.Length; ++i)
    {
        if (fields[i] == null)
        {
            if (oth.fields[i] != null)
            {
                return false;
            }
        }
        else if (!fields[i].Equals(oth.fields[i]))
        {
            return false;
        }
    }
    return true;
}
void next(OrcStruct next)
{
    if (recordReader.hasNext())
    {
        nextRecord = (OrcStruct)recordReader.next(next);
        // set the key
        key.setValues(OrcRecordUpdater.getOriginalTransaction(nextRecord),
            OrcRecordUpdater.getBucket(nextRecord),
            OrcRecordUpdater.getRowId(nextRecord),
            OrcRecordUpdater.getCurrentTransaction(nextRecord),
            statementId);

        // if this record is larger than maxKey, we need to stop
        if (maxKey != null && key.compareRow(maxKey) > 0)
        {
            LOG.debug("key " + key + " > maxkey " + maxKey);
            nextRecord = null;
            recordReader.Dispose();
        }
    }
    else
    {
        nextRecord = null;
        recordReader.Dispose();
    }
}
void next(OrcStruct next)
{
    if (recordReader.hasNext())
    {
        long nextRowId = recordReader.getRowNumber();
        // have to do initialization here, because the super's constructor
        // calls next and thus we need to initialize before our constructor
        // runs
        if (next == null)
        {
            nextRecord = new OrcStruct(OrcRecordUpdater.FIELDS);
            IntWritable operation = new IntWritable(OrcRecordUpdater.INSERT_OPERATION);
            nextRecord.setFieldValue(OrcRecordUpdater.OPERATION, operation);
            nextRecord.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION, new LongWritable(0));
            nextRecord.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION, new LongWritable(0));
            nextRecord.setFieldValue(OrcRecordUpdater.BUCKET, new IntWritable(bucket));
            nextRecord.setFieldValue(OrcRecordUpdater.ROW_ID, new LongWritable(nextRowId));
            nextRecord.setFieldValue(OrcRecordUpdater.ROW, recordReader.next(null));
        }
        else
        {
            nextRecord = next;
            ((IntWritable)next.getFieldValue(OrcRecordUpdater.OPERATION))
                .set(OrcRecordUpdater.INSERT_OPERATION);
            ((LongWritable)next.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION))
                .set(0);
            ((IntWritable)next.getFieldValue(OrcRecordUpdater.BUCKET))
                .set(bucket);
            ((LongWritable)next.getFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION))
                .set(0);
            ((LongWritable)next.getFieldValue(OrcRecordUpdater.ROW_ID))
                .set(0);
            nextRecord.setFieldValue(OrcRecordUpdater.ROW,
                recordReader.next(OrcRecordUpdater.getRow(next)));
        }
        key.setValues(0L, bucket, nextRowId, 0L, 0);
        if (maxKey != null && key.compareRow(maxKey) > 0)
        {
            if (LOG.isDebugEnabled())
            {
                LOG.debug("key " + key + " > maxkey " + maxKey);
            }
            nextRecord = null;
            recordReader.close();
        }
    }
    else
    {
        nextRecord = null;
        recordReader.close();
    }
}
public OrcUnionObjectInspector(UnionTypeInfo info)
{
    List<TypeInfo> unionChildren = info.getAllUnionObjectTypeInfos();
    this.children = new List<ObjectInspector>(unionChildren.Count);
    foreach (TypeInfo child in unionChildren)
    {
        this.children.Add(OrcStruct.createObjectInspector(child));
    }
}
public OrcUnionObjectInspector(int columnId, IList<OrcProto.Type> types)
{
    OrcProto.Type type = types[columnId];
    children = new List<ObjectInspector>(type.SubtypesCount);
    for (int i = 0; i < type.SubtypesCount; ++i)
    {
        children.Add(OrcStruct.createObjectInspector((int)type.SubtypesList[i], types));
    }
}
static OrcStruct getRow(OrcStruct @struct) { if (@struct == null) { return null; } else { return (OrcStruct)@struct.getFieldValue(ROW); } }
public object setStructFieldData(object @struct, StructField field, object fieldValue)
{
    OrcStruct orcStruct = (OrcStruct)@struct;
    int offset = ((Field)field).offset;
    // if the offset is bigger than our current number of fields, grow it
    if (orcStruct.getNumFields() <= offset)
    {
        orcStruct.setNumFields(offset + 1);
    }
    orcStruct.setFieldValue(offset, fieldValue);
    return @struct;
}
public override List<object> getStructFieldsDataAsList(object @object)
{
    if (@object == null)
    {
        return null;
    }
    OrcStruct @struct = (OrcStruct)@object;
    List<object> result = new List<object>(@struct.fields.Length);
    foreach (object child in @struct.fields)
    {
        result.Add(child);
    }
    return result;
}
public override object getStructFieldData(object @object, StructField field)
{
    if (@object == null)
    {
        return null;
    }
    int offset = ((Field)field).offset;
    OrcStruct @struct = (OrcStruct)@object;
    if (offset >= @struct.fields.Length)
    {
        return null;
    }
    return @struct.fields[offset];
}
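// A minimal usage sketch, not from the original sources, showing how the two
// inspector methods above round-trip a value. The SettableStructObjectInspector
// parameter and the getAllStructFieldRefs() indexing are assumed from the
// surrounding test code; treat this as an illustration, not the project's API.
public void sketchStructFieldRoundTrip(SettableStructObjectInspector inspector)
{
    // hypothetical one-column struct; the field handle comes from the inspector
    OrcStruct row = new OrcStruct(1);
    StructField c1 = inspector.getAllStructFieldRefs()[0];
    inspector.setStructFieldData(row, c1, 42);   // grows the field array if needed
    Assert.Equal(42, inspector.getStructFieldData(row, c1));
}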
public MetaInfoObjExtractor(CompressionKind compressionKind, int bufferSize, int metadataSize,
    ByteBuffer footerBuffer)
{
    this.compressionKind = compressionKind;
    this.bufferSize = bufferSize;
    this.codec = WriterImpl.createCodec(compressionKind);
    this.metadataSize = metadataSize;

    int position = footerBuffer.position();
    int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;

    this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
    this.footer = extractFooter(
        footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);

    footerBuffer.position(position);
    this.inspector = OrcStruct.createObjectInspector(0, footer.TypesList);
}
public void testInspectorFromTypeInfo()
{
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(
        "struct<c1:boolean,c2:tinyint" +
        ",c3:smallint,c4:int,c5:bigint,c6:float,c7:double,c8:binary," +
        "c9:string,c10:struct<c1:int>,c11:map<int,int>,c12:uniontype<int>" +
        ",c13:array<timestamp>>");
    StructObjectInspector inspector = (StructObjectInspector)
        OrcStruct.createObjectInspector(typeInfo);
    Assert.Equal("struct<c1:boolean,c2:tinyint,c3:smallint,c4:int,c5:" +
        "bigint,c6:float,c7:double,c8:binary,c9:string,c10:struct<" +
        "c1:int>,c11:map<int,int>,c12:uniontype<int>,c13:array<timestamp>>",
        inspector.getTypeName());
    Assert.Equal(null, inspector.getAllStructFieldRefs()[0].getFieldComment());
    Assert.Equal(null, inspector.getStructFieldRef("UNKNOWN"));
    OrcStruct s1 = new OrcStruct(13);
    for (int i = 0; i < 13; ++i)
    {
        s1.setFieldValue(i, i);
    }
    List<object> list = new List<object> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    Assert.Equal(list, inspector.getStructFieldsDataAsList(s1));
    ListObjectInspector listOI = (ListObjectInspector)
        inspector.getAllStructFieldRefs()[12].getFieldObjectInspector();
    Assert.Equal(ObjectInspectorCategory.LIST, listOI.getCategory());
    Assert.Equal(10, listOI.getListElement(list, 10));
    Assert.Equal(null, listOI.getListElement(list, -1));
    Assert.Equal(null, listOI.getListElement(list, 13));
    Assert.Equal(13, listOI.getListLength(list));

    Dictionary<object, object> map = new Dictionary<object, object>()
    {
        {1, 2},
        {2, 4},
        {3, 6},
    };
    MapObjectInspector mapOI = (MapObjectInspector)
        inspector.getAllStructFieldRefs()[10].getFieldObjectInspector();
    Assert.Equal(3, mapOI.getMapSize(map));
    Assert.Equal(4, mapOI.getMapValueElement(map, 2));
}
public bool isDelete(OrcStruct value) { return OrcRecordUpdater.getOperation(value) == OrcRecordUpdater.DELETE_OPERATION; }
private void compareInner(InnerStruct expect, OrcStruct actual)
{
    if (expect == null || actual == null)
    {
        Assert.Equal(null, expect);
        Assert.Equal(null, actual);
    }
    else
    {
        Assert.Equal(expect.int1, actual.getFieldValue(0));
        Assert.Equal(expect.string1, actual.getFieldValue(1));
    }
}
internal static long getCurrentTransaction(OrcStruct @struct) { return ((LongWritable)@struct.getFieldValue(CURRENT_TRANSACTION)).get(); }
internal static int getBucket(OrcStruct @struct) { return ((IntWritable)@struct.getFieldValue(BUCKET)).get(); }
internal static long getRowId(OrcStruct @struct) { return ((LongWritable)@struct.getFieldValue(ROW_ID)).get(); }
OrcRecordUpdater(Path path, AcidOutputFormat.Options options)
{
    this.options = options;
    this.bucket.set(options.getBucket());
    this.path = AcidUtils.createFilename(path, options);
    FileSystem fs = options.getFilesystem();
    if (fs == null)
    {
        fs = path.getFileSystem(options.getConfiguration());
    }
    this.fs = fs;
    try
    {
        FSDataOutputStream strm = fs.create(new Path(path, ACID_FORMAT), false);
        strm.writeInt(ORC_ACID_VERSION);
        strm.close();
    }
    catch (IOException ioe)
    {
        if (LOG.isDebugEnabled())
        {
            LOG.debug("Failed to create " + path + "/" + ACID_FORMAT + " with " + ioe);
        }
    }
    if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
        && !options.isWritingBase())
    {
        flushLengths = fs.create(getSideFile(this.path), true, 8, options.getReporter());
    }
    else
    {
        flushLengths = null;
    }
    OrcFile.WriterOptions writerOptions = null;
    if (options is OrcOptions)
    {
        writerOptions = ((OrcOptions)options).getOrcOptions();
    }
    if (writerOptions == null)
    {
        writerOptions = OrcFile.writerOptions(
            /* options.getTableProperties(), */
            options.getConfiguration());
    }
    writerOptions.fileSystem(fs).callback(indexBuilder);
    if (!options.isWritingBase())
    {
        writerOptions.blockPadding(false);
        writerOptions.bufferSize(DELTA_BUFFER_SIZE);
        writerOptions.stripeSize(DELTA_STRIPE_SIZE);
    }
    rowInspector = (StructObjectInspector)options.getInspector();
    writerOptions.inspector(createEventSchema(findRecId(options.getInspector(),
        options.getRecordIdColumn())));
    this.writer = OrcFile.createWriter(this.path, writerOptions);
    item = new OrcStruct(FIELDS);
    item.setFieldValue(OPERATION, operation);
    item.setFieldValue(CURRENT_TRANSACTION, currentTransaction);
    item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction);
    item.setFieldValue(BUCKET, bucket);
    item.setFieldValue(ROW_ID, rowId);
}
/**
 * Destructively make this object link to other's values.
 * @param other the value to point to
 */
void linkFields(OrcStruct other)
{
    fields = other.fields;
}
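// A hypothetical illustration, not part of the original sources, of what
// "destructively" means above: after linkFields both structs alias one fields
// array, so a write through either is visible through the other. This is how
// the merger's next() below reuses records without copying. The sketch assumes
// it runs with access to the internal linkFields method.
void sketchLinkFieldsAliasing()
{
    OrcStruct a = new OrcStruct(2);
    OrcStruct b = new OrcStruct(2);
    b.linkFields(a);                 // b now points at a's fields array
    a.setFieldValue(0, "shared");    // observable through b as well
    Assert.Equal("shared", b.getFieldValue(0));
}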
/**
 * Constructor that lets the user specify additional options.
 * @param path pathname for file
 * @param options options for reading
 */
public ReaderImpl(Func<Stream> streamCreator, string path, OrcFile.ReaderOptions options)
{
    this.streamCreator = streamCreator;
    this.path = path;
    this.conf = options.getConfiguration();

    FileMetadata fileMetadata = options.getFileMetadata();
    if (fileMetadata != null)
    {
        this.compressionKind = fileMetadata.getCompressionKind();
        this.bufferSize = fileMetadata.getCompressionBufferSize();
        this.codec = WriterImpl.createCodec(compressionKind);
        this.metadataSize = fileMetadata.getMetadataSize();
        this.stripeStats = fileMetadata.getStripeStats();
        this.versionList = fileMetadata.getVersionList();
        this.writerVersion = OrcFile.WriterVersionHelpers.from(fileMetadata.getWriterVersionNum());
        this.types = fileMetadata.getTypes();
        this.rowIndexStride = fileMetadata.getRowIndexStride();
        this.contentLength = fileMetadata.getContentLength();
        this.numberOfRows = fileMetadata.getNumberOfRows();
        this.fileStats = fileMetadata.getFileStats();
        this.stripes = fileMetadata.getStripes();
        this.inspector = OrcStruct.createObjectInspector(0, fileMetadata.getTypes());
        this.footerByteBuffer = null; // not cached and not needed here
        this.userMetadata = null; // not cached and not needed here
        this.footerMetaAndPsBuffer = null;
    }
    else
    {
        FileMetaInfo footerMetaData;
        if (options.getFileMetaInfo() != null)
        {
            footerMetaData = options.getFileMetaInfo();
            this.footerMetaAndPsBuffer = null;
        }
        else
        {
            using (Stream file = streamCreator())
            {
                footerMetaData = extractMetaInfoFromFooter(file, path, options.getMaxLength());
                this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
            }
        }
        MetaInfoObjExtractor rInfo = new MetaInfoObjExtractor(footerMetaData.compressionKind,
            footerMetaData.bufferSize,
            footerMetaData.metadataSize,
            footerMetaData.footerBuffer);
        this.footerByteBuffer = footerMetaData.footerBuffer;
        this.compressionKind = rInfo.compressionKind;
        this.codec = rInfo.codec;
        this.bufferSize = rInfo.bufferSize;
        this.metadataSize = rInfo.metadataSize;
        this.stripeStats = rInfo.metadata.StripeStatsList;
        this.types = rInfo.footer.TypesList;
        this.rowIndexStride = (int)rInfo.footer.RowIndexStride;
        this.contentLength = (int)rInfo.footer.ContentLength;
        this.numberOfRows = (int)rInfo.footer.NumberOfRows;
        this.userMetadata = rInfo.footer.MetadataList;
        this.fileStats = rInfo.footer.StatisticsList;
        this.inspector = rInfo.inspector;
        this.versionList = footerMetaData.versionList.Select(v => (int)v).ToList();
        this.writerVersion = footerMetaData.writerVersion;
        this.stripes = convertProtoStripesToStripes(rInfo.footer.StripesList);
    }
}
public void testUnionAndTimestamp()
{
    List<OrcProto.Type> types = new List<OrcProto.Type>();
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT).
        AddFieldNames("time").AddFieldNames("union").AddFieldNames("decimal").
        AddSubtypes(1).AddSubtypes(2).AddSubtypes(5).Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.TIMESTAMP).
        Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.UNION).
        AddSubtypes(3).AddSubtypes(4).Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.INT).
        Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRING).
        Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.DECIMAL).
        Build());

    ObjectInspector inspector = OrcStruct.createObjectInspector(0, types);

    HiveDecimal maxValue = HiveDecimal.Parse("10000000000000000000");
    OrcStruct row = new OrcStruct(3);
    OrcUnion union = new OrcUnion();
    Random rand;

    using (Stream file = FileOpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(1000)
        .compress(CompressionKind.NONE)
        .bufferSize(100)
        .blockPadding(false)))
    {
        row.setFieldValue(1, union);
        row.setFieldValue(0, Timestamp.Parse("2000-03-12 15:00:00"));
        HiveDecimal value = HiveDecimal.Parse("12345678.6547456");
        row.setFieldValue(2, value);
        union.set((byte)0, 42);
        writer.addRow(row);
        row.setFieldValue(0, Timestamp.Parse("2000-03-20 12:00:00.123456789"));
        union.set((byte)1, "hello");
        value = HiveDecimal.Parse("-5643.234");
        row.setFieldValue(2, value);
        writer.addRow(row);
        row.setFieldValue(0, null);
        row.setFieldValue(1, null);
        row.setFieldValue(2, null);
        writer.addRow(row);
        row.setFieldValue(1, union);
        union.set((byte)0, null);
        writer.addRow(row);
        union.set((byte)1, null);
        writer.addRow(row);
        union.set((byte)0, 200000);
        row.setFieldValue(0, Timestamp.Parse("1970-01-01 00:00:00"));
        value = HiveDecimal.Parse("10000000000000000000");
        row.setFieldValue(2, value);
        writer.addRow(row);
        rand = new Random(42);
        for (int i = 1970; i < 2038; ++i)
        {
            row.setFieldValue(0, Timestamp.Parse(i + "-05-05 12:34:56." + i));
            if ((i & 1) == 0)
            {
                union.set((byte)0, (i * i));
            }
            else
            {
                union.set((byte)1, (i * i).ToString());
            }
            value = HiveDecimal.create(rand.NextBigInteger(64), rand.Next(18));
            row.setFieldValue(2, value);
            if (maxValue.CompareTo(value) < 0)
            {
                maxValue = value;
            }
            writer.addRow(row);
        }
        // let's add a lot of constant rows to test the rle
        row.setFieldValue(0, null);
        union.set((byte)0, 1732050807);
        row.setFieldValue(2, null);
        for (int i = 0; i < 5000; ++i)
        {
            writer.addRow(row);
        }
        union.set((byte)0, 0);
        writer.addRow(row);
        union.set((byte)0, 10);
        writer.addRow(row);
        union.set((byte)0, 138);
        writer.addRow(row);
        writer.close();

        TypeDescription schema = writer.getSchema();
        Assert.Equal(5, schema.getMaximumId());
        bool[] expected = new bool[] { false, false, false, false, false, false };
        bool[] included = OrcUtils.includeColumns("", schema);
        Assert.Equal(expected, included);
        expected = new bool[] { false, true, false, false, false, true };
        included = OrcUtils.includeColumns("time,decimal", schema);
        Assert.Equal(expected, included);
        expected = new bool[] { false, false, true, true, true, false };
        included = OrcUtils.includeColumns("union", schema);
        Assert.Equal(expected, included);
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));
    Assert.Equal(0, reader.getMetadataKeys().Count);
    Assert.Equal(5077, reader.getNumberOfRows());
    DecimalColumnStatistics stats = (DecimalColumnStatistics)reader.getStatistics()[5];
    Assert.Equal(71, stats.getNumberOfValues());
    Assert.Equal(HiveDecimal.Parse("-5643.234"), stats.getMinimum());
    Assert.Equal(maxValue, stats.getMaximum());
    // TODO: fix this
    // Assert.Equal(null,stats.getSum());
    int stripeCount = 0;
    int rowCount = 0;
    long currentOffset = -1;
    foreach (StripeInformation stripe in reader.getStripes())
    {
        stripeCount += 1;
        rowCount += (int)stripe.getNumberOfRows();
        if (currentOffset < 0)
        {
            currentOffset = stripe.getOffset() + stripe.getLength();
        }
        else
        {
            Assert.Equal(currentOffset, stripe.getOffset());
            currentOffset += stripe.getLength();
        }
    }
    Assert.Equal(reader.getNumberOfRows(), rowCount);
    Assert.Equal(2, stripeCount);
    Assert.Equal(reader.getContentLength(), currentOffset);

    using (RecordReader rows = reader.rows())
    {
        Assert.Equal(0, rows.getRowNumber());
        Assert.Equal(0.0, rows.getProgress(), 6);
        Assert.Equal(true, rows.hasNext());
        row = (OrcStruct)rows.next();
        Assert.Equal(1, rows.getRowNumber());
        inspector = reader.getObjectInspector();
        Assert.Equal("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>",
            inspector.getTypeName());
        Assert.Equal(Timestamp.Parse("2000-03-12 15:00:00"), row.getFieldValue(0));
        union = (OrcUnion)row.getFieldValue(1);
        Assert.Equal(0, union.getTag());
        Assert.Equal(42, union.getObject());
        Assert.Equal(HiveDecimal.Parse("12345678.6547456"), row.getFieldValue(2));
        row = (OrcStruct)rows.next();
        Assert.Equal(2, rows.getRowNumber());
        Assert.Equal(Timestamp.Parse("2000-03-20 12:00:00.123456789"), row.getFieldValue(0));
        Assert.Equal(1, union.getTag());
        Assert.Equal("hello", union.getObject());
        Assert.Equal(HiveDecimal.Parse("-5643.234"), row.getFieldValue(2));
        row = (OrcStruct)rows.next();
        Assert.Equal(null, row.getFieldValue(0));
        Assert.Equal(null, row.getFieldValue(1));
        Assert.Equal(null, row.getFieldValue(2));
        row = (OrcStruct)rows.next();
        Assert.Equal(null, row.getFieldValue(0));
        union = (OrcUnion)row.getFieldValue(1);
        Assert.Equal(0, union.getTag());
        Assert.Equal(null, union.getObject());
        Assert.Equal(null, row.getFieldValue(2));
        row = (OrcStruct)rows.next();
        Assert.Equal(null, row.getFieldValue(0));
        Assert.Equal(1, union.getTag());
        Assert.Equal(null, union.getObject());
        Assert.Equal(null, row.getFieldValue(2));
        row = (OrcStruct)rows.next();
        Assert.Equal(Timestamp.Parse("1970-01-01 00:00:00"), row.getFieldValue(0));
        Assert.Equal(200000, union.getObject());
        Assert.Equal(HiveDecimal.Parse("10000000000000000000"), row.getFieldValue(2));
        rand = new Random(42);
        for (int i = 1970; i < 2038; ++i)
        {
            row = (OrcStruct)rows.next();
            Assert.Equal(Timestamp.Parse(i + "-05-05 12:34:56." + i), row.getFieldValue(0));
            if ((i & 1) == 0)
            {
                Assert.Equal(0, union.getTag());
                Assert.Equal(i * i, union.getObject());
            }
            else
            {
                Assert.Equal(1, union.getTag());
                Assert.Equal((i * i).ToString(), union.getObject());
            }
            Assert.Equal(HiveDecimal.create(rand.NextBigInteger(64), rand.Next(18)),
                row.getFieldValue(2));
        }
        for (int i = 0; i < 5000; ++i)
        {
            row = (OrcStruct)rows.next();
            Assert.Equal(1732050807, union.getObject());
        }
        row = (OrcStruct)rows.next();
        Assert.Equal(0, union.getObject());
        row = (OrcStruct)rows.next();
        Assert.Equal(10, union.getObject());
        row = (OrcStruct)rows.next();
        Assert.Equal(138, union.getObject());
        Assert.Equal(false, rows.hasNext());
        Assert.Equal(1.0, rows.getProgress(), 5);
        Assert.Equal(reader.getNumberOfRows(), rows.getRowNumber());
        rows.seekToRow(1);
        row = (OrcStruct)rows.next();
        Assert.Equal(Timestamp.Parse("2000-03-20 12:00:00.123456789"), row.getFieldValue(0));
        Assert.Equal(1, union.getTag());
        Assert.Equal("hello", union.getObject());
        Assert.Equal(HiveDecimal.Parse("-5643.234"), row.getFieldValue(2));
    }
}
public bool next(RecordIdentifier recordIdentifier, OrcStruct prev)
{
    bool keysSame = true;
    while (keysSame && primary != null)
    {
        // The primary's nextRecord is the next value to return
        OrcStruct current = primary.nextRecord;
        recordIdentifier.set(primary.key);

        // Advance the primary reader to the next record
        primary.next(extraValue);

        // Save the current record as the new extraValue for next time so that
        // we minimize allocations
        extraValue = current;

        // now that the primary reader has advanced, we need to see if we
        // continue to read it or move to the secondary.
        if (primary.nextRecord == null || primary.key.compareTo(secondaryKey) > 0)
        {
            // if the primary isn't done, push it back into the readers
            if (primary.nextRecord != null)
            {
                readers.put(primary.key, primary);
            }

            // update primary and secondaryKey
            Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
            if (entry != null)
            {
                primary = entry.getValue();
                if (readers.isEmpty())
                {
                    secondaryKey = null;
                }
                else
                {
                    secondaryKey = readers.firstKey();
                }
            }
            else
            {
                primary = null;
            }
        }

        // if this transaction isn't ok, skip over it
        if (!validTxnList.isTxnValid(
            ((ReaderKey)recordIdentifier).getCurrentTransactionId()))
        {
            continue;
        }

        /* for multi-statement txns, you may have multiple events for the same
         * row in the same (current) transaction. We want to collapse these to just the last one
         * regardless of whether we are minor compacting. Consider INSERT/UPDATE/UPDATE of the
         * same row in the same txn. There is no benefit to passing along anything except the last
         * event. If we did want to pass it along, we'd have to include statementId in the row
         * returned so that compaction could write it out, or make minor compaction understand
         * how to write out delta files in delta_xxx_yyy_stid format. There doesn't seem to be any
         * value in this. */
        bool isSameRow = prevKey.isSameRow((ReaderKey)recordIdentifier);
        // if we are collapsing, figure out if this is a new row
        if (collapse || isSameRow)
        {
            keysSame = (collapse && prevKey.compareRow(recordIdentifier) == 0) || isSameRow;
            if (!keysSame)
            {
                prevKey.set(recordIdentifier);
            }
        }
        else
        {
            keysSame = false;
        }

        // set the output record by fiddling with the pointers so that we can
        // avoid a copy.
        prev.linkFields(current);
    }
    return !keysSame;
}
internal static long getOriginalTransaction(OrcStruct @struct) { return ((LongWritable)@struct.getFieldValue(ORIGINAL_TRANSACTION)).get(); }
/**
 * Create a reader that merge sorts the ACID events together.
 * @param conf the configuration
 * @param collapseEvents should the events on the same row be collapsed
 * @param isOriginal is the base file a pre-acid file
 * @param bucket the bucket we are reading
 * @param options the options to read with
 * @param deltaDirectory the list of delta directories to include
 */
OrcRawRecordMerger(Configuration conf,
    bool collapseEvents,
    Reader reader,
    bool isOriginal,
    int bucket,
    ValidTxnList validTxnList,
    Reader.Options options,
    Path[] deltaDirectory)
{
    this.conf = conf;
    this.collapse = collapseEvents;
    this.offset = options.getOffset();
    this.length = options.getLength();
    this.validTxnList = validTxnList;

    TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);
    if (typeDescr == null)
    {
        throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
    }

    objectInspector = OrcRecordUpdater.createEventSchema(
        OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

    // modify the options to reflect the event instead of the base row
    Reader.Options eventOptions = createEventOptions(options);
    if (reader == null)
    {
        baseReader = null;
    }
    else
    {
        // find the min/max based on the offset and length
        if (isOriginal)
        {
            discoverOriginalKeyBounds(reader, bucket, options);
        }
        else
        {
            discoverKeyBounds(reader, options);
        }
        LOG.info("min key = " + minKey + ", max key = " + maxKey);
        // use the min/max instead of the byte range
        ReaderPair pair;
        ReaderKey key = new ReaderKey();
        if (isOriginal)
        {
            options = options.clone();
            options.range(options.getOffset(), Long.MAX_VALUE);
            pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
        }
        else
        {
            pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions, 0);
        }

        // if there is at least one record, put it in the map
        if (pair.nextRecord != null)
        {
            readers.put(key, pair);
        }
        baseReader = pair.recordReader;
    }

    // we always want to read all of the deltas
    eventOptions.range(0, Long.MAX_VALUE);
    if (deltaDirectory != null)
    {
        foreach (Path delta in deltaDirectory)
        {
            ReaderKey key = new ReaderKey();
            Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
            AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
            FileSystem fs = deltaFile.getFileSystem(conf);
            long length = getLastFlushLength(fs, deltaFile);
            if (length != -1 && fs.exists(deltaFile))
            {
                Reader deltaReader = OrcFile.createReader(deltaFile,
                    OrcFile.readerOptions(conf).maxLength(length));
                Reader.Options deltaEventOptions = null;
                if (eventOptions.getSearchArgument() != null)
                {
                    // Turn off the sarg before pushing it to the delta. We never want to push a
                    // sarg to a delta, as it can produce wrong results (if the latest valid
                    // version of the record is filtered out by the sarg) or ArrayOutOfBounds
                    // errors (when the sarg is applied to a delete record), unless the delta
                    // only has insert events.
                    OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
                    if (acidStats.deletes > 0 || acidStats.updates > 0)
                    {
                        deltaEventOptions = eventOptions.clone().searchArgument(null, null);
                    }
                }
                ReaderPair deltaPair;
                deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey,
                    deltaEventOptions != null ? deltaEventOptions : eventOptions,
                    deltaDir.getStatementId());
                if (deltaPair.nextRecord != null)
                {
                    readers.put(key, deltaPair);
                }
            }
        }
    }

    // get the first record
    Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
    if (entry == null)
    {
        columns = 0;
        primary = null;
    }
    else
    {
        primary = entry.getValue();
        if (readers.isEmpty())
        {
            secondaryKey = null;
        }
        else
        {
            secondaryKey = readers.firstKey();
        }
        // get the number of columns in the user's rows
        columns = primary.getColumns();
    }
}
internal static int getOperation(OrcStruct @struct) { return ((IntWritable)@struct.getFieldValue(OPERATION)).get(); }
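// A short hypothetical helper, not from the original sources, tying the
// accessors above together. It assumes `@event` is an ACID event struct with
// the OPERATION, ORIGINAL_TRANSACTION, BUCKET, ROW_ID, CURRENT_TRANSACTION,
// and ROW fields that OrcRecordUpdater writes (see the constructor above).
internal static string describeEvent(OrcStruct @event)
{
    int operation = getOperation(@event);              // e.g. INSERT_OPERATION or DELETE_OPERATION
    long originalTxn = getOriginalTransaction(@event);
    int bucket = getBucket(@event);
    long rowId = getRowId(@event);
    long currentTxn = getCurrentTransaction(@event);
    OrcStruct row = getRow(@event);                    // the user's row payload
    return "op=" + operation + " otxn=" + originalTxn + " bucket=" + bucket +
        " rowid=" + rowId + " ctxn=" + currentTxn + " row=" + row;
}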
/**
 * Generate an ORC file with a range of dates and times.
 */
public void createOrcDateFile(string path, int minYear, int maxYear)
{
    List<OrcProto.Type> types = new List<OrcProto.Type>();
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.STRUCT).
        AddFieldNames("time").AddFieldNames("date").
        AddSubtypes(1).AddSubtypes(2).Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.TIMESTAMP).
        Build());
    types.Add(OrcProto.Type.CreateBuilder().SetKind(OrcProto.Type.Types.Kind.DATE).
        Build());

    ObjectInspector inspector = OrcStruct.createObjectInspector(0, types);

    using (Stream file = FileOpenWrite(path))
    using (Writer writer = OrcFile.createWriter(path, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(100000)
        .bufferSize(10000)
        .blockPadding(false)))
    {
        OrcStruct row = new OrcStruct(2);
        for (int year = minYear; year < maxYear; ++year)
        {
            for (int ms = 1000; ms < 2000; ++ms)
            {
                row.setFieldValue(0, Timestamp.Parse(year + "-05-05 12:34:56." + ms));
                row.setFieldValue(1, new Date(year - 1900, 11, 25));
                writer.addRow(row);
            }
        }
    }

    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    using (RecordReader rows = reader.rows())
    {
        for (int year = minYear; year < maxYear; ++year)
        {
            for (int ms = 1000; ms < 2000; ++ms)
            {
                OrcStruct row = (OrcStruct)rows.next();
                Assert.Equal(Timestamp.Parse(year + "-05-05 12:34:56." + ms),
                    row.getFieldValue(0));
                Assert.Equal(new Date(year - 1900, 11, 25), row.getFieldValue(1));
            }
        }
    }
}