/**
 * Create a reader that merge sorts the ACID events together.
 *
 * The constructor builds one reader pair for the base file (when a base
 * reader is supplied) and one per usable delta bucket file, registers every
 * pair that has at least one record in {@code readers}, and then takes the
 * first entry as {@code primary}.
 *
 * @param conf the configuration
 * @param collapseEvents should the events on the same row be collapsed
 * @param reader the reader for the base file; may be null, in which case no
 *               base reader is set up and only the deltas are read
 * @param isOriginal is the base file a pre-acid file
 * @param bucket the bucket we are reading
 * @param validTxnList the transaction list; it is only stored on the merger
 *                     here and not otherwise used by this constructor
 * @param options the options to read with
 * @param deltaDirectory the list of delta directories to include; may be null
 * @throws IOException if no desired row schema can be obtained from conf
 */
OrcRawRecordMerger(Configuration conf, bool collapseEvents, Reader reader, bool isOriginal, int bucket, ValidTxnList validTxnList, Reader.Options options, Path[] deltaDirectory) {
  this.conf = conf;
  this.collapse = collapseEvents;
  this.offset = options.getOffset();
  this.length = options.getLength();
  this.validTxnList = validTxnList;

  // a row schema is mandatory: without it the event schema cannot be built
  TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);
  if (typeDescr == null) {
    throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
  }
  objectInspector = OrcRecordUpdater.createEventSchema (OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

  // modify the options to reflect the event instead of the base row
  Reader.Options eventOptions = createEventOptions(options);
  if (reader == null) {
    baseReader = null;
  } else {
    // find the min/max based on the offset and length; both helpers
    // assign the minKey/maxKey fields that are read just below
    if (isOriginal) {
      discoverOriginalKeyBounds(reader, bucket, options);
    } else {
      discoverKeyBounds(reader, options);
    }
    LOG.info("min key = " + minKey + ", max key = " + maxKey);
    // use the min/max instead of the byte range
    ReaderPair pair;
    ReaderKey key = new ReaderKey();
    if (isOriginal) {
      // pre-acid file: read it with the row options (not the event
      // options); the range is widened on a clone, not on the object
      // that was passed in
      options = options.clone();
      options.range(options.getOffset(), Long.MAX_VALUE);
      pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
    } else {
      pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions, 0);
    }
    // if there is at least one record, put it in the map
    if (pair.nextRecord != null) {
      readers.put(key, pair);
    }
    // the record reader is remembered even when the pair has no records
    baseReader = pair.recordReader;
  }

  // we always want to read all of the deltas
  // (this mutates eventOptions, so it must happen after the base pair
  // above has been created with the narrower range)
  eventOptions.range(0, Long.MAX_VALUE);
  if (deltaDirectory != null) {
    foreach (Path delta in deltaDirectory) {
      ReaderKey key = new ReaderKey();
      Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
      AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
      FileSystem fs = deltaFile.getFileSystem(conf);
      long length = getLastFlushLength(fs, deltaFile);
      // skip this delta when getLastFlushLength reports -1 or the bucket
      // file does not exist
      if (length != -1 && fs.exists(deltaFile)) {
        // the delta reader is limited to the reported flush length
        Reader deltaReader = OrcFile.createReader(deltaFile, OrcFile.readerOptions(conf).maxLength(length));
        Reader.Options deltaEventOptions = null;
        if (eventOptions.getSearchArgument() != null) {
          // Turn off the sarg before pushing it to delta.  We never want to push a sarg to a delta as
          // it can produce wrong results (if the latest valid version of the record is filtered out by
          // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
          // unless the delta only has insert events
          OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
          if (acidStats.deletes > 0 || acidStats.updates > 0) {
            deltaEventOptions = eventOptions.clone().searchArgument(null, null);
          }
        }
        ReaderPair deltaPair;
        // use the sarg-free copy when one was made, otherwise the shared
        // event options
        deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey, deltaEventOptions != null ? deltaEventOptions : eventOptions, deltaDir.getStatementId());
        if (deltaPair.nextRecord != null) {
          readers.put(key, deltaPair);
        }
      }
    }
  }

  // get the first record
  // (pollFirstEntry removes the entry from readers, so whatever is left
  // afterwards are the secondary readers)
  Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
  if (entry == null) {
    // nothing to read at all
    columns = 0;
    primary = null;
  } else {
    primary = entry.getValue();
    if (readers.isEmpty()) {
      secondaryKey = null;
    } else {
      secondaryKey = readers.firstKey();
    }
    // get the number of columns in the user's rows
    columns = primary.getColumns();
  }
}
/**
 * Determine the key range covered by the requested byte range of an ACID
 * bucket file and store it in the {@code minKey} / {@code maxKey} fields.
 *
 * Stripes are classified by their start offset: those starting before the
 * range's offset are skipped, those starting before the range's max offset
 * are covered, and the first stripe at or past the max offset ends the scan.
 * {@code minKey} is only assigned when at least one stripe was skipped, and
 * {@code maxKey} only when a stripe past the range was seen; otherwise the
 * corresponding field is left untouched.
 *
 * @param reader the reader whose key index and stripes are inspected
 * @param options the options for reading with (supplies offset and max offset)
 */
private void discoverKeyBounds(Reader reader, Reader.Options options) {
  RecordIdentifier[] stripeKeys = OrcRecordUpdater.parseKeyIndex(reader);
  long rangeStart = options.getOffset();
  long rangeEnd = options.getMaxOffset();

  int skippedStripes = 0;
  int coveredStripes = 0;
  bool sawStripePastRange = false;

  List<StripeInformation> allStripes = reader.getStripes();
  foreach (StripeInformation info in allStripes) {
    long stripeStart = info.getOffset();
    if (rangeStart > stripeStart) {
      // stripe begins before the requested range
      skippedStripes++;
      continue;
    }
    if (rangeEnd > stripeStart) {
      // stripe begins inside the requested range
      coveredStripes++;
      continue;
    }
    // first stripe at or beyond the end of the range: stop scanning
    sawStripePastRange = true;
    break;
  }

  // the lower bound is the key index entry of the last skipped stripe
  if (skippedStripes > 0) {
    minKey = stripeKeys[skippedStripes - 1];
  }

  // the upper bound is the key index entry of the last covered stripe
  if (sawStripePastRange) {
    int lastCovered = skippedStripes + coveredStripes - 1;
    maxKey = stripeKeys[lastCovered];
  }
}
/**
 * Determine the key range covered by the requested byte range of an
 * original bucket file and store it in the {@code minKey} / {@code maxKey}
 * fields.
 *
 * The bounds are built from row counts: rows of stripes that start before
 * the range's offset are counted as preceding rows, rows of stripes that
 * start before the range's max offset are counted as rows in range, and the
 * first stripe at or past the max offset ends the scan. {@code minKey} is
 * only assigned when there are preceding rows, and {@code maxKey} only when
 * a stripe past the range was seen.
 *
 * @param reader the reader whose stripes are inspected
 * @param bucket the bucket number we are reading
 * @param options the options for reading with (supplies offset and max offset)
 */
private void discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options ) {
  long rowsInRange = 0;
  long rowsBefore = 0;
  long rangeStart = options.getOffset();
  long rangeEnd = options.getMaxOffset();
  bool sawStripePastRange = false;

  foreach (StripeInformation info in reader.getStripes()) {
    long stripeStart = info.getOffset();
    if (rangeStart > stripeStart) {
      // stripe begins before the requested range
      rowsBefore += info.getNumberOfRows();
      continue;
    }
    if (rangeEnd > stripeStart) {
      // stripe begins inside the requested range
      rowsInRange += info.getNumberOfRows();
      continue;
    }
    // first stripe at or beyond the end of the range: stop scanning
    sawStripePastRange = true;
    break;
  }

  // lower bound: the last row that precedes the range
  if (rowsBefore > 0) {
    minKey = new RecordIdentifier(0, bucket, rowsBefore - 1);
  }

  // upper bound: the last row that falls inside the range
  if (sawStripePastRange) {
    long lastRow = rowsBefore + rowsInRange - 1;
    maxKey = new RecordIdentifier(0, bucket, lastRow);
  }
}
/**
 * Build the reader options for the underlying event file from the options
 * that describe the user's rows.
 *
 * The result is a clone of {@code options} whose range runs from the
 * original offset to the maximum long value. When an include array is
 * present it is shifted by {@code OrcRecordUpdater.FIELDS} positions, with
 * every leading position switched on. When column names are present they
 * are shifted by the same amount (the leading positions stay unset) and
 * installed together with the original search argument.
 *
 * Side effect: element 0 of the array returned by
 * {@code options.getInclude()} is set to true.
 * NOTE(review): if that getter hands out the options' own array, the
 * caller's options are modified too - confirm before relying on either
 * behavior.
 *
 * @param options options for the row reader
 * @return a cloned options object that is modified for the event reader
 */
static Reader.Options createEventOptions(Reader.Options options) {
  Reader.Options eventOptions = options.clone();
  eventOptions.range(options.getOffset(), Int64.MaxValue);

  // shift the include flags past the event-level fields
  bool[] rowInclude = options.getInclude();
  if (rowInclude != null) {
    // we always need the base row
    rowInclude[0] = true;
    bool[] eventInclude = new bool[rowInclude.Length + OrcRecordUpdater.FIELDS];
    Arrays.fill(eventInclude, 0, OrcRecordUpdater.FIELDS, true);
    for (int dst = OrcRecordUpdater.FIELDS; dst < eventInclude.Length; ++dst) {
      eventInclude[dst] = rowInclude[dst - OrcRecordUpdater.FIELDS];
    }
    eventOptions.include(eventInclude);
  }

  // shift the column names the same way so the sarg lines up with them
  string[] rowColumns = options.getColumnNames();
  if (rowColumns != null) {
    string[] eventColumns = new string[rowColumns.Length + OrcRecordUpdater.FIELDS];
    for (int dst = OrcRecordUpdater.FIELDS; dst < eventColumns.Length; ++dst) {
      eventColumns[dst] = rowColumns[dst - OrcRecordUpdater.FIELDS];
    }
    eventOptions.searchArgument(options.getSearchArgument(), eventColumns);
  }

  return eventOptions;
}