/**
 * A required option that sets the object inspector for the rows. If
 * setSchema is not called, it also defines the schema.
 */
public WriterOptions inspector(ObjectInspector value) {
  _inspector = value;
  if (!explicitSchema) {
    schema = OrcUtils.convertTypeInfo(
        TypeInfoUtils.getTypeInfoFromObjectInspector(value));
  }
  return this;
}
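// Illustrative usage sketch (not part of the original source): it assumes the usual
// OrcFile.writerOptions(conf) factory and the setSchema(...) option named in the doc
// comment above. Because the conversion is guarded by explicitSchema, an inspector
// supplied after an explicit schema only records the inspector; the schema set
// earlier wins.
static WriterOptions exampleInspectorUsage(Configuration conf, ObjectInspector rowInspector) {
  // With an explicit schema, inspector() does not overwrite it.
  WriterOptions withExplicit = OrcFile.writerOptions(conf)
      .setSchema(TypeDescription.createStruct()
          .addField("x", TypeDescription.createInt()))
      .inspector(rowInspector);   // schema stays struct<x:int>

  // Without setSchema, the schema is derived from the inspector.
  return OrcFile.writerOptions(conf).inspector(rowInspector);
}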
/**
 * @param typeDescr the row's type description
 * @return ORC types for the ACID event based on the row's type description
 */
public static List<OrcProto.Type> createEventSchema(TypeDescription typeDescr) {
  List<OrcProto.Type> result = new List<OrcProto.Type>();

  // The outer STRUCT wraps the ACID event metadata fields plus the row itself.
  OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
  type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
  type.AddRangeFieldNames(acidEventFieldNames);
  for (int i = 0; i < acidEventFieldNames.Length; i++) {
    type.AddSubtypes((uint)i + 1);
  }
  result.Add(type.Build());

  // Automatically add all fields except the last (ROW).
  for (int i = 0; i < acidEventOrcTypeKinds.Length - 1; i++) {
    type.Clear();
    type.SetKind(acidEventOrcTypeKinds[i]);
    result.Add(type.Build());
  }

  OrcUtils.appendOrcTypesRebuildSubtypes(result, typeDescr);
  return result;
}
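// Worked example (sketch, not in the original source): for a row type of struct<x:int>,
// and assuming acidEventFieldNames/acidEventOrcTypeKinds mirror the Java
// OrcRecordUpdater ("operation", "originalTransaction", "bucket", "rowId",
// "currentTransaction", "row" with kinds INT, LONG, INT, LONG, LONG, STRUCT),
// the flat type list produced above would be:
//
//   0: STRUCT  fieldNames=[operation, originalTransaction, bucket, rowId,
//                          currentTransaction, row]  subtypes=[1..6]
//   1: INT     (operation)
//   2: LONG    (originalTransaction)
//   3: INT     (bucket)
//   4: LONG    (rowId)
//   5: LONG    (currentTransaction)
//   6: STRUCT  (row) subtypes=[7]   -- appended by appendOrcTypesRebuildSubtypes
//   7: INT     (x)
static List<OrcProto.Type> exampleEventSchema() {
  return createEventSchema(TypeDescription.createStruct()
      .addField("x", TypeDescription.createInt()));
}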
/**
 * Create a reader that merge sorts the ACID events together.
 * @param conf the configuration
 * @param collapseEvents should the events on the same row be collapsed
 * @param reader the reader for the base file, or null if there is none
 * @param isOriginal is the base file a pre-acid file
 * @param bucket the bucket we are reading
 * @param validTxnList the list of valid transactions
 * @param options the options to read with
 * @param deltaDirectory the list of delta directories to include
 * @throws IOException
 */
OrcRawRecordMerger(Configuration conf, bool collapseEvents, Reader reader,
                   bool isOriginal, int bucket, ValidTxnList validTxnList,
                   Reader.Options options, Path[] deltaDirectory) {
  this.conf = conf;
  this.collapse = collapseEvents;
  this.offset = options.getOffset();
  this.length = options.getLength();
  this.validTxnList = validTxnList;

  TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);
  if (typeDescr == null) {
    throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
  }

  objectInspector = OrcRecordUpdater.createEventSchema(
      OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

  // modify the options to reflect the event instead of the base row
  Reader.Options eventOptions = createEventOptions(options);
  if (reader == null) {
    baseReader = null;
  } else {
    // find the min/max based on the offset and length
    if (isOriginal) {
      discoverOriginalKeyBounds(reader, bucket, options);
    } else {
      discoverKeyBounds(reader, options);
    }
    LOG.info("min key = " + minKey + ", max key = " + maxKey);

    // use the min/max instead of the byte range
    ReaderPair pair;
    ReaderKey key = new ReaderKey();
    if (isOriginal) {
      options = options.clone();
      options.range(options.getOffset(), Long.MAX_VALUE);
      pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
    } else {
      pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions, 0);
    }

    // if there is at least one record, put it in the map
    if (pair.nextRecord != null) {
      readers.put(key, pair);
    }
    baseReader = pair.recordReader;
  }

  // we always want to read all of the deltas
  eventOptions.range(0, Long.MAX_VALUE);
  if (deltaDirectory != null) {
    foreach (Path delta in deltaDirectory) {
      ReaderKey key = new ReaderKey();
      Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
      AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
      FileSystem fs = deltaFile.getFileSystem(conf);
      long length = getLastFlushLength(fs, deltaFile);
      if (length != -1 && fs.exists(deltaFile)) {
        Reader deltaReader = OrcFile.createReader(deltaFile,
            OrcFile.readerOptions(conf).maxLength(length));
        Reader.Options deltaEventOptions = null;
        if (eventOptions.getSearchArgument() != null) {
          // Turn off the sarg before pushing it to the delta. We never want to push
          // a sarg to a delta, as it can produce wrong results (if the latest valid
          // version of the record is filtered out by the sarg) or ArrayOutOfBounds
          // errors (when the sarg is applied to a delete record), unless the delta
          // only has insert events.
          OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
          if (acidStats.deletes > 0 || acidStats.updates > 0) {
            deltaEventOptions = eventOptions.clone().searchArgument(null, null);
          }
        }
        ReaderPair deltaPair;
        deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey,
            deltaEventOptions != null ? deltaEventOptions : eventOptions,
            deltaDir.getStatementId());
        if (deltaPair.nextRecord != null) {
          readers.put(key, deltaPair);
        }
      }
    }
  }

  // get the first record
  Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
  if (entry == null) {
    columns = 0;
    primary = null;
  } else {
    primary = entry.getValue();
    if (readers.isEmpty()) {
      secondaryKey = null;
    } else {
      secondaryKey = readers.firstKey();
    }
    // get the number of columns in the user's rows
    columns = primary.getColumns();
  }
}
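// Sketch (an assumption, mirroring the Java OrcRawRecordMerger): the getLastFlushLength
// helper used by the constructor reads the delta's side file of flushed lengths and
// returns the last committed length. A missing side file means the whole delta is
// readable (Long.MAX_VALUE); an empty side file means nothing has been committed yet,
// signalled by the -1 the constructor checks for.
static long getLastFlushLength(FileSystem fs, Path deltaFile) {
  Path lengths = OrcRecordUpdater.getSideFile(deltaFile);
  long result = Long.MAX_VALUE;
  try {
    FSDataInputStream stream = fs.open(lengths);
    result = -1;
    // keep the last length written to the side file
    while (stream.available() > 0) {
      result = stream.readLong();
    }
    stream.close();
    return result;
  } catch (IOException) {
    // no side file: read the whole delta (or -1 if the open succeeded but read failed)
    return result;
  }
}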
public static TreeReaderFactory.TreeReaderSchema validateAndCreate(
    IList<OrcProto.Type> fileTypes, IList<OrcProto.Type> schemaTypes) {

  // For ACID, the row is the ROW field in the outer STRUCT.
  bool isAcid = checkAcidSchema(fileTypes);
  IList<OrcProto.Type> rowSchema;
  int rowSubtype;
  if (isAcid) {
    rowSubtype = OrcRecordUpdater.ROW + 1;
    rowSchema = fileTypes.subList(rowSubtype, fileTypes.Count);
  } else {
    rowSubtype = 0;
    rowSchema = fileTypes;
  }

  // Do checking on the overlap. Additional columns will be defaulted to NULL.
  int numFileColumns = rowSchema[0].SubtypesCount;
  int numDesiredColumns = schemaTypes[0].SubtypesCount;
  int numReadColumns = Math.Min(numFileColumns, numDesiredColumns);

  /**
   * Check type promotion.
   *
   * Currently, we only support integer type promotions that can be done "implicitly".
   * That is, we know that using a bigger integer tree reader on the original smaller
   * integer column will "just work".
   *
   * In the future, other type promotions might require type conversion.
   */
  // short -> int -> bigint, since the same integer readers are used for these types.
  for (int i = 0; i < numReadColumns; i++) {
    OrcProto.Type fColType = fileTypes[rowSubtype + i];
    OrcProto.Type rColType = schemaTypes[i];
    if (fColType.Kind != rColType.Kind) {
      bool ok = false;
      if (fColType.Kind == OrcProto.Type.Types.Kind.SHORT) {
        if (rColType.Kind == OrcProto.Type.Types.Kind.INT ||
            rColType.Kind == OrcProto.Type.Types.Kind.LONG) {
          // type promotion possible, converting SHORT to INT/LONG requested type
          ok = true;
        }
      } else if (fColType.Kind == OrcProto.Type.Types.Kind.INT) {
        if (rColType.Kind == OrcProto.Type.Types.Kind.LONG) {
          // type promotion possible, converting INT to LONG requested type
          ok = true;
        }
      }
      if (!ok) {
        throw new IOException("ORC does not support type conversion from " +
            fColType.Kind.ToString() + " to " + rColType.Kind.ToString());
      }
    }
  }

  IList<OrcProto.Type> fullSchemaTypes;
  if (isAcid) {
    fullSchemaTypes = new List<OrcProto.Type>();

    // This copies the ACID struct type which is subtype = 0.
    // It has field names "operation" through "row".
    // And we copy the types for all fields EXCEPT ROW (which must be last!).
    for (int i = 0; i < rowSubtype; i++) {
      fullSchemaTypes.Add(fileTypes[i].ToBuilder().Build());
    }

    // Add the row struct type.
    OrcUtils.appendOrcTypesRebuildSubtypes(fullSchemaTypes, schemaTypes, 0);
  } else {
    fullSchemaTypes = schemaTypes;
  }

  int innerStructSubtype = rowSubtype;

  // LOG.info("Schema evolution: (fileTypes) " + fileTypes.toString() +
  //     " (schemaEvolutionTypes) " + schemaEvolutionTypes.toString());

  return new TreeReaderFactory.TreeReaderSchema()
      .fileTypes(fileTypes)
      .schemaTypes(fullSchemaTypes)
      .innerStructSubtype(innerStructSubtype);
}
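// Compact restatement (sketch, not in the original source) of the promotion rules the
// loop above enforces: only widening integer conversions are accepted; everything else
// throws. Useful as a reference when reading that loop.
static bool isImplicitPromotion(OrcProto.Type.Types.Kind from, OrcProto.Type.Types.Kind to) {
  if (from == to) {
    return true;
  }
  // short -> int, short -> bigint
  if (from == OrcProto.Type.Types.Kind.SHORT) {
    return to == OrcProto.Type.Types.Kind.INT || to == OrcProto.Type.Types.Kind.LONG;
  }
  // int -> bigint
  if (from == OrcProto.Type.Types.Kind.INT) {
    return to == OrcProto.Type.Types.Kind.LONG;
  }
  return false;
}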
public static TypeDescription getDesiredRowTypeDescr(Configuration conf) {
  string columnNameProperty = null;
  string columnTypeProperty = null;
  IList<string> schemaEvolutionColumnNames = null;
  List<TypeDescription> schemaEvolutionTypeDescrs = null;
  bool haveSchemaEvolutionProperties = false;

  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION)) {
    columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
    columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
    haveSchemaEvolutionProperties =
        (columnNameProperty != null && columnTypeProperty != null);
    if (haveSchemaEvolutionProperties) {
      schemaEvolutionColumnNames = columnNameProperty.Split(',');
      if (schemaEvolutionColumnNames.Count == 0) {
        haveSchemaEvolutionProperties = false;
      } else {
        schemaEvolutionTypeDescrs =
            OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
        if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count) {
          haveSchemaEvolutionProperties = false;
        }
      }
    }
  }

  if (!haveSchemaEvolutionProperties) {
    // Try regular properties.
    columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
    columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
    if (columnTypeProperty == null || columnNameProperty == null) {
      return null;
    }
    schemaEvolutionColumnNames = columnNameProperty.Split(',');
    if (schemaEvolutionColumnNames.Count == 0) {
      return null;
    }
    schemaEvolutionTypeDescrs =
        OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
    if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count) {
      return null;
    }
  }

  // Desired schema does not include virtual columns or partition columns.
  TypeDescription result = TypeDescription.createStruct();
  for (int i = 0; i < schemaEvolutionColumnNames.Count; i++) {
    result.addField(schemaEvolutionColumnNames[i], schemaEvolutionTypeDescrs[i]);
  }
  return result;
}
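// Worked example (sketch, not in the original source): with the schema-evolution
// properties absent and the regular serde properties set as below, the method builds
// struct<id:bigint,name:string>. The property values are illustrative assumptions;
// the exact separator accepted by typeDescriptionsFromHiveTypeProperty follows the
// Hive "columns.types" convention.
static TypeDescription exampleDesiredSchema(Configuration conf) {
  conf.set(serdeConstants.LIST_COLUMNS, "id,name");
  conf.set(serdeConstants.LIST_COLUMN_TYPES, "bigint:string");
  return getDesiredRowTypeDescr(conf);   // struct<id:bigint,name:string>
}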