Esempio n. 1
0
 /**
  * A required option that sets the object inspector for the rows. If
  * setSchema is not called, it also defines the schema.
  */
 public WriterOptions inspector(ObjectInspector value)
 {
     _inspector = value;
     if (!explicitSchema)
     {
         schema = OrcUtils.convertTypeInfo(
             TypeInfoUtils.getTypeInfoFromObjectInspector(value));
     }
     return(this);
 }
Esempio n. 2
0
        /**
         * @param typeDescr
         * @return ORC types for the ACID event based on the row's type description
         */
        public static List <OrcProto.Type> createEventSchema(TypeDescription typeDescr)
        {
            List <OrcProto.Type> result = new List <OrcProto.Type>();

            OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
            type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
            type.AddRangeFieldNames(acidEventFieldNames);
            for (int i = 0; i < acidEventFieldNames.Length; i++)
            {
                type.AddSubtypes((uint)i + 1);
            }
            result.Add(type.Build());

            // Automatically add all fields except the last (ROW).
            for (int i = 0; i < acidEventOrcTypeKinds.Length - 1; i++)
            {
                type.Clear();
                type.SetKind(acidEventOrcTypeKinds[i]);
                result.Add(type.Build());
            }

            OrcUtils.appendOrcTypesRebuildSubtypes(result, typeDescr);
            return(result);
        }
        /**
         * Create a reader that merge sorts the ACID events together.
         * @param conf the configuration
         * @param collapseEvents should the events on the same row be collapsed
         * @param isOriginal is the base file a pre-acid file
         * @param bucket the bucket we are reading
         * @param options the options to read with
         * @param deltaDirectory the list of delta directories to include
         * @
         */
        OrcRawRecordMerger(Configuration conf,
                           bool collapseEvents,
                           Reader reader,
                           bool isOriginal,
                           int bucket,
                           ValidTxnList validTxnList,
                           Reader.Options options,
                           Path[] deltaDirectory)
        {
            this.conf         = conf;
            this.collapse     = collapseEvents;
            this.offset       = options.getOffset();
            this.length       = options.getLength();
            this.validTxnList = validTxnList;
            TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);

            if (typeDescr == null)
            {
                throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
            }

            objectInspector = OrcRecordUpdater.createEventSchema
                                  (OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

            // modify the options to reflect the event instead of the base row
            Reader.Options eventOptions = createEventOptions(options);
            if (reader == null)
            {
                baseReader = null;
            }
            else
            {
                // find the min/max based on the offset and length
                if (isOriginal)
                {
                    discoverOriginalKeyBounds(reader, bucket, options);
                }
                else
                {
                    discoverKeyBounds(reader, options);
                }
                LOG.info("min key = " + minKey + ", max key = " + maxKey);
                // use the min/max instead of the byte range
                ReaderPair pair;
                ReaderKey  key = new ReaderKey();
                if (isOriginal)
                {
                    options = options.clone();
                    options.range(options.getOffset(), Long.MAX_VALUE);
                    pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
                                                  options);
                }
                else
                {
                    pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
                                          eventOptions, 0);
                }

                // if there is at least one record, put it in the map
                if (pair.nextRecord != null)
                {
                    readers.put(key, pair);
                }
                baseReader = pair.recordReader;
            }

            // we always want to read all of the deltas
            eventOptions.range(0, Long.MAX_VALUE);
            if (deltaDirectory != null)
            {
                foreach (Path delta in deltaDirectory)
                {
                    ReaderKey             key       = new ReaderKey();
                    Path                  deltaFile = AcidUtils.createBucketFile(delta, bucket);
                    AcidUtils.ParsedDelta deltaDir  = AcidUtils.parsedDelta(delta);
                    FileSystem            fs        = deltaFile.getFileSystem(conf);
                    long                  length    = getLastFlushLength(fs, deltaFile);
                    if (length != -1 && fs.exists(deltaFile))
                    {
                        Reader deltaReader = OrcFile.createReader(deltaFile,
                                                                  OrcFile.readerOptions(conf).maxLength(length));
                        Reader.Options deltaEventOptions = null;
                        if (eventOptions.getSearchArgument() != null)
                        {
                            // Turn off the sarg before pushing it to delta.  We never want to push a sarg to a delta as
                            // it can produce wrong results (if the latest valid version of the record is filtered out by
                            // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
                            // unless the delta only has insert events
                            OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
                            if (acidStats.deletes > 0 || acidStats.updates > 0)
                            {
                                deltaEventOptions = eventOptions.clone().searchArgument(null, null);
                            }
                        }
                        ReaderPair deltaPair;
                        deltaPair = new ReaderPair(key, deltaReader, bucket, minKey,
                                                   maxKey, deltaEventOptions != null ? deltaEventOptions : eventOptions, deltaDir.getStatementId());
                        if (deltaPair.nextRecord != null)
                        {
                            readers.put(key, deltaPair);
                        }
                    }
                }
            }

            // get the first record
            Map.Entry <ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
            if (entry == null)
            {
                columns = 0;
                primary = null;
            }
            else
            {
                primary = entry.getValue();
                if (readers.isEmpty())
                {
                    secondaryKey = null;
                }
                else
                {
                    secondaryKey = readers.firstKey();
                }
                // get the number of columns in the user's rows
                columns = primary.getColumns();
            }
        }
Esempio n. 4
0
        public static TreeReaderFactory.TreeReaderSchema validateAndCreate(
            IList <OrcProto.Type> fileTypes,
            IList <OrcProto.Type> schemaTypes)
        {
            // For ACID, the row is the ROW field in the outer STRUCT.
            bool isAcid = checkAcidSchema(fileTypes);
            IList <OrcProto.Type> rowSchema;
            int rowSubtype;

            if (isAcid)
            {
                rowSubtype = OrcRecordUpdater.ROW + 1;
                rowSchema  = fileTypes.subList(rowSubtype, fileTypes.Count);
            }
            else
            {
                rowSubtype = 0;
                rowSchema  = fileTypes;
            }

            // Do checking on the overlap.  Additional columns will be defaulted to NULL.

            int numFileColumns    = rowSchema[0].SubtypesCount;
            int numDesiredColumns = schemaTypes[0].SubtypesCount;

            int numReadColumns = Math.Min(numFileColumns, numDesiredColumns);

            /**
             * Check type promotion.
             *
             * Currently, we only support integer type promotions that can be done "implicitly".
             * That is, we know that using a bigger integer tree reader on the original smaller integer
             * column will "just work".
             *
             * In the future, other type promotions might require type conversion.
             */
            // short -> int -> bigint as same integer readers are used for the above types.

            for (int i = 0; i < numReadColumns; i++)
            {
                OrcProto.Type fColType = fileTypes[rowSubtype + i];
                OrcProto.Type rColType = schemaTypes[i];
                if (fColType.Kind != rColType.Kind)
                {
                    bool ok = false;
                    if (fColType.Kind == OrcProto.Type.Types.Kind.SHORT)
                    {
                        if (rColType.Kind == OrcProto.Type.Types.Kind.INT ||
                            rColType.Kind == OrcProto.Type.Types.Kind.LONG)
                        {
                            // type promotion possible, converting SHORT to INT/LONG requested type
                            ok = true;
                        }
                    }
                    else if (fColType.Kind == OrcProto.Type.Types.Kind.INT)
                    {
                        if (rColType.Kind == OrcProto.Type.Types.Kind.LONG)
                        {
                            // type promotion possible, converting INT to LONG requested type
                            ok = true;
                        }
                    }

                    if (!ok)
                    {
                        throw new IOException("ORC does not support type conversion from " +
                                              fColType.Kind.ToString() + " to " + rColType.Kind.ToString());
                    }
                }
            }

            IList <OrcProto.Type> fullSchemaTypes;

            if (isAcid)
            {
                fullSchemaTypes = new List <OrcProto.Type>();

                // This copies the ACID struct type which is subtype = 0.
                // It has field names "operation" through "row".
                // And we copy the types for all fields EXCEPT ROW (which must be last!).

                for (int i = 0; i < rowSubtype; i++)
                {
                    fullSchemaTypes.Add(fileTypes[i].ToBuilder().Build());
                }

                // Add the row struct type.
                OrcUtils.appendOrcTypesRebuildSubtypes(fullSchemaTypes, schemaTypes, 0);
            }
            else
            {
                fullSchemaTypes = schemaTypes;
            }

            int innerStructSubtype = rowSubtype;

            // LOG.info("Schema evolution: (fileTypes) " + fileTypes.toString() +
            //     " (schemaEvolutionTypes) " + schemaEvolutionTypes.toString());

            return(new TreeReaderFactory.TreeReaderSchema().
                   fileTypes(fileTypes).
                   schemaTypes(fullSchemaTypes).
                   innerStructSubtype(innerStructSubtype));
        }
Esempio n. 5
0
        public static TypeDescription getDesiredRowTypeDescr(Configuration conf)
        {
            string columnNameProperty = null;
            string columnTypeProperty = null;

            IList <string>         schemaEvolutionColumnNames = null;
            List <TypeDescription> schemaEvolutionTypeDescrs  = null;

            bool haveSchemaEvolutionProperties = false;

            if (HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION))
            {
                columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
                columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);

                haveSchemaEvolutionProperties =
                    (columnNameProperty != null && columnTypeProperty != null);

                if (haveSchemaEvolutionProperties)
                {
                    schemaEvolutionColumnNames = columnNameProperty.Split(',');
                    if (schemaEvolutionColumnNames.Count == 0)
                    {
                        haveSchemaEvolutionProperties = false;
                    }
                    else
                    {
                        schemaEvolutionTypeDescrs =
                            OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
                        if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count)
                        {
                            haveSchemaEvolutionProperties = false;
                        }
                    }
                }
            }

            if (!haveSchemaEvolutionProperties)
            {
                // Try regular properties;
                columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
                columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
                if (columnTypeProperty == null || columnNameProperty == null)
                {
                    return(null);
                }

                schemaEvolutionColumnNames = columnNameProperty.Split(',');
                if (schemaEvolutionColumnNames.Count == 0)
                {
                    return(null);
                }
                schemaEvolutionTypeDescrs =
                    OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
                if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count)
                {
                    return(null);
                }
            }

            // Desired schema does not include virtual columns or partition columns.
            TypeDescription result = TypeDescription.createStruct();

            for (int i = 0; i < schemaEvolutionColumnNames.Count; i++)
            {
                result.addField(schemaEvolutionColumnNames[i], schemaEvolutionTypeDescrs[i]);
            }

            return(result);
        }