Beispiel #1
0
        /**
         * Translates top-level field names into the column indices they cover.
         *
         * Names that are not fields of the top-level struct (types[0]) are logged
         * with a warning and skipped. For every other name, the columns from the
         * field's own subtype id up to (but excluding) the next field's subtype id
         * are added; for the last field the range ends at getLastIdx() + 1.
         *
         * @param colNames field names to look up in the top-level struct
         * @return the collected column indices, in the order the names were given
         */
        private List <int> getColumnIndicesFromNames(List <string> colNames)
        {
            // top level struct
            OrcProto.Type  rootType       = types[0];
            IList <string> rootFieldNames = rootType.FieldNamesList;
            List <int>     selected       = new List <int>();

            foreach (string requested in colNames)
            {
                int position = rootFieldNames.IndexOf(requested);
                if (position < 0)
                {
                    // Unknown field: report it together with the known field names.
                    System.Text.StringBuilder message = new System.Text.StringBuilder();
                    message.Append("Cannot find field for: ").Append(requested).Append(" in ");
                    foreach (string known in rootFieldNames)
                    {
                        message.Append(known).Append(", ");
                    }
                    LOG.warn(message.ToString());
                    continue;
                }

                // A single field may span several columns: [first, limit).
                int first = (int)rootType.SubtypesList[position];

                // The last field has no successor, so its range ends just past
                // the last column index.
                bool isLastField = position + 1 > rootFieldNames.Count - 1;
                int  limit       = isLastField
                                   ? getLastIdx() + 1
                                   : (int)rootType.SubtypesList[position + 1];

                if (first == limit)
                {
                    // start and end coincide: treat as a simple, single-column field
                    selected.Add(first);
                    continue;
                }

                // complex field (map, list, struct, union) spanning multiple columns
                for (int column = first; column < limit; column++)
                {
                    selected.Add(column);
                }
            }

            return(selected);
        }
 /**
  * Creates the inspector for the type at columnId by building one child
  * inspector for each of that type's subtypes.
  *
  * @param columnId index into types of the type to inspect
  * @param types    the type list the subtype ids refer to
  */
 public OrcUnionObjectInspector(int columnId, IList <OrcProto.Type> types)
 {
     OrcProto.Type unionType    = types[columnId];
     int           variantCount = unionType.SubtypesCount;

     children = new List <ObjectInspector>(variantCount);
     for (int variant = 0; variant < variantCount; variant++)
     {
         int childColumn = (int)unionType.SubtypesList[variant];
         children.Add(OrcStruct.createObjectInspector(childColumn, types));
     }
 }
Beispiel #3
0
        /**
         * Estimates the raw data size of one column from its column statistics.
         *
         * Fixed-width kinds are sized as (number of values) x (per-value size taken
         * from JavaDataModel); binary columns use the sum recorded in their binary
         * statistics; string-like columns use the average string length derived
         * from the string statistics sum.
         *
         * @param colIdx index of the column in both types and stats
         * @param types  type list; the entry at colIdx decides how the size is computed
         * @param stats  column statistics; the entry at colIdx supplies counts and sums
         * @return the estimated size, or 0 for kinds not handled here
         */
        private static long getRawDataSizeOfColumn(int colIdx, IList <OrcProto.Type> types,
                                                   IList <OrcProto.ColumnStatistics> stats)
        {
            OrcProto.ColumnStatistics columnStats = stats[colIdx];
            long valueCount = (long)columnStats.NumberOfValues;
            OrcProto.Type columnType = types[colIdx];

            switch (columnType.Kind)
            {
            // 4-byte slots
            case OrcProto.Type.Types.Kind.BOOLEAN:
            case OrcProto.Type.Types.Kind.BYTE:
            case OrcProto.Type.Types.Kind.SHORT:
            case OrcProto.Type.Types.Kind.INT:
            case OrcProto.Type.Types.Kind.FLOAT:
                return(valueCount * JavaDataModel.Four);

            // 8-byte slots
            case OrcProto.Type.Types.Kind.LONG:
            case OrcProto.Type.Types.Kind.DOUBLE:
                return(valueCount * JavaDataModel.Eight);

            case OrcProto.Type.Types.Kind.DECIMAL:
                return(valueCount * JavaDataModel.lengthOfDecimal());

            case OrcProto.Type.Types.Kind.DATE:
                return(valueCount * JavaDataModel.lengthOfDate());

            case OrcProto.Type.Types.Kind.TIMESTAMP:
                return(valueCount * JavaDataModel.lengthOfTimestamp());

            case OrcProto.Type.Types.Kind.CHAR:
            case OrcProto.Type.Types.Kind.VARCHAR:
            case OrcProto.Type.Types.Kind.STRING:
            {
                // Old ORC files carry no sum in their string statistics; no explicit
                // presence check is made here because protocol buffers handle the
                // missing field.
                // Guard the division below against an empty column.
                if (valueCount == 0)
                {
                    valueCount = 1;
                }

                // Size strings with the data model's string size for the average length.
                int averageLength = (int)(columnStats.StringStatistics.Sum / valueCount);
                return(valueCount * JavaDataModel.lengthForStringOfLength(averageLength));
            }

            case OrcProto.Type.Types.Kind.BINARY:
                // Old ORC files carry no binary statistics; as above, protocol buffers
                // handle the missing field so no presence check is made.
                return(columnStats.BinaryStatistics.Sum);

            default:
                LOG.debug("Unknown primitive category: " + columnType.Kind);
                return(0);
            }
        }
            /**
             * Creates the inspector for the type at columnId: one Field per subtype,
             * pairing the i-th field name with an inspector created for the i-th
             * subtype id and the field's position i.
             *
             * @param columnId index into types of the type to inspect
             * @param types    the type list the subtype ids refer to
             */
            public OrcStructInspector(int columnId, IList <OrcProto.Type> types)
            {
                OrcProto.Type structType  = types[columnId];
                int           memberCount = structType.SubtypesCount;

                fields = new List <StructField>(memberCount);

                int position = 0;
                while (position < memberCount)
                {
                    int             memberColumn    = (int)structType.SubtypesList[position];
                    string          memberName      = structType.FieldNamesList[position];
                    ObjectInspector memberInspector = createObjectInspector(memberColumn, types);

                    fields.Add(new Field(memberName, memberInspector, position));
                    position++;
                }
            }
        /**
         * Returns the object inspector matching the kind of the type at columnId.
         *
         * Kinds without parameters map to the shared writable inspectors of
         * PrimitiveObjectInspectorFactory; CHAR, VARCHAR and DECIMAL build a type
         * info from the type's length / precision / scale first; STRUCT, UNION, MAP
         * and LIST get a dedicated inspector constructed from (columnId, types).
         *
         * @param columnId index into types of the type to inspect
         * @param types    the full type list (passed on to compound inspectors)
         * @return the inspector for that type
         * @throws NotSupportedException for CHAR/VARCHAR without a maximum length
         *         and for kinds not listed here
         */
        public static ObjectInspector createObjectInspector(int columnId, IList <OrcProto.Type> types)
        {
            OrcProto.Type target = types[columnId];

            switch (target.Kind)
            {
            // ---- kinds backed by a shared writable inspector ----
            case OrcProto.Type.Types.Kind.BOOLEAN:
                return(PrimitiveObjectInspectorFactory.writableBooleanObjectInspector);

            case OrcProto.Type.Types.Kind.BYTE:
                return(PrimitiveObjectInspectorFactory.writableByteObjectInspector);

            case OrcProto.Type.Types.Kind.SHORT:
                return(PrimitiveObjectInspectorFactory.writableShortObjectInspector);

            case OrcProto.Type.Types.Kind.INT:
                return(PrimitiveObjectInspectorFactory.writableIntObjectInspector);

            case OrcProto.Type.Types.Kind.LONG:
                return(PrimitiveObjectInspectorFactory.writableLongObjectInspector);

            case OrcProto.Type.Types.Kind.FLOAT:
                return(PrimitiveObjectInspectorFactory.writableFloatObjectInspector);

            case OrcProto.Type.Types.Kind.DOUBLE:
                return(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);

            case OrcProto.Type.Types.Kind.STRING:
                return(PrimitiveObjectInspectorFactory.writableStringObjectInspector);

            case OrcProto.Type.Types.Kind.BINARY:
                return(PrimitiveObjectInspectorFactory.writableBinaryObjectInspector);

            case OrcProto.Type.Types.Kind.DATE:
                return(PrimitiveObjectInspectorFactory.writableDateObjectInspector);

            case OrcProto.Type.Types.Kind.TIMESTAMP:
                return(PrimitiveObjectInspectorFactory.writableTimestampObjectInspector);

            // ---- kinds that need extra type parameters ----
            case OrcProto.Type.Types.Kind.CHAR:
                if (target.HasMaximumLength)
                {
                    return(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                               TypeInfoFactory.getCharTypeInfo((int)target.MaximumLength)));
                }
                throw new NotSupportedException(
                          "Illegal use of char type without length in ORC type definition.");

            case OrcProto.Type.Types.Kind.VARCHAR:
                if (target.HasMaximumLength)
                {
                    return(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                               TypeInfoFactory.getVarcharTypeInfo((int)target.MaximumLength)));
                }
                throw new NotSupportedException(
                          "Illegal use of varchar type without length in ORC type definition.");

            case OrcProto.Type.Types.Kind.DECIMAL:
            {
                // Fall back to the HiveDecimal system defaults when the type does not
                // carry its own precision / scale.
                int decimalPrecision;
                if (target.HasPrecision)
                {
                    decimalPrecision = (int)target.Precision;
                }
                else
                {
                    decimalPrecision = HiveDecimal.SYSTEM_DEFAULT_PRECISION;
                }

                int decimalScale;
                if (target.HasScale)
                {
                    decimalScale = (int)target.Scale;
                }
                else
                {
                    decimalScale = HiveDecimal.SYSTEM_DEFAULT_SCALE;
                }

                return(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                           TypeInfoFactory.getDecimalTypeInfo(decimalPrecision, decimalScale)));
            }

            // ---- compound kinds ----
            case OrcProto.Type.Types.Kind.LIST:
                return(new OrcListObjectInspector(columnId, types));

            case OrcProto.Type.Types.Kind.MAP:
                return(new OrcMapObjectInspector(columnId, types));

            case OrcProto.Type.Types.Kind.STRUCT:
                return(new OrcStructInspector(columnId, types));

            case OrcProto.Type.Types.Kind.UNION:
                return(new OrcUnion.OrcUnionObjectInspector(columnId, types));

            default:
                throw new NotSupportedException("Unknown type " +
                                                target.Kind);
            }
        }
 /**
  * Creates the inspector for the type at columnId; the element inspector is
  * created from that type's first (index 0) subtype.
  *
  * @param columnId index into types of the type to inspect
  * @param types    the type list the subtype id refers to
  */
 public OrcListObjectInspector(int columnId, IList <OrcProto.Type> types)
 {
     OrcProto.Type listType      = types[columnId];
     int           elementColumn = (int)listType.SubtypesList[0];

     child = createObjectInspector(elementColumn, types);
 }
 /**
  * Creates the inspector for the type at columnId; subtype 0 supplies the key
  * inspector and subtype 1 the value inspector.
  *
  * @param columnId index into types of the type to inspect
  * @param types    the type list the subtype ids refer to
  */
 public OrcMapObjectInspector(int columnId, IList <OrcProto.Type> types)
 {
     OrcProto.Type mapType = types[columnId];

     int keyColumn = (int)mapType.SubtypesList[0];
     key = createObjectInspector(keyColumn, types);

     int valueColumn = (int)mapType.SubtypesList[1];
     value = createObjectInspector(valueColumn, types);
 }
Beispiel #8
0
        /**
         * Checks that the file's types can be read with the desired schema and
         * builds the TreeReaderSchema describing how to do so.
         *
         * Only the overlapping columns are checked. Where the kinds differ, the
         * sole accepted differences are the implicit integer promotions
         * SHORT -> INT, SHORT -> LONG and INT -> LONG.
         *
         * For ACID files (as reported by checkAcidSchema) the leading file types up
         * to the row struct are copied as-is and the desired schema is appended
         * after them with rebuilt subtype ids.
         *
         * @param fileTypes   types as stored in the file
         * @param schemaTypes types of the desired (reader) schema
         * @return the schema holding fileTypes, the full reader types and the index
         *         of the inner row struct
         * @throws IOException if a column pair differs in kind and is not one of
         *         the supported promotions
         */
        public static TreeReaderFactory.TreeReaderSchema validateAndCreate(
            IList <OrcProto.Type> fileTypes,
            IList <OrcProto.Type> schemaTypes)
        {
            // For ACID, the row is the ROW field in the outer STRUCT.
            bool acidFile = checkAcidSchema(fileTypes);

            int rowStructIndex             = 0;
            IList <OrcProto.Type> rowTypes = fileTypes;
            if (acidFile)
            {
                rowStructIndex = OrcRecordUpdater.ROW + 1;
                rowTypes       = fileTypes.subList(rowStructIndex, fileTypes.Count);
            }

            // Only the overlap is checked. Additional columns will be defaulted to NULL.
            int sharedColumnCount = Math.Min(rowTypes[0].SubtypesCount,
                                             schemaTypes[0].SubtypesCount);

            // Type promotion check: short -> int -> bigint work implicitly because the
            // same integer readers are used for all three. Other promotions might
            // require real type conversion in the future.
            for (int column = 0; column < sharedColumnCount; column++)
            {
                OrcProto.Type storedType  = fileTypes[rowStructIndex + column];
                OrcProto.Type desiredType = schemaTypes[column];

                OrcProto.Type.Types.Kind storedKind  = storedType.Kind;
                OrcProto.Type.Types.Kind desiredKind = desiredType.Kind;

                if (storedKind == desiredKind)
                {
                    continue;
                }

                bool implicitPromotion =
                    (storedKind == OrcProto.Type.Types.Kind.SHORT &&
                     (desiredKind == OrcProto.Type.Types.Kind.INT ||
                      desiredKind == OrcProto.Type.Types.Kind.LONG)) ||
                    (storedKind == OrcProto.Type.Types.Kind.INT &&
                     desiredKind == OrcProto.Type.Types.Kind.LONG);

                if (implicitPromotion)
                {
                    continue;
                }

                throw new IOException("ORC does not support type conversion from " +
                                      storedKind.ToString() + " to " + desiredKind.ToString());
            }

            IList <OrcProto.Type> readerTypes = schemaTypes;
            if (acidFile)
            {
                readerTypes = new List <OrcProto.Type>();

                // Copy the ACID struct type (subtype 0, fields "operation" through
                // "row") and the types of every field EXCEPT ROW (which must be last!).
                int copied = 0;
                while (copied < rowStructIndex)
                {
                    readerTypes.Add(fileTypes[copied].ToBuilder().Build());
                    copied++;
                }

                // Add the row struct type.
                OrcUtils.appendOrcTypesRebuildSubtypes(readerTypes, schemaTypes, 0);
            }

            TreeReaderFactory.TreeReaderSchema readerSchema = new TreeReaderFactory.TreeReaderSchema();
            return(readerSchema.
                   fileTypes(fileTypes).
                   schemaTypes(readerTypes).
                   innerStructSubtype(rowStructIndex));
        }
Beispiel #9
0
        /**
         * Copies the type tree rooted at types[columnId] onto the end of result,
         * assigning fresh subtype ids along the way.
         *
         * NOTE: the subtype numbers stored in the source OrcProto.Type entries are
         * ignored; new ones are derived from the length of result at the moment
         * each child is appended. Children are read from the entries of types that
         * directly follow their parent.
         *
         * @param result   list that receives the rebuilt types
         * @param types    source type list
         * @param columnId index in types of the type to copy
         * @return the index in types just past the last entry consumed
         * @throws ArgumentException if a type has a kind not handled here
         */
        public static int appendOrcTypesRebuildSubtypes(
            IList <OrcProto.Type> result,
            IList <OrcProto.Type> types,
            int columnId)
        {
            OrcProto.Type source = types[columnId];
            columnId++;

            // Slot this type occupies (or will occupy) in result.
            int ownSlot = result.Count;

            OrcProto.Type.Builder rebuilt = OrcProto.Type.CreateBuilder();

            switch (source.Kind)
            {
            // Kinds that carry nothing but the kind itself.
            case OrcProto.Type.Types.Kind.BOOLEAN:
            case OrcProto.Type.Types.Kind.BYTE:
            case OrcProto.Type.Types.Kind.SHORT:
            case OrcProto.Type.Types.Kind.INT:
            case OrcProto.Type.Types.Kind.LONG:
            case OrcProto.Type.Types.Kind.FLOAT:
            case OrcProto.Type.Types.Kind.DOUBLE:
            case OrcProto.Type.Types.Kind.STRING:
            case OrcProto.Type.Types.Kind.BINARY:
            case OrcProto.Type.Types.Kind.TIMESTAMP:
            case OrcProto.Type.Types.Kind.DATE:
                rebuilt.SetKind(source.Kind);
                break;

            // Length-limited strings also copy the maximum length.
            case OrcProto.Type.Types.Kind.CHAR:
            case OrcProto.Type.Types.Kind.VARCHAR:
                rebuilt.SetKind(source.Kind);
                rebuilt.SetMaximumLength(source.MaximumLength);
                break;

            case OrcProto.Type.Types.Kind.DECIMAL:
                rebuilt.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
                rebuilt.SetPrecision(source.Precision);
                rebuilt.SetScale(source.Scale);
                break;

            case OrcProto.Type.Types.Kind.LIST:
                // The element lands right after the list itself, so the list can be
                // finished and appended before its child is processed.
                rebuilt.SetKind(OrcProto.Type.Types.Kind.LIST);
                rebuilt.AddSubtypes((uint)(ownSlot + 1));
                result.Add(rebuilt.Build());
                return(appendOrcTypesRebuildSubtypes(result, types, columnId));

            case OrcProto.Type.Types.Kind.MAP:
            {
                // Reserve the MAP's slot; it is filled in once both children are
                // placed and their positions are known.
                result.Add(null);
                rebuilt.SetKind(OrcProto.Type.Types.Kind.MAP);

                // Key: sits directly after the MAP's own slot.
                rebuilt.AddSubtypes((uint)ownSlot + 1);
                columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);

                // Value: starts wherever the key's subtree ended.
                rebuilt.AddSubtypes((uint)result.Count);
                columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);

                result[ownSlot] = rebuilt.Build();
                return(columnId);
            }

            case OrcProto.Type.Types.Kind.STRUCT:
            {
                IList <string> memberNames = source.FieldNamesList;

                // Reserve the STRUCT's slot.
                result.Add(null);
                rebuilt.SetKind(OrcProto.Type.Types.Kind.STRUCT);

                for (int member = 0; member < memberNames.Count; member++)
                {
                    // Each field starts at the current end of result.
                    rebuilt.AddSubtypes((uint)result.Count);
                    rebuilt.AddFieldNames(memberNames[member]);
                    columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
                }

                result[ownSlot] = rebuilt.Build();
                return(columnId);
            }

            case OrcProto.Type.Types.Kind.UNION:
            {
                int variantCount = source.SubtypesCount;

                // Reserve the UNION's slot.
                result.Add(null);
                rebuilt.SetKind(OrcProto.Type.Types.Kind.UNION);

                for (int variant = 0; variant < variantCount; variant++)
                {
                    // Each alternative starts at the current end of result.
                    rebuilt.AddSubtypes((uint)result.Count);
                    columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
                }

                result[ownSlot] = rebuilt.Build();
                return(columnId);
            }

            default:
                throw new ArgumentException("Unknown category: " + source.Kind);
            }

            // Only the childless kinds reach this point.
            result.Add(rebuilt.Build());
            return(columnId);
        }
Beispiel #10
0
        /**
         * Adds to list one disk range per selected row group of the given stream.
         *
         * For every group flagged in includedRowGroups, the group's start position
         * is read from the row index (at the slot returned by getIndexPosition), the
         * end is estimated with estimateRgEndOffset, and both are shifted by offset
         * before being handed to list.addOrMerge.
         *
         * @param stream            stream whose ranges are added (its kind selects the index slot)
         * @param includedRowGroups one flag per row group; unflagged groups are skipped
         * @param isCompressed      passed on to getIndexPosition and estimateRgEndOffset
         * @param index             row index supplying the per-group positions
         * @param encoding          column encoding (its kind selects the index slot)
         * @param type              column type (its kind selects the index slot)
         * @param compressionSize   passed on to estimateRgEndOffset
         * @param hasNull           passed on to getIndexPosition
         * @param offset            added to both ends of every range
         * @param length            used as the next-group offset for the last group
         * @param list              receiver of the computed ranges
         * @param doMergeBuffers    passed on to list.addOrMerge
         */
        public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
                                                       bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
                                                       OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, bool hasNull,
                                                       long offset, long length, DiskRangeList.CreateHelper list, bool doMergeBuffers)
        {
            for (int group = 0; group < includedRowGroups.Length; ++group)
            {
                if (!includedRowGroups[group])
                {
                    continue;
                }
                int posn = getIndexPosition(
                    encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
                long start = (long)index.EntryList[group].PositionsList[posn];
                bool isLast = group == (includedRowGroups.Length - 1);

                // The position must be widened to long exactly like `start` above;
                // narrowing it to int would corrupt offsets beyond 2 GiB.
                long nextGroupOffset = isLast
                                       ? length
                                       : (long)index.EntryList[group + 1].PositionsList[posn];

                start += offset;
                long end = offset + estimateRgEndOffset(
                    isCompressed, isLast, nextGroupOffset, length, compressionSize);
                list.addOrMerge(start, end, doMergeBuffers, true);
            }
        }