/// <summary>
/// Maps top-level struct field names to the ORC column indices they occupy.
/// A field that maps to a complex type (map, list, struct, union) spans a
/// contiguous range of column indices; all of them are returned.
/// Unknown field names are logged and skipped rather than failing.
/// </summary>
/// <param name="colNames">requested top-level field names</param>
/// <returns>flat list of column indices covering every requested field</returns>
private List<int> getColumnIndicesFromNames(List<string> colNames)
{
    // types[0] is the top-level struct; its subtypes are the root columns.
    OrcProto.Type type = types[0];
    List<int> colIndices = new List<int>();
    IList<string> fieldNames = type.FieldNamesList;
    foreach (string colName in colNames)
    {
        int fieldIdx = fieldNames.IndexOf(colName);
        if (fieldIdx < 0)
        {
            // Build the field list once with string.Join instead of repeated
            // string concatenation in a loop (which is O(n^2) in total length).
            // The conditional trailing ", " preserves the original message text.
            LOG.warn("Cannot find field for: " + colName + " in "
                + string.Join(", ", fieldNames)
                + (fieldNames.Count > 0 ? ", " : ""));
            continue;
        }

        // A single field may span multiple columns: [idxStart, idxEnd) is the
        // column index range for the requested field.
        int idxStart = (int)type.SubtypesList[fieldIdx];
        int idxEnd;
        if (fieldIdx + 1 > fieldNames.Count - 1)
        {
            // Last field: the range extends through the last column index.
            idxEnd = getLastIdx() + 1;
        }
        else
        {
            idxEnd = (int)type.SubtypesList[fieldIdx + 1];
        }

        if (idxStart == idxEnd)
        {
            // Start and end equal: treated as a primitive (single-column) field.
            colIndices.Add(idxStart);
        }
        else
        {
            // Complex field (map, list, struct, union) spans multiple columns.
            for (int i = idxStart; i < idxEnd; i++)
            {
                colIndices.Add(i);
            }
        }
    }
    return colIndices;
}
/// <summary>
/// Builds one child inspector per union alternative, in subtype order.
/// </summary>
/// <param name="columnId">index of the UNION type within <paramref name="types"/></param>
/// <param name="types">flattened ORC type list</param>
public OrcUnionObjectInspector(int columnId, IList<OrcProto.Type> types)
{
    OrcProto.Type unionType = types[columnId];
    children = new List<ObjectInspector>(unionType.SubtypesCount);
    foreach (var childColumn in unionType.SubtypesList)
    {
        children.Add(OrcStruct.createObjectInspector((int)childColumn, types));
    }
}
/// <summary>
/// Estimates the raw (Java data model, in-memory) data size of one column
/// from its protobuf column statistics.
/// </summary>
/// <param name="colIdx">column index into both lists</param>
/// <param name="types">flattened ORC type list</param>
/// <param name="stats">per-column statistics, parallel to <paramref name="types"/></param>
/// <returns>estimated raw size in bytes; 0 for unrecognized kinds</returns>
private static long getRawDataSizeOfColumn(int colIdx, IList<OrcProto.Type> types,
    IList<OrcProto.ColumnStatistics> stats)
{
    OrcProto.Type columnType = types[colIdx];
    OrcProto.ColumnStatistics columnStats = stats[colIdx];
    long valueCount = (long)columnStats.NumberOfValues;

    switch (columnType.Kind)
    {
        case OrcProto.Type.Types.Kind.BINARY:
            // Old ORC format doesn't support binary statistics; no existence
            // check is needed since protocol buffers supplies defaults.
            return columnStats.BinaryStatistics.Sum;

        case OrcProto.Type.Types.Kind.STRING:
        case OrcProto.Type.Types.Kind.CHAR:
        case OrcProto.Type.Types.Kind.VARCHAR:
        {
            // ORC strings are deserialized to java strings, so size them with
            // the java data model. Clamp to 1 to avoid dividing by zero.
            valueCount = valueCount == 0 ? 1 : valueCount;
            int averageLength = (int)(columnStats.StringStatistics.Sum / valueCount);
            return valueCount * JavaDataModel.lengthForStringOfLength(averageLength);
        }

        case OrcProto.Type.Types.Kind.TIMESTAMP:
            return valueCount * JavaDataModel.lengthOfTimestamp();

        case OrcProto.Type.Types.Kind.DATE:
            return valueCount * JavaDataModel.lengthOfDate();

        case OrcProto.Type.Types.Kind.DECIMAL:
            return valueCount * JavaDataModel.lengthOfDecimal();

        // Eight-byte primitives.
        case OrcProto.Type.Types.Kind.DOUBLE:
        case OrcProto.Type.Types.Kind.LONG:
            return valueCount * JavaDataModel.Eight;

        // Types the Java data model accounts as four bytes.
        case OrcProto.Type.Types.Kind.FLOAT:
        case OrcProto.Type.Types.Kind.INT:
        case OrcProto.Type.Types.Kind.SHORT:
        case OrcProto.Type.Types.Kind.BOOLEAN:
        case OrcProto.Type.Types.Kind.BYTE:
            return valueCount * JavaDataModel.Four;

        default:
            LOG.debug("Unknown primitive category: " + columnType.Kind);
            return 0;
    }
}
/// <summary>
/// Builds a StructField (name + child inspector + ordinal) for each field of
/// the STRUCT type at <paramref name="columnId"/>.
/// </summary>
/// <param name="columnId">index of the STRUCT type within <paramref name="types"/></param>
/// <param name="types">flattened ORC type list</param>
public OrcStructInspector(int columnId, IList<OrcProto.Type> types)
{
    OrcProto.Type structType = types[columnId];
    int fieldCount = structType.SubtypesCount;
    fields = new List<StructField>(fieldCount);
    int fieldIndex = 0;
    while (fieldIndex < fieldCount)
    {
        int childColumn = (int)structType.SubtypesList[fieldIndex];
        fields.Add(new Field(structType.FieldNamesList[fieldIndex],
            createObjectInspector(childColumn, types), fieldIndex));
        fieldIndex++;
    }
}
/// <summary>
/// Factory method: returns the ObjectInspector matching the ORC type at
/// <paramref name="columnId"/>. Primitive kinds map to shared writable
/// inspectors; compound kinds recurse via their dedicated inspector classes.
/// </summary>
/// <param name="columnId">index of the type within <paramref name="types"/></param>
/// <param name="types">flattened ORC type list</param>
/// <exception cref="NotSupportedException">
/// for char/varchar without a maximum length, or an unknown kind
/// </exception>
public static ObjectInspector createObjectInspector(int columnId, IList<OrcProto.Type> types)
{
    OrcProto.Type columnType = types[columnId];
    switch (columnType.Kind)
    {
        // Integral primitives.
        case OrcProto.Type.Types.Kind.BOOLEAN:
            return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
        case OrcProto.Type.Types.Kind.BYTE:
            return PrimitiveObjectInspectorFactory.writableByteObjectInspector;
        case OrcProto.Type.Types.Kind.SHORT:
            return PrimitiveObjectInspectorFactory.writableShortObjectInspector;
        case OrcProto.Type.Types.Kind.INT:
            return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
        case OrcProto.Type.Types.Kind.LONG:
            return PrimitiveObjectInspectorFactory.writableLongObjectInspector;

        // Floating point.
        case OrcProto.Type.Types.Kind.FLOAT:
            return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
        case OrcProto.Type.Types.Kind.DOUBLE:
            return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;

        // String and byte data.
        case OrcProto.Type.Types.Kind.STRING:
            return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
        case OrcProto.Type.Types.Kind.BINARY:
            return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector;

        // Bounded character types require an explicit maximum length.
        case OrcProto.Type.Types.Kind.CHAR:
            if (!columnType.HasMaximumLength)
            {
                throw new NotSupportedException(
                    "Illegal use of char type without length in ORC type definition.");
            }
            return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                TypeInfoFactory.getCharTypeInfo((int)columnType.MaximumLength));
        case OrcProto.Type.Types.Kind.VARCHAR:
            if (!columnType.HasMaximumLength)
            {
                throw new NotSupportedException(
                    "Illegal use of varchar type without length in ORC type definition.");
            }
            return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                TypeInfoFactory.getVarcharTypeInfo((int)columnType.MaximumLength));

        // Temporal types.
        case OrcProto.Type.Types.Kind.TIMESTAMP:
            return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector;
        case OrcProto.Type.Types.Kind.DATE:
            return PrimitiveObjectInspectorFactory.writableDateObjectInspector;

        case OrcProto.Type.Types.Kind.DECIMAL:
        {
            // Fall back to the system defaults when precision/scale are absent.
            int decimalPrecision = columnType.HasPrecision
                ? (int)columnType.Precision : HiveDecimal.SYSTEM_DEFAULT_PRECISION;
            int decimalScale = columnType.HasScale
                ? (int)columnType.Scale : HiveDecimal.SYSTEM_DEFAULT_SCALE;
            return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                TypeInfoFactory.getDecimalTypeInfo(decimalPrecision, decimalScale));
        }

        // Compound types recurse through their dedicated inspectors.
        case OrcProto.Type.Types.Kind.STRUCT:
            return new OrcStructInspector(columnId, types);
        case OrcProto.Type.Types.Kind.UNION:
            return new OrcUnion.OrcUnionObjectInspector(columnId, types);
        case OrcProto.Type.Types.Kind.MAP:
            return new OrcMapObjectInspector(columnId, types);
        case OrcProto.Type.Types.Kind.LIST:
            return new OrcListObjectInspector(columnId, types);

        default:
            throw new NotSupportedException("Unknown type " + columnType.Kind);
    }
}
/// <summary>
/// Builds the element inspector for a LIST type; a LIST has exactly one
/// subtype, the element type.
/// </summary>
/// <param name="columnId">index of the LIST type within <paramref name="types"/></param>
/// <param name="types">flattened ORC type list</param>
public OrcListObjectInspector(int columnId, IList<OrcProto.Type> types)
{
    int elementColumn = (int)types[columnId].SubtypesList[0];
    child = createObjectInspector(elementColumn, types);
}
/// <summary>
/// Builds key and value inspectors for a MAP type; the first subtype is the
/// key type and the second is the value type.
/// </summary>
/// <param name="columnId">index of the MAP type within <paramref name="types"/></param>
/// <param name="types">flattened ORC type list</param>
public OrcMapObjectInspector(int columnId, IList<OrcProto.Type> types)
{
    var subtypes = types[columnId].SubtypesList;
    key = createObjectInspector((int)subtypes[0], types);
    value = createObjectInspector((int)subtypes[1], types);
}
/// <summary>
/// Validates that the file's physical types can be read as the requested
/// schema types (allowing only implicit integer widening: SHORT->INT/LONG,
/// INT->LONG) and builds the TreeReaderSchema used to construct tree readers.
/// For ACID files the user row is the ROW field of the outer wrapper STRUCT.
/// </summary>
/// <param name="fileTypes">flattened type list read from the file footer</param>
/// <param name="schemaTypes">flattened type list the caller wants to read</param>
/// <exception cref="IOException">when a column needs an unsupported type conversion</exception>
public static TreeReaderFactory.TreeReaderSchema validateAndCreate(
    IList <OrcProto.Type> fileTypes, IList <OrcProto.Type> schemaTypes) {
  // For ACID, the row is the ROW field in the outer STRUCT.
  bool isAcid = checkAcidSchema(fileTypes);
  IList <OrcProto.Type> rowSchema;
  int rowSubtype;
  if (isAcid) {
    // Skip the ACID wrapper columns; the user row starts right after ROW.
    rowSubtype = OrcRecordUpdater.ROW + 1;
    // NOTE(review): subList is not a BCL member of IList<T> — presumably a
    // project extension mirroring Java's List.subList; confirm it exists.
    rowSchema = fileTypes.subList(rowSubtype, fileTypes.Count);
  } else {
    rowSubtype = 0;
    rowSchema = fileTypes;
  }

  // Do checking on the overlap. Additional columns will be defaulted to NULL.
  int numFileColumns = rowSchema[0].SubtypesCount;
  int numDesiredColumns = schemaTypes[0].SubtypesCount;
  int numReadColumns = Math.Min(numFileColumns, numDesiredColumns);

  /**
   * Check type promotion.
   *
   * Currently, we only support integer type promotions that can be done "implicitly".
   * That is, we know that using a bigger integer tree reader on the original smaller integer
   * column will "just work".
   *
   * In the future, other type promotions might require type conversion.
   */
  // short -> int -> bigint as same integer readers are used for the above types.
  for (int i = 0; i < numReadColumns; i++) {
    // Compare the i-th user column in the file against the requested schema.
    OrcProto.Type fColType = fileTypes[rowSubtype + i];
    OrcProto.Type rColType = schemaTypes[i];
    if (fColType.Kind != rColType.Kind) {
      bool ok = false;
      if (fColType.Kind == OrcProto.Type.Types.Kind.SHORT) {
        if (rColType.Kind == OrcProto.Type.Types.Kind.INT ||
            rColType.Kind == OrcProto.Type.Types.Kind.LONG) {
          // type promotion possible, converting SHORT to INT/LONG requested type
          ok = true;
        }
      } else if (fColType.Kind == OrcProto.Type.Types.Kind.INT) {
        if (rColType.Kind == OrcProto.Type.Types.Kind.LONG) {
          // type promotion possible, converting INT to LONG requested type
          ok = true;
        }
      }
      if (!ok) {
        throw new IOException("ORC does not support type conversion from " +
            fColType.Kind.ToString() + " to " + rColType.Kind.ToString());
      }
    }
  }

  IList <OrcProto.Type> fullSchemaTypes;
  if (isAcid) {
    fullSchemaTypes = new List <OrcProto.Type>();
    // This copies the ACID struct type which is subtype = 0.
    // It has field names "operation" through "row".
    // And we copy the types for all fields EXCEPT ROW (which must be last!).
    for (int i = 0; i < rowSubtype; i++) {
      fullSchemaTypes.Add(fileTypes[i].ToBuilder().Build());
    }
    // Add the row struct type (renumbering its subtype references).
    OrcUtils.appendOrcTypesRebuildSubtypes(fullSchemaTypes, schemaTypes, 0);
  } else {
    fullSchemaTypes = schemaTypes;
  }

  // The inner (user row) struct lives at rowSubtype in the full type list.
  int innerStructSubtype = rowSubtype;

  // LOG.info("Schema evolution: (fileTypes) " + fileTypes.toString() +
  //     " (schemaEvolutionTypes) " + schemaEvolutionTypes.toString());

  return(new TreeReaderFactory.TreeReaderSchema().
      fileTypes(fileTypes).
      schemaTypes(fullSchemaTypes).
      innerStructSubtype(innerStructSubtype));
}
/**
 * Appends a rebuilt copy of the type subtree rooted at columnId to result,
 * in pre-order, and returns the next unconsumed columnId.
 *
 * NOTE: This method ignores the subtype numbers in the OrcProto.Type and rebuilds
 * the subtype numbers based on the length of the result list being appended.
 *
 * @param result destination list receiving the rebuilt types
 * @param types source type list being copied
 * @param columnId index in types of the subtree root to copy
 * @return the columnId following the consumed subtree
 */
public static int appendOrcTypesRebuildSubtypes(
    IList <OrcProto.Type> result, IList <OrcProto.Type> types, int columnId) {
  // Consume the current type; columnId now points past it (at its children).
  OrcProto.Type oldType = types[columnId++];
  // Index this type will occupy in result; compound cases back-patch here.
  int subtype = result.Count;
  OrcProto.Type.Builder builder = OrcProto.Type.CreateBuilder();
  // Compound cases that insert a placeholder themselves set this to false.
  bool needsAdd = true;
  switch (oldType.Kind) {
    case OrcProto.Type.Types.Kind.BOOLEAN:
      builder.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
      break;
    case OrcProto.Type.Types.Kind.BYTE:
      builder.SetKind(OrcProto.Type.Types.Kind.BYTE);
      break;
    case OrcProto.Type.Types.Kind.SHORT:
      builder.SetKind(OrcProto.Type.Types.Kind.SHORT);
      break;
    case OrcProto.Type.Types.Kind.INT:
      builder.SetKind(OrcProto.Type.Types.Kind.INT);
      break;
    case OrcProto.Type.Types.Kind.LONG:
      builder.SetKind(OrcProto.Type.Types.Kind.LONG);
      break;
    case OrcProto.Type.Types.Kind.FLOAT:
      builder.SetKind(OrcProto.Type.Types.Kind.FLOAT);
      break;
    case OrcProto.Type.Types.Kind.DOUBLE:
      builder.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
      break;
    case OrcProto.Type.Types.Kind.STRING:
      builder.SetKind(OrcProto.Type.Types.Kind.STRING);
      break;
    case OrcProto.Type.Types.Kind.CHAR:
      // CHAR/VARCHAR carry their maximum length forward.
      builder.SetKind(OrcProto.Type.Types.Kind.CHAR);
      builder.SetMaximumLength(oldType.MaximumLength);
      break;
    case OrcProto.Type.Types.Kind.VARCHAR:
      builder.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
      builder.SetMaximumLength(oldType.MaximumLength);
      break;
    case OrcProto.Type.Types.Kind.BINARY:
      builder.SetKind(OrcProto.Type.Types.Kind.BINARY);
      break;
    case OrcProto.Type.Types.Kind.TIMESTAMP:
      builder.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
      break;
    case OrcProto.Type.Types.Kind.DATE:
      builder.SetKind(OrcProto.Type.Types.Kind.DATE);
      break;
    case OrcProto.Type.Types.Kind.DECIMAL:
      // DECIMAL carries precision and scale forward.
      builder.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
      builder.SetPrecision(oldType.Precision);
      builder.SetScale(oldType.Scale);
      break;
    case OrcProto.Type.Types.Kind.LIST:
      // The single element type lands immediately after this entry, so the
      // child subtype is subtype + 1 (note ++subtype mutates the local).
      builder.SetKind(OrcProto.Type.Types.Kind.LIST);
      builder.AddSubtypes((uint)++subtype);
      result.Add(builder.Build());
      needsAdd = false;
      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      break;
    case OrcProto.Type.Types.Kind.MAP:
    {
      // Make room for MAP type.
      result.Add(null);
      // Add MAP type pair in order to determine their subtype values:
      // key lands at subtype + 1, value at whatever index follows the key's
      // (possibly compound) subtree.
      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      int subtype2 = result.Count;
      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      builder.SetKind(OrcProto.Type.Types.Kind.MAP);
      builder.AddSubtypes((uint)subtype + 1);
      builder.AddSubtypes((uint)subtype2);
      // Back-patch the placeholder now that child indices are known.
      result[subtype] = builder.Build();
      needsAdd = false;
    }
    break;
    case OrcProto.Type.Types.Kind.STRUCT:
    {
      IList <string> fieldNames = oldType.FieldNamesList;
      // Make room for STRUCT type.
      result.Add(null);
      // Append each field's subtree, recording where each one starts.
      List <int> fieldSubtypes = new List <int>(fieldNames.Count);
      for (int i = 0; i < fieldNames.Count; i++) {
        int fieldSubtype = result.Count;
        fieldSubtypes.Add(fieldSubtype);
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      }
      builder.SetKind(OrcProto.Type.Types.Kind.STRUCT);
      for (int i = 0; i < fieldNames.Count; i++) {
        builder.AddSubtypes((uint)fieldSubtypes[i]);
        builder.AddFieldNames(fieldNames[i]);
      }
      // Back-patch the placeholder with the renumbered field subtypes.
      result[subtype] = builder.Build();
      needsAdd = false;
    }
    break;
    case OrcProto.Type.Types.Kind.UNION:
    {
      int subtypeCount = oldType.SubtypesCount;
      // Make room for UNION type.
      result.Add(null);
      // Append each alternative's subtree, recording where each one starts.
      List <int> unionSubtypes = new List <int>(subtypeCount);
      for (int i = 0; i < subtypeCount; i++) {
        int unionSubtype = result.Count;
        unionSubtypes.Add(unionSubtype);
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      }
      builder.SetKind(OrcProto.Type.Types.Kind.UNION);
      for (int i = 0; i < subtypeCount; i++) {
        builder.AddSubtypes((uint)unionSubtypes[i]);
      }
      // Back-patch the placeholder with the renumbered alternatives.
      result[subtype] = builder.Build();
      needsAdd = false;
    }
    break;
    default:
      throw new ArgumentException("Unknown category: " + oldType.Kind);
  }
  // Primitive (and LIST handled its own add) cases append here.
  if (needsAdd) {
    result.Add(builder.Build());
  }
  return(columnId);
}
/// <summary>
/// Adds the disk ranges of one stream, restricted to the included row groups,
/// to the range list. For each included group the range runs from that
/// group's recorded stream position up to the next group's position (or the
/// end of the stream for the last group), shifted by the stream's offset.
/// </summary>
/// <param name="stream">stream whose ranges are being added</param>
/// <param name="includedRowGroups">per-row-group inclusion flags</param>
/// <param name="isCompressed">whether the stream data is compressed</param>
/// <param name="index">row index holding per-group stream positions</param>
/// <param name="encoding">column encoding (selects the index position slot)</param>
/// <param name="type">column type (selects the index position slot)</param>
/// <param name="compressionSize">compression chunk size</param>
/// <param name="hasNull">whether the column has a present stream</param>
/// <param name="offset">stream start offset within the stripe</param>
/// <param name="length">total stream length</param>
/// <param name="list">accumulator for the resulting disk ranges</param>
/// <param name="doMergeBuffers">whether adjacent ranges may be merged</param>
public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
    bool[] includedRowGroups, bool isCompressed, OrcProto.RowIndex index,
    OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize,
    bool hasNull, long offset, long length, DiskRangeList.CreateHelper list,
    bool doMergeBuffers)
{
    for (int group = 0; group < includedRowGroups.Length; ++group)
    {
        if (!includedRowGroups[group])
        {
            continue;
        }
        int posn = getIndexPosition(
            encoding.Kind, type.Kind, stream.Kind, isCompressed, hasNull);
        long start = (long)index.EntryList[group].PositionsList[posn];
        bool isLast = group == (includedRowGroups.Length - 1);
        // BUG FIX: the next group's position was previously cast to (int),
        // truncating stream offsets beyond int.MaxValue (~2 GB). Use (long),
        // matching the cast used for 'start' on the same PositionsList and the
        // declared type of nextGroupOffset.
        long nextGroupOffset = isLast
            ? length
            : (long)index.EntryList[group + 1].PositionsList[posn];
        start += offset;
        long end = offset + estimateRgEndOffset(
            isCompressed, isLast, nextGroupOffset, length, compressionSize);
        list.addOrMerge(start, end, doMergeBuffers, true);
    }
}