Пример #1
0
 /**
  * Serializes one schema node into the footer builder and then recurses into
  * its children, so a parent type entry is always added before the entries
  * of its children.
  *
  * @param builder footer builder that receives the type entries
  * @param schema  schema node to serialize
  * @throws ArgumentException if the category of the schema is not handled
  */
 private static void writeTypes(OrcProto.Footer.Builder builder,
                                TypeDescription schema)
 {
     OrcProto.Type.Builder entry = OrcProto.Type.CreateBuilder();
     IList<TypeDescription> subSchemas = schema.getChildren();
     var category = schema.getCategory();

     // Phase 1: translate the schema category into the protobuf kind.
     OrcProto.Type.Types.Kind kind;
     switch (category)
     {
         case Category.BOOLEAN:
             kind = OrcProto.Type.Types.Kind.BOOLEAN;
             break;
         case Category.BYTE:
             kind = OrcProto.Type.Types.Kind.BYTE;
             break;
         case Category.SHORT:
             kind = OrcProto.Type.Types.Kind.SHORT;
             break;
         case Category.INT:
             kind = OrcProto.Type.Types.Kind.INT;
             break;
         case Category.LONG:
             kind = OrcProto.Type.Types.Kind.LONG;
             break;
         case Category.FLOAT:
             kind = OrcProto.Type.Types.Kind.FLOAT;
             break;
         case Category.DOUBLE:
             kind = OrcProto.Type.Types.Kind.DOUBLE;
             break;
         case Category.STRING:
             kind = OrcProto.Type.Types.Kind.STRING;
             break;
         case Category.CHAR:
             kind = OrcProto.Type.Types.Kind.CHAR;
             break;
         case Category.VARCHAR:
             kind = OrcProto.Type.Types.Kind.VARCHAR;
             break;
         case Category.BINARY:
             kind = OrcProto.Type.Types.Kind.BINARY;
             break;
         case Category.TIMESTAMP:
             kind = OrcProto.Type.Types.Kind.TIMESTAMP;
             break;
         case Category.DATE:
             kind = OrcProto.Type.Types.Kind.DATE;
             break;
         case Category.DECIMAL:
             kind = OrcProto.Type.Types.Kind.DECIMAL;
             break;
         case Category.LIST:
             kind = OrcProto.Type.Types.Kind.LIST;
             break;
         case Category.MAP:
             kind = OrcProto.Type.Types.Kind.MAP;
             break;
         case Category.STRUCT:
             kind = OrcProto.Type.Types.Kind.STRUCT;
             break;
         case Category.UNION:
             kind = OrcProto.Type.Types.Kind.UNION;
             break;
         default:
             throw new ArgumentException("Unknown category: " +
               category);
     }
     entry.Kind = kind;

     // Phase 2: attach the attributes that only some categories carry.
     switch (category)
     {
         case Category.CHAR:
         case Category.VARCHAR:
             entry.MaximumLength = (uint)schema.getMaxLength();
             break;
         case Category.DECIMAL:
             entry.Precision = (uint)schema.getPrecision();
             entry.Scale = (uint)schema.getScale();
             break;
         case Category.LIST:
             // Only the first child id is recorded for a list.
             entry.AddSubtypes((uint)subSchemas[0].getId());
             break;
         case Category.MAP:
         case Category.STRUCT:
         case Category.UNION:
             foreach (TypeDescription sub in subSchemas)
             {
                 entry.AddSubtypes((uint)sub.getId());
             }
             if (category == Category.STRUCT)
             {
                 // Structs additionally record their field names.
                 foreach (string fieldName in schema.getFieldNames())
                 {
                     entry.AddFieldNames(fieldName);
                 }
             }
             break;
     }

     builder.AddTypes(entry);

     if (subSchemas == null)
     {
         return;
     }
     foreach (TypeDescription sub in subSchemas)
     {
         writeTypes(builder, sub);
     }
 }
Пример #2
0
 /**
  * Computes a raw data size figure for the column handled by the given
  * writer, using the writer's file statistics and the JavaDataModel size
  * constants. Compound columns are sized as the sum of their children.
  *
  * @param child  writer whose file statistics are inspected
  * @param schema schema node that corresponds to the writer
  * @return the computed size; 0 for an unrecognized category
  */
 private long getRawDataSize(TreeWriter child, TypeDescription schema)
 {
     long valueCount = child.fileStatistics.getNumberOfValues();
     switch (schema.getCategory())
     {
         case Category.BOOLEAN:
         case Category.BYTE:
         case Category.SHORT:
         case Category.INT:
         case Category.FLOAT:
             return valueCount * JavaDataModel.Four;

         case Category.LONG:
         case Category.DOUBLE:
             return valueCount * JavaDataModel.Eight;

         case Category.STRING:
         case Category.VARCHAR:
         case Category.CHAR:
             {
                 // ORC strings are converted to java Strings, so JavaDataModel
                 // is used to compute the overall size of the strings.
                 StringColumnStatistics stringStats =
                     (StringColumnStatistics)child.fileStatistics;
                 // A count of zero is treated as one, which also keeps the
                 // division below from dividing by zero.
                 long effectiveCount = valueCount == 0 ? 1 : valueCount;
                 int averageLength = (int)(stringStats.getSum() / effectiveCount);
                 return effectiveCount * JavaDataModel.lengthForStringOfLength(averageLength);
             }

         case Category.DECIMAL:
             return valueCount * JavaDataModel.lengthOfDecimal();

         case Category.DATE:
             return valueCount * JavaDataModel.lengthOfDate();

         case Category.BINARY:
             {
                 // The statistics already hold the total length of the blobs.
                 BinaryColumnStatistics binaryStats =
                     (BinaryColumnStatistics)child.fileStatistics;
                 return binaryStats.getSum();
             }

         case Category.TIMESTAMP:
             return valueCount * JavaDataModel.lengthOfTimestamp();

         case Category.LIST:
         case Category.MAP:
         case Category.UNION:
         case Category.STRUCT:
             {
                 // Compound types: add up the sizes of all child columns.
                 long sum = 0;
                 TreeWriter[] nestedWriters = child.getChildrenWriters();
                 IList<TypeDescription> nestedTypes = schema.getChildren();
                 for (int index = 0; index < nestedWriters.Length; index++)
                 {
                     sum += getRawDataSize(nestedWriters[index], nestedTypes[index]);
                 }
                 return sum;
             }

         default:
             LOG.debug("Unknown object inspector category.");
             return 0;
     }
 }
Пример #3
0
 /**
  * Builds the writer for a struct column and creates one child writer for
  * every child of the schema. When an inspector is supplied, its struct
  * field references provide the child inspectors; schema children without a
  * matching field reference get a null inspector.
  *
  * @param columnId  passed through to the base writer
  * @param inspector struct object inspector, or null
  * @param schema    schema of the struct column
  * @param writer    stream factory handed to the base and child writers
  * @param nullable  passed through to the base writer
  */
 public StructTreeWriter(int columnId,
                  ObjectInspector inspector,
                  TypeDescription schema,
                  StreamFactory writer,
                  bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     IList<TypeDescription> fieldTypes = schema.getChildren();
     if (inspector != null)
     {
         fields = ((StructObjectInspector)inspector).getAllStructFieldRefs();
     }

     int fieldCount = fieldTypes.Count;
     childrenWriters = new TreeWriter[fieldCount];
     for (int index = 0; index < fieldCount; index++)
     {
         // The schema may have more children than the inspector has fields.
         bool hasFieldRef = fields != null && index < fields.Count;
         ObjectInspector fieldInspector =
             hasFieldRef ? fields[index].getFieldObjectInspector() : null;
         childrenWriters[index] =
             createTreeWriter(fieldInspector, fieldTypes[index], writer, true);
     }
     recordPosition(rowIndexPosition);
 }
Пример #4
0
 /**
  * Builds the writer for a union column: one child writer per schema child
  * plus a run-length byte writer over the column's DATA stream for the tags.
  *
  * @param columnId  column id; also used to create the tag stream
  * @param inspector union object inspector, or null
  * @param schema    schema of the union column
  * @param writer    stream factory used for the streams and child writers
  * @param nullable  passed through to the base writer
  */
 public UnionTreeWriter(int columnId,
               ObjectInspector inspector,
               TypeDescription schema,
               StreamFactory writer,
               bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     // Without an inspector every alternative is created with a null inspector.
     IList<ObjectInspector> alternatives = inspector == null
         ? null
         : ((UnionObjectInspector)inspector).getObjectInspectors();

     IList<TypeDescription> alternativeTypes = schema.getChildren();
     childrenWriters = new TreeWriter[alternativeTypes.Count];
     for (int index = 0; index < childrenWriters.Length; index++)
     {
         ObjectInspector alternativeInspector = null;
         if (alternatives != null)
         {
             alternativeInspector = alternatives[index];
         }
         childrenWriters[index] = createTreeWriter(
             alternativeInspector, alternativeTypes[index], writer, true);
     }

     var tagStream = writer.createStream(columnId,
         OrcProto.Stream.Types.Kind.DATA);
     tags = new RunLengthByteWriter(tagStream);
     recordPosition(rowIndexPosition);
 }
Пример #5
0
 /**
  * Builds the writer for a map column: a key writer, a value writer and an
  * integer writer over the column's LENGTH stream.
  *
  * @param columnId  column id; also used to create the LENGTH stream
  * @param inspector map object inspector, or null
  * @param schema    schema of the map column; child 0 is used for the key
  *                  writer and child 1 for the value writer
  * @param writer    stream factory used for the streams and child writers
  * @param nullable  passed through to the base writer
  */
 public MapTreeWriter(int columnId,
               ObjectInspector inspector,
               TypeDescription schema,
               StreamFactory writer,
               bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);

     ObjectInspector keyOI = null;
     ObjectInspector valueOI = null;
     if (inspector != null)
     {
         MapObjectInspector mapOI = (MapObjectInspector)inspector;
         keyOI = mapOI.getMapKeyObjectInspector();
         valueOI = mapOI.getMapValueObjectInspector();
     }

     childrenWriters = new TreeWriter[2];
     IList<TypeDescription> entryTypes = schema.getChildren();
     TypeDescription keyType = entryTypes[0];
     childrenWriters[0] = createTreeWriter(keyOI, keyType, writer, true);
     TypeDescription valueType = entryTypes[1];
     childrenWriters[1] = createTreeWriter(valueOI, valueType, writer, true);

     var lengthStream = writer.createStream(columnId,
         OrcProto.Stream.Types.Kind.LENGTH);
     lengths = createIntegerWriter(lengthStream, false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Пример #6
0
 /**
  * Builds the writer for a list column: a single element writer and an
  * integer writer over the column's LENGTH stream.
  *
  * @param columnId  column id; also used to create the LENGTH stream
  * @param inspector list object inspector, or null
  * @param schema    schema of the list column; child 0 is the element type
  * @param writer    stream factory used for the streams and child writer
  * @param nullable  passed through to the base writer
  */
 public ListTreeWriter(int columnId,
                ObjectInspector inspector,
                TypeDescription schema,
                StreamFactory writer,
                bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);

     ObjectInspector elementOI = null;
     if (inspector != null)
     {
         ListObjectInspector listOI = (ListObjectInspector)inspector;
         elementOI = listOI.getListElementObjectInspector();
     }

     childrenWriters = new TreeWriter[1];
     TypeDescription elementType = schema.getChildren()[0];
     childrenWriters[0] = createTreeWriter(elementOI, elementType, writer, true);

     var lengthStream = writer.createStream(columnId,
         OrcProto.Stream.Types.Kind.LENGTH);
     lengths = createIntegerWriter(lengthStream, false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Пример #7
0
 /**
  * Converts one schema node into a protobuf type, appends it to the result
  * list and then recurses into the children, so a parent always precedes
  * its children in the list. Subtype ids are taken from the ids stored in
  * the schema nodes.
  *
  * @param result    list that receives the built types
  * @param typeDescr schema node to convert
  * @throws ArgumentException if the category of the node is not handled
  */
 private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr)
 {
     OrcProto.Type.Builder entry = OrcProto.Type.CreateBuilder();
     IList<TypeDescription> subTypes = typeDescr.getChildren();
     var category = typeDescr.getCategory();

     // Phase 1: translate the schema category into the protobuf kind.
     OrcProto.Type.Types.Kind kind;
     switch (category)
     {
         case Category.BOOLEAN:
             kind = OrcProto.Type.Types.Kind.BOOLEAN;
             break;
         case Category.BYTE:
             kind = OrcProto.Type.Types.Kind.BYTE;
             break;
         case Category.SHORT:
             kind = OrcProto.Type.Types.Kind.SHORT;
             break;
         case Category.INT:
             kind = OrcProto.Type.Types.Kind.INT;
             break;
         case Category.LONG:
             kind = OrcProto.Type.Types.Kind.LONG;
             break;
         case Category.FLOAT:
             kind = OrcProto.Type.Types.Kind.FLOAT;
             break;
         case Category.DOUBLE:
             kind = OrcProto.Type.Types.Kind.DOUBLE;
             break;
         case Category.STRING:
             kind = OrcProto.Type.Types.Kind.STRING;
             break;
         case Category.CHAR:
             kind = OrcProto.Type.Types.Kind.CHAR;
             break;
         case Category.VARCHAR:
             kind = OrcProto.Type.Types.Kind.VARCHAR;
             break;
         case Category.BINARY:
             kind = OrcProto.Type.Types.Kind.BINARY;
             break;
         case Category.TIMESTAMP:
             kind = OrcProto.Type.Types.Kind.TIMESTAMP;
             break;
         case Category.DATE:
             kind = OrcProto.Type.Types.Kind.DATE;
             break;
         case Category.DECIMAL:
             kind = OrcProto.Type.Types.Kind.DECIMAL;
             break;
         case Category.LIST:
             kind = OrcProto.Type.Types.Kind.LIST;
             break;
         case Category.MAP:
             kind = OrcProto.Type.Types.Kind.MAP;
             break;
         case Category.STRUCT:
             kind = OrcProto.Type.Types.Kind.STRUCT;
             break;
         case Category.UNION:
             kind = OrcProto.Type.Types.Kind.UNION;
             break;
         default:
             throw new ArgumentException("Unknown category: " + category);
     }
     entry.SetKind(kind);

     // Phase 2: attach the attributes that only some categories carry.
     switch (category)
     {
         case Category.CHAR:
         case Category.VARCHAR:
             entry.SetMaximumLength((uint)typeDescr.getMaxLength());
             break;
         case Category.DECIMAL:
             entry.SetPrecision((uint)typeDescr.getPrecision());
             entry.SetScale((uint)typeDescr.getScale());
             break;
         case Category.LIST:
             // Only the first child id is recorded for a list.
             entry.AddSubtypes((uint)subTypes[0].getId());
             break;
         case Category.MAP:
         case Category.STRUCT:
         case Category.UNION:
             foreach (TypeDescription sub in subTypes)
             {
                 entry.AddSubtypes((uint)sub.getId());
             }
             if (category == Category.STRUCT)
             {
                 // Structs additionally record their field names.
                 foreach (string fieldName in typeDescr.getFieldNames())
                 {
                     entry.AddFieldNames(fieldName);
                 }
             }
             break;
     }

     result.Add(entry.Build());

     if (subTypes == null)
     {
         return;
     }
     foreach (TypeDescription sub in subTypes)
     {
         appendOrcTypes(result, sub);
     }
 }
Пример #8
0
 /**
  * Builds the include mask for a comma separated list of column names.
  * The mask has one entry per flattened column id, i.e.
  * schema.getMaximumId() + 1 entries, and an entry is true when the column
  * with that id belongs to a selected top-level column.
  *
  * For example:
  * selectedColumns - a,b,c
  * allColumns - a,b,c,d
  * If column c is a complex type, say list<string>, and the other columns are
  * primitives, the result is [false, true, true, true, true, false].
  * Index 0 is the root struct, which is never set by name. Indexes 1 and 2
  * are columns a and b, indexes 3 and 4 are column c (list<string> flattens
  * into 2 columns) and index 5 is column d, which was not selected.
  *
  * The selection "*" sets every entry, including index 0. Names are only
  * resolved when the schema is a struct; names that findColumn cannot
  * resolve are skipped.
  *
  * @param selectedColumns - comma separated list of selected column names,
  *                          "*" for all columns, or null for none
  * @param schema       - object schema
  * @return - bool array with true value set for the specified column names
  */
 public static bool[] includeColumns(string selectedColumns, TypeDescription schema)
 {
     bool[] mask = new bool[schema.getMaximumId() + 1];

     if ("*".Equals(selectedColumns))
     {
         for (int id = 0; id < mask.Length; ++id)
         {
             mask[id] = true;
         }
         return mask;
     }

     // Names can only be resolved against the fields of a struct schema.
     if (selectedColumns == null || schema.getCategory() != Category.STRUCT)
     {
         return mask;
     }

     IList<string> topLevelNames = schema.getFieldNames();
     IList<TypeDescription> topLevelTypes = schema.getChildren();
     string[] requestedNames = selectedColumns.Split(',');
     foreach (string requested in requestedNames)
     {
         TypeDescription match = findColumn(requested, topLevelNames, topLevelTypes);
         if (match == null)
         {
             continue;
         }
         // Mark the column itself and everything nested inside it.
         for (int id = match.getId(); id <= match.getMaximumId(); ++id)
         {
             mask[id] = true;
         }
     }
     return mask;
 }
Пример #9
0
        /**
         * Converts a schema node and all of its descendants into protobuf types
         * and appends them to the result list.
         *
         * NOTE: This method ignores the subtype numbers stored in the
         * TypeDescription and rebuilds them from the position each type takes
         * in the result list being appended to.
         *
         * @param result    list that receives the built types
         * @param typeDescr schema node to convert
         * @throws ArgumentException if the category of the node is not handled
         */
        public static void appendOrcTypesRebuildSubtypes(
            IList<OrcProto.Type> result,
            TypeDescription typeDescr)
        {
            // Position this node's own type occupies in the result list.
            int ownIndex = result.Count;
            OrcProto.Type.Builder entry = OrcProto.Type.CreateBuilder();
            IList<TypeDescription> nested = typeDescr.getChildren();

            switch (typeDescr.getCategory())
            {
                // Primitive categories only set their kind (plus attributes) and
                // are appended after the switch.
                case Category.BOOLEAN:
                    entry.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
                    break;
                case Category.BYTE:
                    entry.SetKind(OrcProto.Type.Types.Kind.BYTE);
                    break;
                case Category.SHORT:
                    entry.SetKind(OrcProto.Type.Types.Kind.SHORT);
                    break;
                case Category.INT:
                    entry.SetKind(OrcProto.Type.Types.Kind.INT);
                    break;
                case Category.LONG:
                    entry.SetKind(OrcProto.Type.Types.Kind.LONG);
                    break;
                case Category.FLOAT:
                    entry.SetKind(OrcProto.Type.Types.Kind.FLOAT);
                    break;
                case Category.DOUBLE:
                    entry.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
                    break;
                case Category.STRING:
                    entry.SetKind(OrcProto.Type.Types.Kind.STRING);
                    break;
                case Category.CHAR:
                    entry.SetKind(OrcProto.Type.Types.Kind.CHAR);
                    entry.SetMaximumLength((uint)typeDescr.getMaxLength());
                    break;
                case Category.VARCHAR:
                    entry.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
                    entry.SetMaximumLength((uint)typeDescr.getMaxLength());
                    break;
                case Category.BINARY:
                    entry.SetKind(OrcProto.Type.Types.Kind.BINARY);
                    break;
                case Category.TIMESTAMP:
                    entry.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
                    break;
                case Category.DATE:
                    entry.SetKind(OrcProto.Type.Types.Kind.DATE);
                    break;
                case Category.DECIMAL:
                    entry.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
                    entry.SetPrecision((uint)typeDescr.getPrecision());
                    entry.SetScale((uint)typeDescr.getScale());
                    break;

                // Compound categories place their own type themselves and
                // return, so the append after the switch is skipped.
                case Category.LIST:
                    {
                        // The element type lands directly after the list type.
                        entry.SetKind(OrcProto.Type.Types.Kind.LIST);
                        entry.AddSubtypes((uint)(ownIndex + 1));
                        result.Add(entry.Build());
                        appendOrcTypesRebuildSubtypes(result, nested[0]);
                        return;
                    }
                case Category.MAP:
                    {
                        // Make room for MAP type.
                        result.Add(null);

                        // Append the key and value types first so that the
                        // position of the value type is known.
                        appendOrcTypesRebuildSubtypes(result, nested[0]);
                        int valueIndex = result.Count;
                        appendOrcTypesRebuildSubtypes(result, nested[1]);

                        entry.SetKind(OrcProto.Type.Types.Kind.MAP);
                        entry.AddSubtypes((uint)ownIndex + 1);
                        entry.AddSubtypes((uint)valueIndex);
                        result[ownIndex] = entry.Build();
                        return;
                    }
                case Category.STRUCT:
                    {
                        IList<string> names = typeDescr.getFieldNames();

                        // Make room for STRUCT type.
                        result.Add(null);

                        // Remember where every field type starts while appending.
                        List<int> fieldIndexes = new List<int>(names.Count);
                        foreach (TypeDescription fieldType in nested)
                        {
                            fieldIndexes.Add(result.Count);
                            appendOrcTypesRebuildSubtypes(result, fieldType);
                        }

                        entry.SetKind(OrcProto.Type.Types.Kind.STRUCT);
                        for (int field = 0; field < names.Count; field++)
                        {
                            entry.AddSubtypes((uint)fieldIndexes[field]);
                            entry.AddFieldNames(names[field]);
                        }
                        result[ownIndex] = entry.Build();
                        return;
                    }
                case Category.UNION:
                    {
                        // Make room for UNION type.
                        result.Add(null);

                        // Remember where every alternative starts while appending.
                        List<int> alternativeIndexes = new List<int>(nested.Count);
                        foreach (TypeDescription alternative in nested)
                        {
                            alternativeIndexes.Add(result.Count);
                            appendOrcTypesRebuildSubtypes(result, alternative);
                        }

                        entry.SetKind(OrcProto.Type.Types.Kind.UNION);
                        for (int choice = 0; choice < nested.Count; choice++)
                        {
                            entry.AddSubtypes((uint)alternativeIndexes[choice]);
                        }
                        result[ownIndex] = entry.Build();
                        return;
                    }
                default:
                    throw new ArgumentException("Unknown category: " + typeDescr.getCategory());
            }

            // Only primitive categories reach this point.
            result.Add(entry.Build());
        }