public static ColumnStatisticsImpl create(TypeDescription schema)
 {
     switch (schema.getCategory())
     {
         case Category.BOOLEAN:
             return new BooleanStatisticsImpl();
         case Category.BYTE:
         case Category.SHORT:
         case Category.INT:
         case Category.LONG:
             return new IntegerStatisticsImpl();
         case Category.FLOAT:
         case Category.DOUBLE:
             return new DoubleStatisticsImpl();
         case Category.STRING:
         case Category.CHAR:
         case Category.VARCHAR:
             return new StringStatisticsImpl();
         case Category.DECIMAL:
             return new DecimalStatisticsImpl();
         case Category.DATE:
             return new DateStatisticsImpl();
         case Category.TIMESTAMP:
             return new TimestampStatisticsImpl();
         case Category.BINARY:
             return new BinaryStatisticsImpl();
         default:
             return new ColumnStatisticsImpl();
     }
 }
        /**
         * @param typeDescr
         * @return ORC types for the ACID event based on the row's type description
         */
        public static List<OrcProto.Type> createEventSchema(TypeDescription typeDescr)
        {
            List<OrcProto.Type> result = new List<OrcProto.Type>();

            OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
            type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
            type.AddRangeFieldNames(acidEventFieldNames);
            for (int i = 0; i < acidEventFieldNames.Length; i++)
            {
                type.AddSubtypes((uint)i + 1);
            }
            result.Add(type.Build());

            // Automatically add all fields except the last (ROW).
            for (int i = 0; i < acidEventOrcTypeKinds.Length - 1; i++)
            {
                type.Clear();
                type.SetKind(acidEventOrcTypeKinds[i]);
                result.Add(type.Build());
            }

            OrcUtils.appendOrcTypesRebuildSubtypes(result, typeDescr);
            return result;
        }
Example #3
0
        /**
         * @param typeDescr
         * @return ORC types for the ACID event based on the row's type description
         */
        public static List <OrcProto.Type> createEventSchema(TypeDescription typeDescr)
        {
            List <OrcProto.Type> result = new List <OrcProto.Type>();

            OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
            type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
            type.AddRangeFieldNames(acidEventFieldNames);
            for (int i = 0; i < acidEventFieldNames.Length; i++)
            {
                type.AddSubtypes((uint)i + 1);
            }
            result.Add(type.Build());

            // Automatically add all fields except the last (ROW).
            for (int i = 0; i < acidEventOrcTypeKinds.Length - 1; i++)
            {
                type.Clear();
                type.SetKind(acidEventOrcTypeKinds[i]);
                result.Add(type.Build());
            }

            OrcUtils.appendOrcTypesRebuildSubtypes(result, typeDescr);
            return(result);
        }
Example #4
0
 private long getRawDataSize(TreeWriter child, TypeDescription schema)
 {
     long total = 0;
     long numVals = child.fileStatistics.getNumberOfValues();
     switch (schema.getCategory())
     {
         case Category.BOOLEAN:
         case Category.BYTE:
         case Category.SHORT:
         case Category.INT:
         case Category.FLOAT:
             return numVals * JavaDataModel.Four;
         case Category.LONG:
         case Category.DOUBLE:
             return numVals * JavaDataModel.Eight;
         case Category.STRING:
         case Category.VARCHAR:
         case Category.CHAR:
             // ORC strings are converted to java Strings. so use JavaDataModel to
             // compute the overall size of strings
             StringColumnStatistics scs = (StringColumnStatistics)child.fileStatistics;
             numVals = numVals == 0 ? 1 : numVals;
             int avgStringLen = (int)(scs.getSum() / numVals);
             return numVals * JavaDataModel.lengthForStringOfLength(avgStringLen);
         case Category.DECIMAL:
             return numVals * JavaDataModel.lengthOfDecimal();
         case Category.DATE:
             return numVals * JavaDataModel.lengthOfDate();
         case Category.BINARY:
             // get total length of binary blob
             BinaryColumnStatistics bcs = (BinaryColumnStatistics)child.fileStatistics;
             return bcs.getSum();
         case Category.TIMESTAMP:
             return numVals * JavaDataModel.lengthOfTimestamp();
         case Category.LIST:
         case Category.MAP:
         case Category.UNION:
         case Category.STRUCT:
             {
                 TreeWriter[] childWriters = child.getChildrenWriters();
                 IList<TypeDescription> childTypes = schema.getChildren();
                 for (int i = 0; i < childWriters.Length; ++i)
                 {
                     total += getRawDataSize(childWriters[i], childTypes[i]);
                 }
                 break;
             }
         default:
             LOG.debug("Unknown object inspector category.");
             break;
     }
     return total;
 }
Example #5
0
 private static TreeWriter createTreeWriter(ObjectInspector inspector,
                                            TypeDescription schema,
                                            StreamFactory streamFactory,
                                            bool nullable)
 {
     switch (schema.getCategory())
     {
         case Category.BOOLEAN:
             return new BooleanTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.BYTE:
             return new ByteTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.SHORT:
         case Category.INT:
         case Category.LONG:
             return new IntegerTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.FLOAT:
             return new FloatTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.DOUBLE:
             return new DoubleTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.STRING:
             return new StringTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.CHAR:
             return new CharTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.VARCHAR:
             return new VarcharTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.BINARY:
             return new BinaryTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.TIMESTAMP:
             return new TimestampTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.DATE:
             return new DateTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.DECIMAL:
             return new DecimalTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.STRUCT:
             return new StructTreeWriter(streamFactory.getNextColumnId(),
                 inspector, schema, streamFactory, nullable);
         case Category.MAP:
             return new MapTreeWriter(streamFactory.getNextColumnId(), inspector,
                 schema, streamFactory, nullable);
         case Category.LIST:
             return new ListTreeWriter(streamFactory.getNextColumnId(), inspector,
                 schema, streamFactory, nullable);
         case Category.UNION:
             return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector,
                 schema, streamFactory, nullable);
         default:
             throw new ArgumentException("Bad category: " +
                 schema.getCategory());
     }
 }
Example #6
0
 public UnionTreeWriter(int columnId,
               ObjectInspector inspector,
               TypeDescription schema,
               StreamFactory writer,
               bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     IList<ObjectInspector> choices = null;
     if (inspector != null)
     {
         UnionObjectInspector insp = (UnionObjectInspector)inspector;
         choices = insp.getObjectInspectors();
     }
     IList<TypeDescription> children = schema.getChildren();
     childrenWriters = new TreeWriter[children.Count];
     for (int i = 0; i < childrenWriters.Length; ++i)
     {
         childrenWriters[i] = createTreeWriter(
             choices != null ? choices[i] : null, children[i], writer, true);
     }
     tags =
       new RunLengthByteWriter(writer.createStream(columnId,
           OrcProto.Stream.Types.Kind.DATA));
     recordPosition(rowIndexPosition);
 }
Example #7
0
 public TimestampTreeWriter(int columnId,
                  ObjectInspector inspector,
                  TypeDescription schema,
                  StreamFactory writer,
                  bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);
     this.seconds = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA), true, isDirectV2, writer);
     this.nanos = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.SECONDARY), false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
     TimeZoneInfo timeZone;
     this.base_timestamp = TimeZones.GetBaseTimestamp(writer.Timezone, out timeZone);
 }
Example #8
0
 public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr)
 {
     List<OrcProto.Type> result = new List<OrcProto.Type>();
     appendOrcTypes(result, typeDescr);
     return result;
 }
Example #9
0
 public IntegerTreeWriter(int columnId,
                   ObjectInspector inspector,
                   TypeDescription schema,
                   StreamFactory writer,
                   bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     OutStream @out = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA);
     this.isDirectV2 = isNewWriteFormat(writer);
     this.writer = createIntegerWriter(@out, true, isDirectV2, writer);
     if (inspector is IntObjectInspector)
     {
         intInspector = (IntObjectInspector)inspector;
         shortInspector = null;
         longInspector = null;
     }
     else
     {
         intInspector = null;
         if (inspector is LongObjectInspector)
         {
             longInspector = (LongObjectInspector)inspector;
             shortInspector = null;
         }
         else
         {
             shortInspector = (ShortObjectInspector)inspector;
             longInspector = null;
         }
     }
     recordPosition(rowIndexPosition);
 }
Example #10
0
 public DecimalTreeWriter(int columnId,
                     ObjectInspector inspector,
                     TypeDescription schema,
                     StreamFactory writer,
                     bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);
     valueStream = writer.createStream(id, OrcProto.Stream.Types.Kind.DATA);
     this.scaleStream = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.SECONDARY), true, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Example #11
0
 /**
  * Set the schema on read type description.
  */
 public RecordReaderOptions schema(TypeDescription schema)
 {
     this._schema = schema;
     return this;
 }
Example #12
0
        public static TypeDescription getDesiredRowTypeDescr(Configuration conf)
        {
            string columnNameProperty = null;
            string columnTypeProperty = null;

            IList <string>         schemaEvolutionColumnNames = null;
            List <TypeDescription> schemaEvolutionTypeDescrs  = null;

            bool haveSchemaEvolutionProperties = false;

            if (HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION))
            {
                columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
                columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);

                haveSchemaEvolutionProperties =
                    (columnNameProperty != null && columnTypeProperty != null);

                if (haveSchemaEvolutionProperties)
                {
                    schemaEvolutionColumnNames = columnNameProperty.Split(',');
                    if (schemaEvolutionColumnNames.Count == 0)
                    {
                        haveSchemaEvolutionProperties = false;
                    }
                    else
                    {
                        schemaEvolutionTypeDescrs =
                            OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
                        if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count)
                        {
                            haveSchemaEvolutionProperties = false;
                        }
                    }
                }
            }

            if (!haveSchemaEvolutionProperties)
            {
                // Try regular properties;
                columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
                columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
                if (columnTypeProperty == null || columnNameProperty == null)
                {
                    return(null);
                }

                schemaEvolutionColumnNames = columnNameProperty.Split(',');
                if (schemaEvolutionColumnNames.Count == 0)
                {
                    return(null);
                }
                schemaEvolutionTypeDescrs =
                    OrcUtils.typeDescriptionsFromHiveTypeProperty(columnTypeProperty);
                if (schemaEvolutionTypeDescrs.Count != schemaEvolutionColumnNames.Count)
                {
                    return(null);
                }
            }

            // Desired schema does not include virtual columns or partition columns.
            TypeDescription result = TypeDescription.createStruct();

            for (int i = 0; i < schemaEvolutionColumnNames.Count; i++)
            {
                result.addField(schemaEvolutionColumnNames[i], schemaEvolutionTypeDescrs[i]);
            }

            return(result);
        }
Example #13
0
        /**
         * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
         * numbers based on the length of the result list being appended.
         *
         * @param result
         * @param typeInfo
         */
        public static void appendOrcTypesRebuildSubtypes(
            IList <OrcProto.Type> result,
            TypeDescription typeDescr)
        {
            int subtype = result.Count;

            OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
            bool needsAdd = true;
            IList <TypeDescription> children = typeDescr.getChildren();

            switch (typeDescr.getCategory())
            {
            case Category.BOOLEAN:
                type.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
                break;

            case Category.BYTE:
                type.SetKind(OrcProto.Type.Types.Kind.BYTE);
                break;

            case Category.SHORT:
                type.SetKind(OrcProto.Type.Types.Kind.SHORT);
                break;

            case Category.INT:
                type.SetKind(OrcProto.Type.Types.Kind.INT);
                break;

            case Category.LONG:
                type.SetKind(OrcProto.Type.Types.Kind.LONG);
                break;

            case Category.FLOAT:
                type.SetKind(OrcProto.Type.Types.Kind.FLOAT);
                break;

            case Category.DOUBLE:
                type.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
                break;

            case Category.STRING:
                type.SetKind(OrcProto.Type.Types.Kind.STRING);
                break;

            case Category.CHAR:
                type.SetKind(OrcProto.Type.Types.Kind.CHAR);
                type.SetMaximumLength((uint)typeDescr.getMaxLength());
                break;

            case Category.VARCHAR:
                type.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
                type.SetMaximumLength((uint)typeDescr.getMaxLength());
                break;

            case Category.BINARY:
                type.SetKind(OrcProto.Type.Types.Kind.BINARY);
                break;

            case Category.TIMESTAMP:
                type.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
                break;

            case Category.DATE:
                type.SetKind(OrcProto.Type.Types.Kind.DATE);
                break;

            case Category.DECIMAL:
                type.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
                type.SetPrecision((uint)typeDescr.getPrecision());
                type.SetScale((uint)typeDescr.getScale());
                break;

            case Category.LIST:
                type.SetKind(OrcProto.Type.Types.Kind.LIST);
                type.AddSubtypes((uint)++subtype);
                result.Add(type.Build());
                needsAdd = false;
                appendOrcTypesRebuildSubtypes(result, children[0]);
                break;

            case Category.MAP:
            {
                // Make room for MAP type.
                result.Add(null);

                // Add MAP type pair in order to determine their subtype values.
                appendOrcTypesRebuildSubtypes(result, children[0]);
                int subtype2 = result.Count;
                appendOrcTypesRebuildSubtypes(result, children[1]);
                type.SetKind(OrcProto.Type.Types.Kind.MAP);
                type.AddSubtypes((uint)subtype + 1);
                type.AddSubtypes((uint)subtype2);
                result[subtype] = type.Build();
                needsAdd        = false;
            }
            break;

            case Category.STRUCT:
            {
                IList <String> fieldNames = typeDescr.getFieldNames();

                // Make room for STRUCT type.
                result.Add(null);

                List <int> fieldSubtypes = new List <int>(fieldNames.Count);
                foreach (TypeDescription child in children)
                {
                    int fieldSubtype = result.Count;
                    fieldSubtypes.Add(fieldSubtype);
                    appendOrcTypesRebuildSubtypes(result, child);
                }

                type.SetKind(OrcProto.Type.Types.Kind.STRUCT);

                for (int i = 0; i < fieldNames.Count; i++)
                {
                    type.AddSubtypes((uint)fieldSubtypes[i]);
                    type.AddFieldNames(fieldNames[i]);
                }
                result[subtype] = type.Build();
                needsAdd        = false;
            }
            break;

            case Category.UNION:
            {
                // Make room for UNION type.
                result.Add(null);

                List <int> unionSubtypes = new List <int>(children.Count);
                foreach (TypeDescription child in children)
                {
                    int unionSubtype = result.Count;
                    unionSubtypes.Add(unionSubtype);
                    appendOrcTypesRebuildSubtypes(result, child);
                }

                type.SetKind(OrcProto.Type.Types.Kind.UNION);
                for (int i = 0; i < children.Count; i++)
                {
                    type.AddSubtypes((uint)unionSubtypes[i]);
                }
                result[subtype] = type.Build();
                needsAdd        = false;
            }
            break;

            default:
                throw new ArgumentException("Unknown category: " + typeDescr.getCategory());
            }
            if (needsAdd)
            {
                result.Add(type.Build());
            }
        }
Example #14
0
        private static void appendOrcTypes(List <OrcProto.Type> result, TypeDescription typeDescr)
        {
            OrcProto.Type.Builder   type     = OrcProto.Type.CreateBuilder();
            IList <TypeDescription> children = typeDescr.getChildren();

            switch (typeDescr.getCategory())
            {
            case Category.BOOLEAN:
                type.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
                break;

            case Category.BYTE:
                type.SetKind(OrcProto.Type.Types.Kind.BYTE);
                break;

            case Category.SHORT:
                type.SetKind(OrcProto.Type.Types.Kind.SHORT);
                break;

            case Category.INT:
                type.SetKind(OrcProto.Type.Types.Kind.INT);
                break;

            case Category.LONG:
                type.SetKind(OrcProto.Type.Types.Kind.LONG);
                break;

            case Category.FLOAT:
                type.SetKind(OrcProto.Type.Types.Kind.FLOAT);
                break;

            case Category.DOUBLE:
                type.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
                break;

            case Category.STRING:
                type.SetKind(OrcProto.Type.Types.Kind.STRING);
                break;

            case Category.CHAR:
                type.SetKind(OrcProto.Type.Types.Kind.CHAR);
                type.SetMaximumLength((uint)typeDescr.getMaxLength());
                break;

            case Category.VARCHAR:
                type.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
                type.SetMaximumLength((uint)typeDescr.getMaxLength());
                break;

            case Category.BINARY:
                type.SetKind(OrcProto.Type.Types.Kind.BINARY);
                break;

            case Category.TIMESTAMP:
                type.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
                break;

            case Category.DATE:
                type.SetKind(OrcProto.Type.Types.Kind.DATE);
                break;

            case Category.DECIMAL:
                type.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
                type.SetPrecision((uint)typeDescr.getPrecision());
                type.SetScale((uint)typeDescr.getScale());
                break;

            case Category.LIST:
                type.SetKind(OrcProto.Type.Types.Kind.LIST);
                type.AddSubtypes((uint)children[0].getId());
                break;

            case Category.MAP:
                type.SetKind(OrcProto.Type.Types.Kind.MAP);
                foreach (TypeDescription t in children)
                {
                    type.AddSubtypes((uint)t.getId());
                }
                break;

            case Category.STRUCT:
                type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
                foreach (TypeDescription t in children)
                {
                    type.AddSubtypes((uint)t.getId());
                }
                foreach (string field in typeDescr.getFieldNames())
                {
                    type.AddFieldNames(field);
                }
                break;

            case Category.UNION:
                type.SetKind(OrcProto.Type.Types.Kind.UNION);
                foreach (TypeDescription t in children)
                {
                    type.AddSubtypes((uint)t.getId());
                }
                break;

            default:
                throw new ArgumentException("Unknown category: " + typeDescr.getCategory());
            }
            result.Add(type.Build());
            if (children != null)
            {
                foreach (TypeDescription child in children)
                {
                    appendOrcTypes(result, child);
                }
            }
        }
Example #15
0
        public static TypeDescription convertTypeInfo(TypeInfo info)
        {
            switch (info.getCategory())
            {
            case ObjectInspectorCategory.PRIMITIVE:
            {
                PrimitiveTypeInfo pinfo = (PrimitiveTypeInfo)info;
                switch (pinfo.getPrimitiveCategory())
                {
                case PrimitiveCategory.BOOLEAN:
                    return(TypeDescription.createBoolean());

                case PrimitiveCategory.BYTE:
                    return(TypeDescription.createByte());

                case PrimitiveCategory.SHORT:
                    return(TypeDescription.createShort());

                case PrimitiveCategory.INT:
                    return(TypeDescription.createInt());

                case PrimitiveCategory.LONG:
                    return(TypeDescription.createLong());

                case PrimitiveCategory.FLOAT:
                    return(TypeDescription.createFloat());

                case PrimitiveCategory.DOUBLE:
                    return(TypeDescription.createDouble());

                case PrimitiveCategory.STRING:
                    return(TypeDescription.createString());

                case PrimitiveCategory.DATE:
                    return(TypeDescription.createDate());

                case PrimitiveCategory.TIMESTAMP:
                    return(TypeDescription.createTimestamp());

                case PrimitiveCategory.BINARY:
                    return(TypeDescription.createBinary());

                case PrimitiveCategory.DECIMAL:
                {
                    DecimalTypeInfo dinfo = (DecimalTypeInfo)pinfo;
                    return(TypeDescription.createDecimal()
                           .withScale(dinfo.scale())
                           .withPrecision(dinfo.precision()));
                }

                case PrimitiveCategory.VARCHAR:
                {
                    BaseCharTypeInfo cinfo = (BaseCharTypeInfo)pinfo;
                    return(TypeDescription.createVarchar()
                           .withMaxLength(cinfo.getLength()));
                }

                case PrimitiveCategory.CHAR:
                {
                    BaseCharTypeInfo cinfo = (BaseCharTypeInfo)pinfo;
                    return(TypeDescription.createChar()
                           .withMaxLength(cinfo.getLength()));
                }

                default:
                    throw new ArgumentException("ORC doesn't handle primitive" +
                                                " category " + pinfo.getPrimitiveCategory());
                }
            }

            case ObjectInspectorCategory.LIST:
            {
                ListTypeInfo linfo = (ListTypeInfo)info;
                return(TypeDescription.createList
                           (convertTypeInfo(linfo.getListElementTypeInfo())));
            }

            case ObjectInspectorCategory.MAP:
            {
                MapTypeInfo minfo = (MapTypeInfo)info;
                return(TypeDescription.createMap
                           (convertTypeInfo(minfo.getMapKeyTypeInfo()),
                           convertTypeInfo(minfo.getMapValueTypeInfo())));
            }

            case ObjectInspectorCategory.UNION:
            {
                UnionTypeInfo   minfo  = (UnionTypeInfo)info;
                TypeDescription result = TypeDescription.createUnion();
                foreach (TypeInfo child in minfo.getAllUnionObjectTypeInfos())
                {
                    result.addUnionChild(convertTypeInfo(child));
                }
                return(result);
            }

            case ObjectInspectorCategory.STRUCT:
            {
                StructTypeInfo  sinfo  = (StructTypeInfo)info;
                TypeDescription result = TypeDescription.createStruct();
                foreach (string fieldName in sinfo.getAllStructFieldNames())
                {
                    result.addField(fieldName,
                                    convertTypeInfo(sinfo.getStructFieldTypeInfo(fieldName)));
                }
                return(result);
            }

            default:
                throw new ArgumentException("ORC doesn't handle " +
                                            info.getCategory());
            }
        }
Example #16
0
 private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr)
 {
     OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
     IList<TypeDescription> children = typeDescr.getChildren();
     switch (typeDescr.getCategory())
     {
         case Category.BOOLEAN:
             type.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
             break;
         case Category.BYTE:
             type.SetKind(OrcProto.Type.Types.Kind.BYTE);
             break;
         case Category.SHORT:
             type.SetKind(OrcProto.Type.Types.Kind.SHORT);
             break;
         case Category.INT:
             type.SetKind(OrcProto.Type.Types.Kind.INT);
             break;
         case Category.LONG:
             type.SetKind(OrcProto.Type.Types.Kind.LONG);
             break;
         case Category.FLOAT:
             type.SetKind(OrcProto.Type.Types.Kind.FLOAT);
             break;
         case Category.DOUBLE:
             type.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
             break;
         case Category.STRING:
             type.SetKind(OrcProto.Type.Types.Kind.STRING);
             break;
         case Category.CHAR:
             type.SetKind(OrcProto.Type.Types.Kind.CHAR);
             type.SetMaximumLength((uint)typeDescr.getMaxLength());
             break;
         case Category.VARCHAR:
             type.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
             type.SetMaximumLength((uint)typeDescr.getMaxLength());
             break;
         case Category.BINARY:
             type.SetKind(OrcProto.Type.Types.Kind.BINARY);
             break;
         case Category.TIMESTAMP:
             type.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
             break;
         case Category.DATE:
             type.SetKind(OrcProto.Type.Types.Kind.DATE);
             break;
         case Category.DECIMAL:
             type.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
             type.SetPrecision((uint)typeDescr.getPrecision());
             type.SetScale((uint)typeDescr.getScale());
             break;
         case Category.LIST:
             type.SetKind(OrcProto.Type.Types.Kind.LIST);
             type.AddSubtypes((uint)children[0].getId());
             break;
         case Category.MAP:
             type.SetKind(OrcProto.Type.Types.Kind.MAP);
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             break;
         case Category.STRUCT:
             type.SetKind(OrcProto.Type.Types.Kind.STRUCT);
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             foreach (string field in typeDescr.getFieldNames())
             {
                 type.AddFieldNames(field);
             }
             break;
         case Category.UNION:
             type.SetKind(OrcProto.Type.Types.Kind.UNION);
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             break;
         default:
             throw new ArgumentException("Unknown category: " + typeDescr.getCategory());
     }
     result.Add(type.Build());
     if (children != null)
     {
         foreach (TypeDescription child in children)
         {
             appendOrcTypes(result, child);
         }
     }
 }
Example #17
0
 /**
  * Returns selected columns as a bool array with true value set for specified column names.
  * The result will contain number of elements equal to flattened number of columns.
  * For example:
  * selectedColumns - a,b,c
  * allColumns - a,b,c,d
  * If column c is a complex type, say list<string> and other types are primitives then result will
  * be [false, true, true, true, true, true, false]
  * Index 0 is the root element of the struct which is set to false by default, index 1,2
  * corresponds to columns a and b. Index 3,4 correspond to column c which is list<string> and
  * index 5 correspond to column d. After flattening list<string> gets 2 columns.
  *
  * @param selectedColumns - comma separated list of selected column names
  * @param schema       - object schema
  * @return - bool array with true value set for the specified column names
  */
 public static bool[] includeColumns(string selectedColumns, TypeDescription schema)
 {
     int numFlattenedCols = schema.getMaximumId();
     bool[] results = new bool[numFlattenedCols + 1];
     if ("*".Equals(selectedColumns))
     {
         for (int i = 0; i < results.Length; i++)
         {
             results[i] = true;
         }
         return results;
     }
     if (selectedColumns != null &&
         schema.getCategory() == Category.STRUCT)
     {
         IList<string> fieldNames = schema.getFieldNames();
         IList<TypeDescription> fields = schema.getChildren();
         foreach (string column in selectedColumns.Split((',')))
         {
             TypeDescription col = findColumn(column, fieldNames, fields);
             if (col != null)
             {
                 for (int i = col.getId(); i <= col.getMaximumId(); ++i)
                 {
                     results[i] = true;
                 }
             }
         }
     }
     return results;
 }
Example #18
0
 /**
  * Set the schema on read type description.
  */
 public RecordReaderOptions schema(TypeDescription schema)
 {
     this._schema = schema;
     return(this);
 }
Example #19
0
 /**
  * Set the schema for the file. This is a required parameter.
  * @param schema the schema for the file.
  * @return this
  */
 public WriterOptions setSchema(TypeDescription schema)
 {
     this.explicitSchema = true;
     this.schema = schema;
     return this;
 }
Example #20
0
 public MapTreeWriter(int columnId,
               ObjectInspector inspector,
               TypeDescription schema,
               StreamFactory writer,
               bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);
     ObjectInspector keyInspector = null;
     ObjectInspector valueInspector = null;
     if (inspector != null)
     {
         MapObjectInspector insp = (MapObjectInspector)inspector;
         keyInspector = insp.getMapKeyObjectInspector();
         valueInspector = insp.getMapValueObjectInspector();
     }
     childrenWriters = new TreeWriter[2];
     IList<TypeDescription> children = schema.getChildren();
     childrenWriters[0] = createTreeWriter(keyInspector, children[0], writer, true);
     childrenWriters[1] = createTreeWriter(valueInspector, children[1], writer, true);
     lengths = createIntegerWriter(writer.createStream(columnId,
         OrcProto.Stream.Types.Kind.LENGTH), false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Example #21
0
 public FloatTreeWriter(int columnId,
                   ObjectInspector inspector,
                   TypeDescription schema,
                   StreamFactory writer,
                   bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.stream = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA);
     this.utils = new SerializationUtils();
     recordPosition(rowIndexPosition);
 }
Example #22
0
 public StringTreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory writer,
     bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
 }
Example #23
0
 public ListTreeWriter(int columnId,
                ObjectInspector inspector,
                TypeDescription schema,
                StreamFactory writer,
                bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);
     ObjectInspector childOI = inspector == null ? null :
       ((ListObjectInspector)inspector).getListElementObjectInspector();
     childrenWriters = new TreeWriter[1];
     childrenWriters[0] =
       createTreeWriter(childOI, schema.getChildren()[0], writer, true);
     lengths = createIntegerWriter(writer.createStream(columnId,
         OrcProto.Stream.Types.Kind.LENGTH), false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Example #24
0
        /**
         * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
         * numbers based on the length of the result list being appended.
         *
         * @param result
         * @param typeInfo
         */
        public static void appendOrcTypesRebuildSubtypes(
            IList<OrcProto.Type> result,
            TypeDescription typeDescr)
        {
            int subtype = result.Count;
            OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
            bool needsAdd = true;
            IList<TypeDescription> children = typeDescr.getChildren();
            switch (typeDescr.getCategory())
            {
                case Category.BOOLEAN:
                    type.SetKind(OrcProto.Type.Types.Kind.BOOLEAN);
                    break;
                case Category.BYTE:
                    type.SetKind(OrcProto.Type.Types.Kind.BYTE);
                    break;
                case Category.SHORT:
                    type.SetKind(OrcProto.Type.Types.Kind.SHORT);
                    break;
                case Category.INT:
                    type.SetKind(OrcProto.Type.Types.Kind.INT);
                    break;
                case Category.LONG:
                    type.SetKind(OrcProto.Type.Types.Kind.LONG);
                    break;
                case Category.FLOAT:
                    type.SetKind(OrcProto.Type.Types.Kind.FLOAT);
                    break;
                case Category.DOUBLE:
                    type.SetKind(OrcProto.Type.Types.Kind.DOUBLE);
                    break;
                case Category.STRING:
                    type.SetKind(OrcProto.Type.Types.Kind.STRING);
                    break;
                case Category.CHAR:
                    type.SetKind(OrcProto.Type.Types.Kind.CHAR);
                    type.SetMaximumLength((uint)typeDescr.getMaxLength());
                    break;
                case Category.VARCHAR:
                    type.SetKind(OrcProto.Type.Types.Kind.VARCHAR);
                    type.SetMaximumLength((uint)typeDescr.getMaxLength());
                    break;
                case Category.BINARY:
                    type.SetKind(OrcProto.Type.Types.Kind.BINARY);
                    break;
                case Category.TIMESTAMP:
                    type.SetKind(OrcProto.Type.Types.Kind.TIMESTAMP);
                    break;
                case Category.DATE:
                    type.SetKind(OrcProto.Type.Types.Kind.DATE);
                    break;
                case Category.DECIMAL:
                    type.SetKind(OrcProto.Type.Types.Kind.DECIMAL);
                    type.SetPrecision((uint)typeDescr.getPrecision());
                    type.SetScale((uint)typeDescr.getScale());
                    break;
                case Category.LIST:
                    type.SetKind(OrcProto.Type.Types.Kind.LIST);
                    type.AddSubtypes((uint)++subtype);
                    result.Add(type.Build());
                    needsAdd = false;
                    appendOrcTypesRebuildSubtypes(result, children[0]);
                    break;
                case Category.MAP:
                    {
                        // Make room for MAP type.
                        result.Add(null);

                        // Add MAP type pair in order to determine their subtype values.
                        appendOrcTypesRebuildSubtypes(result, children[0]);
                        int subtype2 = result.Count;
                        appendOrcTypesRebuildSubtypes(result, children[1]);
                        type.SetKind(OrcProto.Type.Types.Kind.MAP);
                        type.AddSubtypes((uint)subtype + 1);
                        type.AddSubtypes((uint)subtype2);
                        result[subtype] = type.Build();
                        needsAdd = false;
                    }
                    break;
                case Category.STRUCT:
                    {
                        IList<String> fieldNames = typeDescr.getFieldNames();

                        // Make room for STRUCT type.
                        result.Add(null);

                        List<int> fieldSubtypes = new List<int>(fieldNames.Count);
                        foreach (TypeDescription child in children)
                        {
                            int fieldSubtype = result.Count;
                            fieldSubtypes.Add(fieldSubtype);
                            appendOrcTypesRebuildSubtypes(result, child);
                        }

                        type.SetKind(OrcProto.Type.Types.Kind.STRUCT);

                        for (int i = 0; i < fieldNames.Count; i++)
                        {
                            type.AddSubtypes((uint)fieldSubtypes[i]);
                            type.AddFieldNames(fieldNames[i]);
                        }
                        result[subtype] = type.Build();
                        needsAdd = false;
                    }
                    break;
                case Category.UNION:
                    {
                        // Make room for UNION type.
                        result.Add(null);

                        List<int> unionSubtypes = new List<int>(children.Count);
                        foreach (TypeDescription child in children)
                        {
                            int unionSubtype = result.Count;
                            unionSubtypes.Add(unionSubtype);
                            appendOrcTypesRebuildSubtypes(result, child);
                        }

                        type.SetKind(OrcProto.Type.Types.Kind.UNION);
                        for (int i = 0; i < children.Count; i++)
                        {
                            type.AddSubtypes((uint)unionSubtypes[i]);
                        }
                        result[subtype] = type.Build();
                        needsAdd = false;
                    }
                    break;
                default:
                    throw new ArgumentException("Unknown category: " + typeDescr.getCategory());
            }
            if (needsAdd)
            {
                result.Add(type.Build());
            }
        }
Example #25
0
 public StringBaseTreeWriter(int columnId,
                  ObjectInspector inspector,
                  TypeDescription schema,
                  StreamFactory writer,
                  bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.isDirectV2 = isNewWriteFormat(writer);
     stringOutput = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DICTIONARY_DATA);
     lengthOutput = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.LENGTH), false, isDirectV2, writer);
     rowOutput = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA), false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
     rowIndexValueCount.Add(0L);
     buildIndex = writer.buildIndex();
     directStreamOutput = writer.createStream(id, OrcProto.Stream.Types.Kind.DATA);
     directLengthOutput = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.LENGTH), false, isDirectV2, writer);
     OrcFile.WriterOptions options = writer.getOptions();
     dictionaryKeySizeThreshold = options.getDictionaryKeySizeThreshold();
     strideDictionaryCheck = options.getStrideDictionaryCheck();
     doneDictionaryCheck = false;
 }
Example #26
0
 /**
  * Set the schema for the file. This is a required parameter.
  * @param schema the schema for the file.
  * @return this
  */
 public WriterOptions setSchema(TypeDescription schema)
 {
     this.explicitSchema = true;
     this.schema         = schema;
     return(this);
 }
Example #27
0
 public StructTreeWriter(int columnId,
                  ObjectInspector inspector,
                  TypeDescription schema,
                  StreamFactory writer,
                  bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     IList<TypeDescription> children = schema.getChildren();
     if (inspector != null)
     {
         StructObjectInspector structObjectInspector =
           (StructObjectInspector)inspector;
         fields = structObjectInspector.getAllStructFieldRefs();
     }
     childrenWriters = new TreeWriter[children.Count];
     for (int i = 0; i < childrenWriters.Length; ++i)
     {
         ObjectInspector childOI;
         if (fields != null && i < fields.Count)
         {
             childOI = fields[i].getFieldObjectInspector();
         }
         else
         {
             childOI = null;
         }
         childrenWriters[i] = createTreeWriter(
           childOI, children[i], writer,
           true);
     }
     recordPosition(rowIndexPosition);
 }
Example #28
0
 public BooleanTreeWriter(int columnId,
                   ObjectInspector inspector,
                   TypeDescription schema,
                   StreamFactory writer,
                   bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     PositionedOutputStream @out = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA);
     this.writer = new BitFieldWriter(@out, 1);
     recordPosition(rowIndexPosition);
 }
Example #29
0
 /**
  * Create a tree writer.
  * @param columnId the column id of the column to write
  * @param inspector the object inspector to use
  * @param schema the row schema
  * @param streamFactory limited access to the Writer's data.
  * @param nullable can the value be null?
  * @
  */
 protected TreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory streamFactory,
     bool nullable)
 {
     this.streamFactory = streamFactory;
     this.isCompressed = streamFactory.isCompressed();
     this.id = columnId;
     this.inspector = inspector;
     if (nullable)
     {
         isPresentOutStream = streamFactory.createStream(id,
             OrcProto.Stream.Types.Kind.PRESENT);
         isPresent = new BitFieldWriter(isPresentOutStream, 1);
     }
     else
     {
         isPresent = null;
     }
     this.foundNulls = false;
     createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
     indexStatistics = ColumnStatisticsImpl.create(schema);
     stripeColStatistics = ColumnStatisticsImpl.create(schema);
     fileStatistics = ColumnStatisticsImpl.create(schema);
     childrenWriters = new TreeWriter[0];
     rowIndex = OrcProto.RowIndex.CreateBuilder();
     rowIndexEntry = OrcProto.RowIndexEntry.CreateBuilder();
     rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
     stripeStatsBuilders = new List<OrcProto.StripeStatistics.Builder>();
     if (streamFactory.buildIndex())
     {
         rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Types.Kind.ROW_INDEX);
     }
     else
     {
         rowIndexStream = null;
     }
     if (createBloomFilter)
     {
         bloomFilterEntry = OrcProto.BloomFilter.CreateBuilder();
         bloomFilterIndex = OrcProto.BloomFilterIndex.CreateBuilder();
         bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Types.Kind.BLOOM_FILTER);
         bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(), streamFactory.getBloomFilterFPP());
     }
     else
     {
         bloomFilterEntry = null;
         bloomFilterIndex = null;
         bloomFilterStream = null;
         bloomFilter = null;
     }
 }
Example #30
0
        public WriterImpl(
            Stream stream,
            string path,
            OrcFile.WriterOptions options,
            ObjectInspector inspector,
            TypeDescription schema,
            long stripeSize,
            CompressionKind compress,
            int bufferSize,
            int rowIndexStride,
            MemoryManager memoryManager,
            bool addBlockPadding,
            OrcFile.Version version,
            OrcFile.WriterCallback callback,
            OrcFile.EncodingStrategy encodingStrategy,
            OrcFile.CompressionStrategy compressionStrategy,
            double paddingTolerance,
            long blockSizeValue,
            string bloomFilterColumnNames,
            double bloomFilterFpp)
        {
            this.baseStream = stream;
            this.streamFactory = new StreamFactory(this);
            this.path = path;
            this.options = options;
            this.callback = callback;
            this.schema = schema;
            this.adjustedStripeSize = stripeSize;
            this.defaultStripeSize = stripeSize;
            this.version = version;
            this.encodingStrategy = encodingStrategy;
            this.compressionStrategy = compressionStrategy;
            this.addBlockPadding = addBlockPadding;
            this.blockSize = blockSizeValue;
            this.paddingTolerance = paddingTolerance;
            this.compress = compress;
            this.rowIndexStride = rowIndexStride;
            this.memoryManager = memoryManager;
            buildIndex = rowIndexStride > 0;
            codec = createCodec(compress);
            int numColumns = schema.getMaximumId() + 1;
            this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, bufferSize);
            if (version == OrcFile.Version.V_0_11)
            {
                /* do not write bloom filters for ORC v11 */
                this.bloomFilterColumns = new bool[schema.getMaximumId() + 1];
            }
            else
            {
                this.bloomFilterColumns =
                    OrcUtils.includeColumns(bloomFilterColumnNames, schema);
            }
            this.bloomFilterFpp = bloomFilterFpp;
            treeWriter = createTreeWriter(inspector, schema, streamFactory, false);
            if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE)
            {
                throw new ArgumentException("Row stride must be at least " +
                    MIN_ROW_INDEX_STRIDE);
            }

            // ensure that we are able to handle callbacks before we register ourselves
            memoryManager.addWriter(path, stripeSize, this);
        }
Example #31
0
 public VarcharTreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory writer,
     bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     maxLength = schema.getMaxLength();
 }
Example #32
0
 public ByteTreeWriter(int columnId,
                   ObjectInspector inspector,
                   TypeDescription schema,
                   StreamFactory writer,
                   bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.writer = new RunLengthByteWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA));
     recordPosition(rowIndexPosition);
 }
Example #33
0
 private static void writeTypes(OrcProto.Footer.Builder builder,
                                TypeDescription schema)
 {
     OrcProto.Type.Builder type = OrcProto.Type.CreateBuilder();
     IList<TypeDescription> children = schema.getChildren();
     switch (schema.getCategory())
     {
         case Category.BOOLEAN:
             type.Kind = OrcProto.Type.Types.Kind.BOOLEAN;
             break;
         case Category.BYTE:
             type.Kind = OrcProto.Type.Types.Kind.BYTE;
             break;
         case Category.SHORT:
             type.Kind = OrcProto.Type.Types.Kind.SHORT;
             break;
         case Category.INT:
             type.Kind = OrcProto.Type.Types.Kind.INT;
             break;
         case Category.LONG:
             type.Kind = OrcProto.Type.Types.Kind.LONG;
             break;
         case Category.FLOAT:
             type.Kind = OrcProto.Type.Types.Kind.FLOAT;
             break;
         case Category.DOUBLE:
             type.Kind = OrcProto.Type.Types.Kind.DOUBLE;
             break;
         case Category.STRING:
             type.Kind = OrcProto.Type.Types.Kind.STRING;
             break;
         case Category.CHAR:
             type.Kind = OrcProto.Type.Types.Kind.CHAR;
             type.MaximumLength = (uint)schema.getMaxLength();
             break;
         case Category.VARCHAR:
             type.Kind = OrcProto.Type.Types.Kind.VARCHAR;
             type.MaximumLength = (uint)schema.getMaxLength();
             break;
         case Category.BINARY:
             type.Kind = OrcProto.Type.Types.Kind.BINARY;
             break;
         case Category.TIMESTAMP:
             type.Kind = OrcProto.Type.Types.Kind.TIMESTAMP;
             break;
         case Category.DATE:
             type.Kind = OrcProto.Type.Types.Kind.DATE;
             break;
         case Category.DECIMAL:
             type.Kind = OrcProto.Type.Types.Kind.DECIMAL;
             type.Precision = (uint)schema.getPrecision();
             type.Scale = (uint)schema.getScale();
             break;
         case Category.LIST:
             type.Kind = OrcProto.Type.Types.Kind.LIST;
             type.AddSubtypes((uint)children[0].getId());
             break;
         case Category.MAP:
             type.Kind = OrcProto.Type.Types.Kind.MAP;
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             break;
         case Category.STRUCT:
             type.Kind = OrcProto.Type.Types.Kind.STRUCT;
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             foreach (string field in schema.getFieldNames())
             {
                 type.AddFieldNames(field);
             }
             break;
         case Category.UNION:
             type.Kind = OrcProto.Type.Types.Kind.UNION;
             foreach (TypeDescription t in children)
             {
                 type.AddSubtypes((uint)t.getId());
             }
             break;
         default:
             throw new ArgumentException("Unknown category: " +
               schema.getCategory());
     }
     builder.AddTypes(type);
     if (children != null)
     {
         foreach (TypeDescription child in children)
         {
             writeTypes(builder, child);
         }
     }
 }
Example #34
0
 public CharTreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory writer,
     bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     itemLength = schema.getMaxLength();
     padding = new byte[itemLength];
 }
Example #35
0
 public BinaryTreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory writer,
     bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     this.stream = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA);
     this.isDirectV2 = isNewWriteFormat(writer);
     this.length = createIntegerWriter(writer.createStream(id,
         OrcProto.Stream.Types.Kind.LENGTH), false, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
Example #36
0
 public DateTreeWriter(int columnId,
                ObjectInspector inspector,
                TypeDescription schema,
                StreamFactory writer,
                bool nullable)
     : base(columnId, inspector, schema, writer, nullable)
 {
     OutStream @out = writer.createStream(id,
         OrcProto.Stream.Types.Kind.DATA);
     this.isDirectV2 = isNewWriteFormat(writer);
     this.writer = createIntegerWriter(@out, true, isDirectV2, writer);
     recordPosition(rowIndexPosition);
 }
        /**
         * Create a reader that merge sorts the ACID events together.
         * @param conf the configuration
         * @param collapseEvents should the events on the same row be collapsed
         * @param isOriginal is the base file a pre-acid file
         * @param bucket the bucket we are reading
         * @param options the options to read with
         * @param deltaDirectory the list of delta directories to include
         * @
         */
        OrcRawRecordMerger(Configuration conf,
                           bool collapseEvents,
                           Reader reader,
                           bool isOriginal,
                           int bucket,
                           ValidTxnList validTxnList,
                           Reader.Options options,
                           Path[] deltaDirectory)
        {
            this.conf         = conf;
            this.collapse     = collapseEvents;
            this.offset       = options.getOffset();
            this.length       = options.getLength();
            this.validTxnList = validTxnList;
            TypeDescription typeDescr = OrcUtils.getDesiredRowTypeDescr(conf);

            if (typeDescr == null)
            {
                throw new IOException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
            }

            objectInspector = OrcRecordUpdater.createEventSchema
                                  (OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

            // modify the options to reflect the event instead of the base row
            Reader.Options eventOptions = createEventOptions(options);
            if (reader == null)
            {
                baseReader = null;
            }
            else
            {
                // find the min/max based on the offset and length
                if (isOriginal)
                {
                    discoverOriginalKeyBounds(reader, bucket, options);
                }
                else
                {
                    discoverKeyBounds(reader, options);
                }
                LOG.info("min key = " + minKey + ", max key = " + maxKey);
                // use the min/max instead of the byte range
                ReaderPair pair;
                ReaderKey  key = new ReaderKey();
                if (isOriginal)
                {
                    options = options.clone();
                    options.range(options.getOffset(), Long.MAX_VALUE);
                    pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
                                                  options);
                }
                else
                {
                    pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
                                          eventOptions, 0);
                }

                // if there is at least one record, put it in the map
                if (pair.nextRecord != null)
                {
                    readers.put(key, pair);
                }
                baseReader = pair.recordReader;
            }

            // we always want to read all of the deltas
            eventOptions.range(0, Long.MAX_VALUE);
            if (deltaDirectory != null)
            {
                foreach (Path delta in deltaDirectory)
                {
                    ReaderKey             key       = new ReaderKey();
                    Path                  deltaFile = AcidUtils.createBucketFile(delta, bucket);
                    AcidUtils.ParsedDelta deltaDir  = AcidUtils.parsedDelta(delta);
                    FileSystem            fs        = deltaFile.getFileSystem(conf);
                    long                  length    = getLastFlushLength(fs, deltaFile);
                    if (length != -1 && fs.exists(deltaFile))
                    {
                        Reader deltaReader = OrcFile.createReader(deltaFile,
                                                                  OrcFile.readerOptions(conf).maxLength(length));
                        Reader.Options deltaEventOptions = null;
                        if (eventOptions.getSearchArgument() != null)
                        {
                            // Turn off the sarg before pushing it to delta.  We never want to push a sarg to a delta as
                            // it can produce wrong results (if the latest valid version of the record is filtered out by
                            // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
                            // unless the delta only has insert events
                            OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
                            if (acidStats.deletes > 0 || acidStats.updates > 0)
                            {
                                deltaEventOptions = eventOptions.clone().searchArgument(null, null);
                            }
                        }
                        ReaderPair deltaPair;
                        deltaPair = new ReaderPair(key, deltaReader, bucket, minKey,
                                                   maxKey, deltaEventOptions != null ? deltaEventOptions : eventOptions, deltaDir.getStatementId());
                        if (deltaPair.nextRecord != null)
                        {
                            readers.put(key, deltaPair);
                        }
                    }
                }
            }

            // get the first record
            Map.Entry <ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
            if (entry == null)
            {
                columns = 0;
                primary = null;
            }
            else
            {
                primary = entry.getValue();
                if (readers.isEmpty())
                {
                    secondaryKey = null;
                }
                else
                {
                    secondaryKey = readers.firstKey();
                }
                // get the number of columns in the user's rows
                columns = primary.getColumns();
            }
        }
Example #38
0
 /**
  * A required option that sets the object inspector for the rows. If
  * setSchema is not called, it also defines the schema.
  */
 public WriterOptions inspector(ObjectInspector value)
 {
     _inspector = value;
     if (!explicitSchema)
     {
         schema = OrcUtils.convertTypeInfo(
             TypeInfoUtils.getTypeInfoFromObjectInspector(value));
     }
     return this;
 }