示例#1
0
        /// <summary>
        /// Reads a single byte-array value from <paramref name="reader"/>.
        /// </summary>
        /// <param name="reader">Reader positioned at the start of the value.</param>
        /// <param name="tse">Schema element of the column; not used by this implementation.</param>
        /// <param name="length">
        /// Number of bytes to read, or -1 when the length must first be read from the stream
        /// as a 32-bit integer.
        /// </param>
        /// <returns>The bytes returned by <see cref="BinaryReader.ReadBytes(int)"/>.</returns>
        protected override byte[] ReadSingle(BinaryReader reader, Thrift.SchemaElement tse, int length)
        {
            // -1 is the sentinel for "the length is stored inline, right before the data"
            int byteCount = length == -1
                ? reader.ReadInt32()
                : length;

            return reader.ReadBytes(byteCount);
        }
示例#2
0
        /// <summary>
        /// Creates a root <see cref="SchemaElement"/> named after the first Thrift schema element
        /// of the file metadata and hands it to the recursive parser together with the root's
        /// child count.
        /// </summary>
        /// <param name="formatOptions">Options forwarded to the recursive parser.</param>
        /// <returns>The root schema element.</returns>
        public SchemaElement ParseSchemaExperimental(ParquetOptions formatOptions)
        {
            // entry 0 is the root; parsing of its children starts at entry 1
            Thrift.SchemaElement thriftRoot = _fileMeta.Schema[0];
            int nextIndex = 1;

            var rootElement = new SchemaElement(thriftRoot.Name, DataType.Unspecified, null);
            ParseSchemaExperimenal(rootElement, thriftRoot.Num_children, ref nextIndex, formatOptions);

            return rootElement;
        }
        /// <summary>
        /// Builds a <see cref="Schema"/> from the Thrift schema elements held in the file metadata.
        /// </summary>
        /// <param name="formatOptions">Options forwarded to the recursive overload.</param>
        /// <returns>A schema wrapping the fields collected by the recursive overload.</returns>
        public Schema CreateModelSchema(ParquetOptions formatOptions)
        {
            // entry 0 is the root; only its child count is needed here
            Thrift.SchemaElement thriftRoot = _fileMeta.Schema[0];
            int nextIndex = 1;

            var fields = new List<Field>();
            CreateModelSchema(null, fields, thriftRoot.Num_children, ref nextIndex, formatOptions);

            return new Schema(fields);
        }
        /// <summary>
        /// Asks the matching data type handler of every field in <paramref name="ses"/> to create
        /// its Thrift representation.
        /// </summary>
        /// <param name="ses">Fields to convert.</param>
        /// <param name="parent">Thrift element passed to each handler as the parent.</param>
        /// <param name="container">List passed to each handler to receive the created elements.</param>
        /// <exception cref="NotSupportedException">No data type handler matches a field.</exception>
        private void CreateThriftSchema(IEnumerable<Field> ses, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
        {
            foreach (Field se in ses)
            {
                IDataTypeHandler handler = DataTypeFactory.Match(se);

                // fail with a descriptive error instead of a NullReferenceException on the call below
                if (handler == null)
                {
                    throw new NotSupportedException($"cannot find data type handler for field '{se?.Name}'");
                }

                handler.CreateThrift(se, parent, container);
            }
        }
示例#5
0
        /// <summary>
        /// Reads 64-bit integers until fewer than 8 bytes remain in the reader's stream, multiplies
        /// each by 10^(-Scale) and appends the resulting decimals to <paramref name="result"/>.
        /// </summary>
        /// <param name="tse">Schema element supplying the decimal scale.</param>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="result">List receiving the decoded decimals.</param>
        private void ReadAsInt64(Thrift.SchemaElement tse, BinaryReader reader, IList result)
        {
            decimal multiplier = (decimal)Math.Pow(10, -tse.Scale);
            Stream source = reader.BaseStream;

            // only read while a complete 8-byte value is still available
            while (source.Position + sizeof(long) <= source.Length)
            {
                long unscaled = reader.ReadInt64();
                result.Add(unscaled * multiplier);
            }
        }
示例#6
0
        /// <summary>
        /// Writes every decimal in <paramref name="values"/> as the byte array produced by
        /// <c>BigDecimal.ToByteArray</c>, using the precision and scale of <paramref name="tse"/>.
        /// </summary>
        /// <param name="tse">Schema element; its <c>Type_length</c> is overwritten for each value written.</param>
        /// <param name="writer">Destination writer.</param>
        /// <param name="values">Boxed decimals to write.</param>
        private void WriteAsFixedLengthByteArray(Thrift.SchemaElement tse, BinaryWriter writer, IList values)
        {
            foreach (object boxed in values)
            {
                byte[] encoded = new BigDecimal((decimal)boxed, tse.Precision, tse.Scale).ToByteArray();

                // always re-set the type length, as it can differ from the default type length
                tse.Type_length = encoded.Length;

                writer.Write(encoded);
            }
        }
示例#7
0
        /// <summary>
        /// Tells whether this handler applies to <paramref name="tse"/>.
        /// </summary>
        /// <param name="tse">Schema element to test.</param>
        /// <param name="formatOptions">Options; only consulted for INT96 columns.</param>
        /// <returns>
        /// <c>true</c> for INT96 when <c>TreatBigIntegersAsDates</c> is set, for INT64 with a
        /// TIMESTAMP_MILLIS or TIMESTAMP_MICROS converted type, and for INT32 with a DATE
        /// converted type; otherwise <c>false</c>.
        /// </returns>
        public override bool IsMatch(Thrift.SchemaElement tse, ParquetOptions formatOptions)
        {
            switch (tse.Type)
            {
                case Thrift.Type.INT96:
                    // Impala
                    return formatOptions.TreatBigIntegersAsDates;

                case Thrift.Type.INT64:
                    return tse.__isset.converted_type &&
                           (tse.Converted_type == Thrift.ConvertedType.TIMESTAMP_MILLIS ||
                            tse.Converted_type == Thrift.ConvertedType.TIMESTAMP_MICROS);

                case Thrift.Type.INT32:
                    return tse.__isset.converted_type &&
                           tse.Converted_type == Thrift.ConvertedType.DATE;

                default:
                    return false;
            }
        }
示例#8
0
        /// <summary>
        /// Encodes <paramref name="x"/> by passing it to <c>WriteOne</c> over an in-memory stream.
        /// </summary>
        /// <param name="tse">Schema element; not used by this implementation.</param>
        /// <param name="x">String to encode.</param>
        /// <returns>The bytes written to the in-memory stream.</returns>
        public override byte[] PlainEncode(Thrift.SchemaElement tse, string x)
        {
            using (var buffer = new MemoryStream())
            {
                using (var writer = new BinaryWriter(buffer))
                {
                    WriteOne(writer, x, false);
                }

                // MemoryStream.ToArray remains usable after the writer has been disposed
                byte[] encoded = buffer.ToArray();
                return encoded;
            }
        }
示例#9
0
        /// <summary>
        /// Creates an item-less <see cref="ListField"/> for the list element at
        /// <paramref name="index"/> and advances past it and the element that follows.
        /// </summary>
        /// <param name="schema">Flat list of Thrift schema elements.</param>
        /// <param name="index">Position of the list element; increased by 2 on return.</param>
        /// <param name="ownedChildCount">Always set to 1.</param>
        /// <returns>The list field, with its path set to the two skipped element names joined by the path separator.</returns>
        public override Field CreateSchemaElement(IList<Thrift.SchemaElement> schema, ref int index, out int ownedChildCount)
        {
            Thrift.SchemaElement listElement = schema[index];
            ListField result = ListField.CreateWithNoItem(listElement.Name);

            // two elements are skipped, so the path hint has to name both of them
            Thrift.SchemaElement containerElement = schema[index + 1];
            result.Path = $"{listElement.Name}{Schema.PathSeparator}{containerElement.Name}";

            // skip this element and the child container
            index += 2;

            // we should get this element assigned back
            ownedChildCount = 1;
            return result;
        }
示例#10
0
        /// <summary>
        /// Creates a simple field for the element at <paramref name="index"/> and advances the index by one.
        /// </summary>
        /// <param name="schema">Flat list of Thrift schema elements.</param>
        /// <param name="index">Position of the element to convert; incremented by this call.</param>
        /// <param name="ownedChildCount">Always set to 0.</param>
        /// <returns>The field produced by <c>CreateSimple</c>.</returns>
        public virtual Field CreateSchemaElement(IList<Thrift.SchemaElement> schema, ref int index, out int ownedChildCount)
        {
            // the index is advanced before the element is fetched
            int current = index++;
            Thrift.SchemaElement tse = schema[current];

            // anything not REQUIRED may hold nulls; REPEATED marks an array
            Thrift.FieldRepetitionType repetition = tse.Repetition_type;
            Field simple = CreateSimple(
                tse,
                repetition != Thrift.FieldRepetitionType.REQUIRED,
                repetition == Thrift.FieldRepetitionType.REPEATED);

            ownedChildCount = 0;
            return simple;
        }
示例#11
0
        /// <summary>
        /// Throws a <see cref="NotSupportedException"/> naming the schema element, its type and,
        /// when set, its converted type.
        /// </summary>
        /// <param name="tse">Schema element for which no data type handler was found.</param>
        /// <exception cref="NotSupportedException">Always thrown.</exception>
        private void ThrowNoHandler(Thrift.SchemaElement tse)
        {
            string typeText = "<unspecified>";
            if (tse.__isset.type)
            {
                typeText = $"'{tse.Type}'";
            }

            // stays null (rendered as nothing) when no converted type is set
            string convertedTypeText = null;
            if (tse.__isset.converted_type)
            {
                convertedTypeText = $" ({tse.Converted_type})";
            }

            throw new NotSupportedException(
                $"cannot find data type handler for schema element '{tse.Name}' (type: {typeText}{convertedTypeText})");
        }
        /// <summary>
        /// Tells whether this handler applies to <paramref name="tse"/>.
        /// </summary>
        /// <param name="tse">Schema element to test.</param>
        /// <param name="formatOptions">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when the converted type is DECIMAL and the type is FIXED_LEN_BYTE_ARRAY,
        /// INT32 or INT64; otherwise <c>false</c>.
        /// </returns>
        public override bool IsMatch(Thrift.SchemaElement tse, ParquetOptions formatOptions)
        {
            if (!tse.__isset.converted_type || tse.Converted_type != Thrift.ConvertedType.DECIMAL)
            {
                return false;
            }

            switch (tse.Type)
            {
                case Thrift.Type.FIXED_LEN_BYTE_ARRAY:
                case Thrift.Type.INT32:
                case Thrift.Type.INT64:
                    return true;

                default:
                    return false;
            }
        }
        /// <summary>
        /// Creates version 1 file metadata whose schema list holds the root element followed by
        /// whatever the per-field overload adds for <paramref name="schema"/>.
        /// </summary>
        /// <param name="schema">Schema whose fields are converted.</param>
        /// <returns>The newly created file metadata.</returns>
        public Thrift.FileMetaData CreateThriftSchema(Schema schema)
        {
            var meta = new Thrift.FileMetaData
            {
                Version = 1,
                Schema = new List<Thrift.SchemaElement>()
            };

            // the root goes in first and is then passed as parent for the fields
            Thrift.SchemaElement root = AddRoot(meta.Schema);
            CreateThriftSchema(schema.Fields, root, meta.Schema);

            return meta;
        }
示例#14
0
        /// <summary>
        /// Reads 64-bit integers from <paramref name="reader"/>, multiplies each by 10^(-Scale) and
        /// stores the decimals in <paramref name="dest"/> starting at <paramref name="offset"/>.
        /// </summary>
        /// <remarks>
        /// Reading stops when fewer than 8 bytes remain in the stream or when
        /// <paramref name="dest"/> is full, whichever comes first. The destination bound keeps
        /// surplus bytes in the stream from causing an <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="tse">Schema element supplying the decimal scale.</param>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="dest">Destination array.</param>
        /// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
        /// <returns>Number of values written.</returns>
        private int ReadAsInt64(Thrift.SchemaElement tse, BinaryReader reader, decimal[] dest, int offset)
        {
            int start = offset;
            decimal scaleFactor = (decimal)Math.Pow(10, -tse.Scale);
            Stream s = reader.BaseStream;

            while (offset < dest.Length && s.Position + sizeof(long) <= s.Length)
            {
                long lv = reader.ReadInt64();
                dest[offset++] = lv * scaleFactor;
            }

            return offset - start;
        }
示例#15
0
        /// <summary>
        /// Creates a value from raw bytes and the precision and scale of <paramref name="schema"/>.
        /// </summary>
        /// <param name="data">Unscaled value bytes; they are reversed before being handed to <see cref="BigInteger"/>.</param>
        /// <param name="schema">Schema element supplying <c>Precision</c> and <c>Scale</c>.</param>
        public BigDecimal(byte[] data, Thrift.SchemaElement schema)
        {
            // BigInteger(byte[]) expects little-endian order, hence the reversed copy
            byte[] reversed = data.Reverse().ToArray();

            UnscaledValue = new BigInteger(reversed);
            Precision = schema.Precision;
            Scale = schema.Scale;

            // split into integer and fractional parts, each converted to decimal separately
            BigInteger divisor = BigInteger.Pow(10, Scale);
            BigInteger integerPart = BigInteger.DivRem(UnscaledValue, divisor, out BigInteger remainder);

            decimal whole = (decimal)integerPart;
            decimal fraction = (decimal)remainder / (decimal)divisor;

            DecimalValue = whole + fraction;
        }
示例#16
0
        /// <summary>
        /// Creates a reader for one column chunk and resolves its levels, schema element and data
        /// type handler from the footer.
        /// </summary>
        /// <param name="inputStream">Stream to read from.</param>
        /// <param name="thriftColumnChunk">Column chunk to read.</param>
        /// <param name="footer">Footer used to look up levels and the schema element.</param>
        /// <param name="parquetOptions">Options used when matching the data type handler.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
        {
            if (inputStream == null)
            {
                throw new ArgumentNullException(nameof(inputStream));
            }
            if (thriftColumnChunk == null)
            {
                throw new ArgumentNullException(nameof(thriftColumnChunk));
            }
            if (footer == null)
            {
                throw new ArgumentNullException(nameof(footer));
            }
            if (parquetOptions == null)
            {
                throw new ArgumentNullException(nameof(parquetOptions));
            }

            _inputStream = inputStream;
            _thriftColumnChunk = thriftColumnChunk;
            _footer = footer;
            _parquetOptions = parquetOptions;

            _thriftStream = new ThriftStream(inputStream);

            _footer.GetLevels(_thriftColumnChunk, out int maxRepetition, out int maxDefinition);
            _maxRepetitionLevel = maxRepetition;
            _maxDefinitionLevel = maxDefinition;

            _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
            _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
        }
示例#17
0
        /// <summary>
        /// Reads values with <c>ReadSingle</c> into <paramref name="dest"/>, starting at
        /// <paramref name="offset"/>, until the stream is exhausted or the destination is full.
        /// </summary>
        /// <param name="tse">Schema element passed through to <c>ReadSingle</c>.</param>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="dest">Destination array.</param>
        /// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
        /// <returns>Number of values written.</returns>
        private int Read(Thrift.SchemaElement tse, BinaryReader reader, TSystemType[] dest, int offset)
        {
            Stream s = reader.BaseStream;

            // keep the length as long: narrowing to int wraps for streams above int.MaxValue bytes
            long totalLength = s.Length;
            int idx = offset;

            while (s.Position < totalLength && idx < dest.Length)
            {
                // -1 asks ReadSingle to obtain the length itself
                TSystemType element = ReadSingle(reader, tse, -1); //potential performance hit on calling a method
                dest[idx++] = element;
            }

            return idx - offset;
        }
示例#18
0
        /// <summary>
        /// Returns the names of <paramref name="schemaElement"/> and its ancestors, outermost first.
        /// The node that has no parent is not included.
        /// </summary>
        /// <param name="schemaElement">Element to locate in the schema tree.</param>
        /// <returns>Element names ordered from the outermost ancestor down to the element itself.</returns>
        public List<string> GetPath(Thrift.SchemaElement schemaElement)
        {
            var names = new List<string>();

            // walk upwards; names are collected innermost-first and flipped afterwards
            for (ThriftSchemaTree.Node node = _tree.Find(schemaElement); node.parent != null; node = node.parent)
            {
                names.Add(node.element.Name);
            }

            names.Reverse();
            return names;
        }
        /// <summary>
        /// Reads <see cref="TimeSpan"/> values into <paramref name="dest"/>, dispatching on the
        /// type of <paramref name="tse"/>.
        /// </summary>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="tse">Schema element whose type selects the decoder.</param>
        /// <param name="dest">Destination; cast to <c>TimeSpan[]</c>.</param>
        /// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
        /// <returns>The value returned by the selected decoder.</returns>
        /// <exception cref="NotSupportedException">The type is neither INT32 nor INT64.</exception>
        public override int Read(BinaryReader reader, Thrift.SchemaElement tse, Array dest, int offset)
        {
            Thrift.Type physicalType = tse.Type;

            if (physicalType == Thrift.Type.INT32)
            {
                return ReadAsInt32(reader, (TimeSpan[])dest, offset);
            }

            if (physicalType == Thrift.Type.INT64)
            {
                return ReadAsInt64(reader, (TimeSpan[])dest, offset);
            }

            throw new NotSupportedException();
        }
示例#20
0
        /// <summary>
        /// Reads values with <c>ReadOne</c> into <paramref name="dest"/>, starting at
        /// <paramref name="offset"/>, until the stream is exhausted or the destination is full.
        /// </summary>
        /// <param name="tse">Not used by this implementation.</param>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="formatOptions">Not used by this implementation.</param>
        /// <param name="dest">Destination array.</param>
        /// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
        /// <returns>Number of values written.</returns>
        private int Read(Thrift.SchemaElement tse, BinaryReader reader, ParquetOptions formatOptions, TSystemType[] dest, int offset)
        {
            Stream s = reader.BaseStream;

            // keep the length as long: narrowing to int wraps for streams above int.MaxValue bytes
            long totalLength = s.Length;
            int idx = offset;

            while (s.Position < totalLength && idx < dest.Length)
            {
                TSystemType element = ReadOne(reader);
                dest[idx++] = element;
            }

            return idx - offset;
        }
示例#21
0
        /// <summary>
        /// Computes the maximum repetition and definition levels for the column described by
        /// <paramref name="columnChunk"/>, memoizing the result per schema path.
        /// </summary>
        /// <remarks>
        /// Each path segment is matched by name against the flat schema list, scanning forward
        /// from the element after the previous match. A matched element adds one repetition level
        /// when it is REPEATED and one definition level when it is not REQUIRED.
        /// NOTE(review): matching is by name only, so a same-named element that appears earlier
        /// in the flat list (e.g. inside a preceding sibling group) would still be picked —
        /// confirm whether the schema tree should be consulted instead.
        /// </remarks>
        /// <param name="columnChunk">Column chunk whose <c>Meta_data.Path_in_schema</c> is resolved.</param>
        /// <param name="maxRepetitionLevel">Receives the number of REPEATED elements on the path.</param>
        /// <param name="maxDefinitionLevel">Receives the number of non-REQUIRED elements on the path.</param>
        public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel)
        {
            maxRepetitionLevel = 0;
            maxDefinitionLevel = 0;

            List<string> path = columnChunk.Meta_data.Path_in_schema;

            var comparer = new StringListComparer(path);

            if (_memoizedLevels.TryGetValue(comparer, out Tuple<int, int> t))
            {
                maxRepetitionLevel = t.Item1;
                maxDefinitionLevel = t.Item2;
                return;
            }

            int fieldCount = _fileMeta.Schema.Count;
            int i = 0;

            foreach (string pp in path)
            {
                while (i < fieldCount)
                {
                    // Always step past the inspected element. A matched element must not be
                    // reconsidered for the next segment: a child never shares its parent's slot,
                    // and without this a path such as "a.a" would match the same element twice.
                    Thrift.SchemaElement se = _fileMeta.Schema[i++];

                    if (string.CompareOrdinal(se.Name, pp) != 0)
                    {
                        continue;
                    }

                    bool repeated = (se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED);
                    bool defined = (se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED);

                    if (repeated)
                    {
                        maxRepetitionLevel += 1;
                    }
                    if (!defined)
                    {
                        maxDefinitionLevel += 1;
                    }

                    break;
                }
            }

            _memoizedLevels.Add(comparer, Tuple.Create(maxRepetitionLevel, maxDefinitionLevel));
        }
示例#22
0
        /// <summary>
        /// Converts these statistics to their Thrift form, encoding the minimum and maximum with
        /// <paramref name="handler"/>.
        /// </summary>
        /// <param name="handler">Handler used to plain-encode <c>MinValue</c> and <c>MaxValue</c>.</param>
        /// <param name="tse">Schema element passed to the handler.</param>
        /// <returns>
        /// Thrift statistics in which <c>Min</c>/<c>Min_value</c> share the encoded minimum and
        /// <c>Max</c>/<c>Max_value</c> share the encoded maximum.
        /// </returns>
        internal Thrift.Statistics ToThriftStatistics(IDataTypeHandler handler, Thrift.SchemaElement tse)
        {
            byte[] encodedMin = handler.PlainEncode(tse, MinValue);
            byte[] encodedMax = handler.PlainEncode(tse, MaxValue);

            var statistics = new Thrift.Statistics();
            statistics.Null_count = NullCount;
            statistics.Distinct_count = DistinctCount;

            // both fields of each min/max pair receive the same encoded bytes
            statistics.Min = encodedMin;
            statistics.Min_value = encodedMin;
            statistics.Max = encodedMax;
            statistics.Max_value = encodedMax;

            return statistics;
        }
示例#23
0
 /// <summary>
 /// Stores the collaborators this reader works with; no other work is done here.
 /// </summary>
 /// <param name="schemaElement">Thrift schema element of the column.</param>
 /// <param name="parquetOptions">Options kept for later use.</param>
 /// <param name="dataTypeHandler">Data type handler kept for later use.</param>
 /// <param name="binaryReader">Reader kept for later use.</param>
 /// <param name="typeWidth">Type width value kept for later use.</param>
 /// <param name="readOneFunc">Delegate that produces one element from a <see cref="BinaryReader"/>.</param>
 public PrimitiveReader(
     Thrift.SchemaElement schemaElement,
     ParquetOptions parquetOptions,
     IDataTypeHandler dataTypeHandler,
     BinaryReader binaryReader,
     int typeWidth,
     Func<BinaryReader, TElement> readOneFunc)
 {
     // schema and configuration
     _schemaElement = schemaElement;
     _parquetOptions = parquetOptions;
     _dataTypeHandler = dataTypeHandler;

     // reading machinery
     _binaryReader = binaryReader;
     _typeWidth = typeWidth;
     _readOneFunc = readOneFunc;
 }
示例#24
0
 /// <summary>
 /// Stores the collaborators this writer works with; no other work is done here.
 /// </summary>
 /// <param name="stream">Stream kept for later use.</param>
 /// <param name="thriftStream">Thrift stream kept for later use.</param>
 /// <param name="footer">Footer kept for later use.</param>
 /// <param name="schemaElement">Thrift schema element of the column.</param>
 /// <param name="compressionMethod">Compression method kept for later use.</param>
 /// <param name="rowCount">Row count kept for later use.</param>
 public DataColumnWriter(
     Stream stream,
     ThriftStream thriftStream,
     ThriftFooter footer,
     Thrift.SchemaElement schemaElement,
     CompressionMethod compressionMethod,
     int rowCount)
 {
     // output targets
     _stream = stream;
     _thriftStream = thriftStream;
     _footer = footer;

     // column description
     _schemaElement = schemaElement;
     _compressionMethod = compressionMethod;
     _rowCount = rowCount;
 }
示例#25
0
        /// <summary>
        /// Sets the type of <paramref name="schema"/>, and its converted type when the tag defines
        /// one, from the tag registered for <paramref name="systemType"/>.
        /// </summary>
        /// <param name="schema">Schema element to update.</param>
        /// <param name="systemType">System type looked up in <c>TypeToTag</c>.</param>
        /// <exception cref="NotSupportedException"><paramref name="systemType"/> has no registered tag.</exception>
        public static void AdjustSchema(Thrift.SchemaElement schema, Type systemType)
        {
            bool isKnown = TypeToTag.TryGetValue(systemType, out TypeTag tag);

            if (!isKnown)
            {
                // list every registered type so the caller can see the alternatives
                IEnumerable<string> typeNames = TypeToTag.Keys.Select(t => t.ToString());
                string supportedTypes = string.Join(", ", typeNames);

                throw new NotSupportedException($"system type {systemType} is not supported, list of supported types: '{supportedTypes}'");
            }

            schema.Type = tag.PType;

            // the converted type is optional on the tag and is only copied when present
            if (tag.ConvertedType != null)
            {
                schema.Converted_type = tag.ConvertedType.Value;
            }
        }
        /// <summary>
        /// Creates the column chunk metadata through the footer, writes the column's pages with
        /// <c>WriteColumn</c>, and fills in the chunk's value count and size totals.
        /// </summary>
        /// <param name="tse">Schema element of the column; its type is passed to the footer.</param>
        /// <param name="path">Schema path passed to the footer when creating the chunk.</param>
        /// <param name="column">Column data to write.</param>
        /// <param name="dataTypeHandler">Handler passed through to <c>WriteColumn</c>.</param>
        /// <returns>The column chunk with its metadata counters populated.</returns>
        private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            // the chunk is created before any page is written
            Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
            // this page header is only used below as the source of Num_values
            Thrift.PageHeader  ph    = _footer.CreateDataPage(_rowCount);
            _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

            List <PageTag> pages = WriteColumn(column, tse, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

            chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

            //the following counters must include both data size and header size
            chunk.Meta_data.Total_compressed_size   = pages.Sum(p => p.HeaderMeta.Compressed_page_size + p.HeaderSize);
            chunk.Meta_data.Total_uncompressed_size = pages.Sum(p => p.HeaderMeta.Uncompressed_page_size + p.HeaderSize);

            return(chunk);
        }
示例#27
0
        /// <summary>
        /// Creates the Thrift schema element for <paramref name="se"/>, appends it to
        /// <paramref name="container"/> and increments the child count of <paramref name="parent"/>.
        /// </summary>
        /// <param name="se">Field to convert; must be a <see cref="DataField"/>.</param>
        /// <param name="parent">Parent Thrift element whose <c>Num_children</c> is incremented.</param>
        /// <param name="container">List receiving the new element.</param>
        public virtual void CreateThrift(Field se, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
        {
            DataField dataField = (DataField)se;

            var element = new Thrift.SchemaElement(se.Name);
            element.Type = _thriftType;

            if (_convertedType != null)
            {
                element.Converted_type = _convertedType.Value;
            }

            // arrays take precedence over nullability when choosing the repetition type
            Thrift.FieldRepetitionType repetition;
            if (dataField.IsArray)
            {
                repetition = Thrift.FieldRepetitionType.REPEATED;
            }
            else if (dataField.HasNulls)
            {
                repetition = Thrift.FieldRepetitionType.OPTIONAL;
            }
            else
            {
                repetition = Thrift.FieldRepetitionType.REQUIRED;
            }
            element.Repetition_type = repetition;

            container.Add(element);
            parent.Num_children += 1;
        }
示例#28
0
 /// <summary>
 /// Recursively attaches <paramref name="count"/> child nodes to <paramref name="parent"/>,
 /// consuming elements from <paramref name="schema"/> starting at <paramref name="i"/>.
 /// </summary>
 /// <param name="parent">Node receiving a fresh children list.</param>
 /// <param name="schema">Flat list of Thrift schema elements.</param>
 /// <param name="count">Number of direct children to read for <paramref name="parent"/>.</param>
 /// <param name="i">Running index into <paramref name="schema"/>; advanced past every consumed element.</param>
 private void BuildSchema(Node parent, List<Thrift.SchemaElement> schema, int count, ref int i)
 {
     parent.children = new List<Node>();

     for (int remaining = count; remaining > 0; remaining--)
     {
         Thrift.SchemaElement child = schema[i++];

         var node = new Node();
         node.element = child;
         node.parent = parent;
         parent.children.Add(node);

         // a child's own descendants are consumed before its next sibling
         if (child.Num_children > 0)
         {
             BuildSchema(node, schema, child.Num_children, ref i);
         }
     }
 }
示例#29
0
        /// <summary>
        /// Decodes a string from <paramref name="encoded"/> by reading it with <c>ReadSingle</c>.
        /// </summary>
        /// <param name="tse">Not used; <c>null</c> is passed to <c>ReadSingle</c> in its place.</param>
        /// <param name="encoded">Encoded bytes, or <c>null</c>.</param>
        /// <returns>The decoded string, or <c>null</c> when <paramref name="encoded"/> is <c>null</c>.</returns>
        public override object PlainDecode(Thrift.SchemaElement tse, byte[] encoded)
        {
            if (encoded == null)
            {
                return null;
            }

            using (var buffer = new MemoryStream(encoded))
            using (var reader = new BinaryReader(buffer))
            {
                string decoded = ReadSingle(reader, null, -1, false);
                return decoded;
            }
        }
示例#30
0
        /// <summary>
        /// Reads strings with <c>ReadOne</c> into <paramref name="dest"/>, starting at
        /// <paramref name="offset"/>, until the stream is exhausted or the destination is full.
        /// </summary>
        /// <remarks>
        /// The destination bound keeps surplus bytes in the stream from causing an
        /// <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="reader">Reader over the encoded values.</param>
        /// <param name="tse">Not used by this implementation.</param>
        /// <param name="dest">Destination; cast to <c>string[]</c>.</param>
        /// <param name="offset">Index in <paramref name="dest"/> of the first value to write.</param>
        /// <param name="formatOptions">Not used by this implementation.</param>
        /// <returns>Number of values written.</returns>
        public override int Read(BinaryReader reader, Thrift.SchemaElement tse, Array dest, int offset, ParquetOptions formatOptions)
        {
            string[] tdest = (string[])dest;
            Stream s = reader.BaseStream;

            // keep the length as long: narrowing to int wraps for streams above int.MaxValue bytes
            long totalLength = s.Length;
            int idx = offset;

            while (s.Position < totalLength && idx < tdest.Length)
            {
                string element = ReadOne(reader);
                tdest[idx++] = element;
            }

            return idx - offset;
        }