コード例 #1
0
ファイル: Schema.cs プロジェクト: slunyakin-zz/parquet-dotnet
 /// <summary>
 /// Looks up the schema element stored under the first entry of the column chunk's schema path.
 /// </summary>
 /// <param name="value">Column chunk whose <c>Meta_data.Path_in_schema</c> supplies the lookup key.</param>
 /// <returns>The element held in the path map for that first path entry.</returns>
 internal SchemaElement this[Thrift.ColumnChunk value]
 {
     get
     {
         // Only the first path entry is used as the key.
         string firstPathPart = value.Meta_data.Path_in_schema[0];
         return _pathToElement[firstPathPart];
     }
 }
コード例 #2
0
        /// <summary>
        /// Computes the maximum repetition and definition levels of a column by walking its
        /// schema path through the flat schema list held in the file metadata.
        /// </summary>
        /// <param name="columnChunk">Column chunk whose <c>Meta_data.Path_in_schema</c> is walked.</param>
        /// <param name="maxRepetitionLevel">
        /// Receives the number of matched path elements whose repetition type is set and equals REPEATED.
        /// </param>
        /// <param name="maxDefinitionLevel">
        /// Receives the number of matched path elements whose repetition type is not REQUIRED.
        /// </param>
        /// <remarks>
        /// A path part that cannot be found in the remaining schema elements contributes nothing
        /// to either level.
        /// </remarks>
        public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel)
        {
            maxRepetitionLevel = 0;
            maxDefinitionLevel = 0;

            List<string> path = columnChunk.Meta_data.Path_in_schema;
            int schemaCount = _fileMeta.Schema.Count;
            int i = 0;

            foreach (string pp in path)
            {
                while (i < schemaCount)
                {
                    Thrift.SchemaElement se = _fileMeta.Schema[i];

                    // Always move the cursor past the element being examined. Without this, a path
                    // such as "list.list.element" would match the same schema element for both
                    // "list" parts and never reach the nested (repeated) one.
                    i++;

                    if (se.Name != pp)
                    {
                        continue;
                    }

                    bool repeated = se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED;
                    bool required = se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED;

                    if (repeated)
                    {
                        maxRepetitionLevel += 1;
                    }
                    if (!required)
                    {
                        maxDefinitionLevel += 1;
                    }

                    break;
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Finds the schema element addressed by the column chunk's schema path by walking the
        /// path parts, in order, through the flat schema list held in the file metadata.
        /// </summary>
        /// <param name="columnChunk">Column chunk whose <c>Meta_data.Path_in_schema</c> is resolved.</param>
        /// <returns>
        /// The schema element matched by the last path part. For an empty path the first schema
        /// element is returned.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="columnChunk"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">A path part has no matching schema element.</exception>
        public Thrift.SchemaElement GetSchemaElement(Thrift.ColumnChunk columnChunk)
        {
            if (columnChunk == null)
            {
                throw new ArgumentNullException(nameof(columnChunk));
            }

            List<string> path = columnChunk.Meta_data.Path_in_schema;
            int schemaCount = _fileMeta.Schema.Count;

            int i = 0;          // next schema index to examine
            int matchIndex = 0; // index of the most recent match; stays 0 when the path is empty

            foreach (string pp in path)
            {
                while (i < schemaCount && _fileMeta.Schema[i].Name != pp)
                {
                    i++;
                }

                if (i >= schemaCount)
                {
                    // Previously this surfaced as an indexer failure with no hint about the path.
                    throw new ArgumentOutOfRangeException(
                        nameof(columnChunk),
                        $"cannot find schema element for path '{string.Join(".", path)}'");
                }

                matchIndex = i;

                // Step past the match so that a following path part with the same name
                // (e.g. "list.list") resolves to the nested element instead of this one again.
                i++;
            }

            return _fileMeta.Schema[matchIndex];
        }
コード例 #4
0
        /// <summary>
        /// Stores the supplied collaborators and wraps the input stream in a <c>ThriftStream</c>.
        /// </summary>
        /// <param name="inputStream">Stream the column data is read from.</param>
        /// <param name="thriftColumnChunk">Thrift column chunk this reader is bound to.</param>
        /// <param name="footer">Footer object kept for later use by the reader.</param>
        /// <param name="parquetOptions">Options kept for later use by the reader.</param>
        /// <exception cref="ArgumentNullException">Any argument is null.</exception>
        public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
        {
            // Validate in parameter order so the first null argument is the one reported.
            if (inputStream is null)
            {
                throw new ArgumentNullException(nameof(inputStream));
            }
            if (thriftColumnChunk is null)
            {
                throw new ArgumentNullException(nameof(thriftColumnChunk));
            }
            if (footer is null)
            {
                throw new ArgumentNullException(nameof(footer));
            }
            if (parquetOptions is null)
            {
                throw new ArgumentNullException(nameof(parquetOptions));
            }

            _inputStream = inputStream;
            _thriftColumnChunk = thriftColumnChunk;
            _footer = footer;
            _parquetOptions = parquetOptions;

            _thriftStream = new ThriftStream(inputStream);
        }
コード例 #5
0
        /// <summary>
        /// Stores the supplied collaborators and creates the plain values reader from the options.
        /// </summary>
        /// <param name="thriftChunk">Thrift column chunk this column is bound to.</param>
        /// <param name="schema">Schema element describing the column.</param>
        /// <param name="inputStream">Stream kept for reading the column data.</param>
        /// <param name="thriftStream">Thrift stream kept alongside the input stream.</param>
        /// <param name="options">Options stored and also passed to the plain values reader.</param>
        public PColumn(Thrift.ColumnChunk thriftChunk, SchemaElement schema, Stream inputStream, ThriftStream thriftStream, ParquetOptions options)
        {
            // Column description.
            _schema = schema;
            _thriftChunk = thriftChunk;

            // I/O handles.
            _inputStream = inputStream;
            _thrift = thriftStream;

            // Options and the reader configured from them.
            _options = options;
            _plainReader = new PlainValuesReader(options);
        }
コード例 #6
0
        /// <summary>
        /// Stores the supplied collaborators, wraps the input stream in a <c>ThriftStream</c>, and
        /// resolves the column's levels, schema element and data type handler through the footer.
        /// </summary>
        /// <param name="inputStream">Stream the column data is read from.</param>
        /// <param name="thriftColumnChunk">Thrift column chunk this reader is bound to.</param>
        /// <param name="footer">Footer used to obtain the levels and the schema element.</param>
        /// <param name="parquetOptions">Options passed to the data type handler lookup.</param>
        /// <exception cref="ArgumentNullException">Any argument is null.</exception>
        public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
        {
            // Validate in parameter order so the first null argument is the one reported.
            if (inputStream is null)
            {
                throw new ArgumentNullException(nameof(inputStream));
            }
            if (thriftColumnChunk is null)
            {
                throw new ArgumentNullException(nameof(thriftColumnChunk));
            }
            if (footer is null)
            {
                throw new ArgumentNullException(nameof(footer));
            }
            if (parquetOptions is null)
            {
                throw new ArgumentNullException(nameof(parquetOptions));
            }

            _inputStream = inputStream;
            _thriftColumnChunk = thriftColumnChunk;
            _footer = footer;
            _parquetOptions = parquetOptions;

            _thriftStream = new ThriftStream(inputStream);

            footer.GetLevels(thriftColumnChunk, out int repetitionLevel, out int definitionLevel);
            _maxRepetitionLevel = repetitionLevel;
            _maxDefinitionLevel = definitionLevel;

            _thriftSchemaElement = footer.GetSchemaElement(thriftColumnChunk);
            _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, parquetOptions);
        }
コード例 #7
0
        /// <summary>
        /// Computes the maximum repetition and definition levels of a column by walking its
        /// schema path through the flat schema list held in the file metadata. Results are
        /// memoized per path.
        /// </summary>
        /// <param name="columnChunk">Column chunk whose <c>Meta_data.Path_in_schema</c> is walked.</param>
        /// <param name="maxRepetitionLevel">
        /// Receives the number of matched path elements whose repetition type is set and equals REPEATED.
        /// </param>
        /// <param name="maxDefinitionLevel">
        /// Receives the number of matched path elements whose repetition type is not REQUIRED.
        /// </param>
        /// <remarks>
        /// A path part that cannot be found in the remaining schema elements contributes nothing
        /// to either level.
        /// </remarks>
        public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel)
        {
            maxRepetitionLevel = 0;
            maxDefinitionLevel = 0;

            List<string> path = columnChunk.Meta_data.Path_in_schema;

            // The path wrapped in a comparer is the memoization key.
            var comparer = new StringListComparer(path);

            if (_memoizedLevels.TryGetValue(comparer, out Tuple<int, int> cached))
            {
                maxRepetitionLevel = cached.Item1;
                maxDefinitionLevel = cached.Item2;
                return;
            }

            int fieldCount = _fileMeta.Schema.Count;
            int i = 0;

            foreach (string pp in path)
            {
                while (i < fieldCount)
                {
                    Thrift.SchemaElement se = _fileMeta.Schema[i];

                    // Always move the cursor past the element being examined. Without this, a path
                    // such as "list.list.element" would match the same schema element for both
                    // "list" parts, and the wrong levels would then be cached.
                    i++;

                    if (string.CompareOrdinal(se.Name, pp) != 0)
                    {
                        continue;
                    }

                    bool repeated = se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED;
                    bool required = se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED;

                    if (repeated)
                    {
                        maxRepetitionLevel += 1;
                    }
                    if (!required)
                    {
                        maxDefinitionLevel += 1;
                    }

                    break;
                }
            }

            _memoizedLevels.Add(comparer, Tuple.Create(maxRepetitionLevel, maxDefinitionLevel));
        }
コード例 #8
0
ファイル: Schema.cs プロジェクト: slunyakin-zz/parquet-dotnet
        /// <summary>
        /// Counts how many entries of the column chunk's schema path map to an element whose
        /// repetition type is not REQUIRED.
        /// </summary>
        /// <param name="cc">Column chunk whose <c>Meta_data.Path_in_schema</c> is inspected.</param>
        /// <returns>The number of non-REQUIRED elements along the path.</returns>
        internal int GetMaxDefinitionLevel(Thrift.ColumnChunk cc)
        {
            var pathParts = cc.Meta_data.Path_in_schema;
            int level = 0;

            for (int idx = 0; idx < pathParts.Count; idx++)
            {
                // Each path part is looked up on its own in the path map.
                SchemaElement current = _pathToElement[pathParts[idx]];
                bool required = current.Thrift.Repetition_type == Thrift.FieldRepetitionType.REQUIRED;

                if (!required)
                {
                    level++;
                }
            }

            return level;
        }
コード例 #9
0
        /// <summary>
        /// Stores the supplied collaborators, resolves the column's schema element from the schema,
        /// and creates the plain values reader from the options.
        /// </summary>
        /// <param name="thriftChunk">Thrift column chunk; its schema path must contain exactly one entry.</param>
        /// <param name="schema">Schema used to resolve the chunk's schema element.</param>
        /// <param name="inputStream">Stream kept for reading the column data.</param>
        /// <param name="thriftStream">Thrift stream kept alongside the input stream.</param>
        /// <param name="options">Options stored and also passed to the plain values reader.</param>
        /// <exception cref="NotImplementedException">The chunk's schema path does not have exactly one entry.</exception>
        public PColumn(Thrift.ColumnChunk thriftChunk, Schema schema, Stream inputStream, ThriftStream thriftStream, ParquetOptions options)
        {
            int pathLength = thriftChunk.Meta_data.Path_in_schema.Count;
            if (pathLength != 1)
            {
                throw new NotImplementedException("path in scheme is not flat");
            }

            _schema = schema;
            _thriftChunk = thriftChunk;
            _inputStream = inputStream;
            _thrift = thriftStream;
            _options = options;

            // Resolve the element first, then build the reader, as the two calls may each fail.
            _schemaElement = schema[thriftChunk];
            _plainReader = new PlainValuesReader(options);
        }
コード例 #10
0
        /// <summary>
        /// Creates a column chunk for the given path, writes the column's pages, and fills in the
        /// chunk's value count and size totals.
        /// </summary>
        /// <param name="tse">Thrift schema element of the column; its type is used for the chunk.</param>
        /// <param name="path">Schema path assigned to the new chunk.</param>
        /// <param name="column">Column data to write.</param>
        /// <param name="dataTypeHandler">Handler passed through to the page writer.</param>
        /// <returns>The populated column chunk.</returns>
        private Thrift.ColumnChunk WriteColumnChunk(Thrift.SchemaElement tse, List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk result = _footer.CreateColumnChunk(_compressionMethod, _stream, tse.Type, path, 0);
            Thrift.PageHeader pageHeader = _footer.CreateDataPage(_rowCount);

            _footer.GetLevels(result, out int repetitionLevel, out int definitionLevel);

            List<PageTag> writtenPages = WriteColumn(column, tse, dataTypeHandler, repetitionLevel, definitionLevel);

            // The value count is taken from the data page header created above.
            result.Meta_data.Num_values = pageHeader.Data_page_header.Num_values;

            // Both totals must include the page header size as well as the data size.
            var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
            var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);

            result.Meta_data.Total_compressed_size = compressedTotal;
            result.Meta_data.Total_uncompressed_size = uncompressedTotal;

            return result;
        }
コード例 #11
0
        /// <summary>
        /// Writes the next column of the row group: takes the next Thrift schema element in
        /// sequence, writes the column as a chunk, and appends the chunk to the row group.
        /// </summary>
        /// <param name="column">Column data to write.</param>
        /// <exception cref="ArgumentNullException"><paramref name="column"/> is null.</exception>
        public void Write(DataColumn column)
        {
            if (column is null)
            {
                throw new ArgumentNullException(nameof(column));
            }

            // Columns are consumed strictly in schema order; the index advances on every call.
            // TODO: check if the column is in the right order.
            Thrift.SchemaElement schemaElement = _thschema[_colIdx++];
            IDataTypeHandler handler = DataTypeFactory.Match(schemaElement, _formatOptions);

            List<string> schemaPath = _footer.GetPath(schemaElement);

            _thriftRowGroup.Columns.Add(WriteColumnChunk(schemaElement, schemaPath, column, handler));
        }
コード例 #12
0
        /// <summary>
        /// Adds a column chunk for this writer's schema, writes the values as pages, and records
        /// the resulting size totals on the chunk.
        /// </summary>
        /// <param name="offset">Not read by this method body.</param>
        /// <param name="count">Not read by this method body.</param>
        /// <param name="values">
        /// Values to write; when null, a list obtained from <c>TypeFactory.Create</c> is used instead.
        /// </param>
        /// <returns>The populated column chunk.</returns>
        public Thrift.ColumnChunk Write(int offset, int count, IList values)
        {
            if (values is null)
            {
                values = TypeFactory.Create(_schema.ElementType, _schema.IsNullable, _schema.IsRepeated);
            }

            int valueCount = values.Count;

            Thrift.ColumnChunk result = _meta.AddColumnChunk(_compressionMethod, _output, _schema, valueCount);
            Thrift.PageHeader pageHeader = _meta.CreateDataPage(values.Count);

            List<PageTag> writtenPages = WriteValues(_schema, values, pageHeader, _compressionMethod);

            // Both totals must include the page header size as well as the data size.
            var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
            var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);

            result.Meta_data.Total_compressed_size = compressedTotal;
            result.Meta_data.Total_uncompressed_size = uncompressedTotal;

            return result;
        }
コード例 #13
0
        /// <summary>
        /// Creates a column chunk for the given path, writes the column's pages, and fills in the
        /// chunk's value count and size totals.
        /// </summary>
        /// <param name="path">Schema path assigned to the new chunk.</param>
        /// <param name="column">Column data to write; its total count sizes the data page header.</param>
        /// <param name="dataTypeHandler">Handler passed through to the page writer.</param>
        /// <returns>The populated column chunk.</returns>
        public Thrift.ColumnChunk Write(List <string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
        {
            Thrift.ColumnChunk result = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
            Thrift.PageHeader pageHeader = _footer.CreateDataPage(column.TotalCount);

            _footer.GetLevels(result, out int repetitionLevel, out int definitionLevel);

            List<PageTag> writtenPages = WriteColumn(column, _schemaElement, dataTypeHandler, repetitionLevel, definitionLevel);

            // Must be the number of all values in the column, nulls included. For hierarchical or
            // repeated columns that is the size of the flattened list, nulls included.
            result.Meta_data.Num_values = pageHeader.Data_page_header.Num_values;

            // Both totals must include the page header size as well as the data size.
            var compressedTotal = writtenPages.Sum(page => page.HeaderMeta.Compressed_page_size + page.HeaderSize);
            var uncompressedTotal = writtenPages.Sum(page => page.HeaderMeta.Uncompressed_page_size + page.HeaderSize);

            result.Meta_data.Total_compressed_size = compressedTotal;
            result.Meta_data.Total_uncompressed_size = uncompressedTotal;

            return result;
        }
コード例 #14
0
        /// <summary>
        /// Looks up the schema element for a column chunk by its full schema path, joined with
        /// <c>PathSeparator</c>. The path cache is built on first use.
        /// </summary>
        /// <param name="value">Column chunk whose <c>Meta_data.Path_in_schema</c> forms the lookup key.</param>
        /// <returns>The schema element registered under the joined path.</returns>
        /// <exception cref="ArgumentException">No element is registered under the joined path.</exception>
        internal SchemaElement this[Thrift.ColumnChunk value]
        {
            get
            {
                string joinedPath = string.Join(PathSeparator, value.Meta_data.Path_in_schema);

                // Build the cache lazily on the first lookup.
                if (_pathToElement == null)
                {
                    BuildPathCache();
                }

                if (_pathToElement.TryGetValue(joinedPath, out SchemaElement found))
                {
                    return found;
                }

                throw new ArgumentException($"cannot find schema element by path '{joinedPath}'", nameof(value));
            }
        }
コード例 #15
0
        /// <summary>
        /// Prints a "Statistics" heading followed by either a "none defined" notice or a two-column
        /// table with null count, distinct count, min and max.
        /// </summary>
        /// <param name="fileMeta">File metadata used to decode the min and max values.</param>
        /// <param name="column">Column chunk passed to the decoder together with the raw value.</param>
        /// <param name="stats">Statistics to print; may be null.</param>
        private void PrintStatistics(Thrift.FileMetaData fileMeta, Thrift.ColumnChunk column, Thrift.Statistics stats)
        {
            WriteLine("    Statistics", T.HeadingTextColor);

            // At least one of the four statistics must be set for the table to be worth printing.
            bool anyDefined = stats != null
                              && (stats.__isset.null_count
                                  || stats.__isset.distinct_count
                                  || stats.__isset.min
                                  || stats.__isset.max);

            if (!anyDefined)
            {
                WriteLine("      none defined", T.ErrorTextColor);
                return;
            }

            // Shown in place of any individual statistic that is not set.
            const string placeholder = "undefined";

            var table = new Table("name", "value");

            table.AddRow("Null Count", stats.__isset.null_count ? stats.Null_count.ToString() : placeholder);
            table.AddRow("Distinct Count", stats.__isset.distinct_count ? stats.Distinct_count.ToString() : placeholder);
            table.AddRow("Min", stats.__isset.min ? fileMeta.DecodeSingleStatsValue(column, stats.Min) : placeholder);
            table.AddRow("Max", stats.__isset.max ? fileMeta.DecodeSingleStatsValue(column, stats.Max) : placeholder);

            table.Render(false, 6, T.HeadingTextColor, T.NormalTextColor);
        }
コード例 #16
0
        /// <summary>
        /// Stores the supplied collaborators, resolves the data type handler for the schema
        /// element, and prepares the column chunk, data page header and levels through the footer.
        /// </summary>
        /// <param name="output">Stream the column is written to.</param>
        /// <param name="thriftStream">Thrift stream kept alongside the output stream.</param>
        /// <param name="footer">Footer used to create the chunk and page header and to obtain the levels.</param>
        /// <param name="tse">Thrift schema element of the column.</param>
        /// <param name="path">Schema path assigned to the column chunk.</param>
        /// <param name="compressionMethod">Compression method used for the chunk.</param>
        /// <param name="formatOptions">Options passed to the data type handler lookup.</param>
        /// <param name="writerOptions">Writer options kept for later use.</param>
        public ColumnarWriter(Stream output, ThriftStream thriftStream,
                              ThriftFooter footer,
                              Thrift.SchemaElement tse, List <string> path,
                              CompressionMethod compressionMethod,
                              ParquetOptions formatOptions,
                              WriterOptions writerOptions)
        {
            // Plain state.
            _output = output;
            _thriftStream = thriftStream;
            _footer = footer;
            _tse = tse;
            _compressionMethod = compressionMethod;
            _formatOptions = formatOptions;
            _writerOptions = writerOptions;

            // Derived state, resolved in the same order as the calls depend on each other.
            _dataTypeHandler = DataTypeFactory.Match(tse, formatOptions);
            _chunk = footer.CreateColumnChunk(compressionMethod, output, tse.Type, path, 0);
            _ph = footer.CreateDataPage(0);

            footer.GetLevels(_chunk, out int repetitionLevel, out int definitionLevel);
            _maxRepetitionLevel = repetitionLevel;
            _maxDefinitionLevel = definitionLevel;
        }
コード例 #17
0
        /// <summary>
        /// Builds a column chunk positioned at the output stream's current position, with its
        /// metadata taken from the schema element and the requested compression.
        /// </summary>
        /// <param name="compression">Compression method, translated to the Thrift codec.</param>
        /// <param name="output">Stream whose current position becomes the chunk and data page offset.</param>
        /// <param name="schema">Schema element supplying the Thrift type and the path.</param>
        /// <param name="valuesCount">Value count recorded in the chunk metadata.</param>
        /// <returns>The new column chunk.</returns>
        public Thrift.ColumnChunk AddColumnChunk(CompressionMethod compression, Stream output, SchemaElement schema, int valuesCount)
        {
            Thrift.CompressionCodec thriftCodec = DataFactory.GetThriftCompression(compression);
            long position = output.Position;

            return new Thrift.ColumnChunk
            {
                File_offset = position,
                Meta_data = new Thrift.ColumnMetaData
                {
                    Num_values = valuesCount,
                    Type = schema.Thrift.Type,
                    Codec = thriftCodec,
                    Data_page_offset = position,
                    Encodings = new List<Thrift.Encoding>
                    {
                        Thrift.Encoding.RLE,
                        Thrift.Encoding.BIT_PACKED,
                        Thrift.Encoding.PLAIN
                    },
                    // The schema path string is split back into its individual parts.
                    Path_in_schema = new List<string>(schema.Path.Split(Schema.PathSeparatorChar))
                }
            };
        }
コード例 #18
0
        /// <summary>
        /// Builds a column chunk positioned at the output stream's current position, with the
        /// given column type, schema path and compression.
        /// </summary>
        /// <param name="compression">Compression method, translated to the Thrift codec.</param>
        /// <param name="output">Stream whose current position becomes the chunk and data page offset.</param>
        /// <param name="columnType">Thrift type recorded in the chunk metadata.</param>
        /// <param name="path">Schema path stored on the chunk (the same list instance, not a copy).</param>
        /// <param name="valuesCount">Value count recorded in the chunk metadata.</param>
        /// <returns>The new column chunk.</returns>
        public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, Stream output, Thrift.Type columnType, List <string> path, int valuesCount)
        {
            Thrift.CompressionCodec thriftCodec = DataFactory.GetThriftCompression(compression);
            long position = output.Position;

            return new Thrift.ColumnChunk
            {
                File_offset = position,
                Meta_data = new Thrift.ColumnMetaData
                {
                    Num_values = valuesCount,
                    Type = columnType,
                    Codec = thriftCodec,
                    Data_page_offset = position,
                    Encodings = new List<Thrift.Encoding>
                    {
                        Thrift.Encoding.RLE,
                        Thrift.Encoding.BIT_PACKED,
                        Thrift.Encoding.PLAIN
                    },
                    Path_in_schema = path
                }
            };
        }
コード例 #19
0
 /// <summary>
 /// Reports whether the given column chunk and path satisfy this instance's matching rule.
 /// The rule itself is supplied by each overriding implementation.
 /// </summary>
 /// <param name="columnChunk">Column chunk under consideration.</param>
 /// <param name="path">Path text under consideration (presumably the column's schema path; confirm against callers).</param>
 /// <returns><c>true</c> when the implementation considers it a match; otherwise <c>false</c>.</returns>
 public abstract bool IsMatch(Thrift.ColumnChunk columnChunk, string path);
コード例 #20
0
 /// <summary>
 /// Reports whether <paramref name="path"/> equals the configured column name or starts with
 /// the configured prefix.
 /// </summary>
 /// <param name="columnChunk">Not consulted by this implementation.</param>
 /// <param name="path">Path text to test.</param>
 /// <returns><c>true</c> on an exact name match or a prefix match; otherwise <c>false</c>.</returns>
 public override bool IsMatch(Thrift.ColumnChunk columnChunk, string path)
 {
     // Paths are identifiers, not linguistic text: compare ordinally so the result does not
     // depend on the current culture (the single-argument StartsWith is culture-sensitive).
     return path == _columnName || path.StartsWith(_prefix, System.StringComparison.Ordinal);
 }
コード例 #21
0
 /// <summary>
 /// Reports whether <paramref name="path"/> equals the configured column name or starts with
 /// the configured prefix, ignoring case for the prefix test.
 /// </summary>
 /// <param name="columnChunk">Not consulted by this implementation.</param>
 /// <param name="path">Path text to test.</param>
 /// <returns><c>true</c> on an exact name match or a case-insensitive prefix match; otherwise <c>false</c>.</returns>
 public override bool IsMatch(Thrift.ColumnChunk columnChunk, string path)
 {
     // Paths are identifiers, not linguistic text: use an ordinal case-insensitive comparison so
     // the result is the same on every machine regardless of the current culture's casing rules.
     return path == _columnName || path.StartsWith(_prefix, System.StringComparison.OrdinalIgnoreCase);
 }