private (ICollection definitions, ICollection repetitions, List<int> indexes) ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
   byte[] data = ReadRawBytes(ph, _inputStream);

   using (var dataStream = new MemoryStream(data))
   {
      using (var reader = new BinaryReader(dataStream))
      {
         //todo: read repetition levels (only relevant for nested columns)

         //check if there are definitions at all
         bool hasDefinitions = _schemaElement.Thrift.Repetition_type == Thrift.FieldRepetitionType.OPTIONAL;
         List<int> definitions = hasDefinitions
            ? ReadDefinitionLevels(reader, (int)maxValues)
            : null;

         //these are pointers back to the Values table - lookup on values
         List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, maxValues);

         //trim output if it exceeds max number of values
         int numValues = ph.Data_page_header.Num_values;
         if (definitions != null)
         {
            ValueMerger.Trim(definitions, numValues);
         }
         if (indexes != null)
         {
            ValueMerger.Trim(indexes, numValues);
         }

         return (definitions, null, indexes);
      }
   }
}
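// ReadRawBytes is called above but not shown in this listing. The sketch below is a
// hedged illustration of what it plausibly does, assuming pages are either stored
// uncompressed or GZIP-compressed; the real reader would dispatch on the column
// chunk's codec rather than comparing sizes. Field names follow the generated Thrift
// naming convention seen elsewhere here. Requires System.IO and System.IO.Compression.
private static byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
{
   byte[] data = new byte[ph.Compressed_page_size];
   inputStream.Read(data, 0, data.Length);

   //assumption: equal sizes mean the page body was not compressed
   if (ph.Compressed_page_size == ph.Uncompressed_page_size)
   {
      return data;
   }

   using (var source = new MemoryStream(data))
   using (var gzip = new GZipStream(source, CompressionMode.Decompress))
   using (var destination = new MemoryStream())
   {
      gzip.CopyTo(destination);   //inflate the page body
      return destination.ToArray();
   }
}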
public IList Read(long offset, long count)
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(
         _maxDefinitionLevel,
         _maxRepetitionLevel,
         () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
         values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0))
      .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

   mergedValues.Trim((int)offset, (int)count);

   return mergedValues;
}
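// AssignOrAdd is used above to accumulate per-page lists across the read loop, but its
// body is not part of this listing. A minimal sketch of the implied semantics (an
// assumption, not confirmed source): adopt the first page's list, append later pages.
private static List<int> AssignOrAdd(List<int> target, List<int> pageData)
{
   if (pageData == null) return target;   //page contributed nothing
   if (target == null) return pageData;   //first page: adopt its list directly
   target.AddRange(pageData);             //later pages: append in order
   return target;
}

// the values accumulator is a non-generic IList, so an overload along these lines
// would be needed as well
private static IList AssignOrAdd(IList target, IList pageData)
{
   if (pageData == null) return target;
   if (target == null) return pageData;
   foreach (object value in pageData)
   {
      target.Add(value);
   }
   return target;
}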
private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
   byte[] data = ReadRawBytes(ph, _inputStream);
   int max = ph.Data_page_header.Num_values;

   using (var dataStream = new MemoryStream(data))
   {
      using (var reader = new BinaryReader(dataStream))
      {
         List<int> repetitions = _schema.HasRepetitionLevelsPage
            ? ReadRepetitionLevels(reader)
            : null;

         List<int> definitions = _schema.HasDefinitionLevelsPage
            ? ReadDefinitionLevels(reader)
            : null;

         //these are pointers back to the Values table - lookup on values
         List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, max);

         //trim output if it exceeds max number of values
         int numValues = ph.Data_page_header.Num_values;
         if (!_schema.IsRepeated)
         {
            if (repetitions != null)
            {
               ValueMerger.TrimTail(repetitions, numValues);
            }
            if (definitions != null)
            {
               ValueMerger.TrimTail(definitions, numValues);
            }
            if (indexes != null)
            {
               ValueMerger.TrimTail(indexes, numValues);
            }
         }

         return new PageData { definitions = definitions, repetitions = repetitions, indexes = indexes };
      }
   }
}
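// PageData is the carrier returned by ReadDataPage. Its declaration is not shown in
// this listing; the sketch below is inferred from the fields accessed above and is an
// assumption about shape, not the library's actual definition.
class PageData
{
   public List<int> definitions;   //definition levels; null when the column is required
   public List<int> repetitions;   //repetition levels; null when the column is not repeated
   public List<int> indexes;       //dictionary indexes; null for plain-encoded pages
   public IList values;            //materialised values (consumed by the Read variant above)
}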
public IList Read(long offset, long count)
{
   IList values = TypeFactory.Create(_schema, _options);

   //get the minimum offset, we'll just read pages in sequence
   long fileOffset = new[]
   {
      _thriftChunk.Meta_data.Dictionary_page_offset,
      _thriftChunk.Meta_data.Data_page_offset
   }
   .Where(e => e != 0)
   .Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;
   List<int> repetitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;
   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      PageData page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      indexes = AssignOrAdd(indexes, page.indexes);
      definitions = AssignOrAdd(definitions, page.definitions);
      repetitions = AssignOrAdd(repetitions, page.repetitions);

      dataPageCount++;

      int totalCount = Math.Max(
         values.Count + (indexes == null ? 0 : indexes.Count),
         definitions == null ? 0 : definitions.Count);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = ReadDataPageHeader(dataPageCount);   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schema, _options, values)
      .Apply(dictionaryPage, definitions, repetitions, indexes, (int)maxValues);

   //todo: this won't work for nested arrays
   ValueMerger.Trim(mergedValues, (int)offset, (int)count);

   return mergedValues;
}
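// ValueMerger.Trim(mergedValues, offset, count) windows the merged column to the
// slice the caller asked for. A hedged sketch of that behaviour (illustrative only;
// as the todo above notes, a real implementation must also handle nested arrays):
private static void Trim(IList list, int offset, int count)
{
   //drop everything before the requested offset
   while (offset-- > 0 && list.Count > 0)
   {
      list.RemoveAt(0);
   }

   //drop everything past the requested count
   while (list.Count > count)
   {
      list.RemoveAt(list.Count - 1);
   }
}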
public IList Read(string columnName)
{
   IList values = TypeFactory.Create(_schemaElement);

   //get the minimum offset, we'll just read pages in sequence
   long offset = new[]
   {
      _thriftChunk.Meta_data.Dictionary_page_offset,
      _thriftChunk.Meta_data.Data_page_offset
   }
   .Where(e => e != 0)
   .Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(offset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;
   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      var page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      //merge indexes
      if (page.indexes != null)
      {
         if (indexes == null)
         {
            indexes = page.indexes;
         }
         else
         {
            indexes.AddRange(page.indexes);
         }
      }

      //merge definitions
      if (page.definitions != null)
      {
         if (definitions == null)
         {
            definitions = (List<int>)page.definitions;
         }
         else
         {
            definitions.AddRange((List<int>)page.definitions);
         }
      }

      dataPageCount++;

      //repetition levels (nested columns) are not supported by this variant
      if (page.repetitions != null)
      {
         throw new NotImplementedException();
      }

      if ((values.Count >= maxValues) ||
          (indexes != null && indexes.Count >= maxValues) ||
          (definitions != null && definitions.Count >= maxValues))
      {
         break;   //limit reached
      }

      /*IList acc1 = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);
      dictionaryPage = null;
      definitions = null;
      indexes = null;
      values.Clear();
      foreach (var el in acc1) acc.Add(el);*/

      ph = _thrift.Read<Thrift.PageHeader>();   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schemaElement, values)
      .Apply(dictionaryPage, definitions, indexes, maxValues);

   return mergedValues;
}
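// The final Apply call resolves dictionary indexes back into concrete values - the
// "pointers back to the Values table" mentioned in ReadDataPage. A minimal sketch of
// that lookup step, as an illustration rather than ValueMerger's actual code:
private static IList ResolveDictionaryIndexes(IList dictionaryPage, List<int> indexes, IList destination)
{
   foreach (int index in indexes)
   {
      //each index points at an entry in the dictionary page read earlier
      destination.Add(dictionaryPage[index]);
   }
   return destination;
}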