public IList Read(long offset, long count)
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(
      _maxDefinitionLevel,
      _maxRepetitionLevel,
      () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
      values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0))
      .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

   mergedValues.Trim((int)offset, (int)count);

   return mergedValues;
}
public void Read(long offset, long count)
{
   Thrift.SchemaElement tse = _footer.GetSchemaElement(_thriftColumnChunk);
   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _parquetOptions);

   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, dataTypeHandler, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(dataTypeHandler, ph, tse, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   //IList mergedValues = new ValueMerger(_schema, values)
   //   .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   // todo: this is a simple hack for trivial tests to succeed
   return new DataColumn(_dataField, values, definitions, repetitions);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);
   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(colData.indexes == null ? 0 : colData.indexesOffset, colData.values == null ? 0 : colData.values.Length);
      ReadDataPage(ph, colData, maxValues - valuesSoFar);

      pagesRead++;

      int totalCount = Math.Max(
         (colData.values == null ? 0 : colData.values.Length) +
         (colData.indexes == null ? 0 : colData.indexesOffset),
         (colData.definitions == null ? 0 : colData.definitions.Length));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   return new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);
   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      ReadDataPage(ph, colData, maxValues);

      pagesRead++;

      /*int totalValueCount =
         (colData.dictionary == null ? 0 : colData.indexesOffset) +
         (colData.values == null ? 0 : colData.valuesOffset);

      bool exhaused =
         (totalValueCount >= maxValues) &&
         (colData.definitions == null || colData.definitionsOffset >= maxValues);

      if (exhaused)
         break;*/

      int totalCount = Math.Max(
         (colData.values == null ? 0 : colData.valuesOffset),
         (colData.definitions == null ? 0 : colData.definitionsOffset));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   var finalColumn = new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);

   if (_thriftColumnChunk.Meta_data.Statistics != null)
   {
      finalColumn.Statistics = new DataColumnStatistics(
         _thriftColumnChunk.Meta_data.Statistics.Null_count,
         _thriftColumnChunk.Meta_data.Statistics.Distinct_count,
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Min_value),
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Max_value));
   }

   return finalColumn;
}
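// Usage sketch (not part of the reader itself): the DataColumn produced by the internal Read() above
// is normally obtained through the public Parquet.Net 3.x API, where ParquetRowGroupReader.ReadColumn
// drives this column reader. The file name below is a placeholder; error handling is omitted.
using System;
using System.IO;
using Parquet;
using Parquet.Data;

public static class ColumnReadExample
{
   public static void Main()
   {
      using (Stream fs = File.OpenRead("data.parquet"))
      using (var reader = new ParquetReader(fs))
      {
         DataField[] fields = reader.Schema.GetDataFields();

         for (int g = 0; g < reader.RowGroupCount; g++)
         {
            using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(g))
            {
               //each call reads one column chunk: the optional dictionary page followed by its data pages
               DataColumn column = rowGroup.ReadColumn(fields[0]);
               Console.WriteLine($"{column.Field.Name}: {column.Data.Length} values");
            }
         }
      }
   }
}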
public IList Read(long offset, long count)
{
   IList values = TypeFactory.Create(_schema, _options);

   //get the minimum offset, we'll just read pages in sequence
   long fileOffset = new[] { _thriftChunk.Meta_data.Dictionary_page_offset, _thriftChunk.Meta_data.Data_page_offset }.Where(e => e != 0).Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;
   List<int> repetitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      PageData page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      indexes = AssignOrAdd(indexes, page.indexes);
      definitions = AssignOrAdd(definitions, page.definitions);
      repetitions = AssignOrAdd(repetitions, page.repetitions);

      dataPageCount++;

      int totalCount = Math.Max(
         values.Count + (indexes == null ? 0 : indexes.Count),
         definitions == null ? 0 : definitions.Count);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = ReadDataPageHeader(dataPageCount);   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schema, _options, values)
      .Apply(dictionaryPage, definitions, repetitions, indexes, (int)maxValues);

   //todo: this won't work for nested arrays
   ValueMerger.Trim(mergedValues, (int)offset, (int)count);

   return mergedValues;
}
public IList Read(string columnName)
{
   IList values = TypeFactory.Create(_schemaElement);

   //get the minimum offset, we'll just read pages in sequence
   long offset = new[] { _thriftChunk.Meta_data.Dictionary_page_offset, _thriftChunk.Meta_data.Data_page_offset }.Where(e => e != 0).Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(offset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      var page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      //merge indexes
      if (page.indexes != null)
      {
         if (indexes == null)
         {
            indexes = page.indexes;
         }
         else
         {
            indexes.AddRange(page.indexes);
         }
      }

      if (page.definitions != null)
      {
         if (definitions == null)
         {
            definitions = (List<int>)page.definitions;
         }
         else
         {
            definitions.AddRange((List<int>)page.definitions);
         }
      }

      dataPageCount++;

      if (page.repetitions != null)
      {
         throw new NotImplementedException();
      }

      if ((values.Count >= maxValues) ||
         (indexes != null && indexes.Count >= maxValues) ||
         (definitions != null && definitions.Count >= maxValues))
      {
         break;   //limit reached
      }

      /*IList acc1 = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);
      dictionaryPage = null;
      definitions = null;
      indexes = null;
      values.Clear();
      foreach (var el in acc1) acc.Add(el);*/

      ph = _thrift.Read<Thrift.PageHeader>();   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);

   return mergedValues;
}