public IList Read(long offset, long count)
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(
      _maxDefinitionLevel,
      _maxRepetitionLevel,
      () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
      values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0))
      .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

   mergedValues.Trim((int)offset, (int)count);

   return mergedValues;
}
public void Read(long offset, long count)
{
   Thrift.SchemaElement tse = _footer.GetSchemaElement(_thriftColumnChunk);
   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _parquetOptions);

   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, dataTypeHandler, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(dataTypeHandler, ph, tse, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   //IList mergedValues = new ValueMerger(_schema, values)
   //   .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   // todo: this is a simple hack for trivial tests to succeed
   return new DataColumn(_dataField, values, definitions, repetitions);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);
   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(colData.indexes == null ? 0 : colData.indexesOffset, colData.values == null ? 0 : colData.values.Length);
      ReadDataPage(ph, colData, maxValues - valuesSoFar);

      pagesRead++;

      int totalCount = Math.Max(
         (colData.values == null ? 0 : colData.values.Length) +
         (colData.indexes == null ? 0 : colData.indexesOffset),
         (colData.definitions == null ? 0 : colData.definitions.Length));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   return new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);
}
public DataColumn Read()
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);
   ParquetEventSource.Current.SeekColumn(_dataField.Path, fileOffset);

   var colData = new ColumnRawData();
   colData.maxCount = (int)_thriftColumnChunk.Meta_data.Num_values;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out colData.dictionary, out colData.dictionaryOffset))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      ReadDataPage(ph, colData, maxValues);

      pagesRead++;

      /*int totalValueCount =
         (colData.dictionary == null ? 0 : colData.indexesOffset) +
         (colData.values == null ? 0 : colData.valuesOffset);

      bool exhaused =
         (totalValueCount >= maxValues) &&
         (colData.definitions == null || colData.definitionsOffset >= maxValues);

      if (exhaused)
         break;*/

      int totalCount = Math.Max(
         (colData.values == null ? 0 : colData.valuesOffset),
         (colData.definitions == null ? 0 : colData.definitionsOffset));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   // all the data is available here!

   var finalColumn = new DataColumn(
      _dataField, colData.values,
      colData.definitions, _maxDefinitionLevel,
      colData.repetitions, _maxRepetitionLevel,
      colData.dictionary,
      colData.indexes);

   if (_thriftColumnChunk.Meta_data.Statistics != null)
   {
      finalColumn.Statistics = new DataColumnStatistics(
         _thriftColumnChunk.Meta_data.Statistics.Null_count,
         _thriftColumnChunk.Meta_data.Statistics.Distinct_count,
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Min_value),
         _dataTypeHandler.PlainDecode(_thriftSchemaElement, _thriftColumnChunk.Meta_data.Statistics.Max_value));
   }

   return finalColumn;
}
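// Usage sketch (not part of the reader itself): the DataColumn produced by the internal Read() above
// is normally obtained through the public Parquet.Net 3.x API, where ParquetRowGroupReader.ReadColumn
// drives this column reader. The file name below is a placeholder; error handling is omitted.
using System;
using System.IO;
using Parquet;
using Parquet.Data;

public static class ColumnReadExample
{
   public static void Main()
   {
      using (Stream fs = File.OpenRead("data.parquet"))
      using (var reader = new ParquetReader(fs))
      {
         DataField[] fields = reader.Schema.GetDataFields();

         for (int g = 0; g < reader.RowGroupCount; g++)
         {
            using (ParquetRowGroupReader rowGroup = reader.OpenRowGroupReader(g))
            {
               //each call reads one column chunk: the optional dictionary page followed by its data pages
               DataColumn column = rowGroup.ReadColumn(fields[0]);
               Console.WriteLine($"{column.Field.Name}: {column.Data.Length} values");
            }
         }
      }
   }
}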
public IList Read(long offset, long count)
{
   IList values = TypeFactory.Create(_schema, _options);

   //get the minimum offset, we'll just read pages in sequence
   long fileOffset = new[] { _thriftChunk.Meta_data.Dictionary_page_offset, _thriftChunk.Meta_data.Data_page_offset }.Where(e => e != 0).Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;
   List<int> repetitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      PageData page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      indexes = AssignOrAdd(indexes, page.indexes);
      definitions = AssignOrAdd(definitions, page.definitions);
      repetitions = AssignOrAdd(repetitions, page.repetitions);

      dataPageCount++;

      int totalCount = Math.Max(
         values.Count + (indexes == null ? 0 : indexes.Count),
         definitions == null ? 0 : definitions.Count);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = ReadDataPageHeader(dataPageCount);   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schema, _options, values)
      .Apply(dictionaryPage, definitions, repetitions, indexes, (int)maxValues);

   //todo: this won't work for nested arrays
   ValueMerger.Trim(mergedValues, (int)offset, (int)count);

   return mergedValues;
}
public IList Read(string columnName)
{
   IList values = TypeFactory.Create(_schemaElement);

   //get the minimum offset, we'll just read pages in sequence
   long offset = new[] { _thriftChunk.Meta_data.Dictionary_page_offset, _thriftChunk.Meta_data.Data_page_offset }.Where(e => e != 0).Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(offset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      var page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      //merge indexes
      if (page.indexes != null)
      {
         if (indexes == null)
         {
            indexes = page.indexes;
         }
         else
         {
            indexes.AddRange(page.indexes);
         }
      }

      if (page.definitions != null)
      {
         if (definitions == null)
         {
            definitions = (List<int>)page.definitions;
         }
         else
         {
            definitions.AddRange((List<int>)page.definitions);
         }
      }

      dataPageCount++;

      if (page.repetitions != null)
      {
         throw new NotImplementedException();
      }

      if ((values.Count >= maxValues) ||
         (indexes != null && indexes.Count >= maxValues) ||
         (definitions != null && definitions.Count >= maxValues))
      {
         break;   //limit reached
      }

      /*IList acc1 = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);
      dictionaryPage = null;
      definitions = null;
      indexes = null;
      values.Clear();
      foreach (var el in acc1) acc.Add(el);*/

      ph = _thrift.Read<Thrift.PageHeader>();   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);

   return mergedValues;
}