private (ICollection definitions, ICollection repetitions, List<int> indexes) ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
   byte[] data = ReadRawBytes(ph, _inputStream);

   using (var dataStream = new MemoryStream(data))
   {
      using (var reader = new BinaryReader(dataStream))
      {
         //todo: read repetition levels (only relevant for nested columns)

         //check if there are definitions at all
         bool hasDefinitions = _schemaElement.Thrift.Repetition_type == Thrift.FieldRepetitionType.OPTIONAL;
         List<int> definitions = hasDefinitions
            ? ReadDefinitionLevels(reader, (int)maxValues)
            : null;

         //these are pointers back to the Values table - lookup on values
         List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, maxValues);

         //trim output if it exceeds max number of values
         int numValues = ph.Data_page_header.Num_values;
         if (definitions != null)
         {
            ValueMerger.Trim(definitions, numValues);
         }
         if (indexes != null)
         {
            ValueMerger.Trim(indexes, numValues);
         }

         return (definitions, null, indexes);
      }
   }
}
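// ReadRawBytes is called above but not shown in this listing. The sketch below is a
// hedged illustration of what it plausibly does, assuming pages are either stored
// uncompressed or GZIP-compressed; the real reader would dispatch on the column
// chunk's codec rather than comparing sizes. Field names follow the generated Thrift
// naming convention seen elsewhere here. Requires System.IO and System.IO.Compression.
private static byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
{
   byte[] data = new byte[ph.Compressed_page_size];
   inputStream.Read(data, 0, data.Length);

   //assumption: equal sizes mean the page body was not compressed
   if (ph.Compressed_page_size == ph.Uncompressed_page_size)
   {
      return data;
   }

   using (var source = new MemoryStream(data))
   using (var gzip = new GZipStream(source, CompressionMode.Decompress))
   using (var destination = new MemoryStream())
   {
      gzip.CopyTo(destination);   //inflate the page body
      return destination.ToArray();
   }
}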
public IList Read(long offset, long count)
{
   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, out dictionary))
   {
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(
         _maxDefinitionLevel,
         _maxRepetitionLevel,
         () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
         values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0))
      .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);

   mergedValues.Trim((int)offset, (int)count);

   return mergedValues;
}
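// AssignOrAdd is used above to accumulate per-page lists across the read loop, but its
// body is not part of this listing. A minimal sketch of the implied semantics (an
// assumption, not confirmed source): adopt the first page's list, append later pages.
private static List<int> AssignOrAdd(List<int> target, List<int> pageData)
{
   if (pageData == null) return target;   //page contributed nothing
   if (target == null) return pageData;   //first page: adopt its list directly
   target.AddRange(pageData);             //later pages: append in order
   return target;
}

// the values accumulator is a non-generic IList, so an overload along these lines
// would be needed as well
private static IList AssignOrAdd(IList target, IList pageData)
{
   if (pageData == null) return target;
   if (target == null) return pageData;
   foreach (object value in pageData)
   {
      target.Add(value);
   }
   return target;
}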
private PageData ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
   byte[] data = ReadRawBytes(ph, _inputStream);
   int max = ph.Data_page_header.Num_values;

   using (var dataStream = new MemoryStream(data))
   {
      using (var reader = new BinaryReader(dataStream))
      {
         List<int> repetitions = _schema.HasRepetitionLevelsPage
            ? ReadRepetitionLevels(reader)
            : null;

         List<int> definitions = _schema.HasDefinitionLevelsPage
            ? ReadDefinitionLevels(reader)
            : null;

         //these are pointers back to the Values table - lookup on values
         List<int> indexes = ReadColumnValues(reader, ph.Data_page_header.Encoding, destination, max);

         //trim output if it exceeds max number of values
         int numValues = ph.Data_page_header.Num_values;
         if (!_schema.IsRepeated)
         {
            if (repetitions != null)
            {
               ValueMerger.TrimTail(repetitions, numValues);
            }
            if (definitions != null)
            {
               ValueMerger.TrimTail(definitions, numValues);
            }
            if (indexes != null)
            {
               ValueMerger.TrimTail(indexes, numValues);
            }
         }

         return new PageData { definitions = definitions, repetitions = repetitions, indexes = indexes };
      }
   }
}
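// PageData is the carrier returned by ReadDataPage. Its declaration is not shown in
// this listing; the sketch below is inferred from the fields accessed above and is an
// assumption about shape, not the library's actual definition.
class PageData
{
   public List<int> definitions;   //definition levels; null when the column is required
   public List<int> repetitions;   //repetition levels; null when the column is not repeated
   public List<int> indexes;       //dictionary indexes; null for plain-encoded pages
   public IList values;            //materialised values (consumed by the Read variant above)
}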
public IList Read(long offset, long count)
{
   IList values = TypeFactory.Create(_schema, _options);

   //get the minimum offset, we'll just read pages in sequence
   long fileOffset = new[]
   {
      _thriftChunk.Meta_data.Dictionary_page_offset,
      _thriftChunk.Meta_data.Data_page_offset
   }
   .Where(e => e != 0)
   .Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;
   List<int> repetitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;
   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      PageData page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      indexes = AssignOrAdd(indexes, page.indexes);
      definitions = AssignOrAdd(definitions, page.definitions);
      repetitions = AssignOrAdd(repetitions, page.repetitions);

      dataPageCount++;

      int totalCount = Math.Max(
         values.Count + (indexes == null ? 0 : indexes.Count),
         definitions == null ? 0 : definitions.Count);
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = ReadDataPageHeader(dataPageCount);   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schema, _options, values)
      .Apply(dictionaryPage, definitions, repetitions, indexes, (int)maxValues);

   //todo: this won't work for nested arrays
   ValueMerger.Trim(mergedValues, (int)offset, (int)count);

   return mergedValues;
}
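// ValueMerger.Trim(mergedValues, offset, count) windows the merged column to the
// slice the caller asked for. A hedged sketch of that behaviour (illustrative only;
// as the todo above notes, a real implementation must also handle nested arrays):
private static void Trim(IList list, int offset, int count)
{
   //drop everything before the requested offset
   while (offset-- > 0 && list.Count > 0)
   {
      list.RemoveAt(0);
   }

   //drop everything past the requested count
   while (list.Count > count)
   {
      list.RemoveAt(list.Count - 1);
   }
}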
public IList Read(string columnName)
{
   IList values = TypeFactory.Create(_schemaElement);

   //get the minimum offset, we'll just read pages in sequence
   long offset = new[]
   {
      _thriftChunk.Meta_data.Dictionary_page_offset,
      _thriftChunk.Meta_data.Data_page_offset
   }
   .Where(e => e != 0)
   .Min();
   long maxValues = _thriftChunk.Meta_data.Num_values;

   _inputStream.Seek(offset, SeekOrigin.Begin);

   Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

   IList dictionaryPage = null;
   List<int> indexes = null;
   List<int> definitions = null;

   //there can be only one dictionary page in column
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      dictionaryPage = ReadDictionaryPage(ph);
      ph = _thrift.Read<Thrift.PageHeader>();   //get next page after dictionary
   }

   int dataPageCount = 0;
   while (true)
   {
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values.Count);
      var page = ReadDataPage(ph, values, maxValues - valuesSoFar);

      //merge indexes
      if (page.indexes != null)
      {
         if (indexes == null)
         {
            indexes = page.indexes;
         }
         else
         {
            indexes.AddRange(page.indexes);
         }
      }

      //merge definitions
      if (page.definitions != null)
      {
         if (definitions == null)
         {
            definitions = (List<int>)page.definitions;
         }
         else
         {
            definitions.AddRange((List<int>)page.definitions);
         }
      }

      dataPageCount++;

      //repetition levels (nested columns) are not supported by this variant
      if (page.repetitions != null)
      {
         throw new NotImplementedException();
      }

      if ((values.Count >= maxValues) ||
          (indexes != null && indexes.Count >= maxValues) ||
          (definitions != null && definitions.Count >= maxValues))
      {
         break;   //limit reached
      }

      /*IList acc1 = new ValueMerger(_schemaElement, values).Apply(dictionaryPage, definitions, indexes, maxValues);
      dictionaryPage = null;
      definitions = null;
      indexes = null;
      values.Clear();
      foreach (var el in acc1) acc.Add(el);*/

      ph = _thrift.Read<Thrift.PageHeader>();   //get next page
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   IList mergedValues = new ValueMerger(_schemaElement, values)
      .Apply(dictionaryPage, definitions, indexes, maxValues);

   return mergedValues;
}
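// The final Apply call resolves dictionary indexes back into concrete values - the
// "pointers back to the Values table" mentioned in ReadDataPage. A minimal sketch of
// that lookup step, as an illustration rather than ValueMerger's actual code:
private static IList ResolveDictionaryIndexes(IList dictionaryPage, List<int> indexes, IList destination)
{
   foreach (int index in indexes)
   {
      //each index points at an entry in the dictionary page read earlier
      destination.Add(dictionaryPage[index]);
   }
   return destination;
}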