/// <summary>
/// Reads a single data page: fetches the page bytes via <c>ReadRawBytes</c>, reads definition
/// levels when the column is OPTIONAL, then reads the column values into
/// <paramref name="destination"/>.
/// </summary>
/// <param name="ph">Header of the page to read; its <c>Data_page_header</c> supplies the encoding and value count.</param>
/// <param name="destination">List handed to <c>ReadColumnValues</c> to receive decoded values.</param>
/// <param name="maxValues">Upper bound passed to the level and value readers.</param>
/// <returns>
/// The definition levels (null when the column is not OPTIONAL), a null placeholder for
/// repetition levels (not read yet), and the list returned by <c>ReadColumnValues</c>.
/// </returns>
private (ICollection definitions, ICollection repetitions, List<int> indexes) ReadDataPage(Thrift.PageHeader ph, IList destination, long maxValues)
{
    byte[] pageBytes = ReadRawBytes(ph, _inputStream);

    using (var pageStream = new MemoryStream(pageBytes))
    using (var pageReader = new BinaryReader(pageStream))
    {
        // todo: read repetition levels (only relevant for nested columns)

        // definition levels are only read when the column is declared OPTIONAL
        List<int> definitionLevels = null;
        if (_schemaElement.Thrift.Repetition_type == Thrift.FieldRepetitionType.OPTIONAL)
        {
            definitionLevels = ReadDefinitionLevels(pageReader, (int)maxValues);
        }

        // these are pointers back to the values table - lookup on values
        List<int> valueIndexes = ReadColumnValues(
            pageReader,
            ph.Data_page_header.Encoding,
            destination,
            maxValues);

        // trim both lists down to the value count declared in the page header
        int declaredCount = ph.Data_page_header.Num_values;

        if (definitionLevels != null)
        {
            ValueMerger.Trim(definitionLevels, declaredCount);
        }

        if (valueIndexes != null)
        {
            ValueMerger.Trim(valueIndexes, declaredCount);
        }

        return (definitionLevels, null, valueIndexes);
    }
}
/// <summary>
/// Reads the column chunk page by page, starting at the position reported by
/// <c>GetFileOffset</c>, then merges what was collected through <c>ValueMerger</c>.
/// </summary>
/// <param name="offset">Cast to <c>int</c> and passed to <c>Trim</c> on the merged list.</param>
/// <param name="count">Cast to <c>int</c> and passed to <c>Trim</c> on the merged list.</param>
/// <returns>The merged list after <c>Trim</c> has been applied to it.</returns>
public IList Read(long offset, long count)
{
    // null-tolerant element count used for all the accumulators below
    int CountOf(ICollection items)
    {
        if (items == null)
        {
            return 0;
        }

        return items.Count;
    }

    long chunkStart = GetFileOffset();
    long maxValues = _thriftColumnChunk.Meta_data.Num_values;
    _inputStream.Seek(chunkStart, SeekOrigin.Begin);

    List<int> indexes = null;
    List<int> repetitions = null;
    List<int> definitions = null;
    IList values = null;

    // there can be only one dictionary page in column; when present, the next header follows it
    Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
    if (TryReadDictionaryPage(ph, out IList dictionary))
    {
        ph = _thriftStream.Read<Thrift.PageHeader>();
    }

    bool keepReading;
    do
    {
        int valuesSoFar = Math.Max(CountOf(indexes), CountOf(values));
        PageData pd = ReadDataPage(ph, maxValues - valuesSoFar);

        repetitions = AssignOrAdd(repetitions, pd.repetitions);
        definitions = AssignOrAdd(definitions, pd.definitions);
        indexes = AssignOrAdd(indexes, pd.indexes);
        values = AssignOrAdd(values, pd.values);

        int totalCount = Math.Max(CountOf(values) + CountOf(indexes), CountOf(definitions));

        if (totalCount >= maxValues)
        {
            // limit reached
            keepReading = false;
        }
        else
        {
            // continue only while the next header is another data page
            ph = _thriftStream.Read<Thrift.PageHeader>();
            keepReading = ph.Type == Thrift.PageType.DATA_PAGE;
        }
    }
    while (keepReading);

    var merger = new ValueMerger(
        _maxDefinitionLevel,
        _maxRepetitionLevel,
        () => _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0),
        values ?? _dataTypeHandler.CreateEmptyList(_thriftSchemaElement.IsNullable(), false, 0));

    IList mergedValues = merger.Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);
    mergedValues.Trim((int)offset, (int)count);

    return mergedValues;
}
/// <summary>
/// Reads the column chunk page by page, starting at the smaller non-zero value of the
/// dictionary page offset and the data page offset, then merges the collected data through
/// <c>ValueMerger</c>.
/// </summary>
/// <param name="offset">Cast to <c>int</c> and passed to <c>ValueMerger.Trim</c>.</param>
/// <param name="count">Cast to <c>int</c> and passed to <c>ValueMerger.Trim</c>.</param>
/// <returns>The merged list after <c>ValueMerger.Trim</c> has been applied to it.</returns>
public IList Read(long offset, long count)
{
    // null-tolerant element count for the accumulators that start out as null
    int CountOf(ICollection items)
    {
        if (items == null)
        {
            return 0;
        }

        return items.Count;
    }

    IList values = TypeFactory.Create(_schema, _options);

    // get the minimum offset, we'll just read pages in sequence
    var candidateOffsets = new[]
    {
        _thriftChunk.Meta_data.Dictionary_page_offset,
        _thriftChunk.Meta_data.Data_page_offset
    };
    long fileOffset = candidateOffsets.Where(e => e != 0).Min();
    long maxValues = _thriftChunk.Meta_data.Num_values;

    _inputStream.Seek(fileOffset, SeekOrigin.Begin);

    Thrift.PageHeader ph = _thrift.Read<Thrift.PageHeader>();

    IList dictionaryPage = null;
    List<int> indexes = null;
    List<int> definitions = null;
    List<int> repetitions = null;

    // there can be only one dictionary page in column
    if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
    {
        dictionaryPage = ReadDictionaryPage(ph);

        // get next page after dictionary
        ph = _thrift.Read<Thrift.PageHeader>();
    }

    int dataPageCount = 0;
    bool keepReading;
    do
    {
        int valuesSoFar = Math.Max(CountOf(indexes), values.Count);
        PageData page = ReadDataPage(ph, values, maxValues - valuesSoFar);

        indexes = AssignOrAdd(indexes, page.indexes);
        definitions = AssignOrAdd(definitions, page.definitions);
        repetitions = AssignOrAdd(repetitions, page.repetitions);

        dataPageCount++;

        int totalCount = Math.Max(values.Count + CountOf(indexes), CountOf(definitions));

        if (totalCount >= maxValues)
        {
            // limit reached
            keepReading = false;
        }
        else
        {
            // get next page; continue only while it is another data page
            ph = ReadDataPageHeader(dataPageCount);
            keepReading = ph.Type == Thrift.PageType.DATA_PAGE;
        }
    }
    while (keepReading);

    var merger = new ValueMerger(_schema, _options, values);
    IList mergedValues = merger.Apply(dictionaryPage, definitions, repetitions, indexes, (int)maxValues);

    // todo: this won't work for nested arrays
    ValueMerger.Trim(mergedValues, (int)offset, (int)count);

    return mergedValues;
}