/// <summary>
/// Writes everything accumulated for dictionary encoding: the sorted dictionary strings,
/// their UTF-8 byte lengths, and the per-row present bits and dictionary ids.
/// </summary>
/// <remarks>
/// A new statistics entry is started at the first row of each run of <c>_strideLength</c> rows.
/// A trailing run shorter than <c>_strideLength</c> is flushed after the loop so its rows are not dropped.
/// </remarks>
private void WriteDictionaryEncodedData()
{
    // Order the keys ordinally and give every entry the index of its key in that order.
    var sortedDictionary = new List<string>();
    var i = 0;
    foreach (var dictEntry in _unsortedDictionary.OrderBy(d => d.Key, StringComparer.Ordinal))
    {
        sortedDictionary.Add(dictEntry.Key);
        dictEntry.Value.Id = i++;
    }

    // Write the dictionary strings back to back, remembering each encoded length.
    var dictionaryLengthList = new List<long>(sortedDictionary.Count);
    foreach (var dictEntry in sortedDictionary)
    {
        var bytes = Encoding.UTF8.GetBytes(dictEntry);
        dictionaryLengthList.Add(bytes.Length);
        _dictionaryDataBuffer.Write(bytes, 0, bytes.Length);
    }

    // Write the dictionary lengths.
    var dictionaryLengthEncoder = new IntegerRunLengthEncodingV2Writer(_lengthBuffer);
    dictionaryLengthEncoder.Write(dictionaryLengthList, false, _shouldAlignLengths);

    // Write the lookup values, one stride at a time.
    var presentList = new List<bool>(_dictionaryLookupValues.Count);
    var presentEncoder = new BitWriter(_presentBuffer);
    var lookupList = new List<long>(_dictionaryLookupValues.Count);
    var lookupEncoder = new IntegerRunLengthEncodingV2Writer(_dataBuffer);

    var hasNull = false;
    var strideCount = 0;
    StringWriterStatistics stats = null;
    foreach (var value in _dictionaryLookupValues)
    {
        if (stats == null)
        {
            // First row of a stride: open a statistics entry and annotate every buffer with it.
            stats = new StringWriterStatistics();
            Statistics.Add(stats);
            foreach (var buffer in Buffers)
            {
                buffer.AnnotatePosition(stats, 0);
            }
        }

        // AddBlock stores a null entry for a null string, so the entry must not be dereferenced
        // unconditionally. A null row is reported to the statistics as a null value.
        var stringValue = value != null ? sortedDictionary[value.Id] : null;
        stats.AddValue(stringValue);

        presentList.Add(value != null);
        if (value != null)
        {
            lookupList.Add(value.Id);
        }
        else
        {
            hasNull = true;
        }

        if (++strideCount == _strideLength) // Time for new statistics
        {
            FlushDictionaryLookupStride(presentEncoder, presentList, lookupEncoder, lookupList, hasNull);
            strideCount = 0;
            stats = null;
        }
    }

    // Rows left over from a final, partial stride would otherwise never reach the buffers.
    if (strideCount > 0)
    {
        FlushDictionaryLookupStride(presentEncoder, presentList, lookupEncoder, lookupList, hasNull);
    }
}

// Writes the pending present bits and dictionary ids to their buffers and empties both lists.
private void FlushDictionaryLookupStride(
    BitWriter presentEncoder,
    List<bool> presentList,
    IntegerRunLengthEncodingV2Writer lookupEncoder,
    List<long> lookupList,
    bool hasNull)
{
    presentEncoder.Write(presentList);
    presentList.Clear();
    if (hasNull)
    {
        _presentBuffer.MustBeIncluded = true;
    }

    lookupEncoder.Write(lookupList, false, _shouldAlignDictionaryLookup);
    lookupList.Clear();
}
/// <summary>
/// Adds one block of string values to the column using the column's current encoding.
/// </summary>
/// <param name="values">The values to add; null entries are recorded as not present.</param>
/// <exception cref="ArgumentException">
/// The column encoding is neither <c>DirectV2</c> nor <c>DictionaryV2</c>.
/// </exception>
public void AddBlock(IList<string> values)
{
    EnsureEncodingKindIsSet(values);

    if (ColumnEncoding == ColumnEncodingKind.DirectV2)
    {
        WriteDirectBlock(values);
        return;
    }

    if (ColumnEncoding != ColumnEncodingKind.DictionaryV2)
    {
        throw new ArgumentException();
    }

    CollectDictionaryLookups(values);
}

// Writes a block directly to the present, data and length buffers under its own statistics entry.
private void WriteDirectBlock(IList<string> values)
{
    var blockStatistics = new StringWriterStatistics();
    Statistics.Add(blockStatistics);
    foreach (var target in Buffers)
    {
        target.AnnotatePosition(blockStatistics, 0); // Our implementation always ends the RLE at the stride
    }

    var capacity = values.Count;
    var encodedStrings = new List<byte[]>(capacity);
    var presence = new List<bool>(capacity);
    var encodedLengths = new List<long>(capacity);

    foreach (var text in values)
    {
        blockStatistics.AddValue(text);

        var isPresent = text != null;
        presence.Add(isPresent);
        if (!isPresent)
        {
            continue;
        }

        var utf8 = Encoding.UTF8.GetBytes(text);
        encodedStrings.Add(utf8);
        encodedLengths.Add(utf8.Length);
    }

    // Present bits go out first, then the string bytes, then the lengths.
    var presentEncoder = new BitWriter(_presentBuffer);
    presentEncoder.Write(presence);
    if (blockStatistics.HasNull)
    {
        _presentBuffer.MustBeIncluded = true;
    }

    foreach (var encoded in encodedStrings)
    {
        _dataBuffer.Write(encoded, 0, encoded.Length);
    }

    var lengthEncoder = new IntegerRunLengthEncodingV2Writer(_lengthBuffer);
    lengthEncoder.Write(encodedLengths, false, _shouldAlignLengths);
}

// Records one lookup entry per value, creating a dictionary entry for each string not seen before.
// A null value is recorded as a null lookup entry.
private void CollectDictionaryLookups(IList<string> values)
{
    foreach (var text in values)
    {
        DictionaryEntry entry = null;
        if (text != null && !_unsortedDictionary.TryGetValue(text, out entry))
        {
            entry = new DictionaryEntry();
            _unsortedDictionary.Add(text, entry);
        }

        _dictionaryLookupValues.Add(entry);
    }
}