/// <summary>
/// Applies the requested compression to a page's raw bytes and records both size
/// counters on the page header. By spec the counters exclude the header's own size.
/// </summary>
/// <param name="ph">Page header whose size fields are updated in place.</param>
/// <param name="data">Uncompressed page payload.</param>
/// <param name="compression">Compression codec to apply; None is a pass-through.</param>
/// <returns>The bytes to write to the output stream (original array when uncompressed).</returns>
private byte[] Compress(Thrift.PageHeader ph, byte[] data, CompressionMethod compression)
{
   //note that page size numbers do not include header size by spec
   ph.Uncompressed_page_size = data.Length;

   if (compression == CompressionMethod.None)
   {
      //no codec: compressed and uncompressed payloads are the same bytes
      ph.Compressed_page_size = ph.Uncompressed_page_size;
      return data;
   }

   IDataWriter compressor = DataFactory.GetWriter(compression);
   byte[] compressed;
   using (var ms = new MemoryStream())
   {
      compressor.Write(data, ms);
      compressed = ms.ToArray();
   }
   ph.Compressed_page_size = compressed.Length;
   return compressed;
}
/// <summary>
/// Writes a serialised page header followed by the page payload to the output stream.
/// </summary>
/// <param name="ph">Page header to serialise via thrift.</param>
/// <param name="data">Page payload bytes (already compressed if applicable).</param>
/// <returns>The number of bytes the serialised header occupies in the stream.</returns>
private int Write(Thrift.PageHeader ph, byte[] data)
{
   //the thrift header must precede the payload in the chunk layout
   int headerSize = ThriftStream.Write(ph);
   _output.Write(data, 0, data.Length);
   return headerSize;
}
/// <summary>
/// Writes one column chunk: registers the chunk metadata, writes all of its pages
/// and records total compressed/uncompressed byte counts on the chunk metadata.
/// </summary>
/// <param name="schema">Schema element describing the column being written.</param>
/// <param name="values">Column values for this chunk.</param>
/// <param name="compression">Compression codec for the chunk's pages.</param>
/// <param name="stats">Precomputed statistics for the values.</param>
/// <returns>The populated column chunk metadata.</returns>
private Thrift.ColumnChunk Write(SchemaElement schema, IList values, CompressionMethod compression, ColumnStats stats)
{
   Thrift.ColumnChunk chunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);
   Thrift.PageHeader ph = _meta.CreateDataPage(values.Count);

   List<PageTag> pages = WriteValues(schema, values, ph, compression, stats);

   //the following counters must include both data size and header size
   int totalCompressed = pages.Sum(page => page.HeaderSize + page.HeaderMeta.Compressed_page_size);
   int totalUncompressed = pages.Sum(page => page.HeaderSize + page.HeaderMeta.Uncompressed_page_size);
   chunk.Meta_data.Total_compressed_size = totalCompressed;
   chunk.Meta_data.Total_uncompressed_size = totalUncompressed;

   return chunk;
}
/// <summary>
/// Encodes the column values into a single data page (RLE definition levels first
/// when nulls are present, then plain-encoded data), compresses it and writes
/// header plus payload to the output stream.
/// </summary>
/// <param name="schema">Schema element describing the column being written.</param>
/// <param name="values">Column values; replaced by the non-null subset when nulls exist.</param>
/// <param name="ph">Page header; its size fields are populated here.</param>
/// <param name="compression">Compression codec for the page payload.</param>
/// <param name="stats">Statistics used to decide whether definition levels are needed.</param>
private void WriteValues(SchemaElement schema, IList values, Thrift.PageHeader ph, CompressionMethod compression, ColumnStats stats)
{
   byte[] data;
   using (var ms = new MemoryStream())
   {
      using (var writer = new BinaryWriter(ms))
      {
         if (stats.NullCount > 0)
         {
            //nulls become RLE definition levels; values shrink to the non-null entries
            CreateDefinitions(values, schema, out IList newValues, out List<int> definitions);
            values = newValues;
            _rleWriter.Write(writer, schema, definitions);
         }

         _plainWriter.Write(writer, schema, values);
         data = ms.ToArray();
      }
   }

   //Compress also sets both page size counters on the header; previously this
   //logic was duplicated inline here - keep the single implementation in Compress
   data = Compress(ph, data, compression);

   _thrift.Write(ph);
   _output.Write(data, 0, data.Length);
}
/// <summary>
/// Writes one column chunk: registers the chunk metadata, builds a v1 data page
/// header with the default encodings and delegates the page write to WriteValues.
/// </summary>
/// <param name="schema">Schema element describing the column being written.</param>
/// <param name="values">Column values for this chunk.</param>
/// <param name="compression">Compression codec for the chunk's page.</param>
/// <param name="stats">Precomputed statistics for the values.</param>
/// <returns>The column chunk metadata.</returns>
private Thrift.ColumnChunk Write(SchemaElement schema, IList values, CompressionMethod compression, ColumnStats stats)
{
   Thrift.ColumnChunk chunk = _meta.AddColumnChunk(compression, _output, schema, values.Count);

   //page sizes start at zero and are filled in later by the value writer
   var ph = new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0)
   {
      Data_page_header = new Thrift.DataPageHeader
      {
         Encoding = Thrift.Encoding.PLAIN,
         Definition_level_encoding = Thrift.Encoding.RLE,
         Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
         Num_values = values.Count
      }
   };

   WriteValues(schema, values, ph, compression, stats);

   return chunk;
}
/// <summary>
/// Encodes column values into one data page (plus an optional dictionary page),
/// compresses and writes each page, and returns a tag per written page so the
/// caller can total header + payload sizes.
/// </summary>
/// <param name="schema">Schema element describing the column being written.</param>
/// <param name="values">Column values; may be replaced by flattened/non-null subsets below.</param>
/// <param name="ph">Data page header; encoding may be switched to PLAIN_DICTIONARY here.</param>
/// <param name="compression">Compression codec applied to every page payload.</param>
/// <param name="stats">Column statistics (currently unused in this body — NOTE(review): confirm).</param>
/// <returns>One <see cref="PageTag"/> per page written, dictionary page (if any) first.</returns>
private List<PageTag> WriteValues(SchemaElement schema, IList values, Thrift.PageHeader ph, CompressionMethod compression, ColumnStats stats)
{
   var result = new List<PageTag>();
   byte[] dictionaryPageBytes = null;
   int dictionaryPageCount = 0;
   byte[] dataPageBytes;

   //flatten values if the field is repeatable
   if (schema.IsRepeated)
   {
      values = FlattenRepeatables(values, schema);
   }

   using (var ms = new MemoryStream())
   {
      using (var writer = new BinaryWriter(ms))
      {
         //write repetitions (out value is intentionally discarded)
         if (schema.IsRepeated)
         {
            List<int> repetitions = CreateRepetitions(values, schema);
            _rleWriter.Write(writer, _definitionsSchema, repetitions, out IList nullExtra);
         }

         //write definitions; values shrink to the entries that carry actual data
         if (schema.HasNulls || schema.IsRepeated)
         {
            CreateDefinitions(values, schema, out IList newValues, out List<int> definitions);
            values = newValues;
            _rleWriter.Write(writer, _definitionsSchema, definitions, out IList nullExtra);
         }

         //write data: try dictionary encoding first when enabled; if the dictionary
         //writer declines, fall back to plain encoding in the same page stream
         if (!_writerOptions.UseDictionaryEncoding || !_dicWriter.Write(writer, schema, values, out IList dicValues))
         {
            _plainWriter.Write(writer, schema, values, out IList plainExtra);
         }
         else
         {
            //dictionary path: the data page now holds indexes, the dictionary
            //values themselves go into a separate page built here
            dictionaryPageCount = dicValues.Count;
            ph.Data_page_header.Encoding = Thrift.Encoding.PLAIN_DICTIONARY;
            using (var dms = new MemoryStream())
            using (var dwriter = new BinaryWriter(dms))
            {
               _plainWriter.Write(dwriter, schema, dicValues, out IList t0);
               dictionaryPageBytes = dms.ToArray();
            }
         }

         dataPageBytes = ms.ToArray();
      }
   }

   //the dictionary page must be written before the data page that references it
   if (dictionaryPageBytes != null)
   {
      Thrift.PageHeader dph = _meta.CreateDictionaryPage(dictionaryPageCount);
      dictionaryPageBytes = Compress(dph, dictionaryPageBytes, compression);
      int dictionaryHeaderSize = Write(dph, dictionaryPageBytes);
      result.Add(new PageTag { HeaderSize = dictionaryHeaderSize, HeaderMeta = dph });
   }

   dataPageBytes = Compress(ph, dataPageBytes, compression);
   int dataHeaderSize = Write(ph, dataPageBytes);
   result.Add(new PageTag { HeaderSize = dataHeaderSize, HeaderMeta = ph });

   return (result);
}