/// <summary>
/// Decodes a single data page: repetition levels, definition levels, then the column values.
/// </summary>
/// <param name="dataTypeHandler">Handler that decodes plain values for this column's type.</param>
/// <param name="ph">Page header describing the page being read.</param>
/// <param name="tse">Thrift schema element for this column.</param>
/// <param name="maxValues">Upper bound of values still expected from this chunk.</param>
/// <returns>Decoded page data (levels plus either values or dictionary indexes).</returns>
private PageData ReadDataPage(IDataTypeHandler dataTypeHandler, Thrift.PageHeader ph, Thrift.SchemaElement tse, long maxValues)
{
   byte[] data = ReadRawBytes(ph, _inputStream);

   _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

   var pd = new PageData();

   using (var dataStream = new MemoryStream(data))
   {
      using (var reader = new BinaryReader(dataStream))
      {
         //levels are only present in the page when the corresponding max level is above zero
         if (maxRepetitionLevel > 0)
         {
            pd.repetitions = ReadLevels(reader, maxRepetitionLevel);
         }

         if (maxDefinitionLevel > 0)
         {
            pd.definitions = ReadLevels(reader, maxDefinitionLevel);
         }

         ReadColumn(dataTypeHandler, tse, reader, ph.Data_page_header.Encoding, maxValues, out pd.values, out pd.indexes);
      }
   }

   return pd;
}
/// <summary>
/// Maps a CLR property to a parquet schema field, or returns null when the property type is not supported.
/// </summary>
/// <param name="property">Reflected property to map.</param>
/// <returns>A <see cref="DataField"/> for the property, or null when no data type handler matches.</returns>
private Field GetField(PropertyInfo property)
{
   //unwrap nullable and array types to find the element type a handler can match
   Type pt = property.PropertyType;
   if (pt.IsNullable())
   {
      pt = pt.GetNonNullable();
   }
   if (pt.IsArray)
   {
      pt = pt.GetElementType();
   }

   IDataTypeHandler handler = DataTypeFactory.Match(pt);
   if (handler == null)
   {
      return null;
   }

   ParquetColumnAttribute columnAttr = property.GetCustomAttribute<ParquetColumnAttribute>();

   //attribute name, when present, takes precedence over the property name
   string name = columnAttr?.Name ?? property.Name;

   var r = new DataField(name,
      property.PropertyType   //use CLR type here as DF constructor will figure out nullability and other parameters
   );
   r.ClrPropName = property.Name;

   return r;
}
/// <summary>
/// Constructs a column bound to the given schema field and resolves its data type handler.
/// </summary>
/// <param name="field">Schema field this column belongs to; must not be null.</param>
private DataColumn(DataField field)
{
   if (field == null)
   {
      throw new ArgumentNullException(nameof(field));
   }

   Field = field;
   _dataTypeHandler = DataTypeFactory.Match(field.DataType);

   //repetition levels only exist for array (repeated) fields
   HasRepetitions = field.IsArray;
}
/// <summary>
/// Writes next data column to parquet stream. Note that columns must be written in the order they are declared in the
/// file schema.
/// </summary>
/// <param name="column">Column data to write; must correspond to the next schema element in declaration order.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="column"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the column does not match the expected schema element.</exception>
public void WriteColumn(DataColumn column)
{
   if (column == null)
   {
      throw new ArgumentNullException(nameof(column));
   }

   if (RowCount == null)
   {
      //derive row count from the first column that either has data or is not repeated
      if (column.Data.Length > 0 || column.Field.MaxRepetitionLevel == 0)
      {
         RowCount = column.CalculateRowCount();
      }
   }

   //validate that this column matches the expected schema position
   Thrift.SchemaElement tse = _thschema[_colIdx];
   if (!column.Field.Equals(tse))
   {
      throw new ArgumentException($"cannot write this column, expected '{tse.Name}', passed: '{column.Field.Name}'", nameof(column));
   }
   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
   _colIdx += 1;

   List<string> path = _footer.GetPath(tse);

   var writer = new DataColumnWriter(_stream, _thriftStream, _footer, tse,
      _compressionMethod, _compressionLevel,
      (int)(RowCount ?? 0));

   Thrift.ColumnChunk chunk = writer.Write(path, column, dataTypeHandler);
   _thriftRowGroup.Columns.Add(chunk);
}
/// <summary>
/// Recursively converts flat thrift schema elements into model <see cref="Field"/>s, appending them to the container.
/// </summary>
/// <param name="path">Path prefix accumulated from parent elements; null at the root.</param>
/// <param name="container">Receives the fields created at this nesting level.</param>
/// <param name="childCount">Number of direct children to consume at this level.</param>
/// <param name="si">Running index into the flat thrift schema list; advanced by the type handlers.</param>
/// <param name="formatOptions">Format options passed to the data type handlers.</param>
private void CreateModelSchema(string path, IList<Field> container, int childCount, ref int si, ParquetOptions formatOptions)
{
   for (int i = 0; i < childCount && si < _fileMeta.Schema.Count; i++)
   {
      Thrift.SchemaElement tse = _fileMeta.Schema[si];
      IDataTypeHandler dth = DataTypeFactory.Match(tse, formatOptions);

      if (dth == null)
      {
         throw new InvalidOperationException($"cannot find data type handler to create model schema for {tse.Describe()}");
      }

      //the handler consumes one or more schema elements and advances si accordingly
      Field se = dth.CreateSchemaElement(_fileMeta.Schema, ref si, out int ownedChildCount);

      //prepend the parent path, skipping null segments
      se.Path = string.Join(Schema.PathSeparator, new[] { path, se.Path ?? se.Name }.Where(p => p != null));

      if (ownedChildCount > 0)
      {
         //recurse into owned children and attach them to this field
         var childContainer = new List<Field>();
         CreateModelSchema(se.Path, childContainer, ownedChildCount, ref si, formatOptions);
         foreach (Field cse in childContainer)
         {
            se.Assign(cse);
         }
      }

      container.Add(se);
   }
}
/// <summary>
/// Writes a single column as one data page, returning the page tags needed for chunk metadata.
/// </summary>
/// <param name="column">Column to encode.</param>
/// <param name="tse">Thrift schema element for the column.</param>
/// <param name="dataTypeHandler">Handler that encodes the defined data values.</param>
/// <param name="maxRepetitionLevel">Maximum repetition level; levels are written only when above zero.</param>
/// <param name="maxDefinitionLevel">Maximum definition level; levels are written only when above zero.</param>
private List<PageTag> WriteColumn(DataColumn column,
   Thrift.SchemaElement tse,
   IDataTypeHandler dataTypeHandler,
   int maxRepetitionLevel,
   int maxDefinitionLevel)
{
   var pages = new List<PageTag>();

   /*
    * Page header must precede actual data (compressed or not) however it contains both
    * the uncompressed and compressed data size which we don't know! This somehow limits
    * the write efficiency.
    */
   using (var ms = new MemoryStream())
   {
      Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

      //chain streams together so we have real streaming instead of wasting undefraggable LOH memory
      using (GapStream pageStream = DataStreamFactory.CreateWriter(ms, _compressionMethod, true))
      {
         //leaveOpen: true so disposing the writer does not close pageStream early
         using (var writer = new BinaryWriter(pageStream, Encoding.UTF8, true))
         {
            if (maxRepetitionLevel > 0)
            {
               WriteLevels(writer, column.RepetitionLevels, maxRepetitionLevel);
            }

            if (maxDefinitionLevel > 0)
            {
               WriteLevels(writer, column.DefinitionLevels, maxDefinitionLevel);
            }

            dataTypeHandler.Write(tse, writer, column.DefinedData);

            writer.Flush();
         }

         pageStream.Flush();   //extremely important to flush the stream as some compression algorithms don't finish writing
         dataPageHeader.Uncompressed_page_size = (int)pageStream.Position;
      }
      //ms now holds the compressed bytes; its position equals the compressed size
      dataPageHeader.Compressed_page_size = (int)ms.Position;

      //write the header in
      int headerSize = _thriftStream.Write(dataPageHeader);
      ms.Position = 0;
      ms.CopyTo(_stream);

      var dataTag = new PageTag
      {
         HeaderMeta = dataPageHeader,
         HeaderSize = headerSize
      };

      pages.Add(dataTag);
   }

   return pages;
}
/// <summary>
/// Looks up (or optionally creates) the value list for a field, keyed by the field's path.
/// </summary>
/// <param name="columns">Map from field path to its value list.</param>
/// <param name="field">Field whose list is requested; its Path must be set.</param>
/// <param name="createIfMissing">When true, a missing list is created and registered; otherwise an exception is thrown.</param>
/// <param name="isNested">When true, the created list stores nested enumerables rather than flat values.</param>
static IList GetValues(Dictionary<string, IList> columns, DataField field, bool createIfMissing, bool isNested = false)
{
   if (field.Path == null)
   {
      throw new ArgumentNullException(nameof(field.Path));
   }

   //fast path: the list is already registered
   if (columns.TryGetValue(field.Path, out IList values) && values != null)
   {
      return values;
   }

   if (!createIfMissing)
   {
      throw new ArgumentException($"column does not exist by path '{field.Path}'", nameof(field));
   }

   //create, register and return a fresh list for this field
   IDataTypeHandler handler = DataTypeFactory.Match(field);
   values = isNested
      ? (IList)new List<IEnumerable>()
      : handler.CreateEmptyList(field.HasNulls, field.IsArray, 0);
   columns[field.Path] = values;

   return values;
}
/// <summary>
/// Emits the thrift schema elements for a LIST field: an annotated root, a repeated
/// middle container named "list", and the item element produced by its own handler.
/// </summary>
public override void CreateThrift(Field field, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
   var listField = (ListField)field;

   //the parent gains exactly one child: the LIST root below
   parent.Num_children += 1;

   //LIST-annotated root element
   var listRoot = new Thrift.SchemaElement(field.Name);
   listRoot.Converted_type = Thrift.ConvertedType.LIST;
   listRoot.Repetition_type = Thrift.FieldRepetitionType.OPTIONAL;
   listRoot.Num_children = 1;   //the repeated "list" container below
   container.Add(listRoot);

   //repeated middle container element
   var repeatedContainer = new Thrift.SchemaElement("list");
   repeatedContainer.Repetition_type = Thrift.FieldRepetitionType.REPEATED;
   container.Add(repeatedContainer);

   //delegate the item element to the item's own handler
   IDataTypeHandler itemHandler = DataTypeFactory.Match(listField.Item);
   itemHandler.CreateThrift(listField.Item, repeatedContainer, container);
}
/// <summary>
/// Decodes column values from the reader according to the page encoding. Exactly one of
/// <paramref name="values"/> or <paramref name="indexes"/> is populated; the other is null.
/// </summary>
/// <exception cref="ParquetException">Thrown for encodings other than PLAIN, RLE or PLAIN_DICTIONARY.</exception>
private void ReadColumn(IDataTypeHandler dataTypeHandler, Thrift.SchemaElement tse, BinaryReader reader,
   Thrift.Encoding encoding, long maxValues,
   out IList values,
   out List<int> indexes)
{
   //dictionary encoding uses RLE to encode data

   switch (encoding)
   {
      case Thrift.Encoding.PLAIN:
         values = dataTypeHandler.Read(reader);
         indexes = null;
         break;

      case Thrift.Encoding.RLE:
         values = null;
         //NOTE(review): Type_length is passed as the bit width here — confirm this is intended
         indexes = RunLengthBitPackingHybridValuesReader.Read(reader, tse.Type_length);
         break;

      case Thrift.Encoding.PLAIN_DICTIONARY:
         values = null;
         indexes = PlainDictionaryValuesReader.Read(reader, maxValues);
         break;

      default:
         throw new ParquetException($"encoding {encoding} is not supported.");
   }
}
/// <summary>
/// Experimental recursive schema parser. Walks the flat thrift schema list, dispatching
/// leaf elements to data type handlers and recursing into intermediate container elements.
/// (Name keeps its original spelling as callers may reference it.)
/// </summary>
private void ParseSchemaExperimenal(SchemaElement parent, int childCount, ref int si, ParquetOptions formatOptions)
{
   for (int i = 0; i < childCount && si < _fileMeta.Schema.Count; i++)
   {
      Thrift.SchemaElement tse = _fileMeta.Schema[si];
      IDataTypeHandler dth = DataTypeFactory.Match(tse, formatOptions);

      if (dth == null)
      {
         if (tse.Num_children > 0)
         {
            //no handler but has children: treat as an intermediate element; si is advanced
            //past the current element (si++) before the recursive call consumes its children
            ParseSchemaExperimenal(parent, _fileMeta.Schema[si++].Num_children, ref si, formatOptions);
            continue;
         }
         else
         {
            //no handler and no children: unsupported leaf element
            //NOTE(review): ThrowNoHandler is expected to throw; otherwise dth.Create below would NRE — confirm
            ThrowNoHandler(tse);
         }
      }

      //the handler consumes one or more elements and advances si
      SchemaElement newRoot = dth.Create(parent, _fileMeta.Schema, ref si);

      if (newRoot != null)
      {
         //a non-null root means the handler produced a container; recurse using the
         //children count of the element the handler just consumed (index si - 1)
         ParseSchemaExperimenal(newRoot, _fileMeta.Schema[si - 1].Num_children, ref si, formatOptions);
      }
   }
}
/// <summary>
/// Writes a single column as one data page, returning the page tags needed for chunk metadata.
/// Repetition levels are not supported in this version.
/// </summary>
private List<PageTag> WriteColumn(DataColumn column,
   Thrift.SchemaElement tse,
   IDataTypeHandler dataTypeHandler,
   int maxRepetitionLevel,
   int maxDefinitionLevel)
{
   var pages = new List<PageTag>();

   /*
    * Page header must precede actual data (compressed or not) however it contains both
    * the uncompressed and compressed data size which we don't know! This somehow limits
    * the write efficiency.
    */
   using (var ms = new MemoryStream())
   {
      Thrift.PageHeader dataPageHeader = _footer.CreateDataPage(column.TotalCount);

      //chain streams together so we have real streaming instead of wasting undefraggable LOH memory
      using (PositionTrackingStream pps = DataStreamFactory.CreateWriter(ms, _compressionMethod))
      {
         //NOTE(review): BinaryWriter without leaveOpen closes pps on dispose, yet pps.Position is read
         //after this inner block — confirm PositionTrackingStream keeps Position readable after close
         using (var writer = new BinaryWriter(pps))
         {
            if (column.HasRepetitions)
            {
               throw new NotImplementedException();
            }

            if (column.HasDefinitions)
            {
               WriteLevels(writer, column.DefinitionLevels, maxDefinitionLevel);
            }

            dataTypeHandler.Write(tse, writer, column.DefinedData);
         }

         dataPageHeader.Uncompressed_page_size = (int)pps.Position;
      }
      //ms now holds the compressed bytes; its position equals the compressed size
      dataPageHeader.Compressed_page_size = (int)ms.Position;

      //write the header in
      int headerSize = _thriftStream.Write(dataPageHeader);
      ms.Position = 0;
      ms.CopyTo(_stream);

      var dataTag = new PageTag
      {
         HeaderMeta = dataPageHeader,
         HeaderSize = headerSize
      };

      pages.Add(dataTag);
   }

   return pages;
}
/// <summary>
/// Constructs an empty column for the given field, preparing backing storage.
/// </summary>
/// <param name="field">Schema field this column belongs to; must not be null.</param>
public DataColumn(DataField field)
{
   if (field == null)
   {
      throw new ArgumentNullException(nameof(field));
   }

   _field = field;

   IDataTypeHandler typeHandler = DataTypeFactory.Match(field.DataType);

   // always a plain list, always non-nullable when possible
   _definedData = typeHandler.CreateEmptyList(false, false, 0);

   // do not create an instance when not required
   _definitionLevels = field.HasNulls ? new List<int>() : null;
}
/// <summary>
/// Reads the whole column chunk: an optional dictionary page followed by one or more data pages,
/// accumulating repetition/definition levels, dictionary indexes and plain values.
/// </summary>
/// <param name="offset">Requested start offset. NOTE(review): currently unused — confirm intended.</param>
/// <param name="count">Requested value count. NOTE(review): currently unused; reads up to the chunk's Num_values.</param>
public void Read(long offset, long count)
{
   Thrift.SchemaElement tse = _footer.GetSchemaElement(_thriftColumnChunk);
   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _parquetOptions);

   long fileOffset = GetFileOffset();
   long maxValues = _thriftColumnChunk.Meta_data.Num_values;

   _inputStream.Seek(fileOffset, SeekOrigin.Begin);

   IList dictionary = null;
   List<int> indexes = null;
   List<int> repetitions = null;
   List<int> definitions = null;
   IList values = null;

   //there can be only one dictionary page in column
   Thrift.PageHeader ph = _thriftStream.Read<Thrift.PageHeader>();
   if (TryReadDictionaryPage(ph, dataTypeHandler, out dictionary))
   {
      //the dictionary page was consumed; advance to the first data page header
      ph = _thriftStream.Read<Thrift.PageHeader>();
   }

   int pagesRead = 0;

   while (true)
   {
      //values decoded so far, in whichever representation (indexes or plain values) is in use
      int valuesSoFar = Math.Max(indexes == null ? 0 : indexes.Count, values == null ? 0 : values.Count);
      PageData pd = ReadDataPage(dataTypeHandler, ph, tse, maxValues - valuesSoFar);

      //merge this page's results into the running accumulators
      repetitions = AssignOrAdd(repetitions, pd.repetitions);
      definitions = AssignOrAdd(definitions, pd.definitions);
      indexes = AssignOrAdd(indexes, pd.indexes);
      values = AssignOrAdd(values, pd.values);

      pagesRead++;

      //stop once the combined decoded count reaches the declared number of values
      int totalCount = Math.Max(
         (values == null ? 0 : values.Count) +
         (indexes == null ? 0 : indexes.Count),
         (definitions == null ? 0 : definitions.Count));
      if (totalCount >= maxValues)
      {
         break;   //limit reached
      }

      ph = _thriftStream.Read<Thrift.PageHeader>();
      if (ph.Type != Thrift.PageType.DATA_PAGE)
      {
         break;
      }
   }

   //IList mergedValues = new ValueMerger(_schema, values)
   //   .Apply(dictionary, definitions, repetitions, indexes, (int)maxValues);
}
/// <summary>
/// Attempts to read a dictionary page. Returns false when the header is not a dictionary page;
/// actual dictionary decoding is not implemented yet and throws.
/// </summary>
private bool TryReadDictionaryPage(Thrift.PageHeader ph, IDataTypeHandler dataTypeHandler, out IList dictionary)
{
   if (ph.Type == Thrift.PageType.DICTIONARY_PAGE)
   {
      //dictionary page decoding is not supported yet
      throw new NotImplementedException();
   }

   dictionary = null;
   return false;
}
/// <summary>
/// Converts model schema fields into thrift schema elements, appending them to the container.
/// </summary>
/// <param name="ses">Fields to convert.</param>
/// <param name="parent">Parent thrift element; handlers update its child count.</param>
/// <param name="container">Flat list receiving the generated thrift elements.</param>
private void CreateThriftSchema(IEnumerable<Field> ses, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
   foreach (Field se in ses)
   {
      IDataTypeHandler handler = DataTypeFactory.Match(se);

      //fail fast with a descriptive message instead of a NullReferenceException
      if (handler == null)
      {
         throw new System.InvalidOperationException($"cannot find data type handler for field '{se.Name}'");
      }

      handler.CreateThrift(se, parent, container);
   }
}
/// <summary>
/// Constructs an empty column for the given field, preparing storage for values and levels.
/// </summary>
/// <param name="field">Schema field this column belongs to; must not be null.</param>
public DataColumn(DataField field)
{
   if (field == null)
   {
      throw new ArgumentNullException(nameof(field));
   }

   _field = field;

   IDataTypeHandler typeHandler = DataTypeFactory.Match(field.DataType);

   // always a plain list, always non-nullable when possible
   _definedData = typeHandler.CreateEmptyList(false, false, 0);

   //definition levels are always tracked in this version
   _definitionLevels = new List<int>();

   //repetition levels only exist for array (repeated) fields
   HasRepetitions = field.IsArray;
   _repetitionLevels = HasRepetitions ? new List<int>() : null;
}
/// <summary>
/// Creates a data field with the given name and parquet data type.
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="dataType">Native parquet data type.</param>
/// <param name="hasNulls">Whether the field accepts nulls.</param>
/// <param name="isArray">Whether the field holds repeated values.</param>
public DataField(string name, DataType dataType, bool hasNulls = true, bool isArray = false)
   : base(name, SchemaType.Data)
{
   DataType = dataType;
   HasNulls = hasNulls;
   IsArray = isArray;

   //resolve the CLR type only when a handler is registered for this data type
   IDataTypeHandler typeHandler = DataTypeFactory.Match(dataType);
   if (typeHandler != null)
   {
      ClrType = typeHandler.ClrType;
   }
}
/// <summary>
/// Constructs a reader over one column chunk, pre-computing level limits and schema metadata.
/// </summary>
public ColumnarReader(Stream inputStream, Thrift.ColumnChunk thriftColumnChunk, ThriftFooter footer, ParquetOptions parquetOptions)
{
   //validate all arguments before any assignment
   if (inputStream == null) throw new ArgumentNullException(nameof(inputStream));
   if (thriftColumnChunk == null) throw new ArgumentNullException(nameof(thriftColumnChunk));
   if (footer == null) throw new ArgumentNullException(nameof(footer));
   if (parquetOptions == null) throw new ArgumentNullException(nameof(parquetOptions));

   _inputStream = inputStream;
   _thriftColumnChunk = thriftColumnChunk;
   _footer = footer;
   _parquetOptions = parquetOptions;

   _thriftStream = new ThriftStream(inputStream);

   //cache level limits and schema metadata for this chunk up front
   _footer.GetLevels(_thriftColumnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel);
   _maxRepetitionLevel = maxRepetitionLevel;
   _maxDefinitionLevel = maxDefinitionLevel;
   _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk);
   _dataTypeHandler = DataTypeFactory.Match(_thriftSchemaElement, _parquetOptions);
}
/// <summary>
/// Constructs a merger for the given levels and values; missing values default to an empty list.
/// </summary>
public ValueMerger(
   int maxDefinitionLevel,
   int maxRepetitionLevel,
   IList values,
   IDataTypeHandler dataTypeHandler,
   bool isNullable)
{
   _maxDefinitionLevel = maxDefinitionLevel;
   _maxRepetitionLevel = maxRepetitionLevel;
   _dataTypeHandler = dataTypeHandler;
   _isNullable = isNullable;

   //factory producing a fresh flat list matching this merger's nullability
   _createEmptyListFunc = () => _dataTypeHandler.CreateEmptyList(_isNullable, false, 0);

   //fall back to an empty list when no values were supplied
   _values = values ?? _createEmptyListFunc();
}
/// <summary>
/// Returns database bundle instance
/// </summary>
/// <param name="packageName">Name of the package</param>
/// <returns>DatabaseBundle Instance of database bundle</returns>
private DatabaseBundle GetDatabaseBundle(String packageName)
{
   //instantiate the three database components by their conventional class names within the package
   var database = (IDatabaseImpl)ClassUtils.CreateClassInstance(packageName + "." + DATABASE_CLASS_NAME);
   var queryBuilder = (IQueryBuilder)ClassUtils.CreateClassInstance(packageName + "." + DATABASE_QUERY_BUILDER);
   var dataTypeHandler = (IDataTypeHandler)ClassUtils.CreateClassInstance(packageName + "." + DATABASE_DATA_TYPE_HANDLER);

   //assemble the bundle from the created components
   var bundle = new DatabaseBundle();
   bundle.SetDatabase(database);
   bundle.SetQueryBuilder(queryBuilder);
   bundle.SetDataTypeHandler(dataTypeHandler);

   return bundle;
}
/// <summary>
/// Maps a CLR property to a parquet schema field, honoring <see cref="ParquetColumnAttribute"/>
/// refinements for time span, date/time and decimal types. Returns null when unsupported.
/// </summary>
/// <param name="property">Reflected property to map.</param>
/// <returns>A <see cref="DataField"/> (or a specialized subclass) for the property, or null.</returns>
private Field GetField(PropertyInfo property)
{
   //unwrap nullable and array types to find the element type a handler can match
   Type pt = property.PropertyType;
   if (pt.IsNullable())
   {
      pt = pt.GetNonNullable();
   }
   if (pt.IsArray)
   {
      pt = pt.GetElementType();
   }

   IDataTypeHandler handler = DataTypeFactory.Match(pt);
   if (handler == null)
   {
      return null;
   }

   ParquetColumnAttribute columnAttr = property.GetCustomAttribute<ParquetColumnAttribute>();

   //attribute name, when present, takes precedence over the property name
   string name = columnAttr?.Name ?? property.Name;

   var r = new DataField(name,
      property.PropertyType   //use CLR type here as DF constructor will figure out nullability and other parameters
   );

   if (columnAttr != null)
   {
      //the attribute can refine the field for types with extra formatting options
      if (handler.ClrType == typeof(TimeSpan))
      {
         r = new TimeSpanDataField(r.Name, columnAttr.TimeSpanFormat, r.HasNulls, r.IsArray);
      }

      if (handler.ClrType == typeof(DateTime) || handler.ClrType == typeof(DateTimeOffset))
      {
         r = new DateTimeDataField(r.Name, columnAttr.DateTimeFormat, r.HasNulls, r.IsArray);
      }

      if (handler.ClrType == typeof(decimal))
      {
         r = new DecimalDataField(r.Name,
            columnAttr.DecimalPrecision, columnAttr.DecimalScale, columnAttr.DecimalForceByteArrayEncoding,
            r.HasNulls, r.IsArray);
      }
   }

   r.ClrPropName = property.Name;

   return r;
}
/// <summary>
/// Converts these statistics to the thrift representation, plain-encoding min/max once and
/// reusing the bytes for both the legacy (Min/Max) and the *_value fields.
/// </summary>
internal Thrift.Statistics ToThriftStatistics(IDataTypeHandler handler, Thrift.SchemaElement tse)
{
   byte[] minBytes = handler.PlainEncode(tse, MinValue);
   byte[] maxBytes = handler.PlainEncode(tse, MaxValue);

   var stats = new Thrift.Statistics();
   stats.Null_count = NullCount;
   stats.Distinct_count = DistinctCount;
   stats.Min = minBytes;
   stats.Min_value = minBytes;
   stats.Max = maxBytes;
   stats.Max_value = maxBytes;

   return stats;
}
/// <summary>
/// Constructs a primitive element reader over the given binary source.
/// Performs plain field capture only; no validation is done here.
/// </summary>
public PrimitiveReader(
   Thrift.SchemaElement schemaElement,
   ParquetOptions parquetOptions,
   IDataTypeHandler dataTypeHandler,
   BinaryReader binaryReader,
   int typeWidth,
   Func<BinaryReader, TElement> readOneFunc)
{
   //reading machinery
   _binaryReader = binaryReader;
   _readOneFunc = readOneFunc;
   _typeWidth = typeWidth;

   //schema and options context
   _schemaElement = schemaElement;
   _parquetOptions = parquetOptions;
   _dataTypeHandler = dataTypeHandler;
}
/// <summary>
/// Creates a new instance of <see cref="DataField"/> by specifying all the required attributes.
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="dataType">Native Parquet type</param>
/// <param name="hasNulls">When true, the field accepts null values. Note that nullable values take slightly more disk space and computing comparing to non-nullable, but are more common.</param>
/// <param name="isArray">When true, each value of this field can have multiple values, similar to array in C#.</param>
public DataField(string name, DataType dataType, bool hasNulls = true, bool isArray = false)
   : base(name, SchemaType.Data)
{
   DataType = dataType;
   HasNulls = hasNulls;
   IsArray = isArray;

   //an array contributes exactly one repetition level
   MaxRepetitionLevel = isArray ? 1 : 0;

   //resolve CLR type info only when a handler is registered for this data type
   IDataTypeHandler typeHandler = DataTypeFactory.Match(dataType);
   if (typeHandler != null)
   {
      ClrType = typeHandler.ClrType;
      ClrNullableIfHasNullsType = hasNulls ? ClrType.GetNullable() : ClrType;
   }
}
/// <summary>
/// Writes the next column chunk to the row group. Columns are expected in schema declaration order.
/// </summary>
/// <param name="column">Column to write; must not be null.</param>
public void Write(DataColumn column)
{
   if (column == null)
   {
      throw new ArgumentNullException(nameof(column));
   }

   //take the next schema element and advance the cursor
   Thrift.SchemaElement tse = _thschema[_colIdx];
   _colIdx++;

   IDataTypeHandler dataTypeHandler = DataTypeFactory.Match(tse, _formatOptions);
   //todo: check if the column is in the right order
   List<string> path = _footer.GetPath(tse);
   Thrift.ColumnChunk chunk = WriteColumnChunk(tse, path, column, dataTypeHandler);
   _thriftRowGroup.Columns.Add(chunk);
}
/// <summary>
/// Emits the thrift schema elements for a struct field: an optional container element
/// followed by each member field produced by its own handler.
/// </summary>
public void CreateThrift(Field field, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
   var structField = (StructField)field;

   //container element for the struct itself
   var structElement = new Thrift.SchemaElement(field.Name);
   structElement.Repetition_type = Thrift.FieldRepetitionType.OPTIONAL;
   container.Add(structElement);

   parent.Num_children += 1;

   //each member field attaches itself under the struct element
   foreach (Field memberField in structField.Fields)
   {
      IDataTypeHandler memberHandler = DataTypeFactory.Match(memberField);
      memberHandler.CreateThrift(memberField, structElement, container);
   }
}
/// <summary>
/// Inspects the generic type T and derives its parquet mapping information:
/// data type, underlying CLR element type, array-ness and nullability.
/// </summary>
/// <exception cref="ArgumentException">Thrown for dictionary types and for <see cref="Row"/>.</exception>
private static CInfo Discover()
{
   Type t = typeof(T);
   Type baseType = t;
   bool isArray = false;
   bool hasNulls = false;

   //throw a useful hint
   if (t.TryExtractDictionaryType(out Type dKey, out Type dValue))
   {
      throw new ArgumentException($"cannot declare a dictionary this way, please use {nameof(MapField)}.");
   }

   //unwrap the enumerable element type first, then nullability of the element
   if (t.TryExtractEnumerableType(out Type enumItemType))
   {
      baseType = enumItemType;
      isArray = true;
   }

   if (baseType.IsNullable())
   {
      baseType = baseType.GetNonNullable();
      hasNulls = true;
   }

   if (typeof(Row) == baseType)
   {
      throw new ArgumentException($"{typeof(Row)} is not supported. If you tried to declare a struct please use {typeof(StructField)} instead.");
   }

   IDataTypeHandler handler = DataTypeFactory.Match(baseType);

   //NOTE(review): ThrowClrTypeNotSupported is expected to throw; otherwise handler.DataType below would NRE — confirm
   if (handler == null)
   {
      DataTypeFactory.ThrowClrTypeNotSupported(baseType);
   }

   return new CInfo
   {
      dataType = handler.DataType,
      baseType = baseType,
      isArray = isArray,
      hasNulls = hasNulls
   };
}
/// <summary>
/// Writes the column's pages to the output stream and builds the chunk metadata
/// (value count and total compressed/uncompressed sizes).
/// </summary>
public Thrift.ColumnChunk Write(List<string> path, DataColumn column, IDataTypeHandler dataTypeHandler)
{
   //value count is filled in below; pass 0 at creation time
   Thrift.ColumnChunk chunk = _footer.CreateColumnChunk(_compressionMethod, _stream, _schemaElement.Type, path, 0);
   Thrift.PageHeader ph = _footer.CreateDataPage(column.TotalCount);
   _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel);

   List<PageTag> pages = WriteColumn(column, _schemaElement, dataTypeHandler, maxRepetitionLevel, maxDefinitionLevel);

   //this count must be set to number of all values in the column, including nulls.
   //for hierarchy/repeated columns this is a count of flattened list, including nulls.
   chunk.Meta_data.Num_values = ph.Data_page_header.Num_values;

   //the following counters must include both data size and header size
   chunk.Meta_data.Total_compressed_size = pages.Sum(p => p.HeaderMeta.Compressed_page_size + p.HeaderSize);
   chunk.Meta_data.Total_uncompressed_size = pages.Sum(p => p.HeaderMeta.Uncompressed_page_size + p.HeaderSize);

   return chunk;
}
/// <summary>
/// Emits the thrift schema elements for a MAP field: a MAP-annotated root, a repeated
/// key_value container, then the key and value elements produced by their own handlers.
/// </summary>
public override void CreateThrift(Field field, Thrift.SchemaElement parent, IList<Thrift.SchemaElement> container)
{
   parent.Num_children += 1;

   //add the root container where map begins
   var root = new Thrift.SchemaElement(field.Name)
   {
      Converted_type = Thrift.ConvertedType.MAP,
      Num_children = 1,
      Repetition_type = Thrift.FieldRepetitionType.OPTIONAL
   };
   container.Add(root);

   //key-value is a container for column of keys and column of values
   var keyValue = new Thrift.SchemaElement(MapField.ContainerName)
   {
      Num_children = 0, //is assigned by children
      Repetition_type = Thrift.FieldRepetitionType.REPEATED
   };
   container.Add(keyValue);

   //now add the key and value separately
   MapField mapField = field as MapField;
   IDataTypeHandler keyHandler = DataTypeFactory.Match(mapField.Key);
   IDataTypeHandler valueHandler = DataTypeFactory.Match(mapField.Value);

   //NOTE(review): the code below assumes each handler appends its own element LAST in the
   //container — a nested key or value type appending several elements would break this; confirm
   keyHandler.CreateThrift(mapField.Key, keyValue, container);
   Thrift.SchemaElement tseKey = container[container.Count - 1];
   valueHandler.CreateThrift(mapField.Value, keyValue, container);
   Thrift.SchemaElement tseValue = container[container.Count - 1];

   //fixups for weirdness in RLs
   if (tseKey.Repetition_type == Thrift.FieldRepetitionType.REPEATED)
   {
      tseKey.Repetition_type = Thrift.FieldRepetitionType.OPTIONAL;
   }

   if (tseValue.Repetition_type == Thrift.FieldRepetitionType.REPEATED)
   {
      tseValue.Repetition_type = Thrift.FieldRepetitionType.OPTIONAL;
   }
}
/// <summary>
/// Appends one dictionary's keys and values, as two typed lists, to the outer key/value columns.
/// </summary>
/// <param name="keys">Outer column receiving one list of keys per element.</param>
/// <param name="values">Outer column receiving one list of values per element.</param>
/// <param name="dictionary">Source dictionary to flatten into the two lists.</param>
internal void AddElement(IList keys, IList values, IDictionary dictionary)
{
   IDataTypeHandler keyHandler = DataTypeFactory.Match(Key.DataType);
   IDataTypeHandler valueHandler = DataTypeFactory.Match(Value.DataType);

   //materialise keys and values into typed lists sized to the dictionary
   IList keyList = keyHandler.CreateEmptyList(Key.HasNulls, false, dictionary.Count);
   IList valueList = valueHandler.CreateEmptyList(Value.HasNulls, false, dictionary.Count);

   foreach (object key in dictionary.Keys)
   {
      keyList.Add(key);
   }

   foreach (object value in dictionary.Values)
   {
      valueList.Add(value);
   }

   //each dictionary becomes one element of the outer columns
   keys.Add(keyList);
   values.Add(valueList);
}