/// <summary>
/// Decodes a buffer of length-prefixed items and appends each one to <paramref name="destination"/>.
/// Every item consists of a 4-byte length (read with <see cref="BitConverter.ToInt32(byte[], int)"/>)
/// immediately followed by that many payload bytes.
/// </summary>
/// <param name="data">Buffer holding the length-prefixed items back to back.</param>
/// <param name="schemaElement">Schema element whose UTF8/JSON annotations select string decoding.</param>
/// <param name="destination">
/// List receiving the decoded items: strings when the element is annotated UTF8 or JSON, or when
/// <c>_options.TreatByteArrayAsString</c> is set; otherwise a fresh <c>byte[]</c> copy per item.
/// </param>
private void ReadByteArray(byte[] data, SchemaElement schemaElement, IList destination)
{
    // Both UTF8 and JSON are stored as binary data (byte_array), so either annotation can show up here
    // and both should be treated in the same way as strings.
    // TODO: need to find a better implementation for this; date strings are always broken here
    // because of the type mismatch.
    bool decodeAsString =
        schemaElement.IsAnnotatedWith(Thrift.ConvertedType.UTF8) ||
        schemaElement.IsAnnotatedWith(Thrift.ConvertedType.JSON) ||
        _options.TreatByteArrayAsString;

    int offset = 0;
    while (offset < data.Length)
    {
        int length = BitConverter.ToInt32(data, offset);
        offset += 4; // skip the length prefix, now positioned at the payload

        if (decodeAsString)
        {
            destination.Add(UTF8.GetString(data, offset, length));
        }
        else
        {
            byte[] payload = new byte[length];
            Array.Copy(data, offset, payload, 0, length);
            destination.Add(payload);
        }

        offset += length; // move on to the next item's length prefix
    }
}
/// <summary>
/// Decodes a buffer of fixed-width items and appends the decoded values to
/// <paramref name="destination"/>. The item width is <c>schema.Thrift.Type_length</c>.
/// </summary>
/// <param name="data">Buffer holding the fixed-width items back to back.</param>
/// <param name="schema">
/// Schema element; only the DECIMAL and INTERVAL annotations are handled, any other element
/// produces no values.
/// </param>
/// <param name="destination">
/// List receiving a <c>decimal</c> per item for DECIMAL, or an <c>Interval</c> per item for INTERVAL.
/// </param>
private static void ReadFixedLenByteArray(byte[] data, SchemaElement schema, IList destination)
{
    if (schema.IsAnnotatedWith(Thrift.ConvertedType.DECIMAL))
    {
        int typeLength = schema.Thrift.Type_length;

        // A single buffer is obtained up front and overwritten for every item.
        byte[] itemData = ByteGarbage.GetByteArray(typeLength);

        for (int i = 0; i < data.Length; i += typeLength)
        {
            Array.Copy(data, i, itemData, 0, typeLength);
            decimal dc = new BigDecimal(itemData, schema.Thrift);
            destination.Add(dc);
        }
    }
    else if (schema.IsAnnotatedWith(Thrift.ConvertedType.INTERVAL))
    {
        int typeLength = schema.Thrift.Type_length;

        for (int i = 0; i < data.Length; i += typeLength)
        {
            // assume this is the number of months / days / millis offset from the Julian calendar.
            // The three 4-byte integers are read straight out of the source buffer at their offsets,
            // so no temporary arrays are needed per item.
            destination.Add(new Interval(
                BitConverter.ToInt32(data, i),
                BitConverter.ToInt32(data, i + 4),
                BitConverter.ToInt32(data, i + 8)));
        }
    }
}
/// <summary>
/// Writes every value of <paramref name="data"/> to <paramref name="writer"/>, choosing the
/// element type and conversion from the annotation on <paramref name="schema"/>.
/// </summary>
/// <param name="writer">Writer receiving the encoded values.</param>
/// <param name="schema">Schema element whose annotation selects the branch below.</param>
/// <param name="data">
/// Values to write. Must be a <c>List&lt;DateTimeOffset&gt;</c> for DATE, <c>List&lt;decimal&gt;</c>
/// for DECIMAL, <c>List&lt;byte&gt;</c> for INT_8, <c>List&lt;sbyte&gt;</c> for UINT_8 and
/// <c>List&lt;int&gt;</c> otherwise; a different list type fails the cast.
/// </param>
/// <exception cref="ParquetException">
/// A DECIMAL value overflows while being scaled and converted to a 32-bit integer.
/// </exception>
private static void WriteInt32(BinaryWriter writer, SchemaElement schema, IList data)
{
    if (schema.IsAnnotatedWith(Thrift.ConvertedType.DATE))
    {
        var dates = (List<DateTimeOffset>)data;
        foreach (DateTimeOffset date in dates)
        {
            int days = (int)date.ToUnixDays();
            // NOTE(review): the day count is written with a +1 offset; confirm against the reader.
            writer.Write(days + 1);
        }
        return;
    }

    if (schema.IsAnnotatedWith(Thrift.ConvertedType.DECIMAL))
    {
        var decimals = (List<decimal>)data;
        double scaleFactor = Math.Pow(10, schema.Thrift.Scale);
        foreach (decimal d in decimals)
        {
            try
            {
                // Scale the value up to an integer and store it as a 32-bit integer.
                int unscaled = (int)(d * (decimal)scaleFactor);
                writer.Write(unscaled);
            }
            catch (OverflowException)
            {
                throw new ParquetException(
                    $"value '{d}' is too large to fit into scale {schema.Thrift.Scale} and precision {schema.Thrift.Precision}");
            }
        }
        return;
    }

    if (schema.IsAnnotatedWith(Thrift.ConvertedType.INT_8))
    {
        // NOTE(review): INT_8 is paired with byte and UINT_8 with sbyte here; this looks swapped,
        // confirm against the type mapping used by callers before changing it.
        var bytes = (List<byte>)data;
        foreach (byte byteValue in bytes)
        {
            writer.Write(byteValue);
        }
        return;
    }

    if (schema.IsAnnotatedWith(Thrift.ConvertedType.UINT_8))
    {
        var signedBytes = (List<sbyte>)data;
        foreach (sbyte byteValue in signedBytes)
        {
            writer.Write(byteValue);
        }
        return;
    }

    var ints = (List<int>)data;
    foreach (int value in ints)
    {
        writer.Write(value);
    }
}
/// <summary>
/// Reads consecutive 8-byte integers from <paramref name="data"/> (via
/// <see cref="BitConverter.ToInt64(byte[], int)"/>) and appends them to
/// <paramref name="destination"/>, converting them according to <paramref name="schema"/>.
/// </summary>
/// <param name="data">Buffer holding the 8-byte values back to back.</param>
/// <param name="schema">
/// Schema element. When its <c>ElementType</c> is <see cref="DateTimeOffset"/> each value is
/// converted with <c>FromUnixTime()</c>; when it is annotated DECIMAL each value is multiplied
/// by 10 to the power of minus <c>Thrift.Scale</c>; otherwise the raw <c>long</c> is stored.
/// </param>
/// <param name="destination">
/// List receiving the values; must be a <c>List&lt;DateTimeOffset&gt;</c> in the date/time case.
/// </param>
private static void ReadLong(byte[] data, SchemaElement schema, IList destination)
{
    if (schema.ElementType == typeof(DateTimeOffset))
    {
        var timestamps = (List<DateTimeOffset>)destination;
        for (int offset = 0; offset < data.Length; offset += 8)
        {
            long raw = BitConverter.ToInt64(data, offset);
            timestamps.Add(raw.FromUnixTime());
        }
        return;
    }

    if (schema.IsAnnotatedWith(Thrift.ConvertedType.DECIMAL))
    {
        decimal scaleFactor = (decimal)Math.Pow(10, -schema.Thrift.Scale);
        for (int offset = 0; offset < data.Length; offset += 8)
        {
            long unscaled = BitConverter.ToInt64(data, offset);
            decimal scaled = unscaled * scaleFactor;
            destination.Add(scaled);
        }
        return;
    }

    for (int offset = 0; offset < data.Length; offset += 8)
    {
        long value = BitConverter.ToInt64(data, offset);
        destination.Add(value);
    }
}
/// <summary>
/// Decodes the values in <paramref name="data"/> and appends them to
/// <paramref name="destination"/>, choosing the width and conversion from the annotation on
/// <paramref name="schema"/>.
/// </summary>
/// <param name="data">
/// Buffer to decode. DATE, DECIMAL and un-annotated elements are read as consecutive 4-byte
/// integers; INT_8 and UINT_8 elements are read one byte per value.
/// </param>
/// <param name="schema">Schema element whose annotation selects the branch below.</param>
/// <param name="destination">
/// List receiving a <see cref="DateTimeOffset"/> per value for DATE, a <c>decimal</c> for DECIMAL
/// (the integer multiplied by 10 to the power of minus <c>Thrift.Scale</c>), a <c>byte</c> for
/// INT_8, an <c>sbyte</c> for UINT_8, and an <c>int</c> otherwise.
/// </param>
private static void ReadInt32(byte[] data, SchemaElement schema, IList destination)
{
    if (schema.IsAnnotatedWith(Thrift.ConvertedType.DATE))
    {
        for (int i = 0; i < data.Length; i += 4)
        {
            int iv = BitConverter.ToInt32(data, i);
            destination.Add(new DateTimeOffset(iv.FromUnixTime(), TimeSpan.Zero));
        }
    }
    else if (schema.IsAnnotatedWith(Thrift.ConvertedType.DECIMAL))
    {
        decimal scaleFactor = (decimal)Math.Pow(10, -schema.Thrift.Scale);
        for (int i = 0; i < data.Length; i += 4)
        {
            int iv = BitConverter.ToInt32(data, i);
            decimal dv = iv * scaleFactor;
            destination.Add(dv);
        }
    }
    else if (schema.IsAnnotatedWith(Thrift.ConvertedType.INT_8))
    {
        foreach (byte byteValue in data)
        {
            destination.Add(byteValue);
        }
    }
    else if (schema.IsAnnotatedWith(Thrift.ConvertedType.UINT_8))
    {
        foreach (byte byteValue in data)
        {
            // Reinterpret the byte's bits as a signed value. Convert.ToSByte would throw
            // OverflowException for any byte above 127, i.e. for every negative sbyte that was
            // written out as a single raw byte.
            destination.Add(unchecked((sbyte)byteValue));
        }
    }
    else
    {
        for (int i = 0; i < data.Length; i += 4)
        {
            int iv = BitConverter.ToInt32(data, i);
            destination.Add(iv);
        }
    }
}
/// <summary>
/// Decodes a buffer of fixed-width DECIMAL items and appends one <c>decimal</c> per item to
/// <paramref name="destination"/>. The item width is <c>schema.Thrift.Type_length</c>.
/// </summary>
/// <param name="data">Buffer holding the fixed-width items back to back.</param>
/// <param name="schema">
/// Schema element; elements not annotated DECIMAL produce no values.
/// </param>
/// <param name="destination">List receiving the decoded <c>decimal</c> values.</param>
private static void ReadFixedLenByteArray(byte[] data, SchemaElement schema, IList destination)
{
    // The annotation does not change while iterating, so check it once instead of walking the
    // whole buffer only to skip every item.
    if (!schema.IsAnnotatedWith(Thrift.ConvertedType.DECIMAL))
    {
        return;
    }

    int typeLength = schema.Thrift.Type_length;

    for (int i = 0; i < data.Length; i += typeLength)
    {
        // go from data - decimal needs to be 16 bytes but not from Spark - variable fixed nonsense
        byte[] itemData = new byte[typeLength];
        Array.Copy(data, i, itemData, 0, typeLength);

        // The item's bytes are handed to BigInteger in reversed order (its byte[] constructor
        // reads little-endian). itemData is a private copy, so it can be reversed in place.
        Array.Reverse(itemData);

        var bigInt = new BigDecimal(new BigInteger(itemData), schema.Thrift.Scale, schema.Thrift.Precision);
        decimal dc = (decimal)bigInt;
        destination.Add(dc);
    }
}
/// <summary>
/// Reads consecutive 4-byte integers from <paramref name="data"/> (via
/// <see cref="BitConverter.ToInt32(byte[], int)"/>) and appends them to
/// <paramref name="destination"/>.
/// </summary>
/// <param name="data">Buffer holding the 4-byte values back to back.</param>
/// <param name="schema">
/// Schema element; when annotated DATE each integer is converted with <c>FromUnixTime()</c> and
/// wrapped in a <see cref="DateTimeOffset"/> with a zero offset, otherwise the raw <c>int</c> is stored.
/// </param>
/// <param name="destination">List receiving the decoded values.</param>
private static void ReadInt32(byte[] data, SchemaElement schema, IList destination)
{
    bool isDate = schema.IsAnnotatedWith(Thrift.ConvertedType.DATE);

    for (int offset = 0; offset < data.Length; offset += 4)
    {
        int raw = BitConverter.ToInt32(data, offset);

        if (isDate)
        {
            destination.Add(new DateTimeOffset(raw.FromUnixTime(), TimeSpan.Zero));
        }
        else
        {
            destination.Add(raw);
        }
    }
}
/// <summary>
/// Writes every value of <paramref name="data"/> to <paramref name="writer"/> as a <c>long</c>.
/// </summary>
/// <param name="writer">Writer receiving the encoded values.</param>
/// <param name="schema">
/// Schema element; when annotated TIMESTAMP_MILLIS the values are <see cref="DateTimeOffset"/>s
/// converted with <c>ToUnixTime()</c>, otherwise they are written unchanged.
/// </param>
/// <param name="data">
/// Values to write: a <c>List&lt;DateTimeOffset&gt;</c> for TIMESTAMP_MILLIS, a
/// <c>List&lt;long&gt;</c> otherwise; a different list type fails the cast.
/// </param>
private static void WriteLong(BinaryWriter writer, SchemaElement schema, IList data)
{
    if (!schema.IsAnnotatedWith(Thrift.ConvertedType.TIMESTAMP_MILLIS))
    {
        var values = (List<long>)data;
        foreach (long value in values)
        {
            writer.Write(value);
        }
        return;
    }

    var timestamps = (List<DateTimeOffset>)data;
    foreach (DateTimeOffset timestamp in timestamps)
    {
        // Keep the explicit long local so the 8-byte Write overload is always the one selected.
        long unixTime = timestamp.ToUnixTime();
        writer.Write(unixTime);
    }
}
/// <summary>
/// Writes every value of <paramref name="data"/> to <paramref name="writer"/> as an <c>int</c>.
/// </summary>
/// <param name="writer">Writer receiving the encoded values.</param>
/// <param name="schema">
/// Schema element; when annotated DATE the values are <see cref="DateTimeOffset"/>s converted
/// with <c>ToUnixDays()</c> and cast to <c>int</c>, otherwise they are written unchanged.
/// </param>
/// <param name="data">
/// Values to write: a <c>List&lt;DateTimeOffset&gt;</c> for DATE, a <c>List&lt;int&gt;</c>
/// otherwise; a different list type fails the cast.
/// </param>
private static void WriteInt32(BinaryWriter writer, SchemaElement schema, IList data)
{
    if (!schema.IsAnnotatedWith(Thrift.ConvertedType.DATE))
    {
        var values = (List<int>)data;
        foreach (int value in values)
        {
            writer.Write(value);
        }
        return;
    }

    var dates = (List<DateTimeOffset>)data;
    foreach (DateTimeOffset date in dates)
    {
        int days = (int)date.ToUnixDays();
        writer.Write(days);
    }
}
/// <summary>
/// Reads consecutive 8-byte integers from <paramref name="data"/> (via
/// <see cref="BitConverter.ToInt64(byte[], int)"/>) and appends them to
/// <paramref name="destination"/>.
/// </summary>
/// <param name="data">Buffer holding the 8-byte values back to back.</param>
/// <param name="schema">
/// Schema element; when annotated TIMESTAMP_MILLIS each value is converted with
/// <c>FromUnixTime()</c>, otherwise the raw <c>long</c> is stored.
/// </param>
/// <param name="destination">
/// List receiving the values; must be a <c>List&lt;DateTimeOffset&gt;</c> for TIMESTAMP_MILLIS.
/// </param>
private static void ReadLong(byte[] data, SchemaElement schema, IList destination)
{
    if (!schema.IsAnnotatedWith(Thrift.ConvertedType.TIMESTAMP_MILLIS))
    {
        for (int offset = 0; offset < data.Length; offset += 8)
        {
            long value = BitConverter.ToInt64(data, offset);
            destination.Add(value);
        }
        return;
    }

    var timestamps = (List<DateTimeOffset>)destination;
    for (int offset = 0; offset < data.Length; offset += 8)
    {
        long raw = BitConverter.ToInt64(data, offset);
        timestamps.Add(raw.FromUnixTime());
    }
}