/// <summary>
/// Walks the retained DataFrame row by row and emits the paired (retained, supplementary) row indices of a join
/// from a precomputed table of matches.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows are visited in order.</param>
/// <param name="retainedJoinColumnNames">Names of the join columns in <paramref name="retainedDataFrame"/>; used to detect rows holding a null join value.</param>
/// <param name="occurrences">Maps a retained row index to the supplementary row indices it is paired with.</param>
/// <param name="supplementaryJoinColumnsNullIndices">Supplementary row indices paired with every retained row that has a null join value.</param>
/// <param name="retainedRowIndices">Receives the retained row index of each emitted pair.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary row index of each emitted pair, or null for a kept retained row without a match.</param>
/// <param name="isInner">When true, retained rows without an entry in <paramref name="occurrences"/> are skipped instead of being paired with null.</param>
/// <param name="calculateIntersection">When true, the supplementary indices taken from <paramref name="occurrences"/> are collected and returned.</param>
/// <returns>The collected supplementary indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
private static HashSet<long> PerformMerging(DataFrame retainedDataFrame, string[] retainedJoinColumnNames, Dictionary<long, ICollection<long>> occurrences, HashSet<long> supplementaryJoinColumnsNullIndices, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner, bool calculateIntersection)
{
    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    HashSet<long> matchedSupplementaryRows = null;
    if (calculateIntersection)
    {
        matchedSupplementaryRows = new HashSet<long>();
    }

    var joinColumns = retainedJoinColumnNames.Select(columnName => retainedDataFrame.Columns[columnName]).ToArray();

    for (long retainedRow = 0; retainedRow < retainedDataFrame.Columns.RowCount; retainedRow++)
    {
        // A null in any join column: pair the row with every supplementary row recorded as null.
        if (IsAnyNullValueInColumns(joinColumns, retainedRow))
        {
            foreach (long nullRow in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(nullRow);
            }

            continue;
        }

        // No supplementary row satisfies the JOIN condition for this retained row.
        if (!occurrences.TryGetValue(retainedRow, out ICollection<long> matches))
        {
            if (!isInner)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(null);
            }

            continue;
        }

        foreach (long supplementaryRow in matches)
        {
            retainedRowIndices.Append(retainedRow);
            supplementaryRowIndices.Append(supplementaryRow);

            // Store intersection if required; Add ignores values that are already present.
            if (calculateIntersection)
            {
                matchedSupplementaryRows.Add(supplementaryRow);
            }
        }
    }

    return matchedSupplementaryRows;
}
/// <summary>
/// Builds a <see cref="DataFrameColumn"/> over the buffers of one Arrow array and appends it to <paramref name="ret"/>.
/// A struct array is flattened: every child field is appended as its own column whose name is qualified
/// with the struct's (already prefixed) name followed by an underscore.
/// </summary>
/// <param name="field">Schema field describing <paramref name="arrowArray"/>; its type id selects the column type.</param>
/// <param name="arrowArray">Arrow array to convert; it is cast to the concrete array type matching the field's type id.</param>
/// <param name="ret">DataFrame that receives the new column(s) at the end of its column collection.</param>
/// <param name="fieldNamePrefix">Text prepended to the field name to form the column name.</param>
/// <exception cref="NotImplementedException">The field's Arrow type has no DataFrame column mapping; the message is the Arrow type name.</exception>
private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray arrowArray, DataFrame ret, string fieldNamePrefix = "")
{
    IArrowType fieldType = field.DataType;
    DataFrameColumn dataFrameColumn = null;
    string fieldName = fieldNamePrefix + field.Name;

    switch (fieldType.TypeId)
    {
        case ArrowTypeId.Boolean:
            BooleanArray arrowBooleanArray = (BooleanArray)arrowArray;
            ReadOnlyMemory<byte> valueBuffer = arrowBooleanArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> nullBitMapBuffer = arrowBooleanArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new BooleanDataFrameColumn(fieldName, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Double:
            PrimitiveArray<double> arrowDoubleArray = (PrimitiveArray<double>)arrowArray;
            ReadOnlyMemory<byte> doubleValueBuffer = arrowDoubleArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new DoubleDataFrameColumn(fieldName, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Float:
            PrimitiveArray<float> arrowFloatArray = (PrimitiveArray<float>)arrowArray;
            ReadOnlyMemory<byte> floatValueBuffer = arrowFloatArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new SingleDataFrameColumn(fieldName, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Int8:
            PrimitiveArray<sbyte> arrowsbyteArray = (PrimitiveArray<sbyte>)arrowArray;
            ReadOnlyMemory<byte> sbyteValueBuffer = arrowsbyteArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new SByteDataFrameColumn(fieldName, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Int16:
            PrimitiveArray<short> arrowshortArray = (PrimitiveArray<short>)arrowArray;
            ReadOnlyMemory<byte> shortValueBuffer = arrowshortArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new Int16DataFrameColumn(fieldName, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Int32:
            PrimitiveArray<int> arrowIntArray = (PrimitiveArray<int>)arrowArray;
            ReadOnlyMemory<byte> intValueBuffer = arrowIntArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new Int32DataFrameColumn(fieldName, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Int64:
            PrimitiveArray<long> arrowLongArray = (PrimitiveArray<long>)arrowArray;
            ReadOnlyMemory<byte> longValueBuffer = arrowLongArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new Int64DataFrameColumn(fieldName, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.String:
            StringArray stringArray = (StringArray)arrowArray;
            ReadOnlyMemory<byte> dataMemory = stringArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
            ReadOnlyMemory<byte> nullMemory = stringArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new ArrowStringDataFrameColumn(fieldName, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
            break;
        case ArrowTypeId.UInt8:
            PrimitiveArray<byte> arrowbyteArray = (PrimitiveArray<byte>)arrowArray;
            ReadOnlyMemory<byte> byteValueBuffer = arrowbyteArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new ByteDataFrameColumn(fieldName, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.UInt16:
            PrimitiveArray<ushort> arrowUshortArray = (PrimitiveArray<ushort>)arrowArray;
            ReadOnlyMemory<byte> ushortValueBuffer = arrowUshortArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new UInt16DataFrameColumn(fieldName, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.UInt32:
            PrimitiveArray<uint> arrowUintArray = (PrimitiveArray<uint>)arrowArray;
            ReadOnlyMemory<byte> uintValueBuffer = arrowUintArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new UInt32DataFrameColumn(fieldName, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.UInt64:
            PrimitiveArray<ulong> arrowUlongArray = (PrimitiveArray<ulong>)arrowArray;
            ReadOnlyMemory<byte> ulongValueBuffer = arrowUlongArray.ValueBuffer.Memory;
            ReadOnlyMemory<byte> ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
            dataFrameColumn = new UInt64DataFrameColumn(fieldName, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
            break;
        case ArrowTypeId.Struct:
            StructArray structArray = (StructArray)arrowArray;
            StructType structType = (StructType)field.DataType;
            // Only the child arrays are converted; the struct array's own validity bitmap is not read here.
            using (IEnumerator<Field> fieldsEnumerator = structType.Fields.GetEnumerator())
            using (IEnumerator<IArrowArray> structArrayEnumerator = structArray.Fields.GetEnumerator())
            {
                while (fieldsEnumerator.MoveNext() && structArrayEnumerator.MoveNext())
                {
                    // Pass the fully qualified name so that structs nested inside structs keep
                    // their outer prefix (a_b_c rather than b_c). For a top-level struct the
                    // prefix is empty and this equals field.Name + "_".
                    AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, fieldName + "_");
                }
            }
            break;
        case ArrowTypeId.Decimal:
        case ArrowTypeId.Binary:
        case ArrowTypeId.Date32:
        case ArrowTypeId.Date64:
        case ArrowTypeId.Dictionary:
        case ArrowTypeId.FixedSizedBinary:
        case ArrowTypeId.HalfFloat:
        case ArrowTypeId.Interval:
        case ArrowTypeId.List:
        case ArrowTypeId.Map:
        case ArrowTypeId.Null:
        case ArrowTypeId.Time32:
        case ArrowTypeId.Time64:
        default:
            // Report the actual Arrow type name; nameof(fieldType.Name) would always yield "Name".
            throw new NotImplementedException(fieldType.Name);
    }

    // Struct fields append their children during recursion and leave nothing to insert here.
    if (dataFrameColumn != null)
    {
        ret.Columns.Insert(ret.Columns.Count, dataFrameColumn);
    }
}
/// <summary>
/// Creates an empty column whose concrete type corresponds to the CLR type <paramref name="kind"/>.
/// </summary>
/// <param name="kind">CLR element type that selects the column class.</param>
/// <param name="columnNames">Forwarded to GetColumnName to obtain the column's name.</param>
/// <param name="columnIndex">Forwarded to GetColumnName to obtain the column's name.</param>
/// <returns>A new column of the matching type.</returns>
/// <exception cref="NotSupportedException"><paramref name="kind"/> is not one of the handled types.</exception>
private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int columnIndex)
{
    // Evaluated only inside a matching branch, so an unsupported type throws without a name lookup.
    string ColumnName() => GetColumnName(columnNames, columnIndex);

    if (kind == typeof(bool))
    {
        return new BooleanDataFrameColumn(ColumnName());
    }

    if (kind == typeof(int))
    {
        return new Int32DataFrameColumn(ColumnName());
    }

    if (kind == typeof(float))
    {
        return new SingleDataFrameColumn(ColumnName());
    }

    if (kind == typeof(string))
    {
        return new StringDataFrameColumn(ColumnName(), 0);
    }

    if (kind == typeof(long))
    {
        return new Int64DataFrameColumn(ColumnName());
    }

    if (kind == typeof(decimal))
    {
        return new DecimalDataFrameColumn(ColumnName());
    }

    if (kind == typeof(byte))
    {
        return new ByteDataFrameColumn(ColumnName());
    }

    if (kind == typeof(char))
    {
        return new CharDataFrameColumn(ColumnName());
    }

    if (kind == typeof(double))
    {
        return new DoubleDataFrameColumn(ColumnName());
    }

    if (kind == typeof(sbyte))
    {
        return new SByteDataFrameColumn(ColumnName());
    }

    if (kind == typeof(short))
    {
        return new Int16DataFrameColumn(ColumnName());
    }

    if (kind == typeof(uint))
    {
        return new UInt32DataFrameColumn(ColumnName());
    }

    if (kind == typeof(ulong))
    {
        return new UInt64DataFrameColumn(ColumnName());
    }

    if (kind == typeof(ushort))
    {
        return new UInt16DataFrameColumn(ColumnName());
    }

    throw new NotSupportedException(nameof(kind));
}
private Int64DataFrameColumn GetSortIndices(IComparer <T> comparer, out Int64DataFrameColumn columnNullIndices) { List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count); columnNullIndices = new Int64DataFrameColumn("NullIndices", NullCount); long nullIndicesSlot = 0; // Sort each buffer first for (int b = 0; b < _columnContainer.Buffers.Count; b++) { ReadOnlyDataFrameBuffer <T> buffer = _columnContainer.Buffers[b]; ReadOnlySpan <byte> nullBitMapSpan = _columnContainer.NullBitMapBuffers[b].ReadOnlySpan; int[] sortIndices = new int[buffer.Length]; for (int i = 0; i < buffer.Length; i++) { sortIndices[i] = i; } IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer); // Bug fix: QuickSort is not stable. When PrimitiveDataFrameColumn has null values and default values, they move around List <int> nonNullSortIndices = new List <int>(); for (int i = 0; i < sortIndices.Length; i++) { int localSortIndex = sortIndices[i]; if (_columnContainer.IsValid(nullBitMapSpan, localSortIndex)) { nonNullSortIndices.Add(sortIndices[i]); } else { columnNullIndices[nullIndicesSlot] = localSortIndex + b * _columnContainer.Buffers[0].Length; nullIndicesSlot++; } } bufferSortIndices.Add(nonNullSortIndices); } // Simple merge sort to build the full column's sort indices ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex) { int index = bufferSortIndices[bufferIndex][startIndex]; T value; ReadOnlyMemory <byte> buffer = _columnContainer.Buffers[bufferIndex].ReadOnlyBuffer; ReadOnlyMemory <T> typedBuffer = Unsafe.As <ReadOnlyMemory <byte>, ReadOnlyMemory <T> >(ref buffer); if (!typedBuffer.IsEmpty) { bool isArray = MemoryMarshal.TryGetArray(typedBuffer, out ArraySegment <T> arraySegment); if (isArray) { value = arraySegment.Array[index + arraySegment.Offset]; } else { value = _columnContainer.Buffers[bufferIndex][index]; } } else { value = _columnContainer.Buffers[bufferIndex][index]; } 
return(value, startIndex); } SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer); IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers; for (int i = 0; i < buffers.Count; i++) { ReadOnlyDataFrameBuffer <T> buffer = buffers[i]; if (bufferSortIndices[i].Count == 0) { // All nulls continue; } ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0); if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1)) { heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i)); } else { heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >() { (valueAndBufferIndex.Item2, i) });
/// <summary>
/// Returns the sort indices produced by GetSortIndices with the default comparer for <typeparamref name="T"/>.
/// </summary>
/// <param name="nullIndices">Receives the null-index column reported by GetSortIndices.</param>
/// <returns>The sort indices column.</returns>
internal override PrimitiveDataFrameColumn<long> GetAscendingSortIndices(out Int64DataFrameColumn nullIndices)
    => GetSortIndices(Comparer<T>.Default, out nullIndices);
/// <summary>
/// Computes the paired row indices of a join between two DataFrames on one or more column pairs.
/// A retained row is matched with a supplementary row only when every join column pair matches.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows drive the result; its rows are visited in order.</param>
/// <param name="supplementaryDataFrame">DataFrame whose rows are looked up for each retained row.</param>
/// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
/// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>, positionally paired with <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="retainedRowIndices">Receives the retained row index of each emitted pair.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary row index of each emitted pair, or null for a kept retained row without a match.</param>
/// <param name="isInner">When true, retained rows without a match are skipped instead of being paired with null.</param>
/// <param name="calculateIntersection">When true, the matched supplementary row indices are collected and returned.</param>
/// <returns>The matched supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
/// <exception cref="ArgumentNullException">Either array of join column names is null.</exception>
/// <exception cref="ArgumentException">The two arrays of join column names differ in length.</exception>
private static HashSet<long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
{
    if (retainedJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(retainedJoinColumnNames));
    }

    if (supplemetaryJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
    }

    if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
    {
        throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
    }

    HashSet<long> intersection = calculateIntersection ? new HashSet<long>() : null;

    // Get occurrences of values in columns used for join in the retained and supplementary dataframes
    Dictionary<long, ICollection<long>> occurrences = null;
    Dictionary<long, long> retainedIndicesReverseMapping = null;
    HashSet<long> supplementaryJoinColumnsNullIndices = new HashSet<long>();

    for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
    {
        DataFrameColumn shrinkedRetainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

        // Shrink the retained column to the rows that still have matches from the previous join columns
        if (occurrences != null)
        {
            // Only rows with occurrences from the previous step go on for further processing
            var shrinkedRetainedIndices = occurrences.Keys.ToArray();

            // Reverse mapping: index of the row in the shrinked column -> index of that row in the original dataframe
            var newRetainedIndicesReverseMapping = new Dictionary<long, long>(shrinkedRetainedIndices.Length);
            for (int i = 0; i < shrinkedRetainedIndices.Length; i++)
            {
                newRetainedIndicesReverseMapping.Add(i, shrinkedRetainedIndices[i]);
            }

            retainedIndicesReverseMapping = newRetainedIndicesReverseMapping;
            shrinkedRetainedColumn = shrinkedRetainedColumn.Clone(new Int64DataFrameColumn("Indices", shrinkedRetainedIndices));
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

        // Find occurrences for the current join column
        var newOccurrences = shrinkedRetainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> supplementaryColumnNullIndices);

        // Convert the keys from local (shrinked column) indices back to indices in the original dataframe
        if (retainedIndicesReverseMapping != null)
        {
            newOccurrences = newOccurrences.ToDictionary(kvp => retainedIndicesReverseMapping[kvp.Key], kvp => kvp.Value);
        }

        supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

        // Shrink the join result on the current column by the previous join columns (if any):
        // a JOIN happens only if ALL left and right join columns match.
        if (occurrences != null)
        {
            var shrinkedOccurences = new Dictionary<long, ICollection<long>>();

            foreach (var kvp in newOccurrences)
            {
                // Look the previous matches up once per key and test membership against a hash set,
                // instead of a dictionary lookup plus a linear Contains for every candidate row.
                var previousMatches = new HashSet<long>(occurrences[kvp.Key]);
                long[] newValue = kvp.Value.Where(i => previousMatches.Contains(i)).ToArray();

                if (newValue.Length > 0)
                {
                    shrinkedOccurences.Add(kvp.Key, newValue);
                }
            }

            newOccurrences = shrinkedOccurences;
        }

        occurrences = newOccurrences;
    }

    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    // Perform merging
    var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

    for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
    {
        if (!IsAnyNullValueInColumns(retainJoinColumns, i))
        {
            // Get all row indices from the supplementary dataframe that satisfy the JOIN condition
            if (occurrences.TryGetValue(i, out ICollection<long> rowIndices))
            {
                foreach (long supplementaryRowIndex in rowIndices)
                {
                    retainedRowIndices.Append(i);
                    supplementaryRowIndices.Append(supplementaryRowIndex);

                    // Store intersection if required; HashSet.Add already ignores duplicates
                    if (calculateIntersection)
                    {
                        intersection.Add(supplementaryRowIndex);
                    }
                }
            }
            else
            {
                if (isInner)
                {
                    continue;
                }

                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(null);
            }
        }
        else
        {
            // A null join value pairs the row with every supplementary row recorded as null
            foreach (long row in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(row);
            }
        }
    }

    return intersection;
}
/// <summary>
/// Wraps a <see cref="DataFrame"/> around an Arrow <see cref="RecordBatch"/> without copying data
/// </summary>
/// <param name="recordBatch">The record batch whose arrays become the columns of the result, in schema order.</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="recordBatch"/> is null.</exception>
/// <exception cref="NotImplementedException">A field has an Arrow type with no DataFrame column mapping; the message is the Arrow type name.</exception>
public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch)
{
    if (recordBatch == null)
    {
        throw new ArgumentNullException(nameof(recordBatch));
    }

    DataFrame ret = new DataFrame();
    Apache.Arrow.Schema arrowSchema = recordBatch.Schema;
    int fieldIndex = 0;
    IEnumerable<IArrowArray> arrowArrays = recordBatch.Arrays;

    foreach (IArrowArray arrowArray in arrowArrays)
    {
        // Arrays are enumerated in the same order as the schema fields.
        Field field = arrowSchema.GetFieldByIndex(fieldIndex);
        IArrowType fieldType = field.DataType;
        DataFrameColumn dataFrameColumn = null;

        switch (fieldType.TypeId)
        {
            case ArrowTypeId.Boolean:
                BooleanArray arrowBooleanArray = (BooleanArray)arrowArray;
                ReadOnlyMemory<byte> valueBuffer = arrowBooleanArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> nullBitMapBuffer = arrowBooleanArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new BooleanDataFrameColumn(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Double:
                PrimitiveArray<double> arrowDoubleArray = (PrimitiveArray<double>)arrowArray;
                ReadOnlyMemory<byte> doubleValueBuffer = arrowDoubleArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new DoubleDataFrameColumn(field.Name, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Float:
                PrimitiveArray<float> arrowFloatArray = (PrimitiveArray<float>)arrowArray;
                ReadOnlyMemory<byte> floatValueBuffer = arrowFloatArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new SingleDataFrameColumn(field.Name, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Int8:
                PrimitiveArray<sbyte> arrowsbyteArray = (PrimitiveArray<sbyte>)arrowArray;
                ReadOnlyMemory<byte> sbyteValueBuffer = arrowsbyteArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new SByteDataFrameColumn(field.Name, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Int16:
                PrimitiveArray<short> arrowshortArray = (PrimitiveArray<short>)arrowArray;
                ReadOnlyMemory<byte> shortValueBuffer = arrowshortArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int16DataFrameColumn(field.Name, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Int32:
                PrimitiveArray<int> arrowIntArray = (PrimitiveArray<int>)arrowArray;
                ReadOnlyMemory<byte> intValueBuffer = arrowIntArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int32DataFrameColumn(field.Name, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Int64:
                PrimitiveArray<long> arrowLongArray = (PrimitiveArray<long>)arrowArray;
                ReadOnlyMemory<byte> longValueBuffer = arrowLongArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int64DataFrameColumn(field.Name, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.String:
                StringArray stringArray = (StringArray)arrowArray;
                ReadOnlyMemory<byte> dataMemory = stringArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                ReadOnlyMemory<byte> nullMemory = stringArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ArrowStringDataFrameColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                break;
            case ArrowTypeId.UInt8:
                PrimitiveArray<byte> arrowbyteArray = (PrimitiveArray<byte>)arrowArray;
                ReadOnlyMemory<byte> byteValueBuffer = arrowbyteArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ByteDataFrameColumn(field.Name, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.UInt16:
                PrimitiveArray<ushort> arrowUshortArray = (PrimitiveArray<ushort>)arrowArray;
                ReadOnlyMemory<byte> ushortValueBuffer = arrowUshortArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt16DataFrameColumn(field.Name, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.UInt32:
                PrimitiveArray<uint> arrowUintArray = (PrimitiveArray<uint>)arrowArray;
                ReadOnlyMemory<byte> uintValueBuffer = arrowUintArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt32DataFrameColumn(field.Name, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.UInt64:
                PrimitiveArray<ulong> arrowUlongArray = (PrimitiveArray<ulong>)arrowArray;
                ReadOnlyMemory<byte> ulongValueBuffer = arrowUlongArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt64DataFrameColumn(field.Name, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Decimal:
            case ArrowTypeId.Binary:
            case ArrowTypeId.Date32:
            case ArrowTypeId.Date64:
            case ArrowTypeId.Dictionary:
            case ArrowTypeId.FixedSizedBinary:
            case ArrowTypeId.HalfFloat:
            case ArrowTypeId.Interval:
            case ArrowTypeId.List:
            case ArrowTypeId.Map:
            case ArrowTypeId.Null:
            case ArrowTypeId.Struct:
            case ArrowTypeId.Time32:
            case ArrowTypeId.Time64:
            default:
                // Report the actual Arrow type name; nameof(fieldType.Name) would always yield "Name".
                throw new NotImplementedException(fieldType.Name);
        }

        ret.Columns.Insert(ret.Columns.Count, dataFrameColumn);
        fieldIndex++;
    }

    return ret;
}
/// <summary>
/// Computes, for each retained row, the supplementary rows that match it on every join column pair.
/// Join columns are processed one pair at a time; after the first pair only retained rows that still
/// have matches are examined, and their matches are narrowed by each further pair.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose row indices become the keys of the result.</param>
/// <param name="supplementaryDataFrame">DataFrame whose row indices become the values of the result.</param>
/// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
/// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>, positionally paired with <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="supplementaryJoinColumnsNullIndices">Receives the union of the null indices reported by GetGroupedOccurrences for every join column pair.</param>
/// <returns>Retained row index mapped to matching supplementary row indices; null when there are no join columns.</returns>
private static Dictionary<long, ICollection<long>> GetOccurences(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out HashSet<long> supplementaryJoinColumnsNullIndices)
{
    supplementaryJoinColumnsNullIndices = new HashSet<long>();

    Dictionary<long, ICollection<long>> matchesSoFar = null;
    Dictionary<long, long> localToOriginalIndex = null;

    int joinColumnCount = retainedJoinColumnNames.Length;
    for (int step = 0; step < joinColumnCount; step++)
    {
        DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[step]];

        if (matchesSoFar != null)
        {
            // Only rows that matched on the previous join columns are worth examining again.
            var survivingRows = matchesSoFar.Keys.ToArray();

            // Position in the shrunken column -> row index in the original dataframe.
            var positionToRow = new Dictionary<long, long>(survivingRows.Length);
            int position = 0;
            foreach (var originalRow in survivingRows)
            {
                positionToRow.Add(position, originalRow);
                position++;
            }

            localToOriginalIndex = positionToRow;

            var survivingRowsColumn = new Int64DataFrameColumn("Indices", survivingRows);
            retainedColumn = retainedColumn.Clone(survivingRowsColumn);
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[step]];

        // Matches for the current join column pair.
        var stepMatches = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> stepNullIndices);

        // Keys are local to the shrunken column; translate them back to original row indices.
        if (localToOriginalIndex != null)
        {
            stepMatches = stepMatches.ToDictionary(pair => localToOriginalIndex[pair.Key], pair => pair.Value);
        }

        supplementaryJoinColumnsNullIndices.UnionWith(stepNullIndices);

        // A JOIN requires ALL column pairs to match, so drop anything the previous columns did not also match.
        matchesSoFar = matchesSoFar == null
            ? stepMatches
            : GetShrinkedOccurences(matchesSoFar, stepMatches);
    }

    return matchesSoFar;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

/// <summary>
/// Merge DataFrames with a database style join, implemented as a hash join on a single column from each side.
/// </summary>
/// <typeparam name="TKey">Type the non-null join column values are cast to for hashing and lookup.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames when a right column is added to the result.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames when a right column is added to the result.</param>
/// <param name="joinAlgorithm">Which rows are kept: Left, Right, Inner or FullOuter.</param>
/// <returns>A new DataFrame holding clones of this DataFrame's columns followed by clones of <paramref name="other"/>'s columns, restricted to the joined rows.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // A simple hash join. The final table size is not known until runtime,
    // so the row indices of both sides are accumulated as pairs are found.
    PrimitiveDataFrameColumn<long> leftRowIndices = new PrimitiveDataFrameColumn<long>("LeftIndices");
    PrimitiveDataFrameColumn<long> rightRowIndices = new PrimitiveDataFrameColumn<long>("RightIndices");

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // First hash other dataframe on the rightJoinColumn
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            object thisColumnValue = thisColumn[i];
            if (thisColumnValue != null)
            {
                if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(row);
                    }
                }
                else
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(null);
                }
            }
            else
            {
                // NOTE(review): a left row with a null key is emitted only when the right column
                // has null keys too; otherwise it is dropped (FullOuter keeps it) - confirm intended.
                foreach (long row in otherColumnNullIndices)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Mirror image of the left join: hash this dataframe, probe with the other one
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = thisColumn.GroupColumnValues<TKey>(out HashSet<long> thisColumnNullIndices);

        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        for (long i = 0; i < otherColumn.Length; i++)
        {
            object otherColumnValue = otherColumn[i];
            if (otherColumnValue != null)
            {
                if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        leftRowIndices.Append(row);
                        rightRowIndices.Append(i);
                    }
                }
                else
                {
                    leftRowIndices.Append(null);
                    rightRowIndices.Append(i);
                }
            }
            else
            {
                foreach (long thisColumnNullIndex in thisColumnNullIndices)
                {
                    leftRowIndices.Append(thisColumnNullIndex);
                    rightRowIndices.Append(i);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount and probe with the other one
        long leftRowCount = Rows.Count;
        long rightRowCount = other.Rows.Count;
        bool leftColumnIsSmaller = leftRowCount <= rightRowCount;

        DataFrameColumn leftColumn = Columns[leftJoinColumn];
        DataFrameColumn rightColumn = other.Columns[rightJoinColumn];
        DataFrameColumn hashColumn = leftColumnIsSmaller ? leftColumn : rightColumn;
        DataFrameColumn probeColumn = leftColumnIsSmaller ? rightColumn : leftColumn;

        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>(out HashSet<long> smallerDataFrameColumnNullIndices);

        for (long i = 0; i < probeColumn.Length; i++)
        {
            object probeValue = probeColumn[i];
            if (probeValue != null)
            {
                if (multimap.TryGetValue((TKey)probeValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        // "row" belongs to the hashed side, "i" to the probed side
                        leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                        rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                    }
                }
            }
            else
            {
                foreach (long nullIndex in smallerDataFrameColumnNullIndices)
                {
                    leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                    rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Keys present on both sides; right rows with these keys are already paired below
        HashSet<TKey> matchedKeys = new HashSet<TKey>(EqualityComparer<TKey>.Default);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices");
        for (long i = 0; i < thisColumn.Length; i++)
        {
            object thisColumnValue = thisColumn[i];
            if (thisColumnValue == null)
            {
                thisColumnNullIndices.Append(i);
                continue;
            }

            TKey key = (TKey)thisColumnValue;
            if (multimap.TryGetValue(key, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                }

                // Record the key only when at least one pair was emitted for it
                if (rowNumbers.Count > 0)
                {
                    matchedKeys.Add(key);
                }
            }
            else
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Right rows whose key never appeared on the left
        for (long i = 0; i < otherColumn.Length; i++)
        {
            object value = otherColumn[i];
            if (value != null && !matchedKeys.Contains((TKey)value))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }

        // Now handle the null rows
        foreach (long? thisColumnNullIndex in thisColumnNullIndices)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(otherColumnNullIndex);
            }

            if (otherColumnNullIndices.Count == 0)
            {
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(null);
            }
        }

        if (thisColumnNullIndices.Length == 0)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(otherColumnNullIndex);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    // Materialize the result: left columns first, then right columns with duplicate names resolved
    DataFrame ret = new DataFrame();
    for (int i = 0; i < Columns.Count; i++)
    {
        ret.Columns.Insert(i, Columns[i].Clone(leftRowIndices));
    }

    for (int i = 0; i < other.Columns.Count; i++)
    {
        DataFrameColumn column = other.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}