Exemplo n.º 1
0
        /// <summary>
        /// Walks every row of <paramref name="retainedDataFrame"/> and emits paired row indices
        /// (retained row, supplementary row) describing the join result.
        /// </summary>
        /// <param name="retainedDataFrame">Data frame whose rows drive the iteration.</param>
        /// <param name="retainedJoinColumnNames">Names of the join columns in <paramref name="retainedDataFrame"/>.</param>
        /// <param name="occurrences">Maps a retained row index to the supplementary row indices paired with it.</param>
        /// <param name="supplementaryJoinColumnsNullIndices">Supplementary row indices that are paired with every retained row for which <c>IsAnyNullValueInColumns</c> returns true.</param>
        /// <param name="retainedRowIndices">Receives the retained-side row index of every emitted pair.</param>
        /// <param name="supplementaryRowIndices">Receives the supplementary-side row index of every emitted pair (null when a retained row has no match and <paramref name="isInner"/> is false).</param>
        /// <param name="isInner">When true, retained rows without an entry in <paramref name="occurrences"/> are dropped instead of being paired with null.</param>
        /// <param name="calculateIntersection">When true, the set of supplementary rows matched through <paramref name="occurrences"/> is collected and returned.</param>
        /// <returns>The set of matched supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
        private static HashSet <long> PerformMerging(DataFrame retainedDataFrame, string[] retainedJoinColumnNames,
                                                     Dictionary <long, ICollection <long> > occurrences, HashSet <long> supplementaryJoinColumnsNullIndices,
                                                     out PrimitiveDataFrameColumn <long> retainedRowIndices, out PrimitiveDataFrameColumn <long> supplementaryRowIndices,
                                                     bool isInner, bool calculateIntersection)
        {
            retainedRowIndices      = new Int64DataFrameColumn("RetainedIndices");
            supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

            // Stays null unless the caller asked for the intersection.
            HashSet <long> matchedSupplementaryRows = calculateIntersection ? new HashSet <long>() : null;

            var joinColumns = retainedJoinColumnNames.Select(columnName => retainedDataFrame.Columns[columnName]).ToArray();

            for (long retainedRow = 0; retainedRow < retainedDataFrame.Columns.RowCount; retainedRow++)
            {
                if (IsAnyNullValueInColumns(joinColumns, retainedRow))
                {
                    // Null-flagged retained row: pair it with every recorded supplementary null row.
                    foreach (long supplementaryNullRow in supplementaryJoinColumnsNullIndices)
                    {
                        retainedRowIndices.Append(retainedRow);
                        supplementaryRowIndices.Append(supplementaryNullRow);
                    }
                    continue;
                }

                if (!occurrences.TryGetValue(retainedRow, out ICollection <long> matchingRows))
                {
                    // No supplementary match: inner joins skip the row, other joins pad with null.
                    if (!isInner)
                    {
                        retainedRowIndices.Append(retainedRow);
                        supplementaryRowIndices.Append(null);
                    }
                    continue;
                }

                // Emit one pair per supplementary row that satisfies the JOIN condition.
                foreach (long matchedRow in matchingRows)
                {
                    retainedRowIndices.Append(retainedRow);
                    supplementaryRowIndices.Append(matchedRow);

                    // HashSet.Add ignores duplicates, so no prior Contains check is needed.
                    matchedSupplementaryRows?.Add(matchedRow);
                }
            }

            return(matchedSupplementaryRows);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Creates a <see cref="DataFrameColumn"/> from the buffers of <paramref name="arrowArray"/> and appends it
        /// to the end of <paramref name="ret"/>. Struct arrays are flattened: each child array is appended as its own
        /// column whose name is prefixed with the full path of the enclosing struct field(s), joined by "_".
        /// </summary>
        /// <param name="field">Arrow schema field describing <paramref name="arrowArray"/>.</param>
        /// <param name="arrowArray">Arrow array whose buffers back the new column.</param>
        /// <param name="ret">Data frame that receives the new column(s).</param>
        /// <param name="fieldNamePrefix">Prefix prepended to the field name; non-empty only while flattening structs.</param>
        /// <exception cref="NotImplementedException">The Arrow type of <paramref name="field"/> is not supported.</exception>
        private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray arrowArray, DataFrame ret, string fieldNamePrefix = "")
        {
            IArrowType      fieldType       = field.DataType;
            DataFrameColumn dataFrameColumn = null;
            string          fieldName       = fieldNamePrefix + field.Name;

            switch (fieldType.TypeId)
            {
            case ArrowTypeId.Boolean:
                BooleanArray          arrowBooleanArray = (BooleanArray)arrowArray;
                ReadOnlyMemory <byte> valueBuffer       = arrowBooleanArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> nullBitMapBuffer  = arrowBooleanArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new BooleanDataFrameColumn(fieldName, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Double:
                PrimitiveArray <double> arrowDoubleArray       = (PrimitiveArray <double>)arrowArray;
                ReadOnlyMemory <byte>   doubleValueBuffer      = arrowDoubleArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>   doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new DoubleDataFrameColumn(fieldName, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Float:
                PrimitiveArray <float> arrowFloatArray       = (PrimitiveArray <float>)arrowArray;
                ReadOnlyMemory <byte>  floatValueBuffer      = arrowFloatArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>  floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new SingleDataFrameColumn(fieldName, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Int8:
                PrimitiveArray <sbyte> arrowsbyteArray       = (PrimitiveArray <sbyte>)arrowArray;
                ReadOnlyMemory <byte>  sbyteValueBuffer      = arrowsbyteArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>  sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new SByteDataFrameColumn(fieldName, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Int16:
                PrimitiveArray <short> arrowshortArray       = (PrimitiveArray <short>)arrowArray;
                ReadOnlyMemory <byte>  shortValueBuffer      = arrowshortArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>  shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int16DataFrameColumn(fieldName, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Int32:
                PrimitiveArray <int>  arrowIntArray       = (PrimitiveArray <int>)arrowArray;
                ReadOnlyMemory <byte> intValueBuffer      = arrowIntArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int32DataFrameColumn(fieldName, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Int64:
                PrimitiveArray <long> arrowLongArray       = (PrimitiveArray <long>)arrowArray;
                ReadOnlyMemory <byte> longValueBuffer      = arrowLongArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new Int64DataFrameColumn(fieldName, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.String:
                StringArray           stringArray   = (StringArray)arrowArray;
                ReadOnlyMemory <byte> dataMemory    = stringArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                ReadOnlyMemory <byte> nullMemory    = stringArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ArrowStringDataFrameColumn(fieldName, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                break;

            case ArrowTypeId.UInt8:
                PrimitiveArray <byte> arrowbyteArray       = (PrimitiveArray <byte>)arrowArray;
                ReadOnlyMemory <byte> byteValueBuffer      = arrowbyteArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ByteDataFrameColumn(fieldName, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.UInt16:
                PrimitiveArray <ushort> arrowUshortArray       = (PrimitiveArray <ushort>)arrowArray;
                ReadOnlyMemory <byte>   ushortValueBuffer      = arrowUshortArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>   ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt16DataFrameColumn(fieldName, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.UInt32:
                PrimitiveArray <uint> arrowUintArray       = (PrimitiveArray <uint>)arrowArray;
                ReadOnlyMemory <byte> uintValueBuffer      = arrowUintArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt32DataFrameColumn(fieldName, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.UInt64:
                PrimitiveArray <ulong> arrowUlongArray       = (PrimitiveArray <ulong>)arrowArray;
                ReadOnlyMemory <byte>  ulongValueBuffer      = arrowUlongArray.ValueBuffer.Memory;
                ReadOnlyMemory <byte>  ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new UInt64DataFrameColumn(fieldName, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;

            case ArrowTypeId.Struct:
                StructArray structArray = (StructArray)arrowArray;
                StructType  structType  = (StructType)field.DataType;
                // Walk the child fields and child arrays in lock-step; both enumerators are disposed on exit.
                using (IEnumerator <Field> fieldsEnumerator = structType.Fields.GetEnumerator())
                using (IEnumerator <IArrowArray> structArrayEnumerator = structArray.Fields.GetEnumerator())
                {
                    while (fieldsEnumerator.MoveNext() && structArrayEnumerator.MoveNext())
                    {
                        // Pass the full accumulated name (not just field.Name) so that structs nested more than
                        // one level deep keep their outer prefix and do not produce colliding column names.
                        AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, fieldName + "_");
                    }
                }
                // dataFrameColumn stays null: the children were appended by the recursive calls.
                break;

            case ArrowTypeId.Decimal:
            case ArrowTypeId.Binary:
            case ArrowTypeId.Date32:
            case ArrowTypeId.Date64:
            case ArrowTypeId.Dictionary:
            case ArrowTypeId.FixedSizedBinary:
            case ArrowTypeId.HalfFloat:
            case ArrowTypeId.Interval:
            case ArrowTypeId.List:
            case ArrowTypeId.Map:
            case ArrowTypeId.Null:
            case ArrowTypeId.Time32:
            case ArrowTypeId.Time64:
            default:
                // Report the actual Arrow type name; nameof(fieldType.Name) would always yield the literal "Name".
                throw new NotImplementedException(fieldType.Name);
            }

            if (dataFrameColumn != null)
            {
                ret.Columns.Insert(ret.Columns.Count, dataFrameColumn);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates an empty <see cref="DataFrameColumn"/> whose concrete type corresponds to <paramref name="kind"/>.
        /// The column name is obtained from <c>GetColumnName(columnNames, columnIndex)</c>.
        /// </summary>
        /// <param name="kind">Element type of the column to create.</param>
        /// <param name="columnNames">Column names forwarded to <c>GetColumnName</c>.</param>
        /// <param name="columnIndex">Column index forwarded to <c>GetColumnName</c>.</param>
        /// <returns>A new column of the matching concrete type.</returns>
        /// <exception cref="NotSupportedException"><paramref name="kind"/> is not one of the supported element types.</exception>
        private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int columnIndex)
        {
            // Evaluated lazily so that an unsupported type throws without performing a name lookup first.
            string ResolveName() => GetColumnName(columnNames, columnIndex);

            if (kind == typeof(bool))
            {
                return(new BooleanDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(int))
            {
                return(new Int32DataFrameColumn(ResolveName()));
            }
            if (kind == typeof(float))
            {
                return(new SingleDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(string))
            {
                return(new StringDataFrameColumn(ResolveName(), 0));
            }
            if (kind == typeof(long))
            {
                return(new Int64DataFrameColumn(ResolveName()));
            }
            if (kind == typeof(decimal))
            {
                return(new DecimalDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(byte))
            {
                return(new ByteDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(char))
            {
                return(new CharDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(double))
            {
                return(new DoubleDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(sbyte))
            {
                return(new SByteDataFrameColumn(ResolveName()));
            }
            if (kind == typeof(short))
            {
                return(new Int16DataFrameColumn(ResolveName()));
            }
            if (kind == typeof(uint))
            {
                return(new UInt32DataFrameColumn(ResolveName()));
            }
            if (kind == typeof(ulong))
            {
                return(new UInt64DataFrameColumn(ResolveName()));
            }
            if (kind == typeof(ushort))
            {
                return(new UInt16DataFrameColumn(ResolveName()));
            }

            throw new NotSupportedException(nameof(kind));
        }
Exemplo n.º 4
0
        private Int64DataFrameColumn GetSortIndices(IComparer <T> comparer, out Int64DataFrameColumn columnNullIndices)
        {
            List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count);

            columnNullIndices = new Int64DataFrameColumn("NullIndices", NullCount);
            long nullIndicesSlot = 0;

            // Sort each buffer first
            for (int b = 0; b < _columnContainer.Buffers.Count; b++)
            {
                ReadOnlyDataFrameBuffer <T> buffer         = _columnContainer.Buffers[b];
                ReadOnlySpan <byte>         nullBitMapSpan = _columnContainer.NullBitMapBuffers[b].ReadOnlySpan;
                int[] sortIndices = new int[buffer.Length];
                for (int i = 0; i < buffer.Length; i++)
                {
                    sortIndices[i] = i;
                }
                IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer);
                // Bug fix: QuickSort is not stable. When PrimitiveDataFrameColumn has null values and default values, they move around
                List <int> nonNullSortIndices = new List <int>();
                for (int i = 0; i < sortIndices.Length; i++)
                {
                    int localSortIndex = sortIndices[i];
                    if (_columnContainer.IsValid(nullBitMapSpan, localSortIndex))
                    {
                        nonNullSortIndices.Add(sortIndices[i]);
                    }
                    else
                    {
                        columnNullIndices[nullIndicesSlot] = localSortIndex + b * _columnContainer.Buffers[0].Length;
                        nullIndicesSlot++;
                    }
                }
                bufferSortIndices.Add(nonNullSortIndices);
            }
            // Simple merge sort to build the full column's sort indices
            ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex)
            {
                int index = bufferSortIndices[bufferIndex][startIndex];
                T   value;
                ReadOnlyMemory <byte> buffer      = _columnContainer.Buffers[bufferIndex].ReadOnlyBuffer;
                ReadOnlyMemory <T>    typedBuffer = Unsafe.As <ReadOnlyMemory <byte>, ReadOnlyMemory <T> >(ref buffer);

                if (!typedBuffer.IsEmpty)
                {
                    bool isArray = MemoryMarshal.TryGetArray(typedBuffer, out ArraySegment <T> arraySegment);
                    if (isArray)
                    {
                        value = arraySegment.Array[index + arraySegment.Offset];
                    }
                    else
                    {
                        value = _columnContainer.Buffers[bufferIndex][index];
                    }
                }
                else
                {
                    value = _columnContainer.Buffers[bufferIndex][index];
                }
                return(value, startIndex);
            }

            SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer);
            IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers;

            for (int i = 0; i < buffers.Count; i++)
            {
                ReadOnlyDataFrameBuffer <T> buffer = buffers[i];
                if (bufferSortIndices[i].Count == 0)
                {
                    // All nulls
                    continue;
                }
                ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0);
                if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1))
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i));
                }
                else
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >()
                    {
                        (valueAndBufferIndex.Item2, i)
                    });
Exemplo n.º 5
0
        /// <summary>
        /// Returns the sort indices computed by <c>GetSortIndices</c> with the default comparer for <c>T</c>.
        /// </summary>
        /// <param name="nullIndices">Receives the null-index column produced by <c>GetSortIndices</c>.</param>
        /// <returns>The sort indices column.</returns>
        internal override PrimitiveDataFrameColumn <long> GetAscendingSortIndices(out Int64DataFrameColumn nullIndices)
            => GetSortIndices(Comparer <T> .Default, out nullIndices);
Exemplo n.º 6
0
        /// <summary>
        /// Computes the row pairing for a join of <paramref name="retainedDataFrame"/> with
        /// <paramref name="supplementaryDataFrame"/> on the given join columns.
        /// Works in two phases: first, per join column, it narrows a map from retained row index to the
        /// supplementary row indices that matched on every column processed so far; then it walks all retained
        /// rows and emits (retained, supplementary) index pairs.
        /// </summary>
        /// <param name="retainedDataFrame">Data frame whose rows drive the result.</param>
        /// <param name="supplementaryDataFrame">Data frame whose rows are looked up.</param>
        /// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
        /// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>; must have the same length as <paramref name="retainedJoinColumnNames"/>.</param>
        /// <param name="retainedRowIndices">Receives the retained-side row index of every emitted pair.</param>
        /// <param name="supplementaryRowIndices">Receives the supplementary-side row index of every emitted pair (null for unmatched retained rows when <paramref name="isInner"/> is false).</param>
        /// <param name="isInner">When true, retained rows without a match are dropped instead of being paired with null.</param>
        /// <param name="calculateIntersection">When true, the set of matched supplementary row indices is collected and returned.</param>
        /// <returns>The set of matched supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
        /// <exception cref="ArgumentNullException">Either join column name array is null.</exception>
        /// <exception cref="ArgumentException">The join column name arrays differ in length.</exception>
        private static HashSet <long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn <long> retainedRowIndices, out PrimitiveDataFrameColumn <long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
        {
            if (retainedJoinColumnNames == null)
            {
                throw new ArgumentNullException(nameof(retainedJoinColumnNames));
            }

            if (supplemetaryJoinColumnNames == null)
            {
                throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
            }

            if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
            {
                throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
            }


            HashSet <long> intersection = calculateIntersection ? new HashSet <long>() : null;

            // Get occurrences of values in columns used for join in the retained and supplementary dataframes
            // NOTE(review): with zero join columns the loop below never runs and occurrences stays null,
            // so the merging phase would dereference null for any retained row - confirm callers never pass empty arrays.
            Dictionary <long, ICollection <long> > occurrences    = null;
            Dictionary <long, long> retainedIndicesReverseMapping = null;

            HashSet <long> supplementaryJoinColumnsNullIndices = new HashSet <long>();


            for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
            {
                DataFrameColumn shrinkedRetainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

                // Shrink retained column by row occurrences from the previous step
                if (occurrences != null)
                {
                    // Only rows with occurrences from the previous step should go on to further processing
                    var shrinkedRetainedIndices = occurrences.Keys.ToArray();

                    // Create reverse mapping of the row index in the shrunk column to the index of this row in the original dataframe (new index -> original index)
                    var newRetainedIndicesReverseMapping = new Dictionary <long, long>(shrinkedRetainedIndices.Length);

                    for (int i = 0; i < shrinkedRetainedIndices.Length; i++)
                    {
                        // Store reverse mapping to restore original dataframe indices from indices in the shrunk column
                        var originalIndex = shrinkedRetainedIndices[i];
                        newRetainedIndicesReverseMapping.Add(i, originalIndex);
                    }

                    retainedIndicesReverseMapping = newRetainedIndicesReverseMapping;
                    shrinkedRetainedColumn        = shrinkedRetainedColumn.Clone(new Int64DataFrameColumn("Indices", shrinkedRetainedIndices));
                }

                DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

                // Find occurrences on the current step (join column)
                var newOccurrences = shrinkedRetainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet <long> supplementaryColumnNullIndices);

                // Convert key indices from local (shrunk column) to indices in the original dataframe
                if (retainedIndicesReverseMapping != null)
                {
                    newOccurrences = newOccurrences.ToDictionary(kvp => retainedIndicesReverseMapping[kvp.Key], kvp => kvp.Value);
                }

                supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

                // Shrink join result on the current column by previous join columns (if any)
                // (we have to remove occurrences that don't exist in previous columns, because JOIN happens only if ALL left and right columns in JOIN are matched)
                if (occurrences != null)
                {
                    var shrinkedOccurences = new Dictionary <long, ICollection <long> >();

                    foreach (var kvp in newOccurrences)
                    {
                        // Look the previous matches up once per key instead of once per candidate row.
                        // The key is always present: it was mapped back from occurrences.Keys above.
                        ICollection <long> previousMatches = occurrences[kvp.Key];

                        var newValue = kvp.Value.Where(i => previousMatches.Contains(i)).ToArray();
                        if (newValue.Any())
                        {
                            shrinkedOccurences.Add(kvp.Key, newValue);
                        }
                    }
                    newOccurrences = shrinkedOccurences;
                }

                occurrences = newOccurrences;
            }

            retainedRowIndices      = new Int64DataFrameColumn("RetainedIndices");
            supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

            // Perform Merging
            var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

            for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
            {
                if (!IsAnyNullValueInColumns(retainJoinColumns, i))
                {
                    // Get all row indexes from supplementary dataframe that satisfy JOIN condition
                    if (occurrences.TryGetValue(i, out ICollection <long> rowIndices))
                    {
                        foreach (long supplementaryRowIndex in rowIndices)
                        {
                            retainedRowIndices.Append(i);
                            supplementaryRowIndices.Append(supplementaryRowIndex);

                            // Store intersection if required; HashSet.Add already ignores duplicates
                            if (calculateIntersection)
                            {
                                intersection.Add(supplementaryRowIndex);
                            }
                        }
                    }
                    else
                    {
                        // No match: inner joins drop the row, other joins pair it with null
                        if (isInner)
                        {
                            continue;
                        }

                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(null);
                    }
                }
                else
                {
                    // Null-flagged retained row: pair it with every supplementary row recorded as null
                    foreach (long row in supplementaryJoinColumnsNullIndices)
                    {
                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(row);
                    }
                }
            }

            return(intersection);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Wraps a <see cref="DataFrame"/> around an Arrow <see cref="RecordBatch"/> without copying data
        /// </summary>
        /// <param name="recordBatch">The record batch whose arrays become the columns of the result, in schema order.</param>
        /// <returns><see cref="DataFrame"/> with one column per array in <paramref name="recordBatch"/>, named after the matching schema field.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="recordBatch"/> is null.</exception>
        /// <exception cref="NotImplementedException">An array has an Arrow type that is not supported.</exception>
        public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch)
        {
            if (recordBatch == null)
            {
                throw new ArgumentNullException(nameof(recordBatch));
            }

            DataFrame ret = new DataFrame();

            Apache.Arrow.Schema arrowSchema = recordBatch.Schema;
            // Tracks the schema position of the array currently being converted.
            int fieldIndex = 0;
            IEnumerable <IArrowArray> arrowArrays = recordBatch.Arrays;

            foreach (IArrowArray arrowArray in arrowArrays)
            {
                Field           field           = arrowSchema.GetFieldByIndex(fieldIndex);
                IArrowType      fieldType       = field.DataType;
                DataFrameColumn dataFrameColumn = null;
                switch (fieldType.TypeId)
                {
                case ArrowTypeId.Boolean:
                    BooleanArray          arrowBooleanArray = (BooleanArray)arrowArray;
                    ReadOnlyMemory <byte> valueBuffer       = arrowBooleanArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> nullBitMapBuffer  = arrowBooleanArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new BooleanDataFrameColumn(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Double:
                    PrimitiveArray <double> arrowDoubleArray       = (PrimitiveArray <double>)arrowArray;
                    ReadOnlyMemory <byte>   doubleValueBuffer      = arrowDoubleArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>   doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new DoubleDataFrameColumn(field.Name, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Float:
                    PrimitiveArray <float> arrowFloatArray       = (PrimitiveArray <float>)arrowArray;
                    ReadOnlyMemory <byte>  floatValueBuffer      = arrowFloatArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new SingleDataFrameColumn(field.Name, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int8:
                    PrimitiveArray <sbyte> arrowsbyteArray       = (PrimitiveArray <sbyte>)arrowArray;
                    ReadOnlyMemory <byte>  sbyteValueBuffer      = arrowsbyteArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new SByteDataFrameColumn(field.Name, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int16:
                    PrimitiveArray <short> arrowshortArray       = (PrimitiveArray <short>)arrowArray;
                    ReadOnlyMemory <byte>  shortValueBuffer      = arrowshortArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new Int16DataFrameColumn(field.Name, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int32:
                    PrimitiveArray <int>  arrowIntArray       = (PrimitiveArray <int>)arrowArray;
                    ReadOnlyMemory <byte> intValueBuffer      = arrowIntArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new Int32DataFrameColumn(field.Name, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int64:
                    PrimitiveArray <long> arrowLongArray       = (PrimitiveArray <long>)arrowArray;
                    ReadOnlyMemory <byte> longValueBuffer      = arrowLongArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new Int64DataFrameColumn(field.Name, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.String:
                    StringArray           stringArray   = (StringArray)arrowArray;
                    ReadOnlyMemory <byte> dataMemory    = stringArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                    ReadOnlyMemory <byte> nullMemory    = stringArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new ArrowStringDataFrameColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                    break;

                case ArrowTypeId.UInt8:
                    PrimitiveArray <byte> arrowbyteArray       = (PrimitiveArray <byte>)arrowArray;
                    ReadOnlyMemory <byte> byteValueBuffer      = arrowbyteArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new ByteDataFrameColumn(field.Name, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt16:
                    PrimitiveArray <ushort> arrowUshortArray       = (PrimitiveArray <ushort>)arrowArray;
                    ReadOnlyMemory <byte>   ushortValueBuffer      = arrowUshortArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>   ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new UInt16DataFrameColumn(field.Name, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt32:
                    PrimitiveArray <uint> arrowUintArray       = (PrimitiveArray <uint>)arrowArray;
                    ReadOnlyMemory <byte> uintValueBuffer      = arrowUintArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new UInt32DataFrameColumn(field.Name, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt64:
                    PrimitiveArray <ulong> arrowUlongArray       = (PrimitiveArray <ulong>)arrowArray;
                    ReadOnlyMemory <byte>  ulongValueBuffer      = arrowUlongArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new UInt64DataFrameColumn(field.Name, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Decimal:
                case ArrowTypeId.Binary:
                case ArrowTypeId.Date32:
                case ArrowTypeId.Date64:
                case ArrowTypeId.Dictionary:
                case ArrowTypeId.FixedSizedBinary:
                case ArrowTypeId.HalfFloat:
                case ArrowTypeId.Interval:
                case ArrowTypeId.List:
                case ArrowTypeId.Map:
                case ArrowTypeId.Null:
                case ArrowTypeId.Struct:
                case ArrowTypeId.Time32:
                case ArrowTypeId.Time64:
                default:
                    // Report the actual Arrow type name; nameof(fieldType.Name) would always yield the literal "Name".
                    throw new NotImplementedException(fieldType.Name);
                }
                // Append at the end so column order follows the record batch's array order.
                ret.Columns.Insert(ret.Columns.Count, dataFrameColumn);
                fieldIndex++;
            }
            return(ret);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Walks the join columns pairwise (retained column N against supplementary column N) and accumulates,
        /// for each row of the retained dataframe, the supplementary rows reported as occurrences for every join column.
        /// </summary>
        /// <param name="retainedDataFrame">Dataframe whose row indices become the keys of the result.</param>
        /// <param name="supplementaryDataFrame">Dataframe whose row indices become the values of the result.</param>
        /// <param name="retainedJoinColumnNames">Names of the join columns in <paramref name="retainedDataFrame"/>.</param>
        /// <param name="supplemetaryJoinColumnNames">Names of the join columns in <paramref name="supplementaryDataFrame"/>, read at the same positions as <paramref name="retainedJoinColumnNames"/>.</param>
        /// <param name="supplementaryJoinColumnsNullIndices">Receives the union of the null indices reported by GetGroupedOccurrences for each supplementary join column.</param>
        /// <returns>
        /// The accumulated occurrences keyed by row index in the original retained dataframe,
        /// or null when <paramref name="retainedJoinColumnNames"/> is empty (the loop never runs).
        /// </returns>
        private static Dictionary <long, ICollection <long> > GetOccurences(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame,
                                                                            string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out HashSet <long> supplementaryJoinColumnsNullIndices)
        {
            supplementaryJoinColumnsNullIndices = new HashSet<long>();

            // Result accumulated over the join columns handled so far; stays null until the first column is done.
            Dictionary<long, ICollection<long>> matchesSoFar = null;

            // Position inside the shrunken retained column -> row index in the original retained dataframe.
            Dictionary<long, long> positionToOriginalRow = null;

            for (int step = 0; step < retainedJoinColumnNames.Length; step++)
            {
                bool hasEarlierMatches = matchesSoFar != null;
                DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[step]];

                if (hasEarlierMatches)
                {
                    // Only rows that still have occurrences after the previous join columns need further processing.
                    long[] survivingRows = matchesSoFar.Keys.ToArray();

                    // Remember where each surviving row came from so keys can be translated back afterwards.
                    var rebuiltMapping = new Dictionary<long, long>(survivingRows.Length);
                    int position = 0;
                    foreach (long originalRow in survivingRows)
                    {
                        rebuiltMapping.Add(position, originalRow);
                        position++;
                    }

                    positionToOriginalRow = rebuiltMapping;

                    // Restrict the retained column to the surviving rows only.
                    retainedColumn = retainedColumn.Clone(new Int64DataFrameColumn("Indices", survivingRows));
                }

                DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[step]];

                // Occurrences for the current join column only.
                var stepMatches = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> stepNullIndices);

                // Keys are positions in the (possibly shrunken) retained column; translate them to original row indices.
                if (positionToOriginalRow != null)
                {
                    stepMatches = stepMatches.ToDictionary(entry => positionToOriginalRow[entry.Key], entry => entry.Value);
                }

                supplementaryJoinColumnsNullIndices.UnionWith(stepNullIndices);

                // A JOIN requires ALL column pairs to match, so combine this column's result
                // with what the earlier columns produced (if any).
                if (hasEarlierMatches)
                {
                    stepMatches = GetShrinkedOccurences(matchesSoFar, stepMatches);
                }

                matchesSoFar = stepMatches;
            }

            return matchesSoFar;
        }
Exemplo n.º 9
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join, implemented as a hash join on one key column from each side.
        /// </summary>
        /// <remarks>
        /// Null keys are treated as equal to each other: a row whose key is null is paired with every
        /// null-keyed row on the opposite side. For <see cref="JoinAlgorithm.Left"/>, <see cref="JoinAlgorithm.Right"/>
        /// and <see cref="JoinAlgorithm.FullOuter"/>, a row of a preserved side that has no partner (including a
        /// null-keyed row when the opposite side has no null key) is emitted once with a null row index for the
        /// opposite side. NOTE(review): this relies on Clone(indices) producing null values for null indices - confirm.
        /// </remarks>
        /// <typeparam name="TKey">Type the values of both join columns are cast to; used as the hash key.</typeparam>
        /// <param name="other">The right-hand dataframe.</param>
        /// <param name="leftJoinColumn">Name of the key column in this dataframe.</param>
        /// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
        /// <param name="leftSuffix">Suffix forwarded to SetSuffixForDuplicatedColumnNames for clashing column names.</param>
        /// <param name="rightSuffix">Suffix forwarded to SetSuffixForDuplicatedColumnNames for clashing column names.</param>
        /// <param name="joinAlgorithm">Which rows to keep: Left, Right, Inner or FullOuter.</param>
        /// <returns>A new dataframe holding clones of every column of this dataframe followed by clones of every column of <paramref name="other"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        /// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the four handled values.</exception>
        public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            if (other == null)
            {
                throw new ArgumentNullException(nameof(other));
            }

            // A simple hash join
            DataFrame ret            = new DataFrame();
            DataFrame leftDataFrame  = this;
            DataFrame rightDataFrame = other;

            // The final table size is not known until runtime, so row index pairs are collected first
            // and the columns are materialized from them at the end.
            PrimitiveDataFrameColumn <long> leftRowIndices  = new PrimitiveDataFrameColumn <long>("LeftIndices");
            PrimitiveDataFrameColumn <long> rightRowIndices = new PrimitiveDataFrameColumn <long>("RightIndices");

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                // First hash other dataframe on the rightJoinColumn
                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>(out HashSet <long> otherColumnNullIndices);

                // Go over the records in this dataframe and match with the dictionary
                DataFrameColumn thisColumn = Columns[leftJoinColumn];

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var thisColumnValue = thisColumn[i];
                    if (thisColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(i);
                                rightRowIndices.Append(row);
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(null);
                        }
                    }
                    else if (otherColumnNullIndices.Count > 0)
                    {
                        // Null key: pair with every null-keyed row on the right
                        foreach (long row in otherColumnNullIndices)
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(row);
                        }
                    }
                    else
                    {
                        // Null key with no null partner on the right: a left join must still keep the left row
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(null);
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                // Mirror image of the left join: hash this dataframe, probe with the other one
                DataFrameColumn thisColumn = Columns[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>(out HashSet <long> thisColumnNullIndices);

                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var otherColumnValue = otherColumn[i];
                    if (otherColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(row);
                                rightRowIndices.Append(i);
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(null);
                            rightRowIndices.Append(i);
                        }
                    }
                    else if (thisColumnNullIndices.Count > 0)
                    {
                        // Null key: pair with every null-keyed row on the left
                        foreach (long thisColumnNullIndex in thisColumnNullIndices)
                        {
                            leftRowIndices.Append(thisColumnNullIndex);
                            rightRowIndices.Append(i);
                        }
                    }
                    else
                    {
                        // Null key with no null partner on the left: a right join must still keep the right row
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(i);
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                // Hash the column with the smaller RowCount and probe with the other one
                long leftRowCount  = Rows.Count;
                long rightRowCount = other.Rows.Count;

                bool            leftColumnIsSmaller             = leftRowCount <= rightRowCount;
                DataFrameColumn hashColumn                      = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn];
                DataFrameColumn otherColumn                     = leftColumnIsSmaller ? other.Columns[rightJoinColumn] : Columns[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>(out HashSet <long> smallerDataFrameColumnNullIndices);

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var otherColumnValue = otherColumn[i];
                    if (otherColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection <long> rowNumbers))
                        {
                            // "row" indexes the hashed side, "i" the probed side; route each to the correct output
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                                rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                            }
                        }
                    }
                    else
                    {
                        foreach (long nullIndex in smallerDataFrameColumnNullIndices)
                        {
                            leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                            rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                        }
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>(out HashSet <long> otherColumnNullIndices);

                // Keys that were matched from the left side; right rows with these keys are already emitted
                HashSet <TKey> matchedKeys = new HashSet <TKey>();

                // Go over the records in this dataframe and match with the dictionary
                DataFrameColumn      thisColumn            = Columns[leftJoinColumn];
                Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices");

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var thisColumnValue = thisColumn[i];
                    if (thisColumnValue != null)
                    {
                        TKey key = (TKey)thisColumnValue;
                        if (multimap.TryGetValue(key, out ICollection <long> rowNumbers))
                        {
                            matchedKeys.Add(key);
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(i);
                                rightRowIndices.Append(row);
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(null);
                        }
                    }
                    else
                    {
                        // Null keys are resolved after both sides have been scanned
                        thisColumnNullIndices.Append(i);
                    }
                }

                // Emit the right rows whose key never appeared on the left
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var value = otherColumn[i];
                    if (value != null && !matchedKeys.Contains((TKey)value))
                    {
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(i);
                    }
                }

                // Now handle the null rows
                foreach (long?thisColumnNullIndex in thisColumnNullIndices)
                {
                    foreach (long otherColumnNullIndex in otherColumnNullIndices)
                    {
                        leftRowIndices.Append(thisColumnNullIndex.Value);
                        rightRowIndices.Append(otherColumnNullIndex);
                    }
                    if (otherColumnNullIndices.Count == 0)
                    {
                        leftRowIndices.Append(thisColumnNullIndex.Value);
                        rightRowIndices.Append(null);
                    }
                }
                if (thisColumnNullIndices.Length == 0)
                {
                    foreach (long otherColumnNullIndex in otherColumnNullIndices)
                    {
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(otherColumnNullIndex);
                    }
                }
            }
            else
            {
                throw new NotImplementedException(nameof(joinAlgorithm));
            }

            // Materialize the result: left columns first, then right columns (renamed on clashes)
            for (int i = 0; i < leftDataFrame.Columns.Count; i++)
            {
                ret.Columns.Insert(i, leftDataFrame.Columns[i].Clone(leftRowIndices));
            }
            for (int i = 0; i < rightDataFrame.Columns.Count; i++)
            {
                DataFrameColumn column = rightDataFrame.Columns[i].Clone(rightRowIndices);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.Columns.Insert(ret.Columns.Count, column);
            }
            return(ret);
        }