Beispiel #1
0
 public void SetColumn(int columnIndex, BaseColumn column)
 {
     column = column ?? throw new ArgumentNullException(nameof(column));
     if ((uint)columnIndex >= ColumnCount)
     {
         throw new ArgumentOutOfRangeException(nameof(columnIndex));
     }
     if (RowCount > 0 && column.Length != RowCount)
     {
         throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column));
     }
     if (_columnNameToIndexDictionary.ContainsKey(column.Name))
     {
         throw new ArgumentException($"Table already contains a column called {column.Name}");
     }
     _columnNameToIndexDictionary.Remove(_columnNames[columnIndex]);
     _columnNames[columnIndex] = column.Name;
     _columnNameToIndexDictionary[column.Name] = columnIndex;
     _columns[columnIndex] = column;
 }
Beispiel #2
0
 public void InsertColumn(int columnIndex, BaseColumn column, DataFrame parent)
 {
     column = column ?? throw new ArgumentNullException(nameof(column));
     if ((uint)columnIndex > _columns.Count)
     {
         throw new ArgumentOutOfRangeException(nameof(columnIndex));
     }
     if (RowCount > 0 && column.Length != RowCount)
     {
         throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column));
     }
     if (_columnNameToIndexDictionary.ContainsKey(column.Name))
     {
         throw new ArgumentException($"Table already contains a column called {column.Name}");
     }
     RowCount = column.Length;
     _columnNames.Insert(columnIndex, column.Name);
     _columnNameToIndexDictionary[column.Name] = columnIndex;
     _columns.Insert(columnIndex, column);
     ColumnCount++;
 }
Beispiel #3
0
        public override BaseColumn Clone(BaseColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
        {
            PrimitiveColumn <T> clone;

            if (!(mapIndices is null))
            {
                if (mapIndices.DataType != typeof(long) && mapIndices.DataType != typeof(bool))
                {
                    throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(bool)), nameof(mapIndices));
                }
                if (mapIndices.Length > Length)
                {
                    throw new ArgumentException(Strings.MapIndicesExceedsColumnLenth, nameof(mapIndices));
                }
                if (mapIndices.DataType == typeof(long))
                {
                    clone = Clone(mapIndices as PrimitiveColumn <long>, invertMapIndices);
                }
                else
                {
                    clone = Clone(mapIndices as PrimitiveColumn <bool>);
                }
            }
        public IEnumerable <RecordBatch> AsArrowRecordBatches()
        {
            Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

            int columnCount = ColumnCount;

            for (int i = 0; i < columnCount; i++)
            {
                BaseColumn column = Column(i);
                Field      field  = column.Field();
                schemaBuilder.Field(field);
            }

            Schema schema = schemaBuilder.Build();
            List <Apache.Arrow.Array> arrays = new List <Apache.Arrow.Array>();

            int  recordBatchLength             = Int32.MaxValue;
            int  numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount);
            long numberOfRowsProcessed         = 0;

            // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
            do
            {
                for (int i = 0; i < columnCount; i++)
                {
                    BaseColumn column = Column(i);
                    numberOfRowsInThisRecordBatch = (int)Math.Min(numberOfRowsInThisRecordBatch, column.MaxRecordBatchLength(numberOfRowsProcessed));
                }
                for (int i = 0; i < columnCount; i++)
                {
                    BaseColumn column = Column(i);
                    arrays.Add(column.AsArrowArray(numberOfRowsProcessed, numberOfRowsInThisRecordBatch));
                }
                numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
                yield return(new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch));
            } while (numberOfRowsProcessed < RowCount);
        }
Beispiel #5
0
 public DataFrameTable(BaseColumn column) : this(new List <BaseColumn> {
     column
 })
 {
 }
        public DataFrame(RecordBatch recordBatch)
        {
            _table = new DataFrameTable();
            Apache.Arrow.Schema arrowSchema = recordBatch.Schema;
            int fieldIndex = 0;
            IEnumerable <IArrowArray> arrowArrays = recordBatch.Arrays;

            foreach (IArrowArray arrowArray in arrowArrays)
            {
                Field      field           = arrowSchema.GetFieldByIndex(fieldIndex);
                IArrowType fieldType       = field.DataType;
                BaseColumn dataFrameColumn = null;
                switch (fieldType.TypeId)
                {
                case ArrowTypeId.Boolean:
                    BooleanArray          arrowBooleanArray = (BooleanArray)arrowArray;
                    ReadOnlyMemory <byte> valueBuffer       = arrowBooleanArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> nullBitMapBuffer  = arrowBooleanArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <bool>(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Double:
                    PrimitiveArray <double> arrowDoubleArray       = (PrimitiveArray <double>)arrowArray;
                    ReadOnlyMemory <byte>   doubleValueBuffer      = arrowDoubleArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>   doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <double>(field.Name, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Float:
                    PrimitiveArray <float> arrowFloatArray       = (PrimitiveArray <float>)arrowArray;
                    ReadOnlyMemory <byte>  floatValueBuffer      = arrowFloatArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <float>(field.Name, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int8:
                    PrimitiveArray <sbyte> arrowsbyteArray       = (PrimitiveArray <sbyte>)arrowArray;
                    ReadOnlyMemory <byte>  sbyteValueBuffer      = arrowsbyteArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <sbyte>(field.Name, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int16:
                    PrimitiveArray <short> arrowshortArray       = (PrimitiveArray <short>)arrowArray;
                    ReadOnlyMemory <byte>  shortValueBuffer      = arrowshortArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <short>(field.Name, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int32:
                    PrimitiveArray <int>  arrowIntArray       = (PrimitiveArray <int>)arrowArray;
                    ReadOnlyMemory <byte> intValueBuffer      = arrowIntArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <int>(field.Name, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Int64:
                    PrimitiveArray <long> arrowLongArray       = (PrimitiveArray <long>)arrowArray;
                    ReadOnlyMemory <byte> longValueBuffer      = arrowLongArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <long>(field.Name, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.String:
                    StringArray           stringArray   = (StringArray)arrowArray;
                    ReadOnlyMemory <byte> dataMemory    = stringArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                    ReadOnlyMemory <byte> nullMemory    = stringArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new ArrowStringColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                    break;

                case ArrowTypeId.UInt8:
                    PrimitiveArray <byte> arrowbyteArray       = (PrimitiveArray <byte>)arrowArray;
                    ReadOnlyMemory <byte> byteValueBuffer      = arrowbyteArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <byte>(field.Name, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt16:
                    PrimitiveArray <ushort> arrowUshortArray       = (PrimitiveArray <ushort>)arrowArray;
                    ReadOnlyMemory <byte>   ushortValueBuffer      = arrowUshortArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>   ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <ushort>(field.Name, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt32:
                    PrimitiveArray <uint> arrowUintArray       = (PrimitiveArray <uint>)arrowArray;
                    ReadOnlyMemory <byte> uintValueBuffer      = arrowUintArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <uint>(field.Name, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.UInt64:
                    PrimitiveArray <ulong> arrowUlongArray       = (PrimitiveArray <ulong>)arrowArray;
                    ReadOnlyMemory <byte>  ulongValueBuffer      = arrowUlongArray.ValueBuffer.Memory;
                    ReadOnlyMemory <byte>  ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
                    dataFrameColumn = new PrimitiveColumn <ulong>(field.Name, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                    break;

                case ArrowTypeId.Decimal:
                case ArrowTypeId.Binary:
                case ArrowTypeId.Date32:
                case ArrowTypeId.Date64:
                case ArrowTypeId.Dictionary:
                case ArrowTypeId.FixedSizedBinary:
                case ArrowTypeId.HalfFloat:
                case ArrowTypeId.Interval:
                case ArrowTypeId.List:
                case ArrowTypeId.Map:
                case ArrowTypeId.Null:
                case ArrowTypeId.Struct:
                case ArrowTypeId.Time32:
                case ArrowTypeId.Time64:
                default:
                    throw new NotImplementedException(nameof(fieldType.Name));
                }
                _table.InsertColumn(ColumnCount, dataFrameColumn);
                fieldIndex++;
            }
        }
 public virtual PrimitiveColumn <bool> LessThan(BaseColumn column)
 {
     throw new NotImplementedException();
 }
Beispiel #8
0
 public void InsertColumn(int columnIndex, BaseColumn column)
 {
     _table.InsertColumn(columnIndex, column, this);
     OnColumnsChanged();
 }
Beispiel #9
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join
        /// </summary>
        /// <param name="other"></param>
        /// <param name="leftJoinColumn"></param>
        /// <param name="rightJoinColumn"></param>
        /// <param name="leftSuffix"></param>
        /// <param name="rightSuffix"></param>
        /// <param name="joinAlgorithm"></param>
        /// <returns></returns>
        public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            // A simple hash join
            DataFrame ret            = new DataFrame();
            DataFrame leftDataFrame  = this;
            DataFrame rightDataFrame = other;

            // The final table size is not known until runtime
            long rowNumber = 0;
            PrimitiveColumn <long> leftRowIndices  = new PrimitiveColumn <long>("LeftIndices");
            PrimitiveColumn <long> rightRowIndices = new PrimitiveColumn <long>("RightIndices");

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                // First hash other dataframe on the rightJoinColumn
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var  thisColumnValue          = thisColumn[i];
                    TKey thisColumnValueOrDefault = (TKey)(thisColumnValue == null ? default(TKey) : thisColumnValue);
                    if (multimap.TryGetValue(thisColumnValueOrDefault, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumnValue == null)
                            {
                                // Match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    leftRowIndices.Append(i);
                                    rightRowIndices.Append(row);
                                }
                            }
                            else
                            {
                                // Cannot match nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    leftRowIndices.Append(i);
                                    rightRowIndices.Append(row);
                                }
                            }
                        }
                    }
                    else
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(null);
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                BaseColumn thisColumn = this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>();

                BaseColumn otherColumn = other[rightJoinColumn];
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var  otherColumnValue          = otherColumn[i];
                    TKey otherColumnValueOrDefault = (TKey)(otherColumnValue == null ? default(TKey) : otherColumnValue);
                    if (multimap.TryGetValue(otherColumnValueOrDefault, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumnValue == null)
                            {
                                if (thisColumn[row] == null)
                                {
                                    leftRowIndices.Append(row);
                                    rightRowIndices.Append(i);
                                }
                            }
                            else
                            {
                                if (thisColumn[row] != null)
                                {
                                    leftRowIndices.Append(row);
                                    rightRowIndices.Append(i);
                                }
                            }
                        }
                    }
                    else
                    {
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(i);
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                // Hash the column with the smaller RowCount
                long       leftRowCount     = RowCount;
                long       rightRowCount    = other.RowCount;
                DataFrame  longerDataFrame  = leftRowCount < rightRowCount ? other : this;
                DataFrame  shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
                BaseColumn hashColumn       = (leftRowCount < rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
                BaseColumn otherColumn      = ReferenceEquals(hashColumn, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>();

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var  otherColumnValue          = otherColumn[i];
                    TKey otherColumnValueOrDefault = (TKey)(otherColumnValue == null ? default(TKey) : otherColumnValue);
                    if (multimap.TryGetValue(otherColumnValueOrDefault, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumnValue == null)
                            {
                                if (hashColumn[row] == null)
                                {
                                    leftRowIndices.Append(row);
                                    rightRowIndices.Append(i);
                                }
                            }
                            else
                            {
                                if (hashColumn[row] != null)
                                {
                                    leftRowIndices.Append(row);
                                    rightRowIndices.Append(i);
                                }
                            }
                        }
                    }
                }
                leftDataFrame  = shorterDataFrame;
                rightDataFrame = longerDataFrame;
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();
                Dictionary <TKey, long> intersection            = new Dictionary <TKey, long>(EqualityComparer <TKey> .Default);

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var  thisColumnValue          = thisColumn[i];
                    TKey thisColumnValueOrDefault = (TKey)(thisColumnValue == null ? default(TKey) : thisColumnValue);
                    if (multimap.TryGetValue(thisColumnValueOrDefault, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumnValue == null)
                            {
                                // Has to match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    leftRowIndices.Append(i);
                                    rightRowIndices.Append(row);
                                    if (!intersection.ContainsKey(thisColumnValueOrDefault))
                                    {
                                        intersection.Add(thisColumnValueOrDefault, rowNumber);
                                    }
                                }
                            }
                            else
                            {
                                // Cannot match to nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    leftRowIndices.Append(i);
                                    rightRowIndices.Append(row);
                                    if (!intersection.ContainsKey(thisColumnValueOrDefault))
                                    {
                                        intersection.Add(thisColumnValueOrDefault, rowNumber);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(null);
                    }
                }
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (!intersection.ContainsKey(value))
                    {
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(i);
                    }
                }
            }
            else
            {
                throw new NotImplementedException(nameof(joinAlgorithm));
            }

            for (int i = 0; i < leftDataFrame.ColumnCount; i++)
            {
                ret.InsertColumn(i, leftDataFrame.Column(i).Clone(leftRowIndices));
            }
            for (int i = 0; i < rightDataFrame.ColumnCount; i++)
            {
                BaseColumn column = rightDataFrame.Column(i).Clone(rightRowIndices);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, column);
            }
            return(ret);
        }
Beispiel #10
0
 public virtual BaseColumn Xor(BaseColumn column)
 {
     throw new NotImplementedException();
 }
Beispiel #11
0
 public virtual BaseColumn GreaterThan(BaseColumn column)
 {
     throw new NotImplementedException();
 }
Beispiel #12
0
 public virtual BaseColumn LessThanOrEqual(BaseColumn column)
 {
     throw new NotImplementedException();
 }
            private Delegate CreateGetterDelegate(int col)
            {
                BaseColumn column = _dataFrame.Column(col);

                return(column.GetDataViewGetter(this));
            }
Beispiel #14
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join
        /// </summary>
        /// <param name="other"></param>
        /// <param name="leftJoinColumn"></param>
        /// <param name="rightJoinColumn"></param>
        /// <param name="leftSuffix"></param>
        /// <param name="rightSuffix"></param>
        /// <param name="joinAlgorithm"></param>
        /// <returns></returns>
        public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            // A simple hash join
            DataFrame ret = new DataFrame();
            PrimitiveColumn <long> emptyMap = new PrimitiveColumn <long>("Empty");

            for (int i = 0; i < ColumnCount; i++)
            {
                // Create empty columns
                BaseColumn column = Column(i).Clone(emptyMap);
                ret.InsertColumn(ret.ColumnCount, column);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                // Create empty columns
                BaseColumn column = other.Column(i).Clone(emptyMap);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, column);
            }

            // The final table size is not known until runtime
            long rowNumber = 0;

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                // First hash other dataframe on the rightJoinColumn
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(thisColumn.Length);
                }

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    if (rowNumber >= thisColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(thisColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumn[i] == null)
                            {
                                // Match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                }
                            }
                            else
                            {
                                // Cannot match nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, i, -1);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                BaseColumn thisColumn = this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>();

                BaseColumn otherColumn = other[rightJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(otherColumn.Length);
                }

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= otherColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumn[i] == null)
                            {
                                if (thisColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, row, i);
                                }
                            }
                            else
                            {
                                if (thisColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, row, i);
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, -1, i);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                // Hash the column with the smaller RowCount
                long       leftRowCount     = RowCount;
                long       rightRowCount    = other.RowCount;
                DataFrame  longerDataFrame  = leftRowCount < rightRowCount ? other : this;
                DataFrame  shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
                BaseColumn hashColumn       = (leftRowCount < rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
                BaseColumn otherColumn      = ReferenceEquals(hashColumn, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>();

                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(1);
                }

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= ret.Column(0).Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumn[i] == null)
                            {
                                if (hashColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                                }
                            }
                            else
                            {
                                if (hashColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                                }
                            }
                        }
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();
                Dictionary <TKey, long> intersection            = new Dictionary <TKey, long>(EqualityComparer <TKey> .Default);

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(thisColumn.Length + 1);
                }

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    if (rowNumber >= thisColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(thisColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumn[i] == null)
                            {
                                // Has to match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                    if (!intersection.ContainsKey(value))
                                    {
                                        intersection.Add(value, rowNumber);
                                    }
                                }
                            }
                            else
                            {
                                // Cannot match to nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                    if (!intersection.ContainsKey(value))
                                    {
                                        intersection.Add(value, rowNumber);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, i, -1);
                    }
                }
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= ret.Column(0).Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (!intersection.ContainsKey(value))
                    {
                        if (rowNumber >= otherColumn.Length)
                        {
                            for (int c = 0; c < ret.ColumnCount; c++)
                            {
                                ret.Column(c).Resize(rowNumber + 1);
                            }
                        }
                        AppendForMerge(ret, rowNumber++, this, other, -1, i);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            return(ret);
        }
 public virtual BaseColumn Xor(BaseColumn column, bool inPlace = false)
 {
     throw new NotImplementedException();
 }
Beispiel #16
0
 public void InsertColumn(int columnIndex, BaseColumn column) => _table.InsertColumn(columnIndex, column);
Beispiel #17
0
 public void SetColumn(int columnIndex, BaseColumn column) => _table.SetColumn(columnIndex, column);
Beispiel #18
0
        public DataFrame Join(DataFrame other, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            DataFrame ret = new DataFrame();

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                for (int i = 0; i < ColumnCount; i++)
                {
                    BaseColumn newColumn = Column(i).Clone();
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
                long minLength = Math.Min(RowCount, other.RowCount);
                PrimitiveColumn <long> mapIndices = new PrimitiveColumn <long>("mapIndices", minLength);
                for (long i = 0; i < minLength; i++)
                {
                    mapIndices[i] = i;
                }
                for (int i = 0; i < other.ColumnCount; i++)
                {
                    BaseColumn newColumn;
                    if (other.RowCount < RowCount)
                    {
                        newColumn = other.Column(i).Clone(numberOfNullsToAppend: RowCount - other.RowCount);
                    }
                    else
                    {
                        newColumn = other.Column(i).Clone(mapIndices);
                    }
                    SetSuffixForDuplicatedColumnNames(ret, newColumn, leftSuffix, rightSuffix);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                long minLength = Math.Min(RowCount, other.RowCount);
                PrimitiveColumn <long> mapIndices = new PrimitiveColumn <long>("mapIndices", minLength);
                for (long i = 0; i < minLength; i++)
                {
                    mapIndices[i] = i;
                }
                for (int i = 0; i < ColumnCount; i++)
                {
                    BaseColumn newColumn;
                    if (RowCount < other.RowCount)
                    {
                        newColumn = Column(i).Clone(numberOfNullsToAppend: other.RowCount - RowCount);
                    }
                    else
                    {
                        newColumn = Column(i).Clone(mapIndices);
                    }
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
                for (int i = 0; i < other.ColumnCount; i++)
                {
                    BaseColumn newColumn = other.Column(i).Clone();
                    SetSuffixForDuplicatedColumnNames(ret, newColumn, leftSuffix, rightSuffix);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                long newRowCount   = Math.Max(RowCount, other.RowCount);
                long numberOfNulls = newRowCount - RowCount;
                for (int i = 0; i < ColumnCount; i++)
                {
                    BaseColumn newColumn = Column(i).Clone(numberOfNullsToAppend: numberOfNulls);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
                numberOfNulls = newRowCount - other.RowCount;
                for (int i = 0; i < other.ColumnCount; i++)
                {
                    BaseColumn newColumn = other.Column(i).Clone(numberOfNullsToAppend: numberOfNulls);
                    SetSuffixForDuplicatedColumnNames(ret, newColumn, leftSuffix, rightSuffix);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                long newRowCount = Math.Min(RowCount, other.RowCount);
                PrimitiveColumn <long> mapIndices = new PrimitiveColumn <long>("mapIndices", newRowCount);
                for (long i = 0; i < newRowCount; i++)
                {
                    mapIndices[i] = i;
                }
                for (int i = 0; i < ColumnCount; i++)
                {
                    BaseColumn newColumn = Column(i).Clone(mapIndices);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
                for (int i = 0; i < other.ColumnCount; i++)
                {
                    BaseColumn newColumn = other.Column(i).Clone(mapIndices);
                    SetSuffixForDuplicatedColumnNames(ret, newColumn, leftSuffix, rightSuffix);
                    ret.InsertColumn(ret.ColumnCount, newColumn);
                }
            }
            return(ret);
        }
Beispiel #19
0
 public void SetColumn(int columnIndex, BaseColumn column)
 {
     _table.SetColumn(columnIndex, column);
     OnColumnsChanged();
 }
 public virtual PrimitiveColumn <bool> GreaterThanOrEqual(BaseColumn column)
 {
     throw new NotImplementedException();
 }