コード例 #1
0
 /// <summary>
 /// Creates a copy of this column, optionally picking the rows through a column of indices.
 /// </summary>
 /// <param name="mapIndices">
 /// When null, the underlying container is cloned as-is. Otherwise it must have the same
 /// length as this column; each of its entries selects the element appended to the result.
 /// </param>
 /// <param name="invertMapIndices">When true, <paramref name="mapIndices"/> is walked from its last entry to its first.</param>
 /// <returns>A new column carrying this column's name.</returns>
 /// <exception cref="ArgumentException"><paramref name="mapIndices"/> has a different length than this column.</exception>
 public PrimitiveColumn <T> Clone(PrimitiveColumn <long> mapIndices = null, bool invertMapIndices = false)
 {
     // Plain copy: no remapping requested.
     if (mapIndices is null)
     {
         PrimitiveColumnContainer<T> containerCopy = _columnContainer.Clone();
         return new PrimitiveColumn<T>(Name, containerCopy);
     }

     if (mapIndices.Length != Length)
     {
         throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(mapIndices));
     }

     PrimitiveColumn<T> result = new PrimitiveColumn<T>(Name);
     long count = mapIndices.Length;
     for (long step = 0; step < count; step++)
     {
         // Forward walk reads entry `step`; inverted walk reads from the tail towards the head.
         long position = invertMapIndices ? count - 1 - step : step;
         result.Append(_columnContainer[mapIndices._columnContainer[position]]);
     }
     return result;
 }
コード例 #2
0
        /// <summary>
        /// Clones this column (optionally through <paramref name="mapIndices"/>) and then appends
        /// <c>NullCount</c> null entries to the clone.
        /// </summary>
        /// <param name="mapIndices">Forwarded unchanged to <c>Clone</c>.</param>
        /// <param name="invertMapIndices">Forwarded unchanged to <c>Clone</c>.</param>
        /// <returns>The clone with the extra nulls appended.</returns>
        public override BaseColumn CloneAndAppendNulls(BaseColumn mapIndices = null, bool invertMapIndices = false)
        {
            var clone = Clone(mapIndices, invertMapIndices) as PrimitiveColumn<T>;

            // NullCount is read after the clone has been produced, as in the original statement order.
            clone.AppendMany(null, NullCount);
            return clone;
        }
コード例 #3
0
        /// <summary>
        /// Runs the cumulative-product computation over the rows named by <paramref name="rowIndices"/>.
        /// </summary>
        /// <param name="rowIndices">Row indices handed to the computation.</param>
        /// <param name="inPlace">When true this column's own container is passed to the computation; otherwise a clone's container is.</param>
        /// <returns>This column when <paramref name="inPlace"/> is true, otherwise the clone.</returns>
        public override BaseColumn CumulativeProduct(IEnumerable <long> rowIndices, bool inPlace = false)
        {
            PrimitiveColumn<T> target;
            if (inPlace)
            {
                target = this;
            }
            else
            {
                target = Clone();
            }

            PrimitiveColumnComputation<T>.Instance.CumulativeProduct(target._columnContainer, rowIndices);
            return target;
        }
コード例 #4
0
        /// <summary>
        /// Runs the cumulative-sum computation over this column.
        /// </summary>
        /// <param name="inPlace">When true this column's own container is passed to the computation; otherwise a clone's container is.</param>
        /// <returns>This column when <paramref name="inPlace"/> is true, otherwise the clone.</returns>
        public override BaseColumn CumulativeSum(bool inPlace = false)
        {
            PrimitiveColumn<T> target;
            if (inPlace)
            {
                target = this;
            }
            else
            {
                target = Clone();
            }

            PrimitiveColumnComputation<T>.Instance.CumulativeSum(target._columnContainer);
            return target;
        }
コード例 #5
0
        // Builds sort indices for the whole column: every buffer is sorted on its own, entries
        // that are not valid (null) are dropped, and the first remaining entry of each buffer is
        // then placed into a sorted dictionary keyed by value (used as a heap for the merge).
        private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices)
        {
            // One list of surviving (non-null) sort indices per buffer.
            List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count);

            // Sort each buffer first
            for (int b = 0; b < _columnContainer.Buffers.Count; b++)
            {
                ReadOnlyDataFrameBuffer <T> buffer = _columnContainer.Buffers[b];
                // Start from the identity permutation 0..Length-1.
                int[] sortIndices = new int[buffer.Length];
                for (int i = 0; i < buffer.Length; i++)
                {
                    sortIndices[i] = i;
                }
                // Presumably reorders sortIndices according to comparer; the helper is defined elsewhere.
                IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer);
                // Bug fix: QuickSort is not stable. When PrimitiveColumn has null values and default values, they move around
                List <int> nonNullSortIndices = new List <int>();
                for (int i = 0; i < sortIndices.Length; i++)
                {
                    // The buffer-local index is converted to a column-wide row index before the validity check.
                    if (IsValid(sortIndices[i] + b * ReadOnlyDataFrameBuffer <T> .MaxCapacity))
                    {
                        nonNullSortIndices.Add(sortIndices[i]);
                    }
                }
                bufferSortIndices.Add(nonNullSortIndices);
            }
            // Simple merge sort to build the full column's sort indices
            // Returns the value found at sorted position startIndex of the given buffer, paired with startIndex itself.
            ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex)
            {
                T    value    = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]];
                // NOTE(review): rowIndex is computed but never read in this local function.
                long rowIndex = bufferSortIndices[bufferIndex][startIndex] + bufferIndex * ReadOnlyDataFrameBuffer <T> .MaxCapacity;

                return(value, startIndex);
            }

            // Value -> list of (position within the buffer's sort indices, buffer index), ordered by comparer.
            SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer);
            IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers;

            // Seed the heap with the first surviving entry of every buffer.
            for (int i = 0; i < buffers.Count; i++)
            {
                ReadOnlyDataFrameBuffer <T> buffer = buffers[i];
                if (bufferSortIndices[i].Count == 0)
                {
                    // All nulls
                    continue;
                }
                ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0);
                // Buffers that share the same first value are grouped under a single key.
                if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1))
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i));
                }
                else
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >()
                    {
                        (valueAndBufferIndex.Item2, i)
                    });
コード例 #6
0
        /// <summary>
        /// Compares every element of this column, as a string, against the string form of <paramref name="value"/>.
        /// </summary>
        /// <typeparam name="T">Type of the value to compare against.</typeparam>
        /// <param name="value">
        /// Value whose <c>ToString()</c> result is compared with each element. A null value is
        /// compared as a null string instead of throwing a <see cref="NullReferenceException"/>.
        /// </param>
        /// <returns>A boolean column of the same name and length; entry i is true when element i differs from the value.</returns>
        public override PrimitiveColumn <bool> NotEquals <T>(T value)
        {
            PrimitiveColumn<bool> ret = new PrimitiveColumn<bool>(Name, Length);

            // Null-conditional call: value may be a null reference when T is a reference type.
            string valString = value?.ToString();

            for (long i = 0; i < Length; i++)
            {
                ret[i] = (string)this[i] != valString;
            }
            return ret;
        }
コード例 #7
0
        /// <summary>
        /// Wraps <paramref name="column"/> in a <c>PrimitiveColumn</c> named <paramref name="columnName"/>
        /// and inserts it at <paramref name="columnIndex"/>.
        /// </summary>
        /// <typeparam name="T">Unmanaged element type of the new column.</typeparam>
        /// <param name="columnIndex">Insertion position; must lie in [0, current column count].</param>
        /// <param name="column">Values for the new column.</param>
        /// <param name="columnName">Name given to the new column.</param>
        /// <param name="parent">Forwarded to the non-generic <c>InsertColumn</c> overload.</param>
        /// <exception cref="ArgumentNullException"><paramref name="column"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is negative or greater than the column count.</exception>
        public void InsertColumn <T>(int columnIndex, IEnumerable <T> column, string columnName, DataFrame parent)
            where T : unmanaged
        {
            if (column == null)
            {
                throw new ArgumentNullException(nameof(column));
            }

            // Same range test as the unsigned-cast trick, spelled out as two comparisons.
            bool indexOutOfRange = columnIndex < 0 || columnIndex > _columns.Count;
            if (indexOutOfRange)
            {
                throw new ArgumentOutOfRangeException(nameof(columnIndex));
            }

            BaseColumn wrapped = new PrimitiveColumn<T>(columnName, column);
            InsertColumn(columnIndex, wrapped, parent);
        }
コード例 #8
0
        // Builds sort indices for the whole column: every buffer is sorted on its own, then the
        // first valid (non-null) entry of each buffer is placed into a sorted dictionary keyed by
        // value (used as a heap for the merge).
        private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices)
        {
            // One array of sort indices per buffer.
            List <int[]> bufferSortIndices = new List <int[]>(_columnContainer.Buffers.Count);

            // Sort each buffer first
            foreach (DataFrameBuffer <T> buffer in _columnContainer.Buffers)
            {
                // Start from the identity permutation 0..Length-1.
                var sortIndices = new int[buffer.Length];
                for (int i = 0; i < buffer.Length; i++)
                {
                    sortIndices[i] = i;
                }
                // Presumably reorders sortIndices according to comparer; the helper is defined elsewhere.
                IntrospectiveSort(buffer.Span, buffer.Length, sortIndices, comparer);
                bufferSortIndices.Add(sortIndices);
            }
            // Simple merge sort to build the full column's sort indices
            // Starting at sorted position startIndex, advances until IsValid accepts the row or the
            // buffer's sort indices are exhausted; returns the value read last and the final position.
            ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex)
            {
                T    value    = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]];
                long rowIndex = bufferSortIndices[bufferIndex][startIndex] + bufferIndex * _columnContainer.Buffers[0].MaxCapacity;

                while (!IsValid(rowIndex) && ++startIndex < bufferSortIndices[bufferIndex].Length)
                {
                    value    = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]];
                    // NOTE(review): here rowIndex is derived from startIndex (the sorted position), whereas the
                    // initial computation above uses bufferSortIndices[bufferIndex][startIndex] - confirm which is intended.
                    rowIndex = startIndex + bufferIndex * _columnContainer.Buffers[0].MaxCapacity;
                }
                return(value, startIndex);
            }

            // Value -> list of (position within the buffer's sort indices, buffer index), ordered by comparer.
            SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer);
            IList <DataFrameBuffer <T> > buffers = _columnContainer.Buffers;

            // Seed the heap with the first valid entry of every buffer.
            for (int i = 0; i < buffers.Count; i++)
            {
                DataFrameBuffer <T> buffer = buffers[i];
                ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0);
                // Offsets the returned position by i times the length of the first buffer's index array.
                long columnIndex = valueAndBufferIndex.Item2 + i * bufferSortIndices[0].Length;
                if (columnIndex == Length)
                {
                    // All nulls
                    continue;
                }
                // Buffers that share the same first value are grouped under a single key.
                if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1))
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i));
                }
                else
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >()
                    {
                        (valueAndBufferIndex.Item2, i)
                    });
コード例 #9
0
        /// <summary>
        /// Element-wise inequality between this column (read as strings) and the string form of
        /// the elements of <paramref name="column"/>.
        /// </summary>
        /// <param name="column">Column to compare against; must have the same length as this column.</param>
        /// <returns>A boolean column of the same name and length; entry i is true when the two elements differ.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="column"/> is null.</exception>
        /// <exception cref="ArgumentException">The two columns have different lengths.</exception>
        public override PrimitiveColumn <bool> NotEquals(BaseColumn column)
        {
            if (column is null)
            {
                throw new ArgumentNullException(nameof(column));
            }

            // TODO: Using indexing is VERY inefficient here. Each indexer call will find the "right" buffer and then return the value
            if (Length != column.Length)
            {
                throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column));
            }
            PrimitiveColumn<bool> ret = new PrimitiveColumn<bool>(Name, Length);

            for (long i = 0; i < Length; i++)
            {
                // A null element in the other column is compared as a null string instead of
                // throwing a NullReferenceException from ToString().
                ret[i] = (string)this[i] != column[i]?.ToString();
            }
            return ret;
        }
コード例 #10
0
 /// <summary>
 /// Creates a copy of this column, optionally picking the rows through a column of indices.
 /// </summary>
 /// <param name="mapIndices">
 /// When null, the underlying container is cloned as-is. Otherwise it may not be longer than
 /// this column; the result has one element per entry of <paramref name="mapIndices"/>.
 /// </param>
 /// <param name="invertMapIndices">When true, <paramref name="mapIndices"/> is read from its last entry to its first.</param>
 /// <returns>A new column carrying this column's name.</returns>
 /// <exception cref="ArgumentException"><paramref name="mapIndices"/> is longer than this column.</exception>
 public PrimitiveColumn <T> Clone(PrimitiveColumn <long> mapIndices = null, bool invertMapIndices = false)
 {
     // Plain copy: no remapping requested.
     if (mapIndices is null)
     {
         PrimitiveColumnContainer<T> containerCopy = _columnContainer.Clone();
         return new PrimitiveColumn<T>(Name, containerCopy);
     }

     if (mapIndices.Length > Length)
     {
         throw new ArgumentException(Strings.MapIndicesExceedsColumnLenth, nameof(mapIndices));
     }

     PrimitiveColumn<T> result = new PrimitiveColumn<T>(Name, mapIndices.Length);

     // The flag is switched off for the duration of the copy and the null count is
     // maintained by hand below; it is switched back on before returning.
     result._columnContainer._modifyNullCountWhileIndexing = false;

     long count = mapIndices.Length;
     for (long target = 0; target < count; target++)
     {
         // Forward mode reads entry `target`; inverted mode reads from the tail towards the head.
         long source = invertMapIndices ? count - 1 - target : target;
         T? element = _columnContainer[mapIndices._columnContainer[source].Value];
         result[target] = element;
         if (!element.HasValue)
         {
             result._columnContainer.NullCount++;
         }
     }

     result._columnContainer._modifyNullCountWhileIndexing = true;
     return result;
 }
コード例 #11
0
        /// <summary>
        /// Builds sort indices for the whole column: every buffer is sorted on its own, the first
        /// sorted entry of each non-empty buffer seeds a sorted dictionary keyed by value, and
        /// <c>PopulateColumnSortIndicesWithHeap</c> then fills <paramref name="columnSortIndices"/>.
        /// </summary>
        /// <param name="comparer">Ordering used both for the per-buffer sort and for the dictionary keys.</param>
        /// <param name="columnSortIndices">Receives a new column named "SortIndices".</param>
        private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices)
        {
            List<int[]> bufferSortIndices = new List<int[]>(_columnContainer.Buffers.Count);

            // Sort each buffer first, starting from the identity permutation.
            foreach (DataFrameBuffer<T> buffer in _columnContainer.Buffers)
            {
                var sortIndices = new int[buffer.Length];
                for (int i = 0; i < buffer.Length; i++)
                {
                    sortIndices[i] = i;
                }
                IntrospectiveSort(buffer.Span, buffer.Length, sortIndices, comparer);
                bufferSortIndices.Add(sortIndices);
            }

            // Simple merge sort to build the full column's sort indices.
            // Value -> list of (position within the buffer's sort indices, buffer index).
            SortedDictionary<T, List<Tuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<T, List<Tuple<int, int>>>(comparer);
            IList<DataFrameBuffer<T>> buffers = _columnContainer.Buffers;

            for (int i = 0; i < buffers.Count; i++)
            {
                // An empty buffer has no first element; indexing it would throw, so it contributes nothing.
                if (bufferSortIndices[i].Length == 0)
                {
                    continue;
                }

                T value = buffers[i][bufferSortIndices[i][0]];

                // Single lookup: fetch the existing list or create and register a new one.
                List<Tuple<int, int>> entries;
                if (!heapOfValueAndListOfTupleOfSortAndBufferIndex.TryGetValue(value, out entries))
                {
                    entries = new List<Tuple<int, int>>();
                    heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(value, entries);
                }
                entries.Add(new Tuple<int, int>(0, i));
            }

            columnSortIndices = new PrimitiveColumn<long>("SortIndices");

            // Accessors handed to the merge step so it can walk each buffer's sorted order.
            GetBufferSortIndex     getBufferSortIndex     = new GetBufferSortIndex((int bufferIndex, int sortIndex) => bufferSortIndices[bufferIndex][sortIndex]);
            GetValueAtBuffer<T>    getValueAtBuffer       = new GetValueAtBuffer<T>((int bufferIndex, int sortIndex) => _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][sortIndex]]);
            GetBufferLengthAtIndex getBufferLengthAtIndex = new GetBufferLengthAtIndex((int bufferIndex) => bufferSortIndices[bufferIndex].Length);

            PopulateColumnSortIndicesWithHeap(heapOfValueAndListOfTupleOfSortAndBufferIndex, columnSortIndices, getBufferSortIndex, getValueAtBuffer, getBufferLengthAtIndex);
        }
コード例 #12
0
        // Builds sort indices for the whole column: every buffer is sorted on its own, entries
        // that are not valid according to the buffer's null bitmap are dropped, and the first
        // remaining entry of each buffer is then placed into a sorted dictionary keyed by value
        // (used as a heap for the merge).
        private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices)
        {
            // One list of surviving (non-null) sort indices per buffer.
            List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count);

            // Sort each buffer first
            for (int b = 0; b < _columnContainer.Buffers.Count; b++)
            {
                ReadOnlyDataFrameBuffer <T> buffer         = _columnContainer.Buffers[b];
                // Null bitmap that belongs to the same buffer position b.
                ReadOnlySpan <byte>         nullBitMapSpan = _columnContainer.NullBitMapBuffers[b].ReadOnlySpan;
                // Start from the identity permutation 0..Length-1.
                int[] sortIndices = new int[buffer.Length];
                for (int i = 0; i < buffer.Length; i++)
                {
                    sortIndices[i] = i;
                }
                // Presumably reorders sortIndices according to comparer; the helper is defined elsewhere.
                IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer);
                // Bug fix: QuickSort is not stable. When PrimitiveColumn has null values and default values, they move around
                List <int> nonNullSortIndices = new List <int>();
                for (int i = 0; i < sortIndices.Length; i++)
                {
                    // Validity is checked with the buffer-local index against this buffer's bitmap.
                    if (_columnContainer.IsValid(nullBitMapSpan, sortIndices[i]))
                    {
                        nonNullSortIndices.Add(sortIndices[i]);
                    }
                }
                bufferSortIndices.Add(nonNullSortIndices);
            }
            // Simple merge sort to build the full column's sort indices
            // Returns the value found at sorted position startIndex of the given buffer, paired with startIndex itself.
            ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex)
            {
                int index = bufferSortIndices[bufferIndex][startIndex];
                T   value;
                ReadOnlyMemory <byte> buffer      = _columnContainer.Buffers[bufferIndex].ReadOnlyBuffer;
                // Reinterprets the ReadOnlyMemory<byte> struct as ReadOnlyMemory<T> without copying.
                ReadOnlyMemory <T>    typedBuffer = Unsafe.As <ReadOnlyMemory <byte>, ReadOnlyMemory <T> >(ref buffer);

                if (!typedBuffer.IsEmpty)
                {
                    // Read straight from the backing array when one is available; otherwise use the buffer indexer.
                    bool isArray = MemoryMarshal.TryGetArray(typedBuffer, out ArraySegment <T> arraySegment);
                    if (isArray)
                    {
                        value = arraySegment.Array[index + arraySegment.Offset];
                    }
                    else
                    {
                        value = _columnContainer.Buffers[bufferIndex][index];
                    }
                }
                else
                {
                    value = _columnContainer.Buffers[bufferIndex][index];
                }
                return(value, startIndex);
            }

            // Value -> list of (position within the buffer's sort indices, buffer index), ordered by comparer.
            SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer);
            IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers;

            // Seed the heap with the first surviving entry of every buffer.
            for (int i = 0; i < buffers.Count; i++)
            {
                ReadOnlyDataFrameBuffer <T> buffer = buffers[i];
                if (bufferSortIndices[i].Count == 0)
                {
                    // All nulls
                    continue;
                }
                ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0);
                // Buffers that share the same first value are grouped under a single key.
                if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1))
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i));
                }
                else
                {
                    heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >()
                    {
                        (valueAndBufferIndex.Item2, i)
                    });
コード例 #13
0
        /// <summary>
        /// Returns a sorted copy of this column.
        /// </summary>
        /// <param name="ascending">When false, the ascending sort indices are applied in inverted order.</param>
        /// <returns>The column produced by <c>Clone</c> from the ascending sort indices.</returns>
        public override BaseColumn Sort(bool ascending = true)
        {
            var ascendingIndices = GetAscendingSortIndices() as PrimitiveColumn<long>;
            bool invert = !ascending;

            return Clone(ascendingIndices, invert, NullCount);
        }
コード例 #14
0
        /// <summary>
        /// Reads a text file as a DataFrame.
        /// Follows pandas API.
        /// The stream is read twice: a first pass counts the rows and collects sample rows for
        /// type guessing, a second pass fills the columns.
        /// </summary>
        /// <param name="createStream">function which creates a stream; it is invoked once per pass</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty); columns without a supplied name are called "Column" followed by their index</param>
        /// <param name="dataTypes">column types (can be empty); not consulted by this implementation, types are always guessed</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
        /// <param name="guessRows">number of rows used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns>DataFrame</returns>
        /// <exception cref="ArgumentNullException"><paramref name="createStream"/> is null.</exception>
        /// <exception cref="FormatException">No data row was collected for type guessing.</exception>
        /// <exception cref="NotSupportedException">A guessed column type is not bool, float or string.</exception>
        public static DataFrame ReadStream(Func <StreamReader> createStream,
                                           char separator          = ',', bool header       = true,
                                           string[] columnNames    = null, Type[] dataTypes = null,
                                           long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false)
        {
            if (createStream == null)
            {
                throw new ArgumentNullException(nameof(createStream));
            }

            var  linesForGuessType = new List<string[]>();
            long rowline           = 0;
            int  numberOfColumns   = 0;

            // The header line counts as a physical line, so extend the limit by one.
            if (header && numberOfRowsToRead != -1)
            {
                numberOfRowsToRead++;
            }

            // First pass: schema and number of rows.
            using (StreamReader st = createStream())
            {
                string line = st.ReadLine();
                while (line != null)
                {
                    bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
                    if (withinRowLimit && linesForGuessType.Count < guessRows)
                    {
                        string[] spl = line.Split(separator);
                        if (header && rowline == 0)
                        {
                            // Caller-supplied names take precedence over the header line.
                            if (columnNames == null)
                            {
                                columnNames = spl;
                            }
                        }
                        else
                        {
                            linesForGuessType.Add(spl);
                            numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                        }
                    }
                    ++rowline;
                    if (rowline == numberOfRowsToRead)
                    {
                        break;
                    }
                    line = st.ReadLine();
                }
            }

            if (linesForGuessType.Count == 0)
            {
                throw new FormatException(Strings.EmptyFile);
            }

            List<BaseColumn> columns = new List<BaseColumn>(numberOfColumns);

            // Number of data rows: every line seen in the first pass, minus the header line.
            long columnLength = header ? rowline - 1 : rowline;

            // Guesses types and adds columns.
            for (int i = 0; i < numberOfColumns; ++i)
            {
                // A sampled row can have more fields than there are names (numberOfColumns is the
                // widest sampled row); fall back to a generated name instead of indexing past the array.
                string columnName = (columnNames != null && i < columnNames.Length)
                    ? columnNames[i]
                    : "Column" + i.ToString();

                Type kind = GuessKind(i, linesForGuessType);
                if (kind == typeof(bool))
                {
                    columns.Add(new PrimitiveColumn<bool>(columnName, columnLength));
                }
                else if (kind == typeof(float))
                {
                    columns.Add(new PrimitiveColumn<float>(columnName, columnLength));
                }
                else if (kind == typeof(string))
                {
                    columns.Add(new StringColumn(columnName, columnLength));
                }
                else
                {
                    throw new NotSupportedException(nameof(kind));
                }
            }

            // Fills values.
            using (StreamReader st = createStream())
            {
                string line = st.ReadLine();
                rowline = 0;
                while (line != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
                {
                    // The header line is skipped without being split.
                    if (!(header && rowline == 0))
                    {
                        string[] spl = line.Split(separator);
                        AppendRow(columns, header ? rowline - 1 : rowline, spl);
                    }
                    ++rowline;
                    line = st.ReadLine();
                }
            }

            if (addIndexColumn)
            {
                // Index column holding 0..N-1, placed in front of the data columns.
                PrimitiveColumn<int> indexColumn = new PrimitiveColumn<int>("IndexColumn", columns[0].Length);
                for (int i = 0; i < columns[0].Length; i++)
                {
                    indexColumn[i] = i;
                }
                columns.Insert(0, indexColumn);
            }
            return new DataFrame(columns);
        }
コード例 #15
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join
        /// </summary>
        /// <param name="other"></param>
        /// <param name="leftJoinColumn"></param>
        /// <param name="rightJoinColumn"></param>
        /// <param name="leftSuffix"></param>
        /// <param name="rightSuffix"></param>
        /// <param name="joinAlgorithm"></param>
        /// <returns></returns>
        public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            // A simple hash join
            DataFrame ret = new DataFrame();
            PrimitiveColumn <long> emptyMap = new PrimitiveColumn <long>("Empty");

            for (int i = 0; i < ColumnCount; i++)
            {
                // Create empty columns
                BaseColumn column = Column(i).Clone(emptyMap);
                ret.InsertColumn(ret.ColumnCount, column);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                // Create empty columns
                BaseColumn column = other.Column(i).Clone(emptyMap);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, column);
            }

            // The final table size is not known until runtime
            long rowNumber = 0;

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                // First hash other dataframe on the rightJoinColumn
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(thisColumn.Length);
                }

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    if (rowNumber >= thisColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(thisColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumn[i] == null)
                            {
                                // Match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                }
                            }
                            else
                            {
                                // Cannot match nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, i, -1);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                BaseColumn thisColumn = this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>();

                BaseColumn otherColumn = other[rightJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(otherColumn.Length);
                }

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= otherColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumn[i] == null)
                            {
                                if (thisColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, row, i);
                                }
                            }
                            else
                            {
                                if (thisColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, row, i);
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, -1, i);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                // Hash the column with the smaller RowCount
                long       leftRowCount     = RowCount;
                long       rightRowCount    = other.RowCount;
                DataFrame  longerDataFrame  = leftRowCount < rightRowCount ? other : this;
                DataFrame  shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
                BaseColumn hashColumn       = (leftRowCount < rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
                BaseColumn otherColumn      = ReferenceEquals(hashColumn, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>();

                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(1);
                }

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= ret.Column(0).Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (otherColumn[i] == null)
                            {
                                if (hashColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                                }
                            }
                            else
                            {
                                if (hashColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                                }
                            }
                        }
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                BaseColumn otherColumn = other[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();
                Dictionary <TKey, long> intersection            = new Dictionary <TKey, long>(EqualityComparer <TKey> .Default);

                // Go over the records in this dataframe and match with the dictionary
                BaseColumn thisColumn = this[leftJoinColumn];
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(thisColumn.Length + 1);
                }

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    if (rowNumber >= thisColumn.Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(thisColumn[i] ?? default(TKey));
                    if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
                    {
                        foreach (long row in rowNumbers)
                        {
                            if (thisColumn[i] == null)
                            {
                                // Has to match only with nulls in otherColumn
                                if (otherColumn[row] == null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                    if (!intersection.ContainsKey(value))
                                    {
                                        intersection.Add(value, rowNumber);
                                    }
                                }
                            }
                            else
                            {
                                // Cannot match to nulls in otherColumn
                                if (otherColumn[row] != null)
                                {
                                    AppendForMerge(ret, rowNumber++, this, other, i, row);
                                    if (!intersection.ContainsKey(value))
                                    {
                                        intersection.Add(value, rowNumber);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        AppendForMerge(ret, rowNumber++, this, other, i, -1);
                    }
                }
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    if (rowNumber >= ret.Column(0).Length)
                    {
                        for (int c = 0; c < ret.ColumnCount; c++)
                        {
                            ret.Column(c).Resize(rowNumber + 1);
                        }
                    }
                    TKey value = (TKey)(otherColumn[i] ?? default(TKey));
                    if (!intersection.ContainsKey(value))
                    {
                        if (rowNumber >= otherColumn.Length)
                        {
                            for (int c = 0; c < ret.ColumnCount; c++)
                            {
                                ret.Column(c).Resize(rowNumber + 1);
                            }
                        }
                        AppendForMerge(ret, rowNumber++, this, other, -1, i);
                    }
                }
                ret._table.RowCount = rowNumber;
            }
            return(ret);
        }
コード例 #16
0
 /// <summary>
 /// Produces a sorted copy of this column.
 /// </summary>
 /// <param name="ascending">
 /// <c>true</c> to keep the ascending order of the computed sort indices;
 /// <c>false</c> to ask the clone step to invert them.
 /// </param>
 /// <returns>The column returned by <c>CloneAndAppendNulls</c> for the computed sort indices.</returns>
 public override BaseColumn Sort(bool ascending = true)
 {
     // The indices are always computed in ascending order; a descending sort is
     // expressed by asking the clone step to invert the mapping.
     bool invertOrder = !ascending;
     var ascendingOrder = GetAscendingSortIndices() as PrimitiveColumn<long>;
     return CloneAndAppendNulls(ascendingOrder, invertOrder);
 }
コード例 #17
0
        /// <summary>
        /// Builds a DataFrame from an Apache Arrow <see cref="RecordBatch"/>, creating one column
        /// per Arrow array and inserting them in the order the batch enumerates them.
        /// </summary>
        /// <param name="recordBatch">The record batch whose schema and arrays are converted.</param>
        /// <exception cref="ArgumentNullException"><paramref name="recordBatch"/> is null.</exception>
        /// <exception cref="NotImplementedException">
        /// A field has an Arrow type with no column mapping; the message is the Arrow type's name.
        /// </exception>
        public DataFrame(RecordBatch recordBatch)
        {
            if (recordBatch is null)
            {
                throw new ArgumentNullException(nameof(recordBatch));
            }

            _table = new DataFrameTable();
            Apache.Arrow.Schema arrowSchema = recordBatch.Schema;
            // Arrays are enumerated in order; fieldIndex tracks the schema field that describes each one.
            int fieldIndex = 0;
            IEnumerable<IArrowArray> arrowArrays = recordBatch.Arrays;

            foreach (IArrowArray arrowArray in arrowArrays)
            {
                Field field = arrowSchema.GetFieldByIndex(fieldIndex);
                IArrowType fieldType = field.DataType;
                BaseColumn dataFrameColumn = null;

                // Every supported case hands the Arrow value buffer and null bitmap to the matching
                // column constructor. Switch sections share one scope, hence the per-type local names.
                switch (fieldType.TypeId)
                {
                    case ArrowTypeId.Boolean:
                        BooleanArray arrowBooleanArray = (BooleanArray)arrowArray;
                        ReadOnlyMemory<byte> valueBuffer = arrowBooleanArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> nullBitMapBuffer = arrowBooleanArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<bool>(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Double:
                        PrimitiveArray<double> arrowDoubleArray = (PrimitiveArray<double>)arrowArray;
                        ReadOnlyMemory<byte> doubleValueBuffer = arrowDoubleArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> doubleNullBitMapBuffer = arrowDoubleArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<double>(field.Name, doubleValueBuffer, doubleNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Float:
                        PrimitiveArray<float> arrowFloatArray = (PrimitiveArray<float>)arrowArray;
                        ReadOnlyMemory<byte> floatValueBuffer = arrowFloatArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> floatNullBitMapBuffer = arrowFloatArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<float>(field.Name, floatValueBuffer, floatNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Int8:
                        PrimitiveArray<sbyte> arrowsbyteArray = (PrimitiveArray<sbyte>)arrowArray;
                        ReadOnlyMemory<byte> sbyteValueBuffer = arrowsbyteArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> sbyteNullBitMapBuffer = arrowsbyteArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<sbyte>(field.Name, sbyteValueBuffer, sbyteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Int16:
                        PrimitiveArray<short> arrowshortArray = (PrimitiveArray<short>)arrowArray;
                        ReadOnlyMemory<byte> shortValueBuffer = arrowshortArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> shortNullBitMapBuffer = arrowshortArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<short>(field.Name, shortValueBuffer, shortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Int32:
                        PrimitiveArray<int> arrowIntArray = (PrimitiveArray<int>)arrowArray;
                        ReadOnlyMemory<byte> intValueBuffer = arrowIntArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> intNullBitMapBuffer = arrowIntArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<int>(field.Name, intValueBuffer, intNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.Int64:
                        PrimitiveArray<long> arrowLongArray = (PrimitiveArray<long>)arrowArray;
                        ReadOnlyMemory<byte> longValueBuffer = arrowLongArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> longNullBitMapBuffer = arrowLongArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<long>(field.Name, longValueBuffer, longNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.String:
                        // Strings additionally carry an offsets buffer delimiting each value in the data buffer.
                        StringArray stringArray = (StringArray)arrowArray;
                        ReadOnlyMemory<byte> dataMemory = stringArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                        ReadOnlyMemory<byte> nullMemory = stringArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new ArrowStringColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                        break;

                    case ArrowTypeId.UInt8:
                        PrimitiveArray<byte> arrowbyteArray = (PrimitiveArray<byte>)arrowArray;
                        ReadOnlyMemory<byte> byteValueBuffer = arrowbyteArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> byteNullBitMapBuffer = arrowbyteArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<byte>(field.Name, byteValueBuffer, byteNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.UInt16:
                        PrimitiveArray<ushort> arrowUshortArray = (PrimitiveArray<ushort>)arrowArray;
                        ReadOnlyMemory<byte> ushortValueBuffer = arrowUshortArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> ushortNullBitMapBuffer = arrowUshortArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<ushort>(field.Name, ushortValueBuffer, ushortNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.UInt32:
                        PrimitiveArray<uint> arrowUintArray = (PrimitiveArray<uint>)arrowArray;
                        ReadOnlyMemory<byte> uintValueBuffer = arrowUintArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> uintNullBitMapBuffer = arrowUintArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<uint>(field.Name, uintValueBuffer, uintNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    case ArrowTypeId.UInt64:
                        PrimitiveArray<ulong> arrowUlongArray = (PrimitiveArray<ulong>)arrowArray;
                        ReadOnlyMemory<byte> ulongValueBuffer = arrowUlongArray.ValueBuffer.Memory;
                        ReadOnlyMemory<byte> ulongNullBitMapBuffer = arrowUlongArray.NullBitmapBuffer.Memory;
                        dataFrameColumn = new PrimitiveColumn<ulong>(field.Name, ulongValueBuffer, ulongNullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                        break;

                    // Arrow types without a column mapping; listed explicitly to document the gap.
                    case ArrowTypeId.Decimal:
                    case ArrowTypeId.Binary:
                    case ArrowTypeId.Date32:
                    case ArrowTypeId.Date64:
                    case ArrowTypeId.Dictionary:
                    case ArrowTypeId.FixedSizedBinary:
                    case ArrowTypeId.HalfFloat:
                    case ArrowTypeId.Interval:
                    case ArrowTypeId.List:
                    case ArrowTypeId.Map:
                    case ArrowTypeId.Null:
                    case ArrowTypeId.Struct:
                    case ArrowTypeId.Time32:
                    case ArrowTypeId.Time64:
                    default:
                        // Report the actual Arrow type name; nameof(fieldType.Name) would
                        // always evaluate to the constant string "Name".
                        throw new NotImplementedException(fieldType.Name);
                }

                _table.InsertColumn(ColumnCount, dataFrameColumn);
                fieldIndex++;
            }
        }
コード例 #18
0
        /// <summary>
        /// Joins this DataFrame with <paramref name="other"/> by row position: row <c>i</c> of this
        /// frame is placed next to row <c>i</c> of <paramref name="other"/>. No key column is consulted.
        /// The columns of this frame come first in the result, followed by the columns of <paramref name="other"/>.
        /// </summary>
        /// <param name="other">The DataFrame whose columns are placed to the right of this frame's columns.</param>
        /// <param name="leftSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for each column taken from <paramref name="other"/>.</param>
        /// <param name="rightSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for each column taken from <paramref name="other"/>.</param>
        /// <param name="joinAlgorithm">
        /// Decides the row count of the result: <c>Left</c> keeps this frame's row count, <c>Right</c> keeps
        /// <paramref name="other"/>'s, <c>FullOuter</c> uses the larger of the two and <c>Inner</c> the smaller.
        /// The shorter side is cloned with <c>numberOfNullsToAppend</c> set to the difference; the longer side
        /// is cloned through an identity index map covering only the kept rows.
        /// </param>
        /// <returns>A new DataFrame built from clones of the columns of both frames.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        /// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
        public DataFrame Join(DataFrame other, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            if (other is null)
            {
                throw new ArgumentNullException(nameof(other));
            }

            DataFrame ret = new DataFrame();
            long leftRowCount = RowCount;
            long rightRowCount = other.RowCount;

            // Builds the index column 0, 1, ..., length - 1, used to clone only the first `length` rows.
            PrimitiveColumn<long> CreateIdentityIndices(long length)
            {
                PrimitiveColumn<long> indices = new PrimitiveColumn<long>("mapIndices", length);
                for (long row = 0; row < length; row++)
                {
                    indices[row] = row;
                }
                return indices;
            }

            // Appends a column cloned from `other`, after the duplicate-name suffix helper has seen it.
            void AppendRightColumn(BaseColumn column)
            {
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, column);
            }

            switch (joinAlgorithm)
            {
                case JoinAlgorithm.Left:
                {
                    for (int i = 0; i < ColumnCount; i++)
                    {
                        ret.InsertColumn(ret.ColumnCount, Column(i).Clone());
                    }

                    // The right side is either padded up to, or cut down to, this frame's row count.
                    bool padRight = rightRowCount < leftRowCount;
                    PrimitiveColumn<long> mapIndices = padRight ? null : CreateIdentityIndices(leftRowCount);
                    for (int i = 0; i < other.ColumnCount; i++)
                    {
                        BaseColumn newColumn = padRight
                            ? other.Column(i).Clone(numberOfNullsToAppend: leftRowCount - rightRowCount)
                            : other.Column(i).Clone(mapIndices);
                        AppendRightColumn(newColumn);
                    }
                    break;
                }

                case JoinAlgorithm.Right:
                {
                    // Mirror image of Left: this frame is padded or cut to other's row count.
                    bool padLeft = leftRowCount < rightRowCount;
                    PrimitiveColumn<long> mapIndices = padLeft ? null : CreateIdentityIndices(rightRowCount);
                    for (int i = 0; i < ColumnCount; i++)
                    {
                        BaseColumn newColumn = padLeft
                            ? Column(i).Clone(numberOfNullsToAppend: rightRowCount - leftRowCount)
                            : Column(i).Clone(mapIndices);
                        ret.InsertColumn(ret.ColumnCount, newColumn);
                    }
                    for (int i = 0; i < other.ColumnCount; i++)
                    {
                        AppendRightColumn(other.Column(i).Clone());
                    }
                    break;
                }

                case JoinAlgorithm.FullOuter:
                {
                    // Both sides are padded up to the longer row count (the longer side gets 0 extra nulls).
                    long newRowCount = Math.Max(leftRowCount, rightRowCount);
                    for (int i = 0; i < ColumnCount; i++)
                    {
                        ret.InsertColumn(ret.ColumnCount, Column(i).Clone(numberOfNullsToAppend: newRowCount - leftRowCount));
                    }
                    for (int i = 0; i < other.ColumnCount; i++)
                    {
                        AppendRightColumn(other.Column(i).Clone(numberOfNullsToAppend: newRowCount - rightRowCount));
                    }
                    break;
                }

                case JoinAlgorithm.Inner:
                {
                    // Both sides are cut down to the shorter row count.
                    PrimitiveColumn<long> mapIndices = CreateIdentityIndices(Math.Min(leftRowCount, rightRowCount));
                    for (int i = 0; i < ColumnCount; i++)
                    {
                        ret.InsertColumn(ret.ColumnCount, Column(i).Clone(mapIndices));
                    }
                    for (int i = 0; i < other.ColumnCount; i++)
                    {
                        AppendRightColumn(other.Column(i).Clone(mapIndices));
                    }
                    break;
                }

                default:
                    // Fail loudly instead of returning an empty frame for an unhandled algorithm.
                    throw new NotImplementedException(nameof(joinAlgorithm));
            }

            return ret;
        }
コード例 #19
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join, implemented as a simple hash join on one
        /// key column from each frame. The columns of this frame always come first in the result,
        /// followed by the columns of <paramref name="other"/>.
        /// </summary>
        /// <typeparam name="TKey">Type the join column values are cast to; used as the hash key.</typeparam>
        /// <param name="other">The right-hand DataFrame.</param>
        /// <param name="leftJoinColumn">Name of the key column in this DataFrame.</param>
        /// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
        /// <param name="leftSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for each column taken from <paramref name="other"/>.</param>
        /// <param name="rightSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for each column taken from <paramref name="other"/>.</param>
        /// <param name="joinAlgorithm">
        /// <c>Left</c> keeps every row of this frame, <c>Right</c> keeps every row of <paramref name="other"/>,
        /// <c>FullOuter</c> keeps every row of both, and <c>Inner</c> keeps only matching row pairs.
        /// Null keys match only null keys; a row without a partner is paired with a null row index.
        /// </param>
        /// <returns>A new DataFrame whose columns are clones of the source columns, reordered by the matched row indices.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        /// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
        public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            if (other is null)
            {
                throw new ArgumentNullException(nameof(other));
            }

            DataFrame ret = new DataFrame();

            // The final table size is not known until the join has run, so the join is recorded as
            // parallel lists of (left row, right row) indices; a null entry means "no row on that side".
            PrimitiveColumn<long> leftRowIndices = new PrimitiveColumn<long>("LeftIndices");
            PrimitiveColumn<long> rightRowIndices = new PrimitiveColumn<long>("RightIndices");

            BaseColumn leftColumn = this[leftJoinColumn];
            BaseColumn rightColumn = other[rightJoinColumn];

            // Hashes `hashedColumn`, then walks `probeColumn` and records every matching row pair.
            // Null keys are looked up as default(TKey), so each candidate is re-checked to make sure
            // null pairs only with null and non-null only with non-null.
            void ProbeHashedColumn(
                BaseColumn hashedColumn,
                BaseColumn probeColumn,
                PrimitiveColumn<long> hashedRowIndices,
                PrimitiveColumn<long> probeRowIndices,
                bool keepUnmatchedProbeRows,
                HashSet<long> matchedHashedRows)
            {
                Dictionary<TKey, ICollection<long>> multimap = hashedColumn.GroupColumnValues<TKey>();

                for (long probeRow = 0; probeRow < probeColumn.Length; probeRow++)
                {
                    object probeValue = probeColumn[probeRow];
                    bool probeIsNull = probeValue == null;
                    TKey key = (TKey)(probeValue ?? default(TKey));

                    bool matched = false;
                    if (multimap.TryGetValue(key, out ICollection<long> candidateRows))
                    {
                        foreach (long candidateRow in candidateRows)
                        {
                            bool candidateIsNull = hashedColumn[candidateRow] == null;
                            if (candidateIsNull != probeIsNull)
                            {
                                continue;
                            }

                            probeRowIndices.Append(probeRow);
                            hashedRowIndices.Append(candidateRow);
                            matchedHashedRows?.Add(candidateRow);
                            matched = true;
                        }
                    }

                    // A key hit whose candidates were all rejected by the null check is still
                    // "no match": outer joins must keep the probe row rather than drop it.
                    if (!matched && keepUnmatchedProbeRows)
                    {
                        probeRowIndices.Append(probeRow);
                        hashedRowIndices.Append(null);
                    }
                }
            }

            switch (joinAlgorithm)
            {
                case JoinAlgorithm.Left:
                    // Hash the right key column and probe it with every left row.
                    ProbeHashedColumn(rightColumn, leftColumn, rightRowIndices, leftRowIndices, keepUnmatchedProbeRows: true, matchedHashedRows: null);
                    break;

                case JoinAlgorithm.Right:
                    // Hash the left key column and probe it with every right row.
                    ProbeHashedColumn(leftColumn, rightColumn, leftRowIndices, rightRowIndices, keepUnmatchedProbeRows: true, matchedHashedRows: null);
                    break;

                case JoinAlgorithm.Inner:
                    // Hash the column with the smaller RowCount. Whichever side is hashed, the indices are
                    // routed so that this frame stays on the left of the result.
                    if (RowCount < other.RowCount)
                    {
                        ProbeHashedColumn(leftColumn, rightColumn, leftRowIndices, rightRowIndices, keepUnmatchedProbeRows: false, matchedHashedRows: null);
                    }
                    else
                    {
                        ProbeHashedColumn(rightColumn, leftColumn, rightRowIndices, leftRowIndices, keepUnmatchedProbeRows: false, matchedHashedRows: null);
                    }
                    break;

                case JoinAlgorithm.FullOuter:
                {
                    // Left-join pass that also remembers which right rows found a partner. Rows are tracked
                    // by index, not by key, so a null key and a default(TKey) key cannot hide each other.
                    HashSet<long> matchedRightRows = new HashSet<long>();
                    ProbeHashedColumn(rightColumn, leftColumn, rightRowIndices, leftRowIndices, keepUnmatchedProbeRows: true, matchedHashedRows: matchedRightRows);

                    // Then add every right row that no left row matched.
                    for (long rightRow = 0; rightRow < rightColumn.Length; rightRow++)
                    {
                        if (!matchedRightRows.Contains(rightRow))
                        {
                            leftRowIndices.Append(null);
                            rightRowIndices.Append(rightRow);
                        }
                    }
                    break;
                }

                default:
                    throw new NotImplementedException(nameof(joinAlgorithm));
            }

            // Materialize the result by cloning each source column through its row-index mapping.
            for (int i = 0; i < ColumnCount; i++)
            {
                ret.InsertColumn(i, Column(i).Clone(leftRowIndices));
            }
            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn column = other.Column(i).Clone(rightRowIndices);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, column);
            }
            return ret;
        }