Exemplo n.º 1
0
        private static HashSet <long> PerformMerging(DataFrame retainedDataFrame, string[] retainedJoinColumnNames,
                                                     Dictionary <long, ICollection <long> > occurrences, HashSet <long> supplementaryJoinColumnsNullIndices,
                                                     out PrimitiveDataFrameColumn <long> retainedRowIndices, out PrimitiveDataFrameColumn <long> supplementaryRowIndices,
                                                     bool isInner, bool calculateIntersection)
        {
            retainedRowIndices      = new Int64DataFrameColumn("RetainedIndices");
            supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

            HashSet <long> intersection = calculateIntersection ? new HashSet <long>() : null;

            var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

            for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
            {
                if (!IsAnyNullValueInColumns(retainJoinColumns, i))
                {
                    // Get all row indexes from supplementary dataframe that satisfy JOIN condition
                    if (occurrences.TryGetValue(i, out ICollection <long> rowIndices))
                    {
                        foreach (long supplementaryRowIndex in rowIndices)
                        {
                            retainedRowIndices.Append(i);
                            supplementaryRowIndices.Append(supplementaryRowIndex);

                            // Store intersection if required
                            if (calculateIntersection)
                            {
                                if (!intersection.Contains(supplementaryRowIndex))
                                {
                                    intersection.Add(supplementaryRowIndex);
                                }
                            }
                        }
                    }
                    else
                    {
                        if (isInner)
                        {
                            continue;
                        }

                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(null);
                    }
                }
                else
                {
                    foreach (long row in supplementaryJoinColumnsNullIndices)
                    {
                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(row);
                    }
                }
            }

            return(intersection);
        }
Exemplo n.º 2
0
        private static HashSet <long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn <long> retainedRowIndices, out PrimitiveDataFrameColumn <long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
        {
            if (retainedJoinColumnNames == null)
            {
                throw new ArgumentNullException(nameof(retainedJoinColumnNames));
            }

            if (supplemetaryJoinColumnNames == null)
            {
                throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
            }

            if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
            {
                throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
            }


            HashSet <long> intersection = calculateIntersection ? new HashSet <long>() : null;

            // Get occurrences of values in columns used for join in the retained and supplementary dataframes
            Dictionary <long, ICollection <long> > occurrences    = null;
            Dictionary <long, long> retainedIndicesReverseMapping = null;

            HashSet <long> supplementaryJoinColumnsNullIndices = new HashSet <long>();


            for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
            {
                DataFrameColumn shrinkedRetainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

                //shrink retained column by row occurrences from previous step
                if (occurrences != null)
                {
                    //only rows with occurences from previose step should go for futher processing
                    var shrinkedRetainedIndices = occurrences.Keys.ToArray();

                    //create reverse mapping of index of the row in the shrinked column to the index of this row in the original dataframe (new index -> original index)
                    var newRetainedIndicesReverseMapping = new Dictionary <long, long>(shrinkedRetainedIndices.Length);

                    for (int i = 0; i < shrinkedRetainedIndices.Length; i++)
                    {
                        //store reverse mapping to restore original dataframe indices from indices in shrinked row
                        var originalIndex = shrinkedRetainedIndices[i];
                        newRetainedIndicesReverseMapping.Add(i, originalIndex);
                    }

                    retainedIndicesReverseMapping = newRetainedIndicesReverseMapping;
                    shrinkedRetainedColumn        = shrinkedRetainedColumn.Clone(new Int64DataFrameColumn("Indices", shrinkedRetainedIndices));
                }

                DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

                //Find occurrenses on current step (join column)
                var newOccurrences = shrinkedRetainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet <long> supplementaryColumnNullIndices);

                //Convert indices from in key from local (shrinked row) to indices in original dataframe
                if (retainedIndicesReverseMapping != null)
                {
                    newOccurrences = newOccurrences.ToDictionary(kvp => retainedIndicesReverseMapping[kvp.Key], kvp => kvp.Value);
                }

                supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

                // shrink join result on current column by previous join columns (if any)
                // (we have to remove occurrences that doesn't exist in previous columns, because JOIN happens only if ALL left and right columns in JOIN are matched)
                if (occurrences != null)
                {
                    var shrinkedOccurences = new Dictionary <long, ICollection <long> >();

                    foreach (var kvp in newOccurrences)
                    {
                        var newValue = kvp.Value.Where(i => occurrences[kvp.Key].Contains(i)).ToArray();
                        if (newValue.Any())
                        {
                            shrinkedOccurences.Add(kvp.Key, newValue);
                        }
                    }
                    newOccurrences = shrinkedOccurences;
                }

                occurrences = newOccurrences;
            }

            retainedRowIndices      = new Int64DataFrameColumn("RetainedIndices");
            supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

            //Perform Merging
            var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

            for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
            {
                if (!IsAnyNullValueInColumns(retainJoinColumns, i))
                {
                    //Get all row indexes from supplementary dataframe that sutisfy JOIN condition
                    if (occurrences.TryGetValue(i, out ICollection <long> rowIndices))
                    {
                        foreach (long supplementaryRowIndex in rowIndices)
                        {
                            retainedRowIndices.Append(i);
                            supplementaryRowIndices.Append(supplementaryRowIndex);

                            //store intersection if required
                            if (calculateIntersection)
                            {
                                if (!intersection.Contains(supplementaryRowIndex))
                                {
                                    intersection.Add(supplementaryRowIndex);
                                }
                            }
                        }
                    }
                    else
                    {
                        if (isInner)
                        {
                            continue;
                        }

                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(null);
                    }
                }
                else
                {
                    foreach (long row in supplementaryJoinColumnsNullIndices)
                    {
                        retainedRowIndices.Append(i);
                        supplementaryRowIndices.Append(row);
                    }
                }
            }

            return(intersection);
        }
Exemplo n.º 3
0
        // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

        /// <summary>
        /// Merge DataFrames with a database style join
        /// </summary>
        /// <param name="other"></param>
        /// <param name="leftJoinColumn"></param>
        /// <param name="rightJoinColumn"></param>
        /// <param name="leftSuffix"></param>
        /// <param name="rightSuffix"></param>
        /// <param name="joinAlgorithm"></param>
        /// <returns></returns>
        public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
        {
            // A simple hash join
            DataFrame ret            = new DataFrame();
            DataFrame leftDataFrame  = this;
            DataFrame rightDataFrame = other;

            // The final table size is not known until runtime
            long rowNumber = 0;
            PrimitiveDataFrameColumn <long> leftRowIndices  = new PrimitiveDataFrameColumn <long>("LeftIndices");
            PrimitiveDataFrameColumn <long> rightRowIndices = new PrimitiveDataFrameColumn <long>("RightIndices");

            if (joinAlgorithm == JoinAlgorithm.Left)
            {
                // First hash other dataframe on the rightJoinColumn
                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>(out HashSet <long> otherColumnNullIndices);

                // Go over the records in this dataframe and match with the dictionary
                DataFrameColumn thisColumn = Columns[leftJoinColumn];

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var thisColumnValue = thisColumn[i];
                    if (thisColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(i);
                                rightRowIndices.Append(row);
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(null);
                        }
                    }
                    else
                    {
                        foreach (long row in otherColumnNullIndices)
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(row);
                        }
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Right)
            {
                DataFrameColumn thisColumn = Columns[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>(out HashSet <long> thisColumnNullIndices);

                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var otherColumnValue = otherColumn[i];
                    if (otherColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(row);
                                rightRowIndices.Append(i);
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(null);
                            rightRowIndices.Append(i);
                        }
                    }
                    else
                    {
                        foreach (long thisColumnNullIndex in thisColumnNullIndices)
                        {
                            leftRowIndices.Append(thisColumnNullIndex);
                            rightRowIndices.Append(i);
                        }
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.Inner)
            {
                // Hash the column with the smaller RowCount
                long leftRowCount  = Rows.Count;
                long rightRowCount = other.Rows.Count;

                bool            leftColumnIsSmaller             = leftRowCount <= rightRowCount;
                DataFrameColumn hashColumn                      = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn];
                DataFrameColumn otherColumn                     = ReferenceEquals(hashColumn, Columns[leftJoinColumn]) ? other.Columns[rightJoinColumn] : Columns[leftJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>(out HashSet <long> smallerDataFrameColumnNullIndices);

                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var otherColumnValue = otherColumn[i];
                    if (otherColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                                rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                            }
                        }
                    }
                    else
                    {
                        foreach (long nullIndex in smallerDataFrameColumnNullIndices)
                        {
                            leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                            rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                        }
                    }
                }
            }
            else if (joinAlgorithm == JoinAlgorithm.FullOuter)
            {
                DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
                Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>(out HashSet <long> otherColumnNullIndices);
                Dictionary <TKey, long> intersection            = new Dictionary <TKey, long>(EqualityComparer <TKey> .Default);

                // Go over the records in this dataframe and match with the dictionary
                DataFrameColumn      thisColumn            = Columns[leftJoinColumn];
                Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices");

                for (long i = 0; i < thisColumn.Length; i++)
                {
                    var thisColumnValue = thisColumn[i];
                    if (thisColumnValue != null)
                    {
                        if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection <long> rowNumbers))
                        {
                            foreach (long row in rowNumbers)
                            {
                                leftRowIndices.Append(i);
                                rightRowIndices.Append(row);
                                if (!intersection.ContainsKey((TKey)thisColumnValue))
                                {
                                    intersection.Add((TKey)thisColumnValue, rowNumber);
                                }
                            }
                        }
                        else
                        {
                            leftRowIndices.Append(i);
                            rightRowIndices.Append(null);
                        }
                    }
                    else
                    {
                        thisColumnNullIndices.Append(i);
                    }
                }
                for (long i = 0; i < otherColumn.Length; i++)
                {
                    var value = otherColumn[i];
                    if (value != null)
                    {
                        if (!intersection.ContainsKey((TKey)value))
                        {
                            leftRowIndices.Append(null);
                            rightRowIndices.Append(i);
                        }
                    }
                }

                // Now handle the null rows
                foreach (long?thisColumnNullIndex in thisColumnNullIndices)
                {
                    foreach (long otherColumnNullIndex in otherColumnNullIndices)
                    {
                        leftRowIndices.Append(thisColumnNullIndex.Value);
                        rightRowIndices.Append(otherColumnNullIndex);
                    }
                    if (otherColumnNullIndices.Count == 0)
                    {
                        leftRowIndices.Append(thisColumnNullIndex.Value);
                        rightRowIndices.Append(null);
                    }
                }
                if (thisColumnNullIndices.Length == 0)
                {
                    foreach (long otherColumnNullIndex in otherColumnNullIndices)
                    {
                        leftRowIndices.Append(null);
                        rightRowIndices.Append(otherColumnNullIndex);
                    }
                }
            }
            else
            {
                throw new NotImplementedException(nameof(joinAlgorithm));
            }

            for (int i = 0; i < leftDataFrame.Columns.Count; i++)
            {
                ret.Columns.Insert(i, leftDataFrame.Columns[i].Clone(leftRowIndices));
            }
            for (int i = 0; i < rightDataFrame.Columns.Count; i++)
            {
                DataFrameColumn column = rightDataFrame.Columns[i].Clone(rightRowIndices);
                SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
                ret.Columns.Insert(ret.Columns.Count, column);
            }
            return(ret);
        }