/// <summary>
/// Walks the rows of <paramref name="retainedDataFrame"/> in order and emits, for every joined output row,
/// the retained row index and the supplementary row index (or null) into two parallel index columns.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows drive the output order.</param>
/// <param name="retainedJoinColumnNames">Names of the join columns in <paramref name="retainedDataFrame"/>; used to detect rows with a null key.</param>
/// <param name="occurrences">Map from retained row index to the supplementary row indices that matched it.</param>
/// <param name="supplementaryJoinColumnsNullIndices">Supplementary row indices that are paired with every retained row having a null in any join column.</param>
/// <param name="retainedRowIndices">Receives the retained row index of each output row.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary row index of each output row; null when a retained row had no match.</param>
/// <param name="isInner">When true, retained rows without a match are skipped instead of being emitted with a null partner.</param>
/// <param name="calculateIntersection">When true, every supplementary row index that was matched is collected and returned.</param>
/// <returns>The set of matched supplementary row indices when <paramref name="calculateIntersection"/> is true; otherwise null.</returns>
private static HashSet<long> PerformMerging(DataFrame retainedDataFrame, string[] retainedJoinColumnNames, Dictionary<long, ICollection<long>> occurrences, HashSet<long> supplementaryJoinColumnsNullIndices, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner, bool calculateIntersection)
{
    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    HashSet<long> intersection = calculateIntersection ? new HashSet<long>() : null;

    var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

    for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
    {
        if (IsAnyNullValueInColumns(retainJoinColumns, i))
        {
            // A retained row with a null key is paired with every supplementary row that has a null key.
            // NOTE(review): when there are no such supplementary rows the retained row is emitted
            // nowhere, even for a non-inner join - confirm this is intended.
            foreach (long row in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(row);
            }

            continue;
        }

        // Get all row indexes from the supplementary dataframe that satisfy the JOIN condition
        if (occurrences.TryGetValue(i, out ICollection<long> rowIndices))
        {
            foreach (long supplementaryRowIndex in rowIndices)
            {
                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(supplementaryRowIndex);

                // Store intersection if required (HashSet.Add ignores values already present)
                if (calculateIntersection)
                {
                    intersection.Add(supplementaryRowIndex);
                }
            }
        }
        else if (!isInner)
        {
            // No match: keep the retained row and leave the supplementary side empty
            retainedRowIndices.Append(i);
            supplementaryRowIndices.Append(null);
        }
    }

    return intersection;
}

/// <summary>
/// Joins <paramref name="retainedDataFrame"/> with <paramref name="supplementaryDataFrame"/> on one or more column pairs
/// and produces the paired row indices of the result. A retained row matches a supplementary row only if
/// ALL join column pairs match.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows drive the output order.</param>
/// <param name="supplementaryDataFrame">DataFrame that is looked up for matching rows.</param>
/// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
/// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>, positionally paired with <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="retainedRowIndices">Receives the retained row index of each output row.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary row index of each output row; null when a retained row had no match.</param>
/// <param name="isInner">When true, retained rows without a match are dropped.</param>
/// <param name="calculateIntersection">When true, the matched supplementary row indices are collected and returned.</param>
/// <returns>The set of matched supplementary row indices when <paramref name="calculateIntersection"/> is true; otherwise null.</returns>
/// <exception cref="ArgumentNullException">Either array of join column names is null.</exception>
/// <exception cref="ArgumentException">The two arrays of join column names differ in length.</exception>
private static HashSet<long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
{
    if (retainedJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(retainedJoinColumnNames));
    }

    if (supplemetaryJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
    }

    if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
    {
        throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
    }

    // Occurrences of values in the join columns: retained row index -> matching supplementary row indices.
    // NOTE(review): with zero join columns the loop below never runs and this stays null, which
    // PerformMerging dereferences for any retained row - confirm callers never pass empty arrays.
    Dictionary<long, ICollection<long>> occurrences = null;
    HashSet<long> supplementaryJoinColumnsNullIndices = new HashSet<long>();

    for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
    {
        DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

        // Only rows that still have occurrences from the previous join columns need further processing.
        // Position p in the shrunk column corresponds to original row survivingRetainedIndices[p].
        long[] survivingRetainedIndices = null;
        if (occurrences != null)
        {
            survivingRetainedIndices = occurrences.Keys.ToArray();
            retainedColumn = retainedColumn.Clone(new Int64DataFrameColumn("Indices", survivingRetainedIndices));
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

        // Find occurrences for the current join column
        var newOccurrences = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> supplementaryColumnNullIndices);

        // Convert keys from local (shrunk column) positions back to row indices in the original dataframe
        if (survivingRetainedIndices != null)
        {
            newOccurrences = newOccurrences.ToDictionary(kvp => survivingRetainedIndices[kvp.Key], kvp => kvp.Value);
        }

        supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

        // Shrink the result for the current column by the previous join columns (if any):
        // a JOIN happens only if ALL left and right join columns match, so occurrences that
        // did not exist for the previous columns have to be removed.
        if (occurrences != null)
        {
            var intersectedOccurrences = new Dictionary<long, ICollection<long>>();
            foreach (var kvp in newOccurrences)
            {
                // Look the previous matches up once per key and test membership through a hash set,
                // instead of a linear Contains per candidate.
                ICollection<long> previousMatches = occurrences[kvp.Key];
                HashSet<long> previousMatchSet = previousMatches as HashSet<long> ?? new HashSet<long>(previousMatches);

                long[] stillMatching = kvp.Value.Where(index => previousMatchSet.Contains(index)).ToArray();
                if (stillMatching.Length > 0)
                {
                    intersectedOccurrences.Add(kvp.Key, stillMatching);
                }
            }

            newOccurrences = intersectedOccurrences;
        }

        occurrences = newOccurrences;
    }

    // Perform merging
    return PerformMerging(retainedDataFrame, retainedJoinColumnNames, occurrences, supplementaryJoinColumnsNullIndices, out retainedRowIndices, out supplementaryRowIndices, isInner, calculateIntersection);
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join, implemented as a hash join on a single key column from each side.
/// </summary>
/// <typeparam name="TKey">Type that the non-null values of both join columns are cast to for hashing.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Suffix passed to SetSuffixForDuplicatedColumnNames when the columns of <paramref name="other"/> are added to the result.</param>
/// <param name="rightSuffix">Suffix passed to SetSuffixForDuplicatedColumnNames when the columns of <paramref name="other"/> are added to the result.</param>
/// <param name="joinAlgorithm">Kind of join to perform: Left, Right, Inner or FullOuter.</param>
/// <returns>A new DataFrame holding the columns of this DataFrame followed by the columns of <paramref name="other"/>, re-indexed by the join result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // A simple hash join
    DataFrame ret = new DataFrame();
    DataFrame leftDataFrame = this;
    DataFrame rightDataFrame = other;

    // The final table size is not known until runtime
    PrimitiveDataFrameColumn<long> leftRowIndices = new PrimitiveDataFrameColumn<long>("LeftIndices");
    PrimitiveDataFrameColumn<long> rightRowIndices = new PrimitiveDataFrameColumn<long>("RightIndices");

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // First hash other dataframe on the rightJoinColumn
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            var thisColumnValue = thisColumn[i];
            if (thisColumnValue != null)
            {
                if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(row);
                    }
                }
                else
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(null);
                }
            }
            else
            {
                // Null keys pair with the null-key rows of the other side.
                // NOTE(review): if the other side has no null keys this left row is not emitted at all,
                // unlike the FullOuter branch below - confirm this is intended.
                foreach (long row in otherColumnNullIndices)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Mirror image of the Left join: hash this dataframe, probe with the other one
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = thisColumn.GroupColumnValues<TKey>(out HashSet<long> thisColumnNullIndices);

        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        for (long i = 0; i < otherColumn.Length; i++)
        {
            var otherColumnValue = otherColumn[i];
            if (otherColumnValue != null)
            {
                if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        leftRowIndices.Append(row);
                        rightRowIndices.Append(i);
                    }
                }
                else
                {
                    leftRowIndices.Append(null);
                    rightRowIndices.Append(i);
                }
            }
            else
            {
                // NOTE(review): a null-key right row is dropped when this side has no null keys - confirm.
                foreach (long thisColumnNullIndex in thisColumnNullIndices)
                {
                    leftRowIndices.Append(thisColumnNullIndex);
                    rightRowIndices.Append(i);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount and probe it with the other one
        long leftRowCount = Rows.Count;
        long rightRowCount = other.Rows.Count;
        bool leftColumnIsSmaller = leftRowCount <= rightRowCount;

        DataFrameColumn leftColumn = Columns[leftJoinColumn];
        DataFrameColumn rightColumn = other.Columns[rightJoinColumn];
        DataFrameColumn hashColumn = leftColumnIsSmaller ? leftColumn : rightColumn;
        DataFrameColumn otherColumn = leftColumnIsSmaller ? rightColumn : leftColumn;

        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>(out HashSet<long> smallerDataFrameColumnNullIndices);

        for (long i = 0; i < otherColumn.Length; i++)
        {
            var otherColumnValue = otherColumn[i];
            if (otherColumnValue != null)
            {
                if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        // "row" indexes the hashed side, "i" the probing side
                        leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                        rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                    }
                }
            }
            else
            {
                foreach (long nullIndex in smallerDataFrameColumnNullIndices)
                {
                    leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                    rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Keys present on both sides; only membership is needed, so a set is enough
        HashSet<TKey> matchedKeys = new HashSet<TKey>(EqualityComparer<TKey>.Default);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices");
        for (long i = 0; i < thisColumn.Length; i++)
        {
            var thisColumnValue = thisColumn[i];
            if (thisColumnValue == null)
            {
                // Null rows are handled after both passes
                thisColumnNullIndices.Append(i);
                continue;
            }

            TKey key = (TKey)thisColumnValue;
            if (multimap.TryGetValue(key, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                    matchedKeys.Add(key);
                }
            }
            else
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Add the rows of the other dataframe whose key never matched a row of this dataframe
        for (long i = 0; i < otherColumn.Length; i++)
        {
            var value = otherColumn[i];
            if (value != null && !matchedKeys.Contains((TKey)value))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }

        // Now handle the null rows
        foreach (long? thisColumnNullIndex in thisColumnNullIndices)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(otherColumnNullIndex);
            }

            if (otherColumnNullIndices.Count == 0)
            {
                // No null partner on the other side: keep the row with an empty right side
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(null);
            }
        }

        if (thisColumnNullIndices.Length == 0)
        {
            // No null partner on this side: keep the other side's null rows with an empty left side
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(otherColumnNullIndex);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    // Materialize the result: left columns first, then right columns
    for (int i = 0; i < leftDataFrame.Columns.Count; i++)
    {
        ret.Columns.Insert(i, leftDataFrame.Columns[i].Clone(leftRowIndices));
    }

    for (int i = 0; i < rightDataFrame.Columns.Count; i++)
    {
        DataFrameColumn column = rightDataFrame.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}