/// <summary>
/// Computes the row pairings produced by joining <paramref name="retainedDataFrame"/> with
/// <paramref name="supplementaryDataFrame"/> on the given, positionally paired, join columns.
/// </summary>
/// <param name="retainedDataFrame">Dataframe whose rows are visited one by one, in index order, to build the result.</param>
/// <param name="supplementaryDataFrame">Dataframe whose rows are looked up for each retained row.</param>
/// <param name="retainedJoinColumnNames">Names of the join columns in the retained dataframe.</param>
/// <param name="supplemetaryJoinColumnNames">Names of the join columns in the supplementary dataframe; entry N is matched against entry N of <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="retainedRowIndices">Receives the retained row index of every produced pairing.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary row index of every produced pairing; null marks a retained row without a match (only when <paramref name="isInner"/> is false).</param>
/// <param name="isInner">When true, retained rows that have no matching supplementary row are skipped instead of being paired with null.</param>
/// <param name="calculateIntersection">When true, the supplementary row indices matched through the occurrence lookup are collected and returned.</param>
/// <returns>The set of matched supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
/// <exception cref="ArgumentNullException">One of the join column name arrays is null.</exception>
/// <exception cref="ArgumentException">The join column name arrays differ in length, or are empty.</exception>
private static HashSet<long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
{
    if (retainedJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(retainedJoinColumnNames));
    }

    if (supplemetaryJoinColumnNames == null)
    {
        throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
    }

    if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
    {
        throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
    }

    // Without any join column the loop below never runs, the occurrence map stays null
    // and would be dereferenced while merging. Fail early with a meaningful exception instead.
    if (retainedJoinColumnNames.Length == 0)
    {
        throw new ArgumentException("At least one join column must be specified.", nameof(retainedJoinColumnNames));
    }

    HashSet<long> intersection = calculateIntersection ? new HashSet<long>() : null;

    // Maps a retained row index to the supplementary row indices that match it
    // on every join column processed so far.
    Dictionary<long, ICollection<long>> occurrences = null;

    // Union of the null indices reported by GetGroupedOccurrences for each supplementary join column.
    HashSet<long> supplementaryJoinColumnsNullIndices = new HashSet<long>();

    for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
    {
        DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

        // From the second join column on, only retained rows that still have matches are examined.
        // The array doubles as the mapping "index in the shrunken column -> index in the original dataframe".
        long[] survivingRetainedIndices = null;
        if (occurrences != null)
        {
            survivingRetainedIndices = occurrences.Keys.ToArray();
            retainedColumn = retainedColumn.Clone(new Int64DataFrameColumn("Indices", survivingRetainedIndices));
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

        // Find the occurrences for the current join column.
        var newOccurrences = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> supplementaryColumnNullIndices);

        // Translate keys from shrunken-column indices back to original dataframe indices.
        if (survivingRetainedIndices != null)
        {
            long[] originalIndices = survivingRetainedIndices;
            newOccurrences = newOccurrences.ToDictionary(kvp => originalIndices[kvp.Key], kvp => kvp.Value);
        }

        supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

        // A pairing survives only if it matched on all previous join columns as well,
        // so intersect the new matches with the ones accumulated so far.
        if (occurrences != null)
        {
            var shrinkedOccurences = new Dictionary<long, ICollection<long>>();
            foreach (var kvp in newOccurrences)
            {
                // Look the previous matches up once per key and probe them through a set,
                // rather than doing a dictionary lookup plus a linear scan per element.
                var previousMatches = new HashSet<long>(occurrences[kvp.Key]);
                long[] stillMatching = kvp.Value.Where(index => previousMatches.Contains(index)).ToArray();
                if (stillMatching.Length > 0)
                {
                    shrinkedOccurences.Add(kvp.Key, stillMatching);
                }
            }

            newOccurrences = shrinkedOccurences;
        }

        occurrences = newOccurrences;
    }

    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    // Perform merging
    var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();
    for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++)
    {
        if (IsAnyNullValueInColumns(retainJoinColumns, i))
        {
            // A retained row with a null join value is paired with every supplementary row
            // recorded in the null index set. isInner is not consulted here and these
            // pairings are not added to the intersection.
            foreach (long row in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(row);
            }

            continue;
        }

        // Get all supplementary row indices that satisfy the JOIN condition for this retained row.
        if (occurrences.TryGetValue(i, out ICollection<long> rowIndices))
        {
            foreach (long supplementaryRowIndex in rowIndices)
            {
                retainedRowIndices.Append(i);
                supplementaryRowIndices.Append(supplementaryRowIndex);

                // Store the intersection if required; HashSet.Add ignores values already present.
                if (calculateIntersection)
                {
                    intersection.Add(supplementaryRowIndex);
                }
            }
        }
        else if (!isInner)
        {
            // No match: keep the retained row and pair it with a null supplementary index.
            retainedRowIndices.Append(i);
            supplementaryRowIndices.Append(null);
        }
    }

    return intersection;
}
/// <summary>
/// Builds, join column by join column, the map from retained row index to the supplementary
/// row indices that match it on the join columns.
/// </summary>
/// <param name="retainedDataFrame">Dataframe providing the retained join columns.</param>
/// <param name="supplementaryDataFrame">Dataframe providing the supplementary join columns.</param>
/// <param name="retainedJoinColumnNames">Names of the join columns in the retained dataframe; its length drives the loop.</param>
/// <param name="supplemetaryJoinColumnNames">Names of the join columns in the supplementary dataframe, indexed in step with <paramref name="retainedJoinColumnNames"/>. This method does not validate the arrays (null or differing lengths); callers are presumably expected to do so.</param>
/// <param name="supplementaryJoinColumnsNullIndices">Receives the union of the null indices reported by GetGroupedOccurrences for each supplementary join column.</param>
/// <returns>
/// The occurrence map keyed by row index in the original retained dataframe,
/// or null when <paramref name="retainedJoinColumnNames"/> is empty (the loop never assigns it).
/// </returns>
private static Dictionary<long, ICollection<long>> GetOccurences(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out HashSet<long> supplementaryJoinColumnsNullIndices)
{
    supplementaryJoinColumnsNullIndices = new HashSet<long>();

    // Occurrences accumulated over the join columns processed so far.
    Dictionary<long, ICollection<long>> occurrences = null;

    for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++)
    {
        DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]];

        // From the second join column on, only retained rows with occurrences from the previous
        // step go on to further processing. The key array itself is the mapping
        // "index in the shrunken column -> index in the original dataframe",
        // so no separate lookup dictionary is needed.
        long[] survivingRetainedIndices = null;
        if (occurrences != null)
        {
            survivingRetainedIndices = occurrences.Keys.ToArray();
            var indices = new Int64DataFrameColumn("Indices", survivingRetainedIndices);
            retainedColumn = retainedColumn.Clone(indices);
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]];

        // Find the occurrences for the current join column.
        var newOccurrences = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> supplementaryColumnNullIndices);

        // Convert keys from local (shrunken column) indices to indices in the original dataframe.
        if (survivingRetainedIndices != null)
        {
            long[] originalIndices = survivingRetainedIndices;
            newOccurrences = newOccurrences.ToDictionary(kvp => originalIndices[kvp.Key], kvp => kvp.Value);
        }

        supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

        // Shrink the result for the current column by the previous join columns (if any):
        // a JOIN happens only if ALL left and right join columns match.
        // NOTE(review): GetShrinkedOccurences is defined outside this view; presumably it keeps
        // only the matches that are also present in the accumulated occurrences - confirm.
        if (occurrences != null)
        {
            newOccurrences = GetShrinkedOccurences(occurrences, newOccurrences);
        }

        occurrences = newOccurrences;
    }

    return occurrences;
}