// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merges this DataFrame with <paramref name="other"/> using a database style hash join.
/// One join column is hashed with <c>GroupColumnValues</c>, the other is scanned row by row,
/// and the matching row indices of both frames are collected. The result columns are then
/// produced by cloning every source column through the collected index maps.
/// </summary>
/// <typeparam name="TKey">Type the join column values are cast to for hashing.</typeparam>
/// <param name="other">The right-hand DataFrame of the join.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for colliding column names.</param>
/// <param name="rightSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for colliding column names.</param>
/// <param name="joinAlgorithm">Which rows without a partner are kept: Left, Right, Inner or FullOuter.</param>
/// <returns>
/// A new DataFrame holding this DataFrame's columns followed by <paramref name="other"/>'s columns,
/// for every join algorithm.
/// </returns>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    DataFrame ret = new DataFrame();

    // The final table size is not known until runtime, so collect one (left row, right row)
    // index pair per output row. A null index marks the side that has no partner row.
    PrimitiveColumn<long> leftRowIndices = new PrimitiveColumn<long>("LeftIndices");
    PrimitiveColumn<long> rightRowIndices = new PrimitiveColumn<long>("RightIndices");

    void AppendRowPair(long? leftRow, long? rightRow)
    {
        leftRowIndices.Append(leftRow);
        rightRowIndices.Append(rightRow);
    }

    // Hashes hashedColumn, then scans probeColumn and records every matching row pair.
    // probeIsLeft tells which side of the output the probe rows belong to.
    // keepUnmatchedProbeRows emits probe rows without a partner (outer join behaviour).
    // matchedHashedRows, when not null, receives every hashed row that found a partner.
    void HashJoin(BaseColumn probeColumn, BaseColumn hashedColumn, bool probeIsLeft, bool keepUnmatchedProbeRows, HashSet<long> matchedHashedRows)
    {
        Dictionary<TKey, ICollection<long>> multimap = hashedColumn.GroupColumnValues<TKey>();
        for (long i = 0; i < probeColumn.Length; i++)
        {
            var probeValue = probeColumn[i];
            bool probeIsNull = probeValue == null;
            // A null probe value is looked up under default(TKey), so the bucket that comes
            // back has to be filtered below: nulls pair only with nulls.
            TKey key = (TKey)(probeIsNull ? default(TKey) : probeValue);

            bool matched = false;
            if (multimap.TryGetValue(key, out ICollection<long> candidateRows))
            {
                foreach (long row in candidateRows)
                {
                    bool hashedIsNull = hashedColumn[row] == null;
                    if (hashedIsNull != probeIsNull)
                    {
                        continue;
                    }

                    matched = true;
                    matchedHashedRows?.Add(row);
                    if (probeIsLeft)
                    {
                        AppendRowPair(i, row);
                    }
                    else
                    {
                        AppendRowPair(row, i);
                    }
                }
            }

            // Decide on "matched" rather than on the dictionary hit: a bucket can be found
            // and still contain no row that survives the null filter.
            if (!matched && keepUnmatchedProbeRows)
            {
                if (probeIsLeft)
                {
                    AppendRowPair(i, null);
                }
                else
                {
                    AppendRowPair(null, i);
                }
            }
        }
    }

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // Hash other on rightJoinColumn, keep every row of this
        HashJoin(this[leftJoinColumn], other[rightJoinColumn], probeIsLeft: true, keepUnmatchedProbeRows: true, matchedHashedRows: null);
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Hash this on leftJoinColumn, keep every row of other
        HashJoin(other[rightJoinColumn], this[leftJoinColumn], probeIsLeft: false, keepUnmatchedProbeRows: true, matchedHashedRows: null);
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column of the frame with the smaller RowCount. Only the hashing side
        // changes; the output always keeps this on the left and other on the right.
        if (RowCount < other.RowCount)
        {
            HashJoin(other[rightJoinColumn], this[leftJoinColumn], probeIsLeft: false, keepUnmatchedProbeRows: false, matchedHashedRows: null);
        }
        else
        {
            HashJoin(this[leftJoinColumn], other[rightJoinColumn], probeIsLeft: true, keepUnmatchedProbeRows: false, matchedHashedRows: null);
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        BaseColumn rightColumn = other[rightJoinColumn];

        // Track matched right rows by index, not by key, so that a null row and a
        // default(TKey) row are never mistaken for each other.
        HashSet<long> matchedRightRows = new HashSet<long>();
        HashJoin(this[leftJoinColumn], rightColumn, probeIsLeft: true, keepUnmatchedProbeRows: true, matchedHashedRows: matchedRightRows);

        // Append the right rows that never found a partner
        for (long i = 0; i < rightColumn.Length; i++)
        {
            if (!matchedRightRows.Contains(i))
            {
                AppendRowPair(null, i);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    for (int i = 0; i < ColumnCount; i++)
    {
        ret.InsertColumn(i, Column(i).Clone(leftRowIndices));
    }
    for (int i = 0; i < other.ColumnCount; i++)
    {
        BaseColumn column = other.Column(i).Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, column);
    }
    return ret;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merges this DataFrame with <paramref name="other"/> using a database style hash join.
/// The result is first created with empty clones of every column of both frames and is then
/// filled row by row through <c>AppendForMerge</c>, growing the columns on demand.
/// </summary>
/// <typeparam name="TKey">Type the join column values are cast to for hashing.</typeparam>
/// <param name="other">The right-hand DataFrame of the join.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for colliding column names.</param>
/// <param name="rightSuffix">Forwarded to <c>SetSuffixForDuplicatedColumnNames</c> for colliding column names.</param>
/// <param name="joinAlgorithm">Which rows without a partner are kept: Left, Right, Inner or FullOuter.</param>
/// <returns>
/// A new DataFrame holding this DataFrame's columns followed by <paramref name="other"/>'s columns.
/// Its row count is set to the number of rows written; because of the initial reservation the
/// underlying columns can be longer than that.
/// </returns>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // A simple hash join
    DataFrame ret = new DataFrame();
    PrimitiveColumn<long> emptyMap = new PrimitiveColumn<long>("Empty");
    for (int i = 0; i < ColumnCount; i++)
    {
        // Create empty columns
        ret.InsertColumn(ret.ColumnCount, Column(i).Clone(emptyMap));
    }
    for (int i = 0; i < other.ColumnCount; i++)
    {
        // Create empty columns
        BaseColumn column = other.Column(i).Clone(emptyMap);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, column);
    }

    // The final table size is not known until runtime
    long rowNumber = 0;

    void ResizeResult(long length)
    {
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(length);
        }
    }

    // Writes one output row; -1 marks the side that has no partner row.
    // Capacity is checked before every single write, because one probe row can
    // produce any number of output rows.
    void AppendRow(long leftRow, long rightRow)
    {
        if (rowNumber >= ret.Column(0).Length)
        {
            ResizeResult(rowNumber + 1);
        }
        AppendForMerge(ret, rowNumber++, this, other, leftRow, rightRow);
    }

    // Hashes hashedColumn, then scans probeColumn and writes every matching row pair.
    // probeIsLeft tells which side of the output the probe rows belong to.
    // keepUnmatchedProbeRows emits probe rows without a partner (outer join behaviour).
    // matchedHashedRows, when not null, receives every hashed row that found a partner.
    void HashJoin(BaseColumn probeColumn, BaseColumn hashedColumn, bool probeIsLeft, bool keepUnmatchedProbeRows, HashSet<long> matchedHashedRows)
    {
        Dictionary<TKey, ICollection<long>> multimap = hashedColumn.GroupColumnValues<TKey>();
        for (long i = 0; i < probeColumn.Length; i++)
        {
            var probeValue = probeColumn[i];
            bool probeIsNull = probeValue == null;
            // A null probe value is looked up under default(TKey), so the bucket that comes
            // back has to be filtered below: nulls pair only with nulls.
            TKey key = (TKey)(probeValue ?? default(TKey));

            bool matched = false;
            if (multimap.TryGetValue(key, out ICollection<long> candidateRows))
            {
                foreach (long row in candidateRows)
                {
                    bool hashedIsNull = hashedColumn[row] == null;
                    if (hashedIsNull != probeIsNull)
                    {
                        continue;
                    }

                    matched = true;
                    matchedHashedRows?.Add(row);
                    if (probeIsLeft)
                    {
                        AppendRow(i, row);
                    }
                    else
                    {
                        AppendRow(row, i);
                    }
                }
            }

            // Decide on "matched" rather than on the dictionary hit: a bucket can be found
            // and still contain no row that survives the null filter.
            if (!matched && keepUnmatchedProbeRows)
            {
                if (probeIsLeft)
                {
                    AppendRow(i, -1);
                }
                else
                {
                    AppendRow(-1, i);
                }
            }
        }
    }

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // Hash other on rightJoinColumn, keep every row of this
        BaseColumn otherColumn = other[rightJoinColumn];
        BaseColumn thisColumn = this[leftJoinColumn];
        ResizeResult(thisColumn.Length);
        HashJoin(thisColumn, otherColumn, probeIsLeft: true, keepUnmatchedProbeRows: true, matchedHashedRows: null);
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Hash this on leftJoinColumn, keep every row of other
        BaseColumn thisColumn = this[leftJoinColumn];
        BaseColumn otherColumn = other[rightJoinColumn];
        ResizeResult(otherColumn.Length);
        HashJoin(otherColumn, thisColumn, probeIsLeft: false, keepUnmatchedProbeRows: true, matchedHashedRows: null);
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column of the frame with the smaller RowCount. The side is decided from
        // the row counts alone, so merging a frame with itself keeps the indices in order.
        bool hashLeft = RowCount < other.RowCount;
        BaseColumn hashColumn = hashLeft ? this[leftJoinColumn] : other[rightJoinColumn];
        BaseColumn probeColumn = hashLeft ? other[rightJoinColumn] : this[leftJoinColumn];
        ResizeResult(1);
        HashJoin(probeColumn, hashColumn, probeIsLeft: !hashLeft, keepUnmatchedProbeRows: false, matchedHashedRows: null);
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        BaseColumn otherColumn = other[rightJoinColumn];
        BaseColumn thisColumn = this[leftJoinColumn];
        ResizeResult(thisColumn.Length + 1);

        // Track matched right rows by index, not by key, so that a null row and a
        // default(TKey) row are never mistaken for each other.
        HashSet<long> matchedRightRows = new HashSet<long>();
        HashJoin(thisColumn, otherColumn, probeIsLeft: true, keepUnmatchedProbeRows: true, matchedHashedRows: matchedRightRows);

        // Append the right rows that never found a partner
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (!matchedRightRows.Contains(i))
            {
                AppendRow(-1, i);
            }
        }
    }
    else
    {
        // Previously an unhandled value fell through and returned an empty frame
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    ret._table.RowCount = rowNumber;
    return ret;
}