/// <summary>
/// Joins this DataFrame with <paramref name="other"/> by row position: row i of this frame is placed
/// next to row i of <paramref name="other"/>. No key columns are compared; the join algorithm only
/// decides how many rows the result has.
/// </summary>
/// <param name="other">DataFrame whose columns are added after this DataFrame's columns.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">
/// Left keeps this frame's row count, Right keeps <paramref name="other"/>'s, FullOuter keeps the larger of the two
/// and Inner the smaller. Columns that are too short are cloned with nulls appended; columns that are too long
/// are cloned through an index map covering only the leading rows.
/// </param>
/// <returns>
/// A new DataFrame built from cloned columns. A <paramref name="joinAlgorithm"/> value other than the four
/// handled here produces a DataFrame without columns.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public DataFrame Join(DataFrame other, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException half-way through the join.
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    DataFrame ret = new DataFrame();
    long leftRowCount = RowCount;
    long rightRowCount = other.RowCount;
    long shorterRowCount = Math.Min(leftRowCount, rightRowCount);

    // Identity map 0..length-1; cloning a column through it keeps only its first `length` rows.
    PrimitiveColumn<long> CreateLeadingRowMap(long length)
    {
        PrimitiveColumn<long> map = new PrimitiveColumn<long>("mapIndices", length);
        for (long row = 0; row < length; row++)
        {
            map[row] = row;
        }

        return map;
    }

    // Columns coming from `other` may clash with names already in the result, so they get the suffix treatment.
    void AddColumn(BaseColumn column, bool fromOther)
    {
        if (fromOther)
        {
            SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        }

        ret.InsertColumn(ret.ColumnCount, column);
    }

    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
        {
            for (int i = 0; i < ColumnCount; i++)
            {
                AddColumn(Column(i).Clone(), fromOther: false);
            }

            // The right side is either padded up to, or cut down to, the left row count.
            bool rightIsShorter = rightRowCount < leftRowCount;
            PrimitiveColumn<long> leadingRows = rightIsShorter ? null : CreateLeadingRowMap(shorterRowCount);
            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn source = other.Column(i);
                BaseColumn newColumn = rightIsShorter
                    ? source.Clone(numberOfNullsToAppend: leftRowCount - rightRowCount)
                    : source.Clone(leadingRows);
                AddColumn(newColumn, fromOther: true);
            }

            break;
        }

        case JoinAlgorithm.Right:
        {
            // Mirror image of Left: the left side is padded or cut to the right row count.
            bool leftIsShorter = leftRowCount < rightRowCount;
            PrimitiveColumn<long> leadingRows = leftIsShorter ? null : CreateLeadingRowMap(shorterRowCount);
            for (int i = 0; i < ColumnCount; i++)
            {
                BaseColumn source = Column(i);
                BaseColumn newColumn = leftIsShorter
                    ? source.Clone(numberOfNullsToAppend: rightRowCount - leftRowCount)
                    : source.Clone(leadingRows);
                AddColumn(newColumn, fromOther: false);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                AddColumn(other.Column(i).Clone(), fromOther: true);
            }

            break;
        }

        case JoinAlgorithm.FullOuter:
        {
            // Both sides are padded up to the longer row count (the padding is 0 for the longer side).
            long newRowCount = Math.Max(leftRowCount, rightRowCount);
            for (int i = 0; i < ColumnCount; i++)
            {
                AddColumn(Column(i).Clone(numberOfNullsToAppend: newRowCount - leftRowCount), fromOther: false);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                AddColumn(other.Column(i).Clone(numberOfNullsToAppend: newRowCount - rightRowCount), fromOther: true);
            }

            break;
        }

        case JoinAlgorithm.Inner:
        {
            // Both sides are cut down to the shorter row count.
            PrimitiveColumn<long> leadingRows = CreateLeadingRowMap(shorterRowCount);
            for (int i = 0; i < ColumnCount; i++)
            {
                AddColumn(Column(i).Clone(leadingRows), fromOther: false);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                AddColumn(other.Column(i).Clone(leadingRows), fromOther: true);
            }

            break;
        }
    }

    return ret;
}
/// <summary>
/// Asserts that every column of <paramref name="join"/> equals the column a positional join of
/// <paramref name="left"/> and <paramref name="right"/> is expected to produce for <paramref name="joinAlgorithm"/>.
/// The first left.ColumnCount columns of <paramref name="join"/> are compared with <paramref name="left"/>,
/// the remaining ones with <paramref name="right"/>.
/// </summary>
/// <param name="join">The joined DataFrame under test.</param>
/// <param name="left">The DataFrame the join was called on.</param>
/// <param name="right">The DataFrame that was passed to the join.</param>
/// <param name="joinAlgorithm">Algorithm the join was produced with; any value other than Left, Right or Inner is checked as a full outer join.</param>
private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm)
{
    // Identity map over the joined rows; cloning through it keeps the first join.RowCount rows of a column.
    PrimitiveColumn<long> leadingRows = new PrimitiveColumn<long>("map", join.RowCount);
    for (long row = 0; row < join.RowCount; row++)
    {
        leadingRows[row] = row;
    }

    // Brings a source column to the joined row count: pad with nulls when it is not longer, cut otherwise.
    BaseColumn PadOrTrim(BaseColumn column)
    {
        if (column.Length <= join.RowCount)
        {
            return column.Clone(numberOfNullsToAppend: join.RowCount - column.Length);
        }

        return column.Clone(leadingRows);
    }

    for (int columnIndex = 0; columnIndex < join.ColumnCount; columnIndex++)
    {
        BaseColumn joined = join.Column(columnIndex);
        bool cameFromLeft = columnIndex < left.ColumnCount;
        BaseColumn source = cameFromLeft
            ? left.Column(columnIndex)
            : right.Column(columnIndex - left.ColumnCount);

        BaseColumn expected;
        switch (joinAlgorithm)
        {
            case JoinAlgorithm.Left:
                // The left side is taken as is; the right side is fitted to it.
                expected = cameFromLeft ? source : PadOrTrim(source);
                break;
            case JoinAlgorithm.Right:
                // The right side is taken as is; the left side is fitted to it.
                expected = cameFromLeft ? PadOrTrim(source) : source;
                break;
            case JoinAlgorithm.Inner:
                expected = source.Clone(leadingRows);
                break;
            default:
                expected = source.Clone(numberOfNullsToAppend: join.RowCount - source.Length);
                break;
        }

        // Element-wise comparison; every entry must be true.
        BaseColumn matches = joined == expected;
        for (int row = 0; row < join.RowCount; row++)
        {
            Assert.Equal(true, matches[row]);
        }
    }
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join (hash join on a single key column per side).
/// </summary>
/// <typeparam name="TKey">Type the join-key values are cast to before they are looked up in the hash.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the key column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter.</param>
/// <returns>
/// A new DataFrame holding this DataFrame's columns followed by <paramref name="other"/>'s columns, each cloned
/// through the row-index pairs produced by the join. A null index marks a row without a partner.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the four supported values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The final table size is not known until runtime, so row-index pairs are collected first
    // and the columns are materialized once at the end.
    PrimitiveColumn<long> leftRowIndices = new PrimitiveColumn<long>("LeftIndices");
    PrimitiveColumn<long> rightRowIndices = new PrimitiveColumn<long>("RightIndices");

    // A simple hash join: hash `hashedColumn`, then probe it with every row of `probeColumn`.
    // keepUnmatchedProbeRows: emit (probe row, null) when the probe row found no partner.
    // matchedHashedRows: when not null, receives every hashed-side row that found a partner.
    void HashJoin(
        BaseColumn probeColumn,
        BaseColumn hashedColumn,
        PrimitiveColumn<long> probeIndices,
        PrimitiveColumn<long> hashedIndices,
        bool keepUnmatchedProbeRows,
        HashSet<long> matchedHashedRows)
    {
        Dictionary<TKey, ICollection<long>> multimap = hashedColumn.GroupColumnValues<TKey>();
        for (long i = 0; i < probeColumn.Length; i++)
        {
            var probeValue = probeColumn[i];
            bool probeIsNull = probeValue == null;

            // A null key is looked up under default(TKey); the per-candidate null check below
            // keeps nulls and real default values apart.
            // NOTE(review): for a reference-type TKey, default(TKey) is null and Dictionary lookups
            // reject null keys - confirm how GroupColumnValues treats nulls for such columns.
            TKey key = (TKey)(probeIsNull ? default(TKey) : probeValue);

            bool matched = false;
            if (multimap.TryGetValue(key, out ICollection<long> candidates))
            {
                foreach (long row in candidates)
                {
                    // Nulls pair only with nulls, non-nulls only with non-nulls.
                    if ((hashedColumn[row] == null) == probeIsNull)
                    {
                        probeIndices.Append(i);
                        hashedIndices.Append(row);
                        matchedHashedRows?.Add(row);
                        matched = true;
                    }
                }
            }

            // Also covers a key that exists in the hash but whose candidates were all rejected by the
            // null check; such a row must not vanish from an outer join.
            if (!matched && keepUnmatchedProbeRows)
            {
                probeIndices.Append(i);
                hashedIndices.Append(null);
            }
        }
    }

    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
            // Hash the right key column, probe with the left one, keep every left row.
            HashJoin(this[leftJoinColumn], other[rightJoinColumn], leftRowIndices, rightRowIndices, true, null);
            break;

        case JoinAlgorithm.Right:
            // Hash the left key column, probe with the right one, keep every right row.
            HashJoin(other[rightJoinColumn], this[leftJoinColumn], rightRowIndices, leftRowIndices, true, null);
            break;

        case JoinAlgorithm.Inner:
            // Hash the column with the smaller RowCount. Indices are still routed to the side they
            // belong to, so the result always lists this DataFrame's columns first.
            if (RowCount < other.RowCount)
            {
                HashJoin(other[rightJoinColumn], this[leftJoinColumn], rightRowIndices, leftRowIndices, false, null);
            }
            else
            {
                HashJoin(this[leftJoinColumn], other[rightJoinColumn], leftRowIndices, rightRowIndices, false, null);
            }

            break;

        case JoinAlgorithm.FullOuter:
        {
            // Step 1: a left join that remembers which right rows were consumed.
            BaseColumn rightKeys = other[rightJoinColumn];
            HashSet<long> matchedRightRows = new HashSet<long>();
            HashJoin(this[leftJoinColumn], rightKeys, leftRowIndices, rightRowIndices, true, matchedRightRows);

            // Step 2: add the right rows that never found a partner. Tracking rows rather than key
            // values keeps a null key and a real default(TKey) value from hiding each other.
            for (long i = 0; i < rightKeys.Length; i++)
            {
                if (!matchedRightRows.Contains(i))
                {
                    leftRowIndices.Append(null);
                    rightRowIndices.Append(i);
                }
            }

            break;
        }

        default:
            throw new NotImplementedException(nameof(joinAlgorithm));
    }

    DataFrame ret = new DataFrame();
    for (int i = 0; i < ColumnCount; i++)
    {
        ret.InsertColumn(i, Column(i).Clone(leftRowIndices));
    }

    for (int i = 0; i < other.ColumnCount; i++)
    {
        BaseColumn column = other.Column(i).Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, column);
    }

    return ret;
}
/// <summary>
/// Merge DataFrames with a database style join on one or more key columns.
/// </summary>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumns">Names of the key columns in this DataFrame.</param>
/// <param name="rightJoinColumns">Names of the key columns in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter.</param>
/// <returns>A new DataFrame holding this DataFrame's columns followed by <paramref name="other"/>'s columns, cloned through the joined row indices.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the four supported values.</exception>
public DataFrame Merge(DataFrame other, string[] leftJoinColumns, string[] rightJoinColumns, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // In an outer join the joined dataframe retains each row, even if no matching row exists in the supplementary dataframe.
    // Outer joins subdivide into left outer joins (left dataframe is retained), right outer joins (right dataframe is retained)
    // and full outer joins (both are retained).
    bool isLeftDataFrameRetained;
    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
        case JoinAlgorithm.FullOuter:
            // A full outer join starts out as a left join (see below).
            isLeftDataFrameRetained = true;
            break;
        case JoinAlgorithm.Right:
            isLeftDataFrameRetained = false;
            break;
        case JoinAlgorithm.Inner:
            // Use as supplementary (for hashing) the dataframe with the smaller row count.
            isLeftDataFrameRetained = Rows.Count > other.Rows.Count;
            break;
        default:
            throw new NotImplementedException(nameof(joinAlgorithm));
    }

    DataFrame retainedDataFrame = isLeftDataFrameRetained ? this : other;
    DataFrame supplementaryDataFrame = isLeftDataFrameRetained ? other : this;
    var retainedJoinColumns = isLeftDataFrameRetained ? leftJoinColumns : rightJoinColumns;
    var supplementaryJoinColumns = isLeftDataFrameRetained ? rightJoinColumns : leftJoinColumns;

    PrimitiveDataFrameColumn<long> retainedRowIndices;
    PrimitiveDataFrameColumn<long> supplementaryRowIndices;

    if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        // In a full outer join data from both sides is retained, so it is done in 2 steps:
        // first a LEFT JOIN, then the rows lost from the RIGHT side are added.

        // Step 1: LEFT JOIN, asking the helper for the supplementary rows it matched.
        var intersection = Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices, calculateIntersection: true);

        // Step 2: retain the remaining supplementary rows (skipping the intersection from step 1 to avoid duplicates).
        // The key columns do not change while iterating, so they are resolved once instead of once per row.
        var columns = supplementaryJoinColumns.Select(name => supplementaryDataFrame.Columns[name]).ToArray();
        for (long i = 0; i < supplementaryDataFrame.Columns.RowCount; i++)
        {
            if (!IsAnyNullValueInColumns(columns, i) && !intersection.Contains(i))
            {
                retainedRowIndices.Append(null);
                supplementaryRowIndices.Append(i);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices, true);
    }
    else
    {
        // Left or Right outer join.
        Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices);
    }

    // Translate retained/supplementary back to left/right so the output column order never depends on which side was hashed.
    PrimitiveDataFrameColumn<long> leftRowIndices = isLeftDataFrameRetained ? retainedRowIndices : supplementaryRowIndices;
    PrimitiveDataFrameColumn<long> rightRowIndices = isLeftDataFrameRetained ? supplementaryRowIndices : retainedRowIndices;

    DataFrame ret = new DataFrame();

    // Insert columns from the left dataframe (this).
    for (int i = 0; i < this.Columns.Count; i++)
    {
        ret.Columns.Insert(i, this.Columns[i].Clone(leftRowIndices));
    }

    // Insert columns from the right dataframe (other).
    for (int i = 0; i < other.Columns.Count; i++)
    {
        DataFrameColumn column = other.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}
/// <summary>
/// Single-key Merge kept for backward compatibility. Each key column name is wrapped in a
/// one-element array and the call is forwarded to the Merge overload that takes arrays of join columns.
/// </summary>
/// <typeparam name="TKey">Not referenced by the body of this overload.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the key column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Passed through unchanged.</param>
/// <param name="rightSuffix">Passed through unchanged.</param>
/// <param name="joinAlgorithm">Passed through unchanged.</param>
/// <returns>Whatever the forwarded Merge call returns.</returns>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    string[] leftJoinColumns = new[] { leftJoinColumn };
    string[] rightJoinColumns = new[] { rightJoinColumn };

    DataFrame merged = Merge(other, leftJoinColumns, rightJoinColumns, leftSuffix, rightSuffix, joinAlgorithm);
    return merged;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join. The result is seeded with empty clones of all columns,
/// resized as needed, and filled one output row at a time through AppendForMerge.
/// </summary>
/// <typeparam name="TKey">Type the join-key values are cast to before they are looked up in the hash.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the key column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter; for any other value the seeded, empty result is returned untouched.</param>
/// <returns>A new DataFrame with this DataFrame's columns followed by <paramref name="other"/>'s columns.</returns>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // A simple hash join
    DataFrame ret = new DataFrame();
    PrimitiveColumn<long> noRows = new PrimitiveColumn<long>("Empty");

    // Create empty columns: first this frame's, then other's (with duplicate names suffixed).
    for (int c = 0; c < ColumnCount; c++)
    {
        ret.InsertColumn(ret.ColumnCount, Column(c).Clone(noRows));
    }

    for (int c = 0; c < other.ColumnCount; c++)
    {
        BaseColumn seeded = other.Column(c).Clone(noRows);
        SetSuffixForDuplicatedColumnNames(ret, seeded, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, seeded);
    }

    // The final table size is not known until runtime
    long rowNumber = 0;

    // Resizes every result column to the given length.
    void ResizeResult(long length)
    {
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(length);
        }
    }

    // Writes one output row pairing the given left/right source rows (-1 = no partner) and advances the row counter.
    void Emit(long leftRow, long rightRow)
    {
        AppendForMerge(ret, rowNumber++, this, other, leftRow, rightRow);
    }

    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
        {
            // First hash other dataframe on the rightJoinColumn
            BaseColumn hashed = other[rightJoinColumn];
            Dictionary<TKey, ICollection<long>> groups = hashed.GroupColumnValues<TKey>();

            // Go over the records in this dataframe and match with the dictionary
            BaseColumn probe = this[leftJoinColumn];
            ResizeResult(probe.Length);
            for (long i = 0; i < probe.Length; i++)
            {
                if (rowNumber >= probe.Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                var probeValue = probe[i];
                TKey key = (TKey)(probeValue ?? default(TKey));
                bool probeIsNull = probeValue == null;
                if (!groups.TryGetValue(key, out ICollection<long> candidates))
                {
                    Emit(i, -1);
                    continue;
                }

                foreach (long row in candidates)
                {
                    // Nulls match only nulls; non-nulls cannot match nulls.
                    if ((hashed[row] == null) == probeIsNull)
                    {
                        Emit(i, row);
                    }
                }
            }

            ret._table.RowCount = rowNumber;
            break;
        }

        case JoinAlgorithm.Right:
        {
            BaseColumn hashed = this[leftJoinColumn];
            Dictionary<TKey, ICollection<long>> groups = hashed.GroupColumnValues<TKey>();
            BaseColumn probe = other[rightJoinColumn];
            ResizeResult(probe.Length);
            for (long i = 0; i < probe.Length; i++)
            {
                if (rowNumber >= probe.Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                var probeValue = probe[i];
                TKey key = (TKey)(probeValue ?? default(TKey));
                bool probeIsNull = probeValue == null;
                if (!groups.TryGetValue(key, out ICollection<long> candidates))
                {
                    Emit(-1, i);
                    continue;
                }

                foreach (long row in candidates)
                {
                    if ((hashed[row] == null) == probeIsNull)
                    {
                        Emit(row, i);
                    }
                }
            }

            ret._table.RowCount = rowNumber;
            break;
        }

        case JoinAlgorithm.Inner:
        {
            // Hash the column with the smaller RowCount
            long leftRowCount = RowCount;
            long rightRowCount = other.RowCount;
            bool leftHasFewerRows = leftRowCount < rightRowCount;
            DataFrame longerDataFrame = leftHasFewerRows ? other : this;
            DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
            BaseColumn hashed = leftHasFewerRows ? this[leftJoinColumn] : other[rightJoinColumn];
            BaseColumn probe = ReferenceEquals(hashed, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
            Dictionary<TKey, ICollection<long>> groups = hashed.GroupColumnValues<TKey>();

            // Decides which of (hashed row, probe row) is the left index when emitting.
            bool thisIsShorter = ReferenceEquals(this, shorterDataFrame);

            ResizeResult(1);
            for (long i = 0; i < probe.Length; i++)
            {
                if (rowNumber >= ret.Column(0).Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                var probeValue = probe[i];
                TKey key = (TKey)(probeValue ?? default(TKey));
                bool probeIsNull = probeValue == null;
                if (!groups.TryGetValue(key, out ICollection<long> candidates))
                {
                    continue;
                }

                foreach (long row in candidates)
                {
                    if ((hashed[row] == null) == probeIsNull)
                    {
                        if (thisIsShorter)
                        {
                            Emit(row, i);
                        }
                        else
                        {
                            Emit(i, row);
                        }
                    }
                }
            }

            ret._table.RowCount = rowNumber;
            break;
        }

        case JoinAlgorithm.FullOuter:
        {
            BaseColumn hashed = other[rightJoinColumn];
            Dictionary<TKey, ICollection<long>> groups = hashed.GroupColumnValues<TKey>();
            Dictionary<TKey, long> intersection = new Dictionary<TKey, long>(EqualityComparer<TKey>.Default);

            // Go over the records in this dataframe and match with the dictionary
            BaseColumn probe = this[leftJoinColumn];
            ResizeResult(probe.Length + 1);
            for (long i = 0; i < probe.Length; i++)
            {
                if (rowNumber >= probe.Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                var probeValue = probe[i];
                TKey key = (TKey)(probeValue ?? default(TKey));
                bool probeIsNull = probeValue == null;
                if (!groups.TryGetValue(key, out ICollection<long> candidates))
                {
                    Emit(i, -1);
                    continue;
                }

                foreach (long row in candidates)
                {
                    // Nulls have to match only nulls; non-nulls cannot match nulls.
                    if ((hashed[row] == null) == probeIsNull)
                    {
                        Emit(i, row);

                        // Remember the key so the second pass skips right rows carrying it.
                        if (!intersection.ContainsKey(key))
                        {
                            intersection.Add(key, rowNumber);
                        }
                    }
                }
            }

            // Second pass: right rows whose key never matched get a row of their own.
            for (long i = 0; i < hashed.Length; i++)
            {
                if (rowNumber >= ret.Column(0).Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                TKey key = (TKey)(hashed[i] ?? default(TKey));
                if (intersection.ContainsKey(key))
                {
                    continue;
                }

                if (rowNumber >= hashed.Length)
                {
                    ResizeResult(rowNumber + 1);
                }

                Emit(-1, i);
            }

            ret._table.RowCount = rowNumber;
            break;
        }
    }

    return ret;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join (hash join on a single key column per side).
/// Null keys are never looked up in the hash; they are paired with the null-key rows of the other side.
/// </summary>
/// <typeparam name="TKey">Type the non-null join-key values are cast to before they are looked up in the hash.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the key column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Forwarded to SetSuffixForDuplicatedColumnNames for every column taken from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter.</param>
/// <returns>
/// A new DataFrame holding this DataFrame's columns followed by <paramref name="other"/>'s columns, each cloned
/// through the row-index pairs produced by the join. A null index marks a row without a partner.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the four supported values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // A simple hash join. The final table size is not known until runtime, so row-index pairs
    // are collected first and the columns are materialized once at the end.
    PrimitiveDataFrameColumn<long> leftRowIndices = new PrimitiveDataFrameColumn<long>("LeftIndices");
    PrimitiveDataFrameColumn<long> rightRowIndices = new PrimitiveDataFrameColumn<long>("RightIndices");

    // One-sided outer join: hash `hashedColumn`, probe it with every row of `retainedColumn`
    // and keep every retained row, pairing it with null when it has no partner.
    void OuterJoin(
        DataFrameColumn retainedColumn,
        DataFrameColumn hashedColumn,
        PrimitiveDataFrameColumn<long> retainedIndices,
        PrimitiveDataFrameColumn<long> hashedIndices)
    {
        Dictionary<TKey, ICollection<long>> multimap = hashedColumn.GroupColumnValues<TKey>(out HashSet<long> hashedNullIndices);
        for (long i = 0; i < retainedColumn.Length; i++)
        {
            var retainedValue = retainedColumn[i];
            if (retainedValue != null)
            {
                if (multimap.TryGetValue((TKey)retainedValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        retainedIndices.Append(i);
                        hashedIndices.Append(row);
                    }
                }
                else
                {
                    retainedIndices.Append(i);
                    hashedIndices.Append(null);
                }
            }
            else if (hashedNullIndices.Count > 0)
            {
                // A null key pairs with every null key on the hashed side.
                foreach (long row in hashedNullIndices)
                {
                    retainedIndices.Append(i);
                    hashedIndices.Append(row);
                }
            }
            else
            {
                // No null keys on the hashed side: the retained row must still appear in the result,
                // exactly as the FullOuter branch below treats it.
                retainedIndices.Append(i);
                hashedIndices.Append(null);
            }
        }
    }

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // Hash other on rightJoinColumn, keep every row of this DataFrame.
        OuterJoin(Columns[leftJoinColumn], other.Columns[rightJoinColumn], leftRowIndices, rightRowIndices);
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Hash this DataFrame on leftJoinColumn, keep every row of other.
        OuterJoin(other.Columns[rightJoinColumn], Columns[leftJoinColumn], rightRowIndices, leftRowIndices);
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount
        bool leftColumnIsSmaller = Rows.Count <= other.Rows.Count;
        DataFrameColumn hashColumn = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn];
        DataFrameColumn probeColumn = leftColumnIsSmaller ? other.Columns[rightJoinColumn] : Columns[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>(out HashSet<long> hashColumnNullIndices);

        for (long i = 0; i < probeColumn.Length; i++)
        {
            var probeValue = probeColumn[i];
            if (probeValue != null)
            {
                if (multimap.TryGetValue((TKey)probeValue, out ICollection<long> rowNumbers))
                {
                    foreach (long row in rowNumbers)
                    {
                        // `row` belongs to the hashed side, `i` to the probing side.
                        leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                        rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                    }
                }
            }
            else
            {
                foreach (long nullIndex in hashColumnNullIndices)
                {
                    leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                    rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Keys that found at least one partner; right rows carrying them are already in the result.
        HashSet<TKey> matchedKeys = new HashSet<TKey>();

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        List<long> thisColumnNullIndices = new List<long>();
        for (long i = 0; i < thisColumn.Length; i++)
        {
            var thisColumnValue = thisColumn[i];
            if (thisColumnValue == null)
            {
                // Null rows are handled after both non-null passes.
                thisColumnNullIndices.Add(i);
                continue;
            }

            TKey key = (TKey)thisColumnValue;
            if (multimap.TryGetValue(key, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                    matchedKeys.Add(key);
                }
            }
            else
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Right rows with a non-null key that no left row carried.
        for (long i = 0; i < otherColumn.Length; i++)
        {
            var value = otherColumn[i];
            if (value != null && !matchedKeys.Contains((TKey)value))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }

        // Now handle the null rows
        foreach (long thisColumnNullIndex in thisColumnNullIndices)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(thisColumnNullIndex);
                rightRowIndices.Append(otherColumnNullIndex);
            }

            if (otherColumnNullIndices.Count == 0)
            {
                leftRowIndices.Append(thisColumnNullIndex);
                rightRowIndices.Append(null);
            }
        }

        if (thisColumnNullIndices.Count == 0)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(otherColumnNullIndex);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    DataFrame ret = new DataFrame();
    for (int i = 0; i < Columns.Count; i++)
    {
        ret.Columns.Insert(i, Columns[i].Clone(leftRowIndices));
    }

    for (int i = 0; i < other.Columns.Count; i++)
    {
        DataFrameColumn column = other.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}