/// <summary>
/// Produces a copy of this column, optionally reordered through <paramref name="mapIndices"/>.
/// </summary>
/// <param name="mapIndices">
/// Row positions to read from this column, one per output row. When null, the underlying
/// container is cloned unchanged.
/// </param>
/// <param name="invertMapIndices">
/// When true, <paramref name="mapIndices"/> is consumed from its last entry to its first.
/// </param>
/// <returns>A new column that carries this column's name.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="mapIndices"/> is supplied and its length differs from this column's length.
/// </exception>
public PrimitiveColumn<T> Clone(PrimitiveColumn<long> mapIndices = null, bool invertMapIndices = false)
{
    // No map: a plain container copy is enough.
    if (mapIndices is null)
    {
        return new PrimitiveColumn<T>(Name, _columnContainer.Clone());
    }

    if (mapIndices.Length != Length)
    {
        throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(mapIndices));
    }

    PrimitiveColumn<T> reordered = new PrimitiveColumn<T>(Name);
    long count = mapIndices.Length;
    for (long step = 0; step < count; step++)
    {
        // Forward order walks 0..count-1; inverted order walks count-1..0.
        long mapSlot = invertMapIndices ? count - 1 - step : step;
        reordered.Append(_columnContainer[mapIndices._columnContainer[mapSlot]]);
    }

    return reordered;
}
/// <summary>
/// Clones this column via <c>Clone</c> (forwarding both arguments) and then appends
/// <c>NullCount</c> null entries to the clone.
/// </summary>
/// <param name="mapIndices">Forwarded unchanged to <c>Clone</c>.</param>
/// <param name="invertMapIndices">Forwarded unchanged to <c>Clone</c>.</param>
/// <returns>The clone with the extra null entries appended.</returns>
public override BaseColumn CloneAndAppendNulls(BaseColumn mapIndices = null, bool invertMapIndices = false)
{
    // A direct cast replaces the previous `as`: if Clone ever returns a different column type the
    // failure now surfaces here as an InvalidCastException instead of a NullReferenceException on
    // the next line.
    PrimitiveColumn<T> ret = (PrimitiveColumn<T>)Clone(mapIndices, invertMapIndices);
    ret.AppendMany(null, NullCount);
    return ret;
}
/// <summary>
/// Runs the cumulative-product computation over the rows named by <paramref name="rowIndices"/>.
/// </summary>
/// <param name="rowIndices">Row indices handed to the computation.</param>
/// <param name="inPlace">
/// When true the computation writes into this column; otherwise it writes into a clone.
/// </param>
/// <returns>The column that received the result (this column or the clone).</returns>
public override BaseColumn CumulativeProduct(IEnumerable<long> rowIndices, bool inPlace = false)
{
    PrimitiveColumn<T> target;
    if (inPlace)
    {
        target = this;
    }
    else
    {
        target = Clone();
    }

    PrimitiveColumnComputation<T>.Instance.CumulativeProduct(target._columnContainer, rowIndices);
    return target;
}
/// <summary>
/// Runs the cumulative-sum computation over this column's container.
/// </summary>
/// <param name="inPlace">
/// When true the computation writes into this column; otherwise it writes into a clone.
/// </param>
/// <returns>The column that received the result (this column or the clone).</returns>
public override BaseColumn CumulativeSum(bool inPlace = false)
{
    PrimitiveColumn<T> target;
    if (inPlace)
    {
        target = this;
    }
    else
    {
        target = Clone();
    }

    PrimitiveColumnComputation<T>.Instance.CumulativeSum(target._columnContainer);
    return target;
}
private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices) { List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count); // Sort each buffer first for (int b = 0; b < _columnContainer.Buffers.Count; b++) { ReadOnlyDataFrameBuffer <T> buffer = _columnContainer.Buffers[b]; int[] sortIndices = new int[buffer.Length]; for (int i = 0; i < buffer.Length; i++) { sortIndices[i] = i; } IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer); // Bug fix: QuickSort is not stable. When PrimitiveColumn has null values and default values, they move around List <int> nonNullSortIndices = new List <int>(); for (int i = 0; i < sortIndices.Length; i++) { if (IsValid(sortIndices[i] + b * ReadOnlyDataFrameBuffer <T> .MaxCapacity)) { nonNullSortIndices.Add(sortIndices[i]); } } bufferSortIndices.Add(nonNullSortIndices); } // Simple merge sort to build the full column's sort indices ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex) { T value = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]]; long rowIndex = bufferSortIndices[bufferIndex][startIndex] + bufferIndex * ReadOnlyDataFrameBuffer <T> .MaxCapacity; return(value, startIndex); } SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer); IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers; for (int i = 0; i < buffers.Count; i++) { ReadOnlyDataFrameBuffer <T> buffer = buffers[i]; if (bufferSortIndices[i].Count == 0) { // All nulls continue; } ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0); if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1)) { 
heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i)); } else { heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >() { (valueAndBufferIndex.Item2, i) });
/// <summary>
/// Compares every entry of this column, as a string, against the string form of
/// <paramref name="value"/>.
/// </summary>
/// <param name="value">Value whose <c>ToString()</c> result is compared with each entry.</param>
/// <returns>
/// A boolean column of the same name and length; an entry is true where the strings differ.
/// </returns>
public override PrimitiveColumn<bool> NotEquals<T>(T value)
{
    PrimitiveColumn<bool> mask = new PrimitiveColumn<bool>(Name, Length);

    // Convert once up front rather than once per row.
    string expected = value.ToString();

    long row = 0;
    while (row < Length)
    {
        string current = (string)this[row];
        mask[row] = !string.Equals(current, expected);
        row++;
    }

    return mask;
}
/// <summary>
/// Wraps <paramref name="column"/> in a new <c>PrimitiveColumn</c> and inserts it at
/// <paramref name="columnIndex"/>.
/// </summary>
/// <param name="columnIndex">Position to insert at; must lie in [0, current column count].</param>
/// <param name="column">Values for the new column.</param>
/// <param name="columnName">Name given to the new column.</param>
/// <param name="parent">Forwarded to the non-generic <c>InsertColumn</c> overload.</param>
/// <exception cref="ArgumentNullException"><paramref name="column"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is out of range.</exception>
public void InsertColumn<T>(int columnIndex, IEnumerable<T> column, string columnName, DataFrame parent) where T : unmanaged
{
    if (column is null)
    {
        throw new ArgumentNullException(nameof(column));
    }

    // The unsigned cast turns a negative index into a very large value, so one comparison
    // rejects both "below zero" and "past the end".
    bool indexOutOfRange = (uint)columnIndex > _columns.Count;
    if (indexOutOfRange)
    {
        throw new ArgumentOutOfRangeException(nameof(columnIndex));
    }

    BaseColumn wrapped = new PrimitiveColumn<T>(columnName, column);
    InsertColumn(columnIndex, wrapped, parent);
}
private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices) { List <int[]> bufferSortIndices = new List <int[]>(_columnContainer.Buffers.Count); // Sort each buffer first foreach (DataFrameBuffer <T> buffer in _columnContainer.Buffers) { var sortIndices = new int[buffer.Length]; for (int i = 0; i < buffer.Length; i++) { sortIndices[i] = i; } IntrospectiveSort(buffer.Span, buffer.Length, sortIndices, comparer); bufferSortIndices.Add(sortIndices); } // Simple merge sort to build the full column's sort indices ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex) { T value = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]]; long rowIndex = bufferSortIndices[bufferIndex][startIndex] + bufferIndex * _columnContainer.Buffers[0].MaxCapacity; while (!IsValid(rowIndex) && ++startIndex < bufferSortIndices[bufferIndex].Length) { value = _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][startIndex]]; rowIndex = startIndex + bufferIndex * _columnContainer.Buffers[0].MaxCapacity; } return(value, startIndex); } SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer); IList <DataFrameBuffer <T> > buffers = _columnContainer.Buffers; for (int i = 0; i < buffers.Count; i++) { DataFrameBuffer <T> buffer = buffers[i]; ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0); long columnIndex = valueAndBufferIndex.Item2 + i * bufferSortIndices[0].Length; if (columnIndex == Length) { // All nulls continue; } if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1)) { heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i)); } else { heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List 
<ValueTuple <int, int> >() { (valueAndBufferIndex.Item2, i) });
/// <summary>
/// Compares this column, entry by entry, with the string form of the entries of
/// <paramref name="column"/>.
/// </summary>
/// <param name="column">Column to compare against; must have the same length as this column.</param>
/// <returns>
/// A boolean column of the same name and length; an entry is true where the two strings differ.
/// A null entry in <paramref name="column"/> is treated as a null string, so it differs from every
/// non-null entry of this column and equals a null entry.
/// </returns>
/// <exception cref="ArgumentException">The two columns have different lengths.</exception>
public override PrimitiveColumn<bool> NotEquals(BaseColumn column)
{
    // TODO: Using indexing is VERY inefficient here. Each indexer call will find the "right" buffer and then return the value
    if (Length != column.Length)
    {
        throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column));
    }

    PrimitiveColumn<bool> ret = new PrimitiveColumn<bool>(Name, Length);
    for (long i = 0; i < Length; i++)
    {
        // The indexer yields null for a null entry; the null-conditional call keeps that case
        // from throwing a NullReferenceException.
        string otherValue = column[i]?.ToString();
        ret[i] = (string)this[i] != otherValue;
    }

    return ret;
}
/// <summary>
/// Produces a copy of this column, optionally selecting and reordering rows through
/// <paramref name="mapIndices"/>.
/// </summary>
/// <param name="mapIndices">
/// Row positions to read from this column, one per output row. When null, the underlying
/// container is cloned unchanged.
/// </param>
/// <param name="invertMapIndices">
/// When true, <paramref name="mapIndices"/> is consumed from its last entry to its first.
/// </param>
/// <returns>A new column that carries this column's name.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="mapIndices"/> has more entries than this column has rows.
/// </exception>
public PrimitiveColumn<T> Clone(PrimitiveColumn<long> mapIndices = null, bool invertMapIndices = false)
{
    if (mapIndices is null)
    {
        return new PrimitiveColumn<T>(Name, _columnContainer.Clone());
    }

    if (mapIndices.Length > Length)
    {
        throw new ArgumentException(Strings.MapIndicesExceedsColumnLenth, nameof(mapIndices));
    }

    long count = mapIndices.Length;
    PrimitiveColumn<T> result = new PrimitiveColumn<T>(Name, count);

    // The flag is switched off for the duration of the copy and NullCount is bumped by hand below;
    // presumably this stops the indexer from adjusting NullCount itself - confirm against the container.
    result._columnContainer._modifyNullCountWhileIndexing = false;

    for (long target = 0; target < count; target++)
    {
        // Forward order reads map slot `target`; inverted order reads from the far end.
        long mapSlot = invertMapIndices ? count - 1 - target : target;
        T? value = _columnContainer[mapIndices._columnContainer[mapSlot].Value];
        result[target] = value;
        if (!value.HasValue)
        {
            result._columnContainer.NullCount++;
        }
    }

    result._columnContainer._modifyNullCountWhileIndexing = true;
    return result;
}
/// <summary>
/// Computes sort indices for the whole column: each buffer is sorted on its own, then the
/// per-buffer results are merged through a sorted dictionary keyed by value.
/// </summary>
/// <param name="comparer">Ordering used for both the per-buffer sort and the merge.</param>
/// <param name="columnSortIndices">
/// Receives a new column named "SortIndices", filled by <c>PopulateColumnSortIndicesWithHeap</c>.
/// </param>
private void GetSortIndices(IComparer<T> comparer, out PrimitiveColumn<long> columnSortIndices)
{
    List<int[]> bufferSortIndices = new List<int[]>(_columnContainer.Buffers.Count);

    // Sort each buffer first
    foreach (DataFrameBuffer<T> buffer in _columnContainer.Buffers)
    {
        var sortIndices = new int[buffer.Length];
        for (int i = 0; i < buffer.Length; i++)
        {
            sortIndices[i] = i;
        }

        IntrospectiveSort(buffer.Span, buffer.Length, sortIndices, comparer);
        bufferSortIndices.Add(sortIndices);
    }

    // Simple merge sort to build the full column's sort indices
    SortedDictionary<T, List<Tuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<T, List<Tuple<int, int>>>(comparer);
    IList<DataFrameBuffer<T>> buffers = _columnContainer.Buffers;
    for (int i = 0; i < buffers.Count; i++)
    {
        // An empty buffer has no smallest element to seed the merge with; indexing [0] would throw.
        if (bufferSortIndices[i].Length == 0)
        {
            continue;
        }

        DataFrameBuffer<T> buffer = buffers[i];
        T value = buffer[bufferSortIndices[i][0]];

        // Single lookup: fetch the bucket for this value, creating it on first sight.
        if (!heapOfValueAndListOfTupleOfSortAndBufferIndex.TryGetValue(value, out List<Tuple<int, int>> entries))
        {
            entries = new List<Tuple<int, int>>();
            heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(value, entries);
        }

        // Tuple layout: (position within the buffer's sort order, buffer index).
        entries.Add(new Tuple<int, int>(0, i));
    }

    columnSortIndices = new PrimitiveColumn<long>("SortIndices");
    GetBufferSortIndex getBufferSortIndex = new GetBufferSortIndex((int bufferIndex, int sortIndex) => bufferSortIndices[bufferIndex][sortIndex]);
    GetValueAtBuffer<T> getValueAtBuffer = new GetValueAtBuffer<T>((int bufferIndex, int sortIndex) => _columnContainer.Buffers[bufferIndex][bufferSortIndices[bufferIndex][sortIndex]]);
    GetBufferLengthAtIndex getBufferLengthAtIndex = new GetBufferLengthAtIndex((int bufferIndex) => bufferSortIndices[bufferIndex].Length);
    PopulateColumnSortIndicesWithHeap(heapOfValueAndListOfTupleOfSortAndBufferIndex, columnSortIndices, getBufferSortIndex, getValueAtBuffer, getBufferLengthAtIndex);
}
private void GetSortIndices(IComparer <T> comparer, out PrimitiveColumn <long> columnSortIndices) { List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count); // Sort each buffer first for (int b = 0; b < _columnContainer.Buffers.Count; b++) { ReadOnlyDataFrameBuffer <T> buffer = _columnContainer.Buffers[b]; ReadOnlySpan <byte> nullBitMapSpan = _columnContainer.NullBitMapBuffers[b].ReadOnlySpan; int[] sortIndices = new int[buffer.Length]; for (int i = 0; i < buffer.Length; i++) { sortIndices[i] = i; } IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer); // Bug fix: QuickSort is not stable. When PrimitiveColumn has null values and default values, they move around List <int> nonNullSortIndices = new List <int>(); for (int i = 0; i < sortIndices.Length; i++) { if (_columnContainer.IsValid(nullBitMapSpan, sortIndices[i])) { nonNullSortIndices.Add(sortIndices[i]); } } bufferSortIndices.Add(nonNullSortIndices); } // Simple merge sort to build the full column's sort indices ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex) { int index = bufferSortIndices[bufferIndex][startIndex]; T value; ReadOnlyMemory <byte> buffer = _columnContainer.Buffers[bufferIndex].ReadOnlyBuffer; ReadOnlyMemory <T> typedBuffer = Unsafe.As <ReadOnlyMemory <byte>, ReadOnlyMemory <T> >(ref buffer); if (!typedBuffer.IsEmpty) { bool isArray = MemoryMarshal.TryGetArray(typedBuffer, out ArraySegment <T> arraySegment); if (isArray) { value = arraySegment.Array[index + arraySegment.Offset]; } else { value = _columnContainer.Buffers[bufferIndex][index]; } } else { value = _columnContainer.Buffers[bufferIndex][index]; } return(value, startIndex); } SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer); IList <ReadOnlyDataFrameBuffer <T> > buffers = _columnContainer.Buffers; for (int 
i = 0; i < buffers.Count; i++) { ReadOnlyDataFrameBuffer <T> buffer = buffers[i]; if (bufferSortIndices[i].Count == 0) { // All nulls continue; } ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0); if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1)) { heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i)); } else { heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >() { (valueAndBufferIndex.Item2, i) });
/// <summary>
/// Returns a sorted copy of this column.
/// </summary>
/// <param name="ascending">
/// True for ascending order; false walks the ascending sort indices in reverse.
/// </param>
/// <returns>A clone built from the sort indices, with <c>NullCount</c> passed as the third Clone argument.</returns>
public override BaseColumn Sort(bool ascending = true)
{
    // Direct cast instead of `as`: a null map makes Clone copy the column without reordering, so a
    // failed `as` would silently hand back unsorted data. The cast fails loudly instead.
    PrimitiveColumn<long> sortIndices = (PrimitiveColumn<long>)GetAscendingSortIndices();
    return Clone(sortIndices, !ascending, NullCount);
}
/// <summary>
/// Builds a DataFrame from delimited text, reading the stream twice: once to infer the schema
/// and count rows, once to fill the columns.
/// Follows pandas API.
/// </summary>
/// <param name="createStream">Factory invoked once per pass to obtain a fresh reader.</param>
/// <param name="separator">Character that separates the fields of a line.</param>
/// <param name="header">Whether the first line holds column names rather than data.</param>
/// <param name="columnNames">Column names; when null they come from the header line, or default to "Column" + index.</param>
/// <param name="dataTypes">Column types (can be empty). Not consulted by this implementation.</param>
/// <param name="numberOfRowsToRead">Number of data rows to read, not counting the header; -1 reads everything.</param>
/// <param name="guessRows">Number of rows sampled for type guessing.</param>
/// <param name="addIndexColumn">Whether to prepend an "IndexColumn" holding the row index.</param>
/// <returns>DataFrame</returns>
public static DataFrame ReadStream(Func<StreamReader> createStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    var sampleRows = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = 0;

    // The header line is counted by rowline, so it has to be counted by the limit as well.
    if (header && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    // First pass: schema and number of rows.
    using (StreamReader reader = createStream())
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
            if (withinRowLimit && sampleRows.Count < guessRows)
            {
                string[] fields = line.Split(separator);
                if (header && rowline == 0)
                {
                    // Caller-supplied names win over the header line.
                    if (columnNames == null)
                    {
                        columnNames = fields;
                    }
                }
                else
                {
                    sampleRows.Add(fields);
                    numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                }
            }

            ++rowline;
            if (rowline == numberOfRowsToRead)
            {
                break;
            }
        }
    }

    if (sampleRows.Count == 0)
    {
        throw new FormatException(Strings.EmptyFile);
    }

    // Every column is pre-sized to the number of data rows counted above.
    long dataRowCount = header ? rowline - 1 : rowline;
    List<BaseColumn> columns = new List<BaseColumn>(numberOfColumns);

    // Guesses types and adds columns.
    for (int i = 0; i < numberOfColumns; ++i)
    {
        Type kind = GuessKind(i, sampleRows);
        bool isBool = kind == typeof(bool);
        bool isFloat = kind == typeof(float);
        bool isString = kind == typeof(string);
        if (!isBool && !isFloat && !isString)
        {
            throw new NotSupportedException(nameof(kind));
        }

        string name = columnNames == null ? "Column" + i.ToString() : columnNames[i];
        BaseColumn column;
        if (isBool)
        {
            column = new PrimitiveColumn<bool>(name, dataRowCount);
        }
        else if (isFloat)
        {
            column = new PrimitiveColumn<float>(name, dataRowCount);
        }
        else
        {
            column = new StringColumn(name, dataRowCount);
        }

        columns.Add(column);
    }

    // Fills values.
    using (StreamReader reader = createStream())
    {
        rowline = 0;
        for (string line = reader.ReadLine();
             line != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead);
             line = reader.ReadLine())
        {
            string[] fields = line.Split(separator);
            bool isHeaderLine = header && rowline == 0;
            if (!isHeaderLine)
            {
                // With a header, data row k sits on line k + 1.
                AppendRow(columns, header ? rowline - 1 : rowline, fields);
            }

            ++rowline;
        }
    }

    if (addIndexColumn)
    {
        long indexLength = columns[0].Length;
        PrimitiveColumn<int> indexColumn = new PrimitiveColumn<int>("IndexColumn", indexLength);
        for (int i = 0; i < indexLength; i++)
        {
            indexColumn[i] = i;
        }

        columns.Insert(0, indexColumn);
    }

    return new DataFrame(columns);
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join.
/// The result starts as empty clones of every column of this DataFrame followed by every column of
/// <paramref name="other"/>; rows are then emitted one at a time through AppendForMerge while a
/// running row counter tracks the final size.
/// </summary>
/// <typeparam name="TKey">Type the join-column values are cast to and hashed by.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame (looked up as this[leftJoinColumn]).</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/> (looked up as other[rightJoinColumn]).</param>
/// <param name="leftSuffix">Passed to SetSuffixForDuplicatedColumnNames when columns of <paramref name="other"/> are added.</param>
/// <param name="rightSuffix">Passed to SetSuffixForDuplicatedColumnNames when columns of <paramref name="other"/> are added.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter. Any other value skips all four branches and returns the result with no rows appended.</param>
/// <returns>The newly built DataFrame.</returns>
public DataFrame Merge <TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // A simple hash join
    DataFrame ret = new DataFrame();
    PrimitiveColumn <long> emptyMap = new PrimitiveColumn <long>("Empty");
    for (int i = 0; i < ColumnCount; i++)
    {
        // Create empty columns
        BaseColumn column = Column(i).Clone(emptyMap);
        ret.InsertColumn(ret.ColumnCount, column);
    }
    for (int i = 0; i < other.ColumnCount; i++)
    {
        // Create empty columns
        BaseColumn column = other.Column(i).Clone(emptyMap);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, column);
    }
    // The final table size is not known until runtime
    long rowNumber = 0;
    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // First hash other dataframe on the rightJoinColumn
        BaseColumn otherColumn = other[rightJoinColumn];
        Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();
        // Go over the records in this dataframe and match with the dictionary
        BaseColumn thisColumn = this[leftJoinColumn];
        // Start with one output slot per left row; grown below once rowNumber reaches that size.
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(thisColumn.Length);
        }
        for (long i = 0; i < thisColumn.Length; i++)
        {
            // NOTE(review): capacity is checked once per left row, but the foreach below can emit
            // several rows for that one left row - confirm AppendForMerge copes with that.
            if (rowNumber >= thisColumn.Length)
            {
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(rowNumber + 1);
                }
            }
            // A null key is looked up as default(TKey); the null checks below keep nulls and
            // genuine default values from matching each other.
            TKey value = (TKey)(thisColumn[i] ?? default(TKey));
            if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
            {
                // NOTE(review): if the key is present but every candidate fails the null check,
                // no row is emitted for left row i - confirm this is intended for a left join.
                foreach (long row in rowNumbers)
                {
                    if (thisColumn[i] == null)
                    {
                        // Match only with nulls in otherColumn
                        if (otherColumn[row] == null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, i, row);
                        }
                    }
                    else
                    {
                        // Cannot match nulls in otherColumn
                        if (otherColumn[row] != null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, i, row);
                        }
                    }
                }
            }
            else
            {
                // No key on the right: emit the left row with -1 as the right row index.
                AppendForMerge(ret, rowNumber++, this, other, i, -1);
            }
        }
        ret._table.RowCount = rowNumber;
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Mirror image of the Left branch: hash this DataFrame, iterate over other.
        BaseColumn thisColumn = this[leftJoinColumn];
        Dictionary <TKey, ICollection <long> > multimap = thisColumn.GroupColumnValues <TKey>();
        BaseColumn otherColumn = other[rightJoinColumn];
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(otherColumn.Length);
        }
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (rowNumber >= otherColumn.Length)
            {
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(rowNumber + 1);
                }
            }
            TKey value = (TKey)(otherColumn[i] ?? default(TKey));
            if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    if (otherColumn[i] == null)
                    {
                        // A null right key pairs only with null left keys.
                        if (thisColumn[row] == null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, row, i);
                        }
                    }
                    else
                    {
                        // A non-null right key never pairs with a null left key.
                        if (thisColumn[row] != null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, row, i);
                        }
                    }
                }
            }
            else
            {
                // No key on the left: emit the right row with -1 as the left row index.
                AppendForMerge(ret, rowNumber++, this, other, -1, i);
            }
        }
        ret._table.RowCount = rowNumber;
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount
        long leftRowCount = RowCount;
        long rightRowCount = other.RowCount;
        // On a tie, this DataFrame counts as the longer one and other is hashed.
        DataFrame longerDataFrame = leftRowCount < rightRowCount ? other : this;
        DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
        BaseColumn hashColumn = (leftRowCount < rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
        // otherColumn is the join column that was NOT hashed; it is the one iterated below.
        BaseColumn otherColumn = ReferenceEquals(hashColumn, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
        Dictionary <TKey, ICollection <long> > multimap = hashColumn.GroupColumnValues <TKey>();
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(1);
        }
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (rowNumber >= ret.Column(0).Length)
            {
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(rowNumber + 1);
                }
            }
            TKey value = (TKey)(otherColumn[i] ?? default(TKey));
            if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    // `row` indexes the hashed (shorter) side and `i` the iterated side; the
                    // ReferenceEquals ternaries put each one in the left/right argument slot.
                    if (otherColumn[i] == null)
                    {
                        if (hashColumn[row] == null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                        }
                    }
                    else
                    {
                        if (hashColumn[row] != null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, ReferenceEquals(this, shorterDataFrame) ? row : i, ReferenceEquals(this, shorterDataFrame) ? i : row);
                        }
                    }
                }
            }
        }
        ret._table.RowCount = rowNumber;
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        BaseColumn otherColumn = other[rightJoinColumn];
        Dictionary <TKey, ICollection <long> > multimap = otherColumn.GroupColumnValues <TKey>();
        // Keys that produced at least one matched row. Only ContainsKey is used on it in this
        // method; the stored row number is never read back here.
        Dictionary <TKey, long> intersection = new Dictionary <TKey, long>(EqualityComparer <TKey> .Default);
        // Go over the records in this dataframe and match with the dictionary
        BaseColumn thisColumn = this[leftJoinColumn];
        for (int c = 0; c < ret.ColumnCount; c++)
        {
            ret.Column(c).Resize(thisColumn.Length + 1);
        }
        for (long i = 0; i < thisColumn.Length; i++)
        {
            if (rowNumber >= thisColumn.Length)
            {
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(rowNumber + 1);
                }
            }
            TKey value = (TKey)(thisColumn[i] ?? default(TKey));
            if (multimap.TryGetValue(value, out ICollection <long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    if (thisColumn[i] == null)
                    {
                        // Has to match only with nulls in otherColumn
                        if (otherColumn[row] == null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, i, row);
                            if (!intersection.ContainsKey(value))
                            {
                                intersection.Add(value, rowNumber);
                            }
                        }
                    }
                    else
                    {
                        // Cannot match to nulls in otherColumn
                        if (otherColumn[row] != null)
                        {
                            AppendForMerge(ret, rowNumber++, this, other, i, row);
                            if (!intersection.ContainsKey(value))
                            {
                                intersection.Add(value, rowNumber);
                            }
                        }
                    }
                }
            }
            else
            {
                AppendForMerge(ret, rowNumber++, this, other, i, -1);
            }
        }
        // Second sweep: emit right rows whose key never matched in the first sweep.
        // NOTE(review): a null right key is looked up as default(TKey) here, so it is skipped
        // whenever a non-null default-valued key matched above - confirm this is intended.
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (rowNumber >= ret.Column(0).Length)
            {
                for (int c = 0; c < ret.ColumnCount; c++)
                {
                    ret.Column(c).Resize(rowNumber + 1);
                }
            }
            TKey value = (TKey)(otherColumn[i] ?? default(TKey));
            if (!intersection.ContainsKey(value))
            {
                if (rowNumber >= otherColumn.Length)
                {
                    for (int c = 0; c < ret.ColumnCount; c++)
                    {
                        ret.Column(c).Resize(rowNumber + 1);
                    }
                }
                AppendForMerge(ret, rowNumber++, this, other, -1, i);
            }
        }
        ret._table.RowCount = rowNumber;
    }
    return(ret);
}
/// <summary>
/// Returns a sorted copy of this column, built by <c>CloneAndAppendNulls</c> from the ascending
/// sort indices.
/// </summary>
/// <param name="ascending">
/// True for ascending order; false walks the ascending sort indices in reverse.
/// </param>
/// <returns>The column produced by <c>CloneAndAppendNulls</c>.</returns>
public override BaseColumn Sort(bool ascending = true)
{
    // Direct cast instead of `as`: the map parameter defaults to null, so a failed `as` would be
    // indistinguishable from "no map supplied". The cast fails loudly on a type mismatch instead.
    PrimitiveColumn<long> sortIndices = (PrimitiveColumn<long>)GetAscendingSortIndices();
    return CloneAndAppendNulls(sortIndices, !ascending);
}
/// <summary>
/// Builds a DataFrame with one column per array of an Arrow record batch. Each column is
/// constructed directly from the array's value and null-bitmap memory.
/// </summary>
/// <param name="recordBatch">Record batch whose schema and arrays define the columns.</param>
/// <exception cref="NotImplementedException">
/// A field has an Arrow type with no conversion here; the message is the Arrow type's name.
/// </exception>
public DataFrame(RecordBatch recordBatch)
{
    _table = new DataFrameTable();
    Apache.Arrow.Schema arrowSchema = recordBatch.Schema;
    int fieldIndex = 0;
    IEnumerable<IArrowArray> arrowArrays = recordBatch.Arrays;
    foreach (IArrowArray arrowArray in arrowArrays)
    {
        // Arrays and schema fields are matched up by position.
        Field field = arrowSchema.GetFieldByIndex(fieldIndex);
        IArrowType fieldType = field.DataType;
        BaseColumn dataFrameColumn = null;
        switch (fieldType.TypeId)
        {
            case ArrowTypeId.Boolean:
                // BooleanArray is handled apart from the generic helper, as in the original cast.
                BooleanArray arrowBooleanArray = (BooleanArray)arrowArray;
                ReadOnlyMemory<byte> valueBuffer = arrowBooleanArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> nullBitMapBuffer = arrowBooleanArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new PrimitiveColumn<bool>(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Double:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<double>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Float:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<float>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int8:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<sbyte>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int16:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<short>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int32:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<int>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int64:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<long>(field.Name, arrowArray);
                break;
            case ArrowTypeId.String:
                StringArray stringArray = (StringArray)arrowArray;
                ReadOnlyMemory<byte> dataMemory = stringArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                ReadOnlyMemory<byte> nullMemory = stringArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ArrowStringColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                break;
            case ArrowTypeId.UInt8:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<byte>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt16:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<ushort>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt32:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<uint>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt64:
                dataFrameColumn = CreatePrimitiveColumnFromArrow<ulong>(field.Name, arrowArray);
                break;
            // Listed explicitly to document which Arrow types are known but not yet supported.
            case ArrowTypeId.Decimal:
            case ArrowTypeId.Binary:
            case ArrowTypeId.Date32:
            case ArrowTypeId.Date64:
            case ArrowTypeId.Dictionary:
            case ArrowTypeId.FixedSizedBinary:
            case ArrowTypeId.HalfFloat:
            case ArrowTypeId.Interval:
            case ArrowTypeId.List:
            case ArrowTypeId.Map:
            case ArrowTypeId.Null:
            case ArrowTypeId.Struct:
            case ArrowTypeId.Time32:
            case ArrowTypeId.Time64:
            default:
                // Report the actual Arrow type name; nameof(fieldType.Name) only ever produced "Name".
                throw new NotImplementedException(fieldType.Name);
        }

        _table.InsertColumn(ColumnCount, dataFrameColumn);
        fieldIndex++;
    }
}

/// <summary>
/// Casts <paramref name="arrowArray"/> to <c>PrimitiveArray&lt;T&gt;</c> and builds a
/// <c>PrimitiveColumn&lt;T&gt;</c> from its value and null-bitmap memory.
/// </summary>
private static PrimitiveColumn<T> CreatePrimitiveColumnFromArrow<T>(string name, IArrowArray arrowArray)
    where T : unmanaged
{
    PrimitiveArray<T> typedArray = (PrimitiveArray<T>)arrowArray;
    ReadOnlyMemory<byte> valueBuffer = typedArray.ValueBuffer.Memory;
    ReadOnlyMemory<byte> nullBitMapBuffer = typedArray.NullBitmapBuffer.Memory;
    return new PrimitiveColumn<T>(name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
}
/// <summary>
/// Joins this DataFrame with <paramref name="other"/> by row position: row k of one side is
/// placed next to row k of the other. The join kind decides whether the shorter side is padded
/// with nulls or the longer side is cut down.
/// </summary>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftSuffix">Passed to SetSuffixForDuplicatedColumnNames for columns coming from <paramref name="other"/>.</param>
/// <param name="rightSuffix">Passed to SetSuffixForDuplicatedColumnNames for columns coming from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">Left, Right, FullOuter or Inner. Any other value yields a DataFrame with no columns.</param>
/// <returns>The newly built DataFrame.</returns>
public DataFrame Join(DataFrame other, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // Builds the map 0, 1, ..., length - 1 used to keep only the first `length` rows of a column.
    PrimitiveColumn<long> CreateIdentityMap(long length)
    {
        PrimitiveColumn<long> identity = new PrimitiveColumn<long>("mapIndices", length);
        for (long row = 0; row < length; row++)
        {
            identity[row] = row;
        }

        return identity;
    }

    DataFrame ret = new DataFrame();
    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
        {
            // Left columns are kept whole.
            for (int i = 0; i < ColumnCount; i++)
            {
                BaseColumn leftCopy = Column(i).Clone();
                ret.InsertColumn(ret.ColumnCount, leftCopy);
            }

            PrimitiveColumn<long> mapIndices = CreateIdentityMap(Math.Min(RowCount, other.RowCount));
            bool rightIsShorter = other.RowCount < RowCount;

            // Right columns are padded with nulls when shorter, otherwise cut to the mapped rows.
            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn rightCopy;
                if (rightIsShorter)
                {
                    rightCopy = other.Column(i).Clone(numberOfNullsToAppend: RowCount - other.RowCount);
                }
                else
                {
                    rightCopy = other.Column(i).Clone(mapIndices);
                }

                SetSuffixForDuplicatedColumnNames(ret, rightCopy, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, rightCopy);
            }

            break;
        }

        case JoinAlgorithm.Right:
        {
            PrimitiveColumn<long> mapIndices = CreateIdentityMap(Math.Min(RowCount, other.RowCount));
            bool leftIsShorter = RowCount < other.RowCount;

            // Left columns are padded with nulls when shorter, otherwise cut to the mapped rows.
            for (int i = 0; i < ColumnCount; i++)
            {
                BaseColumn leftCopy;
                if (leftIsShorter)
                {
                    leftCopy = Column(i).Clone(numberOfNullsToAppend: other.RowCount - RowCount);
                }
                else
                {
                    leftCopy = Column(i).Clone(mapIndices);
                }

                ret.InsertColumn(ret.ColumnCount, leftCopy);
            }

            // Right columns are kept whole.
            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn rightCopy = other.Column(i).Clone();
                SetSuffixForDuplicatedColumnNames(ret, rightCopy, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, rightCopy);
            }

            break;
        }

        case JoinAlgorithm.FullOuter:
        {
            // Both sides are padded with nulls up to the longer row count.
            long newRowCount = Math.Max(RowCount, other.RowCount);

            long leftPadding = newRowCount - RowCount;
            for (int i = 0; i < ColumnCount; i++)
            {
                BaseColumn leftCopy = Column(i).Clone(numberOfNullsToAppend: leftPadding);
                ret.InsertColumn(ret.ColumnCount, leftCopy);
            }

            long rightPadding = newRowCount - other.RowCount;
            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn rightCopy = other.Column(i).Clone(numberOfNullsToAppend: rightPadding);
                SetSuffixForDuplicatedColumnNames(ret, rightCopy, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, rightCopy);
            }

            break;
        }

        case JoinAlgorithm.Inner:
        {
            // Both sides are cut to the shorter row count.
            PrimitiveColumn<long> mapIndices = CreateIdentityMap(Math.Min(RowCount, other.RowCount));

            for (int i = 0; i < ColumnCount; i++)
            {
                BaseColumn leftCopy = Column(i).Clone(mapIndices);
                ret.InsertColumn(ret.ColumnCount, leftCopy);
            }

            for (int i = 0; i < other.ColumnCount; i++)
            {
                BaseColumn rightCopy = other.Column(i).Clone(mapIndices);
                SetSuffixForDuplicatedColumnNames(ret, rightCopy, leftSuffix, rightSuffix);
                ret.InsertColumn(ret.ColumnCount, rightCopy);
            }

            break;
        }
    }

    return ret;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join, implemented as a simple hash join on one key column per side.
/// </summary>
/// <typeparam name="TKey">Type of the values in the join columns; used as the hash key type.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the key column in this (left) DataFrame.</param>
/// <param name="rightJoinColumn">Name of the key column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Suffix passed to SetSuffixForDuplicatedColumnNames for left columns.</param>
/// <param name="rightSuffix">Suffix passed to SetSuffixForDuplicatedColumnNames for right columns.</param>
/// <param name="joinAlgorithm">Left, Right, Inner or FullOuter.</param>
/// <returns>
/// A new DataFrame with this DataFrame's columns first, followed by <paramref name="other"/>'s columns.
/// Rows that have no partner on the opposite side (outer joins only) get a null row index for that side.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the supported values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other is null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    DataFrame ret = new DataFrame();

    // The final table size is not known until runtime, so the row pairs are accumulated here.
    // A null entry means "no row on this side" for that output row.
    PrimitiveColumn<long> leftRowIndices = new PrimitiveColumn<long>("LeftIndices");
    PrimitiveColumn<long> rightRowIndices = new PrimitiveColumn<long>("RightIndices");

    // Probes the multimap (built from hashColumn) with probeColumn[probeRow] and appends one
    // (left, right) index pair per hashed row that really matches. Returns true when at least
    // one pair was appended. hashColumnIsLeft tells which side the hashed rows belong to;
    // matchedHashRows, when supplied, collects the hashed rows that found a partner.
    bool AppendMatches(Dictionary<TKey, ICollection<long>> multimap, BaseColumn hashColumn, BaseColumn probeColumn, long probeRow, bool hashColumnIsLeft, HashSet<long> matchedHashRows)
    {
        var probeValue = probeColumn[probeRow];
        bool probeIsNull = probeValue == null;
        // A null is looked up under default(TKey), so the bucket found may hold both null rows
        // and rows whose real value is default(TKey) (presumably GroupColumnValues buckets nulls
        // the same way - confirm against its implementation).
        TKey key = (TKey)(probeIsNull ? default(TKey) : probeValue);
        if (!multimap.TryGetValue(key, out ICollection<long> candidateRows))
        {
            return false;
        }

        bool matched = false;
        foreach (long row in candidateRows)
        {
            // Nulls match only nulls; non-null values never match nulls.
            if ((hashColumn[row] == null) != probeIsNull)
            {
                continue;
            }

            if (hashColumnIsLeft)
            {
                leftRowIndices.Append(row);
                rightRowIndices.Append(probeRow);
            }
            else
            {
                leftRowIndices.Append(probeRow);
                rightRowIndices.Append(row);
            }
            matchedHashRows?.Add(row);
            matched = true;
        }
        return matched;
    }

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // Hash the right column, then walk the left rows.
        BaseColumn otherColumn = other[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>();
        BaseColumn thisColumn = this[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            // Every left row must survive, including when the key was found in the multimap
            // but all candidates were rejected by the null/non-null filter.
            if (!AppendMatches(multimap, otherColumn, thisColumn, i, false, null))
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        // Hash the left column, then walk the right rows.
        BaseColumn thisColumn = this[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = thisColumn.GroupColumnValues<TKey>();
        BaseColumn otherColumn = other[rightJoinColumn];
        for (long i = 0; i < otherColumn.Length; i++)
        {
            // Every right row must survive, matched or not.
            if (!AppendMatches(multimap, thisColumn, otherColumn, i, true, null))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column of the frame with the smaller RowCount and probe with the other one.
        // The index pairs are still recorded as (left, right), so the output keeps this frame's
        // columns first and the suffixes land on the correct side regardless of which was hashed.
        BaseColumn thisColumn = this[leftJoinColumn];
        BaseColumn otherColumn = other[rightJoinColumn];
        bool hashLeft = RowCount < other.RowCount;
        BaseColumn hashColumn = hashLeft ? thisColumn : otherColumn;
        BaseColumn probeColumn = hashLeft ? otherColumn : thisColumn;
        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>();
        for (long i = 0; i < probeColumn.Length; i++)
        {
            AppendMatches(multimap, hashColumn, probeColumn, i, hashLeft, null);
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        BaseColumn otherColumn = other[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>();
        // Right rows that found a left partner, tracked by row index rather than by key so that
        // nulls and default(TKey) values are not confused with each other.
        HashSet<long> matchedRightRows = new HashSet<long>();

        // Go over the records in this dataframe and match with the dictionary
        BaseColumn thisColumn = this[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            if (!AppendMatches(multimap, otherColumn, thisColumn, i, false, matchedRightRows))
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Emit the right rows that never found a left partner.
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (!matchedRightRows.Contains(i))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    // Materialise the result: left columns first, then right columns (renamed on name clashes).
    for (int i = 0; i < ColumnCount; i++)
    {
        ret.InsertColumn(i, Column(i).Clone(leftRowIndices));
    }
    for (int i = 0; i < other.ColumnCount; i++)
    {
        BaseColumn column = other.Column(i).Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.InsertColumn(ret.ColumnCount, column);
    }
    return ret;
}