/// <summary>
/// Validates the join column name arrays, asks <c>GetOccurences</c> for the row matches between the
/// two DataFrames and hands them to <c>PerformMerging</c> to build the row index columns.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows drive the merge.</param>
/// <param name="supplementaryDataFrame">DataFrame whose rows are matched against the retained rows.</param>
/// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
/// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>; must have the same length as <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="retainedRowIndices">Receives the retained-side row indices produced by <c>PerformMerging</c>.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary-side row indices produced by <c>PerformMerging</c>.</param>
/// <param name="isInner">Forwarded unchanged to <c>PerformMerging</c>.</param>
/// <param name="calculateIntersection">Forwarded unchanged to <c>PerformMerging</c>.</param>
/// <returns>The value returned by <c>PerformMerging</c>.</returns>
/// <exception cref="ArgumentNullException">Either join column name array is null.</exception>
/// <exception cref="ArgumentException">The join column name arrays have different lengths.</exception>
private static HashSet<long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
{
    if (retainedJoinColumnNames is null)
    {
        throw new ArgumentNullException(nameof(retainedJoinColumnNames));
    }

    if (supplemetaryJoinColumnNames is null)
    {
        throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
    }

    bool sameNumberOfJoinColumns = retainedJoinColumnNames.Length == supplemetaryJoinColumnNames.Length;
    if (!sameNumberOfJoinColumns)
    {
        throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
    }

    Dictionary<long, ICollection<long>> matchesByRetainedRow = GetOccurences(
        retainedDataFrame,
        supplementaryDataFrame,
        retainedJoinColumnNames,
        supplemetaryJoinColumnNames,
        out HashSet<long> nullKeySupplementaryRows);

    HashSet<long> mergeResult = PerformMerging(
        retainedDataFrame,
        retainedJoinColumnNames,
        matchesByRetainedRow,
        nullKeySupplementaryRows,
        out retainedRowIndices,
        out supplementaryRowIndices,
        isInner,
        calculateIntersection);

    return mergeResult;
}
/// <summary>
/// Wraps <paramref name="column"/> in a new <see cref="PrimitiveDataFrameColumn{T}"/> named
/// <paramref name="columnName"/> and inserts it at <paramref name="columnIndex"/>.
/// </summary>
/// <typeparam name="T">Unmanaged element type of the new column.</typeparam>
/// <param name="columnIndex">Position at which the new column is inserted.</param>
/// <param name="column">Values used to populate the new column.</param>
/// <param name="columnName">Name given to the new column.</param>
public void Insert<T>(int columnIndex, IEnumerable<T> column, string columnName)
    where T : unmanaged
{
    // Typed as the base class so the same Insert overload as before is selected.
    DataFrameColumn wrappedColumn = new PrimitiveDataFrameColumn<T>(columnName, column);

    // Calls InsertItem internally.
    Insert(columnIndex, wrappedColumn);
}
/// <summary>
/// Runs the cumulative product computation over the given rows, either on this column or on a clone of it.
/// </summary>
/// <param name="rowIndices">Row indices passed to the computation.</param>
/// <param name="inPlace">When true this column is modified and returned; otherwise a clone is modified and returned.</param>
/// <returns>The column the computation was applied to.</returns>
public override DataFrameColumn CumulativeProduct(IEnumerable<long> rowIndices, bool inPlace = false)
{
    PrimitiveDataFrameColumn<T> target = this;
    if (!inPlace)
    {
        target = Clone();
    }

    PrimitiveColumnComputation<T>.Instance.CumulativeProduct(target._columnContainer, rowIndices);
    return target;
}
/// <summary>
/// Runs the cumulative sum computation, either on this column or on a clone of it.
/// </summary>
/// <param name="inPlace">When true this column is modified and returned; otherwise a clone is modified and returned.</param>
/// <returns>The column the computation was applied to.</returns>
public override DataFrameColumn CumulativeSum(bool inPlace = false)
{
    PrimitiveDataFrameColumn<T> target = this;
    if (!inPlace)
    {
        target = Clone();
    }

    PrimitiveColumnComputation<T>.Instance.CumulativeSum(target._columnContainer);
    return target;
}
/// <summary>
/// Walks every row of the retained DataFrame and emits pairs of (retained row, supplementary row)
/// indices that describe the merged result.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows are iterated.</param>
/// <param name="retainedJoinColumnNames">Names of the join columns in <paramref name="retainedDataFrame"/>.</param>
/// <param name="occurrences">For a retained row index, the supplementary row indices that matched it.</param>
/// <param name="supplementaryJoinColumnsNullIndices">Supplementary row indices paired with every retained row that has a null in any join column.</param>
/// <param name="retainedRowIndices">Receives the retained-side index of every emitted pair.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary-side index of every emitted pair (null when the retained row had no match).</param>
/// <param name="isInner">When true, retained rows without an entry in <paramref name="occurrences"/> are skipped instead of being emitted with a null partner.</param>
/// <param name="calculateIntersection">When true, the supplementary row indices emitted from <paramref name="occurrences"/> are collected and returned.</param>
/// <returns>The collected supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
private static HashSet<long> PerformMerging(DataFrame retainedDataFrame, string[] retainedJoinColumnNames, Dictionary<long, ICollection<long>> occurrences, HashSet<long> supplementaryJoinColumnsNullIndices, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner, bool calculateIntersection)
{
    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    HashSet<long> matchedSupplementaryRows = null;
    if (calculateIntersection)
    {
        matchedSupplementaryRows = new HashSet<long>();
    }

    var joinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

    for (long retainedRow = 0; retainedRow < retainedDataFrame.Columns.RowCount; retainedRow++)
    {
        // A null in any join column: pair the row with the supplied null-key supplementary rows only.
        if (IsAnyNullValueInColumns(joinColumns, retainedRow))
        {
            foreach (long nullKeyRow in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(nullKeyRow);
            }

            continue;
        }

        // No supplementary row satisfies the JOIN condition for this retained row.
        if (!occurrences.TryGetValue(retainedRow, out ICollection<long> matchingRows))
        {
            if (!isInner)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(null);
            }

            continue;
        }

        // Emit one pair per supplementary row that satisfies the JOIN condition.
        foreach (long supplementaryRow in matchingRows)
        {
            retainedRowIndices.Append(retainedRow);
            supplementaryRowIndices.Append(supplementaryRow);

            // Store intersection if required (HashSet.Add ignores values already present).
            if (calculateIntersection)
            {
                matchedSupplementaryRows.Add(supplementaryRow);
            }
        }
    }

    return matchedSupplementaryRows;
}
/// <summary>
/// Compares every element of this column with <paramref name="value"/>.
/// </summary>
/// <param name="value">Value each element is compared against.</param>
/// <returns>A boolean column with this column's name and length holding the per-row comparison result.</returns>
public PrimitiveDataFrameColumn<bool> ElementwiseEquals(string value)
{
    PrimitiveDataFrameColumn<bool> comparison = new PrimitiveDataFrameColumn<bool>(Name, Length);

    long row = 0;
    while (row < Length)
    {
        comparison[row] = this[row] == value;
        row++;
    }

    return comparison;
}
/// <summary>
/// Applies the reverse XOR operation with <paramref name="value"/> to a boolean column.
/// </summary>
/// <param name="value">Boolean operand passed to the column container's ReverseXor.</param>
/// <param name="inPlace">When true this column is modified and returned; otherwise a clone is modified and returned.</param>
/// <returns>The boolean column the operation was applied to.</returns>
/// <exception cref="NotSupportedException">This column is not a boolean column.</exception>
public override PrimitiveDataFrameColumn<bool> ReverseXor(bool value, bool inPlace = false)
{
    if (!(this is PrimitiveDataFrameColumn<bool> boolColumn))
    {
        throw new NotSupportedException();
    }

    PrimitiveDataFrameColumn<bool> target = boolColumn;
    if (!inPlace)
    {
        target = boolColumn.Clone();
    }

    target._columnContainer.ReverseXor(value);
    return target;
}
/// <summary>
/// Compares two columns row by row: the left value cast to string against the right value's
/// <c>ToString()</c> (null when the right value is null).
/// </summary>
/// <param name="left">Column whose values are cast to <see cref="string"/>.</param>
/// <param name="right">Column whose values are converted with <c>ToString()</c>.</param>
/// <returns>A boolean column named after <paramref name="left"/> with one comparison result per row.</returns>
/// <exception cref="ArgumentException">The two columns have different lengths.</exception>
internal static PrimitiveDataFrameColumn<bool> ElementwiseEqualsImplementation(DataFrameColumn left, DataFrameColumn right)
{
    bool sameLength = left.Length == right.Length;
    if (!sameLength)
    {
        throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(right));
    }

    PrimitiveDataFrameColumn<bool> comparison = new PrimitiveDataFrameColumn<bool>(left.Name, left.Length);

    long row = 0;
    while (row < left.Length)
    {
        string leftText = (string)left[row];
        string rightText = right[row]?.ToString();
        comparison[row] = leftText == rightText;
        row++;
    }

    return comparison;
}
/// <summary>
/// Reads a seekable stream of CSV data into a DataFrame.
/// Follows pandas API.
/// </summary>
/// <param name="csvStream">stream of CSV data to be read in</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names (can be empty)</param>
/// <param name="dataTypes">column types (can be empty)</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="csvStream"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="csvStream"/> is not seekable.</exception>
/// <exception cref="FormatException">Types had to be guessed but no data line was collected for guessing.</exception>
public static DataFrame LoadCsv(Stream csvStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (csvStream == null)
    {
        throw new ArgumentNullException(nameof(csvStream));
    }

    // The stream is read twice (schema pass, then data pass), so it must support seeking.
    if (!csvStream.CanSeek)
    {
        throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
    }

    var linesForGuessType = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = dataTypes?.Length ?? 0;

    // The header line counts as a physical line, so make room for it in the row budget.
    if (header == true && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    List<DataFrameColumn> columns;
    long streamStart = csvStream.Position;

    // First pass: schema and number of rows.
    using (var streamReader = new StreamReader(csvStream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
    {
        string line = null;
        if (dataTypes == null)
        {
            line = streamReader.ReadLine();
            while (line != null)
            {
                if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
                {
                    if (linesForGuessType.Count < guessRows)
                    {
                        var spl = line.Split(separator);
                        if (header && rowline == 0)
                        {
                            if (columnNames == null)
                            {
                                columnNames = spl;
                            }
                        }
                        else
                        {
                            linesForGuessType.Add(spl);
                            numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                        }
                    }
                }

                ++rowline;
                if (rowline == guessRows)
                {
                    break;
                }

                line = streamReader.ReadLine();
            }

            if (linesForGuessType.Count == 0)
            {
                throw new FormatException(Strings.EmptyFile);
            }
        }

        columns = new List<DataFrameColumn>(numberOfColumns);

        // Guesses types or looks up dataTypes and adds columns.
        for (int i = 0; i < numberOfColumns; ++i)
        {
            Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
            columns.Add(CreateColumn(kind, columnNames, i));
        }

        DataFrame ret = new DataFrame(columns);

        // Rewind to where the caller's stream was positioned before the schema pass.
        streamReader.DiscardBufferedData();
        streamReader.BaseStream.Seek(streamStart, SeekOrigin.Begin);

        // Fills values.
        line = streamReader.ReadLine();
        rowline = 0;
        while (line != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
        {
            var spl = line.Split(separator);
            if (header && rowline == 0)
            {
                // Skips.
            }
            else
            {
                ret.Append(spl, inPlace: true);
            }

            ++rowline;
            line = streamReader.ReadLine();
        }

        if (addIndexColumn)
        {
            // Sized from the DataFrame so this also works when no columns were created.
            long rowCount = ret.Rows.Count;
            PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
            for (int i = 0; i < rowCount; i++)
            {
                indexColumn[i] = i;
            }

            // Insert into the DataFrame itself: `ret` was already built from `columns`,
            // so inserting into the local list would not change the returned DataFrame.
            ret.Columns.Insert(0, indexColumn);
        }

        return ret;
    }
}
/// <summary>
/// Joins this DataFrame with <paramref name="other"/> by row position (row i with row i) and
/// returns a new DataFrame holding clones of the columns of both.
/// </summary>
/// <param name="other">DataFrame whose columns are placed after this DataFrame's columns.</param>
/// <param name="leftSuffix">Suffix passed to <c>SetSuffixForDuplicatedColumnNames</c> for duplicated column names.</param>
/// <param name="rightSuffix">Suffix passed to <c>SetSuffixForDuplicatedColumnNames</c> for duplicated column names.</param>
/// <param name="joinAlgorithm">
/// Left keeps this DataFrame's row count, Right keeps <paramref name="other"/>'s row count,
/// FullOuter uses the larger row count and Inner the smaller one. Shorter sides are padded with nulls.
/// </param>
/// <returns>The joined DataFrame (empty when <paramref name="joinAlgorithm"/> is none of the four handled values).</returns>
public DataFrame Join(DataFrame other, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // Builds the index column 0, 1, ..., count - 1 used to take the leading rows of a column.
    PrimitiveDataFrameColumn<long> CreateLeadingRowIndices(long count)
    {
        PrimitiveDataFrameColumn<long> indices = new PrimitiveDataFrameColumn<long>("mapIndices", count);
        for (long row = 0; row < count; row++)
        {
            indices[row] = row;
        }

        return indices;
    }

    DataFrame joined = new DataFrame();

    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
        {
            // Every row of this DataFrame is kept as is.
            for (int c = 0; c < Columns.Count; c++)
            {
                DataFrameColumn ownColumn = Columns[c].Clone();
                joined.Columns.Insert(joined.Columns.Count, ownColumn);
            }

            PrimitiveDataFrameColumn<long> leadingRows = CreateLeadingRowIndices(Math.Min(Rows.Count, other.Rows.Count));

            for (int c = 0; c < other.Columns.Count; c++)
            {
                DataFrameColumn otherColumn;
                if (other.Rows.Count < Rows.Count)
                {
                    // Pad the shorter side with nulls.
                    otherColumn = other.Columns[c].Clone(numberOfNullsToAppend: Rows.Count - other.Rows.Count);
                }
                else
                {
                    // Truncate the longer side to this DataFrame's row count.
                    otherColumn = other.Columns[c].Clone(leadingRows);
                }

                SetSuffixForDuplicatedColumnNames(joined, otherColumn, leftSuffix, rightSuffix);
                joined.Columns.Insert(joined.Columns.Count, otherColumn);
            }

            break;
        }

        case JoinAlgorithm.Right:
        {
            PrimitiveDataFrameColumn<long> leadingRows = CreateLeadingRowIndices(Math.Min(Rows.Count, other.Rows.Count));

            for (int c = 0; c < Columns.Count; c++)
            {
                DataFrameColumn ownColumn;
                if (Rows.Count < other.Rows.Count)
                {
                    // Pad the shorter side with nulls.
                    ownColumn = Columns[c].Clone(numberOfNullsToAppend: other.Rows.Count - Rows.Count);
                }
                else
                {
                    // Truncate the longer side to the other DataFrame's row count.
                    ownColumn = Columns[c].Clone(leadingRows);
                }

                joined.Columns.Insert(joined.Columns.Count, ownColumn);
            }

            // Every row of the other DataFrame is kept as is.
            for (int c = 0; c < other.Columns.Count; c++)
            {
                DataFrameColumn otherColumn = other.Columns[c].Clone();
                SetSuffixForDuplicatedColumnNames(joined, otherColumn, leftSuffix, rightSuffix);
                joined.Columns.Insert(joined.Columns.Count, otherColumn);
            }

            break;
        }

        case JoinAlgorithm.FullOuter:
        {
            long joinedRowCount = Math.Max(Rows.Count, other.Rows.Count);

            long ownPadding = joinedRowCount - Rows.Count;
            for (int c = 0; c < Columns.Count; c++)
            {
                DataFrameColumn ownColumn = Columns[c].Clone(numberOfNullsToAppend: ownPadding);
                joined.Columns.Insert(joined.Columns.Count, ownColumn);
            }

            long otherPadding = joinedRowCount - other.Rows.Count;
            for (int c = 0; c < other.Columns.Count; c++)
            {
                DataFrameColumn otherColumn = other.Columns[c].Clone(numberOfNullsToAppend: otherPadding);
                SetSuffixForDuplicatedColumnNames(joined, otherColumn, leftSuffix, rightSuffix);
                joined.Columns.Insert(joined.Columns.Count, otherColumn);
            }

            break;
        }

        case JoinAlgorithm.Inner:
        {
            // Both sides are truncated to the shorter row count.
            PrimitiveDataFrameColumn<long> leadingRows = CreateLeadingRowIndices(Math.Min(Rows.Count, other.Rows.Count));

            for (int c = 0; c < Columns.Count; c++)
            {
                DataFrameColumn ownColumn = Columns[c].Clone(leadingRows);
                joined.Columns.Insert(joined.Columns.Count, ownColumn);
            }

            for (int c = 0; c < other.Columns.Count; c++)
            {
                DataFrameColumn otherColumn = other.Columns[c].Clone(leadingRows);
                SetSuffixForDuplicatedColumnNames(joined, otherColumn, leftSuffix, rightSuffix);
                joined.Columns.Insert(joined.Columns.Count, otherColumn);
            }

            break;
        }

        default:
            // Any other value leaves the result empty.
            break;
    }

    return joined;
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes

/// <summary>
/// Merge DataFrames with a database style join
/// </summary>
/// <remarks>
/// Implemented as a simple hash join: one join column is grouped by value and the other column is probed
/// against it. A null key only matches null keys on the other side, and a non-null key never matches a null.
/// For an inner join the DataFrame with fewer rows is hashed and its columns are placed first in the result.
/// </remarks>
/// <typeparam name="TKey">Type the join column values are cast to for hashing.</typeparam>
/// <param name="other">The DataFrame to merge with this one.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Suffix passed to <c>SetSuffixForDuplicatedColumnNames</c> for duplicated column names.</param>
/// <param name="rightSuffix">Suffix passed to <c>SetSuffixForDuplicatedColumnNames</c> for duplicated column names.</param>
/// <param name="joinAlgorithm">The kind of join to perform.</param>
/// <returns>A new DataFrame built from clones of the columns of both DataFrames.</returns>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the four handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    // A simple hash join
    DataFrame ret = new DataFrame();
    DataFrame leftDataFrame = this;
    DataFrame rightDataFrame = other;

    // The final table size is not known until runtime
    PrimitiveDataFrameColumn<long> leftRowIndices = new PrimitiveDataFrameColumn<long>("LeftIndices");
    PrimitiveDataFrameColumn<long> rightRowIndices = new PrimitiveDataFrameColumn<long>("RightIndices");

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // First hash other dataframe on the rightJoinColumn
        DataFrameColumn otherColumn = other[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>();

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = this[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            var thisColumnValue = thisColumn[i];
            TKey thisColumnValueOrDefault = (TKey)(thisColumnValue == null ? default(TKey) : thisColumnValue);

            bool matched = false;
            if (multimap.TryGetValue(thisColumnValueOrDefault, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    // Nulls match only nulls; non-null values never match a null.
                    if ((thisColumnValue == null) == (otherColumn[row] == null))
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(row);
                        matched = true;
                    }
                }
            }

            // A left join keeps every left row, including one whose bucket exists but
            // contains no null-compatible partner.
            if (!matched)
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        DataFrameColumn thisColumn = this[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = thisColumn.GroupColumnValues<TKey>();

        DataFrameColumn otherColumn = other[rightJoinColumn];
        for (long i = 0; i < otherColumn.Length; i++)
        {
            var otherColumnValue = otherColumn[i];
            TKey otherColumnValueOrDefault = (TKey)(otherColumnValue == null ? default(TKey) : otherColumnValue);

            bool matched = false;
            if (multimap.TryGetValue(otherColumnValueOrDefault, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    if ((otherColumnValue == null) == (thisColumn[row] == null))
                    {
                        leftRowIndices.Append(row);
                        rightRowIndices.Append(i);
                        matched = true;
                    }
                }
            }

            // A right join keeps every right row.
            if (!matched)
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount
        long leftRowCount = Rows.Count;
        long rightRowCount = other.Rows.Count;
        DataFrame longerDataFrame = leftRowCount <= rightRowCount ? other : this;
        DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
        DataFrameColumn hashColumn = (leftRowCount <= rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
        DataFrameColumn otherColumn = ReferenceEquals(hashColumn, this[leftJoinColumn]) ? other[rightJoinColumn] : this[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>();

        for (long i = 0; i < otherColumn.Length; i++)
        {
            var otherColumnValue = otherColumn[i];
            TKey otherColumnValueOrDefault = (TKey)(otherColumnValue == null ? default(TKey) : otherColumnValue);
            if (multimap.TryGetValue(otherColumnValueOrDefault, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    if ((otherColumnValue == null) == (hashColumn[row] == null))
                    {
                        leftRowIndices.Append(row);
                        rightRowIndices.Append(i);
                    }
                }
            }
        }

        // The hashed (shorter) DataFrame supplies the "left" indices collected above.
        leftDataFrame = shorterDataFrame;
        rightDataFrame = longerDataFrame;
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        DataFrameColumn otherColumn = other[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>();

        // Right-side rows that found a partner. Tracked by row index rather than by key value,
        // because a null key and a default(TKey) key hash to the same value but must not be
        // treated as the same key.
        HashSet<long> matchedRightRows = new HashSet<long>();

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = this[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            var thisColumnValue = thisColumn[i];
            TKey thisColumnValueOrDefault = (TKey)(thisColumnValue == null ? default(TKey) : thisColumnValue);

            bool matched = false;
            if (multimap.TryGetValue(thisColumnValueOrDefault, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    // Nulls match only nulls; non-null values never match a null.
                    if ((thisColumnValue == null) == (otherColumn[row] == null))
                    {
                        leftRowIndices.Append(i);
                        rightRowIndices.Append(row);
                        matchedRightRows.Add(row);
                        matched = true;
                    }
                }
            }

            if (!matched)
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Add the right-side rows that never found a partner.
        for (long i = 0; i < otherColumn.Length; i++)
        {
            if (!matchedRightRows.Contains(i))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    for (int i = 0; i < leftDataFrame.Columns.Count; i++)
    {
        ret.Columns.Insert(i, leftDataFrame.Columns[i].Clone(leftRowIndices));
    }

    for (int i = 0; i < rightDataFrame.Columns.Count; i++)
    {
        DataFrameColumn column = rightDataFrame.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}
private void GetSortIndices(IComparer <T> comparer, out PrimitiveDataFrameColumn <long> columnSortIndices) { List <List <int> > bufferSortIndices = new List <List <int> >(_columnContainer.Buffers.Count); // Sort each buffer first for (int b = 0; b < _columnContainer.Buffers.Count; b++) { ReadOnlyDataFrameBuffer <T> buffer = _columnContainer.Buffers[b]; ReadOnlySpan <byte> nullBitMapSpan = _columnContainer.NullBitMapBuffers[b].ReadOnlySpan; int[] sortIndices = new int[buffer.Length]; for (int i = 0; i < buffer.Length; i++) { sortIndices[i] = i; } IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer); // Bug fix: QuickSort is not stable. When PrimitiveDataFrameColumn has null values and default values, they move around List <int> nonNullSortIndices = new List <int>(); for (int i = 0; i < sortIndices.Length; i++) { if (_columnContainer.IsValid(nullBitMapSpan, sortIndices[i])) { nonNullSortIndices.Add(sortIndices[i]); } } bufferSortIndices.Add(nonNullSortIndices); } // Simple merge sort to build the full column's sort indices ValueTuple <T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferIndex, int startIndex) { int index = bufferSortIndices[bufferIndex][startIndex]; T value; ReadOnlyMemory <byte> buffer = _columnContainer.Buffers[bufferIndex].ReadOnlyBuffer; ReadOnlyMemory <T> typedBuffer = Unsafe.As <ReadOnlyMemory <byte>, ReadOnlyMemory <T> >(ref buffer); if (!typedBuffer.IsEmpty) { bool isArray = MemoryMarshal.TryGetArray(typedBuffer, out ArraySegment <T> arraySegment); if (isArray) { value = arraySegment.Array[index + arraySegment.Offset]; } else { value = _columnContainer.Buffers[bufferIndex][index]; } } else { value = _columnContainer.Buffers[bufferIndex][index]; } return(value, startIndex); } SortedDictionary <T, List <ValueTuple <int, int> > > heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary <T, List <ValueTuple <int, int> > >(comparer); IList <ReadOnlyDataFrameBuffer <T> > buffers = 
_columnContainer.Buffers; for (int i = 0; i < buffers.Count; i++) { ReadOnlyDataFrameBuffer <T> buffer = buffers[i]; if (bufferSortIndices[i].Count == 0) { // All nulls continue; } ValueTuple <T, int> valueAndBufferIndex = GetFirstNonNullValueAndBufferIndexStartingAtIndex(i, 0); if (heapOfValueAndListOfTupleOfSortAndBufferIndex.ContainsKey(valueAndBufferIndex.Item1)) { heapOfValueAndListOfTupleOfSortAndBufferIndex[valueAndBufferIndex.Item1].Add((valueAndBufferIndex.Item2, i)); } else { heapOfValueAndListOfTupleOfSortAndBufferIndex.Add(valueAndBufferIndex.Item1, new List <ValueTuple <int, int> >() { (valueAndBufferIndex.Item2, i) });
/// <summary>
/// Builds a DataFrame from CSV lines: a first pass collects column names and sample rows for type
/// guessing, a second pass appends every data row.
/// </summary>
/// <remarks>
/// <paramref name="lines"/> is enumerated twice, each time through a fresh enumerator, so it must be
/// re-enumerable. No <c>IEnumerator.Reset</c> support is required.
/// </remarks>
/// <param name="lines">CSV lines, the first being the header when <paramref name="header"/> is true.</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names; taken from the header line when null and a header is present</param>
/// <param name="dataTypes">column types; guessed from the first lines when null</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header (if present); -1 reads all</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns>The populated DataFrame.</returns>
/// <exception cref="ArgumentNullException"><paramref name="lines"/> is null.</exception>
/// <exception cref="ArgumentException">Neither <paramref name="dataTypes"/> nor a positive <paramref name="guessRows"/> was supplied.</exception>
/// <exception cref="FormatException"><paramref name="lines"/> is empty.</exception>
private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable<string> lines, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (lines == null)
    {
        throw new ArgumentNullException(nameof(lines));
    }

    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    var linesForGuessType = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = dataTypes?.Length ?? 0;

    // The header line counts as a physical line, so make room for it in the row budget.
    if (header == true && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    // First pass: schema and number of rows.
    // foreach disposes the enumerator, which the previous hand-rolled loop never did.
    foreach (string line in lines)
    {
        if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
        {
            if (linesForGuessType.Count < guessRows || (header && rowline == 0))
            {
                var spl = line.Split(separator);
                if (header && rowline == 0)
                {
                    if (columnNames == null)
                    {
                        columnNames = spl;
                    }
                }
                else
                {
                    linesForGuessType.Add(spl);
                    numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                }
            }
        }

        ++rowline;
        if (rowline == guessRows || guessRows == 0)
        {
            break;
        }
    }

    if (rowline == 0)
    {
        throw new FormatException(Strings.EmptyFile);
    }

    List<DataFrameColumn> columns = new List<DataFrameColumn>(numberOfColumns);

    // Guesses types or looks up dataTypes and adds columns.
    for (int i = 0; i < numberOfColumns; ++i)
    {
        Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
        columns.Add(CreateColumn(kind, columnNames, i));
    }

    DataFrame ret = new DataFrame(columns);

    // Fill values.
    // Start over with a fresh enumerator instead of calling Reset(), which iterator-based
    // sequences do not support.
    rowline = 0;
    foreach (string line in lines)
    {
        if (numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead)
        {
            break;
        }

        var spl = line.Split(separator);
        if (header && rowline == 0)
        {
            // Skips.
        }
        else
        {
            ret.Append(spl, inPlace: true);
        }

        ++rowline;
    }

    if (addIndexColumn)
    {
        // Sized from the DataFrame so this also works when no columns were created.
        long rowCount = ret.Rows.Count;
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
        for (int i = 0; i < rowCount; i++)
        {
            indexColumn[i] = i;
        }

        // Insert into the DataFrame itself: `ret` was already built from `columns`,
        // so inserting into the local list would not change the returned DataFrame.
        ret.Columns.Insert(0, indexColumn);
    }

    return ret;
}
/// <summary>
/// Creates an empty column of the concrete column class that corresponds to <paramref name="kind"/>.
/// </summary>
/// <param name="kind">Element type of the column to create.</param>
/// <param name="columnNames">Column names passed to <c>GetColumnName</c>.</param>
/// <param name="columnIndex">Index of the column, passed to <c>GetColumnName</c>.</param>
/// <returns>The newly created column.</returns>
/// <exception cref="NotSupportedException"><paramref name="kind"/> is not one of the handled types.</exception>
private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int columnIndex)
{
    // Resolved lazily so that an unsupported kind fails before the name lookup runs.
    string ColumnName() => GetColumnName(columnNames, columnIndex);

    if (kind == typeof(bool))
    {
        return new BooleanDataFrameColumn(ColumnName());
    }

    if (kind == typeof(int))
    {
        return new Int32DataFrameColumn(ColumnName());
    }

    if (kind == typeof(float))
    {
        return new SingleDataFrameColumn(ColumnName());
    }

    if (kind == typeof(string))
    {
        return new StringDataFrameColumn(ColumnName(), 0);
    }

    if (kind == typeof(long))
    {
        return new Int64DataFrameColumn(ColumnName());
    }

    if (kind == typeof(decimal))
    {
        return new DecimalDataFrameColumn(ColumnName());
    }

    if (kind == typeof(byte))
    {
        return new ByteDataFrameColumn(ColumnName());
    }

    if (kind == typeof(char))
    {
        return new CharDataFrameColumn(ColumnName());
    }

    if (kind == typeof(double))
    {
        return new DoubleDataFrameColumn(ColumnName());
    }

    if (kind == typeof(sbyte))
    {
        return new SByteDataFrameColumn(ColumnName());
    }

    if (kind == typeof(short))
    {
        return new Int16DataFrameColumn(ColumnName());
    }

    if (kind == typeof(uint))
    {
        return new UInt32DataFrameColumn(ColumnName());
    }

    if (kind == typeof(ulong))
    {
        return new UInt64DataFrameColumn(ColumnName());
    }

    if (kind == typeof(ushort))
    {
        return new UInt16DataFrameColumn(ColumnName());
    }

    if (kind == typeof(DateTime))
    {
        return new PrimitiveDataFrameColumn<DateTime>(ColumnName());
    }

    throw new NotSupportedException(nameof(kind));
}
/// <summary>
/// Computes which rows of the supplementary DataFrame match each row of the retained DataFrame on
/// ALL join columns, then emits the (retained row, supplementary row) index pairs of the merge.
/// </summary>
/// <param name="retainedDataFrame">DataFrame whose rows drive the merge.</param>
/// <param name="supplementaryDataFrame">DataFrame whose rows are matched against the retained rows.</param>
/// <param name="retainedJoinColumnNames">Join column names in <paramref name="retainedDataFrame"/>.</param>
/// <param name="supplemetaryJoinColumnNames">Join column names in <paramref name="supplementaryDataFrame"/>; must have the same length as <paramref name="retainedJoinColumnNames"/>.</param>
/// <param name="retainedRowIndices">Receives the retained-side index of every emitted pair.</param>
/// <param name="supplementaryRowIndices">Receives the supplementary-side index of every emitted pair (null when the retained row had no match).</param>
/// <param name="isInner">When true, retained rows without a match are skipped instead of being emitted with a null partner.</param>
/// <param name="calculateIntersection">When true, the matched supplementary row indices are collected and returned.</param>
/// <returns>The collected supplementary row indices, or null when <paramref name="calculateIntersection"/> is false.</returns>
/// <exception cref="ArgumentNullException">Either join column name array is null.</exception>
/// <exception cref="ArgumentException">The join column name arrays have different lengths.</exception>
private static HashSet<long> Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false)
{
    if (retainedJoinColumnNames is null)
    {
        throw new ArgumentNullException(nameof(retainedJoinColumnNames));
    }

    if (supplemetaryJoinColumnNames is null)
    {
        throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames));
    }

    if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length)
    {
        throw new ArgumentException(Strings.MismatchedArrayLengths, nameof(retainedJoinColumnNames));
    }

    HashSet<long> matchedSupplementaryRows = null;
    if (calculateIntersection)
    {
        matchedSupplementaryRows = new HashSet<long>();
    }

    // Get occurrences of values in columns used for join in the retained and supplementary dataframes
    Dictionary<long, ICollection<long>> occurrences = null;
    Dictionary<long, long> shrunkToOriginalRow = null;
    HashSet<long> supplementaryJoinColumnsNullIndices = new HashSet<long>();

    for (int joinColumn = 0; joinColumn < retainedJoinColumnNames.Length; joinColumn++)
    {
        DataFrameColumn retainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[joinColumn]];

        // Shrink the retained column to the rows that still had occurrences after the previous step
        if (occurrences != null)
        {
            // Only rows with occurrences from the previous step go on to further processing
            long[] survivingRows = occurrences.Keys.ToArray();

            // Reverse mapping: index of the row in the shrunk column -> index of that row in the original dataframe
            var reverseMapping = new Dictionary<long, long>(survivingRows.Length);
            for (int shrunkRow = 0; shrunkRow < survivingRows.Length; shrunkRow++)
            {
                reverseMapping.Add(shrunkRow, survivingRows[shrunkRow]);
            }

            shrunkToOriginalRow = reverseMapping;
            retainedColumn = retainedColumn.Clone(new Int64DataFrameColumn("Indices", survivingRows));
        }

        DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[joinColumn]];

        // Find occurrences on the current step (join column)
        var stepOccurrences = retainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet<long> supplementaryColumnNullIndices);

        // Convert the keys from local indices (shrunk column) to indices in the original dataframe
        if (shrunkToOriginalRow != null)
        {
            var remapped = new Dictionary<long, ICollection<long>>();
            foreach (var entry in stepOccurrences)
            {
                remapped.Add(shrunkToOriginalRow[entry.Key], entry.Value);
            }

            stepOccurrences = remapped;
        }

        supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices);

        // Shrink the join result on the current column by the previous join columns (if any):
        // occurrences that don't exist in the previous columns are removed, because a JOIN
        // happens only if ALL left and right join columns match.
        if (occurrences != null)
        {
            var confirmed = new Dictionary<long, ICollection<long>>();
            foreach (var entry in stepOccurrences)
            {
                var stillMatching = new List<long>();
                foreach (long supplementaryRow in entry.Value)
                {
                    if (occurrences[entry.Key].Contains(supplementaryRow))
                    {
                        stillMatching.Add(supplementaryRow);
                    }
                }

                if (stillMatching.Count > 0)
                {
                    confirmed.Add(entry.Key, stillMatching.ToArray());
                }
            }

            stepOccurrences = confirmed;
        }

        occurrences = stepOccurrences;
    }

    retainedRowIndices = new Int64DataFrameColumn("RetainedIndices");
    supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices");

    // Perform merging
    var joinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray();

    for (long retainedRow = 0; retainedRow < retainedDataFrame.Columns.RowCount; retainedRow++)
    {
        // A null in any join column: pair the row with the null-key supplementary rows only.
        if (IsAnyNullValueInColumns(joinColumns, retainedRow))
        {
            foreach (long nullKeyRow in supplementaryJoinColumnsNullIndices)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(nullKeyRow);
            }

            continue;
        }

        // No supplementary row satisfies the JOIN condition for this retained row.
        if (!occurrences.TryGetValue(retainedRow, out ICollection<long> matchingRows))
        {
            if (!isInner)
            {
                retainedRowIndices.Append(retainedRow);
                supplementaryRowIndices.Append(null);
            }

            continue;
        }

        // Emit one pair per supplementary row that satisfies the JOIN condition.
        foreach (long supplementaryRow in matchingRows)
        {
            retainedRowIndices.Append(retainedRow);
            supplementaryRowIndices.Append(supplementaryRow);

            // Store intersection if required (HashSet.Add ignores values already present).
            if (calculateIntersection)
            {
                matchedSupplementaryRows.Add(supplementaryRow);
            }
        }
    }

    return matchedSupplementaryRows;
}
/// <summary>
/// Reads a seekable stream of CSV data into a DataFrame.
/// Follows pandas API.
/// </summary>
/// <param name="csvStream">stream of CSV data to be read in</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names (can be empty)</param>
/// <param name="dataTypes">column types (can be empty)</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns>DataFrame</returns>
/// <exception cref="ArgumentNullException"><paramref name="csvStream"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="csvStream"/> is not seekable.</exception>
/// <exception cref="FormatException">No data line was collected for type guessing.</exception>
/// <exception cref="NotSupportedException">A guessed column type is not bool, float or string.</exception>
public static DataFrame LoadCsv(Stream csvStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (csvStream == null)
    {
        throw new ArgumentNullException(nameof(csvStream));
    }

    // The stream is read twice (schema pass, then data pass), so it must support seeking.
    if (!csvStream.CanSeek)
    {
        throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
    }

    // StreamReader's documented default buffer size.
    const int streamReaderBufferSize = 1024;

    var linesForGuessType = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = 0;

    // The header line counts as a physical line, so make room for it in the row budget.
    if (header == true && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    List<DataFrameColumn> columns;
    long streamStart = csvStream.Position;

    // First pass: schema and number of rows.
    // Encoding and buffer size are passed explicitly: a null encoding or a bufferSize of -1 is only
    // accepted as "use the default" by newer runtimes and throws on older ones.
    using (var streamReader = new StreamReader(csvStream, System.Text.Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: streamReaderBufferSize, leaveOpen: true))
    {
        string line = streamReader.ReadLine();
        while (line != null)
        {
            if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
            {
                if (linesForGuessType.Count < guessRows)
                {
                    var spl = line.Split(separator);
                    if (header && rowline == 0)
                    {
                        if (columnNames == null)
                        {
                            columnNames = spl;
                        }
                    }
                    else
                    {
                        linesForGuessType.Add(spl);
                        numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                    }
                }
            }

            ++rowline;
            if (rowline == numberOfRowsToRead)
            {
                break;
            }

            line = streamReader.ReadLine();
        }

        if (linesForGuessType.Count == 0)
        {
            throw new FormatException(Strings.EmptyFile);
        }

        columns = new List<DataFrameColumn>(numberOfColumns);

        // Columns are pre-sized to the number of data rows counted by the first pass.
        long dataRowCount = header == true ? rowline - 1 : rowline;

        // Guesses types and adds columns.
        for (int i = 0; i < numberOfColumns; ++i)
        {
            Type kind = GuessKind(i, linesForGuessType);
            if (kind != typeof(bool) && kind != typeof(float) && kind != typeof(string))
            {
                throw new NotSupportedException(nameof(kind));
            }

            string columnName = columnNames == null ? "Column" + i.ToString() : columnNames[i];

            DataFrameColumn column;
            if (kind == typeof(bool))
            {
                column = new PrimitiveDataFrameColumn<bool>(columnName, dataRowCount);
            }
            else if (kind == typeof(float))
            {
                column = new PrimitiveDataFrameColumn<float>(columnName, dataRowCount);
            }
            else
            {
                column = new StringDataFrameColumn(columnName, dataRowCount);
            }

            columns.Add(column);
        }

        // Rewind to where the caller's stream was positioned before the schema pass.
        streamReader.DiscardBufferedData();
        streamReader.BaseStream.Seek(streamStart, SeekOrigin.Begin);

        // Fills values.
        line = streamReader.ReadLine();
        rowline = 0;
        while (line != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
        {
            var spl = line.Split(separator);
            if (header && rowline == 0)
            {
                // Skips.
            }
            else
            {
                AppendRow(columns, header == true ? rowline - 1 : rowline, spl);
            }

            ++rowline;
            line = streamReader.ReadLine();
        }

        if (addIndexColumn)
        {
            PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
            for (int i = 0; i < columns[0].Length; i++)
            {
                indexColumn[i] = i;
            }

            columns.Insert(0, indexColumn);
        }
    }

    return new DataFrame(columns);
}
/// <summary>
/// Applies the column container's reverse-subtract operation with <paramref name="value"/>.
/// </summary>
/// <typeparam name="U">Type of <paramref name="value"/>. <see cref="bool"/> is rejected.</typeparam>
/// <param name="value">The scalar operand.</param>
/// <param name="inPlace">
/// When true, this column is modified and returned. Only possible when <typeparamref name="U"/> is the
/// same type as the column's element type, because any other combination produces a column of a different type.
/// </param>
/// <returns>
/// This column or a clone of it when no conversion is needed; otherwise a decimal clone (decimal column or
/// decimal value) or a double clone (every other numeric combination).
/// </returns>
/// <exception cref="NotSupportedException">The column's element type or <typeparamref name="U"/> is not a supported numeric type.</exception>
/// <exception cref="ArgumentException"><paramref name="inPlace"/> is true but the value type differs from the column type.</exception>
public override DataFrameColumn ReverseSubtract<U>(U value, bool inPlace = false)
{
    Type elementType = typeof(T);
    bool isDecimalColumn = elementType == typeof(decimal);
    bool isOtherNumericColumn =
        elementType == typeof(byte) ||
        elementType == typeof(char) ||
        elementType == typeof(double) ||
        elementType == typeof(float) ||
        elementType == typeof(int) ||
        elementType == typeof(long) ||
        elementType == typeof(sbyte) ||
        elementType == typeof(short) ||
        elementType == typeof(uint) ||
        elementType == typeof(ulong) ||
        elementType == typeof(ushort);

    // Boolean columns and any element type outside the list above do not take part in arithmetic.
    if (!isDecimalColumn && !isOtherNumericColumn)
    {
        throw new NotSupportedException();
    }

    Type valueType = typeof(U);
    if (valueType == typeof(bool))
    {
        throw new NotSupportedException();
    }

    if (valueType == elementType)
    {
        // Same type on both sides: no conversion, so the operation may run in place.
        PrimitiveDataFrameColumn<T> target = inPlace ? this : Clone();
        target._columnContainer.ReverseSubtract(Unsafe.As<U, T>(ref value));
        return target;
    }

    // From here on the result has a different element type than this column, which rules out in-place.
    if (inPlace)
    {
        throw new ArgumentException(string.Format(Strings.MismatchedValueType, typeof(T)), nameof(value));
    }

    if (isDecimalColumn || valueType == typeof(decimal))
    {
        PrimitiveDataFrameColumn<decimal> decimalResult = CloneAsDecimalColumn();
        decimalResult._columnContainer.ReverseSubtract(DecimalConverter<U>.Instance.GetDecimal(value));
        return decimalResult;
    }

    PrimitiveDataFrameColumn<double> doubleResult = CloneAsDoubleColumn();
    doubleResult._columnContainer.ReverseSubtract(DoubleConverter<U>.Instance.GetDouble(value));
    return doubleResult;
}
/// <summary>
/// Wraps a <see cref="DataFrame"/> around an Arrow <see cref="RecordBatch"/> without copying data
/// </summary>
/// <param name="recordBatch">The record batch whose arrays become the columns of the returned frame.</param>
/// <returns><see cref="DataFrame"/> with one column per array in <paramref name="recordBatch"/>, in the same order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="recordBatch"/> is null.</exception>
/// <exception cref="NotImplementedException">A field has an Arrow type that is not handled here; the message is the Arrow type's name.</exception>
public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch)
{
    if (recordBatch == null)
    {
        throw new ArgumentNullException(nameof(recordBatch));
    }

    DataFrame ret = new DataFrame();
    Apache.Arrow.Schema arrowSchema = recordBatch.Schema;

    // Arrays are enumerated in order; fieldIndex tracks the matching schema field.
    int fieldIndex = 0;
    foreach (IArrowArray arrowArray in recordBatch.Arrays)
    {
        Field field = arrowSchema.GetFieldByIndex(fieldIndex);
        IArrowType fieldType = field.DataType;
        DataFrameColumn dataFrameColumn;
        switch (fieldType.TypeId)
        {
            case ArrowTypeId.Boolean:
                // BooleanArray is cast separately from the PrimitiveArray<T> cases below.
                BooleanArray arrowBooleanArray = (BooleanArray)arrowArray;
                ReadOnlyMemory<byte> valueBuffer = arrowBooleanArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> nullBitMapBuffer = arrowBooleanArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new PrimitiveDataFrameColumn<bool>(field.Name, valueBuffer, nullBitMapBuffer, arrowArray.Length, arrowArray.NullCount);
                break;
            case ArrowTypeId.Double:
                dataFrameColumn = WrapPrimitiveArray<double>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Float:
                dataFrameColumn = WrapPrimitiveArray<float>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int8:
                dataFrameColumn = WrapPrimitiveArray<sbyte>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int16:
                dataFrameColumn = WrapPrimitiveArray<short>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int32:
                dataFrameColumn = WrapPrimitiveArray<int>(field.Name, arrowArray);
                break;
            case ArrowTypeId.Int64:
                dataFrameColumn = WrapPrimitiveArray<long>(field.Name, arrowArray);
                break;
            case ArrowTypeId.String:
                StringArray stringArray = (StringArray)arrowArray;
                ReadOnlyMemory<byte> dataMemory = stringArray.ValueBuffer.Memory;
                ReadOnlyMemory<byte> offsetsMemory = stringArray.ValueOffsetsBuffer.Memory;
                ReadOnlyMemory<byte> nullMemory = stringArray.NullBitmapBuffer.Memory;
                dataFrameColumn = new ArrowStringDataFrameColumn(field.Name, dataMemory, offsetsMemory, nullMemory, stringArray.Length, stringArray.NullCount);
                break;
            case ArrowTypeId.UInt8:
                dataFrameColumn = WrapPrimitiveArray<byte>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt16:
                dataFrameColumn = WrapPrimitiveArray<ushort>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt32:
                dataFrameColumn = WrapPrimitiveArray<uint>(field.Name, arrowArray);
                break;
            case ArrowTypeId.UInt64:
                dataFrameColumn = WrapPrimitiveArray<ulong>(field.Name, arrowArray);
                break;
            // Known Arrow types that are deliberately listed as not yet supported.
            case ArrowTypeId.Decimal:
            case ArrowTypeId.Binary:
            case ArrowTypeId.Date32:
            case ArrowTypeId.Date64:
            case ArrowTypeId.Dictionary:
            case ArrowTypeId.FixedSizedBinary:
            case ArrowTypeId.HalfFloat:
            case ArrowTypeId.Interval:
            case ArrowTypeId.List:
            case ArrowTypeId.Map:
            case ArrowTypeId.Null:
            case ArrowTypeId.Struct:
            case ArrowTypeId.Time32:
            case ArrowTypeId.Time64:
            default:
                // Report the actual Arrow type name; nameof(fieldType.Name) would always yield "Name".
                throw new NotImplementedException(fieldType.Name);
        }

        ret.Columns.Insert(ret.Columns.Count, dataFrameColumn);
        fieldIndex++;
    }

    return ret;

    // Builds a primitive column from the value and null-bitmap buffers of a PrimitiveArray<TValue>.
    DataFrameColumn WrapPrimitiveArray<TValue>(string name, IArrowArray array)
        where TValue : unmanaged
    {
        PrimitiveArray<TValue> typedArray = (PrimitiveArray<TValue>)array;
        ReadOnlyMemory<byte> values = typedArray.ValueBuffer.Memory;
        ReadOnlyMemory<byte> nullBitmap = typedArray.NullBitmapBuffer.Memory;
        return new PrimitiveDataFrameColumn<TValue>(name, values, nullBitmap, array.Length, array.NullCount);
    }
}
/// <summary>
/// Returns a new column produced by cloning this column through its ascending sort indices.
/// </summary>
/// <param name="ascending">
/// When false, the clone is requested with the index order inverted (the second argument to Clone is <c>!ascending</c>).
/// </param>
/// <returns>The cloned column; this column is not modified by the code visible here.</returns>
public new PrimitiveDataFrameColumn<T> Sort(bool ascending = true)
{
    bool invertOrder = !ascending;
    PrimitiveDataFrameColumn<long> ascendingIndices = GetAscendingSortIndices();

    // NullCount is forwarded as the third Clone argument — presumably the number of nulls to re-append; confirm against Clone.
    return Clone(ascendingIndices, invertOrder, NullCount);
}
/// <summary>
/// Reads delimited text into a <see cref="DataFrame"/> in two passes: the first pass determines the column
/// names, count and types; the second pass appends the rows. A text reader is requested from
/// <paramref name="wrappedReader"/> once per pass.
/// </summary>
/// <param name="wrappedReader">Source of the text readers used for both passes.</param>
/// <param name="separator">Field delimiter.</param>
/// <param name="header">When true, the first row is treated as a header and is not appended as data.</param>
/// <param name="columnNames">Column names; when null and <paramref name="header"/> is true, the header row is used.</param>
/// <param name="dataTypes">Column types; when null, types are guessed from the first rows.</param>
/// <param name="numberOfRowsToRead">Maximum number of data rows to read (header excluded), or -1 for no limit.</param>
/// <param name="guessRows">Number of rows inspected during the first pass.</param>
/// <param name="addIndexColumn">When true, an <c>int</c> column named "IndexColumn" holding 0..rowCount-1 is inserted as the first column.</param>
/// <returns>The populated <see cref="DataFrame"/>.</returns>
/// <exception cref="ArgumentException"><paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.</exception>
/// <exception cref="FormatException">The input contains no rows.</exception>
private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    string delimiter = separator.ToString();
    List<DataFrameColumn> columns;
    string[] fields;

    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(delimiter);

        var linesForGuessType = new List<string[]>();
        long rowline = 0;
        int numberOfColumns = dataTypes?.Length ?? 0;

        // The row limit excludes the header, while rowline counts it.
        if (header == true && numberOfRowsToRead != -1)
        {
            numberOfRowsToRead++;
        }

        // First pass: schema and number of rows.
        while ((fields = parser.ReadFields()) != null)
        {
            bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
            bool isHeaderRow = header && rowline == 0;

            if (withinRowLimit && (linesForGuessType.Count < guessRows || isHeaderRow))
            {
                if (isHeaderRow)
                {
                    if (columnNames == null)
                    {
                        columnNames = fields;
                    }
                }
                else
                {
                    linesForGuessType.Add(fields);
                    numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                }
            }

            ++rowline;
            if (rowline == guessRows || guessRows == 0)
            {
                break;
            }
        }

        if (rowline == 0)
        {
            throw new FormatException(Strings.EmptyFile);
        }

        columns = new List<DataFrameColumn>(numberOfColumns);

        // Guesses types or looks up dataTypes and adds columns.
        for (int i = 0; i < numberOfColumns; ++i)
        {
            Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
            columns.Add(CreateColumn(kind, columnNames, i));
        }
    }

    DataFrame ret = new DataFrame(columns);

    // Second pass: fill values.
    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(delimiter);

        long rowline = 0;
        while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
        {
            // The header row (rowline 0 when header is set) carries names, not data.
            if (!(header && rowline == 0))
            {
                ret.Append(fields, inPlace: true);
            }

            ++rowline;
        }
    }

    if (addIndexColumn)
    {
        // Insert through ret.Columns so the returned frame actually contains the index column; inserting
        // into the local list after the frame has been constructed bypasses the frame's column collection.
        // Sizing by the frame's row count also works when the frame has no columns.
        long rowCount = ret.Rows.Count;
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
        for (int i = 0; i < rowCount; i++)
        {
            indexColumn[i] = i;
        }

        ret.Columns.Insert(0, indexColumn);
    }

    return ret;
}
/// <summary>
/// Merges this DataFrame (left) with <paramref name="other"/> (right) using a database-style join on one or more columns.
/// </summary>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumns">Names of the join columns in this DataFrame.</param>
/// <param name="rightJoinColumns">Names of the join columns in <paramref name="other"/>; must have the same length as <paramref name="leftJoinColumns"/>.</param>
/// <param name="leftSuffix">Suffix passed to the duplicate-column-name handling for columns coming from this DataFrame.</param>
/// <param name="rightSuffix">Suffix passed to the duplicate-column-name handling for columns coming from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">The kind of join to perform.</param>
/// <returns>A new DataFrame with the columns of this DataFrame followed by the columns of <paramref name="other"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge(DataFrame other, string[] leftJoinColumns, string[] rightJoinColumns, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The "retained" frame is the one iterated row by row (every row is kept in outer joins);
    // the "supplementary" frame is the one that is hashed and looked up.
    bool isLeftDataFrameRetained;
    switch (joinAlgorithm)
    {
        case JoinAlgorithm.Left:
            isLeftDataFrameRetained = true;
            break;
        case JoinAlgorithm.Right:
            isLeftDataFrameRetained = false;
            break;
        case JoinAlgorithm.Inner:
            // Use as supplementary (for hashing) the dataframe with the smaller row count.
            isLeftDataFrameRetained = Rows.Count > other.Rows.Count;
            break;
        case JoinAlgorithm.FullOuter:
            // Full outer is done in two steps: a LEFT join, then the unmatched rows of the right side are added.
            isLeftDataFrameRetained = true;
            break;
        default:
            throw new NotImplementedException(nameof(joinAlgorithm));
    }

    bool isInner = joinAlgorithm == JoinAlgorithm.Inner;
    bool isFullOuter = joinAlgorithm == JoinAlgorithm.FullOuter;

    DataFrame retainedDataFrame = isLeftDataFrameRetained ? this : other;
    DataFrame supplementaryDataFrame = isLeftDataFrameRetained ? other : this;
    string[] retainedJoinColumns = isLeftDataFrameRetained ? leftJoinColumns : rightJoinColumns;
    string[] supplementaryJoinColumns = isLeftDataFrameRetained ? rightJoinColumns : leftJoinColumns;

    // The intersection (matched supplementary row indices) is only computed for the full outer join.
    HashSet<long> intersection = Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out PrimitiveDataFrameColumn<long> retainedRowIndices, out PrimitiveDataFrameColumn<long> supplementaryRowIndices, isInner, isFullOuter);

    if (isFullOuter)
    {
        // Step 2: add supplementary rows that were not matched in step 1.
        // The join columns are resolved once, outside the row loop.
        DataFrameColumn[] supplementaryColumns = supplementaryJoinColumns.Select(name => supplementaryDataFrame.Columns[name]).ToArray();
        for (long i = 0; i < supplementaryDataFrame.Columns.RowCount; i++)
        {
            // Rows with a null join value are skipped here, as are rows already emitted by step 1.
            if (!IsAnyNullValueInColumns(supplementaryColumns, i) && !intersection.Contains(i))
            {
                retainedRowIndices.Append(null);
                supplementaryRowIndices.Append(i);
            }
        }
    }

    PrimitiveDataFrameColumn<long> mapIndicesLeft = isLeftDataFrameRetained ? retainedRowIndices : supplementaryRowIndices;
    PrimitiveDataFrameColumn<long> mapIndicesRight = isLeftDataFrameRetained ? supplementaryRowIndices : retainedRowIndices;

    DataFrame ret = new DataFrame();

    // Insert columns from left dataframe (this)
    for (int i = 0; i < this.Columns.Count; i++)
    {
        ret.Columns.Insert(i, this.Columns[i].Clone(mapIndicesLeft));
    }

    // Insert columns from right dataframe (other)
    for (int i = 0; i < other.Columns.Count; i++)
    {
        DataFrameColumn column = other.Columns[i].Clone(mapIndicesRight);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}
/// <summary>
/// Reads an implementation of IDataReader into a DataFrame.
/// </summary>
/// <param name="reader">DataReader to be read in</param>
/// <param name="columnNames">column names; when null, names are taken from the reader's schema table ("Column{i}" for blank names)</param>
/// <param name="dataTypes">column types; when null, types are taken from the reader's schema table</param>
/// <param name="numberOfRowsToRead">maximum number of rows to read, or -1 to read all rows</param>
/// <param name="addIndexColumn">add one column with the row index as the first column</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public static DataFrame FromDataReader(IDataReader reader, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, bool addIndexColumn = false)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    DataTable schemaTable = reader.GetSchemaTable();
    int numberOfColumns = schemaTable.Rows.Count;

    if (columnNames == null)
    {
        columnNames = new string[numberOfColumns];
        for (int i = 0; i < numberOfColumns; ++i)
        {
            string columnName = schemaTable.Rows[i]["ColumnName"].ToString();
            columnNames[i] = string.IsNullOrWhiteSpace(columnName) ? $"Column{i}" : columnName;
        }
    }

    var columns = new List<DataFrameColumn>(numberOfColumns);
    for (int i = 0; i < numberOfColumns; ++i)
    {
        // Explicit types win; otherwise fall back to the type reported by the schema table.
        Type kind = dataTypes == null ? (Type)schemaTable.Rows[i]["DataType"] : dataTypes[i];
        columns.Add(CreateColumn(kind, columnNames, i));
    }

    var ret = new DataFrame(columns);

    long rowline = 0;
    while (reader.Read() && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
    {
        ret.Append(GetRecordValues(reader), inPlace: true);
        ++rowline;
    }

    if (addIndexColumn)
    {
        // Insert through ret.Columns so the returned frame actually contains the index column; inserting
        // into the local list after the frame has been constructed bypasses the frame's column collection.
        // Sizing by the frame's row count also works when the schema has no columns.
        long rowCount = ret.Rows.Count;
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
        for (int i = 0; i < rowCount; i++)
        {
            indexColumn[i] = i;
        }

        ret.Columns.Insert(0, indexColumn);
    }

    return ret;

    // Yields the fields of the current record, mapping DBNull to null.
    IEnumerable<object> GetRecordValues(IDataRecord record)
    {
        for (int i = 0; i < record.FieldCount; i++)
        {
            yield return record[i] == DBNull.Value ? null : record[i];
        }
    }
}
// TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes
/// <summary>
/// Merge DataFrames with a database style join on a single column, implemented as a hash join.
/// Null join values are paired with null join values on the other side.
/// </summary>
/// <typeparam name="TKey">Type the join column values are cast to for hashing and lookup.</typeparam>
/// <param name="other">The right-hand DataFrame.</param>
/// <param name="leftJoinColumn">Name of the join column in this DataFrame.</param>
/// <param name="rightJoinColumn">Name of the join column in <paramref name="other"/>.</param>
/// <param name="leftSuffix">Suffix passed to the duplicate-column-name handling for columns coming from this DataFrame.</param>
/// <param name="rightSuffix">Suffix passed to the duplicate-column-name handling for columns coming from <paramref name="other"/>.</param>
/// <param name="joinAlgorithm">The kind of join to perform.</param>
/// <returns>A new DataFrame with the columns of this DataFrame followed by the columns of <paramref name="other"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
/// <exception cref="NotImplementedException"><paramref name="joinAlgorithm"/> is not one of the handled values.</exception>
public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left)
{
    if (other == null)
    {
        throw new ArgumentNullException(nameof(other));
    }

    // The final table size is not known until runtime, so row index pairs are accumulated first
    // and the columns are materialized from them at the end.
    PrimitiveDataFrameColumn<long> leftRowIndices = new PrimitiveDataFrameColumn<long>("LeftIndices");
    PrimitiveDataFrameColumn<long> rightRowIndices = new PrimitiveDataFrameColumn<long>("RightIndices");

    if (joinAlgorithm == JoinAlgorithm.Left)
    {
        // First hash other dataframe on the rightJoinColumn
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        for (long i = 0; i < thisColumn.Length; i++)
        {
            object thisColumnValue = thisColumn[i];
            if (thisColumnValue == null)
            {
                foreach (long row in otherColumnNullIndices)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                }
            }
            else if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                }
            }
            else
            {
                // No match on the right: keep the left row with a null right index.
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Right)
    {
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = thisColumn.GroupColumnValues<TKey>(out HashSet<long> thisColumnNullIndices);

        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        for (long i = 0; i < otherColumn.Length; i++)
        {
            object otherColumnValue = otherColumn[i];
            if (otherColumnValue == null)
            {
                foreach (long thisColumnNullIndex in thisColumnNullIndices)
                {
                    leftRowIndices.Append(thisColumnNullIndex);
                    rightRowIndices.Append(i);
                }
            }
            else if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(row);
                    rightRowIndices.Append(i);
                }
            }
            else
            {
                // No match on the left: keep the right row with a null left index.
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.Inner)
    {
        // Hash the column with the smaller RowCount and probe with the other one.
        bool leftColumnIsSmaller = Rows.Count <= other.Rows.Count;
        DataFrameColumn hashColumn = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn];
        DataFrameColumn probeColumn = leftColumnIsSmaller ? other.Columns[rightJoinColumn] : Columns[leftJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = hashColumn.GroupColumnValues<TKey>(out HashSet<long> smallerDataFrameColumnNullIndices);

        for (long i = 0; i < probeColumn.Length; i++)
        {
            object probeValue = probeColumn[i];
            if (probeValue == null)
            {
                foreach (long nullIndex in smallerDataFrameColumnNullIndices)
                {
                    leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i);
                    rightRowIndices.Append(leftColumnIsSmaller ? i : nullIndex);
                }
            }
            else if (multimap.TryGetValue((TKey)probeValue, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(leftColumnIsSmaller ? row : i);
                    rightRowIndices.Append(leftColumnIsSmaller ? i : row);
                }
            }
        }
    }
    else if (joinAlgorithm == JoinAlgorithm.FullOuter)
    {
        DataFrameColumn otherColumn = other.Columns[rightJoinColumn];
        Dictionary<TKey, ICollection<long>> multimap = otherColumn.GroupColumnValues<TKey>(out HashSet<long> otherColumnNullIndices);

        // Keys that produced at least one matched pair; used to avoid emitting matched right rows twice.
        HashSet<TKey> matchedKeys = new HashSet<TKey>(EqualityComparer<TKey>.Default);

        // Go over the records in this dataframe and match with the dictionary
        DataFrameColumn thisColumn = Columns[leftJoinColumn];
        Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices");
        for (long i = 0; i < thisColumn.Length; i++)
        {
            object thisColumnValue = thisColumn[i];
            if (thisColumnValue == null)
            {
                // Null rows are handled after the non-null rows.
                thisColumnNullIndices.Append(i);
                continue;
            }

            TKey key = (TKey)thisColumnValue;
            if (multimap.TryGetValue(key, out ICollection<long> rowNumbers))
            {
                foreach (long row in rowNumbers)
                {
                    leftRowIndices.Append(i);
                    rightRowIndices.Append(row);
                    matchedKeys.Add(key);
                }
            }
            else
            {
                leftRowIndices.Append(i);
                rightRowIndices.Append(null);
            }
        }

        // Right rows whose (non-null) key never matched a left row.
        for (long i = 0; i < otherColumn.Length; i++)
        {
            object value = otherColumn[i];
            if (value != null && !matchedKeys.Contains((TKey)value))
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(i);
            }
        }

        // Now handle the null rows
        foreach (long? thisColumnNullIndex in thisColumnNullIndices)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(otherColumnNullIndex);
            }

            if (otherColumnNullIndices.Count == 0)
            {
                leftRowIndices.Append(thisColumnNullIndex.Value);
                rightRowIndices.Append(null);
            }
        }

        if (thisColumnNullIndices.Length == 0)
        {
            foreach (long otherColumnNullIndex in otherColumnNullIndices)
            {
                leftRowIndices.Append(null);
                rightRowIndices.Append(otherColumnNullIndex);
            }
        }
    }
    else
    {
        throw new NotImplementedException(nameof(joinAlgorithm));
    }

    DataFrame ret = new DataFrame();

    // Columns from the left dataframe (this) come first.
    for (int i = 0; i < Columns.Count; i++)
    {
        ret.Columns.Insert(i, Columns[i].Clone(leftRowIndices));
    }

    // Then the columns from the right dataframe (other).
    for (int i = 0; i < other.Columns.Count; i++)
    {
        DataFrameColumn column = other.Columns[i].Clone(rightRowIndices);
        SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix);
        ret.Columns.Insert(ret.Columns.Count, column);
    }

    return ret;
}