/*
 * NOTE(review): disabled draft of a join operation, kept for reference. It uses the
 * older DataFrame/DataList names. In the first inner population loop the row map is
 * indexed with the column counter (this.map[i]); that looks like it should be
 * this.map[thisRowIndex] -- confirm before reviving this code.
 *
 * /// <summary>
 * /// Joins this data view to another data view on the given column.
 * /// </summary>
 * /// <param name="other"></param>
 * /// <param name="columnName"></param>
 * /// <returns></returns>
 * public DataFrame Join (DataFrame other, string columnName)
 * {
 *     int thisColumnIndex = this.GetColumnIndex(columnName);
 *     Type thisType = this.columns[thisColumnIndex].StorageType;
 *     int otherColumnIndex = other.GetColumnIndex(columnName);
 *     Type otherType = other.columns[otherColumnIndex].StorageType;
 *
 *     // Form a lookup from the other table
 *     Dictionary<object, int> hash = new Dictionary<object, int>();
 *     for (int otherRowIndex = 0; otherRowIndex < other.Rows.Count; otherRowIndex++)
 *     {
 *         hash[other.columns[otherColumnIndex].GetItem(other.map[otherRowIndex])] = otherRowIndex;
 *     }
 *
 *     // Construct the joined columns
 *     List<DataList> joinedColumns = new List<DataList>();
 *     for (int i = 0; i < this.columns.Count; i++)
 *     {
 *         DataList joinedColumn = DataList.Create(this.columns[i].Name, this.columns[i].StorageType);
 *         joinedColumns.Add(joinedColumn);
 *     }
 *     for (int j = 0; j < other.columns.Count; j++)
 *     {
 *         DataList joinedColumn = DataList.Create(other.columns[j].Name, other.columns[j].StorageType);
 *         joinedColumns.Add(joinedColumn);
 *     }
 *
 *     // Populate the joined columns
 *     for (int thisRowIndex = 0; thisRowIndex < this.map.Count; thisRowIndex++)
 *     {
 *         object thisValue = this.columns[thisColumnIndex].GetItem(this.map[thisRowIndex]);
 *         int otherRowIndex;
 *         if (hash.TryGetValue(thisValue, out otherRowIndex))
 *         {
 *             for (int i = 0; i < this.columns.Count; i++)
 *             {
 *                 joinedColumns[i].AddItem(this.columns[i].GetItem(this.map[i]));
 *             }
 *             for (int j = 0; j < other.columns.Count; j++)
 *             {
 *                 joinedColumns[this.columns.Count + j].AddItem(other.columns[j].GetItem(other.map[otherRowIndex]));
 *             }
 *         }
 *     }
 *
 *     DataFrame result = new DataFrame(joinedColumns);
 *     return (result);
 *
 * }
 */

/// <summary>
/// Appends a column to this frame and records its name in the name-to-index lookup.
/// </summary>
/// <param name="column">The column to append.</param>
/// <exception cref="ArgumentNullException"><paramref name="column"/> is <see langword="null"/>.</exception>
/// <exception cref="DimensionMismatchException">The frame already has at least one column and
/// <paramref name="column"/> is not computed and its entry count differs from the row map's.</exception>
internal void AddColumn(NamedList column)
{
    if (column == null)
    {
        throw new ArgumentNullException(nameof(column));
    }

    bool isFirstColumn = (this.columns.Count == 0);
    if (isFirstColumn)
    {
        // The first column defines the rows: build a row map with one entry per stored item.
        for (int storageIndex = 0; storageIndex < column.Count; storageIndex++)
        {
            this.map.Add(storageIndex);
        }
    }
    else
    {
        // Later columns must line up with the existing rows; a computed column is
        // exempt from the length check.
        bool lengthIsAcceptable = column.IsComputed || column.Count == map.Count;
        if (!lengthIsAcceptable)
        {
            throw new DimensionMismatchException();
        }
    }

    // The new column's index is the current column count, so register the name before appending.
    int newColumnIndex = this.columns.Count;
    this.columnMap.Add(column.Name, newColumnIndex);
    this.columns.Add(column);
}
/// <summary>
/// Initializes a converted column from an underlying column and a row map.
/// </summary>
/// <param name="column">The underlying column; must not be <see langword="null"/> (checked in debug builds only).</param>
/// <param name="map">The row map; must not be <see langword="null"/> (checked in debug builds only).</param>
public ConvertedFrameColumn(NamedList column, List<int> map)
{
    // Both references are stored as given; no copy of the map is made.
    Debug.Assert(column != null);
    Debug.Assert(map != null);

    this.map = map;
    this.column = column;
}
/// <summary>
/// Groups the data by the values in the given column, and computes aggregate quantities for each group.
/// </summary>
/// <param name="groupByColumnName">The name of the column to group by.</param>
/// <param name="aggregator">A function that computes the aggregate quantities.</param>
/// <returns>A new data frame containing the aggregates for each group.</returns>
/// <remarks>
/// <para>The first column of the returned <see cref="FrameTable"/> has the same name as the
/// original <paramref name="groupByColumnName"/> and contains all the distinct
/// values of that column in the original view. There is an additional column for each
/// dictionary entry returned by <paramref name="aggregator"/>, whose name is the returned
/// key and whose values are values returned for each group.</para>
/// <para>The function that computes the aggregate receives a <see cref="FrameView"/> containing
/// all the rows in the group. To produce aggregate results, it can use values in any of
/// the columns. Each invocation of the <paramref name="aggregator"/> must return the same keys
/// and values for the same keys must be of the same type. (Values for different keys may be
/// of different types.) Aggregate column names are taken from the keys and storage types are
/// inferred from the returned values.</para>
/// <para>To produce just one aggregate value, you may find it simpler and more efficient
/// to use the <see cref="GroupBy{T}(string, Func{FrameView, T}, string)"/>
/// overload.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="groupByColumnName"/> or
/// <paramref name="aggregator"/> is <see langword="null"/>.</exception>
public FrameTable GroupBy(string groupByColumnName, Func<FrameView, IReadOnlyDictionary<string, object>> aggregator)
{
    // Validate eagerly, as the single-aggregate overload does. Without this, a null
    // aggregator would only fail later, inside the lazily evaluated group iterator.
    if (groupByColumnName == null)
    {
        throw new ArgumentNullException(nameof(groupByColumnName));
    }
    if (aggregator == null)
    {
        throw new ArgumentNullException(nameof(aggregator));
    }

    // Collect rows into groups.
    int groupByColumnIndex = GetColumnIndex(groupByColumnName);
    NamedList groupByColumn = columns[groupByColumnIndex];
    NullableDictionary<object, List<int>> groups = FindGroups(groupByColumn);

    // Create a column to hold the group values.
    NamedList groupsColumn = NamedList.Create(groupByColumnName, groupByColumn.StorageType);

    // Create an enumerator that feeds the groups into the aggregator and presents them as dictionaries.
    // Nothing runs yet: the aggregator is invoked, and groupsColumn is filled, only as the
    // sequence is enumerated by the dictionary reader below.
    IEnumerable<IReadOnlyDictionary<string, object>> aggregatesEnumerator = GetGroupEnumerator(groups, aggregator, groupsColumn);

    // Column-ify and validate the presented dictionaries.
    List<NamedList> aggregateColumns = DictionaryHelper.ReadDictionaries(aggregatesEnumerator);

    // Collect the results into a frame table.
    FrameTable result = new FrameTable();

    // First column is the group values.
    result.AddColumn(groupsColumn);

    // Remaining columns are aggregate columns.
    foreach (NamedList aggregateColumn in aggregateColumns)
    {
        result.AddColumn(aggregateColumn);
    }

    return result;
}
// Reads comma-separated text into one string column per header cell.
// The first line supplies the column names. Every following line must have the same
// number of cells as the header line, otherwise a FormatException is thrown.
// Empty cells are stored as null and mark the column's adaptor as nullable; every other
// cell is stored as text and also offered to the column's adaptor via TryParse.
// If the reader yields no first line, both out parameters are set to null.
private static void ReadCsvAsStrings(TextReader reader, out NamedList<string>[] columns, out DataAdaptor[] headers)
{
    Debug.Assert(reader != null);

    // The header line carries the column names.
    string headerLine = reader.ReadLine();
    if (headerLine == null)
    {
        columns = null;
        headers = null;
        return;
    }

    List<string> names = CsvHelper.ReadCells(headerLine);
    int width = names.Count;

    // One text column and one adaptor per header cell.
    columns = new NamedList<string>[width];
    headers = new DataAdaptor[width];
    for (int c = 0; c < width; c++)
    {
        columns[c] = new NamedList<string>(names[c]);
        headers[c] = new DataAdaptor();
    }

    // Consume the remaining lines until the reader is exhausted.
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        List<string> cells = CsvHelper.ReadCells(line);
        if (cells.Count != width)
        {
            throw new FormatException();
        }

        for (int c = 0; c < width; c++)
        {
            string cell = cells[c];
            if (!String.IsNullOrEmpty(cell))
            {
                DataAdaptor header = headers[c];
                header.TryParse(cell);
                columns[c].Add(cell);
            }
            else
            {
                headers[c].IsNullable = true;
                columns[c].Add(null);
            }
        }
    }
}
/// <summary>
/// Creates a new frame table from a file of comma-separated values.
/// </summary>
/// <param name="reader">A reader positioned at the beginning of the file.</param>
/// <returns>A new data frame with data from the file.</returns>
/// <remarks>
/// <para>The column names are taken from the first line of the file.</para>
/// <para>The storage type of each column is inferred from the types of objects
/// encountered as the frame table is constructed.</para>
/// <para>If the reader yields no lines at all, an empty frame table (no columns) is returned.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is <see langword="null"/>.</exception>
public static FrameTable FromCsv(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    NamedList<string>[] textColumns;
    DataAdaptor[] headers;
    ReadCsvAsStrings(reader, out textColumns, out headers);

    // ReadCsvAsStrings reports completely empty input by setting both arrays to null.
    // Dereferencing them would throw a NullReferenceException, so return an empty table.
    if (textColumns == null || headers == null)
    {
        return new FrameTable();
    }

    NamedList[] columns = new NamedList[headers.Length];
    for (int columnIndex = 0; columnIndex < columns.Length; columnIndex++)
    {
        DataAdaptor header = headers[columnIndex];
        if (header.TypeCandidates.Count == 0)
        {
            // No candidate type remains for this column, so keep it as text.
            columns[columnIndex] = textColumns[columnIndex];
        }
        else
        {
            // Use the first remaining candidate to build typed storage and parse each cell into it.
            TypeParser adaptor = header.TypeCandidates.First.Value;
            NamedList column = adaptor.CreateStorage(textColumns[columnIndex].Name, header.IsNullable);
            foreach (string textValue in textColumns[columnIndex])
            {
                if (textValue == null)
                {
                    column.AddItem(null);
                }
                else
                {
                    object value = adaptor.Parse(textValue);
                    column.AddItem(value);
                }
            }
            columns[columnIndex] = column;
        }
    }

    FrameTable frame = new FrameTable(columns);
    return frame;
}
// Buckets the rows of this view by their value in the given column. Shared by both
// GroupBy overloads. Each distinct value maps to the list of storage indexes (the row
// map's entries, not positions in the map) of the rows that hold it, in row-map order.
private NullableDictionary<object, List<int>> FindGroups(NamedList groupByColumn)
{
    NullableDictionary<object, List<int>> buckets = new NullableDictionary<object, List<int>>();

    foreach (int storageIndex in this.map)
    {
        object key = groupByColumn.GetItem(storageIndex);

        List<int> bucket;
        bool known = buckets.TryGetValue(key, out bucket);
        if (!known)
        {
            // First row seen with this value: start a new bucket for it.
            bucket = new List<int>();
            buckets.Add(key, bucket);
        }

        bucket.Add(storageIndex);
    }

    return buckets;
}
/// <summary>
/// Sorts the rows by the values in the given column, in the given direction.
/// </summary>
/// <param name="columnName">The name of the column to sort by.</param>
/// <param name="order">The direction of the ordering.</param>
/// <returns>A new view, with rows sorted by the values in the given column.</returns>
/// <remarks>
/// <para><see langword="null"/> values are supported and are ordered before all other values.</para>
/// <para>The type of data in the column must implement <see cref="IComparable"/>.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="columnName"/> is <see langword="null"/>.</exception>
/// <exception cref="IndexOutOfRangeException"><paramref name="columnName"/> is not the name of a column in the view.</exception>
/// <exception cref="InvalidCastException">The type of data in the column is not <see cref="IComparable"/>.</exception>
public FrameView OrderBy(string columnName, SortOrder order)
{
    if (columnName == null)
    {
        throw new ArgumentNullException(nameof(columnName));
    }

    NamedList column = columns[GetColumnIndex(columnName)];

    // Pick the comparison up front. Any order other than ascending compares with the
    // operands swapped, which reverses the ordering.
    Comparison<int> compareRows;
    if (order == SortOrder.Ascending)
    {
        compareRows = (a, b) => NullableComparer((IComparable)column.GetItem(a), (IComparable)column.GetItem(b));
    }
    else
    {
        compareRows = (a, b) => NullableComparer((IComparable)column.GetItem(b), (IComparable)column.GetItem(a));
    }

    // Sort a copy of the row map; this view's own map is left untouched.
    List<int> sortedMap = new List<int>(map);
    sortedMap.Sort(compareRows);

    return new FrameView(this.columns, sortedMap);
}
/// <summary>
/// Groups the data by the values in the given column, and computes the given aggregate quantity for each group.
/// </summary>
/// <typeparam name="T">The type of the aggregate output.</typeparam>
/// <param name="groupByColumnName">The name of the column to group by.</param>
/// <param name="aggregator">A function that computes the aggregate quantity.</param>
/// <param name="aggregateColumnName">The name of the column for the aggregate output.</param>
/// <returns>A new data frame containing the requested aggregate values for each group.</returns>
/// <remarks>
/// <para>The function that computes the aggregate receives a <see cref="FrameView"/> containing
/// all the rows in the group. To produce an aggregate result, it can use values in any of
/// the columns.</para>
/// <para>To produce more than one aggregate value, use <see cref="GroupBy(string, Func{FrameView, IReadOnlyDictionary{string, object}})"/>.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="groupByColumnName"/>, <paramref name="aggregator"/>,
/// or <paramref name="aggregateColumnName"/> is <see langword="null"/>.</exception>
public FrameTable GroupBy<T>(string groupByColumnName, Func<FrameView, T> aggregator, string aggregateColumnName)
{
    if (groupByColumnName == null)
    {
        throw new ArgumentNullException(nameof(groupByColumnName));
    }
    if (aggregator == null)
    {
        throw new ArgumentNullException(nameof(aggregator));
    }
    if (aggregateColumnName == null)
    {
        throw new ArgumentNullException(nameof(aggregateColumnName));
    }

    // Bucket the rows by their value in the grouping column.
    NamedList sourceColumn = columns[GetColumnIndex(groupByColumnName)];
    NullableDictionary<object, List<int>> buckets = FindGroups(sourceColumn);

    // Output columns: one for the group keys, one for the aggregate values.
    NamedList keyColumn = NamedList.Create(groupByColumnName, sourceColumn.StorageType);
    NamedList<T> valueColumn = new NamedList<T>(aggregateColumnName);

    foreach (KeyValuePair<object, List<int>> bucket in buckets)
    {
        // Present the group's rows as a view, aggregate it, then record the value and its key.
        FrameView groupView = new FrameView(this.columns, bucket.Value);
        T aggregate = aggregator(groupView);
        valueColumn.AddItem(aggregate);
        keyColumn.AddItem(bucket.Key);
    }

    return new FrameTable(keyColumn, valueColumn);
}
/// <summary>
/// Adds a new row of data to the data frame.
/// </summary>
/// <param name="values">A dictionary that maps the existing column names to the cell values for the new row.</param>
/// <remarks>
/// <para>Computed columns are skipped; <paramref name="values"/> needs an entry for every other column.
/// All entries are looked up before any column is modified, so a missing entry leaves the
/// frame unchanged.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="values"/> is <see langword="null"/>.</exception>
/// <exception cref="KeyNotFoundException"><paramref name="values"/> has no entry for one of the non-computed columns.</exception>
/// <exception cref="InvalidOperationException">A column stored the new item at an index other than the current row count.</exception>
public void AddRow(IReadOnlyDictionary<string, object> values)
{
    if (values == null)
    {
        throw new ArgumentNullException(nameof(values));
    }

    // First pass: resolve every required cell before mutating anything. Appending while
    // looking up would leave earlier columns one item longer than later ones if a key
    // turned out to be missing part-way through.
    object[] cells = new object[columns.Count];
    for (int columnIndex = 0; columnIndex < columns.Count; columnIndex++)
    {
        NamedList column = columns[columnIndex];
        if (column.IsComputed)
        {
            continue;
        }

        object cell;
        if (!values.TryGetValue(column.Name, out cell))
        {
            throw new KeyNotFoundException($"No value was supplied for column '{column.Name}'.");
        }
        cells[columnIndex] = cell;
    }

    // Second pass: append the resolved cells to their columns.
    int rowCount = map.Count;
    for (int columnIndex = 0; columnIndex < columns.Count; columnIndex++)
    {
        NamedList column = columns[columnIndex];
        if (column.IsComputed)
        {
            continue;
        }

        // Each column is expected to store the new item at index rowCount.
        int rowIndex = column.AddItem(cells[columnIndex]);
        if (rowIndex != rowCount)
        {
            throw new InvalidOperationException();
        }
    }

    map.Add(rowCount);
}
/// <summary>
/// Converts a sequence of row dictionaries into a list of typed columns.
/// </summary>
/// <param name="dictionaries">The rows; each maps column names to cell values.</param>
/// <returns>One column per key of the first dictionary, or an empty list if the sequence is empty.</returns>
/// <remarks>
/// <para>Column names come from the keys of the first dictionary. Each column's storage type is
/// the runtime type of the first non-null value seen for it, made nullable if it is a value type
/// and a null was also seen. A column with no non-null values becomes an object column.</para>
/// </remarks>
/// <exception cref="InvalidOperationException">A dictionary has a different number of entries than the first one.</exception>
public static List<NamedList> ReadDictionaries(IEnumerable<IReadOnlyDictionary<string, object>> dictionaries)
{
    Debug.Assert(dictionaries != null);

    // Iterate through the dictionaries, creating header objects that contain the un-cast values
    // and some information about them.
    List<DictionaryColumn> headers = null;
    foreach (IReadOnlyDictionary<string, object> dictionary in dictionaries)
    {
        // From the first row, create the headers list based on key names.
        if (headers == null)
        {
            headers = new List<DictionaryColumn>(dictionary.Count);
            foreach (string key in dictionary.Keys)
            {
                DictionaryColumn header = new DictionaryColumn()
                {
                    Name = key,
                    IsNullable = false,
                    Type = null,
                    Data = new List<object>()
                };
                headers.Add(header);
            }
        }

        if (dictionary.Count != headers.Count)
        {
            throw new InvalidOperationException();
        }

        // For all rows, check for null, record the type if we haven't found it yet, and store the value.
        for (int i = 0; i < headers.Count; i++)
        {
            DictionaryColumn header = headers[i];
            object value = dictionary[header.Name];
            if (value == null)
            {
                header.IsNullable = true;
            }
            else if (header.Type == null)
            {
                header.Type = value.GetType();
            }
            header.Data.Add(value);
        }
    }

    // An empty sequence never creates the headers list. There are no columns to build,
    // so return an empty list rather than dereferencing null below.
    if (headers == null)
    {
        return new List<NamedList>();
    }

    // Arrange the columns into named lists of the appropriate type.
    List<NamedList> columns = new List<NamedList>(headers.Count);
    foreach (DictionaryColumn header in headers)
    {
        NamedList column;
        if (header.Type == null)
        {
            // If no non-null value was ever found, we can't infer a type, so just make an object-column.
            column = new NamedList<object>(header.Name, header.Data);
        }
        else
        {
            // Based on null-ability and observed type, create the appropriate storage.
            Type type = header.Type;
            if (header.IsNullable && type.GetTypeInfo().IsValueType)
            {
                type = typeof(Nullable<>).MakeGenericType(type);
            }
            column = NamedList.Create(header.Name, type);

            // Copy the objects into the storage, which will cast them to the storage type.
            foreach (object value in header.Data)
            {
                column.AddItem(value);
            }
        }
        columns.Add(column);
    }

    Debug.Assert(columns.Count == headers.Count);
    return columns;
}
// Adapts the group dictionary into a lazy sequence of aggregate dictionaries that can be
// handed to the shared dictionary parser. For each group, in enumeration order, it builds
// a view over the group's rows, runs the aggregator on it, and yields the result.
// Side effect: each group's key is appended to groupsColumn. The append happens after the
// yield, so it runs only when the consumer asks for the next element.
// This is tightly coupled to the GroupBy overload that calls it. It is a separate method
// rather than a lambda because lambdas cannot be iterators, and an iterator is what the
// shared dictionary parsing logic consumes.
private IEnumerable<IReadOnlyDictionary<string, object>> GetGroupEnumerator(NullableDictionary<object, List<int>> groups, Func<FrameView, IReadOnlyDictionary<string, object>> aggregator, NamedList groupsColumn)
{
    foreach (KeyValuePair<object, List<int>> entry in groups)
    {
        FrameView groupView = new FrameView(this.columns, entry.Value);
        IReadOnlyDictionary<string, object> aggregates = aggregator(groupView);

        yield return aggregates;

        // Reached only after the consumer resumes the iterator.
        groupsColumn.AddItem(entry.Key);
    }
}