// Loads one worksheet of the workbook into a data table whose name is the sheet name.
// Returns null when no cell values were collected from the sheet.
private static MutableDataTable ReadSheet(WorkbookPart wbPart, Sheet sheet)
{
    string sheetName = sheet.Name.Value;

    // Resolve the worksheet part that backs this sheet entry.
    WorksheetPart worksheetPart = (WorksheetPart)(wbPart.GetPartById(sheet.Id));

    // Gather the text of every cell, keyed by (row, column) as parsed from the cell reference.
    Dictionary2d<int, int, string> grid = new Dictionary2d<int, int, string>();
    foreach (Cell cell in worksheetPart.Worksheet.Descendants<Cell>())
    {
        var text = CellToText(wbPart, cell);

        // ParseRef yields (column, row); the grid is indexed row-first.
        var position = ParseRef(cell.CellReference);
        grid[position.Item2, position.Item1] = text;
    }

    if (grid.Count <= 0)
    {
        return null;
    }

    MutableDataTable table = ToTable(grid);
    table.Name = sheetName;
    return table;
}
/// <summary>
/// Sorts the table in place, ordering every row by the values of a single column.
/// </summary>
/// <param name="dt">table to sort</param>
/// <param name="columnName">name of the column to sort on; the column lookup is made with throwOnMissing set</param>
/// <param name="comparer">comparer applied to the sort column's values</param>
/// <exception cref="ArgumentNullException">comparer is null</exception>
public static void Sort(this MutableDataTable dt, string columnName, IComparer<string> comparer)
{
    if (comparer == null)
    {
        throw new ArgumentNullException("comparer");
    }

    var sortColumn = dt.GetColumn(columnName, throwOnMissing: true);
    int rowCount = sortColumn.Values.Length;

    // Begin with the identity permutation. Array.Sort reorders it in lockstep with the keys,
    // so afterwards permutation[k] is the original index of the row now sitting at position k.
    int[] permutation = new int[rowCount];
    for (int k = 0; k < rowCount; k++)
    {
        permutation[k] = k;
    }

    Array.Sort(sortColumn.Values, permutation, comparer);

    // Apply the same permutation to every remaining column so rows stay aligned.
    foreach (var other in dt.Columns)
    {
        if (other != sortColumn)
        {
            string[] reordered = new string[rowCount];
            for (int k = 0; k < rowCount; k++)
            {
                reordered[k] = other.Values[permutation[k]];
            }

            other.Values = reordered;
        }
    }
}
// Convert a 2d dict into a 2d data table.
// TKey1 is rows, TKey2 is columns.
// Data table column names are obtained from the TKey2 key values.
// Column 0 ("row name") holds the TKey1 key of each row.
// A cell whose value is null is written as an empty string rather than throwing,
// matching how the int-keyed ToTable overload treats null cells.
internal static MutableDataTable ToTable<TKey1, TKey2, TValue>(Dictionary2d<TKey1, TKey2, TValue> dict)
{
    MutableDataTable d = new MutableDataTable();

    var rows = dict.Key1;
    int count = rows.Count();

    // Set columns: one leading column for the row key, then one per TKey2 value.
    var columns = dict.Key2.ToArray();
    {
        Column[] cs = new Column[columns.Length + 1];
        cs[0] = new Column("row name", count);
        for (int ic = 0; ic < columns.Length; ic++)
        {
            cs[ic + 1] = new Column(columns[ic].ToString(), count);
        }

        d.Columns = cs;
    }

    // Add rows
    int i = 0;
    foreach (var row in rows)
    {
        d.Columns[0].Values[i] = row.ToString();
        for (int ic = 0; ic < columns.Length; ic++)
        {
            // NOTE(review): this guards against a null value coming back from the indexer;
            // whether the indexer returns default or throws for an absent (row, column) pair
            // is decided by Dictionary2d and is not visible here.
            var value = dict[row, columns[ic]];
            d.Columns[ic + 1].Values[i] = (value == null) ? string.Empty : value.ToString();
        }

        i++;
    }

    return d;
}
// Reads a delimited text file (CSV-like) into an in-memory table whose Name is the filename.
// Supports quotes to escape commas.
// separator, fAllowMismatch and defaultColumns are forwarded unchanged to ReadArray.
public static MutableDataTable Read(string filename, char separator = '\0', bool fAllowMismatch = false, string[] defaultColumns = null)
{
    MutableDataTable table = ReadArray(ReadAllLines(filename), separator, fAllowMismatch, defaultColumns);
    table.Name = filename;
    return table;
}
// Binds this row object to its owning table and to a row index within that table.
// Debug builds assert that the parent is non-null and that the index lies in [0, NumRows).
public RowInMemory(MutableDataTable parent, int row)
{
    m_parent = parent;
    m_row = row;

    Debug.Assert(parent != null);
    Debug.Assert(0 <= row && row < m_parent.NumRows);
}
// Copies the value of every column of table d at the given row into values,
// writing consecutively starting at position index.
static void CopyRowIntoArray(string[] values, int index, MutableDataTable d, int row)
{
    int target = index;
    foreach (var column in d.Columns)
    {
        values[target] = column.Values[row];
        target++;
    }
}
// Reads a delimited text file (CSV-like) into an in-memory table whose Name is the filename.
// Supports quotes to escape commas.
// separator and fAllowMismatch are forwarded unchanged to ReadArray.
public static MutableDataTable Read(string filename, char separator = '\0', bool fAllowMismatch = false)
{
    MutableDataTable table = ReadArray(File.ReadAllLines(filename), separator, fAllowMismatch);
    table.Name = filename;
    return table;
}
// Builds a dictionary from two named columns of the table: keyName supplies the keys,
// valueName supplies the values. Names are resolved to column indices and the work is
// delegated to the index-based ToDict overload.
// NOTE(review): an earlier comment here said "All strings become upper case (for comparison)";
// nothing in this method upper-cases anything - confirm against GetColumnIndexFromName / Convert.
// $$$ Should this be on DataTable?
public static Dictionary<TKey, TValue> ToDict<TKey, TValue>(MutableDataTable table, string keyName, string valueName)
{
    return ToDict<TKey, TValue>(
        table,
        GetColumnIndexFromName(table.ColumnNames, keyName),
        GetColumnIndexFromName(table.ColumnNames, valueName));
}
// Builds a dictionary from two columns of the table, identified by column index:
// cKey is the column holding keys, cVal the column holding values.
// Each cell is converted to the requested type via Convert<T>.
// When the same key occurs on several rows, the last row wins.
public static Dictionary<TKey, TValue> ToDict<TKey, TValue>(MutableDataTable table, int cKey, int cVal)
{
    var result = new Dictionary<TKey, TValue>();

    int r = 0;
    while (r < table.NumRows)
    {
        TKey key = Convert<TKey>(table.Columns[cKey].Values[r]);
        TValue value = Convert<TValue>(table.Columns[cVal].Values[r]);
        result[key] = value;
        r++;
    }

    return result;
}
/// <summary>
/// Opens the file as a streaming table and returns an in-memory sample of its first topN rows.
/// </summary>
/// <param name="builder">ignored (only asserted non-null in debug builds)</param>
/// <param name="filename">filename of the table to load; passed to StreamingDataTable</param>
/// <param name="topN">number of leading rows to sample</param>
/// <returns>an in-memory table produced by Analyze.SampleTopN</returns>
/// <exception cref="ArgumentNullException">filename is null</exception>
public static MutableDataTable ReadSampleTopN(this DataTableBuilder builder, string filename, int topN = 100)
{
    Debug.Assert(builder != null);

    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }

    DataTable streamed = new StreamingDataTable(filename);
    return Analyze.SampleTopN(streamed, topN);
}
/// <summary>
/// Produces a summary table with one row per source column: the column name, the number of
/// histogram entries for that column, and then up to N (value, occurrence) pairs taken from
/// the start of the histogram.
/// </summary>
/// <remarks>
/// The histogram comes from AsHistogram; presumably it holds one entry per distinct value,
/// most frequent first - confirm against AsHistogram. Cells for pairs beyond the histogram
/// length are left unassigned.
/// </remarks>
/// <param name="table">source table</param>
/// <param name="N">number of top occurrences to include in the summary table</param>
/// <returns>a summary table</returns>
/// <exception cref="ArgumentOutOfRangeException">N is negative</exception>
public static MutableDataTable GetColumnValueCounts(DataTable table, int N)
{
    if (N < 0)
    {
        throw new ArgumentOutOfRangeException("N");
    }

    string[] names = table.ColumnNames.ToArray();
    int rowCount = names.Length;

    // Layout: two fixed columns, then a (value, occurrence) column pair for each of the N slots.
    const int fixedColumns = 2;
    Column nameColumn = new Column("column name", rowCount);
    Column countColumn = new Column("count", rowCount);

    Column[] layout = new Column[fixedColumns + N * 2];
    layout[0] = nameColumn;
    layout[1] = countColumn;
    for (int slot = 0; slot < N; slot++)
    {
        int valueIndex = fixedColumns + slot * 2;
        layout[valueIndex] = new Column("Top Value " + slot, rowCount);
        layout[valueIndex + 1] = new Column("Top Occurrence " + slot, rowCount);
    }

    MutableDataTable summary = new MutableDataTable();
    summary.Columns = layout;

    // One summary row per source column.
    for (int columnId = 0; columnId < names.Length; columnId++)
    {
        Tuple<string, int>[] hist = AsHistogram(table, columnId);

        nameColumn.Values[columnId] = names[columnId];
        countColumn.Values[columnId] = hist.Length.ToString();

        for (int slot = 0; slot < N && slot < hist.Length; slot++)
        {
            int valueIndex = fixedColumns + slot * 2;
            layout[valueIndex].Values[columnId] = hist[slot].Item1;
            layout[valueIndex + 1].Values[columnId] = hist[slot].Item2.ToString();
        }
    }

    return summary;
}
// Builds a two-column table from an array of pairs: Item1 goes into the column named name1,
// Item2 into the column named name2, one row per pair in array order.
// A null item is written as an empty string instead of throwing, matching how the
// Dictionary2d-based ToTable overload treats null cells.
// $$$ Merge with the more dynamic ToTable.
internal static MutableDataTable ToTable<T1, T2>(Tuple<T1, T2>[] a, string name1, string name2)
{
    MutableDataTable d = new MutableDataTable();

    int count = a.Length;
    Column cKeys = new Column(name1, count);
    Column cVals = new Column(name2, count);
    d.Columns = new[] { cKeys, cVals };

    for (int i = 0; i < count; i++)
    {
        Tuple<T1, T2> kv = a[i];
        cKeys.Values[i] = (kv.Item1 == null) ? string.Empty : kv.Item1.ToString();
        cVals.Values[i] = (kv.Item2 == null) ? string.Empty : kv.Item2.ToString();
    }

    return d;
}
// Converts an int-keyed 2d dict (first key = row, second key = column) into a data table,
// treating one row as the header.
// Column names are read from row key 0; the first row in enumeration order of dict.Key1 is
// skipped as the header. NOTE(review): this assumes the first enumerated row is row 0 - confirm.
// Null cells (including null header cells) become empty strings.
private static MutableDataTable ToTable<TValue>(Dictionary2d<int, int, TValue> dict)
{
    MutableDataTable table = new MutableDataTable();

    var rows = dict.Key1;

    // One of the rows is the header, so it does not count as data.
    int dataRowCount = rows.Count() - 1;

    // Set columns, named after the header cells.
    var columnKeys = dict.Key2.ToArray();
    Column[] tableColumns = new Column[columnKeys.Length];
    for (int ic = 0; ic < columnKeys.Length; ic++)
    {
        // fix for empty column name
        var header = dict[0, columnKeys[ic]];
        string columnName = (header != null) ? header.ToString() : string.Empty;
        tableColumns[ic] = new Column(columnName, dataRowCount);
    }

    table.Columns = tableColumns;

    // Add rows
    int seen = 0;
    foreach (var row in rows)
    {
        bool isHeader = (seen == 0);
        seen++;
        if (isHeader)
        {
            continue; // skip 1st row, header
        }

        // seen is now the 1-based ordinal of this row; the header took ordinal 1.
        int target = seen - 2;
        for (int ic = 0; ic < columnKeys.Length; ic++)
        {
            var cell = dict[row, columnKeys[ic]];
            table.Columns[ic].Values[target] = (cell != null) ? cell.ToString() : string.Empty;
        }
    }

    return table;
}
/// <summary>
/// Reads an .xlsx workbook and returns the first sheet that yields a table (sheets for which
/// ReadSheet returns null are skipped). This only supports .xlsx files (Office 2007, with open xml standard)
/// and not .xls files (which had a closed file format that required COM).
/// This is safe to use on a server.
/// </summary>
/// <param name="builder">ignored</param>
/// <param name="input">stream to read file from</param>
/// <returns>table for the first non-empty sheet in the workbook. Table's name is the sheet name.</returns>
/// <exception cref="InvalidOperationException">no sheet in the workbook produced a table</exception>
public static MutableDataTable ReadExcel(this DataTableBuilder builder, Stream input)
{
    MutableDataTable found = null;

    // See http://msdn.microsoft.com/en-us/library/hh298534.aspx
    using (SpreadsheetDocument document = SpreadsheetDocument.Open(input, isEditable: false))
    {
        // Retrieve a reference to the workbook part.
        WorkbookPart workbook = document.WorkbookPart;

        // Walk the sheets in order and stop at the first one that produces a table.
        foreach (Sheet candidate in workbook.Workbook.Descendants<Sheet>())
        {
            found = ReadSheet(workbook, candidate);
            if (found != null)
            {
                break;
            }
        }
    }

    if (found == null)
    {
        throw new InvalidOperationException("Excel file is either empty or does not have a valid table in it.");
    }

    return found;
}
/// <summary>
/// Create an in-memory table with 2 columns (key and value), where each row is a KeyValuePair from the dictionary.
/// Rows follow the dictionary's enumeration order. A null value is stored as an empty string.
/// </summary>
/// <typeparam name="TKey">TKey of dictionary</typeparam>
/// <typeparam name="TValue">TValue of dictionary</typeparam>
/// <param name="builder">ignored (only asserted non-null in debug builds)</param>
/// <param name="dict">source of data</param>
/// <param name="keyName">name for column that holds the dictionary keys</param>
/// <param name="valName">name for column that holds the dictionary values</param>
/// <returns>an in-memory table</returns>
/// <exception cref="ArgumentNullException">dict is null</exception>
public static MutableDataTable FromDictionary<TKey, TValue>(this DataTableBuilder builder, IDictionary<TKey, TValue> dict, string keyName, string valName)
{
    Debug.Assert(builder != null);

    if (dict == null)
    {
        throw new ArgumentNullException("dict");
    }

    MutableDataTable d = new MutableDataTable();

    int count = dict.Count;
    Column cKeys = new Column(keyName, count);
    Column cVals = new Column(valName, count);
    d.Columns = new Column[] { cKeys, cVals };

    int i = 0;
    foreach (var kv in dict)
    {
        cKeys.Values[i] = kv.Key.ToString();

        // Dictionary values may legitimately be null; calling ToString() on them would throw.
        cVals.Values[i] = (kv.Value == null) ? string.Empty : kv.Value.ToString();
        i++;
    }

    return d;
}
/// <summary>
/// Reads every sheet in the excel workbook and returns the resulting tables in sheet order.
/// Sheets for which ReadSheet returns null (empty sheets) are left out.
/// </summary>
/// <param name="builder">placeholder</param>
/// <param name="input">stream to read from</param>
/// <returns>Ordered collection of tables corresponding to non-empty sheets. Table name corresponds to sheet name.</returns>
public static IList<MutableDataTable> ReadExcelAllSheets(this DataTableBuilder builder, Stream input)
{
    var tables = new List<MutableDataTable>();

    // See http://msdn.microsoft.com/en-us/library/hh298534.aspx
    using (SpreadsheetDocument document = SpreadsheetDocument.Open(input, isEditable: false))
    {
        // Retrieve a reference to the workbook part.
        WorkbookPart workbook = document.WorkbookPart;

        // Visit each sheet in turn, keeping only those that produce a table.
        foreach (Sheet sheet in workbook.Workbook.Descendants<Sheet>())
        {
            MutableDataTable table = ReadSheet(workbook, sheet);
            if (table == null)
            {
                continue;
            }

            tables.Add(table);
        }
    }

    return tables;
}
// Builds a table by flattening each item of the sequence into one row of strings.
// Column names are supplied by the caller; every flattened item must produce exactly
// columnNames.Length values, otherwise the Assert call reports the offending row.
// $$$ Need way to guarantee that flatten order matches column names.
// $$$ How to infer column names? Flatten doesn't have a definitive order.
// If we had more smart collections, we could infer.
public static MutableDataTable ToTableX<T>(IEnumerable<T> a, params string[] columnNames)
{
    // Materialize once so the row count and the row data come from the same pass.
    List<T> items = a.ToList();
    int rowCount = items.Count;
    int width = columnNames.Length;

    // Alloc columns
    Column[] tableColumns = new Column[width];
    for (int c = 0; c < width; c++)
    {
        tableColumns[c] = new Column(columnNames[c], rowCount);
    }

    // Fill in rows
    for (int row = 0; row < rowCount; row++)
    {
        string[] flattened = Flatten(items[row]);
        Assert(flattened.Length == columnNames.Length, string.Format("Row {0} does not have the expected number of values ({1})", row + 1, columnNames.Length));

        for (int c = 0; c < width; c++)
        {
            tableColumns[c].Values[row] = flattened[c];
        }
    }

    MutableDataTable table = new MutableDataTable();
    table.Columns = tableColumns;
    return table;
}
// Builds a table by flattening each item of the sequence into one row of strings.
// Column names are supplied by the caller; every flattened item is asserted to produce
// exactly columnNames.Length values.
// $$$ Need way to guarantee that flatten order matches column names.
public static MutableDataTable ToTableX<T>(IEnumerable<T> a, params string[] columnNames)
{
    // $$$ How to infer column names?
    // Flatten doesn't have a definitive order.
    // If we had more smart collections, we could infer.

    // Materialize the sequence once. Counting and then iterating the raw IEnumerable would
    // enumerate it twice, which re-runs deferred queries and can size the columns from a
    // different pass than the one that fills them.
    List<T> items = a.ToList();
    int count = items.Count;

    MutableDataTable d = new MutableDataTable();

    // Alloc columns
    Column[] cs = new Column[columnNames.Length];
    for (int i = 0; i < columnNames.Length; i++)
    {
        cs[i] = new Column(columnNames[i], count);
    }

    // Fill in rows
    int row = 0;
    foreach (T item in items)
    {
        string[] values = Flatten(item);
        Utility.Assert(values.Length == columnNames.Length);
        for (int i = 0; i < columnNames.Length; i++)
        {
            cs[i].Values[row] = values[i];
        }

        row++;
    }

    d.Columns = cs;
    return d;
}
// Read the excel sheet from the workbook and return as a data table named after the sheet.
// Return null if sheet is empty (no cell values were collected).
private static MutableDataTable ReadSheet(WorkbookPart wbPart, Sheet sheet)
{
    string sheetName = sheet.Name.Value;

    // Retrieve a reference to the worksheet part.
    WorksheetPart wsPart = (WorksheetPart)(wbPart.GetPartById(sheet.Id));
    IEnumerable<Cell> cells = wsPart.Worksheet.Descendants<Cell>();

    Dictionary2d<int, int, string> vals = new Dictionary2d<int, int, string>();

    // Retrieve a cached list of shared strings of this workbook to be used by all cell references.
    // This is null when the workbook has no shared string table part (FirstOrDefault on an empty sequence).
    IList<OpenXmlElement> sharedStrings = wbPart.GetPartsOfType<SharedStringTablePart>()
        .Select(sharedString => sharedString.SharedStringTable.OfType<OpenXmlElement>().ToList())
        .FirstOrDefault();

    foreach (Cell c in cells)
    {
        var val = CellToText(wbPart, c, sharedStrings);
        var loc = c.CellReference;

        // ParseRef yields (column, row); the dictionary is indexed row-first.
        var loc2 = ParseRef(loc);
        int columnId = loc2.Item1;
        int rowId = loc2.Item2;
        vals[rowId, columnId] = val;
    }

    // Release the cached copy; guard against the no-shared-strings case, which previously
    // threw a NullReferenceException here.
    if (sharedStrings != null)
    {
        sharedStrings.Clear();
    }

    if (vals.Count > 0)
    {
        MutableDataTable dt = ToTable(vals);
        dt.Name = sheetName;
        return dt;
    }

    return null;
}
// Copies any DataTable into a new in-memory MutableDataTable with the same column names.
// The rows are enumerated twice: once to count them (so column arrays can be sized up front)
// and once to copy the values.
internal static MutableDataTable ToMutable(DataTable table)
{
    // Take a pass through upfront so we know how large to allocate all the column arrays
    int rowCount = table.Rows.Count();

    string[] names = table.ColumnNames.ToArray();
    Column[] copied = new Column[names.Length];
    for (int c = 0; c < names.Length; c++)
    {
        copied[c] = new Column(names[c], rowCount);
    }

    int target = 0;
    foreach (Row source in table.Rows)
    {
        var cells = source.Values;
        for (int c = 0; c < cells.Count; c++)
        {
            copied[c].Values[target] = cells[c];
        }

        target++;
    }

    MutableDataTable result = new MutableDataTable();
    result.Columns = copied;
    return result;
}
// Parses delimited lines into a table. The first line is the header and supplies the column names.
// A separator of '\0' means "guess it from the header row".
// Rows with fewer fields than the header are padded with empty strings.
// Rows with at least as many fields as the header are asserted to match the header width
// unless fAllowMismatch is set; only the first numColumns fields are stored.
private static MutableDataTable ReadArray(IList<string> lines, char separator, bool fAllowMismatch = false)
{
    if (separator == '\0')
    {
        separator = GuessSeparateFromHeaderRow(lines[0]);
    }

    // First row is a header
    int dataRowCount = lines.Count - 1;
    string[] names = split(lines[0], separator);
    int numColumns = names.Length;

    var columns = new Column[numColumns];
    for (int c = 0; c < numColumns; c++)
    {
        columns[c] = new Column(names[c], dataRowCount);
    }

    // Parse each row into data set
    for (int lineIndex = 1; lineIndex < lines.Count; lineIndex++)
    {
        int row = lineIndex - 1;
        string[] parts = split(lines[lineIndex], separator);

        // Short rows are always tolerated (Excel drops trailing separators), so only
        // rows that are not short get validated.
        bool isShort = parts.Length < numColumns;
        if (!isShort && !fAllowMismatch)
        {
            // If mismatch allowed, then treat this row as garbage rather
            // than throw an exception
            Utility.Assert(parts.Length == names.Length);
        }

        for (int c = 0; c < numColumns; c++)
        {
            columns[c].Values[row] = (c < parts.Length) ? parts[c] : String.Empty;
        }
    }

    MutableDataTable data = new MutableDataTable();
    data.Columns = columns;
    return data;
}
// Builds a dictionary from two named columns of the table by delegating to Utility.ToDict.
// NOTE(review): an earlier comment here said "All strings become upper case (for comparison)";
// nothing visible in this wrapper does that - confirm against Utility.ToDict.
public static IDictionary<TKey, TValue> ToDictionary<TKey, TValue>(this MutableDataTable table, string keyName, string valueName)
{
    IDictionary<TKey, TValue> lookup = Utility.ToDict<TKey, TValue>(table, keyName, valueName);
    return lookup;
}
// Builds a dictionary from the table's first two columns:
// column 0 supplies the keys and column 1 the values.
public static Dictionary<TKey, TValue> ToDict<TKey, TValue>(MutableDataTable table)
{
    const int keyColumn = 0;
    const int valueColumn = 1;
    return ToDict<TKey, TValue>(table, keyColumn, valueColumn);
}
// Parses delimited lines into a table.
// Column names come from defaultColumns when supplied; otherwise the first line is the header.
// A separator of '\0' means "guess it from the first line".
// With fAllowMismatch set, short rows are padded with empty strings and extra fields are ignored.
// Without it, every row is asserted to have exactly as many fields as there are columns.
private static MutableDataTable ReadArray(IList<string> lines, char separator, bool fAllowMismatch = false, string[] defaultColumns = null)
{
    if (separator == '\0')
    {
        separator = GuessSeparateFromHeaderRow(lines[0]);
    }

    // First row is a header only if we dont pass defaultColumns
    int dataRowCount = lines.Count - (defaultColumns != null ? 0 : 1);

    // if defaultColumns is not null then we use them as columns
    string[] names = defaultColumns ?? split(lines[0], separator);
    int numColumns = names.Length;

    var columns = new Column[numColumns];
    for (int c = 0; c < numColumns; c++)
    {
        columns[c] = new Column(names[c], dataRowCount);
    }

    // Parse each row into data set
    using (var cursor = lines.GetEnumerator())
    {
        if (defaultColumns == null)
        {
            cursor.MoveNext(); // in this case we have columns at first index
        }

        int row = 0;
        while (cursor.MoveNext())
        {
            string[] parts = split(cursor.Current, separator);

            if (!fAllowMismatch)
            {
                if (parts.Length < numColumns)
                {
                    // Deal with possible extra commas at the end.
                    // Excel handles this.
                    for (int c = 0; c < parts.Length; c++)
                    {
                        columns[c].Values[row] = parts[c];
                    }
                }

                // Strict mode: the field count must match the column count exactly.
                Utility.Assert(
                    parts.Length == names.Length,
                    String.Format(
                        "Allow Mismatch is False. Line has incorrect number of parts. Line Number:{0}; Expected:{1}; Actual:{2}",
                        row + 1,
                        names.Length,
                        parts.Length));

                for (int c = 0; c < numColumns; c++)
                {
                    columns[c].Values[row] = parts[c];
                }
            }
            else
            {
                // If mismatch allowed, then treat this row as garbage rather
                // than throw an exception: pad what is missing, drop what is extra.
                for (int c = 0; c < numColumns; c++)
                {
                    columns[c].Values[row] = (c < parts.Length) ? parts[c] : String.Empty;
                }
            }

            row++;
        }
    }

    MutableDataTable data = new MutableDataTable();
    data.Columns = columns;
    return data;
}
/// <summary>
/// Sorts the table in place by the given column, comparing values with
/// StringComparer.InvariantCultureIgnoreCase.
/// </summary>
/// <param name="dt">table to sort</param>
/// <param name="columnName">column name to sort on. Throws if missing</param>
public static void Sort(this MutableDataTable dt, string columnName)
{
    StringComparer caseInsensitive = StringComparer.InvariantCultureIgnoreCase;
    Sort(dt, columnName, caseInsensitive);
}
// $$$ Clarify - multiple joins (inner, outer, etc)
/// <summary>
/// Performs a full outer join on two in-memory tables and returns a new table.
/// One row is written per join key found in the first table (merged with the matching row of
/// the second table when there is one), followed by one row per key found only in the second
/// table. Cells belonging to the side that has no matching row are cleared via Clear.
/// The number of columns in the new table is the sum of columns in the source tables minus 1
/// (since the join column is redundant).
/// </summary>
/// <remarks>
/// The result is produced by writing the joined rows to a temporary CSV file and reading it
/// back; the temporary file is deleted even if writing or reading fails.
/// NOTE(review): row lookup relies on GetRowIndex, which presumably maps each join value to a
/// single row index - behavior for duplicate join values should be confirmed there.
/// </remarks>
/// <param name="d1">first table; its columns come first in the result</param>
/// <param name="d2">second table; its columns follow, minus the join column</param>
/// <param name="columnName">column name to join on. Both tables must have this column name.</param>
/// <returns>a new table</returns>
/// <exception cref="InvalidOperationException">either table lacks the join column</exception>
public static MutableDataTable Join(MutableDataTable d1, MutableDataTable d2, string columnName)
{
    Column c1 = d1.GetColumn(columnName);
    if (c1 == null)
    {
        throw new InvalidOperationException("Missing column");
    }

    Column c2 = d2.GetColumn(columnName);
    if (c2 == null)
    {
        throw new InvalidOperationException("Missing column");
    }

    // Place d1 in first set of columns, and d2 in second set.
    int kColumn = d1.Columns.Length;
    int kTotalColumns = kColumn + d2.Columns.Length;

    // Indices into new table where join columns are.
    int joinColumn1 = Utility.GetColumnIndexFromName(d1.ColumnNames, columnName);
    int joinColumn2 = Utility.GetColumnIndexFromName(d2.ColumnNames, columnName) + kColumn;

    // $$$ could really optimize. Sort both on column and then zip.
    Dictionary<string, int> m1 = GetRowIndex(c1);
    Dictionary<string, int> m2 = GetRowIndex(c2);

    // $$$ column names may not be unique.
    string[] headers = new string[kTotalColumns];
    Array.Copy(d1.ColumnNames.ToArray(), 0, headers, 0, kColumn);
    Array.Copy(d2.ColumnNames.ToArray(), 0, headers, kColumn, kTotalColumns - kColumn);

    string[] values = new string[headers.Length];

    string path = GetTempFileName();
    MutableDataTable t;
    try
    {
        using (CsvWriter tw = new CsvWriter(path, headers))
        {
            foreach (var kv in m1)
            {
                Clear(values);

                string key = kv.Key; // join column
                int r1 = kv.Value;
                int r2;
                if (m2.TryGetValue(key, out r2))
                {
                    // In both. Fill in d2's half and mark the key as consumed.
                    CopyRowIntoArray(values, kColumn, d2, r2);
                    m2.Remove(key);
                }

                // Otherwise the key is only in d1 and d2's half stays cleared.
                CopyRowIntoArray(values, 0, d1, r1);
                values[joinColumn1] = values[joinColumn2] = key;

                tw.WriteRow(values);
            }

            // Every key shared with m1 was removed from m2 above, so m2 now holds only
            // keys unique to d2 (possibly none). Tag those onto the end.
            foreach (var kv in m2)
            {
                int r2 = kv.Value;
                Clear(values);
                CopyRowIntoArray(values, kColumn, d2, r2);
                values[joinColumn1] = values[joinColumn2] = kv.Key;

                tw.WriteRow(values);
            }
        } // close tw

        t = Reader.ReadCSV(path);
    }
    finally
    {
        // Always remove the temp file, including when writing or re-reading throws.
        DeleteLocalFile(path);
    }

    // Remove duplicate columns.
    t.DeleteColumn(joinColumn2);

    return t;
}