/// <summary>
/// Reads a seekable stream of CSV data into a DataFrame.
/// Follows pandas API.
/// </summary>
/// <remarks>
/// The stream is read twice from its current position: a first pass captures the header
/// names and samples rows for type guessing, then the reader is rewound and a second pass
/// appends the data rows. Rows are split with <see cref="string.Split(char[])"/>, so quoted
/// fields containing the separator are not treated specially.
/// </remarks>
/// <param name="csvStream">stream of CSV data to be read in</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names (can be empty)</param>
/// <param name="dataTypes">column types (can be empty)</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present); -1 reads every row</param>
/// <param name="guessRows">number of data rows used to guess types (the header row is not counted)</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="csvStream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The stream is not seekable, or <paramref name="dataTypes"/> is null and
/// <paramref name="guessRows"/> is not positive.
/// </exception>
/// <exception cref="FormatException">Types must be guessed but no data row could be sampled.</exception>
public static DataFrame LoadCsv(Stream csvStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (csvStream == null)
    {
        throw new ArgumentNullException(nameof(csvStream));
    }

    if (!csvStream.CanSeek)
    {
        throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
    }

    // Same precondition as the ReadCsvLinesIntoDataFrame overloads: without explicit types
    // there must be at least one row to guess from. Failing here avoids scanning the whole
    // stream only to report a misleading "empty file" error.
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    var linesForGuessType = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = dataTypes?.Length ?? 0;

    // The row limit is expressed in data rows; make room for the header line.
    if (header && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    List<DataFrameColumn> columns;
    long streamStart = csvStream.Position;

    using (var streamReader = new StreamReader(csvStream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
    {
        // First pass: header names and type-guessing samples.
        // It also runs when dataTypes is supplied but the header names are still needed,
        // so that the header is honoured regardless of how the types are obtained.
        bool needHeaderNames = header && columnNames == null;
        if (dataTypes == null || needHeaderNames)
        {
            string sampleLine;
            while ((sampleLine = streamReader.ReadLine()) != null)
            {
                bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
                if (!withinRowLimit)
                {
                    break;
                }

                string[] sampleFields = sampleLine.Split(separator);
                if (header && rowline == 0)
                {
                    if (columnNames == null)
                    {
                        columnNames = sampleFields;
                    }
                }
                else if (dataTypes == null)
                {
                    linesForGuessType.Add(sampleFields);
                    numberOfColumns = Math.Max(numberOfColumns, sampleFields.Length);
                }

                ++rowline;

                // Stop once only the header was needed, or once guessRows *data* rows are
                // sampled. Counting sampled rows (not lines read) keeps the header from
                // consuming one of the guess rows.
                if (dataTypes != null || linesForGuessType.Count >= guessRows)
                {
                    break;
                }
            }

            if (dataTypes == null && linesForGuessType.Count == 0)
            {
                throw new FormatException(Strings.EmptyFile);
            }
        }

        // Guesses types or looks up dataTypes and adds columns.
        columns = new List<DataFrameColumn>(numberOfColumns);
        for (int i = 0; i < numberOfColumns; ++i)
        {
            Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
            columns.Add(CreateColumn(kind, columnNames, i));
        }

        DataFrame ret = new DataFrame(columns);

        // Rewind to where the caller handed us the stream; buffered characters from the
        // first pass must be dropped so the reader re-reads from the underlying stream.
        streamReader.DiscardBufferedData();
        streamReader.BaseStream.Seek(streamStart, SeekOrigin.Begin);

        // Second pass: fills values.
        rowline = 0;
        string dataLine;
        while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)
               && (dataLine = streamReader.ReadLine()) != null)
        {
            bool isHeaderRow = header && rowline == 0;
            if (!isHeaderRow)
            {
                ret.Append(dataLine.Split(separator), inPlace: true);
            }

            ++rowline;
        }

        if (addIndexColumn)
        {
            // NOTE(review): the index column is inserted into the local `columns` list after
            // `ret` was constructed from it. Whether the returned frame observes this column
            // depends on whether DataFrame keeps a reference to the list or copies it — confirm.
            PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
            for (int i = 0; i < columns[0].Length; i++)
            {
                indexColumn[i] = i;
            }

            columns.Insert(0, indexColumn);
        }

        return ret;
    }
}
/// <summary>
/// Parses delimited text obtained from <paramref name="wrappedReader"/> into a DataFrame.
/// </summary>
/// <remarks>
/// Two text readers are requested from <paramref name="wrappedReader"/>: the first is used to
/// capture header names and sample rows for type guessing, the second to append the data rows.
/// </remarks>
/// <param name="wrappedReader">source that can hand out a fresh text reader for each pass</param>
/// <param name="separator">column separator</param>
/// <param name="header">whether the first record holds column names</param>
/// <param name="columnNames">column names; when null and <paramref name="header"/> is true the first record is used</param>
/// <param name="dataTypes">column types; when null they are guessed from sampled rows</param>
/// <param name="numberOfRowsToRead">number of data rows to read, not counting the header; -1 reads every row</param>
/// <param name="guessRows">number of data rows sampled for type guessing (the header row is not counted)</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns>the populated <see cref="DataFrame"/></returns>
/// <exception cref="ArgumentException">
/// <paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.
/// </exception>
/// <exception cref="FormatException">The input contains no records at all.</exception>
private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader,
    char separator = ',', bool header = true,
    string[] columnNames = null, Type[] dataTypes = null,
    long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    List<DataFrameColumn> columns;
    string[] fields;

    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(separator.ToString());

        var linesForGuessType = new List<string[]>();
        long rowline = 0;
        int numberOfColumns = dataTypes?.Length ?? 0;

        // The row limit is expressed in data rows; make room for the header record.
        if (header && numberOfRowsToRead != -1)
        {
            numberOfRowsToRead++;
        }

        // First pass: header names and type-guessing samples.
        while ((fields = parser.ReadFields()) != null)
        {
            bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
            if (withinRowLimit)
            {
                if (header && rowline == 0)
                {
                    if (columnNames == null)
                    {
                        columnNames = fields;
                    }
                }
                else if (linesForGuessType.Count < guessRows)
                {
                    linesForGuessType.Add(fields);
                    numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                }
            }

            ++rowline;

            // Stop when guessRows *data* rows are sampled (so the header does not use up a
            // guess row; this also covers guessRows <= 0 with explicit dataTypes, where only
            // the first record is needed), or when the row limit rules out further samples.
            bool sampledEnough = linesForGuessType.Count >= guessRows;
            bool rowLimitReached = numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead;
            if (sampledEnough || rowLimitReached)
            {
                break;
            }
        }

        if (rowline == 0)
        {
            throw new FormatException(Strings.EmptyFile);
        }

        // Guesses types or looks up dataTypes and adds columns.
        columns = new List<DataFrameColumn>(numberOfColumns);
        for (int i = 0; i < numberOfColumns; ++i)
        {
            Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
            columns.Add(CreateColumn(kind, columnNames, i));
        }
    }

    DataFrame ret = new DataFrame(columns);

    // Second pass: fill values from a fresh reader.
    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(separator.ToString());

        long rowline = 0;

        // The limit is tested before ReadFields so that no record beyond the requested
        // rows is parsed.
        while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)
               && (fields = parser.ReadFields()) != null)
        {
            bool isHeaderRow = header && rowline == 0;
            if (!isHeaderRow)
            {
                ret.Append(fields, inPlace: true);
            }

            ++rowline;
        }

        if (addIndexColumn)
        {
            // NOTE(review): the index column is inserted into the local `columns` list after
            // `ret` was constructed from it. Whether the returned frame observes this column
            // depends on whether DataFrame keeps a reference to the list or copies it — confirm.
            PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
            for (int i = 0; i < columns[0].Length; i++)
            {
                indexColumn[i] = i;
            }

            columns.Insert(0, indexColumn);
        }
    }

    return ret;
}
/// <summary>
/// Reads an implementation of IDataReader into a DataFrame.
/// </summary>
/// <remarks>
/// The column count is taken from <see cref="IDataReader.GetSchemaTable"/>. Column names and
/// types fall back to the schema's "ColumnName" and "DataType" entries when not supplied;
/// a blank schema name becomes <c>Column{i}</c>. <see cref="DBNull.Value"/> fields are
/// appended as null.
/// </remarks>
/// <param name="reader">DataReader to be read in</param>
/// <param name="columnNames">column names (can be empty)</param>
/// <param name="dataTypes">column types (can be empty)</param>
/// <param name="numberOfRowsToRead">maximum number of rows to read; -1 reads every row</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public static DataFrame FromDataReader(IDataReader reader, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, bool addIndexColumn = false)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    DataTable schemaTable = reader.GetSchemaTable();
    int numberOfColumns = schemaTable.Rows.Count;

    if (columnNames == null)
    {
        columnNames = new string[numberOfColumns];
        for (int i = 0; i < numberOfColumns; ++i)
        {
            string columnName = schemaTable.Rows[i]["ColumnName"].ToString();
            columnNames[i] = string.IsNullOrWhiteSpace(columnName) ? $"Column{i}" : columnName;
        }
    }

    // Explicit dataTypes win; otherwise use the type reported by the schema.
    var columns = new List<DataFrameColumn>(numberOfColumns);
    for (int i = 0; i < numberOfColumns; ++i)
    {
        Type kind = dataTypes == null ? (Type)schemaTable.Rows[i]["DataType"] : dataTypes[i];
        columns.Add(CreateColumn(kind, columnNames, i));
    }

    var ret = new DataFrame(columns);
    long rowline = 0;

    // The limit is tested before Read() so the caller's reader is not advanced past the
    // last row that was actually requested.
    while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead) && reader.Read())
    {
        ret.Append(GetRecordValues(reader), inPlace: true);
        ++rowline;
    }

    if (addIndexColumn)
    {
        // NOTE(review): the index column is inserted into the local `columns` list after
        // `ret` was constructed from it. Whether the returned frame observes this column
        // depends on whether DataFrame keeps a reference to the list or copies it — confirm.
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
        for (int i = 0; i < columns[0].Length; i++)
        {
            indexColumn[i] = i;
        }

        columns.Insert(0, indexColumn);
    }

    return ret;

    // Yields the current record's fields in ordinal order, mapping DBNull to null.
    IEnumerable<object> GetRecordValues(IDataRecord record)
    {
        for (int i = 0; i < record.FieldCount; i++)
        {
            // Fetch each field once instead of once for the test and once for the value.
            object value = record[i];
            yield return value == DBNull.Value ? null : value;
        }
    }
}
/// <summary>
/// Builds a DataFrame from a sequence of CSV lines.
/// </summary>
/// <remarks>
/// <paramref name="lines"/> is enumerated twice: once to capture header names and sample rows
/// for type guessing, and once to append the data rows. Each pass obtains its own enumerator,
/// so the sequence only has to be re-enumerable; it does not need to support
/// <see cref="System.Collections.IEnumerator.Reset"/>. Lines are split with
/// <see cref="string.Split(char[])"/>, so quoted fields containing the separator are not
/// treated specially.
/// </remarks>
/// <param name="lines">the CSV lines, header first when <paramref name="header"/> is true</param>
/// <param name="separator">column separator</param>
/// <param name="header">whether the first line holds column names</param>
/// <param name="columnNames">column names; when null and <paramref name="header"/> is true the first line is used</param>
/// <param name="dataTypes">column types; when null they are guessed from sampled rows</param>
/// <param name="numberOfRowsToRead">number of data rows to read, not counting the header; -1 reads every row</param>
/// <param name="guessRows">number of data rows sampled for type guessing (the header row is not counted)</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <returns>the populated <see cref="DataFrame"/></returns>
/// <exception cref="ArgumentException">
/// <paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.
/// </exception>
/// <exception cref="FormatException">The sequence contains no lines at all.</exception>
private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable<string> lines,
    char separator = ',', bool header = true,
    string[] columnNames = null, Type[] dataTypes = null,
    long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    var linesForGuessType = new List<string[]>();
    long rowline = 0;
    int numberOfColumns = dataTypes?.Length ?? 0;

    // The row limit is expressed in data rows; make room for the header line.
    if (header && numberOfRowsToRead != -1)
    {
        numberOfRowsToRead++;
    }

    // First pass: header names and type-guessing samples.
    // foreach disposes the enumerator even when the loop exits early.
    foreach (string sampleLine in lines)
    {
        bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
        if (withinRowLimit)
        {
            bool isHeaderRow = header && rowline == 0;
            if (isHeaderRow)
            {
                if (columnNames == null)
                {
                    columnNames = sampleLine.Split(separator);
                }
            }
            else if (linesForGuessType.Count < guessRows)
            {
                string[] sampleFields = sampleLine.Split(separator);
                linesForGuessType.Add(sampleFields);
                numberOfColumns = Math.Max(numberOfColumns, sampleFields.Length);
            }
        }

        ++rowline;

        // Stop when guessRows *data* rows are sampled (so the header does not use up a
        // guess row; this also covers guessRows <= 0 with explicit dataTypes, where only
        // the first line is needed), or when the row limit rules out further samples.
        bool sampledEnough = linesForGuessType.Count >= guessRows;
        bool rowLimitReached = numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead;
        if (sampledEnough || rowLimitReached)
        {
            break;
        }
    }

    if (rowline == 0)
    {
        throw new FormatException(Strings.EmptyFile);
    }

    // Guesses types or looks up dataTypes and adds columns.
    var columns = new List<DataFrameColumn>(numberOfColumns);
    for (int i = 0; i < numberOfColumns; ++i)
    {
        Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
        columns.Add(CreateColumn(kind, columnNames, i));
    }

    DataFrame ret = new DataFrame(columns);

    // Second pass: fill values. A new enumeration is started rather than calling
    // IEnumerator.Reset(), which compiler-generated iterators do not support.
    rowline = 0;
    foreach (string dataLine in lines)
    {
        if (numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead)
        {
            break;
        }

        bool isHeaderRow = header && rowline == 0;
        if (!isHeaderRow)
        {
            ret.Append(dataLine.Split(separator), inPlace: true);
        }

        ++rowline;
    }

    if (addIndexColumn)
    {
        // NOTE(review): the index column is inserted into the local `columns` list after
        // `ret` was constructed from it. Whether the returned frame observes this column
        // depends on whether DataFrame keeps a reference to the list or copies it — confirm.
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
        for (int i = 0; i < columns[0].Length; i++)
        {
            indexColumn[i] = i;
        }

        columns.Insert(0, indexColumn);
    }

    return ret;
}