/// <summary>
/// Builds a <see cref="DataFrame"/> from CSV text that is already held in memory as a string.
/// </summary>
/// <param name="csvString">the CSV content to parse</param>
/// <param name="separator">character that delimits the columns</param>
/// <param name="header">true when the first line is a header row</param>
/// <param name="columnNames">explicit column names (optional, may be null)</param>
/// <param name="dataTypes">explicit column types (optional, may be null)</param>
/// <param name="numberOfRowsToRead">how many data rows to read, not counting the header (if present)</param>
/// <param name="guessRows">how many rows are sampled when guessing the column types</param>
/// <param name="addIndexColumn">whether to add a column holding the row index</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsvFromString(string csvString,
                        char separator = ',', bool header = true,
                        string[] columnNames = null, Type[] dataTypes = null,
                        long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
{
    // Wrap the string so the shared CSV reader can obtain a text reader over it.
    var stringSource = new WrappedStreamReaderOrStringReader(csvString);

    return ReadCsvLinesIntoDataFrame(
        stringSource,
        separator: separator,
        header: header,
        columnNames: columnNames,
        dataTypes: dataTypes,
        numberOfRowsToRead: numberOfRowsToRead,
        guessRows: guessRows,
        addIndexColumn: addIndexColumn);
}
/// <summary>
/// Reads a seekable stream of CSV data into a DataFrame.
/// </summary>
/// <param name="csvStream">stream of CSV data to be read in; must not be null and must support seeking</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names (can be empty)</param>
/// <param name="dataTypes">column types (can be empty)</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="csvStream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="csvStream"/> is not seekable, or <paramref name="dataTypes"/> is null while
/// <paramref name="guessRows"/> is not positive.
/// </exception>
public static DataFrame LoadCsv(Stream csvStream,
                        char separator = ',', bool header = true,
                        string[] columnNames = null, Type[] dataTypes = null,
                        long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
                        Encoding encoding = null)
{
    // Fail with a descriptive argument exception instead of a NullReferenceException on CanSeek.
    if (csvStream == null)
    {
        throw new ArgumentNullException(nameof(csvStream));
    }

    if (!csvStream.CanSeek)
    {
        throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
    }

    // Without explicit types there must be at least one row to guess them from.
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
    return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
}
/// <summary>
/// Parses CSV data from <paramref name="wrappedReader"/> into a <see cref="DataFrame"/> using two passes:
/// the first pass samples lines to determine the column count, names and types, and the second pass
/// appends the rows to the frame.
/// </summary>
/// <param name="wrappedReader">source that hands out a fresh text reader for each pass</param>
/// <param name="separator">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="columnNames">column names; when null and a header is present, the header fields are used</param>
/// <param name="dataTypes">column types; columns not covered by this array have their type guessed</param>
/// <param name="numberOfRowsToRead">number of rows to read not including the header (if present); -1 reads all rows</param>
/// <param name="guessRows">number of lines sampled to guess types</param>
/// <param name="addIndexColumn">add one column named "IndexColumn" holding the row index at position 0</param>
/// <returns><see cref="DataFrame"/></returns>
/// <exception cref="ArgumentException">
/// <paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.
/// </exception>
/// <exception cref="FormatException">The input contains no lines.</exception>
private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader,
                        char separator = ',', bool header = true,
                        string[] columnNames = null, Type[] dataTypes = null,
                        long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false
                        )
{
    if (dataTypes == null && guessRows <= 0)
    {
        throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
    }

    List<DataFrameColumn> columns;
    string[] fields;
    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(separator.ToString());

        var linesForGuessType = new List<string[]>();
        long rowline = 0;
        int numberOfColumns = dataTypes?.Length ?? 0;

        // The caller's limit excludes the header line, so widen it by one when a header is present.
        if (header && numberOfRowsToRead != -1)
        {
            numberOfRowsToRead++;
        }

        // First pass: schema and number of rows.
        while ((fields = parser.ReadFields()) != null)
        {
            bool isHeaderLine = header && rowline == 0;
            bool withinRowLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;

            if (withinRowLimit)
            {
                if (isHeaderLine)
                {
                    // Explicit column names take precedence over the header line.
                    if (columnNames == null)
                    {
                        columnNames = fields;
                    }
                }
                else if (linesForGuessType.Count < guessRows)
                {
                    linesForGuessType.Add(fields);
                    numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                }
            }

            ++rowline;
            // Stop sampling once guessRows lines (header included) have been read.
            if (rowline == guessRows || guessRows == 0)
            {
                break;
            }
        }

        if (rowline == 0)
        {
            throw new FormatException(Strings.EmptyFile);
        }

        columns = new List<DataFrameColumn>(numberOfColumns);
        // Guesses types or looks up dataTypes and adds columns.
        for (int i = 0; i < numberOfColumns; ++i)
        {
            // numberOfColumns can exceed dataTypes.Length when a sampled row is wider than the
            // supplied types; guess the type of those extra columns instead of indexing past the array.
            Type kind = dataTypes != null && i < dataTypes.Length
                ? dataTypes[i]
                : GuessKind(i, linesForGuessType);
            columns.Add(CreateColumn(kind, columnNames, i));
        }
    }

    DataFrame ret = new DataFrame(columns);

    // Fill values.
    using (var textReader = wrappedReader.GetTextReader())
    {
        TextFieldParser parser = new TextFieldParser(textReader);
        parser.SetDelimiters(separator.ToString());

        long rowline = 0;
        // Check the row limit before reading so no line beyond the limit is parsed.
        while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead) && (fields = parser.ReadFields()) != null)
        {
            // The header line (if any) only contributes names, never data.
            if (!(header && rowline == 0))
            {
                ret.Append(fields, inPlace: true);
            }
            ++rowline;
        }
    }

    if (addIndexColumn)
    {
        // Guard against an input that produced no columns at all.
        var rowCount = columns.Count > 0 ? columns[0].Length : 0;
        PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
        for (int i = 0; i < rowCount; i++)
        {
            indexColumn[i] = i;
        }

        // Insert through the frame's own column collection: `columns` was only the list handed to
        // the constructor, so inserting into it after construction is not guaranteed to reach `ret`.
        ret.Columns.Insert(0, indexColumn);
    }

    return ret;
}