Пример #1
0
        /// <summary>
        /// Reads CSV data passed in as a string into a DataFrame.
        /// </summary>
        /// <param name="csvString">csv data passed in as a string</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty)</param>
        /// <param name="dataTypes">column types (can be empty)</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
        /// <param name="guessRows">number of rows used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns><see cref="DataFrame"/></returns>
        public static DataFrame LoadCsvFromString(string csvString,
                                                  char separator          = ',', bool header       = true,
                                                  string[] columnNames    = null, Type[] dataTypes = null,
                                                  long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false)
        {
            WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);

            return(ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn));
        }
Пример #2
0
        /// <summary>
        /// Reads a seekable stream of CSV data into a DataFrame.
        /// </summary>
        /// <param name="csvStream">stream of CSV data to be read in</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty)</param>
        /// <param name="dataTypes">column types (can be empty)</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
        /// <param name="guessRows">number of rows used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
        /// <returns><see cref="DataFrame"/></returns>
        public static DataFrame LoadCsv(Stream csvStream,
                                        char separator          = ',', bool header       = true,
                                        string[] columnNames    = null, Type[] dataTypes = null,
                                        long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false,
                                        Encoding encoding       = null)
        {
            if (!csvStream.CanSeek)
            {
                throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
            }

            if (dataTypes == null && guessRows <= 0)
            {
                throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
            }

            WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);

            return(ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn));
        }
Пример #3
0
        private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader,
                                                           char separator          = ',', bool header       = true,
                                                           string[] columnNames    = null, Type[] dataTypes = null,
                                                           long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false
                                                           )
        {
            if (dataTypes == null && guessRows <= 0)
            {
                throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
            }

            List <DataFrameColumn> columns;

            string[] fields;
            using (var textReader = wrappedReader.GetTextReader())
            {
                TextFieldParser parser = new TextFieldParser(textReader);
                parser.SetDelimiters(separator.ToString());

                var  linesForGuessType = new List <string[]>();
                long rowline           = 0;
                int  numberOfColumns   = dataTypes?.Length ?? 0;

                if (header == true && numberOfRowsToRead != -1)
                {
                    numberOfRowsToRead++;
                }

                // First pass: schema and number of rows.
                while ((fields = parser.ReadFields()) != null)
                {
                    if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
                    {
                        if (linesForGuessType.Count < guessRows || (header && rowline == 0))
                        {
                            if (header && rowline == 0)
                            {
                                if (columnNames == null)
                                {
                                    columnNames = fields;
                                }
                            }
                            else
                            {
                                linesForGuessType.Add(fields);
                                numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                            }
                        }
                    }
                    ++rowline;
                    if (rowline == guessRows || guessRows == 0)
                    {
                        break;
                    }
                }

                if (rowline == 0)
                {
                    throw new FormatException(Strings.EmptyFile);
                }

                columns = new List <DataFrameColumn>(numberOfColumns);
                // Guesses types or looks up dataTypes and adds columns.
                for (int i = 0; i < numberOfColumns; ++i)
                {
                    Type kind = dataTypes == null?GuessKind(i, linesForGuessType) : dataTypes[i];

                    columns.Add(CreateColumn(kind, columnNames, i));
                }
            }

            DataFrame ret = new DataFrame(columns);

            // Fill values.
            using (var textReader = wrappedReader.GetTextReader())
            {
                TextFieldParser parser = new TextFieldParser(textReader);
                parser.SetDelimiters(separator.ToString());

                long rowline = 0;
                while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
                {
                    if (header && rowline == 0)
                    {
                        // Skips.
                    }
                    else
                    {
                        ret.Append(fields, inPlace: true);
                    }
                    ++rowline;
                }

                if (addIndexColumn)
                {
                    PrimitiveDataFrameColumn <int> indexColumn = new PrimitiveDataFrameColumn <int>("IndexColumn", columns[0].Length);
                    for (int i = 0; i < columns[0].Length; i++)
                    {
                        indexColumn[i] = i;
                    }
                    columns.Insert(0, indexColumn);
                }
            }

            return(ret);
        }