Exemple #1
0
        /// <summary>
        /// Reads a seekable stream of CSV data into a DataFrame.
        /// Follows pandas API.
        /// </summary>
        /// <remarks>
        /// The stream is read twice starting at its current position: once to sample rows for the
        /// schema and once to append the values. The stream is left open when the method returns.
        /// Lines are split with <c>string.Split</c> on <paramref name="separator"/>.
        /// </remarks>
        /// <param name="csvStream">stream of CSV data to be read in</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty)</param>
        /// <param name="dataTypes">column types (can be empty); when supplied, no type guessing is done</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present); -1 reads every row</param>
        /// <param name="guessRows">maximum number of data rows (the header is not counted) used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns><see cref="DataFrame"/></returns>
        /// <exception cref="ArgumentNullException"><paramref name="csvStream"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="csvStream"/> is not seekable.</exception>
        /// <exception cref="FormatException">Types must be guessed but no data row could be sampled.</exception>
        public static DataFrame LoadCsv(Stream csvStream,
                                        char separator          = ',', bool header       = true,
                                        string[] columnNames    = null, Type[] dataTypes = null,
                                        long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false)
        {
            if (csvStream == null)
            {
                throw new ArgumentNullException(nameof(csvStream));
            }

            if (!csvStream.CanSeek)
            {
                throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
            }

            var linesForGuessType = new List<string[]>();
            long rowline = 0;
            int numberOfColumns = dataTypes?.Length ?? 0;

            // rowline counts physical lines, so the header line is added to the row limit.
            if (header && numberOfRowsToRead != -1)
            {
                numberOfRowsToRead++;
            }

            List<DataFrameColumn> columns;
            long streamStart = csvStream.Position;

            using (var streamReader = new StreamReader(csvStream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
            {
                string line;

                // First pass: schema. Only runs when the types have to be guessed.
                // NOTE(review): because this pass is skipped when dataTypes is supplied, column names
                // are not taken from the header line in that case - confirm this is intended.
                if (dataTypes == null)
                {
                    // Stop as soon as guessRows data rows are sampled or the row limit is reached.
                    // The header line does not count towards guessRows, so guessRows = 1 with a
                    // header still samples one data row.
                    while (linesForGuessType.Count < guessRows
                           && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)
                           && (line = streamReader.ReadLine()) != null)
                    {
                        string[] spl = line.Split(separator);
                        if (header && rowline == 0)
                        {
                            if (columnNames == null)
                            {
                                columnNames = spl;
                            }
                        }
                        else
                        {
                            linesForGuessType.Add(spl);
                            numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                        }
                        ++rowline;
                    }

                    if (linesForGuessType.Count == 0)
                    {
                        throw new FormatException(Strings.EmptyFile);
                    }
                }

                columns = new List<DataFrameColumn>(numberOfColumns);
                // Guesses types or looks up dataTypes and adds columns.
                for (int i = 0; i < numberOfColumns; ++i)
                {
                    Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
                    columns.Add(CreateColumn(kind, columnNames, i));
                }

                DataFrame ret = new DataFrame(columns);

                // Rewind to where the caller handed us the stream; the reader's buffer is stale after the first pass.
                streamReader.DiscardBufferedData();
                streamReader.BaseStream.Seek(streamStart, SeekOrigin.Begin);

                // Second pass: fills values. The limit is tested first so no line beyond it is read.
                rowline = 0;
                while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)
                       && (line = streamReader.ReadLine()) != null)
                {
                    // The header line (if any) carries no data and is skipped.
                    if (!(header && rowline == 0))
                    {
                        ret.Append(line.Split(separator), inPlace: true);
                    }
                    ++rowline;
                }

                if (addIndexColumn)
                {
                    // Guard against a frame without columns before reading columns[0].
                    long rowCount = columns.Count > 0 ? columns[0].Length : 0;
                    PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
                    for (int i = 0; i < rowCount; i++)
                    {
                        indexColumn[i] = i;
                    }

                    // Insert through the frame's own column collection.
                    // NOTE(review): assumes DataFrame copies the list passed to its constructor, in which
                    // case inserting into the local 'columns' list never reaches the frame - confirm.
                    ret.Columns.Insert(0, indexColumn);
                }

                return ret;
            }
        }
        /// <summary>
        /// Parses CSV records obtained from <paramref name="wrappedReader"/> into a DataFrame.
        /// A text reader is requested from the wrapper twice: once to sample rows for the schema
        /// and once to append the values.
        /// </summary>
        /// <param name="wrappedReader">source of the text readers used for both passes</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty); taken from the header record when null and a header is present</param>
        /// <param name="dataTypes">column types (can be empty); when null the types are guessed from the sampled rows</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present); -1 reads every row</param>
        /// <param name="guessRows">maximum number of data rows (the header is not counted) used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns><see cref="DataFrame"/></returns>
        /// <exception cref="ArgumentException"><paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.</exception>
        /// <exception cref="FormatException">The input contains no records.</exception>
        private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader,
                                                           char separator          = ',', bool header       = true,
                                                           string[] columnNames    = null, Type[] dataTypes = null,
                                                           long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false
                                                           )
        {
            if (dataTypes == null && guessRows <= 0)
            {
                throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
            }

            // rowline counts records, so the header record is added to the row limit.
            if (header && numberOfRowsToRead != -1)
            {
                numberOfRowsToRead++;
            }

            List<DataFrameColumn> columns;
            string[] fields;

            // First pass: schema.
            using (var textReader = wrappedReader.GetTextReader())
            {
                TextFieldParser parser = new TextFieldParser(textReader);
                parser.SetDelimiters(separator.ToString());

                var linesForGuessType = new List<string[]>();
                long rowline = 0;
                int numberOfColumns = dataTypes?.Length ?? 0;

                while ((fields = parser.ReadFields()) != null)
                {
                    bool withinLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
                    if (header && rowline == 0)
                    {
                        if (withinLimit && columnNames == null)
                        {
                            columnNames = fields;
                        }
                    }
                    else if (withinLimit && linesForGuessType.Count < guessRows)
                    {
                        linesForGuessType.Add(fields);
                        numberOfColumns = Math.Max(numberOfColumns, fields.Length);
                    }
                    ++rowline;

                    // Stop once guessRows data rows are sampled (the header does not count, so
                    // guessRows = 1 with a header still samples one data row) or once the row
                    // limit is exhausted. A non-positive guessRows stops after the first record.
                    bool limitReached = numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead;
                    if (linesForGuessType.Count >= guessRows || limitReached)
                    {
                        break;
                    }
                }

                if (rowline == 0)
                {
                    throw new FormatException(Strings.EmptyFile);
                }

                columns = new List<DataFrameColumn>(numberOfColumns);
                // Guesses types or looks up dataTypes and adds columns.
                for (int i = 0; i < numberOfColumns; ++i)
                {
                    Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
                    columns.Add(CreateColumn(kind, columnNames, i));
                }
            }

            DataFrame ret = new DataFrame(columns);

            // Second pass: fill values from a fresh reader.
            using (var textReader = wrappedReader.GetTextReader())
            {
                TextFieldParser parser = new TextFieldParser(textReader);
                parser.SetDelimiters(separator.ToString());

                long rowline = 0;
                // The limit is tested first so no record beyond it is parsed.
                while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)
                       && (fields = parser.ReadFields()) != null)
                {
                    // The header record (if any) carries no data and is skipped.
                    if (!(header && rowline == 0))
                    {
                        ret.Append(fields, inPlace: true);
                    }
                    ++rowline;
                }
            }

            if (addIndexColumn)
            {
                // Guard against a frame without columns before reading columns[0].
                long rowCount = columns.Count > 0 ? columns[0].Length : 0;
                PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
                for (int i = 0; i < rowCount; i++)
                {
                    indexColumn[i] = i;
                }

                // Insert through the frame's own column collection.
                // NOTE(review): assumes DataFrame copies the list passed to its constructor, in which
                // case inserting into the local 'columns' list never reaches the frame - confirm.
                ret.Columns.Insert(0, indexColumn);
            }

            return ret;
        }
Exemple #3
0
        /// <summary>
        /// Reads an implementation of IDataReader into a DataFrame.
        /// </summary>
        /// <remarks>
        /// The number of columns is the number of rows in the reader's schema table. Values equal to
        /// <see cref="DBNull.Value"/> are appended as null.
        /// </remarks>
        /// <param name="reader">DataReader to be read in</param>
        /// <param name="columnNames">column names (can be empty); when null they are taken from the schema's "ColumnName" entries, with "Column{i}" substituted for blank names</param>
        /// <param name="dataTypes">column types (can be empty); when null they are taken from the schema's "DataType" entries</param>
        /// <param name="numberOfRowsToRead">maximum number of rows to read; -1 reads every row</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns><see cref="DataFrame"/></returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public static DataFrame FromDataReader(IDataReader reader, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, bool addIndexColumn = false)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            DataTable schemaTable = reader.GetSchemaTable();
            int numberOfColumns = schemaTable.Rows.Count;

            if (columnNames == null)
            {
                columnNames = new string[numberOfColumns];
                for (int i = 0; i < numberOfColumns; ++i)
                {
                    string columnName = schemaTable.Rows[i]["ColumnName"].ToString();
                    columnNames[i] = string.IsNullOrWhiteSpace(columnName) ? $"Column{i}" : columnName;
                }
            }

            var columns = new List<DataFrameColumn>(numberOfColumns);
            for (int i = 0; i < numberOfColumns; ++i)
            {
                // Caller-supplied types take precedence over the schema's types.
                Type kind = dataTypes == null ? (Type)schemaTable.Rows[i]["DataType"] : dataTypes[i];
                columns.Add(CreateColumn(kind, columnNames, i));
            }

            var ret = new DataFrame(columns);
            long rowline = 0;

            // The limit is tested before Read() so the reader is not advanced past the last
            // requested row (Read() would otherwise consume and drop one extra record).
            while ((numberOfRowsToRead == -1 || rowline < numberOfRowsToRead) && reader.Read())
            {
                ret.Append(GetRecordValues(reader), inPlace: true);
                ++rowline;
            }

            if (addIndexColumn)
            {
                // Guard against a frame without columns before reading columns[0].
                long rowCount = columns.Count > 0 ? columns[0].Length : 0;
                PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
                for (int i = 0; i < rowCount; i++)
                {
                    indexColumn[i] = i;
                }

                // Insert through the frame's own column collection.
                // NOTE(review): assumes DataFrame copies the list passed to its constructor, in which
                // case inserting into the local 'columns' list never reaches the frame - confirm.
                ret.Columns.Insert(0, indexColumn);
            }

            return ret;

            // Yields the current record's field values, mapping DBNull.Value to null.
            IEnumerable<object> GetRecordValues(IDataRecord record)
            {
                for (int i = 0; i < record.FieldCount; i++)
                {
                    // Fetch the field once; the indexer may be a non-trivial call on some readers.
                    object value = record[i];
                    yield return value == DBNull.Value ? null : value;
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Builds a DataFrame from a sequence of CSV lines.
        /// </summary>
        /// <remarks>
        /// <paramref name="lines"/> is enumerated twice (schema pass, then value pass), each time
        /// through a fresh enumerator, so the sequence must be re-enumerable. Lines are split with
        /// <c>string.Split</c> on <paramref name="separator"/>.
        /// </remarks>
        /// <param name="lines">CSV lines, one record per element</param>
        /// <param name="separator">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="columnNames">column names (can be empty); taken from the header line when null and a header is present</param>
        /// <param name="dataTypes">column types (can be empty); when null the types are guessed from the sampled rows</param>
        /// <param name="numberOfRowsToRead">number of rows to read not including the header(if present); -1 reads every row</param>
        /// <param name="guessRows">maximum number of data rows (the header is not counted) used to guess types</param>
        /// <param name="addIndexColumn">add one column with the row index</param>
        /// <returns><see cref="DataFrame"/></returns>
        /// <exception cref="ArgumentException"><paramref name="dataTypes"/> is null and <paramref name="guessRows"/> is not positive.</exception>
        /// <exception cref="FormatException"><paramref name="lines"/> yields no elements.</exception>
        private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable <string> lines,
                                                           char separator          = ',', bool header       = true,
                                                           string[] columnNames    = null, Type[] dataTypes = null,
                                                           long numberOfRowsToRead = -1, int guessRows      = 10, bool addIndexColumn = false
                                                           )
        {
            if (dataTypes == null && guessRows <= 0)
            {
                throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
            }

            var linesForGuessType = new List<string[]>();
            long rowline = 0;
            int numberOfColumns = dataTypes?.Length ?? 0;

            // rowline counts lines, so the header line is added to the row limit.
            if (header && numberOfRowsToRead != -1)
            {
                numberOfRowsToRead++;
            }

            // First pass: schema. foreach obtains and disposes its own enumerator.
            foreach (string line in lines)
            {
                bool withinLimit = numberOfRowsToRead == -1 || rowline < numberOfRowsToRead;
                if (header && rowline == 0)
                {
                    if (withinLimit && columnNames == null)
                    {
                        columnNames = line.Split(separator);
                    }
                }
                else if (withinLimit && linesForGuessType.Count < guessRows)
                {
                    string[] spl = line.Split(separator);
                    linesForGuessType.Add(spl);
                    numberOfColumns = Math.Max(numberOfColumns, spl.Length);
                }
                ++rowline;

                // Stop once guessRows data rows are sampled (the header does not count, so
                // guessRows = 1 with a header still samples one data row) or once the row
                // limit is exhausted. A non-positive guessRows stops after the first line.
                bool limitReached = numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead;
                if (linesForGuessType.Count >= guessRows || limitReached)
                {
                    break;
                }
            }

            if (rowline == 0)
            {
                throw new FormatException(Strings.EmptyFile);
            }

            var columns = new List<DataFrameColumn>(numberOfColumns);
            // Guesses types or looks up dataTypes and adds columns.
            for (int i = 0; i < numberOfColumns; ++i)
            {
                Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
                columns.Add(CreateColumn(kind, columnNames, i));
            }

            DataFrame ret = new DataFrame(columns);

            // Second pass: fill values. The sequence is enumerated again from the start instead of
            // calling IEnumerator.Reset(), which iterator-based sequences do not support.
            rowline = 0;
            foreach (string line in lines)
            {
                if (numberOfRowsToRead != -1 && rowline >= numberOfRowsToRead)
                {
                    break;
                }

                // The header line (if any) carries no data and is skipped.
                if (!(header && rowline == 0))
                {
                    ret.Append(line.Split(separator), inPlace: true);
                }
                ++rowline;
            }

            if (addIndexColumn)
            {
                // Guard against a frame without columns before reading columns[0].
                long rowCount = columns.Count > 0 ? columns[0].Length : 0;
                PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", rowCount);
                for (int i = 0; i < rowCount; i++)
                {
                    indexColumn[i] = i;
                }

                // Insert through the frame's own column collection.
                // NOTE(review): assumes DataFrame copies the list passed to its constructor, in which
                // case inserting into the local 'columns' list never reaches the frame - confirm.
                ret.Columns.Insert(0, indexColumn);
            }

            return ret;
        }