Example #1
0
        /// <summary>
        /// Parses one line of text, passes each field through _Convert together with the
        /// matching writer column's type, and appends the results to the writer as a row
        /// </summary>
        /// <param name="line">Line of text to parse</param>
        /// <param name="writer">Writer that supplies the columns and receives the row</param>
        private void _Add(string line, DataTableWriter writer)
        {
            var columns = writer.Columns;
            var fields = _Parse(line);

            // Zip pairs columns and fields by position and stops at the shorter of the two,
            // so surplus fields (or columns without a field) are left out of the row
            var row = columns
                .Zip(fields, (column, text) => _Convert(ref column._type, text))
                .ToList();

            writer.AddRow(row);
        }
Example #2
0
        //public void Process(Func<IRow, int, bool> processor)
        //{
        //    int index = 0;
        //    foreach (var item in _data) {
        //        if (!processor(item, index++))
        //            break;
        //    }
        //}

        /// <summary>
        /// Builds a data table by running Process against a writer created over this object's columns
        /// </summary>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable Build(Stream output = null)
        {
            var tableWriter = new DataTableWriter(Columns, output);
            Process(tableWriter);

            var table = tableWriter.GetDataTable();
            return table;
        }
Example #3
0
        /// <summary>
        /// Creates a new data table in which every row has been passed through a vectoriser
        /// </summary>
        /// <param name="vectoriser">Vectoriser to use; when null one is obtained from GetVectoriser</param>
        /// <param name="useTargetColumnIndex">True to append the target column (read as a string) after the vectorised columns</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable ConvertToNumeric(IDataTableVectoriser vectoriser = null, bool useTargetColumnIndex = true, Stream output = null)
        {
            var tableWriter = new DataTableWriter(output);

            if (vectoriser == null)
            {
                vectoriser = GetVectoriser(useTargetColumnIndex);
            }

            // one float column for each name reported by the vectoriser
            foreach (var columnName in vectoriser.ColumnNames)
            {
                tableWriter.AddColumn(columnName, ColumnType.Float);
            }

            // optionally followed by the target column, declared as a string column
            var targetIndex = TargetColumnIndex;
            if (useTargetColumnIndex)
            {
                tableWriter.AddColumn(_column[targetIndex].Name, ColumnType.String, true);
            }

            // write one output row per source row
            _Iterate((row, i) =>
            {
                var features = vectoriser.GetInput(row).Data.AsEnumerable().Cast<object>();
                var values = useTargetColumnIndex
                    ? features.Concat(new object[] { row.GetField<string>(targetIndex) })
                    : features;

                tableWriter.AddRow(new DataTableRow(this, values.ToArray(), _rowConverter));
                return true;
            });

            return tableWriter.GetDataTable();
        }
Example #4
0
        /// <summary>
        /// Divides the rows of this table between a training table and a test table
        /// </summary>
        /// <param name="randomSeed">Optional seed passed to the shuffle</param>
        /// <param name="trainPercentage">Fraction of the rows that go to the training table</param>
        /// <param name="shuffle">True to shuffle the row order before dividing</param>
        /// <param name="output1">Optional stream handed to the training table writer</param>
        /// <param name="output2">Optional stream handed to the test table writer</param>
        /// <returns>The training table followed by the test table</returns>
        public (IDataTable Training, IDataTable Test) Split(int? randomSeed = null, double trainPercentage = 0.8, bool shuffle = true, Stream output1 = null, Stream output2 = null)
        {
            var order = Enumerable.Range(0, RowCount);
            if (shuffle)
            {
                order = order.Shuffle(randomSeed);
            }

            var rowIndices = order.ToList();
            var trainingCount = Convert.ToInt32(RowCount * trainPercentage);

            // the first trainingCount indices feed the training table, the remainder the test table
            var trainingWriter = WriteRows(rowIndices.Take(trainingCount), output1);
            var testWriter = WriteRows(rowIndices.Skip(trainingCount), output2);

            return (trainingWriter.GetDataTable(), testWriter.GetDataTable());

            // creates a writer over this table's columns and feeds it the requested rows
            DataTableWriter WriteRows(IEnumerable<int> indices, Stream stream)
            {
                var writer = new DataTableWriter(Columns, stream);
                foreach (var row in GetRows(indices))
                {
                    writer.Process(row);
                }
                return writer;
            }
        }
Example #5
0
        /// <summary>
        /// Produces k training/validation table pairs, using each consecutive block of rows
        /// once as the validation set and the remaining rows as the training set
        /// </summary>
        /// <param name="k">Number of folds; must be greater than zero</param>
        /// <param name="randomSeed">Optional seed passed to the shuffle</param>
        /// <param name="shuffle">True to shuffle the row order before folding</param>
        /// <returns>A lazily evaluated sequence of (training, validation) table pairs</returns>
        /// <exception cref="System.ArgumentOutOfRangeException">Thrown when k is zero or negative</exception>
        public IEnumerable<(IDataTable Training, IDataTable Validation)> Fold(int k, int? randomSeed = null, bool shuffle = true)
        {
            // Checked up front, outside the iterator: k == 0 would otherwise surface as a
            // DivideByZeroException only once enumeration started, and a negative k would
            // silently produce no folds at all.
            if (k <= 0)
            {
                throw new System.ArgumentOutOfRangeException(nameof(k), k, "The number of folds must be greater than zero");
            }

            return CreateFolds();

            // the deferred part: nothing below runs until the result is enumerated
            IEnumerable<(IDataTable Training, IDataTable Validation)> CreateFolds()
            {
                var input = Enumerable.Range(0, RowCount);
                if (shuffle)
                {
                    input = input.Shuffle(randomSeed);
                }
                var final = input.ToList();

                // integer division: when the row count is not a multiple of k the trailing
                // remainder rows are never used for validation and stay in every training set
                var foldSize = final.Count / k;

                for (var i = 0; i < k; i++)
                {
                    var foldStart = i * foldSize;
                    var foldEnd = foldStart + foldSize;

                    var trainingRows = final.Take(foldStart).Concat(final.Skip(foldEnd));
                    var validationRows = final.Skip(foldStart).Take(foldSize);

                    var trainingWriter = new DataTableWriter(Columns, null);
                    foreach (var row in GetRows(trainingRows))
                    {
                        trainingWriter.Process(row);
                    }

                    var validationWriter = new DataTableWriter(Columns, null);
                    foreach (var row in GetRows(validationRows))
                    {
                        validationWriter.Process(row);
                    }

                    yield return (trainingWriter.GetDataTable(), validationWriter.GetDataTable());
                }
            }
        }
Example #6
0
        /// <summary>
        /// Runs Process against a writer created over this object's columns and the given stream
        /// </summary>
        /// <param name="stream">Stream handed to the data table writer</param>
        public void WriteTo(Stream stream)
        {
            var streamWriter = new DataTableWriter(Columns, stream);
            Process(streamWriter);

            // only a flush is performed here; no data table is requested from the writer
            streamWriter.Flush();
        }
Example #7
0
        /// <summary>
        /// Parses one line of text and passes each field through _Convert together with the
        /// matching writer column's type, without adding a row to the writer
        /// </summary>
        /// <param name="line">Line of text to parse</param>
        /// <param name="writer">Writer whose columns are visited</param>
        private void _Preview(string line, DataTableWriter writer)
        {
            var fields = _Parse(line).ToList();
            var index = 0;

            foreach (var column in writer.Columns)
            {
                // A line with fewer fields than there are columns previously made the list
                // indexer throw ArgumentOutOfRangeException; the missing trailing fields are
                // now treated as empty strings instead.
                var field = index < fields.Count ? fields[index] : string.Empty;

                // the column type is passed by reference, so _Convert is able to update it
                _Convert(ref column._type, field);
                ++index;
            }
        }
Example #8
0
        /// <summary>
        /// Creates a data table with the same columns as this one, holding the rows that
        /// GetRows returns for the supplied indices
        /// </summary>
        /// <param name="rowIndex">Indices of the rows to copy</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable CopyWithRows(IEnumerable<int> rowIndex, Stream output = null)
        {
            var tableWriter = new DataTableWriter(_column, output);

            var selectedRows = GetRows(rowIndex);
            foreach (var selectedRow in selectedRows)
            {
                tableWriter.AddRow(selectedRow);
            }

            return tableWriter.GetDataTable();
        }
Example #9
0
        /// <summary>
        /// Creates a data table from the row indices selected by the Bag extension
        /// (presumably sampling with replacement - confirm against that extension)
        /// </summary>
        /// <param name="count">Number of rows requested; defaults to the row count of this table</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <param name="randomSeed">Optional seed passed to the Bag extension</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable Bag(int? count = null, Stream output = null, int? randomSeed = null)
        {
            var allIndices = Enumerable.Range(0, RowCount).ToList();
            var sampleSize = count ?? RowCount;
            var sampledIndices = allIndices.Bag(sampleSize, randomSeed);

            var tableWriter = new DataTableWriter(Columns, output);
            foreach (var sampledRow in GetRows(sampledIndices))
            {
                tableWriter.Process(sampledRow);
            }

            return tableWriter.GetDataTable();
        }
Example #10
0
        /// <summary>
        /// Parses one line of text and passes each field through _Convert together with the
        /// matching writer column's type, without adding a row to the writer
        /// </summary>
        /// <param name="line">Line of text to parse</param>
        /// <param name="writer">Writer whose columns are visited</param>
        private void _Preview(string line, DataTableWriter writer)
        {
            var fields = _Parse(line).ToList();

            // pad short lines with empty strings so that every column has a field
            var columnCount = writer.Columns.Count;
            while (fields.Count < columnCount)
            {
                fields.Add(string.Empty);
            }

            var position = 0;
            foreach (var column in writer.Columns)
            {
                var field = fields[position];
                position++;

                // the column type is passed by reference, so _Convert is able to update it
                _Convert(ref column._type, field);
            }
        }
Example #11
0
        /// <summary>
        /// Creates a data table whose rows are the rows of this table with the data of the
        /// same-numbered row of another table appended
        /// </summary>
        /// <param name="dataTable">Table whose columns and row data are appended</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable Zip(IDataTable dataTable, Stream output = null)
        {
            var combinedColumns = _column.Concat(dataTable.Columns);
            var tableWriter = new DataTableWriter(combinedColumns, output);

            _Iterate((row, i) =>
            {
                // false is returned once the other table has no row at this index
                // (presumably this ends the iteration - confirm against _Iterate)
                var otherHasRow = i < dataTable.RowCount;
                if (otherHasRow)
                {
                    var combinedData = row.Data.Concat(dataTable.GetRow(i).Data).ToList();
                    tableWriter.AddRow(combinedData);
                }
                return otherHasRow;
            });

            return tableWriter.GetDataTable();
        }
Example #12
0
        /// <summary>
        /// Creates a writer whose columns (names and types) are derived from the supplied lines
        /// </summary>
        /// <param name="stream">Stream handed to the data table writer</param>
        /// <param name="lines">Lines of text to inspect; the first line supplies the column names when a header is present</param>
        /// <param name="checkForHeader">True to detect the header: the first line counts as one when all of its fields are typed as strings</param>
        /// <param name="hasHeader">Whether the first line is a header; overwritten when checkForHeader is true</param>
        /// <returns>A writer with one column added per name/type pair</returns>
        private DataTableWriter _DetermineHeaders(Stream stream, List<string> lines, bool checkForHeader, ref bool hasHeader)
        {
            var firstLineFields = _Parse(lines.First()).ToList();

            // a header is assumed when no field of the first line is typed as anything but a string
            if (checkForHeader)
            {
                hasHeader = !firstLineFields.Any(str => _DetermineType(str) != ColumnType.String);
            }

            // a ref parameter cannot be captured by a lambda, so work from a local copy
            var headerPresent = hasHeader;

            // column names: the header fields themselves, or generated "_col<position>" names
            var headerNames = firstLineFields
                .Select((field, position) => headerPresent ? field : "_col" + position)
                .ToList();

            // column types: all strings when parsing as text, otherwise derived per column
            // position from the type of every field on the data lines
            IReadOnlyList<ColumnType> columnTypes;
            if (!_parseAsText)
            {
                var dataLines = lines.Skip(headerPresent ? 1 : 0);
                var positionedFields = dataLines
                    .SelectMany(line => _Parse(line).Select((str, pos) => Tuple.Create(str, pos)));

                columnTypes = positionedFields
                    .GroupBy(f => f.Item2, f => _DetermineType(f.Item1))
                    .OrderBy(g => g.Key)
                    .Select(_GetColumnType)
                    .ToList();
            }
            else
            {
                columnTypes = Enumerable.Repeat(ColumnType.String, firstLineFields.Count).ToList();
            }

            // add a column for each name that also has a type (and vice versa)
            var ret = new DataTableWriter(stream);
            for (var i = 0; i < headerNames.Count && i < columnTypes.Count; i++)
            {
                ret.AddColumn(headerNames[i], columnTypes[i]);
            }

            return ret;
        }
Example #13
0
        /// <summary>
        /// Creates a new data table in which every row has been passed through a vectoriser,
        /// followed by the target column read as a string
        /// </summary>
        /// <param name="vectoriser">Vectoriser to use; when null one is obtained from GetVectoriser</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        public IDataTable ConvertToNumeric(IDataTableVectoriser vectoriser = null, Stream output = null)
        {
            var tableWriter = new DataTableWriter(output);

            if (vectoriser == null)
            {
                vectoriser = GetVectoriser();
            }

            // one float column for each name reported by the vectoriser
            foreach (var columnName in vectoriser.ColumnNames)
            {
                tableWriter.AddColumn(columnName, ColumnType.Float);
            }

            // the target column comes last and is declared as a string column
            var targetIndex = TargetColumnIndex;
            var targetColumn = _column[targetIndex];
            tableWriter.AddColumn(targetColumn.Name, ColumnType.String, true);

            _Iterate(row =>
            {
                var features = vectoriser.GetInput(row).AsEnumerable().Cast<object>();
                var label = new object[] { row.GetField<string>(targetIndex) };

                tableWriter.AddRow(features.Concat(label));
                return true;
            });

            return tableWriter.GetDataTable();
        }
Example #14
0
        /// <summary>
        /// Creates a new data table from the rows produced by a mutator function
        /// </summary>
        /// <param name="mutator">Maps a source row to the values of an output row; a null result skips the row</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        /// <exception cref="FormatException">Thrown when a value in the first non-null mutated row has an unsupported type</exception>
        public IDataTable Project(Func<IRow, IReadOnlyList<object>> mutator, Stream output = null)
        {
            var needsColumns = true;
            var tableWriter = new DataTableWriter(output);

            // exact runtime type of a projected value -> column type used to store it
            var columnTypeByValueType = new Dictionary<Type, ColumnType>
            {
                { typeof(string), ColumnType.String },
                { typeof(double), ColumnType.Double },
                { typeof(float), ColumnType.Float },
                { typeof(long), ColumnType.Long },
                { typeof(int), ColumnType.Int },
                { typeof(byte), ColumnType.Byte },
                { typeof(DateTime), ColumnType.Date },
                { typeof(bool), ColumnType.Boolean },
                { typeof(FloatVector), ColumnType.Vector },
                { typeof(FloatMatrix), ColumnType.Matrix },
                { typeof(FloatTensor), ColumnType.Tensor },
                { typeof(WeightedIndexList), ColumnType.WeightedIndexList },
                { typeof(IndexList), ColumnType.IndexList }
            };

            _Iterate((row, i) =>
            {
                var mutatedRow = mutator(row);
                if (mutatedRow == null)
                {
                    return true;
                }

                // the columns are declared once, from the first non-null mutated row:
                // names and target flags come from this table's columns at the same position,
                // types from the runtime type of each value (null values give a Null column)
                if (needsColumns)
                {
                    var position = 0;
                    foreach (var value in mutatedRow)
                    {
                        var sourceColumn = Columns[position];

                        var columnType = ColumnType.Null;
                        if (value != null && !columnTypeByValueType.TryGetValue(value.GetType(), out columnType))
                        {
                            throw new FormatException();
                        }

                        tableWriter.AddColumn(sourceColumn.Name, columnType, sourceColumn.IsTarget);
                        position++;
                    }
                    needsColumns = false;
                }

                tableWriter.AddRow(new DataTableRow(this, mutatedRow, _rowConverter));
                return true;
            });

            return tableWriter.GetDataTable();
        }
Example #15
0
        /// <summary>
        /// Creates a new data table from the rows produced by a mutator function
        /// </summary>
        /// <param name="mutator">Maps a source row to the values of an output row; a null result skips the row</param>
        /// <param name="output">Optional stream handed to the data table writer</param>
        /// <returns>The data table returned by the writer</returns>
        /// <exception cref="FormatException">Thrown when a value in the first non-null mutated row has an unsupported type</exception>
        public IDataTable Project(Func<IRow, IReadOnlyList<object>> mutator, Stream output = null)
        {
            var columnsPending = true;
            var tableWriter = new DataTableWriter(output);

            _Iterate(row =>
            {
                var projected = mutator(row);
                if (projected == null)
                {
                    return true;
                }

                if (columnsPending)
                {
                    // exact runtime type of a projected value -> column type used to store it
                    var supportedTypes = new Dictionary<Type, ColumnType>
                    {
                        { typeof(string), ColumnType.String },
                        { typeof(double), ColumnType.Double },
                        { typeof(float), ColumnType.Float },
                        { typeof(long), ColumnType.Long },
                        { typeof(int), ColumnType.Int },
                        { typeof(byte), ColumnType.Byte },
                        { typeof(DateTime), ColumnType.Date },
                        { typeof(bool), ColumnType.Boolean }
                    };

                    // declare the columns from the first non-null projected row: names and
                    // target flags come from this table's columns at the same position
                    var position = 0;
                    foreach (var value in projected)
                    {
                        var sourceColumn = Columns[position];

                        // null values give a Null column; any other unsupported type is an error
                        var columnType = ColumnType.Null;
                        if (value != null && !supportedTypes.TryGetValue(value.GetType(), out columnType))
                        {
                            throw new FormatException();
                        }

                        tableWriter.AddColumn(sourceColumn.Name, columnType, sourceColumn.IsTarget);
                        position++;
                    }
                    columnsPending = false;
                }

                tableWriter.AddRow(projected);
                return true;
            });

            return tableWriter.GetDataTable();
        }