/// <summary>
/// Parses a single line of text, converts each field against its column and appends the result as a row
/// </summary>
/// <param name="line">The line of text to parse</param>
/// <param name="writer">The writer whose columns drive the conversion and that receives the row</param>
private void _Add(string line, DataTableWriter writer)
{
    var columns = writer.Columns;
    var fields = _Parse(line);

    // pair each column with its field (pairing stops at the shorter sequence);
    // the column type is passed by reference - presumably so _Convert can adjust it (confirm against _Convert)
    var row = columns
        .Zip(fields, (column, text) => _Convert(ref column._type, text))
        .ToList();

    writer.AddRow(row);
}
//public void Process(Func<IRow, int, bool> processor)
//{
//    int index = 0;
//    foreach (var item in _data) {
//        if (!processor(item, index++))
//            break;
//    }
//}

/// <summary>
/// Creates a data table
/// </summary>
/// <param name="output">Optional stream to write the data table to</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable Build(Stream output = null)
{
    // the writer is created with the current columns and then fed via Process
    var tableWriter = new DataTableWriter(Columns, output);
    Process(tableWriter);

    var table = tableWriter.GetDataTable();
    return table;
}
/// <summary>
/// Builds a new table in which every row is replaced by the output of a vectoriser,
/// optionally followed by the target column as a string
/// </summary>
/// <param name="vectoriser">Vectoriser to use (one is obtained from GetVectoriser if null)</param>
/// <param name="useTargetColumnIndex">True to append the target column as a string column</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable ConvertToNumeric(IDataTableVectoriser vectoriser = null, bool useTargetColumnIndex = true, Stream output = null)
{
    var writer = new DataTableWriter(output);
    if (vectoriser == null) {
        vectoriser = GetVectoriser(useTargetColumnIndex);
    }

    // one float column per vectoriser output name
    foreach (var columnName in vectoriser.ColumnNames) {
        writer.AddColumn(columnName, ColumnType.Float);
    }

    // the classification label column comes last and is flagged as the target
    var classColumnIndex = TargetColumnIndex;
    if (useTargetColumnIndex) {
        var labelColumn = _column[classColumnIndex];
        writer.AddColumn(labelColumn.Name, ColumnType.String, true);
    }

    // vectorise each row
    _Iterate((row, i) => {
        var values = vectoriser.GetInput(row).Data.AsEnumerable().Cast<object>();
        if (useTargetColumnIndex) {
            var label = row.GetField<string>(classColumnIndex);
            values = values.Concat(new object[] { label });
        }

        var converted = new DataTableRow(this, values.ToArray(), _rowConverter);
        writer.AddRow(converted);
        return true;
    });

    return writer.GetDataTable();
}
/// <summary>
/// Splits the rows of this table into a training table and a test table
/// </summary>
/// <param name="randomSeed">Optional seed passed to the shuffle</param>
/// <param name="trainPercentage">Fraction of the rows that go into the training table</param>
/// <param name="shuffle">True to shuffle the row indices before splitting</param>
/// <param name="output1">Optional stream for the training table writer</param>
/// <param name="output2">Optional stream for the test table writer</param>
/// <returns>The training and test tables</returns>
public (IDataTable Training, IDataTable Test) Split(int? randomSeed = null, double trainPercentage = 0.8, bool shuffle = true, Stream output1 = null, Stream output2 = null)
{
    var rowIndices = Enumerable.Range(0, RowCount);
    if (shuffle) {
        rowIndices = rowIndices.Shuffle(randomSeed);
    }

    // fix the order once so that both halves are taken from the same sequence
    var orderedIndices = rowIndices.ToList();
    int trainingCount = Convert.ToInt32(RowCount * trainPercentage);

    var trainingWriter = WriteRows(orderedIndices.Take(trainingCount), output1);
    var testWriter = WriteRows(orderedIndices.Skip(trainingCount), output2);
    return (trainingWriter.GetDataTable(), testWriter.GetDataTable());

    // creates a writer over the current columns and feeds it the selected rows
    DataTableWriter WriteRows(IEnumerable<int> indices, Stream stream)
    {
        var rowWriter = new DataTableWriter(Columns, stream);
        foreach (var row in GetRows(indices)) {
            rowWriter.Process(row);
        }
        return rowWriter;
    }
}
/// <summary>
/// Partitions the rows into k folds and lazily yields, for each fold, a training table
/// (all rows outside the fold) and a validation table (the rows inside the fold)
/// </summary>
/// <remarks>
/// The fold size is the row count divided by k using integer division, so any remainder
/// rows are never part of a validation table (they are always part of the training table).
/// </remarks>
/// <param name="k">Number of folds (must be greater than zero)</param>
/// <param name="randomSeed">Optional seed passed to the shuffle</param>
/// <param name="shuffle">True to shuffle the row indices before folding</param>
/// <returns>A lazily evaluated sequence of (training, validation) table pairs</returns>
/// <exception cref="System.ArgumentOutOfRangeException">Thrown if k is zero or negative</exception>
public IEnumerable<(IDataTable Training, IDataTable Validation)> Fold(int k, int? randomSeed = null, bool shuffle = true)
{
    // validate eagerly: inside an iterator the check would be deferred until enumeration,
    // and k == 0 would otherwise surface as a DivideByZeroException far from the call site
    if (k <= 0) {
        throw new System.ArgumentOutOfRangeException(nameof(k), k, "The number of folds must be greater than zero");
    }
    return Iterate();

    IEnumerable<(IDataTable Training, IDataTable Validation)> Iterate()
    {
        var input = Enumerable.Range(0, RowCount);
        if (shuffle) {
            input = input.Shuffle(randomSeed);
        }

        // fix the order once so every fold sees the same sequence of row indices
        var final = input.ToList();
        var foldSize = final.Count / k;

        for (var i = 0; i < k; i++) {
            var foldStart = i * foldSize;
            var foldEnd = (i + 1) * foldSize;
            var trainingRows = final.Take(foldStart).Concat(final.Skip(foldEnd));
            var validationRows = final.Skip(foldStart).Take(foldSize);

            var trainingWriter = new DataTableWriter(Columns, null);
            foreach (var row in GetRows(trainingRows)) {
                trainingWriter.Process(row);
            }

            var validationWriter = new DataTableWriter(Columns, null);
            foreach (var row in GetRows(validationRows)) {
                validationWriter.Process(row);
            }

            yield return (trainingWriter.GetDataTable(), validationWriter.GetDataTable());
        }
    }
}
/// <summary>
/// Writes this table to a stream
/// </summary>
/// <param name="stream">The stream passed to the table writer</param>
public void WriteTo(Stream stream)
{
    var streamWriter = new DataTableWriter(Columns, stream);

    // feed the writer via Process, then flush it
    Process(streamWriter);
    streamWriter.Flush();
}
/// <summary>
/// Parses a single line of text and runs each field through _Convert against its column,
/// without adding a row to the writer
/// </summary>
/// <remarks>
/// A line with fewer fields than there are columns is treated as if the missing trailing
/// fields were empty strings, rather than failing with an index out of range error.
/// </remarks>
/// <param name="line">The line of text to parse</param>
/// <param name="writer">The writer whose columns are previewed</param>
private void _Preview(string line, DataTableWriter writer)
{
    int index = 0;
    var fields = _Parse(line).ToList();

    foreach (var column in writer.Columns) {
        // short lines: substitute an empty string for any missing trailing field
        var field = index < fields.Count ? fields[index] : string.Empty;
        index++;

        // the column type is passed by reference - presumably so _Convert can adjust it (confirm against _Convert)
        _Convert(ref column._type, field);
    }
}
/// <summary>
/// Creates a new table that contains the rows at the given indices
/// </summary>
/// <param name="rowIndex">Indices of the rows to copy</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable CopyWithRows(IEnumerable<int> rowIndex, Stream output = null)
{
    var copyWriter = new DataTableWriter(_column, output);

    var selectedRows = GetRows(rowIndex);
    foreach (var selected in selectedRows) {
        copyWriter.AddRow(selected);
    }

    return copyWriter.GetDataTable();
}
/// <summary>
/// Creates a new table from row indices chosen by the Bag extension over all row indices
/// </summary>
/// <param name="count">Number of rows to request (defaults to the row count of this table)</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <param name="randomSeed">Optional seed passed to the Bag extension</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable Bag(int? count = null, Stream output = null, int? randomSeed = null)
{
    var allIndices = Enumerable.Range(0, RowCount).ToList();
    var sampleSize = count ?? RowCount;
    var sampledIndices = allIndices.Bag(sampleSize, randomSeed);

    var bagWriter = new DataTableWriter(Columns, output);
    foreach (var sampledRow in GetRows(sampledIndices)) {
        bagWriter.Process(sampledRow);
    }

    return bagWriter.GetDataTable();
}
/// <summary>
/// Parses a single line of text and runs each field through _Convert against its column,
/// without adding a row to the writer
/// </summary>
/// <param name="line">The line of text to parse</param>
/// <param name="writer">The writer whose columns are previewed</param>
private void _Preview(string line, DataTableWriter writer)
{
    var values = _Parse(line).ToList();

    // pad short lines with empty strings so that every column has a value
    var columnCount = writer.Columns.Count;
    while (values.Count < columnCount) {
        values.Add(string.Empty);
    }

    var position = 0;
    foreach (var column in writer.Columns) {
        // the column type is passed by reference - presumably so _Convert can adjust it (confirm against _Convert)
        _Convert(ref column._type, values[position]);
        position++;
    }
}
/// <summary>
/// Creates a new table whose rows are the rows of this table joined side by side
/// with the rows of another table at the same index
/// </summary>
/// <param name="dataTable">The table whose columns and row data are appended</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable Zip(IDataTable dataTable, Stream output = null)
{
    var zipWriter = new DataTableWriter(_column.Concat(dataTable.Columns), output);

    _Iterate((row, i) => {
        if (i < dataTable.RowCount) {
            var otherData = dataTable.GetRow(i).Data;
            var combined = row.Data.Concat(otherData).ToList();
            zipWriter.AddRow(combined);
            return true;
        }

        // the other table has no row at this index: stop iterating
        return false;
    });

    return zipWriter.GetDataTable();
}
/// <summary>
/// Works out the column names and column types from a sample of lines and
/// creates a writer with those columns
/// </summary>
/// <param name="stream">Stream passed to the table writer</param>
/// <param name="lines">Sample lines (the first line is inspected for a header)</param>
/// <param name="checkForHeader">True to detect a header row from the first line</param>
/// <param name="hasHeader">Whether the first line is a header; overwritten when checkForHeader is true</param>
/// <returns>A writer with one column per name/type pair</returns>
private DataTableWriter _DetermineHeaders(Stream stream, List<string> lines, bool checkForHeader, ref bool hasHeader)
{
    // the first line counts as a header when every field on it is detected as a string
    var firstRowFields = _Parse(lines.First()).ToList();
    if (checkForHeader) {
        hasHeader = firstRowFields.All(field => _DetermineType(field) == ColumnType.String);
    }

    // column names: the header text if present, otherwise "_col" followed by the position
    var names = new List<string>();
    for (var i = 0; i < firstRowFields.Count; i++) {
        names.Add(hasHeader ? firstRowFields[i] : "_col" + i);
    }

    // column types: all strings when parsing as text, otherwise derived per column position
    IReadOnlyList<ColumnType> types;
    if (!_parseAsText) {
        var positionedFields = lines
            .Skip(hasHeader ? 1 : 0)
            .SelectMany(line => _Parse(line).Select((text, position) => Tuple.Create(text, position)));
        var detectedByPosition = positionedFields
            .GroupBy(field => field.Item2, field => _DetermineType(field.Item1))
            .OrderBy(group => group.Key);
        types = detectedByPosition.Select(_GetColumnType).ToList();
    }
    else {
        types = firstRowFields.Select(field => ColumnType.String).ToList();
    }

    // add the columns (only as many as have both a name and a type)
    var result = new DataTableWriter(stream);
    var pairCount = names.Count < types.Count ? names.Count : types.Count;
    for (var i = 0; i < pairCount; i++) {
        result.AddColumn(names[i], types[i]);
    }
    return result;
}
/// <summary>
/// Builds a new table in which every row is replaced by the output of a vectoriser
/// followed by the target column as a string
/// </summary>
/// <param name="vectoriser">Vectoriser to use (one is obtained from GetVectoriser if null)</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
public IDataTable ConvertToNumeric(IDataTableVectoriser vectoriser = null, Stream output = null)
{
    var writer = new DataTableWriter(output);
    if (vectoriser == null) {
        vectoriser = GetVectoriser();
    }

    // one float column per vectoriser output name
    foreach (var columnName in vectoriser.ColumnNames) {
        writer.AddColumn(columnName, ColumnType.Float);
    }

    // the classification label column comes last and is flagged as the target
    var classColumnIndex = TargetColumnIndex;
    var labelColumn = _column[classColumnIndex];
    writer.AddColumn(labelColumn.Name, ColumnType.String, true);

    // vectorise each row and append its label
    _Iterate(row => {
        var features = vectoriser.GetInput(row).AsEnumerable().Cast<object>();
        var label = row.GetField<string>(classColumnIndex);
        writer.AddRow(features.Concat(new object[] { label }));
        return true;
    });

    return writer.GetDataTable();
}
/// <summary>
/// Creates a new table by passing every row through a mutator function.
/// Rows for which the mutator returns null are left out.
/// </summary>
/// <remarks>
/// The columns of the new table are defined from the first non-null mutated row: names and
/// target flags are taken from this table's columns at the same position and the column type
/// is taken from the runtime type of each value (a null value gives a Null column).
/// </remarks>
/// <param name="mutator">Function that maps a row to its new values, or null to skip the row</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
/// <exception cref="FormatException">Thrown if a value in the first mutated row has an unsupported type</exception>
public IDataTable Project(Func<IRow, IReadOnlyList<object>> mutator, Stream output = null)
{
    // maps the exact runtime type of a value onto its column type
    Func<object, ColumnType> resolveColumnType = value => {
        var valueType = value.GetType();
        if (valueType == typeof(string)) { return ColumnType.String; }
        if (valueType == typeof(double)) { return ColumnType.Double; }
        if (valueType == typeof(float)) { return ColumnType.Float; }
        if (valueType == typeof(long)) { return ColumnType.Long; }
        if (valueType == typeof(int)) { return ColumnType.Int; }
        if (valueType == typeof(byte)) { return ColumnType.Byte; }
        if (valueType == typeof(DateTime)) { return ColumnType.Date; }
        if (valueType == typeof(bool)) { return ColumnType.Boolean; }
        if (valueType == typeof(FloatVector)) { return ColumnType.Vector; }
        if (valueType == typeof(FloatMatrix)) { return ColumnType.Matrix; }
        if (valueType == typeof(FloatTensor)) { return ColumnType.Tensor; }
        if (valueType == typeof(WeightedIndexList)) { return ColumnType.WeightedIndexList; }
        if (valueType == typeof(IndexList)) { return ColumnType.IndexList; }
        throw new FormatException();
    };

    var writer = new DataTableWriter(output);
    var columnsPending = true;

    _Iterate((row, i) => {
        var projected = mutator(row);
        if (projected == null) {
            // skipped row: keep iterating
            return true;
        }

        if (columnsPending) {
            // define the output columns from the first row that was kept
            var position = 0;
            foreach (var value in projected) {
                var sourceColumn = Columns[position];
                var columnType = value == null ? ColumnType.Null : resolveColumnType(value);
                writer.AddColumn(sourceColumn.Name, columnType, sourceColumn.IsTarget);
                ++position;
            }
            columnsPending = false;
        }

        writer.AddRow(new DataTableRow(this, projected, _rowConverter));
        return true;
    });

    return writer.GetDataTable();
}
/// <summary>
/// Creates a new table by passing every row through a mutator function.
/// Rows for which the mutator returns null are left out.
/// </summary>
/// <remarks>
/// The columns of the new table are defined from the first non-null mutated row: names and
/// target flags are taken from this table's columns at the same position and the column type
/// is taken from the runtime type of each value (a null value gives a Null column).
/// </remarks>
/// <param name="mutator">Function that maps a row to its new values, or null to skip the row</param>
/// <param name="output">Optional stream passed to the table writer</param>
/// <returns>The data table produced by the writer</returns>
/// <exception cref="FormatException">Thrown if a value in the first mutated row has an unsupported type</exception>
public IDataTable Project(Func<IRow, IReadOnlyList<object>> mutator, Stream output = null)
{
    // exact runtime type of a value -> column type
    var columnTypeByValueType = new Dictionary<Type, ColumnType> {
        { typeof(string), ColumnType.String },
        { typeof(double), ColumnType.Double },
        { typeof(float), ColumnType.Float },
        { typeof(long), ColumnType.Long },
        { typeof(int), ColumnType.Int },
        { typeof(byte), ColumnType.Byte },
        { typeof(DateTime), ColumnType.Date },
        { typeof(bool), ColumnType.Boolean }
    };

    var writer = new DataTableWriter(output);
    var columnsPending = true;

    _Iterate(row => {
        var projected = mutator(row);
        if (projected == null) {
            // skipped row: keep iterating
            return true;
        }

        if (columnsPending) {
            // define the output columns from the first row that was kept
            var position = 0;
            foreach (var value in projected) {
                var sourceColumn = Columns[position];

                ColumnType columnType;
                if (value == null) {
                    columnType = ColumnType.Null;
                }
                else if (!columnTypeByValueType.TryGetValue(value.GetType(), out columnType)) {
                    throw new FormatException();
                }

                writer.AddColumn(sourceColumn.Name, columnType, sourceColumn.IsTarget);
                ++position;
            }
            columnsPending = false;
        }

        writer.AddRow(projected);
        return true;
    });

    return writer.GetDataTable();
}