/// <summary>
/// Builds a normalisation model for <paramref name="dataTable"/>, covering both continuous
/// numeric columns and vector columns that report a positive X dimension.
/// </summary>
/// <param name="dataTable">Table whose analysis and rows are used to build the model.</param>
/// <param name="type">Normalisation type passed through to <c>_GetColumn</c> and stored on the model.</param>
/// <param name="output">Stream handed to the <c>DataTableWriter</c>.</param>
/// <param name="columnIndices">Optional set of column indices to consider; <c>null</c> means every analysed column.</param>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);

    // Restrict the analysed columns to the requested indices (when any were given).
    var candidates = dataTable.GetAnalysis().ColumnInfo.AsQueryable();
    if (columnIndices != null)
    {
        var wanted = new HashSet<int>(columnIndices);
        candidates = candidates.Where(ci => wanted.Contains(ci.ColumnIndex));
    }

    var scalarNorms = new List<DataTableNormalisation.Column>();
    var vectorColumns = new List<(int ColumnIndex, int Size)>();
    foreach (var info in candidates)
    {
        var column = dataTable.Columns[info.ColumnIndex];

        if (column.IsContinuous && info is INumericColumnInfo numeric)
        {
            // _GetColumn may return null; such columns are left out of the model.
            var norm = _GetColumn(type, numeric, info.ColumnIndex, column.Type);
            if (norm != null)
            {
                scalarNorms.Add(norm);
            }
            continue;
        }

        if (column.Type == ColumnType.Vector
            && info is IDimensionsColumnInfo dims
            && dims.XDimension.HasValue
            && dims.XDimension.Value > 0)
        {
            vectorColumns.Add((column.Index, dims.XDimension.Value));
        }
    }

    DataTableNormalisation.VectorColumn[] vectorNorms = null;
    if (vectorColumns.Count > 0)
    {
        // One NumberCollector per vector element, grouped per vector column
        // (presumably each collector accumulates statistics for its element index - confirm).
        var collectors = new List<List<NumberCollector>>(vectorColumns.Count);
        foreach (var vectorColumn in vectorColumns)
        {
            var perElement = new List<NumberCollector>(vectorColumn.Size);
            for (var element = 0; element < vectorColumn.Size; element++)
            {
                perElement.Add(new NumberCollector(element));
            }
            collectors.Add(perElement);
        }

        // Feed every row's vector to each collector of the matching column.
        dataTable.ForEach(row =>
        {
            for (var k = 0; k < vectorColumns.Count; k++)
            {
                var vectorAsRow = row.GetField<FloatVector>(vectorColumns[k].ColumnIndex).AsRow();
                foreach (var collector in collectors[k])
                {
                    collector.Process(vectorAsRow);
                }
            }
        });

        // Note: unlike the scalar columns above, null results from _GetColumn are not filtered here.
        vectorNorms = new DataTableNormalisation.VectorColumn[collectors.Count];
        for (var k = 0; k < collectors.Count; k++)
        {
            vectorNorms[k] = new DataTableNormalisation.VectorColumn
            {
                ColumnIndex = vectorColumns[k].ColumnIndex,
                VectorColumns = collectors[k].Select((nc, j) => _GetColumn(type, nc, j, ColumnType.Float)).ToArray()
            };
        }
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = scalarNorms.ToArray(),
        VectorColumnNormalisation = vectorNorms
    };
}
/// <summary>
/// Creates a normaliser for <paramref name="dataTable"/>. When <paramref name="model"/> is supplied it is
/// used as-is; otherwise a model is built from the table's analysis for every continuous numeric column.
/// </summary>
/// <param name="dataTable">Table to normalise; also stored on the instance.</param>
/// <param name="type">Normalisation type used when a model has to be built.</param>
/// <param name="output">Optional stream handed to the <c>DataTableWriter</c>.</param>
/// <param name="model">Optional existing model; when non-null the table is not analysed.</param>
/// <exception cref="NotImplementedException">
/// <paramref name="type"/> is not one of the handled types and a continuous numeric column is encountered.
/// </exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output = null, DataTableNormalisation model = null)
{
    _table = dataTable;
    _writer = new DataTableWriter(dataTable.Columns, output);

    // An existing model short-circuits the analysis entirely.
    if (model != null)
    {
        _normalisationModel = model;
        return;
    }

    var normalisations = new List<DataTableNormalisation.Column>();
    foreach (var info in dataTable.GetAnalysis().ColumnInfo)
    {
        var column = dataTable.Columns[info.ColumnIndex];
        if (!column.IsContinuous)
        {
            continue;
        }

        var stats = info as INumericColumnInfo;
        if (stats == null)
        {
            continue;
        }

        DataTableNormalisation.Column entry;
        switch (type)
        {
            case NormalisationType.Standard:
                // Columns without a standard deviation are left out of the model.
                if (!stats.StdDev.HasValue)
                {
                    continue;
                }
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.StdDev.Value, stats.Mean);
                break;

            case NormalisationType.Euclidean:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L2Norm);
                break;

            case NormalisationType.Manhattan:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L1Norm);
                break;

            case NormalisationType.FeatureScale:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.Max - stats.Min, stats.Min);
                break;

            default:
                throw new NotImplementedException();
        }

        normalisations.Add(entry);
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = normalisations.ToArray()
    };
}
/// <summary>
/// Parses <paramref name="line"/> into fields, converts each field against the writer column at the
/// same position, and appends the converted values to <paramref name="writer"/> as one row.
/// </summary>
/// <param name="line">Raw text line to parse with <c>_Parse</c>.</param>
/// <param name="writer">Writer whose columns drive the conversion and which receives the row.</param>
private void _Add(string line, DataTableWriter writer)
{
    var targetColumns = writer.Columns;
    var fields = _Parse(line);

    // Zip pairs columns and fields positionally and stops at the shorter sequence.
    // The column's _type is passed by reference, so _Convert may update it (presumably to widen the type - confirm).
    var row = targetColumns
        .Zip(fields, (column, text) => _Convert(ref column._type, text))
        .ToList();

    writer.AddRow(row);
}
/// <summary>
/// Creates a new table that contains only the requested columns of <paramref name="table"/>.
/// </summary>
/// <param name="table">Source table.</param>
/// <param name="columns">Indices of the columns to keep.</param>
/// <param name="output">Optional stream handed to the <c>DataTableWriter</c>.</param>
/// <returns>The table produced by the writer after the source table has been processed.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="table"/> or <paramref name="columns"/> is <c>null</c>.
/// </exception>
public static IDataTable Project(IDataTable table, IEnumerable<int> columns, Stream output = null)
{
    if (table == null)
    {
        throw new ArgumentNullException(nameof(table));
    }
    if (columns == null)
    {
        throw new ArgumentNullException(nameof(columns));
    }

    // Materialise once: the indices are needed both for the membership set and by the projector,
    // and a lazy or one-shot sequence must not be enumerated twice.
    var columnList = columns.ToList();
    var validColumn = new HashSet<int>(columnList);

    // The writer's columns follow the source table's order, filtered by position.
    var projectedColumns = table.Columns.Where((column, index) => validColumn.Contains(index));
    var writer = new DataTableWriter(projectedColumns, output);

    // The projector receives the indices in the caller's original order (duplicates included).
    var projector = new DataTableProjector(writer, columnList);
    table.Process(projector);

    return writer.GetDataTable();
}
/// <summary>
/// Builds a normalisation model for the continuous numeric columns of <paramref name="dataTable"/>,
/// optionally restricted to <paramref name="columnIndices"/>.
/// </summary>
/// <param name="dataTable">Table whose analysis is used to build the model.</param>
/// <param name="type">Normalisation type that selects which statistics are stored per column.</param>
/// <param name="output">Stream handed to the <c>DataTableWriter</c>.</param>
/// <param name="columnIndices">Optional set of column indices to consider; <c>null</c> means every analysed column.</param>
/// <exception cref="NotImplementedException">
/// <paramref name="type"/> is not one of the handled types and a continuous numeric column is encountered.
/// </exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);

    // Restrict the analysed columns to the requested indices (when any were given).
    var candidates = dataTable.GetAnalysis().ColumnInfo.AsQueryable();
    if (columnIndices != null)
    {
        var wanted = new HashSet<int>(columnIndices);
        candidates = candidates.Where(ci => wanted.Contains(ci.ColumnIndex));
    }

    var normalisations = new List<DataTableNormalisation.Column>();
    foreach (var info in candidates)
    {
        var column = dataTable.Columns[info.ColumnIndex];
        if (!column.IsContinuous)
        {
            continue;
        }
        if (!(info is INumericColumnInfo stats))
        {
            continue;
        }

        DataTableNormalisation.Column entry;
        switch (type)
        {
            case NormalisationType.Standard:
                // Columns without a standard deviation are left out of the model.
                if (!stats.StdDev.HasValue)
                {
                    continue;
                }
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.StdDev ?? 1, stats.Mean);
                break;

            case NormalisationType.Euclidean:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L2Norm);
                break;

            case NormalisationType.Manhattan:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L1Norm);
                break;

            case NormalisationType.FeatureScale:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.Max - stats.Min, stats.Min);
                break;

            default:
                throw new NotImplementedException();
        }

        normalisations.Add(entry);
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = normalisations.ToArray()
    };
}
/// <summary>
/// Inspects the supplied lines to work out column names and column types, and returns a
/// <c>DataTableWriter</c> with one column added per name/type pair.
/// </summary>
/// <param name="stream">Stream handed to the <c>DataTableWriter</c>.</param>
/// <param name="lines">Sample lines; the first one is treated as the potential header.</param>
/// <param name="checkForHeader">
/// When <c>true</c>, <paramref name="hasHeader"/> is overwritten: the first line counts as a header
/// if every field in it is detected as <c>ColumnType.String</c>.
/// </param>
/// <param name="hasHeader">
/// Whether the first line is a header. Read as supplied when <paramref name="checkForHeader"/> is <c>false</c>.
/// </param>
/// <returns>A writer whose columns have been added in positional order.</returns>
/// <exception cref="InvalidOperationException"><paramref name="lines"/> is empty.</exception>
private DataTableWriter _DetermineHeaders(Stream stream, List<string> lines, bool checkForHeader, ref bool hasHeader)
{
    // Parse the first line once; it is needed for both header detection and header naming.
    var firstLineFields = _Parse(lines.First()).ToList();

    // see if there is a header (all strings)
    if (checkForHeader)
    {
        hasHeader = firstLineFields.All(str => _DetermineType(str) == ColumnType.String);
    }

    // get the list of header names (a ref parameter cannot be captured by a lambda, hence the copy)
    var useHeaderNames = hasHeader;
    var headerNames = firstLineFields
        .Select((item, position) => useHeaderNames ? item : "_col" + position)
        .ToList();

    // get the list of header types: for each field position take the largest detected ColumnType
    // across all data lines. The direct cast avoids boxing every value through Cast<T>() and does
    // not depend on ColumnType's underlying type being int.
    var columnTypes = lines
        .Skip(useHeaderNames ? 1 : 0)
        .SelectMany(line => _Parse(line).Select((str, pos) => Tuple.Create(str, pos)))
        .GroupBy(field => field.Item2, field => _DetermineType(field.Item1))
        .OrderBy(group => group.Key)
        .Select(group => (ColumnType)group.Max(v => (int)v))
        .ToList();

    // add the columns; names and types are paired positionally and any surplus on either side is ignored
    var ret = new DataTableWriter(stream);
    var columnCount = Math.Min(headerNames.Count, columnTypes.Count);
    for (var i = 0; i < columnCount; i++)
    {
        ret.AddColumn(headerNames[i], columnTypes[i]);
    }
    return ret;
}
/// <summary>
/// Creates a normaliser from a previously built normalisation model. The model is stored as-is;
/// the table is not analysed here.
/// </summary>
/// <param name="dataTable">Table whose columns define the writer's columns.</param>
/// <param name="output">Stream handed to the <c>DataTableWriter</c>.</param>
/// <param name="model">Existing normalisation model to store on the instance.</param>
public DataTableNormaliser(IDataTable dataTable, Stream output, DataTableNormalisation model)
{
    var tableColumns = dataTable.Columns;
    _writer = new DataTableWriter(tableColumns, output);

    _normalisationModel = model;
}