/// <summary>
/// Applies a previously built normalisation model to this table: constructs a
/// <see cref="DataTableNormaliser"/> from the model, passes it to <c>Process</c>
/// and returns the table exposed by the normaliser.
/// </summary>
/// <param name="normalisationModel">Model whose <c>Type</c> and contents are handed to the normaliser. Must not be null.</param>
/// <param name="output">Optional stream forwarded to the normaliser; null by default.</param>
/// <returns>The value of <c>GetDataTable()</c> on the normaliser after processing.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="normalisationModel"/> is null.</exception>
public IDataTable Normalise(DataTableNormalisation normalisationModel, Stream output = null)
{
    // Fail with a descriptive exception instead of a NullReferenceException on ".Type" below.
    if (normalisationModel == null)
    {
        throw new System.ArgumentNullException(nameof(normalisationModel));
    }

    var normaliser = new DataTableNormaliser(this, normalisationModel.Type, output, normalisationModel);
    Process(normaliser);
    return normaliser.GetDataTable();
}
/// <summary>
/// Creates a normaliser whose model is derived from an analysis of <paramref name="dataTable"/>.
/// Continuous numeric columns get a per-column entry from <c>_GetColumn</c>; vector columns with a
/// positive X dimension get one entry per vector element, built from statistics gathered in a
/// pass over the table's rows.
/// </summary>
/// <param name="dataTable">Table to analyse; its columns are also passed to the writer.</param>
/// <param name="type">Normalisation type recorded in the model and passed to <c>_GetColumn</c>.</param>
/// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
/// <param name="columnIndices">Indices of the columns to consider, or null to consider every analysed column.</param>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);
    var analysis = dataTable.GetAnalysis();
    var columnNormList = new List<DataTableNormalisation.Column>();

    // The analysis is an in-memory sequence, so plain LINQ-to-objects is enough; AsQueryable
    // would force the filter below to be built as an expression tree and compiled at runtime.
    var columns = analysis.ColumnInfo.AsEnumerable();
    if (columnIndices != null)
    {
        var columnSet = new HashSet<int>(columnIndices);
        columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
    }

    var vectorColumns = new List<(int ColumnIndex, int Size)>();
    foreach (var columnInfo in columns)
    {
        var column = dataTable.Columns[columnInfo.ColumnIndex];
        if (column.IsContinuous && columnInfo is INumericColumnInfo numericInfo)
        {
            // _GetColumn may return null, in which case the column is left out of the model.
            var columnNorm = _GetColumn(type, numericInfo, columnInfo.ColumnIndex, column.Type);
            if (columnNorm != null)
            {
                columnNormList.Add(columnNorm);
            }
        }
        else if (column.Type == ColumnType.Vector && columnInfo is IDimensionsColumnInfo vector && vector.XDimension.HasValue && vector.XDimension.Value > 0)
        {
            vectorColumns.Add((column.Index, vector.XDimension.Value));
        }
    }

    DataTableNormalisation.VectorColumn[] vectorColumnNormList = null;
    if (vectorColumns.Count > 0)
    {
        // One list of collectors per vector column, one collector per element index.
        var collectors = vectorColumns
            .Select(vc => Enumerable.Range(0, vc.Size).Select(i => new NumberCollector(i)).ToList())
            .ToList();

        dataTable.ForEach(row =>
        {
            // vectorColumns and collectors are parallel lists; indexing them avoids
            // rebuilding a Zip iterator for every row of the table.
            for (var i = 0; i < vectorColumns.Count; i++)
            {
                var vectorAsRow = row.GetField<FloatVector>(vectorColumns[i].ColumnIndex).AsRow();
                foreach (var collector in collectors[i])
                {
                    collector.Process(vectorAsRow);
                }
            }
        });

        vectorColumnNormList = collectors
            .Select((c, i) => new DataTableNormalisation.VectorColumn
            {
                ColumnIndex = vectorColumns[i].ColumnIndex,
                VectorColumns = c.Select((nc, j) => _GetColumn(type, nc, j, ColumnType.Float)).ToArray()
            })
            .ToArray();
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = columnNormList.ToArray(),
        VectorColumnNormalisation = vectorColumnNormList
    };
}
/// <summary>
/// Creates a normaliser over <paramref name="dataTable"/>. When <paramref name="model"/> is supplied
/// it is used as-is; otherwise a model is built from the table's analysis, with one entry for each
/// continuous column that exposes numeric statistics.
/// </summary>
/// <param name="dataTable">Table to normalise; stored and used to set up the writer.</param>
/// <param name="type">Normalisation type used when a model has to be built.</param>
/// <param name="output">Optional stream passed to the <see cref="DataTableWriter"/>.</param>
/// <param name="model">Optional existing model; when non-null no analysis is performed.</param>
/// <exception cref="NotImplementedException">
/// A model has to be built and <paramref name="type"/> is not one of Standard, Euclidean,
/// Manhattan or FeatureScale (raised when the first eligible column is reached).
/// </exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output = null, DataTableNormalisation model = null)
{
    _table = dataTable;
    _writer = new DataTableWriter(dataTable.Columns, output);

    // A caller-supplied model short-circuits the analysis entirely.
    if (model != null)
    {
        _normalisationModel = model;
        return;
    }

    var entries = new List<DataTableNormalisation.Column>();
    foreach (var info in dataTable.GetAnalysis().ColumnInfo)
    {
        var index = info.ColumnIndex;
        var tableColumn = dataTable.Columns[index];
        if (!tableColumn.IsContinuous)
        {
            continue;
        }

        var stats = info as INumericColumnInfo;
        if (stats == null)
        {
            continue;
        }

        DataTableNormalisation.Column entry;
        switch (type)
        {
            case NormalisationType.Standard:
                // Columns without a standard deviation are skipped for standard normalisation.
                if (!stats.StdDev.HasValue)
                {
                    continue;
                }
                entry = new DataTableNormalisation.Column(index, tableColumn.Type, stats.StdDev.Value, stats.Mean);
                break;

            case NormalisationType.Euclidean:
                entry = new DataTableNormalisation.Column(index, tableColumn.Type, stats.L2Norm);
                break;

            case NormalisationType.Manhattan:
                entry = new DataTableNormalisation.Column(index, tableColumn.Type, stats.L1Norm);
                break;

            case NormalisationType.FeatureScale:
                entry = new DataTableNormalisation.Column(index, tableColumn.Type, stats.Max - stats.Min, stats.Min);
                break;

            default:
                throw new NotImplementedException();
        }

        entries.Add(entry);
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = entries.ToArray()
    };
}
/// <summary>
/// Creates a normaliser whose model is built from an analysis of <paramref name="dataTable"/>,
/// optionally restricted to a set of column indices. Only continuous columns that expose numeric
/// statistics contribute an entry to the model.
/// </summary>
/// <param name="dataTable">Table to analyse; its columns are also passed to the writer.</param>
/// <param name="type">Normalisation type that selects which statistics go into each entry.</param>
/// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
/// <param name="columnIndices">Indices of the columns to consider, or null to consider every analysed column.</param>
/// <exception cref="NotImplementedException">
/// <paramref name="type"/> is not one of Standard, Euclidean, Manhattan or FeatureScale
/// (raised when the first eligible column is reached).
/// </exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);
    var analysis = dataTable.GetAnalysis();
    var columnNormList = new List<DataTableNormalisation.Column>();

    // The analysis is an in-memory sequence, so plain LINQ-to-objects is enough; AsQueryable
    // would force the filter below to be built as an expression tree and compiled at runtime.
    var columns = analysis.ColumnInfo.AsEnumerable();
    if (columnIndices != null)
    {
        var columnSet = new HashSet<int>(columnIndices);
        columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
    }

    foreach (var columnInfo in columns)
    {
        var column = dataTable.Columns[columnInfo.ColumnIndex];
        if (!column.IsContinuous)
        {
            continue;
        }

        if (columnInfo is INumericColumnInfo numericInfo)
        {
            DataTableNormalisation.Column columnNorm;
            switch (type)
            {
                case NormalisationType.Standard:
                    // Columns without a standard deviation are skipped, so reading .Value
                    // below is safe and no fallback value is needed.
                    if (!numericInfo.StdDev.HasValue)
                    {
                        continue;
                    }
                    columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.StdDev.Value, numericInfo.Mean);
                    break;

                case NormalisationType.Euclidean:
                    columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L2Norm);
                    break;

                case NormalisationType.Manhattan:
                    columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L1Norm);
                    break;

                case NormalisationType.FeatureScale:
                    columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.Max - numericInfo.Min, numericInfo.Min);
                    break;

                default:
                    throw new NotImplementedException();
            }

            columnNormList.Add(columnNorm);
        }
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = columnNormList.ToArray()
    };
}
/// <summary>
/// Creates a normaliser that uses an already computed normalisation model.
/// </summary>
/// <param name="dataTable">Table whose columns are passed to the <see cref="DataTableWriter"/>.</param>
/// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
/// <param name="model">Normalisation model stored unchanged on this instance.</param>
public DataTableNormaliser(IDataTable dataTable, Stream output, DataTableNormalisation model)
{
    // The model is kept exactly as given; nothing is derived from the table here.
    _normalisationModel = model;

    var tableColumns = dataTable.Columns;
    _writer = new DataTableWriter(tableColumns, output);
}