/// <summary>
/// Builds a normalisation model from the analysis of a data table, covering continuous
/// numeric columns and vector columns that report a positive X dimension
/// </summary>
/// <param name="dataTable">Table whose analysis and rows supply the statistics</param>
/// <param name="type">Normalisation type stored in the model and passed to each column normalisation</param>
/// <param name="output">Stream handed to the data table writer</param>
/// <param name="columnIndices">Indices of the columns to consider, or null to consider every analysed column</param>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);
    var analysis = dataTable.GetAnalysis();

    // The filter runs in memory, so plain LINQ to Objects is enough; wrapping the
    // sequence in an IQueryable would only add expression tree overhead.
    var columns = analysis.ColumnInfo.AsEnumerable();
    if (columnIndices != null)
    {
        var columnSet = new HashSet<int>(columnIndices);
        columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
    }

    var columnNormList = new List<DataTableNormalisation.Column>();
    var vectorColumns = new List<(int ColumnIndex, int Size)>();
    foreach (var columnInfo in columns)
    {
        var column = dataTable.Columns[columnInfo.ColumnIndex];
        if (column.IsContinuous && columnInfo is INumericColumnInfo numericInfo)
        {
            // _GetColumn may decline to produce a normalisation, in which case the column is skipped
            var columnNorm = _GetColumn(type, numericInfo, columnInfo.ColumnIndex, column.Type);
            if (columnNorm != null)
            {
                columnNormList.Add(columnNorm);
            }
        }
        else if (column.Type == ColumnType.Vector
            && columnInfo is IDimensionsColumnInfo vector
            && vector.XDimension.HasValue
            && vector.XDimension.Value > 0)
        {
            vectorColumns.Add((column.Index, vector.XDimension.Value));
        }
    }

    DataTableNormalisation.VectorColumn[] vectorColumnNormList = null;
    if (vectorColumns.Any())
    {
        // One collector per vector element, grouped per vector column
        // (presumably each collector gathers statistics for the element index it was created with - confirm against NumberCollector)
        var collectors = vectorColumns
            .Select(vc => Enumerable.Range(0, vc.Size).Select(i => new NumberCollector(i)).ToList())
            .ToList();

        dataTable.ForEach(row =>
        {
            // vectorColumns and collectors are parallel lists, so they are walked by index
            // rather than re-zipping them for every row
            for (var i = 0; i < vectorColumns.Count; i++)
            {
                var vectorAsRow = row.GetField<FloatVector>(vectorColumns[i].ColumnIndex).AsRow();
                foreach (var collector in collectors[i])
                {
                    collector.Process(vectorAsRow);
                }
            }
        });

        vectorColumnNormList = collectors
            .Select((c, i) => new DataTableNormalisation.VectorColumn
            {
                ColumnIndex = vectorColumns[i].ColumnIndex,
                VectorColumns = c.Select((nc, j) => _GetColumn(type, nc, j, ColumnType.Float)).ToArray()
            })
            .ToArray();
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = columnNormList.ToArray(),
        VectorColumnNormalisation = vectorColumnNormList
    };
}
/// <summary>
/// Builds a node that classifies the rows of a data table
/// </summary>
/// <param name="classifier">Classifier applied to each row</param>
/// <param name="dataTable">Table holding the rows to classify (linked by mini batch index)</param>
/// <param name="analysis">Analysis of the data table; taken from <paramref name="dataTable"/> when null</param>
/// <param name="name">Optional name to give the node</param>
/// <returns>The created node paired with its output size</returns>
public (INode RowClassifier, int OutputSize) CreateClassifier(IRowClassifier classifier, IDataTable dataTable, IDataTableAnalysis analysis = null, string name = null)
{
    var node = new RowClassifier(
        LinearAlgebraProvider,
        classifier,
        dataTable,
        analysis ?? dataTable.GetAnalysis(),
        name
    );
    return (RowClassifier: node, OutputSize: node.OutputSize);
}
/// <summary>
/// Creates a normaliser that either adopts an existing normalisation model or derives one
/// from the analysis of the data table
/// </summary>
/// <param name="dataTable">Table to normalise</param>
/// <param name="type">Normalisation type used when a model has to be derived</param>
/// <param name="output">Optional stream handed to the data table writer</param>
/// <param name="model">Existing model to use as is; when null a model is built from the table analysis</param>
/// <exception cref="NotImplementedException">The normalisation type is not one of the handled types</exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output = null, DataTableNormalisation model = null)
{
    _table = dataTable;
    _writer = new DataTableWriter(dataTable.Columns, output);

    // A supplied model is used verbatim and the table is not analysed
    if (model != null)
    {
        _normalisationModel = model;
        return;
    }

    var tableAnalysis = dataTable.GetAnalysis();
    var normalisations = new List<DataTableNormalisation.Column>();
    foreach (var info in tableAnalysis.ColumnInfo)
    {
        var column = dataTable.Columns[info.ColumnIndex];
        if (!column.IsContinuous)
        {
            continue;
        }

        var stats = info as INumericColumnInfo;
        if (stats == null)
        {
            continue;
        }

        DataTableNormalisation.Column entry;
        switch (type)
        {
            case NormalisationType.Standard:
                // columns without a standard deviation are left out of the model
                if (!stats.StdDev.HasValue)
                {
                    continue;
                }
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.StdDev.Value, stats.Mean);
                break;

            case NormalisationType.Euclidean:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L2Norm);
                break;

            case NormalisationType.Manhattan:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.L1Norm);
                break;

            case NormalisationType.FeatureScale:
                entry = new DataTableNormalisation.Column(info.ColumnIndex, column.Type, stats.Max - stats.Min, stats.Min);
                break;

            default:
                throw new NotImplementedException();
        }
        normalisations.Add(entry);
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = normalisations.ToArray()
    };
}
/// <summary>
/// Works out how each analysed column of a table maps into an input or output vector
/// </summary>
/// <param name="table">Table to vectorise</param>
/// <param name="useTargetColumnIndex">True to treat the table's target column as the output; false to treat every column as input</param>
public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex)
{
    _column = table.Columns;
    _classColumnIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
    _analysis = table.GetAnalysis();

    foreach (var info in _analysis.ColumnInfo)
    {
        var column = table.Columns[info.ColumnIndex];
        var isTarget = info.ColumnIndex == _classColumnIndex;
        int width;
        bool continuous;

        if (info is IIndexColumnInfo indexInfo)
        {
            // index columns take one slot per possible index (0..MaxIndex)
            continuous = false;
            width = Convert.ToInt32(indexInfo.MaxIndex + 1);
        }
        else if (column.IsContinuous || !info.NumDistinct.HasValue)
        {
            // continuous columns (and those with no distinct count) take a single slot
            continuous = true;
            width = 1;
            if (!isTarget)
            {
                _columnName.Add(column.Name);
            }
        }
        else
        {
            // categorical columns take one slot per distinct value, ordered by string form
            continuous = false;
            width = info.NumDistinct.Value;

            var ordered = info.DistinctValues
                .Select(v => v.ToString())
                .OrderBy(v => v)
                .Select((v, position) => new { Category = v, Position = position })
                .ToList();
            var forward = ordered.ToDictionary(e => e.Category, e => e.Position);
            var backward = ordered.ToDictionary(e => e.Position, e => e.Category);
            _columnMap.Add(info.ColumnIndex, forward);
            _reverseColumnMap.Add(info.ColumnIndex, backward);

            if (!isTarget)
            {
                for (var slot = 0; slot < width; slot++)
                {
                    _columnName.Add(column.Name + ":" + backward[slot]);
                }
            }
        }

        if (isTarget)
        {
            _outputSize = width;
            _isTargetContinuous = continuous;
            _hasTarget = true;
        }
        else
        {
            _inputSize += width;
        }
    }
}
/// <summary>
/// Builds a normalisation model for the continuous numeric columns of a data table
/// </summary>
/// <param name="dataTable">Table whose analysis supplies the per-column statistics</param>
/// <param name="type">Normalisation type that selects which statistics are used</param>
/// <param name="output">Stream handed to the data table writer</param>
/// <param name="columnIndices">Indices of the columns to consider, or null to consider every analysed column</param>
/// <exception cref="NotImplementedException">The normalisation type is not one of the handled types</exception>
public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
{
    _writer = new DataTableWriter(dataTable.Columns, output);
    var analysis = dataTable.GetAnalysis();

    // The filter runs in memory, so plain LINQ to Objects is enough; wrapping the
    // sequence in an IQueryable would only add expression tree overhead.
    var columns = analysis.ColumnInfo.AsEnumerable();
    if (columnIndices != null)
    {
        var columnSet = new HashSet<int>(columnIndices);
        columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
    }

    var columnNormList = new List<DataTableNormalisation.Column>();
    foreach (var columnInfo in columns)
    {
        var column = dataTable.Columns[columnInfo.ColumnIndex];
        if (!column.IsContinuous || !(columnInfo is INumericColumnInfo numericInfo))
        {
            continue;
        }

        DataTableNormalisation.Column columnNorm;
        switch (type)
        {
            case NormalisationType.Standard:
                // columns without a standard deviation are left out of the model,
                // so no fallback value is needed once this guard has passed
                if (!numericInfo.StdDev.HasValue)
                {
                    continue;
                }
                columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.StdDev.Value, numericInfo.Mean);
                break;

            case NormalisationType.Euclidean:
                columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L2Norm);
                break;

            case NormalisationType.Manhattan:
                columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L1Norm);
                break;

            case NormalisationType.FeatureScale:
                columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.Max - numericInfo.Min, numericInfo.Min);
                break;

            default:
                throw new NotImplementedException();
        }
        columnNormList.Add(columnNorm);
    }

    _normalisationModel = new DataTableNormalisation
    {
        Type = type,
        ColumnNormalisation = columnNormList.ToArray()
    };
}
/// <summary>
/// Prepares a trainer for the table: records the distinct values of the target column
/// (as strings) and gives every row the same starting weight of 1 / row count
/// </summary>
/// <param name="table">Training data; its target column supplies the classifications</param>
public AdaBoostTrainer(IDataTable table)
{
    _table = table;
    _classColumnIndex = table.TargetColumnIndex;

    var targetInfo = table.GetAnalysis()[_classColumnIndex];
    _classificationList = targetInfo.DistinctValues
        .Select(value => value.ToString())
        .ToList();

    var rowCount = table.RowCount;
    var initialWeight = 1f / rowCount;
    _rowWeight = new float[rowCount];

    var row = 0;
    while (row < rowCount)
    {
        _rowWeight[row] = initialWeight;
        row++;
    }
}
/// <summary>
/// Builds the vectorisation model of a table: one column entry per analysed column plus
/// the total input size and, when a target column is used, the output size
/// </summary>
/// <param name="table">Table to vectorise</param>
/// <param name="useTargetColumnIndex">True to treat the table's target column as the output; false to treat every column as input</param>
public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex)
{
    var targetIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
    var analysis = table.GetAnalysis();
    _vectorisationModel = new DataTableVectorisation { ClassColumnIndex = targetIndex };

    var entries = new List<DataTableVectorisation.Column>();
    foreach (var info in analysis.ColumnInfo)
    {
        var column = table.Columns[info.ColumnIndex];

        var entry = new DataTableVectorisation.Column();
        entry.ColumnIndex = info.ColumnIndex;
        entry.IsContinuous = false;
        entry.IsTargetColumn = info.ColumnIndex == targetIndex;
        entry.Size = 0;
        entry.Name = column.Name;
        entries.Add(entry);

        if (info is IIndexColumnInfo indexInfo)
        {
            // index columns take one slot per possible index (0..MaxIndex)
            entry.Size = Convert.ToInt32(indexInfo.MaxIndex + 1);
        }
        else if (column.IsContinuous || !info.NumDistinct.HasValue)
        {
            // continuous columns (and those with no distinct count) take a single slot
            entry.IsContinuous = true;
            entry.Size = 1;
        }
        else
        {
            // categorical columns take one slot per distinct value, indexed in sorted string order
            entry.IsContinuous = false;
            entry.Size = info.NumDistinct ?? 0;

            var categories = info.DistinctValues
                .Select(v => v.ToString())
                .OrderBy(v => v)
                .ToArray();
            var indexed = new DataTableVectorisation.CategoricalIndex[categories.Length];
            for (var i = 0; i < categories.Length; i++)
            {
                indexed[i] = new DataTableVectorisation.CategoricalIndex
                {
                    Category = categories[i],
                    Index = i
                };
            }
            entry.Values = indexed;
        }

        if (entry.IsTargetColumn)
        {
            _vectorisationModel.OutputSize = entry.Size;
            _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
            _vectorisationModel.HasTarget = true;
        }
        else
        {
            _vectorisationModel.InputSize += entry.Size;
        }
    }

    _vectorisationModel.Columns = entries.ToArray();
}
/// <summary>
/// Builds the vectorisation model of a table: one column entry per analysed column plus
/// the total input size and, when a target column is used, the output details
/// </summary>
/// <param name="table">Table to vectorise</param>
/// <param name="useTargetColumnIndex">True to treat the table's target column as the output; false to treat every column as input</param>
/// <param name="maxNumericCategoricalExpansion">Numeric columns with more distinct values than this are treated as continuous instead of being expanded per value</param>
public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex, int maxNumericCategoricalExpansion = 128)
{
    var targetIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
    var analysis = table.GetAnalysis();
    _vectorisationModel = new DataTableVectorisation { ClassColumnIndex = targetIndex };

    var entries = new List<DataTableVectorisation.Column>();
    foreach (var info in analysis.ColumnInfo)
    {
        var column = table.Columns[info.ColumnIndex];

        var entry = new DataTableVectorisation.Column();
        entry.ColumnIndex = info.ColumnIndex;
        entry.IsContinuous = false;
        entry.IsTargetColumn = info.ColumnIndex == targetIndex;
        entry.Size = 0;
        entry.Name = column.Name;
        entries.Add(entry);

        if (column.Type == ColumnType.Boolean)
        {
            // booleans take a single slot and are flagged as binary
            entry.Size = 1;
            entry.IsBinary = true;
        }
        else if (info is IIndexColumnInfo indexInfo)
        {
            // index columns take one slot per possible index (0..MaxIndex)
            entry.Size = Convert.ToInt32(indexInfo.MaxIndex + 1);
        }
        else if (info is IDimensionsColumnInfo dimensions)
        {
            // size is the product of the reported dimensions; a missing X dimension gives zero
            var total = dimensions.XDimension ?? 0;
            total *= dimensions.YDimension ?? 1;
            total *= dimensions.ZDimension ?? 1;
            entry.Size = total;
        }
        else if (info is INumericColumnInfo)
        {
            var treatAsContinuous = column.IsContinuous
                || !info.NumDistinct.HasValue
                || info.NumDistinct.Value > maxNumericCategoricalExpansion;
            entry.IsContinuous = treatAsContinuous;

            if (treatAsContinuous)
            {
                entry.Size = 1;
            }
            else
            {
                entry.Size = info.NumDistinct ?? 0;
                entry.Values = _GetCategoricalValues(info.DistinctValues);
            }
        }
        else if (info.NumDistinct.HasValue)
        {
            // any other column with a distinct count is expanded per distinct value
            entry.Size = info.NumDistinct ?? 0;
            entry.Values = _GetCategoricalValues(info.DistinctValues);
        }

        if (entry.IsTargetColumn)
        {
            _vectorisationModel.OutputSize = entry.Size;
            _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
            _vectorisationModel.HasTarget = true;
            _vectorisationModel.IsTargetBinary = entry.IsBinary;
        }
        else
        {
            _vectorisationModel.InputSize += entry.Size;
        }
    }

    _vectorisationModel.Columns = entries.ToArray();
}