/// <summary>
/// Builds the vectorisation model for a data table from the table's column analysis.
/// Every analysed column is assigned a width (<c>Size</c>): index columns take
/// <c>MaxIndex + 1</c>, continuous columns take 1 and the remaining (categorical) columns
/// take one slot per distinct value. Non-target widths are accumulated into the model's
/// input size; the target column (if any) defines the output size.
/// </summary>
/// <param name="table">Table whose columns and analysis are used to build the model.</param>
/// <param name="useTargetColumnIndex">
/// When true the table's <c>TargetColumnIndex</c> identifies the target column; when false
/// -1 is stored instead (presumably so that no real column index matches it).
/// </param>
public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex)
{
    var targetIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
    var tableAnalysis = table.GetAnalysis();

    _vectorisationModel = new DataTableVectorisation
    {
        ClassColumnIndex = targetIndex
    };

    var entries = new List<DataTableVectorisation.Column>();
    foreach (var info in tableAnalysis.ColumnInfo)
    {
        var index = info.ColumnIndex;
        var sourceColumn = table.Columns[index];
        var entry = new DataTableVectorisation.Column
        {
            ColumnIndex = index,
            IsContinuous = false,
            IsTargetColumn = index == targetIndex,
            Size = 0,
            Name = sourceColumn.Name
        };

        // The entry is registered before it is filled in, exactly as the original did.
        entries.Add(entry);

        switch (info)
        {
            case IIndexColumnInfo indexed:
                // Width covers every index from zero up to and including the maximum.
                entry.Size = Convert.ToInt32(indexed.MaxIndex + 1);
                break;

            default:
                // A column without a distinct-value count cannot be expanded into categories.
                entry.IsContinuous = sourceColumn.IsContinuous || !info.NumDistinct.HasValue;
                if (entry.IsContinuous)
                {
                    entry.Size = 1;
                    break;
                }

                entry.Size = info.NumDistinct ?? 0;

                // Category labels are the string forms of the distinct values, sorted with the
                // default string ordering and then numbered from zero in that order.
                var orderedLabels = info.DistinctValues
                    .Select(value => value.ToString())
                    .OrderBy(label => label)
                    .ToArray();
                var categories = new DataTableVectorisation.CategoricalIndex[orderedLabels.Length];
                for (var position = 0; position < orderedLabels.Length; position++)
                {
                    categories[position] = new DataTableVectorisation.CategoricalIndex
                    {
                        Category = orderedLabels[position],
                        Index = position
                    };
                }
                entry.Values = categories;
                break;
        }

        if (!entry.IsTargetColumn)
        {
            _vectorisationModel.InputSize += entry.Size;
            continue;
        }

        _vectorisationModel.OutputSize = entry.Size;
        _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
        _vectorisationModel.HasTarget = true;
    }

    _vectorisationModel.Columns = entries.ToArray();
}
/// <summary>
/// Builds the vectorisation model for a data table from the table's column analysis.
/// Column widths (<c>Size</c>) are chosen in this order of precedence:
/// boolean columns take 1 and are flagged binary; index columns take <c>MaxIndex + 1</c>;
/// columns with dimensions take the product of the dimensions that are present;
/// numeric columns take 1 when continuous, otherwise one slot per distinct value;
/// any other column with a distinct-value count takes one slot per distinct value.
/// Non-target widths are accumulated into the model's input size; the target column
/// (if any) defines the output size.
/// </summary>
/// <param name="table">Table whose columns and analysis are used to build the model.</param>
/// <param name="useTargetColumnIndex">
/// When true the table's <c>TargetColumnIndex</c> identifies the target column; when false
/// -1 is stored instead (presumably so that no real column index matches it).
/// </param>
/// <param name="maxNumericCategoricalExpansion">
/// Numeric columns with more distinct values than this are treated as continuous
/// rather than being expanded into categories.
/// </param>
public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex, int maxNumericCategoricalExpansion = 128)
{
    var targetIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
    var tableAnalysis = table.GetAnalysis();

    _vectorisationModel = new DataTableVectorisation
    {
        ClassColumnIndex = targetIndex
    };

    var entries = new List<DataTableVectorisation.Column>();
    foreach (var info in tableAnalysis.ColumnInfo)
    {
        var index = info.ColumnIndex;
        var sourceColumn = table.Columns[index];
        var entry = new DataTableVectorisation.Column
        {
            ColumnIndex = index,
            IsContinuous = false,
            IsTargetColumn = index == targetIndex,
            Size = 0,
            Name = sourceColumn.Name
        };

        // The entry is registered before it is filled in, exactly as the original did.
        entries.Add(entry);

        if (sourceColumn.Type == ColumnType.Boolean)
        {
            // The boolean check takes precedence over whatever kind of analysis the column has.
            entry.Size = 1;
            entry.IsBinary = true;
        }
        else
        {
            switch (info)
            {
                case IIndexColumnInfo indexed:
                    // Width covers every index from zero up to and including the maximum.
                    entry.Size = Convert.ToInt32(indexed.MaxIndex + 1);
                    break;

                case IDimensionsColumnInfo shaped:
                    // A missing X dimension gives a width of zero; missing Y/Z leave it unchanged.
                    var width = shaped.XDimension ?? 0;
                    width *= shaped.YDimension ?? 1;
                    width *= shaped.ZDimension ?? 1;
                    entry.Size = width;
                    break;

                case INumericColumnInfo _:
                    entry.IsContinuous = sourceColumn.IsContinuous
                        || !info.NumDistinct.HasValue
                        || info.NumDistinct.Value > maxNumericCategoricalExpansion;
                    if (entry.IsContinuous)
                    {
                        entry.Size = 1;
                    }
                    else
                    {
                        entry.Size = info.NumDistinct.GetValueOrDefault();
                        entry.Values = _GetCategoricalValues(info.DistinctValues);
                    }
                    break;

                default:
                    // Columns with no distinct-value count keep the initial width of zero.
                    if (info.NumDistinct.HasValue)
                    {
                        entry.Size = info.NumDistinct.GetValueOrDefault();
                        entry.Values = _GetCategoricalValues(info.DistinctValues);
                    }
                    break;
            }
        }

        if (!entry.IsTargetColumn)
        {
            _vectorisationModel.InputSize += entry.Size;
            continue;
        }

        _vectorisationModel.OutputSize = entry.Size;
        _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
        _vectorisationModel.HasTarget = true;
        _vectorisationModel.IsTargetBinary = entry.IsBinary;
    }

    _vectorisationModel.Columns = entries.ToArray();
}