예제 #1
0
        /// <summary>
        /// Builds the normalisation model for a data table from its analysis,
        /// optionally restricted to a subset of columns.
        /// </summary>
        /// <param name="dataTable">Table whose analysis drives the normalisation model</param>
        /// <param name="type">Normalisation type; forwarded to _GetColumn and stored on the model</param>
        /// <param name="output">Stream handed to the internal DataTableWriter</param>
        /// <param name="columnIndices">Column indices to consider, or null to consider every analysed column</param>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
        {
            _writer = new DataTableWriter(dataTable.Columns, output);

            var analysis       = dataTable.GetAnalysis();
            var columnNormList = new List<DataTableNormalisation.Column>();
            var columnInfos    = analysis.ColumnInfo;

            // A null set means "no filter". The filter is applied inside the loop so the
            // in-memory sequence does not have to be wrapped in an IQueryable.
            var columnSet = columnIndices != null ? new HashSet<int>(columnIndices) : null;

            var vectorColumns = new List<(int ColumnIndex, int Size)>();

            foreach (var columnInfo in columnInfos)
            {
                if (columnSet != null && !columnSet.Contains(columnInfo.ColumnIndex))
                {
                    continue;
                }

                var column = dataTable.Columns[columnInfo.ColumnIndex];
                if (column.IsContinuous && columnInfo is INumericColumnInfo numericInfo)
                {
                    // _GetColumn may return null, in which case the column is left out of the model
                    var columnNorm = _GetColumn(type, numericInfo, columnInfo.ColumnIndex, column.Type);
                    if (columnNorm != null)
                    {
                        columnNormList.Add(columnNorm);
                    }
                }
                else if (column.Type == ColumnType.Vector && columnInfo is IDimensionsColumnInfo vector && vector.XDimension.HasValue && vector.XDimension.Value > 0)
                {
                    // Remember the vector column together with its X dimension
                    vectorColumns.Add((column.Index, vector.XDimension.Value));
                }
            }

            DataTableNormalisation.VectorColumn[] vectorColumnNormList = null;
            if (vectorColumns.Count > 0)
            {
                // One NumberCollector per vector element, each constructed with its element index
                var collectors = vectorColumns.Select(vc => Enumerable.Range(0, vc.Size).Select(i => new NumberCollector(i)).ToList()).ToList();

                // Pair each vector column with its collectors once, rather than on every row callback
                var columnCollectors = vectorColumns.Zip(collectors, (vc, c) => (vc, c)).ToList();

                dataTable.ForEach(row => {
                    foreach (var pair in columnCollectors)
                    {
                        // Each collector is fed the vector viewed as a row
                        var vectorAsRow = row.GetField<FloatVector>(pair.Item1.ColumnIndex).AsRow();
                        foreach (var collector in pair.Item2)
                        {
                            collector.Process(vectorAsRow);
                        }
                    }
                });

                // NOTE(review): the scalar path above discards null results from _GetColumn,
                // whereas here they are stored as-is - confirm that consumers tolerate null entries.
                vectorColumnNormList = collectors.Select((c, i) => new DataTableNormalisation.VectorColumn {
                    ColumnIndex   = vectorColumns[i].ColumnIndex,
                    VectorColumns = c.Select((nc, j) => _GetColumn(type, nc, j, ColumnType.Float)).ToArray()
                }).ToArray();
            }

            _normalisationModel = new DataTableNormalisation {
                Type = type,
                ColumnNormalisation       = columnNormList.ToArray(),
                VectorColumnNormalisation = vectorColumnNormList
            };
        }
예제 #2
0
        /// <summary>
        /// Creates a row classifier node
        /// </summary>
        /// <param name="classifier">The classifier to apply to each row</param>
        /// <param name="dataTable">The data table that contains the rows to classify (linked by mini batch index)</param>
        /// <param name="analysis">Optional data table analysis; when null it is obtained from <paramref name="dataTable"/></param>
        /// <param name="name">Optional name to give the node</param>
        /// <returns>The created node together with its output size</returns>
        public (INode RowClassifier, int OutputSize) CreateClassifier(IRowClassifier classifier,
                                                                      IDataTable dataTable, IDataTableAnalysis analysis = null, string name = null)
        {
            var provider = LinearAlgebraProvider;

            // Only analyse the table when the caller did not supply an analysis
            var tableAnalysis = analysis ?? dataTable.GetAnalysis();

            var node = new RowClassifier(provider, classifier, dataTable, tableAnalysis, name);
            return (node, node.OutputSize);
        }
예제 #3
0
        /// <summary>
        /// Creates a normaliser for a data table, either reusing an existing
        /// normalisation model or building one from the table's analysis.
        /// </summary>
        /// <param name="dataTable">Table to normalise</param>
        /// <param name="type">Normalisation type used when a model has to be built</param>
        /// <param name="output">Optional stream handed to the internal DataTableWriter</param>
        /// <param name="model">Optional existing model; when supplied it is used as-is and no analysis is run</param>
        /// <exception cref="NotImplementedException">
        /// Thrown if a continuous numeric column is found and <paramref name="type"/> is not one of the handled values
        /// </exception>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output = null, DataTableNormalisation model = null)
        {
            _table  = dataTable;
            _writer = new DataTableWriter(dataTable.Columns, output);

            // A supplied model short-circuits everything else
            if (model != null)
            {
                _normalisationModel = model;
                return;
            }

            var tableAnalysis = dataTable.GetAnalysis();
            var normalisations = new List<DataTableNormalisation.Column>();

            foreach (var info in tableAnalysis.ColumnInfo)
            {
                var tableColumn = dataTable.Columns[info.ColumnIndex];
                if (!tableColumn.IsContinuous)
                {
                    continue;
                }

                var numeric = info as INumericColumnInfo;
                if (numeric == null)
                {
                    continue;
                }

                DataTableNormalisation.Column entry;
                switch (type)
                {
                    case NormalisationType.Standard:
                        // Columns without a standard deviation are left out of the model
                        if (!numeric.StdDev.HasValue)
                        {
                            continue;
                        }
                        entry = new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, numeric.StdDev.Value, numeric.Mean);
                        break;

                    case NormalisationType.Euclidean:
                        entry = new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, numeric.L2Norm);
                        break;

                    case NormalisationType.Manhattan:
                        entry = new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, numeric.L1Norm);
                        break;

                    case NormalisationType.FeatureScale:
                        entry = new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, numeric.Max - numeric.Min, numeric.Min);
                        break;

                    default:
                        throw new NotImplementedException();
                }

                normalisations.Add(entry);
            }

            _normalisationModel = new DataTableNormalisation {
                Type = type,
                ColumnNormalisation = normalisations.ToArray()
            };
        }
예제 #4
0
        /// <summary>
        /// Analyses a data table and works out how each column maps into the
        /// vectorised representation (sizes, category maps and output names).
        /// </summary>
        /// <param name="table">Table to vectorise</param>
        /// <param name="useTargetColumnIndex">
        /// When true the table's target column is treated as the output; otherwise no column is (index -1)
        /// </param>
        public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex)
        {
            _column           = table.Columns;
            _classColumnIndex = useTargetColumnIndex ? table.TargetColumnIndex : -1;
            _analysis         = table.GetAnalysis();

            foreach (var info in _analysis.ColumnInfo)
            {
                var tableColumn  = table.Columns[info.ColumnIndex];
                var targetColumn = info.ColumnIndex == _classColumnIndex;
                var width        = 0;
                var continuous   = false;

                if (info is IIndexColumnInfo indexInfo)
                {
                    // Index columns take one slot per possible index
                    width = Convert.ToInt32(indexInfo.MaxIndex + 1);
                }
                else if (tableColumn.IsContinuous || !info.NumDistinct.HasValue)
                {
                    // Continuous (or uncountable) columns take a single slot
                    continuous = true;
                    width      = 1;
                    if (!targetColumn)
                    {
                        _columnName.Add(tableColumn.Name);
                    }
                }
                else
                {
                    // Categorical column: one slot per distinct value, ordered by string representation
                    width = info.NumDistinct.Value;
                    var categories = info.DistinctValues
                        .Select(v => v.ToString())
                        .OrderBy(v => v)
                        .Select((text, position) => new { Text = text, Position = position })
                        .ToList();
                    var forward  = categories.ToDictionary(c => c.Text, c => c.Position);
                    var backward = categories.ToDictionary(c => c.Position, c => c.Text);
                    _columnMap.Add(info.ColumnIndex, forward);
                    _reverseColumnMap.Add(info.ColumnIndex, backward);

                    if (!targetColumn)
                    {
                        for (var slot = 0; slot < width; slot++)
                        {
                            _columnName.Add(tableColumn.Name + ":" + backward[slot]);
                        }
                    }
                }

                if (!targetColumn)
                {
                    _inputSize += width;
                }
                else
                {
                    _outputSize         = width;
                    _isTargetContinuous = continuous;
                    _hasTarget          = true;
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Builds a normalisation model from the analysis of a data table,
        /// optionally restricted to a subset of columns.
        /// </summary>
        /// <param name="dataTable">Table whose analysis drives the normalisation model</param>
        /// <param name="type">Normalisation type that selects which statistics are used per column</param>
        /// <param name="output">Stream handed to the internal DataTableWriter</param>
        /// <param name="columnIndices">Column indices to consider, or null to consider every analysed column</param>
        /// <exception cref="NotImplementedException">
        /// Thrown if a continuous numeric column is found and <paramref name="type"/> is not one of the handled values
        /// </exception>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
        {
            _writer = new DataTableWriter(dataTable.Columns, output);

            var analysis       = dataTable.GetAnalysis();
            var columnNormList = new List<DataTableNormalisation.Column>();
            var columnInfos    = analysis.ColumnInfo;

            // A null set means "no filter". The filter is applied inside the loop so the
            // in-memory sequence does not have to be wrapped in an IQueryable.
            var columnSet = columnIndices != null ? new HashSet<int>(columnIndices) : null;

            foreach (var columnInfo in columnInfos)
            {
                if (columnSet != null && !columnSet.Contains(columnInfo.ColumnIndex))
                {
                    continue;
                }

                var column = dataTable.Columns[columnInfo.ColumnIndex];
                if (column.IsContinuous && columnInfo is INumericColumnInfo numericInfo)
                {
                    DataTableNormalisation.Column columnNorm;
                    switch (type)
                    {
                        case NormalisationType.Standard:
                        {
                            // Columns without a standard deviation are left out of the model,
                            // so no fallback value is needed once this guard has passed
                            var stdDev = numericInfo.StdDev;
                            if (!stdDev.HasValue)
                            {
                                continue;
                            }
                            columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, stdDev.Value, numericInfo.Mean);
                            break;
                        }

                        case NormalisationType.Euclidean:
                            columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L2Norm);
                            break;

                        case NormalisationType.Manhattan:
                            columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L1Norm);
                            break;

                        case NormalisationType.FeatureScale:
                            columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.Max - numericInfo.Min, numericInfo.Min);
                            break;

                        default:
                            throw new NotImplementedException();
                    }

                    columnNormList.Add(columnNorm);
                }
            }

            _normalisationModel = new DataTableNormalisation {
                Type = type,
                ColumnNormalisation = columnNormList.ToArray()
            };
        }
예제 #6
0
        /// <summary>
        /// Creates a trainer over a data table, capturing the distinct values of the
        /// target column (as strings) and giving every row the same initial weight.
        /// </summary>
        /// <param name="table">Training data; its target column supplies the classifications</param>
        public AdaBoostTrainer(IDataTable table)
        {
            _table            = table;
            _classColumnIndex = table.TargetColumnIndex;

            var targetInfo = table.GetAnalysis()[_classColumnIndex];
            _classificationList = targetInfo.DistinctValues.Select(value => value.ToString()).ToList();

            // Every row starts with weight 1 / rowCount
            var total   = table.RowCount;
            var uniform = 1f / total;
            var weights = new float[total];
            for (var slot = 0; slot < weights.Length; slot++)
            {
                weights[slot] = uniform;
            }
            _rowWeight = weights;
        }
예제 #7
0
        /// <summary>
        /// Builds the vectorisation model for a data table: one column entry per
        /// analysed column plus the aggregate input and output sizes.
        /// </summary>
        /// <param name="table">Table to vectorise</param>
        /// <param name="useTargetColumnIndex">
        /// When true the table's target column is treated as the output; otherwise no column is (index -1)
        /// </param>
        public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex)
        {
            var targetIndex   = useTargetColumnIndex ? table.TargetColumnIndex : -1;
            var tableAnalysis = table.GetAnalysis();

            _vectorisationModel = new DataTableVectorisation {
                ClassColumnIndex = targetIndex
            };

            var entries = new List<DataTableVectorisation.Column>();

            foreach (var info in tableAnalysis.ColumnInfo)
            {
                var source = table.Columns[info.ColumnIndex];
                var entry  = new DataTableVectorisation.Column {
                    ColumnIndex    = info.ColumnIndex,
                    IsContinuous   = false,
                    IsTargetColumn = info.ColumnIndex == targetIndex,
                    Size           = 0,
                    Name           = source.Name
                };
                entries.Add(entry);

                if (info is IIndexColumnInfo indexInfo)
                {
                    // Index columns take one slot per possible index
                    entry.Size = Convert.ToInt32(indexInfo.MaxIndex + 1);
                }
                else if (source.IsContinuous || !info.NumDistinct.HasValue)
                {
                    // Continuous (or uncountable) columns take a single slot
                    entry.IsContinuous = true;
                    entry.Size         = 1;
                }
                else
                {
                    // Categorical column: categories are indexed in order of their string form
                    entry.IsContinuous = false;
                    entry.Size         = info.NumDistinct ?? 0;

                    var sortedCategories = info.DistinctValues
                        .Select(v => v.ToString())
                        .OrderBy(v => v);
                    entry.Values = sortedCategories
                        .Select((category, position) => new DataTableVectorisation.CategoricalIndex {
                            Category = category,
                            Index    = position
                        })
                        .ToArray();
                }

                if (!entry.IsTargetColumn)
                {
                    _vectorisationModel.InputSize += entry.Size;
                }
                else
                {
                    _vectorisationModel.OutputSize         = entry.Size;
                    _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
                    _vectorisationModel.HasTarget          = true;
                }
            }

            _vectorisationModel.Columns = entries.ToArray();
        }
예제 #8
0
        /// <summary>
        /// Builds the vectorisation model for a data table, handling boolean, index,
        /// dimensioned (vector), numeric and other categorical columns.
        /// </summary>
        /// <param name="table">Table to vectorise</param>
        /// <param name="useTargetColumnIndex">
        /// When true the table's target column is treated as the output; otherwise no column is (index -1)
        /// </param>
        /// <param name="maxNumericCategoricalExpansion">
        /// Numeric columns with more distinct values than this are treated as continuous
        /// </param>
        public DataTableVectoriser(IDataTable table, bool useTargetColumnIndex, int maxNumericCategoricalExpansion = 128)
        {
            var targetIndex   = useTargetColumnIndex ? table.TargetColumnIndex : -1;
            var tableAnalysis = table.GetAnalysis();

            _vectorisationModel = new DataTableVectorisation {
                ClassColumnIndex = targetIndex
            };

            var entries = new List<DataTableVectorisation.Column>();

            foreach (var info in tableAnalysis.ColumnInfo)
            {
                var source = table.Columns[info.ColumnIndex];
                var entry  = new DataTableVectorisation.Column {
                    ColumnIndex    = info.ColumnIndex,
                    IsContinuous   = false,
                    IsTargetColumn = info.ColumnIndex == targetIndex,
                    Size           = 0,
                    Name           = source.Name
                };
                entries.Add(entry);

                if (source.Type == ColumnType.Boolean)
                {
                    // Booleans occupy a single slot and are flagged as binary
                    entry.Size     = 1;
                    entry.IsBinary = true;
                }
                else if (info is IIndexColumnInfo indexInfo)
                {
                    // Index columns take one slot per possible index
                    entry.Size = Convert.ToInt32(indexInfo.MaxIndex + 1);
                }
                else if (info is IDimensionsColumnInfo dimensions)
                {
                    // Size is the product of the available dimensions; a missing X gives 0,
                    // missing Y or Z leave the product unchanged
                    var elementCount = dimensions.XDimension ?? 0;
                    elementCount *= dimensions.YDimension ?? 1;
                    elementCount *= dimensions.ZDimension ?? 1;
                    entry.Size = elementCount;
                }
                else if (info is INumericColumnInfo)
                {
                    var continuous = source.IsContinuous
                                     || !info.NumDistinct.HasValue
                                     || info.NumDistinct.Value > maxNumericCategoricalExpansion;
                    entry.IsContinuous = continuous;

                    if (!continuous)
                    {
                        // Few enough distinct values: expand as a categorical column
                        entry.Size   = info.NumDistinct ?? 0;
                        entry.Values = _GetCategoricalValues(info.DistinctValues);
                    }
                    else
                    {
                        entry.Size = 1;
                    }
                }
                else if (info.NumDistinct.HasValue)
                {
                    entry.Size   = info.NumDistinct ?? 0;
                    entry.Values = _GetCategoricalValues(info.DistinctValues);
                }

                if (!entry.IsTargetColumn)
                {
                    _vectorisationModel.InputSize += entry.Size;
                }
                else
                {
                    _vectorisationModel.OutputSize         = entry.Size;
                    _vectorisationModel.IsTargetContinuous = entry.IsContinuous;
                    _vectorisationModel.HasTarget          = true;
                    _vectorisationModel.IsTargetBinary     = entry.IsBinary;
                }
            }

            _vectorisationModel.Columns = entries.ToArray();
        }