Esempio n. 1
0
        /// <summary>
        /// Creates a normaliser for <paramref name="dataTable"/> and builds its normalisation model
        /// from the table's analysis, covering continuous numeric columns and vector columns.
        /// </summary>
        /// <param name="dataTable">Table that is analysed; its column list is also handed to the writer.</param>
        /// <param name="type">Normalisation type; forwarded to <c>_GetColumn</c> and stored on the model.</param>
        /// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
        /// <param name="columnIndices">
        /// Optional set of column indices to consider; when <c>null</c> every analysed column is considered.
        /// </param>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
        {
            _writer = new DataTableWriter(dataTable.Columns, output);

            var analysis       = dataTable.GetAnalysis();
            var columnNormList = new List<DataTableNormalisation.Column>();

            // Plain LINQ-to-objects is enough here; AsQueryable() would only add
            // expression-tree compilation when the filter below is enumerated.
            var columns = analysis.ColumnInfo.AsEnumerable();

            if (columnIndices != null)
            {
                var columnSet = new HashSet<int>(columnIndices);
                columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
            }

            // vector columns need a separate pass over the rows, so they are only recorded here
            var vectorColumns = new List<(int ColumnIndex, int Size)>();

            foreach (var columnInfo in columns)
            {
                var column = dataTable.Columns[columnInfo.ColumnIndex];
                if (column.IsContinuous && columnInfo is INumericColumnInfo numericInfo)
                {
                    // _GetColumn may yield null, in which case the column is left out of the model
                    var columnNorm = _GetColumn(type, numericInfo, columnInfo.ColumnIndex, column.Type);
                    if (columnNorm != null)
                    {
                        columnNormList.Add(columnNorm);
                    }
                }
                else if (column.Type == ColumnType.Vector && columnInfo is IDimensionsColumnInfo vector && vector.XDimension.HasValue && vector.XDimension.Value > 0)
                {
                    vectorColumns.Add((column.Index, vector.XDimension.Value));
                }
            }

            DataTableNormalisation.VectorColumn[] vectorColumnNormList = null;
            if (vectorColumns.Count > 0)
            {
                // one NumberCollector per vector element (constructed with the element index), per vector column
                var collectors = vectorColumns
                    .Select(vc => Enumerable.Range(0, vc.Size).Select(i => new NumberCollector(i)).ToList())
                    .ToList();

                dataTable.ForEach(row => {
                    // indexed loop instead of rebuilding a Zip of the two lists for every row
                    for (var k = 0; k < vectorColumns.Count; k++)
                    {
                        var vectorAsRow = row.GetField<FloatVector>(vectorColumns[k].ColumnIndex).AsRow();
                        foreach (var collector in collectors[k])
                        {
                            collector.Process(vectorAsRow);
                        }
                    }
                });

                // NOTE(review): unlike the scalar columns above, a null result from _GetColumn is not
                // filtered out here - confirm that consumers of VectorColumns tolerate null entries.
                vectorColumnNormList = collectors.Select((c, i) => new DataTableNormalisation.VectorColumn {
                    ColumnIndex   = vectorColumns[i].ColumnIndex,
                    VectorColumns = c.Select((nc, j) => _GetColumn(type, nc, j, ColumnType.Float)).ToArray()
                }).ToArray();
            }

            _normalisationModel = new DataTableNormalisation {
                Type = type,
                ColumnNormalisation       = columnNormList.ToArray(),
                VectorColumnNormalisation = vectorColumnNormList
            };
        }
Esempio n. 2
0
        /// <summary>
        /// Creates a normaliser for <paramref name="dataTable"/>. A supplied <paramref name="model"/> is
        /// used as-is; otherwise a model is built from the table's analysis of its continuous numeric columns.
        /// </summary>
        /// <param name="dataTable">Table to normalise; stored and also used to configure the writer.</param>
        /// <param name="type">Normalisation type used when a model has to be built.</param>
        /// <param name="output">Optional stream passed to the <see cref="DataTableWriter"/>.</param>
        /// <param name="model">Optional pre-built model; when non-null no analysis is performed.</param>
        /// <exception cref="NotImplementedException">
        /// A model has to be built, a continuous numeric column is reached and <paramref name="type"/>
        /// is not one of the handled normalisation types.
        /// </exception>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output = null, DataTableNormalisation model = null)
        {
            _table  = dataTable;
            _writer = new DataTableWriter(dataTable.Columns, output);

            // a caller-supplied model short-circuits everything below
            if (model != null)
            {
                _normalisationModel = model;
                return;
            }

            var tableAnalysis     = dataTable.GetAnalysis();
            var normalisedColumns = new List<DataTableNormalisation.Column>();

            foreach (var info in tableAnalysis.ColumnInfo)
            {
                var tableColumn = dataTable.Columns[info.ColumnIndex];
                if (!tableColumn.IsContinuous)
                {
                    continue;
                }

                var stats = info as INumericColumnInfo;
                if (stats == null)
                {
                    continue;
                }

                switch (type)
                {
                    case NormalisationType.Standard:
                        // columns without a standard deviation are skipped for this type
                        if (!stats.StdDev.HasValue)
                        {
                            continue;
                        }
                        normalisedColumns.Add(new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, stats.StdDev.Value, stats.Mean));
                        break;

                    case NormalisationType.Euclidean:
                        normalisedColumns.Add(new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, stats.L2Norm));
                        break;

                    case NormalisationType.Manhattan:
                        normalisedColumns.Add(new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, stats.L1Norm));
                        break;

                    case NormalisationType.FeatureScale:
                        normalisedColumns.Add(new DataTableNormalisation.Column(info.ColumnIndex, tableColumn.Type, stats.Max - stats.Min, stats.Min));
                        break;

                    default:
                        throw new NotImplementedException();
                }
            }

            _normalisationModel = new DataTableNormalisation {
                Type                = type,
                ColumnNormalisation = normalisedColumns.ToArray()
            };
        }
Esempio n. 3
0
        /// <summary>
        /// Parses <paramref name="line"/>, converts each parsed field against the writer column at the
        /// same position and appends the converted values to <paramref name="writer"/> as one row.
        /// </summary>
        /// <param name="line">Raw text line handed to <c>_Parse</c>.</param>
        /// <param name="writer">Writer that supplies the columns and receives the row.</param>
        private void _Add(string line, DataTableWriter writer)
        {
            var targetColumns = writer.Columns;
            var rawFields     = _Parse(line);

            // Zip pairs column k with field k and stops at the shorter sequence;
            // the column's _type field is passed by reference, so _Convert is able to modify it.
            var rowValues = targetColumns
                .Zip(rawFields, (column, text) => _Convert(ref column._type, text))
                .ToList();

            writer.AddRow(rowValues);
        }
Esempio n. 4
0
        /// <summary>
        /// Builds a new table from <paramref name="table"/> using only the columns whose positions
        /// appear in <paramref name="columns"/>.
        /// </summary>
        /// <param name="table">Source table; it is processed with a <c>DataTableProjector</c>.</param>
        /// <param name="columns">Positions of the columns to keep. Enumerated exactly once.</param>
        /// <param name="output">Optional stream passed to the <see cref="DataTableWriter"/>.</param>
        /// <returns>The table produced by the writer after the source table has been processed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="columns"/> is <c>null</c>.</exception>
        public static IDataTable Project(IDataTable table, IEnumerable<int> columns, Stream output = null)
        {
            if (columns == null)
            {
                throw new ArgumentNullException(nameof(columns));
            }

            // Materialise once: the sequence feeds both the membership set and the projector,
            // and a deferred or one-shot enumerable must not be walked twice.
            var columnList  = columns.ToList();
            var validColumn = new HashSet<int>(columnList);

            // the indexed Where overload filters by position without allocating a tuple per column
            var keptColumns = table.Columns.Where((c, i) => validColumn.Contains(i));
            var writer      = new DataTableWriter(keptColumns, output);
            var projector   = new DataTableProjector(writer, columnList);

            table.Process(projector);
            return writer.GetDataTable();
        }
Esempio n. 5
0
        /// <summary>
        /// Creates a normaliser for <paramref name="dataTable"/> and builds its normalisation model from
        /// the table's analysis of its continuous numeric columns, optionally restricted to a set of columns.
        /// </summary>
        /// <param name="dataTable">Table that is analysed; its column list is also handed to the writer.</param>
        /// <param name="type">Normalisation type that selects which statistics go into each column entry.</param>
        /// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
        /// <param name="columnIndices">
        /// Optional set of column indices to consider; when <c>null</c> every analysed column is considered.
        /// </param>
        /// <exception cref="NotImplementedException">
        /// A continuous numeric column is reached and <paramref name="type"/> is not one of the handled
        /// normalisation types.
        /// </exception>
        public DataTableNormaliser(IDataTable dataTable, NormalisationType type, Stream output, IEnumerable<int> columnIndices)
        {
            _writer = new DataTableWriter(dataTable.Columns, output);

            var analysis       = dataTable.GetAnalysis();
            var columnNormList = new List<DataTableNormalisation.Column>();

            // Plain LINQ-to-objects is enough here; AsQueryable() would only add
            // expression-tree compilation when the filter below is enumerated.
            var columns = analysis.ColumnInfo.AsEnumerable();

            if (columnIndices != null)
            {
                var columnSet = new HashSet<int>(columnIndices);
                columns = columns.Where(ci => columnSet.Contains(ci.ColumnIndex));
            }

            foreach (var columnInfo in columns)
            {
                var column = dataTable.Columns[columnInfo.ColumnIndex];

                // only continuous columns that expose numeric statistics take part
                if (!column.IsContinuous || !(columnInfo is INumericColumnInfo numericInfo))
                {
                    continue;
                }

                DataTableNormalisation.Column columnNorm;
                switch (type)
                {
                    case NormalisationType.Standard:
                        // columns without a standard deviation are skipped for this type
                        if (!numericInfo.StdDev.HasValue)
                        {
                            continue;
                        }
                        columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.StdDev.Value, numericInfo.Mean);
                        break;

                    case NormalisationType.Euclidean:
                        columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L2Norm);
                        break;

                    case NormalisationType.Manhattan:
                        columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.L1Norm);
                        break;

                    case NormalisationType.FeatureScale:
                        columnNorm = new DataTableNormalisation.Column(columnInfo.ColumnIndex, column.Type, numericInfo.Max - numericInfo.Min, numericInfo.Min);
                        break;

                    default:
                        throw new NotImplementedException();
                }

                columnNormList.Add(columnNorm);
            }

            _normalisationModel = new DataTableNormalisation {
                Type = type,
                ColumnNormalisation = columnNormList.ToArray()
            };
        }
Esempio n. 6
0
        /// <summary>
        /// Works out column names and column types from the parsed text lines and returns a
        /// <see cref="DataTableWriter"/> over <paramref name="stream"/> with those columns added.
        /// </summary>
        /// <param name="stream">Stream passed to the new writer.</param>
        /// <param name="lines">Raw text lines; the first one is inspected for header names.</param>
        /// <param name="checkForHeader">
        /// When <c>true</c>, <paramref name="hasHeader"/> is overwritten with whether every field of the
        /// first line is detected as <c>ColumnType.String</c>.
        /// </param>
        /// <param name="hasHeader">
        /// Whether the first line is a header. Read as supplied when <paramref name="checkForHeader"/> is <c>false</c>.
        /// </param>
        /// <returns>A writer with one column per (name, type) pair.</returns>
        /// <exception cref="InvalidOperationException"><paramref name="lines"/> is empty.</exception>
        private DataTableWriter _DetermineHeaders(Stream stream, List<string> lines, bool checkForHeader, ref bool hasHeader)
        {
            // Materialise the parsed first line once: it is scanned for the header test and
            // again for the names, and _Parse should not have to run twice for it.
            var firstLineFields = _Parse(lines.First()).ToList();

            // see if there is a header (all strings)
            if (checkForHeader)
            {
                hasHeader = firstLineFields.All(str => _DetermineType(str) == ColumnType.String);
            }

            // header names: the first line's own fields, or generated "_colN" names
            var headerNames = new List<string>(firstLineFields.Count);
            var index       = 0;

            foreach (var item in firstLineFields)
            {
                headerNames.Add(hasHeader ? item : "_col" + index++);
            }

            // Column types: for every field position take the detected type with the greatest
            // numeric enum value across all data lines. The explicit cast replaces Cast<ColumnType>(),
            // which depended on unboxing a boxed int as the enum.
            var columnTypes = lines
                .Skip(hasHeader ? 1 : 0)
                .SelectMany(line => _Parse(line).Select((str, pos) => Tuple.Create(str, pos)))
                .GroupBy(l => l.Item2, l => _DetermineType(l.Item1))
                .OrderBy(g => g.Key)
                .Select(g => (ColumnType)g.Max(v => (int)v))
                .ToList();

            // add the columns; Zip stops at the shorter of the two lists
            var ret = new DataTableWriter(stream);

            foreach (var column in headerNames.Zip(columnTypes, (name, type) => Tuple.Create(name, type)))
            {
                ret.AddColumn(column.Item1, column.Item2);
            }

            return ret;
        }
Esempio n. 7
0
 /// <summary>
 /// Creates a normaliser that uses an already-built normalisation model.
 /// </summary>
 /// <param name="dataTable">Table whose column list configures the writer.</param>
 /// <param name="output">Stream passed to the <see cref="DataTableWriter"/>.</param>
 /// <param name="model">Normalisation model, stored as supplied.</param>
 public DataTableNormaliser(IDataTable dataTable, Stream output, DataTableNormalisation model)
 {
     var tableColumns = dataTable.Columns;

     _writer = new DataTableWriter(tableColumns, output);
     _normalisationModel = model;
 }