예제 #1
0
        /// <summary>
        /// Builds the transform by making one pass over <paramref name="input"/> and feeding each row
        /// to a per-column function builder, then materializing one column function per entry of Infos.
        /// </summary>
        /// <param name="host">Host used for argument checking, assertions and the progress channel.</param>
        /// <param name="args">Transform arguments; supplies the column definitions and the test type passed to the base constructor.</param>
        /// <param name="input">The data view handed to the base constructor.</param>
        /// <param name="fnCreate">
        /// Factory invoked once per column with (column index within Infos, source column index,
        /// source column type, the cursor) that returns the builder for that column.
        /// </param>
        /// <param name="extraTrainColumnIds">
        /// Optional additional source column indices to activate on the cursor besides the columns
        /// referenced by Infos. May be null or empty.
        /// </param>
        private NormalizeTransform(IHost host, ArgumentsBase args, IDataView input,
                                   Func<int, int, ColumnType, IRowCursor, IColumnFunctionBuilder> fnCreate,
                                   params int[] extraTrainColumnIds)
            : base(host, host.CheckRef(args, nameof(args)).GetColumns(), input, args.TestType)
        {
            Host.AssertNonEmpty(Infos);
            Host.Assert(Utils.Size(Infos) == Utils.Size(args.GetColumns()));

            // Flag every source column the cursor must expose: first the caller-requested extras...
            var requiredColumns = new bool[Source.Schema.ColumnCount];
            int extraCount = Utils.Size(extraTrainColumnIds);
            for (int k = 0; k < extraCount; k++)
            {
                int extraId = extraTrainColumnIds[k];
                Host.Assert(0 <= extraId && extraId < requiredColumns.Length);
                requiredColumns[extraId] = true;
            }

            // ...then the source column of each column being normalized.
            int columnCount = Infos.Length;
            for (int c = 0; c < columnCount; c++)
            {
                requiredColumns[Infos[c].Source] = true;
            }

            var builders = new IColumnFunctionBuilder[columnCount];
            // needMoreData[c] holds the latest ProcessValue() result for column c; once it is false
            // that column's builder is no longer called.
            var needMoreData = new bool[columnCount];

            // Stream the input through the builders, reporting the row count as progress.
            using (var progress = Host.StartProgressChannel("Normalize"))
            {
                long rowsSeen = 0;

                // The callback captures rowsSeen, so it reports whatever the count is when it is invoked.
                progress.SetHeader(new ProgressHeader("examples"), e => e.SetProgress(0, rowsSeen));
                using (var rows = Source.GetRowCursor(col => requiredColumns[col]))
                {
                    // Builders are created against the cursor before it is advanced.
                    for (int c = 0; c < columnCount; c++)
                    {
                        var colInfo = Infos[c];
                        builders[c] = fnCreate(c, colInfo.Source, colInfo.TypeSrc, rows);
                        needMoreData[c] = true;
                    }

                    // A row with bad values still contributes its good values to training: the
                    // comparisons are arranged so that NaNs in the input are not recorded.
                    // REVIEW: Should infinities and/or NaNs be filtered before the normalization?
                    // Should we not record infinities for min/max? Currently, infinities are recorded
                    // and will result in zero scale, which in turn yields NaN output for infinity input.
                    //
                    // The pass ends when the cursor is exhausted or when, after a row, no builder
                    // asked for more data (the cursor is not advanced again in that case).
                    bool anyBuilderWantsMore = true;
                    while (anyBuilderWantsMore && rows.MoveNext())
                    {
                        anyBuilderWantsMore = false;
                        for (int c = 0; c < columnCount; c++)
                        {
                            if (needMoreData[c])
                            {
                                var colInfo = Infos[c];
                                // Vector-typed sources are expected to have a known size.
                                Host.Assert(!colInfo.TypeSrc.IsVector || colInfo.TypeSrc.IsKnownSizeVector);
                                Host.Assert(builders[c] != null);

                                bool wantsMore = builders[c].ProcessValue();
                                needMoreData[c] = wantsMore;
                                anyBuilderWantsMore |= wantsMore;
                            }
                        }
                        rowsSeen++;
                    }
                }

                progress.Checkpoint(rowsSeen);

                // Turn each builder into its final column function.
                _functions = new IColumnFunction[columnCount];
                for (int c = 0; c < columnCount; c++)
                {
                    _functions[c] = builders[c].CreateColumnFunction();
                }
            }
            SetMetadata();
        }