/// <summary>
/// Builds the transform by making a pass over the input with a single cursor, feeding each row to
/// the per-column function builders produced by <paramref name="fnCreate"/>, and then asking each
/// builder for its finished column function.
/// </summary>
/// <param name="host">Host forwarded to the base constructor; also used to null-check <paramref name="args"/>.</param>
/// <param name="args">Arguments supplying the column definitions (<c>GetColumns()</c>) and <c>TestType</c>.</param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
/// <param name="fnCreate">
/// Factory invoked once per entry of <c>Infos</c> with the entry's index, its source column index,
/// its source type and the training cursor.
/// </param>
/// <param name="extraTrainColumnIds">
/// Optional additional source column indices to activate on the training cursor, on top of the
/// source column of every entry in <c>Infos</c>.
/// </param>
private NormalizeTransform(IHost host, ArgumentsBase args, IDataView input,
    Func<int, int, ColumnType, IRowCursor, IColumnFunctionBuilder> fnCreate,
    params int[] extraTrainColumnIds)
    : base(host, host.CheckRef(args, nameof(args)).GetColumns(), input, args.TestType)
{
    Host.AssertNonEmpty(Infos);
    Host.Assert(Utils.Size(Infos) == Utils.Size(args.GetColumns()));

    // Mark every source column the training cursor has to expose: first the caller-supplied
    // extras, then the source column of each transformed column.
    var isActive = new bool[Source.Schema.ColumnCount];
    if (Utils.Size(extraTrainColumnIds) > 0)
    {
        for (int k = 0; k < extraTrainColumnIds.Length; k++)
        {
            int colId = extraTrainColumnIds[k];
            Host.Assert(0 <= colId && colId < isActive.Length);
            isActive[colId] = true;
        }
    }

    int columnCount = Infos.Length;
    for (int c = 0; c < columnCount; c++)
    {
        isActive[Infos[c].Source] = true;
    }

    var builders = new IColumnFunctionBuilder[columnCount];
    var stillTraining = new bool[columnCount];

    // Stream the input rows through the column function builders.
    using (var progress = Host.StartProgressChannel("Normalize"))
    {
        // Captured by the progress callback below, so it must stay a local that is updated in place.
        long rowsSeen = 0;
        progress.SetHeader(new ProgressHeader("examples"), entry => entry.SetProgress(0, rowsSeen));

        using (var cursor = Source.GetRowCursor(col => isActive[col]))
        {
            // Every builder is bound to the same cursor and starts out wanting data.
            for (int c = 0; c < columnCount; c++)
            {
                var info = Infos[c];
                stillTraining[c] = true;
                builders[c] = fnCreate(c, info.Source, info.TypeSrc, cursor);
            }

            // Stop advancing the cursor as soon as a row leaves no builder asking for more data.
            bool anyPending = true;
            while (anyPending && cursor.MoveNext())
            {
                // A row containing bad values still contributes its good values to training.
                // The comparisons in the builders are arranged so that NaNs in the input are not recorded.
                // REVIEW: Should infinities and/or NaNs be filtered before the normalization? Should we not record infinities for min/max?
                // Currently, infinities are recorded and will result in zero scale which in turn will result in NaN output for infinity input.
                anyPending = false;
                for (int c = 0; c < columnCount; c++)
                {
                    // A builder whose ProcessValue returned false is not called again.
                    if (stillTraining[c])
                    {
                        var info = Infos[c];
                        Host.Assert(!info.TypeSrc.IsVector || info.TypeSrc.IsVector && info.TypeSrc.IsKnownSizeVector);
                        Host.Assert(builders[c] != null);

                        bool wantsMore = builders[c].ProcessValue();
                        stillTraining[c] = wantsMore;
                        anyPending |= wantsMore;
                    }
                }

                rowsSeen++;
            }
        }

        progress.Checkpoint(rowsSeen);

        // Turn each builder's accumulated state into the column function stored on the transform.
        _functions = new IColumnFunction[columnCount];
        for (int c = 0; c < columnCount; c++)
        {
            _functions[c] = builders[c].CreateColumnFunction();
        }
    }

    SetMetadata();
}