/// <summary>
/// Public-API helper that sets up a <see cref="MissingValueHandlingTransformer"/> over a single column.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>; this is the output from a previous transform or loader.</param>
/// <param name="outputColumnName">Name of the output column.</param>
/// <param name="inputColumnName">Name of the column to be transformed. When null, '<paramref name="outputColumnName"/>' is used as the source too.</param>
/// <param name="replaceWith">The replacement method to utilize.</param>
/// <returns>Whatever the <c>Create(env, options, input)</c> overload returns for the assembled options.</returns>
private static IDataView Create(IHostEnvironment env, IDataView input, string outputColumnName, string inputColumnName = null, ReplacementKind replaceWith = ReplacementKind.DefaultValue)
{
    // Without an explicit source column the transform reads from the output column's name.
    string sourceName = inputColumnName ?? outputColumnName;

    var column = new Column { Name = outputColumnName, Source = sourceName };
    var options = new Options { Columns = new[] { column }, ReplaceWith = replaceWith };

    return Create(env, options, input);
}
/// <summary>
/// Convenience constructor for the public facing API: wraps one column and one replacement kind
/// into an <c>Arguments</c> instance and forwards to the arguments-based constructor.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>; this is the output from a previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. When null, '<paramref name="name"/>' is used as the source too.</param>
/// <param name="replacementKind">The replacement method to utilize.</param>
public NAReplaceTransform(IHostEnvironment env, IDataView input, string name, string source = null, ReplacementKind replacementKind = ReplacementKind.DefaultValue)
    : this(
        env,
        new Arguments
        {
            ReplacementKind = replacementKind,
            Column = new[]
            {
                // Source falls back to the output column name when not supplied.
                new Column { Name = name, Source = source ?? name }
            }
        },
        input)
{
}
/// <summary>
/// Fill the repValues array with the correct replacement values based on the user-given replacement kinds.
/// A per-column kind, when set, takes precedence over <c>args.ReplacementKind</c>. For imputed kinds
/// (Mean/Min/Max) the source data is cursored once to compute the statistics. By-slot imputation is
/// taken from the column's <c>Slot</c> setting (falling back to <c>args.ImputeBySlot</c>), except that
/// vector columns of unknown size are forced to across-slot imputation, with a warning.
/// </summary>
/// <param name="args">Transform arguments; <c>args.Column</c> is indexed in parallel with <c>Infos</c>.</param>
/// <param name="repValues">Receives one replacement value per column in <c>Infos</c>.</param>
/// <param name="slotIsDefault">
/// Receives one entry per column in <c>Infos</c>. An entry is only populated for imputed columns whose
/// replacement value is an array; all other entries are left null.
/// </param>
private void GetReplacementValues(Arguments args, out object[] repValues, out BitArray[] slotIsDefault)
{
    repValues = new object[Infos.Length];
    slotIsDefault = new BitArray[Infos.Length];

    ReplacementKind?[] imputationModes = new ReplacementKind?[Infos.Length];

    List<int> columnsToImpute = null;
    // REVIEW: Would like to get rid of the sourceColumns list but seems to be the best way to provide
    // the cursor with what columns to cursor through.
    HashSet<int> sourceColumns = null;
    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
    {
        // A kind set on the column overrides the transform-wide kind.
        ReplacementKind kind = args.Column[iinfo].Kind ?? args.ReplacementKind;
        switch (kind)
        {
            case ReplacementKind.SpecifiedValue:
                repValues[iinfo] = GetSpecifiedValue(args.Column[iinfo].ReplacementString, _types[iinfo], _isNAs[iinfo]);
                break;
            case ReplacementKind.DefaultValue:
                repValues[iinfo] = GetDefault(_types[iinfo]);
                break;
            case ReplacementKind.Mean:
            case ReplacementKind.Min:
            case ReplacementKind.Max:
                if (!_types[iinfo].ItemType.IsNumber && !_types[iinfo].ItemType.IsTimeSpan && !_types[iinfo].ItemType.IsDateTime)
                {
                    throw Host.Except("Cannot perform mean imputations on non-numeric '{0}'", _types[iinfo].ItemType);
                }
                // Defer the actual value: it is computed from the data below.
                imputationModes[iinfo] = kind;
                Utils.Add(ref columnsToImpute, iinfo);
                Utils.Add(ref sourceColumns, Infos[iinfo].Source);
                break;
            default:
                Host.Assert(false);
                throw Host.Except("Internal error, undefined ReplacementKind '{0}' assigned in NAReplaceTransform.", kind);
        }
    }

    // Exit if there are no columns needing a replacement value imputed.
    if (Utils.Size(columnsToImpute) == 0)
    {
        return;
    }

    // Impute values.
    using (var ch = Host.Start("Computing Statistics"))
    using (var cursor = Source.GetRowCursor(sourceColumns.Contains))
    {
        StatAggregator[] statAggregators = new StatAggregator[columnsToImpute.Count];
        for (int ii = 0; ii < columnsToImpute.Count; ii++)
        {
            int iinfo = columnsToImpute[ii];
            // args.Column is parallel to Infos, so it must be indexed by the column index (iinfo),
            // not by the position within columnsToImpute (ii). Using ii would read another column's
            // Slot setting whenever some columns are not imputed.
            bool bySlot = args.Column[iinfo].Slot ?? args.ImputeBySlot;
            if (_types[iinfo].IsVector && !_types[iinfo].IsKnownSizeVector && bySlot)
            {
                ch.Warning("By-slot imputation can not be done on variable-length column");
                bySlot = false;
            }
            statAggregators[ii] = CreateStatAggregator(ch, _types[iinfo], imputationModes[iinfo], bySlot, cursor, Infos[iinfo].Source);
        }

        // Single pass over the data, feeding every aggregator on each row.
        while (cursor.MoveNext())
        {
            for (int ii = 0; ii < statAggregators.Length; ii++)
            {
                statAggregators[ii].ProcessRow();
            }
        }

        for (int ii = 0; ii < statAggregators.Length; ii++)
        {
            repValues[columnsToImpute[ii]] = statAggregators[ii].GetStat();
        }

        ch.Done();
    }

    // Construct the slotIsDefault bit arrays.
    for (int ii = 0; ii < columnsToImpute.Count; ii++)
    {
        int slot = columnsToImpute[ii];
        if (repValues[slot] is Array)
        {
            // ComputeDefaultSlots<T> must be closed over the column's raw item type, which is only
            // known at runtime; <int> is just a placeholder to obtain the generic method definition.
            Func<ColumnType, int[], BitArray> func = ComputeDefaultSlots<int>;
            var meth = func.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(_types[slot].ItemType.RawType);
            slotIsDefault[slot] = (BitArray)meth.Invoke(this, new object[] { _types[slot], repValues[slot] });
        }
    }
}
/// <summary>
/// Public-API helper that sets up a <see cref="MissingValueHandlingTransformer"/> over a single column.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="input">Input <see cref="IDataView"/>; this is the output from a previous transform or loader.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. When null, '<paramref name="name"/>' is used as the source too.</param>
/// <param name="replaceWith">The replacement method to utilize.</param>
/// <returns>Whatever the <c>Create(env, arguments, input)</c> overload returns for the assembled arguments.</returns>
public static IDataTransform Create(IHostEnvironment env, IDataView input, string name, string source = null, ReplacementKind replaceWith = ReplacementKind.DefaultValue)
{
    // Without an explicit source column the transform reads from the output column's name.
    string sourceName = source ?? name;

    var column = new Column { Source = sourceName, Name = name };
    var arguments = new Arguments { Column = new[] { column }, ReplaceWith = replaceWith };

    return Create(env, arguments, input);
}