        internal Row GetStatefulRows(Row input, IRowToRowMapper mapper, Func<int, bool> active,
            List<StatefulRow> rows, out Action disposer)
        {
            Contracts.CheckValue(input, nameof(input));
            Contracts.CheckValue(active, nameof(active));

            disposer = null;
            IRowToRowMapper[] innerMappers = new IRowToRowMapper[0];
            if (mapper is CompositeRowToRowMapper compositeMapper)
            {
                innerMappers = compositeMapper.InnerMappers;
            }

            if (innerMappers.Length == 0)
            {
                bool differentActive = false;
                for (int c = 0; c < input.Schema.ColumnCount; ++c)
                {
                    bool wantsActive = active(c);
                    bool isActive    = input.IsColumnActive(c);
                    differentActive |= wantsActive != isActive;

                    if (wantsActive && !isActive)
                    {
                        throw Contracts.ExceptParam(nameof(input), $"Mapper required column '{input.Schema.GetColumnName(c)}' active but it was not.");
                    }
                }

                var row = mapper.GetRow(input, active, out disposer);
                if (row is StatefulRow statefulRow)
                {
                    rows.Add(statefulRow);
                }

                return row;
            }

            // For each of the inner mappers, we will be calling their GetRow method, but to do so we need to know
            // what we need from them. The last one will just have the input, but the rest will need to be
            // computed based on the dependencies of the next one in the chain.
            var deps = new Func<int, bool>[innerMappers.Length];

            deps[deps.Length - 1] = active;
            for (int i = deps.Length - 1; i >= 1; --i)
            {
                deps[i - 1] = innerMappers[i].GetDependencies(deps[i]);
            }

            Row result = input;

            for (int i = 0; i < innerMappers.Length; ++i)
            {
                Action localDisp;
                result = GetStatefulRows(result, innerMappers[i], deps[i], rows, out localDisp);
                if (result is StatefulRow statefulResult)
                {
                    rows.Add(statefulResult);
                }

                if (localDisp != null)
                {
                    if (disposer == null)
                    {
                        disposer = localDisp;
                    }
                    else
                    {
                        disposer = localDisp + disposer;
                    }
                    // We want the last disposer to be called first, so the order of the addition here is important.
                }
            }

            return result;
        }
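The "disposer = localDisp + disposer" line above leans on multicast delegate semantics: combined Action targets run in the order they were combined, so prepending the newest disposer makes it run first. A minimal stand-alone sketch (plain C#, no ML.NET dependency) demonstrating that ordering:

    using System;

    class DisposerOrderDemo
    {
        static void Main()
        {
            Action disposer = null;
            for (int i = 0; i < 3; i++)
            {
                int stage = i; // capture a fresh variable per iteration
                Action localDisp = () => Console.WriteLine($"Disposing stage {stage}");
                // Prepend, as GetStatefulRows does, so the last-created disposer runs first.
                disposer = disposer == null ? localDisp : localDisp + disposer;
            }
            disposer(); // prints stages 2, 1, 0
        }
    }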
Example #2
 public static ValueGetter<ulong> GetOptGroupGetter(this Row row, RoleMappedData data)
 {
     Contracts.CheckValue(data, nameof(data));
     return GetOptGroupGetter(row, data.Schema);
 }
Example #3
        // REVIEW: It would be nice to support propagation of select metadata.
        public static IDataView Create<TSrc, TDst>(IHostEnvironment env, string name, IDataView input,
            string src, string dst, DataViewType typeSrc, DataViewType typeDst, ValueMapper<TSrc, TDst> mapper,
            ValueGetter<VBuffer<ReadOnlyMemory<char>>> keyValueGetter = null, ValueGetter<VBuffer<ReadOnlyMemory<char>>> slotNamesGetter = null)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckNonEmpty(name, nameof(name));
            env.CheckValue(input, nameof(input));
            env.CheckNonEmpty(src, nameof(src));
            env.CheckNonEmpty(dst, nameof(dst));
            env.CheckValue(typeSrc, nameof(typeSrc));
            env.CheckValue(typeDst, nameof(typeDst));
            env.CheckValue(mapper, nameof(mapper));
            env.Check(keyValueGetter == null || typeDst.GetItemType() is KeyType);
            env.Check(slotNamesGetter == null || typeDst.IsKnownSizeVector());

            if (typeSrc.RawType != typeof(TSrc))
            {
                throw env.ExceptParam(nameof(mapper),
                                      "The source column type '{0}' doesn't match the input type of the mapper", typeSrc);
            }
            if (typeDst.RawType != typeof(TDst))
            {
                throw env.ExceptParam(nameof(mapper),
                                      "The destination column type '{0}' doesn't match the output type of the mapper", typeDst);
            }

            if (!input.Schema.TryGetColumnIndex(src, out int colSrc))
            {
                throw env.ExceptParam(nameof(src), "The input data doesn't have a column named '{0}'", src);
            }
            var typeOrig = input.Schema[colSrc].Type;

            // REVIEW: Ideally this should support vector-type conversion. It currently doesn't.
            bool     ident;
            Delegate conv;

            if (typeOrig.SameSizeAndItemType(typeSrc))
            {
                ident = true;
                conv  = null;
            }
            else if (!Conversions.Instance.TryGetStandardConversion(typeOrig, typeSrc, out conv, out ident))
            {
                throw env.ExceptParam(nameof(mapper),
                                      "The type of column '{0}', '{1}', cannot be converted to the input type of the mapper '{2}'",
                                      src, typeOrig, typeSrc);
            }

            var       col = new Column(src, dst);
            IDataView impl;

            if (ident)
            {
                impl = new Impl <TSrc, TDst, TDst>(env, name, input, col, typeDst, mapper, keyValueGetter: keyValueGetter, slotNamesGetter: slotNamesGetter);
            }
            else
            {
                Func<IHostEnvironment, string, IDataView, Column, DataViewType, ValueMapper<int, int>,
                    ValueMapper<int, int>, ValueGetter<VBuffer<ReadOnlyMemory<char>>>, ValueGetter<VBuffer<ReadOnlyMemory<char>>>,
                    Impl<int, int, int>> del = CreateImpl<int, int, int>;
                var meth = del.GetMethodInfo().GetGenericMethodDefinition()
                           .MakeGenericMethod(typeOrig.RawType, typeof(TSrc), typeof(TDst));
                impl = (IDataView)meth.Invoke(null, new object[] { env, name, input, col, typeDst, conv, mapper, keyValueGetter, slotNamesGetter });
            }

            return new OpaqueDataView(impl);
        }
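A hypothetical call sketch for the method above, mapping a float column through a lambda; 'env', 'data', the column names, and the NumberDataViewType.Single singleton are all assumptions for illustration (older builds may declare ValueMapper with 'ref' rather than 'in'):

    ValueMapper<float, float> square = (in float src, ref float dst) => dst = src * src;
    IDataView mapped = Create<float, float>(env, "SquareMapper", data,
        src: "Value", dst: "ValueSquared",
        typeSrc: NumberDataViewType.Single, typeDst: NumberDataViewType.Single,
        mapper: square);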
Example #4
 internal static ITrainer CreateTrainer<TArgs>(this IHostEnvironment env, TArgs arguments, out string loadName)
     where TArgs : class, new()
 {
     Contracts.CheckValue(env, nameof(env));
     return CreateCore<ITrainer, TArgs, SignatureTrainer>(env, arguments, out loadName);
 }
Example #5
 /// <summary>
 /// Get the getter for the feature column, assuming it is a vector of float.
 /// </summary>
 public static ValueGetter<VBuffer<float>> GetFeatureFloatVectorGetter(this Row row, RoleMappedData data)
 {
     Contracts.CheckValue(data, nameof(data));
     return GetFeatureFloatVectorGetter(row, data.Schema);
 }
Example #6
 /// <summary>
 /// Creates a data saver from the 'LoadName{settings}' string.
 /// </summary>
 public static IDataSaver CreateSaver(this IHostEnvironment env, string settings)
 {
     Contracts.CheckValue(env, nameof(env));
     return CreateCore<IDataSaver>(env, typeof(SignatureDataSaver), settings);
 }
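Per the summary, the settings string bundles a component load name and its arguments. A hedged, hypothetical call, assuming a text saver registered under the load name "Text" that accepts a 'sep' option:

    IDataSaver saver = env.CreateSaver("Text{sep=tab}");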
Example #7
 internal static IEvaluator CreateEvaluator(this IHostEnvironment env, string settings)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckNonWhiteSpace(settings, nameof(settings));
     return CreateCore<IEvaluator>(env, typeof(SignatureEvaluator), settings);
 }
Example #8
 /// <summary>
 /// Run a TensorFlow model provided through <paramref name="tensorFlowModel"/> on the input column and extract one output column.
 /// The inputs and outputs are matched to TensorFlow graph nodes by name.
 /// </summary>
 public static Vector<float> ApplyTensorFlowGraph(this Vector<float> input, TensorFlowModelInfo tensorFlowModel)
 {
     Contracts.CheckValue(input, nameof(input));
     Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel));
     return new OutColumn(input, tensorFlowModel);
 }
Example #9
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // Compose the WordBagTransform from a tokenize transform,
            // followed by a NgramExtractionTransform.
            // Since WordBagTransform is a many-to-one column transform, for each
            // WordBagTransform.Column with multiple sources, we first apply a ConcatTransform.

            // REVIEW: In order to not get ngrams that cross between vector slots, we need to
            // enable tokenize transforms to insert a special token between slots.

            // REVIEW: In order to make it possible to output separate bags for different columns
            // using the same dictionary, we need to find a way to make ConcatTransform remember the boundaries.

            var tokenizeColumns = new WordTokenizingEstimator.ColumnOptions[options.Columns.Length];

            var extractorArgs =
                new NgramExtractorTransform.Options()
            {
                MaxNumTerms = options.MaxNumTerms,
                NgramLength = options.NgramLength,
                SkipLength  = options.SkipLength,
                AllLengths  = options.AllLengths,
                Weighting   = options.Weighting,
                Columns     = new NgramExtractorTransform.Column[options.Columns.Length]
            };

            for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
            {
                var column = options.Columns[iinfo];
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
                h.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                tokenizeColumns[iinfo] = new WordTokenizingEstimator.ColumnOptions(column.Name, column.Source.Length > 1 ? column.Name : column.Source[0]);

                extractorArgs.Columns[iinfo] =
                    new NgramExtractorTransform.Column()
                {
                    Name        = column.Name,
                    Source      = column.Name,
                    MaxNumTerms = column.MaxNumTerms,
                    NgramLength = column.NgramLength,
                    SkipLength  = column.SkipLength,
                    Weighting   = column.Weighting,
                    AllLengths  = column.AllLengths
                };
            }

            IDataView view = input;

            view = NgramExtractionUtils.ApplyConcatOnSources(h, options.Columns, view);
            view = new WordTokenizingEstimator(env, tokenizeColumns).Fit(view).Transform(view);
            return NgramExtractorTransform.Create(h, extractorArgs, view);
        }
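To make the many-to-one handling above concrete, a hedged walk-through for a hypothetical column with Name = "Text" and Source = { "Title", "Body" }:

    // 1. ApplyConcatOnSources concatenates Title and Body into a single "Text" column.
    // 2. Since Source.Length > 1, tokenizeColumns[iinfo] maps "Text" onto itself,
    //    so the tokenizer reads the concatenated column.
    // 3. extractorArgs.Columns[iinfo] sets Source = column.Name, so the ngram stage
    //    consumes the tokenized "Text" column and emits the final bag-of-ngrams vector.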
Example #10
 /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
 /// <param name="input">Vector of tokenized text.</param>
 /// <param name="customModelFile">The custom word embedding model file.</param>
 public static Vector<float> WordEmbeddings(this VarVector<string> input, string customModelFile)
 {
     Contracts.CheckValue(input, nameof(input));
     return new OutColumn(input, customModelFile);
 }
Example #11
 // REVIEW: this method only covers one use case of using TensorFlow models: consuming one
 // input and producing one output of floats.
 // We could consider selectively adding some more extensions to enable common scenarios.
 /// <summary>
 /// Load the TensorFlow model from <paramref name="modelFile"/> and run it on the input column and extract one output column.
 /// The inputs and outputs are matched to TensorFlow graph nodes by name.
 /// </summary>
 public static Vector<float> ApplyTensorFlowGraph(this Vector<float> input, string modelFile)
 {
     Contracts.CheckValue(input, nameof(input));
     Contracts.CheckNonEmpty(modelFile, nameof(modelFile));
     return new OutColumn(input, modelFile);
 }
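A hedged sketch of composing either overload in the statically-typed pipeline API; the loader shape, the MakeNewEstimator entry point, the column names, and the "model.pb" path are all assumptions for illustration:

    var estimator = loader.MakeNewEstimator()
        .Append(row => (
            row.Label,
            scores: row.ImagePixels.ApplyTensorFlowGraph("model.pb")));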
Example #12
 /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
 /// <param name="input">Vector of tokenized text.</param>
 /// <param name="modelKind">The pretrained word embedding model.</param>
 /// <returns>The vector of word embedding values for the input tokens.</returns>
 public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsTransform.PretrainedModelKind modelKind = WordEmbeddingsTransform.PretrainedModelKind.Sswe)
 {
     Contracts.CheckValue(input, nameof(input));
     return new OutColumn(input, modelKind);
 }
Example #13
 private static Type GetRawType(PrimitiveType itemType)
 {
     Contracts.CheckValue(itemType, nameof(itemType));
     return typeof(VBuffer<>).MakeGenericType(itemType.RawType);
 }
Example #14
 /// <summary>
 /// Returns true iff the given type is valid for a <see cref="KeyType"/>. The valid ones are
 /// <see cref="byte"/>, <see cref="ushort"/>, <see cref="uint"/>, and <see cref="ulong"/>, that is, the unsigned integer types.
 /// </summary>
 public static bool IsValidDataType(Type type)
 {
     Contracts.CheckValue(type, nameof(type));
     return type == typeof(byte) || type == typeof(ushort) || type == typeof(uint) || type == typeof(ulong);
 }
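A quick sanity check of the predicate in a console context (assuming, per the summary, that it is exposed on KeyType):

    Console.WriteLine(KeyType.IsValidDataType(typeof(uint)));   // True  - unsigned integer
    Console.WriteLine(KeyType.IsValidDataType(typeof(int)));    // False - signed
    Console.WriteLine(KeyType.IsValidDataType(typeof(string))); // False - not an integer type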
Example #15
        /// <summary>
        /// Returns the feature selection scores for each slot of each column.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="input">The input dataview.</param>
        /// <param name="columns">The columns for which to compute the feature selection scores.</param>
        /// <param name="colSizes">Outputs an array containing the vector sizes of the input columns</param>
        /// <returns>A list of scores.</returns>
        public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(input, nameof(input));
            env.CheckParam(Utils.Size(columns) > 0, nameof(columns));

            var schema      = input.Schema;
            var size        = columns.Length;
            var activeInput = new bool[schema.ColumnCount];
            var colSrcs     = new int[size];
            var colTypes    = new ColumnType[size];

            colSizes = new int[size];
            for (int i = 0; i < size; i++)
            {
                int colSrc;
                var colName = columns[i];
                if (!schema.TryGetColumnIndex(colName, out colSrc))
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectingTransformer.Arguments.Column), "Source column '{0}' not found", colName);
                }

                var colType = schema.GetColumnType(colSrc);
                if (colType.IsVector && !colType.IsKnownSizeVector)
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectingTransformer.Arguments.Column), "Variable length column '{0}' is not allowed", colName);
                }

                activeInput[colSrc] = true;
                colSrcs[i]          = colSrc;
                colTypes[i]         = colType;
                colSizes[i]         = colType.ValueCount;
            }

            var    aggregators = new CountAggregator[size];
            long   rowCur      = 0;
            double rowCount    = input.GetRowCount() ?? double.NaN;

            using (var pch = env.StartProgressChannel("Aggregating counts"))
            using (var cursor = input.GetRowCursor(col => activeInput[col]))
            {
                var header = new ProgressHeader(new[] { "rows" });
                pch.SetHeader(header, e => { e.SetProgress(0, rowCur, rowCount); });
                for (int i = 0; i < size; i++)
                {
                    if (colTypes[i].IsVector)
                    {
                        aggregators[i] = GetVecAggregator(cursor, colTypes[i], colSrcs[i]);
                    }
                    else
                    {
                        aggregators[i] = GetOneAggregator(cursor, colTypes[i], colSrcs[i]);
                    }
                }

                while (cursor.MoveNext())
                {
                    for (int i = 0; i < size; i++)
                    {
                        aggregators[i].ProcessValue();
                    }
                    rowCur++;
                }
                pch.Checkpoint(rowCur);
            }
            return aggregators.Select(a => a.Count).ToArray();
        }
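A hedged usage sketch for Train; 'env' and 'data' are assumed, and the column names are made up:

    int[] colSizes;
    long[][] scores = Train(env, data, new[] { "Features", "Extras" }, out colSizes);
    // scores[i][j] is the count-based score for slot j of the i-th requested column;
    // colSizes[i] reports that column's vector size.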
Example #16
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input,
                                              TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            IDataView view = input;
            var termCols = new List<Column>();
            var isTermCol = new bool[options.Columns.Length];

            for (int i = 0; i < options.Columns.Length; i++)
            {
                var col = options.Columns[i];

                h.CheckNonWhiteSpace(col.Name, nameof(col.Name));
                h.CheckNonWhiteSpace(col.Source, nameof(col.Source));
                int colId;
                if (input.Schema.TryGetColumnIndex(col.Source, out colId) &&
                    input.Schema[colId].Type.GetItemType() is TextDataViewType)
                {
                    termCols.Add(col);
                    isTermCol[i] = true;
                }
            }

            // If the column types of args.column are text, apply term transform to convert them to keys.
            // Otherwise, skip term transform and apply ngram transform directly.
            // This logic allows NgramExtractorTransform to handle both text and key input columns.
            // Note: ngram transform handles the validation of the types natively (in case the types
            // of args.column are not text nor keys).
            if (termCols.Count > 0)
            {
                ValueToKeyMappingTransformer.Options termArgs = null;
                string[] missingDropColumns = null;
                if (termLoaderArgs != null)
                {
                    termArgs =
                        new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = int.MaxValue,
                        Term        = termLoaderArgs.Term,
                        Terms       = termLoaderArgs.Terms,
                        DataFile    = termLoaderArgs.DataFile,
                        Loader      = termLoaderArgs.Loader,
                        TermsColumn = termLoaderArgs.TermsColumn,
                        Sort        = termLoaderArgs.Sort,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                    if (termLoaderArgs.DropUnknowns)
                    {
                        missingDropColumns = new string[termCols.Count];
                    }
                }
                else
                {
                    termArgs =
                        new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                }

                for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
                {
                    var column = termCols[iinfo];
                    termArgs.Columns[iinfo] =
                        new ValueToKeyMappingTransformer.Column()
                    {
                        Name        = column.Name,
                        Source      = column.Source,
                        MaxNumTerms = Utils.Size(column.MaxNumTerms) > 0 ? column.MaxNumTerms[0] : default(int?)
                    };

                    if (missingDropColumns != null)
                    {
                        missingDropColumns[iinfo] = column.Name;
                    }
                }

                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);
                if (missingDropColumns != null)
                {
                    view = new MissingValueDroppingTransformer(h, missingDropColumns.Select(x => (x, x)).ToArray()).Transform(view);
                }
            }

            var ngramColumns = new NgramExtractingEstimator.ColumnOptions[options.Columns.Length];

            for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
            {
                var column = options.Columns[iinfo];
                ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name,
                                                                                 column.NgramLength ?? options.NgramLength,
                                                                                 column.SkipLength ?? options.SkipLength,
                                                                                 column.AllLengths ?? options.AllLengths,
                                                                                 column.Weighting ?? options.Weighting,
                                                                                 column.MaxNumTerms ?? options.MaxNumTerms,
                                                                                 isTermCol[iinfo] ? column.Name : column.Source
                                                                                 );
            }

            return new NgramExtractingEstimator(env, ngramColumns).Fit(view).Transform(view) as IDataTransform;
        }
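A hedged illustration of the text-versus-key routing above, for two hypothetical input columns:

    // "Review"     : item type is Text -> added to termCols, so ValueToKeyMappingTransformer
    //                first maps its values to keys, and the ngram stage reads column.Name.
    // "ReviewKeys" : item type is Key  -> skips the term stage; the ngram stage reads the
    //                original Source, and NgramExtractingEstimator validates the type natively.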
Example #17
 /// <summary>
 /// Creates a data saver from the arguments object.
 /// </summary>
 public static IDataSaver CreateSaver<TArgs>(this IHostEnvironment env, TArgs arguments)
     where TArgs : class, new()
 {
     Contracts.CheckValue(env, nameof(env));
     return CreateCore<IDataSaver, TArgs, SignatureDataSaver>(env, arguments);
 }
Example #18
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("Tree Featurizer Transform");

            host.CheckValue(args, nameof(args));
            host.CheckValue(input, nameof(input));
            host.CheckUserArg(!string.IsNullOrWhiteSpace(args.TrainedModelFile) || args.Trainer.IsGood(), nameof(args.TrainedModelFile),
                              "Please specify either a trainer or an input model file.");
            host.CheckUserArg(!string.IsNullOrEmpty(args.FeatureColumn), nameof(args.FeatureColumn), "Transform needs an input features column");

            IDataTransform xf;

            using (var ch = host.Start("Create Tree Ensemble Scorer"))
            {
                var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments()
                {
                    Suffix = args.Suffix
                };
                if (!string.IsNullOrWhiteSpace(args.TrainedModelFile))
                {
                    if (args.Trainer.IsGood())
                    {
                        ch.Warning("Both an input model and a trainer were specified. Using the model file.");
                    }

                    ch.Trace("Loading model");
                    IPredictor predictor;
                    using (Stream strm = new FileStream(args.TrainedModelFile, FileMode.Open, FileAccess.Read))
                    using (var rep = RepositoryReader.Open(strm, ch))
                    {
                        ModelLoadContext.LoadModel<IPredictor, SignatureLoadModel>(host, out predictor, rep, ModelFileUtils.DirPredictor);
                    }

                    ch.Trace("Creating scorer");
                    var data = TrainAndScoreTransform.CreateDataFromArgs(ch, input, args);

                    // Make sure that the given predictor has the correct number of input features.
                    if (predictor is CalibratedPredictorBase calibrated)
                    {
                        predictor = calibrated.SubPredictor;
                    }
                    // Predictor should be a FastTreePredictionWrapper, which implements IValueMapper, so this should
                    // be non-null.
                    var vm = predictor as IValueMapper;
                    ch.CheckUserArg(vm != null, nameof(args.TrainedModelFile), "Predictor in model file does not have compatible type");
                    if (vm.InputType.VectorSize != data.Schema.Feature.Type.VectorSize)
                    {
                        throw ch.ExceptUserArg(nameof(args.TrainedModelFile),
                                               "Predictor in model file expects {0} features, but data has {1} features",
                                               vm.InputType.VectorSize, data.Schema.Feature.Type.VectorSize);
                    }

                    var bindable = new TreeEnsembleFeaturizerBindableMapper(env, scorerArgs, predictor);
                    var bound    = bindable.Bind(env, data.Schema);
                    xf = new GenericScorer(env, scorerArgs, input, bound, data.Schema);
                }
                else
                {
                    ch.Assert(args.Trainer.IsGood());

                    ch.Trace("Creating TrainAndScoreTransform");
                    string scorerSettings = CmdParser.GetSettings(ch, scorerArgs,
                                                                  new TreeEnsembleFeaturizerBindableMapper.Arguments());
                    var scorer =
                        new SubComponent<IDataScorerTransform, SignatureDataScorer>(
                            TreeEnsembleFeaturizerBindableMapper.LoadNameShort, scorerSettings);

                    var trainScoreArgs = new TrainAndScoreTransform.Arguments();
                    args.CopyTo(trainScoreArgs);
                    trainScoreArgs.Trainer = new SubComponent<ITrainer, SignatureTrainer>(args.Trainer.Kind,
                        args.Trainer.Settings);

                    var labelInput = AppendLabelTransform(host, ch, input, trainScoreArgs.LabelColumn, args.LabelPermutationSeed);
                    trainScoreArgs.Scorer = scorer;
                    var scoreXf = TrainAndScoreTransform.Create(host, trainScoreArgs, labelInput);
                    if (input == labelInput)
                    {
                        return scoreXf;
                    }
                    return (IDataTransform)ApplyTransformUtils.ApplyAllTransformsToData(host, scoreXf, input, labelInput);
                }

                ch.Done();
            }
            return xf;
        }
Example #19
 /// <summary>
 /// Create a new data view which is obtained by appending all columns of all the source data views.
 /// If the data views are of different length, the resulting data view will have the length equal to the
 /// length of the shortest source.
 /// </summary>
 /// <param name="env">The host environment to use.</param>
 /// <param name="sources">A non-empty collection of data views to zip together.</param>
 /// <returns>The resulting data view.</returns>
 public static IDataView Zip(this IHostEnvironment env, IEnumerable<IDataView> sources)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(sources, nameof(sources));
     return ZipDataView.Create(env, sources);
 }
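A hedged usage sketch; 'featuresView' and 'labelsView' are assumed IDataView instances. Per the summary, the zipped view is only as long as the shorter source:

    IDataView zipped = env.Zip(new[] { featuresView, labelsView });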
Example #20
        public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(ValueToKeyMappingEstimator));
            _hash = new HashingEstimator(_host, columns.Select(x => x.HashInfo).ToArray());
            using (var ch = _host.Start(nameof(OneHotHashEncodingEstimator)))
            {
                var binaryCols = new List<(string input, string output)>();
                var cols = new List<(string input, string output, bool bag)>();
                for (int i = 0; i < columns.Length; i++)
                {
                    var column = columns[i];
                    OneHotEncodingTransformer.OutputKind kind = columns[i].OutputKind;
                    switch (kind)
                    {
                    default:
                        throw _host.ExceptUserArg(nameof(column.OutputKind));

                    case OneHotEncodingTransformer.OutputKind.Key:
                        continue;

                    case OneHotEncodingTransformer.OutputKind.Bin:
                        if (column.HashInfo.InvertHash != 0)
                        {
                            ch.Warning("Invert hashing is being used with binary encoding.");
                        }
                        binaryCols.Add((column.HashInfo.Output, column.HashInfo.Output));
                        break;

                    case OneHotEncodingTransformer.OutputKind.Ind:
                        cols.Add((column.HashInfo.Output, column.HashInfo.Output, false));
                        break;

                    case OneHotEncodingTransformer.OutputKind.Bag:
                        cols.Add((column.HashInfo.Output, column.HashInfo.Output, true));
                        break;
                    }
                }
                IEstimator<ITransformer> toBinVector = null;
                IEstimator<ITransformer> toVector = null;
                if (binaryCols.Count > 0)
                {
                    toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorTransform.ColumnInfo(x.input, x.output)).ToArray());
                }
                if (cols.Count > 0)
                {
                    toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorTransform.ColumnInfo(x.input, x.output, x.bag)).ToArray());
                }

                if (toBinVector != null && toVector != null)
                {
                    _toSomething = toVector.Append(toBinVector);
                }
                else
                {
                    _toSomething = toBinVector ?? toVector;
                }
            }
        }
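A hedged usage sketch for the constructor above; the ColumnInfo argument shape (input name, output name, output kind), the usual Fit/Transform pattern, and 'env'/'data' are all assumptions:

    var estimator = new OneHotHashEncodingEstimator(env,
        new OneHotHashEncodingEstimator.ColumnInfo("Category", "CategoryOneHot",
            OneHotEncodingTransformer.OutputKind.Ind));
    IDataView encoded = estimator.Fit(data).Transform(data);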
Example #21
 /// <summary>
 /// Loads a predictor from the model stream. Returns null iff there's no predictor.
 /// </summary>
 /// <param name="env">The host environment to use.</param>
 /// <param name="modelStream">The model stream.</param>
 public static IPredictor LoadPredictorOrNull(this IHostEnvironment env, Stream modelStream)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(modelStream, nameof(modelStream));
     return ModelFileUtils.LoadPredictorOrNull(env, modelStream);
 }
Example #22
 /// <summary>
 /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array
 /// </summary>
 /// <param name="input">Incoming data.</param>
 /// <param name="outputKind">Specify the output type of indicator array: array or binary encoded data.</param>
 /// <param name="hashBits">Amount of bits to use for hashing.</param>
 /// <param name="seed">Seed value used for hashing.</param>
 /// <param name="ordered">Whether the position of each term should be included in the hash.</param>
 /// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
 public static Vector<float> OneHotHashEncoding(this Vector<string> input, OneHotHashVectorOutputKind outputKind = DefOut,
     int hashBits = DefHashBits, uint seed = DefSeed, bool ordered = DefOrdered, int invertHash = DefInvertHash)
 {
     Contracts.CheckValue(input, nameof(input));
     return new ImplVector<string>(input, new Config(outputKind, hashBits, seed, ordered, invertHash));
 }
Example #23
 internal static ITrainer CreateTrainer(this IHostEnvironment env, string settings, out string loadName)
 {
     Contracts.CheckValue(env, nameof(env));
     return CreateCore<ITrainer>(env, typeof(SignatureTrainer), settings, out loadName);
 }
Example #24
        public static ModelArgs GetModelArgs(ColumnType type, string colName,
            List<long> dims = null, List<bool> dimsParams = null)
        {
            Contracts.CheckValue(type, nameof(type));
            Contracts.CheckNonEmpty(colName, nameof(colName));

            TensorProto.Types.DataType dataType = TensorProto.Types.DataType.Undefined;
            DataKind rawKind;

            if (type is VectorType vectorType)
            {
                rawKind = vectorType.ItemType.RawKind;
            }
            else if (type is KeyType keyType)
            {
                rawKind = keyType.RawKind;
            }
            else
            {
                rawKind = type.RawKind;
            }

            switch (rawKind)
            {
            case DataKind.BL:
                dataType = TensorProto.Types.DataType.Float;
                break;

            case DataKind.TX:
                dataType = TensorProto.Types.DataType.String;
                break;

            case DataKind.I1:
                dataType = TensorProto.Types.DataType.Int8;
                break;

            case DataKind.U1:
                dataType = TensorProto.Types.DataType.Uint8;
                break;

            case DataKind.I2:
                dataType = TensorProto.Types.DataType.Int16;
                break;

            case DataKind.U2:
                dataType = TensorProto.Types.DataType.Uint16;
                break;

            case DataKind.I4:
                dataType = TensorProto.Types.DataType.Int32;
                break;

            case DataKind.U4:
                dataType = TensorProto.Types.DataType.Int64;
                break;

            case DataKind.I8:
                dataType = TensorProto.Types.DataType.Int64;
                break;

            case DataKind.U8:
                dataType = TensorProto.Types.DataType.Uint64;
                break;

            case DataKind.R4:
                dataType = TensorProto.Types.DataType.Float;
                break;

            case DataKind.R8:
                dataType = TensorProto.Types.DataType.Double;
                break;

            default:
                string msg = "Unsupported type: DataKind " + rawKind.ToString();
                Contracts.Check(false, msg);
                break;
            }

            string name = colName;
            List<long> dimsLocal = null;
            List<bool> dimsParamLocal = null;

            if (dims != null)
            {
                dimsLocal      = dims;
                dimsParamLocal = dimsParams;
            }
            else
            {
                dimsLocal = new List <long>();
                if (type.ValueCount == 0) // unknown size
                {
                    dimsLocal.Add(1);
                    // false for the batch dimension, true for the (variable) data dimension.
                    dimsParamLocal = new List<bool>() { false, true };
                }
                else if (type.ValueCount == 1)
                {
                    dimsLocal.Add(1);
                }
                else if (type.ValueCount > 1)
                {
                    var vec = (VectorType)type;
                    for (int i = 0; i < vec.Dimensions.Length; i++)
                    {
                        dimsLocal.Add(vec.Dimensions[i]);
                    }
                }
            }
            // Insert the batch size as the leading dimension.
            dimsLocal?.Insert(0, 1);

            return new ModelArgs(name, dataType, dimsLocal, dimsParamLocal);
        }
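A worked example against the code above, for a hypothetical known-size vector of 4 floats (the VectorType constructor shape shown is an assumption):

    // rawKind = R4, so dataType = Float; dimsLocal starts as [4];
    // the leading batch-size insert then turns the dims into [1, 4].
    var modelArgs = GetModelArgs(new VectorType(NumberType.R4, 4), "Features");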
Example #25
 public static ValueGetter<float> GetOptWeightFloatGetter(this Row row, RoleMappedData data)
 {
     Contracts.CheckValue(data, nameof(data));
     return GetOptWeightFloatGetter(row, data.Schema);
 }
Example #26
 public override void Save(ModelSaveContext ctx)
 {
     Contracts.CheckValue(ctx, nameof(ctx));
     ctx.SetVersionInfo(GetVersionInfo());
     base.Save(ctx);
 }
Example #27
        // Factory method for SignatureDataTransform.
        private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("LoadTransform");

            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(File.Exists(args.ModelFile), nameof(args.ModelFile), "File does not exist");

            IDataView currentView;

            // If there are no 'tag' parameters, we load everything, regardless of 'comp'.
            bool complement = args.Complement || Utils.Size(args.Tags) == 0;
            var allTags = new HashSet<string>();

            for (int i = 0; i < Utils.Size(args.Tags); i++)
            {
                var curList = args.Tags[i];
                if (string.IsNullOrWhiteSpace(curList))
                {
                    continue;
                }

                foreach (var tag in curList.Split(','))
                {
                    if (!string.IsNullOrWhiteSpace(tag))
                    {
                        allTags.Add(tag.ToLower());
                    }
                }
            }

            Func<string, bool> predicate =
                tag =>
                {
                    bool found = allTags.Contains(tag.ToLower());
                    return found == !complement;
                };

            using (var file = h.OpenInputFile(args.ModelFile))
            using (var strm = file.OpenReadStream())
            using (var rep = RepositoryReader.Open(strm, h))
            using (var pipeLoaderEntry = rep.OpenEntry(ModelFileUtils.DirDataLoaderModel, ModelLoadContext.ModelStreamName))
            using (var ctx = new ModelLoadContext(rep, pipeLoaderEntry, ModelFileUtils.DirDataLoaderModel))
            {
                currentView = LegacyCompositeDataLoader.LoadSelectedTransforms(ctx, input, h, predicate);

                if (currentView == input)
                {
                    // REVIEW: we are required to return an IDataTransform. Therefore, if we don't introduce a new transform
                    // on top of 'input', we must throw (since input may not be a data transform).
                    // We could of course introduce a 'no-op transform', or we could lift the requirement to always return
                    // an IDataTransform associated with SignatureDataTransform.

                    var criteria = string.Format(
                        complement
                            ? "transforms that don't have tags from the list: '{0}'"
                            : "transforms that have tags from the list: '{0}'",
                        string.Join(",", allTags));
                    throw h.ExceptUserArg(nameof(args.Tags), "No transforms were found that match the search criteria ({0})", criteria);
                }
            }

            h.Assert(currentView is IDataTransform);
            return (IDataTransform)currentView;
        }
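A stand-alone sketch of the tag predicate semantics above: with complement = false only tagged transforms load, and flipping complement inverts the selection.

    var allTags = new HashSet<string> { "cat", "norm" };
    bool complement = false;
    Func<string, bool> predicate = tag => allTags.Contains(tag.ToLower()) == !complement;
    Console.WriteLine(predicate("Cat"));   // True  - matches case-insensitively via ToLower
    Console.WriteLine(predicate("other")); // False - not in the tag list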
Example #28
 public static SchemaBindableQuantileRegressionPredictor Create(IHostEnvironment env, ModelLoadContext ctx)
 {
     Contracts.CheckValue(ctx, nameof(ctx));
     ctx.CheckAtModel(GetVersionInfo());
     return new SchemaBindableQuantileRegressionPredictor(env, ctx);
 }
Example #29
        /// <summary>
        /// Loads all transforms from the <paramref name="ctx"/> that pass the <paramref name="isTransformTagAccepted"/> test,
        /// applies them sequentially to the <paramref name="srcView"/>, and returns the resulting data view.
        /// If there are no transforms in <paramref name="ctx"/> that are accepted, returns the original <paramref name="srcView"/>.
        /// The difference from the <c>Create</c> method above is that:
        /// - it doesn't wrap the results into a loader, just returns the last transform in the chain.
        /// - it accepts <see cref="IDataView"/> as input, not necessarily a loader.
        /// - it throws away the tag information.
        /// - it doesn't throw if the context is not representing a <see cref="CompositeDataLoader"/>: in this case it's assumed that no transforms
        ///   meet the test, and the <paramref name="srcView"/> is returned.
        /// Essentially, this is a helper method for the LoadTransform class.
        /// </summary>
        public static IDataView LoadSelectedTransforms(ModelLoadContext ctx, IDataView srcView, IHostEnvironment env, Func<string, bool> isTransformTagAccepted)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(ctx, nameof(ctx));
            h.Check(ctx.Reader.BaseStream.Position == ctx.FpMin + ctx.Header.FpModel);
            var ver = GetVersionInfo();

            if (ctx.Header.ModelSignature != ver.ModelSignature)
            {
                using (var ch = h.Start("ModelCheck"))
                {
                    ch.Info("The data model doesn't contain transforms.");
                }
                return srcView;
            }
            ModelHeader.CheckVersionInfo(ref ctx.Header, ver);

            h.CheckValue(srcView, nameof(srcView));
            h.CheckValue(isTransformTagAccepted, nameof(isTransformTagAccepted));

            // *** Binary format ***
            // int: sizeof(Float)
            // int: number of transforms
            // foreach transform: (starting from version VersionAddedTags)
            //     string: tag
            //     string: args string

            int cbFloat = ctx.Reader.ReadInt32();
            h.CheckDecode(cbFloat == sizeof(Float));

            int cxf = ctx.Reader.ReadInt32();
            h.CheckDecode(cxf >= 0);

            bool hasTags = ctx.Header.ModelVerReadable >= VersionAddedTags;
            var curView = srcView;

            for (int i = 0; i < cxf; i++)
            {
                string tag = "";
                if (hasTags)
                {
                    tag = ctx.LoadNonEmptyString();
                    ctx.LoadStringOrNull(); // ignore the args string
                }
                if (!isTransformTagAccepted(tag))
                {
                    continue;
                }

                IDataTransform xf;
                ctx.LoadModel<IDataTransform, SignatureLoadDataTransform>(env, out xf,
                    string.Format(TransformDirTemplate, i), curView);
                curView = xf;
            }

            return curView;
        }
Example #30
 /// <summary>
 /// Presents a <see cref="Schema.Metadata"/> as a <see cref="Row"/>.
 /// </summary>
 /// <param name="metadata">The metadata to wrap.</param>
 /// <returns>A row that wraps an input metadata.</returns>
 public static Row MetadataAsRow(Schema.Metadata metadata)
 {
     Contracts.CheckValue(metadata, nameof(metadata));
     return new MetadataRow(metadata);
 }
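A hedged usage sketch; 'schema' is an assumed Schema instance with a "Features" column. Each metadata kind of the column becomes a column of the returned single-position row:

    Schema.Metadata meta = schema["Features"].Metadata;
    Row metadataRow = MetadataAsRow(meta);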