Example #1
            protected internal OptimizerState(IChannel ch, IProgressChannelProvider progress, ref VBuffer <Float> initial,
                                              int m, long totalMemLimit, bool keepDense, bool enforceNonNegativity)
            {
                Contracts.AssertValue(ch);
                Ch = ch;
                ch.AssertValueOrNull(progress);
                ProgressProvider = progress;
                Iter             = 1;

                _keepDense = keepDense;
                Dim        = initial.Length;

                _x = CreateWorkingVector();
                initial.CopyTo(ref _x);
                _m             = m;
                _totalMemLimit = totalMemLimit;

                _grad            = CreateWorkingVector();
                _dir             = CreateWorkingVector();
                _newX            = CreateWorkingVector();
                _newGrad         = CreateWorkingVector();
                _steepestDescDir = CreateWorkingVector();

                _sList  = new VBuffer <Float> [_m];
                _yList  = new VBuffer <Float> [_m];
                _roList = new List <Float>();

                EnforceNonNegativity = enforceNonNegativity;
            }
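For context (an aside, not part of the original snippet): the _sList and _yList arrays sized to _m hold the limited memory of L-BFGS, i.e. the last m position and gradient differences. A minimal sketch of how such histories are typically filled, using plain float[] in place of VBuffer&lt;Float&gt;:

    // Hedged sketch: after each accepted step, record s_k = x_new - x_old and
    // y_k = grad_new - grad_old, discarding the oldest pair once m are stored.
    static void PushCurvaturePair(float[][] sList, float[][] yList, ref int count,
                                  float[] xOld, float[] xNew, float[] gOld, float[] gNew)
    {
        int m = sList.Length;
        var s = new float[xOld.Length];
        var y = new float[xOld.Length];
        for (int i = 0; i < s.Length; i++)
        {
            s[i] = xNew[i] - xOld[i];
            y[i] = gNew[i] - gOld[i];
        }
        if (count == m)
        {
            Array.Copy(sList, 1, sList, 0, m - 1); // drop the oldest pair
            Array.Copy(yList, 1, yList, 0, m - 1);
            count--;
        }
        sList[count] = s;
        yList[count] = y;
        count++;
    }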
Example #2
 /// <summary>
 /// Minimize a function.
 /// </summary>
 /// <param name="function">The function to minimize</param>
 /// <param name="initial">The initial point</param>
 /// <param name="result">The point at the optimum</param>
 /// <param name="optimum">The optimum function value</param>
 /// <exception cref="PrematureConvergenceException">Thrown if successive points are within numeric precision of each other, but termination condition is still unsatisfied.</exception>
 public void Minimize(DifferentiableFunction function, ref VBuffer <Float> initial, ref VBuffer <Float> result, out Float optimum)
 {
     Minimize(function, ref initial, _staticTerm, ref result, out optimum);
 }
        protected Float DifferentiableFunctionStream(FloatLabelCursor.Factory cursorFactory, ref VBuffer <Float> xDense, ref VBuffer <Float> grad, IProgressChannel pch)
        {
            Contracts.AssertValue(cursorFactory);

            VBufferUtils.Clear(ref grad);
            VBufferUtils.Densify(ref grad);

            Float[] scratch = null;
            double  loss    = 0;
            long    count   = 0;

            if (pch != null)
            {
                pch.SetHeader(new ProgressHeader(null, new[] { "examples" }), e => e.SetProgress(0, count));
            }
            using (var cursor = cursorFactory.Create())
            {
                while (cursor.MoveNext())
                {
                    loss += AccumulateOneGradient(ref cursor.Features, cursor.Label, cursor.Weight,
                                                  ref xDense, ref grad, ref scratch);
                    count++;
                }
            }

            // We need to use the double type to accumulate the loss to avoid round-off error;
            // see http://mathworld.wolfram.com/RoundoffError.html for a definition of round-off error.
            // Finally, we convert the double back to float to match the function signature.
            return((Float)loss);
        }
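The comment above is easy to demonstrate: a float accumulator loses low-order bits once the running sum dwarfs each term, while a double accumulator keeps them. A self-contained illustration (not from the original source):

    // Summing ten million copies of 0.1f: the float total drifts visibly,
    // while the double total stays close to the exact 1,000,000.
    float  fSum = 0f;
    double dSum = 0.0;
    for (int i = 0; i < 10_000_000; i++)
    {
        fSum += 0.1f;
        dSum += 0.1f;
    }
    Console.WriteLine(fSum);        // noticeably off from 1,000,000
    Console.WriteLine((float)dSum); // close to 1,000,000 after the final cast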
Example #4
 public override Float Eval(ref VBuffer <Float> input, ref VBuffer <Float> gradient)
 {
     return(Function(ref input, ref gradient, ProgressProvider));
 }
        private void TrainCore(IChannel ch, RoleMappedData data)
        {
            Host.AssertValue(ch);
            ch.AssertValue(data);

            // Compute the number of threads to use. The ctor should have verified that this will
            // produce a positive value.
            int numThreads = !UseThreads ? 1 : (NumThreads ?? Environment.ProcessorCount);

            if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor)
            {
                numThreads = Host.ConcurrencyFactor;
                ch.Warning("The number of threads specified in trainer arguments is larger than the concurrency factor "
                           + "setting of the environment. Using {0} training threads instead.", numThreads);
            }

            ch.Assert(numThreads > 0);

            NumGoodRows = 0;
            WeightSum   = 0;

            _features = null;
            _labels   = null;
            _weights  = null;
            if (numThreads > 1)
            {
                ch.Info("LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory " +
                        "issues, add 'numThreads=1' to the trainer arguments and 'cache=-' to the command line " +
                        "arguments to turn off multi-threading.");
                _features = new VBuffer <Float> [1000];
                _labels   = new Float[1000];
                if (data.Schema.Weight != null)
                {
                    _weights = new Float[1000];
                }
            }

            var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Features | CursOpt.Label | CursOpt.Weight);

            long numBad;

            // REVIEW: This pass seems overly expensive for the benefit when multi-threading is off....
            using (var cursor = cursorFactory.Create())
                using (var pch = Host.StartProgressChannel("LBFGS data prep"))
                {
                    // REVIEW: maybe it makes sense for the factory to capture the good row count after
                    // the first successful cursoring?
                    Double totalCount = data.Data.GetRowCount(true) ?? Double.NaN;

                    long exCount = 0;
                    pch.SetHeader(new ProgressHeader(null, new[] { "examples" }),
                                  e => e.SetProgress(0, exCount, totalCount));
                    while (cursor.MoveNext())
                    {
                        WeightSum += cursor.Weight;
                        if (ShowTrainingStats)
                        {
                            ProcessPriorDistribution(cursor.Label, cursor.Weight);
                        }

                        PreTrainingProcessInstance(cursor.Label, ref cursor.Features, cursor.Weight);
                        exCount++;
                        if (_features != null)
                        {
                            ch.Assert(cursor.KeptRowCount <= int.MaxValue);
                            int index = (int)cursor.KeptRowCount - 1;
                            Utils.EnsureSize(ref _features, index + 1);
                            Utils.EnsureSize(ref _labels, index + 1);
                            if (_weights != null)
                            {
                                Utils.EnsureSize(ref _weights, index + 1);
                                _weights[index] = cursor.Weight;
                            }
                            Utils.Swap(ref _features[index], ref cursor.Features);
                            _labels[index] = cursor.Label;

                            if (cursor.KeptRowCount >= int.MaxValue)
                            {
                                ch.Warning("Limiting data size for multi-threading");
                                break;
                            }
                        }
                    }
                    NumGoodRows = cursor.KeptRowCount;
                    numBad      = cursor.SkippedRowCount;
                }
            ch.Check(NumGoodRows > 0, NoTrainingInstancesMessage);
            if (numBad > 0)
            {
                ch.Warning("Skipped {0} instances with missing features/label/weight during training", numBad);
            }

            if (_features != null)
            {
                ch.Assert(numThreads > 1);

                // If there are so many threads that each only gets a small number (less than 10) of instances, trim
                // the number of threads so each gets a more reasonable number (100 or so). These numbers are pretty arbitrary,
                // but avoid the possibility of having no instances on some threads.
                if (numThreads > 1 && NumGoodRows / numThreads < 10)
                {
                    int numNew = Math.Max(1, (int)NumGoodRows / 100);
                    ch.Warning("Too few instances to use {0} threads, decreasing to {1} thread(s)", numThreads, numNew);
                    numThreads = numNew;
                }
                ch.Assert(numThreads > 0);

                // Divide up the instances among the threads.
                _numChunks = numThreads;
                _ranges    = new int[_numChunks + 1];
                int cinstTot = (int)NumGoodRows;
                for (int ichk = 0, iinstMin = 0; ichk < numThreads; ichk++)
                {
                    int cchkLeft = numThreads - ichk;                                // Number of chunks left to fill.
                    ch.Assert(0 < cchkLeft && cchkLeft <= numThreads);
                    int cinstThis = (cinstTot - iinstMin + cchkLeft - 1) / cchkLeft; // Size of this chunk.
                    ch.Assert(0 < cinstThis && cinstThis <= cinstTot - iinstMin);
                    iinstMin         += cinstThis;
                    _ranges[ichk + 1] = iinstMin;
                }

                _localLosses    = new Float[numThreads];
                _localGradients = new VBuffer <Float> [numThreads - 1];
                int size = BiasCount + WeightCount;
                for (int i = 0; i < _localGradients.Length; i++)
                {
                    _localGradients[i] = VBufferUtils.CreateEmpty <Float>(size);
                }

                ch.Assert(_numChunks > 0 && _data == null);
            }
            else
            {
                // Streaming, single-threaded case.
                _data          = data;
                _cursorFactory = cursorFactory;
                ch.Assert(_numChunks == 0 && _data != null);
            }

            VBuffer <Float>       initWeights;
            ITerminationCriterion terminationCriterion;
            Optimizer             opt = InitializeOptimizer(ch, cursorFactory, out initWeights, out terminationCriterion);

            opt.Quiet = Quiet;

            Float loss;

            try
            {
                opt.Minimize(DifferentiableFunction, ref initWeights, terminationCriterion, ref CurrentWeights, out loss);
            }
            catch (Optimizer.PrematureConvergenceException e)
            {
                if (!Quiet)
                {
                    ch.Warning("Premature convergence occurred. The OptimizationTolerance may be set too small. {0}", e.Message);
                }
                CurrentWeights = e.State.X;
                loss           = e.State.Value;
            }

            ch.Assert(CurrentWeights.Length == BiasCount + WeightCount);

            int numParams = BiasCount;

            if ((L1Weight > 0 && !Quiet) || ShowTrainingStats)
            {
                VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0)
                                                                                    {
                                                                                        numParams++;
                                                                                    }
                                            });
                if (L1Weight > 0 && !Quiet)
                {
                    ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount);
                }
            }

            if (ShowTrainingStats)
            {
                ComputeTrainingStatistics(ch, cursorFactory, loss, numParams);
            }
        }
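The chunk-sizing loop above relies on ceiling division so that the remaining instances are spread as evenly as possible across the remaining chunks. A standalone check of the same arithmetic, with hypothetical counts:

    // Partition 10 instances across 3 chunks exactly as TrainCore does.
    int cinstTot = 10, numThreads = 3, iinstMin = 0;
    var ranges = new int[numThreads + 1];
    for (int ichk = 0; ichk < numThreads; ichk++)
    {
        int cchkLeft  = numThreads - ichk;                               // chunks left to fill
        int cinstThis = (cinstTot - iinstMin + cchkLeft - 1) / cchkLeft; // ceiling division
        iinstMin        += cinstThis;
        ranges[ichk + 1] = iinstMin;
    }
    // ranges is now [0, 4, 7, 10]: chunk sizes 4, 3, 3.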
        /// <summary>
        /// Batch-parallel optimizer
        /// </summary>
        /// <remarks>
        /// REVIEW: consider getting rid of multithread-targeted members
        /// Using TPL, the distinction between Multithreaded and Sequential implementations is unnecessary
        /// </remarks>
        protected virtual Float DifferentiableFunctionMultithreaded(ref VBuffer <Float> xDense, ref VBuffer <Float> gradient, IProgressChannel pch)
        {
            Contracts.Assert(_data == null);
            Contracts.Assert(_cursorFactory == null);
            Contracts.Assert(_numChunks > 0);
            Contracts.Assert(Utils.Size(_ranges) == _numChunks + 1);
            Contracts.Assert(Utils.Size(_localLosses) == _numChunks);
            Contracts.Assert(Utils.Size(_localGradients) + 1 == _numChunks);
            Contracts.AssertValueOrNull(pch);

            // Declare local variables, since the lambda cannot capture the ref parameters
            // xDense and gradient. The gradient calculation will modify the local gradients,
            // but not this xx value.
            var xx = xDense;
            var gg = gradient;

            Parallel.For(0, _numChunks,
                         ichk =>
            {
                if (ichk == 0)
                {
                    _localLosses[ichk] = DifferentiableFunctionComputeChunk(ichk, ref xx, ref gg, pch);
                }
                else
                {
                    _localLosses[ichk] = DifferentiableFunctionComputeChunk(ichk, ref xx, ref _localGradients[ichk - 1], null);
                }
            });
            gradient = gg;
            Float loss = _localLosses[0];

            for (int i = 1; i < _numChunks; i++)
            {
                VectorUtils.Add(ref _localGradients[i - 1], ref gradient);
                loss += _localLosses[i];
            }
            return(loss);
        }
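Because the objective is a sum over examples, both the loss and the gradient decompose across chunks: each chunk computes a partial result into its own slot, and a cheap sequential pass folds the slots together. The same pattern in isolation (hypothetical data):

    using System.Threading.Tasks;

    double[] data   = { 1, 2, 3, 4, 5, 6 };
    int[] ranges    = { 0, 3, 6 };            // two chunks: [0, 3) and [3, 6)
    var partialSums = new double[ranges.Length - 1];

    Parallel.For(0, partialSums.Length, ichk =>
    {
        double sum = 0;
        for (int i = ranges[ichk]; i < ranges[ichk + 1]; i++)
            sum += data[i];
        partialSums[ichk] = sum;              // one writer per slot: no locking needed
    });

    double total = 0;
    foreach (var p in partialSums)
        total += p;                           // sequential fold, like the loop over _localLosses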
Example #7
        public void SimpleTextLoaderCopyColumnsTest()
        {
            var env = new MLContext(0);

            const string data = "0 hello 3.14159 -0 2\n"
                                + "1 1 2 4 15";
            var dataSource = new BytesStreamSource(data);

            var text = TextLoaderStatic.CreateLoader(env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1),
                                                         numericFeatures: ctx.LoadFloat(2, null)), // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3.
                                                     dataSource, separator: ' ');

            // While we have a type-safe wrapper for `IDataView` it is utterly useless except as an input to the `Fit` functions
            // of the other statically typed wrappers. We perhaps ought to make it useful in its own right, but perhaps not now.
            // For now, just operate over the actual `IDataView`.
            var textData = text.Load(dataSource).AsDynamic;

            var schema = textData.Schema;

            // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple.
            CheckSchemaHasColumn(schema, "label", out int labelIdx);
            CheckSchemaHasColumn(schema, "text", out int textIdx);
            CheckSchemaHasColumn(schema, "numericFeatures", out int numericFeaturesIdx);
            // Next verify they have the expected types.
            Assert.Equal(BooleanDataViewType.Instance, schema[labelIdx].Type);
            Assert.Equal(TextDataViewType.Instance, schema[textIdx].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema[numericFeaturesIdx].Type);
            // Next actually inspect the data.
            using (var cursor = textData.GetRowCursorForAllColumns())
            {
                var textGetter                = cursor.GetGetter <ReadOnlyMemory <char> >(textIdx);
                var numericFeaturesGetter     = cursor.GetGetter <VBuffer <float> >(numericFeaturesIdx);
                ReadOnlyMemory <char> textVal = default;
                var             labelGetter   = cursor.GetGetter <bool>(labelIdx);
                bool            labelVal      = default;
                VBuffer <float> numVal        = default;

                void CheckValuesSame(bool bl, string tx, float v0, float v1, float v2)
                {
                    labelGetter(ref labelVal);
                    textGetter(ref textVal);
                    numericFeaturesGetter(ref numVal);
                    Assert.True(tx.AsSpan().SequenceEqual(textVal.Span));
                    Assert.Equal((bool)bl, labelVal);
                    Assert.Equal(3, numVal.Length);
                    Assert.Equal(v0, numVal.GetItemOrDefault(0));
                    Assert.Equal(v1, numVal.GetItemOrDefault(1));
                    Assert.Equal(v2, numVal.GetItemOrDefault(2));
                }

                Assert.True(cursor.MoveNext(), "Could not move even to first row");
                CheckValuesSame(false, "hello", 3.14159f, -0f, 2f);
                Assert.True(cursor.MoveNext(), "Could not move to second row");
                CheckValuesSame(true, "1", 2f, 4f, 15f);
                Assert.False(cursor.MoveNext(), "Moved to third row, but there should have been only two");
            }

            // The next step where we shuffle the names around a little bit is one where we are
            // testing out the implicit usage of copy columns.

            var est         = text.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures));
            var newText     = text.Append(est);
            var newTextData = newText.Fit(dataSource).Load(dataSource);

            schema = newTextData.AsDynamic.Schema;
            // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple.
            CheckSchemaHasColumn(schema, "label", out labelIdx);
            CheckSchemaHasColumn(schema, "text", out textIdx);
            // Next verify they have the expected types.
            Assert.Equal(BooleanDataViewType.Instance, schema[textIdx].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema[labelIdx].Type);
        }
        /// <summary>
        /// Initialize weights by running SGD up to specified tolerance.
        /// </summary>
        protected virtual VBuffer <Float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory)
        {
            if (!Quiet)
            {
                ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance);
            }

            int        numExamples  = 0;
            var        oldWeights   = VBufferUtils.CreateEmpty <Float>(BiasCount + WeightCount);
            DTerminate terminateSgd =
                (ref VBuffer <Float> x) =>
            {
                if (++numExamples % 1000 != 0)
                {
                    return(false);
                }
                VectorUtils.AddMult(ref x, -1, ref oldWeights);
                Float normDiff = VectorUtils.Norm(oldWeights);
                x.CopyTo(ref oldWeights);
                // #if OLD_TRACING // REVIEW: How should this be ported?
                if (!Quiet)
                {
                    Console.Write(".");
                    if (numExamples % 50000 == 0)
                    {
                        Console.WriteLine("\t{0}\t{1}", numExamples, normDiff);
                    }
                }
                // #endif
                return(normDiff < SgdInitializationTolerance);
            };

            VBuffer <Float>  result = default(VBuffer <Float>);
            FloatLabelCursor cursor = null;

            try
            {
                Float[] scratch = null;

                SgdOptimizer.DStochasticGradient lossSgd =
                    (ref VBuffer <Float> x, ref VBuffer <Float> grad) =>
                {
                    // Zero out the gradient by sparsifying.
                    grad = new VBuffer <Float>(grad.Length, 0, grad.Values, grad.Indices);
                    EnsureBiases(ref grad);

                    if (cursor == null || !cursor.MoveNext())
                    {
                        if (cursor != null)
                        {
                            cursor.Dispose();
                        }
                        cursor = cursorFactory.Create();
                        if (!cursor.MoveNext())
                        {
                            return;
                        }
                    }
                    AccumulateOneGradient(ref cursor.Features, cursor.Label, cursor.Weight, ref x, ref grad, ref scratch);
                };

                VBuffer <Float> sgdWeights;
                if (DenseOptimizer)
                {
                    sgdWeights = VBufferUtils.CreateDense <Float>(BiasCount + WeightCount);
                }
                else
                {
                    sgdWeights = VBufferUtils.CreateEmpty <Float>(BiasCount + WeightCount);
                }
                SgdOptimizer sgdo = new SgdOptimizer(terminateSgd);
                sgdo.Minimize(lossSgd, ref sgdWeights, ref result);
                // #if OLD_TRACING // REVIEW: How should this be ported?
                if (!Quiet)
                {
                    Console.WriteLine();
                }
                // #endif
                ch.Info("SGD initialization done in {0} rounds", numExamples);
            }
            finally
            {
                if (cursor != null)
                {
                    cursor.Dispose();
                }
            }

            return(result);
        }
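The terminateSgd delegate above only pays for a convergence check every 1000 examples, measuring how far the weights moved since the previous snapshot. The same idea in isolation, over plain double[] (an illustrative sketch with a caller-supplied tolerance):

    // Declare convergence once consecutive snapshots of x are closer than tol.
    int steps = 0;
    double[] prev = null;
    bool ShouldStop(double[] x, double tol)
    {
        if (++steps % 1000 != 0)
            return false;                     // check only occasionally
        bool first = prev == null;
        double norm2 = 0;
        if (!first)
        {
            for (int i = 0; i < x.Length; i++)
            {
                double d = x[i] - prev[i];
                norm2 += d * d;
            }
        }
        prev = (double[])x.Clone();           // snapshot for the next check
        return !first && Math.Sqrt(norm2) < tol;
    }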
        void ComputeStatistics()
        {
            lock (_lock)
            {
                if (_scalingStat == null)
                {
                    using (var ch = _host.Start("ScalerTransform"))
                    {
                        var sch        = _input.Schema;
                        var indexesCol = new List <int>();

                        var textCols = _args.columns.Select(c => c.Source).ToArray();
                        _scalingStat = new Dictionary <string, List <ColumnStatObs> >();

                        for (int i = 0; i < textCols.Length; ++i)
                        {
                            int index;
                            if (!sch.TryGetColumnIndex(textCols[i], out index))
                            {
                                throw ch.Except("Unable to find column '{0}' in '{1}'", textCols[i], SchemaHelper.ToString(sch));
                            }
                            var ty = sch.GetColumnType(index);
                            if (!(ty == NumberType.R4 || ty == NumberType.U4 || ty == TextType.Instance || ty == BoolType.Instance ||
                                  (ty.IsKey() && ty.AsKey().RawKind() == DataKind.U4) || (ty.IsVector() && ty.AsVector().ItemType() == NumberType.R4)))
                            {
                                throw ch.Except("Only a float or a vector of floats or a uint or a text or a bool is allowed for column {0} (schema={1}).", _args.columns[i], SchemaHelper.ToString(sch));
                            }
                            indexesCol.Add(index);
                        }

                        // Computation
                        var required        = new HashSet <int>(indexesCol);
                        var requiredIndexes = required.OrderBy(c => c).ToArray();
                        using (var cur = _input.GetRowCursor(i => required.Contains(i)))
                        {
                            bool[] isText  = requiredIndexes.Select(c => sch.GetColumnType(c) == TextType.Instance).ToArray();
                            bool[] isBool  = requiredIndexes.Select(c => sch.GetColumnType(c) == BoolType.Instance).ToArray();
                            bool[] isFloat = requiredIndexes.Select(c => sch.GetColumnType(c) == NumberType.R4).ToArray();
                            bool[] isUint  = requiredIndexes.Select(c => sch.GetColumnType(c) == NumberType.U4 || sch.GetColumnType(c).RawKind() == DataKind.U4).ToArray();
                            ValueGetter <bool>[] boolGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == BoolType.Instance || sch.GetColumnType(i).RawKind() == DataKind.BL ? cur.GetGetter <bool>(i) : null).ToArray();
                            ValueGetter <uint>[] uintGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == NumberType.U4 || sch.GetColumnType(i).RawKind() == DataKind.U4 ? cur.GetGetter <uint>(i) : null).ToArray();
                            ValueGetter <ReadOnlyMemory <char> >[] textGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == TextType.Instance ? cur.GetGetter <ReadOnlyMemory <char> >(i) : null).ToArray();
                            ValueGetter <float>[]            floatGetters      = requiredIndexes.Select(i => sch.GetColumnType(i) == NumberType.R4 ? cur.GetGetter <float>(i) : null).ToArray();
                            ValueGetter <VBuffer <float> >[] vectorGetters     = requiredIndexes.Select(i => sch.GetColumnType(i).IsVector() ? cur.GetGetter <VBuffer <float> >(i) : null).ToArray();

                            var schema = _input.Schema;
                            for (int i = 0; i < schema.ColumnCount; ++i)
                            {
                                string name = schema.GetColumnName(i);
                                if (!required.Contains(i))
                                {
                                    continue;
                                }
                                _scalingStat[name] = new List <ColumnStatObs>();
                                var t = _scalingStat[name];
                                switch (_args.scaling)
                                {
                                case ScalerStrategy.meanVar:
                                    t.Add(new ColumnStatObs(ColumnStatObs.StatKind.sum));
                                    t.Add(new ColumnStatObs(ColumnStatObs.StatKind.sum2));
                                    t.Add(new ColumnStatObs(ColumnStatObs.StatKind.nb));
                                    break;

                                case ScalerStrategy.minMax:
                                    t.Add(new ColumnStatObs(ColumnStatObs.StatKind.min));
                                    t.Add(new ColumnStatObs(ColumnStatObs.StatKind.max));
                                    break;

                                default:
                                    throw _host.ExceptNotSupp($"Unsupported scaling strategy: {_args.scaling}.");
                                }
                            }

                            float           value     = 0;
                            var             tvalue    = new ReadOnlyMemory <char>();
                            VBuffer <float> vector    = new VBuffer <float>();
                            uint            uvalue    = 0;
                            bool            bvalue    = true;
                            var             curschema = cur.Schema;

                            while (cur.MoveNext())
                            {
                                for (int i = 0; i < requiredIndexes.Length; ++i)
                                {
                                    string name = curschema.GetColumnName(requiredIndexes[i]);
                                    if (!_scalingStat.ContainsKey(name))
                                    {
                                        continue;
                                    }
                                    if (isFloat[i])
                                    {
                                        floatGetters[i](ref value);
                                        foreach (var t in _scalingStat[name])
                                        {
                                            t.Update(value);
                                        }
                                    }
                                    else if (isBool[i])
                                    {
                                        boolGetters[i](ref bvalue);
                                        foreach (var t in _scalingStat[name])
                                        {
                                            t.Update(bvalue);
                                        }
                                    }
                                    else if (isText[i])
                                    {
                                        textGetters[i](ref tvalue);
                                        foreach (var t in _scalingStat[name])
                                        {
                                            t.Update(tvalue.ToString());
                                        }
                                    }
                                    else if (isUint[i])
                                    {
                                        uintGetters[i](ref uvalue);
                                        foreach (var t in _scalingStat[name])
                                        {
                                            t.Update(uvalue);
                                        }
                                    }
                                    else
                                    {
                                        vectorGetters[i](ref vector);
                                        foreach (var t in _scalingStat[name])
                                        {
                                            t.Update(vector);
                                        }
                                    }
                                }
                            }
                        }

                        _scalingFactors = GetScalingParameters();
                        _revIndex       = ComputeRevIndex();
                    }
                }
            }
        }
 public ScalingFactor(int colid, ScalingMethod method, VBuffer <float> mean, VBuffer <float> scale)
 {
     scalingMethod = method;
     columnId      = colid;
     this.mean     = mean;
     this.scale    = scale;
 }
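For the meanVar strategy above, the three accumulators (sum, sum2, nb) are all that is needed to recover the affine scaling applied later: mean = sum/nb and variance = sum2/nb - mean². A sketch of that arithmetic (the accumulator names follow the snippet; the values are made up):

    // Recover the mean/variance scaling from running sums.
    double sum = 0, sum2 = 0, nb = 0;
    foreach (var v in new double[] { 1, 2, 3, 4 })
    {
        sum  += v;
        sum2 += v * v;
        nb   += 1;
    }
    double mean     = sum / nb;                // 2.5
    double variance = sum2 / nb - mean * mean; // 1.25
    // A value x is then standardized as (x - mean) / Math.Sqrt(variance).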
        public void TreeEnsembleFeaturizerOutputSchemaTest()
        {
            // Create data set
            var data     = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(1000).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                NumberOfTrees   = 10,
                NumberOfLeaves  = 5,
            });

            // Train the defined tree model.
            var model = trainer.Fit(dataView);

            // From the trained tree model, a mapper of tree featurizer is created.
            const string treesColumnName  = "MyTrees";
            const string leavesColumnName = "MyLeaves";
            const string pathsColumnName  = "MyPaths";
            var          args             = new TreeEnsembleFeaturizerBindableMapper.Arguments()
            {
                TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName
            };
            var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, args, model.Model);

            // To get output schema, we need to create RoleMappedSchema for calling Bind(...).
            var roleMappedSchema = new RoleMappedSchema(dataView.Schema,
                                                        label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Label),
                                                        feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Features));

            // Retrieve output schema.
            var boundMapper  = (treeFeaturizer as ISchemaBindableMapper).Bind(Env, roleMappedSchema);
            var outputSchema = boundMapper.OutputSchema;

            {
                // Check if output schema is correct.
                var treeValuesColumn = outputSchema[0];
                Assert.Equal(treesColumnName, treeValuesColumn.Name);
                VectorDataViewType treeValuesType = treeValuesColumn.Type as VectorDataViewType;
                Assert.NotNull(treeValuesType);
                Assert.Equal(NumberDataViewType.Single, treeValuesType.ItemType);
                Assert.Equal(10, treeValuesType.Size);
                // Below we check the only metadata field.
                Assert.Single(treeValuesColumn.Annotations.Schema);
                VBuffer <ReadOnlyMemory <char> > slotNames = default;
                treeValuesColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref slotNames);
                Assert.Equal(10, slotNames.Length);
                // Just check the head and the tail of the extracted vector.
                Assert.Equal("Tree000", slotNames.GetItemOrDefault(0).ToString());
                Assert.Equal("Tree009", slotNames.GetItemOrDefault(9).ToString());
            }

            {
                var treeLeafIdsColumn = outputSchema[1];
                // Check column of tree leaf IDs.
                Assert.Equal(leavesColumnName, treeLeafIdsColumn.Name);
                VectorDataViewType treeLeafIdsType = treeLeafIdsColumn.Type as VectorDataViewType;
                Assert.NotNull(treeLeafIdsType);
                Assert.Equal(NumberDataViewType.Single, treeLeafIdsType.ItemType);
                Assert.Equal(50, treeLeafIdsType.Size);
                // Below we check the two leaf-IDs column's metadata fields.
                Assert.Equal(2, treeLeafIdsColumn.Annotations.Schema.Count);
                // Check metadata field IsNormalized's content.
                bool leafIdsNormalizedFlag = false;
                treeLeafIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref leafIdsNormalizedFlag);
                Assert.True(leafIdsNormalizedFlag);
                // Check metadata field SlotNames's content.
                VBuffer <ReadOnlyMemory <char> > leafIdsSlotNames = default;
                treeLeafIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref leafIdsSlotNames);
                Assert.Equal(50, leafIdsSlotNames.Length);
                // Just check the head and the tail of the extracted vector.
                Assert.Equal("Tree000Leaf000", leafIdsSlotNames.GetItemOrDefault(0).ToString());
                Assert.Equal("Tree009Leaf004", leafIdsSlotNames.GetItemOrDefault(49).ToString());
            }

            {
                var treePathIdsColumn = outputSchema[2];
                // Check column of path IDs.
                Assert.Equal(pathsColumnName, treePathIdsColumn.Name);
                VectorDataViewType treePathIdsType = treePathIdsColumn.Type as VectorDataViewType;
                Assert.NotNull(treePathIdsType);
                Assert.Equal(NumberDataViewType.Single, treePathIdsType.ItemType);
                Assert.Equal(40, treePathIdsType.Size);
                // Below we check the two path-IDs column's metadata fields.
                Assert.Equal(2, treePathIdsColumn.Annotations.Schema.Count);
                // Check metadata field IsNormalized's content.
                bool pathIdsNormalizedFlag = false;
                treePathIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref pathIdsNormalizedFlag);
                Assert.True(pathIdsNormalizedFlag);
                // Check metadata field SlotNames's content.
                VBuffer <ReadOnlyMemory <char> > pathIdsSlotNames = default;
                treePathIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref pathIdsSlotNames);
                Assert.Equal(40, pathIdsSlotNames.Length);
                // Just check the head and the tail of the extracted vector.
                Assert.Equal("Tree000Node000", pathIdsSlotNames.GetItemOrDefault(0).ToString());
                Assert.Equal("Tree009Node003", pathIdsSlotNames.GetItemOrDefault(39).ToString());
            }
        }
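The sizes asserted above follow directly from the trainer options: 10 trees yield a 10-slot tree-value vector; 10 trees × 5 leaves yield 50 leaf-indicator slots; and because a binary tree with 5 leaves has 5 - 1 = 4 internal nodes, the path column carries 10 × 4 = 40 slots.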
Example #12
            private ValueGetter <VBuffer <float> > GetGetterVec(IRow input, int iinfo)
            {
                Host.AssertValue(input);
                Host.Assert(0 <= iinfo && iinfo < _parent.ColumnPairs.Length);

                var colType = input.Schema.GetColumnType(ColMapNewToOld[iinfo]);

                Host.Assert(colType.IsVector);
                Host.Assert(colType.ItemType.IsText);

                var srcGetter = input.GetGetter <VBuffer <ReadOnlyMemory <char> > >(ColMapNewToOld[iinfo]);
                var src       = default(VBuffer <ReadOnlyMemory <char> >);
                int dimension = _parent._currentVocab.Dimension;

                float[] wordVector = new float[_parent._currentVocab.Dimension];

                return
                    ((ref VBuffer <float> dst) =>
                {
                    int deno = 0;
                    srcGetter(ref src);
                    var values = dst.Values;
                    if (Utils.Size(values) != 3 * dimension)
                    {
                        values = new float[3 * dimension];
                    }
                    int offset = 2 * dimension;
                    for (int i = 0; i < dimension; i++)
                    {
                        values[i] = float.MaxValue;
                        values[i + dimension] = 0;
                        values[i + offset] = float.MinValue;
                    }
                    for (int word = 0; word < src.Count; word++)
                    {
                        if (_parent._currentVocab.GetWordVector(ref src.Values[word], wordVector))
                        {
                            deno++;
                            for (int i = 0; i < dimension; i++)
                            {
                                float currentTerm = wordVector[i];
                                if (values[i] > currentTerm)
                                {
                                    values[i] = currentTerm;
                                }
                                values[dimension + i] += currentTerm;
                                if (values[offset + i] < currentTerm)
                                {
                                    values[offset + i] = currentTerm;
                                }
                            }
                        }
                    }

                    if (deno != 0)
                    {
                        for (int index = 0; index < dimension; index++)
                        {
                            values[index + dimension] /= deno;
                        }
                    }

                    dst = new VBuffer <float>(values.Length, values, dst.Indices);
                });
            }
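The getter above performs min/mean/max pooling over word embeddings: the 3 × dimension output holds the per-dimension minimum, average, and maximum across all recognized tokens. The same pooling in isolation, over plain float arrays with made-up embeddings:

    // Pool a bag of word vectors into [min | mean | max], as GetGetterVec does.
    float[][] words = { new float[] { 1f, -2f }, new float[] { 3f, 0f } };
    int dim = 2;
    var pooled = new float[3 * dim];
    for (int i = 0; i < dim; i++)
    {
        pooled[i]           = float.MaxValue; // min slots
        pooled[i + 2 * dim] = float.MinValue; // max slots
    }
    foreach (var w in words)
    {
        for (int i = 0; i < dim; i++)
        {
            pooled[i]           = Math.Min(pooled[i], w[i]);
            pooled[i + dim]    += w[i];
            pooled[i + 2 * dim] = Math.Max(pooled[i + 2 * dim], w[i]);
        }
    }
    for (int i = 0; i < dim; i++)
        pooled[i + dim] /= words.Length;      // turn sums into means
    // pooled is now [1, -2, 2, -1, 3, 0].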
Example #13
        private void HashTestCore <T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
        {
            const int bits = 10;

            var builder = new MetadataBuilder();

            builder.AddPrimitiveValue("Foo", type, val);
            var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            // First do an unordered hash.
            var info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits);
            var xf     = new HashingTransformer(Env, new[] { info });
            var mapper = xf.GetRowToRowMapper(inRow.Schema);

            mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol);
            var outRow = mapper.GetRow(inRow, c => c == outCol);

            var  getter = outRow.GetGetter <uint>(outCol);
            uint result = 0;

            getter(ref result);
            Assert.Equal(expected, result);

            // Next do an ordered hash.
            info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow = mapper.GetRow(inRow, c => c == outCol);

            getter = outRow.GetGetter <uint>(outCol);
            getter(ref result);
            Assert.Equal(expectedOrdered, result);

            // Next build up a vector to make sure that hashing is consistent between scalar values
            // at least in the first position, and in the unordered case, the last position.
            const int vecLen   = 5;
            var       denseVec = new VBuffer <T>(vecLen, Utils.CreateArray(vecLen, val));

            builder = new MetadataBuilder();
            builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => denseVec.CopyTo(ref dst));
            inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow = mapper.GetRow(inRow, c => c == outCol);

            var            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            VBuffer <uint> vecResult = default;

            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            // They all should equal this in this case.
            Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

            // Now do ordered with the dense vector.
            info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
            Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

            // Let's now do a sparse vector.
            var sparseVec = new VBuffer <T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });

            builder = new MetadataBuilder();
            builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => sparseVec.CopyTo(ref dst));
            inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expected, vecResult.GetItemOrDefault(0));
            Assert.Equal(expected, vecResult.GetItemOrDefault(3));
            Assert.Equal(expected, vecResult.GetItemOrDefault(7));

            info   = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
        }
Example #14
        private void TrainCore(IChannel ch, FloatLabelCursor.Factory cursorFactory, int featureCount)
        {
            Host.AssertValue(ch);
            ch.AssertValue(cursorFactory);

            int m = featureCount + 1;

            // Check for memory conditions first.
            if ((long)m * (m + 1) / 2 > int.MaxValue)
            {
                throw ch.Except("Cannot hold covariance matrix in memory with {0} features", m - 1);
            }

            // Track the number of examples.
            long n = 0;
            // Since we are accumulating over many values, we use Double even for the single precision build.
            var xty = new Double[m];
            // The layout of this algorithm is a packed row-major lower triangular matrix.
            var xtx = new Double[m * (m + 1) / 2];

            // Build X'X (lower triangular) and X'y incrementally (X'X+=X'X_i; X'y+=X'y_i):
            using (var cursor = cursorFactory.Create())
            {
                while (cursor.MoveNext())
                {
                    var yi = cursor.Label;
                    // Increment first element of X'y
                    xty[0] += yi;
                    // Increment first element of lower triangular X'X
                    xtx[0] += 1;
                    var values = cursor.Features.Values;

                    if (cursor.Features.IsDense)
                    {
                        int ioff = 1;
                        ch.Assert(cursor.Features.Count + 1 == m);
                        // Increment rest of first column of lower triangular X'X
                        for (int i = 1; i < m; i++)
                        {
                            ch.Assert(ioff == i * (i + 1) / 2);
                            var val = values[i - 1];
                            // Add the implicit first bias term to X'X
                            xtx[ioff++] += val;
                            // Add the remainder of X'X
                            for (int j = 0; j < i; j++)
                            {
                                xtx[ioff++] += val * values[j];
                            }
                            // X'y
                            xty[i] += val * yi;
                        }
                        ch.Assert(ioff == xtx.Length);
                    }
                    else
                    {
                        var fIndices = cursor.Features.Indices;
                        for (int ii = 0; ii < cursor.Features.Count; ++ii)
                        {
                            int i    = fIndices[ii] + 1;
                            int ioff = i * (i + 1) / 2;
                            var val  = values[ii];
                            // Add the implicit first bias term to X'X
                            xtx[ioff++] += val;
                            // Add the remainder of X'X
                            for (int jj = 0; jj <= ii; jj++)
                            {
                                xtx[ioff + fIndices[jj]] += val * values[jj];
                            }
                            // X'y
                            xty[i] += val * yi;
                        }
                    }
                    n++;
                }
                ch.Check(n > 0, "No training examples in dataset.");
                if (cursor.BadFeaturesRowCount > 0)
                {
                    ch.Warning("Skipped {0} instances with missing features/label during training", cursor.SkippedRowCount);
                }

                if (_l2Weight > 0)
                {
                    // Skip the bias term for regularization, in the ridge regression case.
                    // So start at [1,1] instead of [0,0].

                    // REVIEW: There are two ways to view this. Firstly, it is more
                    // user friendly to make this scaling factor behave similarly regardless
                    // of data size, so that with the same parameters you get the same
                    // model whether you feed in your data once or duplicate it 10 times.
                    // This is what we have now. The alternate point of view is to treat this
                    // L2 regularization parameter as providing some sort of prior, in which
                    // case duplicating 10 times should in fact be treated differently! (That
                    // is, we should not multiply by n below.) Both interpretations seem
                    // correct, in their way.
                    Double squared = _l2Weight * _l2Weight * n;
                    int    ioff    = 0;
                    for (int i = 1; i < m; ++i)
                    {
                        xtx[ioff += i + 1] += squared;
                    }
                    ch.Assert(ioff == xtx.Length - 1);
                }
            }

            if (!(_l2Weight > 0) && n < m)
            {
                throw ch.Except("Ordinary least squares requires more examples than parameters. There are {0} parameters, but {1} examples. To enable training, use a positive L2 weight so this behaves as ridge regression.", m, n);
            }

            Double yMean = n == 0 ? 0 : xty[0] / n;

            ch.Info("Trainer solving for {0} parameters across {1} examples", m, n);
            // Cholesky Decomposition of X'X into LL'
            try
            {
                Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
            }
            catch (DllNotFoundException)
            {
                // REVIEW: Is there no better way?
                throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing.");
            }
            // Solve for beta in (LL')beta = X'y:
            Mkl.Pptrs(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, 1, xtx, xty, 1);
            // Note that the solver overwrote xty so it contains the solution. To be more clear,
            // we effectively change its name (through reassignment) so we don't get confused that
            // this is somehow xty in the remaining calculation.
            var beta = xty;

            xty = null;
            // Check that the solution is valid.
            for (int i = 0; i < beta.Length; ++i)
            {
                ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution");
            }

            var weights = VBufferUtils.CreateDense <Float>(beta.Length - 1);

            for (int i = 1; i < beta.Length; ++i)
            {
                weights.Values[i - 1] = (Float)beta[i];
            }
            _weights        = weights;
            _bias           = (Float)beta[0];
            _standardErrors = _tValues = _pValues = null;
            if (!(_l2Weight > 0) && m == n)
            {
                // We would expect the solution to the problem to be exact in this case.
                _rSquared         = 1;
                _rSquaredAdjusted = Float.NaN;
                ch.Info("Number of examples equals number of parameters, solution is exact but no statistics can be derived");
                ch.Done();
                return;
            }

            Double rss = 0; // residual sum of squares
            Double tss = 0; // total sum of squares

            using (var cursor = cursorFactory.Create())
            {
                var   lrPredictor = new LinearRegressionPredictor(Host, ref _weights, _bias);
                var   lrMap       = lrPredictor.GetMapper <VBuffer <Float>, Float>();
                Float yh          = default(Float);
                while (cursor.MoveNext())
                {
                    var features = cursor.Features;
                    lrMap(ref features, ref yh);
                    var e = cursor.Label - yh;
                    rss += e * e;
                    var ydm = cursor.Label - yMean;
                    tss += ydm * ydm;
                }
            }
            _rSquared = ProbClamp(1 - (rss / tss));
            // R^2 adjusted differs from the normal formula on account of the bias term, by Said's reckoning.
            if (n > m)
            {
                _rSquaredAdjusted = ProbClamp(1 - (1 - _rSquared) * (n - 1) / (n - m));
                ch.Info("Coefficient of determination R2 = {0:g}, or {1:g} (adjusted)",
                        _rSquared, _rSquaredAdjusted);
            }
            else
            {
                _rSquaredAdjusted = Double.NaN;
            }

            // The per-parameter significance is compute-intensive and may not be required for
            // all practitioners. Also, we cannot estimate it unless we can estimate the
            // variance, which requires more examples than parameters.
            if (!_perParameterSignificance || m >= n)
            {
                return;
            }

            ch.Assert(!Double.IsNaN(_rSquaredAdjusted));
            _standardErrors = new Double[m];
            _tValues        = new Double[m];
            _pValues        = new Double[m];
            // Invert X'X:
            Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
            var s2 = rss / (n - m); // estimate of variance of y

            for (int i = 0; i < m; i++)
            {
                // Initialize with inverse Hessian.
                _standardErrors[i] = (Single)xtx[i * (i + 1) / 2 + i];
            }

            if (_l2Weight > 0)
            {
                // Iterate through all entries of inverse Hessian to make adjustment to variance.
                int   ioffset = 1;
                Float reg     = _l2Weight * _l2Weight * n;
                for (int iRow = 1; iRow < m; iRow++)
                {
                    for (int iCol = 0; iCol <= iRow; iCol++)
                    {
                        var entry      = (Single)xtx[ioffset];
                        var adjustment = -reg * entry * entry;
                        _standardErrors[iRow] -= adjustment;
                        if (0 < iCol && iCol < iRow)
                        {
                            _standardErrors[iCol] -= adjustment;
                        }
                        ioffset++;
                    }
                }

                Contracts.Assert(ioffset == xtx.Length);
            }

            for (int i = 0; i < m; i++)
            {
                // sqrt of diagonal entries of s2 * inverse(X'X + reg * I) * X'X * inverse(X'X + reg * I).
                _standardErrors[i] = Math.Sqrt(s2 * _standardErrors[i]);
                ch.Check(FloatUtils.IsFinite(_standardErrors[i]), "Non-finite standard error detected from OLS solution");
                _tValues[i] = beta[i] / _standardErrors[i];
                _pValues[i] = (Float)MathUtils.TStatisticToPValue(_tValues[i], n - m);
                ch.Check(0 <= _pValues[i] && _pValues[i] <= 1, "p-Value calculated outside expected [0,1] range");
            }
        }
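Throughout this trainer, X'X is kept as a packed row-major lower-triangular array: row i starts at offset i(i+1)/2, so entry (i, j) with j ≤ i lives at i(i+1)/2 + j, which is exactly the ioff arithmetic used above. A quick standalone check of the layout (illustrative values):

    // Pack the 3x3 lower triangle [[1], [2, 3], [4, 5, 6]] and read it back.
    double[] packed = { 1, 2, 3, 4, 5, 6 };
    double At(int i, int j) => packed[i * (i + 1) / 2 + j]; // requires j <= i
    // At(0,0) == 1, At(1,0) == 2, At(1,1) == 3,
    // At(2,0) == 4, At(2,1) == 5, At(2,2) == 6.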
Example #15
            private protected override void TransformCore(ref TInput input, FixedSizeQueue <TInput> windowedBuffer, long iteration, ref VBuffer <TInput> output)
            {
                int size   = _parentSliding.WindowSize - _parentSliding._lag + 1;
                var result = VBufferEditor.Create(ref output, size);

                if (_parentSliding._lag == 0)
                {
                    for (int i = 0; i < _parentSliding.WindowSize; ++i)
                    {
                        result.Values[i] = windowedBuffer[i];
                    }
                    result.Values[_parentSliding.WindowSize] = input;
                }
                else
                {
                    for (int i = 0; i < size; ++i)
                    {
                        result.Values[i] = windowedBuffer[i];
                    }
                }
                output = result.Commit();
            }
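The output length above works out to WindowSize - lag + 1: with lag = 0 the method emits all WindowSize buffered values plus the current input (WindowSize + 1 slots), while with lag > 0 it emits only the first WindowSize - lag + 1 buffered values. For instance, with WindowSize = 3, lag = 0 yields 4 slots and lag = 1 yields 3.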
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext();

            // Create a small dataset as an IEnumerable.
            var samples = new List <TextData>()
            {
                new TextData()
                {
                    Text = "This is an example to compute n-grams."
                },
                new TextData()
                {
                    Text = "N-gram is a sequence of 'N' consecutive " +
                           "words/tokens."
                },

                new TextData()
                {
                    Text = "ML.NET's ProduceNgrams API produces " +
                           "vector of n-grams."
                },

                new TextData()
                {
                    Text = "Each position in the vector corresponds " +
                           "to a particular n-gram."
                },

                new TextData()
                {
                    Text = "The value at each position corresponds " +
                           "to,"
                },

                new TextData()
                {
                    Text = "the number of times n-gram occurred in " +
                           "the data (Tf), or"
                },

                new TextData()
                {
                    Text = "the inverse of the number of documents " +
                           "that contain the n-gram (Idf),"
                },

                new TextData()
                {
                    Text = "or compute both and multiply together " +
                           "(Tf-Idf)."
                },
            };

            // Convert training data to IDataView.
            var dataview = mlContext.Data.LoadFromEnumerable(samples);

            // A pipeline for converting text into numeric n-gram features.
            // The following call to 'ProduceNgrams' requires the tokenized
            // text/string as input. This is achieved by calling
            // 'TokenizeIntoWords' first followed by 'ProduceNgrams'. Please note
            // that the length of the output feature vector depends on the n-gram
            // settings.
            var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens",
                                                                           "Text")
                               // 'ProduceNgrams' takes key type as input. Converting the tokens
                               // into key type using 'MapValueToKey'.
                               .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
                               .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures",
                                                                               "Tokens",
                                                                               ngramLength: 3,
                                                                               useAllLengths: false,
                                                                               weighting: NgramExtractingEstimator.WeightingCriteria.Tf));

            // Fit to data.
            var textTransformer     = textPipeline.Fit(dataview);
            var transformedDataView = textTransformer.Transform(dataview);

            // Create the prediction engine to get the n-gram features extracted
            // from the text.
            var predictionEngine = mlContext.Model.CreatePredictionEngine <TextData,
                                                                           TransformedTextData>(textTransformer);

            // Convert the text into numeric features.
            var prediction = predictionEngine.Predict(samples[0]);

            // Print the length of the feature vector.
            Console.WriteLine("Number of Features: " + prediction.NgramFeatures
                              .Length);

            // Preview of the produced n-grams.
            // Get the slot names from the column's metadata.
            // The slot names for a vector column correspond to the names
            // associated with each position in the vector.
            VBuffer <ReadOnlyMemory <char> > slotNames = default;

            transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
            var NgramFeaturesColumn = transformedDataView.GetColumn <VBuffer <
                                                                         float> >(transformedDataView.Schema["NgramFeatures"]);
            var slots = slotNames.GetValues();

            Console.Write("N-grams: ");
            foreach (var featureRow in NgramFeaturesColumn)
            {
                foreach (var item in featureRow.Items())
                {
                    Console.Write($"{slots[item.Key]}  ");
                }
                Console.WriteLine();
            }

            // Print the first 10 feature values.
            Console.Write("Features: ");
            for (int i = 0; i < 10; i++)
            {
                Console.Write($"{prediction.NgramFeatures[i]:F4}  ");
            }

            //  Expected output:
            //   Number of Features: 52
            //   N-grams:   This|is|an  is|an|example  an|example|to  example|to|compute  to|compute|n-grams.  N-gram|is|a  is|a|sequence  a|sequence|of  sequence|of|'N'  of|'N'|consecutive  ...
            //   Features:     1.0000      1.0000          1.0000           1.0000             1.0000            0.0000      0.0000          0.0000          0.0000          0.0000          ...
        }
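The POCO types used by this sample are outside the excerpt; below is a minimal sketch consistent with how the code uses them (the property names come from the sample above, while the class shapes are assumed):

        private class TextData
        {
            public string Text { get; set; }
        }

        private class TransformedTextData : TextData
        {
            public float[] NgramFeatures { get; set; }
        }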
Example #17
 /// <summary>
 /// Return the raw margin from the decision hyperplane
 /// </summary>
 protected override Float Margin(ref VBuffer <Float> feat)
 {
     return(Bias + VectorUtils.DotProduct(ref feat, ref Weights) * WeightsScale);
 }
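In symbols, the raw margin above is the affine score of a linear model,

$$ f(x) = \mathrm{Bias} + \mathrm{WeightsScale} \cdot \langle \mathrm{Weights},\, x \rangle . $$

Keeping the scale as a separate scalar is a common trick in online linear learners (uniformly rescaling the weights becomes a single multiplication); that motivation is inferred, not stated in the snippet.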
Example #18
 protected virtual void PreTrainingProcessInstance(Float label, ref VBuffer <Float> feat, Float weight)
 {
 }
Example #19
        private void InitDenseVecMap <T>(T[] vals, PrimitiveDataViewType itemType, int hashBits = 20)
        {
            var vbuf = new VBuffer <T>(vals.Length, vals);

            InitMap(vbuf, new VectorType(itemType, vals.Length), hashBits, vbuf.CopyTo);
        }
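As a quick sanity check on the default: hashBits = 20 yields a 2^20 = 1,048,576-slot hashed vector (assuming InitMap interprets hashBits as the number of output bits, which the parameter name suggests but this excerpt does not show).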
Example #20
 protected abstract Float AccumulateOneGradient(ref VBuffer <Float> feat, Float label, Float weight,
                                                ref VBuffer <Float> xDense, ref VBuffer <Float> grad, ref Float[] scratch);
Example #21
        /// <summary>
        /// Features: x1, x2vBuff (sparse vector), x3.
        /// y = 10x1 + 10x2vBuff + 20x3 + e.
        /// Within the x2vBuff feature, the 2nd slot will be sparse most of the time.
        /// The 2nd slot of x2vBuff has the least importance: evaluation metrics change little when it is permuted.
        /// x3 has the greatest importance.
        /// </summary>
        private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numberOfInstances = 1000)
        {
            // Setup synthetic dataset.
            var rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array = new float[numberOfInstances],
            x3Array = new float[numberOfInstances];

            VBuffer <float>[] vbArray = new VBuffer <float> [numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x3Important = rand.Next(10000);
                x3Array[i] = x3Important;

                VBuffer <float> vb;

                if (i % 10 != 0)
                {
                    vb = new VBuffer <float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
                }
                else
                {
                    vb = new VBuffer <float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
                }

                vbArray[i] = vb;

                float vbSum = 0;
                foreach (var vbValue in vb.DenseValues())
                {
                    vbSum += vbValue * 10;
                }

                var noise = rand.Next(50);
                yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
            }

            // For classification tasks, convert the continuous labels into class labels.
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
            bldr.AddColumn("X2VBuffer", NumberDataViewType.Single, vbArray);
            bldr.AddColumn("X3Important", NumberDataViewType.Single, x3Array);
            bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
                           .Append(ML.Transforms.Normalize("Features"));

            if (task == TaskType.BinaryClassification)
            {
                return(pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
                       .Fit(srcDV).Transform(srcDV));
            }
            else if (task == TaskType.MulticlassClassification)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("Label"))
                       .Fit(srcDV).Transform(srcDV));
            }
            else if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
Example #22
        protected Float DifferentiableFunctionComputeChunk(int ichk, ref VBuffer <Float> xDense, ref VBuffer <Float> grad, IProgressChannel pch)
        {
            Contracts.Assert(0 <= ichk && ichk < _numChunks);
            Contracts.AssertValueOrNull(pch);

            VBufferUtils.Clear(ref grad);
            VBufferUtils.Densify(ref grad);

            Float[] scratch = null;
            double  loss    = 0;
            int     ivMin   = _ranges[ichk];
            int     ivLim   = _ranges[ichk + 1];
            int     iv      = ivMin;

            if (pch != null)
            {
                pch.SetHeader(new ProgressHeader(null, new[] { "examples" }), e => e.SetProgress(0, iv - ivMin, ivLim - ivMin));
            }
            for (iv = ivMin; iv < ivLim; iv++)
            {
                Float weight = _weights != null ? _weights[iv] : 1;
                loss += AccumulateOneGradient(ref _features[iv], _labels[iv], weight, ref xDense, ref grad, ref scratch);
            }
            // We need to use the double type to accumulate the loss, to avoid roundoff error;
            // see http://mathworld.wolfram.com/RoundoffError.html for a definition of roundoff error.
            // Finally, we convert the double back to float to match the function definition.
            return((Float)loss);
        }
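Each call above clears and densifies the gradient buffer it is handed, so per-chunk results cannot share one buffer; a plausible driver is sketched below. The method name, the chunkGrads array, and the VectorUtils.Add-style accumulation are assumptions about the surrounding trainer, not code from this excerpt.

        private Float DifferentiableFunctionAllChunks(ref VBuffer <Float> xDense, ref VBuffer <Float> grad)
        {
            // One gradient buffer per chunk, because each chunk call resets the buffer it receives.
            var chunkGrads = new VBuffer <Float> [_numChunks];
            // Accumulate the loss in double to limit roundoff, as in the chunk method itself.
            double totalLoss = 0;
            for (int ichk = 0; ichk < _numChunks; ichk++)
                totalLoss += DifferentiableFunctionComputeChunk(ichk, ref xDense, ref chunkGrads[ichk], null);

            // Sum the per-chunk gradients into the caller's buffer (accumulation helper assumed).
            VBufferUtils.Clear(ref grad);
            VBufferUtils.Densify(ref grad);
            for (int ichk = 0; ichk < _numChunks; ichk++)
                VectorUtils.Add(ref chunkGrads[ichk], ref grad);
            return (Float)totalLoss;
        }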
Example #23
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext(seed: 1);

            // Get a small dataset as an IEnumerable.
            var rawData = new[] {
                new DataPoint()
                {
                    Category = "MLB", Age = 18
                },
                new DataPoint()
                {
                    Category = "NFL", Age = 14
                },
                new DataPoint()
                {
                    Category = "NFL", Age = 15
                },
                new DataPoint()
                {
                    Category = "MLB", Age = 18
                },
                new DataPoint()
                {
                    Category = "MLS", Age = 14
                },
            };

            var data = mlContext.Data.LoadFromEnumerable(rawData);

            // Construct the pipeline that would hash the two columns and store the
            // results in new columns. The first transform hashes the string column
            // and the second transform hashes the integer column.
            //
            // Hashing is not a reversible operation, so there is no way to retrieve
            // the original value from the hashed value. Sometimes, for debugging,
            // or model explainability, users will need to know what values in the
            // original columns generated the values in the hashed columns, since
            // the algorithms will mostly use the hashed values for further
            // computations. The Hash method will preserve the mapping from the
            // original values to the hashed values in the Annotations of the newly
            // created column (column populated with the hashed values).
            //
            // Setting the maximumNumberOfInverts parameters to -1 will preserve the
            // full map. If that parameter is left to the default 0 value, the
            // mapping is not preserved.
            var pipeline = mlContext.Transforms.Conversion.Hash(
                new[]
            {
                new HashingEstimator.ColumnOptions(
                    "CategoryHashed",
                    "Category",
                    16,
                    useOrderedHashing: false,
                    maximumNumberOfInverts: -1),

                new HashingEstimator.ColumnOptions(
                    "AgeHashed",
                    "Age",
                    8,
                    useOrderedHashing: false)
            });

            // Let's fit our pipeline, and then apply it to the same data.
            var transformer     = pipeline.Fit(data);
            var transformedData = transformer.Transform(data);

            // Convert the post transformation from the IDataView format to an
            // IEnumerable <TransformedData> for easy consumption.
            var convertedData = mlContext.Data.CreateEnumerable <
                TransformedDataPoint>(transformedData, true);

            Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
            foreach (var item in convertedData)
            {
                Console.WriteLine($"{item.Category}\t {item.CategoryHashed}\t\t  " +
                                  $"{item.Age}\t {item.AgeHashed}");
            }

            // Expected data after the transformation.
            //
            // Category CategoryHashed   Age     AgeHashed
            // MLB      36206            18      127
            // NFL      19015            14      62
            // NFL      19015            15      43
            // MLB      36206            18      127
            // MLS      6013             14      62

            // For the Category column, where we set the maximumNumberOfInverts
            // parameter, the names of the original categories, and their
            // correspondence with the generated hash values is preserved in the
            // Annotations in the format of indices and values. The indices array
            // will have the hashed values, and the corresponding element,
            // position-wise, in the values array will contain the original value.
            //
            // See below for an example on how to retrieve the mapping.
            var slotNames = new VBuffer <ReadOnlyMemory <char> >();

            transformedData.Schema["CategoryHashed"].Annotations.GetValue(
                "KeyValues", ref slotNames);

            var indices       = slotNames.GetIndices();
            var categoryNames = slotNames.GetValues();

            for (int i = 0; i < indices.Length; i++)
            {
                Console.WriteLine($"The original value of the {indices[i]} " +
                                  $"category is {categoryNames[i]}");
            }

            // Output Data
            //
            // The original value of the 6012 category is MLS
            // The original value of the 19014 category is NFL
            // The original value of the 36205 category is MLB
        }
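As with the n-gram sample, the POCO types are not shown in this excerpt; below is a minimal sketch consistent with the printed output (the uint types for the hashed columns are an assumption based on the displayed values):

        private class DataPoint
        {
            public string Category { get; set; }
            public uint Age { get; set; }
        }

        private class TransformedDataPoint : DataPoint
        {
            public uint CategoryHashed { get; set; }
            public uint AgeHashed { get; set; }
        }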
Example #24
 internal FunctionOptimizerState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function, ref VBuffer <Float> initial, int m,
                                 long totalMemLimit, bool keepDense, bool enforceNonNegativity)
     : base(ch, progress, ref initial, m, totalMemLimit, keepDense, enforceNonNegativity)
 {
     Function = function;
     Init();
 }
Example #25
 public void GetValue(ref VBuffer <T> dst)
 {
     Contracts.Check(Cursor.IsGood);
     Src.CopyTo(ref dst);
 }
Example #26
 public abstract Float Eval(ref VBuffer <Float> input, ref VBuffer <Float> gradient);
Example #27
        private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
        {
            // Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
            // from 1 (in position 0) up to ngramLength (in position ngramLength-1)
            var lims = new int[Infos.Length][];

            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                var all         = args.Column[iinfo].AllLengths ?? args.AllLengths;
                var ngramLength = _exes[iinfo].NgramLength;
                var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms;
                if (!all)
                {
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 ||
                                      Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));
                    lims[iinfo] = new int[ngramLength];
                    lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
                }
                else
                {
                    Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms));
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms));
                    var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1];
                    lims[iinfo] = Utils.BuildArray(ngramLength,
                                                   i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend);
                }
            }

            var helpers = new NgramBufferBuilder[Infos.Length];
            var getters = new ValueGetter <VBuffer <uint> > [Infos.Length];
            var src     = new VBuffer <uint> [Infos.Length];

            // Keep track of how many grams are in the pool for each value of n. Position
            // i in counts[iinfo] records how many (i+1)-grams are in the pool for column iinfo.
            var counts    = new int[Infos.Length][];
            var ngramMaps = new SequencePool[Infos.Length];

            bool[] activeInput = new bool[trainingData.Schema.ColumnCount];
            foreach (var info in Infos)
            {
                activeInput[info.Source] = true;
            }
            using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
                using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
                {
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
                        var ngramLength = _exes[iinfo].NgramLength;
                        var skipLength  = _exes[iinfo].SkipLength;

                        getters[iinfo]   = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, cursor, Infos[iinfo].Source);
                        src[iinfo]       = default(VBuffer <uint>);
                        counts[iinfo]    = new int[ngramLength];
                        ngramMaps[iinfo] = new SequencePool();

                        // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
                        // be added (using lims[iinfo]), therefore we set slotLim to the maximum
                        helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize,
                                                                GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host));
                    }

                    int    cInfoFull = 0;
                    bool[] infoFull  = new bool[Infos.Length];

                    invDocFreqs = new double[Infos.Length][];

                    long   totalDocs = 0;
                    Double rowCount  = trainingData.GetRowCount(true) ?? Double.NaN;
                    var    buffers   = new VBuffer <float> [Infos.Length];
                    pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
                                  e => e.SetProgress(0, totalDocs, rowCount));
                    while (cInfoFull < Infos.Length && cursor.MoveNext())
                    {
                        totalDocs++;
                        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                        {
                            getters[iinfo](ref src[iinfo]);
                            var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
                            if (keyCount == 0)
                            {
                                keyCount = uint.MaxValue;
                            }
                            if (!infoFull[iinfo])
                            {
                                if (_exes[iinfo].RequireIdf())
                                {
                                    helpers[iinfo].Reset();
                                }

                                helpers[iinfo].AddNgrams(ref src[iinfo], 0, keyCount);
                                if (_exes[iinfo].RequireIdf())
                                {
                                    int totalNgrams = counts[iinfo].Sum();
                                    Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                                    helpers[iinfo].GetResult(ref buffers[iinfo]);
                                    foreach (var pair in buffers[iinfo].Items())
                                    {
                                        if (pair.Value >= 1)
                                        {
                                            invDocFreqs[iinfo][pair.Key] += 1;
                                        }
                                    }
                                }
                            }
                            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
                        }
                    }

                    pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++)
                        {
                            if (invDocFreqs[iinfo][i] != 0)
                            {
                                invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]);
                            }
                        }
                    }

                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);

                        int ngramLength = _exes[iinfo].NgramLength;
                        for (int i = 0; i < ngramLength; i++)
                        {
                            _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
                        }
                    }

                    return(ngramMaps);
                }
        }
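For reference, the document-frequency counts collected above become the weights applied in GetGetterCore (Example #29): with N = totalDocs and df(g) the number of documents containing n-gram g,

$$ \mathrm{idf}(g) = \ln \frac{N}{\mathrm{df}(g)}, \qquad \text{tf-idf}(g, d) = \mathrm{tf}(g, d) \cdot \mathrm{idf}(g), $$

which matches the Math.Log(totalDocs / invDocFreqs[iinfo][i]) computation in the loop above.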
Example #28
        /// <summary>
        /// Minimize a function using the MeanRelativeImprovement termination criterion with the supplied tolerance level
        /// </summary>
        /// <param name="function">The function to minimize</param>
        /// <param name="initial">The initial point</param>
        /// <param name="tolerance">Convergence tolerance (smaller means more iterations, closer to exact optimum)</param>
        /// <param name="result">The point at the optimum</param>
        /// <param name="optimum">The optimum function value</param>
        /// <exception cref="PrematureConvergenceException">Thrown if successive points are within numeric precision of each other, but termination condition is still unsatisfied.</exception>
        public void Minimize(DifferentiableFunction function, ref VBuffer <Float> initial, Float tolerance, ref VBuffer <Float> result, out Float optimum)
        {
            ITerminationCriterion term = new MeanRelativeImprovementCriterion(tolerance);

            Minimize(function, ref initial, term, ref result, out optimum);
        }
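A minimal calling sketch: the optimizer instance opt, the function f, and dim are hypothetical placeholders, and the VBuffer construction mirrors other examples on this page.

            // Start the search at the origin (hypothetical setup, not from the source).
            var initial = new VBuffer <Float>(dim, new Float[dim]);
            var result  = default(VBuffer <Float>);
            Float optimum;
            opt.Minimize(f, ref initial, 1e-4f, ref result, out optimum);
            Console.WriteLine($"Minimum found: {optimum}");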
Example #29
        protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector);
            Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);

            disposer = null;

            var getSrc = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, Infos[iinfo].Source);
            var src    = default(VBuffer <uint>);
            var bldr   = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
                                                _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo));
            var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;

            if (keyCount == 0)
            {
                keyCount = uint.MaxValue;
            }

            ValueGetter <VBuffer <Float> > del;

            switch (_exes[iinfo].Weighting)
            {
            case WeightingCriteria.TfIdf:
                Host.AssertValue(_invDocFreqs[iinfo]);
                del =
                    (ref VBuffer <Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(ref src, 0, keyCount);
                        bldr.GetResult(ref dst);
                        VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i]));
                    }
                    else
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                    }
                };
                break;

            case WeightingCriteria.Idf:
                Host.AssertValue(_invDocFreqs[iinfo]);
                del =
                    (ref VBuffer <Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(ref src, 0, keyCount);
                        bldr.GetResult(ref dst);
                        VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0);
                    }
                    else
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                    }
                };
                break;

            case WeightingCriteria.Tf:
                del =
                    (ref VBuffer <Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(ref src, 0, keyCount);
                        bldr.GetResult(ref dst);
                    }
                    else
                    {
                        dst = new VBuffer <Float>(0, dst.Values, dst.Indices);
                    }
                };
                break;

            default:
                throw Host.Except("Unsupported weighting criteria");
            }

            return(del);
        }
Example #30
        public VBuffer[] GetVBuffers()
        {
            // Note: this VBuffer is a graphics vertex-buffer record (D3D11/GL frame-log
            // viewer), not the ML.NET VBuffer<T> used in the other examples on this page.
            if (LogLoaded)
            {
                if (IsLogD3D11)
                {
                    VBuffer[] ret = new VBuffer[m_D3D11.m_IA.vbuffers.Length];
                    for (int i = 0; i < m_D3D11.m_IA.vbuffers.Length; i++)
                    {
                        ret[i].Buffer = m_D3D11.m_IA.vbuffers[i].Buffer;
                        ret[i].ByteOffset = m_D3D11.m_IA.vbuffers[i].Offset;
                        ret[i].ByteStride = m_D3D11.m_IA.vbuffers[i].Stride;
                    }

                    return ret;
                }
                else if (IsLogGL)
                {
                    VBuffer[] ret = new VBuffer[m_GL.m_VtxIn.vbuffers.Length];
                    for (int i = 0; i < m_GL.m_VtxIn.vbuffers.Length; i++)
                    {
                        ret[i].Buffer = m_GL.m_VtxIn.vbuffers[i].Buffer;
                        ret[i].ByteOffset = m_GL.m_VtxIn.vbuffers[i].Offset;
                        ret[i].ByteStride = m_GL.m_VtxIn.vbuffers[i].Stride;
                    }

                    return ret;
                }
            }

            return null;
        }
Example #31
        private static Delegate GetDefaultVectorGetter <TValue>()
        {
            ValueGetter <VBuffer <TValue> > getter = (ref VBuffer <TValue> value) => value = new VBuffer <TValue>(AllVectorSizes, 0, null, null);

            return(getter);
        }