            /// <summary>
            /// Initializes the weights and bias used by an online linear training run.
            /// </summary>
            /// <remarks>
            /// The starting point is chosen in priority order: an existing <paramref name="predictor"/>,
            /// the comma-separated <c>InitialWeights</c> option, random values scaled by
            /// <c>InitialWeightsDiameter</c>, and otherwise a buffer created by
            /// <c>VBufferUtils.CreateDense</c> (at most 1000 features) or <c>VBufferUtils.CreateEmpty</c>.
            /// </remarks>
            /// <param name="ch">Channel used for checks, tracing and messages; must not be null.</param>
            /// <param name="numFeatures">Number of features; must be positive.</param>
            /// <param name="predictor">Existing model to start from, or null.</param>
            /// <param name="parent">Trainer that supplies the host, name and options.</param>
            protected TrainStateBase(IChannel ch, int numFeatures, LinearModelParameters predictor, OnlineLinearTrainer<TTransformer, TModel> parent)
            {
                Contracts.CheckValue(ch, nameof(ch));
                ch.Check(numFeatures > 0, "Cannot train with zero features!");
                ch.Assert(Iteration == 0);
                ch.Assert(Bias == 0);

                ParentHost = parent.Host;

                ch.Trace("{0} Initializing {1} on {2} features", DateTime.UtcNow, parent.Name, numFeatures);

                // We want a dense vector, to prevent memory creation during training
                // unless we have a lot of features.
                if (predictor != null)
                {
                    // Warm start: copy the existing model's weights and bias.
                    ((IHaveFeatureWeights)predictor).GetFeatureWeights(ref Weights);
                    VBufferUtils.Densify(ref Weights);
                    Bias = predictor.Bias;
                }
                else if (!string.IsNullOrWhiteSpace(parent.OnlineLinearTrainerOptions.InitialWeights))
                {
                    // Explicit start: numFeatures weights followed by the intercept, comma-separated.
                    ch.Info("Initializing weights and bias to " + parent.OnlineLinearTrainerOptions.InitialWeights);
                    string[] weightStr = parent.OnlineLinearTrainerOptions.InitialWeights.Split(',');
                    if (weightStr.Length != numFeatures + 1)
                    {
                        throw ch.Except(
                            "Could not initialize weights from 'initialWeights': expecting {0} values to initialize {1} weights and the intercept",
                            numFeatures + 1, numFeatures);
                    }

                    var weightValues = new float[numFeatures];
                    for (int i = 0; i < numFeatures; i++)
                    {
                        weightValues[i] = float.Parse(weightStr[i], CultureInfo.InvariantCulture);
                    }

                    Weights = new VBuffer<float>(numFeatures, weightValues);
                    Bias = float.Parse(weightStr[numFeatures], CultureInfo.InvariantCulture);
                }
                else if (parent.OnlineLinearTrainerOptions.InitialWeightsDiameter > 0)
                {
                    // Random start: each value is diameter * (NextSingle() - 0.5); weights are drawn before the bias.
                    var weightValues = new float[numFeatures];
                    for (int i = 0; i < numFeatures; i++)
                    {
                        weightValues[i] = parent.OnlineLinearTrainerOptions.InitialWeightsDiameter * (parent.Host.Rand.NextSingle() - (float)0.5);
                    }

                    Weights = new VBuffer<float>(numFeatures, weightValues);
                    Bias = parent.OnlineLinearTrainerOptions.InitialWeightsDiameter * (parent.Host.Rand.NextSingle() - (float)0.5);
                }
                else if (numFeatures <= 1000)
                {
                    Weights = VBufferUtils.CreateDense<float>(numFeatures);
                }
                else
                {
                    // Only reached when none of the initializers above applied.
                    Weights = VBufferUtils.CreateEmpty<float>(numFeatures);
                }

                WeightsScale = 1;
            }
        /// <summary>
        /// Builds a <see cref="FeatureNameCollection"/> from the slot names of the feature column
        /// of <paramref name="schema"/>.
        /// </summary>
        /// <param name="schema">Role-mapped schema; must have a feature column with a positive value count.</param>
        /// <returns>
        /// A <c>Dense</c> collection when the slot-name buffer is dense, otherwise a <c>Sparse</c> one.
        /// Empty slot names are stored as null.
        /// </returns>
        public static FeatureNameCollection Create(RoleMappedSchema schema)
        {
            // REVIEW: This shim should be deleted as soon as is convenient.
            Contracts.CheckValue(schema, nameof(schema));
            Contracts.CheckParam(schema.Feature != null, nameof(schema), "Cannot create feature name collection if we have no features");
            Contracts.CheckParam(schema.Feature.Type.ValueCount > 0, nameof(schema), "Cannot create feature name collection if our features are not of known size");

            VBuffer<ReadOnlyMemory<char>> slotNames = default;
            int len = schema.Feature.Type.ValueCount;

            if (schema.Schema.HasSlotNames(schema.Feature.Index, len))
            {
                schema.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, schema.Feature.Index, ref slotNames);
            }
            else
            {
                // No slot names available: fall back to an empty buffer of the right length.
                slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(len);
            }

            var slotNameValues = slotNames.GetValues();

            string[] names = new string[slotNameValues.Length];
            for (int i = 0; i < slotNameValues.Length; ++i)
            {
                names[i] = slotNameValues[i].IsEmpty ? null : slotNameValues[i].ToString();
            }

            if (slotNames.IsDense)
            {
                return new Dense(names.Length, names);
            }

            ReadOnlySpan<int> indices = slotNames.GetIndices();
            return new Sparse(slotNames.Length, slotNameValues.Length, indices.ToArray(), names);
        }
        /// <summary>
        /// Maps input feature names to their input INI content based on the metadata of the
        /// features column. If the <c>IniContent</c> slotwise string metadata is present, that
        /// is used, or else default content is derived from the slot names.
        /// </summary>
        /// <seealso cref="MetadataUtils.Kinds.SlotNames"/>
        /// <param name="schema">Role-mapped schema whose feature column metadata is read.</param>
        public FeaturesToContentMap(RoleMappedSchema schema)
            var feat = schema.Feature;

            // The feature column must have a positive value count.
            Contracts.Assert(feat.Type.ValueCount > 0);

            var sch = schema.Schema;

            // Slot names: read from metadata when present.
            // NOTE(review): block braces and what looks like a bare `else` before the CreateEmpty
            // fallback are absent from this excerpt; as written the fallback is unconditional — confirm.
            if (sch.HasSlotNames(feat.Index, feat.Type.ValueCount))
                sch.GetMetadata(MetadataUtils.Kinds.SlotNames, feat.Index, ref _names);
                _names = VBufferUtils.CreateEmpty <DvText>(feat.Type.ValueCount);
            // INI content: used only when the metadata is a text vector whose size matches the feature count.
            var type = sch.GetMetadataTypeOrNull(BingBinLoader.IniContentMetadataKind, feat.Index);
            if (type != null && type.IsVector && type.VectorSize == feat.Type.ValueCount && type.ItemType.IsText)
                sch.GetMetadata(BingBinLoader.IniContentMetadataKind, feat.Index, ref _content);
                _content = VBufferUtils.CreateEmpty <DvText>(feat.Type.ValueCount);
            // NOTE(review): this second, outer-level assignment repeats the one above and would always
            // reset _content; presumably it belonged to an alternative (e.g. conditionally compiled)
            // branch that is not visible here — confirm against the original file.
            _content = VBufferUtils.CreateEmpty <DvText>(feat.Type.ValueCount);
            Contracts.Assert(_names.Length == _content.Length);
        /// <summary>
        /// Initializes the trainer state (feature count, weights, bias, weight scale) before training.
        /// </summary>
        /// <remarks>
        /// The starting point is chosen in priority order: an existing <paramref name="predictor"/>,
        /// the comma-separated <c>Args.InitialWeights</c>, random values scaled by
        /// <c>Args.InitWtsDiameter</c>, and otherwise a buffer created by
        /// <c>VBufferUtils.CreateDense</c> (at most 1000 features) or <c>VBufferUtils.CreateEmpty</c>.
        /// </remarks>
        /// <param name="ch">Channel used for tracing and informational messages.</param>
        /// <param name="numFeatures">Number of features; must be positive.</param>
        /// <param name="predictor">Existing model to start from, or null.</param>
        protected virtual void InitCore(IChannel ch, int numFeatures, LinearPredictor predictor)
        {
            Contracts.Check(numFeatures > 0, "Can't train with zero features!");
            Contracts.Check(NumFeatures == 0, "Can't re-use trainer!");
            Contracts.Assert(Iteration == 0);
            Contracts.Assert(Bias == 0);

            ch.Trace("{0} Initializing {1} on {2} features", DateTime.UtcNow, Name, numFeatures);
            NumFeatures = numFeatures;

            // We want a dense vector, to prevent memory creation during training
            // unless we have a lot of features.
            // REVIEW: make a setting
            if (predictor != null)
            {
                // Warm start: copy the existing model's weights and bias.
                predictor.GetFeatureWeights(ref Weights);
                VBufferUtils.Densify(ref Weights);
                Bias = predictor.Bias;
            }
            else if (!string.IsNullOrWhiteSpace(Args.InitialWeights))
            {
                // Explicit start: NumFeatures weights followed by the intercept, comma-separated.
                ch.Info("Initializing weights and bias to " + Args.InitialWeights);
                string[] weightStr = Args.InitialWeights.Split(',');
                if (weightStr.Length != NumFeatures + 1)
                {
                    throw Contracts.Except(
                        "Could not initialize weights from 'initialWeights': expecting {0} values to initialize {1} weights and the intercept",
                        NumFeatures + 1, NumFeatures);
                }

                Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
                for (int i = 0; i < NumFeatures; i++)
                {
                    Weights.Values[i] = Float.Parse(weightStr[i], CultureInfo.InvariantCulture);
                }

                Bias = Float.Parse(weightStr[NumFeatures], CultureInfo.InvariantCulture);
            }
            else if (Args.InitWtsDiameter > 0)
            {
                // Random start: each value is diameter * (NextSingle() - 0.5); weights are drawn before the bias.
                Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
                for (int i = 0; i < NumFeatures; i++)
                {
                    Weights.Values[i] = Args.InitWtsDiameter * (Host.Rand.NextSingle() - (Float)0.5);
                }

                Bias = Args.InitWtsDiameter * (Host.Rand.NextSingle() - (Float)0.5);
            }
            else if (NumFeatures <= 1000)
            {
                Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
            }
            else
            {
                // Only reached when none of the initializers above applied.
                Weights = VBufferUtils.CreateEmpty<Float>(NumFeatures);
            }

            WeightsScale = 1;
        }
        /// <summary>
        /// Wraps the single trained weight vector and bias in a <see cref="LinearRegressionPredictor"/>.
        /// </summary>
        /// <returns>
        /// A predictor built from the copy of <c>Weights[0]</c> produced by
        /// <c>VBufferUtils.CreateMaybeSparseCopy</c>, together with <c>Bias[0]</c>.
        /// </returns>
        public override IPredictor CreatePredictor()
        {
            // Exactly one weight vector and one bias are expected here.
            Contracts.Assert(WeightArraySize == 1);
            Contracts.Assert(Utils.Size(Weights) == 1);
            Contracts.Assert(Utils.Size(Bias) == 1);
            Host.Check(Weights[0].Length > 0);

            VBuffer<Float> maybeSparseWeights = VBufferUtils.CreateEmpty<Float>(Weights[0].Length);
            VBufferUtils.CreateMaybeSparseCopy(ref Weights[0], ref maybeSparseWeights, Conversions.Instance.GetIsDefaultPredicate<Float>(NumberType.Float));

            return new LinearRegressionPredictor(Host, ref maybeSparseWeights, Bias[0]);
        }
        /// <summary>
        /// Runs the base initialization, then applies this trainer's bias and weight-buffer setup.
        /// </summary>
        /// <param name="ch">Channel forwarded to the base initialization.</param>
        /// <param name="numFeatures">Number of features; also sizes the weight-update buffer.</param>
        /// <param name="predictor">Existing model to start from, or null.</param>
        protected override void InitCore(IChannel ch, int numFeatures, LinearPredictor predictor)
        {
            base.InitCore(ch, numFeatures, predictor);

            // With NoBias set, discard whatever bias the base initialization produced.
            if (Args.NoBias)
            {
                Bias = 0;
            }

            // Without an initial predictor, make sure the weights are dense.
            if (predictor == null)
            {
                VBufferUtils.Densify(ref Weights);
            }

            _weightsUpdate = VBufferUtils.CreateEmpty<Float>(numFeatures);
        }
            /// <summary>
            /// Binds the parent's generic mapper to <paramref name="schema"/> and builds the output schema
            /// for the feature-contributions column.
            /// </summary>
            /// <param name="env">Host environment stored on the mapper and used for binding.</param>
            /// <param name="parent">Bindable mapper supplying the generic mapper and the Stringify flag.</param>
            /// <param name="schema">Role-mapped input schema.</param>
            public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
                _env = env;
                _parent = parent;
                InputRoleMappedSchema = schema;
                var genericMapper = parent.GenericMapper.Bind(_env, schema);

                // NOTE(review): `as` yields null if the bound mapper is not an ISchemaBoundRowMapper;
                // no null check is visible before _genericRowMapper.OutputSchema is read below.
                _genericRowMapper = genericMapper as ISchemaBoundRowMapper;
                var featureSize = FeatureColumn.Type.GetVectorSize();

                // NOTE(review): block braces and bare `else` lines are absent from this excerpt. Presumably
                // the text-column schema and _slotNames setup belong to the Stringify case and the
                // metadataBuilder/schemaBuilder statements to the non-Stringify case — confirm.
                if (parent.Stringify)
                    var builder = new DataViewSchema.Builder();
                    builder.AddColumn(DefaultColumnNames.FeatureContributions, TextDataViewType.Instance, null);
                    _outputSchema = builder.ToSchema();
                    // Slot names come from the feature column annotations when present.
                    if (FeatureColumn.HasSlotNames(featureSize))
                        FeatureColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref _slotNames);
                        _slotNames = VBufferUtils.CreateEmpty <ReadOnlyMemory <char> >(featureSize);
                    var metadataBuilder = new DataViewSchema.Annotations.Builder();
                    // Forward the input slot names to the output column when the feature column has them.
                    if (InputSchema[FeatureColumn.Index].HasSlotNames(featureSize))
                        metadataBuilder.AddSlotNames(featureSize, (ref VBuffer <ReadOnlyMemory <char> > value) =>
                                                     FeatureColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref value));

                    var schemaBuilder           = new DataViewSchema.Builder();
                    var featureContributionType = new VectorType(NumberDataViewType.Single, FeatureColumn.Type as VectorType);
                    schemaBuilder.AddColumn(DefaultColumnNames.FeatureContributions, featureContributionType, metadataBuilder.ToAnnotations());
                    _outputSchema = schemaBuilder.ToSchema();

                // The final output schema combines the generic mapper's schema with the contributions column.
                _outputGenericSchema = _genericRowMapper.OutputSchema;
                OutputSchema         = new ZipBinding(new DataViewSchema[] { _outputGenericSchema, _outputSchema, }).OutputSchema;
        /// <summary>
        /// Deserializes a PCA predictor from <paramref name="ctx"/>.
        /// </summary>
        /// <param name="env">Host environment passed to the base constructor.</param>
        /// <param name="ctx">Model load context whose reader supplies the binary representation.</param>
        private PcaPredictor(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, RegistrationName, ctx)
        {
            // *** Binary format ***
            // int: dimension (aka. number of features)
            // int: rank
            // bool: center
            // If (center)
            //  Float[]: mean vector
            // Float[][]: eigenvectors
            _dimension = ctx.Reader.ReadInt32();

            _rank = ctx.Reader.ReadInt32();

            bool center = ctx.Reader.ReadBoolByte();

            if (center)
            {
                var meanArray = ctx.Reader.ReadFloatArray(_dimension);
                _mean = new VBuffer<Float>(_dimension, meanArray);
                _norm2Mean = VectorUtils.NormSquared(_mean);
            }
            else
            {
                // No mean was serialized: use an empty buffer and a zero squared norm.
                _mean = VBufferUtils.CreateEmpty<Float>(_dimension);
                _norm2Mean = 0;
            }

            _eigenVectors = new VBuffer<Float>[_rank];
            _meanProjected = new Float[_rank];
            for (int i = 0; i < _rank; ++i)
            {
                // Each eigenvector is read in order; the mean's projection onto it is cached alongside.
                var vi = ctx.Reader.ReadFloatArray(_dimension);
                _eigenVectors[i] = new VBuffer<Float>(_dimension, vi);
                _meanProjected[i] = VectorUtils.DotProduct(ref _eigenVectors[i], ref _mean);
            }

            WarnOnOldNormalizer(ctx, GetType(), Host);

            _inputType = new VectorType(NumberType.Float, _dimension);
        }
                /// <summary>
                /// Allocates the per-cluster counters and, when requested, the buffers used for DBI.
                /// </summary>
                /// <param name="numClusters">Number of clusters; sizes the per-cluster arrays.</param>
                /// <param name="calculateDbi">Whether to allocate centroid and distance buffers.</param>
                /// <param name="features">Feature column; its vector size sizes each centroid buffer. Only read when <paramref name="calculateDbi"/> is true.</param>
                public Counters(int numClusters, bool calculateDbi, ColumnInfo features)
                {
                    _numClusters = numClusters;
                    CalculateDbi = calculateDbi;

                    _numInstancesOfClstr = new Double[_numClusters];
                    _numInstancesOfClass = new List<Double>();
                    _confusionMatrix = new List<Double[]>();

                    if (CalculateDbi)
                    {
                        _clusterCentroids = new VBuffer<Single>[_numClusters];
                        for (int i = 0; i < _numClusters; i++)
                        {
                            _clusterCentroids[i] = VBufferUtils.CreateEmpty<Single>(features.Type.VectorSize);
                        }

                        _distancesToCentroids = new Double[_numClusters];
                    }
                }
 /// <summary>
 /// Creates the aggregator's score scratch arrays, its counters and, when requested, the centroid buffers.
 /// </summary>
 /// <param name="env">Host environment passed to the base constructor.</param>
 /// <param name="features">Feature column; its vector size sizes each centroid buffer when DBI is calculated.</param>
 /// <param name="scoreVectorSize">Length of the score vector; also used as the number of clusters.</param>
 /// <param name="calculateDbi">Whether centroid buffers are allocated.</param>
 /// <param name="weighted">Whether a second, weighted set of counters is created.</param>
 /// <param name="stratName">Stratification name passed to the base constructor.</param>
 public Aggregator(IHostEnvironment env, ColumnInfo features, int scoreVectorSize, bool calculateDbi, bool weighted, string stratName)
     : base(env, stratName)
 {
     _calculateDbi = calculateDbi;
     _scoresArr = new float[scoreVectorSize];
     _indicesArr = new int[scoreVectorSize];

     UnweightedCounters = new Counters(scoreVectorSize, _calculateDbi, features);
     Weighted = weighted;
     // Weighted counters exist only when weighting is requested; otherwise the property stays null.
     WeightedCounters = Weighted ? new Counters(scoreVectorSize, _calculateDbi, features) : null;

     if (_calculateDbi)
     {
         _clusterCentroids = new VBuffer<Single>[scoreVectorSize];
         for (int i = 0; i < scoreVectorSize; i++)
         {
             _clusterCentroids[i] = VBufferUtils.CreateEmpty<Single>(features.Type.VectorSize);
         }
     }
 }
            /// <summary>
            /// Creates the linear SVM training state on top of the base online-linear state.
            /// </summary>
            /// <param name="ch">Channel forwarded to the base constructor.</param>
            /// <param name="numFeatures">Number of features; also sizes the weight-update buffer.</param>
            /// <param name="predictor">Existing model to start from, or null.</param>
            /// <param name="parent">Trainer whose <c>Args</c> supply batch size, bias, projection and lambda settings.</param>
            public TrainState(IChannel ch, int numFeatures, LinearPredictor predictor, LinearSvm parent)
                : base(ch, numFeatures, predictor, parent)
            {
                _batchSize = parent.Args.BatchSize;
                _noBias = parent.Args.NoBias;
                _performProjection = parent.Args.PerformProjection;
                _lambda = parent.Args.Lambda;

                // With NoBias set, discard whatever bias the base constructor produced.
                if (_noBias)
                {
                    Bias = 0;
                }

                // Without an initial predictor, make sure the weights are dense.
                if (predictor == null)
                {
                    VBufferUtils.Densify(ref Weights);
                }

                _weightsUpdate = VBufferUtils.CreateEmpty<Float>(numFeatures);
            }
        /// <summary>
        /// Ad-hoc driver: runs <c>SgdOptimizer</c> and then <c>GDOptimizer</c> on the 2-D quadratic
        /// test functions, starting both from the point created by <c>CreateWrapped(out init, 0, 0)</c>.
        /// </summary>
        /// <param name="argv">Command-line arguments; not used.</param>
        public static void Main(string[] argv)
        {
            VBuffer<Float> grad = VBufferUtils.CreateEmpty<Float>(2);
            int n = 0;
            bool print = false;

            // Termination callback shared by both optimizers: evaluates the gradient at x, prints its
            // norm every 1000th call (or on every call once 'print' is set), and reports completion
            // when the norm drops below 1e-5.
            DTerminate term =
                (ref VBuffer<Float> x) =>
                {
                    QuadTest2D(ref x, ref grad);
                    Float norm = VectorUtils.Norm(grad);
                    if (++n % 1000 == 0 || print)
                    {
                        Console.WriteLine("{0}\t{1}", n, norm);
                    }

                    return norm < 1e-5;
                };

            SgdOptimizer sgdo = new SgdOptimizer(term, SgdOptimizer.RateScheduleType.Constant, false, 100, 1, (Float)0.99);
            VBuffer<Float> init;

            CreateWrapped(out init, 0, 0);
            VBuffer<Float> ans = default(VBuffer<Float>);

            sgdo.Minimize(StochasticQuadTest2D, ref init, ref ans);
            QuadTest2D(ref ans, ref grad);

            // Reset the call counter and switch to printing on every call for the second run.
            n = 0;
            GDOptimizer gdo = new GDOptimizer(term, null, true);

            print = true;
            CreateWrapped(out init, 0, 0);
            gdo.Minimize(QuadTest2D, ref init, ref ans);
            QuadTest2D(ref ans, ref grad);
        }
        /// <summary>
        /// Builds a <see cref="FeatureNameCollection"/> from the slot names of the feature column
        /// of <paramref name="schema"/>.
        /// </summary>
        /// <param name="schema">Role-mapped schema; must have a feature column with a positive value count.</param>
        /// <returns>
        /// A <c>Dense</c> collection when the slot-name buffer is dense, otherwise a <c>Sparse</c> one.
        /// Slot names without characters are stored as null.
        /// </returns>
        public static FeatureNameCollection Create(RoleMappedSchema schema)
        {
            // REVIEW: This shim should be deleted as soon as is convenient.
            Contracts.CheckValue(schema, nameof(schema));
            Contracts.CheckParam(schema.Feature != null, nameof(schema), "Cannot create feature name collection if we have no features");
            Contracts.CheckParam(schema.Feature.Type.ValueCount > 0, nameof(schema), "Cannot create feature name collection if our features are not of known size");

            VBuffer<DvText> slotNames = default(VBuffer<DvText>);
            int len = schema.Feature.Type.ValueCount;

            if (schema.Schema.HasSlotNames(schema.Feature.Index, len))
            {
                schema.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, schema.Feature.Index, ref slotNames);
            }
            else
            {
                // No slot names available: fall back to an empty buffer of the right length.
                slotNames = VBufferUtils.CreateEmpty<DvText>(len);
            }

            string[] names = new string[slotNames.Count];
            for (int i = 0; i < slotNames.Count; ++i)
            {
                names[i] = slotNames.Values[i].HasChars ? slotNames.Values[i].ToString() : null;
            }

            if (slotNames.IsDense)
            {
                return new Dense(names.Length, names);
            }

            // The index array may be missing or longer than Count; hand Sparse an array of exactly Count entries.
            int[] indices = slotNames.Indices;
            if (indices == null)
            {
                indices = new int[0];
            }
            else if (indices.Length != slotNames.Count)
            {
                // Array.Resize replaces the local reference only; the buffer's own array is untouched.
                Array.Resize(ref indices, slotNames.Count);
            }

            return new Sparse(slotNames.Length, slotNames.Count, indices, names);
        }
        /// <summary>
        /// Creates the optimizer, its termination criterion and the initial weight vector.
        /// </summary>
        /// <remarks>
        /// Initial weights are chosen in priority order: the source predictor, random values scaled by
        /// <c>InitWtsDiameter</c>, SGD-based initialization when <c>SgdInitializationTolerance</c> is
        /// positive, and otherwise an empty buffer of length <c>BiasCount + WeightCount</c>.
        /// </remarks>
        /// <param name="ch">Channel forwarded to the SGD initialization.</param>
        /// <param name="cursorFactory">Cursor factory forwarded to the SGD initialization.</param>
        /// <param name="init">Receives the initial weights.</param>
        /// <param name="terminationCriterion">Receives the termination criterion.</param>
        /// <returns>An <c>L1Optimizer</c> when <c>L1Weight</c> is positive, otherwise a plain <c>Optimizer</c>.</returns>
        protected virtual Optimizer InitializeOptimizer(IChannel ch, FloatLabelCursor.Factory cursorFactory,
                                                        out VBuffer<float> init, out ITerminationCriterion terminationCriterion)
        {
            // MeanRelativeImprovementCriterion:
            //   Stops optimization when the average objective improvement over the last
            //   n iterations, normalized by the function value, is small enough.
            terminationCriterion = new MeanRelativeImprovementCriterion(OptTol, 5, MaxIterations);

            Optimizer opt = (L1Weight > 0)
                ? new L1Optimizer(Host, BiasCount, L1Weight / NumGoodRows, MemorySize, DenseOptimizer, null, EnforceNonNegativity)
                : new Optimizer(Host, MemorySize, DenseOptimizer, null, EnforceNonNegativity);

            opt.Quiet = Quiet;

            if (_srcPredictor != null)
            {
                init = InitializeWeightsFromPredictor(_srcPredictor);
            }
            else if (InitWtsDiameter > 0)
            {
                // Random start: each entry (bias included) is InitWtsDiameter * (NextSingle() - 0.5).
                float[] initWeights = new float[BiasCount + WeightCount];
                for (int j = 0; j < initWeights.Length; j++)
                {
                    initWeights[j] = InitWtsDiameter * (Host.Rand.NextSingle() - 0.5f);
                }

                init = new VBuffer<float>(initWeights.Length, initWeights);
            }
            else if (SgdInitializationTolerance > 0)
            {
                init = InitializeWeightsSgd(ch, cursorFactory);
            }
            else
            {
                // Only reached when none of the initializers above applied.
                init = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
            }

            return opt;
        }

            /// <summary>
            /// Binds the parent's generic mapper to <paramref name="schema"/> and builds the output schema
            /// for the feature-contributions column.
            /// </summary>
            /// <param name="env">Host environment stored on the mapper and used for binding.</param>
            /// <param name="parent">Bindable mapper supplying the generic mapper and the Stringify flag.</param>
            /// <param name="schema">Role-mapped input schema.</param>
            public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
                _env = env;
                _parent = parent;
                InputRoleMappedSchema = schema;
                var genericMapper = parent.GenericMapper.Bind(_env, schema);

                // NOTE(review): `as` yields null if the bound mapper is not an ISchemaBoundRowMapper;
                // no null check is visible before _genericRowMapper.OutputSchema is read below.
                _genericRowMapper = genericMapper as ISchemaBoundRowMapper;

                // NOTE(review): block braces and bare `else` lines are absent from this excerpt. Presumably
                // the text-column schema and _slotNames setup belong to the Stringify case and the
                // FeatureContributionSchema assignment to the non-Stringify case — confirm.
                if (parent.Stringify)
                    var builder = new SchemaBuilder();
                    builder.AddColumn(DefaultColumnNames.FeatureContributions, TextType.Instance, null);
                    _outputSchema = builder.GetSchema();
                    // Slot names come from the input schema metadata when present.
                    if (InputSchema.HasSlotNames(InputRoleMappedSchema.Feature.Index, InputRoleMappedSchema.Feature.Type.VectorSize))
                        InputSchema.GetMetadata(MetadataUtils.Kinds.SlotNames, InputRoleMappedSchema.Feature.Index,
                                                ref _slotNames);
                        _slotNames = VBufferUtils.CreateEmpty <ReadOnlyMemory <char> >(InputRoleMappedSchema.Feature.Type.VectorSize);
                    _outputSchema = Schema.Create(new FeatureContributionSchema(_env, DefaultColumnNames.FeatureContributions,
                                                                                new VectorType(NumberType.R4, schema.Feature.Type as VectorType),
                                                                                InputSchema, InputRoleMappedSchema.Feature.Index));

                // The final output schema combines the generic mapper's schema with the contributions column.
                _outputGenericSchema = _genericRowMapper.OutputSchema;
                OutputSchema         = new CompositeSchema(new Schema[] { _outputGenericSchema, _outputSchema, }).AsSchema;
        /// <summary>
        /// Makes one preparatory pass over <paramref name="data"/> (caching rows in memory when more than
        /// one thread is used), runs the optimizer, and reports L1 selection and training statistics.
        /// </summary>
        /// <param name="ch">Channel used for asserts, checks, warnings and informational messages.</param>
        /// <param name="data">Training data with features, label and optional weight.</param>
        // NOTE(review): block braces, bare `else` lines and a `try` are absent from this excerpt, so the
        // nesting below is conveyed by indentation only — confirm against the original file.
        protected virtual void TrainCore(IChannel ch, RoleMappedData data)

            // Compute the number of threads to use. The ctor should have verified that this will
            // produce a positive value.
            int numThreads = !UseThreads ? 1 : (NumThreads ?? Environment.ProcessorCount);

            // Never exceed the environment's concurrency factor when one is set.
            if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor)
                numThreads = Host.ConcurrencyFactor;
                ch.Warning("The number of threads specified in trainer arguments is larger than the concurrency factor "
                           + "setting of the environment. Using {0} training threads instead.", numThreads);

            ch.Assert(numThreads > 0);

            NumGoodRows = 0;
            WeightSum   = 0;

            // The in-memory caches stay null in the single-threaded (streaming) case.
            _features = null;
            _labels   = null;
            _weights  = null;
            if (numThreads > 1)
                ch.Info("LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory " +
                        "issues, add 'numThreads=1' to the trainer arguments and 'cache=-' to the command line " +
                        "arguments to turn off multi-threading.");
                // Initial capacity of 1000 rows; the arrays are grown with Utils.EnsureSize during the pass.
                _features = new VBuffer <float> [1000];
                _labels   = new float[1000];
                if (data.Schema.Weight != null)
                    _weights = new float[1000];

            var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Features | CursOpt.Label | CursOpt.Weight);

            long numBad;

            // REVIEW: This pass seems overly expensive for the benefit when multi-threading is off....
            using (var cursor = cursorFactory.Create())
                using (var pch = Host.StartProgressChannel("LBFGS data prep"))
                    // REVIEW: maybe it makes sense for the factory to capture the good row count after
                    // the first successful cursoring?
                    Double totalCount = data.Data.GetRowCount(true) ?? Double.NaN;

                    // NOTE(review): exCount is never incremented in the visible code, so the progress
                    // callback would always report 0 — confirm whether an increment was lost.
                    long exCount = 0;
                    pch.SetHeader(new ProgressHeader(null, new[] { "examples" }),
                                  e => e.SetProgress(0, exCount, totalCount));
                    while (cursor.MoveNext())
                        WeightSum += cursor.Weight;
                        if (ShowTrainingStats)
                            ProcessPriorDistribution(cursor.Label, cursor.Weight);

                        PreTrainingProcessInstance(cursor.Label, ref cursor.Features, cursor.Weight);
                        if (_features != null)
                            ch.Assert(cursor.KeptRowCount <= int.MaxValue);
                            // KeptRowCount is used as a 1-based position, hence the -1 for the array index.
                            int index = (int)cursor.KeptRowCount - 1;
                            Utils.EnsureSize(ref _features, index + 1);
                            Utils.EnsureSize(ref _labels, index + 1);
                            if (_weights != null)
                                Utils.EnsureSize(ref _weights, index + 1);
                                _weights[index] = cursor.Weight;
                            // Swap rather than copy the cursor's feature buffer into the cache slot.
                            Utils.Swap(ref _features[index], ref cursor.Features);
                            _labels[index] = cursor.Label;

                            // NOTE(review): no statement that stops cursoring is visible after this warning — confirm.
                            if (cursor.KeptRowCount >= int.MaxValue)
                                ch.Warning("Limiting data size for multi-threading");
                    NumGoodRows = cursor.KeptRowCount;
                    numBad      = cursor.SkippedRowCount;
            ch.Check(NumGoodRows > 0, NoTrainingInstancesMessage);
            if (numBad > 0)
                ch.Warning("Skipped {0} instances with missing features/label/weight during training", numBad);

            if (_features != null)
                ch.Assert(numThreads > 1);

                // If there are so many threads that each only gets a small number (less than 10) of instances, trim
                // the number of threads so each gets a more reasonable number (100 or so). These numbers are pretty arbitrary,
                // but avoid the possibility of having no instances on some threads.
                if (numThreads > 1 && NumGoodRows / numThreads < 10)
                    int numNew = Math.Max(1, (int)NumGoodRows / 100);
                    ch.Warning("Too few instances to use {0} threads, decreasing to {1} thread(s)", numThreads, numNew);
                    numThreads = numNew;
                ch.Assert(numThreads > 0);

                // Divide up the instances among the threads.
                // _ranges[k].._ranges[k + 1] delimits chunk k; each chunk takes the ceiling of the remaining
                // instances divided by the remaining chunks.
                _numChunks = numThreads;
                _ranges    = new int[_numChunks + 1];
                int cinstTot = (int)NumGoodRows;
                for (int ichk = 0, iinstMin = 0; ichk < numThreads; ichk++)
                    int cchkLeft = numThreads - ichk;                                // Number of chunks left to fill.
                    ch.Assert(0 < cchkLeft && cchkLeft <= numThreads);
                    int cinstThis = (cinstTot - iinstMin + cchkLeft - 1) / cchkLeft; // Size of this chunk.
                    ch.Assert(0 < cinstThis && cinstThis <= cinstTot - iinstMin);
                    iinstMin         += cinstThis;
                    _ranges[ichk + 1] = iinstMin;

                // One loss slot per thread, but only numThreads - 1 local gradient buffers.
                _localLosses    = new float[numThreads];
                _localGradients = new VBuffer <float> [numThreads - 1];
                int size = BiasCount + WeightCount;
                for (int i = 0; i < _localGradients.Length; i++)
                    _localGradients[i] = VBufferUtils.CreateEmpty <float>(size);

                ch.Assert(_numChunks > 0 && _data == null);
                // NOTE(review): an `else` introducing the streaming case below appears to be missing from this excerpt.
                // Streaming, single-threaded case.
                _data          = data;
                _cursorFactory = cursorFactory;
                ch.Assert(_numChunks == 0 && _data != null);

            VBuffer <float>       initWeights;
            ITerminationCriterion terminationCriterion;
            Optimizer             opt = InitializeOptimizer(ch, cursorFactory, out initWeights, out terminationCriterion);

            opt.Quiet = Quiet;

            float loss;

            // NOTE(review): the `try` that pairs with the catch below is not present in this excerpt.
                opt.Minimize(DifferentiableFunction, ref initWeights, terminationCriterion, ref CurrentWeights, out loss);
            catch (Optimizer.PrematureConvergenceException e)
                if (!Quiet)
                    ch.Warning("Premature convergence occurred. The OptimizationTolerance may be set too small. {0}", e.Message);
                // Fall back to the state carried by the exception.
                CurrentWeights = e.State.X;
                loss           = e.State.Value;

            ch.Assert(CurrentWeights.Length == BiasCount + WeightCount);

            int numParams = BiasCount;

            if ((L1Weight > 0 && !Quiet) || ShowTrainingStats)
                // NOTE(review): the lambda on the next line is cut off in this excerpt; presumably it counts
                // non-zero, non-bias weights into numParams — confirm against the original file.
                VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0)
                if (L1Weight > 0 && !Quiet)
                    ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount);

            if (ShowTrainingStats)
                ComputeTrainingStatistics(ch, cursorFactory, loss, numParams);
        /// <summary>
        /// Trains a PCA predictor: projects the data onto a random matrix, orthonormalizes the result,
        /// projects again, and eigen-decomposes the resulting small matrix.
        /// </summary>
        /// <param name="ch">Channel used for errors, warnings and informational messages.</param>
        /// <param name="data">Training data with features and optional weight.</param>
        /// <param name="dimension">Dimension of the feature vectors; the rank may not exceed it.</param>
        /// <returns>A <c>PcaPredictor</c> built from the rank, the processed matrix <c>b</c> and the mean.</returns>
        // NOTE(review): block braces are absent from this excerpt, so loop and branch bodies are conveyed
        // by indentation only — confirm against the original file.
        private PcaPredictor TrainCore(IChannel ch, RoleMappedData data, int dimension)

            if (_rank > dimension)
                throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
            // The oversampled rank is capped at the dimension.
            int oversampledRank = Math.Min(_rank + _oversampling, dimension);

            //exact: (size of the 2 big matrices + other minor allocations) / (2^30)
            // NOTE(review): the expression below divides by 1e9, not 2^30 as the comment above says.
            Double memoryUsageEstimate = 2.0 * dimension * oversampledRank * sizeof(Float) / 1e9;

            if (memoryUsageEstimate > 2)
                ch.Info("Estimate memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", memoryUsageEstimate);

            var y    = Zeros(oversampledRank, dimension);
            var mean = _center ? VBufferUtils.CreateDense <Float>(dimension) : VBufferUtils.CreateEmpty <Float>(dimension);

            var omega = GaussianMatrix(oversampledRank, dimension, _seed);

            var  cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Weight);
            long numBad;

            // First pass over the data.
            Project(Host, cursorFactory, ref mean, omega, y, out numBad);
            if (numBad > 0)
                ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);

            //Orthonormalize Y in-place using stabilized Gram Schmidt algorithm.
            //Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
            for (var i = 0; i < oversampledRank; ++i)
                // NOTE(review): v is a local copy of y[i]; the in-place update of y presumably relies on the
                // copy sharing y[i]'s underlying storage — confirm.
                var v = y[i];
                VectorUtils.ScaleBy(ref v, 1 / VectorUtils.Norm(y[i]));

                // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
                for (var j = i + 1; j < oversampledRank; ++j) //subtract the projection of y[j] on v.
                    VectorUtils.AddMult(ref v, -VectorUtils.DotProduct(ref v, ref y[j]), ref y[j]);
            var q = y;     // q in QR decomposition.

            var b = omega; // reuse the memory allocated by Omega.

            // Second pass over the data; numBad from this call is not inspected in the visible code.
            Project(Host, cursorFactory, ref mean, q, b, out numBad);

            //Compute B2 = B' * B
            var b2 = new Float[oversampledRank * oversampledRank];

            // B2 is symmetric, so each dot product fills both (i, j) and (j, i).
            for (var i = 0; i < oversampledRank; ++i)
                for (var j = i; j < oversampledRank; ++j)
                    b2[i * oversampledRank + j] = b2[j * oversampledRank + i] = VectorUtils.DotProduct(ref b[i], ref b[j]);

            Float[] smallEigenvalues;// eigenvectors and eigenvalues of the small matrix B2.
            Float[] smallEigenvectors;
            EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors);
            PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);

            return(new PcaPredictor(Host, _rank, b, ref mean));
        /// <summary>
        /// Reports summary statistics of the trained model and records them in <c>_stats</c>.
        /// Logs the number of training examples, the residual deviance, the null deviance and the AIC,
        /// then accumulates a packed lower-triangular Hessian over the training data, inverts it with the
        /// MKL <c>Pptrf</c>/<c>Pptri</c> routines and derives one standard error per parameter from the result.
        /// </summary>
        /// <param name="ch">Channel used for informational messages, warnings and exceptions.</param>
        /// <param name="cursorFactory">Creates the cursor over the training examples and supplies the schema used to look up slot names.</param>
        /// <param name="loss">Loss value the deviance is derived from (<c>2 * loss * WeightSum</c> minus the regularization terms); asserted non-negative.</param>
        /// <param name="numParams">Number of parameters covered by the statistics; asserted to be at least <c>BiasCount</c>. The index arrays built here reserve slot 0 for the bias.</param>
        protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, Float loss, int numParams)
            // Sanity checks on the trainer state and the arguments.
            Contracts.Assert(NumGoodRows > 0);
            Contracts.Assert(WeightSum > 0);
            Contracts.Assert(BiasCount == 1);
            Contracts.Assert(loss >= 0);
            Contracts.Assert(numParams >= BiasCount);

            ch.Info("Model trained with {0} training examples.", NumGoodRows);

            // Compute deviance: start with loss function.
            Float deviance = (Float)(2 * loss * WeightSum);

            if (L2Weight > 0)
                // Need to subtract L2 regularization loss.
                // The bias term is not regularized.
                // The norm is taken over CurrentWeights.Values starting at offset 1, i.e. skipping slot 0.
                var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight;
                deviance -= regLoss;

            if (L1Weight > 0)
                // Need to subtract L1 regularization loss.
                // The bias term is not regularized.
                // Only entries with index >= BiasCount contribute their absolute value.
                Double regLoss = 0;
                VBufferUtils.ForEachDefined(ref CurrentWeights, (ind, value) => { if (ind >= BiasCount)
                                                                                      regLoss += Math.Abs(value);
                deviance -= (Float)regLoss * L1Weight * 2;

            // The reported degrees of freedom are clamped at zero.
            ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0));

            // Compute null deviance, i.e., the deviance of null hypothesis.
            // Cap the prior positive rate at 1e-15.
            Double priorPosRate = _posWeight / WeightSum;

            Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1);
            // A prior within 1e-15 of 0 or 1 yields a null deviance of exactly 0.
            Float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ?
                                 0f : (Float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true));

            ch.Info("Null Deviance:     \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1);

            // Compute AIC.
            ch.Info("AIC:               \t{0}", 2 * numParams + deviance);

            // Show the coefficients statistics table.
            // namesSpans starts as an empty vector of the feature length and is filled from the
            // schema's slot-name metadata only when the feature column has slot names of that length.
            var featureColIdx = cursorFactory.Data.Schema.Feature.Index;
            var schema        = cursorFactory.Data.Data.Schema;
            var featureLength = CurrentWeights.Length - BiasCount;
            var namesSpans    = VBufferUtils.CreateEmpty <DvText>(featureLength);

            if (schema.HasSlotNames(featureColIdx, featureLength))
                schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans);
            Host.Assert(namesSpans.Length == featureLength);

            // Inverse mapping of non-zero weight slots.
            Dictionary <int, int> weightIndicesInvMap = null;

            // Indices of bias and non-zero weight slots.
            int[] weightIndices = null;

            // Whether all weights are non-zero.
            bool denseWeight = numParams == CurrentWeights.Length;

            // Extract non-zero indices of weight.
            if (!denseWeight)
                weightIndices          = new int[numParams];
                weightIndicesInvMap    = new Dictionary <int, int>(numParams);
                // Slot 0 (the bias) is always kept and maps to itself.
                weightIndices[0]       = 0;
                weightIndicesInvMap[0] = 0;
                int j = 1;
                for (int i = 1; i < CurrentWeights.Length; i++)
                    if (CurrentWeights.Values[i] != 0)
                        // weightIndices: compact position -> weight slot; weightIndicesInvMap: weight slot -> compact position.
                        weightIndices[j]       = i;
                        weightIndicesInvMap[i] = j++;

                Contracts.Assert(j == numParams);

            // Compute the standard error of coefficients.
            // Entry count of a packed lower-triangular numParams-by-numParams matrix,
            // evaluated in long so it can be compared against int.MaxValue.
            long hessianDimension = (long)numParams * (numParams + 1) / 2;

            if (hessianDimension > int.MaxValue)
                ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " +
                           "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer" +
                           "to reduce the number of parameters.");
                // Statistics are stored without a standard-error vector in this case.
                // NOTE(review): no early exit is visible after this assignment in this listing - confirm the method stops here.
                _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);

            // Building the variance-covariance matrix for parameters.
            // The layout of this algorithm is a packed row-major lower triangular matrix.
            // E.g., layout of indices for 4-by-4:
            // 0
            // 1 2
            // 3 4 5
            // 6 7 8 9
            var hessian = new Double[hessianDimension];

            // Initialize diagonal elements with L2 regularizers except for the first entry (index 0)
            // Since bias is not regularized.
            if (L2Weight > 0)
                // i is the array index of the diagonal entry at iRow-th row and iRow-th column.
                // iRow is one-based.
                int i = 0;
                for (int iRow = 2; iRow <= numParams; iRow++)
                    i         += iRow;
                    hessian[i] = L2Weight;

                Contracts.Assert(i == hessian.Length - 1);

            // Initialize the remaining entries.
            var bias = CurrentWeights.Values[0];

            using (var cursor = cursorFactory.Create())
                while (cursor.MoveNext())
                    var label  = cursor.Label;
                    var weight = cursor.Weight;
                    var score  = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features);
                    // Compute Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example.
                    // 1 / (2 + 2 * cosh(s)) is p * (1 - p) for p = 1 / (1 + exp(-s)).
                    var variance = weight / (2 + 2 * Math.Cosh(score));

                    // Increment the first entry of hessian.
                    hessian[0] += variance;

                    var values = cursor.Features.Values;
                    if (cursor.Features.IsDense)
                        // Dense features: ioff walks the packed triangle entry by entry, starting after entry 0.
                        int ioff = 1;

                        // Increment remaining entries of hessian.
                        for (int i = 1; i < numParams; i++)
                            ch.Assert(ioff == i * (i + 1) / 2);
                            // Map the compact parameter position to a feature index (weight slot minus the bias slot).
                            int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1;
                            Contracts.Assert(0 <= wi && wi < cursor.Features.Length);
                            var val = values[wi] * variance;
                            // Add the implicit first bias term to X'X
                            hessian[ioff++] += val;
                            // Add the remainder of X'X
                            for (int j = 0; j < i; j++)
                                int wj = weightIndices == null ? j : weightIndices[j + 1] - 1;
                                Contracts.Assert(0 <= wj && wj < cursor.Features.Length);
                                hessian[ioff++] += val * values[wj];
                        ch.Assert(ioff == hessian.Length);
                        // Traversal over the explicitly stored entries (Indices/Values/Count) of the feature vector.
                        var indices = cursor.Features.Indices;
                        for (int ii = 0; ii < cursor.Features.Count; ++ii)
                            int i  = indices[ii];
                            int wi = i + 1;
                            // When only non-zero weights are tracked, translate the weight slot to its compact position.
                            // NOTE(review): the statement guarded by this lookup failure is not visible in this listing - confirm.
                            if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi))

                            Contracts.Assert(0 < wi && wi <= cursor.Features.Length);
                            // Offset of the first entry of row wi in the packed layout.
                            int ioff = wi * (wi + 1) / 2;
                            var val  = values[ii] * variance;
                            // Add the implicit first bias term to X'X
                            hessian[ioff] += val;
                            // Add the remainder of X'X
                            for (int jj = 0; jj <= ii; jj++)
                                int j  = indices[jj];
                                int wj = j + 1;
                                // NOTE(review): as above, the statement guarded by this lookup failure is not visible here - confirm.
                                if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj))

                                Contracts.Assert(0 < wj && wj <= cursor.Features.Length);
                                hessian[ioff + wj] += val * values[jj];

            // Apply Cholesky Decomposition to find the inverse of the Hessian.
            Double[] invHessian = null;
                // First, find the Cholesky decomposition LL' of the Hessian.
                Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
                // Note that hessian is already modified at this point. It is no longer the original Hessian,
                // but instead represents the Cholesky decomposition L.
                // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
                // of the original information matrix.
                Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
                // At this point, hessian should contain the inverse of the original Hessian matrix.
                // Swap hessian with invHessian to avoid confusion in the following context.
                Utils.Swap(ref hessian, ref invHessian);
                Contracts.Assert(hessian == null);
            // A missing native library is reported to the caller as a not-supported exception.
            catch (DllNotFoundException)
                throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing.");

            // Slot 0 (bias): the square root of the first entry of the inverse Hessian is taken directly.
            Float[] stdErrorValues = new Float[numParams];
            stdErrorValues[0] = (Float)Math.Sqrt(invHessian[0]);

            for (int i = 1; i < numParams; i++)
                // Initialize with inverse Hessian.
                // i * (i + 1) / 2 + i is the diagonal entry of row i in the packed layout.
                stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i];

            if (L2Weight > 0)
                // Iterate through all entries of inverse Hessian to make adjustment to variance.
                // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
                // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/
                // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf
                int ioffset = 1;
                for (int iRow = 1; iRow < numParams; iRow++)
                    for (int iCol = 0; iCol <= iRow; iCol++)
                        var entry      = (Single)invHessian[ioffset];
                        var adjustment = -L2Weight * entry * entry;
                        stdErrorValues[iRow] -= adjustment;
                        // Strictly off-diagonal, non-bias entries also adjust the column's parameter.
                        if (0 < iCol && iCol < iRow)
                            stdErrorValues[iCol] -= adjustment;
                        // NOTE(review): no statement advancing ioffset is visible in this listing, yet the assert
                        // below expects it to reach invHessian.Length - confirm.

                Contracts.Assert(ioffset == invHessian.Length);

            // Take square roots of the accumulated values for every non-bias parameter.
            for (int i = 1; i < numParams; i++)
                stdErrorValues[i] = (Float)Math.Sqrt(stdErrorValues[i]);

            // weightIndices is null when every weight is non-zero; otherwise it carries the slot of each value.
            VBuffer <Float> stdErrors = new VBuffer <Float>(CurrentWeights.Length, numParams, stdErrorValues, weightIndices);

            _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, ref stdErrors);
        /// <summary>
        /// Reports summary statistics of the trained model and records them in <c>_stats</c>.
        /// Logs the number of training examples, the residual deviance, the null deviance and the AIC,
        /// accumulates a packed lower-triangular Hessian over the training data, and finally stores a
        /// <c>LinearModelStatistics</c> built from the row count, parameter count and the two deviances.
        /// </summary>
        /// <param name="ch">Channel used for informational messages and warnings.</param>
        /// <param name="cursorFactory">Creates the cursor over the training examples and supplies the schema used to look up slot names.</param>
        /// <param name="loss">Loss value the deviance is derived from (<c>2 * loss * WeightSum</c> minus the regularization terms); asserted non-negative.</param>
        /// <param name="numParams">Number of parameters covered by the statistics; asserted to be at least <c>BiasCount</c>. The index arrays built here reserve slot 0 for the bias.</param>
        protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, float loss, int numParams)
            // Sanity checks on the trainer state and the arguments.
            Contracts.Assert(NumGoodRows > 0);
            Contracts.Assert(WeightSum > 0);
            Contracts.Assert(BiasCount == 1);
            Contracts.Assert(loss >= 0);
            Contracts.Assert(numParams >= BiasCount);

            ch.Info("Model trained with {0} training examples.", NumGoodRows);

            // Compute deviance: start with loss function.
            float deviance = (float)(2 * loss * WeightSum);

            if (L2Weight > 0)
                // Need to subtract L2 regularization loss.
                // The bias term is not regularized.
                // The norm is taken over CurrentWeights.Values starting at offset 1, i.e. skipping slot 0.
                var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight;
                deviance -= regLoss;

            if (L1Weight > 0)
                // Need to subtract L1 regularization loss.
                // The bias term is not regularized.
                // Only entries with index >= BiasCount contribute their absolute value.
                Double regLoss = 0;
                VBufferUtils.ForEachDefined(ref CurrentWeights, (ind, value) => { if (ind >= BiasCount)
                                                                                      regLoss += Math.Abs(value);
                deviance -= (float)regLoss * L1Weight * 2;

            // The reported degrees of freedom are clamped at zero.
            ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0));

            // Compute null deviance, i.e., the deviance of null hypothesis.
            // Cap the prior positive rate at 1e-15.
            Double priorPosRate = _posWeight / WeightSum;

            Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1);
            // A prior within 1e-15 of 0 or 1 yields a null deviance of exactly 0.
            float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ?
                                 0f : (float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true));

            ch.Info("Null Deviance:     \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1);

            // Compute AIC.
            ch.Info("AIC:               \t{0}", 2 * numParams + deviance);

            // Show the coefficients statistics table.
            // namesSpans starts as an empty vector of the feature length and is filled from the
            // schema's slot-name metadata only when the feature column has slot names of that length.
            var featureColIdx = cursorFactory.Data.Schema.Feature.Index;
            var schema        = cursorFactory.Data.Data.Schema;
            var featureLength = CurrentWeights.Length - BiasCount;
            var namesSpans    = VBufferUtils.CreateEmpty <ReadOnlyMemory <char> >(featureLength);

            if (schema.HasSlotNames(featureColIdx, featureLength))
                schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans);
            Host.Assert(namesSpans.Length == featureLength);

            // Inverse mapping of non-zero weight slots.
            Dictionary <int, int> weightIndicesInvMap = null;

            // Indices of bias and non-zero weight slots.
            int[] weightIndices = null;

            // Whether all weights are non-zero.
            bool denseWeight = numParams == CurrentWeights.Length;

            // Extract non-zero indices of weight.
            if (!denseWeight)
                weightIndices          = new int[numParams];
                weightIndicesInvMap    = new Dictionary <int, int>(numParams);
                // Slot 0 (the bias) is always kept and maps to itself.
                weightIndices[0]       = 0;
                weightIndicesInvMap[0] = 0;
                int j = 1;
                for (int i = 1; i < CurrentWeights.Length; i++)
                    if (CurrentWeights.Values[i] != 0)
                        // weightIndices: compact position -> weight slot; weightIndicesInvMap: weight slot -> compact position.
                        weightIndices[j]       = i;
                        weightIndicesInvMap[i] = j++;

                Contracts.Assert(j == numParams);

            // Compute the standard error of coefficients.
            // Entry count of a packed lower-triangular numParams-by-numParams matrix,
            // evaluated in long so it can be compared against int.MaxValue.
            long hessianDimension = (long)numParams * (numParams + 1) / 2;

            if (hessianDimension > int.MaxValue)
                ch.Warning("The number of parameter is too large. Cannot hold the variance-covariance matrix in memory. " +
                           "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer" +
                           "to reduce the number of parameters.");
                // NOTE(review): no early exit is visible after this assignment in this listing - confirm the method stops here.
                _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);

            // Building the variance-covariance matrix for parameters.
            // The layout of this algorithm is a packed row-major lower triangular matrix.
            // E.g., layout of indices for 4-by-4:
            // 0
            // 1 2
            // 3 4 5
            // 6 7 8 9
            var hessian = new Double[hessianDimension];

            // Initialize diagonal elements with L2 regularizers except for the first entry (index 0)
            // Since bias is not regularized.
            if (L2Weight > 0)
                // i is the array index of the diagonal entry at iRow-th row and iRow-th column.
                // iRow is one-based.
                int i = 0;
                for (int iRow = 2; iRow <= numParams; iRow++)
                    i         += iRow;
                    hessian[i] = L2Weight;

                Contracts.Assert(i == hessian.Length - 1);

            // Initialize the remaining entries.
            var bias = CurrentWeights.Values[0];

            using (var cursor = cursorFactory.Create())
                while (cursor.MoveNext())
                    var label  = cursor.Label;
                    var weight = cursor.Weight;
                    var score  = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features);
                    // Compute Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example.
                    // 1 / (2 + 2 * cosh(s)) is p * (1 - p) for p = 1 / (1 + exp(-s)).
                    var variance = weight / (2 + 2 * Math.Cosh(score));

                    // Increment the first entry of hessian.
                    hessian[0] += variance;

                    var values = cursor.Features.Values;
                    if (cursor.Features.IsDense)
                        // Dense features: ioff walks the packed triangle entry by entry, starting after entry 0.
                        int ioff = 1;

                        // Increment remaining entries of hessian.
                        for (int i = 1; i < numParams; i++)
                            ch.Assert(ioff == i * (i + 1) / 2);
                            // Map the compact parameter position to a feature index (weight slot minus the bias slot).
                            int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1;
                            Contracts.Assert(0 <= wi && wi < cursor.Features.Length);
                            var val = values[wi] * variance;
                            // Add the implicit first bias term to X'X
                            hessian[ioff++] += val;
                            // Add the remainder of X'X
                            for (int j = 0; j < i; j++)
                                int wj = weightIndices == null ? j : weightIndices[j + 1] - 1;
                                Contracts.Assert(0 <= wj && wj < cursor.Features.Length);
                                hessian[ioff++] += val * values[wj];
                        ch.Assert(ioff == hessian.Length);
                        // Traversal over the explicitly stored entries (Indices/Values/Count) of the feature vector.
                        var indices = cursor.Features.Indices;
                        for (int ii = 0; ii < cursor.Features.Count; ++ii)
                            int i  = indices[ii];
                            int wi = i + 1;
                            // When only non-zero weights are tracked, translate the weight slot to its compact position.
                            // NOTE(review): the statement guarded by this lookup failure is not visible in this listing - confirm.
                            if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi))

                            Contracts.Assert(0 < wi && wi <= cursor.Features.Length);
                            // Offset of the first entry of row wi in the packed layout.
                            int ioff = wi * (wi + 1) / 2;
                            var val  = values[ii] * variance;
                            // Add the implicit first bias term to X'X
                            hessian[ioff] += val;
                            // Add the remainder of X'X
                            for (int jj = 0; jj <= ii; jj++)
                                int j  = indices[jj];
                                int wj = j + 1;
                                // NOTE(review): as above, the statement guarded by this lookup failure is not visible here - confirm.
                                if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj))

                                Contracts.Assert(0 < wj && wj <= cursor.Features.Length);
                                hessian[ioff + wj] += val * values[jj];

            // Record the statistics; no standard-error vector is passed to this constructor call.
            // NOTE(review): the accumulated hessian is not read again in the visible code - confirm whether that is intended.
            _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);
        // Combines source key names and slot names to produce final slot names.
        // iinfo: index into Infos (asserted to be in range).
        // dst:   replaced with a vector of slotLim names; its existing Values buffer is reused when it is large enough,
        //        and its Indices buffer is passed through to the new vector.
        private void GetSlotNames(int iinfo, ref VBuffer <DvText> dst)
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);

            // Size one should have been treated the same as Bag (by the caller).
            // Variable size should have thrown (by the caller).
            var typeSrc = Infos[iinfo].TypeSrc;

            Host.Assert(typeSrc.VectorSize > 1);

            // Get the source slot names, defaulting to empty text.
            var namesSlotSrc = default(VBuffer <DvText>);
            var typeSlotSrc  = Source.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source);

            // Slot-name metadata is only read when it is text and matches the source vector size.
            if (typeSlotSrc != null && typeSlotSrc.VectorSize == typeSrc.VectorSize && typeSlotSrc.ItemType.IsText)
                Source.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source, ref namesSlotSrc);
                Host.Check(namesSlotSrc.Length == typeSrc.VectorSize);
                // NOTE(review): this looks like the fallback for the case without usable slot-name metadata - confirm the branch structure.
                namesSlotSrc = VBufferUtils.CreateEmpty <DvText>(typeSrc.VectorSize);

            int keyCount = typeSrc.ItemType.KeyCount;
            int slotLim  = _types[iinfo].VectorSize;

            // The output has one slot per (source slot, key value) pair.
            Host.Assert(slotLim == (long)typeSrc.VectorSize * keyCount);

            // Get the source key names, in an array (since we will use them multiple times).
            var namesKeySrc = default(VBuffer <DvText>);

            Source.Schema.GetMetadata(MetadataUtils.Kinds.KeyValues, Infos[iinfo].Source, ref namesKeySrc);
            Host.Check(namesKeySrc.Length == keyCount);
            // NOTE(review): the statement that fills keys from namesKeySrc is not visible in this listing - confirm.
            var keys = new DvText[keyCount];


            // Reuse the caller's buffer when it already holds at least slotLim entries.
            var values = dst.Values;

            if (Utils.Size(values) < slotLim)
                values = new DvText[slotLim];

            var sb   = new StringBuilder();
            int slot = 0;

            // Items(all: true) is used so that every source slot is visited, in order; the assert below
            // checks that slot tracks kvpSlot.Key * keyCount.
            foreach (var kvpSlot in namesSlotSrc.Items(all: true))
                Contracts.Assert(slot == (long)kvpSlot.Key * keyCount);
                // NOTE(review): the statements that write the slot-name prefix into sb are not visible in this listing - confirm.
                if (kvpSlot.Value.HasChars)

                // Remember the prefix length so the builder can be truncated back to it for every key.
                int len = sb.Length;
                foreach (var key in keys)
                    sb.Length = len;
                    // One output name per key; slot advances once per key.
                    values[slot++] = new DvText(sb.ToString());
            Host.Assert(slot == slotLim);

            dst = new VBuffer <DvText>(slotLim, values, dst.Indices);
        /// <summary>
        /// Deserializing constructor: reads the bias and the weight vector from <paramref name="ctx"/>,
        /// validates the decoded counts, indices and weight values, and initializes <c>Weight</c>,
        /// <c>InputType</c> and the dense-weights cache fields.
        /// </summary>
        /// <param name="env">Host environment forwarded to the base constructor.</param>
        /// <param name="name">Name forwarded to the base constructor.</param>
        /// <param name="ctx">Model load context whose reader supplies the serialized fields.</param>
        protected LinearPredictor(IHostEnvironment env, string name, ModelLoadContext ctx)
            : base(env, name, ctx)
            // *** Binary format ***
            // Float: bias
            // int: number of features (weights)
            // int: number of indices
            // int[]: indices
            // int: number of weights
            // Float[]: weights
            // bool: has model stats
            // (Conditional) LinearModelStatistics: stats

            Bias = ctx.Reader.ReadFloat();

            int len = ctx.Reader.ReadInt32();

            // len is only asserted here; the CheckDecode on cind below (0 <= cind and cind < len)
            // can only pass when len is positive.
            Host.Assert(len > 0);

            int cind = ctx.Reader.ReadInt32();

            // The non-short-circuit '&' evaluates both comparisons; neither has side effects.
            Host.CheckDecode(0 <= cind & cind < len);
            var indices = ctx.Reader.ReadIntArray(cind);

            // Verify monotonicity of indices.
            // Starting prev at -1 also rejects negative indices.
            int prev = -1;

            for (int i = 0; i < cind; i++)
                Host.CheckDecode(prev < indices[i]);
                prev = indices[i];
            // After the loop prev is the largest index (or -1 when there are none); it must fit in the vector.
            Host.CheckDecode(prev < len);

            int cwht = ctx.Reader.ReadInt32();

            // Either there are as many weights as there are indices (in the
            // sparse case), or (in the dense case) there are no indices and the
            // number of weights is the length of the vector. Note that for the
            // trivial predictor it is quite legal to have 0 in both counts.
            Host.CheckDecode(cwht == cind || (cind == 0 && cwht == len));

            var weights = ctx.Reader.ReadFloatArray(cwht);

            // Every decoded weight must be finite (an empty or null array passes trivially).
            Host.CheckDecode(Utils.Size(weights) == 0 || weights.All(x => FloatUtils.IsFinite(x)));

            // Materialize the weight vector of logical length len from the decoded arrays.
            // NOTE(review): these two assignments read like the two arms of an if/else - confirm the branch structure.
            if (cwht == 0)
                Weight = VBufferUtils.CreateEmpty <Float>(len);
                Weight = new VBuffer <Float>(len, Utils.Size(weights), weights, indices);

            InputType = new VectorType(NumberType.Float, Weight.Length);
            WarnOnOldNormalizer(ctx, GetType(), Host);

            // When the decoded vector is dense, keep it in _weightsDense and allocate the object held by _weightsDenseLock.
            if (Weight.IsDense)
                _weightsDense = Weight;
                _weightsDenseLock = new object();
 /// <summary>
 /// Builds a fresh working vector whose length is <c>Dim</c>.
 /// </summary>
 /// <returns>
 /// A dense vector when <c>_keepDense</c> is set; otherwise an empty vector of the same length.
 /// </returns>
 protected VBuffer <Float> CreateWorkingVector()
     // Because of how the operations are structured, the "x", "newX", and "dir" vectors
     // remain dense once they start out (or naturally become) dense.
     => _keepDense
         ? VBufferUtils.CreateDense <Float>(Dim)
         : VBufferUtils.CreateEmpty <Float>(Dim);
        /// <summary>
        /// Initialize weights by running SGD up to specified tolerance.
        /// A <c>SgdOptimizer</c> is driven with a per-example gradient taken from a cursor created by
        /// <paramref name="cursorFactory"/>, and with a termination test that compares successive weight
        /// snapshots against <c>SgdInitializationTolerance</c>.
        /// </summary>
        /// <param name="ch">Channel used for progress messages.</param>
        /// <param name="cursorFactory">Factory for the cursor that feeds training examples to the gradient callback.</param>
        protected virtual VBuffer <float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory)
            if (!Quiet)
                ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance);

            // numExamples counts invocations of the termination callback; oldWeights holds the previous snapshot.
            int        numExamples  = 0;
            var        oldWeights   = VBufferUtils.CreateEmpty <float>(BiasCount + WeightCount);
            DTerminate terminateSgd =
                (in VBuffer <float> x) =>
                // The convergence test below is meant for every 1000th call.
                // NOTE(review): the statement guarded by this check is not visible in this listing - confirm.
                if (++numExamples % 1000 != 0)
                // Presumably oldWeights += -1 * x, leaving the difference between the snapshots - confirm AddMult semantics.
                VectorUtils.AddMult(in x, -1, ref oldWeights);
                float normDiff = VectorUtils.Norm(oldWeights);
                // Take x as the new snapshot for the next comparison.
                x.CopyTo(ref oldWeights);
                // #if OLD_TRACING // REVIEW: How should this be ported?
                if (!Quiet)
                    if (numExamples % 50000 == 0)
                        Console.WriteLine("\t{0}\t{1}", numExamples, normDiff);
                // #endif
                // Terminate once the measured change drops below the tolerance.
                return(normDiff < SgdInitializationTolerance);

            VBuffer <float>  result = default(VBuffer <float>);
            FloatLabelCursor cursor = null;

                // Scratch array handed by reference to AccumulateOneGradient on every call.
                float[] scratch = null;

                SgdOptimizer.DStochasticGradient lossSgd =
                    (in VBuffer <float> x, ref VBuffer <float> grad) =>
                    // Zero out the gradient by sparsifying.
                    grad = new VBuffer <float>(grad.Length, 0, grad.Values, grad.Indices);
                    EnsureBiases(ref grad);

                    // When there is no cursor yet, or the current one is exhausted, a new cursor is created.
                    // NOTE(review): the statements guarded by the two inner checks are not visible in this listing - confirm.
                    if (cursor == null || !cursor.MoveNext())
                        if (cursor != null)
                        cursor = cursorFactory.Create();
                        if (!cursor.MoveNext())
                    // Add the current example's contribution to grad.
                    AccumulateOneGradient(in cursor.Features, cursor.Label, cursor.Weight, in x, ref grad, ref scratch);

                // Starting point for SGD: dense or empty storage of length BiasCount + WeightCount, chosen by DenseOptimizer.
                // NOTE(review): these two assignments read like the two arms of an if/else - confirm the branch structure.
                VBuffer <float> sgdWeights;
                if (DenseOptimizer)
                    sgdWeights = VBufferUtils.CreateDense <float>(BiasCount + WeightCount);
                    sgdWeights = VBufferUtils.CreateEmpty <float>(BiasCount + WeightCount);
                SgdOptimizer sgdo = new SgdOptimizer(terminateSgd);
                // The optimizer output is received in result.
                sgdo.Minimize(lossSgd, ref sgdWeights, ref result);
                // #if OLD_TRACING // REVIEW: How should this be ported?
                if (!Quiet)
                // #endif
                ch.Info("SGD initialization done in {0} rounds", numExamples);
                // NOTE(review): the cursor clean-up guarded by this check is not visible in this listing - confirm.
                if (cursor != null)

            IHostEnvironment env,
            IPredictionTransformer <TModel> model,
            IDataView data,
            Func <TResult> resultInitializer,
            Func <IDataView, TMetric> evaluationFunc,
            Func <TMetric, TMetric, TMetric> deltaFunc,
            string features,
            int permutationCount,
            bool useFeatureWeightFilter = false,
            int?topExamples             = null)
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(nameof(PermutationFeatureImportance <TModel, TMetric, TResult>));

            host.CheckValue(model, nameof(model));
            host.CheckValue(data, nameof(data));
            host.CheckNonEmpty(features, nameof(features));

            topExamples = topExamples ?? Utils.ArrayMaxSize;
            host.Check(topExamples > 0, "Provide how many examples to use (positive number) or set to null to use whole dataset.");

            VBuffer <ReadOnlyMemory <char> > slotNames = default;
            var metricsDelta = new List <TResult>();

            using (var ch = host.Start("GetImportanceMetrics"))
                ch.Trace("Scoring and evaluating baseline.");
                var baselineMetrics = evaluationFunc(model.Transform(data));

                // Get slot names.
                var featuresColumn = data.Schema[features];
                int numSlots       = featuresColumn.Type.GetVectorSize();
                data.Schema.TryGetColumnIndex(features, out int featuresColumnIndex);

                ch.Info("Number of slots: " + numSlots);
                if (data.Schema[featuresColumnIndex].HasSlotNames(numSlots))
                    data.Schema[featuresColumnIndex].Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref slotNames);

                if (slotNames.Length != numSlots)
                    slotNames = VBufferUtils.CreateEmpty <ReadOnlyMemory <char> >(numSlots);

                VBuffer <float> weights = default;
                var             workingFeatureIndices = Enumerable.Range(0, numSlots).ToList();
                int             zeroWeightsCount      = 0;

                // By default set to the number of all features available.
                var evaluatedFeaturesCount = numSlots;
                if (useFeatureWeightFilter)
                    var predictorWithWeights = model.Model as IPredictorWithFeatureWeights <Single>;
                    if (predictorWithWeights != null)
                        predictorWithWeights.GetFeatureWeights(ref weights);

                        const int     maxReportedZeroFeatures = 10;
                        StringBuilder msgFilteredOutFeatures  = new StringBuilder("The following features have zero weight and will not be evaluated: \n \t");
                        var           prefix = "";
                        foreach (var k in weights.Items(all: true))
                            if (k.Value == 0)

                                // Print info about first few features we're not going to evaluate.
                                if (zeroWeightsCount <= maxReportedZeroFeatures)
                                    msgFilteredOutFeatures.Append(GetSlotName(slotNames, k.Key));
                                    prefix = ", ";

                        // Old FastTree models has less weights than slots.
                        if (weights.Length < numSlots)
                                "Predictor had fewer features than slots. All unknown features will get default 0 weight.");
                            zeroWeightsCount += numSlots - weights.Length;
                            var indexes = weights.GetIndices().ToArray();
                            var values  = weights.GetValues().ToArray();
                            var count   = values.Length;
                            weights = new VBuffer <float>(numSlots, count, values, indexes);

                        evaluatedFeaturesCount = workingFeatureIndices.Count;
                        ch.Info("Number of zero weights: {0} out of {1}.", zeroWeightsCount, weights.Length);

                        // Print what features have 0 weight
                        if (zeroWeightsCount > 0)
                            if (zeroWeightsCount > maxReportedZeroFeatures)
                                msgFilteredOutFeatures.Append(string.Format("... (printing out  {0} features here).\n Use 'Index' column in the report for info on what features are not evaluated.", maxReportedZeroFeatures));

                if (workingFeatureIndices.Count == 0 && zeroWeightsCount == 0)
                    // Use all features otherwise.
                    workingFeatureIndices.AddRange(Enumerable.Range(0, numSlots));

                if (zeroWeightsCount == numSlots)
                    ch.Warning("All features have 0 weight thus can not do thorough evaluation");

                // Note: this will not work on the huge dataset.
                var          maxSize = topExamples;
                List <float> initialfeatureValuesList = new List <float>();

                // Cursor through the data to cache slot 0 values for the upcoming permutation.
                var valuesRowCount = 0;
                // REVIEW: Seems like if the labels are NaN, so that all metrics are NaN, this command will be useless.
                // In which case probably erroring out is probably the most useful thing.
                using (var cursor = data.GetRowCursor(featuresColumn))
                    var featuresGetter = cursor.GetGetter <VBuffer <float> >(featuresColumn);
                    var featuresBuffer = default(VBuffer <float>);

                    while (initialfeatureValuesList.Count < maxSize && cursor.MoveNext())
                        featuresGetter(ref featuresBuffer);

                    valuesRowCount = initialfeatureValuesList.Count;

                if (valuesRowCount > 0)
                    ch.Info("Detected {0} examples for evaluation.", valuesRowCount);
                    ch.Warning("Detected no examples for evaluation.");

                float[] featureValuesBuffer = initialfeatureValuesList.ToArray();
                float[] nextValues          = new float[valuesRowCount];

                // Now iterate through all the working slots, do permutation and calc the delta of metrics.
                int processedCnt     = 0;
                int nextFeatureIndex = 0;
                var shuffleRand      = RandomUtils.Create(host.Rand.Next());
                using (var pch = host.StartProgressChannel("Calculating Permutation Feature Importance"))
                    pch.SetHeader(new ProgressHeader("processed slots"), e => e.SetProgress(0, processedCnt));
                    foreach (var workingIndx in workingFeatureIndices)
                        // Index for the feature we will permute next.  Needed to build in advance a buffer for the permutation.
                        if (processedCnt < workingFeatureIndices.Count - 1)
                            nextFeatureIndex = workingFeatureIndices[processedCnt + 1];

                        // Used for pre-caching the next feature
                        int nextValuesIndex = 0;

                        SchemaDefinition input = SchemaDefinition.Create(typeof(FeaturesBuffer));
                        Contracts.Assert(input.Count == 1);
                        input[0].ColumnName = features;

                        SchemaDefinition output = SchemaDefinition.Create(typeof(FeaturesBuffer));
                        Contracts.Assert(output.Count == 1);
                        output[0].ColumnName = features;
                        output[0].ColumnType = featuresColumn.Type;

                        // Perform multiple permutations for one feature to build a confidence interval
                        var metricsDeltaForFeature = resultInitializer();
                        for (int permutationIteration = 0; permutationIteration < permutationCount; permutationIteration++)
                            Utils.Shuffle <float>(shuffleRand, featureValuesBuffer);

                            Action <FeaturesBuffer, FeaturesBuffer, PermuterState> permuter =
                                (src, dst, state) =>
                                src.Features.CopyTo(ref dst.Features);
                                VBufferUtils.ApplyAt(ref dst.Features, workingIndx,
                                                     (int ii, ref float d) =>
                                                     d = featureValuesBuffer[state.SampleIndex++]);

                                // Is it time to pre-cache the next feature?
                                if (permutationIteration == permutationCount - 1 &&
                                    processedCnt < workingFeatureIndices.Count - 1)
                                    // Fill nextValues (the feature-value buffer for the next feature) while permuting the current feature.
                                    // This is why PermuterState is needed in LambdaTransform.CreateMap.
                                    nextValues[nextValuesIndex] = src.Features.GetItemOrDefault(nextFeatureIndex);
                                    if (nextValuesIndex < valuesRowCount - 1)

                            IDataView viewPermuted = LambdaTransform.CreateMap(
                                host, data, permuter, null, input, output);
                            if (valuesRowCount == topExamples)
                                viewPermuted = SkipTakeFilter.Create(host, new SkipTakeFilter.TakeOptions()
                                    Count = valuesRowCount
                                }, viewPermuted);

                            var metrics = evaluationFunc(model.Transform(viewPermuted));

                            var delta = deltaFunc(metrics, baselineMetrics);

                        // Add the metrics delta to the list

                        // Swap values for next iteration of permutation.
                        if (processedCnt < workingFeatureIndices.Count - 1)
                            Array.Clear(featureValuesBuffer, 0, featureValuesBuffer.Length);
                            nextValues.CopyTo(featureValuesBuffer, 0);
                            Array.Clear(nextValues, 0, nextValues.Length);
                    pch.Checkpoint(processedCnt, processedCnt);

        /// <summary>
        /// Minimize the function represented by <paramref name="f"/> by iterating stochastic
        /// gradient steps from <paramref name="initial"/>, with optional momentum
        /// (<c>_momentum</c>), mini-batching (<c>_batchSize</c>) and iterate averaging (<c>_averaging</c>).
        /// </summary>
        /// <param name="f">Stochastic gradients of function to minimize; invoked as <c>f(ref x, ref grad)</c></param>
        /// <param name="initial">Initial point; its values are checked to be finite before iterating</param>
        /// <param name="result">Approximate minimum of <paramref name="f"/>: the averaged iterate
        /// when <c>_averaging</c> is set, otherwise the current iterate</param>
        public void Minimize(DStochasticGradient f, ref VBuffer <Float> initial, ref VBuffer <Float> result)
            // NOTE(review): in this view no brace, `else`, `break`, `default:` or `return` lines are
            // visible; nesting is implied by indentation only. The comments below describe the
            // visible statements - confirm the exact control flow against the full file.

            // Reject starting points that contain NaN or infinite entries.
            Contracts.Check(FloatUtils.IsFinite(initial.Values, initial.Count), "The initial vector contains NaNs or infinite values.");
            int dim = initial.Length;

            // Working vectors: grad receives the gradient from f, step is the update direction,
            // x is the current iterate.
            VBuffer <Float> grad = VBufferUtils.CreateEmpty <Float>(dim);
            VBuffer <Float> step = VBufferUtils.CreateEmpty <Float>(dim);
            VBuffer <Float> x    = default(VBuffer <Float>);

            // Iterate on a copy of the starting point; prev holds the previous iterate (or previous
            // average) for the termination test, avg holds the averaged iterate.
            initial.CopyTo(ref x);
            VBuffer <Float> prev = default(VBuffer <Float>);
            VBuffer <Float> avg  = VBufferUtils.CreateEmpty <Float>(dim);

            // _maxSteps == 0 disables the step limit: the loop condition is then always true.
            for (int n = 0; _maxSteps == 0 || n < _maxSteps; ++n)
                // Without momentum the step is rebuilt from step's own Length/Values/Indices with a
                // second argument of 0 (presumably a zero count, i.e. an emptied vector - confirm
                // against the VBuffer constructor). The ScaleBy call scales the step by _momentum;
                // presumably it is the `else` branch, but no `else` line is visible here.
                if (_momentum == 0)
                    step = new VBuffer <Float>(step.Length, 0, step.Values, step.Indices);
                    VectorUtils.ScaleBy(ref step, _momentum);

                // Step size for iteration n according to the configured rate schedule:
                // constant 1/_t0, 1/(_t0 + sqrt(n)), or 1/(_t0 + n).
                Float stepSize;
                switch (_rateSchedule)
                case RateScheduleType.Constant:
                    stepSize = 1 / _t0;

                case RateScheduleType.Sqrt:
                    stepSize = 1 / (_t0 + MathUtils.Sqrt(n));

                case RateScheduleType.Linear:
                    stepSize = 1 / (_t0 + n);

                    // Presumably the fall-back for an unrecognized schedule value (no `default:`
                    // label is visible in this view).
                    throw Contracts.Except();

                // Evaluate f _batchSize times at x, folding each gradient into step with weight
                // (1 - _momentum) / _batchSize (assumes AddMult(src, c, dst) adds c * src into dst - TODO confirm).
                Float scale = (1 - _momentum) / _batchSize;
                for (int i = 0; i < _batchSize; ++i)
                    f(ref x, ref grad);
                    VectorUtils.AddMult(ref grad, scale, ref step);

                if (_averaging)
                    // Averaged mode: the old average moves into prev, is rescaled by n / (n + 1) into
                    // avg, x is moved by -stepSize along step, and x / (n + 1) is added into avg.
                    // This looks like a running mean of the iterates (assumes the ScaleBy/AddMult
                    // semantics described above - confirm).
                    Utils.Swap(ref avg, ref prev);
                    VectorUtils.ScaleBy(prev, ref avg, (Float)n / (n + 1));
                    VectorUtils.AddMult(ref step, -stepSize, ref x);
                    VectorUtils.AddMult(ref x, (Float)1 / (n + 1), ref avg);

                    // The termination test is skipped on the first iteration (n == 0); the
                    // _terminate callback is consulted on every iteration.
                    if ((n > 0 && TerminateTester.ShouldTerminate(ref avg, ref prev)) || _terminate(ref avg))
                        result = avg;
                    // Presumably the non-averaging (`else`) branch: the current iterate moves into
                    // prev and the new x is formed from prev and -stepSize * step (four-argument
                    // AddMult; exact semantics not shown here).
                    Utils.Swap(ref x, ref prev);
                    VectorUtils.AddMult(ref step, -stepSize, ref prev, ref x);
                    if ((n > 0 && TerminateTester.ShouldTerminate(ref x, ref prev)) || _terminate(ref x))
                        result = x;

            // Final assignment after the loop: report the averaged iterate if averaging, else x.
            result = _averaging ? avg : x;