protected virtual Optimizer InitializeOptimizer(IChannel ch, FloatLabelCursor.Factory cursorFactory,
    out VBuffer<Float> init, out ITerminationCriterion terminationCriterion)
{
    // MeanRelativeImprovementCriterion:
    //   Stops optimization when the average objective improvement over the last
    //   n iterations, normalized by the function value, is small enough.
    terminationCriterion = new MeanRelativeImprovementCriterion(OptTol, 5, MaxIterations);

    Optimizer opt = (L1Weight > 0)
        ? new L1Optimizer(Host, BiasCount, L1Weight / NumGoodRows, MemorySize, DenseOptimizer, null, EnforceNonNegativity)
        : new Optimizer(Host, MemorySize, DenseOptimizer, null, EnforceNonNegativity);
    opt.Quiet = Quiet;

    if (_srcPredictor != null)
        init = InitializeWeightsFromPredictor(_srcPredictor);
    else if (InitWtsDiameter > 0)
    {
        Float[] initWeights = new Float[BiasCount + WeightCount];
        for (int j = 0; j < initWeights.Length; j++)
            initWeights[j] = InitWtsDiameter * (Host.Rand.NextSingle() - (Float)0.5);
        init = new VBuffer<Float>(initWeights.Length, initWeights);
    }
    else if (SgdInitializationTolerance > 0)
        init = InitializeWeightsSgd(ch, cursorFactory);
    else
        init = VBufferUtils.CreateEmpty<Float>(BiasCount + WeightCount);

    return opt;
}
private static void FillValues(Float input, ref VBuffer<Float> result)
{
    if (input == 0)
    {
        VBufferUtils.Resize(ref result, 2, 0);
        return;
    }

    var editor = VBufferEditor.Create(ref result, 2, 1);
    if (Float.IsNaN(input))
    {
        editor.Values[0] = 1;
        editor.Indices[0] = 1;
    }
    else
    {
        editor.Values[0] = input;
        editor.Indices[0] = 0;
    }
    result = editor.Commit();
}
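FillValues above encodes a scalar as a two-slot vector: slot 0 carries the value and slot 1 acts as a missing-value (NaN) indicator, with a zero input producing an all-implicit-zero buffer. A minimal sketch of the resulting buffers (hypothetical calls from inside the declaring class; Float is the build's floating-point alias):

VBuffer<Float> buf = default;
FillValues(0, ref buf);           // length-2 vector, no explicit entries: logically (0, 0)
FillValues((Float)3.5, ref buf);  // one explicit entry at slot 0: logically (3.5, 0)
FillValues(Float.NaN, ref buf);   // one explicit entry at slot 1: logically (0, 1), the NaN indicator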
public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
{
    Contracts.AssertValue(env);
    _env = env;
    _env.AssertValue(schema);
    _env.AssertValue(parent);
    _env.Assert(schema.Feature.HasValue);
    _parent = parent;
    InputRoleMappedSchema = schema;
    var genericMapper = parent.GenericMapper.Bind(_env, schema);
    _genericRowMapper = genericMapper as ISchemaBoundRowMapper;

    if (parent.Stringify)
    {
        var builder = new SchemaBuilder();
        builder.AddColumn(DefaultColumnNames.FeatureContributions, TextType.Instance, null);
        _outputSchema = builder.GetSchema();
        if (FeatureColumn.HasSlotNames(FeatureColumn.Type.VectorSize))
            FeatureColumn.Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref _slotNames);
        else
            _slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(FeatureColumn.Type.VectorSize);
    }
    else
    {
        _outputSchema = Schema.Create(new FeatureContributionSchema(_env, DefaultColumnNames.FeatureContributions,
            new VectorType(NumberType.R4, FeatureColumn.Type as VectorType),
            InputSchema, FeatureColumn.Index));
    }

    _outputGenericSchema = _genericRowMapper.OutputSchema;
    OutputSchema = new ZipBinding(new Schema[] { _outputGenericSchema, _outputSchema, }).OutputSchema;
}
public RowMapper(IHostEnvironment env, BindableMapper parent, RoleMappedSchema schema)
{
    Contracts.AssertValue(env);
    _env = env;
    _env.AssertValue(schema);
    _env.AssertValue(parent);
    _env.AssertValue(schema.Feature);
    _parent = parent;
    InputRoleMappedSchema = schema;
    var genericMapper = parent.GenericMapper.Bind(_env, schema);
    _genericRowMapper = genericMapper as ISchemaBoundRowMapper;

    if (parent.Stringify)
    {
        _outputSchema = new SimpleSchema(_env,
            new KeyValuePair<string, ColumnType>(DefaultColumnNames.FeatureContributions, TextType.Instance));
        if (InputSchema.HasSlotNames(InputRoleMappedSchema.Feature.Index, InputRoleMappedSchema.Feature.Type.VectorSize))
            InputSchema.GetMetadata(MetadataUtils.Kinds.SlotNames, InputRoleMappedSchema.Feature.Index, ref _slotNames);
        else
            _slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(InputRoleMappedSchema.Feature.Type.VectorSize);
    }
    else
    {
        _outputSchema = new FeatureContributionSchema(_env, DefaultColumnNames.FeatureContributions,
            new VectorType(NumberType.R4, schema.Feature.Type.AsVector),
            InputSchema, InputRoleMappedSchema.Feature.Index);
    }

    _outputGenericSchema = _genericRowMapper.OutputSchema;
    OutputSchema = new CompositeSchema(new ISchema[] { _outputGenericSchema, _outputSchema, }).AsSchema;
}
public static FeatureNameCollection Create(RoleMappedSchema schema)
{
    // REVIEW: This shim should be deleted as soon as is convenient.
    Contracts.CheckValue(schema, nameof(schema));
    Contracts.CheckParam(schema.Feature.HasValue, nameof(schema), "Cannot create feature name collection if we have no features");
    var featureCol = schema.Feature.Value;
    Contracts.CheckParam(schema.Feature.Value.Type.ValueCount > 0, nameof(schema), "Cannot create feature name collection if our features are not of known size");

    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    int len = featureCol.Type.ValueCount;
    if (featureCol.HasSlotNames(len))
        featureCol.Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref slotNames);
    else
        slotNames = VBufferUtils.CreateEmpty<ReadOnlyMemory<char>>(len);

    var slotNameValues = slotNames.GetValues();
    string[] names = new string[slotNameValues.Length];
    for (int i = 0; i < slotNameValues.Length; ++i)
        names[i] = !slotNameValues[i].IsEmpty ? slotNameValues[i].ToString() : null;

    if (slotNames.IsDense)
        return new Dense(names.Length, names);

    ReadOnlySpan<int> indices = slotNames.GetIndices();
    return new Sparse(slotNames.Length, slotNameValues.Length, indices.ToArray(), names);
}
private ValueGetter<VBuffer<ReadOnlyMemory<char>>> MakeGetterVec(Row input, int iinfo)
{
    var getSrc = input.GetGetter<VBuffer<ReadOnlyMemory<char>>>(ColMapNewToOld[iinfo]);
    Host.AssertValue(getSrc);

    var src = default(VBuffer<ReadOnlyMemory<char>>);
    var buffer = new StringBuilder();
    var list = new List<ReadOnlyMemory<char>>();
    var temp = default(ReadOnlyMemory<char>);
    return (ref VBuffer<ReadOnlyMemory<char>> dst) =>
    {
        getSrc(ref src);
        list.Clear();
        var srcValues = src.GetValues();
        for (int i = 0; i < srcValues.Length; i++)
        {
            NormalizeSrc(in srcValues[i], ref temp, buffer);
            if (!temp.IsEmpty)
                list.Add(temp);
        }
        VBufferUtils.Copy(list, ref dst, list.Count);
    };
}
/// <summary>
/// Initialize weights by running SGD up to specified tolerance.
/// </summary>
protected virtual VBuffer<float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory)
{
    if (!Quiet)
        ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance);

    int numExamples = 0;
    var oldWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
    DTerminate terminateSgd =
        (in VBuffer<float> x) =>
        {
            if (++numExamples % 1000 != 0)
                return false;
            VectorUtils.AddMult(in x, -1, ref oldWeights);
            float normDiff = VectorUtils.Norm(oldWeights);
            x.CopyTo(ref oldWeights);
            // #if OLD_TRACING // REVIEW: How should this be ported?
            if (!Quiet)
            {
                Console.Write(".");
                if (numExamples % 50000 == 0)
                    Console.WriteLine("\t{0}\t{1}", numExamples, normDiff);
            }
            // #endif
            return normDiff < SgdInitializationTolerance;
        };

    VBuffer<float> result = default(VBuffer<float>);
    FloatLabelCursor cursor = null;
    try
    {
        float[] scratch = null;
        SgdOptimizer.DStochasticGradient lossSgd =
            (in VBuffer<float> x, ref VBuffer<float> grad) =>
            {
                // Zero out the gradient by sparsifying.
                grad = new VBuffer<float>(grad.Length, 0, grad.Values, grad.Indices);
                EnsureBiases(ref grad);

                if (cursor == null || !cursor.MoveNext())
                {
                    if (cursor != null)
                        cursor.Dispose();
                    cursor = cursorFactory.Create();
                    if (!cursor.MoveNext())
                        return;
                }
                AccumulateOneGradient(in cursor.Features, cursor.Label, cursor.Weight, in x, ref grad, ref scratch);
            };

        VBuffer<float> sgdWeights;
        if (DenseOptimizer)
            sgdWeights = VBufferUtils.CreateDense<float>(BiasCount + WeightCount);
        else
            sgdWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);

        SgdOptimizer sgdo = new SgdOptimizer(terminateSgd);
        sgdo.Minimize(lossSgd, ref sgdWeights, ref result);
        // #if OLD_TRACING // REVIEW: How should this be ported?
        if (!Quiet)
            Console.WriteLine();
        // #endif
        ch.Info("SGD initialization done in {0} rounds", numExamples);
    }
    finally
    {
        if (cursor != null)
            cursor.Dispose();
    }

    return result;
}
private OlsLinearRegressionPredictor TrainCore(IChannel ch, FloatLabelCursor.Factory cursorFactory, int featureCount)
{
    Host.AssertValue(ch);
    ch.AssertValue(cursorFactory);

    int m = featureCount + 1;

    // Check for memory conditions first.
    if ((long)m * (m + 1) / 2 > int.MaxValue)
        throw ch.Except("Cannot hold covariance matrix in memory with {0} features", m - 1);

    // Track the number of examples.
    long n = 0;
    // Since we are accumulating over many values, we use Double even for the single precision build.
    var xty = new Double[m];
    // The layout of this algorithm is a packed row-major lower triangular matrix.
    var xtx = new Double[m * (m + 1) / 2];

    // Build X'X (lower triangular) and X'y incrementally (X'X += X'X_i; X'y += X'y_i):
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            var yi = cursor.Label;
            // Increment first element of X'y.
            xty[0] += yi;
            // Increment first element of lower triangular X'X.
            xtx[0] += 1;
            var values = cursor.Features.GetValues();

            if (cursor.Features.IsDense)
            {
                int ioff = 1;
                ch.Assert(values.Length + 1 == m);
                // Increment rest of first column of lower triangular X'X.
                for (int i = 1; i < m; i++)
                {
                    ch.Assert(ioff == i * (i + 1) / 2);
                    var val = values[i - 1];
                    // Add the implicit first bias term to X'X.
                    xtx[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int j = 0; j < i; j++)
                        xtx[ioff++] += val * values[j];
                    // X'y
                    xty[i] += val * yi;
                }
                ch.Assert(ioff == xtx.Length);
            }
            else
            {
                var fIndices = cursor.Features.GetIndices();
                for (int ii = 0; ii < values.Length; ++ii)
                {
                    int i = fIndices[ii] + 1;
                    int ioff = i * (i + 1) / 2;
                    var val = values[ii];
                    // Add the implicit first bias term to X'X.
                    xtx[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int jj = 0; jj <= ii; jj++)
                        xtx[ioff + fIndices[jj]] += val * values[jj];
                    // X'y
                    xty[i] += val * yi;
                }
            }
            n++;
        }
        ch.Check(n > 0, "No training examples in dataset.");
        if (cursor.BadFeaturesRowCount > 0)
            ch.Warning("Skipped {0} instances with missing features/label during training", cursor.SkippedRowCount);

        if (_l2Weight > 0)
        {
            // Skip the bias term for regularization, in the ridge regression case.
            // So start at [1,1] instead of [0,0].
            // REVIEW: There are two ways to view this. Firstly, it is more user
            // friendly to make this scaling factor behave similarly regardless of
            // data size, so that with the same parameters you get the same model
            // whether you feed in your data once or duplicate it 10 times. This is
            // what we have now. The alternate point of view is to treat this L2
            // regularization parameter as providing some sort of prior, in which
            // case duplicating the data 10 times should in fact be treated
            // differently! (That is, we should not multiply by n below.) Both
            // interpretations seem correct, in their way.
            Double squared = _l2Weight * _l2Weight * n;
            int ioff = 0;
            for (int i = 1; i < m; ++i)
                xtx[ioff += i + 1] += squared;
            ch.Assert(ioff == xtx.Length - 1);
        }
    }

    if (!(_l2Weight > 0) && n < m)
        throw ch.Except("Ordinary least squares requires more examples than parameters. There are {0} parameters, but {1} examples. To enable training, use a positive L2 weight so this behaves as ridge regression.", m, n);

    Double yMean = n == 0 ? 0 : xty[0] / n;

    ch.Info("Trainer solving for {0} parameters across {1} examples", m, n);
    // Cholesky decomposition of X'X into LL'.
    try
    {
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    }
    catch (DllNotFoundException)
    {
        // REVIEW: Is there no better way?
        throw ch.ExceptNotSupp("The MKL library (libMklImports) or one of its dependencies is missing.");
    }
    // Solve for beta in (LL')beta = X'y:
    Mkl.Pptrs(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, 1, xtx, xty, 1);
    // Note that the solver overwrote xty so it contains the solution. To be more clear,
    // we effectively change its name (through reassignment) so we don't get confused that
    // this is somehow xty in the remaining calculation.
    var beta = xty;
    xty = null;
    // Check that the solution is valid.
    for (int i = 0; i < beta.Length; ++i)
        ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution");

    var weights = VBufferUtils.CreateDense<float>(beta.Length - 1);
    for (int i = 1; i < beta.Length; ++i)
        weights.Values[i - 1] = (float)beta[i];
    var bias = (float)beta[0];
    if (!(_l2Weight > 0) && m == n)
    {
        // We would expect the solution to the problem to be exact in this case.
        ch.Info("Number of examples equals number of parameters, solution is exact but no statistics can be derived");
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, 1, float.NaN);
    }

    Double rss = 0; // residual sum of squares
    Double tss = 0; // total sum of squares
    using (var cursor = cursorFactory.Create())
    {
        var lrPredictor = new LinearRegressionPredictor(Host, in weights, bias);
        var lrMap = lrPredictor.GetMapper<VBuffer<float>, float>();
        float yh = default;
        while (cursor.MoveNext())
        {
            var features = cursor.Features;
            lrMap(in features, ref yh);
            var e = cursor.Label - yh;
            rss += e * e;
            var ydm = cursor.Label - yMean;
            tss += ydm * ydm;
        }
    }
    var rSquared = ProbClamp(1 - (rss / tss));
    // R^2 adjusted differs from the normal formula on account of the bias term, by Said's reckoning.
    double rSquaredAdjusted;
    if (n > m)
    {
        rSquaredAdjusted = ProbClamp(1 - (1 - rSquared) * (n - 1) / (n - m));
        ch.Info("Coefficient of determination R2 = {0:g}, or {1:g} (adjusted)", rSquared, rSquaredAdjusted);
    }
    else
        rSquaredAdjusted = Double.NaN;

    // The per-parameter significance is computationally intensive and may not be required for all practitioners.
    // Also we can't estimate it unless we can estimate the variance, which requires more examples than parameters.
    if (!_perParameterSignificance || m >= n)
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, rSquared, rSquaredAdjusted);

    ch.Assert(!Double.IsNaN(rSquaredAdjusted));
    var standardErrors = new Double[m];
    var tValues = new Double[m];
    var pValues = new Double[m];
    // Invert X'X:
    Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    var s2 = rss / (n - m); // estimate of variance of y

    for (int i = 0; i < m; i++)
    {
        // Initialize with inverse Hessian.
        standardErrors[i] = (Single)xtx[i * (i + 1) / 2 + i];
    }

    if (_l2Weight > 0)
    {
        // Iterate through all entries of inverse Hessian to make adjustment to variance.
        int ioffset = 1;
        float reg = _l2Weight * _l2Weight * n;
        for (int iRow = 1; iRow < m; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (Single)xtx[ioffset];
                var adjustment = -reg * entry * entry;
                standardErrors[iRow] -= adjustment;
                if (0 < iCol && iCol < iRow)
                    standardErrors[iCol] -= adjustment;
                ioffset++;
            }
        }
        Contracts.Assert(ioffset == xtx.Length);
    }

    for (int i = 0; i < m; i++)
    {
        // sqrt of diagonal entries of s2 * inverse(X'X + reg * I) * X'X * inverse(X'X + reg * I).
        standardErrors[i] = Math.Sqrt(s2 * standardErrors[i]);
        ch.Check(FloatUtils.IsFinite(standardErrors[i]), "Non-finite standard error detected from OLS solution");
        tValues[i] = beta[i] / standardErrors[i];
        pValues[i] = (float)MathUtils.TStatisticToPValue(tValues[i], n - m);
        ch.Check(0 <= pValues[i] && pValues[i] <= 1, "p-Value calculated outside expected [0,1] range");
    }

    return new OlsLinearRegressionPredictor(Host, in weights, bias, standardErrors, tValues, pValues, rSquared, rSquaredAdjusted);
}
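Both this OLS trainer and the logistic regression statistics further down store symmetric matrices in packed row-major lower-triangular form, where row i starts at offset i * (i + 1) / 2. A small stand-alone illustration of that indexing (a hypothetical helper, not part of the snippets):

// Packed row-major lower-triangular storage: only entries with j <= i are kept.
// For a 4x4 matrix the offsets are:
//   0
//   1 2
//   3 4 5
//   6 7 8 9
static double GetEntry(double[] packed, int i, int j)
{
    if (j > i)
        (i, j) = (j, i);                 // symmetric: fetch the mirrored entry
    return packed[i * (i + 1) / 2 + j];  // row i starts at i * (i + 1) / 2
}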
public IEnumerable<KeyValuePair<int, T>> Items(bool all = false)
{
    return VBufferUtils.Items(Values, Indices, Length, Count, all);
}
private void GetLabels(Transposer trans, ColumnType labelType, int labelCol)
{
    int min;
    int lim;
    var labels = default(VBuffer<int>);
    // Note: NAs have their own separate bin.
    if (labelType == NumberType.I4)
    {
        var tmp = default(VBuffer<DvInt4>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinInts(ref tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberType.R4)
    {
        var tmp = default(VBuffer<Single>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinSingles(ref tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberType.R8)
    {
        var tmp = default(VBuffer<Double>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinDoubles(ref tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType.IsBool)
    {
        var tmp = default(VBuffer<DvBool>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinBools(ref tmp, ref labels);
        _numLabels = 3;
        min = -1;
        lim = 2;
    }
    else
    {
        Contracts.Assert(0 < labelType.KeyCount && labelType.KeyCount < Utils.ArrayMaxSize);
        KeyLabelGetter<int> del = GetKeyLabels<int>;
        var methodInfo = del.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(labelType.RawType);
        var parameters = new object[] { trans, labelCol, labelType };
        _labels = (int[])methodInfo.Invoke(this, parameters);
        _numLabels = labelType.KeyCount + 1;

        // No need to densify or shift in this case.
        return;
    }

    // Densify and shift labels.
    VBufferUtils.Densify(ref labels);
    Contracts.Assert(labels.IsDense);
    _labels = labels.Values;
    if (labels.Length < _labels.Length)
        Array.Resize(ref _labels, labels.Length);
    for (int i = 0; i < _labels.Length; i++)
    {
        _labels[i] -= min;
        Contracts.Assert(_labels[i] < _numLabels);
    }
}
protected LinearPredictor(IHostEnvironment env, string name, ModelLoadContext ctx)
    : base(env, name, ctx)
{
    // *** Binary format ***
    // Float: bias
    // int: number of features (weights)
    // int: number of indices
    // int[]: indices
    // int: number of weights
    // Float[]: weights
    // bool: has model stats
    // (Conditional) LinearModelStatistics: stats

    Bias = ctx.Reader.ReadFloat();
    Host.CheckDecode(FloatUtils.IsFinite(Bias));

    int len = ctx.Reader.ReadInt32();
    Host.Assert(len > 0);

    int cind = ctx.Reader.ReadInt32();
    Host.CheckDecode(0 <= cind & cind < len);
    var indices = ctx.Reader.ReadIntArray(cind);

    // Verify monotonicity of indices.
    int prev = -1;
    for (int i = 0; i < cind; i++)
    {
        Host.CheckDecode(prev < indices[i]);
        prev = indices[i];
    }
    Host.CheckDecode(prev < len);

    int cwht = ctx.Reader.ReadInt32();
    // Either there are as many weights as there are indices (in the
    // sparse case), or (in the dense case) there are no indices and the
    // number of weights is the length of the vector. Note that for the
    // trivial predictor it is quite legal to have 0 in both counts.
    Host.CheckDecode(cwht == cind || (cind == 0 && cwht == len));

    var weights = ctx.Reader.ReadFloatArray(cwht);
    Host.CheckDecode(Utils.Size(weights) == 0 || weights.All(x => FloatUtils.IsFinite(x)));

    if (cwht == 0)
        Weight = VBufferUtils.CreateEmpty<Float>(len);
    else
        Weight = new VBuffer<Float>(len, Utils.Size(weights), weights, indices);

    InputType = new VectorType(NumberType.Float, Weight.Length);
    WarnOnOldNormalizer(ctx, GetType(), Host);

    if (Weight.IsDense)
        _weightsDense = Weight;
    else
        _weightsDenseLock = new object();
}
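The deserializer above rebuilds the weight vector in either dense or sparse form: a VBuffer<T> whose explicit-value count is smaller than its logical length stores only the explicit entries plus their strictly increasing slot indices, and every other slot is implicitly zero. A minimal sketch of the two constructions, using only the VBuffer constructors that already appear in these snippets:

// Dense: 5 slots, all explicit (no index array needed).
var dense = new VBuffer<float>(5, new float[] { 1f, 0f, 2f, 0f, 3f });

// Sparse: logical length 5, but only slots 0, 2 and 4 stored explicitly.
var sparse = new VBuffer<float>(5, 3,
    new float[] { 1f, 2f, 3f },  // values of the explicit entries
    new int[] { 0, 2, 4 });      // strictly increasing slot indices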
public override Delegate[] CreateGetters(IRow input, Func<int, bool> activeCols, out Action disposer)
{
    Host.Assert(LabelIndex >= 0);
    Host.Assert(ScoreIndex >= 0);

    disposer = null;

    long cachedPosition = -1;
    Float label = 0;
    var score = default(VBuffer<Float>);
    var l1 = VBufferUtils.CreateDense<Double>(_scoreSize);

    ValueGetter<Float> nanGetter = (ref Float value) => value = Single.NaN;
    var labelGetter = activeCols(L1Col) || activeCols(L2Col)
        ? RowCursorUtils.GetLabelGetter(input, LabelIndex)
        : nanGetter;
    ValueGetter<VBuffer<Float>> scoreGetter;
    if (activeCols(L1Col) || activeCols(L2Col))
        scoreGetter = input.GetGetter<VBuffer<Float>>(ScoreIndex);
    else
        scoreGetter = (ref VBuffer<Float> dst) => dst = default(VBuffer<Float>);

    Action updateCacheIfNeeded =
        () =>
        {
            if (cachedPosition != input.Position)
            {
                labelGetter(ref label);
                scoreGetter(ref score);
                var lab = (Double)label;
                foreach (var s in score.Items(all: true))
                    l1.Values[s.Key] = Math.Abs(lab - s.Value);
                cachedPosition = input.Position;
            }
        };

    var getters = new Delegate[2];
    if (activeCols(L1Col))
    {
        ValueGetter<VBuffer<Double>> l1Fn =
            (ref VBuffer<Double> dst) =>
            {
                updateCacheIfNeeded();
                l1.CopyTo(ref dst);
            };
        getters[L1Col] = l1Fn;
    }
    if (activeCols(L2Col))
    {
        VBufferUtils.PairManipulator<Double, Double> sqr = (int slot, Double x, ref Double y) => y = x * x;
        ValueGetter<VBuffer<Double>> l2Fn =
            (ref VBuffer<Double> dst) =>
            {
                updateCacheIfNeeded();
                dst = new VBuffer<Double>(_scoreSize, 0, dst.Values, dst.Indices);
                VBufferUtils.ApplyWith(ref l1, ref dst, sqr);
            };
        getters[L2Col] = l2Fn;
    }
    return getters;
}
protected override bool IsNaN(ref VBuffer<Float> score)
{
    return VBufferUtils.HasNaNs(ref score);
}
protected override void ApplyLossFunction(ref VBuffer<float> score, float label, ref VBuffer<Double> loss)
{
    VBufferUtils.PairManipulator<Float, Double> lossFn =
        (int slot, Float src, ref Double dst) => dst = LossFunction.Loss(src, label);
    VBufferUtils.ApplyWith(ref score, ref loss, lossFn);
}
protected override VBuffer<Double> Zero()
{
    return VBufferUtils.CreateDense<Double>(_size);
}
public IEnumerable<T> DenseValues()
{
    return VBufferUtils.DenseValues(Values, Indices, Length, Count);
}
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);

    disposer = null;

    var getSrc = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, Infos[iinfo].Source);
    var src = default(VBuffer<uint>);
    var bldr = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength,
        _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo));
    var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
    if (keyCount == 0)
        keyCount = uint.MaxValue;

    ValueGetter<VBuffer<Float>> del;
    switch (_exes[iinfo].Weighting)
    {
        case WeightingCriteria.TfIdf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            del =
                (ref VBuffer<Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(in src, 0, keyCount);
                        bldr.GetResult(ref dst);
                        VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i]));
                    }
                    else
                        dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                };
            break;
        case WeightingCriteria.Idf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            del =
                (ref VBuffer<Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(in src, 0, keyCount);
                        bldr.GetResult(ref dst);
                        VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0);
                    }
                    else
                        dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                };
            break;
        case WeightingCriteria.Tf:
            del =
                (ref VBuffer<Float> dst) =>
                {
                    getSrc(ref src);
                    if (!bldr.IsEmpty)
                    {
                        bldr.Reset();
                        bldr.AddNgrams(in src, 0, keyCount);
                        bldr.GetResult(ref dst);
                    }
                    else
                        dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                };
            break;
        default:
            throw Host.Except("Unsupported weighting criteria");
    }
    return del;
}
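The three weighting branches differ only in the per-slot transform applied after the raw n-gram counts are built: Tf keeps the count, Idf keeps the inverse document frequency of any n-gram that occurs at least once, and TfIdf multiplies the two. A minimal sketch of that arithmetic on plain arrays (hypothetical; the exact IDF formula behind _invDocFreqs is not shown in the snippet, ln(numDocs / docFreq) being one common choice):

enum WeightingCriteria { Tf, Idf, TfIdf } // mirrors the cases handled above

// tf[i]: raw count of n-gram i in the current document; idf[i]: precomputed inverse document frequency.
static double[] Weight(double[] tf, double[] idf, WeightingCriteria mode)
{
    var result = new double[tf.Length];
    for (int i = 0; i < tf.Length; i++)
    {
        switch (mode)
        {
            case WeightingCriteria.Tf: result[i] = tf[i]; break;
            case WeightingCriteria.Idf: result[i] = tf[i] >= 1 ? idf[i] : 0; break;
            case WeightingCriteria.TfIdf: result[i] = tf[i] * idf[i]; break;
        }
    }
    return result;
}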
protected virtual void TrainCore(IChannel ch, RoleMappedData data)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    // Compute the number of threads to use. The ctor should have verified that this will
    // produce a positive value.
    int numThreads = !UseThreads ? 1 : (NumThreads ?? Environment.ProcessorCount);
    if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor)
    {
        numThreads = Host.ConcurrencyFactor;
        ch.Warning("The number of threads specified in trainer arguments is larger than the concurrency factor "
            + "setting of the environment. Using {0} training threads instead.", numThreads);
    }
    ch.Assert(numThreads > 0);

    NumGoodRows = 0;
    WeightSum = 0;

    _features = null;
    _labels = null;
    _weights = null;
    if (numThreads > 1)
    {
        ch.Info("LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory "
            + "issues, add 'numThreads=1' to the trainer arguments and 'cache=-' to the command line "
            + "arguments to turn off multi-threading.");
        _features = new VBuffer<float>[1000];
        _labels = new float[1000];
        if (data.Schema.Weight != null)
            _weights = new float[1000];
    }

    var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Features | CursOpt.Label | CursOpt.Weight);

    long numBad;
    // REVIEW: This pass seems overly expensive for the benefit when multi-threading is off....
    using (var cursor = cursorFactory.Create())
    using (var pch = Host.StartProgressChannel("LBFGS data prep"))
    {
        // REVIEW: maybe it makes sense for the factory to capture the good row count after
        // the first successful cursoring?
        Double totalCount = data.Data.GetRowCount(true) ?? Double.NaN;

        long exCount = 0;
        pch.SetHeader(new ProgressHeader(null, new[] { "examples" }),
            e => e.SetProgress(0, exCount, totalCount));
        while (cursor.MoveNext())
        {
            WeightSum += cursor.Weight;
            if (ShowTrainingStats)
                ProcessPriorDistribution(cursor.Label, cursor.Weight);

            PreTrainingProcessInstance(cursor.Label, ref cursor.Features, cursor.Weight);
            exCount++;
            if (_features != null)
            {
                ch.Assert(cursor.KeptRowCount <= int.MaxValue);
                int index = (int)cursor.KeptRowCount - 1;
                Utils.EnsureSize(ref _features, index + 1);
                Utils.EnsureSize(ref _labels, index + 1);
                if (_weights != null)
                {
                    Utils.EnsureSize(ref _weights, index + 1);
                    _weights[index] = cursor.Weight;
                }
                Utils.Swap(ref _features[index], ref cursor.Features);
                _labels[index] = cursor.Label;

                if (cursor.KeptRowCount >= int.MaxValue)
                {
                    ch.Warning("Limiting data size for multi-threading");
                    break;
                }
            }
        }
        NumGoodRows = cursor.KeptRowCount;
        numBad = cursor.SkippedRowCount;
    }
    ch.Check(NumGoodRows > 0, NoTrainingInstancesMessage);
    if (numBad > 0)
        ch.Warning("Skipped {0} instances with missing features/label/weight during training", numBad);

    if (_features != null)
    {
        ch.Assert(numThreads > 1);

        // If there are so many threads that each only gets a small number (less than 10) of instances, trim
        // the number of threads so each gets a more reasonable number (100 or so). These numbers are pretty arbitrary,
        // but avoid the possibility of having no instances on some threads.
        if (numThreads > 1 && NumGoodRows / numThreads < 10)
        {
            int numNew = Math.Max(1, (int)NumGoodRows / 100);
            ch.Warning("Too few instances to use {0} threads, decreasing to {1} thread(s)", numThreads, numNew);
            numThreads = numNew;
        }
        ch.Assert(numThreads > 0);

        // Divide up the instances among the threads.
        _numChunks = numThreads;
        _ranges = new int[_numChunks + 1];
        int cinstTot = (int)NumGoodRows;
        for (int ichk = 0, iinstMin = 0; ichk < numThreads; ichk++)
        {
            int cchkLeft = numThreads - ichk; // Number of chunks left to fill.
            ch.Assert(0 < cchkLeft && cchkLeft <= numThreads);
            int cinstThis = (cinstTot - iinstMin + cchkLeft - 1) / cchkLeft; // Size of this chunk.
            ch.Assert(0 < cinstThis && cinstThis <= cinstTot - iinstMin);
            iinstMin += cinstThis;
            _ranges[ichk + 1] = iinstMin;
        }

        _localLosses = new float[numThreads];
        _localGradients = new VBuffer<float>[numThreads - 1];
        int size = BiasCount + WeightCount;
        for (int i = 0; i < _localGradients.Length; i++)
            _localGradients[i] = VBufferUtils.CreateEmpty<float>(size);

        ch.Assert(_numChunks > 0 && _data == null);
    }
    else
    {
        // Streaming, single-threaded case.
        _data = data;
        _cursorFactory = cursorFactory;
        ch.Assert(_numChunks == 0 && _data != null);
    }

    VBuffer<float> initWeights;
    ITerminationCriterion terminationCriterion;
    Optimizer opt = InitializeOptimizer(ch, cursorFactory, out initWeights, out terminationCriterion);
    opt.Quiet = Quiet;

    float loss;
    try
    {
        opt.Minimize(DifferentiableFunction, ref initWeights, terminationCriterion, ref CurrentWeights, out loss);
    }
    catch (Optimizer.PrematureConvergenceException e)
    {
        if (!Quiet)
            ch.Warning("Premature convergence occurred. The OptimizationTolerance may be set too small. {0}", e.Message);
        CurrentWeights = e.State.X;
        loss = e.State.Value;
    }

    ch.Assert(CurrentWeights.Length == BiasCount + WeightCount);

    int numParams = BiasCount;
    if ((L1Weight > 0 && !Quiet) || ShowTrainingStats)
    {
        VBufferUtils.ForEachDefined(ref CurrentWeights,
            (index, value) =>
            {
                if (index >= BiasCount && value != 0)
                    numParams++;
            });
        if (L1Weight > 0 && !Quiet)
            ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount);
    }

    if (ShowTrainingStats)
        ComputeTrainingStatistics(ch, cursorFactory, loss, numParams);
}
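The partitioning loop above hands out NumGoodRows instances to numThreads contiguous ranges by ceiling division over the remaining counts, so chunk sizes never differ by more than one. A small stand-alone run of the same arithmetic (hypothetical driver values):

// Split 10 instances over 3 chunks: ranges = [0, 4, 7, 10] -> sizes 4, 3, 3.
int total = 10, chunks = 3;
var ranges = new int[chunks + 1];
for (int ichk = 0, min = 0; ichk < chunks; ichk++)
{
    int left = chunks - ichk;                    // chunks left to fill
    int size = (total - min + left - 1) / left;  // ceiling division over what remains
    min += size;
    ranges[ichk + 1] = min;
}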
private void GetLabels(Transposer trans, DataViewType labelType, int labelCol)
{
    int min;
    int lim;
    var labels = default(VBuffer<int>);
    // Note: NAs have their own separate bin.
    if (labelType == NumberDataViewType.Int32)
    {
        var tmp = default(VBuffer<int>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinInts(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberDataViewType.Single)
    {
        var tmp = default(VBuffer<Single>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinSingles(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberDataViewType.Double)
    {
        var tmp = default(VBuffer<Double>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinDoubles(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType is BooleanDataViewType)
    {
        var tmp = default(VBuffer<bool>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinBools(in tmp, ref labels);
        _numLabels = 3;
        min = -1;
        lim = 2;
    }
    else
    {
        ulong labelKeyCount = labelType.GetKeyCount();
        Contracts.Assert(labelKeyCount < Utils.ArrayMaxSize);
        KeyLabelGetter<int> del = GetKeyLabels<int>;
        var methodInfo = del.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(labelType.RawType);
        var parameters = new object[] { trans, labelCol, labelType };
        _labels = (VBuffer<int>)methodInfo.Invoke(this, parameters);
        _numLabels = labelType.GetKeyCountAsInt32(_host) + 1;

        // No need to densify or shift in this case.
        return;
    }

    // Densify and shift labels.
    VBufferUtils.Densify(ref labels);
    Contracts.Assert(labels.IsDense);
    var labelsEditor = VBufferEditor.CreateFromBuffer(ref labels);
    for (int i = 0; i < labels.Length; i++)
    {
        labelsEditor.Values[i] -= min;
        Contracts.Assert(labelsEditor.Values[i] < _numLabels);
    }
    _labels = labelsEditor.Commit();
}
private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearModelParameters predictor, int weightSetCount)
{
    int numFeatures = data.Schema.Feature.Value.Type.GetVectorSize();
    var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Label | CursOpt.Features);
    int numThreads = 1;
    ch.CheckUserArg(numThreads > 0, nameof(_options.NumberOfThreads),
        "The number of threads must be either null or a positive integer.");

    var positiveInstanceWeight = _options.PositiveInstanceWeight;
    VBuffer<float> weights = default;
    float bias = 0.0f;
    if (predictor != null)
    {
        predictor.GetFeatureWeights(ref weights);
        VBufferUtils.Densify(ref weights);
        bias = predictor.Bias;
    }
    else
        weights = VBufferUtils.CreateDense<float>(numFeatures);

    var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights);

    // Reference: Parasail. SymSGD.
    bool tuneLR = _options.LearningRate == null;
    var lr = _options.LearningRate ?? 1.0f;

    bool tuneNumLocIter = (_options.UpdateFrequency == null);
    var numLocIter = _options.UpdateFrequency ?? 1;

    var l2Const = _options.L2Regularization;
    var piw = _options.PositiveInstanceWeight;

    // This is state of the learner that is shared with the native code.
    State state = new State();
    GCHandle stateGCHandle = default;
    try
    {
        stateGCHandle = GCHandle.Alloc(state, GCHandleType.Pinned);

        state.TotalInstancesProcessed = 0;
        using (InputDataManager inputDataManager = new InputDataManager(this, cursorFactory, ch))
        {
            bool shouldInitialize = true;
            using (var pch = Host.StartProgressChannel("Preprocessing"))
                inputDataManager.LoadAsMuchAsPossible();

            int iter = 0;
            if (inputDataManager.IsFullyLoaded)
                ch.Info("Data fully loaded into memory.");
            using (var pch = Host.StartProgressChannel("Training"))
            {
                if (inputDataManager.IsFullyLoaded)
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, state.PassIteration, _options.NumberOfIterations));
                    // If fully loaded, call the SymSGDNative and do not come back until learned for all iterations.
                    Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias,
                        numFeatures, _options.NumberOfIterations, numThreads, tuneNumLocIter, ref numLocIter,
                        _options.Tolerance, _options.Shuffle, shouldInitialize, stateGCHandle, ch.Info);
                    shouldInitialize = false;
                }
                else
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, iter, _options.NumberOfIterations));

                    // Since we loaded data in batch sizes, multiple passes over the loaded data is feasible.
                    int numPassesForABatch = inputDataManager.Count / 10000;
                    while (iter < _options.NumberOfIterations)
                    {
                        // We want to train on the final passes thoroughly (without learning on the same batch multiple times).
                        // This is for fine tuning the AUC. Experimentally, we found that 1 or 2 passes is enough.
                        int numFinalPassesToTrainThoroughly = 2;
                        // We also do not want to learn for more passes than what the user asked.
                        int numPassesForThisBatch = Math.Min(numPassesForABatch,
                            _options.NumberOfIterations - iter - numFinalPassesToTrainThoroughly);
                        // If all of this leaves us with 0 passes, then set numPassesForThisBatch to 1.
                        numPassesForThisBatch = Math.Max(1, numPassesForThisBatch);
                        state.PassIteration = iter;
                        Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias,
                            numFeatures, numPassesForThisBatch, numThreads, tuneNumLocIter, ref numLocIter,
                            _options.Tolerance, _options.Shuffle, shouldInitialize, stateGCHandle, ch.Info);
                        shouldInitialize = false;

                        // Check if we are done with going through the data.
                        if (inputDataManager.FinishedTheLoad)
                        {
                            iter += numPassesForThisBatch;
                            // Check if more passes are left.
                            if (iter < _options.NumberOfIterations)
                                inputDataManager.RestartLoading(_options.Shuffle, Host);
                        }

                        // If more passes are left, load as much as possible.
                        if (iter < _options.NumberOfIterations)
                            inputDataManager.LoadAsMuchAsPossible();
                    }
                }

                // Maps back the dense features that are mislocated.
                if (numThreads > 1)
                    Native.MapBackWeightVector(weightsEditor.Values, stateGCHandle);
                Native.DeallocateSequentially(stateGCHandle);
            }
        }
    }
    finally
    {
        if (stateGCHandle.IsAllocated)
            stateGCHandle.Free();
    }

    return CreatePredictor(weights, bias);
}
// Combines source key names and slot names to produce final slot names.
private void GetSlotNames(int iinfo, ref VBuffer<DvText> dst)
{
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(_concat[iinfo]);
    Host.Assert(_types[iinfo].IsKnownSizeVector);

    // Size one should have been treated the same as Bag (by the caller).
    // Variable size should have thrown (by the caller).
    var typeSrc = Infos[iinfo].TypeSrc;
    Host.Assert(typeSrc.VectorSize > 1);

    // Get the source slot names, defaulting to empty text.
    var namesSlotSrc = default(VBuffer<DvText>);
    var typeSlotSrc = Source.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source);
    if (typeSlotSrc != null && typeSlotSrc.VectorSize == typeSrc.VectorSize && typeSlotSrc.ItemType.IsText)
    {
        Source.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source, ref namesSlotSrc);
        Host.Check(namesSlotSrc.Length == typeSrc.VectorSize);
    }
    else
        namesSlotSrc = VBufferUtils.CreateEmpty<DvText>(typeSrc.VectorSize);

    int keyCount = typeSrc.ItemType.KeyCount;
    int slotLim = _types[iinfo].VectorSize;
    Host.Assert(slotLim == (long)typeSrc.VectorSize * keyCount);

    // Get the source key names, in an array (since we will use them multiple times).
    var namesKeySrc = default(VBuffer<DvText>);
    Source.Schema.GetMetadata(MetadataUtils.Kinds.KeyValues, Infos[iinfo].Source, ref namesKeySrc);
    Host.Check(namesKeySrc.Length == keyCount);

    var keys = new DvText[keyCount];
    namesKeySrc.CopyTo(keys);

    var values = dst.Values;
    if (Utils.Size(values) < slotLim)
        values = new DvText[slotLim];

    var sb = new StringBuilder();
    int slot = 0;
    foreach (var kvpSlot in namesSlotSrc.Items(all: true))
    {
        Contracts.Assert(slot == (long)kvpSlot.Key * keyCount);
        sb.Clear();
        if (kvpSlot.Value.HasChars)
            kvpSlot.Value.AddToStringBuilder(sb);
        else
            sb.Append('[').Append(kvpSlot.Key).Append(']');
        sb.Append('.');

        int len = sb.Length;
        foreach (var key in keys)
        {
            sb.Length = len;
            key.AddToStringBuilder(sb);
            values[slot++] = new DvText(sb.ToString());
        }
    }
    Host.Assert(slot == slotLim);

    dst = new VBuffer<DvText>(slotLim, values, dst.Indices);
}
// Delegates onto instance methods are more efficient than delegates onto static methods.
private void VecTrivialGetter<TDst>(ref VBuffer<TDst> value)
{
    VBufferUtils.Resize(ref value, 1, 0);
}
private PcaPredictor TrainCore(IChannel ch, RoleMappedData data, int dimension)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    if (_rank > dimension)
        throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
    int oversampledRank = Math.Min(_rank + _oversampling, dimension);

    // Exact: (size of the 2 big matrices + other minor allocations) / (2^30).
    Double memoryUsageEstimate = 2.0 * dimension * oversampledRank * sizeof(Float) / 1e9;
    if (memoryUsageEstimate > 2)
        ch.Info("Estimated memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", memoryUsageEstimate);

    var y = Zeros(oversampledRank, dimension);
    var mean = _center ? VBufferUtils.CreateDense<Float>(dimension) : VBufferUtils.CreateEmpty<Float>(dimension);

    var omega = GaussianMatrix(oversampledRank, dimension, _seed);

    var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Weight);
    long numBad;
    Project(Host, cursorFactory, ref mean, omega, y, out numBad);
    if (numBad > 0)
        ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);

    // Orthonormalize Y in-place using the stabilized Gram-Schmidt algorithm.
    // Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
    for (var i = 0; i < oversampledRank; ++i)
    {
        var v = y[i];
        VectorUtils.ScaleBy(ref v, 1 / VectorUtils.Norm(y[i]));

        // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
        for (var j = i + 1; j < oversampledRank; ++j)
        {
            // Subtract the projection of y[j] on v.
            VectorUtils.AddMult(ref v, -VectorUtils.DotProduct(ref v, ref y[j]), ref y[j]);
        }
    }
    var q = y; // q in QR decomposition.

    var b = omega; // Reuse the memory allocated by omega.
    Project(Host, cursorFactory, ref mean, q, b, out numBad);

    // Compute B2 = B' * B.
    var b2 = new Float[oversampledRank * oversampledRank];
    for (var i = 0; i < oversampledRank; ++i)
    {
        for (var j = i; j < oversampledRank; ++j)
            b2[i * oversampledRank + j] = b2[j * oversampledRank + i] = VectorUtils.DotProduct(ref b[i], ref b[j]);
    }

    Float[] smallEigenvalues;  // Eigenvalues of the small matrix B2.
    Float[] smallEigenvectors; // Eigenvectors of the small matrix B2.
    EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors);
    PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);

    return new PcaPredictor(Host, _rank, b, ref mean);
}
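The inner loop above is the modified (stabilized) Gram-Schmidt step of randomized PCA: as soon as row i is normalized, its component is removed from every later row, which is numerically more stable than subtracting all projections at the end. A minimal sketch of the same procedure on plain arrays (a hypothetical helper, not the ML.NET VectorUtils API):

// Modified Gram-Schmidt: orthonormalize the rows of y in place.
static void OrthonormalizeRows(double[][] y)
{
    for (int i = 0; i < y.Length; i++)
    {
        // Normalize row i.
        double norm = 0;
        for (int k = 0; k < y[i].Length; k++)
            norm += y[i][k] * y[i][k];
        norm = Math.Sqrt(norm);
        for (int k = 0; k < y[i].Length; k++)
            y[i][k] /= norm;

        // Immediately remove the y[i]-component from every later row.
        for (int j = i + 1; j < y.Length; j++)
        {
            double dot = 0;
            for (int k = 0; k < y[i].Length; k++)
                dot += y[i][k] * y[j][k];
            for (int k = 0; k < y[i].Length; k++)
                y[j][k] -= dot * y[i][k];
        }
    }
}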
/// <summary>
/// Build a Bing TreeEnsemble .ini representation of the given predictor.
/// </summary>
public static string LinearModelAsIni(ref VBuffer<Float> weights, Float bias, IPredictor predictor = null,
    RoleMappedSchema schema = null, PlattCalibrator calibrator = null)
{
    // TODO: Might need to consider a max line length for the Weights list, requiring us to split it up into
    // multiple evaluators.
    StringBuilder inputBuilder = new StringBuilder();
    StringBuilder aggregatedNodesBuilder = new StringBuilder("Nodes=");
    StringBuilder weightsBuilder = new StringBuilder("Weights=");

    var featureNames = default(VBuffer<ReadOnlyMemory<char>>);
    MetadataUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, weights.Length, ref featureNames);

    int numNonZeroWeights = 0;
    const string weightsSep = "\t";
    VBufferUtils.ForEachDefined(ref weights,
        (idx, value) =>
        {
            if (Math.Abs(value - 0) >= Epsilon)
            {
                numNonZeroWeights++;

                var name = featureNames.GetItemOrDefault(idx);

                inputBuilder.AppendLine("[Input:" + numNonZeroWeights + "]");
                inputBuilder.AppendLine("Name=" + (featureNames.Count == 0 ? "Feature_" + idx : name.IsEmpty ? $"f{idx}" : name.ToString()));
                inputBuilder.AppendLine("Transform=linear");
                inputBuilder.AppendLine("Slope=1");
                inputBuilder.AppendLine("Intercept=0");
                inputBuilder.AppendLine();

                aggregatedNodesBuilder.Append("I:" + numNonZeroWeights + weightsSep);
                weightsBuilder.Append(value + weightsSep);
            }
        });

    StringBuilder builder = new StringBuilder();
    builder.AppendLine("[TreeEnsemble]");
    builder.AppendLine("Inputs=" + numNonZeroWeights);
    builder.AppendLine("Evaluators=1");
    builder.AppendLine();
    builder.AppendLine(inputBuilder.ToString());
    builder.AppendLine("[Evaluator:1]");
    builder.AppendLine("EvaluatorType=Aggregator");
    builder.AppendLine("Type=Linear");
    builder.AppendLine("Bias=" + bias);
    builder.AppendLine("NumNodes=" + numNonZeroWeights);
    builder.AppendLine(aggregatedNodesBuilder.ToString().Trim());
    builder.AppendLine(weightsBuilder.ToString().Trim());

#if false
    // REVIEW: This should be done by the caller using the actual training args!
    builder.AppendLine();
    builder.AppendLine("[Comments]");
    builder.Append("Trained by TLC");
    if (predictor != null)
    {
        builder.Append(" as /cl " + predictor.GetType().Name);
        if (predictor is IInitializable)
        {
            string settings = string.Join(";", (predictor as IInitializable).GetSettings());
            if (!string.IsNullOrEmpty(settings))
                builder.Append(" /cls " + settings);
        }
    }
#endif

    string ini = builder.ToString();

    // Add the calibration if the model was trained with calibration.
    if (calibrator != null)
    {
        string calibratorEvaluatorIni = IniFileUtils.GetCalibratorEvaluatorIni(ini, calibrator);
        ini = IniFileUtils.AddEvaluator(ini, calibratorEvaluatorIni);
    }

    return ini;
}
private Delegate MakeGetterVec<T>(int length)
{
    return (ValueGetter<VBuffer<T>>)((ref VBuffer<T> value) =>
        VBufferUtils.Resize(ref value, length, 0));
}
/// <summary>
/// Convenience function to construct a working vector of length <c>Dim</c>.
/// </summary>
protected VBuffer<Float> CreateWorkingVector()
{
    // Owing to the way the operations are structured, if the "x", "newX", and "dir" vectors
    // start out (or somehow naturally become) dense, they will remain dense.
    return _keepDense ? VBufferUtils.CreateDense<Float>(Dim) : VBufferUtils.CreateEmpty<Float>(Dim);
}
protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, Float loss, int numParams)
{
    Contracts.AssertValue(ch);
    Contracts.AssertValue(cursorFactory);
    Contracts.Assert(NumGoodRows > 0);
    Contracts.Assert(WeightSum > 0);
    Contracts.Assert(BiasCount == 1);
    Contracts.Assert(loss >= 0);
    Contracts.Assert(numParams >= BiasCount);
    Contracts.Assert(CurrentWeights.IsDense);

    ch.Info("Model trained with {0} training examples.", NumGoodRows);

    // Compute deviance: start with loss function.
    Float deviance = (Float)(2 * loss * WeightSum);

    if (L2Weight > 0)
    {
        // Need to subtract L2 regularization loss.
        // The bias term is not regularized.
        var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight;
        deviance -= regLoss;
    }

    if (L1Weight > 0)
    {
        // Need to subtract L1 regularization loss.
        // The bias term is not regularized.
        Double regLoss = 0;
        VBufferUtils.ForEachDefined(ref CurrentWeights,
            (ind, value) =>
            {
                if (ind >= BiasCount)
                    regLoss += Math.Abs(value);
            });
        deviance -= (Float)regLoss * L1Weight * 2;
    }

    ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0));

    // Compute null deviance, i.e., the deviance of null hypothesis.
    // Cap the prior positive rate at 1e-15.
    Double priorPosRate = _posWeight / WeightSum;
    Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1);
    Float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15)
        ? 0f
        : (Float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true));
    ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1);

    // Compute AIC.
    ch.Info("AIC: \t{0}", 2 * numParams + deviance);

    // Show the coefficients statistics table.
    var featureColIdx = cursorFactory.Data.Schema.Feature.Index;
    var schema = cursorFactory.Data.Data.Schema;
    var featureLength = CurrentWeights.Length - BiasCount;
    var namesSpans = VBufferUtils.CreateEmpty<DvText>(featureLength);
    if (schema.HasSlotNames(featureColIdx, featureLength))
        schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans);
    Host.Assert(namesSpans.Length == featureLength);

    // Inverse mapping of non-zero weight slots.
    Dictionary<int, int> weightIndicesInvMap = null;
    // Indices of bias and non-zero weight slots.
    int[] weightIndices = null;
    // Whether all weights are non-zero.
    bool denseWeight = numParams == CurrentWeights.Length;

    // Extract non-zero indices of weight.
    if (!denseWeight)
    {
        weightIndices = new int[numParams];
        weightIndicesInvMap = new Dictionary<int, int>(numParams);
        weightIndices[0] = 0;
        weightIndicesInvMap[0] = 0;
        int j = 1;
        for (int i = 1; i < CurrentWeights.Length; i++)
        {
            if (CurrentWeights.Values[i] != 0)
            {
                weightIndices[j] = i;
                weightIndicesInvMap[i] = j++;
            }
        }
        Contracts.Assert(j == numParams);
    }

    // Compute the standard error of coefficients.
    long hessianDimension = (long)numParams * (numParams + 1) / 2;
    if (hessianDimension > int.MaxValue)
    {
        ch.Warning("The number of parameters is too large. Cannot hold the variance-covariance matrix in memory. "
            + "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer "
            + "to reduce the number of parameters.");
        _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);
        return;
    }

    // Building the variance-covariance matrix for parameters.
    // The layout of this algorithm is a packed row-major lower triangular matrix.
    // E.g., layout of indices for 4-by-4:
    // 0
    // 1 2
    // 3 4 5
    // 6 7 8 9
    var hessian = new Double[hessianDimension];

    // Initialize diagonal elements with L2 regularizers except for the first entry (index 0),
    // since the bias is not regularized.
    if (L2Weight > 0)
    {
        // i is the array index of the diagonal entry at iRow-th row and iRow-th column.
        // iRow is one-based.
        int i = 0;
        for (int iRow = 2; iRow <= numParams; iRow++)
        {
            i += iRow;
            hessian[i] = L2Weight;
        }
        Contracts.Assert(i == hessian.Length - 1);
    }

    // Initialize the remaining entries.
    var bias = CurrentWeights.Values[0];
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            var label = cursor.Label;
            var weight = cursor.Weight;
            var score = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features);
            // Compute Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example.
            var variance = weight / (2 + 2 * Math.Cosh(score));

            // Increment the first entry of hessian.
            hessian[0] += variance;

            var values = cursor.Features.Values;
            if (cursor.Features.IsDense)
            {
                int ioff = 1;

                // Increment remaining entries of hessian.
                for (int i = 1; i < numParams; i++)
                {
                    ch.Assert(ioff == i * (i + 1) / 2);
                    int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1;
                    Contracts.Assert(0 <= wi && wi < cursor.Features.Length);
                    var val = values[wi] * variance;
                    // Add the implicit first bias term to X'X.
                    hessian[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int j = 0; j < i; j++)
                    {
                        int wj = weightIndices == null ? j : weightIndices[j + 1] - 1;
                        Contracts.Assert(0 <= wj && wj < cursor.Features.Length);
                        hessian[ioff++] += val * values[wj];
                    }
                }
                ch.Assert(ioff == hessian.Length);
            }
            else
            {
                var indices = cursor.Features.Indices;
                for (int ii = 0; ii < cursor.Features.Count; ++ii)
                {
                    int i = indices[ii];
                    int wi = i + 1;
                    if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi))
                        continue;
                    Contracts.Assert(0 < wi && wi <= cursor.Features.Length);
                    int ioff = wi * (wi + 1) / 2;
                    var val = values[ii] * variance;
                    // Add the implicit first bias term to X'X.
                    hessian[ioff] += val;
                    // Add the remainder of X'X.
                    for (int jj = 0; jj <= ii; jj++)
                    {
                        int j = indices[jj];
                        int wj = j + 1;
                        if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj))
                            continue;
                        Contracts.Assert(0 < wj && wj <= cursor.Features.Length);
                        hessian[ioff + wj] += val * values[jj];
                    }
                }
            }
        }
    }

    // Apply Cholesky decomposition to find the inverse of the Hessian.
    Double[] invHessian = null;
    try
    {
        // First, find the Cholesky decomposition LL' of the Hessian.
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
        // Note that hessian is already modified at this point. It is no longer the original Hessian,
        // but instead represents the Cholesky decomposition L.
        // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
        // of the original information matrix.
        Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
        // At this point, hessian should contain the inverse of the original Hessian matrix.
        // Swap hessian with invHessian to avoid confusion in the following context.
        Utils.Swap(ref hessian, ref invHessian);
        Contracts.Assert(hessian == null);
    }
    catch (DllNotFoundException)
    {
        throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing.");
    }

    Float[] stdErrorValues = new Float[numParams];
    stdErrorValues[0] = (Float)Math.Sqrt(invHessian[0]);

    for (int i = 1; i < numParams; i++)
    {
        // Initialize with inverse Hessian.
        stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i];
    }

    if (L2Weight > 0)
    {
        // Iterate through all entries of inverse Hessian to make adjustment to variance.
        // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
        // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/
        // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf
        int ioffset = 1;
        for (int iRow = 1; iRow < numParams; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (Single)invHessian[ioffset];
                var adjustment = -L2Weight * entry * entry;
                stdErrorValues[iRow] -= adjustment;
                if (0 < iCol && iCol < iRow)
                    stdErrorValues[iCol] -= adjustment;
                ioffset++;
            }
        }
        Contracts.Assert(ioffset == invHessian.Length);
    }

    for (int i = 1; i < numParams; i++)
        stdErrorValues[i] = (Float)Math.Sqrt(stdErrorValues[i]);

    VBuffer<Float> stdErrors = new VBuffer<Float>(CurrentWeights.Length, numParams, stdErrorValues, weightIndices);
    _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, ref stdErrors);
}
/// <summary>
/// An implementation of the line search for the Wolfe conditions, from Nocedal &amp; Wright.
/// </summary>
internal virtual bool LineSearch(IChannel ch, bool force)
{
    Contracts.AssertValue(ch);
    Float dirDeriv = VectorUtils.DotProduct(ref _dir, ref _grad);

    if (dirDeriv == 0)
        throw ch.Process(new PrematureConvergenceException(this, "Directional derivative is zero. You may be sitting on the optimum."));

    // If a non-descent direction is chosen, the line search will break anyway, so throw here.
    // The most likely reason for this is a bug in your function's gradient computation.
    ch.Check(dirDeriv < 0, "L-BFGS chose a non-descent direction.");

    Float c1 = (Float)1e-4 * dirDeriv;
    Float c2 = (Float)0.9 * dirDeriv;

    Float alpha = (Iter == 1 ? (1 / VectorUtils.Norm(_dir)) : 1);

    PointValueDeriv last = new PointValueDeriv(0, LastValue, dirDeriv);
    PointValueDeriv aLo = new PointValueDeriv();
    PointValueDeriv aHi = new PointValueDeriv();

    // Initial bracketing phase.
    while (true)
    {
        VectorUtils.AddMultInto(ref _x, alpha, ref _dir, ref _newX);
        if (EnforceNonNegativity)
        {
            VBufferUtils.Apply(ref _newX,
                delegate(int ind, ref Float newXval)
                {
                    if (newXval < 0.0)
                        newXval = 0;
                });
        }

        Value = Eval(ref _newX, ref _newGrad);
        GradientCalculations++;
        if (Float.IsPositiveInfinity(Value))
        {
            alpha /= 2;
            continue;
        }

        if (!FloatUtils.IsFinite(Value))
            throw ch.Except("Optimizer unable to proceed with loss function yielding {0}", Value);

        dirDeriv = VectorUtils.DotProduct(ref _dir, ref _newGrad);
        PointValueDeriv curr = new PointValueDeriv(alpha, Value, dirDeriv);
        if ((curr.V > LastValue + c1 * alpha) || (last.A > 0 && curr.V >= last.V))
        {
            aLo = last;
            aHi = curr;
            break;
        }
        else if (Math.Abs(curr.D) <= -c2)
            return true;
        else if (curr.D >= 0)
        {
            aLo = curr;
            aHi = last;
            break;
        }

        last = curr;
        if (alpha == 0)
            alpha = Float.Epsilon; // Robust to divisional underflow.
        else
            alpha *= 2;
    }

    Float minChange = (Float)0.01;
    int maxSteps = 10;

    // This loop is the "zoom" procedure described in Nocedal & Wright.
    for (int step = 0; ; ++step)
    {
        if (step == maxSteps && !force)
            return false;

        PointValueDeriv left = aLo.A < aHi.A ? aLo : aHi;
        PointValueDeriv right = aLo.A < aHi.A ? aHi : aLo;
        if (left.D > 0 && right.D < 0)
        {
            // An interpolating cubic would have its max in range, not its min (can this happen?);
            // set alpha to the endpoint with the smaller value.
            alpha = aLo.V < aHi.V ? aLo.A : aHi.A;
        }
        else
        {
            alpha = CubicInterp(aLo, aHi);
            if (Float.IsNaN(alpha) || Float.IsInfinity(alpha))
                alpha = (aLo.A + aHi.A) / 2;
        }

        // This is to ensure that the new point is within bounds
        // and that the change is reasonably sized.
        Float ub = (minChange * left.A + (1 - minChange) * right.A);
        if (alpha > ub)
            alpha = ub;
        Float lb = (minChange * right.A + (1 - minChange) * left.A);
        if (alpha < lb)
            alpha = lb;

        VectorUtils.AddMultInto(ref _x, alpha, ref _dir, ref _newX);
        if (EnforceNonNegativity)
        {
            VBufferUtils.Apply(ref _newX,
                delegate(int ind, ref Float newXval)
                {
                    if (newXval < 0.0)
                        newXval = 0;
                });
        }

        Value = Eval(ref _newX, ref _newGrad);
        GradientCalculations++;
        if (!FloatUtils.IsFinite(Value))
            throw ch.Except("Optimizer unable to proceed with loss function yielding {0}", Value);
        dirDeriv = VectorUtils.DotProduct(ref _dir, ref _newGrad);

        PointValueDeriv curr = new PointValueDeriv(alpha, Value, dirDeriv);

        if ((curr.V > LastValue + c1 * alpha) || (curr.V >= aLo.V))
        {
            if (aHi.A == curr.A)
            {
                if (force)
                    throw ch.Process(new PrematureConvergenceException(this, "Step size interval numerically zero."));
                else
                    return false;
            }
            aHi = curr;
        }
        else if (Math.Abs(curr.D) <= -c2)
            return true;
        else
        {
            if (curr.D * (aHi.A - aLo.A) >= 0)
                aHi = aLo;
            if (aLo.A == curr.A)
            {
                if (force)
                    throw ch.Process(new PrematureConvergenceException(this, "Step size interval numerically zero."));
                else
                    return false;
            }
            aLo = curr;
        }
    }
}
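For reference, the acceptance tests in both phases above are the strong Wolfe conditions from Nocedal & Wright. With search direction p = _dir and step size α (the code folds the directional derivative ∇f(x)ᵀp into its c1 and c2 constants, which is why the curvature test reads Math.Abs(curr.D) <= -c2), they are:

f(x + \alpha p) \le f(x) + 10^{-4}\,\alpha\,\nabla f(x)^{\top} p \qquad \text{(sufficient decrease)}

\left|\nabla f(x + \alpha p)^{\top} p\right| \le 0.9\,\left|\nabla f(x)^{\top} p\right| \qquad \text{(curvature)}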
GetImportanceMetricsMatrix( IHostEnvironment env, IPredictionTransformer <TModel> model, IDataView data, Func <TResult> resultInitializer, Func <IDataView, TMetric> evaluationFunc, Func <TMetric, TMetric, TMetric> deltaFunc, string features, int permutationCount, bool useFeatureWeightFilter = false, int?topExamples = null) { Contracts.CheckValue(env, nameof(env)); var host = env.Register(nameof(PermutationFeatureImportance <TModel, TMetric, TResult>)); host.CheckValue(model, nameof(model)); host.CheckValue(data, nameof(data)); host.CheckNonEmpty(features, nameof(features)); topExamples = topExamples ?? Utils.ArrayMaxSize; host.Check(topExamples > 0, "Provide how many examples to use (positive number) or set to null to use whole dataset."); VBuffer <ReadOnlyMemory <char> > slotNames = default; var metricsDelta = new List <TResult>(); using (var ch = host.Start("GetImportanceMetrics")) { ch.Trace("Scoring and evaluating baseline."); var baselineMetrics = evaluationFunc(model.Transform(data)); // Get slot names. var featuresColumn = data.Schema[features]; int numSlots = featuresColumn.Type.GetVectorSize(); data.Schema.TryGetColumnIndex(features, out int featuresColumnIndex); ch.Info("Number of slots: " + numSlots); if (data.Schema[featuresColumnIndex].HasSlotNames(numSlots)) { data.Schema[featuresColumnIndex].Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref slotNames); } if (slotNames.Length != numSlots) { slotNames = VBufferUtils.CreateEmpty <ReadOnlyMemory <char> >(numSlots); } VBuffer <float> weights = default; var workingFeatureIndices = Enumerable.Range(0, numSlots).ToList(); int zeroWeightsCount = 0; // By default set to the number of all features available. var evaluatedFeaturesCount = numSlots; if (useFeatureWeightFilter) { var predictorWithWeights = model.Model as IPredictorWithFeatureWeights <Single>; if (predictorWithWeights != null) { predictorWithWeights.GetFeatureWeights(ref weights); const int maxReportedZeroFeatures = 10; StringBuilder msgFilteredOutFeatures = new StringBuilder("The following features have zero weight and will not be evaluated: \n \t"); var prefix = ""; foreach (var k in weights.Items(all: true)) { if (k.Value == 0) { zeroWeightsCount++; // Print info about first few features we're not going to evaluate. if (zeroWeightsCount <= maxReportedZeroFeatures) { msgFilteredOutFeatures.Append(prefix); msgFilteredOutFeatures.Append(GetSlotName(slotNames, k.Key)); prefix = ", "; } } else { workingFeatureIndices.Add(k.Key); } } // Old FastTree models has less weights than slots. if (weights.Length < numSlots) { ch.Warning( "Predictor had fewer features than slots. All unknown features will get default 0 weight."); zeroWeightsCount += numSlots - weights.Length; var indexes = weights.GetIndices().ToArray(); var values = weights.GetValues().ToArray(); var count = values.Length; weights = new VBuffer <float>(numSlots, count, values, indexes); } evaluatedFeaturesCount = workingFeatureIndices.Count; ch.Info("Number of zero weights: {0} out of {1}.", zeroWeightsCount, weights.Length); // Print what features have 0 weight if (zeroWeightsCount > 0) { if (zeroWeightsCount > maxReportedZeroFeatures) { msgFilteredOutFeatures.Append(string.Format("... (printing out {0} features here).\n Use 'Index' column in the report for info on what features are not evaluated.", maxReportedZeroFeatures)); } ch.Info(msgFilteredOutFeatures.ToString()); } } } if (workingFeatureIndices.Count == 0 && zeroWeightsCount == 0) { // Use all features otherwise. 
            workingFeatureIndices.AddRange(Enumerable.Range(0, numSlots));
        }

        if (zeroWeightsCount == numSlots)
        {
            ch.Warning("All features have 0 weight, thus no thorough evaluation can be done.");
            return metricsDelta.ToImmutableArray();
        }

        // Note: this will not work on a huge dataset.
        var maxSize = topExamples;
        List<float> initialfeatureValuesList = new List<float>();

        // Cursor through the data to cache slot 0 values for the upcoming permutation.
        var valuesRowCount = 0;
        // REVIEW: Seems like if the labels are NaN, so that all metrics are NaN, this command will be useless.
        // In which case erroring out is probably the most useful thing.
        using (var cursor = data.GetRowCursor(featuresColumn))
        {
            var featuresGetter = cursor.GetGetter<VBuffer<float>>(featuresColumn);
            var featuresBuffer = default(VBuffer<float>);
            while (initialfeatureValuesList.Count < maxSize && cursor.MoveNext())
            {
                featuresGetter(ref featuresBuffer);
                initialfeatureValuesList.Add(featuresBuffer.GetItemOrDefault(workingFeatureIndices[0]));
            }
            valuesRowCount = initialfeatureValuesList.Count;
        }

        if (valuesRowCount > 0)
        {
            ch.Info("Detected {0} examples for evaluation.", valuesRowCount);
        }
        else
        {
            ch.Warning("Detected no examples for evaluation.");
            return metricsDelta.ToImmutableArray();
        }

        float[] featureValuesBuffer = initialfeatureValuesList.ToArray();
        float[] nextValues = new float[valuesRowCount];

        // Now iterate through all the working slots, permute each one, and compute the metric deltas.
        int processedCnt = 0;
        int nextFeatureIndex = 0;
        var shuffleRand = RandomUtils.Create(host.Rand.Next());
        using (var pch = host.StartProgressChannel("Calculating Permutation Feature Importance"))
        {
            pch.SetHeader(new ProgressHeader("processed slots"), e => e.SetProgress(0, processedCnt));
            foreach (var workingIndx in workingFeatureIndices)
            {
                // Index of the feature we will permute next. Needed to build a buffer for the permutation in advance.
                if (processedCnt < workingFeatureIndices.Count - 1)
                    nextFeatureIndex = workingFeatureIndices[processedCnt + 1];

                // Used for pre-caching the next feature.
                int nextValuesIndex = 0;

                SchemaDefinition input = SchemaDefinition.Create(typeof(FeaturesBuffer));
                Contracts.Assert(input.Count == 1);
                input[0].ColumnName = features;

                SchemaDefinition output = SchemaDefinition.Create(typeof(FeaturesBuffer));
                Contracts.Assert(output.Count == 1);
                output[0].ColumnName = features;
                output[0].ColumnType = featuresColumn.Type;

                // Perform multiple permutations for one feature to build a confidence interval.
                var metricsDeltaForFeature = resultInitializer();
                for (int permutationIteration = 0; permutationIteration < permutationCount; permutationIteration++)
                {
                    Utils.Shuffle<float>(shuffleRand, featureValuesBuffer);

                    Action<FeaturesBuffer, FeaturesBuffer, PermuterState> permuter =
                        (src, dst, state) =>
                        {
                            src.Features.CopyTo(ref dst.Features);
                            VBufferUtils.ApplyAt(ref dst.Features, workingIndx,
                                (int ii, ref float d) => d = featureValuesBuffer[state.SampleIndex++]);

                            // Is it time to pre-cache the next feature?
                            if (permutationIteration == permutationCount - 1 &&
                                processedCnt < workingFeatureIndices.Count - 1)
                            {
                                // Fill out featureValuesBuffer for the next feature while updating the current feature.
                                // This is the reason PermuterState is needed in LambdaTransform.CreateMap.
                                nextValues[nextValuesIndex] = src.Features.GetItemOrDefault(nextFeatureIndex);
                                if (nextValuesIndex < valuesRowCount - 1)
                                    nextValuesIndex++;
                            }
                        };

                    IDataView viewPermuted = LambdaTransform.CreateMap(
                        host, data, permuter, null, input, output);
                    if (valuesRowCount == topExamples)
                        viewPermuted = SkipTakeFilter.Create(host, new SkipTakeFilter.TakeOptions() { Count = valuesRowCount }, viewPermuted);

                    var metrics = evaluationFunc(model.Transform(viewPermuted));

                    var delta = deltaFunc(metrics, baselineMetrics);
                    metricsDeltaForFeature.Add(delta);
                }

                // Add the metrics delta to the list.
                metricsDelta.Add(metricsDeltaForFeature);

                // Swap values for the next iteration of permutation.
                if (processedCnt < workingFeatureIndices.Count - 1)
                {
                    Array.Clear(featureValuesBuffer, 0, featureValuesBuffer.Length);
                    nextValues.CopyTo(featureValuesBuffer, 0);
                    Array.Clear(nextValues, 0, nextValues.Length);
                }
                processedCnt++;
            }
            pch.Checkpoint(processedCnt, processedCnt);
        }
    }

    return metricsDelta.ToImmutableArray();
}
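A usage sketch of how this helper is typically reached through the public API. The pipeline, column names, and data are illustrative, and the exact shape of the PermutationFeatureImportance extension has shifted across ML.NET releases, so treat this as an approximation rather than the definitive call.

using System.Linq;
using Microsoft.ML;

public static class PfiUsageSketch
{
    public static void Run(MLContext mlContext, IDataView trainData)
    {
        // Assumes 'trainData' already has a "Features" vector column and a "Label" column.
        var model = mlContext.Regression.Trainers.Sdca(labelColumnName: "Label").Fit(trainData);
        var scored = model.Transform(trainData);

        // permutationCount > 1 repeats each shuffle, which is what lets the
        // TResult accumulators above report a mean and standard error per slot.
        var pfi = mlContext.Regression.PermutationFeatureImportance(
            model, scored, labelColumnName: "Label", permutationCount: 3);

        // Rank slots by how much permuting them degraded R^2 (most important first).
        var ranked = pfi
            .Select((metrics, index) => (index, drop: metrics.RSquared.Mean))
            .OrderBy(t => t.drop)
            .ToArray();
    }
}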
/// <summary>
/// Drops slots from src and populates dst with the resulting vector. Slots are
/// dropped based on the min and max slot ranges that were passed to the constructor.
/// </summary>
public void DropSlots<TDst>(ref VBuffer<TDst> src, ref VBuffer<TDst> dst)
{
    if (src.Length <= SlotsMin[0])
    {
        // There is nothing to drop, just swap the buffers.
        Utils.Swap(ref src, ref dst);
        return;
    }

    int newLength = DstLength == 0 ? ComputeLength(src.Length) : DstLength;
    if (newLength == 0)
    {
        // All slots dropped.
        VBufferUtils.Resize(ref dst, 1, 0);
        return;
    }

    Contracts.Assert(newLength < src.Length);

    // End of the trivial cases.
    // At this point, we need to drop some slots and keep some slots.
    VBufferEditor<TDst> editor;
    var srcValues = src.GetValues();
    if (src.IsDense)
    {
        editor = VBufferEditor.Create(ref dst, newLength);

        int iDst = 0;
        int iSrc = 0;
        for (int i = 0; i < SlotsMax.Length && iSrc < src.Length; i++)
        {
            // Copy the values that precede the current dropped range, then jump past it.
            var lim = Math.Min(SlotsMin[i], src.Length);
            while (iSrc < lim)
            {
                Contracts.Assert(iDst <= iSrc);
                editor.Values[iDst++] = srcValues[iSrc++];
            }
            iSrc = SlotsMax[i] + 1;
        }
        while (iSrc < src.Length)
        {
            Contracts.Assert(iDst <= iSrc);
            editor.Values[iDst++] = srcValues[iSrc++];
        }
        Contracts.Assert(iDst == newLength);
        dst = editor.Commit();
        return;
    }

    // Sparse case.
    // Approximate new count is min(#indices, newLength).
    var newCount = Math.Min(srcValues.Length, newLength);
    var indices = dst.GetIndices();
    var srcIndices = src.GetIndices();
    Contracts.Assert(newCount <= src.Length);
    editor = VBufferEditor.Create(
        ref dst,
        newLength,
        newCount,
        requireIndicesOnDense: true);

    int iiDst = 0;
    int iiSrc = 0;
    int iOffset = 0;
    int iRange = 0;
    int min = SlotsMin[iRange];
    // REVIEW: Consider using a BitArray with the slots to keep instead of SlotsMax. It would
    // only make sense when the number of ranges is greater than the number of slots divided by 32.
    int max = SlotsMax[iRange];
    while (iiSrc < srcValues.Length)
    {
        // Copy (with offset) the elements before the current range.
        var index = srcIndices[iiSrc];
        if (index < min)
        {
            Contracts.Assert(iiDst <= iiSrc);
            editor.Indices[iiDst] = index - iOffset;
            editor.Values[iiDst++] = srcValues[iiSrc++];
            continue;
        }
        if (index <= max)
        {
            // Skip elements in the current range.
            iiSrc++;
            continue;
        }

        // Find the next range.
        const int threshold1 = 20;
        const int threshold2 = 10;
        while (++iRange < SlotsMax.Length && SlotsMax[iRange] < index)
        {
            if (SlotsMax.Length - iRange >= threshold1 && SlotsMax[iRange + threshold2] < index)
            {
                iRange = SlotsMax.FindIndexSorted(iRange + threshold2, SlotsMax.Length, index);
                Contracts.Assert(iRange == SlotsMax.Length ||
                    iRange > 0 && SlotsMax[iRange - 1] < index && index <= SlotsMax[iRange]);
                break;
            }
        }
        if (iRange < SlotsMax.Length)
        {
            min = SlotsMin[iRange];
            max = SlotsMax[iRange];
        }
        else
            min = max = src.Length;
        if (iRange > 0)
            iOffset = _lengthReduction[iRange - 1];
        Contracts.Assert(index <= max);
    }

    dst = editor.CommitTruncated(iiDst);
}
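To make the dense path concrete, here is a self-contained sketch of the same two-pointer copy over plain arrays. The names are hypothetical (this is not the transform's API), and it assumes slotsMin/slotsMax describe sorted, non-overlapping inclusive ranges, just as SlotsMin/SlotsMax do above.

using System;
using System.Collections.Generic;

internal static class DropSlotsSketch
{
    // Copies every element of 'src' whose index falls outside the inclusive
    // ranges [slotsMin[i], slotsMax[i]]; ranges must be sorted and disjoint.
    public static T[] DropSlotsDense<T>(T[] src, int[] slotsMin, int[] slotsMax)
    {
        var dst = new List<T>();
        int iSrc = 0;
        for (int i = 0; i < slotsMax.Length && iSrc < src.Length; i++)
        {
            // Copy the elements that precede the current dropped range...
            int lim = Math.Min(slotsMin[i], src.Length);
            while (iSrc < lim)
                dst.Add(src[iSrc++]);
            // ...then jump past the range itself.
            iSrc = slotsMax[i] + 1;
        }
        // Copy whatever survives after the last range.
        while (iSrc < src.Length)
            dst.Add(src[iSrc++]);
        return dst.ToArray();
    }
}

// e.g. DropSlotsDense(new[] { 10, 11, 12, 13, 14, 15 }, new[] { 1, 4 }, new[] { 2, 4 })
// drops slots 1-2 and 4, returning { 10, 13, 15 }.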