protected internal OptimizerState(IChannel ch, IProgressChannelProvider progress, ref VBuffer<Float> initial,
    int m, long totalMemLimit, bool keepDense, bool enforceNonNegativity)
{
    Contracts.AssertValue(ch);
    Ch = ch;
    ch.AssertValueOrNull(progress);
    ProgressProvider = progress;
    Iter = 1;

    _keepDense = keepDense;
    Dim = initial.Length;

    _x = CreateWorkingVector();
    initial.CopyTo(ref _x);
    _m = m;
    _totalMemLimit = totalMemLimit;

    _grad = CreateWorkingVector();
    _dir = CreateWorkingVector();
    _newX = CreateWorkingVector();
    _newGrad = CreateWorkingVector();
    _steepestDescDir = CreateWorkingVector();

    _sList = new VBuffer<Float>[_m];
    _yList = new VBuffer<Float>[_m];
    _roList = new List<Float>();

    EnforceNonNegativity = enforceNonNegativity;
}
/// <summary>
/// Minimize a function.
/// </summary>
/// <param name="function">The function to minimize</param>
/// <param name="initial">The initial point</param>
/// <param name="result">The point at the optimum</param>
/// <param name="optimum">The optimum function value</param>
/// <exception cref="PrematureConvergenceException">Thrown if successive points are within numeric precision of each other, but the termination condition is still unsatisfied.</exception>
public void Minimize(DifferentiableFunction function, ref VBuffer<Float> initial, ref VBuffer<Float> result, out Float optimum)
{
    Minimize(function, ref initial, _staticTerm, ref result, out optimum);
}
protected Float DifferentiableFunctionStream(FloatLabelCursor.Factory cursorFactory, ref VBuffer<Float> xDense, ref VBuffer<Float> grad, IProgressChannel pch)
{
    Contracts.AssertValue(cursorFactory);
    VBufferUtils.Clear(ref grad);
    VBufferUtils.Densify(ref grad);

    Float[] scratch = null;
    double loss = 0;
    long count = 0;
    if (pch != null)
        pch.SetHeader(new ProgressHeader(null, new[] { "examples" }), e => e.SetProgress(0, count));
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            loss += AccumulateOneGradient(ref cursor.Features, cursor.Label, cursor.Weight, ref xDense, ref grad, ref scratch);
            count++;
        }
    }

    // Accumulate the loss in a double to avoid roundoff error
    // (see http://mathworld.wolfram.com/RoundoffError.html for a definition of roundoff error),
    // then convert back to Float to match the function signature.
    return (Float)loss;
}
public override Float Eval(ref VBuffer<Float> input, ref VBuffer<Float> gradient)
{
    return Function(ref input, ref gradient, ProgressProvider);
}
private void TrainCore(IChannel ch, RoleMappedData data) { Host.AssertValue(ch); ch.AssertValue(data); // Compute the number of threads to use. The ctor should have verified that this will // produce a positive value. int numThreads = !UseThreads ? 1 : (NumThreads ?? Environment.ProcessorCount); if (Host.ConcurrencyFactor > 0 && numThreads > Host.ConcurrencyFactor) { numThreads = Host.ConcurrencyFactor; ch.Warning("The number of threads specified in trainer arguments is larger than the concurrency factor " + "setting of the environment. Using {0} training threads instead.", numThreads); } ch.Assert(numThreads > 0); NumGoodRows = 0; WeightSum = 0; _features = null; _labels = null; _weights = null; if (numThreads > 1) { ch.Info("LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory " + "issues, add 'numThreads=1' to the trainer arguments and 'cache=-' to the command line " + "arguments to turn off multi-threading."); _features = new VBuffer <Float> [1000]; _labels = new Float[1000]; if (data.Schema.Weight != null) { _weights = new Float[1000]; } } var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Features | CursOpt.Label | CursOpt.Weight); long numBad; // REVIEW: This pass seems overly expensive for the benefit when multi-threading is off.... using (var cursor = cursorFactory.Create()) using (var pch = Host.StartProgressChannel("LBFGS data prep")) { // REVIEW: maybe it makes sense for the factory to capture the good row count after // the first successful cursoring? Double totalCount = data.Data.GetRowCount(true) ?? Double.NaN; long exCount = 0; pch.SetHeader(new ProgressHeader(null, new[] { "examples" }), e => e.SetProgress(0, exCount, totalCount)); while (cursor.MoveNext()) { WeightSum += cursor.Weight; if (ShowTrainingStats) { ProcessPriorDistribution(cursor.Label, cursor.Weight); } PreTrainingProcessInstance(cursor.Label, ref cursor.Features, cursor.Weight); exCount++; if (_features != null) { ch.Assert(cursor.KeptRowCount <= int.MaxValue); int index = (int)cursor.KeptRowCount - 1; Utils.EnsureSize(ref _features, index + 1); Utils.EnsureSize(ref _labels, index + 1); if (_weights != null) { Utils.EnsureSize(ref _weights, index + 1); _weights[index] = cursor.Weight; } Utils.Swap(ref _features[index], ref cursor.Features); _labels[index] = cursor.Label; if (cursor.KeptRowCount >= int.MaxValue) { ch.Warning("Limiting data size for multi-threading"); break; } } } NumGoodRows = cursor.KeptRowCount; numBad = cursor.SkippedRowCount; } ch.Check(NumGoodRows > 0, NoTrainingInstancesMessage); if (numBad > 0) { ch.Warning("Skipped {0} instances with missing features/label/weight during training", numBad); } if (_features != null) { ch.Assert(numThreads > 1); // If there are so many threads that each only gets a small number (less than 10) of instances, trim // the number of threads so each gets a more reasonable number (100 or so). These numbers are pretty arbitrary, // but avoid the possibility of having no instances on some threads. if (numThreads > 1 && NumGoodRows / numThreads < 10) { int numNew = Math.Max(1, (int)NumGoodRows / 100); ch.Warning("Too few instances to use {0} threads, decreasing to {1} thread(s)", numThreads, numNew); numThreads = numNew; } ch.Assert(numThreads > 0); // Divide up the instances among the threads. _numChunks = numThreads; _ranges = new int[_numChunks + 1]; int cinstTot = (int)NumGoodRows; for (int ichk = 0, iinstMin = 0; ichk < numThreads; ichk++) { int cchkLeft = numThreads - ichk; // Number of chunks left to fill. 
ch.Assert(0 < cchkLeft && cchkLeft <= numThreads); int cinstThis = (cinstTot - iinstMin + cchkLeft - 1) / cchkLeft; // Size of this chunk. ch.Assert(0 < cinstThis && cinstThis <= cinstTot - iinstMin); iinstMin += cinstThis; _ranges[ichk + 1] = iinstMin; } _localLosses = new Float[numThreads]; _localGradients = new VBuffer <Float> [numThreads - 1]; int size = BiasCount + WeightCount; for (int i = 0; i < _localGradients.Length; i++) { _localGradients[i] = VBufferUtils.CreateEmpty <Float>(size); } ch.Assert(_numChunks > 0 && _data == null); } else { // Streaming, single-threaded case. _data = data; _cursorFactory = cursorFactory; ch.Assert(_numChunks == 0 && _data != null); } VBuffer <Float> initWeights; ITerminationCriterion terminationCriterion; Optimizer opt = InitializeOptimizer(ch, cursorFactory, out initWeights, out terminationCriterion); opt.Quiet = Quiet; Float loss; try { opt.Minimize(DifferentiableFunction, ref initWeights, terminationCriterion, ref CurrentWeights, out loss); } catch (Optimizer.PrematureConvergenceException e) { if (!Quiet) { ch.Warning("Premature convergence occurred. The OptimizationTolerance may be set too small. {0}", e.Message); } CurrentWeights = e.State.X; loss = e.State.Value; } ch.Assert(CurrentWeights.Length == BiasCount + WeightCount); int numParams = BiasCount; if ((L1Weight > 0 && !Quiet) || ShowTrainingStats) { VBufferUtils.ForEachDefined(ref CurrentWeights, (index, value) => { if (index >= BiasCount && value != 0) { numParams++; } }); if (L1Weight > 0 && !Quiet) { ch.Info("L1 regularization selected {0} of {1} weights.", numParams, BiasCount + WeightCount); } } if (ShowTrainingStats) { ComputeTrainingStatistics(ch, cursorFactory, loss, numParams); } }
/// <summary>
/// Batch-parallel loss/gradient evaluation.
/// </summary>
/// <remarks>
/// REVIEW: consider getting rid of multithread-targeted members.
/// Using TPL, the distinction between Multithreaded and Sequential implementations is unnecessary.
/// </remarks>
protected virtual Float DifferentiableFunctionMultithreaded(ref VBuffer<Float> xDense, ref VBuffer<Float> gradient, IProgressChannel pch)
{
    Contracts.Assert(_data == null);
    Contracts.Assert(_cursorFactory == null);
    Contracts.Assert(_numChunks > 0);
    Contracts.Assert(Utils.Size(_ranges) == _numChunks + 1);
    Contracts.Assert(Utils.Size(_localLosses) == _numChunks);
    Contracts.Assert(Utils.Size(_localGradients) + 1 == _numChunks);
    Contracts.AssertValueOrNull(pch);

    // Declare local variables, since the lambda cannot capture the ref parameters. The gradient
    // calculation will modify the local gradients, but not this xx value.
    var xx = xDense;
    var gg = gradient;
    Parallel.For(0, _numChunks, ichk =>
    {
        if (ichk == 0)
            _localLosses[ichk] = DifferentiableFunctionComputeChunk(ichk, ref xx, ref gg, pch);
        else
            _localLosses[ichk] = DifferentiableFunctionComputeChunk(ichk, ref xx, ref _localGradients[ichk - 1], null);
    });
    gradient = gg;

    // Fold the per-chunk losses and gradients into the final result.
    Float loss = _localLosses[0];
    for (int i = 1; i < _numChunks; i++)
    {
        VectorUtils.Add(ref _localGradients[i - 1], ref gradient);
        loss += _localLosses[i];
    }
    return loss;
}
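The chunked evaluation above relies on the training loss being a sum over examples, so both the loss and its gradient decompose additively over the row chunks:

\[
L(x) \;=\; \sum_{k=1}^{K} \sum_{i \in \text{chunk}_k} \ell_i(x),
\qquad
\nabla L(x) \;=\; \sum_{k=1}^{K} \sum_{i \in \text{chunk}_k} \nabla \ell_i(x).
\]

Each Parallel.For body computes the inner sum for one chunk; the final loop folds the K partial losses and gradients into the returned loss and the `gradient` buffer.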
public void SimpleTextLoaderCopyColumnsTest() { var env = new MLContext(0); const string data = "0 hello 3.14159 -0 2\n" + "1 1 2 4 15"; var dataSource = new BytesStreamSource(data); var text = TextLoaderStatic.CreateLoader(env, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadText(1), numericFeatures: ctx.LoadFloat(2, null)), // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3. dataSource, separator: ' '); // While we have a type-safe wrapper for `IDataView` it is utterly useless except as an input to the `Fit` functions // of the other statically typed wrappers. We perhaps ought to make it useful in its own right, but perhaps not now. // For now, just operate over the actual `IDataView`. var textData = text.Load(dataSource).AsDynamic; var schema = textData.Schema; // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple. CheckSchemaHasColumn(schema, "label", out int labelIdx); CheckSchemaHasColumn(schema, "text", out int textIdx); CheckSchemaHasColumn(schema, "numericFeatures", out int numericFeaturesIdx); // Next verify they have the expected types. Assert.Equal(BooleanDataViewType.Instance, schema[labelIdx].Type); Assert.Equal(TextDataViewType.Instance, schema[textIdx].Type); Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema[numericFeaturesIdx].Type); // Next actually inspect the data. using (var cursor = textData.GetRowCursorForAllColumns()) { var textGetter = cursor.GetGetter <ReadOnlyMemory <char> >(textIdx); var numericFeaturesGetter = cursor.GetGetter <VBuffer <float> >(numericFeaturesIdx); ReadOnlyMemory <char> textVal = default; var labelGetter = cursor.GetGetter <bool>(labelIdx); bool labelVal = default; VBuffer <float> numVal = default; void CheckValuesSame(bool bl, string tx, float v0, float v1, float v2) { labelGetter(ref labelVal); textGetter(ref textVal); numericFeaturesGetter(ref numVal); Assert.True(tx.AsSpan().SequenceEqual(textVal.Span)); Assert.Equal((bool)bl, labelVal); Assert.Equal(3, numVal.Length); Assert.Equal(v0, numVal.GetItemOrDefault(0)); Assert.Equal(v1, numVal.GetItemOrDefault(1)); Assert.Equal(v2, numVal.GetItemOrDefault(2)); } Assert.True(cursor.MoveNext(), "Could not move even to first row"); CheckValuesSame(false, "hello", 3.14159f, -0f, 2f); Assert.True(cursor.MoveNext(), "Could not move to second row"); CheckValuesSame(true, "1", 2f, 4f, 15f); Assert.False(cursor.MoveNext(), "Moved to third row, but there should have been only two"); } // The next step where we shuffle the names around a little bit is one where we are // testing out the implicit usage of copy columns. var est = text.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures)); var newText = text.Append(est); var newTextData = newText.Fit(dataSource).Load(dataSource); schema = newTextData.AsDynamic.Schema; // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple. CheckSchemaHasColumn(schema, "label", out labelIdx); CheckSchemaHasColumn(schema, "text", out textIdx); // Next verify they have the expected types. Assert.Equal(BooleanDataViewType.Instance, schema[textIdx].Type); Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema[labelIdx].Type); }
/// <summary> /// Initialize weights by running SGD up to specified tolerance. /// </summary> protected virtual VBuffer <Float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory) { if (!Quiet) { ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance); } int numExamples = 0; var oldWeights = VBufferUtils.CreateEmpty <Float>(BiasCount + WeightCount); DTerminate terminateSgd = (ref VBuffer <Float> x) => { if (++numExamples % 1000 != 0) { return(false); } VectorUtils.AddMult(ref x, -1, ref oldWeights); Float normDiff = VectorUtils.Norm(oldWeights); x.CopyTo(ref oldWeights); // #if OLD_TRACING // REVIEW: How should this be ported? if (!Quiet) { Console.Write("."); if (numExamples % 50000 == 0) { Console.WriteLine("\t{0}\t{1}", numExamples, normDiff); } } // #endif return(normDiff < SgdInitializationTolerance); }; VBuffer <Float> result = default(VBuffer <Float>); FloatLabelCursor cursor = null; try { Float[] scratch = null; SgdOptimizer.DStochasticGradient lossSgd = (ref VBuffer <Float> x, ref VBuffer <Float> grad) => { // Zero out the gradient by sparsifying. grad = new VBuffer <Float>(grad.Length, 0, grad.Values, grad.Indices); EnsureBiases(ref grad); if (cursor == null || !cursor.MoveNext()) { if (cursor != null) { cursor.Dispose(); } cursor = cursorFactory.Create(); if (!cursor.MoveNext()) { return; } } AccumulateOneGradient(ref cursor.Features, cursor.Label, cursor.Weight, ref x, ref grad, ref scratch); }; VBuffer <Float> sgdWeights; if (DenseOptimizer) { sgdWeights = VBufferUtils.CreateDense <Float>(BiasCount + WeightCount); } else { sgdWeights = VBufferUtils.CreateEmpty <Float>(BiasCount + WeightCount); } SgdOptimizer sgdo = new SgdOptimizer(terminateSgd); sgdo.Minimize(lossSgd, ref sgdWeights, ref result); // #if OLD_TRACING // REVIEW: How should this be ported? if (!Quiet) { Console.WriteLine(); } // #endif ch.Info("SGD initialization done in {0} rounds", numExamples); } finally { if (cursor != null) { cursor.Dispose(); } } return(result); }
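The `terminateSgd` delegate above checks convergence only every 1,000 examples, comparing the current weights against the snapshot taken at the previous check; roughly, the initialization stops once

\[
\lVert x_t - x_{t-1000} \rVert \;<\; \texttt{SgdInitializationTolerance},
\]

where the difference is formed in place (`VectorUtils.AddMult(ref x, -1, ref oldWeights)` leaves the residual in `oldWeights`) and the norm is taken before the snapshot is refreshed with `x.CopyTo(ref oldWeights)`.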
void ComputeStatistics() { lock (_lock) { if (_scalingStat == null) { using (var ch = _host.Start("ScalerTransform")) { var sch = _input.Schema; var indexesCol = new List <int>(); var textCols = _args.columns.Select(c => c.Source).ToArray(); _scalingStat = new Dictionary <string, List <ColumnStatObs> >(); for (int i = 0; i < textCols.Length; ++i) { int index; if (!sch.TryGetColumnIndex(textCols[i], out index)) { throw ch.Except("Unable to find column '{0}' in '{1}'", textCols[i], SchemaHelper.ToString(sch)); } var ty = sch.GetColumnType(index); if (!(ty == NumberType.R4 || ty == NumberType.U4 || ty == TextType.Instance || ty == BoolType.Instance || (ty.IsKey() && ty.AsKey().RawKind() == DataKind.U4) || (ty.IsVector() && ty.AsVector().ItemType() == NumberType.R4))) { throw ch.Except("Only a float or a vector of floats or a uint or a text or a bool is allowed for column {0} (schema={1}).", _args.columns[i], SchemaHelper.ToString(sch)); } indexesCol.Add(index); } // Computation var required = new HashSet <int>(indexesCol); var requiredIndexes = required.OrderBy(c => c).ToArray(); using (var cur = _input.GetRowCursor(i => required.Contains(i))) { bool[] isText = requiredIndexes.Select(c => sch.GetColumnType(c) == TextType.Instance).ToArray(); bool[] isBool = requiredIndexes.Select(c => sch.GetColumnType(c) == BoolType.Instance).ToArray(); bool[] isFloat = requiredIndexes.Select(c => sch.GetColumnType(c) == NumberType.R4).ToArray(); bool[] isUint = requiredIndexes.Select(c => sch.GetColumnType(c) == NumberType.U4 || sch.GetColumnType(c).RawKind() == DataKind.U4).ToArray(); ValueGetter <bool>[] boolGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == BoolType.Instance || sch.GetColumnType(i).RawKind() == DataKind.BL ? cur.GetGetter <bool>(i) : null).ToArray(); ValueGetter <uint>[] uintGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == NumberType.U4 || sch.GetColumnType(i).RawKind() == DataKind.U4 ? cur.GetGetter <uint>(i) : null).ToArray(); ValueGetter <ReadOnlyMemory <char> >[] textGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == TextType.Instance ? cur.GetGetter <ReadOnlyMemory <char> >(i) : null).ToArray(); ValueGetter <float>[] floatGetters = requiredIndexes.Select(i => sch.GetColumnType(i) == NumberType.R4 ? cur.GetGetter <float>(i) : null).ToArray(); ValueGetter <VBuffer <float> >[] vectorGetters = requiredIndexes.Select(i => sch.GetColumnType(i).IsVector() ? 
cur.GetGetter <VBuffer <float> >(i) : null).ToArray(); var schema = _input.Schema; for (int i = 0; i < schema.ColumnCount; ++i) { string name = schema.GetColumnName(i); if (!required.Contains(i)) { continue; } _scalingStat[name] = new List <ColumnStatObs>(); var t = _scalingStat[name]; switch (_args.scaling) { case ScalerStrategy.meanVar: t.Add(new ColumnStatObs(ColumnStatObs.StatKind.sum)); t.Add(new ColumnStatObs(ColumnStatObs.StatKind.sum2)); t.Add(new ColumnStatObs(ColumnStatObs.StatKind.nb)); break; case ScalerStrategy.minMax: t.Add(new ColumnStatObs(ColumnStatObs.StatKind.min)); t.Add(new ColumnStatObs(ColumnStatObs.StatKind.max)); break; default: throw _host.ExceptNotSupp($"Unsupported scaling strategy: {_args.scaling}."); } } float value = 0; var tvalue = new ReadOnlyMemory <char>(); VBuffer <float> vector = new VBuffer <float>(); uint uvalue = 0; bool bvalue = true; var curschema = cur.Schema; while (cur.MoveNext()) { for (int i = 0; i < requiredIndexes.Length; ++i) { string name = curschema.GetColumnName(requiredIndexes[i]); if (!_scalingStat.ContainsKey(name)) { continue; } if (isFloat[i]) { floatGetters[i](ref value); foreach (var t in _scalingStat[name]) { t.Update(value); } } else if (isBool[i]) { boolGetters[i](ref bvalue); foreach (var t in _scalingStat[name]) { t.Update(bvalue); } } else if (isText[i]) { textGetters[i](ref tvalue); foreach (var t in _scalingStat[name]) { t.Update(tvalue.ToString()); } } else if (isUint[i]) { uintGetters[i](ref uvalue); foreach (var t in _scalingStat[name]) { t.Update(uvalue); } } else { vectorGetters[i](ref vector); foreach (var t in _scalingStat[name]) { t.Update(vector); } } } } } _scalingFactors = GetScalingParameters(); _revIndex = ComputeRevIndex(); } } } }
public ScalingFactor(int colid, ScalingMethod method, VBuffer<float> mean, VBuffer<float> scale)
{
    scalingMethod = method;
    columnId = colid;
    this.mean = mean;
    this.scale = scale;
}
public void TreeEnsembleFeaturizerOutputSchemaTest() { // Create data set var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(1000).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); // Define a tree model whose trees will be extracted to construct a tree featurizer. var trainer = ML.BinaryClassification.Trainers.FastTree( new FastTreeBinaryTrainer.Options { NumberOfThreads = 1, NumberOfTrees = 10, NumberOfLeaves = 5, }); // Train the defined tree model. var model = trainer.Fit(dataView); // From the trained tree model, a mapper of tree featurizer is created. const string treesColumnName = "MyTrees"; const string leavesColumnName = "MyLeaves"; const string pathsColumnName = "MyPaths"; var args = new TreeEnsembleFeaturizerBindableMapper.Arguments() { TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName }; var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, args, model.Model); // To get output schema, we need to create RoleMappedSchema for calling Bind(...). var roleMappedSchema = new RoleMappedSchema(dataView.Schema, label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Label), feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Features)); // Retrieve output schema. var boundMapper = (treeFeaturizer as ISchemaBindableMapper).Bind(Env, roleMappedSchema); var outputSchema = boundMapper.OutputSchema; { // Check if output schema is correct. var treeValuesColumn = outputSchema[0]; Assert.Equal(treesColumnName, treeValuesColumn.Name); VectorDataViewType treeValuesType = treeValuesColumn.Type as VectorDataViewType; Assert.NotNull(treeValuesType); Assert.Equal(NumberDataViewType.Single, treeValuesType.ItemType); Assert.Equal(10, treeValuesType.Size); // Below we check the only metadata field. Assert.Single(treeValuesColumn.Annotations.Schema); VBuffer <ReadOnlyMemory <char> > slotNames = default; treeValuesColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref slotNames); Assert.Equal(10, slotNames.Length); // Just check the head and the tail of the extracted vector. Assert.Equal("Tree000", slotNames.GetItemOrDefault(0).ToString()); Assert.Equal("Tree009", slotNames.GetItemOrDefault(9).ToString()); } { var treeLeafIdsColumn = outputSchema[1]; // Check column of tree leaf IDs. Assert.Equal(leavesColumnName, treeLeafIdsColumn.Name); VectorDataViewType treeLeafIdsType = treeLeafIdsColumn.Type as VectorDataViewType; Assert.NotNull(treeLeafIdsType); Assert.Equal(NumberDataViewType.Single, treeLeafIdsType.ItemType); Assert.Equal(50, treeLeafIdsType.Size); // Below we check the two leaf-IDs column's metadata fields. Assert.Equal(2, treeLeafIdsColumn.Annotations.Schema.Count); // Check metadata field IsNormalized's content. bool leafIdsNormalizedFlag = false; treeLeafIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref leafIdsNormalizedFlag); Assert.True(leafIdsNormalizedFlag); // Check metadata field SlotNames's content. VBuffer <ReadOnlyMemory <char> > leafIdsSlotNames = default; treeLeafIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref leafIdsSlotNames); Assert.Equal(50, leafIdsSlotNames.Length); // Just check the head and the tail of the extracted vector. 
Assert.Equal("Tree000Leaf000", leafIdsSlotNames.GetItemOrDefault(0).ToString()); Assert.Equal("Tree009Leaf004", leafIdsSlotNames.GetItemOrDefault(49).ToString()); } { var treePathIdsColumn = outputSchema[2]; // Check column of path IDs. Assert.Equal(pathsColumnName, treePathIdsColumn.Name); VectorDataViewType treePathIdsType = treePathIdsColumn.Type as VectorDataViewType; Assert.NotNull(treePathIdsType); Assert.Equal(NumberDataViewType.Single, treePathIdsType.ItemType); Assert.Equal(40, treePathIdsType.Size); // Below we check the two path-IDs column's metadata fields. Assert.Equal(2, treePathIdsColumn.Annotations.Schema.Count); // Check metadata field IsNormalized's content. bool pathIdsNormalizedFlag = false; treePathIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.IsNormalized, ref pathIdsNormalizedFlag); Assert.True(pathIdsNormalizedFlag); // Check metadata field SlotNames's content. VBuffer <ReadOnlyMemory <char> > pathIdsSlotNames = default; treePathIdsColumn.Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref pathIdsSlotNames); Assert.Equal(40, pathIdsSlotNames.Length); // Just check the head and the tail of the extracted vector. Assert.Equal("Tree000Node000", pathIdsSlotNames.GetItemOrDefault(0).ToString()); Assert.Equal("Tree009Node003", pathIdsSlotNames.GetItemOrDefault(39).ToString()); } }
private ValueGetter <VBuffer <float> > GetGetterVec(IRow input, int iinfo) { Host.AssertValue(input); Host.Assert(0 <= iinfo && iinfo < _parent.ColumnPairs.Length); var colType = input.Schema.GetColumnType(ColMapNewToOld[iinfo]); Host.Assert(colType.IsVector); Host.Assert(colType.ItemType.IsText); var srcGetter = input.GetGetter <VBuffer <ReadOnlyMemory <char> > >(ColMapNewToOld[iinfo]); var src = default(VBuffer <ReadOnlyMemory <char> >); int dimension = _parent._currentVocab.Dimension; float[] wordVector = new float[_parent._currentVocab.Dimension]; return ((ref VBuffer <float> dst) => { int deno = 0; srcGetter(ref src); var values = dst.Values; if (Utils.Size(values) != 3 * dimension) { values = new float[3 * dimension]; } int offset = 2 * dimension; for (int i = 0; i < dimension; i++) { values[i] = float.MaxValue; values[i + dimension] = 0; values[i + offset] = float.MinValue; } for (int word = 0; word < src.Count; word++) { if (_parent._currentVocab.GetWordVector(ref src.Values[word], wordVector)) { deno++; for (int i = 0; i < dimension; i++) { float currentTerm = wordVector[i]; if (values[i] > currentTerm) { values[i] = currentTerm; } values[dimension + i] += currentTerm; if (values[offset + i] < currentTerm) { values[offset + i] = currentTerm; } } } } if (deno != 0) { for (int index = 0; index < dimension; index++) { values[index + dimension] /= deno; } } dst = new VBuffer <float>(values.Length, values, dst.Indices); }); }
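The getter above emits, per embedding dimension, the minimum, mean, and maximum over the word vectors of the tokens found in the vocabulary, concatenated into one output of three times the embedding dimension:

\[
\text{dst} \;=\; \Big[\; \min_{w \in W} v_w \;\big\Vert\; \tfrac{1}{|W|}\sum_{w \in W} v_w \;\big\Vert\; \max_{w \in W} v_w \;\Big] \in \mathbb{R}^{3d},
\]

where $v_w$ is the embedding of token $w$, $W$ is the set of matched tokens, and $|W|$ is `deno` in the code; when no token matches, the slots keep their float.MaxValue / 0 / float.MinValue initial values.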
private void HashTestCore <T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3) { const int bits = 10; var builder = new MetadataBuilder(); builder.AddPrimitiveValue("Foo", type, val); var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); // First do an unordered hash. var info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits); var xf = new HashingTransformer(Env, new[] { info }); var mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol); var outRow = mapper.GetRow(inRow, c => c == outCol); var getter = outRow.GetGetter <uint>(outCol); uint result = 0; getter(ref result); Assert.Equal(expected, result); // Next do an ordered hash. info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); outRow = mapper.GetRow(inRow, c => c == outCol); getter = outRow.GetGetter <uint>(outCol); getter(ref result); Assert.Equal(expectedOrdered, result); // Next build up a vector to make sure that hashing is consistent between scalar values // at least in the first position, and in the unordered case, the last position. const int vecLen = 5; var denseVec = new VBuffer <T>(vecLen, Utils.CreateArray(vecLen, val)); builder = new MetadataBuilder(); builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => denseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); outRow = mapper.GetRow(inRow, c => c == outCol); var vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol); VBuffer <uint> vecResult = default; vecGetter(ref vecResult); Assert.Equal(vecLen, vecResult.Length); // They all should equal this in this case. Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v)); // Now do ordered with the dense vector. info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); outRow = mapper.GetRow(inRow, c => c == outCol); vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol); vecGetter(ref vecResult); Assert.Equal(vecLen, vecResult.Length); Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0)); Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3)); Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0))); // Let's now do a sparse vector. 
var sparseVec = new VBuffer <T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 }); builder = new MetadataBuilder(); builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => sparseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); outRow = mapper.GetRow(inRow, c => c == outCol); vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol); vecGetter(ref vecResult); Assert.Equal(10, vecResult.Length); Assert.Equal(expected, vecResult.GetItemOrDefault(0)); Assert.Equal(expected, vecResult.GetItemOrDefault(3)); Assert.Equal(expected, vecResult.GetItemOrDefault(7)); info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); outRow = mapper.GetRow(inRow, c => c == outCol); vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol); vecGetter(ref vecResult); Assert.Equal(10, vecResult.Length); Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0)); Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3)); }
private void TrainCore(IChannel ch, FloatLabelCursor.Factory cursorFactory, int featureCount) { Host.AssertValue(ch); ch.AssertValue(cursorFactory); int m = featureCount + 1; // Check for memory conditions first. if ((long)m * (m + 1) / 2 > int.MaxValue) { throw ch.Except("Cannot hold covariance matrix in memory with {0} features", m - 1); } // Track the number of examples. long n = 0; // Since we are accumulating over many values, we use Double even for the single precision build. var xty = new Double[m]; // The layout of this algorithm is a packed row-major lower triangular matrix. var xtx = new Double[m * (m + 1) / 2]; // Build X'X (lower triangular) and X'y incrementally (X'X+=X'X_i; X'y+=X'y_i): using (var cursor = cursorFactory.Create()) { while (cursor.MoveNext()) { var yi = cursor.Label; // Increment first element of X'y xty[0] += yi; // Increment first element of lower triangular X'X xtx[0] += 1; var values = cursor.Features.Values; if (cursor.Features.IsDense) { int ioff = 1; ch.Assert(cursor.Features.Count + 1 == m); // Increment rest of first column of lower triangular X'X for (int i = 1; i < m; i++) { ch.Assert(ioff == i * (i + 1) / 2); var val = values[i - 1]; // Add the implicit first bias term to X'X xtx[ioff++] += val; // Add the remainder of X'X for (int j = 0; j < i; j++) { xtx[ioff++] += val * values[j]; } // X'y xty[i] += val * yi; } ch.Assert(ioff == xtx.Length); } else { var fIndices = cursor.Features.Indices; for (int ii = 0; ii < cursor.Features.Count; ++ii) { int i = fIndices[ii] + 1; int ioff = i * (i + 1) / 2; var val = values[ii]; // Add the implicit first bias term to X'X xtx[ioff++] += val; // Add the remainder of X'X for (int jj = 0; jj <= ii; jj++) { xtx[ioff + fIndices[jj]] += val * values[jj]; } // X'y xty[i] += val * yi; } } n++; } ch.Check(n > 0, "No training examples in dataset."); if (cursor.BadFeaturesRowCount > 0) { ch.Warning("Skipped {0} instances with missing features/label during training", cursor.SkippedRowCount); } if (_l2Weight > 0) { // Skip the bias term for regularization, in the ridge regression case. // So start at [1,1] instead of [0,0]. // REVIEW: There are two ways to view this, firstly, it is more // user friendly ot make this scaling factor behave similarly regardless // of data size, so that if you have the same parameters, you get the same // model if you feed in your data than if you duplicate your data 10 times. // This is what I have now. The alternate point of view is to view this // L2 regularization parameter as providing some sort of prior, in which // case duplication 10 times should in fact be treated differently! (That // is, we should not multiply by n below.) Both interpretations seem // correct, in their way. Double squared = _l2Weight * _l2Weight * n; int ioff = 0; for (int i = 1; i < m; ++i) { xtx[ioff += i + 1] += squared; } ch.Assert(ioff == xtx.Length - 1); } } if (!(_l2Weight > 0) && n < m) { throw ch.Except("Ordinary least squares requires more examples than parameters. There are {0} parameters, but {1} examples. To enable training, use a positive L2 weight so this behaves as ridge regression.", m, n); } Double yMean = n == 0 ? 0 : xty[0] / n; ch.Info("Trainer solving for {0} parameters across {1} examples", m, n); // Cholesky Decomposition of X'X into LL' try { Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx); } catch (DllNotFoundException) { // REVIEW: Is there no better way? 
throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing."); } // Solve for beta in (LL')beta = X'y: Mkl.Pptrs(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, 1, xtx, xty, 1); // Note that the solver overwrote xty so it contains the solution. To be more clear, // we effectively change its name (through reassignment) so we don't get confused that // this is somehow xty in the remaining calculation. var beta = xty; xty = null; // Check that the solution is valid. for (int i = 0; i < beta.Length; ++i) { ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution"); } var weights = VBufferUtils.CreateDense <Float>(beta.Length - 1); for (int i = 1; i < beta.Length; ++i) { weights.Values[i - 1] = (Float)beta[i]; } _weights = weights; _bias = (Float)beta[0]; _standardErrors = _tValues = _pValues = null; if (!(_l2Weight > 0) && m == n) { // We would expect the solution to the problem to be exact in this case. _rSquared = 1; _rSquaredAdjusted = Float.NaN; ch.Info("Number of examples equals number of parameters, solution is exact but no statistics can be derived"); ch.Done(); return; } Double rss = 0; // residual sum of squares Double tss = 0; // total sum of squares using (var cursor = cursorFactory.Create()) { var lrPredictor = new LinearRegressionPredictor(Host, ref _weights, _bias); var lrMap = lrPredictor.GetMapper <VBuffer <Float>, Float>(); Float yh = default(Float); while (cursor.MoveNext()) { var features = cursor.Features; lrMap(ref features, ref yh); var e = cursor.Label - yh; rss += e * e; var ydm = cursor.Label - yMean; tss += ydm * ydm; } } _rSquared = ProbClamp(1 - (rss / tss)); // R^2 adjusted differs from the normal formula on account of the bias term, by Said's reckoning. if (n > m) { _rSquaredAdjusted = ProbClamp(1 - (1 - _rSquared) * (n - 1) / (n - m)); ch.Info("Coefficient of determination R2 = {0:g}, or {1:g} (adjusted)", _rSquared, _rSquaredAdjusted); } else { _rSquaredAdjusted = Double.NaN; } // The per parameter significance is compute intensive and may not be required for all practitioners. // Also we can't estimate it, unless we can estimate the variance, which requires more examples than // parameters. if (!_perParameterSignificance || m >= n) { return; } ch.Assert(!Double.IsNaN(_rSquaredAdjusted)); _standardErrors = new Double[m]; _tValues = new Double[m]; _pValues = new Double[m]; // Invert X'X: Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx); var s2 = rss / (n - m); // estimate of variance of y for (int i = 0; i < m; i++) { // Initialize with inverse Hessian. _standardErrors[i] = (Single)xtx[i * (i + 1) / 2 + i]; } if (_l2Weight > 0) { // Iterate through all entries of inverse Hessian to make adjustment to variance. int ioffset = 1; Float reg = _l2Weight * _l2Weight * n; for (int iRow = 1; iRow < m; iRow++) { for (int iCol = 0; iCol <= iRow; iCol++) { var entry = (Single)xtx[ioffset]; var adjustment = -reg * entry * entry; _standardErrors[iRow] -= adjustment; if (0 < iCol && iCol < iRow) { _standardErrors[iCol] -= adjustment; } ioffset++; } } Contracts.Assert(ioffset == xtx.Length); } for (int i = 0; i < m; i++) { // sqrt of diagonal entries of s2 * inverse(X'X + reg * I) * X'X * inverse(X'X + reg * I). 
_standardErrors[i] = Math.Sqrt(s2 * _standardErrors[i]); ch.Check(FloatUtils.IsFinite(_standardErrors[i]), "Non-finite standard error detected from OLS solution"); _tValues[i] = beta[i] / _standardErrors[i]; _pValues[i] = (Float)MathUtils.TStatisticToPValue(_tValues[i], n - m); ch.Check(0 <= _pValues[i] && _pValues[i] <= 1, "p-Value calculated outside expected [0,1] range"); } }
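In matrix form, the system factored and solved above is the (optionally ridge-regularized) normal equation, with the bias column left unpenalized:

\[
\big(X^\top X + \lambda^2 n\, I\big)\,\beta \;=\; X^\top y,
\qquad \lambda = \texttt{\_l2Weight},\; n = \text{number of examples},
\]

where `Pptrf` computes the Cholesky factorization $LL^\top$ of the packed lower-triangular matrix and `Pptrs` back-substitutes for $\beta$. The standard errors are the square roots of the diagonal of

\[
s^2 \,\big(X^\top X + \lambda^2 n I\big)^{-1} X^\top X \,\big(X^\top X + \lambda^2 n I\big)^{-1},
\qquad s^2 = \text{RSS}/(n - m),
\]

which the adjustment loop for `_l2Weight > 0` accumulates entry by entry from the inverse produced by `Pptri`.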
private protected override void TransformCore(ref TInput input, FixedSizeQueue<TInput> windowedBuffer, long iteration, ref VBuffer<TInput> output)
{
    int size = _parentSliding.WindowSize - _parentSliding._lag + 1;
    var result = VBufferEditor.Create(ref output, size);
    if (_parentSliding._lag == 0)
    {
        // With no lag, the output is the full window followed by the current value.
        for (int i = 0; i < _parentSliding.WindowSize; ++i)
            result.Values[i] = windowedBuffer[i];
        result.Values[_parentSliding.WindowSize] = input;
    }
    else
    {
        for (int i = 0; i < size; ++i)
            result.Values[i] = windowedBuffer[i];
    }
    output = result.Commit();
}
public static void Example() { // Create a new ML context, for ML.NET operations. It can be used for // exception tracking and logging, as well as the source of randomness. var mlContext = new MLContext(); // Create a small dataset as an IEnumerable. var samples = new List <TextData>() { new TextData() { Text = "This is an example to compute n-grams." }, new TextData() { Text = "N-gram is a sequence of 'N' consecutive " + "words/tokens." }, new TextData() { Text = "ML.NET's ProduceNgrams API produces " + "vector of n-grams." }, new TextData() { Text = "Each position in the vector corresponds " + "to a particular n-gram." }, new TextData() { Text = "The value at each position corresponds " + "to," }, new TextData() { Text = "the number of times n-gram occurred in " + "the data (Tf), or" }, new TextData() { Text = "the inverse of the number of documents " + "that contain the n-gram (Idf)," }, new TextData() { Text = "or compute both and multiply together " + "(Tf-Idf)." }, }; // Convert training data to IDataView. var dataview = mlContext.Data.LoadFromEnumerable(samples); // A pipeline for converting text into numeric n-gram features. // The following call to 'ProduceNgrams' requires the tokenized // text /string as input. This is achieved by calling // 'TokenizeIntoWords' first followed by 'ProduceNgrams'. Please note // that the length of the output feature vector depends on the n-gram // settings. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text") // 'ProduceNgrams' takes key type as input. Converting the tokens // into key type using 'MapValueToKey'. .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens")) .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens", ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf)); // Fit to data. var textTransformer = textPipeline.Fit(dataview); var transformedDataView = textTransformer.Transform(dataview); // Create the prediction engine to get the n-gram features extracted // from the text. var predictionEngine = mlContext.Model.CreatePredictionEngine <TextData, TransformedTextData>(textTransformer); // Convert the text into numeric features. var prediction = predictionEngine.Predict(samples[0]); // Print the length of the feature vector. Console.WriteLine("Number of Features: " + prediction.NgramFeatures .Length); // Preview of the produced n-grams. // Get the slot names from the column's metadata. // The slot names for a vector column corresponds to the names // associated with each position in the vector. VBuffer <ReadOnlyMemory <char> > slotNames = default; transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames); var NgramFeaturesColumn = transformedDataView.GetColumn <VBuffer < float> >(transformedDataView.Schema["NgramFeatures"]); var slots = slotNames.GetValues(); Console.Write("N-grams: "); foreach (var featureRow in NgramFeaturesColumn) { foreach (var item in featureRow.Items()) { Console.Write($"{slots[item.Key]} "); } Console.WriteLine(); } // Print the first 10 feature values. Console.Write("Features: "); for (int i = 0; i < 10; i++) { Console.Write($"{prediction.NgramFeatures[i]:F4} "); } // Expected output: // Number of Features: 52 // N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ... // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ... }
/// <summary>
/// Return the raw margin from the decision hyperplane.
/// </summary>
protected override Float Margin(ref VBuffer<Float> feat)
{
    return Bias + VectorUtils.DotProduct(ref feat, ref Weights) * WeightsScale;
}
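The raw margin returned here is the usual affine score of a linear model, with WeightsScale applied as a single deferred multiplier rather than being folded into Weights:

\[
\text{margin}(x) \;=\; b + s\,\langle w, x\rangle,
\qquad b = \texttt{Bias},\; s = \texttt{WeightsScale},\; w = \texttt{Weights}.
\]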
protected virtual void PreTrainingProcessInstance(Float label, ref VBuffer<Float> feat, Float weight)
{
}
private void InitDenseVecMap<T>(T[] vals, PrimitiveDataViewType itemType, int hashBits = 20)
{
    var vbuf = new VBuffer<T>(vals.Length, vals);
    InitMap(vbuf, new VectorType(itemType, vals.Length), hashBits, vbuf.CopyTo);
}
protected abstract Float AccumulateOneGradient(ref VBuffer<Float> feat, Float label, Float weight, ref VBuffer<Float> xDense, ref VBuffer<Float> grad, ref Float[] scratch);
/// <summary>
/// Features: x1, x2vBuff (sparse vector), x3.
/// y = 10x1 + 10x2vBuff + 20x3 + e.
/// Within x2vBuff, the 2nd slot will be sparse most of the time.
/// The 2nd slot of x2vBuff has the least importance: evaluation metrics do not change a lot when this slot is permuted.
/// x3 has the biggest importance.
/// </summary>
private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numberOfInstances = 1000)
{
    // Setup synthetic dataset.
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances];

    VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x3Important = rand.Next(10000);
        x3Array[i] = x3Important;

        VBuffer<float> vb;
        if (i % 10 != 0)
            vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
        else
            vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
        vbArray[i] = vb;

        float vbSum = 0;
        foreach (var vbValue in vb.DenseValues())
            vbSum += vbValue * 10;

        var noise = rand.Next(50);
        yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification || task == TaskType.MulticlassClassification)
        GetBinaryClassificationLabels(yArray);
    else if (task == TaskType.Ranking)
        GetRankingLabels(yArray);

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
    bldr.AddColumn("X2VBuffer", NumberDataViewType.Single, vbArray);
    bldr.AddColumn("X3Important", NumberDataViewType.Single, x3Array);
    bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
    if (task == TaskType.Ranking)
        bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
        .Append(ML.Transforms.Normalize("Features"));

    if (task == TaskType.BinaryClassification)
        return pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
            .Fit(srcDV).Transform(srcDV);
    else if (task == TaskType.MulticlassClassification)
        return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("Label"))
            .Fit(srcDV).Transform(srcDV);
    else if (task == TaskType.Ranking)
        return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV);

    return pipeline.Fit(srcDV).Transform(srcDV);
}
protected Float DifferentiableFunctionComputeChunk(int ichk, ref VBuffer<Float> xDense, ref VBuffer<Float> grad, IProgressChannel pch)
{
    Contracts.Assert(0 <= ichk && ichk < _numChunks);
    Contracts.AssertValueOrNull(pch);

    VBufferUtils.Clear(ref grad);
    VBufferUtils.Densify(ref grad);
    Float[] scratch = null;
    double loss = 0;
    int ivMin = _ranges[ichk];
    int ivLim = _ranges[ichk + 1];
    int iv = ivMin;
    if (pch != null)
        pch.SetHeader(new ProgressHeader(null, new[] { "examples" }), e => e.SetProgress(0, iv - ivMin, ivLim - ivMin));
    for (iv = ivMin; iv < ivLim; iv++)
    {
        Float weight = _weights != null ? _weights[iv] : 1;
        loss += AccumulateOneGradient(ref _features[iv], _labels[iv], weight, ref xDense, ref grad, ref scratch);
    }

    // Accumulate the loss in a double to avoid roundoff error
    // (see http://mathworld.wolfram.com/RoundoffError.html for a definition of roundoff error),
    // then convert back to Float to match the function signature.
    return (Float)loss;
}
public static void Example() { // Create a new ML context, for ML.NET operations. It can be used for // exception tracking and logging, as well as the source of randomness. var mlContext = new MLContext(seed: 1); // Get a small dataset as an IEnumerable. var rawData = new[] { new DataPoint() { Category = "MLB", Age = 18 }, new DataPoint() { Category = "NFL", Age = 14 }, new DataPoint() { Category = "NFL", Age = 15 }, new DataPoint() { Category = "MLB", Age = 18 }, new DataPoint() { Category = "MLS", Age = 14 }, }; var data = mlContext.Data.LoadFromEnumerable(rawData); // Construct the pipeline that would hash the two columns and store the // results in new columns. The first transform hashes the string column // and the second transform hashes the integer column. // // Hashing is not a reversible operation, so there is no way to retrieve // the original value from the hashed value. Sometimes, for debugging, // or model explainability, users will need to know what values in the // original columns generated the values in the hashed columns, since // the algorithms will mostly use the hashed values for further // computations. The Hash method will preserve the mapping from the // original values to the hashed values in the Annotations of the newly // created column (column populated with the hashed values). // // Setting the maximumNumberOfInverts parameters to -1 will preserve the // full map. If that parameter is left to the default 0 value, the // mapping is not preserved. var pipeline = mlContext.Transforms.Conversion.Hash( new[] { new HashingEstimator.ColumnOptions( "CategoryHashed", "Category", 16, useOrderedHashing: false, maximumNumberOfInverts: -1), new HashingEstimator.ColumnOptions( "AgeHashed", "Age", 8, useOrderedHashing: false) }); // Let's fit our pipeline, and then apply it to the same data. var transformer = pipeline.Fit(data); var transformedData = transformer.Transform(data); // Convert the post transformation from the IDataView format to an // IEnumerable <TransformedData> for easy consumption. var convertedData = mlContext.Data.CreateEnumerable < TransformedDataPoint>(transformedData, true); Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed"); foreach (var item in convertedData) { Console.WriteLine($"{item.Category}\t {item.CategoryHashed}\t\t " + $"{item.Age}\t {item.AgeHashed}"); } // Expected data after the transformation. // // Category CategoryHashed Age AgeHashed // MLB 36206 18 127 // NFL 19015 14 62 // NFL 19015 15 43 // MLB 36206 18 127 // MLS 6013 14 62 // For the Category column, where we set the maximumNumberOfInverts // parameter, the names of the original categories, and their // correspondence with the generated hash values is preserved in the // Annotations in the format of indices and values.the indices array // will have the hashed values, and the corresponding element, // position -wise, in the values array will contain the original value. // // See below for an example on how to retrieve the mapping. var slotNames = new VBuffer <ReadOnlyMemory <char> >(); transformedData.Schema["CategoryHashed"].Annotations.GetValue( "KeyValues", ref slotNames); var indices = slotNames.GetIndices(); var categoryNames = slotNames.GetValues(); for (int i = 0; i < indices.Length; i++) { Console.WriteLine($"The original value of the {indices[i]} " + $"category is {categoryNames[i]}"); } // Output Data // // The original value of the 6012 category is MLS // The original value of the 19014 category is NFL // The original value of the 36205 category is MLB }
internal FunctionOptimizerState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function, ref VBuffer<Float> initial,
    int m, long totalMemLimit, bool keepDense, bool enforceNonNegativity)
    : base(ch, progress, ref initial, m, totalMemLimit, keepDense, enforceNonNegativity)
{
    Function = function;
    Init();
}
public void GetValue(ref VBuffer<T> dst)
{
    Contracts.Check(Cursor.IsGood);
    Src.CopyTo(ref dst);
}
public abstract Float Eval(ref VBuffer<Float> input, ref VBuffer<Float> gradient);
private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs) { // Contains the maximum number of grams to store in the dictionary, for each level of ngrams, // from 1 (in position 0) up to ngramLength (in position ngramLength-1) var lims = new int[Infos.Length][]; for (int iinfo = 0; iinfo < Infos.Length; iinfo++) { var all = args.Column[iinfo].AllLengths ?? args.AllLengths; var ngramLength = _exes[iinfo].NgramLength; var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms; if (!all) { Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms)); lims[iinfo] = new int[ngramLength]; lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0]; } else { Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms)); Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms)); var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1]; lims[iinfo] = Utils.BuildArray(ngramLength, i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend); } } var helpers = new NgramBufferBuilder[Infos.Length]; var getters = new ValueGetter <VBuffer <uint> > [Infos.Length]; var src = new VBuffer <uint> [Infos.Length]; // Keep track of how many grams are in the pool for each value of n. Position // i in _counts counts how many (i+1)-grams are in the pool for column iinfo. var counts = new int[Infos.Length][]; var ngramMaps = new SequencePool[Infos.Length]; bool[] activeInput = new bool[trainingData.Schema.ColumnCount]; foreach (var info in Infos) { activeInput[info.Source] = true; } using (var cursor = trainingData.GetRowCursor(col => activeInput[col])) using (var pch = Host.StartProgressChannel("Building n-gram dictionary")) { for (int iinfo = 0; iinfo < Infos.Length; iinfo++) { Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey); var ngramLength = _exes[iinfo].NgramLength; var skipLength = _exes[iinfo].SkipLength; getters[iinfo] = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, cursor, Infos[iinfo].Source); src[iinfo] = default(VBuffer <uint>); counts[iinfo] = new int[ngramLength]; ngramMaps[iinfo] = new SequencePool(); // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will // be added (using lims[iinfo]), therefore we set slotLim to the maximum helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host)); } int cInfoFull = 0; bool[] infoFull = new bool[Infos.Length]; invDocFreqs = new double[Infos.Length][]; long totalDocs = 0; Double rowCount = trainingData.GetRowCount(true) ?? 
Double.NaN; var buffers = new VBuffer <float> [Infos.Length]; pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }), e => e.SetProgress(0, totalDocs, rowCount)); while (cInfoFull < Infos.Length && cursor.MoveNext()) { totalDocs++; for (int iinfo = 0; iinfo < Infos.Length; iinfo++) { getters[iinfo](ref src[iinfo]); var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount; if (keyCount == 0) { keyCount = uint.MaxValue; } if (!infoFull[iinfo]) { if (_exes[iinfo].RequireIdf()) { helpers[iinfo].Reset(); } helpers[iinfo].AddNgrams(ref src[iinfo], 0, keyCount); if (_exes[iinfo].RequireIdf()) { int totalNgrams = counts[iinfo].Sum(); Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams); helpers[iinfo].GetResult(ref buffers[iinfo]); foreach (var pair in buffers[iinfo].Items()) { if (pair.Value >= 1) { invDocFreqs[iinfo][pair.Key] += 1; } } } } AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]); } } pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs); for (int iinfo = 0; iinfo < Infos.Length; iinfo++) { for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++) { if (invDocFreqs[iinfo][i] != 0) { invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]); } } } for (int iinfo = 0; iinfo < Infos.Length; iinfo++) { AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]); int ngramLength = _exes[iinfo].NgramLength; for (int i = 0; i < ngramLength; i++) { _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0; } } return(ngramMaps); } }
/// <summary>
/// Minimize a function using the MeanRelativeImprovement termination criterion with the supplied tolerance level.
/// </summary>
/// <param name="function">The function to minimize</param>
/// <param name="initial">The initial point</param>
/// <param name="tolerance">Convergence tolerance (smaller means more iterations, closer to exact optimum)</param>
/// <param name="result">The point at the optimum</param>
/// <param name="optimum">The optimum function value</param>
/// <exception cref="PrematureConvergenceException">Thrown if successive points are within numeric precision of each other, but the termination condition is still unsatisfied.</exception>
public void Minimize(DifferentiableFunction function, ref VBuffer<Float> initial, Float tolerance, ref VBuffer<Float> result, out Float optimum)
{
    ITerminationCriterion term = new MeanRelativeImprovementCriterion(tolerance);
    Minimize(function, ref initial, term, ref result, out optimum);
}
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) { Host.AssertValueOrNull(ch); Host.AssertValue(input); Host.Assert(0 <= iinfo && iinfo < Infos.Length); Host.Assert(Infos[iinfo].TypeSrc.IsVector); Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey); disposer = null; var getSrc = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, input, Infos[iinfo].Source); var src = default(VBuffer <uint>); var bldr = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength, _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo)); var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount; if (keyCount == 0) { keyCount = uint.MaxValue; } ValueGetter <VBuffer <Float> > del; switch (_exes[iinfo].Weighting) { case WeightingCriteria.TfIdf: Host.AssertValue(_invDocFreqs[iinfo]); del = (ref VBuffer <Float> dst) => { getSrc(ref src); if (!bldr.IsEmpty) { bldr.Reset(); bldr.AddNgrams(ref src, 0, keyCount); bldr.GetResult(ref dst); VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i])); } else { dst = new VBuffer <Float>(0, dst.Values, dst.Indices); } }; break; case WeightingCriteria.Idf: Host.AssertValue(_invDocFreqs[iinfo]); del = (ref VBuffer <Float> dst) => { getSrc(ref src); if (!bldr.IsEmpty) { bldr.Reset(); bldr.AddNgrams(ref src, 0, keyCount); bldr.GetResult(ref dst); VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0); } else { dst = new VBuffer <Float>(0, dst.Values, dst.Indices); } }; break; case WeightingCriteria.Tf: del = (ref VBuffer <Float> dst) => { getSrc(ref src); if (!bldr.IsEmpty) { bldr.Reset(); bldr.AddNgrams(ref src, 0, keyCount); bldr.GetResult(ref dst); } else { dst = new VBuffer <Float>(0, dst.Values, dst.Indices); } }; break; default: throw Host.Except("Unsupported weighting criteria"); } return(del); }
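The three weighting branches above implement the standard term-weighting formulas, using the inverse document frequencies computed during training (where $\texttt{invDocFreqs}[i] = \log(N/\text{df}_i)$):

\[
\text{tf}(i) = c_i,
\qquad
\text{idf}(i) = \log\frac{N}{\text{df}_i},
\qquad
\text{tf-idf}(i) = c_i \cdot \log\frac{N}{\text{df}_i},
\]

where $c_i$ is the raw count emitted by NgramBufferBuilder for slot $i$, $N$ is the number of training documents, and $\text{df}_i$ is the number of documents containing n-gram $i$; the Idf branch emits $\text{idf}(i)$ only for slots with $c_i \ge 1$ and zero otherwise.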
public VBuffer[] GetVBuffers() { if (LogLoaded) { if (IsLogD3D11) { VBuffer[] ret = new VBuffer[m_D3D11.m_IA.vbuffers.Length]; for (int i = 0; i < m_D3D11.m_IA.vbuffers.Length; i++) { ret[i].Buffer = m_D3D11.m_IA.vbuffers[i].Buffer; ret[i].ByteOffset = m_D3D11.m_IA.vbuffers[i].Offset; ret[i].ByteStride = m_D3D11.m_IA.vbuffers[i].Stride; } return ret; } else if (IsLogGL) { VBuffer[] ret = new VBuffer[m_GL.m_VtxIn.vbuffers.Length]; for (int i = 0; i < m_GL.m_VtxIn.vbuffers.Length; i++) { ret[i].Buffer = m_GL.m_VtxIn.vbuffers[i].Buffer; ret[i].ByteOffset = m_GL.m_VtxIn.vbuffers[i].Offset; ret[i].ByteStride = m_GL.m_VtxIn.vbuffers[i].Stride; } return ret; } } return null; }
private static Delegate GetDefaultVectorGetter<TValue>()
{
    ValueGetter<VBuffer<TValue>> getter = (ref VBuffer<TValue> value) => value = new VBuffer<TValue>(AllVectorSizes, 0, null, null);
    return getter;
}