/// <summary>
/// Trains the linear model by feeding cached training data to the native SymSGD learner.
/// </summary>
/// <param name="ch">Channel for user-facing argument checks and informational messages.</param>
/// <param name="data">Training data with label and feature roles mapped.</param>
/// <param name="predictor">Optional warm-start model whose weights and bias seed training; may be null.</param>
/// <param name="weightSetCount">Number of weight sets; not used by this implementation but part of the trainer contract.</param>
/// <returns>The predictor built from the final weights and bias.</returns>
private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearModelParameters predictor, int weightSetCount)
{
    int numFeatures = data.Schema.Feature.Value.Type.GetVectorSize();
    var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Label | CursOpt.Features);
    // Only single-threaded training is exercised here; the check documents the option's contract.
    int numThreads = 1;
    ch.CheckUserArg(numThreads > 0, nameof(_options.NumberOfThreads),
        "The number of threads must be either null or a positive integer.");

    // Start from the warm-start model when one is supplied; otherwise from dense zero weights.
    VBuffer<float> weights = default;
    float bias = 0.0f;
    if (predictor != null)
    {
        predictor.GetFeatureWeights(ref weights);
        // The native learner indexes the weight array directly, so densify first.
        VBufferUtils.Densify(ref weights);
        bias = predictor.Bias;
    }
    else
    {
        weights = VBufferUtils.CreateDense<float>(numFeatures);
    }
    var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights);

    // Reference: Parasail. SymSGD.
    // When the user did not fix a learning rate / update frequency, the native code tunes them
    // (signaled by the tune* flags; the ref parameters receive the tuned values back).
    bool tuneLR = _options.LearningRate == null;
    var lr = _options.LearningRate ?? 1.0f;
    bool tuneNumLocIter = (_options.UpdateFrequency == null);
    var numLocIter = _options.UpdateFrequency ?? 1;
    var l2Const = _options.L2Regularization;
    var piw = _options.PositiveInstanceWeight;

    // This is state of the learner that is shared with the native code.
    // It is pinned so the native side can hold a stable pointer to it for the duration of training.
    State state = new State();
    GCHandle stateGCHandle = default;
    try
    {
        stateGCHandle = GCHandle.Alloc(state, GCHandleType.Pinned);
        state.TotalInstancesProcessed = 0;
        using (InputDataManager inputDataManager = new InputDataManager(this, cursorFactory, ch))
        {
            bool shouldInitialize = true;
            using (var pch = Host.StartProgressChannel("Preprocessing"))
            {
                inputDataManager.LoadAsMuchAsPossible();
            }
            int iter = 0;
            if (inputDataManager.IsFullyLoaded)
            {
                ch.Info("Data fully loaded into memory.");
            }
            using (var pch = Host.StartProgressChannel("Training"))
            {
                if (inputDataManager.IsFullyLoaded)
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, state.PassIteration, _options.NumberOfIterations));
                    // If fully loaded, call the SymSGDNative and do not come back until learned for all iterations.
                    Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                        _options.NumberOfIterations, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance,
                        _options.Shuffle, shouldInitialize, stateGCHandle, ch.Info);
                    shouldInitialize = false;
                }
                else
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, iter, _options.NumberOfIterations));
                    // Since we loaded data in batch sizes, multiple passes over the loaded data is feasible.
                    int numPassesForABatch = inputDataManager.Count / 10000;
                    while (iter < _options.NumberOfIterations)
                    {
                        // We want to train on the final passes thoroughly (without learning on the same batch multiple times)
                        // This is for fine tuning the AUC. Experimentally, we found that 1 or 2 passes is enough
                        int numFinalPassesToTrainThoroughly = 2;
                        // We also do not want to learn for more passes than what the user asked
                        int numPassesForThisBatch = Math.Min(numPassesForABatch,
                            _options.NumberOfIterations - iter - numFinalPassesToTrainThoroughly);
                        // If all of this leaves us with 0 passes, then set numPassesForThisBatch to 1
                        numPassesForThisBatch = Math.Max(1, numPassesForThisBatch);
                        state.PassIteration = iter;
                        Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                            numPassesForThisBatch, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance,
                            _options.Shuffle, shouldInitialize, stateGCHandle, ch.Info);
                        shouldInitialize = false;

                        // Check if we are done with going through the data
                        if (inputDataManager.FinishedTheLoad)
                        {
                            iter += numPassesForThisBatch;
                            // Check if more passes are left
                            if (iter < _options.NumberOfIterations)
                            {
                                inputDataManager.RestartLoading(_options.Shuffle, Host);
                            }
                        }
                        // If more passes are left, load as much as possible
                        if (iter < _options.NumberOfIterations)
                        {
                            inputDataManager.LoadAsMuchAsPossible();
                        }
                    }
                }
                // Maps back the dense features that are mislocated
                if (numThreads > 1)
                {
                    Native.MapBackWeightVector(weightsEditor.Values, stateGCHandle);
                }
                Native.DeallocateSequentially(stateGCHandle);
            }
        }
    }
    finally
    {
        // Always release the pinned handle, even if the native call throws.
        if (stateGCHandle.IsAllocated)
        {
            stateGCHandle.Free();
        }
    }
    return (CreatePredictor(weights, bias));
}
/// <summary>
/// Builds a getter that converts a vector of text values into a single vector of UTF-16 code units
/// (<see cref="ushort"/>), for character-level tokenization. Two variants are returned depending on
/// <c>_parent._isSeparatorStartEnd</c>:
/// start/end markers around EACH non-empty item, or one marker pair around the WHOLE vector with a
/// unit-separator character between items.
/// </summary>
/// <param name="input">Row to read the source text column from.</param>
/// <param name="iinfo">Index of the output column; mapped to the source column via <c>ColMapNewToOld</c>.</param>
private ValueGetter<VBuffer<ushort>> MakeGetterVec(DataViewRow input, int iinfo)
{
    Host.AssertValue(input);
    int cv = input.Schema[ColMapNewToOld[iinfo]].Type.GetVectorSize();
    Contracts.Assert(cv >= 0);
    var getSrc = input.GetGetter<VBuffer<ReadOnlyMemory<char>>>(input.Schema[ColMapNewToOld[iinfo]]);
    // Reused across calls: the getter re-fills this buffer on every invocation.
    var src = default(VBuffer<ReadOnlyMemory<char>>);

    // Variant 1: every non-empty text item is individually wrapped with start/end markers.
    ValueGetter<VBuffer<ushort>> getterWithStartEndSep =
        (ref VBuffer<ushort> dst) =>
        {
            getSrc(ref src);

            // First pass: compute the exact output length so the editor can be sized once.
            int len = 0;
            var srcValues = src.GetValues();
            for (int i = 0; i < srcValues.Length; i++)
            {
                if (!srcValues[i].IsEmpty)
                {
                    len += srcValues[i].Length;
                    if (_parent._useMarkerChars)
                    {
                        // Two marker characters (start + end) per non-empty item.
                        len += TextMarkersCount;
                    }
                }
            }

            // Second pass: copy characters (and markers) into the output buffer.
            var editor = VBufferEditor.Create(ref dst, len);
            if (len > 0)
            {
                int index = 0;
                for (int i = 0; i < srcValues.Length; i++)
                {
                    if (srcValues[i].IsEmpty)
                    {
                        continue;
                    }
                    if (_parent._useMarkerChars)
                    {
                        editor.Values[index++] = TextStartMarker;
                    }
                    var span = srcValues[i].Span;
                    for (int ich = 0; ich < srcValues[i].Length; ich++)
                    {
                        editor.Values[index++] = span[ich];
                    }
                    if (_parent._useMarkerChars)
                    {
                        editor.Values[index++] = TextEndMarker;
                    }
                }
                // The two passes must agree exactly on the length.
                Contracts.Assert(index == len);
            }
            dst = editor.Commit();
        };

    // Variant 2: the whole vector is treated as one text stream — one marker pair overall,
    // with a unit separator between consecutive items.
    ValueGetter<VBuffer<ushort>> getterWithUnitSep =
        (ref VBuffer<ushort> dst) =>
        {
            getSrc(ref src);

            // First pass: compute the exact output length.
            int len = 0;
            var srcValues = src.GetValues();
            for (int i = 0; i < srcValues.Length; i++)
            {
                if (!srcValues[i].IsEmpty)
                {
                    len += srcValues[i].Length;
                    if (i > 0)
                    {
                        len += 1; // add UnitSeparator character to len that will be added
                    }
                }
            }
            if (_parent._useMarkerChars)
            {
                // Markers are added once for the entire vector, not per item.
                len += TextMarkersCount;
            }

            var editor = VBufferEditor.Create(ref dst, len);
            if (len > 0)
            {
                int index = 0;

                // ReadOnlyMemory can be a result of either concatenating text columns together
                // or application of word tokenizer before char tokenizer in TextFeaturizingEstimator.
                //
                // Considering VBuffer<ReadOnlyMemory> as a single text stream.
                // Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector.
                // Insert UnitSeparator after every piece of text in the vector.
                if (_parent._useMarkerChars)
                {
                    editor.Values[index++] = TextStartMarker;
                }
                for (int i = 0; i < srcValues.Length; i++)
                {
                    if (srcValues[i].IsEmpty)
                    {
                        continue;
                    }
                    if (i > 0)
                    {
                        // NOTE(review): the separator is keyed on item index, not on "previous non-empty
                        // item" — an empty item at position 0 still causes a separator before item 1,
                        // which matches the length computed in the first pass.
                        editor.Values[index++] = UnitSeparator;
                    }
                    var span = srcValues[i].Span;
                    for (int ich = 0; ich < srcValues[i].Length; ich++)
                    {
                        editor.Values[index++] = span[ich];
                    }
                }
                if (_parent._useMarkerChars)
                {
                    editor.Values[index++] = TextEndMarker;
                }
                // The two passes must agree exactly on the length.
                Contracts.Assert(index == len);
            }
            dst = editor.Commit();
        };

    return (_parent._isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep);
}
/// <inheritdoc/>
// One pass of lock-free SDCA dual updates for the multiclass case. Each worker thread runs this
// concurrently; dual variables are updated via compare-and-swap (see the inner trials loop), while
// weights/biases are updated without synchronization — hence "WithoutLock".
private protected override void TrainWithoutLock(IProgressChannelProvider progress, FloatLabelCursor.Factory cursorFactory, Random rand,
    IdToIdxLookup idToIdx, int numThreads, DualsTableBase duals, Float[] biasReg, Float[] invariants, Float lambdaNInv,
    VBuffer<Float>[] weights, Float[] biasUnreg, VBuffer<Float>[] l1IntermediateWeights, Float[] l1IntermediateBias, Float[] featureNormSquared)
{
    Contracts.AssertValueOrNull(progress);
    Contracts.Assert(Args.L1Threshold.HasValue);
    Contracts.AssertValueOrNull(idToIdx);
    Contracts.AssertValueOrNull(invariants);
    Contracts.AssertValueOrNull(featureNormSquared);
    int numClasses = Utils.Size(weights);
    Contracts.Assert(Utils.Size(biasReg) == numClasses);
    Contracts.Assert(Utils.Size(biasUnreg) == numClasses);

    // With more threads, CAS conflicts are more likely, so allow more retries per dual.
    int maxUpdateTrials = 2 * numThreads;
    var l1Threshold = Args.L1Threshold.Value;
    bool l1ThresholdZero = l1Threshold == 0;
    var lr = Args.BiasLearningRate * Args.L2Const.Value;

    var pch = progress != null ? progress.StartProgressChannel("Dual update") : null;
    using (pch)
    using (var cursor = Args.Shuffle ? cursorFactory.Create(rand) : cursorFactory.Create())
    {
        long rowCount = 0;
        if (pch != null)
        {
            pch.SetHeader(new ProgressHeader("examples"), e => e.SetProgress(0, rowCount));
        }

        Func<RowId, long> getIndexFromId = GetIndexFromIdGetter(idToIdx, biasReg.Length);
        while (cursor.MoveNext())
        {
            long idx = getIndexFromId(cursor.Id);
            // Duals are laid out contiguously per example: one slot per class.
            long dualIndexInitPos = idx * numClasses;
            var features = cursor.Features;
            var label = (int)cursor.Label;
            Float invariant;
            Float normSquared;
            if (invariants != null)
            {
                // Precomputed per-example invariants are available; reuse them.
                invariant = invariants[idx];
                Contracts.AssertValue(featureNormSquared);
                normSquared = featureNormSquared[idx];
            }
            else
            {
                normSquared = VectorUtils.NormSquared(in features);
                if (Args.BiasLearningRate == 0)
                {
                    normSquared += 1;
                }
                invariant = _loss.ComputeDualUpdateInvariant(2 * normSquared * lambdaNInv * GetInstanceWeight(cursor));
            }

            // The output for the label class using current weights and bias.
            var labelOutput = WDot(in features, in weights[label], biasReg[label] + biasUnreg[label]);
            var instanceWeight = GetInstanceWeight(cursor);

            // This will be the new dual variable corresponding to the label class.
            Float labelDual = 0;

            // This will be used to update the weights and regularized bias corresponding to the label class.
            Float labelPrimalUpdate = 0;

            // This will be used to update the unregularized bias corresponding to the label class.
            Float labelAdjustment = 0;

            // Iterates through all classes.
            for (int iClass = 0; iClass < numClasses; iClass++)
            {
                // Skip the dual/weights/bias update for label class. Will be taken care of at the end.
                if (iClass == label)
                {
                    continue;
                }

                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[iClass]);
                var l1IntermediateWeightsEditor =
                    !l1ThresholdZero ? VBufferEditor.CreateFromBuffer(ref l1IntermediateWeights[iClass]) : default;

                // Loop trials for compare-and-swap updates of duals.
                // In general, concurrent update conflict to the same dual variable is rare
                // if data is shuffled.
                for (int numTrials = 0; numTrials < maxUpdateTrials; numTrials++)
                {
                    long dualIndex = iClass + dualIndexInitPos;
                    var dual = duals[dualIndex];
                    var output = labelOutput + labelPrimalUpdate * normSquared -
                        WDot(in features, in weights[iClass], biasReg[iClass] + biasUnreg[iClass]);
                    var dualUpdate = _loss.DualUpdate(output, 1, dual, invariant, numThreads);

                    // The successive over-relaxation approach to adjust the sum of dual variables (biasReg) to zero.
                    // Reference to details: http://stat.rutgers.edu/home/tzhang/papers/ml02_dual.pdf, pp. 16-17.
                    var adjustment = l1ThresholdZero ? lr * biasReg[iClass] : lr * l1IntermediateBias[iClass];
                    dualUpdate -= adjustment;

                    // Attempt a lock-free CAS on the dual; on conflict another thread won, so retry.
                    bool success = false;
                    duals.ApplyAt(dualIndex, (long index, ref Float value) =>
                        success = Interlocked.CompareExchange(ref value, dual + dualUpdate, dual) == dual);
                    if (success)
                    {
                        // Note: dualConstraint[iClass] = lambdaNInv * (sum of duals[iClass])
                        var primalUpdate = dualUpdate * lambdaNInv * instanceWeight;
                        labelDual -= dual + dualUpdate;
                        labelPrimalUpdate += primalUpdate;
                        biasUnreg[iClass] += adjustment * lambdaNInv * instanceWeight;
                        labelAdjustment -= adjustment;

                        if (l1ThresholdZero)
                        {
                            // No L1: plain primal update on the weights and regularized bias.
                            VectorUtils.AddMult(in features, weightsEditor.Values, -primalUpdate);
                            biasReg[iClass] -= primalUpdate;
                        }
                        else
                        {
                            //Iterative shrinkage-thresholding (aka. soft-thresholding)
                            //Update v=denseWeights as if there's no L1
                            //Thresholding: if |v[j]| < threshold, turn off weights[j]
                            //If not, shrink: w[j] = v[i] - sign(v[j]) * threshold
                            l1IntermediateBias[iClass] -= primalUpdate;
                            if (Args.BiasLearningRate == 0)
                            {
                                biasReg[iClass] = Math.Abs(l1IntermediateBias[iClass]) - l1Threshold > 0.0
                                    ? l1IntermediateBias[iClass] - Math.Sign(l1IntermediateBias[iClass]) * l1Threshold
                                    : 0;
                            }

                            var featureValues = features.GetValues();
                            if (features.IsDense)
                            {
                                CpuMathUtils.SdcaL1UpdateDense(-primalUpdate, featureValues.Length, featureValues,
                                    l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                            }
                            else if (featureValues.Length > 0)
                            {
                                CpuMathUtils.SdcaL1UpdateSparse(-primalUpdate, featureValues.Length, featureValues,
                                    features.GetIndices(), l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                            }
                        }
                        break;
                    }
                }
            }

            // Updating with label class weights and dual variable.
            duals[label + dualIndexInitPos] = labelDual;
            biasUnreg[label] += labelAdjustment * lambdaNInv * instanceWeight;
            if (l1ThresholdZero)
            {
                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[label]);
                VectorUtils.AddMult(in features, weightsEditor.Values, labelPrimalUpdate);
                biasReg[label] += labelPrimalUpdate;
            }
            else
            {
                // Soft-threshold the label class too, mirroring the non-label branch above.
                l1IntermediateBias[label] += labelPrimalUpdate;
                var intermediateBias = l1IntermediateBias[label];
                biasReg[label] = Math.Abs(intermediateBias) - l1Threshold > 0.0
                    ? intermediateBias - Math.Sign(intermediateBias) * l1Threshold
                    : 0;

                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[label]);
                var l1IntermediateWeightsEditor = VBufferEditor.CreateFromBuffer(ref l1IntermediateWeights[label]);
                var featureValues = features.GetValues();
                if (features.IsDense)
                {
                    CpuMathUtils.SdcaL1UpdateDense(labelPrimalUpdate, featureValues.Length, featureValues,
                        l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                }
                else if (featureValues.Length > 0)
                {
                    CpuMathUtils.SdcaL1UpdateSparse(labelPrimalUpdate, featureValues.Length, featureValues,
                        features.GetIndices(), l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                }
            }
            rowCount++;
        }
    }
}
/// <summary>
/// Creates the three output getters for the clustering scorer: the assigned cluster id, the
/// cluster ids sorted by score, and the sorted scores themselves. All three share a per-row
/// cache of the score vector and its score-ordered cluster indices.
/// </summary>
/// <param name="input">Row supplying the raw score vector.</param>
/// <param name="activeCols">Predicate telling which output columns are requested.</param>
/// <param name="disposer">Always null; nothing to dispose.</param>
private protected override Delegate[] CreateGettersCore(Row input, Func<int, bool> activeCols, out Action disposer)
{
    disposer = null;
    var getters = new Delegate[3];
    // Nothing requested — return the empty delegate array without wiring any getter.
    if (!activeCols(ClusterIdCol) && !activeCols(SortedClusterCol) && !activeCols(SortedClusterScoreCol))
    {
        return getters;
    }

    // Per-row cache shared by all getters: the raw score buffer, a densified copy,
    // and cluster indices ranked by ascending score (best cluster first).
    long lastPosition = -1;
    VBuffer<Single> scoreBuffer = default(VBuffer<Single>);
    var denseScores = new Single[_numClusters];
    int[] rankedClusters = new int[_numClusters];
    var scoreGetter = input.GetGetter<VBuffer<Single>>(ScoreIndex);

    // Re-reads and re-ranks the scores only when the cursor has moved to a new row.
    void RefreshCache()
    {
        if (lastPosition == input.Position)
        {
            return;
        }
        scoreGetter(ref scoreBuffer);
        scoreBuffer.CopyTo(denseScores);
        int slot = 0;
        // OrderBy is a stable sort, so ties keep their original cluster order.
        foreach (var cluster in Enumerable.Range(0, denseScores.Length).OrderBy(c => denseScores[c]))
        {
            rankedClusters[slot++] = cluster;
        }
        lastPosition = input.Position;
    }

    if (activeCols(ClusterIdCol))
    {
        ValueGetter<uint> clusterIdGetter = (ref uint dst) =>
        {
            RefreshCache();
            // Cluster ids are 1-based keys; rankedClusters[0] is the best-scoring cluster.
            dst = (uint)rankedClusters[0] + 1;
        };
        getters[ClusterIdCol] = clusterIdGetter;
    }
    if (activeCols(SortedClusterScoreCol))
    {
        ValueGetter<VBuffer<Single>> sortedScoresGetter = (ref VBuffer<Single> dst) =>
        {
            RefreshCache();
            var editor = VBufferEditor.Create(ref dst, _numClusters);
            for (int i = 0; i < _numClusters; i++)
            {
                editor.Values[i] = scoreBuffer.GetItemOrDefault(rankedClusters[i]);
            }
            dst = editor.Commit();
        };
        getters[SortedClusterScoreCol] = sortedScoresGetter;
    }
    if (activeCols(SortedClusterCol))
    {
        ValueGetter<VBuffer<uint>> sortedClustersGetter = (ref VBuffer<uint> dst) =>
        {
            RefreshCache();
            var editor = VBufferEditor.Create(ref dst, _numClusters);
            for (int i = 0; i < _numClusters; i++)
            {
                editor.Values[i] = (uint)rankedClusters[i] + 1;
            }
            dst = editor.Commit();
        };
        getters[SortedClusterCol] = sortedClustersGetter;
    }
    return getters;
}
/// <summary>
/// Getter generator for inputs of type <typeparamref name="TSrc"/>, where output type is a vector of hashes.
/// Each output slot hashes the source slots listed in its slot map, optionally mixing in the slot
/// index first (ordered hashing), and masks the result down to the configured number of hash bits.
/// </summary>
/// <typeparam name="TSrc">Input type. Must be a vector</typeparam>
/// <param name="input">Row input</param>
/// <param name="iinfo">Index of the getter</param>
private ValueGetter<VBuffer<uint>> ComposeGetterVecToVec<TSrc>(Row input, int iinfo)
{
    Host.AssertValue(input);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    var getSrc = GetSrcGetter<VBuffer<TSrc>>(input, iinfo);
    var hashFunction = ComposeHashDelegate<TSrc>();
    var src = default(VBuffer<TSrc>);
    int n = _exes[iinfo].OutputValueCount;
    int expectedSrcLength = Infos[iinfo].TypeSrc.VectorSize;
    // slotMap[i] lists the source slots whose values are folded into output hash i.
    int[][] slotMap = _exes[iinfo].SlotMap;
    // REVIEW: consider adding a fix-zero functionality (subtract emptyTextHash from all hashes)
    var mask = (1U << _exes[iinfo].HashBits) - 1;
    var hashSeed = _exes[iinfo].HashSeed;
    bool ordered = _exes[iinfo].Ordered;
    // Lazily-allocated scratch buffer, reused across calls to avoid per-row allocation.
    TSrc[] denseValues = null;
    return
        ((ref VBuffer<uint> dst) =>
        {
            getSrc(ref src);
            Host.Check(src.Length == expectedSrcLength);
            ReadOnlySpan<TSrc> values;
            // force-densify the input
            // REVIEW: this performs poorly if only a fraction of sparse vector is used for hashing.
            // This scenario was unlikely at the time of writing. Regardless of performance, the hash value
            // needs to be consistent across equivalent representations - sparse vs dense.
            if (src.IsDense)
            {
                values = src.GetValues();
            }
            else
            {
                if (denseValues == null)
                {
                    denseValues = new TSrc[expectedSrcLength];
                }
                src.CopyTo(denseValues);
                values = denseValues;
            }
            var hashes = VBufferEditor.Create(ref dst, n);
            for (int i = 0; i < n; i++)
            {
                uint hash = hashSeed;
                foreach (var srcSlot in slotMap[i])
                {
                    // REVIEW: some legacy code hashes 0 for srcSlot in ord- case, do we need to preserve this behavior?
                    if (ordered)
                    {
                        // Ordered hashing mixes the slot index in first so identical values in
                        // different slots hash differently.
                        hash = Hashing.MurmurRound(hash, (uint)srcSlot);
                    }
                    hash = hashFunction(in values[srcSlot], hash);
                }
                hashes.Values[i] = (Hashing.MixHash(hash) & mask) + 1; // +1 to offset from zero, which has special meaning for KeyType
            }
            dst = hashes.Commit();
        });
}
/// <summary>
/// Produces the anomaly-detection output vector for one input point. The output slots are:
/// [0] alert flag (0/1), [1] raw anomaly score, [2] p-value score, [3] martingale score —
/// slots beyond [1] are only filled when the corresponding scoring mode is active, and are
/// NaN otherwise. This method mutates sequential state (score buffer, martingale accumulators,
/// alert cooldown counter) and so must be called once per point, in order.
/// </summary>
private protected override sealed void TransformCore(ref TInput input, FixedSizeQueue<TInput> windowedBuffer, long iteration, ref VBuffer<Double> dst)
{
    var outputLength = Parent.OutputLength;
    Host.Assert(outputLength >= 2);
    var result = VBufferEditor.Create(ref dst, outputLength);
    float rawScore = 0;
    // Default every slot to NaN; only the slots relevant to the active mode get real values.
    for (int i = 0; i < outputLength; ++i)
    {
        result.Values[i] = Double.NaN;
    }

    // Step 1: Computing the raw anomaly score
    result.Values[1] = ComputeRawAnomalyScore(ref input, windowedBuffer, iteration);

    if (Double.IsNaN(result.Values[1]))
    {
        // No score could be computed (e.g. not enough history); emit "no alert" and stop.
        result.Values[0] = 0;
    }
    else
    {
        if (WindowSize > 0)
        {
            // Step 2: Computing the p-value score
            rawScore = (float)result.Values[1];
            if (Parent.ThresholdScore == AlertingScore.RawScore)
            {
                // Raw-score alerting: fold the configured side into the sign of the score.
                switch (Parent.Side)
                {
                    case AnomalySide.Negative:
                        rawScore = (float)(-result.Values[1]);
                        break;

                    case AnomalySide.Positive:
                        break;

                    default:
                        // Two-sided: magnitude only.
                        rawScore = (float)Math.Abs(result.Values[1]);
                        break;
                }
            }
            else
            {
                result.Values[2] = ComputeKernelPValue(rawScore);

                // Fold the configured side into the p-value.
                switch (Parent.Side)
                {
                    case AnomalySide.Negative:
                        result.Values[2] = 1 - result.Values[2];
                        break;

                    case AnomalySide.Positive:
                        break;

                    default:
                        result.Values[2] = Math.Min(result.Values[2], 1 - result.Values[2]);
                        break;
                }

                // Keeping the p-value in the safe range
                if (result.Values[2] < SequentialAnomalyDetectionTransformBase<TInput, TState>.MinPValue)
                {
                    result.Values[2] = SequentialAnomalyDetectionTransformBase<TInput, TState>.MinPValue;
                }
                else if (result.Values[2] > SequentialAnomalyDetectionTransformBase<TInput, TState>.MaxPValue)
                {
                    result.Values[2] = SequentialAnomalyDetectionTransformBase<TInput, TState>.MaxPValue;
                }

                RawScoreBuffer.AddLast(rawScore);

                // Step 3: Computing the martingale value
                if (Parent.Martingale != MartingaleType.None && Parent.ThresholdScore == AlertingScore.MartingaleScore)
                {
                    Double martingaleUpdate = 0;
                    switch (Parent.Martingale)
                    {
                        case MartingaleType.Power:
                            martingaleUpdate = Parent.LogPowerMartigaleBettingFunc(result.Values[2], Parent.PowerMartingaleEpsilon);
                            break;

                        case MartingaleType.Mixture:
                            martingaleUpdate = Parent.LogMixtureMartigaleBettingFunc(result.Values[2]);
                            break;
                    }

                    if (LogMartingaleUpdateBuffer.Count == 0)
                    {
                        // First update: seed the whole sliding window with this value so the
                        // running log-sum starts consistent with the buffer contents.
                        for (int i = 0; i < LogMartingaleUpdateBuffer.Capacity; ++i)
                        {
                            LogMartingaleUpdateBuffer.AddLast(martingaleUpdate);
                        }
                        _logMartingaleValue += LogMartingaleUpdateBuffer.Capacity * martingaleUpdate;
                    }
                    else
                    {
                        // Sliding window: add the new term, drop the term that falls out.
                        _logMartingaleValue += martingaleUpdate;
                        _logMartingaleValue -= LogMartingaleUpdateBuffer.PeekFirst();
                        LogMartingaleUpdateBuffer.AddLast(martingaleUpdate);
                    }

                    result.Values[3] = Math.Exp(_logMartingaleValue);
                }
            }
        }

        // Generating alert
        bool alert = false;

        if (RawScoreBuffer.IsFull) // No alert until the buffer is completely full.
        {
            switch (Parent.ThresholdScore)
            {
                case AlertingScore.RawScore:
                    alert = rawScore >= Parent.AlertThreshold;
                    break;

                case AlertingScore.PValueScore:
                    alert = result.Values[2] <= Parent.AlertThreshold;
                    break;

                case AlertingScore.MartingaleScore:
                    alert = (Parent.Martingale != MartingaleType.None) && (result.Values[3] >= Parent.AlertThreshold);
                    if (alert)
                    {
                        // Cooldown: suppress further martingale alerts for WindowSize points
                        // after an alert fires.
                        if (_martingaleAlertCounter > 0)
                        {
                            alert = false;
                        }
                        else
                        {
                            _martingaleAlertCounter = Parent.WindowSize;
                        }
                    }

                    _martingaleAlertCounter--;
                    _martingaleAlertCounter = _martingaleAlertCounter < 0 ? 0 : _martingaleAlertCounter;
                    break;
            }
        }

        result.Values[0] = Convert.ToDouble(alert);
    }
    dst = result.Commit();
}
// This converts in place.
// Expands a feature vector of length N into a sparse vector of length 2N where slot 2*i carries
// the original (non-zero, non-NaN) value and slot 2*i+1 is an indicator set to 1 when the
// original slot i was NaN. Zeros are dropped, so the result is always sparse.
private static void FillValues(IExceptionContext ectx, ref VBuffer<Float> buffer)
{
    int size = buffer.Length;
    // Non-short-circuit '&' matches the file's style for cheap range checks.
    ectx.Check(0 <= size & size < int.MaxValue / 2);

    var values = buffer.GetValues();
    // Output has double the logical length but at most the same number of physical entries,
    // since each source entry maps to exactly one output entry (value OR indicator).
    var editor = VBufferEditor.Create(ref buffer, size * 2, values.Length);
    int iivDst = 0;
    if (buffer.IsDense)
    {
        // Currently, it's dense. We always produce sparse.
        for (int ivSrc = 0; ivSrc < values.Length; ivSrc++)
        {
            // The write cursor can never pass the read cursor, so the in-place
            // conversion never overwrites unread source entries.
            ectx.Assert(iivDst <= ivSrc);
            var val = values[ivSrc];
            if (val == 0)
            {
                continue;
            }
            if (Float.IsNaN(val))
            {
                // NaN: light the indicator slot instead of copying the value.
                editor.Values[iivDst] = 1;
                editor.Indices[iivDst] = 2 * ivSrc + 1;
            }
            else
            {
                editor.Values[iivDst] = val;
                editor.Indices[iivDst] = 2 * ivSrc;
            }
            iivDst++;
        }
    }
    else
    {
        // Currently, it's sparse.
        var indices = buffer.GetIndices();
        int ivPrev = -1;
        for (int iivSrc = 0; iivSrc < values.Length; iivSrc++)
        {
            ectx.Assert(iivDst <= iivSrc);
            var val = values[iivSrc];
            if (val == 0)
            {
                continue;
            }
            int iv = indices[iivSrc];
            // Source indices must be strictly increasing and in range.
            ectx.Assert(ivPrev < iv & iv < size);
            ivPrev = iv;
            if (Float.IsNaN(val))
            {
                editor.Values[iivDst] = 1;
                editor.Indices[iivDst] = 2 * iv + 1;
            }
            else
            {
                editor.Values[iivDst] = val;
                editor.Indices[iivDst] = 2 * iv;
            }
            iivDst++;
        }
    }
    ectx.Assert(0 <= iivDst & iivDst <= values.Length);
    // Truncate to the entries actually written (zeros were skipped).
    buffer = editor.CommitTruncated(iivDst);
}
// Produces slot names for the expanded (value, indicator) output column: each source slot
// contributes a pair of names — the source name, and the source name plus IndicatorSuffix.
private void GetSlotNames(int iinfo, ref VBuffer<ReadOnlyMemory<char>> dst)
{
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);

    int size = _types[iinfo].VectorSize;
    if (size == 0)
    {
        throw MetadataUtils.ExceptGetMetadata();
    }

    var editor = VBufferEditor.Create(ref dst, size);

    var type = Infos[iinfo].TypeSrc;
    if (!type.IsVector)
    {
        // Scalar source: the two output slots are named after the column itself.
        Host.Assert(_types[iinfo].VectorSize == 2);
        var columnName = Source.Schema.GetColumnName(Infos[iinfo].Source);
        editor.Values[0] = columnName.AsMemory();
        editor.Values[1] = (columnName + IndicatorSuffix).AsMemory();
    }
    else
    {
        Host.Assert(type.IsKnownSizeVector);
        Host.Assert(size == 2 * type.VectorSize);

        // REVIEW: Do we need to verify that there is metadata or should we just call GetMetadata?
        var typeNames = Source.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source);
        if (typeNames == null || typeNames.VectorSize != type.VectorSize || !typeNames.ItemType.IsText)
        {
            throw MetadataUtils.ExceptGetMetadata();
        }

        var names = default(VBuffer<ReadOnlyMemory<char>>);
        Source.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source, ref names);

        // We both assert and check. If this fails, there is a bug somewhere (possibly in this code
        // but more likely in the implementation of Base. On the other hand, we don't want to proceed
        // if we've received garbage.
        Host.Check(names.Length == type.VectorSize, "Unexpected slot name vector size");

        var sb = new StringBuilder();
        int slot = 0;
        foreach (var kvp in names.Items(all: true))
        {
            Host.Assert(0 <= slot && slot < size);
            Host.Assert(slot % 2 == 0);

            sb.Clear();
            if (kvp.Value.IsEmpty)
            {
                // Unnamed source slot: synthesize a positional name like "[3]".
                sb.Append('[').Append(slot / 2).Append(']');
            }
            else
            {
                sb.AppendMemory(kvp.Value);
            }

            int len = sb.Length;
            sb.Append(IndicatorSuffix);
            var str = sb.ToString();

            // One allocation backs both names: the value slot gets the prefix (name without
            // suffix), the indicator slot gets the full string.
            editor.Values[slot++] = str.AsMemory().Slice(0, len);
            editor.Values[slot++] = str.AsMemory();
        }
        Host.Assert(slot == size);
    }

    dst = editor.Commit();
}
/// <summary>
/// Builds a <see cref="DataViewSchema"/> describing the outputs of a TensorFlow graph: one column
/// per operator whose element type is representable in ML.NET, annotated with the operator's type
/// and (when it has inputs) the names of its upstream operators.
/// </summary>
/// <param name="ectx">Exception context (not used directly here; kept for the caller's contract).</param>
/// <param name="graph">TensorFlow graph to enumerate.</param>
/// <param name="opType">When non-null, only operators of this type are included.</param>
internal static DataViewSchema GetModelSchema(IExceptionContext ectx, Graph graph, string opType = null)
{
    var builder = new DataViewSchema.Builder();
    foreach (Operation op in graph)
    {
        // Optional filter: keep only operators of the requested type.
        if (opType != null && opType != op.OpType)
        {
            continue;
        }

        // Determine element type in Tensorflow tensor. For example, a vector of floats may get NumberType.R4 here.
        var tfType = op.OutputType(0);
        var mlType = DnnUtils.Tf2MlNetTypeOrNull(tfType);

        // If the type is not supported in ML.NET then we cannot represent it as a column in an Schema.
        // We also cannot output it with a TensorFlowTransform, so we skip it.
        // Furthermore, operators which have NumOutputs <= 0 needs to be filtered.
        // The 'GetTensorShape' method crashes TensorFlow runtime
        // (https://github.com/dotnet/machinelearning/issues/2156) when the operator has no outputs.
        if (mlType == null || op.NumOutputs <= 0)
        {
            continue;
        }

        // Construct the final ML.NET type of a Tensorflow variable. A known shape is used when the
        // trailing dimensions are all positive; a leading unknown dimension (batch) is dropped.
        var tensorShape = op.output.TensorShape.Dimensions;
        var columnType = new VectorDataViewType(mlType);
        int rank = Utils.Size(tensorShape);
        bool singleUnknownDim = rank == 1 && tensorShape[0] <= 0;
        bool trailingDimsKnown = rank > 0 && tensorShape.Skip(1).All(d => d > 0);
        if (!singleUnknownDim && trailingDimsKnown)
        {
            columnType = new VectorDataViewType(mlType,
                tensorShape[0] > 0 ? tensorShape : tensorShape.Skip(1).ToArray());
        }

        // There can be at most two metadata fields.
        // 1. The first field always presents. Its value is this operator's type. For example,
        //    if an output is produced by an "Softmax" operator, the value of this field should be "Softmax".
        // 2. The second field stores operators whose outputs are consumed by this operator, i.e. names
        //    of upstream operators that must be evaluated before this one. An operator with no inputs
        //    has no such field.
        var annotations = new DataViewSchema.Annotations.Builder();
        annotations.Add(TensorflowOperatorTypeKind, TextDataViewType.Instance,
            (ref ReadOnlyMemory<char> value) => value = op.OpType.AsMemory());

        if (op.NumInputs > 0)
        {
            // Materialize the upstream operator names once; the annotation getter below
            // captures and copies from this per-operator buffer.
            VBuffer<ReadOnlyMemory<char>> upstreamNames = default;
            var nameEditor = VBufferEditor.Create(ref upstreamNames, op.NumInputs);
            for (int i = 0; i < op.NumInputs; ++i)
            {
                nameEditor.Values[i] = op.inputs[i].op.name.AsMemory();
            }
            upstreamNames = nameEditor.Commit();

            annotations.Add(TensorflowUpstreamOperatorsKind,
                new VectorDataViewType(TextDataViewType.Instance, op.NumInputs),
                (ref VBuffer<ReadOnlyMemory<char>> value) => upstreamNames.CopyTo(ref value));
        }

        builder.AddColumn(op.name, columnType, annotations.ToAnnotations());
    }
    return builder.ToSchema();
}
// Reads the label column (a single transposed slot) and bins it into contiguous integer labels,
// setting the _labels and _numLabels fields. Numeric labels are binned; booleans get three bins
// (NA/false/true); key-typed labels are dispatched via reflection to GetKeyLabels for the key's
// raw type and need no shifting.
private void GetLabels(Transposer trans, DataViewType labelType, int labelCol)
{
    int min;
    int lim;
    var labels = default(VBuffer<int>);
    // Note: NAs have their own separate bin.
    if (labelType == NumberDataViewType.Int32)
    {
        var tmp = default(VBuffer<int>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinInts(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberDataViewType.Single)
    {
        var tmp = default(VBuffer<Single>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinSingles(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType == NumberDataViewType.Double)
    {
        var tmp = default(VBuffer<Double>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinDoubles(in tmp, ref labels, _numBins, out min, out lim);
        _numLabels = lim - min;
    }
    else if (labelType is BooleanDataViewType)
    {
        var tmp = default(VBuffer<bool>);
        trans.GetSingleSlotValue(labelCol, ref tmp);
        BinBools(in tmp, ref labels);
        // Three bins: NA, false, true — mapped from -1..1 by the shift below.
        _numLabels = 3;
        min = -1;
        lim = 2;
    }
    else
    {
        ulong labelKeyCount = labelType.GetKeyCount();
        Contracts.Assert(labelKeyCount < Utils.ArrayMaxSize);
        // Key-typed label: invoke GetKeyLabels<T> for the key's raw type via reflection,
        // since the raw type is only known at runtime.
        KeyLabelGetter<int> del = GetKeyLabels<int>;
        var methodInfo = del.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(labelType.RawType);
        var parameters = new object[] { trans, labelCol, labelType };
        _labels = (VBuffer<int>)methodInfo.Invoke(this, parameters);
        _numLabels = labelType.GetKeyCountAsInt32(_host) + 1;

        // No need to densify or shift in this case.
        return;
    }

    // Densify and shift labels so they occupy the contiguous range [0, _numLabels).
    VBufferUtils.Densify(ref labels);
    Contracts.Assert(labels.IsDense);
    var labelsEditor = VBufferEditor.CreateFromBuffer(ref labels);
    for (int i = 0; i < labels.Length; i++)
    {
        labelsEditor.Values[i] -= min;
        Contracts.Assert(labelsEditor.Values[i] < _numLabels);
    }
    _labels = labelsEditor.Commit();
}