/// <summary>
/// Copy from this buffer to the given destination, making sure to explicitly include the
/// first count indices in indicesInclude. Note that indicesInclude should be sorted
/// with each index less than this.Length. Note that this can make the destination be
/// dense even if "this" is sparse.
/// </summary>
public void CopyTo(ref VBuffer<T> dst, int[] indicesInclude, int count)
{
    Contracts.CheckParam(count >= 0, nameof(count));
    Contracts.CheckParam(Utils.Size(indicesInclude) >= count, nameof(indicesInclude));
    Contracts.CheckParam(Utils.Size(indicesInclude) <= Length, nameof(indicesInclude));
    // REVIEW: Ideally we should Check that indicesInclude is sorted and in range. Would that
    // check be too expensive?
#if DEBUG
    // Debug-only verification that the first 'count' entries of indicesInclude are
    // strictly increasing and all less than Length.
    int prev = -1;
    for (int i = 0; i < count; i++)
    {
        Contracts.Assert(prev < indicesInclude[i]);
        prev = indicesInclude[i];
    }
    Contracts.Assert(prev < Length);
#endif
    if (IsDense || count == 0)
    {
        // A dense source already explicitly represents every index, and with no
        // indices to force-include a plain copy suffices.
        CopyTo(ref dst);
        return;
    }
    if (count >= Length / 2 || Count >= Length / 2)
    {
        // Result would be at least half full - cheaper to just produce a dense copy.
        CopyToDense(ref dst);
        return;
    }
    var indices = dst.Indices;
    var values = dst.Values;
    if (Count == 0)
    {
        // No values in "this".
        // The result is exactly the indicesInclude positions, all with default values.
        if (Utils.Size(indices) < count)
        {
            indices = new int[count];
        }
        Array.Copy(indicesInclude, indices, count);
        if (Utils.Size(values) < count)
        {
            values = new T[count];
        }
        else
        {
            Array.Clear(values, 0, count);
        }
        dst = new VBuffer<T>(Length, count, values, indices);
        return;
    }
    // General case: merge this buffer's indices with indicesInclude.
    // 'max' starts as an upper bound on the merged size (union <= sum of both).
    int size = 0;
    int max = count + Count;
    Contracts.Assert(max < Length);
    int ii1;
    int ii2;
    if (max >= Length / 2 || Utils.Size(values) < max || Utils.Size(indices) < max)
    {
        // Compute the needed size.
        // Walk both sorted index lists once, counting the size of their union.
        ii1 = 0;
        ii2 = 0;
        for (; ;)
        {
            Contracts.Assert(ii1 < Count);
            Contracts.Assert(ii2 < count);
            size++;
            int diff = Indices[ii1] - indicesInclude[ii2];
            if (diff == 0)
            {
                // Shared index: advance both cursors; when one side is exhausted
                // the remainder of the other side contributes directly.
                ii1++;
                ii2++;
                if (ii1 >= Count)
                {
                    size += count - ii2;
                    break;
                }
                if (ii2 >= count)
                {
                    size += Count - ii1;
                    break;
                }
            }
            else if (diff < 0)
            {
                if (++ii1 >= Count)
                {
                    size += count - ii2;
                    break;
                }
            }
            else
            {
                if (++ii2 >= count)
                {
                    size += Count - ii1;
                    break;
                }
            }
        }
        Contracts.Assert(size >= count && size >= Count);
        if (size == Count)
        {
            // Every included index was already present - a plain copy is exact.
            CopyTo(ref dst);
            return;
        }
        if (size >= Length / 2)
        {
            CopyToDense(ref dst);
            return;
        }
        if (Utils.Size(values) < size)
        {
            values = new T[size];
        }
        if (Utils.Size(indices) < size)
        {
            indices = new int[size];
        }
        max = size;
    }
    // Second pass: actually merge values/indices into the destination buffers.
    // Indices present only in indicesInclude get default(T).
    int ii = 0;
    ii1 = 0;
    ii2 = 0;
    for (; ;)
    {
        Contracts.Assert(ii < max);
        Contracts.Assert(ii1 < Count);
        Contracts.Assert(ii2 < count);
        int i1 = Indices[ii1];
        int i2 = indicesInclude[ii2];
        if (i1 <= i2)
        {
            // Take the source value; if the index is also in indicesInclude, consume it too.
            indices[ii] = i1;
            values[ii] = Values[ii1];
            ii++;
            if (i1 == i2)
            {
                ii2++;
            }
            if (++ii1 >= Count)
            {
                if (ii2 >= count)
                {
                    break;
                }
                // Source exhausted: remaining included indices carry default values.
                Array.Clear(values, ii, count - ii2);
                Array.Copy(indicesInclude, ii2, indices, ii, count - ii2);
                ii += count - ii2;
                break;
            }
            if (ii2 >= count)
            {
                // Includes exhausted: bulk-copy the remaining source entries.
                Array.Copy(Values, ii1, values, ii, Count - ii1);
                Array.Copy(Indices, ii1, indices, ii, Count - ii1);
                ii += Count - ii1;
                break;
            }
        }
        else
        {
            // Included index absent from the source: emit an explicit default.
            indices[ii] = i2;
            values[ii] = default(T);
            ii++;
            if (++ii2 >= count)
            {
                Array.Copy(Values, ii1, values, ii, Count - ii1);
                Array.Copy(Indices, ii1, indices, ii, Count - ii1);
                ii += Count - ii1;
                break;
            }
        }
    }
    // size == 0 happens when the sizing pass was skipped (buffers were already big enough).
    Contracts.Assert(size == ii || size == 0);
    dst = new VBuffer<T>(Length, ii, values, indices);
}
/// <summary>
/// Metadata getter: copies the cached key-value names into <paramref name="dst"/>.
/// Only valid for column 0 and only when key names were registered.
/// </summary>
private void GetKeyNames(int col, ref VBuffer<DvText> dst)
{
    Contracts.Assert(col == 0);
    Contracts.AssertValue(_keyNamesType);

    _keyNames.CopyTo(ref dst);
}
/// <summary>
/// Metadata getter: copies the cached slot (term) names for column
/// <paramref name="iinfo"/> into <paramref name="dst"/>.
/// </summary>
private void GetTerms(int iinfo, ref VBuffer<ReadOnlyMemory<char>> dst)
{
    Host.Assert(0 <= iinfo && iinfo < _exes.Length);
    Host.Assert(_slotNames[iinfo].Length > 0);

    _slotNames[iinfo].CopyTo(ref dst);
}
/// <summary>
/// Copies <paramref name="src"/> into <paramref name="dst"/>, reusing dst's existing
/// buffers when they are large enough.
/// </summary>
protected override void CopyValue(ref VBuffer<TItem> src, ref VBuffer<TItem> dst)
    => src.CopyTo(ref dst);
/// <summary>
/// Writes into <paramref name="dst"/> the values of <paramref name="src"/> scaled by
/// scale / divisor (when divisor is positive) and shifted by -offset. A zero offset
/// preserves sparsity and reuses dst's buffers; a nonzero offset forces a dense result.
/// </summary>
private static void FillValues(IExceptionContext ectx, ref VBuffer<Float> src, ref VBuffer<Float> dst, Float divisor, Float scale, Float offset = 0)
{
    int count = src.Count;
    int length = src.Length;
    ectx.Assert(Utils.Size(src.Values) >= count);
    ectx.Assert(divisor >= 0);

    // A source with no explicit values stays all-zero no matter the scale or offset path.
    if (count == 0)
    {
        dst = new VBuffer<Float>(length, 0, dst.Values, dst.Indices);
        return;
    }
    ectx.Assert(count > 0);
    ectx.Assert(length > 0);

    Float normScale = divisor > 0 ? scale / divisor : scale;
    // Don't normalize small values.
    if (normScale < MinScale)
        normScale = 1;

    if (offset == 0)
    {
        // Pure scaling: the sparsity pattern is unchanged, so copy the indices (when
        // sparse) and scale the explicit values in place.
        var dstValues = dst.Values;
        if (Utils.Size(dstValues) < count)
            dstValues = new Float[count];

        var dstIndices = dst.Indices;
        if (!src.IsDense)
        {
            if (Utils.Size(dstIndices) < count)
                dstIndices = new int[count];
            Array.Copy(src.Indices, dstIndices, count);
        }

        SseUtils.Scale(normScale, src.Values, dstValues, count);
        dst = new VBuffer<Float>(length, count, dstValues, dstIndices);
        return;
    }

    // Subtracting the mean requires a dense representation.
    src.CopyToDense(ref dst);
    if (normScale != 1)
        SseUtils.ScaleAdd(normScale, -offset, dst.Values, length);
    else
        SseUtils.Add(-offset, dst.Values, length);
}
/// <summary>
/// Creates getters for the clustering evaluator's three output columns: the assigned
/// cluster id, the clusters sorted by score, and the sorted scores. Results are cached
/// per input row position so the score vector is fetched and sorted at most once per row.
/// </summary>
public override Delegate[] CreateGetters(IRow input, Func<int, bool> activeOutput, out Action disposer)
{
    disposer = null;
    var getters = new Delegate[3];
    // Nothing requested: return the (all-null) getter array without touching the input.
    if (!activeOutput(ClusterIdCol) && !activeOutput(SortedClusterCol) && !activeOutput(SortedClusterScoreCol))
    {
        return (getters);
    }
    // Shared mutable state captured by the getter closures below.
    long cachedPosition = -1;
    VBuffer<Single> scores = default(VBuffer<Single>);
    var scoresArr = new Single[_numClusters];
    int[] sortedIndices = new int[_numClusters];
    var scoreGetter = input.GetGetter<VBuffer<Single>>(ScoreIndex);
    // Refreshes the cached scores and the score-sorted cluster order when the cursor
    // has moved since the last call. Sorted ascending by score.
    Action updateCacheIfNeeded = () =>
    {
        if (cachedPosition != input.Position)
        {
            scoreGetter(ref scores);
            scores.CopyTo(scoresArr);
            int j = 0;
            foreach (var index in Enumerable.Range(0, scoresArr.Length).OrderBy(i => scoresArr[i]))
            {
                sortedIndices[j++] = index;
            }
            cachedPosition = input.Position;
        }
    };
    if (activeOutput(ClusterIdCol))
    {
        // Cluster ids are 1-based (key-typed output), hence the +1.
        ValueGetter<uint> assignedFn = (ref uint dst) =>
        {
            updateCacheIfNeeded();
            dst = (uint)sortedIndices[0] + 1;
        };
        getters[ClusterIdCol] = assignedFn;
    }
    if (activeOutput(SortedClusterScoreCol))
    {
        ValueGetter<VBuffer<Single>> topKScoresFn = (ref VBuffer<Single> dst) =>
        {
            updateCacheIfNeeded();
            var values = dst.Values;
            if (Utils.Size(values) < _numClusters)
            {
                values = new Single[_numClusters];
            }
            for (int i = 0; i < _numClusters; i++)
            {
                values[i] = scores.GetItemOrDefault(sortedIndices[i]);
            }
            dst = new VBuffer<Single>(_numClusters, values);
        };
        getters[SortedClusterScoreCol] = topKScoresFn;
    }
    if (activeOutput(SortedClusterCol))
    {
        ValueGetter<VBuffer<uint>> topKClassesFn = (ref VBuffer<uint> dst) =>
        {
            updateCacheIfNeeded();
            var values = dst.Values;
            if (Utils.Size(values) < _numClusters)
            {
                values = new uint[_numClusters];
            }
            for (int i = 0; i < _numClusters; i++)
            {
                // 1-based cluster ids, matching ClusterIdCol above.
                values[i] = (uint)sortedIndices[i] + 1;
            }
            dst = new VBuffer<uint>(_numClusters, values);
        };
        getters[SortedClusterCol] = topKClassesFn;
    }
    return (getters);
}
/// <summary>
/// Builds the per-row getter producing the (possibly weighted) n-gram count vector for
/// column <paramref name="iinfo"/>. The returned delegate's post-processing depends on
/// the configured weighting: Tf (raw counts), Idf, or TfIdf.
/// </summary>
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsKey);
    disposer = null;
    var getSrc = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, input, Infos[iinfo].Source);
    var src = default(VBuffer<uint>);
    var bldr = new NgramBufferBuilder(_exes[iinfo].NgramLength, _exes[iinfo].SkipLength, _ngramMaps[iinfo].Count, GetNgramIdFinder(iinfo));
    var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
    // KeyCount of 0 means an unbounded key type; treat every value as valid.
    if (keyCount == 0)
    {
        keyCount = uint.MaxValue;
    }
    ValueGetter<VBuffer<Float>> del;
    switch (_exes[iinfo].Weighting)
    {
        case WeightingCriteria.TfIdf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            del = (ref VBuffer<Float> dst) =>
            {
                getSrc(ref src);
                if (!bldr.IsEmpty)
                {
                    bldr.Reset();
                    bldr.AddNgrams(ref src, 0, keyCount);
                    bldr.GetResult(ref dst);
                    // tf-idf: multiply each term frequency by its inverse document frequency.
                    VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = (Float)(v * _invDocFreqs[iinfo][i]));
                }
                else
                {
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                }
            };
            break;
        case WeightingCriteria.Idf:
            Host.AssertValue(_invDocFreqs[iinfo]);
            del = (ref VBuffer<Float> dst) =>
            {
                getSrc(ref src);
                if (!bldr.IsEmpty)
                {
                    bldr.Reset();
                    bldr.AddNgrams(ref src, 0, keyCount);
                    bldr.GetResult(ref dst);
                    // idf: presence indicator scaled by inverse document frequency.
                    VBufferUtils.Apply(ref dst, (int i, ref Float v) => v = v >= 1 ? (Float)_invDocFreqs[iinfo][i] : 0);
                }
                else
                {
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                }
            };
            break;
        case WeightingCriteria.Tf:
            del = (ref VBuffer<Float> dst) =>
            {
                getSrc(ref src);
                if (!bldr.IsEmpty)
                {
                    bldr.Reset();
                    bldr.AddNgrams(ref src, 0, keyCount);
                    // Raw term frequencies, no reweighting.
                    bldr.GetResult(ref dst);
                }
                else
                {
                    dst = new VBuffer<Float>(0, dst.Values, dst.Indices);
                }
            };
            break;
        default:
            throw Host.Except("Unsupported weighting criteria");
    }
    return (del);
}
/// <summary>
/// Accumulates the per-example losses: first the standard L1/L2 terms against the
/// label, then the custom (user-configured) loss.
/// </summary>
protected override void UpdateCore(Float label, ref VBuffer<Float> score, ref VBuffer<Double> loss, Float weight)
{
    AddL1AndL2Loss(label, ref score, weight);
    AddCustomLoss(weight, ref loss);
}
/// <summary>
/// Overwrites each slot of <paramref name="loss"/> with the configured loss of the
/// corresponding slot of <paramref name="score"/> against <paramref name="label"/>.
/// </summary>
protected override void ApplyLossFunction(ref VBuffer<float> score, float label, ref VBuffer<Double> loss)
{
    VBufferUtils.ApplyWith(ref score, ref loss,
        (int slot, Float src, ref Double dst) => dst = LossFunction.Loss(src, label));
}
/// <summary>
/// Loads the term values vector from the model stream. The stream contains a codec
/// specification followed by a length-prefixed payload encoded by that codec.
/// </summary>
private static void Load(IChannel ch, ModelLoadContext ctx, CodecFactory factory, ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel(GetVersionInfo());
    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: As encoded using the codec

    // Get the codec from the factory, and from the stream. We have to
    // attempt to read the codec from the stream, since codecs can potentially
    // be versioned based on their parameterization.
    IValueCodec codec;
    // This *could* happen if we have an old version attempt to read a new version.
    // Enabling this sort of binary classification is why we also need to write the
    // codec specification.
    if (!factory.TryReadCodec(ctx.Reader.BaseStream, out codec))
    {
        throw ch.ExceptDecode();
    }
    ch.AssertValue(codec);
    ch.CheckDecode(codec.Type.IsVector);
    ch.CheckDecode(codec.Type.ItemType.IsText);
    var textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;
    var bufferLen = ctx.Reader.ReadInt32();
    ch.CheckDecode(bufferLen >= 0);
    // Restrict the codec's reader to exactly the payload bytes.
    using (var stream = new SubsetStream(ctx.Reader.BaseStream, bufferLen))
    {
        using (var reader = textCodec.OpenReader(stream, 1))
        {
            reader.MoveNext();
            values = default(VBuffer<ReadOnlyMemory<char>>);
            reader.Get(ref values);
        }
        // The codec must have consumed the payload exactly; hitting EOF confirms it.
        ch.CheckDecode(stream.ReadByte() == -1);
    }
}
/// <summary>
/// Saves the term values vector to the model stream (codec specification followed by a
/// length-prefixed payload) and writes a human-readable "Terms.txt" auxiliary stream.
/// </summary>
private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel();
    ctx.SetVersionInfo(GetVersionInfo());
    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: As encoded using the codec

    // Get the codec from the factory
    IValueCodec codec;
    var result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);
    ch.Assert(result);
    ch.Assert(codec.Type.IsVector);
    ch.Assert(codec.Type.VectorSize == 0);
    ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory<char>));
    IValueCodec<VBuffer<ReadOnlyMemory<char>>> textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;
    factory.WriteCodec(ctx.Writer.BaseStream, codec);
    // Encode the vector into a memory buffer first so the byte count can be written
    // as a length prefix.
    using (var mem = new MemoryStream())
    {
        using (var writer = textCodec.OpenWriter(mem))
        {
            writer.Write(ref values);
            writer.Commit();
        }
        ctx.Writer.WriteByteArray(mem.ToArray());
    }
    // Make this resemble, more or less, the auxiliary output from the TermTransform.
    // It will differ somewhat due to the vector being possibly sparse. Empty terms are
    // not written at all.
    var v = values;
    char[] buffer = null;
    ctx.SaveTextStream("Terms.txt", writer =>
    {
        writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
        foreach (var pair in v.Items())
        {
            var text = pair.Value;
            if (text.IsEmpty)
            {
                continue;
            }
            writer.Write("{0}\t", pair.Key);
            // REVIEW: What about escaping this, *especially* for linebreaks?
            // Do C# and .NET really have no equivalent to Python's "repr"? :(
            // NOTE(fix): a second 'if (text.IsEmpty)' check here was removed - it was
            // unreachable, since the identical check above already continues.
            Utils.EnsureSize(ref buffer, text.Length);
            var span = text.Span;
            for (int i = 0; i < text.Length; i++)
            {
                buffer[i] = span[i];
            }
            writer.WriteLine(buffer, 0, text.Length);
        }
    });
}
/// <summary>
/// Copies <paramref name="src"/> into <paramref name="dst"/> with all values for which
/// <paramref name="isNA"/> returns true removed. The resulting vector is shortened: for a
/// sparse source, implicit zeros are never NA and surviving indices are shifted down.
/// NOTE(review): this method appears truncated at this chunk boundary - the sparse-path
/// else-branch (where 'offset' is presumably advanced past dropped slots) is not visible
/// here; confirm against the full file.
/// </summary>
private void DropNAs<TDst>(ref VBuffer<TDst> src, ref VBuffer<TDst> dst, InPredicate<TDst> isNA)
{
    Host.AssertValue(isNA);
    // First pass: count the surviving (non-NA) explicit values.
    int newCount = 0;
    for (int i = 0; i < src.Count; i++)
    {
        if (!isNA(in src.Values[i]))
        {
            newCount++;
        }
    }
    Host.Assert(newCount <= src.Count);
    if (newCount == 0)
    {
        // All explicit values were NA: the result has only the implicit zeros,
        // so its length shrinks by the number of dropped slots (src.Count).
        dst = new VBuffer<TDst>(src.Length - src.Count, 0, dst.Values, dst.Indices);
        return;
    }
    if (newCount == src.Count)
    {
        // Nothing to drop; swap buffers rather than copying.
        Utils.Swap(ref src, ref dst);
        return;
    }
    var values = dst.Values;
    if (Utils.Size(values) < newCount)
    {
        values = new TDst[newCount];
    }
    int iDst = 0;
    if (src.IsDense)
    {
        // Dense source: compact the surviving values; result is dense of the new length.
        for (int i = 0; i < src.Count; i++)
        {
            if (!isNA(in src.Values[i]))
            {
                values[iDst] = src.Values[i];
                iDst++;
            }
        }
        Host.Assert(iDst == newCount);
        dst = new VBuffer<TDst>(newCount, values, dst.Indices);
    }
    else
    {
        // Sparse source: also compact the indices, shifting each surviving index
        // down by the number of slots dropped before it ('offset').
        var indices = dst.Indices;
        if (Utils.Size(indices) < newCount)
        {
            indices = new int[newCount];
        }
        int offset = 0;
        for (int i = 0; i < src.Count; i++)
        {
            if (!isNA(in src.Values[i]))
            {
                values[iDst] = src.Values[i];
                indices[iDst] = src.Indices[i] - offset;
                iDst++;
            }
/// <summary>
/// Converts a raw byte array into a dense byte vector. A null source maps to an
/// empty (zero-length) vector.
/// </summary>
public void Conv(ref byte[] src, ref VBuffer<Byte> dst)
{
    if (src == null)
        dst = new VBuffer<byte>(0, new byte[0]);
    else
        dst = new VBuffer<byte>(src.Length, src);
}
/// <summary>
/// Copies <paramref name="src"/> into <paramref name="dst"/>, reusing dst's existing
/// buffers when possible.
/// </summary>
public static void Copy(ref VBuffer<T> src, ref VBuffer<T> dst)
    => src.CopyTo(ref dst);
/// <summary>
/// For each input column, samples up to 5000 rows (reservoir sampling) and returns a
/// characteristic distance per column: the median pairwise distance between sampled
/// instances (L2 squared for Gaussian kernels, L1 for Laplacian), or 1 when it cannot
/// be estimated. Used to scale the random Fourier feature kernels.
/// </summary>
private static Float[] Train(IHost host, ColInfo[] infos, Arguments args, IDataView trainingData)
{
    Contracts.AssertValue(host, "host");
    host.AssertNonEmpty(infos);
    var avgDistances = new Float[infos.Length];
    const int reservoirSize = 5000;
    bool[] activeColumns = new bool[trainingData.Schema.ColumnCount];
    for (int i = 0; i < infos.Length; i++)
    {
        activeColumns[infos[i].Source] = true;
    }
    var reservoirSamplers = new ReservoirSamplerWithReplacement<VBuffer<Float>>[infos.Length];
    using (var cursor = trainingData.GetRowCursor(col => activeColumns[col]))
    {
        var rng = args.Seed.HasValue ? RandomUtils.Create(args.Seed) : host.Rand;
        for (int i = 0; i < infos.Length; i++)
        {
            if (infos[i].TypeSrc.IsVector)
            {
                var get = cursor.GetGetter<VBuffer<Float>>(infos[i].Source);
                reservoirSamplers[i] = new ReservoirSamplerWithReplacement<VBuffer<Float>>(rng, reservoirSize, get);
            }
            else
            {
                // Scalar column: wrap the scalar getter so it produces length-1 vectors.
                var getOne = cursor.GetGetter<Float>(infos[i].Source);
                Float val = 0;
                ValueGetter<VBuffer<Float>> get = (ref VBuffer<Float> dst) =>
                {
                    getOne(ref val);
                    dst = new VBuffer<float>(1, new[] { val });
                };
                reservoirSamplers[i] = new ReservoirSamplerWithReplacement<VBuffer<Float>>(rng, reservoirSize, get);
            }
        }
        // Single pass over the data, feeding every column's reservoir.
        while (cursor.MoveNext())
        {
            for (int i = 0; i < infos.Length; i++)
            {
                reservoirSamplers[i].Sample();
            }
        }
        for (int i = 0; i < infos.Length; i++)
        {
            reservoirSamplers[i].Lock();
        }
    }
    for (int iinfo = 0; iinfo < infos.Length; iinfo++)
    {
        var instanceCount = reservoirSamplers[iinfo].NumSampled;
        // If the number of pairs is at most the maximum reservoir size / 2, we go over all the pairs,
        // so we get all the examples. Otherwise, get a sample with replacement.
        VBuffer<Float>[] res;
        int resLength;
        if (instanceCount < reservoirSize && instanceCount * (instanceCount - 1) <= reservoirSize)
        {
            res = reservoirSamplers[iinfo].GetCache();
            resLength = reservoirSamplers[iinfo].Size;
            Contracts.Assert(resLength == instanceCount);
        }
        else
        {
            res = reservoirSamplers[iinfo].GetSample().ToArray();
            resLength = res.Length;
        }
        // If the dataset contains only one valid Instance, then we can't learn anything anyway, so just return 1.
        if (instanceCount <= 1)
        {
            avgDistances[iinfo] = 1;
        }
        else
        {
            Float[] distances;
            var sub = args.Column[iinfo].MatrixGenerator;
            if (sub == null)
            {
                sub = args.MatrixGenerator;
            }
            // create a dummy generator in order to get its type.
            // REVIEW this should be refactored. See https://github.com/dotnet/machinelearning/issues/699
            var matrixGenerator = sub.CreateComponent(host, 1);
            bool gaussian = matrixGenerator is GaussianFourierSampler;
            // If the number of pairs is at most the maximum reservoir size / 2, go over all the pairs.
            if (resLength < reservoirSize)
            {
                // Exhaustive: all n*(n-1)/2 pairs among the sampled instances.
                distances = new Float[instanceCount * (instanceCount - 1) / 2];
                int count = 0;
                for (int i = 0; i < instanceCount; i++)
                {
                    for (int j = i + 1; j < instanceCount; j++)
                    {
                        distances[count++] = gaussian ? VectorUtils.L2DistSquared(ref res[i], ref res[j]) : VectorUtils.L1Distance(ref res[i], ref res[j]);
                    }
                }
                host.Assert(count == distances.Length);
            }
            else
            {
                // Sampled: pair up consecutive entries of the (random) sample.
                distances = new Float[reservoirSize / 2];
                for (int i = 0; i < reservoirSize - 1; i += 2)
                {
                    // For Gaussian kernels, we scale by the L2 distance squared, since the kernel function is exp(-gamma ||x-y||^2).
                    // For Laplacian kernels, we scale by the L1 distance, since the kernel function is exp(-gamma ||x-y||_1).
                    distances[i / 2] = gaussian ? VectorUtils.L2DistSquared(ref res[i], ref res[i + 1]) : VectorUtils.L1Distance(ref res[i], ref res[i + 1]);
                }
            }
            // If by chance, in the random permutation all the pairs are the same instance we return 1.
            Float median = MathUtils.GetMedianInPlace(distances, distances.Length);
            avgDistances[iinfo] = median == 0 ? 1 : median;
        }
    }
    return (avgDistances);
}
/// <summary>
/// Returns true when any explicit value of <paramref name="score"/> is NaN.
/// </summary>
protected override bool IsNaN(ref VBuffer<Float> score)
    => VBufferUtils.HasNaNs(ref score);
/// <summary>
/// Projects <paramref name="src"/> onto the principal components in
/// <paramref name="transformInfo"/>: each output slot is the dot product with the
/// corresponding eigenvector, minus the projected mean when one is available.
/// The result is a dense vector of length Rank, reusing dst's buffers when possible.
/// </summary>
private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float> src, ref VBuffer<Float> dst, TransformInfo transformInfo)
{
    ectx.Check(src.Length == transformInfo.Dimension);

    int rank = transformInfo.Rank;
    Float[] output = dst.Values;
    if (Utils.Size(output) < rank)
        output = new Float[rank];

    for (int i = 0; i < rank; i++)
    {
        Float projectedMean = transformInfo.MeanProjected == null ? 0 : transformInfo.MeanProjected[i];
        output[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - projectedMean;
    }

    dst = new VBuffer<Float>(rank, output, dst.Indices);
}
/// <summary>
/// Creates getters for the per-label L1 and L2 loss columns of the multi-output
/// regression evaluator. The per-slot absolute errors are computed once per row
/// (cached by cursor position); L2 is derived by squaring L1 on demand.
/// </summary>
public override Delegate[] CreateGetters(IRow input, Func<int, bool> activeCols, out Action disposer)
{
    Host.Assert(LabelIndex >= 0);
    Host.Assert(ScoreIndex >= 0);
    disposer = null;
    // Shared state captured by the getter closures below.
    long cachedPosition = -1;
    Float label = 0;
    var score = default(VBuffer<Float>);
    var l1 = VBufferUtils.CreateDense<Double>(_scoreSize);
    // When neither output is active, fall back to getters that do no real work.
    ValueGetter<Float> nanGetter = (ref Float value) => value = Single.NaN;
    var labelGetter = activeCols(L1Col) || activeCols(L2Col) ? RowCursorUtils.GetLabelGetter(input, LabelIndex) : nanGetter;
    ValueGetter<VBuffer<Float>> scoreGetter;
    if (activeCols(L1Col) || activeCols(L2Col))
    {
        scoreGetter = input.GetGetter<VBuffer<Float>>(ScoreIndex);
    }
    else
    {
        scoreGetter = (ref VBuffer<Float> dst) => dst = default(VBuffer<Float>);
    }
    // Recomputes the per-slot absolute errors when the cursor has moved.
    Action updateCacheIfNeeded = () =>
    {
        if (cachedPosition != input.Position)
        {
            labelGetter(ref label);
            scoreGetter(ref score);
            var lab = (Double)label;
            foreach (var s in score.Items(all: true))
            {
                l1.Values[s.Key] = Math.Abs(lab - s.Value);
            }
            cachedPosition = input.Position;
        }
    };
    var getters = new Delegate[2];
    if (activeCols(L1Col))
    {
        ValueGetter<VBuffer<Double>> l1Fn = (ref VBuffer<Double> dst) =>
        {
            updateCacheIfNeeded();
            l1.CopyTo(ref dst);
        };
        getters[L1Col] = l1Fn;
    }
    if (activeCols(L2Col))
    {
        // L2 per slot = (L1 per slot)^2; computed by squaring into a cleared dst.
        VBufferUtils.PairManipulator<Double, Double> sqr = (int slot, Double x, ref Double y) => y = x * x;
        ValueGetter<VBuffer<Double>> l2Fn = (ref VBuffer<Double> dst) =>
        {
            updateCacheIfNeeded();
            dst = new VBuffer<Double>(_scoreSize, 0, dst.Values, dst.Indices);
            VBufferUtils.ApplyWith(ref l1, ref dst, sqr);
        };
        getters[L2Col] = l2Fn;
    }
    return (getters);
}
/// <summary>
/// Builds the n-gram dictionary (one SequencePool per column) over the training data,
/// honoring the per-length term limits from the arguments, and - when idf weighting is
/// required - computes inverse document frequencies per n-gram id.
/// </summary>
private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
{
    // Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
    // from 1 (in position 0) up to ngramLength (in position ngramLength-1)
    var lims = new int[Infos.Length][];
    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
    {
        var all = args.Column[iinfo].AllLengths ?? args.AllLengths;
        var ngramLength = _exes[iinfo].NgramLength;
        // Per-column MaxNumTerms overrides the global setting when present.
        var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms;
        if (!all)
        {
            // Only full-length ngrams are kept: the whole budget goes to the top level.
            Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));
            lims[iinfo] = new int[ngramLength];
            lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
        }
        else
        {
            // All lengths kept: per-length limits, extending the last specified limit.
            Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms));
            Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms));
            var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1];
            lims[iinfo] = Utils.BuildArray(ngramLength, i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend);
        }
    }
    var helpers = new NgramBufferBuilder[Infos.Length];
    var getters = new ValueGetter<VBuffer<uint>>[Infos.Length];
    var src = new VBuffer<uint>[Infos.Length];
    // Keep track of how many grams are in the pool for each value of n. Position
    // i in _counts counts how many (i+1)-grams are in the pool for column iinfo.
    var counts = new int[Infos.Length][];
    var ngramMaps = new SequencePool[Infos.Length];
    bool[] activeInput = new bool[trainingData.Schema.ColumnCount];
    foreach (var info in Infos)
    {
        activeInput[info.Source] = true;
    }
    using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
    using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
    {
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
            var ngramLength = _exes[iinfo].NgramLength;
            var skipLength = _exes[iinfo].SkipLength;
            getters[iinfo] = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, cursor, Infos[iinfo].Source);
            src[iinfo] = default(VBuffer<uint>);
            counts[iinfo] = new int[ngramLength];
            ngramMaps[iinfo] = new SequencePool();
            // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
            // be added (using lims[iinfo]), therefore we set slotLim to the maximum
            helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host));
        }
        int cInfoFull = 0;
        bool[] infoFull = new bool[Infos.Length];
        invDocFreqs = new double[Infos.Length][];
        long totalDocs = 0;
        Double rowCount = trainingData.GetRowCount(true) ?? Double.NaN;
        var buffers = new VBuffer<float>[Infos.Length];
        pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }), e => e.SetProgress(0, totalDocs, rowCount));
        // Main pass: add every document's ngrams to each column's pool, and (for idf)
        // count how many documents each ngram appears in.
        while (cInfoFull < Infos.Length && cursor.MoveNext())
        {
            totalDocs++;
            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                getters[iinfo](ref src[iinfo]);
                var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
                // KeyCount of 0 means an unbounded key type; treat every value as valid.
                if (keyCount == 0)
                {
                    keyCount = uint.MaxValue;
                }
                if (!infoFull[iinfo])
                {
                    if (_exes[iinfo].RequireIdf())
                    {
                        helpers[iinfo].Reset();
                    }
                    helpers[iinfo].AddNgrams(ref src[iinfo], 0, keyCount);
                    if (_exes[iinfo].RequireIdf())
                    {
                        // Document frequency: +1 for each ngram present in this document.
                        int totalNgrams = counts[iinfo].Sum();
                        Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                        helpers[iinfo].GetResult(ref buffers[iinfo]);
                        foreach (var pair in buffers[iinfo].Items())
                        {
                            if (pair.Value >= 1)
                            {
                                invDocFreqs[iinfo][pair.Key] += 1;
                            }
                        }
                    }
                }
                AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
            }
        }
        pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);
        // Convert document frequencies to inverse document frequencies: log(N / df).
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++)
            {
                if (invDocFreqs[iinfo][i] != 0)
                {
                    invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]);
                }
            }
        }
        // Record which ngram lengths actually received any terms.
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
            int ngramLength = _exes[iinfo].NgramLength;
            for (int i = 0; i < ngramLength; i++)
            {
                _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
            }
        }
        return (ngramMaps);
    }
}
/// <summary>
/// Fetches the slot names for the single column mapped to <paramref name="role"/>.
/// When no such column exists (or it lacks slot names of the expected size), the
/// output is set to an empty vector of length <paramref name="vectorSize"/>.
/// </summary>
public static void GetSlotNames(RoleMappedSchema schema, RoleMappedSchema.ColumnRole role, int vectorSize, ref VBuffer<DvText> slotNames)
{
    Contracts.CheckValueOrNull(schema);
    Contracts.CheckValue(role.Value, nameof(role));
    Contracts.CheckParam(vectorSize >= 0, nameof(vectorSize));

    var cols = schema?.GetColumns(role);
    if (cols != null && cols.Count == 1 && schema.Schema.HasSlotNames(cols[0].Index, vectorSize))
        schema.Schema.GetMetadata(Kinds.SlotNames, cols[0].Index, ref slotNames);
    else
        slotNames = new VBuffer<DvText>(vectorSize, 0, slotNames.Values, slotNames.Indices);
}
/// <summary>
/// Produces the "missing" value for this vector type: an all-implicit-default vector
/// of the type's size, reusing dst's existing buffers.
/// </summary>
protected override void GetMissing(ref VBuffer<TItem> dst)
    => dst = new VBuffer<TItem>(Type.VectorSize, 0, dst.Values, dst.Indices);
/// <summary>
/// Getter generator for inputs of type <typeparamref name="TSrc"/>, where output type is a vector of hashes
/// </summary>
/// <typeparam name="TSrc">Input type. Must be a vector</typeparam>
/// <param name="input">Row input</param>
/// <param name="iinfo">Index of the getter</param>
private ValueGetter<VBuffer<uint>> ComposeGetterVecToVec<TSrc>(IRow input, int iinfo)
{
    Host.AssertValue(input);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    var getSrc = GetSrcGetter<VBuffer<TSrc>>(input, iinfo);
    var hashFunction = ComposeHashDelegate<TSrc>();
    var src = default(VBuffer<TSrc>);
    int n = _exes[iinfo].OutputValueCount;
    int expectedSrcLength = Infos[iinfo].TypeSrc.VectorSize;
    // slotMap[i] lists the source slots that are combined into output hash i.
    int[][] slotMap = _exes[iinfo].SlotMap;
    // REVIEW: consider adding a fix-zero functionality (subtract emptyTextHash from all hashes)
    var mask = (1U << _exes[iinfo].HashBits) - 1;
    var hashSeed = _exes[iinfo].HashSeed;
    bool ordered = _exes[iinfo].Ordered;
    // Lazily-allocated scratch buffer for densifying sparse inputs, reused across rows.
    TSrc[] denseValues = null;
    return ((ref VBuffer<uint> dst) =>
    {
        getSrc(ref src);
        Host.Check(src.Length == expectedSrcLength);
        TSrc[] values;
        // force-densify the input
        // REVIEW: this performs poorly if only a fraction of sparse vector is used for hashing.
        // This scenario was unlikely at the time of writing. Regardless of performance, the hash value
        // needs to be consistent across equivalent representations - sparse vs dense.
        if (src.IsDense)
        {
            values = src.Values;
        }
        else
        {
            if (denseValues == null)
            {
                denseValues = new TSrc[expectedSrcLength];
            }
            values = denseValues;
            src.CopyTo(values);
        }
        var hashes = dst.Values;
        if (Utils.Size(hashes) < n)
        {
            hashes = new uint[n];
        }
        for (int i = 0; i < n; i++)
        {
            // Fold each mapped slot's value into the running hash; in "ordered" mode the
            // slot index itself is mixed in first so position affects the result.
            uint hash = hashSeed;
            foreach (var srcSlot in slotMap[i])
            {
                // REVIEW: some legacy code hashes 0 for srcSlot in ord- case, do we need to preserve this behavior?
                if (ordered)
                {
                    hash = Hashing.MurmurRound(hash, (uint)srcSlot);
                }
                hash = hashFunction(ref values[srcSlot], hash);
            }
            hashes[i] = (Hashing.MixHash(hash) & mask) + 1; // +1 to offset from zero, which has special meaning for KeyType
        }
        dst = new VBuffer<uint>(n, hashes, dst.Indices);
    });
}
/// <summary>
/// Creates a getter that renders the predictor's per-feature contributions for the
/// current row as text: "name: value" pairs, sorted by slot index, comma-separated.
/// </summary>
private ValueGetter<ReadOnlyMemory<char>> GetTextValueGetter<TSrc>(Row input, int colSrc, VBuffer<ReadOnlyMemory<char>> slotNames)
{
    Contracts.AssertValue(input);
    Contracts.AssertValue(Predictor);
    var featureGetter = input.GetGetter<TSrc>(colSrc);
    var map = Predictor.GetFeatureContributionMapper<TSrc, VBuffer<float>>(_topContributionsCount, _bottomContributionsCount, _normalize);
    var features = default(TSrc);
    var contributions = default(VBuffer<float>);
    return (ref ReadOnlyMemory<char> dst) =>
    {
        featureGetter(ref features);
        map(in features, ref contributions);
        // BUG FIX: the original code created empty spans (new Span<int>()/new Span<float>())
        // and then called CopyTo into them, which throws for any non-empty contributions
        // vector, since Span<T>.CopyTo requires the destination to be at least as long as
        // the source. Allocate destination buffers of the proper size instead.
        var srcValues = contributions.GetValues();
        var count = srcValues.Length;
        var values = new float[count];
        srcValues.CopyTo(values);
        int[] indices;
        if (contributions.IsDense)
        {
            indices = Utils.GetIdentityPermutation(contributions.Length);
        }
        else
        {
            indices = new int[count];
            contributions.GetIndices().CopyTo(indices);
        }
        var sb = new StringBuilder();
        // Sort by slot index, permuting values in lockstep.
        GenericSpanSortHelper<int>.Sort(indices, values, 0, count);
        for (var i = 0; i < count; i++)
        {
            var val = values[i];
            var ind = indices[i];
            var name = GetSlotName(ind, slotNames);
            sb.AppendFormat("{0}: {1}, ", name, val);
        }
        if (sb.Length > 0)
        {
            _env.Assert(sb.Length >= 2);
            // Trim the trailing ", " separator.
            sb.Remove(sb.Length - 2, 2);
        }
        dst = new ReadOnlyMemory<char>(sb.ToString().ToCharArray());
    };
}
/// <summary>
/// Builds a DropSlotsTransform column definition dropping every slot whose feature
/// selection score falls below the threshold (or outside the top NumSlotsToKeep).
/// Returns null when nothing is dropped; otherwise returns the column and reports the
/// number of retained slots via <paramref name="selectedCount"/>.
/// </summary>
private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, ref VBuffer<Single> scores, out int selectedCount)
{
    // Not checking the scores.Length, because:
    // 1. If it's the same as the features column length, we should be constructing the right DropSlots arguments.
    // 2. If it's less, we assume that the rest of the scores are zero and we drop the slots.
    // 3. If it's greater, the drop slots ignores the ranges that are outside the valid range of indices for the column.
    Contracts.Assert(args.Threshold.HasValue != args.NumSlotsToKeep.HasValue);
    var col = new DropSlotsTransform.Column();
    col.Source = args.FeatureColumn;
    selectedCount = 0;
    // Degenerate case, dropping all slots.
    if (scores.Count == 0)
    {
        var range = new DropSlotsTransform.Range();
        col.Slots = new DropSlotsTransform.Range[] { range };
        return (col);
    }
    // Determine the score threshold; for NumSlotsToKeep, tiedScoresToKeep tracks how many
    // slots exactly at the threshold may still be kept.
    int tiedScoresToKeep;
    float threshold;
    if (args.Threshold.HasValue)
    {
        threshold = args.Threshold.Value;
        tiedScoresToKeep = threshold > 0 ? int.MaxValue : 0;
    }
    else
    {
        Contracts.Assert(args.NumSlotsToKeep.HasValue);
        threshold = ComputeThreshold(scores.Values, scores.Count, args.NumSlotsToKeep.Value, out tiedScoresToKeep);
    }
    // First pass: collect contiguous runs of dropped slots, in terms of positions
    // within the explicit scores array.
    var slots = new List<DropSlotsTransform.Range>();
    for (int i = 0; i < scores.Count; i++)
    {
        var score = Math.Abs(scores.Values[i]);
        if (score > threshold)
        {
            selectedCount++;
            continue;
        }
        if (score == threshold && tiedScoresToKeep > 0)
        {
            tiedScoresToKeep--;
            selectedCount++;
            continue;
        }
        // Start of a dropped run; extend it until a kept slot is found.
        var range = new DropSlotsTransform.Range();
        range.Min = i;
        while (++i < scores.Count)
        {
            score = Math.Abs(scores.Values[i]);
            if (score > threshold)
            {
                selectedCount++;
                break;
            }
            if (score == threshold && tiedScoresToKeep > 0)
            {
                tiedScoresToKeep--;
                selectedCount++;
                break;
            }
        }
        range.Max = i - 1;
        slots.Add(range);
    }
    if (!scores.IsDense)
    {
        // Sparse scores: translate positions in the explicit array to actual slot
        // indices, and also drop every implicit (zero-score) gap between them.
        int ii = 0;
        var count = slots.Count;
        for (int i = 0; i < count; i++)
        {
            var range = slots[i];
            Contracts.Assert(range.Max != null);
            var min = range.Min;
            var max = range.Max.Value;
            Contracts.Assert(min <= max);
            Contracts.Assert(max < scores.Count);
            // Expand the dropped run to cover the implicit slots adjacent to it.
            range.Min = min == 0 ? 0 : scores.Indices[min - 1] + 1;
            range.Max = max == scores.Count - 1 ? scores.Length - 1 : scores.Indices[max + 1] - 1;
            // Add the gaps before this range.
            for (; ii < min; ii++)
            {
                var gapMin = ii == 0 ? 0 : scores.Indices[ii - 1] + 1;
                var gapMax = scores.Indices[ii] - 1;
                if (gapMin <= gapMax)
                {
                    var gap = new DropSlotsTransform.Range();
                    gap.Min = gapMin;
                    gap.Max = gapMax;
                    slots.Add(gap);
                }
            }
            ii = max;
        }
        // Add the gaps after the last range.
        for (; ii <= scores.Count; ii++)
        {
            var gapMin = ii == 0 ? 0 : scores.Indices[ii - 1] + 1;
            var gapMax = ii == scores.Count ? scores.Length - 1 : scores.Indices[ii] - 1;
            if (gapMin <= gapMax)
            {
                var gap = new DropSlotsTransform.Range();
                gap.Min = gapMin;
                gap.Max = gapMax;
                slots.Add(gap);
            }
        }
        // Remove all slots past scores.Length.
        var lastRange = new DropSlotsTransform.Range();
        lastRange.Min = scores.Length;
        slots.Add(lastRange);
    }
    if (slots.Count > 0)
    {
        col.Slots = slots.ToArray();
        return (col);
    }
    // Nothing dropped: no transform column needed.
    return (null);
}
/// <summary>
/// Fetches the slot names for the single column mapped to <paramref name="role"/>.
/// When no such column exists (or it lacks slot names of the expected size), the
/// output is resized to an empty vector of length <paramref name="vectorSize"/>.
/// </summary>
public static void GetSlotNames(RoleMappedSchema schema, RoleMappedSchema.ColumnRole role, int vectorSize, ref VBuffer<ReadOnlyMemory<char>> slotNames)
{
    Contracts.CheckValueOrNull(schema);
    Contracts.CheckParam(vectorSize >= 0, nameof(vectorSize));

    var cols = schema?.GetColumns(role);
    if (cols != null && cols.Count == 1 && schema.Schema.HasSlotNames(cols[0].Index, vectorSize))
        schema.Schema.GetMetadata(Kinds.SlotNames, cols[0].Index, ref slotNames);
    else
        VBufferUtils.Resize(ref slotNames, vectorSize, 0);
}
/// <summary>
/// Returns a score for each slot of the features column.
/// </summary>
/// <param name="env">The host environment; must not be null.</param>
/// <param name="input">The data to train on.</param>
/// <param name="args">Training arguments; validated via <c>args.Check</c> before use.</param>
/// <param name="scores">Receives the per-slot scores.</param>
public static void Train(IHostEnvironment env, IDataView input, Arguments args, ref VBuffer <Single> scores)
{
    // Validate everything up front, then delegate to the actual implementation.
    Contracts.CheckValue(env, nameof(env));
    var trainHost = env.Register(RegistrationName);
    trainHost.CheckValue(args, nameof(args));
    trainHost.CheckValue(input, nameof(input));
    args.Check(trainHost);

    TrainCore(trainHost, input, args, ref scores);
}
// The multi-output regression evaluator prints only the per-label metrics for each fold.
/// <summary>
/// Formats the per-label (vector-valued, R8) metrics from the overall-metrics data view
/// into an aligned table and writes it to the channel. Expects at most one weighted and
/// one unweighted row at the top stratification level; stratified rows (strat > 0) are
/// skipped.
/// </summary>
protected override void PrintFoldResultsCore(IChannel ch, Dictionary <string, IDataView> metrics)
{
    IDataView fold;
    if (!metrics.TryGetValue(MetricKinds.OverallMetrics, out fold))
    {
        throw ch.Except("No overall metrics found");
    }

    // Locate the optional bookkeeping columns: weighted-row marker and stratification pair.
    int isWeightedCol;
    bool needWeighted = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.IsWeighted, out isWeightedCol);
    int stratCol;
    bool hasStrats = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.StratCol, out stratCol);
    int stratVal;
    bool hasStratVals = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.StratVal, out stratVal);
    ch.Assert(hasStrats == hasStratVals);

    var colCount = fold.Schema.ColumnCount;
    // One getter per metric column; entries stay null for non-metric columns.
    var vBufferGetters = new ValueGetter <VBuffer <double> > [colCount];

    using (var cursor = fold.GetRowCursor(col => true))
    {
        bool isWeighted = false;
        // When the IsWeighted column is absent, treat every row as unweighted.
        ValueGetter <bool> isWeightedGetter;
        if (needWeighted)
        {
            isWeightedGetter = cursor.GetGetter <bool>(isWeightedCol);
        }
        else
        {
            isWeightedGetter = (ref bool dst) => dst = false;
        }

        // When stratification columns are absent, every row counts as top-level (strat 0).
        ValueGetter <uint> stratGetter;
        if (hasStrats)
        {
            var type = cursor.Schema.GetColumnType(stratCol);
            stratGetter = RowCursorUtils.GetGetterAs <uint>(type, cursor, stratCol);
        }
        else
        {
            stratGetter = (ref uint dst) => dst = 0;
        }

        // Find the per-label metric columns: visible, known-size R8 vectors, all of
        // which must agree on their slot count (the number of labels).
        int labelCount = 0;
        for (int i = 0; i < fold.Schema.ColumnCount; i++)
        {
            if (fold.Schema.IsHidden(i) ||
                (needWeighted && i == isWeightedCol) ||
                (hasStrats && (i == stratCol || i == stratVal)))
            {
                continue;
            }
            var type = fold.Schema.GetColumnType(i);
            if (type.IsKnownSizeVector && type.ItemType == NumberType.R8)
            {
                vBufferGetters[i] = cursor.GetGetter <VBuffer <double> >(i);
                if (labelCount == 0)
                {
                    labelCount = type.VectorSize;
                }
                else
                {
                    ch.Check(labelCount == type.VectorSize, "All vector metrics should contain the same number of slots");
                }
            }
        }

        // Synthesized column headers: Label_0, Label_1, ...
        var labelNames = new ReadOnlyMemory <char> [labelCount];
        for (int j = 0; j < labelCount; j++)
        {
            labelNames[j] = string.Format("Label_{0}", j).AsMemory();
        }

        // Header row of the table.
        var sb = new StringBuilder();
        sb.AppendLine("Per-label metrics:");
        sb.AppendFormat("{0,12} ", " ");
        for (int i = 0; i < labelCount; i++)
        {
            sb.AppendFormat(" {0,20}", labelNames[i]);
        }
        sb.AppendLine();

        VBuffer <Double> metricVals = default(VBuffer <Double>);
        // If there is no IsWeighted column, no weighted row is expected, so we start
        // "found" to satisfy the exit condition and the final assert.
        bool foundWeighted = !needWeighted;
        bool foundUnweighted = false;
        uint strat = 0;
        while (cursor.MoveNext())
        {
            isWeightedGetter(ref isWeighted);
            // At most one row of each kind is allowed at the top stratification level.
            if (foundWeighted && isWeighted || foundUnweighted && !isWeighted)
            {
                throw ch.Except("Multiple {0} rows found in overall metrics data view", isWeighted ? "weighted" : "unweighted");
            }
            if (isWeighted)
            {
                foundWeighted = true;
            }
            else
            {
                foundUnweighted = true;
            }

            stratGetter(ref strat);
            // Only print the top-level (non-stratified) rows.
            if (strat > 0)
            {
                continue;
            }

            // One table row per metric column; G20 keeps full double precision.
            for (int i = 0; i < colCount; i++)
            {
                if (vBufferGetters[i] != null)
                {
                    vBufferGetters[i](ref metricVals);
                    ch.Assert(metricVals.Length == labelCount);
                    sb.AppendFormat("{0}{1,12}:", isWeighted ? "Weighted " : "", fold.Schema.GetColumnName(i));
                    foreach (var metric in metricVals.Items(all: true))
                    {
                        sb.AppendFormat(" {0,20:G20}", metric.Value);
                    }
                    sb.AppendLine();
                }
            }

            // Both expected rows have been printed; no need to keep scanning.
            if (foundUnweighted && foundWeighted)
            {
                break;
            }
        }
        ch.Assert(foundUnweighted && foundWeighted);
        ch.Info(sb.ToString());
    }
}
/// <summary>
/// Recomputes <paramref name="score"/> and <paramref name="prob"/> via
/// <paramref name="mapper"/> only when the cursor has moved since the last call,
/// using <paramref name="cachedPosition"/> to remember the row the outputs belong to.
/// </summary>
private static void EnsureCachedResultValueMapper(ValueMapper <VBuffer <Float>, Float, Float> mapper, ref long cachedPosition, ValueGetter <VBuffer <Float> > featureGetter, ref VBuffer <Float> features, ref Float score, ref Float prob, Row input)
{
    Contracts.AssertValue(mapper);

    // Outputs already correspond to the current row; nothing to do.
    if (cachedPosition == input.Position)
        return;

    // Refresh the feature vector (when a getter was supplied), recompute, and
    // remember which row these results belong to.
    featureGetter?.Invoke(ref features);
    mapper(in features, ref score, ref prob);
    cachedPosition = input.Position;
}
/// <summary>
/// Copies <paramref name="src"/> into <paramref name="dst"/> by delegating to
/// <see cref="VBuffer{T}.CopyTo(ref VBuffer{T})"/>.
/// </summary>
protected override void Copy(ref VBuffer <T> src, ref VBuffer <T> dst) { src.CopyTo(ref dst); }
/// <summary>
/// When overridden in a derived class, decides whether the given column value is
/// acceptable. NOTE(review): the precise acceptance criterion (and what rejection
/// means for the caller) is defined by the derived class — not visible here.
/// </summary>
protected abstract bool AcceptColumnValue(ref VBuffer <TFloat> buffer);