/// <summary>
/// Creates a builder for ngrams of up to <paramref name="ngramLength"/> items, allowing
/// <paramref name="skipLength"/> skips, that resolves ngrams through <paramref name="finder"/>.
/// </summary>
/// <param name="ngramLength">Ngram length; asserted to be strictly positive.</param>
/// <param name="skipLength">Skip length; asserted to be non-negative. Together with
/// <paramref name="ngramLength"/> it is asserted not to exceed <c>MaxSkipNgramLength</c>.</param>
/// <param name="slotLim">Slot limit; asserted to be non-negative.</param>
/// <param name="finder">Delegate stored for later ngram lookups.</param>
public NgramBufferBuilder(int ngramLength, int skipLength, int slotLim, NgramIdFinder finder)
{
    // Validate the arguments (debug-style assertions, same order as always).
    Contracts.Assert(ngramLength > 0);
    Contracts.Assert(skipLength >= 0);
    Contracts.Assert(ngramLength <= MaxSkipNgramLength - skipLength);
    Contracts.Assert(slotLim >= 0);

    // Plain configuration state.
    _finder = finder;
    _slotLim = slotLim;
    _skipLength = skipLength;
    _ngramLength = ngramLength;

    // Working storage, sized directly from the validated arguments.
    _ngram = new uint[ngramLength];
    _queue = new FixedSizeQueue<uint>(ngramLength + skipLength);
    _bldr = BufferBuilder<Float>.CreateDefault();
}
/// <summary>
/// Wraps <paramref name="finder"/> so that every ngram it resolves to a slot is also recorded
/// in a newly created invert-hash collector for column <paramref name="iinfo"/>.
/// </summary>
/// <param name="iinfo">Index of the column info; asserted to be in range and to not yet
/// have a collector registered in <c>_iinfoToCollector</c>.</param>
/// <param name="finder">The underlying ngram finder to delegate to; asserted non-null.</param>
/// <returns>
/// A finder that returns exactly what <paramref name="finder"/> returns and, whenever that
/// result is not -1, adds the ngram to the collector under that result.
/// </returns>
public NgramIdFinder Decorate(int iinfo, NgramIdFinder finder)
{
    Contracts.Assert(0 <= iinfo && iinfo < _parent._bindings.InfoCount);
    Contracts.Assert(_iinfoToCollector[iinfo] == null);
    Contracts.AssertValue(finder);

    var srcIndices = _parent._bindings.Infos[iinfo].SrcIndices;

    // Define the mapper from the ngram, to text.
    ValueMapper<NGram, StringBuilder> stringMapper;
    // Scratch state captured by the mapper lambdas below and reused across invocations.
    StringBuilder temp = null;
    char[] buffer = null;

    if (srcIndices.Length == 1)
    {
        // No need to include the column name. This will just be "A" or "(A,B,C)" depending
        // on the n-arity of the ngram.
        var srcMap = _srcTextGetters[srcIndices[0]];
        Contracts.AssertValue(srcMap);

        stringMapper =
            (ref NGram src, ref StringBuilder dst) =>
            {
                Contracts.Assert(src.ISrcCol == 0);
                if (src.Lim == 1)
                {
                    // Unigram: the source text mapper can write straight into dst.
                    srcMap(ref src.Grams[0], ref dst);
                    return;
                }
                ClearDst(ref dst);
                for (int i = 0; i < src.Lim; ++i)
                {
                    if (i > 0)
                    {
                        dst.Append('|');
                    }
                    srcMap(ref src.Grams[i], ref temp);
                    InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                }
            };
    }
    else
    {
        Contracts.Assert(srcIndices.Length > 1);

        // Use the names held in _friendlyNames when present. The lookup is null-conditional
        // so that a missing _friendlyNames array takes the same fallback path as a missing
        // entry, instead of throwing before the fallback can run.
        string[] srcNames = _friendlyNames?[iinfo];
        if (srcNames == null)
        {
            // Fall back to the column names from the source schema.
            srcNames = new string[srcIndices.Length];
            for (int i = 0; i < srcIndices.Length; ++i)
            {
                srcNames[i] = _parent.Source.Schema.GetColumnName(srcIndices[i]);
            }
        }
        Contracts.Assert(Utils.Size(srcNames) == srcIndices.Length);

        // We need to disambiguate the column name. This will be the same as the above format,
        // just instead of "<Stuff>" it would be with "ColumnName:<Stuff>".
        stringMapper =
            (ref NGram src, ref StringBuilder dst) =>
            {
                var srcMap = _srcTextGetters[srcIndices[src.ISrcCol]];
                Contracts.AssertValue(srcMap);
                ClearDst(ref dst);
                dst.Append(srcNames[src.ISrcCol]);
                dst.Append(':');
                for (int i = 0; i < src.Lim; ++i)
                {
                    if (i > 0)
                    {
                        dst.Append('|');
                    }
                    srcMap(ref src.Grams[i], ref temp);
                    InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                }
            };
    }

    // Register the collector for this column. The copier clones the ngram so that keys the
    // collector decides to keep do not alias the caller's reusable array.
    var collector = _iinfoToCollector[iinfo] = new InvertHashCollector<NGram>(
        _parent._bindings.Types[iinfo].VectorSize,
        _invertHashMaxCounts[iinfo],
        stringMapper,
        EqualityComparer<NGram>.Default,
        (ref NGram src, ref NGram dst) => dst = src.Clone());

    return
        (uint[] ngram, int lim, int icol, ref bool more) =>
        {
            Contracts.Assert(0 <= icol && icol < srcIndices.Length);
            Contracts.AssertValue(_srcTextGetters[srcIndices[icol]]);
            var result = finder(ngram, lim, icol, ref more);
            // For the hashing NgramIdFinder, a result of -1 indicates that
            // a slot does not exist for the given ngram. We do not pass ngrams
            // that do not have a slot to the InvertHash collector.
            if (result != -1)
            {
                // The following ngram is "unsafe", in that the ngram array is actually
                // re-used. The collector will utilize its copier to make it safe, in
                // the event that this is a key it needs to keep.
                var ngramObj = new NGram(ngram, lim, icol);
                collector.Add(result, ngramObj);
            }
            return result;
        };
}