Beispiel #1
0
        /// <summary>
        /// Creates a builder, recording the ngram configuration and allocating its working buffers.
        /// </summary>
        /// <param name="ngramLength">Ngram length; asserted to be strictly positive.</param>
        /// <param name="skipLength">Skip length; asserted to be non-negative.</param>
        /// <param name="slotLim">Slot limit; asserted to be non-negative.</param>
        /// <param name="finder">Delegate stored for later use by this builder.</param>
        /// <remarks>
        /// <paramref name="ngramLength"/> plus <paramref name="skipLength"/> is asserted not to exceed
        /// <c>MaxSkipNgramLength</c>. The check is phrased as a subtraction, exactly as the bound is stated.
        /// </remarks>
        public NgramBufferBuilder(int ngramLength, int skipLength, int slotLim, NgramIdFinder finder)
        {
            // Validate the configuration before touching any state.
            Contracts.Assert(0 < ngramLength);
            Contracts.Assert(0 <= skipLength);
            Contracts.Assert(MaxSkipNgramLength - skipLength >= ngramLength);
            Contracts.Assert(0 <= slotLim);

            // Plain configuration values.
            _finder = finder;
            _slotLim = slotLim;
            _skipLength = skipLength;
            _ngramLength = ngramLength;

            // Working storage: one slot per gram in the ngram array, and a queue sized
            // to ngramLength + skipLength items.
            _ngram = new uint[ngramLength];
            _queue = new FixedSizeQueue<uint>(ngramLength + skipLength);
            _bldr = BufferBuilder<Float>.CreateDefault();
        }
Beispiel #2
0
            /// <summary>
            /// Wraps <paramref name="finder"/> so that every ngram it resolves to a slot is also reported
            /// to an invert-hash collector created here and stored in <c>_iinfoToCollector[iinfo]</c>.
            /// </summary>
            /// <param name="iinfo">
            /// Index of the column info being decorated. Asserted to be in
            /// <c>[0, _parent._bindings.InfoCount)</c> and to have no collector registered yet.
            /// </param>
            /// <param name="finder">The finder to wrap; asserted to be non-null.</param>
            /// <returns>
            /// A finder that forwards to <paramref name="finder"/> and returns its result unchanged,
            /// additionally adding the ngram to the collector whenever the result is not -1.
            /// </returns>
            public NgramIdFinder Decorate(int iinfo, NgramIdFinder finder)
            {
                Contracts.Assert(0 <= iinfo && iinfo < _parent._bindings.InfoCount);
                Contracts.Assert(_iinfoToCollector[iinfo] == null);
                Contracts.AssertValue(finder);

                var srcIndices = _parent._bindings.Infos[iinfo].SrcIndices;

                // Define the mapper from the ngram, to text.
                ValueMapper<NGram, StringBuilder> stringMapper;

                // Scratch state captured by the mapper lambdas below and reused across calls.
                // NOTE(review): AppendToEnd presumably uses 'buffer' as a reusable char scratch
                // array while copying 'temp' onto the end of the destination - confirm.
                StringBuilder temp = null;
                char[] buffer = null;

                if (srcIndices.Length == 1)
                {
                    // No need to include the column name. This will just be "A" for a single gram,
                    // or "A|B|C" when the ngram holds several grams.
                    var srcMap = _srcTextGetters[srcIndices[0]];
                    Contracts.AssertValue(srcMap);

                    stringMapper =
                        (ref NGram src, ref StringBuilder dst) =>
                    {
                        Contracts.Assert(src.ISrcCol == 0);
                        if (src.Lim == 1)
                        {
                            // Single gram: let the source mapper write straight into dst.
                            srcMap(ref src.Grams[0], ref dst);
                            return;
                        }
                        ClearDst(ref dst);
                        for (int i = 0; i < src.Lim; ++i)
                        {
                            if (i > 0)
                            {
                                dst.Append('|');
                            }
                            srcMap(ref src.Grams[i], ref temp);
                            InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                        }
                    };
                }
                else
                {
                    Contracts.Assert(srcIndices.Length > 1);

                    // Prefer the friendly names registered for this info. The table itself may be
                    // absent, so read it null-conditionally; when no names are available fall back
                    // to the source column names from the input schema.
                    string[] srcNames = _friendlyNames?[iinfo];
                    if (srcNames == null)
                    {
                        srcNames = new string[srcIndices.Length];
                        for (int i = 0; i < srcIndices.Length; ++i)
                        {
                            srcNames[i] = _parent.Source.Schema.GetColumnName(srcIndices[i]);
                        }
                    }
                    Contracts.Assert(Utils.Size(srcNames) == srcIndices.Length);

                    // We need to disambiguate the column name. This will be the same as the above format,
                    // just instead of "<Stuff>" it would be with "ColumnName:<Stuff>".
                    stringMapper =
                        (ref NGram src, ref StringBuilder dst) =>
                    {
                        var srcMap = _srcTextGetters[srcIndices[src.ISrcCol]];
                        Contracts.AssertValue(srcMap);
                        ClearDst(ref dst);
                        dst.Append(srcNames[src.ISrcCol]);
                        dst.Append(':');
                        for (int i = 0; i < src.Lim; ++i)
                        {
                            if (i > 0)
                            {
                                dst.Append('|');
                            }
                            srcMap(ref src.Grams[i], ref temp);
                            InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                        }
                    };
                }

                // The copier clones the ngram so the collector can safely keep keys even though
                // callers reuse the underlying gram array.
                var collector = _iinfoToCollector[iinfo] = new InvertHashCollector<NGram>(
                    _parent._bindings.Types[iinfo].VectorSize, _invertHashMaxCounts[iinfo],
                    stringMapper, EqualityComparer<NGram>.Default, (ref NGram src, ref NGram dst) => dst = src.Clone());

                return
                    ((uint[] ngram, int lim, int icol, ref bool more) =>
                {
                    Contracts.Assert(0 <= icol && icol < srcIndices.Length);
                    Contracts.AssertValue(_srcTextGetters[srcIndices[icol]]);
                    var result = finder(ngram, lim, icol, ref more);
                    // For the hashing NgramIdFinder, a result of -1 indicates that
                    // a slot does not exist for the given ngram. We do not pass ngrams
                    // that do not have a slot to the InvertHash collector.
                    if (result != -1)
                    {
                        // The following ngram is "unsafe", in that the ngram array is actually
                        // re-used. The collector will utilize its copier to make it safe, in
                        // the event that this is a key it needs to keep.
                        var ngramObj = new NGram(ngram, lim, icol);
                        collector.Add(result, ngramObj);
                    }
                    return result;
                });
            }