Ejemplo n.º 1
0
 /// <summary>
 /// Builds the helper for <paramref name="parent"/>, allocating one (initially empty)
 /// collector slot per output column and one text getter slot per source column.
 /// </summary>
 /// <param name="parent">The transform whose bindings and source schema are used.</param>
 /// <param name="friendlyNames">Per-output-column source names; asserted to have one entry per info.</param>
 /// <param name="inputPred">Selects the source column indices for which a text getter is created.</param>
 /// <param name="invertHashMaxCounts">Per-output-column limits; asserted to have one entry per info.</param>
 public InvertHashHelper(NgramHashTransform parent, string[][] friendlyNames, Func<int, bool> inputPred, int[] invertHashMaxCounts)
 {
     Contracts.AssertValue(parent);
     Contracts.AssertValue(friendlyNames);
     Contracts.Assert(friendlyNames.Length == parent._bindings.InfoCount);
     Contracts.AssertValue(inputPred);
     Contracts.AssertValue(invertHashMaxCounts);
     Contracts.Assert(invertHashMaxCounts.Length == parent._bindings.InfoCount);

     _parent = parent;
     _friendlyNames = friendlyNames;
     _invertHashMaxCounts = invertHashMaxCounts;

     // Indexed by iinfo; entries stay null until a collector is created for that column.
     _iinfoToCollector = new InvertHashCollector<NGram>[_parent._bindings.InfoCount];

     // Indexed by source column; entries stay null for columns rejected by inputPred.
     var schema = _parent.Source.Schema;
     _srcTextGetters = new ValueMapper<uint, StringBuilder>[schema.ColumnCount];
     for (int col = 0; col < _srcTextGetters.Length; ++col)
     {
         if (!inputPred(col))
         {
             continue;
         }
         _srcTextGetters[col] = InvertHashUtils.GetSimpleMapper<uint>(schema, col);
     }
 }
Ejemplo n.º 2
0
            /// <summary>
            /// Wraps <paramref name="finder"/> so that every ngram it maps to a slot is also
            /// handed to a newly created <c>InvertHashCollector</c> for column <paramref name="iinfo"/>.
            /// </summary>
            /// <param name="iinfo">Index of the output column; asserted to be in range and to have no collector yet.</param>
            /// <param name="finder">The ngram-to-slot finder to wrap.</param>
            /// <returns>
            /// A finder that returns exactly what <paramref name="finder"/> returns, additionally
            /// adding the ngram to the collector whenever the result is not -1.
            /// </returns>
            public NgramIdFinder Decorate(int iinfo, NgramIdFinder finder)
            {
                Contracts.Assert(0 <= iinfo && iinfo < _parent._bindings.InfoCount);
                Contracts.Assert(_iinfoToCollector[iinfo] == null);
                Contracts.AssertValue(finder);

                var srcIndices = _parent._bindings.Infos[iinfo].SrcIndices;

                // Define the mapper from the ngram, to text.
                ValueMapper <NGram, StringBuilder> stringMapper;

                // Scratch state captured by the mapper lambdas below and reused across calls.
                StringBuilder temp = null;
                char[] buffer = null;

                if (srcIndices.Length == 1)
                {
                    // No need to include the column name. This will just be "A" or "A|B|C" depending
                    // on the n-arity of the ngram.
                    var srcMap = _srcTextGetters[srcIndices[0]];
                    Contracts.AssertValue(srcMap);

                    stringMapper =
                        (ref NGram src, ref StringBuilder dst) =>
                    {
                        Contracts.Assert(src.ISrcCol == 0);
                        if (src.Lim == 1)
                        {
                            // Single gram: map straight into dst, no separator handling needed.
                            srcMap(ref src.Grams[0], ref dst);
                            return;
                        }
                        ClearDst(ref dst);
                        for (int i = 0; i < src.Lim; ++i)
                        {
                            if (i > 0)
                            {
                                dst.Append('|');
                            }
                            srcMap(ref src.Grams[i], ref temp);
                            InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                        }
                    };
                }
                else
                {
                    Contracts.Assert(srcIndices.Length > 1);
                    string[] srcNames = _friendlyNames[iinfo];
                    if (srcNames == null)
                    {
                        // No friendly names supplied for this column: fall back to the
                        // source schema's column names.
                        srcNames = new string[srcIndices.Length];
                        for (int i = 0; i < srcIndices.Length; ++i)
                        {
                            srcNames[i] = _parent.Source.Schema.GetColumnName(srcIndices[i]);
                        }
                    }
                    Contracts.Assert(Utils.Size(srcNames) == srcIndices.Length);
                    // We need to disambiguate the column name. This will be the same as the above format,
                    // just instead of "<Stuff>" it would be with "ColumnName:<Stuff>".
                    stringMapper =
                        (ref NGram src, ref StringBuilder dst) =>
                    {
                        var srcMap = _srcTextGetters[srcIndices[src.ISrcCol]];
                        Contracts.AssertValue(srcMap);
                        ClearDst(ref dst);
                        dst.Append(srcNames[src.ISrcCol]);
                        dst.Append(':');
                        for (int i = 0; i < src.Lim; ++i)
                        {
                            if (i > 0)
                            {
                                dst.Append('|');
                            }
                            srcMap(ref src.Grams[i], ref temp);
                            InvertHashUtils.AppendToEnd(temp, dst, ref buffer);
                        }
                    };
                }

                // The copier clones the ngram, so keys retained by the collector do not alias
                // the caller's re-used ngram array.
                var collector = _iinfoToCollector[iinfo] = new InvertHashCollector <NGram>(
                    _parent._bindings.Types[iinfo].VectorSize, _invertHashMaxCounts[iinfo],
                    stringMapper, EqualityComparer <NGram> .Default, (ref NGram src, ref NGram dst) => dst = src.Clone());

                return
                    ((uint[] ngram, int lim, int icol, ref bool more) =>
                {
                    Contracts.Assert(0 <= icol && icol < srcIndices.Length);
                    Contracts.AssertValue(_srcTextGetters[srcIndices[icol]]);
                    var result = finder(ngram, lim, icol, ref more);
                    // For the hashing NgramIdFinder, a result of -1 indicates that
                    // a slot does not exist for the given ngram. We do not pass ngrams
                    // that do not have a slot to the InvertHash collector.
                    if (result != -1)
                    {
                        // The following ngram is "unsafe", in that the ngram array is actually
                        // re-used. The collector will utilize its copier to make it safe, in
                        // the event that this is a key it needs to keep.
                        var ngramObj = new NGram(ngram, lim, icol);
                        collector.Add(result, ngramObj);
                    }
                    return result;
                });
            }