Beispiel #1
0
        protected internal override IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > GetFullPrefixPaths(
            IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths,
            Automaton lookupAutomaton,
            FST <PairOutputs <long?, BytesRef> .Pair> fst)
        {
            // TODO: right now there's no penalty for fuzzy/edits,
            // ie a completion whose prefix matched exactly what the
            // user typed gets no boost over completions that
            // required an edit, which get no boost over completions
            // requiring two edits.  I suspect a multiplicative
            // factor is appropriate (eg, say a fuzzy match must be at
            // least 2X better weight than the non-fuzzy match to
            // "compete") ... in which case I think the wFST needs
            // to be log weights or something ...

            Automaton levA = ConvertAutomaton(ToLevenshteinAutomata(lookupAutomaton));

            /*
             * Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
             * w.write(levA.toDot());
             * w.close();
             * System.out.println("Wrote LevA to out.dot");
             */
            return(FSTUtil.IntersectPrefixPaths(levA, fst));
        }
Beispiel #2
0
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(num > 0);
            }

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (fst == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            var utf8Key = new BytesRef(key);

            try
            {
                Automaton lookupAutomaton = ToLookupAutomaton(key);

                var spare = new CharsRef();

                //System.out.println("  now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println("  prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                var scratchArc = new FST.Arc <PairOutputs <long?, BytesRef> .Pair>();

                IList <LookupResult> results = new List <LookupResult>();

                IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {
                    int count = 0;
                    foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                                                                                                    count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node.  The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                                   path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(completions.IsComplete);
                    }

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input.  It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match.  Still, I
                    // suspect very little time is spent in this linear
                    // seach: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return(results);
                    }
                }

                Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
                                                                      num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(completions2.IsComplete);
                }

                foreach (Util.Fst.Util.Result <PairOutputs <long?, BytesRef> .Pair> completion in completions2)
                {
                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println("    result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return(results);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
        }