Ejemplo n.º 1
0
        /// <summary>
        /// Analyzes <paramref name="key"/> with <c>queryAnalyzer</c> and turns the resulting
        /// token stream into a determinized automaton.
        /// </summary>
        /// <param name="key">The text handed to <c>queryAnalyzer.GetTokenStream</c>.</param>
        /// <returns>
        /// The automaton produced from the token stream, after <c>ReplaceSep</c> and
        /// <c>BasicOperations.Determinize</c> have been applied to it.
        /// </returns>
        internal Automaton ToLookupAutomaton(string key)
        {
            // TODO: is there a Reader from a CharSequence?
            Automaton   lookup;
            TokenStream stream = queryAnalyzer.GetTokenStream("", key);

            try
            {
                // Turn the token stream into an automaton.
                lookup = GetTokenStreamToAutomaton().ToAutomaton(stream);
            }
            finally
            {
                // The stream is disposed on both the success and the failure path.
                IOUtils.DisposeWhileHandlingException(stream);
            }

            // TODO: we could use the end offset to "guess"
            // whether the final token was a partial token; this
            // would only be a heuristic ... but maybe an OK one.
            // This way we could eg differentiate "net" from "net ",
            // which we can't today...

            ReplaceSep(lookup);

            // TODO: we can optimize this somewhat by determinizing
            // while we convert
            BasicOperations.Determinize(lookup);

            return lookup;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a single automaton from the finite strings accepted by
        /// <paramref name="automaton"/>, replacing the tail of each sufficiently long
        /// string with a Levenshtein automaton.
        /// </summary>
        /// <remarks>
        /// Every path returned by <c>SpecialOperations.GetFiniteStrings</c> is handled on its own:
        /// <list type="bullet">
        /// <item><description>
        /// A path whose length is at most <c>nonFuzzyPrefix</c>, or below
        /// <c>minFuzzyLength</c>, becomes an exact-string automaton.
        /// </description></item>
        /// <item><description>
        /// Any other path keeps its first <c>nonFuzzyPrefix</c> symbols as an exact-string
        /// automaton and concatenates it with a Levenshtein automaton (built with
        /// <c>maxEdits</c> and <c>transpositions</c>) over the remaining symbols.
        /// </description></item>
        /// </list>
        /// </remarks>
        /// <param name="automaton">The automaton whose finite strings are enumerated.</param>
        /// <returns>
        /// An empty automaton when there are no paths, the single per-path automaton when
        /// there is exactly one, otherwise the determinized union of all per-path automata.
        /// </returns>
        internal Automaton ToLevenshteinAutomata(Automaton automaton)
        {
            // Highest Unicode code point (U+10FFFF). System.Char exposes no
            // MAX_CODE_POINT member (that is Java's Character.MAX_CODE_POINT),
            // so the value is spelled out here.
            const int MaxCodePoint = 0x10FFFF;

            var paths = SpecialOperations.GetFiniteStrings(automaton, -1);

            Automaton[] subs = new Automaton[paths.Count];
            int         upto = 0;

            foreach (IntsRef path in paths)
            {
                if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
                {
                    // Too short for fuzzy matching: accept the path verbatim.
                    subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
                    upto++;
                }
                else
                {
                    // The leading nonFuzzyPrefix symbols must match exactly ...
                    Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);

                    // ... and only the remaining symbols are fed to the Levenshtein automaton.
                    int[] ints = new int[path.Length - nonFuzzyPrefix];
                    Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);

                    // TODO: maybe add alphaMin to LevenshteinAutomata,
                    // and pass 1 instead of 0?  We probably don't want
                    // to allow the trailing dedup bytes to be
                    // edited... but then 0 byte is "in general" allowed
                    // on input (but not in UTF8).
                    int                 alphaMax     = unicodeAware ? MaxCodePoint : 255;
                    LevenshteinAutomata lev          = new LevenshteinAutomata(ints, alphaMax, transpositions);
                    Automaton           levAutomaton = lev.ToAutomaton(maxEdits);
                    Automaton           combined     = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
                    combined.Deterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
                    subs[upto]             = combined;
                    upto++;
                }
            }

            switch (subs.Length)
            {
                case 0:
                    // automaton is empty, there is no accepted paths through it
                    return BasicAutomata.MakeEmpty(); // matches nothing

                case 1:
                    // no synonyms or anything: just a single path through the tokenstream
                    return subs[0];

                default:
                    // multiple paths: this is really scary! is it slow?
                    // maybe we should not do this and throw UOE?
                    Automaton union = BasicOperations.Union(Arrays.AsList(subs));
                    // TODO: we could call toLevenshteinAutomata() before det?
                    // this only happens if you have multiple paths anyway (e.g. synonyms)
                    BasicOperations.Determinize(union);

                    return union;
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Returns <paramref name="a"/> unchanged when <c>unicodeAware</c> is false; otherwise
 /// runs it through <c>UTF32ToUTF8.Convert</c> and determinizes the result.
 /// </summary>
 /// <param name="a">The automaton to convert.</param>
 /// <returns>
 /// <paramref name="a"/> itself, or the determinized automaton produced by the converter.
 /// </returns>
 protected internal override Automaton ConvertAutomaton(Automaton a)
 {
     // Nothing to convert unless the unicodeAware flag is set.
     if (!unicodeAware)
     {
         return a;
     }

     UTF32ToUTF8 converter = new UTF32ToUTF8();
     Automaton   converted = converter.Convert(a);
     BasicOperations.Determinize(converted);

     return converted;
 }