/// <summary>
/// Builds the lookup automaton for <paramref name="key"/>: the key is run
/// through <c>queryAnalyzer</c> (with an empty field name), the resulting
/// token stream is converted to an automaton, <c>ReplaceSep</c> is applied
/// to it, and it is finally determinized.
/// </summary>
/// <param name="key">The text to analyze.</param>
/// <returns>The automaton after separator replacement and determinization.</returns>
internal Automaton ToLookupAutomaton(string key)
{
    // TODO: is there a Reader from a CharSequence?

    // Turn the token stream into an automaton. The stream is always handed
    // to IOUtils.DisposeWhileHandlingException, whether or not the
    // conversion succeeds.
    Automaton lookup;
    TokenStream tokens = queryAnalyzer.GetTokenStream("", key);
    try
    {
        lookup = GetTokenStreamToAutomaton().ToAutomaton(tokens);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(tokens);
    }

    // TODO: we could use the end offset to "guess"
    // whether the final token was a partial token; this
    // would only be a heuristic ... but maybe an OK one.
    // This way we could eg differentiate "net" from "net ",
    // which we can't today...

    ReplaceSep(lookup);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    BasicOperations.Determinize(lookup);
    return lookup;
}
/// <summary>
/// Builds an automaton that fuzzily matches every finite string accepted by
/// <paramref name="automaton"/>.
/// </summary>
/// <remarks>
/// Each finite path is handled on its own:
/// <list type="bullet">
/// <item><description>A path whose length is at most <c>nonFuzzyPrefix</c>, or shorter
/// than <c>minFuzzyLength</c>, is turned into an exact-match string automaton.</description></item>
/// <item><description>Otherwise the first <c>nonFuzzyPrefix</c> ints are matched exactly and
/// the remainder is matched by a Levenshtein automaton built with
/// <c>maxEdits</c> and <c>transpositions</c>.</description></item>
/// </list>
/// The per-path automata are then combined: none yields the empty automaton,
/// exactly one is returned as-is, and several are unioned and determinized.
/// </remarks>
/// <param name="automaton">Automaton whose finite strings are enumerated.</param>
/// <returns>The combined automaton described above.</returns>
internal Automaton ToLevenshteinAutomata(Automaton automaton)
{
    // Highest Unicode code point (U+10FFFF). System.Char has no
    // MAX_CODE_POINT member (that is Java's Character.MAX_CODE_POINT) and
    // char.MaxValue only reaches U+FFFF, so the value is spelled out here.
    const int MaxCodePoint = 0x10FFFF;

    var paths = SpecialOperations.GetFiniteStrings(automaton, -1);
    Automaton[] subs = new Automaton[paths.Count];
    int upto = 0;

    foreach (IntsRef path in paths)
    {
        if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
        {
            // Too short to be fuzzed: match this path exactly.
            subs[upto++] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
        }
        else
        {
            // Exact-match prefix followed by a fuzzy-match suffix.
            Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);

            int[] ints = new int[path.Length - nonFuzzyPrefix];
            Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);

            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            LevenshteinAutomata lev = new LevenshteinAutomata(
                ints,
                unicodeAware ? MaxCodePoint : 255,
                transpositions);
            Automaton levAutomaton = lev.ToAutomaton(maxEdits);

            Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
            // its like the special case in concatenate itself, except we cloneExpanded already
            combined.Deterministic = true;
            subs[upto++] = combined;
        }
    }

    if (subs.Length == 0)
    {
        // automaton is empty, there is no accepted paths through it
        return BasicAutomata.MakeEmpty(); // matches nothing
    }

    if (subs.Length == 1)
    {
        // no synonyms or anything: just a single path through the tokenstream
        return subs[0];
    }

    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    Automaton a = BasicOperations.Union(Arrays.AsList(subs));
    // TODO: we could call toLevenshteinAutomata() before det?
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    BasicOperations.Determinize(a);
    return a;
}
/// <summary>
/// Returns <paramref name="a"/> unchanged unless <c>unicodeAware</c> is set;
/// in that case <paramref name="a"/> is passed through a
/// <c>UTF32ToUTF8</c> converter and the converted automaton is determinized
/// before being returned.
/// </summary>
/// <param name="a">The automaton to convert.</param>
/// <returns>
/// The determinized converted automaton when <c>unicodeAware</c> is true;
/// otherwise <paramref name="a"/> itself.
/// </returns>
protected internal override Automaton ConvertAutomaton(Automaton a)
{
    if (!unicodeAware)
    {
        return a;
    }

    UTF32ToUTF8 converter = new UTF32ToUTF8();
    Automaton converted = converter.Convert(a);
    BasicOperations.Determinize(converted);
    return converted;
}