/// <summary>
/// Builds an automaton that accepts fuzzy variants of every finite string accepted by
/// <paramref name="automaton"/>.
/// <para/>
/// Each finite path is handled separately: a path whose length is at most
/// <c>nonFuzzyPrefix</c>, or shorter than <c>minFuzzyLength</c>, is matched exactly;
/// any other path keeps its first <c>nonFuzzyPrefix</c> symbols exact and allows up to
/// <c>maxEdits</c> edits on the remainder.
/// </summary>
/// <param name="automaton">Automaton whose finite strings are enumerated.</param>
/// <returns>
/// An empty automaton when there are no paths, the single per-path automaton when there is
/// exactly one path, otherwise the determinized union of all per-path automata.
/// </returns>
internal Automaton ToLevenshteinAutomata(Automaton automaton)
{
    // Highest valid Unicode code point (U+10FFFF). System.Char has no MAX_CODE_POINT
    // member, so the value is spelled out here.
    const int MaxCodePoint = 0x10FFFF;

    // -1 is passed as the limit argument (presumably "no limit" -- confirm against SpecialOperations).
    var finiteStrings = SpecialOperations.GetFiniteStrings(automaton, -1);
    Automaton[] subs = new Automaton[finiteStrings.Count];
    int upto = 0;
    foreach (IntsRef path in finiteStrings)
    {
        if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
        {
            // Too short to be fuzzed: accept exactly this path.
            subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
            upto++;
        }
        else
        {
            Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);

            // Copy the part after the exact prefix; this is what may be edited.
            int[] ints = new int[path.Length - nonFuzzyPrefix];
            Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);

            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? MaxCodePoint : 255, transpositions);
            Automaton levAutomaton = lev.ToAutomaton(maxEdits);
            Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));

            // It's like the special case in concatenate itself, except we cloneExpanded already.
            combined.Deterministic = true;
            subs[upto] = combined;
            upto++;
        }
    }

    if (subs.Length == 0)
    {
        // Automaton is empty: there are no accepted paths through it.
        return BasicAutomata.MakeEmpty(); // matches nothing
    }
    else if (subs.Length == 1)
    {
        // No synonyms or anything: just a single path through the tokenstream.
        return subs[0];
    }
    else
    {
        // Multiple paths: this is really scary! Is it slow?
        // Maybe we should not do this and throw UOE?
        Automaton a = BasicOperations.Union(Arrays.AsList(subs));

        // TODO: we could call toLevenshteinAutomata() before det?
        // This only happens if you have multiple paths anyway (e.g. synonyms).
        BasicOperations.Determinize(a);
        return a;
    }
}
/// <summary>
/// Makes sure the cached list of Levenshtein DFAs has been extended up to
/// <paramref name="maxDistance"/>, when that is possible.
/// <para/>
/// One compiled automaton is appended for every distance from the current size of the
/// cached list through <paramref name="maxDistance"/>. Nothing is appended when the list
/// already holds more than <paramref name="maxDistance"/> entries or when
/// <paramref name="maxDistance"/> exceeds
/// <c>LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE</c>.
/// </summary>
/// <param name="maxDistance">Largest edit distance an automaton is requested for.</param>
/// <returns>The cached automata list (the same instance that was extended).</returns>
private IList<CompiledAutomaton> InitAutomata(int maxDistance)
{
    IList<CompiledAutomaton> cached = dfaAtt.Automata;

    // Already built far enough, or the requested distance is not supported: nothing to add.
    if (cached.Count > maxDistance || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
    {
        return cached;
    }

    // The builder only sees the part of the term after the constant prefix.
    LevenshteinAutomata levBuilder = new LevenshteinAutomata(
        UnicodeUtil.NewString(m_termText, m_realPrefixLength, m_termText.Length - m_realPrefixLength),
        transpositions);

    for (int distance = cached.Count; distance <= maxDistance; distance++)
    {
        Automaton dfa = levBuilder.ToAutomaton(distance);

        if (m_realPrefixLength > 0)
        {
            // Re-attach the constant prefix in front of the fuzzy suffix automaton.
            Automaton constantPrefix = BasicAutomata.MakeString(UnicodeUtil.NewString(m_termText, 0, m_realPrefixLength));
            dfa = BasicOperations.Concatenate(constantPrefix, dfa);
        }

        cached.Add(new CompiledAutomaton(dfa, true, false));
    }

    return cached;
}
/// <summary>
/// Extends the cached Levenshtein DFAs so that they reach <paramref name="maxDistance"/>,
/// if possible.
/// <para/>
/// When the cache holds at most <paramref name="maxDistance"/> entries and
/// <paramref name="maxDistance"/> does not exceed
/// <c>LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE</c>, one compiled automaton is added
/// per distance, starting at the current cache size. Otherwise the cache is left untouched.
/// </summary>
/// <param name="maxDistance">Largest edit distance an automaton is requested for.</param>
/// <returns>The cached automata list, possibly with new entries appended.</returns>
private IList<CompiledAutomaton> InitAutomata(int maxDistance)
{
    IList<CompiledAutomaton> automata = DfaAtt.Automata();

    if (automata.Count <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
    {
        // Only the text after the constant prefix takes part in fuzzy matching.
        var fuzzySuffix = UnicodeUtil.NewString(TermText, RealPrefixLength, TermText.Length - RealPrefixLength);
        var levenshtein = new LevenshteinAutomata(fuzzySuffix, Transpositions);

        int edits = automata.Count;
        while (edits <= maxDistance)
        {
            Automaton current = levenshtein.ToAutomaton(edits);

            // Constant prefix: prepend it as an exact-match automaton.
            if (RealPrefixLength > 0)
            {
                Automaton exactPrefix = BasicAutomata.MakeString(UnicodeUtil.NewString(TermText, 0, RealPrefixLength));
                current = BasicOperations.Concatenate(exactPrefix, current);
            }

            automata.Add(new CompiledAutomaton(current, true, false));
            edits++;
        }
    }

    return automata;
}
/// <summary>
/// Walks <paramref name="query"/> and collects, for every supported multi-term query
/// (<see cref="MultiTermQuery"/>) on <paramref name="field"/>, a run automaton that matches
/// the same terms.
/// <para/>
/// Compound queries (boolean, disjunction-max and the span variants handled below) are
/// searched recursively; prohibited boolean clauses are skipped. Leaf queries on a
/// different field, and query types not listed below, contribute nothing.
/// </summary>
/// <param name="query">Query to inspect.</param>
/// <param name="field">Field whose multi-term queries are of interest (compared ordinally).</param>
/// <returns>The collected automata; an empty array when none were found.</returns>
internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
{
    List<CharacterRunAutomaton> found = new List<CharacterRunAutomaton>();

    if (query is BooleanQuery booleanQuery)
    {
        foreach (BooleanClause clause in booleanQuery.GetClauses())
        {
            if (clause.IsProhibited)
            {
                continue;
            }

            found.AddAll(Arrays.AsList(ExtractAutomata(clause.Query, field)));
        }
    }
    else if (query is DisjunctionMaxQuery disjunctionMax)
    {
        foreach (Query sub in disjunctionMax.Disjuncts)
        {
            found.AddAll(Arrays.AsList(ExtractAutomata(sub, field)));
        }
    }
    else if (query is SpanOrQuery spanOr)
    {
        foreach (Query sub in spanOr.GetClauses())
        {
            found.AddAll(Arrays.AsList(ExtractAutomata(sub, field)));
        }
    }
    else if (query is SpanNearQuery spanNear)
    {
        foreach (Query sub in spanNear.GetClauses())
        {
            found.AddAll(Arrays.AsList(ExtractAutomata(sub, field)));
        }
    }
    else if (query is SpanNotQuery spanNot)
    {
        // Only the included side is inspected.
        found.AddAll(Arrays.AsList(ExtractAutomata(spanNot.Include, field)));
    }
    else if (query is SpanPositionCheckQuery positionCheck)
    {
        found.AddAll(Arrays.AsList(ExtractAutomata(positionCheck.Match, field)));
    }
    else if (query is ISpanMultiTermQueryWrapper spanWrapper)
    {
        found.AddAll(Arrays.AsList(ExtractAutomata(spanWrapper.WrappedQuery, field)));
    }
    else if (query is AutomatonQuery automatonQuery)
    {
        if (automatonQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            found.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                automatonQuery.Automaton,
                () => automatonQuery.ToString()));
        }
    }
    else if (query is PrefixQuery prefixQuery)
    {
        Term prefixTerm = prefixQuery.Prefix;
        if (prefixTerm.Field.Equals(field, StringComparison.Ordinal))
        {
            // Prefix text followed by any string.
            Automaton startsWith = BasicOperations.Concatenate(
                BasicAutomata.MakeString(prefixTerm.Text()),
                BasicAutomata.MakeAnyString());
            found.Add(new CharacterRunAutomatonToStringAnonymousHelper(startsWith, () => prefixQuery.ToString()));
        }
    }
    else if (query is FuzzyQuery fuzzyQuery)
    {
        if (fuzzyQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            // Decode the UTF-16 term text into code points so that prefix and suffix
            // are split on code point boundaries.
            string utf16 = fuzzyQuery.Term.Text();
            int[] codePoints = new int[utf16.CodePointCount(0, utf16.Length)];
            int charIndex = 0;
            int codePointIndex = 0;
            while (charIndex < utf16.Length)
            {
                int codePoint = utf16.CodePointAt(charIndex);
                codePoints[codePointIndex++] = codePoint;
                charIndex += Character.CharCount(codePoint);
            }

            // The exact prefix can never be longer than the term itself.
            int exactLength = Math.Min(fuzzyQuery.PrefixLength, codePoints.Length);
            string fuzzyPart = UnicodeUtil.NewString(codePoints, exactLength, codePoints.Length - exactLength);

            LevenshteinAutomata levBuilder = new LevenshteinAutomata(fuzzyPart, fuzzyQuery.Transpositions);
            Automaton fuzzyAutomaton = levBuilder.ToAutomaton(fuzzyQuery.MaxEdits);
            if (exactLength > 0)
            {
                Automaton exactPart = BasicAutomata.MakeString(UnicodeUtil.NewString(codePoints, 0, exactLength));
                fuzzyAutomaton = BasicOperations.Concatenate(exactPart, fuzzyAutomaton);
            }

            found.Add(new CharacterRunAutomatonToStringAnonymousHelper(fuzzyAutomaton, () => fuzzyQuery.ToString()));
        }
    }
    else if (query is TermRangeQuery rangeQuery)
    {
        if (rangeQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            // this is *not* an automaton, but its very simple
            found.Add(new SimpleCharacterRunAutomatonAnonymousHelper(BasicAutomata.MakeEmpty(), rangeQuery));
        }
    }

    return found.ToArray();
}