/// <summary>
/// Feeds a single supplementary code point (U+1D122) through a
/// <see cref="MappingCharFilter"/> built on <c>normMap</c> and asserts that the
/// tokenizer emits the one term "fclef".
/// </summary>
public virtual void TestNonBMPChar()
{
    // U+1D122 is above U+FFFF, so the resulting string is a two-char surrogate pair.
    string input = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);

    CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
    TokenStream tokens = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);

    string[] expectedTerms = new string[] { "fclef" };
    // NOTE(review): presumably start offsets, end offsets and final offset, in that
    // order - confirm against AssertTokenStreamContents.
    int[] expectedStarts = new int[] { 0 };
    int[] expectedEnds = new int[] { 2 };

    AssertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, 2);
}
/// <summary>
/// Upper-cases a single Unicode code point using the invariant culture.
/// </summary>
/// <param name="codePoint">The code point to convert.</param>
/// <returns>The first code point of the upper-cased string built from <paramref name="codePoint"/>.</returns>
public static int ToUpper(int codePoint)
{
    // Round-trip through a string so that code points needing a surrogate pair are handled.
    string upperCased = UnicodeUtil.NewString(new[] { codePoint }, 0, 1).ToUpperInvariant();
    return CodePointAt(upperCased, 0);
}
/// <summary>
/// Lower-cases a single Unicode code point using the invariant culture.
/// </summary>
/// <remarks>
/// The conversion is deliberately culture-invariant so the result does not depend on the
/// current thread culture (for example, Turkish cultures lower-case 'I' to a dotless 'ı'
/// when the culture-sensitive <c>ToLower()</c> is used). This also keeps the method
/// consistent with <c>ToUpper</c>, which uses <c>ToUpperInvariant()</c>.
/// </remarks>
/// <param name="codePoint">The code point to convert.</param>
/// <returns>The first code point of the lower-cased string built from <paramref name="codePoint"/>.</returns>
public static int ToLowerCase(int codePoint)
{
    // Round-trip through a string so that code points needing a surrogate pair are handled.
    string lowerCased = UnicodeUtil.NewString(new[] { codePoint }, 0, 1).ToLowerInvariant();
    return CodePointAt(lowerCased, 0);
}
/// <summary>
/// Lower-cases a single Unicode code point using the invariant culture.
/// </summary>
/// <remarks>
/// The conversion is culture-invariant so the result does not depend on the current
/// thread culture (for example, Turkish cultures lower-case 'I' to a dotless 'ı' when the
/// culture-sensitive <c>ToLower()</c> is used).
/// </remarks>
/// <param name="codePoint">The code point to convert.</param>
/// <returns>The first code point of the lower-cased string built from <paramref name="codePoint"/>.</returns>
public static int ToLowerCase(int codePoint)
{
    // LUCENENET NOTE: char.ToLower() takes a single UTF-16 code unit, so it cannot be
    // applied directly to a code point above U+FFFF; hence the round-trip through a string.
    string lowerCased = UnicodeUtil.NewString(new[] { codePoint }, 0, 1).ToLowerInvariant();
    return CodePointAt(lowerCased, 0);
}
/// <summary>
/// Renders <paramref name="term"/> for diagnostic output.
/// </summary>
/// <param name="inputMode">0 selects the UTF-8 interpretation; any other value selects UTF-32.</param>
/// <param name="term">The term to render.</param>
/// <param name="isValidUnicode">When <c>false</c>, only <c>term.ToString()</c> is returned.</param>
/// <returns>
/// <c>term.ToString()</c> when the term is not valid Unicode; otherwise the decoded text,
/// a single space, and the term's own string form.
/// </returns>
internal static string InputToString(int inputMode, IntsRef term, bool isValidUnicode)
{
    if (!isValidUnicode)
    {
        return term.ToString();
    }

    string decoded = inputMode == 0
        ? ToBytesRef(term).Utf8ToString()                              // utf8
        : UnicodeUtil.NewString(term.Ints, term.Offset, term.Length); // utf32

    return decoded + " " + term;
}
/// <summary>
/// Creates an enumeration over the terms of <c>outerInstance.m_terms</c>, seeded with the
/// prefix formed by the first <c>m_realPrefixLength</c> code points of
/// <c>outerInstance.m_termText</c>.
/// <para/>
/// The remaining code points of the term are copied into a private buffer, and the
/// initial seek term is set to the prefix before the constructor returns.
/// </summary>
/// <param name="outerInstance">The enclosing enum supplying the terms, term text and prefix length.</param>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
    : base(outerInstance.m_terms.GetEnumerator())
{
    this.outerInstance = outerInstance;
    this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();

    int prefixLength = outerInstance.m_realPrefixLength;
    int suffixLength = outerInstance.m_termLength - prefixLength;

    // Code points of the term that follow the fixed prefix.
    this.text = new int[suffixLength];
    System.Array.Copy(outerInstance.m_termText, prefixLength, this.text, 0, this.text.Length);

    // The fixed prefix, kept as bytes so it can be used as the seek term below.
    this.prefixBytesRef = new BytesRef(UnicodeUtil.NewString(outerInstance.m_termText, 0, prefixLength));

    // Two scratch rows, each one slot longer than the suffix.
    // NOTE(review): presumably the rows of the edit-distance computation - confirm.
    this.d = new int[this.text.Length + 1];
    this.p = new int[this.text.Length + 1];

    SetInitialSeekTerm(this.prefixBytesRef);
}
/// <summary>
/// Runs at least 1000 random strings through both a <see cref="CharacterRunAutomaton"/>
/// and a <see cref="ByteRunAutomaton"/> built from <paramref name="automaton"/> and
/// asserts that the two agree on every string (the byte automaton is fed the UTF-8
/// encoding of the same string).
/// </summary>
/// <param name="automaton">The automaton under test.</param>
private static void AssertAutomaton(Automaton automaton)
{
    var cra = new CharacterRunAutomaton(automaton);
    var bra = new ByteRunAutomaton(automaton);
    var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

    int num = AtLeast(1000);
    for (int i = 0; i < num; i++)
    {
        string s;
        if (Random().NextBoolean())
        {
            // likely not accepted
            s = TestUtil.RandomUnicodeString(Random());
        }
        else
        {
            // will be accepted
            int[] codepoints = ras.GetRandomAcceptedString(Random());
            try
            {
                s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
            }
            catch (Exception)
            {
                // Dump the offending code points before propagating the failure.
                Console.WriteLine(codepoints.Length + " codepoints:");
                for (int j = 0; j < codepoints.Length; j++)
                {
                    Console.WriteLine(" " + codepoints[j].ToString("x"));
                }

                // Plain rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }
        }

        var bytes = s.GetBytes(Encoding.UTF8);
        Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
    }
}
/// <summary>
/// Cross-checks the character-based and byte-based run automata built from
/// <paramref name="automaton"/>: for at least 1000 random strings, both must give the
/// same answer (the byte automaton receives the UTF-8 encoding of the string).
/// </summary>
/// <param name="automaton">The automaton under test.</param>
private static void AssertAutomaton(Automaton automaton)
{
    var charRunner = new CharacterRunAutomaton(automaton);
    var byteRunner = new ByteRunAutomaton(automaton);
    var acceptedStrings = new RandomAcceptedStrings(automaton);

    int iterations = AtLeast(1000);
    for (int iteration = 0; iteration < iterations; iteration++)
    {
        string candidate;
        if (Random.NextBoolean())
        {
            // Arbitrary unicode text: most likely rejected.
            candidate = TestUtil.RandomUnicodeString(Random);
        }
        else
        {
            // Drawn from the automaton itself: expected to be accepted.
            int[] codepoints = acceptedStrings.GetRandomAcceptedString(Random);
            try
            {
                candidate = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
            }
            catch (Exception)
            {
                // Print the code points that could not be turned into a string.
                Console.WriteLine(codepoints.Length + " codepoints:");
                foreach (int codepoint in codepoints)
                {
                    Console.WriteLine(" " + codepoint.ToString("x"));
                }

                throw; // LUCENENET: CA2200: Rethrow to preserve stack details
            }
        }

        var utf8 = candidate.GetBytes(Encoding.UTF8);
        var acceptedAsChars = charRunner.Run(candidate);
        var acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
        Assert.AreEqual(acceptedAsChars, acceptedAsBytes);
    }
}
/// <summary>
/// Builds the shared <c>normMap</c> used by the tests: a handful of ASCII mappings
/// (shrinking, growing and empty replacements), one supplementary code point and one
/// full-width character.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    var mappings = new NormalizeCharMap.Builder();

    mappings.Add("aa", "a");
    mappings.Add("bbb", "b");
    mappings.Add("cccc", "cc");

    mappings.Add("h", "i");
    mappings.Add("j", "jj");
    mappings.Add("k", "kkk");
    mappings.Add("ll", "llll");

    mappings.Add("empty", "");

    // Non-BMP (surrogate pair): U+1D122 needs two UTF-16 chars.
    string supplementary = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);
    mappings.Add(supplementary, "fclef");

    mappings.Add("\uff01", "full-width-exclamation");

    normMap = mappings.Build();
}
/// <summary>
/// For a number of random regular expressions, draws random strings from
/// <see cref="RandomAcceptedStrings"/> and asserts that the automaton built from the
/// expression accepts every one of them. On failure the regexp and the offending
/// code points are printed before the exception is rethrown.
/// </summary>
public virtual void TestGetRandomAcceptedString()
{
    int regexpCount = AtLeast(100);
    int samplesPerRegexp = AtLeast(100);

    for (int regexpIndex = 0; regexpIndex < regexpCount; regexpIndex++)
    {
        var regexp = new RegExp(AutomatonTestUtil.RandomRegexp(Random), RegExpSyntax.NONE);
        Automaton automaton = regexp.ToAutomaton();
        Assert.IsFalse(BasicOperations.IsEmpty(automaton));

        var sampler = new RandomAcceptedStrings(automaton);
        for (int sample = 0; sample < samplesPerRegexp; sample++)
        {
            // Declared outside the try so the catch block can report it.
            int[] accepted = null;
            try
            {
                accepted = sampler.GetRandomAcceptedString(Random);
                string text = UnicodeUtil.NewString(accepted, 0, accepted.Length);
                Assert.IsTrue(BasicOperations.Run(automaton, text));
            }
            catch (Exception)
            {
                Console.WriteLine("regexp: " + regexp);
                if (accepted != null)
                {
                    Console.WriteLine("fail acc re=" + regexp + " count=" + accepted.Length);
                    foreach (int codepoint in accepted)
                    {
                        Console.WriteLine(" " + codepoint.ToString("x"));
                    }
                }

                throw; // LUCENENET: CA2200: Rethrow to preserve stack details
            }
        }
    }
}
/// <summary>
/// For a number of random regular expressions, draws random strings from
/// <c>AutomatonTestUtil.RandomAcceptedStrings</c> and asserts that the automaton built
/// from the expression accepts every one of them. On failure the regexp and the
/// offending code points are printed before the exception is rethrown.
/// </summary>
public virtual void TestGetRandomAcceptedString()
{
    int ITER1 = AtLeast(100);
    int ITER2 = AtLeast(100);
    for (int i = 0; i < ITER1; i++)
    {
        RegExp re = new RegExp(AutomatonTestUtil.RandomRegexp(Random()), RegExp.NONE);
        Automaton a = re.ToAutomaton();
        Assert.IsFalse(BasicOperations.IsEmpty(a));

        AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
        for (int j = 0; j < ITER2; j++)
        {
            // Declared outside the try so the catch block can report it.
            int[] acc = null;
            try
            {
                acc = rx.GetRandomAcceptedString(Random());
                string s = UnicodeUtil.NewString(acc, 0, acc.Length);
                Assert.IsTrue(BasicOperations.Run(a, s));
            }
            catch (Exception)
            {
                Console.WriteLine("regexp: " + re);
                if (acc != null)
                {
                    Console.WriteLine("fail acc re=" + re + " count=" + acc.Length);
                    for (int k = 0; k < acc.Length; k++)
                    {
                        Console.WriteLine(" " + acc[k].ToString("x"));
                    }
                }

                // Plain rethrow keeps the original stack trace ("throw t;" would reset it).
                throw;
            }
        }
    }
}
/// <summary>
/// Ensures the automaton list held by <c>dfaAtt</c> contains Levenshtein automata for
/// every distance up to <paramref name="maxDistance"/>, building the missing ones when
/// the distance is supported.
/// </summary>
/// <param name="maxDistance">Highest edit distance an automaton is wanted for.</param>
/// <returns>
/// The list from <c>dfaAtt.Automata</c>; it is returned unchanged when it already has
/// more than <paramref name="maxDistance"/> entries or when <paramref name="maxDistance"/>
/// exceeds <see cref="LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"/>.
/// </returns>
private IList<CompiledAutomaton> InitAutomata(int maxDistance)
{
    IList<CompiledAutomaton> cached = dfaAtt.Automata;

    // Nothing to build: either already populated far enough, or the distance is unsupported.
    if (cached.Count > maxDistance || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
    {
        return cached;
    }

    // The Levenshtein automata only cover the part of the term after the fixed prefix.
    string suffix = UnicodeUtil.NewString(m_termText, m_realPrefixLength, m_termText.Length - m_realPrefixLength);
    LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, transpositions);

    for (int distance = cached.Count; distance <= maxDistance; distance++)
    {
        Automaton automaton = levenshtein.ToAutomaton(distance);

        // Re-attach the constant prefix in front of the fuzzy suffix.
        if (m_realPrefixLength > 0)
        {
            string prefixText = UnicodeUtil.NewString(m_termText, 0, m_realPrefixLength);
            automaton = BasicOperations.Concatenate(BasicAutomata.MakeString(prefixText), automaton);
        }

        cached.Add(new CompiledAutomaton(automaton, true, false));
    }

    return cached;
}
/// <summary>
/// Walks <paramref name="query"/> and collects, for every supported multi-term query on
/// <paramref name="field"/>, a <see cref="CharacterRunAutomaton"/> that matches its terms.
/// </summary>
/// <remarks>
/// Compound queries (boolean, disjunction-max and the span containers/wrappers) are
/// traversed recursively; prohibited boolean clauses are skipped. Automaton, prefix,
/// fuzzy and term-range queries produce an automaton only when their field equals
/// <paramref name="field"/> (ordinal comparison). Any other query type contributes nothing.
/// </remarks>
/// <param name="query">The query to inspect.</param>
/// <param name="field">The field whose multi-term queries are wanted.</param>
/// <returns>The collected automata; empty when nothing matched.</returns>
internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
{
    List<CharacterRunAutomaton> collected = new List<CharacterRunAutomaton>();

    if (query is BooleanQuery booleanQuery)
    {
        foreach (BooleanClause clause in booleanQuery.GetClauses())
        {
            if (clause.IsProhibited)
            {
                continue;
            }

            collected.AddAll(Arrays.AsList(ExtractAutomata(clause.Query, field)));
        }
    }
    else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
    {
        foreach (Query disjunct in disjunctionMaxQuery.Disjuncts)
        {
            collected.AddAll(Arrays.AsList(ExtractAutomata(disjunct, field)));
        }
    }
    else if (query is SpanOrQuery spanOrQuery)
    {
        foreach (Query spanClause in spanOrQuery.GetClauses())
        {
            collected.AddAll(Arrays.AsList(ExtractAutomata(spanClause, field)));
        }
    }
    else if (query is SpanNearQuery spanNearQuery)
    {
        foreach (Query spanClause in spanNearQuery.GetClauses())
        {
            collected.AddAll(Arrays.AsList(ExtractAutomata(spanClause, field)));
        }
    }
    else if (query is SpanNotQuery spanNotQuery)
    {
        // Only the included side is inspected.
        collected.AddAll(Arrays.AsList(ExtractAutomata(spanNotQuery.Include, field)));
    }
    else if (query is SpanPositionCheckQuery positionCheckQuery)
    {
        collected.AddAll(Arrays.AsList(ExtractAutomata(positionCheckQuery.Match, field)));
    }
    else if (query is ISpanMultiTermQueryWrapper spanWrapper)
    {
        collected.AddAll(Arrays.AsList(ExtractAutomata(spanWrapper.WrappedQuery, field)));
    }
    else if (query is AutomatonQuery automatonQuery)
    {
        if (automatonQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            collected.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                automatonQuery.Automaton,
                () => automatonQuery.ToString()));
        }
    }
    else if (query is PrefixQuery prefixQuery)
    {
        Term prefixTerm = prefixQuery.Prefix;
        if (prefixTerm.Field.Equals(field, StringComparison.Ordinal))
        {
            // The literal prefix followed by any string.
            Automaton prefixThenAnything = BasicOperations.Concatenate(
                BasicAutomata.MakeString(prefixTerm.Text()),
                BasicAutomata.MakeAnyString());
            collected.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                prefixThenAnything,
                () => prefixQuery.ToString()));
        }
    }
    else if (query is FuzzyQuery fuzzyQuery)
    {
        if (fuzzyQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            // Decode the UTF-16 term text into code points.
            string utf16 = fuzzyQuery.Term.Text();
            int[] codePoints = new int[utf16.CodePointCount(0, utf16.Length)];
            int charIndex = 0;
            int codePointIndex = 0;
            while (charIndex < utf16.Length)
            {
                int codePoint = utf16.CodePointAt(charIndex);
                codePoints[codePointIndex++] = codePoint;
                charIndex += Character.CharCount(codePoint);
            }

            // The exact-match prefix can never be longer than the term itself.
            int prefixLength = Math.Min(fuzzyQuery.PrefixLength, codePoints.Length);

            string suffixText = UnicodeUtil.NewString(codePoints, prefixLength, codePoints.Length - prefixLength);
            LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffixText, fuzzyQuery.Transpositions);
            Automaton fuzzyAutomaton = levenshtein.ToAutomaton(fuzzyQuery.MaxEdits);

            if (prefixLength > 0)
            {
                Automaton exactPrefix = BasicAutomata.MakeString(UnicodeUtil.NewString(codePoints, 0, prefixLength));
                fuzzyAutomaton = BasicOperations.Concatenate(exactPrefix, fuzzyAutomaton);
            }

            collected.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                fuzzyAutomaton,
                () => fuzzyQuery.ToString()));
        }
    }
    else if (query is TermRangeQuery termRangeQuery)
    {
        if (termRangeQuery.Field.Equals(field, StringComparison.Ordinal))
        {
            // Not a real automaton: the helper is constructed over an empty automaton
            // together with the range query itself.
            collected.Add(new SimpleCharacterRunAutomatonAnonymousHelper(BasicAutomata.MakeEmpty(), termRangeQuery));
        }
    }

    return collected.ToArray();
}