Example #1
0
        /// <summary>
        /// Feeds the single supplementary code point U+1D122 through a
        /// <see cref="MappingCharFilter"/> built on <c>normMap</c> and checks the
        /// resulting whitespace-tokenized stream.
        /// </summary>
        public virtual void TestNonBMPChar()
        {
            // One code point outside the BMP, so the string holds two UTF-16 code units.
            string input = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);

            CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
            TokenStream tokens = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);

            // Expect exactly one term, "fclef". The int arguments are presumably the start
            // offsets, end offsets and final offset -- confirm against AssertTokenStreamContents.
            string[] expectedTerms = { "fclef" };
            int[] firstInts = { 0 };
            int[] secondInts = { 2 };
            AssertTokenStreamContents(tokens, expectedTerms, firstInts, secondInts, 2);
        }
Example #2
0
        /// <summary>
        /// Upper-cases a single Unicode code point using the invariant culture.
        /// </summary>
        /// <param name="codePoint">The code point to convert.</param>
        /// <returns>The code point found at index 0 of the upper-cased string.</returns>
        public static int ToUpper(int codePoint)
        {
            int[] single = { codePoint };
            string upperCased = UnicodeUtil.NewString(single, 0, 1).ToUpperInvariant();

            // Only the leading code point of the converted string is reported.
            return CodePointAt(upperCased, 0);
        }
Example #3
0
        /// <summary>
        /// Lower-cases a single Unicode code point.
        /// <para/>
        /// The conversion uses the invariant culture so that the result does not depend on
        /// the culture of the calling thread (for example, 'I' must not become a dotless
        /// 'i' when running under a Turkish culture).
        /// </summary>
        /// <param name="codePoint">The code point to convert.</param>
        /// <returns>The code point found at index 0 of the lower-cased string.</returns>
        public static int ToLowerCase(int codePoint)
        {
            var str = UnicodeUtil.NewString(new[] { codePoint }, 0, 1);

            // Culture-invariant on purpose: string.ToLower() would consult the current culture.
            str = str.ToLowerInvariant();

            return CodePointAt(str, 0);
        }
Example #4
0
        /// <summary>
        /// Lower-cases a single Unicode code point.
        /// <para/>
        /// The conversion uses the invariant culture so that the result does not depend on
        /// the culture of the calling thread (for example, 'I' must not become a dotless
        /// 'i' when running under a Turkish culture).
        /// </summary>
        /// <param name="codePoint">The code point to convert.</param>
        /// <returns>The code point found at index 0 of the lower-cased string.</returns>
        public static int ToLowerCase(int codePoint)
        {
            // LUCENENET TODO do we really need this? what's wrong with char.ToLower() ?
            // Note: char.ToLower() takes a single UTF-16 code unit, so it cannot handle
            // code points above U+FFFF; going through a string covers those.

            var str = UnicodeUtil.NewString(new[] { codePoint }, 0, 1);

            // Culture-invariant on purpose: string.ToLower() would consult the current culture.
            str = str.ToLowerInvariant();

            return CodePointAt(str, 0);
        }
Example #5
0
 /// <summary>
 /// Builds a printable form of <paramref name="term"/>.
 /// </summary>
 /// <param name="inputMode">0 selects UTF-8 decoding; any other value selects UTF-32 decoding.</param>
 /// <param name="term">The ints to render.</param>
 /// <param name="isValidUnicode">When false, no decoding is attempted and only <c>term.ToString()</c> is returned.</param>
 /// <returns>The decoded text followed by a space and <paramref name="term"/>, or just the term's string form.</returns>
 internal static string InputToString(int inputMode, IntsRef term, bool isValidUnicode)
 {
     if (!isValidUnicode)
     {
         return term.ToString();
     }

     // Mode 0: decode via a BytesRef as UTF-8; otherwise treat the ints as UTF-32 code points.
     string decoded = inputMode == 0
         ? ToBytesRef(term).Utf8ToString()
         : UnicodeUtil.NewString(term.Ints, term.Offset, term.Length);

     return decoded + " " + term;
 }
            /// <summary>
            /// Constructor for enumeration of all terms from the enumerator of
            /// <c>outerInstance.m_terms</c> which share a prefix of the outer instance's real
            /// prefix length with its term text and which have a fuzzy similarity &gt; the
            /// minimum similarity.
            /// <para/>
            /// After calling the constructor the enumeration is already pointing to the first
            /// valid term if such a term exists.
            /// </summary>
            /// <param name="outerInstance">The enclosing enum supplying the terms, the term text and the prefix length.</param>
            /// <exception cref="IOException">If there is a low-level I/O error.</exception>
            public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
                : base(outerInstance.m_terms.GetEnumerator())
            {
                this.outerInstance = outerInstance;
                this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();

                int prefixLength = outerInstance.m_realPrefixLength;
                int suffixLength = outerInstance.m_termLength - prefixLength;

                // Keep only the code points that follow the prefix.
                this.text = new int[suffixLength];
                System.Array.Copy(outerInstance.m_termText, prefixLength, text, 0, suffixLength);

                // Two int buffers, each one element longer than the suffix text.
                this.d = new int[suffixLength + 1];
                this.p = new int[suffixLength + 1];

                // The prefix code points, converted to a string, become the initial seek term.
                string prefixText = UnicodeUtil.NewString(outerInstance.m_termText, 0, prefixLength);
                prefixBytesRef = new BytesRef(prefixText);
                SetInitialSeekTerm(prefixBytesRef);
            }
Example #7
0
        /// <summary>
        /// Checks that a <see cref="CharacterRunAutomaton"/> and a <see cref="ByteRunAutomaton"/>
        /// built from the same <paramref name="automaton"/> give the same answer for a series of
        /// random strings (each string is run as text and as its UTF-8 bytes).
        /// </summary>
        /// <param name="automaton">The automaton both run-automata are built from.</param>
        private static void AssertAutomaton(Automaton automaton)
        {
            var cra = new CharacterRunAutomaton(automaton);
            var bra = new ByteRunAutomaton(automaton);
            var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

            int num = AtLeast(1000);

            for (int i = 0; i < num; i++)
            {
                string s;
                if (Random().NextBoolean())
                {
                    // likely not accepted
                    s = TestUtil.RandomUnicodeString(Random());
                }
                else
                {
                    // will be accepted
                    int[] codepoints = ras.GetRandomAcceptedString(Random());
                    try
                    {
                        s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
                    }
                    catch (Exception)
                    {
                        // Dump the offending code points (hex) before propagating the failure.
                        Console.WriteLine(codepoints.Length + " codepoints:");
                        for (int j = 0; j < codepoints.Length; j++)
                        {
                            Console.WriteLine("  " + codepoints[j].ToString("x"));
                        }
                        // Bare rethrow: "throw e;" would reset the stack trace (CA2200).
                        throw;
                    }
                }
                var bytes = s.GetBytes(Encoding.UTF8);
                Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
            }
        }
Example #8
0
        /// <summary>
        /// Runs a series of random strings through a <see cref="CharacterRunAutomaton"/> and a
        /// <see cref="ByteRunAutomaton"/> built from <paramref name="automaton"/> and asserts
        /// that both report the same result for every string.
        /// </summary>
        /// <param name="automaton">The automaton both run-automata are built from.</param>
        private static void AssertAutomaton(Automaton automaton)
        {
            var charRunner = new CharacterRunAutomaton(automaton);
            var byteRunner = new ByteRunAutomaton(automaton);
            var acceptedStrings = new RandomAcceptedStrings(automaton);

            int iterations = AtLeast(1000);
            for (int iteration = 0; iteration < iterations; iteration++)
            {
                string candidate;
                if (Random.NextBoolean())
                {
                    // Arbitrary unicode text: likely not accepted.
                    candidate = TestUtil.RandomUnicodeString(Random);
                }
                else
                {
                    // Generated from the automaton itself: will be accepted.
                    int[] codepoints = acceptedStrings.GetRandomAcceptedString(Random);
                    try
                    {
                        candidate = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
                    }
                    catch (Exception)
                    {
                        // Print the code points (hex) that could not be turned into a string.
                        Console.WriteLine(codepoints.Length + " codepoints:");
                        foreach (int codepoint in codepoints)
                        {
                            Console.WriteLine("  " + codepoint.ToString("x"));
                        }
                        throw; // bare rethrow keeps the original stack trace (CA2200)
                    }
                }

                var utf8 = candidate.GetBytes(Encoding.UTF8);
                var acceptedAsChars = charRunner.Run(candidate);
                var acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
                Assert.AreEqual(acceptedAsChars, acceptedAsBytes);
            }
        }
Example #9
0
        /// <summary>
        /// Runs the base set-up and then builds <c>normMap</c> from a fixed table of
        /// match/replacement pairs.
        /// </summary>
        public override void SetUp()
        {
            base.SetUp();

            // U+1D122 lies outside the BMP, so in UTF-16 it is a surrogate pair.
            string nonBmpMatch = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);

            // Each entry is { match, replacement }, registered in this order.
            string[][] mappings =
            {
                new[] { "aa", "a" },
                new[] { "bbb", "b" },
                new[] { "cccc", "cc" },
                new[] { "h", "i" },
                new[] { "j", "jj" },
                new[] { "k", "kkk" },
                new[] { "ll", "llll" },
                new[] { "empty", "" },
                new[] { nonBmpMatch, "fclef" },
                new[] { "\uff01", "full-width-exclamation" },
            };

            var builder = new NormalizeCharMap.Builder();
            foreach (string[] mapping in mappings)
            {
                builder.Add(mapping[0], mapping[1]);
            }

            normMap = builder.Build();
        }
        /// <summary>
        /// For a number of random regular expressions, asserts that the resulting automaton is
        /// not empty and that every string produced by <see cref="RandomAcceptedStrings"/> is
        /// accepted by that automaton. On failure the regexp and the code points are printed.
        /// </summary>
        public virtual void TestGetRandomAcceptedString()
        {
            int regexpCount = AtLeast(100);
            int stringsPerRegexp = AtLeast(100);

            for (int regexpIndex = 0; regexpIndex < regexpCount; regexpIndex++)
            {
                RegExp regexp = new RegExp(AutomatonTestUtil.RandomRegexp(Random), RegExpSyntax.NONE);
                Automaton automaton = regexp.ToAutomaton();
                Assert.IsFalse(BasicOperations.IsEmpty(automaton));

                RandomAcceptedStrings generator = new RandomAcceptedStrings(automaton);
                for (int stringIndex = 0; stringIndex < stringsPerRegexp; stringIndex++)
                {
                    // Declared outside the try so the catch block can report it.
                    int[] accepted = null;
                    try
                    {
                        accepted = generator.GetRandomAcceptedString(Random);
                        string text = UnicodeUtil.NewString(accepted, 0, accepted.Length);
                        Assert.IsTrue(BasicOperations.Run(automaton, text));
                    }
                    catch (Exception)
                    {
                        Console.WriteLine("regexp: " + regexp);
                        if (accepted != null)
                        {
                            Console.WriteLine("fail acc re=" + regexp + " count=" + accepted.Length);
                            foreach (int codePoint in accepted)
                            {
                                Console.WriteLine("  " + codePoint.ToString("x"));
                            }
                        }
                        throw; // bare rethrow keeps the original stack trace (CA2200)
                    }
                }
            }
        }
Example #11
0
        /// <summary>
        /// For a number of random regular expressions, asserts that the resulting automaton is
        /// not empty and that every string produced by
        /// <see cref="AutomatonTestUtil.RandomAcceptedStrings"/> is accepted by that automaton.
        /// On failure the regexp and the generated code points are printed before the
        /// exception is propagated.
        /// </summary>
        public virtual void TestGetRandomAcceptedString()
        {
            int ITER1 = AtLeast(100);
            int ITER2 = AtLeast(100);

            for (int i = 0; i < ITER1; i++)
            {
                RegExp    re = new RegExp(AutomatonTestUtil.RandomRegexp(Random()), RegExp.NONE);
                Automaton a  = re.ToAutomaton();
                Assert.IsFalse(BasicOperations.IsEmpty(a));

                AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
                for (int j = 0; j < ITER2; j++)
                {
                    // Declared outside the try so the catch block can report it.
                    int[] acc = null;
                    try
                    {
                        acc = rx.GetRandomAcceptedString(Random());
                        string s = UnicodeUtil.NewString(acc, 0, acc.Length);
                        Assert.IsTrue(BasicOperations.Run(a, s));
                    }
                    catch (Exception)
                    {
                        Console.WriteLine("regexp: " + re);
                        if (acc != null)
                        {
                            Console.WriteLine("fail acc re=" + re + " count=" + acc.Length);
                            for (int k = 0; k < acc.Length; k++)
                            {
                                Console.WriteLine("  " + acc[k].ToString("x"));
                            }
                        }
                        // Bare rethrow: "throw t;" would reset the stack trace (CA2200).
                        throw;
                    }
                }
            }
        }
Example #12
0
        /// <summary>
        /// Initialize levenshtein DFAs up to <paramref name="maxDistance"/>, if possible.
        /// </summary>
        /// <param name="maxDistance">The highest edit distance an automaton is wanted for.</param>
        /// <returns>
        /// The list held by <c>dfaAtt.Automata</c>, extended in place when automata were missing and
        /// <paramref name="maxDistance"/> does not exceed
        /// <see cref="LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"/>.
        /// </returns>
        private IList<CompiledAutomaton> InitAutomata(int maxDistance)
        {
            IList<CompiledAutomaton> cached = dfaAtt.Automata;

            // Nothing to build: the list already reaches maxDistance, or the distance is unsupported.
            if (cached.Count > maxDistance || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
            {
                return cached;
            }

            // The Levenshtein automata are built over the part of the term after the prefix.
            string suffixText = UnicodeUtil.NewString(m_termText, m_realPrefixLength, m_termText.Length - m_realPrefixLength);
            var levenshtein = new LevenshteinAutomata(suffixText, transpositions);

            for (int distance = cached.Count; distance <= maxDistance; distance++)
            {
                Automaton automaton = levenshtein.ToAutomaton(distance);

                // Constant prefix: prepend it so the automaton covers the whole term.
                if (m_realPrefixLength > 0)
                {
                    string prefixText = UnicodeUtil.NewString(m_termText, 0, m_realPrefixLength);
                    automaton = BasicOperations.Concatenate(BasicAutomata.MakeString(prefixText), automaton);
                }

                cached.Add(new CompiledAutomaton(automaton, true, false));
            }

            return cached;
        }
Example #13
0
        /// <summary>
        /// Extracts all <see cref="MultiTermQuery"/>s for <paramref name="field"/>, and returns equivalent
        /// automata that will match terms.
        /// <para/>
        /// Composite queries (boolean, disjunction-max and the span variants handled below) are
        /// walked recursively; prohibited boolean clauses are skipped.
        /// </summary>
        /// <param name="query">The query to inspect.</param>
        /// <param name="field">The field whose queries are collected (compared ordinally).</param>
        /// <returns>The collected automata; empty when nothing matched.</returns>
        internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
        {
            var automata = new List<CharacterRunAutomaton>();

            if (query is BooleanQuery)
            {
                foreach (BooleanClause clause in ((BooleanQuery)query).GetClauses())
                {
                    if (clause.IsProhibited)
                    {
                        continue;
                    }
                    automata.AddRange(ExtractAutomata(clause.Query, field));
                }
            }
            else if (query is DisjunctionMaxQuery)
            {
                foreach (Query disjunct in ((DisjunctionMaxQuery)query).Disjuncts)
                {
                    automata.AddRange(ExtractAutomata(disjunct, field));
                }
            }
            else if (query is SpanOrQuery)
            {
                foreach (Query spanClause in ((SpanOrQuery)query).GetClauses())
                {
                    automata.AddRange(ExtractAutomata(spanClause, field));
                }
            }
            else if (query is SpanNearQuery)
            {
                foreach (Query spanClause in ((SpanNearQuery)query).GetClauses())
                {
                    automata.AddRange(ExtractAutomata(spanClause, field));
                }
            }
            else if (query is SpanNotQuery)
            {
                // Only the include side is followed.
                var spanNot = (SpanNotQuery)query;
                automata.AddRange(ExtractAutomata(spanNot.Include, field));
            }
            else if (query is SpanPositionCheckQuery)
            {
                var positionCheck = (SpanPositionCheckQuery)query;
                automata.AddRange(ExtractAutomata(positionCheck.Match, field));
            }
            else if (query is ISpanMultiTermQueryWrapper)
            {
                var wrapper = (ISpanMultiTermQueryWrapper)query;
                automata.AddRange(ExtractAutomata(wrapper.WrappedQuery, field));
            }
            else if (query is AutomatonQuery)
            {
                var automatonQuery = (AutomatonQuery)query;
                if (automatonQuery.Field.Equals(field, StringComparison.Ordinal))
                {
                    automata.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                                     automatonQuery.Automaton,
                                     () => automatonQuery.ToString()));
                }
            }
            else if (query is PrefixQuery)
            {
                var prefixQuery = (PrefixQuery)query;
                Term prefixTerm = prefixQuery.Prefix;
                if (prefixTerm.Field.Equals(field, StringComparison.Ordinal))
                {
                    // The prefix text followed by any string.
                    Automaton prefixThenAnything = BasicOperations.Concatenate(
                        BasicAutomata.MakeString(prefixTerm.Text()),
                        BasicAutomata.MakeAnyString());
                    automata.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                                     prefixThenAnything,
                                     () => prefixQuery.ToString()));
                }
            }
            else if (query is FuzzyQuery)
            {
                var fuzzyQuery = (FuzzyQuery)query;
                if (fuzzyQuery.Field.Equals(field, StringComparison.Ordinal))
                {
                    // Decode the UTF-16 term text into code points.
                    string utf16 = fuzzyQuery.Term.Text();
                    int[] termText = new int[utf16.CodePointCount(0, utf16.Length)];
                    int charIndex = 0;
                    int codePointIndex = 0;
                    while (charIndex < utf16.Length)
                    {
                        int codePoint = utf16.CodePointAt(charIndex);
                        termText[codePointIndex++] = codePoint;
                        charIndex += Character.CharCount(codePoint);
                    }

                    // The Levenshtein automaton covers the part after the (clamped) prefix.
                    int termLength = termText.Length;
                    int prefixLength = Math.Min(fuzzyQuery.PrefixLength, termLength);
                    string suffixText = UnicodeUtil.NewString(termText, prefixLength, termText.Length - prefixLength);
                    var levenshtein = new LevenshteinAutomata(suffixText, fuzzyQuery.Transpositions);
                    Automaton fuzzyAutomaton = levenshtein.ToAutomaton(fuzzyQuery.MaxEdits);

                    if (prefixLength > 0)
                    {
                        // The prefix must match exactly, so prepend it as a literal string.
                        string prefixText = UnicodeUtil.NewString(termText, 0, prefixLength);
                        fuzzyAutomaton = BasicOperations.Concatenate(BasicAutomata.MakeString(prefixText), fuzzyAutomaton);
                    }

                    automata.Add(new CharacterRunAutomatonToStringAnonymousHelper(
                                     fuzzyAutomaton,
                                     () => fuzzyQuery.ToString()));
                }
            }
            else if (query is TermRangeQuery)
            {
                var rangeQuery = (TermRangeQuery)query;
                if (rangeQuery.Field.Equals(field, StringComparison.Ordinal))
                {
                    // This is *not* an automaton, but it is very simple: the helper is given an
                    // empty automaton plus the range query itself.
                    automata.Add(new SimpleCharacterRunAutomatonAnonymousHelper(BasicAutomata.MakeEmpty(), rangeQuery));
                }
            }

            return automata.ToArray();
        }