Example #1
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // create the tokenizer
            TokenStream result = new StandardTokenizer(CURRENT_VERSION, reader);

            // add in filters; note the stemmer runs before lower-casing,
            // so capitalized tokens reach the stemmer unchanged
            result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, new EnglishStemmer());

            // first normalize the StandardTokenizer
            //-result = new StandardFilter(result);

            // makes sure everything is lower case
            result = new LowerCaseFilter(result);

            result = new ASCIIFoldingFilter(result);

            // use the default list of Stop Words, provided by the StopAnalyzer class.
            //-result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            // the empty set below means no stop words are actually removed
            result = new StopFilter(true, result, new HashSet<string>());

            // return the built token stream.
            return(result);
        }
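
For reference, a minimal sketch of how a token stream built this way can be consumed under the Lucene.Net 3.x API (the analyzer instance and input text are illustrative; the attribute calls match the ones used in Example #18 below):

        // hypothetical driver, not part of the original example
        TokenStream stream = analyzer.TokenStream("content", new System.IO.StringReader("Crème brûlée recipes"));
        var termAttr = stream.GetAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAttr.Term); // stemmed, lower-cased, ASCII-folded terms
        }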
Example #2
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var result = subAnalyzer.TokenStream(fieldName, reader);

            result = new ASCIIFoldingFilter(result);
            return(result);
        }
Example #3
        /// <summary>
        /// Tokenizes a field for use in an autocomplete search. Ref DOH-893.
        /// Inspiration taken from:
        /// https://github.com/Sitecore/autohaus/blob/master/Autohaus.Custom/Indexing/Analyzers/NGramAnalyzer.cs
        /// http://stackoverflow.com/a/9183416
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // This should be a good tokenizer for most European-language documents:
            // Splits words at punctuation characters, removing punctuation.
            // Splits words at hyphens, unless there's a number in the token...
            // Recognizes email addresses and internet hostnames as one token.
            TokenStream tokenStream = new StandardTokenizer(this.version, reader);

            // apply a set of standard filters
            tokenStream = new StandardFilter(tokenStream);
            tokenStream = new LowerCaseFilter(tokenStream);

            // This class converts alphabetic, numeric, and symbolic Unicode characters
            // which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
            // block) into their ASCII equivalents, if one exists.
            tokenStream = new ASCIIFoldingFilter(tokenStream);
            tokenStream = new StopFilter(false, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            // apply the EdgeNGramTokenFilter
            // this turns each token into a set of prefixes, e.g. (with minGram = 3)
            // "South Melbourne" is turned into "sou", "sout", "south", "mel", "melb", ...
            tokenStream = new EdgeNGramTokenFilter(tokenStream, Side.FRONT, this.minGram, this.maxGram);

            // return the assembled token stream
            return(tokenStream);
        }
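
As a hedged illustration of the analyzer above (minGram = 3 and maxGram = 5 are assumed values, and AutocompleteAnalyzer is a stand-in name for the enclosing class), the prefix expansion works like this:

        // hypothetical usage; constructor arguments are assumptions, not the original's
        var autocomplete = new AutocompleteAnalyzer(Lucene.Net.Util.Version.LUCENE_30, minGram: 3, maxGram: 5);
        TokenStream ts = autocomplete.TokenStream("city", new StringReader("South Melbourne"));
        var term = ts.GetAttribute<ITermAttribute>();
        while (ts.IncrementToken())
        {
            Console.Write(term.Term + " "); // prints: sou sout south mel melb melbo
        }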
Example #4
        // For text processing
        internal string DeleteInvalidData(string result, string tipoAnalizador)
        {
            TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(result));

            tokenStream = new StandardFilter(tokenStream);  // removes punctuation marks
            tokenStream = new LowerCaseFilter(tokenStream); // converts the content to lower case
            if (tipoAnalizador == "Español")
            {
                // filter the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // converts characters above 127 in the ASCII table
                tokenStream = new ASCIIFoldingFilter(tokenStream);
                // stemming/lemmatization of the words
                tokenStream = SpanishSteammer(tokenStream);
            }
            else
            {
                // filter the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                // stemming/lemmatization of the words
                tokenStream = new PorterStemFilter(tokenStream);
            }

            return(GetDataTokens(tokenStream));
        }
Example #5
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters   = new ASCIIFoldingFilter(tokenizer);

                filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
                return(new TokenStreamComponents(tokenizer, filters));
            }
Example #6
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         source = new KeywordTokenizer(reader);
            TokenStream result = new ASCIIFoldingFilter(source);

            result = new LowerCaseFilter(LuceneVersion.LUCENE_48, result);
            return(new TokenStreamComponents(source, result));
        }
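
A minimal sketch of driving a 4.8-style analyzer such as the one above (the analyzer variable, field name and input are illustrative; Reset/IncrementToken/End is the standard Lucene.NET 4.8 consumption contract):

        // hypothetical usage, not part of the original example
        using (TokenStream ts = analyzer.GetTokenStream("owner", new StringReader("Ångström")))
        {
            var termAtt = ts.GetAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString()); // "angstrom" after folding and lower-casing
            }
            ts.End();
        }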
Example #7
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters   = new ASCIIFoldingFilter(tokenizer);

                filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
                return(new TokenStreamComponents(tokenizer, filters));
            }
Example #8
    public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        TokenStream result = new StandardTokenizer(matchVersion, reader);

        result = new StandardFilter(result);
        result = new ASCIIFoldingFilter(result);
        return(result);
    }
Example #9
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);
            TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source, 255, CharArraySet.EMPTY_SET);

            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(LuceneVersion.LUCENE_48, result);
            return(new TokenStreamComponents(source, result));
        }
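
The 255 passed to WordDelimiterFilter above is a flag bitmask. Assuming the standard Lucene 4.8 flag values, 255 is the bitwise OR of the first eight WordDelimiterFlags (everything except STEM_ENGLISH_POSSESSIVE, the flag negated in Example #25 below), so a more self-documenting spelling of the same call would be:

            // a sketch: the flag set is reconstructed from the numeric value 255,
            // not taken from the original author
            var flags = WordDelimiterFlags.GENERATE_WORD_PARTS
                        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
                        | WordDelimiterFlags.CATENATE_WORDS
                        | WordDelimiterFlags.CATENATE_NUMBERS
                        | WordDelimiterFlags.CATENATE_ALL
                        | WordDelimiterFlags.PRESERVE_ORIGINAL
                        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
                        | WordDelimiterFlags.SPLIT_ON_NUMERICS;
            TokenStream result = new WordDelimiterFilter(LuceneVersion.LUCENE_48, source, flags, CharArraySet.EMPTY_SET);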
Example #10
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters   = new ASCIIFoldingFilter(tokenizer);

#pragma warning disable 612, 618
                filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
#pragma warning restore 612, 618
                return(new TokenStreamComponents(tokenizer, filters));
            }
Example #11
        /// <summary>
        /// Override of the token stream method, uses these filters in order:
        ///
        /// Whitespace splitter
        /// ASCII folding (i.e. é becomes e)
        /// Lowercase
        /// Stopwords removed
        /// Porter stemming (reduces words to common stem)
        /// </summary>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         tokenizer = new WhitespaceTokenizer(LeoLuceneVersion.Version, reader);
            TokenStream filter    = new ASCIIFoldingFilter(tokenizer);

            filter = new LowerCaseFilter(LeoLuceneVersion.Version, filter);
            filter = new StopFilter(LeoLuceneVersion.Version, filter, _words);
            filter = new PorterStemFilter(filter);
            return(new TokenStreamComponents(tokenizer, filter));
        }
Example #12
        /// <summary>
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream t = new LetterOrDigitTokenizer(reader);

            t = new LowerCaseFilter(t);
            t = new ASCIIFoldingFilter(t);

            return(t);
        }
Example #13
 public virtual void TestInvalidOffsets()
 {
     Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
     {
         Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
         TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
         filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
         return new TokenStreamComponents(tokenizer, filters);
     });
     AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
 }
Example #14
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            //create the tokenizer
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            // add in filters; note that stemming runs before lower-casing here,
            // and the stop-word filter sees already-stemmed, folded tokens
            result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, new PortugueseStemmer());
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(true, result, EnglishStopWords.GetEnglishStopWords());
            return(result);
        }
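
Several of the Lucene.Net 3.x examples on this page (this one and Example #1) run the stemmer before lower-casing and stop-word removal. For comparison, a sketch of the more conventional ordering under the same API (tokenize, lower-case, fold, remove stop words, then stem); this is a rearrangement for illustration, not the original author's code:

        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            result = new LowerCaseFilter(result);    // normalize case first
            result = new ASCIIFoldingFilter(result); // then strip diacritics
            result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, new EnglishStemmer());
            return(result);
        }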
Example #15
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

            result = new EnglishPossessiveFilter(m_matchVersion, result);
            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, m_stopwords);
            result = new PorterStemFilter(result);
            return(new TokenStreamComponents(source, result));
        }
Example #16
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // ReSharper disable once PossibleNullReferenceException
                var components = (TokenStreamComponents)_analyzer.GetType().GetMethod(nameof(CreateComponents), BindingFlags.Instance | BindingFlags.NonPublic)
                                 .Invoke(_analyzer, new object[] { fieldName, reader });

                var tokenizer = components.Tokenizer;
                var stream    = components.TokenStream;

                stream = new ASCIIFoldingFilter(stream);

                return(new TokenStreamComponents(tokenizer, stream));
            }
Example #17
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // in order: standard tokenizer
                Tokenizer source = new StandardTokenizer(version, reader);
                // lowercase
                TokenStream tokenStream = new LowerCaseFilter(version, source);

                // replace non-ASCII (e.g. Unicode) chars with the closest ASCII equivalent, if possible
                tokenStream = new ASCIIFoldingFilter(tokenStream);
                // creates all ngrams
                tokenStream = new NGramTokenFilter(version, tokenStream, k_MinGramSize, k_MaxGramSize);
                return(new TokenStreamComponents(source, tokenStream));
            }
Example #18
        // For text processing
        public string AnalizarConsulta(string consulta, string tipoAnalizador)
        {
            ArrayList   ListStemsList = new ArrayList();
            TokenStream tokenStream   = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(consulta));

            tokenStream = new StandardFilter(tokenStream);  // removes punctuation marks
            tokenStream = new LowerCaseFilter(tokenStream); // converts the content to lower case
            if (tipoAnalizador == "Español")
            {
                // filter the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // converts characters above 127 in the ASCII table
                tokenStream = new ASCIIFoldingFilter(tokenStream);
                // stemming/lemmatization of the words
                tokenStream = SpanishSteammer(tokenStream);
            }
            else if (tipoAnalizador == "Ingles")
            {
                // filter the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                // stemming/lemmatization of the words
                tokenStream = new PorterStemFilter(tokenStream);
            }
            else // if no language is set, just remove stop words in both languages
            {
                // filter the content against the Spanish stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // filter the content against the English stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            }

            string term     = string.Empty;
            var    termAttr = tokenStream.GetAttribute <ITermAttribute>();

            int i = 0;

            while (tokenStream.IncrementToken())
            {
                if (i == 0)
                {
                    term = termAttr.Term;
                    i++;
                }
                else
                {
                    term = term + "," + termAttr.Term;
                }
            }

            return((string.IsNullOrEmpty(term)) ? string.Empty : term.Trim());
        }
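
A hedged illustration of calling the method above (analizador stands for an instance of the enclosing class; the input string is made up, and the exact stems depend on the Snowball Spanish stemmer):

        // hypothetical call; returns the analyzed terms joined by commas
        string terms = analizador.AnalizarConsulta("Los árboles verdes", "Español");
        // the stop word "los" is removed, accents folded, stems applied,
        // yielding something like "arbol,verd"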
Example #19
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ASCIIFoldingFilter(result);
                result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
                //DEFAULT_SIDE = Side.FRONT
                result = new EdgeNGramTokenFilter(
                    result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);

                return(result);
            }
Example #20
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new LetterOrDigitTokenizer(reader);

            if (_ignoreLanguageAccents)
            {
                result = new ASCIIFoldingFilter(result);
            }
            if (_caseInsensitive)
            {
                result = new LowerCaseFilter(result);
            }
            return(result);
        }
Example #21
        public virtual void TestInvalidOffsets()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
#pragma warning disable 612, 618
                filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
#pragma warning restore 612, 618
                return(new TokenStreamComponents(tokenizer, filters));
            });

            AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
        }
Example #22
        public virtual void TestAllFoldings()
        {
            // Alternating strings of:
            //   1. All non-ASCII characters to be folded, concatenated together as a
            //      single string.
            //   2. The string of ASCII characters to which each of the above
            //      characters should be folded.
            string[] foldings = new string[] { "À" + "Á" + "Â" + "Ã" + "Ä" + "Å" + "Ā" + "Ă" + "Ą" + "Ə" + "Ǎ" + "Ǟ" + "Ǡ" + "Ǻ" + "Ȁ" + "Ȃ" + "Ȧ" + "Ⱥ" + "ᴀ" + "Ḁ" + "Ạ" + "Ả" + "Ấ" + "Ầ" + "Ẩ" + "Ẫ" + "Ậ" + "Ắ" + "Ằ" + "Ẳ" + "Ẵ" + "Ặ" + "Ⓐ" + "A", "A", "à" + "á" + "â" + "ã" + "ä" + "å" + "ā" + "ă" + "ą" + "ǎ" + "ǟ" + "ǡ" + "ǻ" + "ȁ" + "ȃ" + "ȧ" + "ɐ" + "ə" + "ɚ" + "ᶏ" + "ḁ" + "ᶕ" + "ẚ" + "ạ" + "ả" + "ấ" + "ầ" + "ẩ" + "ẫ" + "ậ" + "ắ" + "ằ" + "ẳ" + "ẵ" + "ặ" + "ₐ" + "ₔ" + "ⓐ" + "ⱥ" + "Ɐ" + "a", "a", "Ꜳ", "AA", "Æ" + "Ǣ" + "Ǽ" + "ᴁ", "AE", "Ꜵ", "AO", "Ꜷ", "AU", "Ꜹ" + "Ꜻ", "AV", "Ꜽ", "AY", "⒜", "(a)", "ꜳ", "aa", "æ" + "ǣ" + "ǽ" + "ᴂ", "ae", "ꜵ", "ao", "ꜷ", "au", "ꜹ" + "ꜻ", "av", "ꜽ", "ay", "Ɓ" + "Ƃ" + "Ƀ" + "ʙ" + "ᴃ" + "Ḃ" + "Ḅ" + "Ḇ" + "Ⓑ" + "B", "B", "ƀ" + "ƃ" + "ɓ" + "ᵬ" + "ᶀ" + "ḃ" + "ḅ" + "ḇ" + "ⓑ" + "b", "b", "⒝", "(b)", "Ç" + "Ć" + "Ĉ" + "Ċ" + "Č" + "Ƈ" + "Ȼ" + "ʗ" + "ᴄ" + "Ḉ" + "Ⓒ" + "C", "C", "ç" + "ć" + "ĉ" + "ċ" + "č" + "ƈ" + "ȼ" + "ɕ" + "ḉ" + "ↄ" + "ⓒ" + "Ꜿ" + "ꜿ" + "c", "c", "⒞", "(c)", "Ð" + "Ď" + "Đ" + "Ɖ" + "Ɗ" + "Ƌ" + "ᴅ" + "ᴆ" + "Ḋ" + "Ḍ" + "Ḏ" + "Ḑ" + "Ḓ" + "Ⓓ" + "Ꝺ" + "D", "D", "ð" + "ď" + "đ" + "ƌ" + "ȡ" + "ɖ" + "ɗ" + "ᵭ" + "ᶁ" + "ᶑ" + "ḋ" + "ḍ" + "ḏ" + "ḑ" + "ḓ" + "ⓓ" + "ꝺ" + "d", "d", "DŽ" + "DZ", "DZ", "Dž" + "Dz", "Dz", "⒟", "(d)", "ȸ", "db", "dž" + "dz" + "ʣ" + "ʥ", "dz", "È" + "É" + "Ê" + "Ë" + "Ē" + "Ĕ" + "Ė" + "Ę" + "Ě" + "Ǝ" + "Ɛ" + "Ȅ" + "Ȇ" + "Ȩ" + "Ɇ" + "ᴇ" + "Ḕ" + "Ḗ" + "Ḙ" + "Ḛ" + "Ḝ" + "Ẹ" + "Ẻ" + "Ẽ" + "Ế" + "Ề" + "Ể" + "Ễ" + "Ệ" + "Ⓔ" + "ⱻ" + "E", "E", "è" + "é" + "ê" + "ë" + "ē" + "ĕ" + "ė" + "ę" + "ě" + "ǝ" + "ȅ" + "ȇ" + "ȩ" + "ɇ" + "ɘ" + "ɛ" + "ɜ" + "ɝ" + "ɞ" + "ʚ" + "ᴈ" + "ᶒ" + "ᶓ" + "ᶔ" + "ḕ" + "ḗ" + "ḙ" + "ḛ" + "ḝ" + "ẹ" + "ẻ" + "ẽ" + "ế" + "ề" + "ể" + "ễ" + "ệ" + "ₑ" + "ⓔ" + "ⱸ" + "e", "e", "⒠", "(e)", "Ƒ" + "Ḟ" + "Ⓕ" + "ꜰ" + "Ꝼ" + "ꟻ" + "F", "F", "ƒ" + "ᵮ" + "ᶂ" + "ḟ" + "ẛ" + "ⓕ" + "ꝼ" + "f", "f", "⒡", "(f)", "ff", "ff", "ffi", "ffi", "ffl", "ffl", "fi", "fi", "fl", "fl", "Ĝ" + "Ğ" + "Ġ" + "Ģ" + "Ɠ" + "Ǥ" + "ǥ" + "Ǧ" + "ǧ" + "Ǵ" + "ɢ" + "ʛ" + "Ḡ" + "Ⓖ" + "Ᵹ" + "Ꝿ" + "G", "G", "ĝ" + "ğ" + "ġ" + "ģ" + "ǵ" + "ɠ" + "ɡ" + "ᵷ" + "ᵹ" + "ᶃ" + "ḡ" + "ⓖ" + "ꝿ" + "g", "g", "⒢", "(g)", "Ĥ" + "Ħ" + "Ȟ" + "ʜ" + "Ḣ" + "Ḥ" + "Ḧ" + "Ḩ" + "Ḫ" + "Ⓗ" + "Ⱨ" + "Ⱶ" + "H", "H", "ĥ" + "ħ" + "ȟ" + "ɥ" + "ɦ" + "ʮ" + "ʯ" + "ḣ" + "ḥ" + "ḧ" + "ḩ" + "ḫ" + "ẖ" + "ⓗ" + "ⱨ" + "ⱶ" + "h", "h", "Ƕ", "HV", "⒣", "(h)", "ƕ", "hv", "Ì" + "Í" + "Î" + "Ï" + "Ĩ" + "Ī" + "Ĭ" + "Į" + "İ" + "Ɩ" + "Ɨ" + "Ǐ" + "Ȉ" + "Ȋ" + "ɪ" + "ᵻ" + "Ḭ" + "Ḯ" + "Ỉ" + "Ị" + "Ⓘ" + "ꟾ" + "I", "I", "ì" + "í" + "î" + "ï" + "ĩ" + "ī" + "ĭ" + "į" + "ı" + "ǐ" + "ȉ" + "ȋ" + "ɨ" + "ᴉ" + "ᵢ" + "ᵼ" + "ᶖ" + "ḭ" + "ḯ" + "ỉ" + "ị" + "ⁱ" + "ⓘ" + "i", "i", "IJ", "IJ", "⒤", "(i)", "ij", "ij", "Ĵ" + "Ɉ" + "ᴊ" + "Ⓙ" + "J", "J", "ĵ" + "ǰ" + "ȷ" + "ɉ" + "ɟ" + "ʄ" + "ʝ" + "ⓙ" + "ⱼ" + "j", "j", "⒥", "(j)", "Ķ" + "Ƙ" + "Ǩ" + "ᴋ" + "Ḱ" + "Ḳ" + "Ḵ" + "Ⓚ" + "Ⱪ" + "Ꝁ" + "Ꝃ" + "Ꝅ" + "K", "K", "ķ" + "ƙ" + "ǩ" + "ʞ" + "ᶄ" + "ḱ" + "ḳ" + "ḵ" + "ⓚ" + "ⱪ" + "ꝁ" + "ꝃ" + "ꝅ" + "k", "k", "⒦", "(k)", "Ĺ" + "Ļ" + "Ľ" + "Ŀ" + "Ł" + "Ƚ" + "ʟ" + "ᴌ" + "Ḷ" + "Ḹ" + "Ḻ" + "Ḽ" + "Ⓛ" + "Ⱡ" + "Ɫ" + "Ꝇ" + "Ꝉ" + "Ꞁ" + "L", "L", "ĺ" + "ļ" + "ľ" + "ŀ" + "ł" + "ƚ" + "ȴ" + "ɫ" + "ɬ" + "ɭ" + "ᶅ" + "ḷ" + "ḹ" + "ḻ" + "ḽ" + "ⓛ" + "ⱡ" + "ꝇ" + "ꝉ" + "ꞁ" + "l", "l", "LJ", "LJ", "Ỻ", "LL", "Lj", "Lj", "⒧", "(l)", "lj", "lj", "ỻ", "ll", "ʪ", "ls", "ʫ", "lz", "Ɯ" + "ᴍ" + "Ḿ" + "Ṁ" + "Ṃ" + "Ⓜ" + "Ɱ" + "ꟽ" + "ꟿ" + "M", "M", "ɯ" + "ɰ" + "ɱ" + "ᵯ" + "ᶆ" + "ḿ" + "ṁ" + "ṃ" + "ⓜ" + "m", "m", "⒨", "(m)", "Ñ" + "Ń" + "Ņ" + "Ň" + "Ŋ" + "Ɲ" + "Ǹ" + "Ƞ" + "ɴ" + "ᴎ" + "Ṅ" + "Ṇ" + "Ṉ" + 
"Ṋ" + "Ⓝ" + "N", "N", "ñ" + "ń" + "ņ" + "ň" + "ʼn" + "ŋ" + "ƞ" + "ǹ" + "ȵ" + "ɲ" + "ɳ" + "ᵰ" + "ᶇ" + "ṅ" + "ṇ" + "ṉ" + "ṋ" + "ⁿ" + "ⓝ" + "n", "n", "NJ", "NJ", "Nj", "Nj", "⒩", "(n)", "nj", "nj", "Ò" + "Ó" + "Ô" + "Õ" + "Ö" + "Ø" + "Ō" + "Ŏ" + "Ő" + "Ɔ" + "Ɵ" + "Ơ" + "Ǒ" + "Ǫ" + "Ǭ" + "Ǿ" + "Ȍ" + "Ȏ" + "Ȫ" + "Ȭ" + "Ȯ" + "Ȱ" + "ᴏ" + "ᴐ" + "Ṍ" + "Ṏ" + "Ṑ" + "Ṓ" + "Ọ" + "Ỏ" + "Ố" + "Ồ" + "Ổ" + "Ỗ" + "Ộ" + "Ớ" + "Ờ" + "Ở" + "Ỡ" + "Ợ" + "Ⓞ" + "Ꝋ" + "Ꝍ" + "O", "O", "ò" + "ó" + "ô" + "õ" + "ö" + "ø" + "ō" + "ŏ" + "ő" + "ơ" + "ǒ" + "ǫ" + "ǭ" + "ǿ" + "ȍ" + "ȏ" + "ȫ" + "ȭ" + "ȯ" + "ȱ" + "ɔ" + "ɵ" + "ᴖ" + "ᴗ" + "ᶗ" + "ṍ" + "ṏ" + "ṑ" + "ṓ" + "ọ" + "ỏ" + "ố" + "ồ" + "ổ" + "ỗ" + "ộ" + "ớ" + "ờ" + "ở" + "ỡ" + "ợ" + "ₒ" + "ⓞ" + "ⱺ" + "ꝋ" + "ꝍ" + "o", "o", "Œ" + "ɶ", "OE", "Ꝏ", "OO", "Ȣ" + "ᴕ", "OU", "⒪", "(o)", "œ" + "ᴔ", "oe", "ꝏ", "oo", "ȣ", "ou", "Ƥ" + "ᴘ" + "Ṕ" + "Ṗ" + "Ⓟ" + "Ᵽ" + "Ꝑ" + "Ꝓ" + "Ꝕ" + "P", "P", "ƥ" + "ᵱ" + "ᵽ" + "ᶈ" + "ṕ" + "ṗ" + "ⓟ" + "ꝑ" + "ꝓ" + "ꝕ" + "ꟼ" + "p", "p", "⒫", "(p)", "Ɋ" + "Ⓠ" + "Ꝗ" + "Ꝙ" + "Q", "Q", "ĸ" + "ɋ" + "ʠ" + "ⓠ" + "ꝗ" + "ꝙ" + "q", "q", "⒬", "(q)", "ȹ", "qp", "Ŕ" + "Ŗ" + "Ř" + "Ȑ" + "Ȓ" + "Ɍ" + "ʀ" + "ʁ" + "ᴙ" + "ᴚ" + "Ṙ" + "Ṛ" + "Ṝ" + "Ṟ" + "Ⓡ" + "Ɽ" + "Ꝛ" + "Ꞃ" + "R", "R", "ŕ" + "ŗ" + "ř" + "ȑ" + "ȓ" + "ɍ" + "ɼ" + "ɽ" + "ɾ" + "ɿ" + "ᵣ" + "ᵲ" + "ᵳ" + "ᶉ" + "ṙ" + "ṛ" + "ṝ" + "ṟ" + "ⓡ" + "ꝛ" + "ꞃ" + "r", "r", "⒭", "(r)", "Ś" + "Ŝ" + "Ş" + "Š" + "Ș" + "Ṡ" + "Ṣ" + "Ṥ" + "Ṧ" + "Ṩ" + "Ⓢ" + "ꜱ" + "ꞅ" + "S", "S", "ś" + "ŝ" + "ş" + "š" + "ſ" + "ș" + "ȿ" + "ʂ" + "ᵴ" + "ᶊ" + "ṡ" + "ṣ" + "ṥ" + "ṧ" + "ṩ" + "ẜ" + "ẝ" + "ⓢ" + "Ꞅ" + "s", "s", "ẞ", "SS", "⒮", "(s)", "ß", "ss", "st", "st", "Ţ" + "Ť" + "Ŧ" + "Ƭ" + "Ʈ" + "Ț" + "Ⱦ" + "ᴛ" + "Ṫ" + "Ṭ" + "Ṯ" + "Ṱ" + "Ⓣ" + "Ꞇ" + "T", "T", "ţ" + "ť" + "ŧ" + "ƫ" + "ƭ" + "ț" + "ȶ" + "ʇ" + "ʈ" + "ᵵ" + "ṫ" + "ṭ" + "ṯ" + "ṱ" + "ẗ" + "ⓣ" + "ⱦ" + "t", "t", "Þ" + "Ꝧ", "TH", "Ꜩ", "TZ", "⒯", "(t)", "ʨ", "tc", "þ" + "ᵺ" + "ꝧ", "th", "ʦ", "ts", "ꜩ", "tz", "Ù" + "Ú" + "Û" + "Ü" + "Ũ" + "Ū" + "Ŭ" + "Ů" + "Ű" + "Ų" + "Ư" + "Ǔ" + "Ǖ" + "Ǘ" + "Ǚ" + "Ǜ" + "Ȕ" + "Ȗ" + "Ʉ" + "ᴜ" + "ᵾ" + "Ṳ" + "Ṵ" + "Ṷ" + "Ṹ" + "Ṻ" + "Ụ" + "Ủ" + "Ứ" + "Ừ" + "Ử" + "Ữ" + "Ự" + "Ⓤ" + "U", "U", "ù" + "ú" + "û" + "ü" + "ũ" + "ū" + "ŭ" + "ů" + "ű" + "ų" + "ư" + "ǔ" + "ǖ" + "ǘ" + "ǚ" + "ǜ" + "ȕ" + "ȗ" + "ʉ" + "ᵤ" + "ᶙ" + "ṳ" + "ṵ" + "ṷ" + "ṹ" + "ṻ" + "ụ" + "ủ" + "ứ" + "ừ" + "ử" + "ữ" + "ự" + "ⓤ" + "u", "u", "⒰", "(u)", "ᵫ", "ue", "Ʋ" + "Ʌ" + "ᴠ" + "Ṽ" + "Ṿ" + "Ỽ" + "Ⓥ" + "Ꝟ" + "Ꝩ" + "V", "V", "ʋ" + "ʌ" + "ᵥ" + "ᶌ" + "ṽ" + "ṿ" + "ⓥ" + "ⱱ" + "ⱴ" + "ꝟ" + "v", "v", "Ꝡ", "VY", "⒱", "(v)", "ꝡ", "vy", "Ŵ" + "Ƿ" + "ᴡ" + "Ẁ" + "Ẃ" + "Ẅ" + "Ẇ" + "Ẉ" + "Ⓦ" + "Ⱳ" + "W", "W", "ŵ" + "ƿ" + "ʍ" + "ẁ" + "ẃ" + "ẅ" + "ẇ" + "ẉ" + "ẘ" + "ⓦ" + "ⱳ" + "w", "w", "⒲", "(w)", "Ẋ" + "Ẍ" + "Ⓧ" + "X", "X", "ᶍ" + "ẋ" + "ẍ" + "ₓ" + "ⓧ" + "x", "x", "⒳", "(x)", "Ý" + "Ŷ" + "Ÿ" + "Ƴ" + "Ȳ" + "Ɏ" + "ʏ" + "Ẏ" + "Ỳ" + "Ỵ" + "Ỷ" + "Ỹ" + "Ỿ" + "Ⓨ" + "Y", "Y", "ý" + "ÿ" + "ŷ" + "ƴ" + "ȳ" + "ɏ" + "ʎ" + "ẏ" + "ẙ" + "ỳ" + "ỵ" + "ỷ" + "ỹ" + "ỿ" + "ⓨ" + "y", "y", "⒴", "(y)", "Ź" + "Ż" + "Ž" + "Ƶ" + "Ȝ" + "Ȥ" + "ᴢ" + "Ẑ" + "Ẓ" + "Ẕ" + "Ⓩ" + "Ⱬ" + "Ꝣ" + "Z", "Z", "ź" + "ż" + "ž" + "ƶ" + "ȝ" + "ȥ" + "ɀ" + "ʐ" + "ʑ" + "ᵶ" + "ᶎ" + "ẑ" + "ẓ" + "ẕ" + "ⓩ" + "ⱬ" + "ꝣ" + "z", "z", "⒵", "(z)", "⁰" + "₀" + "⓪" + "⓿" + "0", "0", "¹" + "₁" + "①" + "⓵" + "❶" + "➀" + "➊" + "1", "1", "⒈", "1.", "⑴", "(1)", "²" + "₂" + "②" + "⓶" + "❷" + "➁" + "➋" + "2", "2", "⒉", "2.", "⑵", "(2)", "³" + "₃" + "③" + "⓷" + "❸" + "➂" + "➌" + "3", "3", "⒊", "3.", "⑶", "(3)", "⁴" + "₄" + "④" + "⓸" + "❹" + "➃" + "➍" + "4", "4", "⒋", "4.", "⑷", "(4)", "⁵" + "₅" + "⑤" + 
"⓹" + "❺" + "➄" + "➎" + "5", "5", "⒌", "5.", "⑸", "(5)", "⁶" + "₆" + "⑥" + "⓺" + "❻" + "➅" + "➏" + "6", "6", "⒍", "6.", "⑹", "(6)", "⁷" + "₇" + "⑦" + "⓻" + "❼" + "➆" + "➐" + "7", "7", "⒎", "7.", "⑺", "(7)", "⁸" + "₈" + "⑧" + "⓼" + "❽" + "➇" + "➑" + "8", "8", "⒏", "8.", "⑻", "(8)", "⁹" + "₉" + "⑨" + "⓽" + "❾" + "➈" + "➒" + "9", "9", "⒐", "9.", "⑼", "(9)", "⑩" + "⓾" + "❿" + "➉" + "➓", "10", "⒑", "10.", "⑽", "(10)", "⑪" + "⓫", "11", "⒒", "11.", "⑾", "(11)", "⑫" + "⓬", "12", "⒓", "12.", "⑿", "(12)", "⑬" + "⓭", "13", "⒔", "13.", "⒀", "(13)", "⑭" + "⓮", "14", "⒕", "14.", "⒁", "(14)", "⑮" + "⓯", "15", "⒖", "15.", "⒂", "(15)", "⑯" + "⓰", "16", "⒗", "16.", "⒃", "(16)", "⑰" + "⓱", "17", "⒘", "17.", "⒄", "(17)", "⑱" + "⓲", "18", "⒙", "18.", "⒅", "(18)", "⑲" + "⓳", "19", "⒚", "19.", "⒆", "(19)", "⑳" + "⓴", "20", "⒛", "20.", "⒇", "(20)", "«" + "»" + "“" + "”" + "„" + "″" + "‶" + "❝" + "❞" + "❮" + "❯" + """, "\"", "‘" + "’" + "‚" + "‛" + "′" + "‵" + "‹" + "›" + "❛" + "❜" + "'", "'", "‐" + "‑" + "‒" + "–" + "—" + "⁻" + "₋" + "-", "-", "⁅" + "❲" + "[", "[", "⁆" + "❳" + "]", "]", "⁽" + "₍" + "❨" + "❪" + "(", "(", "⸨", "((", "⁾" + "₎" + "❩" + "❫" + ")", ")", "⸩", "))", "❬" + "❰" + "<", "<", "❭" + "❱" + ">", ">", "❴" + "{", "{", "❵" + "}", "}", "⁺" + "₊" + "+", "+", "⁼" + "₌" + "=", "=", "!", "!", "‼", "!!", "⁉", "!?", "#", "#", "$", "$", "⁒" + "%", "%", "&", "&", "⁎" + "*", "*", ",", ",", ".", ".", "⁄" + "/", "/", ":", ":", "⁏" + ";", ";", "?", "?", "⁇", "??", "⁈", "?!", "@", "@", "\", "\\", "‸" + "^", "^", "_", "_", "⁓" + "~", "~" };

            // Construct input text and expected output tokens
            IList <string> expectedUnfoldedTokens = new List <string>();
            IList <string> expectedFoldedTokens   = new List <string>();
            StringBuilder  inputText = new StringBuilder();

            for (int n = 0; n < foldings.Length; n += 2)
            {
                if (n > 0)
                {
                    inputText.Append(' '); // Space between tokens
                }
                inputText.Append(foldings[n]);

                // Construct the expected output tokens: both the unfolded and folded string,
                // with the folded duplicated as many times as the number of characters in
                // the input text.
                StringBuilder expected = new StringBuilder();
                int           numChars = foldings[n].Length;
                for (int m = 0; m < numChars; ++m)
                {
                    expected.Append(foldings[n + 1]);
                }
                expectedUnfoldedTokens.Add(foldings[n]);
                expectedFoldedTokens.Add(expected.ToString());
            }

            TokenStream          stream       = new MockTokenizer(new StringReader(inputText.ToString()), MockTokenizer.WHITESPACE, false);
            ASCIIFoldingFilter   filter       = new ASCIIFoldingFilter(stream, Random.NextBoolean());
            ICharTermAttribute   termAtt      = filter.GetAttribute <ICharTermAttribute>();
            IEnumerator <string> unfoldedIter = expectedUnfoldedTokens.GetEnumerator();
            IEnumerator <string> foldedIter   = expectedFoldedTokens.GetEnumerator();

            filter.Reset();
            while (foldedIter.MoveNext())
            {
                unfoldedIter.MoveNext();
                assertNextTerms(unfoldedIter.Current, foldedIter.Current, filter, termAtt);
            }
            assertFalse(filter.IncrementToken());
        }
Example #23
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream    = new LowerCaseFilter(matchVersion, tokenizer);

            stream = new CyrllicToLatinFilter(stream);
            stream = new StopFilter(matchVersion, stream, StopFilter.MakeStopSet(matchVersion, STOP_WORDS));
            stream = new SnowballFilter(stream, new SimpleSerbianStemmer());
            stream = new ASCIIFoldingFilter(stream);

            return(new TokenStreamComponents(tokenizer, stream));
        }
Example #24
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);

            if (STOP_WORDS != null)
            {
                result = new StopFilter(false, result, STOP_WORDS);
            }
            result = new ASCIIFoldingFilter(result);
            result = new SnowballFilter(result, "English");

            return(result);
        }
Example #25
        public GitHubIndex(Directory indexDirectory, string githubApiKey)
        {
            github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
            {
                Credentials = new Credentials(githubApiKey)
            };

            analyzer = new PerFieldAnalyzerWrapper(
                // Example of a pre-built custom analyzer
                defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),

                // Example of inline anonymous analyzers
                fieldAnalyzers: new Dictionary <string, Analyzer>
            {
                // Field analyzer for owner
                {
                    "owner",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new KeywordTokenizer(reader);
                        TokenStream result = new ASCIIFoldingFilter(source);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                },
                // Field analyzer for name
                {
                    "name",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                        TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                        result             = new ASCIIFoldingFilter(result);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                }
            });

            queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
                                                    new[] { "name", "description", "readme" }, analyzer);


            indexWriter     = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
            searcherManager = new SearcherManager(indexWriter, true, null);
        }
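
A short sketch of how the parser and searcher built above might be used (the query text is made up; Acquire/Release is the standard SearcherManager pattern in Lucene.NET 4.8):

            // hypothetical search call, not part of the original constructor
            Query query = queryParser.Parse("lucene net");
            IndexSearcher searcher = searcherManager.Acquire();
            try
            {
                TopDocs hits = searcher.Search(query, 10);
                Console.WriteLine(hits.TotalHits + " matching repositories");
            }
            finally
            {
                searcherManager.Release(searcher);
            }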
Example #26
        public void TestInvalidOffset()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
#pragma warning disable 612, 618
                filters = new WordTokenFilter(filters);
#pragma warning restore 612, 618
                return(new TokenStreamComponents(tokenizer, filters));
            });

            AssertAnalyzesTo(analyzer, "mosfellsbær",
                             new string[] { "mosfellsbaer" },
                             new int[] { 0 },
                             new int[] { 11 });
        }
Example #27
        public MySearch(string indexPath)
        {
            //_analyzer = new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION);

            _analyzer = new MultiFieldAnalyzerWrapper(
                defaultAnalyzer: new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION, true),
                new[]
            {
                (
                    new[] { "genre", "year" },
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(MATCH_LUCENE_VERSION, result);
                    return(new TokenStreamComponents(source, result));
                })
                )
            });
Example #28
        private string FoldToASCII(string searchPhrase)
        {
            var sb = new StringBuilder();

            var cleanedPhrase = searchPhrase.Trim('\0');

            var asciiFilter = new ASCIIFoldingFilter(new WhitespaceTokenizer((TextReader) new StringReader(cleanedPhrase)));

            string space = string.Empty;

            while (asciiFilter.IncrementToken())
            {
                sb.AppendFormat("{0}{1}", space ?? "", asciiFilter.GetAttribute <ITermAttribute>().Term);
                if (string.IsNullOrEmpty(space))
                {
                    space = " ";
                }
            }
            return(sb.ToString());
        }
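
For example (output inferred from the Latin-1 foldings exercised in Example #33 at the end of this page), the helper above strips diacritics while preserving case and spacing:

        // hypothetical call; FoldToASCII is the private method defined above
        string folded = FoldToASCII("Crème Brûlée");
        // folded == "Creme Brulee"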
Example #29
        // the ordering of these filters is important!
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            if (string.Equals("MilitaryIDNumber", fieldName))
            {
                TokenStream result = new WhitespaceTokenizer(reader);
                result = new LowerCaseFilter(result);
                result = new ASCIIFoldingFilter(result);
                result = new AlphaNumericFilter(result);  // behaves weirdly when used on Name field

                // during indexing, we will encounter some of the following extraneous text we don't care about.
                string[] stopWords = new string[] { "", "formerly", "or", "former", "pir", "tbc", "id", "pnc" };
                return(new StopFilter(false, result, new CharArraySet(stopWords, true), true));
            }
            else
            {
                TokenStream result = new AlphaNumericTokenizer(reader);
                result = new LowerCaseFilter(result);
                return(new ASCIIFoldingFilter(result));
            }
        }
Example #30
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            if (STOP_WORDS != null)
            {
                result = new StopFilter(false, result, STOP_WORDS);
            }
            result = new ASCIIFoldingFilter(result);

            // we are using a distinct version of the Spanish stemmer, called Spanish2.
            // Please check that this class can be found in the Snowball library; the relative path
            // should be: Snowball\SF\Snowball\Ext\
            // Just in case, a copy of this class is kept in this project.
            result = new SnowballFilter(result, "Spanish");

            return(result);
        }
Example #31
        // The following Perl script generated the foldings[] array automatically
        // from ASCIIFoldingFilter.java:
        //
        //    ============== begin get.test.cases.pl ==============
        //
        //    use strict;
        //    use warnings;
        //
        //    my $file = "ASCIIFoldingFilter.java";
        //    my $output = "testcases.txt";
        //    my %codes = ();
        //    my $folded = '';
        //
        //    open IN, "<:utf8", $file || die "Error opening input file '$file': $!";
        //    open OUT, ">:utf8", $output || die "Error opening output file '$output': $!";
        //
        //    while (my $line = <IN>) {
        //      chomp($line);
        //      # case '\u0133': // <char> <maybe URL> [ description ]
        //      if ($line =~ /case\s+'\\u(....)':.*\[([^\]]+)\]/) {
        //        my $code = $1;
        //        my $desc = $2;
        //        $codes{$code} = $desc;
        //      }
        //      # output[outputPos++] = 'A';
        //      elsif ($line =~ /output\[outputPos\+\+\] = '(.+)';/) {
        //        my $output_char = $1;
        //        $folded .= $output_char;
        //      }
        //      elsif ($line =~ /break;/ && length($folded) > 0) {
        //        my $first = 1;
        //        for my $code (sort { hex($a) <=> hex($b) } keys %codes) {
        //          my $desc = $codes{$code};
        //          print OUT '      ';
        //          print OUT '+ ' if (not $first);
        //          $first = 0;
        //          print OUT '"', chr(hex($code)), qq!"  // U+$code: $desc\n!;
        //        }
        //        print OUT qq!      ,"$folded", // Folded result\n\n!;
        //        %codes = ();
        //        $folded = '';
        //      }
        //    }
        //    close OUT;
        //
        //    ============== end get.test.cases.pl ==============
        //
        public virtual void testAllFoldings()
        {
            // Alternating strings of:
            //   1. All non-ASCII characters to be folded, concatenated together as a
            //      single string.
            //   2. The string of ASCII characters to which each of the above
            //      characters should be folded.
            string[] foldings = new string[] {"À" + "Á" + "Â" + "Ã" + "Ä" + "Å" + "Ā" + "Ă" + "Ą" + "Ə" + "Ǎ" + "Ǟ" + "Ǡ" + "Ǻ" + "Ȁ" + "Ȃ" + "Ȧ" + "Ⱥ" + "ᴀ" + "Ḁ" + "Ạ" + "Ả" + "Ấ" + "Ầ" + "Ẩ" + "Ẫ" + "Ậ" + "Ắ" + "Ằ" + "Ẳ" + "Ẵ" + "Ặ" + "Ⓐ" + "A","A", "à" + "á" + "â" + "ã" + "ä" + "å" + "ā" + "ă" + "ą" + "ǎ" + "ǟ" + "ǡ" + "ǻ" + "ȁ" + "ȃ" + "ȧ" + "ɐ" + "ə" + "ɚ" + "ᶏ" + "ḁ" + "ᶕ" + "ẚ" + "ạ" + "ả" + "ấ" + "ầ" + "ẩ" + "ẫ" + "ậ" + "ắ" + "ằ" + "ẳ" + "ẵ" + "ặ" + "ₐ" + "ₔ" + "ⓐ" + "ⱥ" + "Ɐ" + "a","a", "Ꜳ","AA", "Æ" + "Ǣ" + "Ǽ" + "ᴁ","AE", "Ꜵ","AO", "Ꜷ","AU", "Ꜹ" + "Ꜻ","AV", "Ꜽ","AY", "⒜","(a)", "ꜳ","aa", "æ" + "ǣ" + "ǽ" + "ᴂ","ae", "ꜵ","ao", "ꜷ","au", "ꜹ" + "ꜻ","av", "ꜽ","ay", "Ɓ" + "Ƃ" + "Ƀ" + "ʙ" + "ᴃ" + "Ḃ" + "Ḅ" + "Ḇ" + "Ⓑ" + "B","B", "ƀ" + "ƃ" + "ɓ" + "ᵬ" + "ᶀ" + "ḃ" + "ḅ" + "ḇ" + "ⓑ" + "b","b", "⒝","(b)", "Ç" + "Ć" + "Ĉ" + "Ċ" + "Č" + "Ƈ" + "Ȼ" + "ʗ" + "ᴄ" + "Ḉ" + "Ⓒ" + "C","C", "ç" + "ć" + "ĉ" + "ċ" + "č" + "ƈ" + "ȼ" + "ɕ" + "ḉ" + "ↄ" + "ⓒ" + "Ꜿ" + "ꜿ" + "c","c", "⒞","(c)", "Ð" + "Ď" + "Đ" + "Ɖ" + "Ɗ" + "Ƌ" + "ᴅ" + "ᴆ" + "Ḋ" + "Ḍ" + "Ḏ" + "Ḑ" + "Ḓ" + "Ⓓ" + "Ꝺ" + "D","D", "ð" + "ď" + "đ" + "ƌ" + "ȡ" + "ɖ" + "ɗ" + "ᵭ" + "ᶁ" + "ᶑ" + "ḋ" + "ḍ" + "ḏ" + "ḑ" + "ḓ" + "ⓓ" + "ꝺ" + "d","d", "DŽ" + "DZ","DZ", "Dž" + "Dz","Dz", "⒟","(d)", "ȸ","db", "dž" + "dz" + "ʣ" + "ʥ","dz", "È" + "É" + "Ê" + "Ë" + "Ē" + "Ĕ" + "Ė" + "Ę" + "Ě" + "Ǝ" + "Ɛ" + "Ȅ" + "Ȇ" + "Ȩ" + "Ɇ" + "ᴇ" + "Ḕ" + "Ḗ" + "Ḙ" + "Ḛ" + "Ḝ" + "Ẹ" + "Ẻ" + "Ẽ" + "Ế" + "Ề" + "Ể" + "Ễ" + "Ệ" + "Ⓔ" + "ⱻ" + "E","E", "è" + "é" + "ê" + "ë" + "ē" + "ĕ" + "ė" + "ę" + "ě" + "ǝ" + "ȅ" + "ȇ" + "ȩ" + "ɇ" + "ɘ" + "ɛ" + "ɜ" + "ɝ" + "ɞ" + "ʚ" + "ᴈ" + "ᶒ" + "ᶓ" + "ᶔ" + "ḕ" + "ḗ" + "ḙ" + "ḛ" + "ḝ" + "ẹ" + "ẻ" + "ẽ" + "ế" + "ề" + "ể" + "ễ" + "ệ" + "ₑ" + "ⓔ" + "ⱸ" + "e","e", "⒠","(e)", "Ƒ" + "Ḟ" + "Ⓕ" + "ꜰ" + "Ꝼ" + "ꟻ" + "F","F", "ƒ" + "ᵮ" + "ᶂ" + "ḟ" + "ẛ" + "ⓕ" + "ꝼ" + "f","f", "⒡","(f)", "ff","ff", "ffi","ffi", "ffl","ffl", "fi","fi", "fl","fl", "Ĝ" + "Ğ" + "Ġ" + "Ģ" + "Ɠ" + "Ǥ" + "ǥ" + "Ǧ" + "ǧ" + "Ǵ" + "ɢ" + "ʛ" + "Ḡ" + "Ⓖ" + "Ᵹ" + "Ꝿ" + "G","G", "ĝ" + "ğ" + "ġ" + "ģ" + "ǵ" + "ɠ" + "ɡ" + "ᵷ" + "ᵹ" + "ᶃ" + "ḡ" + "ⓖ" + "ꝿ" + "g","g", "⒢","(g)", "Ĥ" + "Ħ" + "Ȟ" + "ʜ" + "Ḣ" + "Ḥ" + "Ḧ" + "Ḩ" + "Ḫ" + "Ⓗ" + "Ⱨ" + "Ⱶ" + "H","H", "ĥ" + "ħ" + "ȟ" + "ɥ" + "ɦ" + "ʮ" + "ʯ" + "ḣ" + "ḥ" + "ḧ" + "ḩ" + "ḫ" + "ẖ" + "ⓗ" + "ⱨ" + "ⱶ" + "h","h", "Ƕ","HV", "⒣","(h)", "ƕ","hv", "Ì" + "Í" + "Î" + "Ï" + "Ĩ" + "Ī" + "Ĭ" + "Į" + "İ" + "Ɩ" + "Ɨ" + "Ǐ" + "Ȉ" + "Ȋ" + "ɪ" + "ᵻ" + "Ḭ" + "Ḯ" + "Ỉ" + "Ị" + "Ⓘ" + "ꟾ" + "I","I", "ì" + "í" + "î" + "ï" + "ĩ" + "ī" + "ĭ" + "į" + "ı" + "ǐ" + "ȉ" + "ȋ" + "ɨ" + "ᴉ" + "ᵢ" + "ᵼ" + "ᶖ" + "ḭ" + "ḯ" + "ỉ" + "ị" + "ⁱ" + "ⓘ" + "i","i", "IJ","IJ", "⒤","(i)", "ij","ij", "Ĵ" + "Ɉ" + "ᴊ" + "Ⓙ" + "J","J", "ĵ" + "ǰ" + "ȷ" + "ɉ" + "ɟ" + "ʄ" + "ʝ" + "ⓙ" + "ⱼ" + "j","j", "⒥","(j)", "Ķ" + "Ƙ" + "Ǩ" + "ᴋ" + "Ḱ" + "Ḳ" + "Ḵ" + "Ⓚ" + "Ⱪ" + "Ꝁ" + "Ꝃ" + "Ꝅ" + "K","K", "ķ" + "ƙ" + "ǩ" + "ʞ" + "ᶄ" + "ḱ" + "ḳ" + "ḵ" + "ⓚ" + "ⱪ" + "ꝁ" + "ꝃ" + "ꝅ" + "k","k", "⒦","(k)", "Ĺ" + "Ļ" + "Ľ" + "Ŀ" + "Ł" + "Ƚ" + "ʟ" + "ᴌ" + "Ḷ" + "Ḹ" + "Ḻ" + "Ḽ" + "Ⓛ" + "Ⱡ" + "Ɫ" + "Ꝇ" + "Ꝉ" + "Ꞁ" + "L","L", "ĺ" + "ļ" + "ľ" + "ŀ" + "ł" + "ƚ" + "ȴ" + "ɫ" + "ɬ" + "ɭ" + "ᶅ" + "ḷ" + "ḹ" + "ḻ" + "ḽ" + "ⓛ" + "ⱡ" + "ꝇ" + "ꝉ" + "ꞁ" + "l","l", "LJ","LJ", "Ỻ","LL", "Lj","Lj", "⒧","(l)", "lj","lj", "ỻ","ll", "ʪ","ls", "ʫ","lz", "Ɯ" + "ᴍ" + "Ḿ" + "Ṁ" + "Ṃ" + "Ⓜ" + "Ɱ" + "ꟽ" + "ꟿ" + "M","M", "ɯ" + "ɰ" + "ɱ" + "ᵯ" + "ᶆ" + "ḿ" + "ṁ" + "ṃ" + "ⓜ" + "m","m", "⒨","(m)", "Ñ" + "Ń" + "Ņ" + "Ň" + "Ŋ" + "Ɲ" + "Ǹ" + "Ƞ" + "ɴ" + "ᴎ" + "Ṅ" + "Ṇ" + "Ṉ" + "Ṋ" + "Ⓝ" + "N","N", "ñ" + "ń" + "ņ" + "ň" + "ʼn" + "ŋ" + "ƞ" + "ǹ" + "ȵ" 
+ "ɲ" + "ɳ" + "ᵰ" + "ᶇ" + "ṅ" + "ṇ" + "ṉ" + "ṋ" + "ⁿ" + "ⓝ" + "n","n", "NJ","NJ", "Nj","Nj", "⒩","(n)", "nj","nj", "Ò" + "Ó" + "Ô" + "Õ" + "Ö" + "Ø" + "Ō" + "Ŏ" + "Ő" + "Ɔ" + "Ɵ" + "Ơ" + "Ǒ" + "Ǫ" + "Ǭ" + "Ǿ" + "Ȍ" + "Ȏ" + "Ȫ" + "Ȭ" + "Ȯ" + "Ȱ" + "ᴏ" + "ᴐ" + "Ṍ" + "Ṏ" + "Ṑ" + "Ṓ" + "Ọ" + "Ỏ" + "Ố" + "Ồ" + "Ổ" + "Ỗ" + "Ộ" + "Ớ" + "Ờ" + "Ở" + "Ỡ" + "Ợ" + "Ⓞ" + "Ꝋ" + "Ꝍ" + "O","O", "ò" + "ó" + "ô" + "õ" + "ö" + "ø" + "ō" + "ŏ" + "ő" + "ơ" + "ǒ" + "ǫ" + "ǭ" + "ǿ" + "ȍ" + "ȏ" + "ȫ" + "ȭ" + "ȯ" + "ȱ" + "ɔ" + "ɵ" + "ᴖ" + "ᴗ" + "ᶗ" + "ṍ" + "ṏ" + "ṑ" + "ṓ" + "ọ" + "ỏ" + "ố" + "ồ" + "ổ" + "ỗ" + "ộ" + "ớ" + "ờ" + "ở" + "ỡ" + "ợ" + "ₒ" + "ⓞ" + "ⱺ" + "ꝋ" + "ꝍ" + "o","o", "Œ" + "ɶ","OE", "Ꝏ","OO", "Ȣ" + "ᴕ","OU", "⒪","(o)", "œ" + "ᴔ","oe", "ꝏ","oo", "ȣ","ou", "Ƥ" + "ᴘ" + "Ṕ" + "Ṗ" + "Ⓟ" + "Ᵽ" + "Ꝑ" + "Ꝓ" + "Ꝕ" + "P","P", "ƥ" + "ᵱ" + "ᵽ" + "ᶈ" + "ṕ" + "ṗ" + "ⓟ" + "ꝑ" + "ꝓ" + "ꝕ" + "ꟼ" + "p","p", "⒫","(p)", "Ɋ" + "Ⓠ" + "Ꝗ" + "Ꝙ" + "Q","Q", "ĸ" + "ɋ" + "ʠ" + "ⓠ" + "ꝗ" + "ꝙ" + "q","q", "⒬","(q)", "ȹ","qp", "Ŕ" + "Ŗ" + "Ř" + "Ȑ" + "Ȓ" + "Ɍ" + "ʀ" + "ʁ" + "ᴙ" + "ᴚ" + "Ṙ" + "Ṛ" + "Ṝ" + "Ṟ" + "Ⓡ" + "Ɽ" + "Ꝛ" + "Ꞃ" + "R","R", "ŕ" + "ŗ" + "ř" + "ȑ" + "ȓ" + "ɍ" + "ɼ" + "ɽ" + "ɾ" + "ɿ" + "ᵣ" + "ᵲ" + "ᵳ" + "ᶉ" + "ṙ" + "ṛ" + "ṝ" + "ṟ" + "ⓡ" + "ꝛ" + "ꞃ" + "r","r", "⒭","(r)", "Ś" + "Ŝ" + "Ş" + "Š" + "Ș" + "Ṡ" + "Ṣ" + "Ṥ" + "Ṧ" + "Ṩ" + "Ⓢ" + "ꜱ" + "ꞅ" + "S","S", "ś" + "ŝ" + "ş" + "š" + "ſ" + "ș" + "ȿ" + "ʂ" + "ᵴ" + "ᶊ" + "ṡ" + "ṣ" + "ṥ" + "ṧ" + "ṩ" + "ẜ" + "ẝ" + "ⓢ" + "Ꞅ" + "s","s", "ẞ","SS", "⒮","(s)", "ß","ss", "st","st", "Ţ" + "Ť" + "Ŧ" + "Ƭ" + "Ʈ" + "Ț" + "Ⱦ" + "ᴛ" + "Ṫ" + "Ṭ" + "Ṯ" + "Ṱ" + "Ⓣ" + "Ꞇ" + "T","T", "ţ" + "ť" + "ŧ" + "ƫ" + "ƭ" + "ț" + "ȶ" + "ʇ" + "ʈ" + "ᵵ" + "ṫ" + "ṭ" + "ṯ" + "ṱ" + "ẗ" + "ⓣ" + "ⱦ" + "t","t", "Þ" + "Ꝧ","TH", "Ꜩ","TZ", "⒯","(t)", "ʨ","tc", "þ" + "ᵺ" + "ꝧ","th", "ʦ","ts", "ꜩ","tz", "Ù" + "Ú" + "Û" + "Ü" + "Ũ" + "Ū" + "Ŭ" + "Ů" + "Ű" + "Ų" + "Ư" + "Ǔ" + "Ǖ" + "Ǘ" + "Ǚ" + "Ǜ" + "Ȕ" + "Ȗ" + "Ʉ" + "ᴜ" + "ᵾ" + "Ṳ" + "Ṵ" + "Ṷ" + "Ṹ" + "Ṻ" + "Ụ" + "Ủ" + "Ứ" + "Ừ" + "Ử" + "Ữ" + "Ự" + "Ⓤ" + "U","U", "ù" + "ú" + "û" + "ü" + "ũ" + "ū" + "ŭ" + "ů" + "ű" + "ų" + "ư" + "ǔ" + "ǖ" + "ǘ" + "ǚ" + "ǜ" + "ȕ" + "ȗ" + "ʉ" + "ᵤ" + "ᶙ" + "ṳ" + "ṵ" + "ṷ" + "ṹ" + "ṻ" + "ụ" + "ủ" + "ứ" + "ừ" + "ử" + "ữ" + "ự" + "ⓤ" + "u","u", "⒰","(u)", "ᵫ","ue", "Ʋ" + "Ʌ" + "ᴠ" + "Ṽ" + "Ṿ" + "Ỽ" + "Ⓥ" + "Ꝟ" + "Ꝩ" + "V","V", "ʋ" + "ʌ" + "ᵥ" + "ᶌ" + "ṽ" + "ṿ" + "ⓥ" + "ⱱ" + "ⱴ" + "ꝟ" + "v","v", "Ꝡ","VY", "⒱","(v)", "ꝡ","vy", "Ŵ" + "Ƿ" + "ᴡ" + "Ẁ" + "Ẃ" + "Ẅ" + "Ẇ" + "Ẉ" + "Ⓦ" + "Ⱳ" + "W","W", "ŵ" + "ƿ" + "ʍ" + "ẁ" + "ẃ" + "ẅ" + "ẇ" + "ẉ" + "ẘ" + "ⓦ" + "ⱳ" + "w","w", "⒲","(w)", "Ẋ" + "Ẍ" + "Ⓧ" + "X","X", "ᶍ" + "ẋ" + "ẍ" + "ₓ" + "ⓧ" + "x","x", "⒳","(x)", "Ý" + "Ŷ" + "Ÿ" + "Ƴ" + "Ȳ" + "Ɏ" + "ʏ" + "Ẏ" + "Ỳ" + "Ỵ" + "Ỷ" + "Ỹ" + "Ỿ" + "Ⓨ" + "Y","Y", "ý" + "ÿ" + "ŷ" + "ƴ" + "ȳ" + "ɏ" + "ʎ" + "ẏ" + "ẙ" + "ỳ" + "ỵ" + "ỷ" + "ỹ" + "ỿ" + "ⓨ" + "y","y", "⒴","(y)", "Ź" + "Ż" + "Ž" + "Ƶ" + "Ȝ" + "Ȥ" + "ᴢ" + "Ẑ" + "Ẓ" + "Ẕ" + "Ⓩ" + "Ⱬ" + "Ꝣ" + "Z","Z", "ź" + "ż" + "ž" + "ƶ" + "ȝ" + "ȥ" + "ɀ" + "ʐ" + "ʑ" + "ᵶ" + "ᶎ" + "ẑ" + "ẓ" + "ẕ" + "ⓩ" + "ⱬ" + "ꝣ" + "z","z", "⒵","(z)", "⁰" + "₀" + "⓪" + "⓿" + "0","0", "¹" + "₁" + "①" + "⓵" + "❶" + "➀" + "➊" + "1","1", "⒈","1.", "⑴","(1)", "²" + "₂" + "②" + "⓶" + "❷" + "➁" + "➋" + "2","2", "⒉","2.", "⑵","(2)", "³" + "₃" + "③" + "⓷" + "❸" + "➂" + "➌" + "3","3", "⒊","3.", "⑶","(3)", "⁴" + "₄" + "④" + "⓸" + "❹" + "➃" + "➍" + "4","4", "⒋","4.", "⑷","(4)", "⁵" + "₅" + "⑤" + "⓹" + "❺" + "➄" + "➎" + "5","5", "⒌","5.", "⑸","(5)", "⁶" + "₆" + "⑥" + "⓺" + "❻" + "➅" + "➏" + "6","6", "⒍","6.", "⑹","(6)", "⁷" + "₇" + "⑦" + "⓻" + 
"❼" + "➆" + "➐" + "7","7", "⒎","7.", "⑺","(7)", "⁸" + "₈" + "⑧" + "⓼" + "❽" + "➇" + "➑" + "8","8", "⒏","8.", "⑻","(8)", "⁹" + "₉" + "⑨" + "⓽" + "❾" + "➈" + "➒" + "9","9", "⒐","9.", "⑼","(9)", "⑩" + "⓾" + "❿" + "➉" + "➓","10", "⒑","10.", "⑽","(10)", "⑪" + "⓫","11", "⒒","11.", "⑾","(11)", "⑫" + "⓬","12", "⒓","12.", "⑿","(12)", "⑬" + "⓭","13", "⒔","13.", "⒀","(13)", "⑭" + "⓮","14", "⒕","14.", "⒁","(14)", "⑮" + "⓯","15", "⒖","15.", "⒂","(15)", "⑯" + "⓰","16", "⒗","16.", "⒃","(16)", "⑰" + "⓱","17", "⒘","17.", "⒄","(17)", "⑱" + "⓲","18", "⒙","18.", "⒅","(18)", "⑲" + "⓳","19", "⒚","19.", "⒆","(19)", "⑳" + "⓴","20", "⒛","20.", "⒇","(20)", "«" + "»" + "“" + "”" + "„" + "″" + "‶" + "❝" + "❞" + "❮" + "❯" + ""","\"", "‘" + "’" + "‚" + "‛" + "′" + "‵" + "‹" + "›" + "❛" + "❜" + "'","'", "‐" + "‑" + "‒" + "–" + "—" + "⁻" + "₋" + "-","-", "⁅" + "❲" + "[","[", "⁆" + "❳" + "]","]", "⁽" + "₍" + "❨" + "❪" + "(","(", "⸨","((", "⁾" + "₎" + "❩" + "❫" + ")",")", "⸩","))", "❬" + "❰" + "<","<", "❭" + "❱" + ">",">", "❴" + "{","{", "❵" + "}","}", "⁺" + "₊" + "+","+", "⁼" + "₌" + "=","=", "!","!", "‼","!!", "⁉","!?", "#","#", "$","$", "⁒" + "%","%", "&","&", "⁎" + "*","*", ",",",", ".",".", "⁄" + "/","/", ":",":", "⁏" + ";",";", "?","?", "⁇","??", "⁈","?!", "@","@", "\","\\", "‸" + "^","^", "_","_", "⁓" + "~","~"};

            // Construct input text and expected output tokens
            IList<string> expectedUnfoldedTokens = new List<string>();
            IList<string> expectedFoldedTokens = new List<string>();
            StringBuilder inputText = new StringBuilder();
            for (int n = 0; n < foldings.Length; n += 2)
            {
                if (n > 0)
                {
                    inputText.Append(' '); // Space between tokens
                }
                inputText.Append(foldings[n]);

                // Construct the expected output tokens: both the unfolded and folded string,
                // with the folded duplicated as many times as the number of characters in
                // the input text.
                StringBuilder expected = new StringBuilder();
                int numChars = foldings[n].Length;
                for (int m = 0; m < numChars; ++m)
                {
                    expected.Append(foldings[n + 1]);
                }
                expectedUnfoldedTokens.Add(foldings[n]);
                expectedFoldedTokens.Add(expected.ToString());
            }

            TokenStream stream = new MockTokenizer(new StringReader(inputText.ToString()), MockTokenizer.WHITESPACE, false);
            ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, Random.NextBoolean());
            ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
            IEnumerator<string> unfoldedIter = expectedUnfoldedTokens.GetEnumerator();
            IEnumerator<string> foldedIter = expectedFoldedTokens.GetEnumerator();
            filter.Reset();
            while (foldedIter.MoveNext())
            {
                unfoldedIter.MoveNext();
                assertNextTerms(unfoldedIter.Current, foldedIter.Current, filter, termAtt);
            }
            assertFalse(filter.IncrementToken());
        }
Example #32
 /// <summary>
 /// Pop one input token's worth of tokens off the filter and verify that they are as expected.
 /// </summary>
 internal virtual void assertNextTerms(string expectedUnfolded, string expectedFolded, ASCIIFoldingFilter filter, ICharTermAttribute termAtt)
 {
     assertTrue(filter.IncrementToken());
     assertEquals(expectedFolded, termAtt.ToString());
     if (filter.PreserveOriginal && !expectedUnfolded.Equals(expectedFolded))
     {
         assertTrue(filter.IncrementToken());
         assertEquals(expectedUnfolded, termAtt.ToString());
     }
 }
Example #33
        // testLatin1Accents() is a copy of TestLatin1AccentFilter.testU().
        public virtual void testLatin1Accents()
        {
            TokenStream stream = new MockTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ" + " Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij" + " ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"), MockTokenizer.WHITESPACE, false);
            ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, Random.NextBoolean());

            ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
            filter.Reset();
            assertNextTerms("Des", "Des", filter, termAtt);
            assertNextTerms("mot", "mot", filter, termAtt);
            assertNextTerms("clés", "cles", filter, termAtt);
            assertNextTerms("À", "A", filter, termAtt);
            assertNextTerms("LA", "LA", filter, termAtt);
            assertNextTerms("CHAÎNE", "CHAINE", filter, termAtt);
            assertNextTerms("À", "A", filter, termAtt);
            assertNextTerms("Á", "A", filter, termAtt);
            assertNextTerms("Â", "A", filter, termAtt);
            assertNextTerms("Ã", "A", filter, termAtt);
            assertNextTerms("Ä", "A", filter, termAtt);
            assertNextTerms("Å", "A", filter, termAtt);
            assertNextTerms("Æ", "AE", filter, termAtt);
            assertNextTerms("Ç", "C", filter, termAtt);
            assertNextTerms("È", "E", filter, termAtt);
            assertNextTerms("É", "E", filter, termAtt);
            assertNextTerms("Ê", "E", filter, termAtt);
            assertNextTerms("Ë", "E", filter, termAtt);
            assertNextTerms("Ì", "I", filter, termAtt);
            assertNextTerms("Í", "I", filter, termAtt);
            assertNextTerms("Î", "I", filter, termAtt);
            assertNextTerms("Ï", "I", filter, termAtt);
            assertNextTerms("IJ", "IJ", filter, termAtt);
            assertNextTerms("Ð", "D", filter, termAtt);
            assertNextTerms("Ñ", "N", filter, termAtt);
            assertNextTerms("Ò", "O", filter, termAtt);
            assertNextTerms("Ó", "O", filter, termAtt);
            assertNextTerms("Ô", "O", filter, termAtt);
            assertNextTerms("Õ", "O", filter, termAtt);
            assertNextTerms("Ö", "O", filter, termAtt);
            assertNextTerms("Ø", "O", filter, termAtt);
            assertNextTerms("Œ", "OE", filter, termAtt);
            assertNextTerms("Þ", "TH", filter, termAtt);
            assertNextTerms("Ù", "U", filter, termAtt);
            assertNextTerms("Ú", "U", filter, termAtt);
            assertNextTerms("Û", "U", filter, termAtt);
            assertNextTerms("Ü", "U", filter, termAtt);
            assertNextTerms("Ý", "Y", filter, termAtt);
            assertNextTerms("Ÿ", "Y", filter, termAtt);
            assertNextTerms("à", "a", filter, termAtt);
            assertNextTerms("á", "a", filter, termAtt);
            assertNextTerms("â", "a", filter, termAtt);
            assertNextTerms("ã", "a", filter, termAtt);
            assertNextTerms("ä", "a", filter, termAtt);
            assertNextTerms("å", "a", filter, termAtt);
            assertNextTerms("æ", "ae", filter, termAtt);
            assertNextTerms("ç", "c", filter, termAtt);
            assertNextTerms("è", "e", filter, termAtt);
            assertNextTerms("é", "e", filter, termAtt);
            assertNextTerms("ê", "e", filter, termAtt);
            assertNextTerms("ë", "e", filter, termAtt);
            assertNextTerms("ì", "i", filter, termAtt);
            assertNextTerms("í", "i", filter, termAtt);
            assertNextTerms("î", "i", filter, termAtt);
            assertNextTerms("ï", "i", filter, termAtt);
            assertNextTerms("ij", "ij", filter, termAtt);
            assertNextTerms("ð", "d", filter, termAtt);
            assertNextTerms("ñ", "n", filter, termAtt);
            assertNextTerms("ò", "o", filter, termAtt);
            assertNextTerms("ó", "o", filter, termAtt);
            assertNextTerms("ô", "o", filter, termAtt);
            assertNextTerms("õ", "o", filter, termAtt);
            assertNextTerms("ö", "o", filter, termAtt);
            assertNextTerms("ø", "o", filter, termAtt);
            assertNextTerms("œ", "oe", filter, termAtt);
            assertNextTerms("ß", "ss", filter, termAtt);
            assertNextTerms("þ", "th", filter, termAtt);
            assertNextTerms("ù", "u", filter, termAtt);
            assertNextTerms("ú", "u", filter, termAtt);
            assertNextTerms("û", "u", filter, termAtt);
            assertNextTerms("ü", "u", filter, termAtt);
            assertNextTerms("ý", "y", filter, termAtt);
            assertNextTerms("ÿ", "y", filter, termAtt);
            assertNextTerms("fi", "fi", filter, termAtt);
            assertNextTerms("fl", "fl", filter, termAtt);
            assertFalse(filter.IncrementToken());
        }