Example #1
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream filter = new StandardFilter(new LowerCaseTokenizer(reader));

            filter = new PorterStemFilter(filter);

            return(filter);
        }
Example #2
        public virtual void TestNoOverrides()
        {
            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
            Tokenizer   tokenizer = new KeywordTokenizer(new StringReader("book"));
            TokenStream stream    = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));

            AssertTokenStreamContents(stream, new string[] { "book" });
        }
Example #3
        /// <summary>
        /// Override of the token stream method, uses these filters in order:
        ///
        /// Whitespace splitter
        /// ASCII folding (i.e. é becomes e)
        /// Lowercase
        /// Stopwords removed
        /// Porter stemming (reduces words to common stem)
        /// </summary>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         tokenizer = new WhitespaceTokenizer(LeoLuceneVersion.Version, reader);
            TokenStream filter    = new ASCIIFoldingFilter(tokenizer);

            filter = new LowerCaseFilter(LeoLuceneVersion.Version, filter);
            filter = new StopFilter(LeoLuceneVersion.Version, filter, _words);
            filter = new PorterStemFilter(filter);
            return(new TokenStreamComponents(tokenizer, filter));
        }
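The chain documented above is easy to exercise without defining a full analyzer class. Below is a minimal sketch (not part of the example): it rebuilds the same chain with Analyzer.NewAnonymous, substituting LuceneVersion.LUCENE_48 for LeoLuceneVersion.Version and EnglishAnalyzer.DefaultStopSet for the private _words set as assumptions, then prints the tokens produced for a sample string.

// Minimal sketch, Lucene.Net 4.8: same whitespace -> fold -> lowercase -> stop -> Porter chain,
// with LuceneVersion.LUCENE_48 and EnglishAnalyzer.DefaultStopSet assumed for illustration.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.En;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static class StemmingAnalyzerDemo
{
    public static void Main()
    {
        LuceneVersion version = LuceneVersion.LUCENE_48;

        Analyzer analyzer = Analyzer.NewAnonymous((fieldName, reader) =>
        {
            var tokenizer = new WhitespaceTokenizer(version, reader);
            TokenStream filter = new ASCIIFoldingFilter(tokenizer);
            filter = new LowerCaseFilter(version, filter);
            filter = new StopFilter(version, filter, EnglishAnalyzer.DefaultStopSet);
            filter = new PorterStemFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        });

        TokenStream stream = analyzer.GetTokenStream("body", new StringReader("The Cafés stemmed quickly"));
        var term = stream.AddAttribute<ICharTermAttribute>();

        stream.Reset();
        while (stream.IncrementToken())
        {
            Console.WriteLine(term.ToString()); // "cafe", "stem", "quickli"
        }
        stream.End();
        stream.Dispose();
    }
}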
Example #4
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer lowerCaseTokenizer = new LowerCaseTokenizer(LuceneVersion.LUCENE_48, reader);

            PorterStemFilter porterStemFilter = new PorterStemFilter(lowerCaseTokenizer);

            StopFilter stopFilter = new StopFilter(LuceneVersion.LUCENE_48, porterStemFilter, EnglishAnalyzer.DefaultStopSet);

            return(new TokenStreamComponents(lowerCaseTokenizer, stopFilter));
        }
Example #5
        public virtual void TestIgnoreCase()
        {
            // let's make booked stem to books
            // the override filter will convert "booked" to "books",
            // but also mark it with KeywordAttribute so Porter will not change it.
            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
            builder.Add("boOkEd", "books");
            Tokenizer   tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
            TokenStream stream    = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));

            AssertTokenStreamContents(stream, new string[] { "books" });
        }
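For readers without the Lucene test helpers, here is a minimal stand-alone sketch of the behaviour described in the comments above: the override filter rewrites the token and marks it with KeywordAttribute, so the downstream PorterStemFilter leaves it untouched. The demo class name is an assumption; the APIs are Lucene.Net 4.8.

// Minimal sketch: run the override + Porter chain by hand and show that the
// rewritten token is keyword-protected. Class name is hypothetical.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.En;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;

static class StemmerOverrideDemo
{
    public static void Main()
    {
        var builder = new StemmerOverrideFilter.Builder(true); // ignoreCase = true
        builder.Add("boOkEd", "books");

        Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
        TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));

        var term = stream.AddAttribute<ICharTermAttribute>();
        var keyword = stream.AddAttribute<IKeywordAttribute>();

        stream.Reset();
        while (stream.IncrementToken())
        {
            // Prints "books True": the override applied and Porter left the token alone.
            Console.WriteLine($"{term.ToString()} {keyword.IsKeyword}");
        }
        stream.End();
        stream.Dispose();
    }
}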
Example #6
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

            result = new EnglishPossessiveFilter(m_matchVersion, result);
            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, m_stopwords);
            result = new PorterStemFilter(result);
            return(new TokenStreamComponents(source, result));
        }
Example #7
        // For text processing
        public string AnalizarConsulta(string consulta, string tipoAnalizador)
        {
            ArrayList   ListStemsList = new ArrayList();
            TokenStream tokenStream   = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(consulta));

            tokenStream = new StandardFilter(tokenStream);  // removes punctuation marks
            tokenStream = new LowerCaseFilter(tokenStream); // converts the content to lowercase
            if (tipoAnalizador == "Español")
            {
                // filters the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // converts characters above 127 in the ASCII table
                tokenStream = new ASCIIFoldingFilter(tokenStream);
                // stemming (lemmatization) of the words
                tokenStream = SpanishSteammer(tokenStream);
            }
            else if (tipoAnalizador == "Ingles")
            {
                // filters the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                // stemming (lemmatization) of the words
                tokenStream = new PorterStemFilter(tokenStream);
            }
            else // if no language is specified, only remove stop words in both languages
            {
                // filters the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // filters the content against the stop-word list
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            }

            string term     = string.Empty;
            var    termAttr = tokenStream.GetAttribute <ITermAttribute>();

            int i = 0;

            while (tokenStream.IncrementToken())
            {
                if (i == 0)
                {
                    term = termAttr.Term;
                    i++;
                }
                else
                {
                    term = term + "," + termAttr.Term;
                }
            }

            return((string.IsNullOrEmpty(term)) ? string.Empty : term.Trim());
        }
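A hypothetical call site for AnalizarConsulta (the enclosing class is not shown in the example, so TextProcessor below is an assumption): the method returns the surviving terms joined with commas.

// Hypothetical usage sketch; TextProcessor is an assumed class name hosting AnalizarConsulta.
var processor = new TextProcessor();

// English pipeline: stop words removed, Porter stemming applied.
string englishTerms = processor.AnalizarConsulta("the interesting books", "Ingles");
Console.WriteLine(englishTerms); // "interest,book"

// Spanish pipeline: stop words removed, accents folded, Spanish stemming applied.
string spanishTerms = processor.AnalizarConsulta("los libros interesantes", "Español");
Console.WriteLine(spanishTerms); // stemmed terms joined with commas (exact form depends on SpanishSteammer)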
Example #8
        public void tokenize()
        {
            TokenStream tokenStream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, new StringReader(this.input));

            tokenStream = new StopFilter(false, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
            tokenStream = new PorterStemFilter(tokenStream);

            var termAttr = tokenStream.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();

            while (tokenStream.IncrementToken())
            {
                tokens.Add(termAttr.Term);
            }
        }
Example #9
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new SentenceTokenizer(reader);

            result = new WordTokenizer(result, wordSegment);
            // result = new LowerCaseFilter(result);
            // LowerCaseFilter is no longer needed, because SegTokenFilter already lowercases all English characters
            // the stemming is rather strict here, but this is not a bug, it's a feature :)
            result = new PorterStemFilter(result);
            if (stopWords != null)
            {
                result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
            }
            return(result);
        }
Example #10
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            TokenStreamComponents tokenStreamComponents = null;
            Tokenizer             tokenizer             = new StandardTokenizer(matchVersion, reader);
            TokenStream           stream = new StandardFilter(matchVersion, tokenizer);

            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            stream = new PorterStemFilter(stream);
            stream = new SnowballFilter(stream, new EnglishStemmer());

            tokenStreamComponents = new TokenStreamComponents(tokenizer, stream);

            return(tokenStreamComponents);
        }
Example #11
        public virtual void TestRandomRealisticWhiteSpace()
        {
            IDictionary <string, string> map = new Dictionary <string, string>();
            int numTerms = AtLeast(50);

            for (int i = 0; i < numTerms; i++)
            {
                string        randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random);
                char[]        charArray = randomRealisticUnicodeString.ToCharArray();
                StringBuilder sb        = new StringBuilder();
                for (int j = 0; j < charArray.Length;)
                {
                    int cp = Character.CodePointAt(charArray, j, charArray.Length);
                    if (!char.IsWhiteSpace((char)cp))
                    {
                        sb.AppendCodePoint(cp);
                    }
                    j += Character.CharCount(cp);
                }
                if (sb.Length > 0)
                {
                    string value = TestUtil.RandomSimpleString(Random);
                    map[sb.ToString()] = value.Length == 0 ? "a" : value;
                }
            }
            if (map.Count == 0)
            {
                map["booked"] = "books";
            }
            StemmerOverrideFilter.Builder builder  = new StemmerOverrideFilter.Builder(Random.nextBoolean());
            IDictionary <string, string>  entrySet = map;
            StringBuilder  input  = new StringBuilder();
            IList <string> output = new List <string>();

            foreach (KeyValuePair <string, string> entry in entrySet)
            {
                builder.Add(entry.Key, entry.Value);
                if (Random.nextBoolean() || output.Count == 0)
                {
                    input.Append(entry.Key).Append(" ");
                    output.Add(entry.Value);
                }
            }
            Tokenizer   tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
            TokenStream stream    = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));

            AssertTokenStreamContents(stream, output.ToArray());
        }
Example #12
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = null;

            if (useStopWords)
            {
                ISet <string> stopWords = StopFilter.MakeStopSet(new string[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", // EN
                                                                                "van", "aan", "dat", "de", "den", "der", "des", "deze", "die", "dit", "door", "een", "het", "ik", "is", "je", "na",                                                                                                                  // NL
                                                                                "au", "aux", "la", "le", "les" });                                                                                                                                                                                                   // FR

                result = new PorterStemFilter(new StopFilter(false, new ASCIIFoldingFilter(new LowerCaseFilter(new CDRWhitespaceTokenizer(reader))), stopWords));
            }
            else
            {
                result = new PorterStemFilter(new ASCIIFoldingFilter(new LowerCaseFilter(new CDRWhitespaceTokenizer(reader))));
            }

            return(result);
        }
Example #13
        override public TokenStream TokenStream(string fieldName, TextReader reader)
        {
            StandardTokenizer tokenStream = new StandardTokenizer(VERSION, reader);

            tokenStream.MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
            TokenStream result = new StandardFilter(tokenStream);

            result = new LowerCaseFilter(result);

            if (SettingsViewModel.Instance.StopWords == true)
            {
                result = new StopFilter(enableSPI, result, StopSet);
            }

            if (SettingsViewModel.Instance.Stemming == true)
            {
                result = new PorterStemFilter(result);
            }

            return(result);
        }
Example #14
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

            // for stripping 's from words
            result = new EnglishPossessiveFilter(m_matchVersion, result);
            // converts é to e (and © to (c), etc.)
            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, EnglishAnalyzer.DefaultStopSet);
            // for chopping off common word suffixes, like removing ming from stemming, etc.
            result = new PorterStemFilter(result);

            // The ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters,
            // then it emits N-grams of each word of the specified length.
            if (_userNGram)
            {
                result = new EdgeNGramTokenFilter(m_matchVersion, result, _ngramMin, _ngramMax);
            }

            return(new TokenStreamComponents(source, result));
        }
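The EdgeNGramTokenFilter stage described in the comment can be tried in isolation. The sketch below stems one token and emits its leading n-grams, using min/max gram values of 2 and 4 as illustrative assumptions (the real values come from _ngramMin/_ngramMax).

// Minimal sketch, Lucene.Net 4.8: Porter-stem a single token, then emit its leading n-grams.
// The 2/4 gram bounds are illustrative assumptions, not taken from the example above.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.En;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static class EdgeNGramDemo
{
    public static void Main()
    {
        Tokenizer tokenizer = new KeywordTokenizer(new StringReader("stemming"));
        TokenStream stream = new PorterStemFilter(tokenizer);                          // "stemming" -> "stem"
        stream = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_48, stream, 2, 4);      // leading grams of "stem"

        var term = stream.AddAttribute<ICharTermAttribute>();
        stream.Reset();
        while (stream.IncrementToken())
        {
            Console.WriteLine(term.ToString()); // "st", "ste", "stem"
        }
        stream.End();
        stream.Dispose();
    }
}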
Example #15
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testRandomRealisticKeyword() throws java.io.IOException
        public virtual void testRandomRealisticKeyword()
        {
            IDictionary <string, string> map = new Dictionary <string, string>();
            int numTerms = atLeast(50);

            for (int i = 0; i < numTerms; i++)
            {
                string randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
                if (randomRealisticUnicodeString.Length > 0)
                {
                    string value = TestUtil.randomSimpleString(random());
                    map[randomRealisticUnicodeString] = value.Length == 0 ? "a" : value;
                }
            }
            if (map.Count == 0)
            {
                map["booked"] = "books";
            }
            StemmerOverrideFilter.Builder         builder  = new StemmerOverrideFilter.Builder(random().nextBoolean());
            ISet <KeyValuePair <string, string> > entrySet = map.SetOfKeyValuePairs();

            foreach (KeyValuePair <string, string> entry in entrySet)
            {
                builder.add(entry.Key, entry.Value);
            }
            StemmerOverrideMap build = builder.build();

            foreach (KeyValuePair <string, string> entry in entrySet)
            {
                if (random().nextBoolean())
                {
                    Tokenizer   tokenizer = new KeywordTokenizer(new StringReader(entry.Key));
                    TokenStream stream    = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
                    assertTokenStreamContents(stream, new string[] { entry.Value });
                }
            }
        }
Example #16
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream tok = new PorterStemFilter(new LowerCaseTokenizer(reader));

            return(tok);
        }
Example #17
        /// <summary>
        /// Creates the components.
        /// </summary>
        /// <param name="fieldName">Name of the field.</param>
        /// <returns></returns>
        protected override AnalyzerTokenStreamComponents CreateComponents(string fieldName)
        {
            if (String.IsNullOrWhiteSpace(fieldName))
                throw new ArgumentException($"{nameof(fieldName)} cannot be null or blank");

            var pattern = Pattern.compile(_separatorChars);
            var tokenizer = new PatternTokenizer(pattern, -1);
            var stream = _ignoreCase ? new LowerCaseFilter(tokenizer) as TokenStream : tokenizer as TokenStream;

            if (_enableStemming)
                stream = new PorterStemFilter(stream);

            return new AnalyzerTokenStreamComponents(tokenizer, stream);
        }