Filters StandardTokenizer with StandardFilter, LowerCaseFilter, StopFilter and SnowballFilter. Available stemmers are listed in SF.Snowball.Ext. The name of a stemmer is the part of the class name before "Stemmer", e.g., the stemmer in EnglishStemmer is named "English".

NOTE: This class uses the same Version dependent settings as StandardAnalyzer

Inherits: Lucene.Net.Analysis.Analyzer
예제 #1
0
        public void TestEnglish()
        {
            // English snowball stemming: "abhorred" -> "abhor", "accents" -> "accent".
            Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
            string[] expected = { "he", "abhor", "accent" };

            AssertAnalyzesTo(analyzer, "he abhorred accents", expected);
        }
예제 #2
0
        public virtual void TestReusableTokenStream()
        {
            // The same analyzer instance must analyze consecutive inputs correctly.
            Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");

            AssertAnalyzesTo(analyzer, "he abhorred accents", new[] { "he", "abhor", "accent" });
            AssertAnalyzesTo(analyzer, "she abhorred him", new[] { "she", "abhor", "him" });
        }
예제 #3
0
 public void TestStopwords()
 {
     // Stop words ("the") are dropped before stemming.
     string[] expected = { "quick", "brown", "fox", "jump" };
     Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
         StandardAnalyzer.STOP_WORDS_SET);

     AssertAnalyzesTo(analyzer, "the quick brown fox jumped", expected);
 }
예제 #4
0
        public virtual void TestTurkish()
        {
            // Upper- and lowercase spellings of the same word must yield one stem.
            Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "Turkish");
            string[] expected = { "ağaç" };

            AssertAnalyzesTo(analyzer, "ağacı", expected);
            AssertAnalyzesTo(analyzer, "AĞACI", expected);
        }
예제 #5
0
        public virtual void TestTurkish()
        {
            // Both casings of the Turkish word should reduce to the same stem.
            Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "Turkish");

            foreach (var input in new[] { "ağacı", "AĞACI" })
            {
                AssertAnalyzesTo(analyzer, input, new[] { "ağaç" });
            }
        }
예제 #6
0
        public void TestStopwords()
        {
            Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                                                     StandardAnalyzer.STOP_WORDS_SET);

            // "the" is a stop word and must not appear in the output.
            AssertAnalyzesTo(analyzer, "the quick brown fox jumped",
                             new[] { "quick", "brown", "fox", "jump" });
        }
예제 #7
0
 public void TestReusableTokenStream()
 {
     // Reusing the analyzer across calls must not corrupt its state.
     Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
     string[] first = { "he", "abhor", "accent" };
     string[] second = { "she", "abhor", "him" };

     AssertAnalyzesToReuse(analyzer, "he abhorred accents", first);
     AssertAnalyzesToReuse(analyzer, "she abhorred him", second);
 }
예제 #8
0
 public virtual void TestTurkishBWComp()
 {
     // Backwards compatibility: under LUCENE_30 the generic lowercase filter
     // maps AĞACI to ağaci (dotted i) instead of ağacı (dotless ı).
     // The Turkish stemmer strips only -ı, so the uppercase form stems wrongly.
     Analyzer analyzer = new SnowballAnalyzer(LuceneVersion.LUCENE_30, "Turkish");

     AssertAnalyzesTo(analyzer, "ağacı", new[] { "ağaç" });
     AssertAnalyzesTo(analyzer, "AĞACI", new[] { "ağaci" });
 }
예제 #9
0
        public void TestReusableTokenStream()
        {
            Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");

            // Two successive analyses through the reuse path must both succeed.
            AssertAnalyzesToReuse(analyzer, "he abhorred accents",
                                  new[] { "he", "abhor", "accent" });
            AssertAnalyzesToReuse(analyzer, "she abhorred him",
                                  new[] { "she", "abhor", "him" });
        }
예제 #10
0
        public virtual void TestEnglishLowerCase()
        {
            string[] expected = { "cryogen" };

            // Current version: uppercase input yields the same stem as lowercase.
            Analyzer current = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
            AssertAnalyzesTo(current, "cryogenic", expected);
            AssertAnalyzesTo(current, "CRYOGENIC", expected);

            // The LUCENE_30 analyzer produces the same stems for English.
            Analyzer legacy = new SnowballAnalyzer(LuceneVersion.LUCENE_30, "English");
            AssertAnalyzesTo(legacy, "cryogenic", expected);
            AssertAnalyzesTo(legacy, "CRYOGENIC", expected);
        }
예제 #11
0
 public void Setup()
 {
     // Stop-word list: the standard English set plus "into" at the front.
     var englishStops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
     stopWords = new string[englishStops.Count + 1];
     stopWords[0] = "into";

     var index = 1;
     foreach (string word in englishStops)
     {
         stopWords[index++] = word;
     }

     // In-memory index backed by a snowball analyzer using the custom stop list.
     _service = new SearchEngineService(new RAMDirectory(), new SnowballAnalyzer("English", stopWords));
 }
예제 #12
0
        public void TestJiraLuceneNet54()
        {
            // Regression test for LUCENENET-54: Finnish stemming of "terve" -> "terv".
            var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT, "Finnish");
            var tokenStream = analyzer.TokenStream("fieldName", new StringReader("terve"));
            var termAttr = tokenStream.AddAttribute<ITermAttribute>();

            Assert.That(tokenStream.IncrementToken(), Is.True);
            Assert.That(termAttr.Term, Is.EqualTo("terv"));
        }
예제 #13
0
        public virtual void TestTurkishBWComp()
        {
            Analyzer analyzer = new SnowballAnalyzer(LuceneVersion.LUCENE_30, "Turkish");

            // Lowercase input stems correctly.
            AssertAnalyzesTo(analyzer, "ağacı", new[] { "ağaç" });

            // With the pre-3.1 lowercase filter, AĞACI becomes ağaci (dotted i);
            // the stemmer only removes -ı, so the stem keeps the wrong ending.
            AssertAnalyzesTo(analyzer, "AĞACI", new[] { "ağaci" });
        }
예제 #14
0
        public virtual void TestEnglishLowerCase()
        {
            // Both the current and the LUCENE_30 analyzers fold case for English,
            // so "cryogenic" and "CRYOGENIC" produce the same stem.
            foreach (var version in new[] { TEST_VERSION_CURRENT, LuceneVersion.LUCENE_30 })
            {
                Analyzer analyzer = new SnowballAnalyzer(version, "English");

                AssertAnalyzesTo(analyzer, "cryogenic", new[] { "cryogen" });
                AssertAnalyzesTo(analyzer, "CRYOGENIC", new[] { "cryogen" });
            }
        }
예제 #15
0
 public void TestWithRealWorldData()
 {
     // Smoke test: run the Turkish stemmer over every file in the analysis
     // test-data folder and make sure token production never throws.
     foreach (var file in Directory.GetFiles(Path.Combine(Paths.ProjectRootDirectory, @"test-files\analysis")))
     {
         using (Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish"))
         {
             var ts = a.TokenStream("dummy", new System.IO.StringReader(File.ReadAllText(file)));
             // The term attribute instance is stable for the stream's lifetime,
             // so fetch it once instead of on every token (was inside the loop).
             var att = ts.GetAttribute<ITermAttribute>();
             while (ts.IncrementToken())
             {
                 Console.WriteLine(att.Term);
             }
         }
     }
 }
        public void Build()
        {
            // Rebuild the full-text index from every parseable file under contentPath.
            Directory directory = FSDirectory.GetDirectory(indexPath);
            Analyzer analyzer = new SnowballAnalyzer("English");
            IndexWriter writer = new IndexWriter(directory, analyzer, true); // true: recreate the index

            try
            {
                new DirectoryInfo(contentPath)
                    .GetFilesRecursive()
                    .Where(file => Parser.IsParseable(file.FullName))
                    .Select(file => new { Path = file.FullName, Text = Parser.Parse(file.FullName), Title = file.Name })
                    .ForEach((item) =>
                    {
                        Document doc = new Document();
                        doc.Add(new Field("title", item.Title, Field.Store.YES, Field.Index.TOKENIZED));
                        // The path is stored for retrieval but not tokenized/searchable.
                        doc.Add(new Field("path", item.Path, Field.Store.YES, Field.Index.NO));
                        doc.Add(new Field("text", item.Text, Field.Store.YES, Field.Index.TOKENIZED));
                        writer.AddDocument(doc);
                    });

                writer.Optimize();
            }
            finally
            {
                // Always close the writer so the index lock is released even if
                // parsing or indexing throws (previously leaked on exception).
                writer.Close();
            }
        }
        public IEnumerable<SearchResult> Search(string query)
        {
            // Parse the user query against the "text" field with the same
            // (English snowball) analysis used when the index was built.
            Analyzer analyzer = new SnowballAnalyzer("English");
            QueryParser parser = new QueryParser("text", analyzer);
            Query luceneQuery = parser.Parse(query);

            Directory directory = FSDirectory.GetDirectory(indexPath);
            IndexSearcher searcher = new IndexSearcher(directory);
            try
            {
                QueryScorer queryScorer = new QueryScorer(luceneQuery);
                Highlighter highlighter = new Highlighter(queryScorer);

                // Top 100 hits only.
                TopDocs topDocs = searcher.Search(luceneQuery, 100);

                var searchResults = new List<SearchResult>();
                foreach (ScoreDoc scoreDoc in topDocs.scoreDocs)
                {
                    Document doc = searcher.Doc(scoreDoc.doc);
                    searchResults.Add(new SearchResult
                    {
                        Path = doc.Get("path"),
                        Score = scoreDoc.score,
                        Title = doc.Get("title"),
                        // Highlighted preview built from the stored field text.
                        Preview = highlighter.GetBestFragment(analyzer, "text", doc.Get("text"))
                    });
                }

                return searchResults;
            }
            finally
            {
                // Release the searcher's file handles even on failure
                // (previously the searcher was never closed).
                searcher.Close();
            }
        }
예제 #18
0
        // Builds a Spanish snowball-backed analyzer using the caller-supplied stop-word set.
        public SpanishAnalyzer(ISet<string> stop_words)
        {
            analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "Spanish", stop_words);

            // Retain the supplied stop words in the STOP_WORDS field (declared elsewhere in the class).
            STOP_WORDS = stop_words;
        }
예제 #19
0
 // Default constructor: Spanish snowball analyzer with no explicit stop-word set.
 public SpanishAnalyzer()
 {
     analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "Spanish");
 }
예제 #20
0
 public void TestEnglish()
 {
     // "abhorred" -> "abhor", "accents" -> "accent"; "he" is left untouched.
     Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
     string[] expected = { "he", "abhor", "accent" };

     AssertAnalyzesTo(analyzer, "he abhorred accents", expected);
 }
예제 #21
0
 public void TestJiraLuceneNet54()
 {
     // Regression test: the Finnish stemmer should reduce "terve" to "terv".
     var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT, "Finnish");

     var tokenStream = analyzer.TokenStream("fieldName", new StringReader("terve"));
     var termAttr = tokenStream.AddAttribute<ITermAttribute>();

     Assert.That(tokenStream.IncrementToken(), Is.True);
     Assert.That(termAttr.Term, Is.EqualTo("terv"));
 }
        /// <summary>
        /// Produces an HTML-highlighted snippet of <paramref name="IndexField"/> for the given search query.
        /// Matching terms are wrapped in a span with the "umbSearchHighlight" CSS class.
        /// </summary>
        /// <param name="IndexField">The raw field text to highlight.</param>
        /// <param name="LuceneIndex">The Lucene index identifier, passed through to the fragment scorer.</param>
        /// <param name="searchQuery">The user's search query used to score fragments.</param>
        /// <param name="highlightField">Name of the field the query is matched against.</param>
        /// <returns>Up to three best fragments joined by "..." with a trailing "...", or an empty string when nothing matched.</returns>
        public static string GetHighlight(string IndexField, string LuceneIndex, string searchQuery, string highlightField)
        {
            string highlightText = string.Empty;

            var formatter = new SimpleHTMLFormatter("<span class=\"umbSearchHighlight\">", "</span>");
            var highlighter = new Highlighter(formatter, FragmentScorer(searchQuery, highlightField, LuceneIndex));

            // Tokenize the field text with the English snowball analyzer.
            var tokenStream = new SnowballAnalyzer("English").TokenStream(highlightField, new StringReader(IndexField));

            string fragments = highlighter.GetBestFragments(tokenStream, IndexField, 3, "...");
            if (fragments.Length > 0)
                highlightText = fragments + "...";

            return highlightText;
        }
예제 #23
0
        /// <summary>
        /// Picks an analyzer from the two-letter language prefix of the file name
        /// (e.g. "en-…" selects the English snowball analyzer).
        /// Falls back to <see cref="StandardAnalyzer"/> for unrecognized prefixes.
        /// </summary>
        /// <param name="filePath">Path whose file name encodes the language as its first two characters.</param>
        /// <returns>An analyzer suited to the detected language.</returns>
        private Analyzer GuessAnalyzer(string filePath)
        {
            var fileName = Path.GetFileName(filePath);

            // Guard: names shorter than two characters carry no language prefix
            // (the original Substring(0, 2) threw ArgumentOutOfRangeException here).
            if (string.IsNullOrEmpty(fileName) || fileName.Length < 2)
                return new StandardAnalyzer();

            switch (fileName.Substring(0, 2).ToLowerInvariant())
            {
                case "zh":
                    return new ChineseAnalyzer();
                case "cs":
                    return new CzechAnalyzer();
                case "da":
                    return new SnowballAnalyzer("Danish");
                case "nl":
                    return new SnowballAnalyzer("Dutch");
                case "en":
                    return new SnowballAnalyzer("English");
                case "fi":
                    return new SnowballAnalyzer("Finnish");
                case "fr":
                    return new SnowballAnalyzer("French");
                case "de":
                    return new SnowballAnalyzer("German");
                case "it":
                    return new SnowballAnalyzer("Italian");
                case "ja":
                case "ko":
                    // Japanese and Korean both use the CJK analyzer.
                    return new CJKAnalyzer();
                case "no":
                    return new SnowballAnalyzer("Norwegian");
                case "pt":
                    return new SnowballAnalyzer("Portuguese");
                case "ru":
                    return new SnowballAnalyzer("Russian");
                case "es":
                    return new SnowballAnalyzer("Spanish");
                case "se":
                    return new SnowballAnalyzer("Swedish");
                default:
                    return new StandardAnalyzer();
            }
        }
        public void StemmingEnabledQueryParsing()
        {
            const string queryText = "The guy bought multiple bikes before he left town to race.";

            // Snowball-stemmed analysis feeds the query parser, so parsed terms are stems.
            var analyzer = new SnowballAnalyzer(Version.LUCENE_30, "English");
            var parser = new QueryParser(Version.LUCENE_30, "description", analyzer);

            var parsed = parser.Parse(queryText);

            // Dump the parsed query so the stemming effect is visible in the output.
            Trace.WriteLine(new QueryVisualizer(true).Process(parsed).ToString());
        }
예제 #25
0
        public virtual void TestEnglish()
        {
            // Basic English stemming: abhorred -> abhor, accents -> accent.
            Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");

            AssertAnalyzesTo(analyzer, "he abhorred accents", new[] { "he", "abhor", "accent" });
        }
예제 #26
0
		public virtual void  TestEnglish()
		{
			// Legacy (versionless) constructor; English stemming still applies.
			Analyzer analyzer = new SnowballAnalyzer("English");
			string[] expected = {"he", "abhor", "accent"};
			AssertAnalyzesTo(analyzer, "he abhorred accents", expected);
		}
예제 #27
0
        public virtual void TestStopwords()
        {
            // With StandardAnalyzer's stop set, "the" is removed and the rest stemmed.
            Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English", StandardAnalyzer.STOP_WORDS_SET);
            string[] expected = { "quick", "brown", "fox", "jump" };

            AssertAnalyzesTo(analyzer, "the quick brown fox jumped", expected);
        }
예제 #28
0
        public virtual void  TestEnglish()
        {
            // English snowball stemming of a short phrase (versionless constructor).
            Analyzer analyzer = new SnowballAnalyzer("English");
            string[] expected = { "he", "abhor", "accent" };

            AssertAnalyzesTo(analyzer, "he abhorred accents", expected);
        }
예제 #29
0
 public virtual void TestEnglish()
 {
     // Verify basic English stemming output.
     Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
     string[] expected = { "he", "abhor", "accent" };
     AssertAnalyzesTo(analyzer, "he abhorred accents", expected);
 }
예제 #30
0
 public virtual void TestReusableTokenStream()
 {
     // One analyzer instance, two consecutive analyses — both must be correct.
     Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
     string[] first = { "he", "abhor", "accent" };
     string[] second = { "she", "abhor", "him" };

     AssertAnalyzesTo(analyzer, "he abhorred accents", first);
     AssertAnalyzesTo(analyzer, "she abhorred him", second);
 }
예제 #31
0
        /// <summary>
        /// Picks an analyzer from the two-letter language prefix of the file name,
        /// and reports whether the detected language is written right-to-left.
        /// Falls back to <see cref="StandardAnalyzer"/> for unrecognized prefixes.
        /// </summary>
        /// <param name="filePath">Path whose file name encodes the language as its first two characters.</param>
        /// <param name="isRTL">Set to true for right-to-left languages (Arabic, Hebrew).</param>
        /// <returns>An analyzer suited to the detected language.</returns>
        private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
        {
            isRTL = false;

            var fileName = Path.GetFileName(filePath);

            // Guard: names shorter than two characters carry no language prefix
            // (the original Substring(0, 2) threw ArgumentOutOfRangeException here).
            if (string.IsNullOrEmpty(fileName) || fileName.Length < 2)
                return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

            switch (fileName.Substring(0, 2).ToLowerInvariant())
            {
                case "zh":
                    return new ChineseAnalyzer();
                case "cs":
                    return new CzechAnalyzer();
                case "da":
                    return new SnowballAnalyzer("Danish");
                case "nl":
                    return new SnowballAnalyzer("Dutch");
                case "en":
                    return new SnowballAnalyzer("English");
                case "fi":
                    return new SnowballAnalyzer("Finnish");
                case "fr":
                    return new SnowballAnalyzer("French");
                case "de":
                    return new SnowballAnalyzer("German");
                case "it":
                    return new SnowballAnalyzer("Italian");
                case "ja":
                case "ko":
                    // Japanese and Korean both use the CJK analyzer.
                    return new CJKAnalyzer();
                case "no":
                    return new SnowballAnalyzer("Norwegian");
                case "pt":
                    return new SnowballAnalyzer("Portuguese");
                case "ru":
                    return new SnowballAnalyzer("Russian");
                case "es":
                    return new SnowballAnalyzer("Spanish");
                case "se":
                    return new SnowballAnalyzer("Swedish");
                case "ar":
                    isRTL = true;
                    // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
                    return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
                case "he":
                    isRTL = true;
                    string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
                    if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
                    {
                        try
                        {
                            return new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                        }
                        catch
                        {
                            // Deliberate best-effort: if the hspell dictionaries fail
                            // to load, fall through to the simple Hebrew analyzer.
                        }
                    }
                    return new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
                default:
                    return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            }
        }
예제 #32
0
 public virtual void TestStopwords()
 {
     // StandardAnalyzer's stop set removes "the"; remaining tokens are stemmed.
     Analyzer analyzer = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English", StandardAnalyzer.STOP_WORDS_SET);
     string[] expected = { "quick", "brown", "fox", "jump" };
     AssertAnalyzesTo(analyzer, "the quick brown fox jumped", expected);
 }