/// <summary>
/// Runs the English Snowball analyzer over several inflections of "build",
/// logs every distinct token it produced, and asserts that exactly one
/// distinct token was generated.
/// </summary>
public void SnowballAnalyzer()
{
    // Snowball analysis is language-specific and relies on stemming:
    // stemming algorithms try to reduce a word to a common root form.
    string text = "building build builds builded";
    string output = "Analyzing '" + text + "', generated the tokens: ";
    Dictionary<string, string> tokensFound = new Dictionary<string, string>();

    // Run the analysis.
    Analyzer analyzer = new SnowballAnalyzer("English", StandardAnalyzer.STOP_WORDS);
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));

    // Next() returns null once the stream is exhausted.
    for (Token token = stream.Next(); token != null; token = stream.Next())
    {
        string term = token.TermText();

        // Record and report each term only the first time it is seen.
        if (tokensFound.ContainsKey(term))
        {
            continue;
        }

        tokensFound[term] = term;
        output += "[" + term + "] ";
    }

    log.Debug(output);
    Assert.AreEqual(1, tokensFound.Count);
}
/// <summary>
/// Reduces <paramref name="word"/> to its stem, with stop-word support.
/// </summary>
/// <param name="word">The text to run through the Snowball analyzer.</param>
/// <param name="language">Language key used to look up the stemmer name via <c>SnowballDict.GetStemmer</c>.</param>
/// <returns>
/// <paramref name="word"/> unchanged when no stemmer is registered for
/// <paramref name="language"/>; otherwise the term of the last token the
/// analyzer emitted, or <c>null</c> when it emitted no tokens at all.
/// </returns>
public static string SnowballWord(string word, string language)
{
    string stemmerName = SnowballDict.GetStemmer(language);
    if (stemmerName == null)
    {
        // No stemmer for this language: hand the input back untouched.
        return word;
    }

    string lastTerm = null;
    using (SnowballAnalyzer snowball = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stemmerName, StopWord.StopWordList))
    // Only the token information is needed here, so no field name is supplied.
    using (TokenStream ts = snowball.ReusableTokenStream("", new StringReader(word)))
    {
        while (ts.IncrementToken())
        {
            // Each token overwrites the previous one; the last token wins.
            lastTerm = ts.GetAttribute<ITermAttribute>().Term;
        }
    }

    return lastTerm;
}
/// <summary>
/// Builds an HTML snippet of <paramref name="IndexField"/> in which the parts
/// selected by the fragment scorer are wrapped in
/// <c>&lt;span class="umbSearchHighlight"&gt;</c> elements.
/// </summary>
/// <param name="IndexField">The text to tokenize and extract highlighted fragments from.</param>
/// <param name="LuceneIndex">Passed through to <c>FragmentScorer</c> together with the query and field.</param>
/// <param name="searchQuery">The search query handed to <c>FragmentScorer</c>.</param>
/// <param name="highlightField">Field name used for both the scorer and the token stream.</param>
/// <returns>
/// Up to three best fragments joined by "..." and followed by a trailing "...",
/// or an empty string when the highlighter produced no text.
/// </returns>
public static string GetHighlight(string IndexField, string LuceneIndex, string searchQuery, string highlightField)
{
    var formatter = new SimpleHTMLFormatter("<span class=\"umbSearchHighlight\">", "</span>");
    var scorer = FragmentScorer(searchQuery, highlightField, LuceneIndex);
    var highlighter = new Highlighter(formatter, scorer);

    var englishAnalyzer = new SnowballAnalyzer("English");
    var tokenStream = englishAnalyzer.TokenStream(highlightField, new StringReader(IndexField));

    string fragments = highlighter.GetBestFragments(tokenStream, IndexField, 3, "...");

    // Append the trailing ellipsis only when something was actually highlighted.
    return fragments.Length > 0 ? fragments + "..." : string.Empty;
}
/// <summary>
/// Adds every offer in <paramref name="offers"/> to the Lucene index stored in
/// <c>_directory</c>, analyzing text with the English Snowball analyzer.
/// </summary>
/// <param name="offers">The offers to write; each one is passed to <c>_addToLuceneIndex</c>.</param>
public static void AddUpdateLuceneIndex(IEnumerable<Offer> offers)
{
    // init lucene
    var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    try
    {
        // The using block is the single owner of the writer: it disposes it on
        // both the success and the failure path, so no explicit Dispose() call
        // is needed inside it.
        using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            // add data to lucene search index (replaces older entry if any)
            foreach (var offer in offers)
            {
                _addToLuceneIndex(offer, writer);
            }
        }
    }
    finally
    {
        // Release the analyzer even when indexing throws part-way through.
        analyzer.Close();
    }
}
/// <summary>
/// Searches the Lucene index in <c>_directory</c> for <paramref name="searchQuery"/>
/// using the English Snowball analyzer.
/// </summary>
/// <param name="searchQuery">The raw query text. A null query, or one that is empty once '*' and '?' are removed, yields no results.</param>
/// <param name="searchField">
/// Name of the single field to search. When null or empty, all fields listed in
/// <c>_fields</c> are searched and the hits are sorted by relevance.
/// </param>
/// <returns>The matching offers (at most 1000 hits are mapped), or an empty list when the query is blank.</returns>
private static IEnumerable<Offer> _search(string searchQuery, string searchField = "")
{
    // validation: a null query, or one made only of wildcards, cannot match anything
    if (string.IsNullOrEmpty(searchQuery)
        || string.IsNullOrEmpty(searchQuery.Replace("*", "").Replace("?", "")))
    {
        return new List<Offer>();
    }

    const int hitsLimit = 1000;

    // set up lucene searcher; the using block disposes it on every exit path
    using (var searcher = new IndexSearcher(_directory, true))
    {
        var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        try
        {
            searcher.SetDefaultFieldSortScoring(true, true);

            // search by single field
            if (!string.IsNullOrEmpty(searchField))
            {
                var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, searchField, analyzer);
                var query = parseQuery(searchQuery, parser);
                var hits = searcher.Search(query, hitsLimit).ScoreDocs;
                return _mapLuceneToOfferList(hits, searcher);
            }

            // search by multiple fields (ordered by RELEVANCE)
            var multiParser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, _fields.Keys.ToArray(), analyzer);
            var multiQuery = parseQuery(searchQuery, multiParser);
            var multiHits = searcher.Search(multiQuery, null, hitsLimit, Sort.RELEVANCE).ScoreDocs;
            return _mapLuceneToOfferList(multiHits, searcher);
        }
        finally
        {
            // Release the analyzer even when parsing or searching throws.
            analyzer.Close();
        }
    }
}
/// <summary>
/// Produces a short preview of <paramref name="text"/> based on the fragments
/// that best match <c>currentQuery</c>.
/// </summary>
/// <param name="text">The full text to take the preview from.</param>
/// <returns>
/// Up to two best-matching fragments joined by "..."; when the highlighter
/// yields nothing, the first 100 characters of <paramref name="text"/>
/// (or the whole text if it is not longer than that).
/// </returns>
public string GeneratePreviewText(string text)
{
    // Used both as the fragment size and as the length of the fallback preview.
    const int previewLength = 100;

    QueryScorer scorer = new QueryScorer(currentQuery);
    // Empty pre/post tags: matches are selected but not decorated with markup.
    IFormatter formatter = new SimpleHTMLFormatter("", "");
    Highlighter highlighter = new Highlighter(formatter, scorer);
    highlighter.TextFragmenter = new SimpleFragmenter(previewLength);

    // A StandardAnalyzer(VERSION) token stream was previously tried here as an alternative.
    Analyzer englishAnalyzer = new SnowballAnalyzer(VERSION, "English");
    TokenStream stream = englishAnalyzer.TokenStream(URL_FN, new StringReader(text));

    string fragment = highlighter.GetBestFragments(stream, text, 2, "...");
    if (!string.IsNullOrEmpty(fragment))
    {
        return fragment;
    }

    // Nothing matched: fall back to the beginning of the text.
    return text.Length > previewLength ? text.Substring(0, previewLength) : text;
}
/// <summary>
/// Prints a sample Danish string and then passes it, together with each of
/// several analyzers in turn, to <c>DisplayTokens</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var version = Lucene.Net.Util.Version.LUCENE_30;
    var text = "Høje Taastrup Århus René";

    Console.WriteLine("Original string: {0}", text);
    Console.WriteLine();

    // Factories rather than instances, so that each analyzer is constructed
    // immediately before it is displayed, in the order listed.
    var analyzerFactories = new Func<Analyzer>[]
    {
        () => new KeywordAnalyzer(),
        () => new WhitespaceAnalyzer(),
        () => new SimpleAnalyzer(),
        () => new StopAnalyzer(version),
        () => new StandardAnalyzer(version),
        () => new SnowballAnalyzer(version, "Danish"), // http://snowball.tartarus.org/
        () => new TestAnalyzer(version),
        // Currently disabled:
        //   new LowerCaseKeywordAnalyzer()
        //   new EdgeNGramAnalyzer(version)
        //   new ReverseAnalyzer(version)
        //   new PerFieldAnalyzerWrapper() -- different fields require different analyzers
    };

    foreach (var createAnalyzer in analyzerFactories)
    {
        DisplayTokens(createAnalyzer(), text);
    }
}
/// <summary>
/// Resolves this property's index settings from its attributes and then runs
/// the attribute-declared transformers over its value.
/// </summary>
/// <param name="dictionary">Dictionary passed through to <c>ObjectDumper.DoTransform</c>; a new empty one is created when null.</param>
/// <param name="allowedTransformers">Passed through to <c>ObjectDumper.DoTransform</c>.</param>
/// <remarks>
/// Settings are taken from the first <c>IndexSettings</c> attribute on the
/// property itself, or failing that from the parent list's attributes. When
/// neither has one, value casing is kept and the index/store settings come
/// from <c>FieldSettings</c>. The previous value is saved in
/// <c>OriginalValue</c> before <c>Value</c> is replaced by the transform result.
/// </remarks>
public void Transform(Dictionary<string, object> dictionary = null, List<Type> allowedTransformers = null)
{
    if (dictionary == null)
    {
        dictionary = new Dictionary<string, object>();
    }
    if (Attributes == null)
    {
        Attributes = new object[] { };
    }
    if (ParentListAttributes == null)
    {
        ParentListAttributes = new object[] { };
    }

    // Lower-casing of Key is intentionally not applied here.

    // Find field settings: the property's own IndexSettings attribute wins,
    // otherwise fall back to the parent list's. The type check is an exact
    // match (derived types are not considered).
    var sattr = (IndexSettings)(
        Attributes.FirstOrDefault(x => x.GetType() == typeof(IndexSettings))
        ?? ParentListAttributes.FirstOrDefault(x => x.GetType() == typeof(IndexSettings)));

    if (sattr != null)
    {
        KeepValueCasing = !sattr.LowerCaseValue;
        Ignore = sattr.Ignore;
        Spatial = sattr.Spatial;
        FieldIndexSetting = sattr.FieldIndexSetting;
        FieldStoreSetting = sattr.FieldStoreSetting;

        if (sattr.Analyzer != null)
        {
            if (sattr.Analyzer == typeof(SnowballAnalyzer))
            {
                // SnowballAnalyzer is created with an explicit language name.
                Analyzer = new SnowballAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48, "English");
            }
            else
            {
                try
                {
                    // Prefer a constructor that takes the Lucene version...
                    Analyzer = (Analyzer)Activator.CreateInstance(sattr.Analyzer, Lucene.Net.Util.LuceneVersion.LUCENE_48);
                }
                catch (MissingMethodException)
                {
                    // ...and fall back to the parameterless one when it does not exist.
                    Analyzer = (Analyzer)Activator.CreateInstance(sattr.Analyzer);
                }
            }
        }
    }
    else
    {
        KeepValueCasing = true;
        FieldIndexSetting = FieldSettings.FieldIndexSetting;
        FieldStoreSetting = FieldSettings.FieldStoreSetting;
    }

    // Apply transforms: every attribute implementing I_Property_Transformer<,>.
    var tattr = Attributes
        .Where(x => x.GetType().GetInterfaces()
            .Any(y => y.IsGenericType && y.GetGenericTypeDefinition() == typeof(I_Property_Transformer<,>)))
        .ToList();

    OriginalValue = Value;

    var task = ObjectDumper.DoTransform(tattr, Type, Model, Key, UniqueKey, Value, dictionary, allowedTransformers: allowedTransformers);

    // This method is synchronous, so it blocks until the transform completes.
    // GetAwaiter().GetResult() rethrows the original exception instead of
    // wrapping it in an AggregateException.
    Value = task.GetAwaiter().GetResult();
}
} // NOTE(review): closes an enclosing type whose declaration starts outside this excerpt
/// <summary>
/// Creates a Spanish analyzer backed by a Snowball analyzer that is given a
/// caller-supplied stop-word set.
/// </summary>
/// <param name="stop_words">Stop words passed to the Snowball analyzer and stored in <c>STOP_WORDS</c>.</param>
public SpanishAnalyzer(ISet<string> stop_words)
{
    var spanishSnowball = new SnowballAnalyzer(
        Lucene.Net.Util.Version.LUCENE_30,
        "Spanish",
        stop_words);

    analyzer = spanishSnowball;
    STOP_WORDS = stop_words;
}
/// <summary>
/// Creates a Spanish analyzer backed by a Snowball analyzer constructed
/// without an explicit stop-word set. <c>STOP_WORDS</c> is not assigned here.
/// </summary>
public SpanishAnalyzer()
{
    var spanishSnowball = new SnowballAnalyzer(
        Lucene.Net.Util.Version.LUCENE_30,
        "Spanish");

    analyzer = spanishSnowball;
}
/// <summary>
/// Picks an analyzer from the first two characters of the file name, which are
/// treated as a case-insensitive two-letter language code.
/// </summary>
/// <param name="filePath">Path of the file; only its file-name part is inspected.</param>
/// <returns>
/// A language-specific analyzer for a recognized code; a <c>StandardAnalyzer</c>
/// for an unrecognized code or when the file name has fewer than two characters.
/// </returns>
private Analyzer GuessAnalyzer(string filePath)
{
    string fileName = Path.GetFileName(filePath);

    // Without at least two characters there is no language prefix to read
    // (and Substring(0, 2) would throw), so use the default analyzer.
    if (string.IsNullOrEmpty(fileName) || fileName.Length < 2)
    {
        return new StandardAnalyzer();
    }

    switch (fileName.Substring(0, 2).ToLowerInvariant())
    {
        case "zh":
            return new ChineseAnalyzer();

        case "cs":
            return new CzechAnalyzer();

        case "da":
            return new SnowballAnalyzer("Danish");

        case "nl":
            return new SnowballAnalyzer("Dutch");

        case "en":
            return new SnowballAnalyzer("English");

        case "fi":
            return new SnowballAnalyzer("Finnish");

        case "fr":
            return new SnowballAnalyzer("French");

        case "de":
            return new SnowballAnalyzer("German");

        case "it":
            return new SnowballAnalyzer("Italian");

        case "ja":
        case "ko":
            return new CJKAnalyzer();

        case "no":
            return new SnowballAnalyzer("Norwegian");

        case "pt":
            return new SnowballAnalyzer("Portuguese");

        case "ru":
            return new SnowballAnalyzer("Russian");

        case "es":
            return new SnowballAnalyzer("Spanish");

        case "sv": // ISO 639-1 language code for Swedish
        case "se": // kept so files named with the older prefix keep working
            return new SnowballAnalyzer("Swedish");

        default:
            return new StandardAnalyzer();
    }
}
/// <summary>
/// Picks an analyzer from the first two characters of the file name, which are
/// treated as a case-insensitive two-letter language code, and reports whether
/// that language is written right-to-left.
/// </summary>
/// <param name="filePath">Path of the file; only its file-name part is inspected.</param>
/// <param name="isRTL">Set to <c>true</c> for the "ar" and "he" codes, <c>false</c> otherwise.</param>
/// <returns>
/// A language-specific analyzer for a recognized code; a <c>StandardAnalyzer</c>
/// for an unrecognized code or when the file name has fewer than two characters.
/// </returns>
private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
{
    isRTL = false;

    string fileName = Path.GetFileName(filePath);

    // Without at least two characters there is no language prefix to read
    // (and Substring(0, 2) would throw), so use the default analyzer.
    if (string.IsNullOrEmpty(fileName) || fileName.Length < 2)
    {
        return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
    }

    switch (fileName.Substring(0, 2).ToLowerInvariant())
    {
        case "zh":
            return new ChineseAnalyzer();

        case "cs":
            return new CzechAnalyzer();

        case "da":
            return new SnowballAnalyzer("Danish");

        case "nl":
            return new SnowballAnalyzer("Dutch");

        case "en":
            return new SnowballAnalyzer("English");

        case "fi":
            return new SnowballAnalyzer("Finnish");

        case "fr":
            return new SnowballAnalyzer("French");

        case "de":
            return new SnowballAnalyzer("German");

        case "it":
            return new SnowballAnalyzer("Italian");

        case "ja":
        case "ko":
            return new CJKAnalyzer();

        case "no":
            return new SnowballAnalyzer("Norwegian");

        case "pt":
            return new SnowballAnalyzer("Portuguese");

        case "ru":
            return new SnowballAnalyzer("Russian");

        case "es":
            return new SnowballAnalyzer("Spanish");

        case "sv": // ISO 639-1 language code for Swedish
        case "se": // kept so files named with the older prefix keep working
            return new SnowballAnalyzer("Swedish");

        case "ar":
            isRTL = true;
            // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
            return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

        case "he":
        {
            isRTL = true;

            // Use the morphological analyzer only when the configured hspell
            // data directory actually exists.
            string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
            if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
            {
                try
                {
                    return new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                }
                catch (System.Exception ex)
                {
                    // Deliberate best effort: a failure to load the morphological
                    // analyzer must not prevent indexing. Record why, then fall
                    // back to the simple analyzer below.
                    System.Diagnostics.Trace.TraceWarning(
                        "GuessAnalyzer: could not create Hebrew MorphAnalyzer from '{0}', falling back to SimpleAnalyzer: {1}",
                        hspellPath,
                        ex);
                }
            }

            return new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
        }

        default:
            return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
    }
}