Beispiel #1
0
        /// <summary>
        /// Checks that the English Snowball analyzer reduces several inflections of
        /// "build" to a single distinct token.
        /// </summary>
        public void SnowballAnalyzer()
        {
            // Snowball stemming is language-specific: it tries to reduce each word to a
            // common root form, so the four inputs below are expected to yield one term.
            string text   = "building build builds builded";
            string output = "Analyzing '" + text + "', generated the tokens: ";
            Dictionary <string, string> tokensFound = new Dictionary <string, string>();

            // Run the analysis.
            Analyzer    analyzer = new SnowballAnalyzer("English", StandardAnalyzer.STOP_WORDS);
            TokenStream stream   = analyzer.TokenStream("contents", new StringReader(text));

            // The loop ends when Next() hands back null.
            for (Token token = stream.Next(); token != null; token = stream.Next())
            {
                string term = token.TermText();

                // Record and report each distinct term only once.
                if (tokensFound.ContainsKey(term))
                {
                    continue;
                }

                tokensFound[term] = term;
                output           += "[" + term + "] ";
            }

            log.Debug(output);

            Assert.AreEqual(1, tokensFound.Count);
        }
Beispiel #2
0
        /// <summary>
        /// Reduces <paramref name="word"/> to its stem. The analyzer is built with
        /// <c>StopWord.StopWordList</c> as its stop-word set.
        /// </summary>
        /// <param name="word">Text to stem.</param>
        /// <param name="language">Language key handed to <c>SnowballDict.GetStemmer</c>.</param>
        /// <returns>
        /// <paramref name="word"/> unchanged when no stemmer is found for the language;
        /// otherwise the term of the last token the analyzer produced, or <c>null</c>
        /// when it produced no token at all.
        /// </returns>
        public static string SnowballWord(string word, string language)
        {
            string stemmerName = SnowballDict.GetStemmer(language);

            // No stemmer for this language: hand the input back untouched.
            if (stemmerName == null)
            {
                return word;
            }

            string lastTerm = null;

            using (SnowballAnalyzer stemAnalyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stemmerName, StopWord.StopWordList))
            // Only the token text is of interest, so no field name is needed.
            using (TokenStream tokens = stemAnalyzer.ReusableTokenStream("", new StringReader(word)))
            {
                // Each iteration overwrites the result, so the final token wins.
                while (tokens.IncrementToken())
                {
                    lastTerm = tokens.GetAttribute <ITermAttribute>().Term;
                }
            }

            return lastTerm;
        }
        /// <summary>
        /// Builds a highlighted excerpt of <paramref name="IndexField"/> for a search query.
        /// </summary>
        /// <param name="IndexField">Text that is tokenized and from which fragments are taken.</param>
        /// <param name="LuceneIndex">Passed through to <c>FragmentScorer</c>.</param>
        /// <param name="searchQuery">Passed through to <c>FragmentScorer</c>.</param>
        /// <param name="highlightField">Field name given to <c>FragmentScorer</c> and to the analyzer's token stream.</param>
        /// <returns>
        /// The fragments returned by the highlighter (3 requested, "..." as separator)
        /// followed by "...", or an empty string when the highlighter returned nothing.
        /// </returns>
        public static string GetHighlight(string IndexField, string LuceneIndex, string searchQuery, string highlightField)
        {
            // Highlighted terms are wrapped in a span carrying the umbSearchHighlight class.
            var spanFormatter = new SimpleHTMLFormatter("<span class=\"umbSearchHighlight\">", "</span>");
            var scorer        = FragmentScorer(searchQuery, highlightField, LuceneIndex);
            var highlighter   = new Highlighter(spanFormatter, scorer);

            var englishTokens = new SnowballAnalyzer("English").TokenStream(highlightField, new StringReader(IndexField));
            string fragments  = highlighter.GetBestFragments(englishTokens, IndexField, 3, "...");

            // Append a trailing ellipsis only when there is something to show.
            return fragments.Length > 0 ? fragments + "..." : string.Empty;
        }
Beispiel #4
0
        /// <summary>
        /// Writes every offer in <paramref name="offers"/> to the Lucene index through
        /// <c>_addToLuceneIndex</c>, using the English Snowball analyzer with the
        /// English stop-word set.
        /// </summary>
        /// <param name="offers">Offers to index.</param>
        public static void AddUpdateLuceneIndex(IEnumerable <Offer> offers)
        {
            // Both handles are scoped with using so they are released even when indexing
            // throws, and each is disposed exactly once (writer first, then analyzer).
            using (var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET))
            using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // Add each offer to the search index; how an existing entry is handled
                // is decided inside _addToLuceneIndex.
                foreach (var offer in offers)
                {
                    _addToLuceneIndex(offer, writer);
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Runs <paramref name="searchQuery"/> against the Lucene index and maps the hits
        /// to offers via <c>_mapLuceneToOfferList</c>.
        /// </summary>
        /// <param name="searchQuery">Query text; null, empty, or wildcard-only input yields no results.</param>
        /// <param name="searchField">
        /// Field to search. When null or empty, all fields in <c>_fields</c> are searched
        /// and the hits are sorted by relevance.
        /// </param>
        /// <returns>The mapped offers, or an empty list when the query is rejected by validation.</returns>
        private static IEnumerable <Offer> _search(string searchQuery, string searchField = "")
        {
            // Validation: a null query, or one made up only of wildcards, matches nothing.
            if (searchQuery == null || string.IsNullOrEmpty(searchQuery.Replace("*", "").Replace("?", "")))
            {
                return(new List <Offer>());
            }

            var hits_limit = 1000;

            // Searcher and analyzer are scoped with using so both are released exactly
            // once, also when parsing or searching throws.
            using (var searcher = new IndexSearcher(_directory, true))
            using (var analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET))
            {
                searcher.SetDefaultFieldSortScoring(true, true);

                // search by single field
                if (!string.IsNullOrEmpty(searchField))
                {
                    var parser = new QueryParser(Version.LUCENE_30, searchField, analyzer);
                    var query  = parseQuery(searchQuery, parser);
                    var hits   = searcher.Search(query, hits_limit).ScoreDocs;

                    // Mapping happens here, while the searcher is still open.
                    return(_mapLuceneToOfferList(hits, searcher));
                }
                // search by multiple fields (ordered by RELEVANCE)
                else
                {
                    var parser = new MultiFieldQueryParser(Version.LUCENE_30, _fields.Keys.ToArray(), analyzer);
                    var query  = parseQuery(searchQuery, parser);
                    var hits   = searcher.Search(query, null, hits_limit, Sort.RELEVANCE).ScoreDocs;

                    // Mapping happens here, while the searcher is still open.
                    return(_mapLuceneToOfferList(hits, searcher));
                }
            }
        }
        /// <summary>
        /// Produces a short preview of <paramref name="text"/> based on <c>currentQuery</c>.
        /// </summary>
        /// <param name="text">Text to build the preview from.</param>
        /// <returns>
        /// The fragments returned by the highlighter (2 requested, "..." as separator);
        /// when that is null or empty, the first 100 characters of <paramref name="text"/>,
        /// or the whole text if it is not longer than 100 characters.
        /// </returns>
        public string GeneratePreviewText(string text)
        {
            QueryScorer queryScorer    = new QueryScorer(currentQuery);
            // Empty pre/post tags: the formatter adds no markup around matches.
            IFormatter  plainFormatter = new SimpleHTMLFormatter("", "");
            Highlighter previewHighlighter = new Highlighter(plainFormatter, queryScorer)
            {
                TextFragmenter = new SimpleFragmenter(100)
            };

            TokenStream tokens  = new SnowballAnalyzer(VERSION, "English").TokenStream(URL_FN, new StringReader(text));
            string      preview = previewHighlighter.GetBestFragments(tokens, text, 2, "...");

            if (!string.IsNullOrEmpty(preview))
            {
                return preview;
            }

            // Nothing came back from the highlighter: fall back to the start of the text.
            return text.Length > 100 ? text.Substring(0, 100) : text;
        }
Beispiel #7
0
        /// <summary>
        /// Prints a sample string and then passes it, together with each of several
        /// analyzers in turn, to <c>DisplayTokens</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Action <Analyzer, String> show = DisplayTokens;

            var version = Lucene.Net.Util.Version.LUCENE_30;
            var text    = "Høje Taastrup Århus René";

            Console.WriteLine("Original string: {0}", text);
            Console.WriteLine();

            // Factories rather than instances: each analyzer is created right before it
            // is displayed, in the order listed.
            var analyzerFactories = new Func <Analyzer>[]
            {
                () => new KeywordAnalyzer(),
                () => new WhitespaceAnalyzer(),
                () => new SimpleAnalyzer(),
                () => new StopAnalyzer(version),
                () => new StandardAnalyzer(version),
                () => new SnowballAnalyzer(Version.LUCENE_30, "Danish"), // http://snowball.tartarus.org/
                () => new TestAnalyzer(version),
            };

            foreach (var createAnalyzer in analyzerFactories)
            {
                show(createAnalyzer(), text);
            }

            // Not exercised here: LowerCaseKeywordAnalyzer, EdgeNGramAnalyzer(version),
            // ReverseAnalyzer(version).
            // PerFieldAnalyzerWrapper: different fields require different analyzers.
        }
        /// <summary>
        /// Resolves this field's index settings from its attributes and then runs the
        /// attribute-declared property transformers over <c>Value</c>.
        /// </summary>
        /// <param name="dictionary">
        /// Passed to <c>ObjectDumper.DoTransform</c>; a new empty dictionary is used when null.
        /// </param>
        /// <param name="allowedTransformers">Passed to <c>ObjectDumper.DoTransform</c> unchanged.</param>
        public void Transform(Dictionary <string, object> dictionary = null, List <Type> allowedTransformers = null)
        {
            // Normalize missing inputs to empty collections so the queries below can run.
            if (dictionary == null)
            {
                dictionary = new Dictionary <string, object>();
            }
            if (Attributes == null)
            {
                Attributes = new object[] { };
            }
            if (ParentListAttributes == null)
            {
                ParentListAttributes = new object[] { };
            }

            // Find field settings: exact IndexSettings instances on the field itself,
            // falling back to those of the parent list when the field has none.
            var settings           = Attributes.Where(x => x.GetType() == typeof(IndexSettings));
            var parentListSettings = ParentListAttributes.Where(x => x.GetType() == typeof(IndexSettings));

            if (!settings.Any() && parentListSettings.Any())
            {
                settings = parentListSettings;
            }
            if (settings.Any())
            {
                var sattr = (IndexSettings)settings.First();
                KeepValueCasing   = !sattr.LowerCaseValue;
                Ignore            = sattr.Ignore;
                Spatial           = sattr.Spatial;
                FieldIndexSetting = sattr.FieldIndexSetting;
                FieldStoreSetting = sattr.FieldStoreSetting;
                if (sattr.Analyzer != null)
                {
                    if (sattr.Analyzer == typeof(SnowballAnalyzer))
                    {
                        // SnowballAnalyzer is built explicitly with the "English" stemmer name.
                        Analyzer = new SnowballAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48, "English");
                    }
                    else
                    {
                        try
                        {
                            // Try a constructor that takes the Lucene version first.
                            Analyzer = (Analyzer)Activator.CreateInstance(sattr.Analyzer, Lucene.Net.Util.LuceneVersion.LUCENE_48);
                        }
                        catch (MissingMethodException)
                        {
                            // No such constructor: fall back to the parameterless one.
                            Analyzer = (Analyzer)Activator.CreateInstance(sattr.Analyzer);
                        }
                    }
                }
            }
            else
            {
                // No IndexSettings anywhere: keep casing and use the FieldSettings defaults.
                KeepValueCasing   = true;
                FieldIndexSetting = FieldSettings.FieldIndexSetting;
                FieldStoreSetting = FieldSettings.FieldStoreSetting;
            }

            // Apply transforms: collect the attributes whose type implements the generic
            // I_Property_Transformer<,> interface.
            var tattr = Attributes
                        .Where(x => x.GetType().GetInterfaces()
                               .Any(y => y.IsGenericType && y.GetGenericTypeDefinition() == typeof(I_Property_Transformer <,>))
                               ).ToList();

            OriginalValue = Value;
            var task = ObjectDumper.DoTransform(tattr, Type, Model, Key, UniqueKey, Value, dictionary, allowedTransformers: allowedTransformers);

            // This method is synchronous, so it blocks until the transform task finishes
            // and only then reads the result.
            task.GetAwaiter().GetResult();
            Value = task.Result;
        }
    }
        /// <summary>
        /// Creates the analyzer around a Spanish Snowball analyzer that uses a
        /// caller-supplied stop-word set.
        /// </summary>
        /// <param name="stop_words">
        /// Stop words handed to the Snowball analyzer and stored in <c>STOP_WORDS</c>.
        /// </param>
        public SpanishAnalyzer(ISet <string> stop_words)
        {
            var spanishSnowball = new SnowballAnalyzer(
                Lucene.Net.Util.Version.LUCENE_30,
                "Spanish",
                stop_words);

            analyzer   = spanishSnowball;
            STOP_WORDS = stop_words;
        }
 /// <summary>
 /// Creates the analyzer around a Spanish Snowball analyzer; no stop-word set is
 /// passed to it.
 /// </summary>
 public SpanishAnalyzer()
 {
     var spanishSnowball = new SnowballAnalyzer(
         Lucene.Net.Util.Version.LUCENE_30,
         "Spanish");

     analyzer = spanishSnowball;
 }
Beispiel #11
0
        /// <summary>
        /// Picks an analyzer from the first two characters of the file name in
        /// <paramref name="filePath"/>, compared case-insensitively as a language code.
        /// </summary>
        /// <param name="filePath">Path whose file name is expected to start with a two-letter language code.</param>
        /// <returns>
        /// The analyzer mapped to the language code, or a <c>StandardAnalyzer</c> when the
        /// code is not recognized or the file name is missing or shorter than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath)
        {
            // Substring(0, 2) throws on names shorter than two characters, and
            // Path.GetFileName returns null for a null path; both fall back to the default.
            string fileName     = Path.GetFileName(filePath) ?? string.Empty;
            string languageCode = fileName.Length >= 2
                                  ? fileName.Substring(0, 2).ToLowerInvariant()
                                  : string.Empty;

            switch (languageCode)
            {
            case "zh":
                return new ChineseAnalyzer();

            case "cs":
                return new CzechAnalyzer();

            case "da":
                return new SnowballAnalyzer("Danish");

            case "nl":
                return new SnowballAnalyzer("Dutch");

            case "en":
                return new SnowballAnalyzer("English");

            case "fi":
                return new SnowballAnalyzer("Finnish");

            case "fr":
                return new SnowballAnalyzer("French");

            case "de":
                return new SnowballAnalyzer("German");

            case "it":
                return new SnowballAnalyzer("Italian");

            case "ja":
            case "ko":
                return new CJKAnalyzer();

            case "no":
                return new SnowballAnalyzer("Norwegian");

            case "pt":
                return new SnowballAnalyzer("Portuguese");

            case "ru":
                return new SnowballAnalyzer("Russian");

            case "es":
                return new SnowballAnalyzer("Spanish");

            case "sv": // ISO 639-1 code for Swedish
            case "se": // accepted as well so existing file names keep working
                return new SnowballAnalyzer("Swedish");

            default:
                return new StandardAnalyzer();
            }
        }
Beispiel #12
0
        /// <summary>
        /// Picks an analyzer from the first two characters of the file name in
        /// <paramref name="filePath"/>, compared case-insensitively as a language code,
        /// and reports whether that language is flagged as right-to-left.
        /// </summary>
        /// <param name="filePath">Path whose file name is expected to start with a two-letter language code.</param>
        /// <param name="isRTL">Set to true for the "ar" and "he" codes, false otherwise.</param>
        /// <returns>
        /// The analyzer mapped to the language code, or a <c>StandardAnalyzer</c> when the
        /// code is not recognized or the file name is missing or shorter than two characters.
        /// </returns>
        private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
        {
            isRTL = false;

            // Substring(0, 2) throws on names shorter than two characters, and
            // Path.GetFileName returns null for a null path; both fall back to the default.
            string fileName     = Path.GetFileName(filePath) ?? string.Empty;
            string languageCode = fileName.Length >= 2
                                  ? fileName.Substring(0, 2).ToLowerInvariant()
                                  : string.Empty;

            switch (languageCode)
            {
            case "zh":
                return new ChineseAnalyzer();

            case "cs":
                return new CzechAnalyzer();

            case "da":
                return new SnowballAnalyzer("Danish");

            case "nl":
                return new SnowballAnalyzer("Dutch");

            case "en":
                return new SnowballAnalyzer("English");

            case "fi":
                return new SnowballAnalyzer("Finnish");

            case "fr":
                return new SnowballAnalyzer("French");

            case "de":
                return new SnowballAnalyzer("German");

            case "it":
                return new SnowballAnalyzer("Italian");

            case "ja":
            case "ko":
                return new CJKAnalyzer();

            case "no":
                return new SnowballAnalyzer("Norwegian");

            case "pt":
                return new SnowballAnalyzer("Portuguese");

            case "ru":
                return new SnowballAnalyzer("Russian");

            case "es":
                return new SnowballAnalyzer("Spanish");

            case "sv": // ISO 639-1 code for Swedish
            case "se": // accepted as well so existing file names keep working
                return new SnowballAnalyzer("Swedish");

            case "ar":
                isRTL = true;
                // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
                return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

            case "he":
            {
                isRTL = true;
                string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
                if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
                {
                    try
                    {
                        return new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                    }
                    catch
                    {
                        // Deliberate best effort: if the morphological analyzer cannot be
                        // created from the configured path, use the simple analyzer below.
                    }
                }
                return new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
            }

            default:
                return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            }
        }