private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", TLD_STOPWORDS_TOKEN_FILTER, EDGE_NGRAM_TOKEN_FILTER, "unique").Tokenizer("keyword"))
                         .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
                         .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
                         .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters(STANDARDPLUS_TOKEN_FILTER, "lowercase", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
                         .Custom(LOWER_KEYWORD_ANALYZER, c => c.Filters("lowercase").Tokenizer("keyword"))
                         .Custom(HOST_ANALYZER, c => c.Filters("lowercase").Tokenizer(HOST_TOKENIZER))
                         .Custom(URL_PATH_ANALYZER, c => c.Filters("lowercase").Tokenizer(URL_PATH_TOKENIZER)))
            .TokenFilters(f => f
                          .EdgeNGram(EDGE_NGRAM_TOKEN_FILTER, p => p.MaxGram(50).MinGram(2).Side(EdgeNGramSide.Front))
                          .PatternCapture(EMAIL_TOKEN_FILTER, p => p.PreserveOriginal().Patterns("(\\w+)", "(\\p{L}+)", "(\\d+)", "@(.+)", "@(.+)\\.", "(.+)@"))
                          .PatternCapture(STANDARDPLUS_TOKEN_FILTER, p => p.PreserveOriginal().Patterns(
                                              @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)",
                                              @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)",
                                              @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)"
                                              ))
                          .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.PreserveOriginal().Patterns(@"^(\w+)", @"\.(\w+)", @"([^\(\)]+)"))
                          .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
                          .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
                          .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
                          .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
                          .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2"))
                          .Stop(TLD_STOPWORDS_TOKEN_FILTER, p => p.StopWords("com", "net", "org", "info", "me", "edu", "mil", "gov", "biz", "co", "io", "dev"))
                          .WordDelimiter(ALL_WORDS_DELIMITER_TOKEN_FILTER, p => p.CatenateNumbers().PreserveOriginal().CatenateAll().CatenateWords()))
            .Tokenizers(t => t
                        .CharGroup(COMMA_WHITESPACE_TOKENIZER, p => p.TokenizeOnCharacters(",", "whitespace"))
                        .CharGroup(URL_PATH_TOKENIZER, p => p.TokenizeOnCharacters("/", "-", "."))
                        .CharGroup(HOST_TOKENIZER, p => p.TokenizeOnCharacters("."))
                        .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.'))));
 }
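The four VERSION_PAD*_TOKEN_FILTERs left-pad every numeric segment of one to four digits to a fixed width of five, so version strings order correctly when compared as plain terms, and VERSION_TOKEN_FILTER additionally emits the major and major.minor prefixes of each padded version. A minimal sketch of the effect, assuming a NEST 7.x client and that the analysis above has been registered in an illustrative index named "my-index":

    var analyzed = client.Indices.Analyze(a => a
        .Index("my-index")                    // illustrative index name, an assumption
        .Analyzer(VERSION_INDEX_ANALYZER)
        .Text("1.2.10"));

    // Roughly: "00001.00002.00010" (the padded original) plus the prefix
    // captures "00001" and "00001.00002", lowercased and de-duplicated.
    foreach (var token in analyzed.Tokens)
        Console.WriteLine(token.Token);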
Example #2
        public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis)
        {
            const string lowercase = nameof(lowercase);

            // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-edgengram-tokenizer.html

            return
                (analysis
                 .Analyzers(a => a
                            .Custom(IndexAnalyzerName, c => c
                                    .Tokenizer(IndexAnalyzerName)
                                    .Filters(lowercase)
                                    )
                            .Custom(SearchAnalyzerName, c =>
                                    c.Tokenizer(lowercase)
                                    )
                            )
                 .Tokenizers(t => t
                             .EdgeNGram(IndexAnalyzerName, e => e
                                        .MinGram(1)
                                        .MaxGram(20)
                                        .TokenChars(TokenChar.Letter)
                                        )
                             ));
        }
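The two analyzers are deliberately asymmetric, which is what makes search-as-you-type work: IndexAnalyzerName runs the edge n-gram tokenizer (front-anchored fragments of 1 to 20 letters), while SearchAnalyzerName only lowercases the query, so a partial input matches the stored fragments without being exploded itself. A hedged sketch of wiring the pair to a text field, assuming a NEST 7.x client and an illustrative Product type with a Title property (the "products" index name is also an assumption):

    // Sketch only: "products", Product and Title are illustrative; IndexAnalyzerName
    // and SearchAnalyzerName are the constants referenced by AddSearchAnalyzer above.
    var createResponse = client.Indices.Create("products", c => c
        .Settings(s => s.Analysis(a => a.AddSearchAnalyzer()))
        .Map<Product>(m => m.Properties(p => p
            .Text(t => t
                .Name(n => n.Title)
                .Analyzer(IndexAnalyzerName)              // edge n-grams at index time
                .SearchAnalyzer(SearchAnalyzerName)))));  // plain lowercase at query time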
Example #3
 protected static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis)
 {
     return(analysis.Analyzers(a => a
                               .Custom("html_stripper", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .CharFilters("html_strip")
                                       .Tokenizer("autocomplete")
                                       )
                               .Custom("keywords_wo_stopwords", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .CharFilters("html_strip")
                                       .Tokenizer("key_tokenizer")
                                       )
                               .Custom("autocomplete", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .Tokenizer("autocomplete")
                                       )
                               )
            .Tokenizers(tdesc => tdesc
                        .Keyword("key_tokenizer", t => t)
                        .EdgeNGram("autocomplete", e => e
                                   .MinGram(3)
                                   .MaxGram(15)
                                   .TokenChars(TokenChar.Letter, TokenChar.Digit)
                                   )
                        )
            .TokenFilters(f => f
                          .Stop("eng_stopwords", lang => lang
                                .StopWords("_english_")
                                )
                          ));
 }
Example #4
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace")))
            .Tokenizers(t => t
                        .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))));
 }
Example #5
        private static AnalysisDescriptor CreateAnalysisDescriptor(string charHtmlFilter, TokenFiltersDescriptor tokenFilters, AnalyzersDescriptor analyzers)
        {
            var analysisDescriptor = new AnalysisDescriptor();

            analysisDescriptor.CharFilters(c => c.HtmlStrip(charHtmlFilter));
            analysisDescriptor.TokenFilters(t => tokenFilters);
            analysisDescriptor.Analyzers(a => analyzers);
            return(analysisDescriptor);
        }
 protected override AnalysisDescriptor Contribute(AnalysisDescriptor descriptor, IEnumerable <KeyValuePair <string, AnalyzerBase> > build)
 {
     return(descriptor.Analyzers(a =>
     {
         foreach (var item in build.Where(x => CanContribute(x, a)))
         {
             a.Add(item.Key, item.Value);
         }
         return a;
     }));
 }
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(bases => {
         bases.Add(COMMA_WHITESPACE_ANALYZER, new PatternAnalyzer {
             Pattern = @"[,\s]+"
         });
         bases.Add(EMAIL_ANALYZER, new CustomAnalyzer {
             Tokenizer = "keyword", Filter = new[] { EMAIL_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(VERSION_INDEX_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(VERSION_SEARCH_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase" }
         });
         bases.Add(WHITESPACE_LOWERCASE_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { "lowercase" }
         });
         bases.Add(TYPENAME_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { TYPENAME_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(STANDARDPLUS_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { "standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique" }
         });
         return bases;
     }).TokenFilters(bases => {
         bases.Add(EMAIL_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"(\w+)", @"(\p{L}+)", @"(\d+)", @"(.+)@", @"@(.+)" }
         });
         bases.Add(TYPENAME_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"\.(\w+)" }
         });
         bases.Add(VERSION_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)" }
         });
         bases.Add(VERSION_PAD1_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{1})(?=\.|-|$)", Replacement = @"$10000$2"
         });
         bases.Add(VERSION_PAD2_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{2})(?=\.|-|$)", Replacement = @"$1000$2"
         });
         bases.Add(VERSION_PAD3_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{3})(?=\.|-|$)", Replacement = @"$100$2"
         });
         bases.Add(VERSION_PAD4_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{4})(?=\.|-|$)", Replacement = @"$10$2"
         });
         return bases;
     }));
 }
 private AnalysisDescriptor BuildIndexDescriptor(AnalysisDescriptor a)
 {
     return
         (a
          .Analyzers(aa => aa
                     .Custom("default",
                             descriptor =>
                             descriptor.Tokenizer("standard")
                             .CharFilters("html_strip")
                             .Filters("lowercase", "ru_RU", "en_US"))
                     ).TokenFilters(descriptor =>
                                    descriptor.Hunspell("ru_RU", hh => hh.Dedup().Locale("ru_RU"))
                                    .Hunspell("en_US", hh => hh.Dedup().Locale("en_US")))
         );
 }
        public static IAnalysis AddSearchAnalyzerFor(this AnalysisDescriptor analysis, string indexName)
        {
            var indexAnalyzerName = $"{indexName}_search";
            var indexAnalyzerKey  = "lowercase";

            return
                (analysis
                 .Analyzers(a => a
                            .Custom(indexAnalyzerName, c => c
                                    .Tokenizer(indexAnalyzerName)
                                    .Filters(indexAnalyzerKey)
                                    )
                            )
                 .Tokenizers(t => t
                             .EdgeNGram(indexAnalyzerName, e => e
                                        .MinGram(1)
                                        .MaxGram(20)
                                        .TokenChars(TokenChar.Letter)
                                        )
                             ));
        }
Example #10
 private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis)
 => analysis
 .Analyzers(a => a
            .Custom("my-search-analyzer", ca => ca
                    .Tokenizer("standard")
                    .Filters("lowercase", "russian_morphology", "english_morphology", "my_stopwords")
                    ))
 .TokenFilters(tf => tf
               .Stop("my_stopwords", s => s
                     .StopWords("а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в", "вам", "вас",
                                "весь", "во", "вот", "все", "всего", "всех", "вы", "где", "да", "даже", "для", "до", "его",
                                "ее", "если", "есть", "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
                                "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо", "наш", "не", "него",
                                "нее", "нет", "ни", "них", "но", "ну", "о", "об", "однако", "он", "она", "они", "оно", "от",
                                "очень", "по", "под", "при", "с", "со", "так", "также", "такой", "там", "те", "тем", "то",
                                "того", "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей", "чем",
                                "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я", "a", "an", "and", "are", "as", "at",
                                "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
                                "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was",
                                "will", "with"
                                )));
Example #11
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", "unique").Tokenizer("keyword"))
                         .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
                         .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace"))
                         .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER)))
            .TokenFilters(f => f
                          .PatternCapture(EMAIL_TOKEN_FILTER, p => p.Patterns(@"(\w+)", @"(\p{L}+)", @"(\d+)", "(.+)@", "@(.+)"))
                          .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)"))
                          .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
                          .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
                          .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
                          .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
                          .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2")))
            .Tokenizers(t => t
                        .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))
                        .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.'))));
 }
Example #12
        /// <summary>
        /// I've moved this into an extension method for reuse
        /// and to make the custom analyzer we are writing easier to follow.
        /// </summary>
        /// <param name="analysis"></param>
        /// <returns></returns>
        public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis)
        {
            const string lowercase = nameof(lowercase);

            return
                (analysis
                 .Analyzers(a => a
                            .Custom(IndexAnalyzerName, c => c
                                    .Tokenizer(IndexAnalyzerName)
                                    .Filters(lowercase)
                                    )
                            .Custom(SearchAnalyzerName, c =>
                                    c.Tokenizer(lowercase)
                                    )
                            )
                 .Tokenizers(t => t
                             .EdgeNGram(IndexAnalyzerName, e => e
                                        .MinGram(1)
                                        .MaxGram(20)
                                        .TokenChars(TokenChar.Letter)
                                        )
                             ));
        }
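Because the extension registers both analyzers, the query side needs nothing analyzer-specific: the field's configured search analyzer is applied automatically. A minimal sketch, reusing the hypothetical Product/Title mapping sketched under Example #2:

    // Hypothetical query: "ela" is lowercased by SearchAnalyzerName and matched
    // against the edge n-grams that IndexAnalyzerName produced at index time.
    var result = client.Search<Product>(s => s
        .Index("products")
        .Query(q => q
            .Match(m => m
                .Field(f => f.Title)
                .Query("ela"))));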
 protected override IAnalysis FluentAnalysis(AnalysisDescriptor an) =>
 an.Analyzers(d => AssertionSetup.Fluent(AssertionSetup.Name, d));
Example #14
        /// <summary>
        /// Builds the analysis descriptor (analyzers, tokenizers, token filters) for the jobs index.
        /// </summary>
        private AnalysisDescriptor GetJobsAnalysisDescriptor(LanguageCode languageCode = LanguageCode.EN)
        {
            var descriptor = new AnalysisDescriptor();

            descriptor.TokenFilters(cf => cf.Add("shingle_title", new ShingleTokenFilter()));

            descriptor.TokenFilters(
                f => f.Add("job_stopfilter", new StopTokenFilter {
                Stopwords = new List <string> {
                    "job", "jobs"
                }
            }));

            // Title Analyzer
            var titleAnalyzer = GetTitleAnalyzer(languageCode);

            descriptor.Analyzers(a => a.Add("job_title", titleAnalyzer));

            // Path Analyzer
            var pathAnalyzer = GetPathAnalyzer();

            descriptor.Analyzers(a => a.Add("path", pathAnalyzer));

            // Lowercase Analyzer
            var lowercaseAnalyzer = GetLowercaseAnalyzer(languageCode);

            descriptor.Analyzers(a => a.Add("lowercase", lowercaseAnalyzer));

            // Snowball Token Filter
            var snowballPorterFilter = GetStemmerTokenFilter(languageCode);

            descriptor.TokenFilters(d => d.Add("snowballPorterFilter", snowballPorterFilter));

            // Stopwords Filter
            var stopwordFilter = GetStopwordFilter(languageCode);

            descriptor.TokenFilters(d => d.Add("stopwordFilter", stopwordFilter));

            // Word Delimiter Token Filter
            var wdtFitler = GetWordDelimeterTokenFilter(languageCode);

            descriptor.TokenFilters(d => d.Add("wdtFitler", wdtFitler));

            // Job Default Analyzer
            var jobDefaultAnalyzer = GetJobDefaultAnanyzer(languageCode);

            descriptor.Analyzers(a => a.Add("jobDefaultAnalyzer", jobDefaultAnalyzer));

            // Job Default with Delimiter Analyzer
            var jobDefaultWithDelimiterAnalyzer = GetJobDefaultWithDelimiterAnalyzer(languageCode);

            descriptor.Analyzers(a => a.Add("jobDefaultWithDelimiterAnalyzer", jobDefaultWithDelimiterAnalyzer));

            // Title Suggestion Analyzer
            var titleSuggestAnalyzer = GetTitleSuggestAnalyzer(languageCode);

            descriptor.Analyzers(a => a.Add("titleSuggestAnalyzer", titleSuggestAnalyzer));

            // country, match first node in hierarchy path
            descriptor.Tokenizers(t => t.Add("country_path", new PatternTokenizer {
                Pattern = "^(/[0-9]+/).*", Group = 1
            }));
            descriptor.Analyzers(a => a.Add("country_path", new CustomAnalyzer {
                Tokenizer = "country_path"
            }));

            // region, match first and second nodes in hierarchy path
            descriptor.Tokenizers(t => t.Add("region_path", new PatternTokenizer {
                Pattern = "^(/[0-9]+/[0-9]+/).*", Group = 1
            }));
            descriptor.Analyzers(a => a.Add("region_path", new CustomAnalyzer {
                Tokenizer = "region_path"
            }));

            // city, match the first three to six nodes in the hierarchy path, since the city level sits at different depths by country (some lack a second-level division)
            descriptor.Tokenizers(t => t.Add("city_path", new PatternTokenizer {
                Pattern = "^(/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/).*", Group = 1
            }));
            descriptor.Analyzers(a => a.Add("city_path", new CustomAnalyzer {
                Tokenizer = "city_path"
            }));

            return(descriptor);
        }
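Each *_path analyzer is just its pattern tokenizer, so a location hierarchy can be indexed at country, region, or city granularity. For a hypothetical path "/1/23/456/7890/", country_path captures "/1/", region_path captures "/1/23/", and city_path captures the whole four-node prefix (its alternatives try six, five, four, then three nodes). A sketch of verifying this through the _analyze API, assuming a NEST 7.x client and that the descriptor above was applied to an index named "jobs":

    // Assumption: "jobs" is an index created with GetJobsAnalysisDescriptor().
    var countryTokens = client.Indices.Analyze(a => a
        .Index("jobs")
        .Tokenizer("country_path")        // expected single token: "/1/"
        .Text("/1/23/456/7890/"));

    var regionTokens = client.Indices.Analyze(a => a
        .Index("jobs")
        .Tokenizer("region_path")         // expected single token: "/1/23/"
        .Text("/1/23/456/7890/"));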