private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) {
    return ad.Analyzers(a => a
            .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
            .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", TLD_STOPWORDS_TOKEN_FILTER, EDGE_NGRAM_TOKEN_FILTER, "unique").Tokenizer("keyword"))
            .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
            .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
            .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
            .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
            .Custom(STANDARDPLUS_ANALYZER, c => c.Filters(STANDARDPLUS_TOKEN_FILTER, "lowercase", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
            .Custom(LOWER_KEYWORD_ANALYZER, c => c.Filters("lowercase").Tokenizer("keyword"))
            .Custom(HOST_ANALYZER, c => c.Filters("lowercase").Tokenizer(HOST_TOKENIZER))
            .Custom(URL_PATH_ANALYZER, c => c.Filters("lowercase").Tokenizer(URL_PATH_TOKENIZER)))
        .TokenFilters(f => f
            .EdgeNGram(EDGE_NGRAM_TOKEN_FILTER, p => p.MaxGram(50).MinGram(2).Side(EdgeNGramSide.Front))
            .PatternCapture(EMAIL_TOKEN_FILTER, p => p.PreserveOriginal().Patterns("(\\w+)", "(\\p{L}+)", "(\\d+)", "@(.+)", "@(.+)\\.", "(.+)@"))
            .PatternCapture(STANDARDPLUS_TOKEN_FILTER, p => p.PreserveOriginal().Patterns(
                @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)",
                @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)",
                @"([^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+[\.\/\\][^\.\(\)\[\]\/\\\{\}\?=&;:\<\>]+)"))
            .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.PreserveOriginal().Patterns(@"^(\w+)", @"\.(\w+)", @"([^\(\)]+)"))
            .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
            .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
            .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
            .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
            .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2"))
            .Stop(TLD_STOPWORDS_TOKEN_FILTER, p => p.StopWords("com", "net", "org", "info", "me", "edu", "mil", "gov", "biz", "co", "io", "dev"))
            .WordDelimiter(ALL_WORDS_DELIMITER_TOKEN_FILTER, p => p.CatenateNumbers().PreserveOriginal().CatenateAll().CatenateWords()))
        .Tokenizers(t => t
            .CharGroup(COMMA_WHITESPACE_TOKENIZER, p => p.TokenizeOnCharacters(",", "whitespace"))
            .CharGroup(URL_PATH_TOKENIZER, p => p.TokenizeOnCharacters("/", "-", "."))
            .CharGroup(HOST_TOKENIZER, p => p.TokenizeOnCharacters("."))
            .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.')));
}
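The four VERSION_PAD* pattern_replace filters above left-pad each numeric segment of a version string to five digits so that padded versions sort lexically in the same order they compare numerically. A minimal sketch of checking this through the _analyze API, assuming the analysis has already been applied to an index; the index name, the analyzer's string value ("version_index"), and the sample version are illustrative only:

// Hypothetical check: run a sample version through the version index analyzer via _analyze.
// Assumes "events-v1" was created with the analysis above and VERSION_INDEX_ANALYZER == "version_index".
var response = client.Indices.Analyze(a => a
    .Index("events-v1")
    .Analyzer("version_index")
    .Text("1.2.3-beta2"));

// Single-digit segments are padded with four zeros (PAD1), two-digit segments with three (PAD2), and so on,
// e.g. "1.2.3-beta2" becomes "00001.00002.00003-beta2" before the capture filter emits its prefixes.
foreach (var token in response.Tokens)
    Console.WriteLine(token.Token);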
public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis) {
    const string lowercase = nameof(lowercase);

    // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-edgengram-tokenizer.html
    return analysis
        .Analyzers(a => a
            .Custom(IndexAnalyzerName, c => c
                .Tokenizer(IndexAnalyzerName)
                .Filters(lowercase))
            .Custom(SearchAnalyzerName, c => c
                .Tokenizer(lowercase)))
        .Tokenizers(t => t
            .EdgeNGram(IndexAnalyzerName, e => e
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));
}
protected static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis) {
    return analysis
        .Analyzers(a => a
            .Custom("html_stripper", cc => cc
                .Filters("eng_stopwords", "trim", "lowercase")
                .CharFilters("html_strip")
                .Tokenizer("autocomplete"))
            .Custom("keywords_wo_stopwords", cc => cc
                .Filters("eng_stopwords", "trim", "lowercase")
                .CharFilters("html_strip")
                .Tokenizer("key_tokenizer"))
            .Custom("autocomplete", cc => cc
                .Filters("eng_stopwords", "trim", "lowercase")
                .Tokenizer("autocomplete")))
        .Tokenizers(tdesc => tdesc
            .Keyword("key_tokenizer", t => t)
            .EdgeNGram("autocomplete", e => e
                .MinGram(3)
                .MaxGram(15)
                .TokenChars(TokenChar.Letter, TokenChar.Digit)))
        .TokenFilters(f => f
            .Stop("eng_stopwords", lang => lang
                .StopWords("_english_")));
}
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) {
    return ad.Analyzers(a => a
            .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
            .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
            .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace")))
        .Tokenizers(t => t
            .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+")));
}
private static AnalysisDescriptor CreateAnalysisDescriptor(string charHtmlFilter, TokenFiltersDescriptor tokenFilters, AnalyzersDescriptor analyzers) {
    var analysisDescriptor = new AnalysisDescriptor();
    analysisDescriptor.CharFilters(c => c.HtmlStrip(charHtmlFilter));
    analysisDescriptor.TokenFilters(t => tokenFilters);
    analysisDescriptor.Analyzers(a => analyzers);
    return analysisDescriptor;
}
protected override AnalysisDescriptor Contribute(AnalysisDescriptor descriptor, IEnumerable<KeyValuePair<string, AnalyzerBase>> build) {
    return descriptor.Analyzers(a => {
        foreach (var item in build.Where(x => CanContribute(x, a))) {
            a.Add(item.Key, item.Value);
        }
        return a;
    });
}
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) {
    return ad.Analyzers(bases => {
        bases.Add(COMMA_WHITESPACE_ANALYZER, new PatternAnalyzer { Pattern = @"[,\s]+" });
        bases.Add(EMAIL_ANALYZER, new CustomAnalyzer { Tokenizer = "keyword", Filter = new[] { EMAIL_TOKEN_FILTER, "lowercase", "unique" } });
        bases.Add(VERSION_INDEX_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique" } });
        bases.Add(VERSION_SEARCH_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase" } });
        bases.Add(WHITESPACE_LOWERCASE_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "lowercase" } });
        bases.Add(TYPENAME_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { TYPENAME_TOKEN_FILTER, "lowercase", "unique" } });
        bases.Add(STANDARDPLUS_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique" } });
        return bases;
    }).TokenFilters(bases => {
        bases.Add(EMAIL_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"(\w+)", @"(\p{L}+)", @"(\d+)", @"(.+)@", @"@(.+)" } });
        bases.Add(TYPENAME_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"\.(\w+)" } });
        bases.Add(VERSION_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)" } });
        bases.Add(VERSION_PAD1_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{1})(?=\.|-|$)", Replacement = @"$10000$2" });
        bases.Add(VERSION_PAD2_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{2})(?=\.|-|$)", Replacement = @"$1000$2" });
        bases.Add(VERSION_PAD3_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{3})(?=\.|-|$)", Replacement = @"$100$2" });
        bases.Add(VERSION_PAD4_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{4})(?=\.|-|$)", Replacement = @"$10$2" });
        return bases;
    });
}
private AnalysisDescriptor BuildIndexDescriptor(AnalysisDescriptor a) {
    return a
        .Analyzers(aa => aa
            .Custom("default", descriptor => descriptor
                .Tokenizer("standard")
                .CharFilters("html_strip")
                .Filters("lowercase", "ru_RU", "en_US")))
        .TokenFilters(descriptor => descriptor
            .Hunspell("ru_RU", hh => hh.Dedup().Locale("ru_RU"))
            .Hunspell("en_US", hh => hh.Dedup().Locale("en_US")));
}
public static IAnalysis AddSearchAnalyzerFor(this AnalysisDescriptor analysis, string indexName) {
    var indexAnalyzerName = $"{indexName}_search";
    var indexAnalyzerKey = "lowercase";

    return analysis
        .Analyzers(a => a
            .Custom(indexAnalyzerName, c => c
                .Tokenizer(indexAnalyzerName)
                .Filters(indexAnalyzerKey)))
        .Tokenizers(t => t
            .EdgeNGram(indexAnalyzerName, e => e
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));
}
private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis) => analysis
    .Analyzers(a => a
        .Custom("my-search-analyzer", ca => ca
            .Tokenizer("standard")
            .Filters("lowercase", "russian_morphology", "english_morphology", "my_stopwords")))
    .TokenFilters(tf => tf
        .Stop("my_stopwords", s => s
            .StopWords(
                "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в", "вам", "вас", "весь", "во", "вот",
                "все", "всего", "всех", "вы", "где", "да", "даже", "для", "до", "его", "ее", "если", "есть", "еще", "же",
                "за", "здесь", "и", "из", "или", "им", "их", "к", "как", "ко", "когда", "кто", "ли", "либо", "мне", "может",
                "мы", "на", "надо", "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об", "однако", "он",
                "она", "они", "оно", "от", "очень", "по", "под", "при", "с", "со", "так", "также", "такой", "там", "те",
                "тем", "то", "того", "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей", "чем", "что",
                "чтобы", "чье", "чья", "эта", "эти", "это", "я",
                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not",
                "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was",
                "will", "with")));
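The russian_morphology and english_morphology filters are not built into Elasticsearch; they come from the morphology analysis plugin, so this analyzer only resolves when that plugin is installed on every node. A small sketch for sanity-checking the analyzer through the _analyze API once the index exists; the index name and sample text are placeholders:

// Hypothetical smoke test: feed mixed Russian/English text through "my-search-analyzer".
// Assumes the analysis above was applied when creating "articles"; without the morphology
// plugin, index creation itself would fail on the unknown filters.
var analyzeResponse = client.Indices.Analyze(a => a
    .Index("articles")
    .Analyzer("my-search-analyzer")
    .Text("Новые статьи about searching"));

// Tokens listed in my_stopwords are dropped; the rest are lowercased and reduced to base forms.
foreach (var token in analyzeResponse.Tokens)
    Console.WriteLine($"{token.Token} (position {token.Position})");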
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) {
    return ad.Analyzers(a => a
            .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
            .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", "unique").Tokenizer("keyword"))
            .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
            .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
            .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace"))
            .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
            .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER)))
        .TokenFilters(f => f
            .PatternCapture(EMAIL_TOKEN_FILTER, p => p.Patterns(@"(\w+)", @"(\p{L}+)", @"(\d+)", "(.+)@", "@(.+)"))
            .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)"))
            .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
            .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
            .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
            .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
            .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2")))
        .Tokenizers(t => t
            .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))
            .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.')));
}
/// <summary>
/// I've moved this into an extension method for reuse and a
/// clearer understanding of the custom analyzer we are writing.
/// </summary>
/// <param name="analysis"></param>
/// <returns></returns>
public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis) {
    const string lowercase = nameof(lowercase);

    return analysis
        .Analyzers(a => a
            .Custom(IndexAnalyzerName, c => c
                .Tokenizer(IndexAnalyzerName)
                .Filters(lowercase))
            .Custom(SearchAnalyzerName, c => c
                .Tokenizer(lowercase)))
        .Tokenizers(t => t
            .EdgeNGram(IndexAnalyzerName, e => e
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));
}
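A minimal sketch of how this extension method might be wired into index creation, assuming NEST 7.x, that IndexAnalyzerName and SearchAnalyzerName are the constants defined next to the extension, and a hypothetical Product document type:

// Hypothetical usage: register the analysis, then point a text field at the
// edge-n-gram analyzer for indexing and the lowercase analyzer for search.
var createResponse = client.Indices.Create("products", c => c
    .Settings(s => s
        .Analysis(a => a.AddSearchAnalyzer()))
    .Map<Product>(m => m
        .Properties(p => p
            .Text(t => t
                .Name(n => n.Name)
                .Analyzer(IndexAnalyzerName)
                .SearchAnalyzer(SearchAnalyzerName)))));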
protected override IAnalysis FluentAnalysis(AnalysisDescriptor an) => an.Analyzers(d => AssertionSetup.Fluent(AssertionSetup.Name, d));
/// <summary>
/// Job analysis descriptions.
/// </summary>
private AnalysisDescriptor GetJobsAnalysisDescriptor(LanguageCode languageCode = LanguageCode.EN) {
    var descriptor = new AnalysisDescriptor();

    descriptor.TokenFilters(cf => cf.Add("shingle_title", new ShingleTokenFilter()));
    descriptor.TokenFilters(f => f.Add("job_stopfilter", new StopTokenFilter { Stopwords = new List<string> { "job", "jobs" } }));

    // Title analyzer
    var titleAnalyzer = GetTitleAnalyzer(languageCode);
    descriptor.Analyzers(a => a.Add("job_title", titleAnalyzer));

    // Path analyzer
    var pathAnalyzer = GetPathAnalyzer();
    descriptor.Analyzers(a => a.Add("path", pathAnalyzer));

    // Lowercase analyzer
    var lowercaseAnalyzer = GetLowercaseAnalyzer(languageCode);
    descriptor.Analyzers(a => a.Add("lowercase", lowercaseAnalyzer));

    // Snowball token filter
    var snowballPorterFilter = GetStemmerTokenFilter(languageCode);
    descriptor.TokenFilters(d => d.Add("snowballPorterFilter", snowballPorterFilter));

    // Stopwords filter
    var stopwordFilter = GetStopwordFilter(languageCode);
    descriptor.TokenFilters(d => d.Add("stopwordFilter", stopwordFilter));

    // Word delimiter token filter
    var wdtFitler = GetWordDelimeterTokenFilter(languageCode);
    descriptor.TokenFilters(d => d.Add("wdtFitler", wdtFitler));

    // Job default analyzer
    var jobDefaultAnalyzer = GetJobDefaultAnanyzer(languageCode);
    descriptor.Analyzers(a => a.Add("jobDefaultAnalyzer", jobDefaultAnalyzer));

    // Job default with delimiter analyzer
    var jobDefaultWithDelimiterAnalyzer = GetJobDefaultWithDelimiterAnalyzer(languageCode);
    descriptor.Analyzers(a => a.Add("jobDefaultWithDelimiterAnalyzer", jobDefaultWithDelimiterAnalyzer));

    // Title suggestion analyzer
    var titleSuggestAnalyzer = GetTitleSuggestAnalyzer(languageCode);
    descriptor.Analyzers(a => a.Add("titleSuggestAnalyzer", titleSuggestAnalyzer));

    // Country: match the first node in the hierarchy path
    descriptor.Tokenizers(t => t.Add("country_path", new PatternTokenizer { Pattern = "^(/[0-9]+/).*", Group = 1 }));
    descriptor.Analyzers(a => a.Add("country_path", new CustomAnalyzer { Tokenizer = "country_path" }));

    // Region: match the first and second nodes in the hierarchy path
    descriptor.Tokenizers(t => t.Add("region_path", new PatternTokenizer { Pattern = "^(/[0-9]+/[0-9]+/).*", Group = 1 }));
    descriptor.Analyzers(a => a.Add("region_path", new CustomAnalyzer { Tokenizer = "region_path" }));

    // City: match the first four or first three nodes in the path, since cities in some countries lack a second-level division
    descriptor.Tokenizers(t => t.Add("city_path", new PatternTokenizer { Pattern = "^(/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/[0-9]+/|/[0-9]+/[0-9]+/[0-9]+/).*", Group = 1 }));
    descriptor.Analyzers(a => a.Add("city_path", new CustomAnalyzer { Tokenizer = "city_path" }));

    return descriptor;
}
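The country_path, region_path, and city_path tokenizers each emit a single token: the leading portion of a location hierarchy path captured by group 1 of their pattern. A hedged sketch of what that looks like through the _analyze API; the index name and the sample path are made up:

// Hypothetical example: "/5/23/105/887/" is a country/region/city hierarchy path.
// country_path captures "/5/", region_path "/5/23/", and city_path the longest
// matching three- to six-level prefix (here the full four-level path).
var pathTokens = client.Indices.Analyze(a => a
    .Index("jobs")
    .Analyzer("country_path")
    .Text("/5/23/105/887/"));

// Expected single token for the country analyzer: "/5/"
foreach (var token in pathTokens.Tokens)
    Console.WriteLine(token.Token);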