// Registers a shingle token filter (2–4 token shingles) plus a matching
// custom "shingle" analyzer on the standard tokenizer; on clusters >= 5.2.0
// also registers a lowercase/asciifolding normalizer.
public static IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis)
{
    analysis
        .TokenFilters(filters => filters
            .Shingle("shingle", sh => sh
                .MinShingleSize(2)
                .MaxShingleSize(4)))
        .Analyzers(an => an
            .Custom("shingle", c => c
                .Filters("standard", "shingle")
                .Tokenizer("standard")));

    // Normalizers only exist on Elasticsearch 5.2.0 and later.
    if (new SemVer.Range(">=5.2.0").IsSatisfied(TestClient.Configuration.ElasticsearchVersion))
    {
        analysis.Normalizers(norm => norm
            .Custom("my_normalizer", n => n
                .Filters("lowercase", "asciifolding")));
    }

    return analysis;
}
/// <summary>
/// Registers the full analysis chain for this index: char filters (a custom
/// mapping from <c>GetCharMapping()</c> plus a "digits" pattern-replace that
/// strips every non-digit), n-gram / length-limit / edge-n-gram token filters,
/// an n-gram tokenizer, and the user-defined custom analyzers registered under
/// the Replace, ReplaceNgram, Key, Digits and Lowercase names.
/// </summary>
/// <param name="analysis">Descriptor to configure.</param>
/// <returns>The configured descriptor, for chaining.</returns>
public static AnalysisDescriptor SetAnalysis(AnalysisDescriptor analysis) { return(analysis .CharFilters(c => c .Mapping("mapping", f => f.Mappings(GetCharMapping())) .PatternReplace("digits", f => f.Pattern("[^0-9]").Replacement("")) ) .TokenFilters(f => f .NGram("digits_ngram", t => t.MinGram(3).MaxGram(8)) .Length("length_limit", t => t.Min(1).Max(20)) .EdgeNGram("custom_edge_ngram", t => t.MinGram(1).MaxGram(50)) ) .Tokenizers(t => t .NGram("ngram", d => d.MinGram(2).MaxGram(30))) .Analyzers(a => a .UserDefined(Replace, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "lowercase", "scandinavian_folding", "unique" }, CharFilter = new[] { "mapping" } }) .UserDefined(ReplaceNgram, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "lowercase", "scandinavian_folding", "custom_edge_ngram", "unique" }, CharFilter = new[] { "mapping", "html_strip" } }) .UserDefined(Key, new CustomAnalyzer { Tokenizer = "keyword", Filter = new[] { "lowercase", "scandinavian_folding" } }) .UserDefined(Digits, new CustomAnalyzer { Tokenizer = "keyword", CharFilter = new[] { "digits" }, Filter = new[] { "digits_ngram", "unique", "length_limit" } }) .UserDefined(Lowercase, new CustomAnalyzer { Tokenizer = "keyword", Filter = new[] { "lowercase" } }) )); }
/// <summary>
/// Builds analyzers, token filters and tokenizers for searching emails,
/// version strings, type names, hosts and URL paths. The VERSION_PAD1..4
/// pattern-replace filters insert leading zeros in front of 1–4 digit
/// version components — presumably so versions compare correctly as plain
/// text (TODO confirm against the query side). The TLD stop filter drops
/// common top-level domains from email tokens.
/// </summary>
/// <param name="ad">Descriptor to configure.</param>
/// <returns>The configured descriptor.</returns>
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) { return(ad.Analyzers(a => a .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+")) .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", TLD_STOPWORDS_TOKEN_FILTER, EDGE_NGRAM_TOKEN_FILTER, "unique").Tokenizer("keyword")) .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace")) .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace")) .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace")) .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER)) .Custom(STANDARDPLUS_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER)) .Custom(LOWER_KEYWORD_ANALYZER, c => c.Filters("lowercase").Tokenizer("keyword")) .Custom(HOST_ANALYZER, c => c.Filters("lowercase").Tokenizer(HOST_TOKENIZER)) .Custom(URL_PATH_ANALYZER, c => c.Filters("lowercase").Tokenizer(URL_PATH_TOKENIZER))) .TokenFilters(f => f .EdgeNGram(EDGE_NGRAM_TOKEN_FILTER, p => p.MaxGram(50).MinGram(2).Side(EdgeNGramSide.Front)) .PatternCapture(EMAIL_TOKEN_FILTER, p => p.PreserveOriginal().Patterns("(\\w+)", "(\\p{L}+)", "(\\d+)", "@(.+)", "@(.+)\\.", "(.+)@")) .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)")) .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)")) .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2")) .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2")) 
.PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2")) .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2")) .Stop(TLD_STOPWORDS_TOKEN_FILTER, p => p.StopWords("com", "net", "org", "info", "me", "edu", "mil", "gov", "biz", "co", "io", "dev")) .WordDelimiter(ALL_WORDS_DELIMITER_TOKEN_FILTER, p => p.CatenateNumbers().PreserveOriginal().CatenateAll().CatenateWords())) .Tokenizers(t => t .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+")) .CharGroup(URL_PATH_TOKENIZER, p => p.TokenizeOnCharacters("/", "-", ".")) .CharGroup(HOST_TOKENIZER, p => p.TokenizeOnCharacters(".")) .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.')))); }
/// <summary>
/// Extension that wires up the edge-n-gram index-time analyzer and the
/// matching search-time analyzer; search input goes through the plain
/// "lowercase" tokenizer so it is not n-grammed.
/// </summary>
/// <param name="analysis">Analysis settings to extend.</param>
/// <returns>The configured analysis settings.</returns>
public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis)
{
    const string lowercase = "lowercase";

    // Names are only local registry keys; analyzer and tokenizer share one.
    // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-edgengram-tokenizer.html
    return analysis
        .Analyzers(an => an
            .Custom(IndexAnalyzerName, custom => custom
                .Tokenizer(IndexAnalyzerName)
                .Filters(lowercase))
            .Custom(SearchAnalyzerName, custom => custom
                .Tokenizer(lowercase)))
        .Tokenizers(tok => tok
            .EdgeNGram(IndexAnalyzerName, edge => edge
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));
}
// Analysis for name fields: a pattern tokenizer splitting on non-word
// characters, a word-delimiter filter that keeps the original token, and
// three custom analyzers (keyword, HTML-stripping, and an ik_max_word-based
// name analyzer).
private AnalysisDescriptor Analysis(AnalysisDescriptor analysis)
{
    return analysis
        .Tokenizers(tok => tok
            .Pattern("name-tokenizer", pat => pat.Pattern(@"\W+")))
        .TokenFilters(filt => filt
            .WordDelimiter("name-words", wd => wd
                .SplitOnCaseChange()
                .PreserveOriginal()
                .SplitOnNumerics()
                .GenerateNumberParts(false)
                .GenerateWordParts()))
        .Analyzers(an => an
            .Custom("name-keyword", c => c
                .Tokenizer("keyword")
                .Filters("lowercase"))
            .Custom("html_stripper", c => c
                .Filters("trim", "lowercase")
                .CharFilters("html_strip")
                .Tokenizer("name-tokenizer"))
            .Custom("name-analyzer", c => c
                .Filters("name-words", "lowercase")
                .Tokenizer("ik_max_word")));
}
// Shingle (2–4) filter and custom "shingle" analyzer; conditionally adds a
// lowercase/asciifolding normalizer when the version under test supports it.
public static IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis)
{
    analysis
        .TokenFilters(tf => tf
            .Shingle("shingle", sh => sh
                .MinShingleSize(2)
                .MaxShingleSize(4)))
        .Analyzers(an => an
            .Custom("shingle", c => c
                .Filters("standard", "shingle")
                .Tokenizer("standard")));

    // Normalizers are a new feature since 5.2.0.
    if (TestClient.VersionUnderTestSatisfiedBy(">=5.2.0"))
    {
        analysis.Normalizers(norm => norm
            .Custom("my_normalizer", n => n
                .Filters("lowercase", "asciifolding")));
    }

    return analysis;
}
// Shared analyzers: "html_stripper", "keywords_wo_stopwords" and
// "autocomplete", backed by a keyword tokenizer, an edge-n-gram tokenizer
// (3–15 chars, letters + digits) and an English stop-word filter.
protected static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis)
{
    return analysis
        .Analyzers(an => an
            .Custom("html_stripper", c => c
                .Filters("eng_stopwords", "trim", "lowercase")
                .CharFilters("html_strip")
                .Tokenizer("autocomplete"))
            .Custom("keywords_wo_stopwords", c => c
                .Filters("eng_stopwords", "trim", "lowercase")
                .CharFilters("html_strip")
                .Tokenizer("key_tokenizer"))
            .Custom("autocomplete", c => c
                .Filters("eng_stopwords", "trim", "lowercase")
                .Tokenizer("autocomplete")))
        .Tokenizers(tok => tok
            .Keyword("key_tokenizer", k => k)
            .EdgeNGram("autocomplete", edge => edge
                .MinGram(3)
                .MaxGram(15)
                .TokenChars(TokenChar.Letter, TokenChar.Digit)))
        .TokenFilters(tf => tf
            .Stop("eng_stopwords", stop => stop
                .StopWords("_english_")));
}
// Token filters and analyzers for autocomplete / partial search. The
// concatenate filter comes from a fork:
// https://github.com/rh78/elasticsearch-concatenate-token-filter
private IAnalysis ConfigureConcatenateAndAutocompleteAnalysis(AnalysisDescriptor analysis)
{
    return analysis
        .TokenFilters(tf => tf
            .UserDefined("concatenate_filter", new ConcatenateTokenFilter
            {
                TokenSeparator = " ",
                IncrementGap = 1000
            })
            .EdgeNGram("autocomplete_filter", edge => edge
                .MinGram(1)
                .MaxGram(20))
            .EdgeNGram("partialsearch_filter", edge => edge
                .MinGram(3)
                .MaxGram(20)))
        .Analyzers(an => an
            .Custom("autocomplete_index", c => c
                .Tokenizer("standard")
                .Filters("lowercase", "asciifolding", "concatenate_filter", "autocomplete_filter"))
            .Custom("autocomplete_search", c => c
                .Tokenizer("standard")
                .Filters("lowercase", "asciifolding", "concatenate_filter"))
            .Custom("partialsearch_index", c => c
                .Tokenizer("standard")
                .Filters("lowercase", "asciifolding", "partialsearch_filter")));
}
// Minimal analysis set: a comma/whitespace pattern analyzer and tokenizer,
// a "standard plus" custom analyzer, and a whitespace/lowercase analyzer.
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
{
    return ad
        .Analyzers(an => an
            .Pattern(COMMA_WHITESPACE_ANALYZER, pat => pat.Pattern(@"[,\s]+"))
            .Custom(STANDARDPLUS_ANALYZER, c => c
                .Filters("lowercase", "stop", "unique")
                .Tokenizer(COMMA_WHITESPACE_TOKENIZER))
            .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c
                .Filters("lowercase")
                .Tokenizer("whitespace")))
        .Tokenizers(tok => tok
            .Pattern(COMMA_WHITESPACE_TOKENIZER, pat => pat.Pattern(@"[,\s]+")));
}
// Assembles an AnalysisDescriptor from prebuilt token-filter and analyzer
// descriptors, plus an html_strip char filter registered under the given name.
private static AnalysisDescriptor CreateAnalysisDescriptor(string charHtmlFilter, TokenFiltersDescriptor tokenFilters, AnalyzersDescriptor analyzers)
{
    var descriptor = new AnalysisDescriptor();
    descriptor.CharFilters(cf => cf.HtmlStrip(charHtmlFilter));
    // The prebuilt descriptors are substituted wholesale; the lambda input is unused.
    descriptor.TokenFilters(_ => tokenFilters);
    descriptor.Analyzers(_ => analyzers);
    return descriptor;
}
// Registers a city synonym filter and a custom analyzer ("cna") that strips
// HTML, tokenizes with the standard tokenizer, then lowercases, removes stop
// words and applies the synonyms.
public IAnalysis ConfigureAnalysis(AnalysisDescriptor analysisDescriptor)
{
    return analysisDescriptor
        .TokenFilters(filters => filters
            .Synonym("city_synonym", syn => syn
                .Synonyms("lol => laughing", "new york, nyc")))
        .Analyzers(an => an
            .Custom("cna", c => c
                .CharFilters("html_strip")
                .Tokenizer("standard")
                .Filters("lowercase", "stop", "city_synonym")));
}
// Adds every tokenizer from the build set that CanContribute accepts to the
// descriptor's tokenizer collection.
protected override AnalysisDescriptor Contribute(AnalysisDescriptor descriptor, IEnumerable<KeyValuePair<string, TokenizerBase>> build)
{
    return descriptor.Tokenizers(tokenizers =>
    {
        foreach (var pair in build.Where(candidate => CanContribute(candidate, tokenizers)))
        {
            tokenizers.Add(pair.Key, pair.Value);
        }
        return tokenizers;
    });
}
// Builds a single-shard, zero-replica index using the supplied analysis and
// document mapping; Tag/Properties mappings are auto-mapped and non-dynamic.
private static CreateIndexDescriptor CreateIndexDescriptor(string indexName, AnalysisDescriptor analysistDescriptor, Func<TypeMappingDescriptor<DocumentElastic>, TypeMappingDescriptor<DocumentElastic>> documentMappingDescriptor)
{
    var index = new CreateIndexDescriptor(indexName);
    index
        .Settings(settings => settings
            .NumberOfReplicas(0)
            .NumberOfShards(1)
            // The caller's descriptor replaces the default; lambda input unused.
            .Analysis(_ => analysistDescriptor))
        .Mappings(mapping => mapping
            .Map<DocumentElastic>(map => documentMappingDescriptor(map).AutoMap())
            .Map<TagElastic>(map => map.AutoMap().Dynamic(false))
            .Map<PropertiesElastic>(map => map.AutoMap().Dynamic(false)));
    return index;
}
// Resolves the analysis descriptor for the requested document type.
// Only ElasticsearchJob is handled; any other type yields null.
public AnalysisDescriptor Resolve<T>(LanguageCode languageCode = LanguageCode.EN)
{
    return typeof(T) == typeof(ElasticsearchJob)
        ? GetJobsAnalysisDescriptor(languageCode)
        : null;
}
// Shingle token filter (2–4 token shingles) plus a custom "shingle" analyzer
// built on the standard tokenizer.
private IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis)
{
    return analysis
        .TokenFilters(filters => filters
            .Shingle("shingle", sh => sh
                .MinShingleSize(2)
                .MaxShingleSize(4)))
        .Analyzers(an => an
            .Custom("shingle", c => c
                .Filters("standard", "shingle")
                .Tokenizer("standard")));
}
/// <summary>
/// Builds analyzers and token filters via collection-style Add calls:
/// pattern/custom analyzers for emails, version strings and type names, plus
/// pattern-capture filters and the VERSION_PAD1..4 pattern-replace filters
/// that insert leading zeros in front of 1–4 digit version components —
/// presumably so version strings compare correctly as text (TODO confirm).
/// </summary>
/// <param name="ad">Descriptor to configure.</param>
/// <returns>The configured descriptor.</returns>
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) { return(ad.Analyzers(bases => { bases.Add(COMMA_WHITESPACE_ANALYZER, new PatternAnalyzer { Pattern = @"[,\s]+" }); bases.Add(EMAIL_ANALYZER, new CustomAnalyzer { Tokenizer = "keyword", Filter = new[] { EMAIL_TOKEN_FILTER, "lowercase", "unique" } }); bases.Add(VERSION_INDEX_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique" } }); bases.Add(VERSION_SEARCH_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase" } }); bases.Add(WHITESPACE_LOWERCASE_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "lowercase" } }); bases.Add(TYPENAME_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { TYPENAME_TOKEN_FILTER, "lowercase", "unique" } }); bases.Add(STANDARDPLUS_ANALYZER, new CustomAnalyzer { Tokenizer = "whitespace", Filter = new[] { "standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique" } }); return bases; }).TokenFilters(bases => { bases.Add(EMAIL_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"(\w+)", @"(\p{L}+)", @"(\d+)", @"(.+)@", @"@(.+)" } }); bases.Add(TYPENAME_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"\.(\w+)" } }); bases.Add(VERSION_TOKEN_FILTER, new PatternCaptureTokenFilter { Patterns = new[] { @"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)" } }); bases.Add(VERSION_PAD1_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{1})(?=\.|-|$)", Replacement = @"$10000$2" }); bases.Add(VERSION_PAD2_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{2})(?=\.|-|$)", Replacement = @"$1000$2" }); bases.Add(VERSION_PAD3_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern 
= @"(\.|^)(\d{3})(?=\.|-|$)", Replacement = @"$100$2" }); bases.Add(VERSION_PAD4_TOKEN_FILTER, new PatternReplaceTokenFilter { Pattern = @"(\.|^)(\d{4})(?=\.|-|$)", Replacement = @"$10$2" }); return bases; })); }
// Default analyzer: standard tokenizer with html_strip, lowercased, then run
// through Russian and English Hunspell dictionaries with de-duplicated stems.
private AnalysisDescriptor BuildIndexDescriptor(AnalysisDescriptor a)
{
    return a
        .Analyzers(an => an
            .Custom("default", c => c
                .Tokenizer("standard")
                .CharFilters("html_strip")
                .Filters("lowercase", "ru_RU", "en_US")))
        .TokenFilters(tf => tf
            .Hunspell("ru_RU", h => h.Dedup().Locale("ru_RU"))
            .Hunspell("en_US", h => h.Dedup().Locale("en_US")));
}
// Swedish analysis: char filter mapping w/W to v/V, Swedish Hunspell
// stemming, a lowercase keyword analyzer, and an HTML-stripping Swedish
// text analyzer.
private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis)
{
    return analysis
        .CharFilters(cf => cf
            .Mapping("swedish_char_mapping", map => map.Mappings("w => v", "W => V")))
        .TokenFilters(tf => tf
            .Hunspell("sv_SE", h => h.Dedup().Locale("sv_SE")))
        .Analyzers(an => an
            .Custom(LowercaseKeywordAnalyserName, c => c
                .Tokenizer("keyword")
                .Filters("lowercase"))
            .Custom(SwedishTextAnalyserName, c => c
                .Tokenizer("standard")
                .Filters("lowercase", "sv_SE")
                .CharFilters("html_strip", "swedish_char_mapping")));
}
// Registers a per-index "{indexName}_search" analyzer backed by an
// edge-n-gram tokenizer (1–20 chars, letters only) of the same name.
public static IAnalysis AddSearchAnalyzerFor(this AnalysisDescriptor analysis, string indexName)
{
    const string lowercaseFilter = "lowercase";
    var analyzerName = $"{indexName}_search";

    return analysis
        .Analyzers(an => an
            .Custom(analyzerName, c => c
                .Tokenizer(analyzerName)
                .Filters(lowercaseFilter)))
        .Tokenizers(tok => tok
            .EdgeNGram(analyzerName, edge => edge
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));
}
/// <summary>
/// Registers "my-search-analyzer": standard tokenizer, lowercase, Russian
/// and English morphology filters, and a combined Russian + English
/// stop-word list ("my_stopwords"). The stop-word strings are index
/// configuration data and must not be altered.
/// </summary>
private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis) => analysis .Analyzers(a => a .Custom("my-search-analyzer", ca => ca .Tokenizer("standard") .Filters("lowercase", "russian_morphology", "english_morphology", "my_stopwords") )) .TokenFilters(tf => tf .Stop("my_stopwords", s => s .StopWords("а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в", "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где", "да", "даже", "для", "до", "его", "ее", "если", "есть", "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как", "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо", "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об", "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при", "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того", "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей", "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я", "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" )));
/// <summary>
/// Fluent variant of the email/version/typename analysis build: pattern and
/// custom analyzers, pattern-capture filters, the VERSION_PAD1..4
/// pattern-replace filters (leading-zero padding of 1–4 digit version
/// components — presumably for text-order version comparison; TODO confirm),
/// and comma/whitespace plus path-hierarchy tokenizers.
/// </summary>
/// <param name="ad">Descriptor to configure.</param>
/// <returns>The configured descriptor.</returns>
private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad) { return(ad.Analyzers(a => a .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+")) .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", "unique").Tokenizer("keyword")) .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace")) .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace")) .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace")) .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER)) .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))) .TokenFilters(f => f .PatternCapture(EMAIL_TOKEN_FILTER, p => p.Patterns(@"(\w+)", @"(\p{L}+)", @"(\d+)", "(.+)@", "@(.+)")) .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)")) .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)")) .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2")) .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2")) .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2")) .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2"))) .Tokenizers(t => t .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+")) .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.')))); }
// Autocomplete analyzers sharing one whitespace tokenizer: default, snowball,
// shingle, edge-n-gram (1–8) and a search-time variant.
public static AnalysisDescriptor AutoCompleteAnalyzers(this AnalysisDescriptor analysis)
{
    return analysis
        .Tokenizers(tok => tok.Whitespace("whitespace_tokenizer"))
        .TokenFilters(tf => tf.EdgeNGram("ngram_filter", ng => ng.MinGram(1).MaxGram(8)))
        .Analyzers(an => an
            .Custom("default_autocomplete", c => c
                .Tokenizer("whitespace_tokenizer")
                .Filters("lowercase", "asciifolding"))
            .Custom("snowball_autocomplete", c => c
                .Tokenizer("whitespace_tokenizer")
                .Filters("lowercase", "asciifolding", "snowball"))
            .Custom("shingle_autocomplete", c => c
                .Tokenizer("whitespace_tokenizer")
                .Filters("shingle", "lowercase", "asciifolding"))
            .Custom("ngram_autocomplete", c => c
                .Tokenizer("whitespace_tokenizer")
                .Filters("lowercase", "asciifolding", "ngram_filter"))
            .Custom("search_autocomplete", c => c
                .Tokenizer("whitespace_tokenizer")
                .Filters("lowercase", "asciifolding")));
}
/// <summary>
/// Extension wiring up the index-time edge-n-gram analyzer and a separate
/// search-time analyzer; the search analyzer uses the plain "lowercase"
/// tokenizer so queries are not n-grammed.
/// </summary>
/// <param name="analysis">Analysis configuration to extend.</param>
/// <returns>The same analysis configuration, for chaining.</returns>
public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis)
{
    const string lowercase = nameof(lowercase);

    var configured = analysis
        .Analyzers(an => an
            .Custom(IndexAnalyzerName, index => index
                .Tokenizer(IndexAnalyzerName)
                .Filters(lowercase))
            .Custom(SearchAnalyzerName, search => search
                .Tokenizer(lowercase)))
        .Tokenizers(tok => tok
            .EdgeNGram(IndexAnalyzerName, edge => edge
                .MinGram(1)
                .MaxGram(20)
                .TokenChars(TokenChar.Letter)));

    return configured;
}
// NuGet-id analysis: split ids on non-word characters, word-delimit case
// changes and numerics while keeping the original token, and provide a
// lowercase keyword variant for exact matching.
private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis)
{
    return analysis
        .Tokenizers(tok => tok
            .Pattern("nuget-id-tokenizer", pat => pat.Pattern(@"\W+")))
        .TokenFilters(tf => tf
            .WordDelimiter("nuget-id-words", wd => wd
                .SplitOnCaseChange()
                .PreserveOriginal()
                .SplitOnNumerics()
                .GenerateNumberParts(false)
                .GenerateWordParts()))
        .Analyzers(an => an
            .Custom("nuget-id-analyzer", c => c
                .Tokenizer("nuget-id-tokenizer")
                .Filters("nuget-id-words", "lowercase"))
            .Custom("nuget-id-keyword", c => c
                .Tokenizer("keyword")
                .Filters("lowercase")));
}
// Dutch analyzer: a char filter removing digit runs, standard tokenizer,
// then lowercase, Dutch stop words and Dutch stemming.
public static AnalysisDescriptor DutchAnalysis(AnalysisDescriptor analysis)
{
    return analysis
        // custom filters
        .TokenFilters(tf => tf
            .Stop("dutch_stop", stop => stop
                .StopWords("_dutch_"))
            .Stemmer("dutch_stemmer", stem => stem
                .Language("dutch")))
        .CharFilters(cf => cf
            .PatternReplace("kill_numbers", pat => pat
                .Pattern("(\\d+)")
                .Replacement("")))
        // custom analyzers
        .Analyzers(an => an
            .Custom("dutch", c => c
                .CharFilters("kill_numbers")
                .Tokenizer("standard")
                .Filters("lowercase", "dutch_stop", "dutch_stemmer")));
}
/// <summary>Builds the analysis configuration via the fluent descriptor API; implemented by derived types.</summary>
/// <param name="an">Descriptor to configure.</param>
/// <returns>The configured analysis.</returns>
protected abstract IAnalysis FluentAnalysis(AnalysisDescriptor an);
// Delegates tokenizer construction to the assertion setup's fluent builder,
// registered under the setup's name.
protected override IAnalysis FluentAnalysis(AnalysisDescriptor an)
{
    return an.Tokenizers(tokenizers => AssertionSetup.Fluent(AssertionSetup.Name, tokenizers));
}
// Registers the sort normalizer, which applies only a lowercase filter.
public static AnalysisDescriptor Initialize(AnalysisDescriptor analysis)
{
    return analysis.Normalizers(normalizers => normalizers
        .Custom(Sort, custom => custom.Filters("lowercase")));
}
// Currently registers nothing; returns the descriptor unchanged.
private static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis) => analysis;
// Composes the analysis from the dedicated token-filter and analyzer
// builder methods (passed as method groups).
private IAnalysis CreateAnalysis(AnalysisDescriptor analysisDescriptor)
{
    return analysisDescriptor
        .TokenFilters(CreateTokenFilters)
        .Analyzers(CreateAnalyzers);
}