Example #1
 public static IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis)
 {
     analysis
     .TokenFilters(tokenFilters => tokenFilters
                   .Shingle("shingle", shingle => shingle
                            .MinShingleSize(2)
                            .MaxShingleSize(4)
                            )
                   )
     .Analyzers(analyzers => analyzers
                .Custom("shingle", shingle => shingle
                        .Filters("standard", "shingle")
                        .Tokenizer("standard")
                        )
                );
     if (new SemVer.Range(">=5.2.0").IsSatisfied(TestClient.Configuration.ElasticsearchVersion))
     {
         analysis.Normalizers(analyzers => analyzers
                              .Custom("my_normalizer", n => n
                                      .Filters("lowercase", "asciifolding")
                                      )
                              );
     }
     return(analysis);
 }
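A minimal usage sketch (not part of the original snippet; the client instance and index name are assumptions): because the method returns IAnalysis, it can be passed directly as the analysis selector during index creation.

 // Hypothetical wiring with NEST; "client" is an IElasticClient, "my-index" a sample index name.
 var createIndexResponse = client.CreateIndex("my-index", c => c
     .Settings(s => s
         .Analysis(ProjectAnalysisSettings)));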
Example #2
 public static AnalysisDescriptor SetAnalysis(AnalysisDescriptor analysis)
 {
     return(analysis
            .CharFilters(c => c
                         .Mapping("mapping", f => f.Mappings(GetCharMapping()))
                         .PatternReplace("digits", f => f.Pattern("[^0-9]").Replacement(""))
                         )
            .TokenFilters(f => f
                          .NGram("digits_ngram", t => t.MinGram(3).MaxGram(8))
                          .Length("length_limit", t => t.Min(1).Max(20))
                          .EdgeNGram("custom_edge_ngram", t => t.MinGram(1).MaxGram(50))
                          )
            .Tokenizers(t => t
                        .NGram("ngram", d => d.MinGram(2).MaxGram(30)))
             .Analyzers(a => a
                        .UserDefined(Replace, new CustomAnalyzer {
                            Tokenizer = "whitespace",
                            Filter = new[] { "lowercase", "scandinavian_folding", "unique" },
                            CharFilter = new[] { "mapping" }
                        })
                        .UserDefined(ReplaceNgram, new CustomAnalyzer {
                            Tokenizer = "whitespace",
                            Filter = new[] { "lowercase", "scandinavian_folding", "custom_edge_ngram", "unique" },
                            CharFilter = new[] { "mapping", "html_strip" }
                        })
                        .UserDefined(Key, new CustomAnalyzer {
                            Tokenizer = "keyword",
                            Filter = new[] { "lowercase", "scandinavian_folding" }
                        })
                        .UserDefined(Digits, new CustomAnalyzer {
                            Tokenizer = "keyword",
                            CharFilter = new[] { "digits" },
                            Filter = new[] { "digits_ngram", "unique", "length_limit" }
                        })
                        .UserDefined(Lowercase, new CustomAnalyzer {
                            Tokenizer = "keyword",
                            Filter = new[] { "lowercase" }
                        })
                        ));
 }
Example #3
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", TLD_STOPWORDS_TOKEN_FILTER, EDGE_NGRAM_TOKEN_FILTER, "unique").Tokenizer("keyword"))
                         .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
                         .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace"))
                         .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
                         .Custom(LOWER_KEYWORD_ANALYZER, c => c.Filters("lowercase").Tokenizer("keyword"))
                         .Custom(HOST_ANALYZER, c => c.Filters("lowercase").Tokenizer(HOST_TOKENIZER))
                         .Custom(URL_PATH_ANALYZER, c => c.Filters("lowercase").Tokenizer(URL_PATH_TOKENIZER)))
            .TokenFilters(f => f
                          .EdgeNGram(EDGE_NGRAM_TOKEN_FILTER, p => p.MaxGram(50).MinGram(2).Side(EdgeNGramSide.Front))
                          .PatternCapture(EMAIL_TOKEN_FILTER, p => p.PreserveOriginal().Patterns("(\\w+)", "(\\p{L}+)", "(\\d+)", "@(.+)", "@(.+)\\.", "(.+)@"))
                          .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)"))
                          .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
                          .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
                          .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
                          .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
                          .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2"))
                          .Stop(TLD_STOPWORDS_TOKEN_FILTER, p => p.StopWords("com", "net", "org", "info", "me", "edu", "mil", "gov", "biz", "co", "io", "dev"))
                          .WordDelimiter(ALL_WORDS_DELIMITER_TOKEN_FILTER, p => p.CatenateNumbers().PreserveOriginal().CatenateAll().CatenateWords()))
            .Tokenizers(t => t
                        .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))
                        .CharGroup(URL_PATH_TOKENIZER, p => p.TokenizeOnCharacters("/", "-", "."))
                        .CharGroup(HOST_TOKENIZER, p => p.TokenizeOnCharacters("."))
                        .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.'))));
 }
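The four pattern_replace filters are a lexicographic-sort trick: each numeric segment of a version string is left-padded to five digits, so plain string ordering matches semantic version ordering. A worked example of what they emit (illustrative values):

 // "1.2.3"  -> "00001.00002.00003"
 // "1.10.3" -> "00001.00010.00003"
 // after padding, "00001.00002..." < "00001.00010..." sorts correctly,
 // whereas the raw strings would sort "1.10.3" before "1.2.3".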
Example #4
        /// <summary>
        /// I've moved this into an extension method
        /// for reuse and a clearer understanding of the
        /// custom analyzer we are writing
        /// </summary>
        /// <param name="analysis"></param>
        /// <returns></returns>
        public static IAnalysis AddSearchAnalyzer(this AnalysisDescriptor analysis)
        {
            const string lowercase = nameof(lowercase);

            // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-edgengram-tokenizer.html
            // names aren't really important, they are just keys
            return
                (analysis
                 .Analyzers(a => a
                            .Custom(IndexAnalyzerName, c => c
                                    .Tokenizer(IndexAnalyzerName)
                                    .Filters(lowercase)
                                    )
                            .Custom(SearchAnalyzerName, c =>
                                    c.Tokenizer(lowercase)
                                    )
                            )
                 .Tokenizers(t => t
                             .EdgeNGram(IndexAnalyzerName, e => e
                                        .MinGram(1)
                                        .MaxGram(20)
                                        .TokenChars(TokenChar.Letter)
                                        )
                             ));
        }
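A hedged sketch of how these names would be used at mapping time (the Product POCO and field are illustrative assumptions, not from the source): edge n-grams apply while indexing, while query terms go through the plain lowercase tokenizer.

 client.CreateIndex("products", c => c
     .Settings(s => s.Analysis(a => a.AddSearchAnalyzer()))
     .Mappings(ms => ms.Map<Product>(m => m
         .Properties(p => p.Text(t => t
             .Name(n => n.Name)
             .Analyzer(IndexAnalyzerName)
             .SearchAnalyzer(SearchAnalyzerName))))));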
Example #5
 private AnalysisDescriptor Analysis(AnalysisDescriptor analysis) => analysis
 .Tokenizers(tokenizers => tokenizers
             .Pattern("name-tokenizer", p => p.Pattern(@"\W+"))
             )
 .TokenFilters(tokenfilters => tokenfilters
               .WordDelimiter("name-words", w => w
                              .SplitOnCaseChange()
                              .PreserveOriginal()
                              .SplitOnNumerics()
                              .GenerateNumberParts(false)
                              .GenerateWordParts()
                              )
               )
 .Analyzers(analyzers => analyzers
            .Custom("name-keyword", c => c
                    .Tokenizer("keyword")
                    .Filters("lowercase")
                    )
            .Custom("html_stripper", cc => cc
                    .Filters("trim", "lowercase")
                    .CharFilters("html_strip")
                    .Tokenizer("name-tokenizer")
                    )
            .Custom("name-analyzer", c => c
                    .Filters("name-words", "lowercase")
                    .Tokenizer("ik_max_word")
                    )
            );
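One dependency worth flagging: the "ik_max_word" tokenizer referenced by "name-analyzer" is not built into Elasticsearch; it comes from the elasticsearch-analysis-ik plugin, which must be installed on the cluster before this analysis block will apply.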
Example #6
 public static IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis)
 {
     analysis
     .TokenFilters(tokenFilters => tokenFilters
                   .Shingle("shingle", shingle => shingle
                            .MinShingleSize(2)
                            .MaxShingleSize(4)
                            )
                   )
     .Analyzers(analyzers => analyzers
                .Custom("shingle", shingle => shingle
                        .Filters("standard", "shingle")
                        .Tokenizer("standard")
                        )
                );
     //normalizers are a new feature since 5.2.0
     if (TestClient.VersionUnderTestSatisfiedBy(">=5.2.0"))
     {
         analysis.Normalizers(analyzers => analyzers
                              .Custom("my_normalizer", n => n
                                      .Filters("lowercase", "asciifolding")
                                      )
                              );
     }
     return(analysis);
 }
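Normalizers apply to keyword fields only (no tokenizer is involved). A sketch of attaching "my_normalizer" so that term queries, sorting, and aggregations become case- and accent-insensitive (the document type and field name are assumptions):

 client.Map<Document>(m => m
     .Properties(p => p
         .Keyword(k => k
             .Name("title")
             .Normalizer("my_normalizer"))));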
Example #7
 protected static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis)
 {
     return(analysis.Analyzers(a => a
                               .Custom("html_stripper", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .CharFilters("html_strip")
                                       .Tokenizer("autocomplete")
                                       )
                               .Custom("keywords_wo_stopwords", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .CharFilters("html_strip")
                                       .Tokenizer("key_tokenizer")
                                       )
                               .Custom("autocomplete", cc => cc
                                       .Filters("eng_stopwords", "trim", "lowercase")
                                       .Tokenizer("autocomplete")
                                       )
                               )
            .Tokenizers(tdesc => tdesc
                        .Keyword("key_tokenizer", t => t)
                        .EdgeNGram("autocomplete", e => e
                                   .MinGram(3)
                                   .MaxGram(15)
                                   .TokenChars(TokenChar.Letter, TokenChar.Digit)
                                   )
                        )
            .TokenFilters(f => f
                          .Stop("eng_stopwords", lang => lang
                                .StopWords("_english_")
                                )
                          ));
 }
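A quick way to sanity-check the edge n-gram chain is the _analyze API (the index name is an assumption):

 var tokens = client.Analyze(a => a
     .Index("articles")
     .Analyzer("autocomplete")
     .Text("Elasticsearch"));
 // emits lowercased edge n-grams from 3 to 15 characters:
 // "ela", "elas", "elast", ..., "elasticsearch"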
Example #8
        private IAnalysis ConfigureConcatenateAndAutocompleteAnalysis(AnalysisDescriptor analysis)
        {
            // for concatenate filter see my fork: https://github.com/rh78/elasticsearch-concatenate-token-filter

            return(analysis
                   .TokenFilters(filter => filter
                                 .UserDefined("concatenate_filter", new ConcatenateTokenFilter()
            {
                TokenSeparator = " ",
                IncrementGap = 1000
            })
                                 .EdgeNGram("autocomplete_filter", edgeNGram => edgeNGram
                                            .MinGram(1)
                                            .MaxGram(20)
                                            )
                                 .EdgeNGram("partialsearch_filter", edgeNGram => edgeNGram
                                            .MinGram(3)
                                            .MaxGram(20)
                                            )
                                 )
                   .Analyzers(analyzer => analyzer
                              .Custom("autocomplete_index", custom => custom
                                      .Tokenizer("standard")
                                      .Filters(new string[] { "lowercase", "asciifolding", "concatenate_filter", "autocomplete_filter" })
                                      )
                              .Custom("autocomplete_search", custom => custom
                                      .Tokenizer("standard")
                                      .Filters(new string[] { "lowercase", "asciifolding", "concatenate_filter" })
                                      )
                              .Custom("partialsearch_index", custom => custom
                                      .Tokenizer("standard")
                                      .Filters(new string[] { "lowercase", "asciifolding", "partialsearch_filter" })
                                      )));
        }
Example #9
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace")))
            .Tokenizers(t => t
                        .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))));
 }
Example #10
        private static AnalysisDescriptor CreateAnalysisDescriptor(string charHtmlFilter, TokenFiltersDescriptor tokenFilters, AnalyzersDescriptor analyzers)
        {
            var analysisDescriptor = new AnalysisDescriptor();

            analysisDescriptor.CharFilters(c => c.HtmlStrip(charHtmlFilter));
            analysisDescriptor.TokenFilters(t => tokenFilters);
            analysisDescriptor.Analyzers(a => analyzers);
            return(analysisDescriptor);
        }
Example #11
 public IAnalysis ConfigureAnalysis(AnalysisDescriptor analysisDescriptor)
 {
     return(analysisDescriptor.TokenFilters(tf => tf.Synonym("city_synonym", tfd => tfd.Synonyms("lol => laughing", "new york, nyc")))
            .Analyzers(aa =>
                       aa.Custom("cna", ca => ca
                                 .CharFilters("html_strip")
                                 .Tokenizer("standard")
                                 .Filters("lowercase", "stop", "city_synonym"))
                       ));
 }
Example #12
 protected override AnalysisDescriptor Contribute(AnalysisDescriptor descriptor, IEnumerable<KeyValuePair<string, TokenizerBase>> build)
 {
     return(descriptor.Tokenizers(a =>
     {
         foreach (var item in build.Where(x => CanContribute(x, a)))
         {
             a.Add(item.Key, item.Value);
         }
         return a;
     }));
 }
Example #13
        private static CreateIndexDescriptor CreateIndexDescriptor(string indexName, AnalysisDescriptor analysisDescriptor, Func<TypeMappingDescriptor<DocumentElastic>, TypeMappingDescriptor<DocumentElastic>> documentMappingDescriptor)
        {
            var descriptor = new CreateIndexDescriptor(indexName);

            descriptor.Settings(s => s.NumberOfReplicas(0).NumberOfShards(1).Analysis(a => analysisDescriptor))
            .Mappings(mapping => mapping
                      .Map<DocumentElastic>(map => documentMappingDescriptor(map).AutoMap())
                      .Map<TagElastic>(mm => mm.AutoMap().Dynamic(false))
                      .Map<PropertiesElastic>(mm => mm.AutoMap().Dynamic(false)));
            return(descriptor);
        }
Example #14
        public AnalysisDescriptor Resolve<T>(LanguageCode languageCode = LanguageCode.EN)
        {
            AnalysisDescriptor descriptor = null;

            if (typeof(T) == typeof(ElasticsearchJob))
            {
                descriptor = GetJobsAnalysisDescriptor(languageCode);
            }

            return(descriptor);
        }
Example #15
 private IAnalysis ProjectAnalysisSettings(AnalysisDescriptor analysis) => analysis
 .TokenFilters(tokenFilters => tokenFilters
               .Shingle("shingle", shingle => shingle
                        .MinShingleSize(2)
                        .MaxShingleSize(4)
                        )
               )
 .Analyzers(analyzers => analyzers
            .Custom("shingle", shingle => shingle
                    .Filters("standard", "shingle")
                    .Tokenizer("standard")
                    )
            );
Example #16
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(bases => {
         bases.Add(COMMA_WHITESPACE_ANALYZER, new PatternAnalyzer {
             Pattern = @"[,\s]+"
         });
         bases.Add(EMAIL_ANALYZER, new CustomAnalyzer {
             Tokenizer = "keyword", Filter = new[] { EMAIL_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(VERSION_INDEX_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(VERSION_SEARCH_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase" }
         });
         bases.Add(WHITESPACE_LOWERCASE_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { "lowercase" }
         });
         bases.Add(TYPENAME_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { TYPENAME_TOKEN_FILTER, "lowercase", "unique" }
         });
         bases.Add(STANDARDPLUS_ANALYZER, new CustomAnalyzer {
             Tokenizer = "whitespace", Filter = new[] { "standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique" }
         });
         return bases;
     }).TokenFilters(bases => {
         bases.Add(EMAIL_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"(\w+)", @"(\p{L}+)", @"(\d+)", @"(.+)@", @"@(.+)" }
         });
         bases.Add(TYPENAME_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"\.(\w+)" }
         });
         bases.Add(VERSION_TOKEN_FILTER, new PatternCaptureTokenFilter {
             Patterns = new[] { @"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)" }
         });
         bases.Add(VERSION_PAD1_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{1})(?=\.|-|$)", Replacement = @"$10000$2"
         });
         bases.Add(VERSION_PAD2_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{2})(?=\.|-|$)", Replacement = @"$1000$2"
         });
         bases.Add(VERSION_PAD3_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{3})(?=\.|-|$)", Replacement = @"$100$2"
         });
         bases.Add(VERSION_PAD4_TOKEN_FILTER, new PatternReplaceTokenFilter {
             Pattern = @"(\.|^)(\d{4})(?=\.|-|$)", Replacement = @"$10$2"
         });
         return bases;
     }));
 }
Example #17
 private AnalysisDescriptor BuildIndexDescriptor(AnalysisDescriptor a)
 {
     return
         (a
          .Analyzers(aa => aa
                     .Custom("default",
                             descriptor =>
                             descriptor.Tokenizer("standard")
                             .CharFilters("html_strip")
                             .Filters("lowercase", "ru_RU", "en_US"))
                     ).TokenFilters(descriptor =>
                                    descriptor.Hunspell("ru_RU", hh => hh.Dedup().Locale("ru_RU"))
                                    .Hunspell("en_US", hh => hh.Dedup().Locale("en_US")))
         );
 }
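Note that the Hunspell filters above only resolve if matching dictionary files (.aff/.dic) are present under the node's config/hunspell/ru_RU and config/hunspell/en_US directories; without them, index creation fails.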
Example #18
        private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis) => analysis
        .CharFilters(c => c.Mapping("swedish_char_mapping", m => m.Mappings("w => v", "W => V")))
        .TokenFilters(tf => tf.Hunspell("sv_SE", x => x.Dedup().Locale("sv_SE")))
        .Analyzers(analyzers => analyzers
                   .Custom(LowercaseKeywordAnalyserName, c => c
                           .Tokenizer("keyword")
                           .Filters("lowercase")
                           )
                   .Custom(SwedishTextAnalyserName, c => c
                           .Tokenizer("standard")
                           .Filters("lowercase", "sv_SE")
                           .CharFilters("html_strip", "swedish_char_mapping")

                           )
                   );
Example #19
        public static IAnalysis AddSearchAnalyzerFor(this AnalysisDescriptor analysis, string indexName)
        {
            var indexAnalyzerName = $"{indexName}_search";
            var indexAnalyzerKey  = "lowercase";

            return
                (analysis
                 .Analyzers(a => a
                            .Custom(indexAnalyzerName, c => c
                                    .Tokenizer(indexAnalyzerName)
                                    .Filters(indexAnalyzerKey)
                                    )
                            )
                 .Tokenizers(t => t
                             .EdgeNGram(indexAnalyzerName, e => e
                                        .MinGram(1)
                                        .MaxGram(20)
                                        .TokenChars(TokenChar.Letter)
                                        )
                             ));
        }
Example #20
 private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis)
 => analysis
 .Analyzers(a => a
            .Custom("my-search-analyzer", ca => ca
                    .Tokenizer("standard")
                    .Filters("lowercase", "russian_morphology", "english_morphology", "my_stopwords")
                    ))
 .TokenFilters(tf => tf
               .Stop("my_stopwords", s => s
                     .StopWords("а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в", "вам", "вас",
                                "весь", "во", "вот", "все", "всего", "всех", "вы", "где", "да", "даже", "для", "до", "его",
                                "ее", "если", "есть", "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
                                "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо", "наш", "не", "него",
                                "нее", "нет", "ни", "них", "но", "ну", "о", "об", "однако", "он", "она", "они", "оно", "от",
                                "очень", "по", "под", "при", "с", "со", "так", "также", "такой", "там", "те", "тем", "то",
                                "того", "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей", "чем",
                                "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я", "a", "an", "and", "are", "as", "at",
                                "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
                                "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was",
                                "will", "with"
                                )));
Example #21
 private AnalysisDescriptor BuildAnalysis(AnalysisDescriptor ad)
 {
     return(ad.Analyzers(a => a
                         .Pattern(COMMA_WHITESPACE_ANALYZER, p => p.Pattern(@"[,\s]+"))
                         .Custom(EMAIL_ANALYZER, c => c.Filters(EMAIL_TOKEN_FILTER, "lowercase", "unique").Tokenizer("keyword"))
                         .Custom(VERSION_INDEX_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, VERSION_TOKEN_FILTER, "lowercase", "unique").Tokenizer("whitespace"))
                         .Custom(VERSION_SEARCH_ANALYZER, c => c.Filters(VERSION_PAD1_TOKEN_FILTER, VERSION_PAD2_TOKEN_FILTER, VERSION_PAD3_TOKEN_FILTER, VERSION_PAD4_TOKEN_FILTER, "lowercase").Tokenizer("whitespace"))
                         .Custom(WHITESPACE_LOWERCASE_ANALYZER, c => c.Filters("lowercase").Tokenizer("whitespace"))
                         .Custom(TYPENAME_ANALYZER, c => c.Filters(TYPENAME_TOKEN_FILTER, "lowercase", "unique").Tokenizer(TYPENAME_HIERARCHY_TOKENIZER))
                         .Custom(STANDARDPLUS_ANALYZER, c => c.Filters("standard", TYPENAME_TOKEN_FILTER, "lowercase", "stop", "unique").Tokenizer(COMMA_WHITESPACE_TOKENIZER)))
            .TokenFilters(f => f
                          .PatternCapture(EMAIL_TOKEN_FILTER, p => p.Patterns(@"(\w+)", @"(\p{L}+)", @"(\d+)", "(.+)@", "@(.+)"))
                          .PatternCapture(TYPENAME_TOKEN_FILTER, p => p.Patterns(@"\.(\w+)", @"([^\()]+)"))
                          .PatternCapture(VERSION_TOKEN_FILTER, p => p.Patterns(@"^(\d+)\.", @"^(\d+\.\d+)", @"^(\d+\.\d+\.\d+)"))
                          .PatternReplace(VERSION_PAD1_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{1})(?=\.|-|$)").Replacement("$10000$2"))
                          .PatternReplace(VERSION_PAD2_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{2})(?=\.|-|$)").Replacement("$1000$2"))
                          .PatternReplace(VERSION_PAD3_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{3})(?=\.|-|$)").Replacement("$100$2"))
                          .PatternReplace(VERSION_PAD4_TOKEN_FILTER, p => p.Pattern(@"(\.|^)(\d{4})(?=\.|-|$)").Replacement("$10$2")))
            .Tokenizers(t => t
                        .Pattern(COMMA_WHITESPACE_TOKENIZER, p => p.Pattern(@"[,\s]+"))
                        .PathHierarchy(TYPENAME_HIERARCHY_TOKENIZER, p => p.Delimiter('.'))));
 }
Example #22
 public static AnalysisDescriptor AutoCompleteAnalyzers(this AnalysisDescriptor analysis)
 {
     return(analysis.Tokenizers(t => t.Whitespace("whitespace_tokenizer"))
            .TokenFilters(t => t.EdgeNGram("ngram_filter", n => n.MinGram(1).MaxGram(8)))
            .Analyzers(a => a
                       .Custom("default_autocomplete", c => c
                               .Tokenizer("whitespace_tokenizer").Filters("lowercase", "asciifolding")
                               )
                       .Custom("snowball_autocomplete", c => c
                               .Tokenizer("whitespace_tokenizer").Filters("lowercase", "asciifolding", "snowball")
                               )
                       .Custom("shingle_autocomplete", c => c
                               .Tokenizer("whitespace_tokenizer").Filters("shingle", "lowercase", "asciifolding")
                               )
                       .Custom("ngram_autocomplete", c => c
                               .Tokenizer("whitespace_tokenizer").Filters("lowercase", "asciifolding", "ngram_filter")
                               )
                       .Custom("search_autocomplete", c => c
                               .Tokenizer("whitespace_tokenizer").Filters("lowercase", "asciifolding")
                               )
                       ));
 }
Example #24
 private static AnalysisDescriptor Analysis(AnalysisDescriptor analysis) => analysis
 .Tokenizers(tokenizers => tokenizers
             .Pattern("nuget-id-tokenizer", p => p.Pattern(@"\W+"))
             )
 .TokenFilters(tokenfilters => tokenfilters
               .WordDelimiter("nuget-id-words", w => w
                              .SplitOnCaseChange()
                              .PreserveOriginal()
                              .SplitOnNumerics()
                              .GenerateNumberParts(false)
                              .GenerateWordParts()
                              )
               )
 .Analyzers(analyzers => analyzers
            .Custom("nuget-id-analyzer", c => c
                    .Tokenizer("nuget-id-tokenizer")
                    .Filters("nuget-id-words", "lowercase")
                    )
            .Custom("nuget-id-keyword", c => c
                    .Tokenizer("keyword")
                    .Filters("lowercase")
                    )
            );
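What the word-delimiter configuration buys you is visible through _analyze (the index name and input are illustrative):

 var tokens = client.Analyze(a => a
     .Index("nuget")
     .Analyzer("nuget-id-analyzer")
     .Text("Microsoft.AspNetCore2"));
 // roughly: "microsoft", "aspnetcore2", "asp", "net", "core"
 // (the original token is preserved, case changes split words, and
 //  standalone number parts are suppressed by GenerateNumberParts(false))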
Example #25
        public static AnalysisDescriptor DutchAnalysis(AnalysisDescriptor analysis) => analysis

        //  custom filters
        .TokenFilters(tokenfilters => tokenfilters
                      .Stop("dutch_stop", w => w
                            .StopWords("_dutch_")
                            )
                      .Stemmer("dutch_stemmer", w => w
                               .Language("dutch")
                               )
                      )
        .CharFilters(charFilters => charFilters
                     .PatternReplace("kill_numbers", p => p
                                     .Pattern("(\\d+)")
                                     .Replacement("")))

        //  custom analyzers
        .Analyzers(analyzers => analyzers
                   .Custom("dutch", c => c
                           .CharFilters("kill_numbers")
                           .Tokenizer("standard")
                           .Filters("lowercase", "dutch_stop", "dutch_stemmer")
                           )
                   );
Example #26
 protected abstract IAnalysis FluentAnalysis(AnalysisDescriptor an);
Example #27
 protected override IAnalysis FluentAnalysis(AnalysisDescriptor an) =>
 an.Tokenizers(d => AssertionSetup.Fluent(AssertionSetup.Name, d));
Example #28
 public static AnalysisDescriptor Initialize(AnalysisDescriptor analysis) =>
 analysis.Normalizers(n => n
                      .Custom(Sort, descriptor => descriptor.Filters("lowercase")));
Example #29
 private static IAnalysis InitCommonAnalyzers(AnalysisDescriptor analysis)
 {
     return(analysis);
 }
 private IAnalysis CreateAnalysis(AnalysisDescriptor analysisDescriptor)
 {
     return(analysisDescriptor
            .TokenFilters(CreateTokenFilters)
            .Analyzers(CreateAnalyzers));
 }