Example #1
 private static Index IndexDefinition()
 {
     // Builds the schema for the "types" search index: its fields, CORS
     // settings, and the custom analysis chain (char filters -> tokenizer ->
     // token filters) used for indexing and searching type names.

     // Analyzer names shared by the searchable name/type fields below.
     var nameIndexAnalyzer = AnalyzerName.Create("name_index");
     var nameSearchAnalyzer = AnalyzerName.Create("name_search");
     var camelHumpIndexAnalyzer = AnalyzerName.Create("camel_hump_index");
     var camelHumpSearchAnalyzer = AnalyzerName.Create("camel_hump_search");

     var fields = new[]
     {
         // Document key.
         new Field("id", DataType.String)
         {
             IsKey = true,
             IsRetrievable = true,
             IsFilterable = false,
             IsSortable = false,
             IsFacetable = false,
             IsSearchable = false,
         },
         new Field("namespaces", DataType.Collection(DataType.String))
         {
             IsRetrievable = false,
             IsFilterable = true,
             IsSortable = false,
             IsFacetable = true,
             IsSearchable = true,
             IndexAnalyzer = nameIndexAnalyzer,
             SearchAnalyzer = nameSearchAnalyzer,
         },
         new Field("types", DataType.Collection(DataType.String))
         {
             IsRetrievable = false,
             IsFilterable = false,
             IsSortable = false,
             IsFacetable = false,
             IsSearchable = true,
             IndexAnalyzer = nameIndexAnalyzer,
             SearchAnalyzer = nameSearchAnalyzer,
         },
         new Field("typesCamelHump", DataType.Collection(DataType.String))
         {
             IsRetrievable = false,
             IsFilterable = false,
             IsSortable = false,
             IsFacetable = false,
             IsSearchable = true,
             IndexAnalyzer = camelHumpIndexAnalyzer,
             SearchAnalyzer = camelHumpSearchAnalyzer,
         },
         new Field("packageId", DataType.String)
         {
             IsRetrievable = true,
             IsFilterable = false,
             IsSortable = false,
             IsFacetable = false,
             IsSearchable = false,
         },
         new Field("packageVersion", DataType.String)
         {
             IsRetrievable = true,
             IsFilterable = false,
             IsSortable = false,
             IsFacetable = false,
             IsSearchable = false,
         },
         new Field("prerelease", DataType.Boolean)
         {
             IsRetrievable = false,
             IsFilterable = true,
             IsSortable = false,
             IsFacetable = true,
             IsSearchable = false,
         },
         new Field("netstandardVersion", DataType.Int32)
         {
             IsRetrievable = true,
             IsFilterable = true,
             IsSortable = false,
             IsFacetable = true,
             IsSearchable = false,
         },
         new Field("published", DataType.DateTimeOffset)
         {
             IsRetrievable = true,
             IsFilterable = true,
             IsSortable = true,
             IsFacetable = false,
             IsSearchable = false,
         },
         new Field("totalDownloadCount", DataType.Int32)
         {
             IsRetrievable = true,
             IsFilterable = true,
             IsSortable = true,
             IsFacetable = false,
             IsSearchable = false,
         },
     };

     var corsOptions = new CorsOptions()
     {
         AllowedOrigins = new[] { "*" },
         MaxAgeInSeconds = (long)TimeSpan.FromHours(2).TotalSeconds,
     };

     var charFilters = new CharFilter[]
     {
         // Collapses generic type argument lists (e.g. "<T, U>") to ".".
         new PatternReplaceCharFilter
         {
             Name = "remove_generics",
             Pattern = "<[^>]*>",
             Replacement = ".",
         },
         // Replaces every run of non-uppercase characters with ".".
         new PatternReplaceCharFilter
         {
             Name = "remove_non_uppercase",
             Pattern = "[^A-Z]+",
             Replacement = ".",
         },
         // Mapping is sent verbatim; "\u0020" is resolved service-side to a space.
         new MappingCharFilter
         {
             Name = "period_to_space",
             Mappings = new[] { @".=>\u0020" },
         },
         new MappingCharFilter
         {
             Name = "period_to_empty_string",
             Mappings = new[] { @".=>" },
         },
     };

     var tokenFilters = new TokenFilter[]
     {
         // Front edge n-grams of 2..16 characters.
         new EdgeNGramTokenFilter
         {
             Name = "my_ngram",
             MinGram = 2,
             MaxGram = 16,
             Side = EdgeNGramTokenFilterSide.Front,
         },
     };

     var analyzers = new Analyzer[]
     {
         new CustomAnalyzer
         {
             Name = "name_search",
             CharFilters = new[]
             {
                 CharFilterName.Create("remove_generics"),
                 CharFilterName.Create("period_to_space"),
             },
             Tokenizer = TokenizerName.Whitespace,
             TokenFilters = new[]
             {
                 TokenFilterName.Lowercase,
             },
         },
         // Same chain as "name_search", plus edge n-grams at index time.
         new CustomAnalyzer
         {
             Name = "name_index",
             CharFilters = new[]
             {
                 CharFilterName.Create("remove_generics"),
                 CharFilterName.Create("period_to_space"),
             },
             Tokenizer = TokenizerName.Whitespace,
             TokenFilters = new[]
             {
                 TokenFilterName.Lowercase,
                 TokenFilterName.Create("my_ngram"),
             },
         },
         new CustomAnalyzer
         {
             Name = "camel_hump_search",
             Tokenizer = TokenizerName.Whitespace,
             TokenFilters = new[]
             {
                 TokenFilterName.Lowercase,
             },
         },
         new CustomAnalyzer
         {
             Name = "camel_hump_index",
             CharFilters = new[]
             {
                 CharFilterName.Create("remove_generics"),
                 CharFilterName.Create("remove_non_uppercase"),
                 CharFilterName.Create("period_to_empty_string"),
             },
             Tokenizer = TokenizerName.Keyword,
             TokenFilters = new[]
             {
                 TokenFilterName.Lowercase,
                 TokenFilterName.Create("my_ngram"),
             },
         },
     };

     return new Index()
     {
         Name = "types",
         Fields = fields,
         CorsOptions = corsOptions,
         CharFilters = charFilters,
         TokenFilters = tokenFilters,
         Analyzers = analyzers,
     };
 }
Example #2
        /// <summary>
        /// Builds indexes that reference every analyzer, tokenizer, token filter, and
        /// char filter type — once with explicit property values and once relying on
        /// service-side defaults — then exercises them via
        /// <c>TestAnalysisComponents</c> (defined elsewhere; presumably creates the
        /// index and compares it against expectations — confirm in the helper).
        /// </summary>
        public void CanCreateAllAnalysisComponents()
        {
            Run(() =>
            {
                // Declare some custom component names to use with CustomAnalyzer. All other names will be randomly generated.
                var customTokenizerName   = TokenizerName.Create("my_tokenizer");
                var customTokenFilterName = TokenFilterName.Create("my_tokenfilter");
                var customCharFilterName  = CharFilterName.Create("my_charfilter");

                // First index: every component type, constructed with explicit
                // (non-default) property values where the constructors allow them.
                Index index     = CreateTestIndex();
                index.Analyzers = new Analyzer[]
                {
                    new CustomAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        customTokenizerName,
                        new[] { customTokenFilterName },
                        new[] { customCharFilterName }),
                    new CustomAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        TokenizerName.EdgeNGram),
                    new PatternAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        lowerCaseTerms: false,
                        pattern: "abc",
                        flags: RegexFlags.DotAll,
                        stopwords: new[] { "the" }),
                    new StandardAnalyzer(SearchTestUtilities.GenerateName(), maxTokenLength: 100, stopwords: new[] { "the" }),
                    new StopAnalyzer(SearchTestUtilities.GenerateName(), stopwords: new[] { "the" }),
                    new StopAnalyzer(SearchTestUtilities.GenerateName())
                };

                index.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(customTokenizerName, minGram: 1, maxGram: 1),    // One custom tokenizer for CustomAnalyzer above.
                    new EdgeNGramTokenizer(
                        SearchTestUtilities.GenerateName(),
                        minGram: 2,
                        maxGram: 4,
                        tokenChars: new[] { TokenCharacterKind.Letter }),
                    new NGramTokenizer(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 4, tokenChars: new[] { TokenCharacterKind.Letter }),
                    new ClassicTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
                    new KeywordTokenizer(SearchTestUtilities.GenerateName(), bufferSize: 100),
                    new MicrosoftLanguageStemmingTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 100,
                        isSearchTokenizer: true,
                        language: MicrosoftStemmingTokenizerLanguage.Croatian),
                    new MicrosoftLanguageTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 100,
                        isSearchTokenizer: true,
                        language: MicrosoftTokenizerLanguage.Thai),
                    new PathHierarchyTokenizer(
                        SearchTestUtilities.GenerateName(),
                        delimiter: ':',
                        replacement: '_',
                        bufferSize: 100,
                        reverseTokenOrder: true,
                        numberOfTokensToSkip: 2),
                    new PatternTokenizer(
                        SearchTestUtilities.GenerateName(),
                        pattern: ".*",
                        flags: RegexFlags.Multiline | RegexFlags.Literal,
                        group: 0),
                    new StandardTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
                    new UaxUrlEmailTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100)
                };

                index.TokenFilters = new TokenFilter[]
                {
                    new CjkBigramTokenFilter(customTokenFilterName),    // One custom token filter for CustomAnalyzer above.
                    new CjkBigramTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        ignoreScripts: new[] { CjkBigramTokenFilterScripts.Han },
                        outputUnigrams: true),
                    new CjkBigramTokenFilter(SearchTestUtilities.GenerateName()),
                    new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName(), preserveOriginal: true),
                    new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName()),
                    new CommonGramTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        commonWords: new[] { "hello", "goodbye" },
                        ignoreCase: true,
                        useQueryMode: true),
                    new CommonGramTokenFilter(SearchTestUtilities.GenerateName(), commonWords: new[] { "at" }),
                    new DictionaryDecompounderTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        wordList: new[] { "Schadenfreude" },
                        minWordSize: 10,
                        minSubwordSize: 5,
                        maxSubwordSize: 13,
                        onlyLongestMatch: true),
                    new EdgeNGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 10, side: EdgeNGramTokenFilterSide.Back),
                    new ElisionTokenFilter(SearchTestUtilities.GenerateName(), articles: new[] { "a" }),
                    new ElisionTokenFilter(SearchTestUtilities.GenerateName()),
                    new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "aloha" }, lowerCaseKeepWords: true),
                    new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "e", "komo", "mai" }),
                    new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "key", "words" }, ignoreCase: true),
                    new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "essential" }),
                    new LengthTokenFilter(SearchTestUtilities.GenerateName(), min: 5, max: 10),
                    new LimitTokenFilter(SearchTestUtilities.GenerateName(), maxTokenCount: 10, consumeAllTokens: true),
                    new NGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 3),
                    new PatternCaptureTokenFilter(SearchTestUtilities.GenerateName(), patterns: new[] { ".*" }, preserveOriginal: false),
                    new PatternReplaceTokenFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123"),
                    new PhoneticTokenFilter(SearchTestUtilities.GenerateName(), encoder: PhoneticEncoder.Soundex, replaceOriginalTokens: false),
                    new ShingleTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        maxShingleSize: 10,
                        minShingleSize: 5,
                        outputUnigrams: false,
                        outputUnigramsIfNoShingles: true,
                        tokenSeparator: " ",
                        filterToken: "|"),
                    new SnowballTokenFilter(SearchTestUtilities.GenerateName(), SnowballTokenFilterLanguage.English),
                    new StemmerOverrideTokenFilter(SearchTestUtilities.GenerateName(), rules: new[] { "ran => run" }),
                    new StemmerTokenFilter(SearchTestUtilities.GenerateName(), StemmerTokenFilterLanguage.French),
                    new StopwordsTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        stopwords: new[] { "a", "the" },
                        ignoreCase: true,
                        removeTrailingStopWords: false),
                    new StopwordsTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        stopwordsList: StopwordsList.Italian,
                        ignoreCase: true,
                        removeTrailingStopWords: false),
                    new SynonymTokenFilter(SearchTestUtilities.GenerateName(), synonyms: new[] { "great, good" }, ignoreCase: true, expand: false),
                    new TruncateTokenFilter(SearchTestUtilities.GenerateName(), length: 10),
                    new UniqueTokenFilter(SearchTestUtilities.GenerateName(), onlyOnSamePosition: true),
                    new UniqueTokenFilter(SearchTestUtilities.GenerateName()),
                    new WordDelimiterTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        generateWordParts: false,
                        generateNumberParts: false,
                        catenateWords: true,
                        catenateNumbers: true,
                        catenateAll: true,
                        splitOnCaseChange: false,
                        preserveOriginal: true,
                        splitOnNumerics: false,
                        stemEnglishPossessive: false,
                        protectedWords: new[] { "protected" })
                };

                index.CharFilters = new CharFilter[]
                {
                    new MappingCharFilter(customCharFilterName, mappings: new[] { "a => b" }),    // One custom char filter for CustomAnalyzer above.
                    new MappingCharFilter(SearchTestUtilities.GenerateName(), mappings: new[] { "s => $", "S => $" }),
                    new PatternReplaceCharFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123")
                };

                // We have to split up analysis components into two indexes, one where any components with optional properties have defaults that
                // are zero or null (default(T)), and another where we need to specify the default values we expect to get back from the REST API.

                // Deterministic names ("a0", "a1", ...) so the components of
                // indexWithSpecialDefaults and expectedIndexWithSpecialDefaults can be
                // matched up by position via the shared counter i.
                Func <int, string> generateSimpleName = n => string.Format(CultureInfo.InvariantCulture, "a{0}", n);

                int i = 0;

                // Second index: components constructed with all optional arguments omitted.
                Index indexWithSpecialDefaults     = CreateTestIndex();
                indexWithSpecialDefaults.Analyzers = new Analyzer[]
                {
                    new PatternAnalyzer(generateSimpleName(i++)),
                    new StandardAnalyzer(generateSimpleName(i++))
                };

                indexWithSpecialDefaults.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(generateSimpleName(i++)),
                    new NGramTokenizer(generateSimpleName(i++)),
                    new ClassicTokenizer(generateSimpleName(i++)),
                    new KeywordTokenizer(generateSimpleName(i++)),
                    new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++)),
                    new MicrosoftLanguageTokenizer(generateSimpleName(i++)),
                    new PathHierarchyTokenizer(generateSimpleName(i++)),
                    new PatternTokenizer(generateSimpleName(i++)),
                    new StandardTokenizer(generateSimpleName(i++)),
                    new UaxUrlEmailTokenizer(generateSimpleName(i++))
                };

                indexWithSpecialDefaults.TokenFilters = new TokenFilter[]
                {
                    new DictionaryDecompounderTokenFilter(
                        generateSimpleName(i++),
                        wordList: new[] { "Bahnhof" }),
                    new EdgeNGramTokenFilter(generateSimpleName(i++)),
                    new LengthTokenFilter(generateSimpleName(i++)),
                    new LimitTokenFilter(generateSimpleName(i++)),
                    new NGramTokenFilter(generateSimpleName(i++)),
                    new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }),
                    new PhoneticTokenFilter(generateSimpleName(i++)),
                    new ShingleTokenFilter(generateSimpleName(i++)),
                    new StopwordsTokenFilter(generateSimpleName(i++)),
                    new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }),
                    new TruncateTokenFilter(generateSimpleName(i++)),
                    new WordDelimiterTokenFilter(generateSimpleName(i++))
                };

                // Reset the counter so the expected index regenerates the same
                // "a0", "a1", ... names in the same order as above.
                i = 0;

                // Expected form of the second index: the same components, but with the
                // default property values the REST API is expected to fill in.
                Index expectedIndexWithSpecialDefaults     = CreateTestIndex();
                expectedIndexWithSpecialDefaults.Name      = indexWithSpecialDefaults.Name;
                expectedIndexWithSpecialDefaults.Analyzers = new Analyzer[]
                {
                    new PatternAnalyzer(generateSimpleName(i++), lowerCaseTerms: true, pattern: @"\W+"),
                    new StandardAnalyzer(generateSimpleName(i++), maxTokenLength: 255)
                };

                expectedIndexWithSpecialDefaults.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new NGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new ClassicTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new KeywordTokenizer(generateSimpleName(i++), bufferSize: 256),
                    new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new MicrosoftLanguageTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new PathHierarchyTokenizer(generateSimpleName(i++), delimiter: '/', replacement: '/', bufferSize: 1024),
                    new PatternTokenizer(generateSimpleName(i++), pattern: @"\W+", group: -1),
                    new StandardTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new UaxUrlEmailTokenizer(generateSimpleName(i++), maxTokenLength: 255)
                };

                expectedIndexWithSpecialDefaults.TokenFilters = new TokenFilter[]
                {
                    new DictionaryDecompounderTokenFilter(
                        generateSimpleName(i++),
                        wordList: new[] { "Bahnhof" },
                        minWordSize: 5,
                        minSubwordSize: 2,
                        maxSubwordSize: 15),
                    new EdgeNGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2, side: EdgeNGramTokenFilterSide.Front),
                    new LengthTokenFilter(generateSimpleName(i++), max: int.MaxValue),
                    new LimitTokenFilter(generateSimpleName(i++), maxTokenCount: 1),
                    new NGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }, preserveOriginal: true),
                    new PhoneticTokenFilter(generateSimpleName(i++), encoder: PhoneticEncoder.Metaphone, replaceOriginalTokens: true),
                    new ShingleTokenFilter(
                        generateSimpleName(i++),
                        maxShingleSize: 2,
                        minShingleSize: 2,
                        outputUnigrams: true,
                        tokenSeparator: " ",
                        filterToken: "_"),
                    new StopwordsTokenFilter(generateSimpleName(i++), stopwordsList: StopwordsList.English, removeTrailingStopWords: true),
                    new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }, expand: true),
                    new TruncateTokenFilter(generateSimpleName(i++), length: 300),
                    new WordDelimiterTokenFilter(
                        generateSimpleName(i++),
                        generateWordParts: true,
                        generateNumberParts: true,
                        splitOnCaseChange: true,
                        splitOnNumerics: true,
                        stemEnglishPossessive: true)
                };

                // This is to make sure we didn't forget any components in this test.
                AssertIndexContainsAllAnalysisComponents(index, indexWithSpecialDefaults);

                TestAnalysisComponents(index);
                TestAnalysisComponents(indexWithSpecialDefaults, expectedIndexWithSpecialDefaults);
            });
        }