void IUtf8JsonSerializable.Write(Utf8JsonWriter writer)
{
    writer.WriteStartObject();

    // The tokenizer reference is required and always written.
    writer.WritePropertyName("tokenizer");
    writer.WriteStringValue(TokenizerName.ToString());

    // Token filters are optional; the property is omitted entirely when
    // the collection was never defined.
    if (Optional.IsCollectionDefined(TokenFilters))
    {
        writer.WritePropertyName("tokenFilters");
        writer.WriteStartArray();
        foreach (var item in TokenFilters)
        {
            writer.WriteStringValue(item.ToString());
        }
        writer.WriteEndArray();
    }

    // Char filters are likewise optional and serialized as plain strings.
    if (Optional.IsCollectionDefined(CharFilters))
    {
        writer.WritePropertyName("charFilters");
        writer.WriteStartArray();
        foreach (var item in CharFilters)
        {
            writer.WriteStringValue(item);
        }
        writer.WriteEndArray();
    }

    // The OData discriminator and analyzer name identify this analyzer
    // to the REST API.
    writer.WritePropertyName("@odata.type");
    writer.WriteStringValue(ODataType);
    writer.WritePropertyName("name");
    writer.WriteStringValue(Name);

    writer.WriteEndObject();
}
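
For context, this Write implementation emits the REST payload for a single custom analyzer. Below is a minimal sketch of how it might be driven; the analyzer values are illustrative, and since IUtf8JsonSerializable is internal to the Azure SDK, a helper like this would only compile inside that assembly:

using System.IO;
using System.Text;
using System.Text.Json;

// Hypothetical helper: serialize any IUtf8JsonSerializable model to a JSON
// string. For a CustomAnalyzer, the Write method above yields an object like:
//   { "tokenizer": "my_tokenizer", "tokenFilters": ["lowercase"],
//     "charFilters": ["my_charfilter"],
//     "@odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
//     "name": "my_analyzer" }
internal static string SerializeToJson(IUtf8JsonSerializable model)
{
    using var stream = new MemoryStream();
    using (var writer = new Utf8JsonWriter(stream))
    {
        model.Write(writer);   // Utf8JsonWriter flushes on dispose.
    }
    return Encoding.UTF8.GetString(stream.ToArray());
}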
        private async Task CreateCombinedIndex(ISearchServiceClient serviceClient)
        {
            var indexDefinition = new Index
            {
                Name      = SearchConstants.CombinedIndexName,
                Fields    = FieldBuilder.BuildForType<CombinedIndexedItem>(),
                Analyzers = new List<Analyzer>
                {
                    // Analyzer built on the custom n-gram tokenizer defined below.
                    new CustomAnalyzer
                    {
                        Name         = SearchConstants.AdvancedAnalyzerName,
                        TokenFilters = new List<TokenFilterName>
                        {
                            TokenFilterName.Lowercase,
                            TokenFilterName.AsciiFolding,
                            //TokenFilterName.Phonetic,
                            //TokenFilterName.EdgeNGram
                        },
                        Tokenizer = TokenizerName.Create(SearchConstants.NGramTokenizerName),
                        //Tokenizer = TokenizerName.EdgeNGram,
                    },
                    // Alternative analyzer using the built-in edge n-gram tokenizer
                    // plus the custom token filter defined below (TokenFilterName
                    // has an implicit conversion from string).
                    new CustomAnalyzer
                    {
                        Name         = SearchConstants.AdvancedAnalyzer_2_Name,
                        Tokenizer    = TokenizerName.EdgeNGram,
                        TokenFilters = new List<TokenFilterName>
                        {
                            TokenFilterName.Lowercase,
                            "myNGramTokenFilter"
                        }
                    }
                },
                Tokenizers = new List<Tokenizer>
                {
                    new NGramTokenizer(SearchConstants.NGramTokenizerName)
                    {
                        MinGram    = 4,
                        MaxGram    = 30,
                        TokenChars = new List<TokenCharacterKind>
                        {
                            TokenCharacterKind.Letter,
                            TokenCharacterKind.Digit,
                        }
                    }
                },
                TokenFilters = new List<TokenFilter>
                {
                    new NGramTokenFilterV2
                    {
                        Name    = "myNGramTokenFilter",
                        MinGram = 1,
                        MaxGram = 100
                    }
                }
            };

            await serviceClient.Indexes.CreateAsync(indexDefinition);
        }
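
The analyzers above only take effect once fields reference them by name. Since the CombinedIndexedItem model is not shown in this listing, here is a hypothetical sketch of how a field might opt into the custom analyzer via the attributes that FieldBuilder.BuildForType<T>() understands; the property names are illustrative, and SearchConstants.AdvancedAnalyzerName is assumed to be a const string:

using System.ComponentModel.DataAnnotations;
using Microsoft.Azure.Search;

public class CombinedIndexedItem
{
    [Key]
    public string Id { get; set; }

    // Hypothetical searchable field wired to the custom n-gram analyzer;
    // FieldBuilder.BuildForType<CombinedIndexedItem>() turns these
    // attributes into the corresponding Field definitions.
    [IsSearchable, Analyzer(SearchConstants.AdvancedAnalyzerName)]
    public string Title { get; set; }
}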
Example #3
        public void CanCreateAllAnalysisComponents()
        {
            Run(() =>
            {
                // Declare some custom component names to use with CustomAnalyzer. All other names will be randomly generated.
                var customTokenizerName   = TokenizerName.Create("my_tokenizer");
                var customTokenFilterName = TokenFilterName.Create("my_tokenfilter");
                var customCharFilterName  = CharFilterName.Create("my_charfilter");

                Index index     = CreateTestIndex();
                index.Analyzers = new Analyzer[]
                {
                    new CustomAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        customTokenizerName,
                        new[] { customTokenFilterName },
                        new[] { customCharFilterName }),
                    new CustomAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        TokenizerName.EdgeNGram),
                    new PatternAnalyzer(
                        SearchTestUtilities.GenerateName(),
                        lowerCaseTerms: false,
                        pattern: "abc",
                        flags: RegexFlags.DotAll,
                        stopwords: new[] { "the" }),
                    new StandardAnalyzer(SearchTestUtilities.GenerateName(), maxTokenLength: 100, stopwords: new[] { "the" }),
                    new StopAnalyzer(SearchTestUtilities.GenerateName(), stopwords: new[] { "the" }),
                    new StopAnalyzer(SearchTestUtilities.GenerateName())
                };

                index.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(customTokenizerName, minGram: 1, maxGram: 1),    // One custom tokenizer for CustomAnalyzer above.
                    new EdgeNGramTokenizer(
                        SearchTestUtilities.GenerateName(),
                        minGram: 2,
                        maxGram: 4,
                        tokenChars: new[] { TokenCharacterKind.Letter }),
                    new NGramTokenizer(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 4, tokenChars: new[] { TokenCharacterKind.Letter }),
                    new ClassicTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
                    new KeywordTokenizer(SearchTestUtilities.GenerateName(), bufferSize: 100),
                    new MicrosoftLanguageStemmingTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 100,
                        isSearchTokenizer: true,
                        language: MicrosoftStemmingTokenizerLanguage.Croatian),
                    new MicrosoftLanguageTokenizer(
                        SearchTestUtilities.GenerateName(),
                        maxTokenLength: 100,
                        isSearchTokenizer: true,
                        language: MicrosoftTokenizerLanguage.Thai),
                    new PathHierarchyTokenizer(
                        SearchTestUtilities.GenerateName(),
                        delimiter: ':',
                        replacement: '_',
                        bufferSize: 100,
                        reverseTokenOrder: true,
                        numberOfTokensToSkip: 2),
                    new PatternTokenizer(
                        SearchTestUtilities.GenerateName(),
                        pattern: ".*",
                        flags: RegexFlags.Multiline | RegexFlags.Literal,
                        group: 0),
                    new StandardTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
                    new UaxUrlEmailTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100)
                };

                index.TokenFilters = new TokenFilter[]
                {
                    new CjkBigramTokenFilter(customTokenFilterName),    // One custom token filter for CustomAnalyzer above.
                    new CjkBigramTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        ignoreScripts: new[] { CjkBigramTokenFilterScripts.Han },
                        outputUnigrams: true),
                    new CjkBigramTokenFilter(SearchTestUtilities.GenerateName()),
                    new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName(), preserveOriginal: true),
                    new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName()),
                    new CommonGramTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        commonWords: new[] { "hello", "goodbye" },
                        ignoreCase: true,
                        useQueryMode: true),
                    new CommonGramTokenFilter(SearchTestUtilities.GenerateName(), commonWords: new[] { "at" }),
                    new DictionaryDecompounderTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        wordList: new[] { "Schadenfreude" },
                        minWordSize: 10,
                        minSubwordSize: 5,
                        maxSubwordSize: 13,
                        onlyLongestMatch: true),
                    new EdgeNGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 10, side: EdgeNGramTokenFilterSide.Back),
                    new ElisionTokenFilter(SearchTestUtilities.GenerateName(), articles: new[] { "a" }),
                    new ElisionTokenFilter(SearchTestUtilities.GenerateName()),
                    new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "aloha" }, lowerCaseKeepWords: true),
                    new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "e", "komo", "mai" }),
                    new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "key", "words" }, ignoreCase: true),
                    new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "essential" }),
                    new LengthTokenFilter(SearchTestUtilities.GenerateName(), min: 5, max: 10),
                    new LimitTokenFilter(SearchTestUtilities.GenerateName(), maxTokenCount: 10, consumeAllTokens: true),
                    new NGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 3),
                    new PatternCaptureTokenFilter(SearchTestUtilities.GenerateName(), patterns: new[] { ".*" }, preserveOriginal: false),
                    new PatternReplaceTokenFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123"),
                    new PhoneticTokenFilter(SearchTestUtilities.GenerateName(), encoder: PhoneticEncoder.Soundex, replaceOriginalTokens: false),
                    new ShingleTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        maxShingleSize: 10,
                        minShingleSize: 5,
                        outputUnigrams: false,
                        outputUnigramsIfNoShingles: true,
                        tokenSeparator: " ",
                        filterToken: "|"),
                    new SnowballTokenFilter(SearchTestUtilities.GenerateName(), SnowballTokenFilterLanguage.English),
                    new StemmerOverrideTokenFilter(SearchTestUtilities.GenerateName(), rules: new[] { "ran => run" }),
                    new StemmerTokenFilter(SearchTestUtilities.GenerateName(), StemmerTokenFilterLanguage.French),
                    new StopwordsTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        stopwords: new[] { "a", "the" },
                        ignoreCase: true,
                        removeTrailingStopWords: false),
                    new StopwordsTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        stopwordsList: StopwordsList.Italian,
                        ignoreCase: true,
                        removeTrailingStopWords: false),
                    new SynonymTokenFilter(SearchTestUtilities.GenerateName(), synonyms: new[] { "great, good" }, ignoreCase: true, expand: false),
                    new TruncateTokenFilter(SearchTestUtilities.GenerateName(), length: 10),
                    new UniqueTokenFilter(SearchTestUtilities.GenerateName(), onlyOnSamePosition: true),
                    new UniqueTokenFilter(SearchTestUtilities.GenerateName()),
                    new WordDelimiterTokenFilter(
                        SearchTestUtilities.GenerateName(),
                        generateWordParts: false,
                        generateNumberParts: false,
                        catenateWords: true,
                        catenateNumbers: true,
                        catenateAll: true,
                        splitOnCaseChange: false,
                        preserveOriginal: true,
                        splitOnNumerics: false,
                        stemEnglishPossessive: false,
                        protectedWords: new[] { "protected" })
                };

                index.CharFilters = new CharFilter[]
                {
                    new MappingCharFilter(customCharFilterName, mappings: new[] { "a => b" }),    // One custom char filter for CustomAnalyzer above.
                    new MappingCharFilter(SearchTestUtilities.GenerateName(), mappings: new[] { "s => $", "S => $" }),
                    new PatternReplaceCharFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123")
                };

                // We have to split up analysis components into two indexes, one where any components with optional properties have defaults that
                // are zero or null (default(T)), and another where we need to specify the default values we expect to get back from the REST API.

                Func<int, string> generateSimpleName = n => string.Format(CultureInfo.InvariantCulture, "a{0}", n);

                int i = 0;

                Index indexWithSpecialDefaults     = CreateTestIndex();
                indexWithSpecialDefaults.Analyzers = new Analyzer[]
                {
                    new PatternAnalyzer(generateSimpleName(i++)),
                    new StandardAnalyzer(generateSimpleName(i++))
                };

                indexWithSpecialDefaults.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(generateSimpleName(i++)),
                    new NGramTokenizer(generateSimpleName(i++)),
                    new ClassicTokenizer(generateSimpleName(i++)),
                    new KeywordTokenizer(generateSimpleName(i++)),
                    new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++)),
                    new MicrosoftLanguageTokenizer(generateSimpleName(i++)),
                    new PathHierarchyTokenizer(generateSimpleName(i++)),
                    new PatternTokenizer(generateSimpleName(i++)),
                    new StandardTokenizer(generateSimpleName(i++)),
                    new UaxUrlEmailTokenizer(generateSimpleName(i++))
                };

                indexWithSpecialDefaults.TokenFilters = new TokenFilter[]
                {
                    new DictionaryDecompounderTokenFilter(
                        generateSimpleName(i++),
                        wordList: new[] { "Bahnhof" }),
                    new EdgeNGramTokenFilter(generateSimpleName(i++)),
                    new LengthTokenFilter(generateSimpleName(i++)),
                    new LimitTokenFilter(generateSimpleName(i++)),
                    new NGramTokenFilter(generateSimpleName(i++)),
                    new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }),
                    new PhoneticTokenFilter(generateSimpleName(i++)),
                    new ShingleTokenFilter(generateSimpleName(i++)),
                    new StopwordsTokenFilter(generateSimpleName(i++)),
                    new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }),
                    new TruncateTokenFilter(generateSimpleName(i++)),
                    new WordDelimiterTokenFilter(generateSimpleName(i++))
                };

                i = 0;

                Index expectedIndexWithSpecialDefaults     = CreateTestIndex();
                expectedIndexWithSpecialDefaults.Name      = indexWithSpecialDefaults.Name;
                expectedIndexWithSpecialDefaults.Analyzers = new Analyzer[]
                {
                    new PatternAnalyzer(generateSimpleName(i++), lowerCaseTerms: true, pattern: @"\W+"),
                    new StandardAnalyzer(generateSimpleName(i++), maxTokenLength: 255)
                };

                expectedIndexWithSpecialDefaults.Tokenizers = new Tokenizer[]
                {
                    new EdgeNGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new NGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new ClassicTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new KeywordTokenizer(generateSimpleName(i++), bufferSize: 256),
                    new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new MicrosoftLanguageTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new PathHierarchyTokenizer(generateSimpleName(i++), delimiter: '/', replacement: '/', bufferSize: 1024),
                    new PatternTokenizer(generateSimpleName(i++), pattern: @"\W+", group: -1),
                    new StandardTokenizer(generateSimpleName(i++), maxTokenLength: 255),
                    new UaxUrlEmailTokenizer(generateSimpleName(i++), maxTokenLength: 255)
                };

                expectedIndexWithSpecialDefaults.TokenFilters = new TokenFilter[]
                {
                    new DictionaryDecompounderTokenFilter(
                        generateSimpleName(i++),
                        wordList: new[] { "Bahnhof" },
                        minWordSize: 5,
                        minSubwordSize: 2,
                        maxSubwordSize: 15),
                    new EdgeNGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2, side: EdgeNGramTokenFilterSide.Front),
                    new LengthTokenFilter(generateSimpleName(i++), max: int.MaxValue),
                    new LimitTokenFilter(generateSimpleName(i++), maxTokenCount: 1),
                    new NGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2),
                    new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }, preserveOriginal: true),
                    new PhoneticTokenFilter(generateSimpleName(i++), encoder: PhoneticEncoder.Metaphone, replaceOriginalTokens: true),
                    new ShingleTokenFilter(
                        generateSimpleName(i++),
                        maxShingleSize: 2,
                        minShingleSize: 2,
                        outputUnigrams: true,
                        tokenSeparator: " ",
                        filterToken: "_"),
                    new StopwordsTokenFilter(generateSimpleName(i++), stopwordsList: StopwordsList.English, removeTrailingStopWords: true),
                    new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }, expand: true),
                    new TruncateTokenFilter(generateSimpleName(i++), length: 300),
                    new WordDelimiterTokenFilter(
                        generateSimpleName(i++),
                        generateWordParts: true,
                        generateNumberParts: true,
                        splitOnCaseChange: true,
                        splitOnNumerics: true,
                        stemEnglishPossessive: true)
                };

                // This is to make sure we didn't forget any components in this test.
                AssertIndexContainsAllAnalysisComponents(index, indexWithSpecialDefaults);

                TestAnalysisComponents(index);
                TestAnalysisComponents(indexWithSpecialDefaults, expectedIndexWithSpecialDefaults);
            });
        }
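
TestAnalysisComponents is the round-trip helper these assertions rely on; its body is not part of this listing. A minimal sketch of the idea, assuming the test fixture's Data.GetSearchServiceClient() factory and xunit assertions:

private void TestAnalysisComponents(Index index, Index expectedIndex = null)
{
    expectedIndex = expectedIndex ?? index;

    // Create the index, read it back, and confirm the analysis components
    // survive the round trip to the service unchanged.
    SearchServiceClient serviceClient = Data.GetSearchServiceClient();
    serviceClient.Indexes.Create(index);

    Index createdIndex = serviceClient.Indexes.Get(index.Name);
    Assert.Equal(expectedIndex.Name, createdIndex.Name);
    // A full implementation would also compare Analyzers, Tokenizers,
    // TokenFilters, and CharFilters element by element.
}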