void IUtf8JsonSerializable.Write(Utf8JsonWriter writer)
{
    writer.WriteStartObject();
    writer.WritePropertyName("tokenizer");
    writer.WriteStringValue(TokenizerName.ToString());
    if (Optional.IsCollectionDefined(TokenFilters))
    {
        writer.WritePropertyName("tokenFilters");
        writer.WriteStartArray();
        foreach (var item in TokenFilters)
        {
            writer.WriteStringValue(item.ToString());
        }
        writer.WriteEndArray();
    }
    if (Optional.IsCollectionDefined(CharFilters))
    {
        writer.WritePropertyName("charFilters");
        writer.WriteStartArray();
        foreach (var item in CharFilters)
        {
            writer.WriteStringValue(item);
        }
        writer.WriteEndArray();
    }
    writer.WritePropertyName("@odata.type");
    writer.WriteStringValue(ODataType);
    writer.WritePropertyName("name");
    writer.WriteStringValue(Name);
    writer.WriteEndObject();
}
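// The method above is the generated Azure.Search.Documents serializer for
// CustomAnalyzer: it writes the analyzer as the REST payload, emitting
// "tokenFilters" and "charFilters" only when those collections are set.
// A minimal sketch of building such an analyzer through the public model
// types (names like "my_analyzer" and "my-index" are placeholders, not
// taken from the original):

using Azure.Search.Documents.Indexes.Models;

var analyzer = new CustomAnalyzer("my_analyzer", LexicalTokenizerName.Standard);
analyzer.TokenFilters.Add(TokenFilterName.Lowercase);
analyzer.TokenFilters.Add(TokenFilterName.AsciiFolding);

// The serializer above runs when this index definition is sent to the service.
var index = new SearchIndex("my-index")
{
    Fields = { new SearchableField("title") { AnalyzerName = "my_analyzer" } },
    Analyzers = { analyzer }
};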
private async Task CreateCombinedIndex(ISearchServiceClient serviceClient)
{
    var indexDefinition = new Index
    {
        Name = SearchConstants.CombinedIndexName,
        Fields = FieldBuilder.BuildForType<CombinedIndexedItem>(),
        Analyzers = new List<Analyzer>
        {
            new CustomAnalyzer
            {
                Name = SearchConstants.AdvancedAnalyzerName,
                TokenFilters = new List<TokenFilterName>
                {
                    TokenFilterName.Lowercase,
                    TokenFilterName.AsciiFolding,
                    //TokenFilterName.Phonetic,
                    //TokenFilterName.EdgeNGram
                },
                Tokenizer = TokenizerName.Create(SearchConstants.NGramTokenizerName),
                //Tokenizer = TokenizerName.EdgeNGram,
            },
            new CustomAnalyzer
            {
                Name = SearchConstants.AdvancedAnalyzer_2_Name,
                Tokenizer = TokenizerName.EdgeNGram,
                TokenFilters = new List<TokenFilterName>
                {
                    TokenFilterName.Lowercase,
                    "myNGramTokenFilter"
                }
            }
        },
        Tokenizers = new List<Tokenizer>
        {
            new NGramTokenizer(SearchConstants.NGramTokenizerName)
            {
                MinGram = 4,
                MaxGram = 30,
                TokenChars = new List<TokenCharacterKind>
                {
                    TokenCharacterKind.Letter,
                    TokenCharacterKind.Digit,
                }
            }
        },
        TokenFilters = new List<TokenFilter>
        {
            new NGramTokenFilterV2
            {
                Name = "myNGramTokenFilter",
                MinGram = 1,
                MaxGram = 100
            }
        }
    };

    await serviceClient.Indexes.CreateAsync(indexDefinition);
}
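// FieldBuilder.BuildForType<T>() only generates the field definitions; a field
// opts in to a custom analyzer by name. A hedged sketch of what the
// CombinedIndexedItem model could look like (the class body below is an
// assumption and is not shown in the original; it also assumes
// SearchConstants.AdvancedAnalyzerName is a const string, since attribute
// arguments must be compile-time constants):

using System.ComponentModel.DataAnnotations;
using Microsoft.Azure.Search;

public class CombinedIndexedItem
{
    [Key]
    public string Id { get; set; }

    // FieldBuilder reads [Analyzer] and wires this field to the custom
    // analyzer defined in CreateCombinedIndex above.
    [IsSearchable, Analyzer(SearchConstants.AdvancedAnalyzerName)]
    public string Title { get; set; }
}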
public void CanCreateAllAnalysisComponents()
{
    Run(() =>
    {
        // Declare some custom component names to use with CustomAnalyzer. All other names will be randomly generated.
        var customTokenizerName = TokenizerName.Create("my_tokenizer");
        var customTokenFilterName = TokenFilterName.Create("my_tokenfilter");
        var customCharFilterName = CharFilterName.Create("my_charfilter");

        Index index = CreateTestIndex();
        index.Analyzers = new Analyzer[]
        {
            new CustomAnalyzer(
                SearchTestUtilities.GenerateName(),
                customTokenizerName,
                new[] { customTokenFilterName },
                new[] { customCharFilterName }),
            new CustomAnalyzer(
                SearchTestUtilities.GenerateName(),
                TokenizerName.EdgeNGram),
            new PatternAnalyzer(
                SearchTestUtilities.GenerateName(),
                lowerCaseTerms: false,
                pattern: "abc",
                flags: RegexFlags.DotAll,
                stopwords: new[] { "the" }),
            new StandardAnalyzer(SearchTestUtilities.GenerateName(), maxTokenLength: 100, stopwords: new[] { "the" }),
            new StopAnalyzer(SearchTestUtilities.GenerateName(), stopwords: new[] { "the" }),
            new StopAnalyzer(SearchTestUtilities.GenerateName())
        };

        index.Tokenizers = new Tokenizer[]
        {
            new EdgeNGramTokenizer(customTokenizerName, minGram: 1, maxGram: 1), // One custom tokenizer for CustomAnalyzer above.
            new EdgeNGramTokenizer(
                SearchTestUtilities.GenerateName(),
                minGram: 2,
                maxGram: 4,
                tokenChars: new[] { TokenCharacterKind.Letter }),
            new NGramTokenizer(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 4, tokenChars: new[] { TokenCharacterKind.Letter }),
            new ClassicTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
            new KeywordTokenizer(SearchTestUtilities.GenerateName(), bufferSize: 100),
            new MicrosoftLanguageStemmingTokenizer(
                SearchTestUtilities.GenerateName(),
                maxTokenLength: 100,
                isSearchTokenizer: true,
                language: MicrosoftStemmingTokenizerLanguage.Croatian),
            new MicrosoftLanguageTokenizer(
                SearchTestUtilities.GenerateName(),
                maxTokenLength: 100,
                isSearchTokenizer: true,
                language: MicrosoftTokenizerLanguage.Thai),
            new PathHierarchyTokenizer(
                SearchTestUtilities.GenerateName(),
                delimiter: ':',
                replacement: '_',
                bufferSize: 100,
                reverseTokenOrder: true,
                numberOfTokensToSkip: 2),
            new PatternTokenizer(
                SearchTestUtilities.GenerateName(),
                pattern: ".*",
                flags: RegexFlags.Multiline | RegexFlags.Literal,
                group: 0),
            new StandardTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100),
            new UaxUrlEmailTokenizer(SearchTestUtilities.GenerateName(), maxTokenLength: 100)
        };

        index.TokenFilters = new TokenFilter[]
        {
            new CjkBigramTokenFilter(customTokenFilterName), // One custom token filter for CustomAnalyzer above.
            new CjkBigramTokenFilter(
                SearchTestUtilities.GenerateName(),
                ignoreScripts: new[] { CjkBigramTokenFilterScripts.Han },
                outputUnigrams: true),
            new CjkBigramTokenFilter(SearchTestUtilities.GenerateName()),
            new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName(), preserveOriginal: true),
            new AsciiFoldingTokenFilter(SearchTestUtilities.GenerateName()),
            new CommonGramTokenFilter(
                SearchTestUtilities.GenerateName(),
                commonWords: new[] { "hello", "goodbye" },
                ignoreCase: true,
                useQueryMode: true),
            new CommonGramTokenFilter(SearchTestUtilities.GenerateName(), commonWords: new[] { "at" }),
            new DictionaryDecompounderTokenFilter(
                SearchTestUtilities.GenerateName(),
                wordList: new[] { "Schadenfreude" },
                minWordSize: 10,
                minSubwordSize: 5,
                maxSubwordSize: 13,
                onlyLongestMatch: true),
            new EdgeNGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 10, side: EdgeNGramTokenFilterSide.Back),
            new ElisionTokenFilter(SearchTestUtilities.GenerateName(), articles: new[] { "a" }),
            new ElisionTokenFilter(SearchTestUtilities.GenerateName()),
            new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "aloha" }, lowerCaseKeepWords: true),
            new KeepTokenFilter(SearchTestUtilities.GenerateName(), keepWords: new[] { "e", "komo", "mai" }),
            new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "key", "words" }, ignoreCase: true),
            new KeywordMarkerTokenFilter(SearchTestUtilities.GenerateName(), keywords: new[] { "essential" }),
            new LengthTokenFilter(SearchTestUtilities.GenerateName(), min: 5, max: 10),
            new LimitTokenFilter(SearchTestUtilities.GenerateName(), maxTokenCount: 10, consumeAllTokens: true),
            new NGramTokenFilter(SearchTestUtilities.GenerateName(), minGram: 2, maxGram: 3),
            new PatternCaptureTokenFilter(SearchTestUtilities.GenerateName(), patterns: new[] { ".*" }, preserveOriginal: false),
            new PatternReplaceTokenFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123"),
            new PhoneticTokenFilter(SearchTestUtilities.GenerateName(), encoder: PhoneticEncoder.Soundex, replaceOriginalTokens: false),
            new ShingleTokenFilter(
                SearchTestUtilities.GenerateName(),
                maxShingleSize: 10,
                minShingleSize: 5,
                outputUnigrams: false,
                outputUnigramsIfNoShingles: true,
                tokenSeparator: " ",
                filterToken: "|"),
            new SnowballTokenFilter(SearchTestUtilities.GenerateName(), SnowballTokenFilterLanguage.English),
            new StemmerOverrideTokenFilter(SearchTestUtilities.GenerateName(), rules: new[] { "ran => run" }),
            new StemmerTokenFilter(SearchTestUtilities.GenerateName(), StemmerTokenFilterLanguage.French),
            new StopwordsTokenFilter(
                SearchTestUtilities.GenerateName(),
                stopwords: new[] { "a", "the" },
                ignoreCase: true,
                removeTrailingStopWords: false),
            new StopwordsTokenFilter(
                SearchTestUtilities.GenerateName(),
                stopwordsList: StopwordsList.Italian,
                ignoreCase: true,
                removeTrailingStopWords: false),
            new SynonymTokenFilter(SearchTestUtilities.GenerateName(), synonyms: new[] { "great, good" }, ignoreCase: true, expand: false),
            new TruncateTokenFilter(SearchTestUtilities.GenerateName(), length: 10),
            new UniqueTokenFilter(SearchTestUtilities.GenerateName(), onlyOnSamePosition: true),
            new UniqueTokenFilter(SearchTestUtilities.GenerateName()),
            new WordDelimiterTokenFilter(
                SearchTestUtilities.GenerateName(),
                generateWordParts: false,
                generateNumberParts: false,
                catenateWords: true,
                catenateNumbers: true,
                catenateAll: true,
                splitOnCaseChange: false,
                preserveOriginal: true,
                splitOnNumerics: false,
                stemEnglishPossessive: false,
                protectedWords: new[] { "protected" })
        };
        index.CharFilters = new CharFilter[]
        {
            new MappingCharFilter(customCharFilterName, mappings: new[] { "a => b" }), // One custom char filter for CustomAnalyzer above.
            new MappingCharFilter(SearchTestUtilities.GenerateName(), mappings: new[] { "s => $", "S => $" }),
            new PatternReplaceCharFilter(SearchTestUtilities.GenerateName(), pattern: "abc", replacement: "123")
        };

        // We have to split up analysis components into two indexes, one where any components with optional properties have
        // defaults that are zero or null (default(T)), and another where we need to specify the default values we expect to
        // get back from the REST API.
        Func<int, string> generateSimpleName = n => string.Format(CultureInfo.InvariantCulture, "a{0}", n);

        int i = 0;
        Index indexWithSpecialDefaults = CreateTestIndex();
        indexWithSpecialDefaults.Analyzers = new Analyzer[]
        {
            new PatternAnalyzer(generateSimpleName(i++)),
            new StandardAnalyzer(generateSimpleName(i++))
        };
        indexWithSpecialDefaults.Tokenizers = new Tokenizer[]
        {
            new EdgeNGramTokenizer(generateSimpleName(i++)),
            new NGramTokenizer(generateSimpleName(i++)),
            new ClassicTokenizer(generateSimpleName(i++)),
            new KeywordTokenizer(generateSimpleName(i++)),
            new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++)),
            new MicrosoftLanguageTokenizer(generateSimpleName(i++)),
            new PathHierarchyTokenizer(generateSimpleName(i++)),
            new PatternTokenizer(generateSimpleName(i++)),
            new StandardTokenizer(generateSimpleName(i++)),
            new UaxUrlEmailTokenizer(generateSimpleName(i++))
        };
        indexWithSpecialDefaults.TokenFilters = new TokenFilter[]
        {
            new DictionaryDecompounderTokenFilter(generateSimpleName(i++), wordList: new[] { "Bahnhof" }),
            new EdgeNGramTokenFilter(generateSimpleName(i++)),
            new LengthTokenFilter(generateSimpleName(i++)),
            new LimitTokenFilter(generateSimpleName(i++)),
            new NGramTokenFilter(generateSimpleName(i++)),
            new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }),
            new PhoneticTokenFilter(generateSimpleName(i++)),
            new ShingleTokenFilter(generateSimpleName(i++)),
            new StopwordsTokenFilter(generateSimpleName(i++)),
            new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }),
            new TruncateTokenFilter(generateSimpleName(i++)),
            new WordDelimiterTokenFilter(generateSimpleName(i++))
        };

        i = 0;
        Index expectedIndexWithSpecialDefaults = CreateTestIndex();
        expectedIndexWithSpecialDefaults.Name = indexWithSpecialDefaults.Name;
        expectedIndexWithSpecialDefaults.Analyzers = new Analyzer[]
        {
            new PatternAnalyzer(generateSimpleName(i++), lowerCaseTerms: true, pattern: @"\W+"),
            new StandardAnalyzer(generateSimpleName(i++), maxTokenLength: 255)
        };
        expectedIndexWithSpecialDefaults.Tokenizers = new Tokenizer[]
        {
            new EdgeNGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
            new NGramTokenizer(generateSimpleName(i++), minGram: 1, maxGram: 2),
            new ClassicTokenizer(generateSimpleName(i++), maxTokenLength: 255),
            new KeywordTokenizer(generateSimpleName(i++), bufferSize: 256),
            new MicrosoftLanguageStemmingTokenizer(generateSimpleName(i++), maxTokenLength: 255),
            new MicrosoftLanguageTokenizer(generateSimpleName(i++), maxTokenLength: 255),
            new PathHierarchyTokenizer(generateSimpleName(i++), delimiter: '/', replacement: '/', bufferSize: 1024),
            new PatternTokenizer(generateSimpleName(i++), pattern: @"\W+", group: -1),
            new StandardTokenizer(generateSimpleName(i++), maxTokenLength: 255),
            new UaxUrlEmailTokenizer(generateSimpleName(i++), maxTokenLength: 255)
        };
        expectedIndexWithSpecialDefaults.TokenFilters = new TokenFilter[]
        {
            new DictionaryDecompounderTokenFilter(
                generateSimpleName(i++),
                wordList: new[] { "Bahnhof" },
                minWordSize: 5,
                minSubwordSize: 2,
                maxSubwordSize: 15),
            new EdgeNGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2, side: EdgeNGramTokenFilterSide.Front),
            new LengthTokenFilter(generateSimpleName(i++), max: int.MaxValue),
            new LimitTokenFilter(generateSimpleName(i++), maxTokenCount: 1),
            new NGramTokenFilter(generateSimpleName(i++), minGram: 1, maxGram: 2),
            new PatternCaptureTokenFilter(generateSimpleName(i++), patterns: new[] { "[a-z]*" }, preserveOriginal: true),
            new PhoneticTokenFilter(generateSimpleName(i++), encoder: PhoneticEncoder.Metaphone, replaceOriginalTokens: true),
            new ShingleTokenFilter(
                generateSimpleName(i++),
                maxShingleSize: 2,
                minShingleSize: 2,
                outputUnigrams: true,
                tokenSeparator: " ",
                filterToken: "_"),
            new StopwordsTokenFilter(generateSimpleName(i++), stopwordsList: StopwordsList.English, removeTrailingStopWords: true),
            new SynonymTokenFilter(generateSimpleName(i++), synonyms: new[] { "mutt, canine => dog" }, expand: true),
            new TruncateTokenFilter(generateSimpleName(i++), length: 300),
            new WordDelimiterTokenFilter(
                generateSimpleName(i++),
                generateWordParts: true,
                generateNumberParts: true,
                splitOnCaseChange: true,
                splitOnNumerics: true,
                stemEnglishPossessive: true)
        };

        // This is to make sure we didn't forget any components in this test.
        AssertIndexContainsAllAnalysisComponents(index, indexWithSpecialDefaults);

        TestAnalysisComponents(index);
        TestAnalysisComponents(indexWithSpecialDefaults, expectedIndexWithSpecialDefaults);
    });
}
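// TestAnalysisComponents is not defined in the code above. A plausible sketch
// of such a helper, assuming a serviceClient field and an AssertIndexesEqual
// helper (both hypothetical, not part of the original), is a create/get
// round trip against the service:
private void TestAnalysisComponents(Index index, Index expectedIndex = null)
{
    // With no explicit expectation, the index should round-trip unchanged;
    // otherwise the expected index carries the REST API's default values.
    expectedIndex = expectedIndex ?? index;

    serviceClient.Indexes.Create(index);
    Index actualIndex = serviceClient.Indexes.Get(index.Name);
    AssertIndexesEqual(expectedIndex, actualIndex);

    serviceClient.Indexes.Delete(index.Name);
}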