public void EdgeNGramTokenFilterRoundtrips(string odataType)
{
    string jsonContent = $@"{{
        ""@odata.type"": ""{odataType}"",
        ""name"": ""test"",
        ""minGram"": 0,
        ""maxGram"": 1,
        ""side"": ""front""
    }}";

    JsonDocument jsonDoc = JsonDocument.Parse(jsonContent);
    EdgeNGramTokenFilter sut = TokenFilter.DeserializeTokenFilter(jsonDoc.RootElement) as EdgeNGramTokenFilter;

    Assert.NotNull(sut);
    Assert.AreEqual(odataType, sut.ODataType);
    Assert.AreEqual("test", sut.Name);
    Assert.AreEqual(0, sut.MinGram);
    Assert.AreEqual(1, sut.MaxGram);
    Assert.AreEqual(EdgeNGramTokenFilterSide.Front, sut.Side);

    using MemoryStream stream = new MemoryStream();
    using (Utf8JsonWriter writer = new Utf8JsonWriter(stream))
    {
        ((IUtf8JsonSerializable)sut).Write(writer);
    }

    stream.Position = 0;
    jsonDoc = JsonDocument.Parse(stream);

    Assert.True(jsonDoc.RootElement.TryGetProperty("@odata.type", out JsonElement odataTypeElem));
    Assert.AreEqual(odataType, odataTypeElem.GetString());
}
/// <summary>
/// Tokenizes a field for use in an autocomplete search. Ref DOH-893.
/// Inspiration taken from:
/// https://github.com/Sitecore/autohaus/blob/master/Autohaus.Custom/Indexing/Analyzers/NGramAnalyzer.cs
/// http://stackoverflow.com/a/9183416
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // This should be a good tokenizer for most European-language documents:
    // it splits words at punctuation characters (removing the punctuation) and
    // at hyphens (unless there's a number in the token), and it recognizes
    // email addresses and internet hostnames as single tokens.
    TokenStream tokenStream = new StandardTokenizer(this.version, reader);

    // Apply a set of standard filters.
    tokenStream = new StandardFilter(tokenStream);
    tokenStream = new LowerCaseFilter(tokenStream);

    // Converts alphabetic, numeric, and symbolic Unicode characters outside
    // the first 127 ASCII characters (the "Basic Latin" Unicode block) into
    // their ASCII equivalents, where such equivalents exist.
    tokenStream = new ASCIIFoldingFilter(tokenStream);

    // Removes stop words from the token stream.
    tokenStream = new StopFilter(false, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    // Apply the EdgeNGramTokenFilter. This turns each token into a set of
    // prefixes, e.g. "South Melbourne" becomes "Sou Sout South Mel Melb ..."
    // (the exact set depends on minGram and maxGram).
    tokenStream = new EdgeNGramTokenFilter(tokenStream, Side.FRONT, this.minGram, this.maxGram);

    return tokenStream;
}
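For reference, a minimal sketch of exercising an analyzer built around the TokenStream method above; it assumes Lucene.Net 3.0.3 and a hypothetical NGramAnalyzer wrapper class, and the field name and input text are also placeholders:

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Tokenattributes;

    class AnalyzerDemo
    {
        static void Main()
        {
            // NGramAnalyzer is a hypothetical wrapper exposing the TokenStream
            // method above; construct it however the real class is configured.
            Analyzer analyzer = new NGramAnalyzer();
            TokenStream stream = analyzer.TokenStream("suburb", new StringReader("South Melbourne"));
            ITermAttribute term = stream.AddAttribute<ITermAttribute>();

            // Prints each emitted prefix gram, e.g. "sou", "sout", "south", ...
            while (stream.IncrementToken())
            {
                Console.WriteLine(term.Term);
            }
        }
    }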
public void ExecuteShouldReturnExpectedValue(Scenario scenario)
{
    var filter = new EdgeNGramTokenFilter();

    var result = filter.Execute(scenario.Input, scenario.AnalyzeContext);

    Assert.Equal(scenario.ExpectedResult, result);
}
public void TestSmallTokenInStream()
{
    input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 3, 3);

    AssertTokenStreamContents(tokenizer,
        new String[] { "abc", "fgh" },
        new int[] { 0, 7 },
        new int[] { 3, 10 });
}
public void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, Side.FRONT, 1, 3);

    AssertTokenStreamContents(filter,
        new String[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 1, 2, 3 });

    tokenizer.Reset(new StringReader("abcde"));

    AssertTokenStreamContents(filter,
        new String[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 1, 2, 3 });
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(MatchLuceneVersion, reader);

    TokenStream tokenStream = new StandardFilter(MatchLuceneVersion, tokenizer);
    tokenStream = new LowerCaseFilter(MatchLuceneVersion, tokenStream);
    tokenStream = new StopFilter(MatchLuceneVersion, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    tokenStream = new EdgeNGramTokenFilter(MatchLuceneVersion, tokenStream, MinGramSize, MaxGramSize);

    return new TokenStreamComponents(tokenizer, tokenStream);
}
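A minimal sketch of driving an analyzer like this through the Lucene.Net 4.8 token-stream contract; the AutocompleteAnalyzer wrapper name and the field/text values are assumptions:

    using System;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.TokenAttributes;

    class AnalyzerDemo
    {
        static void Main()
        {
            // AutocompleteAnalyzer is a hypothetical class whose CreateComponents
            // is the override shown above.
            Analyzer analyzer = new AutocompleteAnalyzer();
            using (TokenStream stream = analyzer.GetTokenStream("title", "stemming"))
            {
                ICharTermAttribute term = stream.AddAttribute<ICharTermAttribute>();
                stream.Reset();

                // Prints each edge n-gram of the lowercased, stop-filtered tokens,
                // e.g. "st", "ste", "stem", ... up to MaxGramSize.
                while (stream.IncrementToken())
                {
                    Console.WriteLine(term.ToString());
                }
                stream.End();
            }
        }
    }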
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(kLuceneVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));

    // DEFAULT_SIDE = Side.FRONT
    result = new EdgeNGramTokenFilter(
        result,
        Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE,
        1,
        20);

    return result;
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Apply the standard tokenizer to the input.
    var tokenizedInput = new StandardTokenizer(_version, reader);

    // Apply the standard, lowercase, and English stop-word filters, then
    // Snowball stemming with the English stemmer.
    var filteredInput = new SnowballFilter(
        new StopFilter(true,
            new LowerCaseFilter(new StandardFilter(tokenizedInput)),
            StopAnalyzer.ENGLISH_STOP_WORDS_SET),
        new EnglishStemmer());

    // Apply the EdgeNGram filter to the front of each word, using the
    // configured minimum and maximum gram sizes.
    var grammedInput = new EdgeNGramTokenFilter(filteredInput, Side.FRONT, _mingram, _maxgram);

    return grammedInput;
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    TokenStream result = new StandardFilter(m_matchVersion, source);

    // For stripping 's from words.
    result = new EnglishPossessiveFilter(m_matchVersion, result);

    // Converts é to e (and © to (c), etc.).
    result = new ASCIIFoldingFilter(result);

    result = new LowerCaseFilter(m_matchVersion, result);
    result = new StopFilter(m_matchVersion, result, EnglishAnalyzer.DefaultStopSet);

    // For chopping off common word suffixes, e.g. reducing "stemming" to "stem".
    result = new PorterStemFilter(result);

    // The edge n-gram filter emits front-anchored n-grams of each token,
    // between the configured minimum and maximum lengths.
    if (_userNGram)
    {
        result = new EdgeNGramTokenFilter(m_matchVersion, result, _ngramMin, _ngramMax);
    }

    return new TokenStreamComponents(source, result);
}
public virtual void testFrontUnigram()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
    assertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 });
}
public virtual void testFrontRangeOfNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    assertTokenStreamContents(tokenizer,
        new string[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 5, 5, 5 });
}
public void TestFrontRangeOfNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 1, 3);

    AssertTokenStreamContents(tokenizer,
        new String[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 1, 2, 3 });
}
public void TestBackRangeOfNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.BACK, 1, 3);

    AssertTokenStreamContents(tokenizer,
        new String[] { "e", "de", "cde" },
        new int[] { 4, 3, 2 },
        new int[] { 5, 5, 5 });
}
public virtual void testOversizedNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
    assertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]);
}
public void TestOversizedNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 6, 6);

    AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
}
public virtual void testBackUnigram()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
    assertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 });
}
public void CreatesEdgeNGramTokenFilterV2()
{
    EdgeNGramTokenFilter sut = new EdgeNGramTokenFilter("test");

    Assert.AreEqual(@"#Microsoft.Azure.Search.EdgeNGramTokenFilterV2", sut.ODataType);
}
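For context, a sketch of attaching this filter type to an index definition with the Azure.Search.Documents SDK; the index name, filter name, gram sizes, endpoint, and key below are all placeholder assumptions:

    using System;
    using Azure;
    using Azure.Search.Documents.Indexes;
    using Azure.Search.Documents.Indexes.Models;

    class IndexSetup
    {
        static void Main()
        {
            var index = new SearchIndex("demo-index");

            // Register a custom edge n-gram token filter on the index definition.
            index.TokenFilters.Add(new EdgeNGramTokenFilter("autocomplete_edge")
            {
                MinGram = 2,
                MaxGram = 20,
                Side = EdgeNGramTokenFilterSide.Front
            });

            var client = new SearchIndexClient(
                new Uri("https://example.search.windows.net"),
                new AzureKeyCredential("<admin-key>"));
            client.CreateOrUpdateIndex(index);
        }
    }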
private CreateIndexDescriptor GetCreateIndexDescriptor(string indexName, Type objectType)
{
    #region [ Default analyzers and filters ]

    // Add custom index analyzers
    CustomAnalyzers.Add("full_string_index_analyzer", new CustomAnalyzer
    {
        Tokenizer = "standard",
        Filter = new List<string> { "standard", "string_delimeter", "stop", "asciifolding", "string_ngrams", "lowercase" }
    });

    CustomAnalyzers.Add("full_keyword_index_analyzer", new CustomAnalyzer
    {
        Tokenizer = "keyword",
        Filter = new List<string> { "standard", "stop", "asciifolding" }
    });

    // Add custom search analyzers
    CustomAnalyzers.Add("full_string_search_analyzer", new CustomAnalyzer
    {
        Tokenizer = "standard",
        Filter = new List<string> { "standard", "stop", "asciifolding", "lowercase" }
    });

    #endregion

    // Create a default descriptor
    CreateIndexDescriptor descriptor = null;

    // Create default settings
    var settings = new IndexSettings()
    {
        NumberOfReplicas = 1,
        NumberOfShards = 2
    };

    // Add additional settings
    settings.Analysis = new Analysis();
    settings.Analysis.TokenFilters = new TokenFilters();
    settings.Analysis.Analyzers = new Analyzers();
    //settings.Add("index.mapping.single_type", false);
    settings.Add("index.mapping.total_fields.limit", 2000);
    settings.Add("index.mapping.nested_fields.limit", 500);
    settings.Add("index.max_docvalue_fields_search", 500);

    // Create token filters
    var stringNGramsTokenFilter = new EdgeNGramTokenFilter
    {
        MinGram = 2,
        MaxGram = 20
    };

    var stringDelimiterTokenFilter = new WordDelimiterTokenFilter
    {
        GenerateWordParts = true,
        CatenateAll = true,
        CatenateNumbers = true,
        CatenateWords = true,
        SplitOnCaseChange = true,
        SplitOnNumerics = true,
        PreserveOriginal = true
    };

    // Add filters (note: the "string_delimeter" key, misspelling and all,
    // must match the analyzer filter lists above)
    settings.Analysis.TokenFilters.Add("string_ngrams", stringNGramsTokenFilter);
    settings.Analysis.TokenFilters.Add("string_delimeter", stringDelimiterTokenFilter);

    // Add analyzers
    CustomAnalyzers.ToList().ForEach(a =>
    {
        settings.Analysis.Analyzers.Add(a.Key, a.Value);
    });

    // Create the config
    var indexConfig = new IndexState
    {
        Settings = settings
    };

    #region [ LogRecord Mapping ]

    // Fill the descriptor according to the type
    if (objectType == typeof(ESLogRecord))
    {
        descriptor = new CreateIndexDescriptor(indexName)
            .InitializeUsing(indexConfig)
            .Mappings(ms => ms.Map<ESLogRecord>(m => m.AutoMap()));
    }

    #endregion

    return descriptor;
}
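A short sketch of consuming the descriptor above, assuming NEST 6.x and that GetCreateIndexDescriptor is in scope; the node URI and index name are placeholder assumptions:

    // Hypothetical wiring; adjust the connection settings to your cluster.
    var connectionSettings = new ConnectionSettings(new Uri("http://localhost:9200"));
    var client = new ElasticClient(connectionSettings);

    var descriptor = GetCreateIndexDescriptor("log-records", typeof(ESLogRecord));
    var response = client.CreateIndex(descriptor);

    if (!response.IsValid)
    {
        throw new InvalidOperationException(response.DebugInformation);
    }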
public virtual void testBackRangeOfNgrams()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
    assertTokenStreamContents(tokenizer,
        new string[] { "e", "de", "cde" },
        new int[] { 4, 3, 2 },
        new int[] { 5, 5, 5 },
        null, null, null, null, false);
}
public virtual void testSupplementaryCharacters()
{
    string s = TestUtil.randomUnicodeString(random(), 10);
    // The original Java used String.codePointCount / Character.offsetByCodePoints;
    // this assumes an equivalent Character helper (e.g. from J2N) is in scope.
    int codePointCount = Character.CodePointCount(s, 0, s.Length);
    int minGram = TestUtil.Next(random(), 1, 3);
    int maxGram = TestUtil.Next(random(), minGram, 10);

    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);

    CharTermAttribute termAtt = tk.addAttribute(typeof(CharTermAttribute));
    OffsetAttribute offsetAtt = tk.addAttribute(typeof(OffsetAttribute));

    tk.reset();
    for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
    {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.Length, offsetAtt.endOffset());

        int end = Character.OffsetByCodePoints(s, 0, i); // same Character helper assumption
        assertEquals(s.Substring(0, end), termAtt.ToString());
    }
    assertFalse(tk.incrementToken());
}
public virtual void testSmallTokenInStream()
{
    input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
    assertTokenStreamContents(tokenizer,
        new string[] { "abc", "fgh" },
        new int[] { 0, 7 },
        new int[] { 3, 10 });
}
public virtual void testReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    assertTokenStreamContents(filter,
        new string[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 5, 5, 5 });

    tokenizer.Reader = new StringReader("abcde");
    assertTokenStreamContents(filter,
        new string[] { "a", "ab", "abc" },
        new int[] { 0, 0, 0 },
        new int[] { 5, 5, 5 });
}
public void TestFrontUnigram()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 1, 1);

    AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 });
}
public virtual void testFilterPositions()
{
    TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    assertTokenStreamContents(tokenizer,
        new string[] { "a", "ab", "abc", "v", "vw", "vwx" },
        new int[] { 0, 0, 0, 6, 6, 6 },
        new int[] { 5, 5, 5, 11, 11, 11 },
        null,
        new int[] { 1, 0, 0, 1, 0, 0 },
        null, null, false);
}
public virtual void testFirstTokenPositionIncrement()
{
    TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
    ts = new PositionFilter(ts); // All but the first token will get a 0 position increment
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);

    // The first token "a" will not be output, since it's smaller than the minGram size of 2.
    // The second token on input to EdgeNGramTokenFilter will have a position increment of 0,
    // which should be increased to 1, since this is the first output token in the stream.
    assertTokenStreamContents(filter,
        new string[] { "ab", "abc" },
        new int[] { 2, 2 },
        new int[] { 5, 5 },
        new int[] { 1, 0 });
}
public void TestBackUnigram()
{
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.BACK, 1, 1);

    AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 });
}
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
    filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
    return new TokenStreamComponents(tokenizer, filters);
}
public virtual void testGraphs()
{
    TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
    assertTokenStreamContents(tk,
        new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[] { 6, 11, 11, 14 },
        new int[] { 13, 19, 19, 21 },
        new int[] { 3, 1, 0, 1 },
        new int[] { 2, 2, 2, 2 },
        23);
}