public void EdgeNGramTokenFilterRoundtrips(string odataType)
        {
            string jsonContent = $@"{{
    ""@odata.type"": ""{odataType}"",
    ""name"": ""test"",
    ""minGram"": 0,
    ""maxGram"": 1,
    ""side"": ""front""
}}";

            JsonDocument         jsonDoc = JsonDocument.Parse(jsonContent);
            EdgeNGramTokenFilter sut     = TokenFilter.DeserializeTokenFilter(jsonDoc.RootElement) as EdgeNGramTokenFilter;

            Assert.NotNull(sut);
            Assert.AreEqual(odataType, sut.ODataType);
            Assert.AreEqual("test", sut.Name);
            Assert.AreEqual(0, sut.MinGram);
            Assert.AreEqual(1, sut.MaxGram);
            Assert.AreEqual(EdgeNGramTokenFilterSide.Front, sut.Side);

            using MemoryStream stream = new MemoryStream();
            using (Utf8JsonWriter writer = new Utf8JsonWriter(stream))
            {
                ((IUtf8JsonSerializable)sut).Write(writer);
            }

            stream.Position = 0;

            jsonDoc = JsonDocument.Parse(stream);
            Assert.True(jsonDoc.RootElement.TryGetProperty("@odata.type", out JsonElement odataTypeElem));
            Assert.AreEqual(odataType, odataTypeElem.GetString());
        }
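
The method above takes the OData type as a parameter, so it is presumably driven by a parameterized test attribute covering both type names the service recognizes for this filter; a minimal NUnit sketch (the V2 name is confirmed by the CreatesEdgeNGramTokenFilterV2 test further down, the V1 name is an assumption):

        // Hypothetical test-case attributes for the roundtrip method above
        [TestCase("#Microsoft.Azure.Search.EdgeNGramTokenFilter")]
        [TestCase("#Microsoft.Azure.Search.EdgeNGramTokenFilterV2")]
        public void EdgeNGramTokenFilterRoundtrips(string odataType) { /* body as above */ }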
        /// <summary>
        /// Tokenizes a field for use in an autocomplete search. Ref DOH-893.
        /// Inspiration taken from:
        /// https://github.com/Sitecore/autohaus/blob/master/Autohaus.Custom/Indexing/Analyzers/NGramAnalyzer.cs
        /// http://stackoverflow.com/a/9183416
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // This should be a good tokenizer for most European-language documents:
            // Splits words at punctuation characters, removing punctuation.
            // Splits words at hyphens, unless there's a number in the token, in which
            // case the whole token is interpreted as a product number and is not split.
            // Recognizes email addresses and internet hostnames as one token.
            TokenStream tokenStream = new StandardTokenizer(this.version, reader);

            // apply a set of standard filters
            tokenStream = new StandardFilter(tokenStream);
            tokenStream = new LowerCaseFilter(tokenStream);

            // This class converts alphabetic, numeric, and symbolic Unicode characters
            // which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
            // block) into their ASCII equivalents, if one exists.
            tokenStream = new ASCIIFoldingFilter(tokenStream);
            tokenStream = new StopFilter(false, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            // apply the EdgeNGramTokenFilter
            // this turns each token into a set of prefixes, e.g.
            // "South Melbourne" will be turned into "Sou South Mel Melb Melb ..."
            tokenStream = new EdgeNGramTokenFilter(tokenStream, Side.FRONT, this.minGram, this.maxGram);

            return tokenStream;
        }
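
To sanity-check what this analyzer emits, a short consumption loop can print each gram. This is a sketch against the Lucene.Net 3.0.3 API (which the version-less StandardFilter constructor above suggests); `analyzer` is a hypothetical instance of the class this method belongs to:

        TokenStream stream = analyzer.TokenStream("content", new StringReader("South Melbourne"));
        ITermAttribute termAttr = stream.AddAttribute<ITermAttribute>();

        // With minGram = 3 this prints "sou", "sout", "south", "mel", "melb", ...
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAttr.Term);
        }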
        public void ExecuteShouldReturnExpectedValue(Scenario scenario)
        {
            var filter = new EdgeNGramTokenFilter();
            var result = filter.Execute(scenario.Input, scenario.AnalyzeContext);

            Assert.Equal(scenario.ExpectedResult, result);
        }
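
The Scenario type is not part of the snippet; a hypothetical minimal shape that fits the usage above (property types are guesses from the call site):

        // Hypothetical test-data carrier; names mirror the usage above
        public class Scenario
        {
            public string Input { get; set; }
            public AnalyzeContext AnalyzeContext { get; set; }   // type as referenced above
            public string ExpectedResult { get; set; }
        }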
Example n. 4
        public void TestSmallTokenInStream()
        {
            input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 3, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
        }
Example n. 5
        public void TestReset()
        {
            WhitespaceTokenizer  tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
            EdgeNGramTokenFilter filter    = new EdgeNGramTokenFilter(tokenizer, Side.FRONT, 1, 3);

            AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
            tokenizer.Reset(new StringReader("abcde"));
            AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
        }
Example n. 6
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var         tokenizer   = new StandardTokenizer(MatchLuceneVersion, reader);
            TokenStream tokenStream = new StandardFilter(MatchLuceneVersion, tokenizer);

            tokenStream = new LowerCaseFilter(MatchLuceneVersion, tokenStream);
            tokenStream = new StopFilter(MatchLuceneVersion, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            tokenStream = new EdgeNGramTokenFilter(MatchLuceneVersion, tokenStream, MinGramSize, MaxGramSize);

            return new TokenStreamComponents(tokenizer, tokenStream);
        }
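
Consuming an analyzer built from these components follows the usual Lucene.Net 4.8 Reset/IncrementToken/End contract; a sketch, with `analyzer` a hypothetical instance of the class that declares CreateComponents:

        using (TokenStream ts = analyzer.GetTokenStream("field", "standard analysis"))
        {
            ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(term.ToString());
            }
            ts.End();
        }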
Example n. 7
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ASCIIFoldingFilter(result);
                result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
                //DEFAULT_SIDE = Side.FRONT
                result = new EdgeNGramTokenFilter(
                    result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);

                return result;
            }
Example n. 8
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            //Apply standard tokenizer to input
            var tokenizedInput = new StandardTokenizer(_version, reader);

            //Apply standard, lowercase and English stop words filters to input
            var filteredInput = new SnowballFilter(new StopFilter(true, new LowerCaseFilter(new StandardFilter(tokenizedInput)),
                                                                  StopAnalyzer.ENGLISH_STOP_WORDS_SET), new EnglishStemmer());

            //Apply EdgeNGram filter to the front of each token,
            //emitting grams between _mingram and _maxgram characters long
            var grammedInput = new EdgeNGramTokenFilter(filteredInput, Side.FRONT, _mingram, _maxgram);

            return grammedInput;
        }
Example n. 9
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

            // for stripping 's from words
            result = new EnglishPossessiveFilter(m_matchVersion, result);
            // converts é to e (and © to (c), etc.)
            result = new ASCIIFoldingFilter(result);
            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, EnglishAnalyzer.DefaultStopSet);
            // for chopping off common word suffixes, like removing ming from stemming, etc.
            result = new PorterStemFilter(result);

            // The edge n-gram filter emits front-anchored prefixes of each token,
            // with gram lengths between _ngramMin and _ngramMax.
            if (_userNGram)
            {
                result = new EdgeNGramTokenFilter(m_matchVersion, result, _ngramMin, _ngramMax);
            }

            return new TokenStreamComponents(source, result);
        }
 public virtual void testFrontUnigram()
 {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
     assertTokenStreamContents(tokenizer, new string[]{"a"}, new int[]{0}, new int[]{5});
 }
 public virtual void testFrontRangeOfNgrams()
 {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     assertTokenStreamContents(tokenizer, new string[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
 }
Example n. 12
        public void TestFrontRangeOfNgrams()
        {
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 1, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
        }
Example n. 13
        public void TestBackRangeOfNgrams()
        {
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.BACK, 1, 3);

            AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 });
        }
 public virtual void testOversizedNgrams()
 {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
     assertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]);
 }
Example n. 15
        public void TestOversizedNgrams()
        {
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 6, 6);

            AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
        }
 public virtual void testBackUnigram()
 {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
     assertTokenStreamContents(tokenizer, new string[]{"e"}, new int[]{4}, new int[]{5});
 }
        public void CreatesEdgeNGramTokenFilterV2()
        {
            EdgeNGramTokenFilter sut = new EdgeNGramTokenFilter("test");

            Assert.AreEqual(@"#Microsoft.Azure.Search.EdgeNGramTokenFilterV2", sut.ODataType);
        }
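
For context, a sketch of registering such a filter on an Azure.Search.Documents index definition (index name, filter name, and gram sizes are made-up values):

        var index = new SearchIndex("products-autocomplete");
        index.TokenFilters.Add(new EdgeNGramTokenFilter("product_edge_ngrams")
        {
            MinGram = 2,
            MaxGram = 20,
            Side = EdgeNGramTokenFilterSide.Front
        });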
Example n. 18
        private CreateIndexDescriptor GetCreateIndexDescriptor(string indexName, Type objectType)
        {
            #region [ Default analyzers and filters ]

            // Add custom index analyzers
            CustomAnalyzers.Add("full_string_index_analyzer", new CustomAnalyzer {
                Tokenizer = "standard", Filter = new List <string> {
                    "standard", "string_delimeter", "stop", "asciifolding", "string_ngrams", "lowercase"
                }
            });
            CustomAnalyzers.Add("full_keyword_index_analyzer", new CustomAnalyzer {
                Tokenizer = "keyword", Filter = new List <string> {
                    "standard", "stop", "asciifolding"
                }
            });

            // Add custom search analyzers
            CustomAnalyzers.Add("full_string_search_analyzer", new CustomAnalyzer {
                Tokenizer = "standard", Filter = new List <string> {
                    "standard", "stop", "asciifolding", "lowercase"
                }
            });

            #endregion

            // Create a default descriptor
            CreateIndexDescriptor descriptor = null;

            // Create default settings
            var settings = new IndexSettings()
            {
                NumberOfReplicas = 1, NumberOfShards = 2
            };

            // Add additional settings
            settings.Analysis = new Analysis();
            settings.Analysis.TokenFilters = new TokenFilters();
            settings.Analysis.Analyzers    = new Analyzers();
            //settings.Add("index.mapping.single_type", false);
            settings.Add("index.mapping.total_fields.limit", 2000);
            settings.Add("index.mapping.nested_fields.limit", 500);
            settings.Add("index.max_docvalue_fields_search", 500);

            // Create token filters
            var stringNGramsTokenFilter = new EdgeNGramTokenFilter {
                MinGram = 2, MaxGram = 20
            };
            var stringDelimiterTokenFilter = new WordDelimiterTokenFilter {
                GenerateWordParts = true,
                CatenateAll = true,
                CatenateNumbers = true,
                CatenateWords = true,
                SplitOnCaseChange = true,
                SplitOnNumerics = true,
                PreserveOriginal = true
            };

            // Add filters
            settings.Analysis.TokenFilters.Add("string_ngrams", stringNGramsTokenFilter);
            settings.Analysis.TokenFilters.Add("string_delimeter", stringDelimiterTokenFilter);

            // Add analyzers
            CustomAnalyzers.ToList().ForEach(a =>
            {
                settings.Analysis.Analyzers.Add(a.Key, a.Value);
            });

            // Create the config
            var indexConfig = new IndexState
            {
                Settings = settings
            };

            #region [ LogRecord Mapping ]

            // Fill the descriptor according to the type
            if (objectType == typeof(ESLogRecord))
            {
                descriptor = new CreateIndexDescriptor(indexName)
                             .InitializeUsing(indexConfig)
                             .Mappings(ms => ms.Map<ESLogRecord>(m => m.AutoMap()));
            }

            #endregion

            return descriptor;
        }
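
Creating the index from the returned descriptor is then a single call on the client; a sketch assuming NEST 6.x and an already-configured ElasticClient named `client`:

        var descriptor = GetCreateIndexDescriptor("logs", typeof(ESLogRecord));
        var response = client.CreateIndex(descriptor);

        if (!response.IsValid)
        {
            throw new InvalidOperationException(response.DebugInformation);
        }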
 public virtual void testBackRangeOfNgrams()
 {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(Version.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
     assertTokenStreamContents(tokenizer, new string[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, null, false);
 }
 public virtual void testSupplementaryCharacters()
 {
     string s = TestUtil.RandomUnicodeString(Random(), 10);
     // Code-point helpers come from the Lucene.NET support library (J2N);
     // string.Length alone would count UTF-16 code units, not code points.
     int codePointCount = s.CodePointCount(0, s.Length);
     int minGram = TestUtil.NextInt(Random(), 1, 3);
     int maxGram = TestUtil.NextInt(Random(), minGram, 10);
     TokenStream tk = new KeywordTokenizer(new StringReader(s));
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
     ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
     IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
     tk.Reset();
     for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
     {
         assertTrue(tk.IncrementToken());
         assertEquals(0, offsetAtt.StartOffset);
         assertEquals(s.Length, offsetAtt.EndOffset);
         int end = Character.OffsetByCodePoints(s, 0, i);
         assertEquals(s.Substring(0, end), termAtt.ToString());
     }
     assertFalse(tk.IncrementToken());
 }
 public virtual void testSmallTokenInStream()
 {
     input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
     assertTokenStreamContents(tokenizer, new string[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
 }
 public virtual void testReset()
 {
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
     EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     assertTokenStreamContents(filter, new string[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
     tokenizer.SetReader(new StringReader("abcde"));
     assertTokenStreamContents(filter, new string[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
 }
Example n. 23
        public void TestFrontUnigram()
        {
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.FRONT, 1, 1);

            AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 });
        }
 public virtual void testFilterPositions()
 {
     TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     assertTokenStreamContents(tokenizer, new string[]{"a","ab","abc","v","vw","vwx"}, new int[]{0,0,0,6,6,6}, new int[]{5,5,5,11,11,11}, null, new int[]{1,0,0,1,0,0}, null, null, false);
 }
 public virtual void testFirstTokenPositionIncrement()
 {
     TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
     ts = new PositionFilter(ts); // All but first token will get 0 position increment
     EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
     // The first token "a" will not be output, since it's smaller than the mingram size of 2.
     // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
     // which should be increased to 1, since this is the first output token in the stream.
     assertTokenStreamContents(filter, new string[] {"ab", "abc"}, new int[] {2, 2}, new int[] {5, 5}, new int[] {1, 0});
 }
Example n. 26
        public void TestBackUnigram()
        {
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, Side.BACK, 1, 1);

            AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 });
        }
 protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
     filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
     return new TokenStreamComponents(tokenizer, filters);
 }
 public virtual void testGraphs()
 {
     TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
     assertTokenStreamContents(tk, new string[] {"efgh ij", "ij klmn", "ij klmno", "klmno p"}, new int[] {6,11,11,14}, new int[] {13,19,19,21}, new int[] {3,1,0,1}, new int[] {2,2,2,2}, 23);
 }