public virtual void TestReset()
{
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });

    wsTokenizer.Reader = new StringReader("please divide this sentence");
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
public void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
    NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
    AssertTokenStreamContents(filter, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 });
    tokenizer.Reset(new StringReader("abcde"));
    AssertTokenStreamContents(filter, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 });
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream t = new WhitespaceTokenizer(reader);
    t = new LowerCaseFilter(t);
    return t;
}
static void assertAlgorithm(IStringEncoder encoder, bool inject, String input, String[] expected)
{
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
    AssertTokenStreamContents(filter, expected);
}
public virtual void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
    AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
    tokenizer.Reader = new StringReader("abcde");
    AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
}
public virtual void testReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    assertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
    tokenizer.Reader = new StringReader("abcde");
    assertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
}
public void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.Front, 1, 3);
    AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
    tokenizer.Reset(new StringReader("abcde"));
    AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
}
/// <summary>
/// Override of the token stream method, applies these filters in order:
///
/// Whitespace splitter
/// ASCII folding (e.g. é becomes e)
/// Lowercasing
/// Stopword removal
/// Porter stemming (reduces words to a common stem)
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var tokenizer = new WhitespaceTokenizer(LeoLuceneVersion.Version, reader);
    TokenStream filter = new ASCIIFoldingFilter(tokenizer);
    filter = new LowerCaseFilter(LeoLuceneVersion.Version, filter);
    filter = new StopFilter(LeoLuceneVersion.Version, filter, _words);
    filter = new PorterStemFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
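// For reference, a minimal driver for an analyzer built from the chain above.
// This is a sketch, not part of the original source: it assumes Lucene.Net 4.8-style
// APIs and a hypothetical Analyzer subclass (here called "LeoAnalyzer") that hosts
// the CreateComponents override shown above.
using (var analyzer = new LeoAnalyzer())
using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("Résumés are STEMMED words")))
{
    var term = ts.AddAttribute<ICharTermAttribute>();
    ts.Reset(); // must be called before the first IncrementToken()
    while (ts.IncrementToken())
    {
        // emits folded, lower-cased, stop-filtered, stemmed terms
        Console.WriteLine(term.ToString());
    }
    ts.End();
}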
public virtual void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
#pragma warning disable 612, 618
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
#pragma warning restore 612, 618
    AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
    tokenizer.SetReader(new StringReader("abcde"));
    AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
}
public void ShouldTokenize()
{
    var input = " hello pretty \r\n world ";
    var tokenizer = new WhitespaceTokenizer();
    using (var streamReader = input.AsStreamReader())
    {
        var tokens = tokenizer.Tokenize(streamReader);
        CollectionAssert.AreEquivalent(new[] { "hello", "pretty", "world" }, tokens);
    }
}
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
{
    var tokenizer = (Tokenizer)PreviousTokenStream;
    if (tokenizer == null)
    {
        tokenizer = new WhitespaceTokenizer(reader);
        PreviousTokenStream = tokenizer;
    }
    else
    {
        tokenizer.Reset(reader);
    }
    return tokenizer;
}
public void TestNonConvertableStringsWithoutInject()
{
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    AssertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });

    // with an encodable token after the non-convertible ones, the filter should emit its phonetic code
    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
    filter = new DoubleMetaphoneFilter(stream, 8, false);
    AssertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}
// Make sure old style next() calls result in a new copy of payloads
public virtual void testPayloadCopy()
{
    string s = "how now brown cow";
    TokenStream ts;
    ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);

    ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);
}
public void ShouldTokenizeLongText()
{
    var word1 = Utils.RandomWord(1000, 1000);
    var word2 = Utils.RandomWord(1000, 1000);
    var word3 = Utils.RandomWord(1000, 1000);
    var input = $" \r\n {word1} {word2} \r\n {word3} ";
    var tokenizer = new WhitespaceTokenizer();
    using (var streamReader = input.AsStreamReader())
    {
        var tokens = tokenizer.Tokenize(streamReader);
        CollectionAssert.AreEquivalent(new[] { word1, word2, word3 }, tokens);
    }
}
public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
{
    var tokenizer = (Tokenizer)PreviousTokenStream;
    if (tokenizer == null)
    {
        tokenizer = new WhitespaceTokenizer(reader);
        PreviousTokenStream = tokenizer;
    }
    else
    {
        tokenizer.Reset(reader);
    }
    return tokenizer;
}
public virtual void TestRetainMockAttribute()
{
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
    stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    IMockRetainAttribute retAtt = stream.AddAttribute<IMockRetainAttribute>();
    stream.Reset();
    while (stream.IncrementToken())
    {
        assertTrue("Custom attribute value was lost", retAtt.Retain);
    }
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    //TokenStream result = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new WhitespaceTokenizer(@reader);
    result = new StandardFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new LowerCaseFilter(result);
    // Title search must index everything, so no stopwords
    //result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopTable);
    result = new PorterStemFilter(result);
    // Handles building words by cutting off symbols
    result = new SymbolsFilter(result);
    return result;
}
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
{
    Tokenizer tokenizer = (Tokenizer)GetPreviousTokenStream();
    if (tokenizer == null)
    {
        tokenizer = new WhitespaceTokenizer(reader);
        SetPreviousTokenStream(tokenizer);
    }
    else
    {
        tokenizer.Reset(reader);
    }
    return tokenizer;
}
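// The three ReusableTokenStream overrides above all implement the same legacy
// (pre-4.0 Lucene.Net) reuse pattern: the first call caches the tokenizer in the
// analyzer's per-thread slot, and every later call re-points the cached instance
// at a new reader instead of allocating a new stream. A minimal caller sketch,
// assuming that legacy API and a hypothetical Analyzer subclass
// ("ReusingWhitespaceAnalyzer") hosting one of the overrides above:
var analyzer = new ReusingWhitespaceAnalyzer();
foreach (var text in new[] { "first document", "second document" })
{
    // the same Tokenizer instance is returned each time; only its reader changes
    TokenStream ts = analyzer.ReusableTokenStream("body", new StringReader(text));
    var term = ts.GetAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        Console.WriteLine(term.Term);
    }
}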
public virtual void TestRandomRealisticWhiteSpace()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random);
        char[] charArray = randomRealisticUnicodeString.ToCharArray();
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < charArray.Length;)
        {
            int cp = Character.CodePointAt(charArray, j, charArray.Length);
            if (!char.IsWhiteSpace((char)cp))
            {
                sb.AppendCodePoint(cp);
            }
            j += Character.CharCount(cp);
        }
        if (sb.Length > 0)
        {
            string value = TestUtil.RandomSimpleString(Random);
            map[sb.ToString()] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        map["booked"] = "books";
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
    IDictionary<string, string> entrySet = map;
    StringBuilder input = new StringBuilder();
    IList<string> output = new List<string>();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        builder.Add(entry.Key, entry.Value);
        if (Random.nextBoolean() || output.Count == 0)
        {
            input.Append(entry.Key).Append(" ");
            output.Add(entry.Value);
        }
    }
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, output.ToArray());
}
/// <summary>
/// Test that LowercaseFilter handles the lowercasing correctly if the term
/// buffer has a trailing surrogate character leftover and the current term in
/// the buffer ends with a corresponding leading surrogate.
/// </summary>
public virtual void testLowerCaseFilterLowSurrogateLeftover()
{
    // test if the limit of the termbuffer is correctly used with supplementary
    // chars
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
    assertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
    filter.reset();
    string highSurEndingUpper = "BogustermBoguster\ud801";
    string highSurEndingLower = "bogustermboguster\ud801";
    tokenizer.Reader = new StringReader(highSurEndingUpper);
    assertTokenStreamContents(filter, new string[] { highSurEndingLower });
    assertTrue(filter.hasAttribute(typeof(CharTermAttribute)));
    char[] termBuffer = filter.getAttribute(typeof(CharTermAttribute)).buffer();
    int length = highSurEndingLower.Length;
    assertEquals('\ud801', termBuffer[length - 1]);
}
public void TestFilterWithMark()
{
    TokenStream stream = new WhitespaceTokenizer(new StringReader("Do have a nice day")); // words of length 1-4
    ReverseStringFilter filter = new ReverseStringFilter(stream, '\u0001');
    ITermAttribute text = filter.GetAttribute<ITermAttribute>();
    Assert.True(filter.IncrementToken());
    Assert.AreEqual("\u0001oD", text.Term);
    Assert.True(filter.IncrementToken());
    Assert.AreEqual("\u0001evah", text.Term);
    Assert.True(filter.IncrementToken());
    Assert.AreEqual("\u0001a", text.Term);
    Assert.True(filter.IncrementToken());
    Assert.AreEqual("\u0001ecin", text.Term);
    Assert.True(filter.IncrementToken());
    Assert.AreEqual("\u0001yad", text.Term);
    Assert.False(filter.IncrementToken());
}
public void TestIterator()
{
    var wst = new WhitespaceTokenizer(new StringReader("one two three four five"));
    var smf = new ShingleMatrixFilter(wst, 2, 2, '_', false, new OneDimensionalNonWeightedTokenSettingsCodec());

    int i;
    for (i = 0; smf.IncrementToken(); i++)
    {
    }
    Assert.AreEqual(4, i);

    // call next once more. this should return false again rather than throwing an exception (LUCENE-1939)
    Assert.IsFalse(smf.IncrementToken());
}
// the ordering of these filters is important!
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    if (string.Equals("MilitaryIDNumber", fieldName))
    {
        TokenStream result = new WhitespaceTokenizer(reader);
        result = new LowerCaseFilter(result);
        result = new ASCIIFoldingFilter(result);
        result = new AlphaNumericFilter(result); // behaves weirdly when used on Name field

        // during indexing, we will encounter some of the following extraneous text we don't care about.
        string[] stopWords = new string[] { "", "formerly", "or", "former", "pir", "tbc", "id", "pnc" };
        return new StopFilter(false, result, new CharArraySet(stopWords, true), true);
    }
    else
    {
        TokenStream result = new AlphaNumericTokenizer(reader);
        result = new LowerCaseFilter(result);
        return new ASCIIFoldingFilter(result);
    }
}
public void TestReset()
{
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter,
        new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new[] { 1, 0, 1, 0, 1, 0, 1 });

    wsTokenizer.Reset(new StringReader("please divide this sentence"));
    AssertTokenStreamContents(filter,
        new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new[] { 1, 0, 1, 0, 1, 0, 1 });
}
public virtual void TestQueryReset()
{
    const string input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
    ICharTermAttribute term = wt.AddAttribute<ICharTermAttribute>();
    nsf.Reset();
    assertTrue(nsf.IncrementToken());
    assertEquals("How_the", term.ToString());
    assertTrue(nsf.IncrementToken());
    assertEquals("the_s", term.ToString());
    nsf.Dispose();

    wt.SetReader(new StringReader(input));
    nsf.Reset();
    assertTrue(nsf.IncrementToken());
    assertEquals("How_the", term.ToString());
}
public virtual void TestReset()
{
    CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Rindfleischüberwachungsgesetz"));
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, wsTokenizer, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    ICharTermAttribute termAtt = tf.GetAttribute<ICharTermAttribute>();
    tf.Reset();
    assertTrue(tf.IncrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
    assertTrue(tf.IncrementToken());
    assertEquals("Rind", termAtt.ToString());
    tf.End();
    tf.Dispose();

    wsTokenizer.SetReader(new StringReader("Rindfleischüberwachungsgesetz"));
    tf.Reset();
    assertTrue(tf.IncrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
}
public static ArrayList defaultSplit(String inputString)
{
    StringReader reader = new StringReader(inputString);
    Tokenizer whiteSpaceTokenizer = new WhitespaceTokenizer(reader);
    TokenStream tokenStream = new LengthFilter(whiteSpaceTokenizer, 2, int.MaxValue);
    var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
    tokenStream.Reset();
    ArrayList tokenizedString = new ArrayList();
    while (tokenStream.IncrementToken())
    {
        tokenizedString.Add(termAttribute.Term);
    }
    tokenStream.End();
    tokenStream.Dispose();
    return tokenizedString;
}
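// Usage note for the helper above: LengthFilter(whiteSpaceTokenizer, 2, int.MaxValue)
// keeps only tokens of two or more characters, so single-letter words are dropped.
ArrayList tokens = defaultSplit("a quick b fox");
// tokens now contains "quick" and "fox"; "a" and "b" were filtered out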
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_CURRENT, reader);
    return new TokenStreamComponents(tokenizer, new ChineseFilter(tokenizer));
}
public virtual void testWhitespaceTokenizer()
{
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    assertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
}
public IList<string> TestWhitespaceTokenizerReturnDelimiterTrue(string text)
{
    var tokenizer = new WhitespaceTokenizer(true);
    return tokenizer.Tokenize(text);
}
/// <summary>
/// Creates the cache type mapping and analyzer.
/// </summary>
/// <param name="documentType">Type of the document.</param>
/// <param name="deleteIfExists">if set to <c>true</c> [delete if exists].</param>
public override void CreateIndex(Type documentType, bool deleteIfExists = true)
{
    try
    {
        var indexName = documentType.Name.ToLower();
        object instance = Activator.CreateInstance(documentType);

        // Check if index already exists. If it exists, no need to create it again
        if (_indexes.ContainsKey(indexName))
        {
            return;
        }

        Index index = new Index();

        // make sure this is an index document
        if (instance is IndexModelBase)
        {
            Dictionary<string, TypeMappingProperties> typeMapping = new Dictionary<string, TypeMappingProperties>();
            Dictionary<string, Analyzer> fieldAnalyzers = new Dictionary<string, Analyzer>();

            // get properties from the model and add them to the index (hint: attributes will be added dynamically as the documents are loaded)
            var modelProperties = documentType.GetProperties();
            foreach (var property in modelProperties)
            {
                var indexAttribute = property.GetCustomAttributes(typeof(RockIndexField), false);
                if (indexAttribute.Length > 0)
                {
                    var attribute = (RockIndexField)indexAttribute[0];
                    var propertyName = property.Name;

                    // rewrite non-string index option (would be nice if they made the enums match up...)
                    if (attribute.Type != IndexFieldType.String)
                    {
                        if (attribute.Index == IndexType.NotIndexed)
                        {
                            continue;
                        }
                    }

                    var typeMappingProperty = new TypeMappingProperties();
                    typeMappingProperty.Name = propertyName;
                    typeMappingProperty.Boost = (float)attribute.Boost;

                    switch (attribute.Type)
                    {
                        case IndexFieldType.Boolean:
                        case IndexFieldType.Date:
                        case IndexFieldType.Number:
                        {
                            typeMappingProperty.IndexType = IndexType.NotAnalyzed;
                            typeMappingProperty.Analyzer = string.Empty;
                            break;
                        }
                        default:
                        {
                            typeMappingProperty.IndexType = attribute.Index;
                            if (!string.IsNullOrWhiteSpace(attribute.Analyzer))
                            {
                                typeMappingProperty.Analyzer = attribute.Analyzer;
                            }
                            break;
                        }
                    }

                    typeMapping.Add(propertyName, typeMappingProperty);

                    if (typeMappingProperty.Analyzer?.ToLowerInvariant() == "snowball")
                    {
                        fieldAnalyzers[typeMappingProperty.Name] = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                        {
                            var tokenizer = new StandardTokenizer(_matchVersion, reader);
                            var sbpff = new SnowballPorterFilterFactory(new Dictionary<string, string>() { { "language", "English" } });
                            sbpff.Inform(new ClasspathResourceLoader(documentType));
                            // wrap the same tokenizer that is handed to TokenStreamComponents
                            // (not a second StandardTokenizer over the same reader), so the
                            // components' SetReader keeps feeding the returned stream
                            TokenStream result = sbpff.Create(tokenizer);
                            return new TokenStreamComponents(tokenizer, result);
                            // https://github.com/apache/lucenenet/blob/master/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballAnalyzer.cs
                        });
                    }
                    else if (typeMappingProperty.Analyzer?.ToLowerInvariant() == "whitespace")
                    {
                        fieldAnalyzers[propertyName] = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                        {
                            var tokenizer = new WhitespaceTokenizer(_matchVersion, reader);
                            TokenStream result = new StandardFilter(_matchVersion, tokenizer);
                            return new TokenStreamComponents(tokenizer, result);
                        });
                    }
                }
            }

            index.MappingProperties = typeMapping;
            index.FieldAnalyzers = fieldAnalyzers;
            _indexes[indexName] = index;
        }
    }
    catch (Exception ex)
    {
        HttpContext context2 = HttpContext.Current;
        ExceptionLogService.LogException(ex, context2);
    }
}
public virtual void testWhitespaceTokenizerBWCompat()
{
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30, reader);
    assertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
}
public override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT, tokenizer));
}
public async Task AfterSubscribeMultiple_FindFiles_ReturnResult_ChangeFiles_ReturnUpdated(int inits, int changes, int searches)
{
    var rng = new Random();
    var tokenizer = new WhitespaceTokenizer();
    var analyzer = new FileAnalyzer(tokenizer);

    Dictionary<string, List<string>> InitFolder(TempFolder folder, int count)
    {
        var folderContent = new Dictionary<string, List<string>>();
        for (var i = 0; i < count; i++)
        {
            var guids = new List<string>(Generators.Generate(100, () => Guid.NewGuid().ToString()));
            var text = string.Join(' ', guids);
            var name = folder.CreateFile(() => text);
            folderContent[name] = guids;
        }
        return folderContent;
    }

    using var tmp = new TempFolder(0);
    var content = InitFolder(tmp, inits);
    await analyzer.Subscribe(tmp.FolderPath);

    var keys = content.Keys.ToList();
    var searchTasks = new Dictionary<string, Task<IEnumerable<string>>>();
    var delayedSearchTasks = new Dictionary<string, Task<IEnumerable<string>>>();
    for (var i = 0; i < searches; i++)
    {
        var key = keys[rng.Next(keys.Count)];
        var values = content[key];
        var query = values[rng.Next(values.Count)];
        searchTasks[key] = Task.Run(() => analyzer.Search(query));
        delayedSearchTasks[key] = Task.Run(async () => await analyzer.DelayedSearch(query));
    }
    await Task.WhenAll(searchTasks.Values);
    await Task.WhenAll(delayedSearchTasks.Values);

    Assert.All(searchTasks, kv =>
    {
        var (key, task) = kv;
        var result = task.Result.ToList();
        if (result.Any())
        {
            Assert.All(result, s => Assert.True(s.Equals(key, StringComparison.InvariantCultureIgnoreCase)));
        }
    });
    Assert.All(delayedSearchTasks, kv =>
    {
        var (key, task) = kv;
        var result = task.Result.ToList();
        Assert.NotEmpty(result);
        Assert.All(result, s => Assert.True(s.Equals(key, StringComparison.InvariantCultureIgnoreCase)));
    });

    SpinWait.SpinUntil(() => !analyzer.IsIndexing);

    var changedFiles = new Dictionary<string, List<string>>();
    for (var i = 0; i < changes; i++)
    {
        var guids = new List<string>(Generators.Generate(100, () => Guid.NewGuid().ToString()));
        var text = string.Join(' ', guids);
        var key = keys[rng.Next(keys.Count)];
        keys.Remove(key);
        changedFiles[key] = guids;
        tmp.ChangeFile(key, () => text);
    }
    await Task.Delay(10 * changes);
    SpinWait.SpinUntil(() => !analyzer.IsIndexing);

    keys = changedFiles.Keys.ToList();
    searchTasks = new Dictionary<string, Task<IEnumerable<string>>>();
    delayedSearchTasks = new Dictionary<string, Task<IEnumerable<string>>>();
    for (var i = 0; i < searches; i++)
    {
        var key = keys[rng.Next(keys.Count)];
        var values = changedFiles[key];
        var query = values[rng.Next(values.Count)];
        searchTasks[key] = Task.Run(() => analyzer.Search(query));
        delayedSearchTasks[key] = Task.Run(async () => await analyzer.DelayedSearch(query));
    }
    await Task.WhenAll(searchTasks.Values);
    await Task.WhenAll(delayedSearchTasks.Values);

    Assert.All(searchTasks, kv =>
    {
        var (key, task) = kv;
        var result = task.Result.ToList();
        if (result.Any())
        {
            Assert.All(result, s => Assert.True(s.Equals(key, StringComparison.InvariantCultureIgnoreCase)));
        }
    });
    Assert.All(delayedSearchTasks, kv =>
    {
        var (key, task) = kv;
        var result = task.Result.ToList();
        Assert.NotEmpty(result);
        Assert.All(result, s => Assert.True(s.Equals(key, StringComparison.InvariantCultureIgnoreCase)));
    });
}