/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Norwegian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("havnedistriktene"), false);
    Analyzer analyzer = new NorwegianAnalyzer(TEST_VERSION_CURRENT, NorwegianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "havnedistriktene", "havnedistriktene"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "havnedistrikter", "havnedistrikt");     // not excluded: stemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Italian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("abbandonata"), false);
    Analyzer analyzer = new ItalianAnalyzer(TEST_VERSION_CURRENT, ItalianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "abbandonata", "abbandonata"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "abbandonati", "abbandonat");  // not excluded: stemmed
}
/// <summary>
/// Verifies case-sensitive stop filtering: with ignoreCase disabled the set
/// holds "the" and "Time" literally, so "The" survives while "is" and "Time"
/// are removed.
/// </summary>
public virtual void TestExactCase()
{
    StringReader input = new StringReader("Now is The Time");
    CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, new string[] { "is", "the", "Time" }, false);
    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(input, MockTokenizer.WHITESPACE, false), stopWords);
    AssertTokenStreamContents(stream, new string[] { "Now", "The" });
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Galician
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("correspondente"), false);
    Analyzer analyzer = new GalicianAnalyzer(TEST_VERSION_CURRENT, GalicianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "correspondente", "correspondente"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "corresponderá", "correspond");      // not excluded: stemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Finnish
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("edeltäjistään"), false);
    Analyzer analyzer = new FinnishAnalyzer(TEST_VERSION_CURRENT, FinnishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "edeltäjiinsä", "edeltäj");           // not excluded: stemmed
    CheckOneTerm(analyzer, "edeltäjistään", "edeltäjistään");    // excluded: passes through unstemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Danish
/// stemmer while an already-short term is unaffected either way.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("undersøgelse"), false);
    Analyzer analyzer = new DanishAnalyzer(TEST_VERSION_CURRENT, DanishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "undersøgelse", "undersøgelse"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "undersøg", "undersøg");
}
/// <summary>
/// Initializes the compound-word filter base over <paramref name="input"/>,
/// validating and storing the word/subword size limits.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the <c>TokenStream</c> to decompose </param>
/// <param name="dictionary"> dictionary of words to match subwords against (may be null) </param>
/// <param name="minWordSize"> only words at least this long get processed; must be non-negative </param>
/// <param name="minSubwordSize"> minimum emitted subword length; must be non-negative </param>
/// <param name="maxSubwordSize"> maximum emitted subword length; must be non-negative </param>
/// <param name="onlyLongestMatch"> if true, only the longest matching subword is emitted </param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    // NOTE(review): 'as' yields null if the attribute implementation is not
    // the concrete CharTermAttribute — confirm downstream code tolerates that.
    termAtt = AddAttribute<ICharTermAttribute>() as CharTermAttribute;
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.matchVersion = matchVersion;
    this.tokens = new LinkedList<CompoundToken>();

    // FIX: range violations now throw ArgumentOutOfRangeException (the .NET
    // convention); it derives from ArgumentException, so existing catch
    // blocks remain compatible.
    if (minWordSize < 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(minWordSize), "minWordSize cannot be negative");
    }
    this.minWordSize = minWordSize;
    if (minSubwordSize < 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(minSubwordSize), "minSubwordSize cannot be negative");
    }
    this.minSubwordSize = minSubwordSize;
    if (maxSubwordSize < 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(maxSubwordSize), "maxSubwordSize cannot be negative");
    }
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;
    this.dictionary = dictionary;
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Swedish
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("jaktkarlarne"), false);
    Analyzer analyzer = new SwedishAnalyzer(TEST_VERSION_CURRENT, SwedishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "jaktkarlarne", "jaktkarlarne"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "jaktkarlens", "jaktkarl");      // not excluded: stemmed
}
/// <summary>
/// Verifies that a Sorani term placed in the stem exclusion set is emitted
/// unchanged by the analyzer.
/// </summary>
public virtual void TestWithStemExclusionSet()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    exclusions.add("پیاوە");
    Analyzer analyzer = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
    AssertAnalyzesTo(analyzer, "پیاوە", new string[] { "پیاوە" });
}
// Static initializer for the lazily-loaded Polish defaults. Loads the default
// stop-word set (UTF-8 embedded resource, '#' as the comment marker) and the
// Stempel stemming table. Both resources ship with the distribution, so a
// failure to load is a packaging error and is surfaced as SystemException
// rather than propagating IOException.
// NOTE: the #pragma pair suppresses the obsolete-member warning for
// LuceneVersion.LUCENE_CURRENT inside the argument list; its placement is
// load-bearing and left untouched.
static DefaultsHolder() { try { DEFAULT_STOP_SET = WordlistLoader.GetWordSet(IOUtils.GetDecodingReader(typeof(PolishAnalyzer), typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STOPWORD_FILE, Encoding.UTF8), "#", #pragma warning disable 612, 618 LuceneVersion.LUCENE_CURRENT); #pragma warning restore 612, 618 } catch (IOException ex) { // default set should always be present as it is part of the // distribution (embedded resource) throw new SystemException("Unable to load default stopword set", ex); } try { DEFAULT_TABLE = StempelStemmer.Load(typeof(PolishAnalyzer).Assembly.GetManifestResourceStream( typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STEMMER_FILE)); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (embedded resource) throw new SystemException("Unable to load default stemming tables", ex); } }
/// <summary>
/// Test for NPE: each of the three <c>contains</c> overloads must reject a
/// null argument by throwing.
/// NOTE(review): this is a direct Java port — in .NET these calls may throw
/// ArgumentNullException rather than NullReferenceException; confirm the
/// ported CharArraySet's contract before relying on these catch clauses.
/// </summary>
public virtual void testContainsWithNull() { CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true); try { set.contains((char[]) null, 0, 10); fail("null value must raise NPE"); } catch (System.NullReferenceException) { } try { set.contains((CharSequence) null); fail("null value must raise NPE"); } catch (System.NullReferenceException) { } try { set.contains((object) null); fail("null value must raise NPE"); } catch (System.NullReferenceException) { } }
/// <summary>
/// Terms marked as keywords via SetKeywordMarkerFilter must not be stemmed:
/// "fischen" (in the keyword set) survives while "trinken" stems to "trink".
/// </summary>
public virtual void TestWithKeywordAttribute()
{
    CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    keywords.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new SetKeywordMarkerFilter(
            new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")),
            keywords));
    AssertTokenStreamContents(filter, new string[] { "fischen", "trink" });
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Armenian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("արծիվներ"), false);
    Analyzer analyzer = new ArmenianAnalyzer(TEST_VERSION_CURRENT, ArmenianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "արծիվներ", "արծիվներ"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "արծիվ", "արծ");         // not excluded: stemmed
}
internal readonly bool forceFirstLetter; // capitalize the first letter even when the word is in the keep list

/// <summary>
/// Creates a new CapitalizationFilterFactory, consuming its configuration
/// from the factory argument map; any arguments left unconsumed are unknown
/// and cause an ArgumentException.
/// </summary>
public CapitalizationFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    assureMatchVersion();
    bool keepIgnoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);

    HashSet<string> keepWords = getSet(args, KEEP);
    if (keepWords != null)
    {
        keep = new CharArraySet(luceneMatchVersion, 10, keepIgnoreCase);
        keep.AddAll(keepWords);
    }

    HashSet<string> prefixes = getSet(args, OK_PREFIX);
    if (prefixes != null)
    {
        okPrefix = new List<char[]>();
        foreach (string prefix in prefixes)
        {
            okPrefix.Add(prefix.ToCharArray());
        }
    }

    minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
    maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
    maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
    forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);

    // Anything still in the map was not consumed above and is unrecognized.
    if (args.Count > 0)
    {
        throw new System.ArgumentException("Unknown parameters: " + args);
    }
}
/// <summary>
/// Sets the article set used by this filter. A <c>CharArraySet</c> is reused
/// directly; any other set is wrapped in a new case-insensitive copy.
/// </summary>
public void SetArticles(ISet<string> articles)
{
    this.articles = articles as CharArraySet ?? new CharArraySet(articles, true);
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Hungarian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("babakocsi"), false);
    Analyzer analyzer = new HungarianAnalyzer(TEST_VERSION_CURRENT, HungarianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "babakocsi", "babakocsi");        // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "babakocsijáért", "babakocs");    // not excluded: stemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Indonesian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("peledakan"), false);
    Analyzer analyzer = new IndonesianAnalyzer(TEST_VERSION_CURRENT, IndonesianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "peledakan", "peledakan"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "pembunuhan", "bunuh");    // not excluded: stemmed
}
/// <summary>
/// Loads the configured word files (if any) into the word set once the
/// resource loader becomes available.
/// </summary>
public virtual void Inform(ResourceLoader loader)
{
    if (wordFiles == null)
    {
        return; // no word files configured; leave words unset
    }
    words = GetWordSet(loader, wordFiles, ignoreCase);
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Latvian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("tirgiem"), false);
    Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT, LatvianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "tirgiem", "tirgiem"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "tirgus", "tirg");     // not excluded: stemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the English
/// stemmer while an already-stemmed form is unaffected.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("books"), false);
    Analyzer analyzer = new EnglishAnalyzer(TEST_VERSION_CURRENT, EnglishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "books", "books"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "book", "book");
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Romanian
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("absenţa"), false);
    Analyzer analyzer = new RomanianAnalyzer(TEST_VERSION_CURRENT, RomanianAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "absenţa", "absenţa"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "absenţi", "absenţ");  // not excluded: stemmed
}
/// <summary>
/// Verifies that a Czech term placed in the stem exclusion set is kept
/// verbatim while neighbouring terms are still stemmed.
/// </summary>
public virtual void TestWithStemExclusionSet()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    exclusions.add("hole");
    CzechAnalyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
    AssertAnalyzesTo(analyzer, "hole desek", new string[] { "hole", "desk" });
}
/// <summary>
/// Verifies that a Russian term placed in the stem exclusion set is kept
/// verbatim while every other term in the sentence is stemmed.
/// </summary>
public virtual void TestWithStemExclusionSet()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    exclusions.add("представление");
    Analyzer analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.DefaultStopSet, exclusions);
    AssertAnalyzesTo(analyzer,
        "Вместе с тем о силе электромагнитной энергии имели представление еще",
        new string[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Turkish
/// stemmer while the base form is unaffected.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("ağacı"), false);
    Analyzer analyzer = new TurkishAnalyzer(TEST_VERSION_CURRENT, TurkishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "ağacı", "ağacı"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "ağaç", "ağaç");
}
/// <summary>
/// Verifies that a Bulgarian term placed in the stem exclusion set is kept
/// verbatim while a neighbouring term is still stemmed.
/// </summary>
public virtual void TestWithStemExclusionSet()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    exclusions.add("строеве");
    Analyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
    AssertAnalyzesTo(analyzer, "строевете строеве", new string[] { "строй", "строеве" });
}
/// <summary>
/// Constructs a ChineseFilter over the given stream, building its stop table
/// from the static <c>STOP_WORDS</c> list (case-sensitive lookup).
/// </summary>
/// <param name="in"> the <c>TokenStream</c> to filter </param>
public ChineseFilter(TokenStream @in)
    : base(@in)
{
    // 'false' disables ignore-case, so stop-word matching is exact.
    stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.AsList(STOP_WORDS), false);
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Catalan
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("llengües"), false);
    Analyzer analyzer = new CatalanAnalyzer(TEST_VERSION_CURRENT, CatalanAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "llengües", "llengües"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "llengua", "llengu");    // not excluded: stemmed
}
/// <summary>
/// Creates a CapitalizationFilter with the specified parameters. </summary>
/// <param name="in"> input tokenstream </param>
/// <param name="onlyFirstWord"> should each word be capitalized or all of the words? </param>
/// <param name="keep"> a keep word list. Each word that should be kept separated by whitespace. </param>
/// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param>
/// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list. </param>
/// <param name="minWordLength"> how long the word needs to be to get capitalization applied. If the
///   minWordLength is 3, "and" > "And" but "or" stays "or". </param>
/// <param name="maxWordCount"> if the token contains more then maxWordCount words, the capitalization is
///   assumed to be correct. </param>
/// <param name="maxTokenLength"> tokens longer than this are passed through unchanged </param>
/// <exception cref="ArgumentOutOfRangeException"> if minWordLength is negative, or
///   maxWordCount / maxTokenLength is less than one </exception>
public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
    : base(@in)
{
    // LUCENENET: The guard clauses were copied here from the 4.8.1 version of Lucene.
    // FIX: the single-string ArgumentOutOfRangeException constructor treats its
    // argument as the *parameter name*, so these messages were previously being
    // reported as parameter names; pass paramName and message separately.
    if (minWordLength < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minWordLength), "minWordLength must be greater than or equal to zero");
    }
    if (maxWordCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maxWordCount), "maxWordCount must be greater than zero");
    }
    if (maxTokenLength < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maxTokenLength), "maxTokenLength must be greater than zero");
    }
    this.onlyFirstWord = onlyFirstWord;
    this.keep = keep;
    this.forceFirstLetter = forceFirstLetter;
    this.okPrefix = okPrefix;
    this.minWordLength = minWordLength;
    this.maxWordCount = maxWordCount;
    this.maxTokenLength = maxTokenLength;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Irish
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("feirmeoireacht"), false);
    Analyzer analyzer = new IrishAnalyzer(TEST_VERSION_CURRENT, IrishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "feirmeoireacht", "feirmeoireacht"); // excluded: passes through unstemmed
    CheckOneTerm(analyzer, "siopadóireacht", "siopadóir");      // not excluded: stemmed
}
/// <summary>
/// Verifies that a term in the stem exclusion set bypasses the Spanish
/// stemmer while other terms are still stemmed normally.
/// </summary>
public virtual void TestExclude()
{
    CharArraySet exclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("chicano"), false);
    Analyzer analyzer = new SpanishAnalyzer(TEST_VERSION_CURRENT, SpanishAnalyzer.DefaultStopSet, exclusions);
    CheckOneTerm(analyzer, "chicana", "chican");  // not excluded: stemmed
    CheckOneTerm(analyzer, "chicano", "chicano"); // excluded: passes through unstemmed
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions;
/// delegates to the full constructor with an empty exclusion set.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public BulgarianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions;
/// delegates to the full constructor with an empty exclusion set.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public PortugueseAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Captures the enclosing test instance and the exclusion set so the
/// anonymous analyzer can use them when creating its components.
/// </summary>
public AnalyzerAnonymousInnerClassHelper2(TestGermanLightStemFilter outerInstance, CharArraySet exclusionSet)
{
    this.outerInstance = outerInstance;
    this.exclusionSet = exclusionSet;
}
/// <summary>
/// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/>
/// </summary>
/// <param name="matchVersion">
///   Lucene version to enable correct Unicode 4.0 behavior in the
///   dictionaries if Version > 3.0. See <a
///   href="CompoundWordTokenFilterBase.html#version"
///   >CompoundWordTokenFilterBase</a> for details. </param>
/// <param name="input">
///   the <see cref="TokenStream"/> to process </param>
/// <param name="dictionary">
///   the word dictionary to match against; must not be null. </param>
/// <param name="minWordSize">
///   only words longer than this get processed </param>
/// <param name="minSubwordSize">
///   only subwords longer than this get to the output stream </param>
/// <param name="maxSubwordSize">
///   only subwords shorter than this get to the output stream </param>
/// <param name="onlyLongestMatch">
///   Add only the longest matching subword to the stream </param>
/// <exception cref="ArgumentNullException"> if <paramref name="dictionary"/> is null </exception>
public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
{
    // FIX: null arguments throw ArgumentNullException (the .NET convention);
    // it derives from ArgumentException, so existing catch blocks still work.
    // NOTE(review): the check necessarily runs after the base constructor,
    // which already received the null — confirm the base tolerates that.
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary), "dictionary cannot be null");
    }
}
/// <summary>
/// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/> with the
/// default word/subword size limits.
/// </summary>
/// <param name="matchVersion">
///   Lucene version to enable correct Unicode 4.0 behavior in the
///   dictionaries if Version > 3.0. See <a
///   href="CompoundWordTokenFilterBase.html#version"
///   >CompoundWordTokenFilterBase</a> for details. </param>
/// <param name="input">
///   the <see cref="TokenStream"/> to process </param>
/// <param name="dictionary">
///   the word dictionary to match against; must not be null. </param>
/// <exception cref="ArgumentNullException"> if <paramref name="dictionary"/> is null </exception>
public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
    : base(matchVersion, input, dictionary)
{
    // FIX: null arguments throw ArgumentNullException (the .NET convention);
    // it derives from ArgumentException, so existing catch blocks still work.
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary), "dictionary cannot be null");
    }
}
/// <summary>
/// Convenience wrapper: tokenizes <paramref name="input"/> as a single
/// KEYWORD token and asserts the capitalization result equals
/// <paramref name="expected"/>.
/// </summary>
internal static void AssertCapitalizesToKeyword(string input, string expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
{
    Tokenizer keywordTokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false);
    AssertCapitalizesTo(keywordTokenizer, new string[] { expected },
        onlyFirstWord, keep, forceFirstLetter, okPrefix,
        minWordLength, maxWordCount, maxTokenLength);
}
/// <summary>
/// Runs a <c>CapitalizationFilter</c> with the given settings over
/// <paramref name="tokenizer"/> and asserts the produced terms match
/// <paramref name="expected"/>.
/// </summary>
internal static void AssertCapitalizesTo(Tokenizer tokenizer, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
{
    // LUCENENET specific - pass in the invariant culture to get the same behavior as Lucene,
    // otherwise the filter is culture-sensitive.
    CapitalizationFilter filter = new CapitalizationFilter(
        tokenizer, onlyFirstWord, keep, forceFirstLetter, okPrefix,
        minWordLength, maxWordCount, maxTokenLength,
        CultureInfo.InvariantCulture);
    AssertTokenStreamContents(filter, expected);
}
/// <summary>
/// Captures the enclosing test instance and the exclusion set so the
/// anonymous analyzer can use them when creating its components.
/// </summary>
public AnalyzerAnonymousInnerClassHelper2(TestFrenchMinimalStemFilter outerInstance, CharArraySet exclusionSet)
{
    this.outerInstance = outerInstance;
    this.exclusionSet = exclusionSet;
}
/// <summary>
/// Convenience constructor: delegates to the full constructor using the
/// default word/subword size limits and without only-longest-match behavior.
/// </summary>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
    : this(matchVersion, input, dictionary,
           DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE,
           false)
{
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// <para>
/// <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
/// <seealso cref="GreekLowerCaseFilter"/> for best results.
/// </para>
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version,
///   See <a href="#version">above</a> </param>
/// <param name="stopwords"> a stopword set </param>
public GreekAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : base(matchVersion, stopwords)
{
}
/// <summary>
/// Builds an analyzer with the given stop words and a set of words to be
/// excluded from the <see cref="CzechStemFilter"/>.
/// </summary>
/// <param name="matchVersion"> <see cref="LuceneVersion"/> to match </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionTable"> a stemming exclusion set </param>
public CzechAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
    : base(matchVersion, stopwords)
{
    // Defensive copy, frozen so later mutation of the caller's set cannot
    // affect this analyzer.
    this.stemExclusionTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
}
/// <summary>
/// Initializes the compound-word filter base over <paramref name="input"/>,
/// validating and storing the word/subword size limits.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the <c>TokenStream</c> to decompose </param>
/// <param name="dictionary"> dictionary of words to match subwords against </param>
/// <param name="minWordSize"> only words at least this long get processed; must be non-negative </param>
/// <param name="minSubwordSize"> minimum emitted subword length; must be non-negative </param>
/// <param name="maxSubwordSize"> maximum emitted subword length; must be non-negative </param>
/// <param name="onlyLongestMatch"> if true, only the longest matching subword is emitted </param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    // Validate the size limits up front; a failed constructor discards the
    // instance, so grouping the guards here is behaviorally equivalent.
    // LUCENENET specific - changed from IllegalArgumentException to
    // ArgumentOutOfRangeException (.NET convention).
    if (minWordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minWordSize), "minWordSize cannot be negative");
    }
    if (minSubwordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minSubwordSize), "minSubwordSize cannot be negative");
    }
    if (maxSubwordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxSubwordSize), "maxSubwordSize cannot be negative");
    }

    m_termAtt = AddAttribute<ICharTermAttribute>();
    m_offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.m_matchVersion = matchVersion;
    this.m_tokens = new Queue<CompoundToken>();
    this.m_minWordSize = minWordSize;
    this.m_minSubwordSize = minSubwordSize;
    this.m_maxSubwordSize = maxSubwordSize;
    this.m_onlyLongestMatch = onlyLongestMatch;
    this.m_dictionary = dictionary;
}
// Exercises CharArrayMap's collection contract end-to-end: putAll/size/clear,
// the keys view (iteration, containsKey via object and via char[] slice,
// read-only add rejection), and the entrySet view (value mutation through an
// entry and through the map's own EntryIterator, and clearing through the
// view clearing the map). Left byte-identical: the assertions depend on the
// exact interleaving of mutation and iteration, and the Java-ported iterator
// APIs (hasNext/nextKey/currentValue) make a safe restyle impractical.
public virtual void testMethods() { CharArrayMap <int?> cm = new CharArrayMap <int?>(TEST_VERSION_CURRENT, 2, false); Dictionary <string, int?> hm = new Dictionary <string, int?>(); hm["foo"] = 1; hm["bar"] = 2; cm.putAll(hm); assertEquals(hm.Count, cm.size()); hm["baz"] = 3; cm.putAll(hm); assertEquals(hm.Count, cm.size()); CharArraySet cs = cm.Keys; int n = 0; foreach (object o in cs) { assertTrue(cm.containsKey(o)); char[] co = (char[])o; assertTrue(cm.containsKey(co, 0, co.Length)); n++; } assertEquals(hm.Count, n); assertEquals(hm.Count, cs.size()); assertEquals(cm.size(), cs.size()); cs.clear(); assertEquals(0, cs.size()); assertEquals(0, cm.size()); try { cs.add("test"); fail("keySet() allows adding new keys"); } catch (System.NotSupportedException) { // pass } cm.putAll(hm); assertEquals(hm.Count, cs.size()); assertEquals(cm.size(), cs.size()); IEnumerator <KeyValuePair <object, int?> > iter1 = cm.entrySet().GetEnumerator(); n = 0; while (iter1.MoveNext()) { KeyValuePair <object, int?> entry = iter1.Current; object key = entry.Key; int? val = entry.Value; assertEquals(cm.get(key), val); entry.Value = val * 100; assertEquals(val * 100, (int)cm.get(key)); n++; } assertEquals(hm.Count, n); cm.clear(); cm.putAll(hm); assertEquals(cm.size(), n); CharArrayMap <int?> .EntryIterator iter2 = cm.entrySet().GetEnumerator(); n = 0; while (iter2.hasNext()) { char[] keyc = iter2.nextKey(); int? val = iter2.currentValue(); assertEquals(hm[new string(keyc)], val); iter2.Value = val * 100; assertEquals(val * 100, (int)cm.get(keyc)); n++; } assertEquals(hm.Count, n); cm.entrySet().clear(); assertEquals(0, cm.size()); assertEquals(0, cm.entrySet().size()); assertTrue(cm.Empty); }
/// <summary>
/// Builds an analyzer with the given stop words and a stem exclusion set.
/// If a stem exclusion set is provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/>
/// before <see cref="BulgarianStemFilter"/>.
/// </summary>
public BulgarianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy, frozen so later mutation of the caller's set cannot
    // affect this analyzer.
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions;
/// delegates to the full constructor with an empty exclusion set.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Create a new <see cref="SetKeywordMarkerFilter"/>, that marks the current token as a
/// keyword if the tokens term buffer is contained in the given set via the
/// <see cref="KeywordAttribute"/>.
/// </summary>
/// <param name="in">
///   <see cref="TokenStream"/> to filter </param>
/// <param name="keywordSet">
///   the keywords set to lookup the current termbuffer </param>
public SetKeywordMarkerFilter(TokenStream @in, CharArraySet keywordSet)
    : base(@in)
{
    this.keywordSet = keywordSet;
    termAtt = AddAttribute<ICharTermAttribute>();
}
// Ported from Java; the original declared 'throws IOException', which has no
// .NET equivalent.
/// <summary>
/// Convenience wrapper: whitespace-tokenizes <paramref name="input"/> and
/// asserts the capitalization results match <paramref name="expected"/>.
/// </summary>
internal static void assertCapitalizesTo(string input, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
{
    Tokenizer whitespaceTokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    assertCapitalizesTo(whitespaceTokenizer, expected,
        onlyFirstWord, keep, forceFirstLetter, okPrefix,
        minWordLength, maxWordCount, maxTokenLength);
}
/// <summary>
/// Initializes the compound-word filter base over <paramref name="input"/>,
/// validating and storing the word/subword size limits.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the <c>TokenStream</c> to decompose </param>
/// <param name="dictionary"> dictionary of words to match subwords against (may be null) </param>
/// <param name="minWordSize"> only words at least this long get processed; must be non-negative </param>
/// <param name="minSubwordSize"> minimum emitted subword length; must be non-negative </param>
/// <param name="maxSubwordSize"> maximum emitted subword length; must be non-negative </param>
/// <param name="onlyLongestMatch"> if true, only the longest matching subword is emitted </param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    m_termAtt = AddAttribute<ICharTermAttribute>();
    m_offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.m_matchVersion = matchVersion;
    this.m_tokens = new Queue<CompoundToken>();

    // FIX: range violations now throw ArgumentOutOfRangeException (the .NET
    // convention); it derives from ArgumentException, so existing catch
    // blocks remain compatible.
    if (minWordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minWordSize), "minWordSize cannot be negative");
    }
    this.m_minWordSize = minWordSize;
    if (minSubwordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minSubwordSize), "minSubwordSize cannot be negative");
    }
    this.m_minSubwordSize = minSubwordSize;
    if (maxSubwordSize < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxSubwordSize), "maxSubwordSize cannot be negative");
    }
    this.m_maxSubwordSize = maxSubwordSize;
    this.m_onlyLongestMatch = onlyLongestMatch;
    this.m_dictionary = dictionary;
}
/// <summary>
/// Creates a new WordDelimiterFilter using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
/// as its charTypeTable; delegates to the full constructor.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : this(matchVersion, @in,
           WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
           configurationFlags, protWords)
{
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
/// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
/// stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy, frozen so later mutation of the caller's set cannot
    // affect this analyzer.
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Constructs a ChineseFilter over the given stream, building its stop table
/// from the static <c>STOP_WORDS</c> list (case-sensitive lookup).
/// </summary>
public ChineseFilter(TokenStream _in)
    : base(_in)
{
    // 'false' disables ignore-case, so stop-word matching is exact.
    stopTable = new CharArraySet((IEnumerable<string>)STOP_WORDS, false);
    termAtt = AddAttribute<ITermAttribute>();
}
// Ported from Java; the original declared 'throws IOException', which has no
// .NET equivalent.
/// <summary>
/// Runs a <c>CapitalizationFilter</c> with the given settings over
/// <paramref name="tokenizer"/> and asserts the produced terms match
/// <paramref name="expected"/>.
/// </summary>
internal static void assertCapitalizesTo(Tokenizer tokenizer, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
{
    CapitalizationFilter filter = new CapitalizationFilter(
        tokenizer, onlyFirstWord, keep, forceFirstLetter, okPrefix,
        minWordLength, maxWordCount, maxTokenLength);
    assertTokenStreamContents(filter, expected);
}
/// <summary>
/// Creates a KeepWordFilter that passes through only tokens found in
/// <paramref name="words"/>.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="enablePositionIncrements"> whether removed tokens leave position gaps </param>
/// <param name="in"> the <c>TokenStream</c> to consume </param>
/// <param name="words"> the words to keep; used directly, so do not modify afterwards </param>
public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
    : base(version, enablePositionIncrements, @in)
{
    this.words = words;
    termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary> /// Creates a new WordDelimiterFilter /// </summary> /// <param name="matchVersion"> lucene compatibility version </param> /// <param name="in"> TokenStream to be filtered </param> /// <param name="charTypeTable"> table containing character types </param> /// <param name="configurationFlags"> Flags configuring the filter </param> /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
// Initialization order is deliberate: attributes and helper objects (two
// concatenation buffers and the offset sorter) are created first, then the
// version gate rejects pre-4.8 match versions (Lucene47WordDelimiterFilter
// emulates the old behavior), and finally the flags/protected-word set and
// the WordDelimiterIterator are stored. The iterator is configured from the
// three SPLIT_ON_CASE_CHANGE / SPLIT_ON_NUMERICS / STEM_ENGLISH_POSSESSIVE
// flags. Left byte-identical; only comments added.
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords) : base(@in) { this.termAttribute = AddAttribute <ICharTermAttribute>(); this.offsetAttribute = AddAttribute <IOffsetAttribute>(); this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>(); this.typeAttribute = AddAttribute <ITypeAttribute>(); concat = new WordDelimiterConcatenation(this); concatAll = new WordDelimiterConcatenation(this); sorter = new OffsetSorter(this); if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) { throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter"); } this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator(charTypeTable, Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE), Has(WordDelimiterFlags.SPLIT_ON_NUMERICS), Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE)); }
/// <summary>
/// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance
/// with the default word/subword size limits.
/// </summary>
/// <param name="matchVersion">
///   Lucene version to enable correct Unicode 4.0 behavior in the
///   dictionaries if Version > 3.0. See <a
///   href="CompoundWordTokenFilterBase.html#version"
///   >CompoundWordTokenFilterBase</a> for details. </param>
/// <param name="input">
///   the <see cref="TokenStream"/> to process </param>
/// <param name="hyphenator">
///   the hyphenation pattern tree to use for hyphenation </param>
/// <param name="dictionary">
///   the word dictionary to match against. </param>
public HyphenationCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary)
    : this(matchVersion, input, hyphenator, dictionary,
           DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE,
           false)
{
}
/// <summary>
/// Create a new <seealso cref="KeepWordFilter"/>.
/// <para><b>NOTE</b>: The words set passed to this constructor will be directly
/// used by this filter and should not be modified.
/// </para>
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to consume </param>
/// <param name="words"> the words to keep </param>
public KeepWordFilter(Version version, TokenStream @in, CharArraySet words)
    : base(version, @in)
{
    this.words = words;
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions;
/// delegates to the full constructor with an empty exclusion set.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds an EnglishAnalyzer whose stop set is read from the file at
/// <c>stopPath</c> (one stop word per line; matching is case-sensitive).
/// </summary>
private static Analyzer GetAnalyzer()
{
    var stopLines = System.IO.File.ReadAllLines(stopPath);
    var stopwords = new CharArraySet(LuceneVersion.LUCENE_48, stopLines, false);
    return new EnglishAnalyzer(LuceneVersion.LUCENE_48, stopwords);
}
// Builds a 20-word corpus ("zero" .. "nineteen") and stops every word whose
// index is not divisible by 3. Exercises StopFilter position-increment
// behavior three ways: (1) a Lucene 4.0-version filter (increments enabled),
// (2) a Lucene 4.3-version filter (checked with increments expected off),
// and (3) two concatenated filters whose stop sets split the stop words into
// even/odd halves, which together must behave like the single combined set.
// Left byte-identical: the assertions depend on the exact version-gated
// increment semantics and the obsolete SetEnablePositionIncrements call
// guarded by the #pragma pair.
public virtual void TestStopPositons() { StringBuilder sb = new StringBuilder(); List <string> a = new List <string>(); for (int i = 0; i < 20; i++) { string w = English.IntToEnglish(i).Trim(); sb.Append(w).Append(" "); if (i % 3 != 0) { a.Add(w); } } log(sb.ToString()); string[] stopWords = a.ToArray(); for (int i = 0; i < a.Count; i++) { log("Stop: " + stopWords[i]); } CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords); // with increments StringReader reader = new StringReader(sb.ToString()); #pragma warning disable 612, 618 StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); DoTestStopPositons(stpf, true); // without increments reader = new StringReader(sb.ToString()); stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); #pragma warning restore 612, 618 DoTestStopPositons(stpf, false); // with increments, concatenating two stop filters List <string> a0 = new List <string>(); List <string> a1 = new List <string>(); for (int i = 0; i < a.Count; i++) { if (i % 2 == 0) { a0.Add(a[i]); } else { a1.Add(a[i]); } } string[] stopWords0 = a0.ToArray(); for (int i = 0; i < a0.Count; i++) { log("Stop0: " + stopWords0[i]); } string[] stopWords1 = a1.ToArray(); for (int i = 0; i < a1.Count; i++) { log("Stop1: " + stopWords1[i]); } CharArraySet stopSet0 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords0); CharArraySet stopSet1 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords1); reader = new StringReader(sb.ToString()); StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set #pragma warning disable 612, 618 stpf0.SetEnablePositionIncrements(true); #pragma warning restore 612, 618 StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated! 
DoTestStopPositons(stpf01, true); }
/// <summary>
/// Builds an analyzer with the given stop words. </summary>
/// <param name="matchVersion"> Lucene version to match - See <see cref="UAX29URLEmailAnalyzer"/> </param>
/// <param name="stopWords"> stop words </param>
public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
    : base(matchVersion, stopWords)
{
}