// Type initializer: builds the shared English stop word set once and
// publishes it through ENGLISH_STOP_WORDS_SET as an unmodifiable view.
static StopAnalyzer()
{
    IList<string> englishWords = Arrays.AsList(
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with");

    var mutableSet = new CharArraySet(LuceneVersion.LUCENE_CURRENT, englishWords, false);

    // Wrap before exposing so callers cannot alter the shared set.
    ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(mutableSet);
}
/// <summary>
/// Reads the default stop word list from the manifest resource named
/// "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE in the assembly that
/// defines <c>ArabicAnalyzer</c>.
/// </summary>
/// <returns>An unmodifiable set wrapping a copy of the loaded words.</returns>
internal static ISet<string> LoadDefaultStopWordSet()
{
    var owningAssembly = System.Reflection.Assembly.GetAssembly(typeof(ArabicAnalyzer));
    var resourceName = "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE;

    using (StreamReader reader = new StreamReader(owningAssembly.GetManifestResourceStream(resourceName)))
    {
        var loadedWords = WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT);
        return CharArraySet.UnmodifiableSet(CharArraySet.Copy(loadedWords));
    }
}
/*
 * Builds an analyzer with the given stop words and stemming exclusion words.
 * The stop words are handled by the two-argument constructor; this overload
 * additionally stores an unmodifiable copy of the exclusion set.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 * @param stemExclusionSet
 *          a set of terms excluded from stemming
 */
public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
    : this(matchVersion, stopwords)
{
    var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
    excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Builds an analyzer with the given stop words, stem exclusion set and
/// stem override dictionary.
/// <para/>
/// When <paramref name="stemOverrideDict"/> is empty, or
/// <paramref name="matchVersion"/> is before <c>LUCENE_31</c>, an unmodifiable
/// copy of the dictionary is kept in <c>origStemdict</c> and <c>stemdict</c>
/// is left null. Otherwise the entries are compiled through
/// <c>StemmerOverrideFilter.Builder</c> into <c>stemdict</c> and
/// <c>origStemdict</c> is left null.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionTable">a set of terms excluded from stemming</param>
/// <param name="stemOverrideDict">explicit word-to-stem overrides</param>
/// <exception cref="Exception">Building the override map failed with an <see cref="IOException"/>.</exception>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
{
    this.matchVersion = matchVersion;
    this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
    this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
    if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        this.stemdict = null;
        this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
    }
    else
    {
        this.origStemdict = null;
        // we don't need to ignore case here since we lowercase in this analyzer anyway
        StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
        CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
        while (iter.HasNext)
        {
            char[] nextKey = iter.NextKey();
            // Build the key from exactly the characters of this entry. The
            // previous code copied into a reused CharsRef and then converted
            // its entire backing array, which can still hold trailing
            // characters from an earlier, longer key.
            builder.Add(new string(nextKey), iter.CurrentValue);
        }
        try
        {
            this.stemdict = builder.Build();
        }
        catch (IOException ex)
        {
            throw new Exception("can not build stem dict", ex);
        }
    }
}
/*
 * Builds an analyzer from a stop word set and a stem exclusion set.
 * Both sets are copied and stored as unmodifiable views.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 * @param stemExclusionTable
 *          a set of terms excluded from stemming
 */
public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    stoptable = CharArraySet.UnmodifiableSet(stopCopy);

    var exclusionCopy = CharArraySet.Copy(stemExclusionTable);
    excltable = CharArraySet.UnmodifiableSet(exclusionCopy);

    this.matchVersion = matchVersion;
    SetOverridesTokenStreamMethod<DutchAnalyzer>();
}
/// <summary>
/// Creates an analyzer that uses the supplied stop words. When the stem
/// exclusion set is non-empty, a <see cref="SetKeywordMarkerFilter"/> is
/// added ahead of stemming.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    this.stemTable = DefaultsHolder.DEFAULT_TABLE;

    // Keep a private, read-only copy of the exclusion set.
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Creates an analyzer from a stop word set, a stem exclusion set and the
/// DIN-2 normalization switch. Both sets are copied and held as unmodifiable views.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionSet">a stemming exclusion set</param>
/// <param name="normalizeDin2">Specifies if the DIN-2007-2 style stemmer should be used in addition to DIN1. This
/// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
/// respectively, before the DIN1 stemmer is invoked.</param>
public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool normalizeDin2)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    stopSet = CharArraySet.UnmodifiableSet(stopCopy);

    var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
    exclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);

    this.matchVersion = matchVersion;
    _normalizeDin2 = normalizeDin2;
    SetOverridesTokenStreamMethod<GermanAnalyzer>();
}
// Type initializer: fills a CharArraySet from the EnglishStopWords array and
// publishes it through EnglishStopWordsSet as an unmodifiable view.
static StopWordList()
{
    var mutableSet = new CharArraySet(EnglishStopWords.Length, false);
    var wordList = new System.Collections.ArrayList(EnglishStopWords);
    mutableSet.AddAll(wordList);

    EnglishStopWordsSet = CharArraySet.UnmodifiableSet(mutableSet);
}
/// <summary>
/// Initializes the default stop words from a <c>PersianAnalyzer</c> created
/// for <c>LUCENE_48</c>, and the default articles from the <c>Stopwords</c>
/// member wrapped in an unmodifiable <c>CharArraySet</c>.
/// </summary>
public CustomAnalyzer()
{
    var persianAnalyzer = new PersianAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48);
    _defaultStopwords = persianAnalyzer.StopwordSet;

    var articleSet = new CharArraySet(
        Lucene.Net.Util.LuceneVersion.LUCENE_48,
        Arrays.AsList(Stopwords),
        true);
    _defaultArticles = CharArraySet.UnmodifiableSet(articleSet);
}
/// <summary>
/// Loads the default stop word file (decoded as UTF-8) located relative to
/// <c>SmartChineseAnalyzer</c> and returns it as an unmodifiable set.
/// </summary>
/// <returns>An unmodifiable view of the loaded word set.</returns>
internal static CharArraySet LoadDefaultStopWordSet()
{
    var reader = IOUtils.GetDecodingReader(typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
    var loadedWords = WordlistLoader.GetWordSet(reader, STOPWORD_FILE_COMMENT, LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618

    // The set is exposed by the outer class, so it must not be modifiable.
    return CharArraySet.UnmodifiableSet(loadedWords);
}
/// <summary>
/// Builds the English stop word set and returns it as an unmodifiable view.
/// </summary>
// LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
private static CharArraySet LoadEnglishStopWordsSet()
{
    IList<string> englishWords = new string[]
    {
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };
#pragma warning disable 612, 618
    var mutableSet = new CharArraySet(LuceneVersion.LUCENE_CURRENT, englishWords, false);
#pragma warning restore 612, 618
    return CharArraySet.UnmodifiableSet(mutableSet);
}
/// <summary>
/// Loads the default stop word list from the manifest resource named
/// "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE in the assembly that
/// defines <c>PersianAnalyzer</c>, reading it as UTF-8.
/// </summary>
/// <returns>An unmodifiable set wrapping a <c>CharArraySet</c> built from the loaded words.</returns>
/// <exception cref="System.InvalidOperationException">
/// The embedded resource could not be found. Previously this surfaced as a
/// <see cref="System.NullReferenceException"/> thrown from the cleanup code,
/// which hid the real cause.
/// </exception>
static ISet<String> LoadDefaultStopWordSet()
{
    string resourceName = "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE;

    // GetManifestResourceStream returns null when the resource is missing;
    // a using statement tolerates a null resource, unlike an explicit Close().
    using (var stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream(resourceName))
    {
        if (stream == null)
        {
            throw new System.InvalidOperationException("Embedded stop word resource not found: " + resourceName);
        }

        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8))
        {
            // make sure it is unmodifiable as we expose it in the outer class
            return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
        }
    }
}
/// <summary>
/// Resolves the stop word set for the search document stored in the
/// <c>Constants.TlsSearchInfo</c> thread data slot. Without such a document,
/// portal 0 and the current thread culture are used. Falls back to
/// <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c> when no stop words are
/// configured for the resolved portal and culture.
/// </summary>
/// <returns>The configured stop words (lower-cased, comma-separated source) or the English defaults.</returns>
private static ISet<string> GetStopWords()
{
    int portalId;
    string cultureCode;

    var document = Thread.GetData(Thread.GetNamedDataSlot(Constants.TlsSearchInfo)) as SearchDocument;
    if (document != null)
    {
        portalId = document.PortalId;
        cultureCode = document.CultureCode;
        if (string.IsNullOrEmpty(cultureCode))
        {
            // The document carries no culture: use the portal's default language if the portal exists.
            var portal = PortalController.Instance.GetPortal(portalId);
            if (portal != null)
            {
                cultureCode = portal.DefaultLanguage;
            }
        }
    }
    else
    {
        portalId = 0; // default
        cultureCode = Thread.CurrentThread.CurrentCulture.Name;
    }

    var stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    var configured = SearchHelper.Instance.GetSearchStopWords(portalId, cultureCode);
    if (configured == null || string.IsNullOrEmpty(configured.StopWords))
    {
        return stops;
    }

    //TODO Use cache from InternalSearchController
    var culture = new CultureInfo(cultureCode ?? "en-US");
    var loweredWords = configured.StopWords
        .Split(',')
        .Select(word => word.ToLower(culture))
        .ToArray();

    var customSet = new CharArraySet(loweredWords.Length, false);
    customSet.AddAll(loweredWords);
    stops = CharArraySet.UnmodifiableSet(customSet);
    return stops;
}
// Type initializer: reads "Stopword.txt" from LuceneNetConfig.LuceneDictDirectory
// (when the file exists), adds every line to a CharArraySet and publishes the
// result through _StopWordList as an unmodifiable view.
static StopWord()
{
    CharArraySet words = new CharArraySet(0, true);
    string stopwordPath = Path.Combine(LuceneNetConfig.LuceneDictDirectory, "Stopword.txt");

    if (File.Exists(stopwordPath))
    {
        Encoding fileEncoding = EncodingType.GetType(stopwordPath);
        using (StreamReader reader = new StreamReader(stopwordPath, fileEncoding))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                words.Add(line);
            }
        }
    }

    // English stop words are deliberately not added here: the original author notes that
    // StandardAnalyzer already applies them, so adding
    // StopAnalyzer.ENGLISH_STOP_WORDS_SET would be redundant.
    _StopWordList = CharArraySet.UnmodifiableSet(words);
}
/// <summary>
/// Creates an analyzer that uses the supplied stop words. When the stem
/// exclusion set is non-empty, a <see cref="SetKeywordMarkerFilter"/> is
/// added ahead of stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Keep a private, read-only copy of the exclusion set.
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Creates an analyzer with the supplied stop words and stemming exclusion words.
/// Stop words are handled by the two-argument constructor.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
public BrazilianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : this(matchVersion, stopwords)
{
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
    excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Creates the named analyzer and installs the supplied stop words, stored
/// as an unmodifiable copy.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="name">the analyzer name passed to the two-argument constructor</param>
/// <param name="stopWords">a stopword set</param>
public SnowballAnalyzer(LuceneVersion matchVersion, string name, CharArraySet stopWords)
    : this(matchVersion, name)
{
    var stopCopy = CharArraySet.Copy(matchVersion, stopWords);
    stopSet = CharArraySet.UnmodifiableSet(stopCopy);
}
/*
 * Builds an analyzer from a stop word set and a stemming exclusion set.
 * Both sets are copied and stored as unmodifiable views.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 * @param stemExclutionSet
 *          a stemming exclusion set
 */
public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
{
    this.matchVersion = matchVersion;

    var stopCopy = CharArraySet.Copy(stopwords);
    this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);

    var exclusionCopy = CharArraySet.Copy(stemExclutionSet);
    this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Creates an analyzer with the supplied stop words and keeps an
/// unmodifiable copy of the stemming exclusion set.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclutionSet">a stemming exclusion set</param>
public FrenchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclutionSet)
    : base(matchVersion, stopwords)
{
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclutionSet);
    this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/// <summary>
/// Creates an analyzer from a stop word set and a stem exclusion set.
/// Both sets are copied and stored as unmodifiable views.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionTable">a set of terms excluded from stemming</param>
public DanishAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    StopTable = CharArraySet.UnmodifiableSet(stopCopy);

    var exclusionCopy = CharArraySet.Copy(stemExclusionTable);
    ExclusionTable = CharArraySet.UnmodifiableSet(exclusionCopy);

    MatchVersion = matchVersion;
}
/// <summary>
/// Creates an analyzer that uses an unmodifiable copy of the supplied stop words.
/// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopWords">a stopword set</param>
public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
{
    var stopCopy = CharArraySet.Copy(stopWords);
    stopTable = CharArraySet.UnmodifiableSet(stopCopy);

    this.matchVersion = matchVersion;
}
/*
 * Builds an analyzer with the given stop words and stemming exclusion words.
 * The stop words are handled by the two-argument constructor; this overload
 * additionally stores an unmodifiable copy of the exclusion set.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 * @param stemExclusionSet
 *          a set of terms excluded from stemming
 */
public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
    : this(matchVersion, stopwords)
{
    var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
    this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
// Type initializer: defines the PORTUGUESE stop word array and publishes an
// unmodifiable CharArraySet built from it through PORTUGUESE_SET.
//
// NOTE(review): the list contains "nas" and "pelas" twice and the entry
// "propios"; these look like upstream typos (possibly "pela" / "proprios").
// They are reproduced unchanged here; confirm before altering the list.
static Stopwords()
{
    PORTUGUESE = new string[]
    {
        "a", "ainda", "alem", "ambas", "ambos", "antes", "ao", "aonde", "aos", "apos",
        "aquele", "aqueles", "as", "assim",
        "com", "como", "contra", "contudo", "cuja", "cujas", "cujo", "cujos",
        "da", "das", "de", "dela", "dele", "deles", "demais", "depois", "desde", "desta", "deste",
        "dispoe", "dispoem", "diversa", "diversas", "diversos", "do", "dos", "durante",
        "e", "ela", "elas", "ele", "eles", "em", "entao", "entre",
        "essa", "essas", "esse", "esses", "esta", "estas", "este", "estes",
        "ha", "isso", "isto", "logo",
        "mais", "mas", "mediante", "menos", "mesma", "mesmas", "mesmo", "mesmos",
        "na", "nas", "nao", "nas", "nem", "nesse", "neste", "nos",
        "o", "os", "ou", "outra", "outras", "outro", "outros",
        "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por", "porque", "portanto",
        "proprio", "propios",
        "quais", "qual", "qualquer", "quando", "quanto", "que", "quem", "quer",
        "se", "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua", "suas",
        "tal", "tambem", "teu", "teus", "toda", "todas", "todo", "todos", "tua", "tuas", "tudo",
        "um", "uma", "umas", "uns"
    };

    var mutableSet = new CharArraySet(PORTUGUESE.Length, false);
    var wordList = new System.Collections.ArrayList(PORTUGUESE);
    mutableSet.AddAll(wordList);

    PORTUGUESE_SET = CharArraySet.UnmodifiableSet(mutableSet);
}
/*
 * Builds an analyzer that uses an unmodifiable copy of the given stop words.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 */
public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    stoptable = CharArraySet.UnmodifiableSet(stopCopy);

    this.matchVersion = matchVersion;
}
/*
 * Builds an analyzer that uses an unmodifiable copy of the given stop words.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 */
public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, ISet<string> stopwords)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);

    this.matchVersion = matchVersion;
}
/// <summary>
/// Initializes the base analyzer with the given stopword set. A null set is
/// replaced by <c>CharArraySet.EMPTY_SET</c>; otherwise an unmodifiable copy is stored.
/// </summary>
/// <param name="version">the Lucene version for cross version compatibility</param>
/// <param name="stopwords">the analyzer's stopword set</param>
protected StopwordAnalyzerBase(LuceneVersion version, CharArraySet stopwords)
{
    m_matchVersion = version;

    // analyzers should use char array set for stopwords!
    if (stopwords is null)
    {
        this.m_stopwords = CharArraySet.EMPTY_SET;
    }
    else
    {
        var stopCopy = CharArraySet.Copy(version, stopwords);
        this.m_stopwords = CharArraySet.UnmodifiableSet(stopCopy);
    }
}
/// <summary>
/// Creates an analyzer that uses the supplied stop words. When the stem
/// exclusion set is non-empty, a <see cref="SetKeywordMarkerFilter"/> is
/// added ahead of stemming.
/// </summary>
/// <param name="matchVersion"><see cref="LuceneVersion"/> to match.</param>
/// <param name="stopwords">A stopword set.</param>
/// <param name="stemExclusionSet">A set of terms not to be stemmed.</param>
public UkrainianMorfologikAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Keep a private, read-only copy of the exclusion set.
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
    this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/*
 * Builds an analyzer that uses an unmodifiable copy of the given stop words.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 */
public CzechAnalyzer(Version matchVersion, ISet<string> stopwords)
{
    this.matchVersion = matchVersion;

    var stopCopy = CharArraySet.Copy(stopwords);
    this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);
}
/// <summary>
/// Creates an analyzer with the supplied stop words and a set of words to be
/// excluded from the <see cref="CzechStemFilter"/>.
/// </summary>
/// <param name="matchVersion"> <see cref="LuceneVersion"/> to match </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionTable"> a stemming exclusion set </param>
public CzechAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
    : base(matchVersion, stopwords)
{
    // Keep a private, read-only copy of the exclusion set.
    var exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionTable);
    this.stemExclusionTable = CharArraySet.UnmodifiableSet(exclusionCopy);
}
/*
 * Builds an analyzer that uses an unmodifiable copy of the given stop words.
 *
 * @param matchVersion
 *          lucene compatibility version
 * @param stopwords
 *          a stopword set
 */
public GreekAnalyzer(Version matchVersion, ISet<string> stopwords)
{
    var stopCopy = CharArraySet.Copy(stopwords);
    stopSet = CharArraySet.UnmodifiableSet(stopCopy);

    this.matchVersion = matchVersion;
}