Example #1
0
        // Static initializer: builds the default English stop word set and publishes it
        // through ENGLISH_STOP_WORDS_SET wrapped as an unmodifiable set.
        static StopAnalyzer()
        {
            IList<string> englishWords = Arrays.AsList(
                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
                "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
                "that", "the", "their", "then", "there", "these", "they", "this",
                "to", "was", "will", "with");

            ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(
                new CharArraySet(LuceneVersion.LUCENE_CURRENT, englishWords, false));
        }
Example #2
0
 /// <summary>
 /// Reads the default stop word list from the manifest resource named
 /// "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE in the assembly that contains
 /// <c>ArabicAnalyzer</c>, and returns an unmodifiable copy of the resulting word set.
 /// </summary>
 internal static ISet<string> LoadDefaultStopWordSet()
 {
     var owningAssembly = System.Reflection.Assembly.GetAssembly(typeof(ArabicAnalyzer));
     var resourceName   = "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE;

     using (var wordReader = new StreamReader(owningAssembly.GetManifestResourceStream(resourceName)))
     {
         var loadedWords = WordlistLoader.GetWordSet(wordReader, STOPWORDS_COMMENT);
         return CharArraySet.UnmodifiableSet(CharArraySet.Copy(loadedWords));
     }
 }
Example #3
0
        /*
         * Builds an analyzer with the given stop words and stemming exclusion words.
         * The stop words are handled by the two-argument constructor; the exclusion
         * set is copied and stored as an unmodifiable set.
         *
         * @param matchVersion
         *          lucene compatibility version
         * @param stopwords
         *          a stopword set
         * @param stemExclusionSet
         *          a set of words excluded from stemming
         */

        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords,
                                 ISet<string> stemExclusionSet)
            : this(matchVersion, stopwords)
        {
            var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
            excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
        }
Example #4
0
        /// <summary>
        /// Builds an analyzer with the given stop words, stem exclusion words and
        /// stem override dictionary.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set; an unmodifiable copy is stored</param>
        /// <param name="stemExclusionTable">a stem exclusion set; an unmodifiable copy is stored</param>
        /// <param name="stemOverrideDict">
        /// Stem overrides. When it is empty, or <paramref name="matchVersion"/> is before
        /// LUCENE_31, an unmodifiable copy is kept as-is; otherwise its entries are fed into a
        /// <see cref="StemmerOverrideFilter.Builder"/>.
        /// </param>
        /// <exception cref="Exception">Wraps an <see cref="IOException"/> raised while building the stem dictionary.</exception>
        public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict)
        {
            this.matchVersion = matchVersion;
            this.stoptable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
            this.excltable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
            if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                this.stemdict     = null;
                this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
            }
            else
            {
                this.origStemdict = null;
                // we don't need to ignore case here since we lowercase in this analyzer anyway
                StemmerOverrideFilter.Builder        builder = new StemmerOverrideFilter.Builder(false);
                CharArrayMap <string> .EntryIterator iter    = (CharArrayMap <string> .EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
                while (iter.HasNext)
                {
                    // Build the key string from the key array itself. Staging the key in a reused
                    // CharsRef and converting its whole backing buffer is unsafe: the buffer can be
                    // longer than the logical length once a longer key has been copied into it, so a
                    // shorter key would pick up stale trailing characters.
                    char[] nextKey = iter.NextKey();
                    builder.Add(new string(nextKey), iter.CurrentValue);
                }
                try
                {
                    this.stemdict = builder.Build();
                }
                catch (IOException ex)
                {
                    throw new Exception("can not build stem dict", ex);
                }
            }
        }
Example #5
0
 /// <summary>
 /// Builds an analyzer that keeps unmodifiable copies of the given stop words and
 /// stem exclusion words.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionTable">a stem exclusion set</param>
 public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
 {
     var stopCopy = CharArraySet.Copy(stopwords);
     stoptable = CharArraySet.UnmodifiableSet(stopCopy);

     var exclusionCopy = CharArraySet.Copy(stemExclusionTable);
     excltable = CharArraySet.UnmodifiableSet(exclusionCopy);

     this.matchVersion = matchVersion;
     SetOverridesTokenStreamMethod<DutchAnalyzer>();
 }
Example #6
0
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
 /// stemming. The exclusion set is copied and stored as an unmodifiable set; the stem
 /// table is taken from <c>DefaultsHolder.DEFAULT_TABLE</c>.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
 public PolishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     this.stemTable = DefaultsHolder.DEFAULT_TABLE;

     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #7
0
 /// <summary>
 /// Builds an analyzer with the given stop words and stemming exclusion set; both are
 /// copied and stored as unmodifiable sets.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionSet">a stemming exclusion set</param>
 /// <param name="normalizeDin2">Specifies if the DIN-2007-2 style stemmer should be used in addition to DIN1.  This
 /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
 /// respectively, before the DIN1 stemmer is invoked.</param>
 public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool normalizeDin2)
 {
     var stopCopy = CharArraySet.Copy(stopwords);
     stopSet = CharArraySet.UnmodifiableSet(stopCopy);

     var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
     exclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);

     this.matchVersion = matchVersion;
     _normalizeDin2 = normalizeDin2;
     SetOverridesTokenStreamMethod<GermanAnalyzer>();
 }
Example #8
0
 // Static initializer: fills a CharArraySet from the EnglishStopWords array and
 // publishes it through EnglishStopWordsSet wrapped as an unmodifiable set.
 static StopWordList()
 {
     var wordSet = new CharArraySet(EnglishStopWords.Length, false);
     var wordList = new System.Collections.ArrayList(EnglishStopWords);
     wordSet.AddAll(wordList);

     EnglishStopWordsSet = CharArraySet.UnmodifiableSet(wordSet);
 }
Example #9
0
            /// <summary>
            /// Initializes the default stop words from a <c>PersianAnalyzer</c>'s <c>StopwordSet</c>
            /// and the default articles from the <c>Stopwords</c> member, wrapped as an unmodifiable set.
            /// </summary>
            public CustomAnalyzer()
            {
                var persianAnalyzer = new PersianAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48);
                _defaultStopwords = persianAnalyzer.StopwordSet;

                var articleSet = new CharArraySet(
                    Lucene.Net.Util.LuceneVersion.LUCENE_48,
                    Arrays.AsList(Stopwords),
                    true);
                _defaultArticles = CharArraySet.UnmodifiableSet(articleSet);
            }
            /// <summary>
            /// Loads the word set from DEFAULT_STOPWORD_FILE (resolved relative to
            /// <c>SmartChineseAnalyzer</c> and decoded as UTF-8) and returns it as an unmodifiable set.
            /// </summary>
            internal static CharArraySet LoadDefaultStopWordSet()
            {
                var stopWordReader = IOUtils.GetDecodingReader(
                    typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);

#pragma warning disable 612, 618
                var loadedWords = WordlistLoader.GetWordSet(
                    stopWordReader, STOPWORD_FILE_COMMENT, LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618

                // make sure it is unmodifiable as we expose it in the outer class
                return CharArraySet.UnmodifiableSet(loadedWords);
            }
Example #11
0
        private static CharArraySet LoadEnglishStopWordsSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            // Default English stop words; returned wrapped as an unmodifiable set.
            IList<string> englishWords = new[]
            {
                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
                "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
                "that", "the", "their", "then", "there", "these", "they", "this",
                "to", "was", "will", "with"
            };

#pragma warning disable 612, 618
            var englishSet = new CharArraySet(LuceneVersion.LUCENE_CURRENT, englishWords, false);
#pragma warning restore 612, 618

            return CharArraySet.UnmodifiableSet(englishSet);
        }
Example #12
0
            /// <summary>
            /// Reads the default stop word list from the manifest resource named
            /// "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE in the assembly that contains
            /// <c>PersianAnalyzer</c>, decoding it as UTF-8.
            /// </summary>
            /// <returns>The loaded words in a <c>CharArraySet</c>, wrapped as an unmodifiable set.</returns>
            static ISet <String> LoadDefaultStopWordSet()
            {
                // `using` tolerates a null stream (missing resource), so the StreamReader constructor's
                // own exception surfaces instead of a NullReferenceException from a cleanup call.
                using (var stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream("Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE))
                using (var reader = new StreamReader(stream, System.Text.Encoding.UTF8))
                {
                    // make sure it is unmodifiable as we expose it in the outer class
                    return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
                }
            }
Example #13
0
        /// <summary>
        /// Resolves the stop word set for the current search context.
        /// The portal id and culture code come from the <c>SearchDocument</c> stored in the
        /// thread's named data slot (<c>Constants.TlsSearchInfo</c>); without one, portal 0 and the
        /// current thread culture are used. Configured stop words for that portal/culture, if any,
        /// replace the default English set.
        /// </summary>
        /// <returns>
        /// An unmodifiable set of the configured stop words (lower-cased, split on ','), or
        /// <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c> when none are configured.
        /// </returns>
        private static ISet<string> GetStopWords()
        {
            var contextSlot = Thread.GetNamedDataSlot(Constants.TlsSearchInfo);
            var contextDoc  = Thread.GetData(contextSlot) as SearchDocument;

            int portalId;
            string cultureCode;

            if (contextDoc != null)
            {
                portalId    = contextDoc.PortalId;
                cultureCode = contextDoc.CultureCode;

                if (string.IsNullOrEmpty(cultureCode))
                {
                    // Fall back to the portal's default language when the document has no culture.
                    var portal = PortalController.Instance.GetPortal(portalId);
                    if (portal != null)
                    {
                        cultureCode = portal.DefaultLanguage;
                    }
                }
            }
            else
            {
                portalId    = 0; // default
                cultureCode = Thread.CurrentThread.CurrentCulture.Name;
            }

            var defaultStops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
            var configured   = SearchHelper.Instance.GetSearchStopWords(portalId, cultureCode);

            if (configured == null || string.IsNullOrEmpty(configured.StopWords))
            {
                return defaultStops;
            }

            //TODO Use cache from InternalSearchController
            var culture = new CultureInfo(cultureCode ?? "en-US");
            var loweredWords = configured.StopWords
                .Split(',')
                .Select(word => word.ToLower(culture))
                .ToArray();

            var customSet = new CharArraySet(loweredWords.Length, false);
            customSet.AddAll(loweredWords);

            return CharArraySet.UnmodifiableSet(customSet);
        }
Example #14
0
        // Static initializer: loads one stop word per line from "Stopword.txt" in
        // LuceneNetConfig.LuceneDictDirectory (when the file exists) and publishes the result
        // through _StopWordList wrapped as an unmodifiable set.
        static StopWord()
        {
            var loadedWords  = new CharArraySet(0, true);
            var stopWordPath = Path.Combine(LuceneNetConfig.LuceneDictDirectory, "Stopword.txt");

            if (File.Exists(stopWordPath))
            {
                Encoding fileEncoding = EncodingType.GetType(stopWordPath);
                using (var lineReader = new StreamReader(stopWordPath, fileEncoding))
                {
                    string currentLine;
                    while ((currentLine = lineReader.ReadLine()) != null)
                    {
                        loadedWords.Add(currentLine);
                    }
                }
            }

            // English stop words are not added here: the StandardAnalyzer we use already applies
            // them, so there is no need to add them again.
            //charArraySet.AddAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            _StopWordList = CharArraySet.UnmodifiableSet(loadedWords);
        }
Example #15
0
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
 /// stemming. The exclusion set is copied and stored as an unmodifiable set.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
 public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #16
0
 /// <summary>
 /// Builds an analyzer with the given stop words and stemming exclusion words.
 /// The exclusion set is copied and stored as an unmodifiable set.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionSet">a set of terms not to be stemmed</param>
 public BrazilianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : this(matchVersion, stopwords)
 {
     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
     excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #17
0
 /// <summary>
 /// Builds the named analyzer with the given stop words, which are copied and stored
 /// as an unmodifiable set.
 /// </summary>
 public SnowballAnalyzer(LuceneVersion matchVersion, string name, CharArraySet stopWords)
     : this(matchVersion, name)
 {
     CharArraySet stopWordsCopy = CharArraySet.Copy(matchVersion, stopWords);
     stopSet = CharArraySet.UnmodifiableSet(stopWordsCopy);
 }
Example #18
0
 /*
  * Builds an analyzer with the given stop words and stemming exclusion set.
  * Both sets are copied and stored as unmodifiable sets.
  *
  * @param matchVersion
  *          lucene compatibility version
  * @param stopwords
  *          a stopword set
  * @param stemExclutionSet
  *          a stemming exclusion set
  */
 public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
 {
     this.matchVersion = matchVersion;

     var stopCopy = CharArraySet.Copy(stopwords);
     this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);

     var exclusionCopy = CharArraySet.Copy(stemExclutionSet);
     this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #19
0
 /// <summary>
 /// Builds an analyzer with the given stop words and stemming exclusion set.
 /// The exclusion set is copied and stored as an unmodifiable set.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclutionSet">a stemming exclusion set</param>
 public FrenchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclutionSet)
     : base(matchVersion, stopwords)
 {
     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclutionSet);
     this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #20
0
 /// <summary>
 /// Builds an analyzer that keeps unmodifiable copies of the given stop words and
 /// stem exclusion words.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionTable">a stem exclusion set</param>
 public DanishAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
 {
     var stopCopy = CharArraySet.Copy(stopwords);
     StopTable = CharArraySet.UnmodifiableSet(stopCopy);

     var exclusionCopy = CharArraySet.Copy(stemExclusionTable);
     ExclusionTable = CharArraySet.UnmodifiableSet(exclusionCopy);

     MatchVersion = matchVersion;
 }
Example #21
0
 /// <summary>
 /// Builds an analyzer that keeps an unmodifiable copy of the given stop words.
 /// </summary>
 /// <param name="matchVersion">lucene compatibility version</param>
 /// <param name="stopWords">a stopword set</param>
 public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
 {
     var stopWordsCopy = CharArraySet.Copy(stopWords);
     stopTable = CharArraySet.UnmodifiableSet(stopWordsCopy);

     this.matchVersion = matchVersion;
 }
Example #22
0
        /*
         * Builds an analyzer with the given stop words and stemming exclusion words.
         * The stop words are handled by the two-argument constructor; the exclusion
         * set is copied and stored as an unmodifiable set.
         *
         * @param matchVersion
         *          lucene compatibility version
         * @param stopwords
         *          a stopword set
         * @param stemExclusionSet
         *          a set of words excluded from stemming
         */

        public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, ISet<string> stopwords,
                                       ISet<string> stemExclusionSet)
            : this(matchVersion, stopwords)
        {
            var exclusionCopy = CharArraySet.Copy(stemExclusionSet);
            this.excltable = CharArraySet.UnmodifiableSet(exclusionCopy);
        }
Example #23
0
        // Static initializer: defines the PORTUGUESE stop word array and publishes the same
        // words through PORTUGUESE_SET wrapped as an unmodifiable set.
        static Stopwords()
        {
            // NOTE(review): the list contains "nas" and "pelas" twice and the spelling "propios";
            // these are kept exactly as-is to preserve behavior — confirm whether "pela" and
            // "proprios" were intended.
            PORTUGUESE = new[]
            {
                "a", "ainda", "alem", "ambas", "ambos", "antes", "ao", "aonde",
                "aos", "apos", "aquele", "aqueles", "as", "assim", "com", "como",
                "contra", "contudo", "cuja", "cujas", "cujo", "cujos", "da", "das",
                "de", "dela", "dele", "deles", "demais", "depois", "desde", "desta",
                "deste", "dispoe", "dispoem", "diversa", "diversas", "diversos", "do", "dos",
                "durante", "e", "ela", "elas", "ele", "eles", "em", "entao",
                "entre", "essa", "essas", "esse", "esses", "esta", "estas", "este",
                "estes", "ha", "isso", "isto", "logo", "mais", "mas", "mediante",
                "menos", "mesma", "mesmas", "mesmo", "mesmos", "na", "nas", "nao",
                "nas", "nem", "nesse", "neste", "nos", "o", "os", "ou",
                "outra", "outras", "outro", "outros", "pelas", "pelas", "pelo", "pelos",
                "perante", "pois", "por", "porque", "portanto", "proprio", "propios", "quais",
                "qual", "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
                "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
                "suas", "tal", "tambem", "teu", "teus", "toda", "todas", "todo",
                "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
            };

            var portugueseSet = new CharArraySet(PORTUGUESE.Length, false);
            var portugueseList = new System.Collections.ArrayList(PORTUGUESE);
            portugueseSet.AddAll(portugueseList);

            PORTUGUESE_SET = CharArraySet.UnmodifiableSet(portugueseSet);
        }
Example #24
0
        /*
         * Builds an analyzer with the given stop words, which are copied and stored
         * as an unmodifiable set.
         *
         * @param matchVersion
         *          lucene compatibility version
         * @param stopwords
         *          a stopword set
         */

        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
        {
            var stopCopy = CharArraySet.Copy(stopwords);
            stoptable = CharArraySet.UnmodifiableSet(stopCopy);

            this.matchVersion = matchVersion;
        }
Example #25
0
        /*
         * Builds an analyzer with the given stop words, which are copied and stored
         * as an unmodifiable set.
         *
         * @param matchVersion
         *          lucene compatibility version
         * @param stopwords
         *          a stopword set
         */

        public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, ISet<string> stopwords)
        {
            var stopCopy = CharArraySet.Copy(stopwords);
            this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);

            this.matchVersion = matchVersion;
        }
Example #26
0
 /// <summary>
 /// Creates a new instance initialized with the given stopword set. A null set is
 /// replaced by <c>CharArraySet.EMPTY_SET</c>; otherwise an unmodifiable copy is stored.
 /// </summary>
 /// <param name="version">the Lucene version for cross version compatibility</param>
 /// <param name="stopwords">the analyzer's stopword set</param>
 protected StopwordAnalyzerBase(LuceneVersion version, CharArraySet stopwords)
 {
     m_matchVersion = version;

     // analyzers should use char array set for stopwords!
     if (stopwords is null)
     {
         this.m_stopwords = CharArraySet.EMPTY_SET;
     }
     else
     {
         this.m_stopwords = CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
     }
 }
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
 /// stemming. The exclusion set is copied and stored as an unmodifiable set.
 /// </summary>
 /// <param name="matchVersion"><see cref="LuceneVersion"/> to match.</param>
 /// <param name="stopwords">A stopword set.</param>
 /// <param name="stemExclusionSet">A set of terms not to be stemmed.</param>
 public UkrainianMorfologikAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionSet);
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #28
0
 /*
  * Builds an analyzer with the given stop words, which are copied and stored
  * as an unmodifiable set.
  *
  * @param matchVersion
  *          lucene compatibility version
  * @param stopwords
  *          a stopword set
  */
 public CzechAnalyzer(Version matchVersion, ISet<string> stopwords)
 {
     this.matchVersion = matchVersion;

     var stopCopy = CharArraySet.Copy(stopwords);
     this.stoptable = CharArraySet.UnmodifiableSet(stopCopy);
 }
Example #29
0
 /// <summary>
 /// Builds an analyzer with the given stop words and a set of words to be
 /// excluded from the <see cref="CzechStemFilter"/>. The exclusion set is copied
 /// and stored as an unmodifiable set.
 /// </summary>
 /// <param name="matchVersion"><see cref="LuceneVersion"/> to match</param>
 /// <param name="stopwords">a stopword set</param>
 /// <param name="stemExclusionTable">a stemming exclusion set</param>
 public CzechAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
     : base(matchVersion, stopwords)
 {
     CharArraySet exclusionCopy = CharArraySet.Copy(matchVersion, stemExclusionTable);
     this.stemExclusionTable = CharArraySet.UnmodifiableSet(exclusionCopy);
 }
Example #30
0
 /*
  * Builds an analyzer with the given stop words, which are copied and stored
  * as an unmodifiable set.
  *
  * @param matchVersion
  *          lucene compatibility version
  * @param stopwords
  *          a stopword set
  */
 public GreekAnalyzer(Version matchVersion, ISet<string> stopwords)
 {
     var stopCopy = CharArraySet.Copy(stopwords);
     stopSet = CharArraySet.UnmodifiableSet(stopCopy);

     this.matchVersion = matchVersion;
 }