コード例 #1
0
        /**
         * Creates a CharArraySet holding the supplied common words, suitable for
         * handing to a CommonGramsFilter constructor.
         *
         * @param commonWords words to place in the set; the array length is passed as
         *                    the first CharArraySet constructor argument (presumably
         *                    the initial capacity - confirm against CharArraySet)
         * @param ignoreCase forwarded unchanged to the CharArraySet constructor, which
         *                   decides how case is treated; false requests a
         *                   case-sensitive set
         * @return a new CharArraySet to which every entry of commonWords was added
         */
        public static CharArraySet makeCommonSet(string[] commonWords, bool ignoreCase)
        {
            // Create the set first (sized from the array), then bulk-load the words.
            CharArraySet result = new CharArraySet(commonWords.Length, ignoreCase);
            var wordList = Arrays.asList(commonWords);
            result.addAll(wordList);

            return result;
        }
コード例 #2
0
 /**
  * Creates a filter over the given token stream that uses a Set of common
  * words to create bigrams.
  * <p/>
  * When <code>commonWords</code> is already a {@link CharArraySet} (which is
  * the case if it came from <code>makeCommonSet()</code>) that instance is
  * stored directly and <code>ignoreCase</code> is not consulted, because the
  * <code>CharArraySet</code> itself controls case sensitivity.
  * <p/>
  * For any other kind of Set, a new {@link CharArraySet} is created with
  * <code>ignoreCase</code> as its case-sensitivity flag and all entries of
  * <code>commonWords</code> are added to it.
  *
  * @param input TokenStream input in the filter chain.
  * @param commonWords The set of common words.
  * @param ignoreCase Whether case is ignored for common words; only used when
  *                   a new CharArraySet has to be built.
  */
 public CommonGramsFilter(TokenStream input, Set commonWords, bool ignoreCase) : base(input)
 {
     if (!(commonWords is CharArraySet))
     {
         // Not a CharArraySet: build one of matching size and copy the words over.
         int wordCount = commonWords.size();
         this.commonWords = new CharArraySet(wordCount, ignoreCase);
         this.commonWords.addAll(commonWords);
     }
     else
     {
         // Already the right type: use the caller's set as-is.
         this.commonWords = (CharArraySet)commonWords;
     }

     init();
 }
コード例 #3
0
        /**
         * Populates <code>commonWords</code> from this object's configuration.
         * <p/>
         * Reads the "words" argument and the "ignoreCase" flag (false is passed to
         * getBoolean as the default). If "words" is present it is split into file
         * names, each file is read through the loader, and its lines are added to
         * <code>commonWords</code>. If "words" is absent, <code>commonWords</code> is
         * replaced by a set built from StopAnalyzer.ENGLISH_STOP_WORDS.
         *
         * @param loader used to read the lines of each listed word file
         * @throws System.ApplicationException wrapping any IOException raised while
         *         the word files are being processed
         */
        public void inform(ResourceLoader loader)
        {
            string commonWordFiles = (string)args.get("words");

            ignoreCase = getBoolean("ignoreCase", false);

            if (commonWordFiles == null)
            {
                // No word files configured: fall back to the English stop words.
                // Warning 612 (use of an obsolete member) is silenced around the call.
#pragma warning disable 612
                commonWords = CommonGramsFilter.makeCommonSet(
                    StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
#pragma warning restore 612
                return;
            }

            try
            {
                List /*<String>*/ fileNames = StrUtils.splitFileNames(commonWordFiles);
                if (commonWords == null && fileNames.size() > 0)
                {
                    // default stopwords list has 35 or so words, but maybe don't make it
                    // that big to start
                    commonWords = new CharArraySet(fileNames.size() * 10, ignoreCase);
                }

                var cursor = fileNames.iterator();
                while (cursor.hasNext())
                {
                    string            fileName = (string)cursor.next();
                    List /*<String>*/ lines    = loader.getLines(fileName.Trim());
                    // TODO: once StopFilter.makeStopSet(List) method is available, switch
                    // to using that so we can avoid a toArray() call
                    string[] words = (string[])lines.toArray(new string[0]);
                    commonWords.addAll(CommonGramsFilter.makeCommonSet(words, ignoreCase));
                }
            }
            catch (IOException e)
            {
                throw new System.ApplicationException("Unexpected exception", e);
            }
        }
コード例 #4
0
 /**
  * Creates a filter over the given token stream that uses an array of common
  * words to create bigrams. The array is converted with
  * <code>makeCommonSet()</code>, so matching is case-sensitive when
  * <code>ignoreCase</code> is false.
  *
  * @param input TokenStream input in the filter chain
  * @param commonWords words to be used in constructing bigrams
  * @param ignoreCase Whether case is ignored when constructing bigrams for
  *                   common words.
  */
 public CommonGramsFilter(TokenStream input, string[] commonWords, bool ignoreCase) : base(input)
 {
     // Turn the array into the set form kept in the commonWords field.
     var wordSet = makeCommonSet(commonWords, ignoreCase);
     this.commonWords = wordSet;

     init();
 }