/**
 * Creates a CharArraySet holding the supplied common words, suitable for
 * handing to the CommonGramsFilter constructor.
 *
 * @param commonWords the words to place in the set
 * @param ignoreCase forwarded to the CharArraySet constructor; when true the
 *        words are lower cased first, when false the set is case-sensitive
 * @return a Set containing the words
 */
public static CharArraySet makeCommonSet(string[] commonWords, bool ignoreCase)
{
    // Size the set up front from the number of incoming words.
    var wordSet = new CharArraySet(commonWords.Length, ignoreCase);
    wordSet.addAll(Arrays.asList(commonWords));
    return wordSet;
}
/**
 * Creates a filter over the given input that uses a Set of common words to
 * build bigrams.
 * <p/>
 * When <code>commonWords</code> is already a {@link CharArraySet} (which is
 * the case if it came from <code>makeCommonSet()</code>), that instance is
 * used as-is and <code>ignoreCase</code> has no effect, because the
 * <code>CharArraySet</code> itself determines case sensitivity.
 * <p/>
 * For any other Set implementation, the words are copied into a newly
 * created CharArraySet whose case sensitivity is taken from
 * <code>ignoreCase</code> (case-sensitive when false).
 *
 * @param input TokenStream input in filter chain.
 * @param commonWords The set of common words.
 * @param ignoreCase Ignore case when constructing bigrams for common words;
 *        only consulted when a new CharArraySet has to be built.
 */
public CommonGramsFilter(TokenStream input, Set commonWords, bool ignoreCase)
    : base(input)
{
    CharArraySet existing = commonWords as CharArraySet;
    if (existing != null)
    {
        // Already the right type: reuse it without copying.
        this.commonWords = existing;
    }
    else
    {
        // Copy the words into a fresh set sized to the source set.
        CharArraySet copy = new CharArraySet(commonWords.size(), ignoreCase);
        copy.addAll(commonWords);
        this.commonWords = copy;
    }
    init();
}
/**
 * Populates the common-words set from the factory arguments.
 * <p/>
 * Reads the "words" argument and the "ignoreCase" flag (requested with a
 * fallback of false). When "words" is absent, the set is built from
 * StopAnalyzer.ENGLISH_STOP_WORDS. Otherwise "words" is split into file
 * names, each file's lines are fetched through the loader, and the lines
 * are added to the set.
 *
 * @param loader used to read the lines of every listed word file
 * @throws System.ApplicationException wrapping any IOException raised
 *         while the word files are being processed
 */
public void inform(ResourceLoader loader)
{
    string wordFileNames = (string)args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);

    if (wordFileNames == null)
    {
        // No word files configured: fall back to the English stop words.
        // Warning 612 (use of an obsolete member) is silenced for this call only.
#pragma warning disable 612
        commonWords = CommonGramsFilter.makeCommonSet(
            StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase);
#pragma warning restore 612
        return;
    }

    try
    {
        List /*<String>*/ fileNames = StrUtils.splitFileNames(wordFileNames);
        if (commonWords == null && fileNames.size() > 0)
        {
            // default stopwords list has 35 or so words, but maybe don't make it
            // that big to start
            commonWords = new CharArraySet(fileNames.size() * 10, ignoreCase);
        }

        var cursor = fileNames.iterator();
        while (cursor.hasNext())
        {
            string fileName = (string)cursor.next();
            List /*<String>*/ lines = loader.getLines(fileName.Trim());
            // TODO: once StopFilter.makeStopSet(List) method is available, switch
            // to using that so we can avoid a toArray() call
            string[] words = (string[])lines.toArray(new string[0]);
            commonWords.addAll(CommonGramsFilter.makeCommonSet(words, ignoreCase));
        }
    }
    catch (IOException e)
    {
        throw new System.ApplicationException("Unexpected exception", e);
    }
}
/**
 * Creates a filter over the given input that uses an array of common words
 * to build bigrams. The array is converted to a set with
 * <code>makeCommonSet()</code>; matching is case-sensitive when
 * <code>ignoreCase</code> is false.
 *
 * @param input TokenStream input in filter chain
 * @param commonWords words to be used in constructing bigrams
 * @param ignoreCase Ignore case when constructing bigrams for common words.
 */
public CommonGramsFilter(TokenStream input, string[] commonWords, bool ignoreCase)
    : base(input)
{
    CharArraySet wordSet = makeCommonSet(commonWords, ignoreCase);
    this.commonWords = wordSet;
    init();
}