/// <summary>
/// Tokenizes <paramref name="content"/> with a StandardTokenizer whose output is passed
/// through StandardFilter, LowerCaseFilter and a StopFilter built from
/// <c>config.StopWords</c>, and collects the surviving terms.
/// </summary>
/// <param name="content">Text to split into terms.</param>
/// <param name="config">Supplies the stop-word list through its <c>StopWords</c> member.</param>
/// <returns>The terms produced by the filter chain, in the order the stream emitted them.</returns>
static List<string> TokenizeStandard(string content, TokenizeConfig config)
{
    using (StringReader reader = new StringReader(content))
    {
        TokenStream stream = null;
        try
        {
            stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
            var stopSet = StopFilter.MakeStopSet(config.StopWords);

            // Each filter wraps the previous stage, so 'stream' always refers to the
            // outermost stage; closing it in the finally block releases the whole chain.
            stream = new StandardFilter(stream);
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(true, stream, stopSet, true);

            stream.Reset();
            TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

            List<string> words = new List<string>();
            while (stream.IncrementToken())
            {
                words.Add(termAttr.Term());
            }

            // Complete the consumer workflow once the stream is exhausted.
            stream.End();
            return words;
        }
        finally
        {
            // Release the analysis chain even when tokenization throws part-way through.
            if (stream != null)
            {
                stream.Close();
            }
        }
    }
}
/// <summary>
/// Consumes <paramref name="stpf"/> and asserts that it yields exactly the English words
/// for 0, 3, 6, ..., 18 (as produced by <c>English.IntToEnglish</c>, trimmed), with the
/// position increment expected for the requested mode.
/// </summary>
/// <param name="stpf">Filter under test; it is reset, drained, ended and disposed here.</param>
/// <param name="enableIcrements">
/// Value assigned to <c>EnablePositionIncrements</c>. When true, every token after the
/// first is expected to report an increment of 3; otherwise every token is expected to report 1.
/// </param>
private void DoTestStopPositons(StopFilter stpf, bool enableIcrements)
{
    string modeLabel = enableIcrements ? "enabled" : "disabled";
    log("---> test with enable-increments-" + modeLabel);
    stpf.EnablePositionIncrements = enableIcrements;

    ICharTermAttribute termAttribute = stpf.GetAttribute<ICharTermAttribute>();
    IPositionIncrementAttribute incrementAttribute = stpf.GetAttribute<IPositionIncrementAttribute>();
    stpf.Reset();

    int number = 0;
    while (number < 20)
    {
        assertTrue(stpf.IncrementToken());
        log("Token " + number + ": " + stpf);

        string expectedTerm = English.IntToEnglish(number).Trim();
        assertEquals("expecting token " + number + " to be " + expectedTerm, expectedTerm, termAttribute.ToString());

        // Only tokens after the first are expected to report a gap, and only when
        // position increments are enabled.
        int expectedIncrement;
        if (enableIcrements && number != 0)
        {
            expectedIncrement = 3;
        }
        else
        {
            expectedIncrement = 1;
        }
        assertEquals("all but first token must have position increment of 3", expectedIncrement, incrementAttribute.PositionIncrement);

        number += 3;
    }

    // The stream must be exhausted after the last expected token.
    assertFalse(stpf.IncrementToken());
    stpf.End();
    stpf.Dispose();
}