Пример #1
0
        /// <summary>
        /// Splits <paramref name="content"/> into terms by running it through a
        /// <c>StandardTokenizer</c> followed by <c>StandardFilter</c>,
        /// <c>LowerCaseFilter</c> and a <c>StopFilter</c> built from
        /// <c>config.StopWords</c>.
        /// </summary>
        /// <param name="content">Text to tokenize.</param>
        /// <param name="config">Configuration supplying the stop-word list.</param>
        /// <returns>The terms produced by the filter chain, in the order they were emitted.</returns>
        static List <string> TokenizeStandard(string content, TokenizeConfig config)
        {
            StringReader reader = new StringReader(content);
            TokenStream  result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);

            try
            {
                var stophash = StopFilter.MakeStopSet(config.StopWords);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new StopFilter(true, result, stophash, true);

                // Consume the stream following the TokenStream workflow:
                // Reset -> IncrementToken* -> End -> Close.
                result.Reset();
                TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
                List <string> words    = new List <string>();

                while (result.IncrementToken())
                {
                    words.Add(termattr.Term());
                }

                result.End();
                return words;
            }
            finally
            {
                // 'result' always refers to the outermost stream built so far; closing it
                // releases the wrapped tokenizer and its reader, even when tokenizing throws.
                result.Close();
            }
        }
Пример #2
0
 /// <summary>
 /// Drives <paramref name="stpf"/> to exhaustion and checks that it yields exactly the
 /// English words for 0, 3, 6, ..., 18, with the position increment expected for the
 /// given <paramref name="enableIcrements"/> setting. The filter is ended and disposed
 /// once all checks have passed.
 /// </summary>
 /// <param name="stpf">Stop filter under test; its EnablePositionIncrements is overwritten.</param>
 /// <param name="enableIcrements">Value assigned to the filter's EnablePositionIncrements.</param>
 private void DoTestStopPositons(StopFilter stpf, bool enableIcrements)
 {
     string mode = enableIcrements ? "enabled" : "disabled";
     log("---> test with enable-increments-" + mode);

     stpf.EnablePositionIncrements = enableIcrements;
     ICharTermAttribute term = stpf.GetAttribute<ICharTermAttribute>();
     IPositionIncrementAttribute positionIncrement = stpf.GetAttribute<IPositionIncrementAttribute>();
     stpf.Reset();

     int number = 0;
     while (number < 20)
     {
         assertTrue(stpf.IncrementToken());
         log("Token " + number + ": " + stpf);

         string expectedWord = English.IntToEnglish(number).Trim();
         assertEquals("expecting token " + number + " to be " + expectedWord, expectedWord, term.ToString());

         // Only with increments enabled, and only after the first token, is the gap 3;
         // in every other case the increment is 1.
         int expectedIncrement = (enableIcrements && number != 0) ? 3 : 1;
         assertEquals("all but first token must have position increment of 3", expectedIncrement, positionIncrement.PositionIncrement);

         number += 3;
     }

     // The stream must be exhausted after the last expected token.
     assertFalse(stpf.IncrementToken());
     stpf.End();
     stpf.Dispose();
 }