/// <summary>
/// Tutorial on using the stop-word dictionary.
/// </summary>
public void CoreStopWordDictionaryDemo()
{
    // NOTE(review): these two locals deliberately keep the same names as their types.
    // segment(...) is invoked through them below; if segment is a static member this
    // presumably only resolves while the names coincide — confirm before renaming.
    var BasicTokenizer = new com.hankcs.hanlp.tokenizer.BasicTokenizer();
    var NotionalTokenizer = new com.hankcs.hanlp.tokenizer.NotionalTokenizer();

    var sentence = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";

    // The stop-word dictionary can be modified at runtime:
    // segment once with the extra stop word added, then again after removing it.
    CoreStopWordDictionary.add("居民");
    print(NotionalTokenizer.segment(sentence));
    CoreStopWordDictionary.remove("居民");
    print(NotionalTokenizer.segment(sentence));

    // Filtering can be applied to the result of any tokenizer:
    // print the raw segmentation, apply the dictionary to the same list, print again.
    var terms = BasicTokenizer.segment(sentence);
    print(terms);
    CoreStopWordDictionary.apply(terms);
    print(terms);

    // The filtering logic itself can also be replaced with a custom filter.
    CoreStopWordDictionary.FILTER = new MyStopWordFilter();

    // Per the original demo: "的" is listed in stopwords.txt and is therefore filtered
    // out, while the digits are kept.
    print(NotionalTokenizer.segment("数字123的保留"));
}
/// <summary>
/// Custom stop-word filter: decides whether a term is kept in the filtered output.
/// </summary>
/// <param name="term">The term to test.</param>
/// <returns>
/// <c>true</c> when the term's nature starts with 'm' (numerals), or when its word is
/// not contained in <c>CoreStopWordDictionary</c>; <c>false</c> otherwise.
/// </returns>
bool Filter.shouldInclude(Term term)
{
    // Numerals are always kept. The null check lets a term that carries no nature fall
    // through to the stop-word lookup instead of throwing a NullReferenceException.
    if (term.nature != null && term.nature.startsWith('m'))
    {
        return true;
    }

    // Everything else is kept only when it is not a stop word.
    return !CoreStopWordDictionary.contains(term.word);
}
/// <summary>
/// Converts a list of sentences into a document: one word list per sentence, in the
/// same order as the input, containing the words of the terms accepted by
/// <c>CoreStopWordDictionary.shouldInclude</c>.
/// </summary>
/// <param name="sentenceList">The sentences to tokenize.</param>
/// <returns>A list holding one filtered word list for each input sentence.</returns>
private static List<List<String>> convertSentenceListToDocument(List<String> sentenceList)
{
    return sentenceList.ConvertAll<List<String>>(sentence =>
    {
        // Tokenize the sentence, drop the terms rejected by the stop-word dictionary,
        // then project the surviving terms onto their surface words.
        List<Term> terms = StandardTokenizer.segment(sentence.ToCharArray());
        return terms
            .FindAll(term => CoreStopWordDictionary.shouldInclude(term))
            .ConvertAll<String>(term => term.word);
    });
}