示例#1
0
        /// <summary>
        /// Demonstrates the stop-word dictionary: adding and removing entries at
        /// runtime, applying it to the output of a tokenizer, and installing a
        /// custom filter.
        /// </summary>
        public void CoreStopWordDictionaryDemo()
        {
            // NOTE(review): these two locals intentionally share their names with
            // their types. If segment() is a static member (TODO confirm against the
            // HanLP API), renaming the locals would turn the calls below into
            // instance-qualified static calls, which C# rejects.
            var BasicTokenizer    = new com.hankcs.hanlp.tokenizer.BasicTokenizer();
            var NotionalTokenizer = new com.hankcs.hanlp.tokenizer.NotionalTokenizer();

            string sentence = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";

            // The stop-word dictionary can be modified dynamically: segment once
            // with the extra entry present, then once more after removing it.
            CoreStopWordDictionary.add("居民");
            print(NotionalTokenizer.segment(sentence));
            CoreStopWordDictionary.remove("居民");
            print(NotionalTokenizer.segment(sentence));

            // Filtering can be applied to the result of any tokenizer. The list is
            // printed before and after apply() so the two outputs can be compared.
            var segmented = BasicTokenizer.segment(sentence);

            print(segmented);
            CoreStopWordDictionary.apply(segmented);
            print(segmented);

            // The filtering logic itself can also be replaced with a custom one.
            CoreStopWordDictionary.FILTER = new MyStopWordFilter();
            // Per the original author: "的" is listed in stopwords.txt and is
            // therefore filtered out, while the number is kept.
            print(NotionalTokenizer.segment("数字123的保留"));
        }
示例#2
0
 /// <summary>
 /// Decides whether a term should be kept.
 /// </summary>
 /// <param name="term">The term to test; its nature and word are read.</param>
 /// <returns>
 /// true when the term's nature starts with 'm' (numerals, per the original
 /// author's note), otherwise true only when the term's word is not contained
 /// in CoreStopWordDictionary.
 /// </returns>
 bool Filter.shouldInclude(Term term)
 {
     // Short-circuit: the dictionary lookup is skipped when the nature check
     // already decides that the term is kept.
     return(term.nature.startsWith('m') || !CoreStopWordDictionary.contains(term.word));
 }
示例#3
0
        /// <summary>
        /// Converts a list of sentences into a document made of one word list
        /// per sentence.
        /// </summary>
        /// <param name="sentenceList">The sentences to segment.</param>
        /// <returns>
        /// One list per input sentence, in input order, holding the word of every
        /// term for which CoreStopWordDictionary.shouldInclude returned true.
        /// </returns>
        private static List <List <String> > convertSentenceListToDocument(List <String> sentenceList)
        {
            List <List <String> > document = new List <List <String> >(sentenceList.Count);

            foreach (String line in sentenceList)
            {
                List <Term>   tokens    = StandardTokenizer.segment(line.ToCharArray());
                List <String> keptWords = new List <String>();

                foreach (Term token in tokens)
                {
                    // Skip terms rejected by the stop-word dictionary.
                    if (!CoreStopWordDictionary.shouldInclude(token))
                    {
                        continue;
                    }
                    keptWords.Add(token.word);
                }

                // A sentence whose terms are all rejected still contributes an
                // (empty) word list.
                document.Add(keptWords);
            }

            return(document);
        }