/// <summary>
/// Creates a tokenizer over <paramref name="input"/> that operates in the given
/// <paramref name="mode"/>.
/// </summary>
/// <param name="input">Reader handed to the base tokenizer as its text source.</param>
/// <param name="mode">Tokenizer mode stored for later use by this instance.</param>
public JiebaTokenizer(TextReader input, TokenizerMode mode)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
{
    // Each tokenizer gets its own segmenter instance.
    this._segment = new JiebaSegment();
    this._mode = mode;

    // Fields are assigned before the helpers run, in case they read them.
    this.LoadStopWords();
    this.Init();
}
/// <summary>
/// Creates an extractor with <c>Span</c> set to 5, a fresh <c>JiebaSegment</c>,
/// a <c>PosSegmenter</c> built on that segmenter, and stop words taken from
/// <c>ConfigManager.StopWordsFile</c>.
/// </summary>
public TextRankExtractor()
{
    this.Span = 5;

    this.Segment = new JiebaSegment();
    this.PosSegmenter = new PosSegmenter(this.Segment);

    this.SetStopWords(ConfigManager.StopWordsFile);

    // Fall back to the built-in list when the configured file produced no entries.
    if (this.StopWords.IsEmpty())
    {
        this.StopWords.UnionWith(DefaultStopWords);
    }
}
/// <summary>
/// Checks that the items returned by <c>Cut2</c> are contiguous: the first item
/// starts at position 0, and every later item starts exactly where the previous
/// one ends (previous position plus the length of the previous value).
/// </summary>
public void TestNewCut()
{
    var segmenter = new JiebaSegment();

    // Materialize once so the sequence is not re-enumerated by every indexed
    // access (ElementAt/Count() on a lazy sequence is quadratic and would
    // re-run the segmentation each time).
    var wordInfos = segmenter.Cut2("推荐系统终于发布了最终的版本,点击率蹭蹭上涨").ToList();

    // Assert.Equal takes (expected, actual); keeping that order makes failure
    // messages report the right side as "Expected".
    Assert.Equal(0, wordInfos[0].position);

    for (int i = 1; i < wordInfos.Count; i++)
    {
        var previous = wordInfos[i - 1];
        Assert.Equal(previous.position + previous.value.Length, wordInfos[i].position);
    }
}
/// <summary>
/// Exercises <c>Cut</c> (full mode, default arguments, and with both optional
/// flags disabled) and <c>CutForSearch</c> on fixed sentences, comparing each
/// result against the expected word list through <c>Compared</c>.
/// </summary>
public void TestCut()
{
    var segmenter = new JiebaSegment();

    // Full mode (cutAll: true).
    var actual = segmenter.Cut("我来到北京清华大学", cutAll: true);
    Compared(actual, new List<string> { "我", "来到", "北京", "清华", "清华大学", "华大", "大学" });

    // Default arguments.
    actual = segmenter.Cut("我来到北京清华大学");
    Compared(actual, new List<string> { "我", "来到", "北京", "清华大学" });

    // Default is accurate mode, which also uses the HMM model.
    actual = segmenter.Cut("他来到了网易杭研大厦");
    Compared(actual, new List<string> { "他", "来到", "了", "网易", "杭研", "大厦" });

    // Search-engine mode.
    actual = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
    Compared(actual, new List<string>
    {
        "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院",
        "计算", "计算所", ",", "后", "在", "日本", "京都", "大学", "日本京都大学", "深造"
    });

    actual = segmenter.Cut("结过婚的和尚未结过婚的");
    Compared(actual, new List<string> { "结过婚", "的", "和", "尚未", "结过婚", "的" });

    // Both optional flags explicitly disabled.
    actual = segmenter.Cut("快奔三", false, false);
    Compared(actual, new List<string> { "快", "奔三" });
}
/// <summary>
/// Creates an extractor that uses <paramref name="segment"/> when one is
/// supplied, and a newly created <c>JiebaSegment</c> otherwise. It also sets up
/// the <c>PosSegmenter</c>, the stop words and the IDF data loaded from
/// <c>DefaultIdfFile</c>.
/// </summary>
/// <param name="segment">Segmenter to reuse; when <c>IsNull()</c> reports true a new one is created.</param>
public TfidfExtractor(JiebaSegment segment = null)
{
    this.Segment = segment.IsNull() ? new JiebaSegment() : segment;
    this.PosSegmenter = new PosSegmenter(this.Segment);

    this.SetStopWords(ConfigManager.StopWordsFile);

    // Fall back to the built-in list when the configured file produced no entries.
    if (this.StopWords.IsEmpty())
    {
        this.StopWords.UnionWith(DefaultStopWords);
    }

    // Copy the loader's values onto this instance.
    this.Loader = new IdfLoader(DefaultIdfFile);
    this.IdfFreq = this.Loader.IdfFreq;
    this.MedianIdf = this.Loader.MedianIdf;
}
/// <summary>
/// Creates a POS segmenter that uses the supplied <paramref name="segment"/>.
/// </summary>
/// <param name="segment">Segmenter stored as-is; no validation is performed here.</param>
public PosSegmenter(JiebaSegment segment)
{
    this._segment = segment;
}
/// <summary>
/// Creates a POS segmenter backed by a newly created <c>JiebaSegment</c>.
/// </summary>
public PosSegmenter()
    : this(new JiebaSegment())
{
    // The single-argument constructor stores the segmenter.
}