/// <summary>
/// Segments <paramref name="strWords"/> using the mode currently selected in
/// <c>comboBoxCutMode</c> and returns the resulting tokens that are not in
/// <c>stopwordsList</c>.
/// </summary>
/// <param name="strWords">Text to segment.</param>
/// <returns>The tokens that survived the stop-word filter, in order.</returns>
private List<string> WordSplitResult(string strWords)
{
    var mode = comboBoxCutMode.SelectedIndex;
    IEnumerable<string> pieces;

    if (mode == 0)
    {
        // Plain segmentation.
        pieces = segmenter.Cut(strWords);
    }
    else if (mode == 1)
    {
        // Search-oriented segmentation.
        pieces = segmenter.CutForSearch(strWords);
    }
    else if (mode == 2)
    {
        // Up to 20 tags from the TF-IDF extractor, limited to Constants.NounAndVerbPos.
        var tfidf = new TfidfExtractor();
        pieces = tfidf.ExtractTags(strWords, 20, Constants.NounAndVerbPos);
    }
    else
    {
        // Any other selection: up to 20 tags from the TextRank extractor.
        var ranker = new TextRankExtractor();
        pieces = ranker.ExtractTags(strWords, 20, Constants.NounAndVerbPos);
    }

    // The pieces are joined with single spaces and split on spaces again, so a
    // piece that itself contains a space ends up as several tokens.
    string joined = string.Join(" ", pieces);
    string[] tokens = joined.Split(' ');

    var kept = new List<string>();
    foreach (string token in tokens)
    {
        if (stopwordsList.Contains(token))
        {
            continue;
        }

        kept.Add(token);
    }

    return kept;
}
/// <summary>
/// Runs <c>TextRankExtractor.ExtractTagsWithWeight</c> on a sample paragraph and
/// prints every returned word together with its weight. Makes no assertions.
/// </summary>
public void TestTextRankExtractorWithWeight()
{
    const string sampleText = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目 2013年,实现营业收入0万元,实现净利润-139.13万元。";

    var textRank = new TextRankExtractor();
    var weightedTags = textRank.ExtractTagsWithWeight(sampleText);

    // One "(word, weight)" line per extracted tag.
    foreach (var pair in weightedTags)
    {
        Console.WriteLine("({0}, {1})", pair.Word, pair.Weight);
    }
}
/// <summary>
/// Computes keyword weights and generates the JavaScript files used to render word clouds.
/// </summary>
/// <remarks>
/// For every sub-directory of <c>D:\data\news\</c> a dedicated thread reads all
/// <c>*.txt</c> files below it, runs <c>TextRankExtractor.ExtractTagsWithWeight</c>
/// on the concatenated text and writes <c>D:\data\wordcloud\&lt;directory name&gt;.js</c>
/// containing a <c>words = [ ... ];</c> array. The method blocks until every thread
/// has finished and then prints a completion message to the console.
/// </remarks>
public static void WordWeightCal()
{
    string wordcloud = @"D:\data\wordcloud\";
    DirectoryInfo theFolder = new DirectoryInfo(@"D:\data\news\");

    // CreateDirectory does nothing when the directory already exists,
    // so no separate existence check is needed.
    Directory.CreateDirectory(wordcloud);

    var threads = new List<Thread>();
    foreach (var item in theFolder.GetDirectories())
    {
        // Private copy for the closure: compilers older than C# 5 share a single
        // foreach variable across iterations, which would make every thread see
        // whichever directory the loop has reached by the time it runs.
        var directory = item;

        var thread = new Thread(() =>
        {
            // StringBuilder avoids re-copying the accumulated text for every file.
            var text = new System.Text.StringBuilder();
            foreach (var file in directory.GetFiles("*.txt", SearchOption.AllDirectories))
            {
                using (StreamReader sr = new StreamReader(file.FullName))
                {
                    text.Append(sr.ReadToEnd());
                }
            }

            var extractor = new TextRankExtractor();
            var weightedWords = extractor.ExtractTagsWithWeight(text.ToString());

            using (StreamWriter sw = new StreamWriter(wordcloud + directory.Name + ".js", false))
            {
                sw.WriteLine("words = [");
                foreach (var result in weightedWords)
                {
                    // Format the weight with the invariant culture so the emitted
                    // number always uses '.' as the decimal separator; a comma
                    // separator would produce invalid JavaScript.
                    string size = string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0}", result.Weight);
                    sw.WriteLine("{ text: '" + result.Word + "', size: " + size + ", href: 'https://www.baidu.com/s?wd=" + result.Word + "' },");
                }
                sw.WriteLine("];");
            }

            // Count the finished directory under the shared lock.
            lock (locker)
            {
                c++;
            }
        });

        threads.Add(thread);
        thread.Start();
    }

    // Wait for every worker before reporting completion.
    foreach (var t in threads)
    {
        t.Join();
    }

    Console.WriteLine("分词完成!");
}
/// <summary>
/// Checks the tags returned by <c>TextRankExtractor.ExtractTags</c> for a sample
/// paragraph, first with default arguments and then with <c>allowPos</c> set to
/// <c>"n"</c> only.
/// </summary>
public void TestTextRankExtractorWithoutWeights()
{
    const string sampleText = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目 2013年,实现营业收入0万元,实现净利润-139.13万元。";
    var textRank = new TextRankExtractor();

    // Default arguments: "吉林" is expected among the tags.
    var defaultTags = textRank.ExtractTags(sampleText);
    Assert.That(defaultTags, Contains.Item("吉林"));

    // allowPos limited to "n": neither "吉林" nor "实现" may be returned.
    var restrictedTags = textRank.ExtractTags(sampleText, allowPos: new[] { "n" });
    Assert.That(restrictedTags, Is.Not.Contains("吉林"));
    Assert.That(restrictedTags, Is.Not.Contains("实现"));
}
/// <summary>
/// Creates both extractors while no extractor is selected, then points
/// <c>Extractor</c> at the one matching <c>Algorithm</c>.
/// </summary>
void init()
{
    // Both extractors are (re)built together whenever Extractor is still unset.
    if (Extractor == null)
    {
        TextRankExtractor = new TextRankExtractor();
        TfidfExtractor = new TfidfExtractor();
    }

    // Any other Algorithm value leaves Extractor untouched.
    switch (Algorithm)
    {
        case ExtractAlgorithm.TextRank:
            Extractor = TextRankExtractor;
            break;

        case ExtractAlgorithm.TF_IDF:
            Extractor = TfidfExtractor;
            break;
    }
}
/// <summary>
/// Extracts keywords from <paramref name="text"/>, splits the text into sentences,
/// segments every sentence, and passes the keywords and the segmented sentences to
/// <c>GetSentenceList</c>.
/// </summary>
/// <param name="text">Text to process.</param>
/// <param name="num">Count passed to <c>ExtractTags</c> when extracting keywords.</param>
/// <param name="type">
/// Extraction type: 1 uses <c>TfidfExtractor</c>, 2 uses <c>TextRankExtractor</c>.
/// Any other value leaves the keyword list empty.
/// </param>
public void GetList(string text, int num, int type)
{
    // Build the keyword list with the requested extractor.
    keywordList.Clear();
    switch (type)
    {
        case 1:
        {
            TfidfExtractor te = new TfidfExtractor();
            keywordList = te.ExtractTags(text, num).ToList();
            break;
        }
        case 2:
        {
            TextRankExtractor te = new TextRankExtractor();
            keywordList = te.ExtractTags(text, num).ToList();
            break;
        }
    }

    AllsentenceList.Clear();
    keySentenceList.Clear();

    // Treat every line break as a sentence boundary, then split on the terminators.
    text = text.Replace(Environment.NewLine, " 。");

    // Trim BEFORE filtering: the replacement above leaves whitespace-only fragments
    // (for example after a sentence that already ended with "。"), and filtering
    // first would let them through as empty sentences consisting only of "。".
    AllsentenceList = text.Split('。', '?')
                          .Select(x => x.Trim())
                          .Where(x => x.Length > 0 && x != "undefined")
                          .ToList();

    List<Sentence> temp = new List<Sentence>();
    for (int i = 0; i < AllsentenceList.Count; i++)
    {
        // Every stored sentence gets "。" appended, whichever separator ended it.
        AllsentenceList[i] = AllsentenceList[i] + "。";

        // Keep the segmented form as a space-separated string plus its position.
        var sentence = segmenter.Cut(AllsentenceList[i]);
        Sentence v = new Sentence();
        v.Sen = string.Join(" ", sentence);
        v.Index = i;
        temp.Add(v);
    }

    GetSentenceList(keywordList, temp);
}