/// <summary>
/// Runs TF-IDF keyword extraction over two short inputs and prints every keyword returned.
/// </summary>
public void TestIssues()
{
    var tfidf = new TfidfExtractor();

    // Case 1: the same text, first restricted to Constants.NounPos, then to Constants.VerbPos.
    var firstInput = @"整併";
    foreach (var tag in tfidf.ExtractTags(firstInput, 10, Constants.NounPos))
    {
        Console.WriteLine(tag);
    }

    foreach (var tag in tfidf.ExtractTags(firstInput, 10, Constants.VerbPos))
    {
        Console.WriteLine(tag);
    }

    // Case 2: a second input, restricted to Constants.NounPos.
    var secondInput = "開発支援工具FLEXITE";
    foreach (var tag in tfidf.ExtractTags(secondInput, 10, Constants.NounPos))
    {
        Console.WriteLine(tag);
    }
}
/// <summary>
/// Extracts weighted TF-IDF keywords for every sub-folder of <c>path</c> and writes them to
/// <c>分词结果.txt</c> under <c>savePath</c>.
/// </summary>
/// <remarks>
/// The contents of all files in a sub-folder are concatenated and treated as one document.
/// Each sub-folder produces one output line of the form
/// <c>"folderName": [["word", weight],...],</c>. The same values are echoed to the console.
/// </remarks>
public void wordCut()
{
    string[] folders = Directory.GetDirectories(path);
    string filePath = savePath + @"\分词结果.txt";

    // "using" closes (and flushes) the output file even when reading or extraction throws.
    using (StreamWriter writer = new StreamWriter(filePath))
    {
        foreach (string folder in folders)
        {
            int index = folder.LastIndexOf('\\');
            string folderName = folder.Substring(index + 1);
            writer.Write("\"{0}\": [", folderName);
            Console.WriteLine(folderName);

            // Read every file once and concatenate in a single step instead of growing a string with +=.
            string[] files = Directory.GetFiles(folder);
            string text = string.Concat(Array.ConvertAll(files, file => File.ReadAllText(file)));

            var extractor = new TfidfExtractor();
            var keywords = extractor.ExtractTagsWithWeight(text);
            foreach (WordWeightPair w in keywords)
            {
                // Scale the weight by a fixed factor and truncate to an integer for later processing.
                int scaledWeight = (int)(w.Weight * 100 * 2);
                writer.Write("[\"{0}\", {1}],", w.Word, scaledWeight);
                Console.WriteLine("{0}: {1}", w.Word, scaledWeight);
            }

            writer.WriteLine("],");
        }
    }
}
/// <summary>
/// Splits <paramref name="strWords"/> into tokens using the mode selected in
/// <c>comboBoxCutMode</c> and removes every token found in <c>stopwordsList</c>.
/// </summary>
/// <param name="strWords">Text to split.</param>
/// <returns>The remaining tokens, in order.</returns>
private List<string> WordSplitResult(string strWords)
{
    int mode = comboBoxCutMode.SelectedIndex;

    IEnumerable<string> tokens;
    if (mode == 0)
    {
        tokens = segmenter.Cut(strWords);
    }
    else if (mode == 1)
    {
        tokens = segmenter.CutForSearch(strWords);
    }
    else if (mode == 2)
    {
        tokens = new TfidfExtractor().ExtractTags(strWords, 20, Constants.NounAndVerbPos);
    }
    else
    {
        // Any other selection falls back to TextRank.
        tokens = new TextRankExtractor().ExtractTags(strWords, 20, Constants.NounAndVerbPos);
    }

    // Tokens are joined with a single space and split again on spaces, so a token that itself
    // contains spaces is broken into several pieces.
    string[] pieces = string.Join(" ", tokens).Split(' ');

    List<string> kept = new List<string>();
    foreach (string piece in pieces)
    {
        if (stopwordsList.Contains(piece))
        {
            continue;
        }

        kept.Add(piece);
    }

    return kept;
}
/// <summary>
/// Extracts keywords from the given text with TF-IDF, requesting six of them.
/// </summary>
/// <param name="userpaper">Text to analyse.</param>
/// <returns>The keywords returned by the extractor, as a list.</returns>
public List<string> GetKeyWord(string userpaper)
{
    JiebaSegmenter segmenter = new JiebaSegmenter();
    segmenter.LoadUserDict("THUOCL_it.txt");

    // Hand the segmenter that loaded the user dictionary to the extractor instead of leaving it
    // unused, so extraction runs on the segmenter that was configured here.
    var extractor = new TfidfExtractor(segmenter);

    return extractor.ExtractTags(userpaper, count: 6, allowPos: null).ToList();
}
/// <summary>
/// Prints the tags extracted from <c>article_social.txt</c>, requesting 30 tags filtered by
/// <c>Constants.NounAndVerbPos</c>.
/// </summary>
public void TestExtractTagsWithPos()
{
    var extractor = new TfidfExtractor();
    var articlePath = TestHelper.GetResourceFilePath("article_social.txt");
    var article = GetFileContents(articlePath);

    foreach (var keyword in extractor.ExtractTags(article, 30, Constants.NounAndVerbPos))
    {
        Console.WriteLine(keyword);
    }
}
/// <summary>
/// Prints the tags extracted from <c>Resources\article_social.txt</c>, requesting 50 tags
/// filtered by <c>Constants.IdiomPos</c>.
/// </summary>
public void TestExtractIdioms()
{
    var extractor = new TfidfExtractor();
    var article = GetFileContents(@"Resources\article_social.txt");

    foreach (var idiom in extractor.ExtractTags(article, 50, Constants.IdiomPos))
    {
        Console.WriteLine(idiom);
    }
}
/// <summary>
/// Prints every (word, weight) pair extracted from <c>Resources\article.txt</c>.
/// </summary>
public void TestExtractTagsWithWeights()
{
    var extractor = new TfidfExtractor();
    var article = GetFileContents(@"Resources\article.txt");

    foreach (var pair in extractor.ExtractTagsWithWeight(article))
    {
        Console.WriteLine("({0}, {1})", pair.Word, pair.Weight);
    }
}
/// <summary>
/// Prints the tags extracted from <c>article.txt</c>, requesting 30 tags.
/// </summary>
public void TestExtractTags()
{
    var extractor = new TfidfExtractor();
    var articlePath = TestHelper.GetResourceFilePath("article.txt");
    var article = GetFileContents(articlePath);

    foreach (var keyword in extractor.ExtractTags(article, 30))
    {
        Console.WriteLine(keyword);
    }
}
/// <summary>
/// Prints the tags extracted from <c>Resources\article_sports.txt</c> using the extractor's
/// default arguments.
/// </summary>
public void TestExtractTagsOfSportsNews()
{
    var extractor = new TfidfExtractor();
    var article = GetFileContents(@"Resources\article_sports.txt");

    foreach (var keyword in extractor.ExtractTags(article))
    {
        Console.WriteLine(keyword);
    }
}
/// <summary>
/// Extracts keywords from the input text with jieba.NET's TF-IDF extractor and returns them
/// as a single string.
/// </summary>
/// <param name="str">Input text.</param>
/// <returns>
/// The extracted keywords (10 requested, filtered by <c>Constants.NounAndVerbPos</c>), each
/// followed by one space. An empty string when no keyword is returned.
/// </returns>
public string JiebaKey(string str)
{
    var keywords = new TfidfExtractor().ExtractTags(str, 10, Constants.NounAndVerbPos);

    // Build the result in a buffer; every keyword keeps its trailing space, as callers may rely on it.
    var joined = new System.Text.StringBuilder();
    foreach (var keyword in keywords)
    {
        joined.Append(keyword).Append(' ');
    }

    return joined.ToString();
}
/// <summary>
/// Demo: extracts keywords from a fixed sample paragraph with the default arguments and
/// prints each one.
/// </summary>
public void ExtractTagsDemo()
{
    const string sample = "程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。";

    var tfidf = new TfidfExtractor();
    foreach (var tag in tfidf.ExtractTags(sample))
    {
        Console.WriteLine(tag);
    }
}
/// <summary>
/// Registers a monitor for <c>model.Name</c>, or, when a monitor with that name already exists,
/// appends the requester's nickname to its remarks and returns the latest stored news for it.
/// </summary>
/// <param name="model">Request carrying the monitor name, type and the requester's nickname.</param>
/// <returns>A reply message addressed to the requester.</returns>
public async Task<string> Post([FromBody] MonitorRequestModel model)
{
    var monitor = await _connection.QuerySingleOrDefaultAsync<Monitor>("SELECT * FROM [dbo].[Monitor] WHERE [Name]=@Name", new { model.Name });

    // The tag is stored with a new monitor and used as the news keyword for an existing one.
    var name = ResolveMonitorTag(model);

    if (monitor == null)
    {
        await _connection.ExecuteAsync(
            @"INSERT INTO [dbo].[Monitor]([Type],[Name],[Tag],[Remarks]) VALUES(@Type, @Name, @Tag, @Remarks)",
            new { model.Type, model.Name, Tag = name, Remarks = model.NickName });
        BackgroundJob.Enqueue<MonitorJob>(job => job.Monitor(name, null));
        return $"@{model.NickName}:监控设置成功.";
    }

    await _connection.ExecuteAsync("UPDATE [dbo].[Monitor] SET [Remarks]=[Remarks]+','+@NickName WHERE [Id]=@Id", new { monitor.Id, model.NickName });

    var news = await _connection.QueryAsync<BaiduNews>("SELECT TOP 5 * FROM [dbo].[BaiduNews] WHERE [Keyword]=@Keyword ORDER BY [CreateDate] DESC", new { Keyword = name });
    var articles = news as BaiduNews[] ?? news.ToArray();
    if (!articles.Any())
    {
        return $"@{model.NickName}:{model.Name}目前没有最新舆情!";
    }

    // NOTE: the returned articles are not flagged as pushed; [IsPushed] is left untouched here.
    // Number the titles starting at 1, one per line.
    string content = string.Concat(articles.Select((art, position) => $"{position + 1}.{art.Title}\r\n"));
    return $"@{model.NickName}:{model.Name}最新舆情:\r\n{content}点击查看更多详情http://syzb.qianjifang.com.cn/{HttpUtility.UrlEncode(name)}";
}

/// <summary>
/// Derives the tag for a monitor: the first TF-IDF keyword of the name for company monitors,
/// otherwise the name itself. Falls back to the name when extraction yields no keyword.
/// </summary>
private static string ResolveMonitorTag(MonitorRequestModel model)
{
    if (model.Type != MonitorType.公司)
    {
        return model.Name;
    }

    var extractor = new TfidfExtractor();

    // FirstOrDefault avoids an IndexOutOfRangeException when no keyword is extracted.
    return extractor.ExtractTags(model.Name).FirstOrDefault() ?? model.Name;
}
/// <summary>
/// Replaces the stop-word list with <c>Resources\stop_words_test.txt</c>, then prints the tags
/// extracted from <c>Resources\article_sports.txt</c> (30 requested).
/// </summary>
public void TestSetStopWords()
{
    var extractor = new TfidfExtractor();
    extractor.SetStopWords(@"Resources\stop_words_test.txt");

    var article = GetFileContents(@"Resources\article_sports.txt");
    foreach (var keyword in extractor.ExtractTags(article, 30))
    {
        Console.WriteLine(keyword);
    }
}
/// <summary>
/// Replaces the stop-word list with <c>stop_words_test.txt</c>, then prints the tags extracted
/// from <c>article.txt</c> (30 requested).
/// </summary>
public void TestSetStopWords()
{
    var extractor = new TfidfExtractor();

    // This test file contains fewer stop words than the default stop-word list.
    var stopWordsPath = TestHelper.GetResourceFilePath("stop_words_test.txt");
    extractor.SetStopWords(stopWordsPath);

    var articlePath = TestHelper.GetResourceFilePath("article.txt");
    var article = GetFileContents(articlePath);
    foreach (var keyword in extractor.ExtractTags(article, 30))
    {
        Console.WriteLine(keyword);
    }
}
/// <summary>
/// Demo: extracts keywords from a fixed sample passage, requesting 10 filtered by
/// <c>Constants.NounAndVerbPos</c>, and prints each one.
/// </summary>
public void ExtractTagsDemo2()
{
    const string sample = @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。 算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。 形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。";

    var tfidf = new TfidfExtractor();
    foreach (var tag in tfidf.ExtractTags(sample, 10, Constants.NounAndVerbPos))
    {
        Console.WriteLine(tag);
    }
}
/// <summary>
/// Creates both extractors when no extractor has been selected yet, then points
/// <c>Extractor</c> at the one matching <c>Algorithm</c>.
/// </summary>
void init()
{
    if (Extractor == null)
    {
        TextRankExtractor = new TextRankExtractor();
        TfidfExtractor = new TfidfExtractor();
    }

    // Any other algorithm value leaves Extractor unchanged.
    switch (Algorithm)
    {
        case ExtractAlgorithm.TextRank:
            Extractor = TextRankExtractor;
            break;

        case ExtractAlgorithm.TF_IDF:
            Extractor = TfidfExtractor;
            break;
    }
}
/// <summary>
/// Runs a web search for the text in <c>textBox1</c>, extracts weighted keywords from the
/// returned result snippets and renders them as a word cloud in <c>pictureBox1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void Button1_Click(object sender, EventArgs e)
{
    if (textBox1.Text == "")
    {
        return;
    }

    _query = textBox1.Text;

    var uri = Host + "cx=" + Cx + "&key=" + Key + "&num=" + Num + "&start=" + Start + "&q=" + System.Net.WebUtility.UrlEncode(_query);

    // Dispose the client and the response once the body has been read.
    string contentString;
    using (var client = new HttpClient())
    using (var response = await client.GetAsync(uri))
    {
        contentString = await response.Content.ReadAsStringAsync();
    }

    dynamic parsedJson = JsonConvert.DeserializeObject(contentString);
    var items = parsedJson?.items;

    // Walk the items actually returned instead of indexing from Start to Num: the returned
    // array is indexed from 0 and may hold fewer than Num entries, so fixed indexing skipped
    // the first results and could run past the end.
    var snippets = new System.Text.StringBuilder();
    if (items != null)
    {
        foreach (var item in items)
        {
            object snippet = item.snippet;
            if (snippet != null)
            {
                snippets.Append(snippet.ToString());
            }
        }
    }

    var extractor = new TfidfExtractor();
    var pairs = extractor.ExtractTagsWithWeight(snippets.ToString(), 30);

    var words = new List<string>();
    var freqs = new List<int>();
    foreach (var pair in pairs)
    {
        // Skip the literal "..." token.
        if (pair.Word.Equals("..."))
        {
            continue;
        }

        words.Add(pair.Word);

        // Scale the fractional weight up to an integer frequency for the word cloud.
        freqs.Add(Convert.ToInt32(pair.Weight * Math.Pow(10, 6)));
    }

    var wc = new WordCloud.WordCloud(1920, 1080);
    var image = wc.Draw(words, freqs);
    pictureBox1.Image = image;
    button2.Enabled = true;
    button2.Visible = true;
}
/// <summary>
/// Rebuilds the keyword list and the sentence list for <paramref name="text"/>, then passes
/// both to <c>GetSentenceList</c>.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="num">Number of keywords requested from the extractor.</param>
/// <param name="type">Extraction type: 1 uses TF-IDF, 2 uses TextRank; any other value leaves the keyword list empty.</param>
public void GetList(string text, int num, int type)
{
    // Refresh the keyword list.
    keywordList.Clear();
    if (type == 1)
    {
        keywordList = new TfidfExtractor().ExtractTags(text, num).ToList();
    }
    else if (type == 2)
    {
        keywordList = new TextRankExtractor().ExtractTags(text, num).ToList();
    }

    AllsentenceList.Clear();
    keySentenceList.Clear();

    // Turn line breaks into sentence terminators, then split the text into sentences.
    string normalized = text.Replace(Environment.NewLine, " 。");
    string[] rawSentences = normalized.Split('。', '?');
    AllsentenceList = rawSentences
        .Where(raw => !string.IsNullOrEmpty(raw) && raw != "undefined")
        .Select(raw => raw.Trim())
        .ToList();

    // Re-terminate every sentence, segment it and remember its position.
    List<Sentence> segmented = new List<Sentence>();
    for (int position = 0; position < AllsentenceList.Count; position++)
    {
        AllsentenceList[position] = AllsentenceList[position] + "。";
        var words = segmenter.Cut(AllsentenceList[position]);

        Sentence entry = new Sentence();
        entry.Sen = string.Join(" ", words);
        entry.Index = position;
        segmented.Add(entry);
    }

    GetSentenceList(keywordList, segmented);
}
/// <summary>
/// Looks for a price in <paramref name="text"/>: among the extracted keywords tagged "m" that
/// are 3 to 5 characters long, the first one of greatest length is parsed as a decimal.
/// </summary>
/// <param name="text">Text to search.</param>
/// <returns>The parsed value, or 0 when there is no candidate or it cannot be parsed.</returns>
public static decimal GetHousePrice(string text)
{
    var allowedTags = new List<string> { "m" };
    var keywords = new TfidfExtractor().ExtractTags(text, 20, allowedTags);

    decimal housePrice = 0;
    if (keywords == null)
    {
        return housePrice;
    }

    // Keep the first keyword of maximal length within the 3..5 character window.
    string candidate = null;
    foreach (var keyword in keywords)
    {
        if (keyword.Length < 3 || keyword.Length > 5)
        {
            continue;
        }

        if (candidate == null || keyword.Length > candidate.Length)
        {
            candidate = keyword;
        }
    }

    // TryParse leaves housePrice at 0 when the candidate is null or not numeric.
    decimal.TryParse(candidate, out housePrice);
    return housePrice;
}
/// <summary>
/// Looks for a price in <paramref name="text"/>: returns the first extracted keyword tagged
/// "m" that parses as an integer between 500 and 30000 inclusive.
/// </summary>
/// <param name="text">Text to search.</param>
/// <returns>The first value in range, or 0 when none is found.</returns>
public static int GetHousePrice(string text)
{
    var allowedTags = new List<string> { "m" };
    var keywords = new TfidfExtractor().ExtractTags(text, 20, allowedTags);
    if (keywords == null)
    {
        return 0;
    }

    foreach (var keyword in keywords)
    {
        int parsed;
        if (!int.TryParse(keyword, out parsed))
        {
            continue;
        }

        if (parsed >= 500 && parsed <= 30000)
        {
            return parsed;
        }
    }

    return 0;
}
/// <summary>
/// Extracts keywords from the given file (30 requested, filtered by
/// <c>Constants.NounAndVerbPos</c>), prints them, and appends them one per line to a CSV file
/// under <c>E:\词云\JieBaResult\</c>.
/// </summary>
/// <param name="filename">
/// Backslash-separated path of the input file. Its third path component names the output file,
/// so the path must have at least three components.
/// </param>
private static void oprateJieBa(string filename)
{
    string[] filenames = filename.Split('\\');

    // Output file that stores the extraction result.
    string filename1 = "E:\\词云\\JieBaResult\\" + filenames[2] + ".csv";

    string text = File.ReadAllText(filename);

    // Only the extractor is needed; the separate segmenter whose output was never read is gone.
    var extractor = new TfidfExtractor();
    var keywords = extractor.ExtractTags(text, 30, Constants.NounAndVerbPos);

    Console.WriteLine(filename);

    var lines = new System.Text.StringBuilder();
    foreach (var keyword in keywords)
    {
        lines.Append(keyword).Append('\n');
        Console.WriteLine(keyword);
    }

    // Append mode; "using" closes the file even if the write fails.
    using (StreamWriter fz = new StreamWriter(filename1, true))
    {
        fz.Write(lines.ToString());
    }
}
/// <summary>
/// Estimates how similar two texts are. The shorter text (by word count) is compared against
/// every window of the same word count slid across the longer text; each comparison is the
/// cosine similarity of the TF-IDF keyword weights. The highest value found is returned.
/// </summary>
/// <param name="paperText1">First text.</param>
/// <param name="paperText2">Second text.</param>
/// <returns>The highest cosine similarity over all windows; 0 when nothing overlaps.</returns>
/// <remarks>Progress and the final result are written to the console and reported through <c>OnRaiseReporting</c>.</remarks>
public double CalcTFIDFSimilarity(string paperText1, string paperText2)
{
    const string nonWordPattern = @"[^a-zA-Z0-9\u4e00-\u9fa5\s]";
    const int topNum = 500;

    ObjectArgs arg = new ObjectArgs();

    // Strip punctuation and other separators.
    string txt1 = Regex.Replace(paperText1, nonWordPattern, "");
    string txt2 = Regex.Replace(paperText2, nonWordPattern, "");

    var segmenter = new JiebaSegmenter();
    TfidfExtractor tfd = new TfidfExtractor(segmenter);

    arg.ObjData = "开始分词";
    OnRaiseReporting(arg);
    Console.WriteLine("开始分词");

    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Materialise the segments once; they are counted and sliced many times below.
    List<string> segment1 = segmenter.Cut(txt1).ToList();
    List<string> segment2 = segmenter.Cut(txt2).ToList();
    int num1 = segment1.Count;
    int num2 = segment2.Count;

    Console.WriteLine("文档1分词数:" + num1);
    Console.WriteLine("文档2分词数:" + num2);
    arg.ObjData = "文档1分词数:" + num1 + "\r\n文档2分词数: " + num2;
    OnRaiseReporting(arg);

    // On a tie the second document is treated as the longer one.
    List<string> longer = num1 > num2 ? segment1 : segment2;
    List<string> shorter = num1 > num2 ? segment2 : segment1;
    int maxLength = longer.Count;
    int minLength = shorter.Count;

    // The shorter document never changes, so its weights are computed once, outside the loop.
    Dictionary<string, double> shorterVector = BuildTfidfVector(tfd, GetText(shorter), topNum);

    double similar = 0;
    for (int i = 0; i + minLength <= maxLength; i++)
    {
        // Window of exactly minLength words starting at word i (inclusive).
        string windowText = GetText(longer.GetRange(i, minLength));
        Dictionary<string, double> windowVector = BuildTfidfVector(tfd, windowText, topNum);

        double s = CosineOfTfidfVectors(shorterVector, windowVector);

        // Periodic progress report.
        if ((i + 1) % 50 == 0)
        {
            Console.WriteLine(string.Format("第{0}次计算出的相似度:{1}", i + 1, s));
            arg.ObjData = string.Format("第{0}次计算出的相似度:{1}", i + 1, s);
            OnRaiseReporting(arg);
        }

        if (s > similar)
        {
            similar = s;
        }

        // Very high similarity is always reported.
        if (s >= 0.99)
        {
            Console.WriteLine(string.Format("第{0}次计算出的相似度:{1}", i + 1, s));
            arg.ObjData = string.Format("第{0}次计算出的相似度:{1}", i + 1, s);
            OnRaiseReporting(arg);
        }
    }

    sw.Stop();
    long elapsedMs = sw.ElapsedMilliseconds;
    Console.WriteLine("两篇文章的相似度:" + similar);
    Console.WriteLine("花费时间:" + elapsedMs + "ms");
    arg.ObjData = "两篇文章的相似度:" + similar + "\r\n花费时间:" + elapsedMs + "ms";
    OnRaiseReporting(arg);

    return similar;
}

/// <summary>Extracts up to <paramref name="count"/> weighted keywords from <paramref name="text"/> as a word-to-weight map.</summary>
private static Dictionary<string, double> BuildTfidfVector(TfidfExtractor extractor, string text, int count)
{
    var vector = new Dictionary<string, double>();
    foreach (WordWeightPair pair in extractor.ExtractTagsWithWeight(text, count))
    {
        vector[pair.Word] = pair.Weight;
    }

    return vector;
}

/// <summary>Cosine similarity of two word-to-weight maps; 0 when either map has zero magnitude.</summary>
private static double CosineOfTfidfVectors(Dictionary<string, double> a, Dictionary<string, double> b)
{
    double numerator = 0;
    double squaredNormA = 0;
    foreach (KeyValuePair<string, double> entry in a)
    {
        squaredNormA += entry.Value * entry.Value;

        double other;
        if (b.TryGetValue(entry.Key, out other))
        {
            numerator += entry.Value * other;
        }
    }

    double squaredNormB = 0;
    foreach (double weight in b.Values)
    {
        squaredNormB += weight * weight;
    }

    if (squaredNormA == 0 || squaredNormB == 0)
    {
        return 0;
    }

    return numerator / (Math.Sqrt(squaredNormA) * Math.Sqrt(squaredNormB));
}
/// <summary>
/// Extracts the keywords of an article, requesting 10 filtered by
/// <c>Constants.NounAndVerbPos</c>.
/// </summary>
/// <param name="objStr">Article text.</param>
/// <returns>The extracted keywords.</returns>
public static IEnumerable<string> GetArticleKeywords(string objStr)
{
    const int keywordCount = 10;

    TfidfExtractor extractor = new TfidfExtractor();
    IEnumerable<string> keywords = extractor.ExtractTags(objStr, keywordCount, Constants.NounAndVerbPos);
    return keywords;
}
/// <summary>
/// Gets the weighted keywords of a string, filtered by <c>Constants.NounAndVerbPos</c>.
/// </summary>
/// <param name="str">Text to analyse.</param>
/// <param name="count">Number of keywords requested.</param>
/// <param name="tfidf">Extractor to use; when null, the class-level <c>TfidfExtractor</c> member is used.</param>
/// <returns>The extracted word/weight pairs.</returns>
public static IEnumerable<WordWeightPair> GetKeyWordsWegihtNounAndVerb(this string str, int count = 20, TfidfExtractor tfidf = null)
{
    var extractor = tfidf ?? TfidfExtractor;
    return extractor.ExtractTagsWithWeight(str, count, Constants.NounAndVerbPos);
}
/// <summary>
/// Gets the weighted keywords of a string.
/// </summary>
/// <param name="str">Text to analyse.</param>
/// <param name="count">Number of keywords requested.</param>
/// <param name="tfidf">Extractor to use; when null, the class-level <c>TfidfExtractor</c> member is used.</param>
/// <returns>The extracted word/weight pairs.</returns>
public static IEnumerable<WordWeightPair> GetKeyWordsWegiht(this string str, int count = 20, TfidfExtractor tfidf = null)
{
    var extractor = tfidf ?? TfidfExtractor;
    return extractor.ExtractTagsWithWeight(str, count);
}
/// <summary>
/// Gets the keywords of a string, filtered by <c>Constants.NounAndVerbPos</c>.
/// </summary>
/// <param name="str">Text to analyse.</param>
/// <param name="count">Number of keywords requested.</param>
/// <param name="tfidf">Extractor to use; when null, the class-level <c>TfidfExtractor</c> member is used.</param>
/// <returns>The extracted keywords.</returns>
public static IEnumerable<string> GetKeyWordsNounAndVerb(this string str, int count = 20, TfidfExtractor tfidf = null)
{
    var extractor = tfidf ?? TfidfExtractor;
    return extractor.ExtractTags(str, count, Constants.NounAndVerbPos);
}
/// <summary>
/// Gets the keywords of a string.
/// </summary>
/// <param name="str">Text to analyse.</param>
/// <param name="count">Number of keywords requested.</param>
/// <param name="tfidf">Extractor to use; when null, the class-level <c>TfidfExtractor</c> member is used.</param>
/// <returns>The extracted keywords.</returns>
public static IEnumerable<string> GetKeyWords(this string str, int count = 20, TfidfExtractor tfidf = null)
{
    var extractor = tfidf ?? TfidfExtractor;
    return extractor.ExtractTags(str, count);
}
/// <summary>
/// Pre-processes the text corpus: gathers the first line of every file in the spam, inform and
/// normal folders into <c>collect.txt</c>, extracts weighted keywords from that file and writes
/// them to <c>collecto.txt</c>, then shows the elapsed time in <c>listBox1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click(object sender, EventArgs e)
{
    try
    {
        // Time the whole run.
        Stopwatch time = new Stopwatch();
        time.Start();

        const string collectPath = @"..\Debug\textgather\gather\collect.txt";
        const string keywordPath = @"..\Debug\textgather\gather\collecto.txt";

        // Start from an empty collect.txt and keep it open for the whole gathering step.
        using (StreamWriter collector = new StreamWriter(collectPath, false, Encoding.Default))
        {
            AppendFirstLineOfEachFile(@"..\Debug\textgather\spam\", collector);
            AppendFirstLineOfEachFile(@"..\Debug\textgather\inform\", collector);
            AppendFirstLineOfEachFile(@"..\Debug\textgather\normal\", collector);
        }

        string cont = File.ReadAllText(collectPath, Encoding.Default);

        // Extract keywords filtered by Constants.NounAndVerbPos, together with their weights.
        var extractor = new TfidfExtractor();
        var keywords = extractor.ExtractTagsWithWeight(cont, 6000, Constants.NounAndVerbPos);

        // One "word weight" pair per line, replacing any previous result file.
        using (StreamWriter swt = new StreamWriter(keywordPath, false, Encoding.Default))
        {
            foreach (var keyword in keywords)
            {
                swt.WriteLine(keyword.Word + " " + keyword.Weight);
            }
        }

        time.Stop();
        TimeSpan ts2 = time.Elapsed;
        listBox1.Items.Clear();
        listBox1.Items.Add("文本预处理完成,总耗时:" + ts2.TotalSeconds + "秒");
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}

/// <summary>Writes the first line of every file in <paramref name="directory"/> to <paramref name="destination"/>, one line per file.</summary>
private static void AppendFirstLineOfEachFile(string directory, StreamWriter destination)
{
    foreach (string path in Directory.GetFiles(directory))
    {
        string firstLine;
        using (StreamReader reader = new StreamReader(path, Encoding.Default))
        {
            firstLine = reader.ReadLine();
        }

        // An empty file yields null, which is written as an empty line.
        destination.WriteLine(firstLine);
    }
}