/// <summary>
/// Demo: extracts the top ten noun/verb keywords from a fixed Chinese sample
/// passage with <c>TfidfExtractor</c> and prints one keyword per console line.
/// </summary>
public void ExtractTagsDemo2()
{
    // Fixed sample passage used as extractor input.
    const string sampleText = @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。 算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。 形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。";

    var tfidf = new TfidfExtractor();
    var tags = tfidf.ExtractTags(sampleText, 10, Constants.NounAndVerbPos);

    foreach (var tag in tags)
    {
        Console.WriteLine(tag);
    }
}
/// <summary>
/// Loads a custom stop-word file into a <c>TfidfExtractor</c>, extracts up to
/// 30 tags from the "article.txt" test resource and prints each tag.
/// </summary>
public void TestSetStopWords()
{
    var extractor = new TfidfExtractor();

    // Use fewer stop words than the default stop-word list.
    var stopWordsPath = TestHelper.GetResourceFilePath("stop_words_test.txt");
    extractor.SetStopWords(stopWordsPath);

    var articlePath = TestHelper.GetResourceFilePath("article.txt");
    var article = GetFileContents(articlePath);

    var tags = extractor.ExtractTags(article, 30);
    foreach (var current in tags)
    {
        Console.WriteLine(current);
    }
}
/// <summary>
/// Extracts the core keywords of <paramref name="text"/>, splits the text into
/// sentences, segments every sentence into space-separated words and passes
/// both to <c>GetSentenceList</c> to select the core sentences.
/// </summary>
/// <param name="text">The article text.</param>
/// <param name="num">Number of core sentences wanted; forwarded to the extractor as the keyword count.</param>
/// <param name="type">
/// Extraction type: 1 uses <c>TfidfExtractor</c>, 2 uses <c>TextRankExtractor</c>.
/// Any other value leaves the keyword list empty.
/// </param>
public void GetList(string text, int num, int type)
{
    // Build the core keyword list with the extractor selected by the caller.
    keywordList.Clear();
    switch (type)
    {
        case 1:
            keywordList = new TfidfExtractor().ExtractTags(text, num).ToList();
            break;
        case 2:
            keywordList = new TextRankExtractor().ExtractTags(text, num).ToList();
            break;
    }

    AllsentenceList.Clear();
    keySentenceList.Clear();

    // Treat every line break as a sentence boundary, then split on the
    // sentence terminators.
    text = text.Replace(Environment.NewLine, " 。");

    // Trim BEFORE filtering: a whitespace-only segment (e.g. from a blank line
    // or from "。" directly followed by a line break) would otherwise pass the
    // emptiness check and end up as a bogus sentence consisting only of "。".
    AllsentenceList = text.Split('。', '?')
        .Select(x => x.Trim())
        .Where(x => !string.IsNullOrEmpty(x) && x != "undefined")
        .ToList();

    List<Sentence> temp = new List<Sentence>(AllsentenceList.Count);
    for (int i = 0; i < AllsentenceList.Count; i++)
    {
        // Restore the terminator removed by Split before segmenting.
        AllsentenceList[i] = AllsentenceList[i] + "。";
        var words = segmenter.Cut(AllsentenceList[i]);

        Sentence v = new Sentence();
        v.Sen = string.Join(" ", words);
        v.Index = i; // position of the sentence inside AllsentenceList
        temp.Add(v);
    }

    GetSentenceList(keywordList, temp);
}
/// <summary>
/// Tries to read a house price out of free text: extracts up to 20 keywords
/// restricted to the "m" part-of-speech tag, keeps the distinct ones that are
/// 3 to 5 characters long, and parses the longest of them as a decimal.
/// </summary>
/// <param name="text">Text to search for a price.</param>
/// <returns>
/// The parsed price, or 0 when no keyword qualifies or the chosen keyword is
/// not a valid number.
/// </returns>
public static decimal GetHousePrice(string text)
{
    decimal housePrice = 0;

    var extractor = new TfidfExtractor();
    // NOTE(review): "m" is presumably the numeral POS tag - confirm against the tagger's tag set.
    var keywords = extractor.ExtractTags(text, 20, new List<string>() { "m" });
    if (keywords == null)
    {
        return housePrice;
    }

    // Longest candidate first; on equal length the extractor's order is kept
    // because OrderByDescending is a stable sort.
    var price = keywords
        .Distinct()
        .Where(s => s.Length <= 5 && s.Length >= 3)
        .OrderByDescending(s => s.Length)
        .FirstOrDefault();

    // Parse with the invariant culture so the result does not depend on the
    // host's regional settings ('.' vs ',' as decimal separator). A null or
    // non-numeric candidate leaves housePrice at 0.
    decimal.TryParse(
        price,
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out housePrice);

    return housePrice;
}
/// <summary>
/// Tries to read a house price out of free text: extracts up to 20 keywords
/// restricted to the "m" part-of-speech tag and returns the first distinct
/// keyword that parses as an integer between 500 and 30000 (inclusive).
/// </summary>
/// <param name="text">Text to search for a price.</param>
/// <returns>The first in-range price, or 0 when there is none.</returns>
public static int GetHousePrice(string text)
{
    var tfidf = new TfidfExtractor();
    var tags = tfidf.ExtractTags(text, 20, new List<string>() { "m" });
    if (tags == null)
    {
        return 0;
    }

    foreach (var tag in tags.Distinct())
    {
        // TryParse leaves 'candidate' at 0 on failure, which is out of range below.
        int candidate;
        int.TryParse(tag, out candidate);

        if (candidate < 500 || candidate > 30000)
        {
            continue;
        }

        return candidate;
    }

    return 0;
}
/// <summary>
/// Reads the file at <paramref name="filename"/>, extracts up to 30 noun/verb
/// keywords with <c>TfidfExtractor</c>, echoes the file name and the keywords
/// to the console, and appends the keywords (one per line) to a CSV file under
/// <c>E:\词云\JieBaResult\</c>.
/// </summary>
/// <param name="filename">
/// Backslash-separated path of the input file. Its third component is used as
/// the name of the result file, so the path must have at least three
/// components or the indexing below throws.
/// </param>
private static void oprateJieBa(string filename)
{
    string[] pathParts = filename.Split('\\');
    // Destination that stores the keyword extraction result.
    string resultPath = "E:\\词云\\JieBaResult\\" + pathParts[2] + ".csv";

    string text = File.ReadAllText(filename);

    // The previous standalone JiebaSegmenter.Cut call was dropped: its result
    // was never used.
    var extractor = new TfidfExtractor();
    var keywords = extractor.ExtractTags(text, 30, Constants.NounAndVerbPos);

    Console.WriteLine(filename);

    // Collect the output in a builder instead of repeated string concatenation.
    var output = new System.Text.StringBuilder();
    foreach (var keyword in keywords)
    {
        output.Append(keyword).Append("\n");
        Console.WriteLine(keyword);
    }

    // Append mode; 'using' guarantees the writer is closed even if Write throws.
    using (StreamWriter writer = new StreamWriter(resultPath, true))
    {
        writer.Write(output.ToString());
    }
}
/// <summary>
/// Extracts the keyword set of an article.
/// </summary>
/// <param name="objStr">The article text to analyse.</param>
/// <returns>Up to ten keywords, restricted to nouns and verbs.</returns>
public static IEnumerable<string> GetArticleKeywords(string objStr)
{
    const int keywordCount = 10;

    var extractor = new TfidfExtractor();
    // Nouns and verbs only.
    var keywords = extractor.ExtractTags(objStr, keywordCount, Constants.NounAndVerbPos);
    return keywords;
}