public void tegsText()
{
    // Checks that DataMining.RepeattedTag returns true for the candidate "is"
    // when MainWindow._tags already holds "hello" and "this".
    // NOTE(review): presumably "is" counts as repeated because it is contained
    // in "this" — confirm against DataMining.RepeattedTag.
    MainWindow._tags = new List<string> { "hello", "this" };

    string candidate = "is";
    DataMining miner = new DataMining();
    bool isRepeated = miner.RepeattedTag(candidate);

    Assert.IsTrue(isRepeated);
}
/// <summary>
/// Processes one downloaded page: denoises and segments the file, updates the
/// global word-frequency table, runs data mining, saves the page record, then
/// reads the HTML to pick up the title and the alt texts of "thumb" elements as
/// tags, and finally saves the tags and their previews.
/// </summary>
/// <param name="path">Path of the HTML file to process.</param>
/// <param name="link">
/// Not used by this method; the stored link is taken from <c>MainWindow.curUrl</c>.
/// NOTE(review): confirm whether <paramref name="link"/> was meant to be stored instead.
/// </param>
public static void msdn_channel9process(string path, string link)
{
    // Denoise the file; after Work() the raw and the denoised data are available.
    Denoising de = new Denoising();
    de.Path = path;
    de.Work();
    MainWindow._rawdata = de.Rawdata;
    MainWindow._denoisingdata = de.Denoisingdata;
    if (MainWindow._denoisingdata == null)
    {
        // A null result suggests an encoding (or similar) problem; continue with empty text.
        MainWindow._denoisingdata = "";
    }

    // Segment the denoised text into words and count each word's occurrences.
    WordSegment ws = new WordSegment();
    MainWindow._wordSegmentResult = ws.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string word in MainWindow._wordSegmentResult)
    {
        if (MainWindow.wordFreq.ContainsKey(word))
        {
            int freq = (int)MainWindow.wordFreq[word];
            MainWindow.wordFreq[word] = freq + 1;
        }
        else
        {
            MainWindow.wordFreq.Add(word, 1);
        }
    }

    // Reset the shared tag containers before mining.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    // Run data mining on the raw and denoised data.
    DataMining dm = new DataMining();
    dm.Rawdata = MainWindow._rawdata;
    dm.Denoisingdata = MainWindow._denoisingdata;
    dm.Work();

    // Title, author and post date as mined (the original author notes these are not very accurate).
    MainWindow._title = dm.Title;
    MainWindow._author = dm.Author;
    MainWindow._postdate = dm.Postdate;

    // Save the page record under the next free page number.
    Page pg = new Page();
    PageItem pgitem = new PageItem();
    MainWindow.curwid = MainWindow.getWebpageNo() + 1;
    pgitem.setwid(MainWindow.curwid);
    pgitem.settitle(MainWindow._title);
    pgitem.setauthor(MainWindow._author);
    pgitem.setpostdate(MainWindow._postdate);
    pgitem.setlink(MainWindow.curUrl);
    pgitem.setreferred("");
    pg.savePage(pgitem);

    HtmlDocument doc = new HtmlDocument();
    try
    {
        doc.Load(path, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Best effort: the page record is already saved; without a parsable
        // document there is nothing more to extract.
        return;
    }

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // so guard before iterating; tags collected so far are still saved below.
    HtmlNodeCollection categoryNodeList = doc.DocumentNode.SelectNodes("//title|//img");
    if (categoryNodeList != null)
    {
        foreach (HtmlNode child in categoryNodeList)
        {
            HtmlNode hn = HtmlNode.CreateNode(child.OuterHtml);

            HtmlNode titleNode = hn.SelectSingleNode("//title");
            if (titleNode != null)
            {
                MainWindow._title = titleNode.InnerText;
                continue;
            }

            HtmlNode thumbNode = hn.SelectSingleNode("//*[@class=\"thumb\"]");
            if (thumbNode == null)
            {
                continue;
            }

            // Take the alt attribute's value only: the lookahead requires the
            // closing quote without making it part of the tag text.
            string key_words = Regex.Match(thumbNode.OuterHtml, "(?<=alt=\")[^\"]*(?=\")").Value;
            if (key_words.Length > 0)
            {
                // Skip elements without a (non-empty) alt text instead of storing an empty tag.
                MainWindow._tags.Add(key_words);
            }
        }
    }

    // Save tags that have not occurred before to the TAGS table.
    foreach (string tag in MainWindow._tags)
    {
        if (!MainWindow._occurredTags.Contains(tag))
        {
            Tag thistag = new Tag();
            TagItem tgitem = new TagItem();
            tgitem.settid(MainWindow.getTagNo() + 1);
            MainWindow._occurredTags.Add(tag);
            tgitem.setname(tag);
            thistag.saveTag(tgitem);
        }
    }

    // Save one preview row per tag, linking the tag number to the current page number.
    foreach (string tag in MainWindow._tags)
    {
        Preview b = new Preview();
        PreviewItem item3 = new PreviewItem();
        MainWindow.curtid = MainWindow.getTagNo(tag);
        item3.settid(MainWindow.curtid);
        item3.setwid(MainWindow.curwid);
        item3.setpreview((string)MainWindow.tagPreview[tag]);
        b.savePreview(item3);
    }
}
/// <summary>
/// Runs the processing pipeline for one file: denoising, word segmentation with
/// word-frequency counting, and data mining. When requested, the page, its tags
/// and the tag previews are then written to the database and the page is handed
/// to <c>MainWindow.insertintosolr</c>.
/// </summary>
/// <param name="path">Path of the file to process.</param>
/// <param name="needWriteToDB">Whether the results should be written to the database.</param>
public static void Process(string path, bool needWriteToDB)
{
    // Denoise the file; once Work() has run, the raw and denoised data can be read.
    Denoising denoiser = new Denoising();
    denoiser.Path = path;
    denoiser.Work();
    MainWindow._rawdata = denoiser.Rawdata;
    // A null denoising result hints at an encoding (or similar) problem; use empty text instead.
    MainWindow._denoisingdata = denoiser.Denoisingdata ?? "";

    // Segment the denoised text and count how often each word occurs.
    // (An older per-language segmentation variant used to be kept here as commented-out code.)
    WordSegment segmenter = new WordSegment();
    MainWindow._wordSegmentResult = segmenter.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string token in MainWindow._wordSegmentResult)
    {
        if (!MainWindow.wordFreq.ContainsKey(token))
        {
            MainWindow.wordFreq.Add(token, 1);
            continue;
        }

        int seenSoFar = (int)MainWindow.wordFreq[token];
        MainWindow.wordFreq[token] = seenSoFar + 1;
    }

    // Data mining: empty the two shared containers first, then run the miner.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    DataMining miner = new DataMining();
    miner.Rawdata = MainWindow._rawdata;
    miner.Denoisingdata = MainWindow._denoisingdata;
    miner.Work();

    // Title, author and post date as mined (the original author notes these are not very accurate).
    MainWindow._title = miner.Title;
    MainWindow._author = miner.Author;
    MainWindow._postdate = miner.Postdate;

    // Extraction is complete; everything below is database work.
    if (!needWriteToDB)
    {
        return;
    }

    // PAGE table: store the page under the next free page number.
    Page pageStore = new Page();
    PageItem pageRow = new PageItem();
    MainWindow.curwid = MainWindow.getWebpageNo() + 1;
    pageRow.setwid(MainWindow.curwid);
    pageRow.settitle(MainWindow._title);
    pageRow.setauthor(MainWindow._author);
    pageRow.setpostdate(MainWindow._postdate);
    pageRow.setlink(MainWindow.curUrl);
    pageRow.setreferred("");
    pageStore.savePage(pageRow);

    string body = dealhtml(MainWindow._rawdata);

    // TAGS table: store tags not seen before, and build the keyword string
    // (all tags appended back to back, without a separator).
    string keywords = "";
    foreach (string tag in MainWindow._tags)
    {
        bool seenBefore = MainWindow._occurredTags.Contains(tag);
        if (!seenBefore)
        {
            Tag tagStore = new Tag();
            TagItem tagRow = new TagItem();
            tagRow.settid(MainWindow.getTagNo() + 1);
            MainWindow._occurredTags.Add(tag);
            tagRow.setname(tag);
            tagStore.saveTag(tagRow);
        }

        keywords = keywords + tag;
    }

    MainWindow.insertintosolr(MainWindow._title, "", MainWindow.curUrl, MainWindow._postdate, keywords, 0, body);

    // Preview rows: one per tag, linking the tag number to the current page number.
    foreach (string tag in MainWindow._tags)
    {
        Preview previewStore = new Preview();
        PreviewItem previewRow = new PreviewItem();
        MainWindow.curtid = MainWindow.getTagNo(tag);
        previewRow.settid(MainWindow.curtid);
        previewRow.setwid(MainWindow.curwid);
        previewRow.setpreview((string)MainWindow.tagPreview[tag]);
        previewStore.savePreview(previewRow);
    }
}
/// <summary>
/// Processes a document whose raw and denoised text are already available:
/// segments the denoised text and updates the global word-frequency table, runs
/// data mining, and optionally writes the page, its tags and the tag previews to
/// the database.
/// </summary>
/// <param name="doc">Raw document data; stored in <c>MainWindow._rawdata</c>.</param>
/// <param name="doc2">Denoised document data; null is treated as empty text.</param>
/// <param name="needWriteToDB">Whether the results should be written to the database.</param>
public static void Processwrd(string doc, string doc2, bool needWriteToDB)
{
    MainWindow._rawdata = doc;
    MainWindow._denoisingdata = doc2;
    if (MainWindow._denoisingdata == null)
    {
        MainWindow._denoisingdata = "";
    }

    // Segment the denoised text into words and count each word's occurrences.
    WordSegment ws = new WordSegment();
    MainWindow._wordSegmentResult = ws.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string word in MainWindow._wordSegmentResult)
    {
        if (MainWindow.wordFreq.ContainsKey(word))
        {
            int freq = (int)MainWindow.wordFreq[word];
            MainWindow.wordFreq[word] = freq + 1;
        }
        else
        {
            MainWindow.wordFreq.Add(word, 1);
        }
    }

    // Reset the shared tag containers, as Process does. Method-local lists here
    // would always stay empty, so the save loops below would never run and the
    // shared containers would keep the previous document's entries.
    // NOTE(review): presumably dm.Work() fills MainWindow._tags / MainWindow.tagPreview — confirm.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    // Run data mining on the raw and denoised data.
    DataMining dm = new DataMining();
    dm.Rawdata = MainWindow._rawdata;
    dm.Denoisingdata = MainWindow._denoisingdata;
    dm.Work();

    MainWindow._title = dm.Title;
    MainWindow._author = dm.Author;
    MainWindow._postdate = dm.Postdate;

    if (needWriteToDB)
    {
        // Save the page record under the next free page number.
        Page pg = new Page();
        PageItem pgitem = new PageItem();
        MainWindow.curwid = MainWindow.getWebpageNo() + 1;
        pgitem.setwid(MainWindow.curwid);
        pgitem.settitle(MainWindow._title);
        pgitem.setauthor(MainWindow._author);
        pgitem.setpostdate(MainWindow._postdate);
        pgitem.setlink(MainWindow.curUrl);
        pgitem.setreferred("");
        pg.savePage(pgitem);

        // Save tags that have not occurred before to the TAGS table.
        foreach (string tag in MainWindow._tags)
        {
            if (!MainWindow._occurredTags.Contains(tag))
            {
                Tag thistag = new Tag();
                TagItem tgitem = new TagItem();
                tgitem.settid(MainWindow.getTagNo() + 1);
                MainWindow._occurredTags.Add(tag);
                tgitem.setname(tag);
                thistag.saveTag(tgitem);
            }
        }

        // Save one preview row per tag, linking the tag number to the current page number.
        foreach (string tag in MainWindow._tags)
        {
            Preview b = new Preview();
            PreviewItem item3 = new PreviewItem();
            MainWindow.curtid = MainWindow.getTagNo(tag);
            item3.settid(MainWindow.curtid);
            item3.setwid(MainWindow.curwid);
            item3.setpreview((string)MainWindow.tagPreview[tag]);
            b.savePreview(item3);
        }
    }
}