/// <summary>
/// Inserts one row into XueBa.dbo.WebPage_Tag using the tid, wid and preview values
/// of <paramref name="item"/>.
/// </summary>
/// <param name="item">Source of the three column values; must not be null.</param>
/// <returns>
/// true when the INSERT ran without throwing; false when creating or executing the
/// command threw, in which case the exception's stack trace is written to the console.
/// </returns>
public bool savePreview(PreviewItem item)
{
    // Values are bound as parameters rather than formatted into the SQL text, so a
    // quote inside the preview can neither break nor alter the statement.
    const string insertSQL = "INSERT INTO XueBa.dbo.WebPage_Tag(tid,wid,preview) VALUES(@tid,@wid,@preview)";

    // Read the values before obtaining a connection so that a null item fails
    // up front, exactly as the formatted-string version did.
    object tid = item.gettid();
    object wid = item.getwid();
    object preview = item.getpreview();

    SqlConnection con = Connection.instance(
        AppConfiguration.GetConfigValue("serverIp"),
        "crawler",
        AppConfiguration.GetConfigValue("username"),
        AppConfiguration.GetConfigValue("password"));
    try
    {
        using (SqlCommand command = con.CreateCommand())
        {
            command.CommandText = insertSQL;
            // string.Format rendered a null value as an empty string; keep that
            // behaviour, because a null parameter value would be rejected.
            command.Parameters.AddWithValue("@tid", tid ?? string.Empty);
            command.Parameters.AddWithValue("@wid", wid ?? string.Empty);
            command.Parameters.AddWithValue("@preview", preview ?? string.Empty);
            command.ExecuteNonQuery();
        }
        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine(e.StackTrace);
        return false;
    }
    finally
    {
        // Close on the failure path too; previously only the success path closed it.
        con.Close();
    }
}
/// <summary>
/// Runs a SELECT on XueBa.dbo.WebPage for rows whose title equals <paramref name="title"/>
/// and maps each row onto a PreviewItem.
/// </summary>
/// <param name="title">Exact title to match; null is treated as an empty string.</param>
/// <returns>An ArrayList of PreviewItem objects, empty when no row matches.</returns>
public ArrayList getPreview(string title = "")
{
    // The title is bound as a parameter so that quotes in it cannot change the query.
    const string selectSQL = "SELECT * FROM XueBa.dbo.WebPage WHERE title=@title";
    ArrayList record = new ArrayList();

    SqlConnection con = Connection.instance(
        AppConfiguration.GetConfigValue("serverIp"),
        "crawler",
        AppConfiguration.GetConfigValue("username"),
        AppConfiguration.GetConfigValue("password"));
    try
    {
        using (SqlCommand command = con.CreateCommand())
        {
            command.CommandText = selectSQL;
            // string.Format rendered null as ""; keep matching the empty title in that case.
            command.Parameters.AddWithValue("@title", title ?? string.Empty);

            using (SqlDataReader reader = command.ExecuteReader())
            {
                // NOTE(review): columns are read by position (0 = wid, 1 = tid, 2 = preview)
                // from a SELECT * on WebPage - confirm the table and column order are intended.
                while (reader.Read())
                {
                    PreviewItem row = new PreviewItem();
                    row.setwid(reader.GetInt32(0));
                    row.settid(reader.GetInt32(1));
                    row.setpreview(reader.GetString(2));
                    record.Add(row);
                }
            }
        }
    }
    finally
    {
        // Close even when the query or a column read throws; the exception still propagates.
        con.Close();
    }

    return record;
}
/// <summary>
/// Click handler: saves a page built from the form fields, then every tag that is not
/// yet in MainWindow._occurredTags, then one preview row per tag.
/// </summary>
private void button3_Click(object sender, EventArgs e) // Denoising
{
    // Copy the form text into the corresponding fields.
    pagetitle = textBox1.Text;
    author = textBox2.Text;
    postdate = textBox3.Text;

    // Build the page row; its number is one past the current page number.
    Page pageStore = new Page();
    PageItem pageRow = new PageItem();
    int pageNo = MainWindow.getWebpageNo() + 1;
    pageRow.setwid(pageNo);
    pageRow.settitle(pagetitle);
    pageRow.setauthor(author);
    pageRow.setpostdate(postdate);
    pageRow.setlink(textBox5.Text);
    pageRow.setreferred("");
    pageStore.savePage(pageRow);

    // Save to table TAGS: only tags that have not occurred before.
    foreach (string tag in tags)
    {
        if (MainWindow._occurredTags.Contains(tag))
        {
            continue;
        }

        Tag tagStore = new Tag();
        TagItem tagRow = new TagItem();
        tagRow.settid(MainWindow.getTagNo() + 1);
        tagRow.setname(tag);
        tagStore.saveTag(tagRow);
    }

    // Second pass: link every tag to the page together with its preview text.
    foreach (string tag in tags)
    {
        Preview previewStore = new Preview();
        PreviewItem previewRow = new PreviewItem();
        previewRow.settid(MainWindow.getTagNo(tag));
        previewRow.setwid(pageNo);
        previewRow.setpreview((string)tagpreviews[tag]);
        previewStore.savePreview(previewRow);
    }
}
/// <summary>
/// Processes a saved page: denoises and segments it, updates the shared word-frequency
/// table, runs data mining, saves the page row, then reads the title and the "thumb"
/// image alt texts from the HTML as tags and saves the tags and their preview rows.
/// </summary>
/// <param name="path">Path of the HTML file to denoise and to load for tag extraction.</param>
/// <param name="link">Not used by this method; the stored link comes from MainWindow.curUrl.</param>
public static void msdn_channel9process(string path, string link)
{
    Denoising de = new Denoising();
    de.Path = path;
    de.Work(); // after Work() the raw and denoised data are available
    MainWindow._rawdata = de.Rawdata;
    MainWindow._denoisingdata = de.Denoisingdata;
    if (MainWindow._denoisingdata == null)
    {
        // A null result suggests an encoding or similar problem; continue with empty text.
        MainWindow._denoisingdata = "";
    }

    // Segment the denoised text into words and count each occurrence.
    WordSegment ws = new WordSegment();
    MainWindow._wordSegmentResult = ws.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string word in MainWindow._wordSegmentResult)
    {
        if (MainWindow.wordFreq.ContainsKey(word))
        {
            int freq = (int)MainWindow.wordFreq[word];
            MainWindow.wordFreq[word] = freq + 1;
        }
        else
        {
            MainWindow.wordFreq.Add(word, 1);
        }
    }

    // Start from empty tag containers for this page.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    // Data mining over the raw and denoised data.
    DataMining dm = new DataMining();
    dm.Rawdata = MainWindow._rawdata;
    dm.Denoisingdata = MainWindow._denoisingdata;
    dm.Work();

    // Title, author and post date as mined (the original author notes these are not very accurate).
    MainWindow._title = dm.Title;
    MainWindow._author = dm.Author;
    MainWindow._postdate = dm.Postdate;

    // Save the page row under the next page number.
    Page pg = new Page();
    PageItem pgitem = new PageItem();
    MainWindow.curwid = MainWindow.getWebpageNo() + 1;
    pgitem.setwid(MainWindow.curwid);
    pgitem.settitle(MainWindow._title);
    pgitem.setauthor(MainWindow._author);
    pgitem.setpostdate(MainWindow._postdate);
    pgitem.setlink(MainWindow.curUrl);
    pgitem.setreferred("");
    pg.savePage(pgitem);

    HtmlDocument doc = new HtmlDocument();
    try
    {
        doc.Load(path, Encoding.UTF8);
    }
    catch (Exception ex)
    {
        // Still best-effort (no tags are saved for this page), but no longer silent.
        Console.WriteLine(ex.StackTrace);
        return;
    }

    HtmlNode rootNode = doc.DocumentNode;
    string CategoryListXPath = "//title|//img";
    HtmlNodeCollection categoryNodeList = rootNode.SelectNodes(CategoryListXPath);

    // SelectNodes yields null rather than an empty collection when nothing matches,
    // so a page without <title> or <img> must skip the loop instead of throwing.
    if (categoryNodeList != null)
    {
        foreach (HtmlNode child in categoryNodeList)
        {
            HtmlNode hn = HtmlNode.CreateNode(child.OuterHtml);
            HtmlNode titleNode = hn.SelectSingleNode("//title");
            if (titleNode != null)
            {
                MainWindow._title = titleNode.InnerText;
                continue;
            }

            HtmlNode thumbNode = hn.SelectSingleNode("//*[@class=\"thumb\"]");
            if (thumbNode != null)
            {
                string str = thumbNode.OuterHtml;
                // NOTE(review): the pattern consumes the closing quote, so the stored
                // tag ends with '"', and an element without alt yields an empty tag.
                // Left unchanged to stay compatible with rows already stored.
                string reg = "(?<=alt=\")[^\"]*\"";
                string key_words = Regex.Match(str, reg).Value;
                MainWindow._tags.Add(key_words);
            }
        }
    }

    // Save to table TAGS: only tags that have not occurred before.
    foreach (string tag in MainWindow._tags)
    {
        if (!MainWindow._occurredTags.Contains(tag))
        {
            Tag thistag = new Tag();
            TagItem tgitem = new TagItem();
            tgitem.settid(MainWindow.getTagNo() + 1);
            MainWindow._occurredTags.Add(tag);
            tgitem.setname(tag);
            thistag.saveTag(tgitem);
        }
    }

    // Link every tag to the page together with its preview text.
    foreach (string tag in MainWindow._tags)
    {
        Preview b = new Preview();
        PreviewItem item3 = new PreviewItem();
        MainWindow.curtid = MainWindow.getTagNo(tag);
        item3.settid(MainWindow.curtid);
        item3.setwid(MainWindow.curwid);
        item3.setpreview((string)MainWindow.tagPreview[tag]);
        b.savePreview(item3);
    }
}
/// <summary>
/// Denoises the file at <paramref name="path"/>, segments the result into words, updates
/// the shared word-frequency table and runs data mining; optionally stores the outcome.
/// </summary>
/// <param name="path">File path handed to the denoising step.</param>
/// <param name="needWriteToDB">
/// When true, the page row, any new tags and the tag previews are saved, and
/// MainWindow.insertintosolr is called with the page data.
/// </param>
public static void Process(string path, bool needWriteToDB)
{
    // Denoise the file; Work() makes the raw and the denoised data available.
    Denoising denoiser = new Denoising();
    denoiser.Path = path;
    denoiser.Work();
    MainWindow._rawdata = denoiser.Rawdata;
    MainWindow._denoisingdata = denoiser.Denoisingdata;
    if (MainWindow._denoisingdata == null)
    {
        // A null result suggests an encoding or similar problem; fall back to empty text.
        MainWindow._denoisingdata = "";
    }

    // Segment the denoised text and tally every word. The same segmenter is used for
    // all pages; an earlier language-specific variant is no longer part of this method.
    WordSegment segmenter = new WordSegment();
    MainWindow._wordSegmentResult = segmenter.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string word in MainWindow._wordSegmentResult)
    {
        if (!MainWindow.wordFreq.ContainsKey(word))
        {
            MainWindow.wordFreq.Add(word, 1);
        }
        else
        {
            int seen = (int)MainWindow.wordFreq[word];
            MainWindow.wordFreq[word] = seen + 1;
        }
    }

    // Data mining: empty the two shared containers first.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    DataMining miner = new DataMining();
    miner.Rawdata = MainWindow._rawdata;
    miner.Denoisingdata = MainWindow._denoisingdata;
    miner.Work();

    // Title, author and post date as mined (noted by the original author as not very accurate).
    MainWindow._title = miner.Title;
    MainWindow._author = miner.Author;
    MainWindow._postdate = miner.Postdate;

    // Everything below is persistence only.
    if (!needWriteToDB)
    {
        return;
    }

    // Save to table PAGE under the next page number.
    Page pageStore = new Page();
    PageItem pageRow = new PageItem();
    MainWindow.curwid = MainWindow.getWebpageNo() + 1;
    pageRow.setwid(MainWindow.curwid);
    pageRow.settitle(MainWindow._title);
    pageRow.setauthor(MainWindow._author);
    pageRow.setpostdate(MainWindow._postdate);
    pageRow.setlink(MainWindow.curUrl);
    pageRow.setreferred("");
    pageStore.savePage(pageRow);

    string body = dealhtml(MainWindow._rawdata);
    string keywords = "";

    // Save to table TAGS: store tags not seen before; every tag is appended to the keywords.
    foreach (string tag in MainWindow._tags)
    {
        bool alreadyKnown = MainWindow._occurredTags.Contains(tag);
        if (!alreadyKnown)
        {
            Tag tagStore = new Tag();
            TagItem tagRow = new TagItem();
            tagRow.settid(MainWindow.getTagNo() + 1);
            MainWindow._occurredTags.Add(tag);
            tagRow.setname(tag);
            tagStore.saveTag(tagRow);
        }

        keywords = keywords + tag;
    }

    MainWindow.insertintosolr(MainWindow._title, "", MainWindow.curUrl, MainWindow._postdate, keywords, 0, body);

    // Link every tag to the page together with its preview text.
    foreach (string tag in MainWindow._tags)
    {
        Preview previewStore = new Preview();
        PreviewItem previewRow = new PreviewItem();
        MainWindow.curtid = MainWindow.getTagNo(tag);
        previewRow.settid(MainWindow.curtid);
        previewRow.setwid(MainWindow.curwid);
        previewRow.setpreview((string)MainWindow.tagPreview[tag]);
        previewStore.savePreview(previewRow);
    }
}
/// <summary>
/// Variant of the page pipeline for content that is already extracted: segments the
/// supplied text, updates the shared word-frequency table, runs data mining and,
/// on request, saves the page, its new tags and the tag previews.
/// </summary>
/// <param name="doc">Stored as MainWindow._rawdata and passed to data mining as the raw data.</param>
/// <param name="doc2">Stored as MainWindow._denoisingdata; null is replaced by an empty string.</param>
/// <param name="needWriteToDB">When true, the page row, new tags and preview rows are saved.</param>
public static void Processwrd(string doc, string doc2, bool needWriteToDB)
{
    MainWindow._rawdata = doc;
    MainWindow._denoisingdata = doc2;
    if (MainWindow._denoisingdata == null)
    {
        MainWindow._denoisingdata = "";
    }

    // Segment the text into words and count each occurrence.
    WordSegment ws = new WordSegment();
    MainWindow._wordSegmentResult = ws.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
    foreach (string word in MainWindow._wordSegmentResult)
    {
        if (MainWindow.wordFreq.ContainsKey(word))
        {
            int freq = (int)MainWindow.wordFreq[word];
            MainWindow.wordFreq[word] = freq + 1;
        }
        else
        {
            MainWindow.wordFreq.Add(word, 1);
        }
    }

    // Reset the shared containers, as Process does. This method used to create two
    // empty locals here instead, so the loops below never ran and the shared
    // containers kept entries from the previously processed document.
    // NOTE(review): presumably DataMining.Work fills these - confirm.
    MainWindow._tags = new List<string>();
    MainWindow.tagPreview = new Hashtable();

    DataMining dm = new DataMining();
    dm.Rawdata = MainWindow._rawdata;
    dm.Denoisingdata = MainWindow._denoisingdata;
    dm.Work();
    MainWindow._title = dm.Title;
    MainWindow._author = dm.Author;
    MainWindow._postdate = dm.Postdate;

    if (!needWriteToDB)
    {
        return;
    }

    // Save to table PAGE under the next page number.
    Page pg = new Page();
    PageItem pgitem = new PageItem();
    MainWindow.curwid = MainWindow.getWebpageNo() + 1;
    pgitem.setwid(MainWindow.curwid);
    pgitem.settitle(MainWindow._title);
    pgitem.setauthor(MainWindow._author);
    pgitem.setpostdate(MainWindow._postdate);
    pgitem.setlink(MainWindow.curUrl);
    pgitem.setreferred("");
    pg.savePage(pgitem);

    // Save to table TAGS: only tags that have not occurred before.
    foreach (string tag in MainWindow._tags)
    {
        if (!MainWindow._occurredTags.Contains(tag))
        {
            Tag thistag = new Tag();
            TagItem tgitem = new TagItem();
            tgitem.settid(MainWindow.getTagNo() + 1);
            MainWindow._occurredTags.Add(tag);
            tgitem.setname(tag);
            thistag.saveTag(tgitem);
        }
    }

    // Link every tag to the page together with its preview text.
    foreach (string tag in MainWindow._tags)
    {
        Preview b = new Preview();
        PreviewItem item3 = new PreviewItem();
        MainWindow.curtid = MainWindow.getTagNo(tag);
        item3.settid(MainWindow.curtid);
        item3.setwid(MainWindow.curwid);
        item3.setpreview((string)MainWindow.tagPreview[tag]);
        b.savePreview(item3);
    }
}
/// <summary>
/// Forwards <paramref name="item"/> to mypre.savePreview and hands back its result unchanged.
/// </summary>
/// <param name="item">Preview row to pass through.</param>
/// <returns>Whatever mypre.savePreview returns for the item.</returns>
public bool savaPreview(PreviewItem item)
{
    bool saved = mypre.savePreview(item);
    return saved;
}