/// <summary>
/// Handles the form's Load event: reads every row of crawler.dbo.fileinfo
/// (ordered by filepath), stores each row in <c>_web</c> as a <c>WebInDB</c>,
/// adds its file path to <c>filelistbox</c> and shows the total count in
/// <c>totalfilebox</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void MainWindow_Load(object sender, EventArgs e)
{
    // For test and presentation
    // Add all htm or html files found under the crawler download folder
    /*
     * Regex re = new Regex(@"(?:\.htm|\.html)$", RegexOptions.IgnoreCase);
     *
     * DirectoryInfo TheFolder=new DirectoryInfo(_filepath);
     * foreach (FileInfo NextFile in TheFolder.GetFiles())
     * {
     *     if (re.IsMatch(NextFile.Name))
     *     {
     *         this.filelistbox.Items.Add(NextFile.Name);
     *     }
     * }
     *
     * this.totalfilebox.Text = this.filelistbox.Items.Count.ToString();
     *
     */

    // DB OPERATION
    // Fetch the web pages that the Crawler stored in the database and, as above,
    // put their links into the listbox.
    // The Process method has to be adapted for this: there is no local file, so the
    // _rawdata of the Denoising instance should be assigned directly from the database.
    // Do not delete the original code; just comment it out.
    const string selectSQL = "SELECT id, myUrl, filepath, isDeal FROM crawler.dbo.fileinfo ORDER BY filepath";
    //MessageBox.Show(selectSQL);

    SqlConnection con = Connection.instance(
        AppConfiguration.GetConfigValue("serverIp"),
        "crawler",
        AppConfiguration.GetConfigValue("username"),
        AppConfiguration.GetConfigValue("password"));
    try
    {
        using (SqlCommand command = con.CreateCommand())
        {
            command.CommandText = selectSQL;
            using (SqlDataReader reader = command.ExecuteReader())
            {
                while (reader.Read())
                {
                    // reader[0]: id, reader[1]: url, reader[2]: filepath, reader[3]: isDeal
                    // ProcessSingle has a dedicated branch for a null Filepath, so a NULL
                    // column is read as null instead of failing the cast on DBNull.
                    string filepath = reader.IsDBNull(2) ? null : (string)reader[2];

                    WebInDB widb = new WebInDB((int)reader[0], (string)reader[1], filepath, (int)reader[3]);
                    _web.Add(widb);

                    // Items.Add rejects null; an empty entry keeps the listbox
                    // positions aligned with the entries of _web.
                    this.filelistbox.Items.Add(filepath ?? string.Empty);
                }
            }
        }
    }
    finally
    {
        // Close (not Dispose) as the original did: Connection.instance may hand out
        // a shared connection object. Running it in finally also covers failures.
        con.Close();
    }

    this.totalfilebox.Text = this.filelistbox.Items.Count.ToString();
}
/// <summary>
/// Processes one crawled file record (an html/htm page or a pdf).
/// </summary>
/// <remarks>
/// Only records with <c>IsDeal == 0</c> are handled:
/// <list type="bullet">
/// <item>no file path: counted in <c>cannotDealNo</c> and isDeal is set to 2 in crawler.dbo.fileinfo;</item>
/// <item>no fileinfo row for the path: <c>updateData(2, id)</c> is called and <c>cannotDealNo</c> is incremented;</item>
/// <item>the URL already appears in XueBa.dbo.WebPage or XueBa.dbo.c705questions: nothing is done;</item>
/// <item>otherwise the file is run through the matching <c>ProcessProcedure</c> routine,
/// <c>dealNo</c> is incremented and <c>updateData(1, id)</c> is called.</item>
/// </list>
/// </remarks>
/// <param name="web">The crawler record to process.</param>
public static void ProcessSingle(WebInDB web)
{
    if (web.IsDeal != 0)
    {
        return;
    }

    if (web.Filepath == null)
    {
        cannotDealNo++;
        MarkRecordWithoutFile(web);
        return;
    }

    String curPath = web.Filepath;

    string url = FindCrawledUrl(curPath);
    if (url == null)
    {
        // No fileinfo row matches this path.
        updateData(2, web.Id);
        cannotDealNo++;
        return;
    }

    curUrl = url;

    // Skip pages whose link is already stored in either XueBa table.
    bool alreadyStored =
        LinkExists("SELECT * FROM XueBa.dbo.WebPage WHERE link = @link", curUrl) ||
        LinkExists("SELECT * FROM XueBa.dbo.c705questions WHERE link like @link", curUrl);
    if (alreadyStored)
    {
        return;
    }

    dealNo++;

    // Re-root the stored path onto the shared resources folder.
    // NOTE(review): a path that does not contain "XueBaResources" makes the [1]
    // index throw IndexOutOfRangeException (unchanged from the original).
    curPath = Regex.Split(curPath, "XueBaResources")[1];
    curPath = @"\\10.2.28.78\XueBaResources" + curPath;

    FileInfo temp = new FileInfo(curPath);
    // NOTE(review): the extension comparison is case-sensitive, so ".PDF" takes the html path.
    if (temp.Extension.Equals(".pdf"))
    {
        //PipeLine.OtherToHtml.IcDocument document = PipeLine.OtherToHtml.FactoryDocument.GetDocoment(@"F:\ziliao\C705\Pipeline\text", curPath);
        PipeLine.OtherToHtml.IcDocument document =
            PipeLine.OtherToHtml.FactoryDocument.GetDocoment(@"\\10.2.28.78\XueBaResources", curPath);
        document.TransformDocument();
        //ProcessProcedure.Processpdf(@"F:\ziliao\C705\Pipeline\text\" + temp.Name + ".html", true);
        ProcessProcedure.Processpdf(@"\\10.2.28.78\XueBaResources\" + temp.Name + ".html", true);
    }
    else if (ProcessProcedure.baiduzhidao(curUrl))
    {
        ProcessProcedure.baiduzhidaoprocess(curPath, curUrl);
    }
    else if (ProcessProcedure.cnblogs(curUrl))
    {
        ProcessProcedure.cnblogsprocess(curPath, curUrl);
    }
    else if (ProcessProcedure.dewen(curUrl))
    {
        ProcessProcedure.dewenprocess(curPath, curUrl);
    }
    else if (ProcessProcedure.stackoverflow(curUrl))
    {
        ProcessProcedure.stackoverflowprocess(curPath, curUrl); // process stackoverflow question answer pair
    }
    else if (ProcessProcedure.sosowenwen(curUrl))
    {
        ProcessProcedure.sosowenwenprocess(curPath, curUrl); // process sosowenwen question answer pair
    }
    else
    {
        ProcessProcedure.Process(curPath, true); //@"C:\C705\Pipeline\TestFiles\" +
    }

    updateData(1, web.Id);
}

// Returns a connection to the given database using the server and credentials from AppConfiguration.
private static SqlConnection GetConfiguredConnection(string database)
{
    return Connection.instance(
        AppConfiguration.GetConfigValue("serverIp"),
        database,
        AppConfiguration.GetConfigValue("username"),
        AppConfiguration.GetConfigValue("password"));
}

// Returns the myUrl value of the first fileinfo row whose filepath equals the argument, or null when no row matches.
private static string FindCrawledUrl(string filepath)
{
    SqlConnection con = GetConfiguredConnection("crawler");
    try
    {
        using (SqlCommand command = con.CreateCommand())
        {
            // Parameterized: crawled paths may contain apostrophes.
            command.CommandText = "SELECT id, myUrl FROM crawler.dbo.fileinfo where filepath = @filepath";
            command.Parameters.AddWithValue("@filepath", filepath);
            using (SqlDataReader reader = command.ExecuteReader())
            {
                return reader.Read() ? reader[1].ToString() : null;
            }
        }
    }
    finally
    {
        // Close rather than Dispose: Connection.instance may return a shared object.
        con.Close();
    }
}

// Runs a query that takes a single @link parameter against XueBa and reports whether it returns any row.
private static bool LinkExists(string sql, string link)
{
    SqlConnection con = GetConfiguredConnection("XueBa");
    try
    {
        using (SqlCommand command = con.CreateCommand())
        {
            command.CommandText = sql;
            command.Parameters.AddWithValue("@link", link);
            using (SqlDataReader reader = command.ExecuteReader())
            {
                return reader.Read();
            }
        }
    }
    finally
    {
        con.Close();
    }
}

// Sets isDeal to 2 in crawler.dbo.fileinfo for a record that has no file path.
private static void MarkRecordWithoutFile(WebInDB web)
{
    string updateSQL = string.Format("update crawler.dbo.fileinfo set isDeal='{0}' where id = '{1}'", 2, web.Id);
    SqlConnection con = GetConfiguredConnection("crawler");
    try
    {
        using (SqlCommand cmd = con.CreateCommand())
        {
            cmd.CommandText = updateSQL;
            cmd.ExecuteNonQuery();
        }
    }
    finally
    {
        con.Close();
    }
}