// Completion callback for the asynchronous "fetch list pages" stage.
// Collects the stage result via EndInvoke; when the run was not cancelled it starts the
// next stage (ProcessArticlePages), otherwise it reports where the run stopped.
// In both cases the elapsed time and the number of list pages are written to tboxStatistics.
//
// itfAR: the IAsyncResult handed to the callback by CollectProcess.BeginInvoke.
//
// Delegate BeginInvoke callbacks run on a thread-pool thread, so the text box is never
// touched directly here: the report text is built first and then marshalled to the
// thread that owns tboxStatistics.
// NOTE(review): on cancellation this method does not re-enable btnCoTest/btnSaveCoConfig
// (only ProcessCollectArticlesComplete does in the visible code) - confirm that some
// other handler restores the form.
private void ProcessListPagesComplete(IAsyncResult itfAR)
{
    CollectProcess collectProcessListPages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
    ArticleCollectOffline collectOffline = collectProcessListPages.EndInvoke(itfAR);

    var report = new System.Text.StringBuilder();

    if (collectOffline.CancelException == null)
    {
        // Not cancelled: chain into the next stage (collect the article URL set).
        CollectProcess collectProcessArticlePages = new CollectProcess(ProcessArticlePages);
        collectProcessArticlePages.BeginInvoke(collectOffline, ProcessArticlePagesComplete, null);
    }
    else
    {
        report.AppendFormat("终止获取列表页:{0} \n", collectOffline.CancelException.Message);
        report.AppendFormat("当前获取列表页位置:{0}\n", collectOffline.CurrentProcessedListPages);
        report.AppendFormat("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]);
    }

    // Statistics for the list-page stage.
    List<string> listPages = collectOffline.ListPages;
    report.AppendFormat("获取列表文档所花时间:{0}\n", swGlobal.ElapsedMilliseconds);
    report.AppendFormat("本次获取列表页面数:{0}\n", listPages.Count);

    // A single AppendText of the concatenated text yields the same content as the
    // original sequence of AppendText calls.
    string statisticsText = report.ToString();
    MethodInvoker updateUi = delegate { tboxStatistics.AppendText(statisticsText); };

    if (tboxStatistics.InvokeRequired)
    {
        tboxStatistics.BeginInvoke(updateUi);
    }
    else
    {
        updateUi();
    }
}
// Click handler for the "test collect" button.
// Switches to the test tab, validates the collect configuration and, when it is valid,
// builds an ArticleCollectOffline from the configured fields and starts the first
// asynchronous stage (ProcessListPages). The test/save buttons are disabled and a
// SelectedIndexChanged handler is attached while the test run is in progress.
//
// sender / e: standard WinForms event arguments (unused).
private void btnCoTest_Click(object sender, EventArgs e)
{
    tabctrCoform.SelectedTab = tabCoTest;

    if (!validateCoConfig())
    {
        MessageBox.Show("采集规则未填写完整或未正确填写,请重新填写并保存!");
        return;
    }

    // A test run ignores the configured _startPageNumber/_stopPageNumber and uses a
    // fixed, small page range so the test finishes quickly (stops at page 5).
    int listStartPageNum = 2;
    int listStopPageNum = 5;
    int arcSubPageStartNum = int.Parse(_arcSubPageStartNum);

    cancelToken = new CancellationTokenSource();

    // Restart (not Start): Start would resume the stopwatch stopped by the previous
    // test run, so elapsed times would accumulate across runs.
    swGlobal.Restart();

    // Each optional multi-line setting is taken from its text box only when the
    // corresponding configuration field is non-blank; otherwise an empty list is used.
    List<string> moreListPages = string.IsNullOrWhiteSpace(_moreListPages)
        ? new List<string>()
        : tboxMoreListPages.Lines.ToList<string>();
    List<string> subNodeParams = string.IsNullOrWhiteSpace(_subNodeParams)
        ? new List<string>()
        : tboxSubNodeParams.Lines.ToList<string>();
    List<string> regexParams = string.IsNullOrWhiteSpace(_regexParams)
        ? new List<string>()
        : tboxRegexParams.Lines.ToList<string>();

    ArticleCollectOffline collectOffline = new ArticleCollectOffline(
        _cid, _listPath, listStartPageNum, listStopPageNum,
        _xpathArcurlNode, _xpathTitleNode, _xpathContentNode,
        subNodeParams, regexParams, _arcSubPageSymbol, arcSubPageStartNum);

    // moreListPages is never null here, so the list is always handed over (possibly empty).
    collectOffline.AddListPages(moreListPages);
    collectOffline.CancelToken = cancelToken;

    CollectProcess collectProcessListPages = new CollectProcess(ProcessListPages);
    collectProcessListPages.BeginInvoke(collectOffline, ProcessListPagesComplete, null);

    // Lock the form: it must not be operated while the test collect is running.
    tabctrCoform.SelectedIndexChanged += TabctrCoform_SelectedIndexChanged;
    btnCoTest.Enabled = false;
    btnSaveCoConfig.Enabled = false;
}
// Completion callback for the asynchronous "collect articles" stage (the last stage).
// Stops the global stopwatch, writes the run statistics to tboxStatistics, dumps every
// collected article (ordered by its "title" entry) to tboxArticlesContent and finally
// re-enables the form controls that btnCoTest_Click disabled.
//
// itfAR: the IAsyncResult handed to the callback by CollectProcess.BeginInvoke.
//
// Delegate BeginInvoke callbacks run on a thread-pool thread, so all text is prepared
// here and every control access is marshalled to the thread that owns the controls.
private void ProcessCollectArticlesComplete(IAsyncResult itfAR)
{
    CollectProcess collectProcessCollectArticles = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
    ArticleCollectOffline collectOffline = collectProcessCollectArticles.EndInvoke(itfAR);
    swGlobal.Stop();

    // Text written before the error list.
    var header = new System.Text.StringBuilder();
    header.AppendFormat("swGlobal ElapsedMilliseconds: {0} \n", swGlobal.ElapsedMilliseconds);
    if (collectOffline.CancelException != null)
    {
        header.AppendFormat("当前采集文章数:{0}\n", collectOffline.CurrentProcessedArticles);
        header.AppendFormat("此次总共需要采集文章数:{0}\n", collectOffline.CancelException.Data["TotalArticles"]);
    }

    List<Dictionary<string, string>> articles = collectOffline.Articles;
    List<Exception> coException = collectOffline.CoException;

    // Text written after the error list.
    var summary = new System.Text.StringBuilder();
    summary.AppendFormat("采集文章总数:{0} \n", articles.Count);
    summary.AppendFormat("采集所耗时间 :{0} \n", swGlobal.ElapsedMilliseconds);
    summary.Append("-----------------------------------------------------------------------------------\n");

    // Article dump: every key/value pair of every article, articles sorted by title.
    var dump = new System.Text.StringBuilder();
    var arcList = from d in articles orderby d["title"] ascending select d;
    foreach (Dictionary<string, string> article in arcList)
    {
        foreach (KeyValuePair<string, string> kvp in article)
        {
            dump.Append(kvp.Key).Append(": \n");
            dump.Append(kvp.Value).Append("\n");
        }
        dump.Append("---------------------------------------------\n");
    }

    string headerText = header.ToString();
    string summaryText = summary.ToString();
    string articlesText = dump.ToString();

    MethodInvoker updateUi = delegate
    {
        try
        {
            tboxStatistics.AppendText(headerText);
            // printErrors is defined elsewhere in the file; it is called at the same
            // point of the output sequence as before, now on the UI thread.
            printErrors(coException);
            tboxStatistics.AppendText(summaryText);
            tboxArticlesContent.AppendText(articlesText);
        }
        finally
        {
            // Always make the form operable again, even if writing the report failed.
            // This now runs on the owning thread, so the former blanket
            // catch (Exception) { } that silently swallowed failures here is not needed.
            btnSaveCoConfig.Enabled = true;
            btnCoTest.Enabled = true;
            tabctrCoform.SelectedIndexChanged -= TabctrCoform_SelectedIndexChanged;
        }
    };

    if (tboxStatistics.InvokeRequired)
    {
        tboxStatistics.BeginInvoke(updateUi);
    }
    else
    {
        updateUi();
    }
}
// Completion callback for the asynchronous "collect article URLs" stage.
// Collects the stage result via EndInvoke; when the run was not cancelled it starts the
// next stage (ProcessCollectArticles), otherwise it reports where the run stopped.
// In both cases the elapsed time goes to tboxStatistics and the "arcpath" of every
// correct / wrong article page entry is listed in tboxArticlesPages.
//
// itfAR: the IAsyncResult handed to the callback by CollectProcess.BeginInvoke.
//
// Delegate BeginInvoke callbacks run on a thread-pool thread, so all text is prepared
// here and the text boxes are only touched on the thread that owns them.
// NOTE(review): on cancellation this method does not re-enable btnCoTest/btnSaveCoConfig
// (only ProcessCollectArticlesComplete does in the visible code) - confirm that some
// other handler restores the form.
private void ProcessArticlePagesComplete(IAsyncResult itfAR)
{
    CollectProcess collectProcessArticlePages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
    ArticleCollectOffline collectOffline = collectProcessArticlePages.EndInvoke(itfAR);

    var statistics = new System.Text.StringBuilder();

    if (collectOffline.CancelException == null)
    {
        // Not cancelled: chain into the next stage (collect the article contents).
        CollectProcess collectProcessCollectArticles = new CollectProcess(ProcessCollectArticles);
        collectProcessCollectArticles.BeginInvoke(collectOffline, ProcessCollectArticlesComplete, null);
    }
    else
    {
        statistics.AppendFormat("终止获取列表页:{0} \n", collectOffline.CancelException.Message);
        statistics.AppendFormat("当前处理列表页位置:{0}\n", collectOffline.CurrentProcessedListPages);
        statistics.AppendFormat("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]);
        statistics.AppendFormat("当前处理文章链接数:{0}\n", collectOffline.CurrentGetArticlePages);
    }

    // URL listing: links queued for collection first, then links whose content
    // could not be matched.
    var pages = new System.Text.StringBuilder();
    pages.Append("待采集文章链接:\n");
    foreach (Dictionary<string, string> item in collectOffline.CorrectArticlePages)
    {
        pages.Append(item["arcpath"]).Append("\n");
    }
    pages.Append("-------------------------------------------------------------------------------\n");
    pages.Append("未能正确匹配内容链接,请检查匹配XPATH规则: \n");
    foreach (Dictionary<string, string> item in collectOffline.WrongArticlePages)
    {
        pages.Append(item["arcpath"]).Append("\n");
    }

    statistics.AppendFormat("获取文章URL集合所花时间: {0}\n", swGlobal.ElapsedMilliseconds);

    // One AppendText per text box yields the same content as the original sequence
    // of AppendText calls.
    string statisticsText = statistics.ToString();
    string pagesText = pages.ToString();
    MethodInvoker updateUi = delegate
    {
        tboxStatistics.AppendText(statisticsText);
        tboxArticlesPages.AppendText(pagesText);
    };

    if (tboxStatistics.InvokeRequired)
    {
        tboxStatistics.BeginInvoke(updateUi);
    }
    else
    {
        updateUi();
    }
}