/// <summary>
/// Completion callback for the asynchronous "fetch list pages" stage.
/// Retrieves the stage result, starts the next asynchronous stage (fetching the
/// article URL set) unless the run was cancelled, and appends list-page statistics
/// to the statistics text box.
/// </summary>
/// <param name="itfAR">Async result handed to this callback by the delegate's BeginInvoke.</param>
private void ProcessListPagesComplete(IAsyncResult itfAR)
{
    // A delegate BeginInvoke callback is raised on a thread-pool thread, but WinForms
    // controls may only be touched from the thread that created them. Re-dispatch the
    // whole callback onto the UI thread before reading the result and updating controls.
    if (tboxStatistics.InvokeRequired)
    {
        tboxStatistics.BeginInvoke(new AsyncCallback(ProcessListPagesComplete), itfAR);
        return;
    }

    // Recover the delegate that was invoked asynchronously and collect its return value.
    CollectProcess collectProcessListPages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
    ArticleCollectOffline collectOffline = collectProcessListPages.EndInvoke(itfAR);

    if (collectOffline.CancelException == null)
    {
        // Not cancelled: continue asynchronously with the next step (fetch the article URL set).
        CollectProcess collectProcessArticlePages = new CollectProcess(ProcessArticlePages);
        collectProcessArticlePages.BeginInvoke(collectOffline, ProcessArticlePagesComplete, null);
    }
    else
    {
        // Cancelled: report why and how far the list-page stage got.
        tboxStatistics.AppendText(string.Format("终止获取列表页:{0} \n", collectOffline.CancelException.Message));
        tboxStatistics.AppendText(string.Format("当前获取列表页位置:{0}\n", collectOffline.CurrentProcessedListPages));
        tboxStatistics.AppendText(string.Format("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]));
    }

    // Output information about the list pages (printed in both the normal and the cancelled case).
    List<string> listPages = collectOffline.ListPages;
    tboxStatistics.AppendText(string.Format("获取列表文档所花时间:{0}\n", swGlobal.ElapsedMilliseconds));
    tboxStatistics.AppendText(string.Format("本次获取列表页面数:{0}\n", listPages.Count));
}
/// <summary>
/// Click handler of the "test collection" button. Switches to the test tab, validates
/// the collection rules, builds an <c>ArticleCollectOffline</c> job from the current
/// configuration and starts the first asynchronous stage (fetching the list pages).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnCoTest_Click(object sender, EventArgs e)
{
    tabctrCoform.SelectedTab = tabCoTest;

    if (!validateCoConfig())
    {
        MessageBox.Show("采集规则未填写完整或未正确填写,请重新填写并保存!");
        return;
    }

    // This is a test run: instead of parsing the configured _startPageNumber/_stopPageNumber,
    // the list-page range is fixed (collecting only up to page 5) to keep the test fast.
    int listStartPageNum = 2;
    int listStopPageNum = 5;
    int arcSubPageStartNum = int.Parse(_arcSubPageStartNum);

    cancelToken = new CancellationTokenSource();

    // Restart rather than Start: Start() resumes from the previously accumulated time,
    // so a second test run would report the elapsed time of all earlier runs as well.
    swGlobal.Restart();

    // Optional multi-line settings are read line by line from their text boxes;
    // when the corresponding setting is blank an empty list is passed instead.
    List<string> moreListPages = new List<string>();
    if (!string.IsNullOrWhiteSpace(_moreListPages))
    {
        moreListPages = tboxMoreListPages.Lines.ToList<string>();
    }

    List<string> subNodeParams = new List<string>();
    if (!string.IsNullOrWhiteSpace(_subNodeParams))
    {
        subNodeParams = tboxSubNodeParams.Lines.ToList<string>();
    }

    List<string> regexParams = new List<string>();
    if (!string.IsNullOrWhiteSpace(_regexParams))
    {
        regexParams = tboxRegexParams.Lines.ToList<string>();
    }

    ArticleCollectOffline collectOffline = new ArticleCollectOffline(
        _cid,
        _listPath,
        listStartPageNum,
        listStopPageNum,
        _xpathArcurlNode,
        _xpathTitleNode,
        _xpathContentNode,
        subNodeParams,
        regexParams,
        _arcSubPageSymbol,
        arcSubPageStartNum);

    // moreListPages is never null here (it is always initialised above).
    collectOffline.AddListPages(moreListPages);
    collectOffline.CancelToken = cancelToken;

    // Disable the form before the background work starts; the form must not be
    // operated while a test collection is running.
    // Unsubscribe first so repeated clicks do not stack duplicate handler registrations.
    tabctrCoform.SelectedIndexChanged -= TabctrCoform_SelectedIndexChanged;
    tabctrCoform.SelectedIndexChanged += TabctrCoform_SelectedIndexChanged;
    btnCoTest.Enabled = false;
    btnSaveCoConfig.Enabled = false;

    // Kick off the first asynchronous stage; ProcessListPagesComplete chains the next one.
    CollectProcess collectProcessListPages = new CollectProcess(ProcessListPages);
    collectProcessListPages.BeginInvoke(collectOffline, ProcessListPagesComplete, null);
}
/// <summary>
/// Completion callback for the asynchronous "fetch article URL set" stage.
/// Retrieves the stage result, starts the next asynchronous stage (collecting the
/// article contents) unless the run was cancelled, and prints the matched and
/// unmatched article links.
/// </summary>
/// <param name="itfAR">Async result handed to this callback by the delegate's BeginInvoke.</param>
private void ProcessArticlePagesComplete(IAsyncResult itfAR)
{
    // A delegate BeginInvoke callback is raised on a thread-pool thread, but WinForms
    // controls may only be touched from the thread that created them. Re-dispatch the
    // whole callback onto the UI thread before reading the result and updating controls.
    if (tboxStatistics.InvokeRequired)
    {
        tboxStatistics.BeginInvoke(new AsyncCallback(ProcessArticlePagesComplete), itfAR);
        return;
    }

    // Recover the delegate that was invoked asynchronously and collect its return value.
    CollectProcess collectProcessArticlePages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
    ArticleCollectOffline collectOffline = collectProcessArticlePages.EndInvoke(itfAR);

    if (collectOffline.CancelException == null)
    {
        // Not cancelled: continue asynchronously with the next step (collect article contents).
        CollectProcess collectProcessCollectArticles = new CollectProcess(ProcessCollectArticles);
        collectProcessCollectArticles.BeginInvoke(collectOffline, ProcessCollectArticlesComplete, null);
    }
    else
    {
        // Cancelled: report why and how far this stage got.
        // NOTE(review): the first message still says "list pages" although this is the
        // article-URL stage — possibly a copy-paste; left unchanged, confirm intent.
        tboxStatistics.AppendText(string.Format("终止获取列表页:{0} \n", collectOffline.CancelException.Message));
        tboxStatistics.AppendText(string.Format("当前处理列表页位置:{0}\n", collectOffline.CurrentProcessedListPages));
        tboxStatistics.AppendText(string.Format("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]));
        tboxStatistics.AppendText(string.Format("当前处理文章链接数:{0}\n", collectOffline.CurrentGetArticlePages));
    }

    // Extract the "arcpath" entry of every record up front, so a record without that
    // key fails before anything is written to the text boxes.
    List<string> correctListArticles = new List<string>();
    foreach (Dictionary<string, string> item in collectOffline.CorrectArticlePages)
    {
        correctListArticles.Add(item["arcpath"]);
    }

    List<string> wrongListArticles = new List<string>();
    foreach (Dictionary<string, string> item in collectOffline.WrongArticlePages)
    {
        wrongListArticles.Add(item["arcpath"]);
    }

    // Output the URL-set information (printed in both the normal and the cancelled case).
    tboxStatistics.AppendText(string.Format("获取文章URL集合所花时间: {0}\n", swGlobal.ElapsedMilliseconds));

    tboxArticlesPages.AppendText("待采集文章链接:\n");
    AppendArticlePaths(correctListArticles);

    tboxArticlesPages.AppendText("-------------------------------------------------------------------------------\n");
    tboxArticlesPages.AppendText("未能正确匹配内容链接,请检查匹配XPATH规则: \n");
    AppendArticlePaths(wrongListArticles);
}

/// <summary>
/// Appends each article path on its own line to the article-pages text box.
/// </summary>
/// <param name="articlePaths">Paths to print, in order.</param>
private void AppendArticlePaths(List<string> articlePaths)
{
    foreach (string path in articlePaths)
    {
        tboxArticlesPages.AppendText(string.Format("{0}\n", path));
    }
}