Ejemplo n.º 1
0
        /// <summary>
        /// Completion callback for the asynchronous "fetch list pages" step.
        /// Retrieves the step's result, starts the next asynchronous step (collecting article URLs)
        /// unless the run was cancelled, and writes list-page statistics to <c>tboxStatistics</c>.
        /// </summary>
        /// <param name="itfAR">Async result produced by the <c>CollectProcess</c> delegate's <c>BeginInvoke</c>.</param>
        private void ProcessListPagesComplete(IAsyncResult itfAR)
        {
            // Delegate.BeginInvoke raises this callback on a thread-pool thread, but the text box
            // written below must only be touched from the thread that created it. Re-post the
            // whole callback to that thread and do the real work there.
            if (tboxStatistics.InvokeRequired)
            {
                tboxStatistics.BeginInvoke(new AsyncCallback(ProcessListPagesComplete), itfAR);
                return;
            }

            // Recover the delegate that was invoked and collect its return value.
            CollectProcess collectProcessListPages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
            ArticleCollectOffline collectOffline = collectProcessListPages.EndInvoke(itfAR);

            if (collectOffline.CancelException == null)
            {
                // Not cancelled: continue asynchronously with the next step (article URL collection).
                CollectProcess collectProcessArticlePages = new CollectProcess(ProcessArticlePages);
                collectProcessArticlePages.BeginInvoke(collectOffline, ProcessArticlePagesComplete, null);
            }
            else
            {
                // Cancelled: report why and how far the list-page step got.
                // NOTE(review): nothing here re-enables btnCoTest/btnSaveCoConfig, which are
                // disabled when a test run starts — confirm another handler restores the form.
                tboxStatistics.AppendText(string.Format("终止获取列表页:{0} \n", collectOffline.CancelException.Message));
                tboxStatistics.AppendText(string.Format("当前获取列表页位置:{0}\n", collectOffline.CurrentProcessedListPages));
                tboxStatistics.AppendText(string.Format("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]));
            }

            // Output the list-page statistics (printed in both the normal and the cancelled case).
            tboxStatistics.AppendText(string.Format("获取列表文档所花时间:{0}\n", swGlobal.ElapsedMilliseconds));
            tboxStatistics.AppendText(string.Format("本次获取列表页面数:{0}\n", collectOffline.ListPages.Count));
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Click handler of the "test collect" button. Switches to the test tab, validates the
        /// collection rules and, when they are valid, locks the form and starts an asynchronous
        /// test run over a fixed list-page range (2 to 5).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnCoTest_Click(object sender, EventArgs e)
        {
            tabctrCoform.SelectedTab = tabCoTest;

            if (!validateCoConfig())
            {
                MessageBox.Show("采集规则未填写完整或未正确填写,请重新填写并保存!");
                return;
            }

            // This is a test run, so the configured range (_startPageNumber/_stopPageNumber) is
            // deliberately not used: only collect up to page 5 to keep the test fast.
            int listStartPageNum   = 2;
            int listStopPageNum    = 5;
            int arcSubPageStartNum = int.Parse(_arcSubPageStartNum);

            cancelToken = new CancellationTokenSource();
            // Restart (not Start): Start would resume from the previous run's elapsed time,
            // so every test after the first would report accumulated durations.
            swGlobal.Restart();

            // Each optional multi-line setting is read from its text box, one entry per line;
            // an empty list is used when the corresponding saved value is blank.
            List<string> moreListPages = string.IsNullOrWhiteSpace(_moreListPages)
                ? new List<string>()
                : new List<string>(tboxMoreListPages.Lines);
            List<string> subNodeParams = string.IsNullOrWhiteSpace(_subNodeParams)
                ? new List<string>()
                : new List<string>(tboxSubNodeParams.Lines);
            List<string> regexParams = string.IsNullOrWhiteSpace(_regexParams)
                ? new List<string>()
                : new List<string>(tboxRegexParams.Lines);

            ArticleCollectOffline collectOffline = new ArticleCollectOffline(_cid, _listPath, listStartPageNum, listStopPageNum, _xpathArcurlNode, _xpathTitleNode, _xpathContentNode, subNodeParams, regexParams, _arcSubPageSymbol, arcSubPageStartNum);
            collectOffline.AddListPages(moreListPages);
            collectOffline.CancelToken = cancelToken;

            // Disable the form; it must not be operated while the test collection is running.
            // Done before the asynchronous work starts so a completion callback can never
            // re-enable the form ahead of this code disabling it. Unsubscribing first keeps the
            // handler from being attached twice if an earlier run ended without removing it.
            tabctrCoform.SelectedIndexChanged -= TabctrCoform_SelectedIndexChanged;
            tabctrCoform.SelectedIndexChanged += TabctrCoform_SelectedIndexChanged;
            btnCoTest.Enabled       = false;
            btnSaveCoConfig.Enabled = false;

            // Kick off the first asynchronous step (fetching the list pages).
            CollectProcess collectProcessListPages = new CollectProcess(ProcessListPages);
            collectProcessListPages.BeginInvoke(collectOffline, ProcessListPagesComplete, null);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Completion callback for the asynchronous "collect articles" step (the last step of a run).
        /// Stops the global stopwatch, prints statistics and errors, lists every collected article
        /// ordered by its "title" entry, and finally makes the form operable again.
        /// </summary>
        /// <param name="itfAR">Async result produced by the <c>CollectProcess</c> delegate's <c>BeginInvoke</c>.</param>
        private void ProcessCollectArticlesComplete(IAsyncResult itfAR)
        {
            // Delegate.BeginInvoke raises this callback on a thread-pool thread, but the controls
            // used below must only be touched from the thread that created them. Re-post the
            // whole callback to that thread and do the real work there.
            if (tboxStatistics.InvokeRequired)
            {
                tboxStatistics.BeginInvoke(new AsyncCallback(ProcessCollectArticlesComplete), itfAR);
                return;
            }

            try
            {
                // Recover the delegate that was invoked and collect its return value.
                CollectProcess collectProcessCollectArticles = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
                ArticleCollectOffline collectOffline = collectProcessCollectArticles.EndInvoke(itfAR);

                swGlobal.Stop();
                tboxStatistics.AppendText(string.Format("swGlobal ElapsedMilliseconds: {0} \n", swGlobal.ElapsedMilliseconds));

                // Output the collection statistics; progress counters are only shown for a cancelled run.
                if (collectOffline.CancelException != null)
                {
                    tboxStatistics.AppendText(string.Format("当前采集文章数:{0}\n", collectOffline.CurrentProcessedArticles));
                    tboxStatistics.AppendText(string.Format("此次总共需要采集文章数:{0}\n", collectOffline.CancelException.Data["TotalArticles"]));
                }

                List<Dictionary<string, string>> articles = collectOffline.Articles;

                printErrors(collectOffline.CoException);

                tboxStatistics.AppendText(string.Format("采集文章总数:{0} \n", articles.Count));
                tboxStatistics.AppendText(string.Format("采集所耗时间 :{0} \n", swGlobal.ElapsedMilliseconds));
                tboxStatistics.AppendText("-----------------------------------------------------------------------------------\n");

                // Dump every article, sorted ascending by title, one key/value pair per two lines.
                foreach (Dictionary<string, string> article in articles.OrderBy(a => a["title"]))
                {
                    foreach (KeyValuePair<string, string> kvp in article)
                    {
                        tboxArticlesContent.AppendText(kvp.Key + ": \n");
                        tboxArticlesContent.AppendText(kvp.Value + "\n");
                    }
                    tboxArticlesContent.AppendText("---------------------------------------------\n");
                }
            }
            finally
            {
                // Make the form operable again — even when the reporting above failed, so an
                // error in the output code cannot leave the form permanently locked.
                try
                {
                    btnSaveCoConfig.Enabled            = true;
                    btnCoTest.Enabled                  = true;
                    tabctrCoform.SelectedIndexChanged -= TabctrCoform_SelectedIndexChanged;
                }
                catch (Exception ex)
                {
                    // Restoring the form stays best-effort, but the failure is recorded
                    // instead of being silently discarded.
                    System.Diagnostics.Debug.WriteLine(ex);
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Completion callback for the asynchronous "collect article URLs" step.
        /// Retrieves the step's result, starts the next asynchronous step (collecting article
        /// content) unless the run was cancelled, and prints the matched and unmatched article
        /// links to <c>tboxArticlesPages</c>.
        /// </summary>
        /// <param name="itfAR">Async result produced by the <c>CollectProcess</c> delegate's <c>BeginInvoke</c>.</param>
        private void ProcessArticlePagesComplete(IAsyncResult itfAR)
        {
            // Delegate.BeginInvoke raises this callback on a thread-pool thread, but the text boxes
            // written below must only be touched from the thread that created them. Re-post the
            // whole callback to that thread and do the real work there.
            if (tboxStatistics.InvokeRequired)
            {
                tboxStatistics.BeginInvoke(new AsyncCallback(ProcessArticlePagesComplete), itfAR);
                return;
            }

            // Recover the delegate that was invoked and collect its return value.
            CollectProcess collectProcessArticlePages = (CollectProcess)((AsyncResult)itfAR).AsyncDelegate;
            ArticleCollectOffline collectOffline = collectProcessArticlePages.EndInvoke(itfAR);

            if (collectOffline.CancelException == null)
            {
                // Not cancelled: continue asynchronously with the next step (article content collection).
                CollectProcess collectProcessCollectArticles = new CollectProcess(ProcessCollectArticles);
                collectProcessCollectArticles.BeginInvoke(collectOffline, ProcessCollectArticlesComplete, null);
            }
            else
            {
                // Cancelled: report why and how far processing got.
                // NOTE(review): nothing here re-enables btnCoTest/btnSaveCoConfig, which are
                // disabled when a test run starts — confirm another handler restores the form.
                tboxStatistics.AppendText(string.Format("终止获取列表页:{0} \n", collectOffline.CancelException.Message));
                tboxStatistics.AppendText(string.Format("当前处理列表页位置:{0}\n", collectOffline.CurrentProcessedListPages));
                tboxStatistics.AppendText(string.Format("总共需要处理列表页面数:{0}\n", collectOffline.CancelException.Data["TotalListPages"]));
                tboxStatistics.AppendText(string.Format("当前处理文章链接数:{0}\n", collectOffline.CurrentGetArticlePages));
            }

            // Output the URL collection information (printed in both the normal and the cancelled case).
            tboxStatistics.AppendText(string.Format("获取文章URL集合所花时间: {0}\n", swGlobal.ElapsedMilliseconds));

            // Links whose pages matched, i.e. the articles queued for collection.
            tboxArticlesPages.AppendText("待采集文章链接:\n");
            foreach (Dictionary<string, string> item in collectOffline.CorrectArticlePages)
            {
                tboxArticlesPages.AppendText(string.Format("{0}\n", item["arcpath"]));
            }

            tboxArticlesPages.AppendText("-------------------------------------------------------------------------------\n");

            // Links that could not be matched; the user is told to check the XPath rules.
            tboxArticlesPages.AppendText("未能正确匹配内容链接,请检查匹配XPATH规则: \n");
            foreach (Dictionary<string, string> item in collectOffline.WrongArticlePages)
            {
                tboxArticlesPages.AppendText(string.Format("{0}\n", item["arcpath"]));
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Collects article content: invokes <c>ProcessCollectArticles</c> on the given collector
 /// and hands the same instance back to the caller.
 /// </summary>
 /// <param name="collectOffline">Collector on which the article collection is run.</param>
 /// <returns>The <paramref name="collectOffline"/> instance that was passed in.</returns>
 private ArticleCollectOffline ProcessCollectArticles(ArticleCollectOffline collectOffline)
 {
     collectOffline.ProcessCollectArticles();

     return collectOffline;
 }