Пример #1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> with the downloader selected by the
        /// radio buttons, runs <c>PageAutoAnalyzer.AnalyzeContent</c> on the downloaded
        /// content and copies the fields of the returned element into the form's text boxes.
        /// </summary>
        /// <param name="title">Title used to seed a new <c>PageElement</c> when <paramref name="pageElement"/> is null.</param>
        /// <param name="url">Address to download; also the base passed to <c>HtmlUtility.ExpandRelativePath</c> for the result URL.</param>
        /// <param name="pageElement">Optional element handed to the analyzer; when null, one is created from <paramref name="title"/> and <paramref name="url"/>.</param>
        /// <exception cref="NotSupportedException">Neither <c>GeckoDownRd</c> nor <c>HttpdownRd</c> is checked.</exception>
        private void ParsePage(string title, string url, PageElement pageElement = null)
        {
            if (pageElement == null)
            {
                pageElement = new PageElement {
                    Title = title, Url = url
                };
            }

            var xpath = new ItemPageXPaths();

            // Only the download step differs between the two modes; the analysis below is shared.
            string content;
            if (GeckoDownRd.Checked)
            {
                CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
                content = resp.Content;
            }
            else if (HttpdownRd.Checked)
            {
                content = WebRequestProcessor.DownloadHTTPString(url, 30);
            }
            else
            {
                // Same exception type as ParseListBtn_Click; still caught by any catch (Exception).
                throw new NotSupportedException("不支持该方式分析正文");
            }

            // subList is required by the out parameter but is not used by this form.
            List<SubItemElement> subList;
            PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement,
                                                                 DeterminedMode(), new IdentityContentElement(), ref xpath,
                                                                 out subList, 86400, ExcludeTxt.Text);

            if (result == null)
            {
                // Nothing was returned by the analyzer: leave the form as it is.
                return;
            }

            PageUrlTxt.Text       = HtmlUtility.ExpandRelativePath(url, result.Url);
            TitleTxt.Text         = result.Title;
            ContentTxt.Text       = result.Content;
            ViewTxt.Text          = result.View.ToString();
            ReplyTxt.Text         = result.Reply.ToString();
            PubdateTxt.Text       = result.Pubdate == null ? "" : result.Pubdate.ToString();
            AuthorTxt.Text        = result.Author;
            MediaTxt.Text         = result.MediaName;
            ElementXPathTxt.Text  = result.ElementXPath;
            ElementBlockTxt.Text  = result.ElementBlock;
            NextpageXPathTxt.Text = result.NextPageXPath;
        }
Пример #2
0
        /// <summary>
        /// Click handler: downloads the page whose address is in <c>InputUrlTxt</c> with the
        /// selected downloader, runs <c>PageAutoAnalyzer.AnalyzeArticleList</c> on it and binds
        /// the returned elements to <c>ListGridView</c>. Shows a message box when the analyzer
        /// returns nothing.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <exception cref="NotSupportedException">Neither <c>GeckoDownRd</c> nor <c>HttpdownRd</c> is checked.</exception>
        private void ParseListBtn_Click(object sender, EventArgs e)
        {
            string        inputUrl      = InputUrlTxt.Text;
            RecogniseMode recogniseMode = DeterminedMode();
            var           listXPaths    = new ListPageXPaths();

            // Stays null when the analyzer returns null.
            PageElement[] elements = null;

            if (GeckoDownRd.Checked)
            {
                CrawlResponse response = GeckoRequestProcessor.DoRequest(BuildFakeRequest(inputUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
                string html = response.Content;
                // The Gecko path analyzes against the URL reported by the response.
                var analysis = PageAutoAnalyzer.AnalyzeArticleList(response.Url, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
                if (analysis != null)
                {
                    elements = analysis.List;
                }
            }
            else if (HttpdownRd.Checked)
            {
                string html = WebRequestProcessor.DownloadHTTPString(inputUrl, 30);
                var analysis = PageAutoAnalyzer.AnalyzeArticleList(inputUrl, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
                if (analysis != null)
                {
                    elements = analysis.List;
                }
            }
            else
            {
                throw new NotSupportedException("不支持当前项抓取");
            }

            if (elements == null)
            {
                MessageBox.Show("解析不出数据");
                return;
            }

            // Rewrite every element URL through ExpandRelativePath, using the typed-in address as base.
            for (int i = 0; i < elements.Length; i++)
            {
                PageElement item = elements[i];
                item.Url = HtmlUtility.ExpandRelativePath(inputUrl, item.Url);
            }

            ListGridView.DataSource = elements;
        }