/// <summary>
/// Downloads the page at <paramref name="url"/> with the download option selected on the form,
/// runs <c>PageAutoAnalyzer.AnalyzeContent</c> on the downloaded content and copies the fields
/// of the returned element into the form's text boxes.
/// </summary>
/// <param name="title">Title used to seed a new <c>PageElement</c> when <paramref name="pageElement"/> is null.</param>
/// <param name="url">Address of the page to download; also used to seed a new <c>PageElement</c>.</param>
/// <param name="pageElement">Optional element handed to the analyzer; when null a new one is built from
/// <paramref name="title"/> and <paramref name="url"/>.</param>
/// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
private void ParsePage(string title, string url, PageElement pageElement = null)
{
    if (pageElement == null)
    {
        pageElement = new PageElement { Title = title, Url = url };
    }

    var xpath = new ItemPageXPaths();

    // Only the way the content is fetched differs between the two modes;
    // the analysis step below is shared.
    string content;
    if (GeckoDownRd.Checked)
    {
        CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
        content = resp.Content;
    }
    else if (HttpdownRd.Checked)
    {
        content = WebRequestProcessor.DownloadHTTPString(url, 30);
    }
    else
    {
        // Same exception type that ParseListBtn_Click uses for an unsupported mode.
        // Message (user-facing, unchanged): "analysing the content this way is not supported".
        throw new NotSupportedException("不支持该方式分析正文");
    }

    // subList is required by the analyzer's out parameter but is not used by this method.
    List<SubItemElement> subList;
    PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement, DeterminedMode(), new IdentityContentElement(), ref xpath, out subList, 86400, ExcludeTxt.Text);

    // Nothing to display when the analyzer returns no element.
    if (result == null)
    {
        return;
    }

    // The element's URL is passed through ExpandRelativePath with the requested url as first argument.
    PageUrlTxt.Text = HtmlUtility.ExpandRelativePath(url, result.Url);
    TitleTxt.Text = result.Title;
    ContentTxt.Text = result.Content;
    ViewTxt.Text = result.View.ToString();
    ReplyTxt.Text = result.Reply.ToString();
    PubdateTxt.Text = result.Pubdate == null ? "" : result.Pubdate.ToString();
    AuthorTxt.Text = result.Author;
    MediaTxt.Text = result.MediaName;
    ElementXPathTxt.Text = result.ElementXPath;
    ElementBlockTxt.Text = result.ElementBlock;
    NextpageXPathTxt.Text = result.NextPageXPath;
}
/// <summary>
/// Click handler: downloads the list page whose address is in <c>InputUrlTxt</c> using the
/// selected download option, runs <c>PageAutoAnalyzer.AnalyzeArticleList</c> on it and binds the
/// resulting elements to <c>ListGridView</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
/// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
private void ParseListBtn_Click(object sender, EventArgs e)
{
    string listUrl = InputUrlTxt.Text;
    RecogniseMode mode = DeterminedMode();
    var xpath = new ListPageXPaths();

    // The two modes differ in how the HTML is fetched and in which URL is
    // handed to the analyzer (the response URL for Gecko, the typed URL for HTTP).
    string html;
    string analysisUrl;
    if (GeckoDownRd.Checked)
    {
        CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(listUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
        html = resp.Content;
        analysisUrl = resp.Url;
    }
    else if (HttpdownRd.Checked)
    {
        html = WebRequestProcessor.DownloadHTTPString(listUrl, 30);
        analysisUrl = listUrl;
    }
    else
    {
        throw new NotSupportedException("不支持当前项抓取");
    }

    var analysis = PageAutoAnalyzer.AnalyzeArticleList(analysisUrl, html, mode, new IdentityPageElement(), ref xpath, 86400);

    PageElement[] items = null;
    if (analysis != null)
    {
        items = analysis.List;
    }

    if (items == null)
    {
        // Tell the user that no data could be parsed, then give up.
        MessageBox.Show("解析不出数据");
        return;
    }

    // Each element's URL is passed through ExpandRelativePath with the typed URL as first argument.
    for (int i = 0; i < items.Length; i++)
    {
        PageElement item = items[i];
        item.Url = HtmlUtility.ExpandRelativePath(listUrl, item.Url);
    }

    ListGridView.DataSource = items;
}