/// <summary>
/// Downloads a single article page with the download mode selected in the UI,
/// runs <c>PageAutoAnalyzer.AnalyzeContent</c> on the HTML and shows the extracted
/// fields in the form's text boxes.
/// </summary>
/// <param name="title">Title used to seed a new <c>PageElement</c> when <paramref name="pageElement"/> is null.</param>
/// <param name="url">Address of the page to download; also the base for expanding the result URL.</param>
/// <param name="pageElement">Optional pre-built element handed to the analyzer; created from title/url when null.</param>
/// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
private void ParsePage(string title, string url, PageElement pageElement = null)
{
    if (pageElement == null)
    {
        pageElement = new PageElement { Title = title, Url = url };
    }

    var xpath = new ItemPageXPaths();
    List<SubItemElement> subList; // required by the analyzer's out parameter; not used here

    // Only the download step differs between the two modes; the analysis call is shared.
    string content;
    if (GeckoDownRd.Checked)
    {
        CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
        content = resp.Content;
    }
    else if (HttpdownRd.Checked)
    {
        content = WebRequestProcessor.DownloadHTTPString(url, 30);
    }
    else
    {
        // Same exception type as ParseListBtn_Click uses for an unsupported mode.
        throw new NotSupportedException("不支持该方式分析正文");
    }

    PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement, DeterminedMode(), new IdentityContentElement(), ref xpath, out subList, 86400, ExcludeTxt.Text);
    if (result == null)
    {
        return;
    }

    PageUrlTxt.Text = HtmlUtility.ExpandRelativePath(url, result.Url);
    TitleTxt.Text = result.Title;
    ContentTxt.Text = result.Content;
    ViewTxt.Text = result.View.ToString();
    ReplyTxt.Text = result.Reply.ToString();
    PubdateTxt.Text = result.Pubdate == null ? "" : result.Pubdate.ToString();
    AuthorTxt.Text = result.Author;
    MediaTxt.Text = result.MediaName;
    ElementXPathTxt.Text = result.ElementXPath;
    ElementBlockTxt.Text = result.ElementBlock;
    NextpageXPathTxt.Text = result.NextPageXPath;
}
/// <summary>
/// Click handler for the "parse list" button: downloads the list page entered in
/// <c>InputUrlTxt</c> with the selected download mode, lets
/// <c>PageAutoAnalyzer.AnalyzeArticleList</c> extract the article entries, expands
/// their URLs against the entered address and binds them to <c>ListGridView</c>.
/// </summary>
/// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
private void ParseListBtn_Click(object sender, EventArgs e)
{
    string listUrl = InputUrlTxt.Text;
    RecogniseMode recogniseMode = DeterminedMode();
    var listXPaths = new ListPageXPaths();
    PageElement[] elements = null;

    if (GeckoDownRd.Checked)
    {
        CrawlResponse response = GeckoRequestProcessor.DoRequest(BuildFakeRequest(listUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
        string html = response.Content;

        // The Gecko path analyzes against the URL reported by the response.
        var analysis = PageAutoAnalyzer.AnalyzeArticleList(response.Url, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
        if (analysis != null)
        {
            elements = analysis.List;
        }
    }
    else if (HttpdownRd.Checked)
    {
        string html = WebRequestProcessor.DownloadHTTPString(listUrl, 30);
        var analysis = PageAutoAnalyzer.AnalyzeArticleList(listUrl, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
        if (analysis != null)
        {
            elements = analysis.List;
        }
    }
    else
    {
        throw new NotSupportedException("不支持当前项抓取");
    }

    if (elements == null)
    {
        MessageBox.Show("解析不出数据");
        return;
    }

    // Turn every entry URL into an absolute one, relative to the address the user typed.
    for (int i = 0; i < elements.Length; i++)
    {
        PageElement item = elements[i];
        item.Url = HtmlUtility.ExpandRelativePath(listUrl, item.Url);
    }

    ListGridView.DataSource = elements;
}
/// <summary>
/// Crawls one Weibo user: loads the entry page, fills the user summary, then keeps
/// requesting the JSON tweet feed while the response still matches <c>RegexContent</c>.
/// Every tweet is written as a row of <c>D:/output/{Name}/{Name}.xls</c>; when that
/// workbook already exists the crawl resumes from the rows it contains.
/// </summary>
/// <param name="entryUrl">URL of the user's page; stored in <c>_currentUrl</c> and in the result.</param>
/// <returns>
/// The populated <c>UserTweet</c>. When the entry page cannot be requested or
/// <c>FillUserInfo</c> throws, the returned object carries only what was filled up to that point.
/// </returns>
private UserTweet CrawlTask(string entryUrl)
{
    const int MaxAttempts = 5;

    _currentUrl = entryUrl;
    var site = SiteBusiness.GetBySiteID("Weibo");
    var request = BuildRequest(entryUrl, RegexContent);
    site.TimeoutSecs = 60;

    CrawlResponse response = null;
    try
    {
        response = GeckoRequestProcessor.DoRequest(request, site, null, null);
        AggrSum();
    }
    catch (Exception ex)
    {
        Logger.Info("访问页面错误:Url = " + entryUrl + ", " + ex.Message);
    }

    UserTweet result = new UserTweet();
    result.Url = entryUrl;

    // Without a response there is nothing to parse; hand back the bare result, the same
    // way a FillUserInfo failure does below, instead of dereferencing null.
    if (response == null)
    {
        return result;
    }

    if (response.Status != Enums.CrawlResult.Succ)
    {
        Logger.Info("访问页面错误:Url = " + response.Url);
    }

    var content = response.Content;

    // First page
    try
    {
        FillUserInfo(content, result);
    }
    catch
    {
        // Best effort: a page whose user info cannot be parsed ends the crawl here.
        return result;
    }

    var endId = DeterminedMid(content, MidType.EndId);
    var maxId = DeterminedMid(content, MidType.MaxId);

    int currentPage = 1;
    string rootPath = @"D:/output/" + result.Name + "/";
    string bookPath = rootPath + result.Name + ".xls";
    Workbook outputBook = new Workbook();
    Worksheet sheet = null;
    int currentLine = 4; // rows 0-1: user summary, row 3: tweet header, tweets start at row 4
    int pos = 1;         // index of the scroll-load within a page (3 per page): 0 = first, 1 = second, 2 = third
    bool isContinue = false;

    if (!Directory.Exists(rootPath))
    {
        Directory.CreateDirectory(rootPath);
    }

    if (File.Exists(bookPath))
    {
        // Resume: find the first empty row after the tweets already saved.
        outputBook.Open(bookPath);
        sheet = outputBook.Worksheets[result.Name];
        while (!string.IsNullOrEmpty(sheet.Cells[currentLine, 0].StringValue))
        {
            currentLine++;
        }

        // NOTE(review): presumably 45 tweets per feed page - confirm.
        currentPage = (int)(currentLine / 45d) + 1;
        isContinue = true;
    }
    else
    {
        sheet = outputBook.Worksheets.Add(result.Name);
    }

    // Save to excel: (re)write the user summary and the tweet column headers.
    sheet.Cells[0, 0].PutValue("姓名");
    sheet.Cells[0, 1].PutValue("网址");
    sheet.Cells[0, 2].PutValue("粉丝数");
    sheet.Cells[0, 3].PutValue("关注数");
    sheet.Cells[0, 4].PutValue("微博数");
    sheet.Cells[1, 0].PutValue(result.Name);
    sheet.Cells[1, 1].PutValue(result.Url);
    sheet.Cells[1, 2].PutValue(result.Follower);
    sheet.Cells[1, 3].PutValue(result.Follow);
    sheet.Cells[1, 4].PutValue(result.TweetNum);
    sheet.Cells[3, 0].PutValue("微博内容");
    sheet.Cells[3, 1].PutValue("发布时间");
    sheet.Cells[3, 2].PutValue("转发数");
    sheet.Cells[3, 3].PutValue("评论数");
    sheet.Cells[3, 4].PutValue("原帖地址");
    sheet.Cells[3, 5].PutValue("来源");
    sheet.Cells[3, 6].PutValue("具体评论");

    if (isContinue)
    {
        // Reload the first chunk of the page the previous run stopped on and realign
        // the write position with the row that already holds its first tweet.
        pos = 0;
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        request = BuildRequest(url);
        response = GeckoRequestProcessor.DoRequest(request, site, null, null);
        AggrSum();
        JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(response.Content.Trim("</pre>".ToArray()));
        response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        pos++;

        var firstTweet = FillUserTweet(result, response.Content);
        var first = firstTweet.FirstOrDefault();
        var firstUrl = first == null ? null : first.Url;
        result.Tweets.Clear();

        // An empty chunk has no first tweet to align with; keep appending at currentLine.
        if (firstUrl != null)
        {
            for (int row = currentLine - 1; row > 3; row--)
            {
                if (sheet.Cells[row, 4].StringValue == firstUrl)
                {
                    currentLine = row;
                    break;
                }
            }
        }
    }

    // Crawl with json: process the current chunk, then request the next one.
    while (response != null
           && response.Content != null
           && Regex.IsMatch(response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase)
           && response.Status == Enums.CrawlResult.Succ)
    {
        content = response.Content.Trim();
        var currentTweet = FillUserTweet(result, content);
        foreach (Tweet tweet in currentTweet)
        {
            string fileName = tweet.Mid + ".xls";

            // Skip tweets whose comment file already exists from an earlier, interrupted run.
            if (NeedCrawlComment)
            {
                if (!File.Exists(rootPath + fileName))
                {
                    FillTweetComment(tweet, site);
                    if (tweet.Comments.Count > 0)
                    {
                        SaveComment(rootPath, tweet, fileName);
                    }
                }
            }

            sheet.Cells[currentLine, 0].PutValue(tweet.Content);
            sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
            sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
            sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
            sheet.Cells[currentLine, 4].PutValue(tweet.Url);
            sheet.Cells[currentLine, 5].PutValue(tweet.Source);

            // Link the comment workbook (it lives in the same directory, so the bare file name is used).
            if (File.Exists(rootPath + fileName))
            {
                sheet.Cells[currentLine, 6].PutValue("点击查看");
                string linkPath = fileName;
                sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
            }

            // Saved after every tweet; the resume logic above reads this file back.
            outputBook.Save(bookPath);
            StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
            Application.DoEvents();
            currentLine++;
        }

        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        request = BuildRequest(url);

        // Drop the chunk just processed so a throwing request can never make the loop
        // treat the previous page as a fresh one.
        response = null;
        for (int i = 0; i < MaxAttempts; i++)
        {
            try
            {
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
            }
            catch (Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
            }

            if (response == null)
            {
                continue;
            }

            if (response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + response.Url);
            }
            else
            {
                break;
            }
        }

        if (response == null)
        {
            // Every attempt threw: stop paging and keep what was collected.
            break;
        }

        // The payload is tried as JsonResponse first, then as CommentJsonResponse.
        // When neither parses, the content is left untouched and the loop condition decides.
        try
        {
            JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(response.Content.Trim("</pre>".ToArray()));
            response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        }
        catch
        {
            try
            {
                CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
                response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
            }
            catch
            {
            }
        }

        // Three scroll-loads make up one page.
        pos = (pos + 1) % 3;
        if (pos == 0)
        {
            currentPage++;
        }

        maxId = result.Tweets.Last().Mid;
    }

    return result;
}
/// <summary>
/// Pages through the comment endpoint (<c>CommentUrlFormat</c>) for one tweet and appends
/// every comment matched by <c>RegexComment</c> to <c>tweet.Comments</c>.
/// Best effort: any failure ends the crawl and keeps the comments collected so far.
/// </summary>
/// <param name="tweet">Tweet whose comments are fetched; nothing is requested when its comment count is 0.</param>
/// <param name="site">Site settings passed through to <c>GeckoRequestProcessor.DoRequest</c>.</param>
private void FillTweetComment(Tweet tweet, SiteEntity site)
{
    if (tweet.Comment == 0)
    {
        return;
    }

    const int MaxAttempts = 5;
    int currentPage = 1;
    string mid = tweet.Mid;
    string url = null;

    try
    {
        while (true)
        {
            url = string.Format(CommentUrlFormat, mid, currentPage);
            var request = BuildRequest(url);

            CrawlResponse response = null;
            for (int i = 0; i < MaxAttempts; i++)
            {
                try
                {
                    response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                    AggrSum();
                }
                catch (Exception ex)
                {
                    Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                }

                // A throwing attempt leaves no response to inspect; go on to the next
                // attempt instead of dereferencing null and aborting the whole crawl.
                if (response == null)
                {
                    continue;
                }

                if (response.Status != Enums.CrawlResult.Succ)
                {
                    Logger.Info("访问页面错误:Url = " + response.Url);
                }
                else
                {
                    break;
                }
            }

            if (response == null)
            {
                // Every attempt threw: give up on this tweet's remaining comments.
                return;
            }

            CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
            response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);

            // After the first page, stop as soon as the pager no longer reports the page
            // that was requested.
            var pageMatch = Regex.Match(response.Content, RegexCommentPage, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            if (currentPage != 1 && (!pageMatch.Success || pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture)))
            {
                return;
            }

            // Fill Tweet
            var matches = Regex.Matches(response.Content, RegexComment, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            foreach (Match match in matches)
            {
                Comment comment = new Comment();
                comment.Author = match.Groups["Author"].Value;
                comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                comment.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                comment.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                tweet.Comments.Add(comment);
            }

            currentPage++;
        }
    }
    catch (Exception ex)
    {
        // Deliberately non-fatal: keep the comments gathered so far, but leave a trace.
        Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
    }
}