/// <summary>
/// Builds the URL used to request one "scroll" worth of tweets for a user.
/// </summary>
/// <param name="userTweet">User whose <c>Uid</c> is placed in the query string.</param>
/// <param name="endId">Value sent as the <c>end_id</c> query parameter.</param>
/// <param name="maxId">Value sent as the <c>max_id</c> query parameter.</param>
/// <param name="page">Page number being requested.</param>
/// <param name="pos">Scroll index within the page: 0 = first, 1 = second, 2 = third.</param>
/// <returns>The request URL, starting with <c>prefixWeiboUrl</c>.</returns>
/// <exception cref="Exception">Thrown when <paramref name="pos"/> is not 0, 1 or 2.</exception>
private string BuildTweetJsonUrl(UserTweet userTweet, string endId, string maxId, int page, int pos)
{
    // Part of the query string that is identical for every scroll position.
    string commonPart = prefixWeiboUrl + "&max_id=" + maxId + "&end_id=" + endId + "&uid=" + userTweet.Uid;

    if (pos == 0)
    {
        // First scroll: pre_page is the previous page, except on page 1 where it is the page itself.
        int previousPage = page > 1 ? page - 1 : page;
        return commonPart + "&count=50&page=" + page + "&pre_page=" + previousPage;
    }

    if (pos == 1 || pos == 2)
    {
        // Second and third scrolls share the same shape; pagebar is the scroll index minus one.
        int pageBar = pos - 1;
        return commonPart + "&count=15&page=" + page + "&pre_page=" + page + "&pagebar=" + pageBar;
    }

    // Message: "wrong scroll index; Sina currently supports only 3 scrolls per page".
    throw new Exception("滚屏次数不对,目前新浪只支持每页3次滚屏");
}
/// <summary>
/// Extracts every tweet that <c>RegexContent</c> matches in <paramref name="content"/>, appends each one to
/// <c>result.Tweets</c>, and returns the tweets found by this call.
/// </summary>
/// <param name="result">User record that receives the parsed tweets; its <c>Url</c> is passed to
/// <c>RegexParser.AbsoluteUrl</c> together with each matched tweet link.</param>
/// <param name="content">HTML fragment to scan.</param>
/// <returns>
/// The tweets parsed from <paramref name="content"/>, in match order. Parsing is best effort: if a match
/// fails to parse, the failure is logged and the tweets parsed before it are still returned.
/// </returns>
private Tweet[] FillUserTweet(UserTweet result, string content)
{
    var matches = Regex.Matches(content, RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    List<Tweet> tweetList = new List<Tweet>();

    try
    {
        foreach (Match match in matches)
        {
            // Counters that are missing or non-numeric fall back to 0 (TryParse leaves 0 on failure).
            int comment;
            int.TryParse(match.Groups["Reply"].Value, out comment);
            int forward;
            int.TryParse(match.Groups["Forward"].Value, out forward);

            Tweet tweet = new Tweet();
            tweet.Comment = comment;
            tweet.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
            tweet.Mid = match.Groups["Mid"].Value;
            tweet.Forward = forward;
            tweet.Source = match.Groups["Source"].Value;
            // A publish date that cannot be parsed is stored as DateTime.MinValue.
            tweet.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
            tweet.Url = RegexParser.AbsoluteUrl(match.Groups["Url"].Value, result.Url, true);

            result.Tweets.Add(tweet);
            tweetList.Add(tweet);
        }
    }
    catch (Exception ex)
    {
        // Still best effort (no rethrow), but the failure is now recorded instead of being silently
        // dropped, so a truncated tweet list can be diagnosed from the log.
        Logger.Info("解析微博内容出错:" + ex.Message);
    }

    return tweetList.ToArray();
}
/// <summary>
/// Reads the profile header fields out of <paramref name="content"/> using <c>RegexInfo</c> and stores
/// them on <paramref name="userTweet"/>.
/// </summary>
/// <param name="content">Profile page content to scan.</param>
/// <param name="userTweet">Record that receives Follow, Follower, TweetNum, Name and Uid.</param>
/// <remarks>
/// The numeric fields are read with <c>int.Parse</c>, so this method throws when the pattern does not
/// match or a counter is not a valid integer. Fields assigned before the failure keep their new values.
/// </remarks>
private void FillUserInfo(string content, UserTweet userTweet)
{
    const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
    GroupCollection fields = Regex.Match(content, RegexInfo, options).Groups;

    // Numeric counters first, in the same order as before, so a parse failure leaves the same partial state.
    userTweet.Follow = int.Parse(fields["Follow"].Value);
    userTweet.Follower = int.Parse(fields["Follower"].Value);
    userTweet.TweetNum = int.Parse(fields["TweetNum"].Value);

    string rawName = fields["Name"].Value;
    userTweet.Name = HTMLCleaner.CleanHTML(rawName, true);
    userTweet.Uid = fields["Uid"].Value;
}
/// <summary>
/// Crawls the Weibo profile at <paramref name="entryUrl"/>: reads the profile header, then walks the
/// tweet list one scroll at a time, writing every tweet to an Excel workbook under
/// <c>D:/output/{user name}/</c> and, when <c>NeedCrawlComment</c> is set, saving each tweet's comments
/// to a separate workbook named after the tweet's Mid.
/// </summary>
/// <remarks>
/// If the user's workbook already exists, the crawl resumes: the first empty row is located, the page to
/// restart from is derived from the row count, and writing continues from the row whose link matches the
/// first tweet of that page. The workbook is saved after every tweet so an interrupted run can resume.
/// </remarks>
/// <param name="entryUrl">URL of the user's profile page.</param>
/// <returns>
/// The crawled user record. When the first request fails or the profile header cannot be parsed, the
/// record is returned with only the fields that were filled before the failure (at least <c>Url</c>).
/// </returns>
private UserTweet CrawlTask(string entryUrl)
{
    _currentUrl = entryUrl;
    var Site = SiteBusiness.GetBySiteID("Weibo");
    var Request = BuildRequest(entryUrl, RegexContent);
    Site.TimeoutSecs = 60;

    UserTweet result = new UserTweet();
    result.Url = entryUrl;

    CrawlResponse Response = null;
    try
    {
        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
        AggrSum();
    }
    catch (Exception ex)
    {
        Logger.Info("访问页面错误:Url = " + entryUrl + ", " + ex.Message);
    }

    // The request itself failed, so there is nothing to parse. Previously this path dereferenced
    // a null response; now it returns the same partial result as a header parse failure.
    if (Response == null)
    {
        return result;
    }

    if (Response.Status != Enums.CrawlResult.Succ)
    {
        Logger.Info("访问页面错误:Url = " + Response.Url);
    }

    // First page: profile header and the ids needed to page through the tweet list.
    var content = Response.Content;
    try
    {
        FillUserInfo(content, result);
    }
    catch (Exception ex)
    {
        Logger.Info("解析用户信息出错:Url = " + entryUrl + ", " + ex.Message);
        return result;
    }

    var endId = DeterminedMid(content, MidType.EndId);
    var maxId = DeterminedMid(content, MidType.MaxId);

    int currentPage = 1;
    string rootPath = @"D:/output/" + result.Name + "/";
    string bookPath = rootPath + result.Name + ".xls";
    Workbook outputBook = new Workbook();
    Worksheet sheet = null;
    int currentLine = 4; // first data row; rows 0-3 hold the summary and the column headers
    int pos = 1;         // scroll index within a page (3 per page): 0 = first, 1 = second, 2 = third
    bool isContinue = false;

    if (!Directory.Exists(rootPath))
    {
        Directory.CreateDirectory(rootPath);
    }

    if (File.Exists(bookPath))
    {
        // Resume an earlier run: find the first empty data row.
        outputBook.Open(bookPath);
        sheet = outputBook.Worksheets[result.Name];
        int firstEmptyRow = currentLine;
        while (!string.IsNullOrEmpty(sheet.Cells[firstEmptyRow, 0].StringValue))
        {
            firstEmptyRow++;
        }
        currentLine = firstEmptyRow;
        // NOTE(review): 45 is presumably the expected number of tweets per page - confirm.
        currentPage = (int)(currentLine / 45d) + 1;
        isContinue = true;
    }
    else
    {
        sheet = outputBook.Worksheets.Add(result.Name);
    }

    // Summary block (rows 0-1) and tweet column headers (row 3).
    sheet.Cells[0, 0].PutValue("姓名");
    sheet.Cells[0, 1].PutValue("网址");
    sheet.Cells[0, 2].PutValue("粉丝数");
    sheet.Cells[0, 3].PutValue("关注数");
    sheet.Cells[0, 4].PutValue("微博数");
    sheet.Cells[1, 0].PutValue(result.Name);
    sheet.Cells[1, 1].PutValue(result.Url);
    sheet.Cells[1, 2].PutValue(result.Follower);
    sheet.Cells[1, 3].PutValue(result.Follow);
    sheet.Cells[1, 4].PutValue(result.TweetNum);
    sheet.Cells[3, 0].PutValue("微博内容");
    sheet.Cells[3, 1].PutValue("发布时间");
    sheet.Cells[3, 2].PutValue("转发数");
    sheet.Cells[3, 3].PutValue("评论数");
    sheet.Cells[3, 4].PutValue("原帖地址");
    sheet.Cells[3, 5].PutValue("来源");
    sheet.Cells[3, 6].PutValue("具体评论");

    if (isContinue)
    {
        // Re-fetch the first scroll of the resume page and align the write position with it.
        pos = 0;
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(url);
        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
        AggrSum();
        JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
        Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        pos++;

        var firstTweet = FillUserTweet(result, Response.Content);
        var firstEntry = firstTweet.FirstOrDefault();
        if (firstEntry != null)
        {
            // Walk upwards from the last written row to the row that already holds this tweet,
            // so the page is rewritten in place rather than appended a second time.
            var firstUrl = firstEntry.Url;
            int row = currentLine - 1;
            while (row > 3)
            {
                if (sheet.Cells[row, 4].StringValue == firstUrl)
                {
                    currentLine = row;
                    break;
                }
                row--;
            }
        }
        // These tweets are parsed again by the main loop below; drop them to avoid duplicates.
        result.Tweets.Clear();
    }

    // Crawl scroll by scroll for as long as the last response was successful and still contains tweets.
    while (Response.Status == Enums.CrawlResult.Succ
           && Response.Content != null
           && Regex.IsMatch(Response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase))
    {
        content = Response.Content.Trim();
        var currentTweet = FillUserTweet(result, content);
        foreach (Tweet tweet in currentTweet)
        {
            string fileName = tweet.Mid + ".xls";

            // Skip comment crawling when the comment file already exists (left over from an earlier run).
            if (NeedCrawlComment && !File.Exists(rootPath + fileName))
            {
                FillTweetComment(tweet, Site);
                if (tweet.Comments.Count > 0)
                {
                    SaveComment(rootPath, tweet, fileName);
                }
            }

            sheet.Cells[currentLine, 0].PutValue(tweet.Content);
            sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
            sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
            sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
            sheet.Cells[currentLine, 4].PutValue(tweet.Url);
            sheet.Cells[currentLine, 5].PutValue(tweet.Source);

            // Link to the comment workbook; the path is relative because both files share a folder.
            if (File.Exists(rootPath + fileName))
            {
                sheet.Cells[currentLine, 6].PutValue("点击查看");
                sheet.Hyperlinks.Add(currentLine, 6, 1, 1, fileName);
            }

            // Saved after every tweet so an interrupted run can be resumed from the workbook.
            outputBook.Save(bookPath);
            StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
            Application.DoEvents();
            currentLine++;
        }

        // NOTE(review): this URL is built with the maxId from before this batch; maxId is only
        // refreshed at the end of the iteration. Kept as in the original - confirm it is intended.
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(url);

        // Up to 5 attempts. Each attempt uses a fresh local so a thrown request can never be
        // mistaken for success by way of the previous page's (successful) response.
        CrawlResponse fetched = null;
        for (int i = 0; i < 5; i++)
        {
            CrawlResponse attempt = null;
            try
            {
                attempt = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
            }
            catch (Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
            }

            if (attempt == null)
            {
                continue;
            }

            fetched = attempt;
            if (attempt.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + attempt.Url);
            }
            else
            {
                break;
            }
        }

        if (fetched == null)
        {
            // Every attempt threw: stop paging instead of re-processing the previous page.
            break;
        }
        Response = fetched;

        // The payload is JSON wrapped in <pre> tags; try the tweet-list shape first, then the comment shape.
        try
        {
            JsonResponse listPayload = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
            Response.Content = HttpUtility.HtmlDecode(listPayload.data);
        }
        catch (Exception)
        {
            try
            {
                CommentJsonResponse commentPayload = JsonConvert.DeserializeObject<CommentJsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                Response.Content = HttpUtility.HtmlDecode(commentPayload.data.html);
            }
            catch (Exception ex)
            {
                // Content stays undecoded; the loop condition then decides whether to go on.
                Logger.Info("解析Json出错:Url = " + url + ", " + ex.Message);
            }
        }

        pos = (pos + 1) % 3;
        if (pos == 0)
        {
            currentPage++;
        }

        // Guarded: Last() would throw if tweet parsing failed before anything was added.
        var lastTweet = result.Tweets.LastOrDefault();
        if (lastTweet != null)
        {
            maxId = lastTweet.Mid;
        }
    }

    return result;
}