예제 #1
0
        /// <summary>
        /// Fetches the profile page at <paramref name="entryUrl"/>, then walks the user's tweets
        /// through the JSON paging endpoint and writes them, row by row, into an .xls workbook
        /// stored under <c>D:/output/{user name}/</c>.
        /// </summary>
        /// <param name="entryUrl">URL of the profile page the crawl starts from.</param>
        /// <returns>
        /// The <see cref="UserTweet"/> that was filled in. When the entry page cannot be fetched or
        /// the user info cannot be parsed, the result only carries <c>Url</c>.
        /// </returns>
        /// <remarks>
        /// If the workbook already exists the crawl resumes: the first empty row is located, the
        /// matching JSON page is re-requested, and writing restarts at the row whose URL equals the
        /// first tweet of that page. The workbook is saved after every tweet so that an interrupted
        /// run can be resumed.
        /// </remarks>
        private UserTweet CrawlTask(string entryUrl)
        {
            const int MaxRequestAttempts = 5;

            _currentUrl = entryUrl;
            var site    = SiteBusiness.GetBySiteID("Weibo");
            var request = BuildRequest(entryUrl, RegexContent);

            site.TimeoutSecs = 60;
            CrawlResponse response = null;

            UserTweet result = new UserTweet();
            result.Url = entryUrl;

            try
            {
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
            }
            catch (Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + entryUrl + ", " + ex.Message);
            }

            // Without a response there is nothing to parse; return the bare result, the same
            // outcome as a failed user-info parse below.
            if (response == null)
            {
                return(result);
            }

            if (response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + response.Url);
            }
            var content = response.Content;

            // First page
            try
            {
                FillUserInfo(content, result);
            }
            catch (Exception ex)
            {
                Logger.Info("Failed to parse user info: Url = " + entryUrl + ", " + ex.Message);
                return(result);
            }

            var endId = DeterminedMid(content, MidType.EndId);
            var maxId = DeterminedMid(content, MidType.MaxId);

            int       currentPage = 1;
            string    rootPath    = @"D:/output/" + result.Name + "/";
            string    bookPath    = rootPath + result.Name + ".xls";
            Workbook  outputBook  = new Workbook();
            Worksheet sheet       = null;
            int       currentLine = 4; // rows 0-3 hold the user summary and the column headers
            // Index of the scroll (lazy-load) request inside a page; a page takes 3 scrolls:
            // 0 = first, 1 = second, 2 = third.
            // NOTE(review): starts at 1 presumably because the entry page already holds the
            // first scroll - confirm.
            int       pos         = 1;
            bool      isContinue  = false;

            if (!Directory.Exists(rootPath))
            {
                Directory.CreateDirectory(rootPath);
            }
            if (File.Exists(bookPath))
            {
                // Resume: find the first row without tweet content.
                outputBook.Open(bookPath);
                sheet = outputBook.Worksheets[result.Name];
                int endrow = currentLine;
                while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue))
                {
                    endrow++;
                }
                currentLine = endrow;
                // NOTE(review): 45 looks like the number of tweets per page - confirm.
                currentPage = (int)(currentLine / 45d) + 1;
                isContinue  = true;
            }
            else
            {
                sheet = outputBook.Worksheets.Add(result.Name);
            }

            // Header and user summary rows (rewritten on every run).
            sheet.Cells[0, 0].PutValue("姓名");
            sheet.Cells[0, 1].PutValue("网址");
            sheet.Cells[0, 2].PutValue("粉丝数");
            sheet.Cells[0, 3].PutValue("关注数");
            sheet.Cells[0, 4].PutValue("微博数");

            sheet.Cells[1, 0].PutValue(result.Name);
            sheet.Cells[1, 1].PutValue(result.Url);
            sheet.Cells[1, 2].PutValue(result.Follower);
            sheet.Cells[1, 3].PutValue(result.Follow);
            sheet.Cells[1, 4].PutValue(result.TweetNum);

            sheet.Cells[3, 0].PutValue("微博内容");
            sheet.Cells[3, 1].PutValue("发布时间");
            sheet.Cells[3, 2].PutValue("转发数");
            sheet.Cells[3, 3].PutValue("评论数");
            sheet.Cells[3, 4].PutValue("原帖地址");
            sheet.Cells[3, 5].PutValue("来源");
            sheet.Cells[3, 6].PutValue("具体评论");

            if (isContinue)
            {
                // Re-request the page the previous run stopped in and look for its first tweet
                // among the rows already written, so that page is overwritten, not duplicated.
                pos = 0;
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request  = BuildRequest(url);
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
                // Trim(char[]) strips any of the characters '<', '/', 'p', 'r', 'e', '>' from
                // both ends; it does not remove the literal "</pre>" substring.
                JsonResponse tmpResult =
                    JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim("</pre>".ToArray()));
                response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                pos++;
                var firstTweet = FillUserTweet(result, response.Content);
                var anchor     = firstTweet.FirstOrDefault();
                result.Tweets.Clear();

                // No tweet parsed means there is no anchor to search for: keep appending at the
                // first empty row.
                if (anchor != null)
                {
                    var firstUrl = anchor.Url;
                    for (int row = currentLine - 1; row > 3; row--)
                    {
                        if (sheet.Cells[row, 4].StringValue == firstUrl)
                        {
                            currentLine = row;
                            break;
                        }
                    }
                }
            }

            // Crawl with json
            while (response.Status == Enums.CrawlResult.Succ &&
                   Regex.IsMatch(response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase))
            {
                content = response.Content.Trim();
                var currentTweet = FillUserTweet(result, content);
                foreach (Tweet tweet in currentTweet)
                {
                    string fileName    = tweet.Mid + ".xls";
                    string commentPath = rootPath + fileName;

                    // A comment file left over from an earlier (failed) run is not crawled again.
                    if (NeedCrawlComment && !File.Exists(commentPath))
                    {
                        FillTweetComment(tweet, site);
                        if (tweet.Comments.Count > 0)
                        {
                            SaveComment(rootPath, tweet, fileName);
                        }
                    }

                    sheet.Cells[currentLine, 0].PutValue(tweet.Content);
                    sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
                    sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
                    sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
                    sheet.Cells[currentLine, 4].PutValue(tweet.Url);
                    sheet.Cells[currentLine, 5].PutValue(tweet.Source);

                    // Link to the comment workbook; the path is relative because it is written
                    // to the same folder as the main workbook.
                    if (File.Exists(commentPath))
                    {
                        sheet.Cells[currentLine, 6].PutValue("点击查看");
                        string linkPath = fileName;
                        sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
                    }

                    // Saved after every tweet so an interrupted run can resume from the file.
                    outputBook.Save(bookPath);
                    StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
                    Application.DoEvents();
                    currentLine++;
                }

                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request = BuildRequest(url);

                // Each attempt is held in its own variable so that an attempt that throws cannot
                // leave the previous page's successful response in place and be taken for a
                // fresh one (which would write the same tweets again).
                bool fetched = false;
                for (int i = 0; i < MaxRequestAttempts && !fetched; i++)
                {
                    CrawlResponse attempt = null;
                    try
                    {
                        attempt = GeckoRequestProcessor.DoRequest(request, site, null, null);
                        AggrSum();
                    }
                    catch (Exception ex)
                    {
                        Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                    }

                    if (attempt == null)
                    {
                        continue;
                    }

                    response = attempt;
                    if (response.Status == Enums.CrawlResult.Succ)
                    {
                        fetched = true;
                    }
                    else
                    {
                        Logger.Info("访问页面错误:Url = " + response.Url);
                    }
                }

                if (!fetched)
                {
                    break;
                }

                // The payload normally has the tweet-list shape; fall back to the comment shape
                // before giving up. If neither parses, the raw content is kept and the loop
                // condition decides whether it still looks like a tweet page.
                try
                {
                    JsonResponse tmpResult =
                        JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim("</pre>".ToArray()));
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                }
                catch (Exception)
                {
                    try
                    {
                        CommentJsonResponse tmpResult =
                            JsonConvert.DeserializeObject <CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
                        response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    }
                    catch (Exception ex)
                    {
                        Logger.Info("Failed to parse tweet JSON: Url = " + url + ", " + ex.Message);
                    }
                }

                pos = (pos + 1) % 3;
                if (pos == 0)
                {
                    currentPage++;
                }

                // The next request is keyed on the Mid of the last tweet collected so far.
                var lastTweet = result.Tweets.LastOrDefault();
                if (lastTweet != null)
                {
                    maxId = lastTweet.Mid;
                }
            }

            return(result);
        }
예제 #2
0
        /// <summary>
        /// Downloads the comment pages of <paramref name="tweet"/> one by one and appends every
        /// parsed comment to <c>tweet.Comments</c>.
        /// </summary>
        /// <param name="tweet">Tweet whose comments are collected; skipped when its comment count is 0.</param>
        /// <param name="site">Site settings handed to the request processor.</param>
        /// <remarks>
        /// Best effort: a page that cannot be fetched or parsed ends the crawl for this tweet and
        /// is logged; comments collected up to that point are kept. Paging stops when a page
        /// after the first does not report the requested page number.
        /// </remarks>
        private void FillTweetComment(Tweet tweet, SiteEntity site)
        {
            const int MaxRequestAttempts = 5;

            if (tweet.Comment == 0)
            {
                return;
            }

            string mid         = tweet.Mid;
            int    currentPage = 1;

            try
            {
                while (true)
                {
                    string url     = string.Format(CommentUrlFormat, mid, currentPage);
                    var    request = BuildRequest(url);

                    CrawlResponse response = null;
                    bool          fetched  = false;
                    for (int i = 0; i < MaxRequestAttempts && !fetched; i++)
                    {
                        CrawlResponse attempt = null;
                        try
                        {
                            attempt = GeckoRequestProcessor.DoRequest(request, site, null, null);
                            AggrSum();
                        }
                        catch (Exception ex)
                        {
                            Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                        }

                        // An attempt that threw produced no response to inspect.
                        if (attempt == null)
                        {
                            continue;
                        }

                        response = attempt;
                        if (response.Status == Enums.CrawlResult.Succ)
                        {
                            fetched = true;
                        }
                        else
                        {
                            Logger.Info("访问页面错误:Url = " + response.Url);
                        }
                    }

                    // Every attempt failed: do not try to parse a failed (or missing) response.
                    if (!fetched)
                    {
                        return;
                    }

                    // Trim(char[]) strips any of the characters '<', '/', 'p', 'r', 'e', '>' from
                    // both ends; it does not remove the literal "</pre>" substring.
                    CommentJsonResponse tmpResult =
                        JsonConvert.DeserializeObject <CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
                    if (tmpResult == null || tmpResult.data == null)
                    {
                        Logger.Info("Comment JSON carried no data: Url = " + url);
                        return;
                    }
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);

                    var pageMatch = Regex.Match(response.Content, RegexCommentPage,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    bool isRequestedPage =
                        pageMatch.Success &&
                        pageMatch.Groups["CurrentPageNum"].Value == currentPage.ToString(CultureInfo.InvariantCulture);

                    // The first page is accepted without a pager; any later page must report the
                    // page number that was asked for, otherwise the end has been reached.
                    if (currentPage != 1 && !isRequestedPage)
                    {
                        return;
                    }

                    var matches = Regex.Matches(response.Content, RegexComment,
                                                RegexOptions.IgnoreCase | RegexOptions.Multiline);

                    foreach (Match match in matches)
                    {
                        Comment comment = new Comment();
                        comment.Author    = match.Groups["Author"].Value;
                        comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                        comment.Content   = TextCleaner.FullClean(match.Groups["Content"].Value);
                        // Unparseable dates fall back to DateTime.MinValue.
                        comment.PubDate   = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                        tweet.Comments.Add(comment);
                    }

                    currentPage++;
                }
            }
            catch (Exception ex)
            {
                // Kept best-effort on purpose: the caller carries on with whatever was collected.
                Logger.Info("Failed to crawl comments: Mid = " + mid + ", page = " + currentPage + ", " + ex.Message);
            }
        }