Пример #1
0
        /// <summary>
        /// Builds the URL that requests one scroll segment of a user's tweet list.
        /// </summary>
        /// <param name="userTweet">User whose <c>Uid</c> is placed in the query string.</param>
        /// <param name="endId">Value sent as <c>end_id</c>.</param>
        /// <param name="maxId">Value sent as <c>max_id</c>.</param>
        /// <param name="page">Page number to request.</param>
        /// <param name="pos">Scroll segment within the page: 0, 1 or 2.</param>
        /// <returns>The complete request URL.</returns>
        /// <exception cref="Exception">Thrown when <paramref name="pos"/> is not 0, 1 or 2.</exception>
        private string BuildTweetJsonUrl(UserTweet userTweet, string endId, string maxId, int page, int pos)
        {
            // Query-string part shared by every scroll segment.
            string baseUrl = prefixWeiboUrl + "&max_id=" + maxId + "&end_id=" + endId + "&uid=" + userTweet.Uid;

            if (pos == 0)
            {
                // First scroll: 50 items, pre_page is the previous page (or the page itself when page <= 1).
                int previousPage = page;
                if (page > 1)
                {
                    previousPage = page - 1;
                }
                return(baseUrl + "&count=50&page=" + page + "&pre_page=" + previousPage);
            }

            if (pos == 1 || pos == 2)
            {
                // Second and third scroll share the same logic; only the pagebar value differs.
                int pageBar = pos - 1;
                return(baseUrl + "&count=15&page=" + page + "&pre_page=" + page + "&pagebar=" + pageBar);
            }

            throw new Exception("滚屏次数不对,目前新浪只支持每页3次滚屏");
        }
Пример #2
0
        /// <summary>
        /// Parses every tweet that <c>RegexContent</c> finds in <paramref name="content"/>,
        /// appends each one to <paramref name="result"/>'s <c>Tweets</c> collection and
        /// returns the tweets parsed by this call.
        /// </summary>
        /// <remarks>
        /// Parsing is best effort: a match that cannot be converted into a tweet is traced
        /// and skipped, and the remaining matches are still processed. No exception raised
        /// while converting a match escapes this method.
        /// </remarks>
        /// <param name="result">User record that receives the parsed tweets; its <c>Url</c> is passed to <c>RegexParser.AbsoluteUrl</c> together with each tweet's captured URL.</param>
        /// <param name="content">Text to scan for tweets.</param>
        /// <returns>The tweets parsed from <paramref name="content"/>, in match order; empty when nothing could be parsed.</returns>
        private Tweet[] FillUserTweet(UserTweet result, string content)
        {
            var          matches   = Regex.Matches(content, RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            List <Tweet> tweetList = new List <Tweet>();

            try
            {
                foreach (Match match in matches)
                {
                    try
                    {
                        Tweet tweet = new Tweet();
                        int   comment;
                        // TryParse leaves 0 in the counter when the capture is not numeric.
                        int.TryParse(match.Groups["Reply"].Value, out comment);
                        int forward;
                        int.TryParse(match.Groups["Forward"].Value, out forward);
                        tweet.Comment = comment;
                        tweet.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
                        tweet.Mid     = match.Groups["Mid"].Value;
                        tweet.Forward = forward;
                        tweet.Source  = match.Groups["Source"].Value;
                        tweet.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                        tweet.Url     = RegexParser.AbsoluteUrl(match.Groups["Url"].Value, result.Url, true);
                        // The tweet is published only once every field is set, so a failure never leaves a partial entry.
                        result.Tweets.Add(tweet);
                        tweetList.Add(tweet);
                    }
                    catch (Exception ex)
                    {
                        // One malformed entry must not discard the matches that follow it.
                        System.Diagnostics.Trace.TraceWarning("FillUserTweet: skipped a tweet that could not be parsed: " + ex);
                    }
                }
            }
            catch (Exception ex)
            {
                // The match collection is evaluated lazily, so enumeration itself can fail;
                // keep whatever was parsed so far.
                System.Diagnostics.Trace.TraceWarning("FillUserTweet: stopped enumerating matches: " + ex);
            }

            return(tweetList.ToArray());
        }
Пример #3
0
        /// <summary>
        /// Reads the profile fields captured by <c>RegexInfo</c> from <paramref name="content"/>
        /// and stores them on <paramref name="userTweet"/>.
        /// </summary>
        /// <remarks>
        /// The three counters are converted with <c>int.Parse</c>, so a capture that is not a
        /// valid integer (including the empty capture of a failed match) makes this method throw.
        /// </remarks>
        /// <param name="content">Page text to search.</param>
        /// <param name="userTweet">Record that receives Follow, Follower, TweetNum, Name and Uid.</param>
        private void FillUserInfo(string content, UserTweet userTweet)
        {
            const RegexOptions searchOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;
            var captured = Regex.Match(content, RegexInfo, searchOptions).Groups;

            // Counters first: these are the conversions that can throw.
            userTweet.Follow   = int.Parse(captured["Follow"].Value);
            userTweet.Follower = int.Parse(captured["Follower"].Value);
            userTweet.TweetNum = int.Parse(captured["TweetNum"].Value);

            // Text fields.
            string rawName = captured["Name"].Value;
            userTweet.Name = HTMLCleaner.CleanHTML(rawName, true);
            userTweet.Uid  = captured["Uid"].Value;
        }
Пример #4
0
        /// <summary>
        /// Crawls the user page at <paramref name="entryUrl"/>, then keeps requesting the scroll
        /// segments built by <c>BuildTweetJsonUrl</c> and writes every parsed tweet to an Excel
        /// workbook stored under <c>D:/output/{user name}/</c>.
        /// </summary>
        /// <remarks>
        /// When the workbook already exists the crawl resumes from the rows it already contains.
        /// The workbook is saved after every tweet. When <c>NeedCrawlComment</c> is set, the comments
        /// of each tweet are fetched and saved to a separate <c>{Mid}.xls</c> file unless that file
        /// already exists.
        /// </remarks>
        /// <param name="entryUrl">URL of the user's page.</param>
        /// <returns>
        /// The user record with the tweets parsed during this run. When the first request produces no
        /// response, or the user info cannot be parsed, the record is returned early and may contain
        /// little more than its <c>Url</c>.
        /// </returns>
        private UserTweet CrawlTask(string entryUrl)
        {
            _currentUrl = entryUrl;
            var Site    = SiteBusiness.GetBySiteID("Weibo");
            var Request = BuildRequest(entryUrl, RegexContent);

            Site.TimeoutSecs = 60;
            CrawlResponse Response = null;

            try
            {
                Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
            }
            catch
            {
                // Swallowed on purpose; the null check below handles the case where no response was produced.
            }

            if (Response == null)
            {
                // The request threw before producing a response, so there is nothing to parse.
                Logger.Info("访问页面错误:Url = " + entryUrl);
                UserTweet failed = new UserTweet();
                failed.Url = entryUrl;
                return(failed);
            }

            if (Response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + Response.Url);
            }
            var content = Response.Content;
            //First page
            UserTweet result = new UserTweet();

            result.Url = entryUrl;
            try
            {
                FillUserInfo(content, result);
            }
            catch
            {
                // Without the user info there is no name to build the output path from.
                return(result);
            }

            var endId = DeterminedMid(content, MidType.EndId);
            var maxId = DeterminedMid(content, MidType.MaxId);
            int       currentPage = 1;
            string    rootPath    = @"D:/output/" + result.Name + "/";
            Workbook  outputBook  = new Workbook();
            Worksheet sheet       = null;
            int       currentLine = 4; // first data row; rows 0-3 hold the headers written below
            int       pos         = 1; // scroll segment within a page (three per page): 0 = first, 1 = second, 2 = third
            bool      isContinue  = false;

            if (!Directory.Exists(rootPath))
            {
                Directory.CreateDirectory(rootPath);
            }
            if (File.Exists(rootPath + result.Name + ".xls"))
            {
                // Resume: find the first empty row after the data already written.
                outputBook.Open(rootPath + result.Name + ".xls");
                sheet = outputBook.Worksheets[result.Name];
                int endrow = currentLine;
                while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue))
                {
                    endrow++;
                }
                currentLine = endrow;
                // NOTE(review): presumably a page holds about 45 tweets - confirm.
                currentPage = (int)(currentLine / 45d) + 1;
                isContinue  = true;
            }
            else
            {
                sheet = outputBook.Worksheets.Add(result.Name);
            }
            //Save to excel

            //Initialize column
            sheet.Cells[0, 0].PutValue("姓名");
            sheet.Cells[0, 1].PutValue("网址");
            sheet.Cells[0, 2].PutValue("粉丝数");
            sheet.Cells[0, 3].PutValue("关注数");
            sheet.Cells[0, 4].PutValue("微博数");

            sheet.Cells[1, 0].PutValue(result.Name);
            sheet.Cells[1, 1].PutValue(result.Url);
            sheet.Cells[1, 2].PutValue(result.Follower);
            sheet.Cells[1, 3].PutValue(result.Follow);
            sheet.Cells[1, 4].PutValue(result.TweetNum);

            sheet.Cells[3, 0].PutValue("微博内容");
            sheet.Cells[3, 1].PutValue("发布时间");
            sheet.Cells[3, 2].PutValue("转发数");
            sheet.Cells[3, 3].PutValue("评论数");
            sheet.Cells[3, 4].PutValue("原帖地址");
            sheet.Cells[3, 5].PutValue("来源");
            sheet.Cells[3, 6].PutValue("具体评论");

            if (isContinue)
            {
                // Re-request the first segment of the resume page and look for its first tweet among
                // the rows already written, so that writing restarts from that row.
                pos = 0;
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                Request  = BuildRequest(url);
                Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
                JsonResponse tmpResult =
                    JsonConvert.DeserializeObject <JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                pos++;
                var firstTweet = FillUserTweet(result, Response.Content);
                var firstEntry = firstTweet.FirstOrDefault();
                result.Tweets.Clear();
                // Nothing parsed means there is no row to realign with; keep appending at currentLine.
                if (firstEntry != null)
                {
                    var firstUrl = firstEntry.Url;
                    int endrow   = currentLine - 1;
                    while (endrow > 3)
                    {
                        if (sheet.Cells[endrow, 4].StringValue == firstUrl)
                        {
                            currentLine = endrow;
                            break;
                        }
                        endrow--;
                    }
                }
            }
            //Crawl with json
            while (Response.Content != null && Regex.IsMatch(Response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase) && Response.Status == Enums.CrawlResult.Succ)
            {
                content = Response.Content.Trim();
                var currentTweet = FillUserTweet(result, content);
                foreach (Tweet tweet in currentTweet)
                {
                    string fileName = tweet.Mid + ".xls";
                    // Skip comments whose file already exists, e.g. from an earlier run that failed part-way.
                    if (NeedCrawlComment)
                    {
                        if (!File.Exists(rootPath + fileName))
                        {
                            FillTweetComment(tweet, Site);
                            if (tweet.Comments.Count > 0)
                            {
                                SaveComment(rootPath, tweet, fileName);
                            }
                        }
                    }

                    sheet.Cells[currentLine, 0].PutValue(tweet.Content);
                    sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
                    sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
                    sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
                    sheet.Cells[currentLine, 4].PutValue(tweet.Url);
                    sheet.Cells[currentLine, 5].PutValue(tweet.Source);

                    //link comment
                    if (File.Exists(rootPath + fileName))
                    {
                        sheet.Cells[currentLine, 6].PutValue("点击查看");
                        // The comment file sits next to the workbook, so the bare file name is used as the link.
                        string linkPath = fileName;
                        sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
                    }
                    // Saved after every tweet so an interrupted run can be resumed.
                    outputBook.Save(rootPath + result.Name + ".xls");
                    StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
                    Application.DoEvents();
                    currentLine++;
                }

                var nextUrl = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                Request = BuildRequest(nextUrl);

                // Collect the answer in a fresh variable: if every attempt throws, the previous
                // page's successful response must not be mistaken for the new one.
                CrawlResponse nextResponse = null;
                for (int i = 0; i < 5; i++)
                {
                    try
                    {
                        nextResponse = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                        AggrSum();
                    }
                    catch
                    {
                        // Swallowed on purpose; the attempt is retried below.
                    }
                    if (nextResponse == null)
                    {
                        Logger.Info("访问页面错误:Url = " + nextUrl);
                    }
                    else if (nextResponse.Status != Enums.CrawlResult.Succ)
                    {
                        Logger.Info("访问页面错误:Url = " + nextResponse.Url);
                    }
                    else
                    {
                        break;
                    }
                }

                if (nextResponse == null)
                {
                    // No attempt produced a response; stop instead of re-processing the old page.
                    break;
                }
                Response = nextResponse;

                try
                {
                    JsonResponse tmpResult =
                        JsonConvert.DeserializeObject <JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                    Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                }
                catch
                {
                    // The payload did not fit JsonResponse; try the comment-style shape instead.
                    try
                    {
                        CommentJsonResponse tmpResult =
                            JsonConvert.DeserializeObject <CommentJsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                        Response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    }
                    catch
                    {
                        // Neither shape fits; the content is left as received and the loop condition decides.
                    }
                }

                // Advance to the next scroll segment; after the third one move to the next page.
                pos = (pos + 1) % 3;
                if (pos == 0)
                {
                    currentPage++;
                }

                // The next request continues from the last tweet collected so far, if there is one.
                var lastTweet = result.Tweets.LastOrDefault();
                if (lastTweet != null)
                {
                    maxId = lastTweet.Mid;
                }
            }

            return(result);
        }