Exemple #1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> with the download method selected in the
        /// form, runs <c>PageAutoAnalyzer.AnalyzeContent</c> on the downloaded content and copies
        /// the extracted fields into the form's text boxes.
        /// </summary>
        /// <param name="title">Title used to seed a new <c>PageElement</c> when <paramref name="pageElement"/> is null.</param>
        /// <param name="url">Address to download; also the base against which the extracted URL is expanded.</param>
        /// <param name="pageElement">Optional pre-filled element passed to the analyzer; created from <paramref name="title"/> and <paramref name="url"/> when null.</param>
        /// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
        private void ParsePage(string title, string url, PageElement pageElement = null)
        {
            if (pageElement == null)
            {
                pageElement = new PageElement {
                    Title = title, Url = url
                };
            }

            var xpath = new ItemPageXPaths();
            List <SubItemElement> subList;
            string content;

            // Only the download step differs between the two modes; the analysis call is shared below.
            if (GeckoDownRd.Checked)
            {
                CrawlResponse resp = GeckoRequestProcessor.DoRequest(BuildFakeRequest(url), BuildFakeSiteEntity(), null, null, null, true, 1000);
                content = resp.Content;
            }
            else if (HttpdownRd.Checked)
            {
                content = WebRequestProcessor.DownloadHTTPString(url, 30);
            }
            else
            {
                // NotSupportedException matches what ParseListBtn_Click throws for the same
                // situation and is still caught by any existing catch (Exception) handler.
                throw new NotSupportedException("不支持该方式分析正文");
            }

            PageElement result = PageAutoAnalyzer.AnalyzeContent(content, pageElement,
                                                                 DeterminedMode(), new IdentityContentElement(), ref xpath,
                                                                 out subList, 86400, ExcludeTxt.Text);

            if (result == null)
            {
                // The analyzer produced nothing; leave the form untouched.
                return;
            }

            PageUrlTxt.Text       = HtmlUtility.ExpandRelativePath(url, result.Url);
            TitleTxt.Text         = result.Title;
            ContentTxt.Text       = result.Content;
            ViewTxt.Text          = result.View.ToString();
            ReplyTxt.Text         = result.Reply.ToString();
            PubdateTxt.Text       = result.Pubdate == null ? "" : result.Pubdate.ToString();
            AuthorTxt.Text        = result.Author;
            MediaTxt.Text         = result.MediaName;
            ElementXPathTxt.Text  = result.ElementXPath;
            ElementBlockTxt.Text  = result.ElementBlock;
            NextpageXPathTxt.Text = result.NextPageXPath;
        }
Exemple #2
0
        /// <summary>
        /// Click handler for the "parse list" button: downloads the page whose address is typed
        /// into <c>InputUrlTxt</c>, asks <c>PageAutoAnalyzer.AnalyzeArticleList</c> for the list
        /// entries and binds them to <c>ListGridView</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <exception cref="NotSupportedException">Neither the Gecko nor the HTTP download option is checked.</exception>
        private void ParseListBtn_Click(object sender, EventArgs e)
        {
            string        inputUrl      = InputUrlTxt.Text;
            RecogniseMode recogniseMode = DeterminedMode();
            var           listXPaths    = new ListPageXPaths();
            PageElement[] parsedItems   = null;

            if (GeckoDownRd.Checked)
            {
                // Browser-based download: the analyzer is given the URL reported by the response.
                CrawlResponse geckoResponse = GeckoRequestProcessor.DoRequest(BuildFakeRequest(inputUrl), BuildFakeSiteEntity(), null, null, null, true, 1000);
                string        html          = geckoResponse.Content;
                var           analysis      = PageAutoAnalyzer.AnalyzeArticleList(geckoResponse.Url, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
                if (analysis != null)
                {
                    parsedItems = analysis.List;
                }
            }
            else if (HttpdownRd.Checked)
            {
                // Plain HTTP download: the analyzer is given the URL exactly as typed.
                string html     = WebRequestProcessor.DownloadHTTPString(inputUrl, 30);
                var    analysis = PageAutoAnalyzer.AnalyzeArticleList(inputUrl, html, recogniseMode, new IdentityPageElement(), ref listXPaths, 86400);
                if (analysis != null)
                {
                    parsedItems = analysis.List;
                }
            }
            else
            {
                throw new NotSupportedException("不支持当前项抓取");
            }

            if (parsedItems == null)
            {
                MessageBox.Show("解析不出数据");
                return;
            }

            // Turn every extracted link into an absolute one, relative to the typed URL.
            for (int index = 0; index < parsedItems.Length; index++)
            {
                PageElement item = parsedItems[index];
                item.Url = HtmlUtility.ExpandRelativePath(inputUrl, item.Url);
            }

            ListGridView.DataSource = parsedItems;
        }
Exemple #3
0
        /// <summary>
        /// Crawls the tweets of the user whose first page is <paramref name="entryUrl"/> and writes
        /// them, one row per tweet, into the workbook "D:/output/{name}/{name}.xls".
        /// </summary>
        /// <remarks>
        /// Sheet layout: rows 0-1 hold the profile header and values, row 3 holds the tweet column
        /// headers, tweets start at row 4. When the workbook already exists the crawl resumes: the
        /// page number is derived from the number of rows already written and the write position
        /// is moved back to the row whose URL equals the first tweet of that page.
        /// The workbook is saved after every tweet. When <c>NeedCrawlComment</c> is set, comments
        /// are fetched and saved for tweets that have no "{mid}.xls" file yet.
        /// </remarks>
        /// <param name="entryUrl">Address of the user's first tweet page.</param>
        /// <returns>
        /// The collected <c>UserTweet</c>. Only <c>Url</c> is guaranteed to be set when the first
        /// request or the profile parsing fails.
        /// </returns>
        private UserTweet CrawlTask(string entryUrl)
        {
            _currentUrl = entryUrl;
            var site    = SiteBusiness.GetBySiteID("Weibo");
            var request = BuildRequest(entryUrl, RegexContent);

            site.TimeoutSecs = 60;
            CrawlResponse response = null;

            try
            {
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
            }
            catch (Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + entryUrl + ", " + ex.Message);
            }

            //First page
            UserTweet result = new UserTweet();
            result.Url = entryUrl;

            if (response == null)
            {
                // The request threw before producing a response; there is nothing to parse.
                // Same outcome as the profile-parsing failure path below.
                return(result);
            }

            if (response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + response.Url);
            }
            var content = response.Content;

            try
            {
                FillUserInfo(content, result);
            }
            catch (Exception ex)
            {
                Logger.Info("解析用户信息失败:Url = " + entryUrl + ", " + ex.Message);
                return(result);
            }

            var endId = DeterminedMid(content, MidType.EndId);
            var maxId = DeterminedMid(content, MidType.MaxId);

            int       currentPage = 1;
            string    rootPath    = @"D:/output/" + result.Name + "/";
            string    bookPath    = rootPath + result.Name + ".xls";
            char[]    preTagChars = "</pre>".ToCharArray(); // characters stripped from both ends of a JSON response
            Workbook  outputBook  = new Workbook();
            Worksheet sheet       = null;
            int       currentLine = 4;
            int       pos         = 1; // index of the scroll-load within one page (3 per page): 0 = first, 1 = second, 2 = third
            bool      isContinue  = false;

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(rootPath);

            if (File.Exists(bookPath))
            {
                // Resume: find the first empty row and derive the page to restart from.
                outputBook.Open(bookPath);
                sheet = outputBook.Worksheets[result.Name];
                int endrow = currentLine;
                while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue))
                {
                    endrow++;
                }
                currentLine = endrow;
                currentPage = (int)(currentLine / 45d) + 1;
                isContinue  = true;
            }
            else
            {
                sheet = outputBook.Worksheets.Add(result.Name);
            }
            //Save to excel

            //Initialize column
            sheet.Cells[0, 0].PutValue("姓名");
            sheet.Cells[0, 1].PutValue("网址");
            sheet.Cells[0, 2].PutValue("粉丝数");
            sheet.Cells[0, 3].PutValue("关注数");
            sheet.Cells[0, 4].PutValue("微博数");

            sheet.Cells[1, 0].PutValue(result.Name);
            sheet.Cells[1, 1].PutValue(result.Url);
            sheet.Cells[1, 2].PutValue(result.Follower);
            sheet.Cells[1, 3].PutValue(result.Follow);
            sheet.Cells[1, 4].PutValue(result.TweetNum);

            sheet.Cells[3, 0].PutValue("微博内容");
            sheet.Cells[3, 1].PutValue("发布时间");
            sheet.Cells[3, 2].PutValue("转发数");
            sheet.Cells[3, 3].PutValue("评论数");
            sheet.Cells[3, 4].PutValue("原帖地址");
            sheet.Cells[3, 5].PutValue("来源");
            sheet.Cells[3, 6].PutValue("具体评论");

            if (isContinue)
            {
                pos = 0;
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request  = BuildRequest(url);
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
                JsonResponse tmpResult =
                    JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim(preTagChars));
                response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                pos++;

                var    firstTweet = FillUserTweet(result, response.Content);
                var    first      = firstTweet.FirstOrDefault();
                string firstUrl   = first == null ? null : first.Url;
                result.Tweets.Clear();

                // Rewind to the row that already holds the first tweet of the resumed page so it is
                // overwritten instead of duplicated. With no tweet on that page there is nothing to
                // align with, so new rows are simply appended.
                if (firstUrl != null)
                {
                    int endrow = currentLine - 1;
                    while (endrow > 3)
                    {
                        if (sheet.Cells[endrow, 4].StringValue == firstUrl)
                        {
                            currentLine = endrow;
                            break;
                        }
                        endrow--;
                    }
                }
            }

            //Crawl with json
            while (response != null &&
                   response.Status == Enums.CrawlResult.Succ &&
                   response.Content != null &&
                   Regex.IsMatch(response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase))
            {
                content = response.Content.Trim();
                var currentTweet = FillUserTweet(result, content);
                foreach (Tweet tweet in currentTweet)
                {
                    string fileName = tweet.Mid + ".xls";
                    // Skip tweets whose comment file already exists from an earlier (failed) run.
                    if (NeedCrawlComment)
                    {
                        if (!File.Exists(rootPath + fileName))
                        {
                            FillTweetComment(tweet, site);
                            if (tweet.Comments.Count > 0)
                            {
                                SaveComment(rootPath, tweet, fileName);
                            }
                        }
                    }

                    sheet.Cells[currentLine, 0].PutValue(tweet.Content);
                    sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
                    sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
                    sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
                    sheet.Cells[currentLine, 4].PutValue(tweet.Url);
                    sheet.Cells[currentLine, 5].PutValue(tweet.Source);

                    //link comment
                    if (File.Exists(rootPath + fileName))
                    {
                        sheet.Cells[currentLine, 6].PutValue("点击查看");
                        // The comment file sits next to the workbook, so the bare file name is used as the link.
                        string linkPath = fileName;
                        sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
                    }
                    // Saved after every tweet so that an interrupted run can be resumed.
                    outputBook.Save(bookPath);
                    StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
                    Application.DoEvents();
                    currentLine++;
                }

                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request = BuildRequest(url);

                // Up to five attempts. The attempt result is kept in its own variable so that a
                // thrown request can never be mistaken for the previous page's successful response.
                CrawlResponse nextResponse = null;
                for (int i = 0; i < 5; i++)
                {
                    nextResponse = null;
                    try
                    {
                        nextResponse = GeckoRequestProcessor.DoRequest(request, site, null, null);
                        AggrSum();
                    }
                    catch (Exception ex)
                    {
                        Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                    }

                    if (nextResponse == null)
                    {
                        continue;
                    }
                    if (nextResponse.Status != Enums.CrawlResult.Succ)
                    {
                        Logger.Info("访问页面错误:Url = " + nextResponse.Url);
                    }
                    else
                    {
                        break;
                    }
                }

                if (nextResponse == null)
                {
                    // Every attempt threw: stop crawling and return what has been collected.
                    break;
                }
                response = nextResponse;

                try
                {
                    JsonResponse tmpResult =
                        JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim(preTagChars));
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                }
                catch (Exception)
                {
                    // The payload did not fit JsonResponse; try the CommentJsonResponse shape instead.
                    try
                    {
                        CommentJsonResponse tmpResult =
                            JsonConvert.DeserializeObject <CommentJsonResponse>(response.Content.Trim(preTagChars));
                        response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the content is left as received and the loop condition
                        // decides whether it can still be processed.
                        Logger.Info("解析微博数据失败:Url = " + url + ", " + ex.Message);
                    }
                }

                pos = (pos + 1) % 3;
                if (pos == 0)
                {
                    currentPage++;
                }
                // Last() throws on an empty collection; keep the previous maxId in that case.
                if (result.Tweets.Any())
                {
                    maxId = result.Tweets.Last().Mid;
                }
            }

            return(result);
        }
Exemple #4
0
        /// <summary>
        /// Downloads the comment pages of <paramref name="tweet"/> one after another and appends
        /// every comment found to <c>tweet.Comments</c>.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>tweet.Comment</c> is 0. Each page is requested up to five times.
        /// Paging stops when a page cannot be downloaded or parsed, or when a page after the first
        /// does not report the requested page number. Failures are logged and never thrown;
        /// comments collected before the failure are kept.
        /// </remarks>
        /// <param name="tweet">Tweet whose comments are fetched; its <c>Mid</c> and the page number are formatted into <c>CommentUrlFormat</c>.</param>
        /// <param name="site">Site settings passed to the request processor.</param>
        private void FillTweetComment(Tweet tweet, SiteEntity site)
        {
            if (tweet.Comment == 0)
            {
                return;
            }
            int    currentPage = 1;
            string mid         = tweet.Mid;
            char[] preTagChars = "</pre>".ToCharArray(); // characters stripped from both ends of a JSON response

            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;

            try
            {
                while (true)
                {
                    string url = string.Format(CommentUrlFormat, mid, currentPage);

                    var request = BuildRequest(url);

                    CrawlResponse response = null;
                    for (int i = 0; i < 5; i++)
                    {
                        response = null;
                        try
                        {
                            response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                            AggrSum();
                        }
                        catch (Exception ex)
                        {
                            Logger.Info("访问页面错误:Url = " + url + ", " + ex.Message);
                        }

                        if (response == null)
                        {
                            // The request threw; there is no response to inspect, try again.
                            continue;
                        }
                        if (response.Status != Enums.CrawlResult.Succ)
                        {
                            Logger.Info("访问页面错误:Url = " + response.Url);
                        }
                        else
                        {
                            break;
                        }
                    }

                    if (response == null ||
                        response.Status != Enums.CrawlResult.Succ ||
                        response.Content == null)
                    {
                        // All attempts failed: keep what was collected so far and stop paging.
                        return;
                    }

                    CommentJsonResponse tmpResult =
                        JsonConvert.DeserializeObject <CommentJsonResponse>(response.Content.Trim(preTagChars));
                    if (tmpResult == null || tmpResult.data == null)
                    {
                        return;
                    }
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    if (response.Content == null)
                    {
                        return;
                    }

                    // From the second page on, the page must report the number that was requested;
                    // anything else means the last page has been passed.
                    var pageMatch = Regex.Match(response.Content, RegexCommentPage, options);
                    if (currentPage != 1 &&
                        (!pageMatch.Success ||
                         pageMatch.Groups["CurrentPageNum"].Value != currentPage.ToString(CultureInfo.InvariantCulture)))
                    {
                        return;
                    }

                    //Fill Tweet
                    var matches = Regex.Matches(response.Content, RegexComment, options);

                    foreach (Match match in matches)
                    {
                        Comment comment = new Comment();
                        comment.Author    = match.Groups["Author"].Value;
                        comment.AuthorUrl = RegexParser.AbsoluteUrl(match.Groups["AuthorUrl"].Value, tweet.Url, true);
                        comment.Content   = TextCleaner.FullClean(match.Groups["Content"].Value);
                        comment.PubDate   = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
                        tweet.Comments.Add(comment);
                    }

                    currentPage++;
                }
            }
            catch (Exception ex)
            {
                // Comment crawling is best effort: a failure must not abort the tweet crawl,
                // but it is logged instead of being silently dropped.
                Logger.Info("抓取评论失败:Mid = " + mid + ", " + ex.Message);
            }
        }