/// <summary>
/// Logs in to the "Weibo" site using the fixed login account "Weibo1200".
/// </summary>
/// <returns>
/// <c>true</c> when the login call returns a non-null response whose status is
/// <c>Enums.CrawlResult.Succ</c>; otherwise <c>false</c>.
/// </returns>
private bool Login()
{
    var weiboSite = SiteBusiness.GetBySiteID("Weibo");

    // Switching to another account is currently disabled; the previous flow is kept for reference:
    //AccountManager.NeedChangeAccount(Site);
    //var GotAccout = AccountManager.ChangeAccount(Site.SiteID, Site.AccountLimitReqs);
    ////Got a usable account, start logging in
    //AccountExtend CurrentAccount = AccountManager.GetCurrentAccount(Site.SiteID);
    //var AccountID = CurrentAccount.AccountID;

    // Run the login task and report its outcome.
    var loginAccount = LoginAccountBusiness.GetByAccountID("Weibo1200");
    var loginResponse = Crawler.Core.RequestProcessor.Processor.DoLogin(weiboSite, loginAccount);

    bool loggedIn = loginResponse != null && loginResponse.Status == Enums.CrawlResult.Succ;
    return loggedIn;
}
/// <summary>
/// Runs a keyword search against one site, page by page, and appends the combined result to
/// <paramref name="resultDataList"/>.
/// </summary>
/// <remarks>
/// The crawl definition used is the one returned by <c>CrawlBusiness.GetTopBySiteID</c> for the site.
/// Pages <c>keywordQuery.StartPage</c> through <c>keywordQuery.EndPage</c> (inclusive) are requested in order;
/// a page is merged into the result only when it has items and its last item ID differs from the last item ID
/// of the most recently merged page. After each page, progress is reported through <c>backgroundWorker1</c>.
/// When <c>DetailChk</c> is checked, item details are crawled for the combined list before extraction.
/// Nothing is appended when no page produced items.
/// </remarks>
/// <param name="siteEntity">Site whose <c>SiteID</c> selects the crawl definition.</param>
/// <param name="keywordQuery">Supplies the keyword and the inclusive page range.</param>
/// <param name="resultDataList">Receives one <c>AnalyzeData</c> entry when any items were found.</param>
/// <param name="ProgressPercStart">Progress value corresponding to "no page done yet".</param>
/// <param name="ProgressPercEnd">Progress value reported once the last page is done.</param>
private void Search(SiteEntity siteEntity, KeywordQuery keywordQuery, List<AnalyzeData> resultDataList, int ProgressPercStart, int ProgressPercEnd)
{
    var firstCrawl = CrawlBusiness.GetTopBySiteID(siteEntity.SiteID, "");
    var crawlID = firstCrawl.CrawlID;
    string lastItemID = null;
    var keyword = keywordQuery.Keyword;
    var keywordExclude = "";
    var startPage = keywordQuery.StartPage;
    var endPage = keywordQuery.EndPage;
    var crawl = CrawlBusiness.GetByCrawlID(crawlID);
    var site = SiteBusiness.GetBySiteID(crawl.SiteID);
    ListResponse result = null;

    for (int currentPage = startPage; currentPage <= endPage; currentPage++)
    {
        CrawlRequest request = CrawlRequest.GetQueryUrl(crawlID, keyword, currentPage, keywordExclude, "", "");
        crawl.KeywordQuery = keyword;
        crawl.KeywordAny = "";
        crawl.KeywordNot = keywordExclude;
        crawl.KeywordSite = "";

        var response = Core.Crawler.SimpleCrawler.CrawlList_Single(request, crawl, site, true, null);
        var currentItems = response.ExtractItems(crawl);

        if (currentItems != null && currentItems.Any())
        {
            var lastItem = currentItems.Last();

            // A page ending in the same item as the previously merged page is treated as a repeat and skipped.
            if (lastItem.ItemID != lastItemID)
            {
                lastItemID = lastItem.ItemID;

                //bool stopCrawl;
                //ExistCheck.ExistCheck_List(
                //    response, null,
                //    (Enums.ExistItemStrategy) crawl.ExistItemStrategy,
                //    (Enums.ContentDetailLevel) site.ContentDetailLevel, crawl.IssueID,
                //    crawl.CrawlID, out stopCrawl);

                if (result == null)
                {
                    result = response;
                }
                else
                {
                    result.CombineList(response);
                }
            }
        }

        // Progress is based on the number of pages completed relative to startPage, so the value
        // stays inside [ProgressPercStart, ProgressPercEnd] and reaches ProgressPercEnd on the last
        // page for any start page (the absolute page number only worked when startPage was 0).
        int pageCount = endPage - startPage + 1;
        int pagesDone = currentPage - startPage + 1;
        backgroundWorker1.ReportProgress(ProgressPercStart + (ProgressPercEnd - ProgressPercStart) * pagesDone / pageCount);
    }

    // Get Item
    if (result != null)
    {
        if (DetailChk.Checked)
        {
            // NOTE(review): `Site` is not a local of this method (the local is `site`); it resolves to
            // something declared outside this block — confirm it is the intended source of the name.
            this.Text = Site.Name + " 抓取Items";
            Core.Crawler.SimpleCrawler.CrawlItem_Multi(result, crawl, site, msg => { });
            this.Text = @"Palas搜索工具";
        }

        var items = result.ExtractItems(crawl, null);
        AnalyzeData data = new AnalyzeData() { Items = items, CrawlID = crawl.CrawlID };
        resultDataList.Add(data);
    }
}
/// <summary>
/// Crawls the Weibo user page at <paramref name="entryUrl"/> and the follow-up JSON pages behind it,
/// writing one row per tweet into the workbook <c>D:/output/{name}/{name}.xls</c>.
/// </summary>
/// <remarks>
/// If the workbook already exists, the method resumes: it skips the rows already filled, re-requests the
/// page those rows imply, and continues writing from the row whose URL matches the first tweet of that page.
/// The workbook is saved after every tweet row. When <c>NeedCrawlComment</c> is set, comments of a tweet
/// are crawled and saved to <c>{Mid}.xls</c> unless that file already exists.
/// Paging stops when a follow-up page cannot be fetched successfully within the retry limit, or when the
/// current content no longer matches <c>RegexContent</c>.
/// </remarks>
/// <param name="entryUrl">URL of the user's page; also stored in the returned object's <c>Url</c>.</param>
/// <returns>
/// The <c>UserTweet</c> built for this user. It is returned early, before any file is touched, when the
/// first request throws without producing a response or when <c>FillUserInfo</c> throws.
/// </returns>
private UserTweet CrawlTask(string entryUrl)
{
    // Upper bound on attempts for each follow-up page request.
    const int maxAttempts = 5;

    _currentUrl = entryUrl;
    var Site = SiteBusiness.GetBySiteID("Weibo");
    var Request = BuildRequest(entryUrl, RegexContent);
    Site.TimeoutSecs = 60;

    UserTweet result = new UserTweet();
    result.Url = entryUrl;

    CrawlResponse Response = null;
    try
    {
        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
        AggrSum();
    }
    catch (System.Exception ex)
    {
        Logger.Info("访问页面错误:Url = " + entryUrl + ", " + ex.Message);
    }

    if (Response == null)
    {
        // The request threw before producing a response, so there is nothing to parse.
        return(result);
    }
    if (Response.Status != Enums.CrawlResult.Succ)
    {
        Logger.Info("访问页面错误:Url = " + Response.Url);
    }
    var content = Response.Content;

    // First page
    try
    {
        FillUserInfo(content, result);
    }
    catch
    {
        // Best effort: without user info the result carries only what was set so far.
        return(result);
    }

    var endId = DeterminedMid(content, MidType.EndId);
    var maxId = DeterminedMid(content, MidType.MaxId);

    int currentPage = 1;
    string rootPath = @"D:/output/" + result.Name + "/";
    string bookPath = rootPath + result.Name + ".xls";
    Workbook outputBook = new Workbook();
    Worksheet sheet = null;
    int currentLine = 4;      // first tweet row; rows 0-3 hold the profile block and the column headers
    int pos = 1;              // which scroll-load within a page (three per page): 0 = first, 1 = second, 2 = third
    bool isContinue = false;

    if (!Directory.Exists(rootPath))
    {
        Directory.CreateDirectory(rootPath);
    }

    if (File.Exists(bookPath))
    {
        outputBook.Open(bookPath);
        sheet = outputBook.Worksheets[result.Name];

        // Resume: advance past every row that already has text in column 0.
        while (!string.IsNullOrEmpty(sheet.Cells[currentLine, 0].StringValue))
        {
            currentLine++;
        }
        // NOTE(review): 45 is presumably the number of tweets per page — confirm.
        currentPage = (int)(currentLine / 45d) + 1;
        isContinue = true;
    }
    else
    {
        sheet = outputBook.Worksheets.Add(result.Name);
    }

    // Save to excel
    // Initialize column
    sheet.Cells[0, 0].PutValue("姓名");
    sheet.Cells[0, 1].PutValue("网址");
    sheet.Cells[0, 2].PutValue("粉丝数");
    sheet.Cells[0, 3].PutValue("关注数");
    sheet.Cells[0, 4].PutValue("微博数");
    sheet.Cells[1, 0].PutValue(result.Name);
    sheet.Cells[1, 1].PutValue(result.Url);
    sheet.Cells[1, 2].PutValue(result.Follower);
    sheet.Cells[1, 3].PutValue(result.Follow);
    sheet.Cells[1, 4].PutValue(result.TweetNum);
    sheet.Cells[3, 0].PutValue("微博内容");
    sheet.Cells[3, 1].PutValue("发布时间");
    sheet.Cells[3, 2].PutValue("转发数");
    sheet.Cells[3, 3].PutValue("评论数");
    sheet.Cells[3, 4].PutValue("原帖地址");
    sheet.Cells[3, 5].PutValue("来源");
    sheet.Cells[3, 6].PutValue("具体评论");

    if (isContinue)
    {
        pos = 0;
        var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(url);
        Response = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
        AggrSum();
        JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
        Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        pos++;

        var firstTweet = FillUserTweet(result, Response.Content);
        // Read the first tweet before clearing result.Tweets, in case both refer to the same items.
        var first = firstTweet.FirstOrDefault();
        result.Tweets.Clear();

        // Rewind to the already-written row that holds this page's first tweet, so the page is
        // rewritten in place. With no tweet on the page there is nothing to align with.
        if (first != null)
        {
            var firstUrl = first.Url;
            int endrow = currentLine - 1;
            while (endrow > 3)
            {
                if (sheet.Cells[endrow, 4].StringValue == firstUrl)
                {
                    currentLine = endrow;
                    break;
                }
                endrow--;
            }
        }
    }

    // Crawl with json
    while (Response.Status == Enums.CrawlResult.Succ
           && Response.Content != null
           && Regex.IsMatch(Response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase))
    {
        content = Response.Content.Trim();
        var currentTweet = FillUserTweet(result, content);

        foreach (Tweet tweet in currentTweet)
        {
            string fileName = tweet.Mid + ".xls";
            string commentPath = rootPath + fileName;

            // A comment file that already exists (left by an earlier, failed run) is not crawled again.
            if (NeedCrawlComment && !File.Exists(commentPath))
            {
                FillTweetComment(tweet, Site);
                if (tweet.Comments.Count > 0)
                {
                    SaveComment(rootPath, tweet, fileName);
                }
            }

            sheet.Cells[currentLine, 0].PutValue(tweet.Content);
            sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
            sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
            sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
            sheet.Cells[currentLine, 4].PutValue(tweet.Url);
            sheet.Cells[currentLine, 5].PutValue(tweet.Source);

            // Link the row to its comment workbook; the link target is the bare file name.
            if (File.Exists(commentPath))
            {
                sheet.Cells[currentLine, 6].PutValue("点击查看");
                //string linkPath = result.Name + "/" + fileName;
                string linkPath = fileName;
                sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
            }

            // Saved after every row so an interrupted run can be resumed from the file.
            outputBook.Save(bookPath);
            StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
            Application.DoEvents();
            currentLine++;
        }

        var nextUrl = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
        Request = BuildRequest(nextUrl);

        // Fetch into a separate variable: if an attempt throws, the previous page's (successful)
        // response must not be mistaken for the result of this request.
        CrawlResponse next = null;
        for (int attempt = 0; attempt < maxAttempts; attempt++)
        {
            next = null;
            try
            {
                next = GeckoRequestProcessor.DoRequest(Request, Site, null, null);
                AggrSum();
            }
            catch (System.Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + nextUrl + ", " + ex.Message);
            }

            if (next != null)
            {
                if (next.Status == Enums.CrawlResult.Succ)
                {
                    break;
                }
                Logger.Info("访问页面错误:Url = " + next.Url);
            }
        }

        if (next == null || next.Status != Enums.CrawlResult.Succ)
        {
            // Every attempt failed; stop paging and return what has been collected.
            break;
        }
        Response = next;

        // The payload is expected as a JsonResponse; fall back to the CommentJsonResponse shape.
        try
        {
            JsonResponse tmpResult = JsonConvert.DeserializeObject<JsonResponse>(Response.Content.Trim("</pre>".ToArray()));
            Response.Content = HttpUtility.HtmlDecode(tmpResult.data);
        }
        catch
        {
            try
            {
                CommentJsonResponse tmpResult = JsonConvert.DeserializeObject<CommentJsonResponse>(Response.Content.Trim("</pre>".ToArray()));
                Response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
            }
            catch (System.Exception ex)
            {
                // Content stays undecoded; the loop condition decides whether it is still usable.
                Logger.Info("访问页面错误:Url = " + Response.Url + ", " + ex.Message);
            }
        }

        pos = (pos + 1) % 3;
        if (pos == 0)
        {
            currentPage++;
        }

        // NOTE(review): presumably FillUserTweet appends to result.Tweets — confirm. With no tweets
        // collected, the previous maxId is kept instead of calling Last() on an empty sequence.
        if (result.Tweets != null && result.Tweets.Any())
        {
            maxId = result.Tweets.Last().Mid;
        }
    }

    return(result);
}