Example #1
0
        /// <summary>
        /// Logs the fixed account "Weibo1200" into the "Weibo" site.
        /// </summary>
        /// <remarks>
        /// The account is looked up by a hard-coded ID; no account rotation is performed here.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when the login call returns a non-null response whose status is
        /// <c>Enums.CrawlResult.Succ</c>; otherwise <c>false</c>.
        /// </returns>
        private bool Login()
        {
            var weiboSite    = SiteBusiness.GetBySiteID("Weibo");
            var loginAccount = LoginAccountBusiness.GetByAccountID("Weibo1200");

            // Run the login task and judge it purely by the returned status.
            var loginResponse = Crawler.Core.RequestProcessor.Processor.DoLogin(weiboSite, loginAccount);

            return loginResponse != null && loginResponse.Status == Enums.CrawlResult.Succ;
        }
Example #2
0
        /// <summary>
        /// Fills the search-engine check list with the visible sites of content type 4 or 5
        /// (sorted by name) and presets the start date to 180 days before now.
        /// </summary>
        private void Init()
        {
            const string siteFilter = "(ContentType=4 OR ContentType=5) AND IsVisible=True";

            var matchingSites = SiteBusiness.GetByWhere(siteFilter, null, "", 0, 1000);
            var sitesByName   = matchingSites.OrderBy(site => site.Name).ToArray();

            // Bind the list: show the site name, use the site ID as the value.
            SearchEngineChkList.DataSource    = sitesByName;
            SearchEngineChkList.DisplayMember = "Name";
            SearchEngineChkList.ValueMember   = "SiteID";

            StartDate.Value = DateTime.Now.AddDays(-180);
        }
Example #3
0
        /// <summary>
        /// Handles the "GetCurDomain" and "GetCurSite" messages.
        /// </summary>
        /// <param name="context">Request context; its trimmed <c>MessageName</c> selects the action.</param>
        /// <returns>
        /// A result whose <c>Result</c> is <c>true</c> and whose <c>ReturnString</c> carries the answer
        /// for a handled message. For an unknown message, or when an exception is caught,
        /// <c>Result</c> stays <c>false</c> and <c>ReturnMessage</c> is set to <c>null</c>.
        /// </returns>
        public override BaseActionResult Execute(Context context)
        {
            var actionResult = new BaseActionResult();
            actionResult.Result = false;

            try
            {
                string messageName = context.MessageName.Trim();

                if (messageName == "GetCurDomain")
                {
                    actionResult.ReturnString = ConfigurationManager.AppSettings["Domain"];
                    actionResult.Result       = true;
                }
                else if (messageName == "GetCurSite")
                {
                    var siteBusiness = new SiteBusiness();
                    actionResult.ReturnString = siteBusiness.GetSite(ConfigurationManager.AppSettings["Site"]);

                    // Fall back to the raw configured value when the lookup yields nothing usable.
                    if (string.IsNullOrWhiteSpace(actionResult.ReturnString))
                    {
                        actionResult.ReturnString = ConfigurationManager.AppSettings["Site"];
                    }
                    actionResult.Result = true;
                }
                else
                {
                    actionResult.ReturnMessage = null;
                }
            }
            catch (Exception e)
            {
                // The frame describes this handler's location, not the place the exception was thrown.
                var handlerFrame = new System.Diagnostics.StackFrame(true);
                logger.Error((long)ModuleEnum.Framework_WS, ModuleInstanceName.Framework, 1, e.Message,
                             Application.StartupPath.ToString(),
                             handlerFrame.GetFileName(),
                             handlerFrame.GetFileLineNumber());
                actionResult.ReturnMessage = null;
            }

            return actionResult;
        }
Example #4
0
        /// <summary>
        /// Handles the "GetFilterSite" message by querying the filtered site list into
        /// <c>bar.DataSetData</c>.
        /// </summary>
        /// <param name="context">
        /// Request context; its trimmed <c>MessageName</c> selects the action and its
        /// <c>Parameters</c> supply "UserGuid", "RoleName", "CurSite" and "MatchingName".
        /// </param>
        /// <returns>
        /// The shared <c>bar</c> result. <c>Result</c> is <c>true</c> after a successful query;
        /// for an unknown message or a caught exception it is <c>false</c> and
        /// <c>ReturnMessage</c> is <c>null</c>.
        /// </returns>
        public override BaseActionResult Execute(Context context)
        {
            // Start every call with an empty data set, even if the action fails later.
            bar.DataSetData = new DataSet();

            try
            {
                string messageName = context.MessageName.Trim();

                if (messageName == "GetFilterSite")
                {
                    string userGuid     = CommonGlobalSettings.Utilities.GetParameter("UserGuid", context.Parameters);
                    string roleName     = CommonGlobalSettings.Utilities.GetParameter("RoleName", context.Parameters);
                    string currentSite  = CommonGlobalSettings.Utilities.GetParameter("CurSite", context.Parameters);
                    string matchingName = CommonGlobalSettings.Utilities.GetParameter("MatchingName", context.Parameters);

                    var siteBusiness = new SiteBusiness();
                    bar.DataSetData = siteBusiness.GetFilterSite(userGuid, roleName, currentSite, matchingName);
                    bar.Result      = true;
                }
                else
                {
                    bar.ReturnMessage = null;
                    bar.Result        = false;
                }
            }
            catch (Exception e)
            {
                // The frame describes this handler's location, not the place the exception was thrown.
                var handlerFrame = new System.Diagnostics.StackFrame(true);
                logger.Error((long)ModuleEnum.Framework_WS, ModuleInstanceName.Framework, 1, e.Message,
                             Application.StartupPath.ToString(),
                             handlerFrame.GetFileName(),
                             handlerFrame.GetFileLineNumber());
                bar.ReturnMessage = null;
                bar.Result        = false;
            }

            return bar;
        }
Example #5
0
        /// <summary>
        /// Crawls the keyword result pages of one site and appends the collected items to
        /// <paramref name="resultDataList"/>.
        /// </summary>
        /// <remarks>
        /// Pages <c>StartPage</c>..<c>EndPage</c> (inclusive) are fetched one by one. A page is merged
        /// into the overall result only when it yields items and its last item differs from the last
        /// item of the previously accepted page. When the detail check box is ticked, the individual
        /// items are crawled as well before extraction.
        /// NOTE(review): this method reports through backgroundWorker1 yet also assigns this.Text —
        /// confirm which thread it runs on.
        /// </remarks>
        /// <param name="siteEntity">Site whose top crawl (by <c>SiteID</c>) is used for the search.</param>
        /// <param name="keywordQuery">Supplies the keyword and the inclusive page range.</param>
        /// <param name="resultDataList">Receives one <c>AnalyzeData</c> entry when any page produced items.</param>
        /// <param name="ProgressPercStart">Progress value corresponding to "no page processed yet".</param>
        /// <param name="ProgressPercEnd">Progress value reported once the last page is processed.</param>
        private void Search(SiteEntity siteEntity, KeywordQuery keywordQuery, List <AnalyzeData> resultDataList, int ProgressPercStart, int ProgressPercEnd)
        {
            var          firstCrawl     = CrawlBusiness.GetTopBySiteID(siteEntity.SiteID, "");
            var          crawlID        = firstCrawl.CrawlID;
            string       lastItemID     = null;
            var          keyword        = keywordQuery.Keyword;
            var          keywordExclude = "";
            var          startPage      = keywordQuery.StartPage;
            var          endPage        = keywordQuery.EndPage;
            var          crawl          = CrawlBusiness.GetByCrawlID(crawlID);
            var          site           = SiteBusiness.GetBySiteID(crawl.SiteID);
            ListResponse result         = null;

            // Only used inside the loop, where endPage >= startPage, so this is always >= 1.
            int totalPages = endPage - startPage + 1;

            for (int currentPage = startPage; currentPage <= endPage; currentPage++)
            {
                CrawlRequest request = CrawlRequest.GetQueryUrl(crawlID, keyword, currentPage, keywordExclude, "", "");
                crawl.KeywordQuery = keyword;
                crawl.KeywordAny   = "";
                crawl.KeywordNot   = keywordExclude;
                crawl.KeywordSite  = "";
                var response     = Core.Crawler.SimpleCrawler.CrawlList_Single(request, crawl, site, true, null);
                var currentItems = response.ExtractItems(crawl);

                if (currentItems != null && currentItems.Any())
                {
                    // Evaluate Last() once; a page ending in the same item as the previous
                    // accepted page is treated as a repeat and skipped.
                    var lastItem = currentItems.Last();
                    if (lastItem.ItemID != lastItemID)
                    {
                        lastItemID = lastItem.ItemID;
                        if (result == null)
                        {
                            result = response;
                        }
                        else
                        {
                            result.CombineList(response);
                        }
                    }
                }

                // Progress is based on the number of pages processed so far, so it ends exactly at
                // ProgressPercEnd regardless of the value of startPage.
                int pagesDone = currentPage - startPage + 1;
                backgroundWorker1.ReportProgress(ProgressPercStart + (ProgressPercEnd - ProgressPercStart) * pagesDone / totalPages);
            }

            // Nothing was collected: leave resultDataList untouched.
            if (result == null)
            {
                return;
            }

            if (DetailChk.Checked)
            {
                // NOTE(review): `Site` here is a member declared outside this method, not the local `site` — confirm intended.
                this.Text = Site.Name + " 抓取Items";
                Core.Crawler.SimpleCrawler.CrawlItem_Multi(
                    result, crawl, site, msg =>
                {
                });
                this.Text = @"Palas搜索工具";
            }

            var         items = result.ExtractItems(crawl, null);
            AnalyzeData data  = new AnalyzeData()
            {
                Items   = items,
                CrawlID = crawl.CrawlID
            };
            resultDataList.Add(data);
        }
Example #6
0
        /// <summary>
        /// Crawls the Weibo user page at <paramref name="entryUrl"/>, then walks the user's tweet feed
        /// chunk by chunk and writes every tweet into an Excel workbook under
        /// <c>D:/output/{user name}/{user name}.xls</c>.
        /// </summary>
        /// <remarks>
        /// <para>
        /// When the workbook already exists the crawl resumes: the first empty row (from row 4 on) is
        /// located, the page number is derived from it, the first chunk of that page is fetched, and
        /// writing restarts at the row whose URL column matches the first tweet of that chunk.
        /// </para>
        /// <para>
        /// The workbook is saved after every tweet so an interrupted run can be resumed. When
        /// <c>NeedCrawlComment</c> is set, comments are crawled into a per-tweet workbook that is
        /// linked from the main sheet.
        /// </para>
        /// <para>
        /// Failures while fetching a feed chunk are retried up to five times; if no attempt succeeds
        /// the crawl stops and whatever was collected so far is returned.
        /// </para>
        /// </remarks>
        /// <param name="entryUrl">URL of the user's page; also stored in the returned object's <c>Url</c>.</param>
        /// <returns>
        /// The collected user data. If the entry page cannot be fetched or the user info cannot be
        /// parsed, a partially filled object is returned.
        /// </returns>
        private UserTweet CrawlTask(string entryUrl)
        {
            _currentUrl = entryUrl;
            var site    = SiteBusiness.GetBySiteID("Weibo");
            var request = BuildRequest(entryUrl, RegexContent);

            site.TimeoutSecs = 60;

            UserTweet result = new UserTweet();
            result.Url = entryUrl;

            CrawlResponse response = null;
            try
            {
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
            }
            catch (System.Exception ex)
            {
                Logger.Info("访问页面错误:Url = " + entryUrl + " (" + ex.Message + ")");
            }

            // Without a response there is nothing to parse; return the bare result instead of
            // dereferencing null.
            if (response == null)
            {
                return(result);
            }

            if (response.Status != Enums.CrawlResult.Succ)
            {
                Logger.Info("访问页面错误:Url = " + response.Url);
            }
            var content = response.Content;

            // First page: user profile information.
            try
            {
                FillUserInfo(content, result);
            }
            catch
            {
                // Best effort: an unparsable profile page ends the crawl with a partial result.
                return(result);
            }

            var endId = DeterminedMid(content, MidType.EndId);
            var maxId = DeterminedMid(content, MidType.MaxId);

            int       currentPage  = 1;
            string    rootPath     = @"D:/output/" + result.Name + "/";
            string    workbookPath = rootPath + result.Name + ".xls";
            Workbook  outputBook   = new Workbook();
            Worksheet sheet        = null;
            int       currentLine  = 4;     // first data row; rows 0-3 hold the headers and user summary
            int       pos          = 1;     // index of the scroll-load within a page (3 per page): 0 = first, 1 = second, 2 = third
            bool      isContinue   = false; // true when resuming into an existing workbook

            if (!Directory.Exists(rootPath))
            {
                Directory.CreateDirectory(rootPath);
            }
            if (File.Exists(workbookPath))
            {
                outputBook.Open(workbookPath);
                sheet = outputBook.Worksheets[result.Name];

                // Find the first empty row to learn how far the previous run got.
                int endrow = currentLine;
                while (!string.IsNullOrEmpty(sheet.Cells[endrow, 0].StringValue))
                {
                    endrow++;
                }
                currentLine = endrow;
                // NOTE(review): 45 is presumably the number of tweets per page — confirm.
                currentPage = (int)(currentLine / 45d) + 1;
                isContinue  = true;
            }
            else
            {
                sheet = outputBook.Worksheets.Add(result.Name);
            }

            // Header and user summary rows (rewritten on every run).
            sheet.Cells[0, 0].PutValue("姓名");
            sheet.Cells[0, 1].PutValue("网址");
            sheet.Cells[0, 2].PutValue("粉丝数");
            sheet.Cells[0, 3].PutValue("关注数");
            sheet.Cells[0, 4].PutValue("微博数");

            sheet.Cells[1, 0].PutValue(result.Name);
            sheet.Cells[1, 1].PutValue(result.Url);
            sheet.Cells[1, 2].PutValue(result.Follower);
            sheet.Cells[1, 3].PutValue(result.Follow);
            sheet.Cells[1, 4].PutValue(result.TweetNum);

            sheet.Cells[3, 0].PutValue("微博内容");
            sheet.Cells[3, 1].PutValue("发布时间");
            sheet.Cells[3, 2].PutValue("转发数");
            sheet.Cells[3, 3].PutValue("评论数");
            sheet.Cells[3, 4].PutValue("原帖地址");
            sheet.Cells[3, 5].PutValue("来源");
            sheet.Cells[3, 6].PutValue("具体评论");

            if (isContinue)
            {
                // Fetch the first chunk of the resume page and rewind to the row holding its
                // first tweet, so the rows of that page are overwritten rather than duplicated.
                pos = 0;
                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request  = BuildRequest(url);
                response = GeckoRequestProcessor.DoRequest(request, site, null, null);
                AggrSum();
                JsonResponse tmpResult =
                    JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim("</pre>".ToArray()));
                response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                pos++;
                var firstTweet = FillUserTweet(result, response.Content);
                // Null when the chunk holds no tweets; then there is no row to rewind to.
                var firstUrl = firstTweet.Select(t => t.Url).FirstOrDefault();
                result.Tweets.Clear();
                if (firstUrl != null)
                {
                    int endrow = currentLine - 1;
                    while (endrow > 3)
                    {
                        if (sheet.Cells[endrow, 4].StringValue == firstUrl)
                        {
                            currentLine = endrow;
                            break;
                        }
                        endrow--;
                    }
                }
            }

            // Crawl the feed chunk by chunk for as long as the content still matches RegexContent.
            while (Regex.IsMatch(response.Content.Trim(), RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase) && response.Status == Enums.CrawlResult.Succ)
            {
                content = response.Content.Trim();
                var currentTweet = FillUserTweet(result, content);
                foreach (Tweet tweet in currentTweet)
                {
                    string fileName = tweet.Mid + ".xls";
                    // Skip comments that were already saved by an earlier (failed) run.
                    if (NeedCrawlComment)
                    {
                        if (!File.Exists(rootPath + fileName))
                        {
                            FillTweetComment(tweet, site);
                            if (tweet.Comments.Count > 0)
                            {
                                SaveComment(rootPath, tweet, fileName);
                            }
                        }
                    }

                    sheet.Cells[currentLine, 0].PutValue(tweet.Content);
                    sheet.Cells[currentLine, 1].PutValue(tweet.PubDate.ToString("yyyy-MM-dd HH:mm:ss"));
                    sheet.Cells[currentLine, 2].PutValue(tweet.Forward);
                    sheet.Cells[currentLine, 3].PutValue(tweet.Comment);
                    sheet.Cells[currentLine, 4].PutValue(tweet.Url);
                    sheet.Cells[currentLine, 5].PutValue(tweet.Source);

                    // Link to the per-tweet comment workbook (relative to the main workbook).
                    if (File.Exists(rootPath + fileName))
                    {
                        sheet.Cells[currentLine, 6].PutValue("点击查看");
                        string linkPath = fileName;
                        sheet.Hyperlinks.Add(currentLine, 6, 1, 1, linkPath);
                    }

                    // Save after every tweet so an interrupted run can resume from the file.
                    outputBook.Save(workbookPath);
                    StatusLbl.Text = string.Format("正在读取名人:{0}的第{1}条微博", result.Name, currentLine - 3);
                    Application.DoEvents();
                    currentLine++;
                }

                var url = BuildTweetJsonUrl(result, endId, maxId, currentPage, pos);
                request = BuildRequest(url);

                // Up to five attempts. A thrown request must not leave the previous page's
                // response in place, otherwise the old content would be processed again.
                bool fetched = false;
                for (int attempt = 0; attempt < 5; attempt++)
                {
                    CrawlResponse pageResponse = null;
                    try
                    {
                        pageResponse = GeckoRequestProcessor.DoRequest(request, site, null, null);
                        AggrSum();
                    }
                    catch (System.Exception ex)
                    {
                        Logger.Info("访问页面错误:Url = " + url + " (" + ex.Message + ")");
                    }

                    if (pageResponse == null)
                    {
                        continue;
                    }

                    response = pageResponse;
                    if (response.Status != Enums.CrawlResult.Succ)
                    {
                        Logger.Info("访问页面错误:Url = " + response.Url);
                    }
                    else
                    {
                        fetched = true;
                        break;
                    }
                }

                // No usable response after all retries: stop and return what was collected.
                if (!fetched)
                {
                    break;
                }

                // The payload is JSON; try the feed shape first, then the comment shape.
                try
                {
                    JsonResponse tmpResult =
                        JsonConvert.DeserializeObject <JsonResponse>(response.Content.Trim("</pre>".ToArray()));
                    response.Content = HttpUtility.HtmlDecode(tmpResult.data);
                }
                catch
                {
                    try
                    {
                        CommentJsonResponse tmpResult =
                            JsonConvert.DeserializeObject <CommentJsonResponse>(response.Content.Trim("</pre>".ToArray()));
                        response.Content = HttpUtility.HtmlDecode(tmpResult.data.html);
                    }
                    catch (System.Exception ex)
                    {
                        // Content is left as received; the loop condition decides whether to go on.
                        Logger.Info("解析Json失败:Url = " + url + " (" + ex.Message + ")");
                    }
                }

                pos = (pos + 1) % 3;
                if (pos == 0)
                {
                    currentPage++;
                }

                // Keep the previous maxId when no tweet has been collected yet.
                if (result.Tweets.Any())
                {
                    maxId = result.Tweets.Last().Mid;
                }
            }

            return(result);
        }