/// <summary>
        /// Parses the downloaded page and appends hot-list entries to <c>hotList</c>
        /// until it holds <c>max</c> items. Entries whose link, title or heat value
        /// cannot be extracted are skipped.
        /// </summary>
        /// <param name="sendor">Event source (unused).</param>
        /// <param name="args">Completion data; its <c>PageSource</c> is the HTML that is scanned.</param>
        private void Parse(object sendor, OnCompletedEventArgs args)
        {
            string          strRef  = @"(<div class=""HotItem-content"">)[\s|\S]+?<\/div><\/div><\/span><\/div><\/div>";
            MatchCollection matches = new Regex(strRef).Matches(args.PageSource);

            foreach (Match match in matches)
            {
                if (hotList.Count >= max)
                {
                    break;
                }

                string item        = match.Value;
                Match  urlMatch    = Regex.Match(item, @"href="".+?""");
                Match  titleMatch  = Regex.Match(item, @"h2 class=""HotItem-title"".+?<");
                Match  degreeMatch = Regex.Match(item, @"<\/svg>.+?万");

                // A failed match has an empty Value, so the fixed-offset Substring calls
                // below would throw; skip the entry instead of aborting the whole handler.
                if (!urlMatch.Success || !titleMatch.Success || !degreeMatch.Success)
                {
                    continue;
                }

                // The offsets strip the literal prefix of each pattern (e.g. href=).
                String url   = urlMatch.Value.Substring(5).Trim('"');
                String title = titleMatch.Value.Substring(25).Trim('<');

                // The excerpt is optional: an entry without one keeps an empty string.
                String multiLine    = String.Empty;
                Match  excerptMatch = Regex.Match(item, @"HotItem-excerpt"">.+?<");
                if (excerptMatch.Success)
                {
                    multiLine = excerptMatch.Value.Substring(17).Trim('<');
                }
                else
                {
                    Console.WriteLine("内部问题");
                }

                // Heat is the text between the closing svg tag and the trailing unit character.
                int heat;
                if (!int.TryParse(degreeMatch.Value.Substring(6).Trim('万'), out heat))
                {
                    continue;
                }
                hotList.Add(new HotPoint_ZHIHU(url, multiLine, title, heat));
            }
        }
        /// <summary>
        /// Marks the download as finished, then scans the page for Zhihu question API
        /// links and queues the digits that end each link until <c>count</c> reaches <c>max</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Completion data; its <c>PageSource</c> is the text that is scanned.</param>
        private void Parse(object sender, OnCompletedEventArgs e)
        {
            Downloaded = true;

            Regex questionLink = new Regex(@"""url"":""https:\/\/api.zhihu.com\/questions\/[0-9]+?""");
            Regex trailingId   = new Regex(@"[0-9]+?""");

            foreach (Match link in questionLink.Matches(e.PageSource))
            {
                // Stop as soon as the quota of questions has been collected.
                if (count > max - 1)
                {
                    break;
                }
                try
                {
                    // Keep only the digits; the closing quote (and any backslash) is trimmed off.
                    String id = trailingId.Match(link.Value).Value.Trim('"', '\\');
                    answerUrls.Enqueue(id);
                    count++;
                }
                catch (Exception failure)
                {
                    Console.WriteLine(failure.Message);
                }
            }
        }
        /// <summary>
        /// Extracts the answer records from the downloaded text, builds one
        /// <c>Answer_ZHIHU</c> from them, appends it to <c>AnswerList</c> and then
        /// synchronously fetches the question page through <c>multiSpider</c>.
        /// The whole handler runs under <c>lock1</c>; any exception is logged to the console.
        /// </summary>
        /// <param name="sendor">Event source (unused).</param>
        /// <param name="args">Completion data; its <c>PageSource</c> is the text that is scanned.</param>
        private void Parse(Object sendor, OnCompletedEventArgs args)
        {
            lock (lock1)
            {
                // Incremented before parsing, so it advances even when parsing fails below.
                Count++;
                Answer_ZHIHU result = new Answer_ZHIHU();
                try
                {
                    MatchCollection records = Regex.Matches(args.PageSource, @"{""id""[\s|\S]+?""upvoted_followees"":");

                    // Title and URL are taken from the first record; indexing throws (and is
                    // caught below) when the page contained no record at all.
                    result.Title = Regex.Match(records[0].Value, @"title"":""[\s|\S]+?""").Value.Substring(7).Trim('"');
                    result.Url   = Regex.Match(records[0].Value, @"url"":""[\s|\S]+?""").Value.Substring(5).Trim('"');
                    // Turn the API path into the public question path.
                    result.Url   = Regex.Replace(result.Url, @"\/api\/v4\/questions", @"/question");

                    result.List = new List <AnswerDetail>();
                    foreach (Match record in records)
                    {
                        String text   = record.Value;
                        String author = Regex.Match(text, @"name"":""[\s|\S]+?""").Value.Substring(7).Trim('"');
                        String voteUp = Regex.Match(text, @"voteup_count"":[0-9]+?,").Value.Substring(14).Trim(',');
                        String body   = Regex.Match(text, @"content"":""[\s|\S]+?"",""").Value.Substring(10).Trim('"', ',');
                        // Drop everything between escaped angle brackets (\u003c ... \u003e).
                        body = Regex.Replace(body, @"\\u003c[\s|\S]+?\\u003e", "");
                        result.List.Add(new AnswerDetail(body, author, Convert.ToInt32(voteUp)));
                    }
                    AnswerList.Add(result);
                    // Blocks (while still holding the lock) until the follow-up request completes.
                    multiSpider.GetFunc(new Uri(result.Url)).Wait();
                }
                catch (Exception failure)
                {
                    Console.WriteLine(failure.Message);
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Inserts every item held in the crawler's <c>Data</c> through a new <c>DBHelp</c>,
        /// giving items without a <c>CommId</c> a shared id built from the current local time.
        /// </summary>
        /// <param name="sender">Expected to be the <c>StrongCrawler</c> that finished.</param>
        /// <param name="e">Completion data (unused).</param>
        private void HotelCrawler_OnCompleted(object sender, OnCompletedEventArgs e)
        {
            DBHelp store = new DBHelp();
            // NOTE(review): "as" yields null for any other sender type, and the loop below would then throw.
            StrongCrawler source = sender as StrongCrawler;
            // One id for the whole batch, at second resolution.
            string batchId = DateTime.Now.ToString("yyyyMMddHHmmss");

            foreach (ReptileInfo record in source.Data)
            {
                record.CommId = !string.IsNullOrEmpty(record.CommId) ? record.CommId : batchId;
                store.InsertData(record);
            }
        }
Exemple #5
0
        /// <summary>
        /// Parses a search result page and enqueues one <c>WSearchResult</c> per result card
        /// into <c>searchResults</c>. A card that lacks an expected node is logged and
        /// skipped; the remaining cards are still processed.
        /// </summary>
        /// <param name="sendor">Event source (unused).</param>
        /// <param name="args">Completion data; its <c>PageSource</c> is the HTML that is parsed.</param>
        private void Parse(object sendor, OnCompletedEventArgs args)
        {
            HtmlNodeCollection cards;
            try
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(args.PageSource);
                cards = doc.DocumentNode.SelectNodes("//div[@class='card']");
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
                return;
            }

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (cards == null)
            {
                return;
            }

            foreach (HtmlNode card in cards)
            {
                try
                {
                    HtmlNode cardfeed = card.SelectSingleNode(".//div[@class='card-feed']");

                    // Nickname
                    string name = cardfeed.SelectSingleNode(".//a[1][@class='name']").InnerText.Trim();

                    // Content summary; the single pattern already removes every '?',
                    // so no second pass is needed.
                    string content = cardfeed.SelectSingleNode(".//p[1]").InnerText.Trim();
                    content = Regex.Replace(content, @"(展开全文c|\?)", "");

                    // Source line, with all whitespace removed
                    string from = cardfeed.SelectSingleNode(".//p[@class='from']").InnerText.Trim();
                    from = Regex.Replace(from, @"\s", "");

                    HtmlNode cardAction = card.SelectSingleNode(".//div[@class='card-act']");

                    // Reposts
                    string post = cardAction.SelectSingleNode(".//li[2]/a").InnerText.Trim();

                    // Comments
                    string comment = cardAction.SelectSingleNode(".//li[3]/a").InnerText.Trim();

                    // Likes
                    string like = "赞 " + cardAction.SelectSingleNode(".//ul/li[4]/a").InnerText.Trim();

                    // Only count cards that were extracted completely.
                    count++;
                    searchResults.Enqueue(new WSearchResult(count, name, content, post, like, comment, from));
                }
                catch (Exception e)
                {
                    // Typically a missing node (null dereference); move on to the next card.
                    Console.WriteLine(e.Message);
                }
            }
        }
 /// <summary>
 /// Pulls the first rich-text fragment out of the downloaded page and stores it as the
 /// <c>MultiLine</c> of the answer at index <c>Count - 1</c>; on any failure the
 /// placeholder text is stored instead.
 /// </summary>
 /// <param name="sendor">Event source (unused).</param>
 /// <param name="args">Completion data; its <c>PageSource</c> is the HTML that is scanned.</param>
 private void MultiParse(Object sendor, OnCompletedEventArgs args)
 {
     try
     {
         Match fragment = Regex.Match(args.PageSource, @"RichText ztext"" itemProp=""text"">[\s|\S]+?<");
         // Skip the 32-character literal prefix of the pattern; throws when nothing matched.
         String raw = fragment.Value.Substring(32);
         AnswerList[Count - 1].MultiLine = Regex.Replace(raw, @"<[\s|\S]+?>", "").Trim('<');
     }
     catch (Exception failure)
     {
         Console.WriteLine(failure.Message);
         AnswerList[Count - 1].MultiLine = "无";
     }
 }
Exemple #7
0
        /// <summary>
        /// Completion handler whose body currently contains no executable statements;
        /// everything inside is disabled experimental code.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Completion data (unused).</param>
        private void Parse(object sender, OnCompletedEventArgs args)
        {
            //HotTop();
            // Earlier experiments, kept disabled

            //HtmlWeb webClient = new HtmlWeb();
            //HtmlWeb webClient2 = new HtmlWeb();

            //Encoding encoder = Encoding.GetEncoding("utf-8");
            //HtmlAgilityPack.HtmlDocument doc = webClient.Load("http://tieba.baidu.com/hottopic/browse/topicList?res_type=1&red_tag=q0593629036");
            //Encoding encoder2 = Encoding.GetEncoding("utf-8");
            //HtmlAgilityPack.HtmlDocument doc2 = webClient2.Load("https://tieba.baidu.com/f/good?kw=秦时明月");
            //HtmlNode htmlNode = doc.DocumentNode;
            //HtmlNodeCollection hrefList = doc.DocumentNode.SelectNodes(".//a[@href]");
            //HtmlNodeCollection titleList = doc.DocumentNode.SelectNodes(".//a");
            //HtmlNodeCollection emList = doc.DocumentNode.SelectNodes(".//p");
            //HtmlNodeCollection HotList = doc.DocumentNode.SelectNodes(".//span");
            //int i, j = 1, k = 3;
            //for (i = 12; i < 32; i++)
            //{

            //    Console.WriteLine("Hot post title: " + titleList[i].InnerText);
            //    Console.WriteLine("Hot post summary: " + emList[j].InnerText);
            //    Console.WriteLine("Heat: " + HotList[k].InnerText);
            //    Console.WriteLine("Link: " + hrefList[i].Attributes["href"].Value);
            //    k = k + 2;
            //    j++;
            //}
            //HtmlNode LookFor = doc2.DocumentNode;
            //HtmlNodeCollection ResultTitle = doc2.DocumentNode.SelectNodes(".//a[@title]");
            //HtmlNodeCollection ResultHerf = doc2.DocumentNode.SelectNodes(".//a[contains(@href,'fr=good')]");

            ////for (int i = 0; i < 30; i++)
            ////{
            ////    int j = i + 15;
            ////    Console.WriteLine("Post: " + ResultTitle[j].Attributes["title"].Value);
            ////    Console.WriteLine("Post link: " + "https://tieba.baidu.com" + ResultHerf[i].Attributes["href"].Value);
            ////}
            //for (int i = 0; i < 30; i++)
            //{
            //    int j = i + 15;
            //    Console.WriteLine("Post: " + ResultHerf[i].Attributes["title"].Value);
            //    Console.WriteLine("Post link: " + "https://tieba.baidu.com" + ResultHerf[i].Attributes["href"].Value);
            //}

            //Console.Write(args.PageSource);
        }
        /// <summary>
        /// Reads every station item inside the city box of the loaded page, prints the
        /// collected text/code pairs as JSON, and then waits for a key press.
        /// </summary>
        /// <param name="e">Completion data; its <c>WebDriver</c> is queried for the elements.</param>
        private static void HotelCrawler(OnCompletedEventArgs e)
        {
            var cityBox = e.WebDriver.FindElement(By.XPath("//*[@id='all_citybox']"));
            var entries = cityBox.FindElements(By.XPath("li[@class='station-item']"));
            //var totalPage = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[last()]")).Text);
            var stations = new List <TrainStation>();

            foreach (var entry in entries)
            {
                // Both values come from data-* attributes on the list item.
                stations.Add(new TrainStation
                {
                    text = entry.GetAttribute("data-text"),
                    code = entry.GetAttribute("data-code")
                });
            }

            Console.WriteLine(JsonConvert.SerializeObject(stations));
            // Blocks until a key is pressed.
            Console.ReadKey();
        }
Exemple #9
0
        /// <summary>
        /// Advanced crawler: opens the address in a browser driver, optionally runs a
        /// script and a page operation, waits for the operation's condition, and returns
        /// the resulting page data.
        /// </summary>
        /// <param name="uri">URL to fetch.</param>
        /// <param name="name">Label passed through to the returned <see cref="OnCompletedEventArgs"/>.</param>
        /// <param name="operation">Page operation (action, wait condition, timeout); may be null.</param>
        /// <param name="pageSource">Receives the page source on success; left untouched on failure.</param>
        /// <param name="script">JavaScript to execute, or null for none.</param>
        /// <returns>
        /// The completion data, or null when an error occurred (the error is reported
        /// through <c>OnError</c> instead of being thrown).
        /// </returns>
        public OnCompletedEventArgs Start(Uri uri, string name, Operation operation, ref string pageSource, Script script = null)
        {
            OnStart?.Invoke(this, new OnStartEventArgs(uri));

            OnCompletedEventArgs result = null;
            var drives = new ChromeDriver(_chromeDriverService, _chromeOptions);
            var driver = new PhantomJSDriver(_service, _options); // PhantomJS WebDriver instance

            try
            {
                var watch = DateTime.Now;
                drives.Navigate().GoToUrl(uri.ToString()); // request the URL

                // NOTE(review): the script, action and wait below run against the PhantomJS
                // session, while navigation and the page source use the Chrome session —
                // confirm this split is intended.
                if (script != null)
                {
                    driver.ExecuteScript(script.Code, script.Args); // run the JavaScript
                }
                if (operation != null && operation.Action != null)
                {
                    operation.Action.Invoke(driver);
                }

                // Guard against a null operation here as well; the check above already
                // treats it as a legal input.
                if (operation != null && operation.Condition != null)
                {
                    var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.Timeout)); // timeout in ms
                    driverWait.Until(operation.Condition);
                }

                var threadId = System.Threading.Thread.CurrentThread.ManagedThreadId; // current worker thread id
                // TotalMilliseconds is the full elapsed time; Milliseconds is only the
                // 0-999 component and under-reports anything longer than a second.
                var milliseconds = (int)DateTime.Now.Subtract(watch).TotalMilliseconds;
                pageSource = drives.PageSource; // DOM of the loaded page
                result     = new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource, drives, name);

                //this.OnCompleted?.Invoke(this, new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource, driver,name));
            }
            catch (Exception ex)
            {
                this.OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
                // No result will carry the Chrome session to the caller, so release it here.
                drives.Quit();
                //throw ex;
            }
            finally
            {
                // The PhantomJS session is never handed to the caller; release it on every path.
                driver.Quit();
            }
            return(result);
        }
        /// <summary>
        /// Reads the hotel summary, paging information and the comments of the current
        /// page from the loaded hotel detail page and prints them to the console.
        /// Each comment body is shortened to at most 50 characters.
        /// </summary>
        /// <param name="e">Completion data; its <c>WebDriver</c>, <c>Uri</c> and <c>Milliseconds</c> are read.</param>
        private static void HotelCrawler(OnCompletedEventArgs e)
        {
            //Console.WriteLine(e.PageSource);
            //File.WriteAllText(Environment.CurrentDirectory + "\\cc.html", e.PageSource, Encoding.UTF8);

            // Longest comment excerpt that is printed.
            const int previewLength = 50;

            var hotelName   = e.WebDriver.FindElement(By.XPath("//*[@id='J_htl_info']/div[@class='name']/h2[@class='cn_n']")).Text;
            var address     = e.WebDriver.FindElement(By.XPath("//*[@id='J_htl_info']/div[@class='adress']")).Text;
            var price       = e.WebDriver.FindElement(By.XPath("//*[@id='div_minprice']/p[1]")).Text;
            var score       = e.WebDriver.FindElement(By.XPath("//*[@id='divCtripComment']/div[1]/div[1]/span[3]/span")).Text;
            var reviewCount = e.WebDriver.FindElement(By.XPath("//*[@id='commentTab']/a")).Text;

            var comments    = e.WebDriver.FindElement(By.XPath("//*[@id='hotel_info_comment']/div[@id='commentList']/div[1]/div[1]/div[1]"));
            var currentPage = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[@class='current']")).Text);
            var totalPage   = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[last()]")).Text);
            var messages    = comments.FindElements(By.XPath("div[@class='comment_detail_list']/div"));
            // NOTE(review): this lookup needs a link after the current page; presumably it
            // fails on the last page — confirm against the live markup.
            var nextPage    = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[@class='current']/following-sibling::a[1]")).Text);

            Console.WriteLine();
            Console.WriteLine("名称:" + hotelName);
            Console.WriteLine("地址:" + address);
            Console.WriteLine("价格:" + price);
            Console.WriteLine("评分:" + score);
            Console.WriteLine("数量:" + reviewCount);
            Console.WriteLine("页码:" + "当前页(" + currentPage + ")" + "下一页(" + nextPage + ")" + "总页数(" + totalPage + ")" + "每页(" + messages.Count + ")");
            Console.WriteLine();
            Console.WriteLine("===============================================");
            Console.WriteLine();
            Console.WriteLine("点评内容:");

            foreach (var message in messages)
            {
                Console.WriteLine("帐号:" + message.FindElement(By.XPath("div[contains(@class,'user_info')]/p[@class='name']")).Text);
                Console.WriteLine("房型:" + message.FindElement(By.XPath("div[@class='comment_main']/p/a")).Text);

                // Substring(0, 50) throws on shorter text, so only cut when there is enough of it.
                var body    = message.FindElement(By.XPath("div[@class='comment_main']/div[@class='comment_txt']/div[1]")).Text;
                var preview = body.Length > previewLength ? body.Substring(0, previewLength) : body;
                Console.WriteLine("内容:" + preview + "....");
                Console.WriteLine();
                Console.WriteLine();
            }
            Console.WriteLine();
            Console.WriteLine("===============================================");
            Console.WriteLine("地址:" + e.Uri.ToString());
            Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        }
Exemple #11
0
        /// <summary>
        /// Parses the downloaded hot-list page: every table row with an empty class
        /// attribute becomes a <c>WHotPoint</c> in <c>hotPoints</c>, except rows flagged
        /// with the recommendation icon and rows without a rank.
        /// </summary>
        /// <param name="sendor">Event source (unused).</param>
        /// <param name="args">Completion data; its <c>PageSource</c> is the HTML that is scanned.</param>
        private void Parse(object sendor, OnCompletedEventArgs args)
        {
            // Inner HTML of each matching row; (?s) lets '.' span line breaks.
            Regex rowPattern = new Regex(@"(?s)(?<=<tr class="""">).+?(?=</tr>)", RegexOptions.None);

            foreach (Match row in rowPattern.Matches(args.PageSource))
            {
                if (!row.Success)
                {
                    continue;
                }

                try
                {
                    string       html     = row.Value;
                    HtmlDocument fragment = new HtmlDocument();
                    fragment.LoadHtml(html);

                    String url    = "https://s.weibo.com" + Regex.Match(html, @"(?<=href="").+?(?="")").Value;
                    String title  = Regex.Match(html, @"(?<=>).+?(?=</a>)").Value;
                    title = Regex.Replace(title, @"<img\b[^>]*>", "");
                    String rank   = Regex.Match(html, @"(?<=<td class=""td-01 ranktop"">).+?(?=</td>)").Value;
                    String degree = Regex.Match(html, @"(?<=<span>).+?(?=</span>)").Value;

                    // Rows whose icon carries this marker text are left out.
                    HtmlNode icon = fragment.DocumentNode.SelectSingleNode(".//i");
                    if (icon != null && icon.InnerText == "荐")
                    {
                        continue;
                    }

                    // Rows without a rank cell are left out as well.
                    if (rank == "")
                    {
                        continue;
                    }

                    hotPoints.Add(new WHotPoint(Convert.ToInt32(rank), url, title, degree));
                }
                catch (Exception failure)
                {
                    Console.WriteLine(failure.Message);
                }
            }
        }
Exemple #12
0
        /// <summary>
        /// Completion handler that passes the page source and the event's <c>Regex</c> to
        /// <c>DataCheck.GetRegStrArr</c>; the returned array is currently not used.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Completion data supplying <c>PageSource</c> and <c>Regex</c>.</param>
        private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
        {
            // e.PageSource = DataCheck.RepTrim(e.PageSource);

            string   pageSource = e.PageSource;
            string[] extracted  = DataCheck.GetRegStrArr(pageSource, e.Regex);
        }
Exemple #13
0
 /// <summary>
 /// Completion handler that logs how many strings <c>DataCheck.GetRegStrArr</c>
 /// returned for the page, together with the elapsed time reported by the event.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Completion data supplying <c>PageSource</c>, <c>Regex</c> and <c>Milliseconds</c>.</param>
 private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
 {
     string[] extracted = DataCheck.GetRegStrArr(e.PageSource, e.Regex);
     string   message   = "抓取新闻数量:" + extracted.Length + "\t用时:" + e.Milliseconds;

     SetLog(message, Color.Black);
 }
Exemple #14
0
 /// <summary>
 /// Raises the <see cref="OnCompletedEvent"/> event when it has subscribers.
 /// </summary>
 /// <param name="args">Event data handed to every subscriber.</param>
 public void OnCompleted(OnCompletedEventArgs args)
 {
     // Read the event once so the null check and the call see the same delegate.
     var handler = OnCompletedEvent;
     if (handler != null)
     {
         handler.Invoke(this, args);
     }
 }
Exemple #15
0
 /// <summary>
 /// Completion handler with an empty body; it performs no work.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Completion data (unused).</param>
 private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
 {
 }
Exemple #16
0
        /// <summary>
        /// Handles a finished page download. When the page contains the article body node
        /// it is stored as a news item (unless its URL is already stored); otherwise every
        /// qualifying link on the page is crawled one level deeper, up to depth 4.
        /// </summary>
        /// <param name="e">Completion data; <c>Uri</c>, <c>PageSource</c>, <c>Milliseconds</c> and <c>TreadID</c> are read.</param>
        /// <param name="layer">Depth of this page; links are not followed once it reaches 4.</param>
        /// <param name="Title">Title stored with the news item; recursive calls pass the text of the link that led here.</param>
        private void Crawler_OnCompleted(OnCompletedEventArgs e, int layer, string Title = "")
        {
            SetLog($"读取网站:{e.Uri.ToString()}\n\t\t\t\t深度:{layer}\t用时:{e.Milliseconds} 毫秒\t线程ID:{e.TreadID}", Color.Gray);
            UpAll(1);
            urlList.Add(e.Uri.ToString());
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(e.PageSource);
            ConReg = "//div[@id='endText']";
            // Look for the article body node.
            HtmlNode conNode = doc.DocumentNode.SelectSingleNode(ConReg);

            if (conNode != null)
            {
                string url = e.Uri.ToString();
                // Skip pages whose URL has already been stored.
                if (_GSQ_NewsService.Exists(c => c.url == url))
                {
                    return;
                }

                if (!string.IsNullOrEmpty(conNode.InnerHtml))
                {
                    GSQ_News _News = new GSQ_News();
                    _News.title         = Title;
                    _News.url           = e.Uri.ToString();
                    _News.sourcewebsite = conNode.InnerHtml;
                    _News.num           = 0;
                    _News.CreateDate    = DateTime.Now;
                    _GSQ_NewsService.AddEntity(_News);
                    UpCon(1);
                    UpNum(1);
                    SetLog($"抓取新闻《{Title}》,用时:{e.Milliseconds} 毫秒", Color.Gray);
                }
                // Article pages are leaves: their links are never followed.
                return;
            }
            if (layer >= 4)
            {
                return;
            }

            // All anchor tags of the page.
            var AList = doc.DocumentNode.Descendants("a");

            // Same for every link of this page, so compute it once. Only links that
            // contain this prefix are followed.
            string baseUrl = Utils.DelLastChar(e.Uri.ToString(), "/", 0);

            foreach (var item in AList)
            {
                // Filter first, so no crawler is created (and no handler subscribed)
                // for links that are skipped anyway.
                string href = item.Attributes["href"]?.Value;
                if (string.IsNullOrEmpty(href) || urlList.Contains(href) ||
                    !DataCheck.CheckReg(href, DataCheck.Reg_Url) ||
                    !href.Contains(baseUrl))
                {
                    continue;
                }

                // One crawler per followed link.
                Crawler   crawler   = new Crawler();
                Operation operation = new Operation()
                {
                    Action    = (x) => { },
                    Condition = (x) => { return(true); },
                    timeout   = 5000
                };
                crawler.OnError     += Crawler_OnError;
                crawler.OnCompleted += (s, ex) => {
                    Crawler_OnCompleted(ex, layer + 1, item.InnerText);
                };
                // Blocks until the child page (and everything below it) has been processed.
                crawler.Start(href, operation, null).Wait();
            }
        }