/// <summary>
/// Parses the downloaded hot-list page and appends one <c>HotPoint_ZHIHU</c> per hot item
/// to <c>hotList</c>, stopping once <c>hotList</c> holds <c>max</c> entries.
/// </summary>
/// <param name="sendor">Event source; not used.</param>
/// <param name="args">Completion arguments whose <c>PageSource</c> holds the page HTML.</param>
private void Parse(object sendor, OnCompletedEventArgs args)
{
    // The patterns never change, so build them once per call instead of once per item.
    Regex itemPattern = new Regex(@"(<div class=""HotItem-content"">)[\s|\S]+?<\/div><\/div><\/span><\/div><\/div>");
    Regex urlPattern = new Regex(@"href="".+?""");
    Regex titlePattern = new Regex(@"h2 class=""HotItem-title"".+?<");
    Regex excerptPattern = new Regex(@"HotItem-excerpt"">.+?<");
    Regex degreePattern = new Regex(@"<\/svg>.+?万");

    foreach (Match match in itemPattern.Matches(args.PageSource))
    {
        if (hotList.Count >= max)
        {
            break;
        }

        Match urlMatch = urlPattern.Match(match.Value);
        Match titleMatch = titlePattern.Match(match.Value);
        Match degreeMatch = degreePattern.Match(match.Value);

        // A failed match has an empty Value; calling Substring on it throws and would
        // abort parsing of the whole page. Skip only the offending item instead.
        if (!urlMatch.Success || !titleMatch.Success || !degreeMatch.Success)
        {
            Console.WriteLine("Skipping hot item: url, title or degree not found.");
            continue;
        }

        // Substring(5) leaves the quoted attribute value; Trim removes the quotes.
        String url = urlMatch.Value.Substring(5).Trim('"');
        // Substring(25) drops the matched tag text plus the character that follows it.
        String title = titleMatch.Value.Substring(25).Trim('<');
        // The excerpt is optional: leave it empty when the item has none.
        Match excerptMatch = excerptPattern.Match(match.Value);
        String multiLine = excerptMatch.Success
            ? excerptMatch.Value.Substring(17).Trim('<')
            : String.Empty;

        // Strip the leading "</svg>" and the trailing unit character, then parse.
        String degreeText = degreeMatch.Value.Substring(6).Trim('万');
        int degree;
        if (!int.TryParse(degreeText, out degree))
        {
            Console.WriteLine("Skipping hot item: degree is not an integer: " + degreeText);
            continue;
        }

        hotList.Add(new HotPoint_ZHIHU(url, multiLine, title, degree));
    }
}
/// <summary>
/// Marks the download as finished and queues the numeric question ids found in the
/// page source into <c>answerUrls</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Completion arguments whose <c>PageSource</c> is scanned.</param>
private void Parse(object sender, OnCompletedEventArgs e)
{
    Downloaded = true;

    MatchCollection questionLinks = Regex.Matches(
        e.PageSource,
        @"""url"":""https:\/\/api.zhihu.com\/questions\/[0-9]+?""");

    foreach (Match questionLink in questionLinks)
    {
        // Stop once the configured number of questions has been queued
        // (original note: crawl 5 questions).
        if (count > max - 1)
        {
            break;
        }

        try
        {
            // Keep only the digits: the match is "<id>" followed by a closing quote.
            Match idWithQuote = Regex.Match(questionLink.Value, @"[0-9]+?""");
            String questionId = idWithQuote.Value.Trim('"', '\\');
            answerUrls.Enqueue(questionId);
            count++;
        }
        catch (Exception exception)
        {
            Console.WriteLine(exception.Message);
        }
    }
}
/// <summary>
/// Parses the answer-list response into an <c>Answer_ZHIHU</c>, appends it to
/// <c>AnswerList</c>, then fetches the question page through <c>multiSpider</c> and
/// blocks until that fetch completes. The whole operation runs under <c>lock1</c>.
/// </summary>
/// <param name="sendor">Event source; not used.</param>
/// <param name="args">Completion arguments whose <c>PageSource</c> holds the response text.</param>
private void Parse(Object sendor, OnCompletedEventArgs args)
{
    lock (lock1)
    {
        Count++;
        Answer_ZHIHU answer = new Answer_ZHIHU();

        try
        {
            // Each match is one answer object, from its "id" key up to "upvoted_followees".
            MatchCollection answerBlocks = Regex.Matches(
                args.PageSource,
                @"{""id""[\s|\S]+?""upvoted_followees"":");

            // Title and URL are taken from the first answer block only.
            string firstBlock = answerBlocks[0].Value;
            answer.Title = Regex.Match(firstBlock, @"title"":""[\s|\S]+?""").Value.Substring(7).Trim('"');
            answer.Url = Regex.Match(firstBlock, @"url"":""[\s|\S]+?""").Value.Substring(5).Trim('"');
            // Rewrite the API path into the question-page path.
            answer.Url = Regex.Replace(answer.Url, @"\/api\/v4\/questions", @"/question");

            answer.List = new List<AnswerDetail>();
            foreach (Match answerBlock in answerBlocks)
            {
                string block = answerBlock.Value;

                string author = Regex.Match(block, @"name"":""[\s|\S]+?""").Value.Substring(7).Trim('"');
                string voteUp = Regex.Match(block, @"voteup_count"":[0-9]+?,").Value.Substring(14).Trim(',');
                string content = Regex.Match(block, @"content"":""[\s|\S]+?"",""").Value.Substring(10).Trim('"', ',');

                // Remove everything between the escaped angle brackets (\u003c ... \u003e).
                content = Regex.Replace(content, @"\\u003c[\s|\S]+?\\u003e", "");

                answer.List.Add(new AnswerDetail(content, author, Convert.ToInt32(voteUp)));
            }

            AnswerList.Add(answer);
            multiSpider.GetFunc(new Uri(answer.Url)).Wait();
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
    }
}
/// <summary>
/// Stores every item in the finished crawler's <c>Data</c> through <c>DBHelp.InsertData</c>.
/// Items without a <c>CommId</c> receive one timestamp-based id shared by the batch.
/// </summary>
/// <param name="sender">Expected to be the <c>StrongCrawler</c> that raised the event.</param>
/// <param name="e">Completion arguments; not used.</param>
private void HotelCrawler_OnCompleted(object sender, OnCompletedEventArgs e)
{
    StrongCrawler crawler = sender as StrongCrawler;

    // "as" yields null for any other sender type; return instead of failing later
    // with a NullReferenceException. The same applies when there is no data to store.
    if (crawler == null || crawler.Data == null)
    {
        return;
    }

    DBHelp help = new DBHelp();
    string commid = DateTime.Now.ToString("yyyyMMddHHmmss");

    foreach (ReptileInfo item in crawler.Data)
    {
        if (string.IsNullOrEmpty(item.CommId))
        {
            item.CommId = commid;
        }

        help.InsertData(item);
    }
}
/// <summary>
/// Parses the search-result cards in the page source and queues one
/// <c>WSearchResult</c> per complete card into <c>searchResults</c>.
/// Cards that lack any of the expected nodes are skipped.
/// </summary>
/// <param name="sendor">Event source; not used.</param>
/// <param name="args">Completion arguments whose <c>PageSource</c> holds the page HTML.</param>
private void Parse(object sendor, OnCompletedEventArgs args)
{
    try
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(args.PageSource);

        HtmlNodeCollection cards = doc.DocumentNode.SelectNodes("//div[@class='card']");
        // SelectNodes returns null (not an empty collection) when nothing matches.
        if (cards == null)
        {
            return;
        }

        foreach (HtmlNode card in cards)
        {
            HtmlNode cardFeed = card.SelectSingleNode(".//div[@class='card-feed']");
            HtmlNode cardAction = card.SelectSingleNode(".//div[@class='card-act']");
            if (cardFeed == null || cardAction == null)
            {
                // Not a regular post card; one odd card must not abort the others.
                continue;
            }

            HtmlNode nameNode = cardFeed.SelectSingleNode(".//a[1][@class='name']");
            HtmlNode contentNode = cardFeed.SelectSingleNode(".//p[1]");
            HtmlNode fromNode = cardFeed.SelectSingleNode(".//p[@class='from']");
            HtmlNode repostNode = cardAction.SelectSingleNode(".//li[2]/a");
            HtmlNode commentNode = cardAction.SelectSingleNode(".//li[3]/a");
            HtmlNode likeNode = cardAction.SelectSingleNode(".//ul/li[4]/a");
            if (nameNode == null || contentNode == null || fromNode == null
                || repostNode == null || commentNode == null || likeNode == null)
            {
                continue;
            }

            // Nickname.
            string name = nameNode.InnerText.Trim();

            // Content summary: a single pass removes the expand marker and every '?'.
            string content = contentNode.InnerText.Trim();
            content = Regex.Replace(content, @"(展开全文c|\?)", "");

            // Source line, with all whitespace removed.
            string source = Regex.Replace(fromNode.InnerText.Trim(), @"\s", "");

            // Repost, comment and like captions.
            string post = repostNode.InnerText.Trim();
            string comment = commentNode.InnerText.Trim();
            string like = "赞 " + likeNode.InnerText.Trim();

            count++;
            searchResults.Enqueue(new WSearchResult(count, name, content, post, like, comment, source));
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e.Message);
    }
}
/// <summary>
/// Extracts the rich-text body from the page source and stores it, with tags removed,
/// in <c>MultiLine</c> of the entry at <c>AnswerList[Count - 1]</c>.
/// When extraction fails the field is set to "无".
/// </summary>
/// <param name="sendor">Event source; not used.</param>
/// <param name="args">Completion arguments whose <c>PageSource</c> holds the page HTML.</param>
private void MultiParse(Object sendor, OnCompletedEventArgs args)
{
    // Read the index once and validate it up front. The fallback in the catch block
    // indexes the list too, so an out-of-range index used to escape as an unhandled
    // exception from inside the handler.
    int index = Count - 1;
    if (index < 0 || index >= AnswerList.Count)
    {
        Console.WriteLine("MultiParse: no answer entry at index " + index + ".");
        return;
    }

    try
    {
        String strRef = @"RichText ztext"" itemProp=""text"">[\s|\S]+?<";
        // Substring(32) drops the 32-character marker that precedes the text.
        // A failed match has an empty Value, so Substring throws and the catch
        // block below supplies the fallback.
        String multiLine = new Regex(strRef).Match(args.PageSource).Value.Substring(32);
        AnswerList[index].MultiLine = Regex.Replace(multiLine, @"<[\s|\S]+?>", "").Trim('<');
    }
    catch (Exception e)
    {
        Console.WriteLine(e.Message);
        AnswerList[index].MultiLine = "无";
    }
}
/// <summary>
/// Completion handler that currently performs no work.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="args">Completion arguments; not used.</param>
private void Parse(object sender, OnCompletedEventArgs args)
{
    // Intentionally empty.
    //
    // Earlier experiments that used to sit here as disabled code loaded
    //   http://tieba.baidu.com/hottopic/browse/topicList?res_type=1&red_tag=q0593629036
    //   https://tieba.baidu.com/f/good?kw=秦时明月
    // with HtmlAgilityPack's HtmlWeb and printed, for the hot-topic list, each
    // title, summary, heat value and link, and for the featured-post page, each
    // post title and link (anchors selected with .//a[contains(@href,'fr=good')]).
    // A final disabled statement dumped args.PageSource to the console.
}
/// <summary>
/// Reads every station item under the <c>all_citybox</c> element, prints the
/// collected text/code pairs as JSON and then waits for a key press.
/// </summary>
/// <param name="e">Completion arguments whose <c>WebDriver</c> is queried.</param>
private static void HotelCrawler(OnCompletedEventArgs e)
{
    var cityBox = e.WebDriver.FindElement(By.XPath("//*[@id='all_citybox']"));
    var stationItems = cityBox.FindElements(By.XPath("li[@class='station-item']"));

    List<TrainStation> stations = new List<TrainStation>();
    foreach (var stationItem in stationItems)
    {
        // Both values come from data-* attributes on the list item.
        stations.Add(new TrainStation
        {
            text = stationItem.GetAttribute("data-text"),
            code = stationItem.GetAttribute("data-code")
        });
    }

    string json = JsonConvert.SerializeObject(stations);
    Console.WriteLine(json);
    Console.ReadKey();
}
/// <summary>
/// Advanced crawler: navigates to <paramref name="uri"/>, optionally runs a script and a
/// page operation, waits for the operation's condition and returns the page source.
/// </summary>
/// <param name="uri">URL to fetch.</param>
/// <param name="name">Name passed through to the returned completion arguments.</param>
/// <param name="operation">Page operation to perform; may be null.</param>
/// <param name="pageSource">Receives the page DOM source on success.</param>
/// <param name="script">JavaScript to execute after navigation; may be null.</param>
/// <returns>
/// The completion arguments on success, or null when an exception occurred
/// (in which case <c>OnError</c> is raised).
/// </returns>
public OnCompletedEventArgs Start(Uri uri, string name, Operation operation, ref string pageSource, Script script = null)
{
    this.OnStart?.Invoke(this, new OnStartEventArgs(uri));

    OnCompletedEventArgs result = null;
    var drives = new ChromeDriver(_chromeDriverService, _chromeOptions);
    var driver = new PhantomJSDriver(_service, _options); // PhantomJS WebDriver instance
    try
    {
        var watch = DateTime.Now;
        drives.Navigate().GoToUrl(uri.ToString()); // request the URL

        // NOTE(review): navigation and the returned page source use the Chrome driver,
        // while the script, action and wait below run against the PhantomJS driver,
        // which never navigates. Confirm which driver is intended.
        if (script != null)
        {
            driver.ExecuteScript(script.Code, script.Args); // run the JavaScript
        }

        if (operation != null && operation.Action != null)
        {
            operation.Action.Invoke(driver);
        }

        // Only touch operation.Timeout when there is an operation with a condition;
        // previously a null operation caused a NullReferenceException here.
        if (operation != null && operation.Condition != null)
        {
            var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.Timeout));
            driverWait.Until(operation.Condition);
        }

        var threadId = System.Threading.Thread.CurrentThread.ManagedThreadId; // current thread id

        // TotalMilliseconds is the whole elapsed time; Milliseconds is only the
        // 0-999 component and under-reports any request longer than one second.
        var milliseconds = (int)DateTime.Now.Subtract(watch).TotalMilliseconds;

        pageSource = drives.PageSource; // page DOM
        result = new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource, drives, name);
    }
    catch (Exception ex)
    {
        this.OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
    }
    finally
    {
        // The PhantomJS driver is never handed to the caller, so release it here.
        driver.Quit();

        // On failure nobody receives the Chrome driver either; release it as well.
        if (result == null)
        {
            drives.Quit();
        }
    }

    return result;
}
/// <summary>
/// Reads the hotel summary, the review paging information and the reviews on the
/// current page from the loaded page and prints them to the console.
/// </summary>
/// <param name="e">Completion arguments; <c>WebDriver</c>, <c>Uri</c> and <c>Milliseconds</c> are read.</param>
private static void HotelCrawler(OnCompletedEventArgs e)
{
    var hotelName = e.WebDriver.FindElement(By.XPath("//*[@id='J_htl_info']/div[@class='name']/h2[@class='cn_n']")).Text;
    var address = e.WebDriver.FindElement(By.XPath("//*[@id='J_htl_info']/div[@class='adress']")).Text;
    var price = e.WebDriver.FindElement(By.XPath("//*[@id='div_minprice']/p[1]")).Text;
    var score = e.WebDriver.FindElement(By.XPath("//*[@id='divCtripComment']/div[1]/div[1]/span[3]/span")).Text;
    var reviewCount = e.WebDriver.FindElement(By.XPath("//*[@id='commentTab']/a")).Text;

    // Everything below is located relative to the review container.
    var comments = e.WebDriver.FindElement(By.XPath("//*[@id='hotel_info_comment']/div[@id='commentList']/div[1]/div[1]/div[1]"));
    var currentPage = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[@class='current']")).Text);
    var totalPage = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[last()]")).Text);
    var messages = comments.FindElements(By.XPath("div[@class='comment_detail_list']/div"));
    var nextPage = Convert.ToInt32(comments.FindElement(By.XPath("div[@class='c_page_box']/div[@class='c_page']/div[contains(@class,'c_page_list')]/a[@class='current']/following-sibling::a[1]")).Text);

    Console.WriteLine();
    Console.WriteLine("名称:" + hotelName);
    Console.WriteLine("地址:" + address);
    Console.WriteLine("价格:" + price);
    Console.WriteLine("评分:" + score);
    Console.WriteLine("数量:" + reviewCount);
    Console.WriteLine("页码:" + "当前页(" + currentPage + ")" + "下一页(" + nextPage + ")" + "总页数(" + totalPage + ")" + "每页(" + messages.Count + ")");
    Console.WriteLine();
    Console.WriteLine("===============================================");
    Console.WriteLine();
    Console.WriteLine("点评内容:");

    foreach (var message in messages)
    {
        Console.WriteLine("帐号:" + message.FindElement(By.XPath("div[contains(@class,'user_info')]/p[@class='name']")).Text);
        Console.WriteLine("房型:" + message.FindElement(By.XPath("div[@class='comment_main']/p/a")).Text);

        // Substring(0, 50) throws when a review is shorter than 50 characters, so
        // truncate (and add the ellipsis) only when the text is actually longer.
        var reviewText = message.FindElement(By.XPath("div[@class='comment_main']/div[@class='comment_txt']/div[1]")).Text;
        var preview = reviewText.Length > 50 ? reviewText.Substring(0, 50) + "...." : reviewText;
        Console.WriteLine("内容:" + preview);

        Console.WriteLine();
        Console.WriteLine();
    }

    Console.WriteLine();
    Console.WriteLine("===============================================");
    Console.WriteLine("地址:" + e.Uri.ToString());
    Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
}
/// <summary>
/// Parses the table rows of the downloaded page and adds one <c>WHotPoint</c>
/// per ranked row to <c>hotPoints</c>.
/// </summary>
/// <param name="sendor">Event source; not used.</param>
/// <param name="args">Completion arguments whose <c>PageSource</c> holds the page HTML.</param>
private void Parse(object sendor, OnCompletedEventArgs args)
{
    // Each match is the inner HTML of one <tr class=""> row.
    MatchCollection rows = Regex.Matches(args.PageSource, @"(?s)(?<=<tr class="""">).+?(?=</tr>)", RegexOptions.None);

    foreach (Match row in rows)
    {
        if (!row.Success)
        {
            continue;
        }

        try
        {
            string rowHtml = row.Value;

            HtmlDocument rowDoc = new HtmlDocument();
            rowDoc.LoadHtml(rowHtml);

            String link = "https://s.weibo.com" + Regex.Match(rowHtml, @"(?<=href="").+?(?="")").Value;

            // Link text with any embedded <img> tags removed.
            String caption = Regex.Match(rowHtml, @"(?<=>).+?(?=</a>)").Value;
            caption = Regex.Replace(caption, @"<img\b[^>]*>", "");

            String rankText = Regex.Match(rowHtml, @"(?<=<td class=""td-01 ranktop"">).+?(?=</td>)").Value;
            String heat = Regex.Match(rowHtml, @"(?<=<span>).+?(?=</span>)").Value;

            // Rows whose icon text is "荐" are skipped, as are rows without a rank.
            HtmlNode iconNode = rowDoc.DocumentNode.SelectSingleNode(".//i");
            bool hasSkipIcon = iconNode != null && iconNode.InnerText == "荐";
            if (hasSkipIcon || rankText == "")
            {
                continue;
            }

            hotPoints.Add(new WHotPoint(Convert.ToInt32(rankText), link, caption, heat));
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
    }
}
/// <summary>
/// Completion handler: passes the page source and the event's regex to
/// <c>DataCheck.GetRegStrArr</c>. The returned array is not used further here.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Completion arguments supplying <c>PageSource</c> and <c>Regex</c>.</param>
private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
{
    string[] extracted = DataCheck.GetRegStrArr(
        e.PageSource,
        e.Regex);
}
/// <summary>
/// Completion handler: passes the page source and the event's regex to
/// <c>DataCheck.GetRegStrArr</c> and logs the number of returned strings
/// together with the elapsed time.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Completion arguments supplying <c>PageSource</c>, <c>Regex</c> and <c>Milliseconds</c>.</param>
private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
{
    string[] extracted = DataCheck.GetRegStrArr(e.PageSource, e.Regex);

    int extractedCount = extracted.Length;
    string logLine = "抓取新闻数量:" + extractedCount + "\t用时:" + e.Milliseconds;

    SetLog(logLine, Color.Black);
}
/// <summary>
/// Raises the <see cref="OnCompletedEvent"/> event.
/// </summary>
/// <param name="args">Event data handed to every subscriber.</param>
public void OnCompleted(OnCompletedEventArgs args)
{
    // Take a snapshot so the null check and the call see the same delegate.
    var subscribers = OnCompletedEvent;
    if (subscribers != null)
    {
        subscribers(this, args);
    }
}
/// <summary>
/// Completion handler that deliberately does nothing.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Completion arguments; not used.</param>
private void Crawler_OnCompleted(object sender, OnCompletedEventArgs e)
{
    // No work is performed when a crawl completes.
}
/// <summary>
/// Handles a downloaded page. When the page contains the <c>endText</c> container it is
/// stored as a news item (unless its URL is already stored); otherwise every qualifying
/// link on the page is crawled one level deeper.
/// </summary>
/// <param name="e">Completion arguments of the downloaded page.</param>
/// <param name="layer">Current crawl depth; links are not followed once it reaches 4.</param>
/// <param name="Title">Title to store for the page; nested crawls pass the link's inner text.</param>
private void Crawler_OnCompleted(OnCompletedEventArgs e, int layer, string Title = "")
{
    SetLog($"读取网站:{e.Uri.ToString()}\n\t\t\t\t深度:{layer}\t用时:{e.Milliseconds} 毫秒\t线程ID:{e.TreadID}", Color.Gray);
    UpAll(1);

    string pageUrl = e.Uri.ToString();
    urlList.Add(pageUrl);

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(e.PageSource);
    ConReg = "//div[@id='endText']";

    // Does the page carry article content?
    HtmlNode conNode = doc.DocumentNode.SelectSingleNode(ConReg);
    if (conNode != null)
    {
        // Skip pages whose URL has already been stored.
        if (_GSQ_NewsService.Exists(c => c.url == pageUrl))
        {
            return;
        }

        if (!string.IsNullOrEmpty(conNode.InnerHtml))
        {
            GSQ_News _News = new GSQ_News();
            _News.title = Title;
            _News.url = pageUrl;
            _News.sourcewebsite = conNode.InnerHtml;
            _News.num = 0;
            _News.CreateDate = DateTime.Now;
            _GSQ_NewsService.AddEntity(_News);
            UpCon(1);
            UpNum(1);
            SetLog($"抓取新闻《{Title}》,用时:{e.Milliseconds} 毫秒", Color.Gray);
        }

        return;
    }

    if (layer >= 4)
    {
        return;
    }

    // The same for every link on this page, so compute it once.
    string baseUrl = Utils.DelLastChar(pageUrl, "/", 0);

    // Walk every <a> element on the page.
    foreach (var item in doc.DocumentNode.Descendants("a"))
    {
        string href = item.Attributes["href"]?.Value;

        // Filter first, so that no crawler is created (and no handlers are
        // subscribed) for links that will not be followed.
        if (string.IsNullOrEmpty(href)
            || urlList.Contains(href)
            || !DataCheck.CheckReg(href, DataCheck.Reg_Url)
            || !href.Contains(baseUrl))
        {
            continue;
        }

        Crawler crawler = new Crawler();
        Operation operation = new Operation()
        {
            Action = (x) => { },
            Condition = (x) => { return(true); },
            timeout = 5000
        };
        crawler.OnError += Crawler_OnError;
        crawler.OnCompleted += (s, ex) => { Crawler_OnCompleted(ex, layer + 1, item.InnerText); };

        // Blocks until the started crawl task has finished.
        crawler.Start(href, operation, null).Wait();
    }
}