/// <summary>
/// Parses the HTML carried by <paramref name="args"/> and, when the page contains exactly one
/// <c>title</c> element, prints that element's inner HTML, the page URL, and a blank line.
/// Pages with zero or several <c>title</c> elements produce no output.
/// </summary>
/// <param name="args">Event data for the received page; its <c>Html</c> and <c>Url</c> members are read.</param>
public void AnalysisHtml(DataReceivedEventArgs args)
{
    string markup = args.Html;

    var page = new HtmlDocument();
    page.LoadHtml(markup);

    var titleNodes = page.DocumentNode.CssSelect("title");

    // Guard clause: only a page with a single, unambiguous title is reported.
    if (titleNodes == null || titleNodes.Count() != 1)
    {
        return;
    }

    Console.WriteLine(titleNodes.FirstOrDefault().InnerHtml);
    Console.WriteLine(args.Url);
    Console.WriteLine();
}
/// <summary>
/// Inspects a fetched page and reports whether it looks like a rejected/placeholder response
/// rather than a real detail page.
/// </summary>
/// <param name="args">Event data for the received page; only its <c>Html</c> member is read.</param>
/// <returns>
/// <c>true</c> when the HTML is null/empty, contains the text "503 Service Unavailable",
/// or has no element with id <c>cy_center</c>; otherwise <c>false</c>.
/// <c>false</c> is also returned when inspecting the page throws (the error is logged).
/// </returns>
public bool IPLimitProcess(SimpleCrawler.DataReceivedEventArgs args)
{
    try
    {
        if (string.IsNullOrEmpty(args.Html) || args.Html.Contains("503 Service Unavailable"))
        {
            return true;
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(args.Html);

        // The detail page is expected to carry a "cy_center" container; its absence is
        // treated the same way as an explicit 503 response.
        if (document.GetElementbyId("cy_center") == null)
        {
            Console.WriteLine("查找不到ID对象");
            return true;
        }
    }
    catch (Exception exception)
    {
        // Best-effort check: keep the original "not limited" result on failure, but surface
        // the error instead of swallowing it silently (same logging style as DataReceive).
        Console.WriteLine(exception.Message);
    }

    return false;
}
/// <summary>
/// Scrapes a project detail page into a <c>BsonDocument</c> and, when at least one field was
/// extracted, enqueues it on <c>DBChangeQueue</c> as an update of the record whose <c>_id</c>
/// is parsed from <c>args.urlInfo.Authorization</c>.
/// </summary>
/// <remarks>
/// If the page has no element with id <c>cy_center</c>, a message is printed and nothing else
/// happens. Any exception raised while scraping is caught and its message is printed.
/// When <c>UrlQueue</c> is empty after processing, the method blocks until all tasks in
/// <c>allTask</c> have finished and then waits on <c>Console.ReadLine()</c>.
/// </remarks>
/// <param name="args">Event data for the received page; <c>Html</c> and <c>urlInfo.Authorization</c> are read.</param>
public void DataReceive(SimpleCrawler.DataReceivedEventArgs args)
{
    try
    {
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(args.Html);
        string authorization = args.urlInfo.Authorization;

        HtmlNode center = document.GetElementbyId("cy_center");
        if (center == null)
        {
            Console.WriteLine("查找不到ID对象");
            return;
        }

        BsonDocument bsonDoc = new BsonDocument();

        // Thumbnail: stored both as the original src and as the locally saved copy.
        HtmlNode thumb = center.SelectSingleNode("./div/div[@class='cy-icon-box']/img");
        if ((thumb != null) && (thumb.Attributes["src"] != null))
        {
            BsonDocument savedThumb = SaveProductImg(thumb, "projThumb");
            bsonDoc.Set("projLocalThumb", savedThumb.Text("localSrc"));
            bsonDoc.Set("projThumb", savedThumb.Text("src"));
        }

        // NOTE(review): the XPath expressions below start with "//", which in XPath is
        // document-rooted rather than relative to "center" — confirm this is intended
        // before changing them to ".//".
        AddTrimmedTextIfPresent(bsonDoc, "name", center.SelectSingleNode("//h1[@class='cy-cp-name']"));

        HtmlNode pushDateNode = center.SelectSingleNode("//div[@class='cy-xq-time']");
        if (pushDateNode != null)
        {
            // Strip the "publish time" label so only the date text is stored.
            bsonDoc.Add("pushDate", pushDateNode.InnerText.Replace("发布时间:", "").Trim());
        }

        AddTrimmedTextIfPresent(bsonDoc, "tag", center.SelectSingleNode("//div[@class='cy-tag-list']"));
        AddTrimmedTextIfPresent(bsonDoc, "companyIntro", center.SelectSingleNode("//div[@class='cy-cp-intro']"));
        AddTrimmedTextIfPresent(bsonDoc, "productIntro", center.SelectSingleNode("//div[@class='cy-cp-intro-info']"));
        AddTrimmedTextIfPresent(bsonDoc, "domain", document.GetElementbyId("business"));

        // Fix: the original stored the "business" node's text under "shareholders" (and threw
        // a NullReferenceException when "business" was missing). Read the shareholders node itself.
        AddTrimmedTextIfPresent(bsonDoc, "shareholders", document.GetElementbyId("shareholders"));

        HtmlNodeCollection galleryImgs = center.SelectNodes("//ul[@class='gallery-img-box']/li/img");
        if (galleryImgs != null)
        {
            BsonDocument savedGallery = SaveProductImgs(galleryImgs, bsonDoc.Text("pushDate"));
            bsonDoc.Add("srcList", savedGallery.Text("srcList"));
            bsonDoc.Add("localSrcList", savedGallery.Text("localSrcList"));
        }

        AddTrimmedTextIfPresent(bsonDoc, "advantage", document.GetElementbyId("advantage"));
        AddTrimmedTextIfPresent(bsonDoc, "achievements", document.GetElementbyId("results"));

        HtmlNodeCollection teamItems = center.SelectNodes("//ul[@class='cy-cp-team']/li");
        if (teamItems != null)
        {
            BsonArray team = new BsonArray();
            foreach (HtmlNode member in teamItems)
            {
                HtmlNode nameNode = member.SelectSingleNode("./div[@class='team-personnel-name']");
                HtmlNode positionNode = member.SelectSingleNode("./div[@class='team-personnel-position']");
                HtmlNode introNode = member.SelectSingleNode("./div[@class='team-personnel-intro']");

                // Only complete entries (name + position + intro) are recorded.
                if ((nameNode != null) && (positionNode != null) && (introNode != null))
                {
                    team.Add(new BsonDocument
                    {
                        { "name", nameNode.InnerText.Trim() },
                        { "position", positionNode.InnerText.Trim() },
                        { "intro", introNode.InnerText.Trim() }
                    });
                }
            }

            bsonDoc.Add("teamInfo", team);
        }

        AddTrimmedTextIfPresent(bsonDoc, "agree", center.SelectSingleNode("//div[@data-type='agree_chuangye']"));
        AddTrimmedTextIfPresent(bsonDoc, "disAgree", center.SelectSingleNode("//div[@data-type='disagree_chuangye']"));

        // Three variants of the company-info box; each <li> is a "label:value" pair whose
        // label becomes the field name. Later boxes overwrite earlier ones on the same label.
        CopyLabelledListItems(center, "//div[@class='box-moder cy-box-moder company-info']/ul/li", bsonDoc);
        CopyLabelledListItems(center, "//div[@class='box-moder cy-box-moder company-info get-company-box hide']/ul/li", bsonDoc);
        // In the last box the "办公地点" label is stored under the "地址" field instead.
        CopyLabelledListItems(center, "//div[@class='box-moder cy-box-moder company-info get-company-box']/ul/li", bsonDoc, "办公地点", "地址");

        if (bsonDoc.ElementCount > 0)
        {
            bsonDoc.Set("isUpdate", "2");
            Console.WriteLine(bsonDoc.Text("name") + "更新成功");
            StorageData target = new StorageData
            {
                Document = bsonDoc,
                Query = Query.EQ("_id", ObjectId.Parse(authorization)),
                Name = this.DataTableName,
                Type = StorageType.Update
            };
            DBChangeQueue.Instance.EnQueue(target);
        }

        if (UrlQueue.Instance.Count == 0)
        {
            // No URLs left: block until every outstanding task has finished, report completion,
            // then keep the console open until the user presses Enter.
            Task.WaitAll(allTask.ToArray());
            Task.WhenAll(allTask).GetAwaiter().OnCompleted(() => Console.WriteLine("所有下载完成"));
            Console.ReadLine();
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine(exception.Message);
    }
}

/// <summary>
/// Adds <paramref name="node"/>'s trimmed inner text to <paramref name="target"/> under
/// <paramref name="key"/>; does nothing when <paramref name="node"/> is null.
/// </summary>
private static void AddTrimmedTextIfPresent(BsonDocument target, string key, HtmlNode node)
{
    if (node != null)
    {
        target.Add(key, node.InnerText.Trim());
    }
}

/// <summary>
/// Selects list items with <paramref name="xpath"/> and copies each "label:value" item into
/// <paramref name="target"/> as field <c>label</c> = <c>value</c>. Items that do not split into
/// at least two non-empty parts are skipped. When <paramref name="renameFrom"/> is given, a
/// label equal to it is stored under <paramref name="renameTo"/> instead.
/// </summary>
private static void CopyLabelledListItems(HtmlNode scope, string xpath, BsonDocument target, string renameFrom = null, string renameTo = null)
{
    HtmlNodeCollection items = scope.SelectNodes(xpath);
    if (items == null)
    {
        return;
    }

    // NOTE(review): both separators are identical here; one may originally have been a
    // full-width colon — confirm against the site's markup.
    string[] separator = new string[] { ":", ":" };

    foreach (HtmlNode item in items)
    {
        string[] parts = item.InnerText.Trim().Split(separator, StringSplitOptions.RemoveEmptyEntries);
        if (parts.Length < 2)
        {
            continue;
        }

        string key = parts[0].Trim();
        if ((renameFrom != null) && (key == renameFrom))
        {
            key = renameTo;
        }

        target.Set(key, parts[1].Trim());
    }
}
/// <summary>
/// Handler for the master crawler's data-received event; forwards the received page
/// to the HTML analysis service.
/// </summary>
/// <param name="args">Event data for the received page, passed through unchanged.</param>
private void MasterDataReceivedEvent(DataReceivedEventArgs args) =>
    _htmlAnalysisService.AnalysisHtml(args);