/// <summary>
/// Parses the received HTML and, when the "title" selector matches exactly one
/// node, prints that node's inner HTML, the page URL and a blank line.
/// </summary>
/// <param name="args">Received page data; <c>Html</c> is parsed and <c>Url</c> is printed.</param>
public void AnalysisHtml(DataReceivedEventArgs args)
        {
            var document = new HtmlDocument();
            document.LoadHtml(args.Html);

            var titleNodes = document.DocumentNode.CssSelect("title");

            // Nothing is printed unless the selector matched exactly one node.
            if (titleNodes == null || titleNodes.Count() != 1)
            {
                return;
            }

            Console.WriteLine(titleNodes.FirstOrDefault().InnerHtml);
            Console.WriteLine(args.Url);
            Console.WriteLine();
        }
Exemple #2
0
 /// <summary>
 /// Checks whether a received response lacks the expected page content.
 /// </summary>
 /// <param name="args">Received page data; only <c>Html</c> is inspected.</param>
 /// <returns>
 /// <c>true</c> when the HTML is null or empty, contains the text
 /// "503 Service Unavailable", or has no element with id "cy_center";
 /// <c>false</c> otherwise, including when an exception occurs while checking.
 /// </returns>
 /// <remarks>
 /// NOTE(review): judging by the method name, callers presumably treat <c>true</c>
 /// as "request was IP-limited" - confirm against the caller.
 /// </remarks>
 public bool IPLimitProcess(SimpleCrawler.DataReceivedEventArgs args)
 {
     try
     {
         string html = args.Html;
         if (string.IsNullOrEmpty(html) || html.Contains("503 Service Unavailable"))
         {
             return true;
         }

         HtmlDocument document = new HtmlDocument();
         document.LoadHtml(html);
         if (document.GetElementbyId("cy_center") == null)
         {
             Console.WriteLine("查找不到ID对象");
             return true;
         }
     }
     catch (Exception exception)
     {
         // Report the failure instead of hiding it; the result stays "false"
         // so callers observe the same return value as before.
         Console.WriteLine(exception.Message);
     }

     return false;
 }
Exemple #3
0
 /// <summary>
 /// Parses a received page, collects the fields found on it into a
 /// <see cref="BsonDocument"/> and enqueues that document as an update for the
 /// record whose <c>_id</c> is parsed from <c>args.urlInfo.Authorization</c>.
 /// </summary>
 /// <param name="args">
 /// Received page data; <c>Html</c> is parsed and <c>urlInfo.Authorization</c>
 /// is parsed as the target object id.
 /// </param>
 /// <remarks>
 /// Any exception is caught and only its message is written to the console.
 /// When the page has no element with id "cy_center" nothing is queued.
 /// </remarks>
 public void DataReceive(SimpleCrawler.DataReceivedEventArgs args)
 {
     try
     {
         HtmlDocument document = new HtmlDocument();
         document.LoadHtml(args.Html);
         string authorization = args.urlInfo.Authorization;

         HtmlNode center = document.GetElementbyId("cy_center");
         if (center == null)
         {
             Console.WriteLine("查找不到ID对象");
             return;
         }

         BsonDocument bsonDoc = new BsonDocument();

         HtmlNode thumb = center.SelectSingleNode("./div/div[@class='cy-icon-box']/img");
         if ((thumb != null) && (thumb.Attributes["src"] != null))
         {
             BsonDocument savedThumb = SaveProductImg(thumb, "projThumb");
             bsonDoc.Set("projLocalThumb", savedThumb.Text("localSrc"));
             bsonDoc.Set("projThumb", savedThumb.Text("src"));
         }

         // The XPath expressions below start with "//", which in XPath selects
         // from the document root rather than only beneath the context node.
         AddTrimmedText(bsonDoc, "name", center.SelectSingleNode("//h1[@class='cy-cp-name']"));

         HtmlNode pushDate = center.SelectSingleNode("//div[@class='cy-xq-time']");
         if (pushDate != null)
         {
             // Strip the label prefix before trimming so only the date text remains.
             bsonDoc.Add("pushDate", pushDate.InnerText.Replace("发布时间:", "").Trim());
         }

         AddTrimmedText(bsonDoc, "tag", center.SelectSingleNode("//div[@class='cy-tag-list']"));
         AddTrimmedText(bsonDoc, "companyIntro", center.SelectSingleNode("//div[@class='cy-cp-intro']"));
         AddTrimmedText(bsonDoc, "productIntro", center.SelectSingleNode("//div[@class='cy-cp-intro-info']"));
         AddTrimmedText(bsonDoc, "domain", document.GetElementbyId("business"));

         HtmlNode shareholders = document.GetElementbyId("shareholders");
         if (shareholders != null)
         {
             // Store the shareholders element's own text. Reading the "business"
             // element here would duplicate "domain" and would throw when that
             // element is missing.
             bsonDoc.Add("shareholders", shareholders.InnerText);
         }

         HtmlNodeCollection gallery = center.SelectNodes("//ul[@class='gallery-img-box']/li/img");
         if (gallery != null)
         {
             BsonDocument savedGallery = SaveProductImgs(gallery, bsonDoc.Text("pushDate"));
             bsonDoc.Add("srcList", savedGallery.Text("srcList"));
             bsonDoc.Add("localSrcList", savedGallery.Text("localSrcList"));
         }

         AddTrimmedText(bsonDoc, "advantage", document.GetElementbyId("advantage"));
         AddTrimmedText(bsonDoc, "achievements", document.GetElementbyId("results"));

         HtmlNodeCollection teamItems = center.SelectNodes("//ul[@class='cy-cp-team']/li");
         if (teamItems != null)
         {
             bsonDoc.Add("teamInfo", BuildTeamInfo(teamItems));
         }

         AddTrimmedText(bsonDoc, "agree", center.SelectSingleNode("//div[@data-type='agree_chuangye']"));
         AddTrimmedText(bsonDoc, "disAgree", center.SelectSingleNode("//div[@data-type='disagree_chuangye']"));

         SetLabelledValues(
             bsonDoc,
             center.SelectNodes("//div[@class='box-moder cy-box-moder company-info']/ul/li"));
         SetLabelledValues(
             bsonDoc,
             center.SelectNodes("//div[@class='box-moder cy-box-moder company-info get-company-box hide']/ul/li"));
         // In this last list the "办公地点" label is stored under the key "地址".
         SetLabelledValues(
             bsonDoc,
             center.SelectNodes("//div[@class='box-moder cy-box-moder company-info get-company-box']/ul/li"),
             "办公地点",
             "地址");

         if (bsonDoc.ElementCount > 0)
         {
             bsonDoc.Set("isUpdate", "2");
             Console.WriteLine(bsonDoc.Text("name") + "更新成功");
             StorageData target = new StorageData
             {
                 Document = bsonDoc,
                 Query    = Query.EQ("_id", ObjectId.Parse(authorization)),
                 Name     = this.DataTableName,
                 Type     = StorageType.Update
             };
             DBChangeQueue.Instance.EnQueue(target);
         }

         if (UrlQueue.Instance.Count == 0)
         {
             // Blocks until every task in allTask has finished, reports completion,
             // then waits for a line on the console.
             Task.WaitAll(allTask.ToArray());
             Task.WhenAll(allTask).GetAwaiter().OnCompleted(() => Console.WriteLine("所有下载完成"));
             Console.ReadLine();
         }
     }
     catch (Exception exception)
     {
         Console.WriteLine(exception.Message);
     }
 }

 /// <summary>
 /// Adds the trimmed inner text of <paramref name="node"/> to
 /// <paramref name="target"/> under <paramref name="key"/>; does nothing when the node is null.
 /// </summary>
 private static void AddTrimmedText(BsonDocument target, string key, HtmlNode node)
 {
     if (node != null)
     {
         target.Add(key, node.InnerText.Trim());
     }
 }

 /// <summary>
 /// Builds an array with one name/position/intro document per team list item;
 /// items missing any of the three child divs are skipped.
 /// </summary>
 private static BsonArray BuildTeamInfo(HtmlNodeCollection teamItems)
 {
     BsonArray team = new BsonArray();
     foreach (HtmlNode item in teamItems)
     {
         HtmlNode name = item.SelectSingleNode("./div[@class='team-personnel-name']");
         HtmlNode position = item.SelectSingleNode("./div[@class='team-personnel-position']");
         HtmlNode intro = item.SelectSingleNode("./div[@class='team-personnel-intro']");
         if ((name == null) || (position == null) || (intro == null))
         {
             continue;
         }

         team.Add(new BsonDocument
         {
             { "name", name.InnerText.Trim() },
             { "position", position.InnerText.Trim() },
             { "intro", intro.InnerText.Trim() }
         });
     }

     return team;
 }

 /// <summary>
 /// Splits each item's text into "label:value" and sets the value on
 /// <paramref name="target"/> under the label. When <paramref name="renameFrom"/>
 /// is given, a label equal to it is stored under <paramref name="renameTo"/> instead.
 /// Does nothing when <paramref name="items"/> is null.
 /// </summary>
 private static void SetLabelledValues(BsonDocument target, HtmlNodeCollection items, string renameFrom = null, string renameTo = null)
 {
     if (items == null)
     {
         return;
     }

     // NOTE(review): both separator entries are the same string as written here;
     // presumably one was meant to be a full-width colon - confirm.
     string[] separator = new string[] { ":", ":" };
     foreach (HtmlNode item in items)
     {
         string[] parts = item.InnerText.Trim().Split(separator, StringSplitOptions.RemoveEmptyEntries);
         if (parts.Length < 2)
         {
             continue;
         }

         string label = parts[0].Trim();
         if ((renameFrom != null) && (label == renameFrom))
         {
             label = renameTo;
         }

         target.Set(label, parts[1].Trim());
     }
 }
 /// <summary>
 /// Forwards received page data to the HTML analysis service.
 /// </summary>
 /// <param name="args">
 /// The received page data, passed through unchanged.
 /// </param>
 private void MasterDataReceivedEvent(DataReceivedEventArgs args) =>
     _htmlAnalysisService.AnalysisHtml(args);