/// <summary>
/// Builds a crawler rooted at the URL carried by <paramref name="ci"/>, configures it
/// from the per-site crawl arguments, and runs the crawl.
/// </summary>
/// <param name="ci">Crawl job descriptor; <c>ci.url.Url</c> is the start address and the
/// descriptor itself is handed to the <c>MyPipelineStep</c> pipeline step.</param>
private void DO(CrawlerInfo ci)
{
    var startUri = new Uri(ci.url.Url);
    var site = HtmlParse.RecogSite(startUri);

    // Depth and include/exclude filters are looked up per recognised site type.
    var crawler = new NCrawler.Crawler(startUri, new HtmlDocumentProcessor(), new MyPipelineStep(ci));
    crawler.MaximumCrawlDepth = CrawlArgs.CrawlDepth(site);
    crawler.MaximumThreadCount = 5;
    crawler.IncludeFilter = CrawlArgs.IncludeFilter(site);
    crawler.ExcludeFilter = CrawlArgs.ExcludeFilter(site);

    crawler.Crawl();
}
/// <summary>
/// Pipeline step entry point: loads the crawled page as HTML, parses it into records,
/// stores every record through <c>DAL.Data.Add</c> and bumps <c>ci.Count</c> once per record.
/// </summary>
/// <param name="crawler">The crawler that produced the page (not used here).</param>
/// <param name="propertyBag">Per-page data supplied by the crawler.</param>
public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
{
    // The returned value is not read below; the call itself is kept as-is.
    var response = propertyBag.GetResponse();

    try
    {
        HtmlDocument document = HtmlParse.LoadFromHtml(propertyBag);
        var site = HtmlParse.RecogSite(propertyBag.ResponseUri);
        var parsed = Parse(document, site);

        // Parse returns null for pages that are not of interest.
        if (parsed != null)
        {
            foreach (var entry in parsed)
            {
                DAL.Data.Add(entry);
                ++ci.Count;
            }
        }
    }
    catch (NullReferenceException)
    {
        // Best-effort: a page whose markup is missing an expected node is skipped silently.
    }
}
/// <summary>
/// Parses a table node column by column, using the first cell of each column as its key.
/// If the table carries title rows, the titles are stored under the key <c>"Title"</c>.
/// </summary>
/// <remarks>
/// The column count is the largest number of <c>td</c> cells found in any direct <c>tr</c> child.
/// Rows with fewer cells (including rows with no <c>td</c> at all) are treated as title rows:
/// their text is passed to <c>HtmlParse.ParseProjectName</c> and a non-null result is appended
/// to the <c>"Title"</c> list. When two columns share the same header text only the first is kept.
/// </remarks>
/// <param name="tableRoot">The <c>table</c> element; a <c>tbody</c> child is used when present.</param>
/// <returns>
/// A dictionary that always contains <c>"Title"</c> (possibly empty) plus one entry per column,
/// mapping the header text to the whitespace-normalised cell texts below it.
/// </returns>
public static Dictionary<string, List<string>> ParseTable(HtmlNode tableRoot)
{
    var content = new Dictionary<string, List<string>>();
    var titles = new List<string>();
    content.Add("Title", titles);

    var tbody = tableRoot.SelectSingleNode("./tbody") ?? tableRoot;

    // SelectNodes yields null rather than an empty collection when nothing matches,
    // so a table without rows must be handled before iterating.
    var rows = tbody.SelectNodes("./tr");
    if (rows == null)
    {
        return content;
    }

    // Pass 1: the widest row defines how many columns the table has.
    var columnCnt = 0;
    foreach (var tr in rows)
    {
        var tds = tr.SelectNodes("./td");
        if (tds != null && tds.Count > columnCnt)
        {
            columnCnt = tds.Count;
        }
    }

    if (columnCnt <= 0)
    {
        return content;
    }

    var columns = new List<List<string>>(columnCnt);
    for (var i = 0; i < columnCnt; i++)
    {
        columns.Add(new List<string>());
    }

    // Pass 2: full-width rows feed the columns; narrower rows are candidate title rows.
    foreach (var tr in rows)
    {
        var tds = tr.SelectNodes("./td");
        var count = tds == null ? 0 : tds.Count;
        if (count < columnCnt)
        {
            var product = HtmlParse.ParseProjectName(tr.InnerText);
            if (product != null)
            {
                titles.Add(product);
            }
            continue;
        }

        for (var i = 0; i < count; i++)
        {
            columns[i].Add(Regex.Replace(tds[i].InnerText, "\\s+", " "));
        }
    }

    // The first cell of every column is its header; the remaining cells are its values.
    foreach (var items in columns)
    {
        if (items.Count == 0)
        {
            continue;
        }

        string key = items[0];
        if (content.ContainsKey(key))
        {
            // TODO: decide how repeated headers should be merged; for now the first one wins.
            Console.WriteLine($"Repeat Key: {key}");
            continue;
        }

        items.RemoveAt(0);
        content.Add(key, items);
    }

    return content;
}
/// <summary>
/// Extracts bid-award records (project name, winning company, amount, date) from a page.
/// </summary>
/// <remarks>
/// Only pages whose title is at least 8 characters long, contains 公告 / 公示 / 中标
/// (announcement / public notice / winning bid) and does not contain 流标 / 废标
/// (failed / voided tender) are processed. Depending on the site the content is read either
/// from a table (via <c>HtmlParse.ParseTable</c>) or from free text.
/// Company names have spaces removed; names shorter than 5 characters are recorded as "NULL".
/// </remarks>
/// <param name="doc">The loaded HTML document.</param>
/// <param name="siteType">The recognised site, which selects the content node and parsing mode.</param>
/// <returns>
/// <c>null</c> when the page is not of interest, the site is not supported, or no project name
/// could be determined; an empty list when the expected content node is missing; otherwise one
/// record per company found.
/// </returns>
public static List<Model.Data> Parse(HtmlDocument doc, SiteType siteType)
{
    var result = new List<Model.Data>();
    var title = HtmlParse.ParseTitle(doc, siteType);
    var rootNode = doc.DocumentNode;

    if (title == null
        || title.Length < 8
        || !Regex.IsMatch(title, "(公告|公示|中标)")
        || Regex.IsMatch(title, "(流标|废标)"))
    {
        return null;
    }

    var contentType = ContentType.Unrecognized;
    HtmlNode contentNode = null;
    switch (siteType)
    {
        case SiteType.NanChang:
        case SiteType.GanZhou:
            contentNode = rootNode.SelectSingleNode("//table[@class='MsoNormalTable']");
            contentType = ContentType.Table;
            break;

        case SiteType.JingDeZhen:
            contentNode = rootNode.SelectSingleNode("//*[@id='MyContent']");
            contentType = ContentType.Paragraph;
            break;

        case SiteType.JiangXi:
            contentNode = rootNode.SelectSingleNode("//*[@id='TDContent']//*[@class='MsoNormalTable']");
            contentType = ContentType.Table;
            break;

        case SiteType.PinXiang:
            // Fixed: the expression previously had an unclosed predicate and "talbe" for "table",
            // which is not valid XPath.
            contentNode = rootNode.SelectSingleNode("//table[@align='center']//table[@align='center']");
            contentType = ContentType.Table;
            break;

        case SiteType.FuZhou:
            contentNode = rootNode.SelectSingleNode("//body");
            contentType = ContentType.Paragraph;
            break;

        default:
            return null;
    }

    var product = HtmlParse.ParseProjectName(title);
    var facilities = new List<string>();
    var money = 0;
    var date = HtmlParse.ParseDate(doc, siteType);

    // A page without the expected content node yields no records, whichever mode applies.
    if (contentNode == null)
    {
        return result;
    }

    if (contentType == ContentType.Table)
    {
        // Table mode: pick the columns by their header text.
        var content = HtmlParse.ParseTable(contentNode);

        List<string> titles;
        if (content.TryGetValue("Title", out titles) && titles.Count > 0)
        {
            // The page title takes precedence; the table title is only a fallback.
            product = product ?? titles[0];
        }

        foreach (var item in content)
        {
            if (Regex.IsMatch(item.Key, "商|单位|公司"))
            {
                // Header names a supplier / organisation / company column.
                facilities = item.Value;
            }
            else if (Regex.IsMatch(item.Key, "金额"))
            {
                // Header names an amount column; only its first cell is used.
                money = item.Value.Count > 0 ? HtmlParse.ParseMoney(item.Value[0]) : 0;
            }
        }
    }
    else if (contentType == ContentType.Paragraph)
    {
        // Paragraph mode: extract everything from the node's plain text.
        var text = contentNode.InnerText;
        facilities = HtmlParse.ParseFacilities(text) ?? facilities;
        money = HtmlParse.ParseMoney(text);
    }

    if (product == null)
    {
        return null;
    }

    var projectName = product.Trim();
    foreach (var facility in facilities)
    {
        // Strip spaces; anything shorter than 5 characters is not treated as a company name.
        var company = facility == null ? null : facility.Replace(" ", "");
        if (company != null && company.Length < 5)
        {
            company = null;
        }

        try
        {
            result.Add(new Model.Data()
            {
                ProjectName = projectName,
                WinCom = (company ?? "NULL").Trim(),
                Money = money,
                Time = date,
            });
        }
        catch (FormatException)
        {
            // NOTE(review): presumably a Model.Data setter can reject a malformed value;
            // such a record is skipped and the rest are kept - confirm against Model.Data.
        }
    }

    return result;
}