/// <summary>
/// Downloads the landing page at <c>SiteUrl</c> and follows three groups of award-category
/// links found on it (excellent specialty projects, other projects, Shenzhen area). Every
/// row of every paged listing behind those links is converted into a <c>CorpMerit</c> record.
/// </summary>
/// <param name="crawlAll">Not read by this implementation; every page of every category is always crawled.</param>
/// <returns>
/// The collected <c>CorpMerit</c> records. The list is empty when the landing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<CorpMerit>();
    string html;
    try
    {
        html = ToolWeb.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Best effort: without the landing page there is nothing to crawl.
        return list;
    }

    CollectSpecialtyProjectMerits(html, list);
    CollectOtherProjectMerits(html, list);
    CollectShenzhenAreaMerits(html, list);
    return list;
}

// Section "excellent specialty projects": links come from the first table nested in a td with height="29".
private static void CollectSpecialtyProjectMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(
        new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true),
        new TagNameFilter("table")));
    NodeList links = ExtractLinksFromTable(tables, 0);
    if (links == null)
    {
        return;
    }

    for (int j = 0; j < links.Count; j++)
    {
        ATag aTag = links[j].GetATag();
        if (aTag == null)
        {
            continue;
        }
        string typename = aTag.LinkText.Replace("·", "");
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

        string firstPageHtml = DownloadListingHtml(url);
        if (firstPageHtml == null)
        {
            continue;
        }
        int pageCount = ReadListingPageCount(firstPageHtml, "div", "align", "center");

        for (int page = 1; page <= pageCount; page++)
        {
            string pageHtml = page == 1 ? firstPageHtml : DownloadListingHtml(url + "&page=" + page);
            if (pageHtml == null)
            {
                continue;
            }

            parser = new Parser(new Lexer(pageHtml));
            NodeList textTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text")));
            if (textTables == null || textTables.Count == 0)
            {
                continue;
            }

            // The data table (width="844") is searched for inside the HTML of the class="text" tables.
            parser = new Parser(new Lexer(textTables.ToHtml()));
            NodeList dataTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "844")));
            TableTag tableTag = TableAt(dataTables, 0);
            if (tableTag == null)
            {
                continue;
            }

            for (int k = 0; k < tableTag.RowCount; k++)
            {
                TableRow tr = tableTag.Rows[k];
                if (tr.ColumnCount < 9)
                {
                    // Cells 1..8 are read below; a shorter row would otherwise abort the whole crawl.
                    continue;
                }

                string meritName = "优质专业工程";
                string meritType = typename;
                string meritPrjName = tr.Columns[1].ToNodePlainString();
                string corpName = tr.Columns[2].ToNodePlainString();
                string prjMgr = tr.Columns[3].ToNodePlainString();
                string supUnit = tr.Columns[4].ToNodePlainString();
                string supMgr = tr.Columns[5].ToNodePlainString();
                string manCost = tr.Columns[6].ToNodePlainString();
                if (manCost.Contains("吨"))
                {
                    // A value containing "吨" is discarded rather than stored as ManCost.
                    manCost = string.Empty;
                }
                string proArea = tr.Columns[7].ToNodePlainString();
                string meritYear = tr.Columns[8].ToNodePlainString();
                string prjSupporter = string.Empty;
                string pileConsUnit = string.Empty;
                string buildingType = string.Empty;

                AddShenzhenMerit(list, url, corpName, meritYear, meritName, meritPrjName, prjSupporter, meritType, prjMgr, supMgr, manCost, proArea, supUnit, pileConsUnit, buildingType);
            }
        }
    }
}

// Section "other projects": links come from the third centered table nested in a td with height="32";
// the cell layout of each listing depends on the category name.
private static void CollectOtherProjectMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(
        new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true),
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center"))));
    NodeList links = ExtractLinksFromTable(tables, 2);
    if (links == null)
    {
        return;
    }

    for (int j = 0; j < links.Count; j++)
    {
        ATag aTag = links[j].GetATag();
        if (aTag == null)
        {
            continue;
        }
        string typename = aTag.LinkText;
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

        string firstPageHtml = DownloadListingHtml(url);
        if (firstPageHtml == null)
        {
            continue;
        }
        int pageCount = ReadListingPageCount(firstPageHtml, "div", "align", "center");

        for (int page = 1; page <= pageCount; page++)
        {
            // A failed page download skips that page. Falling through with the previous
            // page's HTML would add the previous page's rows a second time.
            string pageHtml = page == 1 ? firstPageHtml : DownloadListingHtml(url + "&page=" + page.ToString());
            if (pageHtml == null)
            {
                continue;
            }

            parser = new Parser(new Lexer(pageHtml));
            NodeList dataTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text16")));
            TableTag tableTag = TableAt(dataTables, 0);
            if (tableTag == null)
            {
                continue;
            }

            for (int t = 0; t < tableTag.RowCount; t++)
            {
                TableRow tr = tableTag.Rows[t];
                string corpName = string.Empty, meritYear = string.Empty, meritPrjName = string.Empty,
                       prjSupporter = string.Empty, prjMgr = string.Empty, supMgr = string.Empty,
                       manCost = string.Empty, proArea = string.Empty, supUnit = string.Empty,
                       pileConsUnit = string.Empty, buildingType = string.Empty;
                string meritName = typename;
                string meritType = typename;

                // Each branch first checks that the row has every cell it is about to read.
                if (typename.Contains("优质工程"))
                {
                    if (tr.ColumnCount < 10)
                    {
                        continue;
                    }
                    meritName = meritType = "深圳市" + typename;
                    meritPrjName = tr.Columns[2].ToNodePlainString();
                    corpName = tr.Columns[3].ToNodePlainString();
                    prjMgr = tr.Columns[4].ToNodePlainString();
                    supUnit = tr.Columns[5].ToNodePlainString();
                    supMgr = tr.Columns[6].ToNodePlainString();
                    prjSupporter = tr.Columns[7].ToNodePlainString();
                    string temp = tr.Columns[8].ToNodePlainString();
                    // The same cell holds either a cost (contains "元") or an area.
                    if (temp.Contains("元"))
                    {
                        manCost = temp;
                    }
                    else
                    {
                        proArea = temp;
                    }
                    meritYear = tr.Columns[9].ToNodePlainString();
                }
                else if (typename.Contains("优质结构工程"))
                {
                    if (tr.ColumnCount < 11)
                    {
                        continue;
                    }
                    meritName = meritType = "深圳市" + typename;
                    meritPrjName = tr.Columns[1].ToNodePlainString();
                    corpName = tr.Columns[2].ToNodePlainString();
                    prjMgr = tr.Columns[3].ToNodePlainString();
                    pileConsUnit = tr.Columns[4].ToNodePlainString();
                    supUnit = tr.Columns[5].ToNodePlainString();
                    supMgr = tr.Columns[6].ToNodePlainString();
                    string temp = tr.Columns[8].ToNodePlainString();
                    if (temp.Contains("元"))
                    {
                        manCost = temp;
                    }
                    else
                    {
                        proArea = temp;
                    }
                    meritYear = tr.Columns[10].ToNodePlainString();
                }
                else if (typename.Contains("用户满意工程"))
                {
                    if (tr.ColumnCount < 7)
                    {
                        continue;
                    }
                    meritName = meritType = "深圳市" + typename;
                    meritPrjName = tr.Columns[1].ToNodePlainString();
                    corpName = tr.Columns[2].ToNodePlainString();
                    supUnit = tr.Columns[3].ToNodePlainString();
                    buildingType = tr.Columns[4].ToNodePlainString();
                    proArea = tr.Columns[5].ToNodePlainString();
                    meritYear = tr.Columns[6].ToNodePlainString();
                }
                else if (typename.Contains("绿色施工示范工程"))
                {
                    if (tr.ColumnCount < 11)
                    {
                        continue;
                    }
                    meritName = meritType = "深圳市" + typename;
                    meritPrjName = tr.Columns[2].ToNodePlainString();
                    corpName = tr.Columns[3].ToNodePlainString();
                    prjMgr = tr.Columns[4].ToNodePlainString();
                    supUnit = tr.Columns[5].ToNodePlainString();
                    supMgr = tr.Columns[6].ToNodePlainString();
                    prjSupporter = tr.Columns[8].ToNodePlainString();
                    meritYear = tr.Columns[10].ToNodePlainString();
                }
                else if (typename.Contains("文明工地") || typename.Contains("双优工地") || typename.Contains("双优样板工地"))
                {
                    if (tr.ColumnCount < 8)
                    {
                        continue;
                    }
                    meritPrjName = tr.Columns[1].ToNodePlainString();
                    corpName = tr.Columns[2].ToNodePlainString();
                    prjMgr = tr.Columns[3].ToNodePlainString();
                    supUnit = tr.Columns[4].ToNodePlainString();
                    supMgr = tr.Columns[5].ToNodePlainString();
                    string temp = tr.Columns[6].ToNodePlainString();
                    if (temp.Contains("元"))
                    {
                        manCost = temp;
                    }
                    else
                    {
                        proArea = temp;
                    }
                    meritYear = tr.Columns[7].ToNodePlainString();
                }

                // Categories matching none of the branches still yield a record carrying only the category name.
                AddShenzhenMerit(list, url, corpName, meritYear, meritName, meritPrjName, prjSupporter, meritType, prjMgr, supMgr, manCost, proArea, supUnit, pileConsUnit, buildingType);
            }
        }
    }
}

// Section "Shenzhen area": links come from the second table nested in a td with height="29".
private static void CollectShenzhenAreaMerits(string html, IList list)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(
        new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true),
        new TagNameFilter("table")));
    // The second table is used, so at least two matches are required.
    NodeList links = ExtractLinksFromTable(tables, 1);
    if (links == null)
    {
        return;
    }

    for (int j = 0; j < links.Count; j++)
    {
        ATag aTag = links[j].GetATag();
        if (aTag == null)
        {
            continue;
        }
        string typename = aTag.LinkText.Replace("·", "");
        string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

        string firstPageHtml = DownloadListingHtml(url);
        if (firstPageHtml == null)
        {
            continue;
        }
        int pageCount = ReadListingPageCount(firstPageHtml, "form", "name", "gopage");

        for (int page = 1; page <= pageCount; page++)
        {
            // NOTE(review): this section pages with "?page=" while the other two use "&page="; kept as found.
            string pageHtml = page == 1 ? firstPageHtml : DownloadListingHtml(url + "?page=" + page.ToString());
            if (pageHtml == null)
            {
                continue;
            }

            parser = new Parser(new Lexer(pageHtml));
            string tableClass = typename.Contains("鲁班奖") ? "py_tbl" : "text18";
            NodeList dataTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", tableClass)));
            TableTag tableTag = TableAt(dataTables, 0);
            if (tableTag == null)
            {
                continue;
            }

            // Row 0 is skipped in this section.
            for (int t = 1; t < tableTag.RowCount; t++)
            {
                TableRow tr = tableTag.Rows[t];
                if (tr.ColumnCount < 7)
                {
                    // Cells 1..6 are read below.
                    continue;
                }

                string meritName = typename;
                string meritType = typename;
                string meritPrjName = tr.Columns[1].ToNodePlainString();
                string corpName = tr.Columns[2].ToNodePlainString();
                // NOTE(review): the two colon replacements are identical as written; one was
                // presumably meant for a different colon character - confirm.
                string prjSupporter = tr.Columns[3].ToNodePlainString().Replace("参建单位", "").Replace(":", "").Replace(":", "");
                string supUnit = tr.Columns[4].ToNodePlainString();
                string prjMgr = tr.Columns[5].ToNodePlainString();
                string meritYear = tr.Columns[6].ToNodePlainString();
                string supMgr = string.Empty;
                string manCost = string.Empty;
                string proArea = string.Empty;
                string pileConsUnit = string.Empty;
                string buildingType = string.Empty;

                AddShenzhenMerit(list, url, corpName, meritYear, meritName, meritPrjName, prjSupporter, meritType, prjMgr, supMgr, manCost, proArea, supUnit, pileConsUnit, buildingType);
            }
        }
    }
}

// Downloads a listing page; returns null instead of throwing so callers can skip the page.
private static string DownloadListingHtml(string url)
{
    try
    {
        return ToolWeb.GetHtmlByUrl(url, Encoding.Default);
    }
    catch
    {
        return null;
    }
}

// Reads the page count from the pager element(s) selected by tag and attribute; falls back to 1.
private static int ReadListingPageCount(string listHtml, string pagerTag, string pagerAttribute, string pagerValue)
{
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter(pagerTag), new HasAttributeFilter(pagerAttribute, pagerValue)));
    if (pagerNodes == null || pagerNodes.Count == 0)
    {
        return 1;
    }

    try
    {
        // Presumably yields the text between "/" and "页" in the pager - confirm against GetRegexBegEnd.
        string pageText = pagerNodes.AsString().GetRegexBegEnd("/", "页");
        int pages;
        if (int.TryParse(pageText, out pages))
        {
            return pages;
        }
    }
    catch
    {
        // Unexpected pager markup: treat the listing as a single page.
    }
    return 1;
}

// Returns the node at the given index as a TableTag, or null when it is missing or not a table.
private static TableTag TableAt(NodeList nodes, int index)
{
    if (nodes == null || nodes.Count <= index)
    {
        return null;
    }
    return nodes[index] as TableTag;
}

// Returns all <a> nodes inside the table at the given index, or null when that table is unavailable.
private static NodeList ExtractLinksFromTable(NodeList tables, int tableIndex)
{
    TableTag table = TableAt(tables, tableIndex);
    if (table == null)
    {
        return null;
    }
    Parser parser = new Parser(new Lexer(table.ToHtml()));
    return parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
}

// Builds one CorpMerit for province "广东省" / city "深圳市" and appends it to the result list.
private static void AddShenzhenMerit(IList list, string url, string corpName, string meritYear, string meritName, string meritPrjName, string prjSupporter, string meritType, string prjMgr, string supMgr, string manCost, string proArea, string supUnit, string pileConsUnit, string buildingType)
{
    // These fields are never populated by this crawler.
    string corpCode = string.Empty, meritDate = string.Empty, meritLevel = string.Empty,
           meritRegion = string.Empty, meritSector = string.Empty, source = string.Empty,
           remark = string.Empty, details = string.Empty;

    CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", corpCode, corpName, meritYear, meritName, meritDate, meritLevel, meritRegion, meritSector, meritPrjName, prjSupporter, source, url, remark, details, meritType, prjMgr, supMgr, manCost, proArea, supUnit, pileConsUnit, buildingType);
    list.Add(info);
}
/// <summary>
/// Queries the award search form at <c>SiteUrl</c> in consecutive 30-day windows starting
/// at 1980-01-01 and running up to the current date. For every result row it downloads the
/// linked detail page and saves a <c>CorpMerit</c> record through <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">Not read by this implementation; the full date range is always crawled.</param>
/// <returns>
/// Always <c>null</c>: records are saved directly instead of being returned. The crawl stops
/// early (also returning <c>null</c>) as soon as the initial search request of any window fails.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    string html = string.Empty;
    string cookiestr = string.Empty;
    DateTime now = DateTime.Now;
    // Built directly rather than parsed from text, so the start date does not depend on the current culture.
    DateTime begin = new DateTime(1980, 1, 1);

    for (DateTime t = begin; t <= now; t = t.AddDays(30))
    {
        string startDate = t.ToString("yyyy-MM-dd");
        string endDate = t.AddDays(30).ToString("yyyy-MM-dd");
        try
        {
            // A plain GET supplies the view state and validation code needed for the search postback.
            html = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            NameValueCollection nvc = BuildAwardSearchForm(html, 1, startDate, endDate);
            html = ToolWeb.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            return null;
        }

        // Recomputed for every window so a window without a pager never reuses
        // the page count of the previous window.
        int pageInt = CountAwardResultPages(html);

        for (int i = 1; i <= pageInt; i++)
        {
            if (i > 1)
            {
                // Each postback is built from the most recently downloaded page.
                NameValueCollection nvc = BuildAwardSearchForm(html, i, startDate, endDate);
                try
                {
                    html = ToolWeb.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                }
                catch
                {
                    continue;
                }
            }

            SaveAwardRows(html);
        }
    }
    return null;
}

// Builds the postback form for the given result page from the hidden fields of the current page HTML.
private static NameValueCollection BuildAwardSearchForm(string html, int pageNumber, string startDate, string endDate)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList codeInputs = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_ValidateCode1_txtRanNum")));
    string pageValiCode = string.Empty;
    if (codeInputs != null && codeInputs.Count > 0)
    {
        InputTag codeInput = codeInputs[0] as InputTag;
        if (codeInput != null)
        {
            pageValiCode = codeInput.GetAttribute("value");
        }
    }

    string viewState = ToolWeb.GetAspNetViewState(html);
    return ToolWeb.GetNameValueCollection(
        new string[]
        {
            "ctl00_ContentPlaceHolder1_toolkitScriptManager1_HiddenField",
            "__EVENTTARGET",
            "__EVENTARGUMENT",
            "__VIEWSTATE",
            "ctl00$ContentPlaceHolder1$txtEnt_name",
            "ctl00$ContentPlaceHolder1$txtAWARD_NAME",
            "ctl00$ContentPlaceHolder1$txtStartDate",
            "ctl00$ContentPlaceHolder1$txtEndDate",
            "ctl00$ContentPlaceHolder1$ValidateCode1$txtValidateCode",
            "ctl00$ContentPlaceHolder1$ValidateCode1$txtRanNum"
        },
        new string[]
        {
            "",
            "ctl00$ContentPlaceHolder1$AspNetPager2",
            pageNumber.ToString(),
            viewState,
            "",
            "",
            startDate,
            endDate,
            "",
            pageValiCode
        });
}

// Derives the number of result pages (15 records per page) from the pager text; falls back to 1.
private static int CountAwardResultPages(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList pagers = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_AspNetPager2")));
    if (pagers == null || pagers.Count == 0)
    {
        return 1;
    }

    try
    {
        // Presumably the total record count printed between "共" and "条" - confirm against GetRegexBegEnd.
        string temp = pagers[0].ToPlainTextString().GetRegexBegEnd("共", "条");
        int totalRecords = int.Parse(temp);
        int pages = totalRecords / 15;
        if (totalRecords % 15 != 0)
        {
            pages++;
        }
        return pages;
    }
    catch
    {
        return 1;
    }
}

// Saves one CorpMerit per data row of the result table (id="tab_ent") in the given page HTML.
private void SaveAwardRows(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tab_ent")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return;
    }
    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // Row 0 is skipped.
    for (int j = 1; j < table.RowCount; j++)
    {
        TableRow tr = table.Rows[j];
        if (tr.ColumnCount < 4)
        {
            // Cells 1..3 are read below; a shorter row would otherwise abort the whole crawl.
            continue;
        }

        string corpCode = string.Empty, meritYear = string.Empty, meritRegion = string.Empty,
               meritSector = string.Empty, meritPrjName = string.Empty, prjSupporter = string.Empty;
        string corpName = tr.Columns[2].ToNodePlainString();
        string meritName = tr.Columns[1].ToNodePlainString();
        string meritDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
        string detailUrl = "http://113.108.219.40/PlatForm/SearchCenter/" + tr.Columns[1].GetATagHref();

        string htlDtl;
        try
        {
            htlDtl = ToolWeb.GetHtmlByUrl(detailUrl, Encoding.UTF8);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htlDtl));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        if (dtlList == null || dtlList.Count == 0)
        {
            // Nothing is saved for a row whose detail page has no table.
            continue;
        }
        TableTag tab = dtlList[0] as TableTag;
        if (tab == null)
        {
            continue;
        }

        string ctx = FlattenAwardDetailTable(tab);
        string meritLevel = ctx.GetRegex("获奖等级");
        string remark = ctx.GetRegex("备注");
        string details = ctx.GetRegex("表彰内容描述");
        string source = "广东省住房和城乡建设厅";
        if (remark != null && (remark.Contains("无备注") || remark == "无"))
        {
            remark = null;
        }

        CorpMerit info = ToolDb.GenCorpMerit("广东省", "广东地区", "", corpCode, corpName, meritYear, meritName, meritDate, meritLevel, meritRegion, meritSector, meritPrjName, prjSupporter, source, detailUrl, remark, details);
        ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx);
    }
}

// Flattens a label/value table into "label:value\r\n" text: cells at even index are written as
// labels followed by ":", cells at odd index as values followed by a line break.
private static string FlattenAwardDetailTable(TableTag tab)
{
    StringBuilder ctx = new StringBuilder();
    for (int k = 0; k < tab.RowCount; k++)
    {
        TableRow detailRow = tab.Rows[k];
        for (int d = 0; d < detailRow.ColumnCount; d++)
        {
            if ((d + 1) % 2 == 0)
            {
                ctx.Append(detailRow.Columns[d].ToNodePlainString()).Append("\r\n");
            }
            else
            {
                // NOTE(review): the two colon replacements are identical as written; one was
                // presumably meant for a different colon character - confirm.
                ctx.Append(detailRow.Columns[d].ToNodePlainString().Replace(":", "").Replace(":", "")).Append(":");
            }
        }
    }
    return ctx.ToString();
}