/// <summary>
/// Stores one fixed notice (title, archived HTML, publisher and date are hard-coded) and,
/// when the notice was saved, its PDF attachment.
/// The plain-text body is taken from the page downloaded from <c>SiteUrl</c>.
/// </summary>
/// <param name="crawlAll">Not used by this crawler.</param>
/// <returns>Always <c>null</c>; the entities are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    string pageUrl = this.SiteUrl;

    // Best effort: if the download fails the plain-text body simply stays empty.
    string pageHtml = string.Empty;
    try
    {
        pageHtml = ToolHtml.GetHtmlByUrl(pageUrl, Encoding.UTF8).GetJsString();
    }
    catch
    {
    }

    string title = "关于转发深圳市住房和建设局转发《深圳市交通运输委港航和货运交通管理局关于我市泥头车运输企业土石方运输业务投标资质考评和异地泥头车备案托管第二阶段情况的通报》的通知";

    // Archived copy of the notice page; stored as the HTML content of the record.
    string archivedHtml = "<table width='960' background='{root_path}images/xil_jl_05.jpg' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td align='center' background='../../../images/xil_jl_03.jpg' valign='top' style='background-repeat: repeat-x;'><table width='100%' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td width='9%'> </td> <td width='83%' height='25'> </td> <td width='8%'> </td> </tr> <tr> <td> </td> <td valign='top'><table width='100%' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td width='8%' height='25' class='red12a'>题材分类:</td> <td width='42%'><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('catalog1=327')'>通知公告公示</a></td> <td width='8%' class='red12a'>主题分类:</td> <td width='42%'><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('catalog2=479')'>其他</a></td> </tr> <tr> <td height='25' class='red12a'>发文机构:</td> <td><span id='fbjgid' style='display: none;'><script>fbjg('深圳市南山区人民政府 ')</script><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('district=深圳市南山区人民政府')'>深圳市南山区人民政府</a></span></td><script>var wh = ''; wh = wh.replace(/ /ig,''); wh = wh.replace(/ /ig,''); if(wh==''||wh==null||'无'==wh){ document.getElementById('fbjgid').style.display='none'; }</script> <td class='red12a'>来源网站发布日期:</td> <td><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('urltime=2013.08.12')'>2013-08-12</a></td> </tr> <tr> <td height='25' class='red12a'>所属地区:</td> <td><script>ssdq('广东省深圳市 ')</script><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('vreserved3=广东省深圳市')'>广东省深圳市</a>;</td> <td class='red12a'>文 号:</td> <td><script type='text/javascript'> ycwh(); </script></td> </tr> <tr> <td height='25' class='red12a' valign='top' style='padding-top: 8px;'>关 键 词:</td> <td valign='top' style='line-height: 20px; padding-top: 3px;'><script>gjzsj('深圳市;泥头车;货运交通;交通运输;备案;港航;土石方运输;投标资质;考评;异地')</script><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=深圳市')'>深圳市</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=泥头车')'>泥头车</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=货运交通')'>货运交通</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=交通运输')'>交通运输</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=备案')'>备案</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=港航')'>港航</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=土石方运输')'>土石方运输</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=投标资质')'>投标资质</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=考评')'>考评</a>;<a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('keywords=异地')'>异地</a>;</td> <td class='red12a'>公文发布日期:</td> <td><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('urldate=')'></a></td> </tr> </tbody></table></td> <td> </td> </tr> </tbody></table></td> </tr> <tr> <td bgcolor='#ffffff'><img width='943' height='8' src='../../../images/xil_jl_06.jpg'></td> </tr> </tbody></table> <table width='960' bgcolor='#ffffff' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td align='center' valign='top'><table width='830' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td align='center' class='dbiaoti' style='padding: 15px 0px;'>关于转发深圳市住房和建设局转发《深圳市交通运输委港航和货运交通管理局关于我市泥头车运输企业土石方运输业务投标资质考评和异地泥头车备案托管第二阶段情况的通报》的通知</td> </tr> </tbody></table> <table width='830' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td><table width='100%' background='../../../images/erj_jl_122_28.jpg' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td width='12'><img width='12' height='34' src='../../../images/erj_jl_121_25.jpg'></td> <td><table width='100%' height='25' align='center' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td class='fff12'>来源:<script>lyjs('深圳市南山区人民政府')</script><a style='text-decoration: underline; cursor: pointer;' onclick='xlsj('sitename=深圳市南山区人民政府')'>深圳市南山区人民政府</a>;</td> <td width='80'><a onclick='checkUrl('http://www.szns.gov.cn/publish/main/1/19/tzgg/20130812110509651949516/index.html','关于转发深圳市住房和建设局转发《深圳市交通运输委港航和货运交通管理局关于我市泥头车运输企业土石方运输业务投标资质考评和异地泥头车备案托管第二阶段情况的通报》的通知','4032393');' href='#'>原文链接 >></a></td> <td width='80'><a href='/search/htmlflash4Radar?docid=4032393'>网页快照</a> >> </td> </tr> </tbody></table></td> <td width='8'><img width='8' height='34' src='../../../images/erj_jl_123_30.jpg'></td> </tr> </tbody></table></td> </tr> </tbody></table> <table width='830' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td class='zw_link' valign='top' style='padding: 20px 0px 0px;'> <br><br>各有关单位:<br> 现将《深圳市交通运输委港航和货运交通管理局关于我市泥头车运输企业土石方运输业务投标资质考评和异地泥头车备案托管第二阶段情况的通报》(深交港货[2013]164号)转发给你们,请遵照执行。目前,共有46家泥头车运输企业已获取我市土石方运输业务投标资质;共有82家异地企业204辆泥头车,分别与12家土石方运输业务投标资质企业达成了备案托管。<br> 特此通知。<br> 联系人:李衍航,电话:83788608。 <br> 附件:深交港货[2013]164号<br> 深圳市住房和建设局<br> 2013年8月9日<br> <br><br><br><br> <script type='text/javascript'> qufj(); </script><a href='./P020131018007991034107.pdf'> 附件:深交港货[2013]164号 </a><br> </td> </tr> </tbody></table> <table width='100%' border='0' cellspacing='0' cellpadding='0'> <tbody><tr> <td> </td> </tr> </tbody></table></td> </tr> </tbody></table>";

    // Plain-text body: the <td> carrying the known background image in the downloaded page.
    string plainText = string.Empty;
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList bodyCells = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new TagNameFilter("td"),
            new HasAttributeFilter("background", "../../../images/sd_in_09.jpg")));
    bool bodyFound = bodyCells != null && bodyCells.Count > 0;
    if (bodyFound)
    {
        plainText = bodyCells.AsHtml().Replace("<br>", "\r\n").ToCtxString().Replace(":\r\n", ":").Replace(">", "");
    }

    // The same authority is used as both the information source and the message type.
    string publisher = "深圳市住房和建设局";
    NotifyInfo info = ToolDb.GenNotifyInfo(title, "2013-08-09", publisher, publisher, pageUrl, archivedHtml, "广东省", "深圳市工程", string.Empty, plainText, "通知公告");

    // The attachment is only stored when the notice itself was saved.
    if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
    {
        return null;
    }

    BaseAttach attach = ToolHtml.GetBaseAttach("http://govinfo.nlc.gov.cn/gdsszfz/xxgk/szsnsqrmzf/201310/P020131018007991034107.pdf", "深交港货[2013]164号", info.Id);
    if (attach != null)
    {
        ToolDb.SaveEntity(attach, string.Empty);
    }

    return null;
}
/// <summary>
/// Walks the paged notice list at <c>SiteUrl</c>, downloads each notice's detail page from
/// www.szpark.com.cn and saves it, followed by the images and attachment links found in its content.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been processed.</param>
/// <returns>Always <c>null</c>; entities are persisted directly through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // pageInt: number of list pages; sqlCount: notices processed so far.
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // The pager link text contains "共 N 页"; fall back to a single page when it cannot be read.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "PageDataList__ctl7_LinkButton1")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString();
            pageInt = Convert.ToInt32(temp.GetRegexBegEnd("共", "页"));
        }
        catch
        {
            pageInt = 1;
        }
    }
    // Convert.ToInt32(null) yields 0; the first page is already downloaded, so always process it.
    if (pageInt < 1)
    {
        pageInt = 1;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Request the next page by posting the pager form with the view state of the previous page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "head1:username", "head1:Password", "head1:rbLoginType", "Tb_keyword", "ddlNewsType", "ddlistaddnewsdate" },
                    new string[] { "PageDataList$_ctl" + (i + 1).ToString() + "$LinkButton1", "", viewState, "", "", "unit", "", "20", "" });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", " tb_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Rows without the title (index 1) and date (index 2) cells cannot be notices; skip them
            // instead of letting an out-of-range index abort the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            string infoType = "通知公告";
            string infoScorce = string.Empty;
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string headName = tr.Columns[1].ToNodePlainString();
            string infoUrl = "http://www.szpark.com.cn" + tr.Columns[1].GetATagHref();

            // Best effort: a failed download leaves the page empty, no content node is found
            // below and the row is skipped.
            string htldtl = string.Empty;
            try
            {
                htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList noList = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsinfo")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            // Strip remaining tags, then collapse whitespace and runs of blank lines.
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenFJYLMsgType;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            sqlCount++;
            // NOTE(review): the limit is checked before saving, so the notice that reaches
            // MaxCount is not persisted - confirm this is intended.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Images embedded in the content (GIFs are ignored) are stored as attachments.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgList != null && imgList.Count > 0)
            {
                for (int m = 0; m < imgList.Count; m++)
                {
                    try
                    {
                        ImageTag img = imgList[m] as ImageTag;
                        string src = img.GetAttribute("src");
                        if (src.ToLower().Contains(".gif"))
                        {
                            continue;
                        }
                        BaseAttach obj = null;
                        if (src.Contains("http"))
                        {
                            obj = ToolHtml.GetBaseAttach(src, headName, info.Id);
                        }
                        else
                        {
                            obj = ToolHtml.GetBaseAttach("http://www.szpark.com.cn" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id);
                        }
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // One broken image must not prevent the remaining ones from being stored.
                    }
                }
            }

            // Anchors recognised as attachments by IsAtagAttach() are stored as well.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag aTag = aNode[a] as ATag;
                    if (!aTag.IsAtagAttach())
                    {
                        continue;
                    }
                    try
                    {
                        BaseAttach obj = null;
                        string href = aTag.GetATagHref();
                        if (href.Contains("http"))
                        {
                            obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id);
                        }
                        else
                        {
                            obj = ToolHtml.GetBaseAttach("http://www.szpark.com.cn" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id);
                        }
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort, same as for images.
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the paged bulletin table at <c>SiteUrl</c>, downloads each bulletin's detail page from
/// www.ynggzy.net and converts it into a <c>BidInfo</c>; attachment anchors found in the detail
/// page are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>BidInfo</c> records; empty (never null) when nothing could be read.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The toolbar text contains ".../N页"; keep the default of one page when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_toolbarTable")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }
    // The first page is already downloaded, so always process at least that one.
    if (pageInt < 1)
    {
        pageInt = 1;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Post the table's paging form; the totals are read from the previously loaded page.
            string bulletininfotable_totalpages = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalpages");
            string bulletininfotable_totalrows = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalrows");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ec_i", "bulletininfotable_efn", "bulletininfotable_crd", "bulletininfotable_p", "bulletininfotable_s_bulletintitle", "bulletininfotable_s_finishday", "hySort", "findAjaxZoneAtClient", "method", "bulletinclass", "bulletininfotable_totalpages", "bulletininfotable_totalrows", "bulletininfotable_pg", "bulletininfotable_rd" },
                new string[] { "bulletininfotable", "", "20", i.ToString(), "", "", "2", "false", "bulletinMore", "01", bulletininfotable_totalpages, bulletininfotable_totalrows, (i - 1).ToString(), "5" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        // "tbody" is rewritten to "table" so that the element with the list id is matched by the table filter.
        parser = new Parser(new Lexer(html.Replace("tbody", "table")));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_table_body")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Title is read from cell 0 and the date from cell 1; skip rows that lack them
            // instead of failing the whole crawl with an out-of-range index.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string area = string.Empty, endDate = string.Empty, otherType = string.Empty;
            string prjName = tr.Columns[0].ToNodePlainString();
            string beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.ynggzy.net/bulletin.do?method=showbulletin&bulletin_id=" + tr.GetAttribute("id");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = ToolHtml.GetHtmlByUrl(this.SiteUrl, InfoUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,<br />,<br/>,<br>", "\r\n").ToCtxString();
            string buildUnit = bidCtx.GetBuildRegex();
            string code = bidCtx.GetCodeRegex();
            string bidType = prjName.GetInviteBidType();
            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("成交人,成交供应商");
            }
            string bidMoney = bidCtx.GetMoneyRegex();
            string prjMgr = bidCtx.GetMgrRegex();

            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                // Second attempt: free-text phrases in the bulletin body.
                bidUnit = bidCtx.GetRegexBegEnd("确定中标供应商为", ",");
                if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                {
                    bidMoney = bidCtx.GetRegexBegEnd("投标报价为", "万元");
                }

                if (string.IsNullOrWhiteSpace(bidUnit))
                {
                    // Last attempt: flatten the first table into "label:value" lines and search those.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    TableTag tag = (bidNode != null && bidNode.Count > 0) ? bidNode[0] as TableTag : null;
                    if (tag != null)
                    {
                        string ctx = string.Empty;
                        for (int r = 0; r < tag.RowCount; r++)
                        {
                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                            {
                                string temp = tag.Rows[r].Columns[c].ToNodePlainString();
                                // Odd cells are treated as labels, even cells as their values.
                                if ((c + 1) % 2 == 0)
                                {
                                    ctx += temp + "\r\n";
                                }
                                else
                                {
                                    ctx += temp + ":";
                                }
                            }
                        }

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("入围供应商,成交人,单位名称");
                        }
                        if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = ctx.GetMoneyString().GetMoney();
                        }
                    }
                }
            }

            // The extraction helpers are checked for null/blank above, so guard before calling Contains.
            buildUnit = buildUnit ?? string.Empty;
            bidUnit = bidUnit ?? string.Empty;

            // Trim trailing noise (contact / address fragments) and cut after the first "公司".
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
            }
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }
            if (bidUnit.Contains("地址"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("地址"));
            }
            if (bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            bidUnit = bidUnit.GetReplace("第一,1");
            // Values containing these words are table headings, not company names.
            if (bidUnit.Contains("综合") || bidUnit.Contains("报价") || bidUnit.Contains("联系") || bidUnit.Contains("投标单位") || bidUnit.Contains("得分") || bidUnit.Contains("中标价"))
            {
                bidUnit = string.Empty;
            }

            // Amounts above 100000 are divided by 10000; unparsable amounts are left untouched.
            decimal money;
            if (decimal.TryParse(bidMoney, out money) && money > 100000)
            {
                bidMoney = (money / 10000).ToString();
            }

            string specType = "建设工程";
            string msgType = "云南省公共资源交易中心";
            BidInfo info = ToolDb.GenBidInfo("云南省", "云南省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Attachment anchors carry the file id and name as arguments inside the link text;
            // both are extracted and turned into a download URL.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string obj = a.Link.GetReplace("(", "(").GetRegexBegEnd("(", ",").GetReplace("(").GetReplace("'").Replace(",", "");
                        string name = a.Link.GetReplace(")", ")").GetRegexBegEnd(",", ")").GetReplace(")").GetReplace("'").Replace(",", "");
                        string link = "http://www.ynggzy.net/resource/bulletin.do?method=mdownloadFile&file_id=" + obj + "&file_name=" + name;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Walks the paged article list at <c>SiteUrl</c>, downloads each article's detail page from
/// www.szmea.net and saves it as a notice, followed by the images and attachment links found
/// in its content.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been processed.</param>
/// <returns>Always <c>null</c>; entities are persisted directly through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // pageInt: number of list pages; sqlCount: notices processed so far.
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    // The page selector holds one <option> per page.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(
                new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_cboPages")),
                true),
            new TagNameFilter("option")));
    if (pageList != null && pageList.Count > 0)
    {
        pageInt = pageList.Count;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Select the next page by posting the form with the view state of the previous page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTARGUMENT", "dnn:ctr467:ArticleList:cboPages", "ScrollTop", "__dnnVariable", "__VIEWSTATE" },
                    new string[] { "", (i - 1).ToString(), "", "", viewState });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_PanelA")),
                    true),
                new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Rows without the title (index 1) and date (index 2) cells cannot be articles; skip them
            // instead of letting an out-of-range index abort the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            string infoType = "通知公告";
            string infoScorce = string.Empty;
            // The list shows a two-digit year; "20" is prepended to the extracted date.
            string releaseTime = "20" + tr.Columns[2].ToPlainTextString().GetDateRegex("yy-MM-dd");
            string headName = tr.Columns[1].ToNodePlainString();
            string infoUrl = "http://www.szmea.net" + tr.Columns[1].GetATagHref();

            // Best effort: a failed download leaves the page empty, no content node is found
            // below and the row is skipped.
            string htldtl = string.Empty;
            try
            {
                htldtl = ToolHtml.GetHtmlByUrl(SiteUrl, infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList noList = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr391_ArticleShow_lblContent")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            // Strip remaining tags, then collapse whitespace and runs of blank lines.
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenJLGCMsgType;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            sqlCount++;
            // NOTE(review): the limit is checked before saving, so the notice that reaches
            // MaxCount is not persisted - confirm this is intended.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Images embedded in the content (GIFs are ignored) are stored as attachments.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgList != null && imgList.Count > 0)
            {
                for (int m = 0; m < imgList.Count; m++)
                {
                    try
                    {
                        ImageTag img = imgList[m] as ImageTag;
                        string src = img.GetAttribute("src");
                        if (src.ToLower().Contains(".gif"))
                        {
                            continue;
                        }
                        BaseAttach obj = null;
                        if (src.Contains("http"))
                        {
                            obj = ToolHtml.GetBaseAttach(src, headName, info.Id);
                        }
                        else
                        {
                            obj = ToolHtml.GetBaseAttach("http://www.szmea.net" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id);
                        }
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // One broken image must not prevent the remaining ones from being stored.
                    }
                }
            }

            // Anchors recognised as attachments by IsAtagAttach() are stored as well.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag aTag = aNode[a] as ATag;
                    if (!aTag.IsAtagAttach())
                    {
                        continue;
                    }
                    try
                    {
                        BaseAttach obj = null;
                        string href = aTag.GetATagHref();
                        if (href.Contains("http"))
                        {
                            obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id);
                        }
                        else
                        {
                            obj = ToolHtml.GetBaseAttach("http://www.szmea.net" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id);
                        }
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort, same as for images.
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the paged bulletin table at <c>SiteUrl</c>, downloads each bulletin's detail page from
/// www.ynggzy.net and converts it into an <c>InviteInfo</c>; attachment anchors found in the
/// detail page are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>InviteInfo</c> records; empty (never null) when nothing could be read.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The toolbar text contains ".../N页"; keep the default of one page when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_toolbarTable")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }
    // The first page is already downloaded, so always process at least that one.
    if (pageInt < 1)
    {
        pageInt = 1;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Post the table's paging form; the totals are read from the previously loaded page.
            string bulletininfotable_totalpages = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalpages");
            string bulletininfotable_totalrows = ToolHtml.GetHtmlInputValue(html, "bulletininfotable_totalrows");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ec_i", "bulletininfotable_efn", "bulletininfotable_crd", "bulletininfotable_p", "bulletininfotable_s_bulletintitle", "bulletininfotable_s_finishday", "hySort", "findAjaxZoneAtClient", "method", "bulletinclass", "bulletininfotable_totalpages", "bulletininfotable_totalrows", "bulletininfotable_pg", "bulletininfotable_rd" },
                new string[] { "bulletininfotable", "", "20", i.ToString(), "", "", "1", "false", "bulletinMore", "01", bulletininfotable_totalpages, bulletininfotable_totalrows, (i - 1).ToString(), "5" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        // "tbody" is rewritten to "table" so that the element with the list id is matched by the table filter.
        parser = new Parser(new Lexer(html.Replace("tbody", "table")));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletininfotable_table_body")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Title is read from cell 0 and the date from cell 1; skip rows that lack them
            // instead of failing the whole crawl with an out-of-range index.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty, city = string.Empty;
            string prjName = tr.Columns[0].ToNodePlainString();
            string beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.ynggzy.net/bulletin.do?method=showbulletin&bulletin_id=" + tr.GetAttribute("id");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = ToolHtml.GetHtmlByUrl(this.SiteUrl, InfoUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,<br />,<br/>,<br>", "\r\n").ToCtxString();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string inviteType = prjName.GetInviteBidType();
            string specType = "政府采购";
            string msgType = "云南省公共资源交易中心";

            InviteInfo info = ToolDb.GenInviteInfo("云南省", "云南省及地市", city, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Queue attachment anchors; links that do not contain "http" are resolved against the site root.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.ynggzy.net/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged list table (<c>gv_List</c>) at <c>SiteUrl</c>, downloads each entry's detail
/// page from www.ynzb.com.cn and converts it into an <c>InviteInfo</c>; attachment anchors found
/// in the detail table are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>InviteInfo</c> records; empty (never null) when nothing could be read.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // The page count is the text of the LblPageCount span; keep one page when it cannot be parsed.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "LblPageCount")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString();
            pageInt = int.Parse(temp);
        }
        catch
        {
        }
    }
    // The first page is already downloaded, so always process at least that one.
    if (pageInt < 1)
    {
        pageInt = 1;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Jump to page i by posting the "go" form with the state fields of the previous page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "TBuildInc", "TFindContractorName", "SArea", "SCCSort", "txtGO", "__EVENTVALIDATION" },
                new string[] { "lbtnGO", "", viewState, "", "", "0", "", i.ToString(), eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "gv_List")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Cells 1..6 are read below; rows with fewer cells are skipped instead of failing
            // the whole crawl with an out-of-range index.
            if (tr.ColumnCount < 7)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string buildUnit = tr.Columns[1].ToNodePlainString();
            ATag aTag = tr.Columns[2].GetATag();
            if (aTag == null)
            {
                // No link to a detail page in this row.
                continue;
            }
            string prjName = aTag.GetAttribute("title");
            string inviteType = tr.Columns[3].ToNodePlainString();
            string beginDate = tr.Columns[5].ToPlainTextString().GetDateRegex();
            string city = tr.Columns[6].ToNodePlainString();
            string InfoUrl = "http://www.ynzb.com.cn/" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = ToolHtml.GetHtmlByUrl(this.SiteUrl, InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "1")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            TableTag tag = dtlNode[0] as TableTag;
            if (tag == null)
            {
                continue;
            }

            // Flatten the detail table: the first row is the heading, every following row is
            // written as "label:value" pairs (odd cells are labels, even cells are values).
            string inviteCtx = string.Empty;
            for (int r = 0; r < tag.RowCount; r++)
            {
                TableRow detailRow = tag.Rows[r];
                if (r == 0)
                {
                    if (detailRow.ColumnCount > 0)
                    {
                        string heading = detailRow.Columns[0].ToNodePlainString();
                        inviteCtx += heading + "\r\n";
                        // The heading doubles as project name when the list link had no title.
                        if (string.IsNullOrWhiteSpace(prjName))
                        {
                            prjName = heading;
                        }
                    }
                    continue;
                }

                for (int c = 0; c < detailRow.ColumnCount; c++)
                {
                    string temp = detailRow.Columns[c].ToNodePlainString();
                    if ((c + 1) % 2 == 0)
                    {
                        inviteCtx += temp + "\r\n";
                    }
                    else
                    {
                        inviteCtx += temp + ":";
                    }
                }
            }

            string code = inviteCtx.GetCodeRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            // A list value containing ".." is replaced by the name found in the detail text;
            // if none is found the dots are simply removed.
            if (buildUnit.Contains(".."))
            {
                string temp = inviteCtx.GetBuildRegex();
                buildUnit = !string.IsNullOrEmpty(temp) ? temp : buildUnit.Replace(".", "");
            }
            string specType = "建设工程";
            string msgType = "云南省住房和城乡建设厅";

            InviteInfo info = ToolDb.GenInviteInfo("云南省", "云南省及地市", city, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Queue attachment anchors; links that do not contain "http" are resolved against the site root.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.ynzb.com.cn/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender-notice listing of www.bidding.csg.cn page by page, downloads the
/// detail page of every listed notice and turns it into an <c>InviteInfo</c> record.
/// Attachment links found in a notice are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first listing page there is nothing to crawl.
        return list;
    }

    // The page count is the text between "/" and "页" inside the pager block.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList noList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Top10 TxtCenter")));
    if (noList != null && noList.Count > 0)
    {
        string temp = noList.AsString().GetRegexBegEnd("/", "页");
        int parsedPages;
        // TryParse instead of Convert.ToInt32: the latter returns 0 for null, which
        // would silently disable the crawl; any unusable value falls back to one page.
        if (int.TryParse(temp, out parsedPages) && parsedPages > 0)
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.bidding.csg.cn/zbgg/index_" + i.ToString() + ".jhtml", Encoding.UTF8);
            }
            catch
            {
                // A listing page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "W750 Right")), true),
                new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // The first <li> (index 0) is deliberately not processed, as in the original code.
        for (int j = 1; j < nodeList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                // An entry without a link has no detail page; skip it instead of crashing.
                continue;
            }

            prjName = aTag.LinkText;
            beginDate = nodeList[j].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.bidding.csg.cn" + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = ToolHtml.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Center W1000")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml();

            // Prefer the title shown on the detail page over the link text of the listing.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nameNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("class", "TxtCenter Padding10")));
            if (nameNode != null && nameNode.Count > 0)
            {
                prjName = nameNode[0].ToNodePlainString();
            }

            inviteCtx = HtmlTxt.ToCtxString();
            inviteType = ToolHtml.GetInviteTypes(prjName);
            prjAddress = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex), 150);
            buildUnit = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex), 150);
            code = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex), 50);

            // Placeholder values for fields that could not be extracted from the text.
            if (string.IsNullOrEmpty(code))
            {
                code = "见招标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见招标信息";
            }
            specType = "其他";
            msgType = "中国南方电网有限责任公司招标服务中心";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "中国南方电网有限责任公司招标服务中心";
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit,
                beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Collect attachment links from the notice body.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeAtag != null)
            {
                for (int c = 0; c < nodeAtag.Count; c++)
                {
                    ATag a = nodeAtag[c] as ATag;
                    if (a == null || string.IsNullOrEmpty(a.Link) || !a.Link.IsAtagAttach())
                    {
                        continue;
                    }

                    // Absolute links are used as they are; only relative ones get the site prefix.
                    string alink = a.Link.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                        ? a.Link
                        : "http://www.bidding.csg.cn/" + a.Link;
                    try
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                    catch
                    {
                        // Best effort: one bad attachment must not lose the notice itself.
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-notice table (<c>tb_list</c>) of zb.cmc.com.cn page by page,
/// downloads each notice's detail page and turns it into an invite record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded
/// or when it does not contain the pager block.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "float:left;")));
    if (tdNodes == null || tdNodes.Count == 0)
    {
        // Unchanged behaviour: without the pager block nothing is crawled.
        return list;
    }

    // Read the page count from the "共N页" text of the pager; default to a single page.
    int pageInt = 1;
    string pageTemp = (tdNodes[0].ToPlainTextString() ?? string.Empty).Replace(" ", "").Trim();
    Match pageMatch = Regex.Match(pageTemp, @"共\d+页");
    int parsedPages;
    if (pageMatch.Success
        && int.TryParse(pageMatch.Value.Replace("共", "").Replace("页", "").Trim(), out parsedPages)
        && parsedPages > 0)
    {
        pageInt = parsedPages;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tb_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        // Row 0 is skipped, as in the original loop.
        for (int j = 1; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 6)
            {
                // Columns 1, 4 and 5 are read below; a shorter row would throw and abort the crawl.
                continue;
            }

            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                // No link means no detail page to crawl.
                continue;
            }

            beginDate = tr.Columns[4].ToPlainTextString().Trim();
            endDate = tr.Columns[5].ToPlainTextString().Trim();
            prjName = tr.Columns[1].ToPlainTextString().Trim().Replace(" ", "");
            InfoUrl = "http://zb.cmc.com.cn/TenderNotice/" + aTag.Link;

            string htmldetail;
            try
            {
                // Download the detail page once and derive both views from it
                // (the original requested the same URL twice).
                string cleaned = ToolHtml.GetHtmlByUrl(SiteUrl, InfoUrl).Replace(" ", "");

                // View 1: the raw HTML of the content block, stored with the record.
                Parser htmlParser = new Parser(new Lexer(cleaned.Trim()));
                HtmlTxt = htmlParser.ExtractAllNodesThatMatch(
                    new AndFilter(new HasAttributeFilter("class", "ct"), new TagNameFilter("div"))).AsHtml();

                // View 2: line breaks turned into CRLF and scripts / xml prologs removed,
                // used for plain-text extraction.
                htmldetail = cleaned.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                htmldetail = Regex.Replace(htmldetail, @"<script[^<]*</script>|<\?xml[^/]*/>", "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new HasAttributeFilter("class", "ct"), new TagNameFilter("div")));
            inviteCtx = dtnode.AsString().Trim().Replace(" ", "");
            // Collapse line-break / tab runs and drop the "viewed N times / close" footer.
            inviteCtx = Regex.Replace(inviteCtx, @"([\r\n]+)|([\t]+)|(\[该信息共被浏览了[0-9]+次\]\[关闭\])", "\r\n");

            code = Regex.Match(inviteCtx, @"编号(:|:)[^\r\n]+\r\n").Value
                .Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                // Values longer than 50 bytes are discarded, as before.
                code = "";
            }

            specType = "其他";
            msgType = "中国机械进出口(集团)有限公司";
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit,
                beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the notice listing of www.jianzhuxh.com page by page, saves every notice
/// through <c>ToolDb.SaveEntity</c> and, for newly saved notices, also saves the
/// images and attachment links found in the notice body.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops once <c>MaxCount</c> notices have been handed to the database.
/// </param>
/// <returns>Always <c>null</c>; results are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    // The page count is the text between "/" and "页" in the pager span.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "f12")));
    if (pageList != null && pageList.Count > 0)
    {
        int parsedPages;
        string pageText = pageList.AsString();
        if (pageText != null
            && int.TryParse(pageText.GetRegexBegEnd("/", "页"), out parsedPages)
            && parsedPages > 0)
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true),
                new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        // The last row is never visited and only every other row (index 0, 2, 4, ...) is processed.
        for (int j = 0; j < table.RowCount - 1; j++)
        {
            if ((j + 1) % 2 == 0)
            {
                continue;
            }

            string infoScorce = string.Empty;
            string infoType = "通知公告";

            TableRow tr = table.Rows[j];
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string headName = tr.Columns[1].ToNodePlainString();
            // The relative detail address is the first quoted value of the link's onclick handler.
            string infoUrl = "http://www.jianzhuxh.com/news/" + tr.Columns[1].GetATagValue("onclick").GetRegexBegEnd("'", "'");

            string htldtl;
            try
            {
                htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList noList = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text18")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList[0].ToHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            // Strip any remaining tags, then squeeze whitespace and repeated blank lines.
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenJZYMsgType;

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml,
                "广东省", "深圳市工程", string.Empty, infoCtx, infoType);

            // Check the limit BEFORE counting this record. The previous code incremented
            // first and returned before saving, so only MaxCount - 1 records were ever
            // stored (none at all for MaxCount == 1).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                // Attachments are only collected when SaveEntity reports success.
                continue;
            }

            // Images embedded in the notice body (gif files are ignored).
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgList != null)
            {
                for (int m = 0; m < imgList.Count; m++)
                {
                    try
                    {
                        ImageTag img = imgList[m] as ImageTag;
                        string src = img.GetAttribute("src");
                        if (src.ToLower().Contains(".gif"))
                        {
                            continue;
                        }

                        string imgUrl = src.Contains("http")
                            ? src
                            : "http://www.jianzhuxh.com" + src.Replace("../", "/").Replace("./", "/");
                        BaseAttach obj = ToolHtml.GetBaseAttach(imgUrl, headName, info.Id);
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort: a broken image must not stop the crawl.
                    }
                }
            }

            // Downloadable files linked from the notice body.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag aTag = aNode[a] as ATag;
                    if (aTag == null || !aTag.IsAtagAttach())
                    {
                        continue;
                    }

                    try
                    {
                        string href = aTag.GetATagHref();
                        string fileUrl = href.Contains("http")
                            ? href
                            : "http://www.jianzhuxh.com" + href.Replace("../", "/").Replace("./", "/");
                        BaseAttach obj = ToolHtml.GetBaseAttach(fileUrl, aTag.LinkText, info.Id);
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort: a broken attachment must not stop the crawl.
                    }
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the policy/regulation grid of www.dgzb.com.cn. Pages after the first are
/// requested by posting the ASP.NET view state together with the target page number.
/// Every notice is saved through <c>ToolDb.SaveEntity</c>; for notices that were saved,
/// the files listed in the page's download grid are saved as attachments.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops once <c>MaxCount</c> notices have been handed to the database.
/// </param>
/// <returns>Always <c>null</c>; results are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string detailRoot = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/";

    int totalPages = 1;
    int savedCount = 0;
    string pageHtml = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;

    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Page count: the text between "共" and "页" in the paging description label.
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new TagNameFilter("span"),
            new HasAttributeFilter("id", "ctl00_cph_context_GridViewPaingTwo1_lblGridViewPagingDesc")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes.AsString();
            totalPages = Convert.ToInt32(pagerText.GetRegexBegEnd("共", "页"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                // Post back the hidden fields of the page we currently hold plus the "go to page" inputs.
                viewState = this.ToolWebSite.GetAspNetViewState(pageHtml);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(pageHtml);
                string[] fieldNames = new string[]
                {
                    "__VIEWSTATE",
                    "__EVENTVALIDATION",
                    "ctl00$cph_context$GridViewPaingTwo1$txtGridViewPagingForwardTo",
                    "ctl00$cph_context$GridViewPaingTwo1$btnForwardToPage"
                };
                string[] fieldValues = new string[] { viewState, eventValidation, page.ToString(), "GO" };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
        if (gridNodes == null || gridNodes.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        // Row 0 is not processed.
        for (int row = 1; row < grid.RowCount; row++)
        {
            string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                   infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

            TableRow gridRow = grid.Rows[row];
            headName = gridRow.Columns[1].ToNodePlainString();
            releaseTime = gridRow.Columns[2].ToNodePlainString();
            infoType = "政策法规";
            infoUrl = detailRoot + gridRow.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // A failed download leaves detailHtml empty; the content lookup below then finds nothing.
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_span_MetContent")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            ctxHtml = contentNodes.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");

            string plain = contentNodes.AsString();
            plain = plain.Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            // Remove leftover tags, then squeeze whitespace and repeated blank lines.
            plain = Regex.Replace(plain, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            infoCtx = plain.Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            msgType = MsgTypeCosnt.DongGuanMsgType;
            infoScorce = infoScorce.Replace(" ", "");
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml,
                "广东省", "东莞市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && savedCount >= this.MaxCount)
            {
                return null;
            }

            savedCount++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments are listed in a separate download grid on the detail page.
            parser = new Parser(new Lexer(detailHtml));
            NodeList fileGrids = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_DownLoadFiles1_GridView1")));
            if (fileGrids == null || fileGrids.Count <= 0)
            {
                continue;
            }

            TableTag fileGrid = fileGrids[0] as TableTag;
            for (int f = 1; f < fileGrid.RowCount; f++)
            {
                TableRow fileRow = fileGrid.Rows[f];
                try
                {
                    // Fall back to the notice title when the file has no display name.
                    string attName = fileRow.Columns[1].ToNodePlainString();
                    if (string.IsNullOrEmpty(attName))
                    {
                        attName = headName;
                    }

                    BaseAttach baseInfo = ToolHtml.GetBaseAttachByUrl(detailRoot + fileRow.Columns[1].GetATagHref(), attName, info.Id);
                    if (baseInfo != null)
                    {
                        ToolDb.SaveEntity(baseInfo, string.Empty);
                    }
                }
                catch
                {
                    // Best effort: a broken attachment is ignored.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the award-publicity table (<c>tb_list</c>) of zb.cmc.com.cn page by page,
/// downloads each detail page and extracts winner, project number and award amount
/// into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded
/// or when it does not contain the pager block.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "float:left;")));
    if (tdNodes == null || tdNodes.Count == 0)
    {
        // Unchanged behaviour: without the pager block nothing is crawled.
        return list;
    }

    // Read the page count from the "共N页" text of the pager; default to a single page.
    int pageInt = 1;
    string pageTemp = (tdNodes[0].ToPlainTextString() ?? string.Empty).Replace(" ", "").Trim();
    Match pageMatch = Regex.Match(pageTemp, @"共\d+页");
    int parsedPages;
    if (pageMatch.Success
        && int.TryParse(pageMatch.Value.Replace("共", "").Replace("页", "").Trim(), out parsedPages)
        && parsedPages > 0)
    {
        pageInt = parsedPages;
    }

    // Matches the first number with at least two digits (optionally with a decimal point).
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "?page=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tb_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        // Row 0 is skipped, as in the original loop.
        for (int j = 1; j < table.RowCount; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Column 1 holds the title link; a shorter row would throw and abort the crawl.
                continue;
            }

            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            prjName = tr.Columns[1].ToPlainTextString().Trim().Replace(" ", "");
            InfoUrl = "http://zb.cmc.com.cn/TenderPublicity/" + aTag.Link;

            string htmldetail;
            try
            {
                // Download the detail page once and derive both views from it
                // (the original requested the same URL twice).
                string cleaned = ToolHtml.GetHtmlByUrl(SiteUrl, InfoUrl).Replace(" ", "");

                // View 1: the raw HTML of the content block, stored with the record.
                Parser htmlParser = new Parser(new Lexer(cleaned.Trim()));
                HtmlTxt = htmlParser.ExtractAllNodesThatMatch(
                    new AndFilter(new HasAttributeFilter("class", "ct"), new TagNameFilter("div"))).AsHtml();

                // View 2: line breaks turned into CRLF and scripts / xml prologs removed.
                htmldetail = cleaned.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                htmldetail = Regex.Replace(htmldetail, @"<script[^<]*</script>|<\?xml[^/]*/>", "");
            }
            catch (Exception)
            {
                continue;
            }

            // Publication date: first yyyy-M-d date inside the "tm" block, otherwise the current time.
            parser = new Parser(new Lexer(htmldetail));
            NodeList ldata = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tm")));
            if (ldata != null && ldata.Count > 0)
            {
                beginDate = Regex.Match(ldata.AsString(), @"\d{4}-\d{1,2}-\d{1,2}").Value;
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = DateTime.Now.ToString();
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new HasAttributeFilter("class", "ct"), new TagNameFilter("div")));
            bidCtx = dtnode.AsString().Trim().Replace(" ", "");
            // Collapse line-break / tab runs and drop the "viewed N times / close" footer.
            // The trailing CRLF guarantees that the line-based patterns below can match the last line.
            bidCtx = Regex.Replace(bidCtx, @"([\r\n]+)|([\t]+)|(\[该信息共被浏览了[0-9]+次\]\[关闭\])", "\r\n") + "\r\n";

            bidUnit = Regex.Match(bidCtx, @"(成交供应商|中标人名称|中标单位|中标人)(:|:)[^\r\n]+\r\n").Value
                .Replace("成交供应商", "").Replace("中标单位", "").Replace("中标人名称", "").Replace("成交", "").Replace("中标人", "")
                .Replace(":", "").Replace(":", "").Trim();

            code = Regex.Match(bidCtx, @"编号(:|:)[^\r\n]+\r\n").Value
                .Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                // Values longer than 50 bytes are discarded, as before.
                code = "";
            }

            // Award amount: try "金额:" first, then "小写为:", then a line introduced by a currency sign.
            string monerystr = Regex.Match(bidCtx, @"金额(:|:)[^\r\n]+\r\n").Value
                .Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
            if (string.IsNullOrEmpty(regBidMoney.Match(monerystr).Value))
            {
                monerystr = Regex.Match(bidCtx, @"小写为(:|:)[^\r\n]+\r\n").Value
                    .Replace("小写为", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
            }
            if (string.IsNullOrEmpty(regBidMoney.Match(monerystr).Value))
            {
                // "$" must be escaped: unescaped it is the end-of-input anchor, so the
                // original pattern could never match an amount written with a dollar sign.
                monerystr = Regex.Match(bidCtx, @"(¥|\$)[^\r\n]+\r\n").Value
                    .Replace("¥", "").Replace("$", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
            }

            string amountText = regBidMoney.Match(monerystr).Value;
            if (!string.IsNullOrEmpty(amountText))
            {
                bool alreadyInTenThousands =
                    (monerystr.Contains("万元") || monerystr.Contains("万美元")) && !monerystr.Contains("万元整");
                if (alreadyInTenThousands)
                {
                    bidMoney = amountText;
                }
                else
                {
                    // Convert to units of ten thousand; amounts below 0.1 are stored as "0".
                    try
                    {
                        bidMoney = (decimal.Parse(amountText) / 10000).ToString();
                        if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                        {
                            bidMoney = "0";
                        }
                    }
                    catch (Exception)
                    {
                        bidMoney = "0";
                    }
                }
            }

            specType = "其他";
            msgType = "中国机械进出口(集团)有限公司";
            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit,
                beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the enterprise-achievement listing (<c>table1</c>) of www.gzzb.gd.cn and
/// builds one notice record per table row from the row's own columns.
/// The detail page of each row is still downloaded; a row whose detail page cannot
/// be downloaded is skipped.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.gzzb.gd.cn";
    const string pagedListUrl = "http://www.gzzb.gd.cn/cms/wz/view/tzygg/enterpriseAchievementServlet?name=&number=&projectName=&projectNumber=&siteId=1&channelId=19&pager.offset=";

    IList records = new ArrayList();
    string listHtml = string.Empty;
    int totalPages = 1;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return records;
    }

    // The page count is read from the text of the second-to-last link in the pager block.
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tzgg_right_page")), true),
            new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            ATag lastPageLink = pagerLinks[pagerLinks.Count - 2] as ATag;
            string linkText = lastPageLink.LinkText;
            totalPages = Convert.ToInt32(linkText.Replace("goPage(", "").Replace(")", ""));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                // NOTE(review): the offset sent for page N is N*10 (page 2 -> 20). If the site
                // expects (N-1)*10 this skips a page — confirm against the live pager.
                string pageUrl = pagedListUrl + page.ToString() + "0";
                listHtml = this.ToolWebSite.GetHtmlByUrl(pageUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "table1")));
        if (tables == null || tables.Count <= 0)
        {
            continue;
        }

        TableTag listing = tables[0] as TableTag;
        // Row 0 is not processed.
        for (int rowIndex = 1; rowIndex < listing.RowCount; rowIndex++)
        {
            TableRow row = listing.Rows[rowIndex];

            string prjCode = row.Columns[1].ToNodePlainString();
            string title = row.Columns[2].ToNodePlainString();
            string buildUnit = row.Columns[4].ToNodePlainString();
            string publishTime = row.Columns[5].ToPlainTextString();
            string infoType = "业绩公示";
            string infoUrl = siteRoot + row.Columns[2].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default);
                detailHtml = detailHtml.GetJsString();
            }
            catch
            {
                continue;
            }

            // The detail block is still extracted although its content is currently not used.
            parser = new Parser(new Lexer(detailHtml));
            NodeList detailBlocks = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "block-body")));

            // The record text is assembled from the listing row itself.
            string infoCtx = string.Join("\r\n", new string[]
            {
                "项目编号:" + prjCode,
                "项目名称:" + title,
                "单位编号:" + row.Columns[3].ToNodePlainString(),
                "单位名称:" + buildUnit,
                "审核时间:" + publishTime
            });
            string htmlTxt = infoCtx;

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "广州市区", string.Empty, string.Empty, title, infoType, infoCtx,
                publishTime, string.Empty, MsgTypeCosnt.GuangZhouMsgType, infoUrl, prjCode, buildUnit,
                string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
            records.Add(info);

            if (!crawlAll && records.Count >= this.MaxCount)
            {
                return records;
            }
        }
    }

    return records;
}
/// <summary>
/// Crawls the award-publicity listing of www.nmgztb.com, downloads each detail page
/// and extracts tenderee, winner and award amount into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception ex)
    {
        Logger.Error(ex);
        return list;
    }

    // The total page count is published in the "totalpage" span; default to one page.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "totalpage")));
    if (pageNode != null && pageNode.Count > 0)
    {
        int parsedPages;
        if (int.TryParse(pageNode[0].ToNodePlainString(), out parsedPages) && parsedPages > 0)
        {
            pageInt = parsedPages;
        }
    }

    // Label that must be present in the text, paired with the pattern that extracts the winner.
    // Later rules override earlier ones when they match. The patterns are regular (non-verbatim)
    // strings on purpose: \r and \n are real characters inside the character classes.
    string[][] bidUnitRules = new string[][]
    {
        new string[] { "预中标人:", "(预中标人):[^\r\n]+[\r\n]{1}" },
        new string[] { "第一中标候选人:", "(第一中标候选人):[^\r\n]+[\r\n]{1}" },
        new string[] { "中标候选人公示", "(第一名):[^\r\n]+[\r\n]{1}" }
    };
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

    // Pages are visited from the highest index down to 1; the HTML already downloaded
    // from SiteUrl is used for the first iteration.
    for (int i = pageInt; i >= 1; i--)
    {
        if (i < pageInt)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.nmgztb.com/Html/gongchengxinxi/zhongbiaogongshi/index_" + i + ".htm", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList sNodes = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
        if (sNodes == null || sNodes.Count == 0)
        {
            continue;
        }

        TableTag table = sNodes[0] as TableTag;
        for (int t = 0; t < table.RowCount; t++)
        {
            if (table.Rows[t].ColumnCount < 2)
            {
                continue;
            }

            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, bidType = string.Empty, InfoUrl = string.Empty,
                   HtmlTxt = string.Empty;

            TableRow tr = table.Rows[t] as TableRow;
            NodeList nodeList = tr.SearchFor(typeof(ATag), true);
            if (nodeList == null || nodeList.Count == 0)
            {
                continue;
            }
            ATag aTag = nodeList[0] as ATag;
            if (aTag == null)
            {
                continue;
            }

            InfoUrl = "http://www.nmgztb.com" + aTag.Link;
            prjName = aTag.GetAttribute("title");

            string htmldtl;
            try
            {
                // The detail page is processed in lower case (the class filters below are lower case).
                htmldtl = ToolHtml.GetHtmlByUrl(InfoUrl, Encoding.Default).ToLower();
            }
            catch (Exception)
            {
                continue;
            }
            htmldtl = Regex.Replace(htmldtl, @"<script[^<]*</script>|<\?xml[^/]*/>", "");

            // Publication date from the "更新时间:yyyy年M月d日" stamp of the header block.
            Parser parserdtl = new Parser(new Lexer(htmldtl));
            NodeList nodesDtl = parserdtl.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "link_con_con")));
            if (nodesDtl != null && nodesDtl.Count > 0)
            {
                Match stamp = Regex.Match(nodesDtl.AsString(), @"更新时间:\d{4}年\d{1,2}月\d{1,2}日");
                if (stamp.Success)
                {
                    beginDate = stamp.Value.Replace("更新时间:", "").Replace("年", "-").Replace("月", "-").Replace("日", "").Trim();
                }
            }

            parserdtl.Reset();
            nodesDtl = parserdtl.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "oo")));
            HtmlTxt = nodesDtl.AsHtml();

            string str = nodesDtl.AsString().Replace(" ", "").Replace(" ", "");

            // Remove the "作者:... 更新时间:..." header from the text. string.Replace throws
            // ArgumentException for an empty search string, so it is only called when the
            // header was actually found (the original crashed the crawl on pages without it).
            Match header = Regex.Match(str, @"作者:[^更新时间]+更新时间:\d{4}年\d{1,2}月\d{1,2}日");
            if (header.Success && header.Value.Length > 0)
            {
                str = str.Replace(header.Value, "");
            }

            // Everything from the "previous article" link onwards is navigation, not content.
            int navIndex = str.IndexOf("上一篇:");
            string ctxText = navIndex > -1 ? str.Substring(0, navIndex) : str;

            if (ctxText.Contains("招标人:") || ctxText.Contains("招标单位:") || ctxText.Contains("招标采购单位:"))
            {
                Match owner = Regex.Match(ctxText, "(招标人|招标单位|招标采购单位):[^\r\n]+[\r\n]{1}");
                buildUnit = owner.Value.Substring(owner.Value.IndexOf(":") + 1).Trim();
                buildUnit = buildUnit.Replace("“", "").Replace("”", "");
            }

            foreach (string[] rule in bidUnitRules)
            {
                if (!ctxText.Contains(rule[0]))
                {
                    continue;
                }
                Match winner = Regex.Match(ctxText, rule[1]);
                if (winner.Success)
                {
                    bidUnit = winner.Value.Substring(winner.Value.IndexOf(":") + 1).Trim();
                }
            }

            string monerystr = Regex.Match(ctxText, @"(中标价|价格|金额)(:|:)[^\r\n]+[\r\n]{1}").Value
                .Replace("中标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Trim();
            string amountText = regBidMoney.Match(monerystr).Value;
            if (!string.IsNullOrEmpty(amountText))
            {
                if (monerystr.Contains("万元") || monerystr.Contains("万美元"))
                {
                    bidMoney = amountText;
                }
                else
                {
                    // Convert to units of ten thousand; amounts below 0.1 are stored as "0".
                    try
                    {
                        bidMoney = (decimal.Parse(amountText) / 10000).ToString();
                        if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                        {
                            bidMoney = "0";
                        }
                    }
                    catch (Exception)
                    {
                        bidMoney = "0";
                    }
                }
            }

            if (string.IsNullOrEmpty(bidUnit))
            {
                // Fallback: read the first result table as "header:value" lines
                // (row 0 = headers, row 1 = values) and run the generic extractors on it.
                string nodeCon = string.Empty;
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList nodeCtx = tableParser.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "msonormaltable")));
                if (nodeCtx != null && nodeCtx.Count > 0)
                {
                    TableTag tabCtx = nodeCtx[0] as TableTag;
                    if (tabCtx != null && tabCtx.RowCount > 1)
                    {
                        // Stop at the shorter of the two rows so a ragged table cannot throw.
                        int pairCount = Math.Min(tabCtx.Rows[0].ColumnCount, tabCtx.Rows[1].ColumnCount);
                        for (int k = 0; k < pairCount; k++)
                        {
                            nodeCon += tabCtx.Rows[0].Columns[k].ToNodePlainString();
                            nodeCon += ":" + tabCtx.Rows[1].Columns[k].ToNodePlainString() + "\r\n";
                        }
                    }
                }

                bidUnit = nodeCon.GetBidRegex().Replace("第一名", "");
                bidMoney = nodeCon.GetMoneyRegex();
                if (bidMoney == "0")
                {
                    bidMoney = nodeCon.GetRegex("投标报价(元)").GetMoney();
                }
            }

            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "";
            }
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = "";
            }
            // Values that exceed the byte limits (150 / 150 / 50) are discarded, as before.
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = string.Empty;
            }
            if (Encoding.Default.GetByteCount(bidUnit) > 150)
            {
                bidUnit = string.Empty;
            }
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = string.Empty;
            }

            prjName = ToolDb.GetPrjName(prjName);
            bidType = ToolHtml.GetInviteTypes(prjName);
            try
            {
                BidInfo info = ToolDb.GenBidInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, buildUnit,
                    beginDate, bidUnit, beginDate, string.Empty, ctxText, string.Empty, "内蒙古自治区建设工程招标投标服务中心",
                    bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);
                list.Add(info);
            }
            catch
            {
                // Record creation failed; log the project name and carry on with the next row.
                Logger.Error(prjName);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated notice list at <c>SiteUrl</c>, downloads the detail page of every
/// list entry and saves one <c>NotifyInfo</c> per entry whose detail page contains a
/// <c>div</c> with id "content".
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> notices have been handed to
/// <c>ToolDb.SaveEntity</c>; when <c>true</c>, every page is processed.
/// </param>
/// <returns>
/// Always <c>null</c>: notices are persisted through <c>ToolDb.SaveEntity</c> instead of being
/// returned to the caller.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;

    // Without the first list page there is nothing to crawl.
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Determine the page count from the pager text ("共 N 页"); fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "divPage")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList.AsString();
            pageInt = Convert.ToInt32(temp.GetRegexBegEnd("共", "页"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above; a later page that fails to download is skipped.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&pageindex=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        // Every <li> below <div id="list"> is one notice entry.
        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "list")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string infoType = "通知公告";
            string infoScorce = string.Empty;
            string releaseTime = nodeList[j].ToPlainTextString().GetDateRegex();
            string headName = nodeList[j].GetATag().LinkText;
            string infoUrl = "http://www.szzszx.com.cn" + nodeList[j].GetATagHref();

            // Best effort: if the detail page cannot be fetched, htldtl stays empty, the
            // content lookup below finds nothing and the entry is skipped.
            string htldtl = string.Empty;
            try
            {
                htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            // NOTE(review): "<br/>" becomes a line break while "<BR/>" is dropped entirely;
            // kept as in the original - confirm whether the asymmetry is intended.
            string ctxHtml = noList[0].ToHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");

            // Plain-text version of the body: strip spaces, tabs, leftover tags and
            // collapse runs of blank lines.
            string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string msgType = MsgTypeCosnt.ShenZhenZSWMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);

            // Save before checking the cap so the notice that reaches MaxCount is not
            // dropped (checking first would persist only MaxCount - 1 notices).
            // Attachments (images / linked files inside ctxHtml) are currently not downloaded.
            ToolDb.SaveEntity(info, this.ExistCompareFields, ExistsUpdate);
            sqlCount++;
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }

    return null;
}