/// <summary>
/// Walks the paged <c>GridView1</c> listing at <c>SiteUrl</c>, downloads the detail page of every data row and
/// collects a <c>NoticeInfo</c> for each detail page that contains a <c>table</c> node with <c>width="750"</c>.
/// Attachment links found inside that node are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the method returns as soon as <c>MaxCount</c> notices have been collected.</param>
/// <returns>The notices gathered so far; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList notices = new ArrayList();
    string listingHtml;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return notices;
    }

    // The page count is read from the last character of the last cell of the first <tr valign="top">.
    int pageCount = 1;
    Parser parser = new Parser(new Lexer(listingHtml));
    NodeList pagerRows = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("valign", "top")));
    if (pagerRows != null && pagerRows.Count > 0)
    {
        try
        {
            TableRow pagerRow = pagerRows[0] as TableRow;
            string lastCellText = pagerRow.Columns[pagerRow.ColumnCount - 1].ToNodePlainString();
            string lastChar = lastCellText.Substring(lastCellText.Length - 1, 1);
            pageCount = int.Parse(lastChar.Replace("(", ""));
        }
        catch
        {
            pageCount = 1;
        }
    }

    for (int page = 1; page <= pageCount; page++)
    {
        if (page > 1)
        {
            // Later pages are requested by posting the grid's paging event back to the same URL,
            // using the hidden fields of the most recently downloaded page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(listingHtml);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listingHtml);
                string[] fieldNames = new string[]
                {
                    "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION",
                    "sel", "beginDate", "endDate", "infotitle"
                };
                string[] fieldValues = new string[]
                {
                    "GridView1", "Page$" + page.ToString(), viewState, "", eventValidation,
                    "1", "", "", ""
                };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listingHtml));
        NodeList grids = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (grids == null || grids.Count == 0)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // The first and the last row of the grid are not visited.
        for (int rowIndex = 1; rowIndex < grid.RowCount - 1; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string infoCtx = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, htmlTxt = string.Empty;
            string infoType = "公告公示";
            string infoTitle = row.Columns[1].ToNodePlainString();
            string publishTime = row.Columns[3].ToPlainTextString();
            string infoUrl = "http://www.szjsjy.com.cn/Notify/" + row.Columns[1].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
                detailHtml = detailHtml.GetJsString();
            }
            catch
            {
                continue;
            }

            // The entity is created first (its Id is needed for attachments); content is filled in below.
            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳市工程", string.Empty, string.Empty, infoTitle, infoType, infoCtx, publishTime, string.Empty, MsgTypeCosnt.ShenZhenMsgType, infoUrl, prjCode, buildUnit, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "750")));
            if (contentTables == null || contentTables.Count == 0)
            {
                continue;
            }

            string contentHtml = contentTables.ToHtml();
            infoCtx = contentHtml.Replace("</tr>", "\r\n").ToCtxString();
            infoCtx = infoCtx.Replace("\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\t", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            int anchorCount = anchors == null ? 0 : anchors.Count;
            for (int k = 0; k < anchorCount; k++)
            {
                ATag anchor = anchors[k] as ATag;
                if (!anchor.IsAtagAttach())
                {
                    continue;
                }
                string attachUrl = "http://www.szjsjy.com.cn/" + anchor.Link.Replace("../", "");
                string attachName = anchor.LinkText.Replace(" ", "").Replace(";", "").Replace(";", "");
                BaseAttach attach = ToolDb.GenBaseAttach(attachName, info.Id, attachUrl);
                base.AttachList.Add(attach);
            }

            info.CtxHtml = contentTables.AsHtml();
            info.InfoCtx = infoCtx;
            notices.Add(info);
            if (!crawlAll && notices.Count >= this.MaxCount)
            {
                return notices;
            }
        }
    }

    return notices;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, downloads each row's detail page and, when it contains a
/// <c>div id="contentDiv"</c>, saves a <c>NotifyInfo</c> through <c>ToolDb.SaveEntity</c>. For records that
/// were saved, images and attachment links inside the content are saved as <c>BaseAttach</c> entities.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>Always <c>null</c>; results are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Determine the page count
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch (Exception)
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("height", "25")), true), new TagNameFilter("a")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            // The page count is the "page=" value of the third-from-last pager link;
            // "kdxx" is a sentinel marking where that value ends.
            string temp = pageList.GetATag(pageList.Count - 3).Link.Replace("&", "kdxx") + "kdxx";
            temp = temp.GetRegexBegEnd("page=", "kdxx").Replace("&", "");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zcfg_right_table")), true), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 1; j < table.RowCount; j++)
        {
            string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                   infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;
            TableRow tr = table.Rows[j];
            infoType = "政策法规";
            headName = tr.Columns[1].ToNodePlainString();
            infoUrl = "http://www.gzzb.gd.cn" + tr.Columns[1].GetATagHref();

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentDiv")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            ctxHtml = dtlList.AsHtml();
            infoCtx = ctxHtml.ToCtxString().Replace("○", "〇").Replace("O", "〇");

            // Try the date extractors in order until one yields a value.
            releaseTime = infoCtx.GetChinaTime();
            if (string.IsNullOrEmpty(releaseTime))
            {
                releaseTime = infoCtx.GetDateRegex("yyyy年MM月dd日");
            }
            if (string.IsNullOrEmpty(releaseTime))
            {
                releaseTime = infoCtx.GetDateRegex();
            }

            msgType = MsgTypeCosnt.GuangZhouMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "广州市区", string.Empty, infoCtx, infoType);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                // Images embedded in the content are stored as attachments named after the notice.
                parser = new Parser(new Lexer(ctxHtml));
                NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                if (imgList != null && imgList.Count > 0)
                {
                    for (int img = 0; img < imgList.Count; img++)
                    {
                        ImageTag imgTag = imgList[img] as ImageTag;
                        try
                        {
                            string src = imgTag.GetAttribute("src");
                            string imgUrl = src.Contains("http") ? src : "http://www.gzzb.gd.cn" + src;
                            BaseAttach obj = ToolHtml.GetBaseAttach(imgUrl, headName, info.Id);
                            if (obj != null)
                            {
                                ToolDb.SaveEntity(obj, string.Empty);
                            }
                        }
                        catch
                        {
                            // Best effort: a failing image must not stop the crawl.
                        }
                    }
                }

                // Links recognised as attachments are stored under their link text.
                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int a = 0; a < aNode.Count; a++)
                    {
                        ATag aTag = aNode[a] as ATag;
                        if (!aTag.IsAtagAttach())
                        {
                            continue;
                        }
                        try
                        {
                            string fileUrl = aTag.Link.Contains("http") ? aTag.Link : "http://www.gzzb.gd.cn" + aTag.Link;
                            BaseAttach obj = ToolHtml.GetBaseAttach(fileUrl, aTag.LinkText, info.Id);
                            if (obj != null)
                            {
                                ToolDb.SaveEntity(obj, string.Empty);
                            }
                        }
                        catch
                        {
                            // Best effort: a failing attachment must not stop the crawl.
                        }
                    }
                }
            }

            // The limit is checked only after the current record has been saved. Checking it right after
            // the increment (before SaveEntity) would build the MaxCount-th record and then discard it.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls bid-result records from the JSON listing at <c>SiteUrl + MaxCount</c>. For each record the detail
/// page (legacy view first, then the <c>dbResult_View.do</c> view plus its JSON data) is downloaded, a
/// <c>ProjectResult</c> is built and saved through <c>ToolDb.SaveEntity</c>, and for saved records the
/// attachments are stored as <c>BaseAttach</c> entities.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>
/// <c>null</c> when the listing cannot be downloaded or parsed, or when the <c>MaxCount</c> limit is hit;
/// otherwise the (never populated) result list, because records are persisted directly.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ProjectResult>();
    int sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Keep only the outermost {...} of the response; bail out when there is no JSON object at all
    // instead of letting Substring throw.
    int startIndex = html.IndexOf("{");
    int endIndex = html.LastIndexOf("}");
    if (startIndex < 0 || endIndex < startIndex)
    {
        return null;
    }
    html = html.Substring(startIndex, (endIndex + 1) - startIndex);

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
    if (smsTypeJson == null)
    {
        return null;
    }

    foreach (KeyValuePair<string, object> obj in smsTypeJson)
    {
        if (obj.Key == "total")
        {
            continue;
        }

        // Only array-valued entries carry records; skip anything else rather than failing on the cast.
        object[] array = obj.Value as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object arrValue in array)
        {
            string Code = string.Empty, prjName = string.Empty, BuildUnit = string.Empty, FinalistsWay = string.Empty,
                   RevStaMethod = string.Empty, SetStaMethod = string.Empty, VoteMethod = string.Empty, RevStaDate = string.Empty,
                   InfoUrl = string.Empty, MsgType = string.Empty, Ctx = string.Empty, Html = string.Empty, beginDate = string.Empty;
            Dictionary<string, object> dic = (Dictionary<string, object>)arrValue;
            Code = Convert.ToString(dic["bdBH"]);
            prjName = Convert.ToString(dic["bdName"]);
            beginDate = Convert.ToString(dic["createTime2"]);
            string dbjieGuoid = Convert.ToString(dic["dbJieGuoGuid"]);
            string bdId = Convert.ToString(dic["bdGuid"]);
            string ggId = Convert.ToString(dic["ggGuid"]);
            string detailUrl = Convert.ToString(dic["detailUrl"]);
            InfoUrl = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryOldOTDataDetail.do?type=9&id=" + Code;

            string attachJson = string.Empty;
            try
            {
                Html = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                string temp = Html.GetReplace("\"\"");
                if (string.IsNullOrWhiteSpace(temp))
                {
                    // The legacy detail view is empty: fall back to the result view and its JSON data.
                    InfoUrl = " https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/dbResult_View.do?bdGuid=" + bdId + "&ggGuid=" + ggId + "&dbJieGuoGuid=" + dbjieGuoid;
                    Html = this.ToolWebSite.GetHtmlByUrl(InfoUrl);
                    string url = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryDbJieGuoByGuid.do?guid=" + dbjieGuoid;
                    attachJson = this.ToolWebSite.GetHtmlByUrl(url);
                }
            }
            catch
            {
                continue;
            }

            string gcName = string.Empty, bdName = string.Empty, zbrName = string.Empty, createTime = string.Empty,
                   lxr = string.Empty, lxdh = string.Empty, dbBanFa = string.Empty, piaoJueBanFa = string.Empty;
            bool isChouQian = false;
            string attachId = string.Empty;
            string rwFs = string.Empty;
            string unitUrl = string.Empty;
            string lxrxx = string.Empty;
            string lxdhxx = string.Empty;

            if (!string.IsNullOrWhiteSpace(attachJson))
            {
                JavaScriptSerializer newSerializer = new JavaScriptSerializer();
                Dictionary<string, object> kdInfo = (Dictionary<string, object>)newSerializer.DeserializeObject(attachJson);
                Dictionary<string, object> ggbd = (Dictionary<string, object>)kdInfo["ggbd"];
                Dictionary<string, object> gc = (Dictionary<string, object>)ggbd["gc"];
                Dictionary<string, object> bd = (Dictionary<string, object>)kdInfo["bd"];
                Dictionary<string, object> bdgc = (Dictionary<string, object>)bd["gc"];

                // Optional fields: a missing key simply leaves the default value in place.
                try { attachId = Convert.ToString(kdInfo["attachFileGroupGuid"]); } catch { }
                try
                {
                    string ggGuid = Convert.ToString(kdInfo["ggGuid"]);
                    string bdGuid = Convert.ToString(kdInfo["bdGuid"]);
                    unitUrl = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryTbrListByBdGuidAndGgGuidForGs.do?bdGuid=" + bdGuid + "&ggGuid=" + ggGuid;
                }
                catch { }
                gcName = Convert.ToString(gc["gcName"]);
                try { bdName = Convert.ToString(kdInfo["bdName"]); } catch { bdName = gcName; }
                zbrName = Convert.ToString(gc["zbRName"]);
                createTime = Convert.ToString(kdInfo["dbTime"]);
                createTime = ToolHtml.GetDateTimeByLong(Convert.ToInt64(createTime)).ToString();
                try { lxr = Convert.ToString(bdgc["lianXiRenName"]); } catch { }
                try { lxrxx = Convert.ToString(bdgc["jingBanRenName"]); } catch { }
                try { lxdh = Convert.ToString(bdgc["lianXiRenPhone"]); } catch { }
                try { lxdhxx = Convert.ToString(bdgc["jingBanRenMobile"]); } catch { }
                try { rwFs = Convert.ToString(kdInfo["rwFangShi"]); } catch { }
                try { dbBanFa = Convert.ToString(kdInfo["dbBanFa"]); } catch { }
                try { piaoJueBanFa = Convert.ToString(kdInfo["piaoJueBanFa"]); } catch { }
                try { isChouQian = (bool)kdInfo["isChouQian"]; } catch { }

                string surl = " https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/dbResult_View.do?bdGuid=" + bdId + "&ggGuid=" + ggId + "&dbJieGuoGuid=" + dbjieGuoid;
                try
                {
                    attachJson = this.ToolWebSite.GetHtmlByUrl(surl);
                }
                catch
                {
                    // Same policy as the other detail requests: skip this record, keep crawling.
                    continue;
                }
                Html = attachJson;

                Parser parserNew = new Parser(new Lexer(Html));
                NodeList tableNode = parserNew.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "de_tab1")));
                if (tableNode != null && tableNode.Count > 0)
                {
                    // The view ships empty cells; fill them with the values taken from the JSON data.
                    Html = tableNode.AsHtml();
                    Html = Html.GetReplace("<td id=\"ggName\"> </td>", "<td id=\"ggName\"> " + prjName + "</td>");
                    Html = Html.GetReplace("<td id=\"bdBH\"> </td>", "<td id=\"bdBH\"> " + Code + "</td>");
                    Html = Html.GetReplace("<td id=\"bdName\"> </td>", "<td id=\"bdName\"> " + bdName + "</td>");
                    Html = Html.GetReplace("<td id=\"zbRName\"> </td>", "<td id=\"zbRName\"> " + zbrName + "</td>");
                    Html = Html.GetReplace("<td id=\"dbTime\"> </td>", "<td id=\"dbTime\"> " + createTime + "</td>");
                    Html = Html.GetReplace("<td id=\"rwfs\"> </td>", "<td id=\"rwfs\"> " + rwFs + "</td>");
                    Html = Html.GetReplace("<td id=\"dbBanFa\"> </td>", "<td id=\"dbBanFa\"> " + dbBanFa + "</td>");
                    Html = Html.GetReplace("<td id=\"lianXiRenName\"> </td>", "<td id=\"lianXiRenName\"> " + lxrxx + "</td>");
                    Html = Html.GetReplace("<td id=\"lianXiRenPhone\"> </td>", "<td id=\"lianXiRenPhone\"> " + lxdhxx + "</td>");
                    Ctx = Html.Replace("</tr>", "\r\n").ToCtxString();
                }
            }

            // Flatten the detail table into "label:value" lines so fields can be extracted by label.
            string resultCtx = string.Empty;
            Parser parser = new Parser(new Lexer(Html.GetReplace("\\\"", "\"").GetReplace("0:00:00", "")));
            NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "de_tab1")));
            if (listNode != null && listNode.Count > 0)
            {
                TableTag table = listNode[0] as TableTag;
                for (int r = 0; r < table.RowCount; r++)
                {
                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                    {
                        string temp = table.Rows[r].Columns[c].ToPlainTextString().GetReplace(":,:");
                        if (c % 2 == 0)
                        {
                            resultCtx += temp + ":";
                        }
                        else
                        {
                            resultCtx += temp + "\r\n";
                        }
                    }
                }
            }

            string strTmp = string.Empty;
            if (!string.IsNullOrEmpty(unitUrl))
            {
                string unithtml = string.Empty;
                try
                {
                    unithtml = this.ToolWebSite.GetHtmlByUrl(unitUrl);
                }
                catch
                {
                    // Best effort: the bidder table is optional.
                }

                // A failed or empty response must not be dereferenced: deserializing it does not
                // yield an array. Treat anything that is not a JSON array as "no bidders".
                object[] unitTypeJson = null;
                if (!string.IsNullOrWhiteSpace(unithtml))
                {
                    try
                    {
                        unitTypeJson = serializer.DeserializeObject(unithtml) as object[];
                    }
                    catch (ArgumentException)
                    {
                        unitTypeJson = null;
                    }
                    catch (InvalidOperationException)
                    {
                        unitTypeJson = null;
                    }
                }

                if (unitTypeJson != null && unitTypeJson.Length > 0)
                {
                    List<LongGangResult> unitLists = this.GetUnits(unitTypeJson);
                    strTmp = BuildBidderTableHtml(unitLists, isChouQian, dbBanFa, piaoJueBanFa);
                }
            }

            Ctx = Html.GetReplace("</tr> ", "\r\n").ToCtxString();
            BuildUnit = resultCtx.GetRegex("建设单位").GetReplace(" ", "");
            if (string.IsNullOrEmpty(BuildUnit))
            {
                BuildUnit = zbrName;
            }
            FinalistsWay = resultCtx.GetRegex("入围方式").GetReplace(" ", "");
            RevStaMethod = resultCtx.GetRegex("评标方法");
            SetStaMethod = resultCtx.GetRegex("定标方法").GetReplace(" ", "");
            VoteMethod = resultCtx.GetRegex("票决方法");
            RevStaDate = resultCtx.GetRegex("定标时间").GetDateRegex();
            if (string.IsNullOrEmpty(RevStaDate))
            {
                RevStaDate = createTime;
            }
            if (!string.IsNullOrWhiteSpace(strTmp))
            {
                Html += strTmp;
                Ctx = Html.GetReplace("</tr> ", "\r\n").ToCtxString();
            }

            MsgType = "深圳市建设工程交易中心宝安分中心";
            ProjectResult info = ToolDb.GetProjectResult("广东省", "深圳宝安区工程", "宝安区", Code, prjName, BuildUnit, FinalistsWay, RevStaMethod, SetStaMethod, VoteMethod, RevStaDate, detailUrl, MsgType, Ctx, Html, beginDate);
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                if (!string.IsNullOrWhiteSpace(attachId))
                {
                    // Attachments are listed by a dedicated JSON endpoint keyed by the file-group id.
                    string url = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/filegroup/queryByGroupGuidZS.do?groupGuid=" + attachId;
                    string attachHtml = string.Empty;
                    try
                    {
                        attachHtml = this.ToolWebSite.GetHtmlByUrl(url);
                    }
                    catch
                    {
                        // Best effort: attachments are optional.
                    }

                    if (!string.IsNullOrWhiteSpace(attachHtml))
                    {
                        JavaScriptSerializer newSerializers = new JavaScriptSerializer();
                        Dictionary<string, object> mofo = (Dictionary<string, object>)newSerializers.DeserializeObject(attachHtml);
                        object[] objs = (object[])mofo["rows"];
                        foreach (object objAttach in objs)
                        {
                            Dictionary<string, object> attachs = (Dictionary<string, object>)objAttach;
                            string attachguid = Convert.ToString(attachs["attachGuid"]);
                            string attachName = Convert.ToString(attachs["attachName"]);
                            string link = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachguid;
                            BaseAttach attach = ToolHtml.GetBaseAttach(link, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                            }
                        }
                    }
                }
                else
                {
                    // No file group: fall back to the links embedded in the detail HTML.
                    parser = new Parser(new Lexer(Html));
                    NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (fileNode != null && fileNode.Count > 0)
                    {
                        for (int f = 0; f < fileNode.Count; f++)
                        {
                            ATag tag = fileNode[f] as ATag;
                            if (tag.IsAtagAttach() || tag.Link.ToLower().Contains("downloadfile"))
                            {
                                try
                                {
                                    BaseAttach attach = null;
                                    string link = string.Empty;
                                    if (tag.Link.ToLower().Contains("http"))
                                    {
                                        link = tag.Link;
                                        if (link.StartsWith("\\"))
                                        {
                                            link = link.Substring(link.IndexOf("\\"), link.Length - link.IndexOf("\\"));
                                        }
                                        if (link.EndsWith("//"))
                                        {
                                            link = link.Remove(link.LastIndexOf("//"));
                                        }
                                        link = link.GetReplace("\\", "");
                                    }
                                    else
                                    {
                                        link = "https://www.szjsjy.com.cn:8001/" + tag.Link;
                                    }
                                    attach = ToolHtml.GetBaseAttach(link, tag.LinkText, info.Id, "SiteManage\\Files\\Attach\\");
                                    if (attach != null)
                                    {
                                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                                    }
                                }
                                catch
                                {
                                    continue;
                                }
                            }
                        }
                    }
                }
            }

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
        }
    }

    return list;
}

/// <summary>
/// Renders the bidder list as an HTML table whose columns depend on the award method:
/// the lottery flag is checked first, then <paramref name="dbBanFa"/>, then <paramref name="piaoJueBanFa"/>.
/// </summary>
private static string BuildBidderTableHtml(List<LongGangResult> unitLists, bool isChouQian, string dbBanFa, string piaoJueBanFa)
{
    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    sb.Append("<table width='100%' border='0' class='de_tab2'>");

    if (isChouQian)
    {
        AppendBidderHeaderRow(sb, "序号", "投标人名称", "投标时间", "中标候选人");
        foreach (LongGangResult unitInfo in unitLists.OrderBy(x => x.Xh))
        {
            sb.Append("<tr>");
            AppendBidderCell(sb, unitInfo.Xh);
            AppendBidderCell(sb, unitInfo.UnitName);
            AppendBidderCell(sb, unitInfo.TbDate);
            AppendBidderCheckbox(sb, unitInfo.BidStatus == "3");
            sb.Append("</tr>");
        }
    }
    else if (dbBanFa == "其他方法" || dbBanFa == "价格竞争法")
    {
        AppendBidderHeaderRow(sb, "序号", "企业名称", "确定中标候选人");
        foreach (LongGangResult unitInfo in unitLists.OrderBy(x => x.Xh))
        {
            sb.Append("<tr>");
            AppendBidderCell(sb, unitInfo.Xh);
            AppendBidderCell(sb, unitInfo.UnitName);
            AppendBidderCheckbox(sb, unitInfo.BidStatus == "3");
            sb.Append("</tr>");
        }
    }
    else if (dbBanFa == "逐轮淘汰")
    {
        AppendBidderHeaderRow(sb, "序号", "投标人名称", "投标报价(元)", "投标时间", "是否入围");
        foreach (LongGangResult unitInfo in unitLists.OrderBy(x => x.Xh))
        {
            sb.Append("<tr>");
            AppendBidderCell(sb, unitInfo.Xh);
            AppendBidderCell(sb, unitInfo.UnitName);
            AppendBidderCell(sb, unitInfo.BidMoney);
            AppendBidderCell(sb, unitInfo.TbDate);
            AppendBidderCheckbox(sb, unitInfo.IsNo == "是");
            sb.Append("</tr>");
        }
    }
    else if (dbBanFa == "集体议事法")
    {
        AppendBidderHeaderRow(sb, "序号", "企业名称", "确定中标候选人");
        foreach (LongGangResult unitInfo in unitLists.OrderBy(x => x.Code))
        {
            sb.Append("<tr>");
            AppendBidderCell(sb, unitInfo.Code);
            AppendBidderCell(sb, unitInfo.UnitName);
            AppendBidderCheckbox(sb, unitInfo.IsNo == "是");
            sb.Append("</tr>");
        }
    }
    else
    {
        // Voting layouts: only the title of the score column differs between the voting methods;
        // any unrecognised method uses the same layout as "一对一比较法".
        string scoreTitle = piaoJueBanFa == "简单多数法" ? "得票数" : "取胜次数";
        AppendBidderHeaderRow(sb, "编号", "投标单位", scoreTitle, "排名");
        foreach (LongGangResult unitInfo in unitLists.OrderBy(x => x.Code))
        {
            sb.Append("<tr>");
            AppendBidderCell(sb, unitInfo.Code);
            AppendBidderCell(sb, unitInfo.UnitName);
            AppendBidderCell(sb, unitInfo.Piao);
            AppendBidderCell(sb, unitInfo.Xh);
            sb.Append("</tr>");
        }
    }

    sb.Append("</table>");
    return sb.ToString();
}

/// <summary>Appends one header row with a left-aligned <c>th</c> per title.</summary>
private static void AppendBidderHeaderRow(System.Text.StringBuilder sb, params string[] titles)
{
    sb.Append("<tr>");
    foreach (string title in titles)
    {
        sb.Append("<th style='text-align: left' class='bg_tdtop'>").Append(title).Append("</th>");
    }
    sb.Append("</tr>");
}

/// <summary>Appends one data cell; a null value renders as an empty cell.</summary>
private static void AppendBidderCell(System.Text.StringBuilder sb, object value)
{
    sb.Append("<td style='padding: 0px'>").Append(value).Append("</td>");
}

/// <summary>Appends a disabled checkbox cell, ticked when <paramref name="isChecked"/> is true.</summary>
private static void AppendBidderCheckbox(System.Text.StringBuilder sb, bool isChecked)
{
    sb.Append(isChecked
        ? "<td><input type='checkbox' checked=true disabled=true/></td>"
        : "<td><input type='checkbox' disabled=true/></td>");
}
/// <summary>
/// Crawls the paged <c>MoreInfoList1_DataGrid1</c> listing at <c>SiteUrl</c>, downloads each row's detail page
/// and collects a <c>NoticeInfo</c> for every detail page that contains a <c>td id="TDContent"</c> node.
/// Attachment links found in that node are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the method returns as soon as <c>MaxCount</c> notices have been collected.</param>
/// <returns>The notices gathered so far; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // The page count is the text between "总页数" and "当前页" in the pager cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Keep the default of a single page when the pager text cannot be parsed.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the pager event back with the previous page's
            // view state; any failure while preparing or sending the request skips that page.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                    new string[] { viewState, "MoreInfoList1$Pager", i.ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            string InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty,
                   InfoUrl = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, htmlTxt = string.Empty;
            InfoType = "澄清修改通知";
            TableRow tr = table.Rows[j];

            // Every row is visited (the loop starts at 0), so skip rows that do not have the title
            // and date columns or carry no link instead of aborting the whole crawl on them.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            InfoTitle = aTag.GetAttribute("title");
            PublistTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.hbggzy.cn" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            htmlTxt = dtlNode.AsHtml();
            InfoCtx = htmlTxt.GetReplace("</p>,<br />,<br/>", "\r\n").ToCtxString();
            NoticeInfo info = ToolDb.GenNoticeInfo("湖北省", "湖北省及地市", "", string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "湖北省公共资源交易中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "其他项目", string.Empty, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (!a.IsAtagAttach())
                    {
                        continue;
                    }
                    // Links without "http" are prefixed with the site root.
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.hbggzy.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged bid-result listing at <c>SiteUrl</c>, downloads each row's detail page and collects a
/// <c>BidInfo</c> for every detail page that contains a table under <c>td class="hangao27"</c>. The winning
/// bidder and amount are taken from the content table under <c>td id="pContent"</c> when possible, otherwise
/// from the plain text via the regex helpers.
/// </summary>
/// <param name="crawlAll">When false, the method returns as soon as <c>MaxCount</c> results have been collected.</param>
/// <returns>The results gathered so far; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    // Determine the page count
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("bgColor", "#EEF4F9")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
        Regex regpage = new Regex(@"1/[0-9]+页");
        try
        {
            pageInt = int.Parse(regpage.Match(pageTemp).Value.Split('/')[1].Replace("页", "").Trim());
        }
        catch (Exception)
        {
            // Keep the default of a single page when the pager text cannot be parsed.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.szzdzb.cn/Product-index-id-11-p-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 1; j < table.RowCount; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
            TableRow tr = table.Rows[j];
            code = tr.Columns[0].ToPlainTextString().Trim();
            prjName = tr.Columns[1].ToPlainTextString().Trim();
            ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
            InfoUrl = "http://www.szzdzb.cn" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            bidCtx = HtmlTxt.ToLower().GetReplace("</p>,<br/>", "\r\n").ToCtxString();
            beginDate = bidCtx.GetRegex("发布时间").GetDateRegex();

            if (bidCtx.Contains("确定中标供应商"))
            {
                // Layout A: a marker row is followed by the bidder row (+1) and the amount row (+2).
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList nodeTab = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "pContent"))), new TagNameFilter("table")));
                if (nodeTab != null && nodeTab.Count > 0)
                {
                    TableTag tabNode = nodeTab[0] as TableTag;
                    for (int r = 0; r < tabNode.RowCount; r++)
                    {
                        try
                        {
                            if (tabNode.Rows[r].ToNodePlainString().Contains("确定中标供应商"))
                            {
                                bidUnit = tabNode.Rows[r + 1].Columns[1].ToNodePlainString();
                                bidMoney = tabNode.Rows[r + 2].Columns[1].ToNodePlainString().Replace(",", "").Replace(",", "").GetMoney("万元");
                                break;
                            }
                        }
                        catch
                        {
                            // Rows that do not have the expected shape are ignored.
                        }
                    }
                }
                if (string.IsNullOrWhiteSpace(bidUnit))
                {
                    bidUnit = bidCtx.GetBidRegex();
                }
                if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
                {
                    bidMoney = bidCtx.Replace(",", "").Replace(",", "").GetMoneyRegex();
                }
            }
            else
            {
                // Layout B: find the bidder name in the text, then the table row mentioning that bidder.
                bidUnit = bidCtx.GetBidRegex(new string[] { "第一备选供应商" });

                // Only scan when a bidder name was found: string.Contains("") is true for every row,
                // so an empty name would take the amount from an unrelated row.
                if (!string.IsNullOrEmpty(bidUnit))
                {
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList nodeTab = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "pContent"))), new TagNameFilter("table")));
                    if (nodeTab != null && nodeTab.Count > 0)
                    {
                        TableTag tabNode = nodeTab[0] as TableTag;
                        for (int r = 0; r < tabNode.RowCount; r++)
                        {
                            try
                            {
                                if (tabNode.Rows[r].ToNodePlainString().Contains(bidUnit))
                                {
                                    bidMoney = tabNode.Rows[r].Columns[2].ToNodePlainString().Replace(",", "").Replace(",", "").GetMoney();
                                    break;
                                }
                            }
                            catch
                            {
                                // Rows that do not have the expected shape are ignored.
                            }
                        }
                    }
                }
            }

            specType = "其他";
            msgType = "深圳市振东招标代理有限公司";
            bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list at <c>SiteUrl</c> and builds one <c>NoticeInfo</c> (type "资审公示")
/// for every data row of the <c>table1</c> grid. Attachment links found in the detail page are
/// registered on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected notices; an empty list when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The page count is read from the onclick="goPage(N)" attribute of the last pager <span>.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tzgg_right_page")), true), new TagNameFilter("span")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            Span lastSpan = pageList[pageList.Count - 1] as Span;
            string onclick = lastSpan.GetAttribute("onclick");
            pageInt = Convert.ToInt32(onclick.Replace("goPage(", "").Replace(")", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "page", "xmlb", "xmjdbmid", "method", "SearchBar", "PageSize" },
                    new string[] { i.ToString(), "", "", "", "Y", "15" });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "table1")));
        if (nodeList == null || nodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        // The loop starts at row 1, so row 0 is never processed.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            string infoTitle = tr.Columns[1].ToNodePlainString();
            string publishTime = tr.Columns[2].ToPlainTextString();
            string infoType = "资审公示";
            string infoUrl = "http://www.gzzb.gd.cn" + tr.Columns[1].GetATagHref();
            string infoCtx = string.Empty;
            string htmlTxt = string.Empty;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
                detailHtml = detailHtml.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("valign", "top")));
            bool hasDetail = dtlList != null && dtlList.Count > 0;
            if (hasDetail)
            {
                htmlTxt = dtlList.ToHtml();
                infoCtx = dtlList.AsString().ToCtxString().Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            }

            // The notice is recorded even when no detail cell was found (content stays empty).
            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "广州市区", string.Empty, string.Empty, infoTitle, infoType, infoCtx, publishTime, string.Empty, MsgTypeCosnt.GuangZhouMsgType, infoUrl, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
            list.Add(info);

            // Only look for attachments when the detail nodes exist; the previous code
            // dereferenced dtlList here unconditionally.
            if (hasDetail)
            {
                parser = new Parser(new Lexer(dtlList.AsHtml()));
                NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aList != null)
                {
                    for (int c = 0; c < aList.Count; c++)
                    {
                        ATag aTag = aList[c].GetATag();
                        if (aTag == null || !aTag.IsAtagAttach())
                        {
                            continue;
                        }

                        string alink = "http://www.gzzb.gd.cn" + aTag.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the single list page at <c>SiteUrl</c> and builds one <c>InviteInfo</c> for every
/// <c>&lt;li&gt;</c> under the <c>Body_div</c> container whose detail page contains the
/// 650px-wide content <c>&lt;div&gt;</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected invitations; an empty list when the list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string html;

    try
    {
        // NOTE(review): the replaced literal renders as a whitespace character in this file;
        // confirm whether it is a non-breaking space or was meant to be the "&nbsp;" entity.
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).Replace(" ", "");
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Body_div")), true), new TagNameFilter("li")));
    if (sNode == null || sNode.Count <= 0)
    {
        return list;
    }

    for (int t = 0; t < sNode.Count; t++)
    {
        INode node = sNode[t];
        ATag aTag = node.GetATag();
        if (aTag == null)
        {
            // A list item without a link cannot be followed; skip it instead of
            // failing the whole crawl on a null reference.
            continue;
        }

        string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
        string prjName = aTag.GetAttribute("title");
        string beginDate = node.ToPlainTextString().GetDateRegex();
        string infoUrl = "http://www.tyjzsc.com.cn/" + aTag.Link.GetReplace("./");

        string htmldtl;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "width:650px;")));
        if (dtlNode != null && dtlNode.Count > 0)
        {
            string htmlTxt = dtlNode.AsHtml();
            string inviteCtx = htmlTxt.ToCtxString();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex();
            string msgType = "太原市建设工程交易中心";
            string specType = "建设工程";
            string inviteType = prjName.GetInviteBidType();

            InviteInfo info = ToolDb.GenInviteInfo("山西省", "山西省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);
        }

        // The cap is checked after every processed item, whether or not it produced a record.
        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged <c>MoreInfoList1_DataGrid1</c> grid at <c>SiteUrl</c> and builds one
/// <c>NoticeInfo</c> (type "变更公告") per row whose detail page contains the <c>ivs_content</c>
/// block. Attachment links inside that block are registered on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected notices; an empty list when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "bottom")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: an unreadable pager leaves the crawl at a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Paging is a form post-back built from hidden fields of the previously loaded page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string csrfToken = ToolHtml.GetHtmlInputValue(html, "__CSRFTOKEN");
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__CSRFTOKEN", "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" },
                new string[] { csrfToken, viewState, "MoreInfoList1$Pager", i.ToString() });
            try
            {
                cookiestr = cookiestr.GetReplace(new string[] { "path=/;", "HttpOnly", "," });
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // Rows without a link (e.g. non-data rows) cannot be followed.
                continue;
            }

            string infoTitle = aTag.GetAttribute("title");
            string publishTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.spprec.com" + aTag.Link;
            string infoType = "变更公告";

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ivs_content")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string infoCtx = htmlTxt.GetReplace("<br />,<br/>,<br>,</p>", "\r\n").ToCtxString();
            NoticeInfo info = ToolDb.GenNoticeInfo("四川省", "四川省及地市", string.Empty, string.Empty, infoTitle, infoType, infoCtx, publishTime, string.Empty, "四川省公共资源交易中心", infoUrl, string.Empty, string.Empty, string.Empty, string.Empty, "建设工程", string.Empty, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    // "as" yields null when the node is not an ATag; guard before use.
                    ATag tag = aNode[k] as ATag;
                    if (tag == null || !tag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not already contain "http" are prefixed with the site root.
                    string link = tag.Link.ToLower().Contains("http")
                        ? tag.Link
                        : "http://www.spprec.com" + tag.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(tag.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks every page of the <c>MoreInfoList1_DataGrid1</c> grid at <c>SiteUrl</c> and produces an
/// <c>InviteInfo</c> for each row whose detail page exposes the <c>TDContent</c> cell.
/// </summary>
/// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> items have been gathered.</param>
/// <returns>The gathered invitations; empty if the first list page could not be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    string cookies = string.Empty;
    string listHtml;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookies);
    }
    catch
    {
        return results;
    }

    // Total page count comes from the pager text between "总页数" and "当前".
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoList1_Pager")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes[0].ToPlainTextString();
            totalPages = int.Parse(pagerText.GetRegexBegEnd("总页数", "当前").Replace(":", ""));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            // Later pages are requested by posting the pager event with the current view state.
            string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
            string[] fieldNames = new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" };
            string[] fieldValues = new string[] { viewState, "MoreInfoList1$Pager", page.ToString() };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8, ref cookies);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (gridNodes == null || gridNodes.Count <= 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        for (int row = 0; row < grid.RowCount; row++)
        {
            TableRow gridRow = grid.Rows[row];
            ATag anchor = gridRow.Columns[1].GetATag();
            string projectName = anchor.GetAttribute("title");
            string publishDate = gridRow.ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.gaxqjyzx.com" + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentNodes.AsHtml();
            string contentText = contentHtml.ToCtxString();
            string projectCode = contentText.GetCodeRegex();

            // Cut the owner name at the first agency marker so trailing agency text is dropped.
            string owner = contentText.GetBuildRegex();
            if (owner.Contains("招标代理机构"))
            {
                owner = owner.Remove(owner.IndexOf("招标代理机构"));
            }
            if (owner.Contains("代理机构"))
            {
                owner = owner.Remove(owner.IndexOf("代理机构"));
            }

            string address = contentText.GetAddressRegex();
            string inviteKind = projectName.GetInviteBidType();
            const string SpecKind = "建设工程";
            const string Source = "贵安新区公共资源交易中心";

            InviteInfo record = ToolDb.GenInviteInfo("贵州省", "贵州省及地市", "贵安新区", string.Empty, projectCode, projectName, address, owner, publishDate, string.Empty, contentText, string.Empty, Source, inviteKind, SpecKind, string.Empty, detailUrl, contentHtml);
            results.Add(record);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged <c>MoreInfoList1_DataGrid1</c> grid at <c>SiteUrl</c>, builds a
/// <c>NotifyInfo</c> (type "通知公告") for each row and saves it directly through
/// <c>ToolDb.SaveEntity</c>. For a saved record, attachment links in the detail content are
/// saved as well.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once the internal counter reaches <c>MaxCount</c>.</param>
/// <returns>Always <c>null</c>; results are persisted instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pagerText = pageNode.AsString().GetRegexBegEnd("总页数:", "当");
            pageInt = int.Parse(pagerText);
        }
        catch
        {
            // Best effort: an unreadable pager leaves the crawl at a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "MoreInfoList1$txtTitle" },
                new string[] { viewState, "MoreInfoList1$Pager", i.ToString(), eventValidation, "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            string infoScorce = string.Empty, area = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // Rows without a link cannot be followed.
                continue;
            }

            string headName = aTag.GetAttribute("title");
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();

            // The area is the text of the first "[...]" group in the title cell. The closing
            // bracket is searched for AFTER the opening one, so text such as "]x[y]" no longer
            // produces a negative Substring length.
            string titleText = tr.Columns[1].ToNodePlainString();
            int open = titleText.IndexOf('[');
            int close = open >= 0 ? titleText.IndexOf(']', open + 1) : -1;
            if (close > open)
            {
                area = titleText.Substring(open, close - open).GetReplace("[,]");
            }

            string infoUrl = "http://www.sxszbb.com" + aTag.Link;
            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "500")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string infoType = "通知公告";
            string msgType = "陕西省建设工程招标投标管理办公室";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "陕西省", "陕西省及地市", area, infoCtx, infoType);

            // NOTE(review): the cap is checked before the save, so the record that reaches
            // MaxCount is not persisted. Confirm this is intended.
            sqlCount++;
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null)
            {
                continue;
            }

            for (int a = 0; a < aNode.Count; a++)
            {
                // "as" yields null when the node is not an ATag; guard before use.
                ATag tag = aNode[a] as ATag;
                if (tag == null || !tag.IsAtagAttach())
                {
                    continue;
                }

                string link = tag.Link.ToLower().Contains("http")
                    ? tag.Link
                    : "http://www.sxszbb.com" + tag.Link.GetReplace("../,./");
                try
                {
                    BaseAttach baseInfo = ToolHtml.GetBaseAttach(link, tag.LinkText, info.Id);
                    if (baseInfo != null)
                    {
                        ToolDb.SaveEntity(baseInfo, string.Empty);
                    }
                }
                catch
                {
                    // Attachments are best effort: a failure must not abort the crawl.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Pages through the 695px-wide list table at <c>SiteUrl</c> and produces an <c>InviteInfo</c>
/// for every row (except the last one) whose detail page carries the
/// <c>ctl00_ContentPlaceHolder1_txtContent</c> span. Attachment links found in that span are
/// added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> items have been gathered.</param>
/// <returns>The gathered invitations; empty if the first list page could not be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    string cookies = string.Empty;
    string listHtml;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookies);
    }
    catch
    {
        return results;
    }

    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pageCountNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder2_lblSumPage")));
    if (pageCountNodes != null && pageCountNodes.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pageCountNodes[0].ToNodePlainString());
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            // Each further page is obtained by posting the "next" link button against the
            // state of the page loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
            string[] fieldNames = new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION" };
            string[] fieldValues = new string[] { "ctl00$ContentPlaceHolder2$lnkBtnNext", "", viewState, "96852609", eventValidation };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default, ref cookies);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList tableNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "695")));
        if (tableNodes == null || tableNodes.Count <= 0)
        {
            continue;
        }

        TableTag listTable = tableNodes[0] as TableTag;
        // The final row of the table is deliberately left out of the iteration.
        for (int row = 0; row < listTable.RowCount - 1; row++)
        {
            TableRow listRow = listTable.Rows[row];
            ATag anchor = listRow.GetATag();
            string projectName = anchor.LinkText.Trim();
            string publishDate = listRow.Columns[1].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://js.panyu.gov.cn/" + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_txtContent")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentNodes.AsHtml();
            string contentText = contentHtml.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
            string projectCode = contentText.GetCodeRegex().GetCodeDel();

            // Trim the owner name at the first trailing marker; for "公司" the marker itself is kept.
            string owner = contentText.GetBuildRegex();
            if (owner.Contains("招标代理"))
            {
                owner = owner.Remove(owner.IndexOf("招标代理"));
            }
            if (owner.Contains("地址"))
            {
                owner = owner.Remove(owner.IndexOf("地址"));
            }
            if (owner.Contains("公司"))
            {
                owner = owner.Remove(owner.IndexOf("公司")) + "公司";
            }

            string address = contentText.GetAddressRegex();
            string inviteKind = projectName.GetInviteBidType();
            const string Source = "广州市番禺区住房和建设局";
            const string SpecKind = "政府采购";

            InviteInfo record = ToolDb.GenInviteInfo("广东省", "广州政府采购", "番禺区", string.Empty, projectCode, projectName, address, owner, publishDate, string.Empty, contentText, string.Empty, Source, inviteKind, SpecKind, string.Empty, detailUrl, contentHtml);
            results.Add(record);

            parser = new Parser(new Lexer(contentHtml));
            NodeList linkNodes = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (linkNodes != null && linkNodes.Count > 0)
            {
                for (int n = 0; n < linkNodes.Count; n++)
                {
                    ATag fileAnchor = linkNodes[n].GetATag();
                    if (!fileAnchor.IsAtagAttach())
                    {
                        continue;
                    }

                    string fileName = fileAnchor.LinkText.ToNodeString().Replace(" ", "");
                    string fileUrl = fileAnchor.Link;
                    if (!fileUrl.ToLower().Contains("http"))
                    {
                        fileUrl = "http://js.panyu.gov.cn/" + fileAnchor.Link;
                    }
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileName, record.Id, fileUrl));
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the numbered list pages under <c>http://www.jxsggzy.cn/web/jyxx/002002/002002005/</c>
/// and builds one <c>BidInfo</c> per list entry whose detail page contains the
/// <c>article-info</c> block. Bid fields are taken from the first <c>MsoNormalTable</c> when
/// present, otherwise from the plain detail text.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected bid results; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "wb-page-li")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pagerText = pageNode.AsString().GetRegexBegEnd("/", "\r");
            pageInt = int.Parse(pagerText);
        }
        catch
        {
            // Best effort: an unreadable pager leaves the crawl at a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // NOTE(review): this overwrites the SiteUrl member with the page URL on every
        // iteration (kept as-is); confirm callers do not rely on the configured value afterwards.
        SiteUrl = "http://www.jxsggzy.cn/web/jyxx/002002/002002005/" + i + ".html";
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "ewb-list-node clearfix")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                   endDate = string.Empty, prjAddress = string.Empty, prjMgr = string.Empty,
                   otherType = string.Empty, area = string.Empty;

            ATag aTag = listNode[j].GetATag();
            if (aTag == null)
            {
                // An entry without a link cannot be followed.
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            if (string.IsNullOrWhiteSpace(prjName))
            {
                prjName = aTag.LinkText;
            }
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();

            // When the third character of the title is 县/区/市, the first three characters are
            // used as the area. The length check prevents an out-of-range index on short titles.
            if (prjName != null && prjName.Length >= 3
                && (prjName[2] == '县' || prjName[2] == '区' || prjName[2] == '市'))
            {
                area = prjName.Substring(0, 3);
            }

            string infoUrl = "http://www.jxsggzy.cn" + aTag.Link;
            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article-info")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bidCtx = htmlTxt.Replace("<br/>", "\r\n").ToCtxString();

            // Owner from the plain text, with a secondary pattern as fallback. Previously both
            // branches below overwrote this value unconditionally, so the fallback never took effect.
            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = bidCtx.GetRegexBegEnd("招标人名称:", "项目");
            }

            parser = new Parser(new Lexer(htmlTxt));
            NodeList dtlBidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
            if (dtlBidNode != null && dtlBidNode.Count > 0)
            {
                // Flatten the table into "label:value" lines: odd-positioned cells end with ":",
                // even-positioned cells end the line. Empty cells are skipped.
                TableTag bidTable = dtlBidNode[0] as TableTag;
                string ctx = string.Empty;
                for (int r = 0; r < bidTable.RowCount; r++)
                {
                    for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                    {
                        string cellText = bidTable.Rows[r].Columns[c].ToNodePlainString();
                        if (string.IsNullOrEmpty(cellText))
                        {
                            continue;
                        }
                        ctx += cellText + ((c + 1) % 2 == 0 ? "\r\n" : ":");
                    }
                }

                prjAddress = ctx.GetAddressRegex();
                string tableBuildUnit = ctx.GetBuildRegex();
                if (!string.IsNullOrEmpty(tableBuildUnit))
                {
                    // Prefer the table value, but keep the text-derived one when the table has none.
                    buildUnit = tableBuildUnit;
                }
                bidUnit = ctx.GetBidRegex(new string[] { "第一名" });
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = ctx.GetRegex("第一");
                }
                bidMoney = ctx.GetMoneyRegex();
                prjMgr = ctx.GetMgrRegex(new string[] { "建造师" });
                code = ctx.GetCodeRegex();
            }
            else
            {
                prjAddress = bidCtx.GetAddressRegex();
                bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("第一中标候选人");
                }
                bidMoney = bidCtx.GetMoneyString().GetMoney("万元");
                prjMgr = bidCtx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = bidCtx.GetRegex("注册监理工程师");
                }
                code = bidCtx.GetCodeRegex();
            }

            string bidType = "交通工程";
            string specType = "政府采购";
            string msgType = "江西省公共资源交易中心";
            BidInfo info = ToolDb.GenBidInfo("江西省", "江西省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c> (further pages via the <c>&amp;page=N</c> query
/// parameter) and builds one <c>InviteInfo</c> per row whose detail page contains the
/// <c>td.cont</c> cell. Attachment links in that cell are registered on <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected invitations; an empty list when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The page count is taken from the text of the last <option> of the "id1" select.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "id1")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            SelectTag tag = sNode[0] as SelectTag;
            string lastOption = tag.OptionTags[tag.OptionTags.Length - 1].StringText;
            pageInt = int.Parse(lastOption.GetReplace("第,页"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Built once per crawl instead of once per row: removes script, style and xml fragments
    // from each detail page before parsing.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "4")));
        if (viewList == null || viewList.Count <= 0)
        {
            continue;
        }

        TableTag table = viewList[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Columns[1] is read below, so rows with fewer than two cells are skipped
            // (the previous "== 1" test let zero-cell rows through).
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // A row without a link cannot be followed.
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
            string prjName = aTag.GetAttribute("title");
            string infoUrl = "http://liaobu.dg.gov.cn/" + aTag.Link;

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "cont")));
            if (dtl == null || dtl.Count <= 0)
            {
                continue;
            }

            string htmlTxt = dtl.AsHtml();
            string inviteCtx = htmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "东莞市寮步镇政府";
            string specType = "政府采购";
            string inviteType = prjName.GetInviteBidType();

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "寮步镇", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not already contain "http" are prefixed with the site root.
                    string link = a.Link.ToLower().Contains("http")
                        ? a.Link
                        : "http://liaobu.dg.gov.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c> (further pages via the <c>&amp;Page=N</c> query
/// parameter) and builds one <c>BidInfo</c> per list entry whose detail page contains the
/// <c>detail_con</c> block. When no winning bidder can be read from the plain text, the
/// tables inside the detail block are used as a fallback source.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items have been collected.</param>
/// <returns>The collected bid results; an empty list when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pagerText = pageNode[0].ToPlainTextString().GetRegexBegEnd("共", "页");
            pageInt = int.Parse(pagerText);
        }
        catch
        {
            // Best effort: an unreadable pager leaves the crawl at a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&Page=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_con_main_bulcon")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string endDate = string.Empty, otherType = string.Empty;
            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToNodePlainString().GetDateRegex();
            string linkId = aTag.Link.GetRegexBegEnd("Id=", "&");
            string infoUrl = "http://www.hngzzx.com/HomePage/ShowInfoDetail.aspx?Id=" + linkId + "&TableID=1";

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_con")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bidCtx = htmlTxt.GetReplace("</p>,<br/>", "\r\n").ToCtxString();
            string prjAddress = bidCtx.GetAddressRegex();
            string buildUnit = bidCtx.GetBuildRegex();
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("第一名", false);
            }
            string bidMoney = bidCtx.GetMoneyRegex();
            string prjMgr = bidCtx.GetMgrRegex();

            if (string.IsNullOrEmpty(bidUnit))
            {
                // Fallback: read the bidder from a table in the detail block. The filter must
                // select <table> nodes: the results are used as TableTag below (it previously
                // selected "br", for which the cast always yields null).
                parser = new Parser(new Lexer(htmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    // Keyword groups are tried in priority order; the first table that matches
                    // a group ends the search, even if it contributed no text.
                    string ctx = string.Empty;
                    bool isOk = AppendFirstMatchingTable(tableNode, new string[] { "供应商名称" }, ref ctx);
                    if (!isOk)
                    {
                        isOk = AppendFirstMatchingTable(tableNode, new string[] { "中标候选人" }, ref ctx);
                    }
                    if (!isOk)
                    {
                        AppendFirstMatchingTable(tableNode, new string[] { "中标单位", "中标候选单位", "投标人名称" }, ref ctx);
                    }

                    bidUnit = ctx.GetBidRegex();
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = ctx.GetRegex("第一名,供应商名称,投标人名称");
                    }
                    string money = ctx.GetMoneyRegex();
                    if (string.IsNullOrEmpty(money) || bidMoney != money)
                    {
                        bidMoney = money;
                    }
                    if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = ctx.GetRegex("中标金额(单位:元),最终报价,投标报价(元)", false).GetMoney();
                    }
                    if (string.IsNullOrEmpty(prjMgr))
                    {
                        prjMgr = bidCtx.GetMgrRegex();
                    }
                }
            }

            // Cut trailing text after the first organisation suffix, keeping the suffix itself.
            if (bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }
            if (bidUnit.Contains("研究院"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究院")) + "研究院";
            }
            if (bidUnit.Contains("开发局"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("开发局")) + "开发局";
            }
            // A value that still contains label words is discarded as a false match.
            if (bidUnit.Contains("名称") || bidUnit.Contains("联系人") || bidUnit.Contains("报价") || bidUnit.Contains("内容"))
            {
                bidUnit = string.Empty;
            }
            bidUnit = bidUnit.GetReplace("1,2,3,、");
            if (code.Contains("代理"))
            {
                code = code.Remove(code.IndexOf("代理"));
            }

            try
            {
                // Amounts above 10000 are divided by 10000; unparsable values are left untouched.
                decimal amount = decimal.Parse(bidMoney);
                if (amount > 10000)
                {
                    bidMoney = (amount / 10000).ToString();
                }
            }
            catch
            {
            }

            string bidType = "政府采购";
            string specType = bidType;
            string msgType = "湖南省公共资源交易中心";
            BidInfo info = ToolDb.GenBidInfo("湖南省", "湖南省及地市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Finds the first table in <paramref name="tables"/> whose plain text contains any of
/// <paramref name="keywords"/>. When that table has more than two rows, its first two rows are
/// appended to <paramref name="ctx"/> as "header:value" lines, one per column.
/// </summary>
/// <returns>True when a table matched (even if nothing was appended); otherwise false.</returns>
private static bool AppendFirstMatchingTable(NodeList tables, string[] keywords, ref string ctx)
{
    for (int t = 0; t < tables.Count; t++)
    {
        string tableText = tables[t].ToPlainTextString();
        bool matches = false;
        foreach (string keyword in keywords)
        {
            if (tableText.Contains(keyword))
            {
                matches = true;
                break;
            }
        }
        if (!matches)
        {
            continue;
        }

        TableTag tag = tables[t] as TableTag;
        if (tag != null && tag.RowCount > 2)
        {
            TableRow headerRow = tag.Rows[0];
            TableRow valueRow = tag.Rows[1];
            for (int c = 0; c < headerRow.ColumnCount; c++)
            {
                ctx += headerRow.Columns[c].ToNodePlainString() + ":";
                // The value row may have fewer cells than the header row.
                if (c < valueRow.ColumnCount)
                {
                    ctx += valueRow.Columns[c].ToNodePlainString() + "\r\n";
                }
            }
        }
        return true;
    }

    return false;
}
/// <summary>
/// Crawls the paginated notice list that starts at <c>SiteUrl</c> and turns every entry
/// whose detail page contains a <c>div.tit-content</c> element into an <c>InviteInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList records = new List<InviteInfo>();

    string listingHtml;
    try
    {
        listingHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return records;
    }

    // The pager div carries the page total between "/" and "页"; any failure means one page.
    int pageTotal = 1;
    Parser parser = new Parser(new Lexer(listingHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "yesh fl")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            pageTotal = int.Parse(pagerNodes[0].ToNodePlainString().GetRegexBegEnd("/", "页"));
        }
        catch
        {
            pageTotal = 1;
        }
    }

    for (int pageNo = 1; pageNo <= pageTotal; pageNo++)
    {
        // Page 1 was already downloaded above; later pages have their own static URL.
        if (pageNo > 1)
        {
            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl("http://www.szlhxq.gov.cn/mzbsc/zwgk69/cgzb/zbgg282/14843-" + pageNo.ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listingHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news1_list")),
                    true),
                new TagNameFilter("li")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int row = 0; row < entries.Count; row++)
        {
            string publishDate = entries[row].ToNodePlainString().GetDateRegex();
            ATag link = entries[row].GetATag();
            string title = link.GetAttribute("title");
            string detailUrl = "http://www.szlhxq.gov.cn" + link.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tit-content")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentNodes.AsHtml();

            // Reduce the HTML to plain text: drop scripts, map <br> variants to line breaks,
            // strip the remaining tags, then collapse whitespace and repeated line breaks.
            string withoutScripts = System.Text.RegularExpressions.Regex.Replace(contentHtml, "(<script)[\\s\\S]*?(</script>)", "");
            string withBreaks = withoutScripts.Replace("<br/>", "\r\n").Replace("<BR/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n");
            string plainText = System.Text.RegularExpressions.Regex.Replace(withBreaks, "<[^>]*>", "");
            plainText = plainText.Replace(" ", "").Replace(" ", "");
            plainText = plainText.Replace("\n", "\r\n");
            plainText = plainText.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            plainText = plainText.Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n");
            plainText = plainText.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            plainText = plainText.Replace("\t", "");
            plainText = plainText.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // The computed type is replaced by a fixed value further down; the call is kept as-is.
            string inviteKind = title.GetInviteBidType();

            string address = plainText.GetAddressRegex();
            if (address.Contains("**"))
            {
                address = string.Empty;
            }

            string owner = plainText.GetBuildRegex();
            if (owner.Contains("资质"))
            {
                owner = owner.Remove(owner.IndexOf("资质"));
            }

            string projectCode = plainText.GetCodeRegex().GetCodeDel();
            string source = "深圳市龙华新区民治街道办事处";
            if (string.IsNullOrEmpty(address))
            {
                address = "见中标信息";
            }

            string category = "建设工程";
            inviteKind = "小型工程";
            if (string.IsNullOrEmpty(owner))
            {
                owner = "深圳市龙华新区民治街道办事处";
            }

            InviteInfo record = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, projectCode, title, address, owner, publishDate, string.Empty, plainText, string.Empty, source, inviteKind, category, string.Empty, detailUrl, contentHtml);
            records.Add(record);

            if (!crawlAll && records.Count >= this.MaxCount)
            {
                return records;
            }
        }
    }

    return records;
}
/// <summary>
/// Crawls the paginated list at <c>SiteUrl</c> (pages are addressed with <c>&amp;page=N</c>)
/// and builds one <c>InviteInfo</c> per row of the 760-wide listing table.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Determine the page count from the links inside the pager div.
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string htlPage = tdNodes.ToHtml();
        parser = new Parser(new Lexer(htlPage));
        NodeList pageList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (pageList != null && pageList.Count > 0)
        {
            // Walk the links from last to first; the first href that reduces to an integer wins.
            for (int i = pageList.Count - 1; i >= 0; i--)
            {
                try
                {
                    ATag aTag = pageList.SearchFor(typeof(ATag), true)[i] as ATag;
                    string pageTemp = aTag.Link.Replace("main.aspx?flg=3&id=6&page=", "");
                    pageInt = int.Parse(pageTemp);
                    break;
                }
                catch (Exception)
                {
                    // Not a numeric page link; try the previous one.
                }
            }
        }
    }

    // Loop-invariant: strips <script> blocks and <?xml .../> islands from detail pages.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // A failed page download skips that page instead of aborting the whole crawl
            // and losing everything collected so far.
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760")));
        if (nodeList == null || nodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                   endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            string beginDate = tr.Columns[2].ToPlainTextString().Trim();
            ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
            string prjName = aTag.LinkText;
            string InfoUrl = "http://www.uho.cn/" + aTag.Link;

            string HtmlTxt = string.Empty;
            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once and derive both representations from it
                // (previously the same URL was requested twice per row).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);

                // Original-case markup: kept as the stored HTML of the notice.
                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Replace(" ", "").Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#FFFFFF"), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Lower-cased markup with line breaks normalised: used for the plain-text body.
                htmldetail = rawDetail.ToLower().Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                continue;
            }

            // The attribute value is lower-case here because the markup was lower-cased above.
            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#ffffff"), new TagNameFilter("table")));
            string inviteCtx = dtnode.AsString();

            string specType = "其他";
            string msgType = "深圳市友和保险经纪有限公司";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses one listing page (<c>table.lefttable</c>), downloads the detail page of each row,
/// appends an <c>InviteInfo</c> to <paramref name="list"/> and registers matching download
/// links in <c>AttachList</c>.
/// </summary>
/// <param name="list">Receives the generated records.</param>
/// <param name="html">HTML of the listing page.</param>
/// <param name="crawlAll">
/// When <c>false</c>, processing stops once <paramref name="list"/> holds <c>MaxCount</c> records.
/// </param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser parserDtl = new Parser(new Lexer(html));
    NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (aNodes == null || aNodes.Count <= 0)
    {
        return;
    }

    // Loop-invariant patterns.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regexTime = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
    Regex regexcode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
    Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
    Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
    Regex regexCtx = new Regex("<!--[^<]+-->");
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");

    TableTag table = aNodes[0] as TableTag;

    // The first and the last row are skipped (presumably header and pager rows; confirm).
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
               beginDate = string.Empty, remark = string.Empty, ctx = string.Empty,
               HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;
        ATag aTag = tr.SearchFor(typeof(ATag), true)[0] as ATag;
        string InfoUrl = aTag.Link;
        string prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        string endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl = string.Empty;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        htmlDtl = regexHtml.Replace(htmlDtl, "");
        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser parserdiv = new Parser(new Lexer(htmlDtl));
            NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

            // Remove the button div from the stored HTML. String.Replace throws on an empty
            // search string, so only replace when the div was actually found.
            HtmlTxt = ctxNode.AsHtml();
            string buttonHtml = aNodesdiv != null ? aNodesdiv.AsHtml() : string.Empty;
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            beginDate = regexTime.Match(startTime).Value.Replace("时间:", "");

            // For each labelled field keep the text after the first ":" of the matched line.
            // A failed match yields "" (IndexOf returns -1, so Substring(0) of "" is "").
            string tableText = tabTag.ToPlainTextString();
            Match match = regexcode.Match(tableText);
            code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            Match matchBuildUnit = regexBuildUnit.Match(tableText);
            buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            Match matchAddress = regexAddress.Match(tableText);
            prjAddress = matchAddress.Value.Substring(matchAddress.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            if (ctx.Length > 0)
            {
                ctx = regexCtx.Replace(ctx, "");
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }

            // The limit is measured in bytes, so a multi-byte string can exceed 150 bytes while
            // being shorter than 150 characters; Substring(0, 150) would then throw. Cut to at
            // most 150 characters, then drop trailing characters until the byte limit is met.
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = buildUnit.Substring(0, Math.Min(150, buildUnit.Length));
                while (Encoding.Default.GetByteCount(buildUnit) > 150)
                {
                    buildUnit = buildUnit.Substring(0, buildUnit.Length - 1);
                }
            }

            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                prjAddress = "见招标公告内容";
            }

            // Drop an end date that lies before the begin date. An unparsable value is left
            // at DateTime.MinValue, which matches the previous try/catch behaviour.
            if (beginDate.Length > 0 && endDate.Length > 0)
            {
                DateTime begin;
                DateTime end;
                DateTime.TryParse(beginDate, out begin);
                DateTime.TryParse(endDate, out end);
                if (begin > end)
                {
                    endDate = string.Empty;
                }
            }
        }

        // The publish date shown in td.toptd_bai replaces whatever begin date was found above.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();

        string inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
        list.Add(info);

        // Reset before re-scanning, as is done before the td extraction above; without it the
        // parser presumably continues from the end of the page and finds no links (confirm
        // against the HtmlParser library in use).
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTage = aTagNodes[a] as ATag;
            if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, fileTage.Link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Crawls the paginated result list at <c>SiteUrl</c> (later pages are requested by POST from
/// <c>queryMoreInfoList.do</c>), builds one <c>BidInfo</c> per entry whose detail page contains
/// <c>div.zw_c_c_cont</c>, and registers attachment links in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html = string.Empty;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Page count: argument of the turnOverPage(...) call in the second-to-last pager link.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "qPageForm")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            NodeList aNode = new Parser(new Lexer(pageNode.ToHtml())).ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                string temp = aNode[aNode.Count - 2].GetATagHref().Replace("turnOverPage", "").Replace("(", "").Replace(")", "").Replace(";", "");
                pageInt = int.Parse(temp);
            }
        }
        catch
        {
            // Best effort: keep the default of a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "channelCode", "pageIndex", "pageSize", "pointPageIndexId" },
                new string[] { "0008", i.ToString(), "15", "1" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://maoming.gdgpo.com/queryMoreInfoList.do", nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "m_m_c_list")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

            ATag aTag = listNode[j].GetATag(1);
            string prjName = aTag.GetAttribute("title");
            string beginDate = listNode[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://maoming.gdgpo.com" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zw_c_c_cont")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml().Replace("<br", "\r\n<br");
            string bidCtx = HtmlTxt.Replace("</p>", "\r\n").Replace("</pre>", "\r\n").ToCtxString();
            string buildUnit = bidCtx.GetBuildRegex();

            string code = bidCtx.Replace("(招标编号", "000000").GetCodeRegex().GetCodeDel();
            if (string.IsNullOrEmpty(code))
            {
                code = bidCtx.GetRegex("招标编号", true, 50).GetCodeDel();
            }

            string bidUnit = bidCtx.GetBidRegex().GetBidUnitDel();

            // Winning amount: a cascade of progressively looser extractions; each step only
            // runs while the previous one produced "0" or nothing.
            string bidMoney = bidCtx.GetMoneyString();
            if (bidMoney.Contains("("))
            {
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("(")).GetMoney();
            }
            else
            {
                bidMoney = bidMoney.GetMoney();
            }

            if (bidMoney == "0")
            {
                bidMoney = bidCtx.GetMoneyString(null, true);
                if (bidMoney.Contains("("))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf("(")).GetMoney();
                }
                else if (bidMoney.Contains("大写"))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf("大写")).GetMoney();
                }
                else
                {
                    bidMoney = bidMoney.GetMoney();
                }
            }

            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyString(null, true);
                if (bidMoney.Contains("大写"))
                {
                    bidMoney = bidMoney.Remove(bidMoney.IndexOf("大写")).GetMoney();
                }
                else
                {
                    bidMoney = bidMoney.GetMoney("万元");
                }
            }

            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyString(null, true).GetMoney();
            }

            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyString(new string[] { "¥", "$" }, false).GetMoney();
            }

            // Values above 10000 are divided by 10000. TryParse is used so that a non-numeric
            // extraction result leaves the value untouched instead of throwing and aborting
            // the whole crawl.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 10000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.Replace(" ", "").GetRegex("成交人,成交候选供应商");
            }

            if (bidUnit.Contains("名称"))
            {
                bidUnit = bidUnit.Replace("名称", "");
            }

            bidUnit = bidUnit.Replace("-", "");

            string bidType = prjName.GetInviteBidType();
            string msgType = "茂名市政府采购";
            string specType = "政府采购";
            BidInfo info = ToolDb.GenBidInfo("广东省", "茂名市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);

            // Register attachment links found in the notice body.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int k = 0; k < fileNode.Count; k++)
                {
                    ATag fileAtag = fileNode[k].GetATag();
                    if (!fileAtag.IsAtagAttach())
                    {
                        continue;
                    }

                    string fileName = fileAtag.LinkText.ToNodeString().Replace(" ", "");
                    string fileLink = fileAtag.Link;
                    if (!fileLink.ToLower().Contains("http"))
                    {
                        fileLink = "http://maoming.gdgpo.gov.cn" + fileAtag.Link;
                    }

                    // Links of 500 bytes or more are not registered.
                    if (Encoding.Default.GetByteCount(fileLink) < 500)
                    {
                        base.AttachList.Add(ToolDb.GenBaseAttach(fileName, info.Id, fileLink));
                    }
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the ASP.NET-paged grid at <c>SiteUrl</c> (table <c>MoreInfoList1_DataGrid1</c>),
/// builds one <c>InviteInfo</c> per row whose detail page contains <c>td#TDContent</c>,
/// and registers attachment links in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList records = new List<InviteInfo>();
    string cookies = string.Empty;
    string pageHtml;
    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookies);
    }
    catch
    {
        return records;
    }

    // The page total sits between "总页数" and "当前页" in the pager cell.
    int pageTotal = 1;
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            pageTotal = int.Parse(pagerCells.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", ""));
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= pageTotal; pageNo++)
    {
        if (pageNo > 1)
        {
            // Post back to the pager using the view state of the page currently held.
            string viewState = this.ToolWebSite.GetAspNetViewState(pageHtml);
            string[] fieldNames = new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" };
            string[] fieldValues = new string[] { viewState, "MoreInfoList1$Pager", pageNo.ToString() };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default, ref cookies);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList grids = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (grids == null || grids.Count <= 0)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        for (int rowIndex = 0; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            ATag titleLink = row.Columns[1].GetATag();
            string title = titleLink.GetAttribute("title");
            string publishDate = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.dyggzyjyzx.com" + titleLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentCells = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (contentCells == null || contentCells.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentCells.AsHtml();
            string contentText = contentHtml.ToCtxString();
            string address = contentText.GetAddressRegex();

            // Trim the extracted owner name at the first of several marker words.
            string owner = contentText.GetBuildRegex();
            if (owner.Contains("公司"))
            {
                owner = owner.Remove(owner.IndexOf("公司")) + "公司";
            }
            if (owner.Contains("地址"))
            {
                owner = owner.Remove(owner.IndexOf("地址"));
            }
            if (owner.Contains("招标代理"))
            {
                owner = owner.Remove(owner.IndexOf("招标代理"));
            }

            string projectCode = contentText.GetCodeRegex().GetCodeDel();
            string inviteKind = title.GetInviteBidType();
            owner = owner.Replace(" ", "");

            InviteInfo record = ToolDb.GenInviteInfo("湖北省", "湖北省及地市", "大冶市", string.Empty, projectCode, title, address, owner, publishDate, string.Empty, contentText, string.Empty, "大冶市公共资源交易中心", inviteKind, "建设工程", string.Empty, detailUrl, contentHtml);
            records.Add(record);

            // Register attachment links; relative links are prefixed with the site root.
            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int n = 0; n < anchors.Count; n++)
                {
                    ATag anchor = anchors[n] as ATag;
                    if (!anchor.IsAtagAttach())
                    {
                        continue;
                    }

                    string fileUrl = anchor.Link.ToLower().Contains("http")
                        ? anchor.Link
                        : "http://www.dyggzyjyzx.com/" + anchor.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(anchor.LinkText, record.Id, fileUrl));
                }
            }

            if (!crawlAll && records.Count >= this.MaxCount)
            {
                return records;
            }
        }
    }

    return records;
}
/// <summary>
/// Crawls the paginated list at <c>SiteUrl</c> (pages are addressed with
/// <c>&amp;otype=&amp;pageNum=N</c>) and builds one <c>InviteInfo</c> per row whose detail
/// page contains <c>P.MsoNormal</c> paragraphs or a <c>div.WordSection1</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    // Loop-invariant: strips <script> blocks before parsing.
    Regex regexHtml = new Regex(@"<script[^<]*</script>");

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
        htl = regexHtml.Replace(htl, "");
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the number inside "共N页"; any failure keeps the default of one page.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "right")));
    Regex regexPage = new Regex(@"共\d+页");
    try
    {
        page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
    }
    catch (Exception)
    {
    }

    // Inclusive upper bound: the previous "i < page" never visited the last page and
    // crawled nothing at all when the list had a single page.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&otype=&pageNum=" + i.ToString()), Encoding.Default);
                htl = regexHtml.Replace(htl, "");
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "1")));
        if (tableNodeList == null || tableNodeList.Count <= 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        for (int j = 0; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, endDate = string.Empty,
                   remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            string prjName = tr.Columns[0].ToPlainTextString().Trim();

            // Keep the first 10 characters of the date cell; shorter text is kept as-is
            // instead of letting Substring throw.
            string beginDate = tr.Columns[1].ToPlainTextString().Replace(" ", "").Trim();
            if (beginDate.Length > 10)
            {
                beginDate = beginDate.Substring(0, 10);
            }

            // The target path is the first quoted argument after the showDeptContent('1925','
            // prefix in the link's onclick attribute.
            ATag aTag = tr.Columns[0].SearchFor(typeof(ATag), true)[0] as ATag;
            string InfoUrl = "http://market.meizhou.gov.cn/website/deptwebsite/1925/Content.jsp?issueId=15488&msgType=00&filePath=" + aTag.GetAttribute("onclick").Replace("showDeptContent('1925','", "");
            int ii = InfoUrl.IndexOf("'");
            string oo = InfoUrl.Remove(ii).Trim();
            if (oo.Contains("content.php"))
            {
                // content.php targets are used on their own, without the Content.jsp prefix.
                string url = aTag.GetAttribute("onclick").Replace("showDeptContent('1925','", "");
                ii = url.IndexOf("'");
                oo = url.Remove(ii).ToString();
            }

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(oo), Encoding.Default).Replace(" ", "");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                Logger.Error("InviteMeiZhoouJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("P"), new HasAttributeFilter("class", "MsoNormal")));
            if (dtnode == null || dtnode.Count < 1)
            {
                parserdetail = new Parser(new Lexer(htmldetail));
                dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "WordSection1")));
            }

            // Null is tested before Count (the previous order dereferenced first).
            if (dtnode == null || dtnode.Count <= 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            for (int k = 0; k < dtnode.Count; k++)
            {
                string tr1 = dtnode[k].ToPlainTextString().Replace(" ", "").Trim();
                if (k == 0)
                {
                    // The first node's text decides the invite type.
                    inviteType = ToolHtml.GetInviteTypes(tr1);
                }
                inviteCtx += tr1 + ":" + "\r\n";
            }

            Regex regPrjAddr = new Regex(@"(工程地点|建设地点):[^\r\n]+\r\n");
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("建设地点", "").Replace(":", "").Replace(";", "").Trim();

            Regex bildUnit = new Regex(@"(招标人|招标人(盖章)|招标人):[^\r\n]+[\r\n]{1}");
            buildUnit = bildUnit.Match(inviteCtx).Value.Replace("招 标人:", "").Replace("招标人(盖章):", "").Replace("招标人:", "").Trim();
            if (buildUnit != "" && buildUnit.Contains(":"))
            {
                int zz = buildUnit.IndexOf(":");
                buildUnit = buildUnit.Remove(zz).ToString();
            }

            Regex regcode = new Regex(@"(招标项目编号|项目编号)(:|:)[^\r\n]+[\r\n]{1}");
            code = regcode.Match(inviteCtx).Value.Replace("招标项目编号", "").Replace("项目编号", "").Replace(":", "").Replace(":", "").Trim();

            Regex regoType = new Regex(@"工程类型:[^\r\n]+\r\n");
            string oType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建"))
            {
                otherType = "房建及工业民用建筑";
            }
            else if (oType.Contains("市政"))
            {
                otherType = "市政工程";
            }
            else if (oType.Contains("园林绿化"))
            {
                otherType = "园林绿化工程";
            }
            else if (oType.Contains("装饰") || oType.Contains("装修"))
            {
                otherType = "装饰装修工程";
            }
            else if (oType.Contains("电力"))
            {
                otherType = "电力工程";
            }
            else if (oType.Contains("水利"))
            {
                otherType = "水利工程";
            }

            // Separate check (not part of the chain above): "环保" overrides any earlier match.
            if (oType.Contains("环保"))
            {
                otherType = "环保工程";
            }

            if (buildUnit.Contains("梅州市建设工程交易中心"))
            {
                buildUnit = "";
            }

            string msgType = "梅州市建设工程交易中心";
            string specType = "建设工程";

            // Strip residual Office XML namespace declarations from the plain text.
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("?xml:namespaceprefix=o/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?xml:namespaceprefix=st1/>", "").Trim();

            if (Encoding.Default.GetByteCount(code) >= 50)
            {
                code = string.Empty;
            }

            if (Encoding.Default.GetByteCount(prjAddress) >= 150)
            {
                prjAddress = string.Empty;
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "梅州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, oo, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the ASP.NET-paged grid at <c>SiteUrl</c> (table <c>MyGridView1</c>) and, for every
/// row whose detail page contains <c>td.PopupBody_context</c>, builds a <c>NotifyInfo</c> and
/// saves it directly through <c>ToolDb.SaveEntity</c> together with its images and attachments.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops once <c>MaxCount</c> notices have been processed.
/// </param>
/// <returns>
/// Always the (empty) list created at the start; records are persisted directly and never
/// added to it.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NotifyInfo>();
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    string viewState = string.Empty;
    string cookiestr = string.Empty;
    string eventValidation = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count: text between "/" and the following space in the pager row.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "gridview_PagerRow")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", " ");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the default of a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // The same form fields are posted for every page, together with the view state
            // of the page currently held (presumably a "next page" click; confirm).
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "keyTextBox", "PagerControl1:_ctl4", "PagerControl1:_ctl2.x", "PagerControl1:_ctl2.y" },
                new string[] { "", "", "", viewState, "7CE136E4", eventValidation, "", "", "3", "5" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MyGridView1")));
        if (listNode == null || listNode.Count <= 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;

        // Row 0 is skipped (presumably the header row; confirm).
        for (int j = 1; j < table.RowCount; j++)
        {
            string infoScorce = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[1].GetATag();
            string headName = aTag.LinkText;
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "PopupBody_context")));
            if (dtlNode == null || dtlNode.Count <= 0)
            {
                continue;
            }

            // The guard counts bytes while Substring counts characters; clamp the length so
            // an encoding with more than two bytes per character cannot make Substring throw.
            if (Encoding.Default.GetByteCount(headName) > 200)
            {
                headName = headName.Substring(0, Math.Min(100, headName.Length));
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();

            // Rewrite image URLs with the site prefix and remember them for download.
            List<string> listImg = new List<string>();
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode != null && imgNode.Count > 0)
            {
                for (int m = 0; m < imgNode.Count; m++)
                {
                    string imageUrl = (imgNode[m] as ImageTag).ImageURL;
                    string link = "http://publish.bcactc.com" + imageUrl;
                    listImg.Add(link);
                    ctxHtml = ctxHtml.GetReplace(imageUrl, link);
                }
            }

            string msgType = "北京市建设工程发包承包交易中心";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "北京市", "北京市区", "", infoCtx, "通知公告");
            sqlCount++;

            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                // Save every collected image. The index must follow the loop variable:
                // the previous code always passed listImg[0], storing the first image repeatedly.
                for (int a = 0; a < listImg.Count; a++)
                {
                    try
                    {
                        BaseAttach entity = ToolHtml.GetBaseAttach(listImg[a], headName, info.Id);
                        if (entity != null)
                        {
                            ToolDb.SaveEntity(entity, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort: a failed image does not stop the notice.
                    }
                }

                // Save attachment links; relative links are prefixed with the site root.
                parser = new Parser(new Lexer(ctxHtml));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag fileATag = aNode[k].GetATag();
                        if (!fileATag.IsAtagAttach())
                        {
                            continue;
                        }

                        BaseAttach obj = null;
                        try
                        {
                            if (fileATag.Link.ToLower().Contains("http"))
                            {
                                obj = ToolHtml.GetBaseAttach(fileATag.Link, headName, info.Id);
                            }
                            else
                            {
                                obj = ToolHtml.GetBaseAttach("http://publish.bcactc.com/" + fileATag.Link, headName, info.Id);
                            }
                        }
                        catch
                        {
                            // Best effort: a failed attachment is skipped.
                        }

                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                }
            }

            // Return the list, as every other exit path does, instead of null.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged tender list at <c>SiteUrl</c> (ASP.NET post-back paging through the
/// <c>pager</c> control) and builds one <c>InviteInfo</c> for every row of table <c>GV1</c>
/// whose detail page contains the <c>Lb_nr</c> span.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count: the first run of digits found in the href of the last link inside div#pager.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pager")), true), new TagNameFilter("a")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            Regex reg = new Regex(@"[0-9]+");
            string lastHref = pageNode[pageNode.Count - 1].GetATagHref().Replace("'", "");
            pageInt = int.Parse(reg.Match(lastHref).Value);
        }
        catch
        {
            // Best effort: if the pager cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // The post-back needs the hidden state fields of the page that was loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "hsa1$DD_LX", "hsa1$wd", "pager_input" },
                new string[] { viewState, "pager", i.ToString(), "", eventValidation, "综合搜索", "", (i - 1).ToString() }
            );
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GV1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        // The first and the last row of the grid are skipped, exactly as before.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, area = string.Empty,
                   msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            prjName = tr.Columns[0].ToNodePlainString();
            if (prjName.Contains("["))
            {
                // A leading "[area]" tag is moved out of the project name into the area value.
                area = prjName.Replace("[", "【").Replace("]", "】").GetRegexBegEnd("【", "】");
                prjName = prjName.Replace("[" + area + "]", "");
            }
            beginDate = tr.Columns[1].ToPlainTextString();

            // Fix: the previous replace mapped "&" onto itself and therefore did nothing. The href is
            // taken from HTML markup, so the escaped ampersand (ampersand + "amp;") has to be turned
            // back into a plain "&" before the URL is requested.
            InfoUrl = "http://www.ycsggzy.cn/" + tr.Columns[0].GetATagHref().Replace("&amp;", "&");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lb_nr")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml().GetReplace("<br>", "<br />");
            inviteCtx = HtmlTxt.ToCtxString();
            code = inviteCtx.GetCodeRegex();
            buildUnit = inviteCtx.GetBuildRegex();
            prjAddress = inviteCtx.GetAddressRegex().Replace(" ", "");
            inviteType = prjName.GetInviteBidType();
            specType = "建设工程";
            msgType = "银川市公共资源交易中心";

            InviteInfo info = ToolDb.GenInviteInfo("宁夏回族自治区", "宁夏回族自治区及地市", "银川市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the statically paged list (<c>index_N.htm</c>) below <c>SiteUrl</c> and builds one
/// <c>ItemPlan</c> for every row of table <c>arContent</c> whose detail page contains a
/// <c>div.detail</c> element.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<ItemPlan>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "arContent")));
    if (pageNode != null && pageNode.Count > 0)
    {
        // Fix: reading the last table row used to happen outside the try block, so a table without
        // rows aborted the whole crawl. The complete pager parsing is now best effort.
        try
        {
            TableTag pageTable = pageNode[0] as TableTag;
            // The last row carries the createPageHTML(...) call; everything but the page count is stripped.
            string temp = pageTable.Rows[pageTable.RowCount - 1].ToNodePlainString().Replace("createPageHTML", "").Replace("0,", "").Replace("(", "").Replace(")", "").Replace("index", "").Replace("htm", "").Replace(",", "").Replace("\"", "").Replace(";", "").Trim();
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Pager not readable: only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page i is requested as index_(i-1).htm.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "/index_" + (i - 1).ToString() + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "arContent")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        // The first and the last row (the latter is the pager row read above) are skipped.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j];

            // Fix: rows with fewer than four cells or without a link used to throw and abort the crawl.
            if (tr.ColumnCount < 4)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Fields that this site does not provide stay empty; they are kept as named locals so the
            // long positional GenItemPlan call below remains readable.
            string itemAddress = string.Empty, buildUnit = string.Empty, buildNature = string.Empty,
                   totalInvest = string.Empty, planInvest = string.Empty, issuedPlan = string.Empty,
                   investSource = string.Empty, approvalUnit = string.Empty, approvalDate = string.Empty,
                   approvalCode = string.Empty, planBeginDate = string.Empty, planEndDate = string.Empty,
                   itemContent = string.Empty;

            string itemName = aTag.GetAttribute("title");
            string itemCode = tr.Columns[2].ToNodePlainString();
            string planDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
            string infoUrl = this.SiteUrl + aTag.Link.Replace("../", "").Replace("./", "");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string itemCtx = ctxHtml.ToCtxString();

            // The detail table is flattened into "label:value" lines: even cells are labels, odd cells values.
            StringBuilder ctx = new StringBuilder();
            parser = new Parser(new Lexer(ctxHtml));
            NodeList dtlTable = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
            if (dtlTable != null && dtlTable.Count > 0)
            {
                TableTag tableTag = dtlTable[0] as TableTag;
                for (int k = 0; k < tableTag.RowCount; k++)
                {
                    TableRow dtlRow = tableTag.Rows[k];
                    for (int c = 0; c < dtlRow.ColumnCount; c++)
                    {
                        string cellText = dtlRow.Columns[c].ToNodePlainString();
                        if (c % 2 == 0)
                        {
                            // NOTE(review): both Replace calls are identical as written; one of them was
                            // presumably meant for the full-width colon - confirm against the original file.
                            ctx.Append(cellText.Replace(":", "").Replace(":", "")).Append(":");
                        }
                        else
                        {
                            ctx.Append(cellText).Append("\r\n");
                        }
                    }
                }
            }

            string msgUnit = ctx.ToString().GetRegex("发布单位");
            if (string.IsNullOrEmpty(msgUnit))
            {
                msgUnit = "发改委";
            }
            string planType = "项目审批信息";
            string msgType = "深圳市发展和改革委员会";

            ItemPlan info = ToolDb.GenItemPlan("广东省", "深圳市区", "", itemCode, itemName, itemAddress, buildUnit, buildNature, totalInvest, planInvest, issuedPlan, investSource, approvalUnit, approvalDate, approvalCode, msgUnit, planDate, planType, planBeginDate, planEndDate, ctxHtml, itemCtx, itemContent, msgType, infoUrl);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the change-notice list at <c>SiteUrl</c> (paged through the <c>&amp;page=N</c> query
/// parameter) and builds one <c>NoticeInfo</c> for every row of table <c>wsbs-table</c> whose
/// detail page contains a <c>div.xx-main</c> element. Attachment links found in the detail
/// HTML are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        // Fix: return the (empty) list instead of null, like the sibling crawlers do on the same failure.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination page-mar")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Pager not readable: only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "wsbs-table")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        // Row 0 is skipped (loop starts at 1), as before.
        for (int j = 1; j < table.RowCount; j++)
        {
            string InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty,
                   InfoUrl = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, htmlTxt = string.Empty;

            TableRow tr = table.Rows[j];
            InfoTitle = tr.Columns[1].ToNodePlainString();
            InfoType = "变更公示";
            PublistTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            InfoUrl = "http://www.gzggzy.cn" + tr.Columns[1].GetATagHref();

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "xx-main")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            // Fix: the search strings of the first four replaces had lost their entity form (the
            // quote one was not even a valid string literal, the others replaced a character with
            // itself). They are the escaped forms of less-than, greater-than, double quote and
            // ampersand, in that order, so that escaped markup inside the detail block is restored.
            // NOTE(review): the right curly quote removed at the end of this chain, and the
            // plus-minus sign and left curly quote removed below, may originally have been entity
            // names as well - confirm against the original file.
            htmlTxt = dtlNode.AsHtml().GetJsString()
                .Replace("&lt;", "<")
                .Replace("&gt;", ">")
                .Replace("&quot;", "\"")
                .Replace("&amp;", "&")
                .Replace("&lquot;", "")
                .Replace("”", "");
            InfoCtx = htmlTxt.Replace("</p>", "").Replace("<br/>", "").Replace("<br>", "").ToCtxString().Replace("±", "").Replace("&ldquot;", "").Replace("“", "");
            prjCode = InfoCtx.GetCodeRegex().GetChina().GetCodeDel().Replace("<br", "").Replace("/>", "");

            // Cut trailing contact / address text off the extracted build unit.
            buildUnit = InfoCtx.GetBuildRegex(null, true, 100);
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "广州政府采购", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "广州公共资源交易中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);

            // Collect attachment links; relative links are resolved against the site root.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag aTag = aNode[k].GetATag();
                    if (aTag.IsAtagAttach())
                    {
                        string link = string.Empty;
                        if (aTag.Link.ToLower().Contains("http"))
                        {
                            link = aTag.Link;
                        }
                        else
                        {
                            link = "http://www.gzggzy.cn" + aTag.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the procurement list of www.szgm.gov.cn, starting at <c>SiteUrl</c> and continuing
/// with the numbered follow-up pages, and builds one <c>InviteInfo</c> for every three-column
/// list row whose detail page contains a <c>div.article_body</c> element.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The items collected so far; crawling ends at the first list page that cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "easysite-total-page")));
    if (nodeList != null && nodeList.Count > 0)
    {
        string temp = nodeList.AsString();
        try
        {
            page = int.Parse(temp.GetRegexBegEnd("1/", "\n"));
        }
        catch
        {
            // Pager not readable: the fallback below applies.
        }
    }
    // NOTE(review): a page count of 1 (also the value left after a failed parse) is replaced by a
    // hard-coded 42 - presumably the known number of pages when this was written; confirm.
    if (page == 1)
    {
        page = 42;
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl("http://www.szgm.gov.cn/szgm/132100/xwdt17/135204/151246/8d25503a-" + i.ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                // Unlike the other crawlers a failing list page ends the crawl (kept as is).
                return list;
            }
        }

        parser = new Parser(new Lexer(htl));
        // Rows whose direct parent is a table with cellspacing="0".
        NodeList tabList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "0"))), new TagNameFilter("tr")));
        if (tabList == null || tabList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tabList.Count; j++)
        {
            ATag aTag = null;
            TableRow tr = null;
            try
            {
                // Fix: the selector above yields row nodes, but the node used to be cast to TableTag
                // only; for a row that cast gives null, the resulting exception was swallowed and the
                // row skipped. A row node is now used directly; a table node still contributes its
                // first row, which is what the old code did.
                tr = tabList[j] as TableRow;
                if (tr == null)
                {
                    TableTag nested = tabList[j] as TableTag;
                    if (nested != null && nested.RowCount > 0)
                    {
                        tr = nested.Rows[0];
                    }
                }
                if (tr == null || tr.ColumnCount != 3)
                {
                    continue;
                }
                aTag = tr.GetATag();
                if (aTag == null)
                {
                    continue;
                }
            }
            catch
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, inviteType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://www.szgm.gov.cn" + aTag.Link;

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article_body")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml();
            inviteCtx = HtmlTxt.ToCtxString();
            code = inviteCtx.GetCodeRegex().GetCodeDel();
            buildUnit = inviteCtx.GetBuildRegex();
            // Fix: the address used to be filled with GetBuildRegex(), a copy of the line above, so
            // the project address always repeated the build unit.
            prjAddress = inviteCtx.GetAddressRegex();
            inviteType = prjName.GetInviteBidType();
            specType = "政府采购";
            msgType = "深圳市光明新区";
            if (string.IsNullOrEmpty(buildUnit))
            {
                // No build unit found in the text: fall back to the publishing authority.
                buildUnit = msgType;
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "光明新区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the change-notice list at <c>SiteUrl</c> (ASP.NET post-back paging through the
/// <c>Pager</c> control). Six-column rows whose bracketed project type names a hydraulic,
/// construction or traffic project are turned into <c>NoticeInfo</c> items; attachment links
/// of the detail page are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int totalPages = 1;
    string pageHtml = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        // A failed first request is ignored; the empty page is parsed and yields no rows.
    }

    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerCells.AsString().GetRegexBegEnd("总页数:", "当前"));
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Post-back built from the hidden state fields of the page loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(pageHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(pageHtml);
            string[] fieldNames = new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__EVENTVALIDATION" };
            string[] fieldValues = new string[] { "Pager", pageNo.ToString(), "", viewState, eventValidation };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList rows = parser.ExtractAllNodesThatMatch(new TagNameFilter("tr"));
        if (rows == null || rows.Count == 0)
        {
            continue;
        }

        for (int rowIdx = 0; rowIdx < rows.Count; rowIdx++)
        {
            TableRow row = rows[rowIdx] as TableRow;
            if (row.ColumnCount != 6)
            {
                continue;
            }

            ATag titleLink = row.GetATag();
            if (titleLink == null)
            {
                continue;
            }

            // Only the three listed project types are of interest.
            string prjType = row.Columns[2].ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            if (!(prjType.Contains("水利工程") || prjType.Contains("建设工程") || prjType.Contains("交通工程")))
            {
                continue;
            }

            string infoType = "变更公示";
            string infoTitle = titleLink.GetAttribute("title");
            string infoUrl = "http://ggzyjy.jl.gov.cn/JiLinZtb/" + titleLink.Link.GetReplace("../,./");
            string area = row.Columns[3].ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            string publishTime = row.Columns[4].ToPlainTextString().GetDateRegex();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList content = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (content == null || content.Count == 0)
            {
                continue;
            }

            string htmlTxt = content.AsHtml();
            string infoCtx = htmlTxt.GetReplace("</p>,<br />,<br/>", "\r\n").ToCtxString();
            string prjCode = infoCtx.GetCodeRegex().GetCodeDel();

            // Trim the build unit: keep up to and including the company suffix, drop contact / address tails.
            string buildUnit = infoCtx.GetBuildRegex();
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Substring(0, buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Substring(0, buildUnit.IndexOf("联系"));
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Substring(0, buildUnit.IndexOf("地址"));
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("吉林省", "吉林省及地市", area, string.Empty, infoTitle, infoType, infoCtx, publishTime, string.Empty, "吉林省公共资源交易中心", infoUrl, prjCode, buildUnit, string.Empty, string.Empty, "建设工程", prjType, htmlTxt);
            list.Add(info);

            // Attachments: relative links are resolved against the site root.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0)
            {
                for (int linkIdx = 0; linkIdx < links.Count; linkIdx++)
                {
                    ATag a = links[linkIdx] as ATag;
                    if (!a.IsAtagAttach())
                    {
                        continue;
                    }
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://ggzyjy.jl.gov.cn/" + a.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(a.LinkText, info.Id, link));
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Reads the guide list at <c>SiteUrl</c>, builds a <c>NotifyInfo</c> for every row of table
/// <c>lefttable</c> whose detail page contains <c>div#context_div</c>, and saves it directly
/// through <c>ToolDb.SaveEntity</c>. For newly saved entries the images and attachment links
/// of the detail HTML are saved as well.
/// </summary>
/// <param name="crawlAll">When false, processing stops once <c>MaxCount</c> save attempts were made.</param>
/// <returns>Always null; the results are persisted inside the method.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int totalPages = 1;
    int attempts = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return null;
    }

    // Determine the page count from the text of the last pager link, e.g. "...(N)".
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "scott")), true), new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            string bracketed = pagerLinks[pagerLinks.Count - 1].GetATagValue().Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
            totalPages = Convert.ToInt32(bracketed);
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // NOTE(review): listHtml is never re-requested inside this loop, so every pass parses the
        // page downloaded above again - the request for page N seems to be missing; confirm.
        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (tables == null || tables.Count == 0)
        {
            continue;
        }

        TableTag grid = tables[0] as TableTag;
        for (int rowIdx = 1; rowIdx < grid.RowCount - 1; rowIdx++)
        {
            TableRow row = grid.Rows[rowIdx];
            string infoType = "办事指南";
            string infoSource = string.Empty;
            string headName = row.Columns[1].ToNodePlainString();
            string releaseTime = row.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = row.Columns[1].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString().Replace("<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />", "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList content = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (content == null || content.Count == 0)
            {
                continue;
            }

            string ctxHtml = content.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.HuiZhouMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "惠州市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && attempts >= this.MaxCount)
            {
                return null;
            }
            attempts++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Images embedded in the detail HTML.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList images = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (images != null && images.Count > 0)
            {
                for (int imgIdx = 0; imgIdx < images.Count; imgIdx++)
                {
                    ImageTag image = images[imgIdx] as ImageTag;
                    BaseAttach imageAttach = ToolHtml.GetBaseAttachByUrl(image.GetAttribute("src"), headName, info.Id);
                    if (imageAttach != null)
                    {
                        ToolDb.SaveEntity(imageAttach, string.Empty);
                    }
                }
            }

            // Attachment links of the detail HTML.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0)
            {
                for (int linkIdx = 0; linkIdx < links.Count; linkIdx++)
                {
                    ATag link = links[linkIdx] as ATag;
                    if (!link.IsAtagAttach())
                    {
                        continue;
                    }
                    BaseAttach fileAttach = ToolHtml.GetBaseAttachByUrl(link.Link, link.LinkText, info.Id);
                    if (fileAttach != null)
                    {
                        ToolDb.SaveEntity(fileAttach, string.Empty);
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the bid-result list at <c>SiteUrl</c> (ASP.NET post-back paging through
/// <c>MoreInfoList1$Pager</c>) and builds one <c>BidInfo</c> for every row of grid
/// <c>MoreInfoList1_DataGrid1</c> whose detail page contains <c>td#TDContent</c>. When the
/// plain text yields no winning bidder, the first table of the detail HTML is flattened into
/// "label:value" lines and searched instead.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string pageText = pageNode.AsString().GetRegexBegEnd("总页数:", "当");
            pageInt = int.Parse(pageText);
        }
        catch
        {
            // Pager not readable: only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "MoreInfoList1$txtTitle" },
                new string[] { viewState, "MoreInfoList1$Pager", i.ToString(), eventValidation, "" }
            );
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;

            TableRow tr = table.Rows[j];

            // Fix: every row, including index 0, is visited; a row with fewer than three cells or
            // without a link used to throw and abort the whole crawl. Such rows are skipped now.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            prjName = aTag.GetAttribute("title");
            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();

            // The area is the text between the first "[" and the first "]" of the title cell.
            // Fix: a "]" in front of the "[" produced a negative Substring length and an exception.
            string titleText = tr.Columns[1].ToNodePlainString();
            int open = titleText.IndexOf('[');
            int close = titleText.IndexOf(']');
            if (open >= 0 && close > open)
            {
                area = titleText.Substring(open, close - open).GetReplace("[,]");
            }

            InfoUrl = "http://www.sxszbb.com" + aTag.Link;
            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml();
            bidCtx = HtmlTxt.ToCtxString();
            buildUnit = bidCtx.GetBuildRegex();
            code = bidCtx.GetCodeRegex().GetCodeDel();
            bidMoney = bidCtx.GetMoneyRegex();
            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("第一名");
            }
            prjMgr = bidCtx.GetMgrRegex();

            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                // Fallback: rebuild "label:value" text from the first table of the detail HTML.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    string ctx = string.Empty;
                    TableTag tag = tableNode[0] as TableTag;
                    // isBreak: the result heading was seen; the NEXT row then holds the column labels.
                    // rBreak:  that label row was consumed, so the row loop ends after it.
                    bool isBreak = false, rBreak = false;
                    for (int r = 0; r < tag.RowCount; r++)
                    {
                        for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                        {
                            string strTemp = tag.Rows[r].Columns[c].ToNodePlainString();
                            if (strTemp.Contains("评标结果"))
                            {
                                // The rest of the heading row is ignored.
                                isBreak = true;
                                break;
                            }
                            if (isBreak)
                            {
                                rBreak = true;
                                try
                                {
                                    // Label from this row, value from the same column of the row below.
                                    ctx += tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:") + ":";
                                    ctx += tag.Rows[r + 1].Columns[c].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                }
                                catch
                                {
                                    // Missing value row / cell: the label stays without a value.
                                }
                            }
                            else
                            {
                                // Before the heading: odd cells are labels, even cells their values.
                                if ((c + 1) % 2 == 0)
                                {
                                    ctx += strTemp.GetReplace(":,:") + "\r\n";
                                }
                                else
                                {
                                    ctx += strTemp.GetReplace(":,:") + ":";
                                }
                            }
                        }
                        if (rBreak)
                        {
                            break;
                        }
                    }

                    bidUnit = ctx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = ctx.GetMoneyRegex();
                    }
                    if (string.IsNullOrWhiteSpace(prjMgr))
                    {
                        prjMgr = ctx.GetMgrRegex();
                    }
                    if (string.IsNullOrWhiteSpace(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrWhiteSpace(code))
                    {
                        code = ctx.GetCodeRegex().GetCodeDel();
                    }
                }
            }

            // Discard obviously wrong matches.
            if (buildUnit.Contains("单位章"))
            {
                buildUnit = string.Empty;
            }
            if (buildUnit.Contains("联系人"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系人"));
            }
            if (prjMgr.Contains("中标"))
            {
                prjMgr = string.Empty;
            }

            specType = bidType = "建设工程";
            msgType = "陕西省建设工程招标投标管理办公室";
            BidInfo info = ToolDb.GenBidInfo("陕西省", "陕西省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the procurement list at <c>SiteUrl</c>; follow-up pages are requested by posting the
/// page index to <c>queryMoreInfoList.do</c>. Every list entry whose detail page contains
/// <c>div.zw_c_c_cont</c> becomes an <c>InviteInfo</c>; attachment links of the detail HTML
/// are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> items were collected.</param>
/// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string listHtml = string.Empty;
    int totalPages = 1;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Page count: argument of the turnOverPage(...) call in the second-to-last link of the pager form.
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerForm = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "qPageForm")));
    if (pagerForm != null && pagerForm.Count > 0)
    {
        try
        {
            Parser pagerParser = new Parser(new Lexer(pagerForm.ToHtml()));
            NodeList pagerLinks = pagerParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (pagerLinks != null && pagerLinks.Count > 0)
            {
                string href = pagerLinks[pagerLinks.Count - 2].GetATagHref();
                totalPages = int.Parse(href.Replace("turnOverPage", "").Replace("(", "").Replace(")", "").Replace(";", ""));
            }
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            string[] fieldNames = new string[] { "channelCode", "pageIndex", "pageSize", "pointPageIndexId" };
            string[] fieldValues = new string[] { "0005", pageNo.ToString(), "15", "1" };
            NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://foshan.gdgpo.com/queryMoreInfoList.do", form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "m_m_c_list")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        for (int entryIdx = 0; entryIdx < entries.Count; entryIdx++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            ATag titleLink = entries[entryIdx].GetATag(1);
            string prjName = titleLink.GetAttribute("title");
            string beginDate = entries[entryIdx].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://foshan.gdgpo.com" + titleLink.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList content = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zw_c_c_cont")));
            if (content == null || content.Count == 0)
            {
                continue;
            }

            string htmlTxt = content.AsHtml();
            string inviteCtx = htmlTxt.ToCtxString();

            // The extracted code is cut at the first "<" and then at the first "(".
            string code = inviteCtx.GetCodeRegex();
            if (code.Contains("<"))
            {
                code = code.Substring(0, code.IndexOf("<"));
            }
            if (code.Contains("("))
            {
                code = code.Substring(0, code.IndexOf("("));
            }

            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string inviteType = prjName.GetInviteBidType();
            string msgType = "佛山市政府采购";
            string specType = "政府采购";

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);

            // Attachments: links without "http" are prefixed with the gov.cn host.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0)
            {
                for (int linkIdx = 0; linkIdx < links.Count; linkIdx++)
                {
                    ATag fileLinkTag = links[linkIdx].GetATag();
                    if (!fileLinkTag.IsAtagAttach())
                    {
                        continue;
                    }
                    string fileName = fileLinkTag.LinkText.ToNodeString().Replace(" ", "");
                    string fileLink = fileLinkTag.Link;
                    if (!fileLink.ToLower().Contains("http"))
                    {
                        fileLink = "http://foshan.gdgpo.gov.cn" + fileLinkTag.Link;
                    }
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileName, info.Id, fileLink));
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the listing at <c>SiteUrl</c> (pages are addressed with <c>&amp;page=N</c>), opens each
/// linked detail page and, depending on the title, records it as a <c>BidInfo</c> (title contains
/// "中标", "成交" or "结果") or as an <c>InviteInfo</c> (everything else). Attachment links found
/// in the detail block are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        // Without the first listing page there is nothing to crawl.
        return list;
    }

    // The page count is read from the text of the last option of the first <select> on the page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("select"));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            SelectTag selTag = sNode[0] as SelectTag;
            string temp = selTag.OptionTags[selTag.OptionTags.Length - 1].ToNodePlainString();
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            // Pager not readable: fall back to a single page.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                // Skip a page that fails to load and carry on with the next one.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "taxis_border")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        TableTag table = viewList[0] as TableTag;

        // Row 0 is skipped (iteration starts at 1); rows without an anchor are skipped below.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            string prjName = string.Empty, InfoUrl = string.Empty, beginDate = string.Empty, HtmlTxt = string.Empty;

            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                continue;
            }

            prjName = aTag.LinkText.Trim();
            InfoUrl = "http://www.hzhlz.gov.cn/" + aTag.Link;
            beginDate = tr.Columns[3].ToPlainTextString().GetDateRegex();

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                // Detail page unavailable: skip this entry.
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();

            // Normalised text of the detail block. Computed once and reused by whichever branch
            // runs below (the original recomputed the identical expression in each branch).
            string ctx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();

            // Prefer the title shown on the detail page over the link text from the listing;
            // entries whose detail page has no td.font1 title cell are skipped.
            parser.Reset();
            NodeList nodeName = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "font1")));
            if (nodeName == null || nodeName.Count == 0)
            {
                continue;
            }
            prjName = nodeName[0].ToNodePlainString().GetReplace(" ").Trim();

            if (prjName.Contains("中标") || prjName.Contains("成交") || prjName.Contains("结果"))
            {
                // Result announcement -> BidInfo.
                string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                       endDate = string.Empty, bidType = string.Empty, specType = string.Empty, msgType = string.Empty,
                       prjMgr = string.Empty, otherType = string.Empty;
                string bidCtx = ctx;

                code = bidCtx.GetCodeRegex().GetCodeDel();

                // Trim the unit at "招标代理", then keep it up to and including the first "公司".
                buildUnit = bidCtx.GetBuildRegex();
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                bidUnit = bidCtx.GetBidRegex();
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("中标候选公司");
                }

                // Amounts above 100000 are divided by 10000 (presumably yuan -> ten-thousand yuan;
                // TODO confirm). Unparseable amounts are left as extracted.
                bidMoney = bidCtx.GetMoneyRegex();
                decimal money;
                if (decimal.TryParse(bidMoney, out money) && money > 100000)
                {
                    bidMoney = (money / 10000).ToString();
                }

                msgType = "广东省惠州市惠城区横沥镇人民政府";
                specType = "政府采购";
                bidType = prjName.GetInviteBidType();

                // beginDate is passed twice, exactly as in the original call.
                BidInfo info = ToolDb.GenBidInfo(
                    "广东省", "惠州市区", "惠城区", string.Empty, code, prjName, buildUnit, beginDate,
                    bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType,
                    otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                // Collect attachment links from the detail block.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a.IsAtagAttach())
                        {
                            string link = string.Empty;
                            if (a.Link.ToLower().Contains("http"))
                            {
                                link = a.Link;
                            }
                            else
                            {
                                link = "http://www.hzhlz.gov.cn/" + a.Link;
                            }
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                // Anything else is treated as an invitation notice -> InviteInfo.
                string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                       inviteType = string.Empty, specType = string.Empty, endDate = string.Empty,
                       remark = string.Empty, msgType = string.Empty, otherType = string.Empty;
                string inviteCtx = ctx;

                inviteType = prjName.GetInviteBidType();
                code = inviteCtx.GetCodeRegex().GetCodeDel();
                buildUnit = inviteCtx.GetBuildRegex();
                prjAddress = inviteCtx.GetAddressRegex();

                // Same unit clean-up as in the bid branch.
                if (buildUnit.Contains("招标代理"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                }
                if (buildUnit.Contains("公司"))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                }

                msgType = "广东省惠州市惠城区横沥镇人民政府";
                specType = "政府采购";

                InviteInfo info = ToolDb.GenInviteInfo(
                    "广东省", "惠州市区", "惠城区", string.Empty, code, prjName, prjAddress, buildUnit,
                    beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType,
                    InfoUrl, HtmlTxt);
                list.Add(info);

                // Collect attachment links from the detail block.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a.IsAtagAttach())
                        {
                            string link = string.Empty;
                            if (a.Link.ToLower().Contains("http"))
                            {
                                link = a.Link;
                            }
                            else
                            {
                                link = "http://www.hzhlz.gov.cn/" + a.Link;
                            }
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    return list;
}