/// <summary>
/// Crawls the bid-result list published at <c>SiteUrl</c> (shajing.gov.cn), follows every
/// list entry to its detail page and turns the detail text into a record built by
/// <c>ToolDb.GenBidInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string listBaseUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zhbgg/";
    const string defaultUnit = "深圳市宝安区沙井街道办事处";

    IList list = new ArrayList();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager is emitted as a createPageHTML(...) script call. The replace chain removes
    // the call syntax; whatever remains is parsed as the page count (1 on any failure).
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace("0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(pagerText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Patterns do not depend on the item, so they are built once instead of per list entry.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额)(:|:|)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    // The previous pattern "[0-9]+[.]{0,1}[0-9]+" needed at least two digits and therefore
    // returned nothing for single-digit amounts such as "5万".
    Regex regBidMoney = new Regex(@"[0-9]+(?:[.][0-9]+)?");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Page 1 is SiteUrl itself; page n (n > 1) is served as index_{n-1}.html.
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(listBaseUrl + "index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string endDate = string.Empty, otherType = string.Empty;
            string prjName = string.Empty, InfoUrl = string.Empty;

            string itemText = viewList[j].ToPlainTextString().Trim();
            string beginDate = regDate.Match(itemText).Value;

            // string.Replace throws ArgumentException for an empty search value, which used
            // to abort the whole crawl when an entry carried no date.
            string linkMarkup = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;

            // The entry text contains script source; the anchor after "else{" is cut out by
            // index arithmetic. Any failed lookup (index -1) throws and skips this entry.
            try
            {
                int beg = linkMarkup.IndexOf("else{");
                linkMarkup = linkMarkup.Substring(beg);
                beg = linkMarkup.IndexOf("<a");
                int end = linkMarkup.IndexOf("/a>");
                linkMarkup = linkMarkup.Substring(beg, (end - beg) + 3);
                beg = linkMarkup.IndexOf(">");
                end = linkMarkup.IndexOf("</");
                prjName = linkMarkup.Substring(beg + 1, end - beg - 1);

                Parser linkParser = new Parser(new Lexer(linkMarkup));
                NodeList links = linkParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = links.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = listBaseUrl + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                continue;
            }

            string htlDtl = string.Empty;
            try
            {
                htlDtl = regexHtml.Replace(this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8), "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            string bidCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // All field patterns run against the same space-free copy of the text.
            string compactCtx = bidCtx.Replace(" ", "");

            string code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
            string bidMoney = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Trim();
            string bidUnit = regBidUnit.Match(compactCtx).Value.Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
            string prjMgr = regprjMgr.Match(compactCtx).Value.Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

            if (bidMoney.Contains("万"))
            {
                // Amount already carries the "万" unit: keep the number in front of it.
                bidMoney = regBidMoney.Match(bidMoney.Remove(bidMoney.IndexOf("万")).Trim()).Value;
            }
            else
            {
                // No "万" unit: divide by 10000; results below 0.1 and unparsable text become "0".
                decimal amount;
                if (decimal.TryParse(regBidMoney.Match(bidMoney).Value, out amount))
                {
                    amount = amount / 10000;
                    bidMoney = amount < 0.1m ? "0" : amount.ToString();
                }
                else
                {
                    bidMoney = "0";
                }
            }

            if (prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = defaultUnit;
            }

            string msgType = defaultUnit;
            string specType = "建设工程";
            // The bid type is fixed for this source. The former keyword chain on the project
            // name was always overwritten by this value and has been removed.
            string bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-notice list published at <c>SiteUrl</c> (bidding.csg.cn), follows every
/// list entry to its detail page, builds a record via <c>ToolDb.GenInviteInfo</c> and
/// registers the attachments linked from the detail page in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.bidding.csg.cn";
    const string defaultUnit = "中国南方电网有限责任公司招标服务中心";
    const string seeNotice = "见招标信息";

    IList list = new List<InviteInfo>();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count is the text between "/" and "页" in the pager block; 1 on any failure.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Top10 TxtCenter")));
    if (noList != null && noList.Count > 0)
    {
        string pagerText = noList.AsString().GetRegexBegEnd("/", "页");
        try
        {
            pageInt = Convert.ToInt32(pagerText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(siteRoot + "/zbgg/index_" + i.ToString() + ".jhtml", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "W750 Right")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // The loop starts at index 1, so the first matched <li> is never processed.
        // NOTE(review): presumably a header/navigation entry - confirm against the live page.
        for (int j = 1; j < nodeList.Count; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            // An <li> without an anchor cannot be followed; skip it instead of failing the crawl.
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.LinkText;
            string beginDate = nodeList[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = siteRoot + aTag.Link;

            string htlDtl = string.Empty;
            try
            {
                htlDtl = ToolHtml.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Center W1000")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();

            // Prefer the headline of the detail page over the link text from the list.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nameNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("class", "TxtCenter Padding10")));
            if (nameNode != null && nameNode.Count > 0)
            {
                prjName = nameNode[0].ToNodePlainString();
            }

            string inviteCtx = HtmlTxt.ToCtxString();
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string prjAddress = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex), 150);
            string buildUnit = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex), 150);
            string code = ToolHtml.GetSubString(ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex), 50);

            if (string.IsNullOrEmpty(code))
            {
                code = seeNotice;
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = seeNotice;
            }
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = defaultUnit;
            }

            string specType = "其他";
            string msgType = defaultUnit;

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register every anchor of the detail block that IsAtagAttach() accepts.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeAtag != null)
            {
                for (int c = 0; c < nodeAtag.Count; c++)
                {
                    ATag a = nodeAtag[c] as ATag;
                    if (a == null || string.IsNullOrEmpty(a.Link) || !a.Link.IsAtagAttach())
                    {
                        continue;
                    }

                    // Absolute links are used as they are; prefixing the site root would
                    // produce an unusable URL. Relative links keep the original concatenation.
                    bool isAbsolute = a.Link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                        || a.Link.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                    string alink = isAbsolute ? a.Link : siteRoot + "/" + a.Link;

                    try
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                    catch
                    {
                        // Best effort: a broken attachment must not drop the notice itself.
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-notice list published at <c>SiteUrl</c> (szns.gov.cn), follows every
/// list entry to its detail page and builds a record via <c>ToolDb.GenInviteInfo</c>.
/// </summary>
/// <remarks>
/// Only the first list page is processed. The previous implementation read a page count
/// from the "jwpage" block and looped that many times, but its "next page" branch was empty,
/// so every iteration re-parsed the first page and added the same entries again.
/// TODO: add real paging once the URL scheme of the following pages is known.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string defaultUnit = "深圳市南山区粤海街道办事处";

    IList list = new ArrayList();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwRercon"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
    if (dtlList == null || dtlList.Count == 0)
    {
        return list;
    }

    // Patterns do not depend on the item, so they are built once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    // All anchors below the list items, collected once. The j-th anchor is taken as the
    // link of the j-th item, which assumes one anchor per <li> (as the original code did).
    NodeList anchors = dtlList.SearchFor(typeof(ATag), true);

    for (int j = 0; j < dtlList.Count; j++)
    {
        string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

        string itemText = dtlList[j].ToPlainTextString().Trim();

        // The title is everything in front of the first "[". Remove(-1) would throw, so an
        // item without "[" keeps its full text instead of aborting the crawl.
        int bracketAt = itemText.IndexOf("[");
        string prjName = bracketAt >= 0 ? itemText.Remove(bracketAt) : itemText;
        string beginDate = regDate.Match(itemText).Value;

        ATag aTag = (anchors != null && j < anchors.Count) ? anchors[j] as ATag : null;
        if (aTag == null)
        {
            continue;
        }
        string InfoUrl = "http://www.szns.gov.cn" + aTag.Link;

        string htmDtl = string.Empty;
        try
        {
            htmDtl = regexHtml.Replace(this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8), "");
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmDtl));
        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
        if (dtl == null || dtl.Count == 0)
        {
            continue;
        }

        string HtmlTxt = dtl.AsHtml();
        string inviteCtx = dtl.AsString().Replace(" ", "").Replace("\n", "\r\n");

        // All field patterns run against the same space-free copy of the text.
        string compactCtx = inviteCtx.Replace(" ", "");
        string prjAddress = regPrjAddr.Match(compactCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
        string buildUnit = regBuildUnit.Match(compactCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
        string code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

        if (string.IsNullOrEmpty(prjAddress))
        {
            prjAddress = "见招标信息";
        }
        code = ToolHtml.GetSubString(code, 50);
        buildUnit = ToolHtml.GetSubString(buildUnit, 150);
        if (string.IsNullOrEmpty(buildUnit))
        {
            buildUnit = defaultUnit;
        }

        string msgType = defaultUnit;
        string specType = "建设工程";
        // The type is fixed for this source. The former keyword chain on the project name
        // was always overwritten by this value and has been removed.
        string inviteType = ToolHtml.GetInviteType("小型工程");

        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "南山区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);
        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the announcement table published at <c>SiteUrl</c> (conghua.gov.cn). Rows whose
/// title contains "中标", "结果" or "候选单位公示" are stored as bid results
/// (<c>ToolDb.GenBidInfo</c>); all other rows are stored as tender notices
/// (<c>ToolDb.GenInviteInfo</c>).
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records (both kinds in one list). The list is empty when the first list
/// page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.conghua.gov.cn";
    const string defaultUnit = "广州建设工程交易中心";
    const string fallbackType = "小型工程";

    IList list = new ArrayList();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count is the text between "共" and "页" of the pager; when a pager exists but the
    // number cannot be parsed, 7 pages are assumed (value taken over from the original code).
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("div")), new HasAttributeFilter("id", "page_div")));
    if (sNode != null && sNode.Count > 0)
    {
        string pageText = ToolHtml.GetRegexString(sNode.AsString(), "共", "页");
        if (!int.TryParse(pageText, out pageInt))
        {
            pageInt = 7;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // NOTE(review): the first page is read as UTF-8 but follow-up pages use
                // Encoding.Default - kept as in the original; confirm which one is right.
                html = this.ToolWebSite.GetHtmlByUrl(siteRoot + "/zgch/zbzb/list_" + i.ToString() + ".shtml", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_list"))), new TagNameFilter("table")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        TableTag table = sNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // The date is read from the second cell; rows without it cannot be processed.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string code = string.Empty, prjAddress = string.Empty, endDate = string.Empty,
                   remark = string.Empty, otherType = string.Empty;

            string rowHtml = tr.ToHtml();
            string projectName = ToolHtml.GetHtmlAtagValue("title", rowHtml);
            bool isBidResult = projectName.Contains("中标") || projectName.Contains("结果") || projectName.Contains("候选单位公示");

            // Everything up to the detail block is identical for notices and results.
            string typeByName = ToolHtml.GetInviteTypes(projectName);
            string beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
            string InfoUrl = siteRoot + ToolHtml.GetHtmlAtagValue("href", rowHtml).Replace("..", "");

            string htmlDtl = string.Empty;
            try
            {
                htmlDtl = ToolHtml.GetRegexHtlTxt(this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8));
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.ToHtml();
            string msgType = defaultUnit;
            string specType = "建设工程";

            if (!isBidResult)
            {
                string inviteCtx = dtlList.AsString().Replace(" ", "");
                string buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex, true);
                if (!string.IsNullOrEmpty(buildUnit) && buildUnit.Contains(" "))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                }
                buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = defaultUnit;
                }

                string inviteType = string.IsNullOrEmpty(typeByName) ? fallbackType : typeByName;

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "从化市", string.Empty, code, projectName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
            }
            else
            {
                string bidUnit = string.Empty, bidMoney = string.Empty, prjMgr = string.Empty;

                string bidCtx = dtlList.AsString();
                string buildUnit = ToolHtml.GetSubString(ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true), 150);

                // Fixed: the original ternary was inverted (bidType == "" ? bidType : "小型工程"),
                // which kept an empty type empty and replaced every detected type by the
                // fallback. It now mirrors the notice branch.
                string bidType = string.IsNullOrEmpty(typeByName) ? fallbackType : typeByName;

                // The first table of the detail block is flattened into "label:value" lines
                // (first cell : second cell) which the field extractors then search.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    string ctx = string.Empty;
                    TableTag bidTable = bidNode[0] as TableTag;
                    try
                    {
                        for (int r = 0; r < bidTable.RowCount; r++)
                        {
                            ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                            ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                        }
                    }
                    catch
                    {
                        // A malformed row ends the flattening; what was collected so far is used.
                    }

                    bidUnit = ctx.GetRegex("单位名称,承包意向人名称");
                    bidMoney = ctx.GetMoneyRegex();
                    prjMgr = ctx.GetMgrRegex();
                    if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("/"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("/"));
                    }
                }

                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = defaultUnit;
                }

                BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, projectName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-notice table published at <c>SiteUrl</c> (zhaobiao.szpt.edu.cn), follows
/// every row to its detail page, builds a record via <c>ToolDb.GenInviteInfo</c> and
/// registers linked .doc/.dwg/.xls files in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://zhaobiao.szpt.edu.cn";

    IList list = new ArrayList();
    string htl;
    string cookiestr = string.Empty;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count is the number in front of "页" inside the right-aligned cell(s);
    // it stays 1 when the cell is missing or the number cannot be parsed.
    int page = 1;
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "right")));
    Regex regexPage = new Regex(@"\d+页");
    try
    {
        page = Convert.ToInt32(regexPage.Match(nodeList.AsString()).Value.Replace("页", "").Trim());
    }
    catch (Exception)
    {
        page = 1;
    }

    // Patterns do not depend on the item, so they are built once.
    Regex regeximg = new Regex(@"<IMG[^>]*>");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regcode = new Regex(@"(项目编号|招标编号)(:|:)[^\r\n]+\r\n");
    Regex regprjAddress = new Regex(@"地址(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(招标机构|委托单位)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));

        // The listing is the fourth centered table. The original only checked Count > 1
        // before reading index 3.
        if (tableNodeList == null || tableNodeList.Count <= 3)
        {
            continue;
        }
        TableTag table = tableNodeList[3] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The last row of the table is not processed (loop bound taken from the original).
        for (int j = 0; j < table.RowCount - 1; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.GetATag(1);
            if (aTag == null || tr.ColumnCount < 2)
            {
                continue;
            }

            string prjName = aTag.LinkText;
            if (prjName == "参加网上竞价招标供应商,敬请浏览以下网站")
            {
                continue;
            }

            string beginDate = tr.Columns[1].ToPlainTextString().Trim();
            string InfoUrl = siteRoot + "/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new TagNameFilter("p"));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            // Stored HTML: all paragraphs with <IMG> tags removed.
            string HtmlTxt = regeximg.Replace(dtnode.AsHtml(), "");

            // Plain text: one line per paragraph.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int z = 0; z < dtnode.Count; z++)
            {
                ctxBuilder.Append(dtnode[z].ToPlainTextString().Replace(" ", "").Trim()).Append("\r\n");
            }
            string inviteCtx = regexHtml.Replace(ctxBuilder.ToString(), "");

            string code = regcode.Match(inviteCtx).Value.Replace("项目编号:", "").Replace("招标编号:", "").Replace(":", "").Trim();
            code = ToolHtml.GetSubString(code, 30);
            string prjAddress = regprjAddress.Match(inviteCtx).Value.Replace("地址:", "").Trim();
            string buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标机构:", "").Replace("委托单位:", "").Trim();

            // NOTE(review): the original chose between "建设工程" and "其他" by testing
            // inviteType BEFORE it was assigned, so the result was always "其他". That
            // effective behavior is kept; confirm whether type-based classification is wanted.
            string specType = "其他";

            if (prjAddress == "")
            {
                prjAddress = "见招标信息";
            }
            string msgType = "深职院";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Attachments: anchors below any <p> whose link text names a .doc/.dwg/.xls file.
            parserdetail = new Parser(new Lexer(htmldetail));
            NodeList nodedown = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("p"), true), new TagNameFilter("a")));
            if (nodedown != null && nodedown.Count > 0)
            {
                // Collected once; the original repeated this search for every index.
                NodeList downAnchors = nodedown.SearchFor(typeof(ATag), true);
                for (int k = 0; k < nodedown.Count; k++)
                {
                    ATag aTagdown = (downAnchors != null && k < downAnchors.Count) ? downAnchors[k] as ATag : null;
                    if (aTagdown == null || aTagdown.LinkText == null || string.IsNullOrEmpty(aTagdown.Link))
                    {
                        continue;
                    }

                    string downName = aTagdown.LinkText;
                    if (!(downName.Contains(".doc") || downName.Contains(".dwg") || downName.Contains(".xls")))
                    {
                        continue;
                    }

                    // The original concatenated host and link without a separator, which
                    // breaks for links that do not start with "/" and for absolute links.
                    string link = aTagdown.Link;
                    string downUrl;
                    if (link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    {
                        downUrl = link;
                    }
                    else if (link.StartsWith("/", StringComparison.Ordinal))
                    {
                        downUrl = siteRoot + link;
                    }
                    else
                    {
                        downUrl = siteRoot + "/" + link;
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(downName, info.Id, downUrl);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Fixed: the original returned null here, discarding every collected record whenever
    // the MaxCount early exit was not taken.
    return list;
}
/// <summary>
/// Crawls the bid-result list published at <c>SiteUrl</c> (sgjd.baoan.gov.cn), follows every
/// list entry to its detail page and builds a record via <c>ToolDb.GenBidInfo</c>. When the
/// running text yields no winning bidder, the "MsoNormalTable" tables of the detail page
/// are searched instead.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string listBaseUrl = "http://sgjd.baoan.gov.cn/zbcg/zhbgg_139208/";
    const string defaultUnit = "深圳市宝安区松岗街道办事处";

    IList list = new List<BidInfo>();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager is emitted by a createPageHTML(...) script; the page count is taken from the
    // text between "Code" and the first (replaced) comma. It stays 1 on any failure.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "venycms-page")), true), new TagNameFilter("script")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string pagerText = sNode.ToString().Replace("createPageHTML(", "").Replace(",", "kd").Replace("****", "").Replace("\n", "");
            pagerText = pagerText.GetRegexBegEnd("Code", "kd");
            pageInt = int.Parse(pagerText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Patterns do not depend on the item, so they are built once.
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|总价)(:|:|)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(成交供应商|中标供应商|第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理(或建造师)|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    // The previous pattern "[0-9]+[.]{0,1}[0-9]+" needed at least two digits and therefore
    // returned nothing for single-digit amounts such as "5万".
    Regex regBidMoney = new Regex(@"[0-9]+(?:[.][0-9]+)?");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Page 1 is SiteUrl itself; page n (n > 1) is served as index_{n-1}.html.
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(listBaseUrl + "index_" + (i - 1) + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string endDate = string.Empty, otherType = string.Empty;

            // An <li> without an anchor cannot be followed; skip it instead of failing the crawl.
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = viewList[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = listBaseUrl + aTag.Link.GetRegexBegEnd("./", ".html") + ".html";

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "con")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            // All field patterns run against the same space-free copy of the text.
            string compactCtx = bidCtx.Replace(" ", "");

            string code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
            string bidMoney = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("总价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
            string bidUnit = regBidUnit.Match(compactCtx).Value.Replace("成交供应商", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
            string prjMgr = regprjMgr.Match(compactCtx).Value.Replace("项目经理(或建造师)", "").Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

            if (prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);

            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                // Fallback: flatten each result table into "header:value" lines (first row :
                // second row, column by column) and run the extractors on that text.
                // Every table overwrites the values taken from the previous one.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                if (tableNode != null)
                {
                    for (int t = 0; t < tableNode.Count; t++)
                    {
                        TableTag table = tableNode[t] as TableTag;
                        // Rows[0] is read outside any try below; an empty or foreign node
                        // used to abort the whole crawl here.
                        if (table == null || table.RowCount == 0)
                        {
                            continue;
                        }

                        string ctx = string.Empty;
                        for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                        {
                            try
                            {
                                ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                            }
                            catch
                            {
                                // Missing cell/row: keep what was appended and go on.
                            }
                        }

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("中标供应商");
                        }
                        if (string.IsNullOrWhiteSpace(code))
                        {
                            code = ctx.GetCodeRegex();
                        }
                        bidMoney = ctx.GetMoneyRegex();
                    }
                }
            }

            // A plain number above 100000 is divided by 10000.
            decimal rawAmount;
            if (decimal.TryParse(bidMoney, out rawAmount) && rawAmount > 100000)
            {
                bidMoney = (rawAmount / 10000).ToString();
            }

            // Amount already carries the "万" unit: keep the number in front of it.
            if (!string.IsNullOrEmpty(bidMoney) && bidMoney.Contains("万"))
            {
                bidMoney = regBidMoney.Match(bidMoney.Remove(bidMoney.IndexOf("万")).Trim()).Value;
            }

            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = defaultUnit;
            }

            string msgType = defaultUnit;
            string specType = "建设工程";
            // The bid type is fixed for this source. The former name-based lookup was always
            // overwritten by this value and has been removed.
            string bidType = "小型工程";

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-notice list that starts at <c>SiteUrl</c> (follow-up pages are fetched from
/// the hard-coded xajdb.baoan.gov.cn index URLs), downloads every notice and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager is emitted by a createPageHTML(...) script call; strip everything but the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_page")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(",0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so build them once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章)|采购人)(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page 2 is index_1.html, page 3 is index_2.html, and so on.
                html = this.ToolWebSite.GetHtmlByUrl("http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgs/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_list"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string itemText = viewList[j].ToPlainTextString().Trim();
            beginDate = regDate.Match(itemText).Value;
            // string.Replace throws ArgumentException for an empty search string, so the date is
            // only stripped when one was actually found.
            string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;
            try
            {
                // The anchor markup is looked up after the first "else" in the item's plain text.
                int beg = temp.IndexOf("else"), end = temp.Length;
                temp = temp.Substring(beg, end - beg);
                beg = temp.IndexOf("<a");
                end = temp.IndexOf("/a>");
                temp = temp.Substring(beg, (end - beg) + 3);
                beg = temp.IndexOf(">");
                end = temp.IndexOf("</");
                prjName = temp.Substring(beg + 1, end - beg - 1);
                Parser p = new Parser(new Lexer(temp));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = "http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgs/" + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Items without a usable anchor are skipped.
                continue;
            }

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "TRS_PreAppend")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            // "th" is rewritten to "td" before parsing so header cells are returned as ordinary columns.
            parser = new Parser(new Lexer(HtmlTxt.Replace("th", "td")));
            NodeList dtlTab = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "texttable")));
            TableTag table = (dtlTab != null && dtlTab.Count > 0) ? dtlTab[0] as TableTag : null;
            if (table != null)
            {
                // Flatten the table into "label:value" lines: even columns are labels, odd columns values.
                for (int k = 0; k < table.RowCount; k++)
                {
                    for (int c = 0; c < table.Rows[k].ColumnCount; c++)
                    {
                        string strCtx = table.Rows[k].Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "");
                        if (strCtx == "工程类型")
                        {
                            break;
                        }
                        if (c % 2 == 0)
                        {
                            inviteCtx += strCtx + ":";
                        }
                        else
                        {
                            inviteCtx += strCtx + "\r\n";
                        }
                    }
                }
            }
            else
            {
                inviteCtx = dtl.AsString().Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            }

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            msgType = "深圳市宝安区新安街道办事处";

            // Try the generic address extractor first; the placeholder is the last resort.
            // (Previously the placeholder was assigned first, which made the fallback unreachable.)
            if (string.IsNullOrWhiteSpace(prjAddress))
            {
                prjAddress = inviteCtx.GetAddressRegex();
            }
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }

            // The fallback result used to be discarded; assign it.
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = inviteCtx.GetBuildRegex() ?? string.Empty;
            }
            if (string.IsNullOrWhiteSpace(code))
            {
                code = inviteCtx.GetCodeRegex() ?? string.Empty;
            }
            if (code.Contains(")"))
            {
                code = code.Remove(code.IndexOf(")"));
            }
            buildUnit = buildUnit.Replace("采购人", "");

            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区新安街道办事处";
            }

            // Every notice from this source is filed under the fixed type below; the keyword-based
            // guess from the title was always overwritten and has been removed.
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the bid-result list that starts at <c>SiteUrl</c> (follow-up pages are fetched from the
/// hard-coded zyjy.huizhou.gov.cn list URL), downloads every announcement and turns it into a
/// <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager text contains ".../<total>页"; take what sits between "/" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"/[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so build them once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|总价)(:|:|)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(成交供应商|中标供应商|第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理(或建造师)|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    // Integer part with an optional fraction. The previous pattern required at least two digits,
    // so a single-digit amount such as "5" never matched.
    Regex regBidMoney = new Regex(@"[0-9]+(\.[0-9]+)?");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=a000dc84e53b4dc88e1e05d15d7c90f7&pageNo=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        // The j-th anchor found below the list is paired with the j-th list node; collect the anchors once.
        NodeList anchors = viewList.SearchFor(typeof(ATag), true);

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            beginDate = regDate.Match(viewList[j].ToPlainTextString()).Value;

            // Skip the entry instead of failing the whole crawl when no matching anchor exists.
            if (anchors == null || j >= anchors.Count)
            {
                continue;
            }
            ATag aTag = anchors[j] as ATag;
            if (aTag == null)
            {
                continue;
            }
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://zyjy.huizhou.gov.cn" + aTag.Link;

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            bidCtx = HtmlTxt.ToCtxString();

            // When the page embeds an iframe, the text of the tables it points to is prepended to the
            // announcement text. This is best effort: any failure leaves bidCtx unchanged.
            NodeList ifrm = new Parser(new Lexer(htmDtl)).ExtractAllNodesThatMatch(new TagNameFilter("iframe"));
            IFrameTag frame = (ifrm != null && ifrm.Count > 0) ? ifrm[0] as IFrameTag : null;
            if (frame != null)
            {
                try
                {
                    string htm = this.ToolWebSite.GetHtmlByUrl(frame.GetAttribute("src"), Encoding.Default);
                    NodeList tabNode = new Parser(new Lexer(htm)).ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    string ctx = tabNode.AsHtml().ToCtxString().Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t");
                    bidCtx = ctx + bidCtx;
                }
                catch
                {
                }
            }

            // All field patterns run against the same space-stripped copy of the text.
            string compactCtx = bidCtx.Replace(" ", "");
            code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
            bidMoney = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("总价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
            bidUnit = regBidUnit.Match(compactCtx).Value.Replace("成交供应商", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
            prjMgr = regprjMgr.Match(compactCtx).Value.Replace("项目经理(或建造师)", "").Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

            if (bidMoney.Contains("万"))
            {
                // The amount is already expressed in units of 万: keep the number in front of the unit.
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Otherwise divide by 10000; values below 0.1 and unparsable text are stored as "0".
                decimal amount;
                if (decimal.TryParse(regBidMoney.Match(bidMoney).Value, out amount))
                {
                    decimal scaled = amount / 10000;
                    bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                }
                else
                {
                    bidMoney = "0";
                }
            }

            if (prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }
            bidUnit = ToolHtml.GetStringTemp(bidUnit).Replace(";", "");
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);
            msgType = "惠州市公共资源交易中心";
            specType = "建设工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "惠州市公共资源交易中心";
            }
            bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the bid-result list at <c>SiteUrl</c> (follow-up pages are requested by appending
/// <c>&amp;page=N</c>), downloads every announcement from www.cajsw.gov.cn and turns it into a
/// <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager text contains ".../<total>页"; take what sits between "/" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "middle")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString();
            Regex reg = new Regex(@"/[^页]+页");
            string page = reg.Match(temp).Value.Replace("/", "").Replace("页", "");
            pageInt = Convert.ToInt32(page);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl) + "&page=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "100%"))), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            // The selector matches every table nested in a full-width cell, so tables that do not
            // have the expected "title | date" first row are skipped instead of aborting the crawl.
            TableTag table = nodeList[j] as TableTag;
            if (table == null || table.RowCount == 0)
            {
                continue;
            }
            TableRow tr = table.Rows[0];
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            prjName = tr.Columns[0].ToNodePlainString();
            bidType = prjName.GetInviteBidType();
            beginDate = tr.Columns[1].ToPlainTextString();
            InfoUrl = "http://www.cajsw.gov.cn/" + tr.Columns[0].GetATagHref(2);

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                htlDtl = htlDtl.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "fontzoom")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlList.ToHtml();
            bidCtx = HtmlTxt.ToCtxString();
            string ctx = bidCtx.ToNodeString();

            bidUnit = ctx.GetRegexBegEnd("中标候选人为", ",");
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);

            // NOTE(review): GetMoney is applied twice, exactly as before; confirm the helper is
            // idempotent, otherwise the amount is converted twice.
            string money = ctx.GetRegexBegEnd("投标报价", "元").GetMoney();
            bidMoney = money.GetMoney();

            code = bidCtx.GetCodeRegex();
            buildUnit = bidCtx.GetBuildRegex();
            msgType = "潮州市潮安县住房和城乡建设局";
            specType = "建设工程";

            BidInfo info = ToolDb.GenBidInfo("广东省", "潮州市区", "潮安县", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender-notice list that starts at <c>SiteUrl</c> (follow-up pages are fetched from the
/// hard-coded zyjy.huizhou.gov.cn list URL), downloads every notice and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager text contains ".../<total>页"; take what sits between "/" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"/[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so build them once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=54f6d9f3580843d59b9dd64918e7ae4f&pageNo=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        // The j-th anchor found below the list is paired with the j-th list node; collect the anchors once.
        NodeList anchors = viewList.SearchFor(typeof(ATag), true);

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string itemText = viewList[j].ToPlainTextString();
            beginDate = regDate.Match(itemText).Value;
            prjName = itemText.Replace("\r", "").Replace("\n", "");
            // string.Replace throws ArgumentException for an empty search string, so the date is
            // only stripped from the title when one was actually found.
            if (beginDate.Length > 0)
            {
                prjName = prjName.Replace(beginDate, "");
            }

            // Skip the entry instead of failing the whole crawl when no matching anchor exists.
            if (anchors == null || j >= anchors.Count)
            {
                continue;
            }
            ATag aTag = anchors[j] as ATag;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://zyjy.huizhou.gov.cn" + aTag.Link;

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            // Drop scripts, then all tags, then collapse the resulting runs of blank lines.
            HtmlTxt = Regex.Replace(dtl.ToHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "")
                .Replace(" ", "").Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\t", "")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            if (buildUnit.Contains("资质"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("资质"));
            }
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            msgType = "惠州市公共资源交易中心";
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteTypes(prjName);
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            // A code longer than 50 bytes is discarded rather than truncated.
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender-notice list that starts at <c>SiteUrl</c> (follow-up pages are fetched from
/// the hard-coded www.shajing.gov.cn index URLs), downloads every notice and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager is emitted by a createPageHTML(...) script call; strip everything but the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so build them once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page 2 is index_1.html, page 3 is index_2.html, and so on.
                html = this.ToolWebSite.GetHtmlByUrl("http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string itemText = viewList[j].ToPlainTextString().Trim();
            beginDate = regDate.Match(itemText).Value;
            // string.Replace throws ArgumentException for an empty search string, so the date is
            // only stripped when one was actually found.
            string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;
            try
            {
                // The anchor markup is looked up after the first "else{" in the item's plain text.
                int beg = temp.IndexOf("else{"), end = temp.Length;
                temp = temp.Substring(beg, end - beg);
                beg = temp.IndexOf("<a");
                end = temp.IndexOf("/a>");
                temp = temp.Substring(beg, (end - beg) + 3);
                beg = temp.IndexOf(">");
                end = temp.IndexOf("</");
                prjName = temp.Substring(beg + 1, end - beg - 1);
                Parser p = new Parser(new Lexer(temp));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/" + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Items without a usable anchor are skipped.
                continue;
            }

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            // Drop scripts, then all tags, then collapse doubled line breaks.
            HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            msgType = "深圳市宝安区沙井街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区沙井街道办事处";
            }

            // Every notice from this source is filed under the fixed type below; the keyword-based
            // guess from the title was always overwritten and has been removed.
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "宝安区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender-notice list that starts at <c>SiteUrl</c> (follow-up pages are fetched from
/// the hard-coded glbsc.szlhxq.gov.cn URLs), downloads every notice and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The records collected so far; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // The total page count is the text between "/" and "跳转" in the first pager cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the item, so build them once.
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs/15158-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            // Skip the entry instead of failing the whole crawl when the node is not a table
            // or carries no anchor.
            TableTag table = viewList[j] as TableTag;
            if (table == null)
            {
                continue;
            }
            beginDate = table.ToPlainTextString().GetDateRegex();
            ATag aTag = table.GetATag();
            if (aTag == null)
            {
                continue;
            }
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            inviteCtx = HtmlTxt.ToCtxString();
            // Remove any remaining tags and collapse doubled line breaks.
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace(")", "").Replace(")", "").Trim();

            msgType = "深圳市龙华新区观澜街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区观澜街道办事处";
            }

            // Every notice from this source is filed under the fixed type below; the type derived
            // from the title was always overwritten and is no longer computed.
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c> and builds one record per list entry:
/// a <c>BidInfo</c> when the link text contains "中标" or "结果", otherwise an
/// <c>InviteInfo</c>. Detail pages are fetched from www.zqgcjy.com.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The text between "共" and "页" is parsed as the page count; one page is assumed when that fails.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            pageInt = int.Parse(pageNode.AsString().GetRegexBegEnd("共", "页"));
        }
        catch
        {
            // Keep the default of a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position6")), true), new TagNameFilter("li")));
        if (pageNode == null || pageNode.Count == 0)
        {
            continue;
        }

        // The first three <li> items are skipped.
        for (int j = 3; j < pageNode.Count; j++)
        {
            ATag aTag = pageNode[j].GetATag();
            if (aTag == null)
            {
                // A list item without a link cannot be followed.
                continue;
            }

            string psName = aTag.LinkText;
            bool isBidResult = psName.Contains("中标") || psName.Contains("结果");
            string prjName = aTag.GetAttribute("title");
            string InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch (Exception)
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList dtnode = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string content = HtmlTxt.ToCtxString();

            if (isBidResult)
            {
                string beginDate = content.GetDateRegex();
                string code = content.GetCodeRegex();
                string bidMoney = content.GetMoneyRegex();
                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                {
                    bidMoney = content.GetMoneyRegex(null, true);
                }
                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                {
                    bidMoney = content.GetRegex("总额").GetMoney();
                }
                string prjMgr = content.GetMgrRegex();
                string bidUnit = content.GetBidRegex();
                // NOTE(review): bidDate is extracted but GenBidInfo below receives beginDate in both
                // date positions - confirm which value is intended.
                string bidDate = content.GetTimeRegex();
                string buildUnit = content.GetBuildRegex();

                // Second pass: scan the tables inside the detail markup for values the text search missed.
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null)
                {
                    for (int t = 0; t < tableNode.Count; t++)
                    {
                        TableTag tag = tableNode[t] as TableTag;
                        if (tag == null)
                        {
                            continue;
                        }
                        string classStr = tag.GetAttribute("class");
                        if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table1"))
                        {
                            // The wrapper table itself was already covered by the text search above.
                            continue;
                        }

                        // Pair cells as "label:value" lines: odd cells are labels, even cells are values.
                        StringBuilder pairs = new StringBuilder();
                        for (int r = 0; r < tag.RowCount; r++)
                        {
                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                            {
                                string cell = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                pairs.Append(cell).Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                            }
                        }
                        string ctx = pairs.ToString();

                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一候选人");
                        }
                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetMgrRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                        }

                        if (string.IsNullOrEmpty(bidUnit) || !bidUnit.Contains("公司"))
                        {
                            // Alternative layout: row 4 holds the labels and row 5 the values, column by column.
                            // Any failure (e.g. a table with fewer rows) leaves the current values untouched.
                            try
                            {
                                StringBuilder columnPairs = new StringBuilder();
                                for (int col = 1; col < tag.Rows[4].ColumnCount; col++)
                                {
                                    columnPairs.Append(tag.Rows[4].Columns[col].ToNodePlainString().GetReplace(":,:")).Append(":");
                                    columnPairs.Append(tag.Rows[5].Columns[col].ToNodePlainString().GetReplace(":,:")).Append("\r\n");
                                }
                                ctx = columnPairs.ToString();

                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                }
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetMgrRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                }
                            }
                            catch
                            {
                            }
                        }
                    }
                }

                string msgType = "肇庆工程交易中心";
                string bidType = "建设工程";
                string specType = "建设工程";
                BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, content, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                string buildUnit = content.GetBidUnitDel().GetBuildRegex();
                string beginDate = content.GetDateRegex();
                string prjAddress = ToolHtml.GetRegexString(content, ToolHtml.AddressRegex);
                string code = content.GetReplace(" ").GetCodeRegex().GetCodeDel();
                prjAddress = ToolHtml.GetSubString(prjAddress, 150);

                // Parse the detail markup itself; the listing-page parser does not contain these tables.
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    StringBuilder pairs = new StringBuilder();
                    TableTag tag = tableNode[0] as TableTag;
                    if (tag != null)
                    {
                        for (int r = 0; r < tag.RowCount; r++)
                        {
                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                            {
                                string cell = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                pairs.Append(cell).Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                            }
                        }
                    }
                    string ctx = pairs.ToString();

                    if (string.IsNullOrEmpty(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                }

                string msgType = "肇庆工程交易中心";
                string specType = "建设工程";
                string inviteType = ToolHtml.GetInviteTypes(prjName);
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, content, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
                list.Add(info);
                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    // Return what was collected; a null here would throw away a complete crawl.
    return list;
}
/// <summary>
/// Reads the announcement list starting at <c>SiteUrl</c> (further pages and detail pages come
/// from www.gy-center.net) and turns each list item except the last one on a page into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends once <c>MaxCount</c> records are in the list.</param>
/// <returns>The records gathered so far; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Page total: the text matched by "/...页" in the pager div, with "/" and "页" stripped.
    int pageTotal = 1;
    Parser listParser = new Parser(new Lexer(listHtml));
    NodeList pager = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
    if (pager != null && pager.Count > 0)
    {
        string pagerText = pager.AsString();
        try
        {
            Regex totalPattern = new Regex(@"/[^页]+页");
            string digits = totalPattern.Match(pagerText).Value.Replace("页", "").Replace("/", "");
            pageTotal = Convert.ToInt32(digits);
        }
        catch
        {
            pageTotal = 1;
        }
    }

    for (int page = 1; page <= pageTotal; page++)
    {
        // Page 1 is already in hand; later pages are requested by number.
        if (page > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=76&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + page.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        listParser = new Parser(new Lexer(listHtml));
        NodeList rows = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (rows == null || rows.Count == 0)
        {
            continue;
        }

        // The final <li> of each page is not processed.
        int stopBefore = rows.Count - 1;
        for (int row = 0; row < stopBefore; row++)
        {
            string rowText = rows[row].ToPlainTextString();
            string rowMarkup = rows[row].ToHtml();

            string prjName = ToolHtml.GetHtmlAtagValue("title", rowMarkup);
            string beginDate = ToolHtml.GetRegexDateTime(rowText);
            string InfoUrl = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", rowMarkup);

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                detailHtml = Regex.Replace(detailHtml, "(<script)[\\s\\S]*?(</script>)", "");
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList body = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
            if (body == null || body.Count == 0)
            {
                continue;
            }

            string HtmlTxt = body.ToHtml();

            // Strip tags, then normalise whitespace and line endings.
            string untagged = Regex.Replace(HtmlTxt, "<[^>]*>", "");
            string inviteCtx = untagged.Replace(" ", "").Replace(" ", "").Replace("\t\t", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
            string buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
            string code = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);

            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            code = ToolHtml.GetSubString(code, 50);

            // Placeholders for anything the text search could not find.
            if (string.IsNullOrEmpty(code))
            {
                code = "见招标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见招标信息";
            }
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "工网在线";
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, "工网在线", inviteType, "其他", string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged result table (<c>table#Datagrid</c>) reachable from <c>SiteUrl</c>, follows each
/// row's link on www.xixiang.gov.cn and builds a <c>BidInfo</c> from the detail content
/// (<c>span#Lblcontent</c>).
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the text between "条," and "页" in the pager cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "50%")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"条,[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("条,", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns do not depend on the row being processed, so they are built once.
    Regex regDate = new Regex(@"\d{4}/\d{1,2}/\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|报价)(:|:|)[^\r\n]+\r\n");
    Regex regBidUnit = new Regex(@"(第一候选人|投标供应商名称|中标候选人|中标供应商|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
    // Digits with an optional fractional part. A single digit such as "5" must match too,
    // otherwise an amount like "5万" would be reduced to an empty string.
    Regex regBidMoney = new Regex(@"[0-9]+([.][0-9]+)?");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&p=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Datagrid")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }
        TableTag tab = viewList[0] as TableTag;
        if (tab == null)
        {
            continue;
        }

        for (int j = 0; j < tab.RowCount; j++)
        {
            TableRow tr = tab.Rows[j];

            // Rows without a title cell, a date cell or a link carry no record; skipping them
            // keeps a malformed row from aborting the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            if (anchors == null || anchors.Count == 0)
            {
                continue;
            }
            ATag aTag = anchors[0] as ATag;
            if (aTag == null)
            {
                continue;
            }

            string prjName = tr.Columns[1].ToPlainTextString().Replace("\r", "").Replace("\t", "").Replace("\n", "");
            string beginDate = regDate.Match(tr.Columns[2].ToPlainTextString()).Value;
            string InfoUrl = "http://www.xixiang.gov.cn/" + aTag.Link;

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(htmDtl));
            NodeList dtl = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();

            // Plain-text rendering of the detail markup. It is the stored context of the record and
            // also the text searched for field values when the detail contains no table.
            string lowered = HtmlTxt.ToLower().Replace("<br/>", "\r\n").Replace("<br>", "\r\n");
            string plainCtx = Regex.Replace(lowered, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n") + "\r\n";
            plainCtx = plainCtx.Replace(" ", "");

            // Tables are looked up inside the content span when it can be re-extracted,
            // otherwise in the whole detail markup. <th> is rewritten to <td> before parsing.
            Parser spanParser = new Parser(new Lexer(HtmlTxt));
            NodeList span = spanParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
            string tableSource = (span != null && span.Count > 0) ? span.AsHtml() : HtmlTxt;
            Parser tableParser = new Parser(new Lexer(tableSource.ToLower().Replace("th", "td")));
            NodeList dtlTab = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));

            string searchText;
            if (dtlTab != null && dtlTab.Count > 0)
            {
                // Only the first table is used: each cell of its first row is paired with the cell
                // below it as a "header:value" line.
                StringBuilder pairs = new StringBuilder();
                TableTag table = dtlTab[0] as TableTag;
                if (table != null && table.RowCount > 1)
                {
                    TableRow headerRow = table.Rows[0];
                    TableRow valueRow = table.Rows[1];
                    for (int c = 0; c < headerRow.ColumnCount; c++)
                    {
                        string header = headerRow.Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "").Replace("\r\n", "").Replace("\n", "");
                        // A value row shorter than the header row yields an empty value instead of an exception.
                        string value = c < valueRow.ColumnCount
                            ? valueRow.Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "").Replace("\r\n", "").Replace("\n", "")
                            : string.Empty;
                        pairs.Append(header).Append(":").Append(value).Append("\r\n");
                    }
                }
                searchText = pairs.ToString().Replace("\n", "").Replace("\r\n\r\n", "\r\n").Replace("\r", "\r\n") + "\r\n";
                searchText = searchText.Replace(" ", "");
            }
            else
            {
                searchText = plainCtx;
            }

            string code = regPrjCode.Match(searchText).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            string buildUnit = regBuidUnit.Match(searchText).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
            string bidMoney = regMoney.Match(searchText).Value.Replace("报价", "").Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace("¥", "").Replace(",", "").Trim();
            string bidUnit = regBidUnit.Match(searchText).Value.Replace("投标供应商名称", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
            string prjMgr = regprjMgr.Match(searchText).Value.Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

            if (bidMoney.Contains("万"))
            {
                // Amount is already expressed in units of 10,000: keep the number only.
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Otherwise divide by 10,000; anything below 0.1 (or unparsable) is reported as "0".
                try
                {
                    decimal scaled = decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000;
                    bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                }
                catch (Exception)
                {
                    bidMoney = "0";
                }
            }

            if (prjMgr.Contains("资格"))
            {
                // Drop everything from "资格" onwards.
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            prjMgr = ToolHtml.GetSubString(prjMgr, 50);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区西乡街道办事处";
            }

            string msgType = "深圳市宝安区西乡街道办事处";
            string specType = "建设工程";
            string bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, plainCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-announcement listing of www.nmgztb.com, follows the first link of every
/// listing row that has at least two cells, and builds an <c>InviteInfo</c> from the detail page.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Get the page count from span#totalpage; one page is assumed when it is missing or not numeric.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "totalpage")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            pageInt = Convert.ToInt32(pageNode[0].ToNodePlainString());
        }
        catch
        {
        }
    }

    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regexUpdated = new Regex(@"更新时间:\d{4}年\d{1,2}月\d{1,2}日");
    // NOTE(review): [^更新时间] is a negated character set, not "anything up to 更新时间";
    // the pattern is kept unchanged here - confirm whether that was intended.
    Regex regexCTX = new Regex(@"作者:[^更新时间]+更新时间:\d{4}年\d{1,2}月\d{1,2}日");
    Regex regexBuildUnit = new Regex("(招标人|招标单位|招标采购单位):[^\r\n]+[\r\n]{1}");
    Regex regexCode = new Regex("(招标编号):[^\r\n]+[\r\n]{1}");

    // The page counter runs downwards. The first pass uses the page already downloaded from
    // SiteUrl; every later pass requests index_{i-1}.htm.
    for (int i = pageInt; i >= 1; i--)
    {
        if (i < pageInt)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.nmgztb.com/Html/gongchengxinxi/zhaobiaogonggao/index_" + (i - 1) + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
        if (sNodes == null || sNodes.Count == 0)
        {
            continue;
        }
        TableTag table = sNodes[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int t = 0; t < table.RowCount; t++)
        {
            TableRow tr = table.Rows[t];
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            NodeList nodeList = tr.SearchFor(typeof(ATag), true);
            if (nodeList == null || nodeList.Count == 0)
            {
                continue;
            }
            ATag aTag = nodeList[0] as ATag;
            if (aTag == null)
            {
                continue;
            }

            string InfoUrl = "http://www.nmgztb.com" + aTag.Link;
            string prjName = aTag.GetAttribute("title");

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).ToLower();
            }
            catch (Exception)
            {
                continue;
            }
            htmldtl = regexHtml.Replace(htmldtl, "");

            // Publication date: "更新时间:yyyy年M月d日" inside div.link_con_con, rewritten as yyyy-M-d.
            string beginDate = string.Empty;
            Parser parserdtl = new Parser(new Lexer(htmldtl));
            NodeList nodesDtl = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "link_con_con")));
            if (nodesDtl != null && nodesDtl.Count > 0)
            {
                // An unsuccessful match has an empty Value, which leaves beginDate empty.
                Match updated = regexUpdated.Match(nodesDtl.AsString());
                beginDate = updated.Value.Replace("更新时间:", "").Replace("年", "-").Replace("月", "-").Replace("日", "").Trim();
            }

            // The parser is rewound before the second extraction over the same document.
            parserdtl.Reset();
            nodesDtl = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "oo")));
            if (nodesDtl == null || nodesDtl.Count == 0)
            {
                // No content block on the detail page: nothing to record.
                continue;
            }

            string HtmlTxt = nodesDtl.AsHtml();
            string str = nodesDtl.AsString().Replace(" ", "").Replace(" ", "");

            // Remove the "作者:...更新时间:..." header when present. string.Replace throws
            // ArgumentException for an empty search string, so the replacement must only run
            // when the header was actually found.
            Match header = regexCTX.Match(str);
            if (header.Success)
            {
                str = str.Replace(header.Value, "");
            }

            // Everything from "上一篇:" onwards is cut off.
            int previousLink = str.IndexOf("上一篇:");
            string content = previousLink > -1 ? str.Substring(0, previousLink) : str;

            string buildUnit = string.Empty;
            if (content.Contains("招标人:") || content.Contains("招标单位:") || content.Contains("招标采购单位:"))
            {
                buildUnit = regexBuildUnit.Match(content).Value.Replace("招标人:", "").Replace("招标单位:", "").Replace("招标采购单位:", "").Trim();
            }

            string code = string.Empty;
            if (content.Contains("招标编号:"))
            {
                code = regexCode.Match(content).Value.Replace("招标编号:", "").ToUpper().Trim();
                if (code.Length >= 50)
                {
                    // Values this long are discarded rather than truncated.
                    code = "";
                }
            }

            string inviteType = ToolHtml.GetInviteTypes(prjName);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            // NOTE(review): the address computed here is not passed on - GenInviteInfo below
            // receives "" in the address position. Confirm which is intended.
            string prjAddress = ToolHtml.GetAddress(string.Empty);
            code = ToolHtml.GetSubString(code, 50);

            InviteInfo info = ToolDb.GenInviteInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, "", buildUnit, beginDate, string.Empty, content, string.Empty, "内蒙古自治区建设工程招标投标服务中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Posts the search form at <c>SiteUrl</c> for the last 90 days, walks the result pages and builds an
/// <c>InviteInfo</c> for every entry whose fourth <c>span.span_on</c> contains "招标/资审公告".
/// When the detail markup yields a link under <c>div.detail_content</c>, a <c>BaseAttach</c> is
/// added to <c>AttachList</c> for it.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first form request cannot be built.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    // Search window: from 90 days ago up to today.
    DateTime windowEnd = DateTime.Today;
    DateTime windowStart = windowEnd.AddDays(-90);
    string fromText = windowStart.ToString();
    string toText = windowEnd.ToString();
    string[] formKeys = new string[] { "TIMEBEGIN_SHOW", "TIMEEND_SHOW", "TIMEBEGIN", "TIMEEND", "DEAL_TIME", "DEAL_CLASSIFY", "DEAL_STAGE", "DEAL_PROVINCE", "DEAL_CITY", "DEAL_PLATFORM", "DEAL_TRADE", "isShowAll", "PAGENUMBER", "FINDTXT" };

    try
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, new string[] { fromText, toText, fromText, toText, "02", "01", "0101", "0", "0", "0", "0", "1", "1", "" });
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
        }
        catch
        {
            // A failed first request leaves html empty, so no entries are found below.
        }
    }
    catch
    {
        return list;
    }

    // Page count: the text between "共" and "页" in the paging spans.
    Parser pagerParser = new Parser(new Lexer(html));
    NodeList pageNode = pagerParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")), true), new TagNameFilter("span")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            pageInt = int.Parse(pageNode.AsString().GetRegexBegEnd("共", "页"));
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Same form as the first request, with PAGENUMBER set to the page being fetched.
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, new string[] { fromText, toText, fromText, toText, "02", "01", "0101", "0", "0", "0", "0", "1", i.ToString(), "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        Parser listParser = new Parser(new Lexer(html));
        NodeList listNode = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "publicont")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // The entry's span.span_on items are read by position: index 0 is handed to GetPrivoce,
            // index 3 is the announcement kind. Entries with fewer spans cannot be classified.
            Parser entryParser = new Parser(new Lexer(node.ToHtml()));
            NodeList txtNode = entryParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "span_on")));
            if (txtNode == null || txtNode.Count < 4)
            {
                continue;
            }
            string sehu = txtNode[0].ToNodePlainString();
            string nlse = txtNode[3].ToNodePlainString();
            if (!nlse.Contains("招标/资审公告"))
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link.GetReplace("amp;");

            string htmlDtl;
            try
            {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
            }
            catch
            {
                continue;
            }

            // When the page carries a div#div_0101, the real content sits behind the URL embedded in
            // its link's onclick handler. NOTE(review): failures in this step propagate to the caller
            // (unchanged behaviour) instead of skipping the entry - confirm that this is wanted.
            Parser detailParser = new Parser(new Lexer(htmlDtl));
            NodeList zsList = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "div_0101")));
            if (zsList != null && zsList.Count > 0)
            {
                ATag aTagzs = zsList[0].GetATag();
                string onclick = aTagzs.GetAttribute("onclick");
                string urls = onclick.GetReplace("showdetail(this, '0101','", "").GetReplace("')", "").Replace(",", "").Replace(")", "");
                htmlDtl = this.ToolWebSite.GetHtmlByUrl("http://www.ggzy.gov.cn/information" + urls, Encoding.UTF8);
                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
            }

            Parser contentParser = new Parser(new Lexer(htmlDtl));
            NodeList dtlList = contentParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.AsHtml();
            string inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();

            // Append the original-source link (first p.p_o) to the text when there is one.
            string ctxUrl = string.Empty;
            try
            {
                Parser parurl = new Parser(new Lexer(HtmlTxt));
                NodeList zsUrl = parurl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "p_o")));
                if (zsUrl != null && zsUrl.Count > 0)
                {
                    ATag aTagurl = zsUrl[0].GetATag();
                    if (aTagurl != null)
                    {
                        ctxUrl = "原文链接地址 : " + aTagurl.Link;
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: the record is still produced without the link line.
            }
            inviteCtx = inviteCtx + ctxUrl;

            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("招标人");
            }
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);

            if (string.IsNullOrWhiteSpace(code))
            {
                // Fallback: read the first table as "first cell:second cell" lines and search those.
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    string ctx = string.Empty;
                    TableTag bidTable = bidNode[0] as TableTag;
                    try
                    {
                        for (int r = 0; r < bidTable.RowCount; r++)
                        {
                            ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                            ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                        }
                    }
                    catch
                    {
                        // A row with fewer than two cells ends the scan; what was read so far is still used.
                    }
                    if (string.IsNullOrWhiteSpace(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrWhiteSpace(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }
                    if (string.IsNullOrWhiteSpace(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                }
            }

            string msgType = "国家信息中心";
            string specType = "建设工程";
            string inviteType = "建设工程";
            string[] provs = GetPrivoce(sehu);
            InviteInfo info = ToolDb.GenInviteInfo(provs[0], provs[1], "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);

            // Optional attachment: the link of the first node directly under div.detail_content.
            try
            {
                Parser attachParser = new Parser(new Lexer(HtmlTxt));
                NodeList nodeFm = attachParser.ExtractAllNodesThatMatch((new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_content")))));
                if (nodeFm != null && nodeFm.Count > 0)
                {
                    ATag attachLink = nodeFm[0].GetATag();
                    if (attachLink != null)
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach("内容(点击下载)", info.Id, attachLink.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }
            catch
            {
                // Attachments are best effort; the record itself has already been added.
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated bid-result list at <c>SiteUrl</c>, downloads the detail page
/// linked from every list row and converts its detail table into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is read from the pager text ("共 N 项 ... M 页").
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "input-group-addon")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        try
        {
            string reTemp = tdNodes.AsString().GetRegexBegEnd("共", "项");
            string pageTemp = tdNodes.AsString().GetRegexBegEnd("项", "页").GetReplace("共,项,页," + reTemp + ",,");
            pageInt = int.Parse(pageTemp);
        }
        catch (Exception)
        {
            // Pager text could not be parsed: fall back to crawling the first page only.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // The "pi" query parameter is zero-based: page i is requested as pi = i - 1.
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?pi=" + (i - 1), Encoding.UTF8);
            }
            catch
            {
                // A list page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag tableRow = (TableTag)nodeList[0];

        // Row 0 is not processed (presumably the header row).
        for (int j = 1; j < tableRow.RowCount; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjAddress = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                   HtmlTxt = string.Empty;

            TableRow tr = tableRow.Rows[j];
            if (tr.ColumnCount < 4)
            {
                // Columns 1..3 are read below; a shorter row would otherwise throw and abort the whole crawl.
                continue;
            }

            beginDate = tr.Columns[3].ToPlainTextString().Trim();
            // NOTE(review): the argument here was three bare quote characters, which is not a valid
            // C# string literal; restored as the HTML entity for a double quote - confirm.
            prjName = tr.Columns[1].ToPlainTextString().Trim().GetReplace("&quot;");
            buildUnit = tr.Columns[2].ToPlainTextString().Trim();
            InfoUrl = "http://www.bajsjy.com/" + tr.Columns[1].GetATagHref();

            string htmldetail = string.Empty;
            try
            {
                // <th> cells are rewritten to <td> so that header cells are seen as ordinary columns.
                // The &nbsp; entity is stripped rather than literal spaces: deleting every space from raw
                // markup would fuse tag names with their attributes and the inside_table lookup below
                // could never match. NOTE(review): confirm "&nbsp;" was the intended literal.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace("<th", "<td").Replace("</th>", "</td>").Replace("&nbsp;", "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            TableTag tabledetail = (TableTag)dtnode[0];
            for (int r = 0; r < tabledetail.RowCount; r++)
            {
                TableRow trdetail = tabledetail.Rows[r];
                try
                {
                    // Cells are consumed in (label, value) pairs and emitted as "label:value" lines.
                    // A row with an unpaired trailing cell raises on Columns[c + 1] and triggers the fallback.
                    for (int c = 0; c < trdetail.ColumnCount; c += 2)
                    {
                        string tr1 = trdetail.Columns[c].ToPlainTextString().Trim();
                        string tr2 = trdetail.Columns[c + 1].ToPlainTextString().Trim();
                        bidCtx += tr1 + ":" + tr2 + "\r\n";
                    }
                }
                catch
                {
                    // Fallback: replace what was collected so far with the plain text of the whole table.
                    bidCtx = HtmlTxt.ToCtxString();
                }
            }

            // NOTE(review): prjAddress is computed but not passed to GenBidInfo below - confirm whether an argument is missing.
            Regex regPrjAdd = new Regex(@"(工程地点|工程地址):[^\r\n]+\r\n");
            prjAddress = regPrjAdd.Match(bidCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = string.Empty;
            }
            prjAddress = ToolHtml.GetSubString(prjAddress, 50);

            msgType = "深圳市建设工程交易中心宝安分中心";
            specType = "建设工程";

            Regex regMoney = new Regex(@"(中标价):[^\r\n]+\r\n");
            bidMoney = regMoney.Match(bidCtx).Value.Replace("金额", "").Replace("中标价", "").Replace(":", "").Replace(":", "").Replace("/", "").Replace(",", "").Trim();
            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            string moneyDigits = regBidMoney.Match(bidMoney).Value;
            if (!string.IsNullOrEmpty(moneyDigits))
            {
                if (bidMoney.Contains("万元") || bidMoney.Contains("万美元") || bidMoney.Contains("万"))
                {
                    // Amount is already expressed in units of 10,000.
                    bidMoney = moneyDigits;
                }
                else
                {
                    try
                    {
                        // Convert to units of 10,000; amounts below 0.1 are stored as "0".
                        bidMoney = (decimal.Parse(moneyDigits) / 10000).ToString();
                        if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                        {
                            bidMoney = "0";
                        }
                    }
                    catch (Exception)
                    {
                        bidMoney = "0";
                    }
                }
            }

            Regex regBidUnit = new Regex(@"(中标人|中标单位):[^\r\n]+\r\n");
            // Both labels are removed together with their colon (the second one used to leave the colon behind).
            bidUnit = regBidUnit.Match(bidCtx).Value.Replace("中标人:", "").Replace("中标单位:", "").Trim();
            // Length-limit through the same helper used for the other fields. The previous
            // Substring(0, 150) was guarded by a byte count and threw for multi-byte names
            // shorter than 150 characters.
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);

            Regex regprjMgr = new Regex(@"(项目经理):[^\r\n]+\r\n");
            prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目经理:", "").Trim();
            if (string.IsNullOrEmpty(prjMgr))
            {
                prjMgr = string.Empty;
            }
            prjMgr = ToolHtml.GetSubString(prjMgr, 30);

            // Map the free-text project type onto a category; when several keywords occur the last match wins.
            Regex regOtherType = new Regex(@"(工程类型):[^\r\n]+\r\n");
            string oType = regOtherType.Match(bidCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建")) { otherType = "房建及工业民用建筑"; }
            if (oType.Contains("市政")) { otherType = "市政工程"; }
            if (oType.Contains("园林绿化")) { otherType = "园林绿化工程"; }
            if (oType.Contains("装饰装修")) { otherType = "装饰装修工程"; }
            if (oType.Contains("电力")) { otherType = "电力工程"; }
            if (oType.Contains("水利")) { otherType = "水利工程"; }
            if (oType.Contains("环保")) { otherType = "环保工程"; }
            otherType = ToolHtml.GetSubString(otherType, 50);
            oType = ToolHtml.GetSubString(oType, 50);

            bidType = ToolHtml.GetInviteTypes(prjName);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);

            BidInfo info;
            try
            {
                info = ToolDb.GenBidInfo("广东省", "深圳宝安区工程", "宝安区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, beginDate, beginDate, HtmlTxt);
            }
            catch
            {
                Logger.Error("出错啦");
                // Do not add a null entry to the result list when the record could not be built.
                continue;
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated announcement list that starts at <c>SiteUrl</c>, downloads every
/// announcement's detail page, builds a <c>BidInfo</c> record from it and registers the
/// attachment links found in the detail content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is taken from the "/N页" fragment of the pager block.
    Parser parser = new Parser(new Lexer(html));
    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
    if (noList != null && noList.Count > 0)
    {
        string pagerText = noList.AsString();
        try
        {
            Regex reg = new Regex(@"/[^页]+页");
            string result = reg.Match(pagerText).Value.Replace("页", "").Replace("/", "");
            pageInt = Convert.ToInt32(result);
        }
        catch
        {
            // Pager text could not be parsed: crawl the first page only.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=97&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                // A list page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (dtlList == null || dtlList.Count == 0)
        {
            continue;
        }

        // The last <li> is left out by the loop bound (presumably not an announcement entry - confirm).
        for (int j = 0; j < dtlList.Count - 1; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjAddress = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                   HtmlTxt = string.Empty;

            string temp = dtlList[j].ToPlainTextString();
            string tempHtl = dtlList[j].ToHtml();
            prjName = ToolHtml.GetHtmlAtagValue("title", tempHtl);
            beginDate = ToolHtml.GetRegexDateTime(temp);
            InfoUrl = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", tempHtl);

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                htlDtl = System.Text.RegularExpressions.Regex.Replace(htlDtl, "(<script)[\\s\\S]*?(</script>)", "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList htlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
            if (htlList == null || htlList.Count == 0)
            {
                continue;
            }

            HtmlTxt = htlList.ToHtml();
            bidCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t\t", "").Replace("\r\r", "\r").Replace("\n\n", "\n");
            bidType = ToolHtml.GetInviteTypes(prjName);

            // Build "header:value" lines from the first two rows of the first MsoNormalTable.
            // They serve as a secondary source for the winning bidder below.
            string bidStr = string.Empty;
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList bidList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
            if (bidList != null && bidList.Count > 0)
            {
                try
                {
                    TableTag tab = bidList[0] as TableTag;
                    if (tab.RowCount > 1)
                    {
                        // At most the first seven columns are paired; a table with fewer than
                        // two header columns contributes nothing.
                        int pairCount = tab.Rows[0].ColumnCount;
                        if (pairCount > 7)
                        {
                            pairCount = 7;
                        }
                        if (pairCount > 1)
                        {
                            for (int c = 0; c < pairCount; c++)
                            {
                                bidStr += tab.Rows[0].Columns[c].ToPlainTextString().ToNodeString() + ":" + tab.Rows[1].Columns[c].ToPlainTextString().ToNodeString() + "\r\n";
                            }
                        }
                    }
                }
                catch
                {
                    // Best effort: an irregular table (or a non-table node) keeps whatever pairs were built so far.
                }
            }

            buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex);
            prjAddress = ToolHtml.GetRegexString(bidCtx, ToolHtml.AddressRegex);
            code = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex);

            // Winning bidder: body text first, then the table pairs, then the "确认 ... 为" phrase.
            bidUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex);
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ToolHtml.GetRegexString(bidStr.Replace(" ", ""), ToolHtml.BidRegex, false);
            }
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("确认", "为");
            }

            bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex);
            bidMoney = ToolHtml.GetRegexMoney(bidMoney);
            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetRegexBegEnd("¥", "元").GetMoney();
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            code = ToolHtml.GetSubString(code, 50);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            buildUnit = ToolHtml.GetStringTemp(buildUnit);
            if (string.IsNullOrEmpty(code))
            {
                code = "见中标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见中标信息";
            }
            specType = "其他";
            msgType = "工网在线";

            BidInfo info = ToolDb.GenBidInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register every anchor in the detail content that looks like an attachment.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeAtag != null && nodeAtag.Count > 0)
            {
                for (int c = 0; c < nodeAtag.Count; c++)
                {
                    ATag a = nodeAtag[c] as ATag;
                    if (a == null)
                    {
                        // A node that is not an ATag would otherwise raise outside any try block and abort the crawl.
                        continue;
                    }
                    if (a.Link.IsAtagAttach())
                    {
                        // NOTE(review): the attachment host differs from the list host (gy-center.net),
                        // and an absolute link would be prefixed as well - confirm.
                        string alink = "http://www.bidding.csg.cn/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated tender-notice list that starts at <c>SiteUrl</c>, downloads every
/// notice's detail page, builds an <c>InviteInfo</c> record from it and registers the
/// attachment links found in the detail content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is the text between "/" and "跳" in the pager cell, after removing script residue.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            // Pager text could not be parsed: crawl the first page only.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgs49/0e647d73-" + i.ToString() + ".html";
                html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                // A list page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            INode node = viewList[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A row without a link cannot be followed; dereferencing it would abort the whole crawl.
                continue;
            }

            Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            beginDate = regDate.Match(node.ToPlainTextString().Trim()).Value;
            InfoUrl = "http://lhbsc.szlhxq.gov.cn" + aTag.Link.Replace("../", "").Replace("./", "");
            prjName = aTag.GetAttribute("title");

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            // HtmlTxt keeps the markup (minus scripts); inviteCtx is the tag-free text used for field extraction.
            HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地址)(:|:)[^\r\n]+\r\n");
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();

            Regex regBuildUnit = new Regex(@"(招标代理机构|采购代理机构|采购人名称|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("采购人名称", "").Replace("采购代理机构", "").Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();

            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace("(", "").Replace(")", "").Trim();

            msgType = "深圳市龙华新区龙华街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                // Missing or over-long address: point the reader at the notice text instead.
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";

            // The invite type is fixed for this source. The value previously derived from the title
            // was overwritten here before it was ever read, so that dead computation was removed.
            inviteType = "小型工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区龙华街道办事处";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register every anchor in the detail content that looks like an attachment.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        // NOTE(review): only links containing "http" are kept; any other link is
                        // registered with an empty URL - confirm this is intended.
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated tender-notice grid that starts at <c>SiteUrl</c>, downloads the
/// detail page linked from every grid row and builds an <c>InviteInfo</c> record from it.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is taken from the "条,N页" fragment of the pager cell.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "50%")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"条,[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("条,", "").Replace("页", ""));
        }
        catch
        {
            // Pager text could not be parsed: crawl the first page only.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&p=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                // A list page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Datagrid")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        TableTag tab = viewList[0] as TableTag;
        if (tab == null)
        {
            continue;
        }

        for (int j = 0; j < tab.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = tab.Rows[j];

            // Rows that cannot be an entry (too few cells, or no link in the title cell) are skipped.
            // Indexing them used to throw outside any try block and abort the whole crawl.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            if (anchors == null || anchors.Count == 0)
            {
                continue;
            }
            ATag aTag = anchors[0] as ATag;
            if (aTag == null)
            {
                continue;
            }

            prjName = tr.Columns[1].ToPlainTextString().Replace("\r", "").Replace("\t", "").Replace("\n", "");
            Regex regDate = new Regex(@"\d{4}/\d{1,2}/\d{1,2}");
            beginDate = regDate.Match(tr.Columns[2].ToPlainTextString()).Value;
            InfoUrl = "http://www.xixiang.gov.cn/" + aTag.Link;

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            inviteCtx = dtl.AsString().Replace(" ", "");

            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();

            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();

            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            msgType = "深圳市宝安区西乡街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                // Missing or over-long address: point the reader at the notice text instead.
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";

            // The invite type is fixed for this source. The keyword chain that used to derive it
            // from the title was overwritten here before being read, so it was removed as dead code.
            inviteType = "小型工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区西乡街道办事处";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated bid-result list that starts at <c>SiteUrl</c>, downloads every
/// entry's detail page, builds a <c>BidInfo</c> record from it and registers the
/// attachment links found in the detail content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is the text between "/" and "跳" in the pager cell, after removing script residue.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            // Pager text could not be parsed: crawl the first page only.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgg/065b33d5-" + i.ToString() + ".html";
                html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                // A list page that cannot be downloaded is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            INode node = viewList[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A row without a link cannot be followed; dereferencing it would abort the whole crawl.
                continue;
            }

            Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            beginDate = regDate.Match(node.ToPlainTextString().Trim()).Value;
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://lhbsc.szlhxq.gov.cn" + aTag.Link.Replace("../", "").Replace("./", "");

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            // Regex.Replace and string.Replace return new strings. The results must be assigned:
            // previously they were discarded, leaving HtmlTxt and bidCtx empty for every record.
            // HtmlTxt keeps the markup (minus scripts); bidCtx is the tag-free text used for field extraction.
            HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            bidCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            bidCtx = Regex.Replace(bidCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("¥", "");

            buildUnit = ToolHtml.GetRegexString(bidCtx, "按(建设单位)", "(提供)");
            bidMoney = ToolHtml.GetRegexString(bidCtx, "(中标金额)", "(元)|(万元)|(;)").GetReplace(":", "").GetMoney("万元");

            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("中标供应商名称");
            }
            if (!string.IsNullOrEmpty(bidUnit) && bidUnit.Contains("公司"))
            {
                // Cut everything after the first "公司" so trailing text is not stored as part of the name.
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }

            if (string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetRegex("中标金额").GetReplace(":", "");
            }
            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购人名称");
            }
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);

            code = bidCtx.GetCodeRegex().GetReplace(")", "");
            if (string.IsNullOrWhiteSpace(code))
            {
                code = bidCtx.GetRegexBegEnd("招标编号:", ")");
            }

            prjMgr = bidCtx.GetMgrRegex();
            // Trim the qualification suffix AFTER the manager has been extracted. This step used to
            // run before the assignment, on a string that was always empty.
            if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            try
            {
                // Amounts above 100000 are converted to units of 10,000.
                if (Convert.ToDecimal(bidMoney) > 100000)
                {
                    bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                }
            }
            catch
            {
                // bidMoney is not numeric: keep it unchanged.
            }

            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区龙华街道办事处";
            }
            msgType = "深圳市龙华新区龙华街道办事处";
            specType = "建设工程";
            // The bid type is fixed for this source. The value previously derived from the title
            // was overwritten here before being read, so that dead computation was removed.
            bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register every anchor in the detail content that looks like an attachment.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        // NOTE(review): only links containing "http" are kept; any other link is
                        // registered with an empty URL - confirm this is intended.
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}