/// <summary>
/// Crawls the paginated notice list at <c>SiteUrl</c>, requests every notice's detail
/// content from http://www.ahtba.org.cn/Notice/NoticeContent and turns each notice that
/// contains a "new_detail" block into a <see cref="BidInfo"/> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The onclick handler of the last link inside the "pagination f_right" div is parsed
    // to obtain the number of list pages.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination f_right")),
                true),
            new TagNameFilter("a")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[pageNode.Count - 1].GetATagValue("onclick").GetRegexBegEnd("Info", ",").GetReplace("(");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: if the pager cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageSize=15&pageNum=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "newsList")),
                    true),
                new TagNameFilter("li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // The detail request needs the notice id taken from the link's query string.
            // Entries without an "id=" part are skipped instead of letting Substring throw
            // and abort the whole crawl.
            string link = aTag.Link;
            int idPos = string.IsNullOrEmpty(link) ? -1 : link.IndexOf("id=", StringComparison.Ordinal);
            if (idPos < 0)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            // The area is the text between the first pair of square brackets of the list entry
            // (ASCII brackets are normalised to 【 】 first).
            string area = node.ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.ahtba.org.cn" + link.GetReplace("amp;");
            string id = link.Substring(idPos).GetReplace("id=");

            string htmldtl;
            try
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "id" }, new string[] { id });
                htmldtl = this.ToolWebSite.GetHtmlByUrl("http://www.ahtba.org.cn/Notice/NoticeContent", nvc).GetJsString();
            }
            catch
            {
                // The detail could not be fetched; no record can be built for this entry.
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "new_detail")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml();
            bidCtx = HtmlTxt.ToLower().GetReplace("</p>,<br/>,<br>", "\r\n").ToCtxString();
            buildUnit = bidCtx.GetBuildRegex();
            code = bidCtx.GetCodeRegex().GetCodeDel();

            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("中标候选人名称,中签单位,第一成交候选人,成交候选人");
            }

            // Try progressively looser money patterns until one yields a non-zero value.
            bidMoney = bidCtx.GetMoneyRegex();
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyRegex(null, true);
            }
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = bidCtx.GetRegex("总额").GetMoney();
            }

            prjMgr = bidCtx.GetMgrRegex();

            // Fallback: when the plain text gave no bidder, scan the tables of the detail HTML.
            if (string.IsNullOrEmpty(bidUnit))
            {
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    for (int t = 0; t < tableNode.Count; t++)
                    {
                        TableTag tag = tableNode[t] as TableTag;
                        if (tag == null)
                        {
                            continue;
                        }

                        string classStr = tag.GetAttribute("class");
                        if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table_detail"))
                        {
                            continue;
                        }

                        // Flatten the table into "label:value" lines: odd cells (1-based) are
                        // followed by ':' and even cells end the line.
                        string ctx = string.Empty;
                        for (int r = 0; r < tag.RowCount; r++)
                        {
                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                            {
                                string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                if ((c + 1) % 2 == 0)
                                {
                                    ctx += temp + "\r\n";
                                }
                                else
                                {
                                    ctx += temp + ":";
                                }
                            }
                        }

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                        }
                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetMgrRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                        }

                        if (!bidUnit.Contains("公司"))
                        {
                            // Second table layout: row 4 is treated as the label row and row 5 as
                            // the value row, paired column by column (column 0 is skipped).
                            ctx = string.Empty;
                            try
                            {
                                for (int col = 1; col < tag.Rows[4].ColumnCount; col++)
                                {
                                    string temp = tag.Rows[4].Columns[col].ToNodePlainString().GetReplace(":,:");
                                    ctx += temp + ":";
                                    ctx += tag.Rows[5].Columns[col].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                }

                                bidUnit = ctx.GetBidRegex(null, true, 200);
                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                }
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetMgrRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                }
                            }
                            catch
                            {
                                // Best effort: tables without rows 4/5 (or with ragged columns)
                                // simply do not contribute; values found so far are kept.
                            }
                        }
                    }
                }
            }

            // Amounts above 100000 are divided by 10000.
            // NOTE(review): presumably converts yuan into units of 10,000 yuan - confirm.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            // Cut the manager name at the first trailing-noise marker (contact info, phone,
            // digits, parenthesised remarks, ...).
            string[] managerCutMarkers = { "联系", "电话", "2", "(", "二", "注册", "业绩", "I" };
            foreach (string marker in managerCutMarkers)
            {
                int cut = prjMgr.IndexOf(marker, StringComparison.Ordinal);
                if (cut >= 0)
                {
                    prjMgr = prjMgr.Remove(cut);
                }
            }
            if (prjMgr.Contains("投标") || prjMgr.IsNumber())
            {
                prjMgr = "";
            }

            // Keep the bidder name only up to and including the first "公司".
            int companyPos = bidUnit.IndexOf("公司", StringComparison.Ordinal);
            if (companyPos >= 0)
            {
                bidUnit = bidUnit.Remove(companyPos) + "公司";
            }
            bidUnit = bidUnit.GetReplace("名称,1,、I标段");
            prjMgr = prjMgr.GetReplace("1,、,一,第一中标人,第一中标,第中标人,第名,I标段,第中标候选人,标段").GetCodeDel();

            specType = bidType = "建设工程";
            msgType = "安徽省发展和改革委员会";

            BidInfo info = ToolDb.GenBidInfo("安徽省", "安徽省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the single list page at <c>SiteUrl</c>, downloads every linked notice from
/// http://ggzy.jinan.gov.cn and turns each notice that contains a "zoom" div into a
/// <see cref="BidInfo"/> record. Attachment links found in the notice are added to
/// <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Without the list page there is nothing to crawl.
        return list;
    }

    // Every list entry is a table with width="513".
    Parser parser = new Parser(new Lexer(html));
    NodeList listNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "513")));
    if (listNode == null || listNode.Count == 0)
    {
        return list;
    }

    for (int j = 0; j < listNode.Count; j++)
    {
        TableTag tag = listNode[j] as TableTag;
        if (tag == null)
        {
            continue;
        }

        // Tables carrying an align or style attribute are not list entries.
        string align = tag.GetAttribute("align");
        string style = tag.GetAttribute("style");
        if (!string.IsNullOrWhiteSpace(align) || !string.IsNullOrWhiteSpace(style))
        {
            continue;
        }

        // The entry must have a first row with at least three cells: cell 1 holds the link,
        // cell 2 the date. Malformed entries are skipped instead of crashing the crawl.
        if (tag.RowCount < 1)
        {
            continue;
        }
        TableRow tr = tag.Rows[0];
        if (tr.ColumnCount < 3)
        {
            continue;
        }
        ATag aTag = tr.Columns[1].GetATag();
        if (aTag == null)
        {
            continue;
        }

        string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
               code = string.Empty, endDate = string.Empty, bidType = string.Empty,
               specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

        string prjName = aTag.GetAttribute("title");
        string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
        string InfoUrl = "http://ggzy.jinan.gov.cn" + aTag.Link;

        string htmldtl;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
        if (dtlNode == null || dtlNode.Count == 0)
        {
            continue;
        }

        HtmlTxt = dtlNode.AsHtml().GetReplace("</p>,<br/>", "\r\n");

        parser = new Parser(new Lexer(HtmlTxt));
        NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        if (tableNode != null && tableNode.Count > 0)
        {
            // Flatten the first table into "label:value" lines: blank cells are skipped,
            // odd cells (1-based) are followed by ':' and even cells end the line.
            TableTag table = tableNode[0] as TableTag;
            if (table != null)
            {
                for (int r = 0; r < table.RowCount; r++)
                {
                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                    {
                        string temp = table.Rows[r].Columns[c].ToNodePlainString();
                        if (string.IsNullOrWhiteSpace(temp))
                        {
                            continue;
                        }
                        if ((c + 1) % 2 == 0)
                        {
                            bidCtx += temp.GetReplace(":,:") + "\r\n";
                        }
                        else
                        {
                            bidCtx += temp.GetReplace(":,:") + ":";
                        }
                    }
                }
            }
        }
        else
        {
            // No table: use the plain text of the whole detail block.
            bidCtx = HtmlTxt.ToCtxString().GetReplace("begin-->,end-->");
        }

        // Keep the build unit only up to and including the first "公司", then drop any
        // trailing contact / address text.
        buildUnit = bidCtx.GetBuildRegex().GetReplace(" ");
        int companyPos = buildUnit.IndexOf("公司", StringComparison.Ordinal);
        if (companyPos >= 0)
        {
            buildUnit = buildUnit.Remove(companyPos) + "公司";
        }
        int contactPos = buildUnit.IndexOf("联系", StringComparison.Ordinal);
        if (contactPos >= 0)
        {
            buildUnit = buildUnit.Remove(contactPos);
        }
        int addressPos = buildUnit.IndexOf("地址", StringComparison.Ordinal);
        if (addressPos >= 0)
        {
            buildUnit = buildUnit.Remove(addressPos);
        }

        code = bidCtx.GetCodeRegex().GetCodeDel();
        bidUnit = bidCtx.GetBidRegex();
        bidMoney = bidCtx.GetMoneyRegex();
        prjMgr = bidCtx.GetMgrRegex();
        msgType = "济南市公共资源交易中心";
        specType = "政府采购";
        bidType = "建设工程";

        BidInfo info = ToolDb.GenBidInfo("山东省", "山东省及地市", "济南市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
        list.Add(info);

        // Register every attachment link found in the detail HTML.
        parser = new Parser(new Lexer(HtmlTxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null && aNode.Count > 0)
        {
            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a == null || !a.IsAtagAttach())
                {
                    continue;
                }

                string link;
                if (a.Link.ToLower().Contains("http"))
                {
                    link = a.Link;
                }
                else
                {
                    // Relative link: strip "../" and "./" and prefix the site root.
                    link = "http://ggzy.jinan.gov.cn" + a.Link.GetReplace("../,./");
                }

                // Links longer than 500 bytes (default encoding) are skipped.
                // NOTE(review): presumably a storage column limit - confirm.
                if (Encoding.Default.GetByteCount(link) > 500)
                {
                    continue;
                }

                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the paginated notice list at <c>SiteUrl</c> and downloads every linked notice
/// from http://www.zqgcjy.com/. Entries whose link text contains "中标" or "结果" become
/// <see cref="BidInfo"/> records; all other entries become <see cref="InviteInfo"/>
/// records. Both kinds are added to the same returned list.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records (never <c>null</c>). The list is empty when the first list page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    // The page count is the number between "共" and "页" in the "table1" table text.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: if the pager cannot be parsed only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 was already downloaded above.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        pageNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position6")),
                    true),
                new TagNameFilter("li")));
        if (pageNode == null || pageNode.Count == 0)
        {
            continue;
        }

        // The first three <li> elements are skipped.
        // NOTE(review): presumably header / navigation items - confirm against the page.
        for (int j = 3; j < pageNode.Count; j++)
        {
            INode node = pageNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // List item without a link: nothing to download.
                continue;
            }

            string psName = aTag.LinkText;
            if (psName.Contains("中标") || psName.Contains("结果"))
            {
                // ---- bid result notice ----
                string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                       code = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                       bidType = string.Empty, specType = string.Empty, msgType = string.Empty,
                       bidCtx = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                       HtmlTxt = string.Empty;

                string prjName = aTag.GetAttribute("title");
                string InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;

                string htmldetail;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldetail));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                HtmlTxt = dtnode.AsHtml();
                bidCtx = HtmlTxt.ToCtxString();
                beginDate = bidCtx.GetDateRegex();
                code = bidCtx.GetCodeRegex();

                // Try progressively looser money patterns until one yields a non-zero value.
                bidMoney = bidCtx.GetMoneyRegex();
                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                {
                    bidMoney = bidCtx.GetMoneyRegex(null, true);
                }
                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                {
                    bidMoney = bidCtx.GetRegex("总额").GetMoney();
                }

                prjMgr = bidCtx.GetMgrRegex();
                bidUnit = bidCtx.GetBidRegex();
                buildUnit = bidCtx.GetBuildRegex();

                // Fill remaining gaps from the tables nested inside the detail block.
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    for (int t = 0; t < tableNode.Count; t++)
                    {
                        TableTag tag = tableNode[t] as TableTag;
                        if (tag == null)
                        {
                            continue;
                        }

                        // The outer "table1" layout table itself is skipped.
                        string classStr = tag.GetAttribute("class");
                        if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table1"))
                        {
                            continue;
                        }

                        string ctx = FlattenTableCells(tag);

                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一候选人");
                        }
                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetMgrRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                        }

                        if (!bidUnit.Contains("公司"))
                        {
                            // Second table layout: row 4 is treated as the label row and row 5
                            // as the value row, paired column by column (column 0 is skipped).
                            ctx = string.Empty;
                            try
                            {
                                for (int col = 1; col < tag.Rows[4].ColumnCount; col++)
                                {
                                    string temp = tag.Rows[4].Columns[col].ToNodePlainString().GetReplace(":,:");
                                    ctx += temp + ":";
                                    ctx += tag.Rows[5].Columns[col].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                }

                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                }
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetMgrRegex();
                                }
                                if (string.IsNullOrEmpty(prjMgr))
                                {
                                    prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                }
                            }
                            catch
                            {
                                // Best effort: tables without rows 4/5 (or with ragged columns)
                                // simply do not contribute; values found so far are kept.
                            }
                        }
                    }
                }

                msgType = "肇庆工程交易中心";
                specType = bidType = "建设工程";

                BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                // ---- invitation (tender) notice ----
                string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                       inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty,
                       beginDate = string.Empty, endDate = string.Empty, remark = string.Empty,
                       msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                string prjName = aTag.GetAttribute("title");
                string InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;

                string htmldtl;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldtl));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                HtmlTxt = dtnode.AsHtml();
                inviteCtx = HtmlTxt.ToCtxString();
                buildUnit = inviteCtx.GetBidUnitDel().GetBuildRegex();
                beginDate = inviteCtx.GetDateRegex();
                prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                code = inviteCtx.GetReplace(" ").GetCodeRegex().GetCodeDel();
                prjAddress = ToolHtml.GetSubString(prjAddress, 150);

                // Parse the tables of THIS notice's detail HTML. A fresh parser is required:
                // the shared list-page parser refers to different (already consumed) content.
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    TableTag tag = tableNode[0] as TableTag;
                    string ctx = tag == null ? string.Empty : FlattenTableCells(tag);

                    if (string.IsNullOrEmpty(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                }

                msgType = "肇庆工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    // Return what was collected; returning null here would discard every record of a
    // crawl that ran to completion.
    return list;
}

/// <summary>
/// Flattens a table into "label:value" lines: odd cells (1-based) are followed by ':'
/// and even cells end the line with CRLF.
/// </summary>
private static string FlattenTableCells(TableTag table)
{
    string text = string.Empty;
    for (int r = 0; r < table.RowCount; r++)
    {
        for (int c = 0; c < table.Rows[r].ColumnCount; c++)
        {
            string cell = table.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
            if ((c + 1) % 2 == 0)
            {
                text += cell + "\r\n";
            }
            else
            {
                text += cell + ":";
            }
        }
    }
    return text;
}