/// <summary>
/// Crawls the paged announcement listing served at <c>SiteUrl</c> for the 90 days ending today
/// and builds an <c>InviteInfo</c> for every entry whose fourth <c>span.span_on</c> text
/// contains "招标/资审公告".
/// </summary>
/// <remarks>
/// For each matching entry the detail page is downloaded; when it contains a
/// <c>div#div_0101</c> block, the page referenced by that block's <c>onclick</c> handler is
/// downloaded and used instead. Entries whose detail page cannot be downloaded, or that have no
/// <c>div.detail</c> content, are skipped. When a link is found under <c>div.detail_content</c>
/// it is recorded as an attachment in <c>base.AttachList</c>.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the number of collected records reaches
/// <c>MaxCount</c>; when <c>true</c>, every listed page is processed.
/// </param>
/// <returns>The collected <c>InviteInfo</c> records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Search window: the 90 days ending today.
    DateTime windowEnd = DateTime.Today;
    DateTime windowStart = windowEnd.AddDays(-90);

    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    string cookiestr = string.Empty;

    try
    {
        NameValueCollection firstPageForm = BuildDealSearchForm(windowStart, windowEnd, 1);
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, firstPageForm, Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        // Without the first page there is neither a page count nor any entry to process.
        return list;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // The total page count is printed as "共 N 页" inside the paging bar.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")), true), new TagNameFilter("span")));
    if (pageNode != null && pageNode.Count > 0)
    {
        int parsedPageCount;
        if (int.TryParse(pageNode.AsString().GetRegexBegEnd("共", "页"), out parsedPageCount))
        {
            pageInt = parsedPageCount;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                NameValueCollection pageForm = BuildDealSearchForm(windowStart, windowEnd, i);
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, pageForm, Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                // A single unreachable page must not stop the remaining pages.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "publicont")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // The first span.span_on is passed to GetPrivoce below, the fourth one carries the
            // announcement kind. Both are required, so entries with fewer spans are skipped
            // instead of indexing past the end of the node list.
            parser = new Parser(new Lexer(node.ToHtml()));
            NodeList txtNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "span_on")));
            string sehu = string.Empty;
            string nlse = string.Empty;
            if (txtNode != null && txtNode.Count > 3)
            {
                sehu = txtNode[0].ToNodePlainString();
                nlse = txtNode[3].ToNodePlainString();
            }

            if (string.IsNullOrEmpty(nlse) || !nlse.Contains("招标/资审公告"))
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            prjName = aTag.GetAttribute("title");
            beginDate = node.ToPlainTextString().GetDateRegex();
            InfoUrl = aTag.Link.GetReplace("amp;");

            string htmlDtl = string.Empty;
            try
            {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
            }
            catch (Exception)
            {
                continue;
            }

            // When the page has a div#div_0101 block, the announcement body lives on a second
            // page whose path is embedded in that block's onclick handler.
            parser = new Parser(new Lexer(htmlDtl));
            NodeList zsList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "div_0101")));
            if (zsList != null && zsList.Count > 0)
            {
                ATag aTagzs = zsList[0].GetATag();
                string urlzs = aTagzs == null ? null : aTagzs.GetAttribute("onclick");
                if (!string.IsNullOrEmpty(urlzs))
                {
                    string urls = urlzs.GetReplace("showdetail(this, '0101','", "").GetReplace("')", "").Replace(",", "").Replace(")", "");
                    urls = "http://www.ggzy.gov.cn/information" + urls;
                    try
                    {
                        htmlDtl = this.ToolWebSite.GetHtmlByUrl(urls, Encoding.UTF8);
                        htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                    }
                    catch (Exception)
                    {
                        // Handled like a failed primary detail fetch: skip this entry rather
                        // than abort the crawl and lose the records collected so far.
                        continue;
                    }
                }
            }

            parser = new Parser(new Lexer(htmlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlList.AsHtml();
            inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();

            // Optional: append the link found in the first p.p_o paragraph to the text.
            string ctxUrl = string.Empty;
            try
            {
                Parser parurl = new Parser(new Lexer(HtmlTxt));
                NodeList zsUrl = parurl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "p_o")));
                if (zsUrl != null && zsUrl.Count > 0)
                {
                    ATag aTagurl = zsUrl[0].GetATag();
                    if (aTagurl != null)
                    {
                        ctxUrl = "原文链接地址 : " + aTagurl.Link;
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: the record is still stored without the link.
            }

            inviteCtx = inviteCtx + ctxUrl;
            prjAddress = inviteCtx.GetAddressRegex();
            buildUnit = inviteCtx.GetBuildRegex();
            code = inviteCtx.GetCodeRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("招标人");
            }
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);

            if (string.IsNullOrWhiteSpace(code))
            {
                // Fallback: flatten the first table into "label:value" lines and retry the
                // extraction on that text.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    StringBuilder ctxBuilder = new StringBuilder();
                    TableTag bidTable = bidNode[0] as TableTag;
                    if (bidTable != null)
                    {
                        for (int r = 0; r < bidTable.RowCount; r++)
                        {
                            // Rows with a single (e.g. merged) cell carry no label/value pair;
                            // skip them instead of giving up on the rest of the table.
                            if (bidTable.Rows[r].ColumnCount < 2)
                            {
                                continue;
                            }
                            ctxBuilder.Append(bidTable.Rows[r].Columns[0].ToNodePlainString()).Append(":");
                            ctxBuilder.Append(bidTable.Rows[r].Columns[1].ToNodePlainString()).Append("\r\n");
                        }
                    }

                    string ctx = ctxBuilder.ToString();
                    if (string.IsNullOrWhiteSpace(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    if (string.IsNullOrWhiteSpace(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }
                    code = ctx.GetCodeRegex();
                }
            }

            msgType = "国家信息中心";
            specType = "建设工程";
            inviteType = "建设工程";

            string[] provs = GetPrivoce(sehu);
            InviteInfo info = ToolDb.GenInviteInfo(provs[0], provs[1], "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Optional: record the link found under div.detail_content as an attachment.
            try
            {
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList nodeFm = parser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_content"))));
                if (nodeFm != null && nodeFm.Count > 0)
                {
                    ATag aTagFm = nodeFm[0].GetATag();
                    if (aTagFm != null)
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach("内容(点击下载)", info.Id, aTagFm.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: a missing or malformed attachment link does not invalidate the record.
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}

/// <summary>
/// Builds the form fields posted to the listing search for one result page.
/// </summary>
/// <param name="windowStart">First day of the search window.</param>
/// <param name="windowEnd">Last day of the search window.</param>
/// <param name="pageNumber">1-based result page to request.</param>
/// <returns>The name/value pairs produced by <c>ToolWebSite.GetNameValueCollection</c>.</returns>
private NameValueCollection BuildDealSearchForm(DateTime windowStart, DateTime windowEnd, int pageNumber)
{
    // NOTE(review): the dates are formatted with the culture-dependent default ToString(), as
    // before; confirm which date format the server expects before changing this.
    string begin = windowStart.ToString();
    string end = windowEnd.ToString();

    return this.ToolWebSite.GetNameValueCollection(
        new string[] { "TIMEBEGIN_SHOW", "TIMEEND_SHOW", "TIMEBEGIN", "TIMEEND", "DEAL_TIME", "DEAL_CLASSIFY", "DEAL_STAGE", "DEAL_PROVINCE", "DEAL_CITY", "DEAL_PLATFORM", "DEAL_TRADE", "isShowAll", "PAGENUMBER", "FINDTXT" },
        new string[] { begin, end, begin, end, "02", "01", "0101", "0", "0", "0", "0", "1", pageNumber.ToString(), "" });
}
/// <summary>
/// Crawls the paged listing that starts at <c>SiteUrl</c> and builds one record per table row:
/// a <c>BidInfo</c> when the row title contains "中标", "结果" or "候选单位公示", otherwise an
/// <c>InviteInfo</c>.
/// </summary>
/// <remarks>
/// Rows whose detail page cannot be downloaded, or whose detail page has no
/// <c>div#zoomcon</c> block, are skipped.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the number of collected records reaches
/// <c>MaxCount</c>; when <c>true</c>, every listed page is processed.
/// </param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("div")), new HasAttributeFilter("id", "page_div")));
    if (sNode != null && sNode.Count > 0)
    {
        // The pager prints "共 N 页"; when N cannot be parsed, fall back to 7 pages.
        string page = ToolHtml.GetRegexString(sNode.AsString(), "共", "页");
        int parsedPageCount;
        pageInt = int.TryParse(page, out parsedPageCount) ? parsedPageCount : 7;
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // NOTE(review): the first page and the detail pages are decoded as UTF-8 while
                // these pages use Encoding.Default; left as-is, confirm the site's real charset.
                html = this.ToolWebSite.GetHtmlByUrl("http://www.conghua.gov.cn/zgch/zbzb/list_" + i.ToString() + ".shtml", Encoding.Default);
            }
            catch (Exception)
            {
                // A single unreachable page must not stop the remaining pages.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_list"))), new TagNameFilter("table")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        TableTag table = sNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // The publication date is read from the second cell; rows without it are skipped.
            if (tr.ColumnCount < 2)
            {
                continue;
            }

            string rowHtml = tr.ToHtml();
            string projectName = ToolHtml.GetHtmlAtagValue("title", rowHtml);
            if (projectName == null)
            {
                projectName = string.Empty;
            }

            // Work shared by both record kinds: classify, read the date, fetch the detail page.
            bool isBidResult = projectName.Contains("中标") || projectName.Contains("结果") || projectName.Contains("候选单位公示");
            string detectedType = ToolHtml.GetInviteTypes(projectName);
            string beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
            string InfoUrl = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", rowHtml).Replace("..", "");

            string htmlDtl = string.Empty;
            try
            {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
            }
            catch (Exception)
            {
                continue;
            }

            parser = new Parser(new Lexer(htmlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.ToHtml();
            string code = string.Empty, endDate = string.Empty, otherType = string.Empty;
            string msgType = "广州建设工程交易中心";
            string specType = "建设工程";

            if (!isBidResult)
            {
                string prjAddress = string.Empty, remark = string.Empty;

                string inviteCtx = dtlList.AsString().Replace(" ", "");
                string buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex, true);
                if (!string.IsNullOrEmpty(buildUnit) && buildUnit.Contains(" "))
                {
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                }
                buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = "广州建设工程交易中心";
                }

                // Keep the detected type; default only when none was detected.
                string inviteType = string.IsNullOrEmpty(detectedType) ? "小型工程" : detectedType;

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "从化市", string.Empty, code, projectName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
            }
            else
            {
                string bidUnit = string.Empty, bidMoney = string.Empty, prjMgr = string.Empty;

                string bidCtx = dtlList.AsString();
                string buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true);
                buildUnit = ToolHtml.GetSubString(buildUnit, 150);

                // Fixed: the original ternary was inverted and replaced every detected type
                // with the default while leaving an empty type empty.
                string bidType = string.IsNullOrEmpty(detectedType) ? "小型工程" : detectedType;

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    // Flatten the first table into "label:value" lines for the regex helpers.
                    StringBuilder ctxBuilder = new StringBuilder();
                    TableTag bidTable = bidNode[0] as TableTag;
                    if (bidTable != null)
                    {
                        for (int r = 0; r < bidTable.RowCount; r++)
                        {
                            TableRow bidRow = bidTable.Rows[r];
                            // Rows with a single (e.g. merged) cell carry no label/value pair;
                            // skip them instead of giving up on the rest of the table.
                            if (bidRow.ColumnCount < 2)
                            {
                                continue;
                            }
                            ctxBuilder.Append(bidRow.Columns[0].ToNodePlainString()).Append(":");
                            ctxBuilder.Append(bidRow.Columns[1].ToNodePlainString()).Append("\r\n");
                        }
                    }

                    string ctx = ctxBuilder.ToString();
                    bidUnit = ctx.GetRegex("单位名称,承包意向人名称");
                    bidMoney = ctx.GetMoneyRegex();
                    prjMgr = ctx.GetMgrRegex();
                    if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("/"))
                    {
                        // Keep only the part before the first slash.
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("/"));
                    }
                }

                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = "广州建设工程交易中心";
                }

                BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, projectName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}