/// <summary>
/// Crawls the list pages of ps.szzfcg.cn, downloads the detail page of every listed notice and
/// converts it into a <see cref="BidInfo"/>. Links on the detail page that look like attachments
/// are registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>this.MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Best effort: nothing can be crawled without the first list page.
        return list;
    }

    // The page count is the text between "(" and "," in the onclick handler of the last pager link.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "clearfix")), true), new TagNameFilter("a")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[sNode.Count - 1].GetATag().GetAttribute("onclick").Replace("(", "kdxx").Replace(",", "xxdk");
            pageInt = int.Parse(temp.GetRegexBegEnd("kdxx", "xxdk"));
        }
        catch
        {
            // Pager missing or in an unexpected format: crawl the first page only.
            pageInt = 1;
        }
    }

    Regex regexLink = new Regex(@"id=[^-]+");
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://ps.szzfcg.cn/portal/topicView.do?method=view1&id=2887108&siteId=9&underwayFlag=undefined&tstmp=17%3A48%3A43%20GMT%2B0800&page=" + i, Encoding.UTF8);
            }
            catch
            {
                // A page that cannot be downloaded is skipped; the remaining pages are still crawled.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "fixed")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Not a data row (title in column 0, date in column 1).
                continue;
            }
            ATag aTag = tr.Columns[0].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string beginDate = tr.Columns[1].ToNodePlainString().GetDateRegex("yyyy/MM/dd");
            string prjName = aTag.GetAttribute("title");
            string infoUrl = "http://ps.szzfcg.cn/portal/documentView.do?method=view&" + regexLink.Match(aTag.Link).Value;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bidCtx = PingshanCleanText(htmlTxt.ToCtxString());

            string buildUnit = bidCtx.GetBuildRegex();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();
            if (!string.IsNullOrEmpty(bidUnit) && bidMoney == "0")
            {
                bidMoney = bidCtx.GetMoneyRegex(null, true, "万元");
            }
            if (!string.IsNullOrEmpty(bidUnit) && (string.IsNullOrEmpty(bidMoney) || bidMoney == "0"))
            {
                bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标金额", "金额" }, false, "万元");
            }

            // The plain text did not yield a bidder: fall back to the tables of the page.
            PingshanReadBidFromTables(htmldtl, htmlTxt, bidCtx, ref bidUnit, ref bidMoney);

            if (string.IsNullOrEmpty(bidUnit) && bidCtx.Contains("供应商不足"))
            {
                bidUnit = "没有中标商";
                bidMoney = "0";
            }

            // Amounts above 100000 are passed through GetMoney(); unparsable amounts are left as they are.
            decimal mon;
            if (bidMoney != "0" && decimal.TryParse(bidMoney, out mon) && mon > 100000)
            {
                bidMoney = bidMoney.GetMoney();
            }

            string code = PingshanExtractCode(bidCtx);
            bidUnit = bidUnit.Replace("名称", "");

            string msgType = "深圳市坪山新区公共资源交易中心";
            string specType = "政府采购";
            string bidType = "服务";
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳政府采购", "坪山新区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, infoUrl, string.Empty, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aTagNode != null)
            {
                for (int k = 0; k < aTagNode.Count; k++)
                {
                    ATag aFile = aTagNode[k].GetATag();
                    if (aFile == null)
                    {
                        continue;
                    }
                    if (aFile.IsAtagAttach() || aFile.Link.ToLower().Contains("down"))
                    {
                        string link = aFile.Link.Contains("http") ? aFile.Link : "http://ps.szzfcg.cn/" + aFile.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(aFile.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>
/// Collapses runs of blank lines in the detail text, HTML-decodes it and removes any numeric
/// character references ("&amp;#...;") that are still present afterwards.
/// </summary>
private static string PingshanCleanText(string text)
{
    // Collapse until stable (the original applied a fixed number of Replace passes).
    text = text.Replace("\r\n\r\n\r\n\r\n", "\r\n");
    while (text.Contains("\r\n\r\n"))
    {
        text = text.Replace("\r\n\r\n", "\r\n");
    }
    while (text.Contains("\r\n\t\r\n\t\r\n"))
    {
        text = text.Replace("\r\n\t\r\n\t\r\n", "\r\n\t");
    }

    text = System.Web.HttpUtility.HtmlDecode(text);
    while (true)
    {
        string entity = text.GetRegexBegEnd("&#", ";");
        if (string.IsNullOrEmpty(entity))
        {
            break;
        }
        string stripped = text.Replace("&#" + entity + ";", "");
        if (stripped.Length == text.Length)
        {
            // Nothing was removed, so the same match would be returned forever: stop.
            break;
        }
        text = stripped;
    }
    return text;
}

/// <summary>
/// Reads the project code from the detail text: first via the labelled code regex, then as the
/// text between a code label and the next closing parenthesis. Codes longer than 50 bytes are dropped.
/// </summary>
private static string PingshanExtractCode(string bidCtx)
{
    string code = bidCtx.GetCodeRegex(new string[] { "工程编号", "项目编号", "招标编号", "中标编号" }).GetCodeDel();

    // NOTE(review): the original chained each Replace twice with an identical argument, which is
    // a no-op. The second call presumably targeted the full-width form of the character; it is
    // written as a \u escape here so that it cannot be folded to ASCII again. Confirm.
    string marked = bidCtx.Replace(")", "kdxx").Replace("\uFF09", "kdxx");
    string[] labels = { "招标编号", "项目编号", "工程编号", "编号" };
    for (int n = 0; n < labels.Length && string.IsNullOrEmpty(code); n++)
    {
        code = marked.GetRegexBegEnd(labels[n], "kdxx").Replace(":", "").Replace("\uFF1A", "");
    }

    if (Encoding.Default.GetByteCount(code) > 50)
    {
        code = string.Empty;
    }
    if (!string.IsNullOrEmpty(code))
    {
        code = code.GetChina();
    }
    return code.Replace("(", "").Replace("\uFF08", "").Replace(")", "").Replace("\uFF09", "");
}

/// <summary>
/// Table-based fallback for bidder and amount. Does nothing while <paramref name="bidUnit"/> is
/// already known. Pass 1 looks at the tables inside the element with id "holder", pass 2 at all
/// tables of the page; text gathered in pass 1 is kept and extended in pass 2.
/// </summary>
private void PingshanReadBidFromTables(string htmldtl, string htmlTxt, string bidCtx, ref string bidUnit, ref string bidMoney)
{
    string ctx = string.Empty;

    if (string.IsNullOrEmpty(bidUnit))
    {
        Parser parser = new Parser(new Lexer(htmldtl));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "holder")), true), new TagNameFilter("table")));
        if (dtList != null && dtList.Count > 0)
        {
            ctx = PingshanFirstBidTableText(dtList);
            if (string.IsNullOrEmpty(ctx))
            {
                if (dtList.Count > 3)
                {
                    ctx = PingshanCaptionValueText(dtList[2] as TableTag, true, false);
                    // Fall back to the second table only when the third one mentions none of the
                    // supplier captions. The original joined the three tests with "||", which is
                    // true unless all three captions are present, so the third table was
                    // effectively always discarded.
                    if (!ctx.Contains("投标供应商") && !ctx.Contains("成交供应商") && !ctx.Contains("中标供应商"))
                    {
                        ctx = PingshanCaptionValueText(dtList[1] as TableTag, true, false);
                    }
                }
                else if (dtList.Count > 2)
                {
                    ctx = PingshanCaptionValueText(dtList[1] as TableTag, true, false);
                }
                else
                {
                    ctx = PingshanCaptionValueText(dtList[0] as TableTag, true, true);
                }
            }

            bidUnit = ctx.GetBidRegex();
            bidMoney = ctx.GetMoneyRegex(new string[] { "成交金额" });
            if (bidMoney == "" || bidMoney == "0")
            {
                bidMoney = ctx.GetMoneyRegex();
            }

            if (!string.IsNullOrEmpty(bidUnit) && bidMoney == "0")
            {
                // Bidder known, amount not: look for the table row that names the bidder.
                string dtlCtx = PingshanBidderRowText(dtList[0] as TableTag, bidUnit);
                if (string.IsNullOrEmpty(dtlCtx))
                {
                    Parser tableParser = new Parser(new Lexer(htmlTxt));
                    NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    // Tables 1..5 of the whole body are tried in order until one yields text.
                    for (int t = 1; t <= 5 && tableNode != null && t < tableNode.Count && string.IsNullOrEmpty(dtlCtx); t++)
                    {
                        dtlCtx = PingshanBidderRowText(tableNode[t] as TableTag, bidUnit);
                    }
                }
                string unit = dtlCtx.GetBidRegex();
                string money = dtlCtx.GetMoneyRegex();
                if (bidUnit == unit)
                {
                    bidMoney = money;
                }
            }

            if (bidUnit.Contains("无中标") || bidUnit.Contains("没有"))
            {
                bidUnit = "没有中标商";
                bidMoney = "0";
            }
        }
    }

    if (string.IsNullOrEmpty(bidUnit))
    {
        Parser parser = new Parser(new Lexer(htmldtl));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        if (dtList != null && dtList.Count > 0)
        {
            ctx += PingshanFirstBidTableText(dtList);
            if (string.IsNullOrEmpty(ctx))
            {
                // More than three tables: use the third; two or three: the second; otherwise the only one.
                int fallback = dtList.Count > 3 ? 2 : (dtList.Count > 1 ? 1 : 0);
                ctx = PingshanCaptionValueText(dtList[fallback] as TableTag, true, false);
            }

            PingshanFillFromTableText(ctx, bidCtx, ref bidUnit, ref bidMoney);
            if (string.IsNullOrEmpty(bidUnit))
            {
                if (dtList.Count > 4)
                {
                    ctx += PingshanCaptionValueText(dtList[dtList.Count - 1] as TableTag, true, false);
                }
                PingshanFillFromTableText(ctx, bidCtx, ref bidUnit, ref bidMoney);
            }

            if (bidUnit.Contains("无中标") || bidUnit.Contains("没有"))
            {
                bidUnit = "没有中标商";
                bidMoney = "0";
            }
        }
    }
}

/// <summary>
/// Re-reads the bidder from table text and fills the amount only while it is still empty or "0".
/// </summary>
private static void PingshanFillFromTableText(string ctx, string bidCtx, ref string bidUnit, ref string bidMoney)
{
    bidUnit = ctx.GetBidRegex();
    if (string.IsNullOrEmpty(bidUnit))
    {
        bidUnit = ctx.GetRegex("中标承包商");
    }
    if (string.IsNullOrEmpty(bidUnit))
    {
        bidUnit = ctx.GetRegex("中标(成交)供应商");
    }
    if (string.IsNullOrEmpty(bidUnit))
    {
        // NOTE(review): same caption with full-width parentheses, in case the page uses them — confirm.
        bidUnit = ctx.GetRegex("中标\uFF08成交\uFF09供应商");
    }
    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
    {
        bidMoney = ctx.GetMoneyRegex();
    }
    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
    {
        bidMoney = bidCtx.GetRegex("中标价").GetMoney();
    }
}

/// <summary>
/// Returns the caption/value text of the first table accepted by <c>IsTableBid</c>, or an empty
/// string when no table is accepted.
/// </summary>
private string PingshanFirstBidTableText(NodeList tables)
{
    for (int c = 0; c < tables.Count; c++)
    {
        TableTag tab = tables[c] as TableTag;
        if (IsTableBid(tab))
        {
            return PingshanCaptionValueText(tab, false, false);
        }
    }
    return string.Empty;
}

/// <summary>
/// Renders row 0 (captions) and row 1 (values) of a table as "caption:value\r\n" lines.
/// A caption without a matching value cell is emitted as "caption:" only.
/// </summary>
/// <param name="tab">Table to read; <c>null</c> yields an empty string.</param>
/// <param name="requireValueRow">When <c>true</c>, a table with fewer than two rows yields an empty string.</param>
/// <param name="htmlDecode">When <c>true</c>, every cell text is HTML-decoded.</param>
private static string PingshanCaptionValueText(TableTag tab, bool requireValueRow, bool htmlDecode)
{
    if (tab == null || tab.RowCount == 0 || (requireValueRow && tab.RowCount < 2))
    {
        return string.Empty;
    }

    TableRow captions = tab.Rows[0];
    TableRow values = tab.RowCount > 1 ? tab.Rows[1] : null;
    StringBuilder text = new StringBuilder();
    for (int d = 0; d < captions.ColumnCount; d++)
    {
        string caption = captions.Columns[d].ToNodePlainString();
        text.Append(htmlDecode ? System.Web.HttpUtility.HtmlDecode(caption) : caption).Append(":");
        // Explicit bounds check instead of swallowing the IndexOutOfRangeException.
        if (values != null && d < values.ColumnCount)
        {
            string value = values.Columns[d].ToNodePlainString();
            text.Append(htmlDecode ? System.Web.HttpUtility.HtmlDecode(value) : value).Append("\r\n");
        }
    }
    return text.ToString();
}

/// <summary>
/// Finds the first row, the last row excluded, whose text contains <paramref name="bidUnit"/> and
/// renders it against the captions of row 0 as "caption:value\r\n" lines.
/// </summary>
private static string PingshanBidderRowText(TableTag tab, string bidUnit)
{
    if (tab == null || tab.RowCount == 0)
    {
        return string.Empty;
    }

    TableRow captions = tab.Rows[0];
    // "c + 2 <= RowCount" is the original loop guard: the last row is never inspected.
    for (int c = 0; c + 2 <= tab.RowCount; c++)
    {
        TableRow row = tab.Rows[c];
        if (!row.ToNodePlainString().Contains(bidUnit))
        {
            continue;
        }
        StringBuilder text = new StringBuilder();
        for (int d = 0; d < row.ColumnCount && d < captions.ColumnCount; d++)
        {
            text.Append(captions.Columns[d].ToNodePlainString()).Append(":");
            text.Append(row.Columns[d].ToNodePlainString()).Append("\r\n");
        }
        return text.ToString();
    }
    return string.Empty;
}
/// <summary>
/// Crawls the paginated notice list at <c>this.SiteUrl</c>, downloads each detail page from
/// www.zzjs.com.cn and converts it into a <see cref="BidInfo"/>. Attachment links of the detail
/// table are registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>this.MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Best effort: nothing can be crawled without the first list page.
        return list;
    }

    // The page count is the text between "/共" and "页" in the last pagination element.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "pagination")));
    if (pageNode != null && pageNode.Count > 0)
    {
        string temp = pageNode[pageNode.Count - 1].ToNodePlainString().GetRegexBegEnd("/共", "页");
        int parsed;
        if (int.TryParse(temp, out parsed))
        {
            pageInt = parsed;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "pageindex", "X-Requested-With" }, new string[] { i.ToString(), "XMLHttpRequest" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
            }
            catch
            {
                // A page that cannot be downloaded is skipped; the remaining pages are still crawled.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "left_picinfo_text")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A list item without a link cannot be followed; skipping it keeps one odd item
                // from aborting the whole crawl with a NullReferenceException.
                continue;
            }

            string prjName = aTag.LinkText;
            string beginDate = node.ToPlainTextString().GetDateRegex("yyyy年MM月dd日");
            string infoUrl = "http://www.zzjs.com.cn" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("colspan", "2")), true), new TagNameFilter("table")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }
            TableTag table = dtlNode[0] as TableTag;
            if (table == null)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();

            // Non-blank cells are joined as "label:value" lines: odd-positioned cells are followed
            // by ":", even-positioned ones by a line break.
            StringBuilder text = new StringBuilder();
            for (int r = 0; r < table.RowCount; r++)
            {
                for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                {
                    string temp = table.Rows[r].Columns[c].ToNodePlainString();
                    if (string.IsNullOrWhiteSpace(temp))
                    {
                        continue;
                    }
                    // NOTE(review): the original argument listed the same colon twice (":,:"),
                    // which is redundant; one of the two was presumably the full-width colon.
                    // It is written as a \u escape so both widths are removed. Confirm.
                    text.Append(temp.GetReplace("\uFF1A,:")).Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                }
            }
            string bidCtx = text.ToString();

            string buildUnit = ZzjsTrimUnitName(bidCtx.GetBuildRegex().GetReplace(" "));
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();
            string prjMgr = bidCtx.GetMgrRegex().GetReplace("/,EndFragment");
            string msgType = "郑州市城乡建设委员会";
            string bidType = "建设工程";
            string specType = "建设工程";

            BidInfo info = ToolDb.GenBidInfo("河南省", "河南省及地市", "郑州市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.zzjs.com.cn/" + a.Link.GetReplace("../,./");
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        // Links longer than 500 bytes are not registered.
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>
/// Cuts an organisation name after the first "公司" and before the first "联系" / "地址".
/// </summary>
private static string ZzjsTrimUnitName(string unit)
{
    // One ordinal IndexOf per marker: the original paired an ordinal Contains with a
    // culture-sensitive IndexOf, which can disagree and make Remove throw.
    int cut = unit.IndexOf("公司", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut) + "公司";
    }
    cut = unit.IndexOf("联系", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut);
    }
    cut = unit.IndexOf("地址", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut);
    }
    return unit;
}
/// <summary>
/// Crawls the ASP.NET paged list at <c>this.SiteUrl</c> (pages after the first are requested by
/// posting the view state back with the pager as event target), downloads each detail page from
/// qhzbtb.qhwszwdt.gov.cn and converts it into a <see cref="BidInfo"/>. Attachment links of the
/// detail content are registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>this.MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    string cookiestr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        // Best effort: nothing can be crawled without the first list page.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pageNode != null && pageNode.Count > 0)
    {
        // NOTE(review): full-width colons are folded to ASCII first so the label matches
        // whichever colon the page uses — confirm against the live page.
        string pagerText = (pageNode.AsString() ?? string.Empty).Replace("\uFF1A", ":");
        int parsed;
        if (int.TryParse(pagerText.GetRegexBegEnd("总页数:", "当"), out parsed))
        {
            pageInt = parsed;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // The postback is built from the hidden fields of the previously loaded page.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION" },
                new string[] { viewState, "MoreInfoList1$Pager", i.ToString(), eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Data rows carry the link in column 1 and the date in column 2.
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                // Rows without a link are skipped (the original relied on catching the failure).
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://qhzbtb.qhwszwdt.gov.cn" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bidCtx = htmlTxt.ToCtxString();
            string buildUnit, bidUnit, bidMoney, prjMgr, code;

            parser = new Parser(new Lexer(htmlTxt));
            NodeList dtlBidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            TableTag bidTable = (dtlBidNode != null && dtlBidNode.Count > 0) ? dtlBidNode[0] as TableTag : null;
            if (bidTable != null)
            {
                // First attempt: read the first table as alternating label / value cells.
                string ctx = QinghaiLabelValueText(bidTable);
                buildUnit = ctx.GetBuildRegex();
                bidUnit = QinghaiFirstRankedBidder(ctx);
                bidMoney = ctx.GetMoneyRegex();
                prjMgr = ctx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = ctx.GetMgrRegex(new string[] { "建造师姓名" });
                }
                code = ctx.GetCodeRegex();

                if (string.IsNullOrEmpty(bidUnit) || bidUnit.Contains("中标价"))
                {
                    // Second attempt: rows mentioning "中标人" / "中标价" are treated as caption
                    // rows whose values sit in the row below.
                    ctx = QinghaiAwardRowText(bidTable);
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }
                    bidUnit = QinghaiFirstRankedBidder(ctx);
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = ctx.GetMoneyRegex();
                    }
                    if (string.IsNullOrEmpty(prjMgr) || prjMgr.IsNumber())
                    {
                        prjMgr = ctx.GetMgrRegex();
                    }
                    if (string.IsNullOrEmpty(prjMgr) || prjMgr.IsNumber())
                    {
                        prjMgr = ctx.GetMgrRegex(new string[] { "建造师姓名" });
                    }
                    if (string.IsNullOrEmpty(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                }
            }
            else
            {
                // No table in the content: read everything from the plain text.
                buildUnit = bidCtx.GetBuildRegex();
                bidUnit = bidCtx.GetBidRegex().GetReplace("第一名,第一");
                if (string.IsNullOrEmpty(bidUnit))
                {
                    bidUnit = bidCtx.GetRegex("第一中标排序人");
                }
                bidMoney = bidCtx.GetMoneyRegex();
                prjMgr = bidCtx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = bidCtx.GetRegex("注册监理工程师");
                }
                code = bidCtx.GetCodeRegex();
            }

            buildUnit = QinghaiTrimUnitName(buildUnit).Replace(" ", "");
            bidUnit = QinghaiTrimUnitName(bidUnit).GetReplace("一标段");
            if (bidUnit.IsNumber() || bidUnit.Contains("中标") || bidUnit.Contains("投标") || bidUnit.Contains("合格"))
            {
                // The match is a number or a caption fragment, not a company name.
                bidUnit = string.Empty;
            }
            code = code.Replace(" ", "");

            // Amounts above 100000 are divided by 10000; unparsable amounts are left untouched.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            prjMgr = prjMgr.Replace(" ", "");
            if (prjMgr.IsNumber() || prjMgr.Contains("注册") || prjMgr.Contains("中标") || prjMgr.Contains("证书"))
            {
                prjMgr = string.Empty;
            }

            string bidType = "建设工程";
            string specType = "政府采购";
            string msgType = "青海省公共资源交易监督管理局";
            BidInfo info = ToolDb.GenBidInfo("青海省", "青海省及地市", string.Empty, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://qhzbtb.qhwszwdt.gov.cn/" + a.Link.GetReplace("../,./");
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        // Links longer than 500 bytes are not registered.
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>
/// Joins the non-empty cells of a table as "label:value" lines: odd-positioned cells are followed
/// by ":", even-positioned ones by a line break.
/// </summary>
private static string QinghaiLabelValueText(TableTag bidTable)
{
    StringBuilder text = new StringBuilder();
    for (int r = 0; r < bidTable.RowCount; r++)
    {
        TableRow row = bidTable.Rows[r];
        for (int c = 0; c < row.ColumnCount; c++)
        {
            string cell = row.Columns[c].ToNodePlainString();
            if (string.IsNullOrEmpty(cell))
            {
                continue;
            }
            text.Append(cell).Append((c + 1) % 2 == 0 ? "\r\n" : ":");
        }
    }
    return text.ToString();
}

/// <summary>
/// Like <see cref="QinghaiLabelValueText"/>, but keeps empty cells, and rows whose text contains
/// "中标人" or "中标价" are paired column-by-column with the row below ("caption:value" lines).
/// </summary>
private static string QinghaiAwardRowText(TableTag bidTable)
{
    StringBuilder text = new StringBuilder();
    for (int r = 0; r < bidTable.RowCount; r++)
    {
        TableRow row = bidTable.Rows[r];
        TableRow next = r + 1 < bidTable.RowCount ? bidTable.Rows[r + 1] : null;
        string rowName = row.ToNodePlainString();
        bool captionRow = rowName.Contains("中标人") || rowName.Contains("中标价");
        for (int c = 0; c < row.ColumnCount; c++)
        {
            string cell = row.Columns[c].ToNodePlainString();
            if (!captionRow)
            {
                text.Append(cell).Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                continue;
            }
            text.Append(cell).Append(":");
            // Explicit bounds check instead of swallowing the IndexOutOfRangeException.
            if (next != null && c < next.ColumnCount)
            {
                text.Append(next.Columns[c].ToNodePlainString()).Append("\r\n");
            }
        }
    }
    return text.ToString();
}

/// <summary>
/// Reads the bidder from table text, falling back to the values labelled "第一名", "第一" and "1".
/// </summary>
private static string QinghaiFirstRankedBidder(string ctx)
{
    string bidder = ctx.GetBidRegex().GetReplace("第一名,第一");
    if (string.IsNullOrEmpty(bidder))
    {
        bidder = ctx.GetRegex("第一名");
    }
    if (string.IsNullOrEmpty(bidder))
    {
        bidder = ctx.GetRegex("第一");
    }
    if (string.IsNullOrEmpty(bidder))
    {
        bidder = ctx.GetRegex("1");
    }
    return bidder;
}

/// <summary>
/// Cuts an organisation name after the first "公司" and before the first "联系" / "地址".
/// </summary>
private static string QinghaiTrimUnitName(string unit)
{
    // One ordinal IndexOf per marker: the original paired an ordinal Contains with a
    // culture-sensitive IndexOf, which can disagree and make Remove throw.
    int cut = unit.IndexOf("公司", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut) + "公司";
    }
    cut = unit.IndexOf("联系", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut);
    }
    cut = unit.IndexOf("地址", System.StringComparison.Ordinal);
    if (cut >= 0)
    {
        unit = unit.Remove(cut);
    }
    return unit;
}
/// <summary>
/// Crawls the tender list of bidding.szu.edu.cn, downloads each detail page and converts it into
/// an <see cref="InviteInfo"/>. Attachment links of the detail page are registered in
/// <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as <c>this.MaxCount</c> records were collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first list page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl;
    string cookiestr = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        // Best effort: nothing can be crawled without the first list page.
        return list;
    }

    // The page count is the number in "共N页" inside the first right-aligned cell.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "right")));
    if (nodeList != null && nodeList.Count > 0)
    {
        Match pageMatch = new Regex(@"共(\d+)页").Match(nodeList[0].ToPlainTextString());
        int parsed;
        if (pageMatch.Success && int.TryParse(pageMatch.Groups[1].Value, out parsed))
        {
            page = parsed;
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl("http://bidding.szu.edu.cn/list.asp?page=" + i.ToString(), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-collapse: collapse")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }
        TableTag table = tableNodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped, as in the original loop.
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 1)
            {
                continue;
            }

            // The detail link is the second anchor of the first cell; rows with fewer anchors are
            // skipped instead of aborting the crawl with an index error.
            NodeList anchors = tr.Columns[0].SearchFor(typeof(ATag), true);
            if (anchors == null || anchors.Count < 2)
            {
                continue;
            }
            ATag aTag = anchors[1] as ATag;
            if (aTag == null)
            {
                continue;
            }

            string code = string.Empty;
            string prjName = tr.Columns[0].ToPlainTextString().Trim().Replace("·", "");
            string beginDate = tr.Columns[0].ToPlainTextString().GetDateRegex();

            // NOTE(review): the original "else if" repeated the ASCII-parenthesis condition
            // verbatim and was unreachable. It presumably targeted full-width parentheses, so that
            // reading is implemented here; the branch differed only in not cutting the title at
            // the opening parenthesis. Confirm.
            bool asciiParens = prjName.Contains(")") && prjName.Contains("(");
            bool wideParens = prjName.Contains("\uFF09") && prjName.Contains("\uFF08");
            if (asciiParens || wideParens)
            {
                if (asciiParens)
                {
                    // Drop "(" and everything after it.
                    prjName = prjName.Remove(prjName.IndexOf("(", StringComparison.Ordinal));
                }
                string l = prjName.GetRegexBegEnd(" ", " ");
                code = prjName.GetRegexBegEnd("招标公告", " ");
                // string.Replace throws when the search string is empty, so only strip a real match.
                if (!string.IsNullOrEmpty(l))
                {
                    prjName = prjName.Replace(l, "");
                }
                prjName = prjName.Replace(" ", "");
            }

            string infoUrl = "http://bidding.szu.edu.cn/" + aTag.Link;
            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "0")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtnode.AsHtml();
            string inviteCtx = htmlTxt.Replace("<li>", "\r\n").Replace("</li>", "\r\n").ToCtxString();
            // Collapse blank lines until stable (the original applied five fixed passes).
            while (inviteCtx.Contains("\r\n\r\n"))
            {
                inviteCtx = inviteCtx.Replace("\r\n\r\n", "\r\n");
            }

            if (string.IsNullOrEmpty(code))
            {
                code = inviteCtx.GetCodeRegex();
            }
            string buildUnit = inviteCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("招标机构名称");
            }
            string prjAddress = inviteCtx.GetAddressRegex();
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = inviteCtx.GetRegexBegEnd("开标室", "。");
            }
            if (prjAddress == "")
            {
                prjAddress = "见招标信息";
            }

            string msgType = "深圳大学";
            // The type must be resolved BEFORE it is classified: the original compared the still
            // empty inviteType, so specType was always "其他".
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string specType;
            if (inviteType == "设备材料" || inviteType == "小型施工" || inviteType == "专业分包" || inviteType == "劳务分包" || inviteType == "服务" || inviteType == "勘察" || inviteType == "设计" || inviteType == "监理" || inviteType == "施工")
            {
                specType = "建设工程";
            }
            else
            {
                specType = "其他";
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, infoUrl, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag aTag1 = aNode[a] as ATag;
                    if (aTag1 == null || !aTag1.IsAtagAttach())
                    {
                        continue;
                    }
                    string fileUrl = aTag1.Link.Contains("http") ? aTag1.Link : ToolWeb.UrlEncode("http://bidding.szu.edu.cn/" + aTag1.Link);
                    // The original computed fileUrl and then discarded it; the attachment is now
                    // registered the same way the sibling crawlers do it.
                    BaseAttach attach = ToolDb.GenBaseAttach(aTag1.LinkText, info.Id, fileUrl);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender list at <c>SiteUrl</c> (detail pages on www.gsggzyjy.cn) page by page and
/// builds one <c>InviteInfo</c> for every announcement whose detail page contains the
/// "ContentPlaceHolder1_AnnoGoodsHtml" div. Iframe sources and attachment links found on the
/// detail page are appended to <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records, or null when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    } catch {
        // Existing contract: an unreachable entry page yields null, not an empty list.
        return null;
    }

    // The pager text between "总共" and "页" holds the total page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "navigation")));
    if (pageNode != null && pageNode.Count > 0) {
        string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("总共", "页").GetReplace("【,】,[,]");
        int parsedPages;
        if (int.TryParse(temp, out parsedPages)) {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            try {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.UTF8);
            } catch {
                continue; // skip a list page that fails to download
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "slidingList")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1) {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++) {
            // Fields this crawler never extracts; they are passed through empty.
            string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                   endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null) {
                continue; // list item without a link: nothing to follow
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = node.GetSpan().StringText;
            // The span text is sliced as yyyyMMdd. Only reformat when at least 8 characters are
            // present; a shorter value used to throw from Substring and abort the whole crawl.
            if (!string.IsNullOrEmpty(beginDate) && beginDate.Length >= 8) {
                beginDate = beginDate.Substring(0, 4) + "-" + beginDate.Substring(4, 2) + "-" + beginDate.Substring(6, 2);
            }

            string infoUrl = "http://www.gsggzyjy.cn" + aTag.Link;
            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
            } catch {
                continue; // detail page unavailable: skip this announcement
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ContentPlaceHolder1_AnnoGoodsHtml")));
            if (dtlNode == null || dtlNode.Count < 1) {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string inviteCtx = htmlTxt.ToCtxString();
            string msgType = "甘肃省公共资源交易中心";
            string specType = "政府采购";
            string inviteType = "交通运输工程";
            InviteInfo info = ToolDb.GenInviteInfo("甘肃省", "甘肃省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);

            // Iframes with id "Iframe" anywhere on the detail page are recorded as a PDF attachment.
            parser = new Parser(new Lexer(htmldtl));
            NodeList frameNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("Iframe"), new HasAttributeFilter("id", "Iframe")));
            if (frameNodes != null && frameNodes.Count > 0) {
                for (int k = 0; k < frameNodes.Count; k++) {
                    IFrameTag itag = frameNodes[k] as IFrameTag;
                    if (itag == null) {
                        continue; // node was not parsed as an iframe tag
                    }
                    string frameSrc = itag.GetAttribute("src");
                    if (!string.IsNullOrEmpty(frameSrc)) {
                        BaseAttach attach = ToolDb.GenBaseAttach(prjName + ".pdf", info.Id, frameSrc);
                        base.AttachList.Add(attach);
                    }
                }
            }

            // Attachment links inside the announcement body; skipped when the URL is already queued.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList atagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (atagNode != null && atagNode.Count > 0) {
                for (int a = 0; a < atagNode.Count; a++) {
                    ATag fileTag = atagNode[a] as ATag;
                    if (fileTag == null || !fileTag.IsAtagAttach()) {
                        continue;
                    }
                    string link = string.Empty;
                    if (fileTag.Link.Contains("http")) {
                        link = fileTag.Link;
                    } else {
                        link = "http://www.gsggzyjy.cn/" + fileTag.Link;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link);
                    if (!base.AttachList.Exists(x => x.AttachServerPath == link)) {
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Walks the tender list on www.huajiezn.cn (first page from <c>SiteUrl</c>, later pages from
/// index_N.html), opens every announcement and turns the content of its "Zoom" div into an
/// <c>InviteInfo</c>. Attachment links inside that div are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends once <c>MaxCount</c> records are collected.</param>
/// <returns>The records collected so far (empty when the first page cannot be loaded).</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList results = new List<InviteInfo>();
    string listHtml = string.Empty;
    try {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    } catch {
        return results;
    }

    // Total page count: the text between "共" and "页" in the pager div.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    AndFilter pagerFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "padding-top:15px;"));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(pagerFilter);
    if (pagerNodes != null && pagerNodes.Count > 0) {
        try {
            string pageText = pagerNodes.AsString().GetRegexBegEnd("共", "页").Replace(":", "");
            totalPages = int.Parse(pageText);
        } catch {
            // keep the single-page default
        }
    }

    for (int page = 1; page <= totalPages; page++) {
        if (page > 1) {
            try {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://www.huajiezn.cn/html/zbgg/index_" + page + ".html");
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        AndFilter itemFilter = new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mainnavb")), true),
            new TagNameFilter("li"));
        NodeList items = parser.ExtractAllNodesThatMatch(itemFilter);
        if (items == null || items.Count < 1) {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++) {
            // Never populated by this crawler; forwarded as empty strings.
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            ATag anchor = items[idx].GetATag();
            string prjName = anchor.LinkText.ToNodeString();
            string beginDate = items[idx].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.huajiezn.cn" + anchor.Link;

            string detailHtml = string.Empty;
            try {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList zoomNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Zoom")));
            if (zoomNodes == null || zoomNodes.Count < 1) {
                continue;
            }

            string htmlTxt = zoomNodes.AsHtml();

            // The first <h3> of the detail page, when present, replaces the list title.
            parser.Reset();
            NodeList headings = parser.ExtractAllNodesThatMatch(new TagNameFilter("h3"));
            if (headings != null && headings.Count > 0) {
                prjName = headings[0].ToNodePlainString();
            }

            string inviteCtx = htmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();

            // Trim the extracted owner name: keep text up to and including "公司",
            // then drop anything from "地址" or "招标代理" onwards.
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("公司")) {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("地址")) {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }
            if (buildUnit.Contains("招标代理")) {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
            }

            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "华杰工程咨询有限公司中南分公司";
            string specType = "建设工程";
            string inviteType = prjName.GetInviteBidType();
            buildUnit = buildUnit.Replace(" ", "");

            InviteInfo info = ToolDb.GenInviteInfo("湖北省", "湖北省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            results.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0) {
                for (int n = 0; n < links.Count; n++) {
                    ATag fileTag = links[n] as ATag;
                    if (!fileTag.IsAtagAttach()) {
                        continue;
                    }
                    // Relative hrefs are resolved against the site root.
                    string fileUrl = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://www.huajiezn.cn/" + fileTag.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, fileUrl);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount) {
                return results;
            }
        }
    }
    return results;
}
/// <summary>
/// Reads the tender list at <c>SiteUrl</c> (zyjy.dayawan.gov.cn, paged with "&amp;pageNo=N"),
/// opens each entry and converts the "div_view" cell of the detail page into an
/// <c>InviteInfo</c>. Attachment links found in that cell are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl ends once <c>MaxCount</c> records are collected.</param>
/// <returns>The records collected so far (empty when the first page cannot be loaded).</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList results = new List<InviteInfo>();
    string listHtml = string.Empty;
    try {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    } catch (Exception) {
        return results;
    }

    // Page count: the text between "条" and "页" in the pager span, with "/" stripped.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    AndFilter pagerFilter = new AndFilter(
        new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "leftnav")), true),
        new TagNameFilter("span"));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(pagerFilter);
    if (pagerNodes != null && pagerNodes.Count > 0) {
        string pageText = pagerNodes.AsString().GetRegexBegEnd("条", "页");
        try {
            totalPages = int.Parse(pageText.Replace("/", ""));
        } catch {
            // keep the single-page default
        }
    }

    for (int page = 1; page <= totalPages; page++) {
        if (page > 1) {
            try {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageNo=" + page, Encoding.UTF8);
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        AndFilter rowFilter = new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list")), true),
            new TagNameFilter("ul"));
        NodeList rows = parser.ExtractAllNodesThatMatch(rowFilter);
        if (rows == null || rows.Count < 1) {
            continue;
        }

        for (int idx = 0; idx < rows.Count; idx++) {
            // Never populated by this crawler; forwarded as empty strings.
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            ATag anchor = rows[idx].GetATag();
            string prjName = anchor.GetAttribute("title");
            string beginDate = rows[idx].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://zyjy.dayawan.gov.cn" + anchor.Link;

            string detailHtml = string.Empty;
            try {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList viewNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_view")));
            if (viewNodes == null || viewNodes.Count < 1) {
                continue;
            }

            string htmlTxt = viewNodes.AsHtml();
            string inviteCtx = htmlTxt.Replace("</p>", "\r\n").ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();

            // A non-blank extracted code is only kept when it ends with '号'.
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            if (!string.IsNullOrWhiteSpace(code) && code[code.Length - 1] != '号') {
                code = "";
            }

            string msgType = "惠州大亚湾经济技术开发区公共资源交易中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            buildUnit = buildUnit.Replace(" ", "");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "大亚湾区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            results.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0) {
                for (int n = 0; n < links.Count; n++) {
                    ATag fileTag = links[n] as ATag;
                    if (!fileTag.IsAtagAttach()) {
                        continue;
                    }
                    // Relative hrefs are resolved against the site root.
                    string fileUrl = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://zyjy.dayawan.gov.cn/" + fileTag.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, fileUrl);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount) {
                return results;
            }
        }
    }
    return results;
}
/// <summary>
/// Crawls bid-result announcements from www.jszwfw.gov.cn. The list is obtained by posting a
/// fixed form to <c>SiteUrl</c> (first page) and to the jpage "dataproxy.jsp" endpoint (later
/// pages); each entry's detail page ("zoom" div) is converted into a <c>BidInfo</c>, and
/// attachment links inside it are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records, or null when the first list request fails.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;

    // Form fields posted with every list request.
    string[] formKeys = new string[] { "col", "appid", "webid", "path", "columnid", "sourceContentType", "unitid", "webname", "permissiontype" };
    string[] formValues = new string[] { "1", "1", "1", "/", "148", "1", "363", "江苏政务服务网", "0" };

    try {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, formValues);
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
    } catch {
        // Existing contract: a failed first request yields null, not an empty list.
        return null;
    }

    try {
        string temp = html.GetRegexBegEnd("<totalpage>", "</totalpage>");
        pageInt = int.Parse(temp);
    } catch {
        // No usable <totalpage> element: crawl a single page.
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, formValues);
            try {
                // Each request covers a window of 45 records.
                // NOTE(review): the URL still sends perpage=15 — confirm the 45/15 mismatch is intended.
                int endrecord = i * 45;
                int startrecord = 45 * i - 44;
                html = this.ToolWebSite.GetHtmlByUrl("http://www.jszwfw.gov.cn/module/jslib/jquery/jpage/dataproxy.jsp?perpage=15&endrecord=" + endrecord + "&startrecord=" + startrecord, nvc);
            } catch {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")));
        if (listNode == null || listNode.Count < 1) {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++) {
            // Fields this crawler never extracts; they are passed through empty.
            string endDate = string.Empty, otherType = string.Empty, area = string.Empty;
            string code = string.Empty;

            // Every list entry is its own one-row table. Skip anything that does not have the
            // expected shape instead of letting an index/null error abort the whole crawl.
            TableTag itemTable = listNode[j] as TableTag;
            if (itemTable == null || itemTable.RowCount < 1) {
                continue;
            }
            TableRow tr = itemTable.Rows[0];
            if (tr.ColumnCount < 3) {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null) {
                continue;
            }

            // The title may carry the project code in front of the name: either separated by
            // a space, or as a prefix detected through the GetNotChina / IsNumber helpers.
            string prjName = aTag.GetAttribute("title");
            if (prjName.Contains(" ")) {
                string[] parts = prjName.Split(' ');
                code = parts[0];
                prjName = parts[1];
            } else {
                string nonChinese = prjName.GetNotChina();
                if (nonChinese.Length > 2 && prjName.IsNumber()) {
                    try {
                        int index = prjName.IndexOf(nonChinese.Substring(0, 2));
                        code = prjName.Substring(0, index);
                        prjName = prjName.Substring(index, prjName.Length - index);
                    } catch {
                        // Prefix not found: leave title and code untouched.
                    }
                }
            }

            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.jszwfw.gov.cn" + aTag.Link;
            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
            if (dtlNode == null || dtlNode.Count < 1) {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bidCtx = htmlTxt.ToLower().GetReplace("</p>,</br>", "\r\n").ToCtxString();
            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(code)) {
                code = bidCtx.GetCodeRegex().GetCodeDel();
            }

            string bidUnit = bidCtx.GetBidRegex().GetReplace("名称");
            if (string.IsNullOrEmpty(bidUnit)) {
                bidUnit = bidCtx.GetRegex("第一中标候选单位为,第一名,中标(成交)候选人名称").GetReplace("名称");
            }

            string bidMoney = bidCtx.GetMoneyRegex();
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney)) {
                bidMoney = bidCtx.GetMoneyRegex(null, true);
            }
            string prjMgr = bidCtx.GetMgrRegex();

            // Amounts above 100000 are divided by 10000 (presumably yuan -> 万元; confirm).
            decimal money;
            if (decimal.TryParse(bidMoney, out money) && money > 100000) {
                bidMoney = (money / 10000).ToString();
            }

            string msgType = "江苏省政务服务管理办公室";
            string specType = "政府采购";
            string bidType = "建设工程";
            BidInfo info = ToolDb.GenBidInfo("江苏省", "江苏省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(htmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0) {
                for (int k = 0; k < aNode.Count; k++) {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach()) {
                        continue;
                    }
                    string link = string.Empty;
                    if (a.Link.ToLower().Contains("http")) {
                        link = a.Link;
                    } else {
                        link = "http://www.jszwfw.gov.cn/" + a.Link;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the first two "font9grey1" tables of the page at <c>SiteUrl</c> (www.gzzb.gd.cn),
/// opens every row's detail page and saves its "contentDiv" as a <c>NotifyInfo</c> of type
/// "办事指南". For records that <c>ToolDb.SaveEntity</c> accepts, images and attachment links
/// in the content are saved as <c>BaseAttach</c> entities.
/// </summary>
/// <param name="crawlAll">When false, the crawl returns once the processed-record counter reaches <c>MaxCount</c>.</param>
/// <returns>Always null; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    int processed = 0;
    string pageHtml = string.Empty;
    try {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    } catch (Exception) {
        return null;
    }

    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "font9grey1")));
    if (tables == null || tables.Count <= 1) {
        return null;
    }

    for (int t = 0; t < 2; t++) {
        TableTag table = tables[t] as TableTag;
        for (int rowIdx = 0; rowIdx < table.RowCount; rowIdx++) {
            string infoSource = string.Empty; // never populated; forwarded empty
            string infoType = "办事指南";

            TableRow row = table.Rows[rowIdx];
            string headName = row.Columns[1].ToNodePlainString();
            string releaseTime = row.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.gzzb.gd.cn" + row.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentDiv")));
            if (contentNodes == null || contentNodes.Count < 1) {
                continue;
            }

            string ctxHtml = contentNodes.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.GuangZhouMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "广州市区", string.Empty, infoCtx, infoType);

            // The counter is bumped before the limit check, so the record that reaches
            // MaxCount is itself not saved.
            processed++;
            if (!crawlAll && processed >= this.MaxCount) {
                return null;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields)) {
                continue;
            }

            // Images embedded in the content; failures are ignored per image.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList images = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (images != null && images.Count > 0) {
                for (int n = 0; n < images.Count; n++) {
                    ImageTag imgTag = images[n] as ImageTag;
                    try {
                        string src = imgTag.GetAttribute("src");
                        BaseAttach imgAttach = src.Contains("http")
                            ? ToolHtml.GetBaseAttach(src, headName, info.Id)
                            : ToolHtml.GetBaseAttach("http://www.gzzb.gd.cn" + src, headName, info.Id);
                        if (imgAttach != null) {
                            ToolDb.SaveEntity(imgAttach, string.Empty);
                        }
                    } catch {
                    }
                }
            }

            // Attachment links in the content; failures are ignored per link.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0) {
                for (int n = 0; n < links.Count; n++) {
                    ATag fileTag = links[n] as ATag;
                    if (!fileTag.IsAtagAttach()) {
                        continue;
                    }
                    try {
                        BaseAttach fileAttach = fileTag.Link.Contains("http")
                            ? ToolHtml.GetBaseAttach(fileTag.Link, fileTag.LinkText, info.Id)
                            : ToolHtml.GetBaseAttach("http://www.gzzb.gd.cn" + fileTag.Link, fileTag.LinkText, info.Id);
                        if (fileAttach != null) {
                            ToolDb.SaveEntity(fileAttach, string.Empty);
                        }
                    } catch {
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the "lefttable" list on the page at <c>SiteUrl</c>, opens every row's detail page and
/// saves its "context_div" as a <c>NotifyInfo</c> of type "办事指南" (message type
/// <c>MsgTypeCosnt.ZhongShanMsgType</c>). For records that <c>ToolDb.SaveEntity</c> accepts,
/// images and attachment links in the content are saved as <c>BaseAttach</c> entities.
/// </summary>
/// <param name="crawlAll">When false, the crawl returns once <c>MaxCount</c> records have been processed.</param>
/// <returns>Always null; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    // Get the page count
    int totalPages = 1;
    int processed = 0;
    string listHtml = string.Empty;
    try {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    } catch (Exception) {
        return null;
    }

    Parser parser = new Parser(new Lexer(listHtml));
    AndFilter pagerFilter = new AndFilter(
        new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true),
        new TagNameFilter("a"));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(pagerFilter);
    if (pagerLinks != null && pagerLinks.Count > 0) {
        try {
            // The last pager link carries the page count in parentheses.
            string lastLink = pagerLinks[pagerLinks.Count - 1].GetATagValue();
            string pageText = lastLink.Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
            totalPages = Convert.ToInt32(pageText);
        } catch {
            totalPages = 1;
        }
    }

    // NOTE(review): no request for page > 1 is issued inside this loop, so every iteration
    // re-parses the HTML of the first page. Confirm whether a per-page download is missing.
    for (int page = 1; page <= totalPages; page++) {
        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (tables == null || tables.Count < 1) {
            continue;
        }

        TableTag table = tables[0] as TableTag;
        // First and last rows are skipped.
        for (int rowIdx = 1; rowIdx < table.RowCount - 1; rowIdx++) {
            string infoSource = string.Empty; // never populated; forwarded empty
            string infoType = "办事指南";

            TableRow row = table.Rows[rowIdx];
            string headName = row.Columns[1].ToNodePlainString();
            string releaseTime = row.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = row.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString().Replace("<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />", "");
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (contentNodes == null || contentNodes.Count < 1) {
                continue;
            }

            string ctxHtml = contentNodes.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.ZhongShanMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "中山市区", string.Empty, infoCtx, infoType);

            // Limit is checked before counting, so up to MaxCount records are processed.
            if (!crawlAll && processed >= this.MaxCount) {
                return null;
            }
            processed++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields)) {
                continue;
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList images = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (images != null && images.Count > 0) {
                for (int n = 0; n < images.Count; n++) {
                    ImageTag imgTag = images[n] as ImageTag;
                    BaseAttach imgAttach = ToolHtml.GetBaseAttachByUrl(imgTag.GetAttribute("src"), headName, info.Id);
                    if (imgAttach != null) {
                        ToolDb.SaveEntity(imgAttach, string.Empty);
                    }
                }
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links != null && links.Count > 0) {
                for (int n = 0; n < links.Count; n++) {
                    ATag fileTag = links[n] as ATag;
                    if (!fileTag.IsAtagAttach()) {
                        continue;
                    }
                    BaseAttach fileAttach = ToolHtml.GetBaseAttachByUrl(fileTag.Link, fileTag.LinkText, info.Id);
                    if (fileAttach != null) {
                        ToolDb.SaveEntity(fileAttach, string.Empty);
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the notice list on www.hebggzy.cn (first page from <c>SiteUrl</c>, later pages from
/// /024/024002/N.html), opens each notice and saves its "article-main" div as a
/// <c>NotifyInfo</c> of type "通知公告". For records that <c>ToolDb.SaveEntity</c> accepts,
/// download/attachment links in the content are saved as <c>BaseAttach</c> entities.
/// </summary>
/// <param name="crawlAll">When false, the crawl returns once the processed-record counter reaches <c>MaxCount</c>.</param>
/// <returns>Always null; records are persisted directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    int totalPages = 1;
    int processed = 0;
    string listHtml = string.Empty;
    try {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    } catch {
        return null;
    }

    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "huifont")));
    if (pagerNodes != null && pagerNodes.Count > 0) {
        try {
            // Everything after the first "/" in the pager text is taken as the page count.
            string pagerText = pagerNodes.AsString();
            int slash = pagerText.IndexOf("/");
            totalPages = int.Parse(pagerText.Substring(slash + 1));
        } catch {
            // keep the single-page default
        }
    }

    for (int page = 1; page <= totalPages; page++) {
        if (page > 1) {
            try {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://www.hebggzy.cn/024/024002/" + page + ".html", Encoding.UTF8);
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList items = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "right-text-li")));
        if (items == null || items.Count < 1) {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++) {
            string infoSource = string.Empty; // never populated; forwarded empty
            string infoType = "通知公告";

            INode item = items[idx];
            ATag anchor = item.GetATag();
            string headName = anchor.GetAttribute("title");
            string releaseTime = item.ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.hebggzy.cn" + anchor.Link;

            string detailHtml = string.Empty;
            try {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            } catch {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList articleNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article-main")));
            if (articleNodes == null || articleNodes.Count < 1) {
                continue;
            }

            string ctxHtml = articleNodes.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = "河北省公共资源交易中心";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "河北省", "河北省及地市", "", infoCtx, infoType);

            // The counter is bumped before the limit check, so the record that reaches
            // MaxCount is itself not saved.
            processed++;
            if (!crawlAll && processed >= this.MaxCount) {
                return null;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate)) {
                continue;
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links == null || links.Count < 1) {
                continue;
            }

            for (int n = 0; n < links.Count; n++) {
                ATag fileTag = links[n] as ATag;
                if (!(fileTag.Link.ToLower().Contains("download") || fileTag.IsAtagAttach())) {
                    continue;
                }

                // Relative hrefs lose their "../" and "./" segments and are resolved against the site root.
                string fileUrl = fileTag.Link.ToLower().Contains("http")
                    ? fileTag.Link
                    : "http://www.hebggzy.cn/" + fileTag.Link.GetReplace("../,./");

                // URLs longer than 500 bytes (default encoding) are skipped.
                if (Encoding.Default.GetByteCount(fileUrl) > 500) {
                    continue;
                }

                try {
                    BaseAttach attach = ToolHtml.GetBaseAttachByUrl(fileUrl, fileTag.LinkText, info.Id);
                    if (attach != null) {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                } catch {
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls bid-result announcements from the ASP.NET list at <c>SiteUrl</c> (zmctc.com).
/// Later pages are requested by posting the previous page's view state with the pager as
/// event target. Each entry's detail page is converted into a <c>BidInfo</c>; winner, amount
/// and project manager are read from the first two rows of the company table when one is found.
/// Attachment links in the content are queued in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records, or null when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll) {
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    string viewState = string.Empty;
    string cookiestr = string.Empty;
    try {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
    } catch {
        // Existing contract: an unreachable entry page yields null, not an empty list.
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "TblOSInfoList1_Pager")));
    if (pageNode != null && pageNode.Count > 0) {
        try {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("总页数:", "当前");
            pageInt = int.Parse(temp);
        } catch {
            // Pager text not numeric: crawl a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++) {
        if (i > 1) {
            // Post back with the view state of the most recently loaded page.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__VIEWSTATE", "TblOSInfoList1:KeyWord", "__EVENTTARGET", "__EVENTARGUMENT" }, new string[] { viewState, "", "TblOSInfoList1:Pager", i.ToString() });
            try {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
            } catch {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "TblOSInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count < 1) {
            continue;
        }
        TableTag table = listNode[0] as TableTag;
        if (table == null) {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++) {
            // Fields this crawler never extracts; they are passed through empty.
            string buildUnit = string.Empty, endDate = string.Empty, otherType = string.Empty, area = string.Empty;
            string bidUnit = string.Empty, bidMoney = string.Empty, prjMgr = string.Empty;
            string htmlTxt = string.Empty, bidCtx = string.Empty;

            // Skip rows that lack the title/date columns or the link, instead of letting an
            // index/null error abort the whole crawl.
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3) {
                continue;
            }
            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null) {
                continue;
            }

            string prjName = aTag.GetAttribute("title").GetReplace(";");
            // The code is the bracketed part of the title cell; ASCII brackets are normalised first.
            string code = tr.Columns[1].ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();

            string infoUrl;
            if (!aTag.Link.Contains("http")) {
                infoUrl = "http://www.zmctc.com/zjgcjy/Notice/" + aTag.Link;
            } else {
                infoUrl = aTag.Link;
            }

            string htmldtl = string.Empty;
            try {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                // A replacement character in the UTF-8 result triggers a retry with the default encoding.
                if (htmldtl.Contains("�")) {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
                }
            } catch {
                continue;
            }

            // The content container differs between page layouts; try each known id in turn.
            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "news")));
            if (dtlNode == null || dtlNode.Count < 1) {
                parser.Reset();
                dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "olds")));
            }
            if (dtlNode == null || dtlNode.Count < 1) {
                parser.Reset();
                dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            }

            if (dtlNode != null && dtlNode.Count > 0) {
                htmlTxt = dtlNode.AsHtml();
                bidCtx = htmlTxt.ToCtxString();

                // Locate the company table, again trying the known variants in turn.
                parser = new Parser(new Lexer(htmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Pbgginfodetailnew1_company")));
                if (tableNode == null || tableNode.Count < 1) {
                    parser.Reset();
                    tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "PbggInfoDetail1_company")));
                }
                if (tableNode == null || tableNode.Count < 1) {
                    parser.Reset();
                    tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("rules", "all")));
                }

                if (tableNode != null && tableNode.Count > 0) {
                    TableTag tag = tableNode[0] as TableTag;
                    // Rows[0] is read in the loop condition, so an empty table must be excluded up front.
                    if (tag != null && tag.RowCount > 0) {
                        // Pair each header cell (row 0) with the cell below it (row 1) as "header:value" lines.
                        string ctx = string.Empty;
                        for (int r = 0; r < tag.Rows[0].ColumnCount; r++) {
                            try {
                                ctx += tag.Rows[0].Columns[r].ToNodePlainString() + ":";
                                ctx += tag.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                            } catch {
                                // Row 1 missing or shorter than row 0: keep what was gathered.
                            }
                        }

                        // Tables mentioning "否决投标" are not used for winner extraction.
                        if (!ctx.Contains("否决投标")) {
                            bidUnit = ctx.GetBidRegex();
                            if (string.IsNullOrEmpty(bidUnit)) {
                                bidUnit = ctx.GetRegex("中标候选人");
                            }
                            bidMoney = ctx.GetMoneyRegex();
                            prjMgr = ctx.GetMgrRegex();
                            if (string.IsNullOrEmpty(prjMgr)) {
                                prjMgr = ctx.GetRegex("项目经理/总监");
                            }
                        }
                    }
                }

                string msgType = "浙江省公共资源交易中心";
                string specType = "政府采购";
                string bidType = "建设工程";
                BidInfo info = ToolDb.GenBidInfo("浙江省", "浙江省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(htmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0) {
                    for (int k = 0; k < aNode.Count; k++) {
                        ATag a = aNode[k] as ATag;
                        if (a == null || !a.IsAtagAttach()) {
                            continue;
                        }
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http")) {
                            link = a.Link;
                        } else {
                            link = "http://downc.zmctc.com/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount) {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the listing pages of www.cqzb.gov.cn and builds one <see cref="BidInfo"/> for every
/// announcement whose detail page contains the <c>ztb_zbxx1</c> div. Attachment links found in
/// that div are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // The pager table carries the total page count between "/" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "p_bar")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the pager cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.cqzb.gov.cn/class-5-45(" + i + ").aspx");
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ztb_list_right")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                   area = string.Empty;

            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A list item without a link cannot be followed. Skip it instead of letting a
                // NullReferenceException abort the whole crawl.
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.cqzb.gov.cn/" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ztb_zbxx1")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (table != null)
            {
                // Flatten the first table into "label:value" lines: even columns are treated
                // as labels, odd columns as the value that closes the line.
                string ctx = string.Empty;
                for (int r = 0; r < table.RowCount; r++)
                {
                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                    {
                        string temp = table.Rows[r].Columns[c].ToNodePlainString();
                        if ((c + 1) % 2 == 0)
                        {
                            ctx += temp + "\r\n";
                        }
                        else
                        {
                            ctx += temp + ":";
                        }
                    }
                }

                // A value containing "/" is rejected and the next extraction strategy is tried.
                bidUnit = ctx.GetRegex("拟中标人");
                if (bidUnit.Contains("/"))
                {
                    bidUnit = ctx.GetBidRegex();
                }
                if (bidUnit.Contains("/"))
                {
                    bidUnit = ctx.GetRegex("中标人");
                }
                bidMoney = ctx.GetMoneyRegex();
                buildUnit = ctx.GetBuildRegex();
                prjMgr = ctx.GetMgrRegex();
            }

            try
            {
                bidType = prjName.GetInviteBidType();
            }
            catch
            {
                // Best effort: bidType stays empty when the type cannot be derived from the title.
            }
            string specType = "建设工程";
            string msgType = "重庆市招标投标综合网";

            BidInfo info = ToolDb.GenBidInfo("重庆市", "重庆市及区县", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Collect attachment links from the detail block; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.cqzb.gov.cn/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the single listing page at <c>SiteUrl</c> (ggzy.jinan.gov.cn). Every
/// <c>table width="513"</c> without an <c>align</c> or <c>style</c> attribute is treated as one
/// announcement; its detail page (<c>div#zoom</c>) is turned into an <see cref="InviteInfo"/>.
/// Attachment links are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "513")));
    if (listNode == null || listNode.Count < 1)
    {
        return list;
    }

    for (int j = 0; j < listNode.Count; j++)
    {
        TableTag tag = listNode[j] as TableTag;
        if (tag == null)
        {
            continue;
        }

        // Tables carrying an align or style attribute are not announcement rows.
        string align = tag.GetAttribute("align");
        string style = tag.GetAttribute("style");
        if (!string.IsNullOrWhiteSpace(align) || !string.IsNullOrWhiteSpace(style))
        {
            continue;
        }

        // The row layout read below needs a link in column 1 and a date in column 2.
        // Tables that do not have that shape are skipped instead of throwing and
        // aborting the whole crawl.
        if (tag.RowCount < 1)
        {
            continue;
        }
        TableRow tr = tag.Rows[0];
        if (tr.ColumnCount < 3)
        {
            continue;
        }
        ATag aTag = tr.Columns[1].GetATag();
        if (aTag == null)
        {
            continue;
        }

        string inviteCtx = string.Empty, endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
        string prjName = aTag.GetAttribute("title");
        string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
        string InfoUrl = "http://ggzy.jinan.gov.cn" + aTag.Link;

        string htmldtl;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoom")));
        if (dtlNode == null || dtlNode.Count < 1)
        {
            continue;
        }

        string HtmlTxt = dtlNode.AsHtml().GetReplace("</p>,<br/>", "\r\n");

        // NOTE(review): inviteCtx is built only from the first table of the detail block;
        // a page without a table yields an empty context. Confirm this is intended.
        parser = new Parser(new Lexer(HtmlTxt));
        NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
        if (table != null)
        {
            // Flatten the table into "label:value" lines; blank cells are skipped.
            for (int r = 0; r < table.RowCount; r++)
            {
                for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                {
                    string temp = table.Rows[r].Columns[c].ToNodePlainString();
                    if (string.IsNullOrWhiteSpace(temp))
                    {
                        continue;
                    }
                    if ((c + 1) % 2 == 0)
                    {
                        inviteCtx += temp.GetReplace(":,:") + "\r\n";
                    }
                    else
                    {
                        inviteCtx += temp.GetReplace(":,:") + ":";
                    }
                }
            }
        }

        string prjAddress = inviteCtx.GetAddressRegex().GetCodeDel().GetReplace(" ,&mdash");
        string buildUnit = inviteCtx.GetBuildRegex().GetReplace(" ");
        // Cut trailing text that the extraction picked up after the organisation name.
        if (buildUnit.Contains("公司"))
        {
            buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
        }
        if (buildUnit.Contains("联系"))
        {
            buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
        }
        if (buildUnit.Contains("地址"))
        {
            buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
        }
        string code = inviteCtx.GetCodeRegex().GetCodeDel();
        string msgType = "济南市公共资源交易中心";
        string specType = "政府采购";
        string inviteType = "建设工程";

        InviteInfo info = ToolDb.GenInviteInfo("山东省", "山东省及地市", "济南市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);

        // Collect attachment links; relative links are made absolute and over-long links are dropped.
        parser = new Parser(new Lexer(HtmlTxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null && aNode.Count > 0)
        {
            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a.IsAtagAttach())
                {
                    string link;
                    if (a.Link.ToLower().Contains("http"))
                    {
                        link = a.Link;
                    }
                    else
                    {
                        link = "http://ggzy.jinan.gov.cn" + a.Link.GetReplace("../,./");
                    }
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }
    return list;
}
/// <summary>
/// Crawls the listing pages of www.xjztb.net and builds one <see cref="InviteInfo"/> for every
/// announcement whose detail page contains <c>table#print1</c>. Attachment links are appended
/// to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Nothing can be parsed without the first page; return early like the sibling crawlers do.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_Repeater1_ctl16_lblpc")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString();
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the page count cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // The "page" query parameter is zero-based: page i is requested as i - 1.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + (i - 1).ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "slist")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A list item without a link cannot be followed; skip it instead of crashing.
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            // NOTE(review): the current year is prepended to the span text, so the list
            // presumably shows month-day only; items from a previous year would be mis-dated. Confirm.
            string beginDate = DateTime.Now.Year + "-" + node.GetSpan().StringText.ToNodeString().GetReplace(" ");
            // The area is the text between the first pair of square brackets in the list item.
            string area = node.ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
            string InfoUrl = "http://www.xjztb.net/Homepage/" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "print1")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("公司"))
            {
                // Keep everything up to and including the first "公司".
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("地址"))
            {
                // "地址" starts the next field, so cut it off. The previous code re-appended
                // the label after truncating, leaving "…地址" at the end of the unit name.
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "新疆维吾尔自治区建设工程招标投标监督管理办公室";
            string specType = "建设工程";
            string inviteType = "建设工程";

            InviteInfo info = ToolDb.GenInviteInfo("新疆维吾尔自治区", "新疆维吾尔自治区及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Collect attachment links; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.xjztb.net/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the listing pages of ggzy.zhuhai.gov.cn and builds one <see cref="BidInfo"/> for every
/// announcement whose detail page contains the <c>m_r m_r_g</c> div. Attachment links are
/// appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The pager div carries the total page count between "共" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToPlainTextString().GetRegexBegEnd("共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the pager cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://ggzy.zhuhai.gov.cn//zbjj/index_" + i + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "news")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                   endDate = string.Empty, prjMgr = string.Empty;
            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "m_r m_r_g")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            // Header cells are rewritten as data cells before parsing (presumably so they are
            // returned by TableRow.Columns; confirm). Only the <th>/</th> tag names are
            // rewritten: a blanket Replace("th", "td") also mangled cell text and attribute
            // values containing "th" (e.g. "http" became "tdtp"), corrupting the extracted fields.
            // NOTE(review): the lower-casing is kept from the original; it also lower-cases
            // the cell text that feeds the regex helpers below.
            string tableHtml = System.Text.RegularExpressions.Regex.Replace(HtmlTxt.ToLower(), @"<(/?)th(?=[\s/>])", "<$1td");
            parser = new Parser(new Lexer(tableHtml));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bordertb")));
            TableTag table = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
            if (table != null)
            {
                // Flatten the table (skipping row 0) into "label:value" lines.
                string ctx = string.Empty;
                for (int r = 1; r < table.RowCount; r++)
                {
                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                    {
                        string temp = table.Rows[r].Columns[c].ToNodePlainString();
                        if (c % 2 == 0)
                        {
                            ctx += temp + ":";
                        }
                        else
                        {
                            ctx += temp + "\r\n";
                        }
                    }
                }
                buildUnit = ctx.GetBuildRegex();
                code = ctx.GetCodeRegex().GetCodeDel();
                bidUnit = ctx.GetBidRegex();
                bidMoney = ctx.GetMoneyRegex();
                prjMgr = ctx.GetMgrRegex();
            }

            string msgType = "珠海市公共资源交易中心";
            string specType = "建设工程";
            string bidType = prjName.GetInviteBidType();

            BidInfo info = ToolDb.GenBidInfo("广东省", "珠海市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);

            // Collect attachment links; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a.IsAtagAttach())
                    {
                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://ggzy.zhuhai.gov.cn/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Walks the paged listing on www.gsei.com.cn and produces an <see cref="InviteInfo"/> for each
/// entry whose detail page has a <c>div#p8_content_show</c>. Attachment links found there are
/// pushed onto <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">False to stop once <c>MaxCount</c> records have been gathered.</param>
/// <returns>The gathered records (empty if the first listing page could not be downloaded).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();
    int totalPages = 1;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return results;
    }

    // Total page count sits between "1/" and "页" in the pager div.
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mtop pages")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pageText = pagerNodes.AsString().GetRegexBegEnd("1/", "页");
            totalPages = int.Parse(pageText);
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://www.gsei.com.cn/index.php/cms/item-list-category-1336-page-" + page + ".shtml", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList items = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "label_ul_b")), true),
                new TagNameFilter("li")));
        if (items == null || items.Count < 1)
        {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++)
        {
            ATag anchor = items[idx].GetATag();
            if (anchor == null)
            {
                continue;
            }

            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;
            string area = string.Empty;

            string prjName = anchor.GetAttribute("title");
            string beginDate = items[idx].ToPlainTextString().GetDateRegex();
            string InfoUrl = anchor.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "p8_content_show")));
            if (contentNodes == null || contentNodes.Count < 1)
            {
                continue;
            }

            string HtmlTxt = contentNodes.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();

            // Trim the extracted unit name at the first "公司" (kept) and the first "地址" (dropped);
            // values mentioning "工程地点" or "武警" are discarded altogether.
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }
            if (buildUnit.Contains("工程地点") || buildUnit.Contains("武警"))
            {
                buildUnit = "";
            }

            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "甘肃省信息中心";
            string specType = "政府采购";
            string inviteType = "建设工程";

            InviteInfo info = ToolDb.GenInviteInfo("甘肃省", "甘肃省及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            results.Add(info);

            // Attachments: absolute links are used as-is, relative ones get the site prefix;
            // links longer than 500 bytes are ignored.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int n = 0; n < anchors.Count; n++)
                {
                    ATag fileTag = anchors[n] as ATag;
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://www.gsei.com.cn/" + fileTag.Link.GetReplace("../,./");
                    if (Encoding.Default.GetByteCount(link) > 500)
                    {
                        continue;
                    }

                    base.AttachList.Add(ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link));
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged <c>wsbs-table</c> listing of www.gzggzy.cn and builds one
/// <see cref="InviteInfo"/> for every row whose detail page contains the <c>xx-text</c> div.
/// Attachment links are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The pager div carries the total page count between "/共" and "页".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination page-mar")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().GetRegexBegEnd("/共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the pager cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "wsbs-table")));
        TableTag table = (listNode != null && listNode.Count > 0) ? listNode[0] as TableTag : null;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop has always started at 1; presumably a header row).
        for (int j = 1; j < table.RowCount; j++)
        {
            string code = string.Empty, endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // The title is read from column 1 and the date from column 2; rows with fewer
                // cells are skipped instead of throwing.
                continue;
            }

            string prjName = tr.Columns[1].ToNodePlainString();
            // A trailing "(...)" is taken as the project code unless IsChina() accepts it
            // (presumably: it contains Chinese text; confirm). The emptiness check keeps an
            // empty title cell from throwing on prjName[prjName.Length - 1].
            if (!string.IsNullOrEmpty(prjName) && prjName[prjName.Length - 1] == ')')
            {
                int staIndex = prjName.LastIndexOf("(");
                int endIndex = prjName.LastIndexOf(")");
                if (staIndex > 0 && endIndex > 0 && endIndex > staIndex)
                {
                    code = prjName.Substring(staIndex + 1, endIndex - staIndex - 1);
                    if (!code.IsChina())
                    {
                        prjName = prjName.Remove(staIndex);
                    }
                    else
                    {
                        code = string.Empty;
                    }
                }
            }

            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.gzggzy.cn" + tr.Columns[1].GetATagHref();

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "xx-text")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.ToHtml();
            // Paragraph ends and line breaks become newlines before the markup is stripped.
            string inviteCtx = HtmlTxt.Replace("</p>", "\r\n").Replace("<br />", "\r\n").Replace("<br/>", "\r\n").ToCtxString();
            if (string.IsNullOrEmpty(code))
            {
                code = inviteCtx.GetCodeRegex();
            }
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("联系"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
            }
            string prjAddress = inviteCtx.GetAddressRegex();
            string msgType = "广州公共资源交易中心";
            string specType = "政府采购";
            string inviteType = inviteCtx.GetRegex("项目类别", true, 50);
            if (string.IsNullOrEmpty(inviteType))
            {
                inviteType = prjName.GetInviteBidType();
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州政府采购", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);

            // Collect attachment links; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int k = 0; k < fileNode.Count; k++)
                {
                    ATag fileAtag = fileNode[k].GetATag();
                    if (fileAtag != null && fileAtag.IsAtagAttach())
                    {
                        string fileName = fileAtag.LinkText.ToNodeString().Replace(" ", "");
                        string fileLink = fileAtag.Link;
                        if (!fileLink.ToLower().Contains("http"))
                        {
                            fileLink = "http://www.gzggzy.cn" + fileAtag.Link;
                        }
                        base.AttachList.Add(ToolDb.GenBaseAttach(fileName, info.Id, fileLink));
                    }
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the ASP.NET postback-paged listing of www.jinwan.gov.cn and builds one
/// <see cref="InviteInfo"/> for every row whose detail page contains both the content cell
/// (<c>td.fonth21</c>) and the title cell (<c>td.fonth19</c>). The first image of the content
/// is saved as an attachment when no record with the same URL is found in the database;
/// attachment links are appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "cNavBar_cTotalPages")));
    if (pageNode != null && pageNode.Count > 0)
    {
        string temp = pageNode[0].ToNodePlainString();
        try
        {
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the page count cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Paging is a form post: the view state and event validation of the page fetched
            // last are posted back together with the requested page index.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "cID", "cFlag", "Input", "Left1:cID", "Left1:cFlag", "cNavBar:cPageSize", "cNavBar:cPageIndex", "Foot1:ddlLink1", "Foot1:ddlLink2", "Foot1:ddlLink3", "Foot1:ddlLink4", "Foot1:ddlLink5", "__EVENTVALIDATION" },
                new string[] { viewState, "12004", "2", "", "12004", "2", "12", i.ToString(), "", "", "", "", "", eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "95%")));
        TableTag table = (listNode != null && listNode.Count > 0) ? listNode[0] as TableTag : null;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // The link is read from column 0 and the date from column 1.
                continue;
            }
            ATag atag = tr.Columns[0].GetATag();
            if (atag == null)
            {
                // Rows without a link cannot be followed; skip them instead of crashing.
                continue;
            }

            string code = string.Empty;
            string beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.jinwan.gov.cn/" + atag.Link.GetReplace("../");

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "fonth21")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList nameNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "fonth19")));
            if (nameNode == null || nameNode.Count < 1)
            {
                // Without the title cell there is no project name; skip the page instead of
                // throwing on nameNode[0].
                continue;
            }
            string prjName = System.Web.HttpUtility.HtmlDecode(nameNode[0].ToNodePlainString()).Trim();

            string buildUnit = string.Empty, prjAddress = string.Empty, endDate = string.Empty,
                   remark = string.Empty, otherType = string.Empty;

            // The stored HTML is lower-cased once here; the later ToLower() calls of the
            // original were no-ops on this value and have been dropped.
            string HtmlTxt = dtlNode.AsHtml().ToLower();
            string inviteCtx = HtmlTxt.GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();

            // Make the first image's source absolute so it can be downloaded below.
            Parser imgParser = new Parser(new Lexer(HtmlTxt));
            NodeList imgNode = imgParser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            string src = string.Empty;
            if (imgNode != null && imgNode.Count > 0)
            {
                ImageTag img = imgNode[0] as ImageTag;
                string imgUrl = img != null ? img.GetAttribute("src") : null;
                if (!string.IsNullOrEmpty(imgUrl))
                {
                    src = "http://www.jinwan.gov.cn/" + imgUrl;
                    HtmlTxt = HtmlTxt.GetReplace(imgUrl, src);
                }
            }

            string specType = "政府采购";
            string inviteType = prjName.GetInviteBidType();
            string msgType = "珠海市金湾区人民政府";

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "珠海市区", "金湾区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!string.IsNullOrEmpty(src))
            {
                // The URL comes from scraped markup, so single quotes are doubled before it is
                // embedded in the SQL text.
                // NOTE(review): switch to a parameterized query if ToolDb offers one.
                string safeUrl = (info.InfoUrl ?? string.Empty).Replace("'", "''");
                string sql = string.Format("select Id from InviteInfo where InfoUrl='{0}'", safeUrl);
                object obj = ToolDb.ExecuteScalar(sql);
                if (obj == null || obj.ToString() == "")
                {
                    try
                    {
                        BaseAttach attach = ToolHtml.GetBaseAttach(src, prjName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                        if (attach != null)
                        {
                            ToolDb.SaveEntity(attach, "");
                        }
                    }
                    catch
                    {
                        // Best effort: a failed image download must not stop the crawl.
                    }
                }
            }

            // Collect attachment links; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (a != null && a.IsAtagAttach())
                    {
                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.jinwan.gov.cn/" + a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged result listing of www.cfcpn.com and builds one <see cref="BidInfo"/> for
/// every entry whose detail page contains <c>div#news_content</c>. Attachment links are
/// appended to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string html;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The second-to-last pagination link is read as the total page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "pagination")), true), new TagNameFilter("a")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[pageNode.Count - 2].ToNodePlainString();
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: when the pager cannot be read only the first page is crawled.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.cfcpn.com/plist/jieguo?pageNo=" + i + "&kflag=0&keyword=&keywordType=&province=&city=&typeOne=", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "cfcpn_list_content text-left")));
        if (listNode == null || listNode.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            string prjName = string.Empty, endDate = string.Empty, otherType = string.Empty;

            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A list entry without a link cannot be followed. Skip it instead of letting a
                // NullReferenceException abort the whole crawl.
                continue;
            }

            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://www.cfcpn.com" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            // The project name comes from the detail page's title paragraph when present.
            parser = new Parser(new Lexer(htmldtl));
            NodeList telNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "cfcpn_news_title")));
            if (telNode != null && telNode.Count > 0)
            {
                prjName = telNode.AsHtml();
                prjName = prjName.ToCtxString();
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "news_content")));
            if (dtlNode == null || dtlNode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string bidCtx = HtmlTxt.GetReplace("<br/>,</p>,<br>,<br />,</div>", "\r\n").ToCtxString();
            // Pull the value that follows "一包:" back onto the label's line. The same nine
            // passes as the original chained calls are applied, so up to nine consecutive
            // line breaks are removed.
            for (int pass = 0; pass < 9; pass++)
            {
                bidCtx = bidCtx.GetReplace("一包:\r\n", "一包:");
            }

            string buildUnit = bidCtx.GetBuildRegex();
            string prjAddress = bidCtx.GetAddressRegex();
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("成交候选人,第一中标候选人名称,一包").GetReplace("名称");
            }
            string bidMoney = bidCtx.GetMoneyRegex();
            string prjMgr = bidCtx.GetMgrRegex();

            // Any of these markers flags the record as "废标": the winner field is set to that
            // label, the manager is cleared and the amount is zeroed.
            if (bidUnit.Contains("废标") || bidCtx.Contains("废除原因") || bidCtx.Contains("废止原因") || bidCtx.Contains("废标"))
            {
                bidUnit = "废标";
                prjMgr = string.Empty;
                bidMoney = "0";
            }

            if (string.IsNullOrEmpty(bidUnit))
            {
                // Fallback: read the first bordered table as a header row plus one value row.
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("border", "1")));
                if (tableNode != null && tableNode.Count > 0)
                {
                    TableTag dtlTable = tableNode[0] as TableTag;
                    string ctx = string.Empty;
                    if (dtlTable.RowCount > 1)
                    {
                        try
                        {
                            for (int r = 0; r < dtlTable.Rows[0].ColumnCount; r++)
                            {
                                ctx += dtlTable.Rows[0].Columns[r].ToNodePlainString() + ":";
                                ctx += dtlTable.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                            }
                        }
                        catch
                        {
                            // Rows of unequal width: keep whatever was collected so far.
                        }
                        bidUnit = ctx.GetBidRegex();
                        if (bidMoney == "0")
                        {
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetMgrRegex();
                        }
                    }
                }
            }

            bidUnit = bidUnit.GetReplace("名称, ", "");
            buildUnit = buildUnit.GetReplace(" ");
            prjAddress = prjAddress.GetReplace(" ");
            prjName = prjName.GetReplace(" ");
            code = code.GetReplace(" ");
            prjMgr = prjMgr.GetReplace(" ");
            string specType = "政府采购";
            string bidType = prjName.GetInviteBidType();
            string msgType = "中国金融集中采购网";

            BidInfo info = ToolDb.GenBidInfo("全国", "金融专项采购", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Collect attachment links; relative links are made absolute.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int a = 0; a < aNode.Count; a++)
                {
                    ATag file = aNode[a].GetATag();
                    if (file.IsAtagAttach())
                    {
                        string link = file.Link;
                        if (!link.ToLower().Contains("http"))
                        {
                            link = "http://www.cfcpn.com/" + file.Link;
                        }
                        base.AttachList.Add(ToolDb.GenBaseAttach(file.LinkText, info.Id, link));
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Reads the JSON listing returned by <c>SiteUrl + MaxCount</c>, builds one <c>BidInfo</c>
/// per row and saves each record, together with its attachments, through <c>ToolDb</c>.
/// </summary>
/// <remarks>
/// Each row is first looked up through the legacy HTML detail endpoint. When that endpoint
/// returns nothing, the JSON detail endpoint is used instead and its values are merged into
/// the empty placeholder spans of the page found at the row's <c>detailUrl</c>.
/// </remarks>
/// <param name="crawlAll">When false, the method stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>
/// <c>null</c> if the listing cannot be downloaded, contains no JSON object, or the
/// <c>MaxCount</c> limit is reached; otherwise the list, which stays empty because
/// records are saved directly rather than collected.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Keep only the outermost {...}; bail out instead of letting Substring throw
    // when the response holds no JSON object at all.
    int jsonStart = html.IndexOf("{");
    int jsonEnd = html.LastIndexOf("}");
    if (jsonStart < 0 || jsonEnd < jsonStart)
    {
        return null;
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> listing = (Dictionary<string, object>)serializer.DeserializeObject(html.Substring(jsonStart, jsonEnd + 1 - jsonStart));
    object rowsValue;
    if (listing == null || !listing.TryGetValue("rows", out rowsValue))
    {
        return list;
    }

    object[] rows = rowsValue as object[];
    if (rows == null)
    {
        return list;
    }

    foreach (object row in rows)
    {
        Dictionary<string, object> dic = (Dictionary<string, object>)row;
        string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
               beginDate = string.Empty, endDate = string.Empty, bidCtx = string.Empty,
               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
        string ziGeDengJi = string.Empty, ziGeZhengShu = string.Empty, zbFangShi = string.Empty;

        string code = Convert.ToString(dic["bdBH"]);
        string prjName = Convert.ToString(dic["bdName"]);
        if (prjName.Contains("测试"))
        {
            continue;
        }

        string startTicks = Convert.ToString(dic["zbgsStartTime"]);
        try
        {
            beginDate = ToolHtml.GetDateTimeByLong(Convert.ToInt64(startTicks)).ToString();
        }
        catch
        {
            // Best effort: an unparsable timestamp leaves the date empty.
        }

        string endTicks = Convert.ToString(dic["zbgsEndTime"]);
        try
        {
            endDate = ToolHtml.GetDateTimeByLong(Convert.ToInt64(endTicks)).ToString();
        }
        catch
        {
            // Best effort: an unparsable timestamp leaves the date empty.
        }

        string InfoUrl = Convert.ToString(dic["detailUrl"]);
        bool isJson = false;
        List<Dictionary<string, object>> listAttachs = new List<Dictionary<string, object>>();

        // First attempt: legacy endpoint that returns the announcement as HTML.
        try
        {
            string legacyUrl = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryOldOTDataDetail.do?type=4&id=" + dic["dbZhongBiaoJieGuoGuid"];
            HtmlTxt = this.ToolWebSite.GetHtmlByUrl(legacyUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
        }
        catch
        {
            Logger.Error(prjName);
            continue;
        }

        try
        {
            if (string.IsNullOrWhiteSpace(HtmlTxt))
            {
                // Fallback: JSON detail endpoint; its values are merged into the page template.
                isJson = true;
                string htmldtl;
                try
                {
                    string jsonUrl = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/queryZbgs.do?guid=" + dic["dbZhongBiaoJieGuoGuid"] + "&ggGuid=&bdGuid=";
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(jsonUrl);
                }
                catch
                {
                    Logger.Error(prjName);
                    continue;
                }

                Dictionary<string, object> dtlJsons = (Dictionary<string, object>)serializer.DeserializeObject(htmldtl);
                buildUnit = Convert.ToString(dtlJsons["zbrAndLht"]);
                bidUnit = Convert.ToString(dtlJsons["tbrName"]);
                bidMoney = Convert.ToString(dtlJsons["zhongBiaoJE"]);
                try
                {
                    // The amount is later labelled with "万元", so it is scaled down here.
                    bidMoney = (decimal.Parse(bidMoney) / 1000000).ToString();
                }
                catch
                {
                    // Keep the raw value when it is not a number.
                }

                prjMgr = Convert.ToString(dtlJsons["xiangMuJiLi"]);

                Dictionary<string, object> bd = null;
                Dictionary<string, object> gc = null;
                Dictionary<string, object> xm = null;
                try { bd = dtlJsons["bd"] as Dictionary<string, object>; } catch { }
                try { gc = bd["gc"] as Dictionary<string, object>; } catch { }
                try { xm = bd["xm"] as Dictionary<string, object>; } catch { }

                try
                {
                    // "----;" marks a field that carries no value.
                    if (prjMgr.Contains("----;"))
                    {
                        prjMgr = null;
                    }

                    ziGeDengJi = Convert.ToString(dtlJsons["ziGeDengJi"]);
                    if (ziGeDengJi.Contains("----;"))
                    {
                        ziGeDengJi = null;
                    }

                    ziGeZhengShu = Convert.ToString(dtlJsons["ziGeZhengShu"]);
                    if (ziGeZhengShu.Contains("----;"))
                    {
                        ziGeZhengShu = null;
                    }

                    zbFangShi = Convert.ToString(gc["zbFangShi"]);
                    if (zbFangShi.Contains("1"))
                    {
                        zbFangShi = "公开招标";
                    }
                    else if (zbFangShi.Contains("2"))
                    {
                        zbFangShi = "邀请招标";
                    }
                    else if (zbFangShi.Contains("4"))
                    {
                        zbFangShi = "单一来源";
                    }
                    else if (zbFangShi.Contains("5"))
                    {
                        zbFangShi = "预选招标子工程";
                    }
                }
                catch
                {
                    // Any missing key simply leaves the remaining fields at their current values.
                }

                string templateHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl);
                Parser parser = new Parser(new Lexer(templateHtml));
                NodeList nodelist = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_bg")));
                if (nodelist != null && nodelist.Count > 0)
                {
                    HtmlTxt = nodelist.AsHtml();
                    HtmlTxt = FillEmptySpan(HtmlTxt, "gcBH", () => code);
                    // The original wrote id="gcBH" into this span; the id now stays "gcName".
                    HtmlTxt = FillEmptySpan(HtmlTxt, "gcName", () => gc["gcName"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "bdName", () => prjName);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "xmBH", () => xm["xm_BH"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "xmName", () => xm["xm_Name"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zbgsStartTime", () => ToolHtml.GetDateTimeByLong(Convert.ToInt64(dtlJsons["zbgsStartTime"])));
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zbgsEndTime", () => ToolHtml.GetDateTimeByLong(Convert.ToInt64(dtlJsons["zbgsEndTime"])));
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zbRName", () => gc["zbRName"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zbdlJG", () => gc["creatorName"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zbFangShi", () => zbFangShi);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "tbrName", () => dtlJsons["tbrName"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zhongBiaoJE", () => bidMoney + "万元");
                    HtmlTxt = FillEmptySpan(HtmlTxt, "zhongBiaoGQ", () => dtlJsons["zhongBiaoGQ"]);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "xiangMuJiLi", () => prjMgr);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "ziGeDengJi", () => ziGeDengJi);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "ziGeZhengShu", () => ziGeZhengShu);
                    HtmlTxt = FillEmptySpan(HtmlTxt, "isZanDingJinE", () => string.IsNullOrWhiteSpace(Convert.ToString(dtlJsons["isZanDingJinE"])) ? "否" : "是");
                }

                try
                {
                    string fileUrl = "https://www.szjsjy.com.cn:8001/jyw-ba/jyxx/filegroup/queryByGroupGuidZS.do?groupGuid=" + dtlJsons["ztbFileGroupGuid"];
                    string fileJson = this.ToolWebSite.GetHtmlByUrl(fileUrl);
                    Dictionary<string, object> fileDic = (Dictionary<string, object>)serializer.DeserializeObject(fileJson);
                    object[] objFile = fileDic["rows"] as object[];
                    foreach (object file in objFile)
                    {
                        listAttachs.Add(file as Dictionary<string, object>);
                    }
                }
                catch
                {
                    // Attachments are optional; a failed lookup leaves the list as it is.
                }
            }
        }
        catch
        {
            continue;
        }

        bidCtx = HtmlTxt.Replace("<br />", "\r\n").Replace("<BR>", "\r\n").Replace("</P>", "\r\n").ToCtxString();
        if (!isJson)
        {
            // The legacy HTML carries no structured fields, so extract them from the text.
            buildUnit = bidCtx.GetBuildRegex();
            bidUnit = bidCtx.GetBidRegex();
            bidMoney = bidCtx.GetMoneyRegex();
            prjMgr = bidCtx.GetMgrRegex();
            if (string.IsNullOrEmpty(code))
            {
                code = bidCtx.GetCodeRegex();
            }
        }

        string msgType = "深圳市建设工程交易中心宝安分中心";
        string specType = "建设工程";
        string bidType = "小型工程";
        BidInfo info = ToolDb.GenBidInfo("广东省", "深圳宝安区工程", "宝安区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);

        if (!crawlAll && sqlCount >= this.MaxCount)
        {
            return null;
        }

        sqlCount++;
        if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
        {
            continue;
        }

        if (!isJson)
        {
            // Attachments are the download links embedded in the announcement HTML.
            Parser parser = new Parser(new Lexer(HtmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int f = 0; f < fileNode.Count; f++)
                {
                    ATag tag = fileNode[f] as ATag;
                    if (tag == null)
                    {
                        continue;
                    }

                    if (tag.IsAtagAttach() || tag.Link.ToLower().Contains("downloadfile"))
                    {
                        try
                        {
                            string link;
                            if (tag.Link.ToLower().Contains("http"))
                            {
                                link = tag.Link;
                                if (link.EndsWith("//"))
                                {
                                    link = link.Remove(link.LastIndexOf("//"));
                                }
                                link = link.GetReplace("\\", "");
                            }
                            else
                            {
                                link = "https://www.szjsjy.com.cn:8001/" + tag.Link;
                            }

                            BaseAttach attach = ToolHtml.GetBaseAttachByUrl(link, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                            }
                        }
                        catch
                        {
                            continue;
                        }
                    }
                }
            }
        }
        else if (listAttachs.Count > 0)
        {
            // Attachments come from the file-group JSON fetched alongside the detail record.
            foreach (Dictionary<string, object> attach in listAttachs)
            {
                try
                {
                    string attachName = Convert.ToString(attach["attachName"]);
                    string attachId = Convert.ToString(attach["attachGuid"]);
                    string link = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachId;
                    BaseAttach attachBase = ToolHtml.GetBaseAttach(link, attachName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                    if (attachBase != null)
                    {
                        ToolDb.SaveEntity(attachBase, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // One broken attachment must not stop the others.
                }
            }
        }
    }

    return list;
}

/// <summary>
/// Replaces the empty placeholder <c>&lt;span id="spanId"&gt;&lt;/span&gt;</c> in
/// <paramref name="html"/> with the same span wrapping the produced value. If producing
/// or inserting the value throws, the HTML is returned unchanged.
/// </summary>
private static string FillEmptySpan(string html, string spanId, Func<object> valueFactory)
{
    try
    {
        return html.GetReplace("<span id=\"" + spanId + "\"></span>", "<span id=\"" + spanId + "\">" + valueFactory() + "</span>");
    }
    catch
    {
        return html;
    }
}
/// <summary>
/// Crawls the paged listing that starts at <c>SiteUrl</c> (later pages are fetched from
/// <c>http://wanjiang.dg.gov.cn/zbgs-{page}.html</c>) and builds an <c>InviteInfo</c>,
/// plus <c>BaseAttach</c> entries for attachment links, for every item whose detail page
/// contains a <c>div</c> with class <c>about-nn</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    // Read the page count from the pager text.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().GetReplace(" ").GetRegexBegEnd(",共", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Strips script/style/xml islands from detail pages; built once instead of per item.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://wanjiang.dg.gov.cn/zbgs-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Party_news")), true), new TagNameFilter("p")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                // A paragraph without a link is not a listing entry; skip it instead of failing.
                continue;
            }

            string beginDate = viewList[j].ToPlainTextString().GetDateRegex();
            string prjName = aTag.LinkText.GetReplace("[" + beginDate + "]");
            string InfoUrl = "http://wanjiang.dg.gov.cn/" + aTag.Link.GetReplace("./");
            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "about-nn")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            // The stored HTML is lower-cased so the tag replacements below match any casing.
            string HtmlTxt = dtl.AsHtml().ToLower();
            string inviteCtx = HtmlTxt.GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();

            // Cut off trailing fields that the extraction may have dragged along.
            if (buildUnit.Contains("地址"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
            }
            if (buildUnit.Contains("招标代理"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
            }

            string specType = "政府采购";
            string inviteType = prjName.GetInviteBidType();
            string msgType = "东莞市万江区办事处办公室";
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "万江区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://wanjiang.dg.gov.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }

            if (list.Count % 20 == 0)
            {
                // Pauses 500 seconds after every 20 collected records.
                // NOTE(review): presumably throttling for the remote site; confirm the duration is intended.
                Thread.Sleep(1000 * 500);
            }
        }
    }

    return list;
}
/// <summary>
/// Reads the <c>&lt;xml&gt;</c> data island embedded in the page at <c>SiteUrl</c>, follows each
/// listed entry to its detail page and, depending on the page title, collects a
/// <c>BidInfo</c> (title contains "中标"), a <c>NoticeInfo</c> (title contains "通知") or an
/// <c>InviteInfo</c> (everything else), together with <c>BaseAttach</c> entries for attachment links.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the listing cannot be downloaded or has no data island.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // Without a complete <xml>...</xml> island there is nothing to crawl; avoid a Substring failure.
    int startIndex = html.IndexOf("<xml");
    int endIndex = html.IndexOf("</xml>");
    if (startIndex < 0 || endIndex < startIndex)
    {
        return list;
    }

    // Rename the custom XML elements to HTML tag names so the HTML parser can filter them.
    string xmlstr = html.Substring(startIndex, endIndex - startIndex).ToLower().GetReplace("infourl", "span").GetReplace("info", "div").GetReplace("publishedtime", "p");
    Parser parser = new Parser(new Lexer(xmlstr));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("div"));
    if (pageNode == null || pageNode.Count == 0)
    {
        return list;
    }

    for (int i = 0; i < pageNode.Count; i++)
    {
        parser = new Parser(new Lexer(pageNode[i].ToHtml()));
        NodeList dateNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("p"));
        parser.Reset();
        NodeList urlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("span"));
        if (dateNode == null || dateNode.Count == 0 || urlNode == null || urlNode.Count == 0)
        {
            // Entry lacks its date or URL element; skip it instead of failing on [0].
            continue;
        }

        string beginDate = dateNode[0].ToPlainTextString().GetDateRegex();
        string infoUrl = "http://www.shatian.gov.cn/publicfiles/business/htmlfiles/" + urlNode[0].ToPlainTextString();
        string htmldtl;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmldtl));
        NodeList titleNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("title"));
        if (titleNode == null || titleNode.Count == 0)
        {
            continue;
        }

        string prjName = titleNode[0].ToNodePlainString();
        if (prjName.Contains("中标"))
        {
            string endDate = string.Empty, otherType = string.Empty;
            parser.Reset();
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("valign", "top")));
            if (dtlNode != null && dtlNode.Count > 0)
            {
                string HtmlTxt = dtlNode.AsHtml();
                string bidCtx = HtmlTxt.GetReplace("</p>", "\r\n").ToCtxString();
                string buildUnit = bidCtx.GetBuildRegex();
                string bidUnit = bidCtx.GetBidRegex();
                string bidMoney = bidCtx.GetMoneyRegex();
                string prjMgr = bidCtx.GetMgrRegex();
                string code = bidCtx.GetCodeRegex();
                string specType = "政府采购";
                string bidType = prjName.GetInviteBidType();
                string msgType = "东莞市沙田镇政府";
                BidInfo info = ToolDb.GenBidInfo("广东省", "东莞市区", "沙田镇", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.shatian.gov.cn/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }
        }
        else if (prjName.Contains("通知"))
        {
            string InfoType = string.Empty, prjCode = string.Empty, buildUnit = string.Empty;
            parser.Reset();
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "concent")));
            if (dtlNode != null && dtlNode.Count > 0)
            {
                string InfoTitle = prjName;
                string PublistTime = beginDate;
                string htmlTxt = dtlNode.AsHtml();
                string InfoCtx = htmlTxt.ToCtxString();
                NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "东莞市区", "沙田镇", string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "东莞市沙田镇政府", infoUrl, prjCode, buildUnit, string.Empty, string.Empty, "政府采购", string.Empty, htmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(htmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.shatian.gov.cn/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }
        }
        else
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            parser.Reset();
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("valign", "top")));
            if (dtlNode != null && dtlNode.Count > 0)
            {
                // Only the first matching cell is used for invitations.
                string HtmlTxt = dtlNode[0].ToHtml();
                string inviteCtx = HtmlTxt.GetReplace("</p>", "\r\n").ToCtxString();
                string buildUnit = inviteCtx.GetBuildRegex();
                string prjAddress = inviteCtx.GetAddressRegex();
                string code = inviteCtx.GetCodeRegex();
                string specType = "政府采购";
                string inviteType = prjName.GetInviteBidType();
                string msgType = "东莞市沙田镇政府";
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "沙田镇", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, HtmlTxt);
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k].GetATag();
                        if (a == null || !a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.shatian.gov.cn/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Downloads the page at <c>SiteUrl</c>, takes the tables with <c>width="95%"</c> at positions
/// 10, 4 and 2 (in that order), follows each row to its detail page and saves a
/// <c>NotifyInfo</c> plus its attachments through <c>ToolDb</c>.
/// </summary>
/// <param name="crawlAll">When false, the method stops once <c>MaxCount</c> records have been processed.</param>
/// <returns>Always <c>null</c>; records are saved directly instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
    }
    catch
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "95%")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return null;
    }

    // Positions of the listing tables among the matches, in processing order.
    int[] tableIndexes = { 10, 4, 2 };
    foreach (int tableIndex in tableIndexes)
    {
        if (tableIndex >= nodeList.Count)
        {
            // The page has fewer tables than expected; skip rather than index past the end.
            continue;
        }

        TableTag table = nodeList[tableIndex] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Title and date columns are both required.
                continue;
            }

            string infoScorce = string.Empty;
            string infoType = "政策法规";
            string headName = tr.Columns[0].ToNodePlainString();
            string releaseTime = tr.Columns[1].ToPlainTextString().GetDateRegex();
            // The detail path is taken from the anchor's onclick handler arguments.
            string infoUrl = "http://market.meizhou.gov.cn" + tr.Columns[0].GetATagValue("onclick").GetRegexBegEnd(",'", "',");
            string htldtl;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            // Use the second centred table when there is one, otherwise everything matched.
            string ctxHtml = dtlList.Count > 1 ? dtlList[1].ToHtml() : dtlList.ToHtml();
            string infoCtx = ctxHtml.ToCtxString().Replace(">", "");
            string msgType = MsgTypeCosnt.MeiZhouMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "梅州市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            sqlCount++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null || aNode.Count == 0)
            {
                continue;
            }

            for (int a = 0; a < aNode.Count; a++)
            {
                ATag aTag = aNode[a] as ATag;
                if (aTag == null || !aTag.IsAtagAttach())
                {
                    continue;
                }

                try
                {
                    BaseAttach baseInfo = ToolHtml.GetBaseAttach("http://market.meizhou.gov.cn" + aTag.Link, aTag.LinkText, info.Id);
                    if (baseInfo != null)
                    {
                        ToolDb.SaveEntity(baseInfo, string.Empty);
                    }
                }
                catch
                {
                    // One failed attachment must not stop the remaining ones.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c> (pages are requested with <c>?Paging=n</c>),
/// opens every linked detail page and collects a <c>NoticeInfo</c> of type "答疑澄清" plus
/// <c>BaseAttach</c> entries for the attachment links found in its content cell.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int totalPages = 1;
    string pageHtml = string.Empty;

    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // A failed first request leaves the page empty; the loop below then finds nothing.
    }

    // The pager cell shows "1/<total>"; dropping the "1/" prefix leaves the page count.
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "huifont")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerCells[0].ToNodePlainString().GetReplace("1/"));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?Paging=" + page, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList listingTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "98%")));
        if (listingTables == null || listingTables.Count == 0)
        {
            continue;
        }

        TableTag listing = listingTables[0] as TableTag;
        for (int rowIndex = 0; rowIndex < listing.RowCount; rowIndex++)
        {
            TableRow row = listing.Rows[rowIndex];
            ATag anchor = row.GetATag();
            if (anchor == null)
            {
                continue;
            }

            string noticeType = "答疑澄清";
            string title = anchor.GetAttribute("title");
            string published = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://ggzy.xjbt.gov.cn" + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (contentCells == null || contentCells.Count == 0)
            {
                continue;
            }

            string contentHtml = contentCells.AsHtml();
            string contentText = contentHtml.GetReplace("</p>,<br />,<br/>", "\r\n").ToCtxString();
            string projectCode = contentText.GetCodeRegex().GetCodeDel();
            string owner = contentText.GetBuildRegex();
            if (owner.Contains("电话"))
            {
                // Cut the owner name off where the phone label begins.
                owner = owner.Remove(owner.IndexOf("电话"));
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("新疆维吾尔自治区", "新疆维吾尔自治区及地市", "", string.Empty, title, noticeType, contentText, published, string.Empty, "新疆生产建设兵团公共资源交易中心", detailUrl, projectCode, owner, string.Empty, string.Empty, "政府采购", "建设工程", contentHtml);
            list.Add(info);

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag fileTag = anchors[k] as ATag;
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root.
                    string link = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://ggzy.xjbt.gov.cn/" + fileTag.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link));
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged listing at <c>SiteUrl</c> (pages are requested with <c>?Paging=n</c>),
/// opens every linked detail page and collects a <c>NoticeInfo</c> of type "澄清变更" plus
/// <c>BaseAttach</c> entries for the attachment links found in its content cell.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int totalPages = 1;
    string pageHtml;

    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The page count sits between the "总页数" and "当前页" labels of the pager cell.
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerCells.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", ""));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?Paging=" + page, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList listingTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")));
        if (listingTables == null || listingTables.Count == 0)
        {
            continue;
        }

        TableTag listing = listingTables[0] as TableTag;
        // The final row of the table is not processed.
        int lastRow = listing.RowCount - 1;
        for (int rowIndex = 0; rowIndex < lastRow; rowIndex++)
        {
            string projectCode = string.Empty;
            string area = string.Empty;

            TableRow row = listing.Rows[rowIndex];
            ATag anchor = row.Columns[1].GetATag();
            string title = anchor.GetAttribute("title");
            string published = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.gxzbtb.cn" + anchor.Link;
            string noticeType = "澄清变更";

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentCells = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
            if (contentCells == null || contentCells.Count == 0)
            {
                continue;
            }

            string contentHtml = contentCells.AsHtml().GetJsString();
            string contentText = contentHtml.ToCtxString();
            string owner = contentText.GetBuildRegex();
            NoticeInfo info = ToolDb.GenNoticeInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, title, noticeType, contentText, published, string.Empty, "广西壮族自治区公共资源交易中心", detailUrl, projectCode, owner, string.Empty, string.Empty, "政府采购", string.Empty, contentHtml);

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag fileTag = anchors[k].GetATag();
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are prefixed with the site host.
                    string link = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://www.gxzbtb.cn" + fileTag.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link));
                }
            }

            // The record is added only after its attachments have been registered.
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Requests a fixed number of listing pages (37) from the site's AJAX endpoint via POST, opens
/// every listed detail page and collects a <c>NoticeInfo</c> of type "控制价公示" plus
/// <c>BaseAttach</c> entries for the attachment links found in its content block.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records, or <c>null</c> when the initial requests fail.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NoticeInfo>();
    int totalPages = 37;
    string pageHtml = string.Empty;
    string cookiestr = string.Empty;
    string url = "http://www.gsggzyjy.cn/ajax/Controls_InfoList,App_Web_rzplwhmc.ashx?_method=getCurrentData&_session=rw";

    try
    {
        // The plain GET is issued first with the cookie string passed by reference,
        // and the same cookie string is then handed to the POST.
        this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
        pageHtml = ToolHtml.GetHtmlByUrlPost(url, "currentPage=1\r\nQuery=", Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        return null;
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                string post = "currentPage=" + page + "\r\nQuery=";
                pageHtml = ToolHtml.GetHtmlByUrlPost(url, post, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(pageHtml));
        NodeList items = parser.ExtractAllNodesThatMatch(new TagNameFilter("li"));
        if (items == null || items.Count == 0)
        {
            continue;
        }

        for (int itemIndex = 0; itemIndex < items.Count; itemIndex++)
        {
            string projectCode = string.Empty;
            string owner = string.Empty;
            string area = string.Empty;

            INode item = items[itemIndex];
            ATag anchor = item.GetATag();
            string title = anchor.GetAttribute("title");
            string noticeType = "控制价公示";
            string published = item.GetSpan().StringText;
            string detailUrl = "http://www.gsggzyjy.cn" + anchor.Link;

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentBlocks = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ContentPlaceHolder1_InfoHtml")));
            if (contentBlocks == null || contentBlocks.Count == 0)
            {
                continue;
            }

            string contentHtml = contentBlocks.AsHtml();
            string contentText = contentHtml.ToCtxString();

            parser = new Parser(new Lexer(contentHtml));
            NodeList tables = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (tables != null && tables.Count > 0)
            {
                // Flatten the first table into "label:value" lines: odd-positioned cells
                // end with ":" and even-positioned cells end the line.
                string flattened = string.Empty;
                TableTag grid = tables[0] as TableTag;
                for (int r = 0; r < grid.RowCount; r++)
                {
                    for (int c = 0; c < grid.Rows[r].ColumnCount; c++)
                    {
                        string cellText = grid.Rows[r].Columns[c].ToNodePlainString();
                        string suffix = (c + 1) % 2 == 0 ? "\r\n" : ":";
                        flattened += cellText.GetReplace(":,:") + suffix;
                    }
                }
                owner = flattened.GetBuildRegex();
            }

            NoticeInfo info = ToolDb.GenNoticeInfo("甘肃省", "甘肃省及地市", area, string.Empty, title, noticeType, contentText, published, string.Empty, "甘肃省公共资源交易中心", detailUrl, projectCode, owner, string.Empty, string.Empty, "水利及其他工程", string.Empty, contentHtml);

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int k = 0; k < anchors.Count; k++)
                {
                    ATag fileTag = anchors[k].GetATag();
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root.
                    string link = fileTag.Link.ToLower().Contains("http")
                        ? fileTag.Link
                        : "http://www.gsggzyjy.cn/" + fileTag.Link;
                    base.AttachList.Add(ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link));
                }
            }

            // The record is added only after its attachments have been registered.
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid bulletin list on new.sztc.com, downloads each bulletin's detail
/// page and builds an <c>InviteInfo</c> per bulletin. Attachment links found in the
/// detail content are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds at least
/// <c>this.MaxCount</c> entries; when true, every page is processed.
/// </param>
/// <returns>
/// The list of <c>InviteInfo</c> objects collected; empty when the first page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    // Get the page count; it stays 1 when the pager cannot be found or parsed.
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "");
        try
        {
            // The page count is the text between "/" and "页" in the pager.
            pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            // Best effort: keep the single-page default when the pager text is unexpected.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://new.sztc.com/bidBulletin/index_" + i + ".jhtml");
            }
            catch
            {
                // Best effort: a page that cannot be fetched is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "lb-link")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                   remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            ATag aTag = nodeList[j].GetATag();
            prjName = aTag.LinkText.ToNodeString().Replace(" ", "");
            beginDate = prjName.GetDateRegex();
            // Strip the date from the title only when a date was actually extracted:
            // string.Replace throws when the search value is null or empty.
            if (!string.IsNullOrEmpty(beginDate))
            {
                prjName = prjName.Replace(beginDate, "");
            }
            InfoUrl = aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // Best effort: a bulletin whose detail page cannot be fetched is skipped.
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "ninfo-con"), new TagNameFilter("div")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            inviteCtx = HtmlTxt.ToCtxString();
            code = inviteCtx.GetCodeRegex().GetCodeDel();
            buildUnit = inviteCtx.GetBuildRegex();
            prjAddress = inviteCtx.GetAddressRegex();
            specType = "政府采购";
            msgType = "深圳市国际招标有限公司";
            inviteType = prjName.GetInviteBidType();

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register every attachment-like link found in the bulletin content.
            dtlparser = new Parser(new Lexer(HtmlTxt));
            NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null && FileTag.Count > 0)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    // Skip nodes that are not anchor tags instead of dereferencing a null cast result.
                    if (file == null || !file.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = string.Empty;
                    if (file.Link.ToLower().Contains("http"))
                    {
                        link = file.Link;
                    }
                    else
                    {
                        // Links without "http" are prefixed with the site root.
                        link = "http://new.sztc.com/" + file.Link;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the procurement announcement list on jieyang.gdgpo.com, downloads each
/// announcement's detail page and builds an <c>InviteInfo</c> per announcement.
/// Attachment links found in the detail content are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds at least
/// <c>this.MaxCount</c> entries; when true, every page is processed.
/// </param>
/// <returns>
/// The list of <c>InviteInfo</c> objects collected; empty when the first page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string html = string.Empty;
    int pageInt = 1;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Work out the page count from the pager form; any failure leaves the default of 1.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "qPageForm")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            Parser pagerParser = new Parser(new Lexer(pageNode.ToHtml()));
            NodeList pagerLinks = pagerParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (pagerLinks != null && pagerLinks.Count > 0)
            {
                // The second-to-last pager link carries the page number inside "turnOverPage(N);".
                string href = pagerLinks[pagerLinks.Count - 2].GetATagHref();
                string digits = href.Replace("turnOverPage", "").Replace("(", "").Replace(")", "").Replace(";", "");
                pageInt = int.Parse(digits);
            }
        }
        catch
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Pages after the first are requested by posting the paging form fields.
            string[] formKeys = new string[] { "channelCode", "pageIndex", "pageSize", "pointPageIndexId" };
            string[] formValues = new string[] { "0005", i.ToString(), "15", "1" };
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(formKeys, formValues);
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://jieyang.gdgpo.com/queryMoreInfoList.do", nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "m_m_c_list")), true), new TagNameFilter("li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode item = listNode[j];
            ATag aTag = item.GetATag(1);
            string prjName = aTag.GetAttribute("title");
            string beginDate = item.ToPlainTextString().GetDateRegex();
            string InfoUrl = "http://jieyang.gdgpo.com" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zw_c_c_cont")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlNode.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = inviteCtx.GetBuildRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string inviteType = prjName.GetInviteBidType();
            string msgType = "揭阳市政府采购";
            string specType = "政府采购";

            // End date, remark and other-type are passed as empty strings.
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "揭阳市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);

            // Collect attachment-like links from the announcement content.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int k = 0; k < fileNode.Count; k++)
                {
                    ATag fileAtag = fileNode[k].GetATag();
                    if (!fileAtag.IsAtagAttach())
                    {
                        continue;
                    }

                    string fileName = fileAtag.LinkText.ToNodeString().Replace(" ", "");
                    string fileLink = fileAtag.Link;
                    bool hasScheme = fileLink.ToLower().Contains("http");
                    if (!hasScheme)
                    {
                        fileLink = "http://jieyang.gdgpo.gov.cn" + fileAtag.Link;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(fileName, info.Id, fileLink);
                    base.AttachList.Add(attach);
                }
            }

            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid result table on www.whzbtb.com, downloads each entry's detail page
/// and builds a <c>BidInfo</c> per entry. Attachment links found in the detail
/// content are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds at least
/// <c>this.MaxCount</c> entries; when true, every page is processed.
/// </param>
/// <returns>
/// The list of <c>BidInfo</c> objects collected; empty when the first page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    // Total page count; stays 1 when the pager cannot be found or parsed.
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lb_page")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The page count is the text between "分" and "页" in the pager span.
            string temp = pageNode.AsString().GetRegexBegEnd("分", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep the single-page default when the pager text is unexpected.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // The "page" query parameter is zero-based: page i is requested as i - 1.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?page=" + (i - 1).ToString(), Encoding.Default);
            }
            catch
            {
                // Best effort: a page that cannot be fetched is skipped.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "list")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        // Row 0 is not processed (presumably the table header — confirm against the page).
        for (int j = 1; j < table.RowCount; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty,
                   bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, bidType = string.Empty, specType = string.Empty,
                   InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty,
                   area = string.Empty;

            TableRow tr = table.Rows[j];
            ATag aTag = tr.Columns[2].GetATag();
            prjName = aTag.LinkText;
            bidUnit = tr.Columns[4].ToNodePlainString();
            bidMoney = tr.Columns[5].ToNodePlainString();
            endDate = tr.Columns[6].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            // Un-escape HTML-encoded ampersands in the href so the query string is valid.
            // NOTE(review): the previous Replace had identical search and replacement
            // text and therefore did nothing; "&amp;" is the presumed intent — confirm.
            InfoUrl = "http://www.whzbtb.com/" + aTag.Link.GetReplace("../,./").Replace("&amp;", "&");

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                // Best effort: an entry whose detail page cannot be fetched is skipped.
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "683")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtlNode.AsHtml();
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            if (tableNode != null && tableNode.Count > 1)
            {
                // Flatten the second table into "label:value" lines: non-blank cells in odd
                // positions are followed by ':' and those in even positions end the line.
                StringBuilder ctxBuilder = new StringBuilder();
                TableTag tableTag = tableNode[1] as TableTag;
                for (int r = 0; r < tableTag.RowCount; r++)
                {
                    for (int c = 0; c < tableTag.Rows[r].ColumnCount; c++)
                    {
                        string temp = tableTag.Rows[r].Columns[c].ToPlainTextString().GetReplace(" ");
                        if (string.IsNullOrWhiteSpace(temp))
                        {
                            continue;
                        }
                        ctxBuilder.Append(temp.GetReplace(":,:"));
                        ctxBuilder.Append((c + 1) % 2 == 0 ? "\r\n" : ":");
                    }
                }
                bidCtx = ctxBuilder.ToString();
            }
            else
            {
                bidCtx = HtmlTxt.ToCtxString();
            }

            code = bidCtx.GetCodeRegex().GetReplace(" ");
            buildUnit = bidCtx.GetBuildRegex().GetReplace(" ");
            prjMgr = bidCtx.GetMgrRegex().GetReplace(" ");
            if (string.IsNullOrEmpty(prjMgr))
            {
                prjMgr = bidCtx.GetRegex("建筑师/总监/负责人").GetReplace(" ");
            }

            // Try each date source in turn. The fallbacks previously discarded their
            // result, so beginDate stayed empty whenever the first lookup failed.
            beginDate = bidCtx.GetRegex("中标公示时段").GetDateRegex("yyyy/MM/dd");
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = bidCtx.GetRegex("中标公示时段").GetDateRegex();
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = bidCtx.GetRegex("开标时间").GetDateRegex("yyyy/MM/dd");
            }
            if (string.IsNullOrEmpty(beginDate))
            {
                beginDate = bidCtx.GetRegex("开标时间").GetDateRegex();
            }

            msgType = "武汉市公共资源交易中心";
            specType = "政府采购";
            bidType = "建设工程";

            BidInfo info = ToolDb.GenBidInfo("湖北省", "湖北省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register every attachment-like link found in the detail content.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    // Skip nodes that are not anchor tags instead of dereferencing a null cast result.
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = string.Empty;
                    if (a.Link.ToLower().Contains("http"))
                    {
                        link = a.Link;
                    }
                    else
                    {
                        // Links without "http" are prefixed with the site root.
                        link = "http://www.whzbtb.com/" + a.Link.GetReplace("../,./");
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}