/// <summary>
/// Creates one bid record for a list entry of www.gsrc.com, appends it to
/// <paramref name="list"/> and registers the detail page itself as the record's attachment.
/// </summary>
/// <param name="itemName">Raw entry title; it is passed through <c>GetReplace</c> before use.</param>
/// <param name="dtlUrl">Detail link as found on the list page; every "./" is removed before it is appended to the site root.</param>
/// <param name="begin">Date text of the entry; used for both date arguments of the record.</param>
/// <param name="list">Collection that receives the new record.</param>
private void AddBidInfo(string itemName, string dtlUrl, string begin, IList list)
{
    const string Owner = "广深铁路股份有限公司";
    const string SeeAttachment = "见附件";
    const string NotAvailable = "";

    string detailUrl = "http://www.gsrc.com/" + dtlUrl.Replace("./", "");

    // NOTE(review): GetReplace is a project extension; presumably it strips the
    // comma-separated tokens ("./", the date, "(" and ")") from the title - confirm.
    string title = itemName.GetReplace("./," + begin + ",(,)");
    string category = ToolHtml.GetInviteTypes(title);

    BidInfo record = ToolDb.GenBidInfo(
        "广东省", "深圳社会招标", "", string.Empty,
        NotAvailable, title, Owner, begin, NotAvailable, begin, NotAvailable,
        SeeAttachment, string.Empty, Owner, category, "建设工程", NotAvailable,
        NotAvailable, detailUrl, NotAvailable, SeeAttachment);
    list.Add(record);

    // The detail page URL doubles as the attachment link of the record.
    BaseAttach pageAttachment = ToolDb.GenBaseAttach(title, record.Id, detailUrl);
    base.AttachList.Add(pageAttachment);
}
/// <summary>
/// Crawls the list pages of www.sz-otc.com (list_36_N.html), opens every entry's detail page,
/// extracts the bid fields with regular expressions and collects one <c>BidInfo</c> per entry.
/// Links containing ".doc" inside the detail content are registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Determine the number of pages from the pager link labelled "末页" (last page).
    Parser parser = new Parser(new Lexer(html));
    NodeList pagerItems = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "mt20 fenye2"))), new TagNameFilter("li")));
    if (pagerItems != null && pagerItems.Count > 0)
    {
        try
        {
            // Walk the anchors themselves: indexing the anchor list with the <li> index
            // breaks as soon as an item holds no link or more than one.
            NodeList pagerLinks = pagerItems.SearchFor(typeof(ATag), true);
            for (int i = 0; i < pagerLinks.Count; i++)
            {
                ATag pagerLink = pagerLinks[i] as ATag;
                if (pagerLink != null && pagerLink.LinkText.Contains("末页"))
                {
                    pageInt = Convert.ToInt32(pagerLink.Link.Replace("list_36_", "").Replace(".html", ""));
                    break;
                }
            }
        }
        catch (Exception)
        {
            // Best effort: when the pager cannot be interpreted only the first page is crawled.
        }
    }

    // The patterns never change, so they are built once instead of once per entry.
    Regex dateRegex = new Regex(@"\d{4}-\d{2}-\d{2}");
    Regex scriptRegex = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex unitRegex = new Regex(@"单位(:|:)[^\r\n]+\r\n");
    Regex codeRegex = new Regex(@"编号(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.sz-otc.com/a/zhaobiao/zhongbiao/list_36_" + i.ToString() + ".html"), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "zhaobiao_list"))), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // Loop-invariant: all anchors of the list, looked up once per page.
        NodeList rowLinks = nodeList.SearchFor(typeof(ATag), true);

        for (int j = 0; j < nodeList.Count; j++)
        {
            string buildUnit = string.Empty, endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;
            string beginDate = string.Empty;

            // The entry text is "<title>&...<date>": everything before the first "&" is the
            // title, the date is searched in the remainder. Without "&" the whole text is
            // kept as title and the date stays empty.
            string rowText = nodeList[j].ToPlainTextString().Trim();
            string prjName = rowText;
            int ampIndex = rowText.IndexOf("&");
            if (ampIndex >= 0)
            {
                prjName = rowText.Remove(ampIndex);
                if (prjName.Contains("]"))
                {
                    // Drop a leading "[...]" prefix.
                    prjName = prjName.Substring(prjName.IndexOf("]")).Replace("]", "");
                }
                beginDate = dateRegex.Match(rowText.Substring(ampIndex)).Value;
            }

            // An entry without a usable link is skipped instead of aborting the whole crawl.
            ATag aTag = (rowLinks != null && j < rowLinks.Count) ? rowLinks[j] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string InfoUrl = "http://www.sz-otc.com" + aTag.Link;

            string HtmlTxt;
            string htmldetail;
            try
            {
                // One download serves both the stored HTML and the text extraction.
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");

                Parser htmlParser = new Parser(new Lexer(rawDetail.Trim()));
                NodeList htmlNodes = htmlParser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
                HtmlTxt = htmlNodes.AsHtml();

                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                htmldetail = scriptRegex.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
            string bidCtx = dtnode.AsString();

            string bidUnit = unitRegex.Match(bidCtx).Value.Replace("单位", "").Replace(":", "").Replace(":", "").Trim();
            // The limit is expressed in encoded bytes, so trim by bytes. A fixed
            // Substring(0, 150) throws for multi-byte text shorter than 150 characters.
            while (Encoding.Default.GetByteCount(bidUnit) > 150)
            {
                bidUnit = bidUnit.Substring(0, bidUnit.Length - 1);
            }

            string code = codeRegex.Match(bidCtx).Value.Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            if (code.Contains("点击"))
            {
                code = code.Remove(code.IndexOf("点击"));
            }

            // First try the line labelled "金额" (amount), then fall back to a line holding "¥".
            string bidMoney = ParseMoneyInTenThousands(bidCtx, @"金额(:|:)[^\r\n]+\r\n", "金额");
            if (bidMoney == "0")
            {
                bidMoney = ParseMoneyInTenThousands(bidCtx, @"¥[^\r\n]+\r\n", "¥");
            }

            string specType = "其他";
            string msgType = "深圳市东方招标有限公司";
            prjName = ToolDb.GetPrjName(prjName);
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register every linked Word document of the detail content as attachment.
            NodeList fileTags = dtnode.SearchFor(typeof(ATag), true);
            if (fileTags != null)
            {
                for (int f = 0; f < fileTags.Count; f++)
                {
                    ATag file = fileTags[f] as ATag;
                    if (file == null || file.Link == null)
                    {
                        continue;
                    }
                    if (file.Link.ToUpper().Contains(".DOC"))
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(file.Link.Replace("Ads/", "").Replace(".DOC", "").Replace(".doc", ""), info.Id, "http://www.sz-otc.com/" + file.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}

/// <summary>
/// Finds the first line of <paramref name="bidCtx"/> matching <paramref name="linePattern"/>
/// and returns the amount it contains, expressed in units of ten thousand.
/// </summary>
/// <param name="bidCtx">Plain text of the detail page.</param>
/// <param name="linePattern">Regular expression selecting the line that carries the amount.</param>
/// <param name="label">Label text removed from the matched line before the number is read.</param>
/// <returns>
/// The number as text; "0" when no number is found, when it cannot be parsed, or when a plain
/// amount divided by 10000 is below 0.1. Lines already stated in "万元"/"万美元" are returned unscaled.
/// </returns>
private static string ParseMoneyInTenThousands(string bidCtx, string linePattern, string label)
{
    string line = new Regex(linePattern).Match(bidCtx).Value.Replace(label, "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();

    // Optional fraction: unlike "[0-9]+[.]{0,1}[0-9]+" this also accepts single-digit amounts.
    string number = new Regex(@"[0-9]+(\.[0-9]+)?").Match(line).Value;
    if (string.IsNullOrEmpty(number))
    {
        return "0";
    }
    if (line.Contains("万元") || line.Contains("万美元"))
    {
        return number;
    }

    try
    {
        decimal scaled = decimal.Parse(number) / 10000;
        return scaled < 0.1m ? "0" : scaled.ToString();
    }
    catch (FormatException)
    {
        return "0";
    }
    catch (OverflowException)
    {
        return "0";
    }
}
/// <summary>
/// Crawls the announcement list of www.gy-center.net page by page, downloads every entry's
/// detail page, derives the bid fields through the <c>ToolHtml</c> helpers and collects one
/// <c>BidInfo</c> per entry whose detail page contains the "r_content_right_main" block.
/// Attachment links found in that block are added to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // The pager block (div#yema) contains ".../<n>页"; <n> is the page count.
    Parser pagerParser = new Parser(new Lexer(html));
    NodeList pagerNodes = pagerParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        string pagerText = pagerNodes.AsString();
        try
        {
            Regex pagePattern = new Regex(@"/[^页]+页");
            string pageDigits = pagePattern.Match(pagerText).Value.Replace("页", "").Replace("/", "");
            pageInt = Convert.ToInt32(pageDigits);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int pageNo = 1; pageNo <= pageInt; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=97&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + pageNo.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        Parser listParser = new Parser(new Lexer(html));
        NodeList dtlList = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (dtlList == null || dtlList.Count <= 0)
        {
            continue;
        }

        // The last <li> of the list is deliberately left out (upper bound is Count - 1).
        for (int row = 0; row < dtlList.Count - 1; row++)
        {
            string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

            string rowText = dtlList[row].ToPlainTextString();
            string rowHtml = dtlList[row].ToHtml();
            string prjName = ToolHtml.GetHtmlAtagValue("title", rowHtml);
            string beginDate = ToolHtml.GetRegexDateTime(rowText);
            string InfoUrl = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", rowHtml);

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                detailHtml = System.Text.RegularExpressions.Regex.Replace(detailHtml, "(<script)[\\s\\S]*?(</script>)", "");
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = contentNodes.ToHtml();
            string bidCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t\t", "").Replace("\r\r", "\r").Replace("\n\n", "\n");
            string bidType = ToolHtml.GetInviteTypes(prjName);

            // Flatten the first "MsoNormalTable" into "header:value" lines, pairing row 0
            // with row 1. Only tables with at least two rows and two header columns are
            // read, and at most the first seven columns are used.
            string bidStr = string.Empty;
            Parser tableParser = new Parser(new Lexer(HtmlTxt));
            NodeList bidList = tableParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
            if (bidList != null && bidList.Count > 0)
            {
                try
                {
                    TableTag tab = bidList[0] as TableTag;
                    if (tab.RowCount > 1)
                    {
                        int headerColumns = tab.Rows[0].ColumnCount;
                        if (headerColumns > 1)
                        {
                            int used = headerColumns > 7 ? 7 : headerColumns;
                            for (int col = 0; col < used; col++)
                            {
                                bidStr += tab.Rows[0].Columns[col].ToPlainTextString().ToNodeString() + ":" + tab.Rows[1].Columns[col].ToPlainTextString().ToNodeString() + "\r\n";
                            }
                        }
                    }
                }
                catch
                {
                    // Irregular table: whatever lines were built so far are kept.
                }
            }

            string buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex);
            string prjAddress = ToolHtml.GetRegexString(bidCtx, ToolHtml.AddressRegex);
            string code = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex);

            // Winning bidder: page text first, then the flattened table, then the
            // text between "确认" and "为".
            string bidUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex);
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ToolHtml.GetRegexString(bidStr.Replace(" ", ""), ToolHtml.BidRegex, false);
            }
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("确认", "为");
            }

            string bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex);
            bidMoney = ToolHtml.GetRegexMoney(bidMoney);
            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetRegexBegEnd("¥", "元").GetMoney();
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            code = ToolHtml.GetSubString(code, 50);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            buildUnit = ToolHtml.GetStringTemp(buildUnit);

            if (string.IsNullOrEmpty(code))
            {
                code = "见中标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见中标信息";
            }

            string specType = "其他";
            string msgType = "工网在线";
            BidInfo info = ToolDb.GenBidInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Collect attachment links of the content block.
            Parser linkParser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = linkParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int c = 0; c < anchors.Count; c++)
                {
                    ATag anchor = anchors[c] as ATag;
                    if (!anchor.Link.IsAtagAttach())
                    {
                        continue;
                    }
                    // NOTE(review): attachments are resolved against www.bidding.csg.cn, not
                    // the crawled host - confirm this is intended.
                    string attachUrl = "http://www.bidding.csg.cn/" + anchor.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(anchor.LinkText.Replace(" ", ""), info.Id, attachUrl);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender list (table "cnewslist") page by page, opens every entry's detail page,
/// flattens the detail table "borderTB" into "label:value" lines and collects one
/// <c>InviteInfo</c> per entry. The first link inside the "Noprint" block of a detail page is
/// registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // The page count is read from the number embedded in the last pager link.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true), new TagNameFilter("a")));
    try
    {
        // NOTE(review): this pattern needs at least two digits, so a single-digit page
        // count is not recognised and only the first page is crawled - confirm the link format.
        Regex numpage = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
        ATag link = (ATag)nodeList[nodeList.Count - 1];
        page = Convert.ToInt32(numpage.Match(link.Link).Value.Trim());
    }
    catch (Exception)
    {
        // Best effort: without a readable pager only the first page is crawled.
    }

    Regex regBuidUnit = new Regex(@"(招标人|招标人/招标代理)(:|:)[^\r\n]+\r\n");
    Regex regPrjAddr = new Regex(@"(建设地点|项目地址|建设单位)(:|:)[^\r\n]+\r\n");
    Regex regcode = new Regex(@"项目编号(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "newtitle", "totalRows", "pageNO" }, new string[] { string.Empty, "0", i.ToString() });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr).Replace("<th", "<td").Replace("</th>", "</td>").Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "cnewslist")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // The first row and the last two rows of the table are not entries.
        for (int j = 1; j < table.RowCount - 2; j++)
        {
            string remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            // Rows that lack the expected cells or the title link are skipped instead of
            // aborting the whole crawl with an unhandled exception.
            if (tr == null || tr.ColumnCount < 3)
            {
                continue;
            }
            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string endDate = tr.Columns[2].ToPlainTextString().Trim();
            ATag aTag = tr.Columns[0].SearchFor(typeof(ATag), true)[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            string InfoUrl = aTag.Link;

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace("<th", "<td").Replace("</th>", "</td>").Replace("</TH>", "</td>").Replace("<TH", "<td").Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("InviteZhuHaiJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "borderTB")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();

            // Cells are read pairwise: (0,1), (2,3), ... become "label:value" lines;
            // a trailing unpaired cell gets an empty value.
            TableTag tabletwo = (TableTag)dtnode[0];
            StringBuilder ctxBuilder = new StringBuilder();
            for (int row = 0; row < tabletwo.RowCount; row++)
            {
                TableRow r = tabletwo.Rows[row];
                for (int k = 0; k < r.ColumnCount; k += 2)
                {
                    string label = r.Columns[k].ToPlainTextString().Trim();
                    string value = k + 1 < r.ColumnCount ? r.Columns[k + 1].ToPlainTextString().Trim() : string.Empty;
                    ctxBuilder.Append(label).Append(":").Append(value).Append("\r\n");
                }
            }
            string inviteCtx = ctxBuilder.ToString();

            string buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标人:", "").Replace("招标人/招标代理:", "").Trim();

            string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("建设单位:", "").Replace("建设地点:", "").Replace("项目地址", "").Replace(":", "").Trim();
            if (Encoding.Default.GetByteCount(prjAddress) > 200 || prjAddress == "")
            {
                prjAddress = "见招标信息";
            }

            string code = regcode.Match(inviteCtx).Value.Replace("项目编号:", "").Replace(":", "").Trim();

            // Missing, unparsable or future registration dates fall back to today. TryParse
            // keeps a malformed date on one page from terminating the whole crawl.
            string beginDate = inviteCtx.GetRegex("报名时间").GetDateRegex();
            DateTime parsedBegin;
            if (string.IsNullOrEmpty(beginDate) || !DateTime.TryParse(beginDate, out parsedBegin) || parsedBegin > DateTime.Now)
            {
                beginDate = DateTime.Now.ToString("yyyy-MM-dd");
            }

            string msgType = "珠海市建设工程交易中心";
            string specType = "建设工程";

            // Strip residue of Office namespace declarations from the text.
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();

            string inviteType = ToolHtml.GetInviteTypes(prjName);
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "珠海市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Re-scan the detail page for the download link inside the "Noprint" block.
            parserdetail.Reset();
            NodeList nodeListtwo = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Noprint")), true), new TagNameFilter("a")));
            if (nodeListtwo != null && nodeListtwo.Count > 0)
            {
                ATag attachLink = nodeListtwo[0] as ATag;
                if (attachLink != null)
                {
                    BaseAttach attach = ToolDb.GenBaseAttach("工作议程(点击下载)", info.Id, attachLink.Link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the grid "ctl00_Content_GridView1" of the site page by page (paging is done by
/// posting the ASP.NET view state back), opens every entry's detail page and saves one
/// <c>InviteInfo</c> per entry directly through <c>ToolDb.SaveEntity</c>. When a record was
/// saved, the attachment links of its detail content are downloaded and saved as well.
/// </summary>
/// <param name="crawlAll">When false, processing stops once <c>MaxCount</c> entries were handled.</param>
/// <returns>
/// The result list. Records are persisted inside this method and never added to the list,
/// so it is always empty.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int sqlCount = 0;
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
    }
    catch (Exception ex)
    {
        Logger.Error(ex.ToString());
        return list;
    }

    // The pager table contains ",共<n>页,"; <n> is the page count.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("cellspacing", "2"), new TagNameFilter("table")));
    if (sNode != null && sNode.Count > 0)
    {
        string pageString = sNode.AsString();
        Match pageMatch = new Regex(@",共[^页]+页,").Match(pageString);
        try
        {
            pageInt = int.Parse(pageMatch.Value.Replace(",共", "").Replace("页,", "").Trim());
        }
        catch (Exception)
        {
            // Best effort: without a readable pager only the first page is crawled.
        }
    }

    Regex regPrjAdd = new Regex(@"(工程地点|工程地址):[^\r\n]+[\r\n]{1}");
    Regex regInvType = new Regex(@"[^\r\n]+[\r\n]{1}");
    // Trailing notices are cut off: the text is reduced to what precedes each marker
    // (change dated 2013-11-19). Arrays keep the order of the two cuts fixed.
    string[] noticeMarkers = new string[] { "重要提示", "温馨提示" };
    Regex[] noticeCuts = new Regex[] { new Regex(@"([.\S\s]*)(?=重要提示)"), new Regex(@"([.\S\s]*)(?=温馨提示)") };

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(html);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$hdnPageCount" }, new string[] { "ctl00$Content$GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, pageInt.ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // A failed page request skips that page instead of aborting the crawl.
                Logger.Error(ex.ToString());
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_Content_GridView1"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }
        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first and the last row of the grid are not entries.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string remark = string.Empty;

            TableRow tr = table.Rows[j] as TableRow;
            // Columns 1, 2, 3, 5 and 6 are read below; shorter rows are skipped.
            if (tr == null || tr.ColumnCount < 7)
            {
                continue;
            }
            string code = tr.Columns[1].ToPlainTextString().Trim();
            string prjName = tr.Columns[2].ToPlainTextString().Trim();
            string buildUnit = tr.Columns[3].ToPlainTextString().Trim();
            string beginDate = tr.Columns[5].ToPlainTextString().Trim();
            string endDate = tr.Columns[6].ToPlainTextString().Trim();

            NodeList titleChildren = tr.Columns[2].Children;
            ATag aTag = (titleChildren != null && titleChildren.Count > 0) ? titleChildren[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;

            string HtmlTxt;
            string htmldetail;
            try
            {
                // One download serves both the stored HTML and the text extraction.
                string rawDetail = ToolHtml.GetHtmlByUrlEncode(InfoUrl, Encoding.UTF8).Replace(" ", "");

                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
                HtmlTxt = dtnodeHTML.AsHtml();

                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
            string inviteCtx = dtnode.AsString().Replace(" ", "");

            string prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            string msgType = "深圳市建设工程交易中心";
            string specType = "建设工程";

            // The tender type is derived from the first line of the content.
            string InvType = regInvType.Match(inviteCtx).Value;
            string inviteType = ToolHtml.GetInviteTypes(InvType);

            for (int m = 0; m < noticeMarkers.Length; m++)
            {
                if (inviteCtx.Contains(noticeMarkers[m]))
                {
                    inviteCtx = noticeCuts[m].Match(inviteCtx).Value;
                }
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳市工程", string.Empty, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                // Return the (empty) list like every other exit path, never null.
                return list;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            // Only for saved records: fetch and store the attachments of the content block.
            dtlparser.Reset();
            NodeList dlNodes = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (dlNodes == null)
            {
                continue;
            }
            for (int f = 0; f < dlNodes.Count; f++)
            {
                ATag fileTag = dlNodes[f] as ATag;
                if (fileTag == null || !fileTag.IsAtagAttach())
                {
                    continue;
                }
                try
                {
                    BaseAttach attach = ToolHtml.GetBaseAttach(fileTag.Link.Replace("..", "http://www.szjsjy.com.cn"), fileTag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // Best effort: a failing attachment must not stop the remaining ones.
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the tender list of ggzy.zhaoqing.gov.cn (list URL = <c>SiteUrl</c> followed by the
/// page number), downloads every entry's detail page and collects one <c>InviteInfo</c> per
/// entry whose detail page yields a body element.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "1", Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Page count: the text between "/" and "转到" inside the pager block.
    Parser parser = new Parser(new Lexer(html));
    NodeList matches = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "paging"), new TagNameFilter("div")));
    if (matches != null && matches.Count > 0)
    {
        string pagerText = matches[0].ToNodePlainString();
        try
        {
            pagerText = pagerText.GetRegexBegEnd("/", "转到");
            pageInt = int.Parse(pagerText);
        }
        catch
        {
            // Unreadable pager: only the first page is crawled.
        }
    }

    for (int pageNo = 1; pageNo <= pageInt; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + pageNo, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        matches = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "column-info-list"), new TagNameFilter("div")), true), new TagNameFilter("li")));
        if (matches == null || matches.Count <= 0)
        {
            continue;
        }

        for (int item = 0; item < matches.Count; item++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            ATag entryLink = matches[item].GetATag();
            string prjName = entryLink.LinkText.ToNodeString();
            string InfoUrl = "http://ggzy.zhaoqing.gov.cn" + entryLink.Link;
            string beginDate = matches[item].ToPlainTextString().GetDateRegex();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList bodyNodes = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (bodyNodes == null || bodyNodes.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = bodyNodes.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();

            // Cut the owner name right after the first "中心".
            string buildUnit = inviteCtx.GetBuildRegex();
            if (buildUnit.Contains("中心"))
            {
                buildUnit = buildUnit.Substring(0, buildUnit.IndexOf("中心")) + "中心";
            }

            string prjAddress = inviteCtx.GetAddressRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo record = ToolDb.GenInviteInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, "肇庆市公共资源交易中心", inviteType, "建设工程", otherType, InfoUrl, HtmlTxt);
            list.Add(record);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls tender notices from the dgzb.com.cn JSON list endpoint. For every record it
/// downloads the detail page, builds an <c>InviteInfo</c> from the <c>div.detail</c>
/// content and registers each attachment link found in that content in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every page is visited.
/// </param>
/// <returns>
/// The collected records (never <c>null</c>). The previous implementation returned
/// <c>null</c> once all pages had been walked, which discarded every collected record.
/// </returns>
/// <remarks>
/// Failures are handled best-effort: a list page that cannot be fetched or parsed as a JSON
/// object is skipped, as is any record whose detail page cannot be fetched. The superseded,
/// commented-out ASP.NET postback implementation that used to live in this method has been
/// removed.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string listUrl = "https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/findListByPage?fcInfotype=1&tenderkind=A&projecttendersite=SS&orderFiled=fcInfoenddate&orderValue=desc";
    const string detailUrlPrefix = "https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/jsdetail?publishId=";

    IList list = new ArrayList();
    string cookiestr = string.Empty;

    // The landing page is only used to discover the page count.
    string htl;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    int page = 1;
    Parser parser = new Parser(new Lexer(htl));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("script"), new HasAttributeFilter("type", "text/javascript")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The page count is the first quoted call argument ('N') in the inline scripts.
            // "徐鑫"/"凯德" are arbitrary placeholder markers substituted for "('" and "')"
            // so that GetRegexBegEnd can extract the text between them.
            string scriptText = pageNode.AsString().GetCtxBr();
            string marked = scriptText.Replace("('", "徐鑫").Replace("')", "凯德");
            page = int.Parse(marked.GetRegexBegEnd("徐鑫", "凯德"));
        }
        catch
        {
            // Page count not found or not numeric: crawl a single page.
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Regex regoType = new Regex(@"工程类型(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
            new string[] { "fcInfotitle", "currentPage" },
            new string[] { "", i.ToString() });

        Dictionary<string, object> smsTypeJson;
        try
        {
            htl = this.ToolWebSite.GetHtmlByUrl(listUrl, nvc, Encoding.UTF8);
            smsTypeJson = serializer.DeserializeObject(htl) as Dictionary<string, object>;
        }
        catch
        {
            // Network failure or malformed JSON: skip this page, keep what we have.
            continue;
        }
        if (smsTypeJson == null)
        {
            continue;
        }

        foreach (KeyValuePair<string, object> obj in smsTypeJson)
        {
            // Only array-valued members can hold records; anything else is ignored
            // instead of failing the whole crawl with an InvalidCastException.
            object[] array = obj.Value as object[];
            if (array == null)
            {
                continue;
            }

            foreach (object arrValue in array)
            {
                Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
                if (dic == null)
                {
                    continue;
                }

                // Without an id there is no detail page to fetch.
                object rawId;
                if (!dic.TryGetValue("id", out rawId))
                {
                    continue;
                }
                object rawCode, rawTitle, rawStart;
                dic.TryGetValue("fcTendersn", out rawCode);
                dic.TryGetValue("fcInfotitle", out rawTitle);
                dic.TryGetValue("fcInfostartdate", out rawStart);

                // Convert.ToString(object) yields string.Empty for a missing (null) value.
                string code = Convert.ToString(rawCode);
                string prjName = Convert.ToString(rawTitle);
                string beginDate = Convert.ToString(rawStart).GetDateRegex("yyyy-MM-dd");
                string xu = Convert.ToString(rawId);
                string InfoUrl = detailUrlPrefix + xu + "&fcInfotype=1";

                // Never populated by this crawler; stored empty.
                string endDate = string.Empty;
                string remark = string.Empty;

                string htmldetail;
                try
                {
                    // NOTE(review): as written this removes every space character from the
                    // page markup; presumably the literal was meant to be the "&nbsp;"
                    // entity - confirm against the original file encoding.
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace(" ", "");
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldetail));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                string HtmlTxt = dtnode.AsHtml();
                string inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();
                string prjAddress = inviteCtx.GetRegexBegEnd("工程地址:", "\r");
                string buildUnit = inviteCtx.GetRegexBegEnd("建设单位:", "\r");
                string msgType = "东莞市建设工程交易中心";
                string specType = "建设工程";
                string otherType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();

                inviteCtx = inviteCtx.Replace("ctl00_cph_context_span_MetContent", "")
                                     .Replace("<span id=", "")
                                     .Replace("</span>", "")
                                     .Replace(">", "")
                                     .Trim();
                if (string.IsNullOrEmpty(buildUnit))
                {
                    buildUnit = "见招标信息";
                }
                string inviteType = ToolHtml.GetInviteTypes(prjName);

                InviteInfo info = ToolDb.GenInviteInfo(
                    "广东省", "东莞市区", "", string.Empty,
                    code, prjName, prjAddress, buildUnit,
                    beginDate, endDate, inviteCtx, remark,
                    msgType, inviteType, specType, otherType,
                    InfoUrl, HtmlTxt);
                list.Add(info);

                // Attachment search: every anchor inside the detail content that
                // IsAtagAttach() accepts is registered as an attachment of this record.
                Parser attachParser = new Parser(new Lexer(HtmlTxt));
                NodeList aTagNode = attachParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aTagNode != null)
                {
                    for (int k = 0; k < aTagNode.Count; k++)
                    {
                        ATag aTag = aTagNode[k].GetATag();
                        if (aTag == null || !aTag.IsAtagAttach())
                        {
                            continue;
                        }

                        // NOTE(review): as written this Replace maps "&" to itself (a no-op);
                        // presumably it was meant to decode "&amp;" - confirm.
                        string linkurl = aTag.Link.Replace("&", "&");

                        // Strip whatever sits between the first "&" and the following "id".
                        string aa = linkurl.GetRegexBegEnd("&", "id");
                        string cc = string.IsNullOrEmpty(aa) ? linkurl : linkurl.Replace(aa, "");

                        BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, cc);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls procurement tender notices from the yantian.gov.cn list pages and builds one
/// <c>InviteInfo</c> per list entry whose detail page contains a <c>div#content</c> element.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every list page is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be fetched.
/// </returns>
/// <remarks>
/// The page count is now read with <c>int.TryParse</c>: when the "共N页" marker is missing
/// or not numeric the crawl falls back to a single page instead of throwing.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string cookiestr = string.Empty;

    string htl;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the number inside "共N页" in the right-aligned pager table.
    int page = 1;
    Parser parser = new Parser(new Lexer(htl));
    NodeList tableNodeList = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "right")));
    if (tableNodeList != null && tableNodeList.Count > 0)
    {
        Regex regexPage = new Regex(@"共\d+页");
        string pageText = regexPage.Match(tableNodeList.AsString()).Value.Trim(new char[] { '共', '页' });
        int parsedPage;
        if (int.TryParse(pageText, out parsedPage))
        {
            page = parsedPage;
        }
    }

    // Loop-invariant patterns, built once.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regCode = new Regex(@"(项目序号|招标编号)(:|:)[^\r\n]+\r\n");
    Regex regBuidUnit = new Regex(@"(招标人|建设单位)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page i of the list is served as index_{i-1}.shtml.
                htl = this.ToolWebSite.GetHtmlByUrl("http://www.yantian.gov.cn/cn/zwgk/zfcg/zbgg/index_" + (i - 1).ToString() + ".shtml", Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "565")),
                    true),
                new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // Link of the previously accepted table; consecutive tables pointing at the same
        // link are treated as duplicates (presumably nested tables repeating one row).
        string url = string.Empty;
        for (int j = 0; j < nodeList.Count; j++)
        {
            string beg = nodeList[j].ToPlainTextString().GetDateRegex();
            if (string.IsNullOrEmpty(beg))
            {
                continue;
            }

            string href = nodeList[j].GetATagHref();
            if (j > 0 && href == url)
            {
                continue;
            }
            url = href;

            TableTag table = nodeList[j] as TableTag;
            if (table == null)
            {
                continue;
            }

            // Never populated by this crawler; stored empty.
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            // NOTE(review): both Replace calls map a character to itself as written;
            // presumably they were meant to normalize full-width parentheses - confirm.
            string prjName = table.GetATagValue("title").Replace(")", ")").Replace("(", "(");
            string InfoUrl = "http://www.yantian.gov.cn" + table.GetATagValue();
            string beginDate = beg;

            // Drop a trailing "[...]" suffix from the title.
            if (prjName.Contains("["))
            {
                prjName = prjName.Remove(prjName.IndexOf("["));
            }

            string htmldetail;
            try
            {
                // NOTE(review): the first Replace removes every space character as written;
                // presumably the literal was meant to be "&nbsp;" - confirm.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").Replace("<br />", "\r\n").Trim();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            string inviteCtx = dtnode.AsString().Replace(" ", "").Trim();
            inviteCtx = regexHtml.Replace(inviteCtx, "");

            // Values that do not fit the size limits checked here are blanked.
            string code = regCode.Match(inviteCtx).Value.Replace("招标编号:", "").Replace("项目序号:", "").Trim();
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }

            string msgType = "深圳市盐田区政府采购中心";
            string specType = "建设工程";

            string buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                buildUnit = "";
            }

            // The address is never extracted from the page, so the fixed placeholder is
            // always used (the former emptiness/length checks on it could never differ).
            string prjAddress = "见招标信息";

            inviteCtx = inviteCtx.Replace("<ahref=", "")
                                 .Replace("/service/", "")
                                 .Replace("</a>", "")
                                 .Replace("您是第", "")
                                 .Replace("位访问者粤ICP备06000803号", "")
                                 .Replace(">", "")
                                 .Trim();
            prjName = prjName.Replace("“", "").Replace("”", "").Trim();
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo(
                "广东省", "深圳区及街道工程", "", string.Empty,
                code, prjName, prjAddress, buildUnit,
                beginDate, endDate, inviteCtx, remark,
                msgType, inviteType, specType, otherType,
                InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls bid-result notices from the paged list at <c>SiteUrl</c>. For every entry it
/// downloads the detail page, builds a <c>BidInfo</c> from the <c>div.content</c> element
/// and registers each attachment link found in that content in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every list page is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be fetched.
/// </returns>
/// <remarks>
/// List entries without an anchor or without a <c>title</c> attribute are skipped instead of
/// raising a <c>NullReferenceException</c> that would abort the whole crawl.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    // Page count: the number between "总" and "页" in the pager panel.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagePanel")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string pageTemp = tdNodes.AsString().Replace(" ", "").Replace(" ", "").Trim();
        try
        {
            pageInt = int.Parse(pageTemp.GetRegexBegEnd("总", "页"));
        }
        catch (Exception)
        {
            // Page count not found or not numeric: crawl a single page.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ST12")),
                    true),
                new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            // Entries without a title, and entries whose title contains "声明", are skipped.
            if (prjName == null || prjName.Contains("声明"))
            {
                continue;
            }

            // Never populated by this crawler; stored empty.
            string endDate = string.Empty;
            string otherType = string.Empty;

            string beginDate = nodeList[j].ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link;

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new HasAttributeFilter("class", "content"), new TagNameFilter("div")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml().GetReplace("<!--[if !supportLists]-->,<!--[endif]-->");
            string bidCtx = HtmlTxt.ToCtxString();
            string code = bidCtx.GetCodeRegex().GetCodeDel();

            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购单位,招标代理");
            }

            // Cut the winner name right after the first "公司".
            string bidUnit = bidCtx.GetBidRegex();
            if (bidUnit.Contains("公司"))
            {
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }

            string bidMoney = bidCtx.GetMoneyRegex();
            if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标金额" }, false, "万元");
            }
            string prjMgr = bidCtx.GetMgrRegex();

            // Amounts of 100000 or more are divided by 10000; unparseable amounts are
            // left exactly as extracted.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount >= 100000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            string specType = "政府采购";
            string msgType = "中国远东国际招标公司";
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo(
                "广东省", "深圳社会招标", "", string.Empty,
                code, prjName, buildUnit, beginDate, bidUnit,
                beginDate, endDate, bidCtx, string.Empty,
                msgType, bidType, specType, otherType,
                bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Attachment search inside the detail content.
            dtlparser = new Parser(new Lexer(HtmlTxt));
            NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (FileTag != null)
            {
                for (int f = 0; f < FileTag.Count; f++)
                {
                    ATag file = FileTag[f] as ATag;
                    if (file == null || !file.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links that do not contain "http" are resolved against the site root.
                    string link = file.Link.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0
                        ? file.Link
                        : "http://www.cfet.com.cn/" + file.Link;

                    BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls bid-result notices from the Gaoming list pages under <c>SiteUrl</c>. For every
/// list entry it downloads the detail page, reads the result table (code, tendering unit,
/// winning unit, manager, winning price) and builds a <c>BidInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every list page is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be fetched.
/// </returns>
/// <remarks>
/// A record is added for every entry whose detail page could be downloaded, even when the
/// expected content container is missing (the extracted fields then stay empty).
/// Changes over the previous version: the detail page is downloaded once instead of twice,
/// and list entries or table rows that lack the expected elements are skipped instead of
/// aborting the crawl with an exception.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        // NOTE(review): as written this removes every space character from the markup;
        // presumably the literal was meant to be "&nbsp;" - confirm.
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default).Replace(" ", "");
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: the first argument of the createPageHTML(...) call in the pager div.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_21 tc")));
    if (sNode != null && sNode.Count > 0)
    {
        string pageString = sNode.AsString().Trim();
        Regex regexPage = new Regex(@"createPageHTML\([^\)]+\)");
        Match pageMatch = regexPage.Match(pageString);
        string firstArg = pageMatch.Value.Replace("createPageHTML(", "").Replace(")", "").Split(',')[0].Trim();
        int parsedPage;
        if (int.TryParse(firstArg, out parsedPage))
        {
            pageInt = parsedPage;
        }
    }

    // Loop-invariant patterns, built once.
    Regex regCtx = new Regex(@"[\n]+");
    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
    Regex regmoneyctx = new Regex(@"[0-9]+[\%]");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page i of the list is served as index_{i-1}.html.
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "index_" + (i - 1).ToString() + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ny_22"))),
                new TagNameFilter("li")));
        if (sNode == null || sNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < sNode.Count; j++)
        {
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, bidCtx = string.Empty, prjMgr = string.Empty,
                   HtmlTxt = string.Empty;
            // Never populated by this crawler; stored empty.
            string endDate = string.Empty;
            string otherType = string.Empty;

            // Each entry needs its first anchor (title/link) and its second div (date).
            INode node = sNode[j];
            if (node.Children == null)
            {
                continue;
            }
            NodeList anchors = node.Children.SearchFor(typeof(ATag), true);
            NodeList divs = node.Children.SearchFor(typeof(Div), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            Div divTag = (divs != null && divs.Count > 1) ? divs[1] as Div : null;
            if (aTag == null || divTag == null)
            {
                continue;
            }

            string prjName = aTag.ToPlainTextString().Trim();
            string beginDate = divTag.ToPlainTextString().Trim(new char[] { '[', ']', ' ' });
            string InfoUrl = aTag.Link.Replace("./", "http://ztb.gaoming.gov.cn/jsgc/zbjg/");

            // One download serves both views of the page: the raw markup kept as HtmlTxt,
            // and a copy with <br> variants turned into line breaks for text extraction.
            string rawDetail;
            try
            {
                rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");
                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(
                    new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));
                HtmlTxt = dtnodeHTML.AsHtml();
            }
            catch (Exception)
            {
                continue;
            }
            string htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));
            if (dtnode != null && dtnode.Count > 0)
            {
                bidCtx = regCtx.Replace(dtnode.AsString().Replace(" ", "").Trim(), "\r\n");

                NodeList tables = dtnode.SearchFor(typeof(TableTag), true);
                TableTag table = (tables != null && tables.Count > 0) ? tables[0] as TableTag : null;
                if (table != null)
                {
                    for (int dl = 0; dl < table.RowCount; dl++)
                    {
                        TableRow tr = table.Rows[dl];
                        // Only label/value rows carry data.
                        if (tr.ColumnCount < 2)
                        {
                            continue;
                        }

                        string label = tr.Columns[0].ToPlainTextString();
                        string cellText = tr.Columns[1].ToPlainTextString();

                        if (label.Contains("编号"))
                        {
                            code = cellText.Trim();
                        }
                        else if (label.Contains("招标单位"))
                        {
                            buildUnit = cellText.Trim();
                        }
                        else if (label.Contains("中标单位"))
                        {
                            bidUnit = cellText.Trim();
                        }
                        else if (label.Contains("建造师") || label.Contains("负责人") || label.Contains("法定代表人"))
                        {
                            prjMgr = cellText.Replace(" ", "").Trim();
                        }
                        else if (label.Contains("中标价"))
                        {
                            // Percentages are removed first so they are not taken for the price.
                            string bidMoneyctx = regmoneyctx.Replace(cellText, "");
                            if (!string.IsNullOrEmpty(bidMoneyctx))
                            {
                                string amountText = regBidMoney.Match(bidMoneyctx).Value;
                                if (cellText.Contains("万元"))
                                {
                                    bidMoney = amountText;
                                }
                                else
                                {
                                    // No "万元" unit: divide by 10000; results below 0.1 and
                                    // unparseable amounts are stored as "0".
                                    decimal amount;
                                    if (decimal.TryParse(amountText, out amount))
                                    {
                                        decimal converted = amount / 10000;
                                        bidMoney = converted < 0.1m ? "0" : converted.ToString();
                                    }
                                    else
                                    {
                                        bidMoney = "0";
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // Over-long multi-section winner text: keep only the first line and strip the
            // "第一标段" label and colons from it.
            if (Encoding.Default.GetByteCount(bidUnit) > 150 && bidUnit.Contains("第二标段"))
            {
                int lineEnd = bidUnit.IndexOf("\n");
                if (lineEnd >= 0)
                {
                    bidUnit = bidUnit.Remove(lineEnd).Replace("第一标段", "").Replace(":", "").Replace(":", "");
                }
            }

            string msgType = "佛山市高明区建设工程交易中心";
            string specType = "建设工程";
            prjName = ToolDb.GetPrjName(prjName);
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo(
                "广东省", "佛山市区", "高明区", string.Empty,
                code, prjName, buildUnit, beginDate, bidUnit,
                beginDate, endDate, bidCtx, string.Empty,
                msgType, bidType, specType, otherType,
                bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls tender notices from the szairport.com JSON list endpoint: posts a single request
/// for the first <c>MaxCount</c> records, then downloads each record's detail page and builds
/// an <c>InviteInfo</c> from its <c>div.newsBox</c> element.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records. The list is empty when the list request fails or its response does
/// not have the expected <c>recordData.records</c> shape.
/// </returns>
/// <remarks>
/// Previously a failed list request was swallowed and the empty response was then
/// deserialized and dereferenced, which raised an exception; the method now returns the
/// empty list in that case. Records that are not JSON objects or have no <c>seqNo</c> are
/// skipped.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    string url = "https://zhaobiao.szairport.com/SZWI/portal/homeInformListJson.do";
    IList list = new ArrayList();
    string cookieStr = string.Empty;

    Dictionary<string, object> smsTypeJson;
    try
    {
        string post = string.Format("start={0}&limit={1}", 0, this.MaxCount);
        string html = ToolHtml.GetHtmlByUrlPost(url, post, Encoding.UTF8, ref cookieStr);
        JavaScriptSerializer serializer = new JavaScriptSerializer();
        smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
    }
    catch (Exception)
    {
        // Network failure or malformed JSON: nothing to crawl.
        return list;
    }
    if (smsTypeJson == null)
    {
        return list;
    }

    object recordData;
    if (!smsTypeJson.TryGetValue("recordData", out recordData))
    {
        return list;
    }
    Dictionary<string, object> tempDic = recordData as Dictionary<string, object>;
    if (tempDic == null)
    {
        return list;
    }

    object records;
    if (!tempDic.TryGetValue("records", out records))
    {
        return list;
    }
    object[] objList = records as object[];
    if (objList == null)
    {
        return list;
    }

    foreach (object obj in objList)
    {
        Dictionary<string, object> dic = obj as Dictionary<string, object>;
        if (dic == null)
        {
            continue;
        }

        // Without a sequence number there is no detail page to fetch.
        object rawSeqNo;
        if (!dic.TryGetValue("seqNo", out rawSeqNo))
        {
            continue;
        }
        object rawTitle, rawRelease;
        dic.TryGetValue("title", out rawTitle);
        dic.TryGetValue("releaseTimeStr", out rawRelease);

        // Convert.ToString(object) yields string.Empty for a missing (null) value.
        string prjName = Convert.ToString(rawTitle);
        string beginDate = Convert.ToString(rawRelease);
        string seqNo = Convert.ToString(rawSeqNo);
        string InfoUrl = "http://zhaobiao.szairport.com/SZWI/portal/homeInformView.do?seqNo=" + seqNo;

        // Never populated by this crawler; stored empty.
        string endDate = string.Empty;
        string remark = string.Empty;
        string otherType = string.Empty;

        string htmldtl;
        try
        {
            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
        }
        catch
        {
            continue;
        }

        Parser parser = new Parser(new Lexer(htmldtl));
        NodeList dtlNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "newsBox")));
        if (dtlNode == null || dtlNode.Count == 0)
        {
            continue;
        }

        string HtmlTxt = dtlNode.AsHtml();
        string inviteCtx = HtmlTxt.ToCtxString();
        string buildUnit = inviteCtx.GetBuildRegex();
        string prjAddress = inviteCtx.GetAddressRegex();

        // Codes containing "__" are discarded.
        string code = inviteCtx.GetCodeRegex().GetCodeDel();
        if (code.Contains("__"))
        {
            code = "";
        }

        string specType = "其他";
        string msgType = "深圳宝安国际机场";
        string inviteType = ToolHtml.GetInviteTypes(prjName);

        InviteInfo info = ToolDb.GenInviteInfo(
            "广东省", "深圳社会招标", "", string.Empty,
            code, prjName, prjAddress, buildUnit,
            beginDate, endDate, inviteCtx, remark,
            msgType, inviteType, specType, otherType,
            InfoUrl, HtmlTxt);
        list.Add(info);

        // NOTE(review): this loop resolves the URL of every attachment link in the detail
        // content but never stores it anywhere, so no attachment is registered for the
        // record. Presumably a call that creates and registers the attachment is missing
        // here - confirm the intended behaviour before completing or removing the loop.
        parser = new Parser(new Lexer(HtmlTxt));
        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (aNode != null)
        {
            for (int a = 0; a < aNode.Count; a++)
            {
                ATag aTag = aNode[a] as ATag;
                if (aTag == null || !aTag.IsAtagAttach())
                {
                    continue;
                }

                // Links that do not contain "http" are resolved against the site root.
                string fileUrl = aTag.Link.Contains("http")
                    ? aTag.Link
                    : "http://zhaobiao.szairport.com/" + aTag.Link;
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls tender notices from the paged table at <c>SiteUrl</c>. For every data row it
/// downloads the detail page, flattens the detail table into "label:value" lines and builds
/// an <c>InviteInfo</c> from them.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected;
/// when <c>true</c>, every list page is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first list page cannot be fetched.
/// </returns>
/// <remarks>
/// Detail-table cells are consumed as (label, value) pairs. A trailing unpaired cell in a
/// row with an odd number of cells is now ignored; previously it caused an out-of-range
/// column access. List rows with fewer than four cells are skipped for the same reason.
/// </remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count: taken from the "共…项 …页" pager text.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "input-group-addon")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        try
        {
            string pagerText = tdNodes.AsString();
            string reTemp = pagerText.GetRegexBegEnd("共", "项");
            string pageTemp = pagerText.GetRegexBegEnd("项", "页").GetReplace("共,项,页," + reTemp + ",,");
            pageInt = int.Parse(pageTemp);
        }
        catch (Exception)
        {
            // Page count not found or not numeric: crawl a single page.
        }
    }

    // Loop-invariant patterns, built once.
    Regex regPrjAddr = new Regex(@"工程地址:[^\r\n]+\r\n");
    Regex regoType = new Regex(@"工程类型:[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page i of the list is requested with pi = i - 1.
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?pi=" + (i - 1), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)nodeList[0];
        // Row 0 is skipped (presumably the header row).
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            // Columns 1..3 are read below; shorter rows cannot be data rows.
            if (tr.ColumnCount < 4)
            {
                continue;
            }

            // Never populated by this crawler; stored empty.
            string code = string.Empty;
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            string prjName = tr.Columns[1].ToPlainTextString().Trim();
            string buildUnit = tr.Columns[2].ToPlainTextString().Trim();
            string beginDate = tr.Columns[3].ToPlainTextString().Trim();
            string InfoUrl = "http://www.bajsjy.com/" + tr.Columns[1].GetATagHref();

            string htmldetail;
            try
            {
                // Header cells are rewritten to data cells so they show up as table columns.
                // NOTE(review): the last Replace removes every space character as written;
                // presumably the literal was meant to be "&nbsp;" - confirm.
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8)
                    .Replace("<th", "<td").Replace("</th>", "</td>").Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList nodeDetailList = parserdetail.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
            if (nodeDetailList == null || nodeDetailList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = nodeDetailList.AsHtml();
            TableTag tabledetail = (TableTag)nodeDetailList[0];

            // Flatten the detail table into "label:value" lines, two cells at a time.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int r = 0; r < tabledetail.RowCount; r++)
            {
                TableRow trdetail = tabledetail.Rows[r];
                for (int c = 0; c + 1 < trdetail.ColumnCount; c += 2)
                {
                    string tr1 = trdetail.Columns[c].ToPlainTextString().Trim();
                    string tr2 = trdetail.Columns[c + 1].ToPlainTextString().Trim();

                    // Form controls override the plain cell text: with several inputs the
                    // checked one wins, a single input contributes its value.
                    NodeList inptList = trdetail.Columns[c + 1].SearchFor(typeof(InputTag), true);
                    if (inptList != null && inptList.Count > 0)
                    {
                        if (inptList.Count > 1)
                        {
                            for (int inp = 0; inp < inptList.Count; inp++)
                            {
                                InputTag inputTage = (InputTag)inptList[inp];
                                if (inputTage.GetAttribute("checked") == "checked")
                                {
                                    tr2 = inputTage.GetAttribute("value");
                                }
                            }
                        }
                        else
                        {
                            InputTag inputTage = (InputTag)inptList[0];
                            tr2 = inputTage.GetAttribute("value");
                        }
                    }

                    // A select element contributes the text of its selected option(s).
                    NodeList selList = trdetail.Columns[c + 1].SearchFor(typeof(SelectTag), true);
                    if (selList != null && selList.Count > 0)
                    {
                        SelectTag selTag = (SelectTag)selList[0];
                        NodeList opList = new NodeList();
                        selTag.CollectInto(opList, new HasAttributeFilter("selected", "selected"));
                        tr2 = opList.AsString();
                    }

                    ctxBuilder.Append(tr1).Append(":").Append(tr2).Append("\r\n");
                }
            }
            string inviteCtx = ctxBuilder.ToString();

            string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地址:", "").Trim();

            // Map the free-text project type onto the fixed category names.
            string oType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建"))
            {
                otherType = "房建及工业民用建筑";
            }
            else if (oType.Contains("市政"))
            {
                otherType = "市政工程";
            }
            else if (oType.Contains("园林绿化"))
            {
                otherType = "园林绿化工程";
            }
            else if (oType.Contains("装饰") || oType.Contains("装修"))
            {
                otherType = "装饰装修工程";
            }
            else if (oType.Contains("电力"))
            {
                otherType = "电力工程";
            }
            else if (oType.Contains("水利"))
            {
                otherType = "水利工程";
            }
            // Checked separately on purpose: "环保" overrides any category chosen above.
            if (oType.Contains("环保"))
            {
                otherType = "环保工程";
            }

            string msgType = "深圳市建设工程交易中心宝安分中心";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo(
                "广东省", "深圳宝安区工程", "宝安区", string.Empty,
                code, prjName, prjAddress, buildUnit,
                beginDate, endDate, inviteCtx, remark,
                msgType, bidType, specType, otherType,
                InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged listing at <c>SiteUrl</c>, opens every linked detail page on
/// www.czjsw.net and converts each one into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string htl;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        // Without the first listing page there is nothing to crawl.
        return list;
    }

    // The page count is read from the href of the last <a> under div#page:
    // everything after the final '=' is taken as the number of pages.
    int page = 1;
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "page")), true), new TagNameFilter("a")));
    if (nodeList != null && nodeList.Count > 0)
    {
        try
        {
            string temp = nodeList[nodeList.Count - 1].GetATagHref();
            temp = temp.Remove(0, temp.LastIndexOf('=') + 1);
            int parsed;
            if (int.TryParse(temp, out parsed))
            {
                page = parsed;
            }
        }
        catch (Exception)
        {
            // Best effort: if the pager link cannot be read, only the first page is crawled.
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("id", "listul")), true), new TagNameFilter("li")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < tableNodeList.Count; j++)
        {
            ATag aTag = tableNodeList[j].GetATag();
            if (aTag == null)
            {
                // A list entry without a link cannot be followed; skip it instead of
                // letting a NullReferenceException abort the whole crawl.
                continue;
            }

            string prjName = aTag.LinkText;
            string InfoUrl = "http://www.czjsw.net" + aTag.Link.Replace("amp;", "").Trim();
            string beginDate = tableNodeList[j].ToPlainTextString().GetDateRegex();

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "").GetJsString();
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            // The extracted fragment is parsed a second time with the same filter, exactly as
            // before; the record is only produced when this second pass also finds div#content.
            Parser contentParser = new Parser(new Lexer(dtnode.AsHtml()));
            NodeList tableNode = contentParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
            if (tableNode == null || tableNode.Count == 0)
            {
                continue;
            }

            string HtmlTxt = tableNode.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            string bidUnit = bidCtx.GetRegexBegEnd("第一中标候选人为", ",").Replace(":", "").Replace("“", "").Replace("”", "");
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("第一中标候选人", ",").Replace(":", "").Replace("“", "").Replace("”", "");
            }

            string bidMoney = bidCtx.GetMoneyRegex().GetMoney();

            // NOTE(review): the original applied the ':' replacement and the '(' cut twice with
            // identical literals; one of each pair may have targeted a full-width variant - confirm.
            string prjMgr = bidCtx.GetRegexBegEnd("项目经理", ";").Replace(":", "");
            if (string.IsNullOrEmpty(prjMgr))
            {
                prjMgr = bidCtx.GetRegexBegEnd("项目负责人", ",").Replace(":", "");
            }

            // Keep only the text in front of the first ';' and the first '('.
            int cut = prjMgr.IndexOf(';');
            if (cut >= 0)
            {
                prjMgr = prjMgr.Remove(cut);
            }

            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = bidCtx.GetBuildRegex();

            cut = prjMgr.IndexOf('(');
            if (cut >= 0)
            {
                prjMgr = prjMgr.Remove(cut);
            }

            // A value of 50 bytes or more is discarded rather than stored.
            if (Encoding.Default.GetByteCount(prjMgr) >= 50)
            {
                prjMgr = "";
            }

            string msgType = "潮州市建设工程交易中心";
            string specType = "建设工程";
            string endDate = string.Empty;
            string otherType = string.Empty;

            // The invite type is derived from the raw title, before the title is normalised.
            string bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "潮州市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Builds the POST form used to query one page of the listing for the given date window.
/// </summary>
/// <param name="windowStart">Value sent as TIMEBEGIN_SHOW and TIMEBEGIN.</param>
/// <param name="windowEnd">Value sent as TIMEEND_SHOW and TIMEEND.</param>
/// <param name="pageNumber">Value sent as PAGENUMBER.</param>
/// <returns>The name/value collection produced by <c>ToolWebSite.GetNameValueCollection</c>.</returns>
private NameValueCollection BuildDealQueryForm(DateTime windowStart, DateTime windowEnd, string pageNumber)
{
    return this.ToolWebSite.GetNameValueCollection(
        new string[] { "TIMEBEGIN_SHOW", "TIMEEND_SHOW", "TIMEBEGIN", "TIMEEND", "DEAL_TIME", "DEAL_CLASSIFY", "DEAL_STAGE", "DEAL_PROVINCE", "DEAL_CITY", "DEAL_PLATFORM", "DEAL_TRADE", "isShowAll", "PAGENUMBER", "FINDTXT" },
        new string[] { windowStart.ToString(), windowEnd.ToString(), windowStart.ToString(), windowEnd.ToString(), "02", "01", "0101", "0", "0", "0", "0", "1", pageNumber, "" });
}

/// <summary>
/// Queries <c>SiteUrl</c> for the last 90 days, walks every result page and converts each entry
/// whose fourth "span_on" label contains "招标/资审公告" into an <c>InviteInfo</c> record, adding a
/// <c>BaseAttach</c> entry when the detail body contains a download link.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; an empty list when the first query fails.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    DateTime windowEnd = DateTime.Today;
    DateTime windowStart = windowEnd.AddDays(-90);
    IList list = new ArrayList();
    string cookiestr = string.Empty;

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, this.BuildDealQueryForm(windowStart, windowEnd, "1"), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        // Nothing can be listed without the first result page.
        return list;
    }

    // The pager text has the form "共 N 页"; N is the number of result pages.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")), true), new TagNameFilter("span")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            int parsed;
            if (int.TryParse(pageNode.AsString().GetRegexBegEnd("共", "页"), out parsed))
            {
                pageInt = parsed;
            }
        }
        catch (Exception)
        {
            // Best effort: fall back to a single page when the pager cannot be read.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            NameValueCollection nvc = this.BuildDealQueryForm(windowStart, windowEnd, i.ToString());
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "publicont")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            INode node = listNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Each entry carries several span.span_on labels: index 0 is passed to GetPrivoce
            // below, index 3 is the announcement kind used to filter entries.
            string sehu = string.Empty;
            string nlse = string.Empty;
            Parser entryParser = new Parser(new Lexer(node.ToHtml()));
            NodeList txtNode = entryParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "span_on")));
            if (txtNode != null && txtNode.Count > 0)
            {
                sehu = txtNode[0].ToNodePlainString();

                // Only read index 3 when it exists; previously an entry with fewer labels
                // raised an exception that aborted the whole crawl.
                if (txtNode.Count > 3)
                {
                    nlse = txtNode[3].ToNodePlainString();
                }
            }

            if (!nlse.Contains("招标/资审公告"))
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string prjName = aTag.GetAttribute("title");
            string beginDate = node.ToPlainTextString().GetDateRegex();
            string InfoUrl = aTag.Link.GetReplace("amp;");

            string htmlDtl;
            try
            {
                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
            }
            catch (Exception)
            {
                continue;
            }

            // When the page only links to the real announcement (div#div_0101 with an onclick
            // handler), follow that link and use its content instead.
            Parser detailParser = new Parser(new Lexer(htmlDtl));
            NodeList zsList = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "div_0101")));
            if (zsList != null && zsList.Count > 0)
            {
                try
                {
                    INode nodezs = zsList[0];
                    ATag aTagzs = nodezs.GetATag();
                    string urlzs = aTagzs.GetAttribute("onclick");
                    string urls = urlzs.GetReplace("showdetail(this, '0101','", "").GetReplace("')", "").Replace(",", "").Replace(")", "");
                    urls = "http://www.ggzy.gov.cn/information" + urls;
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(urls, Encoding.UTF8);
                    htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                }
                catch (Exception)
                {
                    // Skip this entry, as for a failed primary fetch. The previous rethrow
                    // aborted the crawl and discarded every record collected so far.
                    continue;
                }
            }

            detailParser = new Parser(new Lexer(htmlDtl));
            NodeList dtlList = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtlList.AsHtml();
            string inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();

            // Append the "original link" line when the body has a p.p_o paragraph with a link.
            string ctxUrl = string.Empty;
            try
            {
                Parser parurl = new Parser(new Lexer(HtmlTxt));
                NodeList zsUrl = parurl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "p_o")));
                if (zsUrl != null && zsUrl.Count > 0)
                {
                    ATag aTagurl = zsUrl[0].GetATag();
                    if (aTagurl != null)
                    {
                        ctxUrl = "原文链接地址 : " + aTagurl.Link;
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: the link line is optional.
            }

            inviteCtx = inviteCtx + ctxUrl;

            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = inviteCtx.GetRegex("招标人");
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);

            // No code in the running text: rebuild "label:value" lines from the first table of
            // the body and use them to fill in whichever fields are still blank.
            if (string.IsNullOrWhiteSpace(code))
            {
                Parser tableParser = new Parser(new Lexer(HtmlTxt));
                NodeList bidNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (bidNode != null && bidNode.Count > 0)
                {
                    StringBuilder tableText = new StringBuilder();
                    TableTag bidTable = bidNode[0] as TableTag;
                    try
                    {
                        for (int r = 0; bidTable != null && r < bidTable.RowCount; r++)
                        {
                            // Rows with fewer than two cells are skipped so that one such row
                            // no longer cuts off all the rows after it.
                            if (bidTable.Rows[r].ColumnCount < 2)
                            {
                                continue;
                            }

                            tableText.Append(bidTable.Rows[r].Columns[0].ToNodePlainString()).Append(":");
                            tableText.Append(bidTable.Rows[r].Columns[1].ToNodePlainString()).Append("\r\n");
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: keep whatever rows were read before the failure.
                    }

                    string ctx = tableText.ToString();
                    if (string.IsNullOrWhiteSpace(buildUnit))
                    {
                        buildUnit = ctx.GetBuildRegex();
                    }

                    if (string.IsNullOrWhiteSpace(prjAddress))
                    {
                        prjAddress = ctx.GetAddressRegex();
                    }

                    if (string.IsNullOrWhiteSpace(code))
                    {
                        code = ctx.GetCodeRegex();
                    }
                }
            }

            string msgType = "国家信息中心";
            string specType = "建设工程";

            // The stored invite type is this constant; the earlier title-derived value was
            // always overwritten before use and has been dropped.
            string inviteType = "建设工程";

            string[] provs = GetPrivoce(sehu);
            InviteInfo info = ToolDb.GenInviteInfo(provs[0], provs[1], "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register the first link found directly under div.detail_content as an attachment.
            try
            {
                Parser attachParser = new Parser(new Lexer(HtmlTxt));
                NodeList nodeFm = attachParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_content"))));

                // Check the list that is actually indexed (this used to test dtlList).
                if (nodeFm != null && nodeFm.Count > 0)
                {
                    INode nodFm = nodeFm[0];
                    ATag attachLink = nodFm.GetATag();
                    if (attachLink != null)
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach("内容(点击下载)", info.Id, attachLink.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: a record without an attachment is still valid.
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged "recordlist" table at <c>SiteUrl</c>, opens each linked detail page on
/// www.nmgp.gov.cn and converts it into a <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; an empty list when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Get the page count.
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception ex)
    {
        Logger.Error(ex.ToString());
        return list;
    }

    // The second-to-last link under div.pagenumber holds the number of the last page.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagenumber")), true), new TagNameFilter("a")));
    if (sNodes != null && sNodes.Count > 1)
    {
        int parsed;
        if (int.TryParse(sNodes[sNodes.Count - 2].ToPlainTextString(), out parsed))
        {
            pageInt = parsed;
        }
    }

    // Strips <script> blocks and <?xml ... /> fragments from detail pages before parsing.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pos=" + i.ToString(), Encoding.Default);
            }
            catch (Exception ex)
            {
                Logger.Error(ex.ToString());

                // Skip this page; otherwise the previous page's HTML would be parsed again
                // and every one of its rows recorded a second time.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "recordlist")));
        if (nodes == null || nodes.Count == 0)
        {
            continue;
        }

        TableTag table = nodes[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int t = 0; t < table.RowCount; t++)
        {
            TableRow tr = table.Rows[t];
            if (tr.ColumnCount < 2)
            {
                // Both the title cell and the date cell are needed.
                continue;
            }

            ATag alink = tr.Columns[0].GetATag();
            if (alink == null)
            {
                // No link in the first cell: there is no detail page to open.
                continue;
            }

            string endDate = tr.Columns[1].ToPlainTextString().GetDateRegex();
            string prjName = tr.Columns[0].GetATagValue("title");
            string InfoUrl = "http://www.nmgp.gov.cn" + alink.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "").Trim();
            }
            catch (Exception)
            {
                continue;
            }

            htmldtl = regexHtml.Replace(htmldtl, "");

            Parser dtlparserHTML = new Parser(new Lexer(htmldtl));
            NodeList nodesDtl = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "hlcms_9")));
            if (nodesDtl == null || nodesDtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = nodesDtl.ToHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            // Publication date comes from div.yzhang inside the detail body.
            string beginDate = string.Empty;
            Parser bodyParser = new Parser(new Lexer(HtmlTxt));
            NodeList begNode = bodyParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "yzhang")));
            if (begNode != null && begNode.Count > 0)
            {
                beginDate = begNode.AsString().GetDateRegex("yyyy年MM月dd日");
            }

            // Supplier and amount are probed in rows 2, 4, 6, ... 12 of the first table with
            // cellpadding="5". Row 2 is always read when present; each later row is only
            // tried while the amount extracted so far is "0".
            string bidUnit = string.Empty;
            string bidMoney = string.Empty;
            bodyParser.Reset();
            NodeList dtlTable = bodyParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "5")));
            if (dtlTable != null && dtlTable.Count > 0)
            {
                TableTag tableDtl = dtlTable[0] as TableTag;
                if (tableDtl != null)
                {
                    for (int r = 2; r <= 12 && r < tableDtl.RowCount; r += 2)
                    {
                        if (r > 2 && bidMoney != "0")
                        {
                            break;
                        }

                        string ctx = tableDtl.Rows[r].ToPlainTextString();
                        bidUnit = ctx.GetRegexBegEnd("供应商:", ";");
                        bidMoney = ctx.GetRegexBegEnd("中标金额:", "。").GetMoney();
                    }
                }
            }

            string code = bidCtx.GetRegex("批准文件编号,工程编号,项目编号").Replace("无", "");

            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购代理机构名称,采购单位名称");
            }

            string prjAddress = bidCtx.GetAddressRegex();
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = bidCtx.GetRegex("投标地点,开标地点,地址");
            }

            if (bidUnit.Contains("废标"))
            {
                bidUnit = "没有中标商";
            }

            string msgType = "内蒙古政府采购盟市";
            string specType = "政府采购";

            // Codes longer than 50 bytes are first reduced with GetChina() and, if still too
            // long, dropped.
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = code.GetChina();
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }

            string otherType = string.Empty;
            string prjMgr = string.Empty;

            // The invite type is derived from the raw title, before the title is normalised.
            string bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Reads the announcement table on <c>SiteUrl</c>, opens each linked detail page on
/// www.ymcw.com and converts it into an <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected records; an empty list when the listing page cannot be downloaded or contains
/// no matching table.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("TABLE"), new HasAttributeFilter("style", "margin: 0")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return list;
    }

    TableTag table = nodeList[0] as TableTag;
    if (table == null)
    {
        return list;
    }

    // Strips <script> blocks and <?xml ... /> fragments from detail pages before parsing.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    // Collapses runs of line breaks or tabs in the plain text into a single line break.
    Regex regCtx = new Regex(@"([\r\n]+)|([\t]+)");

    // The first six rows and the last three rows of the table are not processed.
    for (int j = 6; j < table.RowCount - 3; j++)
    {
        TableRow tr = table.Rows[j];
        if (tr.ColumnCount < 4)
        {
            // The code is read from cell 2 and the title/link from cell 3.
            continue;
        }

        NodeList links = tr.Columns[3].SearchFor(typeof(ATag), true);
        if (links == null || links.Count == 0)
        {
            continue;
        }

        ATag aTag = links[0] as ATag;
        if (aTag == null)
        {
            continue;
        }

        string code = tr.Columns[2].ToPlainTextString().Trim();
        string prjName = tr.Columns[3].ToPlainTextString().Trim();
        string InfoUrl = "http://www.ymcw.com/" + aTag.Link;

        string HtmlTxt;
        string htmldetail;
        try
        {
            // Download the page once and derive both views from it; the page used to be
            // requested twice.
            string raw = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);

            // View 1: original casing, used for the stored HTML of all tables.
            Parser dtlparserHTML = new Parser(new Lexer(raw.Replace(" ", "").Trim()));
            NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new TagNameFilter("table"));
            HtmlTxt = dtnodeHTML.AsHtml();

            // View 2: lower-cased with <br> turned into line breaks, used for the plain text.
            htmldetail = raw.ToLower().Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            htmldetail = regexHtml.Replace(htmldetail, "");
        }
        catch (Exception)
        {
            continue;
        }

        Parser dtlparser = new Parser(new Lexer(htmldetail));
        NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        if (dtnode == null || dtnode.Count == 0)
        {
            // No table means no announcement text; skip instead of failing on dtnode[0].
            continue;
        }

        string inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode[0].ToPlainTextString().Trim());
        inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

        // The crawl date is used as the begin date.
        string beginDate = DateTime.Now.ToString("yyyy-MM-dd");

        string specType = "其他";
        string msgType = "深圳市裕明财务咨询有限公司";
        string inviteType = ToolHtml.GetInviteTypes(prjName);
        string prjAddress = string.Empty, buildUnit = string.Empty, endDate = string.Empty,
               remark = string.Empty, otherType = string.Empty;

        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// For every project code returned by <c>GetPrjCode()</c>, posts a query to <c>SiteUrl</c>,
/// opens each detail page listed in table#dgConstBid and saves the resulting
/// <c>InviteInfo</c> through <c>ToolDb.SaveEntity</c>. Each processed code is removed from the
/// code list, which is then passed to <c>DeleteCode</c>.
/// </summary>
/// <param name="crawlAll">Not used by this implementation.</param>
/// <returns>
/// The list created at the start of the method; records are saved directly and never added to it.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string cookiestr = string.Empty;
    int sqlCount = 0;

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    IList arr = GetPrjCode();
    if (arr == null || arr.Count == 0)
    {
        return list;
    }

    // The view state comes from the landing page, which does not change inside the loop.
    string viewState = this.ToolWebSite.GetAspNetViewState(html);

    Regex regDate = new Regex(@"发布日期(:|:)[^\r\n]+[\r\n]{1}");
    Regex regPrjAdd = new Regex(@"(工程地点|工程地址):[^\r\n]+[\r\n]{1}");
    Regex regOth = new Regex(@"(工程类型|项目类型):[^\r\n]+[\r\n]{1}");

    // Walk the codes from the end so that removing the current one keeps earlier indexes valid.
    for (int d = arr.Count - 1; d >= 0; d--)
    {
        NameValueCollection nvc1 = this.ToolWebSite.GetNameValueCollection(
            new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "txtPrj_ID", "txtPrj_Name", "Chk_Query", "Radiobuttonlist1", "QUERY", "ucPageNumControl:gotopage" },
            new string[] { string.Empty, string.Empty, viewState, arr[d].ToString(), "", "0", "1", "查询", "" });

        string htmtxt;
        try
        {
            htmtxt = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), nvc1, Encoding.Default, ref cookiestr);
        }
        catch (Exception)
        {
            // A failed query ends the crawl; the remaining codes stay in the list.
            return list;
        }

        Parser parser = new Parser(new Lexer(htmtxt));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dgConstBid")));
        TableTag table = (dtList != null && dtList.Count > 0) ? dtList[0] as TableTag : null;

        // Row 0 is skipped (the loop starts at 1).
        for (int j = 1; table != null && j < table.RowCount; j++)
        {
            TableRow dr = table.Rows[j];
            if (dr.ColumnCount < 4)
            {
                continue;
            }

            NodeList links = dr.Columns[1].SearchFor(typeof(ATag), true);
            if (links == null || links.Count == 0)
            {
                continue;
            }

            ATag aTag = links[0] as ATag;
            if (aTag == null)
            {
                continue;
            }

            string code = dr.Columns[1].ToPlainTextString().Trim();
            string prjName = dr.Columns[2].ToPlainTextString().Trim();
            string buildUnit = dr.Columns[3].ToPlainTextString().Trim();

            // The link is a javascript call GoDetail('<id>'); only the id goes into the URL.
            string InfoUrl = "http://61.144.226.2/zbgg/Detail.aspx?ID=" + aTag.Link.Trim().Replace("GoDetail('", "").Replace("');", "") + "&xxlxbh=1&PRJ_TYPE=0";

            string htmlde;
            try
            {
                htmlde = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            parser = new Parser(new Lexer(htmlde));
            NodeList dealList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Table8")));
            if (dealList == null || dealList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dealList.ToHtml();
            TableTag tab = dealList[0] as TableTag;

            // Build two "label:value" renderings of the table in one pass:
            //   inviteCtx - stored text, spaces kept;
            //   ctx       - same lines with spaces removed, used for address/type lookup.
            // NOTE(review): the original removed ':' from the label twice with identical
            // literals; one call may have targeted a full-width colon - confirm.
            StringBuilder compactText = new StringBuilder();
            StringBuilder storedText = new StringBuilder();
            try
            {
                for (int k = 0; tab != null && k < tab.RowCount; k++)
                {
                    TableRow tr = tab.Rows[k];
                    if (tr.ColumnCount < 2)
                    {
                        // A single-cell row used to raise an exception that silently dropped
                        // every following row and left the stored text empty.
                        continue;
                    }

                    string label = tr.Columns[0].ToPlainTextString().Replace(":", "");
                    string value = tr.Columns[1].ToPlainTextString().Trim();
                    storedText.Append(label).Append(":").Append(value).Append("\r\n");
                    compactText.Append(label.Replace(" ", "")).Append(":").Append(value.Replace(" ", "")).Append("\r\n");
                }
            }
            catch (Exception)
            {
                // Best effort: keep the rows read so far.
            }

            string ctx = compactText.ToString();
            string inviteCtx = storedText.ToString();

            // The publication line holds "<begin>到<end>"; split it at the first '到'.
            string beginDate = string.Empty;
            string endDate = string.Empty;
            string datestr = regDate.Match(inviteCtx).Value.Replace("发布日期", "").Replace(":", "").Replace("\r", "").Replace("\n", "");
            int len = datestr.IndexOf('到');
            if (len >= 0)
            {
                beginDate = datestr.Substring(0, len);
                endDate = datestr.Substring(len + 1);
            }

            string prjAddress = regPrjAdd.Match(ctx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            string otherType = regOth.Match(ctx).Value.Replace("工程类型:", "").Replace("项目类型:", "").Trim();

            string msgType = "深圳市建设工程交易中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string remark = string.Empty;

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);

            // NOTE(review): this bound allows MaxCount + 1 saves and applies even when crawlAll
            // is true; behaviour kept as-is - confirm whether that is intended.
            if (sqlCount <= this.MaxCount)
            {
                ToolDb.SaveEntity(info, this.ExistCompareFields);
                sqlCount++;
            }
            else
            {
                return list;
            }
        }

        // The code has been handled: remove it and hand the remaining codes to DeleteCode.
        arr.RemoveAt(d);
        DeleteCode(arr);
    }

    return list;
}
/// <summary>
/// Crawls the JSON listing behind <c>SiteUrl</c> (nmgp.gov.cn), opens the detail page of each
/// listed item and converts it into a <c>BidInfo</c> record.
/// </summary>
/// <remarks>
/// The listing response is read as a JSON array whose element 0 is the array of items and whose
/// element 1 is an array with a metadata object carrying <c>page_all</c>.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>The collected records; an empty list when the first request fails.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Get the page count.
    string html;
    try
    {
        // The current UTC time, converted by GetDateTimeLong, is appended to the URL.
        DateTime dt24 = DateTime.Now.ToUniversalTime();
        string b = ToolHtml.GetDateTimeLong(dt24).ToString();
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + b, Encoding.Default);
    }
    catch (Exception ex)
    {
        Logger.Error(ex.ToString());

        // Previously execution continued and failed on the empty response.
        return list;
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();

    int pageInt = 1;
    try
    {
        object[] objs = (object[])serializer.DeserializeObject(html);
        object[] items = objs[1] as object[];
        Dictionary<string, object> smsTypeJson = items[0] as Dictionary<string, object>;
        int page = int.Parse(Convert.ToString(smsTypeJson["page_all"]));

        // NOTE(review): page_all is divided by 18 as if it were a record count with 18 items
        // per page; an exact multiple of 18 requests one extra page - confirm.
        pageInt = page / 18 + 1;
    }
    catch (Exception ex)
    {
        // Best effort: without usable metadata only the first page is processed.
        Logger.Error(ex.ToString());
    }

    // Strips <script> blocks and <?xml ... /> fragments from detail pages before parsing.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string lian = "http://www.nmgp.gov.cn/category/category-ajax.php?type_name=3&byf_page=" + i + "&fun=cggg&_=1509441711785";
                html = this.ToolWebSite.GetHtmlByUrl(lian, Encoding.UTF8);
            }
            catch (Exception)
            {
                Logger.Error("分页");
                continue;
            }
        }

        object[] array;
        try
        {
            object[] objd = (object[])serializer.DeserializeObject(html);
            array = objd[0] as object[];
        }
        catch (Exception ex)
        {
            // A malformed or empty page is skipped instead of aborting the crawl.
            Logger.Error(ex.ToString());
            continue;
        }

        if (array == null)
        {
            continue;
        }

        // Each item is visited exactly once. The former outer loop over the metadata keys
        // repeated the whole page once per key without ever using the key.
        foreach (object arrValue in array)
        {
            Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
            object endRaw, titleRaw, idRaw;
            if (dic == null
                || !dic.TryGetValue("ENDDATE", out endRaw)
                || !dic.TryGetValue("TITLE", out titleRaw)
                || !dic.TryGetValue("wp_mark_id", out idRaw))
            {
                continue;
            }

            string endDate = Convert.ToString(endRaw).GetDateRegex("yyyy-MM-dd");
            string prjName = Convert.ToString(titleRaw);
            string InfoUrl = "http://www.nmgp.gov.cn/ay_post/post.php?tb_id=3&p_id=" + Convert.ToString(idRaw);

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace(" ", "");
            }
            catch (Exception)
            {
                continue;
            }

            htmldtl = regexHtml.Replace(htmldtl, "");

            Parser dtlparserHTML = new Parser(new Lexer(htmldtl));
            NodeList nodesDtl = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "center")));
            if (nodesDtl == null || nodesDtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = nodesDtl.ToHtml();
            string bidCtx = HtmlTxt.ToCtxString();

            // Publication date: first "yyyy年MM月dd日" date in the text of div#center.
            string beginDate = string.Empty;
            Parser bodyParser = new Parser(new Lexer(HtmlTxt));
            NodeList begNode = bodyParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "center")));
            if (begNode != null && begNode.Count > 0)
            {
                beginDate = begNode.AsString().GetDateRegex("yyyy年MM月dd日");
            }

            // Supplier and amount are probed in rows 2, 4, 6, ... 12 of the first table with
            // border="1". Row 2 is always read when present; each later row is only tried
            // while the amount extracted so far is "0".
            string bidUnit = string.Empty;
            string bidMoney = string.Empty;
            bodyParser.Reset();
            NodeList dtlTable = bodyParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("border", "1")));
            if (dtlTable != null && dtlTable.Count > 0)
            {
                TableTag tableDtl = dtlTable[0] as TableTag;
                if (tableDtl != null)
                {
                    for (int r = 2; r <= 12 && r < tableDtl.RowCount; r += 2)
                    {
                        if (r > 2 && bidMoney != "0")
                        {
                            break;
                        }

                        string ctx = tableDtl.Rows[r].ToPlainTextString();
                        bidUnit = ctx.GetRegexBegEnd("供应商:", ";");
                        bidMoney = ctx.GetRegexBegEnd("中标金额:", "。").GetMoney();
                    }
                }
            }

            // NOTE(review): an earlier lookup via GetRegex("批准文件编号,工程编号,项目编号") was
            // always overwritten by this one and has been removed; confirm it was not meant
            // to act as the primary source with this as a fallback.
            string code = bidCtx.GetRegexBegEnd("批准文件编号:", "二");

            string buildUnit = bidCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = bidCtx.GetRegexBegEnd("代理机构名称:", "地址");
            }

            string prjAddress = bidCtx.GetAddressRegex();
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = bidCtx.GetRegexBegEnd("地址:", "邮政编码");
            }

            string msgType = "内蒙古自治区政府采购中心";
            string specType = "政府采购";
            string otherType = string.Empty;
            string prjMgr = string.Empty;

            // The invite type is derived from the raw title, before the title is normalised.
            string bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses one listing page and appends an <c>InviteInfo</c> to <paramref name="list"/> for each
/// data row of the table with class "lefttable". The first and the last row of that table are skipped.
/// For every row the detail page is downloaded, the fields are extracted from it, and links that point
/// at the "loadNewsFile" download endpoint are registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="list">Collection that receives the generated records.</param>
/// <param name="html">HTML of the listing page.</param>
/// <param name="crawlAll">When false, processing stops once <c>list.Count</c> reaches <c>this.MaxCount</c>.</param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser listParser = new Parser(new Lexer(html));
    NodeList listTables = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (listTables == null || listTables.Count == 0)
    {
        return;
    }

    TableTag table = listTables[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // The patterns do not depend on the row, so they are built once instead of once per row.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regexTime = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
    Regex regexCode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
    Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
    Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
    Regex regexComment = new Regex("<!--[^<]+-->");
    Regex regexDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");

    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
               inviteType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty,
               InfoUrl = string.Empty, ctx = string.Empty, HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without any link cannot be followed; skip it instead of dereferencing null.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        if (rowLinks == null || rowLinks.Count == 0)
        {
            continue;
        }
        ATag aTag = rowLinks[0] as ATag;
        if (aTag == null)
        {
            continue;
        }
        InfoUrl = aTag.Link;

        // NOTE(review): both arguments of Replace(" ", " ") render identically here; presumably the
        // first one was a non-breaking space or an HTML entity in the original file - confirm.
        prjName = tr.Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        endDate = tr.Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            // Best effort: a detail page that cannot be downloaded is skipped.
            continue;
        }
        htmlDtl = regexHtml.Replace(htmlDtl, "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser parserDiv = new Parser(new Lexer(htmlDtl));
            NodeList buttonNodes = parserDiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

            // string.Replace throws ArgumentException for an empty search string, so the button
            // markup is only stripped when there actually is some.
            string buttonHtml = buttonNodes != null ? buttonNodes.AsHtml() : string.Empty;
            HtmlTxt = ctxNode.AsHtml();
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string tabText = tabTag.ToPlainTextString();

            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            beginDate = regexTime.Match(startTime).Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            // For a failed match Value is "", IndexOf returns -1 and Substring(0) yields "".
            string codeLine = regexCode.Match(tabText).Value;
            code = codeLine.Substring(codeLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string buildUnitLine = regexBuildUnit.Match(tabText).Value;
            buildUnit = buildUnitLine.Substring(buildUnitLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string addressLine = regexAddress.Match(tabText).Value;
            prjAddress = addressLine.Substring(addressLine.IndexOf(":") + 1).Trim();

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            ctx = regexComment.Replace(ctx, "");

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                // The guard counts bytes but the cut counts characters: a value with more than 150
                // bytes may still be shorter than 150 characters, which made Substring(0, 150) throw.
                // NOTE(review): confirm whether the limit is meant in bytes or in characters.
                buildUnit = buildUnit.Substring(0, Math.Min(150, buildUnit.Length));
            }
            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                prjAddress = "见招标公告内容";
            }

            if (beginDate.Length > 0 && endDate.Length > 0)
            {
                // An unparsable value stays at DateTime.MinValue, exactly as with the former
                // Parse-inside-empty-catch construct.
                DateTime begin;
                DateTime end;
                if (!DateTime.TryParse(beginDate, out begin))
                {
                    begin = new DateTime();
                    end = new DateTime();
                }
                else if (!DateTime.TryParse(endDate, out end))
                {
                    end = new DateTime();
                }
                if (begin > end)
                {
                    endDate = string.Empty;
                }
            }
        }

        // The publication date shown in the "toptd_bai" cell replaces the time parsed above.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        beginDate = regexDate.Match(ctxNode.AsString()).Value.Trim();

        inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
        list.Add(info);

        // Rewind before scanning the same document again, as is done before the previous extraction.
        // NOTE(review): presumably a consumed parser yields no nodes without Reset() - confirm.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTag = aTagNodes[a] as ATag;
            if (fileTag != null && fileTag.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                BaseAttach attach = ToolDb.GenBaseAttach(fileTag.ToPlainTextString(), info.Id, fileTag.Link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Crawls every listing configured in <c>this.DicSiteUrl</c> (key = area name, value = listing URL)
/// and builds one <c>InviteInfo</c> per list entry. Attachment links found in the detail content are
/// registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, at most <c>this.MaxCount</c> entries are collected per area.</param>
/// <returns>The collected records. If the first page of an area cannot be downloaded, the records
/// gathered so far are returned immediately and the remaining areas are not visited.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    // The patterns are constant, so they are compiled once for the whole crawl.
    Regex regexCtx = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regPrjAdd = new Regex(@"(工程地点|工程地址|项目地址)[:|:][^\r\n]+[\r\n]{1}");
    Regex regbuildUnit = new Regex(@"(招标单位|招标人):[^\r\n]+[\r\n]{1}");

    foreach (string area in this.DicSiteUrl.Keys)
    {
        int pageInt = 1, count = 0;
        string html;
        try
        {
            html = this.ToolWebSite.GetHtmlByUrl(this.DicSiteUrl[area], Encoding.UTF8);
        }
        catch (Exception)
        {
            // NOTE(review): this aborts the whole crawl, not just the current area - confirm intent.
            return list;
        }

        Parser parser = new Parser(new Lexer(html));
        NodeList pageNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
        if (pageNodes != null && pageNodes.Count > 0)
        {
            try
            {
                // The page count is the text between "createPageHTML(" and the first comma.
                string page = pageNodes.AsString().ToNodeString().Replace("createPageHTML(", "");
                string temp = page.Remove(page.IndexOf(","));
                pageInt = Convert.ToInt32(temp);
            }
            catch (Exception)
            {
                // Best effort: keep the default of one page when the pager cannot be parsed.
            }
        }

        // Set once this area has produced MaxCount records; replaces the former goto label.
        bool areaLimitReached = false;
        for (int i = 1; i <= pageInt && !areaLimitReached; i++)
        {
            if (i > 1)
            {
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(this.DicSiteUrl[area] + "index_" + (i - 1) + ".html", Encoding.UTF8);
                }
                catch
                {
                    continue;
                }
            }

            parser = new Parser(new Lexer(html));
            NodeList items = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "secondrightlistbox"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
            if (items == null || items.Count == 0)
            {
                continue;
            }

            for (int t = 0; t < items.Count; t++)
            {
                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                       inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                       endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                       otherType = string.Empty, HtmlTxt = string.Empty;

                beginDate = items[t].ToNodePlainString().GetDateRegex();
                prjName = items[t].GetATagValue("title");
                InfoUrl = this.DicSiteUrl[area] + items[t].GetATagHref().Replace("./", "");

                string htmldetail;
                try
                {
                    // The page is downloaded once; it used to be requested twice to build the two
                    // variants below.
                    // NOTE(review): the first Replace argument renders as a plain space here;
                    // presumably it was a non-breaking space or an entity - confirm.
                    string compact = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");

                    Parser dtlparserHTML = new Parser(new Lexer(compact.Trim()));
                    NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentrightlistbox2")));
                    HtmlTxt = dtnodeHTML.AsHtml();

                    // Line breaks become CRLF so that the line-based patterns below can match.
                    htmldetail = compact.Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                }
                catch (Exception)
                {
                    continue;
                }

                Parser dtlparser = new Parser(new Lexer(htmldetail));
                NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentrightlistbox2")));
                inviteCtx = regexCtx.Replace(dtnode.AsString(), "").Replace(" ", "");

                prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点", "").Replace("工程地址", "").Replace("项目地址", "").Replace(":", "").Replace(":", "").Replace(")", "").Trim();

                buildUnit = regbuildUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Trim();
                if (buildUnit.Contains("招标代理机构"))
                {
                    // Cut off everything from the agency label onwards.
                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理机构"));
                }

                msgType = "佛山市建设工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                // The area key "市直" is stored as an empty area.
                string are = area != "市直" ? area : "";
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", are, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                count++;
                list.Add(info);

                parser = new Parser(new Lexer(HtmlTxt));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                {
                    for (int k = 0; k < aNode.Count; k++)
                    {
                        ATag a = aNode[k] as ATag;
                        if (!a.IsAtagAttach())
                        {
                            continue;
                        }

                        string link;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        else
                        {
                            link = "http://www.fsggzy.cn/" + a.Link.GetReplace("../,./");
                        }

                        // Links longer than 500 bytes are not stored.
                        if (Encoding.Default.GetByteCount(link) > 500)
                        {
                            continue;
                        }

                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && count >= this.MaxCount)
                {
                    // Leave both loops of this area and continue with the next one.
                    areaLimitReached = true;
                    break;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged grid "ctl00_Content_GridView1" at <c>SiteUrl</c> and builds one <c>BidInfo</c>
/// per grid row (the first and the last row of the grid are skipped). Pages after the first are
/// requested with an ASP.NET postback. Links inside the "trFujian" row of a detail page are
/// registered in <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>this.MaxCount</c>.</param>
/// <returns>The collected records; an empty list if the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the number of pages.
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("cellspacing", "2"), new TagNameFilter("table")));
    Match pageMatch = new Regex(@",共[^页]+页,").Match(pagerNodes.AsString());
    int parsedPages;
    if (int.TryParse(pageMatch.Value.Replace(",共", "").Replace("页,", "").Trim(), out parsedPages))
    {
        pageInt = parsedPages;
    }

    // Constant patterns, built once for the whole crawl.
    Regex regPrjAdd = new Regex(@"(工程地点|工程地址):[^\r\n]+[\r\n]{1}");
    Regex regprjMgr = new Regex(@"(项目经理|项目负责人|项目总监|建造师|监理师|项目经理姓名)(:|:)[^\s]+[\s]{1}");
    Regex regInvType = new Regex(@"[^\r\n]+[\r\n]{1}");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // The postback needs the hidden fields of the page that was loaded last.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$hdnPageCount" },
                new string[] { "ctl00$Content$GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, pageInt.ToString() });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_Content_GridView1"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjAddress = string.Empty, prjMgr = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j] as TableRow;

            // Columns 1..6 are read below; shorter rows are skipped instead of raising an index error.
            if (tr == null || tr.ColumnCount < 7)
            {
                continue;
            }

            code = tr.Columns[1].ToPlainTextString().Trim();
            prjName = tr.Columns[2].ToPlainTextString().Trim();
            buildUnit = tr.Columns[4].ToPlainTextString().Trim();
            bidUnit = tr.Columns[5].ToPlainTextString().Trim();
            bidMoney = tr.Columns[6].ToPlainTextString().Replace("万元", "").Trim();

            // Column 3 holds "<begin>至<end>". Without the separator only the begin date is taken;
            // indexing [1] unconditionally used to throw in that case.
            string[] period = tr.Columns[3].ToPlainTextString().Split('至');
            beginDate = period[0].Replace("年", "-").Replace("月", "-").Replace("日", " ").Replace("时", "").Trim();
            if (period.Length > 1)
            {
                endDate = period[1].Replace("年", "-").Replace("月", "-").Replace("日", " ").Replace("时", "").Trim();
            }

            // The first child of the name cell is expected to be the detail link.
            ATag aTag = tr.Columns[2].Children[0] as ATag;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;

            string htmldetail;
            try
            {
                // One download serves both variants; the page used to be requested twice.
                // NOTE(review): the first Replace argument renders as a plain space here;
                // presumably it was a non-breaking space or an entity - confirm.
                string compact = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");

                Parser dtlparserHTML = new Parser(new Lexer(compact.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
                HtmlTxt = dtnodeHTML.AsHtml();

                htmldetail = compact.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
            bidCtx = dtnode.AsString().Replace(" ", "");

            prjAddress = regPrjAdd.Match(bidCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            msgType = "深圳市建设工程交易中心";
            specType = "建设工程";

            prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目经理姓名", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Replace("监理师", "").Trim();

            // A bidder found in the detail text takes precedence over the grid column.
            string bidUnitInfo = bidCtx.GetBidRegex();
            if (!string.IsNullOrEmpty(bidUnitInfo))
            {
                bidUnit = bidUnitInfo;
            }

            // The first line of the detail text is what the bid type is derived from.
            string InvType = regInvType.Match(bidCtx).Value;
            prjName = ToolDb.GetPrjName(prjName);
            if (!string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ToolDb.GetBidUnit(bidUnit);
                if (bidUnit.Contains("报价"))
                {
                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("报价"));
                }
            }
            bidType = ToolHtml.GetInviteTypes(InvType);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳市工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            dtlparser.Reset();
            NodeList dlNodes = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "trFujian"), new TagNameFilter("tr")));
            if (dlNodes != null && dlNodes.Count > 0)
            {
                TableRow attr = dlNodes[0] as TableRow;
                NodeList fileNodes = attr.SearchFor(typeof(ATag), true);
                if (fileNodes != null)
                {
                    for (int f = 0; f < fileNodes.Count; f++)
                    {
                        ATag fileTag = fileNodes[f] as ATag;
                        if (fileTag != null && !string.IsNullOrEmpty(fileTag.Link))
                        {
                            BaseAttach attach = ToolDb.GenBaseAttach(fileTag.StringText, info.Id, fileTag.Link.Replace("..", "http://www.szjsjy.com.cn"));
                            base.AttachList.Add(attach);
                        }
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the listing at <c>SiteUrl</c> plus the pages "node_200865_2.htm" to "node_200865_10.htm"
/// and builds one <c>InviteInfo</c> per list entry whose detail page contains a "ninfo-con" div.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>this.MaxCount</c>.</param>
/// <returns>The collected records; an empty list if the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();

    // The number of listing pages is fixed; it is not read from the site.
    int page = 10;
    string htl;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
    }
    catch
    {
        return list;
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl("http://www.szgas.com.cn/node_200865_" + i + ".htm");
            }
            catch
            {
                continue;
            }
        }

        Parser parser = new Parser(new Lexer(htl));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < entries.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            // An entry without a link cannot be followed.
            ATag aTag = entries[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            prjName = aTag.LinkText.Trim();
            beginDate = entries[j].ToPlainTextString().GetDateRegex();

            // NOTE(review): GetReplace("&", "&") renders with two identical arguments here;
            // presumably the first one was an HTML entity in the original file - confirm.
            if (aTag.Link.Contains("http"))
            {
                InfoUrl = aTag.Link.GetReplace("&", "&");
            }
            else
            {
                InfoUrl = "http://www.szgas.com.cn/" + aTag.Link.Trim().GetReplace("&", "&");
            }

            // Links that carry a query string are redirected to the sztc.com display page,
            // keeping only the part after the first '?'.
            string[] urls = InfoUrl.Split('?');
            if (urls.Length > 1)
            {
                InfoUrl = "http://www.sztc.com/tender/InfoPubDisplay.aspx?" + urls[1];
            }

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ninfo-con")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtnode.AsHtml();
            inviteCtx = HtmlTxt.Replace("</span>", "\r\n").ToCtxString();

            // Try the generic code pattern first, then two site-specific phrasings.
            code = inviteCtx.GetCodeRegex();
            if (string.IsNullOrWhiteSpace(code))
            {
                code = inviteCtx.GetRegexBegEnd("招标编号:", "进行公开招标");
            }
            if (string.IsNullOrWhiteSpace(code))
            {
                code = inviteCtx.GetRegexBegEnd("公开招标", ",欢迎");
            }

            msgType = "深圳燃气集团公司";
            specType = "建设工程";
            // The address is always this fixed text; an address extracted from the content was
            // previously computed and then overwritten, so that dead lookup has been removed.
            prjAddress = "见中标信息";
            buildUnit = "深圳燃气集团公司";
            inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the listing table "winstyle66953" starting at <c>SiteUrl</c> and then the numbered pages
/// "zbxx/{i}.htm" in descending order, building one <c>InviteInfo</c> per row whose detail page
/// contains the "vsb_content" div. Rows whose title contains "暂停公告" are ignored.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>this.MaxCount</c>.</param>
/// <returns>The collected records; an empty list if the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int page = 1;
    string htl;
    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "headStyle27kkt9g3gy")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            // The page count is the text between "/" and "首" in the pager table.
            string temp = pagerNodes[0].ToPlainTextString().GetRegexBegEnd("/", "首").ToLower().Replace(" ", "");
            page = int.Parse(temp);
        }
        catch
        {
            // Best effort: keep a single page when the pager cannot be parsed.
        }
    }
    else
    {
        // No pager table found: fall back to a fixed number of pages.
        page = 25;
    }

    // Constant patterns and the set of types that count as construction work.
    Regex regeximg = new Regex(@"<IMG[^>]*>");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
    Regex regBegin = new Regex(@"投标截止时间:[^\r\n]+[\r\n]{1}");
    Regex regDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
    string[] constructionTypes = new string[] { "设备材料", "小型施工", "专业分包", "劳务分包", "服务", "勘察", "设计", "监理", "施工" };

    for (int i = page; i >= 1; i--)
    {
        // The first iteration reuses the page already loaded from SiteUrl.
        if (i < page)
        {
            try
            {
                string url = "http://zbcg.sziit.edu.cn/zbxx/" + i + ".htm";
                htl = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "winstyle66953")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = tableNodeList[0] as TableTag;
        for (int j = 0; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j];

            // Columns 1 and 2 are read below, so at least three columns are required
            // (the former "< 2" check let two-column rows through to Columns[2]).
            if (tr.ColumnCount < 3)
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            prjName = tr.Columns[1].ToNodePlainString();
            if (prjName.Contains("暂停公告"))
            {
                continue;
            }

            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            InfoUrl = "http://zbcg.sziit.edu.cn/" + tr.Columns[1].GetATagHref().Replace("../", "").Replace("./", "");

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "vsb_content")));
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            // Strip images from the stored HTML.
            HtmlTxt = regeximg.Replace(dtnode.AsHtml(), "");

            inviteCtx = dtnode.AsString().Replace(" ", "").Trim();
            inviteCtx = regexHtml.Replace(inviteCtx, "").Replace(" ", "").Replace("“", "").Replace("”", "").Trim();

            code = inviteCtx.GetCodeRegex().GetCodeDel();
            prjAddress = inviteCtx.GetAddressRegex();

            // The end date is the first "yyyy年M月d日" on the bid-deadline line.
            string date = regBegin.Match(inviteCtx).Value.Replace("投标截止时间:", "").Replace(" ", "").Trim();
            endDate = regDate.Match(date).Value.Trim();

            buildUnit = inviteCtx.GetBuildRegex();

            // The type must be known before the category is derived from it. It used to be
            // assigned afterwards, so the comparison always saw an empty string and every
            // record was categorised as "其他".
            inviteType = ToolHtml.GetInviteTypes(prjName);
            specType = Array.IndexOf(constructionTypes, inviteType) >= 0 ? "建设工程" : "其他";
            msgType = "深圳信息职业技术学院";

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the notice list at <c>SiteUrl</c> and its numbered follow-up pages. Each entry's detail page
/// is classified by its "h3" headline: headlines containing "招标公告" or "补充公告" produce an
/// <c>InviteInfo</c>; headlines containing "中标结果", "结果公示" or "中标公示" produce a <c>BidInfo</c>;
/// everything else is ignored. Attachment links in the detail content go to <c>base.AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops once <c>list.Count</c> reaches <c>this.MaxCount</c>.</param>
/// <returns>The collected records (both kinds in one list); empty if the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ecms_pagination")), true), new TagNameFilter("a")));
    if (pageList != null && pageList.Count > 1)
    {
        // The second-to-last pager link carries the highest page number as its text.
        ATag lastPageLink = pageList[pageList.Count - 2] as ATag;
        int parsedPages;
        if (lastPageLink != null && int.TryParse(lastPageLink.LinkText, out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.shcac.edu.cn:80/html/xxdt/tzgg/" + i.ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_main_content")), true), new TagNameFilter("ul")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            // An entry without a link cannot be followed.
            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            // Only the date is taken from the list entry; the title comes from the detail page.
            // (The list title used to be computed via Replace(btTime, ""), which throws for an
            // empty date and whose result was never used.)
            string btTime = nodeList[j].ToNodePlainString().GetDateRegex();
            string btUrl = aTag.Link;

            string htldtl;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(btUrl, Encoding.UTF8);
                htldtl = htldtl.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlBt = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_main_content")), true), new TagNameFilter("h3")));
            if (dtlBt == null || dtlBt.Count == 0)
            {
                continue;
            }

            string btName = dtlBt.AsString();
            bool isInvite = btName.Contains("招标公告") || btName.Contains("补充公告");
            bool isBid = !isInvite && (btName.Contains("中标结果") || btName.Contains("结果公示") || btName.Contains("中标公示"));
            if (!isInvite && !isBid)
            {
                continue;
            }

            // Both kinds take their content from the same div of the detail page.
            parser.Reset();
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "line-height:22px;")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string prjName = btName, beginDate = btTime, InfoUrl = btUrl;
            string code = string.Empty, buildUnit = string.Empty, endDate = string.Empty, specType = string.Empty,
                   otherType = string.Empty, remark = string.Empty;
            string msgType = "上海民航职业技术学院";
            string HtmlTxt = dtlList.ToHtml();
            string plainCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString().Replace("\r\n\t", "\r\n").Replace("\r\n\r\n", "\r\n");

            // Id of the record that the attachments below belong to.
            object ownerId;
            if (isInvite)
            {
                string prjAddress = string.Empty, inviteType = string.Empty;
                buildUnit = plainCtx.GetBuildRegex();
                prjAddress = plainCtx.GetAddressRegex();

                // NOTE(review): inviteType is left empty here, whereas the bid branch derives its
                // type from the title via ToolHtml.GetInviteTypes - confirm this is intended.
                InviteInfo info = ToolDb.GenInviteInfo("上海市", "上海市区", string.Empty, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, plainCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
                ownerId = info.Id;
            }
            else
            {
                string bidUnit = string.Empty, bidMoney = string.Empty, bidType = string.Empty, prjMgr = string.Empty;

                bidUnit = plainCtx.GetBidRegex();
                if (string.IsNullOrWhiteSpace(bidUnit))
                {
                    bidUnit = plainCtx.GetRegex("中标人");
                }
                bidMoney = plainCtx.GetMoneyRegex();

                buildUnit = plainCtx.GetBuildRegex();
                if (string.IsNullOrWhiteSpace(buildUnit))
                {
                    buildUnit = plainCtx.GetRegex("招标人");
                }

                // A code is only kept when it ends with '号'.
                code = plainCtx.GetCodeRegex().GetCodeDel();
                if (!string.IsNullOrWhiteSpace(code) && code[code.Length - 1] != '号')
                {
                    code = "";
                }

                // Cut the bidder name right after the first "公司".
                if (bidUnit.Contains("公司"))
                {
                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                }

                bidType = ToolHtml.GetInviteTypes(prjName);
                BidInfo info = ToolDb.GenBidInfo("上海市", "上海市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, plainCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                list.Add(info);
                ownerId = info.Id;
            }

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNodes = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNodes != null)
            {
                for (int a = 0; a < aNodes.Count; a++)
                {
                    ATag aFile = aNodes[a] as ATag;
                    if (aFile.IsAtagAttach())
                    {
                        // The link is stored as found for both kinds. The bid branch used to store
                        // an empty link whenever the URL did not contain "http".
                        BaseAttach attach = isInvite
                            ? ToolDb.GenBaseAttach(aFile.LinkText, ((InviteInfo)list[list.Count - 1]).Id, aFile.Link)
                            : ToolDb.GenBaseAttach(aFile.LinkText, ((BidInfo)list[list.Count - 1]).Id, aFile.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged ASP.NET grid <c>ctl00_cph_context_InfoList2_GridView1</c> at <c>SiteUrl</c>,
/// downloads the detail page linked from each row and builds one <c>BidInfo</c> per row,
/// registering the files listed in the detail page's accessories grid in <c>AttachList</c>.
/// </summary>
/// <remarks>
/// NOTE(review): only records whose begin date lies strictly between 2016-11-30 and 2016-12-14
/// are kept, crawling stops at the first record older than 2016-11-30, and paging starts at
/// page 5. These hard-coded values are preserved as found; they look like a one-off backfill
/// and should be confirmed.
/// </remarks>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>BidInfo</c> records (possibly empty, never null).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();

    // Hard-coded publication window (see remarks).
    DateTime start = DateTime.Parse("2016-11-30");
    DateTime end = DateTime.Parse("2016-12-14");

    // Determine the page count from the pager description cell; defaults to 1.
    int pageInt = 1;
    string html = string.Empty;
    string cookieStr = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookieStr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_cph_context_InfoList2_GridViewPaging1_PagingDescTd"), new TagNameFilter("td")));
    if (sNode != null && sNode.Count > 0)
    {
        Match pageMatch = Regex.Match(sNode.AsString(), @",共[^页]+页");
        int parsedPages;
        if (pageMatch.Success && int.TryParse(pageMatch.Value.Replace(",共", "").Replace("页", "").Trim(), out parsedPages))
        {
            pageInt = parsedPages;
        }
    }

    // NOTE(review): the loop starts at page 5, so the page fetched above is only used as the
    // source of the view state for the first post-back.
    for (int i = 5; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Jump to page i by replaying the pager's "GO" post-back with the current view state.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[]
                {
                    "ctl00$ScriptManager1",
                    "__EVENTTARGET",
                    "__EVENTARGUMENT",
                    "__LASTFOCUS",
                    "__VIEWSTATE",
                    "ctl00$cph_context$InfoList2$ddlProjectType",
                    "ctl00$cph_context$InfoList2$ddlSearch",
                    "ctl00$cph_context$InfoList2$txtProjectName",
                    "ctl00$cph_context$InfoList2$GridViewPaging1$txtGridViewPagingForwardTo",
                    "__VIEWSTATEENCRYPTED",
                    "ctl00$cph_context$InfoList2$GridViewPaging1$btnForwardToPage"
                },
                new string[]
                {
                    "ctl00$cph_context$InfoList2$update1|ctl00$cph_context$InfoList2$GridViewPaging1$btnForwardToPage",
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    viewState,
                    string.Empty,
                    "gcbh",
                    string.Empty,
                    i.ToString(),
                    "",
                    "GO"
                });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_cph_context_InfoList2_GridView1"), new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count < 1)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (loop starts at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            string endDate = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[j] as TableRow;
            if (tr == null || tr.ColumnCount < 7)
            {
                // Columns 1..6 are read below; skip rows that cannot supply them.
                continue;
            }

            string code = tr.Columns[1].ToPlainTextString().Trim();
            string prjName = tr.Columns[2].ToPlainTextString().Trim();
            string buildUnit = tr.Columns[3].ToPlainTextString().Trim();
            string beginDate = tr.Columns[5].ToPlainTextString().Trim();
            endDate = tr.Columns[6].ToPlainTextString().Trim();
            string bidType = ToolHtml.GetInviteTypes(tr.Columns[4].ToPlainTextString().Trim());

            NodeList rowLinks = tr.Columns[2].SearchFor(typeof(ATag), true);
            ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
            if (aTag == null)
            {
                // No detail link in this row: nothing to download.
                continue;
            }
            string InfoUrl = "http://61.144.224.189:8001/LGjyzxWeb/SiteManage/" + aTag.Link.Replace("openNewWindowByMenu(\"", "").Replace("\")", "");

            // Download the detail page once; HtmlTxt is taken from the raw markup and the text
            // context from a copy in which <br> tags were turned into line breaks.
            string htmldetail = string.Empty;
            try
            {
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");
                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_cph_context_lblContent"), new TagNameFilter("span")));
                HtmlTxt = dtnodeHTML.AsHtml();
                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_cph_context_lblContent"), new TagNameFilter("span")));
            string bidCtx = dtnode.AsString().Replace(" ", "");

            // Each field is the first "label:value" line found in the text; the label is removed.
            string prjAddress = Regex.Match(bidCtx, @"(工程地点|工程地址):[^\r\n]+[\r\n]{1}").Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            string msgType = "深圳市建设工程交易中心龙岗分中心";
            string specType = "建设工程";
            string bidMoney = Regex.Match(bidCtx, @"(中标价):[^\r\n]+[\r\n]{1}").Value.Replace("中标价:", "").Replace("万元", "").Trim();

            // Strip every label the pattern can match, not only the first alternative.
            string prjMgr = Regex.Match(bidCtx, @"(项目经理|项目负责人|项目总监|建造师):[^\r\n]+[\r\n]{1}").Value
                .Replace("项目经理:", "").Replace("项目负责人:", "").Replace("项目总监:", "").Replace("建造师:", "").Trim();

            // The colon belongs to the label; removing only "中标单位" used to leave ":name".
            string bidUnit = Regex.Match(bidCtx, @"(中标人|中标单位):[^\r\n]+[\r\n]{1}").Value
                .Replace("中标人:", "").Replace("中标单位:", "").Trim();

            // Map the free-text project type to a category; when several keywords occur the
            // last matching rule wins, as before.
            string oType = Regex.Match(bidCtx, @"(工程类型):[^\r\n]+[\r\n]{1}").Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建")) { otherType = "房建及工业民用建筑"; }
            if (oType.Contains("市政")) { otherType = "市政工程"; }
            if (oType.Contains("园林绿化")) { otherType = "园林绿化工程"; }
            if (oType.Contains("装饰装修")) { otherType = "装饰装修工程"; }
            if (oType.Contains("电力")) { otherType = "电力工程"; }
            if (oType.Contains("水利")) { otherType = "水利工程"; }
            if (oType.Contains("环保")) { otherType = "环保工程"; }

            // Over-long values are blanked rather than stored.
            if (Encoding.Default.GetByteCount(bidUnit) > 150) { bidUnit = ""; }
            if (Encoding.Default.GetByteCount(prjMgr) > 50) { prjMgr = ""; }

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳龙岗区工程", "龙岗区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);

            if (info.BeginDate < start)
            {
                return list;
            }
            if (info.BeginDate > start && info.BeginDate < end)
            {
                list.Add(info);
            }
            else
            {
                continue;
            }

            // Attachments: first column of every row after row 0 of the accessories grid.
            dtlparser.Reset();
            NodeList fileNode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_AccessoriesControl1_GridView1")));
            if (fileNode != null && fileNode.Count > 0 && fileNode[0] is TableTag)
            {
                TableTag fileTable = fileNode[0] as TableTag;
                for (int f = 1; f < fileTable.Rows.Length; f++)
                {
                    TableRow fileRow = fileTable.Rows[f];
                    if (fileRow.ColumnCount < 1)
                    {
                        continue;
                    }
                    NodeList fileLinks = fileRow.Columns[0].SearchFor(typeof(ATag), true);
                    ATag fileTag = (fileLinks != null && fileLinks.Count > 0) ? fileLinks[0] as ATag : null;
                    if (fileTag == null)
                    {
                        // Row without a download link.
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(fileRow.Columns[0].ToPlainTextString().Trim(), info.Id, "http://jyzx.cb.gov.cn/LGjyzxWeb/" + fileTag.Link.Replace("../", ""));
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c> (table with <c>cellPadding="5"</c>), downloads the
/// detail page linked from each row and builds one <c>BidInfo</c> per row from the detail
/// table (<c>cellPadding="4"</c>), whose cells are read pairwise as "label:value" lines.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>BidInfo</c> records (possibly empty, never null).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count: the number following "/" in the pager cell once colons, spaces and CJK
    // characters are removed. Any failure leaves the default of one page.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "700")));
    if (nodeList != null && nodeList.Count > 0)
    {
        try
        {
            string pagerText = Regex.Replace(nodeList[0].ToPlainTextString().Trim().Replace(":", "").Replace(":", "").Replace(" ", ""), @"[\u4e00-\u9fa5]", "");
            page = int.Parse(pagerText.Substring(pagerText.IndexOf("/")).Replace("/", "").Trim());
        }
        catch
        {
            // Best effort: keep page = 1.
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?page=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellPadding", "5")));
        if (tableNodeList == null || tableNodeList.Count < 1)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are required.
                continue;
            }

            string beginDate = tr.Columns[2].ToPlainTextString().Trim();
            string prjName = tr.Columns[1].ToPlainTextString().Replace("•", "").Trim();

            NodeList rowLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
            if (aTag == null)
            {
                // Row without a detail link.
                continue;
            }
            string InfoUrl = "http://www.stjs.org.cn/zbtb/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellPadding", "4")));
            if (dtnode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            NodeList detailTables = dtnode.SearchFor(typeof(TableTag), true);
            TableTag detailTable = (detailTables != null && detailTables.Count > 0) ? detailTables[0] as TableTag : null;
            if (detailTable == null)
            {
                continue;
            }

            // Cells are consumed two at a time: cell k is the label, cell k + 1 (when present)
            // its value. Each pair becomes one "label:value" line.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int row = 0; row < detailTable.RowCount; row++)
            {
                TableRow r = detailTable.Rows[row];
                for (int k = 0; k < r.ColumnCount; k += 2)
                {
                    string label = r.Columns[k].ToPlainTextString().Trim();
                    string value = (k + 1 < r.ColumnCount) ? r.Columns[k + 1].ToPlainTextString().Trim() : string.Empty;
                    ctxBuilder.Append(label).Append(":").Append(value).Append("\r\n");
                }
            }
            string bidCtx = ctxBuilder.ToString();

            string code = bidCtx.GetCodeRegex().GetReplace("/");
            string buildUnit = Regex.Match(bidCtx, @"(招标人|建设单位)(:|:)[^\r\n]+\r\n").Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();
            string bidUnit = Regex.Match(bidCtx, @"中标单位(:|:)[^\r\n]+\r\n").Value.Replace("中标单位:", "").Replace("/", "").Trim();
            string bidMoney = bidCtx.GetMoneyRegex();

            // Keep only the part of the list title after the last ':'; a "工程名称" value from
            // the detail text, when present, takes precedence.
            string[] prjNames = prjName.Split(':');
            prjName = prjNames[prjNames.Length - 1];
            beginDate = beginDate.GetReplace(".", "-");
            string temp = bidCtx.GetRegex("工程名称", false);
            if (!string.IsNullOrWhiteSpace(temp))
            {
                prjName = temp;
            }

            string msgType = "汕头市建设工程交易中心";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "汕头市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; the previous "return null" discarded every record whenever
    // the crawl ran to completion without hitting MaxCount.
    return list;
}
/// <summary>
/// Crawls the paged list at <c>SiteUrl</c> (table with <c>cellPadding="5"</c>), downloads the
/// detail page linked from each row and builds one <c>InviteInfo</c> per row from the detail
/// table (<c>cellPadding="4"</c>). Links found under <c>td height="30"</c> cells of the detail
/// page are registered in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>InviteInfo</c> records (possibly empty, never null).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default, ref cookiestr);
    }
    catch
    {
        return list;
    }

    // Page count: whatever follows "1/" in the pager cell, up to the first "页".
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "34")));
    if (nodeList != null && nodeList.Count > 0)
    {
        Match pageMatch = Regex.Match(nodeList.AsString(), @"1/[^页]+");
        int parsedPages;
        if (int.TryParse(pageMatch.Value.Replace("1/", "").Replace("下一", ""), out parsedPages))
        {
            page = parsedPages;
        }
        else
        {
            page = 1;
        }
    }

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "?page=" + i.ToString()), Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellPadding", "5")));
        if (tableNodeList == null || tableNodeList.Count < 1)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        for (int j = 0; j < table.RowCount; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are required.
                continue;
            }

            string prjName = tr.Columns[1].ToPlainTextString().Replace("•", "").Trim();
            string beginDate = tr.Columns[2].ToPlainTextString().Replace(" ", "").Trim();

            NodeList rowLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
            if (aTag == null)
            {
                // Row without a detail link.
                continue;
            }
            string InfoUrl = "http://www.stjs.org.cn/zbtb/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace(" ", "");
            }
            catch
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellPadding", "4")));
            if (dtnode.Count < 1)
            {
                continue;
            }

            string HtmlTxt = dtnode.AsHtml();
            NodeList detailTables = dtnode.SearchFor(typeof(TableTag), true);
            TableTag detailTable = (detailTables != null && detailTables.Count > 0) ? detailTables[0] as TableTag : null;
            if (detailTable == null)
            {
                continue;
            }

            // Cells are consumed two at a time: cell k is the label, cell k + 1 (when present)
            // its value. Each pair becomes one "label:value" line.
            StringBuilder ctxBuilder = new StringBuilder();
            for (int row = 0; row < detailTable.RowCount; row++)
            {
                TableRow r = detailTable.Rows[row];
                for (int k = 0; k < r.ColumnCount; k += 2)
                {
                    string label = r.Columns[k].ToPlainTextString().Trim();
                    string value = (k + 1 < r.ColumnCount) ? r.Columns[k + 1].ToPlainTextString().Trim() : string.Empty;
                    ctxBuilder.Append(label).Append(":").Append(value).Append("\r\n");
                }
            }
            string inviteCtx = ctxBuilder.ToString();

            string code = inviteCtx.GetCodeRegex().GetReplace("/");
            string buildUnit = Regex.Match(inviteCtx, @"(招标人|建设单位)(:|:)[^\r\n]+\r\n").Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();

            // Strip both labels the pattern can match; previously "项目地址:" stayed in the value.
            string prjAddress = Regex.Match(inviteCtx, @"(工程地点|项目地址)(:|:)[^\r\n]+\r\n").Value.Replace("工程地点:", "").Replace("项目地址:", "").Trim();

            // Remove Office XML namespace residue from the text. Note that every ':' is removed
            // as well; the second chain is kept exactly as found.
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Replace(":", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Replace(":", "").Trim();

            string msgType = "汕头市建设工程交易中心";
            string specType = "建设工程";

            // Keep only the part of the list title after the last ':'; a "工程名称" value from
            // the detail text, when present, takes precedence.
            string[] prjNames = prjName.Split(':');
            prjName = prjNames[prjNames.Length - 1];
            beginDate = beginDate.GetReplace(".", "-");
            string temp = inviteCtx.GetRegex("工程名称", false);
            if (!string.IsNullOrWhiteSpace(temp))
            {
                prjName = temp;
            }
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "汕头市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Attachments: every anchor below a td with height="30".
            parserdetail.Reset();
            NodeList fileNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "30")), true), new TagNameFilter("a")));
            for (int f = 0; f < fileNode.Count; f++)
            {
                ATag fileTag = fileNode[f] as ATag;
                if (fileTag == null)
                {
                    continue;
                }
                BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, "http://www.stjs.org.cn/zbtb/" + fileTag.Link);
                base.AttachList.Add(attach);
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    // Return what was collected; the previous "return null" discarded every record whenever
    // the crawl ran to completion without hitting MaxCount.
    return list;
}
/// <summary>
/// Crawls the list pages addressed as <c>SiteUrl + pageNumber</c>, follows the link of every
/// <c>li</c> below <c>div.column-info-list</c> and builds one <c>BidInfo</c> per entry from the
/// detail page body. Anchors in the body that <c>IsAtagAttach()</c> accepts are registered
/// in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>BidInfo</c> records.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<BidInfo>();

    // The first list page is mandatory: without it an empty list is returned.
    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "1", Encoding.UTF8);
    }
    catch
    {
        return results;
    }

    // Page count is the text between "/" and "转到" of div.paging; it stays 1 when the
    // element is missing or the text cannot be parsed.
    int totalPages = 1;
    Parser listParser = new Parser(new Lexer(listHtml));
    NodeList pagingNodes = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "paging"), new TagNameFilter("div")));
    if (pagingNodes != null && pagingNodes.Count > 0)
    {
        string pagingText = pagingNodes[0].ToNodePlainString();
        try
        {
            pagingText = pagingText.GetRegexBegEnd("/", "转到");
            totalPages = int.Parse(pagingText);
        }
        catch
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl + pageNo, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        listParser = new Parser(new Lexer(listHtml));
        NodeList entries = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "column-info-list"), new TagNameFilter("div")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count < 1)
        {
            continue;
        }

        for (int entryIdx = 0; entryIdx < entries.Count; entryIdx++)
        {
            string endDate = string.Empty;
            string otherType = string.Empty;
            string prjMgr = string.Empty;

            // Title, detail URL and date all come from the list entry itself.
            ATag entryLink = entries[entryIdx].GetATag();
            string prjName = entryLink.LinkText.ToNodeString();
            string InfoUrl = "http://ggzy.zhaoqing.gov.cn" + entryLink.Link;
            string beginDate = entries[entryIdx].ToPlainTextString().GetDateRegex();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList bodyNodes = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (bodyNodes == null || bodyNodes.Count < 1)
            {
                continue;
            }

            string HtmlTxt = bodyNodes.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();
            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                // No winner in the running text: fall back to the first table with border="1",
                // or to the first table of any kind when there is none.
                detailParser = new Parser(new Lexer(HtmlTxt));
                NodeList tables = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("border", "1")));
                if (tables == null || tables.Count < 1)
                {
                    detailParser.Reset();
                    tables = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                }

                if (tables != null && tables.Count > 0)
                {
                    TableTag firstTable = tables[0] as TableTag;
                    if (firstTable.Rows[0].ColumnCount >= 2)
                    {
                        // Rows after the first are read as "column0:column1" lines.
                        StringBuilder tableText = new StringBuilder();
                        for (int rowIdx = 1; rowIdx < firstTable.RowCount; rowIdx++)
                        {
                            tableText.Append(firstTable.Rows[rowIdx].Columns[0].ToNodePlainString() + ":");
                            tableText.Append(firstTable.Rows[rowIdx].Columns[1].ToNodePlainString() + "\r\n");
                        }
                        string ctx = tableText.ToString();

                        bidUnit = ctx.GetBidRegex();
                        if (string.IsNullOrWhiteSpace(bidUnit))
                        {
                            bidUnit = ctx.GetRegex("单位名称,第一中标候选人");
                        }
                        bidMoney = ctx.GetMoneyRegex();
                        prjMgr = ctx.GetMgrRegex();
                    }
                }
            }

            string buildUnit = bidCtx.GetBuildRegex();
            // The address is extracted but is not one of the values handed to GenBidInfo.
            string prjAddress = bidCtx.GetAddressRegex();
            string code = bidCtx.GetCodeRegex();
            string msgType = "肇庆市公共资源交易中心";
            string specType = "建设工程";
            prjName = ToolDb.GetPrjName(prjName);
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            results.Add(info);

            // Attachments: anchors of the body accepted by IsAtagAttach().
            detailParser = new Parser(new Lexer(HtmlTxt));
            NodeList anchors = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors != null && anchors.Count > 0)
            {
                for (int anchorIdx = 0; anchorIdx < anchors.Count; anchorIdx++)
                {
                    ATag fileTag = anchors[anchorIdx] as ATag;
                    if (!fileTag.IsAtagAttach())
                    {
                        continue;
                    }

                    // Links containing "http" are used as-is; others are placed under
                    // SiteUrl + the first six characters of the dash-less begin date.
                    string url = fileTag.Link.Contains("http")
                        ? fileTag.Link
                        : this.SiteUrl + beginDate.GetReplace("-").Substring(0, 6) + fileTag.Link.GetReplace("./", "/");

                    BaseAttach item = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, url);
                    base.AttachList.Add(item);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the list at <c>SiteUrl</c> and its follow-up pages
/// (<c>http://www.szzdzb.cn/Product-index-id-8-p-N.html</c>), downloads the detail page of each
/// table row and builds one <c>InviteInfo</c> per row. Anchors whose link contains ".DOC"
/// (case-insensitive) inside the <c>bgColor="#CCCCCC"</c> table are registered in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>InviteInfo</c> records.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();

    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return results;
    }

    // Page count comes from the "1/N页" marker in the bgColor="#EEF4F9" cell; stays 1 on failure.
    int totalPages = 1;
    Parser listParser = new Parser(new Lexer(listHtml));
    NodeList pagerCells = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("bgColor", "#EEF4F9")));
    if (pagerCells != null && pagerCells.Count > 0)
    {
        string pagerText = pagerCells.AsString().Replace(" ", "").Replace(" ", "").Trim();
        try
        {
            string marker = Regex.Match(pagerText, @"1/[0-9]+页").Value;
            totalPages = int.Parse(marker.Split('/')[1].Replace("页", "").Trim());
        }
        catch (Exception)
        {
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://www.szzdzb.cn/Product-index-id-8-p-" + pageNo + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        listParser = new Parser(new Lexer(listHtml));
        NodeList listTables = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
        if (listTables == null || listTables.Count < 1)
        {
            continue;
        }

        TableTag listTable = listTables[0] as TableTag;
        // Row 0 is skipped (loop starts at 1).
        for (int rowIdx = 1; rowIdx < listTable.RowCount; rowIdx++)
        {
            // These values are never filled in by this crawler and are passed on empty.
            string buildUnit = string.Empty;
            string prjAddress = string.Empty;
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;
            string HtmlTxt = string.Empty;

            TableRow listRow = listTable.Rows[rowIdx];
            string code = listRow.Columns[0].ToPlainTextString().Trim();
            string prjName = listRow.Columns[1].ToPlainTextString().Trim();
            ATag rowLink = listRow.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
            string InfoUrl = "http://www.szzdzb.cn" + rowLink.Link;

            // The detail page is requested twice: once for the stored HTML fragment and once
            // for the text version, in which <br> becomes a line break and script/xml
            // fragments are removed.
            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").Trim();
                Parser fragmentParser = new Parser(new Lexer(detailHtml));
                NodeList fragmentNodes = fragmentParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
                HtmlTxt = fragmentNodes.AsHtml();

                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                detailHtml = Regex.Replace(detailHtml, @"<script[^<]*</script>|<\?xml[^/]*/>", "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList detailNodes = detailParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));

            // Plain text of the announcement: drop the print/close footer, decode entities and
            // collapse runs of line breaks or tabs into a single CRLF.
            string detailText = detailNodes.AsString().Replace("打印本页 || 关闭窗口", "");
            string inviteCtx = System.Web.HttpUtility.HtmlDecode(detailText);
            inviteCtx = Regex.Replace(inviteCtx, @"([\r\n]+)|([\t]+)", "\r\n");

            string publishedLine = Regex.Match(inviteCtx, @"发布时间:[^\r\n]+\r\n").Value;
            string beginDate = publishedLine.Replace("发布时间", "").Replace(":", "").Trim();

            string specType = "其他";
            string msgType = "深圳市振东招标代理有限公司";
            string inviteType = ToolHtml.GetInviteTypes(prjName);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            results.Add(info);

            // Attachments: anchors inside the bgColor="#CCCCCC" table whose link contains ".DOC".
            detailParser.Reset();
            detailNodes = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bgColor", "#CCCCCC")));
            NodeList fileLinks = detailNodes.SearchFor(typeof(ATag), true);
            if (fileLinks != null && fileLinks.Count > 0)
            {
                for (int linkIdx = 0; linkIdx < fileLinks.Count; linkIdx++)
                {
                    ATag fileLink = fileLinks[linkIdx] as ATag;
                    if (!fileLink.Link.ToUpper().Contains(".DOC"))
                    {
                        continue;
                    }
                    BaseAttach attach = ToolDb.GenBaseAttach(fileLink.ToPlainTextString(), info.Id, "http://www.szzdzb.cn" + fileLink.Link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged ASP.NET grid <c>AxGridView1</c> at <c>SiteUrl</c>, downloads the detail
/// page linked from each row and builds one <c>InviteInfo</c> per row. Two detail layouts are
/// handled: a table with <c>cellSpacing="1"</c> (which can also carry download links that are
/// registered in <c>AttachList</c>) and, when that table is absent, a <c>span id="nr"</c>
/// holding free text.
/// </summary>
/// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> records were collected.</param>
/// <returns>The collected <c>InviteInfo</c> records (possibly empty, never null).</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
        htl = Regex.Replace(htl, @"<script[^<]*</script>", "");
    }
    catch (Exception)
    {
        return list;
    }

    // Page count from the "共N页" marker of the pager cell. A missing or unparsable marker
    // leaves the default of one page instead of throwing out of the crawl.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("colSpan", "6")));
    if (nodeList != null && nodeList.Count > 0)
    {
        Match pageMatch = Regex.Match(nodeList.AsString(), @"共\d+页");
        int parsedPages;
        if (pageMatch.Success && int.TryParse(pageMatch.Value.Trim(new char[] { '共', '页' }), out parsedPages))
        {
            page = parsedPages;
        }
    }

    // Inclusive upper bound: the former "i < page" skipped the last page and crawled nothing
    // at all when only one page exists.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // Replay the pager post-back; the page list value sent for page i is i - 1.
            string viewState = this.ToolWebSite.GetAspNetViewState(htl);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "key", "AxGridView1$ctl23$ctl07", "AxGridView1$ctl23$pageList", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION" },
                new string[] { "AxGridView1$ctl23$ctl03", string.Empty, viewState, string.Empty, "20", (i - 1).ToString(), string.Empty, eventValidation });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "AxGridView1")));
        if (tableNodeList == null || tableNodeList.Count < 1)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        // The first and the last row of the grid are not data rows and are skipped.
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;
            string prjAddress, beginDate, buildUnit, msgType, specType, inviteType, inviteCtx, HtmlTxt;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 6)
            {
                // Columns 2, 3 and 5 are read below.
                continue;
            }

            string code = tr.Columns[2].ToPlainTextString().Trim();
            string prjName = tr.Columns[3].ToPlainTextString().Trim();

            NodeList rowLinks = tr.Columns[5].SearchFor(typeof(ATag), true);
            ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
            if (aTag == null)
            {
                // Row without a detail link.
                continue;
            }
            string InfoUrl = "http://www.yjgcjy.cn/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace(" ", "");
            }
            catch (Exception)
            {
                Logger.Error("InviteYJYXJS");
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
            if (dtnode != null && dtnode.Count > 0)
            {
                // Layout 1: tabular detail page. Every row after the first becomes one line made
                // of its concatenated cell texts.
                HtmlTxt = dtnode.AsHtml();
                TableTag detailTable = (TableTag)dtnode[0];
                StringBuilder ctxBuilder = new StringBuilder();
                for (int k = 1; k < detailTable.RowCount; k++)
                {
                    TableRow trow = detailTable.Rows[k];
                    for (int c = 0; c < trow.ColumnCount; c++)
                    {
                        ctxBuilder.Append(trow.Columns[c].ToPlainTextString().Trim());
                    }
                    ctxBuilder.Append("\r\n");
                }
                inviteCtx = ctxBuilder.ToString();

                // Address with punctuation removed; empty or over-long values get a placeholder.
                prjAddress = Regex.Match(inviteCtx, @"工程建设地址:[^\r\n]+\r\n").Value.Replace("工程建设地址", "").Replace(":", "").Replace("。", "").Replace("、", "").Replace(";", "").Replace(",", "").Trim();
                if (Encoding.Default.GetByteCount(prjAddress) > 200 || prjAddress == "")
                {
                    prjAddress = "见招标详细信息";
                }

                // Publication date: a digit date on the "公告发布时间" line is preferred.
                beginDate = Regex.Match(inviteCtx, @"公告发布时间:[^\r\n]+[\r\n]{1}").Value.Replace("公告发布时间:", "").Trim();
                string date = beginDate.Replace(" ", "").Trim();
                beginDate = Regex.Match(date, @"\d{4}年\d{1,2}月\d{1,2}日").Value.Trim();
                if (beginDate == "")
                {
                    // NOTE(review): "[u4e00-u9fa5]" has no backslashes, so it is a class of
                    // literal characters rather than a CJK range. Kept unchanged; confirm intent.
                    beginDate = Regex.Match(inviteCtx, @"[u4e00-u9fa5]{4}年[u4e00-u9fa5]{1,2}月[u4e00-u9fa5]{1,2}日").Value.Replace("公告发布时间:", "").Trim();
                }

                buildUnit = Regex.Match(inviteCtx, @"建设单位:[^\r\n]+[\r\n]{1}").Value.Replace("建设单位:", "").Trim();
                msgType = "阳江市建设工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                // Remove Office XML namespace residue and the comment link text.
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("xml:namespace prefix = st1", "").Trim();
                inviteCtx = inviteCtx.Replace("点击进入留言", "").Trim();
                code = code.Replace(";", "").Replace(":", "").Trim();

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "阳江市区", "阳西县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                // Attachments: rows from index 10 on whose first cell carries one of the
                // download labels; the links sit in the second cell.
                parserdetail.Reset();
                NodeList fileNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
                if (fileNode != null && fileNode.Count > 0 && fileNode[0] is TableTag)
                {
                    TableTag fileTable = fileNode[0] as TableTag;
                    for (int f = 10; f < fileTable.RowCount; f++)
                    {
                        TableRow fileRow = fileTable.Rows[f];
                        if (fileRow.ColumnCount < 2)
                        {
                            continue;
                        }

                        string label = fileRow.Columns[0].ToPlainTextString().Trim();
                        bool isDownloadRow = label.Contains("下载招标文件:") || label.Contains("下载工程量清单:") || label.Contains("下载图纸:");
                        if (!isDownloadRow || fileRow.Columns[1].ToPlainTextString().Trim() == "")
                        {
                            continue;
                        }

                        NodeList fileLinks = fileRow.Columns[1].SearchFor(typeof(ATag), true);
                        for (int ii = 0; ii < fileLinks.Count; ii++)
                        {
                            ATag aTagCh = fileLinks[ii] as ATag;
                            if (aTagCh == null)
                            {
                                continue;
                            }

                            string linkText = aTagCh.ToPlainTextString().Trim();
                            if (linkText == "")
                            {
                                continue;
                            }

                            // Links already containing the site root are used unchanged.
                            string urlValues = aTagCh.Link.Contains("http://www.yjgcjy.cn") ? aTagCh.Link : "http://www.yjgcjy.cn" + aTagCh.Link;
                            BaseAttach attach = ToolDb.GenBaseAttach(linkText, info.Id, urlValues);
                            base.AttachList.Add(attach);
                        }
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
            else
            {
                // Layout 2: free-text detail page inside span#nr. The code from the list row is
                // not used for these records.
                code = "";
                Parser parserdetailtwo = new Parser(new Lexer(htmldetail));
                NodeList dtnodetwo = parserdetailtwo.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "nr")));
                if (dtnodetwo == null || dtnodetwo.Count < 1)
                {
                    continue;
                }

                HtmlTxt = dtnodetwo.AsHtml();
                inviteCtx = dtnodetwo.AsString().Replace("。", "").Trim();
                // "O" is temporarily mapped to "〇" for date extraction and mapped to "0" again below.
                inviteCtx = Regex.Replace(inviteCtx, @"<script[^<]*</script>|<\?xml[^/]*/>", "").Replace("O", "〇");

                prjAddress = Regex.Match(inviteCtx, @"(工程建设地点|工程地点):[^\r\n]+\r\n").Value.Replace("工程建设地点", "").Replace("工程地点", "").Replace(":", "").Trim();
                if (prjAddress == "")
                {
                    prjAddress = "见招标详细信息";
                }

                // NOTE(review): "[^u4e00-u9fa5]" has no backslashes either; kept unchanged. The
                // matched text is converted by returnS before being stored.
                beginDate = Regex.Match(inviteCtx, @"[^u4e00-u9fa5]{4}年[^u4e00-u9fa5]{1,3}月[^u4e00-u9fa5]{1,3}日").Value.Trim();
                beginDate = returnS(beginDate);

                buildUnit = Regex.Match(inviteCtx, @"发包人:[^\r\n]+[\r\n]{1}").Value.Replace("发包人:", "").Trim();
                msgType = "阳江市建设工程交易中心";
                specType = "建设工程";
                inviteType = ToolHtml.GetInviteTypes(prjName);

                // Remove Office XML namespace residue and the comment link text.
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
                inviteCtx = inviteCtx.Replace("xml:namespace prefix = st1", "").Trim();
                inviteCtx = inviteCtx.Replace("点击进入留言", "").Trim();
                inviteCtx = inviteCtx.Replace("〇", "0");

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "阳江市区", "阳西县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return list;
                }
            }
        }
    }

    // Return what was collected; the previous "return null" discarded every record whenever
    // the crawl ran to completion without hitting MaxCount.
    return list;
}