/// <summary>
/// Gets the text located between two delimiter strings, as extracted by
/// <c>ToolHtml.GetRegexString</c>, and discards it when it is too long.
/// </summary>
/// <param name="value">Text to search.</param>
/// <param name="strBegin">Start delimiter handed to <c>ToolHtml.GetRegexString</c>.</param>
/// <param name="strEnd">End delimiter handed to <c>ToolHtml.GetRegexString</c>.</param>
/// <param name="len">Largest accepted result size, measured in bytes under <c>Encoding.Default</c>.</param>
/// <returns>
/// The extracted text, or <c>string.Empty</c> when its byte count exceeds <paramref name="len"/>.
/// </returns>
public static string GetRegexBegEnd(this string value, string strBegin, string strEnd, int len = 150)
{
    string extracted = ToolHtml.GetRegexString(value, strBegin, strEnd);
    bool fitsLimit = Encoding.Default.GetByteCount(extracted) <= len;
    return fitsLimit ? extracted : string.Empty;
}
/// <summary>
/// Matches a code/number field in the given text.
/// </summary>
/// <param name="value">Text to match against.</param>
/// <param name="code">
/// Patterns handed to <c>ToolHtml.GetRegexString</c>; when <c>null</c>, <c>ToolHtml.CodeRegex</c> is used.
/// </param>
/// <param name="isMon">Match mode: whether the colon is included (forwarded to <c>ToolHtml.GetRegexString</c>).</param>
/// <param name="len">Largest accepted result size, measured in bytes under <c>Encoding.Default</c>.</param>
/// <returns>The matched text, or <c>string.Empty</c> when its byte count exceeds <paramref name="len"/>.</returns>
public static string GetCodeRegex(this string value, string[] code = null, bool isMon = true, int len = 50)
{
    string matched = code != null
        ? ToolHtml.GetRegexString(value, code, isMon)
        : ToolHtml.GetRegexString(value, ToolHtml.CodeRegex, isMon);

    if (Encoding.Default.GetByteCount(matched) > len)
    {
        return string.Empty;
    }

    return matched;
}
/// <summary>
/// Gets the raw amount-of-money text from the given text (no numeric conversion is applied here).
/// </summary>
/// <param name="value">Text to match against.</param>
/// <param name="money">
/// Patterns handed to <c>ToolHtml.GetRegexString</c>; when <c>null</c>, <c>ToolHtml.MoneyRegex</c> is used.
/// </param>
/// <param name="isMon">Match mode flag forwarded to <c>ToolHtml.GetRegexString</c>.</param>
/// <param name="mon">Not read by this method; kept so existing callers keep compiling.</param>
/// <param name="len">Largest accepted result size, measured in bytes under <c>Encoding.Default</c>.</param>
/// <returns>The matched text, or <c>string.Empty</c> when its byte count exceeds <paramref name="len"/>.</returns>
public static string GetMoneyString(this string value, string[] money = null, bool isMon = false, string mon = "万", int len = 100)
{
    string matched = money != null
        ? ToolHtml.GetRegexString(value, money, isMon)
        : ToolHtml.GetRegexString(value, ToolHtml.MoneyRegex, isMon);

    if (Encoding.Default.GetByteCount(matched) > len)
    {
        return string.Empty;
    }

    return matched;
}
/// <summary>
/// Matches the winning-bidder field in the given text.
/// </summary>
/// <param name="value">Text to match against.</param>
/// <param name="bid">
/// Patterns handed to <c>ToolHtml.GetRegexString</c>; when <c>null</c>, <c>ToolHtml.BidRegex</c> is used.
/// </param>
/// <param name="isMon">Match mode: whether the colon is included (forwarded to <c>ToolHtml.GetRegexString</c>).</param>
/// <param name="len">Largest accepted result size, measured in bytes under <c>Encoding.Default</c>.</param>
/// <returns>The matched text, or <c>string.Empty</c> when its byte count exceeds <paramref name="len"/>.</returns>
public static string GetBidRegex(this string value, string[] bid = null, bool isMon = true, int len = 150)
{
    string matched = bid != null
        ? ToolHtml.GetRegexString(value, bid, isMon)
        : ToolHtml.GetRegexString(value, ToolHtml.BidRegex, isMon);

    if (Encoding.Default.GetByteCount(matched) > len)
    {
        return string.Empty;
    }

    return matched;
}
/// <summary>
/// Matches an amount of money in the given text and passes it through <c>ToolHtml.GetRegexMoney</c>.
/// </summary>
/// <param name="value">Text to match against.</param>
/// <param name="money">
/// Patterns handed to <c>ToolHtml.GetRegexString</c>; when <c>null</c>, <c>ToolHtml.MoneyRegex</c> is used.
/// </param>
/// <param name="isMon">Match mode: whether the colon is included (forwarded to <c>ToolHtml.GetRegexString</c>).</param>
/// <param name="mon">Unit argument forwarded to <c>ToolHtml.GetRegexMoney</c>.</param>
/// <param name="len">Largest accepted result size, measured in bytes under <c>Encoding.Default</c>.</param>
/// <param name="lastStr">Terminator argument forwarded to <c>ToolHtml.GetRegexString</c>.</param>
/// <returns>
/// The value produced by <c>ToolHtml.GetRegexMoney</c>, or <c>string.Empty</c> when its byte count
/// exceeds <paramref name="len"/>.
/// </returns>
public static string GetMoneyRegex(this string value, string[] money = null, bool isMon = false, string mon = "万", int len = 100, string lastStr = "\r\n")
{
    string rawMatch = money != null
        ? ToolHtml.GetRegexString(value, money, isMon, lastStr)
        : ToolHtml.GetRegexString(value, ToolHtml.MoneyRegex, isMon, lastStr);

    string amount = ToolHtml.GetRegexMoney(rawMatch, mon);
    if (Encoding.Default.GetByteCount(amount) > len)
    {
        return string.Empty;
    }

    return amount;
}
/// <summary>
/// Crawls the bid-result listing that starts at <c>SiteUrl</c> and continues on the paged
/// www.gy-center.net list, downloads every announcement's detail page and turns it into a
/// <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; an empty list when the first listing page cannot be downloaded.
/// </returns>
/// <remarks>Attachment links found in a detail page are added to <c>base.AttachList</c>.</remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Read the total page count from the pager ("…/N页"); fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
    if (noList != null && noList.Count > 0)
    {
        string temp = noList.AsString();
        try
        {
            Regex reg = new Regex(@"/[^页]+页");
            string result = reg.Match(temp).Value.Replace("页", "").Replace("/", "");
            pageInt = Convert.ToInt32(result);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=97&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (dtlList == null || dtlList.Count == 0)
        {
            continue;
        }

        // NOTE(review): the last <li> is never processed (Count - 1) — kept as in the original; confirm it is not a data row.
        for (int j = 0; j < dtlList.Count - 1; j++)
        {
            // These three are never populated by this crawler but are still passed on below.
            string endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty;

            string temp = dtlList[j].ToPlainTextString();
            string tempHtl = dtlList[j].ToHtml();
            string prjName = ToolHtml.GetHtmlAtagValue("title", tempHtl);
            string beginDate = ToolHtml.GetRegexDateTime(temp);
            string infoUrl = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", tempHtl);

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
                htlDtl = Regex.Replace(htlDtl, "(<script)[\\s\\S]*?(</script>)", "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList htlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
            if (htlList == null || htlList.Count == 0)
            {
                continue;
            }

            string htmlTxt = htlList.ToHtml();
            string bidCtx = Regex.Replace(htmlTxt, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t\t", "").Replace("\r\r", "\r").Replace("\n\n", "\n");
            string bidType = ToolHtml.GetInviteTypes(prjName);

            // Flatten the first "MsoNormalTable" into "header:value" lines (row 0 = headers,
            // row 1 = values) so the bidder regex can be retried on it further down.
            string bidStr = string.Empty;
            parser = new Parser(new Lexer(htmlTxt));
            NodeList bidList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
            if (bidList != null && bidList.Count > 0)
            {
                try
                {
                    TableTag tab = bidList[0] as TableTag;
                    if (tab.RowCount > 1)
                    {
                        int headerCols = tab.Rows[0].ColumnCount;
                        if (headerCols > 1)
                        {
                            // At most the first seven columns are used; tables with a single column are ignored.
                            int pairCount = Math.Min(headerCols, 7);
                            for (int c = 0; c < pairCount; c++)
                            {
                                bidStr += tab.Rows[0].Columns[c].ToPlainTextString().ToNodeString() + ":" + tab.Rows[1].Columns[c].ToPlainTextString().ToNodeString() + "\r\n";
                            }
                        }
                    }
                }
                catch
                {
                    // Best effort: an irregular table just leaves bidStr with whatever was built so far.
                }
            }

            string buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex);
            string prjAddress = ToolHtml.GetRegexString(bidCtx, ToolHtml.AddressRegex);
            string code = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex);

            // Winning bidder: page text first, then the flattened table, then the "确认 … 为" phrase.
            string bidUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex);
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ToolHtml.GetRegexString(bidStr.Replace(" ", ""), ToolHtml.BidRegex, false);
            }
            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetRegexBegEnd("确认", "为");
            }

            string bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex);
            bidMoney = ToolHtml.GetRegexMoney(bidMoney);
            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetRegexBegEnd("¥", "元").GetMoney();
            }

            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            code = ToolHtml.GetSubString(code, 50);
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);
            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            buildUnit = ToolHtml.GetStringTemp(buildUnit);
            if (string.IsNullOrEmpty(code))
            {
                code = "见中标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见中标信息";
            }

            string specType = "其他";
            string msgType = "工网在线";
            BidInfo info = ToolDb.GenBidInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            // Register every attachment-looking link of the detail page.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeAtag != null && nodeAtag.Count > 0)
            {
                for (int c = 0; c < nodeAtag.Count; c++)
                {
                    ATag a = nodeAtag[c] as ATag;
                    if (a == null)
                    {
                        // Not an anchor tag instance; skipping avoids a NullReferenceException that would abort the crawl.
                        continue;
                    }
                    if (a.Link.IsAtagAttach())
                    {
                        // NOTE(review): this host differs from the gy-center.net pages crawled above — confirm it is intended.
                        string alink = "http://www.bidding.csg.cn/" + a.Link;
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", ""), info.Id, alink);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-invitation listing that starts at <c>SiteUrl</c> and continues on the paged
/// www.sz-otc.com list, downloads every announcement's detail page and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as 20 records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page cannot be
/// downloaded or contains no pager. Never <c>null</c>.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fenye123")));
    if (tdNodes == null || tdNodes.Count == 0)
    {
        // Without a pager element nothing is crawled (same as the original control flow).
        return list;
    }

    string pageTemp = tdNodes.AsString().Replace(" ", "").Trim();
    try
    {
        pageInt = int.Parse(ToolHtml.GetRegexString(pageTemp, "共", "页"));
    }
    catch (Exception)
    {
        // Unparseable pager text: keep the default of a single page.
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.sz-otc.com/zhaobiao/index_" + i.ToString()) + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "zhaobiao_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            Parser ul = new Parser(new Lexer(nodeList[j].ToHtml()));
            NodeList liList = ul.ExtractAllNodesThatMatch(new TagNameFilter("li"));
            if (liList == null || liList.Count == 0)
            {
                continue;
            }

            // The k-th anchor found under the <li> list is paired with the k-th <li>.
            // Searched once here instead of once per item.
            NodeList anchors = liList.SearchFor(typeof(ATag), true);

            for (int k = 0; k < liList.Count; k++)
            {
                if (anchors == null || k >= anchors.Count)
                {
                    // Fewer anchors than list items: nothing left to pair on this list.
                    break;
                }
                ATag aTag = anchors[k] as ATag;
                if (aTag == null)
                {
                    continue;
                }

                // These fields are never populated by this crawler but are still passed on below.
                string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

                string infoUrl = "http://www.sz-otc.com" + aTag.Link;
                string prjName = aTag.LinkText.Replace("[新]", "").Replace(" ", "");
                int closingBracket = prjName.IndexOf("]");
                if (closingBracket >= 0)
                {
                    // Drop a leading "[…]" tag from the title.
                    prjName = prjName.Substring(closingBracket + 1);
                }

                string htmldetail;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
                }
                catch
                {
                    // A single unreachable detail page must not discard the records collected so far
                    // (the original returned null here).
                    continue;
                }

                Parser dtlparser = new Parser(new Lexer(htmldetail));
                NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                string htmlTxt = dtnode.ToHtml();
                string inviteCtx = dtnode.AsString().Replace(" ", "").Replace(" ", "");
                // Variant of the text with line breaks forced before "点击"/"发布人" so that the
                // line-based regexes for code and date terminate.
                string invite = inviteCtx.Replace("点击", "\r\n").Replace("发布人", "\r\n");

                if (string.IsNullOrEmpty(prjName))
                {
                    Regex regexName = new Regex(@"(工程名称|项目名称)(:|:)[^\r\n]+\r\n");
                    prjName = regexName.Match(inviteCtx).Value.Replace("工程名称", "").Replace("项目名称", "").Replace(":", "").Replace(":", "").Trim();
                }

                Regex regex = new Regex(@"(工程编号|招标编号)(:|:)[^\r\n]+\r\n");
                string code = regex.Match(invite).Value.Replace("工程编号", "").Replace("招标编号", "").Replace(":", "").Replace(":", "").Trim();

                // The longer label must be stripped first; stripping "地址" first turned
                // "项目地址" into a stray "项目" prefix on the address.
                Regex regexAddress = new Regex(@"(地址|项目地址)(:|:)[^\r\n]+\r\n");
                string prjAddress = regexAddress.Match(inviteCtx).Value.Replace("项目地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();

                Regex regexUnit = new Regex(@"(招标单位|招标机构)(:|:)[^\r\n]+\r\n");
                string buildUnit = regexUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标机构", "").Replace(":", "").Replace(":", "").Trim();

                Regex regexCar = new Regex(@"(开始日期|发布日期)(:|:)[^\r\n]+\r\n");
                string beginDate = regexCar.Match(invite).Value.Replace("开始日期", "").Replace("发布日期", "").Replace(":", "").Replace(":", "").Trim();
                if (beginDate.Length > 10)
                {
                    // Keep only the first ten characters of the matched date text.
                    beginDate = beginDate.Substring(0, 10);
                }

                string specType = "其他";
                string msgType = "深圳市东方招标有限公司";
                string inviteType = ToolHtml.GetInviteTypes(prjName);

                InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
                list.Add(info);

                // NOTE(review): fixed limit of 20 is kept from the original — confirm whether MaxCount was intended.
                if (!crawlAll && list.Count >= 20)
                {
                    return list;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the bid-result listing that starts at <c>SiteUrl</c> and continues on the paged
/// www.szlhxq.gov.cn list, downloads every announcement's detail page and turns it into a
/// <c>BidInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Read the total page count from the pager ("…/N页"); fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "yesh fl")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.szlhxq.gov.cn/mzbsc/zwgk69/cgzb/zbgg21/14844-" + i.ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news1_list")), true), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            // These fields are never populated by this crawler but are still passed on below.
            string endDate = string.Empty, otherType = string.Empty;

            string beginDate = viewList[j].ToNodePlainString().GetDateRegex();
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                // List item without a link: skip it instead of failing the whole crawl.
                continue;
            }
            string prjName = aTag.GetAttribute("title");
            string infoUrl = "http://www.szlhxq.gov.cn" + aTag.Link;

            string htmDtl;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tit-content")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtl.AsHtml();

            // Plain-text version of the detail block: scripts removed, <br> turned into line
            // breaks, remaining tags stripped, then whitespace and blank lines collapsed.
            // The Replace chain is order-sensitive and is kept exactly as it was.
            string bidCtx = Regex.Replace(htmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
            bidCtx = Regex.Replace(bidCtx.Replace("<br/>", "\r\n").Replace("<BR/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n"), "<[^>]*>", "")
                .Replace(" ", "").Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\t", "")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string code = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex, true, 50);
            string buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true, 150);
            string bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex, false);
            string bidUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex, true, 150);
            string prjMgr = ToolHtml.GetRegexString(bidCtx, ToolHtml.MgrRegex, true, 50);
            bidMoney = ToolHtml.GetRegexMoney(bidMoney);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区民治街道办事处";
            }

            string msgType = "深圳市龙华新区民治街道办事处";
            string specType = "建设工程";
            // The bid type is fixed for this source; the title-derived type the original computed
            // first was always overwritten with this value, so that dead computation was removed.
            string bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-invitation listing that starts at <c>SiteUrl</c> and continues on the paged
/// www.bidding.csg.cn list, downloads every announcement's detail page and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page cannot be downloaded.
/// </returns>
/// <remarks>Attachment links found in a detail page are added to <c>base.AttachList</c>.</remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Read the total page count from the pager ("…/N页"); fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Top10 TxtCenter")));
    if (noList != null && noList.Count > 0)
    {
        string temp = noList.AsString().GetRegexBegEnd("/", "页");
        try
        {
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://www.bidding.csg.cn/zbgg/index_" + i.ToString() + ".jhtml", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "W750 Right")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        // NOTE(review): iteration starts at index 1, so the first <li> is never processed — kept as in the original; confirm it is not a data row.
        for (int j = 1; j < nodeList.Count; j++)
        {
            // These fields are never populated by this crawler but are still passed on below.
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            ATag aTag = nodeList[j].GetATag();
            if (aTag == null)
            {
                // List item without a link: skip it instead of failing the whole crawl.
                continue;
            }
            string prjName = aTag.LinkText;
            string beginDate = nodeList[j].ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.bidding.csg.cn" + aTag.Link;

            string htlDtl;
            try
            {
                // NOTE(review): uses the static ToolHtml.GetHtmlByUrl while the list pages use this.ToolWebSite — kept as is.
                htlDtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Center W1000")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();

            // Prefer the detail page's own <h1> title over the link text of the list.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList nameNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("class", "TxtCenter Padding10")));
            if (nameNode != null && nameNode.Count > 0)
            {
                prjName = nameNode[0].ToNodePlainString();
            }

            string inviteCtx = htmlTxt.ToCtxString();
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
            string buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
            string code = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            code = ToolHtml.GetSubString(code, 50);
            if (string.IsNullOrEmpty(code))
            {
                code = "见招标信息";
            }
            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见招标信息";
            }

            string specType = "其他";
            string msgType = "中国南方电网有限责任公司招标服务中心";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "中国南方电网有限责任公司招标服务中心";
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);

            // Register every attachment-looking link of the detail page.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (nodeAtag != null && nodeAtag.Count > 0)
            {
                for (int c = 0; c < nodeAtag.Count; c++)
                {
                    ATag a = nodeAtag[c] as ATag;
                    if (a == null)
                    {
                        // Not an anchor tag instance; skipping avoids a NullReferenceException that would abort the crawl.
                        continue;
                    }
                    if (a.Link.IsAtagAttach())
                    {
                        string alink = "http://www.bidding.csg.cn/" + a.Link;
                        try
                        {
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                            base.AttachList.Add(attach);
                        }
                        catch
                        {
                            // Best effort: a single attachment that cannot be registered is skipped.
                        }
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-invitation listing at <c>SiteUrl</c> (paged through the <c>pageNum</c> query
/// parameter), downloads every announcement's detail page from www.gdnx.gov.cn and turns it into
/// an <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page cannot be downloaded.
/// </returns>
/// <remarks>Attachment links found in a detail page are added to <c>base.AttachList</c>.</remarks>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // Read the total page count from the pager ("共N页"); fall back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "right")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // "共有" is removed first so that the pattern below only sees the "共…页" part.
            string temp = sNode.AsString().Replace("共有", "");
            Regex reg = new Regex(@"共[^+]+页");
            string page = reg.Match(temp).Value.Replace("共", "").Replace("页", "");
            pageInt = Convert.ToInt32(page);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl) + "&pageNum=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "97%")));

        // The listing is the third matching table; the original only checked for "at least one"
        // before indexing [2], which failed on pages with fewer tables.
        if (nodeList == null || nodeList.Count <= 2)
        {
            continue;
        }
        TableTag table = nodeList[2] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            // These fields are never populated by this crawler but are still passed on below.
            string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 3)
            {
                // Columns 1 (title/link) and 2 (date) are required; skip rows that lack them.
                continue;
            }

            string prjName = tr.Columns[1].ToPlainTextString().ToNodeString();
            string inviteType = prjName.GetInviteBidType();
            string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
            // The detail path is the text between the first two commas of the link's onclick value, with quotes removed.
            string infoUrl = "http://www.gdnx.gov.cn" + ToolHtml.GetRegexString(tr.Columns[1].ToHtml().GetATagValue("onclick"), ",", ",").Replace("'", "");

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default);
                htlDtl = htlDtl.GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "90%")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlList.ToHtml();
            string inviteCtx = htmlTxt.ToCtxString();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex();
            string prjAddress = inviteCtx.GetAddressRegex();
            string msgType = "韶关市南雄住房和城乡规划建设局";
            string specType = "建设工程";

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "韶关市区", "南雄市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);

            // Register every attachment-looking link of the detail page.
            parser = new Parser(new Lexer(htmlTxt));
            NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aList != null && aList.Count > 0)
            {
                for (int c = 0; c < aList.Count; c++)
                {
                    ATag a = aList[c] as ATag;
                    if (a == null)
                    {
                        // Not an anchor tag instance; skipping avoids a NullReferenceException that would abort the crawl.
                        continue;
                    }
                    if (a.Link.IsAtagAttach())
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace(" ", "").Replace(";", "").Replace(";", ""), info.Id, a.Link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender-invitation listing that starts at <c>SiteUrl</c> and continues on the paged
/// market.meizhou.gov.cn list, downloads every announcement's detail page and turns it into an
/// <c>InviteInfo</c> record.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
/// </param>
/// <returns>
/// The collected <c>InviteInfo</c> records; an empty list when the first listing page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    // One instance is enough; the same pattern strips inline scripts from every downloaded page.
    Regex scriptRegex = new Regex(@"<script[^<]*</script>");

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
        htl = scriptRegex.Replace(htl, "");
    }
    catch (Exception)
    {
        return list;
    }

    // Read the total page count from the pager ("共N页"); keep the default of one page on failure.
    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "right")));
    Regex regexPage = new Regex(@"共\d+页");
    try
    {
        page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
    }
    catch (Exception)
    {
    }

    // Inclusive upper bound: with "i < page" a single-page listing was never crawled at all
    // and the last page of a multi-page listing was always skipped.
    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl("http://market.meizhou.gov.cn/deptWebsiteAction.do?action=secondIndex&deptId=1925&issueTypeCode=009002002&issueTypeName=各县(市)招标公告&showSubNodeflag=1&pageNum=" + i.ToString(), Encoding.Default);
                htl = scriptRegex.Replace(htl, "");
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "1")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableNodeList[0];
        for (int j = 0; j < table.RowCount; j++)
        {
            // These fields are never populated by this crawler but are still passed on below.
            string endDate = string.Empty, remark = string.Empty;
            string inviteCtx = string.Empty, inviteType = string.Empty, otherType = string.Empty;

            TableRow tr = table.Rows[j];
            if (tr.ColumnCount < 2)
            {
                // Columns 0 (title/link) and 1 (date) are required; skip rows that lack them.
                continue;
            }

            string prjName = tr.Columns[0].ToPlainTextString().Trim();
            string beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());

            parser = new Parser(new Lexer(tr.Columns[0].ToHtml()));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            string aLink = aNode.ToHtml().ToLower();

            // The file path is the second comma-separated argument inside the link's "(…)" call.
            string infoUrl;
            try
            {
                string strValue1 = aLink.Substring(aLink.IndexOf("("), aLink.Length - aLink.IndexOf("("));
                string strValue2 = strValue1.Remove(strValue1.IndexOf(")"));
                string[] strValue3 = strValue2.Split(',');
                string strValue4 = strValue3[1];
                infoUrl = "http://market.meizhou.gov.cn/website/deptwebsite/1925/Content.jsp?issueId=17039&msgType=00&filePath=" + strValue4.Replace("'", "");
            }
            catch
            {
                continue;
            }

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.Default).Replace(" ", "");
                htmldetail = scriptRegex.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                Logger.Error("InviteMeiZhouZhouXian");
                continue;
            }

            // Content paragraphs: <P class="MsoNormal">, falling back to <div class="WordSection1">.
            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("P"), new HasAttributeFilter("class", "MsoNormal")));
            if (dtnode == null || dtnode.Count < 1)
            {
                parserdetail = new Parser(new Lexer(htmldetail));
                dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "WordSection1")));
            }

            // Null must be tested before Count is read.
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtnode.AsHtml();
            for (int k = 0; k < dtnode.Count; k++)
            {
                string tr1 = dtnode[k].ToPlainTextString().Replace(" ", "").Trim();
                if (k == 0)
                {
                    // The invitation type is derived from the first paragraph.
                    inviteType = ToolHtml.GetInviteTypes(tr1);
                }
                inviteCtx += tr1 + ":" + "\r\n";
            }

            string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
            string buildUnit = ToolHtml.GetRegexString(inviteCtx.Replace("(盖章)", ""), ToolHtml.BuildRegex);
            if (buildUnit != "" && buildUnit.Contains(":"))
            {
                // Keep only the part in front of the first colon.
                buildUnit = buildUnit.Remove(buildUnit.IndexOf(":"));
            }
            string code = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);

            // Map the free-text "工程类型" line onto a fixed category.
            Regex regoType = new Regex(@"工程类型:[^\r\n]+\r\n");
            string oType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建"))
            {
                otherType = "房建及工业民用建筑";
            }
            else if (oType.Contains("市政"))
            {
                otherType = "市政工程";
            }
            else if (oType.Contains("园林绿化"))
            {
                otherType = "园林绿化工程";
            }
            else if (oType.Contains("装饰") || oType.Contains("装修"))
            {
                otherType = "装饰装修工程";
            }
            else if (oType.Contains("电力"))
            {
                otherType = "电力工程";
            }
            else if (oType.Contains("水利"))
            {
                otherType = "水利工程";
            }
            // Evaluated independently of the chain above, so "环保" overrides any earlier match.
            if (oType.Contains("环保"))
            {
                otherType = "环保工程";
            }

            if (buildUnit.Contains("梅州市建设工程交易中心"))
            {
                buildUnit = "";
            }

            string msgType = "梅州市建设工程交易中心";
            string specType = "建设工程";

            // Strip Office XML namespace residue from the text; the sequence is kept exactly as it was.
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim();
            inviteCtx = inviteCtx.Replace("?xml:namespaceprefix=o/>", "").Trim();
            inviteCtx = inviteCtx.Replace("<?xml:namespaceprefix=st1/>", "").Trim();

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "梅州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, infoUrl, htmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the listing at <c>SiteUrl</c> page by page and builds one record per list entry:
/// a <c>BidInfo</c> when the link text contains "中标" or "结果", otherwise an <c>InviteInfo</c>.
/// </summary>
/// <param name="crawlAll">When false, crawling stops as soon as <c>MaxCount</c> records have been collected.</param>
/// <returns>The collected records; an empty list when the first listing page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            // The page count is the text between "共" and "页"; keep the default of 1 when it is not a number.
            int parsedPages;
            if (int.TryParse(pageNode.AsString().GetRegexBegEnd("共", "页"), out parsedPages))
            {
                pageInt = parsedPages;
            }
        }
        catch
        {
            // Best effort: a failure while reading the pager leaves pageInt at 1.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position6")), true), new TagNameFilter("li")));
        if (pageNode == null || pageNode.Count < 1)
        {
            continue;
        }

        // The first three <li> entries are skipped, as in the original loop.
        for (int j = 3; j < pageNode.Count; j++)
        {
            INode node = pageNode[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // An entry without a link has nothing to crawl.
                continue;
            }

            string psName = aTag.LinkText;
            if (psName.Contains("中标") || psName.Contains("结果"))
            {
                string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                       code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                       bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                       bidCtx = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                prjName = aTag.GetAttribute("title");
                InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;
                string htmldetail = string.Empty;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldetail));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                if (dtnode != null && dtnode.Count > 0)
                {
                    HtmlTxt = dtnode.AsHtml();
                    bidCtx = HtmlTxt.ToCtxString();
                    beginDate = bidCtx.GetDateRegex();
                    code = bidCtx.GetCodeRegex();
                    bidMoney = bidCtx.GetMoneyRegex();
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = bidCtx.GetMoneyRegex(null, true);
                    }
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = bidCtx.GetRegex("总额").GetMoney();
                    }
                    prjMgr = bidCtx.GetMgrRegex();
                    bidUnit = bidCtx.GetBidRegex();
                    // NOTE(review): bidDate is extracted but GenBidInfo below receives beginDate in both date positions - confirm intended.
                    bidDate = bidCtx.GetTimeRegex();
                    buildUnit = bidCtx.GetBuildRegex();

                    // Second pass over the detail markup: read the nested tables as "label:value" lines.
                    Parser tableParser = new Parser(new Lexer(HtmlTxt));
                    NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    if (tableNode != null && tableNode.Count > 0)
                    {
                        for (int t = 0; t < tableNode.Count; t++)
                        {
                            TableTag tag = tableNode[t] as TableTag;
                            if (tag == null)
                            {
                                continue;
                            }

                            string classStr = tag.GetAttribute("class");
                            if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table1"))
                            {
                                // The outer "table1" container was already processed as plain text above.
                                continue;
                            }

                            string ctx = string.Empty;
                            for (int r = 0; r < tag.RowCount; r++)
                            {
                                for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                {
                                    string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                    // Odd cells are treated as labels, even cells as values ending the line.
                                    if ((c + 1) % 2 == 0)
                                    {
                                        ctx += temp + "\r\n";
                                    }
                                    else
                                    {
                                        ctx += temp + ":";
                                    }
                                }
                            }

                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一候选人");
                            }
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                            {
                                bidMoney = ctx.GetMoneyRegex();
                            }
                            if (string.IsNullOrEmpty(prjMgr))
                            {
                                prjMgr = ctx.GetMgrRegex();
                            }
                            if (string.IsNullOrEmpty(prjMgr))
                            {
                                prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                            }

                            if (!bidUnit.Contains("公司"))
                            {
                                // Alternative layout: row 4 holds the labels and row 5 the matching values.
                                ctx = string.Empty;
                                try
                                {
                                    for (int r = 1; r < tag.Rows[4].ColumnCount; r++)
                                    {
                                        string temp = tag.Rows[4].Columns[r].ToNodePlainString().GetReplace(":,:");
                                        ctx += temp + ":";
                                        ctx += tag.Rows[5].Columns[r].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                    }
                                    if (string.IsNullOrEmpty(bidUnit))
                                    {
                                        bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                    }
                                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                    {
                                        bidMoney = ctx.GetMoneyRegex();
                                    }
                                    if (string.IsNullOrEmpty(prjMgr))
                                    {
                                        prjMgr = ctx.GetMgrRegex();
                                    }
                                    if (string.IsNullOrEmpty(prjMgr))
                                    {
                                        prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                    }
                                }
                                catch
                                {
                                    // Tables with fewer rows or columns simply do not use this layout.
                                }
                            }
                        }
                    }

                    msgType = "肇庆工程交易中心";
                    specType = bidType = "建设工程";
                    BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            else
            {
                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                       inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                       endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                       otherType = string.Empty, HtmlTxt = string.Empty;

                prjName = aTag.GetAttribute("title");
                InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;
                string htmldtl = string.Empty;
                try
                {
                    htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                }
                catch (Exception)
                {
                    continue;
                }

                Parser parserdetail = new Parser(new Lexer(htmldtl));
                NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                if (dtnode != null && dtnode.Count > 0)
                {
                    HtmlTxt = dtnode.AsHtml();
                    inviteCtx = HtmlTxt.ToCtxString();
                    buildUnit = inviteCtx.GetBidUnitDel().GetBuildRegex();
                    beginDate = inviteCtx.GetDateRegex();
                    prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                    code = inviteCtx.GetReplace(" ").GetCodeRegex().GetCodeDel();
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);

                    // Parse the detail markup itself. The listing-page parser was already consumed
                    // by an earlier extraction, so reusing it here would never see these tables.
                    Parser tableParser = new Parser(new Lexer(HtmlTxt));
                    NodeList tableNode = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    TableTag tag = (tableNode != null && tableNode.Count > 0) ? tableNode[0] as TableTag : null;
                    if (tag != null)
                    {
                        string ctx = string.Empty;
                        for (int r = 0; r < tag.RowCount; r++)
                        {
                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                            {
                                string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                if ((c + 1) % 2 == 0)
                                {
                                    ctx += temp + "\r\n";
                                }
                                else
                                {
                                    ctx += temp + ":";
                                }
                            }
                        }

                        if (string.IsNullOrEmpty(code))
                        {
                            code = ctx.GetCodeRegex();
                        }
                        if (string.IsNullOrEmpty(buildUnit))
                        {
                            buildUnit = ctx.GetBuildRegex();
                        }
                        if (string.IsNullOrEmpty(prjAddress))
                        {
                            prjAddress = ctx.GetAddressRegex();
                        }
                        if (string.IsNullOrEmpty(prjAddress))
                        {
                            prjAddress = "见招标信息";
                        }
                    }

                    msgType = "肇庆工程交易中心";
                    specType = "建设工程";
                    inviteType = ToolHtml.GetInviteTypes(prjName);
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
        }
    }

    // Return what was collected; returning null here would drop every record of a full crawl.
    return list;
}
/// <summary>
/// Walks the paged announcement list (first page from <c>SiteUrl</c>, later pages from the
/// fixed gy-center.net list URL) and turns each list item's detail page into an <c>InviteInfo</c>.
/// </summary>
/// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new ArrayList();
    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return results;
    }

    // Read the total page count from the "yema" pager block ("/N页").
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        string pagerText = pagerNodes.AsString();
        try
        {
            string pageToken = Regex.Match(pagerText, @"/[^页]+页").Value;
            totalPages = Convert.ToInt32(pageToken.Replace("页", "").Replace("/", ""));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            string pageUrl = "http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=76&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + pageNo.ToString();
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(pageUrl, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        // Select <li> elements under a <ul> under <div id="tab01">.
        AndFilter tabDivFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"));
        AndFilter listFilter = new AndFilter(new HasParentFilter(tabDivFilter), new TagNameFilter("ul"));
        AndFilter itemFilter = new AndFilter(new HasParentFilter(listFilter), new TagNameFilter("li"));

        parser = new Parser(new Lexer(listHtml));
        NodeList items = parser.ExtractAllNodesThatMatch(itemFilter);
        if (items == null || items.Count < 1)
        {
            continue;
        }

        // The last <li> of each page is not processed.
        int stopAt = items.Count - 1;
        for (int idx = 0; idx < stopAt; idx++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            string itemText = items[idx].ToPlainTextString();
            string itemMarkup = items[idx].ToHtml();
            string prjName = ToolHtml.GetHtmlAtagValue("title", itemMarkup);
            string beginDate = ToolHtml.GetRegexDateTime(itemText);
            string InfoUrl = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", itemMarkup);

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                detailHtml = Regex.Replace(detailHtml, "(<script)[\\s\\S]*?(</script>)", "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
            if (contentNodes == null || contentNodes.Count < 1)
            {
                continue;
            }

            string HtmlTxt = contentNodes.ToHtml();

            // Strip tags, then drop spaces/tabs and normalise line breaks.
            string inviteCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "");
            inviteCtx = inviteCtx.Replace(" ", "").Replace(" ", "").Replace("\t\t", "");
            inviteCtx = inviteCtx.Replace("\n", "\r\n");
            inviteCtx = inviteCtx.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string inviteType = ToolHtml.GetInviteTypes(prjName);
            string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
            string buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
            string code = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);

            // Cap the extracted fields at 150 / 150 / 50 via ToolHtml.GetSubString.
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            code = ToolHtml.GetSubString(code, 50);

            // Placeholders for fields that could not be extracted.
            code = string.IsNullOrEmpty(code) ? "见招标信息" : code;
            prjAddress = string.IsNullOrEmpty(prjAddress) ? "见招标信息" : prjAddress;
            string specType = "其他";
            string msgType = "工网在线";
            buildUnit = string.IsNullOrEmpty(buildUnit) ? "工网在线" : buildUnit;

            InviteInfo record = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            results.Add(record);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the conghua.gov.cn tender list (first page from <c>SiteUrl</c>, later pages from
/// <c>list_N.shtml</c>). Rows whose title contains "中标", "结果" or "候选单位公示" become
/// <c>BidInfo</c> records; all other rows become <c>InviteInfo</c> records.
/// </summary>
/// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    // Determine the page count.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("div")), new HasAttributeFilter("id", "page_div")));
    if (sNode != null && sNode.Count > 0)
    {
        string page = ToolHtml.GetRegexString(sNode.AsString(), "共", "页");
        // When the pager text is not a number the original fallback of 7 pages is kept.
        if (!int.TryParse(page, out pageInt))
        {
            pageInt = 7;
        }
    }
    parser.Reset();

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Decode follow-up pages with the same encoding as page 1 and the detail pages;
                // the original used Encoding.Default here only.
                html = this.ToolWebSite.GetHtmlByUrl("http://www.conghua.gov.cn/zgch/zbzb/list_" + i.ToString() + ".shtml", Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_list"))), new TagNameFilter("table")));
        if (sNode == null || sNode.Count < 1)
        {
            continue;
        }

        TableTag table = sNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            string rowHtml = tr.ToHtml();
            string projectName = ToolHtml.GetHtmlAtagValue("title", rowHtml);

            if (!projectName.Contains("中标") && !projectName.Contains("结果") && !projectName.Contains("候选单位公示"))
            {
                // ---- Tender invitation ----
                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                       inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                       endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                       otherType = string.Empty, HtmlTxt = string.Empty;

                prjName = projectName;
                inviteType = ToolHtml.GetInviteTypes(projectName);
                beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                InfoUrl = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", rowHtml).Replace("..", "");

                string htmlDtl = string.Empty;
                try
                {
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                    htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(htmlDtl));
                NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                if (dtlList != null && dtlList.Count > 0)
                {
                    HtmlTxt = dtlList.ToHtml();
                    inviteCtx = dtlList.AsString().Replace(" ", "");
                    buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex, true);
                    if (!string.IsNullOrEmpty(buildUnit) && buildUnit.Contains(" "))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                    }
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    msgType = "广州建设工程交易中心";
                    specType = "建设工程";
                    inviteType = inviteType == "" ? "小型工程" : inviteType;
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "广州建设工程交易中心";
                    }

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            else
            {
                // ---- Bid result ----
                string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                       code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                       specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                       prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                prjName = projectName;
                bidType = ToolHtml.GetInviteTypes(projectName);
                beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                InfoUrl = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", rowHtml).Replace("..", "");

                string htmlDtl = string.Empty;
                try
                {
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                    htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                }
                catch
                {
                    continue;
                }

                parser = new Parser(new Lexer(htmlDtl));
                NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                if (dtlList != null && dtlList.Count > 0)
                {
                    HtmlTxt = dtlList.ToHtml();
                    bidCtx = dtlList.AsString();
                    buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    msgType = "广州建设工程交易中心";
                    specType = "建设工程";
                    // Default only when no type was detected (the original ternary had its arms swapped,
                    // which kept the empty value and overwrote every detected one).
                    bidType = bidType == "" ? "小型工程" : bidType;

                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    if (bidNode != null && bidNode.Count > 0)
                    {
                        // Flatten the first table into "column0:column1" lines for the regex helpers.
                        string ctx = string.Empty;
                        TableTag bidTable = bidNode[0] as TableTag;
                        try
                        {
                            for (int r = 0; r < bidTable.RowCount; r++)
                            {
                                ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                                ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                            }
                        }
                        catch
                        {
                            // Best effort: a row with fewer than two cells ends the flattening; use what was gathered.
                        }

                        bidUnit = ctx.GetRegex("单位名称,承包意向人名称");
                        bidMoney = ctx.GetMoneyRegex();
                        prjMgr = ctx.GetMgrRegex();
                        if (prjMgr.Contains("/"))
                        {
                            prjMgr = prjMgr.Remove(prjMgr.IndexOf("/"));
                        }
                    }

                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "广州建设工程交易中心";
                    }

                    BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the lhbsc.szlhxq.gov.cn result list (first page from <c>SiteUrl</c>, later pages from
/// <c>065b33d5-N.html</c>), builds a <c>BidInfo</c> for each row whose detail page has a
/// <c>contentbox</c> div, and registers attachment links found in that detail markup in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> records are collected.</param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // Strip the createPageHTML(...) call syntax, then take the text between "/" and "跳".
            string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // Patterns are loop-invariant, so build them once.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgg/065b33d5-" + i.ToString() + ".html";
                html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
        if (viewList == null || viewList.Count < 1)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty,
                   specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty,
                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

            INode node = viewList[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A row without a link has no detail page to crawl.
                continue;
            }

            beginDate = regDate.Match(node.ToPlainTextString().Trim()).Value;
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://lhbsc.szlhxq.gov.cn" + aTag.Link.Replace("../", "").Replace("./", "");

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count < 1)
            {
                continue;
            }

            // Regex.Replace/string.Replace return new strings; the results must be assigned,
            // otherwise HtmlTxt and bidCtx stay empty and nothing below can be extracted.
            HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            bidCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("¥", "");

            bidType = ToolHtml.GetInviteTypes(prjName);
            buildUnit = ToolHtml.GetRegexString(bidCtx, "按(建设单位)", "(提供)");
            bidMoney = ToolHtml.GetRegexString(bidCtx, "(中标金额)", "(元)|(万元)|(;)").GetReplace(":", "").GetMoney("万元");

            bidUnit = bidCtx.GetBidRegex();
            if (string.IsNullOrWhiteSpace(bidUnit))
            {
                bidUnit = bidCtx.GetRegex("中标供应商名称");
            }
            if (bidUnit.Contains("公司"))
            {
                // Cut everything after the first "公司".
                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
            }

            // Extract the manager first, then trim at "资格"; the original trimmed the still-empty value.
            prjMgr = bidCtx.GetMgrRegex();
            if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("资格"))
            {
                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
            }

            if (string.IsNullOrWhiteSpace(bidMoney))
            {
                bidMoney = bidCtx.GetRegex("中标金额").GetReplace(":", "");
            }
            bidUnit = ToolHtml.GetStringTemp(bidUnit);
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = bidCtx.GetRegex("采购人名称");
            }
            bidUnit = ToolHtml.GetSubString(bidUnit, 150);

            code = bidCtx.GetCodeRegex().GetReplace(")", "");
            if (string.IsNullOrWhiteSpace(code))
            {
                code = bidCtx.GetRegexBegEnd("招标编号:", ")");
            }

            // Amounts above 100000 are divided by 10000; non-numeric text is left untouched.
            decimal amount;
            if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
            {
                bidMoney = (amount / 10000).ToString();
            }

            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区龙华街道办事处";
            }
            msgType = "深圳市龙华新区龙华街道办事处";
            specType = "建设工程";
            bidType = "小型工程";
            prjName = ToolDb.GetPrjName(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            // Register attachment links found in the detail markup.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a != null && a.IsAtagAttach())
                    {
                        // Only links containing "http" are stored; others are recorded with an empty URL.
                        string link = string.Empty;
                        if (a.Link.ToLower().Contains("http"))
                        {
                            link = a.Link;
                        }
                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                        base.AttachList.Add(attach);
                    }
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}