/// <summary>
/// Parses one listing page of tender notices and appends an <c>InviteInfo</c>
/// record for each usable listing row to <paramref name="list"/>.
/// </summary>
/// <remarks>
/// Rows are taken from the first <c>table</c> whose class is <c>lefttable</c>;
/// the first and the last row are skipped. For each row the linked detail page
/// is fetched, fields are pulled out with regular expressions, and links whose
/// URL contains the <c>loadNewsFile</c> endpoint are added to
/// <c>base.AttachList</c>. Rows without a link, or whose detail page cannot be
/// fetched, are skipped.
/// </remarks>
/// <param name="list">Collection that receives the generated records.</param>
/// <param name="html">HTML of the listing page.</param>
/// <param name="crawlAll">
/// When <c>false</c>, processing stops as soon as <paramref name="list"/>
/// contains at least <c>MaxCount</c> items.
/// </param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser listParser = new Parser(new Lexer(html));
    NodeList listTables = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (listTables == null || listTables.Count == 0)
    {
        return;
    }

    TableTag table = listTables[0] as TableTag;
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty;
        string buildUnit = string.Empty;
        string prjAddress = string.Empty;
        string beginDate = string.Empty;
        string ctx = string.Empty;
        string HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without any anchor cannot be followed; skip it instead of
        // failing on the missing first element.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
        if (aTag == null)
        {
            continue;
        }

        string InfoUrl = aTag.Link;
        string prjName = tr.Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        string endDate = tr.Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        // Strip inline scripts and XML processing instructions before parsing.
        htmlDtl = Regex.Replace(htmlDtl, @"<script[^<]*</script>|<\?xml[^/]*/>", "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser buttonParser = new Parser(new Lexer(htmlDtl));
            NodeList buttonDivs = buttonParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

            HtmlTxt = ctxNode.AsHtml();
            string buttonHtml = buttonDivs == null ? string.Empty : buttonDivs.AsHtml();
            // string.Replace rejects an empty search string, so only strip the
            // button block when the page actually contains one.
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;

            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            Match timeMatch = Regex.Match(startTime, @"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
            beginDate = timeMatch.Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string detailText = tabTag.ToPlainTextString();

            // For each labelled field: take everything after the first ':' of the
            // matched line. With no match Value is "", so the result is "".
            string codeLine = Regex.Match(detailText, "(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}").Value;
            code = codeLine.Substring(codeLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string unitLine = Regex.Match(detailText, "(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}").Value;
            buildUnit = unitLine.Substring(unitLine.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

            string addressLine = Regex.Match(detailText, "(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}").Value;
            prjAddress = addressLine.Substring(addressLine.IndexOf(":") + 1).Trim();

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            ctx = Regex.Replace(ctx, "<!--[^<]+-->", "");

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            if (Encoding.Default.GetByteCount(buildUnit) > 150)
            {
                // The limit is checked in bytes but the cut is made in characters;
                // clamp the length so a short multi-byte string cannot overrun.
                buildUnit = buildUnit.Substring(0, Math.Min(150, buildUnit.Length));
            }
            if (Encoding.Default.GetByteCount(prjAddress) > 200)
            {
                prjAddress = "见招标公告内容";
            }

            if (beginDate.Length > 0 && endDate.Length > 0)
            {
                // A failed parse leaves DateTime.MinValue, which reproduces the
                // previous "default value on exception" behaviour.
                DateTime begin;
                DateTime end;
                DateTime.TryParse(beginDate, out begin);
                DateTime.TryParse(endDate, out end);
                if (begin > end)
                {
                    endDate = string.Empty;
                }
            }
        }

        // The stored begin date always comes from the "toptd_bai" cell; the value
        // read from the detail table above is only used to validate endDate.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
        beginDate = Regex.Match(ctxNode.AsString(), @"\d{4}-\d{1,2}-\d{1,2}").Value.Trim();

        string inviteType = ToolHtml.GetInviteTypes(prjName);
        InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
        list.Add(info);

        // Rewind before re-reading the page, as is done for the previous
        // extraction; otherwise the parser continues from where it stopped.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTag = aTagNodes[a] as ATag;
            if (fileTag.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                BaseAttach attach = ToolDb.GenBaseAttach(fileTag.ToPlainTextString(), info.Id, fileTag.Link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Walks the paginated bid-result listing at <c>SiteUrl</c>, opens every entry's
/// detail page and builds one <c>BidInfo</c> per entry whose detail page
/// contains the <c>td#div_view</c> container.
/// </summary>
/// <remarks>
/// The winning bidder and amount are first looked up in the plain text of the
/// detail page; when no bidder is found there, the tables inside the detail
/// container are read instead. Attachment links found in the detail container
/// are added to <c>base.AttachList</c>.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as the result list contains at
/// least <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated records; empty when the first listing page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Page count is read from the pagination block; any failure means "one page".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[0].ToNodePlainString();
            pageInt = Convert.ToInt32(temp.GetRegexBegEnd("/", "页"));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageNo=" + i, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list")), true), new TagNameFilter("ul")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string prjMgr = string.Empty;

            // Entries without a link cannot be followed; skip them.
            ATag aTag = viewList[j].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string InfoUrl = "http://www.hzzk.cn" + aTag.Link.GetReplace("../");
            string beginDate = viewList[j].ToPlainTextString().GetDateRegex();

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_view")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();
            string bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
            string bidType = prjName.GetInviteBidType();
            string code = bidCtx.GetCodeRegex().GetCodeDel();
            string buildUnit = bidCtx.GetBuildRegex();

            // Cut the owner name before an agency label, and right after the
            // first "公司".
            if (buildUnit.Contains("招标代理"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
            }
            if (buildUnit.Contains("公司"))
            {
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
            }

            string bidUnit = bidCtx.GetBidRegex();
            string bidMoney = bidCtx.GetMoneyRegex();

            if (string.IsNullOrEmpty(bidUnit))
            {
                parser = new Parser(new Lexer(HtmlTxt));
                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                if (tableNode != null && tableNode.Count > 0)
                {
                    // With exactly two tables the second one is used, otherwise the first.
                    TableTag table = (tableNode.Count == 2 ? tableNode[1] : tableNode[0]) as TableTag;
                    string tableText = table.ToPlainTextString();

                    if (tableText.Contains("投标人") || tableText.Contains("投标企业"))
                    {
                        // Column-oriented table: pair the header row with data row 1;
                        // if that yields a bidder shorter than 5 characters, retry
                        // with data row 2.
                        for (int dataRow = 1; dataRow <= 2; dataRow++)
                        {
                            if (dataRow == 2 && bidUnit.Length >= 5)
                            {
                                break;
                            }
                            if (table.RowCount <= dataRow)
                            {
                                continue;
                            }

                            string columnText = string.Empty;
                            for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                            {
                                try
                                {
                                    columnText += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                    columnText += table.Rows[dataRow].Columns[r].ToNodePlainString() + "\r\n";
                                }
                                catch
                                {
                                    // Data row shorter than the header row: ignore the cell.
                                    continue;
                                }
                            }

                            bidUnit = columnText.GetBidRegex();
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = columnText.GetRegex("投标人名称,投标企业名称,投标企业,投标人");
                            }
                            if (bidUnit.Contains("单位名称"))
                            {
                                bidUnit = columnText.GetRegex("第一中标候选人");
                            }
                            bidMoney = columnText.GetMoneyRegex();
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                            {
                                bidMoney = columnText.GetMoneyRegex(new string[] { "报价" });
                            }
                        }
                    }
                    else
                    {
                        // Row-oriented table: first cell is the label, second the value.
                        string rowText = string.Empty;
                        for (int r = 0; r < table.RowCount; r++)
                        {
                            try
                            {
                                rowText += table.Rows[r].Columns[0].ToNodePlainString() + ":";
                                rowText += table.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                            }
                            catch
                            {
                                continue;
                            }
                        }

                        bidUnit = rowText.GetBidRegex();
                        if (string.IsNullOrEmpty(bidUnit))
                        {
                            bidUnit = rowText.GetRegex("单位名称");
                        }
                        bidMoney = rowText.GetMoneyRegex();
                        prjMgr = rowText.GetMgrRegex();
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = rowText.GetRegex("项目负责人姓名及资质证书编号");
                        }
                        // NOTE(review): the comma check appears twice; the two literals
                        // may originally have differed (e.g. full-width vs ASCII comma).
                        // Both are retained unchanged - TODO confirm against the real file.
                        if (prjMgr.Contains(","))
                        {
                            prjMgr = prjMgr.Remove(prjMgr.IndexOf(","));
                        }
                        if (prjMgr.Contains(","))
                        {
                            prjMgr = prjMgr.Remove(prjMgr.IndexOf(","));
                        }
                    }
                }
                else
                {
                    // No table at all: fall back to the running text.
                    bidUnit = bidCtx.GetRegexBegEnd("中标人", "公司").GetReplace(":,:") + "公司";
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = bidCtx.GetRegexBegEnd("中标价", "元").GetReplace(":,:").GetMoney();
                    }
                }
            }

            // Amounts below 1 are zeroed; amounts above 100000 are divided by 10000.
            // An unparsable amount is left untouched.
            decimal money;
            if (decimal.TryParse(bidMoney, out money))
            {
                if (money < 1)
                {
                    bidMoney = "0";
                }
                else if (money > 100000)
                {
                    bidMoney = (money / 10000).ToString();
                }
            }

            // Names of five characters or fewer (including a bare "公司") are discarded.
            if (bidUnit.Length <= 5)
            {
                bidUnit = "";
            }

            BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "高新区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, "惠州仲恺高新技术产业开发区公共资源交易中心", bidType, "政府采购", string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k].GetATag();
                    if (!a.IsAtagAttach())
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root.
                    string link = a.Link.ToLower().Contains("http") ? a.Link : "http://www.hzzk.cn/" + a.Link;
                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paginated bid-result listing starting at <c>SiteUrl</c> and builds
/// one <c>BidInfo</c> for every listing entry whose detail page could be fetched.
/// </summary>
/// <remarks>
/// Detail fields are read from the <c>div#contentbox</c> container when it is
/// present. A record is still created when the container is missing; in that
/// case only the listing data (name, date, URL) and the fixed defaults are set.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as the result list contains at
/// least <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated records; empty when the first listing page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<BidInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Page count is read from the pager cell; any failure means "one page".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs93/15159-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            TableTag table = viewList[j] as TableTag;

            string buildUnit = string.Empty;
            string bidUnit = string.Empty;
            string bidMoney = string.Empty;
            string code = string.Empty;
            string bidCtx = string.Empty;
            string prjMgr = string.Empty;
            string HtmlTxt = string.Empty;

            string beginDate = table.ToPlainTextString().GetDateRegex();

            // Entries without a link cannot be followed; skip them.
            ATag aTag = table.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl != null && dtl.Count > 0)
            {
                HtmlTxt = dtl.AsHtml();
                bidCtx = HtmlTxt.GetReplace("</p>,</br>", "\r\n").ToCtxString();
                code = bidCtx.GetCodeRegex().GetCodeDel();
                buildUnit = bidCtx.GetBuildRegex();
                bidUnit = bidCtx.GetBidRegex();
                bidMoney = bidCtx.GetMoneyRegex(null, false, "万元");
                prjMgr = bidCtx.GetMgrRegex();

                if (string.IsNullOrEmpty(bidUnit))
                {
                    // No bidder in the running text: read the tables instead,
                    // preferring the second table when there is more than one.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    TableTag bidTable = null;
                    if (bidNode != null && bidNode.Count > 1)
                    {
                        bidTable = bidNode[1] as TableTag;
                    }
                    else if (bidNode != null && bidNode.Count > 0)
                    {
                        bidTable = bidNode[0] as TableTag;
                    }

                    if (bidTable != null)
                    {
                        // First pass: treat cells as alternating label / value pairs
                        // (even index -> "label:", odd index -> "value" + line break).
                        string pairText = string.Empty;
                        for (int r = 0; r < bidTable.RowCount; r++)
                        {
                            for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                            {
                                string cell = bidTable.Rows[r].Columns[c].ToNodePlainString();
                                pairText += cell + (c % 2 == 1 ? "\r\n" : ":");
                            }
                        }

                        bidUnit = pairText.GetBidRegex();
                        if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                        {
                            bidMoney = pairText.GetMoneyString().GetMoney("万元");
                        }
                        if (string.IsNullOrEmpty(buildUnit))
                        {
                            buildUnit = pairText.GetBuildRegex();
                        }
                        if (string.IsNullOrEmpty(code))
                        {
                            code = pairText.GetCodeRegex().GetCodeDel();
                        }
                        if (bidUnit.Contains("推荐") || bidUnit.Contains("中标") || bidUnit.Contains("地址"))
                        {
                            bidUnit = string.Empty;
                        }
                        if (string.IsNullOrWhiteSpace(prjMgr))
                        {
                            prjMgr = pairText.GetMgrRegex();
                        }

                        // Second pass: treat row 0 as headers and row 1 as values.
                        if (string.IsNullOrEmpty(bidUnit) && bidTable.RowCount > 1)
                        {
                            string headerText = string.Empty;
                            for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                            {
                                headerText += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                                try
                                {
                                    headerText += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                                }
                                catch
                                {
                                    // Value row shorter than the header row: leave the
                                    // label without a value.
                                }
                            }

                            bidUnit = headerText.GetBidRegex();
                            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                            {
                                bidMoney = headerText.GetMoneyString().GetMoney();
                            }
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = headerText.GetBuildRegex();
                            }
                            if (string.IsNullOrEmpty(code))
                            {
                                code = headerText.GetCodeRegex().GetCodeDel();
                            }
                        }
                    }
                }
            }

            // Amounts above 1000000 are divided by 10000; an unparsable amount is
            // left untouched.
            decimal money;
            if (decimal.TryParse(bidMoney, out money) && money > 1000000)
            {
                bidMoney = (money / 10000).ToString();
            }

            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区观澜街道办事处";
            }

            prjName = ToolDb.GetPrjName(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, bidCtx, string.Empty, "深圳市龙华新区观澜街道办事处", "小型工程", "建设工程", string.Empty, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses one listing page of bid-result notices and appends a <c>BidInfo</c>
/// record for each usable listing row to <paramref name="list"/>.
/// </summary>
/// <remarks>
/// Rows are taken from the first <c>table</c> whose class is <c>lefttable</c>;
/// the first and the last row are skipped. For each row the linked detail page
/// is fetched, fields are pulled out with regular expressions, and links whose
/// URL contains the <c>loadNewsFile</c> endpoint are added to
/// <c>base.AttachList</c>. Rows without a link, or whose detail page cannot be
/// fetched, are skipped.
/// </remarks>
/// <param name="list">Collection that receives the generated records.</param>
/// <param name="html">HTML of the listing page.</param>
/// <param name="crawlAll">
/// When <c>false</c>, processing stops as soon as <paramref name="list"/>
/// contains at least <c>MaxCount</c> items.
/// </param>
public void DealHtml(IList list, string html, bool crawlAll)
{
    Parser listParser = new Parser(new Lexer(html));
    NodeList listTables = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
    if (listTables == null || listTables.Count == 0)
    {
        return;
    }

    TableTag table = listTables[0] as TableTag;
    for (int t = 1; t < table.RowCount - 1; t++)
    {
        string code = string.Empty;
        string winnerName = string.Empty;    // from the "中标人 / 中标单位" line
        string tendererName = string.Empty;  // from the "招标人 / 建设单位 / 第一中标候选人" line
        string bidMoney = string.Empty;
        string ctx = string.Empty;
        string HtmlTxt = string.Empty;

        TableRow tr = table.Rows[t] as TableRow;

        // A row without any anchor cannot be followed; skip it instead of
        // failing on the missing first element.
        NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
        ATag aTag = (rowLinks != null && rowLinks.Count > 0) ? rowLinks[0] as ATag : null;
        if (aTag == null)
        {
            continue;
        }

        string InfoUrl = aTag.Link;
        string prjName = tr.Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
        string endDate = tr.Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();

        string htmlDtl;
        try
        {
            htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
        }
        catch (Exception)
        {
            continue;
        }

        // Strip inline scripts and XML processing instructions before parsing.
        htmlDtl = Regex.Replace(htmlDtl, @"<script[^<]*</script>|<\?xml[^/]*/>", "");

        Parser parserCtx = new Parser(new Lexer(htmlDtl));
        NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
        if (ctxNode != null && ctxNode.Count > 0)
        {
            Parser buttonParser = new Parser(new Lexer(htmlDtl));
            NodeList buttonDivs = buttonParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

            HtmlTxt = ctxNode.AsHtml();
            string buttonHtml = buttonDivs == null ? string.Empty : buttonDivs.AsHtml();
            // string.Replace rejects an empty search string, so only strip the
            // button block when the page actually contains one.
            if (!string.IsNullOrEmpty(buttonHtml))
            {
                HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
            }
            HtmlTxt = HtmlTxt.Trim();

            TableTag tabTag = ctxNode[0] as TableTag;
            string detailText = tabTag.ToPlainTextString();

            Match codeMatch = Regex.Match(detailText, "(工程编号|项目编号):[^\r\n]+[\r\n]{1}");
            if (codeMatch.Success)
            {
                code = codeMatch.Value.Substring(codeMatch.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            }

            Match winnerMatch = Regex.Match(detailText, "(中标人|中标单位):[^\r\n]+[\r\n]{1}");
            if (winnerMatch.Success)
            {
                winnerName = winnerMatch.Value.Substring(winnerMatch.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
            }

            Match tendererMatch = Regex.Match(detailText, "(招标人|建设单位|第一中标候选人):[^\r\n]+[\r\n]{1}");
            if (tendererMatch.Success)
            {
                tendererName = tendererMatch.Value.Replace("第一中标候选人:", "").Replace("招标人:", "").Replace("建设单位:", "").Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim();
                // Drop anything from a further ':' on (a second label on the same line).
                if (tendererName.Contains(":"))
                {
                    tendererName = tendererName.Remove(tendererName.IndexOf(":")).Trim();
                }
            }

            Match moneyMatch = Regex.Match(detailText, "(中标价|其中标价为|中标价格):[^\r\n]+[\r\n]{1}");
            if (moneyMatch.Success)
            {
                bidMoney = moneyMatch.Value.Replace("中标价:", "").Replace("其中标价为:", "").Replace("中标价格:", "").Replace("\r", "");
            }

            // NOTE(review): this pattern needs at least two digits, so a single-digit
            // amount yields an empty match. Left unchanged - confirm whether intended.
            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            if (bidMoney.Contains("万"))
            {
                // Amount is expressed with "万": keep the number in front of it as-is.
                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                bidMoney = regBidMoney.Match(bidMoney).Value;
            }
            else
            {
                // Otherwise divide by 10000; results below 0.1 and unparsable
                // values become "0".
                decimal amount;
                if (decimal.TryParse(regBidMoney.Match(bidMoney).Value, out amount))
                {
                    decimal scaled = amount / 10000;
                    bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                }
                else
                {
                    bidMoney = "0";
                }
            }

            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            // The limits are checked in bytes but the cut is made in characters;
            // clamp the length so a short multi-byte string cannot overrun.
            if (Encoding.Default.GetByteCount(winnerName) > 150)
            {
                winnerName = winnerName.Substring(0, Math.Min(150, winnerName.Length));
            }
            if (Encoding.Default.GetByteCount(tendererName) > 150)
            {
                tendererName = tendererName.Substring(0, Math.Min(150, tendererName.Length));
            }

            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            ctx = Regex.Replace(ctx, "<!--[^<]+-->", "");
        }

        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));

        // Begin date: prefer the date following "公示开始时间" in the notice text,
        // otherwise fall back to the first date in the "toptd_bai" cell.
        string headerDate = Regex.Match(ctxNode.AsString(), @"\d{4}-\d{1,2}-\d{1,2}").Value.Trim();
        string beginDate = headerDate;
        if (ctx.Contains("公示开始时间"))
        {
            string tail = ctx.Substring(ctx.IndexOf("公示开始时间"));
            beginDate = Regex.Match(tail, @"\d{4}年\d{1,2}月\d{1,2}日").Value.Trim();
        }
        if (beginDate == "")
        {
            beginDate = headerDate;
        }

        prjName = ToolDb.GetPrjName(prjName);
        string bidType = ToolHtml.GetInviteTypes(prjName);
        BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, tendererName, beginDate, winnerName, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);
        list.Add(info);

        // Rewind before re-reading the page, as is done for the previous
        // extraction; otherwise the parser continues from where it stopped.
        parserCtx.Reset();
        ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
        for (int a = 0; a < aTagNodes.Count; a++)
        {
            ATag fileTag = aTagNodes[a] as ATag;
            if (fileTag.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
            {
                BaseAttach attach = ToolDb.GenBaseAttach(fileTag.ToPlainTextString(), info.Id, fileTag.Link);
                base.AttachList.Add(attach);
            }
        }

        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return;
        }
    }
}
/// <summary>
/// Walks the paginated tender-notice listing starting at <c>SiteUrl</c> and
/// builds one <c>InviteInfo</c> for every entry whose detail page contains the
/// <c>div#contentbox</c> container.
/// </summary>
/// <remarks>
/// Address, tendering unit and project code are extracted from the notice text
/// with regular expressions; the address falls back to a fixed placeholder and
/// the tendering unit to a fixed default when nothing usable is found.
/// </remarks>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns as soon as the result list contains at
/// least <c>MaxCount</c> items.
/// </param>
/// <returns>
/// The generated records; empty when the first listing page cannot be fetched.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return list;
    }

    // Page count is read from the pager cell; any failure means "one page".
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs/15158-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            TableTag table = viewList[j] as TableTag;
            string beginDate = table.ToPlainTextString().GetDateRegex();

            // Entries without a link cannot be followed; skip them.
            ATag aTag = table.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string prjName = aTag.GetAttribute("title");
            string InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;

            string htlDtl;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            string HtmlTxt = dtl.AsHtml();

            // Remove remaining tags and spaces, then normalise and collapse line breaks.
            // NOTE(review): several literals below look duplicated (two space
            // replacements, "(:|:)", repeated ":" / ")" removals). They may
            // originally have differed in encoding (e.g. full-width forms), so
            // every one is retained unchanged - TODO confirm against the real file.
            string inviteCtx = HtmlTxt.ToCtxString();
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "")
                .Replace(" ", "")
                .Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n");

            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
            string prjAddress = regPrjAddr.Match(inviteCtx).Value
                .Replace("工程位置", "")
                .Replace("工程地点", "")
                .Replace("工程地址", "")
                .Replace("详细地址", "")
                .Replace("地点", "")
                .Replace("地址", "")
                .Replace(":", "")
                .Replace(":", "")
                .Trim();

            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            string buildUnit = regBuildUnit.Match(inviteCtx).Value
                .Replace("招标代理机构", "")
                .Replace("招标单位", "")
                .Replace("招标人", "")
                .Replace("(盖章)", "")
                .Replace(":", "")
                .Replace(":", "")
                .Trim();

            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
            string code = regPrjCode.Match(inviteCtx).Value
                .Replace("工程编号", "")
                .Replace("项目编号", "")
                .Replace("编号", "")
                .Replace(":", "")
                .Replace(":", "")
                .Replace(")", "")
                .Replace(")", "")
                .Trim();

            // Missing or over-long addresses are replaced by a placeholder.
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }

            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区观澜街道办事处";
            }

            // The notice type is fixed for this source.
            string inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, string.Empty, inviteCtx, string.Empty, "深圳市龙华新区观澜街道办事处", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}