/// <summary>
/// Crawls the paged notice list that starts at <c>SiteUrl</c> and builds one
/// <c>InviteInfo</c> for every list entry whose detail page can be downloaded and parsed.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds <c>MaxCount</c> entries;
/// when true, every detected list page is processed.
/// </param>
/// <returns>The collected items; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string listBaseUrl = "http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgs/";
    const string office = "深圳市宝安区新安街道办事处";

    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_page")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // Strip the createPageHTML(...) call syntax; what is left is parsed as the page count.
            string pageText = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(",0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(pageText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns are constant, so the Regex objects are built once instead of once per row.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章)|采购人)(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page 1 is SiteUrl itself; page i (i > 1) is index_{i-1}.html.
                html = this.ToolWebSite.GetHtmlByUrl(listBaseUrl + "index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_list"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string rowText = viewList[j].ToPlainTextString().Trim();
            beginDate = regDate.Match(rowText).Value;
            // string.Replace throws ArgumentException for an empty search string, which
            // previously aborted the whole crawl on any row without a date.
            string fragment = beginDate.Length > 0 ? rowText.Replace(beginDate, "") : rowText;

            try
            {
                // The row text is cut from the "else" marker onwards, then narrowed to the
                // first <a ...>...</a> fragment, from which title and link are read.
                int beg = fragment.IndexOf("else"), end = fragment.Length;
                fragment = fragment.Substring(beg, end - beg);
                beg = fragment.IndexOf("<a");
                end = fragment.IndexOf("/a>");
                fragment = fragment.Substring(beg, (end - beg) + 3);
                beg = fragment.IndexOf(">");
                end = fragment.IndexOf("</");
                prjName = fragment.Substring(beg + 1, end - beg - 1);

                Parser p = new Parser(new Lexer(fragment));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = listBaseUrl + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Any failure to locate the anchor means this row is skipped.
                continue;
            }

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "TRS_PreAppend")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            // NOTE(review): this textual "th" -> "td" replacement touches every "th" in the
            // markup, not only tag names; it is kept as-is and only feeds the table parse.
            parser = new Parser(new Lexer(HtmlTxt.Replace("th", "td")));
            NodeList dtlTab = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "texttable")));
            TableTag table = (dtlTab != null && dtlTab.Count > 0) ? dtlTab[0] as TableTag : null;
            if (table != null)
            {
                // Flatten the table into "label:value" lines: even columns are treated as
                // labels, odd columns as values.
                StringBuilder ctx = new StringBuilder();
                for (int k = 0; k < table.RowCount; k++)
                {
                    for (int c = 0; c < table.Rows[k].ColumnCount; c++)
                    {
                        string strCtx = table.Rows[k].Columns[c].ToPlainTextString().Replace(" ", "").Replace(" ", "");
                        if (strCtx == "工程类型")
                        {
                            // Stops reading the rest of this row only.
                            break;
                        }
                        ctx.Append(strCtx).Append(c % 2 == 0 ? ":" : "\r\n");
                    }
                }
                inviteCtx = ctx.ToString();
            }
            else
            {
                inviteCtx = dtl.AsString().Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            }

            // The bare "地址" label is accepted by the pattern, so it is stripped as well;
            // previously it was left glued to the front of the address.
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            msgType = office;

            // Try the shared extraction helpers before falling back to placeholders. The
            // address fallback used to run after the placeholder was assigned and so could
            // never fire; the build-unit fallback discarded its result.
            if (string.IsNullOrWhiteSpace(prjAddress))
            {
                prjAddress = inviteCtx.GetAddressRegex();
            }
            if (string.IsNullOrWhiteSpace(buildUnit))
            {
                buildUnit = inviteCtx.GetBuildRegex() ?? string.Empty;
            }
            if (string.IsNullOrWhiteSpace(code))
            {
                code = inviteCtx.GetCodeRegex() ?? string.Empty;
            }
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }

            if (code.Contains(")"))
            {
                code = code.Remove(code.IndexOf(")"));
            }
            if (buildUnit.Contains("采购人"))
            {
                buildUnit = buildUnit.Replace("采购人", "");
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = office;
            }

            // The invite type is fixed for this source; the former title-keyword
            // classification was always overwritten by this value and has been removed.
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice table that starts at <c>SiteUrl</c> (pages 2..n are requested
/// with an extra <c>&amp;p=</c> query parameter) and builds one <c>InviteInfo</c> per
/// table row whose detail page can be downloaded and parsed.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds <c>MaxCount</c> entries;
/// when true, every detected list page is processed.
/// </param>
/// <returns>The collected items; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string office = "深圳市宝安区西乡街道办事处";

    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "50%")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // The pager text is matched as "条,<n>页"; <n> is the page count.
            string pagerText = sNode.AsString().Replace(" ", "");
            Regex reg = new Regex(@"条,[^页]+页");
            pageInt = Convert.ToInt32(reg.Match(pagerText).Value.Replace("条,", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns are constant, so the Regex objects are built once instead of once per row.
    Regex regDate = new Regex(@"\d{4}/\d{1,2}/\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&p=" + i.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Datagrid")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        TableTag tab = viewList[0] as TableTag;
        if (tab == null)
        {
            continue;
        }

        for (int j = 0; j < tab.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = tab.Rows[j];

            // Rows that lack the title/date columns or carry no link cannot be followed.
            // Dereferencing them unguarded used to abort the whole crawl instead of
            // skipping just that row.
            if (tr.ColumnCount < 3)
            {
                continue;
            }
            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            prjName = tr.Columns[1].ToPlainTextString().Replace("\r", "").Replace("\t", "").Replace("\n", "");
            beginDate = regDate.Match(tr.Columns[2].ToPlainTextString()).Value;
            InfoUrl = "http://www.xixiang.gov.cn/" + aTag.Link;

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();
            inviteCtx = dtl.AsString().Replace(" ", "");

            // Each field is the first "label:value" line matching its pattern, with the
            // label words and colons removed.
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            msgType = office;

            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = office;
            }

            // The invite type is fixed for this source; the former title-keyword
            // classification was always overwritten by this value and has been removed.
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list that starts at <c>SiteUrl</c>, builds one
/// <c>InviteInfo</c> per list row whose detail page can be downloaded and parsed, and
/// registers the attachment links found in each detail body in <c>AttachList</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds <c>MaxCount</c> entries;
/// when true, every detected list page is processed.
/// </param>
/// <returns>The collected items; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://lhbsc.szlhxq.gov.cn";
    const string office = "深圳市龙华新区龙华街道办事处";
    const string scriptPattern = "(<script)[\\s\\S]*?(</script>)";

    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // After the clean-up, the text between "/" and "跳" is parsed as the page count.
            string pageText = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
            pageInt = Convert.ToInt32(pageText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns are constant, so the Regex objects are built once instead of once per row.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|采购代理机构|采购人名称|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgs49/0e647d73-" + i.ToString() + ".html";
                html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            INode node = viewList[j];
            ATag aTag = node.GetATag();
            if (aTag == null)
            {
                // A row without a link has no detail page to follow. Dereferencing it
                // used to abort the whole crawl instead of skipping the row.
                continue;
            }

            beginDate = regDate.Match(node.ToPlainTextString().Trim()).Value;
            InfoUrl = siteRoot + aTag.Link.Replace("../", "").Replace("./", "");
            prjName = aTag.GetAttribute("title");

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = Regex.Replace(dtl.AsHtml(), scriptPattern, "");
            // Plain-text body: drop scripts and tags, remove spaces, normalise line breaks
            // to CRLF and collapse doubled breaks.
            inviteCtx = Regex.Replace(HtmlTxt, scriptPattern, "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("采购人名称", "").Replace("采购代理机构", "").Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace("(", "").Replace(")", "").Trim();
            msgType = office;

            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = office;
            }

            // The invite type is fixed for this source; the former title-based
            // classification was always overwritten by this value and has been removed.
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            // Register every attachment-style link found in the detail body.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode != null && aNode.Count > 0)
            {
                for (int k = 0; k < aNode.Count; k++)
                {
                    ATag a = aNode[k] as ATag;
                    if (a == null || !a.IsAtagAttach())
                    {
                        continue;
                    }

                    string link = a.Link;
                    if (!link.ToLower().Contains("http"))
                    {
                        // Relative links were previously stored as an empty URL, which
                        // made the attachment record unusable. Resolve them against the
                        // detail page; fall back to empty only if that fails.
                        Uri baseUri;
                        Uri resolved;
                        if (Uri.TryCreate(InfoUrl, UriKind.Absolute, out baseUri) && Uri.TryCreate(baseUri, link, out resolved))
                        {
                            link = resolved.AbsoluteUri;
                        }
                        else
                        {
                            link = string.Empty;
                        }
                    }

                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                    base.AttachList.Add(attach);
                }
            }

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the notice list at <c>SiteUrl</c> and builds one <c>InviteInfo</c> per list
/// entry whose detail page can be downloaded and parsed.
/// </summary>
/// <remarks>
/// Only the first list page is processed. The previous implementation read a page count
/// and looped over it, but its paging branch was empty, so it never requested another
/// page and instead re-parsed page 1 once per detected page, adding the same notices
/// repeatedly. TODO: add paging once the URL scheme of the follow-up pages is known.
/// </remarks>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds <c>MaxCount</c> entries.
/// </param>
/// <returns>The collected items; empty when the list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string office = "深圳市南山区粤海街道办事处";

    IList list = new ArrayList();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwRercon"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
    if (dtlList == null || dtlList.Count == 0)
    {
        return list;
    }

    // All anchors inside the list items, collected once. Row j is paired with anchor j,
    // as before. NOTE(review): this assumes exactly one link per list item - confirm.
    NodeList anchors = dtlList.SearchFor(typeof(ATag), true);

    // The patterns are constant, so the Regex objects are built once instead of once per row.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int j = 0; j < dtlList.Count; j++)
    {
        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
               inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
               endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
               otherType = string.Empty, HtmlTxt = string.Empty;

        string rowText = dtlList[j].ToPlainTextString().Trim();

        // The title is everything before the first "[". Remove(-1) throws, so a row
        // without a bracket keeps its whole text instead of aborting the crawl.
        int bracket = rowText.IndexOf("[");
        prjName = bracket >= 0 ? rowText.Remove(bracket) : rowText;
        beginDate = regDate.Match(rowText).Value;

        ATag aTag = (anchors != null && j < anchors.Count) ? anchors[j] as ATag : null;
        if (aTag == null)
        {
            // No link for this row, so there is no detail page to follow.
            continue;
        }
        InfoUrl = "http://www.szns.gov.cn" + aTag.Link;

        string htmDtl = string.Empty;
        try
        {
            htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
            htmDtl = regexHtml.Replace(htmDtl, "");
        }
        catch
        {
            continue;
        }

        parser = new Parser(new Lexer(htmDtl));
        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
        if (dtl == null || dtl.Count == 0)
        {
            continue;
        }

        HtmlTxt = dtl.AsHtml();
        inviteCtx = dtl.AsString().Replace(" ", "").Replace("\n", "\r\n");

        // The field patterns are matched against a space-free copy of the body text.
        string compact = inviteCtx.Replace(" ", "");
        prjAddress = regPrjAddr.Match(compact).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
        buildUnit = regBuildUnit.Match(compact).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
        code = regPrjCode.Match(compact).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
        msgType = office;

        if (string.IsNullOrEmpty(prjAddress))
        {
            prjAddress = "见招标信息";
        }
        code = ToolHtml.GetSubString(code, 50);
        buildUnit = ToolHtml.GetSubString(buildUnit, 150);
        if (string.IsNullOrEmpty(buildUnit))
        {
            buildUnit = office;
        }

        // The invite type is fixed for this source; the former title-keyword
        // classification was always overwritten by this value and has been removed.
        specType = "建设工程";
        inviteType = ToolHtml.GetInviteType("小型工程");

        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "南山区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
        list.Add(info);
        if (!crawlAll && list.Count >= this.MaxCount)
        {
            return list;
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list that starts at <c>SiteUrl</c> and builds one
/// <c>InviteInfo</c> for every list entry whose detail page can be downloaded and parsed.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as the result list holds <c>MaxCount</c> entries;
/// when true, every detected list page is processed.
/// </param>
/// <returns>The collected items; empty when the first list page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string listBaseUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/";
    const string office = "深圳市宝安区沙井街道办事处";
    const string scriptPattern = "(<script)[\\s\\S]*?(</script>)";

    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // Strip the createPageHTML(...) call syntax; what is left is parsed as the page count.
            string pageText = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
            pageInt = Convert.ToInt32(pageText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    // The patterns are constant, so the Regex objects are built once instead of once per
    // row. The detail-page pattern previously listed the <script> alternative twice.
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page 1 is SiteUrl itself; page i (i > 1) is index_{i-1}.html.
                html = this.ToolWebSite.GetHtmlByUrl(listBaseUrl + "index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string rowText = viewList[j].ToPlainTextString().Trim();
            beginDate = regDate.Match(rowText).Value;
            // string.Replace throws ArgumentException for an empty search string, which
            // previously aborted the whole crawl on any row without a date.
            string fragment = beginDate.Length > 0 ? rowText.Replace(beginDate, "") : rowText;

            try
            {
                // The row text is cut from the "else{" marker onwards, then narrowed to the
                // first <a ...>...</a> fragment, from which title and link are read.
                int beg = fragment.IndexOf("else{"), end = fragment.Length;
                fragment = fragment.Substring(beg, end - beg);
                beg = fragment.IndexOf("<a");
                end = fragment.IndexOf("/a>");
                fragment = fragment.Substring(beg, (end - beg) + 3);
                beg = fragment.IndexOf(">");
                end = fragment.IndexOf("</");
                prjName = fragment.Substring(beg + 1, end - beg - 1);

                Parser p = new Parser(new Lexer(fragment));
                NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                InfoUrl = listBaseUrl + aTag.Link.Replace("../", "").Replace("./", "");
            }
            catch
            {
                // Any failure to locate the anchor means this row is skipped.
                continue;
            }

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                htlDtl = regexHtml.Replace(htlDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = Regex.Replace(dtl.AsHtml(), scriptPattern, "");
            // Plain-text body: drop scripts and tags, remove spaces, normalise line breaks
            // to CRLF and collapse doubled breaks.
            inviteCtx = Regex.Replace(HtmlTxt, scriptPattern, "");
            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
            msgType = office;

            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = office;
            }

            // The invite type is fixed for this source; the former title-keyword
            // classification was always overwritten by this value and has been removed.
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteType("小型工程");

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "宝安区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Walks the paged notice list that begins at <c>SiteUrl</c> and turns each list item
/// with a readable detail page into an <c>InviteInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// False to return as soon as <c>MaxCount</c> items have been gathered; true to keep
/// going through every page that was detected.
/// </param>
/// <returns>The gathered items; an empty list if the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();

    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return results;
    }

    // Page count: read from the script nodes under the "content clearfix" div.
    int totalPages = 1;
    Parser listParser = new Parser(new Lexer(listHtml));
    NodeList pagerScripts = listParser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true),
            new TagNameFilter("script")));
    if (pagerScripts != null && pagerScripts.Count > 0)
    {
        try
        {
            string raw = pagerScripts.ToString();
            raw = raw.Replace("createPageHTML(", "");
            raw = raw.Replace(",", "kd");
            raw = raw.Replace("****", "");
            raw = raw.Replace("\n", "");
            raw = raw.GetRegexBegEnd("Code", "kd");
            totalPages = int.Parse(raw);
        }
        catch
        {
            // Keep whatever page count is already set.
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // The first page was downloaded above; later pages are index_{pageNo-1}.html.
        if (pageNo != 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://sgjd.baoan.gov.cn/zbcg/zbgg_139207/index_" + (pageNo - 1) + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        listParser = new Parser(new Lexer(listHtml));
        NodeList items = listParser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true),
                new TagNameFilter("li")));
        if (items == null || items.Count <= 0)
        {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            ATag anchor = items[idx].GetATag();
            string prjName = anchor.GetAttribute("title");
            string beginDate = items[idx].ToPlainTextString().GetDateRegex();

            // Rebuild the detail URL from the part of the link between "./" and ".html".
            string relative = anchor.Link.GetRegexBegEnd("./", ".html");
            string InfoUrl = "http://sgjd.baoan.gov.cn/zbcg/zbgg_139207/" + relative + ".html";

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList body = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "con")));
            if (body == null || body.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = body.AsHtml();
            string inviteCtx = HtmlTxt.ToCtxString();
            string inviteType = prjName.GetInviteBidType();
            string buildUnit = inviteCtx.GetBuildRegex();

            // Address: cut off anything from "电话" onwards.
            string prjAddress = inviteCtx.GetAddressRegex();
            if (prjAddress.Contains("电话"))
            {
                prjAddress = prjAddress.Remove(prjAddress.IndexOf("电话"));
            }

            // Code: cut at "》" and drop the trailing announcement phrase.
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            if (code.Contains("》"))
            {
                code = code.Remove(code.IndexOf("》"));
            }
            if (code.Contains("现对本采购项目的招标事宜公告如下"))
            {
                code = code.Replace("现对本采购项目的招标事宜公告如下", "");
            }

            string specType = "建设工程";
            inviteType = "小型工程";
            string msgType = "深圳市宝安区松岗街道办事处";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市宝安区松岗街道办事处";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            results.Add(info);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Walks the paged notice list that begins at <c>SiteUrl</c> and turns each list item
/// with a readable detail page into an <c>InviteInfo</c>.
/// </summary>
/// <param name="crawlAll">
/// False to return as soon as <c>MaxCount</c> items have been gathered; true to keep
/// going through every page that was detected.
/// </param>
/// <returns>The gathered items; an empty list if the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<InviteInfo>();

    string listHtml = string.Empty;
    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch
    {
        return results;
    }

    // Page count: the text between "/" and "跳" in the pager div; 1 if it cannot be read.
    int totalPages = 1;
    Parser listParser = new Parser(new Lexer(listHtml));
    NodeList pager = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "dlbsc-feiyeR")));
    if (pager != null && pager.Count > 0)
    {
        try
        {
            string pagerText = pager.AsString().ToRegString();
            totalPages = int.Parse(pagerText.GetRegexBegEnd("/", "跳"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    Regex datePattern = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex noisePattern = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        // The first page was downloaded above; later pages are 13891-{pageNo}.html.
        if (pageNo != 1)
        {
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl("http://dalang.szlhxq.gov.cn/dlbsc/zwgk73/cgzb10/zbgz/13891-" + pageNo + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        listParser = new Parser(new Lexer(listHtml));
        NodeList items = listParser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "dlbsc_contUl")), true),
                new TagNameFilter("li")));
        if (items == null || items.Count <= 0)
        {
            continue;
        }

        for (int idx = 0; idx < items.Count; idx++)
        {
            string endDate = string.Empty;
            string remark = string.Empty;
            string otherType = string.Empty;

            string beginDate = datePattern.Match(items[idx].ToNodePlainString()).Value;
            string prjName = items[idx].GetATag().LinkText;
            string href = items[idx].GetATagHref(0);
            string InfoUrl = "http://dalang.szlhxq.gov.cn" + href.Replace("./", "/");

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                detailHtml = noisePattern.Replace(detailHtml, "");
            }
            catch
            {
                continue;
            }

            Parser detailParser = new Parser(new Lexer(detailHtml));
            NodeList body = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "dlbsc-content")));
            if (body == null || body.Count <= 0)
            {
                continue;
            }

            string HtmlTxt = Regex.Replace(body.ToHtml(), "(<script)[\\s\\S]*?(</script>)", "");
            string inviteCtx = HtmlTxt.ToCtxString();
            string inviteType = prjName.GetInviteBidType();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            string code = inviteCtx.GetCodeRegex().GetCodeDel();
            string msgType = "深圳市龙华新区大浪街道办事处";

            if (string.IsNullOrEmpty(prjAddress))
            {
                prjAddress = "见招标信息";
            }

            string specType = "建设工程";
            inviteType = "小型工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区大浪街道办事处";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            results.Add(info);

            if (!crawlAll && results.Count >= this.MaxCount)
            {
                return results;
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged article list under zyjy.huizhou.gov.cn and builds one
/// <see cref="InviteInfo"/> for every list entry whose detail page contains a
/// <c>div</c> with id <c>divZoom</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been
/// collected; when <c>true</c>, every page reported by the pager is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    int pageInt = 1;
    string html = string.Empty;

    // The patterns never change between items, so they are built once up front.
    Regex regPage = new Regex(@"/[^页]+页");
    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
    Regex regScript = new Regex("(<script)[\\s\\S]*?(</script>)");
    Regex regTag = new Regex("<[^>]*>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first page there is nothing to crawl.
        return list;
    }

    // Read the page count from the "/N页" fragment of the pager; any failure means one page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            string temp = sNode.AsString().Replace(" ", "");
            pageInt = Convert.ToInt32(regPage.Match(temp).Value.Replace("/", "").Replace("页", ""));
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 is already in 'html'; later pages are fetched on demand.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=54f6d9f3580843d59b9dd64918e7ae4f&pageNo=" + i.ToString(), Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            string itemText = viewList[j].ToPlainTextString();
            beginDate = regDate.Match(itemText).Value;
            prjName = itemText.Replace("\r", "").Replace("\n", "");
            // string.Replace throws ArgumentException for an empty search string,
            // so the date is only stripped when one was actually found.
            if (beginDate.Length > 0)
            {
                prjName = prjName.Replace(beginDate, "");
            }

            // Look for the anchor inside this entry only. Indexing the flattened anchor
            // list of all entries by j is wrong as soon as any entry has zero or several links.
            NodeList itemNodes = new NodeList();
            itemNodes.Add(viewList[j]);
            NodeList anchors = itemNodes.SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            InfoUrl = "http://zyjy.huizhou.gov.cn" + aTag.Link;

            string htmDtl = string.Empty;
            try
            {
                htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                htmDtl = regexHtml.Replace(htmDtl, "");
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = regScript.Replace(dtl.ToHtml(), "");

            // Plain-text version: a second script pass (kept from the original), then all
            // tags removed and whitespace / blank lines collapsed by the same replacement chain.
            inviteCtx = regScript.Replace(HtmlTxt, "");
            inviteCtx = regTag.Replace(inviteCtx, "")
                .Replace(" ", "").Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\t", "")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // Each field is the first "label:value" line found, with the label and colon stripped.
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            if (buildUnit.Contains("资质"))
            {
                // Drop everything from "资质" onwards.
                buildUnit = buildUnit.Remove(buildUnit.IndexOf("资质"));
            }
            prjAddress = ToolHtml.GetSubString(prjAddress, 150);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

            msgType = "惠州市公共资源交易中心";
            specType = "建设工程";
            inviteType = ToolHtml.GetInviteTypes(prjName);
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            if (Encoding.Default.GetByteCount(code) > 50)
            {
                code = "";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice list under glbsc.szlhxq.gov.cn and builds one
/// <see cref="InviteInfo"/> for every list table whose detail page contains a
/// <c>div</c> with id <c>contentbox</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been
/// collected; when <c>true</c>, every page reported by the pager is visited.
/// </param>
/// <returns>
/// The collected records. The list is empty when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int pageInt = 1;
    string html = string.Empty;

    // The patterns never change between items, so they are built once up front.
    Regex regTag = new Regex("<[^>]*>");
    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

    try
    {
        // NOTE(review): the first page is fetched without an explicit encoding while
        // later pages pass UTF-8 - confirm the default of this overload.
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        // Without the first page there is nothing to crawl.
        return list;
    }

    // Read the page count from the first pager cell; any failure falls back to a single page.
    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));
    if (sNode != null && sNode.Count > 0)
    {
        try
        {
            // NOTE(review): GetRegexBegEnd presumably returns the text between "/" and "跳转" - confirm.
            string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        // Page 1 is already in 'html'; later pages are fetched on demand.
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs/15158-" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
        if (viewList == null || viewList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < viewList.Count; j++)
        {
            // Skip nodes that are not tables or carry no link instead of failing
            // the whole crawl with a NullReferenceException.
            TableTag table = viewList[j] as TableTag;
            if (table == null)
            {
                continue;
            }
            ATag aTag = table.GetATag();
            if (aTag == null)
            {
                continue;
            }

            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                   inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                   endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                   otherType = string.Empty, HtmlTxt = string.Empty;

            beginDate = table.ToPlainTextString().GetDateRegex();
            prjName = aTag.GetAttribute("title");
            InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;

            string htlDtl = string.Empty;
            try
            {
                htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htlDtl));
            NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
            if (dtl == null || dtl.Count == 0)
            {
                continue;
            }

            HtmlTxt = dtl.AsHtml();

            // Plain-text version: helper conversion first, then any remaining tags removed
            // and whitespace / blank lines collapsed by the same replacement chain as before.
            inviteCtx = HtmlTxt.ToCtxString();
            inviteCtx = regTag.Replace(inviteCtx, "")
                .Replace(" ", "").Replace(" ", "")
                .Replace("\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n")
                .Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            // Each field is the first "label:value" line found, with the label and colon stripped.
            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
            code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace(")", "").Replace(")", "").Trim();

            msgType = "深圳市龙华新区观澜街道办事处";
            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
            {
                prjAddress = "见招标信息";
            }
            code = ToolHtml.GetSubString(code, 50);
            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
            specType = "建设工程";
            // The type is fixed for this source; the original's title-derived value
            // was always overwritten by this constant before use.
            inviteType = "小型工程";
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = "深圳市龙华新区观澜街道办事处";
            }
            inviteType = ToolHtml.GetInviteType(inviteType);

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);

            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}